diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,85106 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999588562024275, + "eval_steps": 500, + "global_step": 12152, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 8.228759514503188e-05, + "grad_norm": 9.532528095057138, + "learning_rate": 5.479452054794521e-08, + "loss": 0.7901, + "step": 1 + }, + { + "epoch": 0.00016457519029006376, + "grad_norm": 30.026945671831577, + "learning_rate": 1.0958904109589042e-07, + "loss": 2.1253, + "step": 2 + }, + { + "epoch": 0.00024686278543509563, + "grad_norm": 8.88519815829157, + "learning_rate": 1.6438356164383561e-07, + "loss": 0.7715, + "step": 3 + }, + { + "epoch": 0.00032915038058012753, + "grad_norm": 29.197616305414858, + "learning_rate": 2.1917808219178084e-07, + "loss": 2.1284, + "step": 4 + }, + { + "epoch": 0.0004114379757251594, + "grad_norm": 29.892559190290434, + "learning_rate": 2.73972602739726e-07, + "loss": 2.0685, + "step": 5 + }, + { + "epoch": 0.0004937255708701913, + "grad_norm": 10.567782598278942, + "learning_rate": 3.2876712328767123e-07, + "loss": 0.8122, + "step": 6 + }, + { + "epoch": 0.0005760131660152232, + "grad_norm": 28.912763215741734, + "learning_rate": 3.835616438356165e-07, + "loss": 2.1056, + "step": 7 + }, + { + "epoch": 0.0006583007611602551, + "grad_norm": 29.51664131482477, + "learning_rate": 4.383561643835617e-07, + "loss": 2.0418, + "step": 8 + }, + { + "epoch": 0.000740588356305287, + "grad_norm": 28.30266632286417, + "learning_rate": 4.931506849315068e-07, + "loss": 2.0237, + "step": 9 + }, + { + "epoch": 0.0008228759514503189, + "grad_norm": 27.46875103243188, + "learning_rate": 5.47945205479452e-07, + "loss": 1.9595, + "step": 10 + }, + { + "epoch": 0.0009051635465953507, + "grad_norm": 24.865752165641698, + "learning_rate": 6.027397260273974e-07, + "loss": 1.9174, + "step": 11 + }, + { + "epoch": 0.0009874511417403825, + "grad_norm": 24.328147714809518, + "learning_rate": 6.575342465753425e-07, + "loss": 1.9307, + "step": 12 + }, + { + "epoch": 0.0010697387368854144, + "grad_norm": 5.5234808874616395, + "learning_rate": 7.123287671232878e-07, + "loss": 0.8138, + "step": 13 + }, + { + "epoch": 0.0011520263320304463, + "grad_norm": 24.035678143620423, + "learning_rate": 7.67123287671233e-07, + "loss": 1.9803, + "step": 14 + }, + { + "epoch": 0.0012343139271754782, + "grad_norm": 20.7270429685146, + "learning_rate": 8.219178082191781e-07, + "loss": 1.8216, + "step": 15 + }, + { + "epoch": 0.0013166015223205101, + "grad_norm": 3.1954913902580597, + "learning_rate": 8.767123287671234e-07, + "loss": 0.7577, + "step": 16 + }, + { + "epoch": 0.001398889117465542, + "grad_norm": 19.0932823831642, + "learning_rate": 9.315068493150686e-07, + "loss": 1.8765, + "step": 17 + }, + { + "epoch": 0.001481176712610574, + "grad_norm": 17.783753558169572, + "learning_rate": 9.863013698630137e-07, + "loss": 1.7423, + "step": 18 + }, + { + "epoch": 0.0015634643077556058, + "grad_norm": 13.929582396803928, + "learning_rate": 1.041095890410959e-06, + "loss": 1.5683, + "step": 19 + }, + { + "epoch": 0.0016457519029006377, + "grad_norm": 10.860155069125868, + "learning_rate": 1.095890410958904e-06, + "loss": 1.5344, + "step": 20 + }, + { + "epoch": 0.0017280394980456696, + "grad_norm": 10.868210550382598, + "learning_rate": 1.1506849315068494e-06, + "loss": 1.4788, + "step": 21 + }, + { + "epoch": 0.0018103270931907015, + "grad_norm": 9.306619668804826, + "learning_rate": 1.2054794520547947e-06, + "loss": 1.4831, + "step": 22 + }, + { + "epoch": 0.0018926146883357334, + "grad_norm": 2.4601086961337857, + "learning_rate": 1.26027397260274e-06, + "loss": 0.7305, + "step": 23 + }, + { + "epoch": 0.001974902283480765, + "grad_norm": 7.6886950923134005, + "learning_rate": 1.315068493150685e-06, + "loss": 1.4257, + "step": 24 + }, + { + "epoch": 0.002057189878625797, + "grad_norm": 6.220708397685521, + "learning_rate": 1.3698630136986302e-06, + "loss": 1.3468, + "step": 25 + }, + { + "epoch": 0.002139477473770829, + "grad_norm": 4.674476253548759, + "learning_rate": 1.4246575342465755e-06, + "loss": 1.3151, + "step": 26 + }, + { + "epoch": 0.002221765068915861, + "grad_norm": 3.895214381538298, + "learning_rate": 1.4794520547945206e-06, + "loss": 1.3041, + "step": 27 + }, + { + "epoch": 0.0023040526640608927, + "grad_norm": 3.527134956076901, + "learning_rate": 1.534246575342466e-06, + "loss": 1.2878, + "step": 28 + }, + { + "epoch": 0.0023863402592059248, + "grad_norm": 3.5362809667326522, + "learning_rate": 1.5890410958904112e-06, + "loss": 1.2726, + "step": 29 + }, + { + "epoch": 0.0024686278543509564, + "grad_norm": 2.966450361552696, + "learning_rate": 1.6438356164383561e-06, + "loss": 1.2993, + "step": 30 + }, + { + "epoch": 0.0025509154494959886, + "grad_norm": 2.458939366346722, + "learning_rate": 1.6986301369863014e-06, + "loss": 1.281, + "step": 31 + }, + { + "epoch": 0.0026332030446410202, + "grad_norm": 2.535030337573037, + "learning_rate": 1.7534246575342468e-06, + "loss": 1.2708, + "step": 32 + }, + { + "epoch": 0.0027154906397860524, + "grad_norm": 1.239317382781359, + "learning_rate": 1.808219178082192e-06, + "loss": 0.6648, + "step": 33 + }, + { + "epoch": 0.002797778234931084, + "grad_norm": 1.1180854196130607, + "learning_rate": 1.8630136986301372e-06, + "loss": 0.6646, + "step": 34 + }, + { + "epoch": 0.002880065830076116, + "grad_norm": 2.1450564270921646, + "learning_rate": 1.9178082191780823e-06, + "loss": 1.2447, + "step": 35 + }, + { + "epoch": 0.002962353425221148, + "grad_norm": 1.8049145439148968, + "learning_rate": 1.9726027397260274e-06, + "loss": 1.1815, + "step": 36 + }, + { + "epoch": 0.00304464102036618, + "grad_norm": 0.795375753210199, + "learning_rate": 2.027397260273973e-06, + "loss": 0.6292, + "step": 37 + }, + { + "epoch": 0.0031269286155112116, + "grad_norm": 0.7439259016336192, + "learning_rate": 2.082191780821918e-06, + "loss": 0.6468, + "step": 38 + }, + { + "epoch": 0.0032092162106562437, + "grad_norm": 2.102073236832498, + "learning_rate": 2.1369863013698635e-06, + "loss": 1.1965, + "step": 39 + }, + { + "epoch": 0.0032915038058012754, + "grad_norm": 1.7507482751861791, + "learning_rate": 2.191780821917808e-06, + "loss": 1.147, + "step": 40 + }, + { + "epoch": 0.0033737914009463075, + "grad_norm": 2.115499646494852, + "learning_rate": 2.2465753424657537e-06, + "loss": 1.2079, + "step": 41 + }, + { + "epoch": 0.003456078996091339, + "grad_norm": 1.5822724466961147, + "learning_rate": 2.301369863013699e-06, + "loss": 1.213, + "step": 42 + }, + { + "epoch": 0.0035383665912363713, + "grad_norm": 0.6843357265370693, + "learning_rate": 2.356164383561644e-06, + "loss": 0.624, + "step": 43 + }, + { + "epoch": 0.003620654186381403, + "grad_norm": 1.9669305292499641, + "learning_rate": 2.4109589041095894e-06, + "loss": 1.1691, + "step": 44 + }, + { + "epoch": 0.003702941781526435, + "grad_norm": 4.293989393639943, + "learning_rate": 2.4657534246575345e-06, + "loss": 1.1484, + "step": 45 + }, + { + "epoch": 0.003785229376671467, + "grad_norm": 1.3873591085798673, + "learning_rate": 2.52054794520548e-06, + "loss": 1.177, + "step": 46 + }, + { + "epoch": 0.0038675169718164985, + "grad_norm": 3.6561002665760807, + "learning_rate": 2.5753424657534247e-06, + "loss": 1.1469, + "step": 47 + }, + { + "epoch": 0.00394980456696153, + "grad_norm": 1.5450365482515196, + "learning_rate": 2.63013698630137e-06, + "loss": 1.1521, + "step": 48 + }, + { + "epoch": 0.004032092162106563, + "grad_norm": 1.5565124011894804, + "learning_rate": 2.6849315068493153e-06, + "loss": 1.1589, + "step": 49 + }, + { + "epoch": 0.004114379757251594, + "grad_norm": 0.6675144755255817, + "learning_rate": 2.7397260273972604e-06, + "loss": 0.6406, + "step": 50 + }, + { + "epoch": 0.004196667352396626, + "grad_norm": 1.5292143908928457, + "learning_rate": 2.794520547945206e-06, + "loss": 1.1297, + "step": 51 + }, + { + "epoch": 0.004278954947541658, + "grad_norm": 0.6502938857874467, + "learning_rate": 2.849315068493151e-06, + "loss": 0.6186, + "step": 52 + }, + { + "epoch": 0.00436124254268669, + "grad_norm": 1.4333837148693778, + "learning_rate": 2.9041095890410957e-06, + "loss": 1.1303, + "step": 53 + }, + { + "epoch": 0.004443530137831722, + "grad_norm": 1.4749791593345467, + "learning_rate": 2.9589041095890413e-06, + "loss": 1.1387, + "step": 54 + }, + { + "epoch": 0.004525817732976754, + "grad_norm": 1.4998339630977238, + "learning_rate": 3.0136986301369864e-06, + "loss": 1.1857, + "step": 55 + }, + { + "epoch": 0.004608105328121785, + "grad_norm": 1.5507431529256293, + "learning_rate": 3.068493150684932e-06, + "loss": 1.1487, + "step": 56 + }, + { + "epoch": 0.004690392923266818, + "grad_norm": 1.6348282836598194, + "learning_rate": 3.123287671232877e-06, + "loss": 1.1641, + "step": 57 + }, + { + "epoch": 0.0047726805184118495, + "grad_norm": 0.5752534532225031, + "learning_rate": 3.1780821917808225e-06, + "loss": 0.5701, + "step": 58 + }, + { + "epoch": 0.004854968113556881, + "grad_norm": 1.6099812024773308, + "learning_rate": 3.2328767123287676e-06, + "loss": 1.1721, + "step": 59 + }, + { + "epoch": 0.004937255708701913, + "grad_norm": 0.6408161226805661, + "learning_rate": 3.2876712328767123e-06, + "loss": 0.5998, + "step": 60 + }, + { + "epoch": 0.0050195433038469454, + "grad_norm": 0.5617271278467075, + "learning_rate": 3.342465753424658e-06, + "loss": 0.6265, + "step": 61 + }, + { + "epoch": 0.005101830898991977, + "grad_norm": 1.9160395609787255, + "learning_rate": 3.397260273972603e-06, + "loss": 1.1687, + "step": 62 + }, + { + "epoch": 0.005184118494137009, + "grad_norm": 1.7944962743686514, + "learning_rate": 3.4520547945205484e-06, + "loss": 1.0999, + "step": 63 + }, + { + "epoch": 0.0052664060892820405, + "grad_norm": 1.6550254402978586, + "learning_rate": 3.5068493150684935e-06, + "loss": 1.1283, + "step": 64 + }, + { + "epoch": 0.005348693684427073, + "grad_norm": 2.06701106889446, + "learning_rate": 3.5616438356164386e-06, + "loss": 1.1449, + "step": 65 + }, + { + "epoch": 0.005430981279572105, + "grad_norm": 1.334891505276627, + "learning_rate": 3.616438356164384e-06, + "loss": 1.0978, + "step": 66 + }, + { + "epoch": 0.005513268874717136, + "grad_norm": 1.809032539584058, + "learning_rate": 3.671232876712329e-06, + "loss": 1.1172, + "step": 67 + }, + { + "epoch": 0.005595556469862168, + "grad_norm": 0.5631162064075181, + "learning_rate": 3.7260273972602743e-06, + "loss": 0.5793, + "step": 68 + }, + { + "epoch": 0.0056778440650072, + "grad_norm": 1.6486487445332147, + "learning_rate": 3.7808219178082194e-06, + "loss": 1.0659, + "step": 69 + }, + { + "epoch": 0.005760131660152232, + "grad_norm": 1.7514518974861626, + "learning_rate": 3.8356164383561645e-06, + "loss": 1.1786, + "step": 70 + }, + { + "epoch": 0.005842419255297264, + "grad_norm": 2.6958756773092887, + "learning_rate": 3.89041095890411e-06, + "loss": 1.1019, + "step": 71 + }, + { + "epoch": 0.005924706850442296, + "grad_norm": 1.7803679070531404, + "learning_rate": 3.945205479452055e-06, + "loss": 1.0859, + "step": 72 + }, + { + "epoch": 0.006006994445587327, + "grad_norm": 1.5059878641321802, + "learning_rate": 4.000000000000001e-06, + "loss": 1.0788, + "step": 73 + }, + { + "epoch": 0.00608928204073236, + "grad_norm": 1.8716327109844846, + "learning_rate": 4.054794520547946e-06, + "loss": 1.1095, + "step": 74 + }, + { + "epoch": 0.0061715696358773916, + "grad_norm": 1.5616475319286818, + "learning_rate": 4.109589041095891e-06, + "loss": 1.1278, + "step": 75 + }, + { + "epoch": 0.006253857231022423, + "grad_norm": 1.493898527453622, + "learning_rate": 4.164383561643836e-06, + "loss": 1.104, + "step": 76 + }, + { + "epoch": 0.006336144826167455, + "grad_norm": 1.8452837120263397, + "learning_rate": 4.219178082191781e-06, + "loss": 1.1095, + "step": 77 + }, + { + "epoch": 0.0064184324213124875, + "grad_norm": 1.784319898693149, + "learning_rate": 4.273972602739727e-06, + "loss": 1.0949, + "step": 78 + }, + { + "epoch": 0.006500720016457519, + "grad_norm": 2.137737098454538, + "learning_rate": 4.328767123287671e-06, + "loss": 1.1302, + "step": 79 + }, + { + "epoch": 0.006583007611602551, + "grad_norm": 1.5914074135685312, + "learning_rate": 4.383561643835616e-06, + "loss": 1.0916, + "step": 80 + }, + { + "epoch": 0.0066652952067475825, + "grad_norm": 2.3489068213528266, + "learning_rate": 4.438356164383562e-06, + "loss": 1.0729, + "step": 81 + }, + { + "epoch": 0.006747582801892615, + "grad_norm": 2.073369039063705, + "learning_rate": 4.493150684931507e-06, + "loss": 1.0892, + "step": 82 + }, + { + "epoch": 0.006829870397037647, + "grad_norm": 1.8770075428367665, + "learning_rate": 4.5479452054794525e-06, + "loss": 1.1187, + "step": 83 + }, + { + "epoch": 0.006912157992182678, + "grad_norm": 4.506883747948483, + "learning_rate": 4.602739726027398e-06, + "loss": 1.0762, + "step": 84 + }, + { + "epoch": 0.00699444558732771, + "grad_norm": 1.7209663187813125, + "learning_rate": 4.657534246575343e-06, + "loss": 1.1226, + "step": 85 + }, + { + "epoch": 0.007076733182472743, + "grad_norm": 0.6052191270162426, + "learning_rate": 4.712328767123288e-06, + "loss": 0.6055, + "step": 86 + }, + { + "epoch": 0.007159020777617774, + "grad_norm": 1.7994312730778819, + "learning_rate": 4.767123287671233e-06, + "loss": 1.0967, + "step": 87 + }, + { + "epoch": 0.007241308372762806, + "grad_norm": 1.9304702595282108, + "learning_rate": 4.821917808219179e-06, + "loss": 1.1492, + "step": 88 + }, + { + "epoch": 0.007323595967907838, + "grad_norm": 2.088564652992412, + "learning_rate": 4.876712328767124e-06, + "loss": 1.0985, + "step": 89 + }, + { + "epoch": 0.00740588356305287, + "grad_norm": 1.8604994381662585, + "learning_rate": 4.931506849315069e-06, + "loss": 1.0923, + "step": 90 + }, + { + "epoch": 0.007488171158197902, + "grad_norm": 0.5594391183994828, + "learning_rate": 4.986301369863014e-06, + "loss": 0.6021, + "step": 91 + }, + { + "epoch": 0.007570458753342934, + "grad_norm": 1.7905925850647735, + "learning_rate": 5.04109589041096e-06, + "loss": 1.1047, + "step": 92 + }, + { + "epoch": 0.007652746348487965, + "grad_norm": 2.5829004230758055, + "learning_rate": 5.095890410958904e-06, + "loss": 1.0856, + "step": 93 + }, + { + "epoch": 0.007735033943632997, + "grad_norm": 2.8109366679812817, + "learning_rate": 5.1506849315068494e-06, + "loss": 1.0906, + "step": 94 + }, + { + "epoch": 0.00781732153877803, + "grad_norm": 1.9488333893087777, + "learning_rate": 5.2054794520547945e-06, + "loss": 1.1174, + "step": 95 + }, + { + "epoch": 0.00789960913392306, + "grad_norm": 1.8898489727850725, + "learning_rate": 5.26027397260274e-06, + "loss": 1.0764, + "step": 96 + }, + { + "epoch": 0.007981896729068093, + "grad_norm": 1.9662220110655733, + "learning_rate": 5.3150684931506856e-06, + "loss": 1.0687, + "step": 97 + }, + { + "epoch": 0.008064184324213125, + "grad_norm": 2.012210892740288, + "learning_rate": 5.369863013698631e-06, + "loss": 1.0688, + "step": 98 + }, + { + "epoch": 0.008146471919358156, + "grad_norm": 2.0256582980555145, + "learning_rate": 5.424657534246576e-06, + "loss": 1.0435, + "step": 99 + }, + { + "epoch": 0.008228759514503189, + "grad_norm": 2.3161294458478228, + "learning_rate": 5.479452054794521e-06, + "loss": 1.1027, + "step": 100 + }, + { + "epoch": 0.008311047109648221, + "grad_norm": 2.159842764055281, + "learning_rate": 5.534246575342466e-06, + "loss": 1.0223, + "step": 101 + }, + { + "epoch": 0.008393334704793252, + "grad_norm": 2.7342793057170964, + "learning_rate": 5.589041095890412e-06, + "loss": 1.0485, + "step": 102 + }, + { + "epoch": 0.008475622299938285, + "grad_norm": 0.6133807544248717, + "learning_rate": 5.643835616438357e-06, + "loss": 0.5933, + "step": 103 + }, + { + "epoch": 0.008557909895083315, + "grad_norm": 2.0957817610708593, + "learning_rate": 5.698630136986302e-06, + "loss": 1.084, + "step": 104 + }, + { + "epoch": 0.008640197490228348, + "grad_norm": 3.0607800999765105, + "learning_rate": 5.753424657534246e-06, + "loss": 1.0369, + "step": 105 + }, + { + "epoch": 0.00872248508537338, + "grad_norm": 2.3550652220766404, + "learning_rate": 5.8082191780821915e-06, + "loss": 1.0785, + "step": 106 + }, + { + "epoch": 0.008804772680518411, + "grad_norm": 2.885362070393249, + "learning_rate": 5.863013698630137e-06, + "loss": 1.1143, + "step": 107 + }, + { + "epoch": 0.008887060275663444, + "grad_norm": 2.726344088292101, + "learning_rate": 5.9178082191780825e-06, + "loss": 1.0423, + "step": 108 + }, + { + "epoch": 0.008969347870808476, + "grad_norm": 2.720421039977678, + "learning_rate": 5.972602739726028e-06, + "loss": 1.0424, + "step": 109 + }, + { + "epoch": 0.009051635465953507, + "grad_norm": 2.7737084246092043, + "learning_rate": 6.027397260273973e-06, + "loss": 1.0669, + "step": 110 + }, + { + "epoch": 0.00913392306109854, + "grad_norm": 2.4862795852431696, + "learning_rate": 6.082191780821919e-06, + "loss": 1.0798, + "step": 111 + }, + { + "epoch": 0.00921621065624357, + "grad_norm": 1.9953691894673529, + "learning_rate": 6.136986301369864e-06, + "loss": 1.0337, + "step": 112 + }, + { + "epoch": 0.009298498251388603, + "grad_norm": 2.1734409375655908, + "learning_rate": 6.191780821917809e-06, + "loss": 1.0769, + "step": 113 + }, + { + "epoch": 0.009380785846533636, + "grad_norm": 2.4691052918090457, + "learning_rate": 6.246575342465754e-06, + "loss": 1.0758, + "step": 114 + }, + { + "epoch": 0.009463073441678667, + "grad_norm": 2.51765809469206, + "learning_rate": 6.301369863013699e-06, + "loss": 1.1065, + "step": 115 + }, + { + "epoch": 0.009545361036823699, + "grad_norm": 2.3976820917439916, + "learning_rate": 6.356164383561645e-06, + "loss": 1.0454, + "step": 116 + }, + { + "epoch": 0.00962764863196873, + "grad_norm": 0.5713752667519881, + "learning_rate": 6.41095890410959e-06, + "loss": 0.5767, + "step": 117 + }, + { + "epoch": 0.009709936227113762, + "grad_norm": 2.9303587471653385, + "learning_rate": 6.465753424657535e-06, + "loss": 1.0596, + "step": 118 + }, + { + "epoch": 0.009792223822258795, + "grad_norm": 2.625385971373383, + "learning_rate": 6.5205479452054794e-06, + "loss": 1.0694, + "step": 119 + }, + { + "epoch": 0.009874511417403826, + "grad_norm": 2.6850490082257368, + "learning_rate": 6.5753424657534245e-06, + "loss": 1.0629, + "step": 120 + }, + { + "epoch": 0.009956799012548858, + "grad_norm": 2.8941680627630575, + "learning_rate": 6.630136986301371e-06, + "loss": 1.0797, + "step": 121 + }, + { + "epoch": 0.010039086607693891, + "grad_norm": 2.437227451528501, + "learning_rate": 6.684931506849316e-06, + "loss": 1.0446, + "step": 122 + }, + { + "epoch": 0.010121374202838922, + "grad_norm": 4.2330170384868655, + "learning_rate": 6.739726027397261e-06, + "loss": 1.077, + "step": 123 + }, + { + "epoch": 0.010203661797983954, + "grad_norm": 3.742681446646284, + "learning_rate": 6.794520547945206e-06, + "loss": 1.0578, + "step": 124 + }, + { + "epoch": 0.010285949393128985, + "grad_norm": 2.905751102486295, + "learning_rate": 6.849315068493151e-06, + "loss": 1.0397, + "step": 125 + }, + { + "epoch": 0.010368236988274018, + "grad_norm": 2.248809486049495, + "learning_rate": 6.904109589041097e-06, + "loss": 1.0057, + "step": 126 + }, + { + "epoch": 0.01045052458341905, + "grad_norm": 2.793469113179832, + "learning_rate": 6.958904109589042e-06, + "loss": 1.0423, + "step": 127 + }, + { + "epoch": 0.010532812178564081, + "grad_norm": 3.044433211099124, + "learning_rate": 7.013698630136987e-06, + "loss": 1.0519, + "step": 128 + }, + { + "epoch": 0.010615099773709114, + "grad_norm": 3.453404138683163, + "learning_rate": 7.068493150684932e-06, + "loss": 1.0492, + "step": 129 + }, + { + "epoch": 0.010697387368854146, + "grad_norm": 3.294896819292345, + "learning_rate": 7.123287671232877e-06, + "loss": 1.0186, + "step": 130 + }, + { + "epoch": 0.010779674963999177, + "grad_norm": 2.652529510878711, + "learning_rate": 7.178082191780823e-06, + "loss": 1.0481, + "step": 131 + }, + { + "epoch": 0.01086196255914421, + "grad_norm": 2.5635334133873835, + "learning_rate": 7.232876712328768e-06, + "loss": 1.0189, + "step": 132 + }, + { + "epoch": 0.01094425015428924, + "grad_norm": 2.310822969570939, + "learning_rate": 7.287671232876713e-06, + "loss": 1.0804, + "step": 133 + }, + { + "epoch": 0.011026537749434273, + "grad_norm": 2.7939745420750532, + "learning_rate": 7.342465753424658e-06, + "loss": 1.0731, + "step": 134 + }, + { + "epoch": 0.011108825344579305, + "grad_norm": 10.159052417359996, + "learning_rate": 7.397260273972603e-06, + "loss": 1.0013, + "step": 135 + }, + { + "epoch": 0.011191112939724336, + "grad_norm": 2.492104076947929, + "learning_rate": 7.452054794520549e-06, + "loss": 1.058, + "step": 136 + }, + { + "epoch": 0.011273400534869369, + "grad_norm": 2.7323610574219512, + "learning_rate": 7.506849315068494e-06, + "loss": 1.0503, + "step": 137 + }, + { + "epoch": 0.0113556881300144, + "grad_norm": 2.94667222448598, + "learning_rate": 7.561643835616439e-06, + "loss": 1.0283, + "step": 138 + }, + { + "epoch": 0.011437975725159432, + "grad_norm": 4.017422542900321, + "learning_rate": 7.616438356164384e-06, + "loss": 1.0883, + "step": 139 + }, + { + "epoch": 0.011520263320304465, + "grad_norm": 3.6715275879486633, + "learning_rate": 7.671232876712329e-06, + "loss": 1.0536, + "step": 140 + }, + { + "epoch": 0.011602550915449495, + "grad_norm": 3.0172048685106603, + "learning_rate": 7.726027397260276e-06, + "loss": 1.055, + "step": 141 + }, + { + "epoch": 0.011684838510594528, + "grad_norm": 3.077620329335805, + "learning_rate": 7.78082191780822e-06, + "loss": 1.0195, + "step": 142 + }, + { + "epoch": 0.01176712610573956, + "grad_norm": 2.959594926294125, + "learning_rate": 7.835616438356164e-06, + "loss": 1.0369, + "step": 143 + }, + { + "epoch": 0.011849413700884591, + "grad_norm": 5.2531338908420055, + "learning_rate": 7.89041095890411e-06, + "loss": 1.0524, + "step": 144 + }, + { + "epoch": 0.011931701296029624, + "grad_norm": 2.9462988063147755, + "learning_rate": 7.945205479452055e-06, + "loss": 1.0258, + "step": 145 + }, + { + "epoch": 0.012013988891174655, + "grad_norm": 2.835501864556677, + "learning_rate": 8.000000000000001e-06, + "loss": 1.0035, + "step": 146 + }, + { + "epoch": 0.012096276486319687, + "grad_norm": 3.1002864915340798, + "learning_rate": 8.054794520547946e-06, + "loss": 1.0379, + "step": 147 + }, + { + "epoch": 0.01217856408146472, + "grad_norm": 2.7184860323108464, + "learning_rate": 8.109589041095892e-06, + "loss": 1.0373, + "step": 148 + }, + { + "epoch": 0.01226085167660975, + "grad_norm": 3.093424317685046, + "learning_rate": 8.164383561643837e-06, + "loss": 1.0559, + "step": 149 + }, + { + "epoch": 0.012343139271754783, + "grad_norm": 2.9403313251924064, + "learning_rate": 8.219178082191782e-06, + "loss": 1.0312, + "step": 150 + }, + { + "epoch": 0.012425426866899816, + "grad_norm": 3.334710236004298, + "learning_rate": 8.273972602739727e-06, + "loss": 1.032, + "step": 151 + }, + { + "epoch": 0.012507714462044846, + "grad_norm": 3.754339855053731, + "learning_rate": 8.328767123287672e-06, + "loss": 1.007, + "step": 152 + }, + { + "epoch": 0.012590002057189879, + "grad_norm": 3.468367068790295, + "learning_rate": 8.383561643835617e-06, + "loss": 1.0352, + "step": 153 + }, + { + "epoch": 0.01267228965233491, + "grad_norm": 3.08946479512089, + "learning_rate": 8.438356164383562e-06, + "loss": 1.0285, + "step": 154 + }, + { + "epoch": 0.012754577247479942, + "grad_norm": 2.7171722187405463, + "learning_rate": 8.493150684931507e-06, + "loss": 1.0355, + "step": 155 + }, + { + "epoch": 0.012836864842624975, + "grad_norm": 2.9125857783989955, + "learning_rate": 8.547945205479454e-06, + "loss": 1.0383, + "step": 156 + }, + { + "epoch": 0.012919152437770006, + "grad_norm": 3.431055558365553, + "learning_rate": 8.602739726027397e-06, + "loss": 0.9858, + "step": 157 + }, + { + "epoch": 0.013001440032915038, + "grad_norm": 2.5695243675652906, + "learning_rate": 8.657534246575343e-06, + "loss": 1.0257, + "step": 158 + }, + { + "epoch": 0.013083727628060069, + "grad_norm": 3.1403965108405645, + "learning_rate": 8.712328767123288e-06, + "loss": 1.0161, + "step": 159 + }, + { + "epoch": 0.013166015223205102, + "grad_norm": 3.0914617102513535, + "learning_rate": 8.767123287671233e-06, + "loss": 1.0126, + "step": 160 + }, + { + "epoch": 0.013248302818350134, + "grad_norm": 2.974266261740425, + "learning_rate": 8.82191780821918e-06, + "loss": 1.0146, + "step": 161 + }, + { + "epoch": 0.013330590413495165, + "grad_norm": 4.453619610906972, + "learning_rate": 8.876712328767125e-06, + "loss": 1.01, + "step": 162 + }, + { + "epoch": 0.013412878008640198, + "grad_norm": 3.3339134633525203, + "learning_rate": 8.93150684931507e-06, + "loss": 1.0164, + "step": 163 + }, + { + "epoch": 0.01349516560378523, + "grad_norm": 3.096524915506246, + "learning_rate": 8.986301369863015e-06, + "loss": 1.0436, + "step": 164 + }, + { + "epoch": 0.013577453198930261, + "grad_norm": 0.5714699105064062, + "learning_rate": 9.04109589041096e-06, + "loss": 0.5844, + "step": 165 + }, + { + "epoch": 0.013659740794075293, + "grad_norm": 3.3053733088978294, + "learning_rate": 9.095890410958905e-06, + "loss": 1.01, + "step": 166 + }, + { + "epoch": 0.013742028389220324, + "grad_norm": 3.042487650681917, + "learning_rate": 9.15068493150685e-06, + "loss": 1.0258, + "step": 167 + }, + { + "epoch": 0.013824315984365357, + "grad_norm": 3.0826602321214267, + "learning_rate": 9.205479452054795e-06, + "loss": 1.0152, + "step": 168 + }, + { + "epoch": 0.01390660357951039, + "grad_norm": 4.049305212778963, + "learning_rate": 9.26027397260274e-06, + "loss": 1.0344, + "step": 169 + }, + { + "epoch": 0.01398889117465542, + "grad_norm": 2.262878129775452, + "learning_rate": 9.315068493150685e-06, + "loss": 0.9903, + "step": 170 + }, + { + "epoch": 0.014071178769800453, + "grad_norm": 2.5478144837312904, + "learning_rate": 9.36986301369863e-06, + "loss": 1.0255, + "step": 171 + }, + { + "epoch": 0.014153466364945485, + "grad_norm": 0.5963923221726043, + "learning_rate": 9.424657534246576e-06, + "loss": 0.5835, + "step": 172 + }, + { + "epoch": 0.014235753960090516, + "grad_norm": 2.4229291883624775, + "learning_rate": 9.47945205479452e-06, + "loss": 0.9969, + "step": 173 + }, + { + "epoch": 0.014318041555235549, + "grad_norm": 2.5861485778295563, + "learning_rate": 9.534246575342466e-06, + "loss": 1.0321, + "step": 174 + }, + { + "epoch": 0.01440032915038058, + "grad_norm": 3.0535728376170868, + "learning_rate": 9.589041095890411e-06, + "loss": 1.0545, + "step": 175 + }, + { + "epoch": 0.014482616745525612, + "grad_norm": 3.167624134264756, + "learning_rate": 9.643835616438358e-06, + "loss": 1.0212, + "step": 176 + }, + { + "epoch": 0.014564904340670645, + "grad_norm": 2.532407359117499, + "learning_rate": 9.698630136986303e-06, + "loss": 1.0395, + "step": 177 + }, + { + "epoch": 0.014647191935815675, + "grad_norm": 3.335905765902237, + "learning_rate": 9.753424657534248e-06, + "loss": 1.0444, + "step": 178 + }, + { + "epoch": 0.014729479530960708, + "grad_norm": 2.6694368517880376, + "learning_rate": 9.808219178082193e-06, + "loss": 1.0609, + "step": 179 + }, + { + "epoch": 0.01481176712610574, + "grad_norm": 2.4432476499205946, + "learning_rate": 9.863013698630138e-06, + "loss": 1.028, + "step": 180 + }, + { + "epoch": 0.014894054721250771, + "grad_norm": 3.074867289580692, + "learning_rate": 9.917808219178083e-06, + "loss": 1.0277, + "step": 181 + }, + { + "epoch": 0.014976342316395804, + "grad_norm": 2.8234239360995548, + "learning_rate": 9.972602739726028e-06, + "loss": 1.0145, + "step": 182 + }, + { + "epoch": 0.015058629911540835, + "grad_norm": 2.7243533214462636, + "learning_rate": 1.0027397260273975e-05, + "loss": 0.9962, + "step": 183 + }, + { + "epoch": 0.015140917506685867, + "grad_norm": 9.268831121545867, + "learning_rate": 1.008219178082192e-05, + "loss": 1.0202, + "step": 184 + }, + { + "epoch": 0.0152232051018309, + "grad_norm": 0.6032487906705319, + "learning_rate": 1.0136986301369864e-05, + "loss": 0.5914, + "step": 185 + }, + { + "epoch": 0.01530549269697593, + "grad_norm": 2.446903956621448, + "learning_rate": 1.0191780821917809e-05, + "loss": 1.0332, + "step": 186 + }, + { + "epoch": 0.015387780292120963, + "grad_norm": 2.9898530283159857, + "learning_rate": 1.0246575342465754e-05, + "loss": 1.0058, + "step": 187 + }, + { + "epoch": 0.015470067887265994, + "grad_norm": 3.1462756197093147, + "learning_rate": 1.0301369863013699e-05, + "loss": 0.9956, + "step": 188 + }, + { + "epoch": 0.015552355482411026, + "grad_norm": 2.603677254795289, + "learning_rate": 1.0356164383561644e-05, + "loss": 1.0567, + "step": 189 + }, + { + "epoch": 0.01563464307755606, + "grad_norm": 2.888609337531178, + "learning_rate": 1.0410958904109589e-05, + "loss": 1.0117, + "step": 190 + }, + { + "epoch": 0.01571693067270109, + "grad_norm": 3.4481892347405694, + "learning_rate": 1.0465753424657534e-05, + "loss": 1.0312, + "step": 191 + }, + { + "epoch": 0.01579921826784612, + "grad_norm": 2.723259220748936, + "learning_rate": 1.052054794520548e-05, + "loss": 1.0011, + "step": 192 + }, + { + "epoch": 0.015881505862991155, + "grad_norm": 2.400388335266181, + "learning_rate": 1.0575342465753426e-05, + "loss": 1.0397, + "step": 193 + }, + { + "epoch": 0.015963793458136186, + "grad_norm": 2.459799194471057, + "learning_rate": 1.0630136986301371e-05, + "loss": 1.0051, + "step": 194 + }, + { + "epoch": 0.016046081053281216, + "grad_norm": 2.493367813709158, + "learning_rate": 1.0684931506849316e-05, + "loss": 0.9877, + "step": 195 + }, + { + "epoch": 0.01612836864842625, + "grad_norm": 2.997365023733453, + "learning_rate": 1.0739726027397261e-05, + "loss": 0.9991, + "step": 196 + }, + { + "epoch": 0.01621065624357128, + "grad_norm": 3.1534988892754927, + "learning_rate": 1.0794520547945206e-05, + "loss": 1.0088, + "step": 197 + }, + { + "epoch": 0.016292943838716312, + "grad_norm": 0.7839570400001313, + "learning_rate": 1.0849315068493152e-05, + "loss": 0.5796, + "step": 198 + }, + { + "epoch": 0.016375231433861347, + "grad_norm": 2.968831135340441, + "learning_rate": 1.0904109589041097e-05, + "loss": 1.0169, + "step": 199 + }, + { + "epoch": 0.016457519029006377, + "grad_norm": 3.1769343467774736, + "learning_rate": 1.0958904109589042e-05, + "loss": 1.0097, + "step": 200 + }, + { + "epoch": 0.01653980662415141, + "grad_norm": 2.941876345769733, + "learning_rate": 1.1013698630136987e-05, + "loss": 1.0021, + "step": 201 + }, + { + "epoch": 0.016622094219296443, + "grad_norm": 3.3680817014108353, + "learning_rate": 1.1068493150684932e-05, + "loss": 1.0218, + "step": 202 + }, + { + "epoch": 0.016704381814441473, + "grad_norm": 2.908397865551594, + "learning_rate": 1.1123287671232879e-05, + "loss": 0.9939, + "step": 203 + }, + { + "epoch": 0.016786669409586504, + "grad_norm": 2.822395296594326, + "learning_rate": 1.1178082191780824e-05, + "loss": 1.0172, + "step": 204 + }, + { + "epoch": 0.016868957004731535, + "grad_norm": 2.758365809402905, + "learning_rate": 1.1232876712328769e-05, + "loss": 1.05, + "step": 205 + }, + { + "epoch": 0.01695124459987657, + "grad_norm": 2.9222144058188984, + "learning_rate": 1.1287671232876714e-05, + "loss": 1.0073, + "step": 206 + }, + { + "epoch": 0.0170335321950216, + "grad_norm": 2.7763083571649547, + "learning_rate": 1.1342465753424659e-05, + "loss": 0.9958, + "step": 207 + }, + { + "epoch": 0.01711581979016663, + "grad_norm": 0.9573751817349475, + "learning_rate": 1.1397260273972604e-05, + "loss": 0.6336, + "step": 208 + }, + { + "epoch": 0.017198107385311665, + "grad_norm": 3.6768856466236857, + "learning_rate": 1.1452054794520548e-05, + "loss": 0.9839, + "step": 209 + }, + { + "epoch": 0.017280394980456696, + "grad_norm": 0.6002615125347783, + "learning_rate": 1.1506849315068493e-05, + "loss": 0.5964, + "step": 210 + }, + { + "epoch": 0.017362682575601727, + "grad_norm": 3.003839522918383, + "learning_rate": 1.1561643835616438e-05, + "loss": 1.0106, + "step": 211 + }, + { + "epoch": 0.01744497017074676, + "grad_norm": 3.0141237654512305, + "learning_rate": 1.1616438356164383e-05, + "loss": 1.005, + "step": 212 + }, + { + "epoch": 0.017527257765891792, + "grad_norm": 2.3380796106197583, + "learning_rate": 1.1671232876712331e-05, + "loss": 1.0025, + "step": 213 + }, + { + "epoch": 0.017609545361036823, + "grad_norm": 2.749317750470713, + "learning_rate": 1.1726027397260275e-05, + "loss": 1.0208, + "step": 214 + }, + { + "epoch": 0.017691832956181857, + "grad_norm": 2.5174324368341363, + "learning_rate": 1.178082191780822e-05, + "loss": 1.0225, + "step": 215 + }, + { + "epoch": 0.017774120551326888, + "grad_norm": 2.6939469770631206, + "learning_rate": 1.1835616438356165e-05, + "loss": 1.0181, + "step": 216 + }, + { + "epoch": 0.01785640814647192, + "grad_norm": 2.7969043874385218, + "learning_rate": 1.189041095890411e-05, + "loss": 1.0321, + "step": 217 + }, + { + "epoch": 0.017938695741616953, + "grad_norm": 2.130515743950604, + "learning_rate": 1.1945205479452055e-05, + "loss": 0.9939, + "step": 218 + }, + { + "epoch": 0.018020983336761984, + "grad_norm": 2.8848097718992296, + "learning_rate": 1.2e-05, + "loss": 1.0064, + "step": 219 + }, + { + "epoch": 0.018103270931907015, + "grad_norm": 1.496463088281579, + "learning_rate": 1.2054794520547945e-05, + "loss": 0.6077, + "step": 220 + }, + { + "epoch": 0.018185558527052045, + "grad_norm": 3.6292481030110935, + "learning_rate": 1.210958904109589e-05, + "loss": 1.0446, + "step": 221 + }, + { + "epoch": 0.01826784612219708, + "grad_norm": 2.252792644024641, + "learning_rate": 1.2164383561643837e-05, + "loss": 0.9739, + "step": 222 + }, + { + "epoch": 0.01835013371734211, + "grad_norm": 2.4478822538483755, + "learning_rate": 1.2219178082191782e-05, + "loss": 1.0131, + "step": 223 + }, + { + "epoch": 0.01843242131248714, + "grad_norm": 2.559717897830331, + "learning_rate": 1.2273972602739727e-05, + "loss": 1.0394, + "step": 224 + }, + { + "epoch": 0.018514708907632176, + "grad_norm": 2.869935242686829, + "learning_rate": 1.2328767123287673e-05, + "loss": 0.982, + "step": 225 + }, + { + "epoch": 0.018596996502777206, + "grad_norm": 2.5009663006221974, + "learning_rate": 1.2383561643835618e-05, + "loss": 1.0108, + "step": 226 + }, + { + "epoch": 0.018679284097922237, + "grad_norm": 2.9956405565150654, + "learning_rate": 1.2438356164383563e-05, + "loss": 0.9902, + "step": 227 + }, + { + "epoch": 0.01876157169306727, + "grad_norm": 2.674322004514903, + "learning_rate": 1.2493150684931508e-05, + "loss": 0.9927, + "step": 228 + }, + { + "epoch": 0.018843859288212302, + "grad_norm": 2.8674094236769583, + "learning_rate": 1.2547945205479453e-05, + "loss": 1.003, + "step": 229 + }, + { + "epoch": 0.018926146883357333, + "grad_norm": 2.9710081363188703, + "learning_rate": 1.2602739726027398e-05, + "loss": 0.9844, + "step": 230 + }, + { + "epoch": 0.019008434478502367, + "grad_norm": 2.98201549226896, + "learning_rate": 1.2657534246575343e-05, + "loss": 0.967, + "step": 231 + }, + { + "epoch": 0.019090722073647398, + "grad_norm": 2.903452559676373, + "learning_rate": 1.271232876712329e-05, + "loss": 1.0102, + "step": 232 + }, + { + "epoch": 0.01917300966879243, + "grad_norm": 2.5049333400477813, + "learning_rate": 1.2767123287671235e-05, + "loss": 1.0096, + "step": 233 + }, + { + "epoch": 0.01925529726393746, + "grad_norm": 2.6342420325330522, + "learning_rate": 1.282191780821918e-05, + "loss": 0.9718, + "step": 234 + }, + { + "epoch": 0.019337584859082494, + "grad_norm": 2.616314817819011, + "learning_rate": 1.2876712328767125e-05, + "loss": 0.9977, + "step": 235 + }, + { + "epoch": 0.019419872454227525, + "grad_norm": 2.420031810864845, + "learning_rate": 1.293150684931507e-05, + "loss": 1.0117, + "step": 236 + }, + { + "epoch": 0.019502160049372556, + "grad_norm": 2.9412487319960126, + "learning_rate": 1.2986301369863015e-05, + "loss": 1.0471, + "step": 237 + }, + { + "epoch": 0.01958444764451759, + "grad_norm": 2.7984406162708906, + "learning_rate": 1.3041095890410959e-05, + "loss": 0.9501, + "step": 238 + }, + { + "epoch": 0.01966673523966262, + "grad_norm": 4.841561737416111, + "learning_rate": 1.3095890410958904e-05, + "loss": 1.0138, + "step": 239 + }, + { + "epoch": 0.01974902283480765, + "grad_norm": 2.1778156992905577, + "learning_rate": 1.3150684931506849e-05, + "loss": 1.0101, + "step": 240 + }, + { + "epoch": 0.019831310429952686, + "grad_norm": 2.67809296527932, + "learning_rate": 1.3205479452054794e-05, + "loss": 0.982, + "step": 241 + }, + { + "epoch": 0.019913598025097717, + "grad_norm": 2.738306662356033, + "learning_rate": 1.3260273972602743e-05, + "loss": 0.9953, + "step": 242 + }, + { + "epoch": 0.019995885620242747, + "grad_norm": 3.69258760845872, + "learning_rate": 1.3315068493150686e-05, + "loss": 0.9933, + "step": 243 + }, + { + "epoch": 0.020078173215387782, + "grad_norm": 3.4285570541743096, + "learning_rate": 1.3369863013698631e-05, + "loss": 0.9891, + "step": 244 + }, + { + "epoch": 0.020160460810532813, + "grad_norm": 2.1884703037736175, + "learning_rate": 1.3424657534246576e-05, + "loss": 0.9615, + "step": 245 + }, + { + "epoch": 0.020242748405677843, + "grad_norm": 2.278997433805173, + "learning_rate": 1.3479452054794521e-05, + "loss": 0.9984, + "step": 246 + }, + { + "epoch": 0.020325036000822878, + "grad_norm": 0.9732502137516167, + "learning_rate": 1.3534246575342466e-05, + "loss": 0.5964, + "step": 247 + }, + { + "epoch": 0.02040732359596791, + "grad_norm": 4.111007905694721, + "learning_rate": 1.3589041095890412e-05, + "loss": 1.03, + "step": 248 + }, + { + "epoch": 0.02048961119111294, + "grad_norm": 2.104309544659177, + "learning_rate": 1.3643835616438357e-05, + "loss": 0.9696, + "step": 249 + }, + { + "epoch": 0.02057189878625797, + "grad_norm": 2.5670779853119665, + "learning_rate": 1.3698630136986302e-05, + "loss": 0.9589, + "step": 250 + }, + { + "epoch": 0.020654186381403004, + "grad_norm": 2.7898261074191777, + "learning_rate": 1.3753424657534247e-05, + "loss": 1.0084, + "step": 251 + }, + { + "epoch": 0.020736473976548035, + "grad_norm": 3.2009246830375204, + "learning_rate": 1.3808219178082194e-05, + "loss": 0.9911, + "step": 252 + }, + { + "epoch": 0.020818761571693066, + "grad_norm": 3.1563797863262777, + "learning_rate": 1.3863013698630139e-05, + "loss": 0.9947, + "step": 253 + }, + { + "epoch": 0.0209010491668381, + "grad_norm": 3.193090081286074, + "learning_rate": 1.3917808219178084e-05, + "loss": 1.0069, + "step": 254 + }, + { + "epoch": 0.02098333676198313, + "grad_norm": 5.521797116199944, + "learning_rate": 1.3972602739726029e-05, + "loss": 0.9842, + "step": 255 + }, + { + "epoch": 0.021065624357128162, + "grad_norm": 1.243014761274919, + "learning_rate": 1.4027397260273974e-05, + "loss": 0.6147, + "step": 256 + }, + { + "epoch": 0.021147911952273196, + "grad_norm": 3.191364616862045, + "learning_rate": 1.4082191780821919e-05, + "loss": 0.974, + "step": 257 + }, + { + "epoch": 0.021230199547418227, + "grad_norm": 2.93570172220106, + "learning_rate": 1.4136986301369864e-05, + "loss": 0.9719, + "step": 258 + }, + { + "epoch": 0.021312487142563258, + "grad_norm": 4.468162617805659, + "learning_rate": 1.419178082191781e-05, + "loss": 0.9904, + "step": 259 + }, + { + "epoch": 0.021394774737708292, + "grad_norm": 2.2571244653960862, + "learning_rate": 1.4246575342465754e-05, + "loss": 0.9613, + "step": 260 + }, + { + "epoch": 0.021477062332853323, + "grad_norm": 4.467563699694284, + "learning_rate": 1.43013698630137e-05, + "loss": 0.9944, + "step": 261 + }, + { + "epoch": 0.021559349927998354, + "grad_norm": 0.68889362412214, + "learning_rate": 1.4356164383561646e-05, + "loss": 0.5789, + "step": 262 + }, + { + "epoch": 0.021641637523143385, + "grad_norm": 0.6373164384054985, + "learning_rate": 1.4410958904109591e-05, + "loss": 0.5688, + "step": 263 + }, + { + "epoch": 0.02172392511828842, + "grad_norm": 3.597782460566262, + "learning_rate": 1.4465753424657537e-05, + "loss": 0.9776, + "step": 264 + }, + { + "epoch": 0.02180621271343345, + "grad_norm": 2.7541673143111347, + "learning_rate": 1.4520547945205482e-05, + "loss": 0.9927, + "step": 265 + }, + { + "epoch": 0.02188850030857848, + "grad_norm": 0.6805788182804722, + "learning_rate": 1.4575342465753427e-05, + "loss": 0.5971, + "step": 266 + }, + { + "epoch": 0.021970787903723515, + "grad_norm": 2.725379141853366, + "learning_rate": 1.463013698630137e-05, + "loss": 0.9675, + "step": 267 + }, + { + "epoch": 0.022053075498868546, + "grad_norm": 4.08013853272879, + "learning_rate": 1.4684931506849315e-05, + "loss": 0.9786, + "step": 268 + }, + { + "epoch": 0.022135363094013576, + "grad_norm": 2.5492247984913483, + "learning_rate": 1.473972602739726e-05, + "loss": 0.9988, + "step": 269 + }, + { + "epoch": 0.02221765068915861, + "grad_norm": 3.8860413387854327, + "learning_rate": 1.4794520547945205e-05, + "loss": 0.9697, + "step": 270 + }, + { + "epoch": 0.02229993828430364, + "grad_norm": 3.0719505820425925, + "learning_rate": 1.484931506849315e-05, + "loss": 0.9778, + "step": 271 + }, + { + "epoch": 0.022382225879448672, + "grad_norm": 3.065813452275364, + "learning_rate": 1.4904109589041097e-05, + "loss": 1.0114, + "step": 272 + }, + { + "epoch": 0.022464513474593707, + "grad_norm": 3.119520514603019, + "learning_rate": 1.4958904109589042e-05, + "loss": 1.0143, + "step": 273 + }, + { + "epoch": 0.022546801069738737, + "grad_norm": 2.8059490672957823, + "learning_rate": 1.5013698630136988e-05, + "loss": 0.9815, + "step": 274 + }, + { + "epoch": 0.022629088664883768, + "grad_norm": 2.6271007340037706, + "learning_rate": 1.5068493150684933e-05, + "loss": 1.0251, + "step": 275 + }, + { + "epoch": 0.0227113762600288, + "grad_norm": 3.114887825941429, + "learning_rate": 1.5123287671232878e-05, + "loss": 0.9722, + "step": 276 + }, + { + "epoch": 0.022793663855173833, + "grad_norm": 3.222134871844559, + "learning_rate": 1.5178082191780823e-05, + "loss": 0.9895, + "step": 277 + }, + { + "epoch": 0.022875951450318864, + "grad_norm": 0.8596732284566506, + "learning_rate": 1.5232876712328768e-05, + "loss": 0.6421, + "step": 278 + }, + { + "epoch": 0.022958239045463895, + "grad_norm": 2.688881192050172, + "learning_rate": 1.5287671232876713e-05, + "loss": 0.9709, + "step": 279 + }, + { + "epoch": 0.02304052664060893, + "grad_norm": 0.5908184070761948, + "learning_rate": 1.5342465753424658e-05, + "loss": 0.5813, + "step": 280 + }, + { + "epoch": 0.02312281423575396, + "grad_norm": 2.5626042733441565, + "learning_rate": 1.5397260273972603e-05, + "loss": 1.0054, + "step": 281 + }, + { + "epoch": 0.02320510183089899, + "grad_norm": 0.6319032426639426, + "learning_rate": 1.545205479452055e-05, + "loss": 0.569, + "step": 282 + }, + { + "epoch": 0.023287389426044025, + "grad_norm": 3.381429029921771, + "learning_rate": 1.5506849315068497e-05, + "loss": 0.9924, + "step": 283 + }, + { + "epoch": 0.023369677021189056, + "grad_norm": 0.6893518849945868, + "learning_rate": 1.556164383561644e-05, + "loss": 0.5947, + "step": 284 + }, + { + "epoch": 0.023451964616334087, + "grad_norm": 0.6030322287256665, + "learning_rate": 1.5616438356164384e-05, + "loss": 0.5849, + "step": 285 + }, + { + "epoch": 0.02353425221147912, + "grad_norm": 2.584371231162671, + "learning_rate": 1.567123287671233e-05, + "loss": 1.0113, + "step": 286 + }, + { + "epoch": 0.023616539806624152, + "grad_norm": 2.617374246670965, + "learning_rate": 1.5726027397260274e-05, + "loss": 0.9952, + "step": 287 + }, + { + "epoch": 0.023698827401769183, + "grad_norm": 3.131756380862052, + "learning_rate": 1.578082191780822e-05, + "loss": 0.9978, + "step": 288 + }, + { + "epoch": 0.023781114996914217, + "grad_norm": 0.7149086621817794, + "learning_rate": 1.5835616438356164e-05, + "loss": 0.6005, + "step": 289 + }, + { + "epoch": 0.023863402592059248, + "grad_norm": 2.8572031223595804, + "learning_rate": 1.589041095890411e-05, + "loss": 0.9764, + "step": 290 + }, + { + "epoch": 0.02394569018720428, + "grad_norm": 3.0067656548078525, + "learning_rate": 1.5945205479452054e-05, + "loss": 0.9931, + "step": 291 + }, + { + "epoch": 0.02402797778234931, + "grad_norm": 2.9396448545767067, + "learning_rate": 1.6000000000000003e-05, + "loss": 1.0167, + "step": 292 + }, + { + "epoch": 0.024110265377494344, + "grad_norm": 2.551576593689318, + "learning_rate": 1.6054794520547948e-05, + "loss": 0.9652, + "step": 293 + }, + { + "epoch": 0.024192552972639374, + "grad_norm": 3.4929495312083376, + "learning_rate": 1.6109589041095893e-05, + "loss": 0.9741, + "step": 294 + }, + { + "epoch": 0.024274840567784405, + "grad_norm": 0.5986861672946895, + "learning_rate": 1.6164383561643838e-05, + "loss": 0.5967, + "step": 295 + }, + { + "epoch": 0.02435712816292944, + "grad_norm": 2.3369563375899163, + "learning_rate": 1.6219178082191783e-05, + "loss": 0.9541, + "step": 296 + }, + { + "epoch": 0.02443941575807447, + "grad_norm": 3.115001072277964, + "learning_rate": 1.6273972602739728e-05, + "loss": 1.002, + "step": 297 + }, + { + "epoch": 0.0245217033532195, + "grad_norm": 3.594307440216849, + "learning_rate": 1.6328767123287673e-05, + "loss": 0.9483, + "step": 298 + }, + { + "epoch": 0.024603990948364535, + "grad_norm": 2.4315114201324977, + "learning_rate": 1.638356164383562e-05, + "loss": 0.9844, + "step": 299 + }, + { + "epoch": 0.024686278543509566, + "grad_norm": 3.3312431748162528, + "learning_rate": 1.6438356164383563e-05, + "loss": 1.0031, + "step": 300 + }, + { + "epoch": 0.024768566138654597, + "grad_norm": 2.7478721222497695, + "learning_rate": 1.649315068493151e-05, + "loss": 0.9942, + "step": 301 + }, + { + "epoch": 0.02485085373379963, + "grad_norm": 2.7443057694383097, + "learning_rate": 1.6547945205479454e-05, + "loss": 0.9841, + "step": 302 + }, + { + "epoch": 0.024933141328944662, + "grad_norm": 2.5333469665657797, + "learning_rate": 1.66027397260274e-05, + "loss": 0.9751, + "step": 303 + }, + { + "epoch": 0.025015428924089693, + "grad_norm": 3.161735273370277, + "learning_rate": 1.6657534246575344e-05, + "loss": 0.9687, + "step": 304 + }, + { + "epoch": 0.025097716519234724, + "grad_norm": 2.6737823247108183, + "learning_rate": 1.671232876712329e-05, + "loss": 0.9787, + "step": 305 + }, + { + "epoch": 0.025180004114379758, + "grad_norm": 0.6510425400067263, + "learning_rate": 1.6767123287671234e-05, + "loss": 0.5622, + "step": 306 + }, + { + "epoch": 0.02526229170952479, + "grad_norm": 4.574909987598007, + "learning_rate": 1.682191780821918e-05, + "loss": 0.9643, + "step": 307 + }, + { + "epoch": 0.02534457930466982, + "grad_norm": 3.4438804774031935, + "learning_rate": 1.6876712328767124e-05, + "loss": 0.9615, + "step": 308 + }, + { + "epoch": 0.025426866899814854, + "grad_norm": 2.9285136796976015, + "learning_rate": 1.693150684931507e-05, + "loss": 0.9527, + "step": 309 + }, + { + "epoch": 0.025509154494959885, + "grad_norm": 2.779888649016243, + "learning_rate": 1.6986301369863014e-05, + "loss": 0.9544, + "step": 310 + }, + { + "epoch": 0.025591442090104916, + "grad_norm": 2.7248520567063848, + "learning_rate": 1.7041095890410963e-05, + "loss": 0.9473, + "step": 311 + }, + { + "epoch": 0.02567372968524995, + "grad_norm": 3.5709762174348954, + "learning_rate": 1.7095890410958908e-05, + "loss": 0.9575, + "step": 312 + }, + { + "epoch": 0.02575601728039498, + "grad_norm": 3.0856327234258827, + "learning_rate": 1.715068493150685e-05, + "loss": 0.9652, + "step": 313 + }, + { + "epoch": 0.02583830487554001, + "grad_norm": 2.2692448164089343, + "learning_rate": 1.7205479452054795e-05, + "loss": 0.9735, + "step": 314 + }, + { + "epoch": 0.025920592470685046, + "grad_norm": 5.769054110868784, + "learning_rate": 1.726027397260274e-05, + "loss": 0.9703, + "step": 315 + }, + { + "epoch": 0.026002880065830077, + "grad_norm": 2.508893910476298, + "learning_rate": 1.7315068493150685e-05, + "loss": 0.944, + "step": 316 + }, + { + "epoch": 0.026085167660975107, + "grad_norm": 2.8832916992173767, + "learning_rate": 1.736986301369863e-05, + "loss": 0.9646, + "step": 317 + }, + { + "epoch": 0.026167455256120138, + "grad_norm": 2.919174367177141, + "learning_rate": 1.7424657534246575e-05, + "loss": 0.9642, + "step": 318 + }, + { + "epoch": 0.026249742851265172, + "grad_norm": 2.3758292544134068, + "learning_rate": 1.747945205479452e-05, + "loss": 0.9819, + "step": 319 + }, + { + "epoch": 0.026332030446410203, + "grad_norm": 2.8844662683768822, + "learning_rate": 1.7534246575342465e-05, + "loss": 0.9757, + "step": 320 + }, + { + "epoch": 0.026414318041555234, + "grad_norm": 2.2651505276443964, + "learning_rate": 1.7589041095890414e-05, + "loss": 0.9461, + "step": 321 + }, + { + "epoch": 0.02649660563670027, + "grad_norm": 3.148064595511082, + "learning_rate": 1.764383561643836e-05, + "loss": 0.9457, + "step": 322 + }, + { + "epoch": 0.0265788932318453, + "grad_norm": 2.593793697550568, + "learning_rate": 1.7698630136986304e-05, + "loss": 0.9564, + "step": 323 + }, + { + "epoch": 0.02666118082699033, + "grad_norm": 3.5777764577994637, + "learning_rate": 1.775342465753425e-05, + "loss": 0.9585, + "step": 324 + }, + { + "epoch": 0.026743468422135364, + "grad_norm": 2.5200344733829434, + "learning_rate": 1.7808219178082194e-05, + "loss": 0.9429, + "step": 325 + }, + { + "epoch": 0.026825756017280395, + "grad_norm": 0.7344214528472546, + "learning_rate": 1.786301369863014e-05, + "loss": 0.6191, + "step": 326 + }, + { + "epoch": 0.026908043612425426, + "grad_norm": 3.3825851018048962, + "learning_rate": 1.7917808219178085e-05, + "loss": 0.9739, + "step": 327 + }, + { + "epoch": 0.02699033120757046, + "grad_norm": 2.4626600175420212, + "learning_rate": 1.797260273972603e-05, + "loss": 0.9813, + "step": 328 + }, + { + "epoch": 0.02707261880271549, + "grad_norm": 2.604744324101538, + "learning_rate": 1.8027397260273975e-05, + "loss": 0.9605, + "step": 329 + }, + { + "epoch": 0.027154906397860522, + "grad_norm": 2.3443898191922408, + "learning_rate": 1.808219178082192e-05, + "loss": 0.968, + "step": 330 + }, + { + "epoch": 0.027237193993005556, + "grad_norm": 2.2972121260527274, + "learning_rate": 1.8136986301369865e-05, + "loss": 0.9636, + "step": 331 + }, + { + "epoch": 0.027319481588150587, + "grad_norm": 0.6704215743863139, + "learning_rate": 1.819178082191781e-05, + "loss": 0.5832, + "step": 332 + }, + { + "epoch": 0.027401769183295618, + "grad_norm": 2.5588332490587806, + "learning_rate": 1.8246575342465755e-05, + "loss": 0.967, + "step": 333 + }, + { + "epoch": 0.02748405677844065, + "grad_norm": 0.5729720504764441, + "learning_rate": 1.83013698630137e-05, + "loss": 0.5796, + "step": 334 + }, + { + "epoch": 0.027566344373585683, + "grad_norm": 0.536934165288964, + "learning_rate": 1.8356164383561645e-05, + "loss": 0.586, + "step": 335 + }, + { + "epoch": 0.027648631968730714, + "grad_norm": 2.729927929300927, + "learning_rate": 1.841095890410959e-05, + "loss": 1.0006, + "step": 336 + }, + { + "epoch": 0.027730919563875744, + "grad_norm": 2.9380300033617193, + "learning_rate": 1.8465753424657535e-05, + "loss": 0.9806, + "step": 337 + }, + { + "epoch": 0.02781320715902078, + "grad_norm": 3.1871007449922595, + "learning_rate": 1.852054794520548e-05, + "loss": 1.0205, + "step": 338 + }, + { + "epoch": 0.02789549475416581, + "grad_norm": 2.7551362648970454, + "learning_rate": 1.8575342465753426e-05, + "loss": 0.9843, + "step": 339 + }, + { + "epoch": 0.02797778234931084, + "grad_norm": 2.341899316621362, + "learning_rate": 1.863013698630137e-05, + "loss": 0.9828, + "step": 340 + }, + { + "epoch": 0.028060069944455875, + "grad_norm": 3.0041315739517143, + "learning_rate": 1.8684931506849316e-05, + "loss": 0.9599, + "step": 341 + }, + { + "epoch": 0.028142357539600905, + "grad_norm": 1.098290342373438, + "learning_rate": 1.873972602739726e-05, + "loss": 0.5762, + "step": 342 + }, + { + "epoch": 0.028224645134745936, + "grad_norm": 2.793401629061216, + "learning_rate": 1.8794520547945206e-05, + "loss": 0.9599, + "step": 343 + }, + { + "epoch": 0.02830693272989097, + "grad_norm": 3.381992225466734, + "learning_rate": 1.884931506849315e-05, + "loss": 1.0128, + "step": 344 + }, + { + "epoch": 0.028389220325036, + "grad_norm": 3.0552921674313107, + "learning_rate": 1.8904109589041096e-05, + "loss": 0.9683, + "step": 345 + }, + { + "epoch": 0.028471507920181032, + "grad_norm": 2.59026883064129, + "learning_rate": 1.895890410958904e-05, + "loss": 0.9361, + "step": 346 + }, + { + "epoch": 0.028553795515326063, + "grad_norm": 3.0842540515307473, + "learning_rate": 1.9013698630136986e-05, + "loss": 0.9697, + "step": 347 + }, + { + "epoch": 0.028636083110471097, + "grad_norm": 2.443425049236279, + "learning_rate": 1.906849315068493e-05, + "loss": 0.9183, + "step": 348 + }, + { + "epoch": 0.028718370705616128, + "grad_norm": 3.127867492745528, + "learning_rate": 1.9123287671232877e-05, + "loss": 0.9601, + "step": 349 + }, + { + "epoch": 0.02880065830076116, + "grad_norm": 4.402570399866093, + "learning_rate": 1.9178082191780822e-05, + "loss": 0.9303, + "step": 350 + }, + { + "epoch": 0.028882945895906193, + "grad_norm": 0.8543818428159927, + "learning_rate": 1.923287671232877e-05, + "loss": 0.5988, + "step": 351 + }, + { + "epoch": 0.028965233491051224, + "grad_norm": 0.7093532126289934, + "learning_rate": 1.9287671232876715e-05, + "loss": 0.5831, + "step": 352 + }, + { + "epoch": 0.029047521086196255, + "grad_norm": 0.6407564149823172, + "learning_rate": 1.934246575342466e-05, + "loss": 0.577, + "step": 353 + }, + { + "epoch": 0.02912980868134129, + "grad_norm": 3.390283574742443, + "learning_rate": 1.9397260273972606e-05, + "loss": 0.9609, + "step": 354 + }, + { + "epoch": 0.02921209627648632, + "grad_norm": 2.53734497566345, + "learning_rate": 1.945205479452055e-05, + "loss": 0.9909, + "step": 355 + }, + { + "epoch": 0.02929438387163135, + "grad_norm": 1.0115473868573372, + "learning_rate": 1.9506849315068496e-05, + "loss": 0.6035, + "step": 356 + }, + { + "epoch": 0.029376671466776385, + "grad_norm": 0.8686466035185451, + "learning_rate": 1.956164383561644e-05, + "loss": 0.5971, + "step": 357 + }, + { + "epoch": 0.029458959061921416, + "grad_norm": 3.039718625814903, + "learning_rate": 1.9616438356164386e-05, + "loss": 0.9912, + "step": 358 + }, + { + "epoch": 0.029541246657066447, + "grad_norm": 3.1175114788948473, + "learning_rate": 1.967123287671233e-05, + "loss": 0.9866, + "step": 359 + }, + { + "epoch": 0.02962353425221148, + "grad_norm": 6.758106134116968, + "learning_rate": 1.9726027397260276e-05, + "loss": 0.9847, + "step": 360 + }, + { + "epoch": 0.02970582184735651, + "grad_norm": 2.589972092841794, + "learning_rate": 1.978082191780822e-05, + "loss": 0.9565, + "step": 361 + }, + { + "epoch": 0.029788109442501542, + "grad_norm": 1.073769179644345, + "learning_rate": 1.9835616438356166e-05, + "loss": 0.6201, + "step": 362 + }, + { + "epoch": 0.029870397037646573, + "grad_norm": 2.620541255700163, + "learning_rate": 1.989041095890411e-05, + "loss": 0.9694, + "step": 363 + }, + { + "epoch": 0.029952684632791608, + "grad_norm": 2.9983273469412, + "learning_rate": 1.9945205479452057e-05, + "loss": 0.9517, + "step": 364 + }, + { + "epoch": 0.03003497222793664, + "grad_norm": 3.1705127831701176, + "learning_rate": 2e-05, + "loss": 0.9757, + "step": 365 + }, + { + "epoch": 0.03011725982308167, + "grad_norm": 3.0769206086851493, + "learning_rate": 1.9999999644807997e-05, + "loss": 0.9725, + "step": 366 + }, + { + "epoch": 0.030199547418226703, + "grad_norm": 2.6381794624352346, + "learning_rate": 1.999999857923201e-05, + "loss": 0.9579, + "step": 367 + }, + { + "epoch": 0.030281835013371734, + "grad_norm": 2.524417719057271, + "learning_rate": 1.999999680327212e-05, + "loss": 0.9491, + "step": 368 + }, + { + "epoch": 0.030364122608516765, + "grad_norm": 2.0772737485337958, + "learning_rate": 1.9999994316928445e-05, + "loss": 0.9802, + "step": 369 + }, + { + "epoch": 0.0304464102036618, + "grad_norm": 0.695305872906948, + "learning_rate": 1.9999991120201172e-05, + "loss": 0.6179, + "step": 370 + }, + { + "epoch": 0.03052869779880683, + "grad_norm": 2.034367122214282, + "learning_rate": 1.999998721309052e-05, + "loss": 0.9365, + "step": 371 + }, + { + "epoch": 0.03061098539395186, + "grad_norm": 2.5094859416224096, + "learning_rate": 1.999998259559677e-05, + "loss": 0.9806, + "step": 372 + }, + { + "epoch": 0.030693272989096895, + "grad_norm": 2.037387180631793, + "learning_rate": 1.9999977267720245e-05, + "loss": 0.9625, + "step": 373 + }, + { + "epoch": 0.030775560584241926, + "grad_norm": 1.9827245047395246, + "learning_rate": 1.999997122946133e-05, + "loss": 0.996, + "step": 374 + }, + { + "epoch": 0.030857848179386957, + "grad_norm": 2.000201005705768, + "learning_rate": 1.9999964480820448e-05, + "loss": 0.9247, + "step": 375 + }, + { + "epoch": 0.030940135774531988, + "grad_norm": 2.237696098262905, + "learning_rate": 1.999995702179809e-05, + "loss": 0.9432, + "step": 376 + }, + { + "epoch": 0.031022423369677022, + "grad_norm": 2.1572992959011668, + "learning_rate": 1.999994885239477e-05, + "loss": 0.9567, + "step": 377 + }, + { + "epoch": 0.031104710964822053, + "grad_norm": 2.5949178993773656, + "learning_rate": 1.999993997261108e-05, + "loss": 0.9523, + "step": 378 + }, + { + "epoch": 0.031186998559967084, + "grad_norm": 4.412522046641788, + "learning_rate": 1.9999930382447644e-05, + "loss": 0.9463, + "step": 379 + }, + { + "epoch": 0.03126928615511212, + "grad_norm": 4.095975078147534, + "learning_rate": 1.9999920081905148e-05, + "loss": 0.9562, + "step": 380 + }, + { + "epoch": 0.03135157375025715, + "grad_norm": 0.7238222599759508, + "learning_rate": 1.999990907098432e-05, + "loss": 0.6367, + "step": 381 + }, + { + "epoch": 0.03143386134540218, + "grad_norm": 2.051737393292375, + "learning_rate": 1.9999897349685948e-05, + "loss": 0.9396, + "step": 382 + }, + { + "epoch": 0.03151614894054721, + "grad_norm": 3.608873989338571, + "learning_rate": 1.999988491801086e-05, + "loss": 0.9427, + "step": 383 + }, + { + "epoch": 0.03159843653569224, + "grad_norm": 0.5731166749659096, + "learning_rate": 1.999987177595994e-05, + "loss": 0.6066, + "step": 384 + }, + { + "epoch": 0.03168072413083728, + "grad_norm": 2.7911800909686244, + "learning_rate": 1.9999857923534117e-05, + "loss": 0.9553, + "step": 385 + }, + { + "epoch": 0.03176301172598231, + "grad_norm": 0.5640032520210956, + "learning_rate": 1.9999843360734384e-05, + "loss": 0.6089, + "step": 386 + }, + { + "epoch": 0.03184529932112734, + "grad_norm": 3.218289339029279, + "learning_rate": 1.999982808756177e-05, + "loss": 1.002, + "step": 387 + }, + { + "epoch": 0.03192758691627237, + "grad_norm": 0.5298496199217386, + "learning_rate": 1.999981210401736e-05, + "loss": 0.6014, + "step": 388 + }, + { + "epoch": 0.0320098745114174, + "grad_norm": 2.1651032679205544, + "learning_rate": 1.9999795410102288e-05, + "loss": 0.977, + "step": 389 + }, + { + "epoch": 0.03209216210656243, + "grad_norm": 3.0876660454466336, + "learning_rate": 1.999977800581775e-05, + "loss": 0.954, + "step": 390 + }, + { + "epoch": 0.03217444970170747, + "grad_norm": 2.8016809296721186, + "learning_rate": 1.999975989116497e-05, + "loss": 0.9773, + "step": 391 + }, + { + "epoch": 0.0322567372968525, + "grad_norm": 2.2686954346227584, + "learning_rate": 1.999974106614524e-05, + "loss": 0.9284, + "step": 392 + }, + { + "epoch": 0.03233902489199753, + "grad_norm": 2.848599719139828, + "learning_rate": 1.9999721530759896e-05, + "loss": 0.9666, + "step": 393 + }, + { + "epoch": 0.03242131248714256, + "grad_norm": 2.5480580332195792, + "learning_rate": 1.9999701285010327e-05, + "loss": 0.9748, + "step": 394 + }, + { + "epoch": 0.032503600082287594, + "grad_norm": 3.0659568674712587, + "learning_rate": 1.999968032889797e-05, + "loss": 0.9773, + "step": 395 + }, + { + "epoch": 0.032585887677432625, + "grad_norm": 3.2486686691126607, + "learning_rate": 1.9999658662424318e-05, + "loss": 0.9378, + "step": 396 + }, + { + "epoch": 0.032668175272577656, + "grad_norm": 2.231555735516029, + "learning_rate": 1.9999636285590903e-05, + "loss": 0.9402, + "step": 397 + }, + { + "epoch": 0.03275046286772269, + "grad_norm": 7.750954267677904, + "learning_rate": 1.999961319839932e-05, + "loss": 0.9212, + "step": 398 + }, + { + "epoch": 0.032832750462867724, + "grad_norm": 3.9379616174216747, + "learning_rate": 1.9999589400851208e-05, + "loss": 0.957, + "step": 399 + }, + { + "epoch": 0.032915038058012755, + "grad_norm": 3.09592161673104, + "learning_rate": 1.9999564892948254e-05, + "loss": 0.9644, + "step": 400 + }, + { + "epoch": 0.032997325653157786, + "grad_norm": 0.6258510816084707, + "learning_rate": 1.9999539674692206e-05, + "loss": 0.6, + "step": 401 + }, + { + "epoch": 0.03307961324830282, + "grad_norm": 2.757532242911201, + "learning_rate": 1.9999513746084848e-05, + "loss": 0.9627, + "step": 402 + }, + { + "epoch": 0.03316190084344785, + "grad_norm": 0.518069489983011, + "learning_rate": 1.999948710712803e-05, + "loss": 0.5736, + "step": 403 + }, + { + "epoch": 0.033244188438592885, + "grad_norm": 2.7302377830347293, + "learning_rate": 1.9999459757823632e-05, + "loss": 0.9452, + "step": 404 + }, + { + "epoch": 0.033326476033737916, + "grad_norm": 3.8829507326351678, + "learning_rate": 1.9999431698173614e-05, + "loss": 0.9501, + "step": 405 + }, + { + "epoch": 0.03340876362888295, + "grad_norm": 3.030860642634053, + "learning_rate": 1.9999402928179953e-05, + "loss": 0.935, + "step": 406 + }, + { + "epoch": 0.03349105122402798, + "grad_norm": 2.7297517789446735, + "learning_rate": 1.99993734478447e-05, + "loss": 0.9816, + "step": 407 + }, + { + "epoch": 0.03357333881917301, + "grad_norm": 2.9131211283428864, + "learning_rate": 1.999934325716995e-05, + "loss": 0.953, + "step": 408 + }, + { + "epoch": 0.03365562641431804, + "grad_norm": 2.8724758175032457, + "learning_rate": 1.999931235615785e-05, + "loss": 0.9543, + "step": 409 + }, + { + "epoch": 0.03373791400946307, + "grad_norm": 3.8558067751787894, + "learning_rate": 1.999928074481059e-05, + "loss": 0.9024, + "step": 410 + }, + { + "epoch": 0.03382020160460811, + "grad_norm": 4.890426251595657, + "learning_rate": 1.9999248423130414e-05, + "loss": 0.9557, + "step": 411 + }, + { + "epoch": 0.03390248919975314, + "grad_norm": 3.9224502088816307, + "learning_rate": 1.9999215391119623e-05, + "loss": 0.9625, + "step": 412 + }, + { + "epoch": 0.03398477679489817, + "grad_norm": 4.121169405356662, + "learning_rate": 1.9999181648780564e-05, + "loss": 0.9836, + "step": 413 + }, + { + "epoch": 0.0340670643900432, + "grad_norm": 3.2570143865225365, + "learning_rate": 1.999914719611563e-05, + "loss": 0.9548, + "step": 414 + }, + { + "epoch": 0.03414935198518823, + "grad_norm": 0.8551591188426197, + "learning_rate": 1.999911203312727e-05, + "loss": 0.6257, + "step": 415 + }, + { + "epoch": 0.03423163958033326, + "grad_norm": 2.282348243685617, + "learning_rate": 1.9999076159817984e-05, + "loss": 0.9534, + "step": 416 + }, + { + "epoch": 0.0343139271754783, + "grad_norm": 3.1849388817078417, + "learning_rate": 1.999903957619032e-05, + "loss": 0.9559, + "step": 417 + }, + { + "epoch": 0.03439621477062333, + "grad_norm": 3.0160267374462744, + "learning_rate": 1.9999002282246877e-05, + "loss": 0.9414, + "step": 418 + }, + { + "epoch": 0.03447850236576836, + "grad_norm": 2.8630460192439484, + "learning_rate": 1.99989642779903e-05, + "loss": 0.97, + "step": 419 + }, + { + "epoch": 0.03456078996091339, + "grad_norm": 0.6092993503428186, + "learning_rate": 1.999892556342329e-05, + "loss": 0.5762, + "step": 420 + }, + { + "epoch": 0.03464307755605842, + "grad_norm": 3.558089457861364, + "learning_rate": 1.9998886138548597e-05, + "loss": 0.9674, + "step": 421 + }, + { + "epoch": 0.034725365151203454, + "grad_norm": 0.5392883644170888, + "learning_rate": 1.9998846003369028e-05, + "loss": 0.6002, + "step": 422 + }, + { + "epoch": 0.03480765274634849, + "grad_norm": 2.4265611825364175, + "learning_rate": 1.9998805157887432e-05, + "loss": 0.9469, + "step": 423 + }, + { + "epoch": 0.03488994034149352, + "grad_norm": 2.5084390180607508, + "learning_rate": 1.9998763602106704e-05, + "loss": 0.9547, + "step": 424 + }, + { + "epoch": 0.03497222793663855, + "grad_norm": 3.0592802155387284, + "learning_rate": 1.99987213360298e-05, + "loss": 0.9549, + "step": 425 + }, + { + "epoch": 0.035054515531783584, + "grad_norm": 3.0606106243138353, + "learning_rate": 1.9998678359659726e-05, + "loss": 0.925, + "step": 426 + }, + { + "epoch": 0.035136803126928615, + "grad_norm": 0.5614840770252022, + "learning_rate": 1.999863467299953e-05, + "loss": 0.6226, + "step": 427 + }, + { + "epoch": 0.035219090722073645, + "grad_norm": 2.3274481514972636, + "learning_rate": 1.9998590276052318e-05, + "loss": 0.9627, + "step": 428 + }, + { + "epoch": 0.035301378317218676, + "grad_norm": 0.5247325522573751, + "learning_rate": 1.999854516882124e-05, + "loss": 0.5626, + "step": 429 + }, + { + "epoch": 0.035383665912363714, + "grad_norm": 2.4963541117374635, + "learning_rate": 1.999849935130951e-05, + "loss": 0.9198, + "step": 430 + }, + { + "epoch": 0.035465953507508745, + "grad_norm": 2.470517097187284, + "learning_rate": 1.999845282352037e-05, + "loss": 0.9433, + "step": 431 + }, + { + "epoch": 0.035548241102653776, + "grad_norm": 2.7560008424762183, + "learning_rate": 1.9998405585457134e-05, + "loss": 0.9428, + "step": 432 + }, + { + "epoch": 0.035630528697798806, + "grad_norm": 2.7637029961336226, + "learning_rate": 1.9998357637123157e-05, + "loss": 0.942, + "step": 433 + }, + { + "epoch": 0.03571281629294384, + "grad_norm": 2.9100289752309045, + "learning_rate": 1.9998308978521842e-05, + "loss": 0.9457, + "step": 434 + }, + { + "epoch": 0.03579510388808887, + "grad_norm": 4.313071561196342, + "learning_rate": 1.9998259609656645e-05, + "loss": 0.9367, + "step": 435 + }, + { + "epoch": 0.035877391483233906, + "grad_norm": 2.9430306639688384, + "learning_rate": 1.999820953053108e-05, + "loss": 0.9292, + "step": 436 + }, + { + "epoch": 0.03595967907837894, + "grad_norm": 3.336500502830984, + "learning_rate": 1.9998158741148695e-05, + "loss": 0.9517, + "step": 437 + }, + { + "epoch": 0.03604196667352397, + "grad_norm": 2.830315148432978, + "learning_rate": 1.99981072415131e-05, + "loss": 0.9619, + "step": 438 + }, + { + "epoch": 0.036124254268669, + "grad_norm": 2.9628110908182506, + "learning_rate": 1.9998055031627964e-05, + "loss": 0.9342, + "step": 439 + }, + { + "epoch": 0.03620654186381403, + "grad_norm": 5.046468138436623, + "learning_rate": 1.9998002111496986e-05, + "loss": 0.9577, + "step": 440 + }, + { + "epoch": 0.03628882945895906, + "grad_norm": 3.1781915402537324, + "learning_rate": 1.9997948481123925e-05, + "loss": 0.9275, + "step": 441 + }, + { + "epoch": 0.03637111705410409, + "grad_norm": 3.291481831836819, + "learning_rate": 1.9997894140512595e-05, + "loss": 0.9504, + "step": 442 + }, + { + "epoch": 0.03645340464924913, + "grad_norm": 3.1084220240196254, + "learning_rate": 1.9997839089666854e-05, + "loss": 0.9236, + "step": 443 + }, + { + "epoch": 0.03653569224439416, + "grad_norm": 3.1887037749162093, + "learning_rate": 1.9997783328590613e-05, + "loss": 0.8855, + "step": 444 + }, + { + "epoch": 0.03661797983953919, + "grad_norm": 3.305256714504642, + "learning_rate": 1.9997726857287834e-05, + "loss": 0.9552, + "step": 445 + }, + { + "epoch": 0.03670026743468422, + "grad_norm": 4.754531864085289, + "learning_rate": 1.9997669675762528e-05, + "loss": 0.9504, + "step": 446 + }, + { + "epoch": 0.03678255502982925, + "grad_norm": 2.474649426046985, + "learning_rate": 1.9997611784018754e-05, + "loss": 0.9518, + "step": 447 + }, + { + "epoch": 0.03686484262497428, + "grad_norm": 2.880288649426941, + "learning_rate": 1.9997553182060633e-05, + "loss": 0.8702, + "step": 448 + }, + { + "epoch": 0.03694713022011932, + "grad_norm": 2.9619541365703976, + "learning_rate": 1.999749386989232e-05, + "loss": 0.948, + "step": 449 + }, + { + "epoch": 0.03702941781526435, + "grad_norm": 3.0040457692945552, + "learning_rate": 1.999743384751803e-05, + "loss": 0.9161, + "step": 450 + }, + { + "epoch": 0.03711170541040938, + "grad_norm": 0.6917840645754628, + "learning_rate": 1.999737311494203e-05, + "loss": 0.5999, + "step": 451 + }, + { + "epoch": 0.03719399300555441, + "grad_norm": 2.500969399378362, + "learning_rate": 1.9997311672168632e-05, + "loss": 0.9321, + "step": 452 + }, + { + "epoch": 0.037276280600699443, + "grad_norm": 3.4756867592830076, + "learning_rate": 1.99972495192022e-05, + "loss": 0.9468, + "step": 453 + }, + { + "epoch": 0.037358568195844474, + "grad_norm": 2.4507954914499974, + "learning_rate": 1.9997186656047154e-05, + "loss": 0.9367, + "step": 454 + }, + { + "epoch": 0.037440855790989505, + "grad_norm": 2.3319357748120066, + "learning_rate": 1.9997123082707954e-05, + "loss": 0.9506, + "step": 455 + }, + { + "epoch": 0.03752314338613454, + "grad_norm": 2.4614553831803896, + "learning_rate": 1.999705879918912e-05, + "loss": 0.9812, + "step": 456 + }, + { + "epoch": 0.037605430981279574, + "grad_norm": 2.7421103733102665, + "learning_rate": 1.999699380549521e-05, + "loss": 0.975, + "step": 457 + }, + { + "epoch": 0.037687718576424604, + "grad_norm": 3.193134683800622, + "learning_rate": 1.9996928101630853e-05, + "loss": 0.9462, + "step": 458 + }, + { + "epoch": 0.037770006171569635, + "grad_norm": 2.4788434065823353, + "learning_rate": 1.999686168760071e-05, + "loss": 0.9442, + "step": 459 + }, + { + "epoch": 0.037852293766714666, + "grad_norm": 2.67715161966991, + "learning_rate": 1.99967945634095e-05, + "loss": 0.9497, + "step": 460 + }, + { + "epoch": 0.0379345813618597, + "grad_norm": 2.8286753306256234, + "learning_rate": 1.9996726729061995e-05, + "loss": 0.9371, + "step": 461 + }, + { + "epoch": 0.038016868957004735, + "grad_norm": 2.494636914608068, + "learning_rate": 1.999665818456301e-05, + "loss": 0.9369, + "step": 462 + }, + { + "epoch": 0.038099156552149765, + "grad_norm": 3.3684641604813312, + "learning_rate": 1.9996588929917413e-05, + "loss": 0.9167, + "step": 463 + }, + { + "epoch": 0.038181444147294796, + "grad_norm": 2.8300347810651836, + "learning_rate": 1.9996518965130126e-05, + "loss": 0.96, + "step": 464 + }, + { + "epoch": 0.03826373174243983, + "grad_norm": 2.7216914732590634, + "learning_rate": 1.9996448290206117e-05, + "loss": 0.9587, + "step": 465 + }, + { + "epoch": 0.03834601933758486, + "grad_norm": 2.8897584926398223, + "learning_rate": 1.999637690515041e-05, + "loss": 0.9424, + "step": 466 + }, + { + "epoch": 0.03842830693272989, + "grad_norm": 2.6782745713753364, + "learning_rate": 1.9996304809968074e-05, + "loss": 0.9421, + "step": 467 + }, + { + "epoch": 0.03851059452787492, + "grad_norm": 0.8391702922649521, + "learning_rate": 1.9996232004664232e-05, + "loss": 0.6291, + "step": 468 + }, + { + "epoch": 0.03859288212301996, + "grad_norm": 2.9110538284406213, + "learning_rate": 1.9996158489244054e-05, + "loss": 0.9548, + "step": 469 + }, + { + "epoch": 0.03867516971816499, + "grad_norm": 2.9735024191976813, + "learning_rate": 1.9996084263712764e-05, + "loss": 0.9397, + "step": 470 + }, + { + "epoch": 0.03875745731331002, + "grad_norm": 2.459802449779267, + "learning_rate": 1.9996009328075635e-05, + "loss": 0.9516, + "step": 471 + }, + { + "epoch": 0.03883974490845505, + "grad_norm": 1.4795476906818943, + "learning_rate": 1.999593368233799e-05, + "loss": 0.6175, + "step": 472 + }, + { + "epoch": 0.03892203250360008, + "grad_norm": 2.7329559825050844, + "learning_rate": 1.9995857326505202e-05, + "loss": 0.9279, + "step": 473 + }, + { + "epoch": 0.03900432009874511, + "grad_norm": 2.7310837617231307, + "learning_rate": 1.999578026058269e-05, + "loss": 0.9325, + "step": 474 + }, + { + "epoch": 0.03908660769389015, + "grad_norm": 3.580150174543716, + "learning_rate": 1.999570248457594e-05, + "loss": 0.9403, + "step": 475 + }, + { + "epoch": 0.03916889528903518, + "grad_norm": 3.518367412394758, + "learning_rate": 1.9995623998490473e-05, + "loss": 0.9346, + "step": 476 + }, + { + "epoch": 0.03925118288418021, + "grad_norm": 2.1655004063703167, + "learning_rate": 1.999554480233186e-05, + "loss": 0.9294, + "step": 477 + }, + { + "epoch": 0.03933347047932524, + "grad_norm": 2.857429287491222, + "learning_rate": 1.9995464896105727e-05, + "loss": 0.9201, + "step": 478 + }, + { + "epoch": 0.03941575807447027, + "grad_norm": 2.3230944603500094, + "learning_rate": 1.999538427981776e-05, + "loss": 0.9172, + "step": 479 + }, + { + "epoch": 0.0394980456696153, + "grad_norm": 2.686091492583088, + "learning_rate": 1.9995302953473673e-05, + "loss": 0.7009, + "step": 480 + }, + { + "epoch": 0.039580333264760334, + "grad_norm": 2.5370139223659445, + "learning_rate": 1.999522091707925e-05, + "loss": 0.9547, + "step": 481 + }, + { + "epoch": 0.03966262085990537, + "grad_norm": 2.9114624346952787, + "learning_rate": 1.9995138170640322e-05, + "loss": 0.9309, + "step": 482 + }, + { + "epoch": 0.0397449084550504, + "grad_norm": 2.636772148383987, + "learning_rate": 1.9995054714162757e-05, + "loss": 0.9224, + "step": 483 + }, + { + "epoch": 0.03982719605019543, + "grad_norm": 2.3887969483327005, + "learning_rate": 1.9994970547652495e-05, + "loss": 0.9509, + "step": 484 + }, + { + "epoch": 0.039909483645340464, + "grad_norm": 2.9497130431080256, + "learning_rate": 1.9994885671115506e-05, + "loss": 0.9693, + "step": 485 + }, + { + "epoch": 0.039991771240485495, + "grad_norm": 2.225873777913106, + "learning_rate": 1.9994800084557826e-05, + "loss": 0.9382, + "step": 486 + }, + { + "epoch": 0.040074058835630526, + "grad_norm": 3.015548118510522, + "learning_rate": 1.9994713787985534e-05, + "loss": 0.9084, + "step": 487 + }, + { + "epoch": 0.040156346430775564, + "grad_norm": 3.2147762822609787, + "learning_rate": 1.9994626781404754e-05, + "loss": 0.9432, + "step": 488 + }, + { + "epoch": 0.040238634025920594, + "grad_norm": 2.732749831828487, + "learning_rate": 1.9994539064821676e-05, + "loss": 0.9493, + "step": 489 + }, + { + "epoch": 0.040320921621065625, + "grad_norm": 2.718095114325169, + "learning_rate": 1.9994450638242524e-05, + "loss": 0.6999, + "step": 490 + }, + { + "epoch": 0.040403209216210656, + "grad_norm": 1.192110613853859, + "learning_rate": 1.9994361501673586e-05, + "loss": 0.606, + "step": 491 + }, + { + "epoch": 0.04048549681135569, + "grad_norm": 2.6545275290481523, + "learning_rate": 1.9994271655121187e-05, + "loss": 0.9562, + "step": 492 + }, + { + "epoch": 0.04056778440650072, + "grad_norm": 2.6306786770452217, + "learning_rate": 1.999418109859171e-05, + "loss": 0.932, + "step": 493 + }, + { + "epoch": 0.040650072001645755, + "grad_norm": 0.7723300623794189, + "learning_rate": 1.99940898320916e-05, + "loss": 0.6167, + "step": 494 + }, + { + "epoch": 0.040732359596790786, + "grad_norm": 3.4539680548732075, + "learning_rate": 1.9993997855627323e-05, + "loss": 0.9547, + "step": 495 + }, + { + "epoch": 0.04081464719193582, + "grad_norm": 8.174151834055909, + "learning_rate": 1.9993905169205425e-05, + "loss": 0.9532, + "step": 496 + }, + { + "epoch": 0.04089693478708085, + "grad_norm": 2.4333462034983517, + "learning_rate": 1.9993811772832487e-05, + "loss": 0.9201, + "step": 497 + }, + { + "epoch": 0.04097922238222588, + "grad_norm": 2.621241890180304, + "learning_rate": 1.9993717666515143e-05, + "loss": 0.9336, + "step": 498 + }, + { + "epoch": 0.04106150997737091, + "grad_norm": 2.8830815398438308, + "learning_rate": 1.999362285026008e-05, + "loss": 0.9254, + "step": 499 + }, + { + "epoch": 0.04114379757251594, + "grad_norm": 3.0315366250694136, + "learning_rate": 1.9993527324074028e-05, + "loss": 0.9272, + "step": 500 + }, + { + "epoch": 0.04122608516766098, + "grad_norm": 2.657554413096405, + "learning_rate": 1.999343108796378e-05, + "loss": 0.9462, + "step": 501 + }, + { + "epoch": 0.04130837276280601, + "grad_norm": 2.905472644448609, + "learning_rate": 1.999333414193617e-05, + "loss": 0.9034, + "step": 502 + }, + { + "epoch": 0.04139066035795104, + "grad_norm": 3.925086807406567, + "learning_rate": 1.9993236485998085e-05, + "loss": 0.9315, + "step": 503 + }, + { + "epoch": 0.04147294795309607, + "grad_norm": 3.0313048521155146, + "learning_rate": 1.999313812015646e-05, + "loss": 0.9535, + "step": 504 + }, + { + "epoch": 0.0415552355482411, + "grad_norm": 2.962993951360446, + "learning_rate": 1.9993039044418286e-05, + "loss": 0.9309, + "step": 505 + }, + { + "epoch": 0.04163752314338613, + "grad_norm": 0.6779011051688715, + "learning_rate": 1.99929392587906e-05, + "loss": 0.5869, + "step": 506 + }, + { + "epoch": 0.04171981073853117, + "grad_norm": 2.579639640184937, + "learning_rate": 1.9992838763280488e-05, + "loss": 0.9118, + "step": 507 + }, + { + "epoch": 0.0418020983336762, + "grad_norm": 2.1450772300859655, + "learning_rate": 1.9992737557895093e-05, + "loss": 0.932, + "step": 508 + }, + { + "epoch": 0.04188438592882123, + "grad_norm": 2.4058977622816977, + "learning_rate": 1.9992635642641605e-05, + "loss": 0.9301, + "step": 509 + }, + { + "epoch": 0.04196667352396626, + "grad_norm": 2.4723871593300584, + "learning_rate": 1.999253301752726e-05, + "loss": 0.9362, + "step": 510 + }, + { + "epoch": 0.04204896111911129, + "grad_norm": 2.7787980954607616, + "learning_rate": 1.999242968255935e-05, + "loss": 0.949, + "step": 511 + }, + { + "epoch": 0.042131248714256324, + "grad_norm": 2.7091957078534783, + "learning_rate": 1.9992325637745214e-05, + "loss": 0.8939, + "step": 512 + }, + { + "epoch": 0.042213536309401355, + "grad_norm": 3.104398485557938, + "learning_rate": 1.9992220883092247e-05, + "loss": 0.9201, + "step": 513 + }, + { + "epoch": 0.04229582390454639, + "grad_norm": 2.688893801232366, + "learning_rate": 1.9992115418607886e-05, + "loss": 0.9314, + "step": 514 + }, + { + "epoch": 0.04237811149969142, + "grad_norm": 0.6175757936794599, + "learning_rate": 1.999200924429963e-05, + "loss": 0.5823, + "step": 515 + }, + { + "epoch": 0.042460399094836454, + "grad_norm": 2.134638530502557, + "learning_rate": 1.9991902360175017e-05, + "loss": 0.8988, + "step": 516 + }, + { + "epoch": 0.042542686689981485, + "grad_norm": 2.660777130272323, + "learning_rate": 1.9991794766241638e-05, + "loss": 0.9058, + "step": 517 + }, + { + "epoch": 0.042624974285126516, + "grad_norm": 2.519959303045957, + "learning_rate": 1.9991686462507137e-05, + "loss": 0.9157, + "step": 518 + }, + { + "epoch": 0.042707261880271546, + "grad_norm": 0.5033254525320345, + "learning_rate": 1.9991577448979213e-05, + "loss": 0.5637, + "step": 519 + }, + { + "epoch": 0.042789549475416584, + "grad_norm": 2.3638963921206777, + "learning_rate": 1.9991467725665604e-05, + "loss": 0.9532, + "step": 520 + }, + { + "epoch": 0.042871837070561615, + "grad_norm": 2.760667379358993, + "learning_rate": 1.9991357292574106e-05, + "loss": 0.9194, + "step": 521 + }, + { + "epoch": 0.042954124665706646, + "grad_norm": 2.285449190484726, + "learning_rate": 1.9991246149712564e-05, + "loss": 0.854, + "step": 522 + }, + { + "epoch": 0.04303641226085168, + "grad_norm": 2.9222709070685315, + "learning_rate": 1.9991134297088877e-05, + "loss": 0.9534, + "step": 523 + }, + { + "epoch": 0.04311869985599671, + "grad_norm": 3.1630611007009355, + "learning_rate": 1.9991021734710988e-05, + "loss": 0.9505, + "step": 524 + }, + { + "epoch": 0.04320098745114174, + "grad_norm": 3.174869013367673, + "learning_rate": 1.999090846258689e-05, + "loss": 0.964, + "step": 525 + }, + { + "epoch": 0.04328327504628677, + "grad_norm": 2.4328576962151693, + "learning_rate": 1.9990794480724634e-05, + "loss": 0.9084, + "step": 526 + }, + { + "epoch": 0.04336556264143181, + "grad_norm": 0.5700103881605539, + "learning_rate": 1.9990679789132317e-05, + "loss": 0.5734, + "step": 527 + }, + { + "epoch": 0.04344785023657684, + "grad_norm": 2.392627489613796, + "learning_rate": 1.9990564387818087e-05, + "loss": 0.916, + "step": 528 + }, + { + "epoch": 0.04353013783172187, + "grad_norm": 3.2074775648239453, + "learning_rate": 1.999044827679014e-05, + "loss": 0.9095, + "step": 529 + }, + { + "epoch": 0.0436124254268669, + "grad_norm": 3.140601191667111, + "learning_rate": 1.999033145605672e-05, + "loss": 0.904, + "step": 530 + }, + { + "epoch": 0.04369471302201193, + "grad_norm": 2.3743918081273505, + "learning_rate": 1.9990213925626135e-05, + "loss": 0.9173, + "step": 531 + }, + { + "epoch": 0.04377700061715696, + "grad_norm": 2.803625633325397, + "learning_rate": 1.999009568550673e-05, + "loss": 0.9425, + "step": 532 + }, + { + "epoch": 0.043859288212302, + "grad_norm": 2.624304052527756, + "learning_rate": 1.9989976735706903e-05, + "loss": 0.8778, + "step": 533 + }, + { + "epoch": 0.04394157580744703, + "grad_norm": 3.611007788459353, + "learning_rate": 1.9989857076235105e-05, + "loss": 0.9454, + "step": 534 + }, + { + "epoch": 0.04402386340259206, + "grad_norm": 3.0477796789876885, + "learning_rate": 1.9989736707099836e-05, + "loss": 0.9301, + "step": 535 + }, + { + "epoch": 0.04410615099773709, + "grad_norm": 3.661229035903915, + "learning_rate": 1.998961562830965e-05, + "loss": 0.9234, + "step": 536 + }, + { + "epoch": 0.04418843859288212, + "grad_norm": 3.014314493078093, + "learning_rate": 1.9989493839873144e-05, + "loss": 0.9205, + "step": 537 + }, + { + "epoch": 0.04427072618802715, + "grad_norm": 3.1607667446866348, + "learning_rate": 1.998937134179897e-05, + "loss": 0.9184, + "step": 538 + }, + { + "epoch": 0.044353013783172184, + "grad_norm": 0.5679302245778807, + "learning_rate": 1.9989248134095835e-05, + "loss": 0.5808, + "step": 539 + }, + { + "epoch": 0.04443530137831722, + "grad_norm": 3.4927267069905827, + "learning_rate": 1.9989124216772486e-05, + "loss": 0.9068, + "step": 540 + }, + { + "epoch": 0.04451758897346225, + "grad_norm": 3.2792902354283524, + "learning_rate": 1.9988999589837727e-05, + "loss": 0.9441, + "step": 541 + }, + { + "epoch": 0.04459987656860728, + "grad_norm": 3.2813608886269465, + "learning_rate": 1.9988874253300415e-05, + "loss": 0.9135, + "step": 542 + }, + { + "epoch": 0.044682164163752314, + "grad_norm": 3.6532563430030387, + "learning_rate": 1.9988748207169448e-05, + "loss": 0.9124, + "step": 543 + }, + { + "epoch": 0.044764451758897345, + "grad_norm": 3.0411510483789708, + "learning_rate": 1.9988621451453783e-05, + "loss": 0.9437, + "step": 544 + }, + { + "epoch": 0.044846739354042375, + "grad_norm": 2.947067350806481, + "learning_rate": 1.9988493986162426e-05, + "loss": 0.9377, + "step": 545 + }, + { + "epoch": 0.04492902694918741, + "grad_norm": 3.733984375480931, + "learning_rate": 1.9988365811304434e-05, + "loss": 0.9302, + "step": 546 + }, + { + "epoch": 0.045011314544332444, + "grad_norm": 0.5973399530190582, + "learning_rate": 1.99882369268889e-05, + "loss": 0.5985, + "step": 547 + }, + { + "epoch": 0.045093602139477475, + "grad_norm": 3.1946558451893483, + "learning_rate": 1.9988107332924997e-05, + "loss": 0.9306, + "step": 548 + }, + { + "epoch": 0.045175889734622506, + "grad_norm": 3.0518182224655184, + "learning_rate": 1.998797702942192e-05, + "loss": 0.9238, + "step": 549 + }, + { + "epoch": 0.045258177329767536, + "grad_norm": 0.5186994011171457, + "learning_rate": 1.9987846016388927e-05, + "loss": 0.5534, + "step": 550 + }, + { + "epoch": 0.04534046492491257, + "grad_norm": 2.9538180602678072, + "learning_rate": 1.9987714293835326e-05, + "loss": 0.9131, + "step": 551 + }, + { + "epoch": 0.0454227525200576, + "grad_norm": 3.583039419798021, + "learning_rate": 1.9987581861770476e-05, + "loss": 0.931, + "step": 552 + }, + { + "epoch": 0.045505040115202636, + "grad_norm": 3.872167117824797, + "learning_rate": 1.9987448720203783e-05, + "loss": 0.9149, + "step": 553 + }, + { + "epoch": 0.045587327710347667, + "grad_norm": 0.5153323660807152, + "learning_rate": 1.9987314869144704e-05, + "loss": 0.5707, + "step": 554 + }, + { + "epoch": 0.0456696153054927, + "grad_norm": 3.2458016621373162, + "learning_rate": 1.9987180308602752e-05, + "loss": 0.9481, + "step": 555 + }, + { + "epoch": 0.04575190290063773, + "grad_norm": 0.5131089745749331, + "learning_rate": 1.998704503858748e-05, + "loss": 0.6107, + "step": 556 + }, + { + "epoch": 0.04583419049578276, + "grad_norm": 3.826718669936501, + "learning_rate": 1.99869090591085e-05, + "loss": 0.9334, + "step": 557 + }, + { + "epoch": 0.04591647809092779, + "grad_norm": 2.808877894852513, + "learning_rate": 1.9986772370175475e-05, + "loss": 0.9313, + "step": 558 + }, + { + "epoch": 0.04599876568607283, + "grad_norm": 3.429756806838896, + "learning_rate": 1.998663497179811e-05, + "loss": 0.9041, + "step": 559 + }, + { + "epoch": 0.04608105328121786, + "grad_norm": 3.927553685701978, + "learning_rate": 1.998649686398617e-05, + "loss": 0.9229, + "step": 560 + }, + { + "epoch": 0.04616334087636289, + "grad_norm": 4.358404357254217, + "learning_rate": 1.9986358046749463e-05, + "loss": 0.9453, + "step": 561 + }, + { + "epoch": 0.04624562847150792, + "grad_norm": 0.6974205247527027, + "learning_rate": 1.998621852009785e-05, + "loss": 0.582, + "step": 562 + }, + { + "epoch": 0.04632791606665295, + "grad_norm": 2.8790199811794213, + "learning_rate": 1.9986078284041245e-05, + "loss": 0.9073, + "step": 563 + }, + { + "epoch": 0.04641020366179798, + "grad_norm": 3.1507198941552343, + "learning_rate": 1.998593733858961e-05, + "loss": 0.9285, + "step": 564 + }, + { + "epoch": 0.04649249125694301, + "grad_norm": 3.3010925203438757, + "learning_rate": 1.9985795683752955e-05, + "loss": 0.8975, + "step": 565 + }, + { + "epoch": 0.04657477885208805, + "grad_norm": 2.4173724120050277, + "learning_rate": 1.9985653319541345e-05, + "loss": 0.9211, + "step": 566 + }, + { + "epoch": 0.04665706644723308, + "grad_norm": 3.219239778661617, + "learning_rate": 1.9985510245964894e-05, + "loss": 0.9414, + "step": 567 + }, + { + "epoch": 0.04673935404237811, + "grad_norm": 4.702680418398121, + "learning_rate": 1.9985366463033763e-05, + "loss": 0.8886, + "step": 568 + }, + { + "epoch": 0.04682164163752314, + "grad_norm": 2.946137626961066, + "learning_rate": 1.9985221970758166e-05, + "loss": 0.907, + "step": 569 + }, + { + "epoch": 0.04690392923266817, + "grad_norm": 3.1637086789258224, + "learning_rate": 1.9985076769148373e-05, + "loss": 0.9063, + "step": 570 + }, + { + "epoch": 0.046986216827813204, + "grad_norm": 2.7457117180469286, + "learning_rate": 1.9984930858214695e-05, + "loss": 0.9163, + "step": 571 + }, + { + "epoch": 0.04706850442295824, + "grad_norm": 2.8795617581547597, + "learning_rate": 1.9984784237967495e-05, + "loss": 0.9272, + "step": 572 + }, + { + "epoch": 0.04715079201810327, + "grad_norm": 3.539552457926088, + "learning_rate": 1.998463690841719e-05, + "loss": 0.9254, + "step": 573 + }, + { + "epoch": 0.047233079613248304, + "grad_norm": 2.590893854876316, + "learning_rate": 1.998448886957425e-05, + "loss": 0.9135, + "step": 574 + }, + { + "epoch": 0.047315367208393334, + "grad_norm": 3.385121747004568, + "learning_rate": 1.9984340121449187e-05, + "loss": 0.898, + "step": 575 + }, + { + "epoch": 0.047397654803538365, + "grad_norm": 2.8668381053066248, + "learning_rate": 1.998419066405257e-05, + "loss": 0.9111, + "step": 576 + }, + { + "epoch": 0.047479942398683396, + "grad_norm": 0.5561294337589316, + "learning_rate": 1.9984040497395016e-05, + "loss": 0.6026, + "step": 577 + }, + { + "epoch": 0.047562229993828434, + "grad_norm": 2.7790207529975683, + "learning_rate": 1.9983889621487193e-05, + "loss": 0.8813, + "step": 578 + }, + { + "epoch": 0.047644517588973465, + "grad_norm": 2.929493346002011, + "learning_rate": 1.9983738036339818e-05, + "loss": 0.934, + "step": 579 + }, + { + "epoch": 0.047726805184118495, + "grad_norm": 2.6432622003873294, + "learning_rate": 1.9983585741963655e-05, + "loss": 0.935, + "step": 580 + }, + { + "epoch": 0.047809092779263526, + "grad_norm": 2.343596103466015, + "learning_rate": 1.998343273836953e-05, + "loss": 0.8885, + "step": 581 + }, + { + "epoch": 0.04789138037440856, + "grad_norm": 2.6377392327317355, + "learning_rate": 1.998327902556831e-05, + "loss": 0.9195, + "step": 582 + }, + { + "epoch": 0.04797366796955359, + "grad_norm": 0.5734849677326599, + "learning_rate": 1.9983124603570915e-05, + "loss": 0.5804, + "step": 583 + }, + { + "epoch": 0.04805595556469862, + "grad_norm": 2.359098397716237, + "learning_rate": 1.9982969472388313e-05, + "loss": 0.9154, + "step": 584 + }, + { + "epoch": 0.048138243159843656, + "grad_norm": 3.07285660000184, + "learning_rate": 1.9982813632031526e-05, + "loss": 0.9293, + "step": 585 + }, + { + "epoch": 0.04822053075498869, + "grad_norm": 3.145177565014435, + "learning_rate": 1.9982657082511624e-05, + "loss": 0.909, + "step": 586 + }, + { + "epoch": 0.04830281835013372, + "grad_norm": 2.4460324686547, + "learning_rate": 1.9982499823839726e-05, + "loss": 0.9172, + "step": 587 + }, + { + "epoch": 0.04838510594527875, + "grad_norm": 2.7860695223687335, + "learning_rate": 1.9982341856027006e-05, + "loss": 0.8962, + "step": 588 + }, + { + "epoch": 0.04846739354042378, + "grad_norm": 2.5003193611135126, + "learning_rate": 1.9982183179084683e-05, + "loss": 0.9523, + "step": 589 + }, + { + "epoch": 0.04854968113556881, + "grad_norm": 0.5728078039718163, + "learning_rate": 1.998202379302403e-05, + "loss": 0.5939, + "step": 590 + }, + { + "epoch": 0.04863196873071385, + "grad_norm": 2.513890686672487, + "learning_rate": 1.9981863697856376e-05, + "loss": 0.9027, + "step": 591 + }, + { + "epoch": 0.04871425632585888, + "grad_norm": 6.401109317568734, + "learning_rate": 1.9981702893593086e-05, + "loss": 0.9041, + "step": 592 + }, + { + "epoch": 0.04879654392100391, + "grad_norm": 0.526955304818451, + "learning_rate": 1.9981541380245586e-05, + "loss": 0.6109, + "step": 593 + }, + { + "epoch": 0.04887883151614894, + "grad_norm": 0.5280472746795982, + "learning_rate": 1.9981379157825346e-05, + "loss": 0.5801, + "step": 594 + }, + { + "epoch": 0.04896111911129397, + "grad_norm": 2.831289529507686, + "learning_rate": 1.99812162263439e-05, + "loss": 0.9296, + "step": 595 + }, + { + "epoch": 0.049043406706439, + "grad_norm": 2.5183731275746637, + "learning_rate": 1.998105258581281e-05, + "loss": 0.9373, + "step": 596 + }, + { + "epoch": 0.04912569430158403, + "grad_norm": 2.290556291606923, + "learning_rate": 1.998088823624371e-05, + "loss": 0.9339, + "step": 597 + }, + { + "epoch": 0.04920798189672907, + "grad_norm": 2.9827790643550065, + "learning_rate": 1.998072317764827e-05, + "loss": 0.9341, + "step": 598 + }, + { + "epoch": 0.0492902694918741, + "grad_norm": 3.9980040686222535, + "learning_rate": 1.998055741003822e-05, + "loss": 0.9428, + "step": 599 + }, + { + "epoch": 0.04937255708701913, + "grad_norm": 2.9421068715344125, + "learning_rate": 1.998039093342533e-05, + "loss": 0.9183, + "step": 600 + }, + { + "epoch": 0.04945484468216416, + "grad_norm": 2.3512621164999654, + "learning_rate": 1.998022374782143e-05, + "loss": 0.9139, + "step": 601 + }, + { + "epoch": 0.049537132277309194, + "grad_norm": 2.8922341692853863, + "learning_rate": 1.9980055853238394e-05, + "loss": 0.8847, + "step": 602 + }, + { + "epoch": 0.049619419872454225, + "grad_norm": 2.5544870335833916, + "learning_rate": 1.9979887249688158e-05, + "loss": 0.9322, + "step": 603 + }, + { + "epoch": 0.04970170746759926, + "grad_norm": 2.3713588179833427, + "learning_rate": 1.9979717937182685e-05, + "loss": 0.8953, + "step": 604 + }, + { + "epoch": 0.04978399506274429, + "grad_norm": 2.567195793905517, + "learning_rate": 1.9979547915734014e-05, + "loss": 0.9287, + "step": 605 + }, + { + "epoch": 0.049866282657889324, + "grad_norm": 2.116439796262553, + "learning_rate": 1.997937718535422e-05, + "loss": 0.9122, + "step": 606 + }, + { + "epoch": 0.049948570253034355, + "grad_norm": 2.6728583449200967, + "learning_rate": 1.9979205746055426e-05, + "loss": 0.9409, + "step": 607 + }, + { + "epoch": 0.050030857848179386, + "grad_norm": 2.9303321533796147, + "learning_rate": 1.9979033597849817e-05, + "loss": 0.877, + "step": 608 + }, + { + "epoch": 0.05011314544332442, + "grad_norm": 2.6453736009345103, + "learning_rate": 1.9978860740749618e-05, + "loss": 0.9264, + "step": 609 + }, + { + "epoch": 0.05019543303846945, + "grad_norm": 0.6463475109604742, + "learning_rate": 1.9978687174767115e-05, + "loss": 0.6037, + "step": 610 + }, + { + "epoch": 0.050277720633614485, + "grad_norm": 2.1568723876857514, + "learning_rate": 1.9978512899914632e-05, + "loss": 0.9291, + "step": 611 + }, + { + "epoch": 0.050360008228759516, + "grad_norm": 2.779974581309181, + "learning_rate": 1.997833791620455e-05, + "loss": 0.9487, + "step": 612 + }, + { + "epoch": 0.05044229582390455, + "grad_norm": 2.6541794961423726, + "learning_rate": 1.9978162223649303e-05, + "loss": 0.9314, + "step": 613 + }, + { + "epoch": 0.05052458341904958, + "grad_norm": 2.204822617972563, + "learning_rate": 1.9977985822261367e-05, + "loss": 0.9195, + "step": 614 + }, + { + "epoch": 0.05060687101419461, + "grad_norm": 2.528877153941993, + "learning_rate": 1.9977808712053276e-05, + "loss": 0.925, + "step": 615 + }, + { + "epoch": 0.05068915860933964, + "grad_norm": 2.89407673046398, + "learning_rate": 1.9977630893037613e-05, + "loss": 0.9164, + "step": 616 + }, + { + "epoch": 0.05077144620448468, + "grad_norm": 2.8147196835709924, + "learning_rate": 1.9977452365227005e-05, + "loss": 0.9109, + "step": 617 + }, + { + "epoch": 0.05085373379962971, + "grad_norm": 2.8624190313017697, + "learning_rate": 1.997727312863414e-05, + "loss": 0.9227, + "step": 618 + }, + { + "epoch": 0.05093602139477474, + "grad_norm": 2.6853591545801243, + "learning_rate": 1.9977093183271746e-05, + "loss": 0.9043, + "step": 619 + }, + { + "epoch": 0.05101830898991977, + "grad_norm": 2.847809177384018, + "learning_rate": 1.997691252915261e-05, + "loss": 0.8797, + "step": 620 + }, + { + "epoch": 0.0511005965850648, + "grad_norm": 2.5413962256979477, + "learning_rate": 1.9976731166289565e-05, + "loss": 0.888, + "step": 621 + }, + { + "epoch": 0.05118288418020983, + "grad_norm": 2.4434297876428768, + "learning_rate": 1.997654909469549e-05, + "loss": 0.9193, + "step": 622 + }, + { + "epoch": 0.05126517177535486, + "grad_norm": 2.554334961124947, + "learning_rate": 1.9976366314383323e-05, + "loss": 0.945, + "step": 623 + }, + { + "epoch": 0.0513474593704999, + "grad_norm": 3.0606359366025155, + "learning_rate": 1.9976182825366052e-05, + "loss": 0.9018, + "step": 624 + }, + { + "epoch": 0.05142974696564493, + "grad_norm": 2.7602463387503877, + "learning_rate": 1.9975998627656704e-05, + "loss": 0.9572, + "step": 625 + }, + { + "epoch": 0.05151203456078996, + "grad_norm": 2.645779738054759, + "learning_rate": 1.997581372126837e-05, + "loss": 0.8986, + "step": 626 + }, + { + "epoch": 0.05159432215593499, + "grad_norm": 2.3004786981907808, + "learning_rate": 1.997562810621418e-05, + "loss": 0.9378, + "step": 627 + }, + { + "epoch": 0.05167660975108002, + "grad_norm": 3.0529134410232954, + "learning_rate": 1.9975441782507327e-05, + "loss": 0.9374, + "step": 628 + }, + { + "epoch": 0.051758897346225054, + "grad_norm": 6.366982443959264, + "learning_rate": 1.997525475016104e-05, + "loss": 0.9572, + "step": 629 + }, + { + "epoch": 0.05184118494137009, + "grad_norm": 7.143057307651942, + "learning_rate": 1.9975067009188608e-05, + "loss": 0.9368, + "step": 630 + }, + { + "epoch": 0.05192347253651512, + "grad_norm": 2.486114121904295, + "learning_rate": 1.997487855960337e-05, + "loss": 0.8618, + "step": 631 + }, + { + "epoch": 0.05200576013166015, + "grad_norm": 2.909503733964849, + "learning_rate": 1.9974689401418712e-05, + "loss": 0.8998, + "step": 632 + }, + { + "epoch": 0.052088047726805184, + "grad_norm": 2.506345699862428, + "learning_rate": 1.9974499534648068e-05, + "loss": 0.9119, + "step": 633 + }, + { + "epoch": 0.052170335321950215, + "grad_norm": 0.5966023669088316, + "learning_rate": 1.9974308959304933e-05, + "loss": 0.5656, + "step": 634 + }, + { + "epoch": 0.052252622917095246, + "grad_norm": 2.9205909740125784, + "learning_rate": 1.997411767540284e-05, + "loss": 0.9109, + "step": 635 + }, + { + "epoch": 0.052334910512240276, + "grad_norm": 2.2641759973862534, + "learning_rate": 1.9973925682955378e-05, + "loss": 0.9023, + "step": 636 + }, + { + "epoch": 0.052417198107385314, + "grad_norm": 2.4641130571954086, + "learning_rate": 1.9973732981976188e-05, + "loss": 0.909, + "step": 637 + }, + { + "epoch": 0.052499485702530345, + "grad_norm": 2.2247912270982195, + "learning_rate": 1.9973539572478955e-05, + "loss": 0.9111, + "step": 638 + }, + { + "epoch": 0.052581773297675376, + "grad_norm": 2.182850954981328, + "learning_rate": 1.9973345454477422e-05, + "loss": 0.885, + "step": 639 + }, + { + "epoch": 0.05266406089282041, + "grad_norm": 0.5616279149900174, + "learning_rate": 1.997315062798538e-05, + "loss": 0.5634, + "step": 640 + }, + { + "epoch": 0.05274634848796544, + "grad_norm": 2.1709200144119287, + "learning_rate": 1.9972955093016662e-05, + "loss": 0.9021, + "step": 641 + }, + { + "epoch": 0.05282863608311047, + "grad_norm": 3.0243470611887853, + "learning_rate": 1.9972758849585167e-05, + "loss": 0.923, + "step": 642 + }, + { + "epoch": 0.052910923678255506, + "grad_norm": 0.5181983481216014, + "learning_rate": 1.9972561897704832e-05, + "loss": 0.589, + "step": 643 + }, + { + "epoch": 0.05299321127340054, + "grad_norm": 2.3618384003718904, + "learning_rate": 1.997236423738965e-05, + "loss": 0.8893, + "step": 644 + }, + { + "epoch": 0.05307549886854557, + "grad_norm": 2.83302899205139, + "learning_rate": 1.997216586865366e-05, + "loss": 0.9056, + "step": 645 + }, + { + "epoch": 0.0531577864636906, + "grad_norm": 2.1524435897397756, + "learning_rate": 1.9971966791510952e-05, + "loss": 0.8875, + "step": 646 + }, + { + "epoch": 0.05324007405883563, + "grad_norm": 0.5403616002875096, + "learning_rate": 1.9971767005975676e-05, + "loss": 0.5864, + "step": 647 + }, + { + "epoch": 0.05332236165398066, + "grad_norm": 3.032727501630103, + "learning_rate": 1.9971566512062016e-05, + "loss": 0.9269, + "step": 648 + }, + { + "epoch": 0.0534046492491257, + "grad_norm": 2.677613120586094, + "learning_rate": 1.9971365309784222e-05, + "loss": 0.9319, + "step": 649 + }, + { + "epoch": 0.05348693684427073, + "grad_norm": 2.7527601762070626, + "learning_rate": 1.9971163399156577e-05, + "loss": 0.911, + "step": 650 + }, + { + "epoch": 0.05356922443941576, + "grad_norm": 2.456807133771137, + "learning_rate": 1.9970960780193435e-05, + "loss": 0.9274, + "step": 651 + }, + { + "epoch": 0.05365151203456079, + "grad_norm": 0.5512339745238304, + "learning_rate": 1.9970757452909185e-05, + "loss": 0.5999, + "step": 652 + }, + { + "epoch": 0.05373379962970582, + "grad_norm": 3.3078302086877454, + "learning_rate": 1.997055341731827e-05, + "loss": 0.9161, + "step": 653 + }, + { + "epoch": 0.05381608722485085, + "grad_norm": 1.9567891820560834, + "learning_rate": 1.9970348673435187e-05, + "loss": 0.8954, + "step": 654 + }, + { + "epoch": 0.05389837481999588, + "grad_norm": 2.4558167849951027, + "learning_rate": 1.9970143221274477e-05, + "loss": 0.9041, + "step": 655 + }, + { + "epoch": 0.05398066241514092, + "grad_norm": 2.6700615275845214, + "learning_rate": 1.996993706085074e-05, + "loss": 0.9406, + "step": 656 + }, + { + "epoch": 0.05406295001028595, + "grad_norm": 2.47054592661293, + "learning_rate": 1.9969730192178618e-05, + "loss": 0.9075, + "step": 657 + }, + { + "epoch": 0.05414523760543098, + "grad_norm": 2.527986443897195, + "learning_rate": 1.9969522615272806e-05, + "loss": 0.9012, + "step": 658 + }, + { + "epoch": 0.05422752520057601, + "grad_norm": 0.5565334590513972, + "learning_rate": 1.9969314330148056e-05, + "loss": 0.5587, + "step": 659 + }, + { + "epoch": 0.054309812795721044, + "grad_norm": 1.8601076711624556, + "learning_rate": 1.9969105336819154e-05, + "loss": 0.8991, + "step": 660 + }, + { + "epoch": 0.054392100390866074, + "grad_norm": 2.0210809868042356, + "learning_rate": 1.9968895635300956e-05, + "loss": 0.9302, + "step": 661 + }, + { + "epoch": 0.05447438798601111, + "grad_norm": 2.1871429796039363, + "learning_rate": 1.9968685225608353e-05, + "loss": 0.8719, + "step": 662 + }, + { + "epoch": 0.05455667558115614, + "grad_norm": 2.699275991596056, + "learning_rate": 1.9968474107756295e-05, + "loss": 0.9107, + "step": 663 + }, + { + "epoch": 0.054638963176301174, + "grad_norm": 2.921814293546767, + "learning_rate": 1.996826228175978e-05, + "loss": 0.9124, + "step": 664 + }, + { + "epoch": 0.054721250771446205, + "grad_norm": 2.9121454433336917, + "learning_rate": 1.9968049747633848e-05, + "loss": 0.8872, + "step": 665 + }, + { + "epoch": 0.054803538366591235, + "grad_norm": 4.665109966003875, + "learning_rate": 1.996783650539361e-05, + "loss": 0.9337, + "step": 666 + }, + { + "epoch": 0.054885825961736266, + "grad_norm": 2.2334882062761814, + "learning_rate": 1.9967622555054204e-05, + "loss": 0.9249, + "step": 667 + }, + { + "epoch": 0.0549681135568813, + "grad_norm": 1.8093225226331142, + "learning_rate": 1.9967407896630837e-05, + "loss": 0.8666, + "step": 668 + }, + { + "epoch": 0.055050401152026335, + "grad_norm": 0.5652676807003993, + "learning_rate": 1.996719253013875e-05, + "loss": 0.5961, + "step": 669 + }, + { + "epoch": 0.055132688747171366, + "grad_norm": 0.5100457321950321, + "learning_rate": 1.9966976455593247e-05, + "loss": 0.5618, + "step": 670 + }, + { + "epoch": 0.055214976342316396, + "grad_norm": 2.773850609378529, + "learning_rate": 1.9966759673009677e-05, + "loss": 0.9275, + "step": 671 + }, + { + "epoch": 0.05529726393746143, + "grad_norm": 2.5443256480658296, + "learning_rate": 1.9966542182403437e-05, + "loss": 0.9077, + "step": 672 + }, + { + "epoch": 0.05537955153260646, + "grad_norm": 3.282011580384134, + "learning_rate": 1.9966323983789983e-05, + "loss": 0.921, + "step": 673 + }, + { + "epoch": 0.05546183912775149, + "grad_norm": 2.2203588190464885, + "learning_rate": 1.996610507718481e-05, + "loss": 0.8988, + "step": 674 + }, + { + "epoch": 0.05554412672289653, + "grad_norm": 4.790143157081725, + "learning_rate": 1.996588546260347e-05, + "loss": 0.9526, + "step": 675 + }, + { + "epoch": 0.05562641431804156, + "grad_norm": 2.092143807841506, + "learning_rate": 1.9965665140061565e-05, + "loss": 0.915, + "step": 676 + }, + { + "epoch": 0.05570870191318659, + "grad_norm": 1.9784649465852888, + "learning_rate": 1.9965444109574744e-05, + "loss": 0.905, + "step": 677 + }, + { + "epoch": 0.05579098950833162, + "grad_norm": 2.7843501048163217, + "learning_rate": 1.9965222371158718e-05, + "loss": 0.8951, + "step": 678 + }, + { + "epoch": 0.05587327710347665, + "grad_norm": 2.6331805589786383, + "learning_rate": 1.9964999924829224e-05, + "loss": 0.8614, + "step": 679 + }, + { + "epoch": 0.05595556469862168, + "grad_norm": 0.7467735870885243, + "learning_rate": 1.9964776770602078e-05, + "loss": 0.6063, + "step": 680 + }, + { + "epoch": 0.05603785229376671, + "grad_norm": 2.680536053721946, + "learning_rate": 1.9964552908493123e-05, + "loss": 0.8782, + "step": 681 + }, + { + "epoch": 0.05612013988891175, + "grad_norm": 3.49552823109986, + "learning_rate": 1.9964328338518264e-05, + "loss": 0.902, + "step": 682 + }, + { + "epoch": 0.05620242748405678, + "grad_norm": 2.120123047682193, + "learning_rate": 1.996410306069346e-05, + "loss": 0.9496, + "step": 683 + }, + { + "epoch": 0.05628471507920181, + "grad_norm": 1.937156037107827, + "learning_rate": 1.9963877075034706e-05, + "loss": 0.8875, + "step": 684 + }, + { + "epoch": 0.05636700267434684, + "grad_norm": 2.4742509534066754, + "learning_rate": 1.9963650381558063e-05, + "loss": 0.9192, + "step": 685 + }, + { + "epoch": 0.05644929026949187, + "grad_norm": 2.3426169694208903, + "learning_rate": 1.996342298027963e-05, + "loss": 0.9481, + "step": 686 + }, + { + "epoch": 0.0565315778646369, + "grad_norm": 2.1543307158741434, + "learning_rate": 1.9963194871215557e-05, + "loss": 0.8948, + "step": 687 + }, + { + "epoch": 0.05661386545978194, + "grad_norm": 1.7721734117310426, + "learning_rate": 1.9962966054382062e-05, + "loss": 0.8769, + "step": 688 + }, + { + "epoch": 0.05669615305492697, + "grad_norm": 2.637184520870366, + "learning_rate": 1.9962736529795388e-05, + "loss": 0.9305, + "step": 689 + }, + { + "epoch": 0.056778440650072, + "grad_norm": 2.5552424968357306, + "learning_rate": 1.9962506297471846e-05, + "loss": 0.9011, + "step": 690 + }, + { + "epoch": 0.05686072824521703, + "grad_norm": 2.1091093097631797, + "learning_rate": 1.9962275357427787e-05, + "loss": 0.9153, + "step": 691 + }, + { + "epoch": 0.056943015840362064, + "grad_norm": 3.8893843496883775, + "learning_rate": 1.996204370967962e-05, + "loss": 0.9516, + "step": 692 + }, + { + "epoch": 0.057025303435507095, + "grad_norm": 0.6989567675386245, + "learning_rate": 1.9961811354243798e-05, + "loss": 0.6088, + "step": 693 + }, + { + "epoch": 0.057107591030652126, + "grad_norm": 3.0703220705587326, + "learning_rate": 1.9961578291136834e-05, + "loss": 0.9468, + "step": 694 + }, + { + "epoch": 0.057189878625797164, + "grad_norm": 0.5452905698296876, + "learning_rate": 1.9961344520375276e-05, + "loss": 0.5795, + "step": 695 + }, + { + "epoch": 0.057272166220942194, + "grad_norm": 3.477621910759164, + "learning_rate": 1.9961110041975732e-05, + "loss": 0.9586, + "step": 696 + }, + { + "epoch": 0.057354453816087225, + "grad_norm": 3.5385882928206454, + "learning_rate": 1.9960874855954863e-05, + "loss": 0.9508, + "step": 697 + }, + { + "epoch": 0.057436741411232256, + "grad_norm": 2.6972731084205437, + "learning_rate": 1.996063896232938e-05, + "loss": 0.9313, + "step": 698 + }, + { + "epoch": 0.05751902900637729, + "grad_norm": 0.6344603977192381, + "learning_rate": 1.9960402361116026e-05, + "loss": 0.6044, + "step": 699 + }, + { + "epoch": 0.05760131660152232, + "grad_norm": 5.571545453742246, + "learning_rate": 1.996016505233162e-05, + "loss": 0.92, + "step": 700 + }, + { + "epoch": 0.057683604196667355, + "grad_norm": 2.859612009759652, + "learning_rate": 1.9959927035993017e-05, + "loss": 0.897, + "step": 701 + }, + { + "epoch": 0.057765891791812386, + "grad_norm": 2.426187536557682, + "learning_rate": 1.9959688312117128e-05, + "loss": 0.9305, + "step": 702 + }, + { + "epoch": 0.05784817938695742, + "grad_norm": 2.7388965530788, + "learning_rate": 1.995944888072091e-05, + "loss": 0.9145, + "step": 703 + }, + { + "epoch": 0.05793046698210245, + "grad_norm": 2.776291815110774, + "learning_rate": 1.995920874182137e-05, + "loss": 0.9075, + "step": 704 + }, + { + "epoch": 0.05801275457724748, + "grad_norm": 2.575679639237728, + "learning_rate": 1.995896789543557e-05, + "loss": 0.9045, + "step": 705 + }, + { + "epoch": 0.05809504217239251, + "grad_norm": 3.5403132152741263, + "learning_rate": 1.9958726341580615e-05, + "loss": 0.913, + "step": 706 + }, + { + "epoch": 0.05817732976753754, + "grad_norm": 2.58072580176139, + "learning_rate": 1.995848408027367e-05, + "loss": 0.9229, + "step": 707 + }, + { + "epoch": 0.05825961736268258, + "grad_norm": 2.5124996774654473, + "learning_rate": 1.9958241111531942e-05, + "loss": 0.9126, + "step": 708 + }, + { + "epoch": 0.05834190495782761, + "grad_norm": 2.36119565147592, + "learning_rate": 1.995799743537269e-05, + "loss": 0.9066, + "step": 709 + }, + { + "epoch": 0.05842419255297264, + "grad_norm": 3.2376572469679847, + "learning_rate": 1.9957753051813228e-05, + "loss": 0.9107, + "step": 710 + }, + { + "epoch": 0.05850648014811767, + "grad_norm": 0.5718002254539629, + "learning_rate": 1.9957507960870908e-05, + "loss": 0.5838, + "step": 711 + }, + { + "epoch": 0.0585887677432627, + "grad_norm": 2.9835296928097765, + "learning_rate": 1.9957262162563155e-05, + "loss": 0.9062, + "step": 712 + }, + { + "epoch": 0.05867105533840773, + "grad_norm": 2.312335655498833, + "learning_rate": 1.9957015656907417e-05, + "loss": 0.9331, + "step": 713 + }, + { + "epoch": 0.05875334293355277, + "grad_norm": 2.3792417930038168, + "learning_rate": 1.9956768443921214e-05, + "loss": 0.9371, + "step": 714 + }, + { + "epoch": 0.0588356305286978, + "grad_norm": 3.0747711781753955, + "learning_rate": 1.99565205236221e-05, + "loss": 0.9245, + "step": 715 + }, + { + "epoch": 0.05891791812384283, + "grad_norm": 2.469147337654409, + "learning_rate": 1.9956271896027696e-05, + "loss": 0.9053, + "step": 716 + }, + { + "epoch": 0.05900020571898786, + "grad_norm": 4.677348829502867, + "learning_rate": 1.9956022561155655e-05, + "loss": 0.9316, + "step": 717 + }, + { + "epoch": 0.05908249331413289, + "grad_norm": 2.574073344258724, + "learning_rate": 1.9955772519023694e-05, + "loss": 0.9144, + "step": 718 + }, + { + "epoch": 0.059164780909277924, + "grad_norm": 0.6010291838312377, + "learning_rate": 1.995552176964958e-05, + "loss": 0.5969, + "step": 719 + }, + { + "epoch": 0.05924706850442296, + "grad_norm": 0.48362592184616704, + "learning_rate": 1.9955270313051115e-05, + "loss": 0.6105, + "step": 720 + }, + { + "epoch": 0.05932935609956799, + "grad_norm": 4.6846130266410935, + "learning_rate": 1.995501814924617e-05, + "loss": 0.9146, + "step": 721 + }, + { + "epoch": 0.05941164369471302, + "grad_norm": 2.577204170673208, + "learning_rate": 1.9954765278252656e-05, + "loss": 0.9073, + "step": 722 + }, + { + "epoch": 0.059493931289858054, + "grad_norm": 4.7923802267754985, + "learning_rate": 1.995451170008854e-05, + "loss": 0.9192, + "step": 723 + }, + { + "epoch": 0.059576218885003085, + "grad_norm": 3.637556402050712, + "learning_rate": 1.995425741477183e-05, + "loss": 0.8916, + "step": 724 + }, + { + "epoch": 0.059658506480148116, + "grad_norm": 3.318312481516906, + "learning_rate": 1.9954002422320593e-05, + "loss": 0.8979, + "step": 725 + }, + { + "epoch": 0.05974079407529315, + "grad_norm": 2.2896767162285476, + "learning_rate": 1.9953746722752944e-05, + "loss": 0.9078, + "step": 726 + }, + { + "epoch": 0.059823081670438184, + "grad_norm": 2.4261610228532433, + "learning_rate": 1.9953490316087045e-05, + "loss": 0.9094, + "step": 727 + }, + { + "epoch": 0.059905369265583215, + "grad_norm": 3.5742603087267533, + "learning_rate": 1.9953233202341115e-05, + "loss": 0.9668, + "step": 728 + }, + { + "epoch": 0.059987656860728246, + "grad_norm": 3.646866686252275, + "learning_rate": 1.995297538153341e-05, + "loss": 0.9081, + "step": 729 + }, + { + "epoch": 0.06006994445587328, + "grad_norm": 3.5756298093016134, + "learning_rate": 1.9952716853682258e-05, + "loss": 0.932, + "step": 730 + }, + { + "epoch": 0.06015223205101831, + "grad_norm": 2.461737210935374, + "learning_rate": 1.9952457618806016e-05, + "loss": 0.9161, + "step": 731 + }, + { + "epoch": 0.06023451964616334, + "grad_norm": 2.9435688364135038, + "learning_rate": 1.99521976769231e-05, + "loss": 0.8791, + "step": 732 + }, + { + "epoch": 0.060316807241308376, + "grad_norm": 3.752079579941048, + "learning_rate": 1.995193702805198e-05, + "loss": 0.8864, + "step": 733 + }, + { + "epoch": 0.06039909483645341, + "grad_norm": 4.53396790098707, + "learning_rate": 1.9951675672211163e-05, + "loss": 0.8929, + "step": 734 + }, + { + "epoch": 0.06048138243159844, + "grad_norm": 4.961620647630342, + "learning_rate": 1.9951413609419225e-05, + "loss": 0.8536, + "step": 735 + }, + { + "epoch": 0.06056367002674347, + "grad_norm": 3.891304133200799, + "learning_rate": 1.995115083969478e-05, + "loss": 0.8944, + "step": 736 + }, + { + "epoch": 0.0606459576218885, + "grad_norm": 2.712319861053012, + "learning_rate": 1.9950887363056495e-05, + "loss": 0.9206, + "step": 737 + }, + { + "epoch": 0.06072824521703353, + "grad_norm": 4.223019111124196, + "learning_rate": 1.9950623179523085e-05, + "loss": 0.9025, + "step": 738 + }, + { + "epoch": 0.06081053281217856, + "grad_norm": 5.016232013409377, + "learning_rate": 1.9950358289113317e-05, + "loss": 0.8815, + "step": 739 + }, + { + "epoch": 0.0608928204073236, + "grad_norm": 2.6897434242049694, + "learning_rate": 1.995009269184601e-05, + "loss": 0.8836, + "step": 740 + }, + { + "epoch": 0.06097510800246863, + "grad_norm": 0.7568433896575619, + "learning_rate": 1.994982638774003e-05, + "loss": 0.5993, + "step": 741 + }, + { + "epoch": 0.06105739559761366, + "grad_norm": 2.553452324246678, + "learning_rate": 1.9949559376814296e-05, + "loss": 0.8986, + "step": 742 + }, + { + "epoch": 0.06113968319275869, + "grad_norm": 0.5018812785768227, + "learning_rate": 1.9949291659087776e-05, + "loss": 0.5597, + "step": 743 + }, + { + "epoch": 0.06122197078790372, + "grad_norm": 2.4064235706469, + "learning_rate": 1.994902323457949e-05, + "loss": 0.8943, + "step": 744 + }, + { + "epoch": 0.06130425838304875, + "grad_norm": 2.295948111702661, + "learning_rate": 1.9948754103308504e-05, + "loss": 0.8668, + "step": 745 + }, + { + "epoch": 0.06138654597819379, + "grad_norm": 0.6531820015601002, + "learning_rate": 1.9948484265293934e-05, + "loss": 0.5944, + "step": 746 + }, + { + "epoch": 0.06146883357333882, + "grad_norm": 2.488686897667554, + "learning_rate": 1.9948213720554955e-05, + "loss": 0.8939, + "step": 747 + }, + { + "epoch": 0.06155112116848385, + "grad_norm": 2.2478829073807867, + "learning_rate": 1.994794246911078e-05, + "loss": 0.878, + "step": 748 + }, + { + "epoch": 0.06163340876362888, + "grad_norm": 3.21297658438237, + "learning_rate": 1.9947670510980686e-05, + "loss": 0.9367, + "step": 749 + }, + { + "epoch": 0.061715696358773914, + "grad_norm": 2.5032219143064296, + "learning_rate": 1.9947397846183986e-05, + "loss": 0.909, + "step": 750 + }, + { + "epoch": 0.061797983953918945, + "grad_norm": 2.3821398027611367, + "learning_rate": 1.9947124474740052e-05, + "loss": 0.8767, + "step": 751 + }, + { + "epoch": 0.061880271549063975, + "grad_norm": 4.029427101966951, + "learning_rate": 1.99468503966683e-05, + "loss": 0.8618, + "step": 752 + }, + { + "epoch": 0.06196255914420901, + "grad_norm": 2.404778806152705, + "learning_rate": 1.9946575611988207e-05, + "loss": 0.9047, + "step": 753 + }, + { + "epoch": 0.062044846739354044, + "grad_norm": 2.962612526189809, + "learning_rate": 1.9946300120719287e-05, + "loss": 0.889, + "step": 754 + }, + { + "epoch": 0.062127134334499075, + "grad_norm": 2.5437765511188695, + "learning_rate": 1.994602392288112e-05, + "loss": 0.9399, + "step": 755 + }, + { + "epoch": 0.062209421929644106, + "grad_norm": 0.5539735241167393, + "learning_rate": 1.9945747018493314e-05, + "loss": 0.5963, + "step": 756 + }, + { + "epoch": 0.062291709524789136, + "grad_norm": 3.1779858985642817, + "learning_rate": 1.9945469407575543e-05, + "loss": 0.876, + "step": 757 + }, + { + "epoch": 0.06237399711993417, + "grad_norm": 2.687485842671492, + "learning_rate": 1.9945191090147537e-05, + "loss": 0.9022, + "step": 758 + }, + { + "epoch": 0.062456284715079205, + "grad_norm": 2.9422463927653766, + "learning_rate": 1.9944912066229058e-05, + "loss": 0.8956, + "step": 759 + }, + { + "epoch": 0.06253857231022424, + "grad_norm": 4.157936413648122, + "learning_rate": 1.9944632335839927e-05, + "loss": 0.9138, + "step": 760 + }, + { + "epoch": 0.06262085990536927, + "grad_norm": 0.48567249965915693, + "learning_rate": 1.9944351899000026e-05, + "loss": 0.5563, + "step": 761 + }, + { + "epoch": 0.0627031475005143, + "grad_norm": 2.7821820465506, + "learning_rate": 1.9944070755729266e-05, + "loss": 0.9122, + "step": 762 + }, + { + "epoch": 0.06278543509565933, + "grad_norm": 2.65823773191475, + "learning_rate": 1.9943788906047624e-05, + "loss": 0.9009, + "step": 763 + }, + { + "epoch": 0.06286772269080436, + "grad_norm": 0.4745158162176376, + "learning_rate": 1.9943506349975118e-05, + "loss": 0.5845, + "step": 764 + }, + { + "epoch": 0.06295001028594939, + "grad_norm": 4.304541123505603, + "learning_rate": 1.9943223087531824e-05, + "loss": 0.911, + "step": 765 + }, + { + "epoch": 0.06303229788109442, + "grad_norm": 2.599121308286042, + "learning_rate": 1.9942939118737866e-05, + "loss": 0.9082, + "step": 766 + }, + { + "epoch": 0.06311458547623945, + "grad_norm": 2.661380985142305, + "learning_rate": 1.9942654443613413e-05, + "loss": 0.889, + "step": 767 + }, + { + "epoch": 0.06319687307138448, + "grad_norm": 2.7289869422777406, + "learning_rate": 1.994236906217869e-05, + "loss": 0.8807, + "step": 768 + }, + { + "epoch": 0.06327916066652953, + "grad_norm": 3.552184676009908, + "learning_rate": 1.9942082974453968e-05, + "loss": 0.8869, + "step": 769 + }, + { + "epoch": 0.06336144826167456, + "grad_norm": 3.3116779659066222, + "learning_rate": 1.994179618045957e-05, + "loss": 0.886, + "step": 770 + }, + { + "epoch": 0.06344373585681959, + "grad_norm": 2.733151926112565, + "learning_rate": 1.9941508680215874e-05, + "loss": 0.878, + "step": 771 + }, + { + "epoch": 0.06352602345196462, + "grad_norm": 3.689575278866226, + "learning_rate": 1.9941220473743297e-05, + "loss": 0.9012, + "step": 772 + }, + { + "epoch": 0.06360831104710965, + "grad_norm": 3.6509278934675344, + "learning_rate": 1.994093156106232e-05, + "loss": 0.8859, + "step": 773 + }, + { + "epoch": 0.06369059864225468, + "grad_norm": 3.4408763078150373, + "learning_rate": 1.9940641942193462e-05, + "loss": 0.9895, + "step": 774 + }, + { + "epoch": 0.06377288623739971, + "grad_norm": 3.356367722166113, + "learning_rate": 1.9940351617157298e-05, + "loss": 0.9321, + "step": 775 + }, + { + "epoch": 0.06385517383254474, + "grad_norm": 2.6685489053310905, + "learning_rate": 1.994006058597445e-05, + "loss": 0.871, + "step": 776 + }, + { + "epoch": 0.06393746142768977, + "grad_norm": 2.1000398415565447, + "learning_rate": 1.99397688486656e-05, + "loss": 0.8799, + "step": 777 + }, + { + "epoch": 0.0640197490228348, + "grad_norm": 2.1292877692214462, + "learning_rate": 1.9939476405251464e-05, + "loss": 0.8955, + "step": 778 + }, + { + "epoch": 0.06410203661797984, + "grad_norm": 3.4132241841166073, + "learning_rate": 1.9939183255752817e-05, + "loss": 0.8757, + "step": 779 + }, + { + "epoch": 0.06418432421312487, + "grad_norm": 2.62487277122737, + "learning_rate": 1.9938889400190494e-05, + "loss": 0.8884, + "step": 780 + }, + { + "epoch": 0.0642666118082699, + "grad_norm": 2.044302329571613, + "learning_rate": 1.993859483858536e-05, + "loss": 0.9023, + "step": 781 + }, + { + "epoch": 0.06434889940341494, + "grad_norm": 0.5567547220538414, + "learning_rate": 1.993829957095834e-05, + "loss": 0.5694, + "step": 782 + }, + { + "epoch": 0.06443118699855997, + "grad_norm": 0.48731474493235843, + "learning_rate": 1.9938003597330415e-05, + "loss": 0.5764, + "step": 783 + }, + { + "epoch": 0.064513474593705, + "grad_norm": 2.335128235917664, + "learning_rate": 1.9937706917722607e-05, + "loss": 0.9091, + "step": 784 + }, + { + "epoch": 0.06459576218885003, + "grad_norm": 2.6840226763995383, + "learning_rate": 1.9937409532155992e-05, + "loss": 0.8881, + "step": 785 + }, + { + "epoch": 0.06467804978399506, + "grad_norm": 2.3949102024541653, + "learning_rate": 1.99371114406517e-05, + "loss": 0.9183, + "step": 786 + }, + { + "epoch": 0.0647603373791401, + "grad_norm": 2.6216703824274488, + "learning_rate": 1.99368126432309e-05, + "loss": 0.9207, + "step": 787 + }, + { + "epoch": 0.06484262497428513, + "grad_norm": 2.614435269135524, + "learning_rate": 1.993651313991482e-05, + "loss": 0.9145, + "step": 788 + }, + { + "epoch": 0.06492491256943016, + "grad_norm": 1.9122678315195296, + "learning_rate": 1.9936212930724742e-05, + "loss": 0.8829, + "step": 789 + }, + { + "epoch": 0.06500720016457519, + "grad_norm": 0.5913835221535177, + "learning_rate": 1.9935912015681984e-05, + "loss": 0.6145, + "step": 790 + }, + { + "epoch": 0.06508948775972022, + "grad_norm": 2.528199419410872, + "learning_rate": 1.993561039480793e-05, + "loss": 0.8655, + "step": 791 + }, + { + "epoch": 0.06517177535486525, + "grad_norm": 3.3798538121747326, + "learning_rate": 1.9935308068124e-05, + "loss": 0.9251, + "step": 792 + }, + { + "epoch": 0.06525406295001028, + "grad_norm": 2.6588327121370194, + "learning_rate": 1.9935005035651676e-05, + "loss": 0.8983, + "step": 793 + }, + { + "epoch": 0.06533635054515531, + "grad_norm": 0.5232567113259947, + "learning_rate": 1.9934701297412482e-05, + "loss": 0.578, + "step": 794 + }, + { + "epoch": 0.06541863814030036, + "grad_norm": 4.752300485944965, + "learning_rate": 1.9934396853427998e-05, + "loss": 0.8953, + "step": 795 + }, + { + "epoch": 0.06550092573544539, + "grad_norm": 2.2269507955655987, + "learning_rate": 1.9934091703719846e-05, + "loss": 0.9245, + "step": 796 + }, + { + "epoch": 0.06558321333059042, + "grad_norm": 3.122445969674065, + "learning_rate": 1.9933785848309708e-05, + "loss": 0.8914, + "step": 797 + }, + { + "epoch": 0.06566550092573545, + "grad_norm": 3.1204724551293426, + "learning_rate": 1.9933479287219312e-05, + "loss": 0.9287, + "step": 798 + }, + { + "epoch": 0.06574778852088048, + "grad_norm": 14.479758337139925, + "learning_rate": 1.9933172020470433e-05, + "loss": 0.8677, + "step": 799 + }, + { + "epoch": 0.06583007611602551, + "grad_norm": 2.1224285416282953, + "learning_rate": 1.99328640480849e-05, + "loss": 0.8755, + "step": 800 + }, + { + "epoch": 0.06591236371117054, + "grad_norm": 2.487164087508179, + "learning_rate": 1.9932555370084588e-05, + "loss": 0.8775, + "step": 801 + }, + { + "epoch": 0.06599465130631557, + "grad_norm": 0.5728404010402629, + "learning_rate": 1.9932245986491425e-05, + "loss": 0.5477, + "step": 802 + }, + { + "epoch": 0.0660769389014606, + "grad_norm": 3.245446623126787, + "learning_rate": 1.9931935897327396e-05, + "loss": 0.9005, + "step": 803 + }, + { + "epoch": 0.06615922649660563, + "grad_norm": 2.5198170754823237, + "learning_rate": 1.9931625102614524e-05, + "loss": 0.9251, + "step": 804 + }, + { + "epoch": 0.06624151409175066, + "grad_norm": 2.7124091417439447, + "learning_rate": 1.9931313602374886e-05, + "loss": 0.9043, + "step": 805 + }, + { + "epoch": 0.0663238016868957, + "grad_norm": 2.295917945326921, + "learning_rate": 1.9931001396630613e-05, + "loss": 0.9037, + "step": 806 + }, + { + "epoch": 0.06640608928204073, + "grad_norm": 2.5595180677086176, + "learning_rate": 1.9930688485403885e-05, + "loss": 0.8916, + "step": 807 + }, + { + "epoch": 0.06648837687718577, + "grad_norm": 2.54401264532517, + "learning_rate": 1.993037486871693e-05, + "loss": 0.8865, + "step": 808 + }, + { + "epoch": 0.0665706644723308, + "grad_norm": 2.7644346282703567, + "learning_rate": 1.993006054659202e-05, + "loss": 0.875, + "step": 809 + }, + { + "epoch": 0.06665295206747583, + "grad_norm": 2.145314542653547, + "learning_rate": 1.9929745519051497e-05, + "loss": 0.9358, + "step": 810 + }, + { + "epoch": 0.06673523966262086, + "grad_norm": 3.2713117109960583, + "learning_rate": 1.9929429786117724e-05, + "loss": 0.8777, + "step": 811 + }, + { + "epoch": 0.0668175272577659, + "grad_norm": 0.5829653015669467, + "learning_rate": 1.9929113347813145e-05, + "loss": 0.5366, + "step": 812 + }, + { + "epoch": 0.06689981485291092, + "grad_norm": 2.4233464969419516, + "learning_rate": 1.992879620416023e-05, + "loss": 0.9099, + "step": 813 + }, + { + "epoch": 0.06698210244805596, + "grad_norm": 2.7021068296091624, + "learning_rate": 1.9928478355181512e-05, + "loss": 0.9092, + "step": 814 + }, + { + "epoch": 0.06706439004320099, + "grad_norm": 2.522776219516862, + "learning_rate": 1.992815980089957e-05, + "loss": 0.9024, + "step": 815 + }, + { + "epoch": 0.06714667763834602, + "grad_norm": 2.232284370603574, + "learning_rate": 1.9927840541337037e-05, + "loss": 0.9233, + "step": 816 + }, + { + "epoch": 0.06722896523349105, + "grad_norm": 2.9343145896014255, + "learning_rate": 1.9927520576516587e-05, + "loss": 0.9312, + "step": 817 + }, + { + "epoch": 0.06731125282863608, + "grad_norm": 3.3222486630048764, + "learning_rate": 1.9927199906460947e-05, + "loss": 0.8681, + "step": 818 + }, + { + "epoch": 0.06739354042378111, + "grad_norm": 2.1225744897957153, + "learning_rate": 1.9926878531192908e-05, + "loss": 0.8916, + "step": 819 + }, + { + "epoch": 0.06747582801892614, + "grad_norm": 5.166258547080567, + "learning_rate": 1.992655645073529e-05, + "loss": 0.9153, + "step": 820 + }, + { + "epoch": 0.06755811561407118, + "grad_norm": 3.2639889220707077, + "learning_rate": 1.992623366511098e-05, + "loss": 0.8715, + "step": 821 + }, + { + "epoch": 0.06764040320921622, + "grad_norm": 4.714497016717951, + "learning_rate": 1.9925910174342907e-05, + "loss": 0.8723, + "step": 822 + }, + { + "epoch": 0.06772269080436125, + "grad_norm": 2.5352280280058315, + "learning_rate": 1.9925585978454043e-05, + "loss": 0.9045, + "step": 823 + }, + { + "epoch": 0.06780497839950628, + "grad_norm": 3.485579632575649, + "learning_rate": 1.992526107746743e-05, + "loss": 0.8797, + "step": 824 + }, + { + "epoch": 0.06788726599465131, + "grad_norm": 12.454695730191421, + "learning_rate": 1.992493547140614e-05, + "loss": 0.8755, + "step": 825 + }, + { + "epoch": 0.06796955358979634, + "grad_norm": 0.5679287848373274, + "learning_rate": 1.9924609160293308e-05, + "loss": 0.5737, + "step": 826 + }, + { + "epoch": 0.06805184118494137, + "grad_norm": 6.733588252523935, + "learning_rate": 1.9924282144152115e-05, + "loss": 0.8607, + "step": 827 + }, + { + "epoch": 0.0681341287800864, + "grad_norm": 2.8353728427421965, + "learning_rate": 1.9923954423005786e-05, + "loss": 0.8658, + "step": 828 + }, + { + "epoch": 0.06821641637523143, + "grad_norm": 2.226675047912921, + "learning_rate": 1.9923625996877607e-05, + "loss": 0.8908, + "step": 829 + }, + { + "epoch": 0.06829870397037646, + "grad_norm": 2.090011013197403, + "learning_rate": 1.9923296865790907e-05, + "loss": 0.9027, + "step": 830 + }, + { + "epoch": 0.06838099156552149, + "grad_norm": 2.4269097740027687, + "learning_rate": 1.992296702976907e-05, + "loss": 0.8743, + "step": 831 + }, + { + "epoch": 0.06846327916066652, + "grad_norm": 2.4454075613373174, + "learning_rate": 1.9922636488835528e-05, + "loss": 0.9188, + "step": 832 + }, + { + "epoch": 0.06854556675581157, + "grad_norm": 2.708156376904729, + "learning_rate": 1.992230524301375e-05, + "loss": 0.8753, + "step": 833 + }, + { + "epoch": 0.0686278543509566, + "grad_norm": 6.9289687760917955, + "learning_rate": 1.9921973292327285e-05, + "loss": 0.8714, + "step": 834 + }, + { + "epoch": 0.06871014194610163, + "grad_norm": 2.833475838520833, + "learning_rate": 1.9921640636799697e-05, + "loss": 0.878, + "step": 835 + }, + { + "epoch": 0.06879242954124666, + "grad_norm": 0.6390100760660502, + "learning_rate": 1.992130727645463e-05, + "loss": 0.5892, + "step": 836 + }, + { + "epoch": 0.06887471713639169, + "grad_norm": 3.503075844449775, + "learning_rate": 1.992097321131576e-05, + "loss": 0.9134, + "step": 837 + }, + { + "epoch": 0.06895700473153672, + "grad_norm": 2.928003367939948, + "learning_rate": 1.992063844140682e-05, + "loss": 0.916, + "step": 838 + }, + { + "epoch": 0.06903929232668175, + "grad_norm": 2.79325002366026, + "learning_rate": 1.992030296675159e-05, + "loss": 0.8767, + "step": 839 + }, + { + "epoch": 0.06912157992182678, + "grad_norm": 2.312184411585912, + "learning_rate": 1.9919966787373902e-05, + "loss": 0.9053, + "step": 840 + }, + { + "epoch": 0.06920386751697181, + "grad_norm": 2.9138317208293594, + "learning_rate": 1.991962990329764e-05, + "loss": 0.9005, + "step": 841 + }, + { + "epoch": 0.06928615511211685, + "grad_norm": 2.418947503313838, + "learning_rate": 1.991929231454673e-05, + "loss": 0.8876, + "step": 842 + }, + { + "epoch": 0.06936844270726188, + "grad_norm": 2.746227734046784, + "learning_rate": 1.9918954021145162e-05, + "loss": 0.9174, + "step": 843 + }, + { + "epoch": 0.06945073030240691, + "grad_norm": 4.054877897574317, + "learning_rate": 1.991861502311696e-05, + "loss": 0.8785, + "step": 844 + }, + { + "epoch": 0.06953301789755194, + "grad_norm": 3.3645447414769856, + "learning_rate": 1.9918275320486212e-05, + "loss": 0.8885, + "step": 845 + }, + { + "epoch": 0.06961530549269698, + "grad_norm": 0.6257651466469342, + "learning_rate": 1.9917934913277047e-05, + "loss": 0.5679, + "step": 846 + }, + { + "epoch": 0.06969759308784201, + "grad_norm": 2.9579632903454987, + "learning_rate": 1.9917593801513645e-05, + "loss": 0.8892, + "step": 847 + }, + { + "epoch": 0.06977988068298704, + "grad_norm": 2.3255674692633703, + "learning_rate": 1.991725198522024e-05, + "loss": 0.8969, + "step": 848 + }, + { + "epoch": 0.06986216827813208, + "grad_norm": 1.8812338541653777, + "learning_rate": 1.9916909464421118e-05, + "loss": 0.84, + "step": 849 + }, + { + "epoch": 0.0699444558732771, + "grad_norm": 4.348093261520783, + "learning_rate": 1.9916566239140605e-05, + "loss": 0.9035, + "step": 850 + }, + { + "epoch": 0.07002674346842214, + "grad_norm": 2.2375985456191003, + "learning_rate": 1.9916222309403085e-05, + "loss": 0.8754, + "step": 851 + }, + { + "epoch": 0.07010903106356717, + "grad_norm": 3.613200403801302, + "learning_rate": 1.9915877675232992e-05, + "loss": 0.8815, + "step": 852 + }, + { + "epoch": 0.0701913186587122, + "grad_norm": 3.839543987455212, + "learning_rate": 1.9915532336654807e-05, + "loss": 0.9072, + "step": 853 + }, + { + "epoch": 0.07027360625385723, + "grad_norm": 2.105567560984786, + "learning_rate": 1.991518629369306e-05, + "loss": 0.896, + "step": 854 + }, + { + "epoch": 0.07035589384900226, + "grad_norm": 2.267537355899574, + "learning_rate": 1.9914839546372336e-05, + "loss": 0.9158, + "step": 855 + }, + { + "epoch": 0.07043818144414729, + "grad_norm": 3.589047414435187, + "learning_rate": 1.991449209471727e-05, + "loss": 0.8734, + "step": 856 + }, + { + "epoch": 0.07052046903929232, + "grad_norm": 3.1819343869570536, + "learning_rate": 1.991414393875254e-05, + "loss": 0.9089, + "step": 857 + }, + { + "epoch": 0.07060275663443735, + "grad_norm": 2.5055069972264503, + "learning_rate": 1.991379507850288e-05, + "loss": 0.8681, + "step": 858 + }, + { + "epoch": 0.0706850442295824, + "grad_norm": 2.545062208600291, + "learning_rate": 1.991344551399307e-05, + "loss": 0.8835, + "step": 859 + }, + { + "epoch": 0.07076733182472743, + "grad_norm": 2.8423181256983487, + "learning_rate": 1.9913095245247948e-05, + "loss": 0.8855, + "step": 860 + }, + { + "epoch": 0.07084961941987246, + "grad_norm": 2.623939420394984, + "learning_rate": 1.9912744272292392e-05, + "loss": 0.8912, + "step": 861 + }, + { + "epoch": 0.07093190701501749, + "grad_norm": 2.456776383887346, + "learning_rate": 1.9912392595151336e-05, + "loss": 0.9026, + "step": 862 + }, + { + "epoch": 0.07101419461016252, + "grad_norm": 2.7531225878969177, + "learning_rate": 1.9912040213849762e-05, + "loss": 0.8875, + "step": 863 + }, + { + "epoch": 0.07109648220530755, + "grad_norm": 4.481796954208249, + "learning_rate": 1.9911687128412708e-05, + "loss": 0.8636, + "step": 864 + }, + { + "epoch": 0.07117876980045258, + "grad_norm": 2.545397332779262, + "learning_rate": 1.9911333338865245e-05, + "loss": 0.8803, + "step": 865 + }, + { + "epoch": 0.07126105739559761, + "grad_norm": 3.045980428767302, + "learning_rate": 1.9910978845232517e-05, + "loss": 0.9035, + "step": 866 + }, + { + "epoch": 0.07134334499074264, + "grad_norm": 3.6871914250355715, + "learning_rate": 1.9910623647539702e-05, + "loss": 0.8666, + "step": 867 + }, + { + "epoch": 0.07142563258588767, + "grad_norm": 2.116550202268351, + "learning_rate": 1.991026774581203e-05, + "loss": 0.9031, + "step": 868 + }, + { + "epoch": 0.0715079201810327, + "grad_norm": 2.532009330642646, + "learning_rate": 1.9909911140074788e-05, + "loss": 0.8661, + "step": 869 + }, + { + "epoch": 0.07159020777617774, + "grad_norm": 3.33485917673071, + "learning_rate": 1.9909553830353308e-05, + "loss": 0.8776, + "step": 870 + }, + { + "epoch": 0.07167249537132277, + "grad_norm": 2.3439342371747167, + "learning_rate": 1.990919581667297e-05, + "loss": 0.9151, + "step": 871 + }, + { + "epoch": 0.07175478296646781, + "grad_norm": 2.488600787006511, + "learning_rate": 1.9908837099059212e-05, + "loss": 0.9165, + "step": 872 + }, + { + "epoch": 0.07183707056161284, + "grad_norm": 3.95670742389146, + "learning_rate": 1.990847767753751e-05, + "loss": 0.8659, + "step": 873 + }, + { + "epoch": 0.07191935815675787, + "grad_norm": 0.5947750160477462, + "learning_rate": 1.99081175521334e-05, + "loss": 0.5886, + "step": 874 + }, + { + "epoch": 0.0720016457519029, + "grad_norm": 2.033586754058639, + "learning_rate": 1.9907756722872465e-05, + "loss": 0.8897, + "step": 875 + }, + { + "epoch": 0.07208393334704793, + "grad_norm": 3.346298659721499, + "learning_rate": 1.9907395189780335e-05, + "loss": 0.902, + "step": 876 + }, + { + "epoch": 0.07216622094219297, + "grad_norm": 3.004056249927372, + "learning_rate": 1.9907032952882703e-05, + "loss": 0.8715, + "step": 877 + }, + { + "epoch": 0.072248508537338, + "grad_norm": 5.4098932917643285, + "learning_rate": 1.9906670012205286e-05, + "loss": 0.8866, + "step": 878 + }, + { + "epoch": 0.07233079613248303, + "grad_norm": 6.828654192266096, + "learning_rate": 1.990630636777388e-05, + "loss": 0.8689, + "step": 879 + }, + { + "epoch": 0.07241308372762806, + "grad_norm": 2.6337207605941737, + "learning_rate": 1.9905942019614312e-05, + "loss": 0.8647, + "step": 880 + }, + { + "epoch": 0.07249537132277309, + "grad_norm": 0.5235737963953581, + "learning_rate": 1.990557696775246e-05, + "loss": 0.5661, + "step": 881 + }, + { + "epoch": 0.07257765891791812, + "grad_norm": 11.548238836629363, + "learning_rate": 1.9905211212214266e-05, + "loss": 0.9294, + "step": 882 + }, + { + "epoch": 0.07265994651306315, + "grad_norm": 5.489164212385315, + "learning_rate": 1.990484475302571e-05, + "loss": 0.8685, + "step": 883 + }, + { + "epoch": 0.07274223410820818, + "grad_norm": 7.88390924258145, + "learning_rate": 1.990447759021282e-05, + "loss": 0.874, + "step": 884 + }, + { + "epoch": 0.07282452170335323, + "grad_norm": 4.299200684634295, + "learning_rate": 1.9904109723801684e-05, + "loss": 0.9146, + "step": 885 + }, + { + "epoch": 0.07290680929849826, + "grad_norm": 6.21170690266594, + "learning_rate": 1.990374115381843e-05, + "loss": 0.8728, + "step": 886 + }, + { + "epoch": 0.07298909689364329, + "grad_norm": 4.563438990093578, + "learning_rate": 1.9903371880289247e-05, + "loss": 0.8747, + "step": 887 + }, + { + "epoch": 0.07307138448878832, + "grad_norm": 3.6273703961737187, + "learning_rate": 1.990300190324036e-05, + "loss": 0.9008, + "step": 888 + }, + { + "epoch": 0.07315367208393335, + "grad_norm": 7.441233530871766, + "learning_rate": 1.9902631222698057e-05, + "loss": 0.9141, + "step": 889 + }, + { + "epoch": 0.07323595967907838, + "grad_norm": 4.82833921873659, + "learning_rate": 1.990225983868867e-05, + "loss": 0.9339, + "step": 890 + }, + { + "epoch": 0.07331824727422341, + "grad_norm": 5.887738980648113, + "learning_rate": 1.9901887751238577e-05, + "loss": 0.8799, + "step": 891 + }, + { + "epoch": 0.07340053486936844, + "grad_norm": 2.5245499693701072, + "learning_rate": 1.9901514960374217e-05, + "loss": 0.8835, + "step": 892 + }, + { + "epoch": 0.07348282246451347, + "grad_norm": 6.763974106441189, + "learning_rate": 1.990114146612207e-05, + "loss": 0.891, + "step": 893 + }, + { + "epoch": 0.0735651100596585, + "grad_norm": 2.8844071869365835, + "learning_rate": 1.9900767268508666e-05, + "loss": 0.9097, + "step": 894 + }, + { + "epoch": 0.07364739765480353, + "grad_norm": 5.440132687337712, + "learning_rate": 1.9900392367560588e-05, + "loss": 0.8831, + "step": 895 + }, + { + "epoch": 0.07372968524994856, + "grad_norm": 3.745407109325051, + "learning_rate": 1.9900016763304472e-05, + "loss": 0.8805, + "step": 896 + }, + { + "epoch": 0.0738119728450936, + "grad_norm": 4.288740968099518, + "learning_rate": 1.9899640455766997e-05, + "loss": 0.8891, + "step": 897 + }, + { + "epoch": 0.07389426044023864, + "grad_norm": 2.755838421562454, + "learning_rate": 1.9899263444974894e-05, + "loss": 0.8973, + "step": 898 + }, + { + "epoch": 0.07397654803538367, + "grad_norm": 2.63866374184814, + "learning_rate": 1.9898885730954948e-05, + "loss": 0.8418, + "step": 899 + }, + { + "epoch": 0.0740588356305287, + "grad_norm": 3.0901321494386598, + "learning_rate": 1.9898507313733995e-05, + "loss": 0.8614, + "step": 900 + }, + { + "epoch": 0.07414112322567373, + "grad_norm": 2.754917360078824, + "learning_rate": 1.9898128193338907e-05, + "loss": 0.8964, + "step": 901 + }, + { + "epoch": 0.07422341082081876, + "grad_norm": 2.4717700343085163, + "learning_rate": 1.9897748369796627e-05, + "loss": 0.8793, + "step": 902 + }, + { + "epoch": 0.0743056984159638, + "grad_norm": 2.2819538240312585, + "learning_rate": 1.989736784313413e-05, + "loss": 0.9086, + "step": 903 + }, + { + "epoch": 0.07438798601110883, + "grad_norm": 2.7031870546344385, + "learning_rate": 1.989698661337845e-05, + "loss": 0.8601, + "step": 904 + }, + { + "epoch": 0.07447027360625386, + "grad_norm": 2.2788277737039757, + "learning_rate": 1.9896604680556664e-05, + "loss": 0.8464, + "step": 905 + }, + { + "epoch": 0.07455256120139889, + "grad_norm": 2.0567769102378954, + "learning_rate": 1.9896222044695914e-05, + "loss": 0.8807, + "step": 906 + }, + { + "epoch": 0.07463484879654392, + "grad_norm": 2.384203325674513, + "learning_rate": 1.9895838705823377e-05, + "loss": 0.8923, + "step": 907 + }, + { + "epoch": 0.07471713639168895, + "grad_norm": 2.0967277384590535, + "learning_rate": 1.989545466396628e-05, + "loss": 0.8793, + "step": 908 + }, + { + "epoch": 0.07479942398683398, + "grad_norm": 9.442852725541027, + "learning_rate": 1.9895069919151915e-05, + "loss": 0.8965, + "step": 909 + }, + { + "epoch": 0.07488171158197901, + "grad_norm": 5.109761027664979, + "learning_rate": 1.9894684471407605e-05, + "loss": 0.8983, + "step": 910 + }, + { + "epoch": 0.07496399917712405, + "grad_norm": 2.2367018687313185, + "learning_rate": 1.9894298320760733e-05, + "loss": 0.8879, + "step": 911 + }, + { + "epoch": 0.07504628677226909, + "grad_norm": 2.6873708972425656, + "learning_rate": 1.989391146723873e-05, + "loss": 0.8975, + "step": 912 + }, + { + "epoch": 0.07512857436741412, + "grad_norm": 0.5656242706848698, + "learning_rate": 1.9893523910869085e-05, + "loss": 0.617, + "step": 913 + }, + { + "epoch": 0.07521086196255915, + "grad_norm": 3.9316911134297814, + "learning_rate": 1.989313565167932e-05, + "loss": 0.9385, + "step": 914 + }, + { + "epoch": 0.07529314955770418, + "grad_norm": 2.783913423475105, + "learning_rate": 1.9892746689697024e-05, + "loss": 0.898, + "step": 915 + }, + { + "epoch": 0.07537543715284921, + "grad_norm": 4.235687618463353, + "learning_rate": 1.989235702494982e-05, + "loss": 0.8539, + "step": 916 + }, + { + "epoch": 0.07545772474799424, + "grad_norm": 2.387819568149409, + "learning_rate": 1.9891966657465397e-05, + "loss": 0.8369, + "step": 917 + }, + { + "epoch": 0.07554001234313927, + "grad_norm": 3.6947231383398424, + "learning_rate": 1.989157558727148e-05, + "loss": 0.8834, + "step": 918 + }, + { + "epoch": 0.0756222999382843, + "grad_norm": 2.604963394831731, + "learning_rate": 1.989118381439585e-05, + "loss": 0.9019, + "step": 919 + }, + { + "epoch": 0.07570458753342933, + "grad_norm": 0.5332477363950743, + "learning_rate": 1.9890791338866344e-05, + "loss": 0.5771, + "step": 920 + }, + { + "epoch": 0.07578687512857436, + "grad_norm": 3.2104258542562953, + "learning_rate": 1.9890398160710837e-05, + "loss": 0.9337, + "step": 921 + }, + { + "epoch": 0.0758691627237194, + "grad_norm": 0.48633325822320617, + "learning_rate": 1.9890004279957266e-05, + "loss": 0.5602, + "step": 922 + }, + { + "epoch": 0.07595145031886442, + "grad_norm": 12.835475358323716, + "learning_rate": 1.9889609696633606e-05, + "loss": 0.8553, + "step": 923 + }, + { + "epoch": 0.07603373791400947, + "grad_norm": 3.2124511867282037, + "learning_rate": 1.9889214410767887e-05, + "loss": 0.8674, + "step": 924 + }, + { + "epoch": 0.0761160255091545, + "grad_norm": 2.904116877033008, + "learning_rate": 1.9888818422388193e-05, + "loss": 0.8747, + "step": 925 + }, + { + "epoch": 0.07619831310429953, + "grad_norm": 3.157871788078832, + "learning_rate": 1.9888421731522656e-05, + "loss": 0.8891, + "step": 926 + }, + { + "epoch": 0.07628060069944456, + "grad_norm": 2.3718730999123547, + "learning_rate": 1.9888024338199448e-05, + "loss": 0.8993, + "step": 927 + }, + { + "epoch": 0.07636288829458959, + "grad_norm": 2.4565769064213723, + "learning_rate": 1.988762624244681e-05, + "loss": 0.9013, + "step": 928 + }, + { + "epoch": 0.07644517588973462, + "grad_norm": 2.540968098318489, + "learning_rate": 1.988722744429301e-05, + "loss": 0.8633, + "step": 929 + }, + { + "epoch": 0.07652746348487965, + "grad_norm": 3.56518007003656, + "learning_rate": 1.988682794376639e-05, + "loss": 0.8882, + "step": 930 + }, + { + "epoch": 0.07660975108002469, + "grad_norm": 2.176182910474906, + "learning_rate": 1.9886427740895325e-05, + "loss": 0.9149, + "step": 931 + }, + { + "epoch": 0.07669203867516972, + "grad_norm": 0.5807290241092793, + "learning_rate": 1.9886026835708242e-05, + "loss": 0.5897, + "step": 932 + }, + { + "epoch": 0.07677432627031475, + "grad_norm": 0.5568253540494434, + "learning_rate": 1.9885625228233624e-05, + "loss": 0.5944, + "step": 933 + }, + { + "epoch": 0.07685661386545978, + "grad_norm": 0.46307351633355415, + "learning_rate": 1.9885222918499998e-05, + "loss": 0.5687, + "step": 934 + }, + { + "epoch": 0.07693890146060481, + "grad_norm": 2.21686936101954, + "learning_rate": 1.9884819906535946e-05, + "loss": 0.899, + "step": 935 + }, + { + "epoch": 0.07702118905574984, + "grad_norm": 2.7051990886793758, + "learning_rate": 1.9884416192370096e-05, + "loss": 0.9015, + "step": 936 + }, + { + "epoch": 0.07710347665089488, + "grad_norm": 2.1375647901334385, + "learning_rate": 1.988401177603113e-05, + "loss": 0.9001, + "step": 937 + }, + { + "epoch": 0.07718576424603991, + "grad_norm": 4.132265546672556, + "learning_rate": 1.988360665754777e-05, + "loss": 0.8908, + "step": 938 + }, + { + "epoch": 0.07726805184118495, + "grad_norm": 2.1359019957192533, + "learning_rate": 1.9883200836948803e-05, + "loss": 0.8717, + "step": 939 + }, + { + "epoch": 0.07735033943632998, + "grad_norm": 3.9513646854514386, + "learning_rate": 1.9882794314263053e-05, + "loss": 0.8718, + "step": 940 + }, + { + "epoch": 0.07743262703147501, + "grad_norm": 2.321609974282721, + "learning_rate": 1.9882387089519398e-05, + "loss": 0.869, + "step": 941 + }, + { + "epoch": 0.07751491462662004, + "grad_norm": 3.70309268916697, + "learning_rate": 1.9881979162746772e-05, + "loss": 0.8649, + "step": 942 + }, + { + "epoch": 0.07759720222176507, + "grad_norm": 3.361767416529052, + "learning_rate": 1.9881570533974148e-05, + "loss": 0.8683, + "step": 943 + }, + { + "epoch": 0.0776794898169101, + "grad_norm": 3.4179325921845036, + "learning_rate": 1.988116120323056e-05, + "loss": 0.8963, + "step": 944 + }, + { + "epoch": 0.07776177741205513, + "grad_norm": 3.021751145368183, + "learning_rate": 1.988075117054508e-05, + "loss": 0.8746, + "step": 945 + }, + { + "epoch": 0.07784406500720016, + "grad_norm": 3.5878829514900974, + "learning_rate": 1.9880340435946837e-05, + "loss": 0.8516, + "step": 946 + }, + { + "epoch": 0.07792635260234519, + "grad_norm": 1.920072678794743, + "learning_rate": 1.9879928999465016e-05, + "loss": 0.8937, + "step": 947 + }, + { + "epoch": 0.07800864019749022, + "grad_norm": 2.2091268186489796, + "learning_rate": 1.9879516861128835e-05, + "loss": 0.8475, + "step": 948 + }, + { + "epoch": 0.07809092779263525, + "grad_norm": 2.2168445139505644, + "learning_rate": 1.9879104020967577e-05, + "loss": 0.8633, + "step": 949 + }, + { + "epoch": 0.0781732153877803, + "grad_norm": 1.0323698606460356, + "learning_rate": 1.9878690479010568e-05, + "loss": 0.6111, + "step": 950 + }, + { + "epoch": 0.07825550298292533, + "grad_norm": 2.682420816107399, + "learning_rate": 1.987827623528719e-05, + "loss": 0.9341, + "step": 951 + }, + { + "epoch": 0.07833779057807036, + "grad_norm": 0.6240540448167275, + "learning_rate": 1.987786128982686e-05, + "loss": 0.5523, + "step": 952 + }, + { + "epoch": 0.07842007817321539, + "grad_norm": 3.6752862094905905, + "learning_rate": 1.9877445642659066e-05, + "loss": 0.9273, + "step": 953 + }, + { + "epoch": 0.07850236576836042, + "grad_norm": 2.3734201750601858, + "learning_rate": 1.987702929381333e-05, + "loss": 0.8919, + "step": 954 + }, + { + "epoch": 0.07858465336350545, + "grad_norm": 0.7387548503010232, + "learning_rate": 1.9876612243319228e-05, + "loss": 0.5746, + "step": 955 + }, + { + "epoch": 0.07866694095865048, + "grad_norm": 0.6959735516945202, + "learning_rate": 1.9876194491206388e-05, + "loss": 0.5751, + "step": 956 + }, + { + "epoch": 0.07874922855379551, + "grad_norm": 2.1882974936345394, + "learning_rate": 1.9875776037504482e-05, + "loss": 0.9006, + "step": 957 + }, + { + "epoch": 0.07883151614894054, + "grad_norm": 2.341847998608011, + "learning_rate": 1.9875356882243245e-05, + "loss": 0.9041, + "step": 958 + }, + { + "epoch": 0.07891380374408558, + "grad_norm": 2.1628210206575433, + "learning_rate": 1.9874937025452445e-05, + "loss": 0.8883, + "step": 959 + }, + { + "epoch": 0.0789960913392306, + "grad_norm": 2.8510221399462483, + "learning_rate": 1.9874516467161914e-05, + "loss": 0.9231, + "step": 960 + }, + { + "epoch": 0.07907837893437564, + "grad_norm": 4.694838855869676, + "learning_rate": 1.9874095207401526e-05, + "loss": 0.9156, + "step": 961 + }, + { + "epoch": 0.07916066652952067, + "grad_norm": 2.877307386668155, + "learning_rate": 1.98736732462012e-05, + "loss": 0.8686, + "step": 962 + }, + { + "epoch": 0.07924295412466571, + "grad_norm": 2.581259841624273, + "learning_rate": 1.9873250583590923e-05, + "loss": 0.9125, + "step": 963 + }, + { + "epoch": 0.07932524171981074, + "grad_norm": 2.3158798477006037, + "learning_rate": 1.9872827219600716e-05, + "loss": 0.8926, + "step": 964 + }, + { + "epoch": 0.07940752931495577, + "grad_norm": 3.0098712265326784, + "learning_rate": 1.987240315426065e-05, + "loss": 0.8758, + "step": 965 + }, + { + "epoch": 0.0794898169101008, + "grad_norm": 3.1422180864323233, + "learning_rate": 1.987197838760085e-05, + "loss": 0.8908, + "step": 966 + }, + { + "epoch": 0.07957210450524584, + "grad_norm": 0.9645131727703571, + "learning_rate": 1.9871552919651494e-05, + "loss": 0.6045, + "step": 967 + }, + { + "epoch": 0.07965439210039087, + "grad_norm": 3.56520313826412, + "learning_rate": 1.9871126750442807e-05, + "loss": 0.8696, + "step": 968 + }, + { + "epoch": 0.0797366796955359, + "grad_norm": 2.0059409411059113, + "learning_rate": 1.9870699880005063e-05, + "loss": 0.8799, + "step": 969 + }, + { + "epoch": 0.07981896729068093, + "grad_norm": 4.983123742682501, + "learning_rate": 1.9870272308368584e-05, + "loss": 0.8693, + "step": 970 + }, + { + "epoch": 0.07990125488582596, + "grad_norm": 2.1182309366583474, + "learning_rate": 1.9869844035563747e-05, + "loss": 0.8649, + "step": 971 + }, + { + "epoch": 0.07998354248097099, + "grad_norm": 2.157976641839583, + "learning_rate": 1.986941506162097e-05, + "loss": 0.8844, + "step": 972 + }, + { + "epoch": 0.08006583007611602, + "grad_norm": 3.1179516322271117, + "learning_rate": 1.9868985386570734e-05, + "loss": 0.8702, + "step": 973 + }, + { + "epoch": 0.08014811767126105, + "grad_norm": 2.1804704549093246, + "learning_rate": 1.986855501044356e-05, + "loss": 0.8963, + "step": 974 + }, + { + "epoch": 0.08023040526640608, + "grad_norm": 2.825665735780858, + "learning_rate": 1.986812393327002e-05, + "loss": 0.9028, + "step": 975 + }, + { + "epoch": 0.08031269286155113, + "grad_norm": 2.7064578154820276, + "learning_rate": 1.9867692155080736e-05, + "loss": 0.8922, + "step": 976 + }, + { + "epoch": 0.08039498045669616, + "grad_norm": 4.940848988099329, + "learning_rate": 1.9867259675906383e-05, + "loss": 0.9096, + "step": 977 + }, + { + "epoch": 0.08047726805184119, + "grad_norm": 3.7159663449631943, + "learning_rate": 1.9866826495777683e-05, + "loss": 0.8946, + "step": 978 + }, + { + "epoch": 0.08055955564698622, + "grad_norm": 4.235722900766384, + "learning_rate": 1.9866392614725408e-05, + "loss": 0.8844, + "step": 979 + }, + { + "epoch": 0.08064184324213125, + "grad_norm": 2.5725805077545796, + "learning_rate": 1.9865958032780383e-05, + "loss": 0.8849, + "step": 980 + }, + { + "epoch": 0.08072413083727628, + "grad_norm": 3.2900229009140367, + "learning_rate": 1.986552274997348e-05, + "loss": 0.8712, + "step": 981 + }, + { + "epoch": 0.08080641843242131, + "grad_norm": 2.7018112393037206, + "learning_rate": 1.986508676633561e-05, + "loss": 0.881, + "step": 982 + }, + { + "epoch": 0.08088870602756634, + "grad_norm": 3.2565064868257356, + "learning_rate": 1.986465008189776e-05, + "loss": 0.8741, + "step": 983 + }, + { + "epoch": 0.08097099362271137, + "grad_norm": 2.977427479800942, + "learning_rate": 1.986421269669094e-05, + "loss": 0.864, + "step": 984 + }, + { + "epoch": 0.0810532812178564, + "grad_norm": 2.8391838913702734, + "learning_rate": 1.986377461074623e-05, + "loss": 0.8777, + "step": 985 + }, + { + "epoch": 0.08113556881300144, + "grad_norm": 2.228144074432828, + "learning_rate": 1.9863335824094742e-05, + "loss": 0.8873, + "step": 986 + }, + { + "epoch": 0.08121785640814647, + "grad_norm": 2.6153835393886444, + "learning_rate": 1.9862896336767654e-05, + "loss": 0.8565, + "step": 987 + }, + { + "epoch": 0.08130014400329151, + "grad_norm": 2.469488378896095, + "learning_rate": 1.9862456148796182e-05, + "loss": 0.9062, + "step": 988 + }, + { + "epoch": 0.08138243159843654, + "grad_norm": 0.9008951474609029, + "learning_rate": 1.98620152602116e-05, + "loss": 0.5855, + "step": 989 + }, + { + "epoch": 0.08146471919358157, + "grad_norm": 3.1010964992276335, + "learning_rate": 1.986157367104522e-05, + "loss": 0.8901, + "step": 990 + }, + { + "epoch": 0.0815470067887266, + "grad_norm": 2.745575020455269, + "learning_rate": 1.9861131381328422e-05, + "loss": 0.8992, + "step": 991 + }, + { + "epoch": 0.08162929438387163, + "grad_norm": 2.319333762749616, + "learning_rate": 1.9860688391092623e-05, + "loss": 0.8489, + "step": 992 + }, + { + "epoch": 0.08171158197901666, + "grad_norm": 1.8701951574677815, + "learning_rate": 1.9860244700369288e-05, + "loss": 0.8895, + "step": 993 + }, + { + "epoch": 0.0817938695741617, + "grad_norm": 2.4973895580746928, + "learning_rate": 1.985980030918994e-05, + "loss": 0.8414, + "step": 994 + }, + { + "epoch": 0.08187615716930673, + "grad_norm": 2.542292639884159, + "learning_rate": 1.9859355217586144e-05, + "loss": 0.8865, + "step": 995 + }, + { + "epoch": 0.08195844476445176, + "grad_norm": 0.5992255264191748, + "learning_rate": 1.9858909425589524e-05, + "loss": 0.5575, + "step": 996 + }, + { + "epoch": 0.08204073235959679, + "grad_norm": 2.143472686925439, + "learning_rate": 1.9858462933231742e-05, + "loss": 0.8543, + "step": 997 + }, + { + "epoch": 0.08212301995474182, + "grad_norm": 2.49083696229216, + "learning_rate": 1.9858015740544524e-05, + "loss": 0.8961, + "step": 998 + }, + { + "epoch": 0.08220530754988685, + "grad_norm": 5.032363107017064, + "learning_rate": 1.985756784755963e-05, + "loss": 0.869, + "step": 999 + }, + { + "epoch": 0.08228759514503188, + "grad_norm": 3.456646347683982, + "learning_rate": 1.9857119254308885e-05, + "loss": 0.868, + "step": 1000 + }, + { + "epoch": 0.08236988274017693, + "grad_norm": 3.7630419410589755, + "learning_rate": 1.9856669960824147e-05, + "loss": 0.9249, + "step": 1001 + }, + { + "epoch": 0.08245217033532196, + "grad_norm": 3.1625549709552994, + "learning_rate": 1.985621996713734e-05, + "loss": 0.8869, + "step": 1002 + }, + { + "epoch": 0.08253445793046699, + "grad_norm": 3.881507636381793, + "learning_rate": 1.985576927328043e-05, + "loss": 0.888, + "step": 1003 + }, + { + "epoch": 0.08261674552561202, + "grad_norm": 2.544247409259161, + "learning_rate": 1.9855317879285434e-05, + "loss": 0.8715, + "step": 1004 + }, + { + "epoch": 0.08269903312075705, + "grad_norm": 2.5279916413903583, + "learning_rate": 1.9854865785184417e-05, + "loss": 0.8849, + "step": 1005 + }, + { + "epoch": 0.08278132071590208, + "grad_norm": 3.4196695037594576, + "learning_rate": 1.9854412991009494e-05, + "loss": 0.8364, + "step": 1006 + }, + { + "epoch": 0.08286360831104711, + "grad_norm": 2.759961086631554, + "learning_rate": 1.985395949679283e-05, + "loss": 0.854, + "step": 1007 + }, + { + "epoch": 0.08294589590619214, + "grad_norm": 0.5731316878529051, + "learning_rate": 1.9853505302566646e-05, + "loss": 0.6152, + "step": 1008 + }, + { + "epoch": 0.08302818350133717, + "grad_norm": 2.9549671685361525, + "learning_rate": 1.98530504083632e-05, + "loss": 0.861, + "step": 1009 + }, + { + "epoch": 0.0831104710964822, + "grad_norm": 2.3193711696281025, + "learning_rate": 1.9852594814214812e-05, + "loss": 0.865, + "step": 1010 + }, + { + "epoch": 0.08319275869162723, + "grad_norm": 3.0076758009209636, + "learning_rate": 1.9852138520153846e-05, + "loss": 0.8852, + "step": 1011 + }, + { + "epoch": 0.08327504628677226, + "grad_norm": 2.732008977686221, + "learning_rate": 1.9851681526212716e-05, + "loss": 0.8928, + "step": 1012 + }, + { + "epoch": 0.0833573338819173, + "grad_norm": 2.37950207279815, + "learning_rate": 1.9851223832423886e-05, + "loss": 0.8617, + "step": 1013 + }, + { + "epoch": 0.08343962147706234, + "grad_norm": 2.464424002675186, + "learning_rate": 1.985076543881987e-05, + "loss": 0.8625, + "step": 1014 + }, + { + "epoch": 0.08352190907220737, + "grad_norm": 2.9080302916718015, + "learning_rate": 1.985030634543323e-05, + "loss": 0.8832, + "step": 1015 + }, + { + "epoch": 0.0836041966673524, + "grad_norm": 2.6287476224799655, + "learning_rate": 1.984984655229658e-05, + "loss": 0.8728, + "step": 1016 + }, + { + "epoch": 0.08368648426249743, + "grad_norm": 2.5936175763493052, + "learning_rate": 1.9849386059442585e-05, + "loss": 0.8678, + "step": 1017 + }, + { + "epoch": 0.08376877185764246, + "grad_norm": 2.3604963235792904, + "learning_rate": 1.9848924866903955e-05, + "loss": 0.8783, + "step": 1018 + }, + { + "epoch": 0.0838510594527875, + "grad_norm": 0.5341112663835049, + "learning_rate": 1.984846297471345e-05, + "loss": 0.605, + "step": 1019 + }, + { + "epoch": 0.08393334704793252, + "grad_norm": 2.9860218730439057, + "learning_rate": 1.984800038290389e-05, + "loss": 0.8525, + "step": 1020 + }, + { + "epoch": 0.08401563464307756, + "grad_norm": 2.4630212214875025, + "learning_rate": 1.9847537091508134e-05, + "loss": 0.8825, + "step": 1021 + }, + { + "epoch": 0.08409792223822259, + "grad_norm": 2.424908485494412, + "learning_rate": 1.984707310055909e-05, + "loss": 0.891, + "step": 1022 + }, + { + "epoch": 0.08418020983336762, + "grad_norm": 2.886480910540036, + "learning_rate": 1.984660841008972e-05, + "loss": 0.8935, + "step": 1023 + }, + { + "epoch": 0.08426249742851265, + "grad_norm": 2.4246756718684384, + "learning_rate": 1.9846143020133035e-05, + "loss": 0.8679, + "step": 1024 + }, + { + "epoch": 0.08434478502365768, + "grad_norm": 4.020038177987053, + "learning_rate": 1.98456769307221e-05, + "loss": 0.8191, + "step": 1025 + }, + { + "epoch": 0.08442707261880271, + "grad_norm": 2.6823999549769795, + "learning_rate": 1.9845210141890018e-05, + "loss": 0.8618, + "step": 1026 + }, + { + "epoch": 0.08450936021394775, + "grad_norm": 2.2350487266641035, + "learning_rate": 1.9844742653669953e-05, + "loss": 0.8595, + "step": 1027 + }, + { + "epoch": 0.08459164780909278, + "grad_norm": 4.977761117586025, + "learning_rate": 1.9844274466095117e-05, + "loss": 0.8516, + "step": 1028 + }, + { + "epoch": 0.08467393540423782, + "grad_norm": 3.31805191100729, + "learning_rate": 1.9843805579198766e-05, + "loss": 0.8636, + "step": 1029 + }, + { + "epoch": 0.08475622299938285, + "grad_norm": 2.5881873279624648, + "learning_rate": 1.9843335993014206e-05, + "loss": 0.8667, + "step": 1030 + }, + { + "epoch": 0.08483851059452788, + "grad_norm": 3.9560157884462, + "learning_rate": 1.98428657075748e-05, + "loss": 0.8799, + "step": 1031 + }, + { + "epoch": 0.08492079818967291, + "grad_norm": 2.5965271671259753, + "learning_rate": 1.984239472291396e-05, + "loss": 0.8714, + "step": 1032 + }, + { + "epoch": 0.08500308578481794, + "grad_norm": 2.9384162786300094, + "learning_rate": 1.9841923039065136e-05, + "loss": 0.8784, + "step": 1033 + }, + { + "epoch": 0.08508537337996297, + "grad_norm": 4.575841979886102, + "learning_rate": 1.984145065606184e-05, + "loss": 0.871, + "step": 1034 + }, + { + "epoch": 0.085167660975108, + "grad_norm": 2.6762798398130205, + "learning_rate": 1.984097757393763e-05, + "loss": 0.8884, + "step": 1035 + }, + { + "epoch": 0.08524994857025303, + "grad_norm": 2.3317749715867757, + "learning_rate": 1.9840503792726107e-05, + "loss": 0.8582, + "step": 1036 + }, + { + "epoch": 0.08533223616539806, + "grad_norm": 2.5192408862448925, + "learning_rate": 1.9840029312460936e-05, + "loss": 0.8987, + "step": 1037 + }, + { + "epoch": 0.08541452376054309, + "grad_norm": 3.0314447963476954, + "learning_rate": 1.9839554133175815e-05, + "loss": 0.9115, + "step": 1038 + }, + { + "epoch": 0.08549681135568812, + "grad_norm": 2.718611923577393, + "learning_rate": 1.983907825490451e-05, + "loss": 0.8768, + "step": 1039 + }, + { + "epoch": 0.08557909895083317, + "grad_norm": 3.2506331598038063, + "learning_rate": 1.9838601677680818e-05, + "loss": 0.8892, + "step": 1040 + }, + { + "epoch": 0.0856613865459782, + "grad_norm": 2.8785960552339844, + "learning_rate": 1.9838124401538596e-05, + "loss": 0.8762, + "step": 1041 + }, + { + "epoch": 0.08574367414112323, + "grad_norm": 3.255205364224761, + "learning_rate": 1.9837646426511755e-05, + "loss": 0.8878, + "step": 1042 + }, + { + "epoch": 0.08582596173626826, + "grad_norm": 2.152447959926313, + "learning_rate": 1.9837167752634243e-05, + "loss": 0.8939, + "step": 1043 + }, + { + "epoch": 0.08590824933141329, + "grad_norm": 6.038167525170103, + "learning_rate": 1.983668837994006e-05, + "loss": 0.854, + "step": 1044 + }, + { + "epoch": 0.08599053692655832, + "grad_norm": 2.4872882270608296, + "learning_rate": 1.983620830846327e-05, + "loss": 0.865, + "step": 1045 + }, + { + "epoch": 0.08607282452170335, + "grad_norm": 5.0878964623293905, + "learning_rate": 1.9835727538237977e-05, + "loss": 0.8848, + "step": 1046 + }, + { + "epoch": 0.08615511211684838, + "grad_norm": 0.5466809522376739, + "learning_rate": 1.9835246069298325e-05, + "loss": 0.5879, + "step": 1047 + }, + { + "epoch": 0.08623739971199341, + "grad_norm": 2.8930059060138134, + "learning_rate": 1.9834763901678523e-05, + "loss": 0.9032, + "step": 1048 + }, + { + "epoch": 0.08631968730713845, + "grad_norm": 3.481150201855255, + "learning_rate": 1.983428103541282e-05, + "loss": 0.895, + "step": 1049 + }, + { + "epoch": 0.08640197490228348, + "grad_norm": 2.2668611618771806, + "learning_rate": 1.983379747053552e-05, + "loss": 0.8841, + "step": 1050 + }, + { + "epoch": 0.08648426249742851, + "grad_norm": 0.5012767267519984, + "learning_rate": 1.9833313207080976e-05, + "loss": 0.5584, + "step": 1051 + }, + { + "epoch": 0.08656655009257354, + "grad_norm": 4.03230401593853, + "learning_rate": 1.983282824508359e-05, + "loss": 0.8722, + "step": 1052 + }, + { + "epoch": 0.08664883768771858, + "grad_norm": 3.2238027639613662, + "learning_rate": 1.9832342584577808e-05, + "loss": 0.9061, + "step": 1053 + }, + { + "epoch": 0.08673112528286361, + "grad_norm": 2.5875473888993827, + "learning_rate": 1.9831856225598134e-05, + "loss": 0.8655, + "step": 1054 + }, + { + "epoch": 0.08681341287800864, + "grad_norm": 2.9531227295823435, + "learning_rate": 1.9831369168179116e-05, + "loss": 0.9014, + "step": 1055 + }, + { + "epoch": 0.08689570047315368, + "grad_norm": 3.2403950768604273, + "learning_rate": 1.9830881412355356e-05, + "loss": 0.8802, + "step": 1056 + }, + { + "epoch": 0.0869779880682987, + "grad_norm": 2.6421330385224406, + "learning_rate": 1.9830392958161505e-05, + "loss": 0.8624, + "step": 1057 + }, + { + "epoch": 0.08706027566344374, + "grad_norm": 2.796247945415367, + "learning_rate": 1.9829903805632257e-05, + "loss": 0.8465, + "step": 1058 + }, + { + "epoch": 0.08714256325858877, + "grad_norm": 0.5356691167104551, + "learning_rate": 1.982941395480236e-05, + "loss": 0.5749, + "step": 1059 + }, + { + "epoch": 0.0872248508537338, + "grad_norm": 2.543782162970702, + "learning_rate": 1.9828923405706622e-05, + "loss": 0.8651, + "step": 1060 + }, + { + "epoch": 0.08730713844887883, + "grad_norm": 5.052374438346327, + "learning_rate": 1.982843215837988e-05, + "loss": 0.8556, + "step": 1061 + }, + { + "epoch": 0.08738942604402386, + "grad_norm": 2.709282429422679, + "learning_rate": 1.9827940212857038e-05, + "loss": 0.8739, + "step": 1062 + }, + { + "epoch": 0.08747171363916889, + "grad_norm": 12.014153200069254, + "learning_rate": 1.982744756917304e-05, + "loss": 0.8685, + "step": 1063 + }, + { + "epoch": 0.08755400123431392, + "grad_norm": 4.7874082941622875, + "learning_rate": 1.9826954227362883e-05, + "loss": 0.8968, + "step": 1064 + }, + { + "epoch": 0.08763628882945895, + "grad_norm": 3.094799934600602, + "learning_rate": 1.9826460187461616e-05, + "loss": 0.8678, + "step": 1065 + }, + { + "epoch": 0.087718576424604, + "grad_norm": 2.2422659009449664, + "learning_rate": 1.982596544950433e-05, + "loss": 0.8764, + "step": 1066 + }, + { + "epoch": 0.08780086401974903, + "grad_norm": 3.436687255418153, + "learning_rate": 1.982547001352617e-05, + "loss": 0.8516, + "step": 1067 + }, + { + "epoch": 0.08788315161489406, + "grad_norm": 0.4947838359746663, + "learning_rate": 1.982497387956234e-05, + "loss": 0.5591, + "step": 1068 + }, + { + "epoch": 0.08796543921003909, + "grad_norm": 2.6289534390817098, + "learning_rate": 1.9824477047648073e-05, + "loss": 0.8481, + "step": 1069 + }, + { + "epoch": 0.08804772680518412, + "grad_norm": 0.4837575812403313, + "learning_rate": 1.9823979517818672e-05, + "loss": 0.5778, + "step": 1070 + }, + { + "epoch": 0.08813001440032915, + "grad_norm": 3.538024856422455, + "learning_rate": 1.9823481290109478e-05, + "loss": 0.8619, + "step": 1071 + }, + { + "epoch": 0.08821230199547418, + "grad_norm": 4.321407175482124, + "learning_rate": 1.982298236455588e-05, + "loss": 0.8846, + "step": 1072 + }, + { + "epoch": 0.08829458959061921, + "grad_norm": 3.616450253072054, + "learning_rate": 1.9822482741193324e-05, + "loss": 0.8856, + "step": 1073 + }, + { + "epoch": 0.08837687718576424, + "grad_norm": 4.473435045577941, + "learning_rate": 1.9821982420057308e-05, + "loss": 0.8608, + "step": 1074 + }, + { + "epoch": 0.08845916478090927, + "grad_norm": 0.5344599795616546, + "learning_rate": 1.9821481401183364e-05, + "loss": 0.5741, + "step": 1075 + }, + { + "epoch": 0.0885414523760543, + "grad_norm": 3.608389298386541, + "learning_rate": 1.982097968460709e-05, + "loss": 0.8832, + "step": 1076 + }, + { + "epoch": 0.08862373997119934, + "grad_norm": 4.223422665021111, + "learning_rate": 1.9820477270364123e-05, + "loss": 0.8854, + "step": 1077 + }, + { + "epoch": 0.08870602756634437, + "grad_norm": 3.236757188788279, + "learning_rate": 1.981997415849016e-05, + "loss": 0.8727, + "step": 1078 + }, + { + "epoch": 0.08878831516148941, + "grad_norm": 0.5297374533084104, + "learning_rate": 1.9819470349020936e-05, + "loss": 0.5883, + "step": 1079 + }, + { + "epoch": 0.08887060275663444, + "grad_norm": 2.8725890412006656, + "learning_rate": 1.9818965841992243e-05, + "loss": 0.8719, + "step": 1080 + }, + { + "epoch": 0.08895289035177947, + "grad_norm": 0.4917914943060142, + "learning_rate": 1.9818460637439917e-05, + "loss": 0.5497, + "step": 1081 + }, + { + "epoch": 0.0890351779469245, + "grad_norm": 3.666129989863918, + "learning_rate": 1.9817954735399853e-05, + "loss": 0.855, + "step": 1082 + }, + { + "epoch": 0.08911746554206953, + "grad_norm": 3.667558282780085, + "learning_rate": 1.9817448135907984e-05, + "loss": 0.8618, + "step": 1083 + }, + { + "epoch": 0.08919975313721457, + "grad_norm": 2.8134358753083597, + "learning_rate": 1.9816940839000303e-05, + "loss": 0.8639, + "step": 1084 + }, + { + "epoch": 0.0892820407323596, + "grad_norm": 3.8554001706730907, + "learning_rate": 1.981643284471284e-05, + "loss": 0.8449, + "step": 1085 + }, + { + "epoch": 0.08936432832750463, + "grad_norm": 3.767364747903415, + "learning_rate": 1.981592415308169e-05, + "loss": 0.8549, + "step": 1086 + }, + { + "epoch": 0.08944661592264966, + "grad_norm": 2.8398571302805453, + "learning_rate": 1.9815414764142986e-05, + "loss": 0.8735, + "step": 1087 + }, + { + "epoch": 0.08952890351779469, + "grad_norm": 2.980261363247237, + "learning_rate": 1.9814904677932912e-05, + "loss": 0.8725, + "step": 1088 + }, + { + "epoch": 0.08961119111293972, + "grad_norm": 3.7219107197197916, + "learning_rate": 1.9814393894487713e-05, + "loss": 0.9151, + "step": 1089 + }, + { + "epoch": 0.08969347870808475, + "grad_norm": 4.035211371174713, + "learning_rate": 1.981388241384366e-05, + "loss": 0.8825, + "step": 1090 + }, + { + "epoch": 0.08977576630322978, + "grad_norm": 3.053085785512212, + "learning_rate": 1.9813370236037098e-05, + "loss": 0.8497, + "step": 1091 + }, + { + "epoch": 0.08985805389837483, + "grad_norm": 0.5368604454434628, + "learning_rate": 1.981285736110441e-05, + "loss": 0.5812, + "step": 1092 + }, + { + "epoch": 0.08994034149351986, + "grad_norm": 4.355844807027429, + "learning_rate": 1.981234378908203e-05, + "loss": 0.8887, + "step": 1093 + }, + { + "epoch": 0.09002262908866489, + "grad_norm": 2.649968557975437, + "learning_rate": 1.9811829520006433e-05, + "loss": 0.8415, + "step": 1094 + }, + { + "epoch": 0.09010491668380992, + "grad_norm": 3.4417587859008214, + "learning_rate": 1.9811314553914166e-05, + "loss": 0.8685, + "step": 1095 + }, + { + "epoch": 0.09018720427895495, + "grad_norm": 0.48295286929932113, + "learning_rate": 1.98107988908418e-05, + "loss": 0.5608, + "step": 1096 + }, + { + "epoch": 0.09026949187409998, + "grad_norm": 4.948234702126818, + "learning_rate": 1.981028253082597e-05, + "loss": 0.8638, + "step": 1097 + }, + { + "epoch": 0.09035177946924501, + "grad_norm": 2.8257336957776733, + "learning_rate": 1.9809765473903362e-05, + "loss": 0.8402, + "step": 1098 + }, + { + "epoch": 0.09043406706439004, + "grad_norm": 0.48328014205289604, + "learning_rate": 1.98092477201107e-05, + "loss": 0.5797, + "step": 1099 + }, + { + "epoch": 0.09051635465953507, + "grad_norm": 3.1346349138814418, + "learning_rate": 1.980872926948477e-05, + "loss": 0.8675, + "step": 1100 + }, + { + "epoch": 0.0905986422546801, + "grad_norm": 2.707381646623277, + "learning_rate": 1.9808210122062396e-05, + "loss": 0.8588, + "step": 1101 + }, + { + "epoch": 0.09068092984982513, + "grad_norm": 0.4754150829561111, + "learning_rate": 1.9807690277880464e-05, + "loss": 0.5962, + "step": 1102 + }, + { + "epoch": 0.09076321744497017, + "grad_norm": 3.2149488041323946, + "learning_rate": 1.98071697369759e-05, + "loss": 0.849, + "step": 1103 + }, + { + "epoch": 0.0908455050401152, + "grad_norm": 3.1468421046064887, + "learning_rate": 1.9806648499385678e-05, + "loss": 0.8525, + "step": 1104 + }, + { + "epoch": 0.09092779263526024, + "grad_norm": 3.011551334891878, + "learning_rate": 1.9806126565146835e-05, + "loss": 0.862, + "step": 1105 + }, + { + "epoch": 0.09101008023040527, + "grad_norm": 3.7542041127163235, + "learning_rate": 1.980560393429644e-05, + "loss": 0.878, + "step": 1106 + }, + { + "epoch": 0.0910923678255503, + "grad_norm": 3.924675309445745, + "learning_rate": 1.9805080606871625e-05, + "loss": 0.8932, + "step": 1107 + }, + { + "epoch": 0.09117465542069533, + "grad_norm": 3.149434195229172, + "learning_rate": 1.980455658290956e-05, + "loss": 0.8968, + "step": 1108 + }, + { + "epoch": 0.09125694301584036, + "grad_norm": 0.4528941005660691, + "learning_rate": 1.9804031862447483e-05, + "loss": 0.5658, + "step": 1109 + }, + { + "epoch": 0.0913392306109854, + "grad_norm": 3.2710296854560688, + "learning_rate": 1.9803506445522658e-05, + "loss": 0.8739, + "step": 1110 + }, + { + "epoch": 0.09142151820613043, + "grad_norm": 0.48322757491755364, + "learning_rate": 1.9802980332172415e-05, + "loss": 0.592, + "step": 1111 + }, + { + "epoch": 0.09150380580127546, + "grad_norm": 3.600092282955291, + "learning_rate": 1.9802453522434123e-05, + "loss": 0.8524, + "step": 1112 + }, + { + "epoch": 0.09158609339642049, + "grad_norm": 3.7142303319750773, + "learning_rate": 1.980192601634521e-05, + "loss": 0.8811, + "step": 1113 + }, + { + "epoch": 0.09166838099156552, + "grad_norm": 3.133621188104266, + "learning_rate": 1.9801397813943156e-05, + "loss": 0.8937, + "step": 1114 + }, + { + "epoch": 0.09175066858671055, + "grad_norm": 5.265940334189566, + "learning_rate": 1.980086891526547e-05, + "loss": 0.8761, + "step": 1115 + }, + { + "epoch": 0.09183295618185558, + "grad_norm": 0.5062751751465183, + "learning_rate": 1.9800339320349732e-05, + "loss": 0.5516, + "step": 1116 + }, + { + "epoch": 0.09191524377700061, + "grad_norm": 3.772473804543901, + "learning_rate": 1.9799809029233558e-05, + "loss": 0.8375, + "step": 1117 + }, + { + "epoch": 0.09199753137214566, + "grad_norm": 3.8490743801526803, + "learning_rate": 1.9799278041954628e-05, + "loss": 0.877, + "step": 1118 + }, + { + "epoch": 0.09207981896729069, + "grad_norm": 3.5820410192444174, + "learning_rate": 1.9798746358550656e-05, + "loss": 0.8833, + "step": 1119 + }, + { + "epoch": 0.09216210656243572, + "grad_norm": 8.839295550642253, + "learning_rate": 1.9798213979059412e-05, + "loss": 0.8553, + "step": 1120 + }, + { + "epoch": 0.09224439415758075, + "grad_norm": 3.7706882959014205, + "learning_rate": 1.979768090351872e-05, + "loss": 0.8564, + "step": 1121 + }, + { + "epoch": 0.09232668175272578, + "grad_norm": 4.312690219016083, + "learning_rate": 1.9797147131966445e-05, + "loss": 0.8605, + "step": 1122 + }, + { + "epoch": 0.09240896934787081, + "grad_norm": 6.342821693734463, + "learning_rate": 1.9796612664440503e-05, + "loss": 0.8863, + "step": 1123 + }, + { + "epoch": 0.09249125694301584, + "grad_norm": 3.480039566309057, + "learning_rate": 1.979607750097887e-05, + "loss": 0.8676, + "step": 1124 + }, + { + "epoch": 0.09257354453816087, + "grad_norm": 0.5209974485249531, + "learning_rate": 1.9795541641619552e-05, + "loss": 0.6128, + "step": 1125 + }, + { + "epoch": 0.0926558321333059, + "grad_norm": 3.0644541451290106, + "learning_rate": 1.9795005086400623e-05, + "loss": 0.8596, + "step": 1126 + }, + { + "epoch": 0.09273811972845093, + "grad_norm": 4.0339545836639585, + "learning_rate": 1.9794467835360198e-05, + "loss": 0.8956, + "step": 1127 + }, + { + "epoch": 0.09282040732359596, + "grad_norm": 3.606396064787203, + "learning_rate": 1.9793929888536443e-05, + "loss": 0.8446, + "step": 1128 + }, + { + "epoch": 0.092902694918741, + "grad_norm": 3.266963278351553, + "learning_rate": 1.979339124596757e-05, + "loss": 0.8804, + "step": 1129 + }, + { + "epoch": 0.09298498251388602, + "grad_norm": 4.171351560316691, + "learning_rate": 1.9792851907691847e-05, + "loss": 0.8764, + "step": 1130 + }, + { + "epoch": 0.09306727010903107, + "grad_norm": 3.1333885189366066, + "learning_rate": 1.9792311873747584e-05, + "loss": 0.8882, + "step": 1131 + }, + { + "epoch": 0.0931495577041761, + "grad_norm": 4.115748009743592, + "learning_rate": 1.9791771144173146e-05, + "loss": 0.8693, + "step": 1132 + }, + { + "epoch": 0.09323184529932113, + "grad_norm": 4.248749716560056, + "learning_rate": 1.9791229719006947e-05, + "loss": 0.866, + "step": 1133 + }, + { + "epoch": 0.09331413289446616, + "grad_norm": 0.5602770220421947, + "learning_rate": 1.979068759828745e-05, + "loss": 0.5729, + "step": 1134 + }, + { + "epoch": 0.09339642048961119, + "grad_norm": 3.208526975104471, + "learning_rate": 1.979014478205316e-05, + "loss": 0.8447, + "step": 1135 + }, + { + "epoch": 0.09347870808475622, + "grad_norm": 3.837179354794119, + "learning_rate": 1.978960127034264e-05, + "loss": 0.8395, + "step": 1136 + }, + { + "epoch": 0.09356099567990125, + "grad_norm": 4.22608442690413, + "learning_rate": 1.9789057063194505e-05, + "loss": 0.8345, + "step": 1137 + }, + { + "epoch": 0.09364328327504629, + "grad_norm": 4.512917248957414, + "learning_rate": 1.978851216064741e-05, + "loss": 0.8755, + "step": 1138 + }, + { + "epoch": 0.09372557087019132, + "grad_norm": 4.485181370046995, + "learning_rate": 1.978796656274007e-05, + "loss": 0.9001, + "step": 1139 + }, + { + "epoch": 0.09380785846533635, + "grad_norm": 4.311526149543538, + "learning_rate": 1.978742026951123e-05, + "loss": 0.8147, + "step": 1140 + }, + { + "epoch": 0.09389014606048138, + "grad_norm": 3.400869370992463, + "learning_rate": 1.9786873280999716e-05, + "loss": 0.8458, + "step": 1141 + }, + { + "epoch": 0.09397243365562641, + "grad_norm": 3.484007931145798, + "learning_rate": 1.978632559724437e-05, + "loss": 0.8396, + "step": 1142 + }, + { + "epoch": 0.09405472125077145, + "grad_norm": 5.974225023368629, + "learning_rate": 1.9785777218284107e-05, + "loss": 0.8544, + "step": 1143 + }, + { + "epoch": 0.09413700884591648, + "grad_norm": 4.758176933846711, + "learning_rate": 1.978522814415788e-05, + "loss": 0.8738, + "step": 1144 + }, + { + "epoch": 0.09421929644106151, + "grad_norm": 4.054376339470337, + "learning_rate": 1.9784678374904694e-05, + "loss": 0.8647, + "step": 1145 + }, + { + "epoch": 0.09430158403620655, + "grad_norm": 3.254256033254886, + "learning_rate": 1.9784127910563606e-05, + "loss": 0.8353, + "step": 1146 + }, + { + "epoch": 0.09438387163135158, + "grad_norm": 0.5816738083728531, + "learning_rate": 1.978357675117372e-05, + "loss": 0.5812, + "step": 1147 + }, + { + "epoch": 0.09446615922649661, + "grad_norm": 0.49793035339456754, + "learning_rate": 1.9783024896774187e-05, + "loss": 0.5791, + "step": 1148 + }, + { + "epoch": 0.09454844682164164, + "grad_norm": 4.179537892792988, + "learning_rate": 1.9782472347404206e-05, + "loss": 0.8907, + "step": 1149 + }, + { + "epoch": 0.09463073441678667, + "grad_norm": 4.067029184300302, + "learning_rate": 1.978191910310304e-05, + "loss": 0.8541, + "step": 1150 + }, + { + "epoch": 0.0947130220119317, + "grad_norm": 4.248345665782451, + "learning_rate": 1.9781365163909984e-05, + "loss": 0.8632, + "step": 1151 + }, + { + "epoch": 0.09479530960707673, + "grad_norm": 6.439138971096778, + "learning_rate": 1.978081052986439e-05, + "loss": 0.8629, + "step": 1152 + }, + { + "epoch": 0.09487759720222176, + "grad_norm": 6.71298685938902, + "learning_rate": 1.9780255201005656e-05, + "loss": 0.8549, + "step": 1153 + }, + { + "epoch": 0.09495988479736679, + "grad_norm": 3.967437431624442, + "learning_rate": 1.9779699177373236e-05, + "loss": 0.8732, + "step": 1154 + }, + { + "epoch": 0.09504217239251182, + "grad_norm": 0.8392360999561069, + "learning_rate": 1.9779142459006626e-05, + "loss": 0.5872, + "step": 1155 + }, + { + "epoch": 0.09512445998765687, + "grad_norm": 4.657178845971167, + "learning_rate": 1.9778585045945374e-05, + "loss": 0.8495, + "step": 1156 + }, + { + "epoch": 0.0952067475828019, + "grad_norm": 4.123727952348605, + "learning_rate": 1.977802693822908e-05, + "loss": 0.9142, + "step": 1157 + }, + { + "epoch": 0.09528903517794693, + "grad_norm": 0.5860758553236142, + "learning_rate": 1.9777468135897387e-05, + "loss": 0.5549, + "step": 1158 + }, + { + "epoch": 0.09537132277309196, + "grad_norm": 0.5401053295003246, + "learning_rate": 1.9776908638989996e-05, + "loss": 0.5801, + "step": 1159 + }, + { + "epoch": 0.09545361036823699, + "grad_norm": 0.5496816005625466, + "learning_rate": 1.9776348447546653e-05, + "loss": 0.5839, + "step": 1160 + }, + { + "epoch": 0.09553589796338202, + "grad_norm": 6.020685438337091, + "learning_rate": 1.977578756160715e-05, + "loss": 0.866, + "step": 1161 + }, + { + "epoch": 0.09561818555852705, + "grad_norm": 2.792057637957128, + "learning_rate": 1.9775225981211333e-05, + "loss": 0.8638, + "step": 1162 + }, + { + "epoch": 0.09570047315367208, + "grad_norm": 0.5553177375677683, + "learning_rate": 1.9774663706399092e-05, + "loss": 0.5612, + "step": 1163 + }, + { + "epoch": 0.09578276074881711, + "grad_norm": 5.245834669495098, + "learning_rate": 1.9774100737210376e-05, + "loss": 0.8688, + "step": 1164 + }, + { + "epoch": 0.09586504834396214, + "grad_norm": 3.5768926302294344, + "learning_rate": 1.977353707368518e-05, + "loss": 0.897, + "step": 1165 + }, + { + "epoch": 0.09594733593910718, + "grad_norm": 3.381007087662086, + "learning_rate": 1.9772972715863534e-05, + "loss": 0.8956, + "step": 1166 + }, + { + "epoch": 0.0960296235342522, + "grad_norm": 4.24711216964703, + "learning_rate": 1.9772407663785538e-05, + "loss": 0.8546, + "step": 1167 + }, + { + "epoch": 0.09611191112939724, + "grad_norm": 0.5978826180005935, + "learning_rate": 1.977184191749133e-05, + "loss": 0.5658, + "step": 1168 + }, + { + "epoch": 0.09619419872454228, + "grad_norm": 5.6864731543708285, + "learning_rate": 1.9771275477021102e-05, + "loss": 0.8573, + "step": 1169 + }, + { + "epoch": 0.09627648631968731, + "grad_norm": 0.5306016735606011, + "learning_rate": 1.9770708342415087e-05, + "loss": 0.5443, + "step": 1170 + }, + { + "epoch": 0.09635877391483234, + "grad_norm": 3.4108513712835733, + "learning_rate": 1.9770140513713582e-05, + "loss": 0.9162, + "step": 1171 + }, + { + "epoch": 0.09644106150997737, + "grad_norm": 3.0240876250486775, + "learning_rate": 1.976957199095692e-05, + "loss": 0.8959, + "step": 1172 + }, + { + "epoch": 0.0965233491051224, + "grad_norm": 4.329264160111276, + "learning_rate": 1.9769002774185483e-05, + "loss": 0.8581, + "step": 1173 + }, + { + "epoch": 0.09660563670026744, + "grad_norm": 2.8538371301611045, + "learning_rate": 1.9768432863439714e-05, + "loss": 0.8472, + "step": 1174 + }, + { + "epoch": 0.09668792429541247, + "grad_norm": 4.192529144078922, + "learning_rate": 1.97678622587601e-05, + "loss": 0.8697, + "step": 1175 + }, + { + "epoch": 0.0967702118905575, + "grad_norm": 3.729038589656874, + "learning_rate": 1.976729096018717e-05, + "loss": 0.8319, + "step": 1176 + }, + { + "epoch": 0.09685249948570253, + "grad_norm": 0.6437788103093597, + "learning_rate": 1.976671896776151e-05, + "loss": 0.5736, + "step": 1177 + }, + { + "epoch": 0.09693478708084756, + "grad_norm": 3.9035454070115017, + "learning_rate": 1.9766146281523753e-05, + "loss": 0.8874, + "step": 1178 + }, + { + "epoch": 0.09701707467599259, + "grad_norm": 3.819713897204886, + "learning_rate": 1.9765572901514583e-05, + "loss": 0.8422, + "step": 1179 + }, + { + "epoch": 0.09709936227113762, + "grad_norm": 5.277006488684462, + "learning_rate": 1.9764998827774734e-05, + "loss": 0.8849, + "step": 1180 + }, + { + "epoch": 0.09718164986628265, + "grad_norm": 5.189466257849834, + "learning_rate": 1.9764424060344988e-05, + "loss": 0.8612, + "step": 1181 + }, + { + "epoch": 0.0972639374614277, + "grad_norm": 3.4415909778873743, + "learning_rate": 1.9763848599266168e-05, + "loss": 0.8649, + "step": 1182 + }, + { + "epoch": 0.09734622505657273, + "grad_norm": 3.5762421871051, + "learning_rate": 1.976327244457916e-05, + "loss": 0.8643, + "step": 1183 + }, + { + "epoch": 0.09742851265171776, + "grad_norm": 2.9475630534612116, + "learning_rate": 1.976269559632489e-05, + "loss": 0.8756, + "step": 1184 + }, + { + "epoch": 0.09751080024686279, + "grad_norm": 2.865959286407617, + "learning_rate": 1.976211805454434e-05, + "loss": 0.8317, + "step": 1185 + }, + { + "epoch": 0.09759308784200782, + "grad_norm": 0.5278838170529865, + "learning_rate": 1.976153981927853e-05, + "loss": 0.5707, + "step": 1186 + }, + { + "epoch": 0.09767537543715285, + "grad_norm": 0.5151202226322995, + "learning_rate": 1.976096089056855e-05, + "loss": 0.5589, + "step": 1187 + }, + { + "epoch": 0.09775766303229788, + "grad_norm": 5.474549135950859, + "learning_rate": 1.9760381268455515e-05, + "loss": 0.8707, + "step": 1188 + }, + { + "epoch": 0.09783995062744291, + "grad_norm": 2.886942130305931, + "learning_rate": 1.9759800952980604e-05, + "loss": 0.8764, + "step": 1189 + }, + { + "epoch": 0.09792223822258794, + "grad_norm": 3.5448856849038015, + "learning_rate": 1.9759219944185045e-05, + "loss": 0.8546, + "step": 1190 + }, + { + "epoch": 0.09800452581773297, + "grad_norm": 2.3163053463145022, + "learning_rate": 1.9758638242110105e-05, + "loss": 0.827, + "step": 1191 + }, + { + "epoch": 0.098086813412878, + "grad_norm": 3.2678753876711903, + "learning_rate": 1.9758055846797113e-05, + "loss": 0.8456, + "step": 1192 + }, + { + "epoch": 0.09816910100802304, + "grad_norm": 4.046087494412628, + "learning_rate": 1.9757472758287437e-05, + "loss": 0.8565, + "step": 1193 + }, + { + "epoch": 0.09825138860316807, + "grad_norm": 5.312871548189173, + "learning_rate": 1.9756888976622504e-05, + "loss": 0.8316, + "step": 1194 + }, + { + "epoch": 0.09833367619831311, + "grad_norm": 3.5965506794172035, + "learning_rate": 1.9756304501843782e-05, + "loss": 0.8479, + "step": 1195 + }, + { + "epoch": 0.09841596379345814, + "grad_norm": 4.869038156703397, + "learning_rate": 1.975571933399279e-05, + "loss": 0.8957, + "step": 1196 + }, + { + "epoch": 0.09849825138860317, + "grad_norm": 5.073504198475643, + "learning_rate": 1.9755133473111097e-05, + "loss": 0.8748, + "step": 1197 + }, + { + "epoch": 0.0985805389837482, + "grad_norm": 4.129896753535656, + "learning_rate": 1.9754546919240325e-05, + "loss": 0.8624, + "step": 1198 + }, + { + "epoch": 0.09866282657889323, + "grad_norm": 0.75499109894716, + "learning_rate": 1.975395967242214e-05, + "loss": 0.5753, + "step": 1199 + }, + { + "epoch": 0.09874511417403826, + "grad_norm": 4.926214741317277, + "learning_rate": 1.9753371732698255e-05, + "loss": 0.8514, + "step": 1200 + }, + { + "epoch": 0.0988274017691833, + "grad_norm": 4.113995566064139, + "learning_rate": 1.9752783100110443e-05, + "loss": 0.8735, + "step": 1201 + }, + { + "epoch": 0.09890968936432833, + "grad_norm": 0.5883860438611207, + "learning_rate": 1.975219377470052e-05, + "loss": 0.6035, + "step": 1202 + }, + { + "epoch": 0.09899197695947336, + "grad_norm": 3.3466076308514863, + "learning_rate": 1.9751603756510344e-05, + "loss": 0.8769, + "step": 1203 + }, + { + "epoch": 0.09907426455461839, + "grad_norm": 0.47595350765066086, + "learning_rate": 1.9751013045581835e-05, + "loss": 0.5663, + "step": 1204 + }, + { + "epoch": 0.09915655214976342, + "grad_norm": 3.4049170080353615, + "learning_rate": 1.975042164195695e-05, + "loss": 0.8363, + "step": 1205 + }, + { + "epoch": 0.09923883974490845, + "grad_norm": 3.7661200169302327, + "learning_rate": 1.974982954567771e-05, + "loss": 0.8437, + "step": 1206 + }, + { + "epoch": 0.09932112734005348, + "grad_norm": 3.6094210284619286, + "learning_rate": 1.9749236756786167e-05, + "loss": 0.861, + "step": 1207 + }, + { + "epoch": 0.09940341493519853, + "grad_norm": 3.145969814243711, + "learning_rate": 1.9748643275324438e-05, + "loss": 0.8454, + "step": 1208 + }, + { + "epoch": 0.09948570253034356, + "grad_norm": 3.6067880218861568, + "learning_rate": 1.9748049101334684e-05, + "loss": 0.8682, + "step": 1209 + }, + { + "epoch": 0.09956799012548859, + "grad_norm": 3.0185050449291984, + "learning_rate": 1.974745423485911e-05, + "loss": 0.8708, + "step": 1210 + }, + { + "epoch": 0.09965027772063362, + "grad_norm": 3.128449103884966, + "learning_rate": 1.9746858675939974e-05, + "loss": 0.8594, + "step": 1211 + }, + { + "epoch": 0.09973256531577865, + "grad_norm": 0.6028578588325906, + "learning_rate": 1.9746262424619585e-05, + "loss": 0.6006, + "step": 1212 + }, + { + "epoch": 0.09981485291092368, + "grad_norm": 0.5378805528352323, + "learning_rate": 1.9745665480940304e-05, + "loss": 0.5702, + "step": 1213 + }, + { + "epoch": 0.09989714050606871, + "grad_norm": 2.9709104250769025, + "learning_rate": 1.974506784494453e-05, + "loss": 0.8769, + "step": 1214 + }, + { + "epoch": 0.09997942810121374, + "grad_norm": 3.5710834059738983, + "learning_rate": 1.974446951667472e-05, + "loss": 0.8524, + "step": 1215 + }, + { + "epoch": 0.10006171569635877, + "grad_norm": 3.564453597862319, + "learning_rate": 1.9743870496173385e-05, + "loss": 0.8602, + "step": 1216 + }, + { + "epoch": 0.1001440032915038, + "grad_norm": 3.7485777754801415, + "learning_rate": 1.974327078348307e-05, + "loss": 0.8478, + "step": 1217 + }, + { + "epoch": 0.10022629088664883, + "grad_norm": 0.6391149383767559, + "learning_rate": 1.974267037864638e-05, + "loss": 0.5585, + "step": 1218 + }, + { + "epoch": 0.10030857848179386, + "grad_norm": 3.9853421053234044, + "learning_rate": 1.9742069281705967e-05, + "loss": 0.8742, + "step": 1219 + }, + { + "epoch": 0.1003908660769389, + "grad_norm": 7.216394178355804, + "learning_rate": 1.974146749270453e-05, + "loss": 0.8459, + "step": 1220 + }, + { + "epoch": 0.10047315367208394, + "grad_norm": 2.582703369923991, + "learning_rate": 1.9740865011684827e-05, + "loss": 0.8772, + "step": 1221 + }, + { + "epoch": 0.10055544126722897, + "grad_norm": 4.096893921176322, + "learning_rate": 1.974026183868965e-05, + "loss": 0.8564, + "step": 1222 + }, + { + "epoch": 0.100637728862374, + "grad_norm": 3.625029367682308, + "learning_rate": 1.973965797376185e-05, + "loss": 0.8505, + "step": 1223 + }, + { + "epoch": 0.10072001645751903, + "grad_norm": 3.42182935905832, + "learning_rate": 1.973905341694432e-05, + "loss": 0.8314, + "step": 1224 + }, + { + "epoch": 0.10080230405266406, + "grad_norm": 2.8684151430131664, + "learning_rate": 1.9738448168280014e-05, + "loss": 0.8524, + "step": 1225 + }, + { + "epoch": 0.1008845916478091, + "grad_norm": 4.2068547384992545, + "learning_rate": 1.9737842227811924e-05, + "loss": 0.8525, + "step": 1226 + }, + { + "epoch": 0.10096687924295412, + "grad_norm": 3.637604906458846, + "learning_rate": 1.9737235595583093e-05, + "loss": 0.8927, + "step": 1227 + }, + { + "epoch": 0.10104916683809916, + "grad_norm": 3.986554301688107, + "learning_rate": 1.973662827163662e-05, + "loss": 0.9003, + "step": 1228 + }, + { + "epoch": 0.10113145443324419, + "grad_norm": 0.6119674016964393, + "learning_rate": 1.9736020256015647e-05, + "loss": 0.5653, + "step": 1229 + }, + { + "epoch": 0.10121374202838922, + "grad_norm": 3.947009339846442, + "learning_rate": 1.9735411548763364e-05, + "loss": 0.8614, + "step": 1230 + }, + { + "epoch": 0.10129602962353425, + "grad_norm": 3.8850893245972666, + "learning_rate": 1.9734802149923014e-05, + "loss": 0.8663, + "step": 1231 + }, + { + "epoch": 0.10137831721867928, + "grad_norm": 3.260028438383931, + "learning_rate": 1.9734192059537888e-05, + "loss": 0.864, + "step": 1232 + }, + { + "epoch": 0.10146060481382431, + "grad_norm": 3.3728492367289795, + "learning_rate": 1.9733581277651327e-05, + "loss": 0.8524, + "step": 1233 + }, + { + "epoch": 0.10154289240896935, + "grad_norm": 3.2625677444712946, + "learning_rate": 1.9732969804306716e-05, + "loss": 0.8299, + "step": 1234 + }, + { + "epoch": 0.10162518000411438, + "grad_norm": 0.5270258088317135, + "learning_rate": 1.9732357639547497e-05, + "loss": 0.5695, + "step": 1235 + }, + { + "epoch": 0.10170746759925942, + "grad_norm": 4.034862594266343, + "learning_rate": 1.9731744783417154e-05, + "loss": 0.9067, + "step": 1236 + }, + { + "epoch": 0.10178975519440445, + "grad_norm": 3.368163010498083, + "learning_rate": 1.9731131235959228e-05, + "loss": 0.8785, + "step": 1237 + }, + { + "epoch": 0.10187204278954948, + "grad_norm": 4.268507894834593, + "learning_rate": 1.97305169972173e-05, + "loss": 0.8497, + "step": 1238 + }, + { + "epoch": 0.10195433038469451, + "grad_norm": 4.262009151943327, + "learning_rate": 1.9729902067235006e-05, + "loss": 0.8528, + "step": 1239 + }, + { + "epoch": 0.10203661797983954, + "grad_norm": 3.7072453125521734, + "learning_rate": 1.9729286446056033e-05, + "loss": 0.837, + "step": 1240 + }, + { + "epoch": 0.10211890557498457, + "grad_norm": 0.5042716296341209, + "learning_rate": 1.9728670133724108e-05, + "loss": 0.5718, + "step": 1241 + }, + { + "epoch": 0.1022011931701296, + "grad_norm": 3.5004783261501466, + "learning_rate": 1.9728053130283015e-05, + "loss": 0.8695, + "step": 1242 + }, + { + "epoch": 0.10228348076527463, + "grad_norm": 3.269137481777619, + "learning_rate": 1.9727435435776584e-05, + "loss": 0.8456, + "step": 1243 + }, + { + "epoch": 0.10236576836041966, + "grad_norm": 4.183726994796829, + "learning_rate": 1.97268170502487e-05, + "loss": 0.8246, + "step": 1244 + }, + { + "epoch": 0.10244805595556469, + "grad_norm": 3.447500278075762, + "learning_rate": 1.9726197973743285e-05, + "loss": 0.8538, + "step": 1245 + }, + { + "epoch": 0.10253034355070972, + "grad_norm": 6.2832374035907606, + "learning_rate": 1.9725578206304323e-05, + "loss": 0.8363, + "step": 1246 + }, + { + "epoch": 0.10261263114585477, + "grad_norm": 3.3223666951374327, + "learning_rate": 1.972495774797584e-05, + "loss": 0.8068, + "step": 1247 + }, + { + "epoch": 0.1026949187409998, + "grad_norm": 4.527729681936454, + "learning_rate": 1.972433659880191e-05, + "loss": 0.8515, + "step": 1248 + }, + { + "epoch": 0.10277720633614483, + "grad_norm": 3.219592992240681, + "learning_rate": 1.9723714758826657e-05, + "loss": 0.8491, + "step": 1249 + }, + { + "epoch": 0.10285949393128986, + "grad_norm": 14.881817686003856, + "learning_rate": 1.9723092228094262e-05, + "loss": 0.8535, + "step": 1250 + }, + { + "epoch": 0.10294178152643489, + "grad_norm": 7.751162451175856, + "learning_rate": 1.9722469006648946e-05, + "loss": 0.8366, + "step": 1251 + }, + { + "epoch": 0.10302406912157992, + "grad_norm": 3.6264058447910785, + "learning_rate": 1.9721845094534977e-05, + "loss": 0.8544, + "step": 1252 + }, + { + "epoch": 0.10310635671672495, + "grad_norm": 3.422457414693753, + "learning_rate": 1.9721220491796682e-05, + "loss": 0.8615, + "step": 1253 + }, + { + "epoch": 0.10318864431186998, + "grad_norm": 6.929960642374395, + "learning_rate": 1.972059519847843e-05, + "loss": 0.8971, + "step": 1254 + }, + { + "epoch": 0.10327093190701502, + "grad_norm": 3.4436829246073937, + "learning_rate": 1.971996921462464e-05, + "loss": 0.859, + "step": 1255 + }, + { + "epoch": 0.10335321950216005, + "grad_norm": 0.6287334412236155, + "learning_rate": 1.9719342540279783e-05, + "loss": 0.5832, + "step": 1256 + }, + { + "epoch": 0.10343550709730508, + "grad_norm": 4.455529227081377, + "learning_rate": 1.9718715175488373e-05, + "loss": 0.8551, + "step": 1257 + }, + { + "epoch": 0.10351779469245011, + "grad_norm": 0.49625825545453955, + "learning_rate": 1.9718087120294983e-05, + "loss": 0.5907, + "step": 1258 + }, + { + "epoch": 0.10360008228759514, + "grad_norm": 7.610855562933589, + "learning_rate": 1.9717458374744226e-05, + "loss": 0.8518, + "step": 1259 + }, + { + "epoch": 0.10368236988274018, + "grad_norm": 6.814266905432093, + "learning_rate": 1.9716828938880766e-05, + "loss": 0.892, + "step": 1260 + }, + { + "epoch": 0.10376465747788521, + "grad_norm": 5.503964342264624, + "learning_rate": 1.9716198812749316e-05, + "loss": 0.8575, + "step": 1261 + }, + { + "epoch": 0.10384694507303024, + "grad_norm": 6.63227200743735, + "learning_rate": 1.9715567996394642e-05, + "loss": 0.899, + "step": 1262 + }, + { + "epoch": 0.10392923266817528, + "grad_norm": 5.730417088676314, + "learning_rate": 1.9714936489861557e-05, + "loss": 0.8747, + "step": 1263 + }, + { + "epoch": 0.1040115202633203, + "grad_norm": 0.48714044685236985, + "learning_rate": 1.9714304293194918e-05, + "loss": 0.5698, + "step": 1264 + }, + { + "epoch": 0.10409380785846534, + "grad_norm": 4.811813633190729, + "learning_rate": 1.971367140643964e-05, + "loss": 0.8528, + "step": 1265 + }, + { + "epoch": 0.10417609545361037, + "grad_norm": 0.45476138424455886, + "learning_rate": 1.971303782964068e-05, + "loss": 0.5733, + "step": 1266 + }, + { + "epoch": 0.1042583830487554, + "grad_norm": 4.144890858016052, + "learning_rate": 1.9712403562843045e-05, + "loss": 0.8308, + "step": 1267 + }, + { + "epoch": 0.10434067064390043, + "grad_norm": 5.275387836703206, + "learning_rate": 1.9711768606091795e-05, + "loss": 0.8931, + "step": 1268 + }, + { + "epoch": 0.10442295823904546, + "grad_norm": 5.053640055345281, + "learning_rate": 1.9711132959432033e-05, + "loss": 0.84, + "step": 1269 + }, + { + "epoch": 0.10450524583419049, + "grad_norm": 19.786582333651765, + "learning_rate": 1.9710496622908917e-05, + "loss": 0.8148, + "step": 1270 + }, + { + "epoch": 0.10458753342933552, + "grad_norm": 3.9891501660738253, + "learning_rate": 1.970985959656765e-05, + "loss": 0.8575, + "step": 1271 + }, + { + "epoch": 0.10466982102448055, + "grad_norm": 4.510634946553714, + "learning_rate": 1.9709221880453488e-05, + "loss": 0.865, + "step": 1272 + }, + { + "epoch": 0.1047521086196256, + "grad_norm": 3.734578158484028, + "learning_rate": 1.970858347461173e-05, + "loss": 0.8837, + "step": 1273 + }, + { + "epoch": 0.10483439621477063, + "grad_norm": 5.721110074673601, + "learning_rate": 1.9707944379087727e-05, + "loss": 0.8538, + "step": 1274 + }, + { + "epoch": 0.10491668380991566, + "grad_norm": 4.410682194182307, + "learning_rate": 1.9707304593926883e-05, + "loss": 0.8515, + "step": 1275 + }, + { + "epoch": 0.10499897140506069, + "grad_norm": 6.312032763782244, + "learning_rate": 1.9706664119174643e-05, + "loss": 0.8473, + "step": 1276 + }, + { + "epoch": 0.10508125900020572, + "grad_norm": 4.691303195768097, + "learning_rate": 1.970602295487651e-05, + "loss": 0.8725, + "step": 1277 + }, + { + "epoch": 0.10516354659535075, + "grad_norm": 4.316070261641844, + "learning_rate": 1.9705381101078028e-05, + "loss": 0.821, + "step": 1278 + }, + { + "epoch": 0.10524583419049578, + "grad_norm": 4.997740394972133, + "learning_rate": 1.9704738557824795e-05, + "loss": 0.8647, + "step": 1279 + }, + { + "epoch": 0.10532812178564081, + "grad_norm": 9.863962397731285, + "learning_rate": 1.970409532516245e-05, + "loss": 0.8627, + "step": 1280 + }, + { + "epoch": 0.10541040938078584, + "grad_norm": 4.11123640524789, + "learning_rate": 1.9703451403136696e-05, + "loss": 0.8407, + "step": 1281 + }, + { + "epoch": 0.10549269697593087, + "grad_norm": 0.5916809195753411, + "learning_rate": 1.9702806791793277e-05, + "loss": 0.5848, + "step": 1282 + }, + { + "epoch": 0.1055749845710759, + "grad_norm": 6.69582821146116, + "learning_rate": 1.9702161491177976e-05, + "loss": 0.8853, + "step": 1283 + }, + { + "epoch": 0.10565727216622094, + "grad_norm": 0.4953187105131954, + "learning_rate": 1.9701515501336642e-05, + "loss": 0.5822, + "step": 1284 + }, + { + "epoch": 0.10573955976136598, + "grad_norm": 8.936946054345633, + "learning_rate": 1.970086882231516e-05, + "loss": 0.851, + "step": 1285 + }, + { + "epoch": 0.10582184735651101, + "grad_norm": 6.8488454890517705, + "learning_rate": 1.970022145415947e-05, + "loss": 0.8961, + "step": 1286 + }, + { + "epoch": 0.10590413495165604, + "grad_norm": 4.237563628391966, + "learning_rate": 1.9699573396915563e-05, + "loss": 0.8378, + "step": 1287 + }, + { + "epoch": 0.10598642254680107, + "grad_norm": 51.42004889155801, + "learning_rate": 1.969892465062947e-05, + "loss": 0.8389, + "step": 1288 + }, + { + "epoch": 0.1060687101419461, + "grad_norm": 6.786819850265654, + "learning_rate": 1.9698275215347287e-05, + "loss": 0.887, + "step": 1289 + }, + { + "epoch": 0.10615099773709114, + "grad_norm": 22.715342269516267, + "learning_rate": 1.969762509111514e-05, + "loss": 0.8792, + "step": 1290 + }, + { + "epoch": 0.10623328533223617, + "grad_norm": 4.6055650003906194, + "learning_rate": 1.969697427797922e-05, + "loss": 0.8886, + "step": 1291 + }, + { + "epoch": 0.1063155729273812, + "grad_norm": 0.6622955664034255, + "learning_rate": 1.9696322775985748e-05, + "loss": 0.5781, + "step": 1292 + }, + { + "epoch": 0.10639786052252623, + "grad_norm": 0.5456666190829798, + "learning_rate": 1.9695670585181016e-05, + "loss": 0.5594, + "step": 1293 + }, + { + "epoch": 0.10648014811767126, + "grad_norm": 0.5361303752940896, + "learning_rate": 1.969501770561135e-05, + "loss": 0.6009, + "step": 1294 + }, + { + "epoch": 0.10656243571281629, + "grad_norm": 7.635441041173641, + "learning_rate": 1.9694364137323133e-05, + "loss": 0.8371, + "step": 1295 + }, + { + "epoch": 0.10664472330796132, + "grad_norm": 12.228827021078185, + "learning_rate": 1.969370988036279e-05, + "loss": 0.8745, + "step": 1296 + }, + { + "epoch": 0.10672701090310635, + "grad_norm": 0.6908564482765909, + "learning_rate": 1.9693054934776803e-05, + "loss": 0.5781, + "step": 1297 + }, + { + "epoch": 0.1068092984982514, + "grad_norm": 4.800629808465259, + "learning_rate": 1.9692399300611693e-05, + "loss": 0.8426, + "step": 1298 + }, + { + "epoch": 0.10689158609339643, + "grad_norm": 6.024553599030264, + "learning_rate": 1.969174297791404e-05, + "loss": 0.8792, + "step": 1299 + }, + { + "epoch": 0.10697387368854146, + "grad_norm": 11.239833398928637, + "learning_rate": 1.969108596673046e-05, + "loss": 0.8752, + "step": 1300 + }, + { + "epoch": 0.10705616128368649, + "grad_norm": 6.631855999332642, + "learning_rate": 1.9690428267107636e-05, + "loss": 0.864, + "step": 1301 + }, + { + "epoch": 0.10713844887883152, + "grad_norm": 8.509042674536547, + "learning_rate": 1.9689769879092285e-05, + "loss": 0.8539, + "step": 1302 + }, + { + "epoch": 0.10722073647397655, + "grad_norm": 6.153625548192156, + "learning_rate": 1.9689110802731174e-05, + "loss": 0.8872, + "step": 1303 + }, + { + "epoch": 0.10730302406912158, + "grad_norm": 0.7071137613706345, + "learning_rate": 1.968845103807113e-05, + "loss": 0.6264, + "step": 1304 + }, + { + "epoch": 0.10738531166426661, + "grad_norm": 4.709892779228607, + "learning_rate": 1.968779058515902e-05, + "loss": 0.8795, + "step": 1305 + }, + { + "epoch": 0.10746759925941164, + "grad_norm": 18.75805597847832, + "learning_rate": 1.968712944404176e-05, + "loss": 0.8674, + "step": 1306 + }, + { + "epoch": 0.10754988685455667, + "grad_norm": 0.5225280033189051, + "learning_rate": 1.9686467614766317e-05, + "loss": 0.576, + "step": 1307 + }, + { + "epoch": 0.1076321744497017, + "grad_norm": 11.926670592194546, + "learning_rate": 1.9685805097379706e-05, + "loss": 0.8787, + "step": 1308 + }, + { + "epoch": 0.10771446204484673, + "grad_norm": 7.4263466600964625, + "learning_rate": 1.9685141891928988e-05, + "loss": 0.8328, + "step": 1309 + }, + { + "epoch": 0.10779674963999177, + "grad_norm": 0.5357602955430172, + "learning_rate": 1.968447799846128e-05, + "loss": 0.5619, + "step": 1310 + }, + { + "epoch": 0.10787903723513681, + "grad_norm": 6.813630602811481, + "learning_rate": 1.9683813417023744e-05, + "loss": 0.8788, + "step": 1311 + }, + { + "epoch": 0.10796132483028184, + "grad_norm": 10.657671843850299, + "learning_rate": 1.968314814766359e-05, + "loss": 0.8245, + "step": 1312 + }, + { + "epoch": 0.10804361242542687, + "grad_norm": 6.152874752366795, + "learning_rate": 1.9682482190428078e-05, + "loss": 0.8491, + "step": 1313 + }, + { + "epoch": 0.1081259000205719, + "grad_norm": 8.128402041044515, + "learning_rate": 1.9681815545364514e-05, + "loss": 0.8548, + "step": 1314 + }, + { + "epoch": 0.10820818761571693, + "grad_norm": 9.164034587713218, + "learning_rate": 1.968114821252026e-05, + "loss": 0.8648, + "step": 1315 + }, + { + "epoch": 0.10829047521086196, + "grad_norm": 6.8988719562955785, + "learning_rate": 1.9680480191942715e-05, + "loss": 0.8515, + "step": 1316 + }, + { + "epoch": 0.108372762806007, + "grad_norm": 5.77279873046973, + "learning_rate": 1.9679811483679344e-05, + "loss": 0.8743, + "step": 1317 + }, + { + "epoch": 0.10845505040115203, + "grad_norm": 0.6678932721554799, + "learning_rate": 1.9679142087777646e-05, + "loss": 0.5631, + "step": 1318 + }, + { + "epoch": 0.10853733799629706, + "grad_norm": 5.142613068462937, + "learning_rate": 1.9678472004285168e-05, + "loss": 0.8364, + "step": 1319 + }, + { + "epoch": 0.10861962559144209, + "grad_norm": 4.36636440759262, + "learning_rate": 1.9677801233249522e-05, + "loss": 0.8776, + "step": 1320 + }, + { + "epoch": 0.10870191318658712, + "grad_norm": 0.515905767911221, + "learning_rate": 1.9677129774718354e-05, + "loss": 0.5542, + "step": 1321 + }, + { + "epoch": 0.10878420078173215, + "grad_norm": 0.5052244867766346, + "learning_rate": 1.967645762873936e-05, + "loss": 0.5827, + "step": 1322 + }, + { + "epoch": 0.10886648837687718, + "grad_norm": 5.083386834635681, + "learning_rate": 1.9675784795360294e-05, + "loss": 0.8736, + "step": 1323 + }, + { + "epoch": 0.10894877597202222, + "grad_norm": 5.440391953229143, + "learning_rate": 1.967511127462895e-05, + "loss": 0.8759, + "step": 1324 + }, + { + "epoch": 0.10903106356716726, + "grad_norm": 15.884511720887403, + "learning_rate": 1.9674437066593172e-05, + "loss": 0.8322, + "step": 1325 + }, + { + "epoch": 0.10911335116231229, + "grad_norm": 7.211730831323343, + "learning_rate": 1.9673762171300857e-05, + "loss": 0.8625, + "step": 1326 + }, + { + "epoch": 0.10919563875745732, + "grad_norm": 0.5669850108936473, + "learning_rate": 1.967308658879995e-05, + "loss": 0.5535, + "step": 1327 + }, + { + "epoch": 0.10927792635260235, + "grad_norm": 4.598126891030946, + "learning_rate": 1.9672410319138442e-05, + "loss": 0.8267, + "step": 1328 + }, + { + "epoch": 0.10936021394774738, + "grad_norm": 5.389091584276692, + "learning_rate": 1.967173336236437e-05, + "loss": 0.8526, + "step": 1329 + }, + { + "epoch": 0.10944250154289241, + "grad_norm": 5.396740224845529, + "learning_rate": 1.967105571852583e-05, + "loss": 0.8583, + "step": 1330 + }, + { + "epoch": 0.10952478913803744, + "grad_norm": 15.756743051056382, + "learning_rate": 1.9670377387670962e-05, + "loss": 0.8352, + "step": 1331 + }, + { + "epoch": 0.10960707673318247, + "grad_norm": 45.36041441825305, + "learning_rate": 1.966969836984794e-05, + "loss": 0.8664, + "step": 1332 + }, + { + "epoch": 0.1096893643283275, + "grad_norm": 0.6091498990566993, + "learning_rate": 1.9669018665105022e-05, + "loss": 0.5725, + "step": 1333 + }, + { + "epoch": 0.10977165192347253, + "grad_norm": 4.9099814047052766, + "learning_rate": 1.9668338273490476e-05, + "loss": 0.8601, + "step": 1334 + }, + { + "epoch": 0.10985393951861756, + "grad_norm": 5.695516010249542, + "learning_rate": 1.966765719505264e-05, + "loss": 0.8469, + "step": 1335 + }, + { + "epoch": 0.1099362271137626, + "grad_norm": 0.5018115432883228, + "learning_rate": 1.9666975429839898e-05, + "loss": 0.5708, + "step": 1336 + }, + { + "epoch": 0.11001851470890764, + "grad_norm": 5.03543554455548, + "learning_rate": 1.9666292977900683e-05, + "loss": 0.8538, + "step": 1337 + }, + { + "epoch": 0.11010080230405267, + "grad_norm": 6.0730970796747785, + "learning_rate": 1.966560983928347e-05, + "loss": 0.8436, + "step": 1338 + }, + { + "epoch": 0.1101830898991977, + "grad_norm": 0.5251518856290188, + "learning_rate": 1.96649260140368e-05, + "loss": 0.5561, + "step": 1339 + }, + { + "epoch": 0.11026537749434273, + "grad_norm": 4.080683188875858, + "learning_rate": 1.9664241502209235e-05, + "loss": 0.8623, + "step": 1340 + }, + { + "epoch": 0.11034766508948776, + "grad_norm": 5.393952148873809, + "learning_rate": 1.9663556303849413e-05, + "loss": 0.8716, + "step": 1341 + }, + { + "epoch": 0.11042995268463279, + "grad_norm": 5.475630634902594, + "learning_rate": 1.9662870419006005e-05, + "loss": 0.864, + "step": 1342 + }, + { + "epoch": 0.11051224027977782, + "grad_norm": 5.368155159963875, + "learning_rate": 1.9662183847727738e-05, + "loss": 0.85, + "step": 1343 + }, + { + "epoch": 0.11059452787492285, + "grad_norm": 5.333748381981491, + "learning_rate": 1.966149659006338e-05, + "loss": 0.9021, + "step": 1344 + }, + { + "epoch": 0.11067681547006789, + "grad_norm": 5.152529335147286, + "learning_rate": 1.9660808646061755e-05, + "loss": 0.8265, + "step": 1345 + }, + { + "epoch": 0.11075910306521292, + "grad_norm": 5.014489867671769, + "learning_rate": 1.9660120015771736e-05, + "loss": 0.8614, + "step": 1346 + }, + { + "epoch": 0.11084139066035795, + "grad_norm": 6.878459736348937, + "learning_rate": 1.965943069924224e-05, + "loss": 0.8413, + "step": 1347 + }, + { + "epoch": 0.11092367825550298, + "grad_norm": 1.309054235475711, + "learning_rate": 1.9658740696522235e-05, + "loss": 0.5864, + "step": 1348 + }, + { + "epoch": 0.11100596585064801, + "grad_norm": 4.10092938202275, + "learning_rate": 1.9658050007660736e-05, + "loss": 0.8697, + "step": 1349 + }, + { + "epoch": 0.11108825344579305, + "grad_norm": 3.718338112475671, + "learning_rate": 1.9657358632706812e-05, + "loss": 0.8629, + "step": 1350 + }, + { + "epoch": 0.11117054104093808, + "grad_norm": 5.648391310440999, + "learning_rate": 1.9656666571709575e-05, + "loss": 0.8538, + "step": 1351 + }, + { + "epoch": 0.11125282863608311, + "grad_norm": 6.739012308284066, + "learning_rate": 1.965597382471819e-05, + "loss": 0.8593, + "step": 1352 + }, + { + "epoch": 0.11133511623122815, + "grad_norm": 4.490203845995398, + "learning_rate": 1.9655280391781862e-05, + "loss": 0.884, + "step": 1353 + }, + { + "epoch": 0.11141740382637318, + "grad_norm": 7.535420440316182, + "learning_rate": 1.965458627294986e-05, + "loss": 0.8226, + "step": 1354 + }, + { + "epoch": 0.11149969142151821, + "grad_norm": 4.304043216970036, + "learning_rate": 1.965389146827149e-05, + "loss": 0.8435, + "step": 1355 + }, + { + "epoch": 0.11158197901666324, + "grad_norm": 0.46979177459935967, + "learning_rate": 1.9653195977796108e-05, + "loss": 0.546, + "step": 1356 + }, + { + "epoch": 0.11166426661180827, + "grad_norm": 3.244913645135368, + "learning_rate": 1.9652499801573122e-05, + "loss": 0.8431, + "step": 1357 + }, + { + "epoch": 0.1117465542069533, + "grad_norm": 4.113074523021319, + "learning_rate": 1.9651802939651988e-05, + "loss": 0.8569, + "step": 1358 + }, + { + "epoch": 0.11182884180209833, + "grad_norm": 3.8188152990212383, + "learning_rate": 1.9651105392082206e-05, + "loss": 0.8706, + "step": 1359 + }, + { + "epoch": 0.11191112939724336, + "grad_norm": 5.332042480420847, + "learning_rate": 1.9650407158913335e-05, + "loss": 0.8566, + "step": 1360 + }, + { + "epoch": 0.11199341699238839, + "grad_norm": 5.354305156748114, + "learning_rate": 1.964970824019497e-05, + "loss": 0.8499, + "step": 1361 + }, + { + "epoch": 0.11207570458753342, + "grad_norm": 5.591727704877664, + "learning_rate": 1.9649008635976765e-05, + "loss": 0.8842, + "step": 1362 + }, + { + "epoch": 0.11215799218267847, + "grad_norm": 0.5192326170528965, + "learning_rate": 1.964830834630842e-05, + "loss": 0.5876, + "step": 1363 + }, + { + "epoch": 0.1122402797778235, + "grad_norm": 0.4703501221850813, + "learning_rate": 1.9647607371239678e-05, + "loss": 0.5843, + "step": 1364 + }, + { + "epoch": 0.11232256737296853, + "grad_norm": 4.252531567672639, + "learning_rate": 1.964690571082034e-05, + "loss": 0.8273, + "step": 1365 + }, + { + "epoch": 0.11240485496811356, + "grad_norm": 5.191619286934386, + "learning_rate": 1.9646203365100243e-05, + "loss": 0.8585, + "step": 1366 + }, + { + "epoch": 0.11248714256325859, + "grad_norm": 4.322950576422303, + "learning_rate": 1.964550033412929e-05, + "loss": 0.8582, + "step": 1367 + }, + { + "epoch": 0.11256943015840362, + "grad_norm": 4.108989491540691, + "learning_rate": 1.9644796617957418e-05, + "loss": 0.8742, + "step": 1368 + }, + { + "epoch": 0.11265171775354865, + "grad_norm": 3.002075018468358, + "learning_rate": 1.9644092216634618e-05, + "loss": 0.8378, + "step": 1369 + }, + { + "epoch": 0.11273400534869368, + "grad_norm": 3.84643687208559, + "learning_rate": 1.9643387130210933e-05, + "loss": 0.8217, + "step": 1370 + }, + { + "epoch": 0.11281629294383871, + "grad_norm": 3.504634608794414, + "learning_rate": 1.9642681358736446e-05, + "loss": 0.8462, + "step": 1371 + }, + { + "epoch": 0.11289858053898374, + "grad_norm": 8.219632510725962, + "learning_rate": 1.9641974902261296e-05, + "loss": 0.8589, + "step": 1372 + }, + { + "epoch": 0.11298086813412878, + "grad_norm": 4.787889769332161, + "learning_rate": 1.964126776083567e-05, + "loss": 0.8478, + "step": 1373 + }, + { + "epoch": 0.1130631557292738, + "grad_norm": 3.431865259864041, + "learning_rate": 1.96405599345098e-05, + "loss": 0.8936, + "step": 1374 + }, + { + "epoch": 0.11314544332441884, + "grad_norm": 4.790101098374644, + "learning_rate": 1.9639851423333973e-05, + "loss": 0.8771, + "step": 1375 + }, + { + "epoch": 0.11322773091956388, + "grad_norm": 30.15921587978486, + "learning_rate": 1.9639142227358515e-05, + "loss": 0.8205, + "step": 1376 + }, + { + "epoch": 0.11331001851470891, + "grad_norm": 5.235814790178753, + "learning_rate": 1.9638432346633813e-05, + "loss": 0.8403, + "step": 1377 + }, + { + "epoch": 0.11339230610985394, + "grad_norm": 3.8559194687305203, + "learning_rate": 1.9637721781210285e-05, + "loss": 0.8873, + "step": 1378 + }, + { + "epoch": 0.11347459370499897, + "grad_norm": 4.562934487291937, + "learning_rate": 1.963701053113842e-05, + "loss": 0.8147, + "step": 1379 + }, + { + "epoch": 0.113556881300144, + "grad_norm": 4.762200299495142, + "learning_rate": 1.9636298596468734e-05, + "loss": 0.8363, + "step": 1380 + }, + { + "epoch": 0.11363916889528904, + "grad_norm": 3.958688391159092, + "learning_rate": 1.9635585977251813e-05, + "loss": 0.8677, + "step": 1381 + }, + { + "epoch": 0.11372145649043407, + "grad_norm": 5.206608356672264, + "learning_rate": 1.963487267353827e-05, + "loss": 0.8687, + "step": 1382 + }, + { + "epoch": 0.1138037440855791, + "grad_norm": 0.5547846036648372, + "learning_rate": 1.963415868537878e-05, + "loss": 0.5501, + "step": 1383 + }, + { + "epoch": 0.11388603168072413, + "grad_norm": 0.5226904808147554, + "learning_rate": 1.9633444012824066e-05, + "loss": 0.5864, + "step": 1384 + }, + { + "epoch": 0.11396831927586916, + "grad_norm": 7.417593071882848, + "learning_rate": 1.96327286559249e-05, + "loss": 0.8479, + "step": 1385 + }, + { + "epoch": 0.11405060687101419, + "grad_norm": 4.63099678124493, + "learning_rate": 1.963201261473209e-05, + "loss": 0.8265, + "step": 1386 + }, + { + "epoch": 0.11413289446615922, + "grad_norm": 4.481887681345341, + "learning_rate": 1.963129588929651e-05, + "loss": 0.8559, + "step": 1387 + }, + { + "epoch": 0.11421518206130425, + "grad_norm": 7.2760595228059, + "learning_rate": 1.963057847966907e-05, + "loss": 0.8443, + "step": 1388 + }, + { + "epoch": 0.1142974696564493, + "grad_norm": 4.414691898420469, + "learning_rate": 1.962986038590074e-05, + "loss": 0.8377, + "step": 1389 + }, + { + "epoch": 0.11437975725159433, + "grad_norm": 3.410257185708971, + "learning_rate": 1.9629141608042527e-05, + "loss": 0.8198, + "step": 1390 + }, + { + "epoch": 0.11446204484673936, + "grad_norm": 5.969142940247697, + "learning_rate": 1.9628422146145496e-05, + "loss": 0.842, + "step": 1391 + }, + { + "epoch": 0.11454433244188439, + "grad_norm": 4.102136287601403, + "learning_rate": 1.9627702000260755e-05, + "loss": 0.8504, + "step": 1392 + }, + { + "epoch": 0.11462662003702942, + "grad_norm": 4.63289281295388, + "learning_rate": 1.962698117043946e-05, + "loss": 0.8669, + "step": 1393 + }, + { + "epoch": 0.11470890763217445, + "grad_norm": 4.412255314793883, + "learning_rate": 1.9626259656732816e-05, + "loss": 0.8385, + "step": 1394 + }, + { + "epoch": 0.11479119522731948, + "grad_norm": 3.424254537799306, + "learning_rate": 1.962553745919208e-05, + "loss": 0.8332, + "step": 1395 + }, + { + "epoch": 0.11487348282246451, + "grad_norm": 0.7843996001653674, + "learning_rate": 1.962481457786856e-05, + "loss": 0.5559, + "step": 1396 + }, + { + "epoch": 0.11495577041760954, + "grad_norm": 6.392616540047419, + "learning_rate": 1.9624091012813606e-05, + "loss": 0.861, + "step": 1397 + }, + { + "epoch": 0.11503805801275457, + "grad_norm": 4.375124486829069, + "learning_rate": 1.9623366764078616e-05, + "loss": 0.8593, + "step": 1398 + }, + { + "epoch": 0.1151203456078996, + "grad_norm": 4.818446405657229, + "learning_rate": 1.962264183171504e-05, + "loss": 0.8331, + "step": 1399 + }, + { + "epoch": 0.11520263320304464, + "grad_norm": 4.703078440427859, + "learning_rate": 1.9621916215774382e-05, + "loss": 0.8457, + "step": 1400 + }, + { + "epoch": 0.11528492079818967, + "grad_norm": 4.037026903799907, + "learning_rate": 1.9621189916308178e-05, + "loss": 0.816, + "step": 1401 + }, + { + "epoch": 0.11536720839333471, + "grad_norm": 5.119130138779516, + "learning_rate": 1.9620462933368033e-05, + "loss": 0.8436, + "step": 1402 + }, + { + "epoch": 0.11544949598847974, + "grad_norm": 4.852298451053651, + "learning_rate": 1.961973526700559e-05, + "loss": 0.8485, + "step": 1403 + }, + { + "epoch": 0.11553178358362477, + "grad_norm": 4.92540174605456, + "learning_rate": 1.961900691727253e-05, + "loss": 0.8549, + "step": 1404 + }, + { + "epoch": 0.1156140711787698, + "grad_norm": 4.282680446115329, + "learning_rate": 1.9618277884220606e-05, + "loss": 0.8503, + "step": 1405 + }, + { + "epoch": 0.11569635877391483, + "grad_norm": 6.16103079429699, + "learning_rate": 1.9617548167901606e-05, + "loss": 0.8613, + "step": 1406 + }, + { + "epoch": 0.11577864636905986, + "grad_norm": 0.6928414088371742, + "learning_rate": 1.9616817768367362e-05, + "loss": 0.579, + "step": 1407 + }, + { + "epoch": 0.1158609339642049, + "grad_norm": 6.34405553140399, + "learning_rate": 1.9616086685669764e-05, + "loss": 0.839, + "step": 1408 + }, + { + "epoch": 0.11594322155934993, + "grad_norm": 4.315757032376313, + "learning_rate": 1.9615354919860748e-05, + "loss": 0.8458, + "step": 1409 + }, + { + "epoch": 0.11602550915449496, + "grad_norm": 0.5455750851135943, + "learning_rate": 1.961462247099229e-05, + "loss": 0.5573, + "step": 1410 + }, + { + "epoch": 0.11610779674963999, + "grad_norm": 6.034656447100853, + "learning_rate": 1.9613889339116436e-05, + "loss": 0.8626, + "step": 1411 + }, + { + "epoch": 0.11619008434478502, + "grad_norm": 7.09081398025819, + "learning_rate": 1.9613155524285257e-05, + "loss": 0.8381, + "step": 1412 + }, + { + "epoch": 0.11627237193993005, + "grad_norm": 6.863304280435548, + "learning_rate": 1.961242102655088e-05, + "loss": 0.8502, + "step": 1413 + }, + { + "epoch": 0.11635465953507508, + "grad_norm": 10.784533484864859, + "learning_rate": 1.961168584596549e-05, + "loss": 0.8679, + "step": 1414 + }, + { + "epoch": 0.11643694713022013, + "grad_norm": 4.470623239984508, + "learning_rate": 1.9610949982581305e-05, + "loss": 0.8402, + "step": 1415 + }, + { + "epoch": 0.11651923472536516, + "grad_norm": 5.269519406473394, + "learning_rate": 1.9610213436450605e-05, + "loss": 0.8472, + "step": 1416 + }, + { + "epoch": 0.11660152232051019, + "grad_norm": 5.275728737736502, + "learning_rate": 1.9609476207625712e-05, + "loss": 0.8542, + "step": 1417 + }, + { + "epoch": 0.11668380991565522, + "grad_norm": 5.631423086351522, + "learning_rate": 1.9608738296158997e-05, + "loss": 0.8693, + "step": 1418 + }, + { + "epoch": 0.11676609751080025, + "grad_norm": 4.4364700256265195, + "learning_rate": 1.9607999702102882e-05, + "loss": 0.852, + "step": 1419 + }, + { + "epoch": 0.11684838510594528, + "grad_norm": 0.5703836069388057, + "learning_rate": 1.9607260425509832e-05, + "loss": 0.5766, + "step": 1420 + }, + { + "epoch": 0.11693067270109031, + "grad_norm": 4.058718090767148, + "learning_rate": 1.9606520466432368e-05, + "loss": 0.8632, + "step": 1421 + }, + { + "epoch": 0.11701296029623534, + "grad_norm": 0.5038706574336734, + "learning_rate": 1.9605779824923053e-05, + "loss": 0.579, + "step": 1422 + }, + { + "epoch": 0.11709524789138037, + "grad_norm": 0.4777953546804379, + "learning_rate": 1.96050385010345e-05, + "loss": 0.5404, + "step": 1423 + }, + { + "epoch": 0.1171775354865254, + "grad_norm": 4.103894856069992, + "learning_rate": 1.9604296494819372e-05, + "loss": 0.8169, + "step": 1424 + }, + { + "epoch": 0.11725982308167043, + "grad_norm": 5.3210038004322335, + "learning_rate": 1.9603553806330383e-05, + "loss": 0.8412, + "step": 1425 + }, + { + "epoch": 0.11734211067681546, + "grad_norm": 5.333076136990717, + "learning_rate": 1.960281043562029e-05, + "loss": 0.8566, + "step": 1426 + }, + { + "epoch": 0.1174243982719605, + "grad_norm": 4.6932345814293965, + "learning_rate": 1.96020663827419e-05, + "loss": 0.8448, + "step": 1427 + }, + { + "epoch": 0.11750668586710554, + "grad_norm": 8.408659107060835, + "learning_rate": 1.960132164774807e-05, + "loss": 0.8552, + "step": 1428 + }, + { + "epoch": 0.11758897346225057, + "grad_norm": 5.633908689330064, + "learning_rate": 1.9600576230691704e-05, + "loss": 0.8782, + "step": 1429 + }, + { + "epoch": 0.1176712610573956, + "grad_norm": 12.574732342527673, + "learning_rate": 1.9599830131625763e-05, + "loss": 0.8689, + "step": 1430 + }, + { + "epoch": 0.11775354865254063, + "grad_norm": 0.6380068292188105, + "learning_rate": 1.9599083350603237e-05, + "loss": 0.5682, + "step": 1431 + }, + { + "epoch": 0.11783583624768566, + "grad_norm": 5.858612887393994, + "learning_rate": 1.959833588767718e-05, + "loss": 0.8578, + "step": 1432 + }, + { + "epoch": 0.1179181238428307, + "grad_norm": 5.579945680423859, + "learning_rate": 1.9597587742900693e-05, + "loss": 0.8282, + "step": 1433 + }, + { + "epoch": 0.11800041143797572, + "grad_norm": 6.137557946832193, + "learning_rate": 1.9596838916326923e-05, + "loss": 0.8397, + "step": 1434 + }, + { + "epoch": 0.11808269903312076, + "grad_norm": 19.398037303752524, + "learning_rate": 1.9596089408009066e-05, + "loss": 0.8243, + "step": 1435 + }, + { + "epoch": 0.11816498662826579, + "grad_norm": 6.767037667437653, + "learning_rate": 1.959533921800036e-05, + "loss": 0.8431, + "step": 1436 + }, + { + "epoch": 0.11824727422341082, + "grad_norm": 7.424308817399147, + "learning_rate": 1.9594588346354104e-05, + "loss": 0.8434, + "step": 1437 + }, + { + "epoch": 0.11832956181855585, + "grad_norm": 4.930241166882705, + "learning_rate": 1.9593836793123637e-05, + "loss": 0.8736, + "step": 1438 + }, + { + "epoch": 0.11841184941370088, + "grad_norm": 11.449117099938325, + "learning_rate": 1.9593084558362347e-05, + "loss": 0.8572, + "step": 1439 + }, + { + "epoch": 0.11849413700884592, + "grad_norm": 5.001807158753299, + "learning_rate": 1.9592331642123667e-05, + "loss": 0.8825, + "step": 1440 + }, + { + "epoch": 0.11857642460399095, + "grad_norm": 0.5722309140048856, + "learning_rate": 1.9591578044461092e-05, + "loss": 0.595, + "step": 1441 + }, + { + "epoch": 0.11865871219913598, + "grad_norm": 4.519860069953814, + "learning_rate": 1.959082376542815e-05, + "loss": 0.8882, + "step": 1442 + }, + { + "epoch": 0.11874099979428102, + "grad_norm": 6.76893243360037, + "learning_rate": 1.959006880507843e-05, + "loss": 0.8414, + "step": 1443 + }, + { + "epoch": 0.11882328738942605, + "grad_norm": 9.155966381696617, + "learning_rate": 1.958931316346556e-05, + "loss": 0.8287, + "step": 1444 + }, + { + "epoch": 0.11890557498457108, + "grad_norm": 5.19033125874358, + "learning_rate": 1.9588556840643212e-05, + "loss": 0.8754, + "step": 1445 + }, + { + "epoch": 0.11898786257971611, + "grad_norm": 5.356560891356975, + "learning_rate": 1.9587799836665125e-05, + "loss": 0.8372, + "step": 1446 + }, + { + "epoch": 0.11907015017486114, + "grad_norm": 18.341970904215607, + "learning_rate": 1.958704215158507e-05, + "loss": 0.8482, + "step": 1447 + }, + { + "epoch": 0.11915243777000617, + "grad_norm": 8.160225573294031, + "learning_rate": 1.9586283785456873e-05, + "loss": 0.8293, + "step": 1448 + }, + { + "epoch": 0.1192347253651512, + "grad_norm": 9.809055722956309, + "learning_rate": 1.9585524738334408e-05, + "loss": 0.8323, + "step": 1449 + }, + { + "epoch": 0.11931701296029623, + "grad_norm": 6.1116327045703445, + "learning_rate": 1.9584765010271593e-05, + "loss": 0.8255, + "step": 1450 + }, + { + "epoch": 0.11939930055544126, + "grad_norm": 5.676397508769702, + "learning_rate": 1.9584004601322403e-05, + "loss": 0.8729, + "step": 1451 + }, + { + "epoch": 0.1194815881505863, + "grad_norm": 11.731682837110242, + "learning_rate": 1.958324351154085e-05, + "loss": 0.8485, + "step": 1452 + }, + { + "epoch": 0.11956387574573134, + "grad_norm": 5.524439480046927, + "learning_rate": 1.9582481740981006e-05, + "loss": 0.8268, + "step": 1453 + }, + { + "epoch": 0.11964616334087637, + "grad_norm": 4.631670905958911, + "learning_rate": 1.9581719289696982e-05, + "loss": 0.852, + "step": 1454 + }, + { + "epoch": 0.1197284509360214, + "grad_norm": 8.965374291575179, + "learning_rate": 1.9580956157742946e-05, + "loss": 0.8404, + "step": 1455 + }, + { + "epoch": 0.11981073853116643, + "grad_norm": 0.6261324902081594, + "learning_rate": 1.958019234517311e-05, + "loss": 0.5977, + "step": 1456 + }, + { + "epoch": 0.11989302612631146, + "grad_norm": 11.465995096047832, + "learning_rate": 1.9579427852041726e-05, + "loss": 0.8321, + "step": 1457 + }, + { + "epoch": 0.11997531372145649, + "grad_norm": 6.000409897850183, + "learning_rate": 1.957866267840311e-05, + "loss": 0.8627, + "step": 1458 + }, + { + "epoch": 0.12005760131660152, + "grad_norm": 18.843059181510863, + "learning_rate": 1.9577896824311614e-05, + "loss": 0.8605, + "step": 1459 + }, + { + "epoch": 0.12013988891174655, + "grad_norm": 0.557334104917724, + "learning_rate": 1.9577130289821645e-05, + "loss": 0.5944, + "step": 1460 + }, + { + "epoch": 0.12022217650689158, + "grad_norm": 6.479047097320461, + "learning_rate": 1.9576363074987657e-05, + "loss": 0.8217, + "step": 1461 + }, + { + "epoch": 0.12030446410203662, + "grad_norm": 7.966997648213342, + "learning_rate": 1.9575595179864152e-05, + "loss": 0.8549, + "step": 1462 + }, + { + "epoch": 0.12038675169718165, + "grad_norm": 5.241930410780214, + "learning_rate": 1.957482660450568e-05, + "loss": 0.846, + "step": 1463 + }, + { + "epoch": 0.12046903929232668, + "grad_norm": 4.581892002978914, + "learning_rate": 1.9574057348966836e-05, + "loss": 0.8455, + "step": 1464 + }, + { + "epoch": 0.12055132688747171, + "grad_norm": 6.431741301592298, + "learning_rate": 1.957328741330227e-05, + "loss": 0.8713, + "step": 1465 + }, + { + "epoch": 0.12063361448261675, + "grad_norm": 6.792937329047407, + "learning_rate": 1.9572516797566684e-05, + "loss": 0.8235, + "step": 1466 + }, + { + "epoch": 0.12071590207776178, + "grad_norm": 5.865092979344445, + "learning_rate": 1.9571745501814804e-05, + "loss": 0.8509, + "step": 1467 + }, + { + "epoch": 0.12079818967290681, + "grad_norm": 6.617024495241083, + "learning_rate": 1.9570973526101436e-05, + "loss": 0.8415, + "step": 1468 + }, + { + "epoch": 0.12088047726805184, + "grad_norm": 6.63634081135842, + "learning_rate": 1.9570200870481412e-05, + "loss": 0.831, + "step": 1469 + }, + { + "epoch": 0.12096276486319688, + "grad_norm": 12.04891396249779, + "learning_rate": 1.9569427535009628e-05, + "loss": 0.8267, + "step": 1470 + }, + { + "epoch": 0.1210450524583419, + "grad_norm": 10.237297997250556, + "learning_rate": 1.956865351974101e-05, + "loss": 0.8459, + "step": 1471 + }, + { + "epoch": 0.12112734005348694, + "grad_norm": 7.236361914133526, + "learning_rate": 1.9567878824730555e-05, + "loss": 0.8306, + "step": 1472 + }, + { + "epoch": 0.12120962764863197, + "grad_norm": 10.608567363064175, + "learning_rate": 1.9567103450033287e-05, + "loss": 0.8419, + "step": 1473 + }, + { + "epoch": 0.121291915243777, + "grad_norm": 6.2236993124342455, + "learning_rate": 1.956632739570429e-05, + "loss": 0.8488, + "step": 1474 + }, + { + "epoch": 0.12137420283892203, + "grad_norm": 6.201916448058059, + "learning_rate": 1.9565550661798694e-05, + "loss": 0.8447, + "step": 1475 + }, + { + "epoch": 0.12145649043406706, + "grad_norm": 14.813294651870368, + "learning_rate": 1.9564773248371675e-05, + "loss": 0.8574, + "step": 1476 + }, + { + "epoch": 0.12153877802921209, + "grad_norm": 6.638992018546782, + "learning_rate": 1.9563995155478465e-05, + "loss": 0.8426, + "step": 1477 + }, + { + "epoch": 0.12162106562435712, + "grad_norm": 9.428113607773538, + "learning_rate": 1.9563216383174334e-05, + "loss": 0.8247, + "step": 1478 + }, + { + "epoch": 0.12170335321950217, + "grad_norm": 8.214290390573499, + "learning_rate": 1.95624369315146e-05, + "loss": 0.8271, + "step": 1479 + }, + { + "epoch": 0.1217856408146472, + "grad_norm": 8.920576079471006, + "learning_rate": 1.9561656800554646e-05, + "loss": 0.8576, + "step": 1480 + }, + { + "epoch": 0.12186792840979223, + "grad_norm": 8.787444339921239, + "learning_rate": 1.956087599034988e-05, + "loss": 0.8277, + "step": 1481 + }, + { + "epoch": 0.12195021600493726, + "grad_norm": 0.5502297806463167, + "learning_rate": 1.9560094500955776e-05, + "loss": 0.6048, + "step": 1482 + }, + { + "epoch": 0.12203250360008229, + "grad_norm": 11.694419053216938, + "learning_rate": 1.9559312332427845e-05, + "loss": 0.8305, + "step": 1483 + }, + { + "epoch": 0.12211479119522732, + "grad_norm": 12.873042218198869, + "learning_rate": 1.9558529484821657e-05, + "loss": 0.8183, + "step": 1484 + }, + { + "epoch": 0.12219707879037235, + "grad_norm": 0.46111310227130453, + "learning_rate": 1.955774595819282e-05, + "loss": 0.5531, + "step": 1485 + }, + { + "epoch": 0.12227936638551738, + "grad_norm": 0.45642862343281065, + "learning_rate": 1.9556961752596996e-05, + "loss": 0.5605, + "step": 1486 + }, + { + "epoch": 0.12236165398066241, + "grad_norm": 0.4551291025310518, + "learning_rate": 1.955617686808989e-05, + "loss": 0.5594, + "step": 1487 + }, + { + "epoch": 0.12244394157580744, + "grad_norm": 9.604778819022467, + "learning_rate": 1.955539130472727e-05, + "loss": 0.8574, + "step": 1488 + }, + { + "epoch": 0.12252622917095247, + "grad_norm": 7.949165431488096, + "learning_rate": 1.9554605062564924e-05, + "loss": 0.8115, + "step": 1489 + }, + { + "epoch": 0.1226085167660975, + "grad_norm": 0.552813848862915, + "learning_rate": 1.955381814165872e-05, + "loss": 0.5678, + "step": 1490 + }, + { + "epoch": 0.12269080436124254, + "grad_norm": 25.858842385856658, + "learning_rate": 1.955303054206455e-05, + "loss": 0.8248, + "step": 1491 + }, + { + "epoch": 0.12277309195638758, + "grad_norm": 21.311435668873827, + "learning_rate": 1.9552242263838373e-05, + "loss": 0.8456, + "step": 1492 + }, + { + "epoch": 0.12285537955153261, + "grad_norm": 12.870319025289076, + "learning_rate": 1.9551453307036184e-05, + "loss": 0.8608, + "step": 1493 + }, + { + "epoch": 0.12293766714667764, + "grad_norm": 23.32006316043952, + "learning_rate": 1.955066367171402e-05, + "loss": 0.8373, + "step": 1494 + }, + { + "epoch": 0.12301995474182267, + "grad_norm": 9.485367656306368, + "learning_rate": 1.954987335792799e-05, + "loss": 0.8557, + "step": 1495 + }, + { + "epoch": 0.1231022423369677, + "grad_norm": 13.860805709310752, + "learning_rate": 1.9549082365734223e-05, + "loss": 0.8742, + "step": 1496 + }, + { + "epoch": 0.12318452993211274, + "grad_norm": 0.6621327410930654, + "learning_rate": 1.9548290695188922e-05, + "loss": 0.5893, + "step": 1497 + }, + { + "epoch": 0.12326681752725777, + "grad_norm": 0.49494709664955483, + "learning_rate": 1.9547498346348316e-05, + "loss": 0.5697, + "step": 1498 + }, + { + "epoch": 0.1233491051224028, + "grad_norm": 11.047657969799024, + "learning_rate": 1.9546705319268697e-05, + "loss": 0.8373, + "step": 1499 + }, + { + "epoch": 0.12343139271754783, + "grad_norm": 10.749614937500533, + "learning_rate": 1.95459116140064e-05, + "loss": 0.8224, + "step": 1500 + }, + { + "epoch": 0.12351368031269286, + "grad_norm": 9.894337094098182, + "learning_rate": 1.954511723061781e-05, + "loss": 0.8733, + "step": 1501 + }, + { + "epoch": 0.12359596790783789, + "grad_norm": 7.20174895173815, + "learning_rate": 1.9544322169159356e-05, + "loss": 0.8386, + "step": 1502 + }, + { + "epoch": 0.12367825550298292, + "grad_norm": 17.273897223584264, + "learning_rate": 1.954352642968752e-05, + "loss": 0.8412, + "step": 1503 + }, + { + "epoch": 0.12376054309812795, + "grad_norm": 12.15000397023379, + "learning_rate": 1.9542730012258827e-05, + "loss": 0.8449, + "step": 1504 + }, + { + "epoch": 0.123842830693273, + "grad_norm": 0.8275745786613676, + "learning_rate": 1.9541932916929856e-05, + "loss": 0.5836, + "step": 1505 + }, + { + "epoch": 0.12392511828841803, + "grad_norm": 15.852588331242886, + "learning_rate": 1.954113514375723e-05, + "loss": 0.8527, + "step": 1506 + }, + { + "epoch": 0.12400740588356306, + "grad_norm": 13.570552755618733, + "learning_rate": 1.9540336692797624e-05, + "loss": 0.8706, + "step": 1507 + }, + { + "epoch": 0.12408969347870809, + "grad_norm": 10.231318756335888, + "learning_rate": 1.9539537564107757e-05, + "loss": 0.8975, + "step": 1508 + }, + { + "epoch": 0.12417198107385312, + "grad_norm": 20.140957454242425, + "learning_rate": 1.9538737757744397e-05, + "loss": 0.8995, + "step": 1509 + }, + { + "epoch": 0.12425426866899815, + "grad_norm": 16.907927618041068, + "learning_rate": 1.953793727376436e-05, + "loss": 0.8719, + "step": 1510 + }, + { + "epoch": 0.12433655626414318, + "grad_norm": 9.640180209086472, + "learning_rate": 1.9537136112224515e-05, + "loss": 0.8524, + "step": 1511 + }, + { + "epoch": 0.12441884385928821, + "grad_norm": 1.0609910785257508, + "learning_rate": 1.9536334273181774e-05, + "loss": 0.5868, + "step": 1512 + }, + { + "epoch": 0.12450113145443324, + "grad_norm": 0.5945416770474524, + "learning_rate": 1.9535531756693093e-05, + "loss": 0.5585, + "step": 1513 + }, + { + "epoch": 0.12458341904957827, + "grad_norm": 14.737935841719995, + "learning_rate": 1.953472856281549e-05, + "loss": 0.8653, + "step": 1514 + }, + { + "epoch": 0.1246657066447233, + "grad_norm": 18.226124842762093, + "learning_rate": 1.9533924691606015e-05, + "loss": 0.8444, + "step": 1515 + }, + { + "epoch": 0.12474799423986833, + "grad_norm": 11.981069540475804, + "learning_rate": 1.953312014312178e-05, + "loss": 0.8725, + "step": 1516 + }, + { + "epoch": 0.12483028183501337, + "grad_norm": 17.168765421546752, + "learning_rate": 1.9532314917419936e-05, + "loss": 0.9122, + "step": 1517 + }, + { + "epoch": 0.12491256943015841, + "grad_norm": 10.036564112856656, + "learning_rate": 1.9531509014557683e-05, + "loss": 0.9176, + "step": 1518 + }, + { + "epoch": 0.12499485702530344, + "grad_norm": 20.07946819855691, + "learning_rate": 1.9530702434592274e-05, + "loss": 0.8879, + "step": 1519 + }, + { + "epoch": 0.12507714462044847, + "grad_norm": 29.057862151469628, + "learning_rate": 1.9529895177581007e-05, + "loss": 0.8805, + "step": 1520 + }, + { + "epoch": 0.1251594322155935, + "grad_norm": 21.321561085033913, + "learning_rate": 1.9529087243581228e-05, + "loss": 0.8783, + "step": 1521 + }, + { + "epoch": 0.12524171981073853, + "grad_norm": 10.669113063471517, + "learning_rate": 1.9528278632650325e-05, + "loss": 0.8843, + "step": 1522 + }, + { + "epoch": 0.12532400740588356, + "grad_norm": 15.487682058293332, + "learning_rate": 1.9527469344845752e-05, + "loss": 0.8621, + "step": 1523 + }, + { + "epoch": 0.1254062950010286, + "grad_norm": 22.91067214478477, + "learning_rate": 1.9526659380224994e-05, + "loss": 0.871, + "step": 1524 + }, + { + "epoch": 0.12548858259617363, + "grad_norm": 18.78756629418322, + "learning_rate": 1.9525848738845586e-05, + "loss": 0.8383, + "step": 1525 + }, + { + "epoch": 0.12557087019131866, + "grad_norm": 1.826156850833794, + "learning_rate": 1.952503742076512e-05, + "loss": 0.6692, + "step": 1526 + }, + { + "epoch": 0.1256531577864637, + "grad_norm": 10.01852490231935, + "learning_rate": 1.9524225426041225e-05, + "loss": 0.8774, + "step": 1527 + }, + { + "epoch": 0.12573544538160872, + "grad_norm": 9.799823932923529, + "learning_rate": 1.9523412754731594e-05, + "loss": 0.8553, + "step": 1528 + }, + { + "epoch": 0.12581773297675375, + "grad_norm": 10.243780274009985, + "learning_rate": 1.9522599406893946e-05, + "loss": 0.8825, + "step": 1529 + }, + { + "epoch": 0.12590002057189878, + "grad_norm": 0.5629974857686041, + "learning_rate": 1.952178538258607e-05, + "loss": 0.582, + "step": 1530 + }, + { + "epoch": 0.1259823081670438, + "grad_norm": 11.36979045705732, + "learning_rate": 1.9520970681865784e-05, + "loss": 0.8768, + "step": 1531 + }, + { + "epoch": 0.12606459576218884, + "grad_norm": 10.058423558398582, + "learning_rate": 1.9520155304790966e-05, + "loss": 0.8548, + "step": 1532 + }, + { + "epoch": 0.12614688335733387, + "grad_norm": 8.76973647154349, + "learning_rate": 1.9519339251419546e-05, + "loss": 0.8628, + "step": 1533 + }, + { + "epoch": 0.1262291709524789, + "grad_norm": 19.24178771102118, + "learning_rate": 1.9518522521809483e-05, + "loss": 0.8974, + "step": 1534 + }, + { + "epoch": 0.12631145854762393, + "grad_norm": 0.8687538872056417, + "learning_rate": 1.951770511601881e-05, + "loss": 0.6081, + "step": 1535 + }, + { + "epoch": 0.12639374614276896, + "grad_norm": 18.208746524258896, + "learning_rate": 1.9516887034105582e-05, + "loss": 0.8705, + "step": 1536 + }, + { + "epoch": 0.12647603373791402, + "grad_norm": 0.5991121850056094, + "learning_rate": 1.951606827612792e-05, + "loss": 0.5812, + "step": 1537 + }, + { + "epoch": 0.12655832133305905, + "grad_norm": 11.188990497417041, + "learning_rate": 1.9515248842143985e-05, + "loss": 0.8493, + "step": 1538 + }, + { + "epoch": 0.12664060892820408, + "grad_norm": 12.13754210039802, + "learning_rate": 1.951442873221199e-05, + "loss": 0.8539, + "step": 1539 + }, + { + "epoch": 0.12672289652334912, + "grad_norm": 0.6371445875057375, + "learning_rate": 1.9513607946390198e-05, + "loss": 0.5771, + "step": 1540 + }, + { + "epoch": 0.12680518411849415, + "grad_norm": 38.38383515449128, + "learning_rate": 1.9512786484736907e-05, + "loss": 0.8685, + "step": 1541 + }, + { + "epoch": 0.12688747171363918, + "grad_norm": 12.647804021527618, + "learning_rate": 1.951196434731048e-05, + "loss": 0.8547, + "step": 1542 + }, + { + "epoch": 0.1269697593087842, + "grad_norm": 11.663684157984594, + "learning_rate": 1.951114153416932e-05, + "loss": 0.8518, + "step": 1543 + }, + { + "epoch": 0.12705204690392924, + "grad_norm": 14.698290238642006, + "learning_rate": 1.9510318045371873e-05, + "loss": 0.8299, + "step": 1544 + }, + { + "epoch": 0.12713433449907427, + "grad_norm": 18.836062665500368, + "learning_rate": 1.9509493880976645e-05, + "loss": 0.8649, + "step": 1545 + }, + { + "epoch": 0.1272166220942193, + "grad_norm": 37.604495524678796, + "learning_rate": 1.9508669041042175e-05, + "loss": 0.8771, + "step": 1546 + }, + { + "epoch": 0.12729890968936433, + "grad_norm": 16.242906985077767, + "learning_rate": 1.950784352562707e-05, + "loss": 0.8559, + "step": 1547 + }, + { + "epoch": 0.12738119728450936, + "grad_norm": 12.303960320937623, + "learning_rate": 1.950701733478996e-05, + "loss": 0.8269, + "step": 1548 + }, + { + "epoch": 0.1274634848796544, + "grad_norm": 15.582543893350989, + "learning_rate": 1.9506190468589542e-05, + "loss": 0.8547, + "step": 1549 + }, + { + "epoch": 0.12754577247479942, + "grad_norm": 12.36757962681498, + "learning_rate": 1.950536292708456e-05, + "loss": 0.854, + "step": 1550 + }, + { + "epoch": 0.12762806006994445, + "grad_norm": 11.80476370196413, + "learning_rate": 1.9504534710333795e-05, + "loss": 0.8952, + "step": 1551 + }, + { + "epoch": 0.12771034766508949, + "grad_norm": 30.303720277928623, + "learning_rate": 1.950370581839609e-05, + "loss": 0.8396, + "step": 1552 + }, + { + "epoch": 0.12779263526023452, + "grad_norm": 9.097984679042607, + "learning_rate": 1.9502876251330315e-05, + "loss": 0.8424, + "step": 1553 + }, + { + "epoch": 0.12787492285537955, + "grad_norm": 16.918561440988324, + "learning_rate": 1.9502046009195413e-05, + "loss": 0.8429, + "step": 1554 + }, + { + "epoch": 0.12795721045052458, + "grad_norm": 13.858699886134135, + "learning_rate": 1.9501215092050357e-05, + "loss": 0.8222, + "step": 1555 + }, + { + "epoch": 0.1280394980456696, + "grad_norm": 0.7950557504894921, + "learning_rate": 1.9500383499954178e-05, + "loss": 0.5902, + "step": 1556 + }, + { + "epoch": 0.12812178564081464, + "grad_norm": 24.803535791656607, + "learning_rate": 1.9499551232965948e-05, + "loss": 0.8334, + "step": 1557 + }, + { + "epoch": 0.12820407323595967, + "grad_norm": 11.828695261044466, + "learning_rate": 1.949871829114479e-05, + "loss": 0.8758, + "step": 1558 + }, + { + "epoch": 0.1282863608311047, + "grad_norm": 12.441875233794036, + "learning_rate": 1.9497884674549875e-05, + "loss": 0.8732, + "step": 1559 + }, + { + "epoch": 0.12836864842624973, + "grad_norm": 28.468422713163175, + "learning_rate": 1.9497050383240423e-05, + "loss": 0.8573, + "step": 1560 + }, + { + "epoch": 0.12845093602139476, + "grad_norm": 17.260133565508898, + "learning_rate": 1.94962154172757e-05, + "loss": 0.8569, + "step": 1561 + }, + { + "epoch": 0.1285332236165398, + "grad_norm": 17.78196970160732, + "learning_rate": 1.949537977671502e-05, + "loss": 0.8392, + "step": 1562 + }, + { + "epoch": 0.12861551121168485, + "grad_norm": 11.158957006384213, + "learning_rate": 1.949454346161775e-05, + "loss": 0.8347, + "step": 1563 + }, + { + "epoch": 0.12869779880682988, + "grad_norm": 17.498878506507015, + "learning_rate": 1.949370647204329e-05, + "loss": 0.8809, + "step": 1564 + }, + { + "epoch": 0.1287800864019749, + "grad_norm": 7.9176045197846205, + "learning_rate": 1.9492868808051112e-05, + "loss": 0.8456, + "step": 1565 + }, + { + "epoch": 0.12886237399711994, + "grad_norm": 20.166846846916215, + "learning_rate": 1.9492030469700712e-05, + "loss": 0.832, + "step": 1566 + }, + { + "epoch": 0.12894466159226498, + "grad_norm": 19.174742273691734, + "learning_rate": 1.9491191457051646e-05, + "loss": 0.8443, + "step": 1567 + }, + { + "epoch": 0.12902694918741, + "grad_norm": 21.32891181756379, + "learning_rate": 1.9490351770163523e-05, + "loss": 0.8464, + "step": 1568 + }, + { + "epoch": 0.12910923678255504, + "grad_norm": 25.5931955657056, + "learning_rate": 1.9489511409095982e-05, + "loss": 0.8524, + "step": 1569 + }, + { + "epoch": 0.12919152437770007, + "grad_norm": 6.885350368161633, + "learning_rate": 1.9488670373908732e-05, + "loss": 0.8566, + "step": 1570 + }, + { + "epoch": 0.1292738119728451, + "grad_norm": 0.8064509572258421, + "learning_rate": 1.948782866466151e-05, + "loss": 0.5833, + "step": 1571 + }, + { + "epoch": 0.12935609956799013, + "grad_norm": 0.6437332857165583, + "learning_rate": 1.9486986281414113e-05, + "loss": 0.591, + "step": 1572 + }, + { + "epoch": 0.12943838716313516, + "grad_norm": 8.783260476561136, + "learning_rate": 1.9486143224226386e-05, + "loss": 0.8232, + "step": 1573 + }, + { + "epoch": 0.1295206747582802, + "grad_norm": 27.307179915869348, + "learning_rate": 1.9485299493158213e-05, + "loss": 0.8362, + "step": 1574 + }, + { + "epoch": 0.12960296235342522, + "grad_norm": 14.887699977158707, + "learning_rate": 1.948445508826953e-05, + "loss": 0.8774, + "step": 1575 + }, + { + "epoch": 0.12968524994857025, + "grad_norm": 8.94111905597464, + "learning_rate": 1.948361000962033e-05, + "loss": 0.8493, + "step": 1576 + }, + { + "epoch": 0.12976753754371528, + "grad_norm": 11.638077472882795, + "learning_rate": 1.9482764257270643e-05, + "loss": 0.8597, + "step": 1577 + }, + { + "epoch": 0.12984982513886031, + "grad_norm": 9.045639740647525, + "learning_rate": 1.9481917831280547e-05, + "loss": 0.8523, + "step": 1578 + }, + { + "epoch": 0.12993211273400535, + "grad_norm": 13.046344860495239, + "learning_rate": 1.948107073171017e-05, + "loss": 0.8627, + "step": 1579 + }, + { + "epoch": 0.13001440032915038, + "grad_norm": 14.174127420393626, + "learning_rate": 1.9480222958619696e-05, + "loss": 0.8322, + "step": 1580 + }, + { + "epoch": 0.1300966879242954, + "grad_norm": 7.026950552196628, + "learning_rate": 1.947937451206934e-05, + "loss": 0.86, + "step": 1581 + }, + { + "epoch": 0.13017897551944044, + "grad_norm": 1.335200372111077, + "learning_rate": 1.947852539211938e-05, + "loss": 0.6371, + "step": 1582 + }, + { + "epoch": 0.13026126311458547, + "grad_norm": 12.298240422318646, + "learning_rate": 1.9477675598830135e-05, + "loss": 0.8637, + "step": 1583 + }, + { + "epoch": 0.1303435507097305, + "grad_norm": 11.297299148156007, + "learning_rate": 1.947682513226197e-05, + "loss": 0.8403, + "step": 1584 + }, + { + "epoch": 0.13042583830487553, + "grad_norm": 11.722864354186749, + "learning_rate": 1.947597399247531e-05, + "loss": 0.8774, + "step": 1585 + }, + { + "epoch": 0.13050812590002056, + "grad_norm": 0.6703589211407367, + "learning_rate": 1.9475122179530608e-05, + "loss": 0.5872, + "step": 1586 + }, + { + "epoch": 0.1305904134951656, + "grad_norm": 7.201572575314816, + "learning_rate": 1.947426969348838e-05, + "loss": 0.8333, + "step": 1587 + }, + { + "epoch": 0.13067270109031062, + "grad_norm": 8.51219810707983, + "learning_rate": 1.9473416534409183e-05, + "loss": 0.8692, + "step": 1588 + }, + { + "epoch": 0.13075498868545568, + "grad_norm": 12.72665901538907, + "learning_rate": 1.9472562702353628e-05, + "loss": 0.8353, + "step": 1589 + }, + { + "epoch": 0.1308372762806007, + "grad_norm": 8.440240359174169, + "learning_rate": 1.9471708197382367e-05, + "loss": 0.8447, + "step": 1590 + }, + { + "epoch": 0.13091956387574574, + "grad_norm": 18.794459104702618, + "learning_rate": 1.9470853019556105e-05, + "loss": 0.8514, + "step": 1591 + }, + { + "epoch": 0.13100185147089077, + "grad_norm": 15.204805304916004, + "learning_rate": 1.946999716893559e-05, + "loss": 0.8918, + "step": 1592 + }, + { + "epoch": 0.1310841390660358, + "grad_norm": 20.447557635429625, + "learning_rate": 1.946914064558162e-05, + "loss": 0.8291, + "step": 1593 + }, + { + "epoch": 0.13116642666118083, + "grad_norm": 0.8902410259266276, + "learning_rate": 1.9468283449555044e-05, + "loss": 0.5855, + "step": 1594 + }, + { + "epoch": 0.13124871425632587, + "grad_norm": 16.32691590599941, + "learning_rate": 1.946742558091675e-05, + "loss": 0.8729, + "step": 1595 + }, + { + "epoch": 0.1313310018514709, + "grad_norm": 12.629234690471154, + "learning_rate": 1.946656703972769e-05, + "loss": 0.8502, + "step": 1596 + }, + { + "epoch": 0.13141328944661593, + "grad_norm": 11.942150001738323, + "learning_rate": 1.946570782604884e-05, + "loss": 0.8488, + "step": 1597 + }, + { + "epoch": 0.13149557704176096, + "grad_norm": 18.66420798538323, + "learning_rate": 1.9464847939941253e-05, + "loss": 0.8415, + "step": 1598 + }, + { + "epoch": 0.131577864636906, + "grad_norm": 0.5487160534091807, + "learning_rate": 1.9463987381465997e-05, + "loss": 0.5807, + "step": 1599 + }, + { + "epoch": 0.13166015223205102, + "grad_norm": 0.47378948261483483, + "learning_rate": 1.9463126150684215e-05, + "loss": 0.5611, + "step": 1600 + }, + { + "epoch": 0.13174243982719605, + "grad_norm": 11.826166345708458, + "learning_rate": 1.946226424765709e-05, + "loss": 0.8621, + "step": 1601 + }, + { + "epoch": 0.13182472742234108, + "grad_norm": 15.67907329393183, + "learning_rate": 1.946140167244584e-05, + "loss": 0.8326, + "step": 1602 + }, + { + "epoch": 0.1319070150174861, + "grad_norm": 13.265461124752493, + "learning_rate": 1.9460538425111747e-05, + "loss": 0.8491, + "step": 1603 + }, + { + "epoch": 0.13198930261263114, + "grad_norm": 21.302704279466422, + "learning_rate": 1.9459674505716134e-05, + "loss": 0.8538, + "step": 1604 + }, + { + "epoch": 0.13207159020777617, + "grad_norm": 0.8219538162066992, + "learning_rate": 1.9458809914320376e-05, + "loss": 0.6139, + "step": 1605 + }, + { + "epoch": 0.1321538778029212, + "grad_norm": 14.037933923062655, + "learning_rate": 1.9457944650985883e-05, + "loss": 0.825, + "step": 1606 + }, + { + "epoch": 0.13223616539806624, + "grad_norm": 16.51723934307185, + "learning_rate": 1.9457078715774137e-05, + "loss": 0.8398, + "step": 1607 + }, + { + "epoch": 0.13231845299321127, + "grad_norm": 14.88358516004508, + "learning_rate": 1.9456212108746638e-05, + "loss": 0.8464, + "step": 1608 + }, + { + "epoch": 0.1324007405883563, + "grad_norm": 30.189032243968434, + "learning_rate": 1.9455344829964952e-05, + "loss": 0.8403, + "step": 1609 + }, + { + "epoch": 0.13248302818350133, + "grad_norm": 12.53490946521001, + "learning_rate": 1.945447687949069e-05, + "loss": 0.8593, + "step": 1610 + }, + { + "epoch": 0.13256531577864636, + "grad_norm": 15.356236880805202, + "learning_rate": 1.9453608257385515e-05, + "loss": 0.8752, + "step": 1611 + }, + { + "epoch": 0.1326476033737914, + "grad_norm": 30.490886482179693, + "learning_rate": 1.9452738963711127e-05, + "loss": 0.8433, + "step": 1612 + }, + { + "epoch": 0.13272989096893642, + "grad_norm": 13.432846768419989, + "learning_rate": 1.945186899852928e-05, + "loss": 0.8632, + "step": 1613 + }, + { + "epoch": 0.13281217856408145, + "grad_norm": 15.210857748618107, + "learning_rate": 1.9450998361901778e-05, + "loss": 0.8299, + "step": 1614 + }, + { + "epoch": 0.1328944661592265, + "grad_norm": 9.248268507271018, + "learning_rate": 1.945012705389046e-05, + "loss": 0.8795, + "step": 1615 + }, + { + "epoch": 0.13297675375437154, + "grad_norm": 13.312884263474453, + "learning_rate": 1.9449255074557233e-05, + "loss": 0.8447, + "step": 1616 + }, + { + "epoch": 0.13305904134951657, + "grad_norm": 0.6387822773814731, + "learning_rate": 1.9448382423964038e-05, + "loss": 0.5803, + "step": 1617 + }, + { + "epoch": 0.1331413289446616, + "grad_norm": 8.987025451840744, + "learning_rate": 1.944750910217287e-05, + "loss": 0.8486, + "step": 1618 + }, + { + "epoch": 0.13322361653980663, + "grad_norm": 15.860555119958025, + "learning_rate": 1.944663510924576e-05, + "loss": 0.8429, + "step": 1619 + }, + { + "epoch": 0.13330590413495166, + "grad_norm": 15.755917461958926, + "learning_rate": 1.94457604452448e-05, + "loss": 0.8648, + "step": 1620 + }, + { + "epoch": 0.1333881917300967, + "grad_norm": 9.495081106148424, + "learning_rate": 1.9444885110232122e-05, + "loss": 0.8511, + "step": 1621 + }, + { + "epoch": 0.13347047932524173, + "grad_norm": 9.228948535480242, + "learning_rate": 1.9444009104269912e-05, + "loss": 0.8903, + "step": 1622 + }, + { + "epoch": 0.13355276692038676, + "grad_norm": 0.6835320058663733, + "learning_rate": 1.9443132427420402e-05, + "loss": 0.5648, + "step": 1623 + }, + { + "epoch": 0.1336350545155318, + "grad_norm": 7.512489044480011, + "learning_rate": 1.944225507974586e-05, + "loss": 0.8648, + "step": 1624 + }, + { + "epoch": 0.13371734211067682, + "grad_norm": 10.385149112147042, + "learning_rate": 1.9441377061308625e-05, + "loss": 0.8467, + "step": 1625 + }, + { + "epoch": 0.13379962970582185, + "grad_norm": 0.628760629783465, + "learning_rate": 1.9440498372171057e-05, + "loss": 0.5519, + "step": 1626 + }, + { + "epoch": 0.13388191730096688, + "grad_norm": 8.0862924915863, + "learning_rate": 1.9439619012395587e-05, + "loss": 0.8742, + "step": 1627 + }, + { + "epoch": 0.1339642048961119, + "grad_norm": 9.007382090603102, + "learning_rate": 1.9438738982044678e-05, + "loss": 0.8313, + "step": 1628 + }, + { + "epoch": 0.13404649249125694, + "grad_norm": 15.188130778601375, + "learning_rate": 1.9437858281180845e-05, + "loss": 0.8616, + "step": 1629 + }, + { + "epoch": 0.13412878008640197, + "grad_norm": 8.695715161522436, + "learning_rate": 1.9436976909866652e-05, + "loss": 0.8798, + "step": 1630 + }, + { + "epoch": 0.134211067681547, + "grad_norm": 8.332586114090097, + "learning_rate": 1.9436094868164714e-05, + "loss": 0.8304, + "step": 1631 + }, + { + "epoch": 0.13429335527669203, + "grad_norm": 6.131546976942165, + "learning_rate": 1.943521215613769e-05, + "loss": 0.8323, + "step": 1632 + }, + { + "epoch": 0.13437564287183706, + "grad_norm": 7.75079357067481, + "learning_rate": 1.9434328773848275e-05, + "loss": 0.8761, + "step": 1633 + }, + { + "epoch": 0.1344579304669821, + "grad_norm": 9.591056911031714, + "learning_rate": 1.943344472135924e-05, + "loss": 0.8391, + "step": 1634 + }, + { + "epoch": 0.13454021806212713, + "grad_norm": 0.5571523026882707, + "learning_rate": 1.943255999873338e-05, + "loss": 0.5626, + "step": 1635 + }, + { + "epoch": 0.13462250565727216, + "grad_norm": 7.180341879101182, + "learning_rate": 1.9431674606033535e-05, + "loss": 0.8573, + "step": 1636 + }, + { + "epoch": 0.1347047932524172, + "grad_norm": 6.297703126019438, + "learning_rate": 1.9430788543322614e-05, + "loss": 0.8645, + "step": 1637 + }, + { + "epoch": 0.13478708084756222, + "grad_norm": 13.328252581658347, + "learning_rate": 1.942990181066356e-05, + "loss": 0.841, + "step": 1638 + }, + { + "epoch": 0.13486936844270725, + "grad_norm": 9.478995566997952, + "learning_rate": 1.9429014408119354e-05, + "loss": 0.843, + "step": 1639 + }, + { + "epoch": 0.13495165603785228, + "grad_norm": 10.30497161424214, + "learning_rate": 1.942812633575305e-05, + "loss": 0.8561, + "step": 1640 + }, + { + "epoch": 0.13503394363299734, + "grad_norm": 10.783364375169487, + "learning_rate": 1.9427237593627727e-05, + "loss": 0.8702, + "step": 1641 + }, + { + "epoch": 0.13511623122814237, + "grad_norm": 5.936021952203429, + "learning_rate": 1.9426348181806527e-05, + "loss": 0.8651, + "step": 1642 + }, + { + "epoch": 0.1351985188232874, + "grad_norm": 6.865865248520576, + "learning_rate": 1.9425458100352622e-05, + "loss": 0.8369, + "step": 1643 + }, + { + "epoch": 0.13528080641843243, + "grad_norm": 9.25539230502882, + "learning_rate": 1.942456734932925e-05, + "loss": 0.8614, + "step": 1644 + }, + { + "epoch": 0.13536309401357746, + "grad_norm": 6.449494273747695, + "learning_rate": 1.9423675928799684e-05, + "loss": 0.8495, + "step": 1645 + }, + { + "epoch": 0.1354453816087225, + "grad_norm": 9.981428791114054, + "learning_rate": 1.942278383882725e-05, + "loss": 0.8203, + "step": 1646 + }, + { + "epoch": 0.13552766920386752, + "grad_norm": 11.905054324962366, + "learning_rate": 1.9421891079475323e-05, + "loss": 0.8565, + "step": 1647 + }, + { + "epoch": 0.13560995679901255, + "grad_norm": 5.099205899457276, + "learning_rate": 1.9420997650807324e-05, + "loss": 0.8462, + "step": 1648 + }, + { + "epoch": 0.13569224439415759, + "grad_norm": 7.551077756646788, + "learning_rate": 1.9420103552886718e-05, + "loss": 0.8278, + "step": 1649 + }, + { + "epoch": 0.13577453198930262, + "grad_norm": 9.34742160721662, + "learning_rate": 1.941920878577702e-05, + "loss": 0.855, + "step": 1650 + }, + { + "epoch": 0.13585681958444765, + "grad_norm": 14.953328811805543, + "learning_rate": 1.9418313349541792e-05, + "loss": 0.8555, + "step": 1651 + }, + { + "epoch": 0.13593910717959268, + "grad_norm": 19.65415961612349, + "learning_rate": 1.9417417244244645e-05, + "loss": 0.8416, + "step": 1652 + }, + { + "epoch": 0.1360213947747377, + "grad_norm": 6.723089668465277, + "learning_rate": 1.9416520469949242e-05, + "loss": 0.8485, + "step": 1653 + }, + { + "epoch": 0.13610368236988274, + "grad_norm": 9.685387477166035, + "learning_rate": 1.9415623026719282e-05, + "loss": 0.8333, + "step": 1654 + }, + { + "epoch": 0.13618596996502777, + "grad_norm": 6.442031893344583, + "learning_rate": 1.941472491461852e-05, + "loss": 0.8313, + "step": 1655 + }, + { + "epoch": 0.1362682575601728, + "grad_norm": 11.978462205062288, + "learning_rate": 1.941382613371076e-05, + "loss": 0.8356, + "step": 1656 + }, + { + "epoch": 0.13635054515531783, + "grad_norm": 8.959450516597315, + "learning_rate": 1.9412926684059844e-05, + "loss": 0.8299, + "step": 1657 + }, + { + "epoch": 0.13643283275046286, + "grad_norm": 8.564749166275782, + "learning_rate": 1.9412026565729668e-05, + "loss": 0.8062, + "step": 1658 + }, + { + "epoch": 0.1365151203456079, + "grad_norm": 0.48110518320018675, + "learning_rate": 1.941112577878418e-05, + "loss": 0.5745, + "step": 1659 + }, + { + "epoch": 0.13659740794075292, + "grad_norm": 11.492434132872301, + "learning_rate": 1.9410224323287368e-05, + "loss": 0.8564, + "step": 1660 + }, + { + "epoch": 0.13667969553589795, + "grad_norm": 7.0096165345110455, + "learning_rate": 1.9409322199303265e-05, + "loss": 0.8245, + "step": 1661 + }, + { + "epoch": 0.13676198313104299, + "grad_norm": 7.244034742758887, + "learning_rate": 1.9408419406895963e-05, + "loss": 0.8423, + "step": 1662 + }, + { + "epoch": 0.13684427072618802, + "grad_norm": 8.758950640082848, + "learning_rate": 1.9407515946129596e-05, + "loss": 0.8159, + "step": 1663 + }, + { + "epoch": 0.13692655832133305, + "grad_norm": 11.355355331778188, + "learning_rate": 1.9406611817068342e-05, + "loss": 0.8395, + "step": 1664 + }, + { + "epoch": 0.13700884591647808, + "grad_norm": 21.457605657742324, + "learning_rate": 1.9405707019776426e-05, + "loss": 0.8284, + "step": 1665 + }, + { + "epoch": 0.13709113351162314, + "grad_norm": 9.208046052928568, + "learning_rate": 1.9404801554318124e-05, + "loss": 0.8354, + "step": 1666 + }, + { + "epoch": 0.13717342110676817, + "grad_norm": 11.164511842285489, + "learning_rate": 1.940389542075776e-05, + "loss": 0.8446, + "step": 1667 + }, + { + "epoch": 0.1372557087019132, + "grad_norm": 7.698644554716207, + "learning_rate": 1.9402988619159706e-05, + "loss": 0.8295, + "step": 1668 + }, + { + "epoch": 0.13733799629705823, + "grad_norm": 0.49519598486863875, + "learning_rate": 1.940208114958838e-05, + "loss": 0.5877, + "step": 1669 + }, + { + "epoch": 0.13742028389220326, + "grad_norm": 10.405215695876064, + "learning_rate": 1.9401173012108244e-05, + "loss": 0.858, + "step": 1670 + }, + { + "epoch": 0.1375025714873483, + "grad_norm": 6.274166537890226, + "learning_rate": 1.940026420678381e-05, + "loss": 0.8495, + "step": 1671 + }, + { + "epoch": 0.13758485908249332, + "grad_norm": 8.247076915562847, + "learning_rate": 1.9399354733679644e-05, + "loss": 0.8241, + "step": 1672 + }, + { + "epoch": 0.13766714667763835, + "grad_norm": 9.44348381745056, + "learning_rate": 1.9398444592860346e-05, + "loss": 0.8807, + "step": 1673 + }, + { + "epoch": 0.13774943427278338, + "grad_norm": 0.46561126197328223, + "learning_rate": 1.9397533784390577e-05, + "loss": 0.5663, + "step": 1674 + }, + { + "epoch": 0.13783172186792841, + "grad_norm": 0.444368069087187, + "learning_rate": 1.939662230833504e-05, + "loss": 0.5139, + "step": 1675 + }, + { + "epoch": 0.13791400946307344, + "grad_norm": 7.357820789657159, + "learning_rate": 1.9395710164758478e-05, + "loss": 0.7818, + "step": 1676 + }, + { + "epoch": 0.13799629705821848, + "grad_norm": 6.9252347936815974, + "learning_rate": 1.9394797353725693e-05, + "loss": 0.8313, + "step": 1677 + }, + { + "epoch": 0.1380785846533635, + "grad_norm": 6.812431247075556, + "learning_rate": 1.9393883875301528e-05, + "loss": 0.7983, + "step": 1678 + }, + { + "epoch": 0.13816087224850854, + "grad_norm": 15.635575837691977, + "learning_rate": 1.9392969729550874e-05, + "loss": 0.8385, + "step": 1679 + }, + { + "epoch": 0.13824315984365357, + "grad_norm": 7.966855868898477, + "learning_rate": 1.9392054916538676e-05, + "loss": 0.8513, + "step": 1680 + }, + { + "epoch": 0.1383254474387986, + "grad_norm": 0.5191616103239319, + "learning_rate": 1.939113943632992e-05, + "loss": 0.5545, + "step": 1681 + }, + { + "epoch": 0.13840773503394363, + "grad_norm": 9.268850841092327, + "learning_rate": 1.939022328898963e-05, + "loss": 0.8638, + "step": 1682 + }, + { + "epoch": 0.13849002262908866, + "grad_norm": 6.2611125201139055, + "learning_rate": 1.9389306474582898e-05, + "loss": 0.8328, + "step": 1683 + }, + { + "epoch": 0.1385723102242337, + "grad_norm": 18.750042480887604, + "learning_rate": 1.938838899317485e-05, + "loss": 0.8319, + "step": 1684 + }, + { + "epoch": 0.13865459781937872, + "grad_norm": 9.045445107488831, + "learning_rate": 1.9387470844830663e-05, + "loss": 0.8632, + "step": 1685 + }, + { + "epoch": 0.13873688541452375, + "grad_norm": 10.10333245390392, + "learning_rate": 1.938655202961556e-05, + "loss": 0.8402, + "step": 1686 + }, + { + "epoch": 0.13881917300966878, + "grad_norm": 14.37009496922347, + "learning_rate": 1.938563254759481e-05, + "loss": 0.842, + "step": 1687 + }, + { + "epoch": 0.13890146060481381, + "grad_norm": 4.62456351377046, + "learning_rate": 1.9384712398833737e-05, + "loss": 0.8379, + "step": 1688 + }, + { + "epoch": 0.13898374819995885, + "grad_norm": 5.48808023393408, + "learning_rate": 1.9383791583397704e-05, + "loss": 0.8401, + "step": 1689 + }, + { + "epoch": 0.13906603579510388, + "grad_norm": 0.508054869823019, + "learning_rate": 1.9382870101352122e-05, + "loss": 0.5855, + "step": 1690 + }, + { + "epoch": 0.1391483233902489, + "grad_norm": 0.45467031682961434, + "learning_rate": 1.9381947952762456e-05, + "loss": 0.5582, + "step": 1691 + }, + { + "epoch": 0.13923061098539397, + "grad_norm": 4.698170093241064, + "learning_rate": 1.9381025137694213e-05, + "loss": 0.8488, + "step": 1692 + }, + { + "epoch": 0.139312898580539, + "grad_norm": 17.925934835917072, + "learning_rate": 1.9380101656212942e-05, + "loss": 0.8374, + "step": 1693 + }, + { + "epoch": 0.13939518617568403, + "grad_norm": 21.74128240936288, + "learning_rate": 1.937917750838425e-05, + "loss": 0.835, + "step": 1694 + }, + { + "epoch": 0.13947747377082906, + "grad_norm": 0.5455463949858089, + "learning_rate": 1.9378252694273793e-05, + "loss": 0.5776, + "step": 1695 + }, + { + "epoch": 0.1395597613659741, + "grad_norm": 7.021491632670848, + "learning_rate": 1.937732721394726e-05, + "loss": 0.8338, + "step": 1696 + }, + { + "epoch": 0.13964204896111912, + "grad_norm": 0.47906268792032236, + "learning_rate": 1.93764010674704e-05, + "loss": 0.5718, + "step": 1697 + }, + { + "epoch": 0.13972433655626415, + "grad_norm": 9.18921205759664, + "learning_rate": 1.9375474254909002e-05, + "loss": 0.8374, + "step": 1698 + }, + { + "epoch": 0.13980662415140918, + "grad_norm": 6.4706598091478424, + "learning_rate": 1.9374546776328906e-05, + "loss": 0.8371, + "step": 1699 + }, + { + "epoch": 0.1398889117465542, + "grad_norm": 11.019615591253086, + "learning_rate": 1.9373618631796e-05, + "loss": 0.8192, + "step": 1700 + }, + { + "epoch": 0.13997119934169924, + "grad_norm": 6.695489764575939, + "learning_rate": 1.937268982137622e-05, + "loss": 0.7922, + "step": 1701 + }, + { + "epoch": 0.14005348693684427, + "grad_norm": 7.916432098380521, + "learning_rate": 1.937176034513554e-05, + "loss": 0.8493, + "step": 1702 + }, + { + "epoch": 0.1401357745319893, + "grad_norm": 5.689792227478553, + "learning_rate": 1.9370830203139998e-05, + "loss": 0.8239, + "step": 1703 + }, + { + "epoch": 0.14021806212713434, + "grad_norm": 4.746618108561176, + "learning_rate": 1.936989939545566e-05, + "loss": 0.8285, + "step": 1704 + }, + { + "epoch": 0.14030034972227937, + "grad_norm": 4.051544179948662, + "learning_rate": 1.936896792214866e-05, + "loss": 0.8394, + "step": 1705 + }, + { + "epoch": 0.1403826373174244, + "grad_norm": 5.5482681923761525, + "learning_rate": 1.9368035783285157e-05, + "loss": 0.8484, + "step": 1706 + }, + { + "epoch": 0.14046492491256943, + "grad_norm": 11.365762748018584, + "learning_rate": 1.9367102978931375e-05, + "loss": 0.8805, + "step": 1707 + }, + { + "epoch": 0.14054721250771446, + "grad_norm": 0.5814669329358698, + "learning_rate": 1.9366169509153578e-05, + "loss": 0.5721, + "step": 1708 + }, + { + "epoch": 0.1406295001028595, + "grad_norm": 8.761055022275535, + "learning_rate": 1.936523537401808e-05, + "loss": 0.8048, + "step": 1709 + }, + { + "epoch": 0.14071178769800452, + "grad_norm": 4.94096986820974, + "learning_rate": 1.9364300573591234e-05, + "loss": 0.8751, + "step": 1710 + }, + { + "epoch": 0.14079407529314955, + "grad_norm": 8.95653472365129, + "learning_rate": 1.9363365107939454e-05, + "loss": 0.8411, + "step": 1711 + }, + { + "epoch": 0.14087636288829458, + "grad_norm": 5.431161332386783, + "learning_rate": 1.936242897712919e-05, + "loss": 0.8469, + "step": 1712 + }, + { + "epoch": 0.1409586504834396, + "grad_norm": 9.867753201582625, + "learning_rate": 1.9361492181226947e-05, + "loss": 0.8505, + "step": 1713 + }, + { + "epoch": 0.14104093807858464, + "grad_norm": 5.136338927889941, + "learning_rate": 1.936055472029927e-05, + "loss": 0.8538, + "step": 1714 + }, + { + "epoch": 0.14112322567372967, + "grad_norm": 8.355395875659436, + "learning_rate": 1.9359616594412754e-05, + "loss": 0.8068, + "step": 1715 + }, + { + "epoch": 0.1412055132688747, + "grad_norm": 5.819359443189252, + "learning_rate": 1.9358677803634044e-05, + "loss": 0.8652, + "step": 1716 + }, + { + "epoch": 0.14128780086401974, + "grad_norm": 6.059353532101893, + "learning_rate": 1.9357738348029832e-05, + "loss": 0.834, + "step": 1717 + }, + { + "epoch": 0.1413700884591648, + "grad_norm": 0.5260853686785989, + "learning_rate": 1.9356798227666852e-05, + "loss": 0.5886, + "step": 1718 + }, + { + "epoch": 0.14145237605430983, + "grad_norm": 5.154728517737795, + "learning_rate": 1.935585744261189e-05, + "loss": 0.8592, + "step": 1719 + }, + { + "epoch": 0.14153466364945486, + "grad_norm": 4.60543857905063, + "learning_rate": 1.9354915992931778e-05, + "loss": 0.8158, + "step": 1720 + }, + { + "epoch": 0.1416169512445999, + "grad_norm": 6.439591937901596, + "learning_rate": 1.9353973878693393e-05, + "loss": 0.8429, + "step": 1721 + }, + { + "epoch": 0.14169923883974492, + "grad_norm": 5.191853855244775, + "learning_rate": 1.9353031099963665e-05, + "loss": 0.8704, + "step": 1722 + }, + { + "epoch": 0.14178152643488995, + "grad_norm": 7.89752048115663, + "learning_rate": 1.9352087656809563e-05, + "loss": 0.8092, + "step": 1723 + }, + { + "epoch": 0.14186381403003498, + "grad_norm": 5.331792359032669, + "learning_rate": 1.9351143549298115e-05, + "loss": 0.8468, + "step": 1724 + }, + { + "epoch": 0.14194610162518, + "grad_norm": 0.48989316809380234, + "learning_rate": 1.935019877749638e-05, + "loss": 0.563, + "step": 1725 + }, + { + "epoch": 0.14202838922032504, + "grad_norm": 5.256869924215583, + "learning_rate": 1.9349253341471483e-05, + "loss": 0.8459, + "step": 1726 + }, + { + "epoch": 0.14211067681547007, + "grad_norm": 4.772772230009674, + "learning_rate": 1.9348307241290574e-05, + "loss": 0.8424, + "step": 1727 + }, + { + "epoch": 0.1421929644106151, + "grad_norm": 5.139167308693871, + "learning_rate": 1.9347360477020873e-05, + "loss": 0.8294, + "step": 1728 + }, + { + "epoch": 0.14227525200576013, + "grad_norm": 8.427018162829064, + "learning_rate": 1.934641304872963e-05, + "loss": 0.8984, + "step": 1729 + }, + { + "epoch": 0.14235753960090516, + "grad_norm": 5.195894888253091, + "learning_rate": 1.934546495648415e-05, + "loss": 0.8575, + "step": 1730 + }, + { + "epoch": 0.1424398271960502, + "grad_norm": 6.03048911441289, + "learning_rate": 1.934451620035179e-05, + "loss": 0.8305, + "step": 1731 + }, + { + "epoch": 0.14252211479119523, + "grad_norm": 5.3710703103909765, + "learning_rate": 1.934356678039994e-05, + "loss": 0.8413, + "step": 1732 + }, + { + "epoch": 0.14260440238634026, + "grad_norm": 0.45884838090967267, + "learning_rate": 1.934261669669605e-05, + "loss": 0.5654, + "step": 1733 + }, + { + "epoch": 0.1426866899814853, + "grad_norm": 5.871990326718468, + "learning_rate": 1.934166594930761e-05, + "loss": 0.8636, + "step": 1734 + }, + { + "epoch": 0.14276897757663032, + "grad_norm": 6.040345693572168, + "learning_rate": 1.9340714538302165e-05, + "loss": 0.8436, + "step": 1735 + }, + { + "epoch": 0.14285126517177535, + "grad_norm": 16.452741842652962, + "learning_rate": 1.9339762463747293e-05, + "loss": 0.8507, + "step": 1736 + }, + { + "epoch": 0.14293355276692038, + "grad_norm": 8.023246371031073, + "learning_rate": 1.9338809725710636e-05, + "loss": 0.795, + "step": 1737 + }, + { + "epoch": 0.1430158403620654, + "grad_norm": 9.169126419921785, + "learning_rate": 1.933785632425987e-05, + "loss": 0.8365, + "step": 1738 + }, + { + "epoch": 0.14309812795721044, + "grad_norm": 0.4843788693491957, + "learning_rate": 1.933690225946272e-05, + "loss": 0.5824, + "step": 1739 + }, + { + "epoch": 0.14318041555235547, + "grad_norm": 6.87259085555059, + "learning_rate": 1.933594753138697e-05, + "loss": 0.85, + "step": 1740 + }, + { + "epoch": 0.1432627031475005, + "grad_norm": 10.501019350819892, + "learning_rate": 1.9334992140100437e-05, + "loss": 0.859, + "step": 1741 + }, + { + "epoch": 0.14334499074264553, + "grad_norm": 5.621762683180854, + "learning_rate": 1.9334036085670993e-05, + "loss": 0.8416, + "step": 1742 + }, + { + "epoch": 0.14342727833779056, + "grad_norm": 12.795324591638737, + "learning_rate": 1.933307936816655e-05, + "loss": 0.8138, + "step": 1743 + }, + { + "epoch": 0.14350956593293562, + "grad_norm": 6.250772565375806, + "learning_rate": 1.933212198765508e-05, + "loss": 0.8431, + "step": 1744 + }, + { + "epoch": 0.14359185352808065, + "grad_norm": 6.649923452980046, + "learning_rate": 1.933116394420458e-05, + "loss": 0.8683, + "step": 1745 + }, + { + "epoch": 0.14367414112322568, + "grad_norm": 0.4783295138092446, + "learning_rate": 1.9330205237883125e-05, + "loss": 0.5472, + "step": 1746 + }, + { + "epoch": 0.14375642871837072, + "grad_norm": 6.450808259182912, + "learning_rate": 1.9329245868758805e-05, + "loss": 0.8689, + "step": 1747 + }, + { + "epoch": 0.14383871631351575, + "grad_norm": 8.144945238397163, + "learning_rate": 1.9328285836899782e-05, + "loss": 0.8409, + "step": 1748 + }, + { + "epoch": 0.14392100390866078, + "grad_norm": 8.41829303690563, + "learning_rate": 1.932732514237425e-05, + "loss": 0.7911, + "step": 1749 + }, + { + "epoch": 0.1440032915038058, + "grad_norm": 7.756732625129085, + "learning_rate": 1.9326363785250456e-05, + "loss": 0.8231, + "step": 1750 + }, + { + "epoch": 0.14408557909895084, + "grad_norm": 6.8419012862661965, + "learning_rate": 1.9325401765596695e-05, + "loss": 0.7966, + "step": 1751 + }, + { + "epoch": 0.14416786669409587, + "grad_norm": 5.118896582942905, + "learning_rate": 1.9324439083481308e-05, + "loss": 0.8469, + "step": 1752 + }, + { + "epoch": 0.1442501542892409, + "grad_norm": 0.46273923634227443, + "learning_rate": 1.9323475738972682e-05, + "loss": 0.5401, + "step": 1753 + }, + { + "epoch": 0.14433244188438593, + "grad_norm": 5.933555491033483, + "learning_rate": 1.9322511732139247e-05, + "loss": 0.8232, + "step": 1754 + }, + { + "epoch": 0.14441472947953096, + "grad_norm": 19.74022464785281, + "learning_rate": 1.9321547063049487e-05, + "loss": 0.8377, + "step": 1755 + }, + { + "epoch": 0.144497017074676, + "grad_norm": 0.47208399084192787, + "learning_rate": 1.9320581731771933e-05, + "loss": 0.5788, + "step": 1756 + }, + { + "epoch": 0.14457930466982102, + "grad_norm": 6.155109939821394, + "learning_rate": 1.9319615738375156e-05, + "loss": 0.8477, + "step": 1757 + }, + { + "epoch": 0.14466159226496605, + "grad_norm": 4.5236917326626855, + "learning_rate": 1.9318649082927784e-05, + "loss": 0.8241, + "step": 1758 + }, + { + "epoch": 0.14474387986011109, + "grad_norm": 6.68261711206281, + "learning_rate": 1.9317681765498485e-05, + "loss": 0.8389, + "step": 1759 + }, + { + "epoch": 0.14482616745525612, + "grad_norm": 8.82307043815515, + "learning_rate": 1.9316713786155974e-05, + "loss": 0.8591, + "step": 1760 + }, + { + "epoch": 0.14490845505040115, + "grad_norm": 0.5330655449901153, + "learning_rate": 1.9315745144969017e-05, + "loss": 0.595, + "step": 1761 + }, + { + "epoch": 0.14499074264554618, + "grad_norm": 4.782447942910814, + "learning_rate": 1.9314775842006422e-05, + "loss": 0.8231, + "step": 1762 + }, + { + "epoch": 0.1450730302406912, + "grad_norm": 6.461784587179363, + "learning_rate": 1.931380587733705e-05, + "loss": 0.8385, + "step": 1763 + }, + { + "epoch": 0.14515531783583624, + "grad_norm": 6.211444562729787, + "learning_rate": 1.93128352510298e-05, + "loss": 0.8268, + "step": 1764 + }, + { + "epoch": 0.14523760543098127, + "grad_norm": 10.531808114746536, + "learning_rate": 1.931186396315363e-05, + "loss": 0.8177, + "step": 1765 + }, + { + "epoch": 0.1453198930261263, + "grad_norm": 19.19707142844517, + "learning_rate": 1.9310892013777533e-05, + "loss": 0.831, + "step": 1766 + }, + { + "epoch": 0.14540218062127133, + "grad_norm": 0.5389341914459922, + "learning_rate": 1.930991940297056e-05, + "loss": 0.6002, + "step": 1767 + }, + { + "epoch": 0.14548446821641636, + "grad_norm": 5.89170072392646, + "learning_rate": 1.93089461308018e-05, + "loss": 0.8448, + "step": 1768 + }, + { + "epoch": 0.1455667558115614, + "grad_norm": 5.109916223855104, + "learning_rate": 1.9307972197340397e-05, + "loss": 0.8092, + "step": 1769 + }, + { + "epoch": 0.14564904340670645, + "grad_norm": 0.48671468331497286, + "learning_rate": 1.9306997602655534e-05, + "loss": 0.5869, + "step": 1770 + }, + { + "epoch": 0.14573133100185148, + "grad_norm": 0.4650224096015914, + "learning_rate": 1.9306022346816446e-05, + "loss": 0.5473, + "step": 1771 + }, + { + "epoch": 0.1458136185969965, + "grad_norm": 10.109459292829206, + "learning_rate": 1.930504642989241e-05, + "loss": 0.8677, + "step": 1772 + }, + { + "epoch": 0.14589590619214154, + "grad_norm": 9.295217402252955, + "learning_rate": 1.930406985195276e-05, + "loss": 0.8392, + "step": 1773 + }, + { + "epoch": 0.14597819378728658, + "grad_norm": 0.4680780702597178, + "learning_rate": 1.9303092613066868e-05, + "loss": 0.5466, + "step": 1774 + }, + { + "epoch": 0.1460604813824316, + "grad_norm": 8.171994410417643, + "learning_rate": 1.9302114713304156e-05, + "loss": 0.857, + "step": 1775 + }, + { + "epoch": 0.14614276897757664, + "grad_norm": 5.044836964241432, + "learning_rate": 1.9301136152734087e-05, + "loss": 0.852, + "step": 1776 + }, + { + "epoch": 0.14622505657272167, + "grad_norm": 6.317438834212792, + "learning_rate": 1.9300156931426182e-05, + "loss": 0.8407, + "step": 1777 + }, + { + "epoch": 0.1463073441678667, + "grad_norm": 5.174942827941844, + "learning_rate": 1.9299177049450004e-05, + "loss": 0.8338, + "step": 1778 + }, + { + "epoch": 0.14638963176301173, + "grad_norm": 0.4723884851141025, + "learning_rate": 1.9298196506875158e-05, + "loss": 0.5827, + "step": 1779 + }, + { + "epoch": 0.14647191935815676, + "grad_norm": 13.596767621441385, + "learning_rate": 1.9297215303771304e-05, + "loss": 0.8679, + "step": 1780 + }, + { + "epoch": 0.1465542069533018, + "grad_norm": 5.205013158256939, + "learning_rate": 1.9296233440208142e-05, + "loss": 0.8362, + "step": 1781 + }, + { + "epoch": 0.14663649454844682, + "grad_norm": 5.587153299584088, + "learning_rate": 1.9295250916255425e-05, + "loss": 0.8351, + "step": 1782 + }, + { + "epoch": 0.14671878214359185, + "grad_norm": 0.47266552949506424, + "learning_rate": 1.9294267731982948e-05, + "loss": 0.5719, + "step": 1783 + }, + { + "epoch": 0.14680106973873688, + "grad_norm": 5.473479336109091, + "learning_rate": 1.9293283887460553e-05, + "loss": 0.8347, + "step": 1784 + }, + { + "epoch": 0.14688335733388191, + "grad_norm": 7.213933436839228, + "learning_rate": 1.9292299382758138e-05, + "loss": 0.8379, + "step": 1785 + }, + { + "epoch": 0.14696564492902695, + "grad_norm": 18.805904514994236, + "learning_rate": 1.9291314217945634e-05, + "loss": 0.8388, + "step": 1786 + }, + { + "epoch": 0.14704793252417198, + "grad_norm": 4.844653756754676, + "learning_rate": 1.9290328393093026e-05, + "loss": 0.8472, + "step": 1787 + }, + { + "epoch": 0.147130220119317, + "grad_norm": 4.296135787825302, + "learning_rate": 1.9289341908270347e-05, + "loss": 0.8629, + "step": 1788 + }, + { + "epoch": 0.14721250771446204, + "grad_norm": 11.658303520335162, + "learning_rate": 1.9288354763547673e-05, + "loss": 0.8029, + "step": 1789 + }, + { + "epoch": 0.14729479530960707, + "grad_norm": 4.837349799431046, + "learning_rate": 1.9287366958995136e-05, + "loss": 0.8772, + "step": 1790 + }, + { + "epoch": 0.1473770829047521, + "grad_norm": 0.48305002203280695, + "learning_rate": 1.9286378494682896e-05, + "loss": 0.5614, + "step": 1791 + }, + { + "epoch": 0.14745937049989713, + "grad_norm": 4.7653711011756075, + "learning_rate": 1.9285389370681184e-05, + "loss": 0.8333, + "step": 1792 + }, + { + "epoch": 0.14754165809504216, + "grad_norm": 6.453771640410041, + "learning_rate": 1.9284399587060262e-05, + "loss": 0.8446, + "step": 1793 + }, + { + "epoch": 0.1476239456901872, + "grad_norm": 10.175339873053852, + "learning_rate": 1.928340914389044e-05, + "loss": 0.8579, + "step": 1794 + }, + { + "epoch": 0.14770623328533222, + "grad_norm": 5.858011029442454, + "learning_rate": 1.9282418041242078e-05, + "loss": 0.8283, + "step": 1795 + }, + { + "epoch": 0.14778852088047728, + "grad_norm": 5.123269754961845, + "learning_rate": 1.9281426279185586e-05, + "loss": 0.8182, + "step": 1796 + }, + { + "epoch": 0.1478708084756223, + "grad_norm": 6.518221006236574, + "learning_rate": 1.928043385779141e-05, + "loss": 0.8492, + "step": 1797 + }, + { + "epoch": 0.14795309607076734, + "grad_norm": 5.832793043417557, + "learning_rate": 1.9279440777130056e-05, + "loss": 0.8485, + "step": 1798 + }, + { + "epoch": 0.14803538366591237, + "grad_norm": 6.497382970971675, + "learning_rate": 1.9278447037272072e-05, + "loss": 0.8638, + "step": 1799 + }, + { + "epoch": 0.1481176712610574, + "grad_norm": 9.185854302788478, + "learning_rate": 1.927745263828805e-05, + "loss": 0.8222, + "step": 1800 + }, + { + "epoch": 0.14819995885620244, + "grad_norm": 7.203459832191322, + "learning_rate": 1.9276457580248628e-05, + "loss": 0.8328, + "step": 1801 + }, + { + "epoch": 0.14828224645134747, + "grad_norm": 0.4659154540512444, + "learning_rate": 1.9275461863224492e-05, + "loss": 0.5878, + "step": 1802 + }, + { + "epoch": 0.1483645340464925, + "grad_norm": 4.761079806856456, + "learning_rate": 1.9274465487286383e-05, + "loss": 0.8401, + "step": 1803 + }, + { + "epoch": 0.14844682164163753, + "grad_norm": 5.43480146188045, + "learning_rate": 1.9273468452505075e-05, + "loss": 0.8504, + "step": 1804 + }, + { + "epoch": 0.14852910923678256, + "grad_norm": 0.4589008559325783, + "learning_rate": 1.92724707589514e-05, + "loss": 0.5615, + "step": 1805 + }, + { + "epoch": 0.1486113968319276, + "grad_norm": 0.4490133485736386, + "learning_rate": 1.9271472406696236e-05, + "loss": 0.5758, + "step": 1806 + }, + { + "epoch": 0.14869368442707262, + "grad_norm": 4.8766467786430665, + "learning_rate": 1.9270473395810494e-05, + "loss": 0.8508, + "step": 1807 + }, + { + "epoch": 0.14877597202221765, + "grad_norm": 4.857249289546154, + "learning_rate": 1.9269473726365147e-05, + "loss": 0.818, + "step": 1808 + }, + { + "epoch": 0.14885825961736268, + "grad_norm": 4.3364511276912845, + "learning_rate": 1.9268473398431217e-05, + "loss": 0.807, + "step": 1809 + }, + { + "epoch": 0.1489405472125077, + "grad_norm": 5.55022355695734, + "learning_rate": 1.9267472412079755e-05, + "loss": 0.8409, + "step": 1810 + }, + { + "epoch": 0.14902283480765274, + "grad_norm": 5.009007386374566, + "learning_rate": 1.9266470767381876e-05, + "loss": 0.8357, + "step": 1811 + }, + { + "epoch": 0.14910512240279777, + "grad_norm": 7.485805210738528, + "learning_rate": 1.9265468464408734e-05, + "loss": 0.8193, + "step": 1812 + }, + { + "epoch": 0.1491874099979428, + "grad_norm": 0.4919132266850831, + "learning_rate": 1.9264465503231526e-05, + "loss": 0.5705, + "step": 1813 + }, + { + "epoch": 0.14926969759308784, + "grad_norm": 7.072148553826402, + "learning_rate": 1.9263461883921506e-05, + "loss": 0.8298, + "step": 1814 + }, + { + "epoch": 0.14935198518823287, + "grad_norm": 15.97777993374769, + "learning_rate": 1.9262457606549973e-05, + "loss": 0.8325, + "step": 1815 + }, + { + "epoch": 0.1494342727833779, + "grad_norm": 10.369257659029978, + "learning_rate": 1.9261452671188257e-05, + "loss": 0.8727, + "step": 1816 + }, + { + "epoch": 0.14951656037852293, + "grad_norm": 5.9281264245080925, + "learning_rate": 1.926044707790776e-05, + "loss": 0.828, + "step": 1817 + }, + { + "epoch": 0.14959884797366796, + "grad_norm": 4.551465103131345, + "learning_rate": 1.9259440826779915e-05, + "loss": 0.8484, + "step": 1818 + }, + { + "epoch": 0.149681135568813, + "grad_norm": 4.877909704956422, + "learning_rate": 1.9258433917876197e-05, + "loss": 0.8548, + "step": 1819 + }, + { + "epoch": 0.14976342316395802, + "grad_norm": 0.5042068110016803, + "learning_rate": 1.9257426351268145e-05, + "loss": 0.5747, + "step": 1820 + }, + { + "epoch": 0.14984571075910308, + "grad_norm": 0.4843633549516813, + "learning_rate": 1.9256418127027325e-05, + "loss": 0.5803, + "step": 1821 + }, + { + "epoch": 0.1499279983542481, + "grad_norm": 6.15038334110192, + "learning_rate": 1.9255409245225366e-05, + "loss": 0.8112, + "step": 1822 + }, + { + "epoch": 0.15001028594939314, + "grad_norm": 6.421532923241749, + "learning_rate": 1.925439970593394e-05, + "loss": 0.8305, + "step": 1823 + }, + { + "epoch": 0.15009257354453817, + "grad_norm": 7.677045661984815, + "learning_rate": 1.9253389509224754e-05, + "loss": 0.8404, + "step": 1824 + }, + { + "epoch": 0.1501748611396832, + "grad_norm": 4.960096719845569, + "learning_rate": 1.925237865516958e-05, + "loss": 0.8275, + "step": 1825 + }, + { + "epoch": 0.15025714873482823, + "grad_norm": 4.836857288232429, + "learning_rate": 1.9251367143840218e-05, + "loss": 0.8426, + "step": 1826 + }, + { + "epoch": 0.15033943632997326, + "grad_norm": 4.6523616976989866, + "learning_rate": 1.9250354975308534e-05, + "loss": 0.8433, + "step": 1827 + }, + { + "epoch": 0.1504217239251183, + "grad_norm": 5.486424130354286, + "learning_rate": 1.9249342149646426e-05, + "loss": 0.83, + "step": 1828 + }, + { + "epoch": 0.15050401152026333, + "grad_norm": 6.185401090574369, + "learning_rate": 1.9248328666925838e-05, + "loss": 0.8208, + "step": 1829 + }, + { + "epoch": 0.15058629911540836, + "grad_norm": 6.335103339362094, + "learning_rate": 1.9247314527218778e-05, + "loss": 0.8487, + "step": 1830 + }, + { + "epoch": 0.1506685867105534, + "grad_norm": 4.5706381595966015, + "learning_rate": 1.9246299730597284e-05, + "loss": 0.8587, + "step": 1831 + }, + { + "epoch": 0.15075087430569842, + "grad_norm": 5.392989295897071, + "learning_rate": 1.924528427713344e-05, + "loss": 0.8276, + "step": 1832 + }, + { + "epoch": 0.15083316190084345, + "grad_norm": 8.741338516555382, + "learning_rate": 1.924426816689939e-05, + "loss": 0.8151, + "step": 1833 + }, + { + "epoch": 0.15091544949598848, + "grad_norm": 0.6685839984852447, + "learning_rate": 1.9243251399967313e-05, + "loss": 0.5844, + "step": 1834 + }, + { + "epoch": 0.1509977370911335, + "grad_norm": 0.5107566191246578, + "learning_rate": 1.9242233976409438e-05, + "loss": 0.5402, + "step": 1835 + }, + { + "epoch": 0.15108002468627854, + "grad_norm": 0.4613973290236187, + "learning_rate": 1.9241215896298043e-05, + "loss": 0.5524, + "step": 1836 + }, + { + "epoch": 0.15116231228142357, + "grad_norm": 8.64624478329892, + "learning_rate": 1.9240197159705448e-05, + "loss": 0.8503, + "step": 1837 + }, + { + "epoch": 0.1512445998765686, + "grad_norm": 5.785794004079789, + "learning_rate": 1.9239177766704026e-05, + "loss": 0.8447, + "step": 1838 + }, + { + "epoch": 0.15132688747171363, + "grad_norm": 6.449386312407525, + "learning_rate": 1.923815771736619e-05, + "loss": 0.8625, + "step": 1839 + }, + { + "epoch": 0.15140917506685866, + "grad_norm": 5.974603624558202, + "learning_rate": 1.9237137011764404e-05, + "loss": 0.8859, + "step": 1840 + }, + { + "epoch": 0.1514914626620037, + "grad_norm": 6.07613604597075, + "learning_rate": 1.9236115649971177e-05, + "loss": 0.8288, + "step": 1841 + }, + { + "epoch": 0.15157375025714873, + "grad_norm": 5.610188501747942, + "learning_rate": 1.9235093632059067e-05, + "loss": 0.8472, + "step": 1842 + }, + { + "epoch": 0.15165603785229376, + "grad_norm": 6.513611642507301, + "learning_rate": 1.9234070958100675e-05, + "loss": 0.8406, + "step": 1843 + }, + { + "epoch": 0.1517383254474388, + "grad_norm": 4.706538246713925, + "learning_rate": 1.923304762816865e-05, + "loss": 0.8313, + "step": 1844 + }, + { + "epoch": 0.15182061304258382, + "grad_norm": 5.335789671949779, + "learning_rate": 1.9232023642335683e-05, + "loss": 0.8423, + "step": 1845 + }, + { + "epoch": 0.15190290063772885, + "grad_norm": 5.374080569021992, + "learning_rate": 1.9230999000674526e-05, + "loss": 0.8584, + "step": 1846 + }, + { + "epoch": 0.1519851882328739, + "grad_norm": 4.023515750853757, + "learning_rate": 1.922997370325796e-05, + "loss": 0.8269, + "step": 1847 + }, + { + "epoch": 0.15206747582801894, + "grad_norm": 5.075361759924576, + "learning_rate": 1.9228947750158826e-05, + "loss": 0.8628, + "step": 1848 + }, + { + "epoch": 0.15214976342316397, + "grad_norm": 7.288310947212566, + "learning_rate": 1.922792114145e-05, + "loss": 0.8284, + "step": 1849 + }, + { + "epoch": 0.152232051018309, + "grad_norm": 4.911647064052488, + "learning_rate": 1.9226893877204418e-05, + "loss": 0.8098, + "step": 1850 + }, + { + "epoch": 0.15231433861345403, + "grad_norm": 6.14158528557125, + "learning_rate": 1.922586595749505e-05, + "loss": 0.8586, + "step": 1851 + }, + { + "epoch": 0.15239662620859906, + "grad_norm": 0.9609316724016479, + "learning_rate": 1.9224837382394915e-05, + "loss": 0.5865, + "step": 1852 + }, + { + "epoch": 0.1524789138037441, + "grad_norm": 4.8074089673842115, + "learning_rate": 1.9223808151977086e-05, + "loss": 0.8574, + "step": 1853 + }, + { + "epoch": 0.15256120139888912, + "grad_norm": 5.09401658512305, + "learning_rate": 1.9222778266314682e-05, + "loss": 0.8242, + "step": 1854 + }, + { + "epoch": 0.15264348899403415, + "grad_norm": 7.660360501973494, + "learning_rate": 1.9221747725480858e-05, + "loss": 0.8409, + "step": 1855 + }, + { + "epoch": 0.15272577658917919, + "grad_norm": 0.6386425451713716, + "learning_rate": 1.922071652954882e-05, + "loss": 0.565, + "step": 1856 + }, + { + "epoch": 0.15280806418432422, + "grad_norm": 3.8625412908418486, + "learning_rate": 1.9219684678591828e-05, + "loss": 0.8589, + "step": 1857 + }, + { + "epoch": 0.15289035177946925, + "grad_norm": 4.707426779826058, + "learning_rate": 1.9218652172683182e-05, + "loss": 0.8378, + "step": 1858 + }, + { + "epoch": 0.15297263937461428, + "grad_norm": 8.068080909985033, + "learning_rate": 1.9217619011896228e-05, + "loss": 0.8404, + "step": 1859 + }, + { + "epoch": 0.1530549269697593, + "grad_norm": 5.122827903994561, + "learning_rate": 1.9216585196304362e-05, + "loss": 0.8384, + "step": 1860 + }, + { + "epoch": 0.15313721456490434, + "grad_norm": 4.44536713211764, + "learning_rate": 1.9215550725981025e-05, + "loss": 0.851, + "step": 1861 + }, + { + "epoch": 0.15321950216004937, + "grad_norm": 5.1179320546544576, + "learning_rate": 1.92145156009997e-05, + "loss": 0.8349, + "step": 1862 + }, + { + "epoch": 0.1533017897551944, + "grad_norm": 4.460759987151901, + "learning_rate": 1.9213479821433922e-05, + "loss": 0.88, + "step": 1863 + }, + { + "epoch": 0.15338407735033943, + "grad_norm": 0.6059111588013519, + "learning_rate": 1.9212443387357274e-05, + "loss": 0.5736, + "step": 1864 + }, + { + "epoch": 0.15346636494548446, + "grad_norm": 4.023527415266168, + "learning_rate": 1.921140629884338e-05, + "loss": 0.8557, + "step": 1865 + }, + { + "epoch": 0.1535486525406295, + "grad_norm": 5.42493421555172, + "learning_rate": 1.9210368555965915e-05, + "loss": 0.8703, + "step": 1866 + }, + { + "epoch": 0.15363094013577452, + "grad_norm": 5.751996259296113, + "learning_rate": 1.9209330158798597e-05, + "loss": 0.844, + "step": 1867 + }, + { + "epoch": 0.15371322773091955, + "grad_norm": 26.285583113760325, + "learning_rate": 1.920829110741519e-05, + "loss": 0.8506, + "step": 1868 + }, + { + "epoch": 0.15379551532606459, + "grad_norm": 3.7662288767932037, + "learning_rate": 1.9207251401889514e-05, + "loss": 0.8539, + "step": 1869 + }, + { + "epoch": 0.15387780292120962, + "grad_norm": 6.159059075831218, + "learning_rate": 1.920621104229542e-05, + "loss": 0.8153, + "step": 1870 + }, + { + "epoch": 0.15396009051635465, + "grad_norm": 3.529280846535288, + "learning_rate": 1.920517002870682e-05, + "loss": 0.8152, + "step": 1871 + }, + { + "epoch": 0.15404237811149968, + "grad_norm": 3.8640076235353713, + "learning_rate": 1.920412836119766e-05, + "loss": 0.833, + "step": 1872 + }, + { + "epoch": 0.15412466570664474, + "grad_norm": 0.5522002280100713, + "learning_rate": 1.9203086039841944e-05, + "loss": 0.5538, + "step": 1873 + }, + { + "epoch": 0.15420695330178977, + "grad_norm": 4.57263225829868, + "learning_rate": 1.9202043064713708e-05, + "loss": 0.8199, + "step": 1874 + }, + { + "epoch": 0.1542892408969348, + "grad_norm": 5.739782670934446, + "learning_rate": 1.9200999435887053e-05, + "loss": 0.8286, + "step": 1875 + }, + { + "epoch": 0.15437152849207983, + "grad_norm": 4.3925079943317265, + "learning_rate": 1.919995515343611e-05, + "loss": 0.8358, + "step": 1876 + }, + { + "epoch": 0.15445381608722486, + "grad_norm": 4.789241874040683, + "learning_rate": 1.9198910217435073e-05, + "loss": 0.8344, + "step": 1877 + }, + { + "epoch": 0.1545361036823699, + "grad_norm": 4.400609569663084, + "learning_rate": 1.919786462795816e-05, + "loss": 0.8432, + "step": 1878 + }, + { + "epoch": 0.15461839127751492, + "grad_norm": 2.9184772315404124, + "learning_rate": 1.9196818385079655e-05, + "loss": 0.8271, + "step": 1879 + }, + { + "epoch": 0.15470067887265995, + "grad_norm": 4.851603050312278, + "learning_rate": 1.919577148887388e-05, + "loss": 0.8408, + "step": 1880 + }, + { + "epoch": 0.15478296646780498, + "grad_norm": 3.7963500670198562, + "learning_rate": 1.9194723939415203e-05, + "loss": 0.8466, + "step": 1881 + }, + { + "epoch": 0.15486525406295001, + "grad_norm": 4.176493857818385, + "learning_rate": 1.9193675736778047e-05, + "loss": 0.8282, + "step": 1882 + }, + { + "epoch": 0.15494754165809504, + "grad_norm": 5.221962997293721, + "learning_rate": 1.9192626881036866e-05, + "loss": 0.86, + "step": 1883 + }, + { + "epoch": 0.15502982925324008, + "grad_norm": 4.184852828323477, + "learning_rate": 1.9191577372266174e-05, + "loss": 0.8329, + "step": 1884 + }, + { + "epoch": 0.1551121168483851, + "grad_norm": 0.5636559518533191, + "learning_rate": 1.9190527210540524e-05, + "loss": 0.5639, + "step": 1885 + }, + { + "epoch": 0.15519440444353014, + "grad_norm": 6.193137368635008, + "learning_rate": 1.918947639593452e-05, + "loss": 0.8209, + "step": 1886 + }, + { + "epoch": 0.15527669203867517, + "grad_norm": 4.6492158556775625, + "learning_rate": 1.918842492852281e-05, + "loss": 0.8231, + "step": 1887 + }, + { + "epoch": 0.1553589796338202, + "grad_norm": 4.2648570505473575, + "learning_rate": 1.9187372808380085e-05, + "loss": 0.8558, + "step": 1888 + }, + { + "epoch": 0.15544126722896523, + "grad_norm": 4.4365923177857765, + "learning_rate": 1.918632003558109e-05, + "loss": 0.871, + "step": 1889 + }, + { + "epoch": 0.15552355482411026, + "grad_norm": 6.451657210590928, + "learning_rate": 1.9185266610200612e-05, + "loss": 0.8205, + "step": 1890 + }, + { + "epoch": 0.1556058424192553, + "grad_norm": 5.063597696743102, + "learning_rate": 1.9184212532313483e-05, + "loss": 0.823, + "step": 1891 + }, + { + "epoch": 0.15568813001440032, + "grad_norm": 5.33443100117654, + "learning_rate": 1.9183157801994585e-05, + "loss": 0.8152, + "step": 1892 + }, + { + "epoch": 0.15577041760954535, + "grad_norm": 6.843779103961609, + "learning_rate": 1.9182102419318842e-05, + "loss": 0.8466, + "step": 1893 + }, + { + "epoch": 0.15585270520469038, + "grad_norm": 0.5560946461260619, + "learning_rate": 1.9181046384361228e-05, + "loss": 0.5749, + "step": 1894 + }, + { + "epoch": 0.15593499279983541, + "grad_norm": 5.741982504307724, + "learning_rate": 1.9179989697196762e-05, + "loss": 0.819, + "step": 1895 + }, + { + "epoch": 0.15601728039498045, + "grad_norm": 6.531500373200525, + "learning_rate": 1.9178932357900505e-05, + "loss": 0.8298, + "step": 1896 + }, + { + "epoch": 0.15609956799012548, + "grad_norm": 6.1546326697051, + "learning_rate": 1.917787436654758e-05, + "loss": 0.837, + "step": 1897 + }, + { + "epoch": 0.1561818555852705, + "grad_norm": 5.185203064348844, + "learning_rate": 1.9176815723213132e-05, + "loss": 0.8632, + "step": 1898 + }, + { + "epoch": 0.15626414318041557, + "grad_norm": 5.4084609854089365, + "learning_rate": 1.9175756427972375e-05, + "loss": 0.8307, + "step": 1899 + }, + { + "epoch": 0.1563464307755606, + "grad_norm": 5.703538856423496, + "learning_rate": 1.9174696480900554e-05, + "loss": 0.8472, + "step": 1900 + }, + { + "epoch": 0.15642871837070563, + "grad_norm": 5.405477693059169, + "learning_rate": 1.9173635882072967e-05, + "loss": 0.8721, + "step": 1901 + }, + { + "epoch": 0.15651100596585066, + "grad_norm": 4.921592944151, + "learning_rate": 1.9172574631564963e-05, + "loss": 0.8327, + "step": 1902 + }, + { + "epoch": 0.1565932935609957, + "grad_norm": 7.000417448334732, + "learning_rate": 1.917151272945192e-05, + "loss": 0.8074, + "step": 1903 + }, + { + "epoch": 0.15667558115614072, + "grad_norm": 8.881365953302256, + "learning_rate": 1.9170450175809283e-05, + "loss": 0.8486, + "step": 1904 + }, + { + "epoch": 0.15675786875128575, + "grad_norm": 3.557591238337821, + "learning_rate": 1.9169386970712532e-05, + "loss": 0.8428, + "step": 1905 + }, + { + "epoch": 0.15684015634643078, + "grad_norm": 5.326825829060843, + "learning_rate": 1.9168323114237193e-05, + "loss": 0.8439, + "step": 1906 + }, + { + "epoch": 0.1569224439415758, + "grad_norm": 5.478137574938721, + "learning_rate": 1.9167258606458846e-05, + "loss": 0.8479, + "step": 1907 + }, + { + "epoch": 0.15700473153672084, + "grad_norm": 10.86664357588869, + "learning_rate": 1.9166193447453107e-05, + "loss": 0.8481, + "step": 1908 + }, + { + "epoch": 0.15708701913186587, + "grad_norm": 5.939616047516853, + "learning_rate": 1.916512763729564e-05, + "loss": 0.8572, + "step": 1909 + }, + { + "epoch": 0.1571693067270109, + "grad_norm": 3.7039344771688256, + "learning_rate": 1.9164061176062166e-05, + "loss": 0.8424, + "step": 1910 + }, + { + "epoch": 0.15725159432215594, + "grad_norm": 0.49197443127291796, + "learning_rate": 1.9162994063828445e-05, + "loss": 0.5803, + "step": 1911 + }, + { + "epoch": 0.15733388191730097, + "grad_norm": 0.4553081859661651, + "learning_rate": 1.9161926300670277e-05, + "loss": 0.5647, + "step": 1912 + }, + { + "epoch": 0.157416169512446, + "grad_norm": 5.5999535437779215, + "learning_rate": 1.916085788666352e-05, + "loss": 0.8441, + "step": 1913 + }, + { + "epoch": 0.15749845710759103, + "grad_norm": 5.132735593812985, + "learning_rate": 1.9159788821884064e-05, + "loss": 0.8579, + "step": 1914 + }, + { + "epoch": 0.15758074470273606, + "grad_norm": 5.5625878807458005, + "learning_rate": 1.9158719106407862e-05, + "loss": 0.8574, + "step": 1915 + }, + { + "epoch": 0.1576630322978811, + "grad_norm": 3.8672917005273706, + "learning_rate": 1.9157648740310905e-05, + "loss": 0.8474, + "step": 1916 + }, + { + "epoch": 0.15774531989302612, + "grad_norm": 0.5195626974877546, + "learning_rate": 1.915657772366922e-05, + "loss": 0.5716, + "step": 1917 + }, + { + "epoch": 0.15782760748817115, + "grad_norm": 0.5118379542817698, + "learning_rate": 1.9155506056558903e-05, + "loss": 0.5727, + "step": 1918 + }, + { + "epoch": 0.15790989508331618, + "grad_norm": 0.546243421780183, + "learning_rate": 1.9154433739056078e-05, + "loss": 0.5735, + "step": 1919 + }, + { + "epoch": 0.1579921826784612, + "grad_norm": 0.4496319616069995, + "learning_rate": 1.9153360771236915e-05, + "loss": 0.5672, + "step": 1920 + }, + { + "epoch": 0.15807447027360624, + "grad_norm": 17.06080467505602, + "learning_rate": 1.9152287153177646e-05, + "loss": 0.8598, + "step": 1921 + }, + { + "epoch": 0.15815675786875127, + "grad_norm": 5.0084313071310484, + "learning_rate": 1.9151212884954534e-05, + "loss": 0.8499, + "step": 1922 + }, + { + "epoch": 0.1582390454638963, + "grad_norm": 0.6107687958291511, + "learning_rate": 1.9150137966643892e-05, + "loss": 0.5938, + "step": 1923 + }, + { + "epoch": 0.15832133305904134, + "grad_norm": 6.509270710519911, + "learning_rate": 1.9149062398322084e-05, + "loss": 0.8879, + "step": 1924 + }, + { + "epoch": 0.1584036206541864, + "grad_norm": 7.504768979432874, + "learning_rate": 1.9147986180065515e-05, + "loss": 0.8179, + "step": 1925 + }, + { + "epoch": 0.15848590824933143, + "grad_norm": 8.842157642443578, + "learning_rate": 1.9146909311950636e-05, + "loss": 0.829, + "step": 1926 + }, + { + "epoch": 0.15856819584447646, + "grad_norm": 4.932666405596189, + "learning_rate": 1.914583179405395e-05, + "loss": 0.8504, + "step": 1927 + }, + { + "epoch": 0.1586504834396215, + "grad_norm": 6.372961990608066, + "learning_rate": 1.9144753626452e-05, + "loss": 0.8624, + "step": 1928 + }, + { + "epoch": 0.15873277103476652, + "grad_norm": 4.522677476310382, + "learning_rate": 1.9143674809221376e-05, + "loss": 0.8391, + "step": 1929 + }, + { + "epoch": 0.15881505862991155, + "grad_norm": 0.5270211749274826, + "learning_rate": 1.914259534243872e-05, + "loss": 0.5819, + "step": 1930 + }, + { + "epoch": 0.15889734622505658, + "grad_norm": 0.46338890013052, + "learning_rate": 1.9141515226180708e-05, + "loss": 0.5434, + "step": 1931 + }, + { + "epoch": 0.1589796338202016, + "grad_norm": 5.053929441164545, + "learning_rate": 1.9140434460524075e-05, + "loss": 0.8184, + "step": 1932 + }, + { + "epoch": 0.15906192141534664, + "grad_norm": 0.46822296714643163, + "learning_rate": 1.9139353045545595e-05, + "loss": 0.5468, + "step": 1933 + }, + { + "epoch": 0.15914420901049167, + "grad_norm": 4.818972577828098, + "learning_rate": 1.9138270981322093e-05, + "loss": 0.8158, + "step": 1934 + }, + { + "epoch": 0.1592264966056367, + "grad_norm": 5.056437597852356, + "learning_rate": 1.9137188267930434e-05, + "loss": 0.8395, + "step": 1935 + }, + { + "epoch": 0.15930878420078173, + "grad_norm": 5.0850642468159535, + "learning_rate": 1.9136104905447533e-05, + "loss": 0.861, + "step": 1936 + }, + { + "epoch": 0.15939107179592676, + "grad_norm": 0.5152390077794073, + "learning_rate": 1.913502089395035e-05, + "loss": 0.5544, + "step": 1937 + }, + { + "epoch": 0.1594733593910718, + "grad_norm": 0.49678784494055134, + "learning_rate": 1.9133936233515893e-05, + "loss": 0.5633, + "step": 1938 + }, + { + "epoch": 0.15955564698621683, + "grad_norm": 8.081617306656465, + "learning_rate": 1.9132850924221214e-05, + "loss": 0.8199, + "step": 1939 + }, + { + "epoch": 0.15963793458136186, + "grad_norm": 4.050834409746043, + "learning_rate": 1.913176496614341e-05, + "loss": 0.8565, + "step": 1940 + }, + { + "epoch": 0.1597202221765069, + "grad_norm": 5.538913745420739, + "learning_rate": 1.913067835935963e-05, + "loss": 0.8568, + "step": 1941 + }, + { + "epoch": 0.15980250977165192, + "grad_norm": 5.169208456192216, + "learning_rate": 1.912959110394706e-05, + "loss": 0.8502, + "step": 1942 + }, + { + "epoch": 0.15988479736679695, + "grad_norm": 3.717640082018756, + "learning_rate": 1.9128503199982934e-05, + "loss": 0.8515, + "step": 1943 + }, + { + "epoch": 0.15996708496194198, + "grad_norm": 4.289581072812726, + "learning_rate": 1.9127414647544546e-05, + "loss": 0.8574, + "step": 1944 + }, + { + "epoch": 0.160049372557087, + "grad_norm": 7.156272072258111, + "learning_rate": 1.9126325446709217e-05, + "loss": 0.8228, + "step": 1945 + }, + { + "epoch": 0.16013166015223204, + "grad_norm": 3.666071946569591, + "learning_rate": 1.912523559755432e-05, + "loss": 0.8278, + "step": 1946 + }, + { + "epoch": 0.16021394774737707, + "grad_norm": 4.716701647887716, + "learning_rate": 1.9124145100157284e-05, + "loss": 0.8434, + "step": 1947 + }, + { + "epoch": 0.1602962353425221, + "grad_norm": 4.404603024503413, + "learning_rate": 1.9123053954595572e-05, + "loss": 0.8568, + "step": 1948 + }, + { + "epoch": 0.16037852293766713, + "grad_norm": 12.693487709300616, + "learning_rate": 1.9121962160946696e-05, + "loss": 0.8659, + "step": 1949 + }, + { + "epoch": 0.16046081053281216, + "grad_norm": 4.994468190010449, + "learning_rate": 1.9120869719288216e-05, + "loss": 0.8583, + "step": 1950 + }, + { + "epoch": 0.16054309812795722, + "grad_norm": 3.350272271821083, + "learning_rate": 1.9119776629697738e-05, + "loss": 0.844, + "step": 1951 + }, + { + "epoch": 0.16062538572310225, + "grad_norm": 0.626835237381398, + "learning_rate": 1.911868289225291e-05, + "loss": 0.6034, + "step": 1952 + }, + { + "epoch": 0.16070767331824728, + "grad_norm": 0.5174318048940684, + "learning_rate": 1.911758850703144e-05, + "loss": 0.5745, + "step": 1953 + }, + { + "epoch": 0.16078996091339232, + "grad_norm": 0.46025061805771117, + "learning_rate": 1.9116493474111056e-05, + "loss": 0.5644, + "step": 1954 + }, + { + "epoch": 0.16087224850853735, + "grad_norm": 3.954810471983505, + "learning_rate": 1.9115397793569558e-05, + "loss": 0.8655, + "step": 1955 + }, + { + "epoch": 0.16095453610368238, + "grad_norm": 5.181663850622272, + "learning_rate": 1.911430146548478e-05, + "loss": 0.88, + "step": 1956 + }, + { + "epoch": 0.1610368236988274, + "grad_norm": 5.090124727277422, + "learning_rate": 1.9113204489934603e-05, + "loss": 0.8526, + "step": 1957 + }, + { + "epoch": 0.16111911129397244, + "grad_norm": 4.995989345346734, + "learning_rate": 1.911210686699695e-05, + "loss": 0.8515, + "step": 1958 + }, + { + "epoch": 0.16120139888911747, + "grad_norm": 3.8664089788873537, + "learning_rate": 1.91110085967498e-05, + "loss": 0.8226, + "step": 1959 + }, + { + "epoch": 0.1612836864842625, + "grad_norm": 3.485843985195341, + "learning_rate": 1.9109909679271173e-05, + "loss": 0.8226, + "step": 1960 + }, + { + "epoch": 0.16136597407940753, + "grad_norm": 6.108467733707024, + "learning_rate": 1.910881011463913e-05, + "loss": 0.8439, + "step": 1961 + }, + { + "epoch": 0.16144826167455256, + "grad_norm": 0.8281518473412987, + "learning_rate": 1.910770990293178e-05, + "loss": 0.6601, + "step": 1962 + }, + { + "epoch": 0.1615305492696976, + "grad_norm": 4.1919775317779, + "learning_rate": 1.910660904422729e-05, + "loss": 0.8358, + "step": 1963 + }, + { + "epoch": 0.16161283686484262, + "grad_norm": 3.6486336150800818, + "learning_rate": 1.910550753860385e-05, + "loss": 0.8752, + "step": 1964 + }, + { + "epoch": 0.16169512445998765, + "grad_norm": 18.125327367252893, + "learning_rate": 1.9104405386139722e-05, + "loss": 0.8411, + "step": 1965 + }, + { + "epoch": 0.16177741205513269, + "grad_norm": 0.519789005202432, + "learning_rate": 1.9103302586913194e-05, + "loss": 0.5852, + "step": 1966 + }, + { + "epoch": 0.16185969965027772, + "grad_norm": 10.74734646569047, + "learning_rate": 1.9102199141002612e-05, + "loss": 0.8597, + "step": 1967 + }, + { + "epoch": 0.16194198724542275, + "grad_norm": 4.997595569212722, + "learning_rate": 1.9101095048486353e-05, + "loss": 0.8487, + "step": 1968 + }, + { + "epoch": 0.16202427484056778, + "grad_norm": 0.4693045988249685, + "learning_rate": 1.9099990309442863e-05, + "loss": 0.5493, + "step": 1969 + }, + { + "epoch": 0.1621065624357128, + "grad_norm": 9.65821630646961, + "learning_rate": 1.909888492395061e-05, + "loss": 0.8503, + "step": 1970 + }, + { + "epoch": 0.16218885003085784, + "grad_norm": 4.327280300731587, + "learning_rate": 1.9097778892088126e-05, + "loss": 0.8611, + "step": 1971 + }, + { + "epoch": 0.16227113762600287, + "grad_norm": 0.4866359984742805, + "learning_rate": 1.9096672213933983e-05, + "loss": 0.6039, + "step": 1972 + }, + { + "epoch": 0.1623534252211479, + "grad_norm": 4.267347798551128, + "learning_rate": 1.9095564889566787e-05, + "loss": 0.8286, + "step": 1973 + }, + { + "epoch": 0.16243571281629293, + "grad_norm": 3.478809856338831, + "learning_rate": 1.909445691906521e-05, + "loss": 0.8343, + "step": 1974 + }, + { + "epoch": 0.16251800041143796, + "grad_norm": 0.47794812337530074, + "learning_rate": 1.9093348302507958e-05, + "loss": 0.5616, + "step": 1975 + }, + { + "epoch": 0.16260028800658302, + "grad_norm": 3.955142852276849, + "learning_rate": 1.909223903997379e-05, + "loss": 0.8123, + "step": 1976 + }, + { + "epoch": 0.16268257560172805, + "grad_norm": 3.964513425565348, + "learning_rate": 1.9091129131541496e-05, + "loss": 0.8416, + "step": 1977 + }, + { + "epoch": 0.16276486319687308, + "grad_norm": 4.549111628205324, + "learning_rate": 1.909001857728993e-05, + "loss": 0.8257, + "step": 1978 + }, + { + "epoch": 0.1628471507920181, + "grad_norm": 6.970898550322873, + "learning_rate": 1.9088907377297977e-05, + "loss": 0.8488, + "step": 1979 + }, + { + "epoch": 0.16292943838716314, + "grad_norm": 4.2908238891553445, + "learning_rate": 1.9087795531644583e-05, + "loss": 0.8412, + "step": 1980 + }, + { + "epoch": 0.16301172598230818, + "grad_norm": 4.755754852120496, + "learning_rate": 1.9086683040408728e-05, + "loss": 0.8137, + "step": 1981 + }, + { + "epoch": 0.1630940135774532, + "grad_norm": 4.9388049580707705, + "learning_rate": 1.9085569903669444e-05, + "loss": 0.8425, + "step": 1982 + }, + { + "epoch": 0.16317630117259824, + "grad_norm": 3.7789696800914507, + "learning_rate": 1.9084456121505802e-05, + "loss": 0.8601, + "step": 1983 + }, + { + "epoch": 0.16325858876774327, + "grad_norm": 0.5036875299743072, + "learning_rate": 1.9083341693996926e-05, + "loss": 0.5823, + "step": 1984 + }, + { + "epoch": 0.1633408763628883, + "grad_norm": 0.4745416575117629, + "learning_rate": 1.908222662122198e-05, + "loss": 0.5523, + "step": 1985 + }, + { + "epoch": 0.16342316395803333, + "grad_norm": 5.2700911406234185, + "learning_rate": 1.9081110903260184e-05, + "loss": 0.8406, + "step": 1986 + }, + { + "epoch": 0.16350545155317836, + "grad_norm": 4.278987118782611, + "learning_rate": 1.907999454019079e-05, + "loss": 0.8375, + "step": 1987 + }, + { + "epoch": 0.1635877391483234, + "grad_norm": 4.375545347985468, + "learning_rate": 1.907887753209311e-05, + "loss": 0.8304, + "step": 1988 + }, + { + "epoch": 0.16367002674346842, + "grad_norm": 4.920337609057978, + "learning_rate": 1.907775987904648e-05, + "loss": 0.8077, + "step": 1989 + }, + { + "epoch": 0.16375231433861345, + "grad_norm": 6.998763866445164, + "learning_rate": 1.9076641581130313e-05, + "loss": 0.8528, + "step": 1990 + }, + { + "epoch": 0.16383460193375848, + "grad_norm": 5.1844149471073075, + "learning_rate": 1.907552263842404e-05, + "loss": 0.8272, + "step": 1991 + }, + { + "epoch": 0.16391688952890351, + "grad_norm": 5.675331934385374, + "learning_rate": 1.9074403051007158e-05, + "loss": 0.8266, + "step": 1992 + }, + { + "epoch": 0.16399917712404855, + "grad_norm": 6.85376327606783, + "learning_rate": 1.9073282818959192e-05, + "loss": 0.8199, + "step": 1993 + }, + { + "epoch": 0.16408146471919358, + "grad_norm": 5.037618123313906, + "learning_rate": 1.907216194235973e-05, + "loss": 0.8397, + "step": 1994 + }, + { + "epoch": 0.1641637523143386, + "grad_norm": 7.450741530103121, + "learning_rate": 1.9071040421288388e-05, + "loss": 0.8075, + "step": 1995 + }, + { + "epoch": 0.16424603990948364, + "grad_norm": 4.939981647738216, + "learning_rate": 1.906991825582484e-05, + "loss": 0.8243, + "step": 1996 + }, + { + "epoch": 0.16432832750462867, + "grad_norm": 4.301415531275492, + "learning_rate": 1.9068795446048806e-05, + "loss": 0.8604, + "step": 1997 + }, + { + "epoch": 0.1644106150997737, + "grad_norm": 4.4888916464050865, + "learning_rate": 1.9067671992040046e-05, + "loss": 0.8721, + "step": 1998 + }, + { + "epoch": 0.16449290269491873, + "grad_norm": 6.357665442149673, + "learning_rate": 1.9066547893878372e-05, + "loss": 0.8874, + "step": 1999 + }, + { + "epoch": 0.16457519029006376, + "grad_norm": 0.5634041513601309, + "learning_rate": 1.9065423151643633e-05, + "loss": 0.5899, + "step": 2000 + }, + { + "epoch": 0.1646574778852088, + "grad_norm": 8.209637125511108, + "learning_rate": 1.906429776541573e-05, + "loss": 0.8543, + "step": 2001 + }, + { + "epoch": 0.16473976548035385, + "grad_norm": 4.881472200579807, + "learning_rate": 1.9063171735274615e-05, + "loss": 0.8203, + "step": 2002 + }, + { + "epoch": 0.16482205307549888, + "grad_norm": 4.506268312516657, + "learning_rate": 1.906204506130027e-05, + "loss": 0.8246, + "step": 2003 + }, + { + "epoch": 0.1649043406706439, + "grad_norm": 0.4482842576468044, + "learning_rate": 1.906091774357274e-05, + "loss": 0.5494, + "step": 2004 + }, + { + "epoch": 0.16498662826578894, + "grad_norm": 0.46462466183979095, + "learning_rate": 1.90597897821721e-05, + "loss": 0.5828, + "step": 2005 + }, + { + "epoch": 0.16506891586093397, + "grad_norm": 3.535446139587554, + "learning_rate": 1.9058661177178487e-05, + "loss": 0.8295, + "step": 2006 + }, + { + "epoch": 0.165151203456079, + "grad_norm": 6.898430563265459, + "learning_rate": 1.905753192867207e-05, + "loss": 0.8122, + "step": 2007 + }, + { + "epoch": 0.16523349105122404, + "grad_norm": 6.659782186420017, + "learning_rate": 1.905640203673307e-05, + "loss": 0.8368, + "step": 2008 + }, + { + "epoch": 0.16531577864636907, + "grad_norm": 5.247470551212812, + "learning_rate": 1.905527150144175e-05, + "loss": 0.8493, + "step": 2009 + }, + { + "epoch": 0.1653980662415141, + "grad_norm": 0.5007023360676847, + "learning_rate": 1.9054140322878426e-05, + "loss": 0.5794, + "step": 2010 + }, + { + "epoch": 0.16548035383665913, + "grad_norm": 5.512743851784457, + "learning_rate": 1.9053008501123456e-05, + "loss": 0.8303, + "step": 2011 + }, + { + "epoch": 0.16556264143180416, + "grad_norm": 0.47692966581809587, + "learning_rate": 1.9051876036257236e-05, + "loss": 0.5789, + "step": 2012 + }, + { + "epoch": 0.1656449290269492, + "grad_norm": 4.661215851298964, + "learning_rate": 1.905074292836022e-05, + "loss": 0.8438, + "step": 2013 + }, + { + "epoch": 0.16572721662209422, + "grad_norm": 4.417221172728927, + "learning_rate": 1.90496091775129e-05, + "loss": 0.8223, + "step": 2014 + }, + { + "epoch": 0.16580950421723925, + "grad_norm": 4.542426359477869, + "learning_rate": 1.904847478379582e-05, + "loss": 0.8531, + "step": 2015 + }, + { + "epoch": 0.16589179181238428, + "grad_norm": 8.947547537374573, + "learning_rate": 1.9047339747289562e-05, + "loss": 0.8367, + "step": 2016 + }, + { + "epoch": 0.1659740794075293, + "grad_norm": 6.689553062284241, + "learning_rate": 1.904620406807476e-05, + "loss": 0.8381, + "step": 2017 + }, + { + "epoch": 0.16605636700267434, + "grad_norm": 4.414270465815786, + "learning_rate": 1.904506774623208e-05, + "loss": 0.7952, + "step": 2018 + }, + { + "epoch": 0.16613865459781937, + "grad_norm": 4.617600219967089, + "learning_rate": 1.904393078184226e-05, + "loss": 0.8529, + "step": 2019 + }, + { + "epoch": 0.1662209421929644, + "grad_norm": 4.8939785097994, + "learning_rate": 1.9042793174986057e-05, + "loss": 0.818, + "step": 2020 + }, + { + "epoch": 0.16630322978810944, + "grad_norm": 5.503071643587064, + "learning_rate": 1.9041654925744292e-05, + "loss": 0.8312, + "step": 2021 + }, + { + "epoch": 0.16638551738325447, + "grad_norm": 3.8994514270947325, + "learning_rate": 1.904051603419782e-05, + "loss": 0.8381, + "step": 2022 + }, + { + "epoch": 0.1664678049783995, + "grad_norm": 30.549691571453124, + "learning_rate": 1.9039376500427543e-05, + "loss": 0.8293, + "step": 2023 + }, + { + "epoch": 0.16655009257354453, + "grad_norm": 6.388021109679703, + "learning_rate": 1.9038236324514418e-05, + "loss": 0.8559, + "step": 2024 + }, + { + "epoch": 0.16663238016868956, + "grad_norm": 5.235002404805821, + "learning_rate": 1.903709550653944e-05, + "loss": 0.8253, + "step": 2025 + }, + { + "epoch": 0.1667146677638346, + "grad_norm": 5.684263599129397, + "learning_rate": 1.903595404658365e-05, + "loss": 0.8288, + "step": 2026 + }, + { + "epoch": 0.16679695535897962, + "grad_norm": 6.189154945456667, + "learning_rate": 1.9034811944728134e-05, + "loss": 0.8284, + "step": 2027 + }, + { + "epoch": 0.16687924295412468, + "grad_norm": 4.925352091032951, + "learning_rate": 1.903366920105403e-05, + "loss": 0.8593, + "step": 2028 + }, + { + "epoch": 0.1669615305492697, + "grad_norm": 8.79075541252156, + "learning_rate": 1.903252581564251e-05, + "loss": 0.8553, + "step": 2029 + }, + { + "epoch": 0.16704381814441474, + "grad_norm": 5.720905547782337, + "learning_rate": 1.9031381788574803e-05, + "loss": 0.8607, + "step": 2030 + }, + { + "epoch": 0.16712610573955977, + "grad_norm": 6.2820248941486545, + "learning_rate": 1.9030237119932175e-05, + "loss": 0.8562, + "step": 2031 + }, + { + "epoch": 0.1672083933347048, + "grad_norm": 0.5465467961898273, + "learning_rate": 1.9029091809795948e-05, + "loss": 0.5764, + "step": 2032 + }, + { + "epoch": 0.16729068092984983, + "grad_norm": 4.015285658563253, + "learning_rate": 1.9027945858247475e-05, + "loss": 0.8362, + "step": 2033 + }, + { + "epoch": 0.16737296852499486, + "grad_norm": 4.679455678251065, + "learning_rate": 1.9026799265368168e-05, + "loss": 0.8772, + "step": 2034 + }, + { + "epoch": 0.1674552561201399, + "grad_norm": 5.128976395208152, + "learning_rate": 1.9025652031239478e-05, + "loss": 0.8291, + "step": 2035 + }, + { + "epoch": 0.16753754371528493, + "grad_norm": 0.46671868261974975, + "learning_rate": 1.9024504155942897e-05, + "loss": 0.5705, + "step": 2036 + }, + { + "epoch": 0.16761983131042996, + "grad_norm": 3.9936315880019753, + "learning_rate": 1.902335563955998e-05, + "loss": 0.8426, + "step": 2037 + }, + { + "epoch": 0.167702118905575, + "grad_norm": 9.886683613104376, + "learning_rate": 1.9022206482172304e-05, + "loss": 0.835, + "step": 2038 + }, + { + "epoch": 0.16778440650072002, + "grad_norm": 5.57469886239202, + "learning_rate": 1.9021056683861513e-05, + "loss": 0.8303, + "step": 2039 + }, + { + "epoch": 0.16786669409586505, + "grad_norm": 7.1891426779298335, + "learning_rate": 1.9019906244709276e-05, + "loss": 0.8464, + "step": 2040 + }, + { + "epoch": 0.16794898169101008, + "grad_norm": 4.623477995459667, + "learning_rate": 1.901875516479733e-05, + "loss": 0.8101, + "step": 2041 + }, + { + "epoch": 0.1680312692861551, + "grad_norm": 3.0508878772984485, + "learning_rate": 1.901760344420744e-05, + "loss": 0.8234, + "step": 2042 + }, + { + "epoch": 0.16811355688130014, + "grad_norm": 5.360858070318203, + "learning_rate": 1.9016451083021422e-05, + "loss": 0.8556, + "step": 2043 + }, + { + "epoch": 0.16819584447644517, + "grad_norm": 6.773934807897697, + "learning_rate": 1.9015298081321138e-05, + "loss": 0.865, + "step": 2044 + }, + { + "epoch": 0.1682781320715902, + "grad_norm": 3.4190071351252214, + "learning_rate": 1.90141444391885e-05, + "loss": 0.8337, + "step": 2045 + }, + { + "epoch": 0.16836041966673523, + "grad_norm": 3.96545776358378, + "learning_rate": 1.9012990156705447e-05, + "loss": 0.8042, + "step": 2046 + }, + { + "epoch": 0.16844270726188026, + "grad_norm": 0.5430641590195081, + "learning_rate": 1.9011835233953995e-05, + "loss": 0.5773, + "step": 2047 + }, + { + "epoch": 0.1685249948570253, + "grad_norm": 6.5560663968018815, + "learning_rate": 1.901067967101618e-05, + "loss": 0.802, + "step": 2048 + }, + { + "epoch": 0.16860728245217033, + "grad_norm": 5.259779927167468, + "learning_rate": 1.9009523467974093e-05, + "loss": 0.8522, + "step": 2049 + }, + { + "epoch": 0.16868957004731536, + "grad_norm": 5.439013570375262, + "learning_rate": 1.9008366624909866e-05, + "loss": 0.8457, + "step": 2050 + }, + { + "epoch": 0.1687718576424604, + "grad_norm": 4.656209247100697, + "learning_rate": 1.900720914190568e-05, + "loss": 0.8294, + "step": 2051 + }, + { + "epoch": 0.16885414523760542, + "grad_norm": 0.46861758893659844, + "learning_rate": 1.900605101904376e-05, + "loss": 0.5416, + "step": 2052 + }, + { + "epoch": 0.16893643283275045, + "grad_norm": 0.4447180872394492, + "learning_rate": 1.9004892256406383e-05, + "loss": 0.5309, + "step": 2053 + }, + { + "epoch": 0.1690187204278955, + "grad_norm": 3.5959277617933796, + "learning_rate": 1.9003732854075857e-05, + "loss": 0.8213, + "step": 2054 + }, + { + "epoch": 0.16910100802304054, + "grad_norm": 7.590801550911286, + "learning_rate": 1.900257281213455e-05, + "loss": 0.8512, + "step": 2055 + }, + { + "epoch": 0.16918329561818557, + "grad_norm": 6.650467541507469, + "learning_rate": 1.9001412130664868e-05, + "loss": 0.8201, + "step": 2056 + }, + { + "epoch": 0.1692655832133306, + "grad_norm": 4.544222679328591, + "learning_rate": 1.9000250809749262e-05, + "loss": 0.8474, + "step": 2057 + }, + { + "epoch": 0.16934787080847563, + "grad_norm": 4.950922151070744, + "learning_rate": 1.8999088849470237e-05, + "loss": 0.8212, + "step": 2058 + }, + { + "epoch": 0.16943015840362066, + "grad_norm": 3.541665949963335, + "learning_rate": 1.8997926249910326e-05, + "loss": 0.8193, + "step": 2059 + }, + { + "epoch": 0.1695124459987657, + "grad_norm": 3.4851879291005834, + "learning_rate": 1.8996763011152127e-05, + "loss": 0.8621, + "step": 2060 + }, + { + "epoch": 0.16959473359391072, + "grad_norm": 4.630559669844992, + "learning_rate": 1.899559913327827e-05, + "loss": 0.791, + "step": 2061 + }, + { + "epoch": 0.16967702118905575, + "grad_norm": 6.316492323089438, + "learning_rate": 1.899443461637144e-05, + "loss": 0.8477, + "step": 2062 + }, + { + "epoch": 0.16975930878420079, + "grad_norm": 3.8105295270536765, + "learning_rate": 1.899326946051436e-05, + "loss": 0.8205, + "step": 2063 + }, + { + "epoch": 0.16984159637934582, + "grad_norm": 3.5873582141439075, + "learning_rate": 1.89921036657898e-05, + "loss": 0.852, + "step": 2064 + }, + { + "epoch": 0.16992388397449085, + "grad_norm": 3.4961803242089005, + "learning_rate": 1.8990937232280574e-05, + "loss": 0.8069, + "step": 2065 + }, + { + "epoch": 0.17000617156963588, + "grad_norm": 4.709922748386927, + "learning_rate": 1.8989770160069546e-05, + "loss": 0.7968, + "step": 2066 + }, + { + "epoch": 0.1700884591647809, + "grad_norm": 4.419602258456932, + "learning_rate": 1.8988602449239626e-05, + "loss": 0.8233, + "step": 2067 + }, + { + "epoch": 0.17017074675992594, + "grad_norm": 4.165559426153498, + "learning_rate": 1.8987434099873757e-05, + "loss": 0.8783, + "step": 2068 + }, + { + "epoch": 0.17025303435507097, + "grad_norm": 5.665039435657368, + "learning_rate": 1.898626511205495e-05, + "loss": 0.8422, + "step": 2069 + }, + { + "epoch": 0.170335321950216, + "grad_norm": 3.261634336133814, + "learning_rate": 1.8985095485866235e-05, + "loss": 0.8308, + "step": 2070 + }, + { + "epoch": 0.17041760954536103, + "grad_norm": 5.4431406561359355, + "learning_rate": 1.898392522139071e-05, + "loss": 0.8437, + "step": 2071 + }, + { + "epoch": 0.17049989714050606, + "grad_norm": 3.5106150268111196, + "learning_rate": 1.8982754318711506e-05, + "loss": 0.8896, + "step": 2072 + }, + { + "epoch": 0.1705821847356511, + "grad_norm": 0.5309314180484399, + "learning_rate": 1.8981582777911795e-05, + "loss": 0.5644, + "step": 2073 + }, + { + "epoch": 0.17066447233079612, + "grad_norm": 0.49354945222966823, + "learning_rate": 1.8980410599074812e-05, + "loss": 0.5708, + "step": 2074 + }, + { + "epoch": 0.17074675992594116, + "grad_norm": 3.537172251389112, + "learning_rate": 1.897923778228382e-05, + "loss": 0.8529, + "step": 2075 + }, + { + "epoch": 0.17082904752108619, + "grad_norm": 3.6627295874426844, + "learning_rate": 1.8978064327622138e-05, + "loss": 0.8605, + "step": 2076 + }, + { + "epoch": 0.17091133511623122, + "grad_norm": 4.426810425328736, + "learning_rate": 1.8976890235173125e-05, + "loss": 0.8576, + "step": 2077 + }, + { + "epoch": 0.17099362271137625, + "grad_norm": 0.5453550393689479, + "learning_rate": 1.8975715505020186e-05, + "loss": 0.5629, + "step": 2078 + }, + { + "epoch": 0.17107591030652128, + "grad_norm": 3.1760197055119965, + "learning_rate": 1.897454013724677e-05, + "loss": 0.8448, + "step": 2079 + }, + { + "epoch": 0.17115819790166634, + "grad_norm": 3.6427547600767234, + "learning_rate": 1.8973364131936374e-05, + "loss": 0.8095, + "step": 2080 + }, + { + "epoch": 0.17124048549681137, + "grad_norm": 5.314413481064159, + "learning_rate": 1.8972187489172544e-05, + "loss": 0.8585, + "step": 2081 + }, + { + "epoch": 0.1713227730919564, + "grad_norm": 3.9553194322089307, + "learning_rate": 1.8971010209038864e-05, + "loss": 0.8398, + "step": 2082 + }, + { + "epoch": 0.17140506068710143, + "grad_norm": 3.6803775188325965, + "learning_rate": 1.8969832291618963e-05, + "loss": 0.8357, + "step": 2083 + }, + { + "epoch": 0.17148734828224646, + "grad_norm": 2.910049437995119, + "learning_rate": 1.896865373699652e-05, + "loss": 0.864, + "step": 2084 + }, + { + "epoch": 0.1715696358773915, + "grad_norm": 3.1274442749778775, + "learning_rate": 1.8967474545255264e-05, + "loss": 0.8159, + "step": 2085 + }, + { + "epoch": 0.17165192347253652, + "grad_norm": 2.769262707484498, + "learning_rate": 1.8966294716478955e-05, + "loss": 0.8091, + "step": 2086 + }, + { + "epoch": 0.17173421106768155, + "grad_norm": 2.814835709251874, + "learning_rate": 1.896511425075141e-05, + "loss": 0.836, + "step": 2087 + }, + { + "epoch": 0.17181649866282658, + "grad_norm": 2.749983643046786, + "learning_rate": 1.8963933148156484e-05, + "loss": 0.819, + "step": 2088 + }, + { + "epoch": 0.17189878625797161, + "grad_norm": 2.9841402775497534, + "learning_rate": 1.8962751408778083e-05, + "loss": 0.8329, + "step": 2089 + }, + { + "epoch": 0.17198107385311664, + "grad_norm": 4.490372651250418, + "learning_rate": 1.8961569032700158e-05, + "loss": 0.8227, + "step": 2090 + }, + { + "epoch": 0.17206336144826168, + "grad_norm": 0.5195677303990671, + "learning_rate": 1.89603860200067e-05, + "loss": 0.5714, + "step": 2091 + }, + { + "epoch": 0.1721456490434067, + "grad_norm": 4.7162828671329535, + "learning_rate": 1.895920237078175e-05, + "loss": 0.8219, + "step": 2092 + }, + { + "epoch": 0.17222793663855174, + "grad_norm": 3.1845930400558493, + "learning_rate": 1.895801808510939e-05, + "loss": 0.8057, + "step": 2093 + }, + { + "epoch": 0.17231022423369677, + "grad_norm": 3.856861957628647, + "learning_rate": 1.895683316307375e-05, + "loss": 0.8164, + "step": 2094 + }, + { + "epoch": 0.1723925118288418, + "grad_norm": 4.3911680999892715, + "learning_rate": 1.8955647604759007e-05, + "loss": 0.805, + "step": 2095 + }, + { + "epoch": 0.17247479942398683, + "grad_norm": 3.1328124148047105, + "learning_rate": 1.8954461410249383e-05, + "loss": 0.8281, + "step": 2096 + }, + { + "epoch": 0.17255708701913186, + "grad_norm": 2.8193595324575518, + "learning_rate": 1.895327457962914e-05, + "loss": 0.8265, + "step": 2097 + }, + { + "epoch": 0.1726393746142769, + "grad_norm": 3.0449731741865556, + "learning_rate": 1.895208711298259e-05, + "loss": 0.8186, + "step": 2098 + }, + { + "epoch": 0.17272166220942192, + "grad_norm": 2.842969553300606, + "learning_rate": 1.8950899010394086e-05, + "loss": 0.8155, + "step": 2099 + }, + { + "epoch": 0.17280394980456695, + "grad_norm": 2.908844344437982, + "learning_rate": 1.8949710271948032e-05, + "loss": 0.8327, + "step": 2100 + }, + { + "epoch": 0.17288623739971198, + "grad_norm": 12.840651351543812, + "learning_rate": 1.8948520897728873e-05, + "loss": 0.8483, + "step": 2101 + }, + { + "epoch": 0.17296852499485701, + "grad_norm": 5.194797922834837, + "learning_rate": 1.8947330887821103e-05, + "loss": 0.8508, + "step": 2102 + }, + { + "epoch": 0.17305081259000205, + "grad_norm": 3.247249400708314, + "learning_rate": 1.8946140242309252e-05, + "loss": 0.8728, + "step": 2103 + }, + { + "epoch": 0.17313310018514708, + "grad_norm": 2.3298477189440168, + "learning_rate": 1.894494896127791e-05, + "loss": 0.8029, + "step": 2104 + }, + { + "epoch": 0.1732153877802921, + "grad_norm": 3.4162144543592334, + "learning_rate": 1.8943757044811698e-05, + "loss": 0.8188, + "step": 2105 + }, + { + "epoch": 0.17329767537543717, + "grad_norm": 3.1878432889019943, + "learning_rate": 1.8942564492995285e-05, + "loss": 0.8428, + "step": 2106 + }, + { + "epoch": 0.1733799629705822, + "grad_norm": 2.59384199789863, + "learning_rate": 1.8941371305913395e-05, + "loss": 0.7934, + "step": 2107 + }, + { + "epoch": 0.17346225056572723, + "grad_norm": 2.870188493472898, + "learning_rate": 1.8940177483650787e-05, + "loss": 0.804, + "step": 2108 + }, + { + "epoch": 0.17354453816087226, + "grad_norm": 0.5251962887433449, + "learning_rate": 1.8938983026292268e-05, + "loss": 0.5658, + "step": 2109 + }, + { + "epoch": 0.1736268257560173, + "grad_norm": 0.4726930814322826, + "learning_rate": 1.893778793392269e-05, + "loss": 0.5555, + "step": 2110 + }, + { + "epoch": 0.17370911335116232, + "grad_norm": 2.8381361257445836, + "learning_rate": 1.893659220662695e-05, + "loss": 0.8473, + "step": 2111 + }, + { + "epoch": 0.17379140094630735, + "grad_norm": 2.513114361510452, + "learning_rate": 1.8935395844489993e-05, + "loss": 0.832, + "step": 2112 + }, + { + "epoch": 0.17387368854145238, + "grad_norm": 3.2468542434176597, + "learning_rate": 1.8934198847596807e-05, + "loss": 0.8507, + "step": 2113 + }, + { + "epoch": 0.1739559761365974, + "grad_norm": 3.4980797369338186, + "learning_rate": 1.8933001216032422e-05, + "loss": 0.8275, + "step": 2114 + }, + { + "epoch": 0.17403826373174244, + "grad_norm": 3.1993556523484075, + "learning_rate": 1.8931802949881913e-05, + "loss": 0.8563, + "step": 2115 + }, + { + "epoch": 0.17412055132688747, + "grad_norm": 2.287521279159301, + "learning_rate": 1.893060404923041e-05, + "loss": 0.8244, + "step": 2116 + }, + { + "epoch": 0.1742028389220325, + "grad_norm": 2.6968982726780824, + "learning_rate": 1.892940451416308e-05, + "loss": 0.8495, + "step": 2117 + }, + { + "epoch": 0.17428512651717754, + "grad_norm": 2.2600810375191656, + "learning_rate": 1.892820434476513e-05, + "loss": 0.8619, + "step": 2118 + }, + { + "epoch": 0.17436741411232257, + "grad_norm": 2.4966318778937953, + "learning_rate": 1.8927003541121823e-05, + "loss": 0.843, + "step": 2119 + }, + { + "epoch": 0.1744497017074676, + "grad_norm": 2.4997626727819773, + "learning_rate": 1.8925802103318463e-05, + "loss": 0.8273, + "step": 2120 + }, + { + "epoch": 0.17453198930261263, + "grad_norm": 3.1828425470337995, + "learning_rate": 1.8924600031440398e-05, + "loss": 0.8407, + "step": 2121 + }, + { + "epoch": 0.17461427689775766, + "grad_norm": 3.0897156196532816, + "learning_rate": 1.8923397325573015e-05, + "loss": 0.8385, + "step": 2122 + }, + { + "epoch": 0.1746965644929027, + "grad_norm": 2.871249207390649, + "learning_rate": 1.892219398580176e-05, + "loss": 0.8133, + "step": 2123 + }, + { + "epoch": 0.17477885208804772, + "grad_norm": 3.207694935164232, + "learning_rate": 1.8920990012212108e-05, + "loss": 0.8474, + "step": 2124 + }, + { + "epoch": 0.17486113968319275, + "grad_norm": 3.0197963812764996, + "learning_rate": 1.8919785404889596e-05, + "loss": 0.8259, + "step": 2125 + }, + { + "epoch": 0.17494342727833778, + "grad_norm": 2.3693479079198645, + "learning_rate": 1.8918580163919795e-05, + "loss": 0.8511, + "step": 2126 + }, + { + "epoch": 0.1750257148734828, + "grad_norm": 0.6715909607363658, + "learning_rate": 1.891737428938832e-05, + "loss": 0.5611, + "step": 2127 + }, + { + "epoch": 0.17510800246862784, + "grad_norm": 0.7381501490976441, + "learning_rate": 1.891616778138084e-05, + "loss": 0.5944, + "step": 2128 + }, + { + "epoch": 0.17519029006377287, + "grad_norm": 2.945923101304315, + "learning_rate": 1.8914960639983056e-05, + "loss": 0.8633, + "step": 2129 + }, + { + "epoch": 0.1752725776589179, + "grad_norm": 3.276267836360085, + "learning_rate": 1.891375286528073e-05, + "loss": 0.8466, + "step": 2130 + }, + { + "epoch": 0.17535486525406296, + "grad_norm": 2.6643947216924886, + "learning_rate": 1.891254445735965e-05, + "loss": 0.855, + "step": 2131 + }, + { + "epoch": 0.175437152849208, + "grad_norm": 2.5634784016407615, + "learning_rate": 1.891133541630567e-05, + "loss": 0.85, + "step": 2132 + }, + { + "epoch": 0.17551944044435303, + "grad_norm": 3.122244685290648, + "learning_rate": 1.8910125742204674e-05, + "loss": 0.8372, + "step": 2133 + }, + { + "epoch": 0.17560172803949806, + "grad_norm": 3.587765275523561, + "learning_rate": 1.8908915435142593e-05, + "loss": 0.8405, + "step": 2134 + }, + { + "epoch": 0.1756840156346431, + "grad_norm": 2.9341338775130046, + "learning_rate": 1.8907704495205408e-05, + "loss": 0.8305, + "step": 2135 + }, + { + "epoch": 0.17576630322978812, + "grad_norm": 0.8269887826396807, + "learning_rate": 1.8906492922479138e-05, + "loss": 0.5875, + "step": 2136 + }, + { + "epoch": 0.17584859082493315, + "grad_norm": 3.0404267692410323, + "learning_rate": 1.890528071704986e-05, + "loss": 0.8665, + "step": 2137 + }, + { + "epoch": 0.17593087842007818, + "grad_norm": 3.8451043737340926, + "learning_rate": 1.8904067879003678e-05, + "loss": 0.829, + "step": 2138 + }, + { + "epoch": 0.1760131660152232, + "grad_norm": 2.9384675238670286, + "learning_rate": 1.8902854408426754e-05, + "loss": 0.8368, + "step": 2139 + }, + { + "epoch": 0.17609545361036824, + "grad_norm": 3.1352954501943757, + "learning_rate": 1.8901640305405293e-05, + "loss": 0.8595, + "step": 2140 + }, + { + "epoch": 0.17617774120551327, + "grad_norm": 0.5265036938780178, + "learning_rate": 1.890042557002554e-05, + "loss": 0.5988, + "step": 2141 + }, + { + "epoch": 0.1762600288006583, + "grad_norm": 3.213441123374223, + "learning_rate": 1.8899210202373787e-05, + "loss": 0.8325, + "step": 2142 + }, + { + "epoch": 0.17634231639580333, + "grad_norm": 5.0681082814516865, + "learning_rate": 1.8897994202536377e-05, + "loss": 0.8512, + "step": 2143 + }, + { + "epoch": 0.17642460399094836, + "grad_norm": 3.210520788729363, + "learning_rate": 1.8896777570599685e-05, + "loss": 0.8836, + "step": 2144 + }, + { + "epoch": 0.1765068915860934, + "grad_norm": 2.923135957435321, + "learning_rate": 1.8895560306650145e-05, + "loss": 0.8766, + "step": 2145 + }, + { + "epoch": 0.17658917918123843, + "grad_norm": 2.888963184179868, + "learning_rate": 1.8894342410774226e-05, + "loss": 0.8824, + "step": 2146 + }, + { + "epoch": 0.17667146677638346, + "grad_norm": 2.739409826930194, + "learning_rate": 1.8893123883058448e-05, + "loss": 0.8197, + "step": 2147 + }, + { + "epoch": 0.1767537543715285, + "grad_norm": 2.650298520937812, + "learning_rate": 1.8891904723589373e-05, + "loss": 0.8215, + "step": 2148 + }, + { + "epoch": 0.17683604196667352, + "grad_norm": 3.0306167804289426, + "learning_rate": 1.8890684932453602e-05, + "loss": 0.8613, + "step": 2149 + }, + { + "epoch": 0.17691832956181855, + "grad_norm": 2.812060438572058, + "learning_rate": 1.8889464509737795e-05, + "loss": 0.8626, + "step": 2150 + }, + { + "epoch": 0.17700061715696358, + "grad_norm": 3.0994618400593175, + "learning_rate": 1.8888243455528648e-05, + "loss": 0.8097, + "step": 2151 + }, + { + "epoch": 0.1770829047521086, + "grad_norm": 2.976960760141478, + "learning_rate": 1.8887021769912896e-05, + "loss": 0.873, + "step": 2152 + }, + { + "epoch": 0.17716519234725364, + "grad_norm": 2.9470177979558816, + "learning_rate": 1.8885799452977332e-05, + "loss": 0.8498, + "step": 2153 + }, + { + "epoch": 0.17724747994239867, + "grad_norm": 2.9807070159935933, + "learning_rate": 1.8884576504808787e-05, + "loss": 0.8261, + "step": 2154 + }, + { + "epoch": 0.1773297675375437, + "grad_norm": 3.55425495238515, + "learning_rate": 1.8883352925494132e-05, + "loss": 0.8763, + "step": 2155 + }, + { + "epoch": 0.17741205513268873, + "grad_norm": 3.4293358551956596, + "learning_rate": 1.8882128715120295e-05, + "loss": 0.859, + "step": 2156 + }, + { + "epoch": 0.1774943427278338, + "grad_norm": 3.5065020238393316, + "learning_rate": 1.888090387377424e-05, + "loss": 0.8426, + "step": 2157 + }, + { + "epoch": 0.17757663032297882, + "grad_norm": 3.4946671740505963, + "learning_rate": 1.8879678401542977e-05, + "loss": 0.8238, + "step": 2158 + }, + { + "epoch": 0.17765891791812385, + "grad_norm": 3.08246124790394, + "learning_rate": 1.8878452298513558e-05, + "loss": 0.855, + "step": 2159 + }, + { + "epoch": 0.17774120551326889, + "grad_norm": 3.304694159764267, + "learning_rate": 1.887722556477309e-05, + "loss": 0.8558, + "step": 2160 + }, + { + "epoch": 0.17782349310841392, + "grad_norm": 2.8888763632081638, + "learning_rate": 1.8875998200408715e-05, + "loss": 0.8549, + "step": 2161 + }, + { + "epoch": 0.17790578070355895, + "grad_norm": 3.0898876229980377, + "learning_rate": 1.887477020550762e-05, + "loss": 0.8719, + "step": 2162 + }, + { + "epoch": 0.17798806829870398, + "grad_norm": 0.5400253029327587, + "learning_rate": 1.8873541580157044e-05, + "loss": 0.5705, + "step": 2163 + }, + { + "epoch": 0.178070355893849, + "grad_norm": 3.2094499045309814, + "learning_rate": 1.8872312324444263e-05, + "loss": 0.8729, + "step": 2164 + }, + { + "epoch": 0.17815264348899404, + "grad_norm": 4.768655096715249, + "learning_rate": 1.8871082438456607e-05, + "loss": 0.8439, + "step": 2165 + }, + { + "epoch": 0.17823493108413907, + "grad_norm": 0.46479355653430865, + "learning_rate": 1.8869851922281443e-05, + "loss": 0.5564, + "step": 2166 + }, + { + "epoch": 0.1783172186792841, + "grad_norm": 3.2141222616690452, + "learning_rate": 1.8868620776006177e-05, + "loss": 0.8033, + "step": 2167 + }, + { + "epoch": 0.17839950627442913, + "grad_norm": 2.8977339445629355, + "learning_rate": 1.8867388999718282e-05, + "loss": 0.826, + "step": 2168 + }, + { + "epoch": 0.17848179386957416, + "grad_norm": 4.146203873336381, + "learning_rate": 1.8866156593505248e-05, + "loss": 0.8297, + "step": 2169 + }, + { + "epoch": 0.1785640814647192, + "grad_norm": 0.4677473869816791, + "learning_rate": 1.8864923557454635e-05, + "loss": 0.5713, + "step": 2170 + }, + { + "epoch": 0.17864636905986422, + "grad_norm": 3.1766961448469506, + "learning_rate": 1.8863689891654027e-05, + "loss": 0.8487, + "step": 2171 + }, + { + "epoch": 0.17872865665500925, + "grad_norm": 3.983573244064544, + "learning_rate": 1.886245559619106e-05, + "loss": 0.818, + "step": 2172 + }, + { + "epoch": 0.17881094425015429, + "grad_norm": 3.813218593430025, + "learning_rate": 1.8861220671153427e-05, + "loss": 0.8538, + "step": 2173 + }, + { + "epoch": 0.17889323184529932, + "grad_norm": 3.5142200829496835, + "learning_rate": 1.8859985116628845e-05, + "loss": 0.8509, + "step": 2174 + }, + { + "epoch": 0.17897551944044435, + "grad_norm": 0.47604867648075594, + "learning_rate": 1.8858748932705093e-05, + "loss": 0.5557, + "step": 2175 + }, + { + "epoch": 0.17905780703558938, + "grad_norm": 5.865810912897995, + "learning_rate": 1.8857512119469982e-05, + "loss": 0.8107, + "step": 2176 + }, + { + "epoch": 0.1791400946307344, + "grad_norm": 4.34519437878938, + "learning_rate": 1.8856274677011375e-05, + "loss": 0.8393, + "step": 2177 + }, + { + "epoch": 0.17922238222587944, + "grad_norm": 4.675423402486858, + "learning_rate": 1.8855036605417182e-05, + "loss": 0.8242, + "step": 2178 + }, + { + "epoch": 0.17930466982102447, + "grad_norm": 3.0460405802151294, + "learning_rate": 1.8853797904775347e-05, + "loss": 0.8499, + "step": 2179 + }, + { + "epoch": 0.1793869574161695, + "grad_norm": 4.6587197890485, + "learning_rate": 1.885255857517387e-05, + "loss": 0.862, + "step": 2180 + }, + { + "epoch": 0.17946924501131453, + "grad_norm": 3.595468610999008, + "learning_rate": 1.8851318616700785e-05, + "loss": 0.8425, + "step": 2181 + }, + { + "epoch": 0.17955153260645956, + "grad_norm": 4.6716822565870935, + "learning_rate": 1.8850078029444184e-05, + "loss": 0.8323, + "step": 2182 + }, + { + "epoch": 0.17963382020160462, + "grad_norm": 3.4397771250372275, + "learning_rate": 1.8848836813492198e-05, + "loss": 0.8398, + "step": 2183 + }, + { + "epoch": 0.17971610779674965, + "grad_norm": 3.1128586403508813, + "learning_rate": 1.8847594968932988e-05, + "loss": 0.8073, + "step": 2184 + }, + { + "epoch": 0.17979839539189468, + "grad_norm": 4.137928527045669, + "learning_rate": 1.884635249585479e-05, + "loss": 0.8435, + "step": 2185 + }, + { + "epoch": 0.17988068298703971, + "grad_norm": 3.9153345854479276, + "learning_rate": 1.884510939434585e-05, + "loss": 0.8529, + "step": 2186 + }, + { + "epoch": 0.17996297058218474, + "grad_norm": 3.132348968761736, + "learning_rate": 1.884386566449449e-05, + "loss": 0.8046, + "step": 2187 + }, + { + "epoch": 0.18004525817732978, + "grad_norm": 3.736988154773954, + "learning_rate": 1.8842621306389055e-05, + "loss": 0.8396, + "step": 2188 + }, + { + "epoch": 0.1801275457724748, + "grad_norm": 6.114132305121805, + "learning_rate": 1.8841376320117942e-05, + "loss": 0.8277, + "step": 2189 + }, + { + "epoch": 0.18020983336761984, + "grad_norm": 3.328527236630722, + "learning_rate": 1.8840130705769598e-05, + "loss": 0.7974, + "step": 2190 + }, + { + "epoch": 0.18029212096276487, + "grad_norm": 3.881694260683007, + "learning_rate": 1.8838884463432505e-05, + "loss": 0.8145, + "step": 2191 + }, + { + "epoch": 0.1803744085579099, + "grad_norm": 4.009278703627772, + "learning_rate": 1.8837637593195196e-05, + "loss": 0.8377, + "step": 2192 + }, + { + "epoch": 0.18045669615305493, + "grad_norm": 3.953343089533144, + "learning_rate": 1.8836390095146246e-05, + "loss": 0.8501, + "step": 2193 + }, + { + "epoch": 0.18053898374819996, + "grad_norm": 3.8815659926184716, + "learning_rate": 1.8835141969374274e-05, + "loss": 0.8278, + "step": 2194 + }, + { + "epoch": 0.180621271343345, + "grad_norm": 9.408638288023258, + "learning_rate": 1.883389321596795e-05, + "loss": 0.862, + "step": 2195 + }, + { + "epoch": 0.18070355893849002, + "grad_norm": 3.00111833564058, + "learning_rate": 1.8832643835015977e-05, + "loss": 0.8334, + "step": 2196 + }, + { + "epoch": 0.18078584653363505, + "grad_norm": 3.9288506872472997, + "learning_rate": 1.8831393826607112e-05, + "loss": 0.8268, + "step": 2197 + }, + { + "epoch": 0.18086813412878008, + "grad_norm": 3.151568521244493, + "learning_rate": 1.883014319083015e-05, + "loss": 0.8345, + "step": 2198 + }, + { + "epoch": 0.18095042172392511, + "grad_norm": 3.6006903348835126, + "learning_rate": 1.882889192777394e-05, + "loss": 0.8291, + "step": 2199 + }, + { + "epoch": 0.18103270931907015, + "grad_norm": 3.064791691782749, + "learning_rate": 1.882764003752737e-05, + "loss": 0.8236, + "step": 2200 + }, + { + "epoch": 0.18111499691421518, + "grad_norm": 2.687808317055217, + "learning_rate": 1.8826387520179366e-05, + "loss": 0.836, + "step": 2201 + }, + { + "epoch": 0.1811972845093602, + "grad_norm": 3.020220830213787, + "learning_rate": 1.8825134375818907e-05, + "loss": 0.8315, + "step": 2202 + }, + { + "epoch": 0.18127957210450524, + "grad_norm": 3.0945061455675784, + "learning_rate": 1.882388060453502e-05, + "loss": 0.8298, + "step": 2203 + }, + { + "epoch": 0.18136185969965027, + "grad_norm": 6.79903493713913, + "learning_rate": 1.8822626206416765e-05, + "loss": 0.8438, + "step": 2204 + }, + { + "epoch": 0.1814441472947953, + "grad_norm": 3.2340255761593224, + "learning_rate": 1.8821371181553255e-05, + "loss": 0.8217, + "step": 2205 + }, + { + "epoch": 0.18152643488994033, + "grad_norm": 3.3572188316655374, + "learning_rate": 1.882011553003364e-05, + "loss": 0.8471, + "step": 2206 + }, + { + "epoch": 0.18160872248508536, + "grad_norm": 2.407818729109725, + "learning_rate": 1.8818859251947126e-05, + "loss": 0.8119, + "step": 2207 + }, + { + "epoch": 0.1816910100802304, + "grad_norm": 2.8458563815348534, + "learning_rate": 1.8817602347382956e-05, + "loss": 0.8398, + "step": 2208 + }, + { + "epoch": 0.18177329767537545, + "grad_norm": 3.8214337213544702, + "learning_rate": 1.8816344816430414e-05, + "loss": 0.8369, + "step": 2209 + }, + { + "epoch": 0.18185558527052048, + "grad_norm": 0.5075834375751112, + "learning_rate": 1.8815086659178838e-05, + "loss": 0.5619, + "step": 2210 + }, + { + "epoch": 0.1819378728656655, + "grad_norm": 2.8951969539455296, + "learning_rate": 1.8813827875717603e-05, + "loss": 0.8389, + "step": 2211 + }, + { + "epoch": 0.18202016046081054, + "grad_norm": 2.6573677658322485, + "learning_rate": 1.8812568466136128e-05, + "loss": 0.822, + "step": 2212 + }, + { + "epoch": 0.18210244805595557, + "grad_norm": 3.766069897146698, + "learning_rate": 1.8811308430523888e-05, + "loss": 0.8325, + "step": 2213 + }, + { + "epoch": 0.1821847356511006, + "grad_norm": 3.0191461597717706, + "learning_rate": 1.8810047768970387e-05, + "loss": 0.8303, + "step": 2214 + }, + { + "epoch": 0.18226702324624564, + "grad_norm": 4.4512451838577345, + "learning_rate": 1.880878648156518e-05, + "loss": 0.8336, + "step": 2215 + }, + { + "epoch": 0.18234931084139067, + "grad_norm": 0.46711778997403974, + "learning_rate": 1.8807524568397873e-05, + "loss": 0.584, + "step": 2216 + }, + { + "epoch": 0.1824315984365357, + "grad_norm": 3.34850100547728, + "learning_rate": 1.88062620295581e-05, + "loss": 0.8361, + "step": 2217 + }, + { + "epoch": 0.18251388603168073, + "grad_norm": 2.5871023413010223, + "learning_rate": 1.880499886513556e-05, + "loss": 0.8372, + "step": 2218 + }, + { + "epoch": 0.18259617362682576, + "grad_norm": 2.7853773719177233, + "learning_rate": 1.8803735075219985e-05, + "loss": 0.8164, + "step": 2219 + }, + { + "epoch": 0.1826784612219708, + "grad_norm": 0.43484423574492026, + "learning_rate": 1.8802470659901143e-05, + "loss": 0.5434, + "step": 2220 + }, + { + "epoch": 0.18276074881711582, + "grad_norm": 2.270864089725784, + "learning_rate": 1.8801205619268867e-05, + "loss": 0.8195, + "step": 2221 + }, + { + "epoch": 0.18284303641226085, + "grad_norm": 2.5762907517132447, + "learning_rate": 1.8799939953413017e-05, + "loss": 0.8508, + "step": 2222 + }, + { + "epoch": 0.18292532400740588, + "grad_norm": 3.792182714917021, + "learning_rate": 1.879867366242351e-05, + "loss": 0.8724, + "step": 2223 + }, + { + "epoch": 0.1830076116025509, + "grad_norm": 2.737508395308536, + "learning_rate": 1.8797406746390295e-05, + "loss": 0.8249, + "step": 2224 + }, + { + "epoch": 0.18308989919769594, + "grad_norm": 3.3931450650826553, + "learning_rate": 1.8796139205403373e-05, + "loss": 0.8194, + "step": 2225 + }, + { + "epoch": 0.18317218679284097, + "grad_norm": 3.244932583038611, + "learning_rate": 1.8794871039552792e-05, + "loss": 0.8389, + "step": 2226 + }, + { + "epoch": 0.183254474387986, + "grad_norm": 2.6631938105606485, + "learning_rate": 1.8793602248928636e-05, + "loss": 0.8195, + "step": 2227 + }, + { + "epoch": 0.18333676198313104, + "grad_norm": 0.499931675074203, + "learning_rate": 1.8792332833621038e-05, + "loss": 0.5945, + "step": 2228 + }, + { + "epoch": 0.18341904957827607, + "grad_norm": 0.4348697621921401, + "learning_rate": 1.879106279372018e-05, + "loss": 0.5656, + "step": 2229 + }, + { + "epoch": 0.1835013371734211, + "grad_norm": 3.2176301339993567, + "learning_rate": 1.878979212931628e-05, + "loss": 0.8338, + "step": 2230 + }, + { + "epoch": 0.18358362476856613, + "grad_norm": 3.385877652749623, + "learning_rate": 1.8788520840499602e-05, + "loss": 0.8569, + "step": 2231 + }, + { + "epoch": 0.18366591236371116, + "grad_norm": 2.6879516134859016, + "learning_rate": 1.8787248927360456e-05, + "loss": 0.8562, + "step": 2232 + }, + { + "epoch": 0.1837481999588562, + "grad_norm": 3.1793592151350816, + "learning_rate": 1.8785976389989206e-05, + "loss": 0.8183, + "step": 2233 + }, + { + "epoch": 0.18383048755400122, + "grad_norm": 3.5810162565636308, + "learning_rate": 1.878470322847624e-05, + "loss": 0.8458, + "step": 2234 + }, + { + "epoch": 0.18391277514914628, + "grad_norm": 2.7189540947205972, + "learning_rate": 1.878342944291201e-05, + "loss": 0.8702, + "step": 2235 + }, + { + "epoch": 0.1839950627442913, + "grad_norm": 2.8595668370124354, + "learning_rate": 1.8782155033386994e-05, + "loss": 0.8538, + "step": 2236 + }, + { + "epoch": 0.18407735033943634, + "grad_norm": 3.703233163107493, + "learning_rate": 1.8780879999991733e-05, + "loss": 0.7903, + "step": 2237 + }, + { + "epoch": 0.18415963793458137, + "grad_norm": 0.4798818692626459, + "learning_rate": 1.87796043428168e-05, + "loss": 0.558, + "step": 2238 + }, + { + "epoch": 0.1842419255297264, + "grad_norm": 3.9357951606994956, + "learning_rate": 1.8778328061952812e-05, + "loss": 0.8604, + "step": 2239 + }, + { + "epoch": 0.18432421312487143, + "grad_norm": 0.4613130280558913, + "learning_rate": 1.877705115749044e-05, + "loss": 0.5557, + "step": 2240 + }, + { + "epoch": 0.18440650072001646, + "grad_norm": 0.4468114582336002, + "learning_rate": 1.877577362952039e-05, + "loss": 0.5699, + "step": 2241 + }, + { + "epoch": 0.1844887883151615, + "grad_norm": 3.7317140281652956, + "learning_rate": 1.8774495478133413e-05, + "loss": 0.8726, + "step": 2242 + }, + { + "epoch": 0.18457107591030653, + "grad_norm": 3.622122345180218, + "learning_rate": 1.8773216703420316e-05, + "loss": 0.8426, + "step": 2243 + }, + { + "epoch": 0.18465336350545156, + "grad_norm": 2.7851269583116323, + "learning_rate": 1.8771937305471933e-05, + "loss": 0.8274, + "step": 2244 + }, + { + "epoch": 0.1847356511005966, + "grad_norm": 3.0839548975497557, + "learning_rate": 1.877065728437915e-05, + "loss": 0.8522, + "step": 2245 + }, + { + "epoch": 0.18481793869574162, + "grad_norm": 3.3013680164209305, + "learning_rate": 1.87693766402329e-05, + "loss": 0.8541, + "step": 2246 + }, + { + "epoch": 0.18490022629088665, + "grad_norm": 2.9028412908174626, + "learning_rate": 1.8768095373124163e-05, + "loss": 0.8221, + "step": 2247 + }, + { + "epoch": 0.18498251388603168, + "grad_norm": 3.138930779052706, + "learning_rate": 1.8766813483143948e-05, + "loss": 0.8398, + "step": 2248 + }, + { + "epoch": 0.1850648014811767, + "grad_norm": 3.381505399118227, + "learning_rate": 1.8765530970383327e-05, + "loss": 0.8375, + "step": 2249 + }, + { + "epoch": 0.18514708907632174, + "grad_norm": 2.9033167705998433, + "learning_rate": 1.87642478349334e-05, + "loss": 0.8365, + "step": 2250 + }, + { + "epoch": 0.18522937667146677, + "grad_norm": 3.2505234992977696, + "learning_rate": 1.8762964076885328e-05, + "loss": 0.8237, + "step": 2251 + }, + { + "epoch": 0.1853116642666118, + "grad_norm": 2.5321575440720956, + "learning_rate": 1.8761679696330298e-05, + "loss": 0.7935, + "step": 2252 + }, + { + "epoch": 0.18539395186175683, + "grad_norm": 3.04670905608212, + "learning_rate": 1.876039469335956e-05, + "loss": 0.8158, + "step": 2253 + }, + { + "epoch": 0.18547623945690186, + "grad_norm": 3.0164781648706636, + "learning_rate": 1.875910906806439e-05, + "loss": 0.83, + "step": 2254 + }, + { + "epoch": 0.1855585270520469, + "grad_norm": 4.150397095497481, + "learning_rate": 1.875782282053612e-05, + "loss": 0.8448, + "step": 2255 + }, + { + "epoch": 0.18564081464719193, + "grad_norm": 5.5579579203703435, + "learning_rate": 1.875653595086612e-05, + "loss": 0.8278, + "step": 2256 + }, + { + "epoch": 0.18572310224233696, + "grad_norm": 2.7887068893538762, + "learning_rate": 1.875524845914581e-05, + "loss": 0.862, + "step": 2257 + }, + { + "epoch": 0.185805389837482, + "grad_norm": 4.214553200363649, + "learning_rate": 1.8753960345466658e-05, + "loss": 0.842, + "step": 2258 + }, + { + "epoch": 0.18588767743262702, + "grad_norm": 3.3289965629927916, + "learning_rate": 1.875267160992016e-05, + "loss": 0.834, + "step": 2259 + }, + { + "epoch": 0.18596996502777205, + "grad_norm": 20.600639052601434, + "learning_rate": 1.8751382252597868e-05, + "loss": 0.8326, + "step": 2260 + }, + { + "epoch": 0.1860522526229171, + "grad_norm": 2.710388129719034, + "learning_rate": 1.8750092273591374e-05, + "loss": 0.8377, + "step": 2261 + }, + { + "epoch": 0.18613454021806214, + "grad_norm": 0.5909281433096669, + "learning_rate": 1.8748801672992324e-05, + "loss": 0.5645, + "step": 2262 + }, + { + "epoch": 0.18621682781320717, + "grad_norm": 0.4908149271615487, + "learning_rate": 1.874751045089239e-05, + "loss": 0.5609, + "step": 2263 + }, + { + "epoch": 0.1862991154083522, + "grad_norm": 3.1296412097537996, + "learning_rate": 1.8746218607383304e-05, + "loss": 0.8435, + "step": 2264 + }, + { + "epoch": 0.18638140300349723, + "grad_norm": 2.9727680344314704, + "learning_rate": 1.874492614255684e-05, + "loss": 0.8584, + "step": 2265 + }, + { + "epoch": 0.18646369059864226, + "grad_norm": 3.078013174658112, + "learning_rate": 1.87436330565048e-05, + "loss": 0.8797, + "step": 2266 + }, + { + "epoch": 0.1865459781937873, + "grad_norm": 2.683367791467896, + "learning_rate": 1.8742339349319056e-05, + "loss": 0.8384, + "step": 2267 + }, + { + "epoch": 0.18662826578893232, + "grad_norm": 3.1448392740468853, + "learning_rate": 1.874104502109151e-05, + "loss": 0.8489, + "step": 2268 + }, + { + "epoch": 0.18671055338407735, + "grad_norm": 2.727871775490667, + "learning_rate": 1.8739750071914096e-05, + "loss": 0.8488, + "step": 2269 + }, + { + "epoch": 0.18679284097922239, + "grad_norm": 3.3007638198806455, + "learning_rate": 1.873845450187882e-05, + "loss": 0.8398, + "step": 2270 + }, + { + "epoch": 0.18687512857436742, + "grad_norm": 3.2603426010994294, + "learning_rate": 1.873715831107771e-05, + "loss": 0.8339, + "step": 2271 + }, + { + "epoch": 0.18695741616951245, + "grad_norm": 2.4598611597771702, + "learning_rate": 1.873586149960285e-05, + "loss": 0.86, + "step": 2272 + }, + { + "epoch": 0.18703970376465748, + "grad_norm": 0.7355623803063913, + "learning_rate": 1.8734564067546354e-05, + "loss": 0.594, + "step": 2273 + }, + { + "epoch": 0.1871219913598025, + "grad_norm": 2.8609086065159235, + "learning_rate": 1.8733266015000397e-05, + "loss": 0.8542, + "step": 2274 + }, + { + "epoch": 0.18720427895494754, + "grad_norm": 3.213644959041243, + "learning_rate": 1.8731967342057192e-05, + "loss": 0.8496, + "step": 2275 + }, + { + "epoch": 0.18728656655009257, + "grad_norm": 2.839023213264933, + "learning_rate": 1.8730668048808992e-05, + "loss": 0.8466, + "step": 2276 + }, + { + "epoch": 0.1873688541452376, + "grad_norm": 2.8302014001992735, + "learning_rate": 1.8729368135348092e-05, + "loss": 0.8277, + "step": 2277 + }, + { + "epoch": 0.18745114174038263, + "grad_norm": 2.336413397518663, + "learning_rate": 1.8728067601766843e-05, + "loss": 0.8037, + "step": 2278 + }, + { + "epoch": 0.18753342933552766, + "grad_norm": 3.093535380847658, + "learning_rate": 1.872676644815763e-05, + "loss": 0.8531, + "step": 2279 + }, + { + "epoch": 0.1876157169306727, + "grad_norm": 2.923666112609439, + "learning_rate": 1.8725464674612886e-05, + "loss": 0.8425, + "step": 2280 + }, + { + "epoch": 0.18769800452581772, + "grad_norm": 2.8664539298672573, + "learning_rate": 1.8724162281225085e-05, + "loss": 0.849, + "step": 2281 + }, + { + "epoch": 0.18778029212096276, + "grad_norm": 3.538690882300127, + "learning_rate": 1.8722859268086745e-05, + "loss": 0.8086, + "step": 2282 + }, + { + "epoch": 0.18786257971610779, + "grad_norm": 0.5196858721316295, + "learning_rate": 1.8721555635290435e-05, + "loss": 0.5675, + "step": 2283 + }, + { + "epoch": 0.18794486731125282, + "grad_norm": 2.93418455305132, + "learning_rate": 1.8720251382928762e-05, + "loss": 0.8235, + "step": 2284 + }, + { + "epoch": 0.18802715490639785, + "grad_norm": 3.2004573663386946, + "learning_rate": 1.8718946511094375e-05, + "loss": 0.8581, + "step": 2285 + }, + { + "epoch": 0.1881094425015429, + "grad_norm": 2.9202248368901285, + "learning_rate": 1.8717641019879972e-05, + "loss": 0.8238, + "step": 2286 + }, + { + "epoch": 0.18819173009668794, + "grad_norm": 3.1236514413686636, + "learning_rate": 1.8716334909378294e-05, + "loss": 0.8501, + "step": 2287 + }, + { + "epoch": 0.18827401769183297, + "grad_norm": 3.1609170877521557, + "learning_rate": 1.8715028179682122e-05, + "loss": 0.8686, + "step": 2288 + }, + { + "epoch": 0.188356305286978, + "grad_norm": 2.6161046242258497, + "learning_rate": 1.871372083088429e-05, + "loss": 0.8283, + "step": 2289 + }, + { + "epoch": 0.18843859288212303, + "grad_norm": 2.8853603189305623, + "learning_rate": 1.871241286307766e-05, + "loss": 0.8106, + "step": 2290 + }, + { + "epoch": 0.18852088047726806, + "grad_norm": 3.085578931198705, + "learning_rate": 1.8711104276355153e-05, + "loss": 0.8465, + "step": 2291 + }, + { + "epoch": 0.1886031680724131, + "grad_norm": 0.48302128504076397, + "learning_rate": 1.8709795070809737e-05, + "loss": 0.5678, + "step": 2292 + }, + { + "epoch": 0.18868545566755812, + "grad_norm": 3.288958363632039, + "learning_rate": 1.87084852465344e-05, + "loss": 0.8566, + "step": 2293 + }, + { + "epoch": 0.18876774326270315, + "grad_norm": 3.4831897332745387, + "learning_rate": 1.8707174803622202e-05, + "loss": 0.8428, + "step": 2294 + }, + { + "epoch": 0.18885003085784818, + "grad_norm": 0.4349912345338283, + "learning_rate": 1.8705863742166232e-05, + "loss": 0.5621, + "step": 2295 + }, + { + "epoch": 0.18893231845299321, + "grad_norm": 2.931342167177246, + "learning_rate": 1.8704552062259624e-05, + "loss": 0.8505, + "step": 2296 + }, + { + "epoch": 0.18901460604813825, + "grad_norm": 4.189017728249225, + "learning_rate": 1.870323976399556e-05, + "loss": 0.8631, + "step": 2297 + }, + { + "epoch": 0.18909689364328328, + "grad_norm": 4.18904839099143, + "learning_rate": 1.870192684746726e-05, + "loss": 0.8408, + "step": 2298 + }, + { + "epoch": 0.1891791812384283, + "grad_norm": 0.4624111826904663, + "learning_rate": 1.8700613312767994e-05, + "loss": 0.5512, + "step": 2299 + }, + { + "epoch": 0.18926146883357334, + "grad_norm": 3.8345017192105604, + "learning_rate": 1.8699299159991072e-05, + "loss": 0.8251, + "step": 2300 + }, + { + "epoch": 0.18934375642871837, + "grad_norm": 2.81460141306639, + "learning_rate": 1.869798438922985e-05, + "loss": 0.8566, + "step": 2301 + }, + { + "epoch": 0.1894260440238634, + "grad_norm": 3.4766589626322815, + "learning_rate": 1.8696669000577726e-05, + "loss": 0.8427, + "step": 2302 + }, + { + "epoch": 0.18950833161900843, + "grad_norm": 3.7855392027129473, + "learning_rate": 1.869535299412815e-05, + "loss": 0.8159, + "step": 2303 + }, + { + "epoch": 0.18959061921415346, + "grad_norm": 2.9712686170543345, + "learning_rate": 1.86940363699746e-05, + "loss": 0.8284, + "step": 2304 + }, + { + "epoch": 0.1896729068092985, + "grad_norm": 3.3034575793695518, + "learning_rate": 1.8692719128210607e-05, + "loss": 0.8163, + "step": 2305 + }, + { + "epoch": 0.18975519440444352, + "grad_norm": 3.44328822803266, + "learning_rate": 1.8691401268929754e-05, + "loss": 0.8304, + "step": 2306 + }, + { + "epoch": 0.18983748199958855, + "grad_norm": 3.393030951997428, + "learning_rate": 1.8690082792225653e-05, + "loss": 0.826, + "step": 2307 + }, + { + "epoch": 0.18991976959473358, + "grad_norm": 3.112853842296299, + "learning_rate": 1.868876369819197e-05, + "loss": 0.8202, + "step": 2308 + }, + { + "epoch": 0.19000205718987861, + "grad_norm": 3.018481253544045, + "learning_rate": 1.8687443986922408e-05, + "loss": 0.8323, + "step": 2309 + }, + { + "epoch": 0.19008434478502365, + "grad_norm": 4.303472704157856, + "learning_rate": 1.8686123658510715e-05, + "loss": 0.8461, + "step": 2310 + }, + { + "epoch": 0.19016663238016868, + "grad_norm": 11.860345394687899, + "learning_rate": 1.8684802713050692e-05, + "loss": 0.8089, + "step": 2311 + }, + { + "epoch": 0.19024891997531373, + "grad_norm": 2.8572470787809126, + "learning_rate": 1.8683481150636176e-05, + "loss": 0.8382, + "step": 2312 + }, + { + "epoch": 0.19033120757045877, + "grad_norm": 0.4654004570208718, + "learning_rate": 1.8682158971361044e-05, + "loss": 0.5562, + "step": 2313 + }, + { + "epoch": 0.1904134951656038, + "grad_norm": 2.9838829688293855, + "learning_rate": 1.8680836175319223e-05, + "loss": 0.8277, + "step": 2314 + }, + { + "epoch": 0.19049578276074883, + "grad_norm": 2.810228948693419, + "learning_rate": 1.8679512762604683e-05, + "loss": 0.8648, + "step": 2315 + }, + { + "epoch": 0.19057807035589386, + "grad_norm": 2.881269477726161, + "learning_rate": 1.8678188733311436e-05, + "loss": 0.8557, + "step": 2316 + }, + { + "epoch": 0.1906603579510389, + "grad_norm": 2.7469044292228353, + "learning_rate": 1.8676864087533542e-05, + "loss": 0.8506, + "step": 2317 + }, + { + "epoch": 0.19074264554618392, + "grad_norm": 3.34744286595029, + "learning_rate": 1.8675538825365104e-05, + "loss": 0.8227, + "step": 2318 + }, + { + "epoch": 0.19082493314132895, + "grad_norm": 2.737049820008201, + "learning_rate": 1.8674212946900257e-05, + "loss": 0.8469, + "step": 2319 + }, + { + "epoch": 0.19090722073647398, + "grad_norm": 2.6458546665277303, + "learning_rate": 1.8672886452233195e-05, + "loss": 0.8492, + "step": 2320 + }, + { + "epoch": 0.190989508331619, + "grad_norm": 0.4571970442666248, + "learning_rate": 1.8671559341458148e-05, + "loss": 0.5322, + "step": 2321 + }, + { + "epoch": 0.19107179592676404, + "grad_norm": 2.482157540329627, + "learning_rate": 1.8670231614669395e-05, + "loss": 0.8354, + "step": 2322 + }, + { + "epoch": 0.19115408352190907, + "grad_norm": 3.095874840133814, + "learning_rate": 1.8668903271961258e-05, + "loss": 0.8443, + "step": 2323 + }, + { + "epoch": 0.1912363711170541, + "grad_norm": 2.638500405402555, + "learning_rate": 1.8667574313428096e-05, + "loss": 0.8592, + "step": 2324 + }, + { + "epoch": 0.19131865871219914, + "grad_norm": 2.6137043327453715, + "learning_rate": 1.866624473916431e-05, + "loss": 0.8019, + "step": 2325 + }, + { + "epoch": 0.19140094630734417, + "grad_norm": 2.394976700091314, + "learning_rate": 1.8664914549264362e-05, + "loss": 0.8372, + "step": 2326 + }, + { + "epoch": 0.1914832339024892, + "grad_norm": 2.888092102030868, + "learning_rate": 1.866358374382274e-05, + "loss": 0.8372, + "step": 2327 + }, + { + "epoch": 0.19156552149763423, + "grad_norm": 2.6980010643491124, + "learning_rate": 1.8662252322933986e-05, + "loss": 0.847, + "step": 2328 + }, + { + "epoch": 0.19164780909277926, + "grad_norm": 2.796077785977032, + "learning_rate": 1.866092028669268e-05, + "loss": 0.8385, + "step": 2329 + }, + { + "epoch": 0.1917300966879243, + "grad_norm": 2.9754697600807343, + "learning_rate": 1.8659587635193447e-05, + "loss": 0.8177, + "step": 2330 + }, + { + "epoch": 0.19181238428306932, + "grad_norm": 2.5725791673462703, + "learning_rate": 1.865825436853096e-05, + "loss": 0.8363, + "step": 2331 + }, + { + "epoch": 0.19189467187821435, + "grad_norm": 2.4801240805526388, + "learning_rate": 1.8656920486799927e-05, + "loss": 0.8175, + "step": 2332 + }, + { + "epoch": 0.19197695947335938, + "grad_norm": 0.4704649399722642, + "learning_rate": 1.8655585990095105e-05, + "loss": 0.5492, + "step": 2333 + }, + { + "epoch": 0.1920592470685044, + "grad_norm": 2.529497640955037, + "learning_rate": 1.86542508785113e-05, + "loss": 0.81, + "step": 2334 + }, + { + "epoch": 0.19214153466364944, + "grad_norm": 0.4428561186683141, + "learning_rate": 1.8652915152143353e-05, + "loss": 0.5552, + "step": 2335 + }, + { + "epoch": 0.19222382225879447, + "grad_norm": 2.446374147239063, + "learning_rate": 1.8651578811086152e-05, + "loss": 0.8422, + "step": 2336 + }, + { + "epoch": 0.1923061098539395, + "grad_norm": 2.2281607455842605, + "learning_rate": 1.8650241855434625e-05, + "loss": 0.837, + "step": 2337 + }, + { + "epoch": 0.19238839744908456, + "grad_norm": 0.4770791744471931, + "learning_rate": 1.8648904285283754e-05, + "loss": 0.5636, + "step": 2338 + }, + { + "epoch": 0.1924706850442296, + "grad_norm": 2.8785971320310972, + "learning_rate": 1.8647566100728553e-05, + "loss": 0.8528, + "step": 2339 + }, + { + "epoch": 0.19255297263937463, + "grad_norm": 3.100990086853741, + "learning_rate": 1.864622730186409e-05, + "loss": 0.819, + "step": 2340 + }, + { + "epoch": 0.19263526023451966, + "grad_norm": 2.1552854912066515, + "learning_rate": 1.8644887888785464e-05, + "loss": 0.8404, + "step": 2341 + }, + { + "epoch": 0.1927175478296647, + "grad_norm": 2.2255369657187076, + "learning_rate": 1.8643547861587827e-05, + "loss": 0.8721, + "step": 2342 + }, + { + "epoch": 0.19279983542480972, + "grad_norm": 0.46985992482292277, + "learning_rate": 1.8642207220366373e-05, + "loss": 0.5565, + "step": 2343 + }, + { + "epoch": 0.19288212301995475, + "grad_norm": 2.0616661872374173, + "learning_rate": 1.8640865965216338e-05, + "loss": 0.8231, + "step": 2344 + }, + { + "epoch": 0.19296441061509978, + "grad_norm": 2.2729463116759927, + "learning_rate": 1.8639524096233008e-05, + "loss": 0.8361, + "step": 2345 + }, + { + "epoch": 0.1930466982102448, + "grad_norm": 2.417986376102915, + "learning_rate": 1.8638181613511702e-05, + "loss": 0.8203, + "step": 2346 + }, + { + "epoch": 0.19312898580538984, + "grad_norm": 2.0973418519154996, + "learning_rate": 1.8636838517147785e-05, + "loss": 0.8541, + "step": 2347 + }, + { + "epoch": 0.19321127340053487, + "grad_norm": 5.893917396678879, + "learning_rate": 1.8635494807236675e-05, + "loss": 0.8374, + "step": 2348 + }, + { + "epoch": 0.1932935609956799, + "grad_norm": 2.3484552007853186, + "learning_rate": 1.8634150483873824e-05, + "loss": 0.8721, + "step": 2349 + }, + { + "epoch": 0.19337584859082493, + "grad_norm": 2.1061799447137677, + "learning_rate": 1.8632805547154735e-05, + "loss": 0.8276, + "step": 2350 + }, + { + "epoch": 0.19345813618596996, + "grad_norm": 2.231070914752389, + "learning_rate": 1.8631459997174942e-05, + "loss": 0.8332, + "step": 2351 + }, + { + "epoch": 0.193540423781115, + "grad_norm": 0.4785412716293615, + "learning_rate": 1.8630113834030035e-05, + "loss": 0.5741, + "step": 2352 + }, + { + "epoch": 0.19362271137626003, + "grad_norm": 2.933088496173211, + "learning_rate": 1.8628767057815643e-05, + "loss": 0.8362, + "step": 2353 + }, + { + "epoch": 0.19370499897140506, + "grad_norm": 3.3919255724286694, + "learning_rate": 1.862741966862744e-05, + "loss": 0.8178, + "step": 2354 + }, + { + "epoch": 0.1937872865665501, + "grad_norm": 3.969290909229754, + "learning_rate": 1.862607166656114e-05, + "loss": 0.8415, + "step": 2355 + }, + { + "epoch": 0.19386957416169512, + "grad_norm": 0.43363832800498575, + "learning_rate": 1.8624723051712504e-05, + "loss": 0.5441, + "step": 2356 + }, + { + "epoch": 0.19395186175684015, + "grad_norm": 1.9923432140871553, + "learning_rate": 1.8623373824177337e-05, + "loss": 0.8504, + "step": 2357 + }, + { + "epoch": 0.19403414935198518, + "grad_norm": 2.138729060920641, + "learning_rate": 1.8622023984051486e-05, + "loss": 0.8531, + "step": 2358 + }, + { + "epoch": 0.1941164369471302, + "grad_norm": 2.996103398885607, + "learning_rate": 1.8620673531430835e-05, + "loss": 0.8295, + "step": 2359 + }, + { + "epoch": 0.19419872454227524, + "grad_norm": 1.9881513928273853, + "learning_rate": 1.8619322466411327e-05, + "loss": 0.8423, + "step": 2360 + }, + { + "epoch": 0.19428101213742027, + "grad_norm": 4.609204489054207, + "learning_rate": 1.8617970789088936e-05, + "loss": 0.8215, + "step": 2361 + }, + { + "epoch": 0.1943632997325653, + "grad_norm": 2.4870160713294323, + "learning_rate": 1.861661849955968e-05, + "loss": 0.8403, + "step": 2362 + }, + { + "epoch": 0.19444558732771033, + "grad_norm": 2.5663255039551793, + "learning_rate": 1.8615265597919628e-05, + "loss": 0.8504, + "step": 2363 + }, + { + "epoch": 0.1945278749228554, + "grad_norm": 0.4462116251981424, + "learning_rate": 1.8613912084264885e-05, + "loss": 0.5743, + "step": 2364 + }, + { + "epoch": 0.19461016251800042, + "grad_norm": 2.599011971955559, + "learning_rate": 1.8612557958691603e-05, + "loss": 0.8316, + "step": 2365 + }, + { + "epoch": 0.19469245011314545, + "grad_norm": 2.162713113376905, + "learning_rate": 1.861120322129598e-05, + "loss": 0.8602, + "step": 2366 + }, + { + "epoch": 0.19477473770829049, + "grad_norm": 2.427465370256237, + "learning_rate": 1.860984787217425e-05, + "loss": 0.852, + "step": 2367 + }, + { + "epoch": 0.19485702530343552, + "grad_norm": 2.1567218802548083, + "learning_rate": 1.8608491911422696e-05, + "loss": 0.7971, + "step": 2368 + }, + { + "epoch": 0.19493931289858055, + "grad_norm": 3.099411551421204, + "learning_rate": 1.860713533913764e-05, + "loss": 0.8112, + "step": 2369 + }, + { + "epoch": 0.19502160049372558, + "grad_norm": 0.458001012562976, + "learning_rate": 1.8605778155415462e-05, + "loss": 0.5699, + "step": 2370 + }, + { + "epoch": 0.1951038880888706, + "grad_norm": 2.490707822113723, + "learning_rate": 1.860442036035256e-05, + "loss": 0.8604, + "step": 2371 + }, + { + "epoch": 0.19518617568401564, + "grad_norm": 0.4231122112274931, + "learning_rate": 1.8603061954045404e-05, + "loss": 0.5447, + "step": 2372 + }, + { + "epoch": 0.19526846327916067, + "grad_norm": 0.43311431879242407, + "learning_rate": 1.860170293659048e-05, + "loss": 0.577, + "step": 2373 + }, + { + "epoch": 0.1953507508743057, + "grad_norm": 3.3753031020556534, + "learning_rate": 1.8600343308084338e-05, + "loss": 0.8346, + "step": 2374 + }, + { + "epoch": 0.19543303846945073, + "grad_norm": 2.504799282753451, + "learning_rate": 1.859898306862356e-05, + "loss": 0.8768, + "step": 2375 + }, + { + "epoch": 0.19551532606459576, + "grad_norm": 2.4364845514832423, + "learning_rate": 1.8597622218304775e-05, + "loss": 0.8097, + "step": 2376 + }, + { + "epoch": 0.1955976136597408, + "grad_norm": 0.43615558945077293, + "learning_rate": 1.8596260757224664e-05, + "loss": 0.5263, + "step": 2377 + }, + { + "epoch": 0.19567990125488582, + "grad_norm": 0.43233615700524114, + "learning_rate": 1.859489868547993e-05, + "loss": 0.5498, + "step": 2378 + }, + { + "epoch": 0.19576218885003085, + "grad_norm": 2.567122012275527, + "learning_rate": 1.8593536003167343e-05, + "loss": 0.8423, + "step": 2379 + }, + { + "epoch": 0.19584447644517589, + "grad_norm": 2.205559994325501, + "learning_rate": 1.8592172710383698e-05, + "loss": 0.8249, + "step": 2380 + }, + { + "epoch": 0.19592676404032092, + "grad_norm": 2.6574824466331655, + "learning_rate": 1.8590808807225848e-05, + "loss": 0.8404, + "step": 2381 + }, + { + "epoch": 0.19600905163546595, + "grad_norm": 4.540012850243885, + "learning_rate": 1.8589444293790676e-05, + "loss": 0.8608, + "step": 2382 + }, + { + "epoch": 0.19609133923061098, + "grad_norm": 2.376925428453061, + "learning_rate": 1.858807917017512e-05, + "loss": 0.8189, + "step": 2383 + }, + { + "epoch": 0.196173626825756, + "grad_norm": 3.0647815868191, + "learning_rate": 1.8586713436476157e-05, + "loss": 0.8576, + "step": 2384 + }, + { + "epoch": 0.19625591442090104, + "grad_norm": 2.205726772660387, + "learning_rate": 1.85853470927908e-05, + "loss": 0.8207, + "step": 2385 + }, + { + "epoch": 0.19633820201604607, + "grad_norm": 2.5188989094601104, + "learning_rate": 1.8583980139216118e-05, + "loss": 0.8313, + "step": 2386 + }, + { + "epoch": 0.1964204896111911, + "grad_norm": 3.4042284049906733, + "learning_rate": 1.8582612575849213e-05, + "loss": 0.8392, + "step": 2387 + }, + { + "epoch": 0.19650277720633613, + "grad_norm": 2.626590132398553, + "learning_rate": 1.858124440278724e-05, + "loss": 0.8152, + "step": 2388 + }, + { + "epoch": 0.19658506480148116, + "grad_norm": 0.4681042846841439, + "learning_rate": 1.8579875620127383e-05, + "loss": 0.55, + "step": 2389 + }, + { + "epoch": 0.19666735239662622, + "grad_norm": 2.616631543764943, + "learning_rate": 1.8578506227966888e-05, + "loss": 0.8676, + "step": 2390 + }, + { + "epoch": 0.19674963999177125, + "grad_norm": 2.7293629687423233, + "learning_rate": 1.857713622640303e-05, + "loss": 0.8578, + "step": 2391 + }, + { + "epoch": 0.19683192758691628, + "grad_norm": 2.7035619612336235, + "learning_rate": 1.8575765615533127e-05, + "loss": 0.8684, + "step": 2392 + }, + { + "epoch": 0.19691421518206131, + "grad_norm": 2.9052169281551765, + "learning_rate": 1.8574394395454553e-05, + "loss": 0.8565, + "step": 2393 + }, + { + "epoch": 0.19699650277720634, + "grad_norm": 2.7331128487533873, + "learning_rate": 1.8573022566264714e-05, + "loss": 0.8281, + "step": 2394 + }, + { + "epoch": 0.19707879037235138, + "grad_norm": 3.636132586598684, + "learning_rate": 1.857165012806106e-05, + "loss": 0.8503, + "step": 2395 + }, + { + "epoch": 0.1971610779674964, + "grad_norm": 0.4439175289748809, + "learning_rate": 1.8570277080941094e-05, + "loss": 0.5331, + "step": 2396 + }, + { + "epoch": 0.19724336556264144, + "grad_norm": 2.6806719070401916, + "learning_rate": 1.8568903425002345e-05, + "loss": 0.8419, + "step": 2397 + }, + { + "epoch": 0.19732565315778647, + "grad_norm": 2.6079792907790753, + "learning_rate": 1.8567529160342402e-05, + "loss": 0.8212, + "step": 2398 + }, + { + "epoch": 0.1974079407529315, + "grad_norm": 3.2683764361117613, + "learning_rate": 1.8566154287058893e-05, + "loss": 0.8368, + "step": 2399 + }, + { + "epoch": 0.19749022834807653, + "grad_norm": 2.7142642134286192, + "learning_rate": 1.8564778805249478e-05, + "loss": 0.8486, + "step": 2400 + }, + { + "epoch": 0.19757251594322156, + "grad_norm": 3.1786352003299063, + "learning_rate": 1.856340271501188e-05, + "loss": 0.8377, + "step": 2401 + }, + { + "epoch": 0.1976548035383666, + "grad_norm": 0.4578846665888037, + "learning_rate": 1.856202601644384e-05, + "loss": 0.5602, + "step": 2402 + }, + { + "epoch": 0.19773709113351162, + "grad_norm": 3.3414556761978127, + "learning_rate": 1.856064870964317e-05, + "loss": 0.8377, + "step": 2403 + }, + { + "epoch": 0.19781937872865665, + "grad_norm": 3.5318250292925404, + "learning_rate": 1.8559270794707705e-05, + "loss": 0.8063, + "step": 2404 + }, + { + "epoch": 0.19790166632380168, + "grad_norm": 5.51079782302022, + "learning_rate": 1.855789227173533e-05, + "loss": 0.8297, + "step": 2405 + }, + { + "epoch": 0.19798395391894671, + "grad_norm": 2.7529644026691935, + "learning_rate": 1.855651314082398e-05, + "loss": 0.8244, + "step": 2406 + }, + { + "epoch": 0.19806624151409175, + "grad_norm": 2.5494023443174614, + "learning_rate": 1.8555133402071614e-05, + "loss": 0.8445, + "step": 2407 + }, + { + "epoch": 0.19814852910923678, + "grad_norm": 4.775236277519297, + "learning_rate": 1.8553753055576254e-05, + "loss": 0.8293, + "step": 2408 + }, + { + "epoch": 0.1982308167043818, + "grad_norm": 7.835477243177075, + "learning_rate": 1.8552372101435962e-05, + "loss": 0.8173, + "step": 2409 + }, + { + "epoch": 0.19831310429952684, + "grad_norm": 0.44220497880015885, + "learning_rate": 1.855099053974883e-05, + "loss": 0.5415, + "step": 2410 + }, + { + "epoch": 0.19839539189467187, + "grad_norm": 0.4449089177486766, + "learning_rate": 1.8549608370613006e-05, + "loss": 0.5428, + "step": 2411 + }, + { + "epoch": 0.1984776794898169, + "grad_norm": 3.004236595412859, + "learning_rate": 1.8548225594126675e-05, + "loss": 0.8524, + "step": 2412 + }, + { + "epoch": 0.19855996708496193, + "grad_norm": 3.1225989803474627, + "learning_rate": 1.8546842210388068e-05, + "loss": 0.9006, + "step": 2413 + }, + { + "epoch": 0.19864225468010696, + "grad_norm": 2.8258591055606126, + "learning_rate": 1.854545821949546e-05, + "loss": 0.8293, + "step": 2414 + }, + { + "epoch": 0.198724542275252, + "grad_norm": 2.913102608268323, + "learning_rate": 1.8544073621547166e-05, + "loss": 0.8325, + "step": 2415 + }, + { + "epoch": 0.19880682987039705, + "grad_norm": 2.694231768464871, + "learning_rate": 1.854268841664155e-05, + "loss": 0.8299, + "step": 2416 + }, + { + "epoch": 0.19888911746554208, + "grad_norm": 2.548369516157711, + "learning_rate": 1.8541302604877006e-05, + "loss": 0.8597, + "step": 2417 + }, + { + "epoch": 0.1989714050606871, + "grad_norm": 3.8359828941267007, + "learning_rate": 1.8539916186351984e-05, + "loss": 0.8384, + "step": 2418 + }, + { + "epoch": 0.19905369265583214, + "grad_norm": 0.4921292366844118, + "learning_rate": 1.8538529161164977e-05, + "loss": 0.5389, + "step": 2419 + }, + { + "epoch": 0.19913598025097717, + "grad_norm": 2.626437336085917, + "learning_rate": 1.8537141529414516e-05, + "loss": 0.8366, + "step": 2420 + }, + { + "epoch": 0.1992182678461222, + "grad_norm": 2.9140289684900327, + "learning_rate": 1.853575329119917e-05, + "loss": 0.8311, + "step": 2421 + }, + { + "epoch": 0.19930055544126724, + "grad_norm": 2.785950193969917, + "learning_rate": 1.8534364446617564e-05, + "loss": 0.8464, + "step": 2422 + }, + { + "epoch": 0.19938284303641227, + "grad_norm": 2.57493998214015, + "learning_rate": 1.853297499576835e-05, + "loss": 0.8371, + "step": 2423 + }, + { + "epoch": 0.1994651306315573, + "grad_norm": 3.038908047752371, + "learning_rate": 1.8531584938750248e-05, + "loss": 0.8134, + "step": 2424 + }, + { + "epoch": 0.19954741822670233, + "grad_norm": 3.2786711756817204, + "learning_rate": 1.8530194275661988e-05, + "loss": 0.8103, + "step": 2425 + }, + { + "epoch": 0.19962970582184736, + "grad_norm": 2.963800782951755, + "learning_rate": 1.852880300660237e-05, + "loss": 0.8317, + "step": 2426 + }, + { + "epoch": 0.1997119934169924, + "grad_norm": 3.337774402134451, + "learning_rate": 1.852741113167023e-05, + "loss": 0.8266, + "step": 2427 + }, + { + "epoch": 0.19979428101213742, + "grad_norm": 2.5929113251908276, + "learning_rate": 1.852601865096444e-05, + "loss": 0.8395, + "step": 2428 + }, + { + "epoch": 0.19987656860728245, + "grad_norm": 2.5937196786890477, + "learning_rate": 1.852462556458392e-05, + "loss": 0.8333, + "step": 2429 + }, + { + "epoch": 0.19995885620242748, + "grad_norm": 0.5152107323732656, + "learning_rate": 1.852323187262763e-05, + "loss": 0.557, + "step": 2430 + }, + { + "epoch": 0.2000411437975725, + "grad_norm": 3.3407100425128715, + "learning_rate": 1.8521837575194583e-05, + "loss": 0.8529, + "step": 2431 + }, + { + "epoch": 0.20012343139271754, + "grad_norm": 2.6538977779080963, + "learning_rate": 1.852044267238382e-05, + "loss": 0.8414, + "step": 2432 + }, + { + "epoch": 0.20020571898786257, + "grad_norm": 2.314162147595894, + "learning_rate": 1.851904716429444e-05, + "loss": 0.8125, + "step": 2433 + }, + { + "epoch": 0.2002880065830076, + "grad_norm": 2.2460408127807168, + "learning_rate": 1.851765105102557e-05, + "loss": 0.8334, + "step": 2434 + }, + { + "epoch": 0.20037029417815264, + "grad_norm": 3.0959390544791385, + "learning_rate": 1.8516254332676393e-05, + "loss": 0.8331, + "step": 2435 + }, + { + "epoch": 0.20045258177329767, + "grad_norm": 2.5566174122477814, + "learning_rate": 1.8514857009346125e-05, + "loss": 0.8476, + "step": 2436 + }, + { + "epoch": 0.2005348693684427, + "grad_norm": 0.44869119102794647, + "learning_rate": 1.8513459081134036e-05, + "loss": 0.5641, + "step": 2437 + }, + { + "epoch": 0.20061715696358773, + "grad_norm": 0.45617551602186374, + "learning_rate": 1.8512060548139427e-05, + "loss": 0.5813, + "step": 2438 + }, + { + "epoch": 0.20069944455873276, + "grad_norm": 0.43039996828216154, + "learning_rate": 1.8510661410461655e-05, + "loss": 0.5552, + "step": 2439 + }, + { + "epoch": 0.2007817321538778, + "grad_norm": 0.4327012016490628, + "learning_rate": 1.8509261668200103e-05, + "loss": 0.5479, + "step": 2440 + }, + { + "epoch": 0.20086401974902285, + "grad_norm": 2.525208048517637, + "learning_rate": 1.8507861321454207e-05, + "loss": 0.8417, + "step": 2441 + }, + { + "epoch": 0.20094630734416788, + "grad_norm": 2.311404228985444, + "learning_rate": 1.8506460370323452e-05, + "loss": 0.8064, + "step": 2442 + }, + { + "epoch": 0.2010285949393129, + "grad_norm": 2.039238767153375, + "learning_rate": 1.8505058814907358e-05, + "loss": 0.8351, + "step": 2443 + }, + { + "epoch": 0.20111088253445794, + "grad_norm": 0.47468901596411156, + "learning_rate": 1.8503656655305488e-05, + "loss": 0.538, + "step": 2444 + }, + { + "epoch": 0.20119317012960297, + "grad_norm": 2.275462288576652, + "learning_rate": 1.8502253891617447e-05, + "loss": 0.8254, + "step": 2445 + }, + { + "epoch": 0.201275457724748, + "grad_norm": 2.1855146818106106, + "learning_rate": 1.8500850523942886e-05, + "loss": 0.8405, + "step": 2446 + }, + { + "epoch": 0.20135774531989303, + "grad_norm": 2.4976463221502985, + "learning_rate": 1.84994465523815e-05, + "loss": 0.8452, + "step": 2447 + }, + { + "epoch": 0.20144003291503806, + "grad_norm": 2.367625706280731, + "learning_rate": 1.8498041977033027e-05, + "loss": 0.8341, + "step": 2448 + }, + { + "epoch": 0.2015223205101831, + "grad_norm": 2.260678210056219, + "learning_rate": 1.8496636797997238e-05, + "loss": 0.8404, + "step": 2449 + }, + { + "epoch": 0.20160460810532813, + "grad_norm": 2.5435304382753827, + "learning_rate": 1.849523101537396e-05, + "loss": 0.8238, + "step": 2450 + }, + { + "epoch": 0.20168689570047316, + "grad_norm": 2.6618203820717263, + "learning_rate": 1.8493824629263058e-05, + "loss": 0.8193, + "step": 2451 + }, + { + "epoch": 0.2017691832956182, + "grad_norm": 2.6893590793056763, + "learning_rate": 1.8492417639764438e-05, + "loss": 0.85, + "step": 2452 + }, + { + "epoch": 0.20185147089076322, + "grad_norm": 0.48216153771983744, + "learning_rate": 1.849101004697805e-05, + "loss": 0.5712, + "step": 2453 + }, + { + "epoch": 0.20193375848590825, + "grad_norm": 2.5294000953688967, + "learning_rate": 1.8489601851003888e-05, + "loss": 0.8515, + "step": 2454 + }, + { + "epoch": 0.20201604608105328, + "grad_norm": 0.4643243447618097, + "learning_rate": 1.8488193051941986e-05, + "loss": 0.5578, + "step": 2455 + }, + { + "epoch": 0.2020983336761983, + "grad_norm": 2.319559503794457, + "learning_rate": 1.8486783649892426e-05, + "loss": 0.8621, + "step": 2456 + }, + { + "epoch": 0.20218062127134334, + "grad_norm": 2.4512435702486406, + "learning_rate": 1.848537364495533e-05, + "loss": 0.7958, + "step": 2457 + }, + { + "epoch": 0.20226290886648837, + "grad_norm": 2.6411049911355655, + "learning_rate": 1.848396303723086e-05, + "loss": 0.8378, + "step": 2458 + }, + { + "epoch": 0.2023451964616334, + "grad_norm": 2.8276073390762066, + "learning_rate": 1.8482551826819222e-05, + "loss": 0.8325, + "step": 2459 + }, + { + "epoch": 0.20242748405677843, + "grad_norm": 2.2864955529084185, + "learning_rate": 1.848114001382067e-05, + "loss": 0.8112, + "step": 2460 + }, + { + "epoch": 0.20250977165192346, + "grad_norm": 2.0420321250633875, + "learning_rate": 1.847972759833549e-05, + "loss": 0.8234, + "step": 2461 + }, + { + "epoch": 0.2025920592470685, + "grad_norm": 0.48223680461587926, + "learning_rate": 1.847831458046403e-05, + "loss": 0.554, + "step": 2462 + }, + { + "epoch": 0.20267434684221353, + "grad_norm": 2.129881382514526, + "learning_rate": 1.8476900960306652e-05, + "loss": 0.8267, + "step": 2463 + }, + { + "epoch": 0.20275663443735856, + "grad_norm": 2.859892646285134, + "learning_rate": 1.8475486737963796e-05, + "loss": 0.8667, + "step": 2464 + }, + { + "epoch": 0.2028389220325036, + "grad_norm": 2.624248665968573, + "learning_rate": 1.847407191353591e-05, + "loss": 0.8523, + "step": 2465 + }, + { + "epoch": 0.20292120962764862, + "grad_norm": 2.8516232385657068, + "learning_rate": 1.847265648712351e-05, + "loss": 0.8541, + "step": 2466 + }, + { + "epoch": 0.20300349722279368, + "grad_norm": 2.6055637034912404, + "learning_rate": 1.8471240458827146e-05, + "loss": 0.8502, + "step": 2467 + }, + { + "epoch": 0.2030857848179387, + "grad_norm": 2.4577761773176783, + "learning_rate": 1.8469823828747407e-05, + "loss": 0.8413, + "step": 2468 + }, + { + "epoch": 0.20316807241308374, + "grad_norm": 2.47006715525194, + "learning_rate": 1.8468406596984926e-05, + "loss": 0.8081, + "step": 2469 + }, + { + "epoch": 0.20325036000822877, + "grad_norm": 2.5083809333937372, + "learning_rate": 1.8466988763640384e-05, + "loss": 0.833, + "step": 2470 + }, + { + "epoch": 0.2033326476033738, + "grad_norm": 2.2500043659647817, + "learning_rate": 1.84655703288145e-05, + "loss": 0.8458, + "step": 2471 + }, + { + "epoch": 0.20341493519851883, + "grad_norm": 0.47109708584420956, + "learning_rate": 1.846415129260804e-05, + "loss": 0.533, + "step": 2472 + }, + { + "epoch": 0.20349722279366386, + "grad_norm": 2.735180153674388, + "learning_rate": 1.846273165512181e-05, + "loss": 0.8279, + "step": 2473 + }, + { + "epoch": 0.2035795103888089, + "grad_norm": 2.5548649604074583, + "learning_rate": 1.8461311416456656e-05, + "loss": 0.8504, + "step": 2474 + }, + { + "epoch": 0.20366179798395392, + "grad_norm": 2.7375890325929926, + "learning_rate": 1.845989057671347e-05, + "loss": 0.8506, + "step": 2475 + }, + { + "epoch": 0.20374408557909895, + "grad_norm": 2.218342428166774, + "learning_rate": 1.8458469135993188e-05, + "loss": 0.8416, + "step": 2476 + }, + { + "epoch": 0.20382637317424399, + "grad_norm": 2.44498954622628, + "learning_rate": 1.845704709439679e-05, + "loss": 0.7982, + "step": 2477 + }, + { + "epoch": 0.20390866076938902, + "grad_norm": 2.3257198537986996, + "learning_rate": 1.8455624452025284e-05, + "loss": 0.8516, + "step": 2478 + }, + { + "epoch": 0.20399094836453405, + "grad_norm": 3.589919113908559, + "learning_rate": 1.845420120897974e-05, + "loss": 0.848, + "step": 2479 + }, + { + "epoch": 0.20407323595967908, + "grad_norm": 2.6683764973422375, + "learning_rate": 1.8452777365361266e-05, + "loss": 0.843, + "step": 2480 + }, + { + "epoch": 0.2041555235548241, + "grad_norm": 0.46016880736381627, + "learning_rate": 1.8451352921271007e-05, + "loss": 0.5541, + "step": 2481 + }, + { + "epoch": 0.20423781114996914, + "grad_norm": 0.45308129612654763, + "learning_rate": 1.844992787681015e-05, + "loss": 0.5546, + "step": 2482 + }, + { + "epoch": 0.20432009874511417, + "grad_norm": 2.163903037214254, + "learning_rate": 1.8448502232079933e-05, + "loss": 0.8245, + "step": 2483 + }, + { + "epoch": 0.2044023863402592, + "grad_norm": 2.5581655100420733, + "learning_rate": 1.844707598718163e-05, + "loss": 0.7763, + "step": 2484 + }, + { + "epoch": 0.20448467393540423, + "grad_norm": 2.288182080148139, + "learning_rate": 1.8445649142216553e-05, + "loss": 0.8515, + "step": 2485 + }, + { + "epoch": 0.20456696153054926, + "grad_norm": 2.559408657335864, + "learning_rate": 1.844422169728607e-05, + "loss": 0.8182, + "step": 2486 + }, + { + "epoch": 0.2046492491256943, + "grad_norm": 2.2948972501014118, + "learning_rate": 1.8442793652491583e-05, + "loss": 0.816, + "step": 2487 + }, + { + "epoch": 0.20473153672083932, + "grad_norm": 3.059449870890419, + "learning_rate": 1.8441365007934537e-05, + "loss": 0.8346, + "step": 2488 + }, + { + "epoch": 0.20481382431598436, + "grad_norm": 2.3426692532650955, + "learning_rate": 1.843993576371642e-05, + "loss": 0.8382, + "step": 2489 + }, + { + "epoch": 0.20489611191112939, + "grad_norm": 3.668869085131925, + "learning_rate": 1.8438505919938764e-05, + "loss": 0.8097, + "step": 2490 + }, + { + "epoch": 0.20497839950627442, + "grad_norm": 0.4843583755464429, + "learning_rate": 1.8437075476703145e-05, + "loss": 0.5594, + "step": 2491 + }, + { + "epoch": 0.20506068710141945, + "grad_norm": 2.9411304009446413, + "learning_rate": 1.8435644434111172e-05, + "loss": 0.8315, + "step": 2492 + }, + { + "epoch": 0.2051429746965645, + "grad_norm": 3.0347377147594616, + "learning_rate": 1.8434212792264512e-05, + "loss": 0.8413, + "step": 2493 + }, + { + "epoch": 0.20522526229170954, + "grad_norm": 2.948040171867707, + "learning_rate": 1.8432780551264864e-05, + "loss": 0.8287, + "step": 2494 + }, + { + "epoch": 0.20530754988685457, + "grad_norm": 2.526042320792797, + "learning_rate": 1.8431347711213975e-05, + "loss": 0.8192, + "step": 2495 + }, + { + "epoch": 0.2053898374819996, + "grad_norm": 2.4529626679650933, + "learning_rate": 1.8429914272213624e-05, + "loss": 0.8374, + "step": 2496 + }, + { + "epoch": 0.20547212507714463, + "grad_norm": 3.0109020139279776, + "learning_rate": 1.8428480234365648e-05, + "loss": 0.8538, + "step": 2497 + }, + { + "epoch": 0.20555441267228966, + "grad_norm": 2.7309518930126084, + "learning_rate": 1.8427045597771915e-05, + "loss": 0.8288, + "step": 2498 + }, + { + "epoch": 0.2056367002674347, + "grad_norm": 2.7571851152572324, + "learning_rate": 1.8425610362534336e-05, + "loss": 0.818, + "step": 2499 + }, + { + "epoch": 0.20571898786257972, + "grad_norm": 3.6541677601404463, + "learning_rate": 1.8424174528754874e-05, + "loss": 0.8242, + "step": 2500 + }, + { + "epoch": 0.20580127545772475, + "grad_norm": 3.2673973935668363, + "learning_rate": 1.8422738096535528e-05, + "loss": 0.8559, + "step": 2501 + }, + { + "epoch": 0.20588356305286978, + "grad_norm": 3.280268088475709, + "learning_rate": 1.8421301065978336e-05, + "loss": 0.8289, + "step": 2502 + }, + { + "epoch": 0.20596585064801481, + "grad_norm": 3.644658448351934, + "learning_rate": 1.8419863437185385e-05, + "loss": 0.8492, + "step": 2503 + }, + { + "epoch": 0.20604813824315985, + "grad_norm": 3.9409239786685584, + "learning_rate": 1.84184252102588e-05, + "loss": 0.861, + "step": 2504 + }, + { + "epoch": 0.20613042583830488, + "grad_norm": 2.770614222633164, + "learning_rate": 1.841698638530075e-05, + "loss": 0.8466, + "step": 2505 + }, + { + "epoch": 0.2062127134334499, + "grad_norm": 2.6802796451434037, + "learning_rate": 1.841554696241345e-05, + "loss": 0.8178, + "step": 2506 + }, + { + "epoch": 0.20629500102859494, + "grad_norm": 5.256236524078497, + "learning_rate": 1.8414106941699152e-05, + "loss": 0.8525, + "step": 2507 + }, + { + "epoch": 0.20637728862373997, + "grad_norm": 2.8550407022060478, + "learning_rate": 1.8412666323260154e-05, + "loss": 0.8244, + "step": 2508 + }, + { + "epoch": 0.206459576218885, + "grad_norm": 2.837058387140895, + "learning_rate": 1.8411225107198795e-05, + "loss": 0.8158, + "step": 2509 + }, + { + "epoch": 0.20654186381403003, + "grad_norm": 3.2657972442124312, + "learning_rate": 1.8409783293617454e-05, + "loss": 0.8482, + "step": 2510 + }, + { + "epoch": 0.20662415140917506, + "grad_norm": 3.107298788236777, + "learning_rate": 1.8408340882618557e-05, + "loss": 0.8323, + "step": 2511 + }, + { + "epoch": 0.2067064390043201, + "grad_norm": 3.7888502892935323, + "learning_rate": 1.8406897874304576e-05, + "loss": 0.8402, + "step": 2512 + }, + { + "epoch": 0.20678872659946512, + "grad_norm": 2.793788251488496, + "learning_rate": 1.840545426877801e-05, + "loss": 0.8629, + "step": 2513 + }, + { + "epoch": 0.20687101419461015, + "grad_norm": 2.7013088321665997, + "learning_rate": 1.8404010066141414e-05, + "loss": 0.8053, + "step": 2514 + }, + { + "epoch": 0.20695330178975518, + "grad_norm": 3.230473671989884, + "learning_rate": 1.840256526649739e-05, + "loss": 0.8322, + "step": 2515 + }, + { + "epoch": 0.20703558938490021, + "grad_norm": 4.101974661085115, + "learning_rate": 1.840111986994856e-05, + "loss": 0.8134, + "step": 2516 + }, + { + "epoch": 0.20711787698004525, + "grad_norm": 4.152939298483347, + "learning_rate": 1.8399673876597615e-05, + "loss": 0.8474, + "step": 2517 + }, + { + "epoch": 0.20720016457519028, + "grad_norm": 3.0769463216688115, + "learning_rate": 1.839822728654727e-05, + "loss": 0.8299, + "step": 2518 + }, + { + "epoch": 0.20728245217033534, + "grad_norm": 2.9770483204175453, + "learning_rate": 1.8396780099900287e-05, + "loss": 0.8034, + "step": 2519 + }, + { + "epoch": 0.20736473976548037, + "grad_norm": 10.913510804926778, + "learning_rate": 1.8395332316759474e-05, + "loss": 0.8102, + "step": 2520 + }, + { + "epoch": 0.2074470273606254, + "grad_norm": 3.301389018336796, + "learning_rate": 1.8393883937227682e-05, + "loss": 0.8269, + "step": 2521 + }, + { + "epoch": 0.20752931495577043, + "grad_norm": 3.9626609639773585, + "learning_rate": 1.83924349614078e-05, + "loss": 0.8484, + "step": 2522 + }, + { + "epoch": 0.20761160255091546, + "grad_norm": 3.308462805380347, + "learning_rate": 1.8390985389402757e-05, + "loss": 0.8193, + "step": 2523 + }, + { + "epoch": 0.2076938901460605, + "grad_norm": 3.446980055063646, + "learning_rate": 1.8389535221315533e-05, + "loss": 0.8165, + "step": 2524 + }, + { + "epoch": 0.20777617774120552, + "grad_norm": 3.361166331767642, + "learning_rate": 1.8388084457249145e-05, + "loss": 0.8403, + "step": 2525 + }, + { + "epoch": 0.20785846533635055, + "grad_norm": 0.4489733132461202, + "learning_rate": 1.8386633097306652e-05, + "loss": 0.5779, + "step": 2526 + }, + { + "epoch": 0.20794075293149558, + "grad_norm": 3.3763074588542126, + "learning_rate": 1.8385181141591155e-05, + "loss": 0.8355, + "step": 2527 + }, + { + "epoch": 0.2080230405266406, + "grad_norm": 3.1426058775191716, + "learning_rate": 1.83837285902058e-05, + "loss": 0.8228, + "step": 2528 + }, + { + "epoch": 0.20810532812178564, + "grad_norm": 3.408831364628426, + "learning_rate": 1.838227544325377e-05, + "loss": 0.8374, + "step": 2529 + }, + { + "epoch": 0.20818761571693067, + "grad_norm": 3.4386486634409086, + "learning_rate": 1.8380821700838306e-05, + "loss": 0.8511, + "step": 2530 + }, + { + "epoch": 0.2082699033120757, + "grad_norm": 3.2638778987282717, + "learning_rate": 1.8379367363062667e-05, + "loss": 0.8327, + "step": 2531 + }, + { + "epoch": 0.20835219090722074, + "grad_norm": 3.5679775987459994, + "learning_rate": 1.8377912430030172e-05, + "loss": 0.8329, + "step": 2532 + }, + { + "epoch": 0.20843447850236577, + "grad_norm": 3.2528208627335955, + "learning_rate": 1.8376456901844174e-05, + "loss": 0.8053, + "step": 2533 + }, + { + "epoch": 0.2085167660975108, + "grad_norm": 3.264120422841936, + "learning_rate": 1.8375000778608077e-05, + "loss": 0.825, + "step": 2534 + }, + { + "epoch": 0.20859905369265583, + "grad_norm": 0.45110572381254516, + "learning_rate": 1.8373544060425318e-05, + "loss": 0.5506, + "step": 2535 + }, + { + "epoch": 0.20868134128780086, + "grad_norm": 2.593216453166507, + "learning_rate": 1.8372086747399377e-05, + "loss": 0.8277, + "step": 2536 + }, + { + "epoch": 0.2087636288829459, + "grad_norm": 4.31170410793178, + "learning_rate": 1.8370628839633786e-05, + "loss": 0.8103, + "step": 2537 + }, + { + "epoch": 0.20884591647809092, + "grad_norm": 3.069700353630395, + "learning_rate": 1.836917033723211e-05, + "loss": 0.8451, + "step": 2538 + }, + { + "epoch": 0.20892820407323595, + "grad_norm": 2.727415507565117, + "learning_rate": 1.8367711240297955e-05, + "loss": 0.8193, + "step": 2539 + }, + { + "epoch": 0.20901049166838098, + "grad_norm": 2.583035618535975, + "learning_rate": 1.836625154893498e-05, + "loss": 0.8272, + "step": 2540 + }, + { + "epoch": 0.209092779263526, + "grad_norm": 2.8510094205198984, + "learning_rate": 1.8364791263246872e-05, + "loss": 0.8416, + "step": 2541 + }, + { + "epoch": 0.20917506685867104, + "grad_norm": 2.5411935858791237, + "learning_rate": 1.8363330383337367e-05, + "loss": 0.8165, + "step": 2542 + }, + { + "epoch": 0.20925735445381607, + "grad_norm": 3.818919343874717, + "learning_rate": 1.8361868909310252e-05, + "loss": 0.816, + "step": 2543 + }, + { + "epoch": 0.2093396420489611, + "grad_norm": 3.227265299702525, + "learning_rate": 1.836040684126934e-05, + "loss": 0.8406, + "step": 2544 + }, + { + "epoch": 0.20942192964410616, + "grad_norm": 2.8657353537124775, + "learning_rate": 1.8358944179318493e-05, + "loss": 0.8013, + "step": 2545 + }, + { + "epoch": 0.2095042172392512, + "grad_norm": 2.834396947381362, + "learning_rate": 1.8357480923561626e-05, + "loss": 0.816, + "step": 2546 + }, + { + "epoch": 0.20958650483439623, + "grad_norm": 3.236954335848728, + "learning_rate": 1.835601707410268e-05, + "loss": 0.8182, + "step": 2547 + }, + { + "epoch": 0.20966879242954126, + "grad_norm": 3.2899460403922824, + "learning_rate": 1.835455263104564e-05, + "loss": 0.8374, + "step": 2548 + }, + { + "epoch": 0.2097510800246863, + "grad_norm": 3.436877513077752, + "learning_rate": 1.8353087594494543e-05, + "loss": 0.8474, + "step": 2549 + }, + { + "epoch": 0.20983336761983132, + "grad_norm": 2.9944542174818594, + "learning_rate": 1.8351621964553463e-05, + "loss": 0.84, + "step": 2550 + }, + { + "epoch": 0.20991565521497635, + "grad_norm": 3.6190473827362992, + "learning_rate": 1.8350155741326518e-05, + "loss": 0.8452, + "step": 2551 + }, + { + "epoch": 0.20999794281012138, + "grad_norm": 2.4354433594418294, + "learning_rate": 1.834868892491786e-05, + "loss": 0.8122, + "step": 2552 + }, + { + "epoch": 0.2100802304052664, + "grad_norm": 2.5426108248638855, + "learning_rate": 1.8347221515431692e-05, + "loss": 0.7887, + "step": 2553 + }, + { + "epoch": 0.21016251800041144, + "grad_norm": 3.25334084668476, + "learning_rate": 1.8345753512972258e-05, + "loss": 0.8431, + "step": 2554 + }, + { + "epoch": 0.21024480559555647, + "grad_norm": 2.7030206230832645, + "learning_rate": 1.834428491764384e-05, + "loss": 0.8317, + "step": 2555 + }, + { + "epoch": 0.2103270931907015, + "grad_norm": 0.47494536114521607, + "learning_rate": 1.834281572955077e-05, + "loss": 0.5453, + "step": 2556 + }, + { + "epoch": 0.21040938078584653, + "grad_norm": 4.098758048826732, + "learning_rate": 1.834134594879741e-05, + "loss": 0.8475, + "step": 2557 + }, + { + "epoch": 0.21049166838099156, + "grad_norm": 3.3335138581703943, + "learning_rate": 1.8339875575488176e-05, + "loss": 0.8151, + "step": 2558 + }, + { + "epoch": 0.2105739559761366, + "grad_norm": 3.66003173031739, + "learning_rate": 1.8338404609727517e-05, + "loss": 0.8462, + "step": 2559 + }, + { + "epoch": 0.21065624357128163, + "grad_norm": 3.3051867621868847, + "learning_rate": 1.833693305161993e-05, + "loss": 0.8143, + "step": 2560 + }, + { + "epoch": 0.21073853116642666, + "grad_norm": 3.3695611559510197, + "learning_rate": 1.833546090126995e-05, + "loss": 0.8315, + "step": 2561 + }, + { + "epoch": 0.2108208187615717, + "grad_norm": 0.4589791313932974, + "learning_rate": 1.8333988158782162e-05, + "loss": 0.5253, + "step": 2562 + }, + { + "epoch": 0.21090310635671672, + "grad_norm": 3.5936763964991374, + "learning_rate": 1.833251482426118e-05, + "loss": 0.8455, + "step": 2563 + }, + { + "epoch": 0.21098539395186175, + "grad_norm": 3.2932378054726503, + "learning_rate": 1.8331040897811672e-05, + "loss": 0.8313, + "step": 2564 + }, + { + "epoch": 0.21106768154700678, + "grad_norm": 3.232556623686311, + "learning_rate": 1.8329566379538342e-05, + "loss": 0.8091, + "step": 2565 + }, + { + "epoch": 0.2111499691421518, + "grad_norm": 3.104605617968999, + "learning_rate": 1.832809126954594e-05, + "loss": 0.8163, + "step": 2566 + }, + { + "epoch": 0.21123225673729684, + "grad_norm": 3.496201671234622, + "learning_rate": 1.832661556793925e-05, + "loss": 0.8261, + "step": 2567 + }, + { + "epoch": 0.21131454433244187, + "grad_norm": 3.136057053194723, + "learning_rate": 1.8325139274823108e-05, + "loss": 0.8178, + "step": 2568 + }, + { + "epoch": 0.2113968319275869, + "grad_norm": 3.0653813480017713, + "learning_rate": 1.8323662390302385e-05, + "loss": 0.8255, + "step": 2569 + }, + { + "epoch": 0.21147911952273196, + "grad_norm": 3.8601064789131834, + "learning_rate": 1.8322184914482e-05, + "loss": 0.8372, + "step": 2570 + }, + { + "epoch": 0.211561407117877, + "grad_norm": 2.8367654502679, + "learning_rate": 1.8320706847466905e-05, + "loss": 0.8198, + "step": 2571 + }, + { + "epoch": 0.21164369471302202, + "grad_norm": 4.618800398679875, + "learning_rate": 1.8319228189362105e-05, + "loss": 0.8153, + "step": 2572 + }, + { + "epoch": 0.21172598230816705, + "grad_norm": 3.2117553898555675, + "learning_rate": 1.8317748940272637e-05, + "loss": 0.799, + "step": 2573 + }, + { + "epoch": 0.21180826990331209, + "grad_norm": 3.427210044976752, + "learning_rate": 1.8316269100303586e-05, + "loss": 0.8151, + "step": 2574 + }, + { + "epoch": 0.21189055749845712, + "grad_norm": 3.5133491900994476, + "learning_rate": 1.8314788669560083e-05, + "loss": 0.8532, + "step": 2575 + }, + { + "epoch": 0.21197284509360215, + "grad_norm": 3.720043866224015, + "learning_rate": 1.8313307648147286e-05, + "loss": 0.8238, + "step": 2576 + }, + { + "epoch": 0.21205513268874718, + "grad_norm": 3.635938030216419, + "learning_rate": 1.831182603617041e-05, + "loss": 0.8029, + "step": 2577 + }, + { + "epoch": 0.2121374202838922, + "grad_norm": 3.6691540244521414, + "learning_rate": 1.8310343833734704e-05, + "loss": 0.8238, + "step": 2578 + }, + { + "epoch": 0.21221970787903724, + "grad_norm": 2.9683873093103847, + "learning_rate": 1.830886104094546e-05, + "loss": 0.8452, + "step": 2579 + }, + { + "epoch": 0.21230199547418227, + "grad_norm": 3.92284826064443, + "learning_rate": 1.830737765790802e-05, + "loss": 0.7959, + "step": 2580 + }, + { + "epoch": 0.2123842830693273, + "grad_norm": 6.433500257947444, + "learning_rate": 1.8305893684727758e-05, + "loss": 0.8113, + "step": 2581 + }, + { + "epoch": 0.21246657066447233, + "grad_norm": 0.4783127403947231, + "learning_rate": 1.830440912151009e-05, + "loss": 0.5652, + "step": 2582 + }, + { + "epoch": 0.21254885825961736, + "grad_norm": 3.060683872511528, + "learning_rate": 1.8302923968360476e-05, + "loss": 0.8024, + "step": 2583 + }, + { + "epoch": 0.2126311458547624, + "grad_norm": 3.0053452613095413, + "learning_rate": 1.830143822538443e-05, + "loss": 0.8557, + "step": 2584 + }, + { + "epoch": 0.21271343344990742, + "grad_norm": 0.4673855917288871, + "learning_rate": 1.829995189268748e-05, + "loss": 0.5718, + "step": 2585 + }, + { + "epoch": 0.21279572104505246, + "grad_norm": 6.640886036898429, + "learning_rate": 1.8298464970375228e-05, + "loss": 0.831, + "step": 2586 + }, + { + "epoch": 0.21287800864019749, + "grad_norm": 0.42158033064070444, + "learning_rate": 1.8296977458553292e-05, + "loss": 0.5438, + "step": 2587 + }, + { + "epoch": 0.21296029623534252, + "grad_norm": 2.555875718353797, + "learning_rate": 1.8295489357327345e-05, + "loss": 0.8275, + "step": 2588 + }, + { + "epoch": 0.21304258383048755, + "grad_norm": 3.406091656124588, + "learning_rate": 1.8294000666803104e-05, + "loss": 0.8295, + "step": 2589 + }, + { + "epoch": 0.21312487142563258, + "grad_norm": 2.5938644299533613, + "learning_rate": 1.8292511387086317e-05, + "loss": 0.8365, + "step": 2590 + }, + { + "epoch": 0.2132071590207776, + "grad_norm": 2.8496068081536685, + "learning_rate": 1.8291021518282786e-05, + "loss": 0.8452, + "step": 2591 + }, + { + "epoch": 0.21328944661592264, + "grad_norm": 3.3752857892659893, + "learning_rate": 1.8289531060498345e-05, + "loss": 0.8271, + "step": 2592 + }, + { + "epoch": 0.21337173421106767, + "grad_norm": 3.1170922614104795, + "learning_rate": 1.8288040013838873e-05, + "loss": 0.7921, + "step": 2593 + }, + { + "epoch": 0.2134540218062127, + "grad_norm": 2.9388409386634513, + "learning_rate": 1.8286548378410295e-05, + "loss": 0.7972, + "step": 2594 + }, + { + "epoch": 0.21353630940135773, + "grad_norm": 3.4323486322698193, + "learning_rate": 1.8285056154318573e-05, + "loss": 0.8038, + "step": 2595 + }, + { + "epoch": 0.2136185969965028, + "grad_norm": 3.2115666371727642, + "learning_rate": 1.828356334166971e-05, + "loss": 0.8441, + "step": 2596 + }, + { + "epoch": 0.21370088459164782, + "grad_norm": 0.4663816033019578, + "learning_rate": 1.8282069940569756e-05, + "loss": 0.5721, + "step": 2597 + }, + { + "epoch": 0.21378317218679285, + "grad_norm": 0.4355461155177228, + "learning_rate": 1.8280575951124796e-05, + "loss": 0.5343, + "step": 2598 + }, + { + "epoch": 0.21386545978193788, + "grad_norm": 3.747322259261724, + "learning_rate": 1.8279081373440967e-05, + "loss": 0.8277, + "step": 2599 + }, + { + "epoch": 0.21394774737708291, + "grad_norm": 2.6362418750782846, + "learning_rate": 1.8277586207624436e-05, + "loss": 0.8239, + "step": 2600 + }, + { + "epoch": 0.21403003497222794, + "grad_norm": 2.8752720486831325, + "learning_rate": 1.827609045378142e-05, + "loss": 0.8247, + "step": 2601 + }, + { + "epoch": 0.21411232256737298, + "grad_norm": 2.5111329831110525, + "learning_rate": 1.8274594112018172e-05, + "loss": 0.818, + "step": 2602 + }, + { + "epoch": 0.214194610162518, + "grad_norm": 0.47382323197265425, + "learning_rate": 1.827309718244099e-05, + "loss": 0.552, + "step": 2603 + }, + { + "epoch": 0.21427689775766304, + "grad_norm": 2.30810347940035, + "learning_rate": 1.827159966515622e-05, + "loss": 0.814, + "step": 2604 + }, + { + "epoch": 0.21435918535280807, + "grad_norm": 2.3416958083524873, + "learning_rate": 1.8270101560270234e-05, + "loss": 0.8063, + "step": 2605 + }, + { + "epoch": 0.2144414729479531, + "grad_norm": 2.880474026652732, + "learning_rate": 1.8268602867889462e-05, + "loss": 0.8221, + "step": 2606 + }, + { + "epoch": 0.21452376054309813, + "grad_norm": 0.4761050768702208, + "learning_rate": 1.8267103588120364e-05, + "loss": 0.5639, + "step": 2607 + }, + { + "epoch": 0.21460604813824316, + "grad_norm": 2.478967954728775, + "learning_rate": 1.8265603721069453e-05, + "loss": 0.8187, + "step": 2608 + }, + { + "epoch": 0.2146883357333882, + "grad_norm": 4.3294368537334345, + "learning_rate": 1.8264103266843264e-05, + "loss": 0.8205, + "step": 2609 + }, + { + "epoch": 0.21477062332853322, + "grad_norm": 0.4533011865343612, + "learning_rate": 1.8262602225548403e-05, + "loss": 0.5647, + "step": 2610 + }, + { + "epoch": 0.21485291092367825, + "grad_norm": 3.5416383187210565, + "learning_rate": 1.826110059729149e-05, + "loss": 0.8418, + "step": 2611 + }, + { + "epoch": 0.21493519851882328, + "grad_norm": 2.932491105752039, + "learning_rate": 1.82595983821792e-05, + "loss": 0.8319, + "step": 2612 + }, + { + "epoch": 0.21501748611396831, + "grad_norm": 2.254319838401644, + "learning_rate": 1.8258095580318258e-05, + "loss": 0.7838, + "step": 2613 + }, + { + "epoch": 0.21509977370911335, + "grad_norm": 2.4112108755813737, + "learning_rate": 1.8256592191815407e-05, + "loss": 0.8142, + "step": 2614 + }, + { + "epoch": 0.21518206130425838, + "grad_norm": 2.0638729753662615, + "learning_rate": 1.8255088216777454e-05, + "loss": 0.8094, + "step": 2615 + }, + { + "epoch": 0.2152643488994034, + "grad_norm": 0.4540631567309475, + "learning_rate": 1.8253583655311232e-05, + "loss": 0.5444, + "step": 2616 + }, + { + "epoch": 0.21534663649454844, + "grad_norm": 2.868258138099938, + "learning_rate": 1.8252078507523633e-05, + "loss": 0.8408, + "step": 2617 + }, + { + "epoch": 0.21542892408969347, + "grad_norm": 0.46467846212608765, + "learning_rate": 1.8250572773521568e-05, + "loss": 0.527, + "step": 2618 + }, + { + "epoch": 0.2155112116848385, + "grad_norm": 0.4244812006406836, + "learning_rate": 1.824906645341201e-05, + "loss": 0.5297, + "step": 2619 + }, + { + "epoch": 0.21559349927998353, + "grad_norm": 2.530046080436701, + "learning_rate": 1.8247559547301966e-05, + "loss": 0.8493, + "step": 2620 + }, + { + "epoch": 0.21567578687512856, + "grad_norm": 2.273032185986812, + "learning_rate": 1.8246052055298478e-05, + "loss": 0.8309, + "step": 2621 + }, + { + "epoch": 0.21575807447027362, + "grad_norm": 3.6569740933446746, + "learning_rate": 1.824454397750864e-05, + "loss": 0.8308, + "step": 2622 + }, + { + "epoch": 0.21584036206541865, + "grad_norm": 3.0536589173423576, + "learning_rate": 1.8243035314039587e-05, + "loss": 0.8164, + "step": 2623 + }, + { + "epoch": 0.21592264966056368, + "grad_norm": 3.0019301593091954, + "learning_rate": 1.8241526064998485e-05, + "loss": 0.8532, + "step": 2624 + }, + { + "epoch": 0.2160049372557087, + "grad_norm": 3.427353544502481, + "learning_rate": 1.8240016230492554e-05, + "loss": 0.8472, + "step": 2625 + }, + { + "epoch": 0.21608722485085374, + "grad_norm": 3.6884329481060103, + "learning_rate": 1.8238505810629045e-05, + "loss": 0.8313, + "step": 2626 + }, + { + "epoch": 0.21616951244599877, + "grad_norm": 4.17700274528595, + "learning_rate": 1.8236994805515263e-05, + "loss": 0.858, + "step": 2627 + }, + { + "epoch": 0.2162518000411438, + "grad_norm": 3.5142493762355222, + "learning_rate": 1.8235483215258538e-05, + "loss": 0.8013, + "step": 2628 + }, + { + "epoch": 0.21633408763628884, + "grad_norm": 0.5701316952851451, + "learning_rate": 1.823397103996626e-05, + "loss": 0.5865, + "step": 2629 + }, + { + "epoch": 0.21641637523143387, + "grad_norm": 6.342171267674201, + "learning_rate": 1.8232458279745845e-05, + "loss": 0.7946, + "step": 2630 + }, + { + "epoch": 0.2164986628265789, + "grad_norm": 4.56657322487598, + "learning_rate": 1.823094493470476e-05, + "loss": 0.8507, + "step": 2631 + }, + { + "epoch": 0.21658095042172393, + "grad_norm": 4.7063511682274, + "learning_rate": 1.822943100495051e-05, + "loss": 0.793, + "step": 2632 + }, + { + "epoch": 0.21666323801686896, + "grad_norm": 5.375144420205079, + "learning_rate": 1.8227916490590644e-05, + "loss": 0.8057, + "step": 2633 + }, + { + "epoch": 0.216745525612014, + "grad_norm": 3.8597372199138364, + "learning_rate": 1.822640139173275e-05, + "loss": 0.8367, + "step": 2634 + }, + { + "epoch": 0.21682781320715902, + "grad_norm": 3.734335633179637, + "learning_rate": 1.8224885708484454e-05, + "loss": 0.8396, + "step": 2635 + }, + { + "epoch": 0.21691010080230405, + "grad_norm": 4.897625227987675, + "learning_rate": 1.8223369440953434e-05, + "loss": 0.8158, + "step": 2636 + }, + { + "epoch": 0.21699238839744908, + "grad_norm": 0.5368699806869238, + "learning_rate": 1.82218525892474e-05, + "loss": 0.5561, + "step": 2637 + }, + { + "epoch": 0.2170746759925941, + "grad_norm": 3.8252691208651184, + "learning_rate": 1.8220335153474104e-05, + "loss": 0.7976, + "step": 2638 + }, + { + "epoch": 0.21715696358773914, + "grad_norm": 3.7756639892156727, + "learning_rate": 1.8218817133741348e-05, + "loss": 0.8426, + "step": 2639 + }, + { + "epoch": 0.21723925118288417, + "grad_norm": 4.658025704820251, + "learning_rate": 1.8217298530156963e-05, + "loss": 0.8187, + "step": 2640 + }, + { + "epoch": 0.2173215387780292, + "grad_norm": 3.733061302198968, + "learning_rate": 1.8215779342828835e-05, + "loss": 0.8049, + "step": 2641 + }, + { + "epoch": 0.21740382637317424, + "grad_norm": 3.6973346475021387, + "learning_rate": 1.8214259571864886e-05, + "loss": 0.8648, + "step": 2642 + }, + { + "epoch": 0.21748611396831927, + "grad_norm": 3.184437840138913, + "learning_rate": 1.821273921737307e-05, + "loss": 0.8411, + "step": 2643 + }, + { + "epoch": 0.2175684015634643, + "grad_norm": 3.8392363631998614, + "learning_rate": 1.8211218279461393e-05, + "loss": 0.8379, + "step": 2644 + }, + { + "epoch": 0.21765068915860933, + "grad_norm": 3.88941746570052, + "learning_rate": 1.82096967582379e-05, + "loss": 0.8284, + "step": 2645 + }, + { + "epoch": 0.21773297675375436, + "grad_norm": 4.027135426349342, + "learning_rate": 1.8208174653810683e-05, + "loss": 0.8311, + "step": 2646 + }, + { + "epoch": 0.2178152643488994, + "grad_norm": 3.719374349183988, + "learning_rate": 1.8206651966287863e-05, + "loss": 0.8015, + "step": 2647 + }, + { + "epoch": 0.21789755194404445, + "grad_norm": 3.94239636368207, + "learning_rate": 1.8205128695777613e-05, + "loss": 0.8184, + "step": 2648 + }, + { + "epoch": 0.21797983953918948, + "grad_norm": 0.5068293052496876, + "learning_rate": 1.820360484238814e-05, + "loss": 0.5834, + "step": 2649 + }, + { + "epoch": 0.2180621271343345, + "grad_norm": 3.0598014030618925, + "learning_rate": 1.8202080406227703e-05, + "loss": 0.8277, + "step": 2650 + }, + { + "epoch": 0.21814441472947954, + "grad_norm": 2.8718911462563117, + "learning_rate": 1.820055538740459e-05, + "loss": 0.8531, + "step": 2651 + }, + { + "epoch": 0.21822670232462457, + "grad_norm": 2.882907738566527, + "learning_rate": 1.8199029786027133e-05, + "loss": 0.818, + "step": 2652 + }, + { + "epoch": 0.2183089899197696, + "grad_norm": 0.4460061330621163, + "learning_rate": 1.8197503602203716e-05, + "loss": 0.5489, + "step": 2653 + }, + { + "epoch": 0.21839127751491463, + "grad_norm": 2.787502149270521, + "learning_rate": 1.8195976836042753e-05, + "loss": 0.8272, + "step": 2654 + }, + { + "epoch": 0.21847356511005966, + "grad_norm": 3.6657569464084636, + "learning_rate": 1.8194449487652704e-05, + "loss": 0.8209, + "step": 2655 + }, + { + "epoch": 0.2185558527052047, + "grad_norm": 4.190422309727537, + "learning_rate": 1.8192921557142068e-05, + "loss": 0.8266, + "step": 2656 + }, + { + "epoch": 0.21863814030034973, + "grad_norm": 0.4720717771807136, + "learning_rate": 1.8191393044619386e-05, + "loss": 0.5499, + "step": 2657 + }, + { + "epoch": 0.21872042789549476, + "grad_norm": 3.5968065813132255, + "learning_rate": 1.818986395019324e-05, + "loss": 0.8364, + "step": 2658 + }, + { + "epoch": 0.2188027154906398, + "grad_norm": 2.6988580933542057, + "learning_rate": 1.818833427397226e-05, + "loss": 0.812, + "step": 2659 + }, + { + "epoch": 0.21888500308578482, + "grad_norm": 3.1563976116639547, + "learning_rate": 1.818680401606511e-05, + "loss": 0.802, + "step": 2660 + }, + { + "epoch": 0.21896729068092985, + "grad_norm": 2.9698731854020854, + "learning_rate": 1.8185273176580494e-05, + "loss": 0.8007, + "step": 2661 + }, + { + "epoch": 0.21904957827607488, + "grad_norm": 3.209600169593195, + "learning_rate": 1.818374175562716e-05, + "loss": 0.8291, + "step": 2662 + }, + { + "epoch": 0.2191318658712199, + "grad_norm": 3.2855530219150206, + "learning_rate": 1.8182209753313903e-05, + "loss": 0.8061, + "step": 2663 + }, + { + "epoch": 0.21921415346636494, + "grad_norm": 2.9586512787993042, + "learning_rate": 1.8180677169749547e-05, + "loss": 0.848, + "step": 2664 + }, + { + "epoch": 0.21929644106150997, + "grad_norm": 2.355790190628477, + "learning_rate": 1.817914400504297e-05, + "loss": 0.8436, + "step": 2665 + }, + { + "epoch": 0.219378728656655, + "grad_norm": 3.9499318826017222, + "learning_rate": 1.8177610259303085e-05, + "loss": 0.8172, + "step": 2666 + }, + { + "epoch": 0.21946101625180003, + "grad_norm": 3.587633446988715, + "learning_rate": 1.8176075932638842e-05, + "loss": 0.8162, + "step": 2667 + }, + { + "epoch": 0.21954330384694506, + "grad_norm": 2.408074965663899, + "learning_rate": 1.8174541025159242e-05, + "loss": 0.8084, + "step": 2668 + }, + { + "epoch": 0.2196255914420901, + "grad_norm": 2.3988549424557015, + "learning_rate": 1.817300553697332e-05, + "loss": 0.8377, + "step": 2669 + }, + { + "epoch": 0.21970787903723513, + "grad_norm": 2.6276552918579172, + "learning_rate": 1.8171469468190156e-05, + "loss": 0.8095, + "step": 2670 + }, + { + "epoch": 0.21979016663238016, + "grad_norm": 0.4931309790183725, + "learning_rate": 1.816993281891887e-05, + "loss": 0.5831, + "step": 2671 + }, + { + "epoch": 0.2198724542275252, + "grad_norm": 2.862710610340563, + "learning_rate": 1.8168395589268624e-05, + "loss": 0.8434, + "step": 2672 + }, + { + "epoch": 0.21995474182267022, + "grad_norm": 2.7040519252261705, + "learning_rate": 1.8166857779348618e-05, + "loss": 0.835, + "step": 2673 + }, + { + "epoch": 0.22003702941781528, + "grad_norm": 2.2027133058893513, + "learning_rate": 1.816531938926809e-05, + "loss": 0.808, + "step": 2674 + }, + { + "epoch": 0.2201193170129603, + "grad_norm": 2.1305283929039462, + "learning_rate": 1.816378041913634e-05, + "loss": 0.801, + "step": 2675 + }, + { + "epoch": 0.22020160460810534, + "grad_norm": 2.1727604185232288, + "learning_rate": 1.816224086906268e-05, + "loss": 0.7963, + "step": 2676 + }, + { + "epoch": 0.22028389220325037, + "grad_norm": 2.0940359561475055, + "learning_rate": 1.816070073915648e-05, + "loss": 0.8036, + "step": 2677 + }, + { + "epoch": 0.2203661797983954, + "grad_norm": 2.6473855146481498, + "learning_rate": 1.815916002952716e-05, + "loss": 0.8284, + "step": 2678 + }, + { + "epoch": 0.22044846739354043, + "grad_norm": 2.8347082933489376, + "learning_rate": 1.8157618740284153e-05, + "loss": 0.7905, + "step": 2679 + }, + { + "epoch": 0.22053075498868546, + "grad_norm": 3.203459633707998, + "learning_rate": 1.8156076871536958e-05, + "loss": 0.8463, + "step": 2680 + }, + { + "epoch": 0.2206130425838305, + "grad_norm": 2.5115557324427824, + "learning_rate": 1.8154534423395107e-05, + "loss": 0.8296, + "step": 2681 + }, + { + "epoch": 0.22069533017897552, + "grad_norm": 2.6138770428847358, + "learning_rate": 1.815299139596817e-05, + "loss": 0.8051, + "step": 2682 + }, + { + "epoch": 0.22077761777412055, + "grad_norm": 3.2232601828969694, + "learning_rate": 1.8151447789365764e-05, + "loss": 0.8251, + "step": 2683 + }, + { + "epoch": 0.22085990536926559, + "grad_norm": 2.35388990966657, + "learning_rate": 1.814990360369754e-05, + "loss": 0.8204, + "step": 2684 + }, + { + "epoch": 0.22094219296441062, + "grad_norm": 2.9872998389223513, + "learning_rate": 1.8148358839073205e-05, + "loss": 0.8165, + "step": 2685 + }, + { + "epoch": 0.22102448055955565, + "grad_norm": 2.8245273602696557, + "learning_rate": 1.8146813495602484e-05, + "loss": 0.8324, + "step": 2686 + }, + { + "epoch": 0.22110676815470068, + "grad_norm": 3.1009018581544483, + "learning_rate": 1.8145267573395163e-05, + "loss": 0.7939, + "step": 2687 + }, + { + "epoch": 0.2211890557498457, + "grad_norm": 3.0898385828991026, + "learning_rate": 1.8143721072561062e-05, + "loss": 0.8672, + "step": 2688 + }, + { + "epoch": 0.22127134334499074, + "grad_norm": 2.4901137109662494, + "learning_rate": 1.8142173993210034e-05, + "loss": 0.8209, + "step": 2689 + }, + { + "epoch": 0.22135363094013577, + "grad_norm": 2.6992537201209488, + "learning_rate": 1.814062633545199e-05, + "loss": 0.8152, + "step": 2690 + }, + { + "epoch": 0.2214359185352808, + "grad_norm": 2.47623420446661, + "learning_rate": 1.813907809939687e-05, + "loss": 0.8095, + "step": 2691 + }, + { + "epoch": 0.22151820613042583, + "grad_norm": 2.483085492924412, + "learning_rate": 1.813752928515466e-05, + "loss": 0.8188, + "step": 2692 + }, + { + "epoch": 0.22160049372557086, + "grad_norm": 2.7071635220071273, + "learning_rate": 1.8135979892835383e-05, + "loss": 0.8264, + "step": 2693 + }, + { + "epoch": 0.2216827813207159, + "grad_norm": 2.2474688512352206, + "learning_rate": 1.8134429922549106e-05, + "loss": 0.8026, + "step": 2694 + }, + { + "epoch": 0.22176506891586092, + "grad_norm": 2.4233908375852065, + "learning_rate": 1.8132879374405937e-05, + "loss": 0.8337, + "step": 2695 + }, + { + "epoch": 0.22184735651100596, + "grad_norm": 3.165042749257207, + "learning_rate": 1.813132824851602e-05, + "loss": 0.8341, + "step": 2696 + }, + { + "epoch": 0.221929644106151, + "grad_norm": 3.0657787022724947, + "learning_rate": 1.812977654498955e-05, + "loss": 0.8126, + "step": 2697 + }, + { + "epoch": 0.22201193170129602, + "grad_norm": 2.7194251670803857, + "learning_rate": 1.812822426393676e-05, + "loss": 0.8572, + "step": 2698 + }, + { + "epoch": 0.22209421929644105, + "grad_norm": 2.755261631156646, + "learning_rate": 1.8126671405467914e-05, + "loss": 0.809, + "step": 2699 + }, + { + "epoch": 0.2221765068915861, + "grad_norm": 2.7498923006459854, + "learning_rate": 1.812511796969333e-05, + "loss": 0.8131, + "step": 2700 + }, + { + "epoch": 0.22225879448673114, + "grad_norm": 2.435235398052274, + "learning_rate": 1.8123563956723357e-05, + "loss": 0.8362, + "step": 2701 + }, + { + "epoch": 0.22234108208187617, + "grad_norm": 0.457018645766829, + "learning_rate": 1.8122009366668394e-05, + "loss": 0.5594, + "step": 2702 + }, + { + "epoch": 0.2224233696770212, + "grad_norm": 4.292693915757756, + "learning_rate": 1.8120454199638874e-05, + "loss": 0.8575, + "step": 2703 + }, + { + "epoch": 0.22250565727216623, + "grad_norm": 2.8863182053111234, + "learning_rate": 1.8118898455745276e-05, + "loss": 0.7982, + "step": 2704 + }, + { + "epoch": 0.22258794486731126, + "grad_norm": 2.7521147693360715, + "learning_rate": 1.811734213509811e-05, + "loss": 0.839, + "step": 2705 + }, + { + "epoch": 0.2226702324624563, + "grad_norm": 3.2656957555222323, + "learning_rate": 1.8115785237807948e-05, + "loss": 0.834, + "step": 2706 + }, + { + "epoch": 0.22275252005760132, + "grad_norm": 3.789129894489883, + "learning_rate": 1.811422776398538e-05, + "loss": 0.8128, + "step": 2707 + }, + { + "epoch": 0.22283480765274635, + "grad_norm": 2.3356779771566245, + "learning_rate": 1.8112669713741046e-05, + "loss": 0.8131, + "step": 2708 + }, + { + "epoch": 0.22291709524789138, + "grad_norm": 3.555488258057913, + "learning_rate": 1.8111111087185633e-05, + "loss": 0.8236, + "step": 2709 + }, + { + "epoch": 0.22299938284303641, + "grad_norm": 2.0143410520363356, + "learning_rate": 1.8109551884429858e-05, + "loss": 0.8185, + "step": 2710 + }, + { + "epoch": 0.22308167043818145, + "grad_norm": 2.63263049454595, + "learning_rate": 1.8107992105584488e-05, + "loss": 0.8389, + "step": 2711 + }, + { + "epoch": 0.22316395803332648, + "grad_norm": 2.279152679520782, + "learning_rate": 1.8106431750760326e-05, + "loss": 0.7975, + "step": 2712 + }, + { + "epoch": 0.2232462456284715, + "grad_norm": 2.6298664235486275, + "learning_rate": 1.8104870820068214e-05, + "loss": 0.8051, + "step": 2713 + }, + { + "epoch": 0.22332853322361654, + "grad_norm": 2.6437941616288865, + "learning_rate": 1.8103309313619042e-05, + "loss": 0.81, + "step": 2714 + }, + { + "epoch": 0.22341082081876157, + "grad_norm": 2.77868949196984, + "learning_rate": 1.8101747231523735e-05, + "loss": 0.8422, + "step": 2715 + }, + { + "epoch": 0.2234931084139066, + "grad_norm": 2.386387317962988, + "learning_rate": 1.8100184573893265e-05, + "loss": 0.8189, + "step": 2716 + }, + { + "epoch": 0.22357539600905163, + "grad_norm": 0.47172623629404353, + "learning_rate": 1.8098621340838635e-05, + "loss": 0.5472, + "step": 2717 + }, + { + "epoch": 0.22365768360419666, + "grad_norm": 2.2257199614717864, + "learning_rate": 1.8097057532470893e-05, + "loss": 0.8239, + "step": 2718 + }, + { + "epoch": 0.2237399711993417, + "grad_norm": 2.2490034040330222, + "learning_rate": 1.809549314890114e-05, + "loss": 0.8069, + "step": 2719 + }, + { + "epoch": 0.22382225879448672, + "grad_norm": 2.0704084361324018, + "learning_rate": 1.8093928190240496e-05, + "loss": 0.8194, + "step": 2720 + }, + { + "epoch": 0.22390454638963175, + "grad_norm": 1.9673046749980998, + "learning_rate": 1.809236265660014e-05, + "loss": 0.8221, + "step": 2721 + }, + { + "epoch": 0.22398683398477678, + "grad_norm": 0.4623247008641807, + "learning_rate": 1.809079654809128e-05, + "loss": 0.5534, + "step": 2722 + }, + { + "epoch": 0.22406912157992182, + "grad_norm": 2.435738230686578, + "learning_rate": 1.8089229864825175e-05, + "loss": 0.8088, + "step": 2723 + }, + { + "epoch": 0.22415140917506685, + "grad_norm": 2.4030263135753187, + "learning_rate": 1.8087662606913116e-05, + "loss": 0.7986, + "step": 2724 + }, + { + "epoch": 0.2242336967702119, + "grad_norm": 2.3586380489701733, + "learning_rate": 1.808609477446644e-05, + "loss": 0.8449, + "step": 2725 + }, + { + "epoch": 0.22431598436535694, + "grad_norm": 2.7952344002987983, + "learning_rate": 1.808452636759652e-05, + "loss": 0.8458, + "step": 2726 + }, + { + "epoch": 0.22439827196050197, + "grad_norm": 2.7536534246615854, + "learning_rate": 1.8082957386414782e-05, + "loss": 0.829, + "step": 2727 + }, + { + "epoch": 0.224480559555647, + "grad_norm": 0.4342296596543259, + "learning_rate": 1.8081387831032675e-05, + "loss": 0.5589, + "step": 2728 + }, + { + "epoch": 0.22456284715079203, + "grad_norm": 2.3765572582680106, + "learning_rate": 1.8079817701561702e-05, + "loss": 0.8591, + "step": 2729 + }, + { + "epoch": 0.22464513474593706, + "grad_norm": 2.807707088671858, + "learning_rate": 1.80782469981134e-05, + "loss": 0.8012, + "step": 2730 + }, + { + "epoch": 0.2247274223410821, + "grad_norm": 2.87764660553564, + "learning_rate": 1.807667572079935e-05, + "loss": 0.839, + "step": 2731 + }, + { + "epoch": 0.22480970993622712, + "grad_norm": 0.43016929955597694, + "learning_rate": 1.8075103869731174e-05, + "loss": 0.5304, + "step": 2732 + }, + { + "epoch": 0.22489199753137215, + "grad_norm": 2.8712395483576296, + "learning_rate": 1.8073531445020533e-05, + "loss": 0.8004, + "step": 2733 + }, + { + "epoch": 0.22497428512651718, + "grad_norm": 3.9125598864320894, + "learning_rate": 1.8071958446779133e-05, + "loss": 0.8353, + "step": 2734 + }, + { + "epoch": 0.2250565727216622, + "grad_norm": 2.8766794137375276, + "learning_rate": 1.807038487511871e-05, + "loss": 0.7854, + "step": 2735 + }, + { + "epoch": 0.22513886031680724, + "grad_norm": 3.76834116747564, + "learning_rate": 1.8068810730151053e-05, + "loss": 0.8137, + "step": 2736 + }, + { + "epoch": 0.22522114791195227, + "grad_norm": 3.853892293284356, + "learning_rate": 1.8067236011987987e-05, + "loss": 0.8351, + "step": 2737 + }, + { + "epoch": 0.2253034355070973, + "grad_norm": 3.415055201893161, + "learning_rate": 1.8065660720741374e-05, + "loss": 0.8034, + "step": 2738 + }, + { + "epoch": 0.22538572310224234, + "grad_norm": 2.940132599899046, + "learning_rate": 1.8064084856523126e-05, + "loss": 0.8307, + "step": 2739 + }, + { + "epoch": 0.22546801069738737, + "grad_norm": 3.581335962493952, + "learning_rate": 1.8062508419445187e-05, + "loss": 0.8331, + "step": 2740 + }, + { + "epoch": 0.2255502982925324, + "grad_norm": 3.63356357971262, + "learning_rate": 1.806093140961954e-05, + "loss": 0.8409, + "step": 2741 + }, + { + "epoch": 0.22563258588767743, + "grad_norm": 6.103630461426989, + "learning_rate": 1.805935382715822e-05, + "loss": 0.805, + "step": 2742 + }, + { + "epoch": 0.22571487348282246, + "grad_norm": 4.939277422318664, + "learning_rate": 1.8057775672173292e-05, + "loss": 0.8172, + "step": 2743 + }, + { + "epoch": 0.2257971610779675, + "grad_norm": 3.5615037872775708, + "learning_rate": 1.8056196944776867e-05, + "loss": 0.8284, + "step": 2744 + }, + { + "epoch": 0.22587944867311252, + "grad_norm": 4.683351176812795, + "learning_rate": 1.8054617645081094e-05, + "loss": 0.797, + "step": 2745 + }, + { + "epoch": 0.22596173626825755, + "grad_norm": 3.9070880231642082, + "learning_rate": 1.8053037773198165e-05, + "loss": 0.7933, + "step": 2746 + }, + { + "epoch": 0.22604402386340258, + "grad_norm": 3.919312161038904, + "learning_rate": 1.8051457329240315e-05, + "loss": 0.8028, + "step": 2747 + }, + { + "epoch": 0.2261263114585476, + "grad_norm": 3.340988457131581, + "learning_rate": 1.8049876313319807e-05, + "loss": 0.8357, + "step": 2748 + }, + { + "epoch": 0.22620859905369264, + "grad_norm": 3.8575957682483053, + "learning_rate": 1.8048294725548966e-05, + "loss": 0.8367, + "step": 2749 + }, + { + "epoch": 0.22629088664883767, + "grad_norm": 5.825965824067105, + "learning_rate": 1.8046712566040135e-05, + "loss": 0.8053, + "step": 2750 + }, + { + "epoch": 0.22637317424398273, + "grad_norm": 3.497742944083964, + "learning_rate": 1.8045129834905713e-05, + "loss": 0.8078, + "step": 2751 + }, + { + "epoch": 0.22645546183912776, + "grad_norm": 3.7369114255439047, + "learning_rate": 1.8043546532258133e-05, + "loss": 0.7694, + "step": 2752 + }, + { + "epoch": 0.2265377494342728, + "grad_norm": 3.5319658700375576, + "learning_rate": 1.8041962658209873e-05, + "loss": 0.8042, + "step": 2753 + }, + { + "epoch": 0.22662003702941783, + "grad_norm": 0.4604383030756832, + "learning_rate": 1.8040378212873445e-05, + "loss": 0.5335, + "step": 2754 + }, + { + "epoch": 0.22670232462456286, + "grad_norm": 0.4470665522275032, + "learning_rate": 1.803879319636141e-05, + "loss": 0.5761, + "step": 2755 + }, + { + "epoch": 0.2267846122197079, + "grad_norm": 0.4366989067725965, + "learning_rate": 1.803720760878636e-05, + "loss": 0.5476, + "step": 2756 + }, + { + "epoch": 0.22686689981485292, + "grad_norm": 0.4340565178454343, + "learning_rate": 1.8035621450260934e-05, + "loss": 0.5576, + "step": 2757 + }, + { + "epoch": 0.22694918740999795, + "grad_norm": 3.256507120264482, + "learning_rate": 1.8034034720897815e-05, + "loss": 0.8254, + "step": 2758 + }, + { + "epoch": 0.22703147500514298, + "grad_norm": 3.547568746283773, + "learning_rate": 1.8032447420809714e-05, + "loss": 0.7996, + "step": 2759 + }, + { + "epoch": 0.227113762600288, + "grad_norm": 3.162031884420745, + "learning_rate": 1.8030859550109395e-05, + "loss": 0.7906, + "step": 2760 + }, + { + "epoch": 0.22719605019543304, + "grad_norm": 5.324282934442062, + "learning_rate": 1.8029271108909658e-05, + "loss": 0.8107, + "step": 2761 + }, + { + "epoch": 0.22727833779057807, + "grad_norm": 0.5110136509069735, + "learning_rate": 1.8027682097323345e-05, + "loss": 0.5632, + "step": 2762 + }, + { + "epoch": 0.2273606253857231, + "grad_norm": 3.1907523347530984, + "learning_rate": 1.802609251546333e-05, + "loss": 0.8165, + "step": 2763 + }, + { + "epoch": 0.22744291298086813, + "grad_norm": 0.45997827251280465, + "learning_rate": 1.802450236344254e-05, + "loss": 0.5599, + "step": 2764 + }, + { + "epoch": 0.22752520057601316, + "grad_norm": 4.260820446140328, + "learning_rate": 1.802291164137394e-05, + "loss": 0.8009, + "step": 2765 + }, + { + "epoch": 0.2276074881711582, + "grad_norm": 3.0574181449459585, + "learning_rate": 1.802132034937052e-05, + "loss": 0.827, + "step": 2766 + }, + { + "epoch": 0.22768977576630323, + "grad_norm": 19.53527055975888, + "learning_rate": 1.8019728487545337e-05, + "loss": 0.8205, + "step": 2767 + }, + { + "epoch": 0.22777206336144826, + "grad_norm": 2.5986421856673134, + "learning_rate": 1.8018136056011464e-05, + "loss": 0.8297, + "step": 2768 + }, + { + "epoch": 0.2278543509565933, + "grad_norm": 2.5704257965519504, + "learning_rate": 1.801654305488203e-05, + "loss": 0.8153, + "step": 2769 + }, + { + "epoch": 0.22793663855173832, + "grad_norm": 3.16912014397892, + "learning_rate": 1.8014949484270196e-05, + "loss": 0.8531, + "step": 2770 + }, + { + "epoch": 0.22801892614688335, + "grad_norm": 2.4673152456843064, + "learning_rate": 1.8013355344289172e-05, + "loss": 0.7883, + "step": 2771 + }, + { + "epoch": 0.22810121374202838, + "grad_norm": 11.922974258036625, + "learning_rate": 1.8011760635052198e-05, + "loss": 0.8156, + "step": 2772 + }, + { + "epoch": 0.2281835013371734, + "grad_norm": 3.306458350222661, + "learning_rate": 1.801016535667256e-05, + "loss": 0.7997, + "step": 2773 + }, + { + "epoch": 0.22826578893231844, + "grad_norm": 3.5841902170576923, + "learning_rate": 1.8008569509263588e-05, + "loss": 0.8171, + "step": 2774 + }, + { + "epoch": 0.22834807652746347, + "grad_norm": 2.7601808874883744, + "learning_rate": 1.8006973092938645e-05, + "loss": 0.8349, + "step": 2775 + }, + { + "epoch": 0.2284303641226085, + "grad_norm": 2.4278025413104993, + "learning_rate": 1.8005376107811136e-05, + "loss": 0.8082, + "step": 2776 + }, + { + "epoch": 0.22851265171775356, + "grad_norm": 3.01193194541238, + "learning_rate": 1.8003778553994515e-05, + "loss": 0.8296, + "step": 2777 + }, + { + "epoch": 0.2285949393128986, + "grad_norm": 0.5176617522878121, + "learning_rate": 1.8002180431602264e-05, + "loss": 0.5477, + "step": 2778 + }, + { + "epoch": 0.22867722690804362, + "grad_norm": 3.0329698356134895, + "learning_rate": 1.8000581740747913e-05, + "loss": 0.8466, + "step": 2779 + }, + { + "epoch": 0.22875951450318865, + "grad_norm": 2.6835536612614037, + "learning_rate": 1.799898248154503e-05, + "loss": 0.8423, + "step": 2780 + }, + { + "epoch": 0.22884180209833369, + "grad_norm": 3.424392110523395, + "learning_rate": 1.7997382654107227e-05, + "loss": 0.8356, + "step": 2781 + }, + { + "epoch": 0.22892408969347872, + "grad_norm": 2.7777316016252547, + "learning_rate": 1.7995782258548146e-05, + "loss": 0.8572, + "step": 2782 + }, + { + "epoch": 0.22900637728862375, + "grad_norm": 3.304412721528233, + "learning_rate": 1.799418129498148e-05, + "loss": 0.8053, + "step": 2783 + }, + { + "epoch": 0.22908866488376878, + "grad_norm": 2.9525198650899975, + "learning_rate": 1.7992579763520964e-05, + "loss": 0.8395, + "step": 2784 + }, + { + "epoch": 0.2291709524789138, + "grad_norm": 3.224856904335899, + "learning_rate": 1.799097766428036e-05, + "loss": 0.8374, + "step": 2785 + }, + { + "epoch": 0.22925324007405884, + "grad_norm": 4.018838225746211, + "learning_rate": 1.7989374997373486e-05, + "loss": 0.811, + "step": 2786 + }, + { + "epoch": 0.22933552766920387, + "grad_norm": 2.934213298461492, + "learning_rate": 1.7987771762914185e-05, + "loss": 0.8178, + "step": 2787 + }, + { + "epoch": 0.2294178152643489, + "grad_norm": 3.174148191405845, + "learning_rate": 1.7986167961016355e-05, + "loss": 0.8313, + "step": 2788 + }, + { + "epoch": 0.22950010285949393, + "grad_norm": 2.606152650937208, + "learning_rate": 1.7984563591793923e-05, + "loss": 0.8251, + "step": 2789 + }, + { + "epoch": 0.22958239045463896, + "grad_norm": 3.111609899574041, + "learning_rate": 1.7982958655360866e-05, + "loss": 0.8243, + "step": 2790 + }, + { + "epoch": 0.229664678049784, + "grad_norm": 2.986420250781335, + "learning_rate": 1.7981353151831193e-05, + "loss": 0.8266, + "step": 2791 + }, + { + "epoch": 0.22974696564492902, + "grad_norm": 3.883321008722349, + "learning_rate": 1.7979747081318956e-05, + "loss": 0.8164, + "step": 2792 + }, + { + "epoch": 0.22982925324007406, + "grad_norm": 3.106270550600261, + "learning_rate": 1.7978140443938244e-05, + "loss": 0.8306, + "step": 2793 + }, + { + "epoch": 0.22991154083521909, + "grad_norm": 3.2300498098741723, + "learning_rate": 1.79765332398032e-05, + "loss": 0.8203, + "step": 2794 + }, + { + "epoch": 0.22999382843036412, + "grad_norm": 4.038394248381576, + "learning_rate": 1.7974925469027986e-05, + "loss": 0.822, + "step": 2795 + }, + { + "epoch": 0.23007611602550915, + "grad_norm": 2.7286891766184125, + "learning_rate": 1.7973317131726823e-05, + "loss": 0.8417, + "step": 2796 + }, + { + "epoch": 0.23015840362065418, + "grad_norm": 0.535474203626563, + "learning_rate": 1.7971708228013966e-05, + "loss": 0.5823, + "step": 2797 + }, + { + "epoch": 0.2302406912157992, + "grad_norm": 0.4653735052850396, + "learning_rate": 1.7970098758003697e-05, + "loss": 0.5227, + "step": 2798 + }, + { + "epoch": 0.23032297881094424, + "grad_norm": 2.797506340326556, + "learning_rate": 1.7968488721810364e-05, + "loss": 0.8128, + "step": 2799 + }, + { + "epoch": 0.23040526640608927, + "grad_norm": 4.896588128397363, + "learning_rate": 1.7966878119548335e-05, + "loss": 0.8044, + "step": 2800 + }, + { + "epoch": 0.2304875540012343, + "grad_norm": 2.876903760083837, + "learning_rate": 1.7965266951332027e-05, + "loss": 0.7948, + "step": 2801 + }, + { + "epoch": 0.23056984159637933, + "grad_norm": 2.5982431023174777, + "learning_rate": 1.796365521727589e-05, + "loss": 0.8107, + "step": 2802 + }, + { + "epoch": 0.2306521291915244, + "grad_norm": 2.615159312652529, + "learning_rate": 1.7962042917494427e-05, + "loss": 0.8297, + "step": 2803 + }, + { + "epoch": 0.23073441678666942, + "grad_norm": 2.9217124157634693, + "learning_rate": 1.7960430052102166e-05, + "loss": 0.844, + "step": 2804 + }, + { + "epoch": 0.23081670438181445, + "grad_norm": 0.5660495591221812, + "learning_rate": 1.7958816621213684e-05, + "loss": 0.5424, + "step": 2805 + }, + { + "epoch": 0.23089899197695948, + "grad_norm": 2.664884669703133, + "learning_rate": 1.79572026249436e-05, + "loss": 0.7837, + "step": 2806 + }, + { + "epoch": 0.23098127957210451, + "grad_norm": 3.2720350383314964, + "learning_rate": 1.7955588063406564e-05, + "loss": 0.7938, + "step": 2807 + }, + { + "epoch": 0.23106356716724955, + "grad_norm": 2.689662319312349, + "learning_rate": 1.7953972936717278e-05, + "loss": 0.8643, + "step": 2808 + }, + { + "epoch": 0.23114585476239458, + "grad_norm": 2.6269361429987277, + "learning_rate": 1.795235724499047e-05, + "loss": 0.8291, + "step": 2809 + }, + { + "epoch": 0.2312281423575396, + "grad_norm": 0.47857928016042517, + "learning_rate": 1.7950740988340926e-05, + "loss": 0.5723, + "step": 2810 + }, + { + "epoch": 0.23131042995268464, + "grad_norm": 0.4619817927680795, + "learning_rate": 1.7949124166883457e-05, + "loss": 0.5529, + "step": 2811 + }, + { + "epoch": 0.23139271754782967, + "grad_norm": 4.420187743344128, + "learning_rate": 1.794750678073292e-05, + "loss": 0.7903, + "step": 2812 + }, + { + "epoch": 0.2314750051429747, + "grad_norm": 2.674173263406235, + "learning_rate": 1.794588883000421e-05, + "loss": 0.8273, + "step": 2813 + }, + { + "epoch": 0.23155729273811973, + "grad_norm": 3.903932346463788, + "learning_rate": 1.7944270314812265e-05, + "loss": 0.8509, + "step": 2814 + }, + { + "epoch": 0.23163958033326476, + "grad_norm": 2.374337227802502, + "learning_rate": 1.7942651235272064e-05, + "loss": 0.8112, + "step": 2815 + }, + { + "epoch": 0.2317218679284098, + "grad_norm": 2.8597494534517365, + "learning_rate": 1.7941031591498623e-05, + "loss": 0.8384, + "step": 2816 + }, + { + "epoch": 0.23180415552355482, + "grad_norm": 2.5462037393295383, + "learning_rate": 1.793941138360699e-05, + "loss": 0.846, + "step": 2817 + }, + { + "epoch": 0.23188644311869985, + "grad_norm": 2.5528082911592986, + "learning_rate": 1.7937790611712275e-05, + "loss": 0.8105, + "step": 2818 + }, + { + "epoch": 0.23196873071384488, + "grad_norm": 2.5290198154963535, + "learning_rate": 1.793616927592961e-05, + "loss": 0.8408, + "step": 2819 + }, + { + "epoch": 0.23205101830898991, + "grad_norm": 2.401467555558881, + "learning_rate": 1.793454737637417e-05, + "loss": 0.817, + "step": 2820 + }, + { + "epoch": 0.23213330590413495, + "grad_norm": 0.7866378048410645, + "learning_rate": 1.7932924913161173e-05, + "loss": 0.5505, + "step": 2821 + }, + { + "epoch": 0.23221559349927998, + "grad_norm": 3.133879280551612, + "learning_rate": 1.793130188640588e-05, + "loss": 0.7989, + "step": 2822 + }, + { + "epoch": 0.232297881094425, + "grad_norm": 2.6587889416921553, + "learning_rate": 1.7929678296223585e-05, + "loss": 0.8313, + "step": 2823 + }, + { + "epoch": 0.23238016868957004, + "grad_norm": 0.458861730857705, + "learning_rate": 1.7928054142729622e-05, + "loss": 0.5455, + "step": 2824 + }, + { + "epoch": 0.23246245628471507, + "grad_norm": 0.4621236247774008, + "learning_rate": 1.7926429426039376e-05, + "loss": 0.555, + "step": 2825 + }, + { + "epoch": 0.2325447438798601, + "grad_norm": 3.52670740753614, + "learning_rate": 1.7924804146268257e-05, + "loss": 0.8106, + "step": 2826 + }, + { + "epoch": 0.23262703147500513, + "grad_norm": 2.873579756099866, + "learning_rate": 1.7923178303531727e-05, + "loss": 0.8178, + "step": 2827 + }, + { + "epoch": 0.23270931907015016, + "grad_norm": 3.9109130325921884, + "learning_rate": 1.792155189794528e-05, + "loss": 0.804, + "step": 2828 + }, + { + "epoch": 0.23279160666529522, + "grad_norm": 3.486775955872337, + "learning_rate": 1.7919924929624457e-05, + "loss": 0.8197, + "step": 2829 + }, + { + "epoch": 0.23287389426044025, + "grad_norm": 2.7880849629169937, + "learning_rate": 1.7918297398684828e-05, + "loss": 0.8276, + "step": 2830 + }, + { + "epoch": 0.23295618185558528, + "grad_norm": 3.0345851221812734, + "learning_rate": 1.791666930524202e-05, + "loss": 0.8391, + "step": 2831 + }, + { + "epoch": 0.2330384694507303, + "grad_norm": 2.7802856478901328, + "learning_rate": 1.7915040649411687e-05, + "loss": 0.8261, + "step": 2832 + }, + { + "epoch": 0.23312075704587534, + "grad_norm": 3.01147576928371, + "learning_rate": 1.7913411431309523e-05, + "loss": 0.8339, + "step": 2833 + }, + { + "epoch": 0.23320304464102037, + "grad_norm": 3.2421585360985836, + "learning_rate": 1.7911781651051263e-05, + "loss": 0.8345, + "step": 2834 + }, + { + "epoch": 0.2332853322361654, + "grad_norm": 2.984325765197047, + "learning_rate": 1.791015130875269e-05, + "loss": 0.8164, + "step": 2835 + }, + { + "epoch": 0.23336761983131044, + "grad_norm": 0.6096625903346664, + "learning_rate": 1.7908520404529618e-05, + "loss": 0.559, + "step": 2836 + }, + { + "epoch": 0.23344990742645547, + "grad_norm": 2.873760656894134, + "learning_rate": 1.7906888938497906e-05, + "loss": 0.812, + "step": 2837 + }, + { + "epoch": 0.2335321950216005, + "grad_norm": 0.5020987306387462, + "learning_rate": 1.7905256910773446e-05, + "loss": 0.5341, + "step": 2838 + }, + { + "epoch": 0.23361448261674553, + "grad_norm": 4.234624919345039, + "learning_rate": 1.7903624321472183e-05, + "loss": 0.8344, + "step": 2839 + }, + { + "epoch": 0.23369677021189056, + "grad_norm": 2.8206007894552187, + "learning_rate": 1.790199117071008e-05, + "loss": 0.7979, + "step": 2840 + }, + { + "epoch": 0.2337790578070356, + "grad_norm": 2.739405453759977, + "learning_rate": 1.7900357458603168e-05, + "loss": 0.8385, + "step": 2841 + }, + { + "epoch": 0.23386134540218062, + "grad_norm": 3.0773254877839733, + "learning_rate": 1.7898723185267496e-05, + "loss": 0.8249, + "step": 2842 + }, + { + "epoch": 0.23394363299732565, + "grad_norm": 4.644462021471566, + "learning_rate": 1.789708835081916e-05, + "loss": 0.8543, + "step": 2843 + }, + { + "epoch": 0.23402592059247068, + "grad_norm": 3.052268322199209, + "learning_rate": 1.7895452955374296e-05, + "loss": 0.8183, + "step": 2844 + }, + { + "epoch": 0.2341082081876157, + "grad_norm": 3.2071646031808556, + "learning_rate": 1.789381699904908e-05, + "loss": 0.8029, + "step": 2845 + }, + { + "epoch": 0.23419049578276074, + "grad_norm": 7.718251805801209, + "learning_rate": 1.789218048195973e-05, + "loss": 0.8158, + "step": 2846 + }, + { + "epoch": 0.23427278337790577, + "grad_norm": 3.693545358611462, + "learning_rate": 1.78905434042225e-05, + "loss": 0.831, + "step": 2847 + }, + { + "epoch": 0.2343550709730508, + "grad_norm": 3.871319336017675, + "learning_rate": 1.788890576595369e-05, + "loss": 0.7702, + "step": 2848 + }, + { + "epoch": 0.23443735856819584, + "grad_norm": 3.3689317720232053, + "learning_rate": 1.7887267567269627e-05, + "loss": 0.8003, + "step": 2849 + }, + { + "epoch": 0.23451964616334087, + "grad_norm": 0.6691224385810717, + "learning_rate": 1.788562880828669e-05, + "loss": 0.5571, + "step": 2850 + }, + { + "epoch": 0.2346019337584859, + "grad_norm": 3.202024357123697, + "learning_rate": 1.7883989489121293e-05, + "loss": 0.8483, + "step": 2851 + }, + { + "epoch": 0.23468422135363093, + "grad_norm": 2.3964130234978556, + "learning_rate": 1.7882349609889896e-05, + "loss": 0.807, + "step": 2852 + }, + { + "epoch": 0.23476650894877596, + "grad_norm": 2.6580648611006334, + "learning_rate": 1.788070917070898e-05, + "loss": 0.799, + "step": 2853 + }, + { + "epoch": 0.234848796543921, + "grad_norm": 2.538196363098514, + "learning_rate": 1.7879068171695095e-05, + "loss": 0.8416, + "step": 2854 + }, + { + "epoch": 0.23493108413906605, + "grad_norm": 3.285871614256874, + "learning_rate": 1.7877426612964805e-05, + "loss": 0.8215, + "step": 2855 + }, + { + "epoch": 0.23501337173421108, + "grad_norm": 0.476710228187294, + "learning_rate": 1.7875784494634727e-05, + "loss": 0.5773, + "step": 2856 + }, + { + "epoch": 0.2350956593293561, + "grad_norm": 2.710230028079557, + "learning_rate": 1.7874141816821516e-05, + "loss": 0.7879, + "step": 2857 + }, + { + "epoch": 0.23517794692450114, + "grad_norm": 0.4302482991466844, + "learning_rate": 1.787249857964186e-05, + "loss": 0.5361, + "step": 2858 + }, + { + "epoch": 0.23526023451964617, + "grad_norm": 2.55335830830582, + "learning_rate": 1.7870854783212497e-05, + "loss": 0.8313, + "step": 2859 + }, + { + "epoch": 0.2353425221147912, + "grad_norm": 2.9154561286287004, + "learning_rate": 1.7869210427650197e-05, + "loss": 0.8045, + "step": 2860 + }, + { + "epoch": 0.23542480970993623, + "grad_norm": 2.967332392427343, + "learning_rate": 1.7867565513071775e-05, + "loss": 0.8084, + "step": 2861 + }, + { + "epoch": 0.23550709730508126, + "grad_norm": 2.4313743351898625, + "learning_rate": 1.7865920039594077e-05, + "loss": 0.8063, + "step": 2862 + }, + { + "epoch": 0.2355893849002263, + "grad_norm": 0.49104905859343323, + "learning_rate": 1.7864274007334e-05, + "loss": 0.5597, + "step": 2863 + }, + { + "epoch": 0.23567167249537133, + "grad_norm": 0.44956185246382396, + "learning_rate": 1.786262741640848e-05, + "loss": 0.5427, + "step": 2864 + }, + { + "epoch": 0.23575396009051636, + "grad_norm": 2.719530503696874, + "learning_rate": 1.7860980266934477e-05, + "loss": 0.8003, + "step": 2865 + }, + { + "epoch": 0.2358362476856614, + "grad_norm": 3.9138205896080156, + "learning_rate": 1.7859332559029007e-05, + "loss": 0.8104, + "step": 2866 + }, + { + "epoch": 0.23591853528080642, + "grad_norm": 2.3110630830595973, + "learning_rate": 1.7857684292809125e-05, + "loss": 0.8174, + "step": 2867 + }, + { + "epoch": 0.23600082287595145, + "grad_norm": 4.057356326105194, + "learning_rate": 1.7856035468391916e-05, + "loss": 0.8488, + "step": 2868 + }, + { + "epoch": 0.23608311047109648, + "grad_norm": 3.178059190227258, + "learning_rate": 1.785438608589451e-05, + "loss": 0.8503, + "step": 2869 + }, + { + "epoch": 0.2361653980662415, + "grad_norm": 3.742005522748046, + "learning_rate": 1.785273614543408e-05, + "loss": 0.8381, + "step": 2870 + }, + { + "epoch": 0.23624768566138654, + "grad_norm": 3.611675148428781, + "learning_rate": 1.7851085647127834e-05, + "loss": 0.8436, + "step": 2871 + }, + { + "epoch": 0.23632997325653157, + "grad_norm": 0.52588085352646, + "learning_rate": 1.7849434591093016e-05, + "loss": 0.5563, + "step": 2872 + }, + { + "epoch": 0.2364122608516766, + "grad_norm": 3.0529399713122354, + "learning_rate": 1.784778297744692e-05, + "loss": 0.8279, + "step": 2873 + }, + { + "epoch": 0.23649454844682163, + "grad_norm": 3.8698554755964962, + "learning_rate": 1.784613080630687e-05, + "loss": 0.8416, + "step": 2874 + }, + { + "epoch": 0.23657683604196666, + "grad_norm": 3.2625105635611513, + "learning_rate": 1.7844478077790233e-05, + "loss": 0.8031, + "step": 2875 + }, + { + "epoch": 0.2366591236371117, + "grad_norm": 3.2943509488596137, + "learning_rate": 1.7842824792014427e-05, + "loss": 0.8336, + "step": 2876 + }, + { + "epoch": 0.23674141123225673, + "grad_norm": 3.235947871537383, + "learning_rate": 1.7841170949096883e-05, + "loss": 0.8093, + "step": 2877 + }, + { + "epoch": 0.23682369882740176, + "grad_norm": 3.77576612731386, + "learning_rate": 1.78395165491551e-05, + "loss": 0.8065, + "step": 2878 + }, + { + "epoch": 0.2369059864225468, + "grad_norm": 5.456827759241825, + "learning_rate": 1.7837861592306597e-05, + "loss": 0.8474, + "step": 2879 + }, + { + "epoch": 0.23698827401769185, + "grad_norm": 3.6312496615136545, + "learning_rate": 1.783620607866894e-05, + "loss": 0.8046, + "step": 2880 + }, + { + "epoch": 0.23707056161283688, + "grad_norm": 3.862392789417281, + "learning_rate": 1.7834550008359738e-05, + "loss": 0.809, + "step": 2881 + }, + { + "epoch": 0.2371528492079819, + "grad_norm": 3.1797948789980706, + "learning_rate": 1.783289338149663e-05, + "loss": 0.8026, + "step": 2882 + }, + { + "epoch": 0.23723513680312694, + "grad_norm": 4.0496067473943835, + "learning_rate": 1.7831236198197305e-05, + "loss": 0.8172, + "step": 2883 + }, + { + "epoch": 0.23731742439827197, + "grad_norm": 3.201889161643371, + "learning_rate": 1.7829578458579483e-05, + "loss": 0.8203, + "step": 2884 + }, + { + "epoch": 0.237399711993417, + "grad_norm": 3.0994372587653944, + "learning_rate": 1.782792016276093e-05, + "loss": 0.8286, + "step": 2885 + }, + { + "epoch": 0.23748199958856203, + "grad_norm": 6.101329439325836, + "learning_rate": 1.7826261310859447e-05, + "loss": 0.8153, + "step": 2886 + }, + { + "epoch": 0.23756428718370706, + "grad_norm": 3.0580009754371407, + "learning_rate": 1.782460190299288e-05, + "loss": 0.8317, + "step": 2887 + }, + { + "epoch": 0.2376465747788521, + "grad_norm": 3.1252717952475177, + "learning_rate": 1.78229419392791e-05, + "loss": 0.8358, + "step": 2888 + }, + { + "epoch": 0.23772886237399712, + "grad_norm": 0.4793194081351795, + "learning_rate": 1.7821281419836044e-05, + "loss": 0.5412, + "step": 2889 + }, + { + "epoch": 0.23781114996914215, + "grad_norm": 3.541850890247188, + "learning_rate": 1.7819620344781657e-05, + "loss": 0.8569, + "step": 2890 + }, + { + "epoch": 0.23789343756428719, + "grad_norm": 3.882668007827672, + "learning_rate": 1.7817958714233952e-05, + "loss": 0.8249, + "step": 2891 + }, + { + "epoch": 0.23797572515943222, + "grad_norm": 3.3992855029015074, + "learning_rate": 1.781629652831096e-05, + "loss": 0.8093, + "step": 2892 + }, + { + "epoch": 0.23805801275457725, + "grad_norm": 3.5866512708918097, + "learning_rate": 1.781463378713076e-05, + "loss": 0.7686, + "step": 2893 + }, + { + "epoch": 0.23814030034972228, + "grad_norm": 4.804221247572846, + "learning_rate": 1.781297049081148e-05, + "loss": 0.8252, + "step": 2894 + }, + { + "epoch": 0.2382225879448673, + "grad_norm": 4.3070667034437085, + "learning_rate": 1.7811306639471267e-05, + "loss": 0.7975, + "step": 2895 + }, + { + "epoch": 0.23830487554001234, + "grad_norm": 3.1785831184030444, + "learning_rate": 1.7809642233228324e-05, + "loss": 0.8118, + "step": 2896 + }, + { + "epoch": 0.23838716313515737, + "grad_norm": 0.4418882975218361, + "learning_rate": 1.780797727220089e-05, + "loss": 0.5324, + "step": 2897 + }, + { + "epoch": 0.2384694507303024, + "grad_norm": 2.862289776828012, + "learning_rate": 1.7806311756507232e-05, + "loss": 0.831, + "step": 2898 + }, + { + "epoch": 0.23855173832544743, + "grad_norm": 3.4885230093797825, + "learning_rate": 1.7804645686265672e-05, + "loss": 0.8414, + "step": 2899 + }, + { + "epoch": 0.23863402592059246, + "grad_norm": 2.584012821454574, + "learning_rate": 1.7802979061594564e-05, + "loss": 0.846, + "step": 2900 + }, + { + "epoch": 0.2387163135157375, + "grad_norm": 2.6570487273962646, + "learning_rate": 1.7801311882612307e-05, + "loss": 0.8186, + "step": 2901 + }, + { + "epoch": 0.23879860111088252, + "grad_norm": 0.44804613300031815, + "learning_rate": 1.7799644149437325e-05, + "loss": 0.5788, + "step": 2902 + }, + { + "epoch": 0.23888088870602756, + "grad_norm": 2.253489610537284, + "learning_rate": 1.77979758621881e-05, + "loss": 0.7736, + "step": 2903 + }, + { + "epoch": 0.2389631763011726, + "grad_norm": 2.943977446980764, + "learning_rate": 1.7796307020983137e-05, + "loss": 0.8284, + "step": 2904 + }, + { + "epoch": 0.23904546389631762, + "grad_norm": 0.4239993593652877, + "learning_rate": 1.7794637625940992e-05, + "loss": 0.5514, + "step": 2905 + }, + { + "epoch": 0.23912775149146268, + "grad_norm": 2.9737445711445463, + "learning_rate": 1.7792967677180263e-05, + "loss": 0.7948, + "step": 2906 + }, + { + "epoch": 0.2392100390866077, + "grad_norm": 2.4626561112597805, + "learning_rate": 1.7791297174819562e-05, + "loss": 0.7985, + "step": 2907 + }, + { + "epoch": 0.23929232668175274, + "grad_norm": 2.2496018970898675, + "learning_rate": 1.778962611897758e-05, + "loss": 0.7983, + "step": 2908 + }, + { + "epoch": 0.23937461427689777, + "grad_norm": 3.8808962895028514, + "learning_rate": 1.778795450977301e-05, + "loss": 0.8173, + "step": 2909 + }, + { + "epoch": 0.2394569018720428, + "grad_norm": 2.9670632893908646, + "learning_rate": 1.7786282347324607e-05, + "loss": 0.8209, + "step": 2910 + }, + { + "epoch": 0.23953918946718783, + "grad_norm": 2.4468118274384874, + "learning_rate": 1.7784609631751162e-05, + "loss": 0.8414, + "step": 2911 + }, + { + "epoch": 0.23962147706233286, + "grad_norm": 2.6242870827617444, + "learning_rate": 1.7782936363171496e-05, + "loss": 0.7946, + "step": 2912 + }, + { + "epoch": 0.2397037646574779, + "grad_norm": 2.869216446545581, + "learning_rate": 1.778126254170448e-05, + "loss": 0.8297, + "step": 2913 + }, + { + "epoch": 0.23978605225262292, + "grad_norm": 2.1225891935942536, + "learning_rate": 1.7779588167469014e-05, + "loss": 0.8305, + "step": 2914 + }, + { + "epoch": 0.23986833984776795, + "grad_norm": 0.4726639378498239, + "learning_rate": 1.7777913240584046e-05, + "loss": 0.5738, + "step": 2915 + }, + { + "epoch": 0.23995062744291298, + "grad_norm": 2.39252667296644, + "learning_rate": 1.7776237761168562e-05, + "loss": 0.7883, + "step": 2916 + }, + { + "epoch": 0.24003291503805801, + "grad_norm": 2.0755652310001627, + "learning_rate": 1.7774561729341583e-05, + "loss": 0.8075, + "step": 2917 + }, + { + "epoch": 0.24011520263320305, + "grad_norm": 2.157183836032903, + "learning_rate": 1.7772885145222175e-05, + "loss": 0.801, + "step": 2918 + }, + { + "epoch": 0.24019749022834808, + "grad_norm": 2.886558156591231, + "learning_rate": 1.7771208008929434e-05, + "loss": 0.8196, + "step": 2919 + }, + { + "epoch": 0.2402797778234931, + "grad_norm": 2.503925351985564, + "learning_rate": 1.7769530320582504e-05, + "loss": 0.8104, + "step": 2920 + }, + { + "epoch": 0.24036206541863814, + "grad_norm": 2.3860694335338177, + "learning_rate": 1.776785208030057e-05, + "loss": 0.8078, + "step": 2921 + }, + { + "epoch": 0.24044435301378317, + "grad_norm": 2.892237805255794, + "learning_rate": 1.776617328820284e-05, + "loss": 0.8211, + "step": 2922 + }, + { + "epoch": 0.2405266406089282, + "grad_norm": 3.012439927965674, + "learning_rate": 1.7764493944408583e-05, + "loss": 0.8232, + "step": 2923 + }, + { + "epoch": 0.24060892820407323, + "grad_norm": 2.996719130927815, + "learning_rate": 1.7762814049037096e-05, + "loss": 0.8153, + "step": 2924 + }, + { + "epoch": 0.24069121579921826, + "grad_norm": 2.342317094738393, + "learning_rate": 1.7761133602207712e-05, + "loss": 0.8192, + "step": 2925 + }, + { + "epoch": 0.2407735033943633, + "grad_norm": 2.887662051850029, + "learning_rate": 1.775945260403981e-05, + "loss": 0.7935, + "step": 2926 + }, + { + "epoch": 0.24085579098950832, + "grad_norm": 2.4837332329260535, + "learning_rate": 1.77577710546528e-05, + "loss": 0.8325, + "step": 2927 + }, + { + "epoch": 0.24093807858465335, + "grad_norm": 3.059684047216481, + "learning_rate": 1.7756088954166147e-05, + "loss": 0.8319, + "step": 2928 + }, + { + "epoch": 0.24102036617979838, + "grad_norm": 3.4499819342486924, + "learning_rate": 1.7754406302699333e-05, + "loss": 0.8232, + "step": 2929 + }, + { + "epoch": 0.24110265377494342, + "grad_norm": 0.4818175678770239, + "learning_rate": 1.77527231003719e-05, + "loss": 0.55, + "step": 2930 + }, + { + "epoch": 0.24118494137008845, + "grad_norm": 3.0920558417930617, + "learning_rate": 1.7751039347303417e-05, + "loss": 0.7753, + "step": 2931 + }, + { + "epoch": 0.2412672289652335, + "grad_norm": 2.998597893505298, + "learning_rate": 1.7749355043613493e-05, + "loss": 0.8651, + "step": 2932 + }, + { + "epoch": 0.24134951656037854, + "grad_norm": 3.9038102066258777, + "learning_rate": 1.7747670189421786e-05, + "loss": 0.8337, + "step": 2933 + }, + { + "epoch": 0.24143180415552357, + "grad_norm": 2.5723479252406913, + "learning_rate": 1.774598478484797e-05, + "loss": 0.7963, + "step": 2934 + }, + { + "epoch": 0.2415140917506686, + "grad_norm": 0.4300721442737975, + "learning_rate": 1.774429883001179e-05, + "loss": 0.5481, + "step": 2935 + }, + { + "epoch": 0.24159637934581363, + "grad_norm": 2.6283222051192503, + "learning_rate": 1.7742612325033e-05, + "loss": 0.8372, + "step": 2936 + }, + { + "epoch": 0.24167866694095866, + "grad_norm": 2.415803374736619, + "learning_rate": 1.7740925270031417e-05, + "loss": 0.8244, + "step": 2937 + }, + { + "epoch": 0.2417609545361037, + "grad_norm": 1.971274404106693, + "learning_rate": 1.7739237665126885e-05, + "loss": 0.8127, + "step": 2938 + }, + { + "epoch": 0.24184324213124872, + "grad_norm": 7.078131438955339, + "learning_rate": 1.773754951043928e-05, + "loss": 0.799, + "step": 2939 + }, + { + "epoch": 0.24192552972639375, + "grad_norm": 2.5314945703551155, + "learning_rate": 1.7735860806088538e-05, + "loss": 0.8475, + "step": 2940 + }, + { + "epoch": 0.24200781732153878, + "grad_norm": 0.47201850617328384, + "learning_rate": 1.7734171552194613e-05, + "loss": 0.5476, + "step": 2941 + }, + { + "epoch": 0.2420901049166838, + "grad_norm": 2.4755906245496977, + "learning_rate": 1.773248174887751e-05, + "loss": 0.8388, + "step": 2942 + }, + { + "epoch": 0.24217239251182884, + "grad_norm": 3.512614041752255, + "learning_rate": 1.773079139625727e-05, + "loss": 0.8126, + "step": 2943 + }, + { + "epoch": 0.24225468010697387, + "grad_norm": 2.515303401227105, + "learning_rate": 1.772910049445397e-05, + "loss": 0.8275, + "step": 2944 + }, + { + "epoch": 0.2423369677021189, + "grad_norm": 3.250687085735003, + "learning_rate": 1.7727409043587736e-05, + "loss": 0.8373, + "step": 2945 + }, + { + "epoch": 0.24241925529726394, + "grad_norm": 2.324671566478648, + "learning_rate": 1.7725717043778724e-05, + "loss": 0.7995, + "step": 2946 + }, + { + "epoch": 0.24250154289240897, + "grad_norm": 0.468722771305979, + "learning_rate": 1.7724024495147123e-05, + "loss": 0.5426, + "step": 2947 + }, + { + "epoch": 0.242583830487554, + "grad_norm": 3.4777394590970947, + "learning_rate": 1.7722331397813177e-05, + "loss": 0.8027, + "step": 2948 + }, + { + "epoch": 0.24266611808269903, + "grad_norm": 3.0206885939563164, + "learning_rate": 1.772063775189716e-05, + "loss": 0.8526, + "step": 2949 + }, + { + "epoch": 0.24274840567784406, + "grad_norm": 3.331833102443693, + "learning_rate": 1.771894355751938e-05, + "loss": 0.8295, + "step": 2950 + }, + { + "epoch": 0.2428306932729891, + "grad_norm": 2.8204423273747867, + "learning_rate": 1.7717248814800198e-05, + "loss": 0.7961, + "step": 2951 + }, + { + "epoch": 0.24291298086813412, + "grad_norm": 2.612675828083287, + "learning_rate": 1.771555352386e-05, + "loss": 0.8055, + "step": 2952 + }, + { + "epoch": 0.24299526846327915, + "grad_norm": 2.4201788857878737, + "learning_rate": 1.771385768481922e-05, + "loss": 0.8306, + "step": 2953 + }, + { + "epoch": 0.24307755605842418, + "grad_norm": 2.3617116892356536, + "learning_rate": 1.771216129779833e-05, + "loss": 0.8315, + "step": 2954 + }, + { + "epoch": 0.2431598436535692, + "grad_norm": 2.2479992095226575, + "learning_rate": 1.771046436291783e-05, + "loss": 0.7626, + "step": 2955 + }, + { + "epoch": 0.24324213124871424, + "grad_norm": 2.7526501879998015, + "learning_rate": 1.7708766880298275e-05, + "loss": 0.8079, + "step": 2956 + }, + { + "epoch": 0.24332441884385927, + "grad_norm": 2.474424972586215, + "learning_rate": 1.7707068850060247e-05, + "loss": 0.8315, + "step": 2957 + }, + { + "epoch": 0.24340670643900433, + "grad_norm": 2.7862231336500685, + "learning_rate": 1.7705370272324375e-05, + "loss": 0.8485, + "step": 2958 + }, + { + "epoch": 0.24348899403414936, + "grad_norm": 2.5829056955847247, + "learning_rate": 1.770367114721132e-05, + "loss": 0.8221, + "step": 2959 + }, + { + "epoch": 0.2435712816292944, + "grad_norm": 2.667211592504044, + "learning_rate": 1.7701971474841793e-05, + "loss": 0.8145, + "step": 2960 + }, + { + "epoch": 0.24365356922443943, + "grad_norm": 0.48873734030396343, + "learning_rate": 1.7700271255336525e-05, + "loss": 0.5677, + "step": 2961 + }, + { + "epoch": 0.24373585681958446, + "grad_norm": 2.888232728250638, + "learning_rate": 1.76985704888163e-05, + "loss": 0.8315, + "step": 2962 + }, + { + "epoch": 0.2438181444147295, + "grad_norm": 20.925313878929853, + "learning_rate": 1.769686917540194e-05, + "loss": 0.8133, + "step": 2963 + }, + { + "epoch": 0.24390043200987452, + "grad_norm": 2.762209834419681, + "learning_rate": 1.769516731521431e-05, + "loss": 0.8132, + "step": 2964 + }, + { + "epoch": 0.24398271960501955, + "grad_norm": 2.8652691832779413, + "learning_rate": 1.7693464908374295e-05, + "loss": 0.8303, + "step": 2965 + }, + { + "epoch": 0.24406500720016458, + "grad_norm": 2.267440807893723, + "learning_rate": 1.7691761955002837e-05, + "loss": 0.7961, + "step": 2966 + }, + { + "epoch": 0.2441472947953096, + "grad_norm": 2.5922017142258764, + "learning_rate": 1.769005845522091e-05, + "loss": 0.8466, + "step": 2967 + }, + { + "epoch": 0.24422958239045464, + "grad_norm": 2.435470874214321, + "learning_rate": 1.768835440914953e-05, + "loss": 0.8278, + "step": 2968 + }, + { + "epoch": 0.24431186998559967, + "grad_norm": 2.734768088007503, + "learning_rate": 1.768664981690975e-05, + "loss": 0.8162, + "step": 2969 + }, + { + "epoch": 0.2443941575807447, + "grad_norm": 2.2931159488349095, + "learning_rate": 1.768494467862266e-05, + "loss": 0.8015, + "step": 2970 + }, + { + "epoch": 0.24447644517588973, + "grad_norm": 2.33505639795374, + "learning_rate": 1.768323899440939e-05, + "loss": 0.8067, + "step": 2971 + }, + { + "epoch": 0.24455873277103476, + "grad_norm": 3.1191561268536265, + "learning_rate": 1.7681532764391108e-05, + "loss": 0.8209, + "step": 2972 + }, + { + "epoch": 0.2446410203661798, + "grad_norm": 3.439317442966874, + "learning_rate": 1.767982598868902e-05, + "loss": 0.8156, + "step": 2973 + }, + { + "epoch": 0.24472330796132483, + "grad_norm": 4.029913486068659, + "learning_rate": 1.767811866742438e-05, + "loss": 0.8809, + "step": 2974 + }, + { + "epoch": 0.24480559555646986, + "grad_norm": 0.47992780240279903, + "learning_rate": 1.767641080071847e-05, + "loss": 0.527, + "step": 2975 + }, + { + "epoch": 0.2448878831516149, + "grad_norm": 1.9906042687346728, + "learning_rate": 1.7674702388692612e-05, + "loss": 0.8284, + "step": 2976 + }, + { + "epoch": 0.24497017074675992, + "grad_norm": 5.348166325662937, + "learning_rate": 1.7672993431468167e-05, + "loss": 0.8185, + "step": 2977 + }, + { + "epoch": 0.24505245834190495, + "grad_norm": 3.0302113443408167, + "learning_rate": 1.7671283929166545e-05, + "loss": 0.8256, + "step": 2978 + }, + { + "epoch": 0.24513474593704998, + "grad_norm": 2.765533594139685, + "learning_rate": 1.766957388190918e-05, + "loss": 0.8216, + "step": 2979 + }, + { + "epoch": 0.245217033532195, + "grad_norm": 5.965302503829052, + "learning_rate": 1.766786328981755e-05, + "loss": 0.7883, + "step": 2980 + }, + { + "epoch": 0.24529932112734004, + "grad_norm": 2.7406380130111803, + "learning_rate": 1.7666152153013177e-05, + "loss": 0.7926, + "step": 2981 + }, + { + "epoch": 0.24538160872248507, + "grad_norm": 0.47825800329757423, + "learning_rate": 1.766444047161761e-05, + "loss": 0.5796, + "step": 2982 + }, + { + "epoch": 0.2454638963176301, + "grad_norm": 2.310350076864101, + "learning_rate": 1.7662728245752453e-05, + "loss": 0.8045, + "step": 2983 + }, + { + "epoch": 0.24554618391277516, + "grad_norm": 0.4363743762926369, + "learning_rate": 1.7661015475539337e-05, + "loss": 0.5486, + "step": 2984 + }, + { + "epoch": 0.2456284715079202, + "grad_norm": 3.286691924601928, + "learning_rate": 1.7659302161099935e-05, + "loss": 0.8471, + "step": 2985 + }, + { + "epoch": 0.24571075910306522, + "grad_norm": 2.3988825258194364, + "learning_rate": 1.7657588302555956e-05, + "loss": 0.8123, + "step": 2986 + }, + { + "epoch": 0.24579304669821025, + "grad_norm": 2.1323590274633224, + "learning_rate": 1.7655873900029147e-05, + "loss": 0.8215, + "step": 2987 + }, + { + "epoch": 0.24587533429335529, + "grad_norm": 2.7473213088400184, + "learning_rate": 1.7654158953641303e-05, + "loss": 0.8162, + "step": 2988 + }, + { + "epoch": 0.24595762188850032, + "grad_norm": 3.185590661191936, + "learning_rate": 1.7652443463514245e-05, + "loss": 0.8183, + "step": 2989 + }, + { + "epoch": 0.24603990948364535, + "grad_norm": 2.676247353052082, + "learning_rate": 1.7650727429769844e-05, + "loss": 0.8187, + "step": 2990 + }, + { + "epoch": 0.24612219707879038, + "grad_norm": 0.4753685846545083, + "learning_rate": 1.7649010852530005e-05, + "loss": 0.5607, + "step": 2991 + }, + { + "epoch": 0.2462044846739354, + "grad_norm": 2.261811425327102, + "learning_rate": 1.7647293731916664e-05, + "loss": 0.8185, + "step": 2992 + }, + { + "epoch": 0.24628677226908044, + "grad_norm": 2.3374305262319464, + "learning_rate": 1.7645576068051806e-05, + "loss": 0.796, + "step": 2993 + }, + { + "epoch": 0.24636905986422547, + "grad_norm": 0.45724340821086584, + "learning_rate": 1.7643857861057453e-05, + "loss": 0.5572, + "step": 2994 + }, + { + "epoch": 0.2464513474593705, + "grad_norm": 2.8307572222686566, + "learning_rate": 1.764213911105566e-05, + "loss": 0.8397, + "step": 2995 + }, + { + "epoch": 0.24653363505451553, + "grad_norm": 2.5129736766859203, + "learning_rate": 1.764041981816853e-05, + "loss": 0.7905, + "step": 2996 + }, + { + "epoch": 0.24661592264966056, + "grad_norm": 2.9659850416830102, + "learning_rate": 1.7638699982518193e-05, + "loss": 0.8138, + "step": 2997 + }, + { + "epoch": 0.2466982102448056, + "grad_norm": 2.6159577238005025, + "learning_rate": 1.7636979604226826e-05, + "loss": 0.8193, + "step": 2998 + }, + { + "epoch": 0.24678049783995062, + "grad_norm": 0.4223222850046514, + "learning_rate": 1.763525868341664e-05, + "loss": 0.5336, + "step": 2999 + }, + { + "epoch": 0.24686278543509566, + "grad_norm": 2.5919015592128196, + "learning_rate": 1.763353722020989e-05, + "loss": 0.8278, + "step": 3000 + }, + { + "epoch": 0.24694507303024069, + "grad_norm": 2.9501868940145024, + "learning_rate": 1.763181521472886e-05, + "loss": 0.8371, + "step": 3001 + }, + { + "epoch": 0.24702736062538572, + "grad_norm": 8.262148809997365, + "learning_rate": 1.7630092667095886e-05, + "loss": 0.8384, + "step": 3002 + }, + { + "epoch": 0.24710964822053075, + "grad_norm": 6.492544055589544, + "learning_rate": 1.7628369577433328e-05, + "loss": 0.8187, + "step": 3003 + }, + { + "epoch": 0.24719193581567578, + "grad_norm": 0.44702863278235955, + "learning_rate": 1.7626645945863598e-05, + "loss": 0.5541, + "step": 3004 + }, + { + "epoch": 0.2472742234108208, + "grad_norm": 2.4050113395335355, + "learning_rate": 1.7624921772509137e-05, + "loss": 0.8374, + "step": 3005 + }, + { + "epoch": 0.24735651100596584, + "grad_norm": 2.6401344327287224, + "learning_rate": 1.762319705749243e-05, + "loss": 0.825, + "step": 3006 + }, + { + "epoch": 0.24743879860111087, + "grad_norm": 2.7101886526639904, + "learning_rate": 1.762147180093599e-05, + "loss": 0.8071, + "step": 3007 + }, + { + "epoch": 0.2475210861962559, + "grad_norm": 0.4569123759115584, + "learning_rate": 1.7619746002962385e-05, + "loss": 0.5317, + "step": 3008 + }, + { + "epoch": 0.24760337379140093, + "grad_norm": 2.1143175811102766, + "learning_rate": 1.7618019663694213e-05, + "loss": 0.8247, + "step": 3009 + }, + { + "epoch": 0.247685661386546, + "grad_norm": 2.697483320930217, + "learning_rate": 1.76162927832541e-05, + "loss": 0.828, + "step": 3010 + }, + { + "epoch": 0.24776794898169102, + "grad_norm": 2.319060517273934, + "learning_rate": 1.7614565361764736e-05, + "loss": 0.8083, + "step": 3011 + }, + { + "epoch": 0.24785023657683605, + "grad_norm": 2.1388551387396966, + "learning_rate": 1.761283739934882e-05, + "loss": 0.815, + "step": 3012 + }, + { + "epoch": 0.24793252417198108, + "grad_norm": 2.0823734950527504, + "learning_rate": 1.761110889612911e-05, + "loss": 0.8429, + "step": 3013 + }, + { + "epoch": 0.24801481176712611, + "grad_norm": 2.7512328496729084, + "learning_rate": 1.76093798522284e-05, + "loss": 0.8381, + "step": 3014 + }, + { + "epoch": 0.24809709936227115, + "grad_norm": 3.2382987863079076, + "learning_rate": 1.7607650267769518e-05, + "loss": 0.8127, + "step": 3015 + }, + { + "epoch": 0.24817938695741618, + "grad_norm": 2.8477801097635145, + "learning_rate": 1.760592014287532e-05, + "loss": 0.8231, + "step": 3016 + }, + { + "epoch": 0.2482616745525612, + "grad_norm": 2.403751343409887, + "learning_rate": 1.7604189477668723e-05, + "loss": 0.8021, + "step": 3017 + }, + { + "epoch": 0.24834396214770624, + "grad_norm": 2.530935094848329, + "learning_rate": 1.7602458272272664e-05, + "loss": 0.8153, + "step": 3018 + }, + { + "epoch": 0.24842624974285127, + "grad_norm": 2.63352136642727, + "learning_rate": 1.760072652681013e-05, + "loss": 0.809, + "step": 3019 + }, + { + "epoch": 0.2485085373379963, + "grad_norm": 3.458558440607959, + "learning_rate": 1.7598994241404138e-05, + "loss": 0.8249, + "step": 3020 + }, + { + "epoch": 0.24859082493314133, + "grad_norm": 5.264472625440188, + "learning_rate": 1.7597261416177748e-05, + "loss": 0.8381, + "step": 3021 + }, + { + "epoch": 0.24867311252828636, + "grad_norm": 2.7924585028224773, + "learning_rate": 1.759552805125406e-05, + "loss": 0.8332, + "step": 3022 + }, + { + "epoch": 0.2487554001234314, + "grad_norm": 2.811529056203523, + "learning_rate": 1.75937941467562e-05, + "loss": 0.8245, + "step": 3023 + }, + { + "epoch": 0.24883768771857642, + "grad_norm": 2.3834870192262585, + "learning_rate": 1.7592059702807355e-05, + "loss": 0.8127, + "step": 3024 + }, + { + "epoch": 0.24891997531372145, + "grad_norm": 3.080319879970508, + "learning_rate": 1.7590324719530727e-05, + "loss": 0.8208, + "step": 3025 + }, + { + "epoch": 0.24900226290886648, + "grad_norm": 2.4573468356132198, + "learning_rate": 1.7588589197049567e-05, + "loss": 0.792, + "step": 3026 + }, + { + "epoch": 0.24908455050401151, + "grad_norm": 3.0390976228111892, + "learning_rate": 1.7586853135487173e-05, + "loss": 0.8175, + "step": 3027 + }, + { + "epoch": 0.24916683809915655, + "grad_norm": 3.010888603421016, + "learning_rate": 1.7585116534966862e-05, + "loss": 0.7831, + "step": 3028 + }, + { + "epoch": 0.24924912569430158, + "grad_norm": 2.2894645992642015, + "learning_rate": 1.7583379395612e-05, + "loss": 0.8009, + "step": 3029 + }, + { + "epoch": 0.2493314132894466, + "grad_norm": 2.3589866437890654, + "learning_rate": 1.7581641717546e-05, + "loss": 0.8162, + "step": 3030 + }, + { + "epoch": 0.24941370088459164, + "grad_norm": 0.45847778884193263, + "learning_rate": 1.7579903500892295e-05, + "loss": 0.554, + "step": 3031 + }, + { + "epoch": 0.24949598847973667, + "grad_norm": 2.667053134073255, + "learning_rate": 1.7578164745774365e-05, + "loss": 0.805, + "step": 3032 + }, + { + "epoch": 0.2495782760748817, + "grad_norm": 2.5236571720177006, + "learning_rate": 1.7576425452315734e-05, + "loss": 0.8178, + "step": 3033 + }, + { + "epoch": 0.24966056367002673, + "grad_norm": 2.258153479818695, + "learning_rate": 1.7574685620639955e-05, + "loss": 0.8225, + "step": 3034 + }, + { + "epoch": 0.2497428512651718, + "grad_norm": 2.3110737984568654, + "learning_rate": 1.7572945250870622e-05, + "loss": 0.7955, + "step": 3035 + }, + { + "epoch": 0.24982513886031682, + "grad_norm": 2.5234714054140923, + "learning_rate": 1.7571204343131373e-05, + "loss": 0.7953, + "step": 3036 + }, + { + "epoch": 0.24990742645546185, + "grad_norm": 5.904592799601487, + "learning_rate": 1.7569462897545873e-05, + "loss": 0.8165, + "step": 3037 + }, + { + "epoch": 0.24998971405060688, + "grad_norm": 2.0965767073497155, + "learning_rate": 1.7567720914237835e-05, + "loss": 0.7897, + "step": 3038 + }, + { + "epoch": 0.2500720016457519, + "grad_norm": 4.594650483128967, + "learning_rate": 1.7565978393331005e-05, + "loss": 0.8458, + "step": 3039 + }, + { + "epoch": 0.25015428924089694, + "grad_norm": 0.4553532023589861, + "learning_rate": 1.756423533494917e-05, + "loss": 0.5334, + "step": 3040 + }, + { + "epoch": 0.25023657683604195, + "grad_norm": 2.6783122193891757, + "learning_rate": 1.7562491739216155e-05, + "loss": 0.8186, + "step": 3041 + }, + { + "epoch": 0.250318864431187, + "grad_norm": 2.2723065495937087, + "learning_rate": 1.756074760625582e-05, + "loss": 0.8018, + "step": 3042 + }, + { + "epoch": 0.250401152026332, + "grad_norm": 2.9650880609188435, + "learning_rate": 1.755900293619207e-05, + "loss": 0.8399, + "step": 3043 + }, + { + "epoch": 0.25048343962147707, + "grad_norm": 3.9171173074396917, + "learning_rate": 1.755725772914884e-05, + "loss": 0.8322, + "step": 3044 + }, + { + "epoch": 0.25056572721662207, + "grad_norm": 0.44238452905452086, + "learning_rate": 1.75555119852501e-05, + "loss": 0.5509, + "step": 3045 + }, + { + "epoch": 0.25064801481176713, + "grad_norm": 2.532269176697092, + "learning_rate": 1.7553765704619877e-05, + "loss": 0.8376, + "step": 3046 + }, + { + "epoch": 0.25073030240691213, + "grad_norm": 2.1601877532443083, + "learning_rate": 1.755201888738222e-05, + "loss": 0.8079, + "step": 3047 + }, + { + "epoch": 0.2508125900020572, + "grad_norm": 0.4864105965006659, + "learning_rate": 1.7550271533661217e-05, + "loss": 0.5467, + "step": 3048 + }, + { + "epoch": 0.25089487759720225, + "grad_norm": 2.379700540347124, + "learning_rate": 1.7548523643581e-05, + "loss": 0.7873, + "step": 3049 + }, + { + "epoch": 0.25097716519234725, + "grad_norm": 4.0656425979348985, + "learning_rate": 1.7546775217265734e-05, + "loss": 0.7955, + "step": 3050 + }, + { + "epoch": 0.2510594527874923, + "grad_norm": 2.7719299480072084, + "learning_rate": 1.7545026254839627e-05, + "loss": 0.8208, + "step": 3051 + }, + { + "epoch": 0.2511417403826373, + "grad_norm": 2.112697347780536, + "learning_rate": 1.754327675642692e-05, + "loss": 0.8283, + "step": 3052 + }, + { + "epoch": 0.25122402797778237, + "grad_norm": 2.7457155480951747, + "learning_rate": 1.7541526722151897e-05, + "loss": 0.7984, + "step": 3053 + }, + { + "epoch": 0.2513063155729274, + "grad_norm": 2.964443751057906, + "learning_rate": 1.753977615213888e-05, + "loss": 0.8019, + "step": 3054 + }, + { + "epoch": 0.25138860316807243, + "grad_norm": 0.46730221582743114, + "learning_rate": 1.7538025046512218e-05, + "loss": 0.5251, + "step": 3055 + }, + { + "epoch": 0.25147089076321744, + "grad_norm": 0.4461078153487976, + "learning_rate": 1.7536273405396314e-05, + "loss": 0.5648, + "step": 3056 + }, + { + "epoch": 0.2515531783583625, + "grad_norm": 2.298544228777589, + "learning_rate": 1.75345212289156e-05, + "loss": 0.8048, + "step": 3057 + }, + { + "epoch": 0.2516354659535075, + "grad_norm": 2.8616057906402537, + "learning_rate": 1.753276851719455e-05, + "loss": 0.8246, + "step": 3058 + }, + { + "epoch": 0.25171775354865256, + "grad_norm": 2.659983701913832, + "learning_rate": 1.7531015270357667e-05, + "loss": 0.7937, + "step": 3059 + }, + { + "epoch": 0.25180004114379756, + "grad_norm": 3.5499141636761755, + "learning_rate": 1.7529261488529503e-05, + "loss": 0.8159, + "step": 3060 + }, + { + "epoch": 0.2518823287389426, + "grad_norm": 2.3718170006117827, + "learning_rate": 1.7527507171834647e-05, + "loss": 0.8379, + "step": 3061 + }, + { + "epoch": 0.2519646163340876, + "grad_norm": 6.495089047174273, + "learning_rate": 1.7525752320397717e-05, + "loss": 0.8113, + "step": 3062 + }, + { + "epoch": 0.2520469039292327, + "grad_norm": 0.4670703002111636, + "learning_rate": 1.752399693434338e-05, + "loss": 0.5719, + "step": 3063 + }, + { + "epoch": 0.2521291915243777, + "grad_norm": 3.035483833386936, + "learning_rate": 1.7522241013796336e-05, + "loss": 0.8339, + "step": 3064 + }, + { + "epoch": 0.25221147911952274, + "grad_norm": 3.1199229312583663, + "learning_rate": 1.7520484558881316e-05, + "loss": 0.8418, + "step": 3065 + }, + { + "epoch": 0.25229376671466774, + "grad_norm": 3.4612403146143547, + "learning_rate": 1.7518727569723104e-05, + "loss": 0.8355, + "step": 3066 + }, + { + "epoch": 0.2523760543098128, + "grad_norm": 3.3961735804307867, + "learning_rate": 1.7516970046446506e-05, + "loss": 0.7901, + "step": 3067 + }, + { + "epoch": 0.2524583419049578, + "grad_norm": 3.058194691007951, + "learning_rate": 1.751521198917638e-05, + "loss": 0.8025, + "step": 3068 + }, + { + "epoch": 0.25254062950010286, + "grad_norm": 2.906124566549867, + "learning_rate": 1.7513453398037613e-05, + "loss": 0.8066, + "step": 3069 + }, + { + "epoch": 0.25262291709524787, + "grad_norm": 2.4134898879968363, + "learning_rate": 1.7511694273155133e-05, + "loss": 0.7797, + "step": 3070 + }, + { + "epoch": 0.2527052046903929, + "grad_norm": 2.3780332334605876, + "learning_rate": 1.7509934614653903e-05, + "loss": 0.8551, + "step": 3071 + }, + { + "epoch": 0.25278749228553793, + "grad_norm": 0.4675698799777684, + "learning_rate": 1.750817442265893e-05, + "loss": 0.5587, + "step": 3072 + }, + { + "epoch": 0.252869779880683, + "grad_norm": 2.507602846575027, + "learning_rate": 1.7506413697295253e-05, + "loss": 0.8095, + "step": 3073 + }, + { + "epoch": 0.25295206747582805, + "grad_norm": 2.423926741802012, + "learning_rate": 1.7504652438687952e-05, + "loss": 0.7885, + "step": 3074 + }, + { + "epoch": 0.25303435507097305, + "grad_norm": 2.427193422320289, + "learning_rate": 1.7502890646962143e-05, + "loss": 0.81, + "step": 3075 + }, + { + "epoch": 0.2531166426661181, + "grad_norm": 3.0277514236068934, + "learning_rate": 1.7501128322242982e-05, + "loss": 0.818, + "step": 3076 + }, + { + "epoch": 0.2531989302612631, + "grad_norm": 2.9834204488960965, + "learning_rate": 1.7499365464655663e-05, + "loss": 0.8075, + "step": 3077 + }, + { + "epoch": 0.25328121785640817, + "grad_norm": 2.950887337851664, + "learning_rate": 1.7497602074325412e-05, + "loss": 0.8039, + "step": 3078 + }, + { + "epoch": 0.2533635054515532, + "grad_norm": 2.67647405206412, + "learning_rate": 1.74958381513775e-05, + "loss": 0.7867, + "step": 3079 + }, + { + "epoch": 0.25344579304669823, + "grad_norm": 2.394261796844176, + "learning_rate": 1.7494073695937233e-05, + "loss": 0.8258, + "step": 3080 + }, + { + "epoch": 0.25352808064184323, + "grad_norm": 3.6425048453575566, + "learning_rate": 1.749230870812996e-05, + "loss": 0.7776, + "step": 3081 + }, + { + "epoch": 0.2536103682369883, + "grad_norm": 2.6208238884816084, + "learning_rate": 1.7490543188081056e-05, + "loss": 0.7842, + "step": 3082 + }, + { + "epoch": 0.2536926558321333, + "grad_norm": 3.017990497572143, + "learning_rate": 1.748877713591594e-05, + "loss": 0.7864, + "step": 3083 + }, + { + "epoch": 0.25377494342727835, + "grad_norm": 3.0416081537456017, + "learning_rate": 1.748701055176008e-05, + "loss": 0.7868, + "step": 3084 + }, + { + "epoch": 0.25385723102242336, + "grad_norm": 4.6039910995048, + "learning_rate": 1.748524343573896e-05, + "loss": 0.8093, + "step": 3085 + }, + { + "epoch": 0.2539395186175684, + "grad_norm": 3.7897892956488795, + "learning_rate": 1.7483475787978116e-05, + "loss": 0.8165, + "step": 3086 + }, + { + "epoch": 0.2540218062127134, + "grad_norm": 2.4618042382111094, + "learning_rate": 1.748170760860312e-05, + "loss": 0.814, + "step": 3087 + }, + { + "epoch": 0.2541040938078585, + "grad_norm": 3.5491587012727734, + "learning_rate": 1.7479938897739584e-05, + "loss": 0.8284, + "step": 3088 + }, + { + "epoch": 0.2541863814030035, + "grad_norm": 3.2529004658417864, + "learning_rate": 1.747816965551315e-05, + "loss": 0.8226, + "step": 3089 + }, + { + "epoch": 0.25426866899814854, + "grad_norm": 2.945095187811677, + "learning_rate": 1.7476399882049504e-05, + "loss": 0.8261, + "step": 3090 + }, + { + "epoch": 0.25435095659329354, + "grad_norm": 0.4431246499664612, + "learning_rate": 1.7474629577474364e-05, + "loss": 0.5286, + "step": 3091 + }, + { + "epoch": 0.2544332441884386, + "grad_norm": 2.8315496862525857, + "learning_rate": 1.7472858741913494e-05, + "loss": 0.7767, + "step": 3092 + }, + { + "epoch": 0.2545155317835836, + "grad_norm": 2.7369266356349566, + "learning_rate": 1.747108737549269e-05, + "loss": 0.7892, + "step": 3093 + }, + { + "epoch": 0.25459781937872866, + "grad_norm": 2.7472613775880665, + "learning_rate": 1.746931547833779e-05, + "loss": 0.822, + "step": 3094 + }, + { + "epoch": 0.25468010697387367, + "grad_norm": 0.43128596916781586, + "learning_rate": 1.7467543050574663e-05, + "loss": 0.5538, + "step": 3095 + }, + { + "epoch": 0.2547623945690187, + "grad_norm": 0.4429071764601574, + "learning_rate": 1.7465770092329216e-05, + "loss": 0.5472, + "step": 3096 + }, + { + "epoch": 0.2548446821641637, + "grad_norm": 2.5446895062718333, + "learning_rate": 1.7463996603727405e-05, + "loss": 0.7802, + "step": 3097 + }, + { + "epoch": 0.2549269697593088, + "grad_norm": 3.206538126288419, + "learning_rate": 1.746222258489521e-05, + "loss": 0.8024, + "step": 3098 + }, + { + "epoch": 0.2550092573544538, + "grad_norm": 2.3407871094989745, + "learning_rate": 1.746044803595866e-05, + "loss": 0.8284, + "step": 3099 + }, + { + "epoch": 0.25509154494959885, + "grad_norm": 2.683572965991484, + "learning_rate": 1.7458672957043807e-05, + "loss": 0.8101, + "step": 3100 + }, + { + "epoch": 0.2551738325447439, + "grad_norm": 0.4420254669332104, + "learning_rate": 1.7456897348276764e-05, + "loss": 0.5457, + "step": 3101 + }, + { + "epoch": 0.2552561201398889, + "grad_norm": 2.363029115605435, + "learning_rate": 1.745512120978365e-05, + "loss": 0.8116, + "step": 3102 + }, + { + "epoch": 0.25533840773503397, + "grad_norm": 2.1223518351061506, + "learning_rate": 1.7453344541690653e-05, + "loss": 0.816, + "step": 3103 + }, + { + "epoch": 0.25542069533017897, + "grad_norm": 3.655723669767227, + "learning_rate": 1.7451567344123978e-05, + "loss": 0.8411, + "step": 3104 + }, + { + "epoch": 0.25550298292532403, + "grad_norm": 2.356116906927714, + "learning_rate": 1.7449789617209876e-05, + "loss": 0.7996, + "step": 3105 + }, + { + "epoch": 0.25558527052046903, + "grad_norm": 2.309729737380879, + "learning_rate": 1.7448011361074634e-05, + "loss": 0.8313, + "step": 3106 + }, + { + "epoch": 0.2556675581156141, + "grad_norm": 2.316446164810882, + "learning_rate": 1.7446232575844578e-05, + "loss": 0.8265, + "step": 3107 + }, + { + "epoch": 0.2557498457107591, + "grad_norm": 2.2446147804060566, + "learning_rate": 1.744445326164607e-05, + "loss": 0.8034, + "step": 3108 + }, + { + "epoch": 0.25583213330590415, + "grad_norm": 2.101950996193958, + "learning_rate": 1.74426734186055e-05, + "loss": 0.7628, + "step": 3109 + }, + { + "epoch": 0.25591442090104916, + "grad_norm": 2.161627019623046, + "learning_rate": 1.744089304684932e-05, + "loss": 0.7809, + "step": 3110 + }, + { + "epoch": 0.2559967084961942, + "grad_norm": 2.2963699180473482, + "learning_rate": 1.7439112146503994e-05, + "loss": 0.7928, + "step": 3111 + }, + { + "epoch": 0.2560789960913392, + "grad_norm": 2.633370868874133, + "learning_rate": 1.743733071769604e-05, + "loss": 0.7789, + "step": 3112 + }, + { + "epoch": 0.2561612836864843, + "grad_norm": 2.329791359543784, + "learning_rate": 1.7435548760552005e-05, + "loss": 0.8215, + "step": 3113 + }, + { + "epoch": 0.2562435712816293, + "grad_norm": 2.4293776147390234, + "learning_rate": 1.743376627519848e-05, + "loss": 0.7877, + "step": 3114 + }, + { + "epoch": 0.25632585887677434, + "grad_norm": 2.4472772592053995, + "learning_rate": 1.7431983261762087e-05, + "loss": 0.7643, + "step": 3115 + }, + { + "epoch": 0.25640814647191934, + "grad_norm": 2.4275086488371405, + "learning_rate": 1.743019972036949e-05, + "loss": 0.8256, + "step": 3116 + }, + { + "epoch": 0.2564904340670644, + "grad_norm": 2.6011920159469684, + "learning_rate": 1.742841565114738e-05, + "loss": 0.7854, + "step": 3117 + }, + { + "epoch": 0.2565727216622094, + "grad_norm": 2.606686750827679, + "learning_rate": 1.742663105422251e-05, + "loss": 0.8066, + "step": 3118 + }, + { + "epoch": 0.25665500925735446, + "grad_norm": 0.46905392054729245, + "learning_rate": 1.7424845929721645e-05, + "loss": 0.5637, + "step": 3119 + }, + { + "epoch": 0.25673729685249946, + "grad_norm": 0.43242755372410013, + "learning_rate": 1.74230602777716e-05, + "loss": 0.5467, + "step": 3120 + }, + { + "epoch": 0.2568195844476445, + "grad_norm": 2.0432006955888564, + "learning_rate": 1.7421274098499223e-05, + "loss": 0.7777, + "step": 3121 + }, + { + "epoch": 0.2569018720427895, + "grad_norm": 2.703151990446935, + "learning_rate": 1.74194873920314e-05, + "loss": 0.8186, + "step": 3122 + }, + { + "epoch": 0.2569841596379346, + "grad_norm": 2.3934733083927613, + "learning_rate": 1.741770015849506e-05, + "loss": 0.8089, + "step": 3123 + }, + { + "epoch": 0.2570664472330796, + "grad_norm": 2.012062497140323, + "learning_rate": 1.7415912398017167e-05, + "loss": 0.8043, + "step": 3124 + }, + { + "epoch": 0.25714873482822465, + "grad_norm": 2.0661448979537003, + "learning_rate": 1.7414124110724718e-05, + "loss": 0.83, + "step": 3125 + }, + { + "epoch": 0.2572310224233697, + "grad_norm": 2.501706805105995, + "learning_rate": 1.7412335296744744e-05, + "loss": 0.8424, + "step": 3126 + }, + { + "epoch": 0.2573133100185147, + "grad_norm": 2.3208443689743388, + "learning_rate": 1.741054595620433e-05, + "loss": 0.8136, + "step": 3127 + }, + { + "epoch": 0.25739559761365977, + "grad_norm": 4.837653791257567, + "learning_rate": 1.740875608923058e-05, + "loss": 0.8082, + "step": 3128 + }, + { + "epoch": 0.25747788520880477, + "grad_norm": 2.8197341867948444, + "learning_rate": 1.7406965695950644e-05, + "loss": 0.8033, + "step": 3129 + }, + { + "epoch": 0.2575601728039498, + "grad_norm": 1.9629309753444786, + "learning_rate": 1.740517477649171e-05, + "loss": 0.8161, + "step": 3130 + }, + { + "epoch": 0.25764246039909483, + "grad_norm": 2.270871953975815, + "learning_rate": 1.7403383330981008e-05, + "loss": 0.8429, + "step": 3131 + }, + { + "epoch": 0.2577247479942399, + "grad_norm": 0.4591756755142735, + "learning_rate": 1.740159135954579e-05, + "loss": 0.5406, + "step": 3132 + }, + { + "epoch": 0.2578070355893849, + "grad_norm": 2.251577055692386, + "learning_rate": 1.739979886231336e-05, + "loss": 0.7972, + "step": 3133 + }, + { + "epoch": 0.25788932318452995, + "grad_norm": 1.859966438912752, + "learning_rate": 1.7398005839411056e-05, + "loss": 0.8233, + "step": 3134 + }, + { + "epoch": 0.25797161077967495, + "grad_norm": 2.8639139708150196, + "learning_rate": 1.7396212290966247e-05, + "loss": 0.8191, + "step": 3135 + }, + { + "epoch": 0.25805389837482, + "grad_norm": 2.4944107163482157, + "learning_rate": 1.7394418217106342e-05, + "loss": 0.7591, + "step": 3136 + }, + { + "epoch": 0.258136185969965, + "grad_norm": 2.366722872021388, + "learning_rate": 1.7392623617958795e-05, + "loss": 0.8167, + "step": 3137 + }, + { + "epoch": 0.2582184735651101, + "grad_norm": 2.603370055066394, + "learning_rate": 1.739082849365109e-05, + "loss": 0.8104, + "step": 3138 + }, + { + "epoch": 0.2583007611602551, + "grad_norm": 2.754780707706453, + "learning_rate": 1.7389032844310746e-05, + "loss": 0.8262, + "step": 3139 + }, + { + "epoch": 0.25838304875540014, + "grad_norm": 7.339005960377376, + "learning_rate": 1.7387236670065325e-05, + "loss": 0.7942, + "step": 3140 + }, + { + "epoch": 0.25846533635054514, + "grad_norm": 2.5408794751755517, + "learning_rate": 1.7385439971042428e-05, + "loss": 0.8389, + "step": 3141 + }, + { + "epoch": 0.2585476239456902, + "grad_norm": 3.301905373284494, + "learning_rate": 1.7383642747369688e-05, + "loss": 0.8037, + "step": 3142 + }, + { + "epoch": 0.2586299115408352, + "grad_norm": 0.46772970353286447, + "learning_rate": 1.7381844999174773e-05, + "loss": 0.5683, + "step": 3143 + }, + { + "epoch": 0.25871219913598026, + "grad_norm": 3.331965492546159, + "learning_rate": 1.7380046726585396e-05, + "loss": 0.8083, + "step": 3144 + }, + { + "epoch": 0.25879448673112526, + "grad_norm": 2.1550660455142987, + "learning_rate": 1.73782479297293e-05, + "loss": 0.8134, + "step": 3145 + }, + { + "epoch": 0.2588767743262703, + "grad_norm": 2.9972605054435735, + "learning_rate": 1.7376448608734275e-05, + "loss": 0.7925, + "step": 3146 + }, + { + "epoch": 0.2589590619214153, + "grad_norm": 2.164026283547475, + "learning_rate": 1.7374648763728134e-05, + "loss": 0.8315, + "step": 3147 + }, + { + "epoch": 0.2590413495165604, + "grad_norm": 3.129881200517848, + "learning_rate": 1.737284839483874e-05, + "loss": 0.8187, + "step": 3148 + }, + { + "epoch": 0.2591236371117054, + "grad_norm": 2.249471320311643, + "learning_rate": 1.7371047502193988e-05, + "loss": 0.7994, + "step": 3149 + }, + { + "epoch": 0.25920592470685044, + "grad_norm": 0.43681294157060224, + "learning_rate": 1.7369246085921808e-05, + "loss": 0.5399, + "step": 3150 + }, + { + "epoch": 0.25928821230199545, + "grad_norm": 4.811660631207214, + "learning_rate": 1.736744414615017e-05, + "loss": 0.8236, + "step": 3151 + }, + { + "epoch": 0.2593704998971405, + "grad_norm": 2.5431996434505657, + "learning_rate": 1.7365641683007085e-05, + "loss": 0.803, + "step": 3152 + }, + { + "epoch": 0.25945278749228556, + "grad_norm": 0.4447693002784123, + "learning_rate": 1.7363838696620593e-05, + "loss": 0.5437, + "step": 3153 + }, + { + "epoch": 0.25953507508743057, + "grad_norm": 2.817772431461036, + "learning_rate": 1.7362035187118777e-05, + "loss": 0.8013, + "step": 3154 + }, + { + "epoch": 0.2596173626825756, + "grad_norm": 3.3812300131117334, + "learning_rate": 1.7360231154629756e-05, + "loss": 0.8215, + "step": 3155 + }, + { + "epoch": 0.25969965027772063, + "grad_norm": 2.6892117455209004, + "learning_rate": 1.7358426599281686e-05, + "loss": 0.8064, + "step": 3156 + }, + { + "epoch": 0.2597819378728657, + "grad_norm": 3.053879603907526, + "learning_rate": 1.7356621521202757e-05, + "loss": 0.7923, + "step": 3157 + }, + { + "epoch": 0.2598642254680107, + "grad_norm": 0.4570535125627652, + "learning_rate": 1.73548159205212e-05, + "loss": 0.5413, + "step": 3158 + }, + { + "epoch": 0.25994651306315575, + "grad_norm": 4.174512131282455, + "learning_rate": 1.7353009797365283e-05, + "loss": 0.7724, + "step": 3159 + }, + { + "epoch": 0.26002880065830075, + "grad_norm": 2.467743389444022, + "learning_rate": 1.735120315186331e-05, + "loss": 0.8162, + "step": 3160 + }, + { + "epoch": 0.2601110882534458, + "grad_norm": 2.4087878649109564, + "learning_rate": 1.734939598414362e-05, + "loss": 0.8459, + "step": 3161 + }, + { + "epoch": 0.2601933758485908, + "grad_norm": 2.882879818085009, + "learning_rate": 1.7347588294334595e-05, + "loss": 0.7865, + "step": 3162 + }, + { + "epoch": 0.26027566344373587, + "grad_norm": 3.042682180964853, + "learning_rate": 1.7345780082564646e-05, + "loss": 0.7795, + "step": 3163 + }, + { + "epoch": 0.2603579510388809, + "grad_norm": 2.684497062312509, + "learning_rate": 1.734397134896223e-05, + "loss": 0.8, + "step": 3164 + }, + { + "epoch": 0.26044023863402593, + "grad_norm": 2.168903824528707, + "learning_rate": 1.734216209365583e-05, + "loss": 0.8216, + "step": 3165 + }, + { + "epoch": 0.26052252622917094, + "grad_norm": 3.3438572773747843, + "learning_rate": 1.734035231677398e-05, + "loss": 0.8096, + "step": 3166 + }, + { + "epoch": 0.260604813824316, + "grad_norm": 2.790640753291628, + "learning_rate": 1.7338542018445242e-05, + "loss": 0.8268, + "step": 3167 + }, + { + "epoch": 0.260687101419461, + "grad_norm": 2.122562413274879, + "learning_rate": 1.7336731198798214e-05, + "loss": 0.8282, + "step": 3168 + }, + { + "epoch": 0.26076938901460606, + "grad_norm": 2.0906485747970835, + "learning_rate": 1.7334919857961533e-05, + "loss": 0.7994, + "step": 3169 + }, + { + "epoch": 0.26085167660975106, + "grad_norm": 2.5413232611159553, + "learning_rate": 1.733310799606388e-05, + "loss": 0.8384, + "step": 3170 + }, + { + "epoch": 0.2609339642048961, + "grad_norm": 2.113615118093388, + "learning_rate": 1.733129561323396e-05, + "loss": 0.807, + "step": 3171 + }, + { + "epoch": 0.2610162518000411, + "grad_norm": 3.8110046625338994, + "learning_rate": 1.732948270960052e-05, + "loss": 0.8028, + "step": 3172 + }, + { + "epoch": 0.2610985393951862, + "grad_norm": 2.274089846845691, + "learning_rate": 1.7327669285292357e-05, + "loss": 0.8159, + "step": 3173 + }, + { + "epoch": 0.2611808269903312, + "grad_norm": 2.155339007298501, + "learning_rate": 1.7325855340438286e-05, + "loss": 0.8151, + "step": 3174 + }, + { + "epoch": 0.26126311458547624, + "grad_norm": 2.3913425727713538, + "learning_rate": 1.7324040875167165e-05, + "loss": 0.8157, + "step": 3175 + }, + { + "epoch": 0.26134540218062124, + "grad_norm": 2.3386302794190748, + "learning_rate": 1.7322225889607893e-05, + "loss": 0.846, + "step": 3176 + }, + { + "epoch": 0.2614276897757663, + "grad_norm": 2.678919405039354, + "learning_rate": 1.7320410383889404e-05, + "loss": 0.8309, + "step": 3177 + }, + { + "epoch": 0.26150997737091136, + "grad_norm": 2.6864776396729035, + "learning_rate": 1.7318594358140672e-05, + "loss": 0.8027, + "step": 3178 + }, + { + "epoch": 0.26159226496605636, + "grad_norm": 2.190560347922986, + "learning_rate": 1.73167778124907e-05, + "loss": 0.7834, + "step": 3179 + }, + { + "epoch": 0.2616745525612014, + "grad_norm": 2.0674677033535866, + "learning_rate": 1.731496074706853e-05, + "loss": 0.8237, + "step": 3180 + }, + { + "epoch": 0.2617568401563464, + "grad_norm": 2.2650779122141484, + "learning_rate": 1.731314316200325e-05, + "loss": 0.8189, + "step": 3181 + }, + { + "epoch": 0.2618391277514915, + "grad_norm": 0.4501782139277584, + "learning_rate": 1.7311325057423975e-05, + "loss": 0.5598, + "step": 3182 + }, + { + "epoch": 0.2619214153466365, + "grad_norm": 2.537932066869163, + "learning_rate": 1.730950643345986e-05, + "loss": 0.8132, + "step": 3183 + }, + { + "epoch": 0.26200370294178155, + "grad_norm": 1.8526185042324084, + "learning_rate": 1.73076872902401e-05, + "loss": 0.8154, + "step": 3184 + }, + { + "epoch": 0.26208599053692655, + "grad_norm": 1.902302784050244, + "learning_rate": 1.730586762789392e-05, + "loss": 0.7661, + "step": 3185 + }, + { + "epoch": 0.2621682781320716, + "grad_norm": 2.053819447779948, + "learning_rate": 1.7304047446550587e-05, + "loss": 0.843, + "step": 3186 + }, + { + "epoch": 0.2622505657272166, + "grad_norm": 1.8359116217629687, + "learning_rate": 1.7302226746339405e-05, + "loss": 0.8086, + "step": 3187 + }, + { + "epoch": 0.26233285332236167, + "grad_norm": 1.6847594337426774, + "learning_rate": 1.7300405527389715e-05, + "loss": 0.8187, + "step": 3188 + }, + { + "epoch": 0.2624151409175067, + "grad_norm": 0.41702344315935097, + "learning_rate": 1.729858378983089e-05, + "loss": 0.5229, + "step": 3189 + }, + { + "epoch": 0.26249742851265173, + "grad_norm": 1.9228251205119076, + "learning_rate": 1.7296761533792344e-05, + "loss": 0.7767, + "step": 3190 + }, + { + "epoch": 0.26257971610779673, + "grad_norm": 0.4265786138023259, + "learning_rate": 1.729493875940353e-05, + "loss": 0.5629, + "step": 3191 + }, + { + "epoch": 0.2626620037029418, + "grad_norm": 2.1452185533518406, + "learning_rate": 1.729311546679393e-05, + "loss": 0.8223, + "step": 3192 + }, + { + "epoch": 0.2627442912980868, + "grad_norm": 1.7012131918386595, + "learning_rate": 1.7291291656093076e-05, + "loss": 0.8546, + "step": 3193 + }, + { + "epoch": 0.26282657889323185, + "grad_norm": 2.648389059941007, + "learning_rate": 1.728946732743052e-05, + "loss": 0.8148, + "step": 3194 + }, + { + "epoch": 0.26290886648837686, + "grad_norm": 9.042676560045523, + "learning_rate": 1.7287642480935863e-05, + "loss": 0.8144, + "step": 3195 + }, + { + "epoch": 0.2629911540835219, + "grad_norm": 2.7412001255716434, + "learning_rate": 1.7285817116738738e-05, + "loss": 0.7991, + "step": 3196 + }, + { + "epoch": 0.2630734416786669, + "grad_norm": 2.3624966281421322, + "learning_rate": 1.728399123496882e-05, + "loss": 0.7915, + "step": 3197 + }, + { + "epoch": 0.263155729273812, + "grad_norm": 2.1492800104720264, + "learning_rate": 1.728216483575581e-05, + "loss": 0.781, + "step": 3198 + }, + { + "epoch": 0.263238016868957, + "grad_norm": 0.4780175499628937, + "learning_rate": 1.728033791922946e-05, + "loss": 0.5888, + "step": 3199 + }, + { + "epoch": 0.26332030446410204, + "grad_norm": 2.5847737094687724, + "learning_rate": 1.7278510485519548e-05, + "loss": 0.8161, + "step": 3200 + }, + { + "epoch": 0.26340259205924704, + "grad_norm": 6.83724462118161, + "learning_rate": 1.727668253475589e-05, + "loss": 0.7952, + "step": 3201 + }, + { + "epoch": 0.2634848796543921, + "grad_norm": 3.3337817483900976, + "learning_rate": 1.7274854067068337e-05, + "loss": 0.8391, + "step": 3202 + }, + { + "epoch": 0.26356716724953716, + "grad_norm": 3.6413631005329785, + "learning_rate": 1.727302508258679e-05, + "loss": 0.8345, + "step": 3203 + }, + { + "epoch": 0.26364945484468216, + "grad_norm": 0.4708623517734391, + "learning_rate": 1.7271195581441174e-05, + "loss": 0.5692, + "step": 3204 + }, + { + "epoch": 0.2637317424398272, + "grad_norm": 0.4665677729352944, + "learning_rate": 1.7269365563761452e-05, + "loss": 0.5749, + "step": 3205 + }, + { + "epoch": 0.2638140300349722, + "grad_norm": 3.2795161486720605, + "learning_rate": 1.726753502967762e-05, + "loss": 0.8285, + "step": 3206 + }, + { + "epoch": 0.2638963176301173, + "grad_norm": 20.250846110052386, + "learning_rate": 1.726570397931973e-05, + "loss": 0.8299, + "step": 3207 + }, + { + "epoch": 0.2639786052252623, + "grad_norm": 3.9499671072623643, + "learning_rate": 1.7263872412817847e-05, + "loss": 0.8233, + "step": 3208 + }, + { + "epoch": 0.26406089282040734, + "grad_norm": 2.2860994668827193, + "learning_rate": 1.7262040330302085e-05, + "loss": 0.8133, + "step": 3209 + }, + { + "epoch": 0.26414318041555235, + "grad_norm": 3.473394250186446, + "learning_rate": 1.7260207731902586e-05, + "loss": 0.8001, + "step": 3210 + }, + { + "epoch": 0.2642254680106974, + "grad_norm": 3.1593135449863845, + "learning_rate": 1.7258374617749547e-05, + "loss": 0.804, + "step": 3211 + }, + { + "epoch": 0.2643077556058424, + "grad_norm": 2.044762678096964, + "learning_rate": 1.725654098797318e-05, + "loss": 0.8373, + "step": 3212 + }, + { + "epoch": 0.26439004320098747, + "grad_norm": 2.2672027919069437, + "learning_rate": 1.725470684270375e-05, + "loss": 0.8107, + "step": 3213 + }, + { + "epoch": 0.26447233079613247, + "grad_norm": 2.746222310105515, + "learning_rate": 1.7252872182071543e-05, + "loss": 0.7898, + "step": 3214 + }, + { + "epoch": 0.26455461839127753, + "grad_norm": 2.34381887200085, + "learning_rate": 1.72510370062069e-05, + "loss": 0.8069, + "step": 3215 + }, + { + "epoch": 0.26463690598642253, + "grad_norm": 2.1027650450293893, + "learning_rate": 1.724920131524018e-05, + "loss": 0.8081, + "step": 3216 + }, + { + "epoch": 0.2647191935815676, + "grad_norm": 2.681313461674057, + "learning_rate": 1.7247365109301797e-05, + "loss": 0.8357, + "step": 3217 + }, + { + "epoch": 0.2648014811767126, + "grad_norm": 0.4873380246831437, + "learning_rate": 1.7245528388522184e-05, + "loss": 0.5659, + "step": 3218 + }, + { + "epoch": 0.26488376877185765, + "grad_norm": 2.1376457729444898, + "learning_rate": 1.7243691153031824e-05, + "loss": 0.785, + "step": 3219 + }, + { + "epoch": 0.26496605636700266, + "grad_norm": 2.4174703642569852, + "learning_rate": 1.7241853402961227e-05, + "loss": 0.8179, + "step": 3220 + }, + { + "epoch": 0.2650483439621477, + "grad_norm": 1.7698760368671045, + "learning_rate": 1.7240015138440947e-05, + "loss": 0.822, + "step": 3221 + }, + { + "epoch": 0.2651306315572927, + "grad_norm": 0.45256651843325324, + "learning_rate": 1.723817635960157e-05, + "loss": 0.5495, + "step": 3222 + }, + { + "epoch": 0.2652129191524378, + "grad_norm": 1.8859690396403943, + "learning_rate": 1.7236337066573717e-05, + "loss": 0.8127, + "step": 3223 + }, + { + "epoch": 0.2652952067475828, + "grad_norm": 1.778989937861356, + "learning_rate": 1.7234497259488056e-05, + "loss": 0.8183, + "step": 3224 + }, + { + "epoch": 0.26537749434272784, + "grad_norm": 2.164404304323049, + "learning_rate": 1.7232656938475278e-05, + "loss": 0.8276, + "step": 3225 + }, + { + "epoch": 0.26545978193787284, + "grad_norm": 0.4274796517906534, + "learning_rate": 1.7230816103666118e-05, + "loss": 0.5293, + "step": 3226 + }, + { + "epoch": 0.2655420695330179, + "grad_norm": 2.515911549884607, + "learning_rate": 1.7228974755191346e-05, + "loss": 0.8335, + "step": 3227 + }, + { + "epoch": 0.2656243571281629, + "grad_norm": 2.665495710145411, + "learning_rate": 1.722713289318177e-05, + "loss": 0.8183, + "step": 3228 + }, + { + "epoch": 0.26570664472330796, + "grad_norm": 2.1032879064622914, + "learning_rate": 1.7225290517768227e-05, + "loss": 0.7815, + "step": 3229 + }, + { + "epoch": 0.265788932318453, + "grad_norm": 0.45918301181841464, + "learning_rate": 1.7223447629081606e-05, + "loss": 0.5528, + "step": 3230 + }, + { + "epoch": 0.265871219913598, + "grad_norm": 2.0918680503795812, + "learning_rate": 1.7221604227252813e-05, + "loss": 0.8358, + "step": 3231 + }, + { + "epoch": 0.2659535075087431, + "grad_norm": 2.024008414579574, + "learning_rate": 1.721976031241281e-05, + "loss": 0.8327, + "step": 3232 + }, + { + "epoch": 0.2660357951038881, + "grad_norm": 1.8557295954758255, + "learning_rate": 1.7217915884692575e-05, + "loss": 0.8278, + "step": 3233 + }, + { + "epoch": 0.26611808269903314, + "grad_norm": 2.1019216116121737, + "learning_rate": 1.721607094422314e-05, + "loss": 0.785, + "step": 3234 + }, + { + "epoch": 0.26620037029417815, + "grad_norm": 1.8352365534118282, + "learning_rate": 1.721422549113557e-05, + "loss": 0.8203, + "step": 3235 + }, + { + "epoch": 0.2662826578893232, + "grad_norm": 2.1039521614565118, + "learning_rate": 1.7212379525560956e-05, + "loss": 0.8513, + "step": 3236 + }, + { + "epoch": 0.2663649454844682, + "grad_norm": 2.520211759922124, + "learning_rate": 1.7210533047630436e-05, + "loss": 0.7667, + "step": 3237 + }, + { + "epoch": 0.26644723307961327, + "grad_norm": 2.0121725529690613, + "learning_rate": 1.720868605747518e-05, + "loss": 0.8022, + "step": 3238 + }, + { + "epoch": 0.26652952067475827, + "grad_norm": 3.0568058022460805, + "learning_rate": 1.7206838555226394e-05, + "loss": 0.7937, + "step": 3239 + }, + { + "epoch": 0.2666118082699033, + "grad_norm": 2.7439724996226382, + "learning_rate": 1.720499054101532e-05, + "loss": 0.8571, + "step": 3240 + }, + { + "epoch": 0.26669409586504833, + "grad_norm": 2.3999917272443185, + "learning_rate": 1.7203142014973245e-05, + "loss": 0.8092, + "step": 3241 + }, + { + "epoch": 0.2667763834601934, + "grad_norm": 2.548770342859101, + "learning_rate": 1.7201292977231475e-05, + "loss": 0.8116, + "step": 3242 + }, + { + "epoch": 0.2668586710553384, + "grad_norm": 2.5150954313517304, + "learning_rate": 1.7199443427921375e-05, + "loss": 0.8253, + "step": 3243 + }, + { + "epoch": 0.26694095865048345, + "grad_norm": 0.45147598726438465, + "learning_rate": 1.7197593367174326e-05, + "loss": 0.5544, + "step": 3244 + }, + { + "epoch": 0.26702324624562845, + "grad_norm": 3.1494051772033442, + "learning_rate": 1.7195742795121754e-05, + "loss": 0.83, + "step": 3245 + }, + { + "epoch": 0.2671055338407735, + "grad_norm": 2.4260353862913413, + "learning_rate": 1.7193891711895122e-05, + "loss": 0.8145, + "step": 3246 + }, + { + "epoch": 0.2671878214359185, + "grad_norm": 2.366121050333026, + "learning_rate": 1.7192040117625927e-05, + "loss": 0.8437, + "step": 3247 + }, + { + "epoch": 0.2672701090310636, + "grad_norm": 0.4277435140922396, + "learning_rate": 1.7190188012445707e-05, + "loss": 0.533, + "step": 3248 + }, + { + "epoch": 0.2673523966262086, + "grad_norm": 2.5357390411312393, + "learning_rate": 1.7188335396486024e-05, + "loss": 0.8084, + "step": 3249 + }, + { + "epoch": 0.26743468422135364, + "grad_norm": 2.8034007544837753, + "learning_rate": 1.7186482269878496e-05, + "loss": 0.8069, + "step": 3250 + }, + { + "epoch": 0.26751697181649864, + "grad_norm": 2.7529826625470846, + "learning_rate": 1.718462863275476e-05, + "loss": 0.799, + "step": 3251 + }, + { + "epoch": 0.2675992594116437, + "grad_norm": 2.181450039061867, + "learning_rate": 1.7182774485246493e-05, + "loss": 0.8287, + "step": 3252 + }, + { + "epoch": 0.2676815470067887, + "grad_norm": 0.5190998331313099, + "learning_rate": 1.7180919827485414e-05, + "loss": 0.5556, + "step": 3253 + }, + { + "epoch": 0.26776383460193376, + "grad_norm": 2.6524747277559366, + "learning_rate": 1.7179064659603277e-05, + "loss": 0.8174, + "step": 3254 + }, + { + "epoch": 0.2678461221970788, + "grad_norm": 2.857466915807229, + "learning_rate": 1.7177208981731864e-05, + "loss": 0.7897, + "step": 3255 + }, + { + "epoch": 0.2679284097922238, + "grad_norm": 2.1330095635523376, + "learning_rate": 1.717535279400301e-05, + "loss": 0.7827, + "step": 3256 + }, + { + "epoch": 0.2680106973873689, + "grad_norm": 1.6286409905128667, + "learning_rate": 1.7173496096548562e-05, + "loss": 0.8053, + "step": 3257 + }, + { + "epoch": 0.2680929849825139, + "grad_norm": 1.9075208132843064, + "learning_rate": 1.717163888950043e-05, + "loss": 0.8175, + "step": 3258 + }, + { + "epoch": 0.26817527257765894, + "grad_norm": 2.1464092784064395, + "learning_rate": 1.7169781172990532e-05, + "loss": 0.805, + "step": 3259 + }, + { + "epoch": 0.26825756017280394, + "grad_norm": 2.354882095513986, + "learning_rate": 1.716792294715085e-05, + "loss": 0.8168, + "step": 3260 + }, + { + "epoch": 0.268339847767949, + "grad_norm": 2.4235854013908784, + "learning_rate": 1.716606421211339e-05, + "loss": 0.8108, + "step": 3261 + }, + { + "epoch": 0.268422135363094, + "grad_norm": 0.441932485843513, + "learning_rate": 1.7164204968010186e-05, + "loss": 0.5369, + "step": 3262 + }, + { + "epoch": 0.26850442295823906, + "grad_norm": 3.8674534779204675, + "learning_rate": 1.7162345214973316e-05, + "loss": 0.8112, + "step": 3263 + }, + { + "epoch": 0.26858671055338407, + "grad_norm": 2.0192449661176077, + "learning_rate": 1.71604849531349e-05, + "loss": 0.8362, + "step": 3264 + }, + { + "epoch": 0.2686689981485291, + "grad_norm": 2.1024227141859564, + "learning_rate": 1.715862418262708e-05, + "loss": 0.8234, + "step": 3265 + }, + { + "epoch": 0.26875128574367413, + "grad_norm": 1.7913169528107178, + "learning_rate": 1.715676290358205e-05, + "loss": 0.8087, + "step": 3266 + }, + { + "epoch": 0.2688335733388192, + "grad_norm": 2.4916457168510275, + "learning_rate": 1.715490111613203e-05, + "loss": 0.8152, + "step": 3267 + }, + { + "epoch": 0.2689158609339642, + "grad_norm": 0.44607243016910886, + "learning_rate": 1.7153038820409272e-05, + "loss": 0.5177, + "step": 3268 + }, + { + "epoch": 0.26899814852910925, + "grad_norm": 1.9388380159999783, + "learning_rate": 1.7151176016546078e-05, + "loss": 0.8064, + "step": 3269 + }, + { + "epoch": 0.26908043612425425, + "grad_norm": 1.8908928592795229, + "learning_rate": 1.7149312704674778e-05, + "loss": 0.8106, + "step": 3270 + }, + { + "epoch": 0.2691627237193993, + "grad_norm": 2.153614448672576, + "learning_rate": 1.7147448884927737e-05, + "loss": 0.8105, + "step": 3271 + }, + { + "epoch": 0.2692450113145443, + "grad_norm": 1.6705307534234544, + "learning_rate": 1.7145584557437357e-05, + "loss": 0.7938, + "step": 3272 + }, + { + "epoch": 0.26932729890968937, + "grad_norm": 1.8896210907500344, + "learning_rate": 1.714371972233608e-05, + "loss": 0.8239, + "step": 3273 + }, + { + "epoch": 0.2694095865048344, + "grad_norm": 2.8554806232336962, + "learning_rate": 1.7141854379756373e-05, + "loss": 0.7964, + "step": 3274 + }, + { + "epoch": 0.26949187409997943, + "grad_norm": 6.209599869394988, + "learning_rate": 1.713998852983076e-05, + "loss": 0.802, + "step": 3275 + }, + { + "epoch": 0.26957416169512444, + "grad_norm": 2.07612603899728, + "learning_rate": 1.7138122172691774e-05, + "loss": 0.8126, + "step": 3276 + }, + { + "epoch": 0.2696564492902695, + "grad_norm": 0.44897503609566247, + "learning_rate": 1.713625530847201e-05, + "loss": 0.5385, + "step": 3277 + }, + { + "epoch": 0.2697387368854145, + "grad_norm": 1.9728522028250042, + "learning_rate": 1.7134387937304075e-05, + "loss": 0.8193, + "step": 3278 + }, + { + "epoch": 0.26982102448055956, + "grad_norm": 1.978362275479669, + "learning_rate": 1.7132520059320635e-05, + "loss": 0.8097, + "step": 3279 + }, + { + "epoch": 0.26990331207570456, + "grad_norm": 2.757889220842567, + "learning_rate": 1.7130651674654374e-05, + "loss": 0.8308, + "step": 3280 + }, + { + "epoch": 0.2699855996708496, + "grad_norm": 2.398643541912809, + "learning_rate": 1.7128782783438027e-05, + "loss": 0.7921, + "step": 3281 + }, + { + "epoch": 0.2700678872659947, + "grad_norm": 1.7756416210361876, + "learning_rate": 1.712691338580435e-05, + "loss": 0.8216, + "step": 3282 + }, + { + "epoch": 0.2701501748611397, + "grad_norm": 1.8358732316988162, + "learning_rate": 1.712504348188614e-05, + "loss": 0.7986, + "step": 3283 + }, + { + "epoch": 0.27023246245628474, + "grad_norm": 1.9998406902584105, + "learning_rate": 1.712317307181624e-05, + "loss": 0.8137, + "step": 3284 + }, + { + "epoch": 0.27031475005142974, + "grad_norm": 2.323256865689543, + "learning_rate": 1.7121302155727516e-05, + "loss": 0.803, + "step": 3285 + }, + { + "epoch": 0.2703970376465748, + "grad_norm": 1.8768893607837283, + "learning_rate": 1.7119430733752875e-05, + "loss": 0.8083, + "step": 3286 + }, + { + "epoch": 0.2704793252417198, + "grad_norm": 2.6528067614762163, + "learning_rate": 1.7117558806025262e-05, + "loss": 0.7954, + "step": 3287 + }, + { + "epoch": 0.27056161283686486, + "grad_norm": 1.973609765171818, + "learning_rate": 1.7115686372677652e-05, + "loss": 0.7853, + "step": 3288 + }, + { + "epoch": 0.27064390043200987, + "grad_norm": 2.3217110177707623, + "learning_rate": 1.7113813433843063e-05, + "loss": 0.8039, + "step": 3289 + }, + { + "epoch": 0.2707261880271549, + "grad_norm": 2.838651310164189, + "learning_rate": 1.7111939989654544e-05, + "loss": 0.8316, + "step": 3290 + }, + { + "epoch": 0.2708084756222999, + "grad_norm": 2.382217470261549, + "learning_rate": 1.7110066040245183e-05, + "loss": 0.8242, + "step": 3291 + }, + { + "epoch": 0.270890763217445, + "grad_norm": 2.887414017571467, + "learning_rate": 1.7108191585748103e-05, + "loss": 0.8337, + "step": 3292 + }, + { + "epoch": 0.27097305081259, + "grad_norm": 0.47604219888877264, + "learning_rate": 1.710631662629646e-05, + "loss": 0.5184, + "step": 3293 + }, + { + "epoch": 0.27105533840773505, + "grad_norm": 2.0852978156132593, + "learning_rate": 1.7104441162023444e-05, + "loss": 0.8153, + "step": 3294 + }, + { + "epoch": 0.27113762600288005, + "grad_norm": 2.1675255629575583, + "learning_rate": 1.7102565193062294e-05, + "loss": 0.8, + "step": 3295 + }, + { + "epoch": 0.2712199135980251, + "grad_norm": 4.09311237887044, + "learning_rate": 1.710068871954627e-05, + "loss": 0.7962, + "step": 3296 + }, + { + "epoch": 0.2713022011931701, + "grad_norm": 0.4518855461550148, + "learning_rate": 1.7098811741608675e-05, + "loss": 0.5521, + "step": 3297 + }, + { + "epoch": 0.27138448878831517, + "grad_norm": 1.9238316344585502, + "learning_rate": 1.709693425938285e-05, + "loss": 0.8074, + "step": 3298 + }, + { + "epoch": 0.2714667763834602, + "grad_norm": 2.1643329311004273, + "learning_rate": 1.709505627300216e-05, + "loss": 0.8162, + "step": 3299 + }, + { + "epoch": 0.27154906397860523, + "grad_norm": 2.177575648049798, + "learning_rate": 1.7093177782600023e-05, + "loss": 0.8079, + "step": 3300 + }, + { + "epoch": 0.27163135157375023, + "grad_norm": 2.6025431418214824, + "learning_rate": 1.709129878830988e-05, + "loss": 0.8499, + "step": 3301 + }, + { + "epoch": 0.2717136391688953, + "grad_norm": 3.4271863171360755, + "learning_rate": 1.708941929026521e-05, + "loss": 0.8361, + "step": 3302 + }, + { + "epoch": 0.2717959267640403, + "grad_norm": 2.7718425253153596, + "learning_rate": 1.7087539288599533e-05, + "loss": 0.8321, + "step": 3303 + }, + { + "epoch": 0.27187821435918536, + "grad_norm": 2.03764504045628, + "learning_rate": 1.70856587834464e-05, + "loss": 0.8324, + "step": 3304 + }, + { + "epoch": 0.27196050195433036, + "grad_norm": 0.4650979904841794, + "learning_rate": 1.7083777774939396e-05, + "loss": 0.5372, + "step": 3305 + }, + { + "epoch": 0.2720427895494754, + "grad_norm": 2.0512935842462423, + "learning_rate": 1.708189626321215e-05, + "loss": 0.8092, + "step": 3306 + }, + { + "epoch": 0.2721250771446205, + "grad_norm": 2.0198795891242054, + "learning_rate": 1.708001424839832e-05, + "loss": 0.8012, + "step": 3307 + }, + { + "epoch": 0.2722073647397655, + "grad_norm": 2.5750179825763673, + "learning_rate": 1.70781317306316e-05, + "loss": 0.7717, + "step": 3308 + }, + { + "epoch": 0.27228965233491054, + "grad_norm": 1.6929684242043748, + "learning_rate": 1.7076248710045723e-05, + "loss": 0.8148, + "step": 3309 + }, + { + "epoch": 0.27237193993005554, + "grad_norm": 2.553340454097266, + "learning_rate": 1.7074365186774452e-05, + "loss": 0.8086, + "step": 3310 + }, + { + "epoch": 0.2724542275252006, + "grad_norm": 0.4513239731225968, + "learning_rate": 1.7072481160951592e-05, + "loss": 0.5447, + "step": 3311 + }, + { + "epoch": 0.2725365151203456, + "grad_norm": 2.147515676061674, + "learning_rate": 1.707059663271098e-05, + "loss": 0.7959, + "step": 3312 + }, + { + "epoch": 0.27261880271549066, + "grad_norm": 1.781932232000643, + "learning_rate": 1.7068711602186495e-05, + "loss": 0.7834, + "step": 3313 + }, + { + "epoch": 0.27270109031063566, + "grad_norm": 2.2065108390079247, + "learning_rate": 1.706682606951204e-05, + "loss": 0.8294, + "step": 3314 + }, + { + "epoch": 0.2727833779057807, + "grad_norm": 0.44205612058721017, + "learning_rate": 1.706494003482156e-05, + "loss": 0.5366, + "step": 3315 + }, + { + "epoch": 0.2728656655009257, + "grad_norm": 2.305688184989724, + "learning_rate": 1.7063053498249043e-05, + "loss": 0.8396, + "step": 3316 + }, + { + "epoch": 0.2729479530960708, + "grad_norm": 2.3601766963878523, + "learning_rate": 1.70611664599285e-05, + "loss": 0.8406, + "step": 3317 + }, + { + "epoch": 0.2730302406912158, + "grad_norm": 1.8335893965816523, + "learning_rate": 1.7059278919993984e-05, + "loss": 0.8155, + "step": 3318 + }, + { + "epoch": 0.27311252828636084, + "grad_norm": 0.41818738369309927, + "learning_rate": 1.705739087857958e-05, + "loss": 0.5361, + "step": 3319 + }, + { + "epoch": 0.27319481588150585, + "grad_norm": 1.9509967112825835, + "learning_rate": 1.7055502335819424e-05, + "loss": 0.7959, + "step": 3320 + }, + { + "epoch": 0.2732771034766509, + "grad_norm": 1.8398501099325344, + "learning_rate": 1.7053613291847656e-05, + "loss": 0.7916, + "step": 3321 + }, + { + "epoch": 0.2733593910717959, + "grad_norm": 1.6060444808115466, + "learning_rate": 1.7051723746798485e-05, + "loss": 0.7943, + "step": 3322 + }, + { + "epoch": 0.27344167866694097, + "grad_norm": 1.8905149545844033, + "learning_rate": 1.7049833700806137e-05, + "loss": 0.8041, + "step": 3323 + }, + { + "epoch": 0.27352396626208597, + "grad_norm": 1.8663100124717953, + "learning_rate": 1.7047943154004875e-05, + "loss": 0.8031, + "step": 3324 + }, + { + "epoch": 0.27360625385723103, + "grad_norm": 2.0161913834988017, + "learning_rate": 1.7046052106529004e-05, + "loss": 0.8337, + "step": 3325 + }, + { + "epoch": 0.27368854145237603, + "grad_norm": 2.0274683750441658, + "learning_rate": 1.704416055851286e-05, + "loss": 0.7846, + "step": 3326 + }, + { + "epoch": 0.2737708290475211, + "grad_norm": 0.44113211885882087, + "learning_rate": 1.7042268510090814e-05, + "loss": 0.5408, + "step": 3327 + }, + { + "epoch": 0.2738531166426661, + "grad_norm": 2.2224775662705114, + "learning_rate": 1.7040375961397278e-05, + "loss": 0.7821, + "step": 3328 + }, + { + "epoch": 0.27393540423781115, + "grad_norm": 1.9250995186429698, + "learning_rate": 1.703848291256669e-05, + "loss": 0.8445, + "step": 3329 + }, + { + "epoch": 0.27401769183295616, + "grad_norm": 1.864181688494493, + "learning_rate": 1.7036589363733534e-05, + "loss": 0.797, + "step": 3330 + }, + { + "epoch": 0.2740999794281012, + "grad_norm": 1.7449914040884869, + "learning_rate": 1.7034695315032323e-05, + "loss": 0.8396, + "step": 3331 + }, + { + "epoch": 0.2741822670232463, + "grad_norm": 2.0412298979855183, + "learning_rate": 1.7032800766597608e-05, + "loss": 0.8351, + "step": 3332 + }, + { + "epoch": 0.2742645546183913, + "grad_norm": 2.1068397051206755, + "learning_rate": 1.7030905718563972e-05, + "loss": 0.7893, + "step": 3333 + }, + { + "epoch": 0.27434684221353633, + "grad_norm": 1.8306211345628756, + "learning_rate": 1.7029010171066042e-05, + "loss": 0.8022, + "step": 3334 + }, + { + "epoch": 0.27442912980868134, + "grad_norm": 1.5634997113375737, + "learning_rate": 1.7027114124238466e-05, + "loss": 0.8086, + "step": 3335 + }, + { + "epoch": 0.2745114174038264, + "grad_norm": 1.7569546506128608, + "learning_rate": 1.7025217578215943e-05, + "loss": 0.8014, + "step": 3336 + }, + { + "epoch": 0.2745937049989714, + "grad_norm": 2.0232013413047096, + "learning_rate": 1.7023320533133198e-05, + "loss": 0.8504, + "step": 3337 + }, + { + "epoch": 0.27467599259411646, + "grad_norm": 1.696664859128998, + "learning_rate": 1.702142298912499e-05, + "loss": 0.8252, + "step": 3338 + }, + { + "epoch": 0.27475828018926146, + "grad_norm": 1.9048938072450126, + "learning_rate": 1.7019524946326128e-05, + "loss": 0.8314, + "step": 3339 + }, + { + "epoch": 0.2748405677844065, + "grad_norm": 1.6242850934087922, + "learning_rate": 1.7017626404871438e-05, + "loss": 0.7981, + "step": 3340 + }, + { + "epoch": 0.2749228553795515, + "grad_norm": 1.5322679268428667, + "learning_rate": 1.7015727364895794e-05, + "loss": 0.8063, + "step": 3341 + }, + { + "epoch": 0.2750051429746966, + "grad_norm": 1.8137498841382922, + "learning_rate": 1.7013827826534096e-05, + "loss": 0.7779, + "step": 3342 + }, + { + "epoch": 0.2750874305698416, + "grad_norm": 1.7887008817254988, + "learning_rate": 1.7011927789921283e-05, + "loss": 0.8315, + "step": 3343 + }, + { + "epoch": 0.27516971816498664, + "grad_norm": 1.940416298679269, + "learning_rate": 1.7010027255192337e-05, + "loss": 0.8019, + "step": 3344 + }, + { + "epoch": 0.27525200576013165, + "grad_norm": 1.5770078546653166, + "learning_rate": 1.7008126222482265e-05, + "loss": 0.7964, + "step": 3345 + }, + { + "epoch": 0.2753342933552767, + "grad_norm": 2.067596160022381, + "learning_rate": 1.7006224691926113e-05, + "loss": 0.7864, + "step": 3346 + }, + { + "epoch": 0.2754165809504217, + "grad_norm": 2.756085711775809, + "learning_rate": 1.7004322663658967e-05, + "loss": 0.8236, + "step": 3347 + }, + { + "epoch": 0.27549886854556677, + "grad_norm": 1.9318322636331693, + "learning_rate": 1.7002420137815936e-05, + "loss": 0.8145, + "step": 3348 + }, + { + "epoch": 0.27558115614071177, + "grad_norm": 1.7523196389978979, + "learning_rate": 1.700051711453218e-05, + "loss": 0.7977, + "step": 3349 + }, + { + "epoch": 0.27566344373585683, + "grad_norm": 1.7864940799926223, + "learning_rate": 1.6998613593942886e-05, + "loss": 0.8005, + "step": 3350 + }, + { + "epoch": 0.27574573133100183, + "grad_norm": 1.8804326018457047, + "learning_rate": 1.699670957618327e-05, + "loss": 0.8339, + "step": 3351 + }, + { + "epoch": 0.2758280189261469, + "grad_norm": 1.6044837569260737, + "learning_rate": 1.6994805061388597e-05, + "loss": 0.8433, + "step": 3352 + }, + { + "epoch": 0.2759103065212919, + "grad_norm": 1.9980538892293163, + "learning_rate": 1.699290004969416e-05, + "loss": 0.8173, + "step": 3353 + }, + { + "epoch": 0.27599259411643695, + "grad_norm": 1.4377196318751049, + "learning_rate": 1.6990994541235287e-05, + "loss": 0.8204, + "step": 3354 + }, + { + "epoch": 0.27607488171158195, + "grad_norm": 1.945350080976809, + "learning_rate": 1.6989088536147343e-05, + "loss": 0.8322, + "step": 3355 + }, + { + "epoch": 0.276157169306727, + "grad_norm": 2.294389988296374, + "learning_rate": 1.6987182034565727e-05, + "loss": 0.7996, + "step": 3356 + }, + { + "epoch": 0.276239456901872, + "grad_norm": 1.41771829095886, + "learning_rate": 1.698527503662587e-05, + "loss": 0.8343, + "step": 3357 + }, + { + "epoch": 0.2763217444970171, + "grad_norm": 1.5926276389564853, + "learning_rate": 1.698336754246325e-05, + "loss": 0.7884, + "step": 3358 + }, + { + "epoch": 0.27640403209216213, + "grad_norm": 1.586524028319989, + "learning_rate": 1.6981459552213363e-05, + "loss": 0.8287, + "step": 3359 + }, + { + "epoch": 0.27648631968730714, + "grad_norm": 1.740900423555991, + "learning_rate": 1.697955106601176e-05, + "loss": 0.8004, + "step": 3360 + }, + { + "epoch": 0.2765686072824522, + "grad_norm": 1.758580264555415, + "learning_rate": 1.6977642083994006e-05, + "loss": 0.8201, + "step": 3361 + }, + { + "epoch": 0.2766508948775972, + "grad_norm": 0.4741258364442609, + "learning_rate": 1.697573260629572e-05, + "loss": 0.5618, + "step": 3362 + }, + { + "epoch": 0.27673318247274226, + "grad_norm": 2.1009382288448015, + "learning_rate": 1.6973822633052547e-05, + "loss": 0.809, + "step": 3363 + }, + { + "epoch": 0.27681547006788726, + "grad_norm": 1.4906072475343315, + "learning_rate": 1.6971912164400163e-05, + "loss": 0.7993, + "step": 3364 + }, + { + "epoch": 0.2768977576630323, + "grad_norm": 0.4185664842935336, + "learning_rate": 1.6970001200474296e-05, + "loss": 0.5429, + "step": 3365 + }, + { + "epoch": 0.2769800452581773, + "grad_norm": 2.0662172481020034, + "learning_rate": 1.6968089741410684e-05, + "loss": 0.8321, + "step": 3366 + }, + { + "epoch": 0.2770623328533224, + "grad_norm": 0.42729417981410084, + "learning_rate": 1.6966177787345125e-05, + "loss": 0.5264, + "step": 3367 + }, + { + "epoch": 0.2771446204484674, + "grad_norm": 1.4645916126136944, + "learning_rate": 1.6964265338413434e-05, + "loss": 0.8361, + "step": 3368 + }, + { + "epoch": 0.27722690804361244, + "grad_norm": 2.0284938459903494, + "learning_rate": 1.6962352394751473e-05, + "loss": 0.7948, + "step": 3369 + }, + { + "epoch": 0.27730919563875744, + "grad_norm": 1.8160325925978076, + "learning_rate": 1.696043895649513e-05, + "loss": 0.8187, + "step": 3370 + }, + { + "epoch": 0.2773914832339025, + "grad_norm": 1.6372608945764426, + "learning_rate": 1.6958525023780337e-05, + "loss": 0.8418, + "step": 3371 + }, + { + "epoch": 0.2774737708290475, + "grad_norm": 1.4144229825435075, + "learning_rate": 1.6956610596743057e-05, + "loss": 0.8092, + "step": 3372 + }, + { + "epoch": 0.27755605842419256, + "grad_norm": 1.488036616750426, + "learning_rate": 1.695469567551928e-05, + "loss": 0.8124, + "step": 3373 + }, + { + "epoch": 0.27763834601933757, + "grad_norm": 1.4610354658407356, + "learning_rate": 1.695278026024505e-05, + "loss": 0.7949, + "step": 3374 + }, + { + "epoch": 0.2777206336144826, + "grad_norm": 1.5881282267404018, + "learning_rate": 1.6950864351056426e-05, + "loss": 0.8117, + "step": 3375 + }, + { + "epoch": 0.27780292120962763, + "grad_norm": 2.0521286867664714, + "learning_rate": 1.6948947948089512e-05, + "loss": 0.7922, + "step": 3376 + }, + { + "epoch": 0.2778852088047727, + "grad_norm": 0.4982262468961942, + "learning_rate": 1.6947031051480457e-05, + "loss": 0.5381, + "step": 3377 + }, + { + "epoch": 0.2779674963999177, + "grad_norm": 1.932526443724488, + "learning_rate": 1.694511366136542e-05, + "loss": 0.8203, + "step": 3378 + }, + { + "epoch": 0.27804978399506275, + "grad_norm": 2.0835624700650395, + "learning_rate": 1.6943195777880615e-05, + "loss": 0.8289, + "step": 3379 + }, + { + "epoch": 0.27813207159020775, + "grad_norm": 0.41924968430352944, + "learning_rate": 1.6941277401162292e-05, + "loss": 0.5271, + "step": 3380 + }, + { + "epoch": 0.2782143591853528, + "grad_norm": 0.42900497961077644, + "learning_rate": 1.693935853134672e-05, + "loss": 0.527, + "step": 3381 + }, + { + "epoch": 0.2782966467804978, + "grad_norm": 1.70942845007818, + "learning_rate": 1.6937439168570217e-05, + "loss": 0.8289, + "step": 3382 + }, + { + "epoch": 0.2783789343756429, + "grad_norm": 0.4169202457645185, + "learning_rate": 1.693551931296913e-05, + "loss": 0.5405, + "step": 3383 + }, + { + "epoch": 0.27846122197078793, + "grad_norm": 1.5249997090807386, + "learning_rate": 1.693359896467984e-05, + "loss": 0.7746, + "step": 3384 + }, + { + "epoch": 0.27854350956593293, + "grad_norm": 1.844099104485872, + "learning_rate": 1.693167812383877e-05, + "loss": 0.8101, + "step": 3385 + }, + { + "epoch": 0.278625797161078, + "grad_norm": 0.4620191998761672, + "learning_rate": 1.6929756790582374e-05, + "loss": 0.5587, + "step": 3386 + }, + { + "epoch": 0.278708084756223, + "grad_norm": 1.8276177937378877, + "learning_rate": 1.6927834965047134e-05, + "loss": 0.8152, + "step": 3387 + }, + { + "epoch": 0.27879037235136805, + "grad_norm": 1.521403662930178, + "learning_rate": 1.692591264736958e-05, + "loss": 0.8188, + "step": 3388 + }, + { + "epoch": 0.27887265994651306, + "grad_norm": 4.367006002729017, + "learning_rate": 1.6923989837686266e-05, + "loss": 0.7981, + "step": 3389 + }, + { + "epoch": 0.2789549475416581, + "grad_norm": 1.5441132673864073, + "learning_rate": 1.692206653613379e-05, + "loss": 0.8155, + "step": 3390 + }, + { + "epoch": 0.2790372351368031, + "grad_norm": 0.43893254704922163, + "learning_rate": 1.6920142742848775e-05, + "loss": 0.5416, + "step": 3391 + }, + { + "epoch": 0.2791195227319482, + "grad_norm": 2.6249098901055126, + "learning_rate": 1.6918218457967888e-05, + "loss": 0.8534, + "step": 3392 + }, + { + "epoch": 0.2792018103270932, + "grad_norm": 1.627178716854893, + "learning_rate": 1.6916293681627823e-05, + "loss": 0.8187, + "step": 3393 + }, + { + "epoch": 0.27928409792223824, + "grad_norm": 1.9555270765008443, + "learning_rate": 1.691436841396532e-05, + "loss": 0.8209, + "step": 3394 + }, + { + "epoch": 0.27936638551738324, + "grad_norm": 0.4305921737725394, + "learning_rate": 1.6912442655117144e-05, + "loss": 0.5327, + "step": 3395 + }, + { + "epoch": 0.2794486731125283, + "grad_norm": 1.490785336532508, + "learning_rate": 1.691051640522009e-05, + "loss": 0.7933, + "step": 3396 + }, + { + "epoch": 0.2795309607076733, + "grad_norm": 2.677448047961783, + "learning_rate": 1.6908589664411007e-05, + "loss": 0.7862, + "step": 3397 + }, + { + "epoch": 0.27961324830281836, + "grad_norm": 2.3547165596097472, + "learning_rate": 1.6906662432826763e-05, + "loss": 0.8082, + "step": 3398 + }, + { + "epoch": 0.27969553589796337, + "grad_norm": 1.6494931737575393, + "learning_rate": 1.690473471060426e-05, + "loss": 0.8217, + "step": 3399 + }, + { + "epoch": 0.2797778234931084, + "grad_norm": 1.6765976469942176, + "learning_rate": 1.6902806497880454e-05, + "loss": 0.8225, + "step": 3400 + }, + { + "epoch": 0.2798601110882534, + "grad_norm": 0.46128265450139233, + "learning_rate": 1.690087779479231e-05, + "loss": 0.5631, + "step": 3401 + }, + { + "epoch": 0.2799423986833985, + "grad_norm": 5.213573703540176, + "learning_rate": 1.6898948601476842e-05, + "loss": 0.8275, + "step": 3402 + }, + { + "epoch": 0.2800246862785435, + "grad_norm": 1.9002942814116162, + "learning_rate": 1.68970189180711e-05, + "loss": 0.8315, + "step": 3403 + }, + { + "epoch": 0.28010697387368855, + "grad_norm": 1.710672469450862, + "learning_rate": 1.6895088744712164e-05, + "loss": 0.814, + "step": 3404 + }, + { + "epoch": 0.28018926146883355, + "grad_norm": 2.4750463606181876, + "learning_rate": 1.689315808153715e-05, + "loss": 0.7951, + "step": 3405 + }, + { + "epoch": 0.2802715490639786, + "grad_norm": 0.4357926929191528, + "learning_rate": 1.6891226928683213e-05, + "loss": 0.5428, + "step": 3406 + }, + { + "epoch": 0.2803538366591236, + "grad_norm": 1.6990175521384374, + "learning_rate": 1.688929528628753e-05, + "loss": 0.8015, + "step": 3407 + }, + { + "epoch": 0.28043612425426867, + "grad_norm": 1.8882436532486544, + "learning_rate": 1.6887363154487336e-05, + "loss": 0.828, + "step": 3408 + }, + { + "epoch": 0.2805184118494137, + "grad_norm": 2.155423063392042, + "learning_rate": 1.688543053341987e-05, + "loss": 0.8267, + "step": 3409 + }, + { + "epoch": 0.28060069944455873, + "grad_norm": 0.4279141131980349, + "learning_rate": 1.6883497423222435e-05, + "loss": 0.5409, + "step": 3410 + }, + { + "epoch": 0.2806829870397038, + "grad_norm": 2.2826003792873366, + "learning_rate": 1.6881563824032354e-05, + "loss": 0.7995, + "step": 3411 + }, + { + "epoch": 0.2807652746348488, + "grad_norm": 1.9585010119599398, + "learning_rate": 1.6879629735986978e-05, + "loss": 0.8228, + "step": 3412 + }, + { + "epoch": 0.28084756222999385, + "grad_norm": 2.6976405683684583, + "learning_rate": 1.687769515922371e-05, + "loss": 0.8281, + "step": 3413 + }, + { + "epoch": 0.28092984982513886, + "grad_norm": 0.4670841176023679, + "learning_rate": 1.6875760093879978e-05, + "loss": 0.5713, + "step": 3414 + }, + { + "epoch": 0.2810121374202839, + "grad_norm": 0.46234681406345274, + "learning_rate": 1.6873824540093245e-05, + "loss": 0.5615, + "step": 3415 + }, + { + "epoch": 0.2810944250154289, + "grad_norm": 1.7765530034273658, + "learning_rate": 1.687188849800101e-05, + "loss": 0.8235, + "step": 3416 + }, + { + "epoch": 0.281176712610574, + "grad_norm": 0.42952479338740623, + "learning_rate": 1.6869951967740806e-05, + "loss": 0.5411, + "step": 3417 + }, + { + "epoch": 0.281259000205719, + "grad_norm": 2.0545519167604427, + "learning_rate": 1.68680149494502e-05, + "loss": 0.8392, + "step": 3418 + }, + { + "epoch": 0.28134128780086404, + "grad_norm": 2.3305775779080875, + "learning_rate": 1.68660774432668e-05, + "loss": 0.8384, + "step": 3419 + }, + { + "epoch": 0.28142357539600904, + "grad_norm": 2.2918573491440593, + "learning_rate": 1.6864139449328237e-05, + "loss": 0.8096, + "step": 3420 + }, + { + "epoch": 0.2815058629911541, + "grad_norm": 2.065669036400418, + "learning_rate": 1.686220096777218e-05, + "loss": 0.8052, + "step": 3421 + }, + { + "epoch": 0.2815881505862991, + "grad_norm": 2.6104333684954164, + "learning_rate": 1.6860261998736347e-05, + "loss": 0.831, + "step": 3422 + }, + { + "epoch": 0.28167043818144416, + "grad_norm": 3.4434340957132856, + "learning_rate": 1.685832254235847e-05, + "loss": 0.7722, + "step": 3423 + }, + { + "epoch": 0.28175272577658916, + "grad_norm": 1.8938224377515387, + "learning_rate": 1.685638259877633e-05, + "loss": 0.7882, + "step": 3424 + }, + { + "epoch": 0.2818350133717342, + "grad_norm": 2.552662398864058, + "learning_rate": 1.6854442168127733e-05, + "loss": 0.843, + "step": 3425 + }, + { + "epoch": 0.2819173009668792, + "grad_norm": 3.748955949469818, + "learning_rate": 1.6852501250550527e-05, + "loss": 0.8169, + "step": 3426 + }, + { + "epoch": 0.2819995885620243, + "grad_norm": 0.46337970709394594, + "learning_rate": 1.685055984618259e-05, + "loss": 0.5366, + "step": 3427 + }, + { + "epoch": 0.2820818761571693, + "grad_norm": 2.703284585055913, + "learning_rate": 1.684861795516184e-05, + "loss": 0.8095, + "step": 3428 + }, + { + "epoch": 0.28216416375231435, + "grad_norm": 4.795160195831921, + "learning_rate": 1.684667557762622e-05, + "loss": 0.8175, + "step": 3429 + }, + { + "epoch": 0.28224645134745935, + "grad_norm": 1.9984490983691139, + "learning_rate": 1.6844732713713718e-05, + "loss": 0.8195, + "step": 3430 + }, + { + "epoch": 0.2823287389426044, + "grad_norm": 2.2528306971361842, + "learning_rate": 1.6842789363562354e-05, + "loss": 0.802, + "step": 3431 + }, + { + "epoch": 0.2824110265377494, + "grad_norm": 2.624866901729511, + "learning_rate": 1.6840845527310176e-05, + "loss": 0.7985, + "step": 3432 + }, + { + "epoch": 0.28249331413289447, + "grad_norm": 4.036532510382469, + "learning_rate": 1.6838901205095267e-05, + "loss": 0.8637, + "step": 3433 + }, + { + "epoch": 0.28257560172803947, + "grad_norm": 2.4014428806472012, + "learning_rate": 1.683695639705576e-05, + "loss": 0.8383, + "step": 3434 + }, + { + "epoch": 0.28265788932318453, + "grad_norm": 2.339737367909026, + "learning_rate": 1.68350111033298e-05, + "loss": 0.821, + "step": 3435 + }, + { + "epoch": 0.2827401769183296, + "grad_norm": 0.4517504767826104, + "learning_rate": 1.6833065324055582e-05, + "loss": 0.5493, + "step": 3436 + }, + { + "epoch": 0.2828224645134746, + "grad_norm": 2.23861206286531, + "learning_rate": 1.6831119059371332e-05, + "loss": 0.792, + "step": 3437 + }, + { + "epoch": 0.28290475210861965, + "grad_norm": 3.041436048220035, + "learning_rate": 1.6829172309415313e-05, + "loss": 0.8169, + "step": 3438 + }, + { + "epoch": 0.28298703970376465, + "grad_norm": 2.847666493958736, + "learning_rate": 1.6827225074325812e-05, + "loss": 0.767, + "step": 3439 + }, + { + "epoch": 0.2830693272989097, + "grad_norm": 0.4506691367955305, + "learning_rate": 1.6825277354241156e-05, + "loss": 0.5467, + "step": 3440 + }, + { + "epoch": 0.2831516148940547, + "grad_norm": 2.657056454748742, + "learning_rate": 1.6823329149299716e-05, + "loss": 0.8191, + "step": 3441 + }, + { + "epoch": 0.2832339024891998, + "grad_norm": 0.4452525425742602, + "learning_rate": 1.6821380459639888e-05, + "loss": 0.5525, + "step": 3442 + }, + { + "epoch": 0.2833161900843448, + "grad_norm": 2.421257029111015, + "learning_rate": 1.6819431285400096e-05, + "loss": 0.7846, + "step": 3443 + }, + { + "epoch": 0.28339847767948984, + "grad_norm": 2.16142347243168, + "learning_rate": 1.6817481626718818e-05, + "loss": 0.8067, + "step": 3444 + }, + { + "epoch": 0.28348076527463484, + "grad_norm": 2.064303219148629, + "learning_rate": 1.6815531483734543e-05, + "loss": 0.7633, + "step": 3445 + }, + { + "epoch": 0.2835630528697799, + "grad_norm": 2.0973469858510887, + "learning_rate": 1.681358085658581e-05, + "loss": 0.8081, + "step": 3446 + }, + { + "epoch": 0.2836453404649249, + "grad_norm": 2.3990758947427264, + "learning_rate": 1.6811629745411195e-05, + "loss": 0.8388, + "step": 3447 + }, + { + "epoch": 0.28372762806006996, + "grad_norm": 1.821405179949142, + "learning_rate": 1.6809678150349293e-05, + "loss": 0.8008, + "step": 3448 + }, + { + "epoch": 0.28380991565521496, + "grad_norm": 0.4627300071898438, + "learning_rate": 1.6807726071538745e-05, + "loss": 0.5309, + "step": 3449 + }, + { + "epoch": 0.28389220325036, + "grad_norm": 2.4438573828258385, + "learning_rate": 1.6805773509118227e-05, + "loss": 0.8391, + "step": 3450 + }, + { + "epoch": 0.283974490845505, + "grad_norm": 0.4373096974424113, + "learning_rate": 1.6803820463226443e-05, + "loss": 0.5076, + "step": 3451 + }, + { + "epoch": 0.2840567784406501, + "grad_norm": 2.5300726616808538, + "learning_rate": 1.6801866934002133e-05, + "loss": 0.8217, + "step": 3452 + }, + { + "epoch": 0.2841390660357951, + "grad_norm": 0.4513361038680293, + "learning_rate": 1.6799912921584075e-05, + "loss": 0.5711, + "step": 3453 + }, + { + "epoch": 0.28422135363094014, + "grad_norm": 0.44598526131806787, + "learning_rate": 1.6797958426111077e-05, + "loss": 0.5681, + "step": 3454 + }, + { + "epoch": 0.28430364122608515, + "grad_norm": 2.1791061463851435, + "learning_rate": 1.6796003447721983e-05, + "loss": 0.8099, + "step": 3455 + }, + { + "epoch": 0.2843859288212302, + "grad_norm": 2.1521791907156484, + "learning_rate": 1.6794047986555676e-05, + "loss": 0.8312, + "step": 3456 + }, + { + "epoch": 0.2844682164163752, + "grad_norm": 1.7549749460192268, + "learning_rate": 1.679209204275106e-05, + "loss": 0.7879, + "step": 3457 + }, + { + "epoch": 0.28455050401152027, + "grad_norm": 1.9520709768223774, + "learning_rate": 1.6790135616447095e-05, + "loss": 0.7895, + "step": 3458 + }, + { + "epoch": 0.28463279160666527, + "grad_norm": 1.9994730042248807, + "learning_rate": 1.678817870778275e-05, + "loss": 0.7944, + "step": 3459 + }, + { + "epoch": 0.28471507920181033, + "grad_norm": 2.093491173269509, + "learning_rate": 1.6786221316897044e-05, + "loss": 0.8137, + "step": 3460 + }, + { + "epoch": 0.28479736679695533, + "grad_norm": 1.5889757859406402, + "learning_rate": 1.6784263443929033e-05, + "loss": 0.8225, + "step": 3461 + }, + { + "epoch": 0.2848796543921004, + "grad_norm": 0.4634819706610228, + "learning_rate": 1.6782305089017797e-05, + "loss": 0.5537, + "step": 3462 + }, + { + "epoch": 0.28496194198724545, + "grad_norm": 1.8487655355484818, + "learning_rate": 1.678034625230245e-05, + "loss": 0.8266, + "step": 3463 + }, + { + "epoch": 0.28504422958239045, + "grad_norm": 2.038986069756992, + "learning_rate": 1.6778386933922153e-05, + "loss": 0.8367, + "step": 3464 + }, + { + "epoch": 0.2851265171775355, + "grad_norm": 2.17004702794202, + "learning_rate": 1.6776427134016087e-05, + "loss": 0.817, + "step": 3465 + }, + { + "epoch": 0.2852088047726805, + "grad_norm": 1.9199445673909137, + "learning_rate": 1.6774466852723474e-05, + "loss": 0.8215, + "step": 3466 + }, + { + "epoch": 0.28529109236782557, + "grad_norm": 1.7262092987215405, + "learning_rate": 1.677250609018357e-05, + "loss": 0.8053, + "step": 3467 + }, + { + "epoch": 0.2853733799629706, + "grad_norm": 1.5914902114461569, + "learning_rate": 1.6770544846535666e-05, + "loss": 0.7638, + "step": 3468 + }, + { + "epoch": 0.28545566755811563, + "grad_norm": 1.6011141890970073, + "learning_rate": 1.676858312191908e-05, + "loss": 0.8046, + "step": 3469 + }, + { + "epoch": 0.28553795515326064, + "grad_norm": 0.4546462728884523, + "learning_rate": 1.6766620916473177e-05, + "loss": 0.5333, + "step": 3470 + }, + { + "epoch": 0.2856202427484057, + "grad_norm": 4.032771336860496, + "learning_rate": 1.6764658230337346e-05, + "loss": 0.785, + "step": 3471 + }, + { + "epoch": 0.2857025303435507, + "grad_norm": 1.7267029252870196, + "learning_rate": 1.6762695063651013e-05, + "loss": 0.828, + "step": 3472 + }, + { + "epoch": 0.28578481793869576, + "grad_norm": 1.5879765058811814, + "learning_rate": 1.6760731416553638e-05, + "loss": 0.8442, + "step": 3473 + }, + { + "epoch": 0.28586710553384076, + "grad_norm": 3.4180243470837173, + "learning_rate": 1.6758767289184715e-05, + "loss": 0.8218, + "step": 3474 + }, + { + "epoch": 0.2859493931289858, + "grad_norm": 1.7781900009410883, + "learning_rate": 1.675680268168377e-05, + "loss": 0.8227, + "step": 3475 + }, + { + "epoch": 0.2860316807241308, + "grad_norm": 1.4123618115578194, + "learning_rate": 1.6754837594190372e-05, + "loss": 0.7955, + "step": 3476 + }, + { + "epoch": 0.2861139683192759, + "grad_norm": 1.6657900768321887, + "learning_rate": 1.6752872026844114e-05, + "loss": 0.7854, + "step": 3477 + }, + { + "epoch": 0.2861962559144209, + "grad_norm": 1.6508324682525461, + "learning_rate": 1.6750905979784622e-05, + "loss": 0.8093, + "step": 3478 + }, + { + "epoch": 0.28627854350956594, + "grad_norm": 2.0792386003784196, + "learning_rate": 1.6748939453151573e-05, + "loss": 0.8028, + "step": 3479 + }, + { + "epoch": 0.28636083110471094, + "grad_norm": 1.5017518287802412, + "learning_rate": 1.674697244708465e-05, + "loss": 0.8267, + "step": 3480 + }, + { + "epoch": 0.286443118699856, + "grad_norm": 1.581528239807794, + "learning_rate": 1.6745004961723604e-05, + "loss": 0.8114, + "step": 3481 + }, + { + "epoch": 0.286525406295001, + "grad_norm": 1.4857891678884565, + "learning_rate": 1.674303699720819e-05, + "loss": 0.7895, + "step": 3482 + }, + { + "epoch": 0.28660769389014606, + "grad_norm": 1.5787661760144176, + "learning_rate": 1.6741068553678208e-05, + "loss": 0.8402, + "step": 3483 + }, + { + "epoch": 0.28668998148529107, + "grad_norm": 1.900995643556635, + "learning_rate": 1.6739099631273497e-05, + "loss": 0.8176, + "step": 3484 + }, + { + "epoch": 0.2867722690804361, + "grad_norm": 1.9360343091947392, + "learning_rate": 1.6737130230133927e-05, + "loss": 0.799, + "step": 3485 + }, + { + "epoch": 0.28685455667558113, + "grad_norm": 1.7475632880592815, + "learning_rate": 1.67351603503994e-05, + "loss": 0.809, + "step": 3486 + }, + { + "epoch": 0.2869368442707262, + "grad_norm": 2.2072607505968183, + "learning_rate": 1.6733189992209852e-05, + "loss": 0.8084, + "step": 3487 + }, + { + "epoch": 0.28701913186587125, + "grad_norm": 2.3932208696484776, + "learning_rate": 1.6731219155705258e-05, + "loss": 0.8207, + "step": 3488 + }, + { + "epoch": 0.28710141946101625, + "grad_norm": 1.8002529054592749, + "learning_rate": 1.6729247841025618e-05, + "loss": 0.7709, + "step": 3489 + }, + { + "epoch": 0.2871837070561613, + "grad_norm": 2.0561098850829205, + "learning_rate": 1.6727276048310974e-05, + "loss": 0.8071, + "step": 3490 + }, + { + "epoch": 0.2872659946513063, + "grad_norm": 1.8680311857539131, + "learning_rate": 1.67253037777014e-05, + "loss": 0.7797, + "step": 3491 + }, + { + "epoch": 0.28734828224645137, + "grad_norm": 3.5448812326687045, + "learning_rate": 1.6723331029336994e-05, + "loss": 0.8386, + "step": 3492 + }, + { + "epoch": 0.2874305698415964, + "grad_norm": 2.056876501784213, + "learning_rate": 1.672135780335791e-05, + "loss": 0.8361, + "step": 3493 + }, + { + "epoch": 0.28751285743674143, + "grad_norm": 2.136317029629569, + "learning_rate": 1.6719384099904318e-05, + "loss": 0.8258, + "step": 3494 + }, + { + "epoch": 0.28759514503188643, + "grad_norm": 2.5939735686338232, + "learning_rate": 1.671740991911642e-05, + "loss": 0.825, + "step": 3495 + }, + { + "epoch": 0.2876774326270315, + "grad_norm": 2.0944690429347004, + "learning_rate": 1.671543526113447e-05, + "loss": 0.8222, + "step": 3496 + }, + { + "epoch": 0.2877597202221765, + "grad_norm": 3.411959560559101, + "learning_rate": 1.6713460126098736e-05, + "loss": 0.8005, + "step": 3497 + }, + { + "epoch": 0.28784200781732155, + "grad_norm": 2.110041202716306, + "learning_rate": 1.671148451414953e-05, + "loss": 0.8173, + "step": 3498 + }, + { + "epoch": 0.28792429541246656, + "grad_norm": 0.45389707370787435, + "learning_rate": 1.6709508425427202e-05, + "loss": 0.5691, + "step": 3499 + }, + { + "epoch": 0.2880065830076116, + "grad_norm": 2.2752430550938874, + "learning_rate": 1.6707531860072122e-05, + "loss": 0.779, + "step": 3500 + }, + { + "epoch": 0.2880888706027566, + "grad_norm": 2.9098953632036104, + "learning_rate": 1.670555481822471e-05, + "loss": 0.7919, + "step": 3501 + }, + { + "epoch": 0.2881711581979017, + "grad_norm": 1.8995725284770844, + "learning_rate": 1.67035773000254e-05, + "loss": 0.8184, + "step": 3502 + }, + { + "epoch": 0.2882534457930467, + "grad_norm": 2.003185274187615, + "learning_rate": 1.6701599305614685e-05, + "loss": 0.8289, + "step": 3503 + }, + { + "epoch": 0.28833573338819174, + "grad_norm": 2.084723279405393, + "learning_rate": 1.669962083513307e-05, + "loss": 0.8283, + "step": 3504 + }, + { + "epoch": 0.28841802098333674, + "grad_norm": 4.067890464085165, + "learning_rate": 1.6697641888721107e-05, + "loss": 0.8025, + "step": 3505 + }, + { + "epoch": 0.2885003085784818, + "grad_norm": 0.4481290034369956, + "learning_rate": 1.6695662466519377e-05, + "loss": 0.5435, + "step": 3506 + }, + { + "epoch": 0.2885825961736268, + "grad_norm": 2.278029783924429, + "learning_rate": 1.669368256866849e-05, + "loss": 0.8009, + "step": 3507 + }, + { + "epoch": 0.28866488376877186, + "grad_norm": 0.4230219089422917, + "learning_rate": 1.6691702195309105e-05, + "loss": 0.5255, + "step": 3508 + }, + { + "epoch": 0.28874717136391687, + "grad_norm": 0.45033485534258255, + "learning_rate": 1.6689721346581892e-05, + "loss": 0.5458, + "step": 3509 + }, + { + "epoch": 0.2888294589590619, + "grad_norm": 1.960789229166538, + "learning_rate": 1.6687740022627573e-05, + "loss": 0.8029, + "step": 3510 + }, + { + "epoch": 0.2889117465542069, + "grad_norm": 1.9707966632273453, + "learning_rate": 1.66857582235869e-05, + "loss": 0.8049, + "step": 3511 + }, + { + "epoch": 0.288994034149352, + "grad_norm": 3.348967297731327, + "learning_rate": 1.6683775949600654e-05, + "loss": 0.7836, + "step": 3512 + }, + { + "epoch": 0.28907632174449704, + "grad_norm": 0.4351955326287272, + "learning_rate": 1.6681793200809656e-05, + "loss": 0.5538, + "step": 3513 + }, + { + "epoch": 0.28915860933964205, + "grad_norm": 2.0498748058278453, + "learning_rate": 1.6679809977354754e-05, + "loss": 0.7876, + "step": 3514 + }, + { + "epoch": 0.2892408969347871, + "grad_norm": 1.8919493558704104, + "learning_rate": 1.6677826279376832e-05, + "loss": 0.8057, + "step": 3515 + }, + { + "epoch": 0.2893231845299321, + "grad_norm": 2.334579373937873, + "learning_rate": 1.6675842107016814e-05, + "loss": 0.7895, + "step": 3516 + }, + { + "epoch": 0.28940547212507717, + "grad_norm": 5.3766265211063375, + "learning_rate": 1.6673857460415647e-05, + "loss": 0.7938, + "step": 3517 + }, + { + "epoch": 0.28948775972022217, + "grad_norm": 1.971049825711844, + "learning_rate": 1.667187233971432e-05, + "loss": 0.8235, + "step": 3518 + }, + { + "epoch": 0.28957004731536723, + "grad_norm": 2.2536217470647375, + "learning_rate": 1.666988674505385e-05, + "loss": 0.7829, + "step": 3519 + }, + { + "epoch": 0.28965233491051223, + "grad_norm": 6.287185634952787, + "learning_rate": 1.666790067657529e-05, + "loss": 0.8167, + "step": 3520 + }, + { + "epoch": 0.2897346225056573, + "grad_norm": 2.0640992367691884, + "learning_rate": 1.666591413441974e-05, + "loss": 0.786, + "step": 3521 + }, + { + "epoch": 0.2898169101008023, + "grad_norm": 1.8931435716380844, + "learning_rate": 1.6663927118728302e-05, + "loss": 0.8118, + "step": 3522 + }, + { + "epoch": 0.28989919769594735, + "grad_norm": 2.5014981392717015, + "learning_rate": 1.6661939629642142e-05, + "loss": 0.8086, + "step": 3523 + }, + { + "epoch": 0.28998148529109236, + "grad_norm": 2.2167087484870898, + "learning_rate": 1.665995166730244e-05, + "loss": 0.783, + "step": 3524 + }, + { + "epoch": 0.2900637728862374, + "grad_norm": 2.0101485502511878, + "learning_rate": 1.6657963231850432e-05, + "loss": 0.7882, + "step": 3525 + }, + { + "epoch": 0.2901460604813824, + "grad_norm": 2.291672665287918, + "learning_rate": 1.6655974323427354e-05, + "loss": 0.8176, + "step": 3526 + }, + { + "epoch": 0.2902283480765275, + "grad_norm": 2.195417285730131, + "learning_rate": 1.6653984942174513e-05, + "loss": 0.8262, + "step": 3527 + }, + { + "epoch": 0.2903106356716725, + "grad_norm": 0.45560175155715454, + "learning_rate": 1.665199508823322e-05, + "loss": 0.5565, + "step": 3528 + }, + { + "epoch": 0.29039292326681754, + "grad_norm": 0.4457673720306088, + "learning_rate": 1.665000476174483e-05, + "loss": 0.5617, + "step": 3529 + }, + { + "epoch": 0.29047521086196254, + "grad_norm": 1.850716696724229, + "learning_rate": 1.6648013962850743e-05, + "loss": 0.7899, + "step": 3530 + }, + { + "epoch": 0.2905574984571076, + "grad_norm": 2.246953512173027, + "learning_rate": 1.6646022691692373e-05, + "loss": 0.7851, + "step": 3531 + }, + { + "epoch": 0.2906397860522526, + "grad_norm": 2.8944961863444636, + "learning_rate": 1.6644030948411177e-05, + "loss": 0.8117, + "step": 3532 + }, + { + "epoch": 0.29072207364739766, + "grad_norm": 2.073029290927673, + "learning_rate": 1.6642038733148654e-05, + "loss": 0.8105, + "step": 3533 + }, + { + "epoch": 0.29080436124254266, + "grad_norm": 1.9774566624610654, + "learning_rate": 1.664004604604632e-05, + "loss": 0.8142, + "step": 3534 + }, + { + "epoch": 0.2908866488376877, + "grad_norm": 1.84749961783582, + "learning_rate": 1.6638052887245733e-05, + "loss": 0.8005, + "step": 3535 + }, + { + "epoch": 0.2909689364328327, + "grad_norm": 0.48264343720441555, + "learning_rate": 1.6636059256888484e-05, + "loss": 0.5329, + "step": 3536 + }, + { + "epoch": 0.2910512240279778, + "grad_norm": 2.3891852329936207, + "learning_rate": 1.66340651551162e-05, + "loss": 0.7946, + "step": 3537 + }, + { + "epoch": 0.2911335116231228, + "grad_norm": 1.9987906573804815, + "learning_rate": 1.6632070582070536e-05, + "loss": 0.7835, + "step": 3538 + }, + { + "epoch": 0.29121579921826785, + "grad_norm": 2.7784467982667107, + "learning_rate": 1.6630075537893183e-05, + "loss": 0.8092, + "step": 3539 + }, + { + "epoch": 0.2912980868134129, + "grad_norm": 0.4415210978944295, + "learning_rate": 1.6628080022725866e-05, + "loss": 0.5425, + "step": 3540 + }, + { + "epoch": 0.2913803744085579, + "grad_norm": 2.380535919426329, + "learning_rate": 1.662608403671035e-05, + "loss": 0.8228, + "step": 3541 + }, + { + "epoch": 0.29146266200370297, + "grad_norm": 5.061045389787459, + "learning_rate": 1.6624087579988416e-05, + "loss": 0.8045, + "step": 3542 + }, + { + "epoch": 0.29154494959884797, + "grad_norm": 1.8563695073025637, + "learning_rate": 1.6622090652701896e-05, + "loss": 0.8018, + "step": 3543 + }, + { + "epoch": 0.291627237193993, + "grad_norm": 1.999734316894825, + "learning_rate": 1.6620093254992646e-05, + "loss": 0.7917, + "step": 3544 + }, + { + "epoch": 0.29170952478913803, + "grad_norm": 1.9433176007559034, + "learning_rate": 1.6618095387002556e-05, + "loss": 0.7849, + "step": 3545 + }, + { + "epoch": 0.2917918123842831, + "grad_norm": 2.2309708240767576, + "learning_rate": 1.6616097048873557e-05, + "loss": 0.8153, + "step": 3546 + }, + { + "epoch": 0.2918740999794281, + "grad_norm": 2.341814818226442, + "learning_rate": 1.6614098240747606e-05, + "loss": 0.8647, + "step": 3547 + }, + { + "epoch": 0.29195638757457315, + "grad_norm": 2.9791401434190417, + "learning_rate": 1.661209896276669e-05, + "loss": 0.7999, + "step": 3548 + }, + { + "epoch": 0.29203867516971815, + "grad_norm": 0.430303260464014, + "learning_rate": 1.661009921507284e-05, + "loss": 0.5291, + "step": 3549 + }, + { + "epoch": 0.2921209627648632, + "grad_norm": 1.8123400429439562, + "learning_rate": 1.6608098997808114e-05, + "loss": 0.8052, + "step": 3550 + }, + { + "epoch": 0.2922032503600082, + "grad_norm": 2.347163456518436, + "learning_rate": 1.66060983111146e-05, + "loss": 0.8203, + "step": 3551 + }, + { + "epoch": 0.2922855379551533, + "grad_norm": 1.721626934446873, + "learning_rate": 1.6604097155134427e-05, + "loss": 0.797, + "step": 3552 + }, + { + "epoch": 0.2923678255502983, + "grad_norm": 2.201616811051578, + "learning_rate": 1.660209553000976e-05, + "loss": 0.7654, + "step": 3553 + }, + { + "epoch": 0.29245011314544334, + "grad_norm": 1.816300835480641, + "learning_rate": 1.6600093435882777e-05, + "loss": 0.8073, + "step": 3554 + }, + { + "epoch": 0.29253240074058834, + "grad_norm": 0.4658253741741589, + "learning_rate": 1.6598090872895715e-05, + "loss": 0.5435, + "step": 3555 + }, + { + "epoch": 0.2926146883357334, + "grad_norm": 0.4081132818015702, + "learning_rate": 1.6596087841190832e-05, + "loss": 0.5015, + "step": 3556 + }, + { + "epoch": 0.2926969759308784, + "grad_norm": 1.9217498282118224, + "learning_rate": 1.6594084340910416e-05, + "loss": 0.8015, + "step": 3557 + }, + { + "epoch": 0.29277926352602346, + "grad_norm": 2.459476842735801, + "learning_rate": 1.659208037219679e-05, + "loss": 0.822, + "step": 3558 + }, + { + "epoch": 0.29286155112116846, + "grad_norm": 0.4187755012731122, + "learning_rate": 1.659007593519232e-05, + "loss": 0.539, + "step": 3559 + }, + { + "epoch": 0.2929438387163135, + "grad_norm": 0.4439044605797784, + "learning_rate": 1.6588071030039395e-05, + "loss": 0.5437, + "step": 3560 + }, + { + "epoch": 0.2930261263114585, + "grad_norm": 1.9430080081113947, + "learning_rate": 1.6586065656880442e-05, + "loss": 0.8191, + "step": 3561 + }, + { + "epoch": 0.2931084139066036, + "grad_norm": 0.4261739322393414, + "learning_rate": 1.6584059815857917e-05, + "loss": 0.5266, + "step": 3562 + }, + { + "epoch": 0.2931907015017486, + "grad_norm": 2.0059153612762817, + "learning_rate": 1.658205350711431e-05, + "loss": 0.7954, + "step": 3563 + }, + { + "epoch": 0.29327298909689364, + "grad_norm": 1.992803682953668, + "learning_rate": 1.658004673079215e-05, + "loss": 0.8117, + "step": 3564 + }, + { + "epoch": 0.2933552766920387, + "grad_norm": 1.9218001671195901, + "learning_rate": 1.657803948703399e-05, + "loss": 0.8139, + "step": 3565 + }, + { + "epoch": 0.2934375642871837, + "grad_norm": 2.202882243209258, + "learning_rate": 1.6576031775982428e-05, + "loss": 0.7937, + "step": 3566 + }, + { + "epoch": 0.29351985188232876, + "grad_norm": 1.6770391115358514, + "learning_rate": 1.6574023597780086e-05, + "loss": 0.8279, + "step": 3567 + }, + { + "epoch": 0.29360213947747377, + "grad_norm": 1.8078657382158536, + "learning_rate": 1.6572014952569622e-05, + "loss": 0.8006, + "step": 3568 + }, + { + "epoch": 0.2936844270726188, + "grad_norm": 2.1096069549377243, + "learning_rate": 1.6570005840493723e-05, + "loss": 0.7904, + "step": 3569 + }, + { + "epoch": 0.29376671466776383, + "grad_norm": 0.42460506300564094, + "learning_rate": 1.656799626169512e-05, + "loss": 0.5501, + "step": 3570 + }, + { + "epoch": 0.2938490022629089, + "grad_norm": 2.0963250119352423, + "learning_rate": 1.6565986216316564e-05, + "loss": 0.8106, + "step": 3571 + }, + { + "epoch": 0.2939312898580539, + "grad_norm": 2.0656139084436598, + "learning_rate": 1.6563975704500847e-05, + "loss": 0.8339, + "step": 3572 + }, + { + "epoch": 0.29401357745319895, + "grad_norm": 2.500541500884672, + "learning_rate": 1.6561964726390797e-05, + "loss": 0.8418, + "step": 3573 + }, + { + "epoch": 0.29409586504834395, + "grad_norm": 1.8127364690862025, + "learning_rate": 1.6559953282129262e-05, + "loss": 0.8124, + "step": 3574 + }, + { + "epoch": 0.294178152643489, + "grad_norm": 2.0207547834774857, + "learning_rate": 1.655794137185914e-05, + "loss": 0.807, + "step": 3575 + }, + { + "epoch": 0.294260440238634, + "grad_norm": 0.45181317862494913, + "learning_rate": 1.655592899572335e-05, + "loss": 0.5462, + "step": 3576 + }, + { + "epoch": 0.29434272783377907, + "grad_norm": 1.9261816815698125, + "learning_rate": 1.655391615386485e-05, + "loss": 0.8061, + "step": 3577 + }, + { + "epoch": 0.2944250154289241, + "grad_norm": 1.787594653223662, + "learning_rate": 1.6551902846426626e-05, + "loss": 0.8171, + "step": 3578 + }, + { + "epoch": 0.29450730302406913, + "grad_norm": 2.2310815549232825, + "learning_rate": 1.6549889073551705e-05, + "loss": 0.8099, + "step": 3579 + }, + { + "epoch": 0.29458959061921414, + "grad_norm": 0.4485931026862, + "learning_rate": 1.6547874835383137e-05, + "loss": 0.5532, + "step": 3580 + }, + { + "epoch": 0.2946718782143592, + "grad_norm": 0.4301956132003114, + "learning_rate": 1.6545860132064015e-05, + "loss": 0.5388, + "step": 3581 + }, + { + "epoch": 0.2947541658095042, + "grad_norm": 4.830932316927706, + "learning_rate": 1.6543844963737454e-05, + "loss": 0.7782, + "step": 3582 + }, + { + "epoch": 0.29483645340464926, + "grad_norm": 1.8174347345887978, + "learning_rate": 1.6541829330546616e-05, + "loss": 0.801, + "step": 3583 + }, + { + "epoch": 0.29491874099979426, + "grad_norm": 1.6929817178799809, + "learning_rate": 1.653981323263468e-05, + "loss": 0.7876, + "step": 3584 + }, + { + "epoch": 0.2950010285949393, + "grad_norm": 1.958737548661962, + "learning_rate": 1.6537796670144873e-05, + "loss": 0.7925, + "step": 3585 + }, + { + "epoch": 0.2950833161900843, + "grad_norm": 1.7981141842359325, + "learning_rate": 1.653577964322045e-05, + "loss": 0.817, + "step": 3586 + }, + { + "epoch": 0.2951656037852294, + "grad_norm": 1.8185287740526173, + "learning_rate": 1.6533762152004687e-05, + "loss": 0.7807, + "step": 3587 + }, + { + "epoch": 0.2952478913803744, + "grad_norm": 0.4537506028450974, + "learning_rate": 1.6531744196640915e-05, + "loss": 0.5344, + "step": 3588 + }, + { + "epoch": 0.29533017897551944, + "grad_norm": 0.44577322939836633, + "learning_rate": 1.6529725777272476e-05, + "loss": 0.5653, + "step": 3589 + }, + { + "epoch": 0.29541246657066444, + "grad_norm": 2.0387141776373214, + "learning_rate": 1.6527706894042765e-05, + "loss": 0.8222, + "step": 3590 + }, + { + "epoch": 0.2954947541658095, + "grad_norm": 0.438508063126075, + "learning_rate": 1.6525687547095194e-05, + "loss": 0.5543, + "step": 3591 + }, + { + "epoch": 0.29557704176095456, + "grad_norm": 1.3420787245609647, + "learning_rate": 1.6523667736573216e-05, + "loss": 0.7896, + "step": 3592 + }, + { + "epoch": 0.29565932935609957, + "grad_norm": 1.4944961177832303, + "learning_rate": 1.652164746262032e-05, + "loss": 0.7956, + "step": 3593 + }, + { + "epoch": 0.2957416169512446, + "grad_norm": 2.067155769754942, + "learning_rate": 1.651962672538001e-05, + "loss": 0.8027, + "step": 3594 + }, + { + "epoch": 0.2958239045463896, + "grad_norm": 0.4880661651769194, + "learning_rate": 1.651760552499585e-05, + "loss": 0.575, + "step": 3595 + }, + { + "epoch": 0.2959061921415347, + "grad_norm": 1.549996096043789, + "learning_rate": 1.6515583861611413e-05, + "loss": 0.824, + "step": 3596 + }, + { + "epoch": 0.2959884797366797, + "grad_norm": 1.4569372189154544, + "learning_rate": 1.651356173537032e-05, + "loss": 0.8063, + "step": 3597 + }, + { + "epoch": 0.29607076733182475, + "grad_norm": 2.1045506375221876, + "learning_rate": 1.6511539146416217e-05, + "loss": 0.8321, + "step": 3598 + }, + { + "epoch": 0.29615305492696975, + "grad_norm": 1.603101180312977, + "learning_rate": 1.6509516094892788e-05, + "loss": 0.7873, + "step": 3599 + }, + { + "epoch": 0.2962353425221148, + "grad_norm": 1.5464961268220638, + "learning_rate": 1.6507492580943746e-05, + "loss": 0.7793, + "step": 3600 + }, + { + "epoch": 0.2963176301172598, + "grad_norm": 1.4547001761588105, + "learning_rate": 1.650546860471284e-05, + "loss": 0.8206, + "step": 3601 + }, + { + "epoch": 0.29639991771240487, + "grad_norm": 1.5156790874536277, + "learning_rate": 1.6503444166343846e-05, + "loss": 0.807, + "step": 3602 + }, + { + "epoch": 0.2964822053075499, + "grad_norm": 1.6607693253933526, + "learning_rate": 1.650141926598058e-05, + "loss": 0.818, + "step": 3603 + }, + { + "epoch": 0.29656449290269493, + "grad_norm": 0.45200961268789075, + "learning_rate": 1.6499393903766886e-05, + "loss": 0.5318, + "step": 3604 + }, + { + "epoch": 0.29664678049783993, + "grad_norm": 0.4281520155308469, + "learning_rate": 1.6497368079846646e-05, + "loss": 0.51, + "step": 3605 + }, + { + "epoch": 0.296729068092985, + "grad_norm": 1.4844329190620529, + "learning_rate": 1.6495341794363768e-05, + "loss": 0.8147, + "step": 3606 + }, + { + "epoch": 0.29681135568813, + "grad_norm": 1.4468093740635921, + "learning_rate": 1.649331504746219e-05, + "loss": 0.8401, + "step": 3607 + }, + { + "epoch": 0.29689364328327505, + "grad_norm": 7.349607156958685, + "learning_rate": 1.6491287839285903e-05, + "loss": 0.8087, + "step": 3608 + }, + { + "epoch": 0.29697593087842006, + "grad_norm": 0.48797003096312785, + "learning_rate": 1.6489260169978908e-05, + "loss": 0.5222, + "step": 3609 + }, + { + "epoch": 0.2970582184735651, + "grad_norm": 1.4909203688886028, + "learning_rate": 1.6487232039685246e-05, + "loss": 0.8239, + "step": 3610 + }, + { + "epoch": 0.2971405060687101, + "grad_norm": 1.45832375397737, + "learning_rate": 1.6485203448548995e-05, + "loss": 0.8116, + "step": 3611 + }, + { + "epoch": 0.2972227936638552, + "grad_norm": 1.451419169359552, + "learning_rate": 1.6483174396714265e-05, + "loss": 0.8069, + "step": 3612 + }, + { + "epoch": 0.2973050812590002, + "grad_norm": 0.5008600469907893, + "learning_rate": 1.6481144884325193e-05, + "loss": 0.5822, + "step": 3613 + }, + { + "epoch": 0.29738736885414524, + "grad_norm": 0.45991870974867094, + "learning_rate": 1.6479114911525952e-05, + "loss": 0.5504, + "step": 3614 + }, + { + "epoch": 0.29746965644929024, + "grad_norm": 1.9837374203990883, + "learning_rate": 1.647708447846075e-05, + "loss": 0.8216, + "step": 3615 + }, + { + "epoch": 0.2975519440444353, + "grad_norm": 1.4163826272645652, + "learning_rate": 1.647505358527383e-05, + "loss": 0.812, + "step": 3616 + }, + { + "epoch": 0.29763423163958036, + "grad_norm": 2.0305024301193457, + "learning_rate": 1.6473022232109453e-05, + "loss": 0.8104, + "step": 3617 + }, + { + "epoch": 0.29771651923472536, + "grad_norm": 1.5169338642840535, + "learning_rate": 1.647099041911193e-05, + "loss": 0.805, + "step": 3618 + }, + { + "epoch": 0.2977988068298704, + "grad_norm": 1.7748947218775537, + "learning_rate": 1.64689581464256e-05, + "loss": 0.8281, + "step": 3619 + }, + { + "epoch": 0.2978810944250154, + "grad_norm": 0.5102719187792992, + "learning_rate": 1.6466925414194827e-05, + "loss": 0.5624, + "step": 3620 + }, + { + "epoch": 0.2979633820201605, + "grad_norm": 1.5103048053312222, + "learning_rate": 1.646489222256401e-05, + "loss": 0.8261, + "step": 3621 + }, + { + "epoch": 0.2980456696153055, + "grad_norm": 2.285924143020317, + "learning_rate": 1.6462858571677593e-05, + "loss": 0.7961, + "step": 3622 + }, + { + "epoch": 0.29812795721045054, + "grad_norm": 1.5727073825492892, + "learning_rate": 1.6460824461680037e-05, + "loss": 0.8123, + "step": 3623 + }, + { + "epoch": 0.29821024480559555, + "grad_norm": 1.5823667385972964, + "learning_rate": 1.6458789892715845e-05, + "loss": 0.8246, + "step": 3624 + }, + { + "epoch": 0.2982925324007406, + "grad_norm": 0.44216108395055426, + "learning_rate": 1.645675486492955e-05, + "loss": 0.5517, + "step": 3625 + }, + { + "epoch": 0.2983748199958856, + "grad_norm": 0.4443349389522302, + "learning_rate": 1.6454719378465714e-05, + "loss": 0.5221, + "step": 3626 + }, + { + "epoch": 0.29845710759103067, + "grad_norm": 1.7290493167311836, + "learning_rate": 1.6452683433468934e-05, + "loss": 0.7989, + "step": 3627 + }, + { + "epoch": 0.29853939518617567, + "grad_norm": 0.43576759443993884, + "learning_rate": 1.6450647030083845e-05, + "loss": 0.5295, + "step": 3628 + }, + { + "epoch": 0.29862168278132073, + "grad_norm": 1.4800601153777706, + "learning_rate": 1.6448610168455105e-05, + "loss": 0.8092, + "step": 3629 + }, + { + "epoch": 0.29870397037646573, + "grad_norm": 0.4482543666374695, + "learning_rate": 1.6446572848727416e-05, + "loss": 0.5222, + "step": 3630 + }, + { + "epoch": 0.2987862579716108, + "grad_norm": 1.9043113521649464, + "learning_rate": 1.64445350710455e-05, + "loss": 0.7889, + "step": 3631 + }, + { + "epoch": 0.2988685455667558, + "grad_norm": 0.47399637243483855, + "learning_rate": 1.6442496835554112e-05, + "loss": 0.5404, + "step": 3632 + }, + { + "epoch": 0.29895083316190085, + "grad_norm": 1.694965329913979, + "learning_rate": 1.644045814239806e-05, + "loss": 0.7979, + "step": 3633 + }, + { + "epoch": 0.29903312075704586, + "grad_norm": 1.7030898842474096, + "learning_rate": 1.643841899172216e-05, + "loss": 0.7616, + "step": 3634 + }, + { + "epoch": 0.2991154083521909, + "grad_norm": 1.711673123185248, + "learning_rate": 1.643637938367127e-05, + "loss": 0.7985, + "step": 3635 + }, + { + "epoch": 0.2991976959473359, + "grad_norm": 1.4704819630463015, + "learning_rate": 1.6434339318390286e-05, + "loss": 0.8161, + "step": 3636 + }, + { + "epoch": 0.299279983542481, + "grad_norm": 1.5446355311555044, + "learning_rate": 1.643229879602412e-05, + "loss": 0.8116, + "step": 3637 + }, + { + "epoch": 0.299362271137626, + "grad_norm": 1.571432630304453, + "learning_rate": 1.6430257816717743e-05, + "loss": 0.8019, + "step": 3638 + }, + { + "epoch": 0.29944455873277104, + "grad_norm": 0.4422729175523287, + "learning_rate": 1.642821638061613e-05, + "loss": 0.5291, + "step": 3639 + }, + { + "epoch": 0.29952684632791604, + "grad_norm": 1.8673988008895486, + "learning_rate": 1.6426174487864304e-05, + "loss": 0.8319, + "step": 3640 + }, + { + "epoch": 0.2996091339230611, + "grad_norm": 1.6221967539246775, + "learning_rate": 1.642413213860732e-05, + "loss": 0.8057, + "step": 3641 + }, + { + "epoch": 0.29969142151820616, + "grad_norm": 1.5028216429220547, + "learning_rate": 1.6422089332990264e-05, + "loss": 0.8044, + "step": 3642 + }, + { + "epoch": 0.29977370911335116, + "grad_norm": 1.6762577400321679, + "learning_rate": 1.6420046071158253e-05, + "loss": 0.8024, + "step": 3643 + }, + { + "epoch": 0.2998559967084962, + "grad_norm": 1.699525059905768, + "learning_rate": 1.6418002353256436e-05, + "loss": 0.8158, + "step": 3644 + }, + { + "epoch": 0.2999382843036412, + "grad_norm": 1.7940824039037504, + "learning_rate": 1.6415958179429996e-05, + "loss": 0.8126, + "step": 3645 + }, + { + "epoch": 0.3000205718987863, + "grad_norm": 1.7449849959524295, + "learning_rate": 1.6413913549824147e-05, + "loss": 0.8142, + "step": 3646 + }, + { + "epoch": 0.3001028594939313, + "grad_norm": 0.43320109979126875, + "learning_rate": 1.641186846458414e-05, + "loss": 0.5415, + "step": 3647 + }, + { + "epoch": 0.30018514708907634, + "grad_norm": 2.2913328597836684, + "learning_rate": 1.6409822923855248e-05, + "loss": 0.7865, + "step": 3648 + }, + { + "epoch": 0.30026743468422135, + "grad_norm": 1.8618971692247046, + "learning_rate": 1.6407776927782787e-05, + "loss": 0.8344, + "step": 3649 + }, + { + "epoch": 0.3003497222793664, + "grad_norm": 1.8089576166987351, + "learning_rate": 1.64057304765121e-05, + "loss": 0.8059, + "step": 3650 + }, + { + "epoch": 0.3004320098745114, + "grad_norm": 0.4394608719087332, + "learning_rate": 1.6403683570188567e-05, + "loss": 0.5263, + "step": 3651 + }, + { + "epoch": 0.30051429746965647, + "grad_norm": 2.4289340317821773, + "learning_rate": 1.640163620895759e-05, + "loss": 0.796, + "step": 3652 + }, + { + "epoch": 0.30059658506480147, + "grad_norm": 1.625348899423735, + "learning_rate": 1.639958839296462e-05, + "loss": 0.7909, + "step": 3653 + }, + { + "epoch": 0.30067887265994653, + "grad_norm": 1.5166507240089266, + "learning_rate": 1.6397540122355122e-05, + "loss": 0.8002, + "step": 3654 + }, + { + "epoch": 0.30076116025509153, + "grad_norm": 1.4184455757549488, + "learning_rate": 1.6395491397274608e-05, + "loss": 0.8222, + "step": 3655 + }, + { + "epoch": 0.3008434478502366, + "grad_norm": 0.4327869707190793, + "learning_rate": 1.639344221786861e-05, + "loss": 0.5416, + "step": 3656 + }, + { + "epoch": 0.3009257354453816, + "grad_norm": 1.4562376716327414, + "learning_rate": 1.6391392584282705e-05, + "loss": 0.8151, + "step": 3657 + }, + { + "epoch": 0.30100802304052665, + "grad_norm": 0.438335227303146, + "learning_rate": 1.638934249666249e-05, + "loss": 0.5366, + "step": 3658 + }, + { + "epoch": 0.30109031063567165, + "grad_norm": 1.5033019033013582, + "learning_rate": 1.6387291955153603e-05, + "loss": 0.7965, + "step": 3659 + }, + { + "epoch": 0.3011725982308167, + "grad_norm": 1.5225162562640404, + "learning_rate": 1.638524095990171e-05, + "loss": 0.8058, + "step": 3660 + }, + { + "epoch": 0.3012548858259617, + "grad_norm": 1.5016565914911202, + "learning_rate": 1.6383189511052507e-05, + "loss": 0.8118, + "step": 3661 + }, + { + "epoch": 0.3013371734211068, + "grad_norm": 1.4769933744123498, + "learning_rate": 1.6381137608751733e-05, + "loss": 0.8256, + "step": 3662 + }, + { + "epoch": 0.3014194610162518, + "grad_norm": 1.3113247016849063, + "learning_rate": 1.637908525314515e-05, + "loss": 0.7982, + "step": 3663 + }, + { + "epoch": 0.30150174861139684, + "grad_norm": 0.45340394266460904, + "learning_rate": 1.637703244437855e-05, + "loss": 0.5557, + "step": 3664 + }, + { + "epoch": 0.30158403620654184, + "grad_norm": 1.522511390412077, + "learning_rate": 1.6374979182597766e-05, + "loss": 0.8411, + "step": 3665 + }, + { + "epoch": 0.3016663238016869, + "grad_norm": 0.4256116592130053, + "learning_rate": 1.6372925467948656e-05, + "loss": 0.5497, + "step": 3666 + }, + { + "epoch": 0.3017486113968319, + "grad_norm": 1.3973244380801353, + "learning_rate": 1.6370871300577112e-05, + "loss": 0.8016, + "step": 3667 + }, + { + "epoch": 0.30183089899197696, + "grad_norm": 0.42105939597568615, + "learning_rate": 1.6368816680629058e-05, + "loss": 0.5377, + "step": 3668 + }, + { + "epoch": 0.301913186587122, + "grad_norm": 1.622145350120707, + "learning_rate": 1.6366761608250453e-05, + "loss": 0.8198, + "step": 3669 + }, + { + "epoch": 0.301995474182267, + "grad_norm": 1.4773642812742485, + "learning_rate": 1.6364706083587287e-05, + "loss": 0.7939, + "step": 3670 + }, + { + "epoch": 0.3020777617774121, + "grad_norm": 1.4255914313834837, + "learning_rate": 1.6362650106785577e-05, + "loss": 0.8244, + "step": 3671 + }, + { + "epoch": 0.3021600493725571, + "grad_norm": 1.8726203492671103, + "learning_rate": 1.6360593677991383e-05, + "loss": 0.8347, + "step": 3672 + }, + { + "epoch": 0.30224233696770214, + "grad_norm": 1.4678091772662945, + "learning_rate": 1.6358536797350783e-05, + "loss": 0.7806, + "step": 3673 + }, + { + "epoch": 0.30232462456284714, + "grad_norm": 0.43833813379823816, + "learning_rate": 1.6356479465009898e-05, + "loss": 0.5562, + "step": 3674 + }, + { + "epoch": 0.3024069121579922, + "grad_norm": 1.292202911636895, + "learning_rate": 1.635442168111488e-05, + "loss": 0.8095, + "step": 3675 + }, + { + "epoch": 0.3024891997531372, + "grad_norm": 1.3720240213000705, + "learning_rate": 1.6352363445811907e-05, + "loss": 0.8064, + "step": 3676 + }, + { + "epoch": 0.30257148734828226, + "grad_norm": 1.828225512415719, + "learning_rate": 1.6350304759247194e-05, + "loss": 0.8045, + "step": 3677 + }, + { + "epoch": 0.30265377494342727, + "grad_norm": 1.6394823344470917, + "learning_rate": 1.6348245621566987e-05, + "loss": 0.8236, + "step": 3678 + }, + { + "epoch": 0.3027360625385723, + "grad_norm": 0.4391593547423728, + "learning_rate": 1.634618603291756e-05, + "loss": 0.5518, + "step": 3679 + }, + { + "epoch": 0.30281835013371733, + "grad_norm": 1.4520181940736636, + "learning_rate": 1.634412599344523e-05, + "loss": 0.809, + "step": 3680 + }, + { + "epoch": 0.3029006377288624, + "grad_norm": 1.4174240110221976, + "learning_rate": 1.6342065503296333e-05, + "loss": 0.7964, + "step": 3681 + }, + { + "epoch": 0.3029829253240074, + "grad_norm": 1.4677308594644047, + "learning_rate": 1.6340004562617248e-05, + "loss": 0.791, + "step": 3682 + }, + { + "epoch": 0.30306521291915245, + "grad_norm": 1.4786293669077117, + "learning_rate": 1.633794317155438e-05, + "loss": 0.8026, + "step": 3683 + }, + { + "epoch": 0.30314750051429745, + "grad_norm": 1.8994862403985697, + "learning_rate": 1.633588133025416e-05, + "loss": 0.8178, + "step": 3684 + }, + { + "epoch": 0.3032297881094425, + "grad_norm": 1.5848395704359917, + "learning_rate": 1.633381903886307e-05, + "loss": 0.8295, + "step": 3685 + }, + { + "epoch": 0.3033120757045875, + "grad_norm": 1.7457959954236808, + "learning_rate": 1.6331756297527595e-05, + "loss": 0.8141, + "step": 3686 + }, + { + "epoch": 0.3033943632997326, + "grad_norm": 1.953308606724481, + "learning_rate": 1.6329693106394285e-05, + "loss": 0.7854, + "step": 3687 + }, + { + "epoch": 0.3034766508948776, + "grad_norm": 3.1688517850914857, + "learning_rate": 1.6327629465609697e-05, + "loss": 0.819, + "step": 3688 + }, + { + "epoch": 0.30355893849002263, + "grad_norm": 0.4226930045061616, + "learning_rate": 1.6325565375320437e-05, + "loss": 0.5135, + "step": 3689 + }, + { + "epoch": 0.30364122608516764, + "grad_norm": 1.9921789316023617, + "learning_rate": 1.632350083567312e-05, + "loss": 0.841, + "step": 3690 + }, + { + "epoch": 0.3037235136803127, + "grad_norm": 2.144021981352117, + "learning_rate": 1.6321435846814425e-05, + "loss": 0.7732, + "step": 3691 + }, + { + "epoch": 0.3038058012754577, + "grad_norm": 2.1150059213115098, + "learning_rate": 1.6319370408891033e-05, + "loss": 0.7836, + "step": 3692 + }, + { + "epoch": 0.30388808887060276, + "grad_norm": 0.4225605763671702, + "learning_rate": 1.6317304522049676e-05, + "loss": 0.5257, + "step": 3693 + }, + { + "epoch": 0.3039703764657478, + "grad_norm": 1.8297556296284003, + "learning_rate": 1.6315238186437105e-05, + "loss": 0.7853, + "step": 3694 + }, + { + "epoch": 0.3040526640608928, + "grad_norm": 1.834168255019643, + "learning_rate": 1.6313171402200113e-05, + "loss": 0.7548, + "step": 3695 + }, + { + "epoch": 0.3041349516560379, + "grad_norm": 3.037930138675158, + "learning_rate": 1.6311104169485524e-05, + "loss": 0.8082, + "step": 3696 + }, + { + "epoch": 0.3042172392511829, + "grad_norm": 2.0655840771497047, + "learning_rate": 1.6309036488440188e-05, + "loss": 0.7983, + "step": 3697 + }, + { + "epoch": 0.30429952684632794, + "grad_norm": 1.633093211202821, + "learning_rate": 1.630696835921099e-05, + "loss": 0.8161, + "step": 3698 + }, + { + "epoch": 0.30438181444147294, + "grad_norm": 1.691470651596845, + "learning_rate": 1.6304899781944843e-05, + "loss": 0.8128, + "step": 3699 + }, + { + "epoch": 0.304464102036618, + "grad_norm": 2.0532750008623837, + "learning_rate": 1.63028307567887e-05, + "loss": 0.7754, + "step": 3700 + }, + { + "epoch": 0.304546389631763, + "grad_norm": 1.9264864628222687, + "learning_rate": 1.630076128388954e-05, + "loss": 0.8237, + "step": 3701 + }, + { + "epoch": 0.30462867722690806, + "grad_norm": 1.7260987270858321, + "learning_rate": 1.6298691363394376e-05, + "loss": 0.8195, + "step": 3702 + }, + { + "epoch": 0.30471096482205307, + "grad_norm": 1.5578352879677724, + "learning_rate": 1.629662099545025e-05, + "loss": 0.7841, + "step": 3703 + }, + { + "epoch": 0.3047932524171981, + "grad_norm": 2.2106924467540554, + "learning_rate": 1.6294550180204238e-05, + "loss": 0.8069, + "step": 3704 + }, + { + "epoch": 0.3048755400123431, + "grad_norm": 1.8921781975692649, + "learning_rate": 1.6292478917803448e-05, + "loss": 0.7939, + "step": 3705 + }, + { + "epoch": 0.3049578276074882, + "grad_norm": 1.673642078076112, + "learning_rate": 1.629040720839502e-05, + "loss": 0.8063, + "step": 3706 + }, + { + "epoch": 0.3050401152026332, + "grad_norm": 1.8564314006699936, + "learning_rate": 1.6288335052126127e-05, + "loss": 0.8101, + "step": 3707 + }, + { + "epoch": 0.30512240279777825, + "grad_norm": 0.4393600634993367, + "learning_rate": 1.628626244914396e-05, + "loss": 0.5327, + "step": 3708 + }, + { + "epoch": 0.30520469039292325, + "grad_norm": 1.5957945769842101, + "learning_rate": 1.6284189399595767e-05, + "loss": 0.7861, + "step": 3709 + }, + { + "epoch": 0.3052869779880683, + "grad_norm": 1.7141665761524336, + "learning_rate": 1.628211590362881e-05, + "loss": 0.7668, + "step": 3710 + }, + { + "epoch": 0.3053692655832133, + "grad_norm": 1.7253634739463022, + "learning_rate": 1.6280041961390387e-05, + "loss": 0.8165, + "step": 3711 + }, + { + "epoch": 0.30545155317835837, + "grad_norm": 2.218877930057157, + "learning_rate": 1.6277967573027823e-05, + "loss": 0.7945, + "step": 3712 + }, + { + "epoch": 0.3055338407735034, + "grad_norm": 0.4341320111159625, + "learning_rate": 1.6275892738688484e-05, + "loss": 0.5605, + "step": 3713 + }, + { + "epoch": 0.30561612836864843, + "grad_norm": 0.43930733101248265, + "learning_rate": 1.6273817458519764e-05, + "loss": 0.5232, + "step": 3714 + }, + { + "epoch": 0.30569841596379344, + "grad_norm": 2.0033466027439313, + "learning_rate": 1.627174173266908e-05, + "loss": 0.8007, + "step": 3715 + }, + { + "epoch": 0.3057807035589385, + "grad_norm": 1.758130594597491, + "learning_rate": 1.6269665561283898e-05, + "loss": 0.808, + "step": 3716 + }, + { + "epoch": 0.3058629911540835, + "grad_norm": 2.083476995210029, + "learning_rate": 1.62675889445117e-05, + "loss": 0.7918, + "step": 3717 + }, + { + "epoch": 0.30594527874922856, + "grad_norm": 0.4606851012547155, + "learning_rate": 1.6265511882500008e-05, + "loss": 0.5494, + "step": 3718 + }, + { + "epoch": 0.30602756634437356, + "grad_norm": 1.9313786148077983, + "learning_rate": 1.626343437539637e-05, + "loss": 0.8238, + "step": 3719 + }, + { + "epoch": 0.3061098539395186, + "grad_norm": 1.8681515209882509, + "learning_rate": 1.626135642334837e-05, + "loss": 0.7808, + "step": 3720 + }, + { + "epoch": 0.3061921415346637, + "grad_norm": 2.03958255232041, + "learning_rate": 1.6259278026503625e-05, + "loss": 0.8132, + "step": 3721 + }, + { + "epoch": 0.3062744291298087, + "grad_norm": 2.180998620356756, + "learning_rate": 1.625719918500978e-05, + "loss": 0.7985, + "step": 3722 + }, + { + "epoch": 0.30635671672495374, + "grad_norm": 2.1147018237252615, + "learning_rate": 1.6255119899014514e-05, + "loss": 0.8145, + "step": 3723 + }, + { + "epoch": 0.30643900432009874, + "grad_norm": 2.2660656384183224, + "learning_rate": 1.625304016866553e-05, + "loss": 0.8245, + "step": 3724 + }, + { + "epoch": 0.3065212919152438, + "grad_norm": 2.0843798415517343, + "learning_rate": 1.6250959994110575e-05, + "loss": 0.8288, + "step": 3725 + }, + { + "epoch": 0.3066035795103888, + "grad_norm": 2.433152350777801, + "learning_rate": 1.6248879375497416e-05, + "loss": 0.8016, + "step": 3726 + }, + { + "epoch": 0.30668586710553386, + "grad_norm": 1.8635624104923694, + "learning_rate": 1.6246798312973862e-05, + "loss": 0.8302, + "step": 3727 + }, + { + "epoch": 0.30676815470067886, + "grad_norm": 1.9733434384603887, + "learning_rate": 1.6244716806687746e-05, + "loss": 0.8362, + "step": 3728 + }, + { + "epoch": 0.3068504422958239, + "grad_norm": 0.4827561540472388, + "learning_rate": 1.6242634856786934e-05, + "loss": 0.5584, + "step": 3729 + }, + { + "epoch": 0.3069327298909689, + "grad_norm": 2.0026515293908065, + "learning_rate": 1.6240552463419325e-05, + "loss": 0.8013, + "step": 3730 + }, + { + "epoch": 0.307015017486114, + "grad_norm": 2.421089896619913, + "learning_rate": 1.623846962673285e-05, + "loss": 0.7854, + "step": 3731 + }, + { + "epoch": 0.307097305081259, + "grad_norm": 2.004747768730618, + "learning_rate": 1.6236386346875473e-05, + "loss": 0.796, + "step": 3732 + }, + { + "epoch": 0.30717959267640405, + "grad_norm": 1.8191195652060745, + "learning_rate": 1.623430262399518e-05, + "loss": 0.7972, + "step": 3733 + }, + { + "epoch": 0.30726188027154905, + "grad_norm": 0.43757093610523434, + "learning_rate": 1.623221845824e-05, + "loss": 0.5311, + "step": 3734 + }, + { + "epoch": 0.3073441678666941, + "grad_norm": 2.240566483832002, + "learning_rate": 1.6230133849757984e-05, + "loss": 0.7828, + "step": 3735 + }, + { + "epoch": 0.3074264554618391, + "grad_norm": 2.1385825153781566, + "learning_rate": 1.6228048798697228e-05, + "loss": 0.7757, + "step": 3736 + }, + { + "epoch": 0.30750874305698417, + "grad_norm": 2.127148299389216, + "learning_rate": 1.6225963305205845e-05, + "loss": 0.8251, + "step": 3737 + }, + { + "epoch": 0.30759103065212917, + "grad_norm": 2.120414731679044, + "learning_rate": 1.6223877369431983e-05, + "loss": 0.8012, + "step": 3738 + }, + { + "epoch": 0.30767331824727423, + "grad_norm": 1.9228746790037947, + "learning_rate": 1.622179099152383e-05, + "loss": 0.7766, + "step": 3739 + }, + { + "epoch": 0.30775560584241923, + "grad_norm": 5.699396120222066, + "learning_rate": 1.621970417162959e-05, + "loss": 0.8022, + "step": 3740 + }, + { + "epoch": 0.3078378934375643, + "grad_norm": 0.4309867200604675, + "learning_rate": 1.6217616909897516e-05, + "loss": 0.5309, + "step": 3741 + }, + { + "epoch": 0.3079201810327093, + "grad_norm": 1.9015914971003123, + "learning_rate": 1.621552920647588e-05, + "loss": 0.8028, + "step": 3742 + }, + { + "epoch": 0.30800246862785435, + "grad_norm": 1.9000230700954626, + "learning_rate": 1.621344106151299e-05, + "loss": 0.7936, + "step": 3743 + }, + { + "epoch": 0.30808475622299936, + "grad_norm": 0.4426055699337771, + "learning_rate": 1.6211352475157183e-05, + "loss": 0.5109, + "step": 3744 + }, + { + "epoch": 0.3081670438181444, + "grad_norm": 2.2152525796595812, + "learning_rate": 1.620926344755683e-05, + "loss": 0.8065, + "step": 3745 + }, + { + "epoch": 0.3082493314132895, + "grad_norm": 0.4481125883668462, + "learning_rate": 1.620717397886033e-05, + "loss": 0.5508, + "step": 3746 + }, + { + "epoch": 0.3083316190084345, + "grad_norm": 2.3399247026612757, + "learning_rate": 1.6205084069216122e-05, + "loss": 0.796, + "step": 3747 + }, + { + "epoch": 0.30841390660357954, + "grad_norm": 1.8345421801615338, + "learning_rate": 1.6202993718772662e-05, + "loss": 0.801, + "step": 3748 + }, + { + "epoch": 0.30849619419872454, + "grad_norm": 1.7932846729941831, + "learning_rate": 1.6200902927678447e-05, + "loss": 0.7722, + "step": 3749 + }, + { + "epoch": 0.3085784817938696, + "grad_norm": 1.687207271881378, + "learning_rate": 1.6198811696082008e-05, + "loss": 0.7969, + "step": 3750 + }, + { + "epoch": 0.3086607693890146, + "grad_norm": 1.8714625068204986, + "learning_rate": 1.61967200241319e-05, + "loss": 0.7993, + "step": 3751 + }, + { + "epoch": 0.30874305698415966, + "grad_norm": 1.7409895731594691, + "learning_rate": 1.619462791197671e-05, + "loss": 0.7805, + "step": 3752 + }, + { + "epoch": 0.30882534457930466, + "grad_norm": 1.739045393950046, + "learning_rate": 1.619253535976506e-05, + "loss": 0.7832, + "step": 3753 + }, + { + "epoch": 0.3089076321744497, + "grad_norm": 0.44313475687242815, + "learning_rate": 1.6190442367645603e-05, + "loss": 0.5453, + "step": 3754 + }, + { + "epoch": 0.3089899197695947, + "grad_norm": 1.6893521002681389, + "learning_rate": 1.6188348935767018e-05, + "loss": 0.7769, + "step": 3755 + }, + { + "epoch": 0.3090722073647398, + "grad_norm": 2.308498242005326, + "learning_rate": 1.618625506427802e-05, + "loss": 0.7868, + "step": 3756 + }, + { + "epoch": 0.3091544949598848, + "grad_norm": 1.8145631519769951, + "learning_rate": 1.618416075332736e-05, + "loss": 0.7977, + "step": 3757 + }, + { + "epoch": 0.30923678255502984, + "grad_norm": 1.7295348876425896, + "learning_rate": 1.618206600306381e-05, + "loss": 0.798, + "step": 3758 + }, + { + "epoch": 0.30931907015017485, + "grad_norm": 2.0147213977428864, + "learning_rate": 1.6179970813636175e-05, + "loss": 0.7525, + "step": 3759 + }, + { + "epoch": 0.3094013577453199, + "grad_norm": 0.4449268535613779, + "learning_rate": 1.61778751851933e-05, + "loss": 0.5563, + "step": 3760 + }, + { + "epoch": 0.3094836453404649, + "grad_norm": 0.40231439760617077, + "learning_rate": 1.6175779117884046e-05, + "loss": 0.5245, + "step": 3761 + }, + { + "epoch": 0.30956593293560997, + "grad_norm": 1.7766369949909324, + "learning_rate": 1.6173682611857327e-05, + "loss": 0.8161, + "step": 3762 + }, + { + "epoch": 0.30964822053075497, + "grad_norm": 1.5194083385934598, + "learning_rate": 1.6171585667262068e-05, + "loss": 0.7975, + "step": 3763 + }, + { + "epoch": 0.30973050812590003, + "grad_norm": 0.4391010126701544, + "learning_rate": 1.6169488284247227e-05, + "loss": 0.5278, + "step": 3764 + }, + { + "epoch": 0.30981279572104503, + "grad_norm": 1.9057323951157967, + "learning_rate": 1.6167390462961812e-05, + "loss": 0.7963, + "step": 3765 + }, + { + "epoch": 0.3098950833161901, + "grad_norm": 1.496900484594616, + "learning_rate": 1.6165292203554835e-05, + "loss": 0.8146, + "step": 3766 + }, + { + "epoch": 0.3099773709113351, + "grad_norm": 1.3203974600978459, + "learning_rate": 1.6163193506175365e-05, + "loss": 0.8, + "step": 3767 + }, + { + "epoch": 0.31005965850648015, + "grad_norm": 1.4659118769716732, + "learning_rate": 1.6161094370972486e-05, + "loss": 0.8395, + "step": 3768 + }, + { + "epoch": 0.31014194610162515, + "grad_norm": 1.6799103341614017, + "learning_rate": 1.615899479809531e-05, + "loss": 0.7951, + "step": 3769 + }, + { + "epoch": 0.3102242336967702, + "grad_norm": 0.42356524852903504, + "learning_rate": 1.6156894787693002e-05, + "loss": 0.5293, + "step": 3770 + }, + { + "epoch": 0.3103065212919152, + "grad_norm": 1.5727058387890596, + "learning_rate": 1.615479433991473e-05, + "loss": 0.8121, + "step": 3771 + }, + { + "epoch": 0.3103888088870603, + "grad_norm": 1.2670970557759353, + "learning_rate": 1.6152693454909706e-05, + "loss": 0.7827, + "step": 3772 + }, + { + "epoch": 0.31047109648220533, + "grad_norm": 1.3323156008193104, + "learning_rate": 1.6150592132827186e-05, + "loss": 0.7858, + "step": 3773 + }, + { + "epoch": 0.31055338407735034, + "grad_norm": 1.2028385372754486, + "learning_rate": 1.6148490373816435e-05, + "loss": 0.7607, + "step": 3774 + }, + { + "epoch": 0.3106356716724954, + "grad_norm": 1.4909932774781338, + "learning_rate": 1.614638817802676e-05, + "loss": 0.8046, + "step": 3775 + }, + { + "epoch": 0.3107179592676404, + "grad_norm": 1.4754248739906664, + "learning_rate": 1.61442855456075e-05, + "loss": 0.7874, + "step": 3776 + }, + { + "epoch": 0.31080024686278546, + "grad_norm": 1.5094229241942867, + "learning_rate": 1.614218247670802e-05, + "loss": 0.8253, + "step": 3777 + }, + { + "epoch": 0.31088253445793046, + "grad_norm": 0.4295052349695629, + "learning_rate": 1.614007897147772e-05, + "loss": 0.543, + "step": 3778 + }, + { + "epoch": 0.3109648220530755, + "grad_norm": 1.6799174553659821, + "learning_rate": 1.613797503006603e-05, + "loss": 0.7956, + "step": 3779 + }, + { + "epoch": 0.3110471096482205, + "grad_norm": 1.5824840220914298, + "learning_rate": 1.613587065262241e-05, + "loss": 0.7853, + "step": 3780 + }, + { + "epoch": 0.3111293972433656, + "grad_norm": 1.4575534258065173, + "learning_rate": 1.613376583929635e-05, + "loss": 0.7989, + "step": 3781 + }, + { + "epoch": 0.3112116848385106, + "grad_norm": 1.6635486970673155, + "learning_rate": 1.613166059023738e-05, + "loss": 0.8021, + "step": 3782 + }, + { + "epoch": 0.31129397243365564, + "grad_norm": 1.3912852168757746, + "learning_rate": 1.6129554905595043e-05, + "loss": 0.7991, + "step": 3783 + }, + { + "epoch": 0.31137626002880064, + "grad_norm": 1.8348284522634324, + "learning_rate": 1.612744878551893e-05, + "loss": 0.7884, + "step": 3784 + }, + { + "epoch": 0.3114585476239457, + "grad_norm": 0.45404800627571795, + "learning_rate": 1.6125342230158653e-05, + "loss": 0.5297, + "step": 3785 + }, + { + "epoch": 0.3115408352190907, + "grad_norm": 1.568224830463089, + "learning_rate": 1.612323523966386e-05, + "loss": 0.7889, + "step": 3786 + }, + { + "epoch": 0.31162312281423576, + "grad_norm": 1.4615826506999914, + "learning_rate": 1.612112781418423e-05, + "loss": 0.8027, + "step": 3787 + }, + { + "epoch": 0.31170541040938077, + "grad_norm": 1.3600983387015941, + "learning_rate": 1.611901995386947e-05, + "loss": 0.7957, + "step": 3788 + }, + { + "epoch": 0.3117876980045258, + "grad_norm": 2.4823284104691257, + "learning_rate": 1.6116911658869313e-05, + "loss": 0.8086, + "step": 3789 + }, + { + "epoch": 0.31186998559967083, + "grad_norm": 1.4425764579983513, + "learning_rate": 1.611480292933354e-05, + "loss": 0.86, + "step": 3790 + }, + { + "epoch": 0.3119522731948159, + "grad_norm": 3.5898890527626572, + "learning_rate": 1.6112693765411944e-05, + "loss": 0.7922, + "step": 3791 + }, + { + "epoch": 0.3120345607899609, + "grad_norm": 0.4405848046693677, + "learning_rate": 1.611058416725436e-05, + "loss": 0.5164, + "step": 3792 + }, + { + "epoch": 0.31211684838510595, + "grad_norm": 1.930749253968764, + "learning_rate": 1.6108474135010647e-05, + "loss": 0.8474, + "step": 3793 + }, + { + "epoch": 0.31219913598025095, + "grad_norm": 1.6005631874722352, + "learning_rate": 1.61063636688307e-05, + "loss": 0.7985, + "step": 3794 + }, + { + "epoch": 0.312281423575396, + "grad_norm": 2.1228915952675322, + "learning_rate": 1.6104252768864447e-05, + "loss": 0.8236, + "step": 3795 + }, + { + "epoch": 0.312363711170541, + "grad_norm": 1.5504485182917005, + "learning_rate": 1.6102141435261837e-05, + "loss": 0.8064, + "step": 3796 + }, + { + "epoch": 0.3124459987656861, + "grad_norm": 2.174381516183099, + "learning_rate": 1.610002966817286e-05, + "loss": 0.7895, + "step": 3797 + }, + { + "epoch": 0.31252828636083113, + "grad_norm": 2.1588757115892836, + "learning_rate": 1.609791746774753e-05, + "loss": 0.8281, + "step": 3798 + }, + { + "epoch": 0.31261057395597613, + "grad_norm": 1.8016195608269203, + "learning_rate": 1.6095804834135895e-05, + "loss": 0.8017, + "step": 3799 + }, + { + "epoch": 0.3126928615511212, + "grad_norm": 1.9215240891550083, + "learning_rate": 1.6093691767488032e-05, + "loss": 0.8288, + "step": 3800 + }, + { + "epoch": 0.3127751491462662, + "grad_norm": 1.7081490290132, + "learning_rate": 1.609157826795406e-05, + "loss": 0.8244, + "step": 3801 + }, + { + "epoch": 0.31285743674141125, + "grad_norm": 1.4362733132847159, + "learning_rate": 1.6089464335684097e-05, + "loss": 0.8203, + "step": 3802 + }, + { + "epoch": 0.31293972433655626, + "grad_norm": 1.5092490177822409, + "learning_rate": 1.6087349970828335e-05, + "loss": 0.8063, + "step": 3803 + }, + { + "epoch": 0.3130220119317013, + "grad_norm": 1.4989585796869414, + "learning_rate": 1.6085235173536965e-05, + "loss": 0.7916, + "step": 3804 + }, + { + "epoch": 0.3131042995268463, + "grad_norm": 1.4282121061703668, + "learning_rate": 1.6083119943960215e-05, + "loss": 0.7771, + "step": 3805 + }, + { + "epoch": 0.3131865871219914, + "grad_norm": 1.361727204221599, + "learning_rate": 1.6081004282248358e-05, + "loss": 0.7847, + "step": 3806 + }, + { + "epoch": 0.3132688747171364, + "grad_norm": 1.8247981932325896, + "learning_rate": 1.607888818855168e-05, + "loss": 0.7565, + "step": 3807 + }, + { + "epoch": 0.31335116231228144, + "grad_norm": 1.4669885684346229, + "learning_rate": 1.6076771663020507e-05, + "loss": 0.8057, + "step": 3808 + }, + { + "epoch": 0.31343344990742644, + "grad_norm": 0.4719992291304323, + "learning_rate": 1.6074654705805194e-05, + "loss": 0.5486, + "step": 3809 + }, + { + "epoch": 0.3135157375025715, + "grad_norm": 1.6647995724657256, + "learning_rate": 1.6072537317056128e-05, + "loss": 0.7938, + "step": 3810 + }, + { + "epoch": 0.3135980250977165, + "grad_norm": 2.476341315156464, + "learning_rate": 1.6070419496923716e-05, + "loss": 0.808, + "step": 3811 + }, + { + "epoch": 0.31368031269286156, + "grad_norm": 1.6305978435794288, + "learning_rate": 1.606830124555842e-05, + "loss": 0.815, + "step": 3812 + }, + { + "epoch": 0.31376260028800657, + "grad_norm": 1.7651591384088836, + "learning_rate": 1.6066182563110698e-05, + "loss": 0.8375, + "step": 3813 + }, + { + "epoch": 0.3138448878831516, + "grad_norm": 1.6865103523908131, + "learning_rate": 1.6064063449731076e-05, + "loss": 0.8222, + "step": 3814 + }, + { + "epoch": 0.3139271754782966, + "grad_norm": 0.4241649811874777, + "learning_rate": 1.606194390557008e-05, + "loss": 0.5088, + "step": 3815 + }, + { + "epoch": 0.3140094630734417, + "grad_norm": 1.451751859628819, + "learning_rate": 1.6059823930778286e-05, + "loss": 0.7924, + "step": 3816 + }, + { + "epoch": 0.3140917506685867, + "grad_norm": 1.5746528155678103, + "learning_rate": 1.6057703525506293e-05, + "loss": 0.7977, + "step": 3817 + }, + { + "epoch": 0.31417403826373175, + "grad_norm": 1.633038667302806, + "learning_rate": 1.6055582689904724e-05, + "loss": 0.8224, + "step": 3818 + }, + { + "epoch": 0.31425632585887675, + "grad_norm": 1.3997285893805254, + "learning_rate": 1.605346142412425e-05, + "loss": 0.8236, + "step": 3819 + }, + { + "epoch": 0.3143386134540218, + "grad_norm": 1.3776423446334434, + "learning_rate": 1.6051339728315557e-05, + "loss": 0.8028, + "step": 3820 + }, + { + "epoch": 0.3144209010491668, + "grad_norm": 0.4296997716679929, + "learning_rate": 1.6049217602629368e-05, + "loss": 0.5383, + "step": 3821 + }, + { + "epoch": 0.31450318864431187, + "grad_norm": 1.5112326971489343, + "learning_rate": 1.604709504721643e-05, + "loss": 0.8011, + "step": 3822 + }, + { + "epoch": 0.31458547623945693, + "grad_norm": 1.5000778581455252, + "learning_rate": 1.6044972062227536e-05, + "loss": 0.788, + "step": 3823 + }, + { + "epoch": 0.31466776383460193, + "grad_norm": 1.8304966534923168, + "learning_rate": 1.604284864781349e-05, + "loss": 0.8036, + "step": 3824 + }, + { + "epoch": 0.314750051429747, + "grad_norm": 1.447229013784352, + "learning_rate": 1.6040724804125144e-05, + "loss": 0.788, + "step": 3825 + }, + { + "epoch": 0.314832339024892, + "grad_norm": 1.400055065113336, + "learning_rate": 1.6038600531313365e-05, + "loss": 0.8338, + "step": 3826 + }, + { + "epoch": 0.31491462662003705, + "grad_norm": 1.6462850322118918, + "learning_rate": 1.6036475829529065e-05, + "loss": 0.8159, + "step": 3827 + }, + { + "epoch": 0.31499691421518206, + "grad_norm": 1.4405452087209327, + "learning_rate": 1.6034350698923175e-05, + "loss": 0.8021, + "step": 3828 + }, + { + "epoch": 0.3150792018103271, + "grad_norm": 1.238962748264279, + "learning_rate": 1.6032225139646663e-05, + "loss": 0.7933, + "step": 3829 + }, + { + "epoch": 0.3151614894054721, + "grad_norm": 1.3760217055651613, + "learning_rate": 1.603009915185052e-05, + "loss": 0.8448, + "step": 3830 + }, + { + "epoch": 0.3152437770006172, + "grad_norm": 0.4592060820215616, + "learning_rate": 1.602797273568578e-05, + "loss": 0.5474, + "step": 3831 + }, + { + "epoch": 0.3153260645957622, + "grad_norm": 1.46346971437999, + "learning_rate": 1.60258458913035e-05, + "loss": 0.8075, + "step": 3832 + }, + { + "epoch": 0.31540835219090724, + "grad_norm": 0.43688001977430013, + "learning_rate": 1.6023718618854756e-05, + "loss": 0.5195, + "step": 3833 + }, + { + "epoch": 0.31549063978605224, + "grad_norm": 1.387966012985661, + "learning_rate": 1.6021590918490685e-05, + "loss": 0.7928, + "step": 3834 + }, + { + "epoch": 0.3155729273811973, + "grad_norm": 1.2845135247879658, + "learning_rate": 1.6019462790362415e-05, + "loss": 0.7991, + "step": 3835 + }, + { + "epoch": 0.3156552149763423, + "grad_norm": 1.3529603536381467, + "learning_rate": 1.6017334234621143e-05, + "loss": 0.7881, + "step": 3836 + }, + { + "epoch": 0.31573750257148736, + "grad_norm": 0.4413166562719957, + "learning_rate": 1.601520525141807e-05, + "loss": 0.5475, + "step": 3837 + }, + { + "epoch": 0.31581979016663236, + "grad_norm": 1.5798220607086506, + "learning_rate": 1.6013075840904433e-05, + "loss": 0.8072, + "step": 3838 + }, + { + "epoch": 0.3159020777617774, + "grad_norm": 1.3374478512636028, + "learning_rate": 1.6010946003231507e-05, + "loss": 0.774, + "step": 3839 + }, + { + "epoch": 0.3159843653569224, + "grad_norm": 1.3317452201789195, + "learning_rate": 1.6008815738550588e-05, + "loss": 0.7837, + "step": 3840 + }, + { + "epoch": 0.3160666529520675, + "grad_norm": 1.524499254315705, + "learning_rate": 1.6006685047013008e-05, + "loss": 0.8071, + "step": 3841 + }, + { + "epoch": 0.3161489405472125, + "grad_norm": 1.3306539588843487, + "learning_rate": 1.600455392877013e-05, + "loss": 0.8184, + "step": 3842 + }, + { + "epoch": 0.31623122814235755, + "grad_norm": 0.42763710421613876, + "learning_rate": 1.6002422383973345e-05, + "loss": 0.5117, + "step": 3843 + }, + { + "epoch": 0.31631351573750255, + "grad_norm": 1.5219746227372761, + "learning_rate": 1.6000290412774072e-05, + "loss": 0.8044, + "step": 3844 + }, + { + "epoch": 0.3163958033326476, + "grad_norm": 1.41718191129118, + "learning_rate": 1.599815801532376e-05, + "loss": 0.7955, + "step": 3845 + }, + { + "epoch": 0.3164780909277926, + "grad_norm": 1.2798703247927727, + "learning_rate": 1.59960251917739e-05, + "loss": 0.7954, + "step": 3846 + }, + { + "epoch": 0.31656037852293767, + "grad_norm": 4.202688452888293, + "learning_rate": 1.5993891942276e-05, + "loss": 0.8213, + "step": 3847 + }, + { + "epoch": 0.31664266611808267, + "grad_norm": 3.028615193665598, + "learning_rate": 1.59917582669816e-05, + "loss": 0.7849, + "step": 3848 + }, + { + "epoch": 0.31672495371322773, + "grad_norm": 1.4614417133635351, + "learning_rate": 1.5989624166042275e-05, + "loss": 0.7868, + "step": 3849 + }, + { + "epoch": 0.3168072413083728, + "grad_norm": 1.4622588657748734, + "learning_rate": 1.598748963960963e-05, + "loss": 0.7858, + "step": 3850 + }, + { + "epoch": 0.3168895289035178, + "grad_norm": 1.411182313839077, + "learning_rate": 1.5985354687835296e-05, + "loss": 0.8097, + "step": 3851 + }, + { + "epoch": 0.31697181649866285, + "grad_norm": 1.4862535038642217, + "learning_rate": 1.598321931087094e-05, + "loss": 0.8158, + "step": 3852 + }, + { + "epoch": 0.31705410409380785, + "grad_norm": 1.7517411737646655, + "learning_rate": 1.598108350886825e-05, + "loss": 0.7836, + "step": 3853 + }, + { + "epoch": 0.3171363916889529, + "grad_norm": 1.50628024287692, + "learning_rate": 1.597894728197895e-05, + "loss": 0.825, + "step": 3854 + }, + { + "epoch": 0.3172186792840979, + "grad_norm": 1.5551702890796064, + "learning_rate": 1.59768106303548e-05, + "loss": 0.7912, + "step": 3855 + }, + { + "epoch": 0.317300966879243, + "grad_norm": 1.565139044705377, + "learning_rate": 1.5974673554147583e-05, + "loss": 0.8298, + "step": 3856 + }, + { + "epoch": 0.317383254474388, + "grad_norm": 1.6719645431305064, + "learning_rate": 1.597253605350911e-05, + "loss": 0.8159, + "step": 3857 + }, + { + "epoch": 0.31746554206953304, + "grad_norm": 1.738755656953427, + "learning_rate": 1.5970398128591226e-05, + "loss": 0.8111, + "step": 3858 + }, + { + "epoch": 0.31754782966467804, + "grad_norm": 1.3016455355427752, + "learning_rate": 1.596825977954581e-05, + "loss": 0.796, + "step": 3859 + }, + { + "epoch": 0.3176301172598231, + "grad_norm": 1.6339301852364194, + "learning_rate": 1.5966121006524763e-05, + "loss": 0.8089, + "step": 3860 + }, + { + "epoch": 0.3177124048549681, + "grad_norm": 1.6554656500069764, + "learning_rate": 1.5963981809680017e-05, + "loss": 0.857, + "step": 3861 + }, + { + "epoch": 0.31779469245011316, + "grad_norm": 1.7785982317572142, + "learning_rate": 1.5961842189163547e-05, + "loss": 0.8098, + "step": 3862 + }, + { + "epoch": 0.31787698004525816, + "grad_norm": 2.1597556737395562, + "learning_rate": 1.595970214512734e-05, + "loss": 0.798, + "step": 3863 + }, + { + "epoch": 0.3179592676404032, + "grad_norm": 1.9370568707163913, + "learning_rate": 1.5957561677723426e-05, + "loss": 0.8079, + "step": 3864 + }, + { + "epoch": 0.3180415552355482, + "grad_norm": 1.9444571258095107, + "learning_rate": 1.5955420787103856e-05, + "loss": 0.8109, + "step": 3865 + }, + { + "epoch": 0.3181238428306933, + "grad_norm": 1.760393730100222, + "learning_rate": 1.5953279473420715e-05, + "loss": 0.7758, + "step": 3866 + }, + { + "epoch": 0.3182061304258383, + "grad_norm": 1.6420492720718982, + "learning_rate": 1.5951137736826122e-05, + "loss": 0.8169, + "step": 3867 + }, + { + "epoch": 0.31828841802098334, + "grad_norm": 0.44768553469151645, + "learning_rate": 1.5948995577472226e-05, + "loss": 0.5237, + "step": 3868 + }, + { + "epoch": 0.31837070561612835, + "grad_norm": 0.4704272888843954, + "learning_rate": 1.5946852995511196e-05, + "loss": 0.5489, + "step": 3869 + }, + { + "epoch": 0.3184529932112734, + "grad_norm": 1.5467122402037703, + "learning_rate": 1.5944709991095238e-05, + "loss": 0.8043, + "step": 3870 + }, + { + "epoch": 0.3185352808064184, + "grad_norm": 2.390616971685316, + "learning_rate": 1.594256656437659e-05, + "loss": 0.7991, + "step": 3871 + }, + { + "epoch": 0.31861756840156347, + "grad_norm": 1.7214781780864123, + "learning_rate": 1.5940422715507522e-05, + "loss": 0.7695, + "step": 3872 + }, + { + "epoch": 0.31869985599670847, + "grad_norm": 1.6526754937961543, + "learning_rate": 1.593827844464032e-05, + "loss": 0.786, + "step": 3873 + }, + { + "epoch": 0.31878214359185353, + "grad_norm": 1.7516686291235084, + "learning_rate": 1.593613375192731e-05, + "loss": 0.8132, + "step": 3874 + }, + { + "epoch": 0.3188644311869986, + "grad_norm": 1.9016648443724014, + "learning_rate": 1.593398863752086e-05, + "loss": 0.8149, + "step": 3875 + }, + { + "epoch": 0.3189467187821436, + "grad_norm": 0.4667168130769043, + "learning_rate": 1.5931843101573345e-05, + "loss": 0.5291, + "step": 3876 + }, + { + "epoch": 0.31902900637728865, + "grad_norm": 2.1732316364322637, + "learning_rate": 1.592969714423718e-05, + "loss": 0.8116, + "step": 3877 + }, + { + "epoch": 0.31911129397243365, + "grad_norm": 2.0042075733499396, + "learning_rate": 1.5927550765664814e-05, + "loss": 0.7947, + "step": 3878 + }, + { + "epoch": 0.3191935815675787, + "grad_norm": 2.00997410027192, + "learning_rate": 1.592540396600872e-05, + "loss": 0.8145, + "step": 3879 + }, + { + "epoch": 0.3192758691627237, + "grad_norm": 1.5637086608174724, + "learning_rate": 1.5923256745421408e-05, + "loss": 0.827, + "step": 3880 + }, + { + "epoch": 0.31935815675786877, + "grad_norm": 1.6181242256485258, + "learning_rate": 1.592110910405541e-05, + "loss": 0.8151, + "step": 3881 + }, + { + "epoch": 0.3194404443530138, + "grad_norm": 3.370578012508591, + "learning_rate": 1.5918961042063285e-05, + "loss": 0.7889, + "step": 3882 + }, + { + "epoch": 0.31952273194815883, + "grad_norm": 1.882982469706157, + "learning_rate": 1.5916812559597635e-05, + "loss": 0.7844, + "step": 3883 + }, + { + "epoch": 0.31960501954330384, + "grad_norm": 2.002582131058503, + "learning_rate": 1.5914663656811086e-05, + "loss": 0.8074, + "step": 3884 + }, + { + "epoch": 0.3196873071384489, + "grad_norm": 1.600771949322908, + "learning_rate": 1.591251433385629e-05, + "loss": 0.805, + "step": 3885 + }, + { + "epoch": 0.3197695947335939, + "grad_norm": 1.5408535050061394, + "learning_rate": 1.591036459088593e-05, + "loss": 0.8019, + "step": 3886 + }, + { + "epoch": 0.31985188232873896, + "grad_norm": 1.47785259187151, + "learning_rate": 1.590821442805272e-05, + "loss": 0.7725, + "step": 3887 + }, + { + "epoch": 0.31993416992388396, + "grad_norm": 1.649957375486492, + "learning_rate": 1.590606384550941e-05, + "loss": 0.8423, + "step": 3888 + }, + { + "epoch": 0.320016457519029, + "grad_norm": 1.722398959070732, + "learning_rate": 1.590391284340877e-05, + "loss": 0.7958, + "step": 3889 + }, + { + "epoch": 0.320098745114174, + "grad_norm": 0.4439275700511717, + "learning_rate": 1.5901761421903602e-05, + "loss": 0.5155, + "step": 3890 + }, + { + "epoch": 0.3201810327093191, + "grad_norm": 1.6385779929802615, + "learning_rate": 1.589960958114674e-05, + "loss": 0.8037, + "step": 3891 + }, + { + "epoch": 0.3202633203044641, + "grad_norm": 2.2677890806726233, + "learning_rate": 1.589745732129105e-05, + "loss": 0.8014, + "step": 3892 + }, + { + "epoch": 0.32034560789960914, + "grad_norm": 1.7440331245033582, + "learning_rate": 1.589530464248942e-05, + "loss": 0.7926, + "step": 3893 + }, + { + "epoch": 0.32042789549475414, + "grad_norm": 1.7572278727107002, + "learning_rate": 1.589315154489478e-05, + "loss": 0.799, + "step": 3894 + }, + { + "epoch": 0.3205101830898992, + "grad_norm": 1.8522770471221919, + "learning_rate": 1.5890998028660077e-05, + "loss": 0.8021, + "step": 3895 + }, + { + "epoch": 0.3205924706850442, + "grad_norm": 1.6887691382789587, + "learning_rate": 1.5888844093938295e-05, + "loss": 0.7935, + "step": 3896 + }, + { + "epoch": 0.32067475828018926, + "grad_norm": 1.6838500779017664, + "learning_rate": 1.5886689740882448e-05, + "loss": 0.7934, + "step": 3897 + }, + { + "epoch": 0.32075704587533427, + "grad_norm": 1.7224600932545493, + "learning_rate": 1.5884534969645574e-05, + "loss": 0.7891, + "step": 3898 + }, + { + "epoch": 0.3208393334704793, + "grad_norm": 1.7210572191001172, + "learning_rate": 1.588237978038075e-05, + "loss": 0.7813, + "step": 3899 + }, + { + "epoch": 0.32092162106562433, + "grad_norm": 2.8344387409880674, + "learning_rate": 1.588022417324107e-05, + "loss": 0.8094, + "step": 3900 + }, + { + "epoch": 0.3210039086607694, + "grad_norm": 2.1217231122168143, + "learning_rate": 1.587806814837967e-05, + "loss": 0.7753, + "step": 3901 + }, + { + "epoch": 0.32108619625591445, + "grad_norm": 2.214733800606323, + "learning_rate": 1.587591170594971e-05, + "loss": 0.7817, + "step": 3902 + }, + { + "epoch": 0.32116848385105945, + "grad_norm": 0.43017597210839653, + "learning_rate": 1.587375484610438e-05, + "loss": 0.5431, + "step": 3903 + }, + { + "epoch": 0.3212507714462045, + "grad_norm": 2.259495257156286, + "learning_rate": 1.58715975689969e-05, + "loss": 0.814, + "step": 3904 + }, + { + "epoch": 0.3213330590413495, + "grad_norm": 0.4247657565225192, + "learning_rate": 1.5869439874780518e-05, + "loss": 0.5151, + "step": 3905 + }, + { + "epoch": 0.32141534663649457, + "grad_norm": 1.612160032316507, + "learning_rate": 1.5867281763608514e-05, + "loss": 0.7756, + "step": 3906 + }, + { + "epoch": 0.3214976342316396, + "grad_norm": 2.9126245659122993, + "learning_rate": 1.5865123235634196e-05, + "loss": 0.8098, + "step": 3907 + }, + { + "epoch": 0.32157992182678463, + "grad_norm": 1.8880678211217063, + "learning_rate": 1.5862964291010904e-05, + "loss": 0.8025, + "step": 3908 + }, + { + "epoch": 0.32166220942192963, + "grad_norm": 2.470962406465848, + "learning_rate": 1.5860804929892007e-05, + "loss": 0.7807, + "step": 3909 + }, + { + "epoch": 0.3217444970170747, + "grad_norm": 1.8096274546446176, + "learning_rate": 1.58586451524309e-05, + "loss": 0.7938, + "step": 3910 + }, + { + "epoch": 0.3218267846122197, + "grad_norm": 1.7357976383540734, + "learning_rate": 1.5856484958781007e-05, + "loss": 0.7698, + "step": 3911 + }, + { + "epoch": 0.32190907220736475, + "grad_norm": 1.6751104297921913, + "learning_rate": 1.5854324349095794e-05, + "loss": 0.7985, + "step": 3912 + }, + { + "epoch": 0.32199135980250976, + "grad_norm": 0.45078868165814406, + "learning_rate": 1.5852163323528736e-05, + "loss": 0.5611, + "step": 3913 + }, + { + "epoch": 0.3220736473976548, + "grad_norm": 1.8160018709189711, + "learning_rate": 1.585000188223336e-05, + "loss": 0.7767, + "step": 3914 + }, + { + "epoch": 0.3221559349927998, + "grad_norm": 1.8270349432081778, + "learning_rate": 1.5847840025363206e-05, + "loss": 0.7747, + "step": 3915 + }, + { + "epoch": 0.3222382225879449, + "grad_norm": 1.8128097632259197, + "learning_rate": 1.5845677753071847e-05, + "loss": 0.785, + "step": 3916 + }, + { + "epoch": 0.3223205101830899, + "grad_norm": 1.8944803424632484, + "learning_rate": 1.5843515065512885e-05, + "loss": 0.804, + "step": 3917 + }, + { + "epoch": 0.32240279777823494, + "grad_norm": 2.2596012157276135, + "learning_rate": 1.5841351962839966e-05, + "loss": 0.7594, + "step": 3918 + }, + { + "epoch": 0.32248508537337994, + "grad_norm": 0.4063307625629307, + "learning_rate": 1.583918844520674e-05, + "loss": 0.5257, + "step": 3919 + }, + { + "epoch": 0.322567372968525, + "grad_norm": 1.7705394326449508, + "learning_rate": 1.5837024512766905e-05, + "loss": 0.7968, + "step": 3920 + }, + { + "epoch": 0.32264966056367, + "grad_norm": 2.2793356779989806, + "learning_rate": 1.583486016567419e-05, + "loss": 0.7715, + "step": 3921 + }, + { + "epoch": 0.32273194815881506, + "grad_norm": 2.3770074512733346, + "learning_rate": 1.5832695404082334e-05, + "loss": 0.7833, + "step": 3922 + }, + { + "epoch": 0.32281423575396007, + "grad_norm": 1.9767747470853723, + "learning_rate": 1.5830530228145125e-05, + "loss": 0.799, + "step": 3923 + }, + { + "epoch": 0.3228965233491051, + "grad_norm": 2.1188118908486704, + "learning_rate": 1.5828364638016377e-05, + "loss": 0.8134, + "step": 3924 + }, + { + "epoch": 0.3229788109442501, + "grad_norm": 0.431863066119262, + "learning_rate": 1.5826198633849922e-05, + "loss": 0.5202, + "step": 3925 + }, + { + "epoch": 0.3230610985393952, + "grad_norm": 2.0264993947923973, + "learning_rate": 1.5824032215799635e-05, + "loss": 0.7724, + "step": 3926 + }, + { + "epoch": 0.32314338613454024, + "grad_norm": 2.080019534214767, + "learning_rate": 1.582186538401941e-05, + "loss": 0.7923, + "step": 3927 + }, + { + "epoch": 0.32322567372968525, + "grad_norm": 2.3466362434119823, + "learning_rate": 1.5819698138663185e-05, + "loss": 0.789, + "step": 3928 + }, + { + "epoch": 0.3233079613248303, + "grad_norm": 2.676529299854149, + "learning_rate": 1.581753047988491e-05, + "loss": 0.7975, + "step": 3929 + }, + { + "epoch": 0.3233902489199753, + "grad_norm": 2.084847629393202, + "learning_rate": 1.5815362407838572e-05, + "loss": 0.8186, + "step": 3930 + }, + { + "epoch": 0.32347253651512037, + "grad_norm": 1.8539793305616092, + "learning_rate": 1.581319392267819e-05, + "loss": 0.7899, + "step": 3931 + }, + { + "epoch": 0.32355482411026537, + "grad_norm": 1.946873396330972, + "learning_rate": 1.5811025024557806e-05, + "loss": 0.8188, + "step": 3932 + }, + { + "epoch": 0.32363711170541043, + "grad_norm": 2.159098166320808, + "learning_rate": 1.58088557136315e-05, + "loss": 0.7848, + "step": 3933 + }, + { + "epoch": 0.32371939930055543, + "grad_norm": 2.047580727362364, + "learning_rate": 1.5806685990053374e-05, + "loss": 0.7795, + "step": 3934 + }, + { + "epoch": 0.3238016868957005, + "grad_norm": 2.671302249322026, + "learning_rate": 1.5804515853977562e-05, + "loss": 0.7878, + "step": 3935 + }, + { + "epoch": 0.3238839744908455, + "grad_norm": 1.8591097758701614, + "learning_rate": 1.5802345305558224e-05, + "loss": 0.7879, + "step": 3936 + }, + { + "epoch": 0.32396626208599055, + "grad_norm": 1.8516339214566484, + "learning_rate": 1.580017434494956e-05, + "loss": 0.7827, + "step": 3937 + }, + { + "epoch": 0.32404854968113556, + "grad_norm": 2.366887865572097, + "learning_rate": 1.5798002972305782e-05, + "loss": 0.7994, + "step": 3938 + }, + { + "epoch": 0.3241308372762806, + "grad_norm": 2.14665278036224, + "learning_rate": 1.5795831187781147e-05, + "loss": 0.7899, + "step": 3939 + }, + { + "epoch": 0.3242131248714256, + "grad_norm": 1.9282270888047135, + "learning_rate": 1.5793658991529934e-05, + "loss": 0.7822, + "step": 3940 + }, + { + "epoch": 0.3242954124665707, + "grad_norm": 1.7470566661407605, + "learning_rate": 1.5791486383706448e-05, + "loss": 0.7985, + "step": 3941 + }, + { + "epoch": 0.3243777000617157, + "grad_norm": 2.2568501368655274, + "learning_rate": 1.5789313364465037e-05, + "loss": 0.7811, + "step": 3942 + }, + { + "epoch": 0.32445998765686074, + "grad_norm": 1.8202413829686064, + "learning_rate": 1.578713993396006e-05, + "loss": 0.7751, + "step": 3943 + }, + { + "epoch": 0.32454227525200574, + "grad_norm": 1.6164720356317872, + "learning_rate": 1.5784966092345916e-05, + "loss": 0.7684, + "step": 3944 + }, + { + "epoch": 0.3246245628471508, + "grad_norm": 1.7664920640395763, + "learning_rate": 1.5782791839777035e-05, + "loss": 0.8171, + "step": 3945 + }, + { + "epoch": 0.3247068504422958, + "grad_norm": 1.7140385425234788, + "learning_rate": 1.578061717640787e-05, + "loss": 0.7983, + "step": 3946 + }, + { + "epoch": 0.32478913803744086, + "grad_norm": 1.9915284832242997, + "learning_rate": 1.5778442102392903e-05, + "loss": 0.7752, + "step": 3947 + }, + { + "epoch": 0.32487142563258586, + "grad_norm": 1.8606268367017726, + "learning_rate": 1.5776266617886652e-05, + "loss": 0.7885, + "step": 3948 + }, + { + "epoch": 0.3249537132277309, + "grad_norm": 1.8214192299435588, + "learning_rate": 1.577409072304366e-05, + "loss": 0.8146, + "step": 3949 + }, + { + "epoch": 0.3250360008228759, + "grad_norm": 2.027591412924204, + "learning_rate": 1.5771914418018493e-05, + "loss": 0.7706, + "step": 3950 + }, + { + "epoch": 0.325118288418021, + "grad_norm": 2.1354790579381695, + "learning_rate": 1.5769737702965762e-05, + "loss": 0.8089, + "step": 3951 + }, + { + "epoch": 0.32520057601316604, + "grad_norm": 2.074699176889147, + "learning_rate": 1.576756057804009e-05, + "loss": 0.773, + "step": 3952 + }, + { + "epoch": 0.32528286360831105, + "grad_norm": 2.100870397111902, + "learning_rate": 1.5765383043396137e-05, + "loss": 0.7644, + "step": 3953 + }, + { + "epoch": 0.3253651512034561, + "grad_norm": 0.45137516125086635, + "learning_rate": 1.5763205099188594e-05, + "loss": 0.5643, + "step": 3954 + }, + { + "epoch": 0.3254474387986011, + "grad_norm": 2.217278953007144, + "learning_rate": 1.5761026745572178e-05, + "loss": 0.7821, + "step": 3955 + }, + { + "epoch": 0.32552972639374617, + "grad_norm": 1.9618940028761858, + "learning_rate": 1.5758847982701636e-05, + "loss": 0.8177, + "step": 3956 + }, + { + "epoch": 0.32561201398889117, + "grad_norm": 2.1199923845726, + "learning_rate": 1.575666881073174e-05, + "loss": 0.7761, + "step": 3957 + }, + { + "epoch": 0.3256943015840362, + "grad_norm": 0.43239596333275854, + "learning_rate": 1.57544892298173e-05, + "loss": 0.5101, + "step": 3958 + }, + { + "epoch": 0.32577658917918123, + "grad_norm": 2.850191356164409, + "learning_rate": 1.575230924011315e-05, + "loss": 0.7955, + "step": 3959 + }, + { + "epoch": 0.3258588767743263, + "grad_norm": 1.8692796068426931, + "learning_rate": 1.5750128841774147e-05, + "loss": 0.7897, + "step": 3960 + }, + { + "epoch": 0.3259411643694713, + "grad_norm": 2.1827524276448274, + "learning_rate": 1.574794803495519e-05, + "loss": 0.8264, + "step": 3961 + }, + { + "epoch": 0.32602345196461635, + "grad_norm": 2.263658558707186, + "learning_rate": 1.5745766819811197e-05, + "loss": 0.8098, + "step": 3962 + }, + { + "epoch": 0.32610573955976135, + "grad_norm": 0.4244020766463776, + "learning_rate": 1.5743585196497114e-05, + "loss": 0.5129, + "step": 3963 + }, + { + "epoch": 0.3261880271549064, + "grad_norm": 2.595927481531137, + "learning_rate": 1.574140316516793e-05, + "loss": 0.8184, + "step": 3964 + }, + { + "epoch": 0.3262703147500514, + "grad_norm": 0.419897869617388, + "learning_rate": 1.5739220725978642e-05, + "loss": 0.5457, + "step": 3965 + }, + { + "epoch": 0.3263526023451965, + "grad_norm": 2.258978016931398, + "learning_rate": 1.5737037879084298e-05, + "loss": 0.7558, + "step": 3966 + }, + { + "epoch": 0.3264348899403415, + "grad_norm": 2.542891238910011, + "learning_rate": 1.5734854624639956e-05, + "loss": 0.807, + "step": 3967 + }, + { + "epoch": 0.32651717753548654, + "grad_norm": 0.4265054536091749, + "learning_rate": 1.5732670962800712e-05, + "loss": 0.5115, + "step": 3968 + }, + { + "epoch": 0.32659946513063154, + "grad_norm": 1.8955838199608996, + "learning_rate": 1.5730486893721688e-05, + "loss": 0.7797, + "step": 3969 + }, + { + "epoch": 0.3266817527257766, + "grad_norm": 2.0601699426832663, + "learning_rate": 1.5728302417558043e-05, + "loss": 0.7884, + "step": 3970 + }, + { + "epoch": 0.3267640403209216, + "grad_norm": 0.40840965019656833, + "learning_rate": 1.5726117534464954e-05, + "loss": 0.53, + "step": 3971 + }, + { + "epoch": 0.32684632791606666, + "grad_norm": 1.8843960611451949, + "learning_rate": 1.5723932244597634e-05, + "loss": 0.767, + "step": 3972 + }, + { + "epoch": 0.32692861551121166, + "grad_norm": 1.904639008422634, + "learning_rate": 1.5721746548111322e-05, + "loss": 0.8159, + "step": 3973 + }, + { + "epoch": 0.3270109031063567, + "grad_norm": 1.9290444263661974, + "learning_rate": 1.5719560445161284e-05, + "loss": 0.829, + "step": 3974 + }, + { + "epoch": 0.3270931907015017, + "grad_norm": 2.297219569544417, + "learning_rate": 1.571737393590282e-05, + "loss": 0.7835, + "step": 3975 + }, + { + "epoch": 0.3271754782966468, + "grad_norm": 1.8435362368311108, + "learning_rate": 1.5715187020491254e-05, + "loss": 0.7753, + "step": 3976 + }, + { + "epoch": 0.3272577658917918, + "grad_norm": 2.067228837195023, + "learning_rate": 1.5712999699081947e-05, + "loss": 0.8031, + "step": 3977 + }, + { + "epoch": 0.32734005348693684, + "grad_norm": 2.074862181219466, + "learning_rate": 1.5710811971830274e-05, + "loss": 0.8123, + "step": 3978 + }, + { + "epoch": 0.3274223410820819, + "grad_norm": 0.4119220619691624, + "learning_rate": 1.570862383889165e-05, + "loss": 0.526, + "step": 3979 + }, + { + "epoch": 0.3275046286772269, + "grad_norm": 2.2681013473374505, + "learning_rate": 1.570643530042152e-05, + "loss": 0.7865, + "step": 3980 + }, + { + "epoch": 0.32758691627237196, + "grad_norm": 2.346967460159985, + "learning_rate": 1.5704246356575352e-05, + "loss": 0.7951, + "step": 3981 + }, + { + "epoch": 0.32766920386751697, + "grad_norm": 2.0280379580514323, + "learning_rate": 1.5702057007508648e-05, + "loss": 0.7967, + "step": 3982 + }, + { + "epoch": 0.327751491462662, + "grad_norm": 2.645494755217657, + "learning_rate": 1.5699867253376928e-05, + "loss": 0.7813, + "step": 3983 + }, + { + "epoch": 0.32783377905780703, + "grad_norm": 2.0400139913657713, + "learning_rate": 1.5697677094335758e-05, + "loss": 0.787, + "step": 3984 + }, + { + "epoch": 0.3279160666529521, + "grad_norm": 2.205406469269344, + "learning_rate": 1.5695486530540717e-05, + "loss": 0.7531, + "step": 3985 + }, + { + "epoch": 0.3279983542480971, + "grad_norm": 0.43167951522059045, + "learning_rate": 1.5693295562147423e-05, + "loss": 0.5336, + "step": 3986 + }, + { + "epoch": 0.32808064184324215, + "grad_norm": 2.107204959767559, + "learning_rate": 1.569110418931152e-05, + "loss": 0.7802, + "step": 3987 + }, + { + "epoch": 0.32816292943838715, + "grad_norm": 2.4392908759184007, + "learning_rate": 1.5688912412188673e-05, + "loss": 0.8002, + "step": 3988 + }, + { + "epoch": 0.3282452170335322, + "grad_norm": 2.14764265908603, + "learning_rate": 1.5686720230934587e-05, + "loss": 0.8399, + "step": 3989 + }, + { + "epoch": 0.3283275046286772, + "grad_norm": 2.3680824327659655, + "learning_rate": 1.568452764570499e-05, + "loss": 0.8197, + "step": 3990 + }, + { + "epoch": 0.32840979222382227, + "grad_norm": 2.3280330211238662, + "learning_rate": 1.5682334656655642e-05, + "loss": 0.7957, + "step": 3991 + }, + { + "epoch": 0.3284920798189673, + "grad_norm": 1.8107832679614735, + "learning_rate": 1.5680141263942325e-05, + "loss": 0.8044, + "step": 3992 + }, + { + "epoch": 0.32857436741411233, + "grad_norm": 2.998434288538815, + "learning_rate": 1.5677947467720856e-05, + "loss": 0.7835, + "step": 3993 + }, + { + "epoch": 0.32865665500925734, + "grad_norm": 0.4392336814515646, + "learning_rate": 1.5675753268147085e-05, + "loss": 0.5206, + "step": 3994 + }, + { + "epoch": 0.3287389426044024, + "grad_norm": 2.36967808615286, + "learning_rate": 1.5673558665376873e-05, + "loss": 0.8124, + "step": 3995 + }, + { + "epoch": 0.3288212301995474, + "grad_norm": 2.0335764345348855, + "learning_rate": 1.567136365956613e-05, + "loss": 0.7935, + "step": 3996 + }, + { + "epoch": 0.32890351779469246, + "grad_norm": 2.13170838537876, + "learning_rate": 1.5669168250870784e-05, + "loss": 0.7875, + "step": 3997 + }, + { + "epoch": 0.32898580538983746, + "grad_norm": 1.9456780606139135, + "learning_rate": 1.566697243944679e-05, + "loss": 0.7759, + "step": 3998 + }, + { + "epoch": 0.3290680929849825, + "grad_norm": 2.773312723528782, + "learning_rate": 1.5664776225450132e-05, + "loss": 0.7478, + "step": 3999 + }, + { + "epoch": 0.3291503805801275, + "grad_norm": 2.4541175140476814, + "learning_rate": 1.5662579609036836e-05, + "loss": 0.8043, + "step": 4000 + }, + { + "epoch": 0.3292326681752726, + "grad_norm": 3.6905082234410176, + "learning_rate": 1.566038259036294e-05, + "loss": 0.7849, + "step": 4001 + }, + { + "epoch": 0.3293149557704176, + "grad_norm": 2.130518471319136, + "learning_rate": 1.5658185169584518e-05, + "loss": 0.794, + "step": 4002 + }, + { + "epoch": 0.32939724336556264, + "grad_norm": 2.8597433510479378, + "learning_rate": 1.565598734685767e-05, + "loss": 0.8093, + "step": 4003 + }, + { + "epoch": 0.3294795309607077, + "grad_norm": 0.46263416600642066, + "learning_rate": 1.5653789122338526e-05, + "loss": 0.5274, + "step": 4004 + }, + { + "epoch": 0.3295618185558527, + "grad_norm": 3.503175456230659, + "learning_rate": 1.565159049618324e-05, + "loss": 0.8216, + "step": 4005 + }, + { + "epoch": 0.32964410615099776, + "grad_norm": 4.067051927797743, + "learning_rate": 1.5649391468548013e-05, + "loss": 0.7893, + "step": 4006 + }, + { + "epoch": 0.32972639374614277, + "grad_norm": 2.6026548196115122, + "learning_rate": 1.5647192039589042e-05, + "loss": 0.8011, + "step": 4007 + }, + { + "epoch": 0.3298086813412878, + "grad_norm": 2.1311953566519763, + "learning_rate": 1.5644992209462583e-05, + "loss": 0.7826, + "step": 4008 + }, + { + "epoch": 0.3298909689364328, + "grad_norm": 2.4932806360122854, + "learning_rate": 1.5642791978324908e-05, + "loss": 0.805, + "step": 4009 + }, + { + "epoch": 0.3299732565315779, + "grad_norm": 2.728699047628324, + "learning_rate": 1.5640591346332313e-05, + "loss": 0.7736, + "step": 4010 + }, + { + "epoch": 0.3300555441267229, + "grad_norm": 1.8896891934256532, + "learning_rate": 1.563839031364113e-05, + "loss": 0.7609, + "step": 4011 + }, + { + "epoch": 0.33013783172186795, + "grad_norm": 2.4997541576257656, + "learning_rate": 1.5636188880407717e-05, + "loss": 0.7786, + "step": 4012 + }, + { + "epoch": 0.33022011931701295, + "grad_norm": 0.4302448310310332, + "learning_rate": 1.5633987046788458e-05, + "loss": 0.5157, + "step": 4013 + }, + { + "epoch": 0.330302406912158, + "grad_norm": 2.5275586558015863, + "learning_rate": 1.563178481293977e-05, + "loss": 0.8039, + "step": 4014 + }, + { + "epoch": 0.330384694507303, + "grad_norm": 2.2740611908051758, + "learning_rate": 1.5629582179018097e-05, + "loss": 0.7724, + "step": 4015 + }, + { + "epoch": 0.33046698210244807, + "grad_norm": 2.356752663446273, + "learning_rate": 1.5627379145179907e-05, + "loss": 0.7805, + "step": 4016 + }, + { + "epoch": 0.3305492696975931, + "grad_norm": 2.0804581335562267, + "learning_rate": 1.5625175711581702e-05, + "loss": 0.7573, + "step": 4017 + }, + { + "epoch": 0.33063155729273813, + "grad_norm": 1.9908887619880642, + "learning_rate": 1.5622971878380014e-05, + "loss": 0.791, + "step": 4018 + }, + { + "epoch": 0.33071384488788313, + "grad_norm": 2.1469774019491155, + "learning_rate": 1.5620767645731394e-05, + "loss": 0.7831, + "step": 4019 + }, + { + "epoch": 0.3307961324830282, + "grad_norm": 1.8937606278549608, + "learning_rate": 1.5618563013792426e-05, + "loss": 0.7737, + "step": 4020 + }, + { + "epoch": 0.3308784200781732, + "grad_norm": 2.5642012322205394, + "learning_rate": 1.5616357982719732e-05, + "loss": 0.7675, + "step": 4021 + }, + { + "epoch": 0.33096070767331826, + "grad_norm": 0.4356450645178253, + "learning_rate": 1.561415255266995e-05, + "loss": 0.5228, + "step": 4022 + }, + { + "epoch": 0.33104299526846326, + "grad_norm": 2.574916998095043, + "learning_rate": 1.5611946723799745e-05, + "loss": 0.778, + "step": 4023 + }, + { + "epoch": 0.3311252828636083, + "grad_norm": 2.7688836825179655, + "learning_rate": 1.560974049626582e-05, + "loss": 0.8046, + "step": 4024 + }, + { + "epoch": 0.3312075704587533, + "grad_norm": 2.171637815428403, + "learning_rate": 1.5607533870224905e-05, + "loss": 0.7726, + "step": 4025 + }, + { + "epoch": 0.3312898580538984, + "grad_norm": 2.6121128273222545, + "learning_rate": 1.5605326845833747e-05, + "loss": 0.7622, + "step": 4026 + }, + { + "epoch": 0.3313721456490434, + "grad_norm": 3.8572921705382037, + "learning_rate": 1.5603119423249138e-05, + "loss": 0.7663, + "step": 4027 + }, + { + "epoch": 0.33145443324418844, + "grad_norm": 2.4190187644468533, + "learning_rate": 1.5600911602627887e-05, + "loss": 0.7833, + "step": 4028 + }, + { + "epoch": 0.33153672083933344, + "grad_norm": 2.2438722540135903, + "learning_rate": 1.559870338412683e-05, + "loss": 0.8005, + "step": 4029 + }, + { + "epoch": 0.3316190084344785, + "grad_norm": 2.0470890034029168, + "learning_rate": 1.559649476790284e-05, + "loss": 0.7796, + "step": 4030 + }, + { + "epoch": 0.33170129602962356, + "grad_norm": 2.306122681003319, + "learning_rate": 1.5594285754112813e-05, + "loss": 0.7822, + "step": 4031 + }, + { + "epoch": 0.33178358362476856, + "grad_norm": 2.258462595107799, + "learning_rate": 1.559207634291367e-05, + "loss": 0.7889, + "step": 4032 + }, + { + "epoch": 0.3318658712199136, + "grad_norm": 2.4594697996504116, + "learning_rate": 1.558986653446237e-05, + "loss": 0.7824, + "step": 4033 + }, + { + "epoch": 0.3319481588150586, + "grad_norm": 2.5711750789918475, + "learning_rate": 1.5587656328915886e-05, + "loss": 0.7918, + "step": 4034 + }, + { + "epoch": 0.3320304464102037, + "grad_norm": 0.4433676817708056, + "learning_rate": 1.5585445726431235e-05, + "loss": 0.509, + "step": 4035 + }, + { + "epoch": 0.3321127340053487, + "grad_norm": 2.283865584285344, + "learning_rate": 1.5583234727165456e-05, + "loss": 0.7443, + "step": 4036 + }, + { + "epoch": 0.33219502160049375, + "grad_norm": 0.4101973792108834, + "learning_rate": 1.5581023331275607e-05, + "loss": 0.4913, + "step": 4037 + }, + { + "epoch": 0.33227730919563875, + "grad_norm": 3.581975059566234, + "learning_rate": 1.5578811538918788e-05, + "loss": 0.764, + "step": 4038 + }, + { + "epoch": 0.3323595967907838, + "grad_norm": 2.061623283699238, + "learning_rate": 1.5576599350252118e-05, + "loss": 0.8047, + "step": 4039 + }, + { + "epoch": 0.3324418843859288, + "grad_norm": 2.5185115205311805, + "learning_rate": 1.5574386765432747e-05, + "loss": 0.7757, + "step": 4040 + }, + { + "epoch": 0.33252417198107387, + "grad_norm": 0.4373356793724688, + "learning_rate": 1.557217378461786e-05, + "loss": 0.5516, + "step": 4041 + }, + { + "epoch": 0.33260645957621887, + "grad_norm": 0.4175419290847868, + "learning_rate": 1.5569960407964656e-05, + "loss": 0.5028, + "step": 4042 + }, + { + "epoch": 0.33268874717136393, + "grad_norm": 0.4193817285666343, + "learning_rate": 1.556774663563037e-05, + "loss": 0.5275, + "step": 4043 + }, + { + "epoch": 0.33277103476650893, + "grad_norm": 2.7178153363796858, + "learning_rate": 1.556553246777227e-05, + "loss": 0.8058, + "step": 4044 + }, + { + "epoch": 0.332853322361654, + "grad_norm": 2.3231797903294433, + "learning_rate": 1.5563317904547647e-05, + "loss": 0.8015, + "step": 4045 + }, + { + "epoch": 0.332935609956799, + "grad_norm": 2.3013934131802776, + "learning_rate": 1.556110294611381e-05, + "loss": 0.7931, + "step": 4046 + }, + { + "epoch": 0.33301789755194405, + "grad_norm": 0.41864141680826006, + "learning_rate": 1.5558887592628118e-05, + "loss": 0.5046, + "step": 4047 + }, + { + "epoch": 0.33310018514708906, + "grad_norm": 2.063054176010626, + "learning_rate": 1.555667184424794e-05, + "loss": 0.8141, + "step": 4048 + }, + { + "epoch": 0.3331824727422341, + "grad_norm": 0.43526008271738104, + "learning_rate": 1.555445570113068e-05, + "loss": 0.5364, + "step": 4049 + }, + { + "epoch": 0.3332647603373791, + "grad_norm": 2.5772664816408586, + "learning_rate": 1.5552239163433774e-05, + "loss": 0.7868, + "step": 4050 + }, + { + "epoch": 0.3333470479325242, + "grad_norm": 0.46894680139071393, + "learning_rate": 1.5550022231314678e-05, + "loss": 0.5729, + "step": 4051 + }, + { + "epoch": 0.3334293355276692, + "grad_norm": 1.9956817792375223, + "learning_rate": 1.5547804904930873e-05, + "loss": 0.7839, + "step": 4052 + }, + { + "epoch": 0.33351162312281424, + "grad_norm": 2.21909870705443, + "learning_rate": 1.5545587184439883e-05, + "loss": 0.769, + "step": 4053 + }, + { + "epoch": 0.33359391071795924, + "grad_norm": 2.037384083154615, + "learning_rate": 1.554336906999925e-05, + "loss": 0.7825, + "step": 4054 + }, + { + "epoch": 0.3336761983131043, + "grad_norm": 0.436862139649715, + "learning_rate": 1.554115056176654e-05, + "loss": 0.537, + "step": 4055 + }, + { + "epoch": 0.33375848590824936, + "grad_norm": 2.414719241435678, + "learning_rate": 1.5538931659899357e-05, + "loss": 0.7639, + "step": 4056 + }, + { + "epoch": 0.33384077350339436, + "grad_norm": 3.080897428013041, + "learning_rate": 1.553671236455533e-05, + "loss": 0.8175, + "step": 4057 + }, + { + "epoch": 0.3339230610985394, + "grad_norm": 3.6067928304673678, + "learning_rate": 1.553449267589211e-05, + "loss": 0.7767, + "step": 4058 + }, + { + "epoch": 0.3340053486936844, + "grad_norm": 2.2683890374702798, + "learning_rate": 1.5532272594067378e-05, + "loss": 0.788, + "step": 4059 + }, + { + "epoch": 0.3340876362888295, + "grad_norm": 4.657508972032548, + "learning_rate": 1.5530052119238848e-05, + "loss": 0.7629, + "step": 4060 + }, + { + "epoch": 0.3341699238839745, + "grad_norm": 2.6649940020873686, + "learning_rate": 1.5527831251564264e-05, + "loss": 0.8005, + "step": 4061 + }, + { + "epoch": 0.33425221147911954, + "grad_norm": 2.5930973491609897, + "learning_rate": 1.5525609991201384e-05, + "loss": 0.7978, + "step": 4062 + }, + { + "epoch": 0.33433449907426455, + "grad_norm": 0.4321773872749387, + "learning_rate": 1.5523388338308014e-05, + "loss": 0.5579, + "step": 4063 + }, + { + "epoch": 0.3344167866694096, + "grad_norm": 2.4268824148494375, + "learning_rate": 1.552116629304196e-05, + "loss": 0.8035, + "step": 4064 + }, + { + "epoch": 0.3344990742645546, + "grad_norm": 2.2973387188444834, + "learning_rate": 1.551894385556109e-05, + "loss": 0.7847, + "step": 4065 + }, + { + "epoch": 0.33458136185969967, + "grad_norm": 2.729428390786781, + "learning_rate": 1.5516721026023272e-05, + "loss": 0.82, + "step": 4066 + }, + { + "epoch": 0.33466364945484467, + "grad_norm": 2.3680876747481876, + "learning_rate": 1.5514497804586416e-05, + "loss": 0.7786, + "step": 4067 + }, + { + "epoch": 0.33474593704998973, + "grad_norm": 2.1300210233336654, + "learning_rate": 1.5512274191408456e-05, + "loss": 0.7309, + "step": 4068 + }, + { + "epoch": 0.33482822464513473, + "grad_norm": 3.2729300200283262, + "learning_rate": 1.551005018664735e-05, + "loss": 0.7738, + "step": 4069 + }, + { + "epoch": 0.3349105122402798, + "grad_norm": 1.9704989368246766, + "learning_rate": 1.5507825790461093e-05, + "loss": 0.7596, + "step": 4070 + }, + { + "epoch": 0.3349927998354248, + "grad_norm": 0.4365947099302801, + "learning_rate": 1.55056010030077e-05, + "loss": 0.5131, + "step": 4071 + }, + { + "epoch": 0.33507508743056985, + "grad_norm": 2.0074999190203124, + "learning_rate": 1.5503375824445218e-05, + "loss": 0.786, + "step": 4072 + }, + { + "epoch": 0.33515737502571485, + "grad_norm": 1.748780315236836, + "learning_rate": 1.5501150254931716e-05, + "loss": 0.7874, + "step": 4073 + }, + { + "epoch": 0.3352396626208599, + "grad_norm": 2.4331820741114027, + "learning_rate": 1.54989242946253e-05, + "loss": 0.7733, + "step": 4074 + }, + { + "epoch": 0.3353219502160049, + "grad_norm": 1.907826778457189, + "learning_rate": 1.5496697943684094e-05, + "loss": 0.7798, + "step": 4075 + }, + { + "epoch": 0.33540423781115, + "grad_norm": 1.8091432034316774, + "learning_rate": 1.549447120226626e-05, + "loss": 0.779, + "step": 4076 + }, + { + "epoch": 0.335486525406295, + "grad_norm": 2.1216937869082813, + "learning_rate": 1.5492244070529975e-05, + "loss": 0.8008, + "step": 4077 + }, + { + "epoch": 0.33556881300144004, + "grad_norm": 0.4665356572253128, + "learning_rate": 1.5490016548633455e-05, + "loss": 0.5399, + "step": 4078 + }, + { + "epoch": 0.33565110059658504, + "grad_norm": 2.0588554112755584, + "learning_rate": 1.5487788636734943e-05, + "loss": 0.7858, + "step": 4079 + }, + { + "epoch": 0.3357333881917301, + "grad_norm": 3.1048131482979073, + "learning_rate": 1.54855603349927e-05, + "loss": 0.7484, + "step": 4080 + }, + { + "epoch": 0.33581567578687516, + "grad_norm": 1.9977468749376908, + "learning_rate": 1.548333164356502e-05, + "loss": 0.7747, + "step": 4081 + }, + { + "epoch": 0.33589796338202016, + "grad_norm": 1.9507568482794677, + "learning_rate": 1.5481102562610236e-05, + "loss": 0.7856, + "step": 4082 + }, + { + "epoch": 0.3359802509771652, + "grad_norm": 2.4700140913823962, + "learning_rate": 1.5478873092286694e-05, + "loss": 0.8011, + "step": 4083 + }, + { + "epoch": 0.3360625385723102, + "grad_norm": 2.2523640725668215, + "learning_rate": 1.5476643232752763e-05, + "loss": 0.7944, + "step": 4084 + }, + { + "epoch": 0.3361448261674553, + "grad_norm": 2.17172088234952, + "learning_rate": 1.5474412984166858e-05, + "loss": 0.7984, + "step": 4085 + }, + { + "epoch": 0.3362271137626003, + "grad_norm": 1.8796567105026312, + "learning_rate": 1.547218234668741e-05, + "loss": 0.7782, + "step": 4086 + }, + { + "epoch": 0.33630940135774534, + "grad_norm": 1.7950177915791392, + "learning_rate": 1.5469951320472874e-05, + "loss": 0.7964, + "step": 4087 + }, + { + "epoch": 0.33639168895289034, + "grad_norm": 1.7626809722482157, + "learning_rate": 1.5467719905681752e-05, + "loss": 0.7889, + "step": 4088 + }, + { + "epoch": 0.3364739765480354, + "grad_norm": 1.991539753326125, + "learning_rate": 1.546548810247255e-05, + "loss": 0.79, + "step": 4089 + }, + { + "epoch": 0.3365562641431804, + "grad_norm": 2.115231252527963, + "learning_rate": 1.5463255911003808e-05, + "loss": 0.7908, + "step": 4090 + }, + { + "epoch": 0.33663855173832546, + "grad_norm": 1.6321369721550663, + "learning_rate": 1.5461023331434112e-05, + "loss": 0.7894, + "step": 4091 + }, + { + "epoch": 0.33672083933347047, + "grad_norm": 2.1896541961345846, + "learning_rate": 1.545879036392205e-05, + "loss": 0.7944, + "step": 4092 + }, + { + "epoch": 0.3368031269286155, + "grad_norm": 0.5113893318754602, + "learning_rate": 1.5456557008626244e-05, + "loss": 0.5442, + "step": 4093 + }, + { + "epoch": 0.33688541452376053, + "grad_norm": 1.7351416694095272, + "learning_rate": 1.545432326570536e-05, + "loss": 0.777, + "step": 4094 + }, + { + "epoch": 0.3369677021189056, + "grad_norm": 2.230496217825373, + "learning_rate": 1.5452089135318074e-05, + "loss": 0.7605, + "step": 4095 + }, + { + "epoch": 0.3370499897140506, + "grad_norm": 1.7918268706931566, + "learning_rate": 1.5449854617623096e-05, + "loss": 0.8035, + "step": 4096 + }, + { + "epoch": 0.33713227730919565, + "grad_norm": 2.2531298457396263, + "learning_rate": 1.544761971277916e-05, + "loss": 0.7871, + "step": 4097 + }, + { + "epoch": 0.33721456490434065, + "grad_norm": 2.386839052979061, + "learning_rate": 1.544538442094503e-05, + "loss": 0.8035, + "step": 4098 + }, + { + "epoch": 0.3372968524994857, + "grad_norm": 2.2183148555618057, + "learning_rate": 1.5443148742279504e-05, + "loss": 0.7676, + "step": 4099 + }, + { + "epoch": 0.3373791400946307, + "grad_norm": 1.6436385462158203, + "learning_rate": 1.5440912676941392e-05, + "loss": 0.7738, + "step": 4100 + }, + { + "epoch": 0.3374614276897758, + "grad_norm": 0.44570601705566626, + "learning_rate": 1.543867622508955e-05, + "loss": 0.543, + "step": 4101 + }, + { + "epoch": 0.3375437152849208, + "grad_norm": 2.2551194306581515, + "learning_rate": 1.543643938688284e-05, + "loss": 0.815, + "step": 4102 + }, + { + "epoch": 0.33762600288006583, + "grad_norm": 1.9418876796336597, + "learning_rate": 1.5434202162480175e-05, + "loss": 0.7817, + "step": 4103 + }, + { + "epoch": 0.33770829047521084, + "grad_norm": 2.0876894686853817, + "learning_rate": 1.5431964552040478e-05, + "loss": 0.7784, + "step": 4104 + }, + { + "epoch": 0.3377905780703559, + "grad_norm": 1.987353387707695, + "learning_rate": 1.5429726555722708e-05, + "loss": 0.8066, + "step": 4105 + }, + { + "epoch": 0.3378728656655009, + "grad_norm": 3.259218369318484, + "learning_rate": 1.5427488173685842e-05, + "loss": 0.7775, + "step": 4106 + }, + { + "epoch": 0.33795515326064596, + "grad_norm": 1.8533066136922023, + "learning_rate": 1.54252494060889e-05, + "loss": 0.7956, + "step": 4107 + }, + { + "epoch": 0.338037440855791, + "grad_norm": 1.9536184886449355, + "learning_rate": 1.542301025309092e-05, + "loss": 0.8026, + "step": 4108 + }, + { + "epoch": 0.338119728450936, + "grad_norm": 1.8613583097264637, + "learning_rate": 1.5420770714850956e-05, + "loss": 0.7763, + "step": 4109 + }, + { + "epoch": 0.3382020160460811, + "grad_norm": 0.42751450227861165, + "learning_rate": 1.5418530791528115e-05, + "loss": 0.5328, + "step": 4110 + }, + { + "epoch": 0.3382843036412261, + "grad_norm": 1.7004766564081137, + "learning_rate": 1.5416290483281512e-05, + "loss": 0.7659, + "step": 4111 + }, + { + "epoch": 0.33836659123637114, + "grad_norm": 2.8240946495342465, + "learning_rate": 1.5414049790270294e-05, + "loss": 0.7836, + "step": 4112 + }, + { + "epoch": 0.33844887883151614, + "grad_norm": 1.8021331146733643, + "learning_rate": 1.541180871265364e-05, + "loss": 0.8015, + "step": 4113 + }, + { + "epoch": 0.3385311664266612, + "grad_norm": 3.449214192179231, + "learning_rate": 1.5409567250590746e-05, + "loss": 0.781, + "step": 4114 + }, + { + "epoch": 0.3386134540218062, + "grad_norm": 0.4383606675842084, + "learning_rate": 1.540732540424085e-05, + "loss": 0.5093, + "step": 4115 + }, + { + "epoch": 0.33869574161695126, + "grad_norm": 1.9850473052707085, + "learning_rate": 1.54050831737632e-05, + "loss": 0.7694, + "step": 4116 + }, + { + "epoch": 0.33877802921209627, + "grad_norm": 1.9317148764512087, + "learning_rate": 1.540284055931709e-05, + "loss": 0.7903, + "step": 4117 + }, + { + "epoch": 0.3388603168072413, + "grad_norm": 1.6480666719876866, + "learning_rate": 1.5400597561061825e-05, + "loss": 0.7929, + "step": 4118 + }, + { + "epoch": 0.3389426044023863, + "grad_norm": 0.4286688284073161, + "learning_rate": 1.5398354179156747e-05, + "loss": 0.5475, + "step": 4119 + }, + { + "epoch": 0.3390248919975314, + "grad_norm": 0.4232030691579948, + "learning_rate": 1.539611041376122e-05, + "loss": 0.5282, + "step": 4120 + }, + { + "epoch": 0.3391071795926764, + "grad_norm": 2.188095931719702, + "learning_rate": 1.539386626503464e-05, + "loss": 0.8267, + "step": 4121 + }, + { + "epoch": 0.33918946718782145, + "grad_norm": 2.2336043337964013, + "learning_rate": 1.539162173313643e-05, + "loss": 0.7968, + "step": 4122 + }, + { + "epoch": 0.33927175478296645, + "grad_norm": 1.7327144878573497, + "learning_rate": 1.538937681822603e-05, + "loss": 0.7926, + "step": 4123 + }, + { + "epoch": 0.3393540423781115, + "grad_norm": 0.4099620413261847, + "learning_rate": 1.538713152046292e-05, + "loss": 0.518, + "step": 4124 + }, + { + "epoch": 0.3394363299732565, + "grad_norm": 1.7009331434739732, + "learning_rate": 1.5384885840006604e-05, + "loss": 0.8089, + "step": 4125 + }, + { + "epoch": 0.33951861756840157, + "grad_norm": 1.5319943743654452, + "learning_rate": 1.538263977701661e-05, + "loss": 0.7932, + "step": 4126 + }, + { + "epoch": 0.3396009051635466, + "grad_norm": 0.42235284493781433, + "learning_rate": 1.5380393331652495e-05, + "loss": 0.5097, + "step": 4127 + }, + { + "epoch": 0.33968319275869163, + "grad_norm": 11.1615298912984, + "learning_rate": 1.537814650407384e-05, + "loss": 0.7968, + "step": 4128 + }, + { + "epoch": 0.33976548035383664, + "grad_norm": 1.7586320233591177, + "learning_rate": 1.537589929444026e-05, + "loss": 0.7995, + "step": 4129 + }, + { + "epoch": 0.3398477679489817, + "grad_norm": 1.5528213595126805, + "learning_rate": 1.5373651702911393e-05, + "loss": 0.7535, + "step": 4130 + }, + { + "epoch": 0.3399300555441267, + "grad_norm": 0.4485178263335829, + "learning_rate": 1.5371403729646905e-05, + "loss": 0.5046, + "step": 4131 + }, + { + "epoch": 0.34001234313927176, + "grad_norm": 1.7964144593662237, + "learning_rate": 1.536915537480648e-05, + "loss": 0.7851, + "step": 4132 + }, + { + "epoch": 0.3400946307344168, + "grad_norm": 1.8147768900972738, + "learning_rate": 1.5366906638549845e-05, + "loss": 0.7978, + "step": 4133 + }, + { + "epoch": 0.3401769183295618, + "grad_norm": 3.348717434702876, + "learning_rate": 1.5364657521036747e-05, + "loss": 0.7832, + "step": 4134 + }, + { + "epoch": 0.3402592059247069, + "grad_norm": 1.7615874202683148, + "learning_rate": 1.5362408022426958e-05, + "loss": 0.8017, + "step": 4135 + }, + { + "epoch": 0.3403414935198519, + "grad_norm": 2.0426562833962247, + "learning_rate": 1.536015814288028e-05, + "loss": 0.7941, + "step": 4136 + }, + { + "epoch": 0.34042378111499694, + "grad_norm": 1.5361881219172584, + "learning_rate": 1.5357907882556537e-05, + "loss": 0.8022, + "step": 4137 + }, + { + "epoch": 0.34050606871014194, + "grad_norm": 3.0995959121715253, + "learning_rate": 1.5355657241615588e-05, + "loss": 0.7737, + "step": 4138 + }, + { + "epoch": 0.340588356305287, + "grad_norm": 2.588606371301522, + "learning_rate": 1.535340622021732e-05, + "loss": 0.793, + "step": 4139 + }, + { + "epoch": 0.340670643900432, + "grad_norm": 1.8902237364663586, + "learning_rate": 1.5351154818521626e-05, + "loss": 0.7944, + "step": 4140 + }, + { + "epoch": 0.34075293149557706, + "grad_norm": 1.7204007865686732, + "learning_rate": 1.5348903036688456e-05, + "loss": 0.7956, + "step": 4141 + }, + { + "epoch": 0.34083521909072206, + "grad_norm": 1.9999028390813671, + "learning_rate": 1.534665087487777e-05, + "loss": 0.7936, + "step": 4142 + }, + { + "epoch": 0.3409175066858671, + "grad_norm": 1.8644536096936406, + "learning_rate": 1.5344398333249554e-05, + "loss": 0.7921, + "step": 4143 + }, + { + "epoch": 0.3409997942810121, + "grad_norm": 1.7722552179841173, + "learning_rate": 1.534214541196383e-05, + "loss": 0.794, + "step": 4144 + }, + { + "epoch": 0.3410820818761572, + "grad_norm": 1.6203041917519163, + "learning_rate": 1.5339892111180637e-05, + "loss": 0.7749, + "step": 4145 + }, + { + "epoch": 0.3411643694713022, + "grad_norm": 1.7129423869701084, + "learning_rate": 1.533763843106005e-05, + "loss": 0.796, + "step": 4146 + }, + { + "epoch": 0.34124665706644725, + "grad_norm": 1.751627667300706, + "learning_rate": 1.5335384371762163e-05, + "loss": 0.7904, + "step": 4147 + }, + { + "epoch": 0.34132894466159225, + "grad_norm": 1.693787450935078, + "learning_rate": 1.5333129933447103e-05, + "loss": 0.7835, + "step": 4148 + }, + { + "epoch": 0.3414112322567373, + "grad_norm": 1.849766722664072, + "learning_rate": 1.5330875116275022e-05, + "loss": 0.7825, + "step": 4149 + }, + { + "epoch": 0.3414935198518823, + "grad_norm": 2.335996008891753, + "learning_rate": 1.5328619920406102e-05, + "loss": 0.784, + "step": 4150 + }, + { + "epoch": 0.34157580744702737, + "grad_norm": 0.44901144941744836, + "learning_rate": 1.532636434600054e-05, + "loss": 0.5513, + "step": 4151 + }, + { + "epoch": 0.34165809504217237, + "grad_norm": 1.7605434715758692, + "learning_rate": 1.5324108393218576e-05, + "loss": 0.7851, + "step": 4152 + }, + { + "epoch": 0.34174038263731743, + "grad_norm": 1.9583447241776644, + "learning_rate": 1.5321852062220467e-05, + "loss": 0.817, + "step": 4153 + }, + { + "epoch": 0.34182267023246243, + "grad_norm": 1.9555340733009015, + "learning_rate": 1.5319595353166496e-05, + "loss": 0.7928, + "step": 4154 + }, + { + "epoch": 0.3419049578276075, + "grad_norm": 1.6234717616083876, + "learning_rate": 1.531733826621698e-05, + "loss": 0.77, + "step": 4155 + }, + { + "epoch": 0.3419872454227525, + "grad_norm": 0.4456752890716857, + "learning_rate": 1.5315080801532255e-05, + "loss": 0.5504, + "step": 4156 + }, + { + "epoch": 0.34206953301789755, + "grad_norm": 0.44365575104674465, + "learning_rate": 1.531282295927269e-05, + "loss": 0.5112, + "step": 4157 + }, + { + "epoch": 0.34215182061304256, + "grad_norm": 1.947857220556244, + "learning_rate": 1.531056473959868e-05, + "loss": 0.7754, + "step": 4158 + }, + { + "epoch": 0.3422341082081876, + "grad_norm": 2.12391634258539, + "learning_rate": 1.530830614267065e-05, + "loss": 0.7968, + "step": 4159 + }, + { + "epoch": 0.3423163958033327, + "grad_norm": 1.750950386734388, + "learning_rate": 1.530604716864903e-05, + "loss": 0.7881, + "step": 4160 + }, + { + "epoch": 0.3423986833984777, + "grad_norm": 1.9947279914057716, + "learning_rate": 1.530378781769431e-05, + "loss": 0.7887, + "step": 4161 + }, + { + "epoch": 0.34248097099362274, + "grad_norm": 0.45005457810175087, + "learning_rate": 1.5301528089966987e-05, + "loss": 0.5424, + "step": 4162 + }, + { + "epoch": 0.34256325858876774, + "grad_norm": 2.2805189849580034, + "learning_rate": 1.529926798562759e-05, + "loss": 0.7734, + "step": 4163 + }, + { + "epoch": 0.3426455461839128, + "grad_norm": 1.5993081613216364, + "learning_rate": 1.529700750483666e-05, + "loss": 0.7817, + "step": 4164 + }, + { + "epoch": 0.3427278337790578, + "grad_norm": 1.880393324380173, + "learning_rate": 1.5294746647754796e-05, + "loss": 0.791, + "step": 4165 + }, + { + "epoch": 0.34281012137420286, + "grad_norm": 2.2495773107601904, + "learning_rate": 1.52924854145426e-05, + "loss": 0.7674, + "step": 4166 + }, + { + "epoch": 0.34289240896934786, + "grad_norm": 1.6974789508594508, + "learning_rate": 1.52902238053607e-05, + "loss": 0.8326, + "step": 4167 + }, + { + "epoch": 0.3429746965644929, + "grad_norm": 0.4563080678296591, + "learning_rate": 1.528796182036976e-05, + "loss": 0.5343, + "step": 4168 + }, + { + "epoch": 0.3430569841596379, + "grad_norm": 1.556776041063212, + "learning_rate": 1.528569945973047e-05, + "loss": 0.8141, + "step": 4169 + }, + { + "epoch": 0.343139271754783, + "grad_norm": 1.5065087672450441, + "learning_rate": 1.5283436723603545e-05, + "loss": 0.7871, + "step": 4170 + }, + { + "epoch": 0.343221559349928, + "grad_norm": 1.4917252771642058, + "learning_rate": 1.5281173612149723e-05, + "loss": 0.7942, + "step": 4171 + }, + { + "epoch": 0.34330384694507304, + "grad_norm": 1.7600396605299424, + "learning_rate": 1.5278910125529776e-05, + "loss": 0.7875, + "step": 4172 + }, + { + "epoch": 0.34338613454021805, + "grad_norm": 0.4718144474080084, + "learning_rate": 1.5276646263904493e-05, + "loss": 0.528, + "step": 4173 + }, + { + "epoch": 0.3434684221353631, + "grad_norm": 1.7921169307664222, + "learning_rate": 1.52743820274347e-05, + "loss": 0.7934, + "step": 4174 + }, + { + "epoch": 0.3435507097305081, + "grad_norm": 1.906441376187714, + "learning_rate": 1.5272117416281242e-05, + "loss": 0.8022, + "step": 4175 + }, + { + "epoch": 0.34363299732565317, + "grad_norm": 0.4163744964764687, + "learning_rate": 1.5269852430604997e-05, + "loss": 0.4966, + "step": 4176 + }, + { + "epoch": 0.34371528492079817, + "grad_norm": 1.8296072306072015, + "learning_rate": 1.5267587070566864e-05, + "loss": 0.8, + "step": 4177 + }, + { + "epoch": 0.34379757251594323, + "grad_norm": 1.674607986023617, + "learning_rate": 1.5265321336327766e-05, + "loss": 0.7773, + "step": 4178 + }, + { + "epoch": 0.34387986011108823, + "grad_norm": 1.6711256431021402, + "learning_rate": 1.526305522804866e-05, + "loss": 0.7988, + "step": 4179 + }, + { + "epoch": 0.3439621477062333, + "grad_norm": 1.6731906814384143, + "learning_rate": 1.526078874589053e-05, + "loss": 0.7912, + "step": 4180 + }, + { + "epoch": 0.3440444353013783, + "grad_norm": 1.5370806580936933, + "learning_rate": 1.5258521890014381e-05, + "loss": 0.7959, + "step": 4181 + }, + { + "epoch": 0.34412672289652335, + "grad_norm": 1.7509253626808505, + "learning_rate": 1.5256254660581247e-05, + "loss": 0.789, + "step": 4182 + }, + { + "epoch": 0.34420901049166835, + "grad_norm": 1.6598945176515831, + "learning_rate": 1.5253987057752186e-05, + "loss": 0.806, + "step": 4183 + }, + { + "epoch": 0.3442912980868134, + "grad_norm": 1.9159792462949143, + "learning_rate": 1.5251719081688288e-05, + "loss": 0.8085, + "step": 4184 + }, + { + "epoch": 0.34437358568195847, + "grad_norm": 1.7556028330987612, + "learning_rate": 1.5249450732550668e-05, + "loss": 0.787, + "step": 4185 + }, + { + "epoch": 0.3444558732771035, + "grad_norm": 2.318757101763348, + "learning_rate": 1.5247182010500458e-05, + "loss": 0.7872, + "step": 4186 + }, + { + "epoch": 0.34453816087224853, + "grad_norm": 3.031324644798477, + "learning_rate": 1.5244912915698833e-05, + "loss": 0.8013, + "step": 4187 + }, + { + "epoch": 0.34462044846739354, + "grad_norm": 2.0434488021905586, + "learning_rate": 1.5242643448306981e-05, + "loss": 0.8163, + "step": 4188 + }, + { + "epoch": 0.3447027360625386, + "grad_norm": 1.6799979144229933, + "learning_rate": 1.5240373608486123e-05, + "loss": 0.7715, + "step": 4189 + }, + { + "epoch": 0.3447850236576836, + "grad_norm": 0.4467407100216385, + "learning_rate": 1.5238103396397505e-05, + "loss": 0.5467, + "step": 4190 + }, + { + "epoch": 0.34486731125282866, + "grad_norm": 1.9693649433882872, + "learning_rate": 1.52358328122024e-05, + "loss": 0.7841, + "step": 4191 + }, + { + "epoch": 0.34494959884797366, + "grad_norm": 1.2915651169899338, + "learning_rate": 1.5233561856062104e-05, + "loss": 0.8025, + "step": 4192 + }, + { + "epoch": 0.3450318864431187, + "grad_norm": 1.7854079753024872, + "learning_rate": 1.5231290528137943e-05, + "loss": 0.7722, + "step": 4193 + }, + { + "epoch": 0.3451141740382637, + "grad_norm": 1.6367298314154521, + "learning_rate": 1.5229018828591273e-05, + "loss": 0.8148, + "step": 4194 + }, + { + "epoch": 0.3451964616334088, + "grad_norm": 1.7294806681563275, + "learning_rate": 1.5226746757583465e-05, + "loss": 0.7904, + "step": 4195 + }, + { + "epoch": 0.3452787492285538, + "grad_norm": 0.4373012276375105, + "learning_rate": 1.5224474315275926e-05, + "loss": 0.54, + "step": 4196 + }, + { + "epoch": 0.34536103682369884, + "grad_norm": 1.5022173094188904, + "learning_rate": 1.5222201501830088e-05, + "loss": 0.7818, + "step": 4197 + }, + { + "epoch": 0.34544332441884384, + "grad_norm": 1.7240451693707013, + "learning_rate": 1.5219928317407404e-05, + "loss": 0.7736, + "step": 4198 + }, + { + "epoch": 0.3455256120139889, + "grad_norm": 2.205567965000888, + "learning_rate": 1.5217654762169364e-05, + "loss": 0.7661, + "step": 4199 + }, + { + "epoch": 0.3456078996091339, + "grad_norm": 1.586922527517016, + "learning_rate": 1.5215380836277474e-05, + "loss": 0.7783, + "step": 4200 + }, + { + "epoch": 0.34569018720427896, + "grad_norm": 1.7853572284210082, + "learning_rate": 1.521310653989327e-05, + "loss": 0.8258, + "step": 4201 + }, + { + "epoch": 0.34577247479942397, + "grad_norm": 1.4672545450198033, + "learning_rate": 1.5210831873178311e-05, + "loss": 0.7834, + "step": 4202 + }, + { + "epoch": 0.345854762394569, + "grad_norm": 1.5690386844013842, + "learning_rate": 1.5208556836294192e-05, + "loss": 0.7796, + "step": 4203 + }, + { + "epoch": 0.34593704998971403, + "grad_norm": 0.4192692613470579, + "learning_rate": 1.5206281429402524e-05, + "loss": 0.5253, + "step": 4204 + }, + { + "epoch": 0.3460193375848591, + "grad_norm": 1.5149887373516677, + "learning_rate": 1.520400565266495e-05, + "loss": 0.8028, + "step": 4205 + }, + { + "epoch": 0.3461016251800041, + "grad_norm": 1.453553109481577, + "learning_rate": 1.520172950624314e-05, + "loss": 0.7909, + "step": 4206 + }, + { + "epoch": 0.34618391277514915, + "grad_norm": 0.4297334154612609, + "learning_rate": 1.5199452990298781e-05, + "loss": 0.5441, + "step": 4207 + }, + { + "epoch": 0.34626620037029415, + "grad_norm": 1.6152493810058053, + "learning_rate": 1.5197176104993598e-05, + "loss": 0.7648, + "step": 4208 + }, + { + "epoch": 0.3463484879654392, + "grad_norm": 1.4384567755187307, + "learning_rate": 1.5194898850489338e-05, + "loss": 0.8243, + "step": 4209 + }, + { + "epoch": 0.3464307755605842, + "grad_norm": 1.9412054957510942, + "learning_rate": 1.519262122694777e-05, + "loss": 0.7981, + "step": 4210 + }, + { + "epoch": 0.3465130631557293, + "grad_norm": 1.6101767213118356, + "learning_rate": 1.5190343234530694e-05, + "loss": 0.766, + "step": 4211 + }, + { + "epoch": 0.34659535075087433, + "grad_norm": 3.3062058052946153, + "learning_rate": 1.5188064873399935e-05, + "loss": 0.769, + "step": 4212 + }, + { + "epoch": 0.34667763834601933, + "grad_norm": 0.42009599275022463, + "learning_rate": 1.5185786143717347e-05, + "loss": 0.5446, + "step": 4213 + }, + { + "epoch": 0.3467599259411644, + "grad_norm": 1.4677687844466423, + "learning_rate": 1.51835070456448e-05, + "loss": 0.7933, + "step": 4214 + }, + { + "epoch": 0.3468422135363094, + "grad_norm": 1.493217503475276, + "learning_rate": 1.5181227579344207e-05, + "loss": 0.7651, + "step": 4215 + }, + { + "epoch": 0.34692450113145445, + "grad_norm": 1.74764198103528, + "learning_rate": 1.5178947744977493e-05, + "loss": 0.7622, + "step": 4216 + }, + { + "epoch": 0.34700678872659946, + "grad_norm": 1.862412917914253, + "learning_rate": 1.5176667542706611e-05, + "loss": 0.8079, + "step": 4217 + }, + { + "epoch": 0.3470890763217445, + "grad_norm": 1.758942491124323, + "learning_rate": 1.5174386972693546e-05, + "loss": 0.7467, + "step": 4218 + }, + { + "epoch": 0.3471713639168895, + "grad_norm": 1.5665368455310102, + "learning_rate": 1.5172106035100305e-05, + "loss": 0.8025, + "step": 4219 + }, + { + "epoch": 0.3472536515120346, + "grad_norm": 1.711947014681679, + "learning_rate": 1.5169824730088926e-05, + "loss": 0.7928, + "step": 4220 + }, + { + "epoch": 0.3473359391071796, + "grad_norm": 2.0085808038513737, + "learning_rate": 1.5167543057821463e-05, + "loss": 0.7835, + "step": 4221 + }, + { + "epoch": 0.34741822670232464, + "grad_norm": 1.7430833856993977, + "learning_rate": 1.5165261018460004e-05, + "loss": 0.7675, + "step": 4222 + }, + { + "epoch": 0.34750051429746964, + "grad_norm": 1.845518885363521, + "learning_rate": 1.5162978612166668e-05, + "loss": 0.7779, + "step": 4223 + }, + { + "epoch": 0.3475828018926147, + "grad_norm": 2.7130830798869385, + "learning_rate": 1.5160695839103587e-05, + "loss": 0.7705, + "step": 4224 + }, + { + "epoch": 0.3476650894877597, + "grad_norm": 1.9547052151210038, + "learning_rate": 1.5158412699432923e-05, + "loss": 0.783, + "step": 4225 + }, + { + "epoch": 0.34774737708290476, + "grad_norm": 2.5145943348093507, + "learning_rate": 1.5156129193316876e-05, + "loss": 0.782, + "step": 4226 + }, + { + "epoch": 0.34782966467804977, + "grad_norm": 1.65257979402394, + "learning_rate": 1.5153845320917653e-05, + "loss": 0.8015, + "step": 4227 + }, + { + "epoch": 0.3479119522731948, + "grad_norm": 1.92403730012545, + "learning_rate": 1.51515610823975e-05, + "loss": 0.8318, + "step": 4228 + }, + { + "epoch": 0.3479942398683398, + "grad_norm": 2.394929975442086, + "learning_rate": 1.5149276477918691e-05, + "loss": 0.8012, + "step": 4229 + }, + { + "epoch": 0.3480765274634849, + "grad_norm": 2.7461350080669233, + "learning_rate": 1.5146991507643514e-05, + "loss": 0.8125, + "step": 4230 + }, + { + "epoch": 0.3481588150586299, + "grad_norm": 1.8120751425621948, + "learning_rate": 1.5144706171734289e-05, + "loss": 0.7726, + "step": 4231 + }, + { + "epoch": 0.34824110265377495, + "grad_norm": 1.880205900342263, + "learning_rate": 1.514242047035337e-05, + "loss": 0.7893, + "step": 4232 + }, + { + "epoch": 0.34832339024891995, + "grad_norm": 0.4647853530055138, + "learning_rate": 1.5140134403663123e-05, + "loss": 0.5387, + "step": 4233 + }, + { + "epoch": 0.348405677844065, + "grad_norm": 1.856891163678968, + "learning_rate": 1.5137847971825945e-05, + "loss": 0.7911, + "step": 4234 + }, + { + "epoch": 0.34848796543921, + "grad_norm": 2.5751394290321663, + "learning_rate": 1.5135561175004267e-05, + "loss": 0.802, + "step": 4235 + }, + { + "epoch": 0.34857025303435507, + "grad_norm": 2.10679008451569, + "learning_rate": 1.5133274013360537e-05, + "loss": 0.7561, + "step": 4236 + }, + { + "epoch": 0.34865254062950013, + "grad_norm": 6.051687839873176, + "learning_rate": 1.513098648705723e-05, + "loss": 0.7989, + "step": 4237 + }, + { + "epoch": 0.34873482822464513, + "grad_norm": 1.8938964041180257, + "learning_rate": 1.5128698596256848e-05, + "loss": 0.7753, + "step": 4238 + }, + { + "epoch": 0.3488171158197902, + "grad_norm": 0.43663972439972254, + "learning_rate": 1.5126410341121918e-05, + "loss": 0.5499, + "step": 4239 + }, + { + "epoch": 0.3488994034149352, + "grad_norm": 1.7524512839146058, + "learning_rate": 1.5124121721814997e-05, + "loss": 0.7988, + "step": 4240 + }, + { + "epoch": 0.34898169101008025, + "grad_norm": 2.147678489537462, + "learning_rate": 1.5121832738498668e-05, + "loss": 0.8009, + "step": 4241 + }, + { + "epoch": 0.34906397860522526, + "grad_norm": 1.9916009072361731, + "learning_rate": 1.5119543391335528e-05, + "loss": 0.8214, + "step": 4242 + }, + { + "epoch": 0.3491462662003703, + "grad_norm": 2.085216813138033, + "learning_rate": 1.5117253680488213e-05, + "loss": 0.7498, + "step": 4243 + }, + { + "epoch": 0.3492285537955153, + "grad_norm": 1.714008577700925, + "learning_rate": 1.5114963606119385e-05, + "loss": 0.8042, + "step": 4244 + }, + { + "epoch": 0.3493108413906604, + "grad_norm": 1.7263799829530475, + "learning_rate": 1.5112673168391717e-05, + "loss": 0.7906, + "step": 4245 + }, + { + "epoch": 0.3493931289858054, + "grad_norm": 2.130672830015722, + "learning_rate": 1.5110382367467923e-05, + "loss": 0.7749, + "step": 4246 + }, + { + "epoch": 0.34947541658095044, + "grad_norm": 1.7780716625039947, + "learning_rate": 1.5108091203510742e-05, + "loss": 0.7774, + "step": 4247 + }, + { + "epoch": 0.34955770417609544, + "grad_norm": 2.1644882894071458, + "learning_rate": 1.510579967668293e-05, + "loss": 0.7883, + "step": 4248 + }, + { + "epoch": 0.3496399917712405, + "grad_norm": 1.760334884478578, + "learning_rate": 1.5103507787147273e-05, + "loss": 0.7546, + "step": 4249 + }, + { + "epoch": 0.3497222793663855, + "grad_norm": 1.9435333039412346, + "learning_rate": 1.5101215535066589e-05, + "loss": 0.8172, + "step": 4250 + }, + { + "epoch": 0.34980456696153056, + "grad_norm": 1.631519680132551, + "learning_rate": 1.5098922920603709e-05, + "loss": 0.7752, + "step": 4251 + }, + { + "epoch": 0.34988685455667556, + "grad_norm": 1.6116949765501842, + "learning_rate": 1.5096629943921502e-05, + "loss": 0.7568, + "step": 4252 + }, + { + "epoch": 0.3499691421518206, + "grad_norm": 2.2422131361565327, + "learning_rate": 1.509433660518285e-05, + "loss": 0.8293, + "step": 4253 + }, + { + "epoch": 0.3500514297469656, + "grad_norm": 2.596112128877894, + "learning_rate": 1.5092042904550673e-05, + "loss": 0.7793, + "step": 4254 + }, + { + "epoch": 0.3501337173421107, + "grad_norm": 0.4581743573064218, + "learning_rate": 1.5089748842187914e-05, + "loss": 0.548, + "step": 4255 + }, + { + "epoch": 0.3502160049372557, + "grad_norm": 1.776109036903087, + "learning_rate": 1.5087454418257537e-05, + "loss": 0.8004, + "step": 4256 + }, + { + "epoch": 0.35029829253240075, + "grad_norm": 2.1083981886382475, + "learning_rate": 1.5085159632922532e-05, + "loss": 0.805, + "step": 4257 + }, + { + "epoch": 0.35038058012754575, + "grad_norm": 0.4169899132477313, + "learning_rate": 1.5082864486345923e-05, + "loss": 0.5054, + "step": 4258 + }, + { + "epoch": 0.3504628677226908, + "grad_norm": 0.40960518547708463, + "learning_rate": 1.5080568978690746e-05, + "loss": 0.5182, + "step": 4259 + }, + { + "epoch": 0.3505451553178358, + "grad_norm": 2.1915036808679096, + "learning_rate": 1.5078273110120074e-05, + "loss": 0.7803, + "step": 4260 + }, + { + "epoch": 0.35062744291298087, + "grad_norm": 1.8696022940477977, + "learning_rate": 1.5075976880797006e-05, + "loss": 0.7757, + "step": 4261 + }, + { + "epoch": 0.3507097305081259, + "grad_norm": 2.363058279324034, + "learning_rate": 1.5073680290884654e-05, + "loss": 0.7653, + "step": 4262 + }, + { + "epoch": 0.35079201810327093, + "grad_norm": 0.46062839853449933, + "learning_rate": 1.5071383340546169e-05, + "loss": 0.5365, + "step": 4263 + }, + { + "epoch": 0.350874305698416, + "grad_norm": 0.43011203608729137, + "learning_rate": 1.5069086029944723e-05, + "loss": 0.5271, + "step": 4264 + }, + { + "epoch": 0.350956593293561, + "grad_norm": 2.037853153702179, + "learning_rate": 1.5066788359243512e-05, + "loss": 0.7989, + "step": 4265 + }, + { + "epoch": 0.35103888088870605, + "grad_norm": 1.639424871080346, + "learning_rate": 1.5064490328605756e-05, + "loss": 0.7834, + "step": 4266 + }, + { + "epoch": 0.35112116848385105, + "grad_norm": 1.6125841712238647, + "learning_rate": 1.5062191938194712e-05, + "loss": 0.802, + "step": 4267 + }, + { + "epoch": 0.3512034560789961, + "grad_norm": 1.8753737259352214, + "learning_rate": 1.5059893188173647e-05, + "loss": 0.7807, + "step": 4268 + }, + { + "epoch": 0.3512857436741411, + "grad_norm": 1.6341123032672547, + "learning_rate": 1.5057594078705857e-05, + "loss": 0.7787, + "step": 4269 + }, + { + "epoch": 0.3513680312692862, + "grad_norm": 2.237770860720103, + "learning_rate": 1.5055294609954678e-05, + "loss": 0.7762, + "step": 4270 + }, + { + "epoch": 0.3514503188644312, + "grad_norm": 1.5519616780677832, + "learning_rate": 1.5052994782083454e-05, + "loss": 0.8126, + "step": 4271 + }, + { + "epoch": 0.35153260645957624, + "grad_norm": 1.509703879953318, + "learning_rate": 1.5050694595255558e-05, + "loss": 0.7957, + "step": 4272 + }, + { + "epoch": 0.35161489405472124, + "grad_norm": 1.844428715887101, + "learning_rate": 1.5048394049634398e-05, + "loss": 0.7769, + "step": 4273 + }, + { + "epoch": 0.3516971816498663, + "grad_norm": 0.4780655911411019, + "learning_rate": 1.5046093145383397e-05, + "loss": 0.5073, + "step": 4274 + }, + { + "epoch": 0.3517794692450113, + "grad_norm": 1.8122459480963558, + "learning_rate": 1.5043791882666013e-05, + "loss": 0.7866, + "step": 4275 + }, + { + "epoch": 0.35186175684015636, + "grad_norm": 2.7196919535131583, + "learning_rate": 1.5041490261645717e-05, + "loss": 0.803, + "step": 4276 + }, + { + "epoch": 0.35194404443530136, + "grad_norm": 1.947362916758738, + "learning_rate": 1.5039188282486015e-05, + "loss": 0.7593, + "step": 4277 + }, + { + "epoch": 0.3520263320304464, + "grad_norm": 1.6755592332557474, + "learning_rate": 1.5036885945350437e-05, + "loss": 0.7979, + "step": 4278 + }, + { + "epoch": 0.3521086196255914, + "grad_norm": 0.44549387741601987, + "learning_rate": 1.5034583250402536e-05, + "loss": 0.5504, + "step": 4279 + }, + { + "epoch": 0.3521909072207365, + "grad_norm": 1.6959891194348287, + "learning_rate": 1.5032280197805894e-05, + "loss": 0.7979, + "step": 4280 + }, + { + "epoch": 0.3522731948158815, + "grad_norm": 0.4334185076699465, + "learning_rate": 1.5029976787724115e-05, + "loss": 0.5261, + "step": 4281 + }, + { + "epoch": 0.35235548241102654, + "grad_norm": 1.5551353847283433, + "learning_rate": 1.5027673020320828e-05, + "loss": 0.7556, + "step": 4282 + }, + { + "epoch": 0.35243777000617155, + "grad_norm": 1.897905580799512, + "learning_rate": 1.502536889575969e-05, + "loss": 0.7847, + "step": 4283 + }, + { + "epoch": 0.3525200576013166, + "grad_norm": 1.4099051942622551, + "learning_rate": 1.5023064414204383e-05, + "loss": 0.8067, + "step": 4284 + }, + { + "epoch": 0.3526023451964616, + "grad_norm": 1.3115240490795925, + "learning_rate": 1.5020759575818615e-05, + "loss": 0.769, + "step": 4285 + }, + { + "epoch": 0.35268463279160667, + "grad_norm": 1.4639194067957033, + "learning_rate": 1.5018454380766114e-05, + "loss": 0.7802, + "step": 4286 + }, + { + "epoch": 0.35276692038675167, + "grad_norm": 2.7851546916573082, + "learning_rate": 1.501614882921064e-05, + "loss": 0.7753, + "step": 4287 + }, + { + "epoch": 0.35284920798189673, + "grad_norm": 1.5293399433307024, + "learning_rate": 1.5013842921315975e-05, + "loss": 0.7762, + "step": 4288 + }, + { + "epoch": 0.3529314955770418, + "grad_norm": 0.4384726124476116, + "learning_rate": 1.5011536657245929e-05, + "loss": 0.506, + "step": 4289 + }, + { + "epoch": 0.3530137831721868, + "grad_norm": 0.4596793540848056, + "learning_rate": 1.5009230037164334e-05, + "loss": 0.5755, + "step": 4290 + }, + { + "epoch": 0.35309607076733185, + "grad_norm": 2.2532046831265706, + "learning_rate": 1.5006923061235044e-05, + "loss": 0.7833, + "step": 4291 + }, + { + "epoch": 0.35317835836247685, + "grad_norm": 1.4800954913943396, + "learning_rate": 1.5004615729621948e-05, + "loss": 0.8004, + "step": 4292 + }, + { + "epoch": 0.3532606459576219, + "grad_norm": 4.369531230294371, + "learning_rate": 1.5002308042488957e-05, + "loss": 0.7948, + "step": 4293 + }, + { + "epoch": 0.3533429335527669, + "grad_norm": 1.4581301798184974, + "learning_rate": 1.5000000000000002e-05, + "loss": 0.8079, + "step": 4294 + }, + { + "epoch": 0.35342522114791197, + "grad_norm": 0.42557480757840555, + "learning_rate": 1.4997691602319043e-05, + "loss": 0.5219, + "step": 4295 + }, + { + "epoch": 0.353507508743057, + "grad_norm": 1.7232340708525398, + "learning_rate": 1.4995382849610067e-05, + "loss": 0.7887, + "step": 4296 + }, + { + "epoch": 0.35358979633820203, + "grad_norm": 1.4294977274858944, + "learning_rate": 1.499307374203708e-05, + "loss": 0.8211, + "step": 4297 + }, + { + "epoch": 0.35367208393334704, + "grad_norm": 1.6951785203767, + "learning_rate": 1.4990764279764119e-05, + "loss": 0.7464, + "step": 4298 + }, + { + "epoch": 0.3537543715284921, + "grad_norm": 1.7342801853234957, + "learning_rate": 1.4988454462955247e-05, + "loss": 0.8096, + "step": 4299 + }, + { + "epoch": 0.3538366591236371, + "grad_norm": 1.8333889306402888, + "learning_rate": 1.4986144291774547e-05, + "loss": 0.8065, + "step": 4300 + }, + { + "epoch": 0.35391894671878216, + "grad_norm": 1.8109674757270033, + "learning_rate": 1.498383376638613e-05, + "loss": 0.8099, + "step": 4301 + }, + { + "epoch": 0.35400123431392716, + "grad_norm": 1.7127351891082367, + "learning_rate": 1.4981522886954134e-05, + "loss": 0.776, + "step": 4302 + }, + { + "epoch": 0.3540835219090722, + "grad_norm": 2.534337989440837, + "learning_rate": 1.4979211653642717e-05, + "loss": 0.7926, + "step": 4303 + }, + { + "epoch": 0.3541658095042172, + "grad_norm": 3.2956799725671764, + "learning_rate": 1.4976900066616069e-05, + "loss": 0.7775, + "step": 4304 + }, + { + "epoch": 0.3542480970993623, + "grad_norm": 1.3450479980394232, + "learning_rate": 1.49745881260384e-05, + "loss": 0.7548, + "step": 4305 + }, + { + "epoch": 0.3543303846945073, + "grad_norm": 1.6505077821163292, + "learning_rate": 1.4972275832073946e-05, + "loss": 0.7936, + "step": 4306 + }, + { + "epoch": 0.35441267228965234, + "grad_norm": 2.0578215330096588, + "learning_rate": 1.4969963184886966e-05, + "loss": 0.7781, + "step": 4307 + }, + { + "epoch": 0.35449495988479734, + "grad_norm": 1.585819640723395, + "learning_rate": 1.4967650184641753e-05, + "loss": 0.7997, + "step": 4308 + }, + { + "epoch": 0.3545772474799424, + "grad_norm": 1.7645604991703887, + "learning_rate": 1.4965336831502614e-05, + "loss": 0.8015, + "step": 4309 + }, + { + "epoch": 0.3546595350750874, + "grad_norm": 0.4372974520778442, + "learning_rate": 1.4963023125633887e-05, + "loss": 0.5125, + "step": 4310 + }, + { + "epoch": 0.35474182267023247, + "grad_norm": 1.4258633483589316, + "learning_rate": 1.4960709067199937e-05, + "loss": 0.7803, + "step": 4311 + }, + { + "epoch": 0.35482411026537747, + "grad_norm": 1.5188146591151062, + "learning_rate": 1.4958394656365146e-05, + "loss": 0.7933, + "step": 4312 + }, + { + "epoch": 0.3549063978605225, + "grad_norm": 2.6880047243594043, + "learning_rate": 1.4956079893293926e-05, + "loss": 0.7793, + "step": 4313 + }, + { + "epoch": 0.3549886854556676, + "grad_norm": 0.43100430278635776, + "learning_rate": 1.495376477815072e-05, + "loss": 0.5538, + "step": 4314 + }, + { + "epoch": 0.3550709730508126, + "grad_norm": 0.43658011623034215, + "learning_rate": 1.4951449311099988e-05, + "loss": 0.5187, + "step": 4315 + }, + { + "epoch": 0.35515326064595765, + "grad_norm": 1.4978063264835995, + "learning_rate": 1.4949133492306212e-05, + "loss": 0.7808, + "step": 4316 + }, + { + "epoch": 0.35523554824110265, + "grad_norm": 2.017109062281373, + "learning_rate": 1.4946817321933908e-05, + "loss": 0.7691, + "step": 4317 + }, + { + "epoch": 0.3553178358362477, + "grad_norm": 0.43278384502390105, + "learning_rate": 1.4944500800147614e-05, + "loss": 0.5224, + "step": 4318 + }, + { + "epoch": 0.3554001234313927, + "grad_norm": 2.547518864388293, + "learning_rate": 1.4942183927111894e-05, + "loss": 0.7816, + "step": 4319 + }, + { + "epoch": 0.35548241102653777, + "grad_norm": 1.5296634536888702, + "learning_rate": 1.4939866702991326e-05, + "loss": 0.7757, + "step": 4320 + }, + { + "epoch": 0.3555646986216828, + "grad_norm": 2.5812116396659763, + "learning_rate": 1.493754912795053e-05, + "loss": 0.8021, + "step": 4321 + }, + { + "epoch": 0.35564698621682783, + "grad_norm": 0.4565619768179845, + "learning_rate": 1.493523120215414e-05, + "loss": 0.5301, + "step": 4322 + }, + { + "epoch": 0.35572927381197283, + "grad_norm": 0.4322188785723711, + "learning_rate": 1.4932912925766818e-05, + "loss": 0.5188, + "step": 4323 + }, + { + "epoch": 0.3558115614071179, + "grad_norm": 1.6949003738215151, + "learning_rate": 1.493059429895325e-05, + "loss": 0.8034, + "step": 4324 + }, + { + "epoch": 0.3558938490022629, + "grad_norm": 1.8602893281816135, + "learning_rate": 1.4928275321878152e-05, + "loss": 0.8028, + "step": 4325 + }, + { + "epoch": 0.35597613659740795, + "grad_norm": 1.4907644504514543, + "learning_rate": 1.4925955994706255e-05, + "loss": 0.7807, + "step": 4326 + }, + { + "epoch": 0.35605842419255296, + "grad_norm": 1.4239535398006118, + "learning_rate": 1.4923636317602318e-05, + "loss": 0.7921, + "step": 4327 + }, + { + "epoch": 0.356140711787698, + "grad_norm": 1.4902110723412587, + "learning_rate": 1.4921316290731134e-05, + "loss": 0.7928, + "step": 4328 + }, + { + "epoch": 0.356222999382843, + "grad_norm": 1.8527416755681474, + "learning_rate": 1.491899591425751e-05, + "loss": 0.7832, + "step": 4329 + }, + { + "epoch": 0.3563052869779881, + "grad_norm": 1.8637668135802596, + "learning_rate": 1.4916675188346284e-05, + "loss": 0.7864, + "step": 4330 + }, + { + "epoch": 0.3563875745731331, + "grad_norm": 3.0640721383251632, + "learning_rate": 1.491435411316232e-05, + "loss": 0.7782, + "step": 4331 + }, + { + "epoch": 0.35646986216827814, + "grad_norm": 2.388721813885545, + "learning_rate": 1.4912032688870493e-05, + "loss": 0.7903, + "step": 4332 + }, + { + "epoch": 0.35655214976342314, + "grad_norm": 0.44245453915069405, + "learning_rate": 1.4909710915635722e-05, + "loss": 0.529, + "step": 4333 + }, + { + "epoch": 0.3566344373585682, + "grad_norm": 1.6032963266044926, + "learning_rate": 1.4907388793622939e-05, + "loss": 0.7919, + "step": 4334 + }, + { + "epoch": 0.3567167249537132, + "grad_norm": 1.6393045541929858, + "learning_rate": 1.4905066322997105e-05, + "loss": 0.7842, + "step": 4335 + }, + { + "epoch": 0.35679901254885826, + "grad_norm": 1.507884493635332, + "learning_rate": 1.4902743503923205e-05, + "loss": 0.8016, + "step": 4336 + }, + { + "epoch": 0.35688130014400327, + "grad_norm": 1.3560049215640406, + "learning_rate": 1.4900420336566243e-05, + "loss": 0.807, + "step": 4337 + }, + { + "epoch": 0.3569635877391483, + "grad_norm": 1.4349679556693968, + "learning_rate": 1.4898096821091262e-05, + "loss": 0.7375, + "step": 4338 + }, + { + "epoch": 0.35704587533429333, + "grad_norm": 1.5183757647071592, + "learning_rate": 1.4895772957663315e-05, + "loss": 0.7875, + "step": 4339 + }, + { + "epoch": 0.3571281629294384, + "grad_norm": 2.2178816644971873, + "learning_rate": 1.4893448746447485e-05, + "loss": 0.8037, + "step": 4340 + }, + { + "epoch": 0.35721045052458344, + "grad_norm": 1.6676595172038777, + "learning_rate": 1.4891124187608883e-05, + "loss": 0.8064, + "step": 4341 + }, + { + "epoch": 0.35729273811972845, + "grad_norm": 1.6292009573164543, + "learning_rate": 1.488879928131264e-05, + "loss": 0.784, + "step": 4342 + }, + { + "epoch": 0.3573750257148735, + "grad_norm": 2.3299089741042924, + "learning_rate": 1.4886474027723916e-05, + "loss": 0.7779, + "step": 4343 + }, + { + "epoch": 0.3574573133100185, + "grad_norm": 2.5843007019758164, + "learning_rate": 1.488414842700789e-05, + "loss": 0.7615, + "step": 4344 + }, + { + "epoch": 0.35753960090516357, + "grad_norm": 2.01821967326205, + "learning_rate": 1.4881822479329776e-05, + "loss": 0.7861, + "step": 4345 + }, + { + "epoch": 0.35762188850030857, + "grad_norm": 1.5113228304964665, + "learning_rate": 1.4879496184854794e-05, + "loss": 0.7825, + "step": 4346 + }, + { + "epoch": 0.35770417609545363, + "grad_norm": 1.364429323660587, + "learning_rate": 1.4877169543748209e-05, + "loss": 0.7601, + "step": 4347 + }, + { + "epoch": 0.35778646369059863, + "grad_norm": 1.9001598124952053, + "learning_rate": 1.48748425561753e-05, + "loss": 0.7942, + "step": 4348 + }, + { + "epoch": 0.3578687512857437, + "grad_norm": 1.6844932331039923, + "learning_rate": 1.487251522230137e-05, + "loss": 0.8034, + "step": 4349 + }, + { + "epoch": 0.3579510388808887, + "grad_norm": 1.7566380889578541, + "learning_rate": 1.4870187542291751e-05, + "loss": 0.8053, + "step": 4350 + }, + { + "epoch": 0.35803332647603375, + "grad_norm": 2.078558288380098, + "learning_rate": 1.4867859516311803e-05, + "loss": 0.7937, + "step": 4351 + }, + { + "epoch": 0.35811561407117876, + "grad_norm": 1.6595737343227377, + "learning_rate": 1.4865531144526894e-05, + "loss": 0.8424, + "step": 4352 + }, + { + "epoch": 0.3581979016663238, + "grad_norm": 1.9147501230715447, + "learning_rate": 1.4863202427102437e-05, + "loss": 0.7919, + "step": 4353 + }, + { + "epoch": 0.3582801892614688, + "grad_norm": 1.84351545251438, + "learning_rate": 1.4860873364203855e-05, + "loss": 0.8086, + "step": 4354 + }, + { + "epoch": 0.3583624768566139, + "grad_norm": 2.037047198191343, + "learning_rate": 1.4858543955996605e-05, + "loss": 0.7899, + "step": 4355 + }, + { + "epoch": 0.3584447644517589, + "grad_norm": 1.9936858446786507, + "learning_rate": 1.4856214202646161e-05, + "loss": 0.8084, + "step": 4356 + }, + { + "epoch": 0.35852705204690394, + "grad_norm": 1.9081258550978737, + "learning_rate": 1.4853884104318028e-05, + "loss": 0.7796, + "step": 4357 + }, + { + "epoch": 0.35860933964204894, + "grad_norm": 2.090683290882742, + "learning_rate": 1.4851553661177728e-05, + "loss": 0.7852, + "step": 4358 + }, + { + "epoch": 0.358691627237194, + "grad_norm": 1.8781498017728904, + "learning_rate": 1.4849222873390815e-05, + "loss": 0.7805, + "step": 4359 + }, + { + "epoch": 0.358773914832339, + "grad_norm": 1.7386884741520234, + "learning_rate": 1.4846891741122869e-05, + "loss": 0.7679, + "step": 4360 + }, + { + "epoch": 0.35885620242748406, + "grad_norm": 1.7603079590151627, + "learning_rate": 1.4844560264539483e-05, + "loss": 0.7481, + "step": 4361 + }, + { + "epoch": 0.35893849002262906, + "grad_norm": 2.4484258611817618, + "learning_rate": 1.4842228443806282e-05, + "loss": 0.8061, + "step": 4362 + }, + { + "epoch": 0.3590207776177741, + "grad_norm": 2.0869440215336836, + "learning_rate": 1.4839896279088917e-05, + "loss": 0.8011, + "step": 4363 + }, + { + "epoch": 0.3591030652129191, + "grad_norm": 0.43753689917699273, + "learning_rate": 1.483756377055306e-05, + "loss": 0.5496, + "step": 4364 + }, + { + "epoch": 0.3591853528080642, + "grad_norm": 2.031649607822119, + "learning_rate": 1.483523091836441e-05, + "loss": 0.8227, + "step": 4365 + }, + { + "epoch": 0.35926764040320924, + "grad_norm": 1.8553219815666742, + "learning_rate": 1.4832897722688688e-05, + "loss": 0.7724, + "step": 4366 + }, + { + "epoch": 0.35934992799835425, + "grad_norm": 3.9583170664767717, + "learning_rate": 1.4830564183691642e-05, + "loss": 0.7842, + "step": 4367 + }, + { + "epoch": 0.3594322155934993, + "grad_norm": 1.7655995488989444, + "learning_rate": 1.4828230301539042e-05, + "loss": 0.7717, + "step": 4368 + }, + { + "epoch": 0.3595145031886443, + "grad_norm": 2.4827230111740852, + "learning_rate": 1.482589607639668e-05, + "loss": 0.7668, + "step": 4369 + }, + { + "epoch": 0.35959679078378937, + "grad_norm": 2.1463383584658353, + "learning_rate": 1.482356150843038e-05, + "loss": 0.7585, + "step": 4370 + }, + { + "epoch": 0.35967907837893437, + "grad_norm": 1.869372283109839, + "learning_rate": 1.4821226597805987e-05, + "loss": 0.7641, + "step": 4371 + }, + { + "epoch": 0.35976136597407943, + "grad_norm": 0.4499207892352961, + "learning_rate": 1.4818891344689363e-05, + "loss": 0.5478, + "step": 4372 + }, + { + "epoch": 0.35984365356922443, + "grad_norm": 2.0440067055615048, + "learning_rate": 1.4816555749246407e-05, + "loss": 0.7762, + "step": 4373 + }, + { + "epoch": 0.3599259411643695, + "grad_norm": 3.147207113013275, + "learning_rate": 1.4814219811643033e-05, + "loss": 0.8085, + "step": 4374 + }, + { + "epoch": 0.3600082287595145, + "grad_norm": 2.216160077603176, + "learning_rate": 1.4811883532045184e-05, + "loss": 0.8197, + "step": 4375 + }, + { + "epoch": 0.36009051635465955, + "grad_norm": 1.9417645543266515, + "learning_rate": 1.4809546910618821e-05, + "loss": 0.7747, + "step": 4376 + }, + { + "epoch": 0.36017280394980455, + "grad_norm": 2.58679860017351, + "learning_rate": 1.4807209947529941e-05, + "loss": 0.7903, + "step": 4377 + }, + { + "epoch": 0.3602550915449496, + "grad_norm": 0.4442965451755858, + "learning_rate": 1.4804872642944553e-05, + "loss": 0.5569, + "step": 4378 + }, + { + "epoch": 0.3603373791400946, + "grad_norm": 2.085092584435455, + "learning_rate": 1.4802534997028695e-05, + "loss": 0.7963, + "step": 4379 + }, + { + "epoch": 0.3604196667352397, + "grad_norm": 3.3901320322156514, + "learning_rate": 1.4800197009948434e-05, + "loss": 0.7941, + "step": 4380 + }, + { + "epoch": 0.3605019543303847, + "grad_norm": 2.064303540863521, + "learning_rate": 1.4797858681869852e-05, + "loss": 0.7987, + "step": 4381 + }, + { + "epoch": 0.36058424192552974, + "grad_norm": 2.485014757376644, + "learning_rate": 1.4795520012959064e-05, + "loss": 0.7748, + "step": 4382 + }, + { + "epoch": 0.36066652952067474, + "grad_norm": 1.914798579701594, + "learning_rate": 1.4793181003382201e-05, + "loss": 0.7639, + "step": 4383 + }, + { + "epoch": 0.3607488171158198, + "grad_norm": 2.838719737626025, + "learning_rate": 1.4790841653305428e-05, + "loss": 0.7505, + "step": 4384 + }, + { + "epoch": 0.3608311047109648, + "grad_norm": 6.260918360797801, + "learning_rate": 1.4788501962894923e-05, + "loss": 0.7675, + "step": 4385 + }, + { + "epoch": 0.36091339230610986, + "grad_norm": 2.5501043836126875, + "learning_rate": 1.47861619323169e-05, + "loss": 0.7504, + "step": 4386 + }, + { + "epoch": 0.36099567990125486, + "grad_norm": 2.1505803851911667, + "learning_rate": 1.4783821561737587e-05, + "loss": 0.8255, + "step": 4387 + }, + { + "epoch": 0.3610779674963999, + "grad_norm": 1.9949662428415138, + "learning_rate": 1.4781480851323238e-05, + "loss": 0.7895, + "step": 4388 + }, + { + "epoch": 0.3611602550915449, + "grad_norm": 2.3932676443907117, + "learning_rate": 1.477913980124014e-05, + "loss": 0.7882, + "step": 4389 + }, + { + "epoch": 0.36124254268669, + "grad_norm": 3.4156281292965858, + "learning_rate": 1.4776798411654589e-05, + "loss": 0.7787, + "step": 4390 + }, + { + "epoch": 0.36132483028183504, + "grad_norm": 7.616137893545026, + "learning_rate": 1.4774456682732923e-05, + "loss": 0.7754, + "step": 4391 + }, + { + "epoch": 0.36140711787698004, + "grad_norm": 2.1515663233240767, + "learning_rate": 1.4772114614641488e-05, + "loss": 0.8253, + "step": 4392 + }, + { + "epoch": 0.3614894054721251, + "grad_norm": 2.2023060458768993, + "learning_rate": 1.4769772207546659e-05, + "loss": 0.7728, + "step": 4393 + }, + { + "epoch": 0.3615716930672701, + "grad_norm": 2.066421862032711, + "learning_rate": 1.4767429461614846e-05, + "loss": 0.8279, + "step": 4394 + }, + { + "epoch": 0.36165398066241516, + "grad_norm": 1.9163198370520074, + "learning_rate": 1.4765086377012466e-05, + "loss": 0.7756, + "step": 4395 + }, + { + "epoch": 0.36173626825756017, + "grad_norm": 2.615582751637595, + "learning_rate": 1.476274295390597e-05, + "loss": 0.7961, + "step": 4396 + }, + { + "epoch": 0.3618185558527052, + "grad_norm": 4.41644033257218, + "learning_rate": 1.4760399192461831e-05, + "loss": 0.7601, + "step": 4397 + }, + { + "epoch": 0.36190084344785023, + "grad_norm": 2.2109711420310236, + "learning_rate": 1.475805509284655e-05, + "loss": 0.7722, + "step": 4398 + }, + { + "epoch": 0.3619831310429953, + "grad_norm": 2.275007390439845, + "learning_rate": 1.475571065522664e-05, + "loss": 0.8105, + "step": 4399 + }, + { + "epoch": 0.3620654186381403, + "grad_norm": 2.6731814569907346, + "learning_rate": 1.4753365879768656e-05, + "loss": 0.761, + "step": 4400 + }, + { + "epoch": 0.36214770623328535, + "grad_norm": 2.2806188231209954, + "learning_rate": 1.4751020766639158e-05, + "loss": 0.803, + "step": 4401 + }, + { + "epoch": 0.36222999382843035, + "grad_norm": 2.6625844562523073, + "learning_rate": 1.4748675316004741e-05, + "loss": 0.7826, + "step": 4402 + }, + { + "epoch": 0.3623122814235754, + "grad_norm": 2.9353270418499036, + "learning_rate": 1.4746329528032029e-05, + "loss": 0.807, + "step": 4403 + }, + { + "epoch": 0.3623945690187204, + "grad_norm": 2.1595877463416455, + "learning_rate": 1.4743983402887654e-05, + "loss": 0.8089, + "step": 4404 + }, + { + "epoch": 0.3624768566138655, + "grad_norm": 0.4436264102071975, + "learning_rate": 1.4741636940738286e-05, + "loss": 0.5838, + "step": 4405 + }, + { + "epoch": 0.3625591442090105, + "grad_norm": 2.1020565999096643, + "learning_rate": 1.4739290141750615e-05, + "loss": 0.7764, + "step": 4406 + }, + { + "epoch": 0.36264143180415553, + "grad_norm": 2.2256976473440324, + "learning_rate": 1.4736943006091348e-05, + "loss": 0.7884, + "step": 4407 + }, + { + "epoch": 0.36272371939930054, + "grad_norm": 1.9858803972230128, + "learning_rate": 1.4734595533927228e-05, + "loss": 0.8152, + "step": 4408 + }, + { + "epoch": 0.3628060069944456, + "grad_norm": 3.1016590160338855, + "learning_rate": 1.4732247725425013e-05, + "loss": 0.7771, + "step": 4409 + }, + { + "epoch": 0.3628882945895906, + "grad_norm": 2.341868715880278, + "learning_rate": 1.4729899580751488e-05, + "loss": 0.7542, + "step": 4410 + }, + { + "epoch": 0.36297058218473566, + "grad_norm": 3.4637181984694623, + "learning_rate": 1.4727551100073458e-05, + "loss": 0.7601, + "step": 4411 + }, + { + "epoch": 0.36305286977988066, + "grad_norm": 2.014929654564701, + "learning_rate": 1.4725202283557762e-05, + "loss": 0.781, + "step": 4412 + }, + { + "epoch": 0.3631351573750257, + "grad_norm": 0.436093194203851, + "learning_rate": 1.4722853131371252e-05, + "loss": 0.5101, + "step": 4413 + }, + { + "epoch": 0.3632174449701707, + "grad_norm": 2.956083725742205, + "learning_rate": 1.4720503643680805e-05, + "loss": 0.767, + "step": 4414 + }, + { + "epoch": 0.3632997325653158, + "grad_norm": 1.9646441399269536, + "learning_rate": 1.4718153820653337e-05, + "loss": 0.7914, + "step": 4415 + }, + { + "epoch": 0.3633820201604608, + "grad_norm": 0.39786861770836607, + "learning_rate": 1.471580366245576e-05, + "loss": 0.5067, + "step": 4416 + }, + { + "epoch": 0.36346430775560584, + "grad_norm": 6.216926529114422, + "learning_rate": 1.4713453169255032e-05, + "loss": 0.7755, + "step": 4417 + }, + { + "epoch": 0.3635465953507509, + "grad_norm": 1.8278689159508343, + "learning_rate": 1.4711102341218133e-05, + "loss": 0.7711, + "step": 4418 + }, + { + "epoch": 0.3636288829458959, + "grad_norm": 2.1161362522194045, + "learning_rate": 1.4708751178512055e-05, + "loss": 0.793, + "step": 4419 + }, + { + "epoch": 0.36371117054104096, + "grad_norm": 2.213773823337977, + "learning_rate": 1.4706399681303825e-05, + "loss": 0.7536, + "step": 4420 + }, + { + "epoch": 0.36379345813618597, + "grad_norm": 2.653453023061755, + "learning_rate": 1.470404784976049e-05, + "loss": 0.765, + "step": 4421 + }, + { + "epoch": 0.363875745731331, + "grad_norm": 2.4982987604135567, + "learning_rate": 1.4701695684049115e-05, + "loss": 0.7616, + "step": 4422 + }, + { + "epoch": 0.363958033326476, + "grad_norm": 2.5408162463108006, + "learning_rate": 1.4699343184336801e-05, + "loss": 0.7701, + "step": 4423 + }, + { + "epoch": 0.3640403209216211, + "grad_norm": 2.3664164761512345, + "learning_rate": 1.4696990350790663e-05, + "loss": 0.7694, + "step": 4424 + }, + { + "epoch": 0.3641226085167661, + "grad_norm": 2.1764983944899647, + "learning_rate": 1.469463718357784e-05, + "loss": 0.7738, + "step": 4425 + }, + { + "epoch": 0.36420489611191115, + "grad_norm": 3.528121792473789, + "learning_rate": 1.46922836828655e-05, + "loss": 0.8011, + "step": 4426 + }, + { + "epoch": 0.36428718370705615, + "grad_norm": 2.715815677172619, + "learning_rate": 1.4689929848820831e-05, + "loss": 0.7594, + "step": 4427 + }, + { + "epoch": 0.3643694713022012, + "grad_norm": 1.9804015653569753, + "learning_rate": 1.4687575681611048e-05, + "loss": 0.7785, + "step": 4428 + }, + { + "epoch": 0.3644517588973462, + "grad_norm": 1.9600468292254916, + "learning_rate": 1.4685221181403382e-05, + "loss": 0.7706, + "step": 4429 + }, + { + "epoch": 0.36453404649249127, + "grad_norm": 2.8777421364458555, + "learning_rate": 1.4682866348365102e-05, + "loss": 0.7803, + "step": 4430 + }, + { + "epoch": 0.3646163340876363, + "grad_norm": 2.176178063953276, + "learning_rate": 1.468051118266348e-05, + "loss": 0.7708, + "step": 4431 + }, + { + "epoch": 0.36469862168278133, + "grad_norm": 1.8795084006154403, + "learning_rate": 1.4678155684465828e-05, + "loss": 0.7682, + "step": 4432 + }, + { + "epoch": 0.36478090927792634, + "grad_norm": 1.7924640517850554, + "learning_rate": 1.4675799853939483e-05, + "loss": 0.7844, + "step": 4433 + }, + { + "epoch": 0.3648631968730714, + "grad_norm": 2.49611047395207, + "learning_rate": 1.4673443691251793e-05, + "loss": 0.7956, + "step": 4434 + }, + { + "epoch": 0.3649454844682164, + "grad_norm": 2.7584007784580247, + "learning_rate": 1.4671087196570137e-05, + "loss": 0.7933, + "step": 4435 + }, + { + "epoch": 0.36502777206336146, + "grad_norm": 2.841840168495128, + "learning_rate": 1.4668730370061914e-05, + "loss": 0.7921, + "step": 4436 + }, + { + "epoch": 0.36511005965850646, + "grad_norm": 1.954551730501201, + "learning_rate": 1.4666373211894553e-05, + "loss": 0.7576, + "step": 4437 + }, + { + "epoch": 0.3651923472536515, + "grad_norm": 0.4457840230578507, + "learning_rate": 1.4664015722235505e-05, + "loss": 0.5342, + "step": 4438 + }, + { + "epoch": 0.3652746348487965, + "grad_norm": 2.272082365086708, + "learning_rate": 1.4661657901252236e-05, + "loss": 0.7938, + "step": 4439 + }, + { + "epoch": 0.3653569224439416, + "grad_norm": 1.8497286525207204, + "learning_rate": 1.4659299749112243e-05, + "loss": 0.7751, + "step": 4440 + }, + { + "epoch": 0.3654392100390866, + "grad_norm": 0.4713257525773104, + "learning_rate": 1.4656941265983054e-05, + "loss": 0.5644, + "step": 4441 + }, + { + "epoch": 0.36552149763423164, + "grad_norm": 1.9767974354616729, + "learning_rate": 1.46545824520322e-05, + "loss": 0.7883, + "step": 4442 + }, + { + "epoch": 0.3656037852293767, + "grad_norm": 2.1481195870810703, + "learning_rate": 1.4652223307427254e-05, + "loss": 0.771, + "step": 4443 + }, + { + "epoch": 0.3656860728245217, + "grad_norm": 1.8051795359148761, + "learning_rate": 1.4649863832335805e-05, + "loss": 0.7909, + "step": 4444 + }, + { + "epoch": 0.36576836041966676, + "grad_norm": 2.3149191020200597, + "learning_rate": 1.4647504026925464e-05, + "loss": 0.7772, + "step": 4445 + }, + { + "epoch": 0.36585064801481176, + "grad_norm": 2.255789293611326, + "learning_rate": 1.4645143891363869e-05, + "loss": 0.7948, + "step": 4446 + }, + { + "epoch": 0.3659329356099568, + "grad_norm": 2.169344677139653, + "learning_rate": 1.4642783425818684e-05, + "loss": 0.8179, + "step": 4447 + }, + { + "epoch": 0.3660152232051018, + "grad_norm": 1.8152605627814558, + "learning_rate": 1.4640422630457586e-05, + "loss": 0.7903, + "step": 4448 + }, + { + "epoch": 0.3660975108002469, + "grad_norm": 2.206827823284778, + "learning_rate": 1.4638061505448286e-05, + "loss": 0.7686, + "step": 4449 + }, + { + "epoch": 0.3661797983953919, + "grad_norm": 2.101686778623947, + "learning_rate": 1.4635700050958516e-05, + "loss": 0.7737, + "step": 4450 + }, + { + "epoch": 0.36626208599053695, + "grad_norm": 1.9607020489875775, + "learning_rate": 1.4633338267156028e-05, + "loss": 0.7568, + "step": 4451 + }, + { + "epoch": 0.36634437358568195, + "grad_norm": 2.0592409137920873, + "learning_rate": 1.4630976154208598e-05, + "loss": 0.7869, + "step": 4452 + }, + { + "epoch": 0.366426661180827, + "grad_norm": 2.6252630769120087, + "learning_rate": 1.462861371228403e-05, + "loss": 0.7687, + "step": 4453 + }, + { + "epoch": 0.366508948775972, + "grad_norm": 2.1773211757346003, + "learning_rate": 1.4626250941550144e-05, + "loss": 0.7811, + "step": 4454 + }, + { + "epoch": 0.36659123637111707, + "grad_norm": 2.180626335955112, + "learning_rate": 1.4623887842174792e-05, + "loss": 0.7553, + "step": 4455 + }, + { + "epoch": 0.36667352396626207, + "grad_norm": 2.320887199378988, + "learning_rate": 1.462152441432584e-05, + "loss": 0.765, + "step": 4456 + }, + { + "epoch": 0.36675581156140713, + "grad_norm": 1.8838337805008327, + "learning_rate": 1.4619160658171186e-05, + "loss": 0.767, + "step": 4457 + }, + { + "epoch": 0.36683809915655213, + "grad_norm": 1.8138251759224935, + "learning_rate": 1.4616796573878746e-05, + "loss": 0.7722, + "step": 4458 + }, + { + "epoch": 0.3669203867516972, + "grad_norm": 0.4544553362135176, + "learning_rate": 1.4614432161616462e-05, + "loss": 0.5241, + "step": 4459 + }, + { + "epoch": 0.3670026743468422, + "grad_norm": 2.4727369174406193, + "learning_rate": 1.4612067421552296e-05, + "loss": 0.7727, + "step": 4460 + }, + { + "epoch": 0.36708496194198725, + "grad_norm": 2.0979961178815763, + "learning_rate": 1.4609702353854237e-05, + "loss": 0.7751, + "step": 4461 + }, + { + "epoch": 0.36716724953713226, + "grad_norm": 0.4150156175350144, + "learning_rate": 1.4607336958690294e-05, + "loss": 0.5245, + "step": 4462 + }, + { + "epoch": 0.3672495371322773, + "grad_norm": 0.42416759569725526, + "learning_rate": 1.4604971236228501e-05, + "loss": 0.5297, + "step": 4463 + }, + { + "epoch": 0.3673318247274223, + "grad_norm": 1.8600715550814397, + "learning_rate": 1.4602605186636915e-05, + "loss": 0.782, + "step": 4464 + }, + { + "epoch": 0.3674141123225674, + "grad_norm": 2.264156513094946, + "learning_rate": 1.4600238810083622e-05, + "loss": 0.7976, + "step": 4465 + }, + { + "epoch": 0.3674963999177124, + "grad_norm": 2.466816640608851, + "learning_rate": 1.4597872106736717e-05, + "loss": 0.769, + "step": 4466 + }, + { + "epoch": 0.36757868751285744, + "grad_norm": 2.4140449926533583, + "learning_rate": 1.459550507676433e-05, + "loss": 0.7374, + "step": 4467 + }, + { + "epoch": 0.36766097510800244, + "grad_norm": 1.9744437590118025, + "learning_rate": 1.4593137720334617e-05, + "loss": 0.8271, + "step": 4468 + }, + { + "epoch": 0.3677432627031475, + "grad_norm": 2.0861707911384118, + "learning_rate": 1.459077003761574e-05, + "loss": 0.7556, + "step": 4469 + }, + { + "epoch": 0.36782555029829256, + "grad_norm": 2.225586480349321, + "learning_rate": 1.4588402028775908e-05, + "loss": 0.8122, + "step": 4470 + }, + { + "epoch": 0.36790783789343756, + "grad_norm": 2.3460756177803863, + "learning_rate": 1.4586033693983327e-05, + "loss": 0.7529, + "step": 4471 + }, + { + "epoch": 0.3679901254885826, + "grad_norm": 2.02700075708636, + "learning_rate": 1.458366503340625e-05, + "loss": 0.759, + "step": 4472 + }, + { + "epoch": 0.3680724130837276, + "grad_norm": 2.1164599563780713, + "learning_rate": 1.458129604721294e-05, + "loss": 0.7755, + "step": 4473 + }, + { + "epoch": 0.3681547006788727, + "grad_norm": 2.1320393238304827, + "learning_rate": 1.4578926735571683e-05, + "loss": 0.8045, + "step": 4474 + }, + { + "epoch": 0.3682369882740177, + "grad_norm": 0.48695037172098554, + "learning_rate": 1.4576557098650796e-05, + "loss": 0.5391, + "step": 4475 + }, + { + "epoch": 0.36831927586916274, + "grad_norm": 1.5776679410827645, + "learning_rate": 1.4574187136618611e-05, + "loss": 0.7711, + "step": 4476 + }, + { + "epoch": 0.36840156346430775, + "grad_norm": 1.983467140644944, + "learning_rate": 1.4571816849643488e-05, + "loss": 0.7422, + "step": 4477 + }, + { + "epoch": 0.3684838510594528, + "grad_norm": 2.7952470061780335, + "learning_rate": 1.4569446237893805e-05, + "loss": 0.7629, + "step": 4478 + }, + { + "epoch": 0.3685661386545978, + "grad_norm": 1.8951929397468077, + "learning_rate": 1.4567075301537973e-05, + "loss": 0.7608, + "step": 4479 + }, + { + "epoch": 0.36864842624974287, + "grad_norm": 2.2644032786330666, + "learning_rate": 1.4564704040744413e-05, + "loss": 0.7731, + "step": 4480 + }, + { + "epoch": 0.36873071384488787, + "grad_norm": 1.6308147304056042, + "learning_rate": 1.4562332455681576e-05, + "loss": 0.7718, + "step": 4481 + }, + { + "epoch": 0.36881300144003293, + "grad_norm": 1.9187417357013663, + "learning_rate": 1.4559960546517941e-05, + "loss": 0.7499, + "step": 4482 + }, + { + "epoch": 0.36889528903517793, + "grad_norm": 2.4970892054774243, + "learning_rate": 1.4557588313422002e-05, + "loss": 0.7722, + "step": 4483 + }, + { + "epoch": 0.368977576630323, + "grad_norm": 2.035220067729166, + "learning_rate": 1.4555215756562275e-05, + "loss": 0.783, + "step": 4484 + }, + { + "epoch": 0.369059864225468, + "grad_norm": 1.5326810812478324, + "learning_rate": 1.455284287610731e-05, + "loss": 0.7957, + "step": 4485 + }, + { + "epoch": 0.36914215182061305, + "grad_norm": 9.804159766806016, + "learning_rate": 1.4550469672225665e-05, + "loss": 0.7803, + "step": 4486 + }, + { + "epoch": 0.36922443941575805, + "grad_norm": 1.7776938498064865, + "learning_rate": 1.454809614508593e-05, + "loss": 0.7409, + "step": 4487 + }, + { + "epoch": 0.3693067270109031, + "grad_norm": 1.4513063578619894, + "learning_rate": 1.4545722294856721e-05, + "loss": 0.7513, + "step": 4488 + }, + { + "epoch": 0.3693890146060481, + "grad_norm": 3.5972151914030492, + "learning_rate": 1.454334812170667e-05, + "loss": 0.7692, + "step": 4489 + }, + { + "epoch": 0.3694713022011932, + "grad_norm": 1.6586163446187947, + "learning_rate": 1.4540973625804433e-05, + "loss": 0.7604, + "step": 4490 + }, + { + "epoch": 0.3695535897963382, + "grad_norm": 0.4313549997543267, + "learning_rate": 1.4538598807318696e-05, + "loss": 0.5467, + "step": 4491 + }, + { + "epoch": 0.36963587739148324, + "grad_norm": 4.145955650814357, + "learning_rate": 1.4536223666418155e-05, + "loss": 0.7478, + "step": 4492 + }, + { + "epoch": 0.36971816498662824, + "grad_norm": 1.7442126531231734, + "learning_rate": 1.4533848203271537e-05, + "loss": 0.8121, + "step": 4493 + }, + { + "epoch": 0.3698004525817733, + "grad_norm": 1.6286417796899968, + "learning_rate": 1.4531472418047598e-05, + "loss": 0.7719, + "step": 4494 + }, + { + "epoch": 0.36988274017691836, + "grad_norm": 1.6615117096187888, + "learning_rate": 1.4529096310915102e-05, + "loss": 0.7699, + "step": 4495 + }, + { + "epoch": 0.36996502777206336, + "grad_norm": 1.4117787685747312, + "learning_rate": 1.4526719882042848e-05, + "loss": 0.7878, + "step": 4496 + }, + { + "epoch": 0.3700473153672084, + "grad_norm": 1.638714013161808, + "learning_rate": 1.4524343131599653e-05, + "loss": 0.7376, + "step": 4497 + }, + { + "epoch": 0.3701296029623534, + "grad_norm": 1.5965406796796069, + "learning_rate": 1.452196605975436e-05, + "loss": 0.7777, + "step": 4498 + }, + { + "epoch": 0.3702118905574985, + "grad_norm": 1.5482838017669318, + "learning_rate": 1.4519588666675827e-05, + "loss": 0.7855, + "step": 4499 + }, + { + "epoch": 0.3702941781526435, + "grad_norm": 1.8858369520853138, + "learning_rate": 1.4517210952532947e-05, + "loss": 0.7805, + "step": 4500 + }, + { + "epoch": 0.37037646574778854, + "grad_norm": 1.4375744990660175, + "learning_rate": 1.4514832917494621e-05, + "loss": 0.7696, + "step": 4501 + }, + { + "epoch": 0.37045875334293354, + "grad_norm": 2.347801549265926, + "learning_rate": 1.4512454561729785e-05, + "loss": 0.7445, + "step": 4502 + }, + { + "epoch": 0.3705410409380786, + "grad_norm": 1.5609272987977516, + "learning_rate": 1.4510075885407397e-05, + "loss": 0.7722, + "step": 4503 + }, + { + "epoch": 0.3706233285332236, + "grad_norm": 0.4317774207142754, + "learning_rate": 1.4507696888696427e-05, + "loss": 0.5321, + "step": 4504 + }, + { + "epoch": 0.37070561612836866, + "grad_norm": 0.44896736781993724, + "learning_rate": 1.4505317571765884e-05, + "loss": 0.5395, + "step": 4505 + }, + { + "epoch": 0.37078790372351367, + "grad_norm": 1.8858777948103396, + "learning_rate": 1.4502937934784782e-05, + "loss": 0.7858, + "step": 4506 + }, + { + "epoch": 0.3708701913186587, + "grad_norm": 2.0328093636829405, + "learning_rate": 1.4500557977922169e-05, + "loss": 0.7787, + "step": 4507 + }, + { + "epoch": 0.37095247891380373, + "grad_norm": 1.4327084093712712, + "learning_rate": 1.449817770134712e-05, + "loss": 0.7672, + "step": 4508 + }, + { + "epoch": 0.3710347665089488, + "grad_norm": 0.43904804316605966, + "learning_rate": 1.4495797105228717e-05, + "loss": 0.5272, + "step": 4509 + }, + { + "epoch": 0.3711170541040938, + "grad_norm": 1.8640721897885113, + "learning_rate": 1.4493416189736078e-05, + "loss": 0.7821, + "step": 4510 + }, + { + "epoch": 0.37119934169923885, + "grad_norm": 2.1193011716107697, + "learning_rate": 1.449103495503834e-05, + "loss": 0.7917, + "step": 4511 + }, + { + "epoch": 0.37128162929438385, + "grad_norm": 1.7976609236363605, + "learning_rate": 1.4488653401304661e-05, + "loss": 0.767, + "step": 4512 + }, + { + "epoch": 0.3713639168895289, + "grad_norm": 2.129476666394086, + "learning_rate": 1.4486271528704221e-05, + "loss": 0.7577, + "step": 4513 + }, + { + "epoch": 0.3714462044846739, + "grad_norm": 1.750353382913871, + "learning_rate": 1.4483889337406229e-05, + "loss": 0.7854, + "step": 4514 + }, + { + "epoch": 0.371528492079819, + "grad_norm": 1.7298980126758945, + "learning_rate": 1.4481506827579907e-05, + "loss": 0.7681, + "step": 4515 + }, + { + "epoch": 0.371610779674964, + "grad_norm": 1.8153783425434746, + "learning_rate": 1.4479123999394511e-05, + "loss": 0.7803, + "step": 4516 + }, + { + "epoch": 0.37169306727010903, + "grad_norm": 1.7403068790709029, + "learning_rate": 1.4476740853019306e-05, + "loss": 0.7721, + "step": 4517 + }, + { + "epoch": 0.37177535486525404, + "grad_norm": 0.4399208383339162, + "learning_rate": 1.447435738862359e-05, + "loss": 0.5243, + "step": 4518 + }, + { + "epoch": 0.3718576424603991, + "grad_norm": 1.6715068973685927, + "learning_rate": 1.4471973606376683e-05, + "loss": 0.7726, + "step": 4519 + }, + { + "epoch": 0.3719399300555441, + "grad_norm": 2.024714093194338, + "learning_rate": 1.446958950644792e-05, + "loss": 0.754, + "step": 4520 + }, + { + "epoch": 0.37202221765068916, + "grad_norm": 1.4834201143071126, + "learning_rate": 1.4467205089006669e-05, + "loss": 0.7723, + "step": 4521 + }, + { + "epoch": 0.3721045052458342, + "grad_norm": 1.8700387404842578, + "learning_rate": 1.4464820354222313e-05, + "loss": 0.7761, + "step": 4522 + }, + { + "epoch": 0.3721867928409792, + "grad_norm": 1.6717258091113558, + "learning_rate": 1.4462435302264258e-05, + "loss": 0.7549, + "step": 4523 + }, + { + "epoch": 0.3722690804361243, + "grad_norm": 1.4988403446389595, + "learning_rate": 1.4460049933301936e-05, + "loss": 0.7658, + "step": 4524 + }, + { + "epoch": 0.3723513680312693, + "grad_norm": 1.9011931159728093, + "learning_rate": 1.4457664247504801e-05, + "loss": 0.7386, + "step": 4525 + }, + { + "epoch": 0.37243365562641434, + "grad_norm": 1.5673821015214395, + "learning_rate": 1.4455278245042324e-05, + "loss": 0.7549, + "step": 4526 + }, + { + "epoch": 0.37251594322155934, + "grad_norm": 7.012481079996853, + "learning_rate": 1.4452891926084007e-05, + "loss": 0.7986, + "step": 4527 + }, + { + "epoch": 0.3725982308167044, + "grad_norm": 2.2057183131549345, + "learning_rate": 1.445050529079937e-05, + "loss": 0.782, + "step": 4528 + }, + { + "epoch": 0.3726805184118494, + "grad_norm": 1.6008460838370742, + "learning_rate": 1.4448118339357952e-05, + "loss": 0.791, + "step": 4529 + }, + { + "epoch": 0.37276280600699446, + "grad_norm": 1.505605241363267, + "learning_rate": 1.4445731071929322e-05, + "loss": 0.7968, + "step": 4530 + }, + { + "epoch": 0.37284509360213947, + "grad_norm": 1.6138313139826725, + "learning_rate": 1.444334348868307e-05, + "loss": 0.7937, + "step": 4531 + }, + { + "epoch": 0.3729273811972845, + "grad_norm": 0.4458667608434429, + "learning_rate": 1.4440955589788799e-05, + "loss": 0.5406, + "step": 4532 + }, + { + "epoch": 0.3730096687924295, + "grad_norm": 0.432796137745458, + "learning_rate": 1.4438567375416146e-05, + "loss": 0.5296, + "step": 4533 + }, + { + "epoch": 0.3730919563875746, + "grad_norm": 0.40766863777022705, + "learning_rate": 1.4436178845734765e-05, + "loss": 0.5185, + "step": 4534 + }, + { + "epoch": 0.3731742439827196, + "grad_norm": 1.6234333459586188, + "learning_rate": 1.4433790000914335e-05, + "loss": 0.766, + "step": 4535 + }, + { + "epoch": 0.37325653157786465, + "grad_norm": 1.589246175659433, + "learning_rate": 1.443140084112455e-05, + "loss": 0.8056, + "step": 4536 + }, + { + "epoch": 0.37333881917300965, + "grad_norm": 0.4631045822690777, + "learning_rate": 1.4429011366535141e-05, + "loss": 0.5251, + "step": 4537 + }, + { + "epoch": 0.3734211067681547, + "grad_norm": 2.483003333811014, + "learning_rate": 1.4426621577315845e-05, + "loss": 0.768, + "step": 4538 + }, + { + "epoch": 0.3735033943632997, + "grad_norm": 1.5338530280671414, + "learning_rate": 1.4424231473636433e-05, + "loss": 0.7825, + "step": 4539 + }, + { + "epoch": 0.37358568195844477, + "grad_norm": 0.45025629813446305, + "learning_rate": 1.4421841055666692e-05, + "loss": 0.5369, + "step": 4540 + }, + { + "epoch": 0.3736679695535898, + "grad_norm": 1.4208958817575046, + "learning_rate": 1.4419450323576433e-05, + "loss": 0.7747, + "step": 4541 + }, + { + "epoch": 0.37375025714873483, + "grad_norm": 1.5654795359049074, + "learning_rate": 1.441705927753549e-05, + "loss": 0.8072, + "step": 4542 + }, + { + "epoch": 0.37383254474387984, + "grad_norm": 1.4134230661246265, + "learning_rate": 1.4414667917713722e-05, + "loss": 0.7878, + "step": 4543 + }, + { + "epoch": 0.3739148323390249, + "grad_norm": 1.6379887262014758, + "learning_rate": 1.4412276244281007e-05, + "loss": 0.7817, + "step": 4544 + }, + { + "epoch": 0.3739971199341699, + "grad_norm": 1.4450553066957816, + "learning_rate": 1.4409884257407241e-05, + "loss": 0.7891, + "step": 4545 + }, + { + "epoch": 0.37407940752931496, + "grad_norm": 1.7623252183283458, + "learning_rate": 1.4407491957262352e-05, + "loss": 0.7584, + "step": 4546 + }, + { + "epoch": 0.37416169512446, + "grad_norm": 2.3308199124710796, + "learning_rate": 1.4405099344016283e-05, + "loss": 0.7739, + "step": 4547 + }, + { + "epoch": 0.374243982719605, + "grad_norm": 1.4816028141881152, + "learning_rate": 1.4402706417838998e-05, + "loss": 0.8105, + "step": 4548 + }, + { + "epoch": 0.3743262703147501, + "grad_norm": 1.9583308216181488, + "learning_rate": 1.4400313178900493e-05, + "loss": 0.7764, + "step": 4549 + }, + { + "epoch": 0.3744085579098951, + "grad_norm": 5.005498485553217, + "learning_rate": 1.4397919627370778e-05, + "loss": 0.7797, + "step": 4550 + }, + { + "epoch": 0.37449084550504014, + "grad_norm": 1.8625426911477554, + "learning_rate": 1.4395525763419887e-05, + "loss": 0.7413, + "step": 4551 + }, + { + "epoch": 0.37457313310018514, + "grad_norm": 0.43797451334491677, + "learning_rate": 1.4393131587217872e-05, + "loss": 0.5325, + "step": 4552 + }, + { + "epoch": 0.3746554206953302, + "grad_norm": 1.7261389962629246, + "learning_rate": 1.4390737098934814e-05, + "loss": 0.7817, + "step": 4553 + }, + { + "epoch": 0.3747377082904752, + "grad_norm": 0.432483155996718, + "learning_rate": 1.4388342298740818e-05, + "loss": 0.5266, + "step": 4554 + }, + { + "epoch": 0.37481999588562026, + "grad_norm": 2.0048308136300412, + "learning_rate": 1.4385947186806002e-05, + "loss": 0.8069, + "step": 4555 + }, + { + "epoch": 0.37490228348076526, + "grad_norm": 0.4266240039230243, + "learning_rate": 1.4383551763300511e-05, + "loss": 0.5414, + "step": 4556 + }, + { + "epoch": 0.3749845710759103, + "grad_norm": 0.40765210161391374, + "learning_rate": 1.4381156028394516e-05, + "loss": 0.5189, + "step": 4557 + }, + { + "epoch": 0.3750668586710553, + "grad_norm": 1.5829942825994772, + "learning_rate": 1.43787599822582e-05, + "loss": 0.7935, + "step": 4558 + }, + { + "epoch": 0.3751491462662004, + "grad_norm": 1.8774020709371115, + "learning_rate": 1.4376363625061777e-05, + "loss": 0.7774, + "step": 4559 + }, + { + "epoch": 0.3752314338613454, + "grad_norm": 2.0550537940410543, + "learning_rate": 1.4373966956975485e-05, + "loss": 0.7709, + "step": 4560 + }, + { + "epoch": 0.37531372145649045, + "grad_norm": 1.8249255325108416, + "learning_rate": 1.4371569978169573e-05, + "loss": 0.7987, + "step": 4561 + }, + { + "epoch": 0.37539600905163545, + "grad_norm": 1.541752537939895, + "learning_rate": 1.4369172688814321e-05, + "loss": 0.7597, + "step": 4562 + }, + { + "epoch": 0.3754782966467805, + "grad_norm": 1.3961789137334686, + "learning_rate": 1.4366775089080032e-05, + "loss": 0.7543, + "step": 4563 + }, + { + "epoch": 0.3755605842419255, + "grad_norm": 2.1105305033114408, + "learning_rate": 1.4364377179137019e-05, + "loss": 0.7893, + "step": 4564 + }, + { + "epoch": 0.37564287183707057, + "grad_norm": 1.495692999681622, + "learning_rate": 1.4361978959155634e-05, + "loss": 0.7719, + "step": 4565 + }, + { + "epoch": 0.37572515943221557, + "grad_norm": 1.4691891017741983, + "learning_rate": 1.435958042930624e-05, + "loss": 0.7936, + "step": 4566 + }, + { + "epoch": 0.37580744702736063, + "grad_norm": 1.6181827872012398, + "learning_rate": 1.4357181589759224e-05, + "loss": 0.7743, + "step": 4567 + }, + { + "epoch": 0.37588973462250563, + "grad_norm": 2.3181204703282168, + "learning_rate": 1.4354782440684996e-05, + "loss": 0.7959, + "step": 4568 + }, + { + "epoch": 0.3759720222176507, + "grad_norm": 1.7137548161273988, + "learning_rate": 1.4352382982253987e-05, + "loss": 0.7952, + "step": 4569 + }, + { + "epoch": 0.3760543098127957, + "grad_norm": 1.4865403220549387, + "learning_rate": 1.4349983214636651e-05, + "loss": 0.7595, + "step": 4570 + }, + { + "epoch": 0.37613659740794075, + "grad_norm": 1.6103471768909214, + "learning_rate": 1.4347583138003466e-05, + "loss": 0.7806, + "step": 4571 + }, + { + "epoch": 0.3762188850030858, + "grad_norm": 1.5131674019559864, + "learning_rate": 1.4345182752524928e-05, + "loss": 0.7636, + "step": 4572 + }, + { + "epoch": 0.3763011725982308, + "grad_norm": 1.63268871631576, + "learning_rate": 1.4342782058371556e-05, + "loss": 0.7492, + "step": 4573 + }, + { + "epoch": 0.3763834601933759, + "grad_norm": 2.061844552068988, + "learning_rate": 1.434038105571389e-05, + "loss": 0.7729, + "step": 4574 + }, + { + "epoch": 0.3764657477885209, + "grad_norm": 0.5066430113274296, + "learning_rate": 1.4337979744722499e-05, + "loss": 0.5516, + "step": 4575 + }, + { + "epoch": 0.37654803538366594, + "grad_norm": 1.6736737523868574, + "learning_rate": 1.433557812556796e-05, + "loss": 0.7661, + "step": 4576 + }, + { + "epoch": 0.37663032297881094, + "grad_norm": 2.105534033243132, + "learning_rate": 1.4333176198420886e-05, + "loss": 0.7938, + "step": 4577 + }, + { + "epoch": 0.376712610573956, + "grad_norm": 2.1355003778351414, + "learning_rate": 1.4330773963451908e-05, + "loss": 0.7819, + "step": 4578 + }, + { + "epoch": 0.376794898169101, + "grad_norm": 1.4741029506595988, + "learning_rate": 1.4328371420831671e-05, + "loss": 0.7853, + "step": 4579 + }, + { + "epoch": 0.37687718576424606, + "grad_norm": 1.5835573949313149, + "learning_rate": 1.4325968570730848e-05, + "loss": 0.7749, + "step": 4580 + }, + { + "epoch": 0.37695947335939106, + "grad_norm": 0.44083094397088896, + "learning_rate": 1.4323565413320142e-05, + "loss": 0.5391, + "step": 4581 + }, + { + "epoch": 0.3770417609545361, + "grad_norm": 1.6347567812423156, + "learning_rate": 1.4321161948770259e-05, + "loss": 0.7737, + "step": 4582 + }, + { + "epoch": 0.3771240485496811, + "grad_norm": 1.8441033262743296, + "learning_rate": 1.4318758177251942e-05, + "loss": 0.7192, + "step": 4583 + }, + { + "epoch": 0.3772063361448262, + "grad_norm": 0.4289571956899822, + "learning_rate": 1.4316354098935954e-05, + "loss": 0.5118, + "step": 4584 + }, + { + "epoch": 0.3772886237399712, + "grad_norm": 1.7899605781408279, + "learning_rate": 1.4313949713993071e-05, + "loss": 0.7726, + "step": 4585 + }, + { + "epoch": 0.37737091133511624, + "grad_norm": 1.56666651510149, + "learning_rate": 1.4311545022594102e-05, + "loss": 0.7793, + "step": 4586 + }, + { + "epoch": 0.37745319893026125, + "grad_norm": 1.737668997898733, + "learning_rate": 1.4309140024909866e-05, + "loss": 0.8099, + "step": 4587 + }, + { + "epoch": 0.3775354865254063, + "grad_norm": 1.589919808762366, + "learning_rate": 1.4306734721111218e-05, + "loss": 0.7804, + "step": 4588 + }, + { + "epoch": 0.3776177741205513, + "grad_norm": 0.4244706060066968, + "learning_rate": 1.4304329111369022e-05, + "loss": 0.4932, + "step": 4589 + }, + { + "epoch": 0.37770006171569637, + "grad_norm": 2.4329554443936816, + "learning_rate": 1.4301923195854169e-05, + "loss": 0.7891, + "step": 4590 + }, + { + "epoch": 0.37778234931084137, + "grad_norm": 0.43773898887519175, + "learning_rate": 1.429951697473757e-05, + "loss": 0.5401, + "step": 4591 + }, + { + "epoch": 0.37786463690598643, + "grad_norm": 1.7246753161721682, + "learning_rate": 1.4297110448190165e-05, + "loss": 0.8075, + "step": 4592 + }, + { + "epoch": 0.37794692450113143, + "grad_norm": 2.01452624675174, + "learning_rate": 1.4294703616382903e-05, + "loss": 0.7609, + "step": 4593 + }, + { + "epoch": 0.3780292120962765, + "grad_norm": 1.6966931260351583, + "learning_rate": 1.4292296479486767e-05, + "loss": 0.7696, + "step": 4594 + }, + { + "epoch": 0.3781114996914215, + "grad_norm": 1.6082916245483474, + "learning_rate": 1.4289889037672753e-05, + "loss": 0.782, + "step": 4595 + }, + { + "epoch": 0.37819378728656655, + "grad_norm": 0.43571892849028787, + "learning_rate": 1.4287481291111883e-05, + "loss": 0.5085, + "step": 4596 + }, + { + "epoch": 0.37827607488171155, + "grad_norm": 1.6687021531709416, + "learning_rate": 1.4285073239975196e-05, + "loss": 0.791, + "step": 4597 + }, + { + "epoch": 0.3783583624768566, + "grad_norm": 1.4674920894090417, + "learning_rate": 1.4282664884433761e-05, + "loss": 0.7724, + "step": 4598 + }, + { + "epoch": 0.37844065007200167, + "grad_norm": 1.8231309511495206, + "learning_rate": 1.4280256224658661e-05, + "loss": 0.7819, + "step": 4599 + }, + { + "epoch": 0.3785229376671467, + "grad_norm": 1.5851674784462597, + "learning_rate": 1.4277847260821005e-05, + "loss": 0.7643, + "step": 4600 + }, + { + "epoch": 0.37860522526229173, + "grad_norm": 1.8373627682166949, + "learning_rate": 1.427543799309192e-05, + "loss": 0.8036, + "step": 4601 + }, + { + "epoch": 0.37868751285743674, + "grad_norm": 1.8604288802938558, + "learning_rate": 1.427302842164256e-05, + "loss": 0.7754, + "step": 4602 + }, + { + "epoch": 0.3787698004525818, + "grad_norm": 1.656448028035198, + "learning_rate": 1.4270618546644091e-05, + "loss": 0.8092, + "step": 4603 + }, + { + "epoch": 0.3788520880477268, + "grad_norm": 0.43009165612102307, + "learning_rate": 1.4268208368267713e-05, + "loss": 0.5195, + "step": 4604 + }, + { + "epoch": 0.37893437564287186, + "grad_norm": 1.584713294524119, + "learning_rate": 1.4265797886684636e-05, + "loss": 0.7956, + "step": 4605 + }, + { + "epoch": 0.37901666323801686, + "grad_norm": 1.4611823928271321, + "learning_rate": 1.42633871020661e-05, + "loss": 0.7752, + "step": 4606 + }, + { + "epoch": 0.3790989508331619, + "grad_norm": 1.5306387548194744, + "learning_rate": 1.4260976014583365e-05, + "loss": 0.7766, + "step": 4607 + }, + { + "epoch": 0.3791812384283069, + "grad_norm": 1.6347143847078773, + "learning_rate": 1.4258564624407707e-05, + "loss": 0.7735, + "step": 4608 + }, + { + "epoch": 0.379263526023452, + "grad_norm": 2.1087785704595596, + "learning_rate": 1.4256152931710427e-05, + "loss": 0.7548, + "step": 4609 + }, + { + "epoch": 0.379345813618597, + "grad_norm": 0.4586510286645557, + "learning_rate": 1.4253740936662851e-05, + "loss": 0.542, + "step": 4610 + }, + { + "epoch": 0.37942810121374204, + "grad_norm": 1.5829988483837198, + "learning_rate": 1.425132863943632e-05, + "loss": 0.7983, + "step": 4611 + }, + { + "epoch": 0.37951038880888704, + "grad_norm": 0.42329751953786027, + "learning_rate": 1.4248916040202204e-05, + "loss": 0.527, + "step": 4612 + }, + { + "epoch": 0.3795926764040321, + "grad_norm": 1.7726799396306343, + "learning_rate": 1.4246503139131887e-05, + "loss": 0.7798, + "step": 4613 + }, + { + "epoch": 0.3796749639991771, + "grad_norm": 1.8816783912453918, + "learning_rate": 1.4244089936396776e-05, + "loss": 0.807, + "step": 4614 + }, + { + "epoch": 0.37975725159432216, + "grad_norm": 1.548165423221472, + "learning_rate": 1.4241676432168306e-05, + "loss": 0.7617, + "step": 4615 + }, + { + "epoch": 0.37983953918946717, + "grad_norm": 1.7317113706167404, + "learning_rate": 1.4239262626617927e-05, + "loss": 0.7868, + "step": 4616 + }, + { + "epoch": 0.3799218267846122, + "grad_norm": 1.4057493354349178, + "learning_rate": 1.4236848519917107e-05, + "loss": 0.7612, + "step": 4617 + }, + { + "epoch": 0.38000411437975723, + "grad_norm": 1.7705982163378193, + "learning_rate": 1.4234434112237346e-05, + "loss": 0.8089, + "step": 4618 + }, + { + "epoch": 0.3800864019749023, + "grad_norm": 2.853516435647564, + "learning_rate": 1.4232019403750157e-05, + "loss": 0.7749, + "step": 4619 + }, + { + "epoch": 0.3801686895700473, + "grad_norm": 1.559043015888748, + "learning_rate": 1.422960439462708e-05, + "loss": 0.772, + "step": 4620 + }, + { + "epoch": 0.38025097716519235, + "grad_norm": 1.8879800923469299, + "learning_rate": 1.4227189085039668e-05, + "loss": 0.7691, + "step": 4621 + }, + { + "epoch": 0.38033326476033735, + "grad_norm": 0.4256614959361152, + "learning_rate": 1.4224773475159504e-05, + "loss": 0.5531, + "step": 4622 + }, + { + "epoch": 0.3804155523554824, + "grad_norm": 0.4200057556792914, + "learning_rate": 1.4222357565158189e-05, + "loss": 0.5464, + "step": 4623 + }, + { + "epoch": 0.38049783995062747, + "grad_norm": 1.917363164103099, + "learning_rate": 1.4219941355207347e-05, + "loss": 0.7407, + "step": 4624 + }, + { + "epoch": 0.3805801275457725, + "grad_norm": 1.4810251000145112, + "learning_rate": 1.4217524845478618e-05, + "loss": 0.8127, + "step": 4625 + }, + { + "epoch": 0.38066241514091753, + "grad_norm": 1.4435143298392117, + "learning_rate": 1.421510803614367e-05, + "loss": 0.7905, + "step": 4626 + }, + { + "epoch": 0.38074470273606253, + "grad_norm": 2.0985491595215184, + "learning_rate": 1.4212690927374188e-05, + "loss": 0.7816, + "step": 4627 + }, + { + "epoch": 0.3808269903312076, + "grad_norm": 1.36587917935779, + "learning_rate": 1.421027351934188e-05, + "loss": 0.7809, + "step": 4628 + }, + { + "epoch": 0.3809092779263526, + "grad_norm": 1.7137264620781636, + "learning_rate": 1.4207855812218472e-05, + "loss": 0.7994, + "step": 4629 + }, + { + "epoch": 0.38099156552149765, + "grad_norm": 0.441510461339159, + "learning_rate": 1.4205437806175721e-05, + "loss": 0.4994, + "step": 4630 + }, + { + "epoch": 0.38107385311664266, + "grad_norm": 1.3562536795580937, + "learning_rate": 1.4203019501385391e-05, + "loss": 0.7499, + "step": 4631 + }, + { + "epoch": 0.3811561407117877, + "grad_norm": 0.42632945270690537, + "learning_rate": 1.4200600898019276e-05, + "loss": 0.5156, + "step": 4632 + }, + { + "epoch": 0.3812384283069327, + "grad_norm": 1.6573555699915479, + "learning_rate": 1.4198181996249196e-05, + "loss": 0.817, + "step": 4633 + }, + { + "epoch": 0.3813207159020778, + "grad_norm": 1.90762606509077, + "learning_rate": 1.4195762796246976e-05, + "loss": 0.7941, + "step": 4634 + }, + { + "epoch": 0.3814030034972228, + "grad_norm": 1.459358643776078, + "learning_rate": 1.4193343298184479e-05, + "loss": 0.8, + "step": 4635 + }, + { + "epoch": 0.38148529109236784, + "grad_norm": 1.5081872207107532, + "learning_rate": 1.4190923502233583e-05, + "loss": 0.7832, + "step": 4636 + }, + { + "epoch": 0.38156757868751284, + "grad_norm": 1.430695030854491, + "learning_rate": 1.4188503408566179e-05, + "loss": 0.7656, + "step": 4637 + }, + { + "epoch": 0.3816498662826579, + "grad_norm": 1.49802076909413, + "learning_rate": 1.4186083017354194e-05, + "loss": 0.7822, + "step": 4638 + }, + { + "epoch": 0.3817321538778029, + "grad_norm": 1.4754487085953332, + "learning_rate": 1.4183662328769568e-05, + "loss": 0.7757, + "step": 4639 + }, + { + "epoch": 0.38181444147294796, + "grad_norm": 1.5712279624439285, + "learning_rate": 1.4181241342984255e-05, + "loss": 0.7508, + "step": 4640 + }, + { + "epoch": 0.38189672906809297, + "grad_norm": 2.4526469485793667, + "learning_rate": 1.417882006017025e-05, + "loss": 0.7915, + "step": 4641 + }, + { + "epoch": 0.381979016663238, + "grad_norm": 2.1003872646327926, + "learning_rate": 1.4176398480499548e-05, + "loss": 0.7621, + "step": 4642 + }, + { + "epoch": 0.382061304258383, + "grad_norm": 1.2032367702521385, + "learning_rate": 1.4173976604144177e-05, + "loss": 0.7571, + "step": 4643 + }, + { + "epoch": 0.3821435918535281, + "grad_norm": 1.4285144802391894, + "learning_rate": 1.4171554431276184e-05, + "loss": 0.8014, + "step": 4644 + }, + { + "epoch": 0.3822258794486731, + "grad_norm": 1.4220750148782317, + "learning_rate": 1.4169131962067636e-05, + "loss": 0.7578, + "step": 4645 + }, + { + "epoch": 0.38230816704381815, + "grad_norm": 0.4455145286244249, + "learning_rate": 1.416670919669062e-05, + "loss": 0.5599, + "step": 4646 + }, + { + "epoch": 0.38239045463896315, + "grad_norm": 1.3999354856030055, + "learning_rate": 1.4164286135317246e-05, + "loss": 0.8006, + "step": 4647 + }, + { + "epoch": 0.3824727422341082, + "grad_norm": 1.616631495560293, + "learning_rate": 1.4161862778119648e-05, + "loss": 0.7697, + "step": 4648 + }, + { + "epoch": 0.3825550298292532, + "grad_norm": 1.5018858547569893, + "learning_rate": 1.4159439125269971e-05, + "loss": 0.7915, + "step": 4649 + }, + { + "epoch": 0.38263731742439827, + "grad_norm": 0.45941714058641875, + "learning_rate": 1.415701517694039e-05, + "loss": 0.5196, + "step": 4650 + }, + { + "epoch": 0.38271960501954333, + "grad_norm": 1.6684908069043642, + "learning_rate": 1.4154590933303101e-05, + "loss": 0.7726, + "step": 4651 + }, + { + "epoch": 0.38280189261468833, + "grad_norm": 0.43372533774078476, + "learning_rate": 1.4152166394530315e-05, + "loss": 0.5096, + "step": 4652 + }, + { + "epoch": 0.3828841802098334, + "grad_norm": 0.4234684263881324, + "learning_rate": 1.414974156079427e-05, + "loss": 0.5302, + "step": 4653 + }, + { + "epoch": 0.3829664678049784, + "grad_norm": 1.1404665622376478, + "learning_rate": 1.4147316432267221e-05, + "loss": 0.7692, + "step": 4654 + }, + { + "epoch": 0.38304875540012345, + "grad_norm": 1.3798957686446907, + "learning_rate": 1.4144891009121445e-05, + "loss": 0.7961, + "step": 4655 + }, + { + "epoch": 0.38313104299526846, + "grad_norm": 0.40309752757542106, + "learning_rate": 1.4142465291529242e-05, + "loss": 0.5341, + "step": 4656 + }, + { + "epoch": 0.3832133305904135, + "grad_norm": 1.375260305651529, + "learning_rate": 1.4140039279662925e-05, + "loss": 0.7648, + "step": 4657 + }, + { + "epoch": 0.3832956181855585, + "grad_norm": 1.4117972630068905, + "learning_rate": 1.4137612973694843e-05, + "loss": 0.7884, + "step": 4658 + }, + { + "epoch": 0.3833779057807036, + "grad_norm": 1.2109296707377057, + "learning_rate": 1.4135186373797352e-05, + "loss": 0.7326, + "step": 4659 + }, + { + "epoch": 0.3834601933758486, + "grad_norm": 1.4551816177875234, + "learning_rate": 1.4132759480142833e-05, + "loss": 0.7792, + "step": 4660 + }, + { + "epoch": 0.38354248097099364, + "grad_norm": 1.7120630918341764, + "learning_rate": 1.4130332292903688e-05, + "loss": 0.7886, + "step": 4661 + }, + { + "epoch": 0.38362476856613864, + "grad_norm": 1.4394662645048832, + "learning_rate": 1.4127904812252346e-05, + "loss": 0.8079, + "step": 4662 + }, + { + "epoch": 0.3837070561612837, + "grad_norm": 1.503221658017532, + "learning_rate": 1.4125477038361246e-05, + "loss": 0.7873, + "step": 4663 + }, + { + "epoch": 0.3837893437564287, + "grad_norm": 1.4878693585300635, + "learning_rate": 1.4123048971402856e-05, + "loss": 0.7936, + "step": 4664 + }, + { + "epoch": 0.38387163135157376, + "grad_norm": 1.3540421146847554, + "learning_rate": 1.4120620611549658e-05, + "loss": 0.8116, + "step": 4665 + }, + { + "epoch": 0.38395391894671876, + "grad_norm": 0.442982182343593, + "learning_rate": 1.4118191958974165e-05, + "loss": 0.5275, + "step": 4666 + }, + { + "epoch": 0.3840362065418638, + "grad_norm": 0.4257828491973008, + "learning_rate": 1.4115763013848897e-05, + "loss": 0.5258, + "step": 4667 + }, + { + "epoch": 0.3841184941370088, + "grad_norm": 0.42064364777225555, + "learning_rate": 1.4113333776346414e-05, + "loss": 0.5117, + "step": 4668 + }, + { + "epoch": 0.3842007817321539, + "grad_norm": 1.4756658546731998, + "learning_rate": 1.4110904246639272e-05, + "loss": 0.7734, + "step": 4669 + }, + { + "epoch": 0.3842830693272989, + "grad_norm": 1.5890994209434663, + "learning_rate": 1.4108474424900067e-05, + "loss": 0.7915, + "step": 4670 + }, + { + "epoch": 0.38436535692244395, + "grad_norm": 1.3176833024362846, + "learning_rate": 1.4106044311301412e-05, + "loss": 0.7731, + "step": 4671 + }, + { + "epoch": 0.38444764451758895, + "grad_norm": 1.448650646202771, + "learning_rate": 1.4103613906015935e-05, + "loss": 0.7932, + "step": 4672 + }, + { + "epoch": 0.384529932112734, + "grad_norm": 0.45435234039854683, + "learning_rate": 1.4101183209216287e-05, + "loss": 0.5304, + "step": 4673 + }, + { + "epoch": 0.384612219707879, + "grad_norm": 1.419300408601491, + "learning_rate": 1.4098752221075147e-05, + "loss": 0.7779, + "step": 4674 + }, + { + "epoch": 0.38469450730302407, + "grad_norm": 1.7050156449118206, + "learning_rate": 1.4096320941765197e-05, + "loss": 0.7375, + "step": 4675 + }, + { + "epoch": 0.3847767948981691, + "grad_norm": 1.3631532898791363, + "learning_rate": 1.4093889371459164e-05, + "loss": 0.8061, + "step": 4676 + }, + { + "epoch": 0.38485908249331413, + "grad_norm": 1.3952806505126574, + "learning_rate": 1.4091457510329778e-05, + "loss": 0.7942, + "step": 4677 + }, + { + "epoch": 0.3849413700884592, + "grad_norm": 1.54950606552771, + "learning_rate": 1.408902535854979e-05, + "loss": 0.7743, + "step": 4678 + }, + { + "epoch": 0.3850236576836042, + "grad_norm": 1.5343388971524679, + "learning_rate": 1.4086592916291982e-05, + "loss": 0.7701, + "step": 4679 + }, + { + "epoch": 0.38510594527874925, + "grad_norm": 1.4484528580185516, + "learning_rate": 1.408416018372915e-05, + "loss": 0.7603, + "step": 4680 + }, + { + "epoch": 0.38518823287389425, + "grad_norm": 0.4376941943244993, + "learning_rate": 1.4081727161034109e-05, + "loss": 0.5332, + "step": 4681 + }, + { + "epoch": 0.3852705204690393, + "grad_norm": 1.3099462046946182, + "learning_rate": 1.4079293848379696e-05, + "loss": 0.7864, + "step": 4682 + }, + { + "epoch": 0.3853528080641843, + "grad_norm": 1.391529083314694, + "learning_rate": 1.4076860245938775e-05, + "loss": 0.8025, + "step": 4683 + }, + { + "epoch": 0.3854350956593294, + "grad_norm": 0.43217526617783636, + "learning_rate": 1.407442635388422e-05, + "loss": 0.5265, + "step": 4684 + }, + { + "epoch": 0.3855173832544744, + "grad_norm": 2.4922892569291593, + "learning_rate": 1.4071992172388933e-05, + "loss": 0.7863, + "step": 4685 + }, + { + "epoch": 0.38559967084961944, + "grad_norm": 1.5080506927939803, + "learning_rate": 1.4069557701625836e-05, + "loss": 0.7598, + "step": 4686 + }, + { + "epoch": 0.38568195844476444, + "grad_norm": 0.43969057217864205, + "learning_rate": 1.4067122941767868e-05, + "loss": 0.5443, + "step": 4687 + }, + { + "epoch": 0.3857642460399095, + "grad_norm": 1.5561662928790831, + "learning_rate": 1.4064687892987987e-05, + "loss": 0.7576, + "step": 4688 + }, + { + "epoch": 0.3858465336350545, + "grad_norm": 1.6693651171520447, + "learning_rate": 1.4062252555459183e-05, + "loss": 0.7869, + "step": 4689 + }, + { + "epoch": 0.38592882123019956, + "grad_norm": 1.4932354265644647, + "learning_rate": 1.4059816929354452e-05, + "loss": 0.7757, + "step": 4690 + }, + { + "epoch": 0.38601110882534456, + "grad_norm": 1.6801554967119523, + "learning_rate": 1.405738101484682e-05, + "loss": 0.7544, + "step": 4691 + }, + { + "epoch": 0.3860933964204896, + "grad_norm": 0.40610245923903565, + "learning_rate": 1.405494481210933e-05, + "loss": 0.4992, + "step": 4692 + }, + { + "epoch": 0.3861756840156346, + "grad_norm": 1.4279030928712846, + "learning_rate": 1.4052508321315043e-05, + "loss": 0.7706, + "step": 4693 + }, + { + "epoch": 0.3862579716107797, + "grad_norm": 1.5691049745726078, + "learning_rate": 1.405007154263705e-05, + "loss": 0.7752, + "step": 4694 + }, + { + "epoch": 0.3863402592059247, + "grad_norm": 1.3813306660772908, + "learning_rate": 1.404763447624845e-05, + "loss": 0.7697, + "step": 4695 + }, + { + "epoch": 0.38642254680106974, + "grad_norm": 1.7524155707894806, + "learning_rate": 1.4045197122322366e-05, + "loss": 0.7711, + "step": 4696 + }, + { + "epoch": 0.38650483439621475, + "grad_norm": 1.618016021200021, + "learning_rate": 1.4042759481031954e-05, + "loss": 0.7865, + "step": 4697 + }, + { + "epoch": 0.3865871219913598, + "grad_norm": 1.3578284028285228, + "learning_rate": 1.4040321552550368e-05, + "loss": 0.7681, + "step": 4698 + }, + { + "epoch": 0.3866694095865048, + "grad_norm": 4.092460781356398, + "learning_rate": 1.4037883337050803e-05, + "loss": 0.7714, + "step": 4699 + }, + { + "epoch": 0.38675169718164987, + "grad_norm": 1.4719471939488527, + "learning_rate": 1.4035444834706466e-05, + "loss": 0.7892, + "step": 4700 + }, + { + "epoch": 0.3868339847767949, + "grad_norm": 1.4608554046728215, + "learning_rate": 1.4033006045690577e-05, + "loss": 0.7578, + "step": 4701 + }, + { + "epoch": 0.38691627237193993, + "grad_norm": 1.223396737139375, + "learning_rate": 1.403056697017639e-05, + "loss": 0.7707, + "step": 4702 + }, + { + "epoch": 0.386998559967085, + "grad_norm": 1.317923428108782, + "learning_rate": 1.4028127608337175e-05, + "loss": 0.7717, + "step": 4703 + }, + { + "epoch": 0.38708084756223, + "grad_norm": 1.3819884068481956, + "learning_rate": 1.4025687960346214e-05, + "loss": 0.7872, + "step": 4704 + }, + { + "epoch": 0.38716313515737505, + "grad_norm": 1.96519181229529, + "learning_rate": 1.4023248026376817e-05, + "loss": 0.785, + "step": 4705 + }, + { + "epoch": 0.38724542275252005, + "grad_norm": 1.5293915817900865, + "learning_rate": 1.4020807806602317e-05, + "loss": 0.782, + "step": 4706 + }, + { + "epoch": 0.3873277103476651, + "grad_norm": 1.3503938845969048, + "learning_rate": 1.4018367301196059e-05, + "loss": 0.7628, + "step": 4707 + }, + { + "epoch": 0.3874099979428101, + "grad_norm": 2.46831478516891, + "learning_rate": 1.4015926510331415e-05, + "loss": 0.7582, + "step": 4708 + }, + { + "epoch": 0.3874922855379552, + "grad_norm": 1.512639308351756, + "learning_rate": 1.4013485434181775e-05, + "loss": 0.8096, + "step": 4709 + }, + { + "epoch": 0.3875745731331002, + "grad_norm": 0.4787808359261194, + "learning_rate": 1.4011044072920545e-05, + "loss": 0.5421, + "step": 4710 + }, + { + "epoch": 0.38765686072824523, + "grad_norm": 1.841445502457464, + "learning_rate": 1.4008602426721162e-05, + "loss": 0.7505, + "step": 4711 + }, + { + "epoch": 0.38773914832339024, + "grad_norm": 1.9793074512100168, + "learning_rate": 1.4006160495757075e-05, + "loss": 0.8026, + "step": 4712 + }, + { + "epoch": 0.3878214359185353, + "grad_norm": 0.41820306154431286, + "learning_rate": 1.4003718280201749e-05, + "loss": 0.529, + "step": 4713 + }, + { + "epoch": 0.3879037235136803, + "grad_norm": 2.1922234268617027, + "learning_rate": 1.4001275780228681e-05, + "loss": 0.7875, + "step": 4714 + }, + { + "epoch": 0.38798601110882536, + "grad_norm": 1.71731266852407, + "learning_rate": 1.399883299601138e-05, + "loss": 0.7809, + "step": 4715 + }, + { + "epoch": 0.38806829870397036, + "grad_norm": 1.6436657501427487, + "learning_rate": 1.399638992772338e-05, + "loss": 0.7546, + "step": 4716 + }, + { + "epoch": 0.3881505862991154, + "grad_norm": 2.0600468316771163, + "learning_rate": 1.3993946575538231e-05, + "loss": 0.7787, + "step": 4717 + }, + { + "epoch": 0.3882328738942604, + "grad_norm": 1.7074996919331196, + "learning_rate": 1.3991502939629502e-05, + "loss": 0.7736, + "step": 4718 + }, + { + "epoch": 0.3883151614894055, + "grad_norm": 0.44896731408123147, + "learning_rate": 1.398905902017079e-05, + "loss": 0.5334, + "step": 4719 + }, + { + "epoch": 0.3883974490845505, + "grad_norm": 2.2707396136393876, + "learning_rate": 1.3986614817335704e-05, + "loss": 0.7572, + "step": 4720 + }, + { + "epoch": 0.38847973667969554, + "grad_norm": 1.7804022965363049, + "learning_rate": 1.3984170331297878e-05, + "loss": 0.7947, + "step": 4721 + }, + { + "epoch": 0.38856202427484055, + "grad_norm": 2.0217448209881135, + "learning_rate": 1.3981725562230958e-05, + "loss": 0.7654, + "step": 4722 + }, + { + "epoch": 0.3886443118699856, + "grad_norm": 2.394829419690257, + "learning_rate": 1.397928051030863e-05, + "loss": 0.7892, + "step": 4723 + }, + { + "epoch": 0.3887265994651306, + "grad_norm": 2.176981887296413, + "learning_rate": 1.3976835175704575e-05, + "loss": 0.7702, + "step": 4724 + }, + { + "epoch": 0.38880888706027567, + "grad_norm": 1.8281059093883796, + "learning_rate": 1.3974389558592507e-05, + "loss": 0.7785, + "step": 4725 + }, + { + "epoch": 0.38889117465542067, + "grad_norm": 1.667697252251999, + "learning_rate": 1.3971943659146162e-05, + "loss": 0.798, + "step": 4726 + }, + { + "epoch": 0.3889734622505657, + "grad_norm": 3.194609710321003, + "learning_rate": 1.3969497477539294e-05, + "loss": 0.7882, + "step": 4727 + }, + { + "epoch": 0.3890557498457108, + "grad_norm": 2.0144735792488864, + "learning_rate": 1.3967051013945672e-05, + "loss": 0.7874, + "step": 4728 + }, + { + "epoch": 0.3891380374408558, + "grad_norm": 0.4336243517562767, + "learning_rate": 1.396460426853909e-05, + "loss": 0.5518, + "step": 4729 + }, + { + "epoch": 0.38922032503600085, + "grad_norm": 1.8817602468966521, + "learning_rate": 1.3962157241493361e-05, + "loss": 0.7682, + "step": 4730 + }, + { + "epoch": 0.38930261263114585, + "grad_norm": 1.867001888964424, + "learning_rate": 1.395970993298232e-05, + "loss": 0.8407, + "step": 4731 + }, + { + "epoch": 0.3893849002262909, + "grad_norm": 1.849096345409507, + "learning_rate": 1.3957262343179815e-05, + "loss": 0.7818, + "step": 4732 + }, + { + "epoch": 0.3894671878214359, + "grad_norm": 1.747856622988127, + "learning_rate": 1.3954814472259724e-05, + "loss": 0.7259, + "step": 4733 + }, + { + "epoch": 0.38954947541658097, + "grad_norm": 1.5758884339815993, + "learning_rate": 1.3952366320395936e-05, + "loss": 0.7638, + "step": 4734 + }, + { + "epoch": 0.389631763011726, + "grad_norm": 2.3860119040884213, + "learning_rate": 1.3949917887762367e-05, + "loss": 0.7582, + "step": 4735 + }, + { + "epoch": 0.38971405060687103, + "grad_norm": 1.7462878572225637, + "learning_rate": 1.3947469174532948e-05, + "loss": 0.7564, + "step": 4736 + }, + { + "epoch": 0.38979633820201604, + "grad_norm": 1.6596576039588808, + "learning_rate": 1.3945020180881632e-05, + "loss": 0.7733, + "step": 4737 + }, + { + "epoch": 0.3898786257971611, + "grad_norm": 1.62926660004808, + "learning_rate": 1.394257090698239e-05, + "loss": 0.8072, + "step": 4738 + }, + { + "epoch": 0.3899609133923061, + "grad_norm": 1.809049284714984, + "learning_rate": 1.3940121353009217e-05, + "loss": 0.7642, + "step": 4739 + }, + { + "epoch": 0.39004320098745116, + "grad_norm": 1.775395529870585, + "learning_rate": 1.3937671519136127e-05, + "loss": 0.7894, + "step": 4740 + }, + { + "epoch": 0.39012548858259616, + "grad_norm": 1.6312728683199293, + "learning_rate": 1.3935221405537145e-05, + "loss": 0.8085, + "step": 4741 + }, + { + "epoch": 0.3902077761777412, + "grad_norm": 2.3350774301403154, + "learning_rate": 1.3932771012386331e-05, + "loss": 0.8029, + "step": 4742 + }, + { + "epoch": 0.3902900637728862, + "grad_norm": 1.460942446929902, + "learning_rate": 1.3930320339857753e-05, + "loss": 0.7773, + "step": 4743 + }, + { + "epoch": 0.3903723513680313, + "grad_norm": 1.7378370895545514, + "learning_rate": 1.3927869388125504e-05, + "loss": 0.7826, + "step": 4744 + }, + { + "epoch": 0.3904546389631763, + "grad_norm": 1.731763812909668, + "learning_rate": 1.3925418157363693e-05, + "loss": 0.7758, + "step": 4745 + }, + { + "epoch": 0.39053692655832134, + "grad_norm": 1.9791135260178654, + "learning_rate": 1.3922966647746456e-05, + "loss": 0.7604, + "step": 4746 + }, + { + "epoch": 0.39061921415346634, + "grad_norm": 1.9208066343833679, + "learning_rate": 1.3920514859447943e-05, + "loss": 0.8044, + "step": 4747 + }, + { + "epoch": 0.3907015017486114, + "grad_norm": 1.8190125542396773, + "learning_rate": 1.3918062792642322e-05, + "loss": 0.7723, + "step": 4748 + }, + { + "epoch": 0.3907837893437564, + "grad_norm": 1.6378758273832892, + "learning_rate": 1.391561044750379e-05, + "loss": 0.7932, + "step": 4749 + }, + { + "epoch": 0.39086607693890146, + "grad_norm": 1.5833266449769714, + "learning_rate": 1.391315782420655e-05, + "loss": 0.7398, + "step": 4750 + }, + { + "epoch": 0.39094836453404647, + "grad_norm": 3.6267223924593806, + "learning_rate": 1.3910704922924836e-05, + "loss": 0.7919, + "step": 4751 + }, + { + "epoch": 0.3910306521291915, + "grad_norm": 1.8648929003549561, + "learning_rate": 1.39082517438329e-05, + "loss": 0.7559, + "step": 4752 + }, + { + "epoch": 0.3911129397243366, + "grad_norm": 1.7769186522984872, + "learning_rate": 1.390579828710501e-05, + "loss": 0.7853, + "step": 4753 + }, + { + "epoch": 0.3911952273194816, + "grad_norm": 1.7015576774330343, + "learning_rate": 1.3903344552915457e-05, + "loss": 0.768, + "step": 4754 + }, + { + "epoch": 0.39127751491462665, + "grad_norm": 1.633530944077053, + "learning_rate": 1.390089054143855e-05, + "loss": 0.7719, + "step": 4755 + }, + { + "epoch": 0.39135980250977165, + "grad_norm": 1.75285757945078, + "learning_rate": 1.3898436252848617e-05, + "loss": 0.7993, + "step": 4756 + }, + { + "epoch": 0.3914420901049167, + "grad_norm": 1.6549874434880767, + "learning_rate": 1.3895981687320006e-05, + "loss": 0.7753, + "step": 4757 + }, + { + "epoch": 0.3915243777000617, + "grad_norm": 1.8798099701099278, + "learning_rate": 1.389352684502709e-05, + "loss": 0.7499, + "step": 4758 + }, + { + "epoch": 0.39160666529520677, + "grad_norm": 1.8933438995047365, + "learning_rate": 1.389107172614425e-05, + "loss": 0.7842, + "step": 4759 + }, + { + "epoch": 0.39168895289035177, + "grad_norm": 1.5117186427168021, + "learning_rate": 1.3888616330845897e-05, + "loss": 0.7415, + "step": 4760 + }, + { + "epoch": 0.39177124048549683, + "grad_norm": 4.377633633758451, + "learning_rate": 1.3886160659306463e-05, + "loss": 0.7919, + "step": 4761 + }, + { + "epoch": 0.39185352808064183, + "grad_norm": 1.7648356133107233, + "learning_rate": 1.3883704711700387e-05, + "loss": 0.7718, + "step": 4762 + }, + { + "epoch": 0.3919358156757869, + "grad_norm": 1.332711715229098, + "learning_rate": 1.3881248488202138e-05, + "loss": 0.7658, + "step": 4763 + }, + { + "epoch": 0.3920181032709319, + "grad_norm": 2.8899492462415983, + "learning_rate": 1.3878791988986208e-05, + "loss": 0.7515, + "step": 4764 + }, + { + "epoch": 0.39210039086607695, + "grad_norm": 1.5841886588796068, + "learning_rate": 1.3876335214227098e-05, + "loss": 0.7801, + "step": 4765 + }, + { + "epoch": 0.39218267846122196, + "grad_norm": 0.4287991207533434, + "learning_rate": 1.3873878164099331e-05, + "loss": 0.5361, + "step": 4766 + }, + { + "epoch": 0.392264966056367, + "grad_norm": 1.7828845456243445, + "learning_rate": 1.3871420838777456e-05, + "loss": 0.7549, + "step": 4767 + }, + { + "epoch": 0.392347253651512, + "grad_norm": 1.883930819084228, + "learning_rate": 1.3868963238436035e-05, + "loss": 0.7893, + "step": 4768 + }, + { + "epoch": 0.3924295412466571, + "grad_norm": 1.7613580118582424, + "learning_rate": 1.3866505363249651e-05, + "loss": 0.7761, + "step": 4769 + }, + { + "epoch": 0.3925118288418021, + "grad_norm": 0.42030830073722, + "learning_rate": 1.3864047213392916e-05, + "loss": 0.4859, + "step": 4770 + }, + { + "epoch": 0.39259411643694714, + "grad_norm": 3.439379003095375, + "learning_rate": 1.3861588789040442e-05, + "loss": 0.7925, + "step": 4771 + }, + { + "epoch": 0.39267640403209214, + "grad_norm": 2.911765158971951, + "learning_rate": 1.3859130090366877e-05, + "loss": 0.7733, + "step": 4772 + }, + { + "epoch": 0.3927586916272372, + "grad_norm": 1.846601442536393, + "learning_rate": 1.385667111754688e-05, + "loss": 0.7616, + "step": 4773 + }, + { + "epoch": 0.3928409792223822, + "grad_norm": 1.747058906293054, + "learning_rate": 1.3854211870755139e-05, + "loss": 0.7706, + "step": 4774 + }, + { + "epoch": 0.39292326681752726, + "grad_norm": 1.6299233156124149, + "learning_rate": 1.385175235016635e-05, + "loss": 0.7786, + "step": 4775 + }, + { + "epoch": 0.39300555441267226, + "grad_norm": 1.4978035079756256, + "learning_rate": 1.384929255595523e-05, + "loss": 0.7926, + "step": 4776 + }, + { + "epoch": 0.3930878420078173, + "grad_norm": 1.6306867114241461, + "learning_rate": 1.3846832488296524e-05, + "loss": 0.7695, + "step": 4777 + }, + { + "epoch": 0.3931701296029623, + "grad_norm": 0.4536405273178374, + "learning_rate": 1.3844372147364992e-05, + "loss": 0.5241, + "step": 4778 + }, + { + "epoch": 0.3932524171981074, + "grad_norm": 1.7178068525189172, + "learning_rate": 1.384191153333541e-05, + "loss": 0.7742, + "step": 4779 + }, + { + "epoch": 0.39333470479325244, + "grad_norm": 1.9137628042469057, + "learning_rate": 1.3839450646382577e-05, + "loss": 0.7531, + "step": 4780 + }, + { + "epoch": 0.39341699238839745, + "grad_norm": 1.810913371509303, + "learning_rate": 1.3836989486681311e-05, + "loss": 0.7618, + "step": 4781 + }, + { + "epoch": 0.3934992799835425, + "grad_norm": 1.7017193906277954, + "learning_rate": 1.3834528054406447e-05, + "loss": 0.7683, + "step": 4782 + }, + { + "epoch": 0.3935815675786875, + "grad_norm": 1.9713949746257746, + "learning_rate": 1.3832066349732843e-05, + "loss": 0.7761, + "step": 4783 + }, + { + "epoch": 0.39366385517383257, + "grad_norm": 1.6119526302159937, + "learning_rate": 1.3829604372835377e-05, + "loss": 0.7604, + "step": 4784 + }, + { + "epoch": 0.39374614276897757, + "grad_norm": 1.8727639953963953, + "learning_rate": 1.3827142123888936e-05, + "loss": 0.8005, + "step": 4785 + }, + { + "epoch": 0.39382843036412263, + "grad_norm": 0.43477428222626374, + "learning_rate": 1.382467960306844e-05, + "loss": 0.5363, + "step": 4786 + }, + { + "epoch": 0.39391071795926763, + "grad_norm": 1.4108350338929352, + "learning_rate": 1.3822216810548822e-05, + "loss": 0.7755, + "step": 4787 + }, + { + "epoch": 0.3939930055544127, + "grad_norm": 0.4476186386981923, + "learning_rate": 1.3819753746505038e-05, + "loss": 0.492, + "step": 4788 + }, + { + "epoch": 0.3940752931495577, + "grad_norm": 1.5801613370282777, + "learning_rate": 1.3817290411112052e-05, + "loss": 0.8049, + "step": 4789 + }, + { + "epoch": 0.39415758074470275, + "grad_norm": 1.5624580377664357, + "learning_rate": 1.3814826804544863e-05, + "loss": 0.7701, + "step": 4790 + }, + { + "epoch": 0.39423986833984775, + "grad_norm": 1.5718463792858444, + "learning_rate": 1.3812362926978478e-05, + "loss": 0.8046, + "step": 4791 + }, + { + "epoch": 0.3943221559349928, + "grad_norm": 1.5522539374335746, + "learning_rate": 1.3809898778587927e-05, + "loss": 0.7409, + "step": 4792 + }, + { + "epoch": 0.3944044435301378, + "grad_norm": 2.0762750291167875, + "learning_rate": 1.380743435954826e-05, + "loss": 0.7785, + "step": 4793 + }, + { + "epoch": 0.3944867311252829, + "grad_norm": 1.8873388178236636, + "learning_rate": 1.3804969670034545e-05, + "loss": 0.7728, + "step": 4794 + }, + { + "epoch": 0.3945690187204279, + "grad_norm": 1.7509046034453075, + "learning_rate": 1.380250471022187e-05, + "loss": 0.765, + "step": 4795 + }, + { + "epoch": 0.39465130631557294, + "grad_norm": 1.3241241903003864, + "learning_rate": 1.3800039480285343e-05, + "loss": 0.7607, + "step": 4796 + }, + { + "epoch": 0.39473359391071794, + "grad_norm": 1.4133930458860324, + "learning_rate": 1.3797573980400088e-05, + "loss": 0.7739, + "step": 4797 + }, + { + "epoch": 0.394815881505863, + "grad_norm": 0.42795374263489266, + "learning_rate": 1.3795108210741248e-05, + "loss": 0.5007, + "step": 4798 + }, + { + "epoch": 0.394898169101008, + "grad_norm": 1.527365234078468, + "learning_rate": 1.3792642171483994e-05, + "loss": 0.7856, + "step": 4799 + }, + { + "epoch": 0.39498045669615306, + "grad_norm": 1.6491814501007112, + "learning_rate": 1.3790175862803504e-05, + "loss": 0.7829, + "step": 4800 + }, + { + "epoch": 0.39506274429129806, + "grad_norm": 2.122180018875669, + "learning_rate": 1.378770928487498e-05, + "loss": 0.7627, + "step": 4801 + }, + { + "epoch": 0.3951450318864431, + "grad_norm": 1.8021052844293737, + "learning_rate": 1.378524243787365e-05, + "loss": 0.7754, + "step": 4802 + }, + { + "epoch": 0.3952273194815881, + "grad_norm": 1.9893859162041378, + "learning_rate": 1.3782775321974746e-05, + "loss": 0.7531, + "step": 4803 + }, + { + "epoch": 0.3953096070767332, + "grad_norm": 2.9921955103282523, + "learning_rate": 1.378030793735354e-05, + "loss": 0.7578, + "step": 4804 + }, + { + "epoch": 0.39539189467187824, + "grad_norm": 1.748968558557209, + "learning_rate": 1.3777840284185295e-05, + "loss": 0.7807, + "step": 4805 + }, + { + "epoch": 0.39547418226702324, + "grad_norm": 1.4623425341945417, + "learning_rate": 1.3775372362645324e-05, + "loss": 0.7607, + "step": 4806 + }, + { + "epoch": 0.3955564698621683, + "grad_norm": 0.4242403740438736, + "learning_rate": 1.3772904172908936e-05, + "loss": 0.5264, + "step": 4807 + }, + { + "epoch": 0.3956387574573133, + "grad_norm": 1.921682656195057, + "learning_rate": 1.377043571515147e-05, + "loss": 0.7799, + "step": 4808 + }, + { + "epoch": 0.39572104505245836, + "grad_norm": 0.4151638734851008, + "learning_rate": 1.376796698954828e-05, + "loss": 0.5419, + "step": 4809 + }, + { + "epoch": 0.39580333264760337, + "grad_norm": 1.7046813432671464, + "learning_rate": 1.3765497996274744e-05, + "loss": 0.801, + "step": 4810 + }, + { + "epoch": 0.3958856202427484, + "grad_norm": 1.739406298158605, + "learning_rate": 1.3763028735506247e-05, + "loss": 0.7916, + "step": 4811 + }, + { + "epoch": 0.39596790783789343, + "grad_norm": 2.2464481204144375, + "learning_rate": 1.3760559207418209e-05, + "loss": 0.773, + "step": 4812 + }, + { + "epoch": 0.3960501954330385, + "grad_norm": 1.996458080137954, + "learning_rate": 1.3758089412186062e-05, + "loss": 0.7685, + "step": 4813 + }, + { + "epoch": 0.3961324830281835, + "grad_norm": 0.43619925736944076, + "learning_rate": 1.375561934998525e-05, + "loss": 0.5418, + "step": 4814 + }, + { + "epoch": 0.39621477062332855, + "grad_norm": 2.0310487198193887, + "learning_rate": 1.3753149020991248e-05, + "loss": 0.7547, + "step": 4815 + }, + { + "epoch": 0.39629705821847355, + "grad_norm": 1.765975567326183, + "learning_rate": 1.375067842537954e-05, + "loss": 0.7551, + "step": 4816 + }, + { + "epoch": 0.3963793458136186, + "grad_norm": 1.6822865641220783, + "learning_rate": 1.3748207563325635e-05, + "loss": 0.8024, + "step": 4817 + }, + { + "epoch": 0.3964616334087636, + "grad_norm": 1.5848718906882158, + "learning_rate": 1.3745736435005059e-05, + "loss": 0.7821, + "step": 4818 + }, + { + "epoch": 0.3965439210039087, + "grad_norm": 1.9770200734068921, + "learning_rate": 1.3743265040593358e-05, + "loss": 0.7795, + "step": 4819 + }, + { + "epoch": 0.3966262085990537, + "grad_norm": 2.105318678727579, + "learning_rate": 1.3740793380266095e-05, + "loss": 0.7769, + "step": 4820 + }, + { + "epoch": 0.39670849619419873, + "grad_norm": 1.9266919948005554, + "learning_rate": 1.373832145419885e-05, + "loss": 0.7715, + "step": 4821 + }, + { + "epoch": 0.39679078378934374, + "grad_norm": 2.383660140396743, + "learning_rate": 1.3735849262567231e-05, + "loss": 0.785, + "step": 4822 + }, + { + "epoch": 0.3968730713844888, + "grad_norm": 1.8308391472082335, + "learning_rate": 1.3733376805546855e-05, + "loss": 0.7703, + "step": 4823 + }, + { + "epoch": 0.3969553589796338, + "grad_norm": 0.42636535497538935, + "learning_rate": 1.373090408331336e-05, + "loss": 0.5094, + "step": 4824 + }, + { + "epoch": 0.39703764657477886, + "grad_norm": 1.944343003241766, + "learning_rate": 1.3728431096042407e-05, + "loss": 0.7707, + "step": 4825 + }, + { + "epoch": 0.39711993416992386, + "grad_norm": 1.7768131165944197, + "learning_rate": 1.372595784390967e-05, + "loss": 0.7884, + "step": 4826 + }, + { + "epoch": 0.3972022217650689, + "grad_norm": 1.9144595023422326, + "learning_rate": 1.3723484327090846e-05, + "loss": 0.7583, + "step": 4827 + }, + { + "epoch": 0.3972845093602139, + "grad_norm": 0.41638232983062673, + "learning_rate": 1.3721010545761653e-05, + "loss": 0.5149, + "step": 4828 + }, + { + "epoch": 0.397366796955359, + "grad_norm": 2.1243611290257998, + "learning_rate": 1.371853650009782e-05, + "loss": 0.7848, + "step": 4829 + }, + { + "epoch": 0.397449084550504, + "grad_norm": 0.41768502231270427, + "learning_rate": 1.37160621902751e-05, + "loss": 0.5374, + "step": 4830 + }, + { + "epoch": 0.39753137214564904, + "grad_norm": 1.7962254209278277, + "learning_rate": 1.3713587616469266e-05, + "loss": 0.767, + "step": 4831 + }, + { + "epoch": 0.3976136597407941, + "grad_norm": 1.7898996121973307, + "learning_rate": 1.3711112778856107e-05, + "loss": 0.7682, + "step": 4832 + }, + { + "epoch": 0.3976959473359391, + "grad_norm": 2.781789426512144, + "learning_rate": 1.3708637677611429e-05, + "loss": 0.7873, + "step": 4833 + }, + { + "epoch": 0.39777823493108416, + "grad_norm": 1.8111725457097099, + "learning_rate": 1.3706162312911064e-05, + "loss": 0.7987, + "step": 4834 + }, + { + "epoch": 0.39786052252622917, + "grad_norm": 2.0199120742026353, + "learning_rate": 1.3703686684930855e-05, + "loss": 0.7897, + "step": 4835 + }, + { + "epoch": 0.3979428101213742, + "grad_norm": 0.4221006521753932, + "learning_rate": 1.3701210793846667e-05, + "loss": 0.5703, + "step": 4836 + }, + { + "epoch": 0.3980250977165192, + "grad_norm": 0.4192743796138713, + "learning_rate": 1.3698734639834385e-05, + "loss": 0.5413, + "step": 4837 + }, + { + "epoch": 0.3981073853116643, + "grad_norm": 0.43264439140179317, + "learning_rate": 1.3696258223069908e-05, + "loss": 0.5525, + "step": 4838 + }, + { + "epoch": 0.3981896729068093, + "grad_norm": 1.907492323372713, + "learning_rate": 1.3693781543729157e-05, + "loss": 0.7543, + "step": 4839 + }, + { + "epoch": 0.39827196050195435, + "grad_norm": 1.858901544893876, + "learning_rate": 1.3691304601988074e-05, + "loss": 0.7705, + "step": 4840 + }, + { + "epoch": 0.39835424809709935, + "grad_norm": 1.6933598530478624, + "learning_rate": 1.3688827398022612e-05, + "loss": 0.781, + "step": 4841 + }, + { + "epoch": 0.3984365356922444, + "grad_norm": 2.068591134858905, + "learning_rate": 1.3686349932008755e-05, + "loss": 0.7469, + "step": 4842 + }, + { + "epoch": 0.3985188232873894, + "grad_norm": 1.6224698284144443, + "learning_rate": 1.3683872204122495e-05, + "loss": 0.758, + "step": 4843 + }, + { + "epoch": 0.39860111088253447, + "grad_norm": 1.7594490655620967, + "learning_rate": 1.368139421453984e-05, + "loss": 0.7857, + "step": 4844 + }, + { + "epoch": 0.3986833984776795, + "grad_norm": 1.481693328560124, + "learning_rate": 1.3678915963436834e-05, + "loss": 0.7524, + "step": 4845 + }, + { + "epoch": 0.39876568607282453, + "grad_norm": 2.1486729334941774, + "learning_rate": 1.3676437450989518e-05, + "loss": 0.752, + "step": 4846 + }, + { + "epoch": 0.39884797366796954, + "grad_norm": 2.1799912946799878, + "learning_rate": 1.3673958677373964e-05, + "loss": 0.7819, + "step": 4847 + }, + { + "epoch": 0.3989302612631146, + "grad_norm": 2.1180983344081348, + "learning_rate": 1.3671479642766263e-05, + "loss": 0.8214, + "step": 4848 + }, + { + "epoch": 0.3990125488582596, + "grad_norm": 2.7757867868868846, + "learning_rate": 1.3669000347342519e-05, + "loss": 0.7644, + "step": 4849 + }, + { + "epoch": 0.39909483645340466, + "grad_norm": 1.6900340191126524, + "learning_rate": 1.3666520791278859e-05, + "loss": 0.7946, + "step": 4850 + }, + { + "epoch": 0.39917712404854966, + "grad_norm": 1.7298253546304831, + "learning_rate": 1.3664040974751424e-05, + "loss": 0.7745, + "step": 4851 + }, + { + "epoch": 0.3992594116436947, + "grad_norm": 0.46359746310117156, + "learning_rate": 1.3661560897936379e-05, + "loss": 0.5404, + "step": 4852 + }, + { + "epoch": 0.3993416992388397, + "grad_norm": 1.6835608035272525, + "learning_rate": 1.3659080561009904e-05, + "loss": 0.7727, + "step": 4853 + }, + { + "epoch": 0.3994239868339848, + "grad_norm": 1.5474459085552865, + "learning_rate": 1.3656599964148198e-05, + "loss": 0.7466, + "step": 4854 + }, + { + "epoch": 0.3995062744291298, + "grad_norm": 1.8375199925729886, + "learning_rate": 1.3654119107527477e-05, + "loss": 0.7441, + "step": 4855 + }, + { + "epoch": 0.39958856202427484, + "grad_norm": 2.9603745816889036, + "learning_rate": 1.3651637991323981e-05, + "loss": 0.7595, + "step": 4856 + }, + { + "epoch": 0.3996708496194199, + "grad_norm": 1.8405677376439393, + "learning_rate": 1.364915661571396e-05, + "loss": 0.768, + "step": 4857 + }, + { + "epoch": 0.3997531372145649, + "grad_norm": 0.44053315994172615, + "learning_rate": 1.3646674980873689e-05, + "loss": 0.547, + "step": 4858 + }, + { + "epoch": 0.39983542480970996, + "grad_norm": 0.4214633069703209, + "learning_rate": 1.3644193086979458e-05, + "loss": 0.5257, + "step": 4859 + }, + { + "epoch": 0.39991771240485496, + "grad_norm": 2.3436845812867064, + "learning_rate": 1.3641710934207582e-05, + "loss": 0.7553, + "step": 4860 + }, + { + "epoch": 0.4, + "grad_norm": 1.7970209908615555, + "learning_rate": 1.3639228522734382e-05, + "loss": 0.7962, + "step": 4861 + }, + { + "epoch": 0.400082287595145, + "grad_norm": 0.4087400009461183, + "learning_rate": 1.3636745852736209e-05, + "loss": 0.4983, + "step": 4862 + }, + { + "epoch": 0.4001645751902901, + "grad_norm": 1.960318064560911, + "learning_rate": 1.3634262924389427e-05, + "loss": 0.7403, + "step": 4863 + }, + { + "epoch": 0.4002468627854351, + "grad_norm": 1.6788656523589116, + "learning_rate": 1.3631779737870419e-05, + "loss": 0.7595, + "step": 4864 + }, + { + "epoch": 0.40032915038058015, + "grad_norm": 1.8196290392894898, + "learning_rate": 1.3629296293355585e-05, + "loss": 0.7688, + "step": 4865 + }, + { + "epoch": 0.40041143797572515, + "grad_norm": 1.5825305355553214, + "learning_rate": 1.362681259102135e-05, + "loss": 0.7621, + "step": 4866 + }, + { + "epoch": 0.4004937255708702, + "grad_norm": 2.0454788013366407, + "learning_rate": 1.3624328631044146e-05, + "loss": 0.8071, + "step": 4867 + }, + { + "epoch": 0.4005760131660152, + "grad_norm": 1.8777481040983126, + "learning_rate": 1.3621844413600431e-05, + "loss": 0.7801, + "step": 4868 + }, + { + "epoch": 0.40065830076116027, + "grad_norm": 1.4105119946806837, + "learning_rate": 1.3619359938866686e-05, + "loss": 0.7581, + "step": 4869 + }, + { + "epoch": 0.40074058835630527, + "grad_norm": 1.7471208973659427, + "learning_rate": 1.3616875207019394e-05, + "loss": 0.7758, + "step": 4870 + }, + { + "epoch": 0.40082287595145033, + "grad_norm": 2.461518212038076, + "learning_rate": 1.3614390218235073e-05, + "loss": 0.7449, + "step": 4871 + }, + { + "epoch": 0.40090516354659533, + "grad_norm": 1.4596985079474272, + "learning_rate": 1.3611904972690253e-05, + "loss": 0.7655, + "step": 4872 + }, + { + "epoch": 0.4009874511417404, + "grad_norm": 1.4378911626840982, + "learning_rate": 1.360941947056148e-05, + "loss": 0.7682, + "step": 4873 + }, + { + "epoch": 0.4010697387368854, + "grad_norm": 1.760108551466199, + "learning_rate": 1.3606933712025322e-05, + "loss": 0.7495, + "step": 4874 + }, + { + "epoch": 0.40115202633203045, + "grad_norm": 1.740035980918573, + "learning_rate": 1.3604447697258355e-05, + "loss": 0.7786, + "step": 4875 + }, + { + "epoch": 0.40123431392717546, + "grad_norm": 1.4004694157203506, + "learning_rate": 1.3601961426437194e-05, + "loss": 0.7797, + "step": 4876 + }, + { + "epoch": 0.4013166015223205, + "grad_norm": 0.5046879122535155, + "learning_rate": 1.3599474899738452e-05, + "loss": 0.5435, + "step": 4877 + }, + { + "epoch": 0.4013988891174655, + "grad_norm": 1.788820169973912, + "learning_rate": 1.359698811733877e-05, + "loss": 0.7697, + "step": 4878 + }, + { + "epoch": 0.4014811767126106, + "grad_norm": 1.8614872245999197, + "learning_rate": 1.3594501079414802e-05, + "loss": 0.8038, + "step": 4879 + }, + { + "epoch": 0.4015634643077556, + "grad_norm": 1.8484023683823119, + "learning_rate": 1.359201378614323e-05, + "loss": 0.7699, + "step": 4880 + }, + { + "epoch": 0.40164575190290064, + "grad_norm": 2.0614830442463603, + "learning_rate": 1.358952623770074e-05, + "loss": 0.8072, + "step": 4881 + }, + { + "epoch": 0.4017280394980457, + "grad_norm": 3.9453174569222447, + "learning_rate": 1.3587038434264049e-05, + "loss": 0.7893, + "step": 4882 + }, + { + "epoch": 0.4018103270931907, + "grad_norm": 0.40279339420393756, + "learning_rate": 1.3584550376009884e-05, + "loss": 0.517, + "step": 4883 + }, + { + "epoch": 0.40189261468833576, + "grad_norm": 2.696904320271931, + "learning_rate": 1.358206206311499e-05, + "loss": 0.7604, + "step": 4884 + }, + { + "epoch": 0.40197490228348076, + "grad_norm": 1.7002055767421063, + "learning_rate": 1.3579573495756138e-05, + "loss": 0.7621, + "step": 4885 + }, + { + "epoch": 0.4020571898786258, + "grad_norm": 1.4417551665854058, + "learning_rate": 1.3577084674110111e-05, + "loss": 0.7701, + "step": 4886 + }, + { + "epoch": 0.4021394774737708, + "grad_norm": 3.1964360376377474, + "learning_rate": 1.3574595598353706e-05, + "loss": 0.7689, + "step": 4887 + }, + { + "epoch": 0.4022217650689159, + "grad_norm": 1.9329056363928834, + "learning_rate": 1.3572106268663748e-05, + "loss": 0.7561, + "step": 4888 + }, + { + "epoch": 0.4023040526640609, + "grad_norm": 0.43722755573668787, + "learning_rate": 1.3569616685217076e-05, + "loss": 0.5259, + "step": 4889 + }, + { + "epoch": 0.40238634025920594, + "grad_norm": 2.6434964768046463, + "learning_rate": 1.356712684819054e-05, + "loss": 0.7579, + "step": 4890 + }, + { + "epoch": 0.40246862785435095, + "grad_norm": 1.9034867153086414, + "learning_rate": 1.3564636757761017e-05, + "loss": 0.7606, + "step": 4891 + }, + { + "epoch": 0.402550915449496, + "grad_norm": 0.4306359159848287, + "learning_rate": 1.3562146414105403e-05, + "loss": 0.5378, + "step": 4892 + }, + { + "epoch": 0.402633203044641, + "grad_norm": 1.3069718280247866, + "learning_rate": 1.3559655817400601e-05, + "loss": 0.7936, + "step": 4893 + }, + { + "epoch": 0.40271549063978607, + "grad_norm": 1.472315444709784, + "learning_rate": 1.3557164967823545e-05, + "loss": 0.8005, + "step": 4894 + }, + { + "epoch": 0.40279777823493107, + "grad_norm": 0.4438865245027994, + "learning_rate": 1.3554673865551178e-05, + "loss": 0.5526, + "step": 4895 + }, + { + "epoch": 0.40288006583007613, + "grad_norm": 0.4092496085919146, + "learning_rate": 1.3552182510760464e-05, + "loss": 0.5127, + "step": 4896 + }, + { + "epoch": 0.40296235342522113, + "grad_norm": 1.947904400507941, + "learning_rate": 1.3549690903628383e-05, + "loss": 0.7586, + "step": 4897 + }, + { + "epoch": 0.4030446410203662, + "grad_norm": 1.3324539759990628, + "learning_rate": 1.3547199044331943e-05, + "loss": 0.7455, + "step": 4898 + }, + { + "epoch": 0.4031269286155112, + "grad_norm": 1.7277746218333458, + "learning_rate": 1.3544706933048151e-05, + "loss": 0.7856, + "step": 4899 + }, + { + "epoch": 0.40320921621065625, + "grad_norm": 1.3631259202292618, + "learning_rate": 1.3542214569954046e-05, + "loss": 0.7752, + "step": 4900 + }, + { + "epoch": 0.40329150380580125, + "grad_norm": 1.7504545288323108, + "learning_rate": 1.3539721955226691e-05, + "loss": 0.8063, + "step": 4901 + }, + { + "epoch": 0.4033737914009463, + "grad_norm": 1.4519026506968586, + "learning_rate": 1.353722908904314e-05, + "loss": 0.759, + "step": 4902 + }, + { + "epoch": 0.4034560789960913, + "grad_norm": 1.6004911677680815, + "learning_rate": 1.3534735971580496e-05, + "loss": 0.7434, + "step": 4903 + }, + { + "epoch": 0.4035383665912364, + "grad_norm": 1.5611945592406076, + "learning_rate": 1.3532242603015861e-05, + "loss": 0.7805, + "step": 4904 + }, + { + "epoch": 0.4036206541863814, + "grad_norm": 1.765741236637157, + "learning_rate": 1.352974898352636e-05, + "loss": 0.798, + "step": 4905 + }, + { + "epoch": 0.40370294178152644, + "grad_norm": 0.4336361154363454, + "learning_rate": 1.3527255113289137e-05, + "loss": 0.5201, + "step": 4906 + }, + { + "epoch": 0.40378522937667144, + "grad_norm": 1.5053764133558618, + "learning_rate": 1.3524760992481349e-05, + "loss": 0.7812, + "step": 4907 + }, + { + "epoch": 0.4038675169718165, + "grad_norm": 2.013336845147902, + "learning_rate": 1.352226662128018e-05, + "loss": 0.7437, + "step": 4908 + }, + { + "epoch": 0.40394980456696156, + "grad_norm": 2.2747824673977366, + "learning_rate": 1.3519771999862825e-05, + "loss": 0.7388, + "step": 4909 + }, + { + "epoch": 0.40403209216210656, + "grad_norm": 1.516793658791264, + "learning_rate": 1.3517277128406492e-05, + "loss": 0.7698, + "step": 4910 + }, + { + "epoch": 0.4041143797572516, + "grad_norm": 0.4388959032271146, + "learning_rate": 1.3514782007088418e-05, + "loss": 0.5314, + "step": 4911 + }, + { + "epoch": 0.4041966673523966, + "grad_norm": 1.483935470558113, + "learning_rate": 1.3512286636085854e-05, + "loss": 0.8138, + "step": 4912 + }, + { + "epoch": 0.4042789549475417, + "grad_norm": 2.072977830658991, + "learning_rate": 1.3509791015576061e-05, + "loss": 0.7891, + "step": 4913 + }, + { + "epoch": 0.4043612425426867, + "grad_norm": 1.723649269677755, + "learning_rate": 1.350729514573633e-05, + "loss": 0.7551, + "step": 4914 + }, + { + "epoch": 0.40444353013783174, + "grad_norm": 1.8307492168372972, + "learning_rate": 1.350479902674396e-05, + "loss": 0.7663, + "step": 4915 + }, + { + "epoch": 0.40452581773297674, + "grad_norm": 1.4880869166911947, + "learning_rate": 1.3502302658776271e-05, + "loss": 0.8055, + "step": 4916 + }, + { + "epoch": 0.4046081053281218, + "grad_norm": 1.6683330303049455, + "learning_rate": 1.3499806042010604e-05, + "loss": 0.7968, + "step": 4917 + }, + { + "epoch": 0.4046903929232668, + "grad_norm": 1.9286416071395267, + "learning_rate": 1.3497309176624311e-05, + "loss": 0.7668, + "step": 4918 + }, + { + "epoch": 0.40477268051841186, + "grad_norm": 0.4293279523174143, + "learning_rate": 1.3494812062794768e-05, + "loss": 0.5036, + "step": 4919 + }, + { + "epoch": 0.40485496811355687, + "grad_norm": 1.7043061673411577, + "learning_rate": 1.3492314700699367e-05, + "loss": 0.7437, + "step": 4920 + }, + { + "epoch": 0.4049372557087019, + "grad_norm": 1.7816697058082576, + "learning_rate": 1.3489817090515514e-05, + "loss": 0.7731, + "step": 4921 + }, + { + "epoch": 0.40501954330384693, + "grad_norm": 1.6717206321745375, + "learning_rate": 1.3487319232420632e-05, + "loss": 0.7676, + "step": 4922 + }, + { + "epoch": 0.405101830898992, + "grad_norm": 1.4730061196113597, + "learning_rate": 1.3484821126592175e-05, + "loss": 0.7392, + "step": 4923 + }, + { + "epoch": 0.405184118494137, + "grad_norm": 1.9198999032145847, + "learning_rate": 1.3482322773207595e-05, + "loss": 0.7713, + "step": 4924 + }, + { + "epoch": 0.40526640608928205, + "grad_norm": 1.5471560715497887, + "learning_rate": 1.3479824172444373e-05, + "loss": 0.7567, + "step": 4925 + }, + { + "epoch": 0.40534869368442705, + "grad_norm": 1.7565074583124216, + "learning_rate": 1.347732532448001e-05, + "loss": 0.7897, + "step": 4926 + }, + { + "epoch": 0.4054309812795721, + "grad_norm": 0.4543176021195966, + "learning_rate": 1.3474826229492018e-05, + "loss": 0.5126, + "step": 4927 + }, + { + "epoch": 0.4055132688747171, + "grad_norm": 1.4826206239126454, + "learning_rate": 1.3472326887657926e-05, + "loss": 0.7615, + "step": 4928 + }, + { + "epoch": 0.4055955564698622, + "grad_norm": 1.6359765350230477, + "learning_rate": 1.3469827299155284e-05, + "loss": 0.7546, + "step": 4929 + }, + { + "epoch": 0.4056778440650072, + "grad_norm": 0.476160976162796, + "learning_rate": 1.3467327464161659e-05, + "loss": 0.5497, + "step": 4930 + }, + { + "epoch": 0.40576013166015223, + "grad_norm": 2.120893877005628, + "learning_rate": 1.3464827382854637e-05, + "loss": 0.7623, + "step": 4931 + }, + { + "epoch": 0.40584241925529724, + "grad_norm": 1.631024438078317, + "learning_rate": 1.3462327055411822e-05, + "loss": 0.779, + "step": 4932 + }, + { + "epoch": 0.4059247068504423, + "grad_norm": 1.82509215102173, + "learning_rate": 1.3459826482010826e-05, + "loss": 0.7645, + "step": 4933 + }, + { + "epoch": 0.40600699444558735, + "grad_norm": 1.933115322850354, + "learning_rate": 1.3457325662829293e-05, + "loss": 0.78, + "step": 4934 + }, + { + "epoch": 0.40608928204073236, + "grad_norm": 1.8278849662481265, + "learning_rate": 1.3454824598044874e-05, + "loss": 0.7634, + "step": 4935 + }, + { + "epoch": 0.4061715696358774, + "grad_norm": 0.4456272425481699, + "learning_rate": 1.3452323287835239e-05, + "loss": 0.5391, + "step": 4936 + }, + { + "epoch": 0.4062538572310224, + "grad_norm": 0.43237911283029584, + "learning_rate": 1.3449821732378077e-05, + "loss": 0.5227, + "step": 4937 + }, + { + "epoch": 0.4063361448261675, + "grad_norm": 1.9109067671292628, + "learning_rate": 1.34473199318511e-05, + "loss": 0.7555, + "step": 4938 + }, + { + "epoch": 0.4064184324213125, + "grad_norm": 1.9286371242590499, + "learning_rate": 1.3444817886432026e-05, + "loss": 0.7714, + "step": 4939 + }, + { + "epoch": 0.40650072001645754, + "grad_norm": 1.6574290059144718, + "learning_rate": 1.34423155962986e-05, + "loss": 0.7609, + "step": 4940 + }, + { + "epoch": 0.40658300761160254, + "grad_norm": 0.45710138595751815, + "learning_rate": 1.343981306162858e-05, + "loss": 0.5307, + "step": 4941 + }, + { + "epoch": 0.4066652952067476, + "grad_norm": 0.4251721524952735, + "learning_rate": 1.3437310282599737e-05, + "loss": 0.5143, + "step": 4942 + }, + { + "epoch": 0.4067475828018926, + "grad_norm": 1.8948576158790909, + "learning_rate": 1.3434807259389872e-05, + "loss": 0.7801, + "step": 4943 + }, + { + "epoch": 0.40682987039703766, + "grad_norm": 1.9587680740921614, + "learning_rate": 1.3432303992176795e-05, + "loss": 0.7797, + "step": 4944 + }, + { + "epoch": 0.40691215799218267, + "grad_norm": 1.570689244456003, + "learning_rate": 1.3429800481138326e-05, + "loss": 0.7691, + "step": 4945 + }, + { + "epoch": 0.4069944455873277, + "grad_norm": 1.56489869939894, + "learning_rate": 1.3427296726452319e-05, + "loss": 0.782, + "step": 4946 + }, + { + "epoch": 0.4070767331824727, + "grad_norm": 1.909738790877715, + "learning_rate": 1.3424792728296633e-05, + "loss": 0.7849, + "step": 4947 + }, + { + "epoch": 0.4071590207776178, + "grad_norm": 1.8394060092834992, + "learning_rate": 1.3422288486849148e-05, + "loss": 0.7736, + "step": 4948 + }, + { + "epoch": 0.4072413083727628, + "grad_norm": 1.7793819166759424, + "learning_rate": 1.3419784002287764e-05, + "loss": 0.7542, + "step": 4949 + }, + { + "epoch": 0.40732359596790785, + "grad_norm": 1.5827679421793317, + "learning_rate": 1.3417279274790394e-05, + "loss": 0.761, + "step": 4950 + }, + { + "epoch": 0.40740588356305285, + "grad_norm": 1.66407348293981, + "learning_rate": 1.3414774304534968e-05, + "loss": 0.7875, + "step": 4951 + }, + { + "epoch": 0.4074881711581979, + "grad_norm": 0.4550484221953091, + "learning_rate": 1.3412269091699435e-05, + "loss": 0.5248, + "step": 4952 + }, + { + "epoch": 0.4075704587533429, + "grad_norm": 1.7068975927109304, + "learning_rate": 1.3409763636461765e-05, + "loss": 0.7994, + "step": 4953 + }, + { + "epoch": 0.40765274634848797, + "grad_norm": 2.3984482340661573, + "learning_rate": 1.3407257938999938e-05, + "loss": 0.7316, + "step": 4954 + }, + { + "epoch": 0.407735033943633, + "grad_norm": 2.0420382413869484, + "learning_rate": 1.3404751999491958e-05, + "loss": 0.7848, + "step": 4955 + }, + { + "epoch": 0.40781732153877803, + "grad_norm": 1.8052502540531599, + "learning_rate": 1.340224581811584e-05, + "loss": 0.8001, + "step": 4956 + }, + { + "epoch": 0.40789960913392304, + "grad_norm": 2.73852070821008, + "learning_rate": 1.339973939504962e-05, + "loss": 0.7491, + "step": 4957 + }, + { + "epoch": 0.4079818967290681, + "grad_norm": 1.5488400223933032, + "learning_rate": 1.3397232730471353e-05, + "loss": 0.7788, + "step": 4958 + }, + { + "epoch": 0.4080641843242131, + "grad_norm": 1.8333193990506436, + "learning_rate": 1.33947258245591e-05, + "loss": 0.7637, + "step": 4959 + }, + { + "epoch": 0.40814647191935816, + "grad_norm": 1.8251438014290884, + "learning_rate": 1.3392218677490957e-05, + "loss": 0.7711, + "step": 4960 + }, + { + "epoch": 0.4082287595145032, + "grad_norm": 1.426156751809347, + "learning_rate": 1.3389711289445026e-05, + "loss": 0.7487, + "step": 4961 + }, + { + "epoch": 0.4083110471096482, + "grad_norm": 0.43221766606680473, + "learning_rate": 1.3387203660599425e-05, + "loss": 0.5147, + "step": 4962 + }, + { + "epoch": 0.4083933347047933, + "grad_norm": 1.6612970987914661, + "learning_rate": 1.338469579113229e-05, + "loss": 0.8006, + "step": 4963 + }, + { + "epoch": 0.4084756222999383, + "grad_norm": 0.44694152905733986, + "learning_rate": 1.3382187681221784e-05, + "loss": 0.5438, + "step": 4964 + }, + { + "epoch": 0.40855790989508334, + "grad_norm": 1.5338464731870534, + "learning_rate": 1.3379679331046069e-05, + "loss": 0.8067, + "step": 4965 + }, + { + "epoch": 0.40864019749022834, + "grad_norm": 0.40968562968932015, + "learning_rate": 1.3377170740783343e-05, + "loss": 0.5191, + "step": 4966 + }, + { + "epoch": 0.4087224850853734, + "grad_norm": 1.6944304650460844, + "learning_rate": 1.3374661910611809e-05, + "loss": 0.7664, + "step": 4967 + }, + { + "epoch": 0.4088047726805184, + "grad_norm": 1.6610118429501306, + "learning_rate": 1.3372152840709686e-05, + "loss": 0.7863, + "step": 4968 + }, + { + "epoch": 0.40888706027566346, + "grad_norm": 1.6715927094383753, + "learning_rate": 1.3369643531255218e-05, + "loss": 0.7858, + "step": 4969 + }, + { + "epoch": 0.40896934787080846, + "grad_norm": 0.43741151859608335, + "learning_rate": 1.3367133982426668e-05, + "loss": 0.5261, + "step": 4970 + }, + { + "epoch": 0.4090516354659535, + "grad_norm": 0.4229882143348287, + "learning_rate": 1.3364624194402301e-05, + "loss": 0.5341, + "step": 4971 + }, + { + "epoch": 0.4091339230610985, + "grad_norm": 1.6934227673315239, + "learning_rate": 1.3362114167360412e-05, + "loss": 0.7595, + "step": 4972 + }, + { + "epoch": 0.4092162106562436, + "grad_norm": 1.9653945370709516, + "learning_rate": 1.335960390147931e-05, + "loss": 0.7886, + "step": 4973 + }, + { + "epoch": 0.4092984982513886, + "grad_norm": 1.794734109874099, + "learning_rate": 1.3357093396937319e-05, + "loss": 0.7616, + "step": 4974 + }, + { + "epoch": 0.40938078584653365, + "grad_norm": 0.4140601438438201, + "learning_rate": 1.3354582653912784e-05, + "loss": 0.5279, + "step": 4975 + }, + { + "epoch": 0.40946307344167865, + "grad_norm": 2.1144337734196648, + "learning_rate": 1.3352071672584062e-05, + "loss": 0.777, + "step": 4976 + }, + { + "epoch": 0.4095453610368237, + "grad_norm": 2.323473111288366, + "learning_rate": 1.3349560453129527e-05, + "loss": 0.7418, + "step": 4977 + }, + { + "epoch": 0.4096276486319687, + "grad_norm": 1.6401170242632215, + "learning_rate": 1.3347048995727576e-05, + "loss": 0.7917, + "step": 4978 + }, + { + "epoch": 0.40970993622711377, + "grad_norm": 2.2864545947676818, + "learning_rate": 1.3344537300556618e-05, + "loss": 0.7452, + "step": 4979 + }, + { + "epoch": 0.40979222382225877, + "grad_norm": 2.3877773205994433, + "learning_rate": 1.3342025367795076e-05, + "loss": 0.7935, + "step": 4980 + }, + { + "epoch": 0.40987451141740383, + "grad_norm": 1.9004796937524528, + "learning_rate": 1.3339513197621398e-05, + "loss": 0.7389, + "step": 4981 + }, + { + "epoch": 0.40995679901254883, + "grad_norm": 1.3529487501644817, + "learning_rate": 1.3337000790214044e-05, + "loss": 0.7688, + "step": 4982 + }, + { + "epoch": 0.4100390866076939, + "grad_norm": 1.6012659422765416, + "learning_rate": 1.333448814575149e-05, + "loss": 0.7801, + "step": 4983 + }, + { + "epoch": 0.4101213742028389, + "grad_norm": 2.054345242654024, + "learning_rate": 1.333197526441223e-05, + "loss": 0.7617, + "step": 4984 + }, + { + "epoch": 0.41020366179798395, + "grad_norm": 1.959113764419574, + "learning_rate": 1.3329462146374779e-05, + "loss": 0.7816, + "step": 4985 + }, + { + "epoch": 0.410285949393129, + "grad_norm": 1.6701850943567758, + "learning_rate": 1.3326948791817657e-05, + "loss": 0.7537, + "step": 4986 + }, + { + "epoch": 0.410368236988274, + "grad_norm": 1.651622376716435, + "learning_rate": 1.3324435200919417e-05, + "loss": 0.7415, + "step": 4987 + }, + { + "epoch": 0.4104505245834191, + "grad_norm": 2.2237698477766585, + "learning_rate": 1.3321921373858618e-05, + "loss": 0.7537, + "step": 4988 + }, + { + "epoch": 0.4105328121785641, + "grad_norm": 1.8715363961845082, + "learning_rate": 1.3319407310813835e-05, + "loss": 0.7488, + "step": 4989 + }, + { + "epoch": 0.41061509977370914, + "grad_norm": 1.9547084540854163, + "learning_rate": 1.3316893011963665e-05, + "loss": 0.7655, + "step": 4990 + }, + { + "epoch": 0.41069738736885414, + "grad_norm": 2.565588007076143, + "learning_rate": 1.331437847748672e-05, + "loss": 0.7869, + "step": 4991 + }, + { + "epoch": 0.4107796749639992, + "grad_norm": 5.844086980839877, + "learning_rate": 1.331186370756163e-05, + "loss": 0.7878, + "step": 4992 + }, + { + "epoch": 0.4108619625591442, + "grad_norm": 1.640678613354489, + "learning_rate": 1.3309348702367038e-05, + "loss": 0.7665, + "step": 4993 + }, + { + "epoch": 0.41094425015428926, + "grad_norm": 0.45098185305666094, + "learning_rate": 1.3306833462081606e-05, + "loss": 0.5526, + "step": 4994 + }, + { + "epoch": 0.41102653774943426, + "grad_norm": 0.445368061974247, + "learning_rate": 1.3304317986884012e-05, + "loss": 0.5129, + "step": 4995 + }, + { + "epoch": 0.4111088253445793, + "grad_norm": 0.4093054511187383, + "learning_rate": 1.3301802276952957e-05, + "loss": 0.5241, + "step": 4996 + }, + { + "epoch": 0.4111911129397243, + "grad_norm": 2.097335579601903, + "learning_rate": 1.3299286332467145e-05, + "loss": 0.7642, + "step": 4997 + }, + { + "epoch": 0.4112734005348694, + "grad_norm": 1.8006639441423546, + "learning_rate": 1.329677015360531e-05, + "loss": 0.7452, + "step": 4998 + }, + { + "epoch": 0.4113556881300144, + "grad_norm": 2.1119563961456067, + "learning_rate": 1.3294253740546197e-05, + "loss": 0.7777, + "step": 4999 + }, + { + "epoch": 0.41143797572515944, + "grad_norm": 0.4676953271688469, + "learning_rate": 1.3291737093468564e-05, + "loss": 0.5323, + "step": 5000 + }, + { + "epoch": 0.41152026332030445, + "grad_norm": 1.8416364935911176, + "learning_rate": 1.3289220212551193e-05, + "loss": 0.7334, + "step": 5001 + }, + { + "epoch": 0.4116025509154495, + "grad_norm": 2.934895327517708, + "learning_rate": 1.328670309797288e-05, + "loss": 0.7917, + "step": 5002 + }, + { + "epoch": 0.4116848385105945, + "grad_norm": 2.026576851353003, + "learning_rate": 1.3284185749912432e-05, + "loss": 0.7702, + "step": 5003 + }, + { + "epoch": 0.41176712610573957, + "grad_norm": 1.5190830151534365, + "learning_rate": 1.3281668168548683e-05, + "loss": 0.7797, + "step": 5004 + }, + { + "epoch": 0.41184941370088457, + "grad_norm": 0.4451024141132392, + "learning_rate": 1.3279150354060475e-05, + "loss": 0.5255, + "step": 5005 + }, + { + "epoch": 0.41193170129602963, + "grad_norm": 1.562207655710592, + "learning_rate": 1.327663230662667e-05, + "loss": 0.7661, + "step": 5006 + }, + { + "epoch": 0.41201398889117463, + "grad_norm": 1.938967949231585, + "learning_rate": 1.3274114026426146e-05, + "loss": 0.7681, + "step": 5007 + }, + { + "epoch": 0.4120962764863197, + "grad_norm": 0.404526320222232, + "learning_rate": 1.32715955136378e-05, + "loss": 0.4853, + "step": 5008 + }, + { + "epoch": 0.4121785640814647, + "grad_norm": 1.9063971261537724, + "learning_rate": 1.3269076768440537e-05, + "loss": 0.7456, + "step": 5009 + }, + { + "epoch": 0.41226085167660975, + "grad_norm": 1.532026313709754, + "learning_rate": 1.326655779101329e-05, + "loss": 0.7899, + "step": 5010 + }, + { + "epoch": 0.4123431392717548, + "grad_norm": 1.6015679866881218, + "learning_rate": 1.3264038581535004e-05, + "loss": 0.7847, + "step": 5011 + }, + { + "epoch": 0.4124254268668998, + "grad_norm": 1.407031326654496, + "learning_rate": 1.3261519140184636e-05, + "loss": 0.7722, + "step": 5012 + }, + { + "epoch": 0.41250771446204487, + "grad_norm": 1.4834378337977596, + "learning_rate": 1.3258999467141161e-05, + "loss": 0.7741, + "step": 5013 + }, + { + "epoch": 0.4125900020571899, + "grad_norm": 1.6396831554931321, + "learning_rate": 1.325647956258358e-05, + "loss": 0.7666, + "step": 5014 + }, + { + "epoch": 0.41267228965233493, + "grad_norm": 1.6117830343426984, + "learning_rate": 1.3253959426690897e-05, + "loss": 0.7774, + "step": 5015 + }, + { + "epoch": 0.41275457724747994, + "grad_norm": 1.7020871320447157, + "learning_rate": 1.325143905964214e-05, + "loss": 0.7686, + "step": 5016 + }, + { + "epoch": 0.412836864842625, + "grad_norm": 1.404765997143861, + "learning_rate": 1.3248918461616353e-05, + "loss": 0.7598, + "step": 5017 + }, + { + "epoch": 0.41291915243777, + "grad_norm": 1.6069357360065282, + "learning_rate": 1.3246397632792593e-05, + "loss": 0.7929, + "step": 5018 + }, + { + "epoch": 0.41300144003291506, + "grad_norm": 1.4990493975812995, + "learning_rate": 1.3243876573349939e-05, + "loss": 0.7749, + "step": 5019 + }, + { + "epoch": 0.41308372762806006, + "grad_norm": 1.8155847466606108, + "learning_rate": 1.324135528346748e-05, + "loss": 0.7806, + "step": 5020 + }, + { + "epoch": 0.4131660152232051, + "grad_norm": 2.0798069498276908, + "learning_rate": 1.3238833763324323e-05, + "loss": 0.7714, + "step": 5021 + }, + { + "epoch": 0.4132483028183501, + "grad_norm": 1.8709270224054353, + "learning_rate": 1.3236312013099598e-05, + "loss": 0.7602, + "step": 5022 + }, + { + "epoch": 0.4133305904134952, + "grad_norm": 1.4978048369194639, + "learning_rate": 1.3233790032972444e-05, + "loss": 0.7914, + "step": 5023 + }, + { + "epoch": 0.4134128780086402, + "grad_norm": 3.4795839336524765, + "learning_rate": 1.3231267823122016e-05, + "loss": 0.7632, + "step": 5024 + }, + { + "epoch": 0.41349516560378524, + "grad_norm": 1.4927364874378168, + "learning_rate": 1.3228745383727493e-05, + "loss": 0.7543, + "step": 5025 + }, + { + "epoch": 0.41357745319893024, + "grad_norm": 1.3905234703794662, + "learning_rate": 1.3226222714968058e-05, + "loss": 0.7735, + "step": 5026 + }, + { + "epoch": 0.4136597407940753, + "grad_norm": 1.773141390542888, + "learning_rate": 1.3223699817022921e-05, + "loss": 0.7598, + "step": 5027 + }, + { + "epoch": 0.4137420283892203, + "grad_norm": 0.43882232444174063, + "learning_rate": 1.3221176690071307e-05, + "loss": 0.5115, + "step": 5028 + }, + { + "epoch": 0.41382431598436537, + "grad_norm": 1.309350723763602, + "learning_rate": 1.3218653334292451e-05, + "loss": 0.7485, + "step": 5029 + }, + { + "epoch": 0.41390660357951037, + "grad_norm": 1.6168800239630767, + "learning_rate": 1.3216129749865606e-05, + "loss": 0.7931, + "step": 5030 + }, + { + "epoch": 0.4139888911746554, + "grad_norm": 2.872333521063567, + "learning_rate": 1.3213605936970054e-05, + "loss": 0.7781, + "step": 5031 + }, + { + "epoch": 0.41407117876980043, + "grad_norm": 1.4324809158834964, + "learning_rate": 1.321108189578507e-05, + "loss": 0.7444, + "step": 5032 + }, + { + "epoch": 0.4141534663649455, + "grad_norm": 0.44709914413940904, + "learning_rate": 1.3208557626489964e-05, + "loss": 0.5172, + "step": 5033 + }, + { + "epoch": 0.4142357539600905, + "grad_norm": 2.0498498733476733, + "learning_rate": 1.320603312926406e-05, + "loss": 0.7692, + "step": 5034 + }, + { + "epoch": 0.41431804155523555, + "grad_norm": 0.4145305896412639, + "learning_rate": 1.3203508404286685e-05, + "loss": 0.5007, + "step": 5035 + }, + { + "epoch": 0.41440032915038055, + "grad_norm": 1.9323844636158003, + "learning_rate": 1.3200983451737197e-05, + "loss": 0.7846, + "step": 5036 + }, + { + "epoch": 0.4144826167455256, + "grad_norm": 1.8689877922629514, + "learning_rate": 1.3198458271794967e-05, + "loss": 0.7777, + "step": 5037 + }, + { + "epoch": 0.41456490434067067, + "grad_norm": 0.41320740258994243, + "learning_rate": 1.3195932864639373e-05, + "loss": 0.5143, + "step": 5038 + }, + { + "epoch": 0.4146471919358157, + "grad_norm": 0.4304451548969223, + "learning_rate": 1.319340723044982e-05, + "loss": 0.5276, + "step": 5039 + }, + { + "epoch": 0.41472947953096073, + "grad_norm": 1.4559434127961126, + "learning_rate": 1.3190881369405726e-05, + "loss": 0.7784, + "step": 5040 + }, + { + "epoch": 0.41481176712610573, + "grad_norm": 1.632737275825597, + "learning_rate": 1.3188355281686522e-05, + "loss": 0.7802, + "step": 5041 + }, + { + "epoch": 0.4148940547212508, + "grad_norm": 1.7025871052445793, + "learning_rate": 1.3185828967471657e-05, + "loss": 0.7772, + "step": 5042 + }, + { + "epoch": 0.4149763423163958, + "grad_norm": 1.7821818346494938, + "learning_rate": 1.3183302426940598e-05, + "loss": 0.7285, + "step": 5043 + }, + { + "epoch": 0.41505862991154086, + "grad_norm": 2.138285823653135, + "learning_rate": 1.3180775660272827e-05, + "loss": 0.7524, + "step": 5044 + }, + { + "epoch": 0.41514091750668586, + "grad_norm": 3.0380970524618123, + "learning_rate": 1.3178248667647835e-05, + "loss": 0.7538, + "step": 5045 + }, + { + "epoch": 0.4152232051018309, + "grad_norm": 0.41941940853710796, + "learning_rate": 1.3175721449245148e-05, + "loss": 0.522, + "step": 5046 + }, + { + "epoch": 0.4153054926969759, + "grad_norm": 1.4236763156729122, + "learning_rate": 1.317319400524428e-05, + "loss": 0.7799, + "step": 5047 + }, + { + "epoch": 0.415387780292121, + "grad_norm": 0.4284914301697701, + "learning_rate": 1.3170666335824789e-05, + "loss": 0.5323, + "step": 5048 + }, + { + "epoch": 0.415470067887266, + "grad_norm": 1.6007102982745622, + "learning_rate": 1.3168138441166234e-05, + "loss": 0.7521, + "step": 5049 + }, + { + "epoch": 0.41555235548241104, + "grad_norm": 1.6727941336003327, + "learning_rate": 1.3165610321448186e-05, + "loss": 0.7493, + "step": 5050 + }, + { + "epoch": 0.41563464307755604, + "grad_norm": 0.4264443049783327, + "learning_rate": 1.3163081976850248e-05, + "loss": 0.5184, + "step": 5051 + }, + { + "epoch": 0.4157169306727011, + "grad_norm": 1.5994047406573793, + "learning_rate": 1.3160553407552024e-05, + "loss": 0.7674, + "step": 5052 + }, + { + "epoch": 0.4157992182678461, + "grad_norm": 1.7260818919341794, + "learning_rate": 1.3158024613733139e-05, + "loss": 0.7804, + "step": 5053 + }, + { + "epoch": 0.41588150586299116, + "grad_norm": 0.41801490898079263, + "learning_rate": 1.3155495595573234e-05, + "loss": 0.5019, + "step": 5054 + }, + { + "epoch": 0.41596379345813617, + "grad_norm": 1.5004590058620488, + "learning_rate": 1.3152966353251973e-05, + "loss": 0.7539, + "step": 5055 + }, + { + "epoch": 0.4160460810532812, + "grad_norm": 1.3735538690733806, + "learning_rate": 1.3150436886949021e-05, + "loss": 0.7639, + "step": 5056 + }, + { + "epoch": 0.41612836864842623, + "grad_norm": 1.3814041819734526, + "learning_rate": 1.3147907196844071e-05, + "loss": 0.7569, + "step": 5057 + }, + { + "epoch": 0.4162106562435713, + "grad_norm": 1.3577704238087476, + "learning_rate": 1.314537728311683e-05, + "loss": 0.7481, + "step": 5058 + }, + { + "epoch": 0.4162929438387163, + "grad_norm": 1.5892925066526178, + "learning_rate": 1.3142847145947017e-05, + "loss": 0.7899, + "step": 5059 + }, + { + "epoch": 0.41637523143386135, + "grad_norm": 1.5791454059845709, + "learning_rate": 1.3140316785514368e-05, + "loss": 0.7731, + "step": 5060 + }, + { + "epoch": 0.41645751902900635, + "grad_norm": 0.4481325769688427, + "learning_rate": 1.3137786201998637e-05, + "loss": 0.5292, + "step": 5061 + }, + { + "epoch": 0.4165398066241514, + "grad_norm": 1.3512972698946157, + "learning_rate": 1.3135255395579594e-05, + "loss": 0.7715, + "step": 5062 + }, + { + "epoch": 0.41662209421929647, + "grad_norm": 1.726565033728082, + "learning_rate": 1.3132724366437022e-05, + "loss": 0.7821, + "step": 5063 + }, + { + "epoch": 0.41670438181444147, + "grad_norm": 1.869590668951175, + "learning_rate": 1.3130193114750717e-05, + "loss": 0.7904, + "step": 5064 + }, + { + "epoch": 0.41678666940958653, + "grad_norm": 1.5467724285537807, + "learning_rate": 1.3127661640700502e-05, + "loss": 0.7613, + "step": 5065 + }, + { + "epoch": 0.41686895700473153, + "grad_norm": 1.3344513657846178, + "learning_rate": 1.3125129944466205e-05, + "loss": 0.7569, + "step": 5066 + }, + { + "epoch": 0.4169512445998766, + "grad_norm": 1.6527148570668175, + "learning_rate": 1.3122598026227675e-05, + "loss": 0.7746, + "step": 5067 + }, + { + "epoch": 0.4170335321950216, + "grad_norm": 1.4182979069458792, + "learning_rate": 1.3120065886164776e-05, + "loss": 0.754, + "step": 5068 + }, + { + "epoch": 0.41711581979016665, + "grad_norm": 1.5485382538373795, + "learning_rate": 1.3117533524457386e-05, + "loss": 0.7724, + "step": 5069 + }, + { + "epoch": 0.41719810738531166, + "grad_norm": 1.567997263670981, + "learning_rate": 1.3115000941285399e-05, + "loss": 0.7961, + "step": 5070 + }, + { + "epoch": 0.4172803949804567, + "grad_norm": 1.7212045886670109, + "learning_rate": 1.311246813682873e-05, + "loss": 0.7651, + "step": 5071 + }, + { + "epoch": 0.4173626825756017, + "grad_norm": 1.256124529216021, + "learning_rate": 1.3109935111267299e-05, + "loss": 0.7727, + "step": 5072 + }, + { + "epoch": 0.4174449701707468, + "grad_norm": 1.5209157618989988, + "learning_rate": 1.310740186478105e-05, + "loss": 0.775, + "step": 5073 + }, + { + "epoch": 0.4175272577658918, + "grad_norm": 0.46085776853616245, + "learning_rate": 1.3104868397549946e-05, + "loss": 0.5252, + "step": 5074 + }, + { + "epoch": 0.41760954536103684, + "grad_norm": 1.4560600220037676, + "learning_rate": 1.3102334709753955e-05, + "loss": 0.7553, + "step": 5075 + }, + { + "epoch": 0.41769183295618184, + "grad_norm": 1.6783398693789424, + "learning_rate": 1.3099800801573066e-05, + "loss": 0.7634, + "step": 5076 + }, + { + "epoch": 0.4177741205513269, + "grad_norm": 2.820381075672895, + "learning_rate": 1.3097266673187288e-05, + "loss": 0.7731, + "step": 5077 + }, + { + "epoch": 0.4178564081464719, + "grad_norm": 1.5264833489712644, + "learning_rate": 1.3094732324776638e-05, + "loss": 0.7735, + "step": 5078 + }, + { + "epoch": 0.41793869574161696, + "grad_norm": 1.3234040456856644, + "learning_rate": 1.3092197756521153e-05, + "loss": 0.7531, + "step": 5079 + }, + { + "epoch": 0.41802098333676196, + "grad_norm": 1.3920292831488978, + "learning_rate": 1.3089662968600883e-05, + "loss": 0.7373, + "step": 5080 + }, + { + "epoch": 0.418103270931907, + "grad_norm": 1.6173580320900134, + "learning_rate": 1.3087127961195898e-05, + "loss": 0.7719, + "step": 5081 + }, + { + "epoch": 0.418185558527052, + "grad_norm": 1.8838472381637519, + "learning_rate": 1.3084592734486278e-05, + "loss": 0.7833, + "step": 5082 + }, + { + "epoch": 0.4182678461221971, + "grad_norm": 1.884730841104447, + "learning_rate": 1.3082057288652125e-05, + "loss": 0.7747, + "step": 5083 + }, + { + "epoch": 0.4183501337173421, + "grad_norm": 1.753595390408341, + "learning_rate": 1.3079521623873555e-05, + "loss": 0.7897, + "step": 5084 + }, + { + "epoch": 0.41843242131248715, + "grad_norm": 1.8208443359581845, + "learning_rate": 1.307698574033069e-05, + "loss": 0.7808, + "step": 5085 + }, + { + "epoch": 0.41851470890763215, + "grad_norm": 0.4371661845162275, + "learning_rate": 1.3074449638203676e-05, + "loss": 0.4759, + "step": 5086 + }, + { + "epoch": 0.4185969965027772, + "grad_norm": 0.4440348031963649, + "learning_rate": 1.3071913317672681e-05, + "loss": 0.5562, + "step": 5087 + }, + { + "epoch": 0.4186792840979222, + "grad_norm": 1.8712632530125262, + "learning_rate": 1.3069376778917878e-05, + "loss": 0.7673, + "step": 5088 + }, + { + "epoch": 0.41876157169306727, + "grad_norm": 0.4155705831410766, + "learning_rate": 1.3066840022119452e-05, + "loss": 0.4923, + "step": 5089 + }, + { + "epoch": 0.41884385928821233, + "grad_norm": 0.39515421812835205, + "learning_rate": 1.306430304745762e-05, + "loss": 0.4999, + "step": 5090 + }, + { + "epoch": 0.41892614688335733, + "grad_norm": 1.4054890040026182, + "learning_rate": 1.3061765855112598e-05, + "loss": 0.7741, + "step": 5091 + }, + { + "epoch": 0.4190084344785024, + "grad_norm": 2.2335756350166465, + "learning_rate": 1.3059228445264624e-05, + "loss": 0.7751, + "step": 5092 + }, + { + "epoch": 0.4190907220736474, + "grad_norm": 1.6600869877654398, + "learning_rate": 1.305669081809396e-05, + "loss": 0.7702, + "step": 5093 + }, + { + "epoch": 0.41917300966879245, + "grad_norm": 1.4465100812790435, + "learning_rate": 1.3054152973780862e-05, + "loss": 0.7673, + "step": 5094 + }, + { + "epoch": 0.41925529726393745, + "grad_norm": 1.5671428187118122, + "learning_rate": 1.3051614912505628e-05, + "loss": 0.777, + "step": 5095 + }, + { + "epoch": 0.4193375848590825, + "grad_norm": 1.5056303804208666, + "learning_rate": 1.3049076634448546e-05, + "loss": 0.7799, + "step": 5096 + }, + { + "epoch": 0.4194198724542275, + "grad_norm": 1.3565405943152693, + "learning_rate": 1.3046538139789937e-05, + "loss": 0.7873, + "step": 5097 + }, + { + "epoch": 0.4195021600493726, + "grad_norm": 2.201298718812234, + "learning_rate": 1.3043999428710132e-05, + "loss": 0.7538, + "step": 5098 + }, + { + "epoch": 0.4195844476445176, + "grad_norm": 2.0883371235498713, + "learning_rate": 1.3041460501389475e-05, + "loss": 0.7705, + "step": 5099 + }, + { + "epoch": 0.41966673523966264, + "grad_norm": 1.3634639619305107, + "learning_rate": 1.3038921358008325e-05, + "loss": 0.7558, + "step": 5100 + }, + { + "epoch": 0.41974902283480764, + "grad_norm": 1.4475871216128158, + "learning_rate": 1.3036381998747066e-05, + "loss": 0.7678, + "step": 5101 + }, + { + "epoch": 0.4198313104299527, + "grad_norm": 1.6263776836927404, + "learning_rate": 1.3033842423786083e-05, + "loss": 0.7593, + "step": 5102 + }, + { + "epoch": 0.4199135980250977, + "grad_norm": 1.7103980706401114, + "learning_rate": 1.3031302633305786e-05, + "loss": 0.7618, + "step": 5103 + }, + { + "epoch": 0.41999588562024276, + "grad_norm": 1.801477064688305, + "learning_rate": 1.3028762627486602e-05, + "loss": 0.7893, + "step": 5104 + }, + { + "epoch": 0.42007817321538776, + "grad_norm": 2.0092072779892396, + "learning_rate": 1.3026222406508961e-05, + "loss": 0.772, + "step": 5105 + }, + { + "epoch": 0.4201604608105328, + "grad_norm": 1.4979470117010707, + "learning_rate": 1.3023681970553324e-05, + "loss": 0.7719, + "step": 5106 + }, + { + "epoch": 0.4202427484056778, + "grad_norm": 1.3653751066921536, + "learning_rate": 1.302114131980015e-05, + "loss": 0.7713, + "step": 5107 + }, + { + "epoch": 0.4203250360008229, + "grad_norm": 1.533035185977836, + "learning_rate": 1.301860045442993e-05, + "loss": 0.7644, + "step": 5108 + }, + { + "epoch": 0.4204073235959679, + "grad_norm": 1.5860031475417253, + "learning_rate": 1.3016059374623164e-05, + "loss": 0.7909, + "step": 5109 + }, + { + "epoch": 0.42048961119111294, + "grad_norm": 1.4152382400076218, + "learning_rate": 1.3013518080560363e-05, + "loss": 0.7675, + "step": 5110 + }, + { + "epoch": 0.42057189878625795, + "grad_norm": 1.5608399835234603, + "learning_rate": 1.3010976572422054e-05, + "loss": 0.7688, + "step": 5111 + }, + { + "epoch": 0.420654186381403, + "grad_norm": 0.45404576910475064, + "learning_rate": 1.3008434850388787e-05, + "loss": 0.5249, + "step": 5112 + }, + { + "epoch": 0.420736473976548, + "grad_norm": 1.615692930852457, + "learning_rate": 1.3005892914641118e-05, + "loss": 0.7852, + "step": 5113 + }, + { + "epoch": 0.42081876157169307, + "grad_norm": 1.494073945805766, + "learning_rate": 1.3003350765359623e-05, + "loss": 0.7569, + "step": 5114 + }, + { + "epoch": 0.4209010491668381, + "grad_norm": 1.3741933567028626, + "learning_rate": 1.3000808402724897e-05, + "loss": 0.7576, + "step": 5115 + }, + { + "epoch": 0.42098333676198313, + "grad_norm": 1.520500230116891, + "learning_rate": 1.2998265826917537e-05, + "loss": 0.7685, + "step": 5116 + }, + { + "epoch": 0.4210656243571282, + "grad_norm": 0.45502657082235015, + "learning_rate": 1.299572303811817e-05, + "loss": 0.5627, + "step": 5117 + }, + { + "epoch": 0.4211479119522732, + "grad_norm": 1.6344517447182347, + "learning_rate": 1.2993180036507428e-05, + "loss": 0.7678, + "step": 5118 + }, + { + "epoch": 0.42123019954741825, + "grad_norm": 0.40474338758797934, + "learning_rate": 1.2990636822265962e-05, + "loss": 0.4988, + "step": 5119 + }, + { + "epoch": 0.42131248714256325, + "grad_norm": 1.6363273242242935, + "learning_rate": 1.298809339557444e-05, + "loss": 0.7599, + "step": 5120 + }, + { + "epoch": 0.4213947747377083, + "grad_norm": 1.6227381664530693, + "learning_rate": 1.2985549756613544e-05, + "loss": 0.7988, + "step": 5121 + }, + { + "epoch": 0.4214770623328533, + "grad_norm": 1.4711784696271182, + "learning_rate": 1.2983005905563965e-05, + "loss": 0.7726, + "step": 5122 + }, + { + "epoch": 0.4215593499279984, + "grad_norm": 1.6507706823567376, + "learning_rate": 1.2980461842606418e-05, + "loss": 0.7915, + "step": 5123 + }, + { + "epoch": 0.4216416375231434, + "grad_norm": 1.5865666571336252, + "learning_rate": 1.297791756792163e-05, + "loss": 0.7692, + "step": 5124 + }, + { + "epoch": 0.42172392511828843, + "grad_norm": 0.41573677655848396, + "learning_rate": 1.2975373081690338e-05, + "loss": 0.5165, + "step": 5125 + }, + { + "epoch": 0.42180621271343344, + "grad_norm": 1.4088308592641254, + "learning_rate": 1.2972828384093301e-05, + "loss": 0.7853, + "step": 5126 + }, + { + "epoch": 0.4218885003085785, + "grad_norm": 1.7292862646103722, + "learning_rate": 1.2970283475311292e-05, + "loss": 0.7987, + "step": 5127 + }, + { + "epoch": 0.4219707879037235, + "grad_norm": 1.4867965706928574, + "learning_rate": 1.2967738355525095e-05, + "loss": 0.7364, + "step": 5128 + }, + { + "epoch": 0.42205307549886856, + "grad_norm": 1.994351839835645, + "learning_rate": 1.2965193024915508e-05, + "loss": 0.8048, + "step": 5129 + }, + { + "epoch": 0.42213536309401356, + "grad_norm": 1.30151949613697, + "learning_rate": 1.2962647483663355e-05, + "loss": 0.7585, + "step": 5130 + }, + { + "epoch": 0.4222176506891586, + "grad_norm": 1.4195711300007476, + "learning_rate": 1.296010173194946e-05, + "loss": 0.7871, + "step": 5131 + }, + { + "epoch": 0.4222999382843036, + "grad_norm": 2.7394937898896794, + "learning_rate": 1.295755576995467e-05, + "loss": 0.7639, + "step": 5132 + }, + { + "epoch": 0.4223822258794487, + "grad_norm": 0.4229363038090469, + "learning_rate": 1.2955009597859854e-05, + "loss": 0.5216, + "step": 5133 + }, + { + "epoch": 0.4224645134745937, + "grad_norm": 1.4544481756502918, + "learning_rate": 1.2952463215845876e-05, + "loss": 0.7454, + "step": 5134 + }, + { + "epoch": 0.42254680106973874, + "grad_norm": 1.53905923803469, + "learning_rate": 1.2949916624093635e-05, + "loss": 0.7504, + "step": 5135 + }, + { + "epoch": 0.42262908866488375, + "grad_norm": 1.621202683032121, + "learning_rate": 1.2947369822784036e-05, + "loss": 0.7438, + "step": 5136 + }, + { + "epoch": 0.4227113762600288, + "grad_norm": 0.42755570426590067, + "learning_rate": 1.2944822812097999e-05, + "loss": 0.523, + "step": 5137 + }, + { + "epoch": 0.4227936638551738, + "grad_norm": 2.1413179694941182, + "learning_rate": 1.2942275592216456e-05, + "loss": 0.762, + "step": 5138 + }, + { + "epoch": 0.42287595145031887, + "grad_norm": 1.4907434942926372, + "learning_rate": 1.2939728163320364e-05, + "loss": 0.7745, + "step": 5139 + }, + { + "epoch": 0.4229582390454639, + "grad_norm": 1.7486286008690697, + "learning_rate": 1.293718052559068e-05, + "loss": 0.7306, + "step": 5140 + }, + { + "epoch": 0.4230405266406089, + "grad_norm": 0.4247915583886388, + "learning_rate": 1.2934632679208394e-05, + "loss": 0.5, + "step": 5141 + }, + { + "epoch": 0.423122814235754, + "grad_norm": 2.209998428302031, + "learning_rate": 1.2932084624354493e-05, + "loss": 0.7748, + "step": 5142 + }, + { + "epoch": 0.423205101830899, + "grad_norm": 1.3258655903251082, + "learning_rate": 1.292953636120999e-05, + "loss": 0.8055, + "step": 5143 + }, + { + "epoch": 0.42328738942604405, + "grad_norm": 1.7478923874342644, + "learning_rate": 1.292698788995591e-05, + "loss": 0.7776, + "step": 5144 + }, + { + "epoch": 0.42336967702118905, + "grad_norm": 1.4546080506610337, + "learning_rate": 1.2924439210773288e-05, + "loss": 0.7662, + "step": 5145 + }, + { + "epoch": 0.4234519646163341, + "grad_norm": 1.84600182926804, + "learning_rate": 1.2921890323843183e-05, + "loss": 0.7559, + "step": 5146 + }, + { + "epoch": 0.4235342522114791, + "grad_norm": 0.4297463743160169, + "learning_rate": 1.2919341229346665e-05, + "loss": 0.505, + "step": 5147 + }, + { + "epoch": 0.42361653980662417, + "grad_norm": 1.485311015613397, + "learning_rate": 1.291679192746481e-05, + "loss": 0.7671, + "step": 5148 + }, + { + "epoch": 0.4236988274017692, + "grad_norm": 1.585720548996577, + "learning_rate": 1.2914242418378725e-05, + "loss": 0.7806, + "step": 5149 + }, + { + "epoch": 0.42378111499691423, + "grad_norm": 1.4488866071571211, + "learning_rate": 1.2911692702269518e-05, + "loss": 0.7743, + "step": 5150 + }, + { + "epoch": 0.42386340259205924, + "grad_norm": 1.8683954481792246, + "learning_rate": 1.2909142779318318e-05, + "loss": 0.7602, + "step": 5151 + }, + { + "epoch": 0.4239456901872043, + "grad_norm": 1.8202227364242167, + "learning_rate": 1.2906592649706267e-05, + "loss": 0.7653, + "step": 5152 + }, + { + "epoch": 0.4240279777823493, + "grad_norm": 1.4653383125828967, + "learning_rate": 1.2904042313614525e-05, + "loss": 0.7742, + "step": 5153 + }, + { + "epoch": 0.42411026537749436, + "grad_norm": 1.7504887239682128, + "learning_rate": 1.2901491771224258e-05, + "loss": 0.7292, + "step": 5154 + }, + { + "epoch": 0.42419255297263936, + "grad_norm": 1.7376172834546189, + "learning_rate": 1.2898941022716656e-05, + "loss": 0.7895, + "step": 5155 + }, + { + "epoch": 0.4242748405677844, + "grad_norm": 1.542980498781137, + "learning_rate": 1.2896390068272924e-05, + "loss": 0.7855, + "step": 5156 + }, + { + "epoch": 0.4243571281629294, + "grad_norm": 1.8639200023609965, + "learning_rate": 1.2893838908074267e-05, + "loss": 0.7529, + "step": 5157 + }, + { + "epoch": 0.4244394157580745, + "grad_norm": 0.4163223823649685, + "learning_rate": 1.2891287542301927e-05, + "loss": 0.5091, + "step": 5158 + }, + { + "epoch": 0.4245217033532195, + "grad_norm": 1.5905907936024637, + "learning_rate": 1.2888735971137143e-05, + "loss": 0.7487, + "step": 5159 + }, + { + "epoch": 0.42460399094836454, + "grad_norm": 1.4107196558407695, + "learning_rate": 1.2886184194761176e-05, + "loss": 0.7989, + "step": 5160 + }, + { + "epoch": 0.42468627854350954, + "grad_norm": 1.546701104670946, + "learning_rate": 1.2883632213355298e-05, + "loss": 0.8068, + "step": 5161 + }, + { + "epoch": 0.4247685661386546, + "grad_norm": 1.83483448142194, + "learning_rate": 1.28810800271008e-05, + "loss": 0.7815, + "step": 5162 + }, + { + "epoch": 0.4248508537337996, + "grad_norm": 0.4066156390924842, + "learning_rate": 1.2878527636178983e-05, + "loss": 0.509, + "step": 5163 + }, + { + "epoch": 0.42493314132894466, + "grad_norm": 1.5114965315864686, + "learning_rate": 1.2875975040771169e-05, + "loss": 0.7364, + "step": 5164 + }, + { + "epoch": 0.42501542892408967, + "grad_norm": 1.4783716263839073, + "learning_rate": 1.2873422241058687e-05, + "loss": 0.7785, + "step": 5165 + }, + { + "epoch": 0.4250977165192347, + "grad_norm": 1.5953693438101555, + "learning_rate": 1.2870869237222882e-05, + "loss": 0.7729, + "step": 5166 + }, + { + "epoch": 0.4251800041143798, + "grad_norm": 1.3070634760450786, + "learning_rate": 1.2868316029445119e-05, + "loss": 0.7457, + "step": 5167 + }, + { + "epoch": 0.4252622917095248, + "grad_norm": 1.6716615553887653, + "learning_rate": 1.2865762617906773e-05, + "loss": 0.7949, + "step": 5168 + }, + { + "epoch": 0.42534457930466985, + "grad_norm": 1.6424393407088949, + "learning_rate": 1.2863209002789234e-05, + "loss": 0.7665, + "step": 5169 + }, + { + "epoch": 0.42542686689981485, + "grad_norm": 1.8368517694360709, + "learning_rate": 1.2860655184273905e-05, + "loss": 0.7589, + "step": 5170 + }, + { + "epoch": 0.4255091544949599, + "grad_norm": 1.6910739427353059, + "learning_rate": 1.285810116254221e-05, + "loss": 0.7609, + "step": 5171 + }, + { + "epoch": 0.4255914420901049, + "grad_norm": 0.42424881142350507, + "learning_rate": 1.2855546937775577e-05, + "loss": 0.4942, + "step": 5172 + }, + { + "epoch": 0.42567372968524997, + "grad_norm": 1.542781392981818, + "learning_rate": 1.2852992510155452e-05, + "loss": 0.764, + "step": 5173 + }, + { + "epoch": 0.42575601728039497, + "grad_norm": 1.5985163120875747, + "learning_rate": 1.2850437879863308e-05, + "loss": 0.7664, + "step": 5174 + }, + { + "epoch": 0.42583830487554003, + "grad_norm": 1.7067798563255017, + "learning_rate": 1.2847883047080613e-05, + "loss": 0.745, + "step": 5175 + }, + { + "epoch": 0.42592059247068503, + "grad_norm": 1.5425430052742577, + "learning_rate": 1.2845328011988864e-05, + "loss": 0.7623, + "step": 5176 + }, + { + "epoch": 0.4260028800658301, + "grad_norm": 0.4139689834537053, + "learning_rate": 1.2842772774769561e-05, + "loss": 0.5228, + "step": 5177 + }, + { + "epoch": 0.4260851676609751, + "grad_norm": 1.5853904225299937, + "learning_rate": 1.2840217335604224e-05, + "loss": 0.7767, + "step": 5178 + }, + { + "epoch": 0.42616745525612015, + "grad_norm": 1.8292083238766386, + "learning_rate": 1.2837661694674392e-05, + "loss": 0.8075, + "step": 5179 + }, + { + "epoch": 0.42624974285126516, + "grad_norm": 1.732967652107009, + "learning_rate": 1.2835105852161612e-05, + "loss": 0.765, + "step": 5180 + }, + { + "epoch": 0.4263320304464102, + "grad_norm": 1.5638738883106125, + "learning_rate": 1.2832549808247446e-05, + "loss": 0.7762, + "step": 5181 + }, + { + "epoch": 0.4264143180415552, + "grad_norm": 1.7252466059553242, + "learning_rate": 1.2829993563113472e-05, + "loss": 0.7734, + "step": 5182 + }, + { + "epoch": 0.4264966056367003, + "grad_norm": 2.0328498898223897, + "learning_rate": 1.2827437116941279e-05, + "loss": 0.7684, + "step": 5183 + }, + { + "epoch": 0.4265788932318453, + "grad_norm": 2.6630486251179266, + "learning_rate": 1.2824880469912478e-05, + "loss": 0.7799, + "step": 5184 + }, + { + "epoch": 0.42666118082699034, + "grad_norm": 1.6705067759936936, + "learning_rate": 1.2822323622208686e-05, + "loss": 0.7793, + "step": 5185 + }, + { + "epoch": 0.42674346842213534, + "grad_norm": 0.43772564058816205, + "learning_rate": 1.2819766574011534e-05, + "loss": 0.5441, + "step": 5186 + }, + { + "epoch": 0.4268257560172804, + "grad_norm": 1.6539620647164595, + "learning_rate": 1.2817209325502676e-05, + "loss": 0.7851, + "step": 5187 + }, + { + "epoch": 0.4269080436124254, + "grad_norm": 3.8335157315830424, + "learning_rate": 1.2814651876863774e-05, + "loss": 0.7714, + "step": 5188 + }, + { + "epoch": 0.42699033120757046, + "grad_norm": 0.4337833066363883, + "learning_rate": 1.2812094228276503e-05, + "loss": 0.5143, + "step": 5189 + }, + { + "epoch": 0.42707261880271546, + "grad_norm": 1.5169173676124459, + "learning_rate": 1.2809536379922556e-05, + "loss": 0.7602, + "step": 5190 + }, + { + "epoch": 0.4271549063978605, + "grad_norm": 2.1162238775763718, + "learning_rate": 1.2806978331983638e-05, + "loss": 0.7922, + "step": 5191 + }, + { + "epoch": 0.4272371939930056, + "grad_norm": 2.112881374708722, + "learning_rate": 1.2804420084641467e-05, + "loss": 0.7728, + "step": 5192 + }, + { + "epoch": 0.4273194815881506, + "grad_norm": 1.9133292233067944, + "learning_rate": 1.2801861638077776e-05, + "loss": 0.7472, + "step": 5193 + }, + { + "epoch": 0.42740176918329564, + "grad_norm": 1.7602646746941304, + "learning_rate": 1.279930299247432e-05, + "loss": 0.7685, + "step": 5194 + }, + { + "epoch": 0.42748405677844065, + "grad_norm": 0.42218831713367605, + "learning_rate": 1.2796744148012856e-05, + "loss": 0.5075, + "step": 5195 + }, + { + "epoch": 0.4275663443735857, + "grad_norm": 0.4466521912640819, + "learning_rate": 1.2794185104875157e-05, + "loss": 0.5322, + "step": 5196 + }, + { + "epoch": 0.4276486319687307, + "grad_norm": 1.7542867820590973, + "learning_rate": 1.279162586324302e-05, + "loss": 0.7365, + "step": 5197 + }, + { + "epoch": 0.42773091956387577, + "grad_norm": 1.6306734215871295, + "learning_rate": 1.2789066423298244e-05, + "loss": 0.7618, + "step": 5198 + }, + { + "epoch": 0.42781320715902077, + "grad_norm": 2.678747864792711, + "learning_rate": 1.278650678522265e-05, + "loss": 0.7711, + "step": 5199 + }, + { + "epoch": 0.42789549475416583, + "grad_norm": 1.6711488419018756, + "learning_rate": 1.2783946949198074e-05, + "loss": 0.7532, + "step": 5200 + }, + { + "epoch": 0.42797778234931083, + "grad_norm": 1.5831251667366022, + "learning_rate": 1.2781386915406352e-05, + "loss": 0.729, + "step": 5201 + }, + { + "epoch": 0.4280600699444559, + "grad_norm": 1.5047987654305965, + "learning_rate": 1.2778826684029357e-05, + "loss": 0.7529, + "step": 5202 + }, + { + "epoch": 0.4281423575396009, + "grad_norm": 0.4208506790931328, + "learning_rate": 1.2776266255248959e-05, + "loss": 0.505, + "step": 5203 + }, + { + "epoch": 0.42822464513474595, + "grad_norm": 1.617049488148483, + "learning_rate": 1.2773705629247044e-05, + "loss": 0.791, + "step": 5204 + }, + { + "epoch": 0.42830693272989095, + "grad_norm": 1.57958248644709, + "learning_rate": 1.2771144806205516e-05, + "loss": 0.7624, + "step": 5205 + }, + { + "epoch": 0.428389220325036, + "grad_norm": 0.4209184596408624, + "learning_rate": 1.2768583786306294e-05, + "loss": 0.5249, + "step": 5206 + }, + { + "epoch": 0.428471507920181, + "grad_norm": 1.8086111528023487, + "learning_rate": 1.2766022569731309e-05, + "loss": 0.8134, + "step": 5207 + }, + { + "epoch": 0.4285537955153261, + "grad_norm": 1.613496536803672, + "learning_rate": 1.2763461156662502e-05, + "loss": 0.7732, + "step": 5208 + }, + { + "epoch": 0.4286360831104711, + "grad_norm": 1.4781590789807222, + "learning_rate": 1.2760899547281835e-05, + "loss": 0.7132, + "step": 5209 + }, + { + "epoch": 0.42871837070561614, + "grad_norm": 0.42721261936497057, + "learning_rate": 1.2758337741771278e-05, + "loss": 0.5292, + "step": 5210 + }, + { + "epoch": 0.42880065830076114, + "grad_norm": 2.055260707251137, + "learning_rate": 1.2755775740312824e-05, + "loss": 0.7853, + "step": 5211 + }, + { + "epoch": 0.4288829458959062, + "grad_norm": 1.7500822073190063, + "learning_rate": 1.2753213543088463e-05, + "loss": 0.7636, + "step": 5212 + }, + { + "epoch": 0.4289652334910512, + "grad_norm": 1.6934093063571298, + "learning_rate": 1.2750651150280215e-05, + "loss": 0.7441, + "step": 5213 + }, + { + "epoch": 0.42904752108619626, + "grad_norm": 1.5186965833201034, + "learning_rate": 1.2748088562070113e-05, + "loss": 0.7525, + "step": 5214 + }, + { + "epoch": 0.42912980868134126, + "grad_norm": 1.5854573671368248, + "learning_rate": 1.274552577864019e-05, + "loss": 0.7545, + "step": 5215 + }, + { + "epoch": 0.4292120962764863, + "grad_norm": 1.648722813856504, + "learning_rate": 1.2742962800172509e-05, + "loss": 0.7487, + "step": 5216 + }, + { + "epoch": 0.4292943838716313, + "grad_norm": 1.6835477352791386, + "learning_rate": 1.2740399626849138e-05, + "loss": 0.7481, + "step": 5217 + }, + { + "epoch": 0.4293766714667764, + "grad_norm": 2.4324912313819316, + "learning_rate": 1.273783625885216e-05, + "loss": 0.7502, + "step": 5218 + }, + { + "epoch": 0.42945895906192144, + "grad_norm": 1.6760375089536597, + "learning_rate": 1.273527269636367e-05, + "loss": 0.7807, + "step": 5219 + }, + { + "epoch": 0.42954124665706644, + "grad_norm": 2.0643567699659573, + "learning_rate": 1.2732708939565784e-05, + "loss": 0.7767, + "step": 5220 + }, + { + "epoch": 0.4296235342522115, + "grad_norm": 1.7043177195312391, + "learning_rate": 1.2730144988640624e-05, + "loss": 0.7563, + "step": 5221 + }, + { + "epoch": 0.4297058218473565, + "grad_norm": 1.9617857196971888, + "learning_rate": 1.272758084377033e-05, + "loss": 0.7797, + "step": 5222 + }, + { + "epoch": 0.42978810944250156, + "grad_norm": 1.4979702458113164, + "learning_rate": 1.2725016505137058e-05, + "loss": 0.7546, + "step": 5223 + }, + { + "epoch": 0.42987039703764657, + "grad_norm": 1.597686090641279, + "learning_rate": 1.2722451972922968e-05, + "loss": 0.8006, + "step": 5224 + }, + { + "epoch": 0.4299526846327916, + "grad_norm": 1.4693354724995469, + "learning_rate": 1.2719887247310245e-05, + "loss": 0.7377, + "step": 5225 + }, + { + "epoch": 0.43003497222793663, + "grad_norm": 1.6900623503295762, + "learning_rate": 1.2717322328481086e-05, + "loss": 0.7404, + "step": 5226 + }, + { + "epoch": 0.4301172598230817, + "grad_norm": 1.5072638517663324, + "learning_rate": 1.2714757216617688e-05, + "loss": 0.7476, + "step": 5227 + }, + { + "epoch": 0.4301995474182267, + "grad_norm": 1.5701840099761109, + "learning_rate": 1.2712191911902282e-05, + "loss": 0.7793, + "step": 5228 + }, + { + "epoch": 0.43028183501337175, + "grad_norm": 1.3519019479681234, + "learning_rate": 1.27096264145171e-05, + "loss": 0.782, + "step": 5229 + }, + { + "epoch": 0.43036412260851675, + "grad_norm": 1.8908851159431082, + "learning_rate": 1.270706072464439e-05, + "loss": 0.7622, + "step": 5230 + }, + { + "epoch": 0.4304464102036618, + "grad_norm": 2.8377656340603648, + "learning_rate": 1.2704494842466415e-05, + "loss": 0.7557, + "step": 5231 + }, + { + "epoch": 0.4305286977988068, + "grad_norm": 1.4954029268409403, + "learning_rate": 1.2701928768165455e-05, + "loss": 0.7376, + "step": 5232 + }, + { + "epoch": 0.4306109853939519, + "grad_norm": 2.0028517362033216, + "learning_rate": 1.2699362501923793e-05, + "loss": 0.7882, + "step": 5233 + }, + { + "epoch": 0.4306932729890969, + "grad_norm": 1.8718657416089302, + "learning_rate": 1.2696796043923736e-05, + "loss": 0.7758, + "step": 5234 + }, + { + "epoch": 0.43077556058424193, + "grad_norm": 1.8143501257470138, + "learning_rate": 1.2694229394347604e-05, + "loss": 0.7588, + "step": 5235 + }, + { + "epoch": 0.43085784817938694, + "grad_norm": 1.7476886285249085, + "learning_rate": 1.269166255337772e-05, + "loss": 0.749, + "step": 5236 + }, + { + "epoch": 0.430940135774532, + "grad_norm": 1.5393365930420047, + "learning_rate": 1.2689095521196435e-05, + "loss": 0.7417, + "step": 5237 + }, + { + "epoch": 0.431022423369677, + "grad_norm": 1.69595062587202, + "learning_rate": 1.2686528297986107e-05, + "loss": 0.7753, + "step": 5238 + }, + { + "epoch": 0.43110471096482206, + "grad_norm": 1.662891651131129, + "learning_rate": 1.2683960883929103e-05, + "loss": 0.7449, + "step": 5239 + }, + { + "epoch": 0.43118699855996706, + "grad_norm": 1.8785072624099672, + "learning_rate": 1.268139327920781e-05, + "loss": 0.7806, + "step": 5240 + }, + { + "epoch": 0.4312692861551121, + "grad_norm": 1.3354013215459486, + "learning_rate": 1.2678825484004626e-05, + "loss": 0.739, + "step": 5241 + }, + { + "epoch": 0.4313515737502571, + "grad_norm": 1.6725730267123509, + "learning_rate": 1.2676257498501964e-05, + "loss": 0.7539, + "step": 5242 + }, + { + "epoch": 0.4314338613454022, + "grad_norm": 1.3828265909164317, + "learning_rate": 1.2673689322882253e-05, + "loss": 0.7521, + "step": 5243 + }, + { + "epoch": 0.43151614894054724, + "grad_norm": 0.4136211915510149, + "learning_rate": 1.2671120957327926e-05, + "loss": 0.523, + "step": 5244 + }, + { + "epoch": 0.43159843653569224, + "grad_norm": 1.9788673027611474, + "learning_rate": 1.2668552402021437e-05, + "loss": 0.7729, + "step": 5245 + }, + { + "epoch": 0.4316807241308373, + "grad_norm": 0.4186920852350189, + "learning_rate": 1.2665983657145252e-05, + "loss": 0.4897, + "step": 5246 + }, + { + "epoch": 0.4317630117259823, + "grad_norm": 0.43693709539369247, + "learning_rate": 1.2663414722881856e-05, + "loss": 0.5337, + "step": 5247 + }, + { + "epoch": 0.43184529932112736, + "grad_norm": 2.445832487857032, + "learning_rate": 1.2660845599413731e-05, + "loss": 0.7638, + "step": 5248 + }, + { + "epoch": 0.43192758691627237, + "grad_norm": 1.5578719498657498, + "learning_rate": 1.2658276286923397e-05, + "loss": 0.732, + "step": 5249 + }, + { + "epoch": 0.4320098745114174, + "grad_norm": 2.2760130547989124, + "learning_rate": 1.2655706785593363e-05, + "loss": 0.7426, + "step": 5250 + }, + { + "epoch": 0.4320921621065624, + "grad_norm": 1.795957145593185, + "learning_rate": 1.2653137095606164e-05, + "loss": 0.7661, + "step": 5251 + }, + { + "epoch": 0.4321744497017075, + "grad_norm": 0.4468414753270686, + "learning_rate": 1.2650567217144357e-05, + "loss": 0.5181, + "step": 5252 + }, + { + "epoch": 0.4322567372968525, + "grad_norm": 1.6552672727143638, + "learning_rate": 1.2647997150390485e-05, + "loss": 0.7504, + "step": 5253 + }, + { + "epoch": 0.43233902489199755, + "grad_norm": 1.6616395977450535, + "learning_rate": 1.2645426895527134e-05, + "loss": 0.7587, + "step": 5254 + }, + { + "epoch": 0.43242131248714255, + "grad_norm": 1.6393046791188197, + "learning_rate": 1.264285645273689e-05, + "loss": 0.7481, + "step": 5255 + }, + { + "epoch": 0.4325036000822876, + "grad_norm": 1.704760629011104, + "learning_rate": 1.2640285822202345e-05, + "loss": 0.7483, + "step": 5256 + }, + { + "epoch": 0.4325858876774326, + "grad_norm": 1.6870999314384783, + "learning_rate": 1.263771500410612e-05, + "loss": 0.7718, + "step": 5257 + }, + { + "epoch": 0.43266817527257767, + "grad_norm": 2.4208978149585447, + "learning_rate": 1.2635143998630838e-05, + "loss": 0.7757, + "step": 5258 + }, + { + "epoch": 0.4327504628677227, + "grad_norm": 1.7568077366893644, + "learning_rate": 1.2632572805959144e-05, + "loss": 0.7789, + "step": 5259 + }, + { + "epoch": 0.43283275046286773, + "grad_norm": 2.862954397670126, + "learning_rate": 1.2630001426273687e-05, + "loss": 0.7751, + "step": 5260 + }, + { + "epoch": 0.43291503805801274, + "grad_norm": 1.8650184240658945, + "learning_rate": 1.2627429859757132e-05, + "loss": 0.7544, + "step": 5261 + }, + { + "epoch": 0.4329973256531578, + "grad_norm": 1.7436005136553854, + "learning_rate": 1.2624858106592164e-05, + "loss": 0.7843, + "step": 5262 + }, + { + "epoch": 0.4330796132483028, + "grad_norm": 1.7701000277899346, + "learning_rate": 1.2622286166961472e-05, + "loss": 0.7213, + "step": 5263 + }, + { + "epoch": 0.43316190084344786, + "grad_norm": 2.0664945522557017, + "learning_rate": 1.2619714041047768e-05, + "loss": 0.7524, + "step": 5264 + }, + { + "epoch": 0.43324418843859286, + "grad_norm": 1.9560198067482983, + "learning_rate": 1.2617141729033767e-05, + "loss": 0.7725, + "step": 5265 + }, + { + "epoch": 0.4333264760337379, + "grad_norm": 1.7929322193879602, + "learning_rate": 1.2614569231102199e-05, + "loss": 0.7577, + "step": 5266 + }, + { + "epoch": 0.4334087636288829, + "grad_norm": 1.7215643940632464, + "learning_rate": 1.2611996547435818e-05, + "loss": 0.7932, + "step": 5267 + }, + { + "epoch": 0.433491051224028, + "grad_norm": 2.0015531669910835, + "learning_rate": 1.2609423678217378e-05, + "loss": 0.7338, + "step": 5268 + }, + { + "epoch": 0.433573338819173, + "grad_norm": 1.7649224587361327, + "learning_rate": 1.2606850623629653e-05, + "loss": 0.7816, + "step": 5269 + }, + { + "epoch": 0.43365562641431804, + "grad_norm": 1.8395471743690264, + "learning_rate": 1.2604277383855433e-05, + "loss": 0.7909, + "step": 5270 + }, + { + "epoch": 0.4337379140094631, + "grad_norm": 2.049587397046376, + "learning_rate": 1.2601703959077507e-05, + "loss": 0.7589, + "step": 5271 + }, + { + "epoch": 0.4338202016046081, + "grad_norm": 1.857465496624235, + "learning_rate": 1.2599130349478694e-05, + "loss": 0.7591, + "step": 5272 + }, + { + "epoch": 0.43390248919975316, + "grad_norm": 2.1612367880988073, + "learning_rate": 1.2596556555241822e-05, + "loss": 0.7945, + "step": 5273 + }, + { + "epoch": 0.43398477679489816, + "grad_norm": 2.0058565517692255, + "learning_rate": 1.2593982576549719e-05, + "loss": 0.7773, + "step": 5274 + }, + { + "epoch": 0.4340670643900432, + "grad_norm": 1.9822137003161668, + "learning_rate": 1.2591408413585244e-05, + "loss": 0.7545, + "step": 5275 + }, + { + "epoch": 0.4341493519851882, + "grad_norm": 1.6976634063282594, + "learning_rate": 1.2588834066531262e-05, + "loss": 0.7533, + "step": 5276 + }, + { + "epoch": 0.4342316395803333, + "grad_norm": 2.0508009827117677, + "learning_rate": 1.2586259535570645e-05, + "loss": 0.7603, + "step": 5277 + }, + { + "epoch": 0.4343139271754783, + "grad_norm": 0.4294782181912124, + "learning_rate": 1.2583684820886291e-05, + "loss": 0.5199, + "step": 5278 + }, + { + "epoch": 0.43439621477062335, + "grad_norm": 1.6713645757382147, + "learning_rate": 1.2581109922661094e-05, + "loss": 0.7519, + "step": 5279 + }, + { + "epoch": 0.43447850236576835, + "grad_norm": 2.725736963220771, + "learning_rate": 1.2578534841077978e-05, + "loss": 0.7579, + "step": 5280 + }, + { + "epoch": 0.4345607899609134, + "grad_norm": 2.1600369580656116, + "learning_rate": 1.2575959576319871e-05, + "loss": 0.7866, + "step": 5281 + }, + { + "epoch": 0.4346430775560584, + "grad_norm": 0.410570714312826, + "learning_rate": 1.2573384128569717e-05, + "loss": 0.5059, + "step": 5282 + }, + { + "epoch": 0.43472536515120347, + "grad_norm": 2.3489768568601948, + "learning_rate": 1.2570808498010467e-05, + "loss": 0.7723, + "step": 5283 + }, + { + "epoch": 0.43480765274634847, + "grad_norm": 1.7276019751035514, + "learning_rate": 1.2568232684825095e-05, + "loss": 0.7767, + "step": 5284 + }, + { + "epoch": 0.43488994034149353, + "grad_norm": 2.2192964633873284, + "learning_rate": 1.2565656689196577e-05, + "loss": 0.7581, + "step": 5285 + }, + { + "epoch": 0.43497222793663853, + "grad_norm": 2.3167112667717875, + "learning_rate": 1.2563080511307915e-05, + "loss": 0.7683, + "step": 5286 + }, + { + "epoch": 0.4350545155317836, + "grad_norm": 2.398383637703362, + "learning_rate": 1.2560504151342113e-05, + "loss": 0.7627, + "step": 5287 + }, + { + "epoch": 0.4351368031269286, + "grad_norm": 0.4438705318130196, + "learning_rate": 1.2557927609482186e-05, + "loss": 0.5174, + "step": 5288 + }, + { + "epoch": 0.43521909072207365, + "grad_norm": 2.0142380472510872, + "learning_rate": 1.2555350885911175e-05, + "loss": 0.7645, + "step": 5289 + }, + { + "epoch": 0.43530137831721866, + "grad_norm": 0.41073606097817394, + "learning_rate": 1.2552773980812125e-05, + "loss": 0.5108, + "step": 5290 + }, + { + "epoch": 0.4353836659123637, + "grad_norm": 2.4966908634115814, + "learning_rate": 1.2550196894368092e-05, + "loss": 0.7329, + "step": 5291 + }, + { + "epoch": 0.4354659535075087, + "grad_norm": 1.886512929815993, + "learning_rate": 1.2547619626762152e-05, + "loss": 0.771, + "step": 5292 + }, + { + "epoch": 0.4355482411026538, + "grad_norm": 2.021859623025128, + "learning_rate": 1.2545042178177384e-05, + "loss": 0.7819, + "step": 5293 + }, + { + "epoch": 0.4356305286977988, + "grad_norm": 3.222487992257021, + "learning_rate": 1.2542464548796894e-05, + "loss": 0.7828, + "step": 5294 + }, + { + "epoch": 0.43571281629294384, + "grad_norm": 0.4361777035332118, + "learning_rate": 1.2539886738803788e-05, + "loss": 0.5104, + "step": 5295 + }, + { + "epoch": 0.4357951038880889, + "grad_norm": 0.40541058292318344, + "learning_rate": 1.253730874838119e-05, + "loss": 0.5034, + "step": 5296 + }, + { + "epoch": 0.4358773914832339, + "grad_norm": 2.056836431104409, + "learning_rate": 1.2534730577712236e-05, + "loss": 0.7877, + "step": 5297 + }, + { + "epoch": 0.43595967907837896, + "grad_norm": 2.442567216146298, + "learning_rate": 1.2532152226980075e-05, + "loss": 0.7723, + "step": 5298 + }, + { + "epoch": 0.43604196667352396, + "grad_norm": 0.42449158451039903, + "learning_rate": 1.2529573696367869e-05, + "loss": 0.53, + "step": 5299 + }, + { + "epoch": 0.436124254268669, + "grad_norm": 1.9160053830771828, + "learning_rate": 1.2526994986058793e-05, + "loss": 0.7764, + "step": 5300 + }, + { + "epoch": 0.436206541863814, + "grad_norm": 1.8538069284155732, + "learning_rate": 1.2524416096236037e-05, + "loss": 0.7705, + "step": 5301 + }, + { + "epoch": 0.4362888294589591, + "grad_norm": 0.42891331240796776, + "learning_rate": 1.2521837027082796e-05, + "loss": 0.5223, + "step": 5302 + }, + { + "epoch": 0.4363711170541041, + "grad_norm": 2.7360104341572455, + "learning_rate": 1.2519257778782286e-05, + "loss": 0.791, + "step": 5303 + }, + { + "epoch": 0.43645340464924914, + "grad_norm": 1.7804489169269153, + "learning_rate": 1.2516678351517734e-05, + "loss": 0.7546, + "step": 5304 + }, + { + "epoch": 0.43653569224439415, + "grad_norm": 2.462759440225803, + "learning_rate": 1.2514098745472376e-05, + "loss": 0.7585, + "step": 5305 + }, + { + "epoch": 0.4366179798395392, + "grad_norm": 2.140182679141451, + "learning_rate": 1.2511518960829464e-05, + "loss": 0.7596, + "step": 5306 + }, + { + "epoch": 0.4367002674346842, + "grad_norm": 2.5050931607699263, + "learning_rate": 1.2508938997772262e-05, + "loss": 0.7902, + "step": 5307 + }, + { + "epoch": 0.43678255502982927, + "grad_norm": 2.4586627802952585, + "learning_rate": 1.2506358856484044e-05, + "loss": 0.7629, + "step": 5308 + }, + { + "epoch": 0.43686484262497427, + "grad_norm": 3.445279865052278, + "learning_rate": 1.2503778537148102e-05, + "loss": 0.7457, + "step": 5309 + }, + { + "epoch": 0.43694713022011933, + "grad_norm": 2.0666239853593726, + "learning_rate": 1.2501198039947737e-05, + "loss": 0.7819, + "step": 5310 + }, + { + "epoch": 0.43702941781526433, + "grad_norm": 3.5737846167866976, + "learning_rate": 1.2498617365066263e-05, + "loss": 0.7688, + "step": 5311 + }, + { + "epoch": 0.4371117054104094, + "grad_norm": 2.6380339843186613, + "learning_rate": 1.2496036512687009e-05, + "loss": 0.787, + "step": 5312 + }, + { + "epoch": 0.4371939930055544, + "grad_norm": 2.007803482360068, + "learning_rate": 1.2493455482993313e-05, + "loss": 0.7763, + "step": 5313 + }, + { + "epoch": 0.43727628060069945, + "grad_norm": 2.0325609745868123, + "learning_rate": 1.2490874276168526e-05, + "loss": 0.7557, + "step": 5314 + }, + { + "epoch": 0.43735856819584445, + "grad_norm": 1.7484740156894214, + "learning_rate": 1.2488292892396012e-05, + "loss": 0.7622, + "step": 5315 + }, + { + "epoch": 0.4374408557909895, + "grad_norm": 1.910203991004595, + "learning_rate": 1.2485711331859153e-05, + "loss": 0.7625, + "step": 5316 + }, + { + "epoch": 0.4375231433861345, + "grad_norm": 2.235918119636264, + "learning_rate": 1.2483129594741336e-05, + "loss": 0.7751, + "step": 5317 + }, + { + "epoch": 0.4376054309812796, + "grad_norm": 2.019696515674241, + "learning_rate": 1.2480547681225964e-05, + "loss": 0.7791, + "step": 5318 + }, + { + "epoch": 0.4376877185764246, + "grad_norm": 0.42690166020533754, + "learning_rate": 1.247796559149645e-05, + "loss": 0.515, + "step": 5319 + }, + { + "epoch": 0.43777000617156964, + "grad_norm": 0.4370903446951065, + "learning_rate": 1.2475383325736226e-05, + "loss": 0.5229, + "step": 5320 + }, + { + "epoch": 0.4378522937667147, + "grad_norm": 0.4046398591910913, + "learning_rate": 1.2472800884128727e-05, + "loss": 0.5223, + "step": 5321 + }, + { + "epoch": 0.4379345813618597, + "grad_norm": 2.2169097646114904, + "learning_rate": 1.247021826685741e-05, + "loss": 0.7494, + "step": 5322 + }, + { + "epoch": 0.43801686895700476, + "grad_norm": 0.4257892876059985, + "learning_rate": 1.2467635474105735e-05, + "loss": 0.5295, + "step": 5323 + }, + { + "epoch": 0.43809915655214976, + "grad_norm": 1.9610076128919676, + "learning_rate": 1.2465052506057187e-05, + "loss": 0.7622, + "step": 5324 + }, + { + "epoch": 0.4381814441472948, + "grad_norm": 2.0732986181921365, + "learning_rate": 1.2462469362895247e-05, + "loss": 0.7366, + "step": 5325 + }, + { + "epoch": 0.4382637317424398, + "grad_norm": 5.795480253794371, + "learning_rate": 1.2459886044803422e-05, + "loss": 0.7731, + "step": 5326 + }, + { + "epoch": 0.4383460193375849, + "grad_norm": 2.2910993404139464, + "learning_rate": 1.245730255196523e-05, + "loss": 0.7822, + "step": 5327 + }, + { + "epoch": 0.4384283069327299, + "grad_norm": 2.286356114315915, + "learning_rate": 1.245471888456419e-05, + "loss": 0.7892, + "step": 5328 + }, + { + "epoch": 0.43851059452787494, + "grad_norm": 2.99716688788888, + "learning_rate": 1.2452135042783846e-05, + "loss": 0.7549, + "step": 5329 + }, + { + "epoch": 0.43859288212301994, + "grad_norm": 1.8286039865100288, + "learning_rate": 1.2449551026807754e-05, + "loss": 0.7712, + "step": 5330 + }, + { + "epoch": 0.438675169718165, + "grad_norm": 0.458354439256516, + "learning_rate": 1.2446966836819471e-05, + "loss": 0.5112, + "step": 5331 + }, + { + "epoch": 0.43875745731331, + "grad_norm": 1.8600664715159574, + "learning_rate": 1.244438247300258e-05, + "loss": 0.7518, + "step": 5332 + }, + { + "epoch": 0.43883974490845506, + "grad_norm": 1.844266365241888, + "learning_rate": 1.2441797935540667e-05, + "loss": 0.749, + "step": 5333 + }, + { + "epoch": 0.43892203250360007, + "grad_norm": 2.0966310271468935, + "learning_rate": 1.2439213224617332e-05, + "loss": 0.7646, + "step": 5334 + }, + { + "epoch": 0.4390043200987451, + "grad_norm": 6.031611142862644, + "learning_rate": 1.2436628340416191e-05, + "loss": 0.7362, + "step": 5335 + }, + { + "epoch": 0.43908660769389013, + "grad_norm": 2.06479090085676, + "learning_rate": 1.2434043283120872e-05, + "loss": 0.7561, + "step": 5336 + }, + { + "epoch": 0.4391688952890352, + "grad_norm": 1.8134173165218581, + "learning_rate": 1.2431458052915007e-05, + "loss": 0.7777, + "step": 5337 + }, + { + "epoch": 0.4392511828841802, + "grad_norm": 2.1913842128031593, + "learning_rate": 1.242887264998225e-05, + "loss": 0.7508, + "step": 5338 + }, + { + "epoch": 0.43933347047932525, + "grad_norm": 2.4090061695108895, + "learning_rate": 1.242628707450627e-05, + "loss": 0.7909, + "step": 5339 + }, + { + "epoch": 0.43941575807447025, + "grad_norm": 0.4300327389730397, + "learning_rate": 1.2423701326670732e-05, + "loss": 0.4803, + "step": 5340 + }, + { + "epoch": 0.4394980456696153, + "grad_norm": 2.305487346601941, + "learning_rate": 1.2421115406659327e-05, + "loss": 0.7563, + "step": 5341 + }, + { + "epoch": 0.4395803332647603, + "grad_norm": 0.4322662308230838, + "learning_rate": 1.2418529314655761e-05, + "loss": 0.5135, + "step": 5342 + }, + { + "epoch": 0.4396626208599054, + "grad_norm": 2.719925782910544, + "learning_rate": 1.2415943050843736e-05, + "loss": 0.7552, + "step": 5343 + }, + { + "epoch": 0.4397449084550504, + "grad_norm": 2.8650835213625263, + "learning_rate": 1.241335661540698e-05, + "loss": 0.7751, + "step": 5344 + }, + { + "epoch": 0.43982719605019543, + "grad_norm": 1.9816765210874252, + "learning_rate": 1.2410770008529232e-05, + "loss": 0.7359, + "step": 5345 + }, + { + "epoch": 0.43990948364534044, + "grad_norm": 1.8510312478100834, + "learning_rate": 1.2408183230394237e-05, + "loss": 0.7508, + "step": 5346 + }, + { + "epoch": 0.4399917712404855, + "grad_norm": 2.5597926288178066, + "learning_rate": 1.2405596281185756e-05, + "loss": 0.7671, + "step": 5347 + }, + { + "epoch": 0.44007405883563055, + "grad_norm": 2.1965145125445025, + "learning_rate": 1.2403009161087565e-05, + "loss": 0.7652, + "step": 5348 + }, + { + "epoch": 0.44015634643077556, + "grad_norm": 2.07687389197985, + "learning_rate": 1.2400421870283444e-05, + "loss": 0.7808, + "step": 5349 + }, + { + "epoch": 0.4402386340259206, + "grad_norm": 2.1121362958085683, + "learning_rate": 1.239783440895719e-05, + "loss": 0.7614, + "step": 5350 + }, + { + "epoch": 0.4403209216210656, + "grad_norm": 3.0710316798327484, + "learning_rate": 1.2395246777292617e-05, + "loss": 0.7941, + "step": 5351 + }, + { + "epoch": 0.4404032092162107, + "grad_norm": 0.4449725057174452, + "learning_rate": 1.2392658975473543e-05, + "loss": 0.5402, + "step": 5352 + }, + { + "epoch": 0.4404854968113557, + "grad_norm": 0.4184244488348674, + "learning_rate": 1.2390071003683803e-05, + "loss": 0.4885, + "step": 5353 + }, + { + "epoch": 0.44056778440650074, + "grad_norm": 1.8483292292478641, + "learning_rate": 1.238748286210724e-05, + "loss": 0.7663, + "step": 5354 + }, + { + "epoch": 0.44065007200164574, + "grad_norm": 2.434257803349876, + "learning_rate": 1.2384894550927713e-05, + "loss": 0.7637, + "step": 5355 + }, + { + "epoch": 0.4407323595967908, + "grad_norm": 2.11152106306381, + "learning_rate": 1.2382306070329091e-05, + "loss": 0.7293, + "step": 5356 + }, + { + "epoch": 0.4408146471919358, + "grad_norm": 0.41012729938361886, + "learning_rate": 1.2379717420495259e-05, + "loss": 0.4895, + "step": 5357 + }, + { + "epoch": 0.44089693478708086, + "grad_norm": 1.9849795501640861, + "learning_rate": 1.2377128601610103e-05, + "loss": 0.7593, + "step": 5358 + }, + { + "epoch": 0.44097922238222587, + "grad_norm": 2.8203376690873636, + "learning_rate": 1.2374539613857535e-05, + "loss": 0.7833, + "step": 5359 + }, + { + "epoch": 0.4410615099773709, + "grad_norm": 2.222486272332006, + "learning_rate": 1.2371950457421469e-05, + "loss": 0.7515, + "step": 5360 + }, + { + "epoch": 0.4411437975725159, + "grad_norm": 2.8407575821616162, + "learning_rate": 1.2369361132485835e-05, + "loss": 0.7869, + "step": 5361 + }, + { + "epoch": 0.441226085167661, + "grad_norm": 2.540708296648168, + "learning_rate": 1.2366771639234578e-05, + "loss": 0.7574, + "step": 5362 + }, + { + "epoch": 0.441308372762806, + "grad_norm": 1.9714420948368472, + "learning_rate": 1.2364181977851649e-05, + "loss": 0.7724, + "step": 5363 + }, + { + "epoch": 0.44139066035795105, + "grad_norm": 1.9672339619449446, + "learning_rate": 1.2361592148521009e-05, + "loss": 0.783, + "step": 5364 + }, + { + "epoch": 0.44147294795309605, + "grad_norm": 1.8579350307563158, + "learning_rate": 1.2359002151426645e-05, + "loss": 0.7565, + "step": 5365 + }, + { + "epoch": 0.4415552355482411, + "grad_norm": 2.251350976087786, + "learning_rate": 1.2356411986752537e-05, + "loss": 0.7583, + "step": 5366 + }, + { + "epoch": 0.4416375231433861, + "grad_norm": 2.3916858192307253, + "learning_rate": 1.235382165468269e-05, + "loss": 0.773, + "step": 5367 + }, + { + "epoch": 0.44171981073853117, + "grad_norm": 1.7911859738800386, + "learning_rate": 1.235123115540112e-05, + "loss": 0.7517, + "step": 5368 + }, + { + "epoch": 0.4418020983336762, + "grad_norm": 2.2598950757897205, + "learning_rate": 1.2348640489091845e-05, + "loss": 0.7716, + "step": 5369 + }, + { + "epoch": 0.44188438592882123, + "grad_norm": 2.6866946808071193, + "learning_rate": 1.2346049655938908e-05, + "loss": 0.7398, + "step": 5370 + }, + { + "epoch": 0.44196667352396624, + "grad_norm": 1.8782987971996845, + "learning_rate": 1.2343458656126357e-05, + "loss": 0.7369, + "step": 5371 + }, + { + "epoch": 0.4420489611191113, + "grad_norm": 1.6721536431980022, + "learning_rate": 1.234086748983825e-05, + "loss": 0.7231, + "step": 5372 + }, + { + "epoch": 0.44213124871425635, + "grad_norm": 2.1673673082026896, + "learning_rate": 1.2338276157258658e-05, + "loss": 0.762, + "step": 5373 + }, + { + "epoch": 0.44221353630940136, + "grad_norm": 2.2048508825337017, + "learning_rate": 1.233568465857167e-05, + "loss": 0.7755, + "step": 5374 + }, + { + "epoch": 0.4422958239045464, + "grad_norm": 1.843620598631604, + "learning_rate": 1.2333092993961378e-05, + "loss": 0.7843, + "step": 5375 + }, + { + "epoch": 0.4423781114996914, + "grad_norm": 0.42766762532182706, + "learning_rate": 1.233050116361189e-05, + "loss": 0.525, + "step": 5376 + }, + { + "epoch": 0.4424603990948365, + "grad_norm": 2.054589962399393, + "learning_rate": 1.2327909167707328e-05, + "loss": 0.7696, + "step": 5377 + }, + { + "epoch": 0.4425426866899815, + "grad_norm": 2.0003781751379344, + "learning_rate": 1.2325317006431821e-05, + "loss": 0.7576, + "step": 5378 + }, + { + "epoch": 0.44262497428512654, + "grad_norm": 2.3012284842085338, + "learning_rate": 1.232272467996951e-05, + "loss": 0.7367, + "step": 5379 + }, + { + "epoch": 0.44270726188027154, + "grad_norm": 1.9661061234133794, + "learning_rate": 1.2320132188504557e-05, + "loss": 0.7571, + "step": 5380 + }, + { + "epoch": 0.4427895494754166, + "grad_norm": 4.331359407885945, + "learning_rate": 1.2317539532221121e-05, + "loss": 0.7339, + "step": 5381 + }, + { + "epoch": 0.4428718370705616, + "grad_norm": 2.1381535921921824, + "learning_rate": 1.2314946711303384e-05, + "loss": 0.7263, + "step": 5382 + }, + { + "epoch": 0.44295412466570666, + "grad_norm": 2.226768930215416, + "learning_rate": 1.2312353725935536e-05, + "loss": 0.7553, + "step": 5383 + }, + { + "epoch": 0.44303641226085166, + "grad_norm": 0.4312002765126313, + "learning_rate": 1.2309760576301775e-05, + "loss": 0.5271, + "step": 5384 + }, + { + "epoch": 0.4431186998559967, + "grad_norm": 2.0891151047665555, + "learning_rate": 1.2307167262586317e-05, + "loss": 0.7718, + "step": 5385 + }, + { + "epoch": 0.4432009874511417, + "grad_norm": 2.0067355180148763, + "learning_rate": 1.2304573784973388e-05, + "loss": 0.7653, + "step": 5386 + }, + { + "epoch": 0.4432832750462868, + "grad_norm": 3.985961007803416, + "learning_rate": 1.230198014364722e-05, + "loss": 0.7232, + "step": 5387 + }, + { + "epoch": 0.4433655626414318, + "grad_norm": 2.5706772252316705, + "learning_rate": 1.2299386338792067e-05, + "loss": 0.7352, + "step": 5388 + }, + { + "epoch": 0.44344785023657685, + "grad_norm": 2.0691201431806694, + "learning_rate": 1.2296792370592185e-05, + "loss": 0.7217, + "step": 5389 + }, + { + "epoch": 0.44353013783172185, + "grad_norm": 2.2953546455923908, + "learning_rate": 1.2294198239231847e-05, + "loss": 0.7575, + "step": 5390 + }, + { + "epoch": 0.4436124254268669, + "grad_norm": 3.11857953514892, + "learning_rate": 1.2291603944895333e-05, + "loss": 0.7554, + "step": 5391 + }, + { + "epoch": 0.4436947130220119, + "grad_norm": 1.8589461217373002, + "learning_rate": 1.2289009487766942e-05, + "loss": 0.754, + "step": 5392 + }, + { + "epoch": 0.44377700061715697, + "grad_norm": 2.528602956072674, + "learning_rate": 1.2286414868030975e-05, + "loss": 0.7437, + "step": 5393 + }, + { + "epoch": 0.443859288212302, + "grad_norm": 1.9702329866437434, + "learning_rate": 1.2283820085871755e-05, + "loss": 0.7514, + "step": 5394 + }, + { + "epoch": 0.44394157580744703, + "grad_norm": 2.045640694664754, + "learning_rate": 1.2281225141473606e-05, + "loss": 0.7729, + "step": 5395 + }, + { + "epoch": 0.44402386340259203, + "grad_norm": 2.0182290557866964, + "learning_rate": 1.2278630035020874e-05, + "loss": 0.7603, + "step": 5396 + }, + { + "epoch": 0.4441061509977371, + "grad_norm": 2.478759541374359, + "learning_rate": 1.2276034766697909e-05, + "loss": 0.7863, + "step": 5397 + }, + { + "epoch": 0.4441884385928821, + "grad_norm": 2.223523452953577, + "learning_rate": 1.2273439336689071e-05, + "loss": 0.782, + "step": 5398 + }, + { + "epoch": 0.44427072618802715, + "grad_norm": 1.885396943862482, + "learning_rate": 1.2270843745178738e-05, + "loss": 0.7477, + "step": 5399 + }, + { + "epoch": 0.4443530137831722, + "grad_norm": 2.16124723724886, + "learning_rate": 1.2268247992351301e-05, + "loss": 0.7284, + "step": 5400 + }, + { + "epoch": 0.4444353013783172, + "grad_norm": 1.6193342359232468, + "learning_rate": 1.226565207839115e-05, + "loss": 0.7487, + "step": 5401 + }, + { + "epoch": 0.4445175889734623, + "grad_norm": 2.0519275672899453, + "learning_rate": 1.22630560034827e-05, + "loss": 0.7391, + "step": 5402 + }, + { + "epoch": 0.4445998765686073, + "grad_norm": 1.9426431016659014, + "learning_rate": 1.2260459767810373e-05, + "loss": 0.7435, + "step": 5403 + }, + { + "epoch": 0.44468216416375234, + "grad_norm": 1.7959110215798182, + "learning_rate": 1.2257863371558596e-05, + "loss": 0.7523, + "step": 5404 + }, + { + "epoch": 0.44476445175889734, + "grad_norm": 1.937062435318408, + "learning_rate": 1.2255266814911818e-05, + "loss": 0.7685, + "step": 5405 + }, + { + "epoch": 0.4448467393540424, + "grad_norm": 1.9468985092733406, + "learning_rate": 1.225267009805449e-05, + "loss": 0.7608, + "step": 5406 + }, + { + "epoch": 0.4449290269491874, + "grad_norm": 0.4170081570449744, + "learning_rate": 1.2250073221171085e-05, + "loss": 0.5196, + "step": 5407 + }, + { + "epoch": 0.44501131454433246, + "grad_norm": 2.020999826256211, + "learning_rate": 1.2247476184446071e-05, + "loss": 0.7607, + "step": 5408 + }, + { + "epoch": 0.44509360213947746, + "grad_norm": 0.4374008505099345, + "learning_rate": 1.2244878988063948e-05, + "loss": 0.5069, + "step": 5409 + }, + { + "epoch": 0.4451758897346225, + "grad_norm": 0.39977905984851303, + "learning_rate": 1.224228163220921e-05, + "loss": 0.5054, + "step": 5410 + }, + { + "epoch": 0.4452581773297675, + "grad_norm": 2.597341627645074, + "learning_rate": 1.223968411706637e-05, + "loss": 0.782, + "step": 5411 + }, + { + "epoch": 0.4453404649249126, + "grad_norm": 2.17236569559739, + "learning_rate": 1.2237086442819957e-05, + "loss": 0.756, + "step": 5412 + }, + { + "epoch": 0.4454227525200576, + "grad_norm": 1.8920012747326824, + "learning_rate": 1.2234488609654496e-05, + "loss": 0.7647, + "step": 5413 + }, + { + "epoch": 0.44550504011520264, + "grad_norm": 1.7822240745872422, + "learning_rate": 1.2231890617754539e-05, + "loss": 0.7335, + "step": 5414 + }, + { + "epoch": 0.44558732771034765, + "grad_norm": 2.2101405942194825, + "learning_rate": 1.2229292467304641e-05, + "loss": 0.7498, + "step": 5415 + }, + { + "epoch": 0.4456696153054927, + "grad_norm": 2.10607039991208, + "learning_rate": 1.2226694158489371e-05, + "loss": 0.7655, + "step": 5416 + }, + { + "epoch": 0.4457519029006377, + "grad_norm": 1.9274287612574406, + "learning_rate": 1.2224095691493309e-05, + "loss": 0.7364, + "step": 5417 + }, + { + "epoch": 0.44583419049578277, + "grad_norm": 2.97330215584428, + "learning_rate": 1.222149706650105e-05, + "loss": 0.7552, + "step": 5418 + }, + { + "epoch": 0.44591647809092777, + "grad_norm": 3.759675828972365, + "learning_rate": 1.221889828369719e-05, + "loss": 0.7861, + "step": 5419 + }, + { + "epoch": 0.44599876568607283, + "grad_norm": 3.116155038754245, + "learning_rate": 1.221629934326634e-05, + "loss": 0.767, + "step": 5420 + }, + { + "epoch": 0.44608105328121783, + "grad_norm": 2.4734294317998287, + "learning_rate": 1.2213700245393137e-05, + "loss": 0.7296, + "step": 5421 + }, + { + "epoch": 0.4461633408763629, + "grad_norm": 2.1321460663892915, + "learning_rate": 1.2211100990262203e-05, + "loss": 0.7612, + "step": 5422 + }, + { + "epoch": 0.4462456284715079, + "grad_norm": 1.883195885065359, + "learning_rate": 1.2208501578058192e-05, + "loss": 0.7648, + "step": 5423 + }, + { + "epoch": 0.44632791606665295, + "grad_norm": 1.9074115253954822, + "learning_rate": 1.2205902008965762e-05, + "loss": 0.7428, + "step": 5424 + }, + { + "epoch": 0.446410203661798, + "grad_norm": 2.334344286279892, + "learning_rate": 1.220330228316958e-05, + "loss": 0.7736, + "step": 5425 + }, + { + "epoch": 0.446492491256943, + "grad_norm": 2.47976458256848, + "learning_rate": 1.2200702400854328e-05, + "loss": 0.762, + "step": 5426 + }, + { + "epoch": 0.4465747788520881, + "grad_norm": 1.7687214611427784, + "learning_rate": 1.2198102362204698e-05, + "loss": 0.778, + "step": 5427 + }, + { + "epoch": 0.4466570664472331, + "grad_norm": 0.44499319581301283, + "learning_rate": 1.2195502167405389e-05, + "loss": 0.516, + "step": 5428 + }, + { + "epoch": 0.44673935404237813, + "grad_norm": 2.3446559332607633, + "learning_rate": 1.2192901816641121e-05, + "loss": 0.7435, + "step": 5429 + }, + { + "epoch": 0.44682164163752314, + "grad_norm": 3.170683104686457, + "learning_rate": 1.219030131009661e-05, + "loss": 0.7557, + "step": 5430 + }, + { + "epoch": 0.4469039292326682, + "grad_norm": 1.9211666451847358, + "learning_rate": 1.21877006479566e-05, + "loss": 0.7464, + "step": 5431 + }, + { + "epoch": 0.4469862168278132, + "grad_norm": 1.9676285513068985, + "learning_rate": 1.2185099830405835e-05, + "loss": 0.7389, + "step": 5432 + }, + { + "epoch": 0.44706850442295826, + "grad_norm": 0.4312730633201075, + "learning_rate": 1.2182498857629072e-05, + "loss": 0.5067, + "step": 5433 + }, + { + "epoch": 0.44715079201810326, + "grad_norm": 2.082489292587286, + "learning_rate": 1.2179897729811079e-05, + "loss": 0.7712, + "step": 5434 + }, + { + "epoch": 0.4472330796132483, + "grad_norm": 2.7510767758986336, + "learning_rate": 1.217729644713664e-05, + "loss": 0.7619, + "step": 5435 + }, + { + "epoch": 0.4473153672083933, + "grad_norm": 2.0454203082685285, + "learning_rate": 1.2174695009790542e-05, + "loss": 0.7676, + "step": 5436 + }, + { + "epoch": 0.4473976548035384, + "grad_norm": 2.8696679036135597, + "learning_rate": 1.2172093417957587e-05, + "loss": 0.7596, + "step": 5437 + }, + { + "epoch": 0.4474799423986834, + "grad_norm": 2.471135295319708, + "learning_rate": 1.2169491671822593e-05, + "loss": 0.7557, + "step": 5438 + }, + { + "epoch": 0.44756222999382844, + "grad_norm": 2.554429225307245, + "learning_rate": 1.2166889771570377e-05, + "loss": 0.7451, + "step": 5439 + }, + { + "epoch": 0.44764451758897345, + "grad_norm": 2.77557242485594, + "learning_rate": 1.2164287717385779e-05, + "loss": 0.7462, + "step": 5440 + }, + { + "epoch": 0.4477268051841185, + "grad_norm": 2.4910565864078325, + "learning_rate": 1.2161685509453643e-05, + "loss": 0.7431, + "step": 5441 + }, + { + "epoch": 0.4478090927792635, + "grad_norm": 1.9268501676837038, + "learning_rate": 1.2159083147958823e-05, + "loss": 0.7388, + "step": 5442 + }, + { + "epoch": 0.44789138037440857, + "grad_norm": 1.9033167259511043, + "learning_rate": 1.2156480633086189e-05, + "loss": 0.744, + "step": 5443 + }, + { + "epoch": 0.44797366796955357, + "grad_norm": 2.7908304272697326, + "learning_rate": 1.2153877965020625e-05, + "loss": 0.7656, + "step": 5444 + }, + { + "epoch": 0.4480559555646986, + "grad_norm": 1.9922629370515004, + "learning_rate": 1.215127514394701e-05, + "loss": 0.7616, + "step": 5445 + }, + { + "epoch": 0.44813824315984363, + "grad_norm": 2.4372741853013924, + "learning_rate": 1.2148672170050251e-05, + "loss": 0.7721, + "step": 5446 + }, + { + "epoch": 0.4482205307549887, + "grad_norm": 2.272558217068026, + "learning_rate": 1.2146069043515256e-05, + "loss": 0.7724, + "step": 5447 + }, + { + "epoch": 0.4483028183501337, + "grad_norm": 5.187639858046062, + "learning_rate": 1.214346576452695e-05, + "loss": 0.7489, + "step": 5448 + }, + { + "epoch": 0.44838510594527875, + "grad_norm": 0.4592794524335327, + "learning_rate": 1.2140862333270262e-05, + "loss": 0.5042, + "step": 5449 + }, + { + "epoch": 0.4484673935404238, + "grad_norm": 0.42225024248913795, + "learning_rate": 1.2138258749930141e-05, + "loss": 0.5054, + "step": 5450 + }, + { + "epoch": 0.4485496811355688, + "grad_norm": 2.4817780538550407, + "learning_rate": 1.2135655014691534e-05, + "loss": 0.7618, + "step": 5451 + }, + { + "epoch": 0.44863196873071387, + "grad_norm": 5.344994564180993, + "learning_rate": 1.2133051127739413e-05, + "loss": 0.763, + "step": 5452 + }, + { + "epoch": 0.4487142563258589, + "grad_norm": 0.4157400032414873, + "learning_rate": 1.213044708925875e-05, + "loss": 0.5121, + "step": 5453 + }, + { + "epoch": 0.44879654392100393, + "grad_norm": 2.274000737500358, + "learning_rate": 1.2127842899434531e-05, + "loss": 0.7652, + "step": 5454 + }, + { + "epoch": 0.44887883151614894, + "grad_norm": 2.5413856018117493, + "learning_rate": 1.2125238558451759e-05, + "loss": 0.7735, + "step": 5455 + }, + { + "epoch": 0.448961119111294, + "grad_norm": 2.4049097950323155, + "learning_rate": 1.2122634066495438e-05, + "loss": 0.7656, + "step": 5456 + }, + { + "epoch": 0.449043406706439, + "grad_norm": 2.0848540175793007, + "learning_rate": 1.2120029423750585e-05, + "loss": 0.7372, + "step": 5457 + }, + { + "epoch": 0.44912569430158406, + "grad_norm": 2.0365856014470527, + "learning_rate": 1.2117424630402234e-05, + "loss": 0.7602, + "step": 5458 + }, + { + "epoch": 0.44920798189672906, + "grad_norm": 2.7413042284845273, + "learning_rate": 1.2114819686635426e-05, + "loss": 0.7453, + "step": 5459 + }, + { + "epoch": 0.4492902694918741, + "grad_norm": 0.45561642804140334, + "learning_rate": 1.2112214592635208e-05, + "loss": 0.4804, + "step": 5460 + }, + { + "epoch": 0.4493725570870191, + "grad_norm": 2.264976763379205, + "learning_rate": 1.210960934858664e-05, + "loss": 0.8021, + "step": 5461 + }, + { + "epoch": 0.4494548446821642, + "grad_norm": 2.0917282985710517, + "learning_rate": 1.2107003954674803e-05, + "loss": 0.7573, + "step": 5462 + }, + { + "epoch": 0.4495371322773092, + "grad_norm": 2.5914369479985857, + "learning_rate": 1.210439841108477e-05, + "loss": 0.7626, + "step": 5463 + }, + { + "epoch": 0.44961941987245424, + "grad_norm": 2.4658566377275126, + "learning_rate": 1.2101792718001643e-05, + "loss": 0.7616, + "step": 5464 + }, + { + "epoch": 0.44970170746759924, + "grad_norm": 1.939043398448544, + "learning_rate": 1.209918687561052e-05, + "loss": 0.7604, + "step": 5465 + }, + { + "epoch": 0.4497839950627443, + "grad_norm": 2.008784896054091, + "learning_rate": 1.2096580884096518e-05, + "loss": 0.737, + "step": 5466 + }, + { + "epoch": 0.4498662826578893, + "grad_norm": 0.4031887406997368, + "learning_rate": 1.2093974743644767e-05, + "loss": 0.4908, + "step": 5467 + }, + { + "epoch": 0.44994857025303436, + "grad_norm": 0.4070656300683124, + "learning_rate": 1.2091368454440395e-05, + "loss": 0.5024, + "step": 5468 + }, + { + "epoch": 0.45003085784817937, + "grad_norm": 2.487160113471423, + "learning_rate": 1.2088762016668554e-05, + "loss": 0.7398, + "step": 5469 + }, + { + "epoch": 0.4501131454433244, + "grad_norm": 1.9903501837019772, + "learning_rate": 1.2086155430514402e-05, + "loss": 0.7719, + "step": 5470 + }, + { + "epoch": 0.45019543303846943, + "grad_norm": 2.282467208790329, + "learning_rate": 1.2083548696163101e-05, + "loss": 0.7965, + "step": 5471 + }, + { + "epoch": 0.4502777206336145, + "grad_norm": 2.322686352844429, + "learning_rate": 1.2080941813799833e-05, + "loss": 0.7524, + "step": 5472 + }, + { + "epoch": 0.4503600082287595, + "grad_norm": 2.1582943273627224, + "learning_rate": 1.207833478360979e-05, + "loss": 0.7559, + "step": 5473 + }, + { + "epoch": 0.45044229582390455, + "grad_norm": 2.20610807394045, + "learning_rate": 1.2075727605778164e-05, + "loss": 0.7543, + "step": 5474 + }, + { + "epoch": 0.45052458341904955, + "grad_norm": 2.09132490770673, + "learning_rate": 1.207312028049017e-05, + "loss": 0.7396, + "step": 5475 + }, + { + "epoch": 0.4506068710141946, + "grad_norm": 1.992160690975379, + "learning_rate": 1.207051280793103e-05, + "loss": 0.7475, + "step": 5476 + }, + { + "epoch": 0.45068915860933967, + "grad_norm": 0.46576663845909844, + "learning_rate": 1.2067905188285967e-05, + "loss": 0.534, + "step": 5477 + }, + { + "epoch": 0.45077144620448467, + "grad_norm": 2.1631091486420617, + "learning_rate": 1.2065297421740225e-05, + "loss": 0.7838, + "step": 5478 + }, + { + "epoch": 0.45085373379962973, + "grad_norm": 1.9780894878588755, + "learning_rate": 1.2062689508479063e-05, + "loss": 0.7458, + "step": 5479 + }, + { + "epoch": 0.45093602139477473, + "grad_norm": 0.42725583646274057, + "learning_rate": 1.2060081448687733e-05, + "loss": 0.525, + "step": 5480 + }, + { + "epoch": 0.4510183089899198, + "grad_norm": 1.9096815826218536, + "learning_rate": 1.2057473242551512e-05, + "loss": 0.7662, + "step": 5481 + }, + { + "epoch": 0.4511005965850648, + "grad_norm": 2.5599758048849646, + "learning_rate": 1.2054864890255684e-05, + "loss": 0.7921, + "step": 5482 + }, + { + "epoch": 0.45118288418020985, + "grad_norm": 1.9583789867483383, + "learning_rate": 1.205225639198554e-05, + "loss": 0.7615, + "step": 5483 + }, + { + "epoch": 0.45126517177535486, + "grad_norm": 2.108542564657258, + "learning_rate": 1.204964774792638e-05, + "loss": 0.7395, + "step": 5484 + }, + { + "epoch": 0.4513474593704999, + "grad_norm": 2.5737598472761594, + "learning_rate": 1.2047038958263529e-05, + "loss": 0.733, + "step": 5485 + }, + { + "epoch": 0.4514297469656449, + "grad_norm": 4.875556583195035, + "learning_rate": 1.2044430023182297e-05, + "loss": 0.7663, + "step": 5486 + }, + { + "epoch": 0.45151203456079, + "grad_norm": 2.3618090614095038, + "learning_rate": 1.204182094286803e-05, + "loss": 0.7427, + "step": 5487 + }, + { + "epoch": 0.451594322155935, + "grad_norm": 3.0241376857396434, + "learning_rate": 1.2039211717506068e-05, + "loss": 0.76, + "step": 5488 + }, + { + "epoch": 0.45167660975108004, + "grad_norm": 2.1735704673553164, + "learning_rate": 1.2036602347281765e-05, + "loss": 0.7201, + "step": 5489 + }, + { + "epoch": 0.45175889734622504, + "grad_norm": 2.9323022659637776, + "learning_rate": 1.2033992832380492e-05, + "loss": 0.7539, + "step": 5490 + }, + { + "epoch": 0.4518411849413701, + "grad_norm": 2.650195150608655, + "learning_rate": 1.2031383172987616e-05, + "loss": 0.7516, + "step": 5491 + }, + { + "epoch": 0.4519234725365151, + "grad_norm": 2.140177626435725, + "learning_rate": 1.2028773369288532e-05, + "loss": 0.74, + "step": 5492 + }, + { + "epoch": 0.45200576013166016, + "grad_norm": 2.2367429780420847, + "learning_rate": 1.2026163421468633e-05, + "loss": 0.7752, + "step": 5493 + }, + { + "epoch": 0.45208804772680516, + "grad_norm": 2.4014985643422664, + "learning_rate": 1.2023553329713322e-05, + "loss": 0.7827, + "step": 5494 + }, + { + "epoch": 0.4521703353219502, + "grad_norm": 0.4483915604013464, + "learning_rate": 1.2020943094208019e-05, + "loss": 0.4782, + "step": 5495 + }, + { + "epoch": 0.4522526229170952, + "grad_norm": 2.484609745390695, + "learning_rate": 1.2018332715138153e-05, + "loss": 0.7731, + "step": 5496 + }, + { + "epoch": 0.4523349105122403, + "grad_norm": 0.412230401879127, + "learning_rate": 1.2015722192689156e-05, + "loss": 0.4965, + "step": 5497 + }, + { + "epoch": 0.4524171981073853, + "grad_norm": 3.181019543409481, + "learning_rate": 1.2013111527046479e-05, + "loss": 0.7561, + "step": 5498 + }, + { + "epoch": 0.45249948570253035, + "grad_norm": 0.43042797525807247, + "learning_rate": 1.2010500718395581e-05, + "loss": 0.5123, + "step": 5499 + }, + { + "epoch": 0.45258177329767535, + "grad_norm": 2.277874217901845, + "learning_rate": 1.2007889766921925e-05, + "loss": 0.747, + "step": 5500 + }, + { + "epoch": 0.4526640608928204, + "grad_norm": 2.20757344531449, + "learning_rate": 1.2005278672810992e-05, + "loss": 0.7384, + "step": 5501 + }, + { + "epoch": 0.45274634848796547, + "grad_norm": 2.4667770060450436, + "learning_rate": 1.2002667436248267e-05, + "loss": 0.7468, + "step": 5502 + }, + { + "epoch": 0.45282863608311047, + "grad_norm": 2.3366903132467796, + "learning_rate": 1.2000056057419252e-05, + "loss": 0.7642, + "step": 5503 + }, + { + "epoch": 0.45291092367825553, + "grad_norm": 2.764775029076765, + "learning_rate": 1.1997444536509453e-05, + "loss": 0.7664, + "step": 5504 + }, + { + "epoch": 0.45299321127340053, + "grad_norm": 0.4158629976255346, + "learning_rate": 1.199483287370439e-05, + "loss": 0.5289, + "step": 5505 + }, + { + "epoch": 0.4530754988685456, + "grad_norm": 0.4371320756521687, + "learning_rate": 1.1992221069189587e-05, + "loss": 0.5083, + "step": 5506 + }, + { + "epoch": 0.4531577864636906, + "grad_norm": 2.494218001690742, + "learning_rate": 1.1989609123150587e-05, + "loss": 0.7362, + "step": 5507 + }, + { + "epoch": 0.45324007405883565, + "grad_norm": 2.9082944445436842, + "learning_rate": 1.1986997035772938e-05, + "loss": 0.789, + "step": 5508 + }, + { + "epoch": 0.45332236165398065, + "grad_norm": 2.0884373437673496, + "learning_rate": 1.1984384807242195e-05, + "loss": 0.7449, + "step": 5509 + }, + { + "epoch": 0.4534046492491257, + "grad_norm": 2.522702146935991, + "learning_rate": 1.198177243774393e-05, + "loss": 0.7556, + "step": 5510 + }, + { + "epoch": 0.4534869368442707, + "grad_norm": 10.42179642979865, + "learning_rate": 1.197915992746372e-05, + "loss": 0.7591, + "step": 5511 + }, + { + "epoch": 0.4535692244394158, + "grad_norm": 0.4286784067559489, + "learning_rate": 1.1976547276587153e-05, + "loss": 0.5136, + "step": 5512 + }, + { + "epoch": 0.4536515120345608, + "grad_norm": 2.1202159423853537, + "learning_rate": 1.1973934485299831e-05, + "loss": 0.7621, + "step": 5513 + }, + { + "epoch": 0.45373379962970584, + "grad_norm": 2.7307189374163348, + "learning_rate": 1.1971321553787358e-05, + "loss": 0.7507, + "step": 5514 + }, + { + "epoch": 0.45381608722485084, + "grad_norm": 2.652175127435486, + "learning_rate": 1.1968708482235357e-05, + "loss": 0.7329, + "step": 5515 + }, + { + "epoch": 0.4538983748199959, + "grad_norm": 2.400999240531746, + "learning_rate": 1.1966095270829452e-05, + "loss": 0.7331, + "step": 5516 + }, + { + "epoch": 0.4539806624151409, + "grad_norm": 2.2318419359651314, + "learning_rate": 1.1963481919755282e-05, + "loss": 0.748, + "step": 5517 + }, + { + "epoch": 0.45406295001028596, + "grad_norm": 0.4398569289239833, + "learning_rate": 1.1960868429198498e-05, + "loss": 0.5306, + "step": 5518 + }, + { + "epoch": 0.45414523760543096, + "grad_norm": 2.6714768672219624, + "learning_rate": 1.1958254799344758e-05, + "loss": 0.7631, + "step": 5519 + }, + { + "epoch": 0.454227525200576, + "grad_norm": 3.0349212809126853, + "learning_rate": 1.1955641030379726e-05, + "loss": 0.7911, + "step": 5520 + }, + { + "epoch": 0.454309812795721, + "grad_norm": 2.7741509102755453, + "learning_rate": 1.1953027122489083e-05, + "loss": 0.754, + "step": 5521 + }, + { + "epoch": 0.4543921003908661, + "grad_norm": 0.4127490273538106, + "learning_rate": 1.1950413075858518e-05, + "loss": 0.4893, + "step": 5522 + }, + { + "epoch": 0.4544743879860111, + "grad_norm": 3.382838305041213, + "learning_rate": 1.1947798890673727e-05, + "loss": 0.7304, + "step": 5523 + }, + { + "epoch": 0.45455667558115614, + "grad_norm": 3.0367554799217227, + "learning_rate": 1.1945184567120418e-05, + "loss": 0.7259, + "step": 5524 + }, + { + "epoch": 0.45463896317630115, + "grad_norm": 2.709871417538688, + "learning_rate": 1.194257010538431e-05, + "loss": 0.7625, + "step": 5525 + }, + { + "epoch": 0.4547212507714462, + "grad_norm": 4.416955882482528, + "learning_rate": 1.1939955505651123e-05, + "loss": 0.7551, + "step": 5526 + }, + { + "epoch": 0.4548035383665912, + "grad_norm": 3.4372628383132944, + "learning_rate": 1.1937340768106605e-05, + "loss": 0.754, + "step": 5527 + }, + { + "epoch": 0.45488582596173627, + "grad_norm": 2.348343368085471, + "learning_rate": 1.1934725892936496e-05, + "loss": 0.726, + "step": 5528 + }, + { + "epoch": 0.4549681135568813, + "grad_norm": 2.289715705089844, + "learning_rate": 1.1932110880326553e-05, + "loss": 0.7133, + "step": 5529 + }, + { + "epoch": 0.45505040115202633, + "grad_norm": 2.8699412082656783, + "learning_rate": 1.1929495730462541e-05, + "loss": 0.7999, + "step": 5530 + }, + { + "epoch": 0.4551326887471714, + "grad_norm": 2.57215440322858, + "learning_rate": 1.192688044353024e-05, + "loss": 0.7528, + "step": 5531 + }, + { + "epoch": 0.4552149763423164, + "grad_norm": 2.723173436258021, + "learning_rate": 1.1924265019715433e-05, + "loss": 0.7515, + "step": 5532 + }, + { + "epoch": 0.45529726393746145, + "grad_norm": 2.420211412870845, + "learning_rate": 1.1921649459203917e-05, + "loss": 0.7562, + "step": 5533 + }, + { + "epoch": 0.45537955153260645, + "grad_norm": 8.7791466216183, + "learning_rate": 1.19190337621815e-05, + "loss": 0.7534, + "step": 5534 + }, + { + "epoch": 0.4554618391277515, + "grad_norm": 0.43495882109077477, + "learning_rate": 1.1916417928833988e-05, + "loss": 0.5229, + "step": 5535 + }, + { + "epoch": 0.4555441267228965, + "grad_norm": 2.8049470785228032, + "learning_rate": 1.1913801959347213e-05, + "loss": 0.7688, + "step": 5536 + }, + { + "epoch": 0.4556264143180416, + "grad_norm": 2.345740107067168, + "learning_rate": 1.191118585390701e-05, + "loss": 0.7569, + "step": 5537 + }, + { + "epoch": 0.4557087019131866, + "grad_norm": 2.111864186314937, + "learning_rate": 1.1908569612699217e-05, + "loss": 0.7542, + "step": 5538 + }, + { + "epoch": 0.45579098950833163, + "grad_norm": 2.4677319977433423, + "learning_rate": 1.1905953235909693e-05, + "loss": 0.7556, + "step": 5539 + }, + { + "epoch": 0.45587327710347664, + "grad_norm": 2.4232460379201863, + "learning_rate": 1.19033367237243e-05, + "loss": 0.7379, + "step": 5540 + }, + { + "epoch": 0.4559555646986217, + "grad_norm": 2.1907899911272404, + "learning_rate": 1.1900720076328906e-05, + "loss": 0.7616, + "step": 5541 + }, + { + "epoch": 0.4560378522937667, + "grad_norm": 2.249056477009621, + "learning_rate": 1.18981032939094e-05, + "loss": 0.7314, + "step": 5542 + }, + { + "epoch": 0.45612013988891176, + "grad_norm": 2.731578861784911, + "learning_rate": 1.1895486376651675e-05, + "loss": 0.7397, + "step": 5543 + }, + { + "epoch": 0.45620242748405676, + "grad_norm": 0.4270439641181335, + "learning_rate": 1.1892869324741625e-05, + "loss": 0.5182, + "step": 5544 + }, + { + "epoch": 0.4562847150792018, + "grad_norm": 2.3630649439679634, + "learning_rate": 1.1890252138365169e-05, + "loss": 0.7724, + "step": 5545 + }, + { + "epoch": 0.4563670026743468, + "grad_norm": 3.9753155398745585, + "learning_rate": 1.188763481770822e-05, + "loss": 0.7559, + "step": 5546 + }, + { + "epoch": 0.4564492902694919, + "grad_norm": 2.6841327980584984, + "learning_rate": 1.1885017362956712e-05, + "loss": 0.7639, + "step": 5547 + }, + { + "epoch": 0.4565315778646369, + "grad_norm": 0.40417769923318725, + "learning_rate": 1.1882399774296589e-05, + "loss": 0.5068, + "step": 5548 + }, + { + "epoch": 0.45661386545978194, + "grad_norm": 2.2022568410658323, + "learning_rate": 1.1879782051913792e-05, + "loss": 0.7513, + "step": 5549 + }, + { + "epoch": 0.45669615305492695, + "grad_norm": 2.1866314853854947, + "learning_rate": 1.1877164195994288e-05, + "loss": 0.7689, + "step": 5550 + }, + { + "epoch": 0.456778440650072, + "grad_norm": 2.753178816178423, + "learning_rate": 1.1874546206724043e-05, + "loss": 0.7169, + "step": 5551 + }, + { + "epoch": 0.456860728245217, + "grad_norm": 2.5705588184990886, + "learning_rate": 1.187192808428903e-05, + "loss": 0.7593, + "step": 5552 + }, + { + "epoch": 0.45694301584036207, + "grad_norm": 2.3607431436222184, + "learning_rate": 1.1869309828875239e-05, + "loss": 0.7945, + "step": 5553 + }, + { + "epoch": 0.4570253034355071, + "grad_norm": 2.420033687346616, + "learning_rate": 1.186669144066867e-05, + "loss": 0.7425, + "step": 5554 + }, + { + "epoch": 0.4571075910306521, + "grad_norm": 2.3738342362522435, + "learning_rate": 1.1864072919855325e-05, + "loss": 0.7637, + "step": 5555 + }, + { + "epoch": 0.4571898786257972, + "grad_norm": 2.1297484031324205, + "learning_rate": 1.1861454266621219e-05, + "loss": 0.7448, + "step": 5556 + }, + { + "epoch": 0.4572721662209422, + "grad_norm": 2.0065527757110697, + "learning_rate": 1.1858835481152385e-05, + "loss": 0.7406, + "step": 5557 + }, + { + "epoch": 0.45735445381608725, + "grad_norm": 2.3345709901876637, + "learning_rate": 1.1856216563634843e-05, + "loss": 0.7533, + "step": 5558 + }, + { + "epoch": 0.45743674141123225, + "grad_norm": 3.036504638027851, + "learning_rate": 1.185359751425465e-05, + "loss": 0.7884, + "step": 5559 + }, + { + "epoch": 0.4575190290063773, + "grad_norm": 0.42778732195343716, + "learning_rate": 1.1850978333197856e-05, + "loss": 0.5128, + "step": 5560 + }, + { + "epoch": 0.4576013166015223, + "grad_norm": 2.4654579621065875, + "learning_rate": 1.1848359020650517e-05, + "loss": 0.7503, + "step": 5561 + }, + { + "epoch": 0.45768360419666737, + "grad_norm": 2.5099277173577157, + "learning_rate": 1.184573957679871e-05, + "loss": 0.7583, + "step": 5562 + }, + { + "epoch": 0.4577658917918124, + "grad_norm": 2.0911776994859355, + "learning_rate": 1.1843120001828517e-05, + "loss": 0.7769, + "step": 5563 + }, + { + "epoch": 0.45784817938695743, + "grad_norm": 2.4602606032108083, + "learning_rate": 1.1840500295926025e-05, + "loss": 0.7668, + "step": 5564 + }, + { + "epoch": 0.45793046698210244, + "grad_norm": 2.745197988576234, + "learning_rate": 1.1837880459277335e-05, + "loss": 0.7165, + "step": 5565 + }, + { + "epoch": 0.4580127545772475, + "grad_norm": 2.1701350499614422, + "learning_rate": 1.1835260492068558e-05, + "loss": 0.7566, + "step": 5566 + }, + { + "epoch": 0.4580950421723925, + "grad_norm": 3.098693708480851, + "learning_rate": 1.183264039448581e-05, + "loss": 0.7627, + "step": 5567 + }, + { + "epoch": 0.45817732976753756, + "grad_norm": 2.248622064045407, + "learning_rate": 1.1830020166715218e-05, + "loss": 0.746, + "step": 5568 + }, + { + "epoch": 0.45825961736268256, + "grad_norm": 2.292733796104913, + "learning_rate": 1.1827399808942923e-05, + "loss": 0.7794, + "step": 5569 + }, + { + "epoch": 0.4583419049578276, + "grad_norm": 2.859074647920882, + "learning_rate": 1.1824779321355066e-05, + "loss": 0.7727, + "step": 5570 + }, + { + "epoch": 0.4584241925529726, + "grad_norm": 1.778429979880988, + "learning_rate": 1.1822158704137805e-05, + "loss": 0.7439, + "step": 5571 + }, + { + "epoch": 0.4585064801481177, + "grad_norm": 2.2157385335709514, + "learning_rate": 1.1819537957477304e-05, + "loss": 0.7847, + "step": 5572 + }, + { + "epoch": 0.4585887677432627, + "grad_norm": 0.43301274645037946, + "learning_rate": 1.1816917081559735e-05, + "loss": 0.5508, + "step": 5573 + }, + { + "epoch": 0.45867105533840774, + "grad_norm": 1.9683373121937986, + "learning_rate": 1.1814296076571283e-05, + "loss": 0.7319, + "step": 5574 + }, + { + "epoch": 0.45875334293355274, + "grad_norm": 2.218338472110191, + "learning_rate": 1.1811674942698142e-05, + "loss": 0.7685, + "step": 5575 + }, + { + "epoch": 0.4588356305286978, + "grad_norm": 2.114172638835147, + "learning_rate": 1.1809053680126506e-05, + "loss": 0.7629, + "step": 5576 + }, + { + "epoch": 0.4589179181238428, + "grad_norm": 2.217071805084302, + "learning_rate": 1.180643228904259e-05, + "loss": 0.7732, + "step": 5577 + }, + { + "epoch": 0.45900020571898786, + "grad_norm": 2.147640917765827, + "learning_rate": 1.1803810769632618e-05, + "loss": 0.7601, + "step": 5578 + }, + { + "epoch": 0.45908249331413287, + "grad_norm": 2.259905582045493, + "learning_rate": 1.1801189122082808e-05, + "loss": 0.7421, + "step": 5579 + }, + { + "epoch": 0.4591647809092779, + "grad_norm": 2.666688578461901, + "learning_rate": 1.1798567346579408e-05, + "loss": 0.7571, + "step": 5580 + }, + { + "epoch": 0.459247068504423, + "grad_norm": 0.44593688083689575, + "learning_rate": 1.1795945443308657e-05, + "loss": 0.4945, + "step": 5581 + }, + { + "epoch": 0.459329356099568, + "grad_norm": 2.263339742673631, + "learning_rate": 1.1793323412456814e-05, + "loss": 0.7975, + "step": 5582 + }, + { + "epoch": 0.45941164369471305, + "grad_norm": 1.9612000586729805, + "learning_rate": 1.1790701254210146e-05, + "loss": 0.7406, + "step": 5583 + }, + { + "epoch": 0.45949393128985805, + "grad_norm": 2.3421054538608854, + "learning_rate": 1.1788078968754924e-05, + "loss": 0.7661, + "step": 5584 + }, + { + "epoch": 0.4595762188850031, + "grad_norm": 2.5633666359304375, + "learning_rate": 1.178545655627743e-05, + "loss": 0.7659, + "step": 5585 + }, + { + "epoch": 0.4596585064801481, + "grad_norm": 0.41096389211852863, + "learning_rate": 1.178283401696396e-05, + "loss": 0.516, + "step": 5586 + }, + { + "epoch": 0.45974079407529317, + "grad_norm": 0.41458650857605023, + "learning_rate": 1.178021135100081e-05, + "loss": 0.4934, + "step": 5587 + }, + { + "epoch": 0.45982308167043817, + "grad_norm": 2.2985738750007503, + "learning_rate": 1.1777588558574292e-05, + "loss": 0.7608, + "step": 5588 + }, + { + "epoch": 0.45990536926558323, + "grad_norm": 2.6315849529356985, + "learning_rate": 1.177496563987073e-05, + "loss": 0.7518, + "step": 5589 + }, + { + "epoch": 0.45998765686072823, + "grad_norm": 3.5111781365072936, + "learning_rate": 1.1772342595076445e-05, + "loss": 0.7839, + "step": 5590 + }, + { + "epoch": 0.4600699444558733, + "grad_norm": 1.875212272512906, + "learning_rate": 1.1769719424377774e-05, + "loss": 0.7574, + "step": 5591 + }, + { + "epoch": 0.4601522320510183, + "grad_norm": 1.9525170038115427, + "learning_rate": 1.1767096127961071e-05, + "loss": 0.7737, + "step": 5592 + }, + { + "epoch": 0.46023451964616335, + "grad_norm": 4.306458801308796, + "learning_rate": 1.1764472706012682e-05, + "loss": 0.7583, + "step": 5593 + }, + { + "epoch": 0.46031680724130836, + "grad_norm": 3.3429450361781514, + "learning_rate": 1.1761849158718972e-05, + "loss": 0.7333, + "step": 5594 + }, + { + "epoch": 0.4603990948364534, + "grad_norm": 3.547428539281552, + "learning_rate": 1.1759225486266319e-05, + "loss": 0.717, + "step": 5595 + }, + { + "epoch": 0.4604813824315984, + "grad_norm": 1.8340373836980097, + "learning_rate": 1.1756601688841099e-05, + "loss": 0.756, + "step": 5596 + }, + { + "epoch": 0.4605636700267435, + "grad_norm": 1.9991496892507565, + "learning_rate": 1.1753977766629703e-05, + "loss": 0.7568, + "step": 5597 + }, + { + "epoch": 0.4606459576218885, + "grad_norm": 2.940376465909627, + "learning_rate": 1.1751353719818535e-05, + "loss": 0.7392, + "step": 5598 + }, + { + "epoch": 0.46072824521703354, + "grad_norm": 2.573953056634928, + "learning_rate": 1.1748729548593998e-05, + "loss": 0.7419, + "step": 5599 + }, + { + "epoch": 0.46081053281217854, + "grad_norm": 2.035434843493305, + "learning_rate": 1.1746105253142508e-05, + "loss": 0.7714, + "step": 5600 + }, + { + "epoch": 0.4608928204073236, + "grad_norm": 2.3612562162234267, + "learning_rate": 1.1743480833650498e-05, + "loss": 0.7463, + "step": 5601 + }, + { + "epoch": 0.4609751080024686, + "grad_norm": 2.2670776537274175, + "learning_rate": 1.1740856290304394e-05, + "loss": 0.7348, + "step": 5602 + }, + { + "epoch": 0.46105739559761366, + "grad_norm": 0.40853927760602204, + "learning_rate": 1.1738231623290643e-05, + "loss": 0.514, + "step": 5603 + }, + { + "epoch": 0.46113968319275866, + "grad_norm": 2.415716460380235, + "learning_rate": 1.17356068327957e-05, + "loss": 0.728, + "step": 5604 + }, + { + "epoch": 0.4612219707879037, + "grad_norm": 2.084331097097651, + "learning_rate": 1.1732981919006024e-05, + "loss": 0.7599, + "step": 5605 + }, + { + "epoch": 0.4613042583830488, + "grad_norm": 2.154151150470679, + "learning_rate": 1.173035688210808e-05, + "loss": 0.7419, + "step": 5606 + }, + { + "epoch": 0.4613865459781938, + "grad_norm": 2.197839957072846, + "learning_rate": 1.1727731722288356e-05, + "loss": 0.7601, + "step": 5607 + }, + { + "epoch": 0.46146883357333884, + "grad_norm": 2.019837653396431, + "learning_rate": 1.172510643973333e-05, + "loss": 0.7487, + "step": 5608 + }, + { + "epoch": 0.46155112116848385, + "grad_norm": 1.8369940939666347, + "learning_rate": 1.1722481034629498e-05, + "loss": 0.7503, + "step": 5609 + }, + { + "epoch": 0.4616334087636289, + "grad_norm": 2.2412721525031816, + "learning_rate": 1.1719855507163374e-05, + "loss": 0.7471, + "step": 5610 + }, + { + "epoch": 0.4617156963587739, + "grad_norm": 1.8242708203863025, + "learning_rate": 1.1717229857521461e-05, + "loss": 0.7512, + "step": 5611 + }, + { + "epoch": 0.46179798395391897, + "grad_norm": 2.460241659766797, + "learning_rate": 1.1714604085890292e-05, + "loss": 0.7541, + "step": 5612 + }, + { + "epoch": 0.46188027154906397, + "grad_norm": 1.9410595569512286, + "learning_rate": 1.1711978192456384e-05, + "loss": 0.7437, + "step": 5613 + }, + { + "epoch": 0.46196255914420903, + "grad_norm": 2.618386283233846, + "learning_rate": 1.1709352177406284e-05, + "loss": 0.7652, + "step": 5614 + }, + { + "epoch": 0.46204484673935403, + "grad_norm": 2.2975366208810626, + "learning_rate": 1.1706726040926542e-05, + "loss": 0.7684, + "step": 5615 + }, + { + "epoch": 0.4621271343344991, + "grad_norm": 2.252541460283884, + "learning_rate": 1.1704099783203708e-05, + "loss": 0.748, + "step": 5616 + }, + { + "epoch": 0.4622094219296441, + "grad_norm": 1.963479936721276, + "learning_rate": 1.1701473404424354e-05, + "loss": 0.7343, + "step": 5617 + }, + { + "epoch": 0.46229170952478915, + "grad_norm": 0.42706299223034566, + "learning_rate": 1.1698846904775047e-05, + "loss": 0.5133, + "step": 5618 + }, + { + "epoch": 0.46237399711993415, + "grad_norm": 2.0444135649749384, + "learning_rate": 1.1696220284442374e-05, + "loss": 0.7671, + "step": 5619 + }, + { + "epoch": 0.4624562847150792, + "grad_norm": 2.4531291386018172, + "learning_rate": 1.1693593543612925e-05, + "loss": 0.7476, + "step": 5620 + }, + { + "epoch": 0.4625385723102242, + "grad_norm": 2.1206531732639986, + "learning_rate": 1.1690966682473301e-05, + "loss": 0.7399, + "step": 5621 + }, + { + "epoch": 0.4626208599053693, + "grad_norm": 0.4385397729197111, + "learning_rate": 1.1688339701210106e-05, + "loss": 0.5194, + "step": 5622 + }, + { + "epoch": 0.4627031475005143, + "grad_norm": 0.4221980485241206, + "learning_rate": 1.1685712600009959e-05, + "loss": 0.5177, + "step": 5623 + }, + { + "epoch": 0.46278543509565934, + "grad_norm": 2.5441570461119793, + "learning_rate": 1.1683085379059486e-05, + "loss": 0.7544, + "step": 5624 + }, + { + "epoch": 0.46286772269080434, + "grad_norm": 2.5334394281304116, + "learning_rate": 1.1680458038545316e-05, + "loss": 0.7371, + "step": 5625 + }, + { + "epoch": 0.4629500102859494, + "grad_norm": 0.406084761511755, + "learning_rate": 1.16778305786541e-05, + "loss": 0.4987, + "step": 5626 + }, + { + "epoch": 0.4630322978810944, + "grad_norm": 0.41890195627666343, + "learning_rate": 1.167520299957248e-05, + "loss": 0.4911, + "step": 5627 + }, + { + "epoch": 0.46311458547623946, + "grad_norm": 1.9604856185775212, + "learning_rate": 1.1672575301487118e-05, + "loss": 0.7604, + "step": 5628 + }, + { + "epoch": 0.46319687307138446, + "grad_norm": 1.9250929940090589, + "learning_rate": 1.1669947484584681e-05, + "loss": 0.7485, + "step": 5629 + }, + { + "epoch": 0.4632791606665295, + "grad_norm": 2.4036179357018264, + "learning_rate": 1.1667319549051845e-05, + "loss": 0.7805, + "step": 5630 + }, + { + "epoch": 0.4633614482616746, + "grad_norm": 1.8395439411024015, + "learning_rate": 1.1664691495075295e-05, + "loss": 0.7777, + "step": 5631 + }, + { + "epoch": 0.4634437358568196, + "grad_norm": 2.514436513509076, + "learning_rate": 1.1662063322841724e-05, + "loss": 0.752, + "step": 5632 + }, + { + "epoch": 0.46352602345196464, + "grad_norm": 1.9958650617144158, + "learning_rate": 1.1659435032537833e-05, + "loss": 0.7468, + "step": 5633 + }, + { + "epoch": 0.46360831104710964, + "grad_norm": 2.381825419610243, + "learning_rate": 1.1656806624350331e-05, + "loss": 0.7512, + "step": 5634 + }, + { + "epoch": 0.4636905986422547, + "grad_norm": 2.2901474600441922, + "learning_rate": 1.1654178098465936e-05, + "loss": 0.7406, + "step": 5635 + }, + { + "epoch": 0.4637728862373997, + "grad_norm": 2.695575194164159, + "learning_rate": 1.1651549455071376e-05, + "loss": 0.7298, + "step": 5636 + }, + { + "epoch": 0.46385517383254476, + "grad_norm": 2.615623584664641, + "learning_rate": 1.164892069435338e-05, + "loss": 0.7297, + "step": 5637 + }, + { + "epoch": 0.46393746142768977, + "grad_norm": 2.0200137217207073, + "learning_rate": 1.1646291816498696e-05, + "loss": 0.7611, + "step": 5638 + }, + { + "epoch": 0.4640197490228348, + "grad_norm": 3.0411462323121508, + "learning_rate": 1.1643662821694078e-05, + "loss": 0.7645, + "step": 5639 + }, + { + "epoch": 0.46410203661797983, + "grad_norm": 2.3400011828071374, + "learning_rate": 1.1641033710126278e-05, + "loss": 0.7459, + "step": 5640 + }, + { + "epoch": 0.4641843242131249, + "grad_norm": 0.42569773070724515, + "learning_rate": 1.1638404481982069e-05, + "loss": 0.5198, + "step": 5641 + }, + { + "epoch": 0.4642666118082699, + "grad_norm": 3.0040664429408594, + "learning_rate": 1.1635775137448226e-05, + "loss": 0.7528, + "step": 5642 + }, + { + "epoch": 0.46434889940341495, + "grad_norm": 1.9345602598619334, + "learning_rate": 1.1633145676711532e-05, + "loss": 0.7552, + "step": 5643 + }, + { + "epoch": 0.46443118699855995, + "grad_norm": 3.2693446083450586, + "learning_rate": 1.1630516099958782e-05, + "loss": 0.7489, + "step": 5644 + }, + { + "epoch": 0.464513474593705, + "grad_norm": 2.151674055410782, + "learning_rate": 1.1627886407376777e-05, + "loss": 0.7653, + "step": 5645 + }, + { + "epoch": 0.46459576218885, + "grad_norm": 2.1979946126037473, + "learning_rate": 1.1625256599152323e-05, + "loss": 0.7542, + "step": 5646 + }, + { + "epoch": 0.4646780497839951, + "grad_norm": 0.4388382003330121, + "learning_rate": 1.1622626675472242e-05, + "loss": 0.5382, + "step": 5647 + }, + { + "epoch": 0.4647603373791401, + "grad_norm": 2.650307926924577, + "learning_rate": 1.1619996636523353e-05, + "loss": 0.7538, + "step": 5648 + }, + { + "epoch": 0.46484262497428513, + "grad_norm": 2.294141642193051, + "learning_rate": 1.1617366482492496e-05, + "loss": 0.7581, + "step": 5649 + }, + { + "epoch": 0.46492491256943014, + "grad_norm": 1.9461451602498006, + "learning_rate": 1.1614736213566512e-05, + "loss": 0.753, + "step": 5650 + }, + { + "epoch": 0.4650072001645752, + "grad_norm": 1.9716826271869443, + "learning_rate": 1.1612105829932247e-05, + "loss": 0.7544, + "step": 5651 + }, + { + "epoch": 0.4650894877597202, + "grad_norm": 2.4710117276991737, + "learning_rate": 1.1609475331776566e-05, + "loss": 0.7661, + "step": 5652 + }, + { + "epoch": 0.46517177535486526, + "grad_norm": 2.539869995501278, + "learning_rate": 1.1606844719286327e-05, + "loss": 0.7678, + "step": 5653 + }, + { + "epoch": 0.46525406295001026, + "grad_norm": 2.066534776068987, + "learning_rate": 1.1604213992648413e-05, + "loss": 0.7528, + "step": 5654 + }, + { + "epoch": 0.4653363505451553, + "grad_norm": 0.41899937474956234, + "learning_rate": 1.1601583152049698e-05, + "loss": 0.5102, + "step": 5655 + }, + { + "epoch": 0.4654186381403003, + "grad_norm": 3.1076390669679226, + "learning_rate": 1.159895219767708e-05, + "loss": 0.7409, + "step": 5656 + }, + { + "epoch": 0.4655009257354454, + "grad_norm": 2.0830787113182265, + "learning_rate": 1.1596321129717453e-05, + "loss": 0.7597, + "step": 5657 + }, + { + "epoch": 0.46558321333059044, + "grad_norm": 1.9903650497938024, + "learning_rate": 1.1593689948357727e-05, + "loss": 0.7886, + "step": 5658 + }, + { + "epoch": 0.46566550092573544, + "grad_norm": 2.178401154189519, + "learning_rate": 1.159105865378482e-05, + "loss": 0.7456, + "step": 5659 + }, + { + "epoch": 0.4657477885208805, + "grad_norm": 1.9869084654123954, + "learning_rate": 1.1588427246185645e-05, + "loss": 0.7716, + "step": 5660 + }, + { + "epoch": 0.4658300761160255, + "grad_norm": 0.4125871692728283, + "learning_rate": 1.158579572574714e-05, + "loss": 0.4891, + "step": 5661 + }, + { + "epoch": 0.46591236371117056, + "grad_norm": 3.45550207338365, + "learning_rate": 1.1583164092656246e-05, + "loss": 0.7646, + "step": 5662 + }, + { + "epoch": 0.46599465130631557, + "grad_norm": 2.0863642863515794, + "learning_rate": 1.1580532347099902e-05, + "loss": 0.7735, + "step": 5663 + }, + { + "epoch": 0.4660769389014606, + "grad_norm": 1.8036246084325789, + "learning_rate": 1.157790048926507e-05, + "loss": 0.7351, + "step": 5664 + }, + { + "epoch": 0.4661592264966056, + "grad_norm": 2.048452517989147, + "learning_rate": 1.1575268519338712e-05, + "loss": 0.7177, + "step": 5665 + }, + { + "epoch": 0.4662415140917507, + "grad_norm": 3.0245924976939307, + "learning_rate": 1.1572636437507796e-05, + "loss": 0.7515, + "step": 5666 + }, + { + "epoch": 0.4663238016868957, + "grad_norm": 2.6074001599599255, + "learning_rate": 1.15700042439593e-05, + "loss": 0.7699, + "step": 5667 + }, + { + "epoch": 0.46640608928204075, + "grad_norm": 1.900428840626333, + "learning_rate": 1.1567371938880219e-05, + "loss": 0.7669, + "step": 5668 + }, + { + "epoch": 0.46648837687718575, + "grad_norm": 1.7457584425906891, + "learning_rate": 1.156473952245754e-05, + "loss": 0.7772, + "step": 5669 + }, + { + "epoch": 0.4665706644723308, + "grad_norm": 1.8370702721904013, + "learning_rate": 1.1562106994878266e-05, + "loss": 0.7091, + "step": 5670 + }, + { + "epoch": 0.4666529520674758, + "grad_norm": 0.4058521931669938, + "learning_rate": 1.1559474356329413e-05, + "loss": 0.5014, + "step": 5671 + }, + { + "epoch": 0.46673523966262087, + "grad_norm": 0.40231563959071076, + "learning_rate": 1.155684160699799e-05, + "loss": 0.509, + "step": 5672 + }, + { + "epoch": 0.4668175272577659, + "grad_norm": 1.978218525174263, + "learning_rate": 1.1554208747071032e-05, + "loss": 0.7229, + "step": 5673 + }, + { + "epoch": 0.46689981485291093, + "grad_norm": 0.4383378842815371, + "learning_rate": 1.155157577673557e-05, + "loss": 0.5226, + "step": 5674 + }, + { + "epoch": 0.46698210244805594, + "grad_norm": 2.1049199344422997, + "learning_rate": 1.1548942696178646e-05, + "loss": 0.7573, + "step": 5675 + }, + { + "epoch": 0.467064390043201, + "grad_norm": 2.315228497149809, + "learning_rate": 1.1546309505587311e-05, + "loss": 0.7521, + "step": 5676 + }, + { + "epoch": 0.467146677638346, + "grad_norm": 6.905649703183174, + "learning_rate": 1.1543676205148624e-05, + "loss": 0.7286, + "step": 5677 + }, + { + "epoch": 0.46722896523349106, + "grad_norm": 0.4213331299981888, + "learning_rate": 1.1541042795049644e-05, + "loss": 0.5347, + "step": 5678 + }, + { + "epoch": 0.46731125282863606, + "grad_norm": 2.2421058534315277, + "learning_rate": 1.153840927547745e-05, + "loss": 0.7501, + "step": 5679 + }, + { + "epoch": 0.4673935404237811, + "grad_norm": 2.6444260000528304, + "learning_rate": 1.1535775646619118e-05, + "loss": 0.7419, + "step": 5680 + }, + { + "epoch": 0.4674758280189261, + "grad_norm": 2.073886624293451, + "learning_rate": 1.1533141908661745e-05, + "loss": 0.775, + "step": 5681 + }, + { + "epoch": 0.4675581156140712, + "grad_norm": 3.2867561944829427, + "learning_rate": 1.1530508061792423e-05, + "loss": 0.7528, + "step": 5682 + }, + { + "epoch": 0.46764040320921624, + "grad_norm": 0.4196514685831581, + "learning_rate": 1.152787410619825e-05, + "loss": 0.4932, + "step": 5683 + }, + { + "epoch": 0.46772269080436124, + "grad_norm": 2.220193348439036, + "learning_rate": 1.152524004206635e-05, + "loss": 0.7352, + "step": 5684 + }, + { + "epoch": 0.4678049783995063, + "grad_norm": 2.3309409946445285, + "learning_rate": 1.1522605869583837e-05, + "loss": 0.747, + "step": 5685 + }, + { + "epoch": 0.4678872659946513, + "grad_norm": 2.5231061158025834, + "learning_rate": 1.1519971588937833e-05, + "loss": 0.7305, + "step": 5686 + }, + { + "epoch": 0.46796955358979636, + "grad_norm": 1.8541132578326183, + "learning_rate": 1.1517337200315481e-05, + "loss": 0.7765, + "step": 5687 + }, + { + "epoch": 0.46805184118494136, + "grad_norm": 1.899905412691171, + "learning_rate": 1.1514702703903922e-05, + "loss": 0.738, + "step": 5688 + }, + { + "epoch": 0.4681341287800864, + "grad_norm": 1.9361160734401437, + "learning_rate": 1.1512068099890303e-05, + "loss": 0.7554, + "step": 5689 + }, + { + "epoch": 0.4682164163752314, + "grad_norm": 2.912719866595171, + "learning_rate": 1.1509433388461785e-05, + "loss": 0.7283, + "step": 5690 + }, + { + "epoch": 0.4682987039703765, + "grad_norm": 2.341082987042206, + "learning_rate": 1.1506798569805536e-05, + "loss": 0.7597, + "step": 5691 + }, + { + "epoch": 0.4683809915655215, + "grad_norm": 2.1914493863246074, + "learning_rate": 1.1504163644108721e-05, + "loss": 0.7284, + "step": 5692 + }, + { + "epoch": 0.46846327916066655, + "grad_norm": 2.3556679012282493, + "learning_rate": 1.150152861155853e-05, + "loss": 0.7557, + "step": 5693 + }, + { + "epoch": 0.46854556675581155, + "grad_norm": 2.2155909073455398, + "learning_rate": 1.1498893472342148e-05, + "loss": 0.7109, + "step": 5694 + }, + { + "epoch": 0.4686278543509566, + "grad_norm": 0.4200529188989092, + "learning_rate": 1.1496258226646771e-05, + "loss": 0.4925, + "step": 5695 + }, + { + "epoch": 0.4687101419461016, + "grad_norm": 1.9939320066807982, + "learning_rate": 1.14936228746596e-05, + "loss": 0.779, + "step": 5696 + }, + { + "epoch": 0.46879242954124667, + "grad_norm": 2.3607595484379575, + "learning_rate": 1.149098741656785e-05, + "loss": 0.7513, + "step": 5697 + }, + { + "epoch": 0.46887471713639167, + "grad_norm": 0.4418605286610495, + "learning_rate": 1.1488351852558739e-05, + "loss": 0.5071, + "step": 5698 + }, + { + "epoch": 0.46895700473153673, + "grad_norm": 1.9003114174594395, + "learning_rate": 1.1485716182819493e-05, + "loss": 0.7445, + "step": 5699 + }, + { + "epoch": 0.46903929232668173, + "grad_norm": 2.203293510674813, + "learning_rate": 1.1483080407537343e-05, + "loss": 0.7385, + "step": 5700 + }, + { + "epoch": 0.4691215799218268, + "grad_norm": 1.9325837732507527, + "learning_rate": 1.1480444526899535e-05, + "loss": 0.7557, + "step": 5701 + }, + { + "epoch": 0.4692038675169718, + "grad_norm": 2.5915573878411795, + "learning_rate": 1.1477808541093316e-05, + "loss": 0.7679, + "step": 5702 + }, + { + "epoch": 0.46928615511211685, + "grad_norm": 2.7702833527712176, + "learning_rate": 1.1475172450305939e-05, + "loss": 0.7873, + "step": 5703 + }, + { + "epoch": 0.46936844270726186, + "grad_norm": 2.420724740451737, + "learning_rate": 1.1472536254724672e-05, + "loss": 0.7519, + "step": 5704 + }, + { + "epoch": 0.4694507303024069, + "grad_norm": 2.151214734406208, + "learning_rate": 1.1469899954536786e-05, + "loss": 0.7456, + "step": 5705 + }, + { + "epoch": 0.4695330178975519, + "grad_norm": 0.4404429591644004, + "learning_rate": 1.1467263549929553e-05, + "loss": 0.5123, + "step": 5706 + }, + { + "epoch": 0.469615305492697, + "grad_norm": 2.637957830728388, + "learning_rate": 1.1464627041090265e-05, + "loss": 0.7886, + "step": 5707 + }, + { + "epoch": 0.469697593087842, + "grad_norm": 0.43710611543182665, + "learning_rate": 1.146199042820622e-05, + "loss": 0.4931, + "step": 5708 + }, + { + "epoch": 0.46977988068298704, + "grad_norm": 1.9231651766482778, + "learning_rate": 1.145935371146471e-05, + "loss": 0.744, + "step": 5709 + }, + { + "epoch": 0.4698621682781321, + "grad_norm": 2.4380820944771826, + "learning_rate": 1.1456716891053043e-05, + "loss": 0.7516, + "step": 5710 + }, + { + "epoch": 0.4699444558732771, + "grad_norm": 0.4114280819823362, + "learning_rate": 1.1454079967158542e-05, + "loss": 0.4786, + "step": 5711 + }, + { + "epoch": 0.47002674346842216, + "grad_norm": 2.1755802482747857, + "learning_rate": 1.1451442939968525e-05, + "loss": 0.7511, + "step": 5712 + }, + { + "epoch": 0.47010903106356716, + "grad_norm": 2.472118745496376, + "learning_rate": 1.144880580967032e-05, + "loss": 0.7561, + "step": 5713 + }, + { + "epoch": 0.4701913186587122, + "grad_norm": 2.281716223315861, + "learning_rate": 1.144616857645127e-05, + "loss": 0.7478, + "step": 5714 + }, + { + "epoch": 0.4702736062538572, + "grad_norm": 2.211675325960965, + "learning_rate": 1.1443531240498715e-05, + "loss": 0.7337, + "step": 5715 + }, + { + "epoch": 0.4703558938490023, + "grad_norm": 2.0993511176579034, + "learning_rate": 1.144089380200001e-05, + "loss": 0.7766, + "step": 5716 + }, + { + "epoch": 0.4704381814441473, + "grad_norm": 0.43401663370714305, + "learning_rate": 1.1438256261142516e-05, + "loss": 0.4971, + "step": 5717 + }, + { + "epoch": 0.47052046903929234, + "grad_norm": 0.42934259329724733, + "learning_rate": 1.1435618618113593e-05, + "loss": 0.5089, + "step": 5718 + }, + { + "epoch": 0.47060275663443735, + "grad_norm": 0.4234240474637959, + "learning_rate": 1.1432980873100622e-05, + "loss": 0.5157, + "step": 5719 + }, + { + "epoch": 0.4706850442295824, + "grad_norm": 2.198263211311973, + "learning_rate": 1.1430343026290983e-05, + "loss": 0.7509, + "step": 5720 + }, + { + "epoch": 0.4707673318247274, + "grad_norm": 2.703679913430109, + "learning_rate": 1.142770507787206e-05, + "loss": 0.7527, + "step": 5721 + }, + { + "epoch": 0.47084961941987247, + "grad_norm": 2.3718137125880654, + "learning_rate": 1.142506702803125e-05, + "loss": 0.7279, + "step": 5722 + }, + { + "epoch": 0.47093190701501747, + "grad_norm": 2.3278646043543176, + "learning_rate": 1.1422428876955964e-05, + "loss": 0.7562, + "step": 5723 + }, + { + "epoch": 0.47101419461016253, + "grad_norm": 2.2101267283094814, + "learning_rate": 1.14197906248336e-05, + "loss": 0.7601, + "step": 5724 + }, + { + "epoch": 0.47109648220530753, + "grad_norm": 2.401947770938394, + "learning_rate": 1.1417152271851582e-05, + "loss": 0.7468, + "step": 5725 + }, + { + "epoch": 0.4711787698004526, + "grad_norm": 2.669067775284675, + "learning_rate": 1.1414513818197335e-05, + "loss": 0.7548, + "step": 5726 + }, + { + "epoch": 0.4712610573955976, + "grad_norm": 2.4636560552954134, + "learning_rate": 1.1411875264058286e-05, + "loss": 0.7595, + "step": 5727 + }, + { + "epoch": 0.47134334499074265, + "grad_norm": 2.5210944200735734, + "learning_rate": 1.1409236609621878e-05, + "loss": 0.7413, + "step": 5728 + }, + { + "epoch": 0.47142563258588766, + "grad_norm": 6.088867250391302, + "learning_rate": 1.1406597855075554e-05, + "loss": 0.7421, + "step": 5729 + }, + { + "epoch": 0.4715079201810327, + "grad_norm": 3.2223370951210537, + "learning_rate": 1.1403959000606766e-05, + "loss": 0.7711, + "step": 5730 + }, + { + "epoch": 0.4715902077761777, + "grad_norm": 2.488736429009699, + "learning_rate": 1.1401320046402979e-05, + "loss": 0.7404, + "step": 5731 + }, + { + "epoch": 0.4716724953713228, + "grad_norm": 2.203750681303369, + "learning_rate": 1.1398680992651657e-05, + "loss": 0.7729, + "step": 5732 + }, + { + "epoch": 0.4717547829664678, + "grad_norm": 3.216370797569919, + "learning_rate": 1.1396041839540271e-05, + "loss": 0.7376, + "step": 5733 + }, + { + "epoch": 0.47183707056161284, + "grad_norm": 3.388561106863594, + "learning_rate": 1.1393402587256308e-05, + "loss": 0.7634, + "step": 5734 + }, + { + "epoch": 0.4719193581567579, + "grad_norm": 3.3538547979166484, + "learning_rate": 1.1390763235987252e-05, + "loss": 0.7468, + "step": 5735 + }, + { + "epoch": 0.4720016457519029, + "grad_norm": 3.2447331726854065, + "learning_rate": 1.1388123785920599e-05, + "loss": 0.7783, + "step": 5736 + }, + { + "epoch": 0.47208393334704796, + "grad_norm": 2.3293143204024593, + "learning_rate": 1.1385484237243855e-05, + "loss": 0.7062, + "step": 5737 + }, + { + "epoch": 0.47216622094219296, + "grad_norm": 3.698015243222801, + "learning_rate": 1.1382844590144527e-05, + "loss": 0.7389, + "step": 5738 + }, + { + "epoch": 0.472248508537338, + "grad_norm": 2.665714768655772, + "learning_rate": 1.1380204844810127e-05, + "loss": 0.7604, + "step": 5739 + }, + { + "epoch": 0.472330796132483, + "grad_norm": 2.4911900838088687, + "learning_rate": 1.1377565001428188e-05, + "loss": 0.736, + "step": 5740 + }, + { + "epoch": 0.4724130837276281, + "grad_norm": 2.4152202143490284, + "learning_rate": 1.1374925060186231e-05, + "loss": 0.7643, + "step": 5741 + }, + { + "epoch": 0.4724953713227731, + "grad_norm": 2.347623558210925, + "learning_rate": 1.1372285021271794e-05, + "loss": 0.7655, + "step": 5742 + }, + { + "epoch": 0.47257765891791814, + "grad_norm": 2.8302401895514038, + "learning_rate": 1.1369644884872429e-05, + "loss": 0.7491, + "step": 5743 + }, + { + "epoch": 0.47265994651306315, + "grad_norm": 2.063581561035632, + "learning_rate": 1.1367004651175678e-05, + "loss": 0.7711, + "step": 5744 + }, + { + "epoch": 0.4727422341082082, + "grad_norm": 2.797682083430461, + "learning_rate": 1.1364364320369102e-05, + "loss": 0.7382, + "step": 5745 + }, + { + "epoch": 0.4728245217033532, + "grad_norm": 2.579411336658761, + "learning_rate": 1.136172389264027e-05, + "loss": 0.7421, + "step": 5746 + }, + { + "epoch": 0.47290680929849827, + "grad_norm": 2.483164991197225, + "learning_rate": 1.135908336817675e-05, + "loss": 0.7522, + "step": 5747 + }, + { + "epoch": 0.47298909689364327, + "grad_norm": 2.3363711066518107, + "learning_rate": 1.1356442747166116e-05, + "loss": 0.7287, + "step": 5748 + }, + { + "epoch": 0.4730713844887883, + "grad_norm": 0.4945971602842628, + "learning_rate": 1.1353802029795965e-05, + "loss": 0.5133, + "step": 5749 + }, + { + "epoch": 0.47315367208393333, + "grad_norm": 3.0074629214147977, + "learning_rate": 1.1351161216253878e-05, + "loss": 0.7493, + "step": 5750 + }, + { + "epoch": 0.4732359596790784, + "grad_norm": 2.595723893668729, + "learning_rate": 1.1348520306727462e-05, + "loss": 0.7433, + "step": 5751 + }, + { + "epoch": 0.4733182472742234, + "grad_norm": 2.5632807422469748, + "learning_rate": 1.1345879301404321e-05, + "loss": 0.761, + "step": 5752 + }, + { + "epoch": 0.47340053486936845, + "grad_norm": 2.935522600288664, + "learning_rate": 1.1343238200472064e-05, + "loss": 0.7655, + "step": 5753 + }, + { + "epoch": 0.47348282246451345, + "grad_norm": 4.787111263634783, + "learning_rate": 1.1340597004118314e-05, + "loss": 0.7669, + "step": 5754 + }, + { + "epoch": 0.4735651100596585, + "grad_norm": 2.82531001114739, + "learning_rate": 1.13379557125307e-05, + "loss": 0.7473, + "step": 5755 + }, + { + "epoch": 0.4736473976548035, + "grad_norm": 2.957002178005482, + "learning_rate": 1.133531432589685e-05, + "loss": 0.7597, + "step": 5756 + }, + { + "epoch": 0.4737296852499486, + "grad_norm": 3.263852679300578, + "learning_rate": 1.1332672844404402e-05, + "loss": 0.7309, + "step": 5757 + }, + { + "epoch": 0.4738119728450936, + "grad_norm": 2.2841607016668766, + "learning_rate": 1.1330031268241013e-05, + "loss": 0.7329, + "step": 5758 + }, + { + "epoch": 0.47389426044023863, + "grad_norm": 2.5738549672418545, + "learning_rate": 1.1327389597594325e-05, + "loss": 0.7481, + "step": 5759 + }, + { + "epoch": 0.4739765480353837, + "grad_norm": 0.42851375746752157, + "learning_rate": 1.1324747832652004e-05, + "loss": 0.5182, + "step": 5760 + }, + { + "epoch": 0.4740588356305287, + "grad_norm": 3.112748998480725, + "learning_rate": 1.1322105973601716e-05, + "loss": 0.77, + "step": 5761 + }, + { + "epoch": 0.47414112322567376, + "grad_norm": 1.961720274469146, + "learning_rate": 1.1319464020631135e-05, + "loss": 0.7582, + "step": 5762 + }, + { + "epoch": 0.47422341082081876, + "grad_norm": 2.623178252350117, + "learning_rate": 1.1316821973927938e-05, + "loss": 0.7482, + "step": 5763 + }, + { + "epoch": 0.4743056984159638, + "grad_norm": 2.794743066328855, + "learning_rate": 1.1314179833679815e-05, + "loss": 0.7464, + "step": 5764 + }, + { + "epoch": 0.4743879860111088, + "grad_norm": 2.5035009223309737, + "learning_rate": 1.1311537600074458e-05, + "loss": 0.7539, + "step": 5765 + }, + { + "epoch": 0.4744702736062539, + "grad_norm": 2.38072014852653, + "learning_rate": 1.1308895273299569e-05, + "loss": 0.7399, + "step": 5766 + }, + { + "epoch": 0.4745525612013989, + "grad_norm": 0.42835283753445264, + "learning_rate": 1.1306252853542852e-05, + "loss": 0.5188, + "step": 5767 + }, + { + "epoch": 0.47463484879654394, + "grad_norm": 3.2350689748119446, + "learning_rate": 1.1303610340992019e-05, + "loss": 0.7735, + "step": 5768 + }, + { + "epoch": 0.47471713639168894, + "grad_norm": 3.36059003573526, + "learning_rate": 1.1300967735834796e-05, + "loss": 0.7658, + "step": 5769 + }, + { + "epoch": 0.474799423986834, + "grad_norm": 2.969124052421608, + "learning_rate": 1.1298325038258904e-05, + "loss": 0.7393, + "step": 5770 + }, + { + "epoch": 0.474881711581979, + "grad_norm": 0.4078378122272731, + "learning_rate": 1.1295682248452075e-05, + "loss": 0.5126, + "step": 5771 + }, + { + "epoch": 0.47496399917712406, + "grad_norm": 0.4237109376231862, + "learning_rate": 1.1293039366602057e-05, + "loss": 0.4926, + "step": 5772 + }, + { + "epoch": 0.47504628677226907, + "grad_norm": 2.597403418399249, + "learning_rate": 1.1290396392896585e-05, + "loss": 0.7716, + "step": 5773 + }, + { + "epoch": 0.4751285743674141, + "grad_norm": 0.41510689902586945, + "learning_rate": 1.128775332752342e-05, + "loss": 0.5046, + "step": 5774 + }, + { + "epoch": 0.47521086196255913, + "grad_norm": 3.9069491283685154, + "learning_rate": 1.1285110170670317e-05, + "loss": 0.7948, + "step": 5775 + }, + { + "epoch": 0.4752931495577042, + "grad_norm": 2.4037275599307883, + "learning_rate": 1.1282466922525044e-05, + "loss": 0.7776, + "step": 5776 + }, + { + "epoch": 0.4753754371528492, + "grad_norm": 2.277897477752852, + "learning_rate": 1.127982358327537e-05, + "loss": 0.7551, + "step": 5777 + }, + { + "epoch": 0.47545772474799425, + "grad_norm": 2.439972297811944, + "learning_rate": 1.1277180153109076e-05, + "loss": 0.7596, + "step": 5778 + }, + { + "epoch": 0.47554001234313925, + "grad_norm": 3.9798815324070342, + "learning_rate": 1.1274536632213945e-05, + "loss": 0.7638, + "step": 5779 + }, + { + "epoch": 0.4756222999382843, + "grad_norm": 5.600735329238622, + "learning_rate": 1.1271893020777771e-05, + "loss": 0.7561, + "step": 5780 + }, + { + "epoch": 0.4757045875334293, + "grad_norm": 2.307130402544801, + "learning_rate": 1.1269249318988354e-05, + "loss": 0.7221, + "step": 5781 + }, + { + "epoch": 0.47578687512857437, + "grad_norm": 2.305278607338824, + "learning_rate": 1.1266605527033492e-05, + "loss": 0.7648, + "step": 5782 + }, + { + "epoch": 0.4758691627237194, + "grad_norm": 3.9749214083145885, + "learning_rate": 1.1263961645100998e-05, + "loss": 0.7413, + "step": 5783 + }, + { + "epoch": 0.47595145031886443, + "grad_norm": 2.934765236703905, + "learning_rate": 1.1261317673378692e-05, + "loss": 0.758, + "step": 5784 + }, + { + "epoch": 0.47603373791400944, + "grad_norm": 2.3255683834199368, + "learning_rate": 1.1258673612054395e-05, + "loss": 0.7704, + "step": 5785 + }, + { + "epoch": 0.4761160255091545, + "grad_norm": 2.1219415528399206, + "learning_rate": 1.1256029461315939e-05, + "loss": 0.751, + "step": 5786 + }, + { + "epoch": 0.47619831310429955, + "grad_norm": 2.4072922662137937, + "learning_rate": 1.1253385221351158e-05, + "loss": 0.7529, + "step": 5787 + }, + { + "epoch": 0.47628060069944456, + "grad_norm": 2.5590171509214414, + "learning_rate": 1.1250740892347895e-05, + "loss": 0.7278, + "step": 5788 + }, + { + "epoch": 0.4763628882945896, + "grad_norm": 2.3074172553082763, + "learning_rate": 1.1248096474493999e-05, + "loss": 0.767, + "step": 5789 + }, + { + "epoch": 0.4764451758897346, + "grad_norm": 4.186064764904037, + "learning_rate": 1.1245451967977328e-05, + "loss": 0.7657, + "step": 5790 + }, + { + "epoch": 0.4765274634848797, + "grad_norm": 3.2763929186020113, + "learning_rate": 1.1242807372985738e-05, + "loss": 0.7475, + "step": 5791 + }, + { + "epoch": 0.4766097510800247, + "grad_norm": 2.134087707195808, + "learning_rate": 1.12401626897071e-05, + "loss": 0.7322, + "step": 5792 + }, + { + "epoch": 0.47669203867516974, + "grad_norm": 2.509159529201529, + "learning_rate": 1.1237517918329288e-05, + "loss": 0.7297, + "step": 5793 + }, + { + "epoch": 0.47677432627031474, + "grad_norm": 3.639050290523382, + "learning_rate": 1.1234873059040183e-05, + "loss": 0.7471, + "step": 5794 + }, + { + "epoch": 0.4768566138654598, + "grad_norm": 2.1177268983426845, + "learning_rate": 1.123222811202767e-05, + "loss": 0.7215, + "step": 5795 + }, + { + "epoch": 0.4769389014606048, + "grad_norm": 2.417329477961515, + "learning_rate": 1.1229583077479644e-05, + "loss": 0.7699, + "step": 5796 + }, + { + "epoch": 0.47702118905574986, + "grad_norm": 0.4752616157438557, + "learning_rate": 1.1226937955584001e-05, + "loss": 0.5278, + "step": 5797 + }, + { + "epoch": 0.47710347665089486, + "grad_norm": 2.310545010067949, + "learning_rate": 1.1224292746528651e-05, + "loss": 0.74, + "step": 5798 + }, + { + "epoch": 0.4771857642460399, + "grad_norm": 2.4183989862227455, + "learning_rate": 1.1221647450501498e-05, + "loss": 0.7454, + "step": 5799 + }, + { + "epoch": 0.4772680518411849, + "grad_norm": 0.4541920347943409, + "learning_rate": 1.1219002067690466e-05, + "loss": 0.5344, + "step": 5800 + }, + { + "epoch": 0.47735033943633, + "grad_norm": 2.4113448084278426, + "learning_rate": 1.1216356598283478e-05, + "loss": 0.7365, + "step": 5801 + }, + { + "epoch": 0.477432627031475, + "grad_norm": 2.436602714689282, + "learning_rate": 1.1213711042468457e-05, + "loss": 0.7453, + "step": 5802 + }, + { + "epoch": 0.47751491462662005, + "grad_norm": 0.4336257230496221, + "learning_rate": 1.121106540043335e-05, + "loss": 0.5393, + "step": 5803 + }, + { + "epoch": 0.47759720222176505, + "grad_norm": 24.95275324897151, + "learning_rate": 1.1208419672366093e-05, + "loss": 0.7697, + "step": 5804 + }, + { + "epoch": 0.4776794898169101, + "grad_norm": 3.288144413712097, + "learning_rate": 1.1205773858454634e-05, + "loss": 0.7607, + "step": 5805 + }, + { + "epoch": 0.4777617774120551, + "grad_norm": 0.4457836580858964, + "learning_rate": 1.1203127958886928e-05, + "loss": 0.5393, + "step": 5806 + }, + { + "epoch": 0.47784406500720017, + "grad_norm": 4.906271585946284, + "learning_rate": 1.1200481973850939e-05, + "loss": 0.7367, + "step": 5807 + }, + { + "epoch": 0.4779263526023452, + "grad_norm": 0.4269639287651738, + "learning_rate": 1.1197835903534626e-05, + "loss": 0.53, + "step": 5808 + }, + { + "epoch": 0.47800864019749023, + "grad_norm": 3.8127397955169147, + "learning_rate": 1.1195189748125968e-05, + "loss": 0.7741, + "step": 5809 + }, + { + "epoch": 0.47809092779263523, + "grad_norm": 3.7963067838902695, + "learning_rate": 1.1192543507812943e-05, + "loss": 0.7339, + "step": 5810 + }, + { + "epoch": 0.4781732153877803, + "grad_norm": 2.769247115467446, + "learning_rate": 1.1189897182783531e-05, + "loss": 0.7848, + "step": 5811 + }, + { + "epoch": 0.47825550298292535, + "grad_norm": 0.4212265456291987, + "learning_rate": 1.1187250773225729e-05, + "loss": 0.5248, + "step": 5812 + }, + { + "epoch": 0.47833779057807035, + "grad_norm": 2.4441724741452457, + "learning_rate": 1.1184604279327531e-05, + "loss": 0.7368, + "step": 5813 + }, + { + "epoch": 0.4784200781732154, + "grad_norm": 2.5624255498076765, + "learning_rate": 1.1181957701276937e-05, + "loss": 0.7665, + "step": 5814 + }, + { + "epoch": 0.4785023657683604, + "grad_norm": 2.347933328753732, + "learning_rate": 1.1179311039261959e-05, + "loss": 0.7335, + "step": 5815 + }, + { + "epoch": 0.4785846533635055, + "grad_norm": 2.0843386900470113, + "learning_rate": 1.1176664293470613e-05, + "loss": 0.7537, + "step": 5816 + }, + { + "epoch": 0.4786669409586505, + "grad_norm": 2.5001314098958973, + "learning_rate": 1.1174017464090915e-05, + "loss": 0.7344, + "step": 5817 + }, + { + "epoch": 0.47874922855379554, + "grad_norm": 3.0862399519523374, + "learning_rate": 1.1171370551310897e-05, + "loss": 0.7468, + "step": 5818 + }, + { + "epoch": 0.47883151614894054, + "grad_norm": 3.38767683313373, + "learning_rate": 1.1168723555318586e-05, + "loss": 0.7586, + "step": 5819 + }, + { + "epoch": 0.4789138037440856, + "grad_norm": 2.353123053015181, + "learning_rate": 1.1166076476302021e-05, + "loss": 0.7714, + "step": 5820 + }, + { + "epoch": 0.4789960913392306, + "grad_norm": 2.8532980489501276, + "learning_rate": 1.116342931444925e-05, + "loss": 0.7426, + "step": 5821 + }, + { + "epoch": 0.47907837893437566, + "grad_norm": 3.8846515875873187, + "learning_rate": 1.1160782069948324e-05, + "loss": 0.7516, + "step": 5822 + }, + { + "epoch": 0.47916066652952066, + "grad_norm": 2.3883838604540593, + "learning_rate": 1.1158134742987291e-05, + "loss": 0.7563, + "step": 5823 + }, + { + "epoch": 0.4792429541246657, + "grad_norm": 2.8957057858831052, + "learning_rate": 1.115548733375422e-05, + "loss": 0.7393, + "step": 5824 + }, + { + "epoch": 0.4793252417198107, + "grad_norm": 0.4252429577751018, + "learning_rate": 1.115283984243718e-05, + "loss": 0.5043, + "step": 5825 + }, + { + "epoch": 0.4794075293149558, + "grad_norm": 3.5843028275211433, + "learning_rate": 1.1150192269224235e-05, + "loss": 0.7665, + "step": 5826 + }, + { + "epoch": 0.4794898169101008, + "grad_norm": 2.361823847812176, + "learning_rate": 1.1147544614303474e-05, + "loss": 0.7674, + "step": 5827 + }, + { + "epoch": 0.47957210450524584, + "grad_norm": 2.579479809861864, + "learning_rate": 1.1144896877862981e-05, + "loss": 0.7567, + "step": 5828 + }, + { + "epoch": 0.47965439210039085, + "grad_norm": 2.881322673426298, + "learning_rate": 1.1142249060090842e-05, + "loss": 0.7537, + "step": 5829 + }, + { + "epoch": 0.4797366796955359, + "grad_norm": 2.463539332520094, + "learning_rate": 1.1139601161175156e-05, + "loss": 0.7559, + "step": 5830 + }, + { + "epoch": 0.4798189672906809, + "grad_norm": 2.185200128757755, + "learning_rate": 1.113695318130403e-05, + "loss": 0.7301, + "step": 5831 + }, + { + "epoch": 0.47990125488582597, + "grad_norm": 2.2243523143459325, + "learning_rate": 1.1134305120665562e-05, + "loss": 0.7304, + "step": 5832 + }, + { + "epoch": 0.47998354248097097, + "grad_norm": 8.814225464941826, + "learning_rate": 1.1131656979447878e-05, + "loss": 0.7415, + "step": 5833 + }, + { + "epoch": 0.48006583007611603, + "grad_norm": 1.8668778246407944, + "learning_rate": 1.1129008757839089e-05, + "loss": 0.7325, + "step": 5834 + }, + { + "epoch": 0.48014811767126103, + "grad_norm": 2.269401671164961, + "learning_rate": 1.1126360456027322e-05, + "loss": 0.7498, + "step": 5835 + }, + { + "epoch": 0.4802304052664061, + "grad_norm": 2.2798756533331694, + "learning_rate": 1.1123712074200714e-05, + "loss": 0.7236, + "step": 5836 + }, + { + "epoch": 0.4803126928615511, + "grad_norm": 2.100492684877444, + "learning_rate": 1.1121063612547393e-05, + "loss": 0.7195, + "step": 5837 + }, + { + "epoch": 0.48039498045669615, + "grad_norm": 1.9933908304526973, + "learning_rate": 1.1118415071255508e-05, + "loss": 0.7496, + "step": 5838 + }, + { + "epoch": 0.4804772680518412, + "grad_norm": 2.774331063547509, + "learning_rate": 1.1115766450513208e-05, + "loss": 0.7818, + "step": 5839 + }, + { + "epoch": 0.4805595556469862, + "grad_norm": 1.8864411010203455, + "learning_rate": 1.1113117750508639e-05, + "loss": 0.7667, + "step": 5840 + }, + { + "epoch": 0.4806418432421313, + "grad_norm": 3.1970776753363412, + "learning_rate": 1.1110468971429967e-05, + "loss": 0.7416, + "step": 5841 + }, + { + "epoch": 0.4807241308372763, + "grad_norm": 1.9377896252210605, + "learning_rate": 1.1107820113465356e-05, + "loss": 0.7415, + "step": 5842 + }, + { + "epoch": 0.48080641843242133, + "grad_norm": 2.042404120783801, + "learning_rate": 1.1105171176802973e-05, + "loss": 0.7647, + "step": 5843 + }, + { + "epoch": 0.48088870602756634, + "grad_norm": 1.975918175307963, + "learning_rate": 1.1102522161630999e-05, + "loss": 0.7615, + "step": 5844 + }, + { + "epoch": 0.4809709936227114, + "grad_norm": 2.662295324071712, + "learning_rate": 1.1099873068137614e-05, + "loss": 0.7604, + "step": 5845 + }, + { + "epoch": 0.4810532812178564, + "grad_norm": 2.689256525136526, + "learning_rate": 1.1097223896511005e-05, + "loss": 0.7569, + "step": 5846 + }, + { + "epoch": 0.48113556881300146, + "grad_norm": 0.4286216986845714, + "learning_rate": 1.1094574646939365e-05, + "loss": 0.4875, + "step": 5847 + }, + { + "epoch": 0.48121785640814646, + "grad_norm": 2.0022539921156643, + "learning_rate": 1.1091925319610894e-05, + "loss": 0.7348, + "step": 5848 + }, + { + "epoch": 0.4813001440032915, + "grad_norm": 2.2740110783951852, + "learning_rate": 1.1089275914713792e-05, + "loss": 0.7303, + "step": 5849 + }, + { + "epoch": 0.4813824315984365, + "grad_norm": 2.0724468570057026, + "learning_rate": 1.1086626432436272e-05, + "loss": 0.7251, + "step": 5850 + }, + { + "epoch": 0.4814647191935816, + "grad_norm": 2.485873019766488, + "learning_rate": 1.1083976872966549e-05, + "loss": 0.7711, + "step": 5851 + }, + { + "epoch": 0.4815470067887266, + "grad_norm": 0.4268969267634154, + "learning_rate": 1.108132723649284e-05, + "loss": 0.5155, + "step": 5852 + }, + { + "epoch": 0.48162929438387164, + "grad_norm": 2.0271999814428363, + "learning_rate": 1.1078677523203376e-05, + "loss": 0.7298, + "step": 5853 + }, + { + "epoch": 0.48171158197901665, + "grad_norm": 2.516987662937783, + "learning_rate": 1.1076027733286388e-05, + "loss": 0.762, + "step": 5854 + }, + { + "epoch": 0.4817938695741617, + "grad_norm": 0.41378361048563406, + "learning_rate": 1.1073377866930105e-05, + "loss": 0.5222, + "step": 5855 + }, + { + "epoch": 0.4818761571693067, + "grad_norm": 2.7213883165707213, + "learning_rate": 1.1070727924322777e-05, + "loss": 0.7479, + "step": 5856 + }, + { + "epoch": 0.48195844476445177, + "grad_norm": 2.574171079105191, + "learning_rate": 1.1068077905652652e-05, + "loss": 0.7507, + "step": 5857 + }, + { + "epoch": 0.48204073235959677, + "grad_norm": 2.7205400102447226, + "learning_rate": 1.1065427811107979e-05, + "loss": 0.7699, + "step": 5858 + }, + { + "epoch": 0.4821230199547418, + "grad_norm": 2.15351055264949, + "learning_rate": 1.1062777640877017e-05, + "loss": 0.7608, + "step": 5859 + }, + { + "epoch": 0.48220530754988683, + "grad_norm": 2.538753447790659, + "learning_rate": 1.1060127395148034e-05, + "loss": 0.7628, + "step": 5860 + }, + { + "epoch": 0.4822875951450319, + "grad_norm": 2.3897414090714544, + "learning_rate": 1.1057477074109295e-05, + "loss": 0.7359, + "step": 5861 + }, + { + "epoch": 0.4823698827401769, + "grad_norm": 4.136506120697047, + "learning_rate": 1.1054826677949073e-05, + "loss": 0.7695, + "step": 5862 + }, + { + "epoch": 0.48245217033532195, + "grad_norm": 1.9443226058110845, + "learning_rate": 1.1052176206855656e-05, + "loss": 0.7544, + "step": 5863 + }, + { + "epoch": 0.482534457930467, + "grad_norm": 2.24061582205666, + "learning_rate": 1.1049525661017319e-05, + "loss": 0.7521, + "step": 5864 + }, + { + "epoch": 0.482616745525612, + "grad_norm": 2.661831095887107, + "learning_rate": 1.1046875040622361e-05, + "loss": 0.7375, + "step": 5865 + }, + { + "epoch": 0.48269903312075707, + "grad_norm": 1.9741771926051932, + "learning_rate": 1.1044224345859074e-05, + "loss": 0.7383, + "step": 5866 + }, + { + "epoch": 0.4827813207159021, + "grad_norm": 2.497008087255779, + "learning_rate": 1.1041573576915759e-05, + "loss": 0.7294, + "step": 5867 + }, + { + "epoch": 0.48286360831104713, + "grad_norm": 2.5522087265385807, + "learning_rate": 1.1038922733980724e-05, + "loss": 0.781, + "step": 5868 + }, + { + "epoch": 0.48294589590619214, + "grad_norm": 2.4230181525614767, + "learning_rate": 1.1036271817242277e-05, + "loss": 0.7358, + "step": 5869 + }, + { + "epoch": 0.4830281835013372, + "grad_norm": 2.1440092704513187, + "learning_rate": 1.1033620826888735e-05, + "loss": 0.7311, + "step": 5870 + }, + { + "epoch": 0.4831104710964822, + "grad_norm": 4.822894427520598, + "learning_rate": 1.1030969763108428e-05, + "loss": 0.7597, + "step": 5871 + }, + { + "epoch": 0.48319275869162726, + "grad_norm": 5.593339451447891, + "learning_rate": 1.1028318626089675e-05, + "loss": 0.7592, + "step": 5872 + }, + { + "epoch": 0.48327504628677226, + "grad_norm": 2.708539408081071, + "learning_rate": 1.102566741602081e-05, + "loss": 0.7553, + "step": 5873 + }, + { + "epoch": 0.4833573338819173, + "grad_norm": 2.3395110304954874, + "learning_rate": 1.1023016133090174e-05, + "loss": 0.7603, + "step": 5874 + }, + { + "epoch": 0.4834396214770623, + "grad_norm": 3.676848792043229, + "learning_rate": 1.1020364777486108e-05, + "loss": 0.72, + "step": 5875 + }, + { + "epoch": 0.4835219090722074, + "grad_norm": 2.3007486502863856, + "learning_rate": 1.1017713349396959e-05, + "loss": 0.7438, + "step": 5876 + }, + { + "epoch": 0.4836041966673524, + "grad_norm": 3.049883486500315, + "learning_rate": 1.1015061849011083e-05, + "loss": 0.767, + "step": 5877 + }, + { + "epoch": 0.48368648426249744, + "grad_norm": 2.110250605589129, + "learning_rate": 1.1012410276516837e-05, + "loss": 0.764, + "step": 5878 + }, + { + "epoch": 0.48376877185764244, + "grad_norm": 0.42285433013743734, + "learning_rate": 1.1009758632102583e-05, + "loss": 0.5209, + "step": 5879 + }, + { + "epoch": 0.4838510594527875, + "grad_norm": 0.4394230043763465, + "learning_rate": 1.100710691595669e-05, + "loss": 0.5151, + "step": 5880 + }, + { + "epoch": 0.4839333470479325, + "grad_norm": 2.1417250985014373, + "learning_rate": 1.1004455128267532e-05, + "loss": 0.747, + "step": 5881 + }, + { + "epoch": 0.48401563464307756, + "grad_norm": 0.41543905109939117, + "learning_rate": 1.100180326922349e-05, + "loss": 0.5157, + "step": 5882 + }, + { + "epoch": 0.48409792223822257, + "grad_norm": 2.0875405581489375, + "learning_rate": 1.0999151339012945e-05, + "loss": 0.7391, + "step": 5883 + }, + { + "epoch": 0.4841802098333676, + "grad_norm": 0.44467920131810695, + "learning_rate": 1.0996499337824286e-05, + "loss": 0.5103, + "step": 5884 + }, + { + "epoch": 0.48426249742851263, + "grad_norm": 4.9289111486965025, + "learning_rate": 1.099384726584591e-05, + "loss": 0.7663, + "step": 5885 + }, + { + "epoch": 0.4843447850236577, + "grad_norm": 1.9966050259898946, + "learning_rate": 1.0991195123266212e-05, + "loss": 0.7555, + "step": 5886 + }, + { + "epoch": 0.4844270726188027, + "grad_norm": 2.271603444253016, + "learning_rate": 1.0988542910273596e-05, + "loss": 0.7711, + "step": 5887 + }, + { + "epoch": 0.48450936021394775, + "grad_norm": 2.945437909230147, + "learning_rate": 1.0985890627056475e-05, + "loss": 0.7234, + "step": 5888 + }, + { + "epoch": 0.4845916478090928, + "grad_norm": 2.3272327789972227, + "learning_rate": 1.0983238273803259e-05, + "loss": 0.7866, + "step": 5889 + }, + { + "epoch": 0.4846739354042378, + "grad_norm": 2.4046682103727206, + "learning_rate": 1.098058585070237e-05, + "loss": 0.7359, + "step": 5890 + }, + { + "epoch": 0.48475622299938287, + "grad_norm": 2.0623467139838647, + "learning_rate": 1.097793335794223e-05, + "loss": 0.7448, + "step": 5891 + }, + { + "epoch": 0.48483851059452787, + "grad_norm": 2.5977528872159747, + "learning_rate": 1.0975280795711267e-05, + "loss": 0.7511, + "step": 5892 + }, + { + "epoch": 0.48492079818967293, + "grad_norm": 3.5014483613036917, + "learning_rate": 1.0972628164197917e-05, + "loss": 0.705, + "step": 5893 + }, + { + "epoch": 0.48500308578481793, + "grad_norm": 2.738469620573681, + "learning_rate": 1.0969975463590619e-05, + "loss": 0.7556, + "step": 5894 + }, + { + "epoch": 0.485085373379963, + "grad_norm": 2.5707144290792456, + "learning_rate": 1.0967322694077814e-05, + "loss": 0.759, + "step": 5895 + }, + { + "epoch": 0.485167660975108, + "grad_norm": 2.186377066364156, + "learning_rate": 1.0964669855847948e-05, + "loss": 0.7736, + "step": 5896 + }, + { + "epoch": 0.48524994857025305, + "grad_norm": 0.4569947856893424, + "learning_rate": 1.0962016949089484e-05, + "loss": 0.4994, + "step": 5897 + }, + { + "epoch": 0.48533223616539806, + "grad_norm": 4.034831193739979, + "learning_rate": 1.0959363973990873e-05, + "loss": 0.7367, + "step": 5898 + }, + { + "epoch": 0.4854145237605431, + "grad_norm": 2.057380342861732, + "learning_rate": 1.0956710930740577e-05, + "loss": 0.7544, + "step": 5899 + }, + { + "epoch": 0.4854968113556881, + "grad_norm": 2.737632794778471, + "learning_rate": 1.0954057819527072e-05, + "loss": 0.7736, + "step": 5900 + }, + { + "epoch": 0.4855790989508332, + "grad_norm": 2.462802067897765, + "learning_rate": 1.0951404640538818e-05, + "loss": 0.7193, + "step": 5901 + }, + { + "epoch": 0.4856613865459782, + "grad_norm": 3.3447283865494004, + "learning_rate": 1.0948751393964306e-05, + "loss": 0.7462, + "step": 5902 + }, + { + "epoch": 0.48574367414112324, + "grad_norm": 3.057298268480763, + "learning_rate": 1.0946098079992009e-05, + "loss": 0.7656, + "step": 5903 + }, + { + "epoch": 0.48582596173626824, + "grad_norm": 3.0398875309543913, + "learning_rate": 1.094344469881042e-05, + "loss": 0.7576, + "step": 5904 + }, + { + "epoch": 0.4859082493314133, + "grad_norm": 2.786276441249452, + "learning_rate": 1.0940791250608025e-05, + "loss": 0.7271, + "step": 5905 + }, + { + "epoch": 0.4859905369265583, + "grad_norm": 3.30256925158889, + "learning_rate": 1.0938137735573326e-05, + "loss": 0.7348, + "step": 5906 + }, + { + "epoch": 0.48607282452170336, + "grad_norm": 3.057034167936319, + "learning_rate": 1.0935484153894823e-05, + "loss": 0.7389, + "step": 5907 + }, + { + "epoch": 0.48615511211684836, + "grad_norm": 3.8187285052341693, + "learning_rate": 1.093283050576102e-05, + "loss": 0.7467, + "step": 5908 + }, + { + "epoch": 0.4862373997119934, + "grad_norm": 0.4219217778521901, + "learning_rate": 1.0930176791360431e-05, + "loss": 0.5127, + "step": 5909 + }, + { + "epoch": 0.4863196873071384, + "grad_norm": 2.0917132926499407, + "learning_rate": 1.0927523010881571e-05, + "loss": 0.7571, + "step": 5910 + }, + { + "epoch": 0.4864019749022835, + "grad_norm": 3.706537248450162, + "learning_rate": 1.092486916451296e-05, + "loss": 0.7307, + "step": 5911 + }, + { + "epoch": 0.4864842624974285, + "grad_norm": 4.598371252379631, + "learning_rate": 1.0922215252443123e-05, + "loss": 0.7358, + "step": 5912 + }, + { + "epoch": 0.48656655009257355, + "grad_norm": 3.756247632099237, + "learning_rate": 1.0919561274860587e-05, + "loss": 0.7349, + "step": 5913 + }, + { + "epoch": 0.48664883768771855, + "grad_norm": 2.7813121919630777, + "learning_rate": 1.091690723195389e-05, + "loss": 0.7483, + "step": 5914 + }, + { + "epoch": 0.4867311252828636, + "grad_norm": 2.6735165747158693, + "learning_rate": 1.0914253123911569e-05, + "loss": 0.7773, + "step": 5915 + }, + { + "epoch": 0.48681341287800867, + "grad_norm": 3.132191879377762, + "learning_rate": 1.0911598950922167e-05, + "loss": 0.7157, + "step": 5916 + }, + { + "epoch": 0.48689570047315367, + "grad_norm": 3.357033400060678, + "learning_rate": 1.0908944713174237e-05, + "loss": 0.7578, + "step": 5917 + }, + { + "epoch": 0.48697798806829873, + "grad_norm": 3.1270476502172793, + "learning_rate": 1.0906290410856326e-05, + "loss": 0.7494, + "step": 5918 + }, + { + "epoch": 0.48706027566344373, + "grad_norm": 3.185665158496523, + "learning_rate": 1.0903636044156992e-05, + "loss": 0.7742, + "step": 5919 + }, + { + "epoch": 0.4871425632585888, + "grad_norm": 0.4125940179088418, + "learning_rate": 1.0900981613264802e-05, + "loss": 0.4858, + "step": 5920 + }, + { + "epoch": 0.4872248508537338, + "grad_norm": 2.742328054973892, + "learning_rate": 1.0898327118368317e-05, + "loss": 0.752, + "step": 5921 + }, + { + "epoch": 0.48730713844887885, + "grad_norm": 2.7945171349202536, + "learning_rate": 1.089567255965611e-05, + "loss": 0.7872, + "step": 5922 + }, + { + "epoch": 0.48738942604402385, + "grad_norm": 2.8323346292722054, + "learning_rate": 1.0893017937316758e-05, + "loss": 0.7594, + "step": 5923 + }, + { + "epoch": 0.4874717136391689, + "grad_norm": 2.2636349203178616, + "learning_rate": 1.0890363251538838e-05, + "loss": 0.7603, + "step": 5924 + }, + { + "epoch": 0.4875540012343139, + "grad_norm": 2.500932436229604, + "learning_rate": 1.0887708502510938e-05, + "loss": 0.7441, + "step": 5925 + }, + { + "epoch": 0.487636288829459, + "grad_norm": 0.43668529542989243, + "learning_rate": 1.0885053690421648e-05, + "loss": 0.4944, + "step": 5926 + }, + { + "epoch": 0.487718576424604, + "grad_norm": 2.5214223483357903, + "learning_rate": 1.0882398815459554e-05, + "loss": 0.7383, + "step": 5927 + }, + { + "epoch": 0.48780086401974904, + "grad_norm": 2.4492406142260656, + "learning_rate": 1.0879743877813262e-05, + "loss": 0.7679, + "step": 5928 + }, + { + "epoch": 0.48788315161489404, + "grad_norm": 2.344349993716172, + "learning_rate": 1.0877088877671374e-05, + "loss": 0.7571, + "step": 5929 + }, + { + "epoch": 0.4879654392100391, + "grad_norm": 3.31854002702116, + "learning_rate": 1.0874433815222492e-05, + "loss": 0.7445, + "step": 5930 + }, + { + "epoch": 0.4880477268051841, + "grad_norm": 3.0376670967355413, + "learning_rate": 1.087177869065523e-05, + "loss": 0.7592, + "step": 5931 + }, + { + "epoch": 0.48813001440032916, + "grad_norm": 2.9084850730831673, + "learning_rate": 1.0869123504158205e-05, + "loss": 0.7442, + "step": 5932 + }, + { + "epoch": 0.48821230199547416, + "grad_norm": 0.41236335162588506, + "learning_rate": 1.0866468255920037e-05, + "loss": 0.5313, + "step": 5933 + }, + { + "epoch": 0.4882945895906192, + "grad_norm": 2.6174243250250835, + "learning_rate": 1.0863812946129348e-05, + "loss": 0.7512, + "step": 5934 + }, + { + "epoch": 0.4883768771857642, + "grad_norm": 0.4291259052271163, + "learning_rate": 1.086115757497477e-05, + "loss": 0.4903, + "step": 5935 + }, + { + "epoch": 0.4884591647809093, + "grad_norm": 4.034748484716541, + "learning_rate": 1.0858502142644936e-05, + "loss": 0.7447, + "step": 5936 + }, + { + "epoch": 0.4885414523760543, + "grad_norm": 0.41663413292064316, + "learning_rate": 1.085584664932848e-05, + "loss": 0.5162, + "step": 5937 + }, + { + "epoch": 0.48862373997119934, + "grad_norm": 0.4259087849514523, + "learning_rate": 1.085319109521405e-05, + "loss": 0.4888, + "step": 5938 + }, + { + "epoch": 0.48870602756634435, + "grad_norm": 3.271446712755462, + "learning_rate": 1.0850535480490288e-05, + "loss": 0.7331, + "step": 5939 + }, + { + "epoch": 0.4887883151614894, + "grad_norm": 2.3346157993031507, + "learning_rate": 1.0847879805345842e-05, + "loss": 0.7195, + "step": 5940 + }, + { + "epoch": 0.48887060275663446, + "grad_norm": 2.631609304254791, + "learning_rate": 1.0845224069969375e-05, + "loss": 0.7475, + "step": 5941 + }, + { + "epoch": 0.48895289035177947, + "grad_norm": 2.7486552258625663, + "learning_rate": 1.084256827454954e-05, + "loss": 0.7338, + "step": 5942 + }, + { + "epoch": 0.4890351779469245, + "grad_norm": 5.0377005230372855, + "learning_rate": 1.0839912419275002e-05, + "loss": 0.7538, + "step": 5943 + }, + { + "epoch": 0.48911746554206953, + "grad_norm": 0.4363180269824626, + "learning_rate": 1.0837256504334432e-05, + "loss": 0.5194, + "step": 5944 + }, + { + "epoch": 0.4891997531372146, + "grad_norm": 2.5779110803209857, + "learning_rate": 1.0834600529916497e-05, + "loss": 0.7283, + "step": 5945 + }, + { + "epoch": 0.4892820407323596, + "grad_norm": 0.42456102626586856, + "learning_rate": 1.0831944496209874e-05, + "loss": 0.5026, + "step": 5946 + }, + { + "epoch": 0.48936432832750465, + "grad_norm": 3.249510773605525, + "learning_rate": 1.0829288403403246e-05, + "loss": 0.7522, + "step": 5947 + }, + { + "epoch": 0.48944661592264965, + "grad_norm": 3.203160802293661, + "learning_rate": 1.0826632251685295e-05, + "loss": 0.734, + "step": 5948 + }, + { + "epoch": 0.4895289035177947, + "grad_norm": 2.4540275186923655, + "learning_rate": 1.0823976041244712e-05, + "loss": 0.7571, + "step": 5949 + }, + { + "epoch": 0.4896111911129397, + "grad_norm": 0.4267091046688989, + "learning_rate": 1.0821319772270188e-05, + "loss": 0.5096, + "step": 5950 + }, + { + "epoch": 0.4896934787080848, + "grad_norm": 2.6376648124871314, + "learning_rate": 1.0818663444950421e-05, + "loss": 0.7656, + "step": 5951 + }, + { + "epoch": 0.4897757663032298, + "grad_norm": 3.5190408049773008, + "learning_rate": 1.0816007059474112e-05, + "loss": 0.7333, + "step": 5952 + }, + { + "epoch": 0.48985805389837483, + "grad_norm": 7.657086481505059, + "learning_rate": 1.0813350616029966e-05, + "loss": 0.7608, + "step": 5953 + }, + { + "epoch": 0.48994034149351984, + "grad_norm": 2.1896664291295247, + "learning_rate": 1.0810694114806693e-05, + "loss": 0.7331, + "step": 5954 + }, + { + "epoch": 0.4900226290886649, + "grad_norm": 2.336218902999158, + "learning_rate": 1.080803755599301e-05, + "loss": 0.7596, + "step": 5955 + }, + { + "epoch": 0.4901049166838099, + "grad_norm": 2.194112605802613, + "learning_rate": 1.0805380939777627e-05, + "loss": 0.7382, + "step": 5956 + }, + { + "epoch": 0.49018720427895496, + "grad_norm": 2.193013612191447, + "learning_rate": 1.080272426634927e-05, + "loss": 0.7523, + "step": 5957 + }, + { + "epoch": 0.49026949187409996, + "grad_norm": 2.5937942147916555, + "learning_rate": 1.0800067535896668e-05, + "loss": 0.7705, + "step": 5958 + }, + { + "epoch": 0.490351779469245, + "grad_norm": 2.473993458502424, + "learning_rate": 1.0797410748608546e-05, + "loss": 0.7364, + "step": 5959 + }, + { + "epoch": 0.49043406706439, + "grad_norm": 1.8578265761387351, + "learning_rate": 1.0794753904673638e-05, + "loss": 0.7434, + "step": 5960 + }, + { + "epoch": 0.4905163546595351, + "grad_norm": 1.8170923882874057, + "learning_rate": 1.0792097004280689e-05, + "loss": 0.745, + "step": 5961 + }, + { + "epoch": 0.4905986422546801, + "grad_norm": 5.579064717634074, + "learning_rate": 1.0789440047618429e-05, + "loss": 0.7779, + "step": 5962 + }, + { + "epoch": 0.49068092984982514, + "grad_norm": 2.5838370803995576, + "learning_rate": 1.0786783034875614e-05, + "loss": 0.7401, + "step": 5963 + }, + { + "epoch": 0.49076321744497015, + "grad_norm": 2.7954611873176507, + "learning_rate": 1.0784125966240992e-05, + "loss": 0.7605, + "step": 5964 + }, + { + "epoch": 0.4908455050401152, + "grad_norm": 2.9198130890298004, + "learning_rate": 1.0781468841903313e-05, + "loss": 0.7367, + "step": 5965 + }, + { + "epoch": 0.4909277926352602, + "grad_norm": 0.42722425849438456, + "learning_rate": 1.0778811662051335e-05, + "loss": 0.5319, + "step": 5966 + }, + { + "epoch": 0.49101008023040527, + "grad_norm": 2.4886886801236208, + "learning_rate": 1.0776154426873829e-05, + "loss": 0.7507, + "step": 5967 + }, + { + "epoch": 0.4910923678255503, + "grad_norm": 2.9213644194564474, + "learning_rate": 1.0773497136559547e-05, + "loss": 0.7493, + "step": 5968 + }, + { + "epoch": 0.4911746554206953, + "grad_norm": 2.2623352773185847, + "learning_rate": 1.0770839791297267e-05, + "loss": 0.7356, + "step": 5969 + }, + { + "epoch": 0.4912569430158404, + "grad_norm": 2.0688746699795977, + "learning_rate": 1.0768182391275762e-05, + "loss": 0.719, + "step": 5970 + }, + { + "epoch": 0.4913392306109854, + "grad_norm": 2.5343354866282555, + "learning_rate": 1.0765524936683808e-05, + "loss": 0.7504, + "step": 5971 + }, + { + "epoch": 0.49142151820613045, + "grad_norm": 2.492483379163454, + "learning_rate": 1.076286742771019e-05, + "loss": 0.7496, + "step": 5972 + }, + { + "epoch": 0.49150380580127545, + "grad_norm": 4.047264137622197, + "learning_rate": 1.0760209864543687e-05, + "loss": 0.75, + "step": 5973 + }, + { + "epoch": 0.4915860933964205, + "grad_norm": 2.8851857059931336, + "learning_rate": 1.0757552247373091e-05, + "loss": 0.7889, + "step": 5974 + }, + { + "epoch": 0.4916683809915655, + "grad_norm": 2.524693840748495, + "learning_rate": 1.0754894576387194e-05, + "loss": 0.7342, + "step": 5975 + }, + { + "epoch": 0.49175066858671057, + "grad_norm": 3.677543380909794, + "learning_rate": 1.0752236851774796e-05, + "loss": 0.793, + "step": 5976 + }, + { + "epoch": 0.4918329561818556, + "grad_norm": 1.8129483420547836, + "learning_rate": 1.0749579073724691e-05, + "loss": 0.7337, + "step": 5977 + }, + { + "epoch": 0.49191524377700063, + "grad_norm": 2.007125404797403, + "learning_rate": 1.074692124242569e-05, + "loss": 0.7421, + "step": 5978 + }, + { + "epoch": 0.49199753137214564, + "grad_norm": 0.40599757929440744, + "learning_rate": 1.07442633580666e-05, + "loss": 0.5275, + "step": 5979 + }, + { + "epoch": 0.4920798189672907, + "grad_norm": 1.9689085127252406, + "learning_rate": 1.074160542083623e-05, + "loss": 0.7232, + "step": 5980 + }, + { + "epoch": 0.4921621065624357, + "grad_norm": 2.1970912530316022, + "learning_rate": 1.0738947430923397e-05, + "loss": 0.7363, + "step": 5981 + }, + { + "epoch": 0.49224439415758076, + "grad_norm": 2.1237699472512, + "learning_rate": 1.073628938851692e-05, + "loss": 0.7593, + "step": 5982 + }, + { + "epoch": 0.49232668175272576, + "grad_norm": 2.1629226173379585, + "learning_rate": 1.0733631293805622e-05, + "loss": 0.7513, + "step": 5983 + }, + { + "epoch": 0.4924089693478708, + "grad_norm": 2.1805267863792324, + "learning_rate": 1.0730973146978332e-05, + "loss": 0.7471, + "step": 5984 + }, + { + "epoch": 0.4924912569430158, + "grad_norm": 2.0932680543383326, + "learning_rate": 1.0728314948223877e-05, + "loss": 0.7498, + "step": 5985 + }, + { + "epoch": 0.4925735445381609, + "grad_norm": 3.6566437259596665, + "learning_rate": 1.0725656697731092e-05, + "loss": 0.7227, + "step": 5986 + }, + { + "epoch": 0.4926558321333059, + "grad_norm": 2.356637888619104, + "learning_rate": 1.0722998395688817e-05, + "loss": 0.7152, + "step": 5987 + }, + { + "epoch": 0.49273811972845094, + "grad_norm": 2.3654002150855313, + "learning_rate": 1.0720340042285893e-05, + "loss": 0.7222, + "step": 5988 + }, + { + "epoch": 0.49282040732359594, + "grad_norm": 2.452819374617149, + "learning_rate": 1.0717681637711162e-05, + "loss": 0.7401, + "step": 5989 + }, + { + "epoch": 0.492902694918741, + "grad_norm": 3.6444442955610876, + "learning_rate": 1.071502318215348e-05, + "loss": 0.7414, + "step": 5990 + }, + { + "epoch": 0.492984982513886, + "grad_norm": 2.1610708270323897, + "learning_rate": 1.071236467580169e-05, + "loss": 0.7544, + "step": 5991 + }, + { + "epoch": 0.49306727010903106, + "grad_norm": 2.0366493210771575, + "learning_rate": 1.0709706118844652e-05, + "loss": 0.7499, + "step": 5992 + }, + { + "epoch": 0.4931495577041761, + "grad_norm": 2.380443917228487, + "learning_rate": 1.070704751147123e-05, + "loss": 0.7256, + "step": 5993 + }, + { + "epoch": 0.4932318452993211, + "grad_norm": 1.9782304050226442, + "learning_rate": 1.070438885387028e-05, + "loss": 0.7523, + "step": 5994 + }, + { + "epoch": 0.4933141328944662, + "grad_norm": 0.42407090897517796, + "learning_rate": 1.0701730146230675e-05, + "loss": 0.505, + "step": 5995 + }, + { + "epoch": 0.4933964204896112, + "grad_norm": 2.057305422674386, + "learning_rate": 1.0699071388741281e-05, + "loss": 0.749, + "step": 5996 + }, + { + "epoch": 0.49347870808475625, + "grad_norm": 2.293118086191529, + "learning_rate": 1.0696412581590976e-05, + "loss": 0.7533, + "step": 5997 + }, + { + "epoch": 0.49356099567990125, + "grad_norm": 2.6822080725595256, + "learning_rate": 1.0693753724968632e-05, + "loss": 0.7532, + "step": 5998 + }, + { + "epoch": 0.4936432832750463, + "grad_norm": 0.4328429783180453, + "learning_rate": 1.0691094819063137e-05, + "loss": 0.5439, + "step": 5999 + }, + { + "epoch": 0.4937255708701913, + "grad_norm": 0.410079396639334, + "learning_rate": 1.0688435864063366e-05, + "loss": 0.5191, + "step": 6000 + }, + { + "epoch": 0.49380785846533637, + "grad_norm": 0.40366330880327406, + "learning_rate": 1.0685776860158215e-05, + "loss": 0.5036, + "step": 6001 + }, + { + "epoch": 0.49389014606048137, + "grad_norm": 2.1870831451604498, + "learning_rate": 1.0683117807536574e-05, + "loss": 0.7395, + "step": 6002 + }, + { + "epoch": 0.49397243365562643, + "grad_norm": 2.370545012883816, + "learning_rate": 1.0680458706387333e-05, + "loss": 0.7449, + "step": 6003 + }, + { + "epoch": 0.49405472125077143, + "grad_norm": 2.3089874918556452, + "learning_rate": 1.0677799556899396e-05, + "loss": 0.7498, + "step": 6004 + }, + { + "epoch": 0.4941370088459165, + "grad_norm": 0.4266572357417992, + "learning_rate": 1.0675140359261662e-05, + "loss": 0.5247, + "step": 6005 + }, + { + "epoch": 0.4942192964410615, + "grad_norm": 2.483515497896064, + "learning_rate": 1.0672481113663036e-05, + "loss": 0.7577, + "step": 6006 + }, + { + "epoch": 0.49430158403620655, + "grad_norm": 0.4229978853843417, + "learning_rate": 1.0669821820292427e-05, + "loss": 0.4991, + "step": 6007 + }, + { + "epoch": 0.49438387163135156, + "grad_norm": 3.010844763084947, + "learning_rate": 1.0667162479338748e-05, + "loss": 0.7352, + "step": 6008 + }, + { + "epoch": 0.4944661592264966, + "grad_norm": 2.31615379875278, + "learning_rate": 1.0664503090990914e-05, + "loss": 0.7583, + "step": 6009 + }, + { + "epoch": 0.4945484468216416, + "grad_norm": 2.2782514157693905, + "learning_rate": 1.0661843655437843e-05, + "loss": 0.7311, + "step": 6010 + }, + { + "epoch": 0.4946307344167867, + "grad_norm": 0.4404808215131872, + "learning_rate": 1.0659184172868458e-05, + "loss": 0.5146, + "step": 6011 + }, + { + "epoch": 0.4947130220119317, + "grad_norm": 2.67949634687975, + "learning_rate": 1.065652464347168e-05, + "loss": 0.7338, + "step": 6012 + }, + { + "epoch": 0.49479530960707674, + "grad_norm": 3.107240779831543, + "learning_rate": 1.0653865067436442e-05, + "loss": 0.7305, + "step": 6013 + }, + { + "epoch": 0.49487759720222174, + "grad_norm": 2.413063639928704, + "learning_rate": 1.065120544495168e-05, + "loss": 0.7406, + "step": 6014 + }, + { + "epoch": 0.4949598847973668, + "grad_norm": 2.289194242007833, + "learning_rate": 1.064854577620632e-05, + "loss": 0.7891, + "step": 6015 + }, + { + "epoch": 0.4950421723925118, + "grad_norm": 0.44623796263977006, + "learning_rate": 1.0645886061389308e-05, + "loss": 0.4777, + "step": 6016 + }, + { + "epoch": 0.49512445998765686, + "grad_norm": 2.6030473629371746, + "learning_rate": 1.0643226300689579e-05, + "loss": 0.7516, + "step": 6017 + }, + { + "epoch": 0.49520674758280187, + "grad_norm": 2.276132531564297, + "learning_rate": 1.0640566494296085e-05, + "loss": 0.742, + "step": 6018 + }, + { + "epoch": 0.4952890351779469, + "grad_norm": 2.155295058129665, + "learning_rate": 1.0637906642397775e-05, + "loss": 0.7399, + "step": 6019 + }, + { + "epoch": 0.495371322773092, + "grad_norm": 0.44687683789119254, + "learning_rate": 1.0635246745183592e-05, + "loss": 0.502, + "step": 6020 + }, + { + "epoch": 0.495453610368237, + "grad_norm": 2.205794614701129, + "learning_rate": 1.0632586802842497e-05, + "loss": 0.716, + "step": 6021 + }, + { + "epoch": 0.49553589796338204, + "grad_norm": 0.4185104674717768, + "learning_rate": 1.0629926815563451e-05, + "loss": 0.5138, + "step": 6022 + }, + { + "epoch": 0.49561818555852705, + "grad_norm": 2.48015784421315, + "learning_rate": 1.0627266783535408e-05, + "loss": 0.7437, + "step": 6023 + }, + { + "epoch": 0.4957004731536721, + "grad_norm": 2.2784849583637707, + "learning_rate": 1.0624606706947336e-05, + "loss": 0.774, + "step": 6024 + }, + { + "epoch": 0.4957827607488171, + "grad_norm": 2.414520876863104, + "learning_rate": 1.0621946585988206e-05, + "loss": 0.7557, + "step": 6025 + }, + { + "epoch": 0.49586504834396217, + "grad_norm": 9.14071234781431, + "learning_rate": 1.061928642084698e-05, + "loss": 0.714, + "step": 6026 + }, + { + "epoch": 0.49594733593910717, + "grad_norm": 0.44939131042205466, + "learning_rate": 1.061662621171264e-05, + "loss": 0.5141, + "step": 6027 + }, + { + "epoch": 0.49602962353425223, + "grad_norm": 0.42217829727118356, + "learning_rate": 1.061396595877416e-05, + "loss": 0.4969, + "step": 6028 + }, + { + "epoch": 0.49611191112939723, + "grad_norm": 2.5896286654486365, + "learning_rate": 1.0611305662220519e-05, + "loss": 0.7604, + "step": 6029 + }, + { + "epoch": 0.4961941987245423, + "grad_norm": 0.40362276720892865, + "learning_rate": 1.06086453222407e-05, + "loss": 0.501, + "step": 6030 + }, + { + "epoch": 0.4962764863196873, + "grad_norm": 2.6078628629401917, + "learning_rate": 1.0605984939023694e-05, + "loss": 0.7301, + "step": 6031 + }, + { + "epoch": 0.49635877391483235, + "grad_norm": 2.577565625997828, + "learning_rate": 1.0603324512758486e-05, + "loss": 0.7234, + "step": 6032 + }, + { + "epoch": 0.49644106150997735, + "grad_norm": 2.5446294013333555, + "learning_rate": 1.0600664043634069e-05, + "loss": 0.7351, + "step": 6033 + }, + { + "epoch": 0.4965233491051224, + "grad_norm": 2.5136581702713054, + "learning_rate": 1.059800353183944e-05, + "loss": 0.7515, + "step": 6034 + }, + { + "epoch": 0.4966056367002674, + "grad_norm": 2.7413351853281065, + "learning_rate": 1.0595342977563594e-05, + "loss": 0.7523, + "step": 6035 + }, + { + "epoch": 0.4966879242954125, + "grad_norm": 0.42448406278374273, + "learning_rate": 1.0592682380995538e-05, + "loss": 0.4892, + "step": 6036 + }, + { + "epoch": 0.4967702118905575, + "grad_norm": 2.7971747306091537, + "learning_rate": 1.0590021742324272e-05, + "loss": 0.7405, + "step": 6037 + }, + { + "epoch": 0.49685249948570254, + "grad_norm": 2.8909346896363903, + "learning_rate": 1.0587361061738806e-05, + "loss": 0.803, + "step": 6038 + }, + { + "epoch": 0.49693478708084754, + "grad_norm": 2.513391100557382, + "learning_rate": 1.0584700339428147e-05, + "loss": 0.7336, + "step": 6039 + }, + { + "epoch": 0.4970170746759926, + "grad_norm": 2.0641517069583974, + "learning_rate": 1.0582039575581315e-05, + "loss": 0.7323, + "step": 6040 + }, + { + "epoch": 0.4970993622711376, + "grad_norm": 2.617851672422438, + "learning_rate": 1.0579378770387318e-05, + "loss": 0.7619, + "step": 6041 + }, + { + "epoch": 0.49718164986628266, + "grad_norm": 3.479833287187143, + "learning_rate": 1.0576717924035182e-05, + "loss": 0.7676, + "step": 6042 + }, + { + "epoch": 0.49726393746142766, + "grad_norm": 2.2623887129549067, + "learning_rate": 1.0574057036713926e-05, + "loss": 0.758, + "step": 6043 + }, + { + "epoch": 0.4973462250565727, + "grad_norm": 3.680709873189796, + "learning_rate": 1.057139610861258e-05, + "loss": 0.7017, + "step": 6044 + }, + { + "epoch": 0.4974285126517178, + "grad_norm": 0.4390267833661819, + "learning_rate": 1.0568735139920163e-05, + "loss": 0.5087, + "step": 6045 + }, + { + "epoch": 0.4975108002468628, + "grad_norm": 2.5478523672879594, + "learning_rate": 1.0566074130825713e-05, + "loss": 0.7386, + "step": 6046 + }, + { + "epoch": 0.49759308784200784, + "grad_norm": 2.8494640624311276, + "learning_rate": 1.0563413081518263e-05, + "loss": 0.7281, + "step": 6047 + }, + { + "epoch": 0.49767537543715284, + "grad_norm": 2.7558107668725467, + "learning_rate": 1.0560751992186849e-05, + "loss": 0.7436, + "step": 6048 + }, + { + "epoch": 0.4977576630322979, + "grad_norm": 2.2132948255827913, + "learning_rate": 1.0558090863020509e-05, + "loss": 0.7294, + "step": 6049 + }, + { + "epoch": 0.4978399506274429, + "grad_norm": 2.4271122420279574, + "learning_rate": 1.0555429694208288e-05, + "loss": 0.7434, + "step": 6050 + }, + { + "epoch": 0.49792223822258797, + "grad_norm": 2.3676380256440956, + "learning_rate": 1.0552768485939232e-05, + "loss": 0.7815, + "step": 6051 + }, + { + "epoch": 0.49800452581773297, + "grad_norm": 2.2456935044869826, + "learning_rate": 1.055010723840238e-05, + "loss": 0.7176, + "step": 6052 + }, + { + "epoch": 0.498086813412878, + "grad_norm": 3.586808235984742, + "learning_rate": 1.0547445951786794e-05, + "loss": 0.7617, + "step": 6053 + }, + { + "epoch": 0.49816910100802303, + "grad_norm": 2.4198052324972266, + "learning_rate": 1.0544784626281523e-05, + "loss": 0.7597, + "step": 6054 + }, + { + "epoch": 0.4982513886031681, + "grad_norm": 2.1714498355592418, + "learning_rate": 1.0542123262075622e-05, + "loss": 0.7844, + "step": 6055 + }, + { + "epoch": 0.4983336761983131, + "grad_norm": 3.0337922159772828, + "learning_rate": 1.0539461859358154e-05, + "loss": 0.7644, + "step": 6056 + }, + { + "epoch": 0.49841596379345815, + "grad_norm": 3.8729686646436696, + "learning_rate": 1.0536800418318175e-05, + "loss": 0.7213, + "step": 6057 + }, + { + "epoch": 0.49849825138860315, + "grad_norm": 2.562630165738044, + "learning_rate": 1.0534138939144755e-05, + "loss": 0.7743, + "step": 6058 + }, + { + "epoch": 0.4985805389837482, + "grad_norm": 2.451740036126474, + "learning_rate": 1.0531477422026957e-05, + "loss": 0.7619, + "step": 6059 + }, + { + "epoch": 0.4986628265788932, + "grad_norm": 2.466977624718412, + "learning_rate": 1.0528815867153857e-05, + "loss": 0.7581, + "step": 6060 + }, + { + "epoch": 0.4987451141740383, + "grad_norm": 2.048508699442771, + "learning_rate": 1.052615427471452e-05, + "loss": 0.7309, + "step": 6061 + }, + { + "epoch": 0.4988274017691833, + "grad_norm": 2.3361006143240206, + "learning_rate": 1.0523492644898023e-05, + "loss": 0.7237, + "step": 6062 + }, + { + "epoch": 0.49890968936432833, + "grad_norm": 2.8231150552296698, + "learning_rate": 1.052083097789345e-05, + "loss": 0.7553, + "step": 6063 + }, + { + "epoch": 0.49899197695947334, + "grad_norm": 2.2871564945855716, + "learning_rate": 1.0518169273889876e-05, + "loss": 0.7675, + "step": 6064 + }, + { + "epoch": 0.4990742645546184, + "grad_norm": 2.203467699866617, + "learning_rate": 1.0515507533076384e-05, + "loss": 0.7604, + "step": 6065 + }, + { + "epoch": 0.4991565521497634, + "grad_norm": 3.0762316195193926, + "learning_rate": 1.051284575564206e-05, + "loss": 0.735, + "step": 6066 + }, + { + "epoch": 0.49923883974490846, + "grad_norm": 2.1924519377157314, + "learning_rate": 1.0510183941775995e-05, + "loss": 0.7395, + "step": 6067 + }, + { + "epoch": 0.49932112734005346, + "grad_norm": 0.4846902908290774, + "learning_rate": 1.0507522091667282e-05, + "loss": 0.5415, + "step": 6068 + }, + { + "epoch": 0.4994034149351985, + "grad_norm": 0.4372369265926279, + "learning_rate": 1.0504860205505008e-05, + "loss": 0.497, + "step": 6069 + }, + { + "epoch": 0.4994857025303436, + "grad_norm": 2.284362250366818, + "learning_rate": 1.0502198283478274e-05, + "loss": 0.7521, + "step": 6070 + }, + { + "epoch": 0.4995679901254886, + "grad_norm": 6.393096290262044, + "learning_rate": 1.0499536325776176e-05, + "loss": 0.7408, + "step": 6071 + }, + { + "epoch": 0.49965027772063364, + "grad_norm": 0.40950861176666326, + "learning_rate": 1.0496874332587817e-05, + "loss": 0.4661, + "step": 6072 + }, + { + "epoch": 0.49973256531577864, + "grad_norm": 2.694905352932295, + "learning_rate": 1.0494212304102299e-05, + "loss": 0.7309, + "step": 6073 + }, + { + "epoch": 0.4998148529109237, + "grad_norm": 2.3534570299785353, + "learning_rate": 1.049155024050873e-05, + "loss": 0.7091, + "step": 6074 + }, + { + "epoch": 0.4998971405060687, + "grad_norm": 2.5620256391859413, + "learning_rate": 1.0488888141996218e-05, + "loss": 0.7503, + "step": 6075 + }, + { + "epoch": 0.49997942810121376, + "grad_norm": 2.331405488595376, + "learning_rate": 1.0486226008753873e-05, + "loss": 0.7827, + "step": 6076 + }, + { + "epoch": 0.5000617156963588, + "grad_norm": 1.940834432972169, + "learning_rate": 1.0483563840970813e-05, + "loss": 0.7538, + "step": 6077 + }, + { + "epoch": 0.5001440032915038, + "grad_norm": 3.2090599790759637, + "learning_rate": 1.048090163883615e-05, + "loss": 0.7318, + "step": 6078 + }, + { + "epoch": 0.5002262908866488, + "grad_norm": 2.3655365822779424, + "learning_rate": 1.0478239402539e-05, + "loss": 0.7216, + "step": 6079 + }, + { + "epoch": 0.5003085784817939, + "grad_norm": 2.1753268254223737, + "learning_rate": 1.0475577132268493e-05, + "loss": 0.7383, + "step": 6080 + }, + { + "epoch": 0.500390866076939, + "grad_norm": 2.2231089935271418, + "learning_rate": 1.0472914828213745e-05, + "loss": 0.7437, + "step": 6081 + }, + { + "epoch": 0.5004731536720839, + "grad_norm": 2.333644611481843, + "learning_rate": 1.047025249056388e-05, + "loss": 0.7171, + "step": 6082 + }, + { + "epoch": 0.500555441267229, + "grad_norm": 3.3434261362174413, + "learning_rate": 1.0467590119508038e-05, + "loss": 0.7561, + "step": 6083 + }, + { + "epoch": 0.500637728862374, + "grad_norm": 2.516456800541028, + "learning_rate": 1.0464927715235336e-05, + "loss": 0.7378, + "step": 6084 + }, + { + "epoch": 0.5007200164575191, + "grad_norm": 3.0748151891768205, + "learning_rate": 1.0462265277934913e-05, + "loss": 0.7611, + "step": 6085 + }, + { + "epoch": 0.500802304052664, + "grad_norm": 0.44474650892462286, + "learning_rate": 1.0459602807795907e-05, + "loss": 0.5217, + "step": 6086 + }, + { + "epoch": 0.5008845916478091, + "grad_norm": 3.158615441115673, + "learning_rate": 1.0456940305007451e-05, + "loss": 0.7726, + "step": 6087 + }, + { + "epoch": 0.5009668792429541, + "grad_norm": 2.8251082429131746, + "learning_rate": 1.0454277769758685e-05, + "loss": 0.7349, + "step": 6088 + }, + { + "epoch": 0.5010491668380992, + "grad_norm": 2.551445392275488, + "learning_rate": 1.0451615202238755e-05, + "loss": 0.7364, + "step": 6089 + }, + { + "epoch": 0.5011314544332441, + "grad_norm": 2.509451061156119, + "learning_rate": 1.0448952602636801e-05, + "loss": 0.7452, + "step": 6090 + }, + { + "epoch": 0.5012137420283892, + "grad_norm": 2.621048517429974, + "learning_rate": 1.0446289971141973e-05, + "loss": 0.7664, + "step": 6091 + }, + { + "epoch": 0.5012960296235343, + "grad_norm": 2.2460100441830546, + "learning_rate": 1.044362730794342e-05, + "loss": 0.7461, + "step": 6092 + }, + { + "epoch": 0.5013783172186793, + "grad_norm": 2.412160042352132, + "learning_rate": 1.0440964613230294e-05, + "loss": 0.7558, + "step": 6093 + }, + { + "epoch": 0.5014606048138243, + "grad_norm": 2.398023215136683, + "learning_rate": 1.0438301887191742e-05, + "loss": 0.757, + "step": 6094 + }, + { + "epoch": 0.5015428924089693, + "grad_norm": 2.4920748802776305, + "learning_rate": 1.043563913001693e-05, + "loss": 0.7205, + "step": 6095 + }, + { + "epoch": 0.5016251800041144, + "grad_norm": 0.4338246941571097, + "learning_rate": 1.043297634189501e-05, + "loss": 0.5177, + "step": 6096 + }, + { + "epoch": 0.5017074675992594, + "grad_norm": 2.9525982819252694, + "learning_rate": 1.0430313523015138e-05, + "loss": 0.7285, + "step": 6097 + }, + { + "epoch": 0.5017897551944045, + "grad_norm": 3.7697645062100738, + "learning_rate": 1.042765067356649e-05, + "loss": 0.7688, + "step": 6098 + }, + { + "epoch": 0.5018720427895494, + "grad_norm": 2.376963508657987, + "learning_rate": 1.0424987793738216e-05, + "loss": 0.7298, + "step": 6099 + }, + { + "epoch": 0.5019543303846945, + "grad_norm": 3.4321281748959476, + "learning_rate": 1.0422324883719492e-05, + "loss": 0.7274, + "step": 6100 + }, + { + "epoch": 0.5020366179798396, + "grad_norm": 3.596886611895181, + "learning_rate": 1.0419661943699482e-05, + "loss": 0.7427, + "step": 6101 + }, + { + "epoch": 0.5021189055749846, + "grad_norm": 2.2366517787634694, + "learning_rate": 1.0416998973867356e-05, + "loss": 0.7308, + "step": 6102 + }, + { + "epoch": 0.5022011931701296, + "grad_norm": 4.401439790927072, + "learning_rate": 1.0414335974412295e-05, + "loss": 0.7355, + "step": 6103 + }, + { + "epoch": 0.5022834807652746, + "grad_norm": 2.469972463368188, + "learning_rate": 1.0411672945523466e-05, + "loss": 0.7572, + "step": 6104 + }, + { + "epoch": 0.5023657683604197, + "grad_norm": 2.253612720260971, + "learning_rate": 1.040900988739005e-05, + "loss": 0.715, + "step": 6105 + }, + { + "epoch": 0.5024480559555647, + "grad_norm": 2.686661910538296, + "learning_rate": 1.0406346800201226e-05, + "loss": 0.7315, + "step": 6106 + }, + { + "epoch": 0.5025303435507097, + "grad_norm": 2.186699181108991, + "learning_rate": 1.0403683684146172e-05, + "loss": 0.7406, + "step": 6107 + }, + { + "epoch": 0.5026126311458547, + "grad_norm": 4.380839409695006, + "learning_rate": 1.040102053941408e-05, + "loss": 0.7479, + "step": 6108 + }, + { + "epoch": 0.5026949187409998, + "grad_norm": 2.4954364399469156, + "learning_rate": 1.0398357366194128e-05, + "loss": 0.7395, + "step": 6109 + }, + { + "epoch": 0.5027772063361449, + "grad_norm": 3.192320887842379, + "learning_rate": 1.0395694164675505e-05, + "loss": 0.7821, + "step": 6110 + }, + { + "epoch": 0.5028594939312898, + "grad_norm": 0.4317576537104797, + "learning_rate": 1.0393030935047401e-05, + "loss": 0.5173, + "step": 6111 + }, + { + "epoch": 0.5029417815264349, + "grad_norm": 2.855921071177744, + "learning_rate": 1.039036767749901e-05, + "loss": 0.7447, + "step": 6112 + }, + { + "epoch": 0.5030240691215799, + "grad_norm": 2.7454794369824236, + "learning_rate": 1.0387704392219522e-05, + "loss": 0.7516, + "step": 6113 + }, + { + "epoch": 0.503106356716725, + "grad_norm": 0.4209810191647319, + "learning_rate": 1.0385041079398133e-05, + "loss": 0.5204, + "step": 6114 + }, + { + "epoch": 0.5031886443118699, + "grad_norm": 2.5055552409349895, + "learning_rate": 1.0382377739224043e-05, + "loss": 0.7507, + "step": 6115 + }, + { + "epoch": 0.503270931907015, + "grad_norm": 2.4775435118201226, + "learning_rate": 1.037971437188645e-05, + "loss": 0.7259, + "step": 6116 + }, + { + "epoch": 0.50335321950216, + "grad_norm": 3.039902650183094, + "learning_rate": 1.0377050977574554e-05, + "loss": 0.7508, + "step": 6117 + }, + { + "epoch": 0.5034355070973051, + "grad_norm": 4.941905567213464, + "learning_rate": 1.0374387556477561e-05, + "loss": 0.75, + "step": 6118 + }, + { + "epoch": 0.5035177946924501, + "grad_norm": 2.33976961570007, + "learning_rate": 1.0371724108784675e-05, + "loss": 0.7521, + "step": 6119 + }, + { + "epoch": 0.5036000822875951, + "grad_norm": 2.45609025226816, + "learning_rate": 1.03690606346851e-05, + "loss": 0.7724, + "step": 6120 + }, + { + "epoch": 0.5036823698827402, + "grad_norm": 2.7676335315654583, + "learning_rate": 1.0366397134368051e-05, + "loss": 0.7829, + "step": 6121 + }, + { + "epoch": 0.5037646574778852, + "grad_norm": 0.4360042384466672, + "learning_rate": 1.0363733608022734e-05, + "loss": 0.5201, + "step": 6122 + }, + { + "epoch": 0.5038469450730303, + "grad_norm": 2.476706780121197, + "learning_rate": 1.0361070055838363e-05, + "loss": 0.7546, + "step": 6123 + }, + { + "epoch": 0.5039292326681752, + "grad_norm": 0.42801407425872084, + "learning_rate": 1.0358406478004155e-05, + "loss": 0.5042, + "step": 6124 + }, + { + "epoch": 0.5040115202633203, + "grad_norm": 0.4122737630940814, + "learning_rate": 1.035574287470932e-05, + "loss": 0.5214, + "step": 6125 + }, + { + "epoch": 0.5040938078584654, + "grad_norm": 2.287535574559189, + "learning_rate": 1.0353079246143084e-05, + "loss": 0.723, + "step": 6126 + }, + { + "epoch": 0.5041760954536104, + "grad_norm": 2.726040483454551, + "learning_rate": 1.0350415592494662e-05, + "loss": 0.7518, + "step": 6127 + }, + { + "epoch": 0.5042583830487554, + "grad_norm": 2.2014572152151954, + "learning_rate": 1.0347751913953275e-05, + "loss": 0.7563, + "step": 6128 + }, + { + "epoch": 0.5043406706439004, + "grad_norm": 0.4363361454379446, + "learning_rate": 1.0345088210708147e-05, + "loss": 0.5324, + "step": 6129 + }, + { + "epoch": 0.5044229582390455, + "grad_norm": 2.248103108975257, + "learning_rate": 1.034242448294851e-05, + "loss": 0.7346, + "step": 6130 + }, + { + "epoch": 0.5045052458341905, + "grad_norm": 2.046895834384303, + "learning_rate": 1.0339760730863582e-05, + "loss": 0.7601, + "step": 6131 + }, + { + "epoch": 0.5045875334293355, + "grad_norm": 5.978247321389429, + "learning_rate": 1.0337096954642595e-05, + "loss": 0.7213, + "step": 6132 + }, + { + "epoch": 0.5046698210244805, + "grad_norm": 3.00267170666837, + "learning_rate": 1.0334433154474778e-05, + "loss": 0.7389, + "step": 6133 + }, + { + "epoch": 0.5047521086196256, + "grad_norm": 2.3478158464378285, + "learning_rate": 1.0331769330549366e-05, + "loss": 0.7592, + "step": 6134 + }, + { + "epoch": 0.5048343962147707, + "grad_norm": 2.176976493677943, + "learning_rate": 1.0329105483055593e-05, + "loss": 0.7663, + "step": 6135 + }, + { + "epoch": 0.5049166838099156, + "grad_norm": 0.4470114866980031, + "learning_rate": 1.0326441612182689e-05, + "loss": 0.5191, + "step": 6136 + }, + { + "epoch": 0.5049989714050607, + "grad_norm": 2.3822863482966756, + "learning_rate": 1.03237777181199e-05, + "loss": 0.7627, + "step": 6137 + }, + { + "epoch": 0.5050812590002057, + "grad_norm": 2.2669100574443406, + "learning_rate": 1.0321113801056457e-05, + "loss": 0.7704, + "step": 6138 + }, + { + "epoch": 0.5051635465953508, + "grad_norm": 3.0438948208982923, + "learning_rate": 1.0318449861181601e-05, + "loss": 0.757, + "step": 6139 + }, + { + "epoch": 0.5052458341904957, + "grad_norm": 2.5095587353547817, + "learning_rate": 1.0315785898684579e-05, + "loss": 0.7628, + "step": 6140 + }, + { + "epoch": 0.5053281217856408, + "grad_norm": 2.768987128487559, + "learning_rate": 1.031312191375463e-05, + "loss": 0.7532, + "step": 6141 + }, + { + "epoch": 0.5054104093807859, + "grad_norm": 0.42391668590750514, + "learning_rate": 1.0310457906581001e-05, + "loss": 0.5013, + "step": 6142 + }, + { + "epoch": 0.5054926969759309, + "grad_norm": 2.3037054887134323, + "learning_rate": 1.0307793877352941e-05, + "loss": 0.7426, + "step": 6143 + }, + { + "epoch": 0.5055749845710759, + "grad_norm": 4.734843642784254, + "learning_rate": 1.0305129826259695e-05, + "loss": 0.7656, + "step": 6144 + }, + { + "epoch": 0.5056572721662209, + "grad_norm": 2.4558757771875515, + "learning_rate": 1.0302465753490514e-05, + "loss": 0.7579, + "step": 6145 + }, + { + "epoch": 0.505739559761366, + "grad_norm": 2.537080107718293, + "learning_rate": 1.0299801659234649e-05, + "loss": 0.7596, + "step": 6146 + }, + { + "epoch": 0.505821847356511, + "grad_norm": 3.099666951290331, + "learning_rate": 1.0297137543681355e-05, + "loss": 0.7189, + "step": 6147 + }, + { + "epoch": 0.5059041349516561, + "grad_norm": 2.2115335930728155, + "learning_rate": 1.0294473407019882e-05, + "loss": 0.7242, + "step": 6148 + }, + { + "epoch": 0.505986422546801, + "grad_norm": 2.771234521112867, + "learning_rate": 1.0291809249439493e-05, + "loss": 0.7643, + "step": 6149 + }, + { + "epoch": 0.5060687101419461, + "grad_norm": 2.318535810442622, + "learning_rate": 1.028914507112944e-05, + "loss": 0.7123, + "step": 6150 + }, + { + "epoch": 0.5061509977370912, + "grad_norm": 2.579614539784969, + "learning_rate": 1.0286480872278984e-05, + "loss": 0.7421, + "step": 6151 + }, + { + "epoch": 0.5062332853322362, + "grad_norm": 1.959296435346338, + "learning_rate": 1.0283816653077383e-05, + "loss": 0.7707, + "step": 6152 + }, + { + "epoch": 0.5063155729273812, + "grad_norm": 2.073040432133896, + "learning_rate": 1.0281152413713905e-05, + "loss": 0.7317, + "step": 6153 + }, + { + "epoch": 0.5063978605225262, + "grad_norm": 0.42263113407093833, + "learning_rate": 1.0278488154377806e-05, + "loss": 0.5152, + "step": 6154 + }, + { + "epoch": 0.5064801481176713, + "grad_norm": 3.3082315308533516, + "learning_rate": 1.0275823875258352e-05, + "loss": 0.7636, + "step": 6155 + }, + { + "epoch": 0.5065624357128163, + "grad_norm": 0.39304776787419665, + "learning_rate": 1.0273159576544815e-05, + "loss": 0.4948, + "step": 6156 + }, + { + "epoch": 0.5066447233079613, + "grad_norm": 2.2654656589994966, + "learning_rate": 1.0270495258426456e-05, + "loss": 0.7524, + "step": 6157 + }, + { + "epoch": 0.5067270109031063, + "grad_norm": 2.686074070284502, + "learning_rate": 1.0267830921092547e-05, + "loss": 0.774, + "step": 6158 + }, + { + "epoch": 0.5068092984982514, + "grad_norm": 2.4734675589175654, + "learning_rate": 1.0265166564732361e-05, + "loss": 0.7556, + "step": 6159 + }, + { + "epoch": 0.5068915860933965, + "grad_norm": 2.354522472726642, + "learning_rate": 1.0262502189535161e-05, + "loss": 0.7823, + "step": 6160 + }, + { + "epoch": 0.5069738736885414, + "grad_norm": 0.4019985774290736, + "learning_rate": 1.0259837795690229e-05, + "loss": 0.4859, + "step": 6161 + }, + { + "epoch": 0.5070561612836865, + "grad_norm": 2.0881943800538125, + "learning_rate": 1.0257173383386836e-05, + "loss": 0.7102, + "step": 6162 + }, + { + "epoch": 0.5071384488788315, + "grad_norm": 3.1384997959436776, + "learning_rate": 1.0254508952814252e-05, + "loss": 0.7238, + "step": 6163 + }, + { + "epoch": 0.5072207364739766, + "grad_norm": 2.213682687902802, + "learning_rate": 1.0251844504161763e-05, + "loss": 0.7472, + "step": 6164 + }, + { + "epoch": 0.5073030240691215, + "grad_norm": 4.096906545949818, + "learning_rate": 1.0249180037618644e-05, + "loss": 0.7157, + "step": 6165 + }, + { + "epoch": 0.5073853116642666, + "grad_norm": 2.928803307553829, + "learning_rate": 1.0246515553374172e-05, + "loss": 0.7355, + "step": 6166 + }, + { + "epoch": 0.5074675992594117, + "grad_norm": 2.742238219488076, + "learning_rate": 1.024385105161763e-05, + "loss": 0.7502, + "step": 6167 + }, + { + "epoch": 0.5075498868545567, + "grad_norm": 2.2400118972024656, + "learning_rate": 1.0241186532538299e-05, + "loss": 0.715, + "step": 6168 + }, + { + "epoch": 0.5076321744497017, + "grad_norm": 2.5185260746673714, + "learning_rate": 1.0238521996325461e-05, + "loss": 0.7293, + "step": 6169 + }, + { + "epoch": 0.5077144620448467, + "grad_norm": 2.5753623810649215, + "learning_rate": 1.0235857443168404e-05, + "loss": 0.7589, + "step": 6170 + }, + { + "epoch": 0.5077967496399918, + "grad_norm": 2.298659623360463, + "learning_rate": 1.023319287325641e-05, + "loss": 0.7203, + "step": 6171 + }, + { + "epoch": 0.5078790372351368, + "grad_norm": 2.489096213737918, + "learning_rate": 1.0230528286778769e-05, + "loss": 0.7624, + "step": 6172 + }, + { + "epoch": 0.5079613248302818, + "grad_norm": 2.2851475838904296, + "learning_rate": 1.0227863683924766e-05, + "loss": 0.7457, + "step": 6173 + }, + { + "epoch": 0.5080436124254268, + "grad_norm": 2.245561133148552, + "learning_rate": 1.022519906488369e-05, + "loss": 0.7187, + "step": 6174 + }, + { + "epoch": 0.5081259000205719, + "grad_norm": 2.0302421048710406, + "learning_rate": 1.0222534429844835e-05, + "loss": 0.7435, + "step": 6175 + }, + { + "epoch": 0.508208187615717, + "grad_norm": 2.7617399973780357, + "learning_rate": 1.021986977899749e-05, + "loss": 0.7408, + "step": 6176 + }, + { + "epoch": 0.508290475210862, + "grad_norm": 1.9642504584057616, + "learning_rate": 1.0217205112530946e-05, + "loss": 0.7221, + "step": 6177 + }, + { + "epoch": 0.508372762806007, + "grad_norm": 2.149805849238984, + "learning_rate": 1.0214540430634496e-05, + "loss": 0.7167, + "step": 6178 + }, + { + "epoch": 0.508455050401152, + "grad_norm": 2.1437167819430325, + "learning_rate": 1.0211875733497443e-05, + "loss": 0.7645, + "step": 6179 + }, + { + "epoch": 0.5085373379962971, + "grad_norm": 2.5678341766789314, + "learning_rate": 1.0209211021309071e-05, + "loss": 0.7773, + "step": 6180 + }, + { + "epoch": 0.5086196255914421, + "grad_norm": 2.3069818754226956, + "learning_rate": 1.0206546294258686e-05, + "loss": 0.7018, + "step": 6181 + }, + { + "epoch": 0.5087019131865871, + "grad_norm": 2.6459544451840507, + "learning_rate": 1.0203881552535582e-05, + "loss": 0.7394, + "step": 6182 + }, + { + "epoch": 0.5087842007817321, + "grad_norm": 2.132618558013428, + "learning_rate": 1.0201216796329057e-05, + "loss": 0.7494, + "step": 6183 + }, + { + "epoch": 0.5088664883768772, + "grad_norm": 2.0244256762148036, + "learning_rate": 1.0198552025828413e-05, + "loss": 0.7123, + "step": 6184 + }, + { + "epoch": 0.5089487759720223, + "grad_norm": 2.3158497500482866, + "learning_rate": 1.0195887241222953e-05, + "loss": 0.7709, + "step": 6185 + }, + { + "epoch": 0.5090310635671672, + "grad_norm": 2.728975646525103, + "learning_rate": 1.0193222442701973e-05, + "loss": 0.7646, + "step": 6186 + }, + { + "epoch": 0.5091133511623123, + "grad_norm": 2.5125197095193497, + "learning_rate": 1.019055763045478e-05, + "loss": 0.7618, + "step": 6187 + }, + { + "epoch": 0.5091956387574573, + "grad_norm": 1.934557105147055, + "learning_rate": 1.0187892804670682e-05, + "loss": 0.7575, + "step": 6188 + }, + { + "epoch": 0.5092779263526024, + "grad_norm": 0.4390200036109407, + "learning_rate": 1.0185227965538978e-05, + "loss": 0.5248, + "step": 6189 + }, + { + "epoch": 0.5093602139477473, + "grad_norm": 2.7854222621259694, + "learning_rate": 1.0182563113248971e-05, + "loss": 0.769, + "step": 6190 + }, + { + "epoch": 0.5094425015428924, + "grad_norm": 2.012273918120954, + "learning_rate": 1.0179898247989978e-05, + "loss": 0.7524, + "step": 6191 + }, + { + "epoch": 0.5095247891380374, + "grad_norm": 2.2132647596902153, + "learning_rate": 1.0177233369951298e-05, + "loss": 0.7468, + "step": 6192 + }, + { + "epoch": 0.5096070767331825, + "grad_norm": 2.0657336165606277, + "learning_rate": 1.0174568479322245e-05, + "loss": 0.7507, + "step": 6193 + }, + { + "epoch": 0.5096893643283275, + "grad_norm": 3.157562480636095, + "learning_rate": 1.0171903576292127e-05, + "loss": 0.7318, + "step": 6194 + }, + { + "epoch": 0.5097716519234725, + "grad_norm": 1.8024812968136865, + "learning_rate": 1.016923866105025e-05, + "loss": 0.7616, + "step": 6195 + }, + { + "epoch": 0.5098539395186176, + "grad_norm": 1.6982073423616435, + "learning_rate": 1.016657373378593e-05, + "loss": 0.7663, + "step": 6196 + }, + { + "epoch": 0.5099362271137626, + "grad_norm": 2.6943233708658125, + "learning_rate": 1.0163908794688482e-05, + "loss": 0.7408, + "step": 6197 + }, + { + "epoch": 0.5100185147089076, + "grad_norm": 2.0197173004452518, + "learning_rate": 1.0161243843947213e-05, + "loss": 0.749, + "step": 6198 + }, + { + "epoch": 0.5101008023040526, + "grad_norm": 0.43551744887018706, + "learning_rate": 1.0158578881751438e-05, + "loss": 0.5036, + "step": 6199 + }, + { + "epoch": 0.5101830898991977, + "grad_norm": 2.307026196829826, + "learning_rate": 1.0155913908290476e-05, + "loss": 0.743, + "step": 6200 + }, + { + "epoch": 0.5102653774943428, + "grad_norm": 2.0386091469322456, + "learning_rate": 1.0153248923753635e-05, + "loss": 0.71, + "step": 6201 + }, + { + "epoch": 0.5103476650894878, + "grad_norm": 2.2986501124221923, + "learning_rate": 1.015058392833024e-05, + "loss": 0.7626, + "step": 6202 + }, + { + "epoch": 0.5104299526846328, + "grad_norm": 0.4151868937604923, + "learning_rate": 1.01479189222096e-05, + "loss": 0.5143, + "step": 6203 + }, + { + "epoch": 0.5105122402797778, + "grad_norm": 2.1550632955065927, + "learning_rate": 1.0145253905581038e-05, + "loss": 0.7532, + "step": 6204 + }, + { + "epoch": 0.5105945278749229, + "grad_norm": 4.753381375921007, + "learning_rate": 1.0142588878633872e-05, + "loss": 0.7573, + "step": 6205 + }, + { + "epoch": 0.5106768154700679, + "grad_norm": 2.1305956279711347, + "learning_rate": 1.0139923841557417e-05, + "loss": 0.7543, + "step": 6206 + }, + { + "epoch": 0.5107591030652129, + "grad_norm": 2.128985661192511, + "learning_rate": 1.0137258794540998e-05, + "loss": 0.747, + "step": 6207 + }, + { + "epoch": 0.5108413906603579, + "grad_norm": 2.2620286322671386, + "learning_rate": 1.0134593737773934e-05, + "loss": 0.7418, + "step": 6208 + }, + { + "epoch": 0.510923678255503, + "grad_norm": 2.645779264935748, + "learning_rate": 1.0131928671445545e-05, + "loss": 0.7522, + "step": 6209 + }, + { + "epoch": 0.5110059658506481, + "grad_norm": 2.116853555972645, + "learning_rate": 1.0129263595745155e-05, + "loss": 0.7503, + "step": 6210 + }, + { + "epoch": 0.511088253445793, + "grad_norm": 1.9861600090153357, + "learning_rate": 1.0126598510862086e-05, + "loss": 0.7108, + "step": 6211 + }, + { + "epoch": 0.5111705410409381, + "grad_norm": 1.907505598936795, + "learning_rate": 1.012393341698566e-05, + "loss": 0.7398, + "step": 6212 + }, + { + "epoch": 0.5112528286360831, + "grad_norm": 2.039179859241202, + "learning_rate": 1.0121268314305204e-05, + "loss": 0.746, + "step": 6213 + }, + { + "epoch": 0.5113351162312282, + "grad_norm": 2.1903644427316893, + "learning_rate": 1.0118603203010043e-05, + "loss": 0.7392, + "step": 6214 + }, + { + "epoch": 0.5114174038263731, + "grad_norm": 2.4930862709233392, + "learning_rate": 1.0115938083289493e-05, + "loss": 0.7238, + "step": 6215 + }, + { + "epoch": 0.5114996914215182, + "grad_norm": 1.9208790836380165, + "learning_rate": 1.0113272955332893e-05, + "loss": 0.7823, + "step": 6216 + }, + { + "epoch": 0.5115819790166632, + "grad_norm": 0.4330554764379318, + "learning_rate": 1.0110607819329563e-05, + "loss": 0.5288, + "step": 6217 + }, + { + "epoch": 0.5116642666118083, + "grad_norm": 0.41382683283310395, + "learning_rate": 1.0107942675468828e-05, + "loss": 0.5019, + "step": 6218 + }, + { + "epoch": 0.5117465542069533, + "grad_norm": 3.0852094833726267, + "learning_rate": 1.0105277523940019e-05, + "loss": 0.7312, + "step": 6219 + }, + { + "epoch": 0.5118288418020983, + "grad_norm": 2.555773757133729, + "learning_rate": 1.0102612364932463e-05, + "loss": 0.7816, + "step": 6220 + }, + { + "epoch": 0.5119111293972434, + "grad_norm": 2.1641663241519677, + "learning_rate": 1.0099947198635489e-05, + "loss": 0.7565, + "step": 6221 + }, + { + "epoch": 0.5119934169923884, + "grad_norm": 0.41708941384595954, + "learning_rate": 1.0097282025238424e-05, + "loss": 0.5164, + "step": 6222 + }, + { + "epoch": 0.5120757045875334, + "grad_norm": 2.2962792204700606, + "learning_rate": 1.0094616844930603e-05, + "loss": 0.76, + "step": 6223 + }, + { + "epoch": 0.5121579921826784, + "grad_norm": 2.604866524406999, + "learning_rate": 1.0091951657901351e-05, + "loss": 0.7593, + "step": 6224 + }, + { + "epoch": 0.5122402797778235, + "grad_norm": 0.42858603318524996, + "learning_rate": 1.0089286464339997e-05, + "loss": 0.5037, + "step": 6225 + }, + { + "epoch": 0.5123225673729686, + "grad_norm": 2.1340011226764712, + "learning_rate": 1.008662126443588e-05, + "loss": 0.7454, + "step": 6226 + }, + { + "epoch": 0.5124048549681136, + "grad_norm": 2.1485590490378126, + "learning_rate": 1.0083956058378326e-05, + "loss": 0.765, + "step": 6227 + }, + { + "epoch": 0.5124871425632586, + "grad_norm": 2.281930202399075, + "learning_rate": 1.0081290846356666e-05, + "loss": 0.7723, + "step": 6228 + }, + { + "epoch": 0.5125694301584036, + "grad_norm": 3.90057042827999, + "learning_rate": 1.0078625628560237e-05, + "loss": 0.7683, + "step": 6229 + }, + { + "epoch": 0.5126517177535487, + "grad_norm": 2.7245158277190775, + "learning_rate": 1.0075960405178367e-05, + "loss": 0.7429, + "step": 6230 + }, + { + "epoch": 0.5127340053486937, + "grad_norm": 2.6878793385457906, + "learning_rate": 1.007329517640039e-05, + "loss": 0.7643, + "step": 6231 + }, + { + "epoch": 0.5128162929438387, + "grad_norm": 3.009699062202179, + "learning_rate": 1.0070629942415646e-05, + "loss": 0.7632, + "step": 6232 + }, + { + "epoch": 0.5128985805389837, + "grad_norm": 0.4308599199960913, + "learning_rate": 1.006796470341346e-05, + "loss": 0.495, + "step": 6233 + }, + { + "epoch": 0.5129808681341288, + "grad_norm": 0.4141157250972085, + "learning_rate": 1.006529945958317e-05, + "loss": 0.5024, + "step": 6234 + }, + { + "epoch": 0.5130631557292739, + "grad_norm": 2.3797139401073593, + "learning_rate": 1.0062634211114113e-05, + "loss": 0.7243, + "step": 6235 + }, + { + "epoch": 0.5131454433244188, + "grad_norm": 2.2857517558307534, + "learning_rate": 1.0059968958195618e-05, + "loss": 0.7474, + "step": 6236 + }, + { + "epoch": 0.5132277309195639, + "grad_norm": 3.343117279504847, + "learning_rate": 1.005730370101703e-05, + "loss": 0.7441, + "step": 6237 + }, + { + "epoch": 0.5133100185147089, + "grad_norm": 3.357213483550919, + "learning_rate": 1.0054638439767674e-05, + "loss": 0.7346, + "step": 6238 + }, + { + "epoch": 0.513392306109854, + "grad_norm": 2.6791609269893346, + "learning_rate": 1.0051973174636892e-05, + "loss": 0.7775, + "step": 6239 + }, + { + "epoch": 0.5134745937049989, + "grad_norm": 5.072930319813126, + "learning_rate": 1.0049307905814018e-05, + "loss": 0.7402, + "step": 6240 + }, + { + "epoch": 0.513556881300144, + "grad_norm": 2.2526439510104765, + "learning_rate": 1.004664263348839e-05, + "loss": 0.7395, + "step": 6241 + }, + { + "epoch": 0.513639168895289, + "grad_norm": 2.2865720366296807, + "learning_rate": 1.0043977357849344e-05, + "loss": 0.732, + "step": 6242 + }, + { + "epoch": 0.5137214564904341, + "grad_norm": 4.861906682468846, + "learning_rate": 1.0041312079086219e-05, + "loss": 0.7342, + "step": 6243 + }, + { + "epoch": 0.513803744085579, + "grad_norm": 2.2192328346113097, + "learning_rate": 1.0038646797388344e-05, + "loss": 0.7649, + "step": 6244 + }, + { + "epoch": 0.5138860316807241, + "grad_norm": 2.3802058685942957, + "learning_rate": 1.0035981512945069e-05, + "loss": 0.7442, + "step": 6245 + }, + { + "epoch": 0.5139683192758692, + "grad_norm": 0.4591715896817389, + "learning_rate": 1.0033316225945722e-05, + "loss": 0.5159, + "step": 6246 + }, + { + "epoch": 0.5140506068710142, + "grad_norm": 2.2471180864818394, + "learning_rate": 1.0030650936579643e-05, + "loss": 0.7521, + "step": 6247 + }, + { + "epoch": 0.5141328944661592, + "grad_norm": 2.1826127411034557, + "learning_rate": 1.0027985645036169e-05, + "loss": 0.7452, + "step": 6248 + }, + { + "epoch": 0.5142151820613042, + "grad_norm": 3.0535058234710686, + "learning_rate": 1.0025320351504641e-05, + "loss": 0.7389, + "step": 6249 + }, + { + "epoch": 0.5142974696564493, + "grad_norm": 1.8603090069805477, + "learning_rate": 1.0022655056174397e-05, + "loss": 0.7789, + "step": 6250 + }, + { + "epoch": 0.5143797572515943, + "grad_norm": 2.1143037108203844, + "learning_rate": 1.0019989759234771e-05, + "loss": 0.7559, + "step": 6251 + }, + { + "epoch": 0.5144620448467394, + "grad_norm": 2.1451702406137567, + "learning_rate": 1.0017324460875108e-05, + "loss": 0.7485, + "step": 6252 + }, + { + "epoch": 0.5145443324418844, + "grad_norm": 2.0684892598204465, + "learning_rate": 1.0014659161284743e-05, + "loss": 0.7132, + "step": 6253 + }, + { + "epoch": 0.5146266200370294, + "grad_norm": 2.069221617511183, + "learning_rate": 1.0011993860653011e-05, + "loss": 0.7529, + "step": 6254 + }, + { + "epoch": 0.5147089076321745, + "grad_norm": 1.9748246292547444, + "learning_rate": 1.0009328559169258e-05, + "loss": 0.7462, + "step": 6255 + }, + { + "epoch": 0.5147911952273195, + "grad_norm": 2.2130068875814795, + "learning_rate": 1.0006663257022818e-05, + "loss": 0.7453, + "step": 6256 + }, + { + "epoch": 0.5148734828224645, + "grad_norm": 3.4257473349936425, + "learning_rate": 1.0003997954403031e-05, + "loss": 0.7263, + "step": 6257 + }, + { + "epoch": 0.5149557704176095, + "grad_norm": 1.9528241061313207, + "learning_rate": 1.0001332651499236e-05, + "loss": 0.7506, + "step": 6258 + }, + { + "epoch": 0.5150380580127546, + "grad_norm": 2.6436103331327123, + "learning_rate": 9.998667348500769e-06, + "loss": 0.7549, + "step": 6259 + }, + { + "epoch": 0.5151203456078997, + "grad_norm": 1.9632925259938878, + "learning_rate": 9.996002045596974e-06, + "loss": 0.7423, + "step": 6260 + }, + { + "epoch": 0.5152026332030446, + "grad_norm": 2.076211723345843, + "learning_rate": 9.993336742977187e-06, + "loss": 0.7627, + "step": 6261 + }, + { + "epoch": 0.5152849207981897, + "grad_norm": 2.0925499448938134, + "learning_rate": 9.990671440830743e-06, + "loss": 0.7499, + "step": 6262 + }, + { + "epoch": 0.5153672083933347, + "grad_norm": 1.9799415480559766, + "learning_rate": 9.98800613934699e-06, + "loss": 0.7135, + "step": 6263 + }, + { + "epoch": 0.5154494959884798, + "grad_norm": 0.41444173351739805, + "learning_rate": 9.98534083871526e-06, + "loss": 0.4997, + "step": 6264 + }, + { + "epoch": 0.5155317835836247, + "grad_norm": 2.003714675386412, + "learning_rate": 9.982675539124895e-06, + "loss": 0.732, + "step": 6265 + }, + { + "epoch": 0.5156140711787698, + "grad_norm": 2.2046682494105982, + "learning_rate": 9.98001024076523e-06, + "loss": 0.7554, + "step": 6266 + }, + { + "epoch": 0.5156963587739148, + "grad_norm": 0.4183513354996623, + "learning_rate": 9.977344943825608e-06, + "loss": 0.4972, + "step": 6267 + }, + { + "epoch": 0.5157786463690599, + "grad_norm": 2.138031098087256, + "learning_rate": 9.97467964849536e-06, + "loss": 0.7337, + "step": 6268 + }, + { + "epoch": 0.5158609339642048, + "grad_norm": 1.9506183348137367, + "learning_rate": 9.972014354963834e-06, + "loss": 0.7333, + "step": 6269 + }, + { + "epoch": 0.5159432215593499, + "grad_norm": 1.9342444901407263, + "learning_rate": 9.969349063420362e-06, + "loss": 0.7422, + "step": 6270 + }, + { + "epoch": 0.516025509154495, + "grad_norm": 0.40668670005005436, + "learning_rate": 9.966683774054285e-06, + "loss": 0.4686, + "step": 6271 + }, + { + "epoch": 0.51610779674964, + "grad_norm": 2.165936614445209, + "learning_rate": 9.964018487054936e-06, + "loss": 0.7536, + "step": 6272 + }, + { + "epoch": 0.516190084344785, + "grad_norm": 1.8147048175373395, + "learning_rate": 9.96135320261166e-06, + "loss": 0.7597, + "step": 6273 + }, + { + "epoch": 0.51627237193993, + "grad_norm": 2.4802861486343097, + "learning_rate": 9.958687920913786e-06, + "loss": 0.7506, + "step": 6274 + }, + { + "epoch": 0.5163546595350751, + "grad_norm": 2.193424199065049, + "learning_rate": 9.95602264215066e-06, + "loss": 0.7532, + "step": 6275 + }, + { + "epoch": 0.5164369471302201, + "grad_norm": 0.42656596767074306, + "learning_rate": 9.953357366511613e-06, + "loss": 0.4951, + "step": 6276 + }, + { + "epoch": 0.5165192347253652, + "grad_norm": 2.0500011595591356, + "learning_rate": 9.950692094185985e-06, + "loss": 0.73, + "step": 6277 + }, + { + "epoch": 0.5166015223205102, + "grad_norm": 2.3492393315392497, + "learning_rate": 9.948026825363112e-06, + "loss": 0.7515, + "step": 6278 + }, + { + "epoch": 0.5166838099156552, + "grad_norm": 2.257363773888789, + "learning_rate": 9.945361560232331e-06, + "loss": 0.7444, + "step": 6279 + }, + { + "epoch": 0.5167660975108003, + "grad_norm": 0.43553595775278703, + "learning_rate": 9.942696298982974e-06, + "loss": 0.5147, + "step": 6280 + }, + { + "epoch": 0.5168483851059453, + "grad_norm": 2.562245447311384, + "learning_rate": 9.940031041804386e-06, + "loss": 0.7416, + "step": 6281 + }, + { + "epoch": 0.5169306727010903, + "grad_norm": 2.3710383370174006, + "learning_rate": 9.93736578888589e-06, + "loss": 0.7573, + "step": 6282 + }, + { + "epoch": 0.5170129602962353, + "grad_norm": 2.1951940519880004, + "learning_rate": 9.934700540416832e-06, + "loss": 0.7341, + "step": 6283 + }, + { + "epoch": 0.5170952478913804, + "grad_norm": 2.5215476920869784, + "learning_rate": 9.932035296586543e-06, + "loss": 0.7239, + "step": 6284 + }, + { + "epoch": 0.5171775354865255, + "grad_norm": 2.9042589626541195, + "learning_rate": 9.929370057584359e-06, + "loss": 0.7144, + "step": 6285 + }, + { + "epoch": 0.5172598230816704, + "grad_norm": 2.442552571851999, + "learning_rate": 9.926704823599612e-06, + "loss": 0.7343, + "step": 6286 + }, + { + "epoch": 0.5173421106768155, + "grad_norm": 2.698478663040992, + "learning_rate": 9.924039594821637e-06, + "loss": 0.7166, + "step": 6287 + }, + { + "epoch": 0.5174243982719605, + "grad_norm": 2.1026039575740154, + "learning_rate": 9.921374371439765e-06, + "loss": 0.7568, + "step": 6288 + }, + { + "epoch": 0.5175066858671056, + "grad_norm": 2.5901144020401374, + "learning_rate": 9.918709153643337e-06, + "loss": 0.726, + "step": 6289 + }, + { + "epoch": 0.5175889734622505, + "grad_norm": 2.8251266084066766, + "learning_rate": 9.916043941621678e-06, + "loss": 0.7218, + "step": 6290 + }, + { + "epoch": 0.5176712610573956, + "grad_norm": 1.9476201874990315, + "learning_rate": 9.913378735564123e-06, + "loss": 0.7361, + "step": 6291 + }, + { + "epoch": 0.5177535486525406, + "grad_norm": 0.4218762002770594, + "learning_rate": 9.910713535660004e-06, + "loss": 0.4782, + "step": 6292 + }, + { + "epoch": 0.5178358362476857, + "grad_norm": 2.093262796036577, + "learning_rate": 9.908048342098656e-06, + "loss": 0.7608, + "step": 6293 + }, + { + "epoch": 0.5179181238428306, + "grad_norm": 3.402183665643468, + "learning_rate": 9.9053831550694e-06, + "loss": 0.7488, + "step": 6294 + }, + { + "epoch": 0.5180004114379757, + "grad_norm": 2.09248447345659, + "learning_rate": 9.90271797476158e-06, + "loss": 0.7529, + "step": 6295 + }, + { + "epoch": 0.5180826990331208, + "grad_norm": 0.40977313077153565, + "learning_rate": 9.900052801364514e-06, + "loss": 0.5114, + "step": 6296 + }, + { + "epoch": 0.5181649866282658, + "grad_norm": 0.4040440795268409, + "learning_rate": 9.897387635067542e-06, + "loss": 0.4885, + "step": 6297 + }, + { + "epoch": 0.5182472742234108, + "grad_norm": 2.1300559348967347, + "learning_rate": 9.894722476059984e-06, + "loss": 0.7441, + "step": 6298 + }, + { + "epoch": 0.5183295618185558, + "grad_norm": 12.550997007578612, + "learning_rate": 9.892057324531177e-06, + "loss": 0.74, + "step": 6299 + }, + { + "epoch": 0.5184118494137009, + "grad_norm": 2.2604137152603694, + "learning_rate": 9.88939218067044e-06, + "loss": 0.7491, + "step": 6300 + }, + { + "epoch": 0.518494137008846, + "grad_norm": 1.886440899731487, + "learning_rate": 9.88672704466711e-06, + "loss": 0.705, + "step": 6301 + }, + { + "epoch": 0.5185764246039909, + "grad_norm": 2.746466928018293, + "learning_rate": 9.884061916710508e-06, + "loss": 0.7507, + "step": 6302 + }, + { + "epoch": 0.518658712199136, + "grad_norm": 2.6372479387035237, + "learning_rate": 9.881396796989964e-06, + "loss": 0.7717, + "step": 6303 + }, + { + "epoch": 0.518740999794281, + "grad_norm": 4.2448707211978975, + "learning_rate": 9.8787316856948e-06, + "loss": 0.7414, + "step": 6304 + }, + { + "epoch": 0.5188232873894261, + "grad_norm": 0.43336212700112686, + "learning_rate": 9.876066583014345e-06, + "loss": 0.4882, + "step": 6305 + }, + { + "epoch": 0.5189055749845711, + "grad_norm": 2.6919995785692836, + "learning_rate": 9.873401489137916e-06, + "loss": 0.7501, + "step": 6306 + }, + { + "epoch": 0.5189878625797161, + "grad_norm": 2.7733249231152937, + "learning_rate": 9.87073640425485e-06, + "loss": 0.7691, + "step": 6307 + }, + { + "epoch": 0.5190701501748611, + "grad_norm": 2.2773471847566076, + "learning_rate": 9.868071328554457e-06, + "loss": 0.7313, + "step": 6308 + }, + { + "epoch": 0.5191524377700062, + "grad_norm": 2.000237744041564, + "learning_rate": 9.865406262226071e-06, + "loss": 0.7541, + "step": 6309 + }, + { + "epoch": 0.5192347253651513, + "grad_norm": 5.230436619250048, + "learning_rate": 9.862741205459005e-06, + "loss": 0.7191, + "step": 6310 + }, + { + "epoch": 0.5193170129602962, + "grad_norm": 2.8647877749641375, + "learning_rate": 9.860076158442588e-06, + "loss": 0.7391, + "step": 6311 + }, + { + "epoch": 0.5193993005554413, + "grad_norm": 2.121413791419389, + "learning_rate": 9.857411121366132e-06, + "loss": 0.7279, + "step": 6312 + }, + { + "epoch": 0.5194815881505863, + "grad_norm": 2.081435961151187, + "learning_rate": 9.854746094418967e-06, + "loss": 0.7068, + "step": 6313 + }, + { + "epoch": 0.5195638757457314, + "grad_norm": 0.43832732534751606, + "learning_rate": 9.852081077790401e-06, + "loss": 0.5273, + "step": 6314 + }, + { + "epoch": 0.5196461633408763, + "grad_norm": 2.1618677455455244, + "learning_rate": 9.849416071669765e-06, + "loss": 0.7403, + "step": 6315 + }, + { + "epoch": 0.5197284509360214, + "grad_norm": 0.4097961255156024, + "learning_rate": 9.846751076246367e-06, + "loss": 0.5018, + "step": 6316 + }, + { + "epoch": 0.5198107385311664, + "grad_norm": 2.285575132900069, + "learning_rate": 9.84408609170953e-06, + "loss": 0.7341, + "step": 6317 + }, + { + "epoch": 0.5198930261263115, + "grad_norm": 2.2365130960217945, + "learning_rate": 9.841421118248565e-06, + "loss": 0.7337, + "step": 6318 + }, + { + "epoch": 0.5199753137214564, + "grad_norm": 1.9527245278553882, + "learning_rate": 9.838756156052793e-06, + "loss": 0.7335, + "step": 6319 + }, + { + "epoch": 0.5200576013166015, + "grad_norm": 2.191985141430642, + "learning_rate": 9.83609120531152e-06, + "loss": 0.7391, + "step": 6320 + }, + { + "epoch": 0.5201398889117466, + "grad_norm": 2.039951876680582, + "learning_rate": 9.833426266214072e-06, + "loss": 0.7278, + "step": 6321 + }, + { + "epoch": 0.5202221765068916, + "grad_norm": 0.39823603377615935, + "learning_rate": 9.830761338949752e-06, + "loss": 0.4633, + "step": 6322 + }, + { + "epoch": 0.5203044641020366, + "grad_norm": 2.354468253446684, + "learning_rate": 9.828096423707878e-06, + "loss": 0.734, + "step": 6323 + }, + { + "epoch": 0.5203867516971816, + "grad_norm": 2.98245443379787, + "learning_rate": 9.82543152067776e-06, + "loss": 0.7202, + "step": 6324 + }, + { + "epoch": 0.5204690392923267, + "grad_norm": 0.42907651010783054, + "learning_rate": 9.822766630048707e-06, + "loss": 0.4774, + "step": 6325 + }, + { + "epoch": 0.5205513268874717, + "grad_norm": 0.4404799660151963, + "learning_rate": 9.820101752010025e-06, + "loss": 0.5002, + "step": 6326 + }, + { + "epoch": 0.5206336144826167, + "grad_norm": 0.4062244697497123, + "learning_rate": 9.81743688675103e-06, + "loss": 0.492, + "step": 6327 + }, + { + "epoch": 0.5207159020777617, + "grad_norm": 2.30328216083948, + "learning_rate": 9.814772034461027e-06, + "loss": 0.7162, + "step": 6328 + }, + { + "epoch": 0.5207981896729068, + "grad_norm": 2.055686774586588, + "learning_rate": 9.812107195329323e-06, + "loss": 0.7381, + "step": 6329 + }, + { + "epoch": 0.5208804772680519, + "grad_norm": 2.1867013973539478, + "learning_rate": 9.809442369545221e-06, + "loss": 0.7663, + "step": 6330 + }, + { + "epoch": 0.5209627648631969, + "grad_norm": 2.25062621890102, + "learning_rate": 9.806777557298032e-06, + "loss": 0.7704, + "step": 6331 + }, + { + "epoch": 0.5210450524583419, + "grad_norm": 2.1509211996740447, + "learning_rate": 9.80411275877705e-06, + "loss": 0.7667, + "step": 6332 + }, + { + "epoch": 0.5211273400534869, + "grad_norm": 2.094050912383823, + "learning_rate": 9.801447974171589e-06, + "loss": 0.7256, + "step": 6333 + }, + { + "epoch": 0.521209627648632, + "grad_norm": 0.4444945280994854, + "learning_rate": 9.798783203670946e-06, + "loss": 0.4968, + "step": 6334 + }, + { + "epoch": 0.521291915243777, + "grad_norm": 2.2877793849768606, + "learning_rate": 9.796118447464423e-06, + "loss": 0.7357, + "step": 6335 + }, + { + "epoch": 0.521374202838922, + "grad_norm": 0.4391005342399146, + "learning_rate": 9.793453705741318e-06, + "loss": 0.4899, + "step": 6336 + }, + { + "epoch": 0.521456490434067, + "grad_norm": 3.182020895267973, + "learning_rate": 9.790788978690934e-06, + "loss": 0.7286, + "step": 6337 + }, + { + "epoch": 0.5215387780292121, + "grad_norm": 2.2662340500463207, + "learning_rate": 9.788124266502562e-06, + "loss": 0.7254, + "step": 6338 + }, + { + "epoch": 0.5216210656243572, + "grad_norm": 2.5426129610562236, + "learning_rate": 9.785459569365507e-06, + "loss": 0.7367, + "step": 6339 + }, + { + "epoch": 0.5217033532195021, + "grad_norm": 3.4914035818491977, + "learning_rate": 9.782794887469058e-06, + "loss": 0.7582, + "step": 6340 + }, + { + "epoch": 0.5217856408146472, + "grad_norm": 2.3391957713537535, + "learning_rate": 9.780130221002516e-06, + "loss": 0.7406, + "step": 6341 + }, + { + "epoch": 0.5218679284097922, + "grad_norm": 2.3511496196123547, + "learning_rate": 9.77746557015517e-06, + "loss": 0.7362, + "step": 6342 + }, + { + "epoch": 0.5219502160049373, + "grad_norm": 2.6643727344191803, + "learning_rate": 9.774800935116313e-06, + "loss": 0.7415, + "step": 6343 + }, + { + "epoch": 0.5220325036000822, + "grad_norm": 0.4331428215803185, + "learning_rate": 9.772136316075236e-06, + "loss": 0.5044, + "step": 6344 + }, + { + "epoch": 0.5221147911952273, + "grad_norm": 1.910870327976848, + "learning_rate": 9.769471713221236e-06, + "loss": 0.7261, + "step": 6345 + }, + { + "epoch": 0.5221970787903724, + "grad_norm": 0.4227807086273547, + "learning_rate": 9.766807126743592e-06, + "loss": 0.4876, + "step": 6346 + }, + { + "epoch": 0.5222793663855174, + "grad_norm": 2.51528658173533, + "learning_rate": 9.7641425568316e-06, + "loss": 0.762, + "step": 6347 + }, + { + "epoch": 0.5223616539806624, + "grad_norm": 2.1711666210609613, + "learning_rate": 9.761478003674542e-06, + "loss": 0.775, + "step": 6348 + }, + { + "epoch": 0.5224439415758074, + "grad_norm": 1.9950191201052898, + "learning_rate": 9.758813467461706e-06, + "loss": 0.7614, + "step": 6349 + }, + { + "epoch": 0.5225262291709525, + "grad_norm": 2.309309371121942, + "learning_rate": 9.756148948382372e-06, + "loss": 0.7524, + "step": 6350 + }, + { + "epoch": 0.5226085167660975, + "grad_norm": 2.361535017679006, + "learning_rate": 9.753484446625833e-06, + "loss": 0.7318, + "step": 6351 + }, + { + "epoch": 0.5226908043612425, + "grad_norm": 2.821090353441509, + "learning_rate": 9.750819962381359e-06, + "loss": 0.7737, + "step": 6352 + }, + { + "epoch": 0.5227730919563875, + "grad_norm": 2.364148110039101, + "learning_rate": 9.74815549583824e-06, + "loss": 0.7535, + "step": 6353 + }, + { + "epoch": 0.5228553795515326, + "grad_norm": 0.4361459338651872, + "learning_rate": 9.74549104718575e-06, + "loss": 0.5099, + "step": 6354 + }, + { + "epoch": 0.5229376671466777, + "grad_norm": 2.3260102757459764, + "learning_rate": 9.74282661661317e-06, + "loss": 0.6935, + "step": 6355 + }, + { + "epoch": 0.5230199547418227, + "grad_norm": 2.3637533507509225, + "learning_rate": 9.740162204309774e-06, + "loss": 0.7562, + "step": 6356 + }, + { + "epoch": 0.5231022423369677, + "grad_norm": 2.133177517078845, + "learning_rate": 9.737497810464839e-06, + "loss": 0.7461, + "step": 6357 + }, + { + "epoch": 0.5231845299321127, + "grad_norm": 2.5849119917375494, + "learning_rate": 9.734833435267642e-06, + "loss": 0.7558, + "step": 6358 + }, + { + "epoch": 0.5232668175272578, + "grad_norm": 2.724881817701977, + "learning_rate": 9.732169078907453e-06, + "loss": 0.7789, + "step": 6359 + }, + { + "epoch": 0.5233491051224028, + "grad_norm": 2.0982463536340057, + "learning_rate": 9.729504741573545e-06, + "loss": 0.7503, + "step": 6360 + }, + { + "epoch": 0.5234313927175478, + "grad_norm": 3.011980556861576, + "learning_rate": 9.726840423455185e-06, + "loss": 0.7459, + "step": 6361 + }, + { + "epoch": 0.5235136803126929, + "grad_norm": 2.7557945825246777, + "learning_rate": 9.72417612474165e-06, + "loss": 0.7439, + "step": 6362 + }, + { + "epoch": 0.5235959679078379, + "grad_norm": 2.407152766604447, + "learning_rate": 9.721511845622195e-06, + "loss": 0.7338, + "step": 6363 + }, + { + "epoch": 0.523678255502983, + "grad_norm": 3.1277546699919587, + "learning_rate": 9.718847586286099e-06, + "loss": 0.7308, + "step": 6364 + }, + { + "epoch": 0.5237605430981279, + "grad_norm": 0.402735190563187, + "learning_rate": 9.716183346922617e-06, + "loss": 0.4837, + "step": 6365 + }, + { + "epoch": 0.523842830693273, + "grad_norm": 2.7038585674435054, + "learning_rate": 9.71351912772102e-06, + "loss": 0.7303, + "step": 6366 + }, + { + "epoch": 0.523925118288418, + "grad_norm": 2.954008879645378, + "learning_rate": 9.710854928870561e-06, + "loss": 0.7251, + "step": 6367 + }, + { + "epoch": 0.5240074058835631, + "grad_norm": 3.118596549449317, + "learning_rate": 9.708190750560512e-06, + "loss": 0.7537, + "step": 6368 + }, + { + "epoch": 0.524089693478708, + "grad_norm": 2.633567483622676, + "learning_rate": 9.705526592980117e-06, + "loss": 0.7564, + "step": 6369 + }, + { + "epoch": 0.5241719810738531, + "grad_norm": 3.1775518327875663, + "learning_rate": 9.702862456318649e-06, + "loss": 0.7467, + "step": 6370 + }, + { + "epoch": 0.5242542686689982, + "grad_norm": 5.058543875112356, + "learning_rate": 9.700198340765353e-06, + "loss": 0.7215, + "step": 6371 + }, + { + "epoch": 0.5243365562641432, + "grad_norm": 0.4207798415885715, + "learning_rate": 9.69753424650949e-06, + "loss": 0.5004, + "step": 6372 + }, + { + "epoch": 0.5244188438592882, + "grad_norm": 0.42283160138029524, + "learning_rate": 9.694870173740307e-06, + "loss": 0.5068, + "step": 6373 + }, + { + "epoch": 0.5245011314544332, + "grad_norm": 2.3388605570942045, + "learning_rate": 9.692206122647064e-06, + "loss": 0.7556, + "step": 6374 + }, + { + "epoch": 0.5245834190495783, + "grad_norm": 1.9880473367258045, + "learning_rate": 9.689542093418999e-06, + "loss": 0.7288, + "step": 6375 + }, + { + "epoch": 0.5246657066447233, + "grad_norm": 0.4169926054978123, + "learning_rate": 9.686878086245371e-06, + "loss": 0.5495, + "step": 6376 + }, + { + "epoch": 0.5247479942398683, + "grad_norm": 2.074306692521544, + "learning_rate": 9.684214101315423e-06, + "loss": 0.7532, + "step": 6377 + }, + { + "epoch": 0.5248302818350133, + "grad_norm": 2.3230519581375355, + "learning_rate": 9.681550138818402e-06, + "loss": 0.7283, + "step": 6378 + }, + { + "epoch": 0.5249125694301584, + "grad_norm": 2.770724089156809, + "learning_rate": 9.678886198943546e-06, + "loss": 0.7372, + "step": 6379 + }, + { + "epoch": 0.5249948570253035, + "grad_norm": 2.574592320338826, + "learning_rate": 9.676222281880106e-06, + "loss": 0.7888, + "step": 6380 + }, + { + "epoch": 0.5250771446204485, + "grad_norm": 2.779942552725026, + "learning_rate": 9.67355838781731e-06, + "loss": 0.7314, + "step": 6381 + }, + { + "epoch": 0.5251594322155935, + "grad_norm": 2.387319457613477, + "learning_rate": 9.67089451694441e-06, + "loss": 0.7554, + "step": 6382 + }, + { + "epoch": 0.5252417198107385, + "grad_norm": 3.145691928463502, + "learning_rate": 9.668230669450635e-06, + "loss": 0.7385, + "step": 6383 + }, + { + "epoch": 0.5253240074058836, + "grad_norm": 2.3218814452956673, + "learning_rate": 9.665566845525223e-06, + "loss": 0.7373, + "step": 6384 + }, + { + "epoch": 0.5254062950010286, + "grad_norm": 2.4493535483690705, + "learning_rate": 9.662903045357408e-06, + "loss": 0.7694, + "step": 6385 + }, + { + "epoch": 0.5254885825961736, + "grad_norm": 2.3717859359179174, + "learning_rate": 9.660239269136423e-06, + "loss": 0.7344, + "step": 6386 + }, + { + "epoch": 0.5255708701913187, + "grad_norm": 3.2819587563936397, + "learning_rate": 9.65757551705149e-06, + "loss": 0.7545, + "step": 6387 + }, + { + "epoch": 0.5256531577864637, + "grad_norm": 2.4386492947579077, + "learning_rate": 9.654911789291854e-06, + "loss": 0.7328, + "step": 6388 + }, + { + "epoch": 0.5257354453816088, + "grad_norm": 3.141478170858726, + "learning_rate": 9.652248086046726e-06, + "loss": 0.732, + "step": 6389 + }, + { + "epoch": 0.5258177329767537, + "grad_norm": 2.071007124998601, + "learning_rate": 9.649584407505341e-06, + "loss": 0.7405, + "step": 6390 + }, + { + "epoch": 0.5259000205718988, + "grad_norm": 2.232998977240791, + "learning_rate": 9.646920753856918e-06, + "loss": 0.7823, + "step": 6391 + }, + { + "epoch": 0.5259823081670438, + "grad_norm": 2.353559582894934, + "learning_rate": 9.644257125290682e-06, + "loss": 0.7556, + "step": 6392 + }, + { + "epoch": 0.5260645957621889, + "grad_norm": 2.5199612970946372, + "learning_rate": 9.641593521995846e-06, + "loss": 0.7416, + "step": 6393 + }, + { + "epoch": 0.5261468833573338, + "grad_norm": 0.41827050833727986, + "learning_rate": 9.638929944161639e-06, + "loss": 0.4801, + "step": 6394 + }, + { + "epoch": 0.5262291709524789, + "grad_norm": 2.2975631624982706, + "learning_rate": 9.636266391977266e-06, + "loss": 0.742, + "step": 6395 + }, + { + "epoch": 0.526311458547624, + "grad_norm": 2.5064124382294612, + "learning_rate": 9.633602865631952e-06, + "loss": 0.7258, + "step": 6396 + }, + { + "epoch": 0.526393746142769, + "grad_norm": 2.474007851115927, + "learning_rate": 9.6309393653149e-06, + "loss": 0.735, + "step": 6397 + }, + { + "epoch": 0.526476033737914, + "grad_norm": 4.067766777387784, + "learning_rate": 9.628275891215328e-06, + "loss": 0.7414, + "step": 6398 + }, + { + "epoch": 0.526558321333059, + "grad_norm": 4.129580013755052, + "learning_rate": 9.62561244352244e-06, + "loss": 0.7625, + "step": 6399 + }, + { + "epoch": 0.5266406089282041, + "grad_norm": 3.099981524786792, + "learning_rate": 9.622949022425448e-06, + "loss": 0.7371, + "step": 6400 + }, + { + "epoch": 0.5267228965233491, + "grad_norm": 4.2256795994319605, + "learning_rate": 9.62028562811355e-06, + "loss": 0.7592, + "step": 6401 + }, + { + "epoch": 0.5268051841184941, + "grad_norm": 2.561252639344002, + "learning_rate": 9.61762226077596e-06, + "loss": 0.7306, + "step": 6402 + }, + { + "epoch": 0.5268874717136391, + "grad_norm": 2.945647554265657, + "learning_rate": 9.614958920601867e-06, + "loss": 0.7367, + "step": 6403 + }, + { + "epoch": 0.5269697593087842, + "grad_norm": 2.4511068587956646, + "learning_rate": 9.612295607780483e-06, + "loss": 0.7699, + "step": 6404 + }, + { + "epoch": 0.5270520469039293, + "grad_norm": 2.1957942782573387, + "learning_rate": 9.609632322500994e-06, + "loss": 0.7282, + "step": 6405 + }, + { + "epoch": 0.5271343344990743, + "grad_norm": 2.243547362371902, + "learning_rate": 9.606969064952602e-06, + "loss": 0.7412, + "step": 6406 + }, + { + "epoch": 0.5272166220942193, + "grad_norm": 4.060442633120275, + "learning_rate": 9.604305835324496e-06, + "loss": 0.7524, + "step": 6407 + }, + { + "epoch": 0.5272989096893643, + "grad_norm": 4.092004887608577, + "learning_rate": 9.601642633805875e-06, + "loss": 0.7307, + "step": 6408 + }, + { + "epoch": 0.5273811972845094, + "grad_norm": 2.507133738515735, + "learning_rate": 9.598979460585922e-06, + "loss": 0.6958, + "step": 6409 + }, + { + "epoch": 0.5274634848796544, + "grad_norm": 2.293023336656868, + "learning_rate": 9.59631631585383e-06, + "loss": 0.7529, + "step": 6410 + }, + { + "epoch": 0.5275457724747994, + "grad_norm": 2.603836614727879, + "learning_rate": 9.593653199798778e-06, + "loss": 0.7596, + "step": 6411 + }, + { + "epoch": 0.5276280600699444, + "grad_norm": 2.4443681634852195, + "learning_rate": 9.590990112609953e-06, + "loss": 0.7792, + "step": 6412 + }, + { + "epoch": 0.5277103476650895, + "grad_norm": 2.7122772718484622, + "learning_rate": 9.588327054476534e-06, + "loss": 0.7446, + "step": 6413 + }, + { + "epoch": 0.5277926352602346, + "grad_norm": 2.4357866360739324, + "learning_rate": 9.585664025587707e-06, + "loss": 0.722, + "step": 6414 + }, + { + "epoch": 0.5278749228553795, + "grad_norm": 4.445373894753648, + "learning_rate": 9.583001026132644e-06, + "loss": 0.7791, + "step": 6415 + }, + { + "epoch": 0.5279572104505246, + "grad_norm": 2.444639323709026, + "learning_rate": 9.580338056300523e-06, + "loss": 0.7296, + "step": 6416 + }, + { + "epoch": 0.5280394980456696, + "grad_norm": 0.41822657523143697, + "learning_rate": 9.577675116280512e-06, + "loss": 0.4988, + "step": 6417 + }, + { + "epoch": 0.5281217856408147, + "grad_norm": 2.4288548709631916, + "learning_rate": 9.575012206261786e-06, + "loss": 0.7427, + "step": 6418 + }, + { + "epoch": 0.5282040732359596, + "grad_norm": 2.3162222542101976, + "learning_rate": 9.572349326433512e-06, + "loss": 0.7457, + "step": 6419 + }, + { + "epoch": 0.5282863608311047, + "grad_norm": 2.5665528605469796, + "learning_rate": 9.569686476984864e-06, + "loss": 0.7406, + "step": 6420 + }, + { + "epoch": 0.5283686484262498, + "grad_norm": 2.671989589111655, + "learning_rate": 9.567023658104992e-06, + "loss": 0.7362, + "step": 6421 + }, + { + "epoch": 0.5284509360213948, + "grad_norm": 2.089791642814302, + "learning_rate": 9.564360869983073e-06, + "loss": 0.7291, + "step": 6422 + }, + { + "epoch": 0.5285332236165398, + "grad_norm": 3.09746374552525, + "learning_rate": 9.561698112808258e-06, + "loss": 0.7218, + "step": 6423 + }, + { + "epoch": 0.5286155112116848, + "grad_norm": 3.0204390754825314, + "learning_rate": 9.559035386769711e-06, + "loss": 0.6936, + "step": 6424 + }, + { + "epoch": 0.5286977988068299, + "grad_norm": 2.981908622321696, + "learning_rate": 9.55637269205658e-06, + "loss": 0.7452, + "step": 6425 + }, + { + "epoch": 0.5287800864019749, + "grad_norm": 2.1585141283765705, + "learning_rate": 9.55371002885803e-06, + "loss": 0.7309, + "step": 6426 + }, + { + "epoch": 0.5288623739971199, + "grad_norm": 2.4142215737587525, + "learning_rate": 9.551047397363199e-06, + "loss": 0.7249, + "step": 6427 + }, + { + "epoch": 0.5289446615922649, + "grad_norm": 2.2997277013794197, + "learning_rate": 9.548384797761247e-06, + "loss": 0.7441, + "step": 6428 + }, + { + "epoch": 0.52902694918741, + "grad_norm": 2.692471632606989, + "learning_rate": 9.545722230241315e-06, + "loss": 0.7146, + "step": 6429 + }, + { + "epoch": 0.5291092367825551, + "grad_norm": 2.707425762831531, + "learning_rate": 9.543059694992552e-06, + "loss": 0.7303, + "step": 6430 + }, + { + "epoch": 0.5291915243777, + "grad_norm": 2.4298051603830193, + "learning_rate": 9.540397192204093e-06, + "loss": 0.7125, + "step": 6431 + }, + { + "epoch": 0.5292738119728451, + "grad_norm": 2.306010775788359, + "learning_rate": 9.537734722065089e-06, + "loss": 0.7326, + "step": 6432 + }, + { + "epoch": 0.5293560995679901, + "grad_norm": 2.4544360259011078, + "learning_rate": 9.535072284764663e-06, + "loss": 0.739, + "step": 6433 + }, + { + "epoch": 0.5294383871631352, + "grad_norm": 3.0217241617864623, + "learning_rate": 9.532409880491965e-06, + "loss": 0.7413, + "step": 6434 + }, + { + "epoch": 0.5295206747582802, + "grad_norm": 0.43956872191878316, + "learning_rate": 9.529747509436118e-06, + "loss": 0.4918, + "step": 6435 + }, + { + "epoch": 0.5296029623534252, + "grad_norm": 2.942299280697185, + "learning_rate": 9.527085171786259e-06, + "loss": 0.7636, + "step": 6436 + }, + { + "epoch": 0.5296852499485702, + "grad_norm": 3.381794102636625, + "learning_rate": 9.52442286773151e-06, + "loss": 0.753, + "step": 6437 + }, + { + "epoch": 0.5297675375437153, + "grad_norm": 2.808049304446277, + "learning_rate": 9.521760597461001e-06, + "loss": 0.7349, + "step": 6438 + }, + { + "epoch": 0.5298498251388604, + "grad_norm": 3.1132484233985895, + "learning_rate": 9.519098361163852e-06, + "loss": 0.7437, + "step": 6439 + }, + { + "epoch": 0.5299321127340053, + "grad_norm": 3.189511370239155, + "learning_rate": 9.51643615902919e-06, + "loss": 0.7152, + "step": 6440 + }, + { + "epoch": 0.5300144003291504, + "grad_norm": 3.2461369905120514, + "learning_rate": 9.513773991246127e-06, + "loss": 0.7271, + "step": 6441 + }, + { + "epoch": 0.5300966879242954, + "grad_norm": 0.4301502694981612, + "learning_rate": 9.511111858003785e-06, + "loss": 0.4876, + "step": 6442 + }, + { + "epoch": 0.5301789755194405, + "grad_norm": 2.2731878706223205, + "learning_rate": 9.508449759491272e-06, + "loss": 0.7688, + "step": 6443 + }, + { + "epoch": 0.5302612631145854, + "grad_norm": 2.5527826509298746, + "learning_rate": 9.505787695897705e-06, + "loss": 0.7399, + "step": 6444 + }, + { + "epoch": 0.5303435507097305, + "grad_norm": 2.768536289028072, + "learning_rate": 9.503125667412185e-06, + "loss": 0.7434, + "step": 6445 + }, + { + "epoch": 0.5304258383048756, + "grad_norm": 2.6052448391246203, + "learning_rate": 9.500463674223827e-06, + "loss": 0.7322, + "step": 6446 + }, + { + "epoch": 0.5305081259000206, + "grad_norm": 2.070882444957591, + "learning_rate": 9.497801716521728e-06, + "loss": 0.7166, + "step": 6447 + }, + { + "epoch": 0.5305904134951656, + "grad_norm": 2.9729516441887633, + "learning_rate": 9.495139794494995e-06, + "loss": 0.7375, + "step": 6448 + }, + { + "epoch": 0.5306727010903106, + "grad_norm": 2.331498790971716, + "learning_rate": 9.492477908332721e-06, + "loss": 0.738, + "step": 6449 + }, + { + "epoch": 0.5307549886854557, + "grad_norm": 3.327939829810355, + "learning_rate": 9.489816058224007e-06, + "loss": 0.7245, + "step": 6450 + }, + { + "epoch": 0.5308372762806007, + "grad_norm": 3.5750309363283828, + "learning_rate": 9.48715424435794e-06, + "loss": 0.7506, + "step": 6451 + }, + { + "epoch": 0.5309195638757457, + "grad_norm": 0.43097071211712523, + "learning_rate": 9.484492466923619e-06, + "loss": 0.5029, + "step": 6452 + }, + { + "epoch": 0.5310018514708907, + "grad_norm": 0.4211489848159116, + "learning_rate": 9.481830726110129e-06, + "loss": 0.4986, + "step": 6453 + }, + { + "epoch": 0.5310841390660358, + "grad_norm": 2.6416937450309175, + "learning_rate": 9.479169022106555e-06, + "loss": 0.7269, + "step": 6454 + }, + { + "epoch": 0.5311664266611809, + "grad_norm": 2.437267791116944, + "learning_rate": 9.476507355101979e-06, + "loss": 0.7544, + "step": 6455 + }, + { + "epoch": 0.5312487142563258, + "grad_norm": 2.529699260667492, + "learning_rate": 9.473845725285487e-06, + "loss": 0.7489, + "step": 6456 + }, + { + "epoch": 0.5313310018514709, + "grad_norm": 2.5342310906133148, + "learning_rate": 9.471184132846147e-06, + "loss": 0.744, + "step": 6457 + }, + { + "epoch": 0.5314132894466159, + "grad_norm": 3.4910784753777757, + "learning_rate": 9.468522577973048e-06, + "loss": 0.7468, + "step": 6458 + }, + { + "epoch": 0.531495577041761, + "grad_norm": 0.41085968482365354, + "learning_rate": 9.465861060855248e-06, + "loss": 0.5012, + "step": 6459 + }, + { + "epoch": 0.531577864636906, + "grad_norm": 3.2904688953843855, + "learning_rate": 9.46319958168183e-06, + "loss": 0.7274, + "step": 6460 + }, + { + "epoch": 0.531660152232051, + "grad_norm": 2.545599217045579, + "learning_rate": 9.460538140641851e-06, + "loss": 0.7514, + "step": 6461 + }, + { + "epoch": 0.531742439827196, + "grad_norm": 3.681160696783038, + "learning_rate": 9.457876737924383e-06, + "loss": 0.7211, + "step": 6462 + }, + { + "epoch": 0.5318247274223411, + "grad_norm": 2.8577060550458926, + "learning_rate": 9.45521537371848e-06, + "loss": 0.7477, + "step": 6463 + }, + { + "epoch": 0.5319070150174862, + "grad_norm": 2.5107896584997373, + "learning_rate": 9.452554048213211e-06, + "loss": 0.7608, + "step": 6464 + }, + { + "epoch": 0.5319893026126311, + "grad_norm": 2.4715637494888174, + "learning_rate": 9.449892761597622e-06, + "loss": 0.7547, + "step": 6465 + }, + { + "epoch": 0.5320715902077762, + "grad_norm": 2.9057387139899715, + "learning_rate": 9.447231514060775e-06, + "loss": 0.7358, + "step": 6466 + }, + { + "epoch": 0.5321538778029212, + "grad_norm": 2.893434604123407, + "learning_rate": 9.444570305791715e-06, + "loss": 0.7512, + "step": 6467 + }, + { + "epoch": 0.5322361653980663, + "grad_norm": 3.0218665973504972, + "learning_rate": 9.441909136979495e-06, + "loss": 0.7339, + "step": 6468 + }, + { + "epoch": 0.5323184529932112, + "grad_norm": 3.728585484803758, + "learning_rate": 9.439248007813155e-06, + "loss": 0.7693, + "step": 6469 + }, + { + "epoch": 0.5324007405883563, + "grad_norm": 2.6074090321875487, + "learning_rate": 9.436586918481742e-06, + "loss": 0.7267, + "step": 6470 + }, + { + "epoch": 0.5324830281835014, + "grad_norm": 3.649079735222573, + "learning_rate": 9.433925869174288e-06, + "loss": 0.7473, + "step": 6471 + }, + { + "epoch": 0.5325653157786464, + "grad_norm": 2.603464985419445, + "learning_rate": 9.43126486007984e-06, + "loss": 0.762, + "step": 6472 + }, + { + "epoch": 0.5326476033737914, + "grad_norm": 3.7784094228118157, + "learning_rate": 9.428603891387424e-06, + "loss": 0.7545, + "step": 6473 + }, + { + "epoch": 0.5327298909689364, + "grad_norm": 0.43891187539973253, + "learning_rate": 9.425942963286077e-06, + "loss": 0.5147, + "step": 6474 + }, + { + "epoch": 0.5328121785640815, + "grad_norm": 0.45617533513509323, + "learning_rate": 9.423282075964821e-06, + "loss": 0.5162, + "step": 6475 + }, + { + "epoch": 0.5328944661592265, + "grad_norm": 2.2234067633028256, + "learning_rate": 9.420621229612687e-06, + "loss": 0.784, + "step": 6476 + }, + { + "epoch": 0.5329767537543715, + "grad_norm": 2.464407649173562, + "learning_rate": 9.417960424418688e-06, + "loss": 0.7371, + "step": 6477 + }, + { + "epoch": 0.5330590413495165, + "grad_norm": 2.605155005690587, + "learning_rate": 9.415299660571856e-06, + "loss": 0.7265, + "step": 6478 + }, + { + "epoch": 0.5331413289446616, + "grad_norm": 2.423841937573343, + "learning_rate": 9.412638938261197e-06, + "loss": 0.7663, + "step": 6479 + }, + { + "epoch": 0.5332236165398067, + "grad_norm": 3.9914874570018086, + "learning_rate": 9.409978257675733e-06, + "loss": 0.7592, + "step": 6480 + }, + { + "epoch": 0.5333059041349516, + "grad_norm": 2.4378608821790078, + "learning_rate": 9.407317619004465e-06, + "loss": 0.7295, + "step": 6481 + }, + { + "epoch": 0.5333881917300967, + "grad_norm": 3.043152264855031, + "learning_rate": 9.40465702243641e-06, + "loss": 0.7401, + "step": 6482 + }, + { + "epoch": 0.5334704793252417, + "grad_norm": 2.3877650605444063, + "learning_rate": 9.401996468160563e-06, + "loss": 0.742, + "step": 6483 + }, + { + "epoch": 0.5335527669203868, + "grad_norm": 2.3820837923750564, + "learning_rate": 9.399335956365935e-06, + "loss": 0.7036, + "step": 6484 + }, + { + "epoch": 0.5336350545155318, + "grad_norm": 2.659279911510047, + "learning_rate": 9.396675487241516e-06, + "loss": 0.7147, + "step": 6485 + }, + { + "epoch": 0.5337173421106768, + "grad_norm": 3.2677720311573966, + "learning_rate": 9.39401506097631e-06, + "loss": 0.7255, + "step": 6486 + }, + { + "epoch": 0.5337996297058218, + "grad_norm": 4.048672864825397, + "learning_rate": 9.391354677759302e-06, + "loss": 0.7695, + "step": 6487 + }, + { + "epoch": 0.5338819173009669, + "grad_norm": 2.7770605911288575, + "learning_rate": 9.388694337779488e-06, + "loss": 0.7306, + "step": 6488 + }, + { + "epoch": 0.533964204896112, + "grad_norm": 0.44568873628866157, + "learning_rate": 9.386034041225843e-06, + "loss": 0.4999, + "step": 6489 + }, + { + "epoch": 0.5340464924912569, + "grad_norm": 2.4774398598873786, + "learning_rate": 9.383373788287367e-06, + "loss": 0.7447, + "step": 6490 + }, + { + "epoch": 0.534128780086402, + "grad_norm": 0.4457045151014345, + "learning_rate": 9.380713579153022e-06, + "loss": 0.5087, + "step": 6491 + }, + { + "epoch": 0.534211067681547, + "grad_norm": 2.486008490113522, + "learning_rate": 9.3780534140118e-06, + "loss": 0.7489, + "step": 6492 + }, + { + "epoch": 0.5342933552766921, + "grad_norm": 2.4342591756154897, + "learning_rate": 9.375393293052667e-06, + "loss": 0.7357, + "step": 6493 + }, + { + "epoch": 0.534375642871837, + "grad_norm": 2.3140738528059175, + "learning_rate": 9.372733216464597e-06, + "loss": 0.7558, + "step": 6494 + }, + { + "epoch": 0.5344579304669821, + "grad_norm": 3.655949052264836, + "learning_rate": 9.370073184436552e-06, + "loss": 0.7603, + "step": 6495 + }, + { + "epoch": 0.5345402180621271, + "grad_norm": 0.4236824301369135, + "learning_rate": 9.367413197157506e-06, + "loss": 0.5181, + "step": 6496 + }, + { + "epoch": 0.5346225056572722, + "grad_norm": 2.436887461767267, + "learning_rate": 9.36475325481641e-06, + "loss": 0.7472, + "step": 6497 + }, + { + "epoch": 0.5347047932524172, + "grad_norm": 3.248013525641095, + "learning_rate": 9.36209335760223e-06, + "loss": 0.7629, + "step": 6498 + }, + { + "epoch": 0.5347870808475622, + "grad_norm": 3.099840841673295, + "learning_rate": 9.359433505703917e-06, + "loss": 0.7569, + "step": 6499 + }, + { + "epoch": 0.5348693684427073, + "grad_norm": 2.3581327669554333, + "learning_rate": 9.356773699310424e-06, + "loss": 0.7625, + "step": 6500 + }, + { + "epoch": 0.5349516560378523, + "grad_norm": 0.48489745726215583, + "learning_rate": 9.354113938610695e-06, + "loss": 0.5299, + "step": 6501 + }, + { + "epoch": 0.5350339436329973, + "grad_norm": 3.0505486183686252, + "learning_rate": 9.351454223793685e-06, + "loss": 0.7503, + "step": 6502 + }, + { + "epoch": 0.5351162312281423, + "grad_norm": 2.3239467450107267, + "learning_rate": 9.348794555048323e-06, + "loss": 0.7374, + "step": 6503 + }, + { + "epoch": 0.5351985188232874, + "grad_norm": 2.6703473756761715, + "learning_rate": 9.34613493256356e-06, + "loss": 0.7468, + "step": 6504 + }, + { + "epoch": 0.5352808064184325, + "grad_norm": 2.698523650492583, + "learning_rate": 9.343475356528324e-06, + "loss": 0.7119, + "step": 6505 + }, + { + "epoch": 0.5353630940135774, + "grad_norm": 2.696625729974696, + "learning_rate": 9.340815827131549e-06, + "loss": 0.7157, + "step": 6506 + }, + { + "epoch": 0.5354453816087225, + "grad_norm": 4.219330195875581, + "learning_rate": 9.338156344562162e-06, + "loss": 0.7492, + "step": 6507 + }, + { + "epoch": 0.5355276692038675, + "grad_norm": 4.1396009010416215, + "learning_rate": 9.33549690900909e-06, + "loss": 0.7574, + "step": 6508 + }, + { + "epoch": 0.5356099567990126, + "grad_norm": 2.4269299885289506, + "learning_rate": 9.332837520661254e-06, + "loss": 0.7621, + "step": 6509 + }, + { + "epoch": 0.5356922443941576, + "grad_norm": 2.5231599067922756, + "learning_rate": 9.330178179707575e-06, + "loss": 0.6945, + "step": 6510 + }, + { + "epoch": 0.5357745319893026, + "grad_norm": 2.307280960071785, + "learning_rate": 9.327518886336968e-06, + "loss": 0.7487, + "step": 6511 + }, + { + "epoch": 0.5358568195844476, + "grad_norm": 2.6248946130298343, + "learning_rate": 9.324859640738343e-06, + "loss": 0.7324, + "step": 6512 + }, + { + "epoch": 0.5359391071795927, + "grad_norm": 2.753856497369798, + "learning_rate": 9.322200443100607e-06, + "loss": 0.7581, + "step": 6513 + }, + { + "epoch": 0.5360213947747378, + "grad_norm": 3.139032847756096, + "learning_rate": 9.319541293612672e-06, + "loss": 0.7708, + "step": 6514 + }, + { + "epoch": 0.5361036823698827, + "grad_norm": 2.2714614885635127, + "learning_rate": 9.31688219246343e-06, + "loss": 0.7512, + "step": 6515 + }, + { + "epoch": 0.5361859699650278, + "grad_norm": 2.227398350961639, + "learning_rate": 9.314223139841788e-06, + "loss": 0.7294, + "step": 6516 + }, + { + "epoch": 0.5362682575601728, + "grad_norm": 0.433482875301665, + "learning_rate": 9.311564135936635e-06, + "loss": 0.4989, + "step": 6517 + }, + { + "epoch": 0.5363505451553179, + "grad_norm": 2.4792998418775, + "learning_rate": 9.30890518093687e-06, + "loss": 0.7123, + "step": 6518 + }, + { + "epoch": 0.5364328327504628, + "grad_norm": 2.6483281974568826, + "learning_rate": 9.306246275031371e-06, + "loss": 0.7463, + "step": 6519 + }, + { + "epoch": 0.5365151203456079, + "grad_norm": 2.258361402530894, + "learning_rate": 9.303587418409029e-06, + "loss": 0.7358, + "step": 6520 + }, + { + "epoch": 0.536597407940753, + "grad_norm": 0.42739401767977375, + "learning_rate": 9.30092861125872e-06, + "loss": 0.5018, + "step": 6521 + }, + { + "epoch": 0.536679695535898, + "grad_norm": 2.6228908580529486, + "learning_rate": 9.298269853769328e-06, + "loss": 0.7574, + "step": 6522 + }, + { + "epoch": 0.536761983131043, + "grad_norm": 0.4471461266064599, + "learning_rate": 9.295611146129722e-06, + "loss": 0.5113, + "step": 6523 + }, + { + "epoch": 0.536844270726188, + "grad_norm": 2.6030381245135104, + "learning_rate": 9.292952488528774e-06, + "loss": 0.7363, + "step": 6524 + }, + { + "epoch": 0.5369265583213331, + "grad_norm": 2.210215906839458, + "learning_rate": 9.290293881155352e-06, + "loss": 0.7394, + "step": 6525 + }, + { + "epoch": 0.5370088459164781, + "grad_norm": 15.09439899137758, + "learning_rate": 9.287635324198316e-06, + "loss": 0.7431, + "step": 6526 + }, + { + "epoch": 0.5370911335116231, + "grad_norm": 2.111890663133796, + "learning_rate": 9.284976817846526e-06, + "loss": 0.7308, + "step": 6527 + }, + { + "epoch": 0.5371734211067681, + "grad_norm": 0.421408437931751, + "learning_rate": 9.282318362288843e-06, + "loss": 0.497, + "step": 6528 + }, + { + "epoch": 0.5372557087019132, + "grad_norm": 2.7277512645048816, + "learning_rate": 9.27965995771411e-06, + "loss": 0.7541, + "step": 6529 + }, + { + "epoch": 0.5373379962970583, + "grad_norm": 2.685529048892094, + "learning_rate": 9.277001604311186e-06, + "loss": 0.7485, + "step": 6530 + }, + { + "epoch": 0.5374202838922032, + "grad_norm": 2.541546208945715, + "learning_rate": 9.274343302268911e-06, + "loss": 0.7519, + "step": 6531 + }, + { + "epoch": 0.5375025714873483, + "grad_norm": 4.550814760400207, + "learning_rate": 9.271685051776128e-06, + "loss": 0.7681, + "step": 6532 + }, + { + "epoch": 0.5375848590824933, + "grad_norm": 2.952267731911811, + "learning_rate": 9.269026853021672e-06, + "loss": 0.726, + "step": 6533 + }, + { + "epoch": 0.5376671466776384, + "grad_norm": 2.5608790502148597, + "learning_rate": 9.266368706194383e-06, + "loss": 0.7363, + "step": 6534 + }, + { + "epoch": 0.5377494342727834, + "grad_norm": 2.878984518753457, + "learning_rate": 9.263710611483083e-06, + "loss": 0.7588, + "step": 6535 + }, + { + "epoch": 0.5378317218679284, + "grad_norm": 2.274358386408207, + "learning_rate": 9.261052569076608e-06, + "loss": 0.7579, + "step": 6536 + }, + { + "epoch": 0.5379140094630734, + "grad_norm": 3.819615647410497, + "learning_rate": 9.258394579163773e-06, + "loss": 0.7202, + "step": 6537 + }, + { + "epoch": 0.5379962970582185, + "grad_norm": 5.105574226220051, + "learning_rate": 9.255736641933404e-06, + "loss": 0.7365, + "step": 6538 + }, + { + "epoch": 0.5380785846533636, + "grad_norm": 2.258820351446526, + "learning_rate": 9.253078757574312e-06, + "loss": 0.7328, + "step": 6539 + }, + { + "epoch": 0.5381608722485085, + "grad_norm": 3.47182677897865, + "learning_rate": 9.250420926275312e-06, + "loss": 0.7325, + "step": 6540 + }, + { + "epoch": 0.5382431598436536, + "grad_norm": 2.8680147998805827, + "learning_rate": 9.247763148225208e-06, + "loss": 0.7148, + "step": 6541 + }, + { + "epoch": 0.5383254474387986, + "grad_norm": 2.2196761346007143, + "learning_rate": 9.24510542361281e-06, + "loss": 0.732, + "step": 6542 + }, + { + "epoch": 0.5384077350339437, + "grad_norm": 3.000356155889897, + "learning_rate": 9.242447752626912e-06, + "loss": 0.7224, + "step": 6543 + }, + { + "epoch": 0.5384900226290886, + "grad_norm": 2.7013584437628566, + "learning_rate": 9.239790135456318e-06, + "loss": 0.7362, + "step": 6544 + }, + { + "epoch": 0.5385723102242337, + "grad_norm": 0.43442571687963477, + "learning_rate": 9.237132572289816e-06, + "loss": 0.5277, + "step": 6545 + }, + { + "epoch": 0.5386545978193787, + "grad_norm": 3.1937450227907838, + "learning_rate": 9.234475063316195e-06, + "loss": 0.7318, + "step": 6546 + }, + { + "epoch": 0.5387368854145238, + "grad_norm": 2.3712725963262886, + "learning_rate": 9.23181760872424e-06, + "loss": 0.718, + "step": 6547 + }, + { + "epoch": 0.5388191730096688, + "grad_norm": 0.43455482056411865, + "learning_rate": 9.229160208702735e-06, + "loss": 0.4822, + "step": 6548 + }, + { + "epoch": 0.5389014606048138, + "grad_norm": 2.6798804885443963, + "learning_rate": 9.226502863440456e-06, + "loss": 0.7271, + "step": 6549 + }, + { + "epoch": 0.5389837481999589, + "grad_norm": 2.6029976945232263, + "learning_rate": 9.223845573126176e-06, + "loss": 0.7428, + "step": 6550 + }, + { + "epoch": 0.5390660357951039, + "grad_norm": 2.850629487647868, + "learning_rate": 9.221188337948666e-06, + "loss": 0.7697, + "step": 6551 + }, + { + "epoch": 0.5391483233902489, + "grad_norm": 2.5471432738750424, + "learning_rate": 9.218531158096688e-06, + "loss": 0.7033, + "step": 6552 + }, + { + "epoch": 0.5392306109853939, + "grad_norm": 2.9111848257089226, + "learning_rate": 9.215874033759011e-06, + "loss": 0.734, + "step": 6553 + }, + { + "epoch": 0.539312898580539, + "grad_norm": 3.1231293020799167, + "learning_rate": 9.213216965124386e-06, + "loss": 0.7604, + "step": 6554 + }, + { + "epoch": 0.539395186175684, + "grad_norm": 2.6406608937882594, + "learning_rate": 9.210559952381573e-06, + "loss": 0.7311, + "step": 6555 + }, + { + "epoch": 0.539477473770829, + "grad_norm": 2.2326013888097873, + "learning_rate": 9.207902995719316e-06, + "loss": 0.744, + "step": 6556 + }, + { + "epoch": 0.539559761365974, + "grad_norm": 2.6890373025552776, + "learning_rate": 9.205246095326364e-06, + "loss": 0.755, + "step": 6557 + }, + { + "epoch": 0.5396420489611191, + "grad_norm": 2.316867261324287, + "learning_rate": 9.202589251391454e-06, + "loss": 0.7624, + "step": 6558 + }, + { + "epoch": 0.5397243365562642, + "grad_norm": 0.4472903248654255, + "learning_rate": 9.199932464103335e-06, + "loss": 0.5052, + "step": 6559 + }, + { + "epoch": 0.5398066241514091, + "grad_norm": 2.6391980531872483, + "learning_rate": 9.19727573365073e-06, + "loss": 0.7499, + "step": 6560 + }, + { + "epoch": 0.5398889117465542, + "grad_norm": 0.4070023972864873, + "learning_rate": 9.194619060222376e-06, + "loss": 0.487, + "step": 6561 + }, + { + "epoch": 0.5399711993416992, + "grad_norm": 0.41946295083101315, + "learning_rate": 9.191962444006994e-06, + "loss": 0.5088, + "step": 6562 + }, + { + "epoch": 0.5400534869368443, + "grad_norm": 2.612576798150405, + "learning_rate": 9.189305885193308e-06, + "loss": 0.7497, + "step": 6563 + }, + { + "epoch": 0.5401357745319894, + "grad_norm": 0.4105477732083668, + "learning_rate": 9.186649383970035e-06, + "loss": 0.4752, + "step": 6564 + }, + { + "epoch": 0.5402180621271343, + "grad_norm": 2.4691576062454264, + "learning_rate": 9.18399294052589e-06, + "loss": 0.7513, + "step": 6565 + }, + { + "epoch": 0.5403003497222794, + "grad_norm": 2.473665340733756, + "learning_rate": 9.18133655504958e-06, + "loss": 0.746, + "step": 6566 + }, + { + "epoch": 0.5403826373174244, + "grad_norm": 2.6060348136603877, + "learning_rate": 9.178680227729815e-06, + "loss": 0.76, + "step": 6567 + }, + { + "epoch": 0.5404649249125695, + "grad_norm": 2.047321451683292, + "learning_rate": 9.17602395875529e-06, + "loss": 0.7353, + "step": 6568 + }, + { + "epoch": 0.5405472125077144, + "grad_norm": 2.320428957048217, + "learning_rate": 9.173367748314707e-06, + "loss": 0.7553, + "step": 6569 + }, + { + "epoch": 0.5406295001028595, + "grad_norm": 0.4444097426401769, + "learning_rate": 9.170711596596754e-06, + "loss": 0.4853, + "step": 6570 + }, + { + "epoch": 0.5407117876980045, + "grad_norm": 2.476620209016738, + "learning_rate": 9.16805550379013e-06, + "loss": 0.7184, + "step": 6571 + }, + { + "epoch": 0.5407940752931496, + "grad_norm": 2.4075890070685295, + "learning_rate": 9.165399470083505e-06, + "loss": 0.7997, + "step": 6572 + }, + { + "epoch": 0.5408763628882945, + "grad_norm": 2.410637686962245, + "learning_rate": 9.16274349566557e-06, + "loss": 0.7527, + "step": 6573 + }, + { + "epoch": 0.5409586504834396, + "grad_norm": 0.44014456558943443, + "learning_rate": 9.160087580724998e-06, + "loss": 0.5134, + "step": 6574 + }, + { + "epoch": 0.5410409380785847, + "grad_norm": 7.4171534592766815, + "learning_rate": 9.157431725450463e-06, + "loss": 0.7655, + "step": 6575 + }, + { + "epoch": 0.5411232256737297, + "grad_norm": 4.932141244123582, + "learning_rate": 9.154775930030625e-06, + "loss": 0.7476, + "step": 6576 + }, + { + "epoch": 0.5412055132688747, + "grad_norm": 1.8202755881861639, + "learning_rate": 9.15212019465416e-06, + "loss": 0.7215, + "step": 6577 + }, + { + "epoch": 0.5412878008640197, + "grad_norm": 2.7803494514979743, + "learning_rate": 9.149464519509714e-06, + "loss": 0.7408, + "step": 6578 + }, + { + "epoch": 0.5413700884591648, + "grad_norm": 2.3784075478592377, + "learning_rate": 9.146808904785953e-06, + "loss": 0.773, + "step": 6579 + }, + { + "epoch": 0.5414523760543098, + "grad_norm": 1.962592398024929, + "learning_rate": 9.144153350671521e-06, + "loss": 0.7374, + "step": 6580 + }, + { + "epoch": 0.5415346636494548, + "grad_norm": 2.5009068459658486, + "learning_rate": 9.141497857355068e-06, + "loss": 0.7612, + "step": 6581 + }, + { + "epoch": 0.5416169512445999, + "grad_norm": 2.582272463519278, + "learning_rate": 9.13884242502523e-06, + "loss": 0.7561, + "step": 6582 + }, + { + "epoch": 0.5416992388397449, + "grad_norm": 2.2311434401012553, + "learning_rate": 9.136187053870655e-06, + "loss": 0.7669, + "step": 6583 + }, + { + "epoch": 0.54178152643489, + "grad_norm": 0.4375109595309301, + "learning_rate": 9.133531744079965e-06, + "loss": 0.5293, + "step": 6584 + }, + { + "epoch": 0.5418638140300349, + "grad_norm": 3.2956442069394747, + "learning_rate": 9.130876495841797e-06, + "loss": 0.7619, + "step": 6585 + }, + { + "epoch": 0.54194610162518, + "grad_norm": 2.443956615654636, + "learning_rate": 9.128221309344771e-06, + "loss": 0.7312, + "step": 6586 + }, + { + "epoch": 0.542028389220325, + "grad_norm": 0.4105975561056929, + "learning_rate": 9.125566184777512e-06, + "loss": 0.5078, + "step": 6587 + }, + { + "epoch": 0.5421106768154701, + "grad_norm": 2.34315660921629, + "learning_rate": 9.122911122328629e-06, + "loss": 0.7232, + "step": 6588 + }, + { + "epoch": 0.5421929644106152, + "grad_norm": 2.3463137484545262, + "learning_rate": 9.120256122186742e-06, + "loss": 0.7375, + "step": 6589 + }, + { + "epoch": 0.5422752520057601, + "grad_norm": 6.052322630536959, + "learning_rate": 9.117601184540446e-06, + "loss": 0.7128, + "step": 6590 + }, + { + "epoch": 0.5423575396009052, + "grad_norm": 2.554167583025512, + "learning_rate": 9.114946309578356e-06, + "loss": 0.7607, + "step": 6591 + }, + { + "epoch": 0.5424398271960502, + "grad_norm": 2.2322503699939076, + "learning_rate": 9.112291497489063e-06, + "loss": 0.7296, + "step": 6592 + }, + { + "epoch": 0.5425221147911953, + "grad_norm": 3.3909022834624167, + "learning_rate": 9.109636748461165e-06, + "loss": 0.6996, + "step": 6593 + }, + { + "epoch": 0.5426044023863402, + "grad_norm": 2.4776793158633708, + "learning_rate": 9.106982062683245e-06, + "loss": 0.7524, + "step": 6594 + }, + { + "epoch": 0.5426866899814853, + "grad_norm": 2.5990860347799507, + "learning_rate": 9.104327440343893e-06, + "loss": 0.7612, + "step": 6595 + }, + { + "epoch": 0.5427689775766303, + "grad_norm": 2.5415503725735626, + "learning_rate": 9.101672881631685e-06, + "loss": 0.7451, + "step": 6596 + }, + { + "epoch": 0.5428512651717754, + "grad_norm": 2.6103847670278526, + "learning_rate": 9.099018386735201e-06, + "loss": 0.7438, + "step": 6597 + }, + { + "epoch": 0.5429335527669203, + "grad_norm": 3.5119700553639253, + "learning_rate": 9.096363955843008e-06, + "loss": 0.7113, + "step": 6598 + }, + { + "epoch": 0.5430158403620654, + "grad_norm": 2.724968757264959, + "learning_rate": 9.093709589143677e-06, + "loss": 0.7401, + "step": 6599 + }, + { + "epoch": 0.5430981279572105, + "grad_norm": 0.43458558174849676, + "learning_rate": 9.091055286825766e-06, + "loss": 0.5201, + "step": 6600 + }, + { + "epoch": 0.5431804155523555, + "grad_norm": 3.0295076661503226, + "learning_rate": 9.088401049077835e-06, + "loss": 0.7335, + "step": 6601 + }, + { + "epoch": 0.5432627031475005, + "grad_norm": 2.278239923789596, + "learning_rate": 9.085746876088433e-06, + "loss": 0.7242, + "step": 6602 + }, + { + "epoch": 0.5433449907426455, + "grad_norm": 2.2612578274261743, + "learning_rate": 9.083092768046113e-06, + "loss": 0.763, + "step": 6603 + }, + { + "epoch": 0.5434272783377906, + "grad_norm": 0.4255587135096718, + "learning_rate": 9.080438725139415e-06, + "loss": 0.4972, + "step": 6604 + }, + { + "epoch": 0.5435095659329356, + "grad_norm": 2.4853124186622724, + "learning_rate": 9.077784747556882e-06, + "loss": 0.7441, + "step": 6605 + }, + { + "epoch": 0.5435918535280806, + "grad_norm": 2.353792313481416, + "learning_rate": 9.075130835487042e-06, + "loss": 0.7346, + "step": 6606 + }, + { + "epoch": 0.5436741411232257, + "grad_norm": 2.3109071782246575, + "learning_rate": 9.072476989118432e-06, + "loss": 0.7424, + "step": 6607 + }, + { + "epoch": 0.5437564287183707, + "grad_norm": 2.4208336918142117, + "learning_rate": 9.069823208639567e-06, + "loss": 0.7606, + "step": 6608 + }, + { + "epoch": 0.5438387163135158, + "grad_norm": 2.2279036431160457, + "learning_rate": 9.067169494238983e-06, + "loss": 0.716, + "step": 6609 + }, + { + "epoch": 0.5439210039086607, + "grad_norm": 0.41530239634302757, + "learning_rate": 9.064515846105177e-06, + "loss": 0.4862, + "step": 6610 + }, + { + "epoch": 0.5440032915038058, + "grad_norm": 3.121175536329775, + "learning_rate": 9.061862264426676e-06, + "loss": 0.7379, + "step": 6611 + }, + { + "epoch": 0.5440855790989508, + "grad_norm": 2.198210961389105, + "learning_rate": 9.059208749391976e-06, + "loss": 0.712, + "step": 6612 + }, + { + "epoch": 0.5441678666940959, + "grad_norm": 3.8850041637488837, + "learning_rate": 9.056555301189584e-06, + "loss": 0.7365, + "step": 6613 + }, + { + "epoch": 0.544250154289241, + "grad_norm": 0.42078819109700943, + "learning_rate": 9.053901920007991e-06, + "loss": 0.5074, + "step": 6614 + }, + { + "epoch": 0.5443324418843859, + "grad_norm": 2.1090766058183914, + "learning_rate": 9.0512486060357e-06, + "loss": 0.741, + "step": 6615 + }, + { + "epoch": 0.544414729479531, + "grad_norm": 4.7434914751390975, + "learning_rate": 9.04859535946118e-06, + "loss": 0.7605, + "step": 6616 + }, + { + "epoch": 0.544497017074676, + "grad_norm": 2.4448752106087235, + "learning_rate": 9.045942180472932e-06, + "loss": 0.7369, + "step": 6617 + }, + { + "epoch": 0.5445793046698211, + "grad_norm": 3.799962697007861, + "learning_rate": 9.043289069259423e-06, + "loss": 0.7465, + "step": 6618 + }, + { + "epoch": 0.544661592264966, + "grad_norm": 2.685497669902693, + "learning_rate": 9.04063602600913e-06, + "loss": 0.7243, + "step": 6619 + }, + { + "epoch": 0.5447438798601111, + "grad_norm": 2.2707226157175295, + "learning_rate": 9.037983050910518e-06, + "loss": 0.7239, + "step": 6620 + }, + { + "epoch": 0.5448261674552561, + "grad_norm": 2.3221047877636614, + "learning_rate": 9.035330144152053e-06, + "loss": 0.739, + "step": 6621 + }, + { + "epoch": 0.5449084550504012, + "grad_norm": 2.081806068291908, + "learning_rate": 9.032677305922188e-06, + "loss": 0.7401, + "step": 6622 + }, + { + "epoch": 0.5449907426455461, + "grad_norm": 2.652515700037123, + "learning_rate": 9.030024536409385e-06, + "loss": 0.7391, + "step": 6623 + }, + { + "epoch": 0.5450730302406912, + "grad_norm": 2.5733013339012634, + "learning_rate": 9.027371835802085e-06, + "loss": 0.7198, + "step": 6624 + }, + { + "epoch": 0.5451553178358363, + "grad_norm": 2.565345131580304, + "learning_rate": 9.024719204288735e-06, + "loss": 0.7359, + "step": 6625 + }, + { + "epoch": 0.5452376054309813, + "grad_norm": 1.9962908454043604, + "learning_rate": 9.022066642057772e-06, + "loss": 0.7436, + "step": 6626 + }, + { + "epoch": 0.5453198930261263, + "grad_norm": 2.5876912213471366, + "learning_rate": 9.019414149297635e-06, + "loss": 0.7368, + "step": 6627 + }, + { + "epoch": 0.5454021806212713, + "grad_norm": 0.4287576042531908, + "learning_rate": 9.016761726196741e-06, + "loss": 0.4641, + "step": 6628 + }, + { + "epoch": 0.5454844682164164, + "grad_norm": 2.194211281286066, + "learning_rate": 9.014109372943526e-06, + "loss": 0.7301, + "step": 6629 + }, + { + "epoch": 0.5455667558115614, + "grad_norm": 0.40944141887000013, + "learning_rate": 9.011457089726406e-06, + "loss": 0.4884, + "step": 6630 + }, + { + "epoch": 0.5456490434067064, + "grad_norm": 2.0277307900277366, + "learning_rate": 9.008804876733792e-06, + "loss": 0.7272, + "step": 6631 + }, + { + "epoch": 0.5457313310018514, + "grad_norm": 1.9825828233044607, + "learning_rate": 9.006152734154093e-06, + "loss": 0.7382, + "step": 6632 + }, + { + "epoch": 0.5458136185969965, + "grad_norm": 3.3378250840705124, + "learning_rate": 9.003500662175717e-06, + "loss": 0.7283, + "step": 6633 + }, + { + "epoch": 0.5458959061921416, + "grad_norm": 2.3027596090171865, + "learning_rate": 9.000848660987056e-06, + "loss": 0.7563, + "step": 6634 + }, + { + "epoch": 0.5459781937872865, + "grad_norm": 2.553755342269563, + "learning_rate": 8.998196730776512e-06, + "loss": 0.73, + "step": 6635 + }, + { + "epoch": 0.5460604813824316, + "grad_norm": 2.292201453957403, + "learning_rate": 8.99554487173247e-06, + "loss": 0.7454, + "step": 6636 + }, + { + "epoch": 0.5461427689775766, + "grad_norm": 3.188000489256651, + "learning_rate": 8.992893084043313e-06, + "loss": 0.747, + "step": 6637 + }, + { + "epoch": 0.5462250565727217, + "grad_norm": 2.5753985552077134, + "learning_rate": 8.99024136789742e-06, + "loss": 0.7428, + "step": 6638 + }, + { + "epoch": 0.5463073441678667, + "grad_norm": 2.847125156377996, + "learning_rate": 8.987589723483166e-06, + "loss": 0.766, + "step": 6639 + }, + { + "epoch": 0.5463896317630117, + "grad_norm": 9.897271780512513, + "learning_rate": 8.984938150988917e-06, + "loss": 0.7492, + "step": 6640 + }, + { + "epoch": 0.5464719193581568, + "grad_norm": 2.630814258684138, + "learning_rate": 8.982286650603044e-06, + "loss": 0.7255, + "step": 6641 + }, + { + "epoch": 0.5465542069533018, + "grad_norm": 0.44689171697080726, + "learning_rate": 8.979635222513892e-06, + "loss": 0.492, + "step": 6642 + }, + { + "epoch": 0.5466364945484469, + "grad_norm": 2.3660493103998887, + "learning_rate": 8.976983866909828e-06, + "loss": 0.7379, + "step": 6643 + }, + { + "epoch": 0.5467187821435918, + "grad_norm": 3.0227472100458908, + "learning_rate": 8.974332583979192e-06, + "loss": 0.7358, + "step": 6644 + }, + { + "epoch": 0.5468010697387369, + "grad_norm": 1.8970493070167012, + "learning_rate": 8.97168137391033e-06, + "loss": 0.7453, + "step": 6645 + }, + { + "epoch": 0.5468833573338819, + "grad_norm": 1.8761041511960028, + "learning_rate": 8.969030236891575e-06, + "loss": 0.7382, + "step": 6646 + }, + { + "epoch": 0.546965644929027, + "grad_norm": 2.532634964587329, + "learning_rate": 8.96637917311127e-06, + "loss": 0.7486, + "step": 6647 + }, + { + "epoch": 0.5470479325241719, + "grad_norm": 2.418146177030269, + "learning_rate": 8.963728182757728e-06, + "loss": 0.7381, + "step": 6648 + }, + { + "epoch": 0.547130220119317, + "grad_norm": 3.1292715341263793, + "learning_rate": 8.961077266019283e-06, + "loss": 0.7405, + "step": 6649 + }, + { + "epoch": 0.5472125077144621, + "grad_norm": 2.5213307132025533, + "learning_rate": 8.958426423084246e-06, + "loss": 0.7345, + "step": 6650 + }, + { + "epoch": 0.5472947953096071, + "grad_norm": 2.4737235910017614, + "learning_rate": 8.955775654140931e-06, + "loss": 0.7617, + "step": 6651 + }, + { + "epoch": 0.5473770829047521, + "grad_norm": 2.3679589434897377, + "learning_rate": 8.953124959377642e-06, + "loss": 0.7466, + "step": 6652 + }, + { + "epoch": 0.5474593704998971, + "grad_norm": 1.9427039099287224, + "learning_rate": 8.950474338982684e-06, + "loss": 0.7124, + "step": 6653 + }, + { + "epoch": 0.5475416580950422, + "grad_norm": 2.066388733025409, + "learning_rate": 8.947823793144347e-06, + "loss": 0.7306, + "step": 6654 + }, + { + "epoch": 0.5476239456901872, + "grad_norm": 2.4373680915793323, + "learning_rate": 8.945173322050929e-06, + "loss": 0.7219, + "step": 6655 + }, + { + "epoch": 0.5477062332853322, + "grad_norm": 2.5447014115665354, + "learning_rate": 8.942522925890708e-06, + "loss": 0.7178, + "step": 6656 + }, + { + "epoch": 0.5477885208804772, + "grad_norm": 3.039018471672431, + "learning_rate": 8.93987260485197e-06, + "loss": 0.7239, + "step": 6657 + }, + { + "epoch": 0.5478708084756223, + "grad_norm": 2.2230398351011167, + "learning_rate": 8.937222359122986e-06, + "loss": 0.7342, + "step": 6658 + }, + { + "epoch": 0.5479530960707674, + "grad_norm": 2.6168178685191856, + "learning_rate": 8.934572188892026e-06, + "loss": 0.7548, + "step": 6659 + }, + { + "epoch": 0.5480353836659123, + "grad_norm": 2.3439517562147274, + "learning_rate": 8.931922094347351e-06, + "loss": 0.7373, + "step": 6660 + }, + { + "epoch": 0.5481176712610574, + "grad_norm": 2.1039753197019992, + "learning_rate": 8.929272075677225e-06, + "loss": 0.7376, + "step": 6661 + }, + { + "epoch": 0.5481999588562024, + "grad_norm": 2.022687460859279, + "learning_rate": 8.926622133069898e-06, + "loss": 0.7307, + "step": 6662 + }, + { + "epoch": 0.5482822464513475, + "grad_norm": 2.660399733246982, + "learning_rate": 8.923972266713619e-06, + "loss": 0.7607, + "step": 6663 + }, + { + "epoch": 0.5483645340464925, + "grad_norm": 3.779837869067736, + "learning_rate": 8.921322476796627e-06, + "loss": 0.7277, + "step": 6664 + }, + { + "epoch": 0.5484468216416375, + "grad_norm": 2.4500388179443666, + "learning_rate": 8.918672763507164e-06, + "loss": 0.731, + "step": 6665 + }, + { + "epoch": 0.5485291092367826, + "grad_norm": 2.1015778637194216, + "learning_rate": 8.916023127033453e-06, + "loss": 0.7427, + "step": 6666 + }, + { + "epoch": 0.5486113968319276, + "grad_norm": 0.4351644949700402, + "learning_rate": 8.913373567563731e-06, + "loss": 0.5112, + "step": 6667 + }, + { + "epoch": 0.5486936844270727, + "grad_norm": 0.43661733805795916, + "learning_rate": 8.91072408528621e-06, + "loss": 0.5071, + "step": 6668 + }, + { + "epoch": 0.5487759720222176, + "grad_norm": 2.1941441805576196, + "learning_rate": 8.90807468038911e-06, + "loss": 0.7338, + "step": 6669 + }, + { + "epoch": 0.5488582596173627, + "grad_norm": 0.4071765939580612, + "learning_rate": 8.905425353060639e-06, + "loss": 0.4803, + "step": 6670 + }, + { + "epoch": 0.5489405472125077, + "grad_norm": 2.4467140218991505, + "learning_rate": 8.902776103489e-06, + "loss": 0.7515, + "step": 6671 + }, + { + "epoch": 0.5490228348076528, + "grad_norm": 0.41738826940050416, + "learning_rate": 8.900126931862388e-06, + "loss": 0.4906, + "step": 6672 + }, + { + "epoch": 0.5491051224027977, + "grad_norm": 0.4264123261969366, + "learning_rate": 8.897477838369004e-06, + "loss": 0.445, + "step": 6673 + }, + { + "epoch": 0.5491874099979428, + "grad_norm": 2.1592430812668426, + "learning_rate": 8.89482882319703e-06, + "loss": 0.7396, + "step": 6674 + }, + { + "epoch": 0.5492696975930879, + "grad_norm": 2.9480223106439363, + "learning_rate": 8.89217988653465e-06, + "loss": 0.7203, + "step": 6675 + }, + { + "epoch": 0.5493519851882329, + "grad_norm": 2.3882928444645475, + "learning_rate": 8.889531028570037e-06, + "loss": 0.743, + "step": 6676 + }, + { + "epoch": 0.5494342727833779, + "grad_norm": 2.079792953179712, + "learning_rate": 8.886882249491366e-06, + "loss": 0.7411, + "step": 6677 + }, + { + "epoch": 0.5495165603785229, + "grad_norm": 2.1475388007391536, + "learning_rate": 8.884233549486796e-06, + "loss": 0.743, + "step": 6678 + }, + { + "epoch": 0.549598847973668, + "grad_norm": 2.2428618044864868, + "learning_rate": 8.881584928744497e-06, + "loss": 0.7511, + "step": 6679 + }, + { + "epoch": 0.549681135568813, + "grad_norm": 2.440553925262461, + "learning_rate": 8.878936387452608e-06, + "loss": 0.74, + "step": 6680 + }, + { + "epoch": 0.549763423163958, + "grad_norm": 2.1091600006933846, + "learning_rate": 8.876287925799291e-06, + "loss": 0.7078, + "step": 6681 + }, + { + "epoch": 0.549845710759103, + "grad_norm": 2.1994783035978247, + "learning_rate": 8.87363954397268e-06, + "loss": 0.74, + "step": 6682 + }, + { + "epoch": 0.5499279983542481, + "grad_norm": 4.386185401573048, + "learning_rate": 8.870991242160916e-06, + "loss": 0.7336, + "step": 6683 + }, + { + "epoch": 0.5500102859493932, + "grad_norm": 2.033138909426715, + "learning_rate": 8.868343020552125e-06, + "loss": 0.7474, + "step": 6684 + }, + { + "epoch": 0.5500925735445381, + "grad_norm": 2.7343311712493352, + "learning_rate": 8.865694879334443e-06, + "loss": 0.7481, + "step": 6685 + }, + { + "epoch": 0.5501748611396832, + "grad_norm": 2.750567367163172, + "learning_rate": 8.863046818695976e-06, + "loss": 0.7505, + "step": 6686 + }, + { + "epoch": 0.5502571487348282, + "grad_norm": 2.5073414139762096, + "learning_rate": 8.860398838824848e-06, + "loss": 0.7405, + "step": 6687 + }, + { + "epoch": 0.5503394363299733, + "grad_norm": 2.4536217402643064, + "learning_rate": 8.857750939909161e-06, + "loss": 0.7368, + "step": 6688 + }, + { + "epoch": 0.5504217239251182, + "grad_norm": 2.999682783209976, + "learning_rate": 8.855103122137024e-06, + "loss": 0.728, + "step": 6689 + }, + { + "epoch": 0.5505040115202633, + "grad_norm": 0.4415670865903558, + "learning_rate": 8.85245538569653e-06, + "loss": 0.4923, + "step": 6690 + }, + { + "epoch": 0.5505862991154084, + "grad_norm": 2.553889000794622, + "learning_rate": 8.84980773077577e-06, + "loss": 0.7476, + "step": 6691 + }, + { + "epoch": 0.5506685867105534, + "grad_norm": 2.4950862335341104, + "learning_rate": 8.847160157562824e-06, + "loss": 0.7645, + "step": 6692 + }, + { + "epoch": 0.5507508743056985, + "grad_norm": 2.314000443601787, + "learning_rate": 8.844512666245783e-06, + "loss": 0.7399, + "step": 6693 + }, + { + "epoch": 0.5508331619008434, + "grad_norm": 0.540389265654608, + "learning_rate": 8.841865257012712e-06, + "loss": 0.4973, + "step": 6694 + }, + { + "epoch": 0.5509154494959885, + "grad_norm": 2.2741008534828717, + "learning_rate": 8.839217930051683e-06, + "loss": 0.7621, + "step": 6695 + }, + { + "epoch": 0.5509977370911335, + "grad_norm": 1.9262693521054346, + "learning_rate": 8.836570685550752e-06, + "loss": 0.7286, + "step": 6696 + }, + { + "epoch": 0.5510800246862786, + "grad_norm": 1.9932643998793076, + "learning_rate": 8.833923523697982e-06, + "loss": 0.712, + "step": 6697 + }, + { + "epoch": 0.5511623122814235, + "grad_norm": 0.442764614878778, + "learning_rate": 8.831276444681417e-06, + "loss": 0.4861, + "step": 6698 + }, + { + "epoch": 0.5512445998765686, + "grad_norm": 2.236201885295573, + "learning_rate": 8.828629448689108e-06, + "loss": 0.7775, + "step": 6699 + }, + { + "epoch": 0.5513268874717137, + "grad_norm": 0.4234306129647931, + "learning_rate": 8.825982535909086e-06, + "loss": 0.4834, + "step": 6700 + }, + { + "epoch": 0.5514091750668587, + "grad_norm": 2.4461523587318204, + "learning_rate": 8.823335706529392e-06, + "loss": 0.7233, + "step": 6701 + }, + { + "epoch": 0.5514914626620037, + "grad_norm": 3.7850557757702576, + "learning_rate": 8.820688960738043e-06, + "loss": 0.7417, + "step": 6702 + }, + { + "epoch": 0.5515737502571487, + "grad_norm": 2.1381726552097526, + "learning_rate": 8.818042298723066e-06, + "loss": 0.7458, + "step": 6703 + }, + { + "epoch": 0.5516560378522938, + "grad_norm": 0.39792099867667885, + "learning_rate": 8.815395720672472e-06, + "loss": 0.471, + "step": 6704 + }, + { + "epoch": 0.5517383254474388, + "grad_norm": 2.4123348881952307, + "learning_rate": 8.812749226774274e-06, + "loss": 0.7559, + "step": 6705 + }, + { + "epoch": 0.5518206130425838, + "grad_norm": 0.4284688672412533, + "learning_rate": 8.81010281721647e-06, + "loss": 0.4786, + "step": 6706 + }, + { + "epoch": 0.5519029006377288, + "grad_norm": 2.081941959365609, + "learning_rate": 8.807456492187062e-06, + "loss": 0.7605, + "step": 6707 + }, + { + "epoch": 0.5519851882328739, + "grad_norm": 2.7348191857624524, + "learning_rate": 8.804810251874035e-06, + "loss": 0.736, + "step": 6708 + }, + { + "epoch": 0.552067475828019, + "grad_norm": 0.4369973892657782, + "learning_rate": 8.802164096465379e-06, + "loss": 0.5307, + "step": 6709 + }, + { + "epoch": 0.5521497634231639, + "grad_norm": 2.4977828295476696, + "learning_rate": 8.799518026149066e-06, + "loss": 0.7552, + "step": 6710 + }, + { + "epoch": 0.552232051018309, + "grad_norm": 2.427577576628341, + "learning_rate": 8.796872041113077e-06, + "loss": 0.7509, + "step": 6711 + }, + { + "epoch": 0.552314338613454, + "grad_norm": 0.4411767970044333, + "learning_rate": 8.79422614154537e-06, + "loss": 0.5084, + "step": 6712 + }, + { + "epoch": 0.5523966262085991, + "grad_norm": 2.237773286907074, + "learning_rate": 8.791580327633912e-06, + "loss": 0.7189, + "step": 6713 + }, + { + "epoch": 0.552478913803744, + "grad_norm": 3.3099836081302527, + "learning_rate": 8.788934599566654e-06, + "loss": 0.7061, + "step": 6714 + }, + { + "epoch": 0.5525612013988891, + "grad_norm": 2.7229043444175467, + "learning_rate": 8.786288957531546e-06, + "loss": 0.7391, + "step": 6715 + }, + { + "epoch": 0.5526434889940341, + "grad_norm": 2.432799049293871, + "learning_rate": 8.783643401716527e-06, + "loss": 0.7077, + "step": 6716 + }, + { + "epoch": 0.5527257765891792, + "grad_norm": 0.4436116016678454, + "learning_rate": 8.780997932309539e-06, + "loss": 0.5286, + "step": 6717 + }, + { + "epoch": 0.5528080641843243, + "grad_norm": 2.847150360971711, + "learning_rate": 8.778352549498504e-06, + "loss": 0.721, + "step": 6718 + }, + { + "epoch": 0.5528903517794692, + "grad_norm": 3.967660034292338, + "learning_rate": 8.775707253471356e-06, + "loss": 0.7393, + "step": 6719 + }, + { + "epoch": 0.5529726393746143, + "grad_norm": 2.1668862475243684, + "learning_rate": 8.773062044416e-06, + "loss": 0.7239, + "step": 6720 + }, + { + "epoch": 0.5530549269697593, + "grad_norm": 2.66664910929128, + "learning_rate": 8.770416922520361e-06, + "loss": 0.7507, + "step": 6721 + }, + { + "epoch": 0.5531372145649044, + "grad_norm": 3.500117237118635, + "learning_rate": 8.767771887972333e-06, + "loss": 0.7379, + "step": 6722 + }, + { + "epoch": 0.5532195021600493, + "grad_norm": 1.9820726306426333, + "learning_rate": 8.765126940959822e-06, + "loss": 0.7199, + "step": 6723 + }, + { + "epoch": 0.5533017897551944, + "grad_norm": 2.677940462075145, + "learning_rate": 8.762482081670714e-06, + "loss": 0.7306, + "step": 6724 + }, + { + "epoch": 0.5533840773503395, + "grad_norm": 2.3529054871612516, + "learning_rate": 8.759837310292904e-06, + "loss": 0.7573, + "step": 6725 + }, + { + "epoch": 0.5534663649454845, + "grad_norm": 2.717313086495739, + "learning_rate": 8.757192627014267e-06, + "loss": 0.7277, + "step": 6726 + }, + { + "epoch": 0.5535486525406295, + "grad_norm": 2.334195305886769, + "learning_rate": 8.754548032022679e-06, + "loss": 0.7409, + "step": 6727 + }, + { + "epoch": 0.5536309401357745, + "grad_norm": 6.428642555554321, + "learning_rate": 8.751903525506005e-06, + "loss": 0.7678, + "step": 6728 + }, + { + "epoch": 0.5537132277309196, + "grad_norm": 0.4086235410867775, + "learning_rate": 8.74925910765211e-06, + "loss": 0.4948, + "step": 6729 + }, + { + "epoch": 0.5537955153260646, + "grad_norm": 3.2476653139914777, + "learning_rate": 8.746614778648844e-06, + "loss": 0.7473, + "step": 6730 + }, + { + "epoch": 0.5538778029212096, + "grad_norm": 2.256060333817364, + "learning_rate": 8.743970538684065e-06, + "loss": 0.7453, + "step": 6731 + }, + { + "epoch": 0.5539600905163546, + "grad_norm": 2.70939993923931, + "learning_rate": 8.741326387945606e-06, + "loss": 0.7596, + "step": 6732 + }, + { + "epoch": 0.5540423781114997, + "grad_norm": 2.9249169121422898, + "learning_rate": 8.738682326621311e-06, + "loss": 0.755, + "step": 6733 + }, + { + "epoch": 0.5541246657066448, + "grad_norm": 2.2888634829425207, + "learning_rate": 8.736038354899005e-06, + "loss": 0.7324, + "step": 6734 + }, + { + "epoch": 0.5542069533017897, + "grad_norm": 2.896434231519029, + "learning_rate": 8.733394472966513e-06, + "loss": 0.742, + "step": 6735 + }, + { + "epoch": 0.5542892408969348, + "grad_norm": 2.1266001136963495, + "learning_rate": 8.73075068101165e-06, + "loss": 0.6978, + "step": 6736 + }, + { + "epoch": 0.5543715284920798, + "grad_norm": 6.532473672691437, + "learning_rate": 8.72810697922223e-06, + "loss": 0.7402, + "step": 6737 + }, + { + "epoch": 0.5544538160872249, + "grad_norm": 2.078006280857085, + "learning_rate": 8.725463367786056e-06, + "loss": 0.7361, + "step": 6738 + }, + { + "epoch": 0.5545361036823698, + "grad_norm": 3.6565363792284624, + "learning_rate": 8.722819846890928e-06, + "loss": 0.7313, + "step": 6739 + }, + { + "epoch": 0.5546183912775149, + "grad_norm": 2.5964507840454347, + "learning_rate": 8.720176416724634e-06, + "loss": 0.7587, + "step": 6740 + }, + { + "epoch": 0.55470067887266, + "grad_norm": 3.6663395044366855, + "learning_rate": 8.717533077474962e-06, + "loss": 0.7011, + "step": 6741 + }, + { + "epoch": 0.554782966467805, + "grad_norm": 2.3061764526299986, + "learning_rate": 8.714889829329684e-06, + "loss": 0.7171, + "step": 6742 + }, + { + "epoch": 0.5548652540629501, + "grad_norm": 3.029299583619761, + "learning_rate": 8.71224667247658e-06, + "loss": 0.7447, + "step": 6743 + }, + { + "epoch": 0.554947541658095, + "grad_norm": 3.6145609904195575, + "learning_rate": 8.709603607103416e-06, + "loss": 0.7329, + "step": 6744 + }, + { + "epoch": 0.5550298292532401, + "grad_norm": 2.674233126644895, + "learning_rate": 8.706960633397947e-06, + "loss": 0.7353, + "step": 6745 + }, + { + "epoch": 0.5551121168483851, + "grad_norm": 2.6545820957469597, + "learning_rate": 8.704317751547927e-06, + "loss": 0.7157, + "step": 6746 + }, + { + "epoch": 0.5551944044435302, + "grad_norm": 2.7989331163802498, + "learning_rate": 8.701674961741097e-06, + "loss": 0.7453, + "step": 6747 + }, + { + "epoch": 0.5552766920386751, + "grad_norm": 2.696516737655921, + "learning_rate": 8.699032264165206e-06, + "loss": 0.742, + "step": 6748 + }, + { + "epoch": 0.5553589796338202, + "grad_norm": 2.5329744684330002, + "learning_rate": 8.696389659007981e-06, + "loss": 0.7642, + "step": 6749 + }, + { + "epoch": 0.5554412672289653, + "grad_norm": 2.113801361969796, + "learning_rate": 8.693747146457151e-06, + "loss": 0.7243, + "step": 6750 + }, + { + "epoch": 0.5555235548241103, + "grad_norm": 0.44628312915978624, + "learning_rate": 8.691104726700433e-06, + "loss": 0.5156, + "step": 6751 + }, + { + "epoch": 0.5556058424192553, + "grad_norm": 2.2516121630679438, + "learning_rate": 8.688462399925545e-06, + "loss": 0.749, + "step": 6752 + }, + { + "epoch": 0.5556881300144003, + "grad_norm": 2.8755072670545525, + "learning_rate": 8.685820166320185e-06, + "loss": 0.7013, + "step": 6753 + }, + { + "epoch": 0.5557704176095454, + "grad_norm": 2.691739518361505, + "learning_rate": 8.683178026072064e-06, + "loss": 0.7346, + "step": 6754 + }, + { + "epoch": 0.5558527052046904, + "grad_norm": 2.176907854931837, + "learning_rate": 8.680535979368867e-06, + "loss": 0.7374, + "step": 6755 + }, + { + "epoch": 0.5559349927998354, + "grad_norm": 0.4075688832983528, + "learning_rate": 8.677894026398286e-06, + "loss": 0.4865, + "step": 6756 + }, + { + "epoch": 0.5560172803949804, + "grad_norm": 2.152714971874688, + "learning_rate": 8.675252167347998e-06, + "loss": 0.721, + "step": 6757 + }, + { + "epoch": 0.5560995679901255, + "grad_norm": 2.4587947042385676, + "learning_rate": 8.672610402405678e-06, + "loss": 0.7571, + "step": 6758 + }, + { + "epoch": 0.5561818555852706, + "grad_norm": 3.3825229708143896, + "learning_rate": 8.669968731758989e-06, + "loss": 0.7432, + "step": 6759 + }, + { + "epoch": 0.5562641431804155, + "grad_norm": 2.525577056478877, + "learning_rate": 8.6673271555956e-06, + "loss": 0.7359, + "step": 6760 + }, + { + "epoch": 0.5563464307755606, + "grad_norm": 3.556408458476164, + "learning_rate": 8.664685674103152e-06, + "loss": 0.7406, + "step": 6761 + }, + { + "epoch": 0.5564287183707056, + "grad_norm": 3.6780843063983046, + "learning_rate": 8.662044287469304e-06, + "loss": 0.7201, + "step": 6762 + }, + { + "epoch": 0.5565110059658507, + "grad_norm": 3.0134653935944278, + "learning_rate": 8.659402995881685e-06, + "loss": 0.7413, + "step": 6763 + }, + { + "epoch": 0.5565932935609956, + "grad_norm": 0.43597842441156226, + "learning_rate": 8.656761799527938e-06, + "loss": 0.4938, + "step": 6764 + }, + { + "epoch": 0.5566755811561407, + "grad_norm": 2.2299944081226752, + "learning_rate": 8.654120698595679e-06, + "loss": 0.7498, + "step": 6765 + }, + { + "epoch": 0.5567578687512857, + "grad_norm": 2.262417795504359, + "learning_rate": 8.651479693272541e-06, + "loss": 0.7618, + "step": 6766 + }, + { + "epoch": 0.5568401563464308, + "grad_norm": 2.717943032532629, + "learning_rate": 8.648838783746122e-06, + "loss": 0.7277, + "step": 6767 + }, + { + "epoch": 0.5569224439415759, + "grad_norm": 2.3087083026216066, + "learning_rate": 8.646197970204037e-06, + "loss": 0.7412, + "step": 6768 + }, + { + "epoch": 0.5570047315367208, + "grad_norm": 0.4098492067652577, + "learning_rate": 8.643557252833884e-06, + "loss": 0.4864, + "step": 6769 + }, + { + "epoch": 0.5570870191318659, + "grad_norm": 2.2201732062081567, + "learning_rate": 8.640916631823255e-06, + "loss": 0.7405, + "step": 6770 + }, + { + "epoch": 0.5571693067270109, + "grad_norm": 2.3541021035383, + "learning_rate": 8.638276107359733e-06, + "loss": 0.7444, + "step": 6771 + }, + { + "epoch": 0.557251594322156, + "grad_norm": 2.365683459662189, + "learning_rate": 8.6356356796309e-06, + "loss": 0.7523, + "step": 6772 + }, + { + "epoch": 0.5573338819173009, + "grad_norm": 2.4941227277582736, + "learning_rate": 8.632995348824324e-06, + "loss": 0.7443, + "step": 6773 + }, + { + "epoch": 0.557416169512446, + "grad_norm": 2.052573733023068, + "learning_rate": 8.630355115127576e-06, + "loss": 0.7424, + "step": 6774 + }, + { + "epoch": 0.557498457107591, + "grad_norm": 2.9005931753428467, + "learning_rate": 8.627714978728207e-06, + "loss": 0.7238, + "step": 6775 + }, + { + "epoch": 0.5575807447027361, + "grad_norm": 2.1827185446762134, + "learning_rate": 8.625074939813774e-06, + "loss": 0.7337, + "step": 6776 + }, + { + "epoch": 0.5576630322978811, + "grad_norm": 2.4577780425423112, + "learning_rate": 8.622434998571816e-06, + "loss": 0.7722, + "step": 6777 + }, + { + "epoch": 0.5577453198930261, + "grad_norm": 2.110901496314898, + "learning_rate": 8.619795155189875e-06, + "loss": 0.741, + "step": 6778 + }, + { + "epoch": 0.5578276074881712, + "grad_norm": 0.43141133404224874, + "learning_rate": 8.617155409855475e-06, + "loss": 0.4741, + "step": 6779 + }, + { + "epoch": 0.5579098950833162, + "grad_norm": 3.141497359681966, + "learning_rate": 8.614515762756147e-06, + "loss": 0.744, + "step": 6780 + }, + { + "epoch": 0.5579921826784612, + "grad_norm": 2.257198380084299, + "learning_rate": 8.6118762140794e-06, + "loss": 0.7298, + "step": 6781 + }, + { + "epoch": 0.5580744702736062, + "grad_norm": 0.41834455518730845, + "learning_rate": 8.60923676401275e-06, + "loss": 0.5106, + "step": 6782 + }, + { + "epoch": 0.5581567578687513, + "grad_norm": 2.28883420680071, + "learning_rate": 8.606597412743695e-06, + "loss": 0.7828, + "step": 6783 + }, + { + "epoch": 0.5582390454638964, + "grad_norm": 3.0884991318819717, + "learning_rate": 8.603958160459732e-06, + "loss": 0.7312, + "step": 6784 + }, + { + "epoch": 0.5583213330590413, + "grad_norm": 2.8722107939381405, + "learning_rate": 8.601319007348345e-06, + "loss": 0.749, + "step": 6785 + }, + { + "epoch": 0.5584036206541864, + "grad_norm": 2.232279421679622, + "learning_rate": 8.598679953597023e-06, + "loss": 0.7667, + "step": 6786 + }, + { + "epoch": 0.5584859082493314, + "grad_norm": 1.9377349735032967, + "learning_rate": 8.596040999393234e-06, + "loss": 0.7522, + "step": 6787 + }, + { + "epoch": 0.5585681958444765, + "grad_norm": 3.7172130389226448, + "learning_rate": 8.593402144924449e-06, + "loss": 0.744, + "step": 6788 + }, + { + "epoch": 0.5586504834396214, + "grad_norm": 0.42766813159548495, + "learning_rate": 8.590763390378125e-06, + "loss": 0.5129, + "step": 6789 + }, + { + "epoch": 0.5587327710347665, + "grad_norm": 1.976633757693294, + "learning_rate": 8.588124735941716e-06, + "loss": 0.7233, + "step": 6790 + }, + { + "epoch": 0.5588150586299115, + "grad_norm": 2.1821301389542698, + "learning_rate": 8.585486181802665e-06, + "loss": 0.7144, + "step": 6791 + }, + { + "epoch": 0.5588973462250566, + "grad_norm": 2.038372777894107, + "learning_rate": 8.58284772814842e-06, + "loss": 0.7542, + "step": 6792 + }, + { + "epoch": 0.5589796338202015, + "grad_norm": 2.6420777569604246, + "learning_rate": 8.580209375166399e-06, + "loss": 0.7243, + "step": 6793 + }, + { + "epoch": 0.5590619214153466, + "grad_norm": 3.1795697237182705, + "learning_rate": 8.57757112304404e-06, + "loss": 0.7466, + "step": 6794 + }, + { + "epoch": 0.5591442090104917, + "grad_norm": 0.4107233698330363, + "learning_rate": 8.574932971968747e-06, + "loss": 0.5029, + "step": 6795 + }, + { + "epoch": 0.5592264966056367, + "grad_norm": 2.278566845547665, + "learning_rate": 8.572294922127942e-06, + "loss": 0.7152, + "step": 6796 + }, + { + "epoch": 0.5593087842007818, + "grad_norm": 2.3433575733968324, + "learning_rate": 8.569656973709018e-06, + "loss": 0.7557, + "step": 6797 + }, + { + "epoch": 0.5593910717959267, + "grad_norm": 2.9840935110950366, + "learning_rate": 8.567019126899381e-06, + "loss": 0.7293, + "step": 6798 + }, + { + "epoch": 0.5594733593910718, + "grad_norm": 2.976988367968953, + "learning_rate": 8.564381381886407e-06, + "loss": 0.7439, + "step": 6799 + }, + { + "epoch": 0.5595556469862168, + "grad_norm": 2.288441256934263, + "learning_rate": 8.561743738857488e-06, + "loss": 0.7474, + "step": 6800 + }, + { + "epoch": 0.5596379345813619, + "grad_norm": 2.84586169356329, + "learning_rate": 8.559106197999991e-06, + "loss": 0.7633, + "step": 6801 + }, + { + "epoch": 0.5597202221765069, + "grad_norm": 2.9268895115531977, + "learning_rate": 8.556468759501287e-06, + "loss": 0.7323, + "step": 6802 + }, + { + "epoch": 0.5598025097716519, + "grad_norm": 2.014744921547367, + "learning_rate": 8.553831423548733e-06, + "loss": 0.7037, + "step": 6803 + }, + { + "epoch": 0.559884797366797, + "grad_norm": 2.85512258094743, + "learning_rate": 8.551194190329683e-06, + "loss": 0.7631, + "step": 6804 + }, + { + "epoch": 0.559967084961942, + "grad_norm": 0.437084183936253, + "learning_rate": 8.548557060031477e-06, + "loss": 0.5004, + "step": 6805 + }, + { + "epoch": 0.560049372557087, + "grad_norm": 0.416560188263507, + "learning_rate": 8.54592003284146e-06, + "loss": 0.5022, + "step": 6806 + }, + { + "epoch": 0.560131660152232, + "grad_norm": 0.41157099696289046, + "learning_rate": 8.543283108946958e-06, + "loss": 0.4922, + "step": 6807 + }, + { + "epoch": 0.5602139477473771, + "grad_norm": 2.333016581444677, + "learning_rate": 8.540646288535295e-06, + "loss": 0.7281, + "step": 6808 + }, + { + "epoch": 0.5602962353425222, + "grad_norm": 2.1048835630355063, + "learning_rate": 8.538009571793784e-06, + "loss": 0.7281, + "step": 6809 + }, + { + "epoch": 0.5603785229376671, + "grad_norm": 2.4322175381083446, + "learning_rate": 8.535372958909736e-06, + "loss": 0.7658, + "step": 6810 + }, + { + "epoch": 0.5604608105328122, + "grad_norm": 2.2827719362886993, + "learning_rate": 8.532736450070447e-06, + "loss": 0.7206, + "step": 6811 + }, + { + "epoch": 0.5605430981279572, + "grad_norm": 3.720591344474088, + "learning_rate": 8.53010004546322e-06, + "loss": 0.7174, + "step": 6812 + }, + { + "epoch": 0.5606253857231023, + "grad_norm": 0.40483908545101094, + "learning_rate": 8.52746374527533e-06, + "loss": 0.4818, + "step": 6813 + }, + { + "epoch": 0.5607076733182472, + "grad_norm": 2.4411990675533497, + "learning_rate": 8.524827549694063e-06, + "loss": 0.7326, + "step": 6814 + }, + { + "epoch": 0.5607899609133923, + "grad_norm": 2.7415247057133842, + "learning_rate": 8.522191458906687e-06, + "loss": 0.741, + "step": 6815 + }, + { + "epoch": 0.5608722485085373, + "grad_norm": 2.6378795207727253, + "learning_rate": 8.519555473100469e-06, + "loss": 0.7069, + "step": 6816 + }, + { + "epoch": 0.5609545361036824, + "grad_norm": 2.412651236518836, + "learning_rate": 8.516919592462657e-06, + "loss": 0.7266, + "step": 6817 + }, + { + "epoch": 0.5610368236988273, + "grad_norm": 2.9921057923206544, + "learning_rate": 8.51428381718051e-06, + "loss": 0.7334, + "step": 6818 + }, + { + "epoch": 0.5611191112939724, + "grad_norm": 2.4183429906996152, + "learning_rate": 8.511648147441263e-06, + "loss": 0.7519, + "step": 6819 + }, + { + "epoch": 0.5612013988891175, + "grad_norm": 2.456485063446469, + "learning_rate": 8.509012583432153e-06, + "loss": 0.7456, + "step": 6820 + }, + { + "epoch": 0.5612836864842625, + "grad_norm": 3.13542718732077, + "learning_rate": 8.506377125340401e-06, + "loss": 0.7347, + "step": 6821 + }, + { + "epoch": 0.5613659740794076, + "grad_norm": 2.3508058366395406, + "learning_rate": 8.503741773353234e-06, + "loss": 0.7214, + "step": 6822 + }, + { + "epoch": 0.5614482616745525, + "grad_norm": 2.7999311361504535, + "learning_rate": 8.501106527657852e-06, + "loss": 0.7458, + "step": 6823 + }, + { + "epoch": 0.5615305492696976, + "grad_norm": 2.497999233348141, + "learning_rate": 8.498471388441472e-06, + "loss": 0.7666, + "step": 6824 + }, + { + "epoch": 0.5616128368648426, + "grad_norm": 0.4229876442007767, + "learning_rate": 8.495836355891278e-06, + "loss": 0.4729, + "step": 6825 + }, + { + "epoch": 0.5616951244599877, + "grad_norm": 2.6816198555549122, + "learning_rate": 8.493201430194467e-06, + "loss": 0.7542, + "step": 6826 + }, + { + "epoch": 0.5617774120551327, + "grad_norm": 4.373315072484634, + "learning_rate": 8.490566611538216e-06, + "loss": 0.7179, + "step": 6827 + }, + { + "epoch": 0.5618596996502777, + "grad_norm": 0.4225494724134802, + "learning_rate": 8.487931900109699e-06, + "loss": 0.5144, + "step": 6828 + }, + { + "epoch": 0.5619419872454228, + "grad_norm": 2.686846127838823, + "learning_rate": 8.48529729609608e-06, + "loss": 0.7812, + "step": 6829 + }, + { + "epoch": 0.5620242748405678, + "grad_norm": 2.15853324223056, + "learning_rate": 8.482662799684522e-06, + "loss": 0.7089, + "step": 6830 + }, + { + "epoch": 0.5621065624357128, + "grad_norm": 2.6746801423048643, + "learning_rate": 8.480028411062167e-06, + "loss": 0.7279, + "step": 6831 + }, + { + "epoch": 0.5621888500308578, + "grad_norm": 3.272551030992251, + "learning_rate": 8.477394130416168e-06, + "loss": 0.7181, + "step": 6832 + }, + { + "epoch": 0.5622711376260029, + "grad_norm": 2.5913594548415557, + "learning_rate": 8.474759957933651e-06, + "loss": 0.7401, + "step": 6833 + }, + { + "epoch": 0.562353425221148, + "grad_norm": 2.1371761340001076, + "learning_rate": 8.472125893801751e-06, + "loss": 0.7391, + "step": 6834 + }, + { + "epoch": 0.5624357128162929, + "grad_norm": 2.774311141894095, + "learning_rate": 8.469491938207578e-06, + "loss": 0.7499, + "step": 6835 + }, + { + "epoch": 0.562518000411438, + "grad_norm": 1.7882571830164804, + "learning_rate": 8.46685809133826e-06, + "loss": 0.7032, + "step": 6836 + }, + { + "epoch": 0.562600288006583, + "grad_norm": 2.2394144318593803, + "learning_rate": 8.464224353380882e-06, + "loss": 0.7443, + "step": 6837 + }, + { + "epoch": 0.5626825756017281, + "grad_norm": 2.1543423967069004, + "learning_rate": 8.461590724522554e-06, + "loss": 0.7476, + "step": 6838 + }, + { + "epoch": 0.562764863196873, + "grad_norm": 2.152916852838681, + "learning_rate": 8.458957204950359e-06, + "loss": 0.7573, + "step": 6839 + }, + { + "epoch": 0.5628471507920181, + "grad_norm": 2.0243808432647774, + "learning_rate": 8.456323794851383e-06, + "loss": 0.7192, + "step": 6840 + }, + { + "epoch": 0.5629294383871631, + "grad_norm": 2.5332965533318816, + "learning_rate": 8.453690494412692e-06, + "loss": 0.7328, + "step": 6841 + }, + { + "epoch": 0.5630117259823082, + "grad_norm": 3.2749224297448647, + "learning_rate": 8.451057303821357e-06, + "loss": 0.7414, + "step": 6842 + }, + { + "epoch": 0.5630940135774531, + "grad_norm": 3.7633674674604385, + "learning_rate": 8.448424223264432e-06, + "loss": 0.7683, + "step": 6843 + }, + { + "epoch": 0.5631763011725982, + "grad_norm": 2.021966477986595, + "learning_rate": 8.445791252928971e-06, + "loss": 0.7343, + "step": 6844 + }, + { + "epoch": 0.5632585887677433, + "grad_norm": 0.4344886904775334, + "learning_rate": 8.443158393002013e-06, + "loss": 0.4875, + "step": 6845 + }, + { + "epoch": 0.5633408763628883, + "grad_norm": 2.198723434293564, + "learning_rate": 8.440525643670594e-06, + "loss": 0.7297, + "step": 6846 + }, + { + "epoch": 0.5634231639580334, + "grad_norm": 2.5265203952734523, + "learning_rate": 8.437893005121737e-06, + "loss": 0.7576, + "step": 6847 + }, + { + "epoch": 0.5635054515531783, + "grad_norm": 2.3955731310271626, + "learning_rate": 8.435260477542467e-06, + "loss": 0.705, + "step": 6848 + }, + { + "epoch": 0.5635877391483234, + "grad_norm": 2.321069077673645, + "learning_rate": 8.432628061119783e-06, + "loss": 0.7345, + "step": 6849 + }, + { + "epoch": 0.5636700267434684, + "grad_norm": 2.5641766447508245, + "learning_rate": 8.4299957560407e-06, + "loss": 0.7599, + "step": 6850 + }, + { + "epoch": 0.5637523143386135, + "grad_norm": 2.4177423419630304, + "learning_rate": 8.427363562492207e-06, + "loss": 0.725, + "step": 6851 + }, + { + "epoch": 0.5638346019337585, + "grad_norm": 2.7912571603408995, + "learning_rate": 8.424731480661293e-06, + "loss": 0.7443, + "step": 6852 + }, + { + "epoch": 0.5639168895289035, + "grad_norm": 3.7692399291251815, + "learning_rate": 8.422099510734934e-06, + "loss": 0.7249, + "step": 6853 + }, + { + "epoch": 0.5639991771240486, + "grad_norm": 0.42289772506184364, + "learning_rate": 8.419467652900103e-06, + "loss": 0.5009, + "step": 6854 + }, + { + "epoch": 0.5640814647191936, + "grad_norm": 3.8320401294801525, + "learning_rate": 8.416835907343758e-06, + "loss": 0.7326, + "step": 6855 + }, + { + "epoch": 0.5641637523143386, + "grad_norm": 3.1205463372600053, + "learning_rate": 8.414204274252862e-06, + "loss": 0.7492, + "step": 6856 + }, + { + "epoch": 0.5642460399094836, + "grad_norm": 2.534664306422425, + "learning_rate": 8.411572753814358e-06, + "loss": 0.7284, + "step": 6857 + }, + { + "epoch": 0.5643283275046287, + "grad_norm": 6.583770043501786, + "learning_rate": 8.408941346215185e-06, + "loss": 0.7423, + "step": 6858 + }, + { + "epoch": 0.5644106150997737, + "grad_norm": 3.2410039362340317, + "learning_rate": 8.406310051642274e-06, + "loss": 0.7193, + "step": 6859 + }, + { + "epoch": 0.5644929026949187, + "grad_norm": 2.133398785499632, + "learning_rate": 8.40367887028255e-06, + "loss": 0.7486, + "step": 6860 + }, + { + "epoch": 0.5645751902900638, + "grad_norm": 2.5275521662700413, + "learning_rate": 8.401047802322921e-06, + "loss": 0.7518, + "step": 6861 + }, + { + "epoch": 0.5646574778852088, + "grad_norm": 0.40719892858714096, + "learning_rate": 8.398416847950307e-06, + "loss": 0.4736, + "step": 6862 + }, + { + "epoch": 0.5647397654803539, + "grad_norm": 2.4639258817949043, + "learning_rate": 8.395786007351592e-06, + "loss": 0.7375, + "step": 6863 + }, + { + "epoch": 0.5648220530754988, + "grad_norm": 2.3908477364337624, + "learning_rate": 8.393155280713676e-06, + "loss": 0.7518, + "step": 6864 + }, + { + "epoch": 0.5649043406706439, + "grad_norm": 2.654205560735612, + "learning_rate": 8.39052466822344e-06, + "loss": 0.7493, + "step": 6865 + }, + { + "epoch": 0.5649866282657889, + "grad_norm": 2.2678112634227583, + "learning_rate": 8.387894170067756e-06, + "loss": 0.7465, + "step": 6866 + }, + { + "epoch": 0.565068915860934, + "grad_norm": 0.4250831715275198, + "learning_rate": 8.38526378643349e-06, + "loss": 0.4985, + "step": 6867 + }, + { + "epoch": 0.5651512034560789, + "grad_norm": 2.6573731541116734, + "learning_rate": 8.382633517507509e-06, + "loss": 0.6984, + "step": 6868 + }, + { + "epoch": 0.565233491051224, + "grad_norm": 4.504731689899669, + "learning_rate": 8.380003363476648e-06, + "loss": 0.7328, + "step": 6869 + }, + { + "epoch": 0.5653157786463691, + "grad_norm": 2.8394259665823265, + "learning_rate": 8.377373324527763e-06, + "loss": 0.7561, + "step": 6870 + }, + { + "epoch": 0.5653980662415141, + "grad_norm": 2.136653316227661, + "learning_rate": 8.37474340084768e-06, + "loss": 0.7142, + "step": 6871 + }, + { + "epoch": 0.5654803538366592, + "grad_norm": 2.149979899961706, + "learning_rate": 8.372113592623228e-06, + "loss": 0.737, + "step": 6872 + }, + { + "epoch": 0.5655626414318041, + "grad_norm": 2.5423945525838385, + "learning_rate": 8.369483900041221e-06, + "loss": 0.7018, + "step": 6873 + }, + { + "epoch": 0.5656449290269492, + "grad_norm": 2.342655076933809, + "learning_rate": 8.366854323288473e-06, + "loss": 0.7088, + "step": 6874 + }, + { + "epoch": 0.5657272166220942, + "grad_norm": 3.382612481413114, + "learning_rate": 8.364224862551776e-06, + "loss": 0.7262, + "step": 6875 + }, + { + "epoch": 0.5658095042172393, + "grad_norm": 3.639024438039282, + "learning_rate": 8.361595518017935e-06, + "loss": 0.7441, + "step": 6876 + }, + { + "epoch": 0.5658917918123842, + "grad_norm": 3.4802540440751866, + "learning_rate": 8.358966289873724e-06, + "loss": 0.7055, + "step": 6877 + }, + { + "epoch": 0.5659740794075293, + "grad_norm": 0.44825496093826545, + "learning_rate": 8.356337178305927e-06, + "loss": 0.478, + "step": 6878 + }, + { + "epoch": 0.5660563670026744, + "grad_norm": 2.6835090757483027, + "learning_rate": 8.353708183501305e-06, + "loss": 0.7466, + "step": 6879 + }, + { + "epoch": 0.5661386545978194, + "grad_norm": 3.024964727043475, + "learning_rate": 8.351079305646624e-06, + "loss": 0.7296, + "step": 6880 + }, + { + "epoch": 0.5662209421929644, + "grad_norm": 2.3184945221807665, + "learning_rate": 8.348450544928628e-06, + "loss": 0.7395, + "step": 6881 + }, + { + "epoch": 0.5663032297881094, + "grad_norm": 2.606015848783252, + "learning_rate": 8.345821901534067e-06, + "loss": 0.7387, + "step": 6882 + }, + { + "epoch": 0.5663855173832545, + "grad_norm": 2.3472993702779004, + "learning_rate": 8.343193375649672e-06, + "loss": 0.7241, + "step": 6883 + }, + { + "epoch": 0.5664678049783995, + "grad_norm": 2.1945785792508805, + "learning_rate": 8.34056496746217e-06, + "loss": 0.722, + "step": 6884 + }, + { + "epoch": 0.5665500925735445, + "grad_norm": 2.2931037783173385, + "learning_rate": 8.337936677158278e-06, + "loss": 0.7356, + "step": 6885 + }, + { + "epoch": 0.5666323801686896, + "grad_norm": 3.0880929240658275, + "learning_rate": 8.335308504924708e-06, + "loss": 0.7322, + "step": 6886 + }, + { + "epoch": 0.5667146677638346, + "grad_norm": 2.8551071483524213, + "learning_rate": 8.332680450948157e-06, + "loss": 0.7187, + "step": 6887 + }, + { + "epoch": 0.5667969553589797, + "grad_norm": 2.696062498701033, + "learning_rate": 8.330052515415324e-06, + "loss": 0.7188, + "step": 6888 + }, + { + "epoch": 0.5668792429541246, + "grad_norm": 3.9567867112782746, + "learning_rate": 8.327424698512886e-06, + "loss": 0.7571, + "step": 6889 + }, + { + "epoch": 0.5669615305492697, + "grad_norm": 2.416164130792423, + "learning_rate": 8.324797000427525e-06, + "loss": 0.758, + "step": 6890 + }, + { + "epoch": 0.5670438181444147, + "grad_norm": 2.2763908543837856, + "learning_rate": 8.322169421345904e-06, + "loss": 0.7265, + "step": 6891 + }, + { + "epoch": 0.5671261057395598, + "grad_norm": 0.443205177543366, + "learning_rate": 8.319541961454687e-06, + "loss": 0.4972, + "step": 6892 + }, + { + "epoch": 0.5672083933347047, + "grad_norm": 3.5424943005549485, + "learning_rate": 8.316914620940516e-06, + "loss": 0.7198, + "step": 6893 + }, + { + "epoch": 0.5672906809298498, + "grad_norm": 0.40925698635786806, + "learning_rate": 8.314287399990046e-06, + "loss": 0.4902, + "step": 6894 + }, + { + "epoch": 0.5673729685249949, + "grad_norm": 0.4032211485545162, + "learning_rate": 8.311660298789896e-06, + "loss": 0.4652, + "step": 6895 + }, + { + "epoch": 0.5674552561201399, + "grad_norm": 2.216676935442759, + "learning_rate": 8.309033317526704e-06, + "loss": 0.7061, + "step": 6896 + }, + { + "epoch": 0.567537543715285, + "grad_norm": 2.4111153587977667, + "learning_rate": 8.306406456387077e-06, + "loss": 0.7463, + "step": 6897 + }, + { + "epoch": 0.5676198313104299, + "grad_norm": 3.161461789537958, + "learning_rate": 8.30377971555763e-06, + "loss": 0.746, + "step": 6898 + }, + { + "epoch": 0.567702118905575, + "grad_norm": 2.8940749084119184, + "learning_rate": 8.301153095224956e-06, + "loss": 0.71, + "step": 6899 + }, + { + "epoch": 0.56778440650072, + "grad_norm": 3.640654892963235, + "learning_rate": 8.298526595575653e-06, + "loss": 0.7311, + "step": 6900 + }, + { + "epoch": 0.5678666940958651, + "grad_norm": 3.0039687064091964, + "learning_rate": 8.295900216796295e-06, + "loss": 0.7493, + "step": 6901 + }, + { + "epoch": 0.56794898169101, + "grad_norm": 2.584933413503265, + "learning_rate": 8.293273959073465e-06, + "loss": 0.7347, + "step": 6902 + }, + { + "epoch": 0.5680312692861551, + "grad_norm": 2.53198327618415, + "learning_rate": 8.29064782259372e-06, + "loss": 0.736, + "step": 6903 + }, + { + "epoch": 0.5681135568813002, + "grad_norm": 2.7058110382334437, + "learning_rate": 8.288021807543622e-06, + "loss": 0.7022, + "step": 6904 + }, + { + "epoch": 0.5681958444764452, + "grad_norm": 2.5429433663720507, + "learning_rate": 8.285395914109713e-06, + "loss": 0.715, + "step": 6905 + }, + { + "epoch": 0.5682781320715902, + "grad_norm": 2.8935383946812214, + "learning_rate": 8.282770142478542e-06, + "loss": 0.7481, + "step": 6906 + }, + { + "epoch": 0.5683604196667352, + "grad_norm": 3.097055327530295, + "learning_rate": 8.280144492836628e-06, + "loss": 0.7409, + "step": 6907 + }, + { + "epoch": 0.5684427072618803, + "grad_norm": 2.888717600900299, + "learning_rate": 8.277518965370503e-06, + "loss": 0.7416, + "step": 6908 + }, + { + "epoch": 0.5685249948570253, + "grad_norm": 2.368485703254481, + "learning_rate": 8.274893560266675e-06, + "loss": 0.7616, + "step": 6909 + }, + { + "epoch": 0.5686072824521703, + "grad_norm": 2.436790621759486, + "learning_rate": 8.27226827771165e-06, + "loss": 0.738, + "step": 6910 + }, + { + "epoch": 0.5686895700473154, + "grad_norm": 2.3150538527092075, + "learning_rate": 8.269643117891921e-06, + "loss": 0.7331, + "step": 6911 + }, + { + "epoch": 0.5687718576424604, + "grad_norm": 0.42249017494820124, + "learning_rate": 8.267018080993983e-06, + "loss": 0.4584, + "step": 6912 + }, + { + "epoch": 0.5688541452376055, + "grad_norm": 0.44154488915041695, + "learning_rate": 8.264393167204301e-06, + "loss": 0.5124, + "step": 6913 + }, + { + "epoch": 0.5689364328327504, + "grad_norm": 2.398016491695682, + "learning_rate": 8.261768376709359e-06, + "loss": 0.7486, + "step": 6914 + }, + { + "epoch": 0.5690187204278955, + "grad_norm": 3.1403107278701983, + "learning_rate": 8.25914370969561e-06, + "loss": 0.7258, + "step": 6915 + }, + { + "epoch": 0.5691010080230405, + "grad_norm": 2.773734346693096, + "learning_rate": 8.256519166349509e-06, + "loss": 0.7475, + "step": 6916 + }, + { + "epoch": 0.5691832956181856, + "grad_norm": 2.2933208576750235, + "learning_rate": 8.253894746857496e-06, + "loss": 0.7384, + "step": 6917 + }, + { + "epoch": 0.5692655832133305, + "grad_norm": 0.433408409732961, + "learning_rate": 8.251270451406009e-06, + "loss": 0.5094, + "step": 6918 + }, + { + "epoch": 0.5693478708084756, + "grad_norm": 2.923201603791523, + "learning_rate": 8.248646280181469e-06, + "loss": 0.7369, + "step": 6919 + }, + { + "epoch": 0.5694301584036207, + "grad_norm": 3.436774155419867, + "learning_rate": 8.246022233370298e-06, + "loss": 0.7323, + "step": 6920 + }, + { + "epoch": 0.5695124459987657, + "grad_norm": 2.236093723048941, + "learning_rate": 8.243398311158904e-06, + "loss": 0.7502, + "step": 6921 + }, + { + "epoch": 0.5695947335939107, + "grad_norm": 2.8104782729871887, + "learning_rate": 8.240774513733687e-06, + "loss": 0.7464, + "step": 6922 + }, + { + "epoch": 0.5696770211890557, + "grad_norm": 3.379466683957598, + "learning_rate": 8.238150841281031e-06, + "loss": 0.738, + "step": 6923 + }, + { + "epoch": 0.5697593087842008, + "grad_norm": 2.3612768367972383, + "learning_rate": 8.235527293987324e-06, + "loss": 0.715, + "step": 6924 + }, + { + "epoch": 0.5698415963793458, + "grad_norm": 3.3397786248006387, + "learning_rate": 8.232903872038932e-06, + "loss": 0.7314, + "step": 6925 + }, + { + "epoch": 0.5699238839744909, + "grad_norm": 3.5267476938973337, + "learning_rate": 8.230280575622227e-06, + "loss": 0.7305, + "step": 6926 + }, + { + "epoch": 0.5700061715696358, + "grad_norm": 2.9224753630377247, + "learning_rate": 8.22765740492356e-06, + "loss": 0.722, + "step": 6927 + }, + { + "epoch": 0.5700884591647809, + "grad_norm": 2.453787657257346, + "learning_rate": 8.225034360129274e-06, + "loss": 0.7401, + "step": 6928 + }, + { + "epoch": 0.570170746759926, + "grad_norm": 2.680692460105198, + "learning_rate": 8.222411441425711e-06, + "loss": 0.7434, + "step": 6929 + }, + { + "epoch": 0.570253034355071, + "grad_norm": 3.1001552585484085, + "learning_rate": 8.219788648999195e-06, + "loss": 0.7286, + "step": 6930 + }, + { + "epoch": 0.570335321950216, + "grad_norm": 0.41626708300244847, + "learning_rate": 8.217165983036044e-06, + "loss": 0.4934, + "step": 6931 + }, + { + "epoch": 0.570417609545361, + "grad_norm": 6.328992696186804, + "learning_rate": 8.214543443722575e-06, + "loss": 0.7624, + "step": 6932 + }, + { + "epoch": 0.5704998971405061, + "grad_norm": 2.9080997216704167, + "learning_rate": 8.21192103124508e-06, + "loss": 0.7356, + "step": 6933 + }, + { + "epoch": 0.5705821847356511, + "grad_norm": 2.6685657687523032, + "learning_rate": 8.20929874578986e-06, + "loss": 0.7283, + "step": 6934 + }, + { + "epoch": 0.5706644723307961, + "grad_norm": 3.5327489406046793, + "learning_rate": 8.206676587543189e-06, + "loss": 0.7409, + "step": 6935 + }, + { + "epoch": 0.5707467599259411, + "grad_norm": 3.1265733127284374, + "learning_rate": 8.204054556691343e-06, + "loss": 0.7293, + "step": 6936 + }, + { + "epoch": 0.5708290475210862, + "grad_norm": 2.607202534383765, + "learning_rate": 8.201432653420594e-06, + "loss": 0.7682, + "step": 6937 + }, + { + "epoch": 0.5709113351162313, + "grad_norm": 0.40244099868974376, + "learning_rate": 8.19881087791719e-06, + "loss": 0.4581, + "step": 6938 + }, + { + "epoch": 0.5709936227113762, + "grad_norm": 0.4539483103834783, + "learning_rate": 8.196189230367386e-06, + "loss": 0.5092, + "step": 6939 + }, + { + "epoch": 0.5710759103065213, + "grad_norm": 3.8912600232472054, + "learning_rate": 8.19356771095741e-06, + "loss": 0.7493, + "step": 6940 + }, + { + "epoch": 0.5711581979016663, + "grad_norm": 4.186058233351514, + "learning_rate": 8.190946319873497e-06, + "loss": 0.7475, + "step": 6941 + }, + { + "epoch": 0.5712404854968114, + "grad_norm": 0.46306845921809064, + "learning_rate": 8.18832505730186e-06, + "loss": 0.5346, + "step": 6942 + }, + { + "epoch": 0.5713227730919563, + "grad_norm": 2.48811830071205, + "learning_rate": 8.18570392342872e-06, + "loss": 0.7359, + "step": 6943 + }, + { + "epoch": 0.5714050606871014, + "grad_norm": 2.347570133609858, + "learning_rate": 8.183082918440266e-06, + "loss": 0.7408, + "step": 6944 + }, + { + "epoch": 0.5714873482822465, + "grad_norm": 2.901138223752057, + "learning_rate": 8.180462042522699e-06, + "loss": 0.7366, + "step": 6945 + }, + { + "epoch": 0.5715696358773915, + "grad_norm": 2.8130365984734826, + "learning_rate": 8.177841295862197e-06, + "loss": 0.7332, + "step": 6946 + }, + { + "epoch": 0.5716519234725365, + "grad_norm": 3.778883682095684, + "learning_rate": 8.175220678644938e-06, + "loss": 0.7216, + "step": 6947 + }, + { + "epoch": 0.5717342110676815, + "grad_norm": 0.41704043858524253, + "learning_rate": 8.172600191057077e-06, + "loss": 0.4945, + "step": 6948 + }, + { + "epoch": 0.5718164986628266, + "grad_norm": 3.4852906441048788, + "learning_rate": 8.169979833284785e-06, + "loss": 0.7242, + "step": 6949 + }, + { + "epoch": 0.5718987862579716, + "grad_norm": 3.34899450652653, + "learning_rate": 8.167359605514191e-06, + "loss": 0.7676, + "step": 6950 + }, + { + "epoch": 0.5719810738531167, + "grad_norm": 3.086601319633245, + "learning_rate": 8.164739507931445e-06, + "loss": 0.7344, + "step": 6951 + }, + { + "epoch": 0.5720633614482616, + "grad_norm": 0.4241039514928082, + "learning_rate": 8.162119540722667e-06, + "loss": 0.4866, + "step": 6952 + }, + { + "epoch": 0.5721456490434067, + "grad_norm": 2.53960366326396, + "learning_rate": 8.159499704073978e-06, + "loss": 0.7256, + "step": 6953 + }, + { + "epoch": 0.5722279366385518, + "grad_norm": 2.556841084273822, + "learning_rate": 8.156879998171486e-06, + "loss": 0.7296, + "step": 6954 + }, + { + "epoch": 0.5723102242336968, + "grad_norm": 2.6552187744235063, + "learning_rate": 8.154260423201294e-06, + "loss": 0.7162, + "step": 6955 + }, + { + "epoch": 0.5723925118288418, + "grad_norm": 2.6544002471713313, + "learning_rate": 8.151640979349485e-06, + "loss": 0.7547, + "step": 6956 + }, + { + "epoch": 0.5724747994239868, + "grad_norm": 2.1518948090755114, + "learning_rate": 8.149021666802148e-06, + "loss": 0.758, + "step": 6957 + }, + { + "epoch": 0.5725570870191319, + "grad_norm": 2.584096837736162, + "learning_rate": 8.146402485745351e-06, + "loss": 0.7175, + "step": 6958 + }, + { + "epoch": 0.5726393746142769, + "grad_norm": 4.050921344145484, + "learning_rate": 8.143783436365159e-06, + "loss": 0.7225, + "step": 6959 + }, + { + "epoch": 0.5727216622094219, + "grad_norm": 2.1156566668027104, + "learning_rate": 8.14116451884762e-06, + "loss": 0.7166, + "step": 6960 + }, + { + "epoch": 0.572803949804567, + "grad_norm": 2.3400082512139373, + "learning_rate": 8.138545733378783e-06, + "loss": 0.7329, + "step": 6961 + }, + { + "epoch": 0.572886237399712, + "grad_norm": 2.5354105471965345, + "learning_rate": 8.135927080144675e-06, + "loss": 0.7432, + "step": 6962 + }, + { + "epoch": 0.5729685249948571, + "grad_norm": 0.3926937787197345, + "learning_rate": 8.133308559331331e-06, + "loss": 0.4599, + "step": 6963 + }, + { + "epoch": 0.573050812590002, + "grad_norm": 0.4323973379141318, + "learning_rate": 8.130690171124763e-06, + "loss": 0.5165, + "step": 6964 + }, + { + "epoch": 0.5731331001851471, + "grad_norm": 0.4272145159986249, + "learning_rate": 8.128071915710973e-06, + "loss": 0.524, + "step": 6965 + }, + { + "epoch": 0.5732153877802921, + "grad_norm": 2.2982201562676243, + "learning_rate": 8.12545379327596e-06, + "loss": 0.7097, + "step": 6966 + }, + { + "epoch": 0.5732976753754372, + "grad_norm": 3.07187696247268, + "learning_rate": 8.122835804005713e-06, + "loss": 0.7366, + "step": 6967 + }, + { + "epoch": 0.5733799629705821, + "grad_norm": 0.4466184311432639, + "learning_rate": 8.120217948086206e-06, + "loss": 0.4916, + "step": 6968 + }, + { + "epoch": 0.5734622505657272, + "grad_norm": 2.4982311040338785, + "learning_rate": 8.117600225703413e-06, + "loss": 0.7526, + "step": 6969 + }, + { + "epoch": 0.5735445381608723, + "grad_norm": 2.1931046381815933, + "learning_rate": 8.114982637043288e-06, + "loss": 0.7068, + "step": 6970 + }, + { + "epoch": 0.5736268257560173, + "grad_norm": 2.472872631745059, + "learning_rate": 8.112365182291783e-06, + "loss": 0.6884, + "step": 6971 + }, + { + "epoch": 0.5737091133511623, + "grad_norm": 2.8640352387312853, + "learning_rate": 8.109747861634836e-06, + "loss": 0.7537, + "step": 6972 + }, + { + "epoch": 0.5737914009463073, + "grad_norm": 3.2981077021912517, + "learning_rate": 8.107130675258378e-06, + "loss": 0.7223, + "step": 6973 + }, + { + "epoch": 0.5738736885414524, + "grad_norm": 2.546333181605702, + "learning_rate": 8.104513623348326e-06, + "loss": 0.7384, + "step": 6974 + }, + { + "epoch": 0.5739559761365974, + "grad_norm": 3.1143357782897354, + "learning_rate": 8.101896706090602e-06, + "loss": 0.7303, + "step": 6975 + }, + { + "epoch": 0.5740382637317425, + "grad_norm": 2.879274343146357, + "learning_rate": 8.099279923671092e-06, + "loss": 0.7086, + "step": 6976 + }, + { + "epoch": 0.5741205513268874, + "grad_norm": 2.753343759382858, + "learning_rate": 8.096663276275703e-06, + "loss": 0.6833, + "step": 6977 + }, + { + "epoch": 0.5742028389220325, + "grad_norm": 2.6061282786159325, + "learning_rate": 8.094046764090309e-06, + "loss": 0.7156, + "step": 6978 + }, + { + "epoch": 0.5742851265171776, + "grad_norm": 2.838322219499334, + "learning_rate": 8.091430387300784e-06, + "loss": 0.7332, + "step": 6979 + }, + { + "epoch": 0.5743674141123226, + "grad_norm": 2.5887117227697867, + "learning_rate": 8.088814146092991e-06, + "loss": 0.7608, + "step": 6980 + }, + { + "epoch": 0.5744497017074676, + "grad_norm": 3.2335090583738806, + "learning_rate": 8.086198040652789e-06, + "loss": 0.7333, + "step": 6981 + }, + { + "epoch": 0.5745319893026126, + "grad_norm": 2.509842148626471, + "learning_rate": 8.083582071166012e-06, + "loss": 0.7428, + "step": 6982 + }, + { + "epoch": 0.5746142768977577, + "grad_norm": 3.5583783443362473, + "learning_rate": 8.080966237818504e-06, + "loss": 0.7384, + "step": 6983 + }, + { + "epoch": 0.5746965644929027, + "grad_norm": 2.8245194754140033, + "learning_rate": 8.078350540796083e-06, + "loss": 0.7268, + "step": 6984 + }, + { + "epoch": 0.5747788520880477, + "grad_norm": 2.589718682648033, + "learning_rate": 8.075734980284568e-06, + "loss": 0.7078, + "step": 6985 + }, + { + "epoch": 0.5748611396831927, + "grad_norm": 2.1014769912943656, + "learning_rate": 8.07311955646976e-06, + "loss": 0.7507, + "step": 6986 + }, + { + "epoch": 0.5749434272783378, + "grad_norm": 2.535767495257375, + "learning_rate": 8.070504269537462e-06, + "loss": 0.7413, + "step": 6987 + }, + { + "epoch": 0.5750257148734829, + "grad_norm": 0.4142548822833212, + "learning_rate": 8.067889119673449e-06, + "loss": 0.4858, + "step": 6988 + }, + { + "epoch": 0.5751080024686278, + "grad_norm": 3.3321149471063913, + "learning_rate": 8.065274107063507e-06, + "loss": 0.7199, + "step": 6989 + }, + { + "epoch": 0.5751902900637729, + "grad_norm": 2.739524582851905, + "learning_rate": 8.062659231893395e-06, + "loss": 0.7146, + "step": 6990 + }, + { + "epoch": 0.5752725776589179, + "grad_norm": 2.46448998280583, + "learning_rate": 8.060044494348878e-06, + "loss": 0.7512, + "step": 6991 + }, + { + "epoch": 0.575354865254063, + "grad_norm": 2.7438161536081798, + "learning_rate": 8.057429894615693e-06, + "loss": 0.7412, + "step": 6992 + }, + { + "epoch": 0.5754371528492079, + "grad_norm": 0.40732972597422157, + "learning_rate": 8.054815432879587e-06, + "loss": 0.483, + "step": 6993 + }, + { + "epoch": 0.575519440444353, + "grad_norm": 3.3823120774696664, + "learning_rate": 8.052201109326273e-06, + "loss": 0.7266, + "step": 6994 + }, + { + "epoch": 0.575601728039498, + "grad_norm": 2.374796087026487, + "learning_rate": 8.049586924141484e-06, + "loss": 0.7334, + "step": 6995 + }, + { + "epoch": 0.5756840156346431, + "grad_norm": 2.31659324450982, + "learning_rate": 8.046972877510918e-06, + "loss": 0.7224, + "step": 6996 + }, + { + "epoch": 0.5757663032297881, + "grad_norm": 3.5480640317249637, + "learning_rate": 8.044358969620277e-06, + "loss": 0.7215, + "step": 6997 + }, + { + "epoch": 0.5758485908249331, + "grad_norm": 2.527524975193735, + "learning_rate": 8.041745200655247e-06, + "loss": 0.7328, + "step": 6998 + }, + { + "epoch": 0.5759308784200782, + "grad_norm": 2.3276015738075735, + "learning_rate": 8.039131570801506e-06, + "loss": 0.7334, + "step": 6999 + }, + { + "epoch": 0.5760131660152232, + "grad_norm": 3.4769517268737884, + "learning_rate": 8.03651808024472e-06, + "loss": 0.7226, + "step": 7000 + }, + { + "epoch": 0.5760954536103683, + "grad_norm": 2.3833350964588282, + "learning_rate": 8.033904729170553e-06, + "loss": 0.7247, + "step": 7001 + }, + { + "epoch": 0.5761777412055132, + "grad_norm": 2.796263504670092, + "learning_rate": 8.031291517764645e-06, + "loss": 0.7361, + "step": 7002 + }, + { + "epoch": 0.5762600288006583, + "grad_norm": 3.177374863609765, + "learning_rate": 8.028678446212645e-06, + "loss": 0.7439, + "step": 7003 + }, + { + "epoch": 0.5763423163958034, + "grad_norm": 3.0070083427212073, + "learning_rate": 8.02606551470017e-06, + "loss": 0.7678, + "step": 7004 + }, + { + "epoch": 0.5764246039909484, + "grad_norm": 2.7267273263647653, + "learning_rate": 8.023452723412848e-06, + "loss": 0.7395, + "step": 7005 + }, + { + "epoch": 0.5765068915860934, + "grad_norm": 2.836042231753001, + "learning_rate": 8.02084007253628e-06, + "loss": 0.7351, + "step": 7006 + }, + { + "epoch": 0.5765891791812384, + "grad_norm": 5.61460986851652, + "learning_rate": 8.018227562256072e-06, + "loss": 0.7355, + "step": 7007 + }, + { + "epoch": 0.5766714667763835, + "grad_norm": 2.6418882275660844, + "learning_rate": 8.015615192757807e-06, + "loss": 0.7485, + "step": 7008 + }, + { + "epoch": 0.5767537543715285, + "grad_norm": 3.0016490820576416, + "learning_rate": 8.013002964227065e-06, + "loss": 0.7124, + "step": 7009 + }, + { + "epoch": 0.5768360419666735, + "grad_norm": 0.4085500696272252, + "learning_rate": 8.010390876849415e-06, + "loss": 0.4598, + "step": 7010 + }, + { + "epoch": 0.5769183295618185, + "grad_norm": 0.4100995645430402, + "learning_rate": 8.007778930810414e-06, + "loss": 0.4997, + "step": 7011 + }, + { + "epoch": 0.5770006171569636, + "grad_norm": 3.0879918867190423, + "learning_rate": 8.005167126295612e-06, + "loss": 0.753, + "step": 7012 + }, + { + "epoch": 0.5770829047521087, + "grad_norm": 2.757423148186556, + "learning_rate": 8.00255546349055e-06, + "loss": 0.7059, + "step": 7013 + }, + { + "epoch": 0.5771651923472536, + "grad_norm": 2.327817685974752, + "learning_rate": 7.999943942580748e-06, + "loss": 0.7259, + "step": 7014 + }, + { + "epoch": 0.5772474799423987, + "grad_norm": 2.441448455967104, + "learning_rate": 7.997332563751734e-06, + "loss": 0.7347, + "step": 7015 + }, + { + "epoch": 0.5773297675375437, + "grad_norm": 3.037312572377695, + "learning_rate": 7.99472132718901e-06, + "loss": 0.7547, + "step": 7016 + }, + { + "epoch": 0.5774120551326888, + "grad_norm": 2.6318083997736066, + "learning_rate": 7.992110233078078e-06, + "loss": 0.7406, + "step": 7017 + }, + { + "epoch": 0.5774943427278337, + "grad_norm": 2.784784467697426, + "learning_rate": 7.989499281604419e-06, + "loss": 0.7276, + "step": 7018 + }, + { + "epoch": 0.5775766303229788, + "grad_norm": 3.1000890748457994, + "learning_rate": 7.986888472953523e-06, + "loss": 0.7068, + "step": 7019 + }, + { + "epoch": 0.5776589179181238, + "grad_norm": 2.0140621482733323, + "learning_rate": 7.984277807310844e-06, + "loss": 0.7352, + "step": 7020 + }, + { + "epoch": 0.5777412055132689, + "grad_norm": 0.43736711332795764, + "learning_rate": 7.981667284861849e-06, + "loss": 0.4978, + "step": 7021 + }, + { + "epoch": 0.5778234931084139, + "grad_norm": 2.54155384688715, + "learning_rate": 7.979056905791981e-06, + "loss": 0.732, + "step": 7022 + }, + { + "epoch": 0.5779057807035589, + "grad_norm": 2.426701129691466, + "learning_rate": 7.976446670286681e-06, + "loss": 0.7192, + "step": 7023 + }, + { + "epoch": 0.577988068298704, + "grad_norm": 2.31743678852942, + "learning_rate": 7.97383657853137e-06, + "loss": 0.7594, + "step": 7024 + }, + { + "epoch": 0.578070355893849, + "grad_norm": 2.8844360858609788, + "learning_rate": 7.971226630711472e-06, + "loss": 0.7542, + "step": 7025 + }, + { + "epoch": 0.5781526434889941, + "grad_norm": 2.6866543599639874, + "learning_rate": 7.968616827012382e-06, + "loss": 0.7201, + "step": 7026 + }, + { + "epoch": 0.578234931084139, + "grad_norm": 2.678180058374591, + "learning_rate": 7.966007167619511e-06, + "loss": 0.7466, + "step": 7027 + }, + { + "epoch": 0.5783172186792841, + "grad_norm": 3.3059825872036983, + "learning_rate": 7.963397652718237e-06, + "loss": 0.7482, + "step": 7028 + }, + { + "epoch": 0.5783995062744292, + "grad_norm": 2.639782044633218, + "learning_rate": 7.960788282493937e-06, + "loss": 0.6924, + "step": 7029 + }, + { + "epoch": 0.5784817938695742, + "grad_norm": 2.616709987180559, + "learning_rate": 7.958179057131973e-06, + "loss": 0.7418, + "step": 7030 + }, + { + "epoch": 0.5785640814647192, + "grad_norm": 2.8036141217104524, + "learning_rate": 7.955569976817706e-06, + "loss": 0.7427, + "step": 7031 + }, + { + "epoch": 0.5786463690598642, + "grad_norm": 0.4331179441389204, + "learning_rate": 7.952961041736476e-06, + "loss": 0.4973, + "step": 7032 + }, + { + "epoch": 0.5787286566550093, + "grad_norm": 2.618523914844018, + "learning_rate": 7.950352252073622e-06, + "loss": 0.7555, + "step": 7033 + }, + { + "epoch": 0.5788109442501543, + "grad_norm": 2.541039953163154, + "learning_rate": 7.947743608014464e-06, + "loss": 0.7294, + "step": 7034 + }, + { + "epoch": 0.5788932318452993, + "grad_norm": 2.708075660357763, + "learning_rate": 7.945135109744321e-06, + "loss": 0.7266, + "step": 7035 + }, + { + "epoch": 0.5789755194404443, + "grad_norm": 3.050387125289151, + "learning_rate": 7.942526757448491e-06, + "loss": 0.7099, + "step": 7036 + }, + { + "epoch": 0.5790578070355894, + "grad_norm": 2.881948114156914, + "learning_rate": 7.939918551312272e-06, + "loss": 0.7371, + "step": 7037 + }, + { + "epoch": 0.5791400946307345, + "grad_norm": 2.6029616866239946, + "learning_rate": 7.937310491520939e-06, + "loss": 0.7404, + "step": 7038 + }, + { + "epoch": 0.5792223822258794, + "grad_norm": 2.318004305183051, + "learning_rate": 7.934702578259777e-06, + "loss": 0.7305, + "step": 7039 + }, + { + "epoch": 0.5793046698210245, + "grad_norm": 2.764184929273391, + "learning_rate": 7.932094811714037e-06, + "loss": 0.7537, + "step": 7040 + }, + { + "epoch": 0.5793869574161695, + "grad_norm": 2.2772965985263274, + "learning_rate": 7.929487192068977e-06, + "loss": 0.7154, + "step": 7041 + }, + { + "epoch": 0.5794692450113146, + "grad_norm": 2.6559876645231837, + "learning_rate": 7.926879719509833e-06, + "loss": 0.7206, + "step": 7042 + }, + { + "epoch": 0.5795515326064595, + "grad_norm": 2.7599342743466373, + "learning_rate": 7.92427239422184e-06, + "loss": 0.6947, + "step": 7043 + }, + { + "epoch": 0.5796338202016046, + "grad_norm": 2.5046260943215577, + "learning_rate": 7.921665216390213e-06, + "loss": 0.7166, + "step": 7044 + }, + { + "epoch": 0.5797161077967496, + "grad_norm": 2.4951993022644308, + "learning_rate": 7.91905818620017e-06, + "loss": 0.7044, + "step": 7045 + }, + { + "epoch": 0.5797983953918947, + "grad_norm": 3.257701560655134, + "learning_rate": 7.9164513038369e-06, + "loss": 0.7103, + "step": 7046 + }, + { + "epoch": 0.5798806829870397, + "grad_norm": 2.113884340036998, + "learning_rate": 7.913844569485603e-06, + "loss": 0.702, + "step": 7047 + }, + { + "epoch": 0.5799629705821847, + "grad_norm": 2.888735577230008, + "learning_rate": 7.911237983331447e-06, + "loss": 0.7426, + "step": 7048 + }, + { + "epoch": 0.5800452581773298, + "grad_norm": 2.3831799961242432, + "learning_rate": 7.908631545559609e-06, + "loss": 0.7324, + "step": 7049 + }, + { + "epoch": 0.5801275457724748, + "grad_norm": 3.3762026372488734, + "learning_rate": 7.906025256355235e-06, + "loss": 0.7637, + "step": 7050 + }, + { + "epoch": 0.5802098333676198, + "grad_norm": 2.657266579968778, + "learning_rate": 7.903419115903486e-06, + "loss": 0.7488, + "step": 7051 + }, + { + "epoch": 0.5802921209627648, + "grad_norm": 0.4104831745016612, + "learning_rate": 7.900813124389483e-06, + "loss": 0.4874, + "step": 7052 + }, + { + "epoch": 0.5803744085579099, + "grad_norm": 2.631659102642673, + "learning_rate": 7.898207281998362e-06, + "loss": 0.7259, + "step": 7053 + }, + { + "epoch": 0.580456696153055, + "grad_norm": 2.4786942113467485, + "learning_rate": 7.895601588915232e-06, + "loss": 0.7002, + "step": 7054 + }, + { + "epoch": 0.5805389837482, + "grad_norm": 0.396880011091362, + "learning_rate": 7.892996045325203e-06, + "loss": 0.4837, + "step": 7055 + }, + { + "epoch": 0.580621271343345, + "grad_norm": 2.652649019855485, + "learning_rate": 7.890390651413362e-06, + "loss": 0.7144, + "step": 7056 + }, + { + "epoch": 0.58070355893849, + "grad_norm": 3.0068448969471104, + "learning_rate": 7.887785407364799e-06, + "loss": 0.705, + "step": 7057 + }, + { + "epoch": 0.5807858465336351, + "grad_norm": 2.30481378273506, + "learning_rate": 7.885180313364576e-06, + "loss": 0.7173, + "step": 7058 + }, + { + "epoch": 0.5808681341287801, + "grad_norm": 2.152586750010606, + "learning_rate": 7.882575369597768e-06, + "loss": 0.7012, + "step": 7059 + }, + { + "epoch": 0.5809504217239251, + "grad_norm": 2.3158967358884346, + "learning_rate": 7.879970576249416e-06, + "loss": 0.7526, + "step": 7060 + }, + { + "epoch": 0.5810327093190701, + "grad_norm": 2.104324177582021, + "learning_rate": 7.877365933504567e-06, + "loss": 0.6974, + "step": 7061 + }, + { + "epoch": 0.5811149969142152, + "grad_norm": 0.41662109525025554, + "learning_rate": 7.874761441548244e-06, + "loss": 0.5164, + "step": 7062 + }, + { + "epoch": 0.5811972845093603, + "grad_norm": 2.891388460909693, + "learning_rate": 7.872157100565472e-06, + "loss": 0.7054, + "step": 7063 + }, + { + "epoch": 0.5812795721045052, + "grad_norm": 2.4487036930974755, + "learning_rate": 7.869552910741253e-06, + "loss": 0.7012, + "step": 7064 + }, + { + "epoch": 0.5813618596996503, + "grad_norm": 2.4956113098228503, + "learning_rate": 7.86694887226059e-06, + "loss": 0.7425, + "step": 7065 + }, + { + "epoch": 0.5814441472947953, + "grad_norm": 2.4564778080765572, + "learning_rate": 7.86434498530847e-06, + "loss": 0.7477, + "step": 7066 + }, + { + "epoch": 0.5815264348899404, + "grad_norm": 2.5564800182519143, + "learning_rate": 7.861741250069866e-06, + "loss": 0.7159, + "step": 7067 + }, + { + "epoch": 0.5816087224850853, + "grad_norm": 2.467057450598865, + "learning_rate": 7.859137666729742e-06, + "loss": 0.7335, + "step": 7068 + }, + { + "epoch": 0.5816910100802304, + "grad_norm": 0.42509079719068443, + "learning_rate": 7.856534235473055e-06, + "loss": 0.5062, + "step": 7069 + }, + { + "epoch": 0.5817732976753754, + "grad_norm": 2.2157189877768704, + "learning_rate": 7.853930956484745e-06, + "loss": 0.7182, + "step": 7070 + }, + { + "epoch": 0.5818555852705205, + "grad_norm": 2.734859071842515, + "learning_rate": 7.851327829949752e-06, + "loss": 0.7486, + "step": 7071 + }, + { + "epoch": 0.5819378728656655, + "grad_norm": 5.957085510825153, + "learning_rate": 7.848724856052993e-06, + "loss": 0.7398, + "step": 7072 + }, + { + "epoch": 0.5820201604608105, + "grad_norm": 2.9099864590216984, + "learning_rate": 7.84612203497938e-06, + "loss": 0.7395, + "step": 7073 + }, + { + "epoch": 0.5821024480559556, + "grad_norm": 3.056957885517857, + "learning_rate": 7.843519366913813e-06, + "loss": 0.7605, + "step": 7074 + }, + { + "epoch": 0.5821847356511006, + "grad_norm": 3.2967667312019384, + "learning_rate": 7.840916852041182e-06, + "loss": 0.7281, + "step": 7075 + }, + { + "epoch": 0.5822670232462456, + "grad_norm": 2.6698190242841244, + "learning_rate": 7.83831449054636e-06, + "loss": 0.7233, + "step": 7076 + }, + { + "epoch": 0.5823493108413906, + "grad_norm": 0.45093724405010566, + "learning_rate": 7.835712282614225e-06, + "loss": 0.5099, + "step": 7077 + }, + { + "epoch": 0.5824315984365357, + "grad_norm": 3.1127490540300626, + "learning_rate": 7.833110228429626e-06, + "loss": 0.7396, + "step": 7078 + }, + { + "epoch": 0.5825138860316807, + "grad_norm": 2.206841865442832, + "learning_rate": 7.830508328177412e-06, + "loss": 0.7067, + "step": 7079 + }, + { + "epoch": 0.5825961736268258, + "grad_norm": 2.2392870074942137, + "learning_rate": 7.827906582042415e-06, + "loss": 0.7497, + "step": 7080 + }, + { + "epoch": 0.5826784612219708, + "grad_norm": 2.6279456973406528, + "learning_rate": 7.825304990209463e-06, + "loss": 0.7445, + "step": 7081 + }, + { + "epoch": 0.5827607488171158, + "grad_norm": 0.4305210954612214, + "learning_rate": 7.822703552863362e-06, + "loss": 0.4887, + "step": 7082 + }, + { + "epoch": 0.5828430364122609, + "grad_norm": 2.815882394743614, + "learning_rate": 7.820102270188926e-06, + "loss": 0.7669, + "step": 7083 + }, + { + "epoch": 0.5829253240074059, + "grad_norm": 3.267898800044683, + "learning_rate": 7.817501142370931e-06, + "loss": 0.7259, + "step": 7084 + }, + { + "epoch": 0.5830076116025509, + "grad_norm": 2.7138706801211083, + "learning_rate": 7.81490016959417e-06, + "loss": 0.7401, + "step": 7085 + }, + { + "epoch": 0.5830898991976959, + "grad_norm": 2.482887114191282, + "learning_rate": 7.812299352043404e-06, + "loss": 0.7453, + "step": 7086 + }, + { + "epoch": 0.583172186792841, + "grad_norm": 2.2804488446168683, + "learning_rate": 7.809698689903393e-06, + "loss": 0.7217, + "step": 7087 + }, + { + "epoch": 0.583254474387986, + "grad_norm": 2.8594622074580984, + "learning_rate": 7.807098183358882e-06, + "loss": 0.7128, + "step": 7088 + }, + { + "epoch": 0.583336761983131, + "grad_norm": 2.335874862364808, + "learning_rate": 7.804497832594615e-06, + "loss": 0.72, + "step": 7089 + }, + { + "epoch": 0.5834190495782761, + "grad_norm": 2.4628697559101864, + "learning_rate": 7.801897637795305e-06, + "loss": 0.7124, + "step": 7090 + }, + { + "epoch": 0.5835013371734211, + "grad_norm": 2.1692699683110446, + "learning_rate": 7.799297599145676e-06, + "loss": 0.7352, + "step": 7091 + }, + { + "epoch": 0.5835836247685662, + "grad_norm": 4.206027334641872, + "learning_rate": 7.796697716830422e-06, + "loss": 0.7368, + "step": 7092 + }, + { + "epoch": 0.5836659123637111, + "grad_norm": 2.6354937850641527, + "learning_rate": 7.794097991034243e-06, + "loss": 0.7343, + "step": 7093 + }, + { + "epoch": 0.5837481999588562, + "grad_norm": 0.3996349036329061, + "learning_rate": 7.791498421941812e-06, + "loss": 0.4779, + "step": 7094 + }, + { + "epoch": 0.5838304875540012, + "grad_norm": 0.4380709507118102, + "learning_rate": 7.788899009737802e-06, + "loss": 0.5281, + "step": 7095 + }, + { + "epoch": 0.5839127751491463, + "grad_norm": 0.42945737793408656, + "learning_rate": 7.786299754606868e-06, + "loss": 0.4786, + "step": 7096 + }, + { + "epoch": 0.5839950627442912, + "grad_norm": 3.3654231813035222, + "learning_rate": 7.783700656733661e-06, + "loss": 0.7297, + "step": 7097 + }, + { + "epoch": 0.5840773503394363, + "grad_norm": 2.573883606469181, + "learning_rate": 7.781101716302814e-06, + "loss": 0.729, + "step": 7098 + }, + { + "epoch": 0.5841596379345814, + "grad_norm": 2.771203322015796, + "learning_rate": 7.778502933498955e-06, + "loss": 0.7218, + "step": 7099 + }, + { + "epoch": 0.5842419255297264, + "grad_norm": 2.426693584130095, + "learning_rate": 7.775904308506693e-06, + "loss": 0.7259, + "step": 7100 + }, + { + "epoch": 0.5843242131248714, + "grad_norm": 2.0711443706993857, + "learning_rate": 7.773305841510634e-06, + "loss": 0.7365, + "step": 7101 + }, + { + "epoch": 0.5844065007200164, + "grad_norm": 2.2596073901221287, + "learning_rate": 7.77070753269536e-06, + "loss": 0.7328, + "step": 7102 + }, + { + "epoch": 0.5844887883151615, + "grad_norm": 3.0260997554559323, + "learning_rate": 7.768109382245465e-06, + "loss": 0.7178, + "step": 7103 + }, + { + "epoch": 0.5845710759103065, + "grad_norm": 2.3751282464949335, + "learning_rate": 7.765511390345509e-06, + "loss": 0.7115, + "step": 7104 + }, + { + "epoch": 0.5846533635054516, + "grad_norm": 2.7161771686707055, + "learning_rate": 7.76291355718005e-06, + "loss": 0.7381, + "step": 7105 + }, + { + "epoch": 0.5847356511005966, + "grad_norm": 2.64099317506132, + "learning_rate": 7.760315882933631e-06, + "loss": 0.7344, + "step": 7106 + }, + { + "epoch": 0.5848179386957416, + "grad_norm": 2.0198803417203477, + "learning_rate": 7.757718367790794e-06, + "loss": 0.714, + "step": 7107 + }, + { + "epoch": 0.5849002262908867, + "grad_norm": 3.0917951990095056, + "learning_rate": 7.755121011936053e-06, + "loss": 0.7422, + "step": 7108 + }, + { + "epoch": 0.5849825138860317, + "grad_norm": 2.287234919068011, + "learning_rate": 7.75252381555393e-06, + "loss": 0.7385, + "step": 7109 + }, + { + "epoch": 0.5850648014811767, + "grad_norm": 2.316000870594042, + "learning_rate": 7.749926778828919e-06, + "loss": 0.7388, + "step": 7110 + }, + { + "epoch": 0.5851470890763217, + "grad_norm": 3.1813749273594936, + "learning_rate": 7.747329901945513e-06, + "loss": 0.7329, + "step": 7111 + }, + { + "epoch": 0.5852293766714668, + "grad_norm": 2.2319308028151457, + "learning_rate": 7.744733185088186e-06, + "loss": 0.7456, + "step": 7112 + }, + { + "epoch": 0.5853116642666119, + "grad_norm": 2.6747667384185645, + "learning_rate": 7.742136628441408e-06, + "loss": 0.7155, + "step": 7113 + }, + { + "epoch": 0.5853939518617568, + "grad_norm": 2.8559850503511055, + "learning_rate": 7.73954023218963e-06, + "loss": 0.7216, + "step": 7114 + }, + { + "epoch": 0.5854762394569019, + "grad_norm": 0.43396018014482957, + "learning_rate": 7.736943996517306e-06, + "loss": 0.4908, + "step": 7115 + }, + { + "epoch": 0.5855585270520469, + "grad_norm": 2.452288596869233, + "learning_rate": 7.734347921608852e-06, + "loss": 0.732, + "step": 7116 + }, + { + "epoch": 0.585640814647192, + "grad_norm": 2.549147389910834, + "learning_rate": 7.731752007648706e-06, + "loss": 0.717, + "step": 7117 + }, + { + "epoch": 0.5857231022423369, + "grad_norm": 1.9970971654916436, + "learning_rate": 7.729156254821263e-06, + "loss": 0.6969, + "step": 7118 + }, + { + "epoch": 0.585805389837482, + "grad_norm": 3.2416540539121437, + "learning_rate": 7.726560663310934e-06, + "loss": 0.7588, + "step": 7119 + }, + { + "epoch": 0.585887677432627, + "grad_norm": 0.4095446623118184, + "learning_rate": 7.723965233302095e-06, + "loss": 0.4726, + "step": 7120 + }, + { + "epoch": 0.5859699650277721, + "grad_norm": 0.43071830605942374, + "learning_rate": 7.721369964979131e-06, + "loss": 0.4807, + "step": 7121 + }, + { + "epoch": 0.586052252622917, + "grad_norm": 2.0422065020417404, + "learning_rate": 7.718774858526395e-06, + "loss": 0.7258, + "step": 7122 + }, + { + "epoch": 0.5861345402180621, + "grad_norm": 3.139550732332815, + "learning_rate": 7.71617991412825e-06, + "loss": 0.7507, + "step": 7123 + }, + { + "epoch": 0.5862168278132072, + "grad_norm": 2.5082745119797005, + "learning_rate": 7.713585131969027e-06, + "loss": 0.7348, + "step": 7124 + }, + { + "epoch": 0.5862991154083522, + "grad_norm": 1.827534261660831, + "learning_rate": 7.710990512233063e-06, + "loss": 0.7057, + "step": 7125 + }, + { + "epoch": 0.5863814030034972, + "grad_norm": 2.2939407407645485, + "learning_rate": 7.70839605510467e-06, + "loss": 0.7611, + "step": 7126 + }, + { + "epoch": 0.5864636905986422, + "grad_norm": 2.30154625807231, + "learning_rate": 7.705801760768159e-06, + "loss": 0.7044, + "step": 7127 + }, + { + "epoch": 0.5865459781937873, + "grad_norm": 2.3465580040842227, + "learning_rate": 7.703207629407817e-06, + "loss": 0.7015, + "step": 7128 + }, + { + "epoch": 0.5866282657889323, + "grad_norm": 2.1906168406271562, + "learning_rate": 7.700613661207935e-06, + "loss": 0.75, + "step": 7129 + }, + { + "epoch": 0.5867105533840774, + "grad_norm": 2.0749992627979497, + "learning_rate": 7.698019856352781e-06, + "loss": 0.7074, + "step": 7130 + }, + { + "epoch": 0.5867928409792224, + "grad_norm": 2.086983436985289, + "learning_rate": 7.695426215026612e-06, + "loss": 0.7375, + "step": 7131 + }, + { + "epoch": 0.5868751285743674, + "grad_norm": 3.039017989513399, + "learning_rate": 7.692832737413687e-06, + "loss": 0.7014, + "step": 7132 + }, + { + "epoch": 0.5869574161695125, + "grad_norm": 2.0798151644701797, + "learning_rate": 7.690239423698226e-06, + "loss": 0.7125, + "step": 7133 + }, + { + "epoch": 0.5870397037646575, + "grad_norm": 2.2865532020651176, + "learning_rate": 7.687646274064467e-06, + "loss": 0.7288, + "step": 7134 + }, + { + "epoch": 0.5871219913598025, + "grad_norm": 2.823923044750235, + "learning_rate": 7.685053288696618e-06, + "loss": 0.7267, + "step": 7135 + }, + { + "epoch": 0.5872042789549475, + "grad_norm": 0.4274350449715916, + "learning_rate": 7.682460467778882e-06, + "loss": 0.5148, + "step": 7136 + }, + { + "epoch": 0.5872865665500926, + "grad_norm": 2.2838688426896274, + "learning_rate": 7.679867811495445e-06, + "loss": 0.7397, + "step": 7137 + }, + { + "epoch": 0.5873688541452377, + "grad_norm": 2.559391068334812, + "learning_rate": 7.677275320030491e-06, + "loss": 0.7307, + "step": 7138 + }, + { + "epoch": 0.5874511417403826, + "grad_norm": 2.337698131316199, + "learning_rate": 7.67468299356818e-06, + "loss": 0.7307, + "step": 7139 + }, + { + "epoch": 0.5875334293355277, + "grad_norm": 2.1253239644028947, + "learning_rate": 7.672090832292677e-06, + "loss": 0.7341, + "step": 7140 + }, + { + "epoch": 0.5876157169306727, + "grad_norm": 2.316751383868327, + "learning_rate": 7.669498836388111e-06, + "loss": 0.7756, + "step": 7141 + }, + { + "epoch": 0.5876980045258178, + "grad_norm": 2.3281173604386676, + "learning_rate": 7.666907006038627e-06, + "loss": 0.7477, + "step": 7142 + }, + { + "epoch": 0.5877802921209627, + "grad_norm": 3.833662913598264, + "learning_rate": 7.664315341428334e-06, + "loss": 0.6968, + "step": 7143 + }, + { + "epoch": 0.5878625797161078, + "grad_norm": 0.4150420648534827, + "learning_rate": 7.661723842741346e-06, + "loss": 0.471, + "step": 7144 + }, + { + "epoch": 0.5879448673112528, + "grad_norm": 0.42004421621269705, + "learning_rate": 7.659132510161752e-06, + "loss": 0.5062, + "step": 7145 + }, + { + "epoch": 0.5880271549063979, + "grad_norm": 2.656019859531749, + "learning_rate": 7.656541343873646e-06, + "loss": 0.7224, + "step": 7146 + }, + { + "epoch": 0.5881094425015428, + "grad_norm": 2.5439502239517813, + "learning_rate": 7.653950344061092e-06, + "loss": 0.7171, + "step": 7147 + }, + { + "epoch": 0.5881917300966879, + "grad_norm": 3.0682789634649725, + "learning_rate": 7.651359510908157e-06, + "loss": 0.7263, + "step": 7148 + }, + { + "epoch": 0.588274017691833, + "grad_norm": 4.338426984553852, + "learning_rate": 7.648768844598882e-06, + "loss": 0.7402, + "step": 7149 + }, + { + "epoch": 0.588356305286978, + "grad_norm": 4.242795082182335, + "learning_rate": 7.646178345317312e-06, + "loss": 0.7022, + "step": 7150 + }, + { + "epoch": 0.588438592882123, + "grad_norm": 2.261552907506846, + "learning_rate": 7.643588013247464e-06, + "loss": 0.7286, + "step": 7151 + }, + { + "epoch": 0.588520880477268, + "grad_norm": 3.541780561106453, + "learning_rate": 7.640997848573358e-06, + "loss": 0.7128, + "step": 7152 + }, + { + "epoch": 0.5886031680724131, + "grad_norm": 2.706950973992088, + "learning_rate": 7.63840785147899e-06, + "loss": 0.732, + "step": 7153 + }, + { + "epoch": 0.5886854556675581, + "grad_norm": 2.461554973909932, + "learning_rate": 7.635818022148356e-06, + "loss": 0.726, + "step": 7154 + }, + { + "epoch": 0.5887677432627032, + "grad_norm": 2.707797334287131, + "learning_rate": 7.633228360765425e-06, + "loss": 0.7502, + "step": 7155 + }, + { + "epoch": 0.5888500308578482, + "grad_norm": 1.9866735775888418, + "learning_rate": 7.630638867514167e-06, + "loss": 0.7517, + "step": 7156 + }, + { + "epoch": 0.5889323184529932, + "grad_norm": 3.2583651002202934, + "learning_rate": 7.628049542578532e-06, + "loss": 0.7464, + "step": 7157 + }, + { + "epoch": 0.5890146060481383, + "grad_norm": 2.0255326823560886, + "learning_rate": 7.625460386142468e-06, + "loss": 0.7443, + "step": 7158 + }, + { + "epoch": 0.5890968936432833, + "grad_norm": 0.4159661936726183, + "learning_rate": 7.622871398389899e-06, + "loss": 0.4646, + "step": 7159 + }, + { + "epoch": 0.5891791812384283, + "grad_norm": 2.6449128716197023, + "learning_rate": 7.620282579504745e-06, + "loss": 0.7366, + "step": 7160 + }, + { + "epoch": 0.5892614688335733, + "grad_norm": 2.785935541170508, + "learning_rate": 7.61769392967091e-06, + "loss": 0.7491, + "step": 7161 + }, + { + "epoch": 0.5893437564287184, + "grad_norm": 2.0410450322343827, + "learning_rate": 7.615105449072288e-06, + "loss": 0.7062, + "step": 7162 + }, + { + "epoch": 0.5894260440238634, + "grad_norm": 2.664775783624409, + "learning_rate": 7.6125171378927585e-06, + "loss": 0.7146, + "step": 7163 + }, + { + "epoch": 0.5895083316190084, + "grad_norm": 3.2199749454709243, + "learning_rate": 7.6099289963162e-06, + "loss": 0.7061, + "step": 7164 + }, + { + "epoch": 0.5895906192141535, + "grad_norm": 2.5405619560864996, + "learning_rate": 7.607341024526457e-06, + "loss": 0.7286, + "step": 7165 + }, + { + "epoch": 0.5896729068092985, + "grad_norm": 14.31389999485597, + "learning_rate": 7.604753222707384e-06, + "loss": 0.7114, + "step": 7166 + }, + { + "epoch": 0.5897551944044436, + "grad_norm": 2.347230219784832, + "learning_rate": 7.602165591042811e-06, + "loss": 0.6988, + "step": 7167 + }, + { + "epoch": 0.5898374819995885, + "grad_norm": 0.41105773458581074, + "learning_rate": 7.599578129716561e-06, + "loss": 0.4964, + "step": 7168 + }, + { + "epoch": 0.5899197695947336, + "grad_norm": 2.793017958342222, + "learning_rate": 7.596990838912437e-06, + "loss": 0.7565, + "step": 7169 + }, + { + "epoch": 0.5900020571898786, + "grad_norm": 0.41161436627665393, + "learning_rate": 7.5944037188142475e-06, + "loss": 0.4895, + "step": 7170 + }, + { + "epoch": 0.5900843447850237, + "grad_norm": 0.4260423001951087, + "learning_rate": 7.591816769605764e-06, + "loss": 0.4928, + "step": 7171 + }, + { + "epoch": 0.5901666323801686, + "grad_norm": 2.7255636756376527, + "learning_rate": 7.58922999147077e-06, + "loss": 0.7278, + "step": 7172 + }, + { + "epoch": 0.5902489199753137, + "grad_norm": 2.4715249558303154, + "learning_rate": 7.586643384593021e-06, + "loss": 0.7318, + "step": 7173 + }, + { + "epoch": 0.5903312075704588, + "grad_norm": 2.4164518742199794, + "learning_rate": 7.584056949156268e-06, + "loss": 0.7572, + "step": 7174 + }, + { + "epoch": 0.5904134951656038, + "grad_norm": 2.90380466671781, + "learning_rate": 7.581470685344242e-06, + "loss": 0.7452, + "step": 7175 + }, + { + "epoch": 0.5904957827607488, + "grad_norm": 4.089689277250992, + "learning_rate": 7.578884593340674e-06, + "loss": 0.7267, + "step": 7176 + }, + { + "epoch": 0.5905780703558938, + "grad_norm": 2.2192065271330588, + "learning_rate": 7.576298673329268e-06, + "loss": 0.6839, + "step": 7177 + }, + { + "epoch": 0.5906603579510389, + "grad_norm": 2.3676364099677594, + "learning_rate": 7.573712925493732e-06, + "loss": 0.7452, + "step": 7178 + }, + { + "epoch": 0.5907426455461839, + "grad_norm": 5.903553077967115, + "learning_rate": 7.5711273500177484e-06, + "loss": 0.7425, + "step": 7179 + }, + { + "epoch": 0.5908249331413289, + "grad_norm": 2.476395481829392, + "learning_rate": 7.568541947084995e-06, + "loss": 0.7512, + "step": 7180 + }, + { + "epoch": 0.590907220736474, + "grad_norm": 3.5753330394501925, + "learning_rate": 7.5659567168791314e-06, + "loss": 0.7188, + "step": 7181 + }, + { + "epoch": 0.590989508331619, + "grad_norm": 2.917583536850498, + "learning_rate": 7.5633716595838125e-06, + "loss": 0.7286, + "step": 7182 + }, + { + "epoch": 0.5910717959267641, + "grad_norm": 1.8373433124290457, + "learning_rate": 7.560786775382669e-06, + "loss": 0.7285, + "step": 7183 + }, + { + "epoch": 0.5911540835219091, + "grad_norm": 2.4055196248633566, + "learning_rate": 7.5582020644593356e-06, + "loss": 0.7709, + "step": 7184 + }, + { + "epoch": 0.5912363711170541, + "grad_norm": 2.149841398879703, + "learning_rate": 7.5556175269974215e-06, + "loss": 0.7346, + "step": 7185 + }, + { + "epoch": 0.5913186587121991, + "grad_norm": 2.9908050419735557, + "learning_rate": 7.553033163180531e-06, + "loss": 0.7238, + "step": 7186 + }, + { + "epoch": 0.5914009463073442, + "grad_norm": 2.4833644504795207, + "learning_rate": 7.55044897319225e-06, + "loss": 0.7509, + "step": 7187 + }, + { + "epoch": 0.5914832339024892, + "grad_norm": 3.0911970005979614, + "learning_rate": 7.547864957216157e-06, + "loss": 0.7751, + "step": 7188 + }, + { + "epoch": 0.5915655214976342, + "grad_norm": 0.44133381463371457, + "learning_rate": 7.545281115435812e-06, + "loss": 0.4818, + "step": 7189 + }, + { + "epoch": 0.5916478090927793, + "grad_norm": 2.316036464973504, + "learning_rate": 7.542697448034775e-06, + "loss": 0.7313, + "step": 7190 + }, + { + "epoch": 0.5917300966879243, + "grad_norm": 2.1463520479398057, + "learning_rate": 7.540113955196579e-06, + "loss": 0.7308, + "step": 7191 + }, + { + "epoch": 0.5918123842830694, + "grad_norm": 2.671480295056863, + "learning_rate": 7.537530637104756e-06, + "loss": 0.7614, + "step": 7192 + }, + { + "epoch": 0.5918946718782143, + "grad_norm": 2.372301581004326, + "learning_rate": 7.534947493942817e-06, + "loss": 0.7195, + "step": 7193 + }, + { + "epoch": 0.5919769594733594, + "grad_norm": 2.303421724417176, + "learning_rate": 7.532364525894267e-06, + "loss": 0.7162, + "step": 7194 + }, + { + "epoch": 0.5920592470685044, + "grad_norm": 3.02644239043821, + "learning_rate": 7.529781733142591e-06, + "loss": 0.7168, + "step": 7195 + }, + { + "epoch": 0.5921415346636495, + "grad_norm": 2.661243354002446, + "learning_rate": 7.527199115871277e-06, + "loss": 0.7283, + "step": 7196 + }, + { + "epoch": 0.5922238222587944, + "grad_norm": 2.703696551001549, + "learning_rate": 7.524616674263776e-06, + "loss": 0.7155, + "step": 7197 + }, + { + "epoch": 0.5923061098539395, + "grad_norm": 0.4177358720419162, + "learning_rate": 7.522034408503551e-06, + "loss": 0.4783, + "step": 7198 + }, + { + "epoch": 0.5923883974490846, + "grad_norm": 2.766038571101227, + "learning_rate": 7.519452318774038e-06, + "loss": 0.7159, + "step": 7199 + }, + { + "epoch": 0.5924706850442296, + "grad_norm": 0.40789327859943747, + "learning_rate": 7.516870405258667e-06, + "loss": 0.4701, + "step": 7200 + }, + { + "epoch": 0.5925529726393746, + "grad_norm": 2.82559802634693, + "learning_rate": 7.5142886681408465e-06, + "loss": 0.7262, + "step": 7201 + }, + { + "epoch": 0.5926352602345196, + "grad_norm": 2.293032986467908, + "learning_rate": 7.5117071076039905e-06, + "loss": 0.7371, + "step": 7202 + }, + { + "epoch": 0.5927175478296647, + "grad_norm": 2.468401739925741, + "learning_rate": 7.509125723831476e-06, + "loss": 0.7008, + "step": 7203 + }, + { + "epoch": 0.5927998354248097, + "grad_norm": 0.4248346875860926, + "learning_rate": 7.506544517006691e-06, + "loss": 0.4951, + "step": 7204 + }, + { + "epoch": 0.5928821230199547, + "grad_norm": 0.41653802582408006, + "learning_rate": 7.503963487312993e-06, + "loss": 0.5045, + "step": 7205 + }, + { + "epoch": 0.5929644106150997, + "grad_norm": 2.635911926590857, + "learning_rate": 7.50138263493374e-06, + "loss": 0.74, + "step": 7206 + }, + { + "epoch": 0.5930466982102448, + "grad_norm": 2.384233588826891, + "learning_rate": 7.498801960052266e-06, + "loss": 0.7136, + "step": 7207 + }, + { + "epoch": 0.5931289858053899, + "grad_norm": 2.729536548388346, + "learning_rate": 7.496221462851902e-06, + "loss": 0.7404, + "step": 7208 + }, + { + "epoch": 0.5932112734005349, + "grad_norm": 3.32645327998201, + "learning_rate": 7.493641143515957e-06, + "loss": 0.7484, + "step": 7209 + }, + { + "epoch": 0.5932935609956799, + "grad_norm": 2.5472167996655415, + "learning_rate": 7.491061002227742e-06, + "loss": 0.7236, + "step": 7210 + }, + { + "epoch": 0.5933758485908249, + "grad_norm": 2.8074357262836833, + "learning_rate": 7.488481039170539e-06, + "loss": 0.7229, + "step": 7211 + }, + { + "epoch": 0.59345813618597, + "grad_norm": 3.138750532712732, + "learning_rate": 7.485901254527628e-06, + "loss": 0.7491, + "step": 7212 + }, + { + "epoch": 0.593540423781115, + "grad_norm": 2.842040825698355, + "learning_rate": 7.483321648482269e-06, + "loss": 0.7415, + "step": 7213 + }, + { + "epoch": 0.59362271137626, + "grad_norm": 3.9170710492901915, + "learning_rate": 7.480742221217717e-06, + "loss": 0.7307, + "step": 7214 + }, + { + "epoch": 0.593704998971405, + "grad_norm": 3.4136034554913484, + "learning_rate": 7.478162972917204e-06, + "loss": 0.7073, + "step": 7215 + }, + { + "epoch": 0.5937872865665501, + "grad_norm": 0.39292347203059247, + "learning_rate": 7.475583903763965e-06, + "loss": 0.4807, + "step": 7216 + }, + { + "epoch": 0.5938695741616952, + "grad_norm": 0.42839484451111937, + "learning_rate": 7.473005013941208e-06, + "loss": 0.495, + "step": 7217 + }, + { + "epoch": 0.5939518617568401, + "grad_norm": 2.545538297754622, + "learning_rate": 7.470426303632135e-06, + "loss": 0.7502, + "step": 7218 + }, + { + "epoch": 0.5940341493519852, + "grad_norm": 2.5832884896136177, + "learning_rate": 7.467847773019928e-06, + "loss": 0.7028, + "step": 7219 + }, + { + "epoch": 0.5941164369471302, + "grad_norm": 0.40301179291497236, + "learning_rate": 7.465269422287769e-06, + "loss": 0.4615, + "step": 7220 + }, + { + "epoch": 0.5941987245422753, + "grad_norm": 2.402266106700467, + "learning_rate": 7.462691251618812e-06, + "loss": 0.7331, + "step": 7221 + }, + { + "epoch": 0.5942810121374202, + "grad_norm": 2.8853405704344874, + "learning_rate": 7.460113261196215e-06, + "loss": 0.742, + "step": 7222 + }, + { + "epoch": 0.5943632997325653, + "grad_norm": 2.2520595279482705, + "learning_rate": 7.457535451203108e-06, + "loss": 0.7176, + "step": 7223 + }, + { + "epoch": 0.5944455873277104, + "grad_norm": 2.6052353119134084, + "learning_rate": 7.454957821822617e-06, + "loss": 0.7197, + "step": 7224 + }, + { + "epoch": 0.5945278749228554, + "grad_norm": 2.455824688723407, + "learning_rate": 7.452380373237852e-06, + "loss": 0.7432, + "step": 7225 + }, + { + "epoch": 0.5946101625180004, + "grad_norm": 2.2341532922328042, + "learning_rate": 7.449803105631912e-06, + "loss": 0.7284, + "step": 7226 + }, + { + "epoch": 0.5946924501131454, + "grad_norm": 2.4685171556920116, + "learning_rate": 7.447226019187878e-06, + "loss": 0.7346, + "step": 7227 + }, + { + "epoch": 0.5947747377082905, + "grad_norm": 3.6066180780512185, + "learning_rate": 7.44464911408883e-06, + "loss": 0.7097, + "step": 7228 + }, + { + "epoch": 0.5948570253034355, + "grad_norm": 3.939182958528144, + "learning_rate": 7.442072390517816e-06, + "loss": 0.7318, + "step": 7229 + }, + { + "epoch": 0.5949393128985805, + "grad_norm": 3.0199486186140017, + "learning_rate": 7.439495848657894e-06, + "loss": 0.7388, + "step": 7230 + }, + { + "epoch": 0.5950216004937255, + "grad_norm": 2.4373741598473426, + "learning_rate": 7.436919488692089e-06, + "loss": 0.7394, + "step": 7231 + }, + { + "epoch": 0.5951038880888706, + "grad_norm": 4.592259980313678, + "learning_rate": 7.434343310803427e-06, + "loss": 0.7362, + "step": 7232 + }, + { + "epoch": 0.5951861756840157, + "grad_norm": 2.4325398638951685, + "learning_rate": 7.431767315174908e-06, + "loss": 0.7479, + "step": 7233 + }, + { + "epoch": 0.5952684632791607, + "grad_norm": 3.11218513722077, + "learning_rate": 7.429191501989539e-06, + "loss": 0.7599, + "step": 7234 + }, + { + "epoch": 0.5953507508743057, + "grad_norm": 3.11969115728696, + "learning_rate": 7.4266158714302875e-06, + "loss": 0.7515, + "step": 7235 + }, + { + "epoch": 0.5954330384694507, + "grad_norm": 3.000067249735535, + "learning_rate": 7.424040423680132e-06, + "loss": 0.7169, + "step": 7236 + }, + { + "epoch": 0.5955153260645958, + "grad_norm": 5.144705739413185, + "learning_rate": 7.421465158922025e-06, + "loss": 0.7301, + "step": 7237 + }, + { + "epoch": 0.5955976136597408, + "grad_norm": 2.4544494314390506, + "learning_rate": 7.418890077338912e-06, + "loss": 0.6871, + "step": 7238 + }, + { + "epoch": 0.5956799012548858, + "grad_norm": 2.4714025304179645, + "learning_rate": 7.416315179113713e-06, + "loss": 0.7095, + "step": 7239 + }, + { + "epoch": 0.5957621888500308, + "grad_norm": 2.8229304800864927, + "learning_rate": 7.413740464429359e-06, + "loss": 0.7279, + "step": 7240 + }, + { + "epoch": 0.5958444764451759, + "grad_norm": 3.8679488786600014, + "learning_rate": 7.4111659334687415e-06, + "loss": 0.7676, + "step": 7241 + }, + { + "epoch": 0.595926764040321, + "grad_norm": 0.4102645031170378, + "learning_rate": 7.408591586414759e-06, + "loss": 0.4995, + "step": 7242 + }, + { + "epoch": 0.5960090516354659, + "grad_norm": 0.40777539324932377, + "learning_rate": 7.4060174234502845e-06, + "loss": 0.486, + "step": 7243 + }, + { + "epoch": 0.596091339230611, + "grad_norm": 3.045020664371829, + "learning_rate": 7.403443444758185e-06, + "loss": 0.73, + "step": 7244 + }, + { + "epoch": 0.596173626825756, + "grad_norm": 2.6882466692581435, + "learning_rate": 7.4008696505213085e-06, + "loss": 0.7306, + "step": 7245 + }, + { + "epoch": 0.5962559144209011, + "grad_norm": 2.756979420000623, + "learning_rate": 7.398296040922497e-06, + "loss": 0.7366, + "step": 7246 + }, + { + "epoch": 0.596338202016046, + "grad_norm": 2.624733657572914, + "learning_rate": 7.395722616144571e-06, + "loss": 0.7212, + "step": 7247 + }, + { + "epoch": 0.5964204896111911, + "grad_norm": 0.4115413113822751, + "learning_rate": 7.393149376370349e-06, + "loss": 0.4845, + "step": 7248 + }, + { + "epoch": 0.5965027772063362, + "grad_norm": 2.1300673270346686, + "learning_rate": 7.390576321782623e-06, + "loss": 0.7567, + "step": 7249 + }, + { + "epoch": 0.5965850648014812, + "grad_norm": 2.571491393018002, + "learning_rate": 7.388003452564186e-06, + "loss": 0.712, + "step": 7250 + }, + { + "epoch": 0.5966673523966262, + "grad_norm": 2.5339218997242594, + "learning_rate": 7.3854307688978035e-06, + "loss": 0.7052, + "step": 7251 + }, + { + "epoch": 0.5967496399917712, + "grad_norm": 2.655138548546828, + "learning_rate": 7.38285827096624e-06, + "loss": 0.7425, + "step": 7252 + }, + { + "epoch": 0.5968319275869163, + "grad_norm": 2.6336494906823917, + "learning_rate": 7.380285958952236e-06, + "loss": 0.7511, + "step": 7253 + }, + { + "epoch": 0.5969142151820613, + "grad_norm": 2.335776608467133, + "learning_rate": 7.377713833038531e-06, + "loss": 0.7473, + "step": 7254 + }, + { + "epoch": 0.5969965027772063, + "grad_norm": 2.937109495401576, + "learning_rate": 7.37514189340784e-06, + "loss": 0.7394, + "step": 7255 + }, + { + "epoch": 0.5970787903723513, + "grad_norm": 2.6885699619524734, + "learning_rate": 7.3725701402428715e-06, + "loss": 0.7064, + "step": 7256 + }, + { + "epoch": 0.5971610779674964, + "grad_norm": 0.40915314729092145, + "learning_rate": 7.369998573726318e-06, + "loss": 0.4808, + "step": 7257 + }, + { + "epoch": 0.5972433655626415, + "grad_norm": 1.943782303516882, + "learning_rate": 7.367427194040862e-06, + "loss": 0.7222, + "step": 7258 + }, + { + "epoch": 0.5973256531577865, + "grad_norm": 2.558163046650503, + "learning_rate": 7.364856001369163e-06, + "loss": 0.7452, + "step": 7259 + }, + { + "epoch": 0.5974079407529315, + "grad_norm": 2.3512810252290492, + "learning_rate": 7.362284995893884e-06, + "loss": 0.7137, + "step": 7260 + }, + { + "epoch": 0.5974902283480765, + "grad_norm": 2.491785775535029, + "learning_rate": 7.359714177797658e-06, + "loss": 0.7162, + "step": 7261 + }, + { + "epoch": 0.5975725159432216, + "grad_norm": 0.43750585983960555, + "learning_rate": 7.357143547263117e-06, + "loss": 0.5011, + "step": 7262 + }, + { + "epoch": 0.5976548035383666, + "grad_norm": 2.606429559241179, + "learning_rate": 7.3545731044728685e-06, + "loss": 0.7262, + "step": 7263 + }, + { + "epoch": 0.5977370911335116, + "grad_norm": 0.40648627247037356, + "learning_rate": 7.352002849609519e-06, + "loss": 0.5004, + "step": 7264 + }, + { + "epoch": 0.5978193787286566, + "grad_norm": 2.414145841884392, + "learning_rate": 7.349432782855648e-06, + "loss": 0.7458, + "step": 7265 + }, + { + "epoch": 0.5979016663238017, + "grad_norm": 2.9167343386763407, + "learning_rate": 7.346862904393839e-06, + "loss": 0.7364, + "step": 7266 + }, + { + "epoch": 0.5979839539189468, + "grad_norm": 2.3523260335326652, + "learning_rate": 7.34429321440664e-06, + "loss": 0.7484, + "step": 7267 + }, + { + "epoch": 0.5980662415140917, + "grad_norm": 2.031350100479241, + "learning_rate": 7.341723713076608e-06, + "loss": 0.7278, + "step": 7268 + }, + { + "epoch": 0.5981485291092368, + "grad_norm": 2.11180804919559, + "learning_rate": 7.33915440058627e-06, + "loss": 0.7247, + "step": 7269 + }, + { + "epoch": 0.5982308167043818, + "grad_norm": 0.4453953567290814, + "learning_rate": 7.336585277118151e-06, + "loss": 0.4961, + "step": 7270 + }, + { + "epoch": 0.5983131042995269, + "grad_norm": 2.821640659264986, + "learning_rate": 7.334016342854748e-06, + "loss": 0.7506, + "step": 7271 + }, + { + "epoch": 0.5983953918946718, + "grad_norm": 2.145943080229115, + "learning_rate": 7.331447597978569e-06, + "loss": 0.7241, + "step": 7272 + }, + { + "epoch": 0.5984776794898169, + "grad_norm": 2.564413535823018, + "learning_rate": 7.328879042672078e-06, + "loss": 0.7119, + "step": 7273 + }, + { + "epoch": 0.598559967084962, + "grad_norm": 2.281819583894213, + "learning_rate": 7.326310677117751e-06, + "loss": 0.7252, + "step": 7274 + }, + { + "epoch": 0.598642254680107, + "grad_norm": 2.3503384954788697, + "learning_rate": 7.3237425014980365e-06, + "loss": 0.7098, + "step": 7275 + }, + { + "epoch": 0.598724542275252, + "grad_norm": 10.559614315227956, + "learning_rate": 7.3211745159953775e-06, + "loss": 0.7243, + "step": 7276 + }, + { + "epoch": 0.598806829870397, + "grad_norm": 2.550075188540764, + "learning_rate": 7.318606720792194e-06, + "loss": 0.7181, + "step": 7277 + }, + { + "epoch": 0.5988891174655421, + "grad_norm": 3.7692530485711555, + "learning_rate": 7.316039116070903e-06, + "loss": 0.7003, + "step": 7278 + }, + { + "epoch": 0.5989714050606871, + "grad_norm": 2.8480958033548225, + "learning_rate": 7.313471702013897e-06, + "loss": 0.7067, + "step": 7279 + }, + { + "epoch": 0.5990536926558321, + "grad_norm": 2.413549561666904, + "learning_rate": 7.3109044788035676e-06, + "loss": 0.7358, + "step": 7280 + }, + { + "epoch": 0.5991359802509771, + "grad_norm": 0.43106803529237925, + "learning_rate": 7.3083374466222826e-06, + "loss": 0.4978, + "step": 7281 + }, + { + "epoch": 0.5992182678461222, + "grad_norm": 2.255324005127599, + "learning_rate": 7.305770605652402e-06, + "loss": 0.7225, + "step": 7282 + }, + { + "epoch": 0.5993005554412673, + "grad_norm": 3.700026077912763, + "learning_rate": 7.303203956076267e-06, + "loss": 0.7866, + "step": 7283 + }, + { + "epoch": 0.5993828430364123, + "grad_norm": 2.3500705931203707, + "learning_rate": 7.300637498076213e-06, + "loss": 0.7357, + "step": 7284 + }, + { + "epoch": 0.5994651306315573, + "grad_norm": 2.377343711337513, + "learning_rate": 7.298071231834549e-06, + "loss": 0.7408, + "step": 7285 + }, + { + "epoch": 0.5995474182267023, + "grad_norm": 2.144299985473729, + "learning_rate": 7.295505157533587e-06, + "loss": 0.7392, + "step": 7286 + }, + { + "epoch": 0.5996297058218474, + "grad_norm": 3.2604246400601853, + "learning_rate": 7.292939275355613e-06, + "loss": 0.7682, + "step": 7287 + }, + { + "epoch": 0.5997119934169924, + "grad_norm": 2.253615332252266, + "learning_rate": 7.290373585482905e-06, + "loss": 0.7045, + "step": 7288 + }, + { + "epoch": 0.5997942810121374, + "grad_norm": 2.1860889236185392, + "learning_rate": 7.287808088097722e-06, + "loss": 0.7232, + "step": 7289 + }, + { + "epoch": 0.5998765686072824, + "grad_norm": 2.6694013588593806, + "learning_rate": 7.285242783382317e-06, + "loss": 0.7714, + "step": 7290 + }, + { + "epoch": 0.5999588562024275, + "grad_norm": 0.41603426328787846, + "learning_rate": 7.282677671518918e-06, + "loss": 0.4826, + "step": 7291 + }, + { + "epoch": 0.6000411437975726, + "grad_norm": 0.4175632891690942, + "learning_rate": 7.280112752689756e-06, + "loss": 0.4774, + "step": 7292 + }, + { + "epoch": 0.6001234313927175, + "grad_norm": 2.540561447708401, + "learning_rate": 7.2775480270770334e-06, + "loss": 0.7255, + "step": 7293 + }, + { + "epoch": 0.6002057189878626, + "grad_norm": 4.623262872355257, + "learning_rate": 7.2749834948629464e-06, + "loss": 0.6859, + "step": 7294 + }, + { + "epoch": 0.6002880065830076, + "grad_norm": 2.2834848992096752, + "learning_rate": 7.272419156229672e-06, + "loss": 0.7318, + "step": 7295 + }, + { + "epoch": 0.6003702941781527, + "grad_norm": 3.0004243639041333, + "learning_rate": 7.269855011359382e-06, + "loss": 0.726, + "step": 7296 + }, + { + "epoch": 0.6004525817732976, + "grad_norm": 2.3230915646317825, + "learning_rate": 7.267291060434219e-06, + "loss": 0.7384, + "step": 7297 + }, + { + "epoch": 0.6005348693684427, + "grad_norm": 2.7255326853521544, + "learning_rate": 7.264727303636337e-06, + "loss": 0.7458, + "step": 7298 + }, + { + "epoch": 0.6006171569635878, + "grad_norm": 0.44486168188897335, + "learning_rate": 7.2621637411478456e-06, + "loss": 0.5278, + "step": 7299 + }, + { + "epoch": 0.6006994445587328, + "grad_norm": 2.4474335574233126, + "learning_rate": 7.259600373150867e-06, + "loss": 0.7142, + "step": 7300 + }, + { + "epoch": 0.6007817321538778, + "grad_norm": 3.587242633609106, + "learning_rate": 7.257037199827494e-06, + "loss": 0.7266, + "step": 7301 + }, + { + "epoch": 0.6008640197490228, + "grad_norm": 2.2449035308402148, + "learning_rate": 7.254474221359813e-06, + "loss": 0.7099, + "step": 7302 + }, + { + "epoch": 0.6009463073441679, + "grad_norm": 2.7027505064012782, + "learning_rate": 7.2519114379298896e-06, + "loss": 0.7182, + "step": 7303 + }, + { + "epoch": 0.6010285949393129, + "grad_norm": 2.8382320194443795, + "learning_rate": 7.249348849719788e-06, + "loss": 0.7454, + "step": 7304 + }, + { + "epoch": 0.6011108825344579, + "grad_norm": 2.0990248354353622, + "learning_rate": 7.24678645691154e-06, + "loss": 0.7545, + "step": 7305 + }, + { + "epoch": 0.6011931701296029, + "grad_norm": 2.152614301987953, + "learning_rate": 7.244224259687183e-06, + "loss": 0.7035, + "step": 7306 + }, + { + "epoch": 0.601275457724748, + "grad_norm": 2.3029624893081904, + "learning_rate": 7.241662258228723e-06, + "loss": 0.7458, + "step": 7307 + }, + { + "epoch": 0.6013577453198931, + "grad_norm": 2.486995832403457, + "learning_rate": 7.23910045271817e-06, + "loss": 0.7289, + "step": 7308 + }, + { + "epoch": 0.601440032915038, + "grad_norm": 2.4962337239553833, + "learning_rate": 7.236538843337499e-06, + "loss": 0.7294, + "step": 7309 + }, + { + "epoch": 0.6015223205101831, + "grad_norm": 2.2170188260718633, + "learning_rate": 7.233977430268697e-06, + "loss": 0.7348, + "step": 7310 + }, + { + "epoch": 0.6016046081053281, + "grad_norm": 0.43132439561004293, + "learning_rate": 7.2314162136937075e-06, + "loss": 0.5157, + "step": 7311 + }, + { + "epoch": 0.6016868957004732, + "grad_norm": 2.298236133875319, + "learning_rate": 7.228855193794486e-06, + "loss": 0.7317, + "step": 7312 + }, + { + "epoch": 0.6017691832956182, + "grad_norm": 2.3164304498229704, + "learning_rate": 7.22629437075296e-06, + "loss": 0.7028, + "step": 7313 + }, + { + "epoch": 0.6018514708907632, + "grad_norm": 2.5818243305294075, + "learning_rate": 7.223733744751047e-06, + "loss": 0.7173, + "step": 7314 + }, + { + "epoch": 0.6019337584859082, + "grad_norm": 0.43401419227516147, + "learning_rate": 7.221173315970646e-06, + "loss": 0.4898, + "step": 7315 + }, + { + "epoch": 0.6020160460810533, + "grad_norm": 4.412165444220676, + "learning_rate": 7.218613084593651e-06, + "loss": 0.7519, + "step": 7316 + }, + { + "epoch": 0.6020983336761984, + "grad_norm": 2.6860122737989762, + "learning_rate": 7.21605305080193e-06, + "loss": 0.7382, + "step": 7317 + }, + { + "epoch": 0.6021806212713433, + "grad_norm": 1.9707582490392606, + "learning_rate": 7.2134932147773516e-06, + "loss": 0.7526, + "step": 7318 + }, + { + "epoch": 0.6022629088664884, + "grad_norm": 2.631185015100694, + "learning_rate": 7.210933576701757e-06, + "loss": 0.7576, + "step": 7319 + }, + { + "epoch": 0.6023451964616334, + "grad_norm": 0.40993230365120537, + "learning_rate": 7.208374136756984e-06, + "loss": 0.4948, + "step": 7320 + }, + { + "epoch": 0.6024274840567785, + "grad_norm": 2.5113764340100233, + "learning_rate": 7.205814895124845e-06, + "loss": 0.7069, + "step": 7321 + }, + { + "epoch": 0.6025097716519234, + "grad_norm": 2.4350857449403387, + "learning_rate": 7.203255851987145e-06, + "loss": 0.7178, + "step": 7322 + }, + { + "epoch": 0.6025920592470685, + "grad_norm": 2.609998803136941, + "learning_rate": 7.200697007525681e-06, + "loss": 0.7262, + "step": 7323 + }, + { + "epoch": 0.6026743468422135, + "grad_norm": 3.5151774449203557, + "learning_rate": 7.198138361922223e-06, + "loss": 0.7253, + "step": 7324 + }, + { + "epoch": 0.6027566344373586, + "grad_norm": 0.40249784371146474, + "learning_rate": 7.195579915358536e-06, + "loss": 0.4971, + "step": 7325 + }, + { + "epoch": 0.6028389220325036, + "grad_norm": 3.193435373974895, + "learning_rate": 7.1930216680163645e-06, + "loss": 0.7354, + "step": 7326 + }, + { + "epoch": 0.6029212096276486, + "grad_norm": 0.4222749565733095, + "learning_rate": 7.190463620077448e-06, + "loss": 0.4887, + "step": 7327 + }, + { + "epoch": 0.6030034972227937, + "grad_norm": 0.4205529982451343, + "learning_rate": 7.1879057717234975e-06, + "loss": 0.4938, + "step": 7328 + }, + { + "epoch": 0.6030857848179387, + "grad_norm": 3.073977632516191, + "learning_rate": 7.185348123136229e-06, + "loss": 0.7384, + "step": 7329 + }, + { + "epoch": 0.6031680724130837, + "grad_norm": 3.4729218516372136, + "learning_rate": 7.182790674497324e-06, + "loss": 0.7566, + "step": 7330 + }, + { + "epoch": 0.6032503600082287, + "grad_norm": 2.4237846062966253, + "learning_rate": 7.180233425988469e-06, + "loss": 0.7071, + "step": 7331 + }, + { + "epoch": 0.6033326476033738, + "grad_norm": 2.5704579195911696, + "learning_rate": 7.177676377791319e-06, + "loss": 0.7598, + "step": 7332 + }, + { + "epoch": 0.6034149351985189, + "grad_norm": 0.4041371125395736, + "learning_rate": 7.175119530087526e-06, + "loss": 0.4839, + "step": 7333 + }, + { + "epoch": 0.6034972227936638, + "grad_norm": 2.0413854792038353, + "learning_rate": 7.172562883058721e-06, + "loss": 0.7373, + "step": 7334 + }, + { + "epoch": 0.6035795103888089, + "grad_norm": 3.08562844566261, + "learning_rate": 7.170006436886531e-06, + "loss": 0.7553, + "step": 7335 + }, + { + "epoch": 0.6036617979839539, + "grad_norm": 1.9790687399061282, + "learning_rate": 7.167450191752556e-06, + "loss": 0.717, + "step": 7336 + }, + { + "epoch": 0.603744085579099, + "grad_norm": 2.631799891672532, + "learning_rate": 7.1648941478383905e-06, + "loss": 0.7044, + "step": 7337 + }, + { + "epoch": 0.603826373174244, + "grad_norm": 3.7709365263684638, + "learning_rate": 7.162338305325609e-06, + "loss": 0.7411, + "step": 7338 + }, + { + "epoch": 0.603908660769389, + "grad_norm": 2.696556878682047, + "learning_rate": 7.159782664395778e-06, + "loss": 0.7194, + "step": 7339 + }, + { + "epoch": 0.603990948364534, + "grad_norm": 3.5867781704351547, + "learning_rate": 7.157227225230441e-06, + "loss": 0.7222, + "step": 7340 + }, + { + "epoch": 0.6040732359596791, + "grad_norm": 3.0265862153546386, + "learning_rate": 7.154671988011139e-06, + "loss": 0.7107, + "step": 7341 + }, + { + "epoch": 0.6041555235548242, + "grad_norm": 2.0932391565789397, + "learning_rate": 7.1521169529193875e-06, + "loss": 0.7192, + "step": 7342 + }, + { + "epoch": 0.6042378111499691, + "grad_norm": 2.162889311123922, + "learning_rate": 7.149562120136694e-06, + "loss": 0.7496, + "step": 7343 + }, + { + "epoch": 0.6043200987451142, + "grad_norm": 0.4049358216525677, + "learning_rate": 7.147007489844548e-06, + "loss": 0.4753, + "step": 7344 + }, + { + "epoch": 0.6044023863402592, + "grad_norm": 2.6689755363543464, + "learning_rate": 7.144453062224428e-06, + "loss": 0.7185, + "step": 7345 + }, + { + "epoch": 0.6044846739354043, + "grad_norm": 4.183607620554075, + "learning_rate": 7.141898837457793e-06, + "loss": 0.7118, + "step": 7346 + }, + { + "epoch": 0.6045669615305492, + "grad_norm": 2.6201705335185554, + "learning_rate": 7.139344815726098e-06, + "loss": 0.7337, + "step": 7347 + }, + { + "epoch": 0.6046492491256943, + "grad_norm": 0.40306774118717165, + "learning_rate": 7.1367909972107675e-06, + "loss": 0.4906, + "step": 7348 + }, + { + "epoch": 0.6047315367208393, + "grad_norm": 2.578690596669481, + "learning_rate": 7.134237382093228e-06, + "loss": 0.7176, + "step": 7349 + }, + { + "epoch": 0.6048138243159844, + "grad_norm": 2.412320530036613, + "learning_rate": 7.131683970554881e-06, + "loss": 0.7438, + "step": 7350 + }, + { + "epoch": 0.6048961119111294, + "grad_norm": 2.619648573391517, + "learning_rate": 7.129130762777121e-06, + "loss": 0.7342, + "step": 7351 + }, + { + "epoch": 0.6049783995062744, + "grad_norm": 2.7609304826411667, + "learning_rate": 7.126577758941314e-06, + "loss": 0.7493, + "step": 7352 + }, + { + "epoch": 0.6050606871014195, + "grad_norm": 2.9315003522653256, + "learning_rate": 7.124024959228835e-06, + "loss": 0.7493, + "step": 7353 + }, + { + "epoch": 0.6051429746965645, + "grad_norm": 2.691666683251502, + "learning_rate": 7.121472363821017e-06, + "loss": 0.7186, + "step": 7354 + }, + { + "epoch": 0.6052252622917095, + "grad_norm": 6.430529308893544, + "learning_rate": 7.118919972899203e-06, + "loss": 0.7269, + "step": 7355 + }, + { + "epoch": 0.6053075498868545, + "grad_norm": 2.6698104081822036, + "learning_rate": 7.116367786644705e-06, + "loss": 0.7415, + "step": 7356 + }, + { + "epoch": 0.6053898374819996, + "grad_norm": 2.1984247116394533, + "learning_rate": 7.113815805238829e-06, + "loss": 0.7377, + "step": 7357 + }, + { + "epoch": 0.6054721250771447, + "grad_norm": 2.586567968753607, + "learning_rate": 7.1112640288628584e-06, + "loss": 0.7509, + "step": 7358 + }, + { + "epoch": 0.6055544126722896, + "grad_norm": 2.647623546612699, + "learning_rate": 7.108712457698077e-06, + "loss": 0.747, + "step": 7359 + }, + { + "epoch": 0.6056367002674347, + "grad_norm": 2.413562918264829, + "learning_rate": 7.106161091925732e-06, + "loss": 0.7322, + "step": 7360 + }, + { + "epoch": 0.6057189878625797, + "grad_norm": 2.2313319675093495, + "learning_rate": 7.10360993172708e-06, + "loss": 0.7284, + "step": 7361 + }, + { + "epoch": 0.6058012754577248, + "grad_norm": 2.259390451246755, + "learning_rate": 7.101058977283345e-06, + "loss": 0.7326, + "step": 7362 + }, + { + "epoch": 0.6058835630528698, + "grad_norm": 2.613348259455992, + "learning_rate": 7.0985082287757455e-06, + "loss": 0.6871, + "step": 7363 + }, + { + "epoch": 0.6059658506480148, + "grad_norm": 2.5776494677359456, + "learning_rate": 7.095957686385479e-06, + "loss": 0.749, + "step": 7364 + }, + { + "epoch": 0.6060481382431598, + "grad_norm": 2.6423089010111416, + "learning_rate": 7.093407350293737e-06, + "loss": 0.7253, + "step": 7365 + }, + { + "epoch": 0.6061304258383049, + "grad_norm": 2.742139190347056, + "learning_rate": 7.090857220681684e-06, + "loss": 0.7114, + "step": 7366 + }, + { + "epoch": 0.60621271343345, + "grad_norm": 0.43111155738704054, + "learning_rate": 7.088307297730485e-06, + "loss": 0.5118, + "step": 7367 + }, + { + "epoch": 0.6062950010285949, + "grad_norm": 3.141391465034662, + "learning_rate": 7.085757581621278e-06, + "loss": 0.7347, + "step": 7368 + }, + { + "epoch": 0.60637728862374, + "grad_norm": 2.341587869585441, + "learning_rate": 7.0832080725351925e-06, + "loss": 0.7467, + "step": 7369 + }, + { + "epoch": 0.606459576218885, + "grad_norm": 0.43481336133119575, + "learning_rate": 7.080658770653339e-06, + "loss": 0.4917, + "step": 7370 + }, + { + "epoch": 0.6065418638140301, + "grad_norm": 1.9918811093763835, + "learning_rate": 7.078109676156819e-06, + "loss": 0.7548, + "step": 7371 + }, + { + "epoch": 0.606624151409175, + "grad_norm": 2.2450143292864455, + "learning_rate": 7.075560789226712e-06, + "loss": 0.671, + "step": 7372 + }, + { + "epoch": 0.6067064390043201, + "grad_norm": 2.5012611683145147, + "learning_rate": 7.073012110044094e-06, + "loss": 0.7527, + "step": 7373 + }, + { + "epoch": 0.6067887265994651, + "grad_norm": 1.9872684269181449, + "learning_rate": 7.0704636387900115e-06, + "loss": 0.7022, + "step": 7374 + }, + { + "epoch": 0.6068710141946102, + "grad_norm": 2.798062944400925, + "learning_rate": 7.0679153756455095e-06, + "loss": 0.7394, + "step": 7375 + }, + { + "epoch": 0.6069533017897552, + "grad_norm": 2.0618412701262585, + "learning_rate": 7.065367320791608e-06, + "loss": 0.7055, + "step": 7376 + }, + { + "epoch": 0.6070355893849002, + "grad_norm": 3.9316271974803247, + "learning_rate": 7.0628194744093216e-06, + "loss": 0.7135, + "step": 7377 + }, + { + "epoch": 0.6071178769800453, + "grad_norm": 2.417424870352575, + "learning_rate": 7.060271836679637e-06, + "loss": 0.6929, + "step": 7378 + }, + { + "epoch": 0.6072001645751903, + "grad_norm": 2.4711130859366333, + "learning_rate": 7.0577244077835465e-06, + "loss": 0.738, + "step": 7379 + }, + { + "epoch": 0.6072824521703353, + "grad_norm": 0.421236102344599, + "learning_rate": 7.055177187902002e-06, + "loss": 0.4812, + "step": 7380 + }, + { + "epoch": 0.6073647397654803, + "grad_norm": 1.933865809855915, + "learning_rate": 7.052630177215965e-06, + "loss": 0.7292, + "step": 7381 + }, + { + "epoch": 0.6074470273606254, + "grad_norm": 0.4245665867640909, + "learning_rate": 7.050083375906365e-06, + "loss": 0.5125, + "step": 7382 + }, + { + "epoch": 0.6075293149557704, + "grad_norm": 2.6581659496660666, + "learning_rate": 7.047536784154126e-06, + "loss": 0.7065, + "step": 7383 + }, + { + "epoch": 0.6076116025509154, + "grad_norm": 2.8298553033788116, + "learning_rate": 7.044990402140148e-06, + "loss": 0.7374, + "step": 7384 + }, + { + "epoch": 0.6076938901460605, + "grad_norm": 0.4315652106749901, + "learning_rate": 7.042444230045332e-06, + "loss": 0.4764, + "step": 7385 + }, + { + "epoch": 0.6077761777412055, + "grad_norm": 2.1978224744925376, + "learning_rate": 7.039898268050543e-06, + "loss": 0.7448, + "step": 7386 + }, + { + "epoch": 0.6078584653363506, + "grad_norm": 0.40667782540548336, + "learning_rate": 7.037352516336649e-06, + "loss": 0.4955, + "step": 7387 + }, + { + "epoch": 0.6079407529314956, + "grad_norm": 2.586485251615531, + "learning_rate": 7.034806975084493e-06, + "loss": 0.7156, + "step": 7388 + }, + { + "epoch": 0.6080230405266406, + "grad_norm": 2.54138721007884, + "learning_rate": 7.032261644474909e-06, + "loss": 0.7202, + "step": 7389 + }, + { + "epoch": 0.6081053281217856, + "grad_norm": 2.260097925976659, + "learning_rate": 7.029716524688708e-06, + "loss": 0.7364, + "step": 7390 + }, + { + "epoch": 0.6081876157169307, + "grad_norm": 2.6615225834017098, + "learning_rate": 7.027171615906702e-06, + "loss": 0.761, + "step": 7391 + }, + { + "epoch": 0.6082699033120758, + "grad_norm": 0.42108515991730083, + "learning_rate": 7.024626918309663e-06, + "loss": 0.4892, + "step": 7392 + }, + { + "epoch": 0.6083521909072207, + "grad_norm": 2.406912014379565, + "learning_rate": 7.0220824320783724e-06, + "loss": 0.7506, + "step": 7393 + }, + { + "epoch": 0.6084344785023658, + "grad_norm": 5.54877018667168, + "learning_rate": 7.019538157393583e-06, + "loss": 0.7313, + "step": 7394 + }, + { + "epoch": 0.6085167660975108, + "grad_norm": 2.8589251605030084, + "learning_rate": 7.016994094436037e-06, + "loss": 0.7089, + "step": 7395 + }, + { + "epoch": 0.6085990536926559, + "grad_norm": 2.5028693208113606, + "learning_rate": 7.014450243386459e-06, + "loss": 0.7503, + "step": 7396 + }, + { + "epoch": 0.6086813412878008, + "grad_norm": 0.4239888009600039, + "learning_rate": 7.0119066044255625e-06, + "loss": 0.4963, + "step": 7397 + }, + { + "epoch": 0.6087636288829459, + "grad_norm": 2.3161630929648385, + "learning_rate": 7.009363177734038e-06, + "loss": 0.7296, + "step": 7398 + }, + { + "epoch": 0.6088459164780909, + "grad_norm": 2.715868030176399, + "learning_rate": 7.006819963492575e-06, + "loss": 0.7146, + "step": 7399 + }, + { + "epoch": 0.608928204073236, + "grad_norm": 2.7382613649493845, + "learning_rate": 7.004276961881833e-06, + "loss": 0.7177, + "step": 7400 + }, + { + "epoch": 0.609010491668381, + "grad_norm": 2.6313746144784123, + "learning_rate": 7.001734173082466e-06, + "loss": 0.7361, + "step": 7401 + }, + { + "epoch": 0.609092779263526, + "grad_norm": 2.200696575443853, + "learning_rate": 6.999191597275106e-06, + "loss": 0.7234, + "step": 7402 + }, + { + "epoch": 0.6091750668586711, + "grad_norm": 2.1701870385395177, + "learning_rate": 6.996649234640378e-06, + "loss": 0.7059, + "step": 7403 + }, + { + "epoch": 0.6092573544538161, + "grad_norm": 3.2396994728157678, + "learning_rate": 6.994107085358882e-06, + "loss": 0.7464, + "step": 7404 + }, + { + "epoch": 0.6093396420489611, + "grad_norm": 1.9841986756218277, + "learning_rate": 6.9915651496112145e-06, + "loss": 0.7737, + "step": 7405 + }, + { + "epoch": 0.6094219296441061, + "grad_norm": 2.5755249526085278, + "learning_rate": 6.9890234275779465e-06, + "loss": 0.73, + "step": 7406 + }, + { + "epoch": 0.6095042172392512, + "grad_norm": 2.9556483486814944, + "learning_rate": 6.986481919439641e-06, + "loss": 0.7314, + "step": 7407 + }, + { + "epoch": 0.6095865048343962, + "grad_norm": 3.9097143341907574, + "learning_rate": 6.983940625376837e-06, + "loss": 0.7557, + "step": 7408 + }, + { + "epoch": 0.6096687924295412, + "grad_norm": 4.410891119429949, + "learning_rate": 6.981399545570071e-06, + "loss": 0.7473, + "step": 7409 + }, + { + "epoch": 0.6097510800246863, + "grad_norm": 2.514714515149069, + "learning_rate": 6.978858680199849e-06, + "loss": 0.7186, + "step": 7410 + }, + { + "epoch": 0.6098333676198313, + "grad_norm": 2.79918429783107, + "learning_rate": 6.976318029446679e-06, + "loss": 0.7322, + "step": 7411 + }, + { + "epoch": 0.6099156552149764, + "grad_norm": 2.2824737711716963, + "learning_rate": 6.973777593491038e-06, + "loss": 0.7624, + "step": 7412 + }, + { + "epoch": 0.6099979428101214, + "grad_norm": 0.42408956698447237, + "learning_rate": 6.9712373725134e-06, + "loss": 0.5011, + "step": 7413 + }, + { + "epoch": 0.6100802304052664, + "grad_norm": 2.078226906698426, + "learning_rate": 6.968697366694213e-06, + "loss": 0.7449, + "step": 7414 + }, + { + "epoch": 0.6101625180004114, + "grad_norm": 2.8807900638111814, + "learning_rate": 6.966157576213919e-06, + "loss": 0.7187, + "step": 7415 + }, + { + "epoch": 0.6102448055955565, + "grad_norm": 2.312451511359597, + "learning_rate": 6.963618001252934e-06, + "loss": 0.73, + "step": 7416 + }, + { + "epoch": 0.6103270931907016, + "grad_norm": 1.952416536754663, + "learning_rate": 6.961078641991678e-06, + "loss": 0.7101, + "step": 7417 + }, + { + "epoch": 0.6104093807858465, + "grad_norm": 2.373853373663527, + "learning_rate": 6.958539498610528e-06, + "loss": 0.7185, + "step": 7418 + }, + { + "epoch": 0.6104916683809916, + "grad_norm": 2.5407736903037077, + "learning_rate": 6.956000571289873e-06, + "loss": 0.7078, + "step": 7419 + }, + { + "epoch": 0.6105739559761366, + "grad_norm": 0.40161179868816405, + "learning_rate": 6.9534618602100664e-06, + "loss": 0.4697, + "step": 7420 + }, + { + "epoch": 0.6106562435712817, + "grad_norm": 2.151708811698962, + "learning_rate": 6.950923365551459e-06, + "loss": 0.7315, + "step": 7421 + }, + { + "epoch": 0.6107385311664266, + "grad_norm": 0.4220444206222698, + "learning_rate": 6.948385087494376e-06, + "loss": 0.4693, + "step": 7422 + }, + { + "epoch": 0.6108208187615717, + "grad_norm": 2.8526718889073575, + "learning_rate": 6.945847026219142e-06, + "loss": 0.7181, + "step": 7423 + }, + { + "epoch": 0.6109031063567167, + "grad_norm": 0.43053883657791486, + "learning_rate": 6.9433091819060434e-06, + "loss": 0.4719, + "step": 7424 + }, + { + "epoch": 0.6109853939518618, + "grad_norm": 4.277567505012875, + "learning_rate": 6.9407715547353775e-06, + "loss": 0.7178, + "step": 7425 + }, + { + "epoch": 0.6110676815470067, + "grad_norm": 3.415293805700905, + "learning_rate": 6.938234144887406e-06, + "loss": 0.7396, + "step": 7426 + }, + { + "epoch": 0.6111499691421518, + "grad_norm": 2.97540173774179, + "learning_rate": 6.935696952542386e-06, + "loss": 0.7256, + "step": 7427 + }, + { + "epoch": 0.6112322567372969, + "grad_norm": 2.574939158878115, + "learning_rate": 6.93315997788055e-06, + "loss": 0.723, + "step": 7428 + }, + { + "epoch": 0.6113145443324419, + "grad_norm": 3.32632471756277, + "learning_rate": 6.930623221082129e-06, + "loss": 0.7296, + "step": 7429 + }, + { + "epoch": 0.6113968319275869, + "grad_norm": 2.8811170262228734, + "learning_rate": 6.928086682327319e-06, + "loss": 0.7616, + "step": 7430 + }, + { + "epoch": 0.6114791195227319, + "grad_norm": 0.4419602627730875, + "learning_rate": 6.9255503617963245e-06, + "loss": 0.4654, + "step": 7431 + }, + { + "epoch": 0.611561407117877, + "grad_norm": 2.317419512171074, + "learning_rate": 6.923014259669314e-06, + "loss": 0.7334, + "step": 7432 + }, + { + "epoch": 0.611643694713022, + "grad_norm": 2.885462181005391, + "learning_rate": 6.9204783761264506e-06, + "loss": 0.7186, + "step": 7433 + }, + { + "epoch": 0.611725982308167, + "grad_norm": 2.7984495130252465, + "learning_rate": 6.9179427113478755e-06, + "loss": 0.7161, + "step": 7434 + }, + { + "epoch": 0.611808269903312, + "grad_norm": 3.0636307529349667, + "learning_rate": 6.915407265513725e-06, + "loss": 0.7214, + "step": 7435 + }, + { + "epoch": 0.6118905574984571, + "grad_norm": 2.5267724275008256, + "learning_rate": 6.9128720388041035e-06, + "loss": 0.7211, + "step": 7436 + }, + { + "epoch": 0.6119728450936022, + "grad_norm": 2.682099543147998, + "learning_rate": 6.910337031399121e-06, + "loss": 0.7169, + "step": 7437 + }, + { + "epoch": 0.6120551326887471, + "grad_norm": 3.528492302829186, + "learning_rate": 6.907802243478851e-06, + "loss": 0.7337, + "step": 7438 + }, + { + "epoch": 0.6121374202838922, + "grad_norm": 2.829836861246372, + "learning_rate": 6.905267675223367e-06, + "loss": 0.7433, + "step": 7439 + }, + { + "epoch": 0.6122197078790372, + "grad_norm": 2.5915951853071504, + "learning_rate": 6.902733326812716e-06, + "loss": 0.7357, + "step": 7440 + }, + { + "epoch": 0.6123019954741823, + "grad_norm": 2.982529777903633, + "learning_rate": 6.900199198426938e-06, + "loss": 0.7193, + "step": 7441 + }, + { + "epoch": 0.6123842830693274, + "grad_norm": 0.4356563114819184, + "learning_rate": 6.897665290246048e-06, + "loss": 0.4913, + "step": 7442 + }, + { + "epoch": 0.6124665706644723, + "grad_norm": 2.4911110337244344, + "learning_rate": 6.895131602450058e-06, + "loss": 0.7182, + "step": 7443 + }, + { + "epoch": 0.6125488582596174, + "grad_norm": 0.4528358529106356, + "learning_rate": 6.892598135218952e-06, + "loss": 0.502, + "step": 7444 + }, + { + "epoch": 0.6126311458547624, + "grad_norm": 2.2205855774323067, + "learning_rate": 6.890064888732706e-06, + "loss": 0.7133, + "step": 7445 + }, + { + "epoch": 0.6127134334499075, + "grad_norm": 3.3786029588731044, + "learning_rate": 6.887531863171275e-06, + "loss": 0.7085, + "step": 7446 + }, + { + "epoch": 0.6127957210450524, + "grad_norm": 0.4325654260677364, + "learning_rate": 6.884999058714605e-06, + "loss": 0.4974, + "step": 7447 + }, + { + "epoch": 0.6128780086401975, + "grad_norm": 2.856356371909117, + "learning_rate": 6.882466475542616e-06, + "loss": 0.7436, + "step": 7448 + }, + { + "epoch": 0.6129602962353425, + "grad_norm": 2.241120713822887, + "learning_rate": 6.879934113835229e-06, + "loss": 0.7461, + "step": 7449 + }, + { + "epoch": 0.6130425838304876, + "grad_norm": 2.357513110622926, + "learning_rate": 6.877401973772326e-06, + "loss": 0.7528, + "step": 7450 + }, + { + "epoch": 0.6131248714256325, + "grad_norm": 2.341039880674718, + "learning_rate": 6.874870055533798e-06, + "loss": 0.7362, + "step": 7451 + }, + { + "epoch": 0.6132071590207776, + "grad_norm": 2.279851472562641, + "learning_rate": 6.872338359299502e-06, + "loss": 0.7438, + "step": 7452 + }, + { + "epoch": 0.6132894466159227, + "grad_norm": 2.3501628176436573, + "learning_rate": 6.869806885249287e-06, + "loss": 0.7581, + "step": 7453 + }, + { + "epoch": 0.6133717342110677, + "grad_norm": 0.42580612312169785, + "learning_rate": 6.867275633562982e-06, + "loss": 0.485, + "step": 7454 + }, + { + "epoch": 0.6134540218062127, + "grad_norm": 0.40325508950038896, + "learning_rate": 6.864744604420412e-06, + "loss": 0.4829, + "step": 7455 + }, + { + "epoch": 0.6135363094013577, + "grad_norm": 2.1273558493797147, + "learning_rate": 6.862213798001365e-06, + "loss": 0.7649, + "step": 7456 + }, + { + "epoch": 0.6136185969965028, + "grad_norm": 2.4589357222217068, + "learning_rate": 6.859683214485635e-06, + "loss": 0.7232, + "step": 7457 + }, + { + "epoch": 0.6137008845916478, + "grad_norm": 3.643892031771377, + "learning_rate": 6.857152854052987e-06, + "loss": 0.7469, + "step": 7458 + }, + { + "epoch": 0.6137831721867928, + "grad_norm": 3.231468062866289, + "learning_rate": 6.854622716883174e-06, + "loss": 0.7113, + "step": 7459 + }, + { + "epoch": 0.6138654597819379, + "grad_norm": 2.1487260624292226, + "learning_rate": 6.852092803155932e-06, + "loss": 0.7367, + "step": 7460 + }, + { + "epoch": 0.6139477473770829, + "grad_norm": 7.385789799508688, + "learning_rate": 6.8495631130509846e-06, + "loss": 0.7166, + "step": 7461 + }, + { + "epoch": 0.614030034972228, + "grad_norm": 0.4182037350821111, + "learning_rate": 6.847033646748031e-06, + "loss": 0.4968, + "step": 7462 + }, + { + "epoch": 0.6141123225673729, + "grad_norm": 2.17074393606963, + "learning_rate": 6.844504404426768e-06, + "loss": 0.7198, + "step": 7463 + }, + { + "epoch": 0.614194610162518, + "grad_norm": 2.5548209315075963, + "learning_rate": 6.8419753862668655e-06, + "loss": 0.7397, + "step": 7464 + }, + { + "epoch": 0.614276897757663, + "grad_norm": 0.4506096435746833, + "learning_rate": 6.839446592447983e-06, + "loss": 0.4765, + "step": 7465 + }, + { + "epoch": 0.6143591853528081, + "grad_norm": 2.4084683984674817, + "learning_rate": 6.836918023149757e-06, + "loss": 0.7345, + "step": 7466 + }, + { + "epoch": 0.6144414729479531, + "grad_norm": 2.1505974050570766, + "learning_rate": 6.8343896785518184e-06, + "loss": 0.7279, + "step": 7467 + }, + { + "epoch": 0.6145237605430981, + "grad_norm": 0.4222512345859038, + "learning_rate": 6.83186155883377e-06, + "loss": 0.4867, + "step": 7468 + }, + { + "epoch": 0.6146060481382432, + "grad_norm": 0.4126045276825431, + "learning_rate": 6.829333664175214e-06, + "loss": 0.4836, + "step": 7469 + }, + { + "epoch": 0.6146883357333882, + "grad_norm": 2.4191286711697964, + "learning_rate": 6.826805994755721e-06, + "loss": 0.742, + "step": 7470 + }, + { + "epoch": 0.6147706233285333, + "grad_norm": 1.9798801530532248, + "learning_rate": 6.824278550754859e-06, + "loss": 0.7162, + "step": 7471 + }, + { + "epoch": 0.6148529109236782, + "grad_norm": 4.148193383140986, + "learning_rate": 6.821751332352166e-06, + "loss": 0.7561, + "step": 7472 + }, + { + "epoch": 0.6149351985188233, + "grad_norm": 2.32265620532926, + "learning_rate": 6.81922433972718e-06, + "loss": 0.7254, + "step": 7473 + }, + { + "epoch": 0.6150174861139683, + "grad_norm": 2.2530769420779206, + "learning_rate": 6.816697573059403e-06, + "loss": 0.7292, + "step": 7474 + }, + { + "epoch": 0.6150997737091134, + "grad_norm": 0.4090897786120325, + "learning_rate": 6.814171032528346e-06, + "loss": 0.4995, + "step": 7475 + }, + { + "epoch": 0.6151820613042583, + "grad_norm": 2.833813242407942, + "learning_rate": 6.811644718313482e-06, + "loss": 0.7177, + "step": 7476 + }, + { + "epoch": 0.6152643488994034, + "grad_norm": 2.200127531473764, + "learning_rate": 6.809118630594278e-06, + "loss": 0.7203, + "step": 7477 + }, + { + "epoch": 0.6153466364945485, + "grad_norm": 2.0172228036826114, + "learning_rate": 6.806592769550182e-06, + "loss": 0.7088, + "step": 7478 + }, + { + "epoch": 0.6154289240896935, + "grad_norm": 2.111159479312796, + "learning_rate": 6.8040671353606324e-06, + "loss": 0.7469, + "step": 7479 + }, + { + "epoch": 0.6155112116848385, + "grad_norm": 0.416667548463264, + "learning_rate": 6.801541728205037e-06, + "loss": 0.4859, + "step": 7480 + }, + { + "epoch": 0.6155934992799835, + "grad_norm": 1.7113086052431354, + "learning_rate": 6.7990165482628055e-06, + "loss": 0.7554, + "step": 7481 + }, + { + "epoch": 0.6156757868751286, + "grad_norm": 2.305122665110337, + "learning_rate": 6.796491595713317e-06, + "loss": 0.7181, + "step": 7482 + }, + { + "epoch": 0.6157580744702736, + "grad_norm": 2.778730758630924, + "learning_rate": 6.793966870735946e-06, + "loss": 0.7325, + "step": 7483 + }, + { + "epoch": 0.6158403620654186, + "grad_norm": 2.1948745480734764, + "learning_rate": 6.791442373510038e-06, + "loss": 0.7373, + "step": 7484 + }, + { + "epoch": 0.6159226496605636, + "grad_norm": 2.4360017909912215, + "learning_rate": 6.7889181042149346e-06, + "loss": 0.7563, + "step": 7485 + }, + { + "epoch": 0.6160049372557087, + "grad_norm": 2.0915858718941918, + "learning_rate": 6.78639406302995e-06, + "loss": 0.7111, + "step": 7486 + }, + { + "epoch": 0.6160872248508538, + "grad_norm": 6.1162835936604685, + "learning_rate": 6.783870250134398e-06, + "loss": 0.7079, + "step": 7487 + }, + { + "epoch": 0.6161695124459987, + "grad_norm": 3.2996214835867845, + "learning_rate": 6.781346665707554e-06, + "loss": 0.7159, + "step": 7488 + }, + { + "epoch": 0.6162518000411438, + "grad_norm": 2.2378812886273374, + "learning_rate": 6.778823309928698e-06, + "loss": 0.7161, + "step": 7489 + }, + { + "epoch": 0.6163340876362888, + "grad_norm": 2.362752002391396, + "learning_rate": 6.776300182977083e-06, + "loss": 0.7827, + "step": 7490 + }, + { + "epoch": 0.6164163752314339, + "grad_norm": 1.9572757513042374, + "learning_rate": 6.7737772850319475e-06, + "loss": 0.7414, + "step": 7491 + }, + { + "epoch": 0.616498662826579, + "grad_norm": 2.0748367511704315, + "learning_rate": 6.77125461627251e-06, + "loss": 0.6962, + "step": 7492 + }, + { + "epoch": 0.6165809504217239, + "grad_norm": 1.8852681033714078, + "learning_rate": 6.768732176877988e-06, + "loss": 0.7473, + "step": 7493 + }, + { + "epoch": 0.616663238016869, + "grad_norm": 2.7878516813726466, + "learning_rate": 6.766209967027557e-06, + "loss": 0.7145, + "step": 7494 + }, + { + "epoch": 0.616745525612014, + "grad_norm": 2.556082976668858, + "learning_rate": 6.763687986900405e-06, + "loss": 0.7239, + "step": 7495 + }, + { + "epoch": 0.6168278132071591, + "grad_norm": 1.9469384589450678, + "learning_rate": 6.761166236675678e-06, + "loss": 0.7142, + "step": 7496 + }, + { + "epoch": 0.616910100802304, + "grad_norm": 0.4167786737567454, + "learning_rate": 6.758644716532525e-06, + "loss": 0.4897, + "step": 7497 + }, + { + "epoch": 0.6169923883974491, + "grad_norm": 2.4798170170751197, + "learning_rate": 6.756123426650065e-06, + "loss": 0.717, + "step": 7498 + }, + { + "epoch": 0.6170746759925941, + "grad_norm": 2.6630517898977852, + "learning_rate": 6.753602367207411e-06, + "loss": 0.7152, + "step": 7499 + }, + { + "epoch": 0.6171569635877392, + "grad_norm": 2.4477436542331117, + "learning_rate": 6.75108153838365e-06, + "loss": 0.7305, + "step": 7500 + }, + { + "epoch": 0.6172392511828841, + "grad_norm": 0.4485559984125505, + "learning_rate": 6.748560940357863e-06, + "loss": 0.497, + "step": 7501 + }, + { + "epoch": 0.6173215387780292, + "grad_norm": 3.1360915111380434, + "learning_rate": 6.746040573309106e-06, + "loss": 0.7343, + "step": 7502 + }, + { + "epoch": 0.6174038263731743, + "grad_norm": 2.8233184525518467, + "learning_rate": 6.7435204374164245e-06, + "loss": 0.7244, + "step": 7503 + }, + { + "epoch": 0.6174861139683193, + "grad_norm": 0.4122337914921573, + "learning_rate": 6.741000532858842e-06, + "loss": 0.4742, + "step": 7504 + }, + { + "epoch": 0.6175684015634643, + "grad_norm": 0.4260070987487295, + "learning_rate": 6.738480859815371e-06, + "loss": 0.5042, + "step": 7505 + }, + { + "epoch": 0.6176506891586093, + "grad_norm": 2.675002223434143, + "learning_rate": 6.735961418464999e-06, + "loss": 0.7312, + "step": 7506 + }, + { + "epoch": 0.6177329767537544, + "grad_norm": 2.063299754693337, + "learning_rate": 6.7334422089867125e-06, + "loss": 0.7436, + "step": 7507 + }, + { + "epoch": 0.6178152643488994, + "grad_norm": 3.7151597197647503, + "learning_rate": 6.7309232315594656e-06, + "loss": 0.6939, + "step": 7508 + }, + { + "epoch": 0.6178975519440444, + "grad_norm": 0.3918859316872356, + "learning_rate": 6.728404486362206e-06, + "loss": 0.4621, + "step": 7509 + }, + { + "epoch": 0.6179798395391894, + "grad_norm": 1.9983981380954352, + "learning_rate": 6.7258859735738565e-06, + "loss": 0.7431, + "step": 7510 + }, + { + "epoch": 0.6180621271343345, + "grad_norm": 2.3059227669094478, + "learning_rate": 6.723367693373335e-06, + "loss": 0.7193, + "step": 7511 + }, + { + "epoch": 0.6181444147294796, + "grad_norm": 1.9914675973938407, + "learning_rate": 6.720849645939527e-06, + "loss": 0.7151, + "step": 7512 + }, + { + "epoch": 0.6182267023246245, + "grad_norm": 0.41748182796721384, + "learning_rate": 6.71833183145132e-06, + "loss": 0.4697, + "step": 7513 + }, + { + "epoch": 0.6183089899197696, + "grad_norm": 2.2663425333347047, + "learning_rate": 6.7158142500875705e-06, + "loss": 0.7442, + "step": 7514 + }, + { + "epoch": 0.6183912775149146, + "grad_norm": 2.4293320086811736, + "learning_rate": 6.713296902027123e-06, + "loss": 0.7155, + "step": 7515 + }, + { + "epoch": 0.6184735651100597, + "grad_norm": 2.0822118362525313, + "learning_rate": 6.710779787448811e-06, + "loss": 0.6957, + "step": 7516 + }, + { + "epoch": 0.6185558527052047, + "grad_norm": 1.7857113534818985, + "learning_rate": 6.7082629065314375e-06, + "loss": 0.7078, + "step": 7517 + }, + { + "epoch": 0.6186381403003497, + "grad_norm": 1.986222024453004, + "learning_rate": 6.705746259453807e-06, + "loss": 0.7485, + "step": 7518 + }, + { + "epoch": 0.6187204278954948, + "grad_norm": 2.2108825370609457, + "learning_rate": 6.703229846394691e-06, + "loss": 0.766, + "step": 7519 + }, + { + "epoch": 0.6188027154906398, + "grad_norm": 2.4374437914114306, + "learning_rate": 6.700713667532856e-06, + "loss": 0.7204, + "step": 7520 + }, + { + "epoch": 0.6188850030857849, + "grad_norm": 2.259218303856434, + "learning_rate": 6.698197723047046e-06, + "loss": 0.7271, + "step": 7521 + }, + { + "epoch": 0.6189672906809298, + "grad_norm": 2.188948817372675, + "learning_rate": 6.695682013115989e-06, + "loss": 0.7206, + "step": 7522 + }, + { + "epoch": 0.6190495782760749, + "grad_norm": 2.427999406809418, + "learning_rate": 6.693166537918394e-06, + "loss": 0.7359, + "step": 7523 + }, + { + "epoch": 0.6191318658712199, + "grad_norm": 5.307905128159324, + "learning_rate": 6.690651297632964e-06, + "loss": 0.7186, + "step": 7524 + }, + { + "epoch": 0.619214153466365, + "grad_norm": 1.8783582027251158, + "learning_rate": 6.688136292438372e-06, + "loss": 0.7091, + "step": 7525 + }, + { + "epoch": 0.6192964410615099, + "grad_norm": 2.3655508654646322, + "learning_rate": 6.685621522513282e-06, + "loss": 0.7263, + "step": 7526 + }, + { + "epoch": 0.619378728656655, + "grad_norm": 2.284834825982675, + "learning_rate": 6.683106988036338e-06, + "loss": 0.7705, + "step": 7527 + }, + { + "epoch": 0.6194610162518001, + "grad_norm": 4.59759530492956, + "learning_rate": 6.6805926891861695e-06, + "loss": 0.7065, + "step": 7528 + }, + { + "epoch": 0.6195433038469451, + "grad_norm": 0.42386693312631396, + "learning_rate": 6.678078626141384e-06, + "loss": 0.5154, + "step": 7529 + }, + { + "epoch": 0.6196255914420901, + "grad_norm": 0.4153129652689517, + "learning_rate": 6.6755647990805875e-06, + "loss": 0.4862, + "step": 7530 + }, + { + "epoch": 0.6197078790372351, + "grad_norm": 2.8657873342977496, + "learning_rate": 6.673051208182343e-06, + "loss": 0.7482, + "step": 7531 + }, + { + "epoch": 0.6197901666323802, + "grad_norm": 0.43402706893884335, + "learning_rate": 6.670537853625225e-06, + "loss": 0.4857, + "step": 7532 + }, + { + "epoch": 0.6198724542275252, + "grad_norm": 2.2736334588898788, + "learning_rate": 6.6680247355877705e-06, + "loss": 0.715, + "step": 7533 + }, + { + "epoch": 0.6199547418226702, + "grad_norm": 2.8174089210582154, + "learning_rate": 6.6655118542485135e-06, + "loss": 0.7515, + "step": 7534 + }, + { + "epoch": 0.6200370294178152, + "grad_norm": 2.1924222892417986, + "learning_rate": 6.662999209785956e-06, + "loss": 0.73, + "step": 7535 + }, + { + "epoch": 0.6201193170129603, + "grad_norm": 2.2713659652917255, + "learning_rate": 6.660486802378605e-06, + "loss": 0.7268, + "step": 7536 + }, + { + "epoch": 0.6202016046081054, + "grad_norm": 1.9388972682793653, + "learning_rate": 6.657974632204924e-06, + "loss": 0.7308, + "step": 7537 + }, + { + "epoch": 0.6202838922032503, + "grad_norm": 2.4694732609009966, + "learning_rate": 6.655462699443386e-06, + "loss": 0.7199, + "step": 7538 + }, + { + "epoch": 0.6203661797983954, + "grad_norm": 3.8150973908260912, + "learning_rate": 6.6529510042724255e-06, + "loss": 0.7515, + "step": 7539 + }, + { + "epoch": 0.6204484673935404, + "grad_norm": 2.003655061461574, + "learning_rate": 6.650439546870475e-06, + "loss": 0.7705, + "step": 7540 + }, + { + "epoch": 0.6205307549886855, + "grad_norm": 2.2808845630635166, + "learning_rate": 6.647928327415941e-06, + "loss": 0.6795, + "step": 7541 + }, + { + "epoch": 0.6206130425838304, + "grad_norm": 2.094881216359929, + "learning_rate": 6.64541734608722e-06, + "loss": 0.7212, + "step": 7542 + }, + { + "epoch": 0.6206953301789755, + "grad_norm": 1.9517278768441797, + "learning_rate": 6.6429066030626795e-06, + "loss": 0.7473, + "step": 7543 + }, + { + "epoch": 0.6207776177741205, + "grad_norm": 2.0993367776128307, + "learning_rate": 6.6403960985206915e-06, + "loss": 0.7455, + "step": 7544 + }, + { + "epoch": 0.6208599053692656, + "grad_norm": 0.4229646537674248, + "learning_rate": 6.63788583263959e-06, + "loss": 0.4985, + "step": 7545 + }, + { + "epoch": 0.6209421929644107, + "grad_norm": 3.5095892765184566, + "learning_rate": 6.635375805597702e-06, + "loss": 0.7244, + "step": 7546 + }, + { + "epoch": 0.6210244805595556, + "grad_norm": 0.4083202878323374, + "learning_rate": 6.632866017573335e-06, + "loss": 0.4858, + "step": 7547 + }, + { + "epoch": 0.6211067681547007, + "grad_norm": 3.1831844589807123, + "learning_rate": 6.630356468744783e-06, + "loss": 0.7209, + "step": 7548 + }, + { + "epoch": 0.6211890557498457, + "grad_norm": 2.912867448561611, + "learning_rate": 6.6278471592903146e-06, + "loss": 0.7072, + "step": 7549 + }, + { + "epoch": 0.6212713433449908, + "grad_norm": 0.41822193765206633, + "learning_rate": 6.6253380893881955e-06, + "loss": 0.4787, + "step": 7550 + }, + { + "epoch": 0.6213536309401357, + "grad_norm": 2.452489369763351, + "learning_rate": 6.622829259216659e-06, + "loss": 0.7345, + "step": 7551 + }, + { + "epoch": 0.6214359185352808, + "grad_norm": 3.241247141877019, + "learning_rate": 6.620320668953933e-06, + "loss": 0.737, + "step": 7552 + }, + { + "epoch": 0.6215182061304259, + "grad_norm": 2.3012496284652744, + "learning_rate": 6.6178123187782195e-06, + "loss": 0.7396, + "step": 7553 + }, + { + "epoch": 0.6216004937255709, + "grad_norm": 0.4229165132694804, + "learning_rate": 6.6153042088677124e-06, + "loss": 0.495, + "step": 7554 + }, + { + "epoch": 0.6216827813207159, + "grad_norm": 1.9670058853317678, + "learning_rate": 6.612796339400577e-06, + "loss": 0.7239, + "step": 7555 + }, + { + "epoch": 0.6217650689158609, + "grad_norm": 2.2621655840328914, + "learning_rate": 6.610288710554976e-06, + "loss": 0.701, + "step": 7556 + }, + { + "epoch": 0.621847356511006, + "grad_norm": 0.4403973737435333, + "learning_rate": 6.6077813225090425e-06, + "loss": 0.518, + "step": 7557 + }, + { + "epoch": 0.621929644106151, + "grad_norm": 2.1712425990155073, + "learning_rate": 6.605274175440901e-06, + "loss": 0.7513, + "step": 7558 + }, + { + "epoch": 0.622011931701296, + "grad_norm": 1.827914067047543, + "learning_rate": 6.602767269528651e-06, + "loss": 0.7254, + "step": 7559 + }, + { + "epoch": 0.622094219296441, + "grad_norm": 2.273490499184524, + "learning_rate": 6.600260604950384e-06, + "loss": 0.6955, + "step": 7560 + }, + { + "epoch": 0.6221765068915861, + "grad_norm": 2.1834922748708006, + "learning_rate": 6.5977541818841595e-06, + "loss": 0.7313, + "step": 7561 + }, + { + "epoch": 0.6222587944867312, + "grad_norm": 1.940022313111334, + "learning_rate": 6.595248000508043e-06, + "loss": 0.749, + "step": 7562 + }, + { + "epoch": 0.6223410820818761, + "grad_norm": 0.4089844610135058, + "learning_rate": 6.592742061000061e-06, + "loss": 0.4698, + "step": 7563 + }, + { + "epoch": 0.6224233696770212, + "grad_norm": 2.177965439230456, + "learning_rate": 6.590236363538237e-06, + "loss": 0.7301, + "step": 7564 + }, + { + "epoch": 0.6225056572721662, + "grad_norm": 1.9963980119419522, + "learning_rate": 6.587730908300566e-06, + "loss": 0.7454, + "step": 7565 + }, + { + "epoch": 0.6225879448673113, + "grad_norm": 2.2340633833154304, + "learning_rate": 6.585225695465036e-06, + "loss": 0.743, + "step": 7566 + }, + { + "epoch": 0.6226702324624562, + "grad_norm": 1.973592822536053, + "learning_rate": 6.582720725209607e-06, + "loss": 0.7435, + "step": 7567 + }, + { + "epoch": 0.6227525200576013, + "grad_norm": 1.819070766986217, + "learning_rate": 6.580215997712239e-06, + "loss": 0.7284, + "step": 7568 + }, + { + "epoch": 0.6228348076527463, + "grad_norm": 3.534915425741585, + "learning_rate": 6.577711513150851e-06, + "loss": 0.7491, + "step": 7569 + }, + { + "epoch": 0.6229170952478914, + "grad_norm": 2.0819569155206117, + "learning_rate": 6.5752072717033685e-06, + "loss": 0.724, + "step": 7570 + }, + { + "epoch": 0.6229993828430365, + "grad_norm": 2.534755675264709, + "learning_rate": 6.572703273547682e-06, + "loss": 0.7319, + "step": 7571 + }, + { + "epoch": 0.6230816704381814, + "grad_norm": 2.1762681974192444, + "learning_rate": 6.570199518861676e-06, + "loss": 0.727, + "step": 7572 + }, + { + "epoch": 0.6231639580333265, + "grad_norm": 2.088327445753348, + "learning_rate": 6.567696007823207e-06, + "loss": 0.7208, + "step": 7573 + }, + { + "epoch": 0.6232462456284715, + "grad_norm": 0.4258771303335688, + "learning_rate": 6.56519274061013e-06, + "loss": 0.4926, + "step": 7574 + }, + { + "epoch": 0.6233285332236166, + "grad_norm": 2.5771522841721355, + "learning_rate": 6.562689717400261e-06, + "loss": 0.7272, + "step": 7575 + }, + { + "epoch": 0.6234108208187615, + "grad_norm": 1.9614067417179577, + "learning_rate": 6.560186938371423e-06, + "loss": 0.7121, + "step": 7576 + }, + { + "epoch": 0.6234931084139066, + "grad_norm": 2.232201613132238, + "learning_rate": 6.557684403701401e-06, + "loss": 0.7105, + "step": 7577 + }, + { + "epoch": 0.6235753960090517, + "grad_norm": 0.4272311296250902, + "learning_rate": 6.555182113567976e-06, + "loss": 0.5085, + "step": 7578 + }, + { + "epoch": 0.6236576836041967, + "grad_norm": 2.2126446164510356, + "learning_rate": 6.5526800681489025e-06, + "loss": 0.7121, + "step": 7579 + }, + { + "epoch": 0.6237399711993417, + "grad_norm": 2.3854632245381233, + "learning_rate": 6.550178267621925e-06, + "loss": 0.7629, + "step": 7580 + }, + { + "epoch": 0.6238222587944867, + "grad_norm": 3.216150835315899, + "learning_rate": 6.547676712164763e-06, + "loss": 0.7322, + "step": 7581 + }, + { + "epoch": 0.6239045463896318, + "grad_norm": 2.191072105062492, + "learning_rate": 6.545175401955129e-06, + "loss": 0.7249, + "step": 7582 + }, + { + "epoch": 0.6239868339847768, + "grad_norm": 2.436886276900028, + "learning_rate": 6.542674337170709e-06, + "loss": 0.7289, + "step": 7583 + }, + { + "epoch": 0.6240691215799218, + "grad_norm": 2.133551653367922, + "learning_rate": 6.540173517989175e-06, + "loss": 0.699, + "step": 7584 + }, + { + "epoch": 0.6241514091750668, + "grad_norm": 2.5476646025598857, + "learning_rate": 6.537672944588181e-06, + "loss": 0.7483, + "step": 7585 + }, + { + "epoch": 0.6242336967702119, + "grad_norm": 2.5384129413112047, + "learning_rate": 6.535172617145364e-06, + "loss": 0.7327, + "step": 7586 + }, + { + "epoch": 0.624315984365357, + "grad_norm": 1.9466620358184892, + "learning_rate": 6.53267253583834e-06, + "loss": 0.7329, + "step": 7587 + }, + { + "epoch": 0.6243982719605019, + "grad_norm": 0.4040465588053262, + "learning_rate": 6.530172700844719e-06, + "loss": 0.5024, + "step": 7588 + }, + { + "epoch": 0.624480559555647, + "grad_norm": 1.7927789309780267, + "learning_rate": 6.527673112342077e-06, + "loss": 0.7071, + "step": 7589 + }, + { + "epoch": 0.624562847150792, + "grad_norm": 2.0683519841306843, + "learning_rate": 6.525173770507986e-06, + "loss": 0.7316, + "step": 7590 + }, + { + "epoch": 0.6246451347459371, + "grad_norm": 2.046037144943284, + "learning_rate": 6.522674675519991e-06, + "loss": 0.6998, + "step": 7591 + }, + { + "epoch": 0.624727422341082, + "grad_norm": 2.66216682032732, + "learning_rate": 6.5201758275556286e-06, + "loss": 0.7116, + "step": 7592 + }, + { + "epoch": 0.6248097099362271, + "grad_norm": 2.0322896602996234, + "learning_rate": 6.517677226792405e-06, + "loss": 0.7016, + "step": 7593 + }, + { + "epoch": 0.6248919975313721, + "grad_norm": 2.253326991741861, + "learning_rate": 6.515178873407828e-06, + "loss": 0.7271, + "step": 7594 + }, + { + "epoch": 0.6249742851265172, + "grad_norm": 2.437096046132411, + "learning_rate": 6.512680767579367e-06, + "loss": 0.7209, + "step": 7595 + }, + { + "epoch": 0.6250565727216623, + "grad_norm": 1.95580789063701, + "learning_rate": 6.510182909484491e-06, + "loss": 0.714, + "step": 7596 + }, + { + "epoch": 0.6251388603168072, + "grad_norm": 2.227313867467561, + "learning_rate": 6.5076852993006345e-06, + "loss": 0.7258, + "step": 7597 + }, + { + "epoch": 0.6252211479119523, + "grad_norm": 2.080244110721111, + "learning_rate": 6.505187937205234e-06, + "loss": 0.6962, + "step": 7598 + }, + { + "epoch": 0.6253034355070973, + "grad_norm": 0.42215617276392864, + "learning_rate": 6.502690823375688e-06, + "loss": 0.4826, + "step": 7599 + }, + { + "epoch": 0.6253857231022424, + "grad_norm": 2.8730372329827727, + "learning_rate": 6.500193957989401e-06, + "loss": 0.7142, + "step": 7600 + }, + { + "epoch": 0.6254680106973873, + "grad_norm": 2.8013590878802854, + "learning_rate": 6.49769734122373e-06, + "loss": 0.7509, + "step": 7601 + }, + { + "epoch": 0.6255502982925324, + "grad_norm": 2.2765319596617366, + "learning_rate": 6.495200973256043e-06, + "loss": 0.717, + "step": 7602 + }, + { + "epoch": 0.6256325858876775, + "grad_norm": 2.7033639008041956, + "learning_rate": 6.492704854263672e-06, + "loss": 0.7268, + "step": 7603 + }, + { + "epoch": 0.6257148734828225, + "grad_norm": 2.472988558010329, + "learning_rate": 6.490208984423941e-06, + "loss": 0.739, + "step": 7604 + }, + { + "epoch": 0.6257971610779675, + "grad_norm": 0.41290820289680535, + "learning_rate": 6.487713363914148e-06, + "loss": 0.4983, + "step": 7605 + }, + { + "epoch": 0.6258794486731125, + "grad_norm": 0.4282767236482035, + "learning_rate": 6.485217992911585e-06, + "loss": 0.4862, + "step": 7606 + }, + { + "epoch": 0.6259617362682576, + "grad_norm": 2.3267181463473188, + "learning_rate": 6.482722871593509e-06, + "loss": 0.7364, + "step": 7607 + }, + { + "epoch": 0.6260440238634026, + "grad_norm": 2.8422015820325863, + "learning_rate": 6.480228000137179e-06, + "loss": 0.7278, + "step": 7608 + }, + { + "epoch": 0.6261263114585476, + "grad_norm": 2.1362802618576353, + "learning_rate": 6.477733378719822e-06, + "loss": 0.7337, + "step": 7609 + }, + { + "epoch": 0.6262085990536926, + "grad_norm": 1.924000108410377, + "learning_rate": 6.475239007518652e-06, + "loss": 0.7437, + "step": 7610 + }, + { + "epoch": 0.6262908866488377, + "grad_norm": 2.3427351972057666, + "learning_rate": 6.472744886710867e-06, + "loss": 0.7307, + "step": 7611 + }, + { + "epoch": 0.6263731742439828, + "grad_norm": 0.40272647490971025, + "learning_rate": 6.470251016473645e-06, + "loss": 0.4815, + "step": 7612 + }, + { + "epoch": 0.6264554618391277, + "grad_norm": 1.8557765239248825, + "learning_rate": 6.46775739698414e-06, + "loss": 0.7255, + "step": 7613 + }, + { + "epoch": 0.6265377494342728, + "grad_norm": 0.41251294126370963, + "learning_rate": 6.465264028419507e-06, + "loss": 0.474, + "step": 7614 + }, + { + "epoch": 0.6266200370294178, + "grad_norm": 2.1587220562119542, + "learning_rate": 6.462770910956862e-06, + "loss": 0.7375, + "step": 7615 + }, + { + "epoch": 0.6267023246245629, + "grad_norm": 2.3586981207897675, + "learning_rate": 6.460278044773316e-06, + "loss": 0.7313, + "step": 7616 + }, + { + "epoch": 0.6267846122197078, + "grad_norm": 2.039259976506147, + "learning_rate": 6.457785430045955e-06, + "loss": 0.7577, + "step": 7617 + }, + { + "epoch": 0.6268668998148529, + "grad_norm": 2.44284358845322, + "learning_rate": 6.455293066951854e-06, + "loss": 0.7269, + "step": 7618 + }, + { + "epoch": 0.6269491874099979, + "grad_norm": 0.44540327813082253, + "learning_rate": 6.452800955668061e-06, + "loss": 0.49, + "step": 7619 + }, + { + "epoch": 0.627031475005143, + "grad_norm": 2.1439004781403805, + "learning_rate": 6.450309096371619e-06, + "loss": 0.7413, + "step": 7620 + }, + { + "epoch": 0.6271137626002881, + "grad_norm": 1.8858618819350936, + "learning_rate": 6.44781748923954e-06, + "loss": 0.7177, + "step": 7621 + }, + { + "epoch": 0.627196050195433, + "grad_norm": 1.823556032411837, + "learning_rate": 6.445326134448827e-06, + "loss": 0.7407, + "step": 7622 + }, + { + "epoch": 0.6272783377905781, + "grad_norm": 1.8596060222077937, + "learning_rate": 6.4428350321764585e-06, + "loss": 0.7542, + "step": 7623 + }, + { + "epoch": 0.6273606253857231, + "grad_norm": 2.890138572020586, + "learning_rate": 6.440344182599403e-06, + "loss": 0.7517, + "step": 7624 + }, + { + "epoch": 0.6274429129808682, + "grad_norm": 2.338274437634758, + "learning_rate": 6.4378535858946e-06, + "loss": 0.7155, + "step": 7625 + }, + { + "epoch": 0.6275252005760131, + "grad_norm": 3.385266059856828, + "learning_rate": 6.435363242238986e-06, + "loss": 0.7255, + "step": 7626 + }, + { + "epoch": 0.6276074881711582, + "grad_norm": 1.9409271828697525, + "learning_rate": 6.432873151809463e-06, + "loss": 0.7347, + "step": 7627 + }, + { + "epoch": 0.6276897757663032, + "grad_norm": 0.4200570474621552, + "learning_rate": 6.43038331478293e-06, + "loss": 0.4894, + "step": 7628 + }, + { + "epoch": 0.6277720633614483, + "grad_norm": 0.4057011891816415, + "learning_rate": 6.4278937313362544e-06, + "loss": 0.4784, + "step": 7629 + }, + { + "epoch": 0.6278543509565933, + "grad_norm": 3.3269466584550718, + "learning_rate": 6.425404401646298e-06, + "loss": 0.7357, + "step": 7630 + }, + { + "epoch": 0.6279366385517383, + "grad_norm": 2.08380827577758, + "learning_rate": 6.422915325889892e-06, + "loss": 0.7387, + "step": 7631 + }, + { + "epoch": 0.6280189261468834, + "grad_norm": 2.493724050993378, + "learning_rate": 6.4204265042438675e-06, + "loss": 0.7132, + "step": 7632 + }, + { + "epoch": 0.6281012137420284, + "grad_norm": 2.0251198929853764, + "learning_rate": 6.417937936885013e-06, + "loss": 0.7458, + "step": 7633 + }, + { + "epoch": 0.6281835013371734, + "grad_norm": 1.8535818616465134, + "learning_rate": 6.415449623990122e-06, + "loss": 0.7165, + "step": 7634 + }, + { + "epoch": 0.6282657889323184, + "grad_norm": 2.1061762732338196, + "learning_rate": 6.412961565735955e-06, + "loss": 0.7357, + "step": 7635 + }, + { + "epoch": 0.6283480765274635, + "grad_norm": 1.8625454286095338, + "learning_rate": 6.410473762299265e-06, + "loss": 0.716, + "step": 7636 + }, + { + "epoch": 0.6284303641226086, + "grad_norm": 2.095971168496662, + "learning_rate": 6.407986213856773e-06, + "loss": 0.7327, + "step": 7637 + }, + { + "epoch": 0.6285126517177535, + "grad_norm": 0.4234738729464648, + "learning_rate": 6.405498920585203e-06, + "loss": 0.5093, + "step": 7638 + }, + { + "epoch": 0.6285949393128986, + "grad_norm": 2.117092398514615, + "learning_rate": 6.403011882661234e-06, + "loss": 0.7351, + "step": 7639 + }, + { + "epoch": 0.6286772269080436, + "grad_norm": 2.05278397796014, + "learning_rate": 6.400525100261552e-06, + "loss": 0.7529, + "step": 7640 + }, + { + "epoch": 0.6287595145031887, + "grad_norm": 4.68083635239492, + "learning_rate": 6.398038573562811e-06, + "loss": 0.7067, + "step": 7641 + }, + { + "epoch": 0.6288418020983336, + "grad_norm": 2.3030811089409635, + "learning_rate": 6.395552302741648e-06, + "loss": 0.7588, + "step": 7642 + }, + { + "epoch": 0.6289240896934787, + "grad_norm": 1.845281747061714, + "learning_rate": 6.393066287974683e-06, + "loss": 0.7342, + "step": 7643 + }, + { + "epoch": 0.6290063772886237, + "grad_norm": 0.4280038601034338, + "learning_rate": 6.390580529438526e-06, + "loss": 0.4809, + "step": 7644 + }, + { + "epoch": 0.6290886648837688, + "grad_norm": 1.8344036688529135, + "learning_rate": 6.388095027309749e-06, + "loss": 0.7513, + "step": 7645 + }, + { + "epoch": 0.6291709524789139, + "grad_norm": 2.2766655666320657, + "learning_rate": 6.38560978176493e-06, + "loss": 0.7204, + "step": 7646 + }, + { + "epoch": 0.6292532400740588, + "grad_norm": 0.4015122043276666, + "learning_rate": 6.383124792980608e-06, + "loss": 0.4792, + "step": 7647 + }, + { + "epoch": 0.6293355276692039, + "grad_norm": 0.4225635714891269, + "learning_rate": 6.380640061133321e-06, + "loss": 0.4867, + "step": 7648 + }, + { + "epoch": 0.6294178152643489, + "grad_norm": 2.058998420208348, + "learning_rate": 6.3781555863995715e-06, + "loss": 0.7322, + "step": 7649 + }, + { + "epoch": 0.629500102859494, + "grad_norm": 1.9487999490408312, + "learning_rate": 6.37567136895586e-06, + "loss": 0.7158, + "step": 7650 + }, + { + "epoch": 0.6295823904546389, + "grad_norm": 0.4074366670170456, + "learning_rate": 6.373187408978654e-06, + "loss": 0.506, + "step": 7651 + }, + { + "epoch": 0.629664678049784, + "grad_norm": 2.2406596040043367, + "learning_rate": 6.370703706644418e-06, + "loss": 0.7419, + "step": 7652 + }, + { + "epoch": 0.629746965644929, + "grad_norm": 1.87033962246932, + "learning_rate": 6.3682202621295845e-06, + "loss": 0.7241, + "step": 7653 + }, + { + "epoch": 0.6298292532400741, + "grad_norm": 1.9320227407080641, + "learning_rate": 6.3657370756105776e-06, + "loss": 0.721, + "step": 7654 + }, + { + "epoch": 0.629911540835219, + "grad_norm": 1.862613266897739, + "learning_rate": 6.363254147263794e-06, + "loss": 0.7204, + "step": 7655 + }, + { + "epoch": 0.6299938284303641, + "grad_norm": 2.3748806263134496, + "learning_rate": 6.360771477265623e-06, + "loss": 0.7316, + "step": 7656 + }, + { + "epoch": 0.6300761160255092, + "grad_norm": 2.879737668353372, + "learning_rate": 6.3582890657924216e-06, + "loss": 0.7206, + "step": 7657 + }, + { + "epoch": 0.6301584036206542, + "grad_norm": 2.0740558964489058, + "learning_rate": 6.355806913020543e-06, + "loss": 0.7245, + "step": 7658 + }, + { + "epoch": 0.6302406912157992, + "grad_norm": 2.149158613235514, + "learning_rate": 6.353325019126314e-06, + "loss": 0.737, + "step": 7659 + }, + { + "epoch": 0.6303229788109442, + "grad_norm": 2.14519683564858, + "learning_rate": 6.350843384286045e-06, + "loss": 0.7171, + "step": 7660 + }, + { + "epoch": 0.6304052664060893, + "grad_norm": 2.0742919112543197, + "learning_rate": 6.348362008676023e-06, + "loss": 0.6947, + "step": 7661 + }, + { + "epoch": 0.6304875540012344, + "grad_norm": 2.134902780148822, + "learning_rate": 6.345880892472527e-06, + "loss": 0.7204, + "step": 7662 + }, + { + "epoch": 0.6305698415963793, + "grad_norm": 1.8069097596666854, + "learning_rate": 6.343400035851805e-06, + "loss": 0.7556, + "step": 7663 + }, + { + "epoch": 0.6306521291915244, + "grad_norm": 0.404834952652182, + "learning_rate": 6.340919438990099e-06, + "loss": 0.4902, + "step": 7664 + }, + { + "epoch": 0.6307344167866694, + "grad_norm": 2.000220774123535, + "learning_rate": 6.338439102063622e-06, + "loss": 0.757, + "step": 7665 + }, + { + "epoch": 0.6308167043818145, + "grad_norm": 2.037680407110623, + "learning_rate": 6.3359590252485795e-06, + "loss": 0.7032, + "step": 7666 + }, + { + "epoch": 0.6308989919769594, + "grad_norm": 2.110465951128527, + "learning_rate": 6.333479208721145e-06, + "loss": 0.7498, + "step": 7667 + }, + { + "epoch": 0.6309812795721045, + "grad_norm": 0.4358324800393943, + "learning_rate": 6.330999652657486e-06, + "loss": 0.5168, + "step": 7668 + }, + { + "epoch": 0.6310635671672495, + "grad_norm": 2.0462155940305284, + "learning_rate": 6.328520357233738e-06, + "loss": 0.7266, + "step": 7669 + }, + { + "epoch": 0.6311458547623946, + "grad_norm": 2.1813252235565703, + "learning_rate": 6.326041322626041e-06, + "loss": 0.7284, + "step": 7670 + }, + { + "epoch": 0.6312281423575395, + "grad_norm": 2.2984329759398006, + "learning_rate": 6.323562549010486e-06, + "loss": 0.7337, + "step": 7671 + }, + { + "epoch": 0.6313104299526846, + "grad_norm": 1.9040540520431162, + "learning_rate": 6.321084036563171e-06, + "loss": 0.7254, + "step": 7672 + }, + { + "epoch": 0.6313927175478297, + "grad_norm": 2.2173704319551617, + "learning_rate": 6.318605785460161e-06, + "loss": 0.739, + "step": 7673 + }, + { + "epoch": 0.6314750051429747, + "grad_norm": 2.2216147080644557, + "learning_rate": 6.316127795877511e-06, + "loss": 0.7305, + "step": 7674 + }, + { + "epoch": 0.6315572927381198, + "grad_norm": 4.178906528768139, + "learning_rate": 6.313650067991246e-06, + "loss": 0.7153, + "step": 7675 + }, + { + "epoch": 0.6316395803332647, + "grad_norm": 0.4022054866898268, + "learning_rate": 6.311172601977391e-06, + "loss": 0.4851, + "step": 7676 + }, + { + "epoch": 0.6317218679284098, + "grad_norm": 1.5789632821027042, + "learning_rate": 6.3086953980119295e-06, + "loss": 0.7222, + "step": 7677 + }, + { + "epoch": 0.6318041555235548, + "grad_norm": 2.465495481723768, + "learning_rate": 6.306218456270847e-06, + "loss": 0.754, + "step": 7678 + }, + { + "epoch": 0.6318864431186999, + "grad_norm": 2.496020265051002, + "learning_rate": 6.303741776930097e-06, + "loss": 0.7141, + "step": 7679 + }, + { + "epoch": 0.6319687307138449, + "grad_norm": 2.1743797892530017, + "learning_rate": 6.30126536016562e-06, + "loss": 0.7211, + "step": 7680 + }, + { + "epoch": 0.6320510183089899, + "grad_norm": 3.2560695843383423, + "learning_rate": 6.2987892061533364e-06, + "loss": 0.7505, + "step": 7681 + }, + { + "epoch": 0.632133305904135, + "grad_norm": 3.773164906063876, + "learning_rate": 6.2963133150691495e-06, + "loss": 0.7459, + "step": 7682 + }, + { + "epoch": 0.63221559349928, + "grad_norm": 2.070205080894526, + "learning_rate": 6.293837687088937e-06, + "loss": 0.7354, + "step": 7683 + }, + { + "epoch": 0.632297881094425, + "grad_norm": 2.3209441651201588, + "learning_rate": 6.291362322388574e-06, + "loss": 0.7279, + "step": 7684 + }, + { + "epoch": 0.63238016868957, + "grad_norm": 0.42033956726655575, + "learning_rate": 6.288887221143896e-06, + "loss": 0.4824, + "step": 7685 + }, + { + "epoch": 0.6324624562847151, + "grad_norm": 2.319589126709886, + "learning_rate": 6.2864123835307375e-06, + "loss": 0.7106, + "step": 7686 + }, + { + "epoch": 0.6325447438798601, + "grad_norm": 1.865256699000475, + "learning_rate": 6.283937809724904e-06, + "loss": 0.7416, + "step": 7687 + }, + { + "epoch": 0.6326270314750051, + "grad_norm": 0.43257749345624746, + "learning_rate": 6.281463499902186e-06, + "loss": 0.5129, + "step": 7688 + }, + { + "epoch": 0.6327093190701502, + "grad_norm": 1.9066465733840203, + "learning_rate": 6.27898945423835e-06, + "loss": 0.7392, + "step": 7689 + }, + { + "epoch": 0.6327916066652952, + "grad_norm": 2.5129121353279213, + "learning_rate": 6.276515672909157e-06, + "loss": 0.7301, + "step": 7690 + }, + { + "epoch": 0.6328738942604403, + "grad_norm": 0.4323676780468816, + "learning_rate": 6.274042156090334e-06, + "loss": 0.5377, + "step": 7691 + }, + { + "epoch": 0.6329561818555852, + "grad_norm": 2.613649921099537, + "learning_rate": 6.2715689039575985e-06, + "loss": 0.7516, + "step": 7692 + }, + { + "epoch": 0.6330384694507303, + "grad_norm": 2.0904843924864367, + "learning_rate": 6.269095916686643e-06, + "loss": 0.7287, + "step": 7693 + }, + { + "epoch": 0.6331207570458753, + "grad_norm": 1.5753115390393089, + "learning_rate": 6.266623194453149e-06, + "loss": 0.7028, + "step": 7694 + }, + { + "epoch": 0.6332030446410204, + "grad_norm": 1.9512292667962645, + "learning_rate": 6.264150737432771e-06, + "loss": 0.7282, + "step": 7695 + }, + { + "epoch": 0.6332853322361653, + "grad_norm": 5.914211167107383, + "learning_rate": 6.261678545801151e-06, + "loss": 0.7332, + "step": 7696 + }, + { + "epoch": 0.6333676198313104, + "grad_norm": 2.019324168779867, + "learning_rate": 6.259206619733909e-06, + "loss": 0.7225, + "step": 7697 + }, + { + "epoch": 0.6334499074264555, + "grad_norm": 0.4201514272766437, + "learning_rate": 6.256734959406646e-06, + "loss": 0.4804, + "step": 7698 + }, + { + "epoch": 0.6335321950216005, + "grad_norm": 1.6916410565123474, + "learning_rate": 6.254263564994944e-06, + "loss": 0.7257, + "step": 7699 + }, + { + "epoch": 0.6336144826167456, + "grad_norm": 2.0051606535642823, + "learning_rate": 6.25179243667437e-06, + "loss": 0.7402, + "step": 7700 + }, + { + "epoch": 0.6336967702118905, + "grad_norm": 2.3212714169775053, + "learning_rate": 6.249321574620463e-06, + "loss": 0.7204, + "step": 7701 + }, + { + "epoch": 0.6337790578070356, + "grad_norm": 4.620668098208437, + "learning_rate": 6.246850979008759e-06, + "loss": 0.7268, + "step": 7702 + }, + { + "epoch": 0.6338613454021806, + "grad_norm": 0.42275244368774406, + "learning_rate": 6.244380650014752e-06, + "loss": 0.4957, + "step": 7703 + }, + { + "epoch": 0.6339436329973257, + "grad_norm": 0.4188137268791631, + "learning_rate": 6.241910587813943e-06, + "loss": 0.4783, + "step": 7704 + }, + { + "epoch": 0.6340259205924706, + "grad_norm": 1.9980733942599618, + "learning_rate": 6.239440792581793e-06, + "loss": 0.7208, + "step": 7705 + }, + { + "epoch": 0.6341082081876157, + "grad_norm": 0.4240478232249822, + "learning_rate": 6.236971264493756e-06, + "loss": 0.4794, + "step": 7706 + }, + { + "epoch": 0.6341904957827608, + "grad_norm": 0.4267034887519634, + "learning_rate": 6.23450200372526e-06, + "loss": 0.4863, + "step": 7707 + }, + { + "epoch": 0.6342727833779058, + "grad_norm": 1.7770975193975613, + "learning_rate": 6.232033010451721e-06, + "loss": 0.734, + "step": 7708 + }, + { + "epoch": 0.6343550709730508, + "grad_norm": 1.9044753379431554, + "learning_rate": 6.229564284848533e-06, + "loss": 0.7437, + "step": 7709 + }, + { + "epoch": 0.6344373585681958, + "grad_norm": 1.9481120955076106, + "learning_rate": 6.2270958270910665e-06, + "loss": 0.7247, + "step": 7710 + }, + { + "epoch": 0.6345196461633409, + "grad_norm": 1.9958091359673509, + "learning_rate": 6.22462763735468e-06, + "loss": 0.7323, + "step": 7711 + }, + { + "epoch": 0.634601933758486, + "grad_norm": 0.4262085373240435, + "learning_rate": 6.222159715814704e-06, + "loss": 0.4924, + "step": 7712 + }, + { + "epoch": 0.6346842213536309, + "grad_norm": 2.7697570169769477, + "learning_rate": 6.219692062646465e-06, + "loss": 0.7459, + "step": 7713 + }, + { + "epoch": 0.634766508948776, + "grad_norm": 2.1862670238166944, + "learning_rate": 6.217224678025253e-06, + "loss": 0.7453, + "step": 7714 + }, + { + "epoch": 0.634848796543921, + "grad_norm": 1.7020574665871306, + "learning_rate": 6.214757562126355e-06, + "loss": 0.7285, + "step": 7715 + }, + { + "epoch": 0.6349310841390661, + "grad_norm": 1.8931861799108651, + "learning_rate": 6.212290715125021e-06, + "loss": 0.7315, + "step": 7716 + }, + { + "epoch": 0.635013371734211, + "grad_norm": 1.836361917883911, + "learning_rate": 6.209824137196501e-06, + "loss": 0.6983, + "step": 7717 + }, + { + "epoch": 0.6350956593293561, + "grad_norm": 2.1049439828486, + "learning_rate": 6.207357828516007e-06, + "loss": 0.7078, + "step": 7718 + }, + { + "epoch": 0.6351779469245011, + "grad_norm": 0.4043474878995039, + "learning_rate": 6.204891789258755e-06, + "loss": 0.4856, + "step": 7719 + }, + { + "epoch": 0.6352602345196462, + "grad_norm": 1.7150773612696641, + "learning_rate": 6.202426019599914e-06, + "loss": 0.7466, + "step": 7720 + }, + { + "epoch": 0.6353425221147911, + "grad_norm": 1.8233264921915435, + "learning_rate": 6.199960519714661e-06, + "loss": 0.739, + "step": 7721 + }, + { + "epoch": 0.6354248097099362, + "grad_norm": 1.71552418581388, + "learning_rate": 6.197495289778131e-06, + "loss": 0.7461, + "step": 7722 + }, + { + "epoch": 0.6355070973050813, + "grad_norm": 2.1237819341443496, + "learning_rate": 6.195030329965457e-06, + "loss": 0.7587, + "step": 7723 + }, + { + "epoch": 0.6355893849002263, + "grad_norm": 2.4308828643726144, + "learning_rate": 6.19256564045174e-06, + "loss": 0.7347, + "step": 7724 + }, + { + "epoch": 0.6356716724953714, + "grad_norm": 2.0020621796207156, + "learning_rate": 6.190101221412077e-06, + "loss": 0.7401, + "step": 7725 + }, + { + "epoch": 0.6357539600905163, + "grad_norm": 0.4140642797469834, + "learning_rate": 6.187637073021523e-06, + "loss": 0.4816, + "step": 7726 + }, + { + "epoch": 0.6358362476856614, + "grad_norm": 2.2837648372665167, + "learning_rate": 6.185173195455139e-06, + "loss": 0.7035, + "step": 7727 + }, + { + "epoch": 0.6359185352808064, + "grad_norm": 2.065761567452068, + "learning_rate": 6.182709588887949e-06, + "loss": 0.7316, + "step": 7728 + }, + { + "epoch": 0.6360008228759515, + "grad_norm": 2.1437918234853632, + "learning_rate": 6.180246253494966e-06, + "loss": 0.7099, + "step": 7729 + }, + { + "epoch": 0.6360831104710964, + "grad_norm": 1.929483077488045, + "learning_rate": 6.177783189451178e-06, + "loss": 0.7497, + "step": 7730 + }, + { + "epoch": 0.6361653980662415, + "grad_norm": 0.4156432761134627, + "learning_rate": 6.1753203969315625e-06, + "loss": 0.4948, + "step": 7731 + }, + { + "epoch": 0.6362476856613866, + "grad_norm": 2.312400315900538, + "learning_rate": 6.1728578761110645e-06, + "loss": 0.7278, + "step": 7732 + }, + { + "epoch": 0.6363299732565316, + "grad_norm": 1.8801617973720686, + "learning_rate": 6.170395627164627e-06, + "loss": 0.7122, + "step": 7733 + }, + { + "epoch": 0.6364122608516766, + "grad_norm": 2.2342423328898025, + "learning_rate": 6.167933650267157e-06, + "loss": 0.7422, + "step": 7734 + }, + { + "epoch": 0.6364945484468216, + "grad_norm": 2.1911876189651833, + "learning_rate": 6.165471945593555e-06, + "loss": 0.7135, + "step": 7735 + }, + { + "epoch": 0.6365768360419667, + "grad_norm": 2.0828462909835723, + "learning_rate": 6.16301051331869e-06, + "loss": 0.7253, + "step": 7736 + }, + { + "epoch": 0.6366591236371117, + "grad_norm": 2.185014051934205, + "learning_rate": 6.160549353617426e-06, + "loss": 0.7231, + "step": 7737 + }, + { + "epoch": 0.6367414112322567, + "grad_norm": 1.7108214950029001, + "learning_rate": 6.15808846666459e-06, + "loss": 0.7298, + "step": 7738 + }, + { + "epoch": 0.6368236988274018, + "grad_norm": 0.4075688616670595, + "learning_rate": 6.1556278526350085e-06, + "loss": 0.4732, + "step": 7739 + }, + { + "epoch": 0.6369059864225468, + "grad_norm": 2.5703058005875654, + "learning_rate": 6.153167511703476e-06, + "loss": 0.7203, + "step": 7740 + }, + { + "epoch": 0.6369882740176919, + "grad_norm": 1.9889443625086172, + "learning_rate": 6.150707444044772e-06, + "loss": 0.7416, + "step": 7741 + }, + { + "epoch": 0.6370705616128368, + "grad_norm": 0.4192971614699097, + "learning_rate": 6.148247649833654e-06, + "loss": 0.4655, + "step": 7742 + }, + { + "epoch": 0.6371528492079819, + "grad_norm": 2.259834733362474, + "learning_rate": 6.1457881292448655e-06, + "loss": 0.7205, + "step": 7743 + }, + { + "epoch": 0.6372351368031269, + "grad_norm": 1.7556507994362769, + "learning_rate": 6.143328882453119e-06, + "loss": 0.729, + "step": 7744 + }, + { + "epoch": 0.637317424398272, + "grad_norm": 1.9172751548649989, + "learning_rate": 6.140869909633125e-06, + "loss": 0.77, + "step": 7745 + }, + { + "epoch": 0.6373997119934169, + "grad_norm": 2.013679893454795, + "learning_rate": 6.13841121095956e-06, + "loss": 0.7346, + "step": 7746 + }, + { + "epoch": 0.637481999588562, + "grad_norm": 1.8965892418896675, + "learning_rate": 6.135952786607088e-06, + "loss": 0.7539, + "step": 7747 + }, + { + "epoch": 0.6375642871837071, + "grad_norm": 1.624053192834026, + "learning_rate": 6.133494636750348e-06, + "loss": 0.7373, + "step": 7748 + }, + { + "epoch": 0.6376465747788521, + "grad_norm": 3.0012788701407813, + "learning_rate": 6.131036761563969e-06, + "loss": 0.729, + "step": 7749 + }, + { + "epoch": 0.6377288623739972, + "grad_norm": 1.7978261577787382, + "learning_rate": 6.128579161222545e-06, + "loss": 0.7256, + "step": 7750 + }, + { + "epoch": 0.6378111499691421, + "grad_norm": 2.6682984623862493, + "learning_rate": 6.126121835900673e-06, + "loss": 0.7446, + "step": 7751 + }, + { + "epoch": 0.6378934375642872, + "grad_norm": 1.8304061470682533, + "learning_rate": 6.123664785772905e-06, + "loss": 0.7141, + "step": 7752 + }, + { + "epoch": 0.6379757251594322, + "grad_norm": 1.6718650977659815, + "learning_rate": 6.121208011013795e-06, + "loss": 0.7204, + "step": 7753 + }, + { + "epoch": 0.6380580127545773, + "grad_norm": 1.7132832665732112, + "learning_rate": 6.1187515117978616e-06, + "loss": 0.7071, + "step": 7754 + }, + { + "epoch": 0.6381403003497222, + "grad_norm": 2.144958482552898, + "learning_rate": 6.116295288299616e-06, + "loss": 0.7113, + "step": 7755 + }, + { + "epoch": 0.6382225879448673, + "grad_norm": 1.7204234199319703, + "learning_rate": 6.113839340693539e-06, + "loss": 0.7122, + "step": 7756 + }, + { + "epoch": 0.6383048755400124, + "grad_norm": 0.4048452172397955, + "learning_rate": 6.111383669154105e-06, + "loss": 0.4507, + "step": 7757 + }, + { + "epoch": 0.6383871631351574, + "grad_norm": 1.644253538994966, + "learning_rate": 6.108928273855752e-06, + "loss": 0.7273, + "step": 7758 + }, + { + "epoch": 0.6384694507303024, + "grad_norm": 2.04253315141429, + "learning_rate": 6.106473154972914e-06, + "loss": 0.7153, + "step": 7759 + }, + { + "epoch": 0.6385517383254474, + "grad_norm": 1.8570652606850309, + "learning_rate": 6.1040183126799945e-06, + "loss": 0.7386, + "step": 7760 + }, + { + "epoch": 0.6386340259205925, + "grad_norm": 1.6153800787367236, + "learning_rate": 6.101563747151387e-06, + "loss": 0.7211, + "step": 7761 + }, + { + "epoch": 0.6387163135157375, + "grad_norm": 2.111237363674184, + "learning_rate": 6.0991094585614515e-06, + "loss": 0.7457, + "step": 7762 + }, + { + "epoch": 0.6387986011108825, + "grad_norm": 0.429577344538212, + "learning_rate": 6.096655447084545e-06, + "loss": 0.5079, + "step": 7763 + }, + { + "epoch": 0.6388808887060275, + "grad_norm": 2.743329996221105, + "learning_rate": 6.094201712894989e-06, + "loss": 0.7264, + "step": 7764 + }, + { + "epoch": 0.6389631763011726, + "grad_norm": 1.9756482477623747, + "learning_rate": 6.091748256167101e-06, + "loss": 0.749, + "step": 7765 + }, + { + "epoch": 0.6390454638963177, + "grad_norm": 1.552679400414878, + "learning_rate": 6.089295077075165e-06, + "loss": 0.7063, + "step": 7766 + }, + { + "epoch": 0.6391277514914626, + "grad_norm": 1.7059182666088482, + "learning_rate": 6.086842175793454e-06, + "loss": 0.7343, + "step": 7767 + }, + { + "epoch": 0.6392100390866077, + "grad_norm": 4.074562492065128, + "learning_rate": 6.084389552496215e-06, + "loss": 0.723, + "step": 7768 + }, + { + "epoch": 0.6392923266817527, + "grad_norm": 0.4224139289339871, + "learning_rate": 6.081937207357681e-06, + "loss": 0.49, + "step": 7769 + }, + { + "epoch": 0.6393746142768978, + "grad_norm": 0.4187246774379287, + "learning_rate": 6.079485140552059e-06, + "loss": 0.4903, + "step": 7770 + }, + { + "epoch": 0.6394569018720427, + "grad_norm": 2.130801595310272, + "learning_rate": 6.077033352253545e-06, + "loss": 0.7138, + "step": 7771 + }, + { + "epoch": 0.6395391894671878, + "grad_norm": 1.8234223854397968, + "learning_rate": 6.0745818426363075e-06, + "loss": 0.7136, + "step": 7772 + }, + { + "epoch": 0.6396214770623329, + "grad_norm": 1.9119629121434913, + "learning_rate": 6.0721306118745006e-06, + "loss": 0.7077, + "step": 7773 + }, + { + "epoch": 0.6397037646574779, + "grad_norm": 1.7860864785453352, + "learning_rate": 6.069679660142249e-06, + "loss": 0.7049, + "step": 7774 + }, + { + "epoch": 0.639786052252623, + "grad_norm": 1.8618275069082972, + "learning_rate": 6.067228987613672e-06, + "loss": 0.7269, + "step": 7775 + }, + { + "epoch": 0.6398683398477679, + "grad_norm": 3.950541631159081, + "learning_rate": 6.064778594462854e-06, + "loss": 0.7149, + "step": 7776 + }, + { + "epoch": 0.639950627442913, + "grad_norm": 1.9733440799476738, + "learning_rate": 6.062328480863876e-06, + "loss": 0.7244, + "step": 7777 + }, + { + "epoch": 0.640032915038058, + "grad_norm": 0.43374979480821974, + "learning_rate": 6.059878646990782e-06, + "loss": 0.5003, + "step": 7778 + }, + { + "epoch": 0.6401152026332031, + "grad_norm": 0.41983808564477587, + "learning_rate": 6.057429093017612e-06, + "loss": 0.4748, + "step": 7779 + }, + { + "epoch": 0.640197490228348, + "grad_norm": 0.42266352345450003, + "learning_rate": 6.05497981911837e-06, + "loss": 0.4861, + "step": 7780 + }, + { + "epoch": 0.6402797778234931, + "grad_norm": 0.41342318783587656, + "learning_rate": 6.052530825467054e-06, + "loss": 0.4673, + "step": 7781 + }, + { + "epoch": 0.6403620654186382, + "grad_norm": 0.4161251117813859, + "learning_rate": 6.050082112237633e-06, + "loss": 0.5168, + "step": 7782 + }, + { + "epoch": 0.6404443530137832, + "grad_norm": 1.9464917862275388, + "learning_rate": 6.047633679604067e-06, + "loss": 0.7303, + "step": 7783 + }, + { + "epoch": 0.6405266406089282, + "grad_norm": 2.1772748129907855, + "learning_rate": 6.045185527740277e-06, + "loss": 0.7465, + "step": 7784 + }, + { + "epoch": 0.6406089282040732, + "grad_norm": 3.4594707096930533, + "learning_rate": 6.042737656820187e-06, + "loss": 0.7195, + "step": 7785 + }, + { + "epoch": 0.6406912157992183, + "grad_norm": 1.7658705549410707, + "learning_rate": 6.040290067017682e-06, + "loss": 0.7177, + "step": 7786 + }, + { + "epoch": 0.6407735033943633, + "grad_norm": 1.8728645660556356, + "learning_rate": 6.037842758506641e-06, + "loss": 0.7374, + "step": 7787 + }, + { + "epoch": 0.6408557909895083, + "grad_norm": 0.40772024075653335, + "learning_rate": 6.03539573146091e-06, + "loss": 0.4797, + "step": 7788 + }, + { + "epoch": 0.6409380785846533, + "grad_norm": 0.415300951696093, + "learning_rate": 6.032948986054332e-06, + "loss": 0.4861, + "step": 7789 + }, + { + "epoch": 0.6410203661797984, + "grad_norm": 4.820144052685122, + "learning_rate": 6.030502522460706e-06, + "loss": 0.7275, + "step": 7790 + }, + { + "epoch": 0.6411026537749435, + "grad_norm": 2.0958080813683737, + "learning_rate": 6.028056340853838e-06, + "loss": 0.7122, + "step": 7791 + }, + { + "epoch": 0.6411849413700884, + "grad_norm": 1.794926805010093, + "learning_rate": 6.025610441407494e-06, + "loss": 0.7327, + "step": 7792 + }, + { + "epoch": 0.6412672289652335, + "grad_norm": 2.290225102074725, + "learning_rate": 6.023164824295429e-06, + "loss": 0.7396, + "step": 7793 + }, + { + "epoch": 0.6413495165603785, + "grad_norm": 1.9080327079972568, + "learning_rate": 6.020719489691371e-06, + "loss": 0.7177, + "step": 7794 + }, + { + "epoch": 0.6414318041555236, + "grad_norm": 1.6303912052196579, + "learning_rate": 6.018274437769042e-06, + "loss": 0.7711, + "step": 7795 + }, + { + "epoch": 0.6415140917506685, + "grad_norm": 1.864285796183624, + "learning_rate": 6.015829668702125e-06, + "loss": 0.7468, + "step": 7796 + }, + { + "epoch": 0.6415963793458136, + "grad_norm": 1.751756515112598, + "learning_rate": 6.013385182664299e-06, + "loss": 0.7234, + "step": 7797 + }, + { + "epoch": 0.6416786669409587, + "grad_norm": 3.1264924717244176, + "learning_rate": 6.010940979829211e-06, + "loss": 0.7655, + "step": 7798 + }, + { + "epoch": 0.6417609545361037, + "grad_norm": 1.7121055140420551, + "learning_rate": 6.008497060370501e-06, + "loss": 0.7194, + "step": 7799 + }, + { + "epoch": 0.6418432421312487, + "grad_norm": 1.8419300028601957, + "learning_rate": 6.006053424461773e-06, + "loss": 0.7071, + "step": 7800 + }, + { + "epoch": 0.6419255297263937, + "grad_norm": 1.937345455288174, + "learning_rate": 6.0036100722766246e-06, + "loss": 0.7134, + "step": 7801 + }, + { + "epoch": 0.6420078173215388, + "grad_norm": 2.4095824193830633, + "learning_rate": 6.0011670039886205e-06, + "loss": 0.7331, + "step": 7802 + }, + { + "epoch": 0.6420901049166838, + "grad_norm": 1.721664209651507, + "learning_rate": 5.9987242197713215e-06, + "loss": 0.7311, + "step": 7803 + }, + { + "epoch": 0.6421723925118289, + "grad_norm": 2.3099896607350865, + "learning_rate": 5.996281719798254e-06, + "loss": 0.7544, + "step": 7804 + }, + { + "epoch": 0.6422546801069738, + "grad_norm": 1.7070384700171692, + "learning_rate": 5.993839504242931e-06, + "loss": 0.7401, + "step": 7805 + }, + { + "epoch": 0.6423369677021189, + "grad_norm": 2.2439566612486206, + "learning_rate": 5.991397573278841e-06, + "loss": 0.7408, + "step": 7806 + }, + { + "epoch": 0.642419255297264, + "grad_norm": 1.9995314306144316, + "learning_rate": 5.988955927079458e-06, + "loss": 0.775, + "step": 7807 + }, + { + "epoch": 0.642501542892409, + "grad_norm": 2.103671933768807, + "learning_rate": 5.9865145658182275e-06, + "loss": 0.7614, + "step": 7808 + }, + { + "epoch": 0.642583830487554, + "grad_norm": 1.9361532661184009, + "learning_rate": 5.984073489668589e-06, + "loss": 0.7391, + "step": 7809 + }, + { + "epoch": 0.642666118082699, + "grad_norm": 0.42013554465931785, + "learning_rate": 5.981632698803943e-06, + "loss": 0.4997, + "step": 7810 + }, + { + "epoch": 0.6427484056778441, + "grad_norm": 2.4523565873690223, + "learning_rate": 5.979192193397687e-06, + "loss": 0.7333, + "step": 7811 + }, + { + "epoch": 0.6428306932729891, + "grad_norm": 2.3128073840651724, + "learning_rate": 5.976751973623185e-06, + "loss": 0.7626, + "step": 7812 + }, + { + "epoch": 0.6429129808681341, + "grad_norm": 1.9384515521701324, + "learning_rate": 5.9743120396537916e-06, + "loss": 0.7243, + "step": 7813 + }, + { + "epoch": 0.6429952684632791, + "grad_norm": 2.0515880508980686, + "learning_rate": 5.971872391662828e-06, + "loss": 0.7341, + "step": 7814 + }, + { + "epoch": 0.6430775560584242, + "grad_norm": 3.315048250793878, + "learning_rate": 5.96943302982361e-06, + "loss": 0.7417, + "step": 7815 + }, + { + "epoch": 0.6431598436535693, + "grad_norm": 1.7243809920536257, + "learning_rate": 5.966993954309423e-06, + "loss": 0.7227, + "step": 7816 + }, + { + "epoch": 0.6432421312487142, + "grad_norm": 1.9665448781378425, + "learning_rate": 5.964555165293539e-06, + "loss": 0.6881, + "step": 7817 + }, + { + "epoch": 0.6433244188438593, + "grad_norm": 2.162996833891354, + "learning_rate": 5.962116662949199e-06, + "loss": 0.7204, + "step": 7818 + }, + { + "epoch": 0.6434067064390043, + "grad_norm": 2.28863547314551, + "learning_rate": 5.959678447449635e-06, + "loss": 0.7583, + "step": 7819 + }, + { + "epoch": 0.6434889940341494, + "grad_norm": 1.9895512077892297, + "learning_rate": 5.9572405189680495e-06, + "loss": 0.7299, + "step": 7820 + }, + { + "epoch": 0.6435712816292943, + "grad_norm": 1.8926325662523136, + "learning_rate": 5.954802877677638e-06, + "loss": 0.713, + "step": 7821 + }, + { + "epoch": 0.6436535692244394, + "grad_norm": 1.981743769597512, + "learning_rate": 5.952365523751554e-06, + "loss": 0.7143, + "step": 7822 + }, + { + "epoch": 0.6437358568195845, + "grad_norm": 2.532899853480285, + "learning_rate": 5.949928457362954e-06, + "loss": 0.7575, + "step": 7823 + }, + { + "epoch": 0.6438181444147295, + "grad_norm": 4.114891906091397, + "learning_rate": 5.947491678684959e-06, + "loss": 0.72, + "step": 7824 + }, + { + "epoch": 0.6439004320098745, + "grad_norm": 2.054452202473746, + "learning_rate": 5.945055187890675e-06, + "loss": 0.7476, + "step": 7825 + }, + { + "epoch": 0.6439827196050195, + "grad_norm": 1.8483514761749944, + "learning_rate": 5.942618985153181e-06, + "loss": 0.6956, + "step": 7826 + }, + { + "epoch": 0.6440650072001646, + "grad_norm": 1.87421605525078, + "learning_rate": 5.940183070645552e-06, + "loss": 0.717, + "step": 7827 + }, + { + "epoch": 0.6441472947953096, + "grad_norm": 1.763025415379264, + "learning_rate": 5.937747444540819e-06, + "loss": 0.7159, + "step": 7828 + }, + { + "epoch": 0.6442295823904547, + "grad_norm": 1.965119648881471, + "learning_rate": 5.935312107012015e-06, + "loss": 0.7017, + "step": 7829 + }, + { + "epoch": 0.6443118699855996, + "grad_norm": 2.3406159421831294, + "learning_rate": 5.932877058232136e-06, + "loss": 0.7012, + "step": 7830 + }, + { + "epoch": 0.6443941575807447, + "grad_norm": 2.037084550714441, + "learning_rate": 5.930442298374168e-06, + "loss": 0.7344, + "step": 7831 + }, + { + "epoch": 0.6444764451758898, + "grad_norm": 1.7062079155615253, + "learning_rate": 5.928007827611069e-06, + "loss": 0.7173, + "step": 7832 + }, + { + "epoch": 0.6445587327710348, + "grad_norm": 0.4070444619062846, + "learning_rate": 5.925573646115785e-06, + "loss": 0.4816, + "step": 7833 + }, + { + "epoch": 0.6446410203661798, + "grad_norm": 0.40725294712014637, + "learning_rate": 5.923139754061228e-06, + "loss": 0.5051, + "step": 7834 + }, + { + "epoch": 0.6447233079613248, + "grad_norm": 2.2521986272667114, + "learning_rate": 5.920706151620307e-06, + "loss": 0.7633, + "step": 7835 + }, + { + "epoch": 0.6448055955564699, + "grad_norm": 2.343256517389742, + "learning_rate": 5.918272838965895e-06, + "loss": 0.7071, + "step": 7836 + }, + { + "epoch": 0.6448878831516149, + "grad_norm": 2.1785160756702258, + "learning_rate": 5.9158398162708555e-06, + "loss": 0.7303, + "step": 7837 + }, + { + "epoch": 0.6449701707467599, + "grad_norm": 2.0900914901413024, + "learning_rate": 5.913407083708021e-06, + "loss": 0.7233, + "step": 7838 + }, + { + "epoch": 0.6450524583419049, + "grad_norm": 1.7938987267098365, + "learning_rate": 5.910974641450214e-06, + "loss": 0.7087, + "step": 7839 + }, + { + "epoch": 0.64513474593705, + "grad_norm": 2.013430293668321, + "learning_rate": 5.908542489670225e-06, + "loss": 0.74, + "step": 7840 + }, + { + "epoch": 0.6452170335321951, + "grad_norm": 2.1582800691197175, + "learning_rate": 5.906110628540837e-06, + "loss": 0.7316, + "step": 7841 + }, + { + "epoch": 0.64529932112734, + "grad_norm": 1.8077550556168211, + "learning_rate": 5.903679058234803e-06, + "loss": 0.7083, + "step": 7842 + }, + { + "epoch": 0.6453816087224851, + "grad_norm": 0.4140713160191435, + "learning_rate": 5.901247778924859e-06, + "loss": 0.5098, + "step": 7843 + }, + { + "epoch": 0.6454638963176301, + "grad_norm": 2.1648292286605404, + "learning_rate": 5.898816790783715e-06, + "loss": 0.7201, + "step": 7844 + }, + { + "epoch": 0.6455461839127752, + "grad_norm": 2.0470156736648146, + "learning_rate": 5.8963860939840704e-06, + "loss": 0.7261, + "step": 7845 + }, + { + "epoch": 0.6456284715079201, + "grad_norm": 2.2894955368829324, + "learning_rate": 5.89395568869859e-06, + "loss": 0.757, + "step": 7846 + }, + { + "epoch": 0.6457107591030652, + "grad_norm": 2.0740701976784193, + "learning_rate": 5.891525575099934e-06, + "loss": 0.7111, + "step": 7847 + }, + { + "epoch": 0.6457930466982102, + "grad_norm": 1.8528131524108304, + "learning_rate": 5.88909575336073e-06, + "loss": 0.7293, + "step": 7848 + }, + { + "epoch": 0.6458753342933553, + "grad_norm": 2.3433904673929997, + "learning_rate": 5.886666223653592e-06, + "loss": 0.7, + "step": 7849 + }, + { + "epoch": 0.6459576218885003, + "grad_norm": 2.026029264888549, + "learning_rate": 5.8842369861511035e-06, + "loss": 0.7298, + "step": 7850 + }, + { + "epoch": 0.6460399094836453, + "grad_norm": 2.078975083007053, + "learning_rate": 5.8818080410258404e-06, + "loss": 0.7143, + "step": 7851 + }, + { + "epoch": 0.6461221970787904, + "grad_norm": 0.4265653536998042, + "learning_rate": 5.879379388450344e-06, + "loss": 0.4697, + "step": 7852 + }, + { + "epoch": 0.6462044846739354, + "grad_norm": 3.4563034805230624, + "learning_rate": 5.876951028597151e-06, + "loss": 0.701, + "step": 7853 + }, + { + "epoch": 0.6462867722690805, + "grad_norm": 0.41429787667014534, + "learning_rate": 5.874522961638758e-06, + "loss": 0.4866, + "step": 7854 + }, + { + "epoch": 0.6463690598642254, + "grad_norm": 2.1450646791873185, + "learning_rate": 5.8720951877476595e-06, + "loss": 0.6985, + "step": 7855 + }, + { + "epoch": 0.6464513474593705, + "grad_norm": 1.8891264549181581, + "learning_rate": 5.869667707096315e-06, + "loss": 0.7226, + "step": 7856 + }, + { + "epoch": 0.6465336350545156, + "grad_norm": 1.9973368373912819, + "learning_rate": 5.867240519857173e-06, + "loss": 0.7412, + "step": 7857 + }, + { + "epoch": 0.6466159226496606, + "grad_norm": 2.3145984446965717, + "learning_rate": 5.864813626202652e-06, + "loss": 0.7034, + "step": 7858 + }, + { + "epoch": 0.6466982102448056, + "grad_norm": 1.894703844227745, + "learning_rate": 5.862387026305163e-06, + "loss": 0.7347, + "step": 7859 + }, + { + "epoch": 0.6467804978399506, + "grad_norm": 2.7052541612678973, + "learning_rate": 5.859960720337076e-06, + "loss": 0.7135, + "step": 7860 + }, + { + "epoch": 0.6468627854350957, + "grad_norm": 2.3637245343962916, + "learning_rate": 5.857534708470764e-06, + "loss": 0.7161, + "step": 7861 + }, + { + "epoch": 0.6469450730302407, + "grad_norm": 4.071559740628326, + "learning_rate": 5.8551089908785585e-06, + "loss": 0.703, + "step": 7862 + }, + { + "epoch": 0.6470273606253857, + "grad_norm": 2.46918008767158, + "learning_rate": 5.852683567732784e-06, + "loss": 0.7447, + "step": 7863 + }, + { + "epoch": 0.6471096482205307, + "grad_norm": 2.4671472282435514, + "learning_rate": 5.8502584392057335e-06, + "loss": 0.7212, + "step": 7864 + }, + { + "epoch": 0.6471919358156758, + "grad_norm": 2.85973273646127, + "learning_rate": 5.847833605469689e-06, + "loss": 0.7487, + "step": 7865 + }, + { + "epoch": 0.6472742234108209, + "grad_norm": 2.2753654017178353, + "learning_rate": 5.845409066696901e-06, + "loss": 0.7276, + "step": 7866 + }, + { + "epoch": 0.6473565110059658, + "grad_norm": 2.4345793780271134, + "learning_rate": 5.842984823059613e-06, + "loss": 0.7339, + "step": 7867 + }, + { + "epoch": 0.6474387986011109, + "grad_norm": 0.4139377530521681, + "learning_rate": 5.840560874730032e-06, + "loss": 0.4784, + "step": 7868 + }, + { + "epoch": 0.6475210861962559, + "grad_norm": 2.940524796867118, + "learning_rate": 5.838137221880358e-06, + "loss": 0.7301, + "step": 7869 + }, + { + "epoch": 0.647603373791401, + "grad_norm": 2.5338434204995215, + "learning_rate": 5.835713864682755e-06, + "loss": 0.7273, + "step": 7870 + }, + { + "epoch": 0.6476856613865459, + "grad_norm": 2.251749102600391, + "learning_rate": 5.8332908033093825e-06, + "loss": 0.7183, + "step": 7871 + }, + { + "epoch": 0.647767948981691, + "grad_norm": 2.4515768685003185, + "learning_rate": 5.830868037932369e-06, + "loss": 0.7188, + "step": 7872 + }, + { + "epoch": 0.647850236576836, + "grad_norm": 2.1175103125786547, + "learning_rate": 5.828445568723822e-06, + "loss": 0.765, + "step": 7873 + }, + { + "epoch": 0.6479325241719811, + "grad_norm": 2.3610349672858866, + "learning_rate": 5.826023395855825e-06, + "loss": 0.7452, + "step": 7874 + }, + { + "epoch": 0.648014811767126, + "grad_norm": 1.9855161307856124, + "learning_rate": 5.823601519500457e-06, + "loss": 0.7033, + "step": 7875 + }, + { + "epoch": 0.6480970993622711, + "grad_norm": 0.41643811275649567, + "learning_rate": 5.821179939829753e-06, + "loss": 0.4723, + "step": 7876 + }, + { + "epoch": 0.6481793869574162, + "grad_norm": 2.6150139409611874, + "learning_rate": 5.818758657015746e-06, + "loss": 0.7228, + "step": 7877 + }, + { + "epoch": 0.6482616745525612, + "grad_norm": 2.4968416393585473, + "learning_rate": 5.816337671230438e-06, + "loss": 0.7401, + "step": 7878 + }, + { + "epoch": 0.6483439621477063, + "grad_norm": 3.4500287122713407, + "learning_rate": 5.8139169826458104e-06, + "loss": 0.7135, + "step": 7879 + }, + { + "epoch": 0.6484262497428512, + "grad_norm": 0.3949147118532159, + "learning_rate": 5.8114965914338225e-06, + "loss": 0.4867, + "step": 7880 + }, + { + "epoch": 0.6485085373379963, + "grad_norm": 2.012522843085755, + "learning_rate": 5.809076497766422e-06, + "loss": 0.7214, + "step": 7881 + }, + { + "epoch": 0.6485908249331414, + "grad_norm": 2.967690024822581, + "learning_rate": 5.806656701815521e-06, + "loss": 0.7283, + "step": 7882 + }, + { + "epoch": 0.6486731125282864, + "grad_norm": 0.4252079892305907, + "learning_rate": 5.804237203753026e-06, + "loss": 0.4811, + "step": 7883 + }, + { + "epoch": 0.6487554001234314, + "grad_norm": 2.6924948243868725, + "learning_rate": 5.8018180037508094e-06, + "loss": 0.7129, + "step": 7884 + }, + { + "epoch": 0.6488376877185764, + "grad_norm": 2.206668616850765, + "learning_rate": 5.799399101980727e-06, + "loss": 0.7452, + "step": 7885 + }, + { + "epoch": 0.6489199753137215, + "grad_norm": 1.9258506832553175, + "learning_rate": 5.796980498614611e-06, + "loss": 0.7368, + "step": 7886 + }, + { + "epoch": 0.6490022629088665, + "grad_norm": 2.043058167516074, + "learning_rate": 5.794562193824283e-06, + "loss": 0.727, + "step": 7887 + }, + { + "epoch": 0.6490845505040115, + "grad_norm": 2.3362176626006295, + "learning_rate": 5.792144187781528e-06, + "loss": 0.7511, + "step": 7888 + }, + { + "epoch": 0.6491668380991565, + "grad_norm": 2.4495720062792454, + "learning_rate": 5.7897264806581236e-06, + "loss": 0.7296, + "step": 7889 + }, + { + "epoch": 0.6492491256943016, + "grad_norm": 2.557078567606202, + "learning_rate": 5.787309072625815e-06, + "loss": 0.7334, + "step": 7890 + }, + { + "epoch": 0.6493314132894467, + "grad_norm": 2.1804971261919275, + "learning_rate": 5.784891963856335e-06, + "loss": 0.769, + "step": 7891 + }, + { + "epoch": 0.6494137008845916, + "grad_norm": 2.4020934044088107, + "learning_rate": 5.782475154521384e-06, + "loss": 0.7299, + "step": 7892 + }, + { + "epoch": 0.6494959884797367, + "grad_norm": 2.3949064438779946, + "learning_rate": 5.780058644792656e-06, + "loss": 0.761, + "step": 7893 + }, + { + "epoch": 0.6495782760748817, + "grad_norm": 2.085231413835885, + "learning_rate": 5.777642434841812e-06, + "loss": 0.7588, + "step": 7894 + }, + { + "epoch": 0.6496605636700268, + "grad_norm": 2.078094661002405, + "learning_rate": 5.775226524840498e-06, + "loss": 0.7304, + "step": 7895 + }, + { + "epoch": 0.6497428512651717, + "grad_norm": 2.154646948902891, + "learning_rate": 5.772810914960336e-06, + "loss": 0.737, + "step": 7896 + }, + { + "epoch": 0.6498251388603168, + "grad_norm": 0.40668190359511813, + "learning_rate": 5.770395605372927e-06, + "loss": 0.5049, + "step": 7897 + }, + { + "epoch": 0.6499074264554618, + "grad_norm": 2.1538351023780566, + "learning_rate": 5.767980596249845e-06, + "loss": 0.7153, + "step": 7898 + }, + { + "epoch": 0.6499897140506069, + "grad_norm": 2.010840936725749, + "learning_rate": 5.765565887762658e-06, + "loss": 0.705, + "step": 7899 + }, + { + "epoch": 0.6500720016457519, + "grad_norm": 2.2569847699032457, + "learning_rate": 5.7631514800828935e-06, + "loss": 0.7542, + "step": 7900 + }, + { + "epoch": 0.6501542892408969, + "grad_norm": 3.138646290498214, + "learning_rate": 5.760737373382077e-06, + "loss": 0.7114, + "step": 7901 + }, + { + "epoch": 0.650236576836042, + "grad_norm": 1.7925936074981248, + "learning_rate": 5.758323567831697e-06, + "loss": 0.7228, + "step": 7902 + }, + { + "epoch": 0.650318864431187, + "grad_norm": 1.9356284967852626, + "learning_rate": 5.755910063603223e-06, + "loss": 0.7774, + "step": 7903 + }, + { + "epoch": 0.6504011520263321, + "grad_norm": 2.24119471295261, + "learning_rate": 5.753496860868115e-06, + "loss": 0.7488, + "step": 7904 + }, + { + "epoch": 0.650483439621477, + "grad_norm": 2.565509732439287, + "learning_rate": 5.751083959797796e-06, + "loss": 0.7419, + "step": 7905 + }, + { + "epoch": 0.6505657272166221, + "grad_norm": 0.4346186674919707, + "learning_rate": 5.74867136056368e-06, + "loss": 0.4812, + "step": 7906 + }, + { + "epoch": 0.6506480148117672, + "grad_norm": 2.337147338124947, + "learning_rate": 5.746259063337152e-06, + "loss": 0.712, + "step": 7907 + }, + { + "epoch": 0.6507303024069122, + "grad_norm": 1.8078914223211566, + "learning_rate": 5.743847068289576e-06, + "loss": 0.7566, + "step": 7908 + }, + { + "epoch": 0.6508125900020572, + "grad_norm": 2.9820177018229814, + "learning_rate": 5.7414353755922944e-06, + "loss": 0.7213, + "step": 7909 + }, + { + "epoch": 0.6508948775972022, + "grad_norm": 2.017441241261812, + "learning_rate": 5.739023985416638e-06, + "loss": 0.7206, + "step": 7910 + }, + { + "epoch": 0.6509771651923473, + "grad_norm": 1.7323572437120163, + "learning_rate": 5.736612897933898e-06, + "loss": 0.7552, + "step": 7911 + }, + { + "epoch": 0.6510594527874923, + "grad_norm": 0.4145048688653215, + "learning_rate": 5.7342021133153635e-06, + "loss": 0.4664, + "step": 7912 + }, + { + "epoch": 0.6511417403826373, + "grad_norm": 2.092946592276492, + "learning_rate": 5.731791631732291e-06, + "loss": 0.7112, + "step": 7913 + }, + { + "epoch": 0.6512240279777823, + "grad_norm": 2.1013586495268535, + "learning_rate": 5.729381453355912e-06, + "loss": 0.758, + "step": 7914 + }, + { + "epoch": 0.6513063155729274, + "grad_norm": 2.4941100095138196, + "learning_rate": 5.726971578357443e-06, + "loss": 0.7039, + "step": 7915 + }, + { + "epoch": 0.6513886031680725, + "grad_norm": 2.1085614867247493, + "learning_rate": 5.724562006908081e-06, + "loss": 0.7454, + "step": 7916 + }, + { + "epoch": 0.6514708907632174, + "grad_norm": 0.42530293528242413, + "learning_rate": 5.722152739178995e-06, + "loss": 0.4832, + "step": 7917 + }, + { + "epoch": 0.6515531783583625, + "grad_norm": 0.4197023786191634, + "learning_rate": 5.719743775341343e-06, + "loss": 0.4765, + "step": 7918 + }, + { + "epoch": 0.6516354659535075, + "grad_norm": 1.9378059135859556, + "learning_rate": 5.7173351155662406e-06, + "loss": 0.7388, + "step": 7919 + }, + { + "epoch": 0.6517177535486526, + "grad_norm": 0.41645350331184555, + "learning_rate": 5.714926760024807e-06, + "loss": 0.4887, + "step": 7920 + }, + { + "epoch": 0.6518000411437975, + "grad_norm": 2.006392359796858, + "learning_rate": 5.71251870888812e-06, + "loss": 0.7097, + "step": 7921 + }, + { + "epoch": 0.6518823287389426, + "grad_norm": 0.42002286394410593, + "learning_rate": 5.710110962327249e-06, + "loss": 0.4847, + "step": 7922 + }, + { + "epoch": 0.6519646163340876, + "grad_norm": 0.44944842551724734, + "learning_rate": 5.707703520513232e-06, + "loss": 0.5058, + "step": 7923 + }, + { + "epoch": 0.6520469039292327, + "grad_norm": 2.2057993498284523, + "learning_rate": 5.7052963836171e-06, + "loss": 0.7323, + "step": 7924 + }, + { + "epoch": 0.6521291915243776, + "grad_norm": 1.6885388925619726, + "learning_rate": 5.702889551809837e-06, + "loss": 0.7125, + "step": 7925 + }, + { + "epoch": 0.6522114791195227, + "grad_norm": 1.9124255903579557, + "learning_rate": 5.700483025262431e-06, + "loss": 0.7251, + "step": 7926 + }, + { + "epoch": 0.6522937667146678, + "grad_norm": 2.2365083082045563, + "learning_rate": 5.698076804145831e-06, + "loss": 0.7499, + "step": 7927 + }, + { + "epoch": 0.6523760543098128, + "grad_norm": 2.151981300863038, + "learning_rate": 5.69567088863098e-06, + "loss": 0.748, + "step": 7928 + }, + { + "epoch": 0.6524583419049578, + "grad_norm": 1.7330190156381857, + "learning_rate": 5.6932652788887856e-06, + "loss": 0.7328, + "step": 7929 + }, + { + "epoch": 0.6525406295001028, + "grad_norm": 0.42832091277294054, + "learning_rate": 5.690859975090137e-06, + "loss": 0.4821, + "step": 7930 + }, + { + "epoch": 0.6526229170952479, + "grad_norm": 2.200870050915053, + "learning_rate": 5.6884549774059e-06, + "loss": 0.7256, + "step": 7931 + }, + { + "epoch": 0.652705204690393, + "grad_norm": 0.40434326840067825, + "learning_rate": 5.686050286006931e-06, + "loss": 0.4796, + "step": 7932 + }, + { + "epoch": 0.652787492285538, + "grad_norm": 2.7210390459611986, + "learning_rate": 5.683645901064046e-06, + "loss": 0.7168, + "step": 7933 + }, + { + "epoch": 0.652869779880683, + "grad_norm": 2.172225563930739, + "learning_rate": 5.681241822748058e-06, + "loss": 0.7302, + "step": 7934 + }, + { + "epoch": 0.652952067475828, + "grad_norm": 2.2436562388013153, + "learning_rate": 5.678838051229744e-06, + "loss": 0.7175, + "step": 7935 + }, + { + "epoch": 0.6530343550709731, + "grad_norm": 1.678773603829532, + "learning_rate": 5.6764345866798645e-06, + "loss": 0.7217, + "step": 7936 + }, + { + "epoch": 0.6531166426661181, + "grad_norm": 2.0209364861300148, + "learning_rate": 5.674031429269151e-06, + "loss": 0.7288, + "step": 7937 + }, + { + "epoch": 0.6531989302612631, + "grad_norm": 1.7439853042770517, + "learning_rate": 5.671628579168333e-06, + "loss": 0.7039, + "step": 7938 + }, + { + "epoch": 0.6532812178564081, + "grad_norm": 1.8892414177777892, + "learning_rate": 5.669226036548094e-06, + "loss": 0.6951, + "step": 7939 + }, + { + "epoch": 0.6533635054515532, + "grad_norm": 1.9983987311317213, + "learning_rate": 5.666823801579115e-06, + "loss": 0.71, + "step": 7940 + }, + { + "epoch": 0.6534457930466983, + "grad_norm": 0.41415189087895704, + "learning_rate": 5.6644218744320426e-06, + "loss": 0.4832, + "step": 7941 + }, + { + "epoch": 0.6535280806418432, + "grad_norm": 0.41057345616572877, + "learning_rate": 5.662020255277507e-06, + "loss": 0.4799, + "step": 7942 + }, + { + "epoch": 0.6536103682369883, + "grad_norm": 2.246000177496527, + "learning_rate": 5.659618944286112e-06, + "loss": 0.7401, + "step": 7943 + }, + { + "epoch": 0.6536926558321333, + "grad_norm": 0.4019613536207622, + "learning_rate": 5.657217941628448e-06, + "loss": 0.483, + "step": 7944 + }, + { + "epoch": 0.6537749434272784, + "grad_norm": 2.3344186649398315, + "learning_rate": 5.654817247475073e-06, + "loss": 0.7056, + "step": 7945 + }, + { + "epoch": 0.6538572310224233, + "grad_norm": 2.050403588235336, + "learning_rate": 5.652416861996535e-06, + "loss": 0.7214, + "step": 7946 + }, + { + "epoch": 0.6539395186175684, + "grad_norm": 1.9219197844242004, + "learning_rate": 5.65001678536335e-06, + "loss": 0.7287, + "step": 7947 + }, + { + "epoch": 0.6540218062127134, + "grad_norm": 0.41254720738253614, + "learning_rate": 5.647617017746017e-06, + "loss": 0.4757, + "step": 7948 + }, + { + "epoch": 0.6541040938078585, + "grad_norm": 2.3086848410786036, + "learning_rate": 5.645217559315006e-06, + "loss": 0.7025, + "step": 7949 + }, + { + "epoch": 0.6541863814030034, + "grad_norm": 1.9502685149288348, + "learning_rate": 5.64281841024078e-06, + "loss": 0.6963, + "step": 7950 + }, + { + "epoch": 0.6542686689981485, + "grad_norm": 2.015075822702565, + "learning_rate": 5.6404195706937605e-06, + "loss": 0.7307, + "step": 7951 + }, + { + "epoch": 0.6543509565932936, + "grad_norm": 1.8019890745359826, + "learning_rate": 5.638021040844367e-06, + "loss": 0.7356, + "step": 7952 + }, + { + "epoch": 0.6544332441884386, + "grad_norm": 5.614547937501384, + "learning_rate": 5.635622820862983e-06, + "loss": 0.7317, + "step": 7953 + }, + { + "epoch": 0.6545155317835836, + "grad_norm": 0.4204068332138207, + "learning_rate": 5.633224910919974e-06, + "loss": 0.5127, + "step": 7954 + }, + { + "epoch": 0.6545978193787286, + "grad_norm": 2.3641811106220088, + "learning_rate": 5.630827311185679e-06, + "loss": 0.7238, + "step": 7955 + }, + { + "epoch": 0.6546801069738737, + "grad_norm": 2.0711258780941555, + "learning_rate": 5.628430021830429e-06, + "loss": 0.7499, + "step": 7956 + }, + { + "epoch": 0.6547623945690187, + "grad_norm": 1.9183753729027844, + "learning_rate": 5.6260330430245145e-06, + "loss": 0.7075, + "step": 7957 + }, + { + "epoch": 0.6548446821641638, + "grad_norm": 0.4128116035729483, + "learning_rate": 5.623636374938223e-06, + "loss": 0.4737, + "step": 7958 + }, + { + "epoch": 0.6549269697593088, + "grad_norm": 2.0633003783121597, + "learning_rate": 5.621240017741802e-06, + "loss": 0.7194, + "step": 7959 + }, + { + "epoch": 0.6550092573544538, + "grad_norm": 2.4520151683059326, + "learning_rate": 5.6188439716054895e-06, + "loss": 0.715, + "step": 7960 + }, + { + "epoch": 0.6550915449495989, + "grad_norm": 2.0977125878853013, + "learning_rate": 5.61644823669949e-06, + "loss": 0.7265, + "step": 7961 + }, + { + "epoch": 0.6551738325447439, + "grad_norm": 1.6241211646523084, + "learning_rate": 5.614052813194002e-06, + "loss": 0.7561, + "step": 7962 + }, + { + "epoch": 0.6552561201398889, + "grad_norm": 2.7396331976685864, + "learning_rate": 5.6116577012591825e-06, + "loss": 0.7342, + "step": 7963 + }, + { + "epoch": 0.6553384077350339, + "grad_norm": 2.8200785025835313, + "learning_rate": 5.609262901065187e-06, + "loss": 0.7346, + "step": 7964 + }, + { + "epoch": 0.655420695330179, + "grad_norm": 2.9304829126896235, + "learning_rate": 5.606868412782131e-06, + "loss": 0.7008, + "step": 7965 + }, + { + "epoch": 0.655502982925324, + "grad_norm": 2.1467217879257725, + "learning_rate": 5.60447423658012e-06, + "loss": 0.7262, + "step": 7966 + }, + { + "epoch": 0.655585270520469, + "grad_norm": 2.1349198554488686, + "learning_rate": 5.602080372629224e-06, + "loss": 0.7171, + "step": 7967 + }, + { + "epoch": 0.6556675581156141, + "grad_norm": 2.0927406056364557, + "learning_rate": 5.599686821099508e-06, + "loss": 0.7388, + "step": 7968 + }, + { + "epoch": 0.6557498457107591, + "grad_norm": 2.124545567795085, + "learning_rate": 5.597293582161001e-06, + "loss": 0.7502, + "step": 7969 + }, + { + "epoch": 0.6558321333059042, + "grad_norm": 2.333558303254436, + "learning_rate": 5.5949006559837194e-06, + "loss": 0.7219, + "step": 7970 + }, + { + "epoch": 0.6559144209010491, + "grad_norm": 0.42278531818380805, + "learning_rate": 5.59250804273765e-06, + "loss": 0.5029, + "step": 7971 + }, + { + "epoch": 0.6559967084961942, + "grad_norm": 0.4110970912117105, + "learning_rate": 5.590115742592762e-06, + "loss": 0.4682, + "step": 7972 + }, + { + "epoch": 0.6560789960913392, + "grad_norm": 1.5917896048941766, + "learning_rate": 5.587723755718994e-06, + "loss": 0.7451, + "step": 7973 + }, + { + "epoch": 0.6561612836864843, + "grad_norm": 1.7750312180727132, + "learning_rate": 5.585332082286279e-06, + "loss": 0.701, + "step": 7974 + }, + { + "epoch": 0.6562435712816292, + "grad_norm": 2.090393643922597, + "learning_rate": 5.5829407224645074e-06, + "loss": 0.7085, + "step": 7975 + }, + { + "epoch": 0.6563258588767743, + "grad_norm": 0.4065435065284663, + "learning_rate": 5.580549676423568e-06, + "loss": 0.4758, + "step": 7976 + }, + { + "epoch": 0.6564081464719194, + "grad_norm": 1.913519243242606, + "learning_rate": 5.57815894433331e-06, + "loss": 0.717, + "step": 7977 + }, + { + "epoch": 0.6564904340670644, + "grad_norm": 2.009904403960007, + "learning_rate": 5.575768526363571e-06, + "loss": 0.7245, + "step": 7978 + }, + { + "epoch": 0.6565727216622094, + "grad_norm": 2.7349058694125765, + "learning_rate": 5.573378422684155e-06, + "loss": 0.7339, + "step": 7979 + }, + { + "epoch": 0.6566550092573544, + "grad_norm": 1.9148146753046331, + "learning_rate": 5.570988633464861e-06, + "loss": 0.7415, + "step": 7980 + }, + { + "epoch": 0.6567372968524995, + "grad_norm": 1.966692011432542, + "learning_rate": 5.5685991588754476e-06, + "loss": 0.7291, + "step": 7981 + }, + { + "epoch": 0.6568195844476445, + "grad_norm": 2.1570590841002946, + "learning_rate": 5.566209999085667e-06, + "loss": 0.7463, + "step": 7982 + }, + { + "epoch": 0.6569018720427896, + "grad_norm": 2.410350705479689, + "learning_rate": 5.563821154265237e-06, + "loss": 0.7263, + "step": 7983 + }, + { + "epoch": 0.6569841596379346, + "grad_norm": 2.2778003276590724, + "learning_rate": 5.561432624583857e-06, + "loss": 0.7418, + "step": 7984 + }, + { + "epoch": 0.6570664472330796, + "grad_norm": 1.7664646650160627, + "learning_rate": 5.5590444102112005e-06, + "loss": 0.7313, + "step": 7985 + }, + { + "epoch": 0.6571487348282247, + "grad_norm": 2.0292898933390555, + "learning_rate": 5.556656511316933e-06, + "loss": 0.7217, + "step": 7986 + }, + { + "epoch": 0.6572310224233697, + "grad_norm": 2.2790122440808886, + "learning_rate": 5.554268928070675e-06, + "loss": 0.7532, + "step": 7987 + }, + { + "epoch": 0.6573133100185147, + "grad_norm": 2.0757232564996353, + "learning_rate": 5.551881660642051e-06, + "loss": 0.6899, + "step": 7988 + }, + { + "epoch": 0.6573955976136597, + "grad_norm": 5.414205024461689, + "learning_rate": 5.549494709200633e-06, + "loss": 0.7158, + "step": 7989 + }, + { + "epoch": 0.6574778852088048, + "grad_norm": 2.14189102519751, + "learning_rate": 5.5471080739159964e-06, + "loss": 0.7137, + "step": 7990 + }, + { + "epoch": 0.6575601728039498, + "grad_norm": 1.9655385558651588, + "learning_rate": 5.544721754957676e-06, + "loss": 0.709, + "step": 7991 + }, + { + "epoch": 0.6576424603990948, + "grad_norm": 3.3341566665357916, + "learning_rate": 5.542335752495203e-06, + "loss": 0.7288, + "step": 7992 + }, + { + "epoch": 0.6577247479942399, + "grad_norm": 2.0250714311316953, + "learning_rate": 5.539950066698063e-06, + "loss": 0.7494, + "step": 7993 + }, + { + "epoch": 0.6578070355893849, + "grad_norm": 2.271617912309411, + "learning_rate": 5.537564697735747e-06, + "loss": 0.7326, + "step": 7994 + }, + { + "epoch": 0.65788932318453, + "grad_norm": 0.43218615933614535, + "learning_rate": 5.535179645777691e-06, + "loss": 0.5255, + "step": 7995 + }, + { + "epoch": 0.6579716107796749, + "grad_norm": 2.4343062798527577, + "learning_rate": 5.532794910993334e-06, + "loss": 0.7158, + "step": 7996 + }, + { + "epoch": 0.65805389837482, + "grad_norm": 2.0581672906130404, + "learning_rate": 5.530410493552079e-06, + "loss": 0.731, + "step": 7997 + }, + { + "epoch": 0.658136185969965, + "grad_norm": 0.42174914903228694, + "learning_rate": 5.52802639362332e-06, + "loss": 0.4872, + "step": 7998 + }, + { + "epoch": 0.6582184735651101, + "grad_norm": 3.2206258431440093, + "learning_rate": 5.5256426113764136e-06, + "loss": 0.743, + "step": 7999 + }, + { + "epoch": 0.658300761160255, + "grad_norm": 3.410436129049433, + "learning_rate": 5.5232591469807e-06, + "loss": 0.723, + "step": 8000 + }, + { + "epoch": 0.6583830487554001, + "grad_norm": 1.9626205350330594, + "learning_rate": 5.520876000605493e-06, + "loss": 0.7129, + "step": 8001 + }, + { + "epoch": 0.6584653363505452, + "grad_norm": 2.4691179830580556, + "learning_rate": 5.518493172420096e-06, + "loss": 0.7096, + "step": 8002 + }, + { + "epoch": 0.6585476239456902, + "grad_norm": 2.310904147399013, + "learning_rate": 5.516110662593772e-06, + "loss": 0.6853, + "step": 8003 + }, + { + "epoch": 0.6586299115408352, + "grad_norm": 2.0630296225728535, + "learning_rate": 5.51372847129578e-06, + "loss": 0.7329, + "step": 8004 + }, + { + "epoch": 0.6587121991359802, + "grad_norm": 3.488799603741265, + "learning_rate": 5.511346598695344e-06, + "loss": 0.7248, + "step": 8005 + }, + { + "epoch": 0.6587944867311253, + "grad_norm": 1.7188103504374606, + "learning_rate": 5.508965044961666e-06, + "loss": 0.7051, + "step": 8006 + }, + { + "epoch": 0.6588767743262703, + "grad_norm": 2.107230683554777, + "learning_rate": 5.506583810263926e-06, + "loss": 0.7261, + "step": 8007 + }, + { + "epoch": 0.6589590619214154, + "grad_norm": 2.360805983641841, + "learning_rate": 5.504202894771288e-06, + "loss": 0.7391, + "step": 8008 + }, + { + "epoch": 0.6590413495165603, + "grad_norm": 1.9904443381323729, + "learning_rate": 5.5018222986528835e-06, + "loss": 0.7479, + "step": 8009 + }, + { + "epoch": 0.6591236371117054, + "grad_norm": 2.0146077042501194, + "learning_rate": 5.499442022077833e-06, + "loss": 0.7229, + "step": 8010 + }, + { + "epoch": 0.6592059247068505, + "grad_norm": 2.3726537993085004, + "learning_rate": 5.497062065215223e-06, + "loss": 0.7386, + "step": 8011 + }, + { + "epoch": 0.6592882123019955, + "grad_norm": 1.7747729699455888, + "learning_rate": 5.494682428234124e-06, + "loss": 0.7411, + "step": 8012 + }, + { + "epoch": 0.6593704998971405, + "grad_norm": 2.5697567930762824, + "learning_rate": 5.492303111303574e-06, + "loss": 0.7353, + "step": 8013 + }, + { + "epoch": 0.6594527874922855, + "grad_norm": 0.42215165579683905, + "learning_rate": 5.489924114592608e-06, + "loss": 0.4828, + "step": 8014 + }, + { + "epoch": 0.6595350750874306, + "grad_norm": 2.2157913649958965, + "learning_rate": 5.487545438270214e-06, + "loss": 0.7354, + "step": 8015 + }, + { + "epoch": 0.6596173626825756, + "grad_norm": 2.111331023885856, + "learning_rate": 5.485167082505382e-06, + "loss": 0.7173, + "step": 8016 + }, + { + "epoch": 0.6596996502777206, + "grad_norm": 2.352528820573211, + "learning_rate": 5.482789047467058e-06, + "loss": 0.7012, + "step": 8017 + }, + { + "epoch": 0.6597819378728657, + "grad_norm": 2.154953911052205, + "learning_rate": 5.480411333324177e-06, + "loss": 0.7022, + "step": 8018 + }, + { + "epoch": 0.6598642254680107, + "grad_norm": 2.4514097808023707, + "learning_rate": 5.478033940245643e-06, + "loss": 0.704, + "step": 8019 + }, + { + "epoch": 0.6599465130631558, + "grad_norm": 0.41763416511392326, + "learning_rate": 5.47565686840035e-06, + "loss": 0.4892, + "step": 8020 + }, + { + "epoch": 0.6600288006583007, + "grad_norm": 2.435754995368027, + "learning_rate": 5.473280117957152e-06, + "loss": 0.7279, + "step": 8021 + }, + { + "epoch": 0.6601110882534458, + "grad_norm": 1.9643599069310287, + "learning_rate": 5.470903689084901e-06, + "loss": 0.7403, + "step": 8022 + }, + { + "epoch": 0.6601933758485908, + "grad_norm": 0.42599190905310946, + "learning_rate": 5.4685275819524075e-06, + "loss": 0.4999, + "step": 8023 + }, + { + "epoch": 0.6602756634437359, + "grad_norm": 0.4414767016301874, + "learning_rate": 5.466151796728467e-06, + "loss": 0.5064, + "step": 8024 + }, + { + "epoch": 0.6603579510388808, + "grad_norm": 2.3699209178444947, + "learning_rate": 5.4637763335818495e-06, + "loss": 0.7418, + "step": 8025 + }, + { + "epoch": 0.6604402386340259, + "grad_norm": 2.169879735881038, + "learning_rate": 5.46140119268131e-06, + "loss": 0.7427, + "step": 8026 + }, + { + "epoch": 0.660522526229171, + "grad_norm": 0.4217624489377124, + "learning_rate": 5.459026374195566e-06, + "loss": 0.4987, + "step": 8027 + }, + { + "epoch": 0.660604813824316, + "grad_norm": 1.937868548792891, + "learning_rate": 5.456651878293333e-06, + "loss": 0.7554, + "step": 8028 + }, + { + "epoch": 0.660687101419461, + "grad_norm": 4.046944697874956, + "learning_rate": 5.454277705143283e-06, + "loss": 0.754, + "step": 8029 + }, + { + "epoch": 0.660769389014606, + "grad_norm": 1.863317203845375, + "learning_rate": 5.451903854914075e-06, + "loss": 0.7545, + "step": 8030 + }, + { + "epoch": 0.6608516766097511, + "grad_norm": 0.3959914780972475, + "learning_rate": 5.449530327774339e-06, + "loss": 0.4577, + "step": 8031 + }, + { + "epoch": 0.6609339642048961, + "grad_norm": 0.41350985729816947, + "learning_rate": 5.447157123892697e-06, + "loss": 0.4845, + "step": 8032 + }, + { + "epoch": 0.6610162518000412, + "grad_norm": 1.8181417172623087, + "learning_rate": 5.444784243437726e-06, + "loss": 0.7127, + "step": 8033 + }, + { + "epoch": 0.6610985393951861, + "grad_norm": 2.3106404431240137, + "learning_rate": 5.442411686578002e-06, + "loss": 0.7185, + "step": 8034 + }, + { + "epoch": 0.6611808269903312, + "grad_norm": 1.8605113062532863, + "learning_rate": 5.440039453482062e-06, + "loss": 0.7432, + "step": 8035 + }, + { + "epoch": 0.6612631145854763, + "grad_norm": 2.2265711511705972, + "learning_rate": 5.437667544318428e-06, + "loss": 0.7217, + "step": 8036 + }, + { + "epoch": 0.6613454021806213, + "grad_norm": 1.887018629491341, + "learning_rate": 5.435295959255591e-06, + "loss": 0.7221, + "step": 8037 + }, + { + "epoch": 0.6614276897757663, + "grad_norm": 2.3340638005659615, + "learning_rate": 5.4329246984620325e-06, + "loss": 0.7129, + "step": 8038 + }, + { + "epoch": 0.6615099773709113, + "grad_norm": 2.3883997293220696, + "learning_rate": 5.430553762106194e-06, + "loss": 0.7461, + "step": 8039 + }, + { + "epoch": 0.6615922649660564, + "grad_norm": 0.4040524934866481, + "learning_rate": 5.428183150356515e-06, + "loss": 0.4922, + "step": 8040 + }, + { + "epoch": 0.6616745525612014, + "grad_norm": 0.4695785092981076, + "learning_rate": 5.425812863381392e-06, + "loss": 0.5261, + "step": 8041 + }, + { + "epoch": 0.6617568401563464, + "grad_norm": 2.0337461711439557, + "learning_rate": 5.42344290134921e-06, + "loss": 0.7371, + "step": 8042 + }, + { + "epoch": 0.6618391277514915, + "grad_norm": 2.583354840518371, + "learning_rate": 5.421073264428318e-06, + "loss": 0.7456, + "step": 8043 + }, + { + "epoch": 0.6619214153466365, + "grad_norm": 2.1328350280562858, + "learning_rate": 5.418703952787064e-06, + "loss": 0.719, + "step": 8044 + }, + { + "epoch": 0.6620037029417816, + "grad_norm": 3.890553224115731, + "learning_rate": 5.416334966593751e-06, + "loss": 0.7363, + "step": 8045 + }, + { + "epoch": 0.6620859905369265, + "grad_norm": 1.9543485187902963, + "learning_rate": 5.413966306016674e-06, + "loss": 0.7036, + "step": 8046 + }, + { + "epoch": 0.6621682781320716, + "grad_norm": 2.2398244873677298, + "learning_rate": 5.411597971224098e-06, + "loss": 0.7234, + "step": 8047 + }, + { + "epoch": 0.6622505657272166, + "grad_norm": 2.6471216561885393, + "learning_rate": 5.409229962384264e-06, + "loss": 0.7064, + "step": 8048 + }, + { + "epoch": 0.6623328533223617, + "grad_norm": 1.937589663889543, + "learning_rate": 5.406862279665386e-06, + "loss": 0.7007, + "step": 8049 + }, + { + "epoch": 0.6624151409175066, + "grad_norm": 2.1297511042934425, + "learning_rate": 5.404494923235672e-06, + "loss": 0.7468, + "step": 8050 + }, + { + "epoch": 0.6624974285126517, + "grad_norm": 2.706986683344493, + "learning_rate": 5.402127893263284e-06, + "loss": 0.6983, + "step": 8051 + }, + { + "epoch": 0.6625797161077968, + "grad_norm": 0.42988806055912576, + "learning_rate": 5.399761189916381e-06, + "loss": 0.4963, + "step": 8052 + }, + { + "epoch": 0.6626620037029418, + "grad_norm": 1.9342865292684313, + "learning_rate": 5.397394813363086e-06, + "loss": 0.7066, + "step": 8053 + }, + { + "epoch": 0.6627442912980868, + "grad_norm": 2.163994234301118, + "learning_rate": 5.395028763771504e-06, + "loss": 0.7254, + "step": 8054 + }, + { + "epoch": 0.6628265788932318, + "grad_norm": 2.184361552512493, + "learning_rate": 5.392663041309709e-06, + "loss": 0.7206, + "step": 8055 + }, + { + "epoch": 0.6629088664883769, + "grad_norm": 1.848226819975942, + "learning_rate": 5.390297646145768e-06, + "loss": 0.7043, + "step": 8056 + }, + { + "epoch": 0.6629911540835219, + "grad_norm": 0.4161841852993745, + "learning_rate": 5.387932578447705e-06, + "loss": 0.4977, + "step": 8057 + }, + { + "epoch": 0.6630734416786669, + "grad_norm": 2.1484518064010816, + "learning_rate": 5.385567838383544e-06, + "loss": 0.7104, + "step": 8058 + }, + { + "epoch": 0.663155729273812, + "grad_norm": 2.156932954451813, + "learning_rate": 5.383203426121258e-06, + "loss": 0.7172, + "step": 8059 + }, + { + "epoch": 0.663238016868957, + "grad_norm": 2.0310433139566673, + "learning_rate": 5.380839341828819e-06, + "loss": 0.7414, + "step": 8060 + }, + { + "epoch": 0.6633203044641021, + "grad_norm": 1.9163155299550643, + "learning_rate": 5.378475585674161e-06, + "loss": 0.7453, + "step": 8061 + }, + { + "epoch": 0.6634025920592471, + "grad_norm": 2.2345861274984915, + "learning_rate": 5.376112157825212e-06, + "loss": 0.7259, + "step": 8062 + }, + { + "epoch": 0.6634848796543921, + "grad_norm": 2.8642133322085965, + "learning_rate": 5.373749058449856e-06, + "loss": 0.7308, + "step": 8063 + }, + { + "epoch": 0.6635671672495371, + "grad_norm": 2.7717869121188596, + "learning_rate": 5.3713862877159765e-06, + "loss": 0.7244, + "step": 8064 + }, + { + "epoch": 0.6636494548446822, + "grad_norm": 2.0086944464470524, + "learning_rate": 5.3690238457914055e-06, + "loss": 0.7287, + "step": 8065 + }, + { + "epoch": 0.6637317424398272, + "grad_norm": 1.8400924657528226, + "learning_rate": 5.366661732843976e-06, + "loss": 0.7048, + "step": 8066 + }, + { + "epoch": 0.6638140300349722, + "grad_norm": 0.4230536168465791, + "learning_rate": 5.364299949041486e-06, + "loss": 0.5179, + "step": 8067 + }, + { + "epoch": 0.6638963176301172, + "grad_norm": 0.4231799539089219, + "learning_rate": 5.361938494551716e-06, + "loss": 0.4939, + "step": 8068 + }, + { + "epoch": 0.6639786052252623, + "grad_norm": 3.249812661639723, + "learning_rate": 5.3595773695424194e-06, + "loss": 0.6948, + "step": 8069 + }, + { + "epoch": 0.6640608928204074, + "grad_norm": 2.528651436295182, + "learning_rate": 5.357216574181323e-06, + "loss": 0.7245, + "step": 8070 + }, + { + "epoch": 0.6641431804155523, + "grad_norm": 2.811301849786796, + "learning_rate": 5.354856108636134e-06, + "loss": 0.7301, + "step": 8071 + }, + { + "epoch": 0.6642254680106974, + "grad_norm": 3.018961909802455, + "learning_rate": 5.352495973074541e-06, + "loss": 0.75, + "step": 8072 + }, + { + "epoch": 0.6643077556058424, + "grad_norm": 2.5398572191463384, + "learning_rate": 5.3501361676641994e-06, + "loss": 0.709, + "step": 8073 + }, + { + "epoch": 0.6643900432009875, + "grad_norm": 2.9743559872646768, + "learning_rate": 5.3477766925727506e-06, + "loss": 0.7165, + "step": 8074 + }, + { + "epoch": 0.6644723307961324, + "grad_norm": 0.41805940821573423, + "learning_rate": 5.345417547967805e-06, + "loss": 0.4921, + "step": 8075 + }, + { + "epoch": 0.6645546183912775, + "grad_norm": 2.5772563108083504, + "learning_rate": 5.343058734016953e-06, + "loss": 0.7212, + "step": 8076 + }, + { + "epoch": 0.6646369059864226, + "grad_norm": 0.4050363455205766, + "learning_rate": 5.3407002508877585e-06, + "loss": 0.4596, + "step": 8077 + }, + { + "epoch": 0.6647191935815676, + "grad_norm": 2.636623478984383, + "learning_rate": 5.3383420987477685e-06, + "loss": 0.6965, + "step": 8078 + }, + { + "epoch": 0.6648014811767126, + "grad_norm": 2.315575445123205, + "learning_rate": 5.335984277764499e-06, + "loss": 0.7332, + "step": 8079 + }, + { + "epoch": 0.6648837687718576, + "grad_norm": 1.982713225895261, + "learning_rate": 5.333626788105449e-06, + "loss": 0.7226, + "step": 8080 + }, + { + "epoch": 0.6649660563670027, + "grad_norm": 2.9914494469893524, + "learning_rate": 5.331269629938091e-06, + "loss": 0.7146, + "step": 8081 + }, + { + "epoch": 0.6650483439621477, + "grad_norm": 0.41424814240417235, + "learning_rate": 5.328912803429871e-06, + "loss": 0.4885, + "step": 8082 + }, + { + "epoch": 0.6651306315572927, + "grad_norm": 0.43755766819602854, + "learning_rate": 5.326556308748212e-06, + "loss": 0.4803, + "step": 8083 + }, + { + "epoch": 0.6652129191524377, + "grad_norm": 2.673842726049815, + "learning_rate": 5.324200146060521e-06, + "loss": 0.7046, + "step": 8084 + }, + { + "epoch": 0.6652952067475828, + "grad_norm": 2.1794721744710386, + "learning_rate": 5.3218443155341715e-06, + "loss": 0.7104, + "step": 8085 + }, + { + "epoch": 0.6653774943427279, + "grad_norm": 0.4138739536849859, + "learning_rate": 5.319488817336524e-06, + "loss": 0.4962, + "step": 8086 + }, + { + "epoch": 0.6654597819378729, + "grad_norm": 2.4273858130831414, + "learning_rate": 5.317133651634905e-06, + "loss": 0.7344, + "step": 8087 + }, + { + "epoch": 0.6655420695330179, + "grad_norm": 2.652818017649886, + "learning_rate": 5.3147788185966224e-06, + "loss": 0.7335, + "step": 8088 + }, + { + "epoch": 0.6656243571281629, + "grad_norm": 2.4482328013051378, + "learning_rate": 5.3124243183889555e-06, + "loss": 0.6999, + "step": 8089 + }, + { + "epoch": 0.665706644723308, + "grad_norm": 1.98493050023241, + "learning_rate": 5.310070151179172e-06, + "loss": 0.7352, + "step": 8090 + }, + { + "epoch": 0.665788932318453, + "grad_norm": 1.9194605003988263, + "learning_rate": 5.307716317134501e-06, + "loss": 0.7186, + "step": 8091 + }, + { + "epoch": 0.665871219913598, + "grad_norm": 1.8815741367985541, + "learning_rate": 5.305362816422164e-06, + "loss": 0.7244, + "step": 8092 + }, + { + "epoch": 0.665953507508743, + "grad_norm": 2.1774515306762763, + "learning_rate": 5.303009649209343e-06, + "loss": 0.7124, + "step": 8093 + }, + { + "epoch": 0.6660357951038881, + "grad_norm": 2.460561026001481, + "learning_rate": 5.3006568156632044e-06, + "loss": 0.7465, + "step": 8094 + }, + { + "epoch": 0.6661180826990332, + "grad_norm": 1.8636129569295024, + "learning_rate": 5.298304315950886e-06, + "loss": 0.7283, + "step": 8095 + }, + { + "epoch": 0.6662003702941781, + "grad_norm": 2.670303604736574, + "learning_rate": 5.295952150239511e-06, + "loss": 0.7157, + "step": 8096 + }, + { + "epoch": 0.6662826578893232, + "grad_norm": 0.4252788249604733, + "learning_rate": 5.293600318696176e-06, + "loss": 0.5188, + "step": 8097 + }, + { + "epoch": 0.6663649454844682, + "grad_norm": 2.0442959353681767, + "learning_rate": 5.291248821487948e-06, + "loss": 0.7386, + "step": 8098 + }, + { + "epoch": 0.6664472330796133, + "grad_norm": 2.1058544407432285, + "learning_rate": 5.288897658781872e-06, + "loss": 0.7354, + "step": 8099 + }, + { + "epoch": 0.6665295206747582, + "grad_norm": 1.984283290155403, + "learning_rate": 5.286546830744969e-06, + "loss": 0.7035, + "step": 8100 + }, + { + "epoch": 0.6666118082699033, + "grad_norm": 1.9410355755852458, + "learning_rate": 5.284196337544244e-06, + "loss": 0.7382, + "step": 8101 + }, + { + "epoch": 0.6666940958650484, + "grad_norm": 1.755048361377421, + "learning_rate": 5.281846179346666e-06, + "loss": 0.7429, + "step": 8102 + }, + { + "epoch": 0.6667763834601934, + "grad_norm": 2.20686599478125, + "learning_rate": 5.279496356319193e-06, + "loss": 0.7377, + "step": 8103 + }, + { + "epoch": 0.6668586710553384, + "grad_norm": 1.8850229916500822, + "learning_rate": 5.277146868628751e-06, + "loss": 0.7168, + "step": 8104 + }, + { + "epoch": 0.6669409586504834, + "grad_norm": 2.202337995714095, + "learning_rate": 5.2747977164422415e-06, + "loss": 0.7104, + "step": 8105 + }, + { + "epoch": 0.6670232462456285, + "grad_norm": 1.8626323818155055, + "learning_rate": 5.272448899926542e-06, + "loss": 0.7258, + "step": 8106 + }, + { + "epoch": 0.6671055338407735, + "grad_norm": 1.746332898620734, + "learning_rate": 5.270100419248516e-06, + "loss": 0.7383, + "step": 8107 + }, + { + "epoch": 0.6671878214359185, + "grad_norm": 2.2913840908171252, + "learning_rate": 5.267752274574986e-06, + "loss": 0.6979, + "step": 8108 + }, + { + "epoch": 0.6672701090310635, + "grad_norm": 1.8264790400885789, + "learning_rate": 5.265404466072773e-06, + "loss": 0.7496, + "step": 8109 + }, + { + "epoch": 0.6673523966262086, + "grad_norm": 1.5660380065109003, + "learning_rate": 5.263056993908653e-06, + "loss": 0.7246, + "step": 8110 + }, + { + "epoch": 0.6674346842213537, + "grad_norm": 1.8521035972816886, + "learning_rate": 5.26070985824939e-06, + "loss": 0.7153, + "step": 8111 + }, + { + "epoch": 0.6675169718164987, + "grad_norm": 1.9904825555315564, + "learning_rate": 5.258363059261715e-06, + "loss": 0.7394, + "step": 8112 + }, + { + "epoch": 0.6675992594116437, + "grad_norm": 2.1832952561338717, + "learning_rate": 5.256016597112348e-06, + "loss": 0.7402, + "step": 8113 + }, + { + "epoch": 0.6676815470067887, + "grad_norm": 2.085358814110372, + "learning_rate": 5.253670471967972e-06, + "loss": 0.7257, + "step": 8114 + }, + { + "epoch": 0.6677638346019338, + "grad_norm": 1.8450143583420897, + "learning_rate": 5.251324683995258e-06, + "loss": 0.7337, + "step": 8115 + }, + { + "epoch": 0.6678461221970788, + "grad_norm": 2.00622081526513, + "learning_rate": 5.248979233360845e-06, + "loss": 0.7455, + "step": 8116 + }, + { + "epoch": 0.6679284097922238, + "grad_norm": 1.7171753518341681, + "learning_rate": 5.246634120231348e-06, + "loss": 0.6954, + "step": 8117 + }, + { + "epoch": 0.6680106973873688, + "grad_norm": 1.9244389269140825, + "learning_rate": 5.244289344773359e-06, + "loss": 0.7317, + "step": 8118 + }, + { + "epoch": 0.6680929849825139, + "grad_norm": 1.8997262274365545, + "learning_rate": 5.241944907153454e-06, + "loss": 0.7291, + "step": 8119 + }, + { + "epoch": 0.668175272577659, + "grad_norm": 1.7693256246335503, + "learning_rate": 5.2396008075381665e-06, + "loss": 0.7282, + "step": 8120 + }, + { + "epoch": 0.6682575601728039, + "grad_norm": 2.014336588129239, + "learning_rate": 5.237257046094031e-06, + "loss": 0.7397, + "step": 8121 + }, + { + "epoch": 0.668339847767949, + "grad_norm": 1.9287629726754054, + "learning_rate": 5.234913622987536e-06, + "loss": 0.7358, + "step": 8122 + }, + { + "epoch": 0.668422135363094, + "grad_norm": 0.4138888046403556, + "learning_rate": 5.232570538385158e-06, + "loss": 0.4614, + "step": 8123 + }, + { + "epoch": 0.6685044229582391, + "grad_norm": 1.8110397958033138, + "learning_rate": 5.230227792453339e-06, + "loss": 0.7257, + "step": 8124 + }, + { + "epoch": 0.668586710553384, + "grad_norm": 2.201950062559441, + "learning_rate": 5.227885385358515e-06, + "loss": 0.7277, + "step": 8125 + }, + { + "epoch": 0.6686689981485291, + "grad_norm": 4.392325558668216, + "learning_rate": 5.225543317267077e-06, + "loss": 0.7491, + "step": 8126 + }, + { + "epoch": 0.6687512857436742, + "grad_norm": 15.337566347063976, + "learning_rate": 5.22320158834541e-06, + "loss": 0.7507, + "step": 8127 + }, + { + "epoch": 0.6688335733388192, + "grad_norm": 2.044487470893075, + "learning_rate": 5.220860198759863e-06, + "loss": 0.7197, + "step": 8128 + }, + { + "epoch": 0.6689158609339642, + "grad_norm": 2.310731869337375, + "learning_rate": 5.218519148676764e-06, + "loss": 0.7296, + "step": 8129 + }, + { + "epoch": 0.6689981485291092, + "grad_norm": 1.951018222768312, + "learning_rate": 5.216178438262415e-06, + "loss": 0.7203, + "step": 8130 + }, + { + "epoch": 0.6690804361242543, + "grad_norm": 1.9944349593117396, + "learning_rate": 5.213838067683102e-06, + "loss": 0.7216, + "step": 8131 + }, + { + "epoch": 0.6691627237193993, + "grad_norm": 0.4390231254342351, + "learning_rate": 5.211498037105074e-06, + "loss": 0.4869, + "step": 8132 + }, + { + "epoch": 0.6692450113145443, + "grad_norm": 2.0478869831640183, + "learning_rate": 5.209158346694572e-06, + "loss": 0.7312, + "step": 8133 + }, + { + "epoch": 0.6693272989096893, + "grad_norm": 1.8164047123276972, + "learning_rate": 5.206818996617799e-06, + "loss": 0.7418, + "step": 8134 + }, + { + "epoch": 0.6694095865048344, + "grad_norm": 1.7773277080107663, + "learning_rate": 5.20447998704094e-06, + "loss": 0.7083, + "step": 8135 + }, + { + "epoch": 0.6694918740999795, + "grad_norm": 2.2039890278002803, + "learning_rate": 5.202141318130148e-06, + "loss": 0.726, + "step": 8136 + }, + { + "epoch": 0.6695741616951245, + "grad_norm": 0.3988812215166928, + "learning_rate": 5.199802990051568e-06, + "loss": 0.4703, + "step": 8137 + }, + { + "epoch": 0.6696564492902695, + "grad_norm": 2.0648009218727648, + "learning_rate": 5.197465002971303e-06, + "loss": 0.7303, + "step": 8138 + }, + { + "epoch": 0.6697387368854145, + "grad_norm": 1.7651898000517456, + "learning_rate": 5.1951273570554515e-06, + "loss": 0.7052, + "step": 8139 + }, + { + "epoch": 0.6698210244805596, + "grad_norm": 2.121666162054685, + "learning_rate": 5.192790052470061e-06, + "loss": 0.735, + "step": 8140 + }, + { + "epoch": 0.6699033120757046, + "grad_norm": 1.7771513584027288, + "learning_rate": 5.190453089381181e-06, + "loss": 0.7063, + "step": 8141 + }, + { + "epoch": 0.6699855996708496, + "grad_norm": 1.7630592049956475, + "learning_rate": 5.188116467954818e-06, + "loss": 0.732, + "step": 8142 + }, + { + "epoch": 0.6700678872659946, + "grad_norm": 2.2783636995327403, + "learning_rate": 5.185780188356968e-06, + "loss": 0.7327, + "step": 8143 + }, + { + "epoch": 0.6701501748611397, + "grad_norm": 0.4283535341391618, + "learning_rate": 5.183444250753592e-06, + "loss": 0.4733, + "step": 8144 + }, + { + "epoch": 0.6702324624562848, + "grad_norm": 1.9834510388761968, + "learning_rate": 5.181108655310641e-06, + "loss": 0.7386, + "step": 8145 + }, + { + "epoch": 0.6703147500514297, + "grad_norm": 0.4364858515791318, + "learning_rate": 5.1787734021940164e-06, + "loss": 0.4873, + "step": 8146 + }, + { + "epoch": 0.6703970376465748, + "grad_norm": 2.1045202772250007, + "learning_rate": 5.176438491569622e-06, + "loss": 0.7151, + "step": 8147 + }, + { + "epoch": 0.6704793252417198, + "grad_norm": 1.8298468678991067, + "learning_rate": 5.17410392360332e-06, + "loss": 0.741, + "step": 8148 + }, + { + "epoch": 0.6705616128368649, + "grad_norm": 1.957332181228669, + "learning_rate": 5.1717696984609614e-06, + "loss": 0.7126, + "step": 8149 + }, + { + "epoch": 0.6706439004320098, + "grad_norm": 2.3117022798206346, + "learning_rate": 5.169435816308361e-06, + "loss": 0.6926, + "step": 8150 + }, + { + "epoch": 0.6707261880271549, + "grad_norm": 4.099221037566502, + "learning_rate": 5.167102277311316e-06, + "loss": 0.7529, + "step": 8151 + }, + { + "epoch": 0.6708084756223, + "grad_norm": 2.409301734492972, + "learning_rate": 5.164769081635592e-06, + "loss": 0.7369, + "step": 8152 + }, + { + "epoch": 0.670890763217445, + "grad_norm": 2.007150213408588, + "learning_rate": 5.162436229446944e-06, + "loss": 0.7007, + "step": 8153 + }, + { + "epoch": 0.67097305081259, + "grad_norm": 2.0642174152476818, + "learning_rate": 5.160103720911084e-06, + "loss": 0.7767, + "step": 8154 + }, + { + "epoch": 0.671055338407735, + "grad_norm": 2.2357506483529455, + "learning_rate": 5.15777155619372e-06, + "loss": 0.7325, + "step": 8155 + }, + { + "epoch": 0.6711376260028801, + "grad_norm": 2.0101341244955497, + "learning_rate": 5.155439735460521e-06, + "loss": 0.7421, + "step": 8156 + }, + { + "epoch": 0.6712199135980251, + "grad_norm": 2.587700959421111, + "learning_rate": 5.153108258877136e-06, + "loss": 0.7491, + "step": 8157 + }, + { + "epoch": 0.6713022011931701, + "grad_norm": 2.080235007382727, + "learning_rate": 5.150777126609184e-06, + "loss": 0.7318, + "step": 8158 + }, + { + "epoch": 0.6713844887883151, + "grad_norm": 2.2151361010565407, + "learning_rate": 5.148446338822274e-06, + "loss": 0.7204, + "step": 8159 + }, + { + "epoch": 0.6714667763834602, + "grad_norm": 2.0878882316977214, + "learning_rate": 5.146115895681974e-06, + "loss": 0.7275, + "step": 8160 + }, + { + "epoch": 0.6715490639786053, + "grad_norm": 1.5708866125121432, + "learning_rate": 5.143785797353839e-06, + "loss": 0.7517, + "step": 8161 + }, + { + "epoch": 0.6716313515737503, + "grad_norm": 0.4173752150304486, + "learning_rate": 5.1414560440033975e-06, + "loss": 0.489, + "step": 8162 + }, + { + "epoch": 0.6717136391688953, + "grad_norm": 1.7299593701354425, + "learning_rate": 5.139126635796148e-06, + "loss": 0.7083, + "step": 8163 + }, + { + "epoch": 0.6717959267640403, + "grad_norm": 2.2402668167040285, + "learning_rate": 5.136797572897565e-06, + "loss": 0.7317, + "step": 8164 + }, + { + "epoch": 0.6718782143591854, + "grad_norm": 0.4198872636753775, + "learning_rate": 5.1344688554731085e-06, + "loss": 0.4957, + "step": 8165 + }, + { + "epoch": 0.6719605019543304, + "grad_norm": 0.4372099437683501, + "learning_rate": 5.132140483688199e-06, + "loss": 0.4912, + "step": 8166 + }, + { + "epoch": 0.6720427895494754, + "grad_norm": 0.43176778409989, + "learning_rate": 5.129812457708248e-06, + "loss": 0.4983, + "step": 8167 + }, + { + "epoch": 0.6721250771446204, + "grad_norm": 1.8464525601572546, + "learning_rate": 5.1274847776986325e-06, + "loss": 0.7319, + "step": 8168 + }, + { + "epoch": 0.6722073647397655, + "grad_norm": 2.2000118659218324, + "learning_rate": 5.125157443824704e-06, + "loss": 0.6937, + "step": 8169 + }, + { + "epoch": 0.6722896523349106, + "grad_norm": 0.3987387796318597, + "learning_rate": 5.122830456251793e-06, + "loss": 0.4948, + "step": 8170 + }, + { + "epoch": 0.6723719399300555, + "grad_norm": 0.4371059650902839, + "learning_rate": 5.120503815145209e-06, + "loss": 0.4891, + "step": 8171 + }, + { + "epoch": 0.6724542275252006, + "grad_norm": 2.2030049517772166, + "learning_rate": 5.118177520670227e-06, + "loss": 0.7313, + "step": 8172 + }, + { + "epoch": 0.6725365151203456, + "grad_norm": 2.0314917187781227, + "learning_rate": 5.11585157299211e-06, + "loss": 0.7193, + "step": 8173 + }, + { + "epoch": 0.6726188027154907, + "grad_norm": 1.7623866076676942, + "learning_rate": 5.113525972276087e-06, + "loss": 0.7309, + "step": 8174 + }, + { + "epoch": 0.6727010903106356, + "grad_norm": 1.8263213908774723, + "learning_rate": 5.1112007186873636e-06, + "loss": 0.7232, + "step": 8175 + }, + { + "epoch": 0.6727833779057807, + "grad_norm": 1.6769268050878543, + "learning_rate": 5.1088758123911186e-06, + "loss": 0.7401, + "step": 8176 + }, + { + "epoch": 0.6728656655009257, + "grad_norm": 1.7134905434093006, + "learning_rate": 5.106551253552518e-06, + "loss": 0.7403, + "step": 8177 + }, + { + "epoch": 0.6729479530960708, + "grad_norm": 2.2239724706960704, + "learning_rate": 5.104227042336687e-06, + "loss": 0.7105, + "step": 8178 + }, + { + "epoch": 0.6730302406912158, + "grad_norm": 0.42655345492488683, + "learning_rate": 5.101903178908741e-06, + "loss": 0.4763, + "step": 8179 + }, + { + "epoch": 0.6731125282863608, + "grad_norm": 0.41885662804518914, + "learning_rate": 5.099579663433758e-06, + "loss": 0.4837, + "step": 8180 + }, + { + "epoch": 0.6731948158815059, + "grad_norm": 2.12324942113338, + "learning_rate": 5.097256496076801e-06, + "loss": 0.7163, + "step": 8181 + }, + { + "epoch": 0.6732771034766509, + "grad_norm": 1.7990759967410401, + "learning_rate": 5.094933677002895e-06, + "loss": 0.6971, + "step": 8182 + }, + { + "epoch": 0.6733593910717959, + "grad_norm": 2.693636629887004, + "learning_rate": 5.092611206377063e-06, + "loss": 0.7005, + "step": 8183 + }, + { + "epoch": 0.6734416786669409, + "grad_norm": 2.4537009778873524, + "learning_rate": 5.0902890843642775e-06, + "loss": 0.7461, + "step": 8184 + }, + { + "epoch": 0.673523966262086, + "grad_norm": 1.5860463225392576, + "learning_rate": 5.087967311129508e-06, + "loss": 0.7054, + "step": 8185 + }, + { + "epoch": 0.673606253857231, + "grad_norm": 2.5294430200941704, + "learning_rate": 5.085645886837685e-06, + "loss": 0.7132, + "step": 8186 + }, + { + "epoch": 0.673688541452376, + "grad_norm": 2.6785886898810674, + "learning_rate": 5.0833248116537184e-06, + "loss": 0.7174, + "step": 8187 + }, + { + "epoch": 0.6737708290475211, + "grad_norm": 1.9634262108192782, + "learning_rate": 5.08100408574249e-06, + "loss": 0.7001, + "step": 8188 + }, + { + "epoch": 0.6738531166426661, + "grad_norm": 2.517244478716251, + "learning_rate": 5.078683709268869e-06, + "loss": 0.7416, + "step": 8189 + }, + { + "epoch": 0.6739354042378112, + "grad_norm": 2.088271159908881, + "learning_rate": 5.076363682397682e-06, + "loss": 0.7285, + "step": 8190 + }, + { + "epoch": 0.6740176918329562, + "grad_norm": 0.4207137745939929, + "learning_rate": 5.07404400529375e-06, + "loss": 0.463, + "step": 8191 + }, + { + "epoch": 0.6740999794281012, + "grad_norm": 2.1047589204410704, + "learning_rate": 5.071724678121853e-06, + "loss": 0.6857, + "step": 8192 + }, + { + "epoch": 0.6741822670232462, + "grad_norm": 2.212810160043007, + "learning_rate": 5.069405701046755e-06, + "loss": 0.6968, + "step": 8193 + }, + { + "epoch": 0.6742645546183913, + "grad_norm": 1.9671176184503403, + "learning_rate": 5.0670870742331844e-06, + "loss": 0.7409, + "step": 8194 + }, + { + "epoch": 0.6743468422135364, + "grad_norm": 2.003813030475019, + "learning_rate": 5.064768797845864e-06, + "loss": 0.7425, + "step": 8195 + }, + { + "epoch": 0.6744291298086813, + "grad_norm": 2.014229988303136, + "learning_rate": 5.062450872049472e-06, + "loss": 0.7163, + "step": 8196 + }, + { + "epoch": 0.6745114174038264, + "grad_norm": 1.7615904153241693, + "learning_rate": 5.060133297008676e-06, + "loss": 0.6926, + "step": 8197 + }, + { + "epoch": 0.6745937049989714, + "grad_norm": 2.46977316692928, + "learning_rate": 5.057816072888112e-06, + "loss": 0.7272, + "step": 8198 + }, + { + "epoch": 0.6746759925941165, + "grad_norm": 2.229028855615293, + "learning_rate": 5.05549919985239e-06, + "loss": 0.7562, + "step": 8199 + }, + { + "epoch": 0.6747582801892614, + "grad_norm": 2.1638190793484458, + "learning_rate": 5.053182678066093e-06, + "loss": 0.7436, + "step": 8200 + }, + { + "epoch": 0.6748405677844065, + "grad_norm": 0.40545978914639297, + "learning_rate": 5.050866507693791e-06, + "loss": 0.457, + "step": 8201 + }, + { + "epoch": 0.6749228553795515, + "grad_norm": 1.8341018800572688, + "learning_rate": 5.048550688900013e-06, + "loss": 0.732, + "step": 8202 + }, + { + "epoch": 0.6750051429746966, + "grad_norm": 2.038625841646188, + "learning_rate": 5.046235221849281e-06, + "loss": 0.7412, + "step": 8203 + }, + { + "epoch": 0.6750874305698416, + "grad_norm": 2.163837452367973, + "learning_rate": 5.043920106706075e-06, + "loss": 0.7513, + "step": 8204 + }, + { + "epoch": 0.6751697181649866, + "grad_norm": 1.757183254774935, + "learning_rate": 5.0416053436348585e-06, + "loss": 0.7327, + "step": 8205 + }, + { + "epoch": 0.6752520057601317, + "grad_norm": 4.8381001730546265, + "learning_rate": 5.039290932800066e-06, + "loss": 0.7279, + "step": 8206 + }, + { + "epoch": 0.6753342933552767, + "grad_norm": 2.131535870190117, + "learning_rate": 5.0369768743661156e-06, + "loss": 0.7231, + "step": 8207 + }, + { + "epoch": 0.6754165809504217, + "grad_norm": 1.9835431961879362, + "learning_rate": 5.034663168497387e-06, + "loss": 0.6955, + "step": 8208 + }, + { + "epoch": 0.6754988685455667, + "grad_norm": 0.4162947301014757, + "learning_rate": 5.032349815358253e-06, + "loss": 0.4761, + "step": 8209 + }, + { + "epoch": 0.6755811561407118, + "grad_norm": 2.4472196213211372, + "learning_rate": 5.030036815113036e-06, + "loss": 0.7194, + "step": 8210 + }, + { + "epoch": 0.6756634437358569, + "grad_norm": 1.798645828833852, + "learning_rate": 5.0277241679260605e-06, + "loss": 0.7272, + "step": 8211 + }, + { + "epoch": 0.6757457313310018, + "grad_norm": 2.1705435093665058, + "learning_rate": 5.025411873961603e-06, + "loss": 0.7065, + "step": 8212 + }, + { + "epoch": 0.6758280189261469, + "grad_norm": 1.9285340698294906, + "learning_rate": 5.0230999333839345e-06, + "loss": 0.7089, + "step": 8213 + }, + { + "epoch": 0.6759103065212919, + "grad_norm": 2.030700239340169, + "learning_rate": 5.020788346357284e-06, + "loss": 0.7283, + "step": 8214 + }, + { + "epoch": 0.675992594116437, + "grad_norm": 2.322321927976158, + "learning_rate": 5.018477113045872e-06, + "loss": 0.742, + "step": 8215 + }, + { + "epoch": 0.676074881711582, + "grad_norm": 1.8815431413821109, + "learning_rate": 5.016166233613873e-06, + "loss": 0.7358, + "step": 8216 + }, + { + "epoch": 0.676157169306727, + "grad_norm": 2.000067064162872, + "learning_rate": 5.013855708225459e-06, + "loss": 0.7192, + "step": 8217 + }, + { + "epoch": 0.676239456901872, + "grad_norm": 2.086730555647238, + "learning_rate": 5.011545537044755e-06, + "loss": 0.7238, + "step": 8218 + }, + { + "epoch": 0.6763217444970171, + "grad_norm": 2.2549600438421447, + "learning_rate": 5.0092357202358845e-06, + "loss": 0.7238, + "step": 8219 + }, + { + "epoch": 0.6764040320921622, + "grad_norm": 2.79109890710536, + "learning_rate": 5.006926257962925e-06, + "loss": 0.7052, + "step": 8220 + }, + { + "epoch": 0.6764863196873071, + "grad_norm": 2.0069465490137484, + "learning_rate": 5.004617150389941e-06, + "loss": 0.7209, + "step": 8221 + }, + { + "epoch": 0.6765686072824522, + "grad_norm": 1.9821323466632756, + "learning_rate": 5.00230839768096e-06, + "loss": 0.7574, + "step": 8222 + }, + { + "epoch": 0.6766508948775972, + "grad_norm": 0.45178128565129316, + "learning_rate": 5.000000000000003e-06, + "loss": 0.487, + "step": 8223 + }, + { + "epoch": 0.6767331824727423, + "grad_norm": 1.7100554088465592, + "learning_rate": 4.997691957511045e-06, + "loss": 0.7314, + "step": 8224 + }, + { + "epoch": 0.6768154700678872, + "grad_norm": 2.60182246439749, + "learning_rate": 4.995384270378054e-06, + "loss": 0.7394, + "step": 8225 + }, + { + "epoch": 0.6768977576630323, + "grad_norm": 2.012437470241448, + "learning_rate": 4.993076938764961e-06, + "loss": 0.7261, + "step": 8226 + }, + { + "epoch": 0.6769800452581773, + "grad_norm": 1.9563476302237788, + "learning_rate": 4.990769962835674e-06, + "loss": 0.7273, + "step": 8227 + }, + { + "epoch": 0.6770623328533224, + "grad_norm": 1.9843458341012636, + "learning_rate": 4.988463342754075e-06, + "loss": 0.7412, + "step": 8228 + }, + { + "epoch": 0.6771446204484673, + "grad_norm": 2.284244963884832, + "learning_rate": 4.986157078684029e-06, + "loss": 0.7271, + "step": 8229 + }, + { + "epoch": 0.6772269080436124, + "grad_norm": 0.4122624289554678, + "learning_rate": 4.983851170789362e-06, + "loss": 0.4867, + "step": 8230 + }, + { + "epoch": 0.6773091956387575, + "grad_norm": 1.590729154248612, + "learning_rate": 4.98154561923389e-06, + "loss": 0.7085, + "step": 8231 + }, + { + "epoch": 0.6773914832339025, + "grad_norm": 1.8917714506332246, + "learning_rate": 4.979240424181391e-06, + "loss": 0.7341, + "step": 8232 + }, + { + "epoch": 0.6774737708290475, + "grad_norm": 1.8498109937750256, + "learning_rate": 4.976935585795623e-06, + "loss": 0.732, + "step": 8233 + }, + { + "epoch": 0.6775560584241925, + "grad_norm": 2.3141471444506774, + "learning_rate": 4.9746311042403125e-06, + "loss": 0.7236, + "step": 8234 + }, + { + "epoch": 0.6776383460193376, + "grad_norm": 1.9560508329021284, + "learning_rate": 4.972326979679176e-06, + "loss": 0.7095, + "step": 8235 + }, + { + "epoch": 0.6777206336144826, + "grad_norm": 2.767077181730613, + "learning_rate": 4.970023212275888e-06, + "loss": 0.734, + "step": 8236 + }, + { + "epoch": 0.6778029212096276, + "grad_norm": 2.4240756853694143, + "learning_rate": 4.967719802194109e-06, + "loss": 0.7051, + "step": 8237 + }, + { + "epoch": 0.6778852088047727, + "grad_norm": 0.42212744027897126, + "learning_rate": 4.965416749597467e-06, + "loss": 0.4781, + "step": 8238 + }, + { + "epoch": 0.6779674963999177, + "grad_norm": 2.020322114380335, + "learning_rate": 4.963114054649568e-06, + "loss": 0.7665, + "step": 8239 + }, + { + "epoch": 0.6780497839950628, + "grad_norm": 2.306656192191008, + "learning_rate": 4.960811717513988e-06, + "loss": 0.7214, + "step": 8240 + }, + { + "epoch": 0.6781320715902078, + "grad_norm": 2.226263440680553, + "learning_rate": 4.958509738354288e-06, + "loss": 0.7087, + "step": 8241 + }, + { + "epoch": 0.6782143591853528, + "grad_norm": 1.9565311172185245, + "learning_rate": 4.956208117333989e-06, + "loss": 0.7391, + "step": 8242 + }, + { + "epoch": 0.6782966467804978, + "grad_norm": 0.4203235588731226, + "learning_rate": 4.953906854616603e-06, + "loss": 0.4754, + "step": 8243 + }, + { + "epoch": 0.6783789343756429, + "grad_norm": 2.1771932364943503, + "learning_rate": 4.951605950365606e-06, + "loss": 0.7263, + "step": 8244 + }, + { + "epoch": 0.678461221970788, + "grad_norm": 1.831083584318659, + "learning_rate": 4.949305404744445e-06, + "loss": 0.7181, + "step": 8245 + }, + { + "epoch": 0.6785435095659329, + "grad_norm": 2.1250440125935803, + "learning_rate": 4.94700521791655e-06, + "loss": 0.7386, + "step": 8246 + }, + { + "epoch": 0.678625797161078, + "grad_norm": 2.0330036841607275, + "learning_rate": 4.944705390045325e-06, + "loss": 0.7054, + "step": 8247 + }, + { + "epoch": 0.678708084756223, + "grad_norm": 2.105322163457391, + "learning_rate": 4.9424059212941424e-06, + "loss": 0.7126, + "step": 8248 + }, + { + "epoch": 0.6787903723513681, + "grad_norm": 2.644890307405743, + "learning_rate": 4.9401068118263575e-06, + "loss": 0.7323, + "step": 8249 + }, + { + "epoch": 0.678872659946513, + "grad_norm": 3.3723355407958944, + "learning_rate": 4.937808061805293e-06, + "loss": 0.7115, + "step": 8250 + }, + { + "epoch": 0.6789549475416581, + "grad_norm": 2.24549795330145, + "learning_rate": 4.935509671394248e-06, + "loss": 0.7054, + "step": 8251 + }, + { + "epoch": 0.6790372351368031, + "grad_norm": 1.9467816815191978, + "learning_rate": 4.933211640756491e-06, + "loss": 0.7049, + "step": 8252 + }, + { + "epoch": 0.6791195227319482, + "grad_norm": 1.9060337699545784, + "learning_rate": 4.930913970055282e-06, + "loss": 0.7182, + "step": 8253 + }, + { + "epoch": 0.6792018103270931, + "grad_norm": 2.1392129654178618, + "learning_rate": 4.928616659453834e-06, + "loss": 0.7303, + "step": 8254 + }, + { + "epoch": 0.6792840979222382, + "grad_norm": 2.1259291577129087, + "learning_rate": 4.926319709115349e-06, + "loss": 0.6868, + "step": 8255 + }, + { + "epoch": 0.6793663855173833, + "grad_norm": 1.9509530436554743, + "learning_rate": 4.924023119202999e-06, + "loss": 0.727, + "step": 8256 + }, + { + "epoch": 0.6794486731125283, + "grad_norm": 3.358286453897422, + "learning_rate": 4.921726889879931e-06, + "loss": 0.7631, + "step": 8257 + }, + { + "epoch": 0.6795309607076733, + "grad_norm": 2.6403374639966026, + "learning_rate": 4.919431021309258e-06, + "loss": 0.7226, + "step": 8258 + }, + { + "epoch": 0.6796132483028183, + "grad_norm": 2.1336383795430236, + "learning_rate": 4.917135513654083e-06, + "loss": 0.7397, + "step": 8259 + }, + { + "epoch": 0.6796955358979634, + "grad_norm": 2.812905755453336, + "learning_rate": 4.914840367077469e-06, + "loss": 0.6949, + "step": 8260 + }, + { + "epoch": 0.6797778234931084, + "grad_norm": 2.5483064736413947, + "learning_rate": 4.912545581742468e-06, + "loss": 0.7114, + "step": 8261 + }, + { + "epoch": 0.6798601110882534, + "grad_norm": 3.852890110696467, + "learning_rate": 4.91025115781209e-06, + "loss": 0.7293, + "step": 8262 + }, + { + "epoch": 0.6799423986833985, + "grad_norm": 0.4114066794742928, + "learning_rate": 4.907957095449332e-06, + "loss": 0.4834, + "step": 8263 + }, + { + "epoch": 0.6800246862785435, + "grad_norm": 2.0323270054041562, + "learning_rate": 4.905663394817154e-06, + "loss": 0.7123, + "step": 8264 + }, + { + "epoch": 0.6801069738736886, + "grad_norm": 1.8066083261973875, + "learning_rate": 4.903370056078505e-06, + "loss": 0.7242, + "step": 8265 + }, + { + "epoch": 0.6801892614688336, + "grad_norm": 2.164541507724205, + "learning_rate": 4.901077079396293e-06, + "loss": 0.7075, + "step": 8266 + }, + { + "epoch": 0.6802715490639786, + "grad_norm": 1.993230645533029, + "learning_rate": 4.898784464933413e-06, + "loss": 0.7379, + "step": 8267 + }, + { + "epoch": 0.6803538366591236, + "grad_norm": 0.41385165679032354, + "learning_rate": 4.896492212852728e-06, + "loss": 0.4801, + "step": 8268 + }, + { + "epoch": 0.6804361242542687, + "grad_norm": 2.1665025874629107, + "learning_rate": 4.894200323317074e-06, + "loss": 0.7136, + "step": 8269 + }, + { + "epoch": 0.6805184118494138, + "grad_norm": 2.1791410945201757, + "learning_rate": 4.89190879648926e-06, + "loss": 0.728, + "step": 8270 + }, + { + "epoch": 0.6806006994445587, + "grad_norm": 2.4662320591724587, + "learning_rate": 4.889617632532079e-06, + "loss": 0.7409, + "step": 8271 + }, + { + "epoch": 0.6806829870397038, + "grad_norm": 3.0571474206543, + "learning_rate": 4.887326831608284e-06, + "loss": 0.7263, + "step": 8272 + }, + { + "epoch": 0.6807652746348488, + "grad_norm": 2.2331659543864273, + "learning_rate": 4.885036393880625e-06, + "loss": 0.7117, + "step": 8273 + }, + { + "epoch": 0.6808475622299939, + "grad_norm": 2.5591869732183414, + "learning_rate": 4.88274631951179e-06, + "loss": 0.6838, + "step": 8274 + }, + { + "epoch": 0.6809298498251388, + "grad_norm": 2.218282169857226, + "learning_rate": 4.8804566086644764e-06, + "loss": 0.7081, + "step": 8275 + }, + { + "epoch": 0.6810121374202839, + "grad_norm": 2.334356024009349, + "learning_rate": 4.878167261501335e-06, + "loss": 0.7224, + "step": 8276 + }, + { + "epoch": 0.6810944250154289, + "grad_norm": 2.068207588492827, + "learning_rate": 4.875878278185004e-06, + "loss": 0.7128, + "step": 8277 + }, + { + "epoch": 0.681176712610574, + "grad_norm": 2.9423659840629077, + "learning_rate": 4.8735896588780816e-06, + "loss": 0.7531, + "step": 8278 + }, + { + "epoch": 0.681259000205719, + "grad_norm": 2.1795168273093437, + "learning_rate": 4.871301403743158e-06, + "loss": 0.7163, + "step": 8279 + }, + { + "epoch": 0.681341287800864, + "grad_norm": 2.3221810510854235, + "learning_rate": 4.869013512942774e-06, + "loss": 0.7227, + "step": 8280 + }, + { + "epoch": 0.6814235753960091, + "grad_norm": 0.4390727645502479, + "learning_rate": 4.866725986639468e-06, + "loss": 0.4912, + "step": 8281 + }, + { + "epoch": 0.6815058629911541, + "grad_norm": 2.6741335125073156, + "learning_rate": 4.8644388249957344e-06, + "loss": 0.7434, + "step": 8282 + }, + { + "epoch": 0.6815881505862991, + "grad_norm": 2.2519474357297407, + "learning_rate": 4.8621520281740575e-06, + "loss": 0.7032, + "step": 8283 + }, + { + "epoch": 0.6816704381814441, + "grad_norm": 2.3767257494662895, + "learning_rate": 4.859865596336879e-06, + "loss": 0.71, + "step": 8284 + }, + { + "epoch": 0.6817527257765892, + "grad_norm": 0.42212764506956585, + "learning_rate": 4.8575795296466364e-06, + "loss": 0.4769, + "step": 8285 + }, + { + "epoch": 0.6818350133717342, + "grad_norm": 2.1535395639552117, + "learning_rate": 4.8552938282657125e-06, + "loss": 0.7251, + "step": 8286 + }, + { + "epoch": 0.6819173009668792, + "grad_norm": 0.4071614834757921, + "learning_rate": 4.8530084923564914e-06, + "loss": 0.4862, + "step": 8287 + }, + { + "epoch": 0.6819995885620243, + "grad_norm": 3.2075132618866613, + "learning_rate": 4.850723522081311e-06, + "loss": 0.7226, + "step": 8288 + }, + { + "epoch": 0.6820818761571693, + "grad_norm": 2.3186178724307847, + "learning_rate": 4.848438917602497e-06, + "loss": 0.6978, + "step": 8289 + }, + { + "epoch": 0.6821641637523144, + "grad_norm": 2.7368709020737683, + "learning_rate": 4.846154679082351e-06, + "loss": 0.7265, + "step": 8290 + }, + { + "epoch": 0.6822464513474593, + "grad_norm": 0.4165330671868627, + "learning_rate": 4.8438708066831275e-06, + "loss": 0.5195, + "step": 8291 + }, + { + "epoch": 0.6823287389426044, + "grad_norm": 2.3572825020665915, + "learning_rate": 4.841587300567079e-06, + "loss": 0.6877, + "step": 8292 + }, + { + "epoch": 0.6824110265377494, + "grad_norm": 0.42469304975932876, + "learning_rate": 4.839304160896414e-06, + "loss": 0.4848, + "step": 8293 + }, + { + "epoch": 0.6824933141328945, + "grad_norm": 2.541665392759915, + "learning_rate": 4.837021387833334e-06, + "loss": 0.7232, + "step": 8294 + }, + { + "epoch": 0.6825756017280395, + "grad_norm": 2.0818882766415348, + "learning_rate": 4.834738981539992e-06, + "loss": 0.724, + "step": 8295 + }, + { + "epoch": 0.6826578893231845, + "grad_norm": 1.8041107154284595, + "learning_rate": 4.832456942178542e-06, + "loss": 0.7194, + "step": 8296 + }, + { + "epoch": 0.6827401769183296, + "grad_norm": 2.1139714282305926, + "learning_rate": 4.830175269911077e-06, + "loss": 0.7024, + "step": 8297 + }, + { + "epoch": 0.6828224645134746, + "grad_norm": 2.053951827730139, + "learning_rate": 4.827893964899697e-06, + "loss": 0.7198, + "step": 8298 + }, + { + "epoch": 0.6829047521086197, + "grad_norm": 1.925922461818165, + "learning_rate": 4.825613027306455e-06, + "loss": 0.7009, + "step": 8299 + }, + { + "epoch": 0.6829870397037646, + "grad_norm": 2.701504345833427, + "learning_rate": 4.823332457293391e-06, + "loss": 0.7146, + "step": 8300 + }, + { + "epoch": 0.6830693272989097, + "grad_norm": 0.4255957647826883, + "learning_rate": 4.821052255022511e-06, + "loss": 0.4645, + "step": 8301 + }, + { + "epoch": 0.6831516148940547, + "grad_norm": 2.4920549602852335, + "learning_rate": 4.818772420655797e-06, + "loss": 0.7287, + "step": 8302 + }, + { + "epoch": 0.6832339024891998, + "grad_norm": 2.4803854737356392, + "learning_rate": 4.8164929543552e-06, + "loss": 0.7326, + "step": 8303 + }, + { + "epoch": 0.6833161900843447, + "grad_norm": 0.44078856309245307, + "learning_rate": 4.8142138562826565e-06, + "loss": 0.4823, + "step": 8304 + }, + { + "epoch": 0.6833984776794898, + "grad_norm": 1.823655990504472, + "learning_rate": 4.811935126600065e-06, + "loss": 0.7292, + "step": 8305 + }, + { + "epoch": 0.6834807652746349, + "grad_norm": 2.072157779764827, + "learning_rate": 4.809656765469308e-06, + "loss": 0.7148, + "step": 8306 + }, + { + "epoch": 0.6835630528697799, + "grad_norm": 0.407688622779884, + "learning_rate": 4.807378773052234e-06, + "loss": 0.4318, + "step": 8307 + }, + { + "epoch": 0.6836453404649249, + "grad_norm": 2.455895649364227, + "learning_rate": 4.805101149510667e-06, + "loss": 0.7233, + "step": 8308 + }, + { + "epoch": 0.6837276280600699, + "grad_norm": 2.003886691154661, + "learning_rate": 4.802823895006402e-06, + "loss": 0.7234, + "step": 8309 + }, + { + "epoch": 0.683809915655215, + "grad_norm": 2.0952117388189087, + "learning_rate": 4.800547009701221e-06, + "loss": 0.7128, + "step": 8310 + }, + { + "epoch": 0.68389220325036, + "grad_norm": 0.41529574894026206, + "learning_rate": 4.79827049375686e-06, + "loss": 0.4591, + "step": 8311 + }, + { + "epoch": 0.683974490845505, + "grad_norm": 2.3926650245157313, + "learning_rate": 4.795994347335049e-06, + "loss": 0.7181, + "step": 8312 + }, + { + "epoch": 0.68405677844065, + "grad_norm": 2.1451344236405583, + "learning_rate": 4.793718570597477e-06, + "loss": 0.7214, + "step": 8313 + }, + { + "epoch": 0.6841390660357951, + "grad_norm": 2.5517037695707816, + "learning_rate": 4.791443163705811e-06, + "loss": 0.72, + "step": 8314 + }, + { + "epoch": 0.6842213536309402, + "grad_norm": 1.9216480418668187, + "learning_rate": 4.789168126821689e-06, + "loss": 0.7352, + "step": 8315 + }, + { + "epoch": 0.6843036412260851, + "grad_norm": 1.6813347904516318, + "learning_rate": 4.786893460106734e-06, + "loss": 0.7242, + "step": 8316 + }, + { + "epoch": 0.6843859288212302, + "grad_norm": 2.088928497013939, + "learning_rate": 4.784619163722527e-06, + "loss": 0.7393, + "step": 8317 + }, + { + "epoch": 0.6844682164163752, + "grad_norm": 3.744614745855123, + "learning_rate": 4.7823452378306365e-06, + "loss": 0.7114, + "step": 8318 + }, + { + "epoch": 0.6845505040115203, + "grad_norm": 2.190432800667934, + "learning_rate": 4.7800716825925976e-06, + "loss": 0.7468, + "step": 8319 + }, + { + "epoch": 0.6846327916066653, + "grad_norm": 2.222581357687478, + "learning_rate": 4.777798498169917e-06, + "loss": 0.7364, + "step": 8320 + }, + { + "epoch": 0.6847150792018103, + "grad_norm": 2.275832284163188, + "learning_rate": 4.775525684724076e-06, + "loss": 0.708, + "step": 8321 + }, + { + "epoch": 0.6847973667969554, + "grad_norm": 4.236301609977853, + "learning_rate": 4.773253242416538e-06, + "loss": 0.721, + "step": 8322 + }, + { + "epoch": 0.6848796543921004, + "grad_norm": 2.2848590699551092, + "learning_rate": 4.770981171408728e-06, + "loss": 0.7144, + "step": 8323 + }, + { + "epoch": 0.6849619419872455, + "grad_norm": 1.689773734613047, + "learning_rate": 4.768709471862057e-06, + "loss": 0.7041, + "step": 8324 + }, + { + "epoch": 0.6850442295823904, + "grad_norm": 1.8382920904547055, + "learning_rate": 4.766438143937898e-06, + "loss": 0.6911, + "step": 8325 + }, + { + "epoch": 0.6851265171775355, + "grad_norm": 1.9887454046147972, + "learning_rate": 4.764167187797604e-06, + "loss": 0.7075, + "step": 8326 + }, + { + "epoch": 0.6852088047726805, + "grad_norm": 1.636418716713073, + "learning_rate": 4.761896603602495e-06, + "loss": 0.7068, + "step": 8327 + }, + { + "epoch": 0.6852910923678256, + "grad_norm": 3.1060059658692496, + "learning_rate": 4.759626391513879e-06, + "loss": 0.7058, + "step": 8328 + }, + { + "epoch": 0.6853733799629705, + "grad_norm": 2.1181520791747444, + "learning_rate": 4.757356551693019e-06, + "loss": 0.7307, + "step": 8329 + }, + { + "epoch": 0.6854556675581156, + "grad_norm": 2.0218357520316736, + "learning_rate": 4.755087084301169e-06, + "loss": 0.7098, + "step": 8330 + }, + { + "epoch": 0.6855379551532607, + "grad_norm": 1.8592170116229325, + "learning_rate": 4.752817989499544e-06, + "loss": 0.7099, + "step": 8331 + }, + { + "epoch": 0.6856202427484057, + "grad_norm": 1.8204627538119547, + "learning_rate": 4.750549267449338e-06, + "loss": 0.7225, + "step": 8332 + }, + { + "epoch": 0.6857025303435507, + "grad_norm": 0.42359756758554656, + "learning_rate": 4.748280918311713e-06, + "loss": 0.4719, + "step": 8333 + }, + { + "epoch": 0.6857848179386957, + "grad_norm": 2.9220362273822995, + "learning_rate": 4.746012942247816e-06, + "loss": 0.7437, + "step": 8334 + }, + { + "epoch": 0.6858671055338408, + "grad_norm": 2.457399494876367, + "learning_rate": 4.743745339418754e-06, + "loss": 0.6977, + "step": 8335 + }, + { + "epoch": 0.6859493931289858, + "grad_norm": 2.1905293689783902, + "learning_rate": 4.74147810998562e-06, + "loss": 0.7181, + "step": 8336 + }, + { + "epoch": 0.6860316807241308, + "grad_norm": 0.43444464894373475, + "learning_rate": 4.739211254109473e-06, + "loss": 0.5104, + "step": 8337 + }, + { + "epoch": 0.6861139683192758, + "grad_norm": 1.7293996419702446, + "learning_rate": 4.736944771951343e-06, + "loss": 0.7149, + "step": 8338 + }, + { + "epoch": 0.6861962559144209, + "grad_norm": 2.2049870567125662, + "learning_rate": 4.734678663672237e-06, + "loss": 0.7118, + "step": 8339 + }, + { + "epoch": 0.686278543509566, + "grad_norm": 1.5765807097931024, + "learning_rate": 4.732412929433141e-06, + "loss": 0.7203, + "step": 8340 + }, + { + "epoch": 0.6863608311047109, + "grad_norm": 0.4221767331282806, + "learning_rate": 4.730147569395003e-06, + "loss": 0.5195, + "step": 8341 + }, + { + "epoch": 0.686443118699856, + "grad_norm": 2.407190275950566, + "learning_rate": 4.727882583718757e-06, + "loss": 0.7184, + "step": 8342 + }, + { + "epoch": 0.686525406295001, + "grad_norm": 2.1436007542669926, + "learning_rate": 4.725617972565301e-06, + "loss": 0.7362, + "step": 8343 + }, + { + "epoch": 0.6866076938901461, + "grad_norm": 2.124019328552479, + "learning_rate": 4.723353736095509e-06, + "loss": 0.7032, + "step": 8344 + }, + { + "epoch": 0.6866899814852911, + "grad_norm": 1.9188917752466592, + "learning_rate": 4.721089874470226e-06, + "loss": 0.7364, + "step": 8345 + }, + { + "epoch": 0.6867722690804361, + "grad_norm": 2.5776933380333324, + "learning_rate": 4.7188263878502785e-06, + "loss": 0.7054, + "step": 8346 + }, + { + "epoch": 0.6868545566755812, + "grad_norm": 2.37030045131587, + "learning_rate": 4.7165632763964555e-06, + "loss": 0.7235, + "step": 8347 + }, + { + "epoch": 0.6869368442707262, + "grad_norm": 0.41647968844662103, + "learning_rate": 4.714300540269531e-06, + "loss": 0.468, + "step": 8348 + }, + { + "epoch": 0.6870191318658713, + "grad_norm": 2.1299569102572486, + "learning_rate": 4.712038179630243e-06, + "loss": 0.7506, + "step": 8349 + }, + { + "epoch": 0.6871014194610162, + "grad_norm": 0.41123783389335344, + "learning_rate": 4.709776194639306e-06, + "loss": 0.4773, + "step": 8350 + }, + { + "epoch": 0.6871837070561613, + "grad_norm": 2.3397048611490905, + "learning_rate": 4.707514585457403e-06, + "loss": 0.7156, + "step": 8351 + }, + { + "epoch": 0.6872659946513063, + "grad_norm": 2.065154057516744, + "learning_rate": 4.705253352245205e-06, + "loss": 0.6985, + "step": 8352 + }, + { + "epoch": 0.6873482822464514, + "grad_norm": 0.4345344666152636, + "learning_rate": 4.702992495163337e-06, + "loss": 0.497, + "step": 8353 + }, + { + "epoch": 0.6874305698415963, + "grad_norm": 2.03105902355216, + "learning_rate": 4.7007320143724175e-06, + "loss": 0.7251, + "step": 8354 + }, + { + "epoch": 0.6875128574367414, + "grad_norm": 0.41672109258250745, + "learning_rate": 4.698471910033015e-06, + "loss": 0.4663, + "step": 8355 + }, + { + "epoch": 0.6875951450318865, + "grad_norm": 2.1557138910403917, + "learning_rate": 4.696212182305692e-06, + "loss": 0.7331, + "step": 8356 + }, + { + "epoch": 0.6876774326270315, + "grad_norm": 1.8745916208173845, + "learning_rate": 4.693952831350969e-06, + "loss": 0.7395, + "step": 8357 + }, + { + "epoch": 0.6877597202221765, + "grad_norm": 2.1127561288796213, + "learning_rate": 4.691693857329356e-06, + "loss": 0.7402, + "step": 8358 + }, + { + "epoch": 0.6878420078173215, + "grad_norm": 2.3861625059391867, + "learning_rate": 4.689435260401317e-06, + "loss": 0.7149, + "step": 8359 + }, + { + "epoch": 0.6879242954124666, + "grad_norm": 2.048652543061431, + "learning_rate": 4.687177040727313e-06, + "loss": 0.6855, + "step": 8360 + }, + { + "epoch": 0.6880065830076116, + "grad_norm": 1.9895795621980847, + "learning_rate": 4.684919198467747e-06, + "loss": 0.7229, + "step": 8361 + }, + { + "epoch": 0.6880888706027566, + "grad_norm": 2.0483287906089007, + "learning_rate": 4.682661733783025e-06, + "loss": 0.738, + "step": 8362 + }, + { + "epoch": 0.6881711581979016, + "grad_norm": 1.9221697556552464, + "learning_rate": 4.680404646833506e-06, + "loss": 0.743, + "step": 8363 + }, + { + "epoch": 0.6882534457930467, + "grad_norm": 1.9931372209572782, + "learning_rate": 4.678147937779537e-06, + "loss": 0.7209, + "step": 8364 + }, + { + "epoch": 0.6883357333881918, + "grad_norm": 1.9882133088116098, + "learning_rate": 4.675891606781424e-06, + "loss": 0.6966, + "step": 8365 + }, + { + "epoch": 0.6884180209833367, + "grad_norm": 2.497561484301833, + "learning_rate": 4.673635653999463e-06, + "loss": 0.7147, + "step": 8366 + }, + { + "epoch": 0.6885003085784818, + "grad_norm": 0.4410451998728743, + "learning_rate": 4.6713800795939015e-06, + "loss": 0.4971, + "step": 8367 + }, + { + "epoch": 0.6885825961736268, + "grad_norm": 2.196389804711583, + "learning_rate": 4.669124883724979e-06, + "loss": 0.7249, + "step": 8368 + }, + { + "epoch": 0.6886648837687719, + "grad_norm": 1.9079138420072408, + "learning_rate": 4.666870066552896e-06, + "loss": 0.7217, + "step": 8369 + }, + { + "epoch": 0.6887471713639169, + "grad_norm": 2.028643864322441, + "learning_rate": 4.664615628237839e-06, + "loss": 0.7181, + "step": 8370 + }, + { + "epoch": 0.6888294589590619, + "grad_norm": 2.198257456583837, + "learning_rate": 4.662361568939955e-06, + "loss": 0.73, + "step": 8371 + }, + { + "epoch": 0.688911746554207, + "grad_norm": 2.2338894746184152, + "learning_rate": 4.660107888819368e-06, + "loss": 0.7323, + "step": 8372 + }, + { + "epoch": 0.688994034149352, + "grad_norm": 0.4075138594979353, + "learning_rate": 4.657854588036173e-06, + "loss": 0.4845, + "step": 8373 + }, + { + "epoch": 0.6890763217444971, + "grad_norm": 2.232035325122152, + "learning_rate": 4.655601666750449e-06, + "loss": 0.7196, + "step": 8374 + }, + { + "epoch": 0.689158609339642, + "grad_norm": 0.39796264861524644, + "learning_rate": 4.653349125122232e-06, + "loss": 0.4505, + "step": 8375 + }, + { + "epoch": 0.6892408969347871, + "grad_norm": 1.7271077910903503, + "learning_rate": 4.651096963311546e-06, + "loss": 0.7278, + "step": 8376 + }, + { + "epoch": 0.6893231845299321, + "grad_norm": 2.009603859715638, + "learning_rate": 4.648845181478376e-06, + "loss": 0.722, + "step": 8377 + }, + { + "epoch": 0.6894054721250772, + "grad_norm": 0.4244460704339306, + "learning_rate": 4.646593779782687e-06, + "loss": 0.4939, + "step": 8378 + }, + { + "epoch": 0.6894877597202221, + "grad_norm": 1.8656320015143577, + "learning_rate": 4.6443427583844115e-06, + "loss": 0.7205, + "step": 8379 + }, + { + "epoch": 0.6895700473153672, + "grad_norm": 3.2288015693579406, + "learning_rate": 4.642092117443464e-06, + "loss": 0.7398, + "step": 8380 + }, + { + "epoch": 0.6896523349105123, + "grad_norm": 0.4388782580017985, + "learning_rate": 4.639841857119721e-06, + "loss": 0.4875, + "step": 8381 + }, + { + "epoch": 0.6897346225056573, + "grad_norm": 2.087678554465908, + "learning_rate": 4.6375919775730435e-06, + "loss": 0.7259, + "step": 8382 + }, + { + "epoch": 0.6898169101008023, + "grad_norm": 2.8002121918762635, + "learning_rate": 4.6353424789632565e-06, + "loss": 0.7189, + "step": 8383 + }, + { + "epoch": 0.6898991976959473, + "grad_norm": 2.0847551991138564, + "learning_rate": 4.63309336145016e-06, + "loss": 0.7312, + "step": 8384 + }, + { + "epoch": 0.6899814852910924, + "grad_norm": 1.8864098885250762, + "learning_rate": 4.6308446251935235e-06, + "loss": 0.7064, + "step": 8385 + }, + { + "epoch": 0.6900637728862374, + "grad_norm": 2.562200071005781, + "learning_rate": 4.628596270353103e-06, + "loss": 0.7001, + "step": 8386 + }, + { + "epoch": 0.6901460604813824, + "grad_norm": 1.6734810920434113, + "learning_rate": 4.626348297088608e-06, + "loss": 0.7211, + "step": 8387 + }, + { + "epoch": 0.6902283480765274, + "grad_norm": 2.6768319163992107, + "learning_rate": 4.624100705559742e-06, + "loss": 0.722, + "step": 8388 + }, + { + "epoch": 0.6903106356716725, + "grad_norm": 2.099052707022342, + "learning_rate": 4.621853495926163e-06, + "loss": 0.7101, + "step": 8389 + }, + { + "epoch": 0.6903929232668176, + "grad_norm": 0.4245307248802644, + "learning_rate": 4.61960666834751e-06, + "loss": 0.4826, + "step": 8390 + }, + { + "epoch": 0.6904752108619625, + "grad_norm": 1.9200948941619975, + "learning_rate": 4.617360222983392e-06, + "loss": 0.7104, + "step": 8391 + }, + { + "epoch": 0.6905574984571076, + "grad_norm": 1.7561010593881596, + "learning_rate": 4.615114159993399e-06, + "loss": 0.7277, + "step": 8392 + }, + { + "epoch": 0.6906397860522526, + "grad_norm": 2.6184706931396824, + "learning_rate": 4.612868479537081e-06, + "loss": 0.6949, + "step": 8393 + }, + { + "epoch": 0.6907220736473977, + "grad_norm": 2.8053440131855996, + "learning_rate": 4.6106231817739735e-06, + "loss": 0.7222, + "step": 8394 + }, + { + "epoch": 0.6908043612425427, + "grad_norm": 1.7189965070812379, + "learning_rate": 4.608378266863576e-06, + "loss": 0.73, + "step": 8395 + }, + { + "epoch": 0.6908866488376877, + "grad_norm": 2.527090547715466, + "learning_rate": 4.606133734965365e-06, + "loss": 0.7227, + "step": 8396 + }, + { + "epoch": 0.6909689364328327, + "grad_norm": 2.4138793573573833, + "learning_rate": 4.603889586238782e-06, + "loss": 0.7036, + "step": 8397 + }, + { + "epoch": 0.6910512240279778, + "grad_norm": 1.6365540008133976, + "learning_rate": 4.601645820843257e-06, + "loss": 0.699, + "step": 8398 + }, + { + "epoch": 0.6911335116231229, + "grad_norm": 2.640213189804637, + "learning_rate": 4.599402438938176e-06, + "loss": 0.735, + "step": 8399 + }, + { + "epoch": 0.6912157992182678, + "grad_norm": 2.712874273602302, + "learning_rate": 4.597159440682914e-06, + "loss": 0.7253, + "step": 8400 + }, + { + "epoch": 0.6912980868134129, + "grad_norm": 2.9307779294694343, + "learning_rate": 4.594916826236803e-06, + "loss": 0.7405, + "step": 8401 + }, + { + "epoch": 0.6913803744085579, + "grad_norm": 2.6527645464396974, + "learning_rate": 4.592674595759157e-06, + "loss": 0.7367, + "step": 8402 + }, + { + "epoch": 0.691462662003703, + "grad_norm": 2.0194467166651724, + "learning_rate": 4.590432749409256e-06, + "loss": 0.7017, + "step": 8403 + }, + { + "epoch": 0.6915449495988479, + "grad_norm": 1.886857776559483, + "learning_rate": 4.588191287346365e-06, + "loss": 0.7057, + "step": 8404 + }, + { + "epoch": 0.691627237193993, + "grad_norm": 2.525616294826516, + "learning_rate": 4.585950209729707e-06, + "loss": 0.7498, + "step": 8405 + }, + { + "epoch": 0.691709524789138, + "grad_norm": 2.0420802732986307, + "learning_rate": 4.583709516718491e-06, + "loss": 0.6933, + "step": 8406 + }, + { + "epoch": 0.6917918123842831, + "grad_norm": 3.3802777653990215, + "learning_rate": 4.581469208471888e-06, + "loss": 0.7166, + "step": 8407 + }, + { + "epoch": 0.6918740999794281, + "grad_norm": 2.3888597520776744, + "learning_rate": 4.579229285149047e-06, + "loss": 0.7092, + "step": 8408 + }, + { + "epoch": 0.6919563875745731, + "grad_norm": 2.5688805697577637, + "learning_rate": 4.576989746909085e-06, + "loss": 0.7309, + "step": 8409 + }, + { + "epoch": 0.6920386751697182, + "grad_norm": 3.801789499589322, + "learning_rate": 4.574750593911102e-06, + "loss": 0.7011, + "step": 8410 + }, + { + "epoch": 0.6921209627648632, + "grad_norm": 2.8245772471144686, + "learning_rate": 4.572511826314158e-06, + "loss": 0.7474, + "step": 8411 + }, + { + "epoch": 0.6922032503600082, + "grad_norm": 2.0129308719755667, + "learning_rate": 4.5702734442772965e-06, + "loss": 0.7174, + "step": 8412 + }, + { + "epoch": 0.6922855379551532, + "grad_norm": 1.8893849719589333, + "learning_rate": 4.568035447959525e-06, + "loss": 0.6983, + "step": 8413 + }, + { + "epoch": 0.6923678255502983, + "grad_norm": 2.043021715463447, + "learning_rate": 4.56579783751983e-06, + "loss": 0.7399, + "step": 8414 + }, + { + "epoch": 0.6924501131454434, + "grad_norm": 2.860110123766533, + "learning_rate": 4.5635606131171616e-06, + "loss": 0.7028, + "step": 8415 + }, + { + "epoch": 0.6925324007405883, + "grad_norm": 1.7935753235151781, + "learning_rate": 4.561323774910457e-06, + "loss": 0.7542, + "step": 8416 + }, + { + "epoch": 0.6926146883357334, + "grad_norm": 2.3315097510323763, + "learning_rate": 4.559087323058609e-06, + "loss": 0.7231, + "step": 8417 + }, + { + "epoch": 0.6926969759308784, + "grad_norm": 2.5778834727501643, + "learning_rate": 4.556851257720499e-06, + "loss": 0.7407, + "step": 8418 + }, + { + "epoch": 0.6927792635260235, + "grad_norm": 2.4571328916968542, + "learning_rate": 4.554615579054973e-06, + "loss": 0.7365, + "step": 8419 + }, + { + "epoch": 0.6928615511211684, + "grad_norm": 1.9955059737133733, + "learning_rate": 4.552380287220845e-06, + "loss": 0.7189, + "step": 8420 + }, + { + "epoch": 0.6929438387163135, + "grad_norm": 2.144549488174213, + "learning_rate": 4.550145382376908e-06, + "loss": 0.7204, + "step": 8421 + }, + { + "epoch": 0.6930261263114585, + "grad_norm": 2.0045905274143205, + "learning_rate": 4.547910864681929e-06, + "loss": 0.7315, + "step": 8422 + }, + { + "epoch": 0.6931084139066036, + "grad_norm": 2.3305941653095856, + "learning_rate": 4.5456767342946405e-06, + "loss": 0.7321, + "step": 8423 + }, + { + "epoch": 0.6931907015017487, + "grad_norm": 2.414663002272141, + "learning_rate": 4.543442991373761e-06, + "loss": 0.7157, + "step": 8424 + }, + { + "epoch": 0.6932729890968936, + "grad_norm": 2.3855104309396706, + "learning_rate": 4.541209636077957e-06, + "loss": 0.7194, + "step": 8425 + }, + { + "epoch": 0.6933552766920387, + "grad_norm": 2.85048688100238, + "learning_rate": 4.538976668565894e-06, + "loss": 0.706, + "step": 8426 + }, + { + "epoch": 0.6934375642871837, + "grad_norm": 2.2633018658294466, + "learning_rate": 4.536744088996192e-06, + "loss": 0.7517, + "step": 8427 + }, + { + "epoch": 0.6935198518823288, + "grad_norm": 1.9997075515866458, + "learning_rate": 4.534511897527456e-06, + "loss": 0.726, + "step": 8428 + }, + { + "epoch": 0.6936021394774737, + "grad_norm": 1.7652642355519952, + "learning_rate": 4.532280094318251e-06, + "loss": 0.7123, + "step": 8429 + }, + { + "epoch": 0.6936844270726188, + "grad_norm": 2.4319355364554767, + "learning_rate": 4.5300486795271305e-06, + "loss": 0.7299, + "step": 8430 + }, + { + "epoch": 0.6937667146677639, + "grad_norm": 2.141613803432424, + "learning_rate": 4.527817653312596e-06, + "loss": 0.7021, + "step": 8431 + }, + { + "epoch": 0.6938490022629089, + "grad_norm": 2.039548165906424, + "learning_rate": 4.525587015833148e-06, + "loss": 0.738, + "step": 8432 + }, + { + "epoch": 0.6939312898580539, + "grad_norm": 1.7224813421551204, + "learning_rate": 4.52335676724724e-06, + "loss": 0.7078, + "step": 8433 + }, + { + "epoch": 0.6940135774531989, + "grad_norm": 0.43229433235862913, + "learning_rate": 4.521126907713313e-06, + "loss": 0.4909, + "step": 8434 + }, + { + "epoch": 0.694095865048344, + "grad_norm": 0.4285037895321895, + "learning_rate": 4.518897437389766e-06, + "loss": 0.5029, + "step": 8435 + }, + { + "epoch": 0.694178152643489, + "grad_norm": 2.2783667790606073, + "learning_rate": 4.516668356434981e-06, + "loss": 0.711, + "step": 8436 + }, + { + "epoch": 0.694260440238634, + "grad_norm": 1.686080479692274, + "learning_rate": 4.514439665007303e-06, + "loss": 0.6977, + "step": 8437 + }, + { + "epoch": 0.694342727833779, + "grad_norm": 0.4232559031106983, + "learning_rate": 4.5122113632650624e-06, + "loss": 0.4965, + "step": 8438 + }, + { + "epoch": 0.6944250154289241, + "grad_norm": 2.2235041450503363, + "learning_rate": 4.509983451366545e-06, + "loss": 0.727, + "step": 8439 + }, + { + "epoch": 0.6945073030240692, + "grad_norm": 2.0466255890386695, + "learning_rate": 4.507755929470029e-06, + "loss": 0.7019, + "step": 8440 + }, + { + "epoch": 0.6945895906192141, + "grad_norm": 6.73797568678057, + "learning_rate": 4.505528797733747e-06, + "loss": 0.7362, + "step": 8441 + }, + { + "epoch": 0.6946718782143592, + "grad_norm": 1.851598906543045, + "learning_rate": 4.5033020563159115e-06, + "loss": 0.7133, + "step": 8442 + }, + { + "epoch": 0.6947541658095042, + "grad_norm": 3.2320496115084776, + "learning_rate": 4.501075705374705e-06, + "loss": 0.6963, + "step": 8443 + }, + { + "epoch": 0.6948364534046493, + "grad_norm": 1.9338645937714154, + "learning_rate": 4.498849745068289e-06, + "loss": 0.7157, + "step": 8444 + }, + { + "epoch": 0.6949187409997942, + "grad_norm": 1.8219384890291233, + "learning_rate": 4.496624175554785e-06, + "loss": 0.7038, + "step": 8445 + }, + { + "epoch": 0.6950010285949393, + "grad_norm": 0.4123978154421038, + "learning_rate": 4.494398996992303e-06, + "loss": 0.4864, + "step": 8446 + }, + { + "epoch": 0.6950833161900843, + "grad_norm": 1.8414261544657247, + "learning_rate": 4.492174209538911e-06, + "loss": 0.7331, + "step": 8447 + }, + { + "epoch": 0.6951656037852294, + "grad_norm": 0.42548067067669704, + "learning_rate": 4.489949813352654e-06, + "loss": 0.4826, + "step": 8448 + }, + { + "epoch": 0.6952478913803745, + "grad_norm": 2.1263556428599553, + "learning_rate": 4.487725808591547e-06, + "loss": 0.7101, + "step": 8449 + }, + { + "epoch": 0.6953301789755194, + "grad_norm": 2.143845747451078, + "learning_rate": 4.485502195413587e-06, + "loss": 0.7718, + "step": 8450 + }, + { + "epoch": 0.6954124665706645, + "grad_norm": 2.3298849874678944, + "learning_rate": 4.483278973976728e-06, + "loss": 0.7414, + "step": 8451 + }, + { + "epoch": 0.6954947541658095, + "grad_norm": 2.1279596014710473, + "learning_rate": 4.4810561444389125e-06, + "loss": 0.761, + "step": 8452 + }, + { + "epoch": 0.6955770417609546, + "grad_norm": 4.758418764095306, + "learning_rate": 4.478833706958041e-06, + "loss": 0.7097, + "step": 8453 + }, + { + "epoch": 0.6956593293560995, + "grad_norm": 2.5794725698652456, + "learning_rate": 4.476611661691993e-06, + "loss": 0.7104, + "step": 8454 + }, + { + "epoch": 0.6957416169512446, + "grad_norm": 2.153664360004114, + "learning_rate": 4.474390008798617e-06, + "loss": 0.7376, + "step": 8455 + }, + { + "epoch": 0.6958239045463896, + "grad_norm": 3.1063538366985837, + "learning_rate": 4.47216874843574e-06, + "loss": 0.754, + "step": 8456 + }, + { + "epoch": 0.6959061921415347, + "grad_norm": 2.220297569813015, + "learning_rate": 4.469947880761152e-06, + "loss": 0.7288, + "step": 8457 + }, + { + "epoch": 0.6959884797366797, + "grad_norm": 2.1906570833988797, + "learning_rate": 4.467727405932626e-06, + "loss": 0.7345, + "step": 8458 + }, + { + "epoch": 0.6960707673318247, + "grad_norm": 2.1993416141627664, + "learning_rate": 4.465507324107897e-06, + "loss": 0.7227, + "step": 8459 + }, + { + "epoch": 0.6961530549269698, + "grad_norm": 1.8483593499288982, + "learning_rate": 4.463287635444677e-06, + "loss": 0.7257, + "step": 8460 + }, + { + "epoch": 0.6962353425221148, + "grad_norm": 1.804279569338293, + "learning_rate": 4.461068340100645e-06, + "loss": 0.7007, + "step": 8461 + }, + { + "epoch": 0.6963176301172598, + "grad_norm": 2.9380103533820634, + "learning_rate": 4.458849438233464e-06, + "loss": 0.7119, + "step": 8462 + }, + { + "epoch": 0.6963999177124048, + "grad_norm": 4.934236979254306, + "learning_rate": 4.456630930000753e-06, + "loss": 0.7565, + "step": 8463 + }, + { + "epoch": 0.6964822053075499, + "grad_norm": 1.9986767574607576, + "learning_rate": 4.45441281556012e-06, + "loss": 0.7291, + "step": 8464 + }, + { + "epoch": 0.696564492902695, + "grad_norm": 2.203519094567773, + "learning_rate": 4.45219509506913e-06, + "loss": 0.7394, + "step": 8465 + }, + { + "epoch": 0.6966467804978399, + "grad_norm": 0.418641412385769, + "learning_rate": 4.449977768685329e-06, + "loss": 0.491, + "step": 8466 + }, + { + "epoch": 0.696729068092985, + "grad_norm": 2.140078943464689, + "learning_rate": 4.447760836566227e-06, + "loss": 0.7277, + "step": 8467 + }, + { + "epoch": 0.69681135568813, + "grad_norm": 0.4105479277301949, + "learning_rate": 4.4455442988693205e-06, + "loss": 0.471, + "step": 8468 + }, + { + "epoch": 0.6968936432832751, + "grad_norm": 2.1644598025637327, + "learning_rate": 4.44332815575206e-06, + "loss": 0.7289, + "step": 8469 + }, + { + "epoch": 0.69697593087842, + "grad_norm": 1.722097514873939, + "learning_rate": 4.441112407371883e-06, + "loss": 0.7236, + "step": 8470 + }, + { + "epoch": 0.6970582184735651, + "grad_norm": 0.4223790152310961, + "learning_rate": 4.438897053886193e-06, + "loss": 0.4943, + "step": 8471 + }, + { + "epoch": 0.6971405060687101, + "grad_norm": 2.266627086007987, + "learning_rate": 4.436682095452361e-06, + "loss": 0.7167, + "step": 8472 + }, + { + "epoch": 0.6972227936638552, + "grad_norm": 2.587977572926307, + "learning_rate": 4.434467532227732e-06, + "loss": 0.7088, + "step": 8473 + }, + { + "epoch": 0.6973050812590003, + "grad_norm": 2.6290006377376947, + "learning_rate": 4.432253364369633e-06, + "loss": 0.7181, + "step": 8474 + }, + { + "epoch": 0.6973873688541452, + "grad_norm": 1.8921742769006984, + "learning_rate": 4.430039592035347e-06, + "loss": 0.7287, + "step": 8475 + }, + { + "epoch": 0.6974696564492903, + "grad_norm": 0.4565906605256509, + "learning_rate": 4.427826215382144e-06, + "loss": 0.4777, + "step": 8476 + }, + { + "epoch": 0.6975519440444353, + "grad_norm": 1.8508815911839795, + "learning_rate": 4.425613234567255e-06, + "loss": 0.7275, + "step": 8477 + }, + { + "epoch": 0.6976342316395804, + "grad_norm": 2.0681514367906524, + "learning_rate": 4.423400649747888e-06, + "loss": 0.7151, + "step": 8478 + }, + { + "epoch": 0.6977165192347253, + "grad_norm": 2.131438442942258, + "learning_rate": 4.421188461081215e-06, + "loss": 0.7201, + "step": 8479 + }, + { + "epoch": 0.6977988068298704, + "grad_norm": 1.7294699164838934, + "learning_rate": 4.418976668724396e-06, + "loss": 0.7329, + "step": 8480 + }, + { + "epoch": 0.6978810944250154, + "grad_norm": 1.7725016474659558, + "learning_rate": 4.416765272834546e-06, + "loss": 0.7032, + "step": 8481 + }, + { + "epoch": 0.6979633820201605, + "grad_norm": 1.6297289070242948, + "learning_rate": 4.414554273568765e-06, + "loss": 0.7055, + "step": 8482 + }, + { + "epoch": 0.6980456696153055, + "grad_norm": 1.7729235085791584, + "learning_rate": 4.412343671084116e-06, + "loss": 0.7249, + "step": 8483 + }, + { + "epoch": 0.6981279572104505, + "grad_norm": 0.4145088964981422, + "learning_rate": 4.410133465537633e-06, + "loss": 0.48, + "step": 8484 + }, + { + "epoch": 0.6982102448055956, + "grad_norm": 2.5866959723899687, + "learning_rate": 4.4079236570863325e-06, + "loss": 0.7157, + "step": 8485 + }, + { + "epoch": 0.6982925324007406, + "grad_norm": 2.2521652886102346, + "learning_rate": 4.405714245887188e-06, + "loss": 0.7134, + "step": 8486 + }, + { + "epoch": 0.6983748199958856, + "grad_norm": 2.4710190193428714, + "learning_rate": 4.40350523209716e-06, + "loss": 0.7129, + "step": 8487 + }, + { + "epoch": 0.6984571075910306, + "grad_norm": 2.2095892801097143, + "learning_rate": 4.401296615873172e-06, + "loss": 0.7291, + "step": 8488 + }, + { + "epoch": 0.6985393951861757, + "grad_norm": 2.0020991963035777, + "learning_rate": 4.399088397372118e-06, + "loss": 0.7221, + "step": 8489 + }, + { + "epoch": 0.6986216827813208, + "grad_norm": 2.4293075019247663, + "learning_rate": 4.396880576750862e-06, + "loss": 0.6868, + "step": 8490 + }, + { + "epoch": 0.6987039703764657, + "grad_norm": 0.433710332649285, + "learning_rate": 4.394673154166253e-06, + "loss": 0.4734, + "step": 8491 + }, + { + "epoch": 0.6987862579716108, + "grad_norm": 1.8076310665867534, + "learning_rate": 4.392466129775096e-06, + "loss": 0.7154, + "step": 8492 + }, + { + "epoch": 0.6988685455667558, + "grad_norm": 2.4852402271123153, + "learning_rate": 4.39025950373418e-06, + "loss": 0.7295, + "step": 8493 + }, + { + "epoch": 0.6989508331619009, + "grad_norm": 2.0541635359413397, + "learning_rate": 4.388053276200258e-06, + "loss": 0.6933, + "step": 8494 + }, + { + "epoch": 0.6990331207570458, + "grad_norm": 2.020671340890777, + "learning_rate": 4.385847447330056e-06, + "loss": 0.7184, + "step": 8495 + }, + { + "epoch": 0.6991154083521909, + "grad_norm": 1.9150793804578332, + "learning_rate": 4.383642017280269e-06, + "loss": 0.7102, + "step": 8496 + }, + { + "epoch": 0.6991976959473359, + "grad_norm": 0.416409120613045, + "learning_rate": 4.381436986207574e-06, + "loss": 0.4835, + "step": 8497 + }, + { + "epoch": 0.699279983542481, + "grad_norm": 1.8150693306887458, + "learning_rate": 4.3792323542686075e-06, + "loss": 0.7183, + "step": 8498 + }, + { + "epoch": 0.6993622711376261, + "grad_norm": 2.161752122644772, + "learning_rate": 4.377028121619989e-06, + "loss": 0.725, + "step": 8499 + }, + { + "epoch": 0.699444558732771, + "grad_norm": 2.284157364094333, + "learning_rate": 4.3748242884183005e-06, + "loss": 0.7463, + "step": 8500 + }, + { + "epoch": 0.6995268463279161, + "grad_norm": 1.8091130970020333, + "learning_rate": 4.372620854820096e-06, + "loss": 0.7322, + "step": 8501 + }, + { + "epoch": 0.6996091339230611, + "grad_norm": 2.850061893675816, + "learning_rate": 4.3704178209819055e-06, + "loss": 0.7252, + "step": 8502 + }, + { + "epoch": 0.6996914215182062, + "grad_norm": 0.4282024834096784, + "learning_rate": 4.368215187060232e-06, + "loss": 0.4857, + "step": 8503 + }, + { + "epoch": 0.6997737091133511, + "grad_norm": 2.444920862974949, + "learning_rate": 4.366012953211542e-06, + "loss": 0.7158, + "step": 8504 + }, + { + "epoch": 0.6998559967084962, + "grad_norm": 0.4167054632054993, + "learning_rate": 4.3638111195922885e-06, + "loss": 0.4503, + "step": 8505 + }, + { + "epoch": 0.6999382843036412, + "grad_norm": 2.5574981497717855, + "learning_rate": 4.361609686358872e-06, + "loss": 0.7216, + "step": 8506 + }, + { + "epoch": 0.7000205718987863, + "grad_norm": 1.8309305101196673, + "learning_rate": 4.359408653667689e-06, + "loss": 0.7106, + "step": 8507 + }, + { + "epoch": 0.7001028594939313, + "grad_norm": 2.5349117685370324, + "learning_rate": 4.357208021675093e-06, + "loss": 0.7293, + "step": 8508 + }, + { + "epoch": 0.7001851470890763, + "grad_norm": 1.8808184980825031, + "learning_rate": 4.3550077905374165e-06, + "loss": 0.7248, + "step": 8509 + }, + { + "epoch": 0.7002674346842214, + "grad_norm": 2.3337115710346903, + "learning_rate": 4.352807960410955e-06, + "loss": 0.7205, + "step": 8510 + }, + { + "epoch": 0.7003497222793664, + "grad_norm": 1.9346753827438454, + "learning_rate": 4.350608531451993e-06, + "loss": 0.7425, + "step": 8511 + }, + { + "epoch": 0.7004320098745114, + "grad_norm": 1.5702606948651023, + "learning_rate": 4.348409503816758e-06, + "loss": 0.7105, + "step": 8512 + }, + { + "epoch": 0.7005142974696564, + "grad_norm": 2.095068106725747, + "learning_rate": 4.346210877661477e-06, + "loss": 0.7158, + "step": 8513 + }, + { + "epoch": 0.7005965850648015, + "grad_norm": 0.40213985315277284, + "learning_rate": 4.344012653142331e-06, + "loss": 0.4665, + "step": 8514 + }, + { + "epoch": 0.7006788726599465, + "grad_norm": 2.0432330983431486, + "learning_rate": 4.3418148304154835e-06, + "loss": 0.712, + "step": 8515 + }, + { + "epoch": 0.7007611602550915, + "grad_norm": 3.1740773881021953, + "learning_rate": 4.3396174096370584e-06, + "loss": 0.7193, + "step": 8516 + }, + { + "epoch": 0.7008434478502366, + "grad_norm": 1.8338444972200156, + "learning_rate": 4.337420390963166e-06, + "loss": 0.7117, + "step": 8517 + }, + { + "epoch": 0.7009257354453816, + "grad_norm": 2.062011593407388, + "learning_rate": 4.335223774549867e-06, + "loss": 0.7215, + "step": 8518 + }, + { + "epoch": 0.7010080230405267, + "grad_norm": 1.897945378258996, + "learning_rate": 4.333027560553215e-06, + "loss": 0.7116, + "step": 8519 + }, + { + "epoch": 0.7010903106356716, + "grad_norm": 2.0460798894684817, + "learning_rate": 4.330831749129218e-06, + "loss": 0.7074, + "step": 8520 + }, + { + "epoch": 0.7011725982308167, + "grad_norm": 2.6064783320117817, + "learning_rate": 4.328636340433872e-06, + "loss": 0.7126, + "step": 8521 + }, + { + "epoch": 0.7012548858259617, + "grad_norm": 1.9929222525774146, + "learning_rate": 4.326441334623129e-06, + "loss": 0.7175, + "step": 8522 + }, + { + "epoch": 0.7013371734211068, + "grad_norm": 1.903681754112913, + "learning_rate": 4.32424673185292e-06, + "loss": 0.7332, + "step": 8523 + }, + { + "epoch": 0.7014194610162519, + "grad_norm": 1.8980503812551661, + "learning_rate": 4.322052532279143e-06, + "loss": 0.7147, + "step": 8524 + }, + { + "epoch": 0.7015017486113968, + "grad_norm": 2.2460979704248105, + "learning_rate": 4.319858736057677e-06, + "loss": 0.7603, + "step": 8525 + }, + { + "epoch": 0.7015840362065419, + "grad_norm": 0.453129619136031, + "learning_rate": 4.317665343344359e-06, + "loss": 0.4832, + "step": 8526 + }, + { + "epoch": 0.7016663238016869, + "grad_norm": 1.792698073710456, + "learning_rate": 4.315472354295011e-06, + "loss": 0.7361, + "step": 8527 + }, + { + "epoch": 0.701748611396832, + "grad_norm": 2.1013020039526307, + "learning_rate": 4.313279769065416e-06, + "loss": 0.7618, + "step": 8528 + }, + { + "epoch": 0.7018308989919769, + "grad_norm": 2.068121177525637, + "learning_rate": 4.3110875878113314e-06, + "loss": 0.7468, + "step": 8529 + }, + { + "epoch": 0.701913186587122, + "grad_norm": 4.088503340668273, + "learning_rate": 4.308895810688484e-06, + "loss": 0.6883, + "step": 8530 + }, + { + "epoch": 0.701995474182267, + "grad_norm": 2.2342859419984604, + "learning_rate": 4.306704437852578e-06, + "loss": 0.7284, + "step": 8531 + }, + { + "epoch": 0.7020777617774121, + "grad_norm": 2.163108934418821, + "learning_rate": 4.304513469459281e-06, + "loss": 0.7338, + "step": 8532 + }, + { + "epoch": 0.702160049372557, + "grad_norm": 2.118144661640324, + "learning_rate": 4.302322905664243e-06, + "loss": 0.7018, + "step": 8533 + }, + { + "epoch": 0.7022423369677021, + "grad_norm": 2.365916734906901, + "learning_rate": 4.300132746623074e-06, + "loss": 0.7327, + "step": 8534 + }, + { + "epoch": 0.7023246245628472, + "grad_norm": 2.0745770220068893, + "learning_rate": 4.297942992491357e-06, + "loss": 0.7149, + "step": 8535 + }, + { + "epoch": 0.7024069121579922, + "grad_norm": 1.9446762554646304, + "learning_rate": 4.295753643424648e-06, + "loss": 0.6925, + "step": 8536 + }, + { + "epoch": 0.7024891997531372, + "grad_norm": 2.10884125424283, + "learning_rate": 4.293564699578482e-06, + "loss": 0.7123, + "step": 8537 + }, + { + "epoch": 0.7025714873482822, + "grad_norm": 0.40017096597180557, + "learning_rate": 4.29137616110835e-06, + "loss": 0.4555, + "step": 8538 + }, + { + "epoch": 0.7026537749434273, + "grad_norm": 3.607260278655278, + "learning_rate": 4.289188028169728e-06, + "loss": 0.7319, + "step": 8539 + }, + { + "epoch": 0.7027360625385723, + "grad_norm": 3.1536794398353236, + "learning_rate": 4.287000300918057e-06, + "loss": 0.7113, + "step": 8540 + }, + { + "epoch": 0.7028183501337173, + "grad_norm": 1.9871522382970532, + "learning_rate": 4.284812979508748e-06, + "loss": 0.6882, + "step": 8541 + }, + { + "epoch": 0.7029006377288624, + "grad_norm": 2.3193829638920573, + "learning_rate": 4.282626064097181e-06, + "loss": 0.7258, + "step": 8542 + }, + { + "epoch": 0.7029829253240074, + "grad_norm": 1.9692210662264245, + "learning_rate": 4.2804395548387175e-06, + "loss": 0.7269, + "step": 8543 + }, + { + "epoch": 0.7030652129191525, + "grad_norm": 3.0357936622701205, + "learning_rate": 4.278253451888679e-06, + "loss": 0.7623, + "step": 8544 + }, + { + "epoch": 0.7031475005142974, + "grad_norm": 3.3452833996410267, + "learning_rate": 4.2760677554023665e-06, + "loss": 0.7573, + "step": 8545 + }, + { + "epoch": 0.7032297881094425, + "grad_norm": 2.5200225654904456, + "learning_rate": 4.273882465535047e-06, + "loss": 0.7131, + "step": 8546 + }, + { + "epoch": 0.7033120757045875, + "grad_norm": 3.480042811531191, + "learning_rate": 4.271697582441961e-06, + "loss": 0.7106, + "step": 8547 + }, + { + "epoch": 0.7033943632997326, + "grad_norm": 0.4033861888751465, + "learning_rate": 4.2695131062783124e-06, + "loss": 0.4793, + "step": 8548 + }, + { + "epoch": 0.7034766508948775, + "grad_norm": 2.566619499271985, + "learning_rate": 4.2673290371992924e-06, + "loss": 0.7137, + "step": 8549 + }, + { + "epoch": 0.7035589384900226, + "grad_norm": 2.0010743684678234, + "learning_rate": 4.265145375360046e-06, + "loss": 0.7251, + "step": 8550 + }, + { + "epoch": 0.7036412260851677, + "grad_norm": 2.0486193427577466, + "learning_rate": 4.262962120915703e-06, + "loss": 0.7389, + "step": 8551 + }, + { + "epoch": 0.7037235136803127, + "grad_norm": 2.344427305301285, + "learning_rate": 4.260779274021358e-06, + "loss": 0.7663, + "step": 8552 + }, + { + "epoch": 0.7038058012754578, + "grad_norm": 1.7204227665584613, + "learning_rate": 4.258596834832074e-06, + "loss": 0.7113, + "step": 8553 + }, + { + "epoch": 0.7038880888706027, + "grad_norm": 2.270564648236, + "learning_rate": 4.256414803502885e-06, + "loss": 0.7112, + "step": 8554 + }, + { + "epoch": 0.7039703764657478, + "grad_norm": 1.9776114909297458, + "learning_rate": 4.254233180188806e-06, + "loss": 0.6998, + "step": 8555 + }, + { + "epoch": 0.7040526640608928, + "grad_norm": 1.9874263011922337, + "learning_rate": 4.25205196504481e-06, + "loss": 0.6892, + "step": 8556 + }, + { + "epoch": 0.7041349516560379, + "grad_norm": 2.122433109236832, + "learning_rate": 4.249871158225853e-06, + "loss": 0.7042, + "step": 8557 + }, + { + "epoch": 0.7042172392511828, + "grad_norm": 1.8649650797527384, + "learning_rate": 4.2476907598868546e-06, + "loss": 0.7107, + "step": 8558 + }, + { + "epoch": 0.7042995268463279, + "grad_norm": 2.2018084174650263, + "learning_rate": 4.2455107701827035e-06, + "loss": 0.7541, + "step": 8559 + }, + { + "epoch": 0.704381814441473, + "grad_norm": 0.41976044484738645, + "learning_rate": 4.2433311892682605e-06, + "loss": 0.4833, + "step": 8560 + }, + { + "epoch": 0.704464102036618, + "grad_norm": 1.9350811374605343, + "learning_rate": 4.241152017298367e-06, + "loss": 0.7269, + "step": 8561 + }, + { + "epoch": 0.704546389631763, + "grad_norm": 1.9907011897513125, + "learning_rate": 4.238973254427822e-06, + "loss": 0.7239, + "step": 8562 + }, + { + "epoch": 0.704628677226908, + "grad_norm": 1.857235210848351, + "learning_rate": 4.236794900811406e-06, + "loss": 0.7158, + "step": 8563 + }, + { + "epoch": 0.7047109648220531, + "grad_norm": 1.9966548202439236, + "learning_rate": 4.234616956603864e-06, + "loss": 0.7258, + "step": 8564 + }, + { + "epoch": 0.7047932524171981, + "grad_norm": 1.9009824881607444, + "learning_rate": 4.232439421959913e-06, + "loss": 0.7266, + "step": 8565 + }, + { + "epoch": 0.7048755400123431, + "grad_norm": 2.24088985539903, + "learning_rate": 4.230262297034238e-06, + "loss": 0.7153, + "step": 8566 + }, + { + "epoch": 0.7049578276074882, + "grad_norm": 1.8932110157028914, + "learning_rate": 4.228085581981506e-06, + "loss": 0.7076, + "step": 8567 + }, + { + "epoch": 0.7050401152026332, + "grad_norm": 1.7429490728305783, + "learning_rate": 4.225909276956339e-06, + "loss": 0.733, + "step": 8568 + }, + { + "epoch": 0.7051224027977783, + "grad_norm": 2.3392310333761897, + "learning_rate": 4.223733382113347e-06, + "loss": 0.7145, + "step": 8569 + }, + { + "epoch": 0.7052046903929232, + "grad_norm": 2.4717057008414356, + "learning_rate": 4.221557897607097e-06, + "loss": 0.7301, + "step": 8570 + }, + { + "epoch": 0.7052869779880683, + "grad_norm": 3.0815044697427787, + "learning_rate": 4.219382823592133e-06, + "loss": 0.7155, + "step": 8571 + }, + { + "epoch": 0.7053692655832133, + "grad_norm": 1.7233073232286056, + "learning_rate": 4.217208160222965e-06, + "loss": 0.7075, + "step": 8572 + }, + { + "epoch": 0.7054515531783584, + "grad_norm": 2.0187750801783015, + "learning_rate": 4.2150339076540845e-06, + "loss": 0.7052, + "step": 8573 + }, + { + "epoch": 0.7055338407735033, + "grad_norm": 1.940943854061721, + "learning_rate": 4.212860066039941e-06, + "loss": 0.6886, + "step": 8574 + }, + { + "epoch": 0.7056161283686484, + "grad_norm": 1.9203491491232547, + "learning_rate": 4.21068663553497e-06, + "loss": 0.7152, + "step": 8575 + }, + { + "epoch": 0.7056984159637935, + "grad_norm": 1.7842382679617161, + "learning_rate": 4.2085136162935535e-06, + "loss": 0.7022, + "step": 8576 + }, + { + "epoch": 0.7057807035589385, + "grad_norm": 2.027887258914919, + "learning_rate": 4.2063410084700715e-06, + "loss": 0.7135, + "step": 8577 + }, + { + "epoch": 0.7058629911540836, + "grad_norm": 2.5503118475235373, + "learning_rate": 4.204168812218855e-06, + "loss": 0.7024, + "step": 8578 + }, + { + "epoch": 0.7059452787492285, + "grad_norm": 2.0583322731557305, + "learning_rate": 4.2019970276942215e-06, + "loss": 0.7417, + "step": 8579 + }, + { + "epoch": 0.7060275663443736, + "grad_norm": 1.950292707922437, + "learning_rate": 4.199825655050442e-06, + "loss": 0.7335, + "step": 8580 + }, + { + "epoch": 0.7061098539395186, + "grad_norm": 0.4196641980062882, + "learning_rate": 4.197654694441781e-06, + "loss": 0.475, + "step": 8581 + }, + { + "epoch": 0.7061921415346637, + "grad_norm": 1.6343226914152615, + "learning_rate": 4.195484146022442e-06, + "loss": 0.7044, + "step": 8582 + }, + { + "epoch": 0.7062744291298086, + "grad_norm": 2.090138663165931, + "learning_rate": 4.193314009946629e-06, + "loss": 0.6816, + "step": 8583 + }, + { + "epoch": 0.7063567167249537, + "grad_norm": 1.7055578181174171, + "learning_rate": 4.191144286368501e-06, + "loss": 0.7142, + "step": 8584 + }, + { + "epoch": 0.7064390043200988, + "grad_norm": 1.7766343803962614, + "learning_rate": 4.188974975442196e-06, + "loss": 0.6942, + "step": 8585 + }, + { + "epoch": 0.7065212919152438, + "grad_norm": 1.818877012587966, + "learning_rate": 4.186806077321814e-06, + "loss": 0.7327, + "step": 8586 + }, + { + "epoch": 0.7066035795103888, + "grad_norm": 1.6272761183726194, + "learning_rate": 4.184637592161433e-06, + "loss": 0.7036, + "step": 8587 + }, + { + "epoch": 0.7066858671055338, + "grad_norm": 0.4065485966395321, + "learning_rate": 4.182469520115094e-06, + "loss": 0.4786, + "step": 8588 + }, + { + "epoch": 0.7067681547006789, + "grad_norm": 1.8269545786911912, + "learning_rate": 4.180301861336819e-06, + "loss": 0.7216, + "step": 8589 + }, + { + "epoch": 0.7068504422958239, + "grad_norm": 2.6141866011250325, + "learning_rate": 4.1781346159805895e-06, + "loss": 0.7248, + "step": 8590 + }, + { + "epoch": 0.7069327298909689, + "grad_norm": 1.8860037553515292, + "learning_rate": 4.1759677842003685e-06, + "loss": 0.7025, + "step": 8591 + }, + { + "epoch": 0.707015017486114, + "grad_norm": 1.9096908384710025, + "learning_rate": 4.173801366150083e-06, + "loss": 0.7052, + "step": 8592 + }, + { + "epoch": 0.707097305081259, + "grad_norm": 1.8801636131961779, + "learning_rate": 4.171635361983631e-06, + "loss": 0.7439, + "step": 8593 + }, + { + "epoch": 0.7071795926764041, + "grad_norm": 2.1102308246451726, + "learning_rate": 4.169469771854878e-06, + "loss": 0.7193, + "step": 8594 + }, + { + "epoch": 0.707261880271549, + "grad_norm": 2.1029164465080608, + "learning_rate": 4.167304595917672e-06, + "loss": 0.7084, + "step": 8595 + }, + { + "epoch": 0.7073441678666941, + "grad_norm": 0.4195856905761755, + "learning_rate": 4.165139834325814e-06, + "loss": 0.4737, + "step": 8596 + }, + { + "epoch": 0.7074264554618391, + "grad_norm": 2.2704082579113534, + "learning_rate": 4.162975487233096e-06, + "loss": 0.7089, + "step": 8597 + }, + { + "epoch": 0.7075087430569842, + "grad_norm": 2.013562485167235, + "learning_rate": 4.1608115547932635e-06, + "loss": 0.7101, + "step": 8598 + }, + { + "epoch": 0.7075910306521291, + "grad_norm": 2.0391496553654425, + "learning_rate": 4.158648037160041e-06, + "loss": 0.723, + "step": 8599 + }, + { + "epoch": 0.7076733182472742, + "grad_norm": 1.7987875715283441, + "learning_rate": 4.1564849344871165e-06, + "loss": 0.677, + "step": 8600 + }, + { + "epoch": 0.7077556058424193, + "grad_norm": 2.4457412822215443, + "learning_rate": 4.154322246928158e-06, + "loss": 0.7181, + "step": 8601 + }, + { + "epoch": 0.7078378934375643, + "grad_norm": 2.346338098128662, + "learning_rate": 4.1521599746367965e-06, + "loss": 0.7326, + "step": 8602 + }, + { + "epoch": 0.7079201810327094, + "grad_norm": 1.8126509368625228, + "learning_rate": 4.149998117766643e-06, + "loss": 0.739, + "step": 8603 + }, + { + "epoch": 0.7080024686278543, + "grad_norm": 2.1558172121973063, + "learning_rate": 4.147836676471265e-06, + "loss": 0.7181, + "step": 8604 + }, + { + "epoch": 0.7080847562229994, + "grad_norm": 3.0169058203419756, + "learning_rate": 4.145675650904211e-06, + "loss": 0.7103, + "step": 8605 + }, + { + "epoch": 0.7081670438181444, + "grad_norm": 2.1839241723418152, + "learning_rate": 4.143515041218994e-06, + "loss": 0.7248, + "step": 8606 + }, + { + "epoch": 0.7082493314132895, + "grad_norm": 1.6988483631479012, + "learning_rate": 4.141354847569105e-06, + "loss": 0.7181, + "step": 8607 + }, + { + "epoch": 0.7083316190084344, + "grad_norm": 2.2961809966630424, + "learning_rate": 4.139195070107995e-06, + "loss": 0.7222, + "step": 8608 + }, + { + "epoch": 0.7084139066035795, + "grad_norm": 2.790296492233685, + "learning_rate": 4.137035708989098e-06, + "loss": 0.6851, + "step": 8609 + }, + { + "epoch": 0.7084961941987246, + "grad_norm": 2.4410239638310762, + "learning_rate": 4.134876764365807e-06, + "loss": 0.7477, + "step": 8610 + }, + { + "epoch": 0.7085784817938696, + "grad_norm": 1.9149433578892066, + "learning_rate": 4.132718236391491e-06, + "loss": 0.7066, + "step": 8611 + }, + { + "epoch": 0.7086607693890146, + "grad_norm": 0.42668202234142716, + "learning_rate": 4.130560125219485e-06, + "loss": 0.5009, + "step": 8612 + }, + { + "epoch": 0.7087430569841596, + "grad_norm": 2.4755424281336693, + "learning_rate": 4.128402431003104e-06, + "loss": 0.7536, + "step": 8613 + }, + { + "epoch": 0.7088253445793047, + "grad_norm": 3.6182900296671208, + "learning_rate": 4.126245153895621e-06, + "loss": 0.7399, + "step": 8614 + }, + { + "epoch": 0.7089076321744497, + "grad_norm": 2.139527549357835, + "learning_rate": 4.124088294050291e-06, + "loss": 0.721, + "step": 8615 + }, + { + "epoch": 0.7089899197695947, + "grad_norm": 2.0949321532144887, + "learning_rate": 4.121931851620332e-06, + "loss": 0.7224, + "step": 8616 + }, + { + "epoch": 0.7090722073647397, + "grad_norm": 1.840982609689361, + "learning_rate": 4.119775826758934e-06, + "loss": 0.712, + "step": 8617 + }, + { + "epoch": 0.7091544949598848, + "grad_norm": 2.2502935056066735, + "learning_rate": 4.117620219619254e-06, + "loss": 0.7254, + "step": 8618 + }, + { + "epoch": 0.7092367825550299, + "grad_norm": 1.84206387677283, + "learning_rate": 4.115465030354429e-06, + "loss": 0.7366, + "step": 8619 + }, + { + "epoch": 0.7093190701501748, + "grad_norm": 2.6642562891451487, + "learning_rate": 4.113310259117555e-06, + "loss": 0.7716, + "step": 8620 + }, + { + "epoch": 0.7094013577453199, + "grad_norm": 1.6340236964070982, + "learning_rate": 4.111155906061708e-06, + "loss": 0.7149, + "step": 8621 + }, + { + "epoch": 0.7094836453404649, + "grad_norm": 1.8169834861344099, + "learning_rate": 4.1090019713399275e-06, + "loss": 0.7531, + "step": 8622 + }, + { + "epoch": 0.70956593293561, + "grad_norm": 2.1653115304186707, + "learning_rate": 4.106848455105226e-06, + "loss": 0.712, + "step": 8623 + }, + { + "epoch": 0.7096482205307549, + "grad_norm": 0.41437932166491426, + "learning_rate": 4.104695357510583e-06, + "loss": 0.5055, + "step": 8624 + }, + { + "epoch": 0.7097305081259, + "grad_norm": 0.4185362119502973, + "learning_rate": 4.1025426787089565e-06, + "loss": 0.477, + "step": 8625 + }, + { + "epoch": 0.709812795721045, + "grad_norm": 2.2582587964648617, + "learning_rate": 4.100390418853263e-06, + "loss": 0.7253, + "step": 8626 + }, + { + "epoch": 0.7098950833161901, + "grad_norm": 1.7481927243136253, + "learning_rate": 4.098238578096402e-06, + "loss": 0.7158, + "step": 8627 + }, + { + "epoch": 0.7099773709113352, + "grad_norm": 2.1810856185791465, + "learning_rate": 4.096087156591234e-06, + "loss": 0.7163, + "step": 8628 + }, + { + "epoch": 0.7100596585064801, + "grad_norm": 2.35724369463555, + "learning_rate": 4.093936154490594e-06, + "loss": 0.7098, + "step": 8629 + }, + { + "epoch": 0.7101419461016252, + "grad_norm": 1.8971893193239884, + "learning_rate": 4.09178557194728e-06, + "loss": 0.7354, + "step": 8630 + }, + { + "epoch": 0.7102242336967702, + "grad_norm": 2.012735308162709, + "learning_rate": 4.089635409114072e-06, + "loss": 0.7557, + "step": 8631 + }, + { + "epoch": 0.7103065212919153, + "grad_norm": 2.076631034717968, + "learning_rate": 4.0874856661437116e-06, + "loss": 0.7263, + "step": 8632 + }, + { + "epoch": 0.7103888088870602, + "grad_norm": 2.114046002431131, + "learning_rate": 4.0853363431889145e-06, + "loss": 0.7272, + "step": 8633 + }, + { + "epoch": 0.7104710964822053, + "grad_norm": 2.1722383349511034, + "learning_rate": 4.083187440402366e-06, + "loss": 0.732, + "step": 8634 + }, + { + "epoch": 0.7105533840773504, + "grad_norm": 2.1407381026507353, + "learning_rate": 4.0810389579367195e-06, + "loss": 0.7308, + "step": 8635 + }, + { + "epoch": 0.7106356716724954, + "grad_norm": 1.9583999694543597, + "learning_rate": 4.0788908959445965e-06, + "loss": 0.708, + "step": 8636 + }, + { + "epoch": 0.7107179592676404, + "grad_norm": 6.372993125291939, + "learning_rate": 4.076743254578597e-06, + "loss": 0.7211, + "step": 8637 + }, + { + "epoch": 0.7108002468627854, + "grad_norm": 2.0289350604910914, + "learning_rate": 4.07459603399128e-06, + "loss": 0.7344, + "step": 8638 + }, + { + "epoch": 0.7108825344579305, + "grad_norm": 2.3964291276986884, + "learning_rate": 4.0724492343351886e-06, + "loss": 0.7106, + "step": 8639 + }, + { + "epoch": 0.7109648220530755, + "grad_norm": 0.4154102826963481, + "learning_rate": 4.070302855762824e-06, + "loss": 0.4566, + "step": 8640 + }, + { + "epoch": 0.7110471096482205, + "grad_norm": 1.986226083072746, + "learning_rate": 4.068156898426662e-06, + "loss": 0.7347, + "step": 8641 + }, + { + "epoch": 0.7111293972433655, + "grad_norm": 1.8566797142808635, + "learning_rate": 4.066011362479143e-06, + "loss": 0.718, + "step": 8642 + }, + { + "epoch": 0.7112116848385106, + "grad_norm": 1.6787974866140718, + "learning_rate": 4.0638662480726895e-06, + "loss": 0.7098, + "step": 8643 + }, + { + "epoch": 0.7112939724336557, + "grad_norm": 2.0228411726219897, + "learning_rate": 4.061721555359682e-06, + "loss": 0.7885, + "step": 8644 + }, + { + "epoch": 0.7113762600288006, + "grad_norm": 1.5609119971660987, + "learning_rate": 4.0595772844924855e-06, + "loss": 0.6785, + "step": 8645 + }, + { + "epoch": 0.7114585476239457, + "grad_norm": 2.0086032976918826, + "learning_rate": 4.057433435623411e-06, + "loss": 0.697, + "step": 8646 + }, + { + "epoch": 0.7115408352190907, + "grad_norm": 1.8728167413825663, + "learning_rate": 4.055290008904765e-06, + "loss": 0.6966, + "step": 8647 + }, + { + "epoch": 0.7116231228142358, + "grad_norm": 2.022388559258871, + "learning_rate": 4.053147004488806e-06, + "loss": 0.7601, + "step": 8648 + }, + { + "epoch": 0.7117054104093807, + "grad_norm": 2.2410997845431715, + "learning_rate": 4.051004422527777e-06, + "loss": 0.7341, + "step": 8649 + }, + { + "epoch": 0.7117876980045258, + "grad_norm": 1.9881857979067399, + "learning_rate": 4.048862263173876e-06, + "loss": 0.7334, + "step": 8650 + }, + { + "epoch": 0.7118699855996709, + "grad_norm": 0.4229754806170139, + "learning_rate": 4.04672052657929e-06, + "loss": 0.4909, + "step": 8651 + }, + { + "epoch": 0.7119522731948159, + "grad_norm": 2.476782976245682, + "learning_rate": 4.04457921289615e-06, + "loss": 0.7201, + "step": 8652 + }, + { + "epoch": 0.712034560789961, + "grad_norm": 1.9144103371017702, + "learning_rate": 4.04243832227658e-06, + "loss": 0.7103, + "step": 8653 + }, + { + "epoch": 0.7121168483851059, + "grad_norm": 1.6670521047658315, + "learning_rate": 4.040297854872663e-06, + "loss": 0.7268, + "step": 8654 + }, + { + "epoch": 0.712199135980251, + "grad_norm": 1.7431861961097102, + "learning_rate": 4.038157810836456e-06, + "loss": 0.7052, + "step": 8655 + }, + { + "epoch": 0.712281423575396, + "grad_norm": 1.6869338176110942, + "learning_rate": 4.0360181903199854e-06, + "loss": 0.7261, + "step": 8656 + }, + { + "epoch": 0.7123637111705411, + "grad_norm": 2.0462850245279762, + "learning_rate": 4.033878993475244e-06, + "loss": 0.6965, + "step": 8657 + }, + { + "epoch": 0.712445998765686, + "grad_norm": 1.9168286492468203, + "learning_rate": 4.031740220454195e-06, + "loss": 0.7094, + "step": 8658 + }, + { + "epoch": 0.7125282863608311, + "grad_norm": 3.265712486921818, + "learning_rate": 4.029601871408778e-06, + "loss": 0.7388, + "step": 8659 + }, + { + "epoch": 0.7126105739559762, + "grad_norm": 2.3425592789544156, + "learning_rate": 4.027463946490892e-06, + "loss": 0.7086, + "step": 8660 + }, + { + "epoch": 0.7126928615511212, + "grad_norm": 1.8012396353001319, + "learning_rate": 4.025326445852421e-06, + "loss": 0.7364, + "step": 8661 + }, + { + "epoch": 0.7127751491462662, + "grad_norm": 2.008637364233, + "learning_rate": 4.023189369645204e-06, + "loss": 0.706, + "step": 8662 + }, + { + "epoch": 0.7128574367414112, + "grad_norm": 2.3849702777929602, + "learning_rate": 4.021052718021055e-06, + "loss": 0.7039, + "step": 8663 + }, + { + "epoch": 0.7129397243365563, + "grad_norm": 1.9621348579593594, + "learning_rate": 4.018916491131755e-06, + "loss": 0.724, + "step": 8664 + }, + { + "epoch": 0.7130220119317013, + "grad_norm": 0.43312447623136935, + "learning_rate": 4.016780689129066e-06, + "loss": 0.486, + "step": 8665 + }, + { + "epoch": 0.7131042995268463, + "grad_norm": 1.795587072497899, + "learning_rate": 4.014645312164704e-06, + "loss": 0.7216, + "step": 8666 + }, + { + "epoch": 0.7131865871219913, + "grad_norm": 2.185000230412621, + "learning_rate": 4.012510360390372e-06, + "loss": 0.7396, + "step": 8667 + }, + { + "epoch": 0.7132688747171364, + "grad_norm": 2.0873676434849933, + "learning_rate": 4.010375833957727e-06, + "loss": 0.7356, + "step": 8668 + }, + { + "epoch": 0.7133511623122815, + "grad_norm": 2.042512650356064, + "learning_rate": 4.008241733018404e-06, + "loss": 0.7207, + "step": 8669 + }, + { + "epoch": 0.7134334499074264, + "grad_norm": 2.4226255303789945, + "learning_rate": 4.006108057724003e-06, + "loss": 0.6888, + "step": 8670 + }, + { + "epoch": 0.7135157375025715, + "grad_norm": 2.0430712979691, + "learning_rate": 4.003974808226103e-06, + "loss": 0.7237, + "step": 8671 + }, + { + "epoch": 0.7135980250977165, + "grad_norm": 2.0621398393387627, + "learning_rate": 4.00184198467624e-06, + "loss": 0.6805, + "step": 8672 + }, + { + "epoch": 0.7136803126928616, + "grad_norm": 0.4105107337318778, + "learning_rate": 3.999709587225933e-06, + "loss": 0.4934, + "step": 8673 + }, + { + "epoch": 0.7137626002880065, + "grad_norm": 2.3832561761347915, + "learning_rate": 3.997577616026661e-06, + "loss": 0.7073, + "step": 8674 + }, + { + "epoch": 0.7138448878831516, + "grad_norm": 2.142673970340435, + "learning_rate": 3.995446071229872e-06, + "loss": 0.7165, + "step": 8675 + }, + { + "epoch": 0.7139271754782966, + "grad_norm": 3.4831996423581897, + "learning_rate": 3.993314952986995e-06, + "loss": 0.7197, + "step": 8676 + }, + { + "epoch": 0.7140094630734417, + "grad_norm": 1.8033237028725022, + "learning_rate": 3.991184261449413e-06, + "loss": 0.6931, + "step": 8677 + }, + { + "epoch": 0.7140917506685867, + "grad_norm": 1.9998509660603063, + "learning_rate": 3.989053996768496e-06, + "loss": 0.6969, + "step": 8678 + }, + { + "epoch": 0.7141740382637317, + "grad_norm": 1.7729693151352874, + "learning_rate": 3.98692415909557e-06, + "loss": 0.724, + "step": 8679 + }, + { + "epoch": 0.7142563258588768, + "grad_norm": 1.9206558233215993, + "learning_rate": 3.984794748581935e-06, + "loss": 0.7463, + "step": 8680 + }, + { + "epoch": 0.7143386134540218, + "grad_norm": 2.2684351618329712, + "learning_rate": 3.982665765378857e-06, + "loss": 0.7294, + "step": 8681 + }, + { + "epoch": 0.7144209010491669, + "grad_norm": 0.41876483665751685, + "learning_rate": 3.980537209637584e-06, + "loss": 0.4766, + "step": 8682 + }, + { + "epoch": 0.7145031886443118, + "grad_norm": 1.7106828982178042, + "learning_rate": 3.978409081509318e-06, + "loss": 0.7225, + "step": 8683 + }, + { + "epoch": 0.7145854762394569, + "grad_norm": 2.4383680358863935, + "learning_rate": 3.976281381145243e-06, + "loss": 0.6929, + "step": 8684 + }, + { + "epoch": 0.714667763834602, + "grad_norm": 3.1323594204081995, + "learning_rate": 3.974154108696505e-06, + "loss": 0.7149, + "step": 8685 + }, + { + "epoch": 0.714750051429747, + "grad_norm": 2.020418661005787, + "learning_rate": 3.972027264314223e-06, + "loss": 0.7215, + "step": 8686 + }, + { + "epoch": 0.714832339024892, + "grad_norm": 1.927885772896168, + "learning_rate": 3.9699008481494806e-06, + "loss": 0.7227, + "step": 8687 + }, + { + "epoch": 0.714914626620037, + "grad_norm": 1.6126885691504718, + "learning_rate": 3.967774860353342e-06, + "loss": 0.6944, + "step": 8688 + }, + { + "epoch": 0.7149969142151821, + "grad_norm": 1.7454708145683717, + "learning_rate": 3.965649301076825e-06, + "loss": 0.7168, + "step": 8689 + }, + { + "epoch": 0.7150792018103271, + "grad_norm": 2.4491214459385153, + "learning_rate": 3.9635241704709355e-06, + "loss": 0.6989, + "step": 8690 + }, + { + "epoch": 0.7151614894054721, + "grad_norm": 2.652967691866735, + "learning_rate": 3.9613994686866355e-06, + "loss": 0.7285, + "step": 8691 + }, + { + "epoch": 0.7152437770006171, + "grad_norm": 1.9618409054533206, + "learning_rate": 3.95927519587486e-06, + "loss": 0.7423, + "step": 8692 + }, + { + "epoch": 0.7153260645957622, + "grad_norm": 1.7940140031400145, + "learning_rate": 3.9571513521865104e-06, + "loss": 0.7262, + "step": 8693 + }, + { + "epoch": 0.7154083521909073, + "grad_norm": 2.0175010642491644, + "learning_rate": 3.955027937772466e-06, + "loss": 0.7203, + "step": 8694 + }, + { + "epoch": 0.7154906397860522, + "grad_norm": 2.0512811649210327, + "learning_rate": 3.952904952783568e-06, + "loss": 0.702, + "step": 8695 + }, + { + "epoch": 0.7155729273811973, + "grad_norm": 2.992033118012045, + "learning_rate": 3.950782397370635e-06, + "loss": 0.7014, + "step": 8696 + }, + { + "epoch": 0.7156552149763423, + "grad_norm": 1.6507304203126645, + "learning_rate": 3.948660271684445e-06, + "loss": 0.7321, + "step": 8697 + }, + { + "epoch": 0.7157375025714874, + "grad_norm": 2.060858189297949, + "learning_rate": 3.9465385758757525e-06, + "loss": 0.7255, + "step": 8698 + }, + { + "epoch": 0.7158197901666323, + "grad_norm": 2.47071088986879, + "learning_rate": 3.9444173100952745e-06, + "loss": 0.689, + "step": 8699 + }, + { + "epoch": 0.7159020777617774, + "grad_norm": 2.166732501966155, + "learning_rate": 3.94229647449371e-06, + "loss": 0.6846, + "step": 8700 + }, + { + "epoch": 0.7159843653569224, + "grad_norm": 2.517448203874145, + "learning_rate": 3.940176069221713e-06, + "loss": 0.6885, + "step": 8701 + }, + { + "epoch": 0.7160666529520675, + "grad_norm": 1.5351438151628778, + "learning_rate": 3.93805609442992e-06, + "loss": 0.6988, + "step": 8702 + }, + { + "epoch": 0.7161489405472125, + "grad_norm": 1.7570268021451272, + "learning_rate": 3.9359365502689276e-06, + "loss": 0.7278, + "step": 8703 + }, + { + "epoch": 0.7162312281423575, + "grad_norm": 1.990533744524997, + "learning_rate": 3.933817436889304e-06, + "loss": 0.7365, + "step": 8704 + }, + { + "epoch": 0.7163135157375026, + "grad_norm": 1.9413658442490074, + "learning_rate": 3.931698754441584e-06, + "loss": 0.7367, + "step": 8705 + }, + { + "epoch": 0.7163958033326476, + "grad_norm": 1.834029911471151, + "learning_rate": 3.929580503076284e-06, + "loss": 0.7308, + "step": 8706 + }, + { + "epoch": 0.7164780909277927, + "grad_norm": 0.39977396776902596, + "learning_rate": 3.927462682943874e-06, + "loss": 0.4663, + "step": 8707 + }, + { + "epoch": 0.7165603785229376, + "grad_norm": 1.8106266080620803, + "learning_rate": 3.925345294194807e-06, + "loss": 0.6894, + "step": 8708 + }, + { + "epoch": 0.7166426661180827, + "grad_norm": 2.817691859548077, + "learning_rate": 3.923228336979494e-06, + "loss": 0.7074, + "step": 8709 + }, + { + "epoch": 0.7167249537132278, + "grad_norm": 2.2149447467366348, + "learning_rate": 3.921111811448323e-06, + "loss": 0.6968, + "step": 8710 + }, + { + "epoch": 0.7168072413083728, + "grad_norm": 0.4283970954339759, + "learning_rate": 3.918995717751642e-06, + "loss": 0.4693, + "step": 8711 + }, + { + "epoch": 0.7168895289035178, + "grad_norm": 2.210128268825673, + "learning_rate": 3.916880056039785e-06, + "loss": 0.7458, + "step": 8712 + }, + { + "epoch": 0.7169718164986628, + "grad_norm": 1.6484242998525183, + "learning_rate": 3.914764826463037e-06, + "loss": 0.6787, + "step": 8713 + }, + { + "epoch": 0.7170541040938079, + "grad_norm": 2.3525386666462387, + "learning_rate": 3.912650029171666e-06, + "loss": 0.6969, + "step": 8714 + }, + { + "epoch": 0.7171363916889529, + "grad_norm": 0.43626781279986865, + "learning_rate": 3.910535664315903e-06, + "loss": 0.4771, + "step": 8715 + }, + { + "epoch": 0.7172186792840979, + "grad_norm": 2.463258785817559, + "learning_rate": 3.9084217320459475e-06, + "loss": 0.7002, + "step": 8716 + }, + { + "epoch": 0.7173009668792429, + "grad_norm": 0.39806224476515745, + "learning_rate": 3.906308232511966e-06, + "loss": 0.4889, + "step": 8717 + }, + { + "epoch": 0.717383254474388, + "grad_norm": 2.206299600436389, + "learning_rate": 3.9041951658641066e-06, + "loss": 0.6942, + "step": 8718 + }, + { + "epoch": 0.7174655420695331, + "grad_norm": 1.9442021921875008, + "learning_rate": 3.902082532252469e-06, + "loss": 0.6791, + "step": 8719 + }, + { + "epoch": 0.717547829664678, + "grad_norm": 0.42020945011033, + "learning_rate": 3.899970331827141e-06, + "loss": 0.4778, + "step": 8720 + }, + { + "epoch": 0.7176301172598231, + "grad_norm": 1.8477941280503218, + "learning_rate": 3.897858564738164e-06, + "loss": 0.7322, + "step": 8721 + }, + { + "epoch": 0.7177124048549681, + "grad_norm": 2.11180341750008, + "learning_rate": 3.895747231135556e-06, + "loss": 0.7185, + "step": 8722 + }, + { + "epoch": 0.7177946924501132, + "grad_norm": 2.0735809098151963, + "learning_rate": 3.893636331169299e-06, + "loss": 0.7487, + "step": 8723 + }, + { + "epoch": 0.7178769800452581, + "grad_norm": 0.4069865780587335, + "learning_rate": 3.891525864989355e-06, + "loss": 0.4905, + "step": 8724 + }, + { + "epoch": 0.7179592676404032, + "grad_norm": 2.385908897236288, + "learning_rate": 3.889415832745641e-06, + "loss": 0.7091, + "step": 8725 + }, + { + "epoch": 0.7180415552355482, + "grad_norm": 1.7243175580651744, + "learning_rate": 3.88730623458806e-06, + "loss": 0.7146, + "step": 8726 + }, + { + "epoch": 0.7181238428306933, + "grad_norm": 0.4079412241396543, + "learning_rate": 3.885197070666462e-06, + "loss": 0.4861, + "step": 8727 + }, + { + "epoch": 0.7182061304258383, + "grad_norm": 2.8408347089937873, + "learning_rate": 3.883088341130689e-06, + "loss": 0.7407, + "step": 8728 + }, + { + "epoch": 0.7182884180209833, + "grad_norm": 0.41702283145843055, + "learning_rate": 3.8809800461305325e-06, + "loss": 0.4619, + "step": 8729 + }, + { + "epoch": 0.7183707056161284, + "grad_norm": 0.4362125331720785, + "learning_rate": 3.878872185815773e-06, + "loss": 0.4982, + "step": 8730 + }, + { + "epoch": 0.7184529932112734, + "grad_norm": 2.1863501237887633, + "learning_rate": 3.876764760336139e-06, + "loss": 0.7262, + "step": 8731 + }, + { + "epoch": 0.7185352808064185, + "grad_norm": 0.408105114069146, + "learning_rate": 3.874657769841351e-06, + "loss": 0.4741, + "step": 8732 + }, + { + "epoch": 0.7186175684015634, + "grad_norm": 2.0926145770700617, + "learning_rate": 3.872551214481073e-06, + "loss": 0.7254, + "step": 8733 + }, + { + "epoch": 0.7186998559967085, + "grad_norm": 1.960463846348349, + "learning_rate": 3.87044509440496e-06, + "loss": 0.7343, + "step": 8734 + }, + { + "epoch": 0.7187821435918536, + "grad_norm": 2.1119097735415866, + "learning_rate": 3.868339409762621e-06, + "loss": 0.7285, + "step": 8735 + }, + { + "epoch": 0.7188644311869986, + "grad_norm": 1.9734332930150678, + "learning_rate": 3.8662341607036495e-06, + "loss": 0.6873, + "step": 8736 + }, + { + "epoch": 0.7189467187821436, + "grad_norm": 2.3364487184445886, + "learning_rate": 3.864129347377592e-06, + "loss": 0.7235, + "step": 8737 + }, + { + "epoch": 0.7190290063772886, + "grad_norm": 1.9519788425509623, + "learning_rate": 3.862024969933973e-06, + "loss": 0.7316, + "step": 8738 + }, + { + "epoch": 0.7191112939724337, + "grad_norm": 2.3492179724493387, + "learning_rate": 3.859921028522281e-06, + "loss": 0.6842, + "step": 8739 + }, + { + "epoch": 0.7191935815675787, + "grad_norm": 1.8348433296733133, + "learning_rate": 3.857817523291982e-06, + "loss": 0.6995, + "step": 8740 + }, + { + "epoch": 0.7192758691627237, + "grad_norm": 1.9269844482486431, + "learning_rate": 3.855714454392501e-06, + "loss": 0.7215, + "step": 8741 + }, + { + "epoch": 0.7193581567578687, + "grad_norm": 9.167556014134869, + "learning_rate": 3.853611821973241e-06, + "loss": 0.7248, + "step": 8742 + }, + { + "epoch": 0.7194404443530138, + "grad_norm": 2.135175403998364, + "learning_rate": 3.851509626183568e-06, + "loss": 0.7329, + "step": 8743 + }, + { + "epoch": 0.7195227319481589, + "grad_norm": 2.225989828579902, + "learning_rate": 3.849407867172818e-06, + "loss": 0.7197, + "step": 8744 + }, + { + "epoch": 0.7196050195433038, + "grad_norm": 2.180212302349484, + "learning_rate": 3.847306545090294e-06, + "loss": 0.7233, + "step": 8745 + }, + { + "epoch": 0.7196873071384489, + "grad_norm": 2.3579327462654476, + "learning_rate": 3.845205660085276e-06, + "loss": 0.7123, + "step": 8746 + }, + { + "epoch": 0.7197695947335939, + "grad_norm": 2.20231787370899, + "learning_rate": 3.8431052123070015e-06, + "loss": 0.708, + "step": 8747 + }, + { + "epoch": 0.719851882328739, + "grad_norm": 1.7449618006793695, + "learning_rate": 3.8410052019046895e-06, + "loss": 0.7048, + "step": 8748 + }, + { + "epoch": 0.7199341699238839, + "grad_norm": 2.289183066552959, + "learning_rate": 3.838905629027518e-06, + "loss": 0.7308, + "step": 8749 + }, + { + "epoch": 0.720016457519029, + "grad_norm": 0.4123963748948826, + "learning_rate": 3.836806493824637e-06, + "loss": 0.5, + "step": 8750 + }, + { + "epoch": 0.720098745114174, + "grad_norm": 3.107098355301742, + "learning_rate": 3.834707796445164e-06, + "loss": 0.7142, + "step": 8751 + }, + { + "epoch": 0.7201810327093191, + "grad_norm": 1.9044119757639377, + "learning_rate": 3.8326095370381924e-06, + "loss": 0.7002, + "step": 8752 + }, + { + "epoch": 0.720263320304464, + "grad_norm": 0.4253370851011507, + "learning_rate": 3.830511715752772e-06, + "loss": 0.5118, + "step": 8753 + }, + { + "epoch": 0.7203456078996091, + "grad_norm": 2.531708860800557, + "learning_rate": 3.8284143327379355e-06, + "loss": 0.7101, + "step": 8754 + }, + { + "epoch": 0.7204278954947542, + "grad_norm": 1.839957023649343, + "learning_rate": 3.826317388142676e-06, + "loss": 0.7435, + "step": 8755 + }, + { + "epoch": 0.7205101830898992, + "grad_norm": 1.8620414109668004, + "learning_rate": 3.824220882115955e-06, + "loss": 0.7063, + "step": 8756 + }, + { + "epoch": 0.7205924706850443, + "grad_norm": 0.40257851674645223, + "learning_rate": 3.822124814806702e-06, + "loss": 0.4978, + "step": 8757 + }, + { + "epoch": 0.7206747582801892, + "grad_norm": 1.6351040695401133, + "learning_rate": 3.820029186363827e-06, + "loss": 0.7277, + "step": 8758 + }, + { + "epoch": 0.7207570458753343, + "grad_norm": 2.062422927311494, + "learning_rate": 3.817933996936192e-06, + "loss": 0.7109, + "step": 8759 + }, + { + "epoch": 0.7208393334704793, + "grad_norm": 2.4710806184067082, + "learning_rate": 3.815839246672641e-06, + "loss": 0.7042, + "step": 8760 + }, + { + "epoch": 0.7209216210656244, + "grad_norm": 1.9518686301813768, + "learning_rate": 3.81374493572198e-06, + "loss": 0.7099, + "step": 8761 + }, + { + "epoch": 0.7210039086607694, + "grad_norm": 2.0091779993114915, + "learning_rate": 3.8116510642329864e-06, + "loss": 0.7129, + "step": 8762 + }, + { + "epoch": 0.7210861962559144, + "grad_norm": 2.182698238508877, + "learning_rate": 3.8095576323544002e-06, + "loss": 0.7355, + "step": 8763 + }, + { + "epoch": 0.7211684838510595, + "grad_norm": 2.159237676396974, + "learning_rate": 3.8074646402349437e-06, + "loss": 0.7046, + "step": 8764 + }, + { + "epoch": 0.7212507714462045, + "grad_norm": 2.6362885254621258, + "learning_rate": 3.8053720880232913e-06, + "loss": 0.7271, + "step": 8765 + }, + { + "epoch": 0.7213330590413495, + "grad_norm": 2.3994367423015843, + "learning_rate": 3.803279975868103e-06, + "loss": 0.7401, + "step": 8766 + }, + { + "epoch": 0.7214153466364945, + "grad_norm": 2.175164983490028, + "learning_rate": 3.801188303917994e-06, + "loss": 0.7212, + "step": 8767 + }, + { + "epoch": 0.7214976342316396, + "grad_norm": 2.1713410644757247, + "learning_rate": 3.799097072321556e-06, + "loss": 0.7104, + "step": 8768 + }, + { + "epoch": 0.7215799218267847, + "grad_norm": 3.8955390223307838, + "learning_rate": 3.797006281227341e-06, + "loss": 0.7229, + "step": 8769 + }, + { + "epoch": 0.7216622094219296, + "grad_norm": 2.0301256770896146, + "learning_rate": 3.7949159307838823e-06, + "loss": 0.7251, + "step": 8770 + }, + { + "epoch": 0.7217444970170747, + "grad_norm": 0.42254711964974867, + "learning_rate": 3.7928260211396696e-06, + "loss": 0.467, + "step": 8771 + }, + { + "epoch": 0.7218267846122197, + "grad_norm": 2.6826528294797756, + "learning_rate": 3.7907365524431727e-06, + "loss": 0.6994, + "step": 8772 + }, + { + "epoch": 0.7219090722073648, + "grad_norm": 1.9815700598161157, + "learning_rate": 3.788647524842821e-06, + "loss": 0.7159, + "step": 8773 + }, + { + "epoch": 0.7219913598025097, + "grad_norm": 2.2781899103542065, + "learning_rate": 3.786558938487015e-06, + "loss": 0.7381, + "step": 8774 + }, + { + "epoch": 0.7220736473976548, + "grad_norm": 2.0037203070148633, + "learning_rate": 3.784470793524122e-06, + "loss": 0.7291, + "step": 8775 + }, + { + "epoch": 0.7221559349927998, + "grad_norm": 2.6134579265529485, + "learning_rate": 3.782383090102487e-06, + "loss": 0.7195, + "step": 8776 + }, + { + "epoch": 0.7222382225879449, + "grad_norm": 2.1766277669067406, + "learning_rate": 3.7802958283704106e-06, + "loss": 0.7264, + "step": 8777 + }, + { + "epoch": 0.7223205101830898, + "grad_norm": 1.9991315868435762, + "learning_rate": 3.7782090084761746e-06, + "loss": 0.7369, + "step": 8778 + }, + { + "epoch": 0.7224027977782349, + "grad_norm": 2.2453451037430843, + "learning_rate": 3.776122630568021e-06, + "loss": 0.7344, + "step": 8779 + }, + { + "epoch": 0.72248508537338, + "grad_norm": 2.161400860963148, + "learning_rate": 3.774036694794161e-06, + "loss": 0.7125, + "step": 8780 + }, + { + "epoch": 0.722567372968525, + "grad_norm": 1.867647479688836, + "learning_rate": 3.771951201302775e-06, + "loss": 0.7219, + "step": 8781 + }, + { + "epoch": 0.7226496605636701, + "grad_norm": 1.8522964136018596, + "learning_rate": 3.7698661502420176e-06, + "loss": 0.7115, + "step": 8782 + }, + { + "epoch": 0.722731948158815, + "grad_norm": 2.223025870186525, + "learning_rate": 3.767781541760003e-06, + "loss": 0.7162, + "step": 8783 + }, + { + "epoch": 0.7228142357539601, + "grad_norm": 2.0887528878082624, + "learning_rate": 3.765697376004823e-06, + "loss": 0.7173, + "step": 8784 + }, + { + "epoch": 0.7228965233491051, + "grad_norm": 3.4480822536229345, + "learning_rate": 3.7636136531245313e-06, + "loss": 0.7044, + "step": 8785 + }, + { + "epoch": 0.7229788109442502, + "grad_norm": 2.1143345232600392, + "learning_rate": 3.7615303732671528e-06, + "loss": 0.7327, + "step": 8786 + }, + { + "epoch": 0.7230610985393952, + "grad_norm": 2.399065465045948, + "learning_rate": 3.759447536580676e-06, + "loss": 0.7245, + "step": 8787 + }, + { + "epoch": 0.7231433861345402, + "grad_norm": 1.915540986490959, + "learning_rate": 3.7573651432130687e-06, + "loss": 0.7126, + "step": 8788 + }, + { + "epoch": 0.7232256737296853, + "grad_norm": 1.779064979145445, + "learning_rate": 3.755283193312256e-06, + "loss": 0.6884, + "step": 8789 + }, + { + "epoch": 0.7233079613248303, + "grad_norm": 2.702926190793012, + "learning_rate": 3.75320168702614e-06, + "loss": 0.7305, + "step": 8790 + }, + { + "epoch": 0.7233902489199753, + "grad_norm": 1.8819955401609725, + "learning_rate": 3.751120624502587e-06, + "loss": 0.7072, + "step": 8791 + }, + { + "epoch": 0.7234725365151203, + "grad_norm": 2.4449889724187175, + "learning_rate": 3.7490400058894305e-06, + "loss": 0.738, + "step": 8792 + }, + { + "epoch": 0.7235548241102654, + "grad_norm": 1.9486008943875188, + "learning_rate": 3.7469598313344725e-06, + "loss": 0.7114, + "step": 8793 + }, + { + "epoch": 0.7236371117054105, + "grad_norm": 1.7016758174052318, + "learning_rate": 3.744880100985491e-06, + "loss": 0.706, + "step": 8794 + }, + { + "epoch": 0.7237193993005554, + "grad_norm": 2.686094618736218, + "learning_rate": 3.7428008149902194e-06, + "loss": 0.7092, + "step": 8795 + }, + { + "epoch": 0.7238016868957005, + "grad_norm": 2.326370813462399, + "learning_rate": 3.7407219734963783e-06, + "loss": 0.7501, + "step": 8796 + }, + { + "epoch": 0.7238839744908455, + "grad_norm": 0.421596282498221, + "learning_rate": 3.7386435766516315e-06, + "loss": 0.4738, + "step": 8797 + }, + { + "epoch": 0.7239662620859906, + "grad_norm": 3.3107868752156544, + "learning_rate": 3.736565624603634e-06, + "loss": 0.722, + "step": 8798 + }, + { + "epoch": 0.7240485496811355, + "grad_norm": 2.343323087648118, + "learning_rate": 3.7344881174999946e-06, + "loss": 0.6937, + "step": 8799 + }, + { + "epoch": 0.7241308372762806, + "grad_norm": 3.0585396379895093, + "learning_rate": 3.732411055488303e-06, + "loss": 0.6889, + "step": 8800 + }, + { + "epoch": 0.7242131248714256, + "grad_norm": 2.38001389984029, + "learning_rate": 3.7303344387161023e-06, + "loss": 0.7112, + "step": 8801 + }, + { + "epoch": 0.7242954124665707, + "grad_norm": 0.41539036589042977, + "learning_rate": 3.728258267330924e-06, + "loss": 0.4845, + "step": 8802 + }, + { + "epoch": 0.7243777000617156, + "grad_norm": 2.77808490940728, + "learning_rate": 3.726182541480241e-06, + "loss": 0.7012, + "step": 8803 + }, + { + "epoch": 0.7244599876568607, + "grad_norm": 0.404736522608996, + "learning_rate": 3.7241072613115205e-06, + "loss": 0.4626, + "step": 8804 + }, + { + "epoch": 0.7245422752520058, + "grad_norm": 0.41168023681446764, + "learning_rate": 3.7220324269721785e-06, + "loss": 0.4524, + "step": 8805 + }, + { + "epoch": 0.7246245628471508, + "grad_norm": 2.763864362455816, + "learning_rate": 3.719958038609618e-06, + "loss": 0.7024, + "step": 8806 + }, + { + "epoch": 0.7247068504422958, + "grad_norm": 1.8750614272144093, + "learning_rate": 3.717884096371194e-06, + "loss": 0.729, + "step": 8807 + }, + { + "epoch": 0.7247891380374408, + "grad_norm": 0.4082099437769587, + "learning_rate": 3.715810600404237e-06, + "loss": 0.4782, + "step": 8808 + }, + { + "epoch": 0.7248714256325859, + "grad_norm": 2.238267341087181, + "learning_rate": 3.7137375508560413e-06, + "loss": 0.717, + "step": 8809 + }, + { + "epoch": 0.724953713227731, + "grad_norm": 1.917640942209912, + "learning_rate": 3.711664947873881e-06, + "loss": 0.717, + "step": 8810 + }, + { + "epoch": 0.725036000822876, + "grad_norm": 3.592960600379022, + "learning_rate": 3.7095927916049812e-06, + "loss": 0.7346, + "step": 8811 + }, + { + "epoch": 0.725118288418021, + "grad_norm": 0.4035893816105096, + "learning_rate": 3.7075210821965545e-06, + "loss": 0.4621, + "step": 8812 + }, + { + "epoch": 0.725200576013166, + "grad_norm": 2.2878631097586783, + "learning_rate": 3.7054498197957655e-06, + "loss": 0.7168, + "step": 8813 + }, + { + "epoch": 0.7252828636083111, + "grad_norm": 2.0404638001827546, + "learning_rate": 3.7033790045497554e-06, + "loss": 0.7228, + "step": 8814 + }, + { + "epoch": 0.7253651512034561, + "grad_norm": 1.9554911841018168, + "learning_rate": 3.7013086366056273e-06, + "loss": 0.7363, + "step": 8815 + }, + { + "epoch": 0.7254474387986011, + "grad_norm": 1.9191754596039734, + "learning_rate": 3.6992387161104636e-06, + "loss": 0.7168, + "step": 8816 + }, + { + "epoch": 0.7255297263937461, + "grad_norm": 0.4025521965461186, + "learning_rate": 3.6971692432113015e-06, + "loss": 0.4642, + "step": 8817 + }, + { + "epoch": 0.7256120139888912, + "grad_norm": 0.41073803720128693, + "learning_rate": 3.6951002180551598e-06, + "loss": 0.4696, + "step": 8818 + }, + { + "epoch": 0.7256943015840362, + "grad_norm": 1.986585710796419, + "learning_rate": 3.6930316407890167e-06, + "loss": 0.7401, + "step": 8819 + }, + { + "epoch": 0.7257765891791812, + "grad_norm": 2.494589303074705, + "learning_rate": 3.6909635115598174e-06, + "loss": 0.7177, + "step": 8820 + }, + { + "epoch": 0.7258588767743263, + "grad_norm": 1.9339581340377723, + "learning_rate": 3.6888958305144784e-06, + "loss": 0.7191, + "step": 8821 + }, + { + "epoch": 0.7259411643694713, + "grad_norm": 2.1717658091130554, + "learning_rate": 3.6868285977998887e-06, + "loss": 0.7452, + "step": 8822 + }, + { + "epoch": 0.7260234519646164, + "grad_norm": 1.8345693085026782, + "learning_rate": 3.684761813562897e-06, + "loss": 0.7275, + "step": 8823 + }, + { + "epoch": 0.7261057395597613, + "grad_norm": 3.799574016904524, + "learning_rate": 3.682695477950329e-06, + "loss": 0.7374, + "step": 8824 + }, + { + "epoch": 0.7261880271549064, + "grad_norm": 2.0237823505379504, + "learning_rate": 3.6806295911089706e-06, + "loss": 0.7094, + "step": 8825 + }, + { + "epoch": 0.7262703147500514, + "grad_norm": 0.4180210002675001, + "learning_rate": 3.678564153185581e-06, + "loss": 0.4722, + "step": 8826 + }, + { + "epoch": 0.7263526023451965, + "grad_norm": 1.8537240829429487, + "learning_rate": 3.67649916432688e-06, + "loss": 0.6878, + "step": 8827 + }, + { + "epoch": 0.7264348899403414, + "grad_norm": 2.052193130533363, + "learning_rate": 3.674434624679569e-06, + "loss": 0.739, + "step": 8828 + }, + { + "epoch": 0.7265171775354865, + "grad_norm": 2.355503838993796, + "learning_rate": 3.672370534390303e-06, + "loss": 0.723, + "step": 8829 + }, + { + "epoch": 0.7265994651306316, + "grad_norm": 1.8992703834283056, + "learning_rate": 3.6703068936057172e-06, + "loss": 0.7461, + "step": 8830 + }, + { + "epoch": 0.7266817527257766, + "grad_norm": 1.902532074169642, + "learning_rate": 3.668243702472407e-06, + "loss": 0.716, + "step": 8831 + }, + { + "epoch": 0.7267640403209216, + "grad_norm": 2.10508408288086, + "learning_rate": 3.6661809611369394e-06, + "loss": 0.6978, + "step": 8832 + }, + { + "epoch": 0.7268463279160666, + "grad_norm": 1.762164629549705, + "learning_rate": 3.664118669745842e-06, + "loss": 0.7318, + "step": 8833 + }, + { + "epoch": 0.7269286155112117, + "grad_norm": 1.739569324564914, + "learning_rate": 3.662056828445625e-06, + "loss": 0.6946, + "step": 8834 + }, + { + "epoch": 0.7270109031063567, + "grad_norm": 2.596000124051889, + "learning_rate": 3.6599954373827517e-06, + "loss": 0.7023, + "step": 8835 + }, + { + "epoch": 0.7270931907015018, + "grad_norm": 6.045018584801388, + "learning_rate": 3.6579344967036666e-06, + "loss": 0.7356, + "step": 8836 + }, + { + "epoch": 0.7271754782966467, + "grad_norm": 2.038378417470202, + "learning_rate": 3.6558740065547727e-06, + "loss": 0.7387, + "step": 8837 + }, + { + "epoch": 0.7272577658917918, + "grad_norm": 5.729080012502652, + "learning_rate": 3.6538139670824434e-06, + "loss": 0.7212, + "step": 8838 + }, + { + "epoch": 0.7273400534869369, + "grad_norm": 1.7496244258411044, + "learning_rate": 3.6517543784330166e-06, + "loss": 0.7192, + "step": 8839 + }, + { + "epoch": 0.7274223410820819, + "grad_norm": 2.166660060655116, + "learning_rate": 3.6496952407528108e-06, + "loss": 0.7252, + "step": 8840 + }, + { + "epoch": 0.7275046286772269, + "grad_norm": 1.9176576909308383, + "learning_rate": 3.647636554188094e-06, + "loss": 0.7228, + "step": 8841 + }, + { + "epoch": 0.7275869162723719, + "grad_norm": 0.4230978563735771, + "learning_rate": 3.6455783188851225e-06, + "loss": 0.518, + "step": 8842 + }, + { + "epoch": 0.727669203867517, + "grad_norm": 2.155396758061834, + "learning_rate": 3.6435205349901037e-06, + "loss": 0.7135, + "step": 8843 + }, + { + "epoch": 0.727751491462662, + "grad_norm": 0.42995379618425766, + "learning_rate": 3.6414632026492214e-06, + "loss": 0.4865, + "step": 8844 + }, + { + "epoch": 0.727833779057807, + "grad_norm": 2.6529137770609985, + "learning_rate": 3.6394063220086208e-06, + "loss": 0.7308, + "step": 8845 + }, + { + "epoch": 0.727916066652952, + "grad_norm": 2.1795567190651175, + "learning_rate": 3.6373498932144246e-06, + "loss": 0.7255, + "step": 8846 + }, + { + "epoch": 0.7279983542480971, + "grad_norm": 2.542919497117812, + "learning_rate": 3.6352939164127144e-06, + "loss": 0.7609, + "step": 8847 + }, + { + "epoch": 0.7280806418432422, + "grad_norm": 2.251424666488088, + "learning_rate": 3.6332383917495486e-06, + "loss": 0.7218, + "step": 8848 + }, + { + "epoch": 0.7281629294383871, + "grad_norm": 1.8656365775653938, + "learning_rate": 3.6311833193709456e-06, + "loss": 0.7066, + "step": 8849 + }, + { + "epoch": 0.7282452170335322, + "grad_norm": 1.936746113184277, + "learning_rate": 3.6291286994228936e-06, + "loss": 0.7287, + "step": 8850 + }, + { + "epoch": 0.7283275046286772, + "grad_norm": 3.752051785015716, + "learning_rate": 3.627074532051348e-06, + "loss": 0.6901, + "step": 8851 + }, + { + "epoch": 0.7284097922238223, + "grad_norm": 1.904517139188235, + "learning_rate": 3.6250208174022374e-06, + "loss": 0.7553, + "step": 8852 + }, + { + "epoch": 0.7284920798189672, + "grad_norm": 0.40358492305815136, + "learning_rate": 3.6229675556214495e-06, + "loss": 0.4599, + "step": 8853 + }, + { + "epoch": 0.7285743674141123, + "grad_norm": 1.84143508195038, + "learning_rate": 3.620914746854852e-06, + "loss": 0.7264, + "step": 8854 + }, + { + "epoch": 0.7286566550092574, + "grad_norm": 1.822855655548514, + "learning_rate": 3.618862391248269e-06, + "loss": 0.7459, + "step": 8855 + }, + { + "epoch": 0.7287389426044024, + "grad_norm": 2.1690723877641074, + "learning_rate": 3.6168104889474963e-06, + "loss": 0.7228, + "step": 8856 + }, + { + "epoch": 0.7288212301995474, + "grad_norm": 2.0226914795597226, + "learning_rate": 3.6147590400982945e-06, + "loss": 0.7126, + "step": 8857 + }, + { + "epoch": 0.7289035177946924, + "grad_norm": 0.41317503632238367, + "learning_rate": 3.6127080448464024e-06, + "loss": 0.4792, + "step": 8858 + }, + { + "epoch": 0.7289858053898375, + "grad_norm": 1.8768374426727032, + "learning_rate": 3.6106575033375123e-06, + "loss": 0.7209, + "step": 8859 + }, + { + "epoch": 0.7290680929849825, + "grad_norm": 1.6900864637569013, + "learning_rate": 3.608607415717299e-06, + "loss": 0.7113, + "step": 8860 + }, + { + "epoch": 0.7291503805801276, + "grad_norm": 1.9715972450877828, + "learning_rate": 3.6065577821313925e-06, + "loss": 0.7051, + "step": 8861 + }, + { + "epoch": 0.7292326681752725, + "grad_norm": 2.120727855963964, + "learning_rate": 3.6045086027253974e-06, + "loss": 0.7279, + "step": 8862 + }, + { + "epoch": 0.7293149557704176, + "grad_norm": 2.7857856291320333, + "learning_rate": 3.6024598776448782e-06, + "loss": 0.7126, + "step": 8863 + }, + { + "epoch": 0.7293972433655627, + "grad_norm": 2.1000936980787843, + "learning_rate": 3.6004116070353824e-06, + "loss": 0.7452, + "step": 8864 + }, + { + "epoch": 0.7294795309607077, + "grad_norm": 1.7415206142523918, + "learning_rate": 3.5983637910424073e-06, + "loss": 0.681, + "step": 8865 + }, + { + "epoch": 0.7295618185558527, + "grad_norm": 1.5984260404463457, + "learning_rate": 3.5963164298114385e-06, + "loss": 0.6943, + "step": 8866 + }, + { + "epoch": 0.7296441061509977, + "grad_norm": 2.0343280252535187, + "learning_rate": 3.594269523487902e-06, + "loss": 0.7301, + "step": 8867 + }, + { + "epoch": 0.7297263937461428, + "grad_norm": 1.9971272510083524, + "learning_rate": 3.5922230722172135e-06, + "loss": 0.6807, + "step": 8868 + }, + { + "epoch": 0.7298086813412878, + "grad_norm": 1.974312238624741, + "learning_rate": 3.590177076144754e-06, + "loss": 0.6988, + "step": 8869 + }, + { + "epoch": 0.7298909689364328, + "grad_norm": 2.1605063230899435, + "learning_rate": 3.5881315354158607e-06, + "loss": 0.7095, + "step": 8870 + }, + { + "epoch": 0.7299732565315779, + "grad_norm": 0.43551534423435595, + "learning_rate": 3.586086450175853e-06, + "loss": 0.48, + "step": 8871 + }, + { + "epoch": 0.7300555441267229, + "grad_norm": 1.9835369994109306, + "learning_rate": 3.5840418205700057e-06, + "loss": 0.7338, + "step": 8872 + }, + { + "epoch": 0.730137831721868, + "grad_norm": 1.958515869600213, + "learning_rate": 3.5819976467435657e-06, + "loss": 0.7271, + "step": 8873 + }, + { + "epoch": 0.7302201193170129, + "grad_norm": 2.130787043992138, + "learning_rate": 3.5799539288417464e-06, + "loss": 0.7275, + "step": 8874 + }, + { + "epoch": 0.730302406912158, + "grad_norm": 1.9117714028803932, + "learning_rate": 3.577910667009736e-06, + "loss": 0.7174, + "step": 8875 + }, + { + "epoch": 0.730384694507303, + "grad_norm": 2.102440796101792, + "learning_rate": 3.575867861392678e-06, + "loss": 0.7401, + "step": 8876 + }, + { + "epoch": 0.7304669821024481, + "grad_norm": 2.3246167207169273, + "learning_rate": 3.5738255121357e-06, + "loss": 0.6884, + "step": 8877 + }, + { + "epoch": 0.730549269697593, + "grad_norm": 2.0476405318182205, + "learning_rate": 3.571783619383873e-06, + "loss": 0.7058, + "step": 8878 + }, + { + "epoch": 0.7306315572927381, + "grad_norm": 2.0535318054692016, + "learning_rate": 3.569742183282261e-06, + "loss": 0.7173, + "step": 8879 + }, + { + "epoch": 0.7307138448878832, + "grad_norm": 0.4100181247246475, + "learning_rate": 3.5677012039758772e-06, + "loss": 0.4683, + "step": 8880 + }, + { + "epoch": 0.7307961324830282, + "grad_norm": 1.8988252353724988, + "learning_rate": 3.565660681609717e-06, + "loss": 0.7087, + "step": 8881 + }, + { + "epoch": 0.7308784200781732, + "grad_norm": 2.606978946990878, + "learning_rate": 3.563620616328728e-06, + "loss": 0.7172, + "step": 8882 + }, + { + "epoch": 0.7309607076733182, + "grad_norm": 1.9559883131878883, + "learning_rate": 3.5615810082778434e-06, + "loss": 0.7227, + "step": 8883 + }, + { + "epoch": 0.7310429952684633, + "grad_norm": 1.7557374627873856, + "learning_rate": 3.5595418576019402e-06, + "loss": 0.72, + "step": 8884 + }, + { + "epoch": 0.7311252828636083, + "grad_norm": 2.3828092056135857, + "learning_rate": 3.557503164445887e-06, + "loss": 0.7259, + "step": 8885 + }, + { + "epoch": 0.7312075704587534, + "grad_norm": 1.9020432552303899, + "learning_rate": 3.5554649289545027e-06, + "loss": 0.6861, + "step": 8886 + }, + { + "epoch": 0.7312898580538983, + "grad_norm": 4.027380286106868, + "learning_rate": 3.553427151272587e-06, + "loss": 0.6944, + "step": 8887 + }, + { + "epoch": 0.7313721456490434, + "grad_norm": 1.892374867312961, + "learning_rate": 3.551389831544896e-06, + "loss": 0.7295, + "step": 8888 + }, + { + "epoch": 0.7314544332441885, + "grad_norm": 0.40328882049990106, + "learning_rate": 3.5493529699161587e-06, + "loss": 0.4674, + "step": 8889 + }, + { + "epoch": 0.7315367208393335, + "grad_norm": 0.39658788500065584, + "learning_rate": 3.5473165665310673e-06, + "loss": 0.4744, + "step": 8890 + }, + { + "epoch": 0.7316190084344785, + "grad_norm": 2.1220467995865735, + "learning_rate": 3.5452806215342894e-06, + "loss": 0.7152, + "step": 8891 + }, + { + "epoch": 0.7317012960296235, + "grad_norm": 1.5794139358371262, + "learning_rate": 3.5432451350704513e-06, + "loss": 0.698, + "step": 8892 + }, + { + "epoch": 0.7317835836247686, + "grad_norm": 0.4102021055205153, + "learning_rate": 3.5412101072841554e-06, + "loss": 0.4841, + "step": 8893 + }, + { + "epoch": 0.7318658712199136, + "grad_norm": 0.4367892108811795, + "learning_rate": 3.5391755383199645e-06, + "loss": 0.461, + "step": 8894 + }, + { + "epoch": 0.7319481588150586, + "grad_norm": 1.9903499823344526, + "learning_rate": 3.5371414283224103e-06, + "loss": 0.7325, + "step": 8895 + }, + { + "epoch": 0.7320304464102037, + "grad_norm": 1.7947063937892966, + "learning_rate": 3.535107777435991e-06, + "loss": 0.6909, + "step": 8896 + }, + { + "epoch": 0.7321127340053487, + "grad_norm": 1.7994483934851988, + "learning_rate": 3.5330745858051786e-06, + "loss": 0.7549, + "step": 8897 + }, + { + "epoch": 0.7321950216004938, + "grad_norm": 1.4838956873154334, + "learning_rate": 3.5310418535744017e-06, + "loss": 0.7466, + "step": 8898 + }, + { + "epoch": 0.7322773091956387, + "grad_norm": 2.654908824354985, + "learning_rate": 3.5290095808880696e-06, + "loss": 0.7223, + "step": 8899 + }, + { + "epoch": 0.7323595967907838, + "grad_norm": 2.0859175131734777, + "learning_rate": 3.526977767890548e-06, + "loss": 0.6901, + "step": 8900 + }, + { + "epoch": 0.7324418843859288, + "grad_norm": 2.2588898312422083, + "learning_rate": 3.524946414726175e-06, + "loss": 0.7176, + "step": 8901 + }, + { + "epoch": 0.7325241719810739, + "grad_norm": 2.7739437958550166, + "learning_rate": 3.5229155215392483e-06, + "loss": 0.7376, + "step": 8902 + }, + { + "epoch": 0.7326064595762188, + "grad_norm": 2.1413902231658657, + "learning_rate": 3.5208850884740485e-06, + "loss": 0.7373, + "step": 8903 + }, + { + "epoch": 0.7326887471713639, + "grad_norm": 1.7374877546012604, + "learning_rate": 3.5188551156748064e-06, + "loss": 0.7378, + "step": 8904 + }, + { + "epoch": 0.732771034766509, + "grad_norm": 1.7642720062796504, + "learning_rate": 3.516825603285736e-06, + "loss": 0.7186, + "step": 8905 + }, + { + "epoch": 0.732853322361654, + "grad_norm": 1.8374061728212798, + "learning_rate": 3.5147965514510053e-06, + "loss": 0.7204, + "step": 8906 + }, + { + "epoch": 0.732935609956799, + "grad_norm": 1.5653499309860353, + "learning_rate": 3.512767960314757e-06, + "loss": 0.713, + "step": 8907 + }, + { + "epoch": 0.733017897551944, + "grad_norm": 1.7159660992020411, + "learning_rate": 3.5107398300210936e-06, + "loss": 0.721, + "step": 8908 + }, + { + "epoch": 0.7331001851470891, + "grad_norm": 0.42779174764207134, + "learning_rate": 3.5087121607140996e-06, + "loss": 0.5005, + "step": 8909 + }, + { + "epoch": 0.7331824727422341, + "grad_norm": 1.9723044268065084, + "learning_rate": 3.5066849525378078e-06, + "loss": 0.7267, + "step": 8910 + }, + { + "epoch": 0.7332647603373792, + "grad_norm": 0.40942210771181925, + "learning_rate": 3.5046582056362365e-06, + "loss": 0.4649, + "step": 8911 + }, + { + "epoch": 0.7333470479325241, + "grad_norm": 0.41259879383540393, + "learning_rate": 3.5026319201533577e-06, + "loss": 0.4603, + "step": 8912 + }, + { + "epoch": 0.7334293355276692, + "grad_norm": 2.235104119341903, + "learning_rate": 3.5006060962331167e-06, + "loss": 0.7192, + "step": 8913 + }, + { + "epoch": 0.7335116231228143, + "grad_norm": 2.395214670740568, + "learning_rate": 3.4985807340194212e-06, + "loss": 0.7433, + "step": 8914 + }, + { + "epoch": 0.7335939107179593, + "grad_norm": 1.6297850614660103, + "learning_rate": 3.4965558336561557e-06, + "loss": 0.7114, + "step": 8915 + }, + { + "epoch": 0.7336761983131043, + "grad_norm": 2.9663445777513884, + "learning_rate": 3.4945313952871605e-06, + "loss": 0.7291, + "step": 8916 + }, + { + "epoch": 0.7337584859082493, + "grad_norm": 1.9082590290556825, + "learning_rate": 3.4925074190562535e-06, + "loss": 0.7132, + "step": 8917 + }, + { + "epoch": 0.7338407735033944, + "grad_norm": 1.9049527356830427, + "learning_rate": 3.490483905107214e-06, + "loss": 0.7065, + "step": 8918 + }, + { + "epoch": 0.7339230610985394, + "grad_norm": 2.2147828439161223, + "learning_rate": 3.4884608535837848e-06, + "loss": 0.6995, + "step": 8919 + }, + { + "epoch": 0.7340053486936844, + "grad_norm": 1.7740220406863405, + "learning_rate": 3.4864382646296814e-06, + "loss": 0.7014, + "step": 8920 + }, + { + "epoch": 0.7340876362888294, + "grad_norm": 2.0350926812418777, + "learning_rate": 3.48441613838859e-06, + "loss": 0.7219, + "step": 8921 + }, + { + "epoch": 0.7341699238839745, + "grad_norm": 0.4216205870687056, + "learning_rate": 3.4823944750041516e-06, + "loss": 0.4494, + "step": 8922 + }, + { + "epoch": 0.7342522114791196, + "grad_norm": 1.8139330136076037, + "learning_rate": 3.4803732746199904e-06, + "loss": 0.6662, + "step": 8923 + }, + { + "epoch": 0.7343344990742645, + "grad_norm": 2.0576215860241263, + "learning_rate": 3.4783525373796856e-06, + "loss": 0.736, + "step": 8924 + }, + { + "epoch": 0.7344167866694096, + "grad_norm": 1.8889330563305253, + "learning_rate": 3.476332263426786e-06, + "loss": 0.7215, + "step": 8925 + }, + { + "epoch": 0.7344990742645546, + "grad_norm": 2.2630743709548296, + "learning_rate": 3.4743124529048067e-06, + "loss": 0.7014, + "step": 8926 + }, + { + "epoch": 0.7345813618596997, + "grad_norm": 3.161386285220962, + "learning_rate": 3.4722931059572375e-06, + "loss": 0.7448, + "step": 8927 + }, + { + "epoch": 0.7346636494548446, + "grad_norm": 2.337683813363229, + "learning_rate": 3.470274222727522e-06, + "loss": 0.7109, + "step": 8928 + }, + { + "epoch": 0.7347459370499897, + "grad_norm": 0.4319192019296773, + "learning_rate": 3.4682558033590875e-06, + "loss": 0.5203, + "step": 8929 + }, + { + "epoch": 0.7348282246451348, + "grad_norm": 2.041895178209283, + "learning_rate": 3.466237847995315e-06, + "loss": 0.72, + "step": 8930 + }, + { + "epoch": 0.7349105122402798, + "grad_norm": 2.554678908936084, + "learning_rate": 3.464220356779556e-06, + "loss": 0.7485, + "step": 8931 + }, + { + "epoch": 0.7349927998354248, + "grad_norm": 2.1240810520754856, + "learning_rate": 3.4622033298551274e-06, + "loss": 0.7365, + "step": 8932 + }, + { + "epoch": 0.7350750874305698, + "grad_norm": 1.7535679846008987, + "learning_rate": 3.460186767365321e-06, + "loss": 0.7104, + "step": 8933 + }, + { + "epoch": 0.7351573750257149, + "grad_norm": 2.8178913936977685, + "learning_rate": 3.458170669453386e-06, + "loss": 0.7521, + "step": 8934 + }, + { + "epoch": 0.7352396626208599, + "grad_norm": 1.9825611015490983, + "learning_rate": 3.4561550362625463e-06, + "loss": 0.6986, + "step": 8935 + }, + { + "epoch": 0.7353219502160049, + "grad_norm": 1.731172899609003, + "learning_rate": 3.4541398679359883e-06, + "loss": 0.7442, + "step": 8936 + }, + { + "epoch": 0.7354042378111499, + "grad_norm": 1.9807119788595047, + "learning_rate": 3.452125164616865e-06, + "loss": 0.7125, + "step": 8937 + }, + { + "epoch": 0.735486525406295, + "grad_norm": 1.7054689754042252, + "learning_rate": 3.450110926448296e-06, + "loss": 0.7226, + "step": 8938 + }, + { + "epoch": 0.7355688130014401, + "grad_norm": 3.1110608919387674, + "learning_rate": 3.448097153573374e-06, + "loss": 0.7194, + "step": 8939 + }, + { + "epoch": 0.7356511005965851, + "grad_norm": 1.8968522217340296, + "learning_rate": 3.4460838461351496e-06, + "loss": 0.7333, + "step": 8940 + }, + { + "epoch": 0.7357333881917301, + "grad_norm": 1.7315746240647703, + "learning_rate": 3.44407100427665e-06, + "loss": 0.724, + "step": 8941 + }, + { + "epoch": 0.7358156757868751, + "grad_norm": 0.40079326476294935, + "learning_rate": 3.442058628140862e-06, + "loss": 0.4838, + "step": 8942 + }, + { + "epoch": 0.7358979633820202, + "grad_norm": 1.9152062630149294, + "learning_rate": 3.440046717870741e-06, + "loss": 0.7084, + "step": 8943 + }, + { + "epoch": 0.7359802509771652, + "grad_norm": 1.68816988523511, + "learning_rate": 3.438035273609206e-06, + "loss": 0.7113, + "step": 8944 + }, + { + "epoch": 0.7360625385723102, + "grad_norm": 3.9150204547792486, + "learning_rate": 3.436024295499155e-06, + "loss": 0.691, + "step": 8945 + }, + { + "epoch": 0.7361448261674552, + "grad_norm": 0.41781604403724387, + "learning_rate": 3.4340137836834376e-06, + "loss": 0.4949, + "step": 8946 + }, + { + "epoch": 0.7362271137626003, + "grad_norm": 2.639598207446852, + "learning_rate": 3.4320037383048867e-06, + "loss": 0.718, + "step": 8947 + }, + { + "epoch": 0.7363094013577454, + "grad_norm": 0.4230346254816236, + "learning_rate": 3.429994159506279e-06, + "loss": 0.4976, + "step": 8948 + }, + { + "epoch": 0.7363916889528903, + "grad_norm": 2.1665709568320017, + "learning_rate": 3.4279850474303822e-06, + "loss": 0.7326, + "step": 8949 + }, + { + "epoch": 0.7364739765480354, + "grad_norm": 2.8836616030922135, + "learning_rate": 3.425976402219915e-06, + "loss": 0.7361, + "step": 8950 + }, + { + "epoch": 0.7365562641431804, + "grad_norm": 2.2382592818430096, + "learning_rate": 3.423968224017573e-06, + "loss": 0.7357, + "step": 8951 + }, + { + "epoch": 0.7366385517383255, + "grad_norm": 1.851270228786091, + "learning_rate": 3.4219605129660084e-06, + "loss": 0.7238, + "step": 8952 + }, + { + "epoch": 0.7367208393334704, + "grad_norm": 2.040991814357207, + "learning_rate": 3.419953269207855e-06, + "loss": 0.7054, + "step": 8953 + }, + { + "epoch": 0.7368031269286155, + "grad_norm": 1.7167778356513148, + "learning_rate": 3.4179464928856932e-06, + "loss": 0.7107, + "step": 8954 + }, + { + "epoch": 0.7368854145237606, + "grad_norm": 1.7788031819859693, + "learning_rate": 3.415940184142088e-06, + "loss": 0.7228, + "step": 8955 + }, + { + "epoch": 0.7369677021189056, + "grad_norm": 1.7472520542169194, + "learning_rate": 3.4139343431195593e-06, + "loss": 0.7173, + "step": 8956 + }, + { + "epoch": 0.7370499897140506, + "grad_norm": 1.6962664001270982, + "learning_rate": 3.4119289699606064e-06, + "loss": 0.705, + "step": 8957 + }, + { + "epoch": 0.7371322773091956, + "grad_norm": 2.2033324005104222, + "learning_rate": 3.4099240648076813e-06, + "loss": 0.7297, + "step": 8958 + }, + { + "epoch": 0.7372145649043407, + "grad_norm": 1.492594724055178, + "learning_rate": 3.4079196278032136e-06, + "loss": 0.7082, + "step": 8959 + }, + { + "epoch": 0.7372968524994857, + "grad_norm": 2.0435036951941123, + "learning_rate": 3.405915659089588e-06, + "loss": 0.7297, + "step": 8960 + }, + { + "epoch": 0.7373791400946307, + "grad_norm": 0.41558661828865556, + "learning_rate": 3.4039121588091727e-06, + "loss": 0.4578, + "step": 8961 + }, + { + "epoch": 0.7374614276897757, + "grad_norm": 1.9617123785571935, + "learning_rate": 3.401909127104286e-06, + "loss": 0.7354, + "step": 8962 + }, + { + "epoch": 0.7375437152849208, + "grad_norm": 2.0385766463432047, + "learning_rate": 3.3999065641172246e-06, + "loss": 0.7618, + "step": 8963 + }, + { + "epoch": 0.7376260028800659, + "grad_norm": 3.7284559835111626, + "learning_rate": 3.3979044699902464e-06, + "loss": 0.7156, + "step": 8964 + }, + { + "epoch": 0.7377082904752109, + "grad_norm": 1.9382238348325116, + "learning_rate": 3.3959028448655763e-06, + "loss": 0.7396, + "step": 8965 + }, + { + "epoch": 0.7377905780703559, + "grad_norm": 1.8503695805032725, + "learning_rate": 3.3939016888854027e-06, + "loss": 0.7358, + "step": 8966 + }, + { + "epoch": 0.7378728656655009, + "grad_norm": 2.2857322824410637, + "learning_rate": 3.391901002191892e-06, + "loss": 0.6969, + "step": 8967 + }, + { + "epoch": 0.737955153260646, + "grad_norm": 2.4233169548347577, + "learning_rate": 3.3899007849271617e-06, + "loss": 0.7287, + "step": 8968 + }, + { + "epoch": 0.738037440855791, + "grad_norm": 2.3700117396661673, + "learning_rate": 3.387901037233312e-06, + "loss": 0.7371, + "step": 8969 + }, + { + "epoch": 0.738119728450936, + "grad_norm": 2.3764958883327143, + "learning_rate": 3.385901759252399e-06, + "loss": 0.7296, + "step": 8970 + }, + { + "epoch": 0.738202016046081, + "grad_norm": 1.9508298575233165, + "learning_rate": 3.3839029511264465e-06, + "loss": 0.7116, + "step": 8971 + }, + { + "epoch": 0.7382843036412261, + "grad_norm": 1.8474046002194677, + "learning_rate": 3.3819046129974452e-06, + "loss": 0.7249, + "step": 8972 + }, + { + "epoch": 0.7383665912363712, + "grad_norm": 1.9027900054016826, + "learning_rate": 3.3799067450073584e-06, + "loss": 0.7141, + "step": 8973 + }, + { + "epoch": 0.7384488788315161, + "grad_norm": 0.4174221125188079, + "learning_rate": 3.377909347298106e-06, + "loss": 0.5081, + "step": 8974 + }, + { + "epoch": 0.7385311664266612, + "grad_norm": 1.9497172296975591, + "learning_rate": 3.375912420011587e-06, + "loss": 0.7134, + "step": 8975 + }, + { + "epoch": 0.7386134540218062, + "grad_norm": 1.8940475973987165, + "learning_rate": 3.3739159632896556e-06, + "loss": 0.7026, + "step": 8976 + }, + { + "epoch": 0.7386957416169513, + "grad_norm": 1.7691158774163358, + "learning_rate": 3.3719199772741373e-06, + "loss": 0.7371, + "step": 8977 + }, + { + "epoch": 0.7387780292120962, + "grad_norm": 2.603206503791469, + "learning_rate": 3.3699244621068207e-06, + "loss": 0.7113, + "step": 8978 + }, + { + "epoch": 0.7388603168072413, + "grad_norm": 0.4125430986622687, + "learning_rate": 3.36792941792947e-06, + "loss": 0.4724, + "step": 8979 + }, + { + "epoch": 0.7389426044023863, + "grad_norm": 2.000974828831058, + "learning_rate": 3.3659348448838035e-06, + "loss": 0.7035, + "step": 8980 + }, + { + "epoch": 0.7390248919975314, + "grad_norm": 2.008299069258025, + "learning_rate": 3.363940743111519e-06, + "loss": 0.7196, + "step": 8981 + }, + { + "epoch": 0.7391071795926764, + "grad_norm": 1.942918457552559, + "learning_rate": 3.3619471127542724e-06, + "loss": 0.7114, + "step": 8982 + }, + { + "epoch": 0.7391894671878214, + "grad_norm": 2.4510183147012365, + "learning_rate": 3.3599539539536864e-06, + "loss": 0.7289, + "step": 8983 + }, + { + "epoch": 0.7392717547829665, + "grad_norm": 1.723469604094164, + "learning_rate": 3.3579612668513496e-06, + "loss": 0.7048, + "step": 8984 + }, + { + "epoch": 0.7393540423781115, + "grad_norm": 2.226827118033211, + "learning_rate": 3.3559690515888245e-06, + "loss": 0.7248, + "step": 8985 + }, + { + "epoch": 0.7394363299732565, + "grad_norm": 1.693521657536107, + "learning_rate": 3.3539773083076287e-06, + "loss": 0.7076, + "step": 8986 + }, + { + "epoch": 0.7395186175684015, + "grad_norm": 2.1219140856269343, + "learning_rate": 3.3519860371492607e-06, + "loss": 0.7085, + "step": 8987 + }, + { + "epoch": 0.7396009051635466, + "grad_norm": 1.9939409650531428, + "learning_rate": 3.3499952382551725e-06, + "loss": 0.7188, + "step": 8988 + }, + { + "epoch": 0.7396831927586917, + "grad_norm": 1.6322306196166714, + "learning_rate": 3.348004911766787e-06, + "loss": 0.7066, + "step": 8989 + }, + { + "epoch": 0.7397654803538367, + "grad_norm": 1.8141286475263982, + "learning_rate": 3.3460150578254913e-06, + "loss": 0.6988, + "step": 8990 + }, + { + "epoch": 0.7398477679489817, + "grad_norm": 2.2663633575228257, + "learning_rate": 3.3440256765726477e-06, + "loss": 0.7201, + "step": 8991 + }, + { + "epoch": 0.7399300555441267, + "grad_norm": 2.17368023763863, + "learning_rate": 3.3420367681495715e-06, + "loss": 0.7136, + "step": 8992 + }, + { + "epoch": 0.7400123431392718, + "grad_norm": 1.6456156106840318, + "learning_rate": 3.3400483326975598e-06, + "loss": 0.7249, + "step": 8993 + }, + { + "epoch": 0.7400946307344168, + "grad_norm": 1.7849799816293264, + "learning_rate": 3.338060370357862e-06, + "loss": 0.7068, + "step": 8994 + }, + { + "epoch": 0.7401769183295618, + "grad_norm": 0.4197574670756084, + "learning_rate": 3.3360728812717027e-06, + "loss": 0.4659, + "step": 8995 + }, + { + "epoch": 0.7402592059247068, + "grad_norm": 2.1734884311231113, + "learning_rate": 3.3340858655802644e-06, + "loss": 0.727, + "step": 8996 + }, + { + "epoch": 0.7403414935198519, + "grad_norm": 1.9700227922811273, + "learning_rate": 3.332099323424709e-06, + "loss": 0.7175, + "step": 8997 + }, + { + "epoch": 0.740423781114997, + "grad_norm": 1.6057805511002325, + "learning_rate": 3.330113254946151e-06, + "loss": 0.68, + "step": 8998 + }, + { + "epoch": 0.7405060687101419, + "grad_norm": 2.004822286357248, + "learning_rate": 3.328127660285684e-06, + "loss": 0.7422, + "step": 8999 + }, + { + "epoch": 0.740588356305287, + "grad_norm": 2.3561546677088856, + "learning_rate": 3.326142539584357e-06, + "loss": 0.7065, + "step": 9000 + }, + { + "epoch": 0.740670643900432, + "grad_norm": 2.0556251580612455, + "learning_rate": 3.324157892983192e-06, + "loss": 0.7405, + "step": 9001 + }, + { + "epoch": 0.7407529314955771, + "grad_norm": 0.4092342813876628, + "learning_rate": 3.32217372062317e-06, + "loss": 0.463, + "step": 9002 + }, + { + "epoch": 0.740835219090722, + "grad_norm": 1.70618006266928, + "learning_rate": 3.32019002264525e-06, + "loss": 0.7058, + "step": 9003 + }, + { + "epoch": 0.7409175066858671, + "grad_norm": 2.55142108841872, + "learning_rate": 3.3182067991903453e-06, + "loss": 0.6992, + "step": 9004 + }, + { + "epoch": 0.7409997942810121, + "grad_norm": 2.1877081509367073, + "learning_rate": 3.316224050399347e-06, + "loss": 0.7378, + "step": 9005 + }, + { + "epoch": 0.7410820818761572, + "grad_norm": 1.8362970549312607, + "learning_rate": 3.314241776413103e-06, + "loss": 0.7124, + "step": 9006 + }, + { + "epoch": 0.7411643694713022, + "grad_norm": 0.41120187525149937, + "learning_rate": 3.3122599773724307e-06, + "loss": 0.4666, + "step": 9007 + }, + { + "epoch": 0.7412466570664472, + "grad_norm": 0.41576316267350166, + "learning_rate": 3.310278653418111e-06, + "loss": 0.4933, + "step": 9008 + }, + { + "epoch": 0.7413289446615923, + "grad_norm": 2.023132995091473, + "learning_rate": 3.3082978046908996e-06, + "loss": 0.7222, + "step": 9009 + }, + { + "epoch": 0.7414112322567373, + "grad_norm": 1.6800134523801462, + "learning_rate": 3.306317431331508e-06, + "loss": 0.6882, + "step": 9010 + }, + { + "epoch": 0.7414935198518823, + "grad_norm": 0.4436816117691066, + "learning_rate": 3.304337533480625e-06, + "loss": 0.4857, + "step": 9011 + }, + { + "epoch": 0.7415758074470273, + "grad_norm": 1.718983035694611, + "learning_rate": 3.3023581112788938e-06, + "loss": 0.696, + "step": 9012 + }, + { + "epoch": 0.7416580950421724, + "grad_norm": 2.1461186618914683, + "learning_rate": 3.3003791648669327e-06, + "loss": 0.7195, + "step": 9013 + }, + { + "epoch": 0.7417403826373175, + "grad_norm": 0.4063916328825096, + "learning_rate": 3.2984006943853176e-06, + "loss": 0.4595, + "step": 9014 + }, + { + "epoch": 0.7418226702324625, + "grad_norm": 1.9855083965618725, + "learning_rate": 3.296422699974602e-06, + "loss": 0.6981, + "step": 9015 + }, + { + "epoch": 0.7419049578276075, + "grad_norm": 2.2454527093602588, + "learning_rate": 3.2944451817752944e-06, + "loss": 0.7141, + "step": 9016 + }, + { + "epoch": 0.7419872454227525, + "grad_norm": 2.259099353028589, + "learning_rate": 3.2924681399278835e-06, + "loss": 0.7153, + "step": 9017 + }, + { + "epoch": 0.7420695330178976, + "grad_norm": 3.3402722879209574, + "learning_rate": 3.2904915745728016e-06, + "loss": 0.6995, + "step": 9018 + }, + { + "epoch": 0.7421518206130426, + "grad_norm": 1.8987634528161403, + "learning_rate": 3.288515485850472e-06, + "loss": 0.7208, + "step": 9019 + }, + { + "epoch": 0.7422341082081876, + "grad_norm": 2.102491954990254, + "learning_rate": 3.2865398739012654e-06, + "loss": 0.716, + "step": 9020 + }, + { + "epoch": 0.7423163958033326, + "grad_norm": 3.0740716457521793, + "learning_rate": 3.2845647388655335e-06, + "loss": 0.7242, + "step": 9021 + }, + { + "epoch": 0.7423986833984777, + "grad_norm": 2.3132049336249816, + "learning_rate": 3.2825900808835788e-06, + "loss": 0.7301, + "step": 9022 + }, + { + "epoch": 0.7424809709936228, + "grad_norm": 1.9919069204227817, + "learning_rate": 3.280615900095688e-06, + "loss": 0.7088, + "step": 9023 + }, + { + "epoch": 0.7425632585887677, + "grad_norm": 2.235254878316146, + "learning_rate": 3.278642196642092e-06, + "loss": 0.7496, + "step": 9024 + }, + { + "epoch": 0.7426455461839128, + "grad_norm": 2.3920423238103656, + "learning_rate": 3.2766689706630085e-06, + "loss": 0.6976, + "step": 9025 + }, + { + "epoch": 0.7427278337790578, + "grad_norm": 1.9473864437602242, + "learning_rate": 3.2746962222986056e-06, + "loss": 0.7215, + "step": 9026 + }, + { + "epoch": 0.7428101213742029, + "grad_norm": 0.4882337396535953, + "learning_rate": 3.2727239516890297e-06, + "loss": 0.4853, + "step": 9027 + }, + { + "epoch": 0.7428924089693478, + "grad_norm": 1.6938737958799037, + "learning_rate": 3.2707521589743864e-06, + "loss": 0.7322, + "step": 9028 + }, + { + "epoch": 0.7429746965644929, + "grad_norm": 2.544532368713991, + "learning_rate": 3.268780844294748e-06, + "loss": 0.7217, + "step": 9029 + }, + { + "epoch": 0.743056984159638, + "grad_norm": 1.9003332220255604, + "learning_rate": 3.26681000779015e-06, + "loss": 0.7265, + "step": 9030 + }, + { + "epoch": 0.743139271754783, + "grad_norm": 2.037057695401237, + "learning_rate": 3.264839649600604e-06, + "loss": 0.7255, + "step": 9031 + }, + { + "epoch": 0.743221559349928, + "grad_norm": 2.142601702924721, + "learning_rate": 3.262869769866075e-06, + "loss": 0.724, + "step": 9032 + }, + { + "epoch": 0.743303846945073, + "grad_norm": 2.2480629516843798, + "learning_rate": 3.260900368726506e-06, + "loss": 0.6979, + "step": 9033 + }, + { + "epoch": 0.7433861345402181, + "grad_norm": 2.0397195244285227, + "learning_rate": 3.258931446321797e-06, + "loss": 0.727, + "step": 9034 + }, + { + "epoch": 0.7434684221353631, + "grad_norm": 2.131173005090217, + "learning_rate": 3.2569630027918176e-06, + "loss": 0.7049, + "step": 9035 + }, + { + "epoch": 0.7435507097305081, + "grad_norm": 1.8438096783328601, + "learning_rate": 3.2549950382763993e-06, + "loss": 0.7258, + "step": 9036 + }, + { + "epoch": 0.7436329973256531, + "grad_norm": 0.4191417848098184, + "learning_rate": 3.25302755291535e-06, + "loss": 0.4884, + "step": 9037 + }, + { + "epoch": 0.7437152849207982, + "grad_norm": 2.4978512311041388, + "learning_rate": 3.25106054684843e-06, + "loss": 0.7455, + "step": 9038 + }, + { + "epoch": 0.7437975725159433, + "grad_norm": 2.240283677606699, + "learning_rate": 3.2490940202153787e-06, + "loss": 0.7009, + "step": 9039 + }, + { + "epoch": 0.7438798601110882, + "grad_norm": 1.8762166463667564, + "learning_rate": 3.2471279731558913e-06, + "loss": 0.7058, + "step": 9040 + }, + { + "epoch": 0.7439621477062333, + "grad_norm": 0.4020021874277218, + "learning_rate": 3.2451624058096332e-06, + "loss": 0.4841, + "step": 9041 + }, + { + "epoch": 0.7440444353013783, + "grad_norm": 1.7215375795430048, + "learning_rate": 3.2431973183162312e-06, + "loss": 0.6955, + "step": 9042 + }, + { + "epoch": 0.7441267228965234, + "grad_norm": 1.9841667354240453, + "learning_rate": 3.24123271081529e-06, + "loss": 0.7237, + "step": 9043 + }, + { + "epoch": 0.7442090104916684, + "grad_norm": 1.6393074462972284, + "learning_rate": 3.2392685834463645e-06, + "loss": 0.7228, + "step": 9044 + }, + { + "epoch": 0.7442912980868134, + "grad_norm": 1.7964233730473773, + "learning_rate": 3.2373049363489895e-06, + "loss": 0.6913, + "step": 9045 + }, + { + "epoch": 0.7443735856819584, + "grad_norm": 3.9465841533285073, + "learning_rate": 3.2353417696626567e-06, + "loss": 0.7243, + "step": 9046 + }, + { + "epoch": 0.7444558732771035, + "grad_norm": 0.3868727799886983, + "learning_rate": 3.2333790835268264e-06, + "loss": 0.4703, + "step": 9047 + }, + { + "epoch": 0.7445381608722486, + "grad_norm": 1.7344749622946751, + "learning_rate": 3.2314168780809206e-06, + "loss": 0.7263, + "step": 9048 + }, + { + "epoch": 0.7446204484673935, + "grad_norm": 1.7261190466354583, + "learning_rate": 3.229455153464338e-06, + "loss": 0.7473, + "step": 9049 + }, + { + "epoch": 0.7447027360625386, + "grad_norm": 2.4457483853275575, + "learning_rate": 3.2274939098164316e-06, + "loss": 0.6957, + "step": 9050 + }, + { + "epoch": 0.7447850236576836, + "grad_norm": 2.0304723235014954, + "learning_rate": 3.2255331472765282e-06, + "loss": 0.6965, + "step": 9051 + }, + { + "epoch": 0.7448673112528287, + "grad_norm": 2.66182430965699, + "learning_rate": 3.2235728659839174e-06, + "loss": 0.7124, + "step": 9052 + }, + { + "epoch": 0.7449495988479736, + "grad_norm": 2.790029001152084, + "learning_rate": 3.221613066077852e-06, + "loss": 0.7339, + "step": 9053 + }, + { + "epoch": 0.7450318864431187, + "grad_norm": 1.964072418645399, + "learning_rate": 3.219653747697551e-06, + "loss": 0.6952, + "step": 9054 + }, + { + "epoch": 0.7451141740382637, + "grad_norm": 0.4244714021361457, + "learning_rate": 3.2176949109822076e-06, + "loss": 0.5176, + "step": 9055 + }, + { + "epoch": 0.7451964616334088, + "grad_norm": 0.40322194671851086, + "learning_rate": 3.2157365560709674e-06, + "loss": 0.4741, + "step": 9056 + }, + { + "epoch": 0.7452787492285537, + "grad_norm": 2.7110392008035338, + "learning_rate": 3.213778683102956e-06, + "loss": 0.6849, + "step": 9057 + }, + { + "epoch": 0.7453610368236988, + "grad_norm": 0.4120034422494695, + "learning_rate": 3.2118212922172543e-06, + "loss": 0.4735, + "step": 9058 + }, + { + "epoch": 0.7454433244188439, + "grad_norm": 1.604510026012353, + "learning_rate": 3.209864383552912e-06, + "loss": 0.7073, + "step": 9059 + }, + { + "epoch": 0.7455256120139889, + "grad_norm": 1.927053357329461, + "learning_rate": 3.2079079572489402e-06, + "loss": 0.713, + "step": 9060 + }, + { + "epoch": 0.7456078996091339, + "grad_norm": 0.4178091249029116, + "learning_rate": 3.2059520134443257e-06, + "loss": 0.4492, + "step": 9061 + }, + { + "epoch": 0.7456901872042789, + "grad_norm": 2.0587590027016263, + "learning_rate": 3.203996552278018e-06, + "loss": 0.6725, + "step": 9062 + }, + { + "epoch": 0.745772474799424, + "grad_norm": 1.7762745478107809, + "learning_rate": 3.2020415738889267e-06, + "loss": 0.7039, + "step": 9063 + }, + { + "epoch": 0.745854762394569, + "grad_norm": 1.9352868260680476, + "learning_rate": 3.20008707841593e-06, + "loss": 0.7108, + "step": 9064 + }, + { + "epoch": 0.745937049989714, + "grad_norm": 2.235455143858674, + "learning_rate": 3.1981330659978695e-06, + "loss": 0.722, + "step": 9065 + }, + { + "epoch": 0.746019337584859, + "grad_norm": 0.4176980585652352, + "learning_rate": 3.1961795367735603e-06, + "loss": 0.4642, + "step": 9066 + }, + { + "epoch": 0.7461016251800041, + "grad_norm": 2.116787216394759, + "learning_rate": 3.194226490881773e-06, + "loss": 0.7157, + "step": 9067 + }, + { + "epoch": 0.7461839127751492, + "grad_norm": 1.8697990065933472, + "learning_rate": 3.1922739284612545e-06, + "loss": 0.6806, + "step": 9068 + }, + { + "epoch": 0.7462662003702942, + "grad_norm": 3.248906168149121, + "learning_rate": 3.1903218496507095e-06, + "loss": 0.7183, + "step": 9069 + }, + { + "epoch": 0.7463484879654392, + "grad_norm": 2.157367549507887, + "learning_rate": 3.1883702545888096e-06, + "loss": 0.7486, + "step": 9070 + }, + { + "epoch": 0.7464307755605842, + "grad_norm": 1.8433242608870348, + "learning_rate": 3.18641914341419e-06, + "loss": 0.7407, + "step": 9071 + }, + { + "epoch": 0.7465130631557293, + "grad_norm": 1.8880999806596521, + "learning_rate": 3.184468516265461e-06, + "loss": 0.6945, + "step": 9072 + }, + { + "epoch": 0.7465953507508744, + "grad_norm": 2.1872245785081557, + "learning_rate": 3.182518373281185e-06, + "loss": 0.721, + "step": 9073 + }, + { + "epoch": 0.7466776383460193, + "grad_norm": 2.3695625808252907, + "learning_rate": 3.180568714599904e-06, + "loss": 0.7204, + "step": 9074 + }, + { + "epoch": 0.7467599259411644, + "grad_norm": 0.41465145693880506, + "learning_rate": 3.178619540360116e-06, + "loss": 0.4902, + "step": 9075 + }, + { + "epoch": 0.7468422135363094, + "grad_norm": 1.7204201572692448, + "learning_rate": 3.1766708507002865e-06, + "loss": 0.71, + "step": 9076 + }, + { + "epoch": 0.7469245011314545, + "grad_norm": 2.415663355967086, + "learning_rate": 3.1747226457588443e-06, + "loss": 0.6888, + "step": 9077 + }, + { + "epoch": 0.7470067887265994, + "grad_norm": 2.2708931926674225, + "learning_rate": 3.1727749256741937e-06, + "loss": 0.7224, + "step": 9078 + }, + { + "epoch": 0.7470890763217445, + "grad_norm": 1.9711808854697248, + "learning_rate": 3.170827690584689e-06, + "loss": 0.7016, + "step": 9079 + }, + { + "epoch": 0.7471713639168895, + "grad_norm": 0.4377734858410571, + "learning_rate": 3.168880940628668e-06, + "loss": 0.4672, + "step": 9080 + }, + { + "epoch": 0.7472536515120346, + "grad_norm": 3.1124431814838824, + "learning_rate": 3.1669346759444195e-06, + "loss": 0.6986, + "step": 9081 + }, + { + "epoch": 0.7473359391071795, + "grad_norm": 1.7721621005249708, + "learning_rate": 3.164988896670205e-06, + "loss": 0.7009, + "step": 9082 + }, + { + "epoch": 0.7474182267023246, + "grad_norm": 2.0193477246824165, + "learning_rate": 3.163043602944245e-06, + "loss": 0.7165, + "step": 9083 + }, + { + "epoch": 0.7475005142974697, + "grad_norm": 1.9829989560093042, + "learning_rate": 3.1610987949047356e-06, + "loss": 0.7085, + "step": 9084 + }, + { + "epoch": 0.7475828018926147, + "grad_norm": 2.1070792283309934, + "learning_rate": 3.1591544726898272e-06, + "loss": 0.7181, + "step": 9085 + }, + { + "epoch": 0.7476650894877597, + "grad_norm": 2.0284262839432405, + "learning_rate": 3.157210636437649e-06, + "loss": 0.7025, + "step": 9086 + }, + { + "epoch": 0.7477473770829047, + "grad_norm": 1.811888924625548, + "learning_rate": 3.155267286286282e-06, + "loss": 0.7085, + "step": 9087 + }, + { + "epoch": 0.7478296646780498, + "grad_norm": 2.2168447215944798, + "learning_rate": 3.1533244223737823e-06, + "loss": 0.694, + "step": 9088 + }, + { + "epoch": 0.7479119522731948, + "grad_norm": 2.0871455517214246, + "learning_rate": 3.1513820448381616e-06, + "loss": 0.7147, + "step": 9089 + }, + { + "epoch": 0.7479942398683398, + "grad_norm": 1.8349210136147347, + "learning_rate": 3.1494401538174115e-06, + "loss": 0.738, + "step": 9090 + }, + { + "epoch": 0.7480765274634849, + "grad_norm": 2.017938905484706, + "learning_rate": 3.1474987494494733e-06, + "loss": 0.7414, + "step": 9091 + }, + { + "epoch": 0.7481588150586299, + "grad_norm": 1.766054573906848, + "learning_rate": 3.145557831872269e-06, + "loss": 0.7291, + "step": 9092 + }, + { + "epoch": 0.748241102653775, + "grad_norm": 2.3608539117108664, + "learning_rate": 3.143617401223674e-06, + "loss": 0.6611, + "step": 9093 + }, + { + "epoch": 0.74832339024892, + "grad_norm": 1.944227478985891, + "learning_rate": 3.1416774576415332e-06, + "loss": 0.7408, + "step": 9094 + }, + { + "epoch": 0.748405677844065, + "grad_norm": 1.9035883536691198, + "learning_rate": 3.1397380012636545e-06, + "loss": 0.6913, + "step": 9095 + }, + { + "epoch": 0.74848796543921, + "grad_norm": 1.8822904250516452, + "learning_rate": 3.1377990322278207e-06, + "loss": 0.7148, + "step": 9096 + }, + { + "epoch": 0.7485702530343551, + "grad_norm": 1.9999552649302352, + "learning_rate": 3.1358605506717655e-06, + "loss": 0.6891, + "step": 9097 + }, + { + "epoch": 0.7486525406295002, + "grad_norm": 2.599317253058182, + "learning_rate": 3.1339225567332055e-06, + "loss": 0.6938, + "step": 9098 + }, + { + "epoch": 0.7487348282246451, + "grad_norm": 2.036942483453893, + "learning_rate": 3.1319850505498006e-06, + "loss": 0.7026, + "step": 9099 + }, + { + "epoch": 0.7488171158197902, + "grad_norm": 1.8703573405983325, + "learning_rate": 3.1300480322591966e-06, + "loss": 0.7294, + "step": 9100 + }, + { + "epoch": 0.7488994034149352, + "grad_norm": 2.160680210891329, + "learning_rate": 3.1281115019989892e-06, + "loss": 0.7298, + "step": 9101 + }, + { + "epoch": 0.7489816910100803, + "grad_norm": 1.7695581846559167, + "learning_rate": 3.1261754599067562e-06, + "loss": 0.7102, + "step": 9102 + }, + { + "epoch": 0.7490639786052252, + "grad_norm": 2.039661917565203, + "learning_rate": 3.1242399061200214e-06, + "loss": 0.7264, + "step": 9103 + }, + { + "epoch": 0.7491462662003703, + "grad_norm": 1.5700024672774617, + "learning_rate": 3.1223048407762933e-06, + "loss": 0.6904, + "step": 9104 + }, + { + "epoch": 0.7492285537955153, + "grad_norm": 1.6663521683117708, + "learning_rate": 3.120370264013024e-06, + "loss": 0.7074, + "step": 9105 + }, + { + "epoch": 0.7493108413906604, + "grad_norm": 1.7596503034468816, + "learning_rate": 3.118436175967652e-06, + "loss": 0.6874, + "step": 9106 + }, + { + "epoch": 0.7493931289858053, + "grad_norm": 2.1669426098093267, + "learning_rate": 3.1165025767775646e-06, + "loss": 0.7192, + "step": 9107 + }, + { + "epoch": 0.7494754165809504, + "grad_norm": 0.40828739957520754, + "learning_rate": 3.1145694665801305e-06, + "loss": 0.4656, + "step": 9108 + }, + { + "epoch": 0.7495577041760955, + "grad_norm": 2.212928282672075, + "learning_rate": 3.1126368455126686e-06, + "loss": 0.7156, + "step": 9109 + }, + { + "epoch": 0.7496399917712405, + "grad_norm": 2.0502579875591533, + "learning_rate": 3.1107047137124714e-06, + "loss": 0.7137, + "step": 9110 + }, + { + "epoch": 0.7497222793663855, + "grad_norm": 3.2333301711125895, + "learning_rate": 3.1087730713167895e-06, + "loss": 0.6999, + "step": 9111 + }, + { + "epoch": 0.7498045669615305, + "grad_norm": 1.893397855250217, + "learning_rate": 3.1068419184628517e-06, + "loss": 0.6878, + "step": 9112 + }, + { + "epoch": 0.7498868545566756, + "grad_norm": 3.0412647766826657, + "learning_rate": 3.1049112552878357e-06, + "loss": 0.7136, + "step": 9113 + }, + { + "epoch": 0.7499691421518206, + "grad_norm": 2.052746906688453, + "learning_rate": 3.1029810819289017e-06, + "loss": 0.7366, + "step": 9114 + }, + { + "epoch": 0.7500514297469656, + "grad_norm": 1.6043563433792207, + "learning_rate": 3.1010513985231605e-06, + "loss": 0.7002, + "step": 9115 + }, + { + "epoch": 0.7501337173421107, + "grad_norm": 4.519767102705469, + "learning_rate": 3.099122205207695e-06, + "loss": 0.6952, + "step": 9116 + }, + { + "epoch": 0.7502160049372557, + "grad_norm": 1.7890760097313998, + "learning_rate": 3.0971935021195486e-06, + "loss": 0.7097, + "step": 9117 + }, + { + "epoch": 0.7502982925324008, + "grad_norm": 1.771139177197992, + "learning_rate": 3.0952652893957393e-06, + "loss": 0.6949, + "step": 9118 + }, + { + "epoch": 0.7503805801275458, + "grad_norm": 2.1199120756760186, + "learning_rate": 3.093337567173238e-06, + "loss": 0.6938, + "step": 9119 + }, + { + "epoch": 0.7504628677226908, + "grad_norm": 2.4973081169671185, + "learning_rate": 3.0914103355889946e-06, + "loss": 0.7113, + "step": 9120 + }, + { + "epoch": 0.7505451553178358, + "grad_norm": 2.139083321915509, + "learning_rate": 3.0894835947799117e-06, + "loss": 0.7217, + "step": 9121 + }, + { + "epoch": 0.7506274429129809, + "grad_norm": 3.142568008851085, + "learning_rate": 3.087557344882862e-06, + "loss": 0.6951, + "step": 9122 + }, + { + "epoch": 0.750709730508126, + "grad_norm": 4.638577672837599, + "learning_rate": 3.0856315860346807e-06, + "loss": 0.7205, + "step": 9123 + }, + { + "epoch": 0.7507920181032709, + "grad_norm": 3.1926100504826147, + "learning_rate": 3.083706318372177e-06, + "loss": 0.7405, + "step": 9124 + }, + { + "epoch": 0.750874305698416, + "grad_norm": 0.44653820663460875, + "learning_rate": 3.0817815420321128e-06, + "loss": 0.4858, + "step": 9125 + }, + { + "epoch": 0.750956593293561, + "grad_norm": 2.227286421829932, + "learning_rate": 3.0798572571512265e-06, + "loss": 0.7345, + "step": 9126 + }, + { + "epoch": 0.7510388808887061, + "grad_norm": 2.266921785376986, + "learning_rate": 3.077933463866213e-06, + "loss": 0.7362, + "step": 9127 + }, + { + "epoch": 0.751121168483851, + "grad_norm": 1.8354014188473562, + "learning_rate": 3.0760101623137373e-06, + "loss": 0.7135, + "step": 9128 + }, + { + "epoch": 0.7512034560789961, + "grad_norm": 2.3671674424194133, + "learning_rate": 3.0740873526304214e-06, + "loss": 0.7403, + "step": 9129 + }, + { + "epoch": 0.7512857436741411, + "grad_norm": 2.0093641054033555, + "learning_rate": 3.072165034952869e-06, + "loss": 0.7411, + "step": 9130 + }, + { + "epoch": 0.7513680312692862, + "grad_norm": 2.666387309912012, + "learning_rate": 3.070243209417628e-06, + "loss": 0.7146, + "step": 9131 + }, + { + "epoch": 0.7514503188644311, + "grad_norm": 0.417197392400611, + "learning_rate": 3.068321876161231e-06, + "loss": 0.4974, + "step": 9132 + }, + { + "epoch": 0.7515326064595762, + "grad_norm": 1.9424162899308142, + "learning_rate": 3.0664010353201624e-06, + "loss": 0.7272, + "step": 9133 + }, + { + "epoch": 0.7516148940547213, + "grad_norm": 2.202626878770787, + "learning_rate": 3.064480687030875e-06, + "loss": 0.7265, + "step": 9134 + }, + { + "epoch": 0.7516971816498663, + "grad_norm": 2.0956349960898217, + "learning_rate": 3.062560831429785e-06, + "loss": 0.7228, + "step": 9135 + }, + { + "epoch": 0.7517794692450113, + "grad_norm": 1.7618399597110652, + "learning_rate": 3.0606414686532814e-06, + "loss": 0.7389, + "step": 9136 + }, + { + "epoch": 0.7518617568401563, + "grad_norm": 1.7553777146506013, + "learning_rate": 3.058722598837708e-06, + "loss": 0.7205, + "step": 9137 + }, + { + "epoch": 0.7519440444353014, + "grad_norm": 0.41287520900696206, + "learning_rate": 3.056804222119383e-06, + "loss": 0.4676, + "step": 9138 + }, + { + "epoch": 0.7520263320304464, + "grad_norm": 0.4243613911359341, + "learning_rate": 3.054886338634582e-06, + "loss": 0.4897, + "step": 9139 + }, + { + "epoch": 0.7521086196255914, + "grad_norm": 2.3025683972553166, + "learning_rate": 3.052968948519548e-06, + "loss": 0.6899, + "step": 9140 + }, + { + "epoch": 0.7521909072207364, + "grad_norm": 0.42587891552155555, + "learning_rate": 3.051052051910487e-06, + "loss": 0.476, + "step": 9141 + }, + { + "epoch": 0.7522731948158815, + "grad_norm": 2.0742557776021986, + "learning_rate": 3.0491356489435776e-06, + "loss": 0.7268, + "step": 9142 + }, + { + "epoch": 0.7523554824110266, + "grad_norm": 1.8156939599282875, + "learning_rate": 3.0472197397549528e-06, + "loss": 0.7237, + "step": 9143 + }, + { + "epoch": 0.7524377700061716, + "grad_norm": 6.026961620508389, + "learning_rate": 3.045304324480721e-06, + "loss": 0.7182, + "step": 9144 + }, + { + "epoch": 0.7525200576013166, + "grad_norm": 1.7166097905865065, + "learning_rate": 3.043389403256949e-06, + "loss": 0.6787, + "step": 9145 + }, + { + "epoch": 0.7526023451964616, + "grad_norm": 1.8028685188678983, + "learning_rate": 3.0414749762196673e-06, + "loss": 0.7125, + "step": 9146 + }, + { + "epoch": 0.7526846327916067, + "grad_norm": 0.4218401580409773, + "learning_rate": 3.0395610435048718e-06, + "loss": 0.4821, + "step": 9147 + }, + { + "epoch": 0.7527669203867517, + "grad_norm": 5.423612420550729, + "learning_rate": 3.0376476052485306e-06, + "loss": 0.7528, + "step": 9148 + }, + { + "epoch": 0.7528492079818967, + "grad_norm": 1.9159103480138138, + "learning_rate": 3.0357346615865668e-06, + "loss": 0.6918, + "step": 9149 + }, + { + "epoch": 0.7529314955770418, + "grad_norm": 0.4211269181543552, + "learning_rate": 3.0338222126548778e-06, + "loss": 0.4683, + "step": 9150 + }, + { + "epoch": 0.7530137831721868, + "grad_norm": 0.41235924710879446, + "learning_rate": 3.031910258589318e-06, + "loss": 0.4679, + "step": 9151 + }, + { + "epoch": 0.7530960707673319, + "grad_norm": 2.0121423966366945, + "learning_rate": 3.0299987995257095e-06, + "loss": 0.7128, + "step": 9152 + }, + { + "epoch": 0.7531783583624768, + "grad_norm": 1.875812037225134, + "learning_rate": 3.0280878355998356e-06, + "loss": 0.7262, + "step": 9153 + }, + { + "epoch": 0.7532606459576219, + "grad_norm": 2.1385392555810436, + "learning_rate": 3.026177366947456e-06, + "loss": 0.7228, + "step": 9154 + }, + { + "epoch": 0.7533429335527669, + "grad_norm": 3.9281822702966536, + "learning_rate": 3.0242673937042797e-06, + "loss": 0.7352, + "step": 9155 + }, + { + "epoch": 0.753425221147912, + "grad_norm": 0.4072768257883261, + "learning_rate": 3.0223579160059956e-06, + "loss": 0.4514, + "step": 9156 + }, + { + "epoch": 0.7535075087430569, + "grad_norm": 2.828059994670032, + "learning_rate": 3.020448933988246e-06, + "loss": 0.686, + "step": 9157 + }, + { + "epoch": 0.753589796338202, + "grad_norm": 2.236896564640052, + "learning_rate": 3.018540447786641e-06, + "loss": 0.7097, + "step": 9158 + }, + { + "epoch": 0.7536720839333471, + "grad_norm": 2.0908123137544865, + "learning_rate": 3.0166324575367546e-06, + "loss": 0.6872, + "step": 9159 + }, + { + "epoch": 0.7537543715284921, + "grad_norm": 2.0563086078155437, + "learning_rate": 3.0147249633741338e-06, + "loss": 0.6763, + "step": 9160 + }, + { + "epoch": 0.7538366591236371, + "grad_norm": 0.4082749854768439, + "learning_rate": 3.012817965434277e-06, + "loss": 0.457, + "step": 9161 + }, + { + "epoch": 0.7539189467187821, + "grad_norm": 1.8421726626282753, + "learning_rate": 3.0109114638526636e-06, + "loss": 0.7231, + "step": 9162 + }, + { + "epoch": 0.7540012343139272, + "grad_norm": 2.3022675113878686, + "learning_rate": 3.0090054587647153e-06, + "loss": 0.744, + "step": 9163 + }, + { + "epoch": 0.7540835219090722, + "grad_norm": 1.9175950906780805, + "learning_rate": 3.007099950305844e-06, + "loss": 0.7183, + "step": 9164 + }, + { + "epoch": 0.7541658095042172, + "grad_norm": 2.177843340285793, + "learning_rate": 3.005194938611404e-06, + "loss": 0.7209, + "step": 9165 + }, + { + "epoch": 0.7542480970993622, + "grad_norm": 1.9866611762887159, + "learning_rate": 3.0032904238167327e-06, + "loss": 0.7063, + "step": 9166 + }, + { + "epoch": 0.7543303846945073, + "grad_norm": 2.246648730830409, + "learning_rate": 3.0013864060571173e-06, + "loss": 0.7172, + "step": 9167 + }, + { + "epoch": 0.7544126722896524, + "grad_norm": 1.9644517398582584, + "learning_rate": 2.9994828854678247e-06, + "loss": 0.7347, + "step": 9168 + }, + { + "epoch": 0.7544949598847973, + "grad_norm": 2.094922574859876, + "learning_rate": 2.9975798621840657e-06, + "loss": 0.7257, + "step": 9169 + }, + { + "epoch": 0.7545772474799424, + "grad_norm": 2.0780835581119246, + "learning_rate": 2.9956773363410387e-06, + "loss": 0.7331, + "step": 9170 + }, + { + "epoch": 0.7546595350750874, + "grad_norm": 1.819267409152771, + "learning_rate": 2.9937753080738874e-06, + "loss": 0.7144, + "step": 9171 + }, + { + "epoch": 0.7547418226702325, + "grad_norm": 2.3961463864813246, + "learning_rate": 2.9918737775177376e-06, + "loss": 0.7038, + "step": 9172 + }, + { + "epoch": 0.7548241102653775, + "grad_norm": 1.923390935902075, + "learning_rate": 2.9899727448076633e-06, + "loss": 0.7336, + "step": 9173 + }, + { + "epoch": 0.7549063978605225, + "grad_norm": 1.7278435257544527, + "learning_rate": 2.9880722100787207e-06, + "loss": 0.7073, + "step": 9174 + }, + { + "epoch": 0.7549886854556676, + "grad_norm": 0.41711274143831717, + "learning_rate": 2.986172173465909e-06, + "loss": 0.4469, + "step": 9175 + }, + { + "epoch": 0.7550709730508126, + "grad_norm": 1.7302607481639138, + "learning_rate": 2.9842726351042108e-06, + "loss": 0.7463, + "step": 9176 + }, + { + "epoch": 0.7551532606459577, + "grad_norm": 2.195503605782336, + "learning_rate": 2.9823735951285627e-06, + "loss": 0.7363, + "step": 9177 + }, + { + "epoch": 0.7552355482411026, + "grad_norm": 1.9517605729394807, + "learning_rate": 2.9804750536738737e-06, + "loss": 0.6869, + "step": 9178 + }, + { + "epoch": 0.7553178358362477, + "grad_norm": 2.1850486099838275, + "learning_rate": 2.9785770108750113e-06, + "loss": 0.7308, + "step": 9179 + }, + { + "epoch": 0.7554001234313927, + "grad_norm": 2.0952386684406297, + "learning_rate": 2.9766794668668087e-06, + "loss": 0.7159, + "step": 9180 + }, + { + "epoch": 0.7554824110265378, + "grad_norm": 2.2880300173613897, + "learning_rate": 2.9747824217840615e-06, + "loss": 0.7301, + "step": 9181 + }, + { + "epoch": 0.7555646986216827, + "grad_norm": 2.039450515983734, + "learning_rate": 2.9728858757615386e-06, + "loss": 0.6933, + "step": 9182 + }, + { + "epoch": 0.7556469862168278, + "grad_norm": 0.4304244435176246, + "learning_rate": 2.970989828933962e-06, + "loss": 0.482, + "step": 9183 + }, + { + "epoch": 0.7557292738119729, + "grad_norm": 2.1371824250446516, + "learning_rate": 2.96909428143603e-06, + "loss": 0.7165, + "step": 9184 + }, + { + "epoch": 0.7558115614071179, + "grad_norm": 0.41027123368479645, + "learning_rate": 2.967199233402396e-06, + "loss": 0.4647, + "step": 9185 + }, + { + "epoch": 0.7558938490022629, + "grad_norm": 2.8013625774052207, + "learning_rate": 2.9653046849676816e-06, + "loss": 0.7109, + "step": 9186 + }, + { + "epoch": 0.7559761365974079, + "grad_norm": 0.42572397557315034, + "learning_rate": 2.9634106362664684e-06, + "loss": 0.4901, + "step": 9187 + }, + { + "epoch": 0.756058424192553, + "grad_norm": 2.263352288777671, + "learning_rate": 2.961517087433313e-06, + "loss": 0.7097, + "step": 9188 + }, + { + "epoch": 0.756140711787698, + "grad_norm": 2.265305496474047, + "learning_rate": 2.9596240386027243e-06, + "loss": 0.699, + "step": 9189 + }, + { + "epoch": 0.756222999382843, + "grad_norm": 1.9400171234254058, + "learning_rate": 2.957731489909188e-06, + "loss": 0.7447, + "step": 9190 + }, + { + "epoch": 0.756305286977988, + "grad_norm": 4.304432763552447, + "learning_rate": 2.955839441487144e-06, + "loss": 0.7308, + "step": 9191 + }, + { + "epoch": 0.7563875745731331, + "grad_norm": 2.0882510697692864, + "learning_rate": 2.953947893471001e-06, + "loss": 0.7192, + "step": 9192 + }, + { + "epoch": 0.7564698621682782, + "grad_norm": 1.844672623143083, + "learning_rate": 2.9520568459951283e-06, + "loss": 0.7267, + "step": 9193 + }, + { + "epoch": 0.7565521497634231, + "grad_norm": 2.7834705629069583, + "learning_rate": 2.950166299193867e-06, + "loss": 0.7089, + "step": 9194 + }, + { + "epoch": 0.7566344373585682, + "grad_norm": 0.4279929826184908, + "learning_rate": 2.9482762532015163e-06, + "loss": 0.4538, + "step": 9195 + }, + { + "epoch": 0.7567167249537132, + "grad_norm": 2.067506458222209, + "learning_rate": 2.9463867081523456e-06, + "loss": 0.7459, + "step": 9196 + }, + { + "epoch": 0.7567990125488583, + "grad_norm": 1.8923168021622079, + "learning_rate": 2.9444976641805823e-06, + "loss": 0.7266, + "step": 9197 + }, + { + "epoch": 0.7568813001440033, + "grad_norm": 0.40937459382638797, + "learning_rate": 2.942609121420421e-06, + "loss": 0.4551, + "step": 9198 + }, + { + "epoch": 0.7569635877391483, + "grad_norm": 2.1017491629096448, + "learning_rate": 2.940721080006018e-06, + "loss": 0.7178, + "step": 9199 + }, + { + "epoch": 0.7570458753342934, + "grad_norm": 1.9468172965028057, + "learning_rate": 2.938833540071503e-06, + "loss": 0.7328, + "step": 9200 + }, + { + "epoch": 0.7571281629294384, + "grad_norm": 2.3043230044229883, + "learning_rate": 2.936946501750958e-06, + "loss": 0.7177, + "step": 9201 + }, + { + "epoch": 0.7572104505245835, + "grad_norm": 2.017309817215876, + "learning_rate": 2.9350599651784406e-06, + "loss": 0.7209, + "step": 9202 + }, + { + "epoch": 0.7572927381197284, + "grad_norm": 1.8968522419059137, + "learning_rate": 2.933173930487965e-06, + "loss": 0.7274, + "step": 9203 + }, + { + "epoch": 0.7573750257148735, + "grad_norm": 1.7865045207995185, + "learning_rate": 2.931288397813511e-06, + "loss": 0.7175, + "step": 9204 + }, + { + "epoch": 0.7574573133100185, + "grad_norm": 2.042108564771714, + "learning_rate": 2.929403367289021e-06, + "loss": 0.7385, + "step": 9205 + }, + { + "epoch": 0.7575396009051636, + "grad_norm": 1.7799054639514778, + "learning_rate": 2.9275188390484123e-06, + "loss": 0.7096, + "step": 9206 + }, + { + "epoch": 0.7576218885003085, + "grad_norm": 2.9114627669191964, + "learning_rate": 2.925634813225551e-06, + "loss": 0.7271, + "step": 9207 + }, + { + "epoch": 0.7577041760954536, + "grad_norm": 1.95507054090128, + "learning_rate": 2.923751289954282e-06, + "loss": 0.741, + "step": 9208 + }, + { + "epoch": 0.7577864636905987, + "grad_norm": 2.072308162730678, + "learning_rate": 2.921868269368404e-06, + "loss": 0.747, + "step": 9209 + }, + { + "epoch": 0.7578687512857437, + "grad_norm": 2.0740332259080705, + "learning_rate": 2.919985751601685e-06, + "loss": 0.7238, + "step": 9210 + }, + { + "epoch": 0.7579510388808887, + "grad_norm": 0.3948183513075035, + "learning_rate": 2.9181037367878517e-06, + "loss": 0.4776, + "step": 9211 + }, + { + "epoch": 0.7580333264760337, + "grad_norm": 1.9084530220037184, + "learning_rate": 2.9162222250606067e-06, + "loss": 0.7208, + "step": 9212 + }, + { + "epoch": 0.7581156140711788, + "grad_norm": 1.959729497235665, + "learning_rate": 2.9143412165536023e-06, + "loss": 0.7244, + "step": 9213 + }, + { + "epoch": 0.7581979016663238, + "grad_norm": 2.3396923997016885, + "learning_rate": 2.91246071140047e-06, + "loss": 0.7365, + "step": 9214 + }, + { + "epoch": 0.7582801892614688, + "grad_norm": 3.3417186067857316, + "learning_rate": 2.910580709734794e-06, + "loss": 0.7218, + "step": 9215 + }, + { + "epoch": 0.7583624768566138, + "grad_norm": 2.200476700290985, + "learning_rate": 2.908701211690126e-06, + "loss": 0.7184, + "step": 9216 + }, + { + "epoch": 0.7584447644517589, + "grad_norm": 2.2965308265993802, + "learning_rate": 2.9068222173999794e-06, + "loss": 0.733, + "step": 9217 + }, + { + "epoch": 0.758527052046904, + "grad_norm": 0.43945668496515106, + "learning_rate": 2.904943726997842e-06, + "loss": 0.4916, + "step": 9218 + }, + { + "epoch": 0.7586093396420489, + "grad_norm": 2.138509316764314, + "learning_rate": 2.9030657406171525e-06, + "loss": 0.7088, + "step": 9219 + }, + { + "epoch": 0.758691627237194, + "grad_norm": 1.6623849181407633, + "learning_rate": 2.9011882583913265e-06, + "loss": 0.6963, + "step": 9220 + }, + { + "epoch": 0.758773914832339, + "grad_norm": 2.0512241354165934, + "learning_rate": 2.8993112804537337e-06, + "loss": 0.7065, + "step": 9221 + }, + { + "epoch": 0.7588562024274841, + "grad_norm": 0.420586132599048, + "learning_rate": 2.897434806937711e-06, + "loss": 0.479, + "step": 9222 + }, + { + "epoch": 0.7589384900226291, + "grad_norm": 2.1824114753677852, + "learning_rate": 2.8955588379765578e-06, + "loss": 0.701, + "step": 9223 + }, + { + "epoch": 0.7590207776177741, + "grad_norm": 2.393876703411098, + "learning_rate": 2.8936833737035465e-06, + "loss": 0.707, + "step": 9224 + }, + { + "epoch": 0.7591030652129191, + "grad_norm": 2.158825689405682, + "learning_rate": 2.8918084142518998e-06, + "loss": 0.7084, + "step": 9225 + }, + { + "epoch": 0.7591853528080642, + "grad_norm": 0.4304709467545752, + "learning_rate": 2.8899339597548183e-06, + "loss": 0.4812, + "step": 9226 + }, + { + "epoch": 0.7592676404032093, + "grad_norm": 2.926851104067986, + "learning_rate": 2.888060010345458e-06, + "loss": 0.7257, + "step": 9227 + }, + { + "epoch": 0.7593499279983542, + "grad_norm": 0.4161479003784738, + "learning_rate": 2.886186566156941e-06, + "loss": 0.5001, + "step": 9228 + }, + { + "epoch": 0.7594322155934993, + "grad_norm": 2.0715300154641283, + "learning_rate": 2.8843136273223495e-06, + "loss": 0.7438, + "step": 9229 + }, + { + "epoch": 0.7595145031886443, + "grad_norm": 1.760848232023531, + "learning_rate": 2.882441193974742e-06, + "loss": 0.682, + "step": 9230 + }, + { + "epoch": 0.7595967907837894, + "grad_norm": 2.7051104523389164, + "learning_rate": 2.8805692662471254e-06, + "loss": 0.6945, + "step": 9231 + }, + { + "epoch": 0.7596790783789343, + "grad_norm": 2.359866058152421, + "learning_rate": 2.8786978442724887e-06, + "loss": 0.6981, + "step": 9232 + }, + { + "epoch": 0.7597613659740794, + "grad_norm": 0.4102485242531031, + "learning_rate": 2.8768269281837625e-06, + "loss": 0.4577, + "step": 9233 + }, + { + "epoch": 0.7598436535692245, + "grad_norm": 0.4394527030503246, + "learning_rate": 2.874956518113863e-06, + "loss": 0.4685, + "step": 9234 + }, + { + "epoch": 0.7599259411643695, + "grad_norm": 1.9408467754702003, + "learning_rate": 2.8730866141956536e-06, + "loss": 0.6966, + "step": 9235 + }, + { + "epoch": 0.7600082287595145, + "grad_norm": 1.9556845095116449, + "learning_rate": 2.8712172165619766e-06, + "loss": 0.7397, + "step": 9236 + }, + { + "epoch": 0.7600905163546595, + "grad_norm": 2.571916385602802, + "learning_rate": 2.8693483253456247e-06, + "loss": 0.7248, + "step": 9237 + }, + { + "epoch": 0.7601728039498046, + "grad_norm": 3.7677233841776525, + "learning_rate": 2.8674799406793697e-06, + "loss": 0.7235, + "step": 9238 + }, + { + "epoch": 0.7602550915449496, + "grad_norm": 1.8099323553260214, + "learning_rate": 2.865612062695927e-06, + "loss": 0.6923, + "step": 9239 + }, + { + "epoch": 0.7603373791400946, + "grad_norm": 0.4443073374781417, + "learning_rate": 2.863744691527998e-06, + "loss": 0.4888, + "step": 9240 + }, + { + "epoch": 0.7604196667352396, + "grad_norm": 4.459657043646712, + "learning_rate": 2.8618778273082283e-06, + "loss": 0.7115, + "step": 9241 + }, + { + "epoch": 0.7605019543303847, + "grad_norm": 1.9940077448251239, + "learning_rate": 2.860011470169246e-06, + "loss": 0.7081, + "step": 9242 + }, + { + "epoch": 0.7605842419255298, + "grad_norm": 2.6853407250014705, + "learning_rate": 2.8581456202436297e-06, + "loss": 0.7374, + "step": 9243 + }, + { + "epoch": 0.7606665295206747, + "grad_norm": 1.8880608688473122, + "learning_rate": 2.8562802776639277e-06, + "loss": 0.7314, + "step": 9244 + }, + { + "epoch": 0.7607488171158198, + "grad_norm": 1.7023367972574517, + "learning_rate": 2.8544154425626468e-06, + "loss": 0.7331, + "step": 9245 + }, + { + "epoch": 0.7608311047109648, + "grad_norm": 2.0130422336042346, + "learning_rate": 2.8525511150722674e-06, + "loss": 0.7193, + "step": 9246 + }, + { + "epoch": 0.7609133923061099, + "grad_norm": 2.0404435920066852, + "learning_rate": 2.850687295325224e-06, + "loss": 0.6949, + "step": 9247 + }, + { + "epoch": 0.7609956799012549, + "grad_norm": 2.1475353948490916, + "learning_rate": 2.8488239834539245e-06, + "loss": 0.7324, + "step": 9248 + }, + { + "epoch": 0.7610779674963999, + "grad_norm": 2.632190976654884, + "learning_rate": 2.8469611795907313e-06, + "loss": 0.7103, + "step": 9249 + }, + { + "epoch": 0.761160255091545, + "grad_norm": 1.9425574328875146, + "learning_rate": 2.8450988838679783e-06, + "loss": 0.7211, + "step": 9250 + }, + { + "epoch": 0.76124254268669, + "grad_norm": 2.846924708681644, + "learning_rate": 2.8432370964179536e-06, + "loss": 0.7153, + "step": 9251 + }, + { + "epoch": 0.7613248302818351, + "grad_norm": 1.7885313159741776, + "learning_rate": 2.8413758173729235e-06, + "loss": 0.7183, + "step": 9252 + }, + { + "epoch": 0.76140711787698, + "grad_norm": 1.7034908366800727, + "learning_rate": 2.8395150468651034e-06, + "loss": 0.7077, + "step": 9253 + }, + { + "epoch": 0.7614894054721251, + "grad_norm": 2.79221109582725, + "learning_rate": 2.8376547850266834e-06, + "loss": 0.7197, + "step": 9254 + }, + { + "epoch": 0.7615716930672701, + "grad_norm": 1.8291925577455446, + "learning_rate": 2.8357950319898185e-06, + "loss": 0.7089, + "step": 9255 + }, + { + "epoch": 0.7616539806624152, + "grad_norm": 0.4075395694427416, + "learning_rate": 2.8339357878866114e-06, + "loss": 0.4707, + "step": 9256 + }, + { + "epoch": 0.7617362682575601, + "grad_norm": 3.785992811122604, + "learning_rate": 2.8320770528491494e-06, + "loss": 0.7077, + "step": 9257 + }, + { + "epoch": 0.7618185558527052, + "grad_norm": 1.7775200455966835, + "learning_rate": 2.8302188270094654e-06, + "loss": 0.7394, + "step": 9258 + }, + { + "epoch": 0.7619008434478503, + "grad_norm": 1.885101942643782, + "learning_rate": 2.8283611104995733e-06, + "loss": 0.7296, + "step": 9259 + }, + { + "epoch": 0.7619831310429953, + "grad_norm": 1.8076050992845274, + "learning_rate": 2.8265039034514387e-06, + "loss": 0.6935, + "step": 9260 + }, + { + "epoch": 0.7620654186381403, + "grad_norm": 3.3315284117472714, + "learning_rate": 2.8246472059969943e-06, + "loss": 0.7299, + "step": 9261 + }, + { + "epoch": 0.7621477062332853, + "grad_norm": 1.782529442848717, + "learning_rate": 2.822791018268135e-06, + "loss": 0.7258, + "step": 9262 + }, + { + "epoch": 0.7622299938284304, + "grad_norm": 1.9095949091014244, + "learning_rate": 2.8209353403967255e-06, + "loss": 0.7184, + "step": 9263 + }, + { + "epoch": 0.7623122814235754, + "grad_norm": 1.9495755411028841, + "learning_rate": 2.8190801725145855e-06, + "loss": 0.7237, + "step": 9264 + }, + { + "epoch": 0.7623945690187204, + "grad_norm": 1.6490781208036598, + "learning_rate": 2.817225514753509e-06, + "loss": 0.7095, + "step": 9265 + }, + { + "epoch": 0.7624768566138654, + "grad_norm": 2.155162561562627, + "learning_rate": 2.815371367245244e-06, + "loss": 0.7115, + "step": 9266 + }, + { + "epoch": 0.7625591442090105, + "grad_norm": 2.092724077707241, + "learning_rate": 2.8135177301215077e-06, + "loss": 0.7076, + "step": 9267 + }, + { + "epoch": 0.7626414318041556, + "grad_norm": 2.199426045225233, + "learning_rate": 2.811664603513976e-06, + "loss": 0.7051, + "step": 9268 + }, + { + "epoch": 0.7627237193993005, + "grad_norm": 1.9742086019549603, + "learning_rate": 2.8098119875542972e-06, + "loss": 0.707, + "step": 9269 + }, + { + "epoch": 0.7628060069944456, + "grad_norm": 1.7977208156897004, + "learning_rate": 2.8079598823740726e-06, + "loss": 0.7284, + "step": 9270 + }, + { + "epoch": 0.7628882945895906, + "grad_norm": 2.447209015402611, + "learning_rate": 2.8061082881048797e-06, + "loss": 0.726, + "step": 9271 + }, + { + "epoch": 0.7629705821847357, + "grad_norm": 1.713016039878938, + "learning_rate": 2.8042572048782492e-06, + "loss": 0.7069, + "step": 9272 + }, + { + "epoch": 0.7630528697798807, + "grad_norm": 2.1412505954717096, + "learning_rate": 2.8024066328256784e-06, + "loss": 0.7375, + "step": 9273 + }, + { + "epoch": 0.7631351573750257, + "grad_norm": 1.5927562997459417, + "learning_rate": 2.8005565720786266e-06, + "loss": 0.7096, + "step": 9274 + }, + { + "epoch": 0.7632174449701707, + "grad_norm": 1.8846719517033188, + "learning_rate": 2.7987070227685255e-06, + "loss": 0.736, + "step": 9275 + }, + { + "epoch": 0.7632997325653158, + "grad_norm": 1.8482379696014237, + "learning_rate": 2.7968579850267576e-06, + "loss": 0.7169, + "step": 9276 + }, + { + "epoch": 0.7633820201604609, + "grad_norm": 2.0155340787008207, + "learning_rate": 2.7950094589846823e-06, + "loss": 0.7, + "step": 9277 + }, + { + "epoch": 0.7634643077556058, + "grad_norm": 1.8218786920528007, + "learning_rate": 2.793161444773611e-06, + "loss": 0.7124, + "step": 9278 + }, + { + "epoch": 0.7635465953507509, + "grad_norm": 3.5490983569768697, + "learning_rate": 2.791313942524826e-06, + "loss": 0.7226, + "step": 9279 + }, + { + "epoch": 0.7636288829458959, + "grad_norm": 3.1167704008933277, + "learning_rate": 2.789466952369566e-06, + "loss": 0.7254, + "step": 9280 + }, + { + "epoch": 0.763711170541041, + "grad_norm": 2.3978620313217176, + "learning_rate": 2.7876204744390466e-06, + "loss": 0.6809, + "step": 9281 + }, + { + "epoch": 0.7637934581361859, + "grad_norm": 1.7259867260062427, + "learning_rate": 2.7857745088644297e-06, + "loss": 0.69, + "step": 9282 + }, + { + "epoch": 0.763875745731331, + "grad_norm": 2.0167752329852155, + "learning_rate": 2.783929055776858e-06, + "loss": 0.6908, + "step": 9283 + }, + { + "epoch": 0.763958033326476, + "grad_norm": 0.416704659311358, + "learning_rate": 2.7820841153074265e-06, + "loss": 0.4436, + "step": 9284 + }, + { + "epoch": 0.7640403209216211, + "grad_norm": 2.275262048333697, + "learning_rate": 2.780239687587195e-06, + "loss": 0.6982, + "step": 9285 + }, + { + "epoch": 0.764122608516766, + "grad_norm": 1.9052606507349268, + "learning_rate": 2.7783957727471867e-06, + "loss": 0.6935, + "step": 9286 + }, + { + "epoch": 0.7642048961119111, + "grad_norm": 1.7704891001073824, + "learning_rate": 2.776552370918397e-06, + "loss": 0.7091, + "step": 9287 + }, + { + "epoch": 0.7642871837070562, + "grad_norm": 1.68784649627469, + "learning_rate": 2.7747094822317713e-06, + "loss": 0.7383, + "step": 9288 + }, + { + "epoch": 0.7643694713022012, + "grad_norm": 2.0776560807957964, + "learning_rate": 2.772867106818232e-06, + "loss": 0.7364, + "step": 9289 + }, + { + "epoch": 0.7644517588973462, + "grad_norm": 2.5951330971036604, + "learning_rate": 2.771025244808656e-06, + "loss": 0.7289, + "step": 9290 + }, + { + "epoch": 0.7645340464924912, + "grad_norm": 0.4021931031248818, + "learning_rate": 2.7691838963338844e-06, + "loss": 0.4886, + "step": 9291 + }, + { + "epoch": 0.7646163340876363, + "grad_norm": 2.3374216567136132, + "learning_rate": 2.7673430615247233e-06, + "loss": 0.7238, + "step": 9292 + }, + { + "epoch": 0.7646986216827814, + "grad_norm": 2.095568574185342, + "learning_rate": 2.765502740511946e-06, + "loss": 0.7138, + "step": 9293 + }, + { + "epoch": 0.7647809092779263, + "grad_norm": 2.124700154794619, + "learning_rate": 2.763662933426281e-06, + "loss": 0.7267, + "step": 9294 + }, + { + "epoch": 0.7648631968730714, + "grad_norm": 2.1695609586031344, + "learning_rate": 2.761823640398432e-06, + "loss": 0.727, + "step": 9295 + }, + { + "epoch": 0.7649454844682164, + "grad_norm": 0.41474044180647995, + "learning_rate": 2.759984861559055e-06, + "loss": 0.4593, + "step": 9296 + }, + { + "epoch": 0.7650277720633615, + "grad_norm": 1.8150677245215947, + "learning_rate": 2.7581465970387753e-06, + "loss": 0.6898, + "step": 9297 + }, + { + "epoch": 0.7651100596585064, + "grad_norm": 2.2177656699813957, + "learning_rate": 2.7563088469681776e-06, + "loss": 0.7011, + "step": 9298 + }, + { + "epoch": 0.7651923472536515, + "grad_norm": 1.8065911373954484, + "learning_rate": 2.754471611477817e-06, + "loss": 0.7431, + "step": 9299 + }, + { + "epoch": 0.7652746348487965, + "grad_norm": 2.8608085243379007, + "learning_rate": 2.7526348906982027e-06, + "loss": 0.7282, + "step": 9300 + }, + { + "epoch": 0.7653569224439416, + "grad_norm": 1.888264991899343, + "learning_rate": 2.7507986847598176e-06, + "loss": 0.7133, + "step": 9301 + }, + { + "epoch": 0.7654392100390867, + "grad_norm": 4.483097994258277, + "learning_rate": 2.7489629937931017e-06, + "loss": 0.7267, + "step": 9302 + }, + { + "epoch": 0.7655214976342316, + "grad_norm": 2.2637286092255327, + "learning_rate": 2.7471278179284577e-06, + "loss": 0.6994, + "step": 9303 + }, + { + "epoch": 0.7656037852293767, + "grad_norm": 3.1178817803042747, + "learning_rate": 2.7452931572962517e-06, + "loss": 0.7225, + "step": 9304 + }, + { + "epoch": 0.7656860728245217, + "grad_norm": 1.9195086014677365, + "learning_rate": 2.74345901202682e-06, + "loss": 0.7346, + "step": 9305 + }, + { + "epoch": 0.7657683604196668, + "grad_norm": 0.42685900623892464, + "learning_rate": 2.7416253822504533e-06, + "loss": 0.4802, + "step": 9306 + }, + { + "epoch": 0.7658506480148117, + "grad_norm": 1.7510349428036442, + "learning_rate": 2.7397922680974134e-06, + "loss": 0.7108, + "step": 9307 + }, + { + "epoch": 0.7659329356099568, + "grad_norm": 0.4360029334071004, + "learning_rate": 2.7379596696979196e-06, + "loss": 0.487, + "step": 9308 + }, + { + "epoch": 0.7660152232051018, + "grad_norm": 0.4036874936081672, + "learning_rate": 2.7361275871821567e-06, + "loss": 0.4521, + "step": 9309 + }, + { + "epoch": 0.7660975108002469, + "grad_norm": 0.4296239645702699, + "learning_rate": 2.7342960206802714e-06, + "loss": 0.4676, + "step": 9310 + }, + { + "epoch": 0.7661797983953919, + "grad_norm": 1.829362089761161, + "learning_rate": 2.7324649703223793e-06, + "loss": 0.7129, + "step": 9311 + }, + { + "epoch": 0.7662620859905369, + "grad_norm": 2.1922646649164816, + "learning_rate": 2.7306344362385496e-06, + "loss": 0.7205, + "step": 9312 + }, + { + "epoch": 0.766344373585682, + "grad_norm": 2.1598496179664424, + "learning_rate": 2.72880441855883e-06, + "loss": 0.7229, + "step": 9313 + }, + { + "epoch": 0.766426661180827, + "grad_norm": 1.7844620701030687, + "learning_rate": 2.726974917413211e-06, + "loss": 0.7037, + "step": 9314 + }, + { + "epoch": 0.766508948775972, + "grad_norm": 2.515134238757728, + "learning_rate": 2.7251459329316644e-06, + "loss": 0.7345, + "step": 9315 + }, + { + "epoch": 0.766591236371117, + "grad_norm": 2.1173857497495767, + "learning_rate": 2.723317465244113e-06, + "loss": 0.7206, + "step": 9316 + }, + { + "epoch": 0.7666735239662621, + "grad_norm": 0.4189061252234066, + "learning_rate": 2.7214895144804553e-06, + "loss": 0.4768, + "step": 9317 + }, + { + "epoch": 0.7667558115614072, + "grad_norm": 0.4073158093904964, + "learning_rate": 2.7196620807705387e-06, + "loss": 0.471, + "step": 9318 + }, + { + "epoch": 0.7668380991565521, + "grad_norm": 2.5507127392037265, + "learning_rate": 2.7178351642441913e-06, + "loss": 0.7274, + "step": 9319 + }, + { + "epoch": 0.7669203867516972, + "grad_norm": 2.3155149657464342, + "learning_rate": 2.7160087650311817e-06, + "loss": 0.6917, + "step": 9320 + }, + { + "epoch": 0.7670026743468422, + "grad_norm": 2.056309294064502, + "learning_rate": 2.7141828832612627e-06, + "loss": 0.6904, + "step": 9321 + }, + { + "epoch": 0.7670849619419873, + "grad_norm": 2.304762203848955, + "learning_rate": 2.712357519064137e-06, + "loss": 0.7021, + "step": 9322 + }, + { + "epoch": 0.7671672495371322, + "grad_norm": 2.1068475470579773, + "learning_rate": 2.7105326725694815e-06, + "loss": 0.7493, + "step": 9323 + }, + { + "epoch": 0.7672495371322773, + "grad_norm": 2.0724651982051054, + "learning_rate": 2.7087083439069253e-06, + "loss": 0.703, + "step": 9324 + }, + { + "epoch": 0.7673318247274223, + "grad_norm": 2.351688588179169, + "learning_rate": 2.706884533206072e-06, + "loss": 0.7411, + "step": 9325 + }, + { + "epoch": 0.7674141123225674, + "grad_norm": 2.202717817095867, + "learning_rate": 2.7050612405964714e-06, + "loss": 0.7248, + "step": 9326 + }, + { + "epoch": 0.7674963999177125, + "grad_norm": 2.5052382775153488, + "learning_rate": 2.7032384662076584e-06, + "loss": 0.7041, + "step": 9327 + }, + { + "epoch": 0.7675786875128574, + "grad_norm": 2.052258625678161, + "learning_rate": 2.701416210169111e-06, + "loss": 0.7172, + "step": 9328 + }, + { + "epoch": 0.7676609751080025, + "grad_norm": 2.4002865184192195, + "learning_rate": 2.6995944726102875e-06, + "loss": 0.7218, + "step": 9329 + }, + { + "epoch": 0.7677432627031475, + "grad_norm": 2.5996753916152406, + "learning_rate": 2.6977732536605973e-06, + "loss": 0.718, + "step": 9330 + }, + { + "epoch": 0.7678255502982926, + "grad_norm": 1.9619986264962244, + "learning_rate": 2.695952553449417e-06, + "loss": 0.6998, + "step": 9331 + }, + { + "epoch": 0.7679078378934375, + "grad_norm": 2.6072893131764405, + "learning_rate": 2.694132372106083e-06, + "loss": 0.7101, + "step": 9332 + }, + { + "epoch": 0.7679901254885826, + "grad_norm": 2.040953198662545, + "learning_rate": 2.6923127097599045e-06, + "loss": 0.7154, + "step": 9333 + }, + { + "epoch": 0.7680724130837276, + "grad_norm": 2.076766276228566, + "learning_rate": 2.690493566540141e-06, + "loss": 0.7396, + "step": 9334 + }, + { + "epoch": 0.7681547006788727, + "grad_norm": 2.772995154830085, + "learning_rate": 2.6886749425760274e-06, + "loss": 0.6953, + "step": 9335 + }, + { + "epoch": 0.7682369882740177, + "grad_norm": 2.4557790056178943, + "learning_rate": 2.686856837996753e-06, + "loss": 0.7331, + "step": 9336 + }, + { + "epoch": 0.7683192758691627, + "grad_norm": 4.192737999538648, + "learning_rate": 2.685039252931474e-06, + "loss": 0.7373, + "step": 9337 + }, + { + "epoch": 0.7684015634643078, + "grad_norm": 0.4017022717284391, + "learning_rate": 2.6832221875093045e-06, + "loss": 0.473, + "step": 9338 + }, + { + "epoch": 0.7684838510594528, + "grad_norm": 1.8554477528599844, + "learning_rate": 2.6814056418593326e-06, + "loss": 0.6916, + "step": 9339 + }, + { + "epoch": 0.7685661386545978, + "grad_norm": 1.6869323824360845, + "learning_rate": 2.679589616110596e-06, + "loss": 0.7113, + "step": 9340 + }, + { + "epoch": 0.7686484262497428, + "grad_norm": 2.4483507048171016, + "learning_rate": 2.6777741103921084e-06, + "loss": 0.6961, + "step": 9341 + }, + { + "epoch": 0.7687307138448879, + "grad_norm": 1.9788235852874987, + "learning_rate": 2.6759591248328386e-06, + "loss": 0.6786, + "step": 9342 + }, + { + "epoch": 0.768813001440033, + "grad_norm": 2.6766282156479058, + "learning_rate": 2.6741446595617203e-06, + "loss": 0.7574, + "step": 9343 + }, + { + "epoch": 0.7688952890351779, + "grad_norm": 2.2635952522418323, + "learning_rate": 2.672330714707645e-06, + "loss": 0.7111, + "step": 9344 + }, + { + "epoch": 0.768977576630323, + "grad_norm": 2.245396918057047, + "learning_rate": 2.67051729039948e-06, + "loss": 0.7266, + "step": 9345 + }, + { + "epoch": 0.769059864225468, + "grad_norm": 0.438205703476465, + "learning_rate": 2.668704386766042e-06, + "loss": 0.491, + "step": 9346 + }, + { + "epoch": 0.7691421518206131, + "grad_norm": 0.4170916658421931, + "learning_rate": 2.6668920039361236e-06, + "loss": 0.4882, + "step": 9347 + }, + { + "epoch": 0.769224439415758, + "grad_norm": 0.45453390192633, + "learning_rate": 2.6650801420384696e-06, + "loss": 0.48, + "step": 9348 + }, + { + "epoch": 0.7693067270109031, + "grad_norm": 2.5380519421411716, + "learning_rate": 2.6632688012017906e-06, + "loss": 0.7381, + "step": 9349 + }, + { + "epoch": 0.7693890146060481, + "grad_norm": 1.9421472719881498, + "learning_rate": 2.66145798155476e-06, + "loss": 0.7271, + "step": 9350 + }, + { + "epoch": 0.7694713022011932, + "grad_norm": 2.2422938417815836, + "learning_rate": 2.659647683226022e-06, + "loss": 0.7049, + "step": 9351 + }, + { + "epoch": 0.7695535897963383, + "grad_norm": 1.9892030619135241, + "learning_rate": 2.6578379063441697e-06, + "loss": 0.7224, + "step": 9352 + }, + { + "epoch": 0.7696358773914832, + "grad_norm": 2.237870813034633, + "learning_rate": 2.656028651037774e-06, + "loss": 0.6903, + "step": 9353 + }, + { + "epoch": 0.7697181649866283, + "grad_norm": 1.7815261356067251, + "learning_rate": 2.6542199174353576e-06, + "loss": 0.7136, + "step": 9354 + }, + { + "epoch": 0.7698004525817733, + "grad_norm": 1.9016238710512625, + "learning_rate": 2.65241170566541e-06, + "loss": 0.7144, + "step": 9355 + }, + { + "epoch": 0.7698827401769184, + "grad_norm": 2.054845606345834, + "learning_rate": 2.6506040158563816e-06, + "loss": 0.7133, + "step": 9356 + }, + { + "epoch": 0.7699650277720633, + "grad_norm": 0.415145220600389, + "learning_rate": 2.6487968481366933e-06, + "loss": 0.4783, + "step": 9357 + }, + { + "epoch": 0.7700473153672084, + "grad_norm": 2.4186227126168034, + "learning_rate": 2.646990202634717e-06, + "loss": 0.715, + "step": 9358 + }, + { + "epoch": 0.7701296029623534, + "grad_norm": 2.0861158398776034, + "learning_rate": 2.6451840794788007e-06, + "loss": 0.7239, + "step": 9359 + }, + { + "epoch": 0.7702118905574985, + "grad_norm": 2.031889184505084, + "learning_rate": 2.643378478797245e-06, + "loss": 0.741, + "step": 9360 + }, + { + "epoch": 0.7702941781526434, + "grad_norm": 2.285753022769836, + "learning_rate": 2.641573400718317e-06, + "loss": 0.7226, + "step": 9361 + }, + { + "epoch": 0.7703764657477885, + "grad_norm": 2.1595460824848285, + "learning_rate": 2.639768845370244e-06, + "loss": 0.7237, + "step": 9362 + }, + { + "epoch": 0.7704587533429336, + "grad_norm": 2.2331071495984975, + "learning_rate": 2.637964812881224e-06, + "loss": 0.7196, + "step": 9363 + }, + { + "epoch": 0.7705410409380786, + "grad_norm": 0.4227327690790136, + "learning_rate": 2.6361613033794066e-06, + "loss": 0.4827, + "step": 9364 + }, + { + "epoch": 0.7706233285332236, + "grad_norm": 2.0730869266849394, + "learning_rate": 2.6343583169929167e-06, + "loss": 0.7137, + "step": 9365 + }, + { + "epoch": 0.7707056161283686, + "grad_norm": 1.998845627966273, + "learning_rate": 2.632555853849832e-06, + "loss": 0.7022, + "step": 9366 + }, + { + "epoch": 0.7707879037235137, + "grad_norm": 2.051261091522672, + "learning_rate": 2.630753914078197e-06, + "loss": 0.6968, + "step": 9367 + }, + { + "epoch": 0.7708701913186587, + "grad_norm": 2.21290569120782, + "learning_rate": 2.6289524978060155e-06, + "loss": 0.7014, + "step": 9368 + }, + { + "epoch": 0.7709524789138037, + "grad_norm": 1.6612032920436022, + "learning_rate": 2.6271516051612646e-06, + "loss": 0.6683, + "step": 9369 + }, + { + "epoch": 0.7710347665089488, + "grad_norm": 1.737532553515438, + "learning_rate": 2.625351236271868e-06, + "loss": 0.7288, + "step": 9370 + }, + { + "epoch": 0.7711170541040938, + "grad_norm": 1.9435872402959067, + "learning_rate": 2.6235513912657283e-06, + "loss": 0.6795, + "step": 9371 + }, + { + "epoch": 0.7711993416992389, + "grad_norm": 2.264712265072835, + "learning_rate": 2.621752070270702e-06, + "loss": 0.7011, + "step": 9372 + }, + { + "epoch": 0.7712816292943838, + "grad_norm": 3.27758021719666, + "learning_rate": 2.6199532734146084e-06, + "loss": 0.7253, + "step": 9373 + }, + { + "epoch": 0.7713639168895289, + "grad_norm": 2.678759748133882, + "learning_rate": 2.6181550008252286e-06, + "loss": 0.6899, + "step": 9374 + }, + { + "epoch": 0.7714462044846739, + "grad_norm": 1.967787159190323, + "learning_rate": 2.616357252630316e-06, + "loss": 0.6936, + "step": 9375 + }, + { + "epoch": 0.771528492079819, + "grad_norm": 2.203079698933733, + "learning_rate": 2.6145600289575714e-06, + "loss": 0.6915, + "step": 9376 + }, + { + "epoch": 0.771610779674964, + "grad_norm": 2.5165788554538064, + "learning_rate": 2.6127633299346742e-06, + "loss": 0.7185, + "step": 9377 + }, + { + "epoch": 0.771693067270109, + "grad_norm": 2.0237632428873233, + "learning_rate": 2.610967155689257e-06, + "loss": 0.7379, + "step": 9378 + }, + { + "epoch": 0.7717753548652541, + "grad_norm": 1.8606089224853721, + "learning_rate": 2.6091715063489154e-06, + "loss": 0.6972, + "step": 9379 + }, + { + "epoch": 0.7718576424603991, + "grad_norm": 2.273784009020266, + "learning_rate": 2.6073763820412068e-06, + "loss": 0.722, + "step": 9380 + }, + { + "epoch": 0.7719399300555442, + "grad_norm": 2.4583183985362114, + "learning_rate": 2.6055817828936603e-06, + "loss": 0.7086, + "step": 9381 + }, + { + "epoch": 0.7720222176506891, + "grad_norm": 0.39989761139755714, + "learning_rate": 2.603787709033756e-06, + "loss": 0.4658, + "step": 9382 + }, + { + "epoch": 0.7721045052458342, + "grad_norm": 2.21301759406552, + "learning_rate": 2.60199416058895e-06, + "loss": 0.7037, + "step": 9383 + }, + { + "epoch": 0.7721867928409792, + "grad_norm": 1.942749403471381, + "learning_rate": 2.600201137686641e-06, + "loss": 0.7134, + "step": 9384 + }, + { + "epoch": 0.7722690804361243, + "grad_norm": 2.1426439680861464, + "learning_rate": 2.598408640454213e-06, + "loss": 0.687, + "step": 9385 + }, + { + "epoch": 0.7723513680312692, + "grad_norm": 1.854953150145232, + "learning_rate": 2.5966166690189944e-06, + "loss": 0.725, + "step": 9386 + }, + { + "epoch": 0.7724336556264143, + "grad_norm": 1.7082101522170734, + "learning_rate": 2.5948252235082903e-06, + "loss": 0.6979, + "step": 9387 + }, + { + "epoch": 0.7725159432215594, + "grad_norm": 0.43142550294071025, + "learning_rate": 2.5930343040493565e-06, + "loss": 0.5006, + "step": 9388 + }, + { + "epoch": 0.7725982308167044, + "grad_norm": 2.185268372448731, + "learning_rate": 2.5912439107694266e-06, + "loss": 0.7045, + "step": 9389 + }, + { + "epoch": 0.7726805184118494, + "grad_norm": 2.0627502269749782, + "learning_rate": 2.589454043795675e-06, + "loss": 0.7282, + "step": 9390 + }, + { + "epoch": 0.7727628060069944, + "grad_norm": 2.1698327551002703, + "learning_rate": 2.5876647032552595e-06, + "loss": 0.7078, + "step": 9391 + }, + { + "epoch": 0.7728450936021395, + "grad_norm": 2.279922101039725, + "learning_rate": 2.5858758892752855e-06, + "loss": 0.7517, + "step": 9392 + }, + { + "epoch": 0.7729273811972845, + "grad_norm": 2.228131225949399, + "learning_rate": 2.584087601982834e-06, + "loss": 0.725, + "step": 9393 + }, + { + "epoch": 0.7730096687924295, + "grad_norm": 1.8969554918804161, + "learning_rate": 2.5822998415049405e-06, + "loss": 0.7312, + "step": 9394 + }, + { + "epoch": 0.7730919563875746, + "grad_norm": 0.42698140516583905, + "learning_rate": 2.5805126079686027e-06, + "loss": 0.474, + "step": 9395 + }, + { + "epoch": 0.7731742439827196, + "grad_norm": 0.4081973560954864, + "learning_rate": 2.5787259015007805e-06, + "loss": 0.4737, + "step": 9396 + }, + { + "epoch": 0.7732565315778647, + "grad_norm": 1.655670356055697, + "learning_rate": 2.5769397222284055e-06, + "loss": 0.7072, + "step": 9397 + }, + { + "epoch": 0.7733388191730096, + "grad_norm": 3.5123225084290532, + "learning_rate": 2.5751540702783574e-06, + "loss": 0.6876, + "step": 9398 + }, + { + "epoch": 0.7734211067681547, + "grad_norm": 1.7293536191805985, + "learning_rate": 2.5733689457774925e-06, + "loss": 0.7184, + "step": 9399 + }, + { + "epoch": 0.7735033943632997, + "grad_norm": 1.7863260672445118, + "learning_rate": 2.5715843488526217e-06, + "loss": 0.7166, + "step": 9400 + }, + { + "epoch": 0.7735856819584448, + "grad_norm": 1.735338697167531, + "learning_rate": 2.5698002796305176e-06, + "loss": 0.6861, + "step": 9401 + }, + { + "epoch": 0.7736679695535899, + "grad_norm": 1.7918998378599351, + "learning_rate": 2.568016738237916e-06, + "loss": 0.6892, + "step": 9402 + }, + { + "epoch": 0.7737502571487348, + "grad_norm": 2.286047934421387, + "learning_rate": 2.5662337248015236e-06, + "loss": 0.6823, + "step": 9403 + }, + { + "epoch": 0.7738325447438799, + "grad_norm": 2.1749174024963587, + "learning_rate": 2.564451239447995e-06, + "loss": 0.713, + "step": 9404 + }, + { + "epoch": 0.7739148323390249, + "grad_norm": 2.3247186419358377, + "learning_rate": 2.5626692823039614e-06, + "loss": 0.707, + "step": 9405 + }, + { + "epoch": 0.77399711993417, + "grad_norm": 2.0404469307301216, + "learning_rate": 2.560887853496009e-06, + "loss": 0.7314, + "step": 9406 + }, + { + "epoch": 0.7740794075293149, + "grad_norm": 2.3596383782753536, + "learning_rate": 2.559106953150685e-06, + "loss": 0.7015, + "step": 9407 + }, + { + "epoch": 0.77416169512446, + "grad_norm": 0.4246097856282256, + "learning_rate": 2.5573265813945016e-06, + "loss": 0.4648, + "step": 9408 + }, + { + "epoch": 0.774243982719605, + "grad_norm": 0.41460826755634483, + "learning_rate": 2.555546738353937e-06, + "loss": 0.4791, + "step": 9409 + }, + { + "epoch": 0.7743262703147501, + "grad_norm": 2.009396441011187, + "learning_rate": 2.5537674241554233e-06, + "loss": 0.7033, + "step": 9410 + }, + { + "epoch": 0.774408557909895, + "grad_norm": 0.4100179706415165, + "learning_rate": 2.5519886389253678e-06, + "loss": 0.4929, + "step": 9411 + }, + { + "epoch": 0.7744908455050401, + "grad_norm": 2.207915031960584, + "learning_rate": 2.550210382790127e-06, + "loss": 0.7194, + "step": 9412 + }, + { + "epoch": 0.7745731331001852, + "grad_norm": 0.4309258818807022, + "learning_rate": 2.5484326558760265e-06, + "loss": 0.4568, + "step": 9413 + }, + { + "epoch": 0.7746554206953302, + "grad_norm": 1.9813997896429503, + "learning_rate": 2.5466554583093497e-06, + "loss": 0.7381, + "step": 9414 + }, + { + "epoch": 0.7747377082904752, + "grad_norm": 2.0243207565191303, + "learning_rate": 2.544878790216353e-06, + "loss": 0.6991, + "step": 9415 + }, + { + "epoch": 0.7748199958856202, + "grad_norm": 2.4363624147559397, + "learning_rate": 2.5431026517232406e-06, + "loss": 0.7196, + "step": 9416 + }, + { + "epoch": 0.7749022834807653, + "grad_norm": 1.9385343866995044, + "learning_rate": 2.541327042956194e-06, + "loss": 0.7282, + "step": 9417 + }, + { + "epoch": 0.7749845710759103, + "grad_norm": 1.8321972789783634, + "learning_rate": 2.539551964041346e-06, + "loss": 0.6895, + "step": 9418 + }, + { + "epoch": 0.7750668586710553, + "grad_norm": 1.956661599896836, + "learning_rate": 2.537777415104794e-06, + "loss": 0.731, + "step": 9419 + }, + { + "epoch": 0.7751491462662004, + "grad_norm": 4.172780219183053, + "learning_rate": 2.5360033962725985e-06, + "loss": 0.7222, + "step": 9420 + }, + { + "epoch": 0.7752314338613454, + "grad_norm": 1.7406656039846384, + "learning_rate": 2.534229907670788e-06, + "loss": 0.7076, + "step": 9421 + }, + { + "epoch": 0.7753137214564905, + "grad_norm": 2.2562132210436516, + "learning_rate": 2.5324569494253413e-06, + "loss": 0.6991, + "step": 9422 + }, + { + "epoch": 0.7753960090516354, + "grad_norm": 2.11760290523536, + "learning_rate": 2.530684521662213e-06, + "loss": 0.7062, + "step": 9423 + }, + { + "epoch": 0.7754782966467805, + "grad_norm": 1.951633028582491, + "learning_rate": 2.5289126245073114e-06, + "loss": 0.7108, + "step": 9424 + }, + { + "epoch": 0.7755605842419255, + "grad_norm": 0.4127565218677568, + "learning_rate": 2.5271412580865095e-06, + "loss": 0.4725, + "step": 9425 + }, + { + "epoch": 0.7756428718370706, + "grad_norm": 0.44737408275732954, + "learning_rate": 2.5253704225256382e-06, + "loss": 0.4753, + "step": 9426 + }, + { + "epoch": 0.7757251594322155, + "grad_norm": 1.7711861978226418, + "learning_rate": 2.523600117950501e-06, + "loss": 0.7222, + "step": 9427 + }, + { + "epoch": 0.7758074470273606, + "grad_norm": 2.4095840949245946, + "learning_rate": 2.521830344486851e-06, + "loss": 0.7134, + "step": 9428 + }, + { + "epoch": 0.7758897346225057, + "grad_norm": 1.6747175184937715, + "learning_rate": 2.5200611022604173e-06, + "loss": 0.7452, + "step": 9429 + }, + { + "epoch": 0.7759720222176507, + "grad_norm": 0.41991668486872313, + "learning_rate": 2.5182923913968804e-06, + "loss": 0.5077, + "step": 9430 + }, + { + "epoch": 0.7760543098127958, + "grad_norm": 2.124393880809959, + "learning_rate": 2.516524212021887e-06, + "loss": 0.7164, + "step": 9431 + }, + { + "epoch": 0.7761365974079407, + "grad_norm": 1.709718081206611, + "learning_rate": 2.514756564261043e-06, + "loss": 0.722, + "step": 9432 + }, + { + "epoch": 0.7762188850030858, + "grad_norm": 2.2558854399316073, + "learning_rate": 2.5129894482399244e-06, + "loss": 0.7063, + "step": 9433 + }, + { + "epoch": 0.7763011725982308, + "grad_norm": 0.4116527996409889, + "learning_rate": 2.5112228640840584e-06, + "loss": 0.4712, + "step": 9434 + }, + { + "epoch": 0.7763834601933759, + "grad_norm": 5.104844803091092, + "learning_rate": 2.5094568119189465e-06, + "loss": 0.7187, + "step": 9435 + }, + { + "epoch": 0.7764657477885208, + "grad_norm": 0.41035686387134673, + "learning_rate": 2.5076912918700434e-06, + "loss": 0.4397, + "step": 9436 + }, + { + "epoch": 0.7765480353836659, + "grad_norm": 1.9675055497609846, + "learning_rate": 2.505926304062769e-06, + "loss": 0.6927, + "step": 9437 + }, + { + "epoch": 0.776630322978811, + "grad_norm": 1.9089219074011612, + "learning_rate": 2.5041618486225016e-06, + "loss": 0.7341, + "step": 9438 + }, + { + "epoch": 0.776712610573956, + "grad_norm": 0.422532963502283, + "learning_rate": 2.5023979256745922e-06, + "loss": 0.4662, + "step": 9439 + }, + { + "epoch": 0.776794898169101, + "grad_norm": 2.290945668322069, + "learning_rate": 2.5006345353443408e-06, + "loss": 0.6962, + "step": 9440 + }, + { + "epoch": 0.776877185764246, + "grad_norm": 2.040838913567379, + "learning_rate": 2.498871677757021e-06, + "loss": 0.7232, + "step": 9441 + }, + { + "epoch": 0.7769594733593911, + "grad_norm": 2.9280170461241934, + "learning_rate": 2.497109353037861e-06, + "loss": 0.7207, + "step": 9442 + }, + { + "epoch": 0.7770417609545361, + "grad_norm": 1.7504071583579865, + "learning_rate": 2.4953475613120526e-06, + "loss": 0.6941, + "step": 9443 + }, + { + "epoch": 0.7771240485496811, + "grad_norm": 0.4197427011103399, + "learning_rate": 2.4935863027047503e-06, + "loss": 0.478, + "step": 9444 + }, + { + "epoch": 0.7772063361448261, + "grad_norm": 2.3031638058420936, + "learning_rate": 2.491825577341074e-06, + "loss": 0.7385, + "step": 9445 + }, + { + "epoch": 0.7772886237399712, + "grad_norm": 2.125050796490055, + "learning_rate": 2.4900653853460987e-06, + "loss": 0.7188, + "step": 9446 + }, + { + "epoch": 0.7773709113351163, + "grad_norm": 3.0673093263844238, + "learning_rate": 2.4883057268448707e-06, + "loss": 0.7212, + "step": 9447 + }, + { + "epoch": 0.7774531989302612, + "grad_norm": 1.7278145812423589, + "learning_rate": 2.4865466019623906e-06, + "loss": 0.7092, + "step": 9448 + }, + { + "epoch": 0.7775354865254063, + "grad_norm": 0.40639208134101984, + "learning_rate": 2.484788010823621e-06, + "loss": 0.4549, + "step": 9449 + }, + { + "epoch": 0.7776177741205513, + "grad_norm": 1.7162896953721922, + "learning_rate": 2.4830299535534953e-06, + "loss": 0.6913, + "step": 9450 + }, + { + "epoch": 0.7777000617156964, + "grad_norm": 2.6054409623152224, + "learning_rate": 2.4812724302768976e-06, + "loss": 0.7085, + "step": 9451 + }, + { + "epoch": 0.7777823493108413, + "grad_norm": 1.7955853497783962, + "learning_rate": 2.479515441118685e-06, + "loss": 0.7051, + "step": 9452 + }, + { + "epoch": 0.7778646369059864, + "grad_norm": 0.42040310901695893, + "learning_rate": 2.4777589862036677e-06, + "loss": 0.4896, + "step": 9453 + }, + { + "epoch": 0.7779469245011315, + "grad_norm": 0.4100653916560508, + "learning_rate": 2.4760030656566213e-06, + "loss": 0.4797, + "step": 9454 + }, + { + "epoch": 0.7780292120962765, + "grad_norm": 2.3338129793617903, + "learning_rate": 2.474247679602283e-06, + "loss": 0.7273, + "step": 9455 + }, + { + "epoch": 0.7781114996914216, + "grad_norm": 2.1716887263202493, + "learning_rate": 2.472492828165356e-06, + "loss": 0.6994, + "step": 9456 + }, + { + "epoch": 0.7781937872865665, + "grad_norm": 3.0597479549123117, + "learning_rate": 2.4707385114704963e-06, + "loss": 0.7093, + "step": 9457 + }, + { + "epoch": 0.7782760748817116, + "grad_norm": 0.4101351274593842, + "learning_rate": 2.468984729642335e-06, + "loss": 0.481, + "step": 9458 + }, + { + "epoch": 0.7783583624768566, + "grad_norm": 2.0444681690647495, + "learning_rate": 2.4672314828054554e-06, + "loss": 0.7267, + "step": 9459 + }, + { + "epoch": 0.7784406500720017, + "grad_norm": 1.8686615200358232, + "learning_rate": 2.465478771084403e-06, + "loss": 0.7454, + "step": 9460 + }, + { + "epoch": 0.7785229376671466, + "grad_norm": 2.350495091302732, + "learning_rate": 2.463726594603687e-06, + "loss": 0.7449, + "step": 9461 + }, + { + "epoch": 0.7786052252622917, + "grad_norm": 2.4355540701715594, + "learning_rate": 2.4619749534877834e-06, + "loss": 0.7342, + "step": 9462 + }, + { + "epoch": 0.7786875128574368, + "grad_norm": 2.1004908073129034, + "learning_rate": 2.4602238478611216e-06, + "loss": 0.7146, + "step": 9463 + }, + { + "epoch": 0.7787698004525818, + "grad_norm": 1.870921855899997, + "learning_rate": 2.4584732778481046e-06, + "loss": 0.6939, + "step": 9464 + }, + { + "epoch": 0.7788520880477268, + "grad_norm": 0.40952971969038426, + "learning_rate": 2.4567232435730803e-06, + "loss": 0.4762, + "step": 9465 + }, + { + "epoch": 0.7789343756428718, + "grad_norm": 2.5770819077153866, + "learning_rate": 2.454973745160375e-06, + "loss": 0.7034, + "step": 9466 + }, + { + "epoch": 0.7790166632380169, + "grad_norm": 2.0625542678820845, + "learning_rate": 2.4532247827342657e-06, + "loss": 0.7105, + "step": 9467 + }, + { + "epoch": 0.7790989508331619, + "grad_norm": 1.9331992772170792, + "learning_rate": 2.4514763564190026e-06, + "loss": 0.6942, + "step": 9468 + }, + { + "epoch": 0.7791812384283069, + "grad_norm": 2.1019327141330946, + "learning_rate": 2.449728466338782e-06, + "loss": 0.6929, + "step": 9469 + }, + { + "epoch": 0.779263526023452, + "grad_norm": 1.7640170579271446, + "learning_rate": 2.4479811126177845e-06, + "loss": 0.7078, + "step": 9470 + }, + { + "epoch": 0.779345813618597, + "grad_norm": 1.8465704392458722, + "learning_rate": 2.4462342953801234e-06, + "loss": 0.6945, + "step": 9471 + }, + { + "epoch": 0.7794281012137421, + "grad_norm": 6.35705263679098, + "learning_rate": 2.4444880147499007e-06, + "loss": 0.7111, + "step": 9472 + }, + { + "epoch": 0.779510388808887, + "grad_norm": 2.339437651938714, + "learning_rate": 2.442742270851164e-06, + "loss": 0.7338, + "step": 9473 + }, + { + "epoch": 0.7795926764040321, + "grad_norm": 0.4880491628262197, + "learning_rate": 2.4409970638079327e-06, + "loss": 0.468, + "step": 9474 + }, + { + "epoch": 0.7796749639991771, + "grad_norm": 2.307539997277236, + "learning_rate": 2.4392523937441816e-06, + "loss": 0.7239, + "step": 9475 + }, + { + "epoch": 0.7797572515943222, + "grad_norm": 1.879948318177653, + "learning_rate": 2.4375082607838484e-06, + "loss": 0.7285, + "step": 9476 + }, + { + "epoch": 0.7798395391894671, + "grad_norm": 2.011967066234033, + "learning_rate": 2.4357646650508305e-06, + "loss": 0.6978, + "step": 9477 + }, + { + "epoch": 0.7799218267846122, + "grad_norm": 2.6004044856397543, + "learning_rate": 2.434021606668998e-06, + "loss": 0.7283, + "step": 9478 + }, + { + "epoch": 0.7800041143797573, + "grad_norm": 1.9291584960180648, + "learning_rate": 2.432279085762166e-06, + "loss": 0.7459, + "step": 9479 + }, + { + "epoch": 0.7800864019749023, + "grad_norm": 1.8382768119479282, + "learning_rate": 2.430537102454129e-06, + "loss": 0.7161, + "step": 9480 + }, + { + "epoch": 0.7801686895700474, + "grad_norm": 1.8604146432262054, + "learning_rate": 2.42879565686863e-06, + "loss": 0.6979, + "step": 9481 + }, + { + "epoch": 0.7802509771651923, + "grad_norm": 1.7832356245347227, + "learning_rate": 2.4270547491293793e-06, + "loss": 0.6966, + "step": 9482 + }, + { + "epoch": 0.7803332647603374, + "grad_norm": 2.039370237709717, + "learning_rate": 2.4253143793600462e-06, + "loss": 0.6905, + "step": 9483 + }, + { + "epoch": 0.7804155523554824, + "grad_norm": 2.549145971428939, + "learning_rate": 2.4235745476842676e-06, + "loss": 0.7023, + "step": 9484 + }, + { + "epoch": 0.7804978399506275, + "grad_norm": 0.4105252943600946, + "learning_rate": 2.421835254225634e-06, + "loss": 0.4707, + "step": 9485 + }, + { + "epoch": 0.7805801275457724, + "grad_norm": 2.2464907528836418, + "learning_rate": 2.420096499107707e-06, + "loss": 0.7222, + "step": 9486 + }, + { + "epoch": 0.7806624151409175, + "grad_norm": 1.9433493453553319, + "learning_rate": 2.418358282454002e-06, + "loss": 0.7265, + "step": 9487 + }, + { + "epoch": 0.7807447027360626, + "grad_norm": 1.8998555284982823, + "learning_rate": 2.4166206043880005e-06, + "loss": 0.7213, + "step": 9488 + }, + { + "epoch": 0.7808269903312076, + "grad_norm": 1.645442710892401, + "learning_rate": 2.4148834650331397e-06, + "loss": 0.7228, + "step": 9489 + }, + { + "epoch": 0.7809092779263526, + "grad_norm": 1.980814556966827, + "learning_rate": 2.413146864512831e-06, + "loss": 0.724, + "step": 9490 + }, + { + "epoch": 0.7809915655214976, + "grad_norm": 0.40941601343605316, + "learning_rate": 2.4114108029504314e-06, + "loss": 0.4783, + "step": 9491 + }, + { + "epoch": 0.7810738531166427, + "grad_norm": 2.4095081657365784, + "learning_rate": 2.409675280469276e-06, + "loss": 0.7131, + "step": 9492 + }, + { + "epoch": 0.7811561407117877, + "grad_norm": 2.8078003068622075, + "learning_rate": 2.407940297192649e-06, + "loss": 0.7066, + "step": 9493 + }, + { + "epoch": 0.7812384283069327, + "grad_norm": 2.2282315949212177, + "learning_rate": 2.4062058532438026e-06, + "loss": 0.7081, + "step": 9494 + }, + { + "epoch": 0.7813207159020777, + "grad_norm": 2.611578186163987, + "learning_rate": 2.4044719487459446e-06, + "loss": 0.7332, + "step": 9495 + }, + { + "epoch": 0.7814030034972228, + "grad_norm": 2.889108403415988, + "learning_rate": 2.402738583822254e-06, + "loss": 0.7321, + "step": 9496 + }, + { + "epoch": 0.7814852910923679, + "grad_norm": 2.288633523631624, + "learning_rate": 2.4010057585958626e-06, + "loss": 0.7442, + "step": 9497 + }, + { + "epoch": 0.7815675786875128, + "grad_norm": 0.397215974897909, + "learning_rate": 2.3992734731898716e-06, + "loss": 0.4509, + "step": 9498 + }, + { + "epoch": 0.7816498662826579, + "grad_norm": 2.298381236774855, + "learning_rate": 2.3975417277273385e-06, + "loss": 0.7129, + "step": 9499 + }, + { + "epoch": 0.7817321538778029, + "grad_norm": 1.9548454063305163, + "learning_rate": 2.395810522331282e-06, + "loss": 0.6981, + "step": 9500 + }, + { + "epoch": 0.781814441472948, + "grad_norm": 3.9694940008304442, + "learning_rate": 2.3940798571246813e-06, + "loss": 0.7136, + "step": 9501 + }, + { + "epoch": 0.7818967290680929, + "grad_norm": 2.026244077946394, + "learning_rate": 2.3923497322304878e-06, + "loss": 0.708, + "step": 9502 + }, + { + "epoch": 0.781979016663238, + "grad_norm": 2.5374361057875796, + "learning_rate": 2.3906201477715995e-06, + "loss": 0.7078, + "step": 9503 + }, + { + "epoch": 0.782061304258383, + "grad_norm": 2.973419663929755, + "learning_rate": 2.3888911038708893e-06, + "loss": 0.7088, + "step": 9504 + }, + { + "epoch": 0.7821435918535281, + "grad_norm": 0.4275633014824214, + "learning_rate": 2.387162600651183e-06, + "loss": 0.462, + "step": 9505 + }, + { + "epoch": 0.7822258794486732, + "grad_norm": 2.863637075290664, + "learning_rate": 2.38543463823527e-06, + "loss": 0.7339, + "step": 9506 + }, + { + "epoch": 0.7823081670438181, + "grad_norm": 1.7584459457904704, + "learning_rate": 2.3837072167459007e-06, + "loss": 0.6813, + "step": 9507 + }, + { + "epoch": 0.7823904546389632, + "grad_norm": 2.3834215590386396, + "learning_rate": 2.3819803363057925e-06, + "loss": 0.7397, + "step": 9508 + }, + { + "epoch": 0.7824727422341082, + "grad_norm": 1.697008902874565, + "learning_rate": 2.380253997037615e-06, + "loss": 0.7093, + "step": 9509 + }, + { + "epoch": 0.7825550298292533, + "grad_norm": 1.81474768332481, + "learning_rate": 2.378528199064011e-06, + "loss": 0.7126, + "step": 9510 + }, + { + "epoch": 0.7826373174243982, + "grad_norm": 1.5582716051053553, + "learning_rate": 2.3768029425075745e-06, + "loss": 0.7038, + "step": 9511 + }, + { + "epoch": 0.7827196050195433, + "grad_norm": 2.0564901244468765, + "learning_rate": 2.3750782274908656e-06, + "loss": 0.7017, + "step": 9512 + }, + { + "epoch": 0.7828018926146884, + "grad_norm": 1.864013299485104, + "learning_rate": 2.373354054136402e-06, + "loss": 0.6744, + "step": 9513 + }, + { + "epoch": 0.7828841802098334, + "grad_norm": 2.1585590791092524, + "learning_rate": 2.371630422566673e-06, + "loss": 0.7083, + "step": 9514 + }, + { + "epoch": 0.7829664678049784, + "grad_norm": 2.2006304145824807, + "learning_rate": 2.3699073329041156e-06, + "loss": 0.7063, + "step": 9515 + }, + { + "epoch": 0.7830487554001234, + "grad_norm": 0.4154868055553076, + "learning_rate": 2.3681847852711414e-06, + "loss": 0.4639, + "step": 9516 + }, + { + "epoch": 0.7831310429952685, + "grad_norm": 1.9662078693455431, + "learning_rate": 2.366462779790115e-06, + "loss": 0.7248, + "step": 9517 + }, + { + "epoch": 0.7832133305904135, + "grad_norm": 2.0531120535222493, + "learning_rate": 2.3647413165833642e-06, + "loss": 0.7155, + "step": 9518 + }, + { + "epoch": 0.7832956181855585, + "grad_norm": 2.3794503653528123, + "learning_rate": 2.363020395773178e-06, + "loss": 0.7544, + "step": 9519 + }, + { + "epoch": 0.7833779057807035, + "grad_norm": 2.2506158991984235, + "learning_rate": 2.361300017481811e-06, + "loss": 0.71, + "step": 9520 + }, + { + "epoch": 0.7834601933758486, + "grad_norm": 0.4074636157434928, + "learning_rate": 2.3595801818314723e-06, + "loss": 0.4839, + "step": 9521 + }, + { + "epoch": 0.7835424809709937, + "grad_norm": 2.628946892525229, + "learning_rate": 2.3578608889443413e-06, + "loss": 0.7004, + "step": 9522 + }, + { + "epoch": 0.7836247685661386, + "grad_norm": 1.8778881229163462, + "learning_rate": 2.35614213894255e-06, + "loss": 0.7324, + "step": 9523 + }, + { + "epoch": 0.7837070561612837, + "grad_norm": 2.2871961129670133, + "learning_rate": 2.3544239319481974e-06, + "loss": 0.7121, + "step": 9524 + }, + { + "epoch": 0.7837893437564287, + "grad_norm": 2.1870817160946205, + "learning_rate": 2.352706268083338e-06, + "loss": 0.7353, + "step": 9525 + }, + { + "epoch": 0.7838716313515738, + "grad_norm": 1.802178449356788, + "learning_rate": 2.350989147469999e-06, + "loss": 0.7156, + "step": 9526 + }, + { + "epoch": 0.7839539189467187, + "grad_norm": 1.850304663473559, + "learning_rate": 2.3492725702301544e-06, + "loss": 0.7154, + "step": 9527 + }, + { + "epoch": 0.7840362065418638, + "grad_norm": 1.8910871192731915, + "learning_rate": 2.3475565364857544e-06, + "loss": 0.7055, + "step": 9528 + }, + { + "epoch": 0.7841184941370088, + "grad_norm": 2.037729338393167, + "learning_rate": 2.345841046358699e-06, + "loss": 0.7463, + "step": 9529 + }, + { + "epoch": 0.7842007817321539, + "grad_norm": 2.308436321129287, + "learning_rate": 2.344126099970856e-06, + "loss": 0.7076, + "step": 9530 + }, + { + "epoch": 0.784283069327299, + "grad_norm": 2.049223553455472, + "learning_rate": 2.342411697444047e-06, + "loss": 0.7386, + "step": 9531 + }, + { + "epoch": 0.7843653569224439, + "grad_norm": 1.7534561868939411, + "learning_rate": 2.3406978389000677e-06, + "loss": 0.6941, + "step": 9532 + }, + { + "epoch": 0.784447644517589, + "grad_norm": 2.3374824979900786, + "learning_rate": 2.338984524460661e-06, + "loss": 0.7211, + "step": 9533 + }, + { + "epoch": 0.784529932112734, + "grad_norm": 2.0136668558243236, + "learning_rate": 2.3372717542475486e-06, + "loss": 0.7318, + "step": 9534 + }, + { + "epoch": 0.7846122197078791, + "grad_norm": 1.9285331374267973, + "learning_rate": 2.3355595283823908e-06, + "loss": 0.7164, + "step": 9535 + }, + { + "epoch": 0.784694507303024, + "grad_norm": 2.1085357073196116, + "learning_rate": 2.3338478469868287e-06, + "loss": 0.7395, + "step": 9536 + }, + { + "epoch": 0.7847767948981691, + "grad_norm": 2.2947443657974778, + "learning_rate": 2.3321367101824522e-06, + "loss": 0.7182, + "step": 9537 + }, + { + "epoch": 0.7848590824933142, + "grad_norm": 1.7466371382859112, + "learning_rate": 2.3304261180908238e-06, + "loss": 0.7015, + "step": 9538 + }, + { + "epoch": 0.7849413700884592, + "grad_norm": 2.146713922986805, + "learning_rate": 2.3287160708334553e-06, + "loss": 0.6964, + "step": 9539 + }, + { + "epoch": 0.7850236576836042, + "grad_norm": 2.4245407204038063, + "learning_rate": 2.3270065685318355e-06, + "loss": 0.6928, + "step": 9540 + }, + { + "epoch": 0.7851059452787492, + "grad_norm": 2.2116044612511088, + "learning_rate": 2.325297611307391e-06, + "loss": 0.7103, + "step": 9541 + }, + { + "epoch": 0.7851882328738943, + "grad_norm": 3.127109536722712, + "learning_rate": 2.3235891992815342e-06, + "loss": 0.7103, + "step": 9542 + }, + { + "epoch": 0.7852705204690393, + "grad_norm": 1.8499504280333763, + "learning_rate": 2.3218813325756205e-06, + "loss": 0.7513, + "step": 9543 + }, + { + "epoch": 0.7853528080641843, + "grad_norm": 0.41251307005150606, + "learning_rate": 2.3201740113109815e-06, + "loss": 0.4688, + "step": 9544 + }, + { + "epoch": 0.7854350956593293, + "grad_norm": 2.2190559856707686, + "learning_rate": 2.318467235608898e-06, + "loss": 0.7303, + "step": 9545 + }, + { + "epoch": 0.7855173832544744, + "grad_norm": 1.909794239042325, + "learning_rate": 2.3167610055906165e-06, + "loss": 0.7324, + "step": 9546 + }, + { + "epoch": 0.7855996708496195, + "grad_norm": 1.6926464009297455, + "learning_rate": 2.315055321377344e-06, + "loss": 0.6991, + "step": 9547 + }, + { + "epoch": 0.7856819584447644, + "grad_norm": 2.216249974775411, + "learning_rate": 2.3133501830902526e-06, + "loss": 0.7373, + "step": 9548 + }, + { + "epoch": 0.7857642460399095, + "grad_norm": 0.4253506411886045, + "learning_rate": 2.31164559085047e-06, + "loss": 0.4661, + "step": 9549 + }, + { + "epoch": 0.7858465336350545, + "grad_norm": 2.0150895532299735, + "learning_rate": 2.3099415447790908e-06, + "loss": 0.7407, + "step": 9550 + }, + { + "epoch": 0.7859288212301996, + "grad_norm": 1.6950831046578412, + "learning_rate": 2.3082380449971663e-06, + "loss": 0.7338, + "step": 9551 + }, + { + "epoch": 0.7860111088253445, + "grad_norm": 2.157564136275157, + "learning_rate": 2.306535091625709e-06, + "loss": 0.7023, + "step": 9552 + }, + { + "epoch": 0.7860933964204896, + "grad_norm": 2.178166809149763, + "learning_rate": 2.304832684785693e-06, + "loss": 0.6983, + "step": 9553 + }, + { + "epoch": 0.7861756840156346, + "grad_norm": 1.8205405663867755, + "learning_rate": 2.3031308245980587e-06, + "loss": 0.7101, + "step": 9554 + }, + { + "epoch": 0.7862579716107797, + "grad_norm": 2.256550477124557, + "learning_rate": 2.301429511183699e-06, + "loss": 0.7155, + "step": 9555 + }, + { + "epoch": 0.7863402592059247, + "grad_norm": 3.6429034055998337, + "learning_rate": 2.2997287446634774e-06, + "loss": 0.6985, + "step": 9556 + }, + { + "epoch": 0.7864225468010697, + "grad_norm": 0.41090528528478176, + "learning_rate": 2.2980285251582112e-06, + "loss": 0.4806, + "step": 9557 + }, + { + "epoch": 0.7865048343962148, + "grad_norm": 2.356795836802788, + "learning_rate": 2.2963288527886817e-06, + "loss": 0.7283, + "step": 9558 + }, + { + "epoch": 0.7865871219913598, + "grad_norm": 2.193738902057485, + "learning_rate": 2.294629727675627e-06, + "loss": 0.7387, + "step": 9559 + }, + { + "epoch": 0.7866694095865049, + "grad_norm": 2.1684630888460354, + "learning_rate": 2.2929311499397554e-06, + "loss": 0.7172, + "step": 9560 + }, + { + "epoch": 0.7867516971816498, + "grad_norm": 1.6718154013666715, + "learning_rate": 2.2912331197017267e-06, + "loss": 0.6966, + "step": 9561 + }, + { + "epoch": 0.7868339847767949, + "grad_norm": 2.222687187114647, + "learning_rate": 2.289535637082173e-06, + "loss": 0.714, + "step": 9562 + }, + { + "epoch": 0.78691627237194, + "grad_norm": 1.8958098346974188, + "learning_rate": 2.2878387022016755e-06, + "loss": 0.7241, + "step": 9563 + }, + { + "epoch": 0.786998559967085, + "grad_norm": 1.9115432488870718, + "learning_rate": 2.2861423151807825e-06, + "loss": 0.7205, + "step": 9564 + }, + { + "epoch": 0.78708084756223, + "grad_norm": 1.9383783980020095, + "learning_rate": 2.284446476140001e-06, + "loss": 0.6764, + "step": 9565 + }, + { + "epoch": 0.787163135157375, + "grad_norm": 2.2515442758362, + "learning_rate": 2.282751185199805e-06, + "loss": 0.7128, + "step": 9566 + }, + { + "epoch": 0.7872454227525201, + "grad_norm": 0.418372631556048, + "learning_rate": 2.2810564424806203e-06, + "loss": 0.4909, + "step": 9567 + }, + { + "epoch": 0.7873277103476651, + "grad_norm": 0.4053192478797293, + "learning_rate": 2.2793622481028434e-06, + "loss": 0.458, + "step": 9568 + }, + { + "epoch": 0.7874099979428101, + "grad_norm": 2.409914959008336, + "learning_rate": 2.2776686021868254e-06, + "loss": 0.7247, + "step": 9569 + }, + { + "epoch": 0.7874922855379551, + "grad_norm": 1.939618747630163, + "learning_rate": 2.2759755048528796e-06, + "loss": 0.695, + "step": 9570 + }, + { + "epoch": 0.7875745731331002, + "grad_norm": 2.549585804709224, + "learning_rate": 2.2742829562212797e-06, + "loss": 0.7094, + "step": 9571 + }, + { + "epoch": 0.7876568607282453, + "grad_norm": 0.4220612081426452, + "learning_rate": 2.272590956412265e-06, + "loss": 0.4842, + "step": 9572 + }, + { + "epoch": 0.7877391483233902, + "grad_norm": 1.9596255892103034, + "learning_rate": 2.2708995055460283e-06, + "loss": 0.708, + "step": 9573 + }, + { + "epoch": 0.7878214359185353, + "grad_norm": 2.9820249479183607, + "learning_rate": 2.269208603742732e-06, + "loss": 0.7007, + "step": 9574 + }, + { + "epoch": 0.7879037235136803, + "grad_norm": 1.8057661654439177, + "learning_rate": 2.2675182511224925e-06, + "loss": 0.693, + "step": 9575 + }, + { + "epoch": 0.7879860111088254, + "grad_norm": 2.3467495022960887, + "learning_rate": 2.2658284478053906e-06, + "loss": 0.7219, + "step": 9576 + }, + { + "epoch": 0.7880682987039703, + "grad_norm": 0.41759145349199794, + "learning_rate": 2.2641391939114644e-06, + "loss": 0.492, + "step": 9577 + }, + { + "epoch": 0.7881505862991154, + "grad_norm": 0.41434271668396644, + "learning_rate": 2.2624504895607212e-06, + "loss": 0.4752, + "step": 9578 + }, + { + "epoch": 0.7882328738942604, + "grad_norm": 0.4251933074082139, + "learning_rate": 2.260762334873117e-06, + "loss": 0.4626, + "step": 9579 + }, + { + "epoch": 0.7883151614894055, + "grad_norm": 2.372286233244929, + "learning_rate": 2.2590747299685833e-06, + "loss": 0.7166, + "step": 9580 + }, + { + "epoch": 0.7883974490845505, + "grad_norm": 4.273353382094368, + "learning_rate": 2.2573876749669997e-06, + "loss": 0.7086, + "step": 9581 + }, + { + "epoch": 0.7884797366796955, + "grad_norm": 2.0666639309062504, + "learning_rate": 2.255701169988215e-06, + "loss": 0.7099, + "step": 9582 + }, + { + "epoch": 0.7885620242748406, + "grad_norm": 2.412801478460297, + "learning_rate": 2.25401521515203e-06, + "loss": 0.7117, + "step": 9583 + }, + { + "epoch": 0.7886443118699856, + "grad_norm": 1.853800489354446, + "learning_rate": 2.252329810578219e-06, + "loss": 0.7421, + "step": 9584 + }, + { + "epoch": 0.7887265994651307, + "grad_norm": 1.878399788672184, + "learning_rate": 2.250644956386505e-06, + "loss": 0.7186, + "step": 9585 + }, + { + "epoch": 0.7888088870602756, + "grad_norm": 0.4033054209658536, + "learning_rate": 2.2489606526965834e-06, + "loss": 0.4666, + "step": 9586 + }, + { + "epoch": 0.7888911746554207, + "grad_norm": 1.8215304288271688, + "learning_rate": 2.2472768996281e-06, + "loss": 0.7273, + "step": 9587 + }, + { + "epoch": 0.7889734622505657, + "grad_norm": 1.8795699579521696, + "learning_rate": 2.245593697300669e-06, + "loss": 0.6996, + "step": 9588 + }, + { + "epoch": 0.7890557498457108, + "grad_norm": 1.6647422112133519, + "learning_rate": 2.243911045833855e-06, + "loss": 0.7287, + "step": 9589 + }, + { + "epoch": 0.7891380374408558, + "grad_norm": 0.41187243585954025, + "learning_rate": 2.242228945347201e-06, + "loss": 0.4567, + "step": 9590 + }, + { + "epoch": 0.7892203250360008, + "grad_norm": 2.1309384639760993, + "learning_rate": 2.240547395960192e-06, + "loss": 0.713, + "step": 9591 + }, + { + "epoch": 0.7893026126311459, + "grad_norm": 2.0647908644183373, + "learning_rate": 2.23886639779229e-06, + "loss": 0.6945, + "step": 9592 + }, + { + "epoch": 0.7893849002262909, + "grad_norm": 1.7205967171812229, + "learning_rate": 2.2371859509629066e-06, + "loss": 0.7171, + "step": 9593 + }, + { + "epoch": 0.7894671878214359, + "grad_norm": 0.41920745318796565, + "learning_rate": 2.23550605559142e-06, + "loss": 0.4882, + "step": 9594 + }, + { + "epoch": 0.7895494754165809, + "grad_norm": 1.6642131970130505, + "learning_rate": 2.2338267117971615e-06, + "loss": 0.7327, + "step": 9595 + }, + { + "epoch": 0.789631763011726, + "grad_norm": 2.3159253289895867, + "learning_rate": 2.2321479196994354e-06, + "loss": 0.7277, + "step": 9596 + }, + { + "epoch": 0.789714050606871, + "grad_norm": 2.022945036970359, + "learning_rate": 2.2304696794174973e-06, + "loss": 0.7197, + "step": 9597 + }, + { + "epoch": 0.789796338202016, + "grad_norm": 2.153909113022087, + "learning_rate": 2.2287919910705693e-06, + "loss": 0.73, + "step": 9598 + }, + { + "epoch": 0.7898786257971611, + "grad_norm": 2.725950096284293, + "learning_rate": 2.22711485477783e-06, + "loss": 0.7344, + "step": 9599 + }, + { + "epoch": 0.7899609133923061, + "grad_norm": 0.429681559973264, + "learning_rate": 2.225438270658421e-06, + "loss": 0.499, + "step": 9600 + }, + { + "epoch": 0.7900432009874512, + "grad_norm": 4.607349112582879, + "learning_rate": 2.2237622388314406e-06, + "loss": 0.7163, + "step": 9601 + }, + { + "epoch": 0.7901254885825961, + "grad_norm": 2.2439675608573886, + "learning_rate": 2.222086759415957e-06, + "loss": 0.7173, + "step": 9602 + }, + { + "epoch": 0.7902077761777412, + "grad_norm": 3.3979395458448467, + "learning_rate": 2.2204118325309885e-06, + "loss": 0.7089, + "step": 9603 + }, + { + "epoch": 0.7902900637728862, + "grad_norm": 1.91874543983582, + "learning_rate": 2.218737458295528e-06, + "loss": 0.7034, + "step": 9604 + }, + { + "epoch": 0.7903723513680313, + "grad_norm": 1.8645681415790647, + "learning_rate": 2.217063636828507e-06, + "loss": 0.6937, + "step": 9605 + }, + { + "epoch": 0.7904546389631762, + "grad_norm": 2.234933813852695, + "learning_rate": 2.215390368248842e-06, + "loss": 0.6917, + "step": 9606 + }, + { + "epoch": 0.7905369265583213, + "grad_norm": 1.7583888933309508, + "learning_rate": 2.2137176526753933e-06, + "loss": 0.6886, + "step": 9607 + }, + { + "epoch": 0.7906192141534664, + "grad_norm": 2.1965137681300533, + "learning_rate": 2.2120454902269928e-06, + "loss": 0.6979, + "step": 9608 + }, + { + "epoch": 0.7907015017486114, + "grad_norm": 1.9760261275922486, + "learning_rate": 2.2103738810224227e-06, + "loss": 0.7081, + "step": 9609 + }, + { + "epoch": 0.7907837893437565, + "grad_norm": 2.3651309224735595, + "learning_rate": 2.2087028251804397e-06, + "loss": 0.7219, + "step": 9610 + }, + { + "epoch": 0.7908660769389014, + "grad_norm": 0.4055840549957618, + "learning_rate": 2.207032322819743e-06, + "loss": 0.4729, + "step": 9611 + }, + { + "epoch": 0.7909483645340465, + "grad_norm": 3.244657790225726, + "learning_rate": 2.205362374059009e-06, + "loss": 0.7167, + "step": 9612 + }, + { + "epoch": 0.7910306521291915, + "grad_norm": 3.1849515297716056, + "learning_rate": 2.203692979016864e-06, + "loss": 0.7377, + "step": 9613 + }, + { + "epoch": 0.7911129397243366, + "grad_norm": 2.5008090239418688, + "learning_rate": 2.2020241378119036e-06, + "loss": 0.7021, + "step": 9614 + }, + { + "epoch": 0.7911952273194816, + "grad_norm": 2.342792424333416, + "learning_rate": 2.2003558505626775e-06, + "loss": 0.7441, + "step": 9615 + }, + { + "epoch": 0.7912775149146266, + "grad_norm": 2.044753398931351, + "learning_rate": 2.198688117387698e-06, + "loss": 0.7139, + "step": 9616 + }, + { + "epoch": 0.7913598025097717, + "grad_norm": 2.1286559644063083, + "learning_rate": 2.1970209384054364e-06, + "loss": 0.7322, + "step": 9617 + }, + { + "epoch": 0.7914420901049167, + "grad_norm": 2.416038653274261, + "learning_rate": 2.1953543137343314e-06, + "loss": 0.7076, + "step": 9618 + }, + { + "epoch": 0.7915243777000617, + "grad_norm": 2.058676489147294, + "learning_rate": 2.19368824349277e-06, + "loss": 0.7329, + "step": 9619 + }, + { + "epoch": 0.7916066652952067, + "grad_norm": 1.617674961491238, + "learning_rate": 2.1920227277991145e-06, + "loss": 0.6793, + "step": 9620 + }, + { + "epoch": 0.7916889528903518, + "grad_norm": 0.4117782153395388, + "learning_rate": 2.190357766771678e-06, + "loss": 0.4775, + "step": 9621 + }, + { + "epoch": 0.7917712404854969, + "grad_norm": 2.1497796426805897, + "learning_rate": 2.1886933605287366e-06, + "loss": 0.6826, + "step": 9622 + }, + { + "epoch": 0.7918535280806418, + "grad_norm": 1.9228318634665749, + "learning_rate": 2.187029509188523e-06, + "loss": 0.7072, + "step": 9623 + }, + { + "epoch": 0.7919358156757869, + "grad_norm": 0.4147972875200307, + "learning_rate": 2.185366212869241e-06, + "loss": 0.4381, + "step": 9624 + }, + { + "epoch": 0.7920181032709319, + "grad_norm": 4.412375971756283, + "learning_rate": 2.1837034716890426e-06, + "loss": 0.7018, + "step": 9625 + }, + { + "epoch": 0.792100390866077, + "grad_norm": 2.0339746862302075, + "learning_rate": 2.1820412857660513e-06, + "loss": 0.7041, + "step": 9626 + }, + { + "epoch": 0.7921826784612219, + "grad_norm": 4.1035707631272755, + "learning_rate": 2.1803796552183455e-06, + "loss": 0.7212, + "step": 9627 + }, + { + "epoch": 0.792264966056367, + "grad_norm": 4.49839335823964, + "learning_rate": 2.178718580163962e-06, + "loss": 0.6939, + "step": 9628 + }, + { + "epoch": 0.792347253651512, + "grad_norm": 1.8123534817391374, + "learning_rate": 2.1770580607209e-06, + "loss": 0.7067, + "step": 9629 + }, + { + "epoch": 0.7924295412466571, + "grad_norm": 1.7876969034973949, + "learning_rate": 2.1753980970071267e-06, + "loss": 0.6989, + "step": 9630 + }, + { + "epoch": 0.792511828841802, + "grad_norm": 1.6445045989889577, + "learning_rate": 2.173738689140554e-06, + "loss": 0.6826, + "step": 9631 + }, + { + "epoch": 0.7925941164369471, + "grad_norm": 0.4433098250788095, + "learning_rate": 2.172079837239073e-06, + "loss": 0.4802, + "step": 9632 + }, + { + "epoch": 0.7926764040320922, + "grad_norm": 2.1089278807969194, + "learning_rate": 2.1704215414205198e-06, + "loss": 0.7254, + "step": 9633 + }, + { + "epoch": 0.7927586916272372, + "grad_norm": 2.141042492051759, + "learning_rate": 2.1687638018027003e-06, + "loss": 0.7106, + "step": 9634 + }, + { + "epoch": 0.7928409792223823, + "grad_norm": 2.473563951866919, + "learning_rate": 2.167106618503373e-06, + "loss": 0.721, + "step": 9635 + }, + { + "epoch": 0.7929232668175272, + "grad_norm": 4.627561533088396, + "learning_rate": 2.165449991640266e-06, + "loss": 0.7034, + "step": 9636 + }, + { + "epoch": 0.7930055544126723, + "grad_norm": 0.40704945909931434, + "learning_rate": 2.163793921331061e-06, + "loss": 0.4579, + "step": 9637 + }, + { + "epoch": 0.7930878420078173, + "grad_norm": 4.173532818631314, + "learning_rate": 2.162138407693406e-06, + "loss": 0.6748, + "step": 9638 + }, + { + "epoch": 0.7931701296029624, + "grad_norm": 1.9554131721957506, + "learning_rate": 2.1604834508449037e-06, + "loss": 0.7179, + "step": 9639 + }, + { + "epoch": 0.7932524171981074, + "grad_norm": 1.8957067333554243, + "learning_rate": 2.1588290509031194e-06, + "loss": 0.7099, + "step": 9640 + }, + { + "epoch": 0.7933347047932524, + "grad_norm": 2.2782225553703346, + "learning_rate": 2.157175207985577e-06, + "loss": 0.7186, + "step": 9641 + }, + { + "epoch": 0.7934169923883975, + "grad_norm": 2.3858350817575804, + "learning_rate": 2.1555219222097644e-06, + "loss": 0.7018, + "step": 9642 + }, + { + "epoch": 0.7934992799835425, + "grad_norm": 2.212011886206731, + "learning_rate": 2.153869193693131e-06, + "loss": 0.6826, + "step": 9643 + }, + { + "epoch": 0.7935815675786875, + "grad_norm": 2.7088541071501986, + "learning_rate": 2.1522170225530838e-06, + "loss": 0.7398, + "step": 9644 + }, + { + "epoch": 0.7936638551738325, + "grad_norm": 1.803502876720303, + "learning_rate": 2.150565408906988e-06, + "loss": 0.7422, + "step": 9645 + }, + { + "epoch": 0.7937461427689776, + "grad_norm": 12.518957613081993, + "learning_rate": 2.148914352872169e-06, + "loss": 0.7063, + "step": 9646 + }, + { + "epoch": 0.7938284303641227, + "grad_norm": 1.9855309650953226, + "learning_rate": 2.147263854565922e-06, + "loss": 0.7179, + "step": 9647 + }, + { + "epoch": 0.7939107179592676, + "grad_norm": 2.2569361900891223, + "learning_rate": 2.1456139141054888e-06, + "loss": 0.6984, + "step": 9648 + }, + { + "epoch": 0.7939930055544127, + "grad_norm": 1.95941998945297, + "learning_rate": 2.143964531608085e-06, + "loss": 0.7269, + "step": 9649 + }, + { + "epoch": 0.7940752931495577, + "grad_norm": 1.8245405425170866, + "learning_rate": 2.142315707190876e-06, + "loss": 0.7257, + "step": 9650 + }, + { + "epoch": 0.7941575807447028, + "grad_norm": 2.0108218617174174, + "learning_rate": 2.140667440970994e-06, + "loss": 0.7249, + "step": 9651 + }, + { + "epoch": 0.7942398683398477, + "grad_norm": 2.0573038512508166, + "learning_rate": 2.139019733065525e-06, + "loss": 0.7091, + "step": 9652 + }, + { + "epoch": 0.7943221559349928, + "grad_norm": 2.6639811740612975, + "learning_rate": 2.137372583591525e-06, + "loss": 0.7232, + "step": 9653 + }, + { + "epoch": 0.7944044435301378, + "grad_norm": 1.9024740133789138, + "learning_rate": 2.135725992665999e-06, + "loss": 0.6993, + "step": 9654 + }, + { + "epoch": 0.7944867311252829, + "grad_norm": 2.386203276030498, + "learning_rate": 2.1340799604059238e-06, + "loss": 0.7109, + "step": 9655 + }, + { + "epoch": 0.7945690187204278, + "grad_norm": 3.3744968833061133, + "learning_rate": 2.13243448692823e-06, + "loss": 0.6969, + "step": 9656 + }, + { + "epoch": 0.7946513063155729, + "grad_norm": 2.6762826431876823, + "learning_rate": 2.1307895723498063e-06, + "loss": 0.742, + "step": 9657 + }, + { + "epoch": 0.794733593910718, + "grad_norm": 2.1484305650991073, + "learning_rate": 2.129145216787505e-06, + "loss": 0.7208, + "step": 9658 + }, + { + "epoch": 0.794815881505863, + "grad_norm": 2.206519893384964, + "learning_rate": 2.1275014203581425e-06, + "loss": 0.7127, + "step": 9659 + }, + { + "epoch": 0.794898169101008, + "grad_norm": 1.8415042033563214, + "learning_rate": 2.1258581831784854e-06, + "loss": 0.7234, + "step": 9660 + }, + { + "epoch": 0.794980456696153, + "grad_norm": 1.9577239686191383, + "learning_rate": 2.1242155053652734e-06, + "loss": 0.7441, + "step": 9661 + }, + { + "epoch": 0.7950627442912981, + "grad_norm": 1.9530761366738059, + "learning_rate": 2.1225733870351963e-06, + "loss": 0.7113, + "step": 9662 + }, + { + "epoch": 0.7951450318864431, + "grad_norm": 1.9191232060785968, + "learning_rate": 2.1209318283049074e-06, + "loss": 0.7174, + "step": 9663 + }, + { + "epoch": 0.7952273194815882, + "grad_norm": 2.4616856373574225, + "learning_rate": 2.1192908292910188e-06, + "loss": 0.7008, + "step": 9664 + }, + { + "epoch": 0.7953096070767331, + "grad_norm": 4.580327340979014, + "learning_rate": 2.1176503901101087e-06, + "loss": 0.7086, + "step": 9665 + }, + { + "epoch": 0.7953918946718782, + "grad_norm": 2.076086987029498, + "learning_rate": 2.1160105108787056e-06, + "loss": 0.7189, + "step": 9666 + }, + { + "epoch": 0.7954741822670233, + "grad_norm": 1.8966256271928563, + "learning_rate": 2.1143711917133104e-06, + "loss": 0.7012, + "step": 9667 + }, + { + "epoch": 0.7955564698621683, + "grad_norm": 2.2609965364880087, + "learning_rate": 2.1127324327303745e-06, + "loss": 0.7194, + "step": 9668 + }, + { + "epoch": 0.7956387574573133, + "grad_norm": 2.4770407111400474, + "learning_rate": 2.1110942340463124e-06, + "loss": 0.7466, + "step": 9669 + }, + { + "epoch": 0.7957210450524583, + "grad_norm": 0.4494943426102909, + "learning_rate": 2.109456595777498e-06, + "loss": 0.4792, + "step": 9670 + }, + { + "epoch": 0.7958033326476034, + "grad_norm": 1.6637947737258625, + "learning_rate": 2.1078195180402695e-06, + "loss": 0.7018, + "step": 9671 + }, + { + "epoch": 0.7958856202427484, + "grad_norm": 2.447234499054824, + "learning_rate": 2.1061830009509186e-06, + "loss": 0.6952, + "step": 9672 + }, + { + "epoch": 0.7959679078378934, + "grad_norm": 0.4219348090599259, + "learning_rate": 2.1045470446257057e-06, + "loss": 0.4843, + "step": 9673 + }, + { + "epoch": 0.7960501954330385, + "grad_norm": 2.3321806837621826, + "learning_rate": 2.1029116491808433e-06, + "loss": 0.7333, + "step": 9674 + }, + { + "epoch": 0.7961324830281835, + "grad_norm": 0.4070978427909062, + "learning_rate": 2.101276814732508e-06, + "loss": 0.4613, + "step": 9675 + }, + { + "epoch": 0.7962147706233286, + "grad_norm": 2.294893646887775, + "learning_rate": 2.0996425413968335e-06, + "loss": 0.7072, + "step": 9676 + }, + { + "epoch": 0.7962970582184735, + "grad_norm": 2.9533324999767925, + "learning_rate": 2.09800882928992e-06, + "loss": 0.7226, + "step": 9677 + }, + { + "epoch": 0.7963793458136186, + "grad_norm": 1.9046928241661172, + "learning_rate": 2.09637567852782e-06, + "loss": 0.6868, + "step": 9678 + }, + { + "epoch": 0.7964616334087636, + "grad_norm": 1.96544282052215, + "learning_rate": 2.094743089226554e-06, + "loss": 0.6949, + "step": 9679 + }, + { + "epoch": 0.7965439210039087, + "grad_norm": 2.6318942989233722, + "learning_rate": 2.0931110615020967e-06, + "loss": 0.6994, + "step": 9680 + }, + { + "epoch": 0.7966262085990536, + "grad_norm": 2.2039668622743975, + "learning_rate": 2.091479595470385e-06, + "loss": 0.7006, + "step": 9681 + }, + { + "epoch": 0.7967084961941987, + "grad_norm": 2.4571062291599906, + "learning_rate": 2.0898486912473116e-06, + "loss": 0.7003, + "step": 9682 + }, + { + "epoch": 0.7967907837893438, + "grad_norm": 2.2938683290463193, + "learning_rate": 2.0882183489487386e-06, + "loss": 0.7402, + "step": 9683 + }, + { + "epoch": 0.7968730713844888, + "grad_norm": 1.8501202698762043, + "learning_rate": 2.0865885686904798e-06, + "loss": 0.726, + "step": 9684 + }, + { + "epoch": 0.7969553589796338, + "grad_norm": 0.4188222243501081, + "learning_rate": 2.0849593505883182e-06, + "loss": 0.4649, + "step": 9685 + }, + { + "epoch": 0.7970376465747788, + "grad_norm": 1.9298895576099977, + "learning_rate": 2.08333069475798e-06, + "loss": 0.7109, + "step": 9686 + }, + { + "epoch": 0.7971199341699239, + "grad_norm": 0.40849165890194883, + "learning_rate": 2.081702601315172e-06, + "loss": 0.4669, + "step": 9687 + }, + { + "epoch": 0.7972022217650689, + "grad_norm": 0.39906237566564323, + "learning_rate": 2.0800750703755445e-06, + "loss": 0.4598, + "step": 9688 + }, + { + "epoch": 0.797284509360214, + "grad_norm": 2.249677091723538, + "learning_rate": 2.0784481020547222e-06, + "loss": 0.7019, + "step": 9689 + }, + { + "epoch": 0.797366796955359, + "grad_norm": 2.0412116806567533, + "learning_rate": 2.076821696468273e-06, + "loss": 0.725, + "step": 9690 + }, + { + "epoch": 0.797449084550504, + "grad_norm": 1.9130093875547922, + "learning_rate": 2.0751958537317464e-06, + "loss": 0.7166, + "step": 9691 + }, + { + "epoch": 0.7975313721456491, + "grad_norm": 2.287101253276625, + "learning_rate": 2.073570573960626e-06, + "loss": 0.7057, + "step": 9692 + }, + { + "epoch": 0.7976136597407941, + "grad_norm": 1.7320813578113214, + "learning_rate": 2.07194585727038e-06, + "loss": 0.696, + "step": 9693 + }, + { + "epoch": 0.7976959473359391, + "grad_norm": 1.9223938667744114, + "learning_rate": 2.0703217037764166e-06, + "loss": 0.7036, + "step": 9694 + }, + { + "epoch": 0.7977782349310841, + "grad_norm": 2.1421365881582406, + "learning_rate": 2.068698113594121e-06, + "loss": 0.7055, + "step": 9695 + }, + { + "epoch": 0.7978605225262292, + "grad_norm": 2.2760908438232708, + "learning_rate": 2.067075086838828e-06, + "loss": 0.701, + "step": 9696 + }, + { + "epoch": 0.7979428101213742, + "grad_norm": 2.144785881957786, + "learning_rate": 2.0654526236258334e-06, + "loss": 0.7241, + "step": 9697 + }, + { + "epoch": 0.7980250977165192, + "grad_norm": 2.4923948004812018, + "learning_rate": 2.0638307240703915e-06, + "loss": 0.7472, + "step": 9698 + }, + { + "epoch": 0.7981073853116643, + "grad_norm": 1.9157297766296448, + "learning_rate": 2.062209388287727e-06, + "loss": 0.7334, + "step": 9699 + }, + { + "epoch": 0.7981896729068093, + "grad_norm": 1.965511906357181, + "learning_rate": 2.060588616393009e-06, + "loss": 0.695, + "step": 9700 + }, + { + "epoch": 0.7982719605019544, + "grad_norm": 3.322265593320306, + "learning_rate": 2.0589684085013815e-06, + "loss": 0.715, + "step": 9701 + }, + { + "epoch": 0.7983542480970993, + "grad_norm": 0.42469080811361654, + "learning_rate": 2.057348764727939e-06, + "loss": 0.4813, + "step": 9702 + }, + { + "epoch": 0.7984365356922444, + "grad_norm": 1.8134155839144832, + "learning_rate": 2.0557296851877373e-06, + "loss": 0.7118, + "step": 9703 + }, + { + "epoch": 0.7985188232873894, + "grad_norm": 2.030872423183532, + "learning_rate": 2.0541111699957915e-06, + "loss": 0.7193, + "step": 9704 + }, + { + "epoch": 0.7986011108825345, + "grad_norm": 1.9396798772706754, + "learning_rate": 2.0524932192670834e-06, + "loss": 0.6984, + "step": 9705 + }, + { + "epoch": 0.7986833984776794, + "grad_norm": 1.7860974293535004, + "learning_rate": 2.0508758331165446e-06, + "loss": 0.7064, + "step": 9706 + }, + { + "epoch": 0.7987656860728245, + "grad_norm": 2.429145999269169, + "learning_rate": 2.0492590116590747e-06, + "loss": 0.7124, + "step": 9707 + }, + { + "epoch": 0.7988479736679696, + "grad_norm": 2.1912930760007967, + "learning_rate": 2.047642755009531e-06, + "loss": 0.7235, + "step": 9708 + }, + { + "epoch": 0.7989302612631146, + "grad_norm": 2.1177814147244307, + "learning_rate": 2.0460270632827273e-06, + "loss": 0.75, + "step": 9709 + }, + { + "epoch": 0.7990125488582596, + "grad_norm": 1.8707488717376344, + "learning_rate": 2.044411936593438e-06, + "loss": 0.7014, + "step": 9710 + }, + { + "epoch": 0.7990948364534046, + "grad_norm": 1.9046586065415427, + "learning_rate": 2.0427973750564044e-06, + "loss": 0.7398, + "step": 9711 + }, + { + "epoch": 0.7991771240485497, + "grad_norm": 1.9352500801124255, + "learning_rate": 2.0411833787863177e-06, + "loss": 0.6847, + "step": 9712 + }, + { + "epoch": 0.7992594116436947, + "grad_norm": 2.0279995750922524, + "learning_rate": 2.039569947897837e-06, + "loss": 0.7086, + "step": 9713 + }, + { + "epoch": 0.7993416992388398, + "grad_norm": 1.9427290825611387, + "learning_rate": 2.037957082505577e-06, + "loss": 0.6995, + "step": 9714 + }, + { + "epoch": 0.7994239868339847, + "grad_norm": 0.4177024595950708, + "learning_rate": 2.0363447827241123e-06, + "loss": 0.4555, + "step": 9715 + }, + { + "epoch": 0.7995062744291298, + "grad_norm": 2.206368294461977, + "learning_rate": 2.034733048667975e-06, + "loss": 0.6896, + "step": 9716 + }, + { + "epoch": 0.7995885620242749, + "grad_norm": 1.8990124461983122, + "learning_rate": 2.0331218804516674e-06, + "loss": 0.7239, + "step": 9717 + }, + { + "epoch": 0.7996708496194199, + "grad_norm": 0.4449595779095626, + "learning_rate": 2.0315112781896363e-06, + "loss": 0.4688, + "step": 9718 + }, + { + "epoch": 0.7997531372145649, + "grad_norm": 1.755055866659146, + "learning_rate": 2.029901241996304e-06, + "loss": 0.7395, + "step": 9719 + }, + { + "epoch": 0.7998354248097099, + "grad_norm": 2.313378103567429, + "learning_rate": 2.0282917719860396e-06, + "loss": 0.7311, + "step": 9720 + }, + { + "epoch": 0.799917712404855, + "grad_norm": 2.0038429630795167, + "learning_rate": 2.0266828682731797e-06, + "loss": 0.6919, + "step": 9721 + }, + { + "epoch": 0.8, + "grad_norm": 2.0439250987699245, + "learning_rate": 2.0250745309720157e-06, + "loss": 0.713, + "step": 9722 + }, + { + "epoch": 0.800082287595145, + "grad_norm": 2.2651644098642447, + "learning_rate": 2.023466760196805e-06, + "loss": 0.7152, + "step": 9723 + }, + { + "epoch": 0.80016457519029, + "grad_norm": 2.043111861092737, + "learning_rate": 2.021859556061756e-06, + "loss": 0.6729, + "step": 9724 + }, + { + "epoch": 0.8002468627854351, + "grad_norm": 1.6566547166244159, + "learning_rate": 2.0202529186810484e-06, + "loss": 0.6923, + "step": 9725 + }, + { + "epoch": 0.8003291503805802, + "grad_norm": 3.252853174971504, + "learning_rate": 2.0186468481688104e-06, + "loss": 0.7114, + "step": 9726 + }, + { + "epoch": 0.8004114379757251, + "grad_norm": 2.6289707316097775, + "learning_rate": 2.0170413446391367e-06, + "loss": 0.7352, + "step": 9727 + }, + { + "epoch": 0.8004937255708702, + "grad_norm": 2.34342782424263, + "learning_rate": 2.0154364082060773e-06, + "loss": 0.7023, + "step": 9728 + }, + { + "epoch": 0.8005760131660152, + "grad_norm": 1.892537979741158, + "learning_rate": 2.0138320389836476e-06, + "loss": 0.7202, + "step": 9729 + }, + { + "epoch": 0.8006583007611603, + "grad_norm": 1.919209661845698, + "learning_rate": 2.012228237085816e-06, + "loss": 0.7243, + "step": 9730 + }, + { + "epoch": 0.8007405883563052, + "grad_norm": 1.8734525370356232, + "learning_rate": 2.0106250026265174e-06, + "loss": 0.7159, + "step": 9731 + }, + { + "epoch": 0.8008228759514503, + "grad_norm": 1.863402798202083, + "learning_rate": 2.0090223357196426e-06, + "loss": 0.7061, + "step": 9732 + }, + { + "epoch": 0.8009051635465954, + "grad_norm": 0.4266615548235125, + "learning_rate": 2.0074202364790405e-06, + "loss": 0.4509, + "step": 9733 + }, + { + "epoch": 0.8009874511417404, + "grad_norm": 1.809995348282232, + "learning_rate": 2.0058187050185207e-06, + "loss": 0.7099, + "step": 9734 + }, + { + "epoch": 0.8010697387368854, + "grad_norm": 1.912572998688039, + "learning_rate": 2.004217741451857e-06, + "loss": 0.7124, + "step": 9735 + }, + { + "epoch": 0.8011520263320304, + "grad_norm": 2.141370588921066, + "learning_rate": 2.0026173458927755e-06, + "loss": 0.6933, + "step": 9736 + }, + { + "epoch": 0.8012343139271755, + "grad_norm": 1.7418018550189098, + "learning_rate": 2.001017518454971e-06, + "loss": 0.7041, + "step": 9737 + }, + { + "epoch": 0.8013166015223205, + "grad_norm": 1.9156732032355168, + "learning_rate": 1.9994182592520894e-06, + "loss": 0.7182, + "step": 9738 + }, + { + "epoch": 0.8013988891174656, + "grad_norm": 1.9300870282689528, + "learning_rate": 1.9978195683977387e-06, + "loss": 0.7175, + "step": 9739 + }, + { + "epoch": 0.8014811767126105, + "grad_norm": 3.203705123959986, + "learning_rate": 1.9962214460054864e-06, + "loss": 0.7013, + "step": 9740 + }, + { + "epoch": 0.8015634643077556, + "grad_norm": 2.2330572603875196, + "learning_rate": 1.994623892188865e-06, + "loss": 0.7141, + "step": 9741 + }, + { + "epoch": 0.8016457519029007, + "grad_norm": 1.8013460504693715, + "learning_rate": 1.993026907061356e-06, + "loss": 0.6881, + "step": 9742 + }, + { + "epoch": 0.8017280394980457, + "grad_norm": 1.8279100033998994, + "learning_rate": 1.9914304907364146e-06, + "loss": 0.7007, + "step": 9743 + }, + { + "epoch": 0.8018103270931907, + "grad_norm": 2.0581083811357495, + "learning_rate": 1.9898346433274417e-06, + "loss": 0.7332, + "step": 9744 + }, + { + "epoch": 0.8018926146883357, + "grad_norm": 2.397844547340273, + "learning_rate": 1.988239364947806e-06, + "loss": 0.7009, + "step": 9745 + }, + { + "epoch": 0.8019749022834808, + "grad_norm": 1.6097769586371564, + "learning_rate": 1.986644655710831e-06, + "loss": 0.681, + "step": 9746 + }, + { + "epoch": 0.8020571898786258, + "grad_norm": 2.7185385020115773, + "learning_rate": 1.985050515729806e-06, + "loss": 0.695, + "step": 9747 + }, + { + "epoch": 0.8021394774737708, + "grad_norm": 1.8003757870820636, + "learning_rate": 1.983456945117972e-06, + "loss": 0.6967, + "step": 9748 + }, + { + "epoch": 0.8022217650689158, + "grad_norm": 0.4120446189585933, + "learning_rate": 1.981863943988539e-06, + "loss": 0.445, + "step": 9749 + }, + { + "epoch": 0.8023040526640609, + "grad_norm": 1.9435473327734247, + "learning_rate": 1.9802715124546677e-06, + "loss": 0.7178, + "step": 9750 + }, + { + "epoch": 0.802386340259206, + "grad_norm": 1.8241371561807798, + "learning_rate": 1.9786796506294826e-06, + "loss": 0.7342, + "step": 9751 + }, + { + "epoch": 0.8024686278543509, + "grad_norm": 1.9694800794294849, + "learning_rate": 1.9770883586260637e-06, + "loss": 0.702, + "step": 9752 + }, + { + "epoch": 0.802550915449496, + "grad_norm": 1.8851315466242011, + "learning_rate": 1.9754976365574606e-06, + "loss": 0.7339, + "step": 9753 + }, + { + "epoch": 0.802633203044641, + "grad_norm": 2.625261295015606, + "learning_rate": 1.973907484536669e-06, + "loss": 0.7053, + "step": 9754 + }, + { + "epoch": 0.8027154906397861, + "grad_norm": 1.8518136175723547, + "learning_rate": 1.972317902676659e-06, + "loss": 0.7336, + "step": 9755 + }, + { + "epoch": 0.802797778234931, + "grad_norm": 0.42235047550500004, + "learning_rate": 1.9707288910903423e-06, + "loss": 0.4831, + "step": 9756 + }, + { + "epoch": 0.8028800658300761, + "grad_norm": 1.8301956504240302, + "learning_rate": 1.9691404498906064e-06, + "loss": 0.7181, + "step": 9757 + }, + { + "epoch": 0.8029623534252212, + "grad_norm": 0.40054986255432484, + "learning_rate": 1.9675525791902873e-06, + "loss": 0.4689, + "step": 9758 + }, + { + "epoch": 0.8030446410203662, + "grad_norm": 2.3225175604446626, + "learning_rate": 1.965965279102189e-06, + "loss": 0.7273, + "step": 9759 + }, + { + "epoch": 0.8031269286155112, + "grad_norm": 1.9163426283521343, + "learning_rate": 1.9643785497390665e-06, + "loss": 0.7309, + "step": 9760 + }, + { + "epoch": 0.8032092162106562, + "grad_norm": 1.9864725994741261, + "learning_rate": 1.962792391213646e-06, + "loss": 0.7041, + "step": 9761 + }, + { + "epoch": 0.8032915038058013, + "grad_norm": 2.0054033582624244, + "learning_rate": 1.9612068036385944e-06, + "loss": 0.7268, + "step": 9762 + }, + { + "epoch": 0.8033737914009463, + "grad_norm": 1.9161372404763, + "learning_rate": 1.9596217871265588e-06, + "loss": 0.7298, + "step": 9763 + }, + { + "epoch": 0.8034560789960914, + "grad_norm": 1.7296234970914695, + "learning_rate": 1.95803734179013e-06, + "loss": 0.7057, + "step": 9764 + }, + { + "epoch": 0.8035383665912363, + "grad_norm": 1.7153716267858454, + "learning_rate": 1.956453467741869e-06, + "loss": 0.7077, + "step": 9765 + }, + { + "epoch": 0.8036206541863814, + "grad_norm": 2.9958501237238337, + "learning_rate": 1.9548701650942915e-06, + "loss": 0.7076, + "step": 9766 + }, + { + "epoch": 0.8037029417815265, + "grad_norm": 1.9600830254898622, + "learning_rate": 1.9532874339598696e-06, + "loss": 0.7095, + "step": 9767 + }, + { + "epoch": 0.8037852293766715, + "grad_norm": 2.0467537075147155, + "learning_rate": 1.951705274451038e-06, + "loss": 0.7038, + "step": 9768 + }, + { + "epoch": 0.8038675169718165, + "grad_norm": 1.570437785034517, + "learning_rate": 1.950123686680194e-06, + "loss": 0.7022, + "step": 9769 + }, + { + "epoch": 0.8039498045669615, + "grad_norm": 1.7028339413948315, + "learning_rate": 1.948542670759688e-06, + "loss": 0.7103, + "step": 9770 + }, + { + "epoch": 0.8040320921621066, + "grad_norm": 1.8289663088793922, + "learning_rate": 1.9469622268018363e-06, + "loss": 0.7162, + "step": 9771 + }, + { + "epoch": 0.8041143797572516, + "grad_norm": 0.388416538452183, + "learning_rate": 1.945382354918909e-06, + "loss": 0.4641, + "step": 9772 + }, + { + "epoch": 0.8041966673523966, + "grad_norm": 0.39545685317882706, + "learning_rate": 1.943803055223138e-06, + "loss": 0.4637, + "step": 9773 + }, + { + "epoch": 0.8042789549475416, + "grad_norm": 0.4214903375980745, + "learning_rate": 1.9422243278267106e-06, + "loss": 0.498, + "step": 9774 + }, + { + "epoch": 0.8043612425426867, + "grad_norm": 1.9919722526171364, + "learning_rate": 1.9406461728417838e-06, + "loss": 0.6868, + "step": 9775 + }, + { + "epoch": 0.8044435301378318, + "grad_norm": 1.863774956140299, + "learning_rate": 1.9390685903804605e-06, + "loss": 0.6975, + "step": 9776 + }, + { + "epoch": 0.8045258177329767, + "grad_norm": 1.748527857531413, + "learning_rate": 1.9374915805548168e-06, + "loss": 0.707, + "step": 9777 + }, + { + "epoch": 0.8046081053281218, + "grad_norm": 2.40910159839661, + "learning_rate": 1.9359151434768765e-06, + "loss": 0.7159, + "step": 9778 + }, + { + "epoch": 0.8046903929232668, + "grad_norm": 2.031483283800606, + "learning_rate": 1.934339279258628e-06, + "loss": 0.7115, + "step": 9779 + }, + { + "epoch": 0.8047726805184119, + "grad_norm": 0.40246285829036005, + "learning_rate": 1.9327639880120153e-06, + "loss": 0.4648, + "step": 9780 + }, + { + "epoch": 0.8048549681135568, + "grad_norm": 2.62731250153125, + "learning_rate": 1.93118926984895e-06, + "loss": 0.7041, + "step": 9781 + }, + { + "epoch": 0.8049372557087019, + "grad_norm": 2.052215236834145, + "learning_rate": 1.929615124881291e-06, + "loss": 0.6912, + "step": 9782 + }, + { + "epoch": 0.805019543303847, + "grad_norm": 0.41924272204366414, + "learning_rate": 1.928041553220871e-06, + "loss": 0.4778, + "step": 9783 + }, + { + "epoch": 0.805101830898992, + "grad_norm": 1.8055157017908088, + "learning_rate": 1.9264685549794693e-06, + "loss": 0.7016, + "step": 9784 + }, + { + "epoch": 0.805184118494137, + "grad_norm": 2.033693100628309, + "learning_rate": 1.9248961302688298e-06, + "loss": 0.7313, + "step": 9785 + }, + { + "epoch": 0.805266406089282, + "grad_norm": 2.572517701768818, + "learning_rate": 1.9233242792006523e-06, + "loss": 0.6927, + "step": 9786 + }, + { + "epoch": 0.8053486936844271, + "grad_norm": 0.42720317148975395, + "learning_rate": 1.921753001886604e-06, + "loss": 0.4934, + "step": 9787 + }, + { + "epoch": 0.8054309812795721, + "grad_norm": 0.4307075976526212, + "learning_rate": 1.9201822984383e-06, + "loss": 0.4994, + "step": 9788 + }, + { + "epoch": 0.8055132688747171, + "grad_norm": 2.291426762768202, + "learning_rate": 1.9186121689673275e-06, + "loss": 0.7094, + "step": 9789 + }, + { + "epoch": 0.8055955564698621, + "grad_norm": 2.0597064255156816, + "learning_rate": 1.9170426135852218e-06, + "loss": 0.6988, + "step": 9790 + }, + { + "epoch": 0.8056778440650072, + "grad_norm": 0.39609107928744675, + "learning_rate": 1.915473632403482e-06, + "loss": 0.4637, + "step": 9791 + }, + { + "epoch": 0.8057601316601523, + "grad_norm": 1.7563603698205292, + "learning_rate": 1.9139052255335624e-06, + "loss": 0.7078, + "step": 9792 + }, + { + "epoch": 0.8058424192552973, + "grad_norm": 0.40181947448065547, + "learning_rate": 1.9123373930868873e-06, + "loss": 0.5073, + "step": 9793 + }, + { + "epoch": 0.8059247068504423, + "grad_norm": 0.4009474765894149, + "learning_rate": 1.9107701351748278e-06, + "loss": 0.4663, + "step": 9794 + }, + { + "epoch": 0.8060069944455873, + "grad_norm": 2.281310766018567, + "learning_rate": 1.909203451908722e-06, + "loss": 0.7313, + "step": 9795 + }, + { + "epoch": 0.8060892820407324, + "grad_norm": 1.802426161653647, + "learning_rate": 1.907637343399864e-06, + "loss": 0.6984, + "step": 9796 + }, + { + "epoch": 0.8061715696358774, + "grad_norm": 2.968654820056152, + "learning_rate": 1.9060718097595077e-06, + "loss": 0.717, + "step": 9797 + }, + { + "epoch": 0.8062538572310224, + "grad_norm": 1.9148387179700093, + "learning_rate": 1.9045068510988641e-06, + "loss": 0.707, + "step": 9798 + }, + { + "epoch": 0.8063361448261674, + "grad_norm": 2.257197813170384, + "learning_rate": 1.9029424675291087e-06, + "loss": 0.7309, + "step": 9799 + }, + { + "epoch": 0.8064184324213125, + "grad_norm": 1.6582517748826437, + "learning_rate": 1.9013786591613681e-06, + "loss": 0.6891, + "step": 9800 + }, + { + "epoch": 0.8065007200164576, + "grad_norm": 1.7746708922217518, + "learning_rate": 1.899815426106738e-06, + "loss": 0.7275, + "step": 9801 + }, + { + "epoch": 0.8065830076116025, + "grad_norm": 2.3520685485850508, + "learning_rate": 1.8982527684762664e-06, + "loss": 0.7361, + "step": 9802 + }, + { + "epoch": 0.8066652952067476, + "grad_norm": 1.750505182233444, + "learning_rate": 1.896690686380962e-06, + "loss": 0.6828, + "step": 9803 + }, + { + "epoch": 0.8067475828018926, + "grad_norm": 1.9392785410739228, + "learning_rate": 1.8951291799317883e-06, + "loss": 0.7105, + "step": 9804 + }, + { + "epoch": 0.8068298703970377, + "grad_norm": 1.9779244831162257, + "learning_rate": 1.8935682492396778e-06, + "loss": 0.706, + "step": 9805 + }, + { + "epoch": 0.8069121579921826, + "grad_norm": 1.6661394253111792, + "learning_rate": 1.892007894415513e-06, + "loss": 0.7357, + "step": 9806 + }, + { + "epoch": 0.8069944455873277, + "grad_norm": 1.8708299048534998, + "learning_rate": 1.8904481155701437e-06, + "loss": 0.7428, + "step": 9807 + }, + { + "epoch": 0.8070767331824727, + "grad_norm": 2.2472466662840582, + "learning_rate": 1.8888889128143695e-06, + "loss": 0.7224, + "step": 9808 + }, + { + "epoch": 0.8071590207776178, + "grad_norm": 2.040848085655821, + "learning_rate": 1.887330286258956e-06, + "loss": 0.7096, + "step": 9809 + }, + { + "epoch": 0.8072413083727628, + "grad_norm": 1.9398881742006366, + "learning_rate": 1.8857722360146224e-06, + "loss": 0.7411, + "step": 9810 + }, + { + "epoch": 0.8073235959679078, + "grad_norm": 1.7949780780926046, + "learning_rate": 1.8842147621920538e-06, + "loss": 0.7, + "step": 9811 + }, + { + "epoch": 0.8074058835630529, + "grad_norm": 2.5889033434037683, + "learning_rate": 1.8826578649018879e-06, + "loss": 0.6986, + "step": 9812 + }, + { + "epoch": 0.8074881711581979, + "grad_norm": 2.28807174496187, + "learning_rate": 1.8811015442547275e-06, + "loss": 0.7112, + "step": 9813 + }, + { + "epoch": 0.8075704587533429, + "grad_norm": 1.7311091816135458, + "learning_rate": 1.879545800361129e-06, + "loss": 0.7257, + "step": 9814 + }, + { + "epoch": 0.8076527463484879, + "grad_norm": 1.8441030765625266, + "learning_rate": 1.87799063333161e-06, + "loss": 0.7401, + "step": 9815 + }, + { + "epoch": 0.807735033943633, + "grad_norm": 3.742432011380351, + "learning_rate": 1.876436043276645e-06, + "loss": 0.7434, + "step": 9816 + }, + { + "epoch": 0.807817321538778, + "grad_norm": 2.425005631629464, + "learning_rate": 1.8748820303066739e-06, + "loss": 0.744, + "step": 9817 + }, + { + "epoch": 0.8078996091339231, + "grad_norm": 1.960124002794105, + "learning_rate": 1.8733285945320867e-06, + "loss": 0.6732, + "step": 9818 + }, + { + "epoch": 0.8079818967290681, + "grad_norm": 1.655166601451773, + "learning_rate": 1.8717757360632427e-06, + "loss": 0.7101, + "step": 9819 + }, + { + "epoch": 0.8080641843242131, + "grad_norm": 2.35352160471275, + "learning_rate": 1.8702234550104502e-06, + "loss": 0.7217, + "step": 9820 + }, + { + "epoch": 0.8081464719193582, + "grad_norm": 0.4038867177838153, + "learning_rate": 1.8686717514839825e-06, + "loss": 0.5036, + "step": 9821 + }, + { + "epoch": 0.8082287595145032, + "grad_norm": 0.39816261275983217, + "learning_rate": 1.8671206255940666e-06, + "loss": 0.4882, + "step": 9822 + }, + { + "epoch": 0.8083110471096482, + "grad_norm": 1.7260306340669238, + "learning_rate": 1.8655700774508978e-06, + "loss": 0.7315, + "step": 9823 + }, + { + "epoch": 0.8083933347047932, + "grad_norm": 0.4188626388480571, + "learning_rate": 1.864020107164618e-06, + "loss": 0.4833, + "step": 9824 + }, + { + "epoch": 0.8084756222999383, + "grad_norm": 1.816253772847475, + "learning_rate": 1.8624707148453446e-06, + "loss": 0.7196, + "step": 9825 + }, + { + "epoch": 0.8085579098950834, + "grad_norm": 6.856283235661718, + "learning_rate": 1.860921900603132e-06, + "loss": 0.6907, + "step": 9826 + }, + { + "epoch": 0.8086401974902283, + "grad_norm": 2.025605226974372, + "learning_rate": 1.859373664548013e-06, + "loss": 0.6979, + "step": 9827 + }, + { + "epoch": 0.8087224850853734, + "grad_norm": 1.9757105118643494, + "learning_rate": 1.8578260067899678e-06, + "loss": 0.6823, + "step": 9828 + }, + { + "epoch": 0.8088047726805184, + "grad_norm": 1.9289859510248806, + "learning_rate": 1.8562789274389448e-06, + "loss": 0.7359, + "step": 9829 + }, + { + "epoch": 0.8088870602756635, + "grad_norm": 1.6001089939445345, + "learning_rate": 1.8547324266048394e-06, + "loss": 0.7033, + "step": 9830 + }, + { + "epoch": 0.8089693478708084, + "grad_norm": 2.1137547808147215, + "learning_rate": 1.8531865043975217e-06, + "loss": 0.7427, + "step": 9831 + }, + { + "epoch": 0.8090516354659535, + "grad_norm": 2.1543821575073614, + "learning_rate": 1.8516411609268004e-06, + "loss": 0.7112, + "step": 9832 + }, + { + "epoch": 0.8091339230610985, + "grad_norm": 0.4299273236947779, + "learning_rate": 1.850096396302462e-06, + "loss": 0.4647, + "step": 9833 + }, + { + "epoch": 0.8092162106562436, + "grad_norm": 1.7940057601560253, + "learning_rate": 1.848552210634239e-06, + "loss": 0.7089, + "step": 9834 + }, + { + "epoch": 0.8092984982513886, + "grad_norm": 2.6013333856618597, + "learning_rate": 1.8470086040318314e-06, + "loss": 0.7117, + "step": 9835 + }, + { + "epoch": 0.8093807858465336, + "grad_norm": 0.41904875105928313, + "learning_rate": 1.8454655766048978e-06, + "loss": 0.4644, + "step": 9836 + }, + { + "epoch": 0.8094630734416787, + "grad_norm": 2.3431194184598003, + "learning_rate": 1.8439231284630444e-06, + "loss": 0.7198, + "step": 9837 + }, + { + "epoch": 0.8095453610368237, + "grad_norm": 1.811417570681222, + "learning_rate": 1.8423812597158496e-06, + "loss": 0.7209, + "step": 9838 + }, + { + "epoch": 0.8096276486319687, + "grad_norm": 2.2760161611868672, + "learning_rate": 1.840839970472843e-06, + "loss": 0.7401, + "step": 9839 + }, + { + "epoch": 0.8097099362271137, + "grad_norm": 2.374029075809117, + "learning_rate": 1.839299260843518e-06, + "loss": 0.7383, + "step": 9840 + }, + { + "epoch": 0.8097922238222588, + "grad_norm": 1.7860583811174555, + "learning_rate": 1.8377591309373188e-06, + "loss": 0.7045, + "step": 9841 + }, + { + "epoch": 0.8098745114174039, + "grad_norm": 2.046980817706621, + "learning_rate": 1.836219580863664e-06, + "loss": 0.7203, + "step": 9842 + }, + { + "epoch": 0.8099567990125489, + "grad_norm": 2.8005357931500745, + "learning_rate": 1.8346806107319082e-06, + "loss": 0.7145, + "step": 9843 + }, + { + "epoch": 0.8100390866076939, + "grad_norm": 2.295229465785295, + "learning_rate": 1.833142220651386e-06, + "loss": 0.7308, + "step": 9844 + }, + { + "epoch": 0.8101213742028389, + "grad_norm": 1.9492903361049359, + "learning_rate": 1.8316044107313779e-06, + "loss": 0.7157, + "step": 9845 + }, + { + "epoch": 0.810203661797984, + "grad_norm": 1.8250977014616696, + "learning_rate": 1.83006718108113e-06, + "loss": 0.7249, + "step": 9846 + }, + { + "epoch": 0.810285949393129, + "grad_norm": 2.122600385999285, + "learning_rate": 1.8285305318098444e-06, + "loss": 0.6998, + "step": 9847 + }, + { + "epoch": 0.810368236988274, + "grad_norm": 2.0695585012529323, + "learning_rate": 1.8269944630266824e-06, + "loss": 0.7065, + "step": 9848 + }, + { + "epoch": 0.810450524583419, + "grad_norm": 1.9993657774928226, + "learning_rate": 1.8254589748407592e-06, + "loss": 0.702, + "step": 9849 + }, + { + "epoch": 0.8105328121785641, + "grad_norm": 2.356260808442222, + "learning_rate": 1.8239240673611602e-06, + "loss": 0.7162, + "step": 9850 + }, + { + "epoch": 0.8106150997737092, + "grad_norm": 1.9636665798950954, + "learning_rate": 1.822389740696917e-06, + "loss": 0.7352, + "step": 9851 + }, + { + "epoch": 0.8106973873688541, + "grad_norm": 2.4166232204154174, + "learning_rate": 1.8208559949570314e-06, + "loss": 0.7257, + "step": 9852 + }, + { + "epoch": 0.8107796749639992, + "grad_norm": 2.08586066208624, + "learning_rate": 1.819322830250455e-06, + "loss": 0.6981, + "step": 9853 + }, + { + "epoch": 0.8108619625591442, + "grad_norm": 2.138206693693744, + "learning_rate": 1.8177902466861009e-06, + "loss": 0.6981, + "step": 9854 + }, + { + "epoch": 0.8109442501542893, + "grad_norm": 1.7933835537174807, + "learning_rate": 1.8162582443728404e-06, + "loss": 0.7002, + "step": 9855 + }, + { + "epoch": 0.8110265377494342, + "grad_norm": 1.680884764151247, + "learning_rate": 1.8147268234195093e-06, + "loss": 0.7309, + "step": 9856 + }, + { + "epoch": 0.8111088253445793, + "grad_norm": 1.9847847598277164, + "learning_rate": 1.8131959839348912e-06, + "loss": 0.7381, + "step": 9857 + }, + { + "epoch": 0.8111911129397243, + "grad_norm": 2.102877598479016, + "learning_rate": 1.8116657260277405e-06, + "loss": 0.7391, + "step": 9858 + }, + { + "epoch": 0.8112734005348694, + "grad_norm": 1.8643971153961838, + "learning_rate": 1.81013604980676e-06, + "loss": 0.7094, + "step": 9859 + }, + { + "epoch": 0.8113556881300144, + "grad_norm": 3.323691379161553, + "learning_rate": 1.8086069553806186e-06, + "loss": 0.7091, + "step": 9860 + }, + { + "epoch": 0.8114379757251594, + "grad_norm": 2.2488294626372203, + "learning_rate": 1.8070784428579347e-06, + "loss": 0.7331, + "step": 9861 + }, + { + "epoch": 0.8115202633203045, + "grad_norm": 1.7157510391323105, + "learning_rate": 1.8055505123472994e-06, + "loss": 0.7148, + "step": 9862 + }, + { + "epoch": 0.8116025509154495, + "grad_norm": 2.077667251424406, + "learning_rate": 1.804023163957247e-06, + "loss": 0.7028, + "step": 9863 + }, + { + "epoch": 0.8116848385105945, + "grad_norm": 2.248204030286972, + "learning_rate": 1.8024963977962851e-06, + "loss": 0.6973, + "step": 9864 + }, + { + "epoch": 0.8117671261057395, + "grad_norm": 1.8249137363441423, + "learning_rate": 1.800970213972868e-06, + "loss": 0.6998, + "step": 9865 + }, + { + "epoch": 0.8118494137008846, + "grad_norm": 0.43507760425860603, + "learning_rate": 1.7994446125954158e-06, + "loss": 0.4593, + "step": 9866 + }, + { + "epoch": 0.8119317012960297, + "grad_norm": 1.9092265864685694, + "learning_rate": 1.7979195937722992e-06, + "loss": 0.7404, + "step": 9867 + }, + { + "epoch": 0.8120139888911747, + "grad_norm": 1.548386660558381, + "learning_rate": 1.796395157611861e-06, + "loss": 0.6983, + "step": 9868 + }, + { + "epoch": 0.8120962764863197, + "grad_norm": 1.8884971449083372, + "learning_rate": 1.7948713042223885e-06, + "loss": 0.7101, + "step": 9869 + }, + { + "epoch": 0.8121785640814647, + "grad_norm": 1.8162568949224442, + "learning_rate": 1.793348033712139e-06, + "loss": 0.6929, + "step": 9870 + }, + { + "epoch": 0.8122608516766098, + "grad_norm": 1.6759000402440805, + "learning_rate": 1.791825346189321e-06, + "loss": 0.703, + "step": 9871 + }, + { + "epoch": 0.8123431392717548, + "grad_norm": 1.8878367177263726, + "learning_rate": 1.7903032417621025e-06, + "loss": 0.7304, + "step": 9872 + }, + { + "epoch": 0.8124254268668998, + "grad_norm": 2.797642191881407, + "learning_rate": 1.7887817205386104e-06, + "loss": 0.726, + "step": 9873 + }, + { + "epoch": 0.8125077144620448, + "grad_norm": 1.7724497790389615, + "learning_rate": 1.787260782626935e-06, + "loss": 0.7216, + "step": 9874 + }, + { + "epoch": 0.8125900020571899, + "grad_norm": 0.41707210169523323, + "learning_rate": 1.7857404281351166e-06, + "loss": 0.4657, + "step": 9875 + }, + { + "epoch": 0.812672289652335, + "grad_norm": 1.9533984395269985, + "learning_rate": 1.784220657171164e-06, + "loss": 0.705, + "step": 9876 + }, + { + "epoch": 0.8127545772474799, + "grad_norm": 1.7516484322856427, + "learning_rate": 1.7827014698430377e-06, + "loss": 0.7463, + "step": 9877 + }, + { + "epoch": 0.812836864842625, + "grad_norm": 2.4412960043286516, + "learning_rate": 1.781182866258656e-06, + "loss": 0.7184, + "step": 9878 + }, + { + "epoch": 0.81291915243777, + "grad_norm": 1.7815477497821113, + "learning_rate": 1.779664846525898e-06, + "loss": 0.7092, + "step": 9879 + }, + { + "epoch": 0.8130014400329151, + "grad_norm": 1.9123199288150199, + "learning_rate": 1.7781474107526042e-06, + "loss": 0.6847, + "step": 9880 + }, + { + "epoch": 0.81308372762806, + "grad_norm": 1.9502443765171433, + "learning_rate": 1.7766305590465682e-06, + "loss": 0.7015, + "step": 9881 + }, + { + "epoch": 0.8131660152232051, + "grad_norm": 2.048056224032064, + "learning_rate": 1.7751142915155473e-06, + "loss": 0.686, + "step": 9882 + }, + { + "epoch": 0.8132483028183501, + "grad_norm": 2.2609697001584372, + "learning_rate": 1.773598608267254e-06, + "loss": 0.7101, + "step": 9883 + }, + { + "epoch": 0.8133305904134952, + "grad_norm": 2.0416216154371076, + "learning_rate": 1.772083509409359e-06, + "loss": 0.7124, + "step": 9884 + }, + { + "epoch": 0.8134128780086402, + "grad_norm": 0.42047139627667335, + "learning_rate": 1.7705689950494908e-06, + "loss": 0.4848, + "step": 9885 + }, + { + "epoch": 0.8134951656037852, + "grad_norm": 1.8255390510587912, + "learning_rate": 1.7690550652952432e-06, + "loss": 0.7278, + "step": 9886 + }, + { + "epoch": 0.8135774531989303, + "grad_norm": 1.9022854255171127, + "learning_rate": 1.7675417202541566e-06, + "loss": 0.7237, + "step": 9887 + }, + { + "epoch": 0.8136597407940753, + "grad_norm": 2.39190975769215, + "learning_rate": 1.7660289600337432e-06, + "loss": 0.7474, + "step": 9888 + }, + { + "epoch": 0.8137420283892203, + "grad_norm": 0.43087662071194927, + "learning_rate": 1.7645167847414646e-06, + "loss": 0.4817, + "step": 9889 + }, + { + "epoch": 0.8138243159843653, + "grad_norm": 1.9069038450952935, + "learning_rate": 1.7630051944847427e-06, + "loss": 0.6855, + "step": 9890 + }, + { + "epoch": 0.8139066035795104, + "grad_norm": 1.7050567524956086, + "learning_rate": 1.761494189370956e-06, + "loss": 0.7141, + "step": 9891 + }, + { + "epoch": 0.8139888911746554, + "grad_norm": 1.5673252053504947, + "learning_rate": 1.7599837695074496e-06, + "loss": 0.7275, + "step": 9892 + }, + { + "epoch": 0.8140711787698005, + "grad_norm": 1.6234063661516784, + "learning_rate": 1.7584739350015145e-06, + "loss": 0.6769, + "step": 9893 + }, + { + "epoch": 0.8141534663649455, + "grad_norm": 1.7399925437645316, + "learning_rate": 1.756964685960414e-06, + "loss": 0.7284, + "step": 9894 + }, + { + "epoch": 0.8142357539600905, + "grad_norm": 1.9922729277963618, + "learning_rate": 1.7554560224913597e-06, + "loss": 0.726, + "step": 9895 + }, + { + "epoch": 0.8143180415552356, + "grad_norm": 2.2747677554630363, + "learning_rate": 1.753947944701524e-06, + "loss": 0.7059, + "step": 9896 + }, + { + "epoch": 0.8144003291503806, + "grad_norm": 2.1822094587384324, + "learning_rate": 1.7524404526980366e-06, + "loss": 0.7122, + "step": 9897 + }, + { + "epoch": 0.8144826167455256, + "grad_norm": 2.420083821991806, + "learning_rate": 1.7509335465879905e-06, + "loss": 0.6935, + "step": 9898 + }, + { + "epoch": 0.8145649043406706, + "grad_norm": 2.024167984569118, + "learning_rate": 1.7494272264784317e-06, + "loss": 0.7181, + "step": 9899 + }, + { + "epoch": 0.8146471919358157, + "grad_norm": 1.8545972565045905, + "learning_rate": 1.74792149247637e-06, + "loss": 0.7325, + "step": 9900 + }, + { + "epoch": 0.8147294795309608, + "grad_norm": 1.9151692311751622, + "learning_rate": 1.7464163446887684e-06, + "loss": 0.7064, + "step": 9901 + }, + { + "epoch": 0.8148117671261057, + "grad_norm": 1.6258814631506615, + "learning_rate": 1.7449117832225504e-06, + "loss": 0.7052, + "step": 9902 + }, + { + "epoch": 0.8148940547212508, + "grad_norm": 2.388628640676216, + "learning_rate": 1.743407808184594e-06, + "loss": 0.7122, + "step": 9903 + }, + { + "epoch": 0.8149763423163958, + "grad_norm": 2.159311678493064, + "learning_rate": 1.741904419681746e-06, + "loss": 0.711, + "step": 9904 + }, + { + "epoch": 0.8150586299115409, + "grad_norm": 2.3356515179332757, + "learning_rate": 1.740401617820797e-06, + "loss": 0.7268, + "step": 9905 + }, + { + "epoch": 0.8151409175066858, + "grad_norm": 1.9143670524851604, + "learning_rate": 1.7388994027085138e-06, + "loss": 0.6842, + "step": 9906 + }, + { + "epoch": 0.8152232051018309, + "grad_norm": 3.202681753056612, + "learning_rate": 1.7373977744516001e-06, + "loss": 0.7373, + "step": 9907 + }, + { + "epoch": 0.8153054926969759, + "grad_norm": 0.40942255295520963, + "learning_rate": 1.7358967331567368e-06, + "loss": 0.4987, + "step": 9908 + }, + { + "epoch": 0.815387780292121, + "grad_norm": 2.7589019782388893, + "learning_rate": 1.7343962789305514e-06, + "loss": 0.7165, + "step": 9909 + }, + { + "epoch": 0.815470067887266, + "grad_norm": 2.0799451178848494, + "learning_rate": 1.7328964118796375e-06, + "loss": 0.7132, + "step": 9910 + }, + { + "epoch": 0.815552355482411, + "grad_norm": 1.9302875305372054, + "learning_rate": 1.7313971321105383e-06, + "loss": 0.7095, + "step": 9911 + }, + { + "epoch": 0.8156346430775561, + "grad_norm": 1.7615934184356885, + "learning_rate": 1.729898439729768e-06, + "loss": 0.7346, + "step": 9912 + }, + { + "epoch": 0.8157169306727011, + "grad_norm": 2.277257038679096, + "learning_rate": 1.7284003348437829e-06, + "loss": 0.7201, + "step": 9913 + }, + { + "epoch": 0.8157992182678461, + "grad_norm": 2.0046532386880167, + "learning_rate": 1.7269028175590107e-06, + "loss": 0.733, + "step": 9914 + }, + { + "epoch": 0.8158815058629911, + "grad_norm": 1.6927161902881191, + "learning_rate": 1.7254058879818292e-06, + "loss": 0.6802, + "step": 9915 + }, + { + "epoch": 0.8159637934581362, + "grad_norm": 2.039495179168975, + "learning_rate": 1.7239095462185828e-06, + "loss": 0.7247, + "step": 9916 + }, + { + "epoch": 0.8160460810532812, + "grad_norm": 1.9558273855223487, + "learning_rate": 1.7224137923755669e-06, + "loss": 0.7127, + "step": 9917 + }, + { + "epoch": 0.8161283686484262, + "grad_norm": 0.39454829860761237, + "learning_rate": 1.7209186265590372e-06, + "loss": 0.4646, + "step": 9918 + }, + { + "epoch": 0.8162106562435713, + "grad_norm": 1.680772602866258, + "learning_rate": 1.7194240488752046e-06, + "loss": 0.7128, + "step": 9919 + }, + { + "epoch": 0.8162929438387163, + "grad_norm": 1.74865883829522, + "learning_rate": 1.7179300594302472e-06, + "loss": 0.7044, + "step": 9920 + }, + { + "epoch": 0.8163752314338614, + "grad_norm": 1.8695838876273705, + "learning_rate": 1.7164366583302916e-06, + "loss": 0.7058, + "step": 9921 + }, + { + "epoch": 0.8164575190290064, + "grad_norm": 1.667457211473695, + "learning_rate": 1.7149438456814304e-06, + "loss": 0.7078, + "step": 9922 + }, + { + "epoch": 0.8165398066241514, + "grad_norm": 1.8166350159971578, + "learning_rate": 1.7134516215897079e-06, + "loss": 0.6956, + "step": 9923 + }, + { + "epoch": 0.8166220942192964, + "grad_norm": 2.281008837789244, + "learning_rate": 1.7119599861611302e-06, + "loss": 0.7151, + "step": 9924 + }, + { + "epoch": 0.8167043818144415, + "grad_norm": 2.4090206652358908, + "learning_rate": 1.7104689395016572e-06, + "loss": 0.7028, + "step": 9925 + }, + { + "epoch": 0.8167866694095866, + "grad_norm": 2.1827976928653468, + "learning_rate": 1.7089784817172174e-06, + "loss": 0.7194, + "step": 9926 + }, + { + "epoch": 0.8168689570047315, + "grad_norm": 0.4360581517650875, + "learning_rate": 1.7074886129136825e-06, + "loss": 0.489, + "step": 9927 + }, + { + "epoch": 0.8169512445998766, + "grad_norm": 1.9812783356020283, + "learning_rate": 1.7059993331968982e-06, + "loss": 0.7025, + "step": 9928 + }, + { + "epoch": 0.8170335321950216, + "grad_norm": 1.8791386054781238, + "learning_rate": 1.7045106426726577e-06, + "loss": 0.7142, + "step": 9929 + }, + { + "epoch": 0.8171158197901667, + "grad_norm": 1.9953038258481388, + "learning_rate": 1.7030225414467126e-06, + "loss": 0.732, + "step": 9930 + }, + { + "epoch": 0.8171981073853116, + "grad_norm": 1.7288997630238603, + "learning_rate": 1.7015350296247758e-06, + "loss": 0.7305, + "step": 9931 + }, + { + "epoch": 0.8172803949804567, + "grad_norm": 1.9712602631791938, + "learning_rate": 1.7000481073125219e-06, + "loss": 0.7263, + "step": 9932 + }, + { + "epoch": 0.8173626825756017, + "grad_norm": 0.422607721922522, + "learning_rate": 1.6985617746155735e-06, + "loss": 0.4847, + "step": 9933 + }, + { + "epoch": 0.8174449701707468, + "grad_norm": 1.9698318255679366, + "learning_rate": 1.697076031639523e-06, + "loss": 0.7126, + "step": 9934 + }, + { + "epoch": 0.8175272577658917, + "grad_norm": 0.40065023266904565, + "learning_rate": 1.6955908784899133e-06, + "loss": 0.4664, + "step": 9935 + }, + { + "epoch": 0.8176095453610368, + "grad_norm": 2.857724912407178, + "learning_rate": 1.6941063152722459e-06, + "loss": 0.6912, + "step": 9936 + }, + { + "epoch": 0.8176918329561819, + "grad_norm": 1.9356688020806059, + "learning_rate": 1.6926223420919796e-06, + "loss": 0.7224, + "step": 9937 + }, + { + "epoch": 0.8177741205513269, + "grad_norm": 2.2751518622239084, + "learning_rate": 1.6911389590545402e-06, + "loss": 0.6968, + "step": 9938 + }, + { + "epoch": 0.8178564081464719, + "grad_norm": 2.0715860427054875, + "learning_rate": 1.6896561662652976e-06, + "loss": 0.705, + "step": 9939 + }, + { + "epoch": 0.8179386957416169, + "grad_norm": 2.6856451587250616, + "learning_rate": 1.6881739638295935e-06, + "loss": 0.7088, + "step": 9940 + }, + { + "epoch": 0.818020983336762, + "grad_norm": 1.927922931341174, + "learning_rate": 1.6866923518527178e-06, + "loss": 0.6954, + "step": 9941 + }, + { + "epoch": 0.818103270931907, + "grad_norm": 2.1229255784553294, + "learning_rate": 1.685211330439923e-06, + "loss": 0.7177, + "step": 9942 + }, + { + "epoch": 0.818185558527052, + "grad_norm": 0.42114078915536973, + "learning_rate": 1.6837308996964142e-06, + "loss": 0.4792, + "step": 9943 + }, + { + "epoch": 0.818267846122197, + "grad_norm": 2.0606506690418795, + "learning_rate": 1.6822510597273656e-06, + "loss": 0.6886, + "step": 9944 + }, + { + "epoch": 0.8183501337173421, + "grad_norm": 2.256105941003019, + "learning_rate": 1.6807718106378968e-06, + "loss": 0.6892, + "step": 9945 + }, + { + "epoch": 0.8184324213124872, + "grad_norm": 2.175001104570195, + "learning_rate": 1.6792931525330968e-06, + "loss": 0.7335, + "step": 9946 + }, + { + "epoch": 0.8185147089076322, + "grad_norm": 1.5704505159425257, + "learning_rate": 1.6778150855180043e-06, + "loss": 0.672, + "step": 9947 + }, + { + "epoch": 0.8185969965027772, + "grad_norm": 1.950046361825801, + "learning_rate": 1.6763376096976181e-06, + "loss": 0.7372, + "step": 9948 + }, + { + "epoch": 0.8186792840979222, + "grad_norm": 1.9864756047056265, + "learning_rate": 1.6748607251768933e-06, + "loss": 0.7052, + "step": 9949 + }, + { + "epoch": 0.8187615716930673, + "grad_norm": 1.9366154374698266, + "learning_rate": 1.6733844320607517e-06, + "loss": 0.7068, + "step": 9950 + }, + { + "epoch": 0.8188438592882124, + "grad_norm": 2.063707092430174, + "learning_rate": 1.6719087304540615e-06, + "loss": 0.7082, + "step": 9951 + }, + { + "epoch": 0.8189261468833573, + "grad_norm": 0.41843126163347794, + "learning_rate": 1.670433620461659e-06, + "loss": 0.5035, + "step": 9952 + }, + { + "epoch": 0.8190084344785024, + "grad_norm": 1.86703459879513, + "learning_rate": 1.66895910218833e-06, + "loss": 0.7152, + "step": 9953 + }, + { + "epoch": 0.8190907220736474, + "grad_norm": 2.9399523029311374, + "learning_rate": 1.6674851757388232e-06, + "loss": 0.7029, + "step": 9954 + }, + { + "epoch": 0.8191730096687925, + "grad_norm": 2.0419351240552155, + "learning_rate": 1.6660118412178417e-06, + "loss": 0.715, + "step": 9955 + }, + { + "epoch": 0.8192552972639374, + "grad_norm": 1.801089293385041, + "learning_rate": 1.6645390987300525e-06, + "loss": 0.7189, + "step": 9956 + }, + { + "epoch": 0.8193375848590825, + "grad_norm": 1.9265111861212403, + "learning_rate": 1.6630669483800721e-06, + "loss": 0.7149, + "step": 9957 + }, + { + "epoch": 0.8194198724542275, + "grad_norm": 2.0782963269035446, + "learning_rate": 1.6615953902724868e-06, + "loss": 0.7307, + "step": 9958 + }, + { + "epoch": 0.8195021600493726, + "grad_norm": 2.025259782829896, + "learning_rate": 1.6601244245118286e-06, + "loss": 0.702, + "step": 9959 + }, + { + "epoch": 0.8195844476445175, + "grad_norm": 1.7840433985929696, + "learning_rate": 1.658654051202594e-06, + "loss": 0.692, + "step": 9960 + }, + { + "epoch": 0.8196667352396626, + "grad_norm": 1.977444789682622, + "learning_rate": 1.6571842704492325e-06, + "loss": 0.7067, + "step": 9961 + }, + { + "epoch": 0.8197490228348077, + "grad_norm": 1.8621167413863724, + "learning_rate": 1.6557150823561608e-06, + "loss": 0.709, + "step": 9962 + }, + { + "epoch": 0.8198313104299527, + "grad_norm": 2.779866614036762, + "learning_rate": 1.654246487027743e-06, + "loss": 0.7186, + "step": 9963 + }, + { + "epoch": 0.8199135980250977, + "grad_norm": 3.4718386210176013, + "learning_rate": 1.6527784845683104e-06, + "loss": 0.6987, + "step": 9964 + }, + { + "epoch": 0.8199958856202427, + "grad_norm": 1.9154444780309932, + "learning_rate": 1.6513110750821438e-06, + "loss": 0.7043, + "step": 9965 + }, + { + "epoch": 0.8200781732153878, + "grad_norm": 2.0303678898373505, + "learning_rate": 1.6498442586734874e-06, + "loss": 0.7094, + "step": 9966 + }, + { + "epoch": 0.8201604608105328, + "grad_norm": 2.1423444672081864, + "learning_rate": 1.6483780354465374e-06, + "loss": 0.7171, + "step": 9967 + }, + { + "epoch": 0.8202427484056778, + "grad_norm": 0.4220786189943916, + "learning_rate": 1.6469124055054587e-06, + "loss": 0.4709, + "step": 9968 + }, + { + "epoch": 0.8203250360008228, + "grad_norm": 2.067737943034816, + "learning_rate": 1.6454473689543616e-06, + "loss": 0.7272, + "step": 9969 + }, + { + "epoch": 0.8204073235959679, + "grad_norm": 2.0802245750763726, + "learning_rate": 1.643982925897326e-06, + "loss": 0.7056, + "step": 9970 + }, + { + "epoch": 0.820489611191113, + "grad_norm": 1.8673096123633162, + "learning_rate": 1.6425190764383759e-06, + "loss": 0.6818, + "step": 9971 + }, + { + "epoch": 0.820571898786258, + "grad_norm": 0.4158958698067362, + "learning_rate": 1.6410558206815076e-06, + "loss": 0.4646, + "step": 9972 + }, + { + "epoch": 0.820654186381403, + "grad_norm": 1.6539245003196419, + "learning_rate": 1.6395931587306624e-06, + "loss": 0.7295, + "step": 9973 + }, + { + "epoch": 0.820736473976548, + "grad_norm": 2.065431457736314, + "learning_rate": 1.6381310906897518e-06, + "loss": 0.7023, + "step": 9974 + }, + { + "epoch": 0.8208187615716931, + "grad_norm": 0.40152221597989146, + "learning_rate": 1.6366696166626329e-06, + "loss": 0.4695, + "step": 9975 + }, + { + "epoch": 0.8209010491668381, + "grad_norm": 0.39162931713091687, + "learning_rate": 1.6352087367531344e-06, + "loss": 0.4703, + "step": 9976 + }, + { + "epoch": 0.8209833367619831, + "grad_norm": 1.7873266742918539, + "learning_rate": 1.6337484510650236e-06, + "loss": 0.7261, + "step": 9977 + }, + { + "epoch": 0.8210656243571282, + "grad_norm": 1.779131057305479, + "learning_rate": 1.6322887597020465e-06, + "loss": 0.7196, + "step": 9978 + }, + { + "epoch": 0.8211479119522732, + "grad_norm": 2.32153290643034, + "learning_rate": 1.630829662767892e-06, + "loss": 0.7079, + "step": 9979 + }, + { + "epoch": 0.8212301995474183, + "grad_norm": 2.464907796088477, + "learning_rate": 1.6293711603662156e-06, + "loss": 0.6938, + "step": 9980 + }, + { + "epoch": 0.8213124871425632, + "grad_norm": 1.9322379539505563, + "learning_rate": 1.6279132526006225e-06, + "loss": 0.7265, + "step": 9981 + }, + { + "epoch": 0.8213947747377083, + "grad_norm": 2.15501047591358, + "learning_rate": 1.626455939574687e-06, + "loss": 0.6913, + "step": 9982 + }, + { + "epoch": 0.8214770623328533, + "grad_norm": 0.4249739963795962, + "learning_rate": 1.6249992213919263e-06, + "loss": 0.4716, + "step": 9983 + }, + { + "epoch": 0.8215593499279984, + "grad_norm": 1.759892505511181, + "learning_rate": 1.6235430981558287e-06, + "loss": 0.6769, + "step": 9984 + }, + { + "epoch": 0.8216416375231433, + "grad_norm": 1.9573537013091313, + "learning_rate": 1.6220875699698313e-06, + "loss": 0.7368, + "step": 9985 + }, + { + "epoch": 0.8217239251182884, + "grad_norm": 2.29552688574819, + "learning_rate": 1.620632636937336e-06, + "loss": 0.7189, + "step": 9986 + }, + { + "epoch": 0.8218062127134335, + "grad_norm": 1.6256025585383602, + "learning_rate": 1.6191782991616988e-06, + "loss": 0.6981, + "step": 9987 + }, + { + "epoch": 0.8218885003085785, + "grad_norm": 1.7534178879868327, + "learning_rate": 1.6177245567462306e-06, + "loss": 0.7288, + "step": 9988 + }, + { + "epoch": 0.8219707879037235, + "grad_norm": 2.1663828803436953, + "learning_rate": 1.6162714097942033e-06, + "loss": 0.7019, + "step": 9989 + }, + { + "epoch": 0.8220530754988685, + "grad_norm": 1.822934727016988, + "learning_rate": 1.6148188584088486e-06, + "loss": 0.7123, + "step": 9990 + }, + { + "epoch": 0.8221353630940136, + "grad_norm": 1.7213054111724562, + "learning_rate": 1.6133669026933508e-06, + "loss": 0.6871, + "step": 9991 + }, + { + "epoch": 0.8222176506891586, + "grad_norm": 2.0172198001814294, + "learning_rate": 1.611915542750857e-06, + "loss": 0.721, + "step": 9992 + }, + { + "epoch": 0.8222999382843036, + "grad_norm": 1.8250532990408392, + "learning_rate": 1.6104647786844695e-06, + "loss": 0.7136, + "step": 9993 + }, + { + "epoch": 0.8223822258794486, + "grad_norm": 0.4050876972970882, + "learning_rate": 1.609014610597246e-06, + "loss": 0.469, + "step": 9994 + }, + { + "epoch": 0.8224645134745937, + "grad_norm": 1.7524139286868692, + "learning_rate": 1.6075650385922025e-06, + "loss": 0.6842, + "step": 9995 + }, + { + "epoch": 0.8225468010697388, + "grad_norm": 0.41248371412486046, + "learning_rate": 1.6061160627723204e-06, + "loss": 0.4773, + "step": 9996 + }, + { + "epoch": 0.8226290886648838, + "grad_norm": 1.8479348964468312, + "learning_rate": 1.604667683240526e-06, + "loss": 0.701, + "step": 9997 + }, + { + "epoch": 0.8227113762600288, + "grad_norm": 2.0844605435196697, + "learning_rate": 1.6032199000997163e-06, + "loss": 0.7374, + "step": 9998 + }, + { + "epoch": 0.8227936638551738, + "grad_norm": 2.112695613101499, + "learning_rate": 1.601772713452736e-06, + "loss": 0.6935, + "step": 9999 + }, + { + "epoch": 0.8228759514503189, + "grad_norm": 5.34557537065652, + "learning_rate": 1.6003261234023904e-06, + "loss": 0.6894, + "step": 10000 + }, + { + "epoch": 0.822958239045464, + "grad_norm": 1.942647619965594, + "learning_rate": 1.5988801300514422e-06, + "loss": 0.6961, + "step": 10001 + }, + { + "epoch": 0.8230405266406089, + "grad_norm": 1.7888543810942354, + "learning_rate": 1.5974347335026152e-06, + "loss": 0.6968, + "step": 10002 + }, + { + "epoch": 0.823122814235754, + "grad_norm": 2.1778522630671207, + "learning_rate": 1.5959899338585861e-06, + "loss": 0.6997, + "step": 10003 + }, + { + "epoch": 0.823205101830899, + "grad_norm": 1.5568823947335846, + "learning_rate": 1.5945457312219924e-06, + "loss": 0.7253, + "step": 10004 + }, + { + "epoch": 0.8232873894260441, + "grad_norm": 2.041852509349447, + "learning_rate": 1.5931021256954293e-06, + "loss": 0.7034, + "step": 10005 + }, + { + "epoch": 0.823369677021189, + "grad_norm": 1.8992025638519208, + "learning_rate": 1.5916591173814456e-06, + "loss": 0.7314, + "step": 10006 + }, + { + "epoch": 0.8234519646163341, + "grad_norm": 1.808455296187523, + "learning_rate": 1.590216706382548e-06, + "loss": 0.6753, + "step": 10007 + }, + { + "epoch": 0.8235342522114791, + "grad_norm": 1.9678641607619778, + "learning_rate": 1.5887748928012092e-06, + "loss": 0.7107, + "step": 10008 + }, + { + "epoch": 0.8236165398066242, + "grad_norm": 0.42525367029533223, + "learning_rate": 1.5873336767398473e-06, + "loss": 0.4455, + "step": 10009 + }, + { + "epoch": 0.8236988274017691, + "grad_norm": 2.033797796904989, + "learning_rate": 1.5858930583008491e-06, + "loss": 0.6955, + "step": 10010 + }, + { + "epoch": 0.8237811149969142, + "grad_norm": 1.8463444167776082, + "learning_rate": 1.584453037586553e-06, + "loss": 0.7161, + "step": 10011 + }, + { + "epoch": 0.8238634025920593, + "grad_norm": 2.255677752228243, + "learning_rate": 1.5830136146992536e-06, + "loss": 0.7071, + "step": 10012 + }, + { + "epoch": 0.8239456901872043, + "grad_norm": 2.435169189620127, + "learning_rate": 1.5815747897412038e-06, + "loss": 0.7069, + "step": 10013 + }, + { + "epoch": 0.8240279777823493, + "grad_norm": 2.097668305091276, + "learning_rate": 1.5801365628146193e-06, + "loss": 0.7279, + "step": 10014 + }, + { + "epoch": 0.8241102653774943, + "grad_norm": 2.594663646448271, + "learning_rate": 1.5786989340216662e-06, + "loss": 0.7039, + "step": 10015 + }, + { + "epoch": 0.8241925529726394, + "grad_norm": 1.9607771458092267, + "learning_rate": 1.5772619034644755e-06, + "loss": 0.7502, + "step": 10016 + }, + { + "epoch": 0.8242748405677844, + "grad_norm": 1.9243505617835626, + "learning_rate": 1.5758254712451281e-06, + "loss": 0.7107, + "step": 10017 + }, + { + "epoch": 0.8243571281629294, + "grad_norm": 0.41326796950946826, + "learning_rate": 1.5743896374656675e-06, + "loss": 0.4786, + "step": 10018 + }, + { + "epoch": 0.8244394157580744, + "grad_norm": 2.5113411531786154, + "learning_rate": 1.5729544022280897e-06, + "loss": 0.7162, + "step": 10019 + }, + { + "epoch": 0.8245217033532195, + "grad_norm": 0.43064605573487225, + "learning_rate": 1.5715197656343562e-06, + "loss": 0.46, + "step": 10020 + }, + { + "epoch": 0.8246039909483646, + "grad_norm": 0.3985651471902848, + "learning_rate": 1.5700857277863767e-06, + "loss": 0.4567, + "step": 10021 + }, + { + "epoch": 0.8246862785435096, + "grad_norm": 1.7973735253342467, + "learning_rate": 1.5686522887860288e-06, + "loss": 0.7135, + "step": 10022 + }, + { + "epoch": 0.8247685661386546, + "grad_norm": 1.607480190772272, + "learning_rate": 1.5672194487351378e-06, + "loss": 0.6821, + "step": 10023 + }, + { + "epoch": 0.8248508537337996, + "grad_norm": 2.3123639777479257, + "learning_rate": 1.5657872077354908e-06, + "loss": 0.7326, + "step": 10024 + }, + { + "epoch": 0.8249331413289447, + "grad_norm": 2.053080212864445, + "learning_rate": 1.5643555658888299e-06, + "loss": 0.6956, + "step": 10025 + }, + { + "epoch": 0.8250154289240897, + "grad_norm": 0.4185245322296301, + "learning_rate": 1.5629245232968605e-06, + "loss": 0.4551, + "step": 10026 + }, + { + "epoch": 0.8250977165192347, + "grad_norm": 1.7436580203509053, + "learning_rate": 1.5614940800612376e-06, + "loss": 0.7, + "step": 10027 + }, + { + "epoch": 0.8251800041143798, + "grad_norm": 2.21410374181179, + "learning_rate": 1.5600642362835828e-06, + "loss": 0.7067, + "step": 10028 + }, + { + "epoch": 0.8252622917095248, + "grad_norm": 2.007884879496195, + "learning_rate": 1.5586349920654675e-06, + "loss": 0.7153, + "step": 10029 + }, + { + "epoch": 0.8253445793046699, + "grad_norm": 2.575404756842271, + "learning_rate": 1.5572063475084188e-06, + "loss": 0.7191, + "step": 10030 + }, + { + "epoch": 0.8254268668998148, + "grad_norm": 1.8558782737258355, + "learning_rate": 1.5557783027139317e-06, + "loss": 0.7181, + "step": 10031 + }, + { + "epoch": 0.8255091544949599, + "grad_norm": 1.825773648739784, + "learning_rate": 1.5543508577834477e-06, + "loss": 0.7081, + "step": 10032 + }, + { + "epoch": 0.8255914420901049, + "grad_norm": 0.4030609488556746, + "learning_rate": 1.5529240128183732e-06, + "loss": 0.4801, + "step": 10033 + }, + { + "epoch": 0.82567372968525, + "grad_norm": 1.8511085817494681, + "learning_rate": 1.5514977679200683e-06, + "loss": 0.6791, + "step": 10034 + }, + { + "epoch": 0.8257560172803949, + "grad_norm": 0.41885480094910416, + "learning_rate": 1.5500721231898508e-06, + "loss": 0.4532, + "step": 10035 + }, + { + "epoch": 0.82583830487554, + "grad_norm": 2.11291184865842, + "learning_rate": 1.5486470787289932e-06, + "loss": 0.7012, + "step": 10036 + }, + { + "epoch": 0.825920592470685, + "grad_norm": 1.7185138232537436, + "learning_rate": 1.5472226346387341e-06, + "loss": 0.7188, + "step": 10037 + }, + { + "epoch": 0.8260028800658301, + "grad_norm": 2.330939591308333, + "learning_rate": 1.5457987910202577e-06, + "loss": 0.7156, + "step": 10038 + }, + { + "epoch": 0.8260851676609751, + "grad_norm": 0.43176123362125096, + "learning_rate": 1.544375547974718e-06, + "loss": 0.4999, + "step": 10039 + }, + { + "epoch": 0.8261674552561201, + "grad_norm": 0.4116631939085329, + "learning_rate": 1.5429529056032156e-06, + "loss": 0.4469, + "step": 10040 + }, + { + "epoch": 0.8262497428512652, + "grad_norm": 2.113697907358001, + "learning_rate": 1.5415308640068139e-06, + "loss": 0.7027, + "step": 10041 + }, + { + "epoch": 0.8263320304464102, + "grad_norm": 1.9753025022487212, + "learning_rate": 1.540109423286531e-06, + "loss": 0.7255, + "step": 10042 + }, + { + "epoch": 0.8264143180415552, + "grad_norm": 2.007632925065888, + "learning_rate": 1.5386885835433462e-06, + "loss": 0.7022, + "step": 10043 + }, + { + "epoch": 0.8264966056367002, + "grad_norm": 1.867330509984371, + "learning_rate": 1.537268344878191e-06, + "loss": 0.7144, + "step": 10044 + }, + { + "epoch": 0.8265788932318453, + "grad_norm": 1.781621725670015, + "learning_rate": 1.5358487073919604e-06, + "loss": 0.7009, + "step": 10045 + }, + { + "epoch": 0.8266611808269904, + "grad_norm": 2.4007381582520964, + "learning_rate": 1.534429671185501e-06, + "loss": 0.7267, + "step": 10046 + }, + { + "epoch": 0.8267434684221353, + "grad_norm": 1.9858925367942084, + "learning_rate": 1.5330112363596196e-06, + "loss": 0.7199, + "step": 10047 + }, + { + "epoch": 0.8268257560172804, + "grad_norm": 1.7481657998231535, + "learning_rate": 1.5315934030150759e-06, + "loss": 0.7146, + "step": 10048 + }, + { + "epoch": 0.8269080436124254, + "grad_norm": 0.43831271091345886, + "learning_rate": 1.5301761712525965e-06, + "loss": 0.4912, + "step": 10049 + }, + { + "epoch": 0.8269903312075705, + "grad_norm": 0.39678586582825837, + "learning_rate": 1.528759541172854e-06, + "loss": 0.4592, + "step": 10050 + }, + { + "epoch": 0.8270726188027155, + "grad_norm": 2.023524646999891, + "learning_rate": 1.5273435128764913e-06, + "loss": 0.6909, + "step": 10051 + }, + { + "epoch": 0.8271549063978605, + "grad_norm": 1.6240189912719936, + "learning_rate": 1.52592808646409e-06, + "loss": 0.7185, + "step": 10052 + }, + { + "epoch": 0.8272371939930055, + "grad_norm": 2.1855114988162576, + "learning_rate": 1.524513262036208e-06, + "loss": 0.7067, + "step": 10053 + }, + { + "epoch": 0.8273194815881506, + "grad_norm": 0.4125694065691081, + "learning_rate": 1.5230990396933465e-06, + "loss": 0.4945, + "step": 10054 + }, + { + "epoch": 0.8274017691832957, + "grad_norm": 1.647185905283757, + "learning_rate": 1.5216854195359742e-06, + "loss": 0.6963, + "step": 10055 + }, + { + "epoch": 0.8274840567784406, + "grad_norm": 1.7529832858229137, + "learning_rate": 1.5202724016645099e-06, + "loss": 0.7448, + "step": 10056 + }, + { + "epoch": 0.8275663443735857, + "grad_norm": 1.9095718850664822, + "learning_rate": 1.5188599861793362e-06, + "loss": 0.7104, + "step": 10057 + }, + { + "epoch": 0.8276486319687307, + "grad_norm": 1.7122808327408747, + "learning_rate": 1.517448173180781e-06, + "loss": 0.714, + "step": 10058 + }, + { + "epoch": 0.8277309195638758, + "grad_norm": 1.96575090018373, + "learning_rate": 1.516036962769145e-06, + "loss": 0.6862, + "step": 10059 + }, + { + "epoch": 0.8278132071590207, + "grad_norm": 0.4244015676025212, + "learning_rate": 1.5146263550446715e-06, + "loss": 0.4612, + "step": 10060 + }, + { + "epoch": 0.8278954947541658, + "grad_norm": 1.915017833834617, + "learning_rate": 1.5132163501075747e-06, + "loss": 0.699, + "step": 10061 + }, + { + "epoch": 0.8279777823493109, + "grad_norm": 2.274441849678985, + "learning_rate": 1.5118069480580134e-06, + "loss": 0.7134, + "step": 10062 + }, + { + "epoch": 0.8280600699444559, + "grad_norm": 2.602633736954954, + "learning_rate": 1.510398148996115e-06, + "loss": 0.7342, + "step": 10063 + }, + { + "epoch": 0.8281423575396009, + "grad_norm": 2.427676355862514, + "learning_rate": 1.508989953021952e-06, + "loss": 0.6991, + "step": 10064 + }, + { + "epoch": 0.8282246451347459, + "grad_norm": 1.8590590583134612, + "learning_rate": 1.5075823602355645e-06, + "loss": 0.7021, + "step": 10065 + }, + { + "epoch": 0.828306932729891, + "grad_norm": 1.9535877327554934, + "learning_rate": 1.5061753707369431e-06, + "loss": 0.7014, + "step": 10066 + }, + { + "epoch": 0.828389220325036, + "grad_norm": 2.051353783486264, + "learning_rate": 1.5047689846260415e-06, + "loss": 0.7006, + "step": 10067 + }, + { + "epoch": 0.828471507920181, + "grad_norm": 0.43029202554528617, + "learning_rate": 1.5033632020027645e-06, + "loss": 0.4841, + "step": 10068 + }, + { + "epoch": 0.828553795515326, + "grad_norm": 1.8165666279721606, + "learning_rate": 1.5019580229669783e-06, + "loss": 0.696, + "step": 10069 + }, + { + "epoch": 0.8286360831104711, + "grad_norm": 2.1421031430980544, + "learning_rate": 1.5005534476185002e-06, + "loss": 0.717, + "step": 10070 + }, + { + "epoch": 0.8287183707056162, + "grad_norm": 2.5628492549853092, + "learning_rate": 1.4991494760571157e-06, + "loss": 0.7102, + "step": 10071 + }, + { + "epoch": 0.8288006583007611, + "grad_norm": 2.3273797305043264, + "learning_rate": 1.497746108382554e-06, + "loss": 0.707, + "step": 10072 + }, + { + "epoch": 0.8288829458959062, + "grad_norm": 2.393256928615148, + "learning_rate": 1.4963433446945142e-06, + "loss": 0.7125, + "step": 10073 + }, + { + "epoch": 0.8289652334910512, + "grad_norm": 2.156812395494318, + "learning_rate": 1.4949411850926443e-06, + "loss": 0.7108, + "step": 10074 + }, + { + "epoch": 0.8290475210861963, + "grad_norm": 2.643111858415067, + "learning_rate": 1.4935396296765492e-06, + "loss": 0.6983, + "step": 10075 + }, + { + "epoch": 0.8291298086813413, + "grad_norm": 2.7270680690578986, + "learning_rate": 1.4921386785457937e-06, + "loss": 0.7097, + "step": 10076 + }, + { + "epoch": 0.8292120962764863, + "grad_norm": 1.902483435845288, + "learning_rate": 1.4907383317999026e-06, + "loss": 0.7114, + "step": 10077 + }, + { + "epoch": 0.8292943838716313, + "grad_norm": 1.8996231310794633, + "learning_rate": 1.4893385895383483e-06, + "loss": 0.7061, + "step": 10078 + }, + { + "epoch": 0.8293766714667764, + "grad_norm": 1.7242398297898025, + "learning_rate": 1.4879394518605728e-06, + "loss": 0.722, + "step": 10079 + }, + { + "epoch": 0.8294589590619215, + "grad_norm": 2.5868638585184645, + "learning_rate": 1.4865409188659653e-06, + "loss": 0.7225, + "step": 10080 + }, + { + "epoch": 0.8295412466570664, + "grad_norm": 0.4364274445715279, + "learning_rate": 1.485142990653876e-06, + "loss": 0.4919, + "step": 10081 + }, + { + "epoch": 0.8296235342522115, + "grad_norm": 0.4100145987294864, + "learning_rate": 1.4837456673236094e-06, + "loss": 0.4741, + "step": 10082 + }, + { + "epoch": 0.8297058218473565, + "grad_norm": 12.445948320800504, + "learning_rate": 1.4823489489744325e-06, + "loss": 0.7134, + "step": 10083 + }, + { + "epoch": 0.8297881094425016, + "grad_norm": 2.646672426731111, + "learning_rate": 1.4809528357055626e-06, + "loss": 0.7286, + "step": 10084 + }, + { + "epoch": 0.8298703970376465, + "grad_norm": 1.6404183209841674, + "learning_rate": 1.4795573276161801e-06, + "loss": 0.7253, + "step": 10085 + }, + { + "epoch": 0.8299526846327916, + "grad_norm": 1.9357541452092004, + "learning_rate": 1.4781624248054194e-06, + "loss": 0.7112, + "step": 10086 + }, + { + "epoch": 0.8300349722279367, + "grad_norm": 1.9274030419251027, + "learning_rate": 1.476768127372371e-06, + "loss": 0.7222, + "step": 10087 + }, + { + "epoch": 0.8301172598230817, + "grad_norm": 1.7149876458969746, + "learning_rate": 1.4753744354160827e-06, + "loss": 0.7005, + "step": 10088 + }, + { + "epoch": 0.8301995474182267, + "grad_norm": 2.393316073673153, + "learning_rate": 1.4739813490355625e-06, + "loss": 0.7099, + "step": 10089 + }, + { + "epoch": 0.8302818350133717, + "grad_norm": 2.0549554399929657, + "learning_rate": 1.4725888683297696e-06, + "loss": 0.7169, + "step": 10090 + }, + { + "epoch": 0.8303641226085168, + "grad_norm": 2.0878511921399765, + "learning_rate": 1.4711969933976288e-06, + "loss": 0.7212, + "step": 10091 + }, + { + "epoch": 0.8304464102036618, + "grad_norm": 2.077146448761268, + "learning_rate": 1.4698057243380137e-06, + "loss": 0.7109, + "step": 10092 + }, + { + "epoch": 0.8305286977988068, + "grad_norm": 1.9593382410590092, + "learning_rate": 1.4684150612497584e-06, + "loss": 0.7362, + "step": 10093 + }, + { + "epoch": 0.8306109853939518, + "grad_norm": 2.236473502812876, + "learning_rate": 1.4670250042316503e-06, + "loss": 0.7307, + "step": 10094 + }, + { + "epoch": 0.8306932729890969, + "grad_norm": 4.224918233796886, + "learning_rate": 1.4656355533824408e-06, + "loss": 0.7036, + "step": 10095 + }, + { + "epoch": 0.830775560584242, + "grad_norm": 1.9946229397118196, + "learning_rate": 1.4642467088008317e-06, + "loss": 0.729, + "step": 10096 + }, + { + "epoch": 0.8308578481793869, + "grad_norm": 1.8496499832439905, + "learning_rate": 1.4628584705854875e-06, + "loss": 0.729, + "step": 10097 + }, + { + "epoch": 0.830940135774532, + "grad_norm": 0.43422073112180004, + "learning_rate": 1.4614708388350241e-06, + "loss": 0.4762, + "step": 10098 + }, + { + "epoch": 0.831022423369677, + "grad_norm": 2.3824373991579773, + "learning_rate": 1.4600838136480167e-06, + "loss": 0.6918, + "step": 10099 + }, + { + "epoch": 0.8311047109648221, + "grad_norm": 1.984254600042265, + "learning_rate": 1.4586973951229954e-06, + "loss": 0.7411, + "step": 10100 + }, + { + "epoch": 0.8311869985599671, + "grad_norm": 0.41109435260129096, + "learning_rate": 1.457311583358454e-06, + "loss": 0.4637, + "step": 10101 + }, + { + "epoch": 0.8312692861551121, + "grad_norm": 1.826305869520841, + "learning_rate": 1.4559263784528332e-06, + "loss": 0.7079, + "step": 10102 + }, + { + "epoch": 0.8313515737502571, + "grad_norm": 2.052868474494131, + "learning_rate": 1.4545417805045415e-06, + "loss": 0.7204, + "step": 10103 + }, + { + "epoch": 0.8314338613454022, + "grad_norm": 1.9246045972914487, + "learning_rate": 1.4531577896119343e-06, + "loss": 0.7169, + "step": 10104 + }, + { + "epoch": 0.8315161489405473, + "grad_norm": 1.6835207155705056, + "learning_rate": 1.4517744058733295e-06, + "loss": 0.7099, + "step": 10105 + }, + { + "epoch": 0.8315984365356922, + "grad_norm": 1.8776032454424876, + "learning_rate": 1.4503916293869968e-06, + "loss": 0.6979, + "step": 10106 + }, + { + "epoch": 0.8316807241308373, + "grad_norm": 3.2233058840815927, + "learning_rate": 1.4490094602511728e-06, + "loss": 0.6871, + "step": 10107 + }, + { + "epoch": 0.8317630117259823, + "grad_norm": 1.925288621748184, + "learning_rate": 1.4476278985640391e-06, + "loss": 0.7092, + "step": 10108 + }, + { + "epoch": 0.8318452993211274, + "grad_norm": 1.790538162842461, + "learning_rate": 1.4462469444237448e-06, + "loss": 0.7013, + "step": 10109 + }, + { + "epoch": 0.8319275869162723, + "grad_norm": 2.8123816908153962, + "learning_rate": 1.444866597928387e-06, + "loss": 0.7345, + "step": 10110 + }, + { + "epoch": 0.8320098745114174, + "grad_norm": 3.675385786133351, + "learning_rate": 1.4434868591760244e-06, + "loss": 0.7145, + "step": 10111 + }, + { + "epoch": 0.8320921621065624, + "grad_norm": 1.9999311067615517, + "learning_rate": 1.442107728264669e-06, + "loss": 0.7075, + "step": 10112 + }, + { + "epoch": 0.8321744497017075, + "grad_norm": 1.968252975228794, + "learning_rate": 1.440729205292296e-06, + "loss": 0.6898, + "step": 10113 + }, + { + "epoch": 0.8322567372968525, + "grad_norm": 3.1505404795675482, + "learning_rate": 1.4393512903568307e-06, + "loss": 0.7134, + "step": 10114 + }, + { + "epoch": 0.8323390248919975, + "grad_norm": 1.9375528933998276, + "learning_rate": 1.4379739835561601e-06, + "loss": 0.6985, + "step": 10115 + }, + { + "epoch": 0.8324213124871426, + "grad_norm": 2.0346425428362247, + "learning_rate": 1.436597284988125e-06, + "loss": 0.7101, + "step": 10116 + }, + { + "epoch": 0.8325036000822876, + "grad_norm": 1.9065783066211603, + "learning_rate": 1.4352211947505235e-06, + "loss": 0.716, + "step": 10117 + }, + { + "epoch": 0.8325858876774326, + "grad_norm": 1.7899388456748069, + "learning_rate": 1.4338457129411099e-06, + "loss": 0.7345, + "step": 10118 + }, + { + "epoch": 0.8326681752725776, + "grad_norm": 0.4089066214584763, + "learning_rate": 1.432470839657598e-06, + "loss": 0.4483, + "step": 10119 + }, + { + "epoch": 0.8327504628677227, + "grad_norm": 2.443020953606305, + "learning_rate": 1.4310965749976547e-06, + "loss": 0.7061, + "step": 10120 + }, + { + "epoch": 0.8328327504628678, + "grad_norm": 1.996303838316402, + "learning_rate": 1.4297229190589112e-06, + "loss": 0.7086, + "step": 10121 + }, + { + "epoch": 0.8329150380580127, + "grad_norm": 1.7810773770798305, + "learning_rate": 1.42834987193894e-06, + "loss": 0.7266, + "step": 10122 + }, + { + "epoch": 0.8329973256531578, + "grad_norm": 1.8837646847533005, + "learning_rate": 1.4269774337352892e-06, + "loss": 0.7219, + "step": 10123 + }, + { + "epoch": 0.8330796132483028, + "grad_norm": 2.1534802001835813, + "learning_rate": 1.425605604545448e-06, + "loss": 0.6972, + "step": 10124 + }, + { + "epoch": 0.8331619008434479, + "grad_norm": 1.8177288395426725, + "learning_rate": 1.4242343844668738e-06, + "loss": 0.7202, + "step": 10125 + }, + { + "epoch": 0.8332441884385929, + "grad_norm": 2.386208855976913, + "learning_rate": 1.4228637735969718e-06, + "loss": 0.712, + "step": 10126 + }, + { + "epoch": 0.8333264760337379, + "grad_norm": 1.8829597686276847, + "learning_rate": 1.4214937720331157e-06, + "loss": 0.7156, + "step": 10127 + }, + { + "epoch": 0.8334087636288829, + "grad_norm": 2.4661728037700223, + "learning_rate": 1.4201243798726171e-06, + "loss": 0.7135, + "step": 10128 + }, + { + "epoch": 0.833491051224028, + "grad_norm": 1.9704190876792804, + "learning_rate": 1.4187555972127641e-06, + "loss": 0.7411, + "step": 10129 + }, + { + "epoch": 0.8335733388191731, + "grad_norm": 1.754078157504742, + "learning_rate": 1.4173874241507868e-06, + "loss": 0.701, + "step": 10130 + }, + { + "epoch": 0.833655626414318, + "grad_norm": 1.8867439097486627, + "learning_rate": 1.4160198607838838e-06, + "loss": 0.716, + "step": 10131 + }, + { + "epoch": 0.8337379140094631, + "grad_norm": 1.8243422334533033, + "learning_rate": 1.4146529072091997e-06, + "loss": 0.7266, + "step": 10132 + }, + { + "epoch": 0.8338202016046081, + "grad_norm": 1.621127346935549, + "learning_rate": 1.413286563523848e-06, + "loss": 0.7091, + "step": 10133 + }, + { + "epoch": 0.8339024891997532, + "grad_norm": 0.4080823486280811, + "learning_rate": 1.411920829824881e-06, + "loss": 0.462, + "step": 10134 + }, + { + "epoch": 0.8339847767948981, + "grad_norm": 1.9498163389997252, + "learning_rate": 1.4105557062093255e-06, + "loss": 0.6806, + "step": 10135 + }, + { + "epoch": 0.8340670643900432, + "grad_norm": 1.9354989225499333, + "learning_rate": 1.4091911927741542e-06, + "loss": 0.7266, + "step": 10136 + }, + { + "epoch": 0.8341493519851882, + "grad_norm": 0.42303277856531296, + "learning_rate": 1.4078272896163036e-06, + "loss": 0.4791, + "step": 10137 + }, + { + "epoch": 0.8342316395803333, + "grad_norm": 1.8932294208214406, + "learning_rate": 1.406463996832661e-06, + "loss": 0.7006, + "step": 10138 + }, + { + "epoch": 0.8343139271754783, + "grad_norm": 1.9721520948085813, + "learning_rate": 1.4051013145200732e-06, + "loss": 0.7179, + "step": 10139 + }, + { + "epoch": 0.8343962147706233, + "grad_norm": 1.7801694161436472, + "learning_rate": 1.4037392427753406e-06, + "loss": 0.681, + "step": 10140 + }, + { + "epoch": 0.8344785023657684, + "grad_norm": 1.9635353214373499, + "learning_rate": 1.4023777816952255e-06, + "loss": 0.709, + "step": 10141 + }, + { + "epoch": 0.8345607899609134, + "grad_norm": 2.0458610022205597, + "learning_rate": 1.4010169313764421e-06, + "loss": 0.7102, + "step": 10142 + }, + { + "epoch": 0.8346430775560584, + "grad_norm": 2.375634780460858, + "learning_rate": 1.399656691915665e-06, + "loss": 0.7003, + "step": 10143 + }, + { + "epoch": 0.8347253651512034, + "grad_norm": 0.3965093076916356, + "learning_rate": 1.3982970634095227e-06, + "loss": 0.475, + "step": 10144 + }, + { + "epoch": 0.8348076527463485, + "grad_norm": 1.9474564565258727, + "learning_rate": 1.3969380459546012e-06, + "loss": 0.6993, + "step": 10145 + }, + { + "epoch": 0.8348899403414936, + "grad_norm": 0.4136701973764035, + "learning_rate": 1.3955796396474398e-06, + "loss": 0.4749, + "step": 10146 + }, + { + "epoch": 0.8349722279366385, + "grad_norm": 1.728675109115516, + "learning_rate": 1.3942218445845412e-06, + "loss": 0.7274, + "step": 10147 + }, + { + "epoch": 0.8350545155317836, + "grad_norm": 0.4169090119134647, + "learning_rate": 1.3928646608623587e-06, + "loss": 0.4632, + "step": 10148 + }, + { + "epoch": 0.8351368031269286, + "grad_norm": 2.251380355439002, + "learning_rate": 1.391508088577308e-06, + "loss": 0.7125, + "step": 10149 + }, + { + "epoch": 0.8352190907220737, + "grad_norm": 2.2336073958772626, + "learning_rate": 1.3901521278257546e-06, + "loss": 0.6883, + "step": 10150 + }, + { + "epoch": 0.8353013783172187, + "grad_norm": 2.1089170959299146, + "learning_rate": 1.3887967787040257e-06, + "loss": 0.7222, + "step": 10151 + }, + { + "epoch": 0.8353836659123637, + "grad_norm": 2.067478080883, + "learning_rate": 1.387442041308399e-06, + "loss": 0.6918, + "step": 10152 + }, + { + "epoch": 0.8354659535075087, + "grad_norm": 1.870562257005282, + "learning_rate": 1.3860879157351181e-06, + "loss": 0.7259, + "step": 10153 + }, + { + "epoch": 0.8355482411026538, + "grad_norm": 3.0059132545378007, + "learning_rate": 1.3847344020803744e-06, + "loss": 0.7191, + "step": 10154 + }, + { + "epoch": 0.8356305286977989, + "grad_norm": 1.9165016109183002, + "learning_rate": 1.3833815004403218e-06, + "loss": 0.7043, + "step": 10155 + }, + { + "epoch": 0.8357128162929438, + "grad_norm": 3.1644959525369645, + "learning_rate": 1.3820292109110677e-06, + "loss": 0.6813, + "step": 10156 + }, + { + "epoch": 0.8357951038880889, + "grad_norm": 2.4358270440219503, + "learning_rate": 1.3806775335886757e-06, + "loss": 0.71, + "step": 10157 + }, + { + "epoch": 0.8358773914832339, + "grad_norm": 1.7722567166884942, + "learning_rate": 1.3793264685691654e-06, + "loss": 0.6911, + "step": 10158 + }, + { + "epoch": 0.835959679078379, + "grad_norm": 2.3738457937895454, + "learning_rate": 1.3779760159485188e-06, + "loss": 0.6982, + "step": 10159 + }, + { + "epoch": 0.8360419666735239, + "grad_norm": 1.7654088853326046, + "learning_rate": 1.376626175822664e-06, + "loss": 0.7342, + "step": 10160 + }, + { + "epoch": 0.836124254268669, + "grad_norm": 0.4056974000116889, + "learning_rate": 1.3752769482874972e-06, + "loss": 0.4679, + "step": 10161 + }, + { + "epoch": 0.836206541863814, + "grad_norm": 1.98967507976564, + "learning_rate": 1.373928333438862e-06, + "loss": 0.7239, + "step": 10162 + }, + { + "epoch": 0.8362888294589591, + "grad_norm": 0.41272727854485275, + "learning_rate": 1.372580331372564e-06, + "loss": 0.4667, + "step": 10163 + }, + { + "epoch": 0.836371117054104, + "grad_norm": 1.8613880439420278, + "learning_rate": 1.3712329421843584e-06, + "loss": 0.7312, + "step": 10164 + }, + { + "epoch": 0.8364534046492491, + "grad_norm": 2.2187084194699733, + "learning_rate": 1.369886165969968e-06, + "loss": 0.729, + "step": 10165 + }, + { + "epoch": 0.8365356922443942, + "grad_norm": 2.638054371458598, + "learning_rate": 1.3685400028250596e-06, + "loss": 0.6978, + "step": 10166 + }, + { + "epoch": 0.8366179798395392, + "grad_norm": 1.5886442222073636, + "learning_rate": 1.3671944528452673e-06, + "loss": 0.6963, + "step": 10167 + }, + { + "epoch": 0.8367002674346842, + "grad_norm": 2.2922936720514113, + "learning_rate": 1.3658495161261765e-06, + "loss": 0.7276, + "step": 10168 + }, + { + "epoch": 0.8367825550298292, + "grad_norm": 0.4111768229899866, + "learning_rate": 1.3645051927633268e-06, + "loss": 0.4825, + "step": 10169 + }, + { + "epoch": 0.8368648426249743, + "grad_norm": 0.4097850661973844, + "learning_rate": 1.3631614828522155e-06, + "loss": 0.4459, + "step": 10170 + }, + { + "epoch": 0.8369471302201194, + "grad_norm": 2.0960762886176205, + "learning_rate": 1.3618183864883016e-06, + "loss": 0.6976, + "step": 10171 + }, + { + "epoch": 0.8370294178152643, + "grad_norm": 1.9639660585981882, + "learning_rate": 1.3604759037669935e-06, + "loss": 0.7102, + "step": 10172 + }, + { + "epoch": 0.8371117054104094, + "grad_norm": 1.8165544384650667, + "learning_rate": 1.3591340347836624e-06, + "loss": 0.7139, + "step": 10173 + }, + { + "epoch": 0.8371939930055544, + "grad_norm": 2.0221367328750786, + "learning_rate": 1.3577927796336299e-06, + "loss": 0.7019, + "step": 10174 + }, + { + "epoch": 0.8372762806006995, + "grad_norm": 2.33650362896097, + "learning_rate": 1.3564521384121775e-06, + "loss": 0.7313, + "step": 10175 + }, + { + "epoch": 0.8373585681958444, + "grad_norm": 2.1962342969037176, + "learning_rate": 1.3551121112145394e-06, + "loss": 0.7155, + "step": 10176 + }, + { + "epoch": 0.8374408557909895, + "grad_norm": 1.690460932575896, + "learning_rate": 1.3537726981359144e-06, + "loss": 0.7041, + "step": 10177 + }, + { + "epoch": 0.8375231433861345, + "grad_norm": 0.4388451407883307, + "learning_rate": 1.352433899271447e-06, + "loss": 0.4951, + "step": 10178 + }, + { + "epoch": 0.8376054309812796, + "grad_norm": 1.826099130179524, + "learning_rate": 1.3510957147162474e-06, + "loss": 0.7225, + "step": 10179 + }, + { + "epoch": 0.8376877185764247, + "grad_norm": 1.7956575447804923, + "learning_rate": 1.3497581445653763e-06, + "loss": 0.6834, + "step": 10180 + }, + { + "epoch": 0.8377700061715696, + "grad_norm": 0.4090679611970876, + "learning_rate": 1.3484211889138531e-06, + "loss": 0.4589, + "step": 10181 + }, + { + "epoch": 0.8378522937667147, + "grad_norm": 1.647467565523533, + "learning_rate": 1.3470848478566501e-06, + "loss": 0.6979, + "step": 10182 + }, + { + "epoch": 0.8379345813618597, + "grad_norm": 1.844941810806294, + "learning_rate": 1.3457491214887032e-06, + "loss": 0.7078, + "step": 10183 + }, + { + "epoch": 0.8380168689570048, + "grad_norm": 2.0030995791031176, + "learning_rate": 1.3444140099048951e-06, + "loss": 0.7092, + "step": 10184 + }, + { + "epoch": 0.8380991565521497, + "grad_norm": 1.9364679134544889, + "learning_rate": 1.3430795132000772e-06, + "loss": 0.7287, + "step": 10185 + }, + { + "epoch": 0.8381814441472948, + "grad_norm": 0.419515838995266, + "learning_rate": 1.3417456314690447e-06, + "loss": 0.4741, + "step": 10186 + }, + { + "epoch": 0.8382637317424398, + "grad_norm": 1.7354119009939883, + "learning_rate": 1.340412364806557e-06, + "loss": 0.6807, + "step": 10187 + }, + { + "epoch": 0.8383460193375849, + "grad_norm": 2.006024353027689, + "learning_rate": 1.339079713307322e-06, + "loss": 0.7069, + "step": 10188 + }, + { + "epoch": 0.8384283069327298, + "grad_norm": 1.7655651917817634, + "learning_rate": 1.3377476770660169e-06, + "loss": 0.7196, + "step": 10189 + }, + { + "epoch": 0.8385105945278749, + "grad_norm": 1.926485392629462, + "learning_rate": 1.3364162561772608e-06, + "loss": 0.7346, + "step": 10190 + }, + { + "epoch": 0.83859288212302, + "grad_norm": 1.7802902604853892, + "learning_rate": 1.3350854507356425e-06, + "loss": 0.6838, + "step": 10191 + }, + { + "epoch": 0.838675169718165, + "grad_norm": 1.7179889243723534, + "learning_rate": 1.3337552608356918e-06, + "loss": 0.7206, + "step": 10192 + }, + { + "epoch": 0.83875745731331, + "grad_norm": 1.9682005652656887, + "learning_rate": 1.3324256865719109e-06, + "loss": 0.7266, + "step": 10193 + }, + { + "epoch": 0.838839744908455, + "grad_norm": 1.9157483051822015, + "learning_rate": 1.3310967280387444e-06, + "loss": 0.7175, + "step": 10194 + }, + { + "epoch": 0.8389220325036001, + "grad_norm": 2.5581894462391053, + "learning_rate": 1.3297683853306054e-06, + "loss": 0.7105, + "step": 10195 + }, + { + "epoch": 0.8390043200987451, + "grad_norm": 3.3618986861453766, + "learning_rate": 1.3284406585418518e-06, + "loss": 0.6854, + "step": 10196 + }, + { + "epoch": 0.8390866076938901, + "grad_norm": 1.8623936276084858, + "learning_rate": 1.3271135477668095e-06, + "loss": 0.7075, + "step": 10197 + }, + { + "epoch": 0.8391688952890352, + "grad_norm": 0.4149830555856918, + "learning_rate": 1.3257870530997475e-06, + "loss": 0.4407, + "step": 10198 + }, + { + "epoch": 0.8392511828841802, + "grad_norm": 2.7451452278181407, + "learning_rate": 1.3244611746349024e-06, + "loss": 0.7115, + "step": 10199 + }, + { + "epoch": 0.8393334704793253, + "grad_norm": 1.8245384494512142, + "learning_rate": 1.3231359124664578e-06, + "loss": 0.7255, + "step": 10200 + }, + { + "epoch": 0.8394157580744702, + "grad_norm": 1.89717141558931, + "learning_rate": 1.3218112666885651e-06, + "loss": 0.7096, + "step": 10201 + }, + { + "epoch": 0.8394980456696153, + "grad_norm": 1.9704734650733835, + "learning_rate": 1.3204872373953203e-06, + "loss": 0.7141, + "step": 10202 + }, + { + "epoch": 0.8395803332647603, + "grad_norm": 1.8314784367935366, + "learning_rate": 1.3191638246807814e-06, + "loss": 0.7012, + "step": 10203 + }, + { + "epoch": 0.8396626208599054, + "grad_norm": 0.41982641294506706, + "learning_rate": 1.3178410286389598e-06, + "loss": 0.475, + "step": 10204 + }, + { + "epoch": 0.8397449084550505, + "grad_norm": 1.8474606257899582, + "learning_rate": 1.316518849363828e-06, + "loss": 0.708, + "step": 10205 + }, + { + "epoch": 0.8398271960501954, + "grad_norm": 1.8299163903591524, + "learning_rate": 1.315197286949309e-06, + "loss": 0.6975, + "step": 10206 + }, + { + "epoch": 0.8399094836453405, + "grad_norm": 1.8840750941254798, + "learning_rate": 1.3138763414892863e-06, + "loss": 0.6954, + "step": 10207 + }, + { + "epoch": 0.8399917712404855, + "grad_norm": 2.6494127425654, + "learning_rate": 1.3125560130775973e-06, + "loss": 0.683, + "step": 10208 + }, + { + "epoch": 0.8400740588356306, + "grad_norm": 2.0806215267411283, + "learning_rate": 1.3112363018080365e-06, + "loss": 0.7188, + "step": 10209 + }, + { + "epoch": 0.8401563464307755, + "grad_norm": 1.9832341244282738, + "learning_rate": 1.3099172077743495e-06, + "loss": 0.7088, + "step": 10210 + }, + { + "epoch": 0.8402386340259206, + "grad_norm": 2.0235633641195454, + "learning_rate": 1.3085987310702497e-06, + "loss": 0.7121, + "step": 10211 + }, + { + "epoch": 0.8403209216210656, + "grad_norm": 2.4212823729157966, + "learning_rate": 1.3072808717893938e-06, + "loss": 0.7128, + "step": 10212 + }, + { + "epoch": 0.8404032092162107, + "grad_norm": 2.2289666160614203, + "learning_rate": 1.305963630025404e-06, + "loss": 0.7202, + "step": 10213 + }, + { + "epoch": 0.8404854968113556, + "grad_norm": 2.3612853285584285, + "learning_rate": 1.3046470058718552e-06, + "loss": 0.6913, + "step": 10214 + }, + { + "epoch": 0.8405677844065007, + "grad_norm": 2.0457309375515003, + "learning_rate": 1.3033309994222764e-06, + "loss": 0.746, + "step": 10215 + }, + { + "epoch": 0.8406500720016458, + "grad_norm": 1.9558460101364885, + "learning_rate": 1.302015610770152e-06, + "loss": 0.6945, + "step": 10216 + }, + { + "epoch": 0.8407323595967908, + "grad_norm": 1.7449696584294538, + "learning_rate": 1.3007008400089315e-06, + "loss": 0.6844, + "step": 10217 + }, + { + "epoch": 0.8408146471919358, + "grad_norm": 1.9301942383925634, + "learning_rate": 1.2993866872320094e-06, + "loss": 0.6991, + "step": 10218 + }, + { + "epoch": 0.8408969347870808, + "grad_norm": 1.9154741075265236, + "learning_rate": 1.2980731525327428e-06, + "loss": 0.7056, + "step": 10219 + }, + { + "epoch": 0.8409792223822259, + "grad_norm": 2.181722060401743, + "learning_rate": 1.296760236004444e-06, + "loss": 0.7175, + "step": 10220 + }, + { + "epoch": 0.841061509977371, + "grad_norm": 2.2823846548338955, + "learning_rate": 1.2954479377403772e-06, + "loss": 0.7125, + "step": 10221 + }, + { + "epoch": 0.8411437975725159, + "grad_norm": 1.84916990362799, + "learning_rate": 1.294136257833769e-06, + "loss": 0.6972, + "step": 10222 + }, + { + "epoch": 0.841226085167661, + "grad_norm": 2.292898913246316, + "learning_rate": 1.292825196377797e-06, + "loss": 0.7003, + "step": 10223 + }, + { + "epoch": 0.841308372762806, + "grad_norm": 2.336379340948326, + "learning_rate": 1.2915147534656003e-06, + "loss": 0.6723, + "step": 10224 + }, + { + "epoch": 0.8413906603579511, + "grad_norm": 1.8085483734826888, + "learning_rate": 1.2902049291902675e-06, + "loss": 0.7342, + "step": 10225 + }, + { + "epoch": 0.841472947953096, + "grad_norm": 1.761148509661823, + "learning_rate": 1.2888957236448474e-06, + "loss": 0.7201, + "step": 10226 + }, + { + "epoch": 0.8415552355482411, + "grad_norm": 1.864724360502892, + "learning_rate": 1.2875871369223425e-06, + "loss": 0.6989, + "step": 10227 + }, + { + "epoch": 0.8416375231433861, + "grad_norm": 0.4170204639692809, + "learning_rate": 1.286279169115715e-06, + "loss": 0.4655, + "step": 10228 + }, + { + "epoch": 0.8417198107385312, + "grad_norm": 2.2768958288959316, + "learning_rate": 1.2849718203178786e-06, + "loss": 0.712, + "step": 10229 + }, + { + "epoch": 0.8418020983336763, + "grad_norm": 1.8919978322915996, + "learning_rate": 1.2836650906217074e-06, + "loss": 0.6843, + "step": 10230 + }, + { + "epoch": 0.8418843859288212, + "grad_norm": 1.5870851645983324, + "learning_rate": 1.2823589801200297e-06, + "loss": 0.6973, + "step": 10231 + }, + { + "epoch": 0.8419666735239663, + "grad_norm": 1.785814573661589, + "learning_rate": 1.281053488905628e-06, + "loss": 0.7159, + "step": 10232 + }, + { + "epoch": 0.8420489611191113, + "grad_norm": 0.43056195334674846, + "learning_rate": 1.2797486170712391e-06, + "loss": 0.4907, + "step": 10233 + }, + { + "epoch": 0.8421312487142564, + "grad_norm": 0.42021730812179886, + "learning_rate": 1.2784443647095658e-06, + "loss": 0.4864, + "step": 10234 + }, + { + "epoch": 0.8422135363094013, + "grad_norm": 2.5717252901313676, + "learning_rate": 1.2771407319132545e-06, + "loss": 0.7537, + "step": 10235 + }, + { + "epoch": 0.8422958239045464, + "grad_norm": 2.2104410449596807, + "learning_rate": 1.2758377187749182e-06, + "loss": 0.7008, + "step": 10236 + }, + { + "epoch": 0.8423781114996914, + "grad_norm": 1.6847194688756084, + "learning_rate": 1.2745353253871173e-06, + "loss": 0.7099, + "step": 10237 + }, + { + "epoch": 0.8424603990948365, + "grad_norm": 2.3163924542131493, + "learning_rate": 1.2732335518423721e-06, + "loss": 0.6987, + "step": 10238 + }, + { + "epoch": 0.8425426866899814, + "grad_norm": 0.4170377002797985, + "learning_rate": 1.2719323982331577e-06, + "loss": 0.4747, + "step": 10239 + }, + { + "epoch": 0.8426249742851265, + "grad_norm": 2.9253623245881935, + "learning_rate": 1.2706318646519089e-06, + "loss": 0.7303, + "step": 10240 + }, + { + "epoch": 0.8427072618802716, + "grad_norm": 4.960955167319618, + "learning_rate": 1.2693319511910106e-06, + "loss": 0.701, + "step": 10241 + }, + { + "epoch": 0.8427895494754166, + "grad_norm": 2.607314976265902, + "learning_rate": 1.2680326579428092e-06, + "loss": 0.7065, + "step": 10242 + }, + { + "epoch": 0.8428718370705616, + "grad_norm": 1.830582988741335, + "learning_rate": 1.266733984999603e-06, + "loss": 0.7123, + "step": 10243 + }, + { + "epoch": 0.8429541246657066, + "grad_norm": 1.8324980334679204, + "learning_rate": 1.265435932453648e-06, + "loss": 0.703, + "step": 10244 + }, + { + "epoch": 0.8430364122608517, + "grad_norm": 2.0120626917719426, + "learning_rate": 1.264138500397153e-06, + "loss": 0.7561, + "step": 10245 + }, + { + "epoch": 0.8431186998559967, + "grad_norm": 1.9825402775555796, + "learning_rate": 1.2628416889222906e-06, + "loss": 0.7127, + "step": 10246 + }, + { + "epoch": 0.8432009874511417, + "grad_norm": 1.7076520443581673, + "learning_rate": 1.2615454981211795e-06, + "loss": 0.7077, + "step": 10247 + }, + { + "epoch": 0.8432832750462868, + "grad_norm": 1.7849902001036342, + "learning_rate": 1.2602499280859026e-06, + "loss": 0.7304, + "step": 10248 + }, + { + "epoch": 0.8433655626414318, + "grad_norm": 2.125022972969341, + "learning_rate": 1.2589549789084942e-06, + "loss": 0.7054, + "step": 10249 + }, + { + "epoch": 0.8434478502365769, + "grad_norm": 3.4557644849236016, + "learning_rate": 1.2576606506809453e-06, + "loss": 0.6708, + "step": 10250 + }, + { + "epoch": 0.8435301378317218, + "grad_norm": 1.7881653847609789, + "learning_rate": 1.2563669434951998e-06, + "loss": 0.693, + "step": 10251 + }, + { + "epoch": 0.8436124254268669, + "grad_norm": 1.9524175146178002, + "learning_rate": 1.2550738574431642e-06, + "loss": 0.7258, + "step": 10252 + }, + { + "epoch": 0.8436947130220119, + "grad_norm": 1.809389420014345, + "learning_rate": 1.2537813926166963e-06, + "loss": 0.6926, + "step": 10253 + }, + { + "epoch": 0.843777000617157, + "grad_norm": 2.03525938068682, + "learning_rate": 1.2524895491076117e-06, + "loss": 0.706, + "step": 10254 + }, + { + "epoch": 0.843859288212302, + "grad_norm": 1.7858822391644913, + "learning_rate": 1.25119832700768e-06, + "loss": 0.7009, + "step": 10255 + }, + { + "epoch": 0.843941575807447, + "grad_norm": 2.094113137868796, + "learning_rate": 1.249907726408628e-06, + "loss": 0.7169, + "step": 10256 + }, + { + "epoch": 0.8440238634025921, + "grad_norm": 1.7992521433014736, + "learning_rate": 1.2486177474021343e-06, + "loss": 0.7206, + "step": 10257 + }, + { + "epoch": 0.8441061509977371, + "grad_norm": 3.1318664806261776, + "learning_rate": 1.2473283900798428e-06, + "loss": 0.7285, + "step": 10258 + }, + { + "epoch": 0.8441884385928822, + "grad_norm": 2.0819836112925505, + "learning_rate": 1.246039654533343e-06, + "loss": 0.7015, + "step": 10259 + }, + { + "epoch": 0.8442707261880271, + "grad_norm": 1.8852691902923178, + "learning_rate": 1.2447515408541877e-06, + "loss": 0.7014, + "step": 10260 + }, + { + "epoch": 0.8443530137831722, + "grad_norm": 2.0647221263856386, + "learning_rate": 1.2434640491338811e-06, + "loss": 0.696, + "step": 10261 + }, + { + "epoch": 0.8444353013783172, + "grad_norm": 1.7690003980237012, + "learning_rate": 1.2421771794638849e-06, + "loss": 0.6962, + "step": 10262 + }, + { + "epoch": 0.8445175889734623, + "grad_norm": 2.118066166848626, + "learning_rate": 1.2408909319356132e-06, + "loss": 0.7213, + "step": 10263 + }, + { + "epoch": 0.8445998765686072, + "grad_norm": 1.9309648615668102, + "learning_rate": 1.2396053066404435e-06, + "loss": 0.698, + "step": 10264 + }, + { + "epoch": 0.8446821641637523, + "grad_norm": 1.742290317404275, + "learning_rate": 1.2383203036697012e-06, + "loss": 0.6952, + "step": 10265 + }, + { + "epoch": 0.8447644517588974, + "grad_norm": 2.3917692849629035, + "learning_rate": 1.2370359231146744e-06, + "loss": 0.7163, + "step": 10266 + }, + { + "epoch": 0.8448467393540424, + "grad_norm": 2.8148724459573504, + "learning_rate": 1.2357521650666004e-06, + "loss": 0.7114, + "step": 10267 + }, + { + "epoch": 0.8449290269491874, + "grad_norm": 0.4185443338796155, + "learning_rate": 1.2344690296166772e-06, + "loss": 0.4756, + "step": 10268 + }, + { + "epoch": 0.8450113145443324, + "grad_norm": 2.9085304229554585, + "learning_rate": 1.2331865168560531e-06, + "loss": 0.7151, + "step": 10269 + }, + { + "epoch": 0.8450936021394775, + "grad_norm": 0.3891123517962539, + "learning_rate": 1.2319046268758406e-06, + "loss": 0.4552, + "step": 10270 + }, + { + "epoch": 0.8451758897346225, + "grad_norm": 1.8983793557170132, + "learning_rate": 1.2306233597670991e-06, + "loss": 0.7053, + "step": 10271 + }, + { + "epoch": 0.8452581773297675, + "grad_norm": 1.6595928282938697, + "learning_rate": 1.2293427156208536e-06, + "loss": 0.6961, + "step": 10272 + }, + { + "epoch": 0.8453404649249125, + "grad_norm": 1.7384060163790773, + "learning_rate": 1.2280626945280705e-06, + "loss": 0.7192, + "step": 10273 + }, + { + "epoch": 0.8454227525200576, + "grad_norm": 1.966244581528523, + "learning_rate": 1.2267832965796878e-06, + "loss": 0.7212, + "step": 10274 + }, + { + "epoch": 0.8455050401152027, + "grad_norm": 2.1490434325171397, + "learning_rate": 1.2255045218665862e-06, + "loss": 0.6967, + "step": 10275 + }, + { + "epoch": 0.8455873277103476, + "grad_norm": 1.6926077116578118, + "learning_rate": 1.2242263704796131e-06, + "loss": 0.7123, + "step": 10276 + }, + { + "epoch": 0.8456696153054927, + "grad_norm": 3.3567209186352573, + "learning_rate": 1.2229488425095614e-06, + "loss": 0.7105, + "step": 10277 + }, + { + "epoch": 0.8457519029006377, + "grad_norm": 1.7778744460361489, + "learning_rate": 1.221671938047192e-06, + "loss": 0.7138, + "step": 10278 + }, + { + "epoch": 0.8458341904957828, + "grad_norm": 2.3826895362244045, + "learning_rate": 1.2203956571832044e-06, + "loss": 0.7252, + "step": 10279 + }, + { + "epoch": 0.8459164780909278, + "grad_norm": 1.6348916197096575, + "learning_rate": 1.2191200000082705e-06, + "loss": 0.668, + "step": 10280 + }, + { + "epoch": 0.8459987656860728, + "grad_norm": 2.6985398110375103, + "learning_rate": 1.2178449666130065e-06, + "loss": 0.7164, + "step": 10281 + }, + { + "epoch": 0.8460810532812179, + "grad_norm": 2.121171110476976, + "learning_rate": 1.2165705570879938e-06, + "loss": 0.6923, + "step": 10282 + }, + { + "epoch": 0.8461633408763629, + "grad_norm": 1.9680019296120925, + "learning_rate": 1.2152967715237618e-06, + "loss": 0.6914, + "step": 10283 + }, + { + "epoch": 0.846245628471508, + "grad_norm": 1.6300018117904098, + "learning_rate": 1.2140236100107982e-06, + "loss": 0.7025, + "step": 10284 + }, + { + "epoch": 0.8463279160666529, + "grad_norm": 2.0297450013897516, + "learning_rate": 1.2127510726395431e-06, + "loss": 0.7069, + "step": 10285 + }, + { + "epoch": 0.846410203661798, + "grad_norm": 0.41956706584927045, + "learning_rate": 1.2114791595004017e-06, + "loss": 0.4675, + "step": 10286 + }, + { + "epoch": 0.846492491256943, + "grad_norm": 1.77104712413674, + "learning_rate": 1.2102078706837227e-06, + "loss": 0.707, + "step": 10287 + }, + { + "epoch": 0.8465747788520881, + "grad_norm": 1.7496266201333963, + "learning_rate": 1.208937206279822e-06, + "loss": 0.6961, + "step": 10288 + }, + { + "epoch": 0.846657066447233, + "grad_norm": 1.819396577093213, + "learning_rate": 1.207667166378964e-06, + "loss": 0.7382, + "step": 10289 + }, + { + "epoch": 0.8467393540423781, + "grad_norm": 1.9219255133604651, + "learning_rate": 1.2063977510713676e-06, + "loss": 0.7052, + "step": 10290 + }, + { + "epoch": 0.8468216416375232, + "grad_norm": 1.8964086425931268, + "learning_rate": 1.2051289604472105e-06, + "loss": 0.7044, + "step": 10291 + }, + { + "epoch": 0.8469039292326682, + "grad_norm": 1.9495257844522091, + "learning_rate": 1.2038607945966285e-06, + "loss": 0.7024, + "step": 10292 + }, + { + "epoch": 0.8469862168278132, + "grad_norm": 1.967273683684835, + "learning_rate": 1.2025932536097063e-06, + "loss": 0.7179, + "step": 10293 + }, + { + "epoch": 0.8470685044229582, + "grad_norm": 0.40047025600779074, + "learning_rate": 1.2013263375764917e-06, + "loss": 0.4605, + "step": 10294 + }, + { + "epoch": 0.8471507920181033, + "grad_norm": 1.7124758792935142, + "learning_rate": 1.2000600465869837e-06, + "loss": 0.7015, + "step": 10295 + }, + { + "epoch": 0.8472330796132483, + "grad_norm": 2.218095468979744, + "learning_rate": 1.1987943807311353e-06, + "loss": 0.6884, + "step": 10296 + }, + { + "epoch": 0.8473153672083933, + "grad_norm": 1.8494702539779984, + "learning_rate": 1.1975293400988575e-06, + "loss": 0.6914, + "step": 10297 + }, + { + "epoch": 0.8473976548035383, + "grad_norm": 0.405253825258688, + "learning_rate": 1.1962649247800196e-06, + "loss": 0.4978, + "step": 10298 + }, + { + "epoch": 0.8474799423986834, + "grad_norm": 2.549282765778276, + "learning_rate": 1.1950011348644398e-06, + "loss": 0.7083, + "step": 10299 + }, + { + "epoch": 0.8475622299938285, + "grad_norm": 1.4688116629019223, + "learning_rate": 1.1937379704419005e-06, + "loss": 0.7149, + "step": 10300 + }, + { + "epoch": 0.8476445175889734, + "grad_norm": 1.719421750569004, + "learning_rate": 1.1924754316021315e-06, + "loss": 0.6933, + "step": 10301 + }, + { + "epoch": 0.8477268051841185, + "grad_norm": 1.622310987319696, + "learning_rate": 1.1912135184348227e-06, + "loss": 0.7099, + "step": 10302 + }, + { + "epoch": 0.8478090927792635, + "grad_norm": 1.8917647224384704, + "learning_rate": 1.1899522310296151e-06, + "loss": 0.6997, + "step": 10303 + }, + { + "epoch": 0.8478913803744086, + "grad_norm": 1.812106109768098, + "learning_rate": 1.1886915694761148e-06, + "loss": 0.7056, + "step": 10304 + }, + { + "epoch": 0.8479736679695535, + "grad_norm": 1.977061957846284, + "learning_rate": 1.1874315338638709e-06, + "loss": 0.6873, + "step": 10305 + }, + { + "epoch": 0.8480559555646986, + "grad_norm": 2.207232952132028, + "learning_rate": 1.1861721242823998e-06, + "loss": 0.7051, + "step": 10306 + }, + { + "epoch": 0.8481382431598437, + "grad_norm": 1.824017889305365, + "learning_rate": 1.1849133408211643e-06, + "loss": 0.7197, + "step": 10307 + }, + { + "epoch": 0.8482205307549887, + "grad_norm": 1.9155380629416918, + "learning_rate": 1.1836551835695887e-06, + "loss": 0.7197, + "step": 10308 + }, + { + "epoch": 0.8483028183501338, + "grad_norm": 1.5769909403100815, + "learning_rate": 1.1823976526170466e-06, + "loss": 0.6938, + "step": 10309 + }, + { + "epoch": 0.8483851059452787, + "grad_norm": 1.8590008859743095, + "learning_rate": 1.1811407480528758e-06, + "loss": 0.7096, + "step": 10310 + }, + { + "epoch": 0.8484673935404238, + "grad_norm": 2.371814123362432, + "learning_rate": 1.1798844699663602e-06, + "loss": 0.7048, + "step": 10311 + }, + { + "epoch": 0.8485496811355688, + "grad_norm": 2.0820372169411048, + "learning_rate": 1.1786288184467486e-06, + "loss": 0.7064, + "step": 10312 + }, + { + "epoch": 0.8486319687307139, + "grad_norm": 1.6375265406285449, + "learning_rate": 1.1773737935832375e-06, + "loss": 0.7119, + "step": 10313 + }, + { + "epoch": 0.8487142563258588, + "grad_norm": 1.80748525137628, + "learning_rate": 1.176119395464983e-06, + "loss": 0.7137, + "step": 10314 + }, + { + "epoch": 0.8487965439210039, + "grad_norm": 1.9733427959425411, + "learning_rate": 1.1748656241810929e-06, + "loss": 0.7106, + "step": 10315 + }, + { + "epoch": 0.848878831516149, + "grad_norm": 1.9720981963710762, + "learning_rate": 1.1736124798206361e-06, + "loss": 0.7071, + "step": 10316 + }, + { + "epoch": 0.848961119111294, + "grad_norm": 1.8378456804370888, + "learning_rate": 1.1723599624726323e-06, + "loss": 0.6913, + "step": 10317 + }, + { + "epoch": 0.849043406706439, + "grad_norm": 0.4124598229803567, + "learning_rate": 1.1711080722260603e-06, + "loss": 0.4789, + "step": 10318 + }, + { + "epoch": 0.849125694301584, + "grad_norm": 2.0350689896389276, + "learning_rate": 1.1698568091698514e-06, + "loss": 0.7782, + "step": 10319 + }, + { + "epoch": 0.8492079818967291, + "grad_norm": 2.0314898983086622, + "learning_rate": 1.1686061733928932e-06, + "loss": 0.7173, + "step": 10320 + }, + { + "epoch": 0.8492902694918741, + "grad_norm": 1.9107509835226428, + "learning_rate": 1.167356164984026e-06, + "loss": 0.7207, + "step": 10321 + }, + { + "epoch": 0.8493725570870191, + "grad_norm": 2.317472235494565, + "learning_rate": 1.1661067840320538e-06, + "loss": 0.7104, + "step": 10322 + }, + { + "epoch": 0.8494548446821641, + "grad_norm": 2.1843868740981947, + "learning_rate": 1.1648580306257262e-06, + "loss": 0.7151, + "step": 10323 + }, + { + "epoch": 0.8495371322773092, + "grad_norm": 1.9799820937553907, + "learning_rate": 1.1636099048537553e-06, + "loss": 0.7408, + "step": 10324 + }, + { + "epoch": 0.8496194198724543, + "grad_norm": 2.2054385675637067, + "learning_rate": 1.1623624068048057e-06, + "loss": 0.7332, + "step": 10325 + }, + { + "epoch": 0.8497017074675992, + "grad_norm": 1.840019681228478, + "learning_rate": 1.1611155365674975e-06, + "loss": 0.7166, + "step": 10326 + }, + { + "epoch": 0.8497839950627443, + "grad_norm": 5.912579943482064, + "learning_rate": 1.1598692942304034e-06, + "loss": 0.7282, + "step": 10327 + }, + { + "epoch": 0.8498662826578893, + "grad_norm": 2.8825981136132937, + "learning_rate": 1.1586236798820593e-06, + "loss": 0.7189, + "step": 10328 + }, + { + "epoch": 0.8499485702530344, + "grad_norm": 1.7399882269766815, + "learning_rate": 1.1573786936109465e-06, + "loss": 0.7035, + "step": 10329 + }, + { + "epoch": 0.8500308578481793, + "grad_norm": 3.211797349973899, + "learning_rate": 1.1561343355055122e-06, + "loss": 0.6966, + "step": 10330 + }, + { + "epoch": 0.8501131454433244, + "grad_norm": 2.444007527466717, + "learning_rate": 1.154890605654151e-06, + "loss": 0.7134, + "step": 10331 + }, + { + "epoch": 0.8501954330384695, + "grad_norm": 0.406538786718097, + "learning_rate": 1.1536475041452155e-06, + "loss": 0.4807, + "step": 10332 + }, + { + "epoch": 0.8502777206336145, + "grad_norm": 2.2375808055431623, + "learning_rate": 1.152405031067011e-06, + "loss": 0.7081, + "step": 10333 + }, + { + "epoch": 0.8503600082287596, + "grad_norm": 2.159832089498162, + "learning_rate": 1.1511631865078055e-06, + "loss": 0.7272, + "step": 10334 + }, + { + "epoch": 0.8504422958239045, + "grad_norm": 3.5583892502052374, + "learning_rate": 1.149921970555814e-06, + "loss": 0.7089, + "step": 10335 + }, + { + "epoch": 0.8505245834190496, + "grad_norm": 1.773183915029322, + "learning_rate": 1.148681383299215e-06, + "loss": 0.6947, + "step": 10336 + }, + { + "epoch": 0.8506068710141946, + "grad_norm": 2.46139920200671, + "learning_rate": 1.1474414248261334e-06, + "loss": 0.7141, + "step": 10337 + }, + { + "epoch": 0.8506891586093397, + "grad_norm": 1.9668278154844525, + "learning_rate": 1.1462020952246567e-06, + "loss": 0.7237, + "step": 10338 + }, + { + "epoch": 0.8507714462044846, + "grad_norm": 0.40184045565642457, + "learning_rate": 1.144963394582821e-06, + "loss": 0.4534, + "step": 10339 + }, + { + "epoch": 0.8508537337996297, + "grad_norm": 1.9536314928347704, + "learning_rate": 1.1437253229886258e-06, + "loss": 0.699, + "step": 10340 + }, + { + "epoch": 0.8509360213947748, + "grad_norm": 1.6338814515450075, + "learning_rate": 1.1424878805300187e-06, + "loss": 0.7094, + "step": 10341 + }, + { + "epoch": 0.8510183089899198, + "grad_norm": 1.736294331293277, + "learning_rate": 1.1412510672949117e-06, + "loss": 0.7038, + "step": 10342 + }, + { + "epoch": 0.8511005965850648, + "grad_norm": 1.8121601581855766, + "learning_rate": 1.1400148833711567e-06, + "loss": 0.6957, + "step": 10343 + }, + { + "epoch": 0.8511828841802098, + "grad_norm": 0.4204002769588918, + "learning_rate": 1.1387793288465766e-06, + "loss": 0.4564, + "step": 10344 + }, + { + "epoch": 0.8512651717753549, + "grad_norm": 1.841713087967031, + "learning_rate": 1.1375444038089401e-06, + "loss": 0.7129, + "step": 10345 + }, + { + "epoch": 0.8513474593704999, + "grad_norm": 1.921567303618468, + "learning_rate": 1.136310108345977e-06, + "loss": 0.6723, + "step": 10346 + }, + { + "epoch": 0.8514297469656449, + "grad_norm": 1.9761364469405873, + "learning_rate": 1.1350764425453664e-06, + "loss": 0.7045, + "step": 10347 + }, + { + "epoch": 0.8515120345607899, + "grad_norm": 2.3222526229537928, + "learning_rate": 1.1338434064947534e-06, + "loss": 0.7117, + "step": 10348 + }, + { + "epoch": 0.851594322155935, + "grad_norm": 1.6545074112622304, + "learning_rate": 1.1326110002817202e-06, + "loss": 0.6919, + "step": 10349 + }, + { + "epoch": 0.8516766097510801, + "grad_norm": 4.386363919479266, + "learning_rate": 1.1313792239938227e-06, + "loss": 0.7184, + "step": 10350 + }, + { + "epoch": 0.851758897346225, + "grad_norm": 1.6585036180116306, + "learning_rate": 1.1301480777185592e-06, + "loss": 0.7171, + "step": 10351 + }, + { + "epoch": 0.8518411849413701, + "grad_norm": 1.5386026632377185, + "learning_rate": 1.1289175615433934e-06, + "loss": 0.6829, + "step": 10352 + }, + { + "epoch": 0.8519234725365151, + "grad_norm": 2.406312856519946, + "learning_rate": 1.1276876755557376e-06, + "loss": 0.7313, + "step": 10353 + }, + { + "epoch": 0.8520057601316602, + "grad_norm": 0.4069793536637202, + "learning_rate": 1.1264584198429595e-06, + "loss": 0.4691, + "step": 10354 + }, + { + "epoch": 0.8520880477268051, + "grad_norm": 1.6094865065508657, + "learning_rate": 1.1252297944923819e-06, + "loss": 0.6894, + "step": 10355 + }, + { + "epoch": 0.8521703353219502, + "grad_norm": 0.41196729384231895, + "learning_rate": 1.124001799591289e-06, + "loss": 0.4827, + "step": 10356 + }, + { + "epoch": 0.8522526229170952, + "grad_norm": 2.0841167927255797, + "learning_rate": 1.122774435226911e-06, + "loss": 0.7077, + "step": 10357 + }, + { + "epoch": 0.8523349105122403, + "grad_norm": 1.9009601387353183, + "learning_rate": 1.1215477014864429e-06, + "loss": 0.6848, + "step": 10358 + }, + { + "epoch": 0.8524171981073854, + "grad_norm": 2.0598717475628647, + "learning_rate": 1.1203215984570258e-06, + "loss": 0.666, + "step": 10359 + }, + { + "epoch": 0.8524994857025303, + "grad_norm": 2.018793343130484, + "learning_rate": 1.1190961262257626e-06, + "loss": 0.7096, + "step": 10360 + }, + { + "epoch": 0.8525817732976754, + "grad_norm": 1.955284111842583, + "learning_rate": 1.1178712848797046e-06, + "loss": 0.719, + "step": 10361 + }, + { + "epoch": 0.8526640608928204, + "grad_norm": 1.8550908309812542, + "learning_rate": 1.1166470745058677e-06, + "loss": 0.6985, + "step": 10362 + }, + { + "epoch": 0.8527463484879655, + "grad_norm": 0.41150039113898795, + "learning_rate": 1.1154234951912147e-06, + "loss": 0.4769, + "step": 10363 + }, + { + "epoch": 0.8528286360831104, + "grad_norm": 1.972073051221203, + "learning_rate": 1.1142005470226692e-06, + "loss": 0.732, + "step": 10364 + }, + { + "epoch": 0.8529109236782555, + "grad_norm": 1.8863102037669834, + "learning_rate": 1.112978230087106e-06, + "loss": 0.6976, + "step": 10365 + }, + { + "epoch": 0.8529932112734006, + "grad_norm": 1.8008602201319026, + "learning_rate": 1.1117565444713573e-06, + "loss": 0.711, + "step": 10366 + }, + { + "epoch": 0.8530754988685456, + "grad_norm": 0.4482991357377541, + "learning_rate": 1.1105354902622067e-06, + "loss": 0.4703, + "step": 10367 + }, + { + "epoch": 0.8531577864636906, + "grad_norm": 0.42794892701390386, + "learning_rate": 1.1093150675464004e-06, + "loss": 0.4753, + "step": 10368 + }, + { + "epoch": 0.8532400740588356, + "grad_norm": 1.8754906707656926, + "learning_rate": 1.1080952764106312e-06, + "loss": 0.7127, + "step": 10369 + }, + { + "epoch": 0.8533223616539807, + "grad_norm": 1.9092114441399464, + "learning_rate": 1.1068761169415543e-06, + "loss": 0.681, + "step": 10370 + }, + { + "epoch": 0.8534046492491257, + "grad_norm": 1.9701610858727343, + "learning_rate": 1.1056575892257758e-06, + "loss": 0.7137, + "step": 10371 + }, + { + "epoch": 0.8534869368442707, + "grad_norm": 1.6012445293942819, + "learning_rate": 1.104439693349858e-06, + "loss": 0.7082, + "step": 10372 + }, + { + "epoch": 0.8535692244394157, + "grad_norm": 1.961801753846987, + "learning_rate": 1.103222429400317e-06, + "loss": 0.6913, + "step": 10373 + }, + { + "epoch": 0.8536515120345608, + "grad_norm": 1.8295688802909091, + "learning_rate": 1.1020057974636266e-06, + "loss": 0.718, + "step": 10374 + }, + { + "epoch": 0.8537337996297059, + "grad_norm": 0.4014056426928875, + "learning_rate": 1.1007897976262138e-06, + "loss": 0.4533, + "step": 10375 + }, + { + "epoch": 0.8538160872248508, + "grad_norm": 1.718422663589629, + "learning_rate": 1.0995744299744626e-06, + "loss": 0.7132, + "step": 10376 + }, + { + "epoch": 0.8538983748199959, + "grad_norm": 2.059273593035152, + "learning_rate": 1.0983596945947095e-06, + "loss": 0.7143, + "step": 10377 + }, + { + "epoch": 0.8539806624151409, + "grad_norm": 1.6383556972775244, + "learning_rate": 1.0971455915732488e-06, + "loss": 0.684, + "step": 10378 + }, + { + "epoch": 0.854062950010286, + "grad_norm": 2.7020716635612985, + "learning_rate": 1.0959321209963236e-06, + "loss": 0.6792, + "step": 10379 + }, + { + "epoch": 0.8541452376054309, + "grad_norm": 0.4247964977541339, + "learning_rate": 1.0947192829501441e-06, + "loss": 0.4685, + "step": 10380 + }, + { + "epoch": 0.854227525200576, + "grad_norm": 2.286436490780876, + "learning_rate": 1.0935070775208623e-06, + "loss": 0.6779, + "step": 10381 + }, + { + "epoch": 0.854309812795721, + "grad_norm": 0.42224329358823764, + "learning_rate": 1.092295504794595e-06, + "loss": 0.4651, + "step": 10382 + }, + { + "epoch": 0.8543921003908661, + "grad_norm": 2.3785624268739403, + "learning_rate": 1.091084564857411e-06, + "loss": 0.7056, + "step": 10383 + }, + { + "epoch": 0.8544743879860112, + "grad_norm": 1.9789856178668663, + "learning_rate": 1.0898742577953303e-06, + "loss": 0.6826, + "step": 10384 + }, + { + "epoch": 0.8545566755811561, + "grad_norm": 1.9054610671856667, + "learning_rate": 1.0886645836943311e-06, + "loss": 0.7012, + "step": 10385 + }, + { + "epoch": 0.8546389631763012, + "grad_norm": 2.2906804495197433, + "learning_rate": 1.087455542640351e-06, + "loss": 0.7208, + "step": 10386 + }, + { + "epoch": 0.8547212507714462, + "grad_norm": 1.5968693899218858, + "learning_rate": 1.0862471347192726e-06, + "loss": 0.7057, + "step": 10387 + }, + { + "epoch": 0.8548035383665913, + "grad_norm": 1.947758344009564, + "learning_rate": 1.0850393600169451e-06, + "loss": 0.7117, + "step": 10388 + }, + { + "epoch": 0.8548858259617362, + "grad_norm": 1.742294018488579, + "learning_rate": 1.083832218619164e-06, + "loss": 0.7154, + "step": 10389 + }, + { + "epoch": 0.8549681135568813, + "grad_norm": 1.7378245447573295, + "learning_rate": 1.0826257106116822e-06, + "loss": 0.6923, + "step": 10390 + }, + { + "epoch": 0.8550504011520264, + "grad_norm": 2.0413263900072502, + "learning_rate": 1.0814198360802076e-06, + "loss": 0.6826, + "step": 10391 + }, + { + "epoch": 0.8551326887471714, + "grad_norm": 2.11374832279161, + "learning_rate": 1.080214595110406e-06, + "loss": 0.7172, + "step": 10392 + }, + { + "epoch": 0.8552149763423164, + "grad_norm": 1.7547945520567758, + "learning_rate": 1.0790099877878923e-06, + "loss": 0.7031, + "step": 10393 + }, + { + "epoch": 0.8552972639374614, + "grad_norm": 1.9797756432031584, + "learning_rate": 1.077806014198245e-06, + "loss": 0.6845, + "step": 10394 + }, + { + "epoch": 0.8553795515326065, + "grad_norm": 1.8382015635827782, + "learning_rate": 1.0766026744269886e-06, + "loss": 0.7328, + "step": 10395 + }, + { + "epoch": 0.8554618391277515, + "grad_norm": 2.028406872798221, + "learning_rate": 1.0753999685596073e-06, + "loss": 0.7137, + "step": 10396 + }, + { + "epoch": 0.8555441267228965, + "grad_norm": 2.489271476274407, + "learning_rate": 1.074197896681538e-06, + "loss": 0.7009, + "step": 10397 + }, + { + "epoch": 0.8556264143180415, + "grad_norm": 1.8227953452842807, + "learning_rate": 1.072996458878177e-06, + "loss": 0.7293, + "step": 10398 + }, + { + "epoch": 0.8557087019131866, + "grad_norm": 2.0056220824737037, + "learning_rate": 1.0717956552348696e-06, + "loss": 0.68, + "step": 10399 + }, + { + "epoch": 0.8557909895083317, + "grad_norm": 1.799831497792828, + "learning_rate": 1.0705954858369217e-06, + "loss": 0.7007, + "step": 10400 + }, + { + "epoch": 0.8558732771034766, + "grad_norm": 1.626085360820614, + "learning_rate": 1.0693959507695894e-06, + "loss": 0.7619, + "step": 10401 + }, + { + "epoch": 0.8559555646986217, + "grad_norm": 1.7461203824629106, + "learning_rate": 1.068197050118087e-06, + "loss": 0.7187, + "step": 10402 + }, + { + "epoch": 0.8560378522937667, + "grad_norm": 0.41883765958824903, + "learning_rate": 1.066998783967581e-06, + "loss": 0.4535, + "step": 10403 + }, + { + "epoch": 0.8561201398889118, + "grad_norm": 1.8904816376586564, + "learning_rate": 1.0658011524031953e-06, + "loss": 0.7044, + "step": 10404 + }, + { + "epoch": 0.8562024274840567, + "grad_norm": 2.0834898216459465, + "learning_rate": 1.0646041555100073e-06, + "loss": 0.6813, + "step": 10405 + }, + { + "epoch": 0.8562847150792018, + "grad_norm": 1.7869994148160948, + "learning_rate": 1.0634077933730503e-06, + "loss": 0.7194, + "step": 10406 + }, + { + "epoch": 0.8563670026743468, + "grad_norm": 2.327795639598082, + "learning_rate": 1.0622120660773128e-06, + "loss": 0.6981, + "step": 10407 + }, + { + "epoch": 0.8564492902694919, + "grad_norm": 0.42466422742121035, + "learning_rate": 1.0610169737077358e-06, + "loss": 0.4702, + "step": 10408 + }, + { + "epoch": 0.8565315778646369, + "grad_norm": 1.7809384385833815, + "learning_rate": 1.059822516349216e-06, + "loss": 0.7268, + "step": 10409 + }, + { + "epoch": 0.8566138654597819, + "grad_norm": 1.8207239361630865, + "learning_rate": 1.0586286940866074e-06, + "loss": 0.6767, + "step": 10410 + }, + { + "epoch": 0.856696153054927, + "grad_norm": 2.270044928960466, + "learning_rate": 1.057435507004716e-06, + "loss": 0.701, + "step": 10411 + }, + { + "epoch": 0.856778440650072, + "grad_norm": 2.144536369354913, + "learning_rate": 1.056242955188308e-06, + "loss": 0.6873, + "step": 10412 + }, + { + "epoch": 0.8568607282452171, + "grad_norm": 1.8603307329863799, + "learning_rate": 1.0550510387220936e-06, + "loss": 0.7225, + "step": 10413 + }, + { + "epoch": 0.856943015840362, + "grad_norm": 1.980279680102889, + "learning_rate": 1.0538597576907484e-06, + "loss": 0.6866, + "step": 10414 + }, + { + "epoch": 0.8570253034355071, + "grad_norm": 1.9567646636657878, + "learning_rate": 1.0526691121789002e-06, + "loss": 0.7212, + "step": 10415 + }, + { + "epoch": 0.8571075910306521, + "grad_norm": 1.7597461941230144, + "learning_rate": 1.0514791022711268e-06, + "loss": 0.6873, + "step": 10416 + }, + { + "epoch": 0.8571898786257972, + "grad_norm": 1.5231849429841398, + "learning_rate": 1.0502897280519696e-06, + "loss": 0.7154, + "step": 10417 + }, + { + "epoch": 0.8572721662209422, + "grad_norm": 1.811467734175766, + "learning_rate": 1.0491009896059167e-06, + "loss": 0.6984, + "step": 10418 + }, + { + "epoch": 0.8573544538160872, + "grad_norm": 2.159323001225842, + "learning_rate": 1.0479128870174148e-06, + "loss": 0.7099, + "step": 10419 + }, + { + "epoch": 0.8574367414112323, + "grad_norm": 1.7028834203582341, + "learning_rate": 1.0467254203708621e-06, + "loss": 0.6849, + "step": 10420 + }, + { + "epoch": 0.8575190290063773, + "grad_norm": 2.145118028086299, + "learning_rate": 1.0455385897506197e-06, + "loss": 0.7228, + "step": 10421 + }, + { + "epoch": 0.8576013166015223, + "grad_norm": 0.4131309857886483, + "learning_rate": 1.044352395240994e-06, + "loss": 0.446, + "step": 10422 + }, + { + "epoch": 0.8576836041966673, + "grad_norm": 1.8275055783556218, + "learning_rate": 1.0431668369262538e-06, + "loss": 0.7034, + "step": 10423 + }, + { + "epoch": 0.8577658917918124, + "grad_norm": 0.3955007164613246, + "learning_rate": 1.041981914890614e-06, + "loss": 0.459, + "step": 10424 + }, + { + "epoch": 0.8578481793869575, + "grad_norm": 1.9385745646864432, + "learning_rate": 1.0407976292182554e-06, + "loss": 0.7348, + "step": 10425 + }, + { + "epoch": 0.8579304669821024, + "grad_norm": 3.5415688707660755, + "learning_rate": 1.0396139799933026e-06, + "loss": 0.7303, + "step": 10426 + }, + { + "epoch": 0.8580127545772475, + "grad_norm": 0.4137373125172968, + "learning_rate": 1.0384309672998449e-06, + "loss": 0.4727, + "step": 10427 + }, + { + "epoch": 0.8580950421723925, + "grad_norm": 1.8068921054760096, + "learning_rate": 1.0372485912219165e-06, + "loss": 0.7172, + "step": 10428 + }, + { + "epoch": 0.8581773297675376, + "grad_norm": 2.368806403213488, + "learning_rate": 1.036066851843519e-06, + "loss": 0.7125, + "step": 10429 + }, + { + "epoch": 0.8582596173626825, + "grad_norm": 2.106766294836573, + "learning_rate": 1.034885749248593e-06, + "loss": 0.7386, + "step": 10430 + }, + { + "epoch": 0.8583419049578276, + "grad_norm": 2.2069837471727913, + "learning_rate": 1.033705283521047e-06, + "loss": 0.7084, + "step": 10431 + }, + { + "epoch": 0.8584241925529726, + "grad_norm": 2.0346864491685026, + "learning_rate": 1.0325254547447373e-06, + "loss": 0.7298, + "step": 10432 + }, + { + "epoch": 0.8585064801481177, + "grad_norm": 1.8283731291703, + "learning_rate": 1.0313462630034787e-06, + "loss": 0.7259, + "step": 10433 + }, + { + "epoch": 0.8585887677432626, + "grad_norm": 2.116981375280564, + "learning_rate": 1.0301677083810391e-06, + "loss": 0.699, + "step": 10434 + }, + { + "epoch": 0.8586710553384077, + "grad_norm": 2.5239688288803577, + "learning_rate": 1.0289897909611402e-06, + "loss": 0.7105, + "step": 10435 + }, + { + "epoch": 0.8587533429335528, + "grad_norm": 1.6323238861957423, + "learning_rate": 1.0278125108274572e-06, + "loss": 0.725, + "step": 10436 + }, + { + "epoch": 0.8588356305286978, + "grad_norm": 0.3919366942920444, + "learning_rate": 1.0266358680636267e-06, + "loss": 0.4668, + "step": 10437 + }, + { + "epoch": 0.8589179181238429, + "grad_norm": 2.419902069863786, + "learning_rate": 1.025459862753232e-06, + "loss": 0.6873, + "step": 10438 + }, + { + "epoch": 0.8590002057189878, + "grad_norm": 2.2059383628950044, + "learning_rate": 1.024284494979817e-06, + "loss": 0.7443, + "step": 10439 + }, + { + "epoch": 0.8590824933141329, + "grad_norm": 1.9277417340268832, + "learning_rate": 1.0231097648268772e-06, + "loss": 0.7036, + "step": 10440 + }, + { + "epoch": 0.859164780909278, + "grad_norm": 0.42640323293139204, + "learning_rate": 1.0219356723778639e-06, + "loss": 0.4507, + "step": 10441 + }, + { + "epoch": 0.859247068504423, + "grad_norm": 1.5690860186284012, + "learning_rate": 1.0207622177161802e-06, + "loss": 0.7097, + "step": 10442 + }, + { + "epoch": 0.859329356099568, + "grad_norm": 2.4639353871672123, + "learning_rate": 1.0195894009251893e-06, + "loss": 0.6806, + "step": 10443 + }, + { + "epoch": 0.859411643694713, + "grad_norm": 3.6523141193704953, + "learning_rate": 1.0184172220882037e-06, + "loss": 0.7128, + "step": 10444 + }, + { + "epoch": 0.8594939312898581, + "grad_norm": 1.9414884539117716, + "learning_rate": 1.017245681288498e-06, + "loss": 0.7316, + "step": 10445 + }, + { + "epoch": 0.8595762188850031, + "grad_norm": 2.0344851777432553, + "learning_rate": 1.0160747786092916e-06, + "loss": 0.6982, + "step": 10446 + }, + { + "epoch": 0.8596585064801481, + "grad_norm": 2.0426878637856554, + "learning_rate": 1.0149045141337665e-06, + "loss": 0.7331, + "step": 10447 + }, + { + "epoch": 0.8597407940752931, + "grad_norm": 2.691809893287452, + "learning_rate": 1.0137348879450526e-06, + "loss": 0.711, + "step": 10448 + }, + { + "epoch": 0.8598230816704382, + "grad_norm": 2.951088333599911, + "learning_rate": 1.0125659001262422e-06, + "loss": 0.7314, + "step": 10449 + }, + { + "epoch": 0.8599053692655833, + "grad_norm": 1.8637372146383229, + "learning_rate": 1.0113975507603757e-06, + "loss": 0.7223, + "step": 10450 + }, + { + "epoch": 0.8599876568607282, + "grad_norm": 1.9932717858025188, + "learning_rate": 1.0102298399304545e-06, + "loss": 0.6936, + "step": 10451 + }, + { + "epoch": 0.8600699444558733, + "grad_norm": 1.82298380755983, + "learning_rate": 1.0090627677194276e-06, + "loss": 0.7061, + "step": 10452 + }, + { + "epoch": 0.8601522320510183, + "grad_norm": 1.9678927775685744, + "learning_rate": 1.0078963342102033e-06, + "loss": 0.7024, + "step": 10453 + }, + { + "epoch": 0.8602345196461634, + "grad_norm": 1.8926823112428872, + "learning_rate": 1.0067305394856408e-06, + "loss": 0.7363, + "step": 10454 + }, + { + "epoch": 0.8603168072413083, + "grad_norm": 0.4328394508577774, + "learning_rate": 1.0055653836285595e-06, + "loss": 0.47, + "step": 10455 + }, + { + "epoch": 0.8603990948364534, + "grad_norm": 1.9419713996701697, + "learning_rate": 1.0044008667217276e-06, + "loss": 0.6796, + "step": 10456 + }, + { + "epoch": 0.8604813824315984, + "grad_norm": 1.9707381704647657, + "learning_rate": 1.0032369888478733e-06, + "loss": 0.7021, + "step": 10457 + }, + { + "epoch": 0.8605636700267435, + "grad_norm": 2.0356138261537216, + "learning_rate": 1.002073750089675e-06, + "loss": 0.7127, + "step": 10458 + }, + { + "epoch": 0.8606459576218884, + "grad_norm": 2.072753806510741, + "learning_rate": 1.0009111505297675e-06, + "loss": 0.7124, + "step": 10459 + }, + { + "epoch": 0.8607282452170335, + "grad_norm": 1.9923166529417076, + "learning_rate": 9.997491902507383e-07, + "loss": 0.7047, + "step": 10460 + }, + { + "epoch": 0.8608105328121786, + "grad_norm": 1.9117710500229972, + "learning_rate": 9.985878693351347e-07, + "loss": 0.697, + "step": 10461 + }, + { + "epoch": 0.8608928204073236, + "grad_norm": 1.8561975313260848, + "learning_rate": 9.974271878654517e-07, + "loss": 0.7002, + "step": 10462 + }, + { + "epoch": 0.8609751080024687, + "grad_norm": 2.504760452512425, + "learning_rate": 9.962671459241458e-07, + "loss": 0.695, + "step": 10463 + }, + { + "epoch": 0.8610573955976136, + "grad_norm": 2.2795296364370485, + "learning_rate": 9.95107743593622e-07, + "loss": 0.7515, + "step": 10464 + }, + { + "epoch": 0.8611396831927587, + "grad_norm": 1.9003227623067622, + "learning_rate": 9.939489809562419e-07, + "loss": 0.6961, + "step": 10465 + }, + { + "epoch": 0.8612219707879037, + "grad_norm": 0.4257491771687449, + "learning_rate": 9.927908580943223e-07, + "loss": 0.481, + "step": 10466 + }, + { + "epoch": 0.8613042583830488, + "grad_norm": 2.3835342560092214, + "learning_rate": 9.916333750901374e-07, + "loss": 0.6997, + "step": 10467 + }, + { + "epoch": 0.8613865459781938, + "grad_norm": 2.2995203343268256, + "learning_rate": 9.904765320259091e-07, + "loss": 0.7205, + "step": 10468 + }, + { + "epoch": 0.8614688335733388, + "grad_norm": 0.4312229830025395, + "learning_rate": 9.893203289838206e-07, + "loss": 0.4867, + "step": 10469 + }, + { + "epoch": 0.8615511211684839, + "grad_norm": 2.298350803729023, + "learning_rate": 9.88164766046006e-07, + "loss": 0.6632, + "step": 10470 + }, + { + "epoch": 0.8616334087636289, + "grad_norm": 1.9913903742179528, + "learning_rate": 9.87009843294553e-07, + "loss": 0.7213, + "step": 10471 + }, + { + "epoch": 0.8617156963587739, + "grad_norm": 1.8061380600965968, + "learning_rate": 9.85855560811505e-07, + "loss": 0.6805, + "step": 10472 + }, + { + "epoch": 0.8617979839539189, + "grad_norm": 1.8773919776497703, + "learning_rate": 9.847019186788641e-07, + "loss": 0.736, + "step": 10473 + }, + { + "epoch": 0.861880271549064, + "grad_norm": 1.923119642843843, + "learning_rate": 9.835489169785795e-07, + "loss": 0.7316, + "step": 10474 + }, + { + "epoch": 0.861962559144209, + "grad_norm": 2.4564423114213247, + "learning_rate": 9.82396555792562e-07, + "loss": 0.6953, + "step": 10475 + }, + { + "epoch": 0.862044846739354, + "grad_norm": 1.9733608650721863, + "learning_rate": 9.812448352026716e-07, + "loss": 0.6974, + "step": 10476 + }, + { + "epoch": 0.8621271343344991, + "grad_norm": 1.745064222034232, + "learning_rate": 9.800937552907253e-07, + "loss": 0.7029, + "step": 10477 + }, + { + "epoch": 0.8622094219296441, + "grad_norm": 2.2259513844218812, + "learning_rate": 9.78943316138491e-07, + "loss": 0.679, + "step": 10478 + }, + { + "epoch": 0.8622917095247892, + "grad_norm": 1.8907625725385868, + "learning_rate": 9.777935178276976e-07, + "loss": 0.6765, + "step": 10479 + }, + { + "epoch": 0.8623739971199341, + "grad_norm": 1.6942180401188751, + "learning_rate": 9.766443604400233e-07, + "loss": 0.6909, + "step": 10480 + }, + { + "epoch": 0.8624562847150792, + "grad_norm": 2.0958530169290075, + "learning_rate": 9.754958440571038e-07, + "loss": 0.7204, + "step": 10481 + }, + { + "epoch": 0.8625385723102242, + "grad_norm": 2.113253599021367, + "learning_rate": 9.743479687605272e-07, + "loss": 0.7483, + "step": 10482 + }, + { + "epoch": 0.8626208599053693, + "grad_norm": 2.270836691834726, + "learning_rate": 9.732007346318361e-07, + "loss": 0.707, + "step": 10483 + }, + { + "epoch": 0.8627031475005142, + "grad_norm": 10.870834978510775, + "learning_rate": 9.720541417525275e-07, + "loss": 0.6777, + "step": 10484 + }, + { + "epoch": 0.8627854350956593, + "grad_norm": 1.7202116069667597, + "learning_rate": 9.709081902040563e-07, + "loss": 0.7451, + "step": 10485 + }, + { + "epoch": 0.8628677226908044, + "grad_norm": 1.7941887835399457, + "learning_rate": 9.69762880067825e-07, + "loss": 0.7007, + "step": 10486 + }, + { + "epoch": 0.8629500102859494, + "grad_norm": 1.806539219165287, + "learning_rate": 9.686182114251996e-07, + "loss": 0.7318, + "step": 10487 + }, + { + "epoch": 0.8630322978810945, + "grad_norm": 2.006080626270702, + "learning_rate": 9.67474184357492e-07, + "loss": 0.7251, + "step": 10488 + }, + { + "epoch": 0.8631145854762394, + "grad_norm": 2.6197598364460584, + "learning_rate": 9.663307989459737e-07, + "loss": 0.711, + "step": 10489 + }, + { + "epoch": 0.8631968730713845, + "grad_norm": 2.0168602323564944, + "learning_rate": 9.651880552718663e-07, + "loss": 0.7265, + "step": 10490 + }, + { + "epoch": 0.8632791606665295, + "grad_norm": 2.2533595716497015, + "learning_rate": 9.640459534163516e-07, + "loss": 0.718, + "step": 10491 + }, + { + "epoch": 0.8633614482616746, + "grad_norm": 0.3945774429894956, + "learning_rate": 9.629044934605602e-07, + "loss": 0.4653, + "step": 10492 + }, + { + "epoch": 0.8634437358568195, + "grad_norm": 2.1562933954591434, + "learning_rate": 9.617636754855842e-07, + "loss": 0.7196, + "step": 10493 + }, + { + "epoch": 0.8635260234519646, + "grad_norm": 1.811741020886685, + "learning_rate": 9.606234995724583e-07, + "loss": 0.7197, + "step": 10494 + }, + { + "epoch": 0.8636083110471097, + "grad_norm": 0.41085811416028306, + "learning_rate": 9.59483965802186e-07, + "loss": 0.4891, + "step": 10495 + }, + { + "epoch": 0.8636905986422547, + "grad_norm": 1.7545094410735982, + "learning_rate": 9.58345074255711e-07, + "loss": 0.72, + "step": 10496 + }, + { + "epoch": 0.8637728862373997, + "grad_norm": 2.0121831830321115, + "learning_rate": 9.572068250139443e-07, + "loss": 0.7118, + "step": 10497 + }, + { + "epoch": 0.8638551738325447, + "grad_norm": 2.0456731124801757, + "learning_rate": 9.560692181577414e-07, + "loss": 0.684, + "step": 10498 + }, + { + "epoch": 0.8639374614276898, + "grad_norm": 2.321656373078515, + "learning_rate": 9.549322537679217e-07, + "loss": 0.7013, + "step": 10499 + }, + { + "epoch": 0.8640197490228348, + "grad_norm": 1.699116341530622, + "learning_rate": 9.537959319252444e-07, + "loss": 0.7157, + "step": 10500 + }, + { + "epoch": 0.8641020366179798, + "grad_norm": 2.042824712309506, + "learning_rate": 9.5266025271044e-07, + "loss": 0.723, + "step": 10501 + }, + { + "epoch": 0.8641843242131249, + "grad_norm": 2.119626361838599, + "learning_rate": 9.515252162041788e-07, + "loss": 0.6847, + "step": 10502 + }, + { + "epoch": 0.8642666118082699, + "grad_norm": 2.111674981849815, + "learning_rate": 9.503908224870984e-07, + "loss": 0.7055, + "step": 10503 + }, + { + "epoch": 0.864348899403415, + "grad_norm": 2.005830492401134, + "learning_rate": 9.492570716397809e-07, + "loss": 0.6849, + "step": 10504 + }, + { + "epoch": 0.8644311869985599, + "grad_norm": 1.8124946730078475, + "learning_rate": 9.481239637427663e-07, + "loss": 0.7371, + "step": 10505 + }, + { + "epoch": 0.864513474593705, + "grad_norm": 2.07135898952424, + "learning_rate": 9.469914988765471e-07, + "loss": 0.7196, + "step": 10506 + }, + { + "epoch": 0.86459576218885, + "grad_norm": 1.6438524950059663, + "learning_rate": 9.458596771215756e-07, + "loss": 0.6994, + "step": 10507 + }, + { + "epoch": 0.8646780497839951, + "grad_norm": 1.814288362322922, + "learning_rate": 9.447284985582516e-07, + "loss": 0.7099, + "step": 10508 + }, + { + "epoch": 0.86476033737914, + "grad_norm": 1.666060465377452, + "learning_rate": 9.435979632669346e-07, + "loss": 0.7145, + "step": 10509 + }, + { + "epoch": 0.8648426249742851, + "grad_norm": 2.0579302275060303, + "learning_rate": 9.424680713279344e-07, + "loss": 0.7097, + "step": 10510 + }, + { + "epoch": 0.8649249125694302, + "grad_norm": 0.42398248032113844, + "learning_rate": 9.41338822821517e-07, + "loss": 0.4606, + "step": 10511 + }, + { + "epoch": 0.8650072001645752, + "grad_norm": 0.40985375273418945, + "learning_rate": 9.402102178279005e-07, + "loss": 0.4667, + "step": 10512 + }, + { + "epoch": 0.8650894877597203, + "grad_norm": 1.885057355480188, + "learning_rate": 9.390822564272639e-07, + "loss": 0.7133, + "step": 10513 + }, + { + "epoch": 0.8651717753548652, + "grad_norm": 1.74180290369534, + "learning_rate": 9.3795493869973e-07, + "loss": 0.6838, + "step": 10514 + }, + { + "epoch": 0.8652540629500103, + "grad_norm": 2.108952628293608, + "learning_rate": 9.368282647253879e-07, + "loss": 0.7259, + "step": 10515 + }, + { + "epoch": 0.8653363505451553, + "grad_norm": 1.9493775009327556, + "learning_rate": 9.357022345842704e-07, + "loss": 0.6523, + "step": 10516 + }, + { + "epoch": 0.8654186381403004, + "grad_norm": 2.2812163485926784, + "learning_rate": 9.345768483563699e-07, + "loss": 0.706, + "step": 10517 + }, + { + "epoch": 0.8655009257354453, + "grad_norm": 2.2526214155839357, + "learning_rate": 9.334521061216306e-07, + "loss": 0.7359, + "step": 10518 + }, + { + "epoch": 0.8655832133305904, + "grad_norm": 2.4029694160618535, + "learning_rate": 9.323280079599561e-07, + "loss": 0.7249, + "step": 10519 + }, + { + "epoch": 0.8656655009257355, + "grad_norm": 0.40247156612321944, + "learning_rate": 9.312045539511961e-07, + "loss": 0.4478, + "step": 10520 + }, + { + "epoch": 0.8657477885208805, + "grad_norm": 2.027205166285737, + "learning_rate": 9.300817441751631e-07, + "loss": 0.7132, + "step": 10521 + }, + { + "epoch": 0.8658300761160255, + "grad_norm": 3.3820505068866353, + "learning_rate": 9.289595787116168e-07, + "loss": 0.699, + "step": 10522 + }, + { + "epoch": 0.8659123637111705, + "grad_norm": 2.1417260547554395, + "learning_rate": 9.278380576402768e-07, + "loss": 0.7203, + "step": 10523 + }, + { + "epoch": 0.8659946513063156, + "grad_norm": 1.5826568864497834, + "learning_rate": 9.267171810408093e-07, + "loss": 0.6889, + "step": 10524 + }, + { + "epoch": 0.8660769389014606, + "grad_norm": 1.7096506265239242, + "learning_rate": 9.255969489928451e-07, + "loss": 0.7037, + "step": 10525 + }, + { + "epoch": 0.8661592264966056, + "grad_norm": 1.5494314962415914, + "learning_rate": 9.244773615759583e-07, + "loss": 0.7046, + "step": 10526 + }, + { + "epoch": 0.8662415140917507, + "grad_norm": 2.549071212945872, + "learning_rate": 9.233584188696887e-07, + "loss": 0.7257, + "step": 10527 + }, + { + "epoch": 0.8663238016868957, + "grad_norm": 1.968628264621553, + "learning_rate": 9.222401209535215e-07, + "loss": 0.7159, + "step": 10528 + }, + { + "epoch": 0.8664060892820408, + "grad_norm": 2.038892671153333, + "learning_rate": 9.211224679068976e-07, + "loss": 0.7434, + "step": 10529 + }, + { + "epoch": 0.8664883768771857, + "grad_norm": 1.6213887096284394, + "learning_rate": 9.200054598092112e-07, + "loss": 0.6992, + "step": 10530 + }, + { + "epoch": 0.8665706644723308, + "grad_norm": 0.4204604884405205, + "learning_rate": 9.18889096739819e-07, + "loss": 0.4793, + "step": 10531 + }, + { + "epoch": 0.8666529520674758, + "grad_norm": 2.1429670665912495, + "learning_rate": 9.177733787780197e-07, + "loss": 0.7189, + "step": 10532 + }, + { + "epoch": 0.8667352396626209, + "grad_norm": 2.158614953929338, + "learning_rate": 9.166583060030776e-07, + "loss": 0.7213, + "step": 10533 + }, + { + "epoch": 0.8668175272577658, + "grad_norm": 2.5079387730588505, + "learning_rate": 9.155438784942017e-07, + "loss": 0.7028, + "step": 10534 + }, + { + "epoch": 0.8668998148529109, + "grad_norm": 2.6173804933502676, + "learning_rate": 9.144300963305607e-07, + "loss": 0.7401, + "step": 10535 + }, + { + "epoch": 0.866982102448056, + "grad_norm": 0.40722003432561793, + "learning_rate": 9.133169595912727e-07, + "loss": 0.4868, + "step": 10536 + }, + { + "epoch": 0.867064390043201, + "grad_norm": 2.1368376609249076, + "learning_rate": 9.122044683554188e-07, + "loss": 0.7159, + "step": 10537 + }, + { + "epoch": 0.867146677638346, + "grad_norm": 2.9185645454187856, + "learning_rate": 9.110926227020234e-07, + "loss": 0.6923, + "step": 10538 + }, + { + "epoch": 0.867228965233491, + "grad_norm": 1.8677540684827667, + "learning_rate": 9.099814227100745e-07, + "loss": 0.7206, + "step": 10539 + }, + { + "epoch": 0.8673112528286361, + "grad_norm": 1.9201799423867383, + "learning_rate": 9.088708684585079e-07, + "loss": 0.7064, + "step": 10540 + }, + { + "epoch": 0.8673935404237811, + "grad_norm": 2.1311208332076697, + "learning_rate": 9.07760960026216e-07, + "loss": 0.7003, + "step": 10541 + }, + { + "epoch": 0.8674758280189262, + "grad_norm": 1.9177259992155546, + "learning_rate": 9.066516974920425e-07, + "loss": 0.7075, + "step": 10542 + }, + { + "epoch": 0.8675581156140711, + "grad_norm": 1.8417699619641557, + "learning_rate": 9.055430809347909e-07, + "loss": 0.6915, + "step": 10543 + }, + { + "epoch": 0.8676404032092162, + "grad_norm": 3.1465419792621088, + "learning_rate": 9.044351104332139e-07, + "loss": 0.7279, + "step": 10544 + }, + { + "epoch": 0.8677226908043613, + "grad_norm": 0.438923945045461, + "learning_rate": 9.033277860660217e-07, + "loss": 0.4909, + "step": 10545 + }, + { + "epoch": 0.8678049783995063, + "grad_norm": 2.127123516299765, + "learning_rate": 9.022211079118748e-07, + "loss": 0.7285, + "step": 10546 + }, + { + "epoch": 0.8678872659946513, + "grad_norm": 1.8598715961040533, + "learning_rate": 9.011150760493914e-07, + "loss": 0.7525, + "step": 10547 + }, + { + "epoch": 0.8679695535897963, + "grad_norm": 1.962043728044906, + "learning_rate": 9.000096905571398e-07, + "loss": 0.7134, + "step": 10548 + }, + { + "epoch": 0.8680518411849414, + "grad_norm": 2.0530591932999758, + "learning_rate": 8.989049515136483e-07, + "loss": 0.7167, + "step": 10549 + }, + { + "epoch": 0.8681341287800864, + "grad_norm": 0.440709634079969, + "learning_rate": 8.978008589973919e-07, + "loss": 0.456, + "step": 10550 + }, + { + "epoch": 0.8682164163752314, + "grad_norm": 0.40018257932870077, + "learning_rate": 8.966974130868078e-07, + "loss": 0.4579, + "step": 10551 + }, + { + "epoch": 0.8682987039703765, + "grad_norm": 0.4211855834030983, + "learning_rate": 8.9559461386028e-07, + "loss": 0.4786, + "step": 10552 + }, + { + "epoch": 0.8683809915655215, + "grad_norm": 1.7571766103311819, + "learning_rate": 8.944924613961514e-07, + "loss": 0.7003, + "step": 10553 + }, + { + "epoch": 0.8684632791606666, + "grad_norm": 1.8734042402251894, + "learning_rate": 8.933909557727139e-07, + "loss": 0.7224, + "step": 10554 + }, + { + "epoch": 0.8685455667558115, + "grad_norm": 2.211964816311203, + "learning_rate": 8.922900970682213e-07, + "loss": 0.6911, + "step": 10555 + }, + { + "epoch": 0.8686278543509566, + "grad_norm": 0.41256330034894023, + "learning_rate": 8.911898853608724e-07, + "loss": 0.4694, + "step": 10556 + }, + { + "epoch": 0.8687101419461016, + "grad_norm": 2.550735232946453, + "learning_rate": 8.900903207288292e-07, + "loss": 0.7324, + "step": 10557 + }, + { + "epoch": 0.8687924295412467, + "grad_norm": 1.6315126697808333, + "learning_rate": 8.889914032502e-07, + "loss": 0.7222, + "step": 10558 + }, + { + "epoch": 0.8688747171363916, + "grad_norm": 1.993781824274365, + "learning_rate": 8.878931330030505e-07, + "loss": 0.7043, + "step": 10559 + }, + { + "epoch": 0.8689570047315367, + "grad_norm": 2.685102008283424, + "learning_rate": 8.86795510065398e-07, + "loss": 0.7244, + "step": 10560 + }, + { + "epoch": 0.8690392923266818, + "grad_norm": 1.9013440611997026, + "learning_rate": 8.856985345152202e-07, + "loss": 0.7079, + "step": 10561 + }, + { + "epoch": 0.8691215799218268, + "grad_norm": 1.8818287679945123, + "learning_rate": 8.846022064304405e-07, + "loss": 0.7275, + "step": 10562 + }, + { + "epoch": 0.8692038675169718, + "grad_norm": 2.0033888657104115, + "learning_rate": 8.835065258889464e-07, + "loss": 0.7204, + "step": 10563 + }, + { + "epoch": 0.8692861551121168, + "grad_norm": 2.379559876113967, + "learning_rate": 8.824114929685646e-07, + "loss": 0.7196, + "step": 10564 + }, + { + "epoch": 0.8693684427072619, + "grad_norm": 1.8618351239888955, + "learning_rate": 8.813171077470905e-07, + "loss": 0.6975, + "step": 10565 + }, + { + "epoch": 0.8694507303024069, + "grad_norm": 4.418098812079655, + "learning_rate": 8.802233703022645e-07, + "loss": 0.6974, + "step": 10566 + }, + { + "epoch": 0.869533017897552, + "grad_norm": 2.7215701152170366, + "learning_rate": 8.791302807117874e-07, + "loss": 0.7135, + "step": 10567 + }, + { + "epoch": 0.8696153054926969, + "grad_norm": 1.9378933176335216, + "learning_rate": 8.780378390533062e-07, + "loss": 0.7055, + "step": 10568 + }, + { + "epoch": 0.869697593087842, + "grad_norm": 2.350312394873924, + "learning_rate": 8.769460454044332e-07, + "loss": 0.7432, + "step": 10569 + }, + { + "epoch": 0.8697798806829871, + "grad_norm": 2.2444306643138954, + "learning_rate": 8.758548998427185e-07, + "loss": 0.7411, + "step": 10570 + }, + { + "epoch": 0.8698621682781321, + "grad_norm": 2.0721606605649514, + "learning_rate": 8.747644024456814e-07, + "loss": 0.7165, + "step": 10571 + }, + { + "epoch": 0.8699444558732771, + "grad_norm": 2.7333801442274854, + "learning_rate": 8.736745532907854e-07, + "loss": 0.6965, + "step": 10572 + }, + { + "epoch": 0.8700267434684221, + "grad_norm": 0.40421999877265474, + "learning_rate": 8.725853524554562e-07, + "loss": 0.4885, + "step": 10573 + }, + { + "epoch": 0.8701090310635672, + "grad_norm": 2.2085405953978774, + "learning_rate": 8.714968000170665e-07, + "loss": 0.7036, + "step": 10574 + }, + { + "epoch": 0.8701913186587122, + "grad_norm": 1.7304918787690051, + "learning_rate": 8.704088960529455e-07, + "loss": 0.7228, + "step": 10575 + }, + { + "epoch": 0.8702736062538572, + "grad_norm": 1.8687020852372451, + "learning_rate": 8.693216406403726e-07, + "loss": 0.7017, + "step": 10576 + }, + { + "epoch": 0.8703558938490022, + "grad_norm": 2.0371153566500686, + "learning_rate": 8.682350338565914e-07, + "loss": 0.692, + "step": 10577 + }, + { + "epoch": 0.8704381814441473, + "grad_norm": 1.8818335731691762, + "learning_rate": 8.67149075778787e-07, + "loss": 0.7103, + "step": 10578 + }, + { + "epoch": 0.8705204690392924, + "grad_norm": 2.1163680613299545, + "learning_rate": 8.660637664841076e-07, + "loss": 0.7151, + "step": 10579 + }, + { + "epoch": 0.8706027566344373, + "grad_norm": 1.8168728310145434, + "learning_rate": 8.649791060496515e-07, + "loss": 0.6935, + "step": 10580 + }, + { + "epoch": 0.8706850442295824, + "grad_norm": 2.2483867036310548, + "learning_rate": 8.638950945524705e-07, + "loss": 0.7046, + "step": 10581 + }, + { + "epoch": 0.8707673318247274, + "grad_norm": 0.42249146095275697, + "learning_rate": 8.628117320695683e-07, + "loss": 0.4742, + "step": 10582 + }, + { + "epoch": 0.8708496194198725, + "grad_norm": 1.8274568739475838, + "learning_rate": 8.617290186779104e-07, + "loss": 0.7411, + "step": 10583 + }, + { + "epoch": 0.8709319070150174, + "grad_norm": 1.8906049982750117, + "learning_rate": 8.60646954454406e-07, + "loss": 0.6722, + "step": 10584 + }, + { + "epoch": 0.8710141946101625, + "grad_norm": 1.8731542189996795, + "learning_rate": 8.595655394759284e-07, + "loss": 0.7054, + "step": 10585 + }, + { + "epoch": 0.8710964822053076, + "grad_norm": 2.1676124735218982, + "learning_rate": 8.584847738192958e-07, + "loss": 0.7176, + "step": 10586 + }, + { + "epoch": 0.8711787698004526, + "grad_norm": 3.0107898372178195, + "learning_rate": 8.57404657561286e-07, + "loss": 0.7146, + "step": 10587 + }, + { + "epoch": 0.8712610573955976, + "grad_norm": 2.229268325569716, + "learning_rate": 8.563251907786263e-07, + "loss": 0.7272, + "step": 10588 + }, + { + "epoch": 0.8713433449907426, + "grad_norm": 1.9812948461439828, + "learning_rate": 8.552463735480032e-07, + "loss": 0.6665, + "step": 10589 + }, + { + "epoch": 0.8714256325858877, + "grad_norm": 1.8093161900128114, + "learning_rate": 8.54168205946051e-07, + "loss": 0.7157, + "step": 10590 + }, + { + "epoch": 0.8715079201810327, + "grad_norm": 2.764591695134947, + "learning_rate": 8.53090688049365e-07, + "loss": 0.7041, + "step": 10591 + }, + { + "epoch": 0.8715902077761778, + "grad_norm": 1.8691076471217276, + "learning_rate": 8.520138199344874e-07, + "loss": 0.6906, + "step": 10592 + }, + { + "epoch": 0.8716724953713227, + "grad_norm": 0.42630691661716597, + "learning_rate": 8.509376016779191e-07, + "loss": 0.4953, + "step": 10593 + }, + { + "epoch": 0.8717547829664678, + "grad_norm": 0.4125998605327206, + "learning_rate": 8.498620333561092e-07, + "loss": 0.4367, + "step": 10594 + }, + { + "epoch": 0.8718370705616129, + "grad_norm": 1.8747628862017245, + "learning_rate": 8.487871150454685e-07, + "loss": 0.7382, + "step": 10595 + }, + { + "epoch": 0.8719193581567579, + "grad_norm": 1.8512323095423129, + "learning_rate": 8.47712846822355e-07, + "loss": 0.7023, + "step": 10596 + }, + { + "epoch": 0.8720016457519029, + "grad_norm": 3.0302091611469524, + "learning_rate": 8.466392287630854e-07, + "loss": 0.7229, + "step": 10597 + }, + { + "epoch": 0.8720839333470479, + "grad_norm": 0.4414209077608377, + "learning_rate": 8.455662609439263e-07, + "loss": 0.4872, + "step": 10598 + }, + { + "epoch": 0.872166220942193, + "grad_norm": 1.8433766929003115, + "learning_rate": 8.444939434411003e-07, + "loss": 0.7136, + "step": 10599 + }, + { + "epoch": 0.872248508537338, + "grad_norm": 2.0893283527916653, + "learning_rate": 8.434222763307798e-07, + "loss": 0.697, + "step": 10600 + }, + { + "epoch": 0.872330796132483, + "grad_norm": 1.7031708344618166, + "learning_rate": 8.423512596890993e-07, + "loss": 0.6934, + "step": 10601 + }, + { + "epoch": 0.872413083727628, + "grad_norm": 0.41546660598897217, + "learning_rate": 8.412808935921379e-07, + "loss": 0.4533, + "step": 10602 + }, + { + "epoch": 0.8724953713227731, + "grad_norm": 2.1852640242523957, + "learning_rate": 8.402111781159361e-07, + "loss": 0.734, + "step": 10603 + }, + { + "epoch": 0.8725776589179182, + "grad_norm": 2.139352630703398, + "learning_rate": 8.391421133364841e-07, + "loss": 0.7217, + "step": 10604 + }, + { + "epoch": 0.8726599465130631, + "grad_norm": 2.0010861232888653, + "learning_rate": 8.380736993297256e-07, + "loss": 0.6936, + "step": 10605 + }, + { + "epoch": 0.8727422341082082, + "grad_norm": 1.8053832928975304, + "learning_rate": 8.370059361715566e-07, + "loss": 0.6932, + "step": 10606 + }, + { + "epoch": 0.8728245217033532, + "grad_norm": 1.8119232835938175, + "learning_rate": 8.359388239378318e-07, + "loss": 0.6924, + "step": 10607 + }, + { + "epoch": 0.8729068092984983, + "grad_norm": 1.8441652723099269, + "learning_rate": 8.348723627043598e-07, + "loss": 0.7306, + "step": 10608 + }, + { + "epoch": 0.8729890968936432, + "grad_norm": 0.4297003506597561, + "learning_rate": 8.338065525468974e-07, + "loss": 0.4633, + "step": 10609 + }, + { + "epoch": 0.8730713844887883, + "grad_norm": 1.7951512959489797, + "learning_rate": 8.327413935411588e-07, + "loss": 0.7068, + "step": 10610 + }, + { + "epoch": 0.8731536720839334, + "grad_norm": 2.2662504436504145, + "learning_rate": 8.316768857628077e-07, + "loss": 0.6899, + "step": 10611 + }, + { + "epoch": 0.8732359596790784, + "grad_norm": 0.40769575396777274, + "learning_rate": 8.306130292874704e-07, + "loss": 0.4561, + "step": 10612 + }, + { + "epoch": 0.8733182472742234, + "grad_norm": 2.201551958603256, + "learning_rate": 8.295498241907185e-07, + "loss": 0.6881, + "step": 10613 + }, + { + "epoch": 0.8734005348693684, + "grad_norm": 0.4142258780708879, + "learning_rate": 8.284872705480817e-07, + "loss": 0.473, + "step": 10614 + }, + { + "epoch": 0.8734828224645135, + "grad_norm": 3.1540790657886038, + "learning_rate": 8.27425368435042e-07, + "loss": 0.6856, + "step": 10615 + }, + { + "epoch": 0.8735651100596585, + "grad_norm": 1.7257380759754544, + "learning_rate": 8.263641179270343e-07, + "loss": 0.7264, + "step": 10616 + }, + { + "epoch": 0.8736473976548036, + "grad_norm": 2.5163380331667846, + "learning_rate": 8.253035190994474e-07, + "loss": 0.6866, + "step": 10617 + }, + { + "epoch": 0.8737296852499485, + "grad_norm": 1.8341702575528716, + "learning_rate": 8.242435720276276e-07, + "loss": 0.6803, + "step": 10618 + }, + { + "epoch": 0.8738119728450936, + "grad_norm": 2.0231573865529247, + "learning_rate": 8.23184276786867e-07, + "loss": 0.7308, + "step": 10619 + }, + { + "epoch": 0.8738942604402387, + "grad_norm": 0.40585662954547047, + "learning_rate": 8.22125633452422e-07, + "loss": 0.4904, + "step": 10620 + }, + { + "epoch": 0.8739765480353837, + "grad_norm": 0.4226382474839874, + "learning_rate": 8.210676420994945e-07, + "loss": 0.478, + "step": 10621 + }, + { + "epoch": 0.8740588356305287, + "grad_norm": 1.865029378015269, + "learning_rate": 8.200103028032413e-07, + "loss": 0.72, + "step": 10622 + }, + { + "epoch": 0.8741411232256737, + "grad_norm": 1.9441507956391042, + "learning_rate": 8.189536156387734e-07, + "loss": 0.7083, + "step": 10623 + }, + { + "epoch": 0.8742234108208188, + "grad_norm": 0.4334846311450764, + "learning_rate": 8.178975806811607e-07, + "loss": 0.4474, + "step": 10624 + }, + { + "epoch": 0.8743056984159638, + "grad_norm": 2.125200911434388, + "learning_rate": 8.168421980054164e-07, + "loss": 0.7031, + "step": 10625 + }, + { + "epoch": 0.8743879860111088, + "grad_norm": 0.4266933336939788, + "learning_rate": 8.157874676865174e-07, + "loss": 0.4888, + "step": 10626 + }, + { + "epoch": 0.8744702736062538, + "grad_norm": 1.8055419181023802, + "learning_rate": 8.147333897993893e-07, + "loss": 0.7074, + "step": 10627 + }, + { + "epoch": 0.8745525612013989, + "grad_norm": 2.233306943871246, + "learning_rate": 8.13679964418912e-07, + "loss": 0.6964, + "step": 10628 + }, + { + "epoch": 0.874634848796544, + "grad_norm": 2.4228670167008857, + "learning_rate": 8.126271916199158e-07, + "loss": 0.6895, + "step": 10629 + }, + { + "epoch": 0.8747171363916889, + "grad_norm": 2.7355111821858054, + "learning_rate": 8.11575071477193e-07, + "loss": 0.6966, + "step": 10630 + }, + { + "epoch": 0.874799423986834, + "grad_norm": 1.936260353956121, + "learning_rate": 8.105236040654818e-07, + "loss": 0.6918, + "step": 10631 + }, + { + "epoch": 0.874881711581979, + "grad_norm": 1.998184028654403, + "learning_rate": 8.094727894594778e-07, + "loss": 0.7197, + "step": 10632 + }, + { + "epoch": 0.8749639991771241, + "grad_norm": 1.7823210596495294, + "learning_rate": 8.084226277338292e-07, + "loss": 0.7048, + "step": 10633 + }, + { + "epoch": 0.875046286772269, + "grad_norm": 2.0479432080390922, + "learning_rate": 8.073731189631373e-07, + "loss": 0.7062, + "step": 10634 + }, + { + "epoch": 0.8751285743674141, + "grad_norm": 2.313793074404136, + "learning_rate": 8.063242632219559e-07, + "loss": 0.7165, + "step": 10635 + }, + { + "epoch": 0.8752108619625592, + "grad_norm": 2.4743254035074735, + "learning_rate": 8.052760605847976e-07, + "loss": 0.7147, + "step": 10636 + }, + { + "epoch": 0.8752931495577042, + "grad_norm": 2.0199140733295304, + "learning_rate": 8.042285111261205e-07, + "loss": 0.7076, + "step": 10637 + }, + { + "epoch": 0.8753754371528492, + "grad_norm": 1.7199382254605668, + "learning_rate": 8.031816149203464e-07, + "loss": 0.7021, + "step": 10638 + }, + { + "epoch": 0.8754577247479942, + "grad_norm": 2.256899252260317, + "learning_rate": 8.021353720418424e-07, + "loss": 0.7022, + "step": 10639 + }, + { + "epoch": 0.8755400123431393, + "grad_norm": 1.8407829758811847, + "learning_rate": 8.010897825649311e-07, + "loss": 0.717, + "step": 10640 + }, + { + "epoch": 0.8756222999382843, + "grad_norm": 1.8053067659350364, + "learning_rate": 8.000448465638888e-07, + "loss": 0.6927, + "step": 10641 + }, + { + "epoch": 0.8757045875334294, + "grad_norm": 2.343880802789923, + "learning_rate": 7.990005641129484e-07, + "loss": 0.7559, + "step": 10642 + }, + { + "epoch": 0.8757868751285743, + "grad_norm": 2.117018458126865, + "learning_rate": 7.979569352862915e-07, + "loss": 0.7318, + "step": 10643 + }, + { + "epoch": 0.8758691627237194, + "grad_norm": 2.080214906656237, + "learning_rate": 7.969139601580622e-07, + "loss": 0.6925, + "step": 10644 + }, + { + "epoch": 0.8759514503188645, + "grad_norm": 3.6077955860472284, + "learning_rate": 7.958716388023424e-07, + "loss": 0.6965, + "step": 10645 + }, + { + "epoch": 0.8760337379140095, + "grad_norm": 1.8457647541381268, + "learning_rate": 7.948299712931829e-07, + "loss": 0.716, + "step": 10646 + }, + { + "epoch": 0.8761160255091545, + "grad_norm": 1.9450208485403444, + "learning_rate": 7.937889577045799e-07, + "loss": 0.7007, + "step": 10647 + }, + { + "epoch": 0.8761983131042995, + "grad_norm": 0.4402726953337924, + "learning_rate": 7.927485981104877e-07, + "loss": 0.46, + "step": 10648 + }, + { + "epoch": 0.8762806006994446, + "grad_norm": 1.9625184068414032, + "learning_rate": 7.917088925848082e-07, + "loss": 0.7056, + "step": 10649 + }, + { + "epoch": 0.8763628882945896, + "grad_norm": 2.441795530939994, + "learning_rate": 7.906698412014068e-07, + "loss": 0.6943, + "step": 10650 + }, + { + "epoch": 0.8764451758897346, + "grad_norm": 1.93475012635238, + "learning_rate": 7.89631444034088e-07, + "loss": 0.7124, + "step": 10651 + }, + { + "epoch": 0.8765274634848796, + "grad_norm": 0.41737929684607183, + "learning_rate": 7.885937011566225e-07, + "loss": 0.4776, + "step": 10652 + }, + { + "epoch": 0.8766097510800247, + "grad_norm": 1.8864007218771754, + "learning_rate": 7.875566126427281e-07, + "loss": 0.7181, + "step": 10653 + }, + { + "epoch": 0.8766920386751698, + "grad_norm": 2.0898731896935883, + "learning_rate": 7.865201785660803e-07, + "loss": 0.7249, + "step": 10654 + }, + { + "epoch": 0.8767743262703147, + "grad_norm": 2.6126965689857604, + "learning_rate": 7.854843990003036e-07, + "loss": 0.7142, + "step": 10655 + }, + { + "epoch": 0.8768566138654598, + "grad_norm": 2.420142849638488, + "learning_rate": 7.844492740189791e-07, + "loss": 0.7002, + "step": 10656 + }, + { + "epoch": 0.8769389014606048, + "grad_norm": 3.3611696483270874, + "learning_rate": 7.83414803695639e-07, + "loss": 0.6712, + "step": 10657 + }, + { + "epoch": 0.8770211890557499, + "grad_norm": 2.440006896932615, + "learning_rate": 7.823809881037725e-07, + "loss": 0.6938, + "step": 10658 + }, + { + "epoch": 0.8771034766508948, + "grad_norm": 2.3224285755334306, + "learning_rate": 7.813478273168185e-07, + "loss": 0.7035, + "step": 10659 + }, + { + "epoch": 0.8771857642460399, + "grad_norm": 2.187027381425663, + "learning_rate": 7.803153214081726e-07, + "loss": 0.7082, + "step": 10660 + }, + { + "epoch": 0.877268051841185, + "grad_norm": 2.01663939691991, + "learning_rate": 7.79283470451182e-07, + "loss": 0.7224, + "step": 10661 + }, + { + "epoch": 0.87735033943633, + "grad_norm": 2.035929276141439, + "learning_rate": 7.782522745191467e-07, + "loss": 0.6936, + "step": 10662 + }, + { + "epoch": 0.877432627031475, + "grad_norm": 2.1481283450416777, + "learning_rate": 7.772217336853205e-07, + "loss": 0.6979, + "step": 10663 + }, + { + "epoch": 0.87751491462662, + "grad_norm": 1.7004379034655963, + "learning_rate": 7.761918480229147e-07, + "loss": 0.7366, + "step": 10664 + }, + { + "epoch": 0.8775972022217651, + "grad_norm": 2.0187333883557663, + "learning_rate": 7.751626176050864e-07, + "loss": 0.7059, + "step": 10665 + }, + { + "epoch": 0.8776794898169101, + "grad_norm": 0.41395163370963983, + "learning_rate": 7.741340425049548e-07, + "loss": 0.4723, + "step": 10666 + }, + { + "epoch": 0.8777617774120551, + "grad_norm": 0.4103064624255899, + "learning_rate": 7.731061227955861e-07, + "loss": 0.4519, + "step": 10667 + }, + { + "epoch": 0.8778440650072001, + "grad_norm": 0.3770910158585888, + "learning_rate": 7.720788585500028e-07, + "loss": 0.4615, + "step": 10668 + }, + { + "epoch": 0.8779263526023452, + "grad_norm": 0.41175553044747626, + "learning_rate": 7.710522498411766e-07, + "loss": 0.4491, + "step": 10669 + }, + { + "epoch": 0.8780086401974903, + "grad_norm": 1.9641673580613714, + "learning_rate": 7.700262967420413e-07, + "loss": 0.712, + "step": 10670 + }, + { + "epoch": 0.8780909277926353, + "grad_norm": 1.7373614235370776, + "learning_rate": 7.690009993254755e-07, + "loss": 0.7394, + "step": 10671 + }, + { + "epoch": 0.8781732153877803, + "grad_norm": 1.7520908000499285, + "learning_rate": 7.679763576643173e-07, + "loss": 0.6987, + "step": 10672 + }, + { + "epoch": 0.8782555029829253, + "grad_norm": 2.3482585103872022, + "learning_rate": 7.669523718313543e-07, + "loss": 0.6965, + "step": 10673 + }, + { + "epoch": 0.8783377905780704, + "grad_norm": 2.6536168679549177, + "learning_rate": 7.659290418993282e-07, + "loss": 0.7181, + "step": 10674 + }, + { + "epoch": 0.8784200781732154, + "grad_norm": 0.4318798218661134, + "learning_rate": 7.649063679409341e-07, + "loss": 0.4556, + "step": 10675 + }, + { + "epoch": 0.8785023657683604, + "grad_norm": 2.1100499874242815, + "learning_rate": 7.63884350028824e-07, + "loss": 0.6876, + "step": 10676 + }, + { + "epoch": 0.8785846533635054, + "grad_norm": 2.847456234922006, + "learning_rate": 7.628629882355964e-07, + "loss": 0.7352, + "step": 10677 + }, + { + "epoch": 0.8786669409586505, + "grad_norm": 2.26623493039629, + "learning_rate": 7.618422826338123e-07, + "loss": 0.7169, + "step": 10678 + }, + { + "epoch": 0.8787492285537956, + "grad_norm": 0.4388560172495084, + "learning_rate": 7.60822233295977e-07, + "loss": 0.48, + "step": 10679 + }, + { + "epoch": 0.8788315161489405, + "grad_norm": 2.5351800780427247, + "learning_rate": 7.598028402945545e-07, + "loss": 0.7099, + "step": 10680 + }, + { + "epoch": 0.8789138037440856, + "grad_norm": 2.2952738859182125, + "learning_rate": 7.587841037019595e-07, + "loss": 0.7222, + "step": 10681 + }, + { + "epoch": 0.8789960913392306, + "grad_norm": 1.8524077088797395, + "learning_rate": 7.577660235905649e-07, + "loss": 0.694, + "step": 10682 + }, + { + "epoch": 0.8790783789343757, + "grad_norm": 2.143397648308029, + "learning_rate": 7.567486000326885e-07, + "loss": 0.7164, + "step": 10683 + }, + { + "epoch": 0.8791606665295206, + "grad_norm": 2.6380355882639392, + "learning_rate": 7.557318331006114e-07, + "loss": 0.7175, + "step": 10684 + }, + { + "epoch": 0.8792429541246657, + "grad_norm": 1.7216884100135748, + "learning_rate": 7.547157228665603e-07, + "loss": 0.737, + "step": 10685 + }, + { + "epoch": 0.8793252417198107, + "grad_norm": 2.1542606190416618, + "learning_rate": 7.537002694027196e-07, + "loss": 0.6853, + "step": 10686 + }, + { + "epoch": 0.8794075293149558, + "grad_norm": 2.7220116257385185, + "learning_rate": 7.526854727812216e-07, + "loss": 0.7355, + "step": 10687 + }, + { + "epoch": 0.8794898169101008, + "grad_norm": 0.3977545602886836, + "learning_rate": 7.516713330741609e-07, + "loss": 0.4556, + "step": 10688 + }, + { + "epoch": 0.8795721045052458, + "grad_norm": 2.0001302239477536, + "learning_rate": 7.506578503535766e-07, + "loss": 0.6937, + "step": 10689 + }, + { + "epoch": 0.8796543921003909, + "grad_norm": 1.9796027566242897, + "learning_rate": 7.496450246914677e-07, + "loss": 0.7322, + "step": 10690 + }, + { + "epoch": 0.8797366796955359, + "grad_norm": 1.6627131107571311, + "learning_rate": 7.486328561597833e-07, + "loss": 0.7221, + "step": 10691 + }, + { + "epoch": 0.8798189672906809, + "grad_norm": 1.856745778039713, + "learning_rate": 7.476213448304248e-07, + "loss": 0.716, + "step": 10692 + }, + { + "epoch": 0.8799012548858259, + "grad_norm": 0.45562311817414447, + "learning_rate": 7.466104907752481e-07, + "loss": 0.4771, + "step": 10693 + }, + { + "epoch": 0.879983542480971, + "grad_norm": 2.0997396860521063, + "learning_rate": 7.456002940660645e-07, + "loss": 0.7454, + "step": 10694 + }, + { + "epoch": 0.880065830076116, + "grad_norm": 1.9548590512310082, + "learning_rate": 7.445907547746345e-07, + "loss": 0.696, + "step": 10695 + }, + { + "epoch": 0.8801481176712611, + "grad_norm": 1.863387869593898, + "learning_rate": 7.435818729726763e-07, + "loss": 0.7194, + "step": 10696 + }, + { + "epoch": 0.8802304052664061, + "grad_norm": 2.1320729252854207, + "learning_rate": 7.425736487318591e-07, + "loss": 0.7181, + "step": 10697 + }, + { + "epoch": 0.8803126928615511, + "grad_norm": 2.1619191463365377, + "learning_rate": 7.415660821238047e-07, + "loss": 0.7107, + "step": 10698 + }, + { + "epoch": 0.8803949804566962, + "grad_norm": 2.4576958760917105, + "learning_rate": 7.405591732200878e-07, + "loss": 0.7062, + "step": 10699 + }, + { + "epoch": 0.8804772680518412, + "grad_norm": 2.066863581023789, + "learning_rate": 7.395529220922392e-07, + "loss": 0.72, + "step": 10700 + }, + { + "epoch": 0.8805595556469862, + "grad_norm": 2.298203312325162, + "learning_rate": 7.385473288117406e-07, + "loss": 0.6991, + "step": 10701 + }, + { + "epoch": 0.8806418432421312, + "grad_norm": 0.4335079614227494, + "learning_rate": 7.375423934500303e-07, + "loss": 0.4979, + "step": 10702 + }, + { + "epoch": 0.8807241308372763, + "grad_norm": 1.8400005016366188, + "learning_rate": 7.365381160784946e-07, + "loss": 0.7034, + "step": 10703 + }, + { + "epoch": 0.8808064184324214, + "grad_norm": 1.6274270162785214, + "learning_rate": 7.355344967684763e-07, + "loss": 0.7043, + "step": 10704 + }, + { + "epoch": 0.8808887060275663, + "grad_norm": 0.435745562399412, + "learning_rate": 7.345315355912696e-07, + "loss": 0.4754, + "step": 10705 + }, + { + "epoch": 0.8809709936227114, + "grad_norm": 2.1002133457420427, + "learning_rate": 7.335292326181265e-07, + "loss": 0.7207, + "step": 10706 + }, + { + "epoch": 0.8810532812178564, + "grad_norm": 2.2558220207985977, + "learning_rate": 7.325275879202454e-07, + "loss": 0.6986, + "step": 10707 + }, + { + "epoch": 0.8811355688130015, + "grad_norm": 1.875944970454355, + "learning_rate": 7.31526601568785e-07, + "loss": 0.7172, + "step": 10708 + }, + { + "epoch": 0.8812178564081464, + "grad_norm": 1.7861970628797834, + "learning_rate": 7.305262736348528e-07, + "loss": 0.6789, + "step": 10709 + }, + { + "epoch": 0.8813001440032915, + "grad_norm": 2.3625520173112724, + "learning_rate": 7.295266041895088e-07, + "loss": 0.6858, + "step": 10710 + }, + { + "epoch": 0.8813824315984365, + "grad_norm": 1.8633388296088622, + "learning_rate": 7.285275933037672e-07, + "loss": 0.7025, + "step": 10711 + }, + { + "epoch": 0.8814647191935816, + "grad_norm": 1.944160293074244, + "learning_rate": 7.275292410486001e-07, + "loss": 0.7293, + "step": 10712 + }, + { + "epoch": 0.8815470067887266, + "grad_norm": 2.801834637948836, + "learning_rate": 7.265315474949241e-07, + "loss": 0.6859, + "step": 10713 + }, + { + "epoch": 0.8816292943838716, + "grad_norm": 0.4056879861666221, + "learning_rate": 7.255345127136204e-07, + "loss": 0.4543, + "step": 10714 + }, + { + "epoch": 0.8817115819790167, + "grad_norm": 1.899976283359679, + "learning_rate": 7.245381367755089e-07, + "loss": 0.6929, + "step": 10715 + }, + { + "epoch": 0.8817938695741617, + "grad_norm": 2.0140166563906954, + "learning_rate": 7.235424197513763e-07, + "loss": 0.7193, + "step": 10716 + }, + { + "epoch": 0.8818761571693067, + "grad_norm": 1.6678633023546663, + "learning_rate": 7.225473617119527e-07, + "loss": 0.678, + "step": 10717 + }, + { + "epoch": 0.8819584447644517, + "grad_norm": 1.8824907649846812, + "learning_rate": 7.215529627279294e-07, + "loss": 0.705, + "step": 10718 + }, + { + "epoch": 0.8820407323595968, + "grad_norm": 2.2658785872269203, + "learning_rate": 7.205592228699432e-07, + "loss": 0.7249, + "step": 10719 + }, + { + "epoch": 0.8821230199547418, + "grad_norm": 2.0381022016417396, + "learning_rate": 7.19566142208592e-07, + "loss": 0.7435, + "step": 10720 + }, + { + "epoch": 0.8822053075498869, + "grad_norm": 1.9901464399340232, + "learning_rate": 7.185737208144183e-07, + "loss": 0.6909, + "step": 10721 + }, + { + "epoch": 0.8822875951450319, + "grad_norm": 2.6008956342708918, + "learning_rate": 7.175819587579247e-07, + "loss": 0.6932, + "step": 10722 + }, + { + "epoch": 0.8823698827401769, + "grad_norm": 1.9802430705691394, + "learning_rate": 7.165908561095625e-07, + "loss": 0.7239, + "step": 10723 + }, + { + "epoch": 0.882452170335322, + "grad_norm": 2.1815344553746625, + "learning_rate": 7.156004129397409e-07, + "loss": 0.736, + "step": 10724 + }, + { + "epoch": 0.882534457930467, + "grad_norm": 2.2083114995835413, + "learning_rate": 7.146106293188171e-07, + "loss": 0.6968, + "step": 10725 + }, + { + "epoch": 0.882616745525612, + "grad_norm": 0.4527297552714102, + "learning_rate": 7.136215053171047e-07, + "loss": 0.5098, + "step": 10726 + }, + { + "epoch": 0.882699033120757, + "grad_norm": 1.7794816483472844, + "learning_rate": 7.126330410048676e-07, + "loss": 0.7315, + "step": 10727 + }, + { + "epoch": 0.8827813207159021, + "grad_norm": 2.2983746887544383, + "learning_rate": 7.116452364523285e-07, + "loss": 0.7155, + "step": 10728 + }, + { + "epoch": 0.8828636083110472, + "grad_norm": 1.9641679565739234, + "learning_rate": 7.106580917296557e-07, + "loss": 0.7172, + "step": 10729 + }, + { + "epoch": 0.8829458959061921, + "grad_norm": 2.0941389489308406, + "learning_rate": 7.096716069069765e-07, + "loss": 0.7128, + "step": 10730 + }, + { + "epoch": 0.8830281835013372, + "grad_norm": 2.334123497591664, + "learning_rate": 7.086857820543691e-07, + "loss": 0.7389, + "step": 10731 + }, + { + "epoch": 0.8831104710964822, + "grad_norm": 2.5313751004601617, + "learning_rate": 7.077006172418654e-07, + "loss": 0.7432, + "step": 10732 + }, + { + "epoch": 0.8831927586916273, + "grad_norm": 1.902709839902159, + "learning_rate": 7.067161125394462e-07, + "loss": 0.7031, + "step": 10733 + }, + { + "epoch": 0.8832750462867722, + "grad_norm": 1.69127459419308, + "learning_rate": 7.057322680170542e-07, + "loss": 0.7128, + "step": 10734 + }, + { + "epoch": 0.8833573338819173, + "grad_norm": 2.3025881157502157, + "learning_rate": 7.047490837445758e-07, + "loss": 0.6911, + "step": 10735 + }, + { + "epoch": 0.8834396214770623, + "grad_norm": 2.312065682398312, + "learning_rate": 7.037665597918586e-07, + "loss": 0.7261, + "step": 10736 + }, + { + "epoch": 0.8835219090722074, + "grad_norm": 2.21272661676425, + "learning_rate": 7.027846962286988e-07, + "loss": 0.71, + "step": 10737 + }, + { + "epoch": 0.8836041966673523, + "grad_norm": 1.8887081401546595, + "learning_rate": 7.01803493124844e-07, + "loss": 0.7183, + "step": 10738 + }, + { + "epoch": 0.8836864842624974, + "grad_norm": 1.8565854118708915, + "learning_rate": 7.008229505499986e-07, + "loss": 0.6978, + "step": 10739 + }, + { + "epoch": 0.8837687718576425, + "grad_norm": 1.9764466280652069, + "learning_rate": 6.9984306857382e-07, + "loss": 0.6971, + "step": 10740 + }, + { + "epoch": 0.8838510594527875, + "grad_norm": 2.0203757708518784, + "learning_rate": 6.988638472659148e-07, + "loss": 0.7188, + "step": 10741 + }, + { + "epoch": 0.8839333470479325, + "grad_norm": 2.201693864363604, + "learning_rate": 6.978852866958486e-07, + "loss": 0.7322, + "step": 10742 + }, + { + "epoch": 0.8840156346430775, + "grad_norm": 2.6515757681121883, + "learning_rate": 6.969073869331344e-07, + "loss": 0.7273, + "step": 10743 + }, + { + "epoch": 0.8840979222382226, + "grad_norm": 1.8724075355355927, + "learning_rate": 6.959301480472425e-07, + "loss": 0.7106, + "step": 10744 + }, + { + "epoch": 0.8841802098333676, + "grad_norm": 2.146656227803641, + "learning_rate": 6.949535701075905e-07, + "loss": 0.7174, + "step": 10745 + }, + { + "epoch": 0.8842624974285127, + "grad_norm": 2.830355596124984, + "learning_rate": 6.939776531835573e-07, + "loss": 0.7114, + "step": 10746 + }, + { + "epoch": 0.8843447850236577, + "grad_norm": 2.1331916018109545, + "learning_rate": 6.930023973444677e-07, + "loss": 0.7283, + "step": 10747 + }, + { + "epoch": 0.8844270726188027, + "grad_norm": 2.1103318776382265, + "learning_rate": 6.920278026596061e-07, + "loss": 0.6848, + "step": 10748 + }, + { + "epoch": 0.8845093602139478, + "grad_norm": 2.136063108263668, + "learning_rate": 6.910538691982016e-07, + "loss": 0.7102, + "step": 10749 + }, + { + "epoch": 0.8845916478090928, + "grad_norm": 0.4134217052309871, + "learning_rate": 6.900805970294444e-07, + "loss": 0.4512, + "step": 10750 + }, + { + "epoch": 0.8846739354042378, + "grad_norm": 0.41562314679103607, + "learning_rate": 6.891079862224692e-07, + "loss": 0.4716, + "step": 10751 + }, + { + "epoch": 0.8847562229993828, + "grad_norm": 1.7119879962379438, + "learning_rate": 6.881360368463752e-07, + "loss": 0.7073, + "step": 10752 + }, + { + "epoch": 0.8848385105945279, + "grad_norm": 2.0126026632839524, + "learning_rate": 6.871647489702026e-07, + "loss": 0.6778, + "step": 10753 + }, + { + "epoch": 0.884920798189673, + "grad_norm": 0.39611038239413343, + "learning_rate": 6.861941226629542e-07, + "loss": 0.4552, + "step": 10754 + }, + { + "epoch": 0.8850030857848179, + "grad_norm": 1.9951077170010272, + "learning_rate": 6.852241579935815e-07, + "loss": 0.6912, + "step": 10755 + }, + { + "epoch": 0.885085373379963, + "grad_norm": 2.0822260161104538, + "learning_rate": 6.842548550309857e-07, + "loss": 0.6971, + "step": 10756 + }, + { + "epoch": 0.885167660975108, + "grad_norm": 1.8091984419494385, + "learning_rate": 6.832862138440266e-07, + "loss": 0.7083, + "step": 10757 + }, + { + "epoch": 0.8852499485702531, + "grad_norm": 2.038686050444032, + "learning_rate": 6.823182345015156e-07, + "loss": 0.7067, + "step": 10758 + }, + { + "epoch": 0.885332236165398, + "grad_norm": 2.1209202776721114, + "learning_rate": 6.813509170722143e-07, + "loss": 0.6885, + "step": 10759 + }, + { + "epoch": 0.8854145237605431, + "grad_norm": 1.9772570459187804, + "learning_rate": 6.803842616248435e-07, + "loss": 0.7264, + "step": 10760 + }, + { + "epoch": 0.8854968113556881, + "grad_norm": 2.00381271543292, + "learning_rate": 6.794182682280692e-07, + "loss": 0.7122, + "step": 10761 + }, + { + "epoch": 0.8855790989508332, + "grad_norm": 2.115595749559598, + "learning_rate": 6.784529369505155e-07, + "loss": 0.7321, + "step": 10762 + }, + { + "epoch": 0.8856613865459781, + "grad_norm": 2.0134300395247355, + "learning_rate": 6.774882678607563e-07, + "loss": 0.7227, + "step": 10763 + }, + { + "epoch": 0.8857436741411232, + "grad_norm": 1.9838039041444253, + "learning_rate": 6.765242610273226e-07, + "loss": 0.7191, + "step": 10764 + }, + { + "epoch": 0.8858259617362683, + "grad_norm": 2.939302763763935, + "learning_rate": 6.755609165186927e-07, + "loss": 0.7218, + "step": 10765 + }, + { + "epoch": 0.8859082493314133, + "grad_norm": 1.910287431392845, + "learning_rate": 6.745982344033053e-07, + "loss": 0.7283, + "step": 10766 + }, + { + "epoch": 0.8859905369265583, + "grad_norm": 19.708398246886663, + "learning_rate": 6.736362147495445e-07, + "loss": 0.7161, + "step": 10767 + }, + { + "epoch": 0.8860728245217033, + "grad_norm": 0.3938769892086675, + "learning_rate": 6.726748576257525e-07, + "loss": 0.4513, + "step": 10768 + }, + { + "epoch": 0.8861551121168484, + "grad_norm": 1.675247590157387, + "learning_rate": 6.7171416310022e-07, + "loss": 0.7039, + "step": 10769 + }, + { + "epoch": 0.8862373997119934, + "grad_norm": 1.961858924268636, + "learning_rate": 6.707541312411969e-07, + "loss": 0.6989, + "step": 10770 + }, + { + "epoch": 0.8863196873071385, + "grad_norm": 2.009559763482882, + "learning_rate": 6.697947621168788e-07, + "loss": 0.688, + "step": 10771 + }, + { + "epoch": 0.8864019749022835, + "grad_norm": 1.98319470277511, + "learning_rate": 6.688360557954199e-07, + "loss": 0.7156, + "step": 10772 + }, + { + "epoch": 0.8864842624974285, + "grad_norm": 2.4009722723673863, + "learning_rate": 6.678780123449257e-07, + "loss": 0.7042, + "step": 10773 + }, + { + "epoch": 0.8865665500925736, + "grad_norm": 1.929796714676613, + "learning_rate": 6.669206318334531e-07, + "loss": 0.7232, + "step": 10774 + }, + { + "epoch": 0.8866488376877186, + "grad_norm": 2.2136731825868274, + "learning_rate": 6.659639143290109e-07, + "loss": 0.7287, + "step": 10775 + }, + { + "epoch": 0.8867311252828636, + "grad_norm": 2.0372780636402807, + "learning_rate": 6.650078598995657e-07, + "loss": 0.7015, + "step": 10776 + }, + { + "epoch": 0.8868134128780086, + "grad_norm": 1.9058430702646258, + "learning_rate": 6.640524686130312e-07, + "loss": 0.7285, + "step": 10777 + }, + { + "epoch": 0.8868957004731537, + "grad_norm": 2.87131088938466, + "learning_rate": 6.63097740537283e-07, + "loss": 0.6886, + "step": 10778 + }, + { + "epoch": 0.8869779880682988, + "grad_norm": 0.43164215072802964, + "learning_rate": 6.621436757401356e-07, + "loss": 0.484, + "step": 10779 + }, + { + "epoch": 0.8870602756634437, + "grad_norm": 2.401961550913563, + "learning_rate": 6.611902742893684e-07, + "loss": 0.7025, + "step": 10780 + }, + { + "epoch": 0.8871425632585888, + "grad_norm": 1.95328870809376, + "learning_rate": 6.60237536252708e-07, + "loss": 0.7142, + "step": 10781 + }, + { + "epoch": 0.8872248508537338, + "grad_norm": 1.9085382442914929, + "learning_rate": 6.592854616978383e-07, + "loss": 0.6739, + "step": 10782 + }, + { + "epoch": 0.8873071384488789, + "grad_norm": 0.39878970782317025, + "learning_rate": 6.583340506923897e-07, + "loss": 0.4752, + "step": 10783 + }, + { + "epoch": 0.8873894260440238, + "grad_norm": 2.6375425378298014, + "learning_rate": 6.573833033039523e-07, + "loss": 0.7143, + "step": 10784 + }, + { + "epoch": 0.8874717136391689, + "grad_norm": 2.312101852228633, + "learning_rate": 6.564332196000611e-07, + "loss": 0.6976, + "step": 10785 + }, + { + "epoch": 0.8875540012343139, + "grad_norm": 2.169714617142834, + "learning_rate": 6.554837996482133e-07, + "loss": 0.7006, + "step": 10786 + }, + { + "epoch": 0.887636288829459, + "grad_norm": 2.0730401030275774, + "learning_rate": 6.545350435158504e-07, + "loss": 0.7074, + "step": 10787 + }, + { + "epoch": 0.887718576424604, + "grad_norm": 1.9209310464808769, + "learning_rate": 6.53586951270373e-07, + "loss": 0.7165, + "step": 10788 + }, + { + "epoch": 0.887800864019749, + "grad_norm": 1.9478924923904646, + "learning_rate": 6.526395229791305e-07, + "loss": 0.7202, + "step": 10789 + }, + { + "epoch": 0.8878831516148941, + "grad_norm": 2.1904009112015412, + "learning_rate": 6.516927587094301e-07, + "loss": 0.6929, + "step": 10790 + }, + { + "epoch": 0.8879654392100391, + "grad_norm": 2.2207262136489687, + "learning_rate": 6.507466585285228e-07, + "loss": 0.7083, + "step": 10791 + }, + { + "epoch": 0.8880477268051841, + "grad_norm": 1.9024992639435776, + "learning_rate": 6.498012225036221e-07, + "loss": 0.7024, + "step": 10792 + }, + { + "epoch": 0.8881300144003291, + "grad_norm": 1.8525630327285658, + "learning_rate": 6.488564507018869e-07, + "loss": 0.6889, + "step": 10793 + }, + { + "epoch": 0.8882123019954742, + "grad_norm": 2.2448223637864952, + "learning_rate": 6.479123431904377e-07, + "loss": 0.7298, + "step": 10794 + }, + { + "epoch": 0.8882945895906192, + "grad_norm": 2.7576113551984083, + "learning_rate": 6.469689000363377e-07, + "loss": 0.7089, + "step": 10795 + }, + { + "epoch": 0.8883768771857642, + "grad_norm": 2.0198845802014778, + "learning_rate": 6.460261213066099e-07, + "loss": 0.7185, + "step": 10796 + }, + { + "epoch": 0.8884591647809092, + "grad_norm": 2.09959210222671, + "learning_rate": 6.450840070682252e-07, + "loss": 0.7174, + "step": 10797 + }, + { + "epoch": 0.8885414523760543, + "grad_norm": 1.9030506527929294, + "learning_rate": 6.441425573881133e-07, + "loss": 0.7135, + "step": 10798 + }, + { + "epoch": 0.8886237399711994, + "grad_norm": 1.9064183097936898, + "learning_rate": 6.432017723331496e-07, + "loss": 0.6864, + "step": 10799 + }, + { + "epoch": 0.8887060275663444, + "grad_norm": 2.0563696123205464, + "learning_rate": 6.422616519701686e-07, + "loss": 0.7017, + "step": 10800 + }, + { + "epoch": 0.8887883151614894, + "grad_norm": 2.387730998980557, + "learning_rate": 6.413221963659577e-07, + "loss": 0.6977, + "step": 10801 + }, + { + "epoch": 0.8888706027566344, + "grad_norm": 1.839045702414333, + "learning_rate": 6.403834055872471e-07, + "loss": 0.7146, + "step": 10802 + }, + { + "epoch": 0.8889528903517795, + "grad_norm": 2.4647337168193717, + "learning_rate": 6.394452797007333e-07, + "loss": 0.7049, + "step": 10803 + }, + { + "epoch": 0.8890351779469245, + "grad_norm": 1.7689140467620361, + "learning_rate": 6.385078187730542e-07, + "loss": 0.7137, + "step": 10804 + }, + { + "epoch": 0.8891174655420695, + "grad_norm": 0.41472366308848146, + "learning_rate": 6.375710228708109e-07, + "loss": 0.4667, + "step": 10805 + }, + { + "epoch": 0.8891997531372146, + "grad_norm": 0.4170140886849823, + "learning_rate": 6.36634892060548e-07, + "loss": 0.4568, + "step": 10806 + }, + { + "epoch": 0.8892820407323596, + "grad_norm": 2.546505852219856, + "learning_rate": 6.35699426408769e-07, + "loss": 0.6884, + "step": 10807 + }, + { + "epoch": 0.8893643283275047, + "grad_norm": 1.849637132430407, + "learning_rate": 6.34764625981924e-07, + "loss": 0.7501, + "step": 10808 + }, + { + "epoch": 0.8894466159226496, + "grad_norm": 4.676247213091344, + "learning_rate": 6.338304908464254e-07, + "loss": 0.7428, + "step": 10809 + }, + { + "epoch": 0.8895289035177947, + "grad_norm": 4.379113980654926, + "learning_rate": 6.328970210686269e-07, + "loss": 0.7063, + "step": 10810 + }, + { + "epoch": 0.8896111911129397, + "grad_norm": 2.0119055633543725, + "learning_rate": 6.319642167148455e-07, + "loss": 0.7723, + "step": 10811 + }, + { + "epoch": 0.8896934787080848, + "grad_norm": 2.058186955278663, + "learning_rate": 6.310320778513445e-07, + "loss": 0.6787, + "step": 10812 + }, + { + "epoch": 0.8897757663032297, + "grad_norm": 1.7793417461636822, + "learning_rate": 6.301006045443414e-07, + "loss": 0.7182, + "step": 10813 + }, + { + "epoch": 0.8898580538983748, + "grad_norm": 0.4020207035591105, + "learning_rate": 6.291697968600042e-07, + "loss": 0.4531, + "step": 10814 + }, + { + "epoch": 0.8899403414935199, + "grad_norm": 1.6709796161494808, + "learning_rate": 6.282396548644609e-07, + "loss": 0.6961, + "step": 10815 + }, + { + "epoch": 0.8900226290886649, + "grad_norm": 2.8378604539216954, + "learning_rate": 6.273101786237823e-07, + "loss": 0.7302, + "step": 10816 + }, + { + "epoch": 0.8901049166838099, + "grad_norm": 1.5954320662048604, + "learning_rate": 6.263813682039999e-07, + "loss": 0.6594, + "step": 10817 + }, + { + "epoch": 0.8901872042789549, + "grad_norm": 1.9282019599696203, + "learning_rate": 6.254532236710953e-07, + "loss": 0.7171, + "step": 10818 + }, + { + "epoch": 0.8902694918741, + "grad_norm": 1.877833396509543, + "learning_rate": 6.245257450910014e-07, + "loss": 0.7296, + "step": 10819 + }, + { + "epoch": 0.890351779469245, + "grad_norm": 1.8509978753255525, + "learning_rate": 6.235989325296022e-07, + "loss": 0.7053, + "step": 10820 + }, + { + "epoch": 0.89043406706439, + "grad_norm": 1.8053714100384592, + "learning_rate": 6.226727860527415e-07, + "loss": 0.7263, + "step": 10821 + }, + { + "epoch": 0.890516354659535, + "grad_norm": 2.065662922150952, + "learning_rate": 6.21747305726208e-07, + "loss": 0.7044, + "step": 10822 + }, + { + "epoch": 0.8905986422546801, + "grad_norm": 2.0582558399999167, + "learning_rate": 6.20822491615749e-07, + "loss": 0.7233, + "step": 10823 + }, + { + "epoch": 0.8906809298498252, + "grad_norm": 2.071436336746969, + "learning_rate": 6.198983437870609e-07, + "loss": 0.7134, + "step": 10824 + }, + { + "epoch": 0.8907632174449702, + "grad_norm": 3.4445955726701456, + "learning_rate": 6.189748623057923e-07, + "loss": 0.6869, + "step": 10825 + }, + { + "epoch": 0.8908455050401152, + "grad_norm": 1.9497543291443655, + "learning_rate": 6.180520472375462e-07, + "loss": 0.7012, + "step": 10826 + }, + { + "epoch": 0.8909277926352602, + "grad_norm": 2.259218525800722, + "learning_rate": 6.171298986478791e-07, + "loss": 0.6909, + "step": 10827 + }, + { + "epoch": 0.8910100802304053, + "grad_norm": 1.6572389279585633, + "learning_rate": 6.162084166022974e-07, + "loss": 0.6934, + "step": 10828 + }, + { + "epoch": 0.8910923678255503, + "grad_norm": 1.7896801827269042, + "learning_rate": 6.152876011662634e-07, + "loss": 0.7084, + "step": 10829 + }, + { + "epoch": 0.8911746554206953, + "grad_norm": 1.9554376291657967, + "learning_rate": 6.143674524051902e-07, + "loss": 0.6997, + "step": 10830 + }, + { + "epoch": 0.8912569430158404, + "grad_norm": 2.44560187888361, + "learning_rate": 6.134479703844431e-07, + "loss": 0.68, + "step": 10831 + }, + { + "epoch": 0.8913392306109854, + "grad_norm": 2.190576029487891, + "learning_rate": 6.125291551693391e-07, + "loss": 0.7031, + "step": 10832 + }, + { + "epoch": 0.8914215182061305, + "grad_norm": 1.9185571244972475, + "learning_rate": 6.116110068251524e-07, + "loss": 0.6978, + "step": 10833 + }, + { + "epoch": 0.8915038058012754, + "grad_norm": 2.473142156781821, + "learning_rate": 6.106935254171031e-07, + "loss": 0.7098, + "step": 10834 + }, + { + "epoch": 0.8915860933964205, + "grad_norm": 1.9445223720267608, + "learning_rate": 6.097767110103725e-07, + "loss": 0.7169, + "step": 10835 + }, + { + "epoch": 0.8916683809915655, + "grad_norm": 2.9913900009015104, + "learning_rate": 6.088605636700862e-07, + "loss": 0.6925, + "step": 10836 + }, + { + "epoch": 0.8917506685867106, + "grad_norm": 2.220354741456709, + "learning_rate": 6.079450834613265e-07, + "loss": 0.7211, + "step": 10837 + }, + { + "epoch": 0.8918329561818555, + "grad_norm": 1.871341126200811, + "learning_rate": 6.07030270449126e-07, + "loss": 0.724, + "step": 10838 + }, + { + "epoch": 0.8919152437770006, + "grad_norm": 1.9065596448370117, + "learning_rate": 6.061161246984749e-07, + "loss": 0.7536, + "step": 10839 + }, + { + "epoch": 0.8919975313721457, + "grad_norm": 2.8489352228710936, + "learning_rate": 6.052026462743099e-07, + "loss": 0.6845, + "step": 10840 + }, + { + "epoch": 0.8920798189672907, + "grad_norm": 1.810735247917806, + "learning_rate": 6.042898352415249e-07, + "loss": 0.707, + "step": 10841 + }, + { + "epoch": 0.8921621065624357, + "grad_norm": 0.4375216300550306, + "learning_rate": 6.033776916649636e-07, + "loss": 0.4706, + "step": 10842 + }, + { + "epoch": 0.8922443941575807, + "grad_norm": 1.7051952732364342, + "learning_rate": 6.024662156094241e-07, + "loss": 0.7262, + "step": 10843 + }, + { + "epoch": 0.8923266817527258, + "grad_norm": 2.0900408947807128, + "learning_rate": 6.015554071396546e-07, + "loss": 0.702, + "step": 10844 + }, + { + "epoch": 0.8924089693478708, + "grad_norm": 2.160306065761101, + "learning_rate": 6.006452663203588e-07, + "loss": 0.7393, + "step": 10845 + }, + { + "epoch": 0.8924912569430158, + "grad_norm": 2.24653845458936, + "learning_rate": 5.997357932161906e-07, + "loss": 0.7131, + "step": 10846 + }, + { + "epoch": 0.8925735445381608, + "grad_norm": 2.4428039656494627, + "learning_rate": 5.988269878917586e-07, + "loss": 0.7095, + "step": 10847 + }, + { + "epoch": 0.8926558321333059, + "grad_norm": 1.7719267377927523, + "learning_rate": 5.979188504116229e-07, + "loss": 0.7173, + "step": 10848 + }, + { + "epoch": 0.892738119728451, + "grad_norm": 1.7513444067065906, + "learning_rate": 5.970113808402967e-07, + "loss": 0.6937, + "step": 10849 + }, + { + "epoch": 0.892820407323596, + "grad_norm": 2.1258661793276463, + "learning_rate": 5.961045792422416e-07, + "loss": 0.7035, + "step": 10850 + }, + { + "epoch": 0.892902694918741, + "grad_norm": 2.186793617471711, + "learning_rate": 5.951984456818793e-07, + "loss": 0.7242, + "step": 10851 + }, + { + "epoch": 0.892984982513886, + "grad_norm": 5.931001268635786, + "learning_rate": 5.942929802235775e-07, + "loss": 0.7251, + "step": 10852 + }, + { + "epoch": 0.8930672701090311, + "grad_norm": 2.313530715728318, + "learning_rate": 5.933881829316612e-07, + "loss": 0.7003, + "step": 10853 + }, + { + "epoch": 0.8931495577041761, + "grad_norm": 2.0279710277367027, + "learning_rate": 5.924840538704046e-07, + "loss": 0.7171, + "step": 10854 + }, + { + "epoch": 0.8932318452993211, + "grad_norm": 1.9531758976537767, + "learning_rate": 5.915805931040364e-07, + "loss": 0.6838, + "step": 10855 + }, + { + "epoch": 0.8933141328944662, + "grad_norm": 2.2063521043856036, + "learning_rate": 5.906778006967351e-07, + "loss": 0.7033, + "step": 10856 + }, + { + "epoch": 0.8933964204896112, + "grad_norm": 1.8219208870258297, + "learning_rate": 5.897756767126362e-07, + "loss": 0.6638, + "step": 10857 + }, + { + "epoch": 0.8934787080847563, + "grad_norm": 2.3019675238963857, + "learning_rate": 5.888742212158216e-07, + "loss": 0.7106, + "step": 10858 + }, + { + "epoch": 0.8935609956799012, + "grad_norm": 0.4292290230956334, + "learning_rate": 5.879734342703347e-07, + "loss": 0.4816, + "step": 10859 + }, + { + "epoch": 0.8936432832750463, + "grad_norm": 1.7365699645560297, + "learning_rate": 5.870733159401598e-07, + "loss": 0.6875, + "step": 10860 + }, + { + "epoch": 0.8937255708701913, + "grad_norm": 2.267138982644959, + "learning_rate": 5.861738662892447e-07, + "loss": 0.7376, + "step": 10861 + }, + { + "epoch": 0.8938078584653364, + "grad_norm": 2.3649526283020155, + "learning_rate": 5.852750853814803e-07, + "loss": 0.6901, + "step": 10862 + }, + { + "epoch": 0.8938901460604813, + "grad_norm": 2.5428214864233816, + "learning_rate": 5.843769732807202e-07, + "loss": 0.7162, + "step": 10863 + }, + { + "epoch": 0.8939724336556264, + "grad_norm": 2.6265148282415174, + "learning_rate": 5.83479530050759e-07, + "loss": 0.7148, + "step": 10864 + }, + { + "epoch": 0.8940547212507715, + "grad_norm": 1.6157496843116759, + "learning_rate": 5.825827557553565e-07, + "loss": 0.7129, + "step": 10865 + }, + { + "epoch": 0.8941370088459165, + "grad_norm": 0.41251109122646207, + "learning_rate": 5.816866504582108e-07, + "loss": 0.4595, + "step": 10866 + }, + { + "epoch": 0.8942192964410615, + "grad_norm": 3.227007507189739, + "learning_rate": 5.807912142229843e-07, + "loss": 0.7166, + "step": 10867 + }, + { + "epoch": 0.8943015840362065, + "grad_norm": 2.7392529022898504, + "learning_rate": 5.798964471132851e-07, + "loss": 0.7261, + "step": 10868 + }, + { + "epoch": 0.8943838716313516, + "grad_norm": 2.170373979884502, + "learning_rate": 5.790023491926777e-07, + "loss": 0.6967, + "step": 10869 + }, + { + "epoch": 0.8944661592264966, + "grad_norm": 2.0531540554092045, + "learning_rate": 5.781089205246759e-07, + "loss": 0.7115, + "step": 10870 + }, + { + "epoch": 0.8945484468216416, + "grad_norm": 2.3212646689486967, + "learning_rate": 5.772161611727511e-07, + "loss": 0.6816, + "step": 10871 + }, + { + "epoch": 0.8946307344167866, + "grad_norm": 2.189893733125765, + "learning_rate": 5.763240712003182e-07, + "loss": 0.7339, + "step": 10872 + }, + { + "epoch": 0.8947130220119317, + "grad_norm": 2.1939695319212724, + "learning_rate": 5.754326506707531e-07, + "loss": 0.7191, + "step": 10873 + }, + { + "epoch": 0.8947953096070768, + "grad_norm": 2.006246969748498, + "learning_rate": 5.745418996473795e-07, + "loss": 0.6933, + "step": 10874 + }, + { + "epoch": 0.8948775972022218, + "grad_norm": 2.954786477104015, + "learning_rate": 5.73651818193477e-07, + "loss": 0.7234, + "step": 10875 + }, + { + "epoch": 0.8949598847973668, + "grad_norm": 4.615726765345215, + "learning_rate": 5.727624063722737e-07, + "loss": 0.6781, + "step": 10876 + }, + { + "epoch": 0.8950421723925118, + "grad_norm": 2.0363697931264126, + "learning_rate": 5.718736642469513e-07, + "loss": 0.6818, + "step": 10877 + }, + { + "epoch": 0.8951244599876569, + "grad_norm": 1.9314033822277161, + "learning_rate": 5.709855918806461e-07, + "loss": 0.7006, + "step": 10878 + }, + { + "epoch": 0.8952067475828019, + "grad_norm": 2.0881251686577573, + "learning_rate": 5.700981893364455e-07, + "loss": 0.6992, + "step": 10879 + }, + { + "epoch": 0.8952890351779469, + "grad_norm": 0.40357924802614115, + "learning_rate": 5.692114566773876e-07, + "loss": 0.4602, + "step": 10880 + }, + { + "epoch": 0.895371322773092, + "grad_norm": 1.8665438926626967, + "learning_rate": 5.683253939664668e-07, + "loss": 0.7258, + "step": 10881 + }, + { + "epoch": 0.895453610368237, + "grad_norm": 3.2428359267135476, + "learning_rate": 5.674400012666259e-07, + "loss": 0.6878, + "step": 10882 + }, + { + "epoch": 0.8955358979633821, + "grad_norm": 0.40743972319909194, + "learning_rate": 5.665552786407624e-07, + "loss": 0.4517, + "step": 10883 + }, + { + "epoch": 0.895618185558527, + "grad_norm": 1.8597108763835917, + "learning_rate": 5.656712261517239e-07, + "loss": 0.7358, + "step": 10884 + }, + { + "epoch": 0.8957004731536721, + "grad_norm": 2.3918338713318756, + "learning_rate": 5.647878438623156e-07, + "loss": 0.7061, + "step": 10885 + }, + { + "epoch": 0.8957827607488171, + "grad_norm": 0.44355678723619973, + "learning_rate": 5.639051318352872e-07, + "loss": 0.473, + "step": 10886 + }, + { + "epoch": 0.8958650483439622, + "grad_norm": 2.162263410742953, + "learning_rate": 5.630230901333489e-07, + "loss": 0.7205, + "step": 10887 + }, + { + "epoch": 0.8959473359391071, + "grad_norm": 1.6670026858994391, + "learning_rate": 5.621417188191581e-07, + "loss": 0.6982, + "step": 10888 + }, + { + "epoch": 0.8960296235342522, + "grad_norm": 2.352433217138437, + "learning_rate": 5.612610179553258e-07, + "loss": 0.7014, + "step": 10889 + }, + { + "epoch": 0.8961119111293973, + "grad_norm": 1.8273651920787726, + "learning_rate": 5.603809876044153e-07, + "loss": 0.7192, + "step": 10890 + }, + { + "epoch": 0.8961941987245423, + "grad_norm": 1.7078988209748611, + "learning_rate": 5.595016278289434e-07, + "loss": 0.6927, + "step": 10891 + }, + { + "epoch": 0.8962764863196873, + "grad_norm": 2.1893249895639246, + "learning_rate": 5.586229386913766e-07, + "loss": 0.7016, + "step": 10892 + }, + { + "epoch": 0.8963587739148323, + "grad_norm": 2.3524008638763916, + "learning_rate": 5.577449202541396e-07, + "loss": 0.7378, + "step": 10893 + }, + { + "epoch": 0.8964410615099774, + "grad_norm": 2.1398180474876556, + "learning_rate": 5.568675725796014e-07, + "loss": 0.7079, + "step": 10894 + }, + { + "epoch": 0.8965233491051224, + "grad_norm": 1.7970238505913083, + "learning_rate": 5.559908957300897e-07, + "loss": 0.6963, + "step": 10895 + }, + { + "epoch": 0.8966056367002674, + "grad_norm": 3.092132858343044, + "learning_rate": 5.551148897678793e-07, + "loss": 0.6966, + "step": 10896 + }, + { + "epoch": 0.8966879242954124, + "grad_norm": 2.590878692922372, + "learning_rate": 5.542395547552037e-07, + "loss": 0.7213, + "step": 10897 + }, + { + "epoch": 0.8967702118905575, + "grad_norm": 0.43058334332283665, + "learning_rate": 5.533648907542432e-07, + "loss": 0.4718, + "step": 10898 + }, + { + "epoch": 0.8968524994857026, + "grad_norm": 6.847176447275601, + "learning_rate": 5.524908978271337e-07, + "loss": 0.7001, + "step": 10899 + }, + { + "epoch": 0.8969347870808476, + "grad_norm": 1.7647142634736606, + "learning_rate": 5.516175760359621e-07, + "loss": 0.7147, + "step": 10900 + }, + { + "epoch": 0.8970170746759926, + "grad_norm": 2.163165269150606, + "learning_rate": 5.507449254427677e-07, + "loss": 0.6836, + "step": 10901 + }, + { + "epoch": 0.8970993622711376, + "grad_norm": 2.4242265573455857, + "learning_rate": 5.498729461095409e-07, + "loss": 0.7255, + "step": 10902 + }, + { + "epoch": 0.8971816498662827, + "grad_norm": 1.8420211919077654, + "learning_rate": 5.490016380982277e-07, + "loss": 0.6929, + "step": 10903 + }, + { + "epoch": 0.8972639374614277, + "grad_norm": 2.072968677341676, + "learning_rate": 5.481310014707219e-07, + "loss": 0.7019, + "step": 10904 + }, + { + "epoch": 0.8973462250565727, + "grad_norm": 2.418019023042868, + "learning_rate": 5.472610362888752e-07, + "loss": 0.724, + "step": 10905 + }, + { + "epoch": 0.8974285126517177, + "grad_norm": 0.4192004483036187, + "learning_rate": 5.463917426144871e-07, + "loss": 0.4566, + "step": 10906 + }, + { + "epoch": 0.8975108002468628, + "grad_norm": 6.791058624081407, + "learning_rate": 5.455231205093103e-07, + "loss": 0.7116, + "step": 10907 + }, + { + "epoch": 0.8975930878420079, + "grad_norm": 1.8200862710851522, + "learning_rate": 5.4465517003505e-07, + "loss": 0.7141, + "step": 10908 + }, + { + "epoch": 0.8976753754371528, + "grad_norm": 4.372451335303304, + "learning_rate": 5.437878912533667e-07, + "loss": 0.741, + "step": 10909 + }, + { + "epoch": 0.8977576630322979, + "grad_norm": 1.9624321771930926, + "learning_rate": 5.429212842258669e-07, + "loss": 0.7112, + "step": 10910 + }, + { + "epoch": 0.8978399506274429, + "grad_norm": 2.2338388877869098, + "learning_rate": 5.420553490141156e-07, + "loss": 0.7196, + "step": 10911 + }, + { + "epoch": 0.897922238222588, + "grad_norm": 2.7817484813057898, + "learning_rate": 5.411900856796271e-07, + "loss": 0.7056, + "step": 10912 + }, + { + "epoch": 0.8980045258177329, + "grad_norm": 5.301801718084529, + "learning_rate": 5.403254942838665e-07, + "loss": 0.7272, + "step": 10913 + }, + { + "epoch": 0.898086813412878, + "grad_norm": 2.7289517528270424, + "learning_rate": 5.394615748882548e-07, + "loss": 0.7076, + "step": 10914 + }, + { + "epoch": 0.898169101008023, + "grad_norm": 1.9637838953619224, + "learning_rate": 5.385983275541628e-07, + "loss": 0.6969, + "step": 10915 + }, + { + "epoch": 0.8982513886031681, + "grad_norm": 0.44232305966901675, + "learning_rate": 5.377357523429138e-07, + "loss": 0.4547, + "step": 10916 + }, + { + "epoch": 0.8983336761983131, + "grad_norm": 2.7800917648235037, + "learning_rate": 5.368738493157854e-07, + "loss": 0.689, + "step": 10917 + }, + { + "epoch": 0.8984159637934581, + "grad_norm": 1.7735949719030204, + "learning_rate": 5.360126185340043e-07, + "loss": 0.6861, + "step": 10918 + }, + { + "epoch": 0.8984982513886032, + "grad_norm": 0.4353296278997954, + "learning_rate": 5.351520600587524e-07, + "loss": 0.4684, + "step": 10919 + }, + { + "epoch": 0.8985805389837482, + "grad_norm": 2.2273046374348544, + "learning_rate": 5.342921739511598e-07, + "loss": 0.6881, + "step": 10920 + }, + { + "epoch": 0.8986628265788932, + "grad_norm": 0.41317241259869836, + "learning_rate": 5.334329602723131e-07, + "loss": 0.4696, + "step": 10921 + }, + { + "epoch": 0.8987451141740382, + "grad_norm": 1.8099349177060053, + "learning_rate": 5.325744190832493e-07, + "loss": 0.7222, + "step": 10922 + }, + { + "epoch": 0.8988274017691833, + "grad_norm": 1.741607640196034, + "learning_rate": 5.317165504449584e-07, + "loss": 0.7198, + "step": 10923 + }, + { + "epoch": 0.8989096893643284, + "grad_norm": 2.1184618944004407, + "learning_rate": 5.308593544183826e-07, + "loss": 0.7194, + "step": 10924 + }, + { + "epoch": 0.8989919769594733, + "grad_norm": 0.40738328657110523, + "learning_rate": 5.300028310644134e-07, + "loss": 0.4642, + "step": 10925 + }, + { + "epoch": 0.8990742645546184, + "grad_norm": 1.746225265988427, + "learning_rate": 5.291469804438976e-07, + "loss": 0.7007, + "step": 10926 + }, + { + "epoch": 0.8991565521497634, + "grad_norm": 1.855014806263825, + "learning_rate": 5.282918026176342e-07, + "loss": 0.6911, + "step": 10927 + }, + { + "epoch": 0.8992388397449085, + "grad_norm": 0.4402273871592844, + "learning_rate": 5.274372976463726e-07, + "loss": 0.4578, + "step": 10928 + }, + { + "epoch": 0.8993211273400535, + "grad_norm": 2.3048465179374564, + "learning_rate": 5.265834655908198e-07, + "loss": 0.7164, + "step": 10929 + }, + { + "epoch": 0.8994034149351985, + "grad_norm": 7.244424240218328, + "learning_rate": 5.257303065116237e-07, + "loss": 0.7154, + "step": 10930 + }, + { + "epoch": 0.8994857025303435, + "grad_norm": 1.9840863228122785, + "learning_rate": 5.248778204693961e-07, + "loss": 0.6954, + "step": 10931 + }, + { + "epoch": 0.8995679901254886, + "grad_norm": 2.0961464373036005, + "learning_rate": 5.240260075246927e-07, + "loss": 0.721, + "step": 10932 + }, + { + "epoch": 0.8996502777206337, + "grad_norm": 1.8191277825106067, + "learning_rate": 5.2317486773803e-07, + "loss": 0.7197, + "step": 10933 + }, + { + "epoch": 0.8997325653157786, + "grad_norm": 2.454445099066783, + "learning_rate": 5.223244011698659e-07, + "loss": 0.7023, + "step": 10934 + }, + { + "epoch": 0.8998148529109237, + "grad_norm": 1.6749640876239764, + "learning_rate": 5.214746078806232e-07, + "loss": 0.7006, + "step": 10935 + }, + { + "epoch": 0.8998971405060687, + "grad_norm": 0.41109622194549855, + "learning_rate": 5.206254879306627e-07, + "loss": 0.4793, + "step": 10936 + }, + { + "epoch": 0.8999794281012138, + "grad_norm": 1.9873874825960844, + "learning_rate": 5.197770413803083e-07, + "loss": 0.6836, + "step": 10937 + }, + { + "epoch": 0.9000617156963587, + "grad_norm": 2.144667825917773, + "learning_rate": 5.189292682898306e-07, + "loss": 0.6894, + "step": 10938 + }, + { + "epoch": 0.9001440032915038, + "grad_norm": 2.03014483542568, + "learning_rate": 5.18082168719456e-07, + "loss": 0.7012, + "step": 10939 + }, + { + "epoch": 0.9002262908866488, + "grad_norm": 1.9632918318050032, + "learning_rate": 5.172357427293584e-07, + "loss": 0.7317, + "step": 10940 + }, + { + "epoch": 0.9003085784817939, + "grad_norm": 2.172545427652292, + "learning_rate": 5.163899903796709e-07, + "loss": 0.7483, + "step": 10941 + }, + { + "epoch": 0.9003908660769389, + "grad_norm": 2.4464223998708867, + "learning_rate": 5.1554491173047e-07, + "loss": 0.7077, + "step": 10942 + }, + { + "epoch": 0.9004731536720839, + "grad_norm": 1.6575481264597194, + "learning_rate": 5.147005068417909e-07, + "loss": 0.7001, + "step": 10943 + }, + { + "epoch": 0.900555441267229, + "grad_norm": 1.7246783342564453, + "learning_rate": 5.138567757736168e-07, + "loss": 0.7183, + "step": 10944 + }, + { + "epoch": 0.900637728862374, + "grad_norm": 0.41132376968734885, + "learning_rate": 5.130137185858886e-07, + "loss": 0.4732, + "step": 10945 + }, + { + "epoch": 0.900720016457519, + "grad_norm": 1.6169447029151491, + "learning_rate": 5.121713353384927e-07, + "loss": 0.7192, + "step": 10946 + }, + { + "epoch": 0.900802304052664, + "grad_norm": 1.8995407873672752, + "learning_rate": 5.113296260912725e-07, + "loss": 0.7295, + "step": 10947 + }, + { + "epoch": 0.9008845916478091, + "grad_norm": 1.8558420274185126, + "learning_rate": 5.104885909040191e-07, + "loss": 0.6783, + "step": 10948 + }, + { + "epoch": 0.9009668792429542, + "grad_norm": 1.7294520079638938, + "learning_rate": 5.096482298364813e-07, + "loss": 0.6931, + "step": 10949 + }, + { + "epoch": 0.9010491668380991, + "grad_norm": 0.4191840248726286, + "learning_rate": 5.088085429483547e-07, + "loss": 0.472, + "step": 10950 + }, + { + "epoch": 0.9011314544332442, + "grad_norm": 2.025234091783655, + "learning_rate": 5.079695302992915e-07, + "loss": 0.7207, + "step": 10951 + }, + { + "epoch": 0.9012137420283892, + "grad_norm": 1.949666567906439, + "learning_rate": 5.071311919488919e-07, + "loss": 0.7176, + "step": 10952 + }, + { + "epoch": 0.9012960296235343, + "grad_norm": 1.78662665242097, + "learning_rate": 5.062935279567116e-07, + "loss": 0.7115, + "step": 10953 + }, + { + "epoch": 0.9013783172186793, + "grad_norm": 0.44208983688981784, + "learning_rate": 5.054565383822541e-07, + "loss": 0.4585, + "step": 10954 + }, + { + "epoch": 0.9014606048138243, + "grad_norm": 2.094688392380113, + "learning_rate": 5.046202232849817e-07, + "loss": 0.7131, + "step": 10955 + }, + { + "epoch": 0.9015428924089693, + "grad_norm": 2.3549282629755663, + "learning_rate": 5.037845827243015e-07, + "loss": 0.6932, + "step": 10956 + }, + { + "epoch": 0.9016251800041144, + "grad_norm": 2.560923592381276, + "learning_rate": 5.029496167595793e-07, + "loss": 0.6955, + "step": 10957 + }, + { + "epoch": 0.9017074675992595, + "grad_norm": 0.4090637960215095, + "learning_rate": 5.021153254501276e-07, + "loss": 0.4657, + "step": 10958 + }, + { + "epoch": 0.9017897551944044, + "grad_norm": 2.353126353504958, + "learning_rate": 5.012817088552136e-07, + "loss": 0.7038, + "step": 10959 + }, + { + "epoch": 0.9018720427895495, + "grad_norm": 2.4104405659970998, + "learning_rate": 5.004487670340552e-07, + "loss": 0.6867, + "step": 10960 + }, + { + "epoch": 0.9019543303846945, + "grad_norm": 1.6246418486864889, + "learning_rate": 4.996165000458253e-07, + "loss": 0.6994, + "step": 10961 + }, + { + "epoch": 0.9020366179798396, + "grad_norm": 2.5426667553595816, + "learning_rate": 4.987849079496431e-07, + "loss": 0.6846, + "step": 10962 + }, + { + "epoch": 0.9021189055749845, + "grad_norm": 2.1456119544118475, + "learning_rate": 4.979539908045882e-07, + "loss": 0.6847, + "step": 10963 + }, + { + "epoch": 0.9022011931701296, + "grad_norm": 2.122980157826601, + "learning_rate": 4.971237486696867e-07, + "loss": 0.7085, + "step": 10964 + }, + { + "epoch": 0.9022834807652746, + "grad_norm": 1.7529712938817326, + "learning_rate": 4.962941816039147e-07, + "loss": 0.6923, + "step": 10965 + }, + { + "epoch": 0.9023657683604197, + "grad_norm": 0.40413095931207726, + "learning_rate": 4.954652896662049e-07, + "loss": 0.4656, + "step": 10966 + }, + { + "epoch": 0.9024480559555647, + "grad_norm": 2.07424009279328, + "learning_rate": 4.946370729154415e-07, + "loss": 0.7148, + "step": 10967 + }, + { + "epoch": 0.9025303435507097, + "grad_norm": 3.9158325622575862, + "learning_rate": 4.938095314104574e-07, + "loss": 0.7022, + "step": 10968 + }, + { + "epoch": 0.9026126311458548, + "grad_norm": 0.4233914840035128, + "learning_rate": 4.929826652100433e-07, + "loss": 0.469, + "step": 10969 + }, + { + "epoch": 0.9026949187409998, + "grad_norm": 0.41441678406695426, + "learning_rate": 4.921564743729357e-07, + "loss": 0.4814, + "step": 10970 + }, + { + "epoch": 0.9027772063361448, + "grad_norm": 2.234004754446393, + "learning_rate": 4.913309589578274e-07, + "loss": 0.7115, + "step": 10971 + }, + { + "epoch": 0.9028594939312898, + "grad_norm": 0.407302879454868, + "learning_rate": 4.905061190233584e-07, + "loss": 0.4528, + "step": 10972 + }, + { + "epoch": 0.9029417815264349, + "grad_norm": 1.7361761684922359, + "learning_rate": 4.896819546281284e-07, + "loss": 0.7181, + "step": 10973 + }, + { + "epoch": 0.90302406912158, + "grad_norm": 1.8353204500433462, + "learning_rate": 4.888584658306816e-07, + "loss": 0.7326, + "step": 10974 + }, + { + "epoch": 0.9031063567167249, + "grad_norm": 3.1619681117615537, + "learning_rate": 4.880356526895203e-07, + "loss": 0.6805, + "step": 10975 + }, + { + "epoch": 0.90318864431187, + "grad_norm": 1.9204205985790916, + "learning_rate": 4.872135152630941e-07, + "loss": 0.7157, + "step": 10976 + }, + { + "epoch": 0.903270931907015, + "grad_norm": 0.41198707846915816, + "learning_rate": 4.863920536098055e-07, + "loss": 0.4794, + "step": 10977 + }, + { + "epoch": 0.9033532195021601, + "grad_norm": 1.8696358687668166, + "learning_rate": 4.855712677880098e-07, + "loss": 0.6898, + "step": 10978 + }, + { + "epoch": 0.9034355070973051, + "grad_norm": 2.136780047256098, + "learning_rate": 4.847511578560171e-07, + "loss": 0.7104, + "step": 10979 + }, + { + "epoch": 0.9035177946924501, + "grad_norm": 3.855172310089153, + "learning_rate": 4.83931723872082e-07, + "loss": 0.7123, + "step": 10980 + }, + { + "epoch": 0.9036000822875951, + "grad_norm": 2.384812307548946, + "learning_rate": 4.831129658944212e-07, + "loss": 0.7289, + "step": 10981 + }, + { + "epoch": 0.9036823698827402, + "grad_norm": 0.40898509148369677, + "learning_rate": 4.822948839811947e-07, + "loss": 0.4607, + "step": 10982 + }, + { + "epoch": 0.9037646574778853, + "grad_norm": 2.049988721959896, + "learning_rate": 4.814774781905185e-07, + "loss": 0.7037, + "step": 10983 + }, + { + "epoch": 0.9038469450730302, + "grad_norm": 0.4034032590989249, + "learning_rate": 4.806607485804582e-07, + "loss": 0.4743, + "step": 10984 + }, + { + "epoch": 0.9039292326681753, + "grad_norm": 1.8661198099386784, + "learning_rate": 4.798446952090352e-07, + "loss": 0.7323, + "step": 10985 + }, + { + "epoch": 0.9040115202633203, + "grad_norm": 2.2405993451177646, + "learning_rate": 4.790293181342188e-07, + "loss": 0.6871, + "step": 10986 + }, + { + "epoch": 0.9040938078584654, + "grad_norm": 1.7819353831601263, + "learning_rate": 4.782146174139346e-07, + "loss": 0.6986, + "step": 10987 + }, + { + "epoch": 0.9041760954536103, + "grad_norm": 0.4205149514006089, + "learning_rate": 4.774005931060566e-07, + "loss": 0.4558, + "step": 10988 + }, + { + "epoch": 0.9042583830487554, + "grad_norm": 2.1462585584566107, + "learning_rate": 4.765872452684106e-07, + "loss": 0.7033, + "step": 10989 + }, + { + "epoch": 0.9043406706439004, + "grad_norm": 6.66487546095507, + "learning_rate": 4.757745739587749e-07, + "loss": 0.7179, + "step": 10990 + }, + { + "epoch": 0.9044229582390455, + "grad_norm": 2.0846983519699234, + "learning_rate": 4.749625792348833e-07, + "loss": 0.7206, + "step": 10991 + }, + { + "epoch": 0.9045052458341905, + "grad_norm": 2.09605178236101, + "learning_rate": 4.7415126115441523e-07, + "loss": 0.6994, + "step": 10992 + }, + { + "epoch": 0.9045875334293355, + "grad_norm": 1.9406592793833968, + "learning_rate": 4.73340619775009e-07, + "loss": 0.6882, + "step": 10993 + }, + { + "epoch": 0.9046698210244806, + "grad_norm": 1.8122044265756425, + "learning_rate": 4.725306551542497e-07, + "loss": 0.7091, + "step": 10994 + }, + { + "epoch": 0.9047521086196256, + "grad_norm": 1.8960423313165213, + "learning_rate": 4.7172136734967455e-07, + "loss": 0.6787, + "step": 10995 + }, + { + "epoch": 0.9048343962147706, + "grad_norm": 1.9942009245806092, + "learning_rate": 4.7091275641877655e-07, + "loss": 0.7061, + "step": 10996 + }, + { + "epoch": 0.9049166838099156, + "grad_norm": 1.8746989714777338, + "learning_rate": 4.7010482241899414e-07, + "loss": 0.7302, + "step": 10997 + }, + { + "epoch": 0.9049989714050607, + "grad_norm": 1.6782443791040864, + "learning_rate": 4.69297565407727e-07, + "loss": 0.732, + "step": 10998 + }, + { + "epoch": 0.9050812590002058, + "grad_norm": 2.4238219921189654, + "learning_rate": 4.6849098544231807e-07, + "loss": 0.7094, + "step": 10999 + }, + { + "epoch": 0.9051635465953507, + "grad_norm": 1.6443836663719462, + "learning_rate": 4.676850825800672e-07, + "loss": 0.7074, + "step": 11000 + }, + { + "epoch": 0.9052458341904958, + "grad_norm": 1.9147142982649707, + "learning_rate": 4.6687985687822066e-07, + "loss": 0.7093, + "step": 11001 + }, + { + "epoch": 0.9053281217856408, + "grad_norm": 1.9277620115819656, + "learning_rate": 4.6607530839398507e-07, + "loss": 0.685, + "step": 11002 + }, + { + "epoch": 0.9054104093807859, + "grad_norm": 5.341918519049872, + "learning_rate": 4.652714371845113e-07, + "loss": 0.6803, + "step": 11003 + }, + { + "epoch": 0.9054926969759309, + "grad_norm": 2.092710244936841, + "learning_rate": 4.64468243306907e-07, + "loss": 0.7418, + "step": 11004 + }, + { + "epoch": 0.9055749845710759, + "grad_norm": 2.6866815328317846, + "learning_rate": 4.636657268182287e-07, + "loss": 0.7194, + "step": 11005 + }, + { + "epoch": 0.9056572721662209, + "grad_norm": 1.8501199104704196, + "learning_rate": 4.6286388777548653e-07, + "loss": 0.7059, + "step": 11006 + }, + { + "epoch": 0.905739559761366, + "grad_norm": 1.8490276700916306, + "learning_rate": 4.620627262356403e-07, + "loss": 0.6996, + "step": 11007 + }, + { + "epoch": 0.9058218473565111, + "grad_norm": 2.117716894553738, + "learning_rate": 4.6126224225560455e-07, + "loss": 0.7135, + "step": 11008 + }, + { + "epoch": 0.905904134951656, + "grad_norm": 1.9005720907358608, + "learning_rate": 4.6046243589224383e-07, + "loss": 0.714, + "step": 11009 + }, + { + "epoch": 0.9059864225468011, + "grad_norm": 2.019359920259305, + "learning_rate": 4.596633072023771e-07, + "loss": 0.6974, + "step": 11010 + }, + { + "epoch": 0.9060687101419461, + "grad_norm": 1.9655200406710245, + "learning_rate": 4.5886485624276997e-07, + "loss": 0.7098, + "step": 11011 + }, + { + "epoch": 0.9061509977370912, + "grad_norm": 2.010926477615652, + "learning_rate": 4.58067083070145e-07, + "loss": 0.7176, + "step": 11012 + }, + { + "epoch": 0.9062332853322361, + "grad_norm": 2.07563793683227, + "learning_rate": 4.572699877411735e-07, + "loss": 0.711, + "step": 11013 + }, + { + "epoch": 0.9063155729273812, + "grad_norm": 1.7648287979186847, + "learning_rate": 4.564735703124823e-07, + "loss": 0.7017, + "step": 11014 + }, + { + "epoch": 0.9063978605225262, + "grad_norm": 1.960983606369153, + "learning_rate": 4.556778308406451e-07, + "loss": 0.7003, + "step": 11015 + }, + { + "epoch": 0.9064801481176713, + "grad_norm": 1.9559791379712275, + "learning_rate": 4.548827693821922e-07, + "loss": 0.7198, + "step": 11016 + }, + { + "epoch": 0.9065624357128163, + "grad_norm": 1.85914134769184, + "learning_rate": 4.540883859936007e-07, + "loss": 0.7228, + "step": 11017 + }, + { + "epoch": 0.9066447233079613, + "grad_norm": 4.41994484236899, + "learning_rate": 4.5329468073130436e-07, + "loss": 0.7067, + "step": 11018 + }, + { + "epoch": 0.9067270109031064, + "grad_norm": 3.096304263652965, + "learning_rate": 4.5250165365168573e-07, + "loss": 0.682, + "step": 11019 + }, + { + "epoch": 0.9068092984982514, + "grad_norm": 1.9186269060054493, + "learning_rate": 4.51709304811081e-07, + "loss": 0.7106, + "step": 11020 + }, + { + "epoch": 0.9068915860933964, + "grad_norm": 6.732900114109423, + "learning_rate": 4.509176342657773e-07, + "loss": 0.698, + "step": 11021 + }, + { + "epoch": 0.9069738736885414, + "grad_norm": 1.9544852828337322, + "learning_rate": 4.5012664207201407e-07, + "loss": 0.7057, + "step": 11022 + }, + { + "epoch": 0.9070561612836865, + "grad_norm": 2.0856419600518383, + "learning_rate": 4.493363282859797e-07, + "loss": 0.6777, + "step": 11023 + }, + { + "epoch": 0.9071384488788315, + "grad_norm": 0.39119312129430545, + "learning_rate": 4.4854669296382047e-07, + "loss": 0.4693, + "step": 11024 + }, + { + "epoch": 0.9072207364739765, + "grad_norm": 2.0235953881173594, + "learning_rate": 4.47757736161627e-07, + "loss": 0.7077, + "step": 11025 + }, + { + "epoch": 0.9073030240691216, + "grad_norm": 2.2448379020475895, + "learning_rate": 4.4696945793544776e-07, + "loss": 0.7001, + "step": 11026 + }, + { + "epoch": 0.9073853116642666, + "grad_norm": 2.050920533452338, + "learning_rate": 4.461818583412814e-07, + "loss": 0.7254, + "step": 11027 + }, + { + "epoch": 0.9074675992594117, + "grad_norm": 1.9576632712186244, + "learning_rate": 4.453949374350774e-07, + "loss": 0.741, + "step": 11028 + }, + { + "epoch": 0.9075498868545567, + "grad_norm": 2.2798680444953368, + "learning_rate": 4.4460869527273464e-07, + "loss": 0.6761, + "step": 11029 + }, + { + "epoch": 0.9076321744497017, + "grad_norm": 2.551392974305997, + "learning_rate": 4.438231319101094e-07, + "loss": 0.7038, + "step": 11030 + }, + { + "epoch": 0.9077144620448467, + "grad_norm": 1.8711206240933778, + "learning_rate": 4.4303824740300485e-07, + "loss": 0.7319, + "step": 11031 + }, + { + "epoch": 0.9077967496399918, + "grad_norm": 1.648958803259434, + "learning_rate": 4.422540418071808e-07, + "loss": 0.6735, + "step": 11032 + }, + { + "epoch": 0.9078790372351369, + "grad_norm": 1.6141550228097739, + "learning_rate": 4.414705151783449e-07, + "loss": 0.694, + "step": 11033 + }, + { + "epoch": 0.9079613248302818, + "grad_norm": 2.04524781258983, + "learning_rate": 4.406876675721561e-07, + "loss": 0.6942, + "step": 11034 + }, + { + "epoch": 0.9080436124254269, + "grad_norm": 2.0546286377930194, + "learning_rate": 4.399054990442264e-07, + "loss": 0.7016, + "step": 11035 + }, + { + "epoch": 0.9081259000205719, + "grad_norm": 1.8496111578247136, + "learning_rate": 4.391240096501226e-07, + "loss": 0.7095, + "step": 11036 + }, + { + "epoch": 0.908208187615717, + "grad_norm": 1.7066185051568723, + "learning_rate": 4.3834319944535685e-07, + "loss": 0.709, + "step": 11037 + }, + { + "epoch": 0.9082904752108619, + "grad_norm": 1.7535757244012138, + "learning_rate": 4.375630684854004e-07, + "loss": 0.7094, + "step": 11038 + }, + { + "epoch": 0.908372762806007, + "grad_norm": 0.40349373194279, + "learning_rate": 4.3678361682567005e-07, + "loss": 0.4584, + "step": 11039 + }, + { + "epoch": 0.908455050401152, + "grad_norm": 1.7235365997508305, + "learning_rate": 4.360048445215381e-07, + "loss": 0.6963, + "step": 11040 + }, + { + "epoch": 0.9085373379962971, + "grad_norm": 2.0260556615675416, + "learning_rate": 4.3522675162832484e-07, + "loss": 0.7059, + "step": 11041 + }, + { + "epoch": 0.908619625591442, + "grad_norm": 0.42059428383480435, + "learning_rate": 4.344493382013082e-07, + "loss": 0.4576, + "step": 11042 + }, + { + "epoch": 0.9087019131865871, + "grad_norm": 1.835708502901761, + "learning_rate": 4.3367260429571176e-07, + "loss": 0.6849, + "step": 11043 + }, + { + "epoch": 0.9087842007817322, + "grad_norm": 0.4320590911622642, + "learning_rate": 4.3289654996671483e-07, + "loss": 0.4831, + "step": 11044 + }, + { + "epoch": 0.9088664883768772, + "grad_norm": 2.063958817218877, + "learning_rate": 4.3212117526944765e-07, + "loss": 0.7024, + "step": 11045 + }, + { + "epoch": 0.9089487759720222, + "grad_norm": 1.6697420731802135, + "learning_rate": 4.3134648025899064e-07, + "loss": 0.7001, + "step": 11046 + }, + { + "epoch": 0.9090310635671672, + "grad_norm": 1.8512625051075655, + "learning_rate": 4.305724649903742e-07, + "loss": 0.6983, + "step": 11047 + }, + { + "epoch": 0.9091133511623123, + "grad_norm": 0.43399664612871214, + "learning_rate": 4.2979912951858884e-07, + "loss": 0.4815, + "step": 11048 + }, + { + "epoch": 0.9091956387574573, + "grad_norm": 1.5250551940121007, + "learning_rate": 4.29026473898565e-07, + "loss": 0.6954, + "step": 11049 + }, + { + "epoch": 0.9092779263526023, + "grad_norm": 1.797760660041021, + "learning_rate": 4.282544981851966e-07, + "loss": 0.684, + "step": 11050 + }, + { + "epoch": 0.9093602139477474, + "grad_norm": 1.8326907853843704, + "learning_rate": 4.2748320243332086e-07, + "loss": 0.6974, + "step": 11051 + }, + { + "epoch": 0.9094425015428924, + "grad_norm": 1.7824666686402322, + "learning_rate": 4.267125866977284e-07, + "loss": 0.6739, + "step": 11052 + }, + { + "epoch": 0.9095247891380375, + "grad_norm": 1.909838761611788, + "learning_rate": 4.2594265103316324e-07, + "loss": 0.6889, + "step": 11053 + }, + { + "epoch": 0.9096070767331824, + "grad_norm": 1.9239789897119026, + "learning_rate": 4.2517339549432157e-07, + "loss": 0.7334, + "step": 11054 + }, + { + "epoch": 0.9096893643283275, + "grad_norm": 1.6189217501479802, + "learning_rate": 4.244048201358486e-07, + "loss": 0.7029, + "step": 11055 + }, + { + "epoch": 0.9097716519234725, + "grad_norm": 4.8805893749393325, + "learning_rate": 4.23636925012344e-07, + "loss": 0.695, + "step": 11056 + }, + { + "epoch": 0.9098539395186176, + "grad_norm": 2.106450500706384, + "learning_rate": 4.228697101783563e-07, + "loss": 0.6801, + "step": 11057 + }, + { + "epoch": 0.9099362271137627, + "grad_norm": 2.1533597616255125, + "learning_rate": 4.2210317568838974e-07, + "loss": 0.7187, + "step": 11058 + }, + { + "epoch": 0.9100185147089076, + "grad_norm": 1.845814260541582, + "learning_rate": 4.21337321596893e-07, + "loss": 0.6992, + "step": 11059 + }, + { + "epoch": 0.9101008023040527, + "grad_norm": 2.020478593135618, + "learning_rate": 4.2057214795827693e-07, + "loss": 0.6748, + "step": 11060 + }, + { + "epoch": 0.9101830898991977, + "grad_norm": 1.9629882168259263, + "learning_rate": 4.198076548268937e-07, + "loss": 0.7023, + "step": 11061 + }, + { + "epoch": 0.9102653774943428, + "grad_norm": 1.9424114109432755, + "learning_rate": 4.190438422570542e-07, + "loss": 0.714, + "step": 11062 + }, + { + "epoch": 0.9103476650894877, + "grad_norm": 1.9136431891609194, + "learning_rate": 4.182807103030184e-07, + "loss": 0.7112, + "step": 11063 + }, + { + "epoch": 0.9104299526846328, + "grad_norm": 2.01138469694574, + "learning_rate": 4.1751825901899634e-07, + "loss": 0.6904, + "step": 11064 + }, + { + "epoch": 0.9105122402797778, + "grad_norm": 1.6751422899538997, + "learning_rate": 4.167564884591513e-07, + "loss": 0.7018, + "step": 11065 + }, + { + "epoch": 0.9105945278749229, + "grad_norm": 1.8797959340640578, + "learning_rate": 4.159953986776011e-07, + "loss": 0.7055, + "step": 11066 + }, + { + "epoch": 0.9106768154700678, + "grad_norm": 2.273422409377992, + "learning_rate": 4.1523498972840803e-07, + "loss": 0.728, + "step": 11067 + }, + { + "epoch": 0.9107591030652129, + "grad_norm": 1.810235921403143, + "learning_rate": 4.1447526166559447e-07, + "loss": 0.6869, + "step": 11068 + }, + { + "epoch": 0.910841390660358, + "grad_norm": 1.800178350950108, + "learning_rate": 4.137162145431295e-07, + "loss": 0.6917, + "step": 11069 + }, + { + "epoch": 0.910923678255503, + "grad_norm": 2.0381919156286683, + "learning_rate": 4.1295784841493215e-07, + "loss": 0.7044, + "step": 11070 + }, + { + "epoch": 0.911005965850648, + "grad_norm": 2.119481067197588, + "learning_rate": 4.1220016333487713e-07, + "loss": 0.731, + "step": 11071 + }, + { + "epoch": 0.911088253445793, + "grad_norm": 2.419974612384401, + "learning_rate": 4.1144315935678913e-07, + "loss": 0.695, + "step": 11072 + }, + { + "epoch": 0.9111705410409381, + "grad_norm": 1.6646523307020307, + "learning_rate": 4.1068683653444406e-07, + "loss": 0.7047, + "step": 11073 + }, + { + "epoch": 0.9112528286360831, + "grad_norm": 1.8568686817332887, + "learning_rate": 4.0993119492157117e-07, + "loss": 0.7022, + "step": 11074 + }, + { + "epoch": 0.9113351162312281, + "grad_norm": 1.7954980136668017, + "learning_rate": 4.0917623457184863e-07, + "loss": 0.7082, + "step": 11075 + }, + { + "epoch": 0.9114174038263732, + "grad_norm": 0.42050010737035154, + "learning_rate": 4.0842195553890904e-07, + "loss": 0.4857, + "step": 11076 + }, + { + "epoch": 0.9114996914215182, + "grad_norm": 1.6466941495090661, + "learning_rate": 4.0766835787633295e-07, + "loss": 0.7111, + "step": 11077 + }, + { + "epoch": 0.9115819790166633, + "grad_norm": 1.8197346713051439, + "learning_rate": 4.069154416376564e-07, + "loss": 0.7032, + "step": 11078 + }, + { + "epoch": 0.9116642666118082, + "grad_norm": 2.197850835038065, + "learning_rate": 4.0616320687636434e-07, + "loss": 0.7078, + "step": 11079 + }, + { + "epoch": 0.9117465542069533, + "grad_norm": 1.8656378678249799, + "learning_rate": 4.054116536458985e-07, + "loss": 0.6941, + "step": 11080 + }, + { + "epoch": 0.9118288418020983, + "grad_norm": 1.8579691394957856, + "learning_rate": 4.0466078199964065e-07, + "loss": 0.6939, + "step": 11081 + }, + { + "epoch": 0.9119111293972434, + "grad_norm": 2.093915122663871, + "learning_rate": 4.0391059199093696e-07, + "loss": 0.6997, + "step": 11082 + }, + { + "epoch": 0.9119934169923885, + "grad_norm": 1.760694674327503, + "learning_rate": 4.03161083673077e-07, + "loss": 0.7162, + "step": 11083 + }, + { + "epoch": 0.9120757045875334, + "grad_norm": 0.4225517120559836, + "learning_rate": 4.024122570993072e-07, + "loss": 0.4587, + "step": 11084 + }, + { + "epoch": 0.9121579921826785, + "grad_norm": 2.708988931427557, + "learning_rate": 4.0166411232281934e-07, + "loss": 0.7181, + "step": 11085 + }, + { + "epoch": 0.9122402797778235, + "grad_norm": 0.3947932625953984, + "learning_rate": 4.009166493967664e-07, + "loss": 0.4543, + "step": 11086 + }, + { + "epoch": 0.9123225673729686, + "grad_norm": 0.41802536281021874, + "learning_rate": 4.0016986837424056e-07, + "loss": 0.4833, + "step": 11087 + }, + { + "epoch": 0.9124048549681135, + "grad_norm": 1.7671848091782563, + "learning_rate": 3.994237693082947e-07, + "loss": 0.7049, + "step": 11088 + }, + { + "epoch": 0.9124871425632586, + "grad_norm": 2.030658023953159, + "learning_rate": 3.9867835225192996e-07, + "loss": 0.694, + "step": 11089 + }, + { + "epoch": 0.9125694301584036, + "grad_norm": 0.4219633064095707, + "learning_rate": 3.9793361725810164e-07, + "loss": 0.4791, + "step": 11090 + }, + { + "epoch": 0.9126517177535487, + "grad_norm": 1.8698840619408101, + "learning_rate": 3.97189564379713e-07, + "loss": 0.6982, + "step": 11091 + }, + { + "epoch": 0.9127340053486936, + "grad_norm": 2.453835538136072, + "learning_rate": 3.9644619366961955e-07, + "loss": 0.7034, + "step": 11092 + }, + { + "epoch": 0.9128162929438387, + "grad_norm": 1.9068566149242838, + "learning_rate": 3.9570350518062907e-07, + "loss": 0.6797, + "step": 11093 + }, + { + "epoch": 0.9128985805389838, + "grad_norm": 0.40076021404693446, + "learning_rate": 3.9496149896550265e-07, + "loss": 0.4609, + "step": 11094 + }, + { + "epoch": 0.9129808681341288, + "grad_norm": 2.1967164341072247, + "learning_rate": 3.9422017507694923e-07, + "loss": 0.7161, + "step": 11095 + }, + { + "epoch": 0.9130631557292738, + "grad_norm": 2.1515579721388973, + "learning_rate": 3.9347953356763447e-07, + "loss": 0.6839, + "step": 11096 + }, + { + "epoch": 0.9131454433244188, + "grad_norm": 1.8840812141905696, + "learning_rate": 3.9273957449016965e-07, + "loss": 0.696, + "step": 11097 + }, + { + "epoch": 0.9132277309195639, + "grad_norm": 1.8719659689988786, + "learning_rate": 3.920002978971205e-07, + "loss": 0.6812, + "step": 11098 + }, + { + "epoch": 0.9133100185147089, + "grad_norm": 0.43646982857766475, + "learning_rate": 3.9126170384100383e-07, + "loss": 0.4651, + "step": 11099 + }, + { + "epoch": 0.9133923061098539, + "grad_norm": 1.617558080895866, + "learning_rate": 3.9052379237428996e-07, + "loss": 0.6953, + "step": 11100 + }, + { + "epoch": 0.913474593704999, + "grad_norm": 1.7056647854101412, + "learning_rate": 3.897865635493958e-07, + "loss": 0.6951, + "step": 11101 + }, + { + "epoch": 0.913556881300144, + "grad_norm": 1.7178009171586561, + "learning_rate": 3.890500174186973e-07, + "loss": 0.6947, + "step": 11102 + }, + { + "epoch": 0.9136391688952891, + "grad_norm": 0.43209897340488473, + "learning_rate": 3.883141540345148e-07, + "loss": 0.4972, + "step": 11103 + }, + { + "epoch": 0.913721456490434, + "grad_norm": 1.8277333572798018, + "learning_rate": 3.87578973449122e-07, + "loss": 0.6975, + "step": 11104 + }, + { + "epoch": 0.9138037440855791, + "grad_norm": 1.7327776488058233, + "learning_rate": 3.8684447571474606e-07, + "loss": 0.7209, + "step": 11105 + }, + { + "epoch": 0.9138860316807241, + "grad_norm": 2.112631319223756, + "learning_rate": 3.861106608835663e-07, + "loss": 0.6939, + "step": 11106 + }, + { + "epoch": 0.9139683192758692, + "grad_norm": 0.4156451878530187, + "learning_rate": 3.8537752900770775e-07, + "loss": 0.4569, + "step": 11107 + }, + { + "epoch": 0.9140506068710142, + "grad_norm": 2.1175320316101724, + "learning_rate": 3.8464508013925427e-07, + "loss": 0.6987, + "step": 11108 + }, + { + "epoch": 0.9141328944661592, + "grad_norm": 1.7265713167233072, + "learning_rate": 3.839133143302376e-07, + "loss": 0.6968, + "step": 11109 + }, + { + "epoch": 0.9142151820613043, + "grad_norm": 1.9191830532562333, + "learning_rate": 3.8318223163264056e-07, + "loss": 0.7424, + "step": 11110 + }, + { + "epoch": 0.9142974696564493, + "grad_norm": 0.4110752147612241, + "learning_rate": 3.824518320983961e-07, + "loss": 0.4425, + "step": 11111 + }, + { + "epoch": 0.9143797572515944, + "grad_norm": 1.762681398499508, + "learning_rate": 3.8172211577939377e-07, + "loss": 0.7066, + "step": 11112 + }, + { + "epoch": 0.9144620448467393, + "grad_norm": 0.4071216330684625, + "learning_rate": 3.8099308272746994e-07, + "loss": 0.465, + "step": 11113 + }, + { + "epoch": 0.9145443324418844, + "grad_norm": 0.425493722010197, + "learning_rate": 3.8026473299441425e-07, + "loss": 0.4824, + "step": 11114 + }, + { + "epoch": 0.9146266200370294, + "grad_norm": 2.294062868433947, + "learning_rate": 3.7953706663196754e-07, + "loss": 0.6833, + "step": 11115 + }, + { + "epoch": 0.9147089076321745, + "grad_norm": 1.7919217052804826, + "learning_rate": 3.788100836918229e-07, + "loss": 0.7438, + "step": 11116 + }, + { + "epoch": 0.9147911952273194, + "grad_norm": 2.9857103171434387, + "learning_rate": 3.780837842256202e-07, + "loss": 0.7266, + "step": 11117 + }, + { + "epoch": 0.9148734828224645, + "grad_norm": 2.2079823723831713, + "learning_rate": 3.773581682849603e-07, + "loss": 0.6731, + "step": 11118 + }, + { + "epoch": 0.9149557704176096, + "grad_norm": 2.1316272119237207, + "learning_rate": 3.7663323592138533e-07, + "loss": 0.6915, + "step": 11119 + }, + { + "epoch": 0.9150380580127546, + "grad_norm": 0.42834007012859804, + "learning_rate": 3.7590898718639634e-07, + "loss": 0.4984, + "step": 11120 + }, + { + "epoch": 0.9151203456078996, + "grad_norm": 2.1910475009394528, + "learning_rate": 3.7518542213144107e-07, + "loss": 0.7181, + "step": 11121 + }, + { + "epoch": 0.9152026332030446, + "grad_norm": 1.6649810934801026, + "learning_rate": 3.7446254080792055e-07, + "loss": 0.6901, + "step": 11122 + }, + { + "epoch": 0.9152849207981897, + "grad_norm": 1.6990430787040411, + "learning_rate": 3.73740343267186e-07, + "loss": 0.688, + "step": 11123 + }, + { + "epoch": 0.9153672083933347, + "grad_norm": 1.975108180408139, + "learning_rate": 3.7301882956054415e-07, + "loss": 0.7196, + "step": 11124 + }, + { + "epoch": 0.9154494959884797, + "grad_norm": 1.8575125298772637, + "learning_rate": 3.722979997392473e-07, + "loss": 0.7149, + "step": 11125 + }, + { + "epoch": 0.9155317835836247, + "grad_norm": 3.46024754055721, + "learning_rate": 3.715778538545056e-07, + "loss": 0.7068, + "step": 11126 + }, + { + "epoch": 0.9156140711787698, + "grad_norm": 1.9580975492476471, + "learning_rate": 3.708583919574738e-07, + "loss": 0.6736, + "step": 11127 + }, + { + "epoch": 0.9156963587739149, + "grad_norm": 0.40893462296092853, + "learning_rate": 3.701396140992619e-07, + "loss": 0.4745, + "step": 11128 + }, + { + "epoch": 0.9157786463690598, + "grad_norm": 3.2550189386813138, + "learning_rate": 3.6942152033093036e-07, + "loss": 0.7034, + "step": 11129 + }, + { + "epoch": 0.9158609339642049, + "grad_norm": 2.3165522989453073, + "learning_rate": 3.687041107034928e-07, + "loss": 0.7301, + "step": 11130 + }, + { + "epoch": 0.9159432215593499, + "grad_norm": 2.1917975260060465, + "learning_rate": 3.6798738526791164e-07, + "loss": 0.7291, + "step": 11131 + }, + { + "epoch": 0.916025509154495, + "grad_norm": 0.4520771168946314, + "learning_rate": 3.672713440751041e-07, + "loss": 0.4679, + "step": 11132 + }, + { + "epoch": 0.91610779674964, + "grad_norm": 2.934643452698258, + "learning_rate": 3.66555987175935e-07, + "loss": 0.6988, + "step": 11133 + }, + { + "epoch": 0.916190084344785, + "grad_norm": 1.8283233530551737, + "learning_rate": 3.6584131462122143e-07, + "loss": 0.706, + "step": 11134 + }, + { + "epoch": 0.91627237193993, + "grad_norm": 1.8020808864718025, + "learning_rate": 3.6512732646173166e-07, + "loss": 0.7318, + "step": 11135 + }, + { + "epoch": 0.9163546595350751, + "grad_norm": 2.0387164021999165, + "learning_rate": 3.644140227481907e-07, + "loss": 0.7044, + "step": 11136 + }, + { + "epoch": 0.9164369471302202, + "grad_norm": 1.7382654971845062, + "learning_rate": 3.637014035312658e-07, + "loss": 0.7092, + "step": 11137 + }, + { + "epoch": 0.9165192347253651, + "grad_norm": 3.1089372817020813, + "learning_rate": 3.6298946886158315e-07, + "loss": 0.709, + "step": 11138 + }, + { + "epoch": 0.9166015223205102, + "grad_norm": 1.7490167109142685, + "learning_rate": 3.622782187897167e-07, + "loss": 0.7192, + "step": 11139 + }, + { + "epoch": 0.9166838099156552, + "grad_norm": 1.9789641637939623, + "learning_rate": 3.615676533661916e-07, + "loss": 0.6964, + "step": 11140 + }, + { + "epoch": 0.9167660975108003, + "grad_norm": 0.43176660458614835, + "learning_rate": 3.6085777264148524e-07, + "loss": 0.4699, + "step": 11141 + }, + { + "epoch": 0.9168483851059452, + "grad_norm": 2.215304175995779, + "learning_rate": 3.6014857666602955e-07, + "loss": 0.7058, + "step": 11142 + }, + { + "epoch": 0.9169306727010903, + "grad_norm": 1.9973814493309634, + "learning_rate": 3.5944006549019974e-07, + "loss": 0.7117, + "step": 11143 + }, + { + "epoch": 0.9170129602962354, + "grad_norm": 1.7674951493769886, + "learning_rate": 3.5873223916433107e-07, + "loss": 0.687, + "step": 11144 + }, + { + "epoch": 0.9170952478913804, + "grad_norm": 2.3455530997613874, + "learning_rate": 3.580250977387056e-07, + "loss": 0.704, + "step": 11145 + }, + { + "epoch": 0.9171775354865254, + "grad_norm": 1.7171856058758734, + "learning_rate": 3.5731864126355766e-07, + "loss": 0.7062, + "step": 11146 + }, + { + "epoch": 0.9172598230816704, + "grad_norm": 0.424020996240505, + "learning_rate": 3.5661286978907026e-07, + "loss": 0.4872, + "step": 11147 + }, + { + "epoch": 0.9173421106768155, + "grad_norm": 2.053649850906344, + "learning_rate": 3.5590778336538346e-07, + "loss": 0.6912, + "step": 11148 + }, + { + "epoch": 0.9174243982719605, + "grad_norm": 2.0199947967638336, + "learning_rate": 3.552033820425838e-07, + "loss": 0.708, + "step": 11149 + }, + { + "epoch": 0.9175066858671055, + "grad_norm": 3.363111131720926, + "learning_rate": 3.544996658707134e-07, + "loss": 0.6867, + "step": 11150 + }, + { + "epoch": 0.9175889734622505, + "grad_norm": 2.0844997125121756, + "learning_rate": 3.53796634899759e-07, + "loss": 0.6923, + "step": 11151 + }, + { + "epoch": 0.9176712610573956, + "grad_norm": 1.9861006159027221, + "learning_rate": 3.5309428917966515e-07, + "loss": 0.6973, + "step": 11152 + }, + { + "epoch": 0.9177535486525407, + "grad_norm": 1.9265598371803432, + "learning_rate": 3.523926287603241e-07, + "loss": 0.7371, + "step": 11153 + }, + { + "epoch": 0.9178358362476856, + "grad_norm": 2.1561632865458757, + "learning_rate": 3.5169165369158266e-07, + "loss": 0.7517, + "step": 11154 + }, + { + "epoch": 0.9179181238428307, + "grad_norm": 1.7885685662382447, + "learning_rate": 3.509913640232354e-07, + "loss": 0.7248, + "step": 11155 + }, + { + "epoch": 0.9180004114379757, + "grad_norm": 1.8746516523314138, + "learning_rate": 3.5029175980503263e-07, + "loss": 0.7051, + "step": 11156 + }, + { + "epoch": 0.9180826990331208, + "grad_norm": 0.42669486517882177, + "learning_rate": 3.495928410866678e-07, + "loss": 0.4678, + "step": 11157 + }, + { + "epoch": 0.9181649866282657, + "grad_norm": 1.5085186998303914, + "learning_rate": 3.488946079177957e-07, + "loss": 0.7051, + "step": 11158 + }, + { + "epoch": 0.9182472742234108, + "grad_norm": 1.9137125990368065, + "learning_rate": 3.481970603480145e-07, + "loss": 0.6801, + "step": 11159 + }, + { + "epoch": 0.9183295618185559, + "grad_norm": 0.4522697848414297, + "learning_rate": 3.4750019842688e-07, + "loss": 0.473, + "step": 11160 + }, + { + "epoch": 0.9184118494137009, + "grad_norm": 0.4070960042578885, + "learning_rate": 3.4680402220389485e-07, + "loss": 0.4751, + "step": 11161 + }, + { + "epoch": 0.918494137008846, + "grad_norm": 2.9779201006960236, + "learning_rate": 3.461085317285129e-07, + "loss": 0.671, + "step": 11162 + }, + { + "epoch": 0.9185764246039909, + "grad_norm": 0.4165353861555988, + "learning_rate": 3.454137270501412e-07, + "loss": 0.4671, + "step": 11163 + }, + { + "epoch": 0.918658712199136, + "grad_norm": 1.7013635065197363, + "learning_rate": 3.447196082181392e-07, + "loss": 0.705, + "step": 11164 + }, + { + "epoch": 0.918740999794281, + "grad_norm": 0.4455564829660013, + "learning_rate": 3.4402617528181304e-07, + "loss": 0.4625, + "step": 11165 + }, + { + "epoch": 0.9188232873894261, + "grad_norm": 2.059802938474781, + "learning_rate": 3.433334282904266e-07, + "loss": 0.7023, + "step": 11166 + }, + { + "epoch": 0.918905574984571, + "grad_norm": 2.0862104738994267, + "learning_rate": 3.4264136729319054e-07, + "loss": 0.7073, + "step": 11167 + }, + { + "epoch": 0.9189878625797161, + "grad_norm": 1.9348851339717583, + "learning_rate": 3.4194999233926664e-07, + "loss": 0.7331, + "step": 11168 + }, + { + "epoch": 0.9190701501748612, + "grad_norm": 1.8414786307869675, + "learning_rate": 3.412593034777678e-07, + "loss": 0.7332, + "step": 11169 + }, + { + "epoch": 0.9191524377700062, + "grad_norm": 2.1169698362827933, + "learning_rate": 3.405693007577626e-07, + "loss": 0.7069, + "step": 11170 + }, + { + "epoch": 0.9192347253651512, + "grad_norm": 2.1975156477602513, + "learning_rate": 3.398799842282652e-07, + "loss": 0.7018, + "step": 11171 + }, + { + "epoch": 0.9193170129602962, + "grad_norm": 2.251103702219983, + "learning_rate": 3.391913539382463e-07, + "loss": 0.7071, + "step": 11172 + }, + { + "epoch": 0.9193993005554413, + "grad_norm": 1.8505745101046729, + "learning_rate": 3.3850340993662246e-07, + "loss": 0.7056, + "step": 11173 + }, + { + "epoch": 0.9194815881505863, + "grad_norm": 1.86079842391407, + "learning_rate": 3.378161522722656e-07, + "loss": 0.7055, + "step": 11174 + }, + { + "epoch": 0.9195638757457313, + "grad_norm": 1.7482574883125304, + "learning_rate": 3.371295809939967e-07, + "loss": 0.7158, + "step": 11175 + }, + { + "epoch": 0.9196461633408763, + "grad_norm": 1.930405136741411, + "learning_rate": 3.3644369615058904e-07, + "loss": 0.7179, + "step": 11176 + }, + { + "epoch": 0.9197284509360214, + "grad_norm": 3.21248043307195, + "learning_rate": 3.3575849779076575e-07, + "loss": 0.6874, + "step": 11177 + }, + { + "epoch": 0.9198107385311665, + "grad_norm": 1.9568543504972324, + "learning_rate": 3.350739859632035e-07, + "loss": 0.689, + "step": 11178 + }, + { + "epoch": 0.9198930261263114, + "grad_norm": 2.317259595876262, + "learning_rate": 3.3439016071652896e-07, + "loss": 0.6964, + "step": 11179 + }, + { + "epoch": 0.9199753137214565, + "grad_norm": 0.4300465636391542, + "learning_rate": 3.3370702209931995e-07, + "loss": 0.4718, + "step": 11180 + }, + { + "epoch": 0.9200576013166015, + "grad_norm": 0.4186830059197986, + "learning_rate": 3.3302457016010316e-07, + "loss": 0.4843, + "step": 11181 + }, + { + "epoch": 0.9201398889117466, + "grad_norm": 1.8500288250402575, + "learning_rate": 3.3234280494736313e-07, + "loss": 0.6943, + "step": 11182 + }, + { + "epoch": 0.9202221765068915, + "grad_norm": 2.8601505788876733, + "learning_rate": 3.3166172650952786e-07, + "loss": 0.7011, + "step": 11183 + }, + { + "epoch": 0.9203044641020366, + "grad_norm": 2.419095038862596, + "learning_rate": 3.30981334894982e-07, + "loss": 0.6983, + "step": 11184 + }, + { + "epoch": 0.9203867516971816, + "grad_norm": 0.4131232421178841, + "learning_rate": 3.3030163015205784e-07, + "loss": 0.4822, + "step": 11185 + }, + { + "epoch": 0.9204690392923267, + "grad_norm": 2.321308149997848, + "learning_rate": 3.2962261232904134e-07, + "loss": 0.7193, + "step": 11186 + }, + { + "epoch": 0.9205513268874718, + "grad_norm": 2.408139149625049, + "learning_rate": 3.289442814741706e-07, + "loss": 0.6997, + "step": 11187 + }, + { + "epoch": 0.9206336144826167, + "grad_norm": 2.083855000921472, + "learning_rate": 3.282666376356303e-07, + "loss": 0.6952, + "step": 11188 + }, + { + "epoch": 0.9207159020777618, + "grad_norm": 2.0843356501131005, + "learning_rate": 3.275896808615608e-07, + "loss": 0.7328, + "step": 11189 + }, + { + "epoch": 0.9207981896729068, + "grad_norm": 2.0168248022971023, + "learning_rate": 3.269134112000516e-07, + "loss": 0.7252, + "step": 11190 + }, + { + "epoch": 0.9208804772680519, + "grad_norm": 1.823147676994674, + "learning_rate": 3.262378286991441e-07, + "loss": 0.7024, + "step": 11191 + }, + { + "epoch": 0.9209627648631968, + "grad_norm": 2.2188437944384507, + "learning_rate": 3.2556293340683e-07, + "loss": 0.7232, + "step": 11192 + }, + { + "epoch": 0.9210450524583419, + "grad_norm": 4.206370307809758, + "learning_rate": 3.248887253710531e-07, + "loss": 0.7381, + "step": 11193 + }, + { + "epoch": 0.921127340053487, + "grad_norm": 2.007295787570619, + "learning_rate": 3.2421520463970733e-07, + "loss": 0.7093, + "step": 11194 + }, + { + "epoch": 0.921209627648632, + "grad_norm": 1.7431942347815454, + "learning_rate": 3.2354237126064117e-07, + "loss": 0.6707, + "step": 11195 + }, + { + "epoch": 0.921291915243777, + "grad_norm": 1.6154964650847916, + "learning_rate": 3.2287022528164846e-07, + "loss": 0.6843, + "step": 11196 + }, + { + "epoch": 0.921374202838922, + "grad_norm": 2.2268285867267203, + "learning_rate": 3.221987667504789e-07, + "loss": 0.7031, + "step": 11197 + }, + { + "epoch": 0.9214564904340671, + "grad_norm": 2.699424405908909, + "learning_rate": 3.2152799571483093e-07, + "loss": 0.7319, + "step": 11198 + }, + { + "epoch": 0.9215387780292121, + "grad_norm": 0.4115826119220522, + "learning_rate": 3.208579122223576e-07, + "loss": 0.4754, + "step": 11199 + }, + { + "epoch": 0.9216210656243571, + "grad_norm": 0.42372727561835577, + "learning_rate": 3.201885163206564e-07, + "loss": 0.459, + "step": 11200 + }, + { + "epoch": 0.9217033532195021, + "grad_norm": 1.6973587414082418, + "learning_rate": 3.1951980805728366e-07, + "loss": 0.6853, + "step": 11201 + }, + { + "epoch": 0.9217856408146472, + "grad_norm": 2.254218956928782, + "learning_rate": 3.1885178747974257e-07, + "loss": 0.6997, + "step": 11202 + }, + { + "epoch": 0.9218679284097923, + "grad_norm": 2.035840959699793, + "learning_rate": 3.181844546354873e-07, + "loss": 0.7042, + "step": 11203 + }, + { + "epoch": 0.9219502160049372, + "grad_norm": 2.3216962555719527, + "learning_rate": 3.175178095719245e-07, + "loss": 0.7179, + "step": 11204 + }, + { + "epoch": 0.9220325036000823, + "grad_norm": 2.1646797279533243, + "learning_rate": 3.168518523364117e-07, + "loss": 0.7505, + "step": 11205 + }, + { + "epoch": 0.9221147911952273, + "grad_norm": 1.8918975580496307, + "learning_rate": 3.161865829762567e-07, + "loss": 0.7256, + "step": 11206 + }, + { + "epoch": 0.9221970787903724, + "grad_norm": 1.9016089321748606, + "learning_rate": 3.155220015387206e-07, + "loss": 0.722, + "step": 11207 + }, + { + "epoch": 0.9222793663855173, + "grad_norm": 1.6335602861207836, + "learning_rate": 3.1485810807101447e-07, + "loss": 0.6916, + "step": 11208 + }, + { + "epoch": 0.9223616539806624, + "grad_norm": 0.4202625516753293, + "learning_rate": 3.141949026202984e-07, + "loss": 0.4618, + "step": 11209 + }, + { + "epoch": 0.9224439415758074, + "grad_norm": 1.9122937143336103, + "learning_rate": 3.135323852336847e-07, + "loss": 0.6946, + "step": 11210 + }, + { + "epoch": 0.9225262291709525, + "grad_norm": 0.407191492899598, + "learning_rate": 3.1287055595824125e-07, + "loss": 0.4558, + "step": 11211 + }, + { + "epoch": 0.9226085167660976, + "grad_norm": 2.3840285372342085, + "learning_rate": 3.122094148409793e-07, + "loss": 0.7094, + "step": 11212 + }, + { + "epoch": 0.9226908043612425, + "grad_norm": 2.332944464386065, + "learning_rate": 3.115489619288692e-07, + "loss": 0.7298, + "step": 11213 + }, + { + "epoch": 0.9227730919563876, + "grad_norm": 1.8474640217786942, + "learning_rate": 3.108891972688266e-07, + "loss": 0.7018, + "step": 11214 + }, + { + "epoch": 0.9228553795515326, + "grad_norm": 1.979582488751761, + "learning_rate": 3.102301209077185e-07, + "loss": 0.7232, + "step": 11215 + }, + { + "epoch": 0.9229376671466777, + "grad_norm": 1.799195111475414, + "learning_rate": 3.0957173289236644e-07, + "loss": 0.699, + "step": 11216 + }, + { + "epoch": 0.9230199547418226, + "grad_norm": 1.6541994565396199, + "learning_rate": 3.089140332695417e-07, + "loss": 0.7083, + "step": 11217 + }, + { + "epoch": 0.9231022423369677, + "grad_norm": 1.8984678507115642, + "learning_rate": 3.0825702208596376e-07, + "loss": 0.6892, + "step": 11218 + }, + { + "epoch": 0.9231845299321128, + "grad_norm": 1.8380749105268797, + "learning_rate": 3.076006993883085e-07, + "loss": 0.667, + "step": 11219 + }, + { + "epoch": 0.9232668175272578, + "grad_norm": 3.3120527637388566, + "learning_rate": 3.0694506522319977e-07, + "loss": 0.6925, + "step": 11220 + }, + { + "epoch": 0.9233491051224028, + "grad_norm": 2.283447824175545, + "learning_rate": 3.0629011963721143e-07, + "loss": 0.7161, + "step": 11221 + }, + { + "epoch": 0.9234313927175478, + "grad_norm": 1.8298093720802682, + "learning_rate": 3.0563586267686853e-07, + "loss": 0.6621, + "step": 11222 + }, + { + "epoch": 0.9235136803126929, + "grad_norm": 1.6476251159353563, + "learning_rate": 3.0498229438865157e-07, + "loss": 0.7148, + "step": 11223 + }, + { + "epoch": 0.9235959679078379, + "grad_norm": 1.9762375306283848, + "learning_rate": 3.043294148189868e-07, + "loss": 0.7378, + "step": 11224 + }, + { + "epoch": 0.9236782555029829, + "grad_norm": 1.9778665563641953, + "learning_rate": 3.0367722401425495e-07, + "loss": 0.6911, + "step": 11225 + }, + { + "epoch": 0.9237605430981279, + "grad_norm": 1.958022299492967, + "learning_rate": 3.0302572202078664e-07, + "loss": 0.6874, + "step": 11226 + }, + { + "epoch": 0.923842830693273, + "grad_norm": 2.4185327790295688, + "learning_rate": 3.0237490888486265e-07, + "loss": 0.7236, + "step": 11227 + }, + { + "epoch": 0.9239251182884181, + "grad_norm": 1.8946770059973608, + "learning_rate": 3.0172478465271493e-07, + "loss": 0.6735, + "step": 11228 + }, + { + "epoch": 0.924007405883563, + "grad_norm": 1.7817821835973289, + "learning_rate": 3.0107534937052986e-07, + "loss": 0.722, + "step": 11229 + }, + { + "epoch": 0.9240896934787081, + "grad_norm": 1.7635435178885828, + "learning_rate": 3.004266030844394e-07, + "loss": 0.7077, + "step": 11230 + }, + { + "epoch": 0.9241719810738531, + "grad_norm": 1.6942067847759714, + "learning_rate": 2.9977854584053336e-07, + "loss": 0.6782, + "step": 11231 + }, + { + "epoch": 0.9242542686689982, + "grad_norm": 2.3935053906351933, + "learning_rate": 2.991311776848438e-07, + "loss": 0.6765, + "step": 11232 + }, + { + "epoch": 0.9243365562641431, + "grad_norm": 1.9672727718255352, + "learning_rate": 2.984844986633617e-07, + "loss": 0.6954, + "step": 11233 + }, + { + "epoch": 0.9244188438592882, + "grad_norm": 2.173472497902593, + "learning_rate": 2.978385088220248e-07, + "loss": 0.6822, + "step": 11234 + }, + { + "epoch": 0.9245011314544332, + "grad_norm": 2.057946499093014, + "learning_rate": 2.9719320820672524e-07, + "loss": 0.6722, + "step": 11235 + }, + { + "epoch": 0.9245834190495783, + "grad_norm": 2.1363558380340475, + "learning_rate": 2.9654859686330194e-07, + "loss": 0.7038, + "step": 11236 + }, + { + "epoch": 0.9246657066447234, + "grad_norm": 4.847951772179186, + "learning_rate": 2.959046748375505e-07, + "loss": 0.6721, + "step": 11237 + }, + { + "epoch": 0.9247479942398683, + "grad_norm": 2.125572509255214, + "learning_rate": 2.952614421752087e-07, + "loss": 0.7131, + "step": 11238 + }, + { + "epoch": 0.9248302818350134, + "grad_norm": 1.9141641843347492, + "learning_rate": 2.946188989219745e-07, + "loss": 0.6802, + "step": 11239 + }, + { + "epoch": 0.9249125694301584, + "grad_norm": 2.877994093742204, + "learning_rate": 2.9397704512349133e-07, + "loss": 0.7531, + "step": 11240 + }, + { + "epoch": 0.9249948570253035, + "grad_norm": 1.799365394107581, + "learning_rate": 2.9333588082535726e-07, + "loss": 0.7267, + "step": 11241 + }, + { + "epoch": 0.9250771446204484, + "grad_norm": 1.5551425805527177, + "learning_rate": 2.926954060731191e-07, + "loss": 0.6933, + "step": 11242 + }, + { + "epoch": 0.9251594322155935, + "grad_norm": 1.7596207951971268, + "learning_rate": 2.920556209122749e-07, + "loss": 0.7039, + "step": 11243 + }, + { + "epoch": 0.9252417198107385, + "grad_norm": 2.3656339844672267, + "learning_rate": 2.914165253882728e-07, + "loss": 0.7195, + "step": 11244 + }, + { + "epoch": 0.9253240074058836, + "grad_norm": 1.8522331769488773, + "learning_rate": 2.9077811954651425e-07, + "loss": 0.7081, + "step": 11245 + }, + { + "epoch": 0.9254062950010286, + "grad_norm": 1.838683036924409, + "learning_rate": 2.901404034323507e-07, + "loss": 0.7029, + "step": 11246 + }, + { + "epoch": 0.9254885825961736, + "grad_norm": 2.4485692078314036, + "learning_rate": 2.895033770910849e-07, + "loss": 0.7377, + "step": 11247 + }, + { + "epoch": 0.9255708701913187, + "grad_norm": 1.868731252975563, + "learning_rate": 2.8886704056796945e-07, + "loss": 0.7129, + "step": 11248 + }, + { + "epoch": 0.9256531577864637, + "grad_norm": 1.9616632588024705, + "learning_rate": 2.882313939082082e-07, + "loss": 0.7018, + "step": 11249 + }, + { + "epoch": 0.9257354453816087, + "grad_norm": 2.4464993360017364, + "learning_rate": 2.8759643715695616e-07, + "loss": 0.6746, + "step": 11250 + }, + { + "epoch": 0.9258177329767537, + "grad_norm": 2.01278180003544, + "learning_rate": 2.869621703593228e-07, + "loss": 0.6875, + "step": 11251 + }, + { + "epoch": 0.9259000205718988, + "grad_norm": 1.7593243283486453, + "learning_rate": 2.8632859356036103e-07, + "loss": 0.7222, + "step": 11252 + }, + { + "epoch": 0.9259823081670439, + "grad_norm": 2.5096596810540306, + "learning_rate": 2.8569570680508254e-07, + "loss": 0.712, + "step": 11253 + }, + { + "epoch": 0.9260645957621888, + "grad_norm": 1.9826867047699595, + "learning_rate": 2.850635101384447e-07, + "loss": 0.7125, + "step": 11254 + }, + { + "epoch": 0.9261468833573339, + "grad_norm": 3.8156930564109364, + "learning_rate": 2.844320036053594e-07, + "loss": 0.6904, + "step": 11255 + }, + { + "epoch": 0.9262291709524789, + "grad_norm": 1.9464954844151272, + "learning_rate": 2.838011872506852e-07, + "loss": 0.6845, + "step": 11256 + }, + { + "epoch": 0.926311458547624, + "grad_norm": 2.358587341722792, + "learning_rate": 2.831710611192373e-07, + "loss": 0.735, + "step": 11257 + }, + { + "epoch": 0.9263937461427689, + "grad_norm": 1.9282382101593416, + "learning_rate": 2.8254162525577553e-07, + "loss": 0.7083, + "step": 11258 + }, + { + "epoch": 0.926476033737914, + "grad_norm": 2.359634767363598, + "learning_rate": 2.8191287970501747e-07, + "loss": 0.7292, + "step": 11259 + }, + { + "epoch": 0.926558321333059, + "grad_norm": 2.125430868836873, + "learning_rate": 2.812848245116273e-07, + "loss": 0.7197, + "step": 11260 + }, + { + "epoch": 0.9266406089282041, + "grad_norm": 1.7063617992072404, + "learning_rate": 2.8065745972021943e-07, + "loss": 0.7068, + "step": 11261 + }, + { + "epoch": 0.9267228965233492, + "grad_norm": 1.7624021473946603, + "learning_rate": 2.800307853753614e-07, + "loss": 0.739, + "step": 11262 + }, + { + "epoch": 0.9268051841184941, + "grad_norm": 3.167617712568675, + "learning_rate": 2.794048015215722e-07, + "loss": 0.7106, + "step": 11263 + }, + { + "epoch": 0.9268874717136392, + "grad_norm": 0.4162063297423761, + "learning_rate": 2.7877950820331954e-07, + "loss": 0.459, + "step": 11264 + }, + { + "epoch": 0.9269697593087842, + "grad_norm": 2.0510370324319473, + "learning_rate": 2.7815490546502457e-07, + "loss": 0.7226, + "step": 11265 + }, + { + "epoch": 0.9270520469039293, + "grad_norm": 2.074813362555719, + "learning_rate": 2.775309933510573e-07, + "loss": 0.7202, + "step": 11266 + }, + { + "epoch": 0.9271343344990742, + "grad_norm": 0.41678611532792487, + "learning_rate": 2.7690777190574003e-07, + "loss": 0.4688, + "step": 11267 + }, + { + "epoch": 0.9272166220942193, + "grad_norm": 2.524819704929307, + "learning_rate": 2.76285241173343e-07, + "loss": 0.7063, + "step": 11268 + }, + { + "epoch": 0.9272989096893643, + "grad_norm": 2.192225635805618, + "learning_rate": 2.7566340119809297e-07, + "loss": 0.7008, + "step": 11269 + }, + { + "epoch": 0.9273811972845094, + "grad_norm": 1.925742741909308, + "learning_rate": 2.750422520241625e-07, + "loss": 0.701, + "step": 11270 + }, + { + "epoch": 0.9274634848796544, + "grad_norm": 1.7635525890697241, + "learning_rate": 2.7442179369567836e-07, + "loss": 0.7029, + "step": 11271 + }, + { + "epoch": 0.9275457724747994, + "grad_norm": 1.8430370674105243, + "learning_rate": 2.738020262567154e-07, + "loss": 0.7009, + "step": 11272 + }, + { + "epoch": 0.9276280600699445, + "grad_norm": 2.002272188286741, + "learning_rate": 2.731829497513028e-07, + "loss": 0.7057, + "step": 11273 + }, + { + "epoch": 0.9277103476650895, + "grad_norm": 1.7864164057711445, + "learning_rate": 2.7256456422341647e-07, + "loss": 0.7178, + "step": 11274 + }, + { + "epoch": 0.9277926352602345, + "grad_norm": 1.8979446271776037, + "learning_rate": 2.7194686971698805e-07, + "loss": 0.7227, + "step": 11275 + }, + { + "epoch": 0.9278749228553795, + "grad_norm": 1.9126674694157884, + "learning_rate": 2.7132986627589454e-07, + "loss": 0.7025, + "step": 11276 + }, + { + "epoch": 0.9279572104505246, + "grad_norm": 1.85867373124332, + "learning_rate": 2.707135539439698e-07, + "loss": 0.7053, + "step": 11277 + }, + { + "epoch": 0.9280394980456697, + "grad_norm": 3.8561739648234066, + "learning_rate": 2.700979327649955e-07, + "loss": 0.707, + "step": 11278 + }, + { + "epoch": 0.9281217856408146, + "grad_norm": 1.7699153081722436, + "learning_rate": 2.694830027827022e-07, + "loss": 0.7065, + "step": 11279 + }, + { + "epoch": 0.9282040732359597, + "grad_norm": 1.8221859939792688, + "learning_rate": 2.688687640407739e-07, + "loss": 0.7276, + "step": 11280 + }, + { + "epoch": 0.9282863608311047, + "grad_norm": 1.8055607560155484, + "learning_rate": 2.682552165828478e-07, + "loss": 0.7008, + "step": 11281 + }, + { + "epoch": 0.9283686484262498, + "grad_norm": 1.713287013853736, + "learning_rate": 2.676423604525058e-07, + "loss": 0.7172, + "step": 11282 + }, + { + "epoch": 0.9284509360213947, + "grad_norm": 2.1336521560727646, + "learning_rate": 2.6703019569328746e-07, + "loss": 0.6893, + "step": 11283 + }, + { + "epoch": 0.9285332236165398, + "grad_norm": 1.7833327511488737, + "learning_rate": 2.664187223486769e-07, + "loss": 0.6893, + "step": 11284 + }, + { + "epoch": 0.9286155112116848, + "grad_norm": 1.7292393174598433, + "learning_rate": 2.65807940462115e-07, + "loss": 0.7051, + "step": 11285 + }, + { + "epoch": 0.9286977988068299, + "grad_norm": 0.4144297577952963, + "learning_rate": 2.6519785007698916e-07, + "loss": 0.4512, + "step": 11286 + }, + { + "epoch": 0.9287800864019748, + "grad_norm": 2.0049921216186335, + "learning_rate": 2.6458845123663926e-07, + "loss": 0.6971, + "step": 11287 + }, + { + "epoch": 0.9288623739971199, + "grad_norm": 0.42711488775963274, + "learning_rate": 2.6397974398435613e-07, + "loss": 0.4752, + "step": 11288 + }, + { + "epoch": 0.928944661592265, + "grad_norm": 2.069065070058369, + "learning_rate": 2.633717283633819e-07, + "loss": 0.7334, + "step": 11289 + }, + { + "epoch": 0.92902694918741, + "grad_norm": 1.6400501248276589, + "learning_rate": 2.6276440441690866e-07, + "loss": 0.7092, + "step": 11290 + }, + { + "epoch": 0.9291092367825551, + "grad_norm": 0.4219462118361071, + "learning_rate": 2.6215777218807967e-07, + "loss": 0.4504, + "step": 11291 + }, + { + "epoch": 0.9291915243777, + "grad_norm": 2.592378329572483, + "learning_rate": 2.615518317199883e-07, + "loss": 0.7078, + "step": 11292 + }, + { + "epoch": 0.9292738119728451, + "grad_norm": 2.4162405709807486, + "learning_rate": 2.609465830556812e-07, + "loss": 0.7184, + "step": 11293 + }, + { + "epoch": 0.9293560995679901, + "grad_norm": 3.0669430390147916, + "learning_rate": 2.6034202623815285e-07, + "loss": 0.6996, + "step": 11294 + }, + { + "epoch": 0.9294383871631352, + "grad_norm": 2.0712177978401316, + "learning_rate": 2.5973816131035226e-07, + "loss": 0.71, + "step": 11295 + }, + { + "epoch": 0.9295206747582802, + "grad_norm": 2.0166780800606694, + "learning_rate": 2.5913498831517504e-07, + "loss": 0.7133, + "step": 11296 + }, + { + "epoch": 0.9296029623534252, + "grad_norm": 2.18263555522366, + "learning_rate": 2.585325072954692e-07, + "loss": 0.711, + "step": 11297 + }, + { + "epoch": 0.9296852499485703, + "grad_norm": 0.43149930985756263, + "learning_rate": 2.5793071829403493e-07, + "loss": 0.4841, + "step": 11298 + }, + { + "epoch": 0.9297675375437153, + "grad_norm": 2.3038097214220596, + "learning_rate": 2.573296213536236e-07, + "loss": 0.7153, + "step": 11299 + }, + { + "epoch": 0.9298498251388603, + "grad_norm": 2.2093184350436337, + "learning_rate": 2.5672921651693215e-07, + "loss": 0.6834, + "step": 11300 + }, + { + "epoch": 0.9299321127340053, + "grad_norm": 2.3291355785025805, + "learning_rate": 2.561295038266187e-07, + "loss": 0.706, + "step": 11301 + }, + { + "epoch": 0.9300144003291504, + "grad_norm": 2.347579773528517, + "learning_rate": 2.555304833252792e-07, + "loss": 0.711, + "step": 11302 + }, + { + "epoch": 0.9300966879242955, + "grad_norm": 0.3987358335134094, + "learning_rate": 2.5493215505547177e-07, + "loss": 0.4775, + "step": 11303 + }, + { + "epoch": 0.9301789755194404, + "grad_norm": 1.667409331542241, + "learning_rate": 2.54334519059698e-07, + "loss": 0.6994, + "step": 11304 + }, + { + "epoch": 0.9302612631145855, + "grad_norm": 0.4236711668069023, + "learning_rate": 2.537375753804161e-07, + "loss": 0.4878, + "step": 11305 + }, + { + "epoch": 0.9303435507097305, + "grad_norm": 2.055934832617243, + "learning_rate": 2.531413240600278e-07, + "loss": 0.7121, + "step": 11306 + }, + { + "epoch": 0.9304258383048756, + "grad_norm": 1.9739829786119014, + "learning_rate": 2.5254576514089357e-07, + "loss": 0.7289, + "step": 11307 + }, + { + "epoch": 0.9305081259000205, + "grad_norm": 1.8096029878775812, + "learning_rate": 2.519508986653185e-07, + "loss": 0.6937, + "step": 11308 + }, + { + "epoch": 0.9305904134951656, + "grad_norm": 1.8968223676752807, + "learning_rate": 2.5135672467556216e-07, + "loss": 0.7006, + "step": 11309 + }, + { + "epoch": 0.9306727010903106, + "grad_norm": 0.4253011800046198, + "learning_rate": 2.5076324321383295e-07, + "loss": 0.4819, + "step": 11310 + }, + { + "epoch": 0.9307549886854557, + "grad_norm": 1.7060864531239284, + "learning_rate": 2.501704543222927e-07, + "loss": 0.7056, + "step": 11311 + }, + { + "epoch": 0.9308372762806006, + "grad_norm": 1.8474221389948373, + "learning_rate": 2.4957835804305e-07, + "loss": 0.6979, + "step": 11312 + }, + { + "epoch": 0.9309195638757457, + "grad_norm": 0.398992384064237, + "learning_rate": 2.489869544181678e-07, + "loss": 0.4769, + "step": 11313 + }, + { + "epoch": 0.9310018514708908, + "grad_norm": 0.41620782433414716, + "learning_rate": 2.4839624348965694e-07, + "loss": 0.4619, + "step": 11314 + }, + { + "epoch": 0.9310841390660358, + "grad_norm": 2.7488112095061705, + "learning_rate": 2.4780622529948174e-07, + "loss": 0.7219, + "step": 11315 + }, + { + "epoch": 0.9311664266611809, + "grad_norm": 2.5040152821376727, + "learning_rate": 2.472168998895563e-07, + "loss": 0.7081, + "step": 11316 + }, + { + "epoch": 0.9312487142563258, + "grad_norm": 1.935972128970949, + "learning_rate": 2.4662826730174505e-07, + "loss": 0.6816, + "step": 11317 + }, + { + "epoch": 0.9313310018514709, + "grad_norm": 1.6315728508342249, + "learning_rate": 2.460403275778633e-07, + "loss": 0.6996, + "step": 11318 + }, + { + "epoch": 0.9314132894466159, + "grad_norm": 2.7858095182925684, + "learning_rate": 2.4545308075967777e-07, + "loss": 0.7124, + "step": 11319 + }, + { + "epoch": 0.931495577041761, + "grad_norm": 2.052639757957855, + "learning_rate": 2.44866526888905e-07, + "loss": 0.6926, + "step": 11320 + }, + { + "epoch": 0.931577864636906, + "grad_norm": 2.4792941959756516, + "learning_rate": 2.442806660072139e-07, + "loss": 0.6968, + "step": 11321 + }, + { + "epoch": 0.931660152232051, + "grad_norm": 2.755477937877578, + "learning_rate": 2.4369549815622005e-07, + "loss": 0.6858, + "step": 11322 + }, + { + "epoch": 0.9317424398271961, + "grad_norm": 3.01217266910165, + "learning_rate": 2.4311102337749804e-07, + "loss": 0.7208, + "step": 11323 + }, + { + "epoch": 0.9318247274223411, + "grad_norm": 2.2544309363605937, + "learning_rate": 2.425272417125635e-07, + "loss": 0.7093, + "step": 11324 + }, + { + "epoch": 0.9319070150174861, + "grad_norm": 2.405253121100486, + "learning_rate": 2.419441532028899e-07, + "loss": 0.7052, + "step": 11325 + }, + { + "epoch": 0.9319893026126311, + "grad_norm": 2.1138087782929906, + "learning_rate": 2.4136175788989637e-07, + "loss": 0.7036, + "step": 11326 + }, + { + "epoch": 0.9320715902077762, + "grad_norm": 3.096802404644551, + "learning_rate": 2.4078005581495867e-07, + "loss": 0.7143, + "step": 11327 + }, + { + "epoch": 0.9321538778029212, + "grad_norm": 2.0404300703403857, + "learning_rate": 2.40199047019396e-07, + "loss": 0.7388, + "step": 11328 + }, + { + "epoch": 0.9322361653980662, + "grad_norm": 1.9964956064727892, + "learning_rate": 2.3961873154448646e-07, + "loss": 0.7082, + "step": 11329 + }, + { + "epoch": 0.9323184529932113, + "grad_norm": 0.4203321594362028, + "learning_rate": 2.390391094314526e-07, + "loss": 0.4801, + "step": 11330 + }, + { + "epoch": 0.9324007405883563, + "grad_norm": 2.28044888024721, + "learning_rate": 2.3846018072147037e-07, + "loss": 0.7217, + "step": 11331 + }, + { + "epoch": 0.9324830281835014, + "grad_norm": 1.9561656457613756, + "learning_rate": 2.3788194545566357e-07, + "loss": 0.7214, + "step": 11332 + }, + { + "epoch": 0.9325653157786463, + "grad_norm": 2.9538461577905526, + "learning_rate": 2.3730440367511376e-07, + "loss": 0.7258, + "step": 11333 + }, + { + "epoch": 0.9326476033737914, + "grad_norm": 2.024007507658877, + "learning_rate": 2.367275554208437e-07, + "loss": 0.7259, + "step": 11334 + }, + { + "epoch": 0.9327298909689364, + "grad_norm": 0.4136188293813573, + "learning_rate": 2.3615140073383502e-07, + "loss": 0.4646, + "step": 11335 + }, + { + "epoch": 0.9328121785640815, + "grad_norm": 2.340492747904092, + "learning_rate": 2.3557593965501723e-07, + "loss": 0.7166, + "step": 11336 + }, + { + "epoch": 0.9328944661592264, + "grad_norm": 1.727001245840666, + "learning_rate": 2.3500117222526763e-07, + "loss": 0.6907, + "step": 11337 + }, + { + "epoch": 0.9329767537543715, + "grad_norm": 1.8541928458185994, + "learning_rate": 2.344270984854169e-07, + "loss": 0.7124, + "step": 11338 + }, + { + "epoch": 0.9330590413495166, + "grad_norm": 2.3845476195879662, + "learning_rate": 2.3385371847624905e-07, + "loss": 0.6828, + "step": 11339 + }, + { + "epoch": 0.9331413289446616, + "grad_norm": 3.172180253680803, + "learning_rate": 2.3328103223849263e-07, + "loss": 0.7383, + "step": 11340 + }, + { + "epoch": 0.9332236165398067, + "grad_norm": 2.671094748969022, + "learning_rate": 2.32709039812834e-07, + "loss": 0.7252, + "step": 11341 + }, + { + "epoch": 0.9333059041349516, + "grad_norm": 2.252737419517049, + "learning_rate": 2.3213774123990395e-07, + "loss": 0.7198, + "step": 11342 + }, + { + "epoch": 0.9333881917300967, + "grad_norm": 1.9911603928601525, + "learning_rate": 2.3156713656028785e-07, + "loss": 0.7013, + "step": 11343 + }, + { + "epoch": 0.9334704793252417, + "grad_norm": 1.8576114762700076, + "learning_rate": 2.3099722581451878e-07, + "loss": 0.717, + "step": 11344 + }, + { + "epoch": 0.9335527669203868, + "grad_norm": 1.7798462868482081, + "learning_rate": 2.3042800904308437e-07, + "loss": 0.6985, + "step": 11345 + }, + { + "epoch": 0.9336350545155317, + "grad_norm": 2.198051920722998, + "learning_rate": 2.2985948628641896e-07, + "loss": 0.7036, + "step": 11346 + }, + { + "epoch": 0.9337173421106768, + "grad_norm": 1.9110081888407027, + "learning_rate": 2.2929165758491245e-07, + "loss": 0.6947, + "step": 11347 + }, + { + "epoch": 0.9337996297058219, + "grad_norm": 3.637111803437997, + "learning_rate": 2.2872452297890035e-07, + "loss": 0.6773, + "step": 11348 + }, + { + "epoch": 0.9338819173009669, + "grad_norm": 0.43134908536919375, + "learning_rate": 2.2815808250867156e-07, + "loss": 0.5004, + "step": 11349 + }, + { + "epoch": 0.9339642048961119, + "grad_norm": 3.8231402501283096, + "learning_rate": 2.2759233621446275e-07, + "loss": 0.7126, + "step": 11350 + }, + { + "epoch": 0.9340464924912569, + "grad_norm": 0.39724052315782277, + "learning_rate": 2.2702728413646734e-07, + "loss": 0.4726, + "step": 11351 + }, + { + "epoch": 0.934128780086402, + "grad_norm": 2.100011229521002, + "learning_rate": 2.2646292631482325e-07, + "loss": 0.7215, + "step": 11352 + }, + { + "epoch": 0.934211067681547, + "grad_norm": 2.0264526698870866, + "learning_rate": 2.2589926278962394e-07, + "loss": 0.7045, + "step": 11353 + }, + { + "epoch": 0.934293355276692, + "grad_norm": 3.6650203828346983, + "learning_rate": 2.2533629360090847e-07, + "loss": 0.7209, + "step": 11354 + }, + { + "epoch": 0.934375642871837, + "grad_norm": 2.248419662159314, + "learning_rate": 2.2477401878867157e-07, + "loss": 0.6989, + "step": 11355 + }, + { + "epoch": 0.9344579304669821, + "grad_norm": 0.4358611263937441, + "learning_rate": 2.2421243839285345e-07, + "loss": 0.4863, + "step": 11356 + }, + { + "epoch": 0.9345402180621272, + "grad_norm": 2.2605852438063287, + "learning_rate": 2.2365155245335002e-07, + "loss": 0.727, + "step": 11357 + }, + { + "epoch": 0.9346225056572721, + "grad_norm": 2.016788741291178, + "learning_rate": 2.2309136101000606e-07, + "loss": 0.7169, + "step": 11358 + }, + { + "epoch": 0.9347047932524172, + "grad_norm": 1.7338564752085062, + "learning_rate": 2.225318641026153e-07, + "loss": 0.7075, + "step": 11359 + }, + { + "epoch": 0.9347870808475622, + "grad_norm": 1.9697605494073274, + "learning_rate": 2.2197306177092482e-07, + "loss": 0.7204, + "step": 11360 + }, + { + "epoch": 0.9348693684427073, + "grad_norm": 1.872434930231894, + "learning_rate": 2.2141495405463065e-07, + "loss": 0.6971, + "step": 11361 + }, + { + "epoch": 0.9349516560378522, + "grad_norm": 1.9915205082564857, + "learning_rate": 2.2085754099337886e-07, + "loss": 0.7317, + "step": 11362 + }, + { + "epoch": 0.9350339436329973, + "grad_norm": 2.3764856584095946, + "learning_rate": 2.2030082262676777e-07, + "loss": 0.6856, + "step": 11363 + }, + { + "epoch": 0.9351162312281424, + "grad_norm": 2.1959853624162275, + "learning_rate": 2.1974479899434576e-07, + "loss": 0.7144, + "step": 11364 + }, + { + "epoch": 0.9351985188232874, + "grad_norm": 1.9432517796651276, + "learning_rate": 2.1918947013561342e-07, + "loss": 0.6807, + "step": 11365 + }, + { + "epoch": 0.9352808064184325, + "grad_norm": 1.962139438765353, + "learning_rate": 2.1863483609001812e-07, + "loss": 0.7092, + "step": 11366 + }, + { + "epoch": 0.9353630940135774, + "grad_norm": 2.4319179894548775, + "learning_rate": 2.1808089689696165e-07, + "loss": 0.6653, + "step": 11367 + }, + { + "epoch": 0.9354453816087225, + "grad_norm": 2.029780893283195, + "learning_rate": 2.1752765259579368e-07, + "loss": 0.7348, + "step": 11368 + }, + { + "epoch": 0.9355276692038675, + "grad_norm": 1.896766541300229, + "learning_rate": 2.1697510322581716e-07, + "loss": 0.6885, + "step": 11369 + }, + { + "epoch": 0.9356099567990126, + "grad_norm": 2.4500157430294047, + "learning_rate": 2.1642324882628186e-07, + "loss": 0.6879, + "step": 11370 + }, + { + "epoch": 0.9356922443941575, + "grad_norm": 0.4213299093415896, + "learning_rate": 2.1587208943639526e-07, + "loss": 0.4565, + "step": 11371 + }, + { + "epoch": 0.9357745319893026, + "grad_norm": 1.6492808582919194, + "learning_rate": 2.1532162509530607e-07, + "loss": 0.6886, + "step": 11372 + }, + { + "epoch": 0.9358568195844477, + "grad_norm": 1.8605632932368097, + "learning_rate": 2.1477185584212078e-07, + "loss": 0.7089, + "step": 11373 + }, + { + "epoch": 0.9359391071795927, + "grad_norm": 3.0442243077081543, + "learning_rate": 2.1422278171589372e-07, + "loss": 0.7424, + "step": 11374 + }, + { + "epoch": 0.9360213947747377, + "grad_norm": 2.2443305452005, + "learning_rate": 2.1367440275563035e-07, + "loss": 0.6961, + "step": 11375 + }, + { + "epoch": 0.9361036823698827, + "grad_norm": 2.0277848316224016, + "learning_rate": 2.1312671900028615e-07, + "loss": 0.7237, + "step": 11376 + }, + { + "epoch": 0.9361859699650278, + "grad_norm": 1.6432668355731448, + "learning_rate": 2.125797304887689e-07, + "loss": 0.6986, + "step": 11377 + }, + { + "epoch": 0.9362682575601728, + "grad_norm": 1.981992601165778, + "learning_rate": 2.120334372599342e-07, + "loss": 0.7222, + "step": 11378 + }, + { + "epoch": 0.9363505451553178, + "grad_norm": 1.7309499764445708, + "learning_rate": 2.1148783935259098e-07, + "loss": 0.689, + "step": 11379 + }, + { + "epoch": 0.9364328327504629, + "grad_norm": 1.8599517276799642, + "learning_rate": 2.1094293680549606e-07, + "loss": 0.7148, + "step": 11380 + }, + { + "epoch": 0.9365151203456079, + "grad_norm": 1.714699734331753, + "learning_rate": 2.1039872965736064e-07, + "loss": 0.7282, + "step": 11381 + }, + { + "epoch": 0.936597407940753, + "grad_norm": 1.698405032534264, + "learning_rate": 2.0985521794684383e-07, + "loss": 0.7196, + "step": 11382 + }, + { + "epoch": 0.9366796955358979, + "grad_norm": 1.9390735433483643, + "learning_rate": 2.0931240171255473e-07, + "loss": 0.7247, + "step": 11383 + }, + { + "epoch": 0.936761983131043, + "grad_norm": 2.1829612651401704, + "learning_rate": 2.0877028099305475e-07, + "loss": 0.7371, + "step": 11384 + }, + { + "epoch": 0.936844270726188, + "grad_norm": 2.2656271139424473, + "learning_rate": 2.0822885582685414e-07, + "loss": 0.7104, + "step": 11385 + }, + { + "epoch": 0.9369265583213331, + "grad_norm": 2.0684219951176557, + "learning_rate": 2.0768812625241773e-07, + "loss": 0.7434, + "step": 11386 + }, + { + "epoch": 0.937008845916478, + "grad_norm": 3.269315843150572, + "learning_rate": 2.071480923081548e-07, + "loss": 0.6954, + "step": 11387 + }, + { + "epoch": 0.9370911335116231, + "grad_norm": 2.3378736336670363, + "learning_rate": 2.0660875403243242e-07, + "loss": 0.7091, + "step": 11388 + }, + { + "epoch": 0.9371734211067682, + "grad_norm": 1.8059089532250816, + "learning_rate": 2.0607011146355993e-07, + "loss": 0.6948, + "step": 11389 + }, + { + "epoch": 0.9372557087019132, + "grad_norm": 1.8302431545330466, + "learning_rate": 2.0553216463980451e-07, + "loss": 0.6998, + "step": 11390 + }, + { + "epoch": 0.9373379962970583, + "grad_norm": 2.1574915657949516, + "learning_rate": 2.0499491359937896e-07, + "loss": 0.7131, + "step": 11391 + }, + { + "epoch": 0.9374202838922032, + "grad_norm": 1.9646466803532199, + "learning_rate": 2.0445835838045048e-07, + "loss": 0.6871, + "step": 11392 + }, + { + "epoch": 0.9375025714873483, + "grad_norm": 1.7080928972568339, + "learning_rate": 2.0392249902113525e-07, + "loss": 0.7113, + "step": 11393 + }, + { + "epoch": 0.9375848590824933, + "grad_norm": 1.6298314445162976, + "learning_rate": 2.0338733555949953e-07, + "loss": 0.6939, + "step": 11394 + }, + { + "epoch": 0.9376671466776384, + "grad_norm": 2.2543724611025073, + "learning_rate": 2.028528680335584e-07, + "loss": 0.7117, + "step": 11395 + }, + { + "epoch": 0.9377494342727833, + "grad_norm": 1.7466893802446135, + "learning_rate": 2.0231909648128267e-07, + "loss": 0.7038, + "step": 11396 + }, + { + "epoch": 0.9378317218679284, + "grad_norm": 1.6099648560451507, + "learning_rate": 2.0178602094058863e-07, + "loss": 0.7009, + "step": 11397 + }, + { + "epoch": 0.9379140094630735, + "grad_norm": 2.0941812986117148, + "learning_rate": 2.01253641449346e-07, + "loss": 0.6951, + "step": 11398 + }, + { + "epoch": 0.9379962970582185, + "grad_norm": 1.7770209868755895, + "learning_rate": 2.0072195804537454e-07, + "loss": 0.7085, + "step": 11399 + }, + { + "epoch": 0.9380785846533635, + "grad_norm": 1.7250751225624819, + "learning_rate": 2.001909707664429e-07, + "loss": 0.6991, + "step": 11400 + }, + { + "epoch": 0.9381608722485085, + "grad_norm": 2.47778530172642, + "learning_rate": 1.9966067965027201e-07, + "loss": 0.7032, + "step": 11401 + }, + { + "epoch": 0.9382431598436536, + "grad_norm": 2.591474233275943, + "learning_rate": 1.9913108473453291e-07, + "loss": 0.7001, + "step": 11402 + }, + { + "epoch": 0.9383254474387986, + "grad_norm": 2.6852361720914577, + "learning_rate": 1.9860218605684767e-07, + "loss": 0.6908, + "step": 11403 + }, + { + "epoch": 0.9384077350339436, + "grad_norm": 1.717124592125826, + "learning_rate": 1.9807398365478847e-07, + "loss": 0.709, + "step": 11404 + }, + { + "epoch": 0.9384900226290886, + "grad_norm": 1.9056280546500293, + "learning_rate": 1.9754647756587754e-07, + "loss": 0.7177, + "step": 11405 + }, + { + "epoch": 0.9385723102242337, + "grad_norm": 1.931852984263246, + "learning_rate": 1.9701966782758818e-07, + "loss": 0.7142, + "step": 11406 + }, + { + "epoch": 0.9386545978193788, + "grad_norm": 2.1079700132563577, + "learning_rate": 1.9649355447734386e-07, + "loss": 0.6977, + "step": 11407 + }, + { + "epoch": 0.9387368854145237, + "grad_norm": 1.8841808780658762, + "learning_rate": 1.9596813755251908e-07, + "loss": 0.7216, + "step": 11408 + }, + { + "epoch": 0.9388191730096688, + "grad_norm": 2.1412241632909046, + "learning_rate": 1.9544341709043735e-07, + "loss": 0.7087, + "step": 11409 + }, + { + "epoch": 0.9389014606048138, + "grad_norm": 0.40733110492069896, + "learning_rate": 1.949193931283766e-07, + "loss": 0.4724, + "step": 11410 + }, + { + "epoch": 0.9389837481999589, + "grad_norm": 1.858799266366392, + "learning_rate": 1.9439606570356151e-07, + "loss": 0.676, + "step": 11411 + }, + { + "epoch": 0.9390660357951038, + "grad_norm": 1.8383645320411992, + "learning_rate": 1.938734348531679e-07, + "loss": 0.7017, + "step": 11412 + }, + { + "epoch": 0.9391483233902489, + "grad_norm": 1.80908310186753, + "learning_rate": 1.9335150061432272e-07, + "loss": 0.7104, + "step": 11413 + }, + { + "epoch": 0.939230610985394, + "grad_norm": 0.42318775066799197, + "learning_rate": 1.9283026302410303e-07, + "loss": 0.4506, + "step": 11414 + }, + { + "epoch": 0.939312898580539, + "grad_norm": 1.8419036897891643, + "learning_rate": 1.9230972211953692e-07, + "loss": 0.7241, + "step": 11415 + }, + { + "epoch": 0.939395186175684, + "grad_norm": 1.6994095845411072, + "learning_rate": 1.9178987793760483e-07, + "loss": 0.6909, + "step": 11416 + }, + { + "epoch": 0.939477473770829, + "grad_norm": 1.6441601133648047, + "learning_rate": 1.9127073051523282e-07, + "loss": 0.6888, + "step": 11417 + }, + { + "epoch": 0.9395597613659741, + "grad_norm": 2.5756240817510805, + "learning_rate": 1.9075227988930134e-07, + "loss": 0.688, + "step": 11418 + }, + { + "epoch": 0.9396420489611191, + "grad_norm": 2.2707862842335427, + "learning_rate": 1.9023452609663983e-07, + "loss": 0.6844, + "step": 11419 + }, + { + "epoch": 0.9397243365562642, + "grad_norm": 2.253511816530023, + "learning_rate": 1.8971746917403e-07, + "loss": 0.7032, + "step": 11420 + }, + { + "epoch": 0.9398066241514091, + "grad_norm": 0.4078316640064775, + "learning_rate": 1.8920110915820132e-07, + "loss": 0.4626, + "step": 11421 + }, + { + "epoch": 0.9398889117465542, + "grad_norm": 1.8046097438722222, + "learning_rate": 1.8868544608583673e-07, + "loss": 0.6901, + "step": 11422 + }, + { + "epoch": 0.9399711993416993, + "grad_norm": 0.3956211659815121, + "learning_rate": 1.8817047999356686e-07, + "loss": 0.4395, + "step": 11423 + }, + { + "epoch": 0.9400534869368443, + "grad_norm": 1.7787927346687191, + "learning_rate": 1.876562109179747e-07, + "loss": 0.6973, + "step": 11424 + }, + { + "epoch": 0.9401357745319893, + "grad_norm": 1.9521775918582915, + "learning_rate": 1.8714263889559102e-07, + "loss": 0.7271, + "step": 11425 + }, + { + "epoch": 0.9402180621271343, + "grad_norm": 1.7017404819178301, + "learning_rate": 1.8662976396290222e-07, + "loss": 0.6832, + "step": 11426 + }, + { + "epoch": 0.9403003497222794, + "grad_norm": 1.9947183228438023, + "learning_rate": 1.8611758615634024e-07, + "loss": 0.711, + "step": 11427 + }, + { + "epoch": 0.9403826373174244, + "grad_norm": 1.961774452810668, + "learning_rate": 1.856061055122904e-07, + "loss": 0.6936, + "step": 11428 + }, + { + "epoch": 0.9404649249125694, + "grad_norm": 1.850509648823574, + "learning_rate": 1.8509532206708814e-07, + "loss": 0.7155, + "step": 11429 + }, + { + "epoch": 0.9405472125077144, + "grad_norm": 2.02546003876758, + "learning_rate": 1.845852358570166e-07, + "loss": 0.7196, + "step": 11430 + }, + { + "epoch": 0.9406295001028595, + "grad_norm": 1.7955828162703662, + "learning_rate": 1.8407584691831126e-07, + "loss": 0.6909, + "step": 11431 + }, + { + "epoch": 0.9407117876980046, + "grad_norm": 2.264181053349525, + "learning_rate": 1.8356715528716095e-07, + "loss": 0.7193, + "step": 11432 + }, + { + "epoch": 0.9407940752931495, + "grad_norm": 1.6858246942626276, + "learning_rate": 1.8305916099970012e-07, + "loss": 0.6956, + "step": 11433 + }, + { + "epoch": 0.9408763628882946, + "grad_norm": 1.990998827222498, + "learning_rate": 1.8255186409201765e-07, + "loss": 0.6867, + "step": 11434 + }, + { + "epoch": 0.9409586504834396, + "grad_norm": 1.8607282323083456, + "learning_rate": 1.8204526460014915e-07, + "loss": 0.707, + "step": 11435 + }, + { + "epoch": 0.9410409380785847, + "grad_norm": 0.426773201593063, + "learning_rate": 1.8153936256008475e-07, + "loss": 0.4617, + "step": 11436 + }, + { + "epoch": 0.9411232256737296, + "grad_norm": 2.1764715897075178, + "learning_rate": 1.8103415800776014e-07, + "loss": 0.7098, + "step": 11437 + }, + { + "epoch": 0.9412055132688747, + "grad_norm": 1.9269871137841608, + "learning_rate": 1.805296509790666e-07, + "loss": 0.7125, + "step": 11438 + }, + { + "epoch": 0.9412878008640198, + "grad_norm": 5.654116428163195, + "learning_rate": 1.8002584150984216e-07, + "loss": 0.6998, + "step": 11439 + }, + { + "epoch": 0.9413700884591648, + "grad_norm": 2.138279864011334, + "learning_rate": 1.7952272963587812e-07, + "loss": 0.703, + "step": 11440 + }, + { + "epoch": 0.9414523760543098, + "grad_norm": 1.9157165284355175, + "learning_rate": 1.7902031539291265e-07, + "loss": 0.7005, + "step": 11441 + }, + { + "epoch": 0.9415346636494548, + "grad_norm": 0.42801737642038784, + "learning_rate": 1.7851859881663935e-07, + "loss": 0.4782, + "step": 11442 + }, + { + "epoch": 0.9416169512445999, + "grad_norm": 1.8415207878143212, + "learning_rate": 1.780175799426964e-07, + "loss": 0.6938, + "step": 11443 + }, + { + "epoch": 0.9416992388397449, + "grad_norm": 2.1634689881265188, + "learning_rate": 1.7751725880667648e-07, + "loss": 0.7309, + "step": 11444 + }, + { + "epoch": 0.94178152643489, + "grad_norm": 0.4087220109079892, + "learning_rate": 1.770176354441222e-07, + "loss": 0.4766, + "step": 11445 + }, + { + "epoch": 0.9418638140300349, + "grad_norm": 2.1108388252197408, + "learning_rate": 1.7651870989052523e-07, + "loss": 0.7258, + "step": 11446 + }, + { + "epoch": 0.94194610162518, + "grad_norm": 4.07230459378246, + "learning_rate": 1.7602048218132938e-07, + "loss": 0.6792, + "step": 11447 + }, + { + "epoch": 0.9420283892203251, + "grad_norm": 2.2779853344947125, + "learning_rate": 1.755229523519275e-07, + "loss": 0.7211, + "step": 11448 + }, + { + "epoch": 0.9421106768154701, + "grad_norm": 1.8369683301772217, + "learning_rate": 1.7502612043766243e-07, + "loss": 0.7334, + "step": 11449 + }, + { + "epoch": 0.9421929644106151, + "grad_norm": 2.1217538880784677, + "learning_rate": 1.7452998647382924e-07, + "loss": 0.687, + "step": 11450 + }, + { + "epoch": 0.9422752520057601, + "grad_norm": 1.8749260040527125, + "learning_rate": 1.7403455049567197e-07, + "loss": 0.7303, + "step": 11451 + }, + { + "epoch": 0.9423575396009052, + "grad_norm": 5.704047257853909, + "learning_rate": 1.735398125383869e-07, + "loss": 0.7362, + "step": 11452 + }, + { + "epoch": 0.9424398271960502, + "grad_norm": 2.0927860054116203, + "learning_rate": 1.730457726371182e-07, + "loss": 0.6965, + "step": 11453 + }, + { + "epoch": 0.9425221147911952, + "grad_norm": 1.6296464553248395, + "learning_rate": 1.7255243082696104e-07, + "loss": 0.7182, + "step": 11454 + }, + { + "epoch": 0.9426044023863402, + "grad_norm": 1.8930080811164163, + "learning_rate": 1.72059787142963e-07, + "loss": 0.6853, + "step": 11455 + }, + { + "epoch": 0.9426866899814853, + "grad_norm": 0.4137599053469965, + "learning_rate": 1.7156784162012052e-07, + "loss": 0.4646, + "step": 11456 + }, + { + "epoch": 0.9427689775766304, + "grad_norm": 1.8309556005016736, + "learning_rate": 1.7107659429337897e-07, + "loss": 0.6897, + "step": 11457 + }, + { + "epoch": 0.9428512651717753, + "grad_norm": 2.549540483597432, + "learning_rate": 1.7058604519763933e-07, + "loss": 0.7079, + "step": 11458 + }, + { + "epoch": 0.9429335527669204, + "grad_norm": 1.7320762394266374, + "learning_rate": 1.7009619436774592e-07, + "loss": 0.695, + "step": 11459 + }, + { + "epoch": 0.9430158403620654, + "grad_norm": 2.2381015416340895, + "learning_rate": 1.6960704183849875e-07, + "loss": 0.6988, + "step": 11460 + }, + { + "epoch": 0.9430981279572105, + "grad_norm": 1.7007786895137416, + "learning_rate": 1.6911858764464438e-07, + "loss": 0.7411, + "step": 11461 + }, + { + "epoch": 0.9431804155523554, + "grad_norm": 2.8520103467847653, + "learning_rate": 1.6863083182088514e-07, + "loss": 0.7164, + "step": 11462 + }, + { + "epoch": 0.9432627031475005, + "grad_norm": 1.553451664681166, + "learning_rate": 1.681437744018688e-07, + "loss": 0.6981, + "step": 11463 + }, + { + "epoch": 0.9433449907426456, + "grad_norm": 0.41755962141435404, + "learning_rate": 1.676574154221944e-07, + "loss": 0.4561, + "step": 11464 + }, + { + "epoch": 0.9434272783377906, + "grad_norm": 2.1007501973899476, + "learning_rate": 1.6717175491641203e-07, + "loss": 0.7098, + "step": 11465 + }, + { + "epoch": 0.9435095659329356, + "grad_norm": 2.58561019408169, + "learning_rate": 1.6668679291902413e-07, + "loss": 0.7134, + "step": 11466 + }, + { + "epoch": 0.9435918535280806, + "grad_norm": 1.8883483263425582, + "learning_rate": 1.6620252946447868e-07, + "loss": 0.7061, + "step": 11467 + }, + { + "epoch": 0.9436741411232257, + "grad_norm": 1.8557139034139787, + "learning_rate": 1.657189645871804e-07, + "loss": 0.6889, + "step": 11468 + }, + { + "epoch": 0.9437564287183707, + "grad_norm": 2.009796698946195, + "learning_rate": 1.6523609832147846e-07, + "loss": 0.7008, + "step": 11469 + }, + { + "epoch": 0.9438387163135158, + "grad_norm": 15.822377475029754, + "learning_rate": 1.6475393070167657e-07, + "loss": 0.7069, + "step": 11470 + }, + { + "epoch": 0.9439210039086607, + "grad_norm": 2.3453735531529296, + "learning_rate": 1.6427246176202505e-07, + "loss": 0.7223, + "step": 11471 + }, + { + "epoch": 0.9440032915038058, + "grad_norm": 1.8581441805091503, + "learning_rate": 1.637916915367288e-07, + "loss": 0.7097, + "step": 11472 + }, + { + "epoch": 0.9440855790989509, + "grad_norm": 1.9273753512092282, + "learning_rate": 1.6331162005993938e-07, + "loss": 0.7381, + "step": 11473 + }, + { + "epoch": 0.9441678666940959, + "grad_norm": 0.4133832904903734, + "learning_rate": 1.6283224736576175e-07, + "loss": 0.4847, + "step": 11474 + }, + { + "epoch": 0.9442501542892409, + "grad_norm": 0.4247266759210233, + "learning_rate": 1.6235357348824865e-07, + "loss": 0.4499, + "step": 11475 + }, + { + "epoch": 0.9443324418843859, + "grad_norm": 1.580652207222016, + "learning_rate": 1.6187559846140511e-07, + "loss": 0.7128, + "step": 11476 + }, + { + "epoch": 0.944414729479531, + "grad_norm": 1.9643615398447567, + "learning_rate": 1.6139832231918396e-07, + "loss": 0.7264, + "step": 11477 + }, + { + "epoch": 0.944497017074676, + "grad_norm": 1.8292496288678581, + "learning_rate": 1.6092174509549364e-07, + "loss": 0.7105, + "step": 11478 + }, + { + "epoch": 0.944579304669821, + "grad_norm": 2.917266747653042, + "learning_rate": 1.6044586682418484e-07, + "loss": 0.7074, + "step": 11479 + }, + { + "epoch": 0.944661592264966, + "grad_norm": 1.934479749632032, + "learning_rate": 1.599706875390672e-07, + "loss": 0.7269, + "step": 11480 + }, + { + "epoch": 0.9447438798601111, + "grad_norm": 2.485069177854259, + "learning_rate": 1.594962072738948e-07, + "loss": 0.7174, + "step": 11481 + }, + { + "epoch": 0.9448261674552562, + "grad_norm": 1.6641846234062905, + "learning_rate": 1.5902242606237518e-07, + "loss": 0.7216, + "step": 11482 + }, + { + "epoch": 0.9449084550504011, + "grad_norm": 2.0477107102800467, + "learning_rate": 1.5854934393816247e-07, + "loss": 0.6975, + "step": 11483 + }, + { + "epoch": 0.9449907426455462, + "grad_norm": 0.4290012901750915, + "learning_rate": 1.5807696093486646e-07, + "loss": 0.4784, + "step": 11484 + }, + { + "epoch": 0.9450730302406912, + "grad_norm": 0.40077214411918965, + "learning_rate": 1.5760527708604146e-07, + "loss": 0.4671, + "step": 11485 + }, + { + "epoch": 0.9451553178358363, + "grad_norm": 2.182652740711563, + "learning_rate": 1.571342924251995e-07, + "loss": 0.7229, + "step": 11486 + }, + { + "epoch": 0.9452376054309812, + "grad_norm": 2.155747657455392, + "learning_rate": 1.5666400698579497e-07, + "loss": 0.7101, + "step": 11487 + }, + { + "epoch": 0.9453198930261263, + "grad_norm": 1.8381625821937697, + "learning_rate": 1.561944208012378e-07, + "loss": 0.7185, + "step": 11488 + }, + { + "epoch": 0.9454021806212713, + "grad_norm": 2.997964610939401, + "learning_rate": 1.5572553390488466e-07, + "loss": 0.7248, + "step": 11489 + }, + { + "epoch": 0.9454844682164164, + "grad_norm": 1.709863299236503, + "learning_rate": 1.5525734633004775e-07, + "loss": 0.7119, + "step": 11490 + }, + { + "epoch": 0.9455667558115614, + "grad_norm": 2.027876476705144, + "learning_rate": 1.5478985810998271e-07, + "loss": 0.6861, + "step": 11491 + }, + { + "epoch": 0.9456490434067064, + "grad_norm": 1.6656989710619703, + "learning_rate": 1.5432306927790297e-07, + "loss": 0.7209, + "step": 11492 + }, + { + "epoch": 0.9457313310018515, + "grad_norm": 2.602716429075312, + "learning_rate": 1.5385697986696536e-07, + "loss": 0.7352, + "step": 11493 + }, + { + "epoch": 0.9458136185969965, + "grad_norm": 1.609006414198999, + "learning_rate": 1.5339158991028224e-07, + "loss": 0.7038, + "step": 11494 + }, + { + "epoch": 0.9458959061921416, + "grad_norm": 2.7365437831459527, + "learning_rate": 1.5292689944091165e-07, + "loss": 0.7162, + "step": 11495 + }, + { + "epoch": 0.9459781937872865, + "grad_norm": 2.3027474247483255, + "learning_rate": 1.524629084918683e-07, + "loss": 0.6911, + "step": 11496 + }, + { + "epoch": 0.9460604813824316, + "grad_norm": 1.688188119796344, + "learning_rate": 1.5199961709610912e-07, + "loss": 0.7106, + "step": 11497 + }, + { + "epoch": 0.9461427689775767, + "grad_norm": 2.0689370125906144, + "learning_rate": 1.515370252865489e-07, + "loss": 0.6863, + "step": 11498 + }, + { + "epoch": 0.9462250565727217, + "grad_norm": 2.2840276091882146, + "learning_rate": 1.5107513309604695e-07, + "loss": 0.7425, + "step": 11499 + }, + { + "epoch": 0.9463073441678667, + "grad_norm": 1.5945843056607276, + "learning_rate": 1.5061394055741806e-07, + "loss": 0.701, + "step": 11500 + }, + { + "epoch": 0.9463896317630117, + "grad_norm": 1.7108106959569729, + "learning_rate": 1.5015344770342165e-07, + "loss": 0.6871, + "step": 11501 + }, + { + "epoch": 0.9464719193581568, + "grad_norm": 0.3940751205024042, + "learning_rate": 1.4969365456677265e-07, + "loss": 0.4555, + "step": 11502 + }, + { + "epoch": 0.9465542069533018, + "grad_norm": 1.800197174233794, + "learning_rate": 1.492345611801327e-07, + "loss": 0.7066, + "step": 11503 + }, + { + "epoch": 0.9466364945484468, + "grad_norm": 2.6582404323951665, + "learning_rate": 1.4877616757611568e-07, + "loss": 0.7394, + "step": 11504 + }, + { + "epoch": 0.9467187821435918, + "grad_norm": 0.40828740263316254, + "learning_rate": 1.4831847378728559e-07, + "loss": 0.469, + "step": 11505 + }, + { + "epoch": 0.9468010697387369, + "grad_norm": 1.9769678459024818, + "learning_rate": 1.4786147984615528e-07, + "loss": 0.6899, + "step": 11506 + }, + { + "epoch": 0.946883357333882, + "grad_norm": 2.2739275543524045, + "learning_rate": 1.4740518578518882e-07, + "loss": 0.7067, + "step": 11507 + }, + { + "epoch": 0.9469656449290269, + "grad_norm": 2.0721663288508925, + "learning_rate": 1.4694959163680133e-07, + "loss": 0.6827, + "step": 11508 + }, + { + "epoch": 0.947047932524172, + "grad_norm": 0.40434742676463936, + "learning_rate": 1.4649469743335699e-07, + "loss": 0.4492, + "step": 11509 + }, + { + "epoch": 0.947130220119317, + "grad_norm": 2.9101657029678623, + "learning_rate": 1.46040503207171e-07, + "loss": 0.7394, + "step": 11510 + }, + { + "epoch": 0.9472125077144621, + "grad_norm": 1.8224258062121477, + "learning_rate": 1.4558700899050981e-07, + "loss": 0.7064, + "step": 11511 + }, + { + "epoch": 0.947294795309607, + "grad_norm": 1.783033305451014, + "learning_rate": 1.4513421481558655e-07, + "loss": 0.7108, + "step": 11512 + }, + { + "epoch": 0.9473770829047521, + "grad_norm": 1.7749060016982576, + "learning_rate": 1.4468212071456767e-07, + "loss": 0.704, + "step": 11513 + }, + { + "epoch": 0.9474593704998971, + "grad_norm": 1.8609633359007078, + "learning_rate": 1.4423072671957082e-07, + "loss": 0.7068, + "step": 11514 + }, + { + "epoch": 0.9475416580950422, + "grad_norm": 2.7245438510377817, + "learning_rate": 1.437800328626604e-07, + "loss": 0.7166, + "step": 11515 + }, + { + "epoch": 0.9476239456901872, + "grad_norm": 2.1102470726521596, + "learning_rate": 1.4333003917585408e-07, + "loss": 0.6853, + "step": 11516 + }, + { + "epoch": 0.9477062332853322, + "grad_norm": 0.4268133065605587, + "learning_rate": 1.4288074569111853e-07, + "loss": 0.4939, + "step": 11517 + }, + { + "epoch": 0.9477885208804773, + "grad_norm": 1.8221334869770325, + "learning_rate": 1.4243215244037157e-07, + "loss": 0.7084, + "step": 11518 + }, + { + "epoch": 0.9478708084756223, + "grad_norm": 1.8592611079229429, + "learning_rate": 1.4198425945547767e-07, + "loss": 0.72, + "step": 11519 + }, + { + "epoch": 0.9479530960707674, + "grad_norm": 1.9241102372747847, + "learning_rate": 1.4153706676825807e-07, + "loss": 0.6864, + "step": 11520 + }, + { + "epoch": 0.9480353836659123, + "grad_norm": 1.901603688606735, + "learning_rate": 1.4109057441047735e-07, + "loss": 0.6924, + "step": 11521 + }, + { + "epoch": 0.9481176712610574, + "grad_norm": 1.6522960722532976, + "learning_rate": 1.406447824138568e-07, + "loss": 0.6724, + "step": 11522 + }, + { + "epoch": 0.9481999588562025, + "grad_norm": 2.275901630622711, + "learning_rate": 1.4019969081006225e-07, + "loss": 0.7113, + "step": 11523 + }, + { + "epoch": 0.9482822464513475, + "grad_norm": 2.207451871247362, + "learning_rate": 1.3975529963071388e-07, + "loss": 0.7149, + "step": 11524 + }, + { + "epoch": 0.9483645340464925, + "grad_norm": 0.4133964925425684, + "learning_rate": 1.393116089073787e-07, + "loss": 0.4539, + "step": 11525 + }, + { + "epoch": 0.9484468216416375, + "grad_norm": 0.4102042704098497, + "learning_rate": 1.388686186715782e-07, + "loss": 0.4567, + "step": 11526 + }, + { + "epoch": 0.9485291092367826, + "grad_norm": 1.9852584582039745, + "learning_rate": 1.3842632895477937e-07, + "loss": 0.7236, + "step": 11527 + }, + { + "epoch": 0.9486113968319276, + "grad_norm": 1.9287726214265757, + "learning_rate": 1.3798473978840488e-07, + "loss": 0.7033, + "step": 11528 + }, + { + "epoch": 0.9486936844270726, + "grad_norm": 2.065403131162283, + "learning_rate": 1.3754385120382074e-07, + "loss": 0.6802, + "step": 11529 + }, + { + "epoch": 0.9487759720222176, + "grad_norm": 1.9577078821217682, + "learning_rate": 1.371036632323497e-07, + "loss": 0.6774, + "step": 11530 + }, + { + "epoch": 0.9488582596173627, + "grad_norm": 1.8258315397535652, + "learning_rate": 1.3666417590526004e-07, + "loss": 0.6995, + "step": 11531 + }, + { + "epoch": 0.9489405472125078, + "grad_norm": 1.5696687042443118, + "learning_rate": 1.3622538925377348e-07, + "loss": 0.7179, + "step": 11532 + }, + { + "epoch": 0.9490228348076527, + "grad_norm": 1.7303261191839663, + "learning_rate": 1.3578730330906177e-07, + "loss": 0.6962, + "step": 11533 + }, + { + "epoch": 0.9491051224027978, + "grad_norm": 0.43345354606984454, + "learning_rate": 1.353499181022433e-07, + "loss": 0.4776, + "step": 11534 + }, + { + "epoch": 0.9491874099979428, + "grad_norm": 2.3114946582655627, + "learning_rate": 1.3491323366438992e-07, + "loss": 0.7129, + "step": 11535 + }, + { + "epoch": 0.9492696975930879, + "grad_norm": 4.362405937048436, + "learning_rate": 1.3447725002652457e-07, + "loss": 0.7171, + "step": 11536 + }, + { + "epoch": 0.9493519851882328, + "grad_norm": 1.8788732086750273, + "learning_rate": 1.3404196721961803e-07, + "loss": 0.6957, + "step": 11537 + }, + { + "epoch": 0.9494342727833779, + "grad_norm": 2.0975432599563235, + "learning_rate": 1.3360738527459117e-07, + "loss": 0.696, + "step": 11538 + }, + { + "epoch": 0.949516560378523, + "grad_norm": 1.7853172320282986, + "learning_rate": 1.33173504222317e-07, + "loss": 0.6968, + "step": 11539 + }, + { + "epoch": 0.949598847973668, + "grad_norm": 0.4153409605911696, + "learning_rate": 1.327403240936176e-07, + "loss": 0.4732, + "step": 11540 + }, + { + "epoch": 0.949681135568813, + "grad_norm": 2.182978935824249, + "learning_rate": 1.3230784491926496e-07, + "loss": 0.725, + "step": 11541 + }, + { + "epoch": 0.949763423163958, + "grad_norm": 1.893521408969188, + "learning_rate": 1.3187606672998232e-07, + "loss": 0.7284, + "step": 11542 + }, + { + "epoch": 0.9498457107591031, + "grad_norm": 2.0691266263196137, + "learning_rate": 1.3144498955644181e-07, + "loss": 0.7212, + "step": 11543 + }, + { + "epoch": 0.9499279983542481, + "grad_norm": 1.9501729190395345, + "learning_rate": 1.310146134292667e-07, + "loss": 0.6869, + "step": 11544 + }, + { + "epoch": 0.9500102859493931, + "grad_norm": 2.206128908084436, + "learning_rate": 1.305849383790303e-07, + "loss": 0.7247, + "step": 11545 + }, + { + "epoch": 0.9500925735445381, + "grad_norm": 0.39867786670459876, + "learning_rate": 1.301559644362571e-07, + "loss": 0.4535, + "step": 11546 + }, + { + "epoch": 0.9501748611396832, + "grad_norm": 0.4187566412760843, + "learning_rate": 1.297276916314183e-07, + "loss": 0.4686, + "step": 11547 + }, + { + "epoch": 0.9502571487348282, + "grad_norm": 2.252487126173627, + "learning_rate": 1.2930011999493952e-07, + "loss": 0.7258, + "step": 11548 + }, + { + "epoch": 0.9503394363299733, + "grad_norm": 2.1708546350975264, + "learning_rate": 1.2887324955719426e-07, + "loss": 0.6936, + "step": 11549 + }, + { + "epoch": 0.9504217239251183, + "grad_norm": 2.0689422344546546, + "learning_rate": 1.2844708034850716e-07, + "loss": 0.7418, + "step": 11550 + }, + { + "epoch": 0.9505040115202633, + "grad_norm": 2.1278941653909316, + "learning_rate": 1.280216123991529e-07, + "loss": 0.7041, + "step": 11551 + }, + { + "epoch": 0.9505862991154084, + "grad_norm": 1.9382734214585597, + "learning_rate": 1.2759684573935504e-07, + "loss": 0.7026, + "step": 11552 + }, + { + "epoch": 0.9506685867105534, + "grad_norm": 2.3082756259043116, + "learning_rate": 1.2717278039928839e-07, + "loss": 0.7064, + "step": 11553 + }, + { + "epoch": 0.9507508743056984, + "grad_norm": 1.8002721373792592, + "learning_rate": 1.2674941640907768e-07, + "loss": 0.7206, + "step": 11554 + }, + { + "epoch": 0.9508331619008434, + "grad_norm": 1.9374577356865184, + "learning_rate": 1.263267537987989e-07, + "loss": 0.7155, + "step": 11555 + }, + { + "epoch": 0.9509154494959885, + "grad_norm": 1.8730908838854354, + "learning_rate": 1.2590479259847687e-07, + "loss": 0.7282, + "step": 11556 + }, + { + "epoch": 0.9509977370911336, + "grad_norm": 2.3193232440775757, + "learning_rate": 1.2548353283808768e-07, + "loss": 0.7143, + "step": 11557 + }, + { + "epoch": 0.9510800246862785, + "grad_norm": 1.8276148386345077, + "learning_rate": 1.2506297454755621e-07, + "loss": 0.6877, + "step": 11558 + }, + { + "epoch": 0.9511623122814236, + "grad_norm": 1.7902097796199559, + "learning_rate": 1.246431177567575e-07, + "loss": 0.7121, + "step": 11559 + }, + { + "epoch": 0.9512445998765686, + "grad_norm": 2.0097663168055857, + "learning_rate": 1.2422396249551884e-07, + "loss": 0.7274, + "step": 11560 + }, + { + "epoch": 0.9513268874717137, + "grad_norm": 2.5025723378458813, + "learning_rate": 1.238055087936163e-07, + "loss": 0.7125, + "step": 11561 + }, + { + "epoch": 0.9514091750668586, + "grad_norm": 1.7818920217065677, + "learning_rate": 1.2338775668077508e-07, + "loss": 0.7072, + "step": 11562 + }, + { + "epoch": 0.9514914626620037, + "grad_norm": 1.9797305461892127, + "learning_rate": 1.229707061866736e-07, + "loss": 0.681, + "step": 11563 + }, + { + "epoch": 0.9515737502571487, + "grad_norm": 1.7326988395629033, + "learning_rate": 1.2255435734093603e-07, + "loss": 0.7135, + "step": 11564 + }, + { + "epoch": 0.9516560378522938, + "grad_norm": 2.4030865015567175, + "learning_rate": 1.2213871017314083e-07, + "loss": 0.6844, + "step": 11565 + }, + { + "epoch": 0.9517383254474387, + "grad_norm": 2.0024151001114574, + "learning_rate": 1.2172376471281444e-07, + "loss": 0.7369, + "step": 11566 + }, + { + "epoch": 0.9518206130425838, + "grad_norm": 2.132163987245121, + "learning_rate": 1.2130952098943215e-07, + "loss": 0.7307, + "step": 11567 + }, + { + "epoch": 0.9519029006377289, + "grad_norm": 2.1610644115412296, + "learning_rate": 1.208959790324249e-07, + "loss": 0.6965, + "step": 11568 + }, + { + "epoch": 0.9519851882328739, + "grad_norm": 1.903217529707663, + "learning_rate": 1.2048313887116802e-07, + "loss": 0.6952, + "step": 11569 + }, + { + "epoch": 0.9520674758280189, + "grad_norm": 2.2796998270482614, + "learning_rate": 1.200710005349881e-07, + "loss": 0.7265, + "step": 11570 + }, + { + "epoch": 0.9521497634231639, + "grad_norm": 1.6344755868874223, + "learning_rate": 1.1965956405316392e-07, + "loss": 0.6933, + "step": 11571 + }, + { + "epoch": 0.952232051018309, + "grad_norm": 0.4163973706766798, + "learning_rate": 1.192488294549232e-07, + "loss": 0.4559, + "step": 11572 + }, + { + "epoch": 0.952314338613454, + "grad_norm": 1.785156827173986, + "learning_rate": 1.1883879676944265e-07, + "loss": 0.7089, + "step": 11573 + }, + { + "epoch": 0.9523966262085991, + "grad_norm": 2.0001634136016655, + "learning_rate": 1.1842946602585226e-07, + "loss": 0.7123, + "step": 11574 + }, + { + "epoch": 0.952478913803744, + "grad_norm": 1.8796086244511145, + "learning_rate": 1.1802083725322988e-07, + "loss": 0.7167, + "step": 11575 + }, + { + "epoch": 0.9525612013988891, + "grad_norm": 1.4980195726241914, + "learning_rate": 1.176129104806023e-07, + "loss": 0.6508, + "step": 11576 + }, + { + "epoch": 0.9526434889940342, + "grad_norm": 0.3984355237349968, + "learning_rate": 1.1720568573694968e-07, + "loss": 0.4522, + "step": 11577 + }, + { + "epoch": 0.9527257765891792, + "grad_norm": 1.883813058069627, + "learning_rate": 1.1679916305119999e-07, + "loss": 0.7215, + "step": 11578 + }, + { + "epoch": 0.9528080641843242, + "grad_norm": 2.007299658205894, + "learning_rate": 1.1639334245223122e-07, + "loss": 0.702, + "step": 11579 + }, + { + "epoch": 0.9528903517794692, + "grad_norm": 0.4378217018263401, + "learning_rate": 1.1598822396887365e-07, + "loss": 0.4926, + "step": 11580 + }, + { + "epoch": 0.9529726393746143, + "grad_norm": 2.0849600372938952, + "learning_rate": 1.155838076299054e-07, + "loss": 0.7234, + "step": 11581 + }, + { + "epoch": 0.9530549269697594, + "grad_norm": 1.822827789496078, + "learning_rate": 1.1518009346405568e-07, + "loss": 0.7215, + "step": 11582 + }, + { + "epoch": 0.9531372145649043, + "grad_norm": 1.7373805836308591, + "learning_rate": 1.1477708150000377e-07, + "loss": 0.7246, + "step": 11583 + }, + { + "epoch": 0.9532195021600494, + "grad_norm": 2.1723321004860012, + "learning_rate": 1.1437477176637791e-07, + "loss": 0.7346, + "step": 11584 + }, + { + "epoch": 0.9533017897551944, + "grad_norm": 1.8847749485068492, + "learning_rate": 1.1397316429175964e-07, + "loss": 0.7069, + "step": 11585 + }, + { + "epoch": 0.9533840773503395, + "grad_norm": 0.42818175648459833, + "learning_rate": 1.1357225910467729e-07, + "loss": 0.4907, + "step": 11586 + }, + { + "epoch": 0.9534663649454844, + "grad_norm": 0.40445319698883203, + "learning_rate": 1.1317205623361138e-07, + "loss": 0.4727, + "step": 11587 + }, + { + "epoch": 0.9535486525406295, + "grad_norm": 1.9022368563117553, + "learning_rate": 1.1277255570698919e-07, + "loss": 0.7002, + "step": 11588 + }, + { + "epoch": 0.9536309401357745, + "grad_norm": 2.1161676563973026, + "learning_rate": 1.1237375755319357e-07, + "loss": 0.7083, + "step": 11589 + }, + { + "epoch": 0.9537132277309196, + "grad_norm": 2.7301930805764973, + "learning_rate": 1.1197566180055187e-07, + "loss": 0.6724, + "step": 11590 + }, + { + "epoch": 0.9537955153260645, + "grad_norm": 0.4129562478052801, + "learning_rate": 1.1157826847734698e-07, + "loss": 0.4607, + "step": 11591 + }, + { + "epoch": 0.9538778029212096, + "grad_norm": 1.9350515894455849, + "learning_rate": 1.1118157761180747e-07, + "loss": 0.7092, + "step": 11592 + }, + { + "epoch": 0.9539600905163547, + "grad_norm": 1.9170810379140866, + "learning_rate": 1.1078558923211413e-07, + "loss": 0.7056, + "step": 11593 + }, + { + "epoch": 0.9540423781114997, + "grad_norm": 1.8813539139342723, + "learning_rate": 1.1039030336639667e-07, + "loss": 0.7315, + "step": 11594 + }, + { + "epoch": 0.9541246657066447, + "grad_norm": 1.9114136595017088, + "learning_rate": 1.0999572004273595e-07, + "loss": 0.6943, + "step": 11595 + }, + { + "epoch": 0.9542069533017897, + "grad_norm": 2.778816344312026, + "learning_rate": 1.0960183928916179e-07, + "loss": 0.6834, + "step": 11596 + }, + { + "epoch": 0.9542892408969348, + "grad_norm": 2.417401916426073, + "learning_rate": 1.0920866113365736e-07, + "loss": 0.7131, + "step": 11597 + }, + { + "epoch": 0.9543715284920798, + "grad_norm": 0.4134192735660469, + "learning_rate": 1.0881618560415031e-07, + "loss": 0.4845, + "step": 11598 + }, + { + "epoch": 0.9544538160872249, + "grad_norm": 1.651179274785669, + "learning_rate": 1.0842441272852388e-07, + "loss": 0.6849, + "step": 11599 + }, + { + "epoch": 0.9545361036823699, + "grad_norm": 1.8386512519158844, + "learning_rate": 1.0803334253460696e-07, + "loss": 0.6952, + "step": 11600 + }, + { + "epoch": 0.9546183912775149, + "grad_norm": 1.7604171148832655, + "learning_rate": 1.0764297505018173e-07, + "loss": 0.7018, + "step": 11601 + }, + { + "epoch": 0.95470067887266, + "grad_norm": 2.242030247969173, + "learning_rate": 1.0725331030297936e-07, + "loss": 0.7117, + "step": 11602 + }, + { + "epoch": 0.954782966467805, + "grad_norm": 2.1942326101966114, + "learning_rate": 1.0686434832068104e-07, + "loss": 0.7229, + "step": 11603 + }, + { + "epoch": 0.95486525406295, + "grad_norm": 2.1270250240501, + "learning_rate": 1.064760891309169e-07, + "loss": 0.7122, + "step": 11604 + }, + { + "epoch": 0.954947541658095, + "grad_norm": 2.142141134831101, + "learning_rate": 1.0608853276126929e-07, + "loss": 0.7187, + "step": 11605 + }, + { + "epoch": 0.9550298292532401, + "grad_norm": 1.915808019664214, + "learning_rate": 1.0570167923926955e-07, + "loss": 0.7032, + "step": 11606 + }, + { + "epoch": 0.9551121168483852, + "grad_norm": 2.1646957188902527, + "learning_rate": 1.0531552859239791e-07, + "loss": 0.7111, + "step": 11607 + }, + { + "epoch": 0.9551944044435301, + "grad_norm": 1.7591263200333416, + "learning_rate": 1.0493008084808798e-07, + "loss": 0.7044, + "step": 11608 + }, + { + "epoch": 0.9552766920386752, + "grad_norm": 1.8999689986450696, + "learning_rate": 1.0454533603372119e-07, + "loss": 0.7071, + "step": 11609 + }, + { + "epoch": 0.9553589796338202, + "grad_norm": 1.8683377534091872, + "learning_rate": 1.0416129417662679e-07, + "loss": 0.7097, + "step": 11610 + }, + { + "epoch": 0.9554412672289653, + "grad_norm": 3.6460288027983445, + "learning_rate": 1.0377795530408852e-07, + "loss": 0.7003, + "step": 11611 + }, + { + "epoch": 0.9555235548241102, + "grad_norm": 2.168497076742402, + "learning_rate": 1.033953194433368e-07, + "loss": 0.7134, + "step": 11612 + }, + { + "epoch": 0.9556058424192553, + "grad_norm": 2.1225754068270533, + "learning_rate": 1.0301338662155547e-07, + "loss": 0.704, + "step": 11613 + }, + { + "epoch": 0.9556881300144003, + "grad_norm": 1.948807031929045, + "learning_rate": 1.0263215686587391e-07, + "loss": 0.7194, + "step": 11614 + }, + { + "epoch": 0.9557704176095454, + "grad_norm": 2.0549523043101576, + "learning_rate": 1.0225163020337714e-07, + "loss": 0.7091, + "step": 11615 + }, + { + "epoch": 0.9558527052046903, + "grad_norm": 2.302916921733569, + "learning_rate": 1.0187180666109353e-07, + "loss": 0.726, + "step": 11616 + }, + { + "epoch": 0.9559349927998354, + "grad_norm": 2.1268902096654716, + "learning_rate": 1.0149268626600927e-07, + "loss": 0.7064, + "step": 11617 + }, + { + "epoch": 0.9560172803949805, + "grad_norm": 2.0007022305797912, + "learning_rate": 1.0111426904505172e-07, + "loss": 0.6976, + "step": 11618 + }, + { + "epoch": 0.9560995679901255, + "grad_norm": 1.9306870306233175, + "learning_rate": 1.0073655502510715e-07, + "loss": 0.685, + "step": 11619 + }, + { + "epoch": 0.9561818555852705, + "grad_norm": 2.011150343132918, + "learning_rate": 1.0035954423300632e-07, + "loss": 0.7075, + "step": 11620 + }, + { + "epoch": 0.9562641431804155, + "grad_norm": 1.6333426610547621, + "learning_rate": 9.998323669553111e-08, + "loss": 0.686, + "step": 11621 + }, + { + "epoch": 0.9563464307755606, + "grad_norm": 0.4218894191498942, + "learning_rate": 9.960763243941351e-08, + "loss": 0.4731, + "step": 11622 + }, + { + "epoch": 0.9564287183707056, + "grad_norm": 1.863653914632319, + "learning_rate": 9.923273149133772e-08, + "loss": 0.7162, + "step": 11623 + }, + { + "epoch": 0.9565110059658507, + "grad_norm": 3.209873398978489, + "learning_rate": 9.885853387793354e-08, + "loss": 0.704, + "step": 11624 + }, + { + "epoch": 0.9565932935609957, + "grad_norm": 2.1281842478198394, + "learning_rate": 9.848503962578415e-08, + "loss": 0.7008, + "step": 11625 + }, + { + "epoch": 0.9566755811561407, + "grad_norm": 1.94618795178726, + "learning_rate": 9.81122487614239e-08, + "loss": 0.6888, + "step": 11626 + }, + { + "epoch": 0.9567578687512858, + "grad_norm": 1.888382054606733, + "learning_rate": 9.774016131133268e-08, + "loss": 0.7121, + "step": 11627 + }, + { + "epoch": 0.9568401563464308, + "grad_norm": 2.1594371777515717, + "learning_rate": 9.736877730194383e-08, + "loss": 0.6952, + "step": 11628 + }, + { + "epoch": 0.9569224439415758, + "grad_norm": 8.046247814910865, + "learning_rate": 9.699809675964066e-08, + "loss": 0.7208, + "step": 11629 + }, + { + "epoch": 0.9570047315367208, + "grad_norm": 1.9824364195663768, + "learning_rate": 9.662811971075548e-08, + "loss": 0.7224, + "step": 11630 + }, + { + "epoch": 0.9570870191318659, + "grad_norm": 1.8630243120127372, + "learning_rate": 9.625884618157055e-08, + "loss": 0.7136, + "step": 11631 + }, + { + "epoch": 0.957169306727011, + "grad_norm": 0.4094134769003833, + "learning_rate": 9.589027619831826e-08, + "loss": 0.4603, + "step": 11632 + }, + { + "epoch": 0.9572515943221559, + "grad_norm": 1.782448761159508, + "learning_rate": 9.552240978718097e-08, + "loss": 0.7425, + "step": 11633 + }, + { + "epoch": 0.957333881917301, + "grad_norm": 2.230977997789283, + "learning_rate": 9.515524697429224e-08, + "loss": 0.7066, + "step": 11634 + }, + { + "epoch": 0.957416169512446, + "grad_norm": 2.1035827453810927, + "learning_rate": 9.478878778573452e-08, + "loss": 0.6882, + "step": 11635 + }, + { + "epoch": 0.9574984571075911, + "grad_norm": 2.5992261449466336, + "learning_rate": 9.442303224753923e-08, + "loss": 0.7457, + "step": 11636 + }, + { + "epoch": 0.957580744702736, + "grad_norm": 3.262542955508255, + "learning_rate": 9.405798038569113e-08, + "loss": 0.7322, + "step": 11637 + }, + { + "epoch": 0.9576630322978811, + "grad_norm": 2.1168495937202825, + "learning_rate": 9.369363222612282e-08, + "loss": 0.7034, + "step": 11638 + }, + { + "epoch": 0.9577453198930261, + "grad_norm": 4.472543739550214, + "learning_rate": 9.33299877947147e-08, + "loss": 0.6989, + "step": 11639 + }, + { + "epoch": 0.9578276074881712, + "grad_norm": 1.6043840994711314, + "learning_rate": 9.296704711730054e-08, + "loss": 0.7066, + "step": 11640 + }, + { + "epoch": 0.9579098950833161, + "grad_norm": 1.9190959588541794, + "learning_rate": 9.260481021966416e-08, + "loss": 0.6734, + "step": 11641 + }, + { + "epoch": 0.9579921826784612, + "grad_norm": 2.077459243886354, + "learning_rate": 9.224327712753722e-08, + "loss": 0.7194, + "step": 11642 + }, + { + "epoch": 0.9580744702736063, + "grad_norm": 1.9570827014931624, + "learning_rate": 9.18824478666025e-08, + "loss": 0.7066, + "step": 11643 + }, + { + "epoch": 0.9581567578687513, + "grad_norm": 0.4132962241150539, + "learning_rate": 9.152232246249393e-08, + "loss": 0.4697, + "step": 11644 + }, + { + "epoch": 0.9582390454638963, + "grad_norm": 1.89074726795155, + "learning_rate": 9.116290094079328e-08, + "loss": 0.7204, + "step": 11645 + }, + { + "epoch": 0.9583213330590413, + "grad_norm": 1.8576626830301557, + "learning_rate": 9.080418332703234e-08, + "loss": 0.7131, + "step": 11646 + }, + { + "epoch": 0.9584036206541864, + "grad_norm": 2.268254407355867, + "learning_rate": 9.044616964669517e-08, + "loss": 0.7452, + "step": 11647 + }, + { + "epoch": 0.9584859082493314, + "grad_norm": 1.6394238193586526, + "learning_rate": 9.008885992521365e-08, + "loss": 0.6892, + "step": 11648 + }, + { + "epoch": 0.9585681958444765, + "grad_norm": 2.6170653280768654, + "learning_rate": 8.973225418797193e-08, + "loss": 0.7449, + "step": 11649 + }, + { + "epoch": 0.9586504834396214, + "grad_norm": 1.619924516147844, + "learning_rate": 8.937635246030196e-08, + "loss": 0.7024, + "step": 11650 + }, + { + "epoch": 0.9587327710347665, + "grad_norm": 0.4198177206982011, + "learning_rate": 8.902115476748574e-08, + "loss": 0.4779, + "step": 11651 + }, + { + "epoch": 0.9588150586299116, + "grad_norm": 2.715844359285104, + "learning_rate": 8.86666611347553e-08, + "loss": 0.7075, + "step": 11652 + }, + { + "epoch": 0.9588973462250566, + "grad_norm": 1.9120558621594799, + "learning_rate": 8.831287158729606e-08, + "loss": 0.7058, + "step": 11653 + }, + { + "epoch": 0.9589796338202016, + "grad_norm": 2.355796979039373, + "learning_rate": 8.795978615023792e-08, + "loss": 0.7017, + "step": 11654 + }, + { + "epoch": 0.9590619214153466, + "grad_norm": 2.2192124181419985, + "learning_rate": 8.760740484866414e-08, + "loss": 0.7277, + "step": 11655 + }, + { + "epoch": 0.9591442090104917, + "grad_norm": 1.9212939148189292, + "learning_rate": 8.725572770760915e-08, + "loss": 0.6928, + "step": 11656 + }, + { + "epoch": 0.9592264966056367, + "grad_norm": 2.1286949267827655, + "learning_rate": 8.690475475205406e-08, + "loss": 0.6977, + "step": 11657 + }, + { + "epoch": 0.9593087842007817, + "grad_norm": 2.141770128679849, + "learning_rate": 8.655448600693118e-08, + "loss": 0.7244, + "step": 11658 + }, + { + "epoch": 0.9593910717959268, + "grad_norm": 2.932135383243813, + "learning_rate": 8.620492149712278e-08, + "loss": 0.717, + "step": 11659 + }, + { + "epoch": 0.9594733593910718, + "grad_norm": 0.4072120037135009, + "learning_rate": 8.585606124746238e-08, + "loss": 0.4587, + "step": 11660 + }, + { + "epoch": 0.9595556469862169, + "grad_norm": 1.859607507680437, + "learning_rate": 8.550790528273235e-08, + "loss": 0.708, + "step": 11661 + }, + { + "epoch": 0.9596379345813618, + "grad_norm": 3.3827672694356536, + "learning_rate": 8.516045362766512e-08, + "loss": 0.7121, + "step": 11662 + }, + { + "epoch": 0.9597202221765069, + "grad_norm": 0.40433909117117756, + "learning_rate": 8.481370630694208e-08, + "loss": 0.4516, + "step": 11663 + }, + { + "epoch": 0.9598025097716519, + "grad_norm": 3.99315989759503, + "learning_rate": 8.446766334519685e-08, + "loss": 0.695, + "step": 11664 + }, + { + "epoch": 0.959884797366797, + "grad_norm": 3.7560436914619886, + "learning_rate": 8.412232476701087e-08, + "loss": 0.7137, + "step": 11665 + }, + { + "epoch": 0.9599670849619419, + "grad_norm": 2.087605776678687, + "learning_rate": 8.377769059691676e-08, + "loss": 0.7232, + "step": 11666 + }, + { + "epoch": 0.960049372557087, + "grad_norm": 1.8614056223821298, + "learning_rate": 8.343376085939825e-08, + "loss": 0.6918, + "step": 11667 + }, + { + "epoch": 0.9601316601522321, + "grad_norm": 2.7212352094747465, + "learning_rate": 8.309053557888469e-08, + "loss": 0.6801, + "step": 11668 + }, + { + "epoch": 0.9602139477473771, + "grad_norm": 2.007147652684613, + "learning_rate": 8.274801477975992e-08, + "loss": 0.7052, + "step": 11669 + }, + { + "epoch": 0.9602962353425221, + "grad_norm": 1.848790057358446, + "learning_rate": 8.240619848635666e-08, + "loss": 0.6956, + "step": 11670 + }, + { + "epoch": 0.9603785229376671, + "grad_norm": 2.094957919089807, + "learning_rate": 8.206508672295555e-08, + "loss": 0.7221, + "step": 11671 + }, + { + "epoch": 0.9604608105328122, + "grad_norm": 2.1971721500813586, + "learning_rate": 8.172467951378827e-08, + "loss": 0.7191, + "step": 11672 + }, + { + "epoch": 0.9605430981279572, + "grad_norm": 1.7601301194161265, + "learning_rate": 8.138497688303992e-08, + "loss": 0.7051, + "step": 11673 + }, + { + "epoch": 0.9606253857231022, + "grad_norm": 2.0303908729111537, + "learning_rate": 8.1045978854839e-08, + "loss": 0.7169, + "step": 11674 + }, + { + "epoch": 0.9607076733182472, + "grad_norm": 1.8016256847495769, + "learning_rate": 8.070768545326957e-08, + "loss": 0.7291, + "step": 11675 + }, + { + "epoch": 0.9607899609133923, + "grad_norm": 1.566239675356478, + "learning_rate": 8.037009670236128e-08, + "loss": 0.6977, + "step": 11676 + }, + { + "epoch": 0.9608722485085374, + "grad_norm": 1.7735424919798726, + "learning_rate": 8.003321262609831e-08, + "loss": 0.725, + "step": 11677 + }, + { + "epoch": 0.9609545361036824, + "grad_norm": 1.88863631145464, + "learning_rate": 7.969703324841039e-08, + "loss": 0.7035, + "step": 11678 + }, + { + "epoch": 0.9610368236988274, + "grad_norm": 1.686756216200601, + "learning_rate": 7.936155859318173e-08, + "loss": 0.6983, + "step": 11679 + }, + { + "epoch": 0.9611191112939724, + "grad_norm": 2.084389978078032, + "learning_rate": 7.902678868424107e-08, + "loss": 0.7194, + "step": 11680 + }, + { + "epoch": 0.9612013988891175, + "grad_norm": 1.6669093235196493, + "learning_rate": 7.869272354537049e-08, + "loss": 0.7013, + "step": 11681 + }, + { + "epoch": 0.9612836864842625, + "grad_norm": 2.1488957078172293, + "learning_rate": 7.835936320030324e-08, + "loss": 0.7275, + "step": 11682 + }, + { + "epoch": 0.9613659740794075, + "grad_norm": 2.109602343900026, + "learning_rate": 7.802670767271925e-08, + "loss": 0.6779, + "step": 11683 + }, + { + "epoch": 0.9614482616745526, + "grad_norm": 2.2070666238853045, + "learning_rate": 7.769475698624962e-08, + "loss": 0.7008, + "step": 11684 + }, + { + "epoch": 0.9615305492696976, + "grad_norm": 2.009774277118629, + "learning_rate": 7.73635111644766e-08, + "loss": 0.7058, + "step": 11685 + }, + { + "epoch": 0.9616128368648427, + "grad_norm": 1.9004349539135006, + "learning_rate": 7.703297023093025e-08, + "loss": 0.6702, + "step": 11686 + }, + { + "epoch": 0.9616951244599876, + "grad_norm": 0.3928543015485921, + "learning_rate": 7.670313420909181e-08, + "loss": 0.4464, + "step": 11687 + }, + { + "epoch": 0.9617774120551327, + "grad_norm": 3.162796716368214, + "learning_rate": 7.637400312239363e-08, + "loss": 0.6844, + "step": 11688 + }, + { + "epoch": 0.9618596996502777, + "grad_norm": 1.7902744276678817, + "learning_rate": 7.604557699421588e-08, + "loss": 0.6964, + "step": 11689 + }, + { + "epoch": 0.9619419872454228, + "grad_norm": 1.880695061715598, + "learning_rate": 7.571785584788883e-08, + "loss": 0.7012, + "step": 11690 + }, + { + "epoch": 0.9620242748405677, + "grad_norm": 1.988484067598492, + "learning_rate": 7.539083970669492e-08, + "loss": 0.7369, + "step": 11691 + }, + { + "epoch": 0.9621065624357128, + "grad_norm": 2.5494081105020414, + "learning_rate": 7.506452859386226e-08, + "loss": 0.7133, + "step": 11692 + }, + { + "epoch": 0.9621888500308579, + "grad_norm": 2.1705658211993484, + "learning_rate": 7.473892253257342e-08, + "loss": 0.7283, + "step": 11693 + }, + { + "epoch": 0.9622711376260029, + "grad_norm": 2.124873463160807, + "learning_rate": 7.441402154595767e-08, + "loss": 0.7329, + "step": 11694 + }, + { + "epoch": 0.9623534252211479, + "grad_norm": 2.061147455600449, + "learning_rate": 7.408982565709655e-08, + "loss": 0.6742, + "step": 11695 + }, + { + "epoch": 0.9624357128162929, + "grad_norm": 1.9209808830415591, + "learning_rate": 7.376633488902164e-08, + "loss": 0.6863, + "step": 11696 + }, + { + "epoch": 0.962518000411438, + "grad_norm": 2.746565429060613, + "learning_rate": 7.344354926471009e-08, + "loss": 0.7044, + "step": 11697 + }, + { + "epoch": 0.962600288006583, + "grad_norm": 1.9423149814804594, + "learning_rate": 7.312146880709358e-08, + "loss": 0.6978, + "step": 11698 + }, + { + "epoch": 0.962682575601728, + "grad_norm": 1.9277050926961459, + "learning_rate": 7.28000935390527e-08, + "loss": 0.6817, + "step": 11699 + }, + { + "epoch": 0.962764863196873, + "grad_norm": 1.9916152038801933, + "learning_rate": 7.247942348341697e-08, + "loss": 0.7024, + "step": 11700 + }, + { + "epoch": 0.9628471507920181, + "grad_norm": 2.012854093565221, + "learning_rate": 7.215945866296592e-08, + "loss": 0.7072, + "step": 11701 + }, + { + "epoch": 0.9629294383871632, + "grad_norm": 3.6356780584933834, + "learning_rate": 7.184019910043027e-08, + "loss": 0.7335, + "step": 11702 + }, + { + "epoch": 0.9630117259823082, + "grad_norm": 0.43571773761842336, + "learning_rate": 7.152164481848856e-08, + "loss": 0.4725, + "step": 11703 + }, + { + "epoch": 0.9630940135774532, + "grad_norm": 2.3459440580831434, + "learning_rate": 7.120379583977044e-08, + "loss": 0.702, + "step": 11704 + }, + { + "epoch": 0.9631763011725982, + "grad_norm": 2.2534078689349935, + "learning_rate": 7.088665218685675e-08, + "loss": 0.7079, + "step": 11705 + }, + { + "epoch": 0.9632585887677433, + "grad_norm": 8.330391414942097, + "learning_rate": 7.057021388227614e-08, + "loss": 0.7164, + "step": 11706 + }, + { + "epoch": 0.9633408763628883, + "grad_norm": 1.6591853621822317, + "learning_rate": 7.025448094850729e-08, + "loss": 0.7033, + "step": 11707 + }, + { + "epoch": 0.9634231639580333, + "grad_norm": 2.2713172697302078, + "learning_rate": 6.993945340798003e-08, + "loss": 0.6896, + "step": 11708 + }, + { + "epoch": 0.9635054515531783, + "grad_norm": 2.8734891015676753, + "learning_rate": 6.962513128307424e-08, + "loss": 0.7135, + "step": 11709 + }, + { + "epoch": 0.9635877391483234, + "grad_norm": 0.4105380819239511, + "learning_rate": 6.931151459611651e-08, + "loss": 0.4701, + "step": 11710 + }, + { + "epoch": 0.9636700267434685, + "grad_norm": 2.458589238389856, + "learning_rate": 6.899860336938679e-08, + "loss": 0.7234, + "step": 11711 + }, + { + "epoch": 0.9637523143386134, + "grad_norm": 1.7330757830996464, + "learning_rate": 6.868639762511398e-08, + "loss": 0.7231, + "step": 11712 + }, + { + "epoch": 0.9638346019337585, + "grad_norm": 1.9966686557829074, + "learning_rate": 6.837489738547809e-08, + "loss": 0.7112, + "step": 11713 + }, + { + "epoch": 0.9639168895289035, + "grad_norm": 1.631709081117669, + "learning_rate": 6.806410267260588e-08, + "loss": 0.6827, + "step": 11714 + }, + { + "epoch": 0.9639991771240486, + "grad_norm": 0.4185883158102109, + "learning_rate": 6.775401350857525e-08, + "loss": 0.4768, + "step": 11715 + }, + { + "epoch": 0.9640814647191935, + "grad_norm": 1.6988228184313863, + "learning_rate": 6.744462991541522e-08, + "loss": 0.7186, + "step": 11716 + }, + { + "epoch": 0.9641637523143386, + "grad_norm": 0.4496513644786353, + "learning_rate": 6.71359519151038e-08, + "loss": 0.4822, + "step": 11717 + }, + { + "epoch": 0.9642460399094837, + "grad_norm": 1.9617122096396629, + "learning_rate": 6.6827979529569e-08, + "loss": 0.6857, + "step": 11718 + }, + { + "epoch": 0.9643283275046287, + "grad_norm": 2.003580370344142, + "learning_rate": 6.652071278068994e-08, + "loss": 0.7255, + "step": 11719 + }, + { + "epoch": 0.9644106150997737, + "grad_norm": 3.6727015073681346, + "learning_rate": 6.621415169029255e-08, + "loss": 0.7191, + "step": 11720 + }, + { + "epoch": 0.9644929026949187, + "grad_norm": 1.9051404482971965, + "learning_rate": 6.590829628015494e-08, + "loss": 0.7294, + "step": 11721 + }, + { + "epoch": 0.9645751902900638, + "grad_norm": 1.6152637885480468, + "learning_rate": 6.560314657200418e-08, + "loss": 0.6834, + "step": 11722 + }, + { + "epoch": 0.9646574778852088, + "grad_norm": 1.9766807479906807, + "learning_rate": 6.529870258751957e-08, + "loss": 0.7208, + "step": 11723 + }, + { + "epoch": 0.9647397654803538, + "grad_norm": 2.0037153174815505, + "learning_rate": 6.499496434832608e-08, + "loss": 0.7146, + "step": 11724 + }, + { + "epoch": 0.9648220530754988, + "grad_norm": 1.8217791303199515, + "learning_rate": 6.469193187600087e-08, + "loss": 0.6958, + "step": 11725 + }, + { + "epoch": 0.9649043406706439, + "grad_norm": 2.015647993065728, + "learning_rate": 6.438960519207338e-08, + "loss": 0.6934, + "step": 11726 + }, + { + "epoch": 0.964986628265789, + "grad_norm": 1.9137377401430749, + "learning_rate": 6.408798431801755e-08, + "loss": 0.694, + "step": 11727 + }, + { + "epoch": 0.965068915860934, + "grad_norm": 0.40829312042323895, + "learning_rate": 6.378706927526068e-08, + "loss": 0.4815, + "step": 11728 + }, + { + "epoch": 0.965151203456079, + "grad_norm": 3.0190189208862788, + "learning_rate": 6.348686008518123e-08, + "loss": 0.735, + "step": 11729 + }, + { + "epoch": 0.965233491051224, + "grad_norm": 1.9664253947803203, + "learning_rate": 6.318735676910326e-08, + "loss": 0.6863, + "step": 11730 + }, + { + "epoch": 0.9653157786463691, + "grad_norm": 2.452778726039404, + "learning_rate": 6.288855934830417e-08, + "loss": 0.7098, + "step": 11731 + }, + { + "epoch": 0.9653980662415141, + "grad_norm": 1.811443304049906, + "learning_rate": 6.259046784400924e-08, + "loss": 0.6801, + "step": 11732 + }, + { + "epoch": 0.9654803538366591, + "grad_norm": 1.991566656719058, + "learning_rate": 6.229308227739594e-08, + "loss": 0.7029, + "step": 11733 + }, + { + "epoch": 0.9655626414318041, + "grad_norm": 1.8571618596634702, + "learning_rate": 6.199640266958739e-08, + "loss": 0.7247, + "step": 11734 + }, + { + "epoch": 0.9656449290269492, + "grad_norm": 2.529495651781941, + "learning_rate": 6.170042904166229e-08, + "loss": 0.7072, + "step": 11735 + }, + { + "epoch": 0.9657272166220943, + "grad_norm": 0.39982441685919656, + "learning_rate": 6.14051614146438e-08, + "loss": 0.4588, + "step": 11736 + }, + { + "epoch": 0.9658095042172392, + "grad_norm": 1.8811670242515015, + "learning_rate": 6.111059980950962e-08, + "loss": 0.7372, + "step": 11737 + }, + { + "epoch": 0.9658917918123843, + "grad_norm": 1.8990608606325208, + "learning_rate": 6.081674424718187e-08, + "loss": 0.719, + "step": 11738 + }, + { + "epoch": 0.9659740794075293, + "grad_norm": 1.997517113087183, + "learning_rate": 6.05235947485383e-08, + "loss": 0.7347, + "step": 11739 + }, + { + "epoch": 0.9660563670026744, + "grad_norm": 2.0172700636726297, + "learning_rate": 6.023115133440227e-08, + "loss": 0.7319, + "step": 11740 + }, + { + "epoch": 0.9661386545978193, + "grad_norm": 1.8456545378468963, + "learning_rate": 5.993941402554937e-08, + "loss": 0.6947, + "step": 11741 + }, + { + "epoch": 0.9662209421929644, + "grad_norm": 1.67419827239229, + "learning_rate": 5.964838284270302e-08, + "loss": 0.6959, + "step": 11742 + }, + { + "epoch": 0.9663032297881095, + "grad_norm": 0.4280883699326706, + "learning_rate": 5.9358057806540024e-08, + "loss": 0.4816, + "step": 11743 + }, + { + "epoch": 0.9663855173832545, + "grad_norm": 2.100678973202437, + "learning_rate": 5.906843893768055e-08, + "loss": 0.707, + "step": 11744 + }, + { + "epoch": 0.9664678049783995, + "grad_norm": 5.869136122700723, + "learning_rate": 5.877952625670258e-08, + "loss": 0.7187, + "step": 11745 + }, + { + "epoch": 0.9665500925735445, + "grad_norm": 2.5258572498016534, + "learning_rate": 5.849131978412747e-08, + "loss": 0.6951, + "step": 11746 + }, + { + "epoch": 0.9666323801686896, + "grad_norm": 2.3888437096490653, + "learning_rate": 5.820381954043108e-08, + "loss": 0.7256, + "step": 11747 + }, + { + "epoch": 0.9667146677638346, + "grad_norm": 2.0447520560611414, + "learning_rate": 5.7917025546034846e-08, + "loss": 0.6947, + "step": 11748 + }, + { + "epoch": 0.9667969553589796, + "grad_norm": 0.4159749125564766, + "learning_rate": 5.763093782131357e-08, + "loss": 0.4629, + "step": 11749 + }, + { + "epoch": 0.9668792429541246, + "grad_norm": 1.8096049872854842, + "learning_rate": 5.7345556386589894e-08, + "loss": 0.7147, + "step": 11750 + }, + { + "epoch": 0.9669615305492697, + "grad_norm": 1.772194364017142, + "learning_rate": 5.706088126213649e-08, + "loss": 0.7054, + "step": 11751 + }, + { + "epoch": 0.9670438181444148, + "grad_norm": 2.1751032518384212, + "learning_rate": 5.6776912468177184e-08, + "loss": 0.6915, + "step": 11752 + }, + { + "epoch": 0.9671261057395598, + "grad_norm": 1.7240669554780703, + "learning_rate": 5.649365002488361e-08, + "loss": 0.6835, + "step": 11753 + }, + { + "epoch": 0.9672083933347048, + "grad_norm": 1.8679000723069252, + "learning_rate": 5.621109395237967e-08, + "loss": 0.7311, + "step": 11754 + }, + { + "epoch": 0.9672906809298498, + "grad_norm": 1.9620655419942472, + "learning_rate": 5.592924427073709e-08, + "loss": 0.7159, + "step": 11755 + }, + { + "epoch": 0.9673729685249949, + "grad_norm": 1.9530604876235969, + "learning_rate": 5.564810099997653e-08, + "loss": 0.6878, + "step": 11756 + }, + { + "epoch": 0.9674552561201399, + "grad_norm": 2.1107647993480456, + "learning_rate": 5.5367664160072e-08, + "loss": 0.7175, + "step": 11757 + }, + { + "epoch": 0.9675375437152849, + "grad_norm": 2.03316501064997, + "learning_rate": 5.5087933770944234e-08, + "loss": 0.7299, + "step": 11758 + }, + { + "epoch": 0.96761983131043, + "grad_norm": 2.4415696068020507, + "learning_rate": 5.480890985246512e-08, + "loss": 0.7441, + "step": 11759 + }, + { + "epoch": 0.967702118905575, + "grad_norm": 2.0686821676020943, + "learning_rate": 5.453059242445658e-08, + "loss": 0.7041, + "step": 11760 + }, + { + "epoch": 0.9677844065007201, + "grad_norm": 2.2320667521780964, + "learning_rate": 5.425298150668945e-08, + "loss": 0.7164, + "step": 11761 + }, + { + "epoch": 0.967866694095865, + "grad_norm": 2.146850476354364, + "learning_rate": 5.397607711888353e-08, + "loss": 0.7012, + "step": 11762 + }, + { + "epoch": 0.9679489816910101, + "grad_norm": 3.416916976712876, + "learning_rate": 5.369987928071196e-08, + "loss": 0.724, + "step": 11763 + }, + { + "epoch": 0.9680312692861551, + "grad_norm": 0.409102420924207, + "learning_rate": 5.3424388011793505e-08, + "loss": 0.4413, + "step": 11764 + }, + { + "epoch": 0.9681135568813002, + "grad_norm": 2.0763513838602807, + "learning_rate": 5.314960333169916e-08, + "loss": 0.705, + "step": 11765 + }, + { + "epoch": 0.9681958444764451, + "grad_norm": 1.7277054240697254, + "learning_rate": 5.287552525994999e-08, + "loss": 0.7105, + "step": 11766 + }, + { + "epoch": 0.9682781320715902, + "grad_norm": 0.41295071190328786, + "learning_rate": 5.2602153816014855e-08, + "loss": 0.4666, + "step": 11767 + }, + { + "epoch": 0.9683604196667353, + "grad_norm": 1.7960591410945654, + "learning_rate": 5.232948901931489e-08, + "loss": 0.7068, + "step": 11768 + }, + { + "epoch": 0.9684427072618803, + "grad_norm": 2.3520724396493553, + "learning_rate": 5.2057530889217945e-08, + "loss": 0.7082, + "step": 11769 + }, + { + "epoch": 0.9685249948570253, + "grad_norm": 1.9930262966253889, + "learning_rate": 5.178627944504633e-08, + "loss": 0.7158, + "step": 11770 + }, + { + "epoch": 0.9686072824521703, + "grad_norm": 0.4346656614498784, + "learning_rate": 5.1515734706066856e-08, + "loss": 0.4785, + "step": 11771 + }, + { + "epoch": 0.9686895700473154, + "grad_norm": 2.4049391514058875, + "learning_rate": 5.1245896691498595e-08, + "loss": 0.7124, + "step": 11772 + }, + { + "epoch": 0.9687718576424604, + "grad_norm": 0.4170533762734798, + "learning_rate": 5.097676542051178e-08, + "loss": 0.475, + "step": 11773 + }, + { + "epoch": 0.9688541452376054, + "grad_norm": 1.7811027114327433, + "learning_rate": 5.070834091222443e-08, + "loss": 0.7047, + "step": 11774 + }, + { + "epoch": 0.9689364328327504, + "grad_norm": 1.8836373483100433, + "learning_rate": 5.044062318570464e-08, + "loss": 0.6601, + "step": 11775 + }, + { + "epoch": 0.9690187204278955, + "grad_norm": 2.4098180293701055, + "learning_rate": 5.0173612259971635e-08, + "loss": 0.7092, + "step": 11776 + }, + { + "epoch": 0.9691010080230406, + "grad_norm": 2.1055822018760906, + "learning_rate": 4.990730815399247e-08, + "loss": 0.7173, + "step": 11777 + }, + { + "epoch": 0.9691832956181856, + "grad_norm": 1.8997809398541035, + "learning_rate": 4.964171088668535e-08, + "loss": 0.698, + "step": 11778 + }, + { + "epoch": 0.9692655832133306, + "grad_norm": 1.7030787756504009, + "learning_rate": 4.937682047691739e-08, + "loss": 0.7071, + "step": 11779 + }, + { + "epoch": 0.9693478708084756, + "grad_norm": 0.40888799362960904, + "learning_rate": 4.911263694350688e-08, + "loss": 0.4705, + "step": 11780 + }, + { + "epoch": 0.9694301584036207, + "grad_norm": 2.207145626010943, + "learning_rate": 4.884916030521991e-08, + "loss": 0.7517, + "step": 11781 + }, + { + "epoch": 0.9695124459987657, + "grad_norm": 1.8917073125098327, + "learning_rate": 4.858639058077486e-08, + "loss": 0.7308, + "step": 11782 + }, + { + "epoch": 0.9695947335939107, + "grad_norm": 2.2500912919327636, + "learning_rate": 4.832432778883789e-08, + "loss": 0.7317, + "step": 11783 + }, + { + "epoch": 0.9696770211890557, + "grad_norm": 1.79813927115205, + "learning_rate": 4.806297194802523e-08, + "loss": 0.7099, + "step": 11784 + }, + { + "epoch": 0.9697593087842008, + "grad_norm": 1.7970746198147538, + "learning_rate": 4.7802323076903134e-08, + "loss": 0.7173, + "step": 11785 + }, + { + "epoch": 0.9698415963793459, + "grad_norm": 0.40642063315443117, + "learning_rate": 4.7542381193987905e-08, + "loss": 0.4597, + "step": 11786 + }, + { + "epoch": 0.9699238839744908, + "grad_norm": 1.9646966785229667, + "learning_rate": 4.728314631774478e-08, + "loss": 0.7408, + "step": 11787 + }, + { + "epoch": 0.9700061715696359, + "grad_norm": 1.8030567336326835, + "learning_rate": 4.702461846659012e-08, + "loss": 0.6888, + "step": 11788 + }, + { + "epoch": 0.9700884591647809, + "grad_norm": 2.0023484700517282, + "learning_rate": 4.676679765888925e-08, + "loss": 0.6884, + "step": 11789 + }, + { + "epoch": 0.970170746759926, + "grad_norm": 2.2478169754198696, + "learning_rate": 4.6509683912957514e-08, + "loss": 0.706, + "step": 11790 + }, + { + "epoch": 0.9702530343550709, + "grad_norm": 1.6514338001699116, + "learning_rate": 4.625327724705808e-08, + "loss": 0.6946, + "step": 11791 + }, + { + "epoch": 0.970335321950216, + "grad_norm": 1.7084024597590406, + "learning_rate": 4.59975776794086e-08, + "loss": 0.6937, + "step": 11792 + }, + { + "epoch": 0.970417609545361, + "grad_norm": 2.0485057572940204, + "learning_rate": 4.574258522817232e-08, + "loss": 0.718, + "step": 11793 + }, + { + "epoch": 0.9704998971405061, + "grad_norm": 2.829162091485086, + "learning_rate": 4.548829991146253e-08, + "loss": 0.7533, + "step": 11794 + }, + { + "epoch": 0.970582184735651, + "grad_norm": 0.40498965264153014, + "learning_rate": 4.523472174734478e-08, + "loss": 0.4686, + "step": 11795 + }, + { + "epoch": 0.9706644723307961, + "grad_norm": 2.045199157700003, + "learning_rate": 4.498185075383132e-08, + "loss": 0.7198, + "step": 11796 + }, + { + "epoch": 0.9707467599259412, + "grad_norm": 2.1292863767136385, + "learning_rate": 4.4729686948886684e-08, + "loss": 0.7056, + "step": 11797 + }, + { + "epoch": 0.9708290475210862, + "grad_norm": 2.076626937846562, + "learning_rate": 4.447823035042431e-08, + "loss": 0.7295, + "step": 11798 + }, + { + "epoch": 0.9709113351162312, + "grad_norm": 0.41227251969065215, + "learning_rate": 4.4227480976306583e-08, + "loss": 0.4773, + "step": 11799 + }, + { + "epoch": 0.9709936227113762, + "grad_norm": 2.0768073593408163, + "learning_rate": 4.397743884434702e-08, + "loss": 0.7175, + "step": 11800 + }, + { + "epoch": 0.9710759103065213, + "grad_norm": 2.0557918417249312, + "learning_rate": 4.372810397230809e-08, + "loss": 0.7012, + "step": 11801 + }, + { + "epoch": 0.9711581979016664, + "grad_norm": 1.7914345546251869, + "learning_rate": 4.3479476377901177e-08, + "loss": 0.7099, + "step": 11802 + }, + { + "epoch": 0.9712404854968113, + "grad_norm": 0.4008100660351981, + "learning_rate": 4.323155607878993e-08, + "loss": 0.4653, + "step": 11803 + }, + { + "epoch": 0.9713227730919564, + "grad_norm": 2.069494952681516, + "learning_rate": 4.29843430925847e-08, + "loss": 0.7128, + "step": 11804 + }, + { + "epoch": 0.9714050606871014, + "grad_norm": 0.42486303598137454, + "learning_rate": 4.2737837436848117e-08, + "loss": 0.4902, + "step": 11805 + }, + { + "epoch": 0.9714873482822465, + "grad_norm": 1.819631543354148, + "learning_rate": 4.249203912909172e-08, + "loss": 0.7046, + "step": 11806 + }, + { + "epoch": 0.9715696358773915, + "grad_norm": 0.4221346969538022, + "learning_rate": 4.224694818677599e-08, + "loss": 0.4813, + "step": 11807 + }, + { + "epoch": 0.9716519234725365, + "grad_norm": 3.0198603618989583, + "learning_rate": 4.200256462731256e-08, + "loss": 0.7034, + "step": 11808 + }, + { + "epoch": 0.9717342110676815, + "grad_norm": 1.6735894076865154, + "learning_rate": 4.1758888468059754e-08, + "loss": 0.7243, + "step": 11809 + }, + { + "epoch": 0.9718164986628266, + "grad_norm": 1.9132687450082475, + "learning_rate": 4.151591972633151e-08, + "loss": 0.7295, + "step": 11810 + }, + { + "epoch": 0.9718987862579717, + "grad_norm": 1.8232402289826788, + "learning_rate": 4.127365841938513e-08, + "loss": 0.6899, + "step": 11811 + }, + { + "epoch": 0.9719810738531166, + "grad_norm": 0.42155291288274604, + "learning_rate": 4.1032104564431297e-08, + "loss": 0.4954, + "step": 11812 + }, + { + "epoch": 0.9720633614482617, + "grad_norm": 1.7488597683734544, + "learning_rate": 4.079125817863072e-08, + "loss": 0.6905, + "step": 11813 + }, + { + "epoch": 0.9721456490434067, + "grad_norm": 1.7258189215472062, + "learning_rate": 4.0551119279090836e-08, + "loss": 0.7091, + "step": 11814 + }, + { + "epoch": 0.9722279366385518, + "grad_norm": 1.943988963533932, + "learning_rate": 4.031168788287132e-08, + "loss": 0.7152, + "step": 11815 + }, + { + "epoch": 0.9723102242336967, + "grad_norm": 1.8255702568940329, + "learning_rate": 4.0072964006981907e-08, + "loss": 0.7137, + "step": 11816 + }, + { + "epoch": 0.9723925118288418, + "grad_norm": 2.2310239275684633, + "learning_rate": 3.9834947668380145e-08, + "loss": 0.7092, + "step": 11817 + }, + { + "epoch": 0.9724747994239868, + "grad_norm": 1.9785597403132975, + "learning_rate": 3.959763888397583e-08, + "loss": 0.695, + "step": 11818 + }, + { + "epoch": 0.9725570870191319, + "grad_norm": 1.6717371213940233, + "learning_rate": 3.936103767062549e-08, + "loss": 0.6754, + "step": 11819 + }, + { + "epoch": 0.9726393746142769, + "grad_norm": 2.0734494708443694, + "learning_rate": 3.9125144045136785e-08, + "loss": 0.7094, + "step": 11820 + }, + { + "epoch": 0.9727216622094219, + "grad_norm": 2.3449448439943943, + "learning_rate": 3.888995802426854e-08, + "loss": 0.7181, + "step": 11821 + }, + { + "epoch": 0.972803949804567, + "grad_norm": 1.6029641210705916, + "learning_rate": 3.8655479624726267e-08, + "loss": 0.6822, + "step": 11822 + }, + { + "epoch": 0.972886237399712, + "grad_norm": 1.981990679039243, + "learning_rate": 3.842170886316887e-08, + "loss": 0.7249, + "step": 11823 + }, + { + "epoch": 0.972968524994857, + "grad_norm": 1.9028348030193996, + "learning_rate": 3.818864575620196e-08, + "loss": 0.6935, + "step": 11824 + }, + { + "epoch": 0.973050812590002, + "grad_norm": 2.237671601980677, + "learning_rate": 3.795629032038117e-08, + "loss": 0.7372, + "step": 11825 + }, + { + "epoch": 0.9731331001851471, + "grad_norm": 1.8500274525606317, + "learning_rate": 3.772464257221442e-08, + "loss": 0.6999, + "step": 11826 + }, + { + "epoch": 0.9732153877802922, + "grad_norm": 1.9590308465518342, + "learning_rate": 3.749370252815632e-08, + "loss": 0.6922, + "step": 11827 + }, + { + "epoch": 0.9732976753754371, + "grad_norm": 1.6686410738799464, + "learning_rate": 3.726347020461374e-08, + "loss": 0.6816, + "step": 11828 + }, + { + "epoch": 0.9733799629705822, + "grad_norm": 0.42166678220399, + "learning_rate": 3.703394561794027e-08, + "loss": 0.4584, + "step": 11829 + }, + { + "epoch": 0.9734622505657272, + "grad_norm": 2.208499312819335, + "learning_rate": 3.680512878444287e-08, + "loss": 0.7104, + "step": 11830 + }, + { + "epoch": 0.9735445381608723, + "grad_norm": 1.8269062907130718, + "learning_rate": 3.6577019720374086e-08, + "loss": 0.6938, + "step": 11831 + }, + { + "epoch": 0.9736268257560173, + "grad_norm": 0.40638619234821194, + "learning_rate": 3.634961844194096e-08, + "loss": 0.4418, + "step": 11832 + }, + { + "epoch": 0.9737091133511623, + "grad_norm": 2.04598040624015, + "learning_rate": 3.612292496529501e-08, + "loss": 0.7304, + "step": 11833 + }, + { + "epoch": 0.9737914009463073, + "grad_norm": 2.28134003471954, + "learning_rate": 3.5896939306542253e-08, + "loss": 0.7104, + "step": 11834 + }, + { + "epoch": 0.9738736885414524, + "grad_norm": 3.1785428464468177, + "learning_rate": 3.567166148173651e-08, + "loss": 0.6916, + "step": 11835 + }, + { + "epoch": 0.9739559761365975, + "grad_norm": 1.894786773919502, + "learning_rate": 3.544709150687942e-08, + "loss": 0.7038, + "step": 11836 + }, + { + "epoch": 0.9740382637317424, + "grad_norm": 2.7070609697616854, + "learning_rate": 3.522322939792488e-08, + "loss": 0.7055, + "step": 11837 + }, + { + "epoch": 0.9741205513268875, + "grad_norm": 0.40751829662842437, + "learning_rate": 3.500007517077686e-08, + "loss": 0.4473, + "step": 11838 + }, + { + "epoch": 0.9742028389220325, + "grad_norm": 2.4312984996382463, + "learning_rate": 3.477762884128599e-08, + "loss": 0.7005, + "step": 11839 + }, + { + "epoch": 0.9742851265171776, + "grad_norm": 1.7761586799347355, + "learning_rate": 3.45558904252552e-08, + "loss": 0.713, + "step": 11840 + }, + { + "epoch": 0.9743674141123225, + "grad_norm": 1.6872914166802075, + "learning_rate": 3.433485993843744e-08, + "loss": 0.7191, + "step": 11841 + }, + { + "epoch": 0.9744497017074676, + "grad_norm": 2.1453767279441043, + "learning_rate": 3.4114537396532366e-08, + "loss": 0.694, + "step": 11842 + }, + { + "epoch": 0.9745319893026126, + "grad_norm": 2.574531181682933, + "learning_rate": 3.389492281519302e-08, + "loss": 0.7147, + "step": 11843 + }, + { + "epoch": 0.9746142768977577, + "grad_norm": 1.7773357702216661, + "learning_rate": 3.367601621002026e-08, + "loss": 0.7122, + "step": 11844 + }, + { + "epoch": 0.9746965644929027, + "grad_norm": 2.0940617205065593, + "learning_rate": 3.3457817596563854e-08, + "loss": 0.7103, + "step": 11845 + }, + { + "epoch": 0.9747788520880477, + "grad_norm": 1.7052218908051389, + "learning_rate": 3.3240326990325865e-08, + "loss": 0.7135, + "step": 11846 + }, + { + "epoch": 0.9748611396831928, + "grad_norm": 2.103786179516335, + "learning_rate": 3.302354440675504e-08, + "loss": 0.7, + "step": 11847 + }, + { + "epoch": 0.9749434272783378, + "grad_norm": 1.999930089818625, + "learning_rate": 3.280746986125127e-08, + "loss": 0.7307, + "step": 11848 + }, + { + "epoch": 0.9750257148734828, + "grad_norm": 1.8536421754727401, + "learning_rate": 3.2592103369165627e-08, + "loss": 0.7225, + "step": 11849 + }, + { + "epoch": 0.9751080024686278, + "grad_norm": 2.907899631233037, + "learning_rate": 3.2377444945795866e-08, + "loss": 0.7177, + "step": 11850 + }, + { + "epoch": 0.9751902900637729, + "grad_norm": 2.5865764973510132, + "learning_rate": 3.216349460639201e-08, + "loss": 0.6772, + "step": 11851 + }, + { + "epoch": 0.975272577658918, + "grad_norm": 2.3392778164250436, + "learning_rate": 3.19502523661519e-08, + "loss": 0.7016, + "step": 11852 + }, + { + "epoch": 0.9753548652540629, + "grad_norm": 2.1092324143609447, + "learning_rate": 3.173771824022454e-08, + "loss": 0.713, + "step": 11853 + }, + { + "epoch": 0.975437152849208, + "grad_norm": 1.8019453340210225, + "learning_rate": 3.1525892243707836e-08, + "loss": 0.6899, + "step": 11854 + }, + { + "epoch": 0.975519440444353, + "grad_norm": 4.80505380142809, + "learning_rate": 3.131477439164865e-08, + "loss": 0.688, + "step": 11855 + }, + { + "epoch": 0.9756017280394981, + "grad_norm": 2.0721676527248007, + "learning_rate": 3.1104364699046096e-08, + "loss": 0.704, + "step": 11856 + }, + { + "epoch": 0.9756840156346431, + "grad_norm": 2.1916757028352207, + "learning_rate": 3.089466318084711e-08, + "loss": 0.7066, + "step": 11857 + }, + { + "epoch": 0.9757663032297881, + "grad_norm": 2.096639920538251, + "learning_rate": 3.068566985194754e-08, + "loss": 0.7015, + "step": 11858 + }, + { + "epoch": 0.9758485908249331, + "grad_norm": 0.37918614944344764, + "learning_rate": 3.0477384727194414e-08, + "loss": 0.4335, + "step": 11859 + }, + { + "epoch": 0.9759308784200782, + "grad_norm": 2.8521025681639918, + "learning_rate": 3.0269807821383665e-08, + "loss": 0.7234, + "step": 11860 + }, + { + "epoch": 0.9760131660152233, + "grad_norm": 1.9943804940255423, + "learning_rate": 3.006293914926128e-08, + "loss": 0.6572, + "step": 11861 + }, + { + "epoch": 0.9760954536103682, + "grad_norm": 2.7016998669399412, + "learning_rate": 2.985677872552439e-08, + "loss": 0.7388, + "step": 11862 + }, + { + "epoch": 0.9761777412055133, + "grad_norm": 2.0073151336023587, + "learning_rate": 2.965132656481573e-08, + "loss": 0.7048, + "step": 11863 + }, + { + "epoch": 0.9762600288006583, + "grad_norm": 1.7764126406211305, + "learning_rate": 2.9446582681732504e-08, + "loss": 0.7048, + "step": 11864 + }, + { + "epoch": 0.9763423163958034, + "grad_norm": 0.41942014834744246, + "learning_rate": 2.924254709081753e-08, + "loss": 0.4626, + "step": 11865 + }, + { + "epoch": 0.9764246039909483, + "grad_norm": 1.8729447700773898, + "learning_rate": 2.9039219806566987e-08, + "loss": 0.6866, + "step": 11866 + }, + { + "epoch": 0.9765068915860934, + "grad_norm": 1.9951412402372832, + "learning_rate": 2.8836600843423767e-08, + "loss": 0.7147, + "step": 11867 + }, + { + "epoch": 0.9765891791812384, + "grad_norm": 2.544713324780585, + "learning_rate": 2.8634690215781913e-08, + "loss": 0.6776, + "step": 11868 + }, + { + "epoch": 0.9766714667763835, + "grad_norm": 1.7524365623913154, + "learning_rate": 2.8433487937985506e-08, + "loss": 0.7301, + "step": 11869 + }, + { + "epoch": 0.9767537543715284, + "grad_norm": 1.7603026353134625, + "learning_rate": 2.8232994024326442e-08, + "loss": 0.7294, + "step": 11870 + }, + { + "epoch": 0.9768360419666735, + "grad_norm": 2.1268866888886984, + "learning_rate": 2.803320848904778e-08, + "loss": 0.7046, + "step": 11871 + }, + { + "epoch": 0.9769183295618186, + "grad_norm": 2.10201515415034, + "learning_rate": 2.783413134634261e-08, + "loss": 0.716, + "step": 11872 + }, + { + "epoch": 0.9770006171569636, + "grad_norm": 2.197493708763074, + "learning_rate": 2.7635762610351835e-08, + "loss": 0.7286, + "step": 11873 + }, + { + "epoch": 0.9770829047521086, + "grad_norm": 2.607351306426533, + "learning_rate": 2.7438102295168635e-08, + "loss": 0.7095, + "step": 11874 + }, + { + "epoch": 0.9771651923472536, + "grad_norm": 1.6928906323226869, + "learning_rate": 2.7241150414833996e-08, + "loss": 0.6986, + "step": 11875 + }, + { + "epoch": 0.9772474799423987, + "grad_norm": 2.0413481092181267, + "learning_rate": 2.7044906983338946e-08, + "loss": 0.6649, + "step": 11876 + }, + { + "epoch": 0.9773297675375437, + "grad_norm": 2.459994332622993, + "learning_rate": 2.6849372014623455e-08, + "loss": 0.7029, + "step": 11877 + }, + { + "epoch": 0.9774120551326887, + "grad_norm": 1.7237899962310035, + "learning_rate": 2.6654545522579733e-08, + "loss": 0.6933, + "step": 11878 + }, + { + "epoch": 0.9774943427278338, + "grad_norm": 1.5657458863269926, + "learning_rate": 2.6460427521046716e-08, + "loss": 0.7086, + "step": 11879 + }, + { + "epoch": 0.9775766303229788, + "grad_norm": 1.8720338736026847, + "learning_rate": 2.6267018023815594e-08, + "loss": 0.7033, + "step": 11880 + }, + { + "epoch": 0.9776589179181239, + "grad_norm": 2.3853114102417687, + "learning_rate": 2.6074317044623156e-08, + "loss": 0.7021, + "step": 11881 + }, + { + "epoch": 0.9777412055132689, + "grad_norm": 0.43829531216709133, + "learning_rate": 2.588232459716178e-08, + "loss": 0.4724, + "step": 11882 + }, + { + "epoch": 0.9778234931084139, + "grad_norm": 0.40040882383321164, + "learning_rate": 2.5691040695068336e-08, + "loss": 0.4782, + "step": 11883 + }, + { + "epoch": 0.9779057807035589, + "grad_norm": 0.4142731848974326, + "learning_rate": 2.550046535193196e-08, + "loss": 0.4613, + "step": 11884 + }, + { + "epoch": 0.977988068298704, + "grad_norm": 0.42549342569333626, + "learning_rate": 2.5310598581290702e-08, + "loss": 0.4784, + "step": 11885 + }, + { + "epoch": 0.978070355893849, + "grad_norm": 1.7894951234811918, + "learning_rate": 2.5121440396631557e-08, + "loss": 0.6781, + "step": 11886 + }, + { + "epoch": 0.978152643488994, + "grad_norm": 1.9491993536975893, + "learning_rate": 2.4932990811393777e-08, + "loss": 0.7036, + "step": 11887 + }, + { + "epoch": 0.9782349310841391, + "grad_norm": 1.7639826785486652, + "learning_rate": 2.4745249838963315e-08, + "loss": 0.708, + "step": 11888 + }, + { + "epoch": 0.9783172186792841, + "grad_norm": 0.3981387020207396, + "learning_rate": 2.4558217492677283e-08, + "loss": 0.4553, + "step": 11889 + }, + { + "epoch": 0.9783995062744292, + "grad_norm": 2.0019934681830236, + "learning_rate": 2.437189378582172e-08, + "loss": 0.6921, + "step": 11890 + }, + { + "epoch": 0.9784817938695741, + "grad_norm": 2.0060102665579826, + "learning_rate": 2.4186278731632707e-08, + "loss": 0.6878, + "step": 11891 + }, + { + "epoch": 0.9785640814647192, + "grad_norm": 2.4540705455729848, + "learning_rate": 2.4001372343297468e-08, + "loss": 0.7006, + "step": 11892 + }, + { + "epoch": 0.9786463690598642, + "grad_norm": 1.6027246182584245, + "learning_rate": 2.3817174633949948e-08, + "loss": 0.6975, + "step": 11893 + }, + { + "epoch": 0.9787286566550093, + "grad_norm": 2.25606371138766, + "learning_rate": 2.363368561667634e-08, + "loss": 0.6886, + "step": 11894 + }, + { + "epoch": 0.9788109442501542, + "grad_norm": 0.42109403299683523, + "learning_rate": 2.3450905304510663e-08, + "loss": 0.466, + "step": 11895 + }, + { + "epoch": 0.9788932318452993, + "grad_norm": 1.6348357948027095, + "learning_rate": 2.326883371043809e-08, + "loss": 0.6752, + "step": 11896 + }, + { + "epoch": 0.9789755194404444, + "grad_norm": 2.077533631208267, + "learning_rate": 2.3087470847390492e-08, + "loss": 0.6936, + "step": 11897 + }, + { + "epoch": 0.9790578070355894, + "grad_norm": 2.649853200328735, + "learning_rate": 2.2906816728254234e-08, + "loss": 0.714, + "step": 11898 + }, + { + "epoch": 0.9791400946307344, + "grad_norm": 2.1621029875469584, + "learning_rate": 2.272687136586238e-08, + "loss": 0.7117, + "step": 11899 + }, + { + "epoch": 0.9792223822258794, + "grad_norm": 2.1409121910355005, + "learning_rate": 2.2547634772996928e-08, + "loss": 0.6939, + "step": 11900 + }, + { + "epoch": 0.9793046698210245, + "grad_norm": 1.9667843945732197, + "learning_rate": 2.236910696238992e-08, + "loss": 0.7249, + "step": 11901 + }, + { + "epoch": 0.9793869574161695, + "grad_norm": 1.96261468035295, + "learning_rate": 2.2191287946725647e-08, + "loss": 0.7141, + "step": 11902 + }, + { + "epoch": 0.9794692450113145, + "grad_norm": 1.5849635859334488, + "learning_rate": 2.201417773863512e-08, + "loss": 0.674, + "step": 11903 + }, + { + "epoch": 0.9795515326064596, + "grad_norm": 1.7103059154496385, + "learning_rate": 2.1837776350699392e-08, + "loss": 0.6932, + "step": 11904 + }, + { + "epoch": 0.9796338202016046, + "grad_norm": 2.593523844006, + "learning_rate": 2.166208379545065e-08, + "loss": 0.7257, + "step": 11905 + }, + { + "epoch": 0.9797161077967497, + "grad_norm": 0.42317825721045776, + "learning_rate": 2.1487100085370027e-08, + "loss": 0.467, + "step": 11906 + }, + { + "epoch": 0.9797983953918946, + "grad_norm": 1.9534704276080264, + "learning_rate": 2.131282523288647e-08, + "loss": 0.6896, + "step": 11907 + }, + { + "epoch": 0.9798806829870397, + "grad_norm": 0.460920216140725, + "learning_rate": 2.1139259250382292e-08, + "loss": 0.4696, + "step": 11908 + }, + { + "epoch": 0.9799629705821847, + "grad_norm": 0.4214858832340325, + "learning_rate": 2.0966402150185416e-08, + "loss": 0.4644, + "step": 11909 + }, + { + "epoch": 0.9800452581773298, + "grad_norm": 2.1452003795186525, + "learning_rate": 2.0794253944577125e-08, + "loss": 0.7349, + "step": 11910 + }, + { + "epoch": 0.9801275457724749, + "grad_norm": 3.296085134488794, + "learning_rate": 2.0622814645785416e-08, + "loss": 0.7103, + "step": 11911 + }, + { + "epoch": 0.9802098333676198, + "grad_norm": 2.1245159197958197, + "learning_rate": 2.0452084265988327e-08, + "loss": 0.7012, + "step": 11912 + }, + { + "epoch": 0.9802921209627649, + "grad_norm": 2.0648261943144623, + "learning_rate": 2.0282062817316152e-08, + "loss": 0.7068, + "step": 11913 + }, + { + "epoch": 0.9803744085579099, + "grad_norm": 1.9608940818023537, + "learning_rate": 2.0112750311845896e-08, + "loss": 0.7083, + "step": 11914 + }, + { + "epoch": 0.980456696153055, + "grad_norm": 1.8238516507930322, + "learning_rate": 1.994414676160461e-08, + "loss": 0.7217, + "step": 11915 + }, + { + "epoch": 0.9805389837481999, + "grad_norm": 2.3037897069326707, + "learning_rate": 1.9776252178571597e-08, + "loss": 0.7039, + "step": 11916 + }, + { + "epoch": 0.980621271343345, + "grad_norm": 1.7086362211250603, + "learning_rate": 1.9609066574672876e-08, + "loss": 0.6857, + "step": 11917 + }, + { + "epoch": 0.98070355893849, + "grad_norm": 0.40241330582346907, + "learning_rate": 1.9442589961783388e-08, + "loss": 0.4561, + "step": 11918 + }, + { + "epoch": 0.9807858465336351, + "grad_norm": 1.7367734736409455, + "learning_rate": 1.927682235173145e-08, + "loss": 0.6627, + "step": 11919 + }, + { + "epoch": 0.98086813412878, + "grad_norm": 2.103110317684392, + "learning_rate": 1.911176375629209e-08, + "loss": 0.6956, + "step": 11920 + }, + { + "epoch": 0.9809504217239251, + "grad_norm": 2.098812872410639, + "learning_rate": 1.8947414187191483e-08, + "loss": 0.7033, + "step": 11921 + }, + { + "epoch": 0.9810327093190702, + "grad_norm": 1.820435596516699, + "learning_rate": 1.878377365610362e-08, + "loss": 0.6886, + "step": 11922 + }, + { + "epoch": 0.9811149969142152, + "grad_norm": 1.7246485381434649, + "learning_rate": 1.8620842174654764e-08, + "loss": 0.7183, + "step": 11923 + }, + { + "epoch": 0.9811972845093602, + "grad_norm": 1.867795160560803, + "learning_rate": 1.8458619754417873e-08, + "loss": 0.7056, + "step": 11924 + }, + { + "epoch": 0.9812795721045052, + "grad_norm": 2.56986298932985, + "learning_rate": 1.8297106406917065e-08, + "loss": 0.6901, + "step": 11925 + }, + { + "epoch": 0.9813618596996503, + "grad_norm": 2.308702807934525, + "learning_rate": 1.8136302143626495e-08, + "loss": 0.7351, + "step": 11926 + }, + { + "epoch": 0.9814441472947953, + "grad_norm": 2.104234060743508, + "learning_rate": 1.7976206975969247e-08, + "loss": 0.71, + "step": 11927 + }, + { + "epoch": 0.9815264348899403, + "grad_norm": 2.052734381159749, + "learning_rate": 1.7816820915319554e-08, + "loss": 0.697, + "step": 11928 + }, + { + "epoch": 0.9816087224850853, + "grad_norm": 2.0758082086198324, + "learning_rate": 1.7658143972997256e-08, + "loss": 0.6789, + "step": 11929 + }, + { + "epoch": 0.9816910100802304, + "grad_norm": 2.002865556072755, + "learning_rate": 1.7500176160277772e-08, + "loss": 0.6775, + "step": 11930 + }, + { + "epoch": 0.9817732976753755, + "grad_norm": 0.4160157071455825, + "learning_rate": 1.7342917488379906e-08, + "loss": 0.4797, + "step": 11931 + }, + { + "epoch": 0.9818555852705204, + "grad_norm": 2.3211098762755644, + "learning_rate": 1.7186367968475837e-08, + "loss": 0.7069, + "step": 11932 + }, + { + "epoch": 0.9819378728656655, + "grad_norm": 1.786747766386664, + "learning_rate": 1.7030527611687775e-08, + "loss": 0.7028, + "step": 11933 + }, + { + "epoch": 0.9820201604608105, + "grad_norm": 2.2207683648060703, + "learning_rate": 1.6875396429085756e-08, + "loss": 0.722, + "step": 11934 + }, + { + "epoch": 0.9821024480559556, + "grad_norm": 0.42685117158413655, + "learning_rate": 1.672097443168985e-08, + "loss": 0.4744, + "step": 11935 + }, + { + "epoch": 0.9821847356511006, + "grad_norm": 2.383340745701831, + "learning_rate": 1.6567261630470177e-08, + "loss": 0.6896, + "step": 11936 + }, + { + "epoch": 0.9822670232462456, + "grad_norm": 1.7746632276483472, + "learning_rate": 1.6414258036345777e-08, + "loss": 0.7322, + "step": 11937 + }, + { + "epoch": 0.9823493108413907, + "grad_norm": 1.7205311497674192, + "learning_rate": 1.6261963660185733e-08, + "loss": 0.6848, + "step": 11938 + }, + { + "epoch": 0.9824315984365357, + "grad_norm": 2.3005013186620467, + "learning_rate": 1.6110378512809166e-08, + "loss": 0.7088, + "step": 11939 + }, + { + "epoch": 0.9825138860316808, + "grad_norm": 1.6100459463889913, + "learning_rate": 1.5959502604985244e-08, + "loss": 0.72, + "step": 11940 + }, + { + "epoch": 0.9825961736268257, + "grad_norm": 2.586669744738869, + "learning_rate": 1.580933594743095e-08, + "loss": 0.7025, + "step": 11941 + }, + { + "epoch": 0.9826784612219708, + "grad_norm": 1.9104285811390251, + "learning_rate": 1.56598785508133e-08, + "loss": 0.702, + "step": 11942 + }, + { + "epoch": 0.9827607488171158, + "grad_norm": 1.7445333661648956, + "learning_rate": 1.5511130425751586e-08, + "loss": 0.7053, + "step": 11943 + }, + { + "epoch": 0.9828430364122609, + "grad_norm": 1.8907708579543898, + "learning_rate": 1.5363091582810687e-08, + "loss": 0.6999, + "step": 11944 + }, + { + "epoch": 0.9829253240074058, + "grad_norm": 1.7207538548588954, + "learning_rate": 1.5215762032506632e-08, + "loss": 0.6933, + "step": 11945 + }, + { + "epoch": 0.9830076116025509, + "grad_norm": 1.6891552429810468, + "learning_rate": 1.5069141785307717e-08, + "loss": 0.7101, + "step": 11946 + }, + { + "epoch": 0.983089899197696, + "grad_norm": 2.4086068458333405, + "learning_rate": 1.4923230851628946e-08, + "loss": 0.7002, + "step": 11947 + }, + { + "epoch": 0.983172186792841, + "grad_norm": 3.012156937856631, + "learning_rate": 1.4778029241834247e-08, + "loss": 0.7027, + "step": 11948 + }, + { + "epoch": 0.983254474387986, + "grad_norm": 1.7213037222411407, + "learning_rate": 1.4633536966238705e-08, + "loss": 0.7326, + "step": 11949 + }, + { + "epoch": 0.983336761983131, + "grad_norm": 1.926696588493255, + "learning_rate": 1.448975403510855e-08, + "loss": 0.7162, + "step": 11950 + }, + { + "epoch": 0.9834190495782761, + "grad_norm": 2.247506445488047, + "learning_rate": 1.4346680458656726e-08, + "loss": 0.7105, + "step": 11951 + }, + { + "epoch": 0.9835013371734211, + "grad_norm": 0.42235586296091254, + "learning_rate": 1.4204316247046213e-08, + "loss": 0.4576, + "step": 11952 + }, + { + "epoch": 0.9835836247685661, + "grad_norm": 3.0795489144202417, + "learning_rate": 1.4062661410392253e-08, + "loss": 0.7203, + "step": 11953 + }, + { + "epoch": 0.9836659123637111, + "grad_norm": 1.8217454899818168, + "learning_rate": 1.3921715958755689e-08, + "loss": 0.702, + "step": 11954 + }, + { + "epoch": 0.9837481999588562, + "grad_norm": 1.6915316479376694, + "learning_rate": 1.378147990215073e-08, + "loss": 0.7084, + "step": 11955 + }, + { + "epoch": 0.9838304875540013, + "grad_norm": 2.045795209681667, + "learning_rate": 1.3641953250538298e-08, + "loss": 0.6868, + "step": 11956 + }, + { + "epoch": 0.9839127751491462, + "grad_norm": 1.8756438632036834, + "learning_rate": 1.3503136013831575e-08, + "loss": 0.7064, + "step": 11957 + }, + { + "epoch": 0.9839950627442913, + "grad_norm": 1.8754457093107404, + "learning_rate": 1.336502820189045e-08, + "loss": 0.6793, + "step": 11958 + }, + { + "epoch": 0.9840773503394363, + "grad_norm": 3.1299914099005948, + "learning_rate": 1.3227629824525967e-08, + "loss": 0.7142, + "step": 11959 + }, + { + "epoch": 0.9841596379345814, + "grad_norm": 2.38763437562821, + "learning_rate": 1.3090940891500314e-08, + "loss": 0.6797, + "step": 11960 + }, + { + "epoch": 0.9842419255297264, + "grad_norm": 0.41214427009102256, + "learning_rate": 1.2954961412521283e-08, + "loss": 0.4642, + "step": 11961 + }, + { + "epoch": 0.9843242131248714, + "grad_norm": 3.326943225013204, + "learning_rate": 1.2819691397250034e-08, + "loss": 0.7091, + "step": 11962 + }, + { + "epoch": 0.9844065007200165, + "grad_norm": 1.659875890420232, + "learning_rate": 1.2685130855296656e-08, + "loss": 0.7046, + "step": 11963 + }, + { + "epoch": 0.9844887883151615, + "grad_norm": 2.063314879587402, + "learning_rate": 1.2551279796219063e-08, + "loss": 0.731, + "step": 11964 + }, + { + "epoch": 0.9845710759103066, + "grad_norm": 2.3940785067620016, + "learning_rate": 1.2418138229525201e-08, + "loss": 0.6905, + "step": 11965 + }, + { + "epoch": 0.9846533635054515, + "grad_norm": 2.2443016249217345, + "learning_rate": 1.2285706164675282e-08, + "loss": 0.7104, + "step": 11966 + }, + { + "epoch": 0.9847356511005966, + "grad_norm": 2.0119104889994475, + "learning_rate": 1.2153983611075115e-08, + "loss": 0.7279, + "step": 11967 + }, + { + "epoch": 0.9848179386957416, + "grad_norm": 0.4188739486402796, + "learning_rate": 1.2022970578081661e-08, + "loss": 0.4761, + "step": 11968 + }, + { + "epoch": 0.9849002262908867, + "grad_norm": 0.4515307545776995, + "learning_rate": 1.1892667075005248e-08, + "loss": 0.4775, + "step": 11969 + }, + { + "epoch": 0.9849825138860316, + "grad_norm": 2.0409433783245605, + "learning_rate": 1.1763073111098477e-08, + "loss": 0.7365, + "step": 11970 + }, + { + "epoch": 0.9850648014811767, + "grad_norm": 0.41290375803272034, + "learning_rate": 1.1634188695569537e-08, + "loss": 0.4623, + "step": 11971 + }, + { + "epoch": 0.9851470890763218, + "grad_norm": 1.8356180422190895, + "learning_rate": 1.1506013837573326e-08, + "loss": 0.711, + "step": 11972 + }, + { + "epoch": 0.9852293766714668, + "grad_norm": 0.41608256469207294, + "learning_rate": 1.1378548546217005e-08, + "loss": 0.4462, + "step": 11973 + }, + { + "epoch": 0.9853116642666118, + "grad_norm": 2.0459216316247137, + "learning_rate": 1.1251792830553333e-08, + "loss": 0.7141, + "step": 11974 + }, + { + "epoch": 0.9853939518617568, + "grad_norm": 1.931828760921166, + "learning_rate": 1.1125746699587325e-08, + "loss": 0.7041, + "step": 11975 + }, + { + "epoch": 0.9854762394569019, + "grad_norm": 1.919221873732418, + "learning_rate": 1.1000410162274045e-08, + "loss": 0.7052, + "step": 11976 + }, + { + "epoch": 0.9855585270520469, + "grad_norm": 2.2700274783479166, + "learning_rate": 1.0875783227516368e-08, + "loss": 0.7143, + "step": 11977 + }, + { + "epoch": 0.9856408146471919, + "grad_norm": 0.41789510619770476, + "learning_rate": 1.0751865904167213e-08, + "loss": 0.468, + "step": 11978 + }, + { + "epoch": 0.985723102242337, + "grad_norm": 5.214620251619243, + "learning_rate": 1.0628658201030651e-08, + "loss": 0.733, + "step": 11979 + }, + { + "epoch": 0.985805389837482, + "grad_norm": 1.8206233417277304, + "learning_rate": 1.0506160126858567e-08, + "loss": 0.6849, + "step": 11980 + }, + { + "epoch": 0.9858876774326271, + "grad_norm": 0.4230724998046636, + "learning_rate": 1.038437169035289e-08, + "loss": 0.4601, + "step": 11981 + }, + { + "epoch": 0.985969965027772, + "grad_norm": 0.4016197811175623, + "learning_rate": 1.026329290016559e-08, + "loss": 0.4619, + "step": 11982 + }, + { + "epoch": 0.9860522526229171, + "grad_norm": 0.4373973210118119, + "learning_rate": 1.014292376489756e-08, + "loss": 0.4901, + "step": 11983 + }, + { + "epoch": 0.9861345402180621, + "grad_norm": 2.3323635906397073, + "learning_rate": 1.0023264293099743e-08, + "loss": 0.6942, + "step": 11984 + }, + { + "epoch": 0.9862168278132072, + "grad_norm": 2.328430443142431, + "learning_rate": 9.904314493273116e-09, + "loss": 0.7255, + "step": 11985 + }, + { + "epoch": 0.9862991154083522, + "grad_norm": 0.41656184431828375, + "learning_rate": 9.786074373866472e-09, + "loss": 0.4784, + "step": 11986 + }, + { + "epoch": 0.9863814030034972, + "grad_norm": 0.38328571075358336, + "learning_rate": 9.668543943280873e-09, + "loss": 0.4318, + "step": 11987 + }, + { + "epoch": 0.9864636905986423, + "grad_norm": 1.9600138745867046, + "learning_rate": 9.551723209864084e-09, + "loss": 0.717, + "step": 11988 + }, + { + "epoch": 0.9865459781937873, + "grad_norm": 2.2393150162877884, + "learning_rate": 9.435612181916132e-09, + "loss": 0.7038, + "step": 11989 + }, + { + "epoch": 0.9866282657889324, + "grad_norm": 2.286909935182745, + "learning_rate": 9.320210867683754e-09, + "loss": 0.7327, + "step": 11990 + }, + { + "epoch": 0.9867105533840773, + "grad_norm": 1.945962236819001, + "learning_rate": 9.205519275367058e-09, + "loss": 0.6967, + "step": 11991 + }, + { + "epoch": 0.9867928409792224, + "grad_norm": 1.939376991016164, + "learning_rate": 9.09153741311175e-09, + "loss": 0.7322, + "step": 11992 + }, + { + "epoch": 0.9868751285743674, + "grad_norm": 2.3404045590670552, + "learning_rate": 8.978265289015797e-09, + "loss": 0.6879, + "step": 11993 + }, + { + "epoch": 0.9869574161695125, + "grad_norm": 2.1170634881046615, + "learning_rate": 8.865702911124985e-09, + "loss": 0.7236, + "step": 11994 + }, + { + "epoch": 0.9870397037646574, + "grad_norm": 1.80024304431302, + "learning_rate": 8.75385028743736e-09, + "loss": 0.6962, + "step": 11995 + }, + { + "epoch": 0.9871219913598025, + "grad_norm": 0.4370552412708593, + "learning_rate": 8.642707425896568e-09, + "loss": 0.4662, + "step": 11996 + }, + { + "epoch": 0.9872042789549476, + "grad_norm": 1.8628861924196731, + "learning_rate": 8.532274334398516e-09, + "loss": 0.6757, + "step": 11997 + }, + { + "epoch": 0.9872865665500926, + "grad_norm": 2.103898136742355, + "learning_rate": 8.42255102079026e-09, + "loss": 0.7134, + "step": 11998 + }, + { + "epoch": 0.9873688541452376, + "grad_norm": 1.585711971493033, + "learning_rate": 8.313537492863344e-09, + "loss": 0.697, + "step": 11999 + }, + { + "epoch": 0.9874511417403826, + "grad_norm": 2.014681831642436, + "learning_rate": 8.205233758363796e-09, + "loss": 0.6656, + "step": 12000 + }, + { + "epoch": 0.9875334293355277, + "grad_norm": 0.4355481499713039, + "learning_rate": 8.097639824985459e-09, + "loss": 0.488, + "step": 12001 + }, + { + "epoch": 0.9876157169306727, + "grad_norm": 2.440584655686994, + "learning_rate": 7.990755700371111e-09, + "loss": 0.7264, + "step": 12002 + }, + { + "epoch": 0.9876980045258177, + "grad_norm": 2.765629567984163, + "learning_rate": 7.884581392113565e-09, + "loss": 0.6972, + "step": 12003 + }, + { + "epoch": 0.9877802921209627, + "grad_norm": 1.8002228616138822, + "learning_rate": 7.779116907754569e-09, + "loss": 0.7089, + "step": 12004 + }, + { + "epoch": 0.9878625797161078, + "grad_norm": 1.908775823185005, + "learning_rate": 7.674362254788125e-09, + "loss": 0.7036, + "step": 12005 + }, + { + "epoch": 0.9879448673112529, + "grad_norm": 2.4399349825720447, + "learning_rate": 7.57031744065384e-09, + "loss": 0.7071, + "step": 12006 + }, + { + "epoch": 0.9880271549063978, + "grad_norm": 3.590808488992488, + "learning_rate": 7.466982472743578e-09, + "loss": 0.7133, + "step": 12007 + }, + { + "epoch": 0.9881094425015429, + "grad_norm": 2.59165697425964, + "learning_rate": 7.3643573583981335e-09, + "loss": 0.7053, + "step": 12008 + }, + { + "epoch": 0.9881917300966879, + "grad_norm": 1.8899212290622365, + "learning_rate": 7.26244210490834e-09, + "loss": 0.7153, + "step": 12009 + }, + { + "epoch": 0.988274017691833, + "grad_norm": 0.419259871737911, + "learning_rate": 7.161236719512854e-09, + "loss": 0.4807, + "step": 12010 + }, + { + "epoch": 0.988356305286978, + "grad_norm": 1.8455770698519138, + "learning_rate": 7.060741209402588e-09, + "loss": 0.6904, + "step": 12011 + }, + { + "epoch": 0.988438592882123, + "grad_norm": 2.387947259878761, + "learning_rate": 6.960955581716278e-09, + "loss": 0.7226, + "step": 12012 + }, + { + "epoch": 0.988520880477268, + "grad_norm": 1.600651506228458, + "learning_rate": 6.861879843541586e-09, + "loss": 0.6818, + "step": 12013 + }, + { + "epoch": 0.9886031680724131, + "grad_norm": 2.2226725610523497, + "learning_rate": 6.763514001917326e-09, + "loss": 0.6884, + "step": 12014 + }, + { + "epoch": 0.9886854556675582, + "grad_norm": 2.0118110199886847, + "learning_rate": 6.665858063831243e-09, + "loss": 0.7224, + "step": 12015 + }, + { + "epoch": 0.9887677432627031, + "grad_norm": 2.809207334384299, + "learning_rate": 6.56891203622112e-09, + "loss": 0.7115, + "step": 12016 + }, + { + "epoch": 0.9888500308578482, + "grad_norm": 2.2748141640304858, + "learning_rate": 6.47267592597256e-09, + "loss": 0.7097, + "step": 12017 + }, + { + "epoch": 0.9889323184529932, + "grad_norm": 0.4126702826777897, + "learning_rate": 6.377149739923428e-09, + "loss": 0.4529, + "step": 12018 + }, + { + "epoch": 0.9890146060481383, + "grad_norm": 1.8652945243773542, + "learning_rate": 6.282333484858294e-09, + "loss": 0.7027, + "step": 12019 + }, + { + "epoch": 0.9890968936432832, + "grad_norm": 1.9544543211873966, + "learning_rate": 6.188227167513994e-09, + "loss": 0.7098, + "step": 12020 + }, + { + "epoch": 0.9891791812384283, + "grad_norm": 2.1566969642706897, + "learning_rate": 6.09483079457629e-09, + "loss": 0.7315, + "step": 12021 + }, + { + "epoch": 0.9892614688335734, + "grad_norm": 2.029349275221569, + "learning_rate": 6.002144372677654e-09, + "loss": 0.7113, + "step": 12022 + }, + { + "epoch": 0.9893437564287184, + "grad_norm": 2.28006513432975, + "learning_rate": 5.9101679084039294e-09, + "loss": 0.7026, + "step": 12023 + }, + { + "epoch": 0.9894260440238634, + "grad_norm": 1.911032337662065, + "learning_rate": 5.81890140828878e-09, + "loss": 0.6767, + "step": 12024 + }, + { + "epoch": 0.9895083316190084, + "grad_norm": 1.880740595120678, + "learning_rate": 5.728344878815905e-09, + "loss": 0.7177, + "step": 12025 + }, + { + "epoch": 0.9895906192141535, + "grad_norm": 0.41965683538541587, + "learning_rate": 5.638498326417941e-09, + "loss": 0.4572, + "step": 12026 + }, + { + "epoch": 0.9896729068092985, + "grad_norm": 0.42063139215843376, + "learning_rate": 5.549361757477556e-09, + "loss": 0.4702, + "step": 12027 + }, + { + "epoch": 0.9897551944044435, + "grad_norm": 1.7949257781671446, + "learning_rate": 5.460935178326354e-09, + "loss": 0.6938, + "step": 12028 + }, + { + "epoch": 0.9898374819995885, + "grad_norm": 2.836486197389585, + "learning_rate": 5.373218595245977e-09, + "loss": 0.737, + "step": 12029 + }, + { + "epoch": 0.9899197695947336, + "grad_norm": 1.856387005518697, + "learning_rate": 5.286212014468106e-09, + "loss": 0.7163, + "step": 12030 + }, + { + "epoch": 0.9900020571898787, + "grad_norm": 1.8445204696340716, + "learning_rate": 5.199915442174463e-09, + "loss": 0.6939, + "step": 12031 + }, + { + "epoch": 0.9900843447850236, + "grad_norm": 2.189775790435511, + "learning_rate": 5.1143288844934805e-09, + "loss": 0.6872, + "step": 12032 + }, + { + "epoch": 0.9901666323801687, + "grad_norm": 1.9262572211348683, + "learning_rate": 5.029452347506958e-09, + "loss": 0.6985, + "step": 12033 + }, + { + "epoch": 0.9902489199753137, + "grad_norm": 1.8265731968488872, + "learning_rate": 4.9452858372434074e-09, + "loss": 0.7178, + "step": 12034 + }, + { + "epoch": 0.9903312075704588, + "grad_norm": 0.4232028296257527, + "learning_rate": 4.861829359682491e-09, + "loss": 0.4667, + "step": 12035 + }, + { + "epoch": 0.9904134951656037, + "grad_norm": 2.0621729220352023, + "learning_rate": 4.779082920751688e-09, + "loss": 0.7075, + "step": 12036 + }, + { + "epoch": 0.9904957827607488, + "grad_norm": 2.405801917828205, + "learning_rate": 4.697046526329629e-09, + "loss": 0.714, + "step": 12037 + }, + { + "epoch": 0.9905780703558938, + "grad_norm": 2.095629034243118, + "learning_rate": 4.6157201822438765e-09, + "loss": 0.7056, + "step": 12038 + }, + { + "epoch": 0.9906603579510389, + "grad_norm": 1.7577357830094564, + "learning_rate": 4.5351038942731405e-09, + "loss": 0.72, + "step": 12039 + }, + { + "epoch": 0.990742645546184, + "grad_norm": 1.9375305069352475, + "learning_rate": 4.455197668142841e-09, + "loss": 0.6899, + "step": 12040 + }, + { + "epoch": 0.9908249331413289, + "grad_norm": 2.039246556428437, + "learning_rate": 4.3760015095295486e-09, + "loss": 0.7412, + "step": 12041 + }, + { + "epoch": 0.990907220736474, + "grad_norm": 1.9197827780990477, + "learning_rate": 4.2975154240598725e-09, + "loss": 0.7079, + "step": 12042 + }, + { + "epoch": 0.990989508331619, + "grad_norm": 2.180658158494007, + "learning_rate": 4.219739417309354e-09, + "loss": 0.7064, + "step": 12043 + }, + { + "epoch": 0.9910717959267641, + "grad_norm": 1.9672434045323288, + "learning_rate": 4.142673494801352e-09, + "loss": 0.7279, + "step": 12044 + }, + { + "epoch": 0.991154083521909, + "grad_norm": 1.854131289168204, + "learning_rate": 4.0663176620125974e-09, + "loss": 0.7096, + "step": 12045 + }, + { + "epoch": 0.9912363711170541, + "grad_norm": 3.298265849113113, + "learning_rate": 3.990671924366529e-09, + "loss": 0.7381, + "step": 12046 + }, + { + "epoch": 0.9913186587121992, + "grad_norm": 1.754022379242701, + "learning_rate": 3.915736287236627e-09, + "loss": 0.7031, + "step": 12047 + }, + { + "epoch": 0.9914009463073442, + "grad_norm": 1.774621170036306, + "learning_rate": 3.84151075594641e-09, + "loss": 0.7052, + "step": 12048 + }, + { + "epoch": 0.9914832339024892, + "grad_norm": 2.051332158770129, + "learning_rate": 3.767995335769437e-09, + "loss": 0.7205, + "step": 12049 + }, + { + "epoch": 0.9915655214976342, + "grad_norm": 1.8700625203139725, + "learning_rate": 3.695190031927087e-09, + "loss": 0.6827, + "step": 12050 + }, + { + "epoch": 0.9916478090927793, + "grad_norm": 1.6983387469352702, + "learning_rate": 3.6230948495918906e-09, + "loss": 0.6929, + "step": 12051 + }, + { + "epoch": 0.9917300966879243, + "grad_norm": 1.762083015494954, + "learning_rate": 3.551709793884195e-09, + "loss": 0.7114, + "step": 12052 + }, + { + "epoch": 0.9918123842830693, + "grad_norm": 1.9299766020633649, + "learning_rate": 3.48103486987772e-09, + "loss": 0.6917, + "step": 12053 + }, + { + "epoch": 0.9918946718782143, + "grad_norm": 2.2258330223547227, + "learning_rate": 3.4110700825895628e-09, + "loss": 0.7048, + "step": 12054 + }, + { + "epoch": 0.9919769594733594, + "grad_norm": 0.4178117955555536, + "learning_rate": 3.341815436993523e-09, + "loss": 0.4652, + "step": 12055 + }, + { + "epoch": 0.9920592470685045, + "grad_norm": 1.8629863274624183, + "learning_rate": 3.2732709380067783e-09, + "loss": 0.7065, + "step": 12056 + }, + { + "epoch": 0.9921415346636494, + "grad_norm": 2.0820224273074044, + "learning_rate": 3.2054365904998776e-09, + "loss": 0.695, + "step": 12057 + }, + { + "epoch": 0.9922238222587945, + "grad_norm": 1.8508817806324531, + "learning_rate": 3.1383123992900776e-09, + "loss": 0.6962, + "step": 12058 + }, + { + "epoch": 0.9923061098539395, + "grad_norm": 2.0236262548184953, + "learning_rate": 3.0718983691480074e-09, + "loss": 0.7158, + "step": 12059 + }, + { + "epoch": 0.9923883974490846, + "grad_norm": 2.2864798898219654, + "learning_rate": 3.0061945047910046e-09, + "loss": 0.7175, + "step": 12060 + }, + { + "epoch": 0.9924706850442295, + "grad_norm": 2.1135672218464734, + "learning_rate": 2.941200810885336e-09, + "loss": 0.735, + "step": 12061 + }, + { + "epoch": 0.9925529726393746, + "grad_norm": 1.9911117058837677, + "learning_rate": 2.87691729204842e-09, + "loss": 0.6872, + "step": 12062 + }, + { + "epoch": 0.9926352602345196, + "grad_norm": 2.3695330790845563, + "learning_rate": 2.8133439528477138e-09, + "loss": 0.7048, + "step": 12063 + }, + { + "epoch": 0.9927175478296647, + "grad_norm": 1.7524046882711521, + "learning_rate": 2.750480797799604e-09, + "loss": 0.6871, + "step": 12064 + }, + { + "epoch": 0.9927998354248098, + "grad_norm": 1.8548363156892609, + "learning_rate": 2.688327831368298e-09, + "loss": 0.6837, + "step": 12065 + }, + { + "epoch": 0.9928821230199547, + "grad_norm": 1.9138244077521682, + "learning_rate": 2.626885057970263e-09, + "loss": 0.7201, + "step": 12066 + }, + { + "epoch": 0.9929644106150998, + "grad_norm": 0.4059703136362695, + "learning_rate": 2.566152481969786e-09, + "loss": 0.4651, + "step": 12067 + }, + { + "epoch": 0.9930466982102448, + "grad_norm": 2.5736195654997256, + "learning_rate": 2.506130107681193e-09, + "loss": 0.6991, + "step": 12068 + }, + { + "epoch": 0.9931289858053899, + "grad_norm": 1.8678793400874907, + "learning_rate": 2.4468179393677403e-09, + "loss": 0.7043, + "step": 12069 + }, + { + "epoch": 0.9932112734005348, + "grad_norm": 1.9571895790605611, + "learning_rate": 2.388215981244946e-09, + "loss": 0.714, + "step": 12070 + }, + { + "epoch": 0.9932935609956799, + "grad_norm": 1.9545717615750122, + "learning_rate": 2.330324237473924e-09, + "loss": 0.7271, + "step": 12071 + }, + { + "epoch": 0.993375848590825, + "grad_norm": 0.43448009375308466, + "learning_rate": 2.2731427121669425e-09, + "loss": 0.4651, + "step": 12072 + }, + { + "epoch": 0.99345813618597, + "grad_norm": 2.316918622465375, + "learning_rate": 2.216671409388527e-09, + "loss": 0.7353, + "step": 12073 + }, + { + "epoch": 0.993540423781115, + "grad_norm": 0.4075256989781204, + "learning_rate": 2.160910333147692e-09, + "loss": 0.4528, + "step": 12074 + }, + { + "epoch": 0.99362271137626, + "grad_norm": 1.5907127924590678, + "learning_rate": 2.105859487406825e-09, + "loss": 0.694, + "step": 12075 + }, + { + "epoch": 0.9937049989714051, + "grad_norm": 0.4046555394159082, + "learning_rate": 2.0515188760761306e-09, + "loss": 0.4641, + "step": 12076 + }, + { + "epoch": 0.9937872865665501, + "grad_norm": 1.9696820574456613, + "learning_rate": 1.9978885030158546e-09, + "loss": 0.7385, + "step": 12077 + }, + { + "epoch": 0.9938695741616951, + "grad_norm": 2.2130667669108344, + "learning_rate": 1.9449683720373923e-09, + "loss": 0.7129, + "step": 12078 + }, + { + "epoch": 0.9939518617568401, + "grad_norm": 2.1377406054401042, + "learning_rate": 1.892758486897739e-09, + "loss": 0.721, + "step": 12079 + }, + { + "epoch": 0.9940341493519852, + "grad_norm": 1.6595884817079822, + "learning_rate": 1.8412588513072594e-09, + "loss": 0.7244, + "step": 12080 + }, + { + "epoch": 0.9941164369471303, + "grad_norm": 2.086166246322125, + "learning_rate": 1.7904694689241387e-09, + "loss": 0.7328, + "step": 12081 + }, + { + "epoch": 0.9941987245422752, + "grad_norm": 1.8392037207890441, + "learning_rate": 1.7403903433566016e-09, + "loss": 0.6994, + "step": 12082 + }, + { + "epoch": 0.9942810121374203, + "grad_norm": 1.7730096289548989, + "learning_rate": 1.6910214781618028e-09, + "loss": 0.7091, + "step": 12083 + }, + { + "epoch": 0.9943632997325653, + "grad_norm": 0.41873770989778236, + "learning_rate": 1.6423628768458266e-09, + "loss": 0.4476, + "step": 12084 + }, + { + "epoch": 0.9944455873277104, + "grad_norm": 1.9702433858023916, + "learning_rate": 1.5944145428670178e-09, + "loss": 0.7223, + "step": 12085 + }, + { + "epoch": 0.9945278749228553, + "grad_norm": 1.9893068745561935, + "learning_rate": 1.5471764796315403e-09, + "loss": 0.6909, + "step": 12086 + }, + { + "epoch": 0.9946101625180004, + "grad_norm": 1.8022474157832822, + "learning_rate": 1.5006486904944883e-09, + "loss": 0.7045, + "step": 12087 + }, + { + "epoch": 0.9946924501131454, + "grad_norm": 2.7435861553102647, + "learning_rate": 1.4548311787598856e-09, + "loss": 0.6994, + "step": 12088 + }, + { + "epoch": 0.9947747377082905, + "grad_norm": 2.6894288281618564, + "learning_rate": 1.4097239476851266e-09, + "loss": 0.6903, + "step": 12089 + }, + { + "epoch": 0.9948570253034356, + "grad_norm": 1.8743154095680825, + "learning_rate": 1.3653270004720943e-09, + "loss": 0.7231, + "step": 12090 + }, + { + "epoch": 0.9949393128985805, + "grad_norm": 1.8599511169574863, + "learning_rate": 1.3216403402771527e-09, + "loss": 0.7074, + "step": 12091 + }, + { + "epoch": 0.9950216004937256, + "grad_norm": 2.0260204492601357, + "learning_rate": 1.278663970201155e-09, + "loss": 0.6983, + "step": 12092 + }, + { + "epoch": 0.9951038880888706, + "grad_norm": 1.9530515849660015, + "learning_rate": 1.2363978932983245e-09, + "loss": 0.6883, + "step": 12093 + }, + { + "epoch": 0.9951861756840157, + "grad_norm": 1.985234934767335, + "learning_rate": 1.1948421125718146e-09, + "loss": 0.7139, + "step": 12094 + }, + { + "epoch": 0.9952684632791606, + "grad_norm": 2.176474016361572, + "learning_rate": 1.1539966309725982e-09, + "loss": 0.7063, + "step": 12095 + }, + { + "epoch": 0.9953507508743057, + "grad_norm": 2.6699415041462076, + "learning_rate": 1.113861451402798e-09, + "loss": 0.7133, + "step": 12096 + }, + { + "epoch": 0.9954330384694507, + "grad_norm": 1.8758452097184888, + "learning_rate": 1.0744365767123565e-09, + "loss": 0.7064, + "step": 12097 + }, + { + "epoch": 0.9955153260645958, + "grad_norm": 2.3413462095774373, + "learning_rate": 1.0357220097045872e-09, + "loss": 0.7144, + "step": 12098 + }, + { + "epoch": 0.9955976136597408, + "grad_norm": 3.390117093814026, + "learning_rate": 9.977177531272918e-10, + "loss": 0.6951, + "step": 12099 + }, + { + "epoch": 0.9956799012548858, + "grad_norm": 2.1312202756693286, + "learning_rate": 9.60423809681643e-10, + "loss": 0.7073, + "step": 12100 + }, + { + "epoch": 0.9957621888500309, + "grad_norm": 1.9071568445131275, + "learning_rate": 9.238401820155229e-10, + "loss": 0.7313, + "step": 12101 + }, + { + "epoch": 0.9958444764451759, + "grad_norm": 1.8708642095247343, + "learning_rate": 8.879668727290736e-10, + "loss": 0.703, + "step": 12102 + }, + { + "epoch": 0.9959267640403209, + "grad_norm": 1.7810229380886617, + "learning_rate": 8.528038843702569e-10, + "loss": 0.7189, + "step": 12103 + }, + { + "epoch": 0.9960090516354659, + "grad_norm": 1.9122332774232658, + "learning_rate": 8.183512194370747e-10, + "loss": 0.7046, + "step": 12104 + }, + { + "epoch": 0.996091339230611, + "grad_norm": 2.2004302941501397, + "learning_rate": 7.846088803775687e-10, + "loss": 0.6913, + "step": 12105 + }, + { + "epoch": 0.996173626825756, + "grad_norm": 1.821450072724091, + "learning_rate": 7.515768695876002e-10, + "loss": 0.7227, + "step": 12106 + }, + { + "epoch": 0.996255914420901, + "grad_norm": 0.41843295364315386, + "learning_rate": 7.192551894130706e-10, + "loss": 0.4766, + "step": 12107 + }, + { + "epoch": 0.9963382020160461, + "grad_norm": 1.867948395629928, + "learning_rate": 6.876438421521414e-10, + "loss": 0.7038, + "step": 12108 + }, + { + "epoch": 0.9964204896111911, + "grad_norm": 2.3086805015280607, + "learning_rate": 6.567428300496836e-10, + "loss": 0.7095, + "step": 12109 + }, + { + "epoch": 0.9965027772063362, + "grad_norm": 1.8360486854570164, + "learning_rate": 6.265521553006082e-10, + "loss": 0.7069, + "step": 12110 + }, + { + "epoch": 0.9965850648014811, + "grad_norm": 1.6936120931893883, + "learning_rate": 5.970718200487557e-10, + "loss": 0.7422, + "step": 12111 + }, + { + "epoch": 0.9966673523966262, + "grad_norm": 2.099650541285875, + "learning_rate": 5.683018263902273e-10, + "loss": 0.7334, + "step": 12112 + }, + { + "epoch": 0.9967496399917712, + "grad_norm": 1.8548997293383709, + "learning_rate": 5.402421763678334e-10, + "loss": 0.6923, + "step": 12113 + }, + { + "epoch": 0.9968319275869163, + "grad_norm": 1.8452076421260866, + "learning_rate": 5.128928719744241e-10, + "loss": 0.6949, + "step": 12114 + }, + { + "epoch": 0.9969142151820614, + "grad_norm": 2.1770301618812793, + "learning_rate": 4.8625391515289e-10, + "loss": 0.6984, + "step": 12115 + }, + { + "epoch": 0.9969965027772063, + "grad_norm": 0.3998843119139332, + "learning_rate": 4.6032530779616115e-10, + "loss": 0.4597, + "step": 12116 + }, + { + "epoch": 0.9970787903723514, + "grad_norm": 1.9897770004143867, + "learning_rate": 4.3510705174609757e-10, + "loss": 0.7113, + "step": 12117 + }, + { + "epoch": 0.9971610779674964, + "grad_norm": 0.4113434346919404, + "learning_rate": 4.1059914879459927e-10, + "loss": 0.4257, + "step": 12118 + }, + { + "epoch": 0.9972433655626415, + "grad_norm": 2.086164293791303, + "learning_rate": 3.8680160068138573e-10, + "loss": 0.7164, + "step": 12119 + }, + { + "epoch": 0.9973256531577864, + "grad_norm": 2.0163305197395633, + "learning_rate": 3.637144090984368e-10, + "loss": 0.6957, + "step": 12120 + }, + { + "epoch": 0.9974079407529315, + "grad_norm": 0.4167540115626253, + "learning_rate": 3.413375756855519e-10, + "loss": 0.4756, + "step": 12121 + }, + { + "epoch": 0.9974902283480765, + "grad_norm": 2.395733808481312, + "learning_rate": 3.1967110203146023e-10, + "loss": 0.7121, + "step": 12122 + }, + { + "epoch": 0.9975725159432216, + "grad_norm": 1.8259035050781733, + "learning_rate": 2.9871498967493083e-10, + "loss": 0.7215, + "step": 12123 + }, + { + "epoch": 0.9976548035383666, + "grad_norm": 2.3101581941560667, + "learning_rate": 2.784692401069933e-10, + "loss": 0.7436, + "step": 12124 + }, + { + "epoch": 0.9977370911335116, + "grad_norm": 2.2556797248455847, + "learning_rate": 2.589338547631659e-10, + "loss": 0.7596, + "step": 12125 + }, + { + "epoch": 0.9978193787286567, + "grad_norm": 0.42826423042518735, + "learning_rate": 2.4010883503344797e-10, + "loss": 0.5027, + "step": 12126 + }, + { + "epoch": 0.9979016663238017, + "grad_norm": 1.8009720912564666, + "learning_rate": 2.2199418225343772e-10, + "loss": 0.683, + "step": 12127 + }, + { + "epoch": 0.9979839539189467, + "grad_norm": 2.1283616134282326, + "learning_rate": 2.0458989771099392e-10, + "loss": 0.7221, + "step": 12128 + }, + { + "epoch": 0.9980662415140917, + "grad_norm": 1.90404183956531, + "learning_rate": 1.8789598264179477e-10, + "loss": 0.7128, + "step": 12129 + }, + { + "epoch": 0.9981485291092368, + "grad_norm": 2.2040032616588094, + "learning_rate": 1.7191243823266867e-10, + "loss": 0.6966, + "step": 12130 + }, + { + "epoch": 0.9982308167043819, + "grad_norm": 2.2764663199515476, + "learning_rate": 1.5663926561826358e-10, + "loss": 0.7048, + "step": 12131 + }, + { + "epoch": 0.9983131042995268, + "grad_norm": 1.9374807148235162, + "learning_rate": 1.420764658832674e-10, + "loss": 0.6977, + "step": 12132 + }, + { + "epoch": 0.9983953918946719, + "grad_norm": 0.3889551761676263, + "learning_rate": 1.282240400635182e-10, + "loss": 0.4353, + "step": 12133 + }, + { + "epoch": 0.9984776794898169, + "grad_norm": 4.5833109646290575, + "learning_rate": 1.1508198914267354e-10, + "loss": 0.7248, + "step": 12134 + }, + { + "epoch": 0.998559967084962, + "grad_norm": 1.6193938736048847, + "learning_rate": 1.0265031405332081e-10, + "loss": 0.7167, + "step": 12135 + }, + { + "epoch": 0.9986422546801069, + "grad_norm": 1.7368651869787086, + "learning_rate": 9.092901567919755e-11, + "loss": 0.6957, + "step": 12136 + }, + { + "epoch": 0.998724542275252, + "grad_norm": 3.3330823365146562, + "learning_rate": 7.991809485297097e-11, + "loss": 0.7243, + "step": 12137 + }, + { + "epoch": 0.998806829870397, + "grad_norm": 3.09809399408158, + "learning_rate": 6.961755235734835e-11, + "loss": 0.7209, + "step": 12138 + }, + { + "epoch": 0.9988891174655421, + "grad_norm": 0.43134595475437537, + "learning_rate": 6.002738892285642e-11, + "loss": 0.4643, + "step": 12139 + }, + { + "epoch": 0.9989714050606872, + "grad_norm": 1.6841961460114823, + "learning_rate": 5.1147605231172127e-11, + "loss": 0.7015, + "step": 12140 + }, + { + "epoch": 0.9990536926558321, + "grad_norm": 1.6935918626542048, + "learning_rate": 4.2978201914012364e-11, + "loss": 0.7124, + "step": 12141 + }, + { + "epoch": 0.9991359802509772, + "grad_norm": 1.9778267610567235, + "learning_rate": 3.551917955202377e-11, + "loss": 0.6653, + "step": 12142 + }, + { + "epoch": 0.9992182678461222, + "grad_norm": 1.951141794650468, + "learning_rate": 2.8770538672562298e-11, + "loss": 0.7269, + "step": 12143 + }, + { + "epoch": 0.9993005554412673, + "grad_norm": 1.944994963934807, + "learning_rate": 2.2732279757464725e-11, + "loss": 0.7137, + "step": 12144 + }, + { + "epoch": 0.9993828430364122, + "grad_norm": 1.8832679707064368, + "learning_rate": 1.7404403235277145e-11, + "loss": 0.7077, + "step": 12145 + }, + { + "epoch": 0.9994651306315573, + "grad_norm": 1.9107399180535656, + "learning_rate": 1.2786909483475385e-11, + "loss": 0.7218, + "step": 12146 + }, + { + "epoch": 0.9995474182267023, + "grad_norm": 1.972565695798252, + "learning_rate": 8.87979883068546e-12, + "loss": 0.6992, + "step": 12147 + }, + { + "epoch": 0.9996297058218474, + "grad_norm": 2.0492683488258336, + "learning_rate": 5.6830715544631265e-12, + "loss": 0.6872, + "step": 12148 + }, + { + "epoch": 0.9997119934169924, + "grad_norm": 1.70073536396772, + "learning_rate": 3.196727881293882e-12, + "loss": 0.7151, + "step": 12149 + }, + { + "epoch": 0.9997942810121374, + "grad_norm": 1.9053158566343649, + "learning_rate": 1.4207679899236326e-12, + "loss": 0.7303, + "step": 12150 + }, + { + "epoch": 0.9998765686072825, + "grad_norm": 1.7303541094004005, + "learning_rate": 3.551920035871348e-13, + "loss": 0.6865, + "step": 12151 + }, + { + "epoch": 0.9999588562024275, + "grad_norm": 2.0548232024360455, + "learning_rate": 0.0, + "loss": 0.7214, + "step": 12152 + }, + { + "epoch": 0.9999588562024275, + "step": 12152, + "total_flos": 2.8417143434379264e+16, + "train_loss": 0.7377270700421045, + "train_runtime": 136202.0352, + "train_samples_per_second": 45.683, + "train_steps_per_second": 0.089 + } + ], + "logging_steps": 1.0, + "max_steps": 12152, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 608, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.8417143434379264e+16, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}