diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,55418 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 500, + "global_step": 7895, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.000253324889170361, + "grad_norm": 12.183216094970703, + "learning_rate": 1.0131712259371835e-08, + "loss": 1.4559, + "step": 1 + }, + { + "epoch": 0.000506649778340722, + "grad_norm": 10.059121131896973, + "learning_rate": 2.026342451874367e-08, + "loss": 1.5302, + "step": 2 + }, + { + "epoch": 0.0007599746675110829, + "grad_norm": 10.393256187438965, + "learning_rate": 3.0395136778115507e-08, + "loss": 1.421, + "step": 3 + }, + { + "epoch": 0.001013299556681444, + "grad_norm": 10.55001449584961, + "learning_rate": 4.052684903748734e-08, + "loss": 1.3825, + "step": 4 + }, + { + "epoch": 0.001266624445851805, + "grad_norm": 9.996969223022461, + "learning_rate": 5.0658561296859173e-08, + "loss": 1.3259, + "step": 5 + }, + { + "epoch": 0.0015199493350221659, + "grad_norm": 14.385526657104492, + "learning_rate": 6.079027355623101e-08, + "loss": 1.36, + "step": 6 + }, + { + "epoch": 0.001773274224192527, + "grad_norm": 10.932132720947266, + "learning_rate": 7.092198581560284e-08, + "loss": 1.5607, + "step": 7 + }, + { + "epoch": 0.002026599113362888, + "grad_norm": 11.283689498901367, + "learning_rate": 8.105369807497468e-08, + "loss": 1.5634, + "step": 8 + }, + { + "epoch": 0.002279924002533249, + "grad_norm": 12.34595775604248, + "learning_rate": 9.118541033434651e-08, + "loss": 1.5305, + "step": 9 + }, + { + "epoch": 0.00253324889170361, + "grad_norm": 12.212746620178223, + "learning_rate": 1.0131712259371835e-07, + "loss": 1.5121, + "step": 10 + }, + { + "epoch": 0.002786573780873971, + "grad_norm": 14.221822738647461, + "learning_rate": 1.1144883485309017e-07, + "loss": 1.6522, + "step": 11 + }, + { + "epoch": 0.0030398986700443317, + "grad_norm": 10.095308303833008, + "learning_rate": 1.2158054711246203e-07, + "loss": 1.3202, + "step": 12 + }, + { + "epoch": 0.003293223559214693, + "grad_norm": 12.034480094909668, + "learning_rate": 1.3171225937183385e-07, + "loss": 1.5278, + "step": 13 + }, + { + "epoch": 0.003546548448385054, + "grad_norm": 10.577567100524902, + "learning_rate": 1.4184397163120568e-07, + "loss": 1.4612, + "step": 14 + }, + { + "epoch": 0.0037998733375554147, + "grad_norm": 12.96672248840332, + "learning_rate": 1.5197568389057753e-07, + "loss": 1.4318, + "step": 15 + }, + { + "epoch": 0.004053198226725776, + "grad_norm": 10.878087997436523, + "learning_rate": 1.6210739614994936e-07, + "loss": 1.4786, + "step": 16 + }, + { + "epoch": 0.004306523115896137, + "grad_norm": 15.642341613769531, + "learning_rate": 1.722391084093212e-07, + "loss": 1.5879, + "step": 17 + }, + { + "epoch": 0.004559848005066498, + "grad_norm": 12.503395080566406, + "learning_rate": 1.8237082066869301e-07, + "loss": 1.3623, + "step": 18 + }, + { + "epoch": 0.004813172894236859, + "grad_norm": 10.388249397277832, + "learning_rate": 1.9250253292806487e-07, + "loss": 1.4227, + "step": 19 + }, + { + "epoch": 0.00506649778340722, + "grad_norm": 14.328523635864258, + "learning_rate": 2.026342451874367e-07, + "loss": 1.6519, + "step": 20 + }, + { + "epoch": 0.0053198226725775805, + "grad_norm": 14.44335651397705, + "learning_rate": 2.1276595744680852e-07, + "loss": 1.5192, + "step": 21 + }, + { + "epoch": 0.005573147561747942, + "grad_norm": 12.124639511108398, + "learning_rate": 2.2289766970618035e-07, + "loss": 1.3475, + "step": 22 + }, + { + "epoch": 0.005826472450918303, + "grad_norm": 10.395844459533691, + "learning_rate": 2.330293819655522e-07, + "loss": 1.3104, + "step": 23 + }, + { + "epoch": 0.0060797973400886635, + "grad_norm": 11.978984832763672, + "learning_rate": 2.4316109422492405e-07, + "loss": 1.444, + "step": 24 + }, + { + "epoch": 0.006333122229259025, + "grad_norm": 11.570704460144043, + "learning_rate": 2.532928064842959e-07, + "loss": 1.5421, + "step": 25 + }, + { + "epoch": 0.006586447118429386, + "grad_norm": 10.934496879577637, + "learning_rate": 2.634245187436677e-07, + "loss": 1.3876, + "step": 26 + }, + { + "epoch": 0.006839772007599746, + "grad_norm": 9.044105529785156, + "learning_rate": 2.7355623100303953e-07, + "loss": 1.4284, + "step": 27 + }, + { + "epoch": 0.007093096896770108, + "grad_norm": 8.541162490844727, + "learning_rate": 2.8368794326241136e-07, + "loss": 1.4057, + "step": 28 + }, + { + "epoch": 0.007346421785940469, + "grad_norm": 12.932791709899902, + "learning_rate": 2.938196555217832e-07, + "loss": 1.5905, + "step": 29 + }, + { + "epoch": 0.007599746675110829, + "grad_norm": 9.997332572937012, + "learning_rate": 3.0395136778115507e-07, + "loss": 1.468, + "step": 30 + }, + { + "epoch": 0.007853071564281191, + "grad_norm": 8.133085250854492, + "learning_rate": 3.140830800405269e-07, + "loss": 1.4008, + "step": 31 + }, + { + "epoch": 0.008106396453451552, + "grad_norm": 8.5535249710083, + "learning_rate": 3.242147922998987e-07, + "loss": 1.3891, + "step": 32 + }, + { + "epoch": 0.008359721342621912, + "grad_norm": 8.260271072387695, + "learning_rate": 3.3434650455927055e-07, + "loss": 1.4133, + "step": 33 + }, + { + "epoch": 0.008613046231792274, + "grad_norm": 10.480013847351074, + "learning_rate": 3.444782168186424e-07, + "loss": 1.4304, + "step": 34 + }, + { + "epoch": 0.008866371120962635, + "grad_norm": 7.807279109954834, + "learning_rate": 3.5460992907801425e-07, + "loss": 1.5742, + "step": 35 + }, + { + "epoch": 0.009119696010132995, + "grad_norm": 8.243898391723633, + "learning_rate": 3.6474164133738603e-07, + "loss": 1.3216, + "step": 36 + }, + { + "epoch": 0.009373020899303357, + "grad_norm": 8.004374504089355, + "learning_rate": 3.748733535967579e-07, + "loss": 1.4215, + "step": 37 + }, + { + "epoch": 0.009626345788473718, + "grad_norm": 7.767487049102783, + "learning_rate": 3.8500506585612973e-07, + "loss": 1.4052, + "step": 38 + }, + { + "epoch": 0.009879670677644078, + "grad_norm": 7.188638210296631, + "learning_rate": 3.9513677811550156e-07, + "loss": 1.3566, + "step": 39 + }, + { + "epoch": 0.01013299556681444, + "grad_norm": 10.821622848510742, + "learning_rate": 4.052684903748734e-07, + "loss": 1.3463, + "step": 40 + }, + { + "epoch": 0.010386320455984801, + "grad_norm": 8.293659210205078, + "learning_rate": 4.1540020263424527e-07, + "loss": 1.4509, + "step": 41 + }, + { + "epoch": 0.010639645345155161, + "grad_norm": 6.990182876586914, + "learning_rate": 4.2553191489361704e-07, + "loss": 1.4913, + "step": 42 + }, + { + "epoch": 0.010892970234325523, + "grad_norm": 7.317448139190674, + "learning_rate": 4.356636271529889e-07, + "loss": 1.3197, + "step": 43 + }, + { + "epoch": 0.011146295123495884, + "grad_norm": 7.312219619750977, + "learning_rate": 4.457953394123607e-07, + "loss": 1.3949, + "step": 44 + }, + { + "epoch": 0.011399620012666244, + "grad_norm": 7.905741214752197, + "learning_rate": 4.5592705167173257e-07, + "loss": 1.4606, + "step": 45 + }, + { + "epoch": 0.011652944901836606, + "grad_norm": 7.263676166534424, + "learning_rate": 4.660587639311044e-07, + "loss": 1.4277, + "step": 46 + }, + { + "epoch": 0.011906269791006967, + "grad_norm": 7.631997585296631, + "learning_rate": 4.7619047619047623e-07, + "loss": 1.3829, + "step": 47 + }, + { + "epoch": 0.012159594680177327, + "grad_norm": 7.328865051269531, + "learning_rate": 4.863221884498481e-07, + "loss": 1.4042, + "step": 48 + }, + { + "epoch": 0.012412919569347688, + "grad_norm": 6.723066806793213, + "learning_rate": 4.964539007092199e-07, + "loss": 1.3893, + "step": 49 + }, + { + "epoch": 0.01266624445851805, + "grad_norm": 7.137428283691406, + "learning_rate": 5.065856129685918e-07, + "loss": 1.3351, + "step": 50 + }, + { + "epoch": 0.01291956934768841, + "grad_norm": 5.965123176574707, + "learning_rate": 5.167173252279636e-07, + "loss": 1.3512, + "step": 51 + }, + { + "epoch": 0.013172894236858771, + "grad_norm": 6.249057769775391, + "learning_rate": 5.268490374873354e-07, + "loss": 1.2029, + "step": 52 + }, + { + "epoch": 0.013426219126029133, + "grad_norm": 7.059906005859375, + "learning_rate": 5.369807497467072e-07, + "loss": 1.412, + "step": 53 + }, + { + "epoch": 0.013679544015199493, + "grad_norm": 6.9676947593688965, + "learning_rate": 5.471124620060791e-07, + "loss": 1.2606, + "step": 54 + }, + { + "epoch": 0.013932868904369854, + "grad_norm": 6.916459083557129, + "learning_rate": 5.572441742654509e-07, + "loss": 1.3164, + "step": 55 + }, + { + "epoch": 0.014186193793540216, + "grad_norm": 6.74832820892334, + "learning_rate": 5.673758865248227e-07, + "loss": 1.4729, + "step": 56 + }, + { + "epoch": 0.014439518682710576, + "grad_norm": 6.46651029586792, + "learning_rate": 5.775075987841945e-07, + "loss": 1.4083, + "step": 57 + }, + { + "epoch": 0.014692843571880937, + "grad_norm": 6.30950927734375, + "learning_rate": 5.876393110435664e-07, + "loss": 1.2164, + "step": 58 + }, + { + "epoch": 0.014946168461051299, + "grad_norm": 6.412931442260742, + "learning_rate": 5.977710233029382e-07, + "loss": 1.2824, + "step": 59 + }, + { + "epoch": 0.015199493350221659, + "grad_norm": 5.739402770996094, + "learning_rate": 6.079027355623101e-07, + "loss": 1.3378, + "step": 60 + }, + { + "epoch": 0.01545281823939202, + "grad_norm": 5.976473808288574, + "learning_rate": 6.180344478216819e-07, + "loss": 1.2651, + "step": 61 + }, + { + "epoch": 0.015706143128562382, + "grad_norm": 6.36185359954834, + "learning_rate": 6.281661600810538e-07, + "loss": 1.3197, + "step": 62 + }, + { + "epoch": 0.015959468017732743, + "grad_norm": 5.828892707824707, + "learning_rate": 6.382978723404255e-07, + "loss": 1.2424, + "step": 63 + }, + { + "epoch": 0.016212792906903105, + "grad_norm": 6.295270919799805, + "learning_rate": 6.484295845997974e-07, + "loss": 1.223, + "step": 64 + }, + { + "epoch": 0.016466117796073463, + "grad_norm": 6.278206825256348, + "learning_rate": 6.585612968591693e-07, + "loss": 1.2775, + "step": 65 + }, + { + "epoch": 0.016719442685243825, + "grad_norm": 5.356352806091309, + "learning_rate": 6.686930091185411e-07, + "loss": 1.1728, + "step": 66 + }, + { + "epoch": 0.016972767574414186, + "grad_norm": 6.592249393463135, + "learning_rate": 6.788247213779129e-07, + "loss": 1.3805, + "step": 67 + }, + { + "epoch": 0.017226092463584548, + "grad_norm": 6.495695114135742, + "learning_rate": 6.889564336372847e-07, + "loss": 1.4029, + "step": 68 + }, + { + "epoch": 0.01747941735275491, + "grad_norm": 5.936342239379883, + "learning_rate": 6.990881458966566e-07, + "loss": 1.2631, + "step": 69 + }, + { + "epoch": 0.01773274224192527, + "grad_norm": 6.378044128417969, + "learning_rate": 7.092198581560285e-07, + "loss": 1.4561, + "step": 70 + }, + { + "epoch": 0.01798606713109563, + "grad_norm": 6.01347541809082, + "learning_rate": 7.193515704154002e-07, + "loss": 1.3346, + "step": 71 + }, + { + "epoch": 0.01823939202026599, + "grad_norm": 6.118315696716309, + "learning_rate": 7.294832826747721e-07, + "loss": 1.2509, + "step": 72 + }, + { + "epoch": 0.018492716909436352, + "grad_norm": 5.997556686401367, + "learning_rate": 7.39614994934144e-07, + "loss": 1.3574, + "step": 73 + }, + { + "epoch": 0.018746041798606713, + "grad_norm": 5.349987506866455, + "learning_rate": 7.497467071935158e-07, + "loss": 1.2392, + "step": 74 + }, + { + "epoch": 0.018999366687777075, + "grad_norm": 5.7259907722473145, + "learning_rate": 7.598784194528875e-07, + "loss": 1.2385, + "step": 75 + }, + { + "epoch": 0.019252691576947437, + "grad_norm": 6.2682600021362305, + "learning_rate": 7.700101317122595e-07, + "loss": 1.2794, + "step": 76 + }, + { + "epoch": 0.019506016466117795, + "grad_norm": 5.695931911468506, + "learning_rate": 7.801418439716313e-07, + "loss": 1.3075, + "step": 77 + }, + { + "epoch": 0.019759341355288156, + "grad_norm": 5.9824700355529785, + "learning_rate": 7.902735562310031e-07, + "loss": 1.4274, + "step": 78 + }, + { + "epoch": 0.020012666244458518, + "grad_norm": 6.070525646209717, + "learning_rate": 8.00405268490375e-07, + "loss": 1.2768, + "step": 79 + }, + { + "epoch": 0.02026599113362888, + "grad_norm": 6.06186580657959, + "learning_rate": 8.105369807497468e-07, + "loss": 1.2878, + "step": 80 + }, + { + "epoch": 0.02051931602279924, + "grad_norm": 5.997804641723633, + "learning_rate": 8.206686930091186e-07, + "loss": 1.2837, + "step": 81 + }, + { + "epoch": 0.020772640911969602, + "grad_norm": 5.837419509887695, + "learning_rate": 8.308004052684905e-07, + "loss": 1.2875, + "step": 82 + }, + { + "epoch": 0.02102596580113996, + "grad_norm": 6.083469390869141, + "learning_rate": 8.409321175278623e-07, + "loss": 1.3317, + "step": 83 + }, + { + "epoch": 0.021279290690310322, + "grad_norm": 5.9370198249816895, + "learning_rate": 8.510638297872341e-07, + "loss": 1.228, + "step": 84 + }, + { + "epoch": 0.021532615579480684, + "grad_norm": 5.628711700439453, + "learning_rate": 8.611955420466059e-07, + "loss": 1.259, + "step": 85 + }, + { + "epoch": 0.021785940468651045, + "grad_norm": 5.700177192687988, + "learning_rate": 8.713272543059778e-07, + "loss": 1.3854, + "step": 86 + }, + { + "epoch": 0.022039265357821407, + "grad_norm": 5.34316873550415, + "learning_rate": 8.814589665653496e-07, + "loss": 1.1796, + "step": 87 + }, + { + "epoch": 0.02229259024699177, + "grad_norm": 5.681664943695068, + "learning_rate": 8.915906788247214e-07, + "loss": 1.3161, + "step": 88 + }, + { + "epoch": 0.022545915136162126, + "grad_norm": 5.612365245819092, + "learning_rate": 9.017223910840933e-07, + "loss": 1.2537, + "step": 89 + }, + { + "epoch": 0.022799240025332488, + "grad_norm": 6.493178844451904, + "learning_rate": 9.118541033434651e-07, + "loss": 1.3713, + "step": 90 + }, + { + "epoch": 0.02305256491450285, + "grad_norm": 5.3083295822143555, + "learning_rate": 9.219858156028369e-07, + "loss": 1.1649, + "step": 91 + }, + { + "epoch": 0.02330588980367321, + "grad_norm": 6.291053295135498, + "learning_rate": 9.321175278622088e-07, + "loss": 1.2495, + "step": 92 + }, + { + "epoch": 0.023559214692843573, + "grad_norm": 6.047577381134033, + "learning_rate": 9.422492401215806e-07, + "loss": 1.3651, + "step": 93 + }, + { + "epoch": 0.023812539582013934, + "grad_norm": 5.668388843536377, + "learning_rate": 9.523809523809525e-07, + "loss": 1.2696, + "step": 94 + }, + { + "epoch": 0.024065864471184292, + "grad_norm": 5.498602867126465, + "learning_rate": 9.625126646403244e-07, + "loss": 1.1631, + "step": 95 + }, + { + "epoch": 0.024319189360354654, + "grad_norm": 6.081385612487793, + "learning_rate": 9.726443768996962e-07, + "loss": 1.3137, + "step": 96 + }, + { + "epoch": 0.024572514249525015, + "grad_norm": 6.417263507843018, + "learning_rate": 9.827760891590678e-07, + "loss": 1.4252, + "step": 97 + }, + { + "epoch": 0.024825839138695377, + "grad_norm": 5.6804656982421875, + "learning_rate": 9.929078014184399e-07, + "loss": 1.291, + "step": 98 + }, + { + "epoch": 0.02507916402786574, + "grad_norm": 6.759496688842773, + "learning_rate": 1.0030395136778117e-06, + "loss": 1.3338, + "step": 99 + }, + { + "epoch": 0.0253324889170361, + "grad_norm": 5.483371257781982, + "learning_rate": 1.0131712259371835e-06, + "loss": 1.1854, + "step": 100 + }, + { + "epoch": 0.025585813806206458, + "grad_norm": 5.6341681480407715, + "learning_rate": 1.0233029381965553e-06, + "loss": 1.23, + "step": 101 + }, + { + "epoch": 0.02583913869537682, + "grad_norm": 5.76145076751709, + "learning_rate": 1.0334346504559272e-06, + "loss": 1.3177, + "step": 102 + }, + { + "epoch": 0.02609246358454718, + "grad_norm": 5.4306745529174805, + "learning_rate": 1.043566362715299e-06, + "loss": 1.2265, + "step": 103 + }, + { + "epoch": 0.026345788473717543, + "grad_norm": 5.675881862640381, + "learning_rate": 1.0536980749746708e-06, + "loss": 1.1397, + "step": 104 + }, + { + "epoch": 0.026599113362887904, + "grad_norm": 5.60020112991333, + "learning_rate": 1.0638297872340427e-06, + "loss": 1.3454, + "step": 105 + }, + { + "epoch": 0.026852438252058266, + "grad_norm": 5.341634273529053, + "learning_rate": 1.0739614994934145e-06, + "loss": 1.1553, + "step": 106 + }, + { + "epoch": 0.027105763141228624, + "grad_norm": 5.30195426940918, + "learning_rate": 1.0840932117527863e-06, + "loss": 1.2349, + "step": 107 + }, + { + "epoch": 0.027359088030398986, + "grad_norm": 5.831993103027344, + "learning_rate": 1.0942249240121581e-06, + "loss": 1.332, + "step": 108 + }, + { + "epoch": 0.027612412919569347, + "grad_norm": 5.65993595123291, + "learning_rate": 1.10435663627153e-06, + "loss": 1.4381, + "step": 109 + }, + { + "epoch": 0.02786573780873971, + "grad_norm": 5.581968307495117, + "learning_rate": 1.1144883485309018e-06, + "loss": 1.1786, + "step": 110 + }, + { + "epoch": 0.02811906269791007, + "grad_norm": 5.961047649383545, + "learning_rate": 1.1246200607902736e-06, + "loss": 1.2945, + "step": 111 + }, + { + "epoch": 0.028372387587080432, + "grad_norm": 6.274909496307373, + "learning_rate": 1.1347517730496454e-06, + "loss": 1.4267, + "step": 112 + }, + { + "epoch": 0.028625712476250793, + "grad_norm": 5.144623279571533, + "learning_rate": 1.1448834853090175e-06, + "loss": 1.168, + "step": 113 + }, + { + "epoch": 0.02887903736542115, + "grad_norm": 5.619993209838867, + "learning_rate": 1.155015197568389e-06, + "loss": 1.2755, + "step": 114 + }, + { + "epoch": 0.029132362254591513, + "grad_norm": 5.38019323348999, + "learning_rate": 1.165146909827761e-06, + "loss": 1.2547, + "step": 115 + }, + { + "epoch": 0.029385687143761875, + "grad_norm": 5.788126468658447, + "learning_rate": 1.1752786220871327e-06, + "loss": 1.233, + "step": 116 + }, + { + "epoch": 0.029639012032932236, + "grad_norm": 5.342813014984131, + "learning_rate": 1.1854103343465048e-06, + "loss": 1.1753, + "step": 117 + }, + { + "epoch": 0.029892336922102598, + "grad_norm": 5.679368019104004, + "learning_rate": 1.1955420466058764e-06, + "loss": 1.3854, + "step": 118 + }, + { + "epoch": 0.03014566181127296, + "grad_norm": 5.998026371002197, + "learning_rate": 1.2056737588652482e-06, + "loss": 1.3227, + "step": 119 + }, + { + "epoch": 0.030398986700443317, + "grad_norm": 5.865193843841553, + "learning_rate": 1.2158054711246203e-06, + "loss": 1.3186, + "step": 120 + }, + { + "epoch": 0.03065231158961368, + "grad_norm": 5.349485874176025, + "learning_rate": 1.2259371833839919e-06, + "loss": 1.116, + "step": 121 + }, + { + "epoch": 0.03090563647878404, + "grad_norm": 5.940770626068115, + "learning_rate": 1.2360688956433637e-06, + "loss": 1.3233, + "step": 122 + }, + { + "epoch": 0.031158961367954402, + "grad_norm": 5.499122142791748, + "learning_rate": 1.2462006079027357e-06, + "loss": 1.269, + "step": 123 + }, + { + "epoch": 0.031412286257124764, + "grad_norm": 5.793130874633789, + "learning_rate": 1.2563323201621076e-06, + "loss": 1.3234, + "step": 124 + }, + { + "epoch": 0.031665611146295125, + "grad_norm": 5.939444541931152, + "learning_rate": 1.2664640324214794e-06, + "loss": 1.3782, + "step": 125 + }, + { + "epoch": 0.03191893603546549, + "grad_norm": 5.3546013832092285, + "learning_rate": 1.276595744680851e-06, + "loss": 1.2291, + "step": 126 + }, + { + "epoch": 0.03217226092463585, + "grad_norm": 5.786067962646484, + "learning_rate": 1.286727456940223e-06, + "loss": 1.2777, + "step": 127 + }, + { + "epoch": 0.03242558581380621, + "grad_norm": 5.49869441986084, + "learning_rate": 1.2968591691995949e-06, + "loss": 1.2273, + "step": 128 + }, + { + "epoch": 0.032678910702976564, + "grad_norm": 5.4831929206848145, + "learning_rate": 1.3069908814589665e-06, + "loss": 1.2496, + "step": 129 + }, + { + "epoch": 0.032932235592146926, + "grad_norm": 5.68988561630249, + "learning_rate": 1.3171225937183385e-06, + "loss": 1.2683, + "step": 130 + }, + { + "epoch": 0.03318556048131729, + "grad_norm": 4.994076728820801, + "learning_rate": 1.3272543059777104e-06, + "loss": 1.3347, + "step": 131 + }, + { + "epoch": 0.03343888537048765, + "grad_norm": 5.527144432067871, + "learning_rate": 1.3373860182370822e-06, + "loss": 1.3032, + "step": 132 + }, + { + "epoch": 0.03369221025965801, + "grad_norm": 5.677175045013428, + "learning_rate": 1.347517730496454e-06, + "loss": 1.2438, + "step": 133 + }, + { + "epoch": 0.03394553514882837, + "grad_norm": 5.882457256317139, + "learning_rate": 1.3576494427558258e-06, + "loss": 1.3001, + "step": 134 + }, + { + "epoch": 0.034198860037998734, + "grad_norm": 5.756645202636719, + "learning_rate": 1.3677811550151977e-06, + "loss": 1.2909, + "step": 135 + }, + { + "epoch": 0.034452184927169095, + "grad_norm": 5.526420593261719, + "learning_rate": 1.3779128672745695e-06, + "loss": 1.2927, + "step": 136 + }, + { + "epoch": 0.03470550981633946, + "grad_norm": 5.440059185028076, + "learning_rate": 1.3880445795339415e-06, + "loss": 1.3548, + "step": 137 + }, + { + "epoch": 0.03495883470550982, + "grad_norm": 5.624795436859131, + "learning_rate": 1.3981762917933131e-06, + "loss": 1.3193, + "step": 138 + }, + { + "epoch": 0.03521215959468018, + "grad_norm": 6.083158493041992, + "learning_rate": 1.408308004052685e-06, + "loss": 1.3284, + "step": 139 + }, + { + "epoch": 0.03546548448385054, + "grad_norm": 6.338374137878418, + "learning_rate": 1.418439716312057e-06, + "loss": 1.2175, + "step": 140 + }, + { + "epoch": 0.035718809373020896, + "grad_norm": 5.234193801879883, + "learning_rate": 1.4285714285714286e-06, + "loss": 1.1537, + "step": 141 + }, + { + "epoch": 0.03597213426219126, + "grad_norm": 5.26567268371582, + "learning_rate": 1.4387031408308005e-06, + "loss": 1.1837, + "step": 142 + }, + { + "epoch": 0.03622545915136162, + "grad_norm": 5.7893147468566895, + "learning_rate": 1.4488348530901725e-06, + "loss": 1.1652, + "step": 143 + }, + { + "epoch": 0.03647878404053198, + "grad_norm": 5.951411247253418, + "learning_rate": 1.4589665653495441e-06, + "loss": 1.3915, + "step": 144 + }, + { + "epoch": 0.03673210892970234, + "grad_norm": 5.907873630523682, + "learning_rate": 1.4690982776089161e-06, + "loss": 1.3022, + "step": 145 + }, + { + "epoch": 0.036985433818872704, + "grad_norm": 5.551837921142578, + "learning_rate": 1.479229989868288e-06, + "loss": 1.2273, + "step": 146 + }, + { + "epoch": 0.037238758708043065, + "grad_norm": 5.3975443840026855, + "learning_rate": 1.4893617021276596e-06, + "loss": 1.1165, + "step": 147 + }, + { + "epoch": 0.03749208359721343, + "grad_norm": 5.251124382019043, + "learning_rate": 1.4994934143870316e-06, + "loss": 1.2381, + "step": 148 + }, + { + "epoch": 0.03774540848638379, + "grad_norm": 5.477112293243408, + "learning_rate": 1.5096251266464035e-06, + "loss": 1.1766, + "step": 149 + }, + { + "epoch": 0.03799873337555415, + "grad_norm": 5.669548511505127, + "learning_rate": 1.519756838905775e-06, + "loss": 1.3699, + "step": 150 + }, + { + "epoch": 0.03825205826472451, + "grad_norm": 5.682290077209473, + "learning_rate": 1.5298885511651471e-06, + "loss": 1.2886, + "step": 151 + }, + { + "epoch": 0.03850538315389487, + "grad_norm": 5.136720657348633, + "learning_rate": 1.540020263424519e-06, + "loss": 1.1589, + "step": 152 + }, + { + "epoch": 0.03875870804306523, + "grad_norm": 5.364446640014648, + "learning_rate": 1.5501519756838905e-06, + "loss": 1.168, + "step": 153 + }, + { + "epoch": 0.03901203293223559, + "grad_norm": 5.042862415313721, + "learning_rate": 1.5602836879432626e-06, + "loss": 1.1465, + "step": 154 + }, + { + "epoch": 0.03926535782140595, + "grad_norm": 5.456974983215332, + "learning_rate": 1.5704154002026344e-06, + "loss": 1.233, + "step": 155 + }, + { + "epoch": 0.03951868271057631, + "grad_norm": 5.630804538726807, + "learning_rate": 1.5805471124620062e-06, + "loss": 1.0355, + "step": 156 + }, + { + "epoch": 0.039772007599746674, + "grad_norm": 5.544961452484131, + "learning_rate": 1.590678824721378e-06, + "loss": 1.336, + "step": 157 + }, + { + "epoch": 0.040025332488917036, + "grad_norm": 5.7770676612854, + "learning_rate": 1.60081053698075e-06, + "loss": 1.2418, + "step": 158 + }, + { + "epoch": 0.0402786573780874, + "grad_norm": 5.462604999542236, + "learning_rate": 1.6109422492401217e-06, + "loss": 1.1947, + "step": 159 + }, + { + "epoch": 0.04053198226725776, + "grad_norm": 5.253891944885254, + "learning_rate": 1.6210739614994935e-06, + "loss": 1.178, + "step": 160 + }, + { + "epoch": 0.04078530715642812, + "grad_norm": 5.2835001945495605, + "learning_rate": 1.6312056737588656e-06, + "loss": 1.1609, + "step": 161 + }, + { + "epoch": 0.04103863204559848, + "grad_norm": 5.4543914794921875, + "learning_rate": 1.6413373860182372e-06, + "loss": 1.1959, + "step": 162 + }, + { + "epoch": 0.04129195693476884, + "grad_norm": 6.159005641937256, + "learning_rate": 1.651469098277609e-06, + "loss": 1.2731, + "step": 163 + }, + { + "epoch": 0.041545281823939205, + "grad_norm": 5.177642822265625, + "learning_rate": 1.661600810536981e-06, + "loss": 1.1241, + "step": 164 + }, + { + "epoch": 0.04179860671310956, + "grad_norm": 5.764869213104248, + "learning_rate": 1.6717325227963527e-06, + "loss": 1.2032, + "step": 165 + }, + { + "epoch": 0.04205193160227992, + "grad_norm": 5.476707935333252, + "learning_rate": 1.6818642350557245e-06, + "loss": 1.291, + "step": 166 + }, + { + "epoch": 0.04230525649145028, + "grad_norm": 5.529805660247803, + "learning_rate": 1.6919959473150963e-06, + "loss": 1.2466, + "step": 167 + }, + { + "epoch": 0.042558581380620644, + "grad_norm": 5.377219200134277, + "learning_rate": 1.7021276595744682e-06, + "loss": 1.1435, + "step": 168 + }, + { + "epoch": 0.042811906269791006, + "grad_norm": 5.9831013679504395, + "learning_rate": 1.7122593718338402e-06, + "loss": 1.2628, + "step": 169 + }, + { + "epoch": 0.04306523115896137, + "grad_norm": 5.454530715942383, + "learning_rate": 1.7223910840932118e-06, + "loss": 1.2458, + "step": 170 + }, + { + "epoch": 0.04331855604813173, + "grad_norm": 5.665981769561768, + "learning_rate": 1.7325227963525836e-06, + "loss": 1.239, + "step": 171 + }, + { + "epoch": 0.04357188093730209, + "grad_norm": 5.54949426651001, + "learning_rate": 1.7426545086119557e-06, + "loss": 1.144, + "step": 172 + }, + { + "epoch": 0.04382520582647245, + "grad_norm": 5.391991138458252, + "learning_rate": 1.7527862208713273e-06, + "loss": 1.2089, + "step": 173 + }, + { + "epoch": 0.044078530715642814, + "grad_norm": 5.288974285125732, + "learning_rate": 1.7629179331306991e-06, + "loss": 1.2922, + "step": 174 + }, + { + "epoch": 0.044331855604813175, + "grad_norm": 5.138978004455566, + "learning_rate": 1.7730496453900712e-06, + "loss": 1.1125, + "step": 175 + }, + { + "epoch": 0.04458518049398354, + "grad_norm": 5.508606433868408, + "learning_rate": 1.7831813576494428e-06, + "loss": 1.3176, + "step": 176 + }, + { + "epoch": 0.0448385053831539, + "grad_norm": 5.380614280700684, + "learning_rate": 1.7933130699088146e-06, + "loss": 1.1962, + "step": 177 + }, + { + "epoch": 0.04509183027232425, + "grad_norm": 5.325069904327393, + "learning_rate": 1.8034447821681866e-06, + "loss": 1.2922, + "step": 178 + }, + { + "epoch": 0.045345155161494614, + "grad_norm": 5.326142311096191, + "learning_rate": 1.8135764944275583e-06, + "loss": 1.1661, + "step": 179 + }, + { + "epoch": 0.045598480050664976, + "grad_norm": 5.590676784515381, + "learning_rate": 1.8237082066869303e-06, + "loss": 1.2746, + "step": 180 + }, + { + "epoch": 0.04585180493983534, + "grad_norm": 5.422430515289307, + "learning_rate": 1.8338399189463021e-06, + "loss": 1.2305, + "step": 181 + }, + { + "epoch": 0.0461051298290057, + "grad_norm": 5.1848955154418945, + "learning_rate": 1.8439716312056737e-06, + "loss": 1.1117, + "step": 182 + }, + { + "epoch": 0.04635845471817606, + "grad_norm": 5.1731462478637695, + "learning_rate": 1.8541033434650458e-06, + "loss": 1.1643, + "step": 183 + }, + { + "epoch": 0.04661177960734642, + "grad_norm": 5.318070888519287, + "learning_rate": 1.8642350557244176e-06, + "loss": 1.2171, + "step": 184 + }, + { + "epoch": 0.046865104496516784, + "grad_norm": 5.908895015716553, + "learning_rate": 1.8743667679837892e-06, + "loss": 1.422, + "step": 185 + }, + { + "epoch": 0.047118429385687145, + "grad_norm": 5.597854137420654, + "learning_rate": 1.8844984802431613e-06, + "loss": 1.2011, + "step": 186 + }, + { + "epoch": 0.04737175427485751, + "grad_norm": 5.565207004547119, + "learning_rate": 1.894630192502533e-06, + "loss": 1.1722, + "step": 187 + }, + { + "epoch": 0.04762507916402787, + "grad_norm": 5.958907604217529, + "learning_rate": 1.904761904761905e-06, + "loss": 1.3827, + "step": 188 + }, + { + "epoch": 0.04787840405319823, + "grad_norm": 5.93968391418457, + "learning_rate": 1.9148936170212767e-06, + "loss": 1.1789, + "step": 189 + }, + { + "epoch": 0.048131728942368585, + "grad_norm": 5.312741279602051, + "learning_rate": 1.9250253292806488e-06, + "loss": 1.2322, + "step": 190 + }, + { + "epoch": 0.048385053831538946, + "grad_norm": 5.421358108520508, + "learning_rate": 1.9351570415400204e-06, + "loss": 1.1542, + "step": 191 + }, + { + "epoch": 0.04863837872070931, + "grad_norm": 5.5818095207214355, + "learning_rate": 1.9452887537993924e-06, + "loss": 1.2192, + "step": 192 + }, + { + "epoch": 0.04889170360987967, + "grad_norm": 5.480461597442627, + "learning_rate": 1.955420466058764e-06, + "loss": 1.2174, + "step": 193 + }, + { + "epoch": 0.04914502849905003, + "grad_norm": 5.37147855758667, + "learning_rate": 1.9655521783181357e-06, + "loss": 1.2698, + "step": 194 + }, + { + "epoch": 0.04939835338822039, + "grad_norm": 5.655091285705566, + "learning_rate": 1.9756838905775077e-06, + "loss": 1.0975, + "step": 195 + }, + { + "epoch": 0.049651678277390754, + "grad_norm": 5.748220443725586, + "learning_rate": 1.9858156028368797e-06, + "loss": 1.3046, + "step": 196 + }, + { + "epoch": 0.049905003166561115, + "grad_norm": 6.164122104644775, + "learning_rate": 1.9959473150962513e-06, + "loss": 1.2854, + "step": 197 + }, + { + "epoch": 0.05015832805573148, + "grad_norm": 5.593215465545654, + "learning_rate": 2.0060790273556234e-06, + "loss": 1.2397, + "step": 198 + }, + { + "epoch": 0.05041165294490184, + "grad_norm": 5.580338001251221, + "learning_rate": 2.016210739614995e-06, + "loss": 1.1977, + "step": 199 + }, + { + "epoch": 0.0506649778340722, + "grad_norm": 6.3558878898620605, + "learning_rate": 2.026342451874367e-06, + "loss": 1.2491, + "step": 200 + }, + { + "epoch": 0.05091830272324256, + "grad_norm": 5.370153427124023, + "learning_rate": 2.0364741641337387e-06, + "loss": 1.1747, + "step": 201 + }, + { + "epoch": 0.051171627612412916, + "grad_norm": 5.352872371673584, + "learning_rate": 2.0466058763931107e-06, + "loss": 1.3069, + "step": 202 + }, + { + "epoch": 0.05142495250158328, + "grad_norm": 5.849454402923584, + "learning_rate": 2.0567375886524823e-06, + "loss": 1.3362, + "step": 203 + }, + { + "epoch": 0.05167827739075364, + "grad_norm": 5.278831481933594, + "learning_rate": 2.0668693009118543e-06, + "loss": 1.1309, + "step": 204 + }, + { + "epoch": 0.051931602279924, + "grad_norm": 5.236042022705078, + "learning_rate": 2.0770010131712264e-06, + "loss": 1.1772, + "step": 205 + }, + { + "epoch": 0.05218492716909436, + "grad_norm": 5.872128963470459, + "learning_rate": 2.087132725430598e-06, + "loss": 1.2255, + "step": 206 + }, + { + "epoch": 0.052438252058264724, + "grad_norm": 5.233270645141602, + "learning_rate": 2.0972644376899696e-06, + "loss": 1.2479, + "step": 207 + }, + { + "epoch": 0.052691576947435086, + "grad_norm": 5.511275768280029, + "learning_rate": 2.1073961499493417e-06, + "loss": 1.1882, + "step": 208 + }, + { + "epoch": 0.05294490183660545, + "grad_norm": 5.640159606933594, + "learning_rate": 2.1175278622087133e-06, + "loss": 1.4266, + "step": 209 + }, + { + "epoch": 0.05319822672577581, + "grad_norm": 5.4359612464904785, + "learning_rate": 2.1276595744680853e-06, + "loss": 1.227, + "step": 210 + }, + { + "epoch": 0.05345155161494617, + "grad_norm": 5.258604049682617, + "learning_rate": 2.1377912867274573e-06, + "loss": 1.1736, + "step": 211 + }, + { + "epoch": 0.05370487650411653, + "grad_norm": 5.675787448883057, + "learning_rate": 2.147922998986829e-06, + "loss": 1.2261, + "step": 212 + }, + { + "epoch": 0.05395820139328689, + "grad_norm": 5.490091800689697, + "learning_rate": 2.158054711246201e-06, + "loss": 1.1692, + "step": 213 + }, + { + "epoch": 0.05421152628245725, + "grad_norm": 5.569422721862793, + "learning_rate": 2.1681864235055726e-06, + "loss": 1.2708, + "step": 214 + }, + { + "epoch": 0.05446485117162761, + "grad_norm": 6.177036762237549, + "learning_rate": 2.1783181357649442e-06, + "loss": 1.2441, + "step": 215 + }, + { + "epoch": 0.05471817606079797, + "grad_norm": 4.792664051055908, + "learning_rate": 2.1884498480243163e-06, + "loss": 1.1072, + "step": 216 + }, + { + "epoch": 0.05497150094996833, + "grad_norm": 5.102123260498047, + "learning_rate": 2.1985815602836883e-06, + "loss": 1.1383, + "step": 217 + }, + { + "epoch": 0.055224825839138694, + "grad_norm": 5.349294662475586, + "learning_rate": 2.20871327254306e-06, + "loss": 1.0587, + "step": 218 + }, + { + "epoch": 0.055478150728309056, + "grad_norm": 5.529597759246826, + "learning_rate": 2.218844984802432e-06, + "loss": 1.2363, + "step": 219 + }, + { + "epoch": 0.05573147561747942, + "grad_norm": 5.755967140197754, + "learning_rate": 2.2289766970618036e-06, + "loss": 1.3302, + "step": 220 + }, + { + "epoch": 0.05598480050664978, + "grad_norm": 5.315429210662842, + "learning_rate": 2.2391084093211756e-06, + "loss": 1.2632, + "step": 221 + }, + { + "epoch": 0.05623812539582014, + "grad_norm": 5.7810139656066895, + "learning_rate": 2.2492401215805472e-06, + "loss": 1.1679, + "step": 222 + }, + { + "epoch": 0.0564914502849905, + "grad_norm": 5.621176719665527, + "learning_rate": 2.2593718338399193e-06, + "loss": 1.2638, + "step": 223 + }, + { + "epoch": 0.056744775174160864, + "grad_norm": 5.352179527282715, + "learning_rate": 2.269503546099291e-06, + "loss": 1.2045, + "step": 224 + }, + { + "epoch": 0.056998100063331225, + "grad_norm": 5.482887268066406, + "learning_rate": 2.279635258358663e-06, + "loss": 1.0832, + "step": 225 + }, + { + "epoch": 0.05725142495250159, + "grad_norm": 5.484334468841553, + "learning_rate": 2.289766970618035e-06, + "loss": 1.1747, + "step": 226 + }, + { + "epoch": 0.05750474984167194, + "grad_norm": 5.4562554359436035, + "learning_rate": 2.2998986828774066e-06, + "loss": 1.2621, + "step": 227 + }, + { + "epoch": 0.0577580747308423, + "grad_norm": 5.370246410369873, + "learning_rate": 2.310030395136778e-06, + "loss": 1.055, + "step": 228 + }, + { + "epoch": 0.058011399620012664, + "grad_norm": 5.205504417419434, + "learning_rate": 2.3201621073961502e-06, + "loss": 1.1595, + "step": 229 + }, + { + "epoch": 0.058264724509183026, + "grad_norm": 5.665552616119385, + "learning_rate": 2.330293819655522e-06, + "loss": 1.2979, + "step": 230 + }, + { + "epoch": 0.05851804939835339, + "grad_norm": 5.3679070472717285, + "learning_rate": 2.340425531914894e-06, + "loss": 1.2985, + "step": 231 + }, + { + "epoch": 0.05877137428752375, + "grad_norm": 5.475890636444092, + "learning_rate": 2.3505572441742655e-06, + "loss": 1.1784, + "step": 232 + }, + { + "epoch": 0.05902469917669411, + "grad_norm": 5.7248992919921875, + "learning_rate": 2.3606889564336375e-06, + "loss": 1.2104, + "step": 233 + }, + { + "epoch": 0.05927802406586447, + "grad_norm": 5.338497638702393, + "learning_rate": 2.3708206686930096e-06, + "loss": 1.1068, + "step": 234 + }, + { + "epoch": 0.059531348955034834, + "grad_norm": 5.562758445739746, + "learning_rate": 2.380952380952381e-06, + "loss": 1.2252, + "step": 235 + }, + { + "epoch": 0.059784673844205195, + "grad_norm": 5.872779846191406, + "learning_rate": 2.391084093211753e-06, + "loss": 1.1181, + "step": 236 + }, + { + "epoch": 0.06003799873337556, + "grad_norm": 5.6264214515686035, + "learning_rate": 2.401215805471125e-06, + "loss": 1.3164, + "step": 237 + }, + { + "epoch": 0.06029132362254592, + "grad_norm": 5.5919647216796875, + "learning_rate": 2.4113475177304965e-06, + "loss": 1.1989, + "step": 238 + }, + { + "epoch": 0.06054464851171627, + "grad_norm": 5.198634147644043, + "learning_rate": 2.4214792299898685e-06, + "loss": 1.2422, + "step": 239 + }, + { + "epoch": 0.060797973400886635, + "grad_norm": 5.159615993499756, + "learning_rate": 2.4316109422492405e-06, + "loss": 1.1807, + "step": 240 + }, + { + "epoch": 0.061051298290056996, + "grad_norm": 5.649820804595947, + "learning_rate": 2.441742654508612e-06, + "loss": 1.1369, + "step": 241 + }, + { + "epoch": 0.06130462317922736, + "grad_norm": 5.47420597076416, + "learning_rate": 2.4518743667679838e-06, + "loss": 1.304, + "step": 242 + }, + { + "epoch": 0.06155794806839772, + "grad_norm": 4.905952453613281, + "learning_rate": 2.462006079027356e-06, + "loss": 1.1568, + "step": 243 + }, + { + "epoch": 0.06181127295756808, + "grad_norm": 5.942564964294434, + "learning_rate": 2.4721377912867274e-06, + "loss": 1.272, + "step": 244 + }, + { + "epoch": 0.06206459784673844, + "grad_norm": 6.035305976867676, + "learning_rate": 2.4822695035460995e-06, + "loss": 1.2711, + "step": 245 + }, + { + "epoch": 0.062317922735908804, + "grad_norm": 5.5649800300598145, + "learning_rate": 2.4924012158054715e-06, + "loss": 1.2617, + "step": 246 + }, + { + "epoch": 0.06257124762507917, + "grad_norm": 5.223843097686768, + "learning_rate": 2.502532928064843e-06, + "loss": 1.2616, + "step": 247 + }, + { + "epoch": 0.06282457251424953, + "grad_norm": 5.219488620758057, + "learning_rate": 2.512664640324215e-06, + "loss": 1.162, + "step": 248 + }, + { + "epoch": 0.06307789740341989, + "grad_norm": 5.6271867752075195, + "learning_rate": 2.5227963525835868e-06, + "loss": 1.2603, + "step": 249 + }, + { + "epoch": 0.06333122229259025, + "grad_norm": 5.215453147888184, + "learning_rate": 2.532928064842959e-06, + "loss": 1.1556, + "step": 250 + }, + { + "epoch": 0.06358454718176061, + "grad_norm": 5.587162494659424, + "learning_rate": 2.543059777102331e-06, + "loss": 1.2228, + "step": 251 + }, + { + "epoch": 0.06383787207093097, + "grad_norm": 5.448244571685791, + "learning_rate": 2.553191489361702e-06, + "loss": 1.1525, + "step": 252 + }, + { + "epoch": 0.06409119696010133, + "grad_norm": 6.127096176147461, + "learning_rate": 2.563323201621074e-06, + "loss": 1.1575, + "step": 253 + }, + { + "epoch": 0.0643445218492717, + "grad_norm": 4.879481315612793, + "learning_rate": 2.573454913880446e-06, + "loss": 1.1375, + "step": 254 + }, + { + "epoch": 0.06459784673844206, + "grad_norm": 5.284853458404541, + "learning_rate": 2.5835866261398177e-06, + "loss": 1.1636, + "step": 255 + }, + { + "epoch": 0.06485117162761242, + "grad_norm": 4.960007190704346, + "learning_rate": 2.5937183383991898e-06, + "loss": 1.1259, + "step": 256 + }, + { + "epoch": 0.06510449651678277, + "grad_norm": 5.490287780761719, + "learning_rate": 2.603850050658562e-06, + "loss": 1.2555, + "step": 257 + }, + { + "epoch": 0.06535782140595313, + "grad_norm": 5.298451900482178, + "learning_rate": 2.613981762917933e-06, + "loss": 1.3183, + "step": 258 + }, + { + "epoch": 0.06561114629512349, + "grad_norm": 4.963212966918945, + "learning_rate": 2.624113475177305e-06, + "loss": 1.1287, + "step": 259 + }, + { + "epoch": 0.06586447118429385, + "grad_norm": 5.298976421356201, + "learning_rate": 2.634245187436677e-06, + "loss": 1.1672, + "step": 260 + }, + { + "epoch": 0.06611779607346421, + "grad_norm": 5.273107528686523, + "learning_rate": 2.644376899696049e-06, + "loss": 1.1491, + "step": 261 + }, + { + "epoch": 0.06637112096263457, + "grad_norm": 5.450850486755371, + "learning_rate": 2.6545086119554207e-06, + "loss": 1.2121, + "step": 262 + }, + { + "epoch": 0.06662444585180494, + "grad_norm": 5.025203704833984, + "learning_rate": 2.6646403242147928e-06, + "loss": 1.1708, + "step": 263 + }, + { + "epoch": 0.0668777707409753, + "grad_norm": 5.214092254638672, + "learning_rate": 2.6747720364741644e-06, + "loss": 1.2275, + "step": 264 + }, + { + "epoch": 0.06713109563014566, + "grad_norm": 5.152961254119873, + "learning_rate": 2.684903748733536e-06, + "loss": 1.2722, + "step": 265 + }, + { + "epoch": 0.06738442051931602, + "grad_norm": 5.896986961364746, + "learning_rate": 2.695035460992908e-06, + "loss": 1.3384, + "step": 266 + }, + { + "epoch": 0.06763774540848638, + "grad_norm": 5.6102614402771, + "learning_rate": 2.70516717325228e-06, + "loss": 1.2057, + "step": 267 + }, + { + "epoch": 0.06789107029765674, + "grad_norm": 5.330872535705566, + "learning_rate": 2.7152988855116517e-06, + "loss": 1.1099, + "step": 268 + }, + { + "epoch": 0.0681443951868271, + "grad_norm": 5.673343181610107, + "learning_rate": 2.7254305977710233e-06, + "loss": 1.2957, + "step": 269 + }, + { + "epoch": 0.06839772007599747, + "grad_norm": 5.127655506134033, + "learning_rate": 2.7355623100303953e-06, + "loss": 1.1429, + "step": 270 + }, + { + "epoch": 0.06865104496516783, + "grad_norm": 5.601057052612305, + "learning_rate": 2.745694022289767e-06, + "loss": 1.1527, + "step": 271 + }, + { + "epoch": 0.06890436985433819, + "grad_norm": 5.111176013946533, + "learning_rate": 2.755825734549139e-06, + "loss": 1.1284, + "step": 272 + }, + { + "epoch": 0.06915769474350855, + "grad_norm": 5.472817897796631, + "learning_rate": 2.765957446808511e-06, + "loss": 1.3043, + "step": 273 + }, + { + "epoch": 0.06941101963267891, + "grad_norm": 5.623200416564941, + "learning_rate": 2.776089159067883e-06, + "loss": 1.2232, + "step": 274 + }, + { + "epoch": 0.06966434452184928, + "grad_norm": 5.1391825675964355, + "learning_rate": 2.7862208713272543e-06, + "loss": 1.1516, + "step": 275 + }, + { + "epoch": 0.06991766941101964, + "grad_norm": 4.69110631942749, + "learning_rate": 2.7963525835866263e-06, + "loss": 1.1348, + "step": 276 + }, + { + "epoch": 0.07017099430019, + "grad_norm": 4.934825420379639, + "learning_rate": 2.8064842958459983e-06, + "loss": 1.1664, + "step": 277 + }, + { + "epoch": 0.07042431918936036, + "grad_norm": 5.43660306930542, + "learning_rate": 2.81661600810537e-06, + "loss": 1.2981, + "step": 278 + }, + { + "epoch": 0.07067764407853072, + "grad_norm": 5.765442848205566, + "learning_rate": 2.826747720364742e-06, + "loss": 1.2113, + "step": 279 + }, + { + "epoch": 0.07093096896770108, + "grad_norm": 5.242639064788818, + "learning_rate": 2.836879432624114e-06, + "loss": 1.2121, + "step": 280 + }, + { + "epoch": 0.07118429385687144, + "grad_norm": 5.404294967651367, + "learning_rate": 2.8470111448834852e-06, + "loss": 1.287, + "step": 281 + }, + { + "epoch": 0.07143761874604179, + "grad_norm": 5.30703592300415, + "learning_rate": 2.8571428571428573e-06, + "loss": 1.1924, + "step": 282 + }, + { + "epoch": 0.07169094363521215, + "grad_norm": 5.358706474304199, + "learning_rate": 2.8672745694022293e-06, + "loss": 1.2447, + "step": 283 + }, + { + "epoch": 0.07194426852438252, + "grad_norm": 5.951683521270752, + "learning_rate": 2.877406281661601e-06, + "loss": 1.2574, + "step": 284 + }, + { + "epoch": 0.07219759341355288, + "grad_norm": 4.923393726348877, + "learning_rate": 2.887537993920973e-06, + "loss": 1.0977, + "step": 285 + }, + { + "epoch": 0.07245091830272324, + "grad_norm": 5.036745071411133, + "learning_rate": 2.897669706180345e-06, + "loss": 1.2233, + "step": 286 + }, + { + "epoch": 0.0727042431918936, + "grad_norm": 5.286535739898682, + "learning_rate": 2.907801418439716e-06, + "loss": 1.194, + "step": 287 + }, + { + "epoch": 0.07295756808106396, + "grad_norm": 5.941610336303711, + "learning_rate": 2.9179331306990882e-06, + "loss": 1.3264, + "step": 288 + }, + { + "epoch": 0.07321089297023432, + "grad_norm": 5.5159687995910645, + "learning_rate": 2.9280648429584603e-06, + "loss": 1.2171, + "step": 289 + }, + { + "epoch": 0.07346421785940468, + "grad_norm": 5.988364219665527, + "learning_rate": 2.9381965552178323e-06, + "loss": 1.2531, + "step": 290 + }, + { + "epoch": 0.07371754274857505, + "grad_norm": 5.247437477111816, + "learning_rate": 2.948328267477204e-06, + "loss": 1.189, + "step": 291 + }, + { + "epoch": 0.07397086763774541, + "grad_norm": 5.554168224334717, + "learning_rate": 2.958459979736576e-06, + "loss": 1.2524, + "step": 292 + }, + { + "epoch": 0.07422419252691577, + "grad_norm": 5.522302627563477, + "learning_rate": 2.968591691995947e-06, + "loss": 1.3294, + "step": 293 + }, + { + "epoch": 0.07447751741608613, + "grad_norm": 4.99888277053833, + "learning_rate": 2.978723404255319e-06, + "loss": 1.1604, + "step": 294 + }, + { + "epoch": 0.07473084230525649, + "grad_norm": 6.059742450714111, + "learning_rate": 2.9888551165146912e-06, + "loss": 1.2363, + "step": 295 + }, + { + "epoch": 0.07498416719442685, + "grad_norm": 5.118231296539307, + "learning_rate": 2.9989868287740633e-06, + "loss": 1.0948, + "step": 296 + }, + { + "epoch": 0.07523749208359722, + "grad_norm": 5.535048007965088, + "learning_rate": 3.009118541033435e-06, + "loss": 1.1973, + "step": 297 + }, + { + "epoch": 0.07549081697276758, + "grad_norm": 5.824736595153809, + "learning_rate": 3.019250253292807e-06, + "loss": 1.1542, + "step": 298 + }, + { + "epoch": 0.07574414186193794, + "grad_norm": 5.241152763366699, + "learning_rate": 3.0293819655521785e-06, + "loss": 1.2698, + "step": 299 + }, + { + "epoch": 0.0759974667511083, + "grad_norm": 5.363473415374756, + "learning_rate": 3.03951367781155e-06, + "loss": 1.1991, + "step": 300 + }, + { + "epoch": 0.07625079164027866, + "grad_norm": 5.177186965942383, + "learning_rate": 3.049645390070922e-06, + "loss": 1.1214, + "step": 301 + }, + { + "epoch": 0.07650411652944902, + "grad_norm": 5.408219814300537, + "learning_rate": 3.0597771023302942e-06, + "loss": 1.364, + "step": 302 + }, + { + "epoch": 0.07675744141861938, + "grad_norm": 5.7646636962890625, + "learning_rate": 3.0699088145896663e-06, + "loss": 1.2886, + "step": 303 + }, + { + "epoch": 0.07701076630778975, + "grad_norm": 5.605578422546387, + "learning_rate": 3.080040526849038e-06, + "loss": 1.2563, + "step": 304 + }, + { + "epoch": 0.07726409119696011, + "grad_norm": 5.21433162689209, + "learning_rate": 3.0901722391084095e-06, + "loss": 1.3145, + "step": 305 + }, + { + "epoch": 0.07751741608613046, + "grad_norm": 5.218530654907227, + "learning_rate": 3.100303951367781e-06, + "loss": 1.2366, + "step": 306 + }, + { + "epoch": 0.07777074097530082, + "grad_norm": 5.189089775085449, + "learning_rate": 3.110435663627153e-06, + "loss": 1.2856, + "step": 307 + }, + { + "epoch": 0.07802406586447118, + "grad_norm": 5.267763614654541, + "learning_rate": 3.120567375886525e-06, + "loss": 1.2607, + "step": 308 + }, + { + "epoch": 0.07827739075364154, + "grad_norm": 5.631710052490234, + "learning_rate": 3.1306990881458972e-06, + "loss": 1.2722, + "step": 309 + }, + { + "epoch": 0.0785307156428119, + "grad_norm": 5.194464683532715, + "learning_rate": 3.140830800405269e-06, + "loss": 1.3005, + "step": 310 + }, + { + "epoch": 0.07878404053198226, + "grad_norm": 4.978764057159424, + "learning_rate": 3.1509625126646404e-06, + "loss": 1.1632, + "step": 311 + }, + { + "epoch": 0.07903736542115262, + "grad_norm": 5.242401123046875, + "learning_rate": 3.1610942249240125e-06, + "loss": 1.1535, + "step": 312 + }, + { + "epoch": 0.07929069031032299, + "grad_norm": 5.89813232421875, + "learning_rate": 3.171225937183384e-06, + "loss": 1.2232, + "step": 313 + }, + { + "epoch": 0.07954401519949335, + "grad_norm": 5.654772758483887, + "learning_rate": 3.181357649442756e-06, + "loss": 1.2795, + "step": 314 + }, + { + "epoch": 0.07979734008866371, + "grad_norm": 5.303825855255127, + "learning_rate": 3.191489361702128e-06, + "loss": 1.1862, + "step": 315 + }, + { + "epoch": 0.08005066497783407, + "grad_norm": 5.2518439292907715, + "learning_rate": 3.2016210739615e-06, + "loss": 1.2725, + "step": 316 + }, + { + "epoch": 0.08030398986700443, + "grad_norm": 5.576358795166016, + "learning_rate": 3.2117527862208714e-06, + "loss": 1.2813, + "step": 317 + }, + { + "epoch": 0.0805573147561748, + "grad_norm": 5.416296482086182, + "learning_rate": 3.2218844984802434e-06, + "loss": 1.1842, + "step": 318 + }, + { + "epoch": 0.08081063964534516, + "grad_norm": 5.248586654663086, + "learning_rate": 3.232016210739615e-06, + "loss": 1.1256, + "step": 319 + }, + { + "epoch": 0.08106396453451552, + "grad_norm": 5.1331658363342285, + "learning_rate": 3.242147922998987e-06, + "loss": 1.2788, + "step": 320 + }, + { + "epoch": 0.08131728942368588, + "grad_norm": 5.369128704071045, + "learning_rate": 3.252279635258359e-06, + "loss": 1.1909, + "step": 321 + }, + { + "epoch": 0.08157061431285624, + "grad_norm": 5.515130996704102, + "learning_rate": 3.262411347517731e-06, + "loss": 1.2855, + "step": 322 + }, + { + "epoch": 0.0818239392020266, + "grad_norm": 4.981941223144531, + "learning_rate": 3.2725430597771024e-06, + "loss": 1.13, + "step": 323 + }, + { + "epoch": 0.08207726409119696, + "grad_norm": 6.185157775878906, + "learning_rate": 3.2826747720364744e-06, + "loss": 1.3237, + "step": 324 + }, + { + "epoch": 0.08233058898036733, + "grad_norm": 5.413832187652588, + "learning_rate": 3.2928064842958464e-06, + "loss": 1.335, + "step": 325 + }, + { + "epoch": 0.08258391386953769, + "grad_norm": 5.234766960144043, + "learning_rate": 3.302938196555218e-06, + "loss": 1.0887, + "step": 326 + }, + { + "epoch": 0.08283723875870805, + "grad_norm": 5.378023624420166, + "learning_rate": 3.31306990881459e-06, + "loss": 1.2683, + "step": 327 + }, + { + "epoch": 0.08309056364787841, + "grad_norm": 5.207670211791992, + "learning_rate": 3.323201621073962e-06, + "loss": 1.2492, + "step": 328 + }, + { + "epoch": 0.08334388853704877, + "grad_norm": 4.918520450592041, + "learning_rate": 3.3333333333333333e-06, + "loss": 1.1408, + "step": 329 + }, + { + "epoch": 0.08359721342621912, + "grad_norm": 5.375702857971191, + "learning_rate": 3.3434650455927054e-06, + "loss": 1.2153, + "step": 330 + }, + { + "epoch": 0.08385053831538948, + "grad_norm": 5.3048996925354, + "learning_rate": 3.3535967578520774e-06, + "loss": 1.3736, + "step": 331 + }, + { + "epoch": 0.08410386320455984, + "grad_norm": 5.028907775878906, + "learning_rate": 3.363728470111449e-06, + "loss": 1.1762, + "step": 332 + }, + { + "epoch": 0.0843571880937302, + "grad_norm": 4.838732719421387, + "learning_rate": 3.373860182370821e-06, + "loss": 1.1368, + "step": 333 + }, + { + "epoch": 0.08461051298290057, + "grad_norm": 5.1180853843688965, + "learning_rate": 3.3839918946301927e-06, + "loss": 1.1149, + "step": 334 + }, + { + "epoch": 0.08486383787207093, + "grad_norm": 5.510826587677002, + "learning_rate": 3.3941236068895643e-06, + "loss": 1.3176, + "step": 335 + }, + { + "epoch": 0.08511716276124129, + "grad_norm": 4.998798370361328, + "learning_rate": 3.4042553191489363e-06, + "loss": 1.1184, + "step": 336 + }, + { + "epoch": 0.08537048765041165, + "grad_norm": 5.48846960067749, + "learning_rate": 3.4143870314083084e-06, + "loss": 1.2034, + "step": 337 + }, + { + "epoch": 0.08562381253958201, + "grad_norm": 5.412283897399902, + "learning_rate": 3.4245187436676804e-06, + "loss": 1.2323, + "step": 338 + }, + { + "epoch": 0.08587713742875237, + "grad_norm": 5.218188285827637, + "learning_rate": 3.434650455927052e-06, + "loss": 1.2015, + "step": 339 + }, + { + "epoch": 0.08613046231792273, + "grad_norm": 5.164620876312256, + "learning_rate": 3.4447821681864236e-06, + "loss": 1.2658, + "step": 340 + }, + { + "epoch": 0.0863837872070931, + "grad_norm": 5.300448894500732, + "learning_rate": 3.4549138804457952e-06, + "loss": 1.1716, + "step": 341 + }, + { + "epoch": 0.08663711209626346, + "grad_norm": 5.105301380157471, + "learning_rate": 3.4650455927051673e-06, + "loss": 1.1445, + "step": 342 + }, + { + "epoch": 0.08689043698543382, + "grad_norm": 5.227696895599365, + "learning_rate": 3.4751773049645393e-06, + "loss": 1.1638, + "step": 343 + }, + { + "epoch": 0.08714376187460418, + "grad_norm": 5.406198024749756, + "learning_rate": 3.4853090172239114e-06, + "loss": 1.2622, + "step": 344 + }, + { + "epoch": 0.08739708676377454, + "grad_norm": 5.080615043640137, + "learning_rate": 3.495440729483283e-06, + "loss": 1.1559, + "step": 345 + }, + { + "epoch": 0.0876504116529449, + "grad_norm": 5.109445571899414, + "learning_rate": 3.5055724417426546e-06, + "loss": 1.2238, + "step": 346 + }, + { + "epoch": 0.08790373654211527, + "grad_norm": 5.225170612335205, + "learning_rate": 3.5157041540020266e-06, + "loss": 1.2987, + "step": 347 + }, + { + "epoch": 0.08815706143128563, + "grad_norm": 4.769128322601318, + "learning_rate": 3.5258358662613982e-06, + "loss": 1.2457, + "step": 348 + }, + { + "epoch": 0.08841038632045599, + "grad_norm": 5.18264102935791, + "learning_rate": 3.5359675785207703e-06, + "loss": 1.1111, + "step": 349 + }, + { + "epoch": 0.08866371120962635, + "grad_norm": 5.246507167816162, + "learning_rate": 3.5460992907801423e-06, + "loss": 1.1872, + "step": 350 + }, + { + "epoch": 0.08891703609879671, + "grad_norm": 4.926772117614746, + "learning_rate": 3.5562310030395144e-06, + "loss": 1.1972, + "step": 351 + }, + { + "epoch": 0.08917036098796707, + "grad_norm": 5.447096347808838, + "learning_rate": 3.5663627152988856e-06, + "loss": 1.2149, + "step": 352 + }, + { + "epoch": 0.08942368587713743, + "grad_norm": 4.652470588684082, + "learning_rate": 3.5764944275582576e-06, + "loss": 1.1225, + "step": 353 + }, + { + "epoch": 0.0896770107663078, + "grad_norm": 5.64431619644165, + "learning_rate": 3.586626139817629e-06, + "loss": 1.2261, + "step": 354 + }, + { + "epoch": 0.08993033565547814, + "grad_norm": 5.396925449371338, + "learning_rate": 3.5967578520770012e-06, + "loss": 1.1252, + "step": 355 + }, + { + "epoch": 0.0901836605446485, + "grad_norm": 5.296283721923828, + "learning_rate": 3.6068895643363733e-06, + "loss": 1.2206, + "step": 356 + }, + { + "epoch": 0.09043698543381887, + "grad_norm": 5.024763584136963, + "learning_rate": 3.6170212765957453e-06, + "loss": 1.203, + "step": 357 + }, + { + "epoch": 0.09069031032298923, + "grad_norm": 5.703789710998535, + "learning_rate": 3.6271529888551165e-06, + "loss": 1.2007, + "step": 358 + }, + { + "epoch": 0.09094363521215959, + "grad_norm": 5.510667324066162, + "learning_rate": 3.6372847011144885e-06, + "loss": 1.1985, + "step": 359 + }, + { + "epoch": 0.09119696010132995, + "grad_norm": 5.609925746917725, + "learning_rate": 3.6474164133738606e-06, + "loss": 1.2416, + "step": 360 + }, + { + "epoch": 0.09145028499050031, + "grad_norm": 4.885375022888184, + "learning_rate": 3.657548125633232e-06, + "loss": 1.3185, + "step": 361 + }, + { + "epoch": 0.09170360987967068, + "grad_norm": 4.889640808105469, + "learning_rate": 3.6676798378926042e-06, + "loss": 1.099, + "step": 362 + }, + { + "epoch": 0.09195693476884104, + "grad_norm": 5.009072303771973, + "learning_rate": 3.6778115501519763e-06, + "loss": 1.1495, + "step": 363 + }, + { + "epoch": 0.0922102596580114, + "grad_norm": 4.96414852142334, + "learning_rate": 3.6879432624113475e-06, + "loss": 1.2632, + "step": 364 + }, + { + "epoch": 0.09246358454718176, + "grad_norm": 5.6016316413879395, + "learning_rate": 3.6980749746707195e-06, + "loss": 1.2534, + "step": 365 + }, + { + "epoch": 0.09271690943635212, + "grad_norm": 4.959722995758057, + "learning_rate": 3.7082066869300915e-06, + "loss": 1.186, + "step": 366 + }, + { + "epoch": 0.09297023432552248, + "grad_norm": 5.125227928161621, + "learning_rate": 3.7183383991894636e-06, + "loss": 1.0711, + "step": 367 + }, + { + "epoch": 0.09322355921469284, + "grad_norm": 5.24466609954834, + "learning_rate": 3.728470111448835e-06, + "loss": 1.1699, + "step": 368 + }, + { + "epoch": 0.0934768841038632, + "grad_norm": 5.310903549194336, + "learning_rate": 3.7386018237082072e-06, + "loss": 1.2407, + "step": 369 + }, + { + "epoch": 0.09373020899303357, + "grad_norm": 5.320084095001221, + "learning_rate": 3.7487335359675784e-06, + "loss": 1.3521, + "step": 370 + }, + { + "epoch": 0.09398353388220393, + "grad_norm": 5.608123302459717, + "learning_rate": 3.7588652482269505e-06, + "loss": 1.195, + "step": 371 + }, + { + "epoch": 0.09423685877137429, + "grad_norm": 5.201747417449951, + "learning_rate": 3.7689969604863225e-06, + "loss": 1.0786, + "step": 372 + }, + { + "epoch": 0.09449018366054465, + "grad_norm": 5.231518268585205, + "learning_rate": 3.7791286727456945e-06, + "loss": 1.2605, + "step": 373 + }, + { + "epoch": 0.09474350854971501, + "grad_norm": 5.920846462249756, + "learning_rate": 3.789260385005066e-06, + "loss": 1.3388, + "step": 374 + }, + { + "epoch": 0.09499683343888538, + "grad_norm": 5.163887977600098, + "learning_rate": 3.799392097264438e-06, + "loss": 1.1228, + "step": 375 + }, + { + "epoch": 0.09525015832805574, + "grad_norm": 5.717247486114502, + "learning_rate": 3.80952380952381e-06, + "loss": 1.2795, + "step": 376 + }, + { + "epoch": 0.0955034832172261, + "grad_norm": 4.618260383605957, + "learning_rate": 3.819655521783182e-06, + "loss": 1.1681, + "step": 377 + }, + { + "epoch": 0.09575680810639646, + "grad_norm": 5.07727575302124, + "learning_rate": 3.8297872340425535e-06, + "loss": 1.1068, + "step": 378 + }, + { + "epoch": 0.09601013299556681, + "grad_norm": 5.215041637420654, + "learning_rate": 3.839918946301925e-06, + "loss": 1.2376, + "step": 379 + }, + { + "epoch": 0.09626345788473717, + "grad_norm": 5.052102088928223, + "learning_rate": 3.8500506585612975e-06, + "loss": 1.2343, + "step": 380 + }, + { + "epoch": 0.09651678277390753, + "grad_norm": 4.981046199798584, + "learning_rate": 3.860182370820669e-06, + "loss": 1.318, + "step": 381 + }, + { + "epoch": 0.09677010766307789, + "grad_norm": 5.665030002593994, + "learning_rate": 3.870314083080041e-06, + "loss": 1.2604, + "step": 382 + }, + { + "epoch": 0.09702343255224825, + "grad_norm": 5.199818134307861, + "learning_rate": 3.880445795339412e-06, + "loss": 1.2613, + "step": 383 + }, + { + "epoch": 0.09727675744141862, + "grad_norm": 5.118903636932373, + "learning_rate": 3.890577507598785e-06, + "loss": 1.2008, + "step": 384 + }, + { + "epoch": 0.09753008233058898, + "grad_norm": 4.857881546020508, + "learning_rate": 3.9007092198581565e-06, + "loss": 1.2139, + "step": 385 + }, + { + "epoch": 0.09778340721975934, + "grad_norm": 4.874960899353027, + "learning_rate": 3.910840932117528e-06, + "loss": 1.1296, + "step": 386 + }, + { + "epoch": 0.0980367321089297, + "grad_norm": 5.047555446624756, + "learning_rate": 3.9209726443769005e-06, + "loss": 1.1817, + "step": 387 + }, + { + "epoch": 0.09829005699810006, + "grad_norm": 4.445738315582275, + "learning_rate": 3.931104356636271e-06, + "loss": 1.1105, + "step": 388 + }, + { + "epoch": 0.09854338188727042, + "grad_norm": 5.266610145568848, + "learning_rate": 3.941236068895644e-06, + "loss": 1.2458, + "step": 389 + }, + { + "epoch": 0.09879670677644078, + "grad_norm": 5.520458221435547, + "learning_rate": 3.951367781155015e-06, + "loss": 1.2126, + "step": 390 + }, + { + "epoch": 0.09905003166561115, + "grad_norm": 5.264195919036865, + "learning_rate": 3.961499493414388e-06, + "loss": 1.2394, + "step": 391 + }, + { + "epoch": 0.09930335655478151, + "grad_norm": 5.44072961807251, + "learning_rate": 3.9716312056737595e-06, + "loss": 1.3318, + "step": 392 + }, + { + "epoch": 0.09955668144395187, + "grad_norm": 4.864696502685547, + "learning_rate": 3.981762917933131e-06, + "loss": 1.1867, + "step": 393 + }, + { + "epoch": 0.09981000633312223, + "grad_norm": 4.9540114402771, + "learning_rate": 3.991894630192503e-06, + "loss": 1.1448, + "step": 394 + }, + { + "epoch": 0.10006333122229259, + "grad_norm": 5.423478603363037, + "learning_rate": 4.002026342451874e-06, + "loss": 1.2762, + "step": 395 + }, + { + "epoch": 0.10031665611146295, + "grad_norm": 5.948858261108398, + "learning_rate": 4.012158054711247e-06, + "loss": 1.3089, + "step": 396 + }, + { + "epoch": 0.10056998100063332, + "grad_norm": 4.995451927185059, + "learning_rate": 4.022289766970618e-06, + "loss": 1.2065, + "step": 397 + }, + { + "epoch": 0.10082330588980368, + "grad_norm": 5.340811729431152, + "learning_rate": 4.03242147922999e-06, + "loss": 1.1683, + "step": 398 + }, + { + "epoch": 0.10107663077897404, + "grad_norm": 5.5295491218566895, + "learning_rate": 4.042553191489362e-06, + "loss": 1.267, + "step": 399 + }, + { + "epoch": 0.1013299556681444, + "grad_norm": 5.184334754943848, + "learning_rate": 4.052684903748734e-06, + "loss": 1.1513, + "step": 400 + }, + { + "epoch": 0.10158328055731476, + "grad_norm": 5.154574394226074, + "learning_rate": 4.062816616008106e-06, + "loss": 1.1991, + "step": 401 + }, + { + "epoch": 0.10183660544648512, + "grad_norm": 5.186733722686768, + "learning_rate": 4.072948328267477e-06, + "loss": 1.2177, + "step": 402 + }, + { + "epoch": 0.10208993033565548, + "grad_norm": 5.594045639038086, + "learning_rate": 4.08308004052685e-06, + "loss": 1.2582, + "step": 403 + }, + { + "epoch": 0.10234325522482583, + "grad_norm": 4.924523830413818, + "learning_rate": 4.093211752786221e-06, + "loss": 1.237, + "step": 404 + }, + { + "epoch": 0.1025965801139962, + "grad_norm": 5.1376543045043945, + "learning_rate": 4.103343465045593e-06, + "loss": 1.155, + "step": 405 + }, + { + "epoch": 0.10284990500316656, + "grad_norm": 4.898098468780518, + "learning_rate": 4.113475177304965e-06, + "loss": 1.1889, + "step": 406 + }, + { + "epoch": 0.10310322989233692, + "grad_norm": 5.024801731109619, + "learning_rate": 4.123606889564336e-06, + "loss": 1.1944, + "step": 407 + }, + { + "epoch": 0.10335655478150728, + "grad_norm": 5.272379398345947, + "learning_rate": 4.133738601823709e-06, + "loss": 1.2796, + "step": 408 + }, + { + "epoch": 0.10360987967067764, + "grad_norm": 5.209236145019531, + "learning_rate": 4.14387031408308e-06, + "loss": 1.1897, + "step": 409 + }, + { + "epoch": 0.103863204559848, + "grad_norm": 4.9806976318359375, + "learning_rate": 4.154002026342453e-06, + "loss": 1.2955, + "step": 410 + }, + { + "epoch": 0.10411652944901836, + "grad_norm": 5.147138595581055, + "learning_rate": 4.1641337386018235e-06, + "loss": 1.1794, + "step": 411 + }, + { + "epoch": 0.10436985433818873, + "grad_norm": 5.406728744506836, + "learning_rate": 4.174265450861196e-06, + "loss": 1.3, + "step": 412 + }, + { + "epoch": 0.10462317922735909, + "grad_norm": 5.133518218994141, + "learning_rate": 4.184397163120568e-06, + "loss": 1.3578, + "step": 413 + }, + { + "epoch": 0.10487650411652945, + "grad_norm": 5.1981520652771, + "learning_rate": 4.194528875379939e-06, + "loss": 1.4429, + "step": 414 + }, + { + "epoch": 0.10512982900569981, + "grad_norm": 4.696353435516357, + "learning_rate": 4.204660587639312e-06, + "loss": 1.1034, + "step": 415 + }, + { + "epoch": 0.10538315389487017, + "grad_norm": 5.248478889465332, + "learning_rate": 4.214792299898683e-06, + "loss": 1.3058, + "step": 416 + }, + { + "epoch": 0.10563647878404053, + "grad_norm": 4.99880313873291, + "learning_rate": 4.224924012158055e-06, + "loss": 1.1162, + "step": 417 + }, + { + "epoch": 0.1058898036732109, + "grad_norm": 5.2948455810546875, + "learning_rate": 4.2350557244174265e-06, + "loss": 1.323, + "step": 418 + }, + { + "epoch": 0.10614312856238126, + "grad_norm": 5.046769142150879, + "learning_rate": 4.245187436676799e-06, + "loss": 1.2691, + "step": 419 + }, + { + "epoch": 0.10639645345155162, + "grad_norm": 5.314305305480957, + "learning_rate": 4.255319148936171e-06, + "loss": 1.1455, + "step": 420 + }, + { + "epoch": 0.10664977834072198, + "grad_norm": 5.6942853927612305, + "learning_rate": 4.265450861195542e-06, + "loss": 1.3361, + "step": 421 + }, + { + "epoch": 0.10690310322989234, + "grad_norm": 5.832979202270508, + "learning_rate": 4.275582573454915e-06, + "loss": 1.4814, + "step": 422 + }, + { + "epoch": 0.1071564281190627, + "grad_norm": 5.128951549530029, + "learning_rate": 4.2857142857142855e-06, + "loss": 1.2869, + "step": 423 + }, + { + "epoch": 0.10740975300823306, + "grad_norm": 5.437411785125732, + "learning_rate": 4.295845997973658e-06, + "loss": 1.2294, + "step": 424 + }, + { + "epoch": 0.10766307789740343, + "grad_norm": 5.274136543273926, + "learning_rate": 4.3059777102330295e-06, + "loss": 1.1951, + "step": 425 + }, + { + "epoch": 0.10791640278657379, + "grad_norm": 5.109431266784668, + "learning_rate": 4.316109422492402e-06, + "loss": 1.2014, + "step": 426 + }, + { + "epoch": 0.10816972767574415, + "grad_norm": 5.105831146240234, + "learning_rate": 4.326241134751774e-06, + "loss": 1.2486, + "step": 427 + }, + { + "epoch": 0.1084230525649145, + "grad_norm": 5.709077835083008, + "learning_rate": 4.336372847011145e-06, + "loss": 1.2318, + "step": 428 + }, + { + "epoch": 0.10867637745408486, + "grad_norm": 4.6664934158325195, + "learning_rate": 4.346504559270517e-06, + "loss": 1.1364, + "step": 429 + }, + { + "epoch": 0.10892970234325522, + "grad_norm": 5.61269998550415, + "learning_rate": 4.3566362715298885e-06, + "loss": 1.2759, + "step": 430 + }, + { + "epoch": 0.10918302723242558, + "grad_norm": 5.311194896697998, + "learning_rate": 4.366767983789261e-06, + "loss": 1.2994, + "step": 431 + }, + { + "epoch": 0.10943635212159594, + "grad_norm": 5.4426679611206055, + "learning_rate": 4.3768996960486325e-06, + "loss": 1.2654, + "step": 432 + }, + { + "epoch": 0.1096896770107663, + "grad_norm": 5.604968070983887, + "learning_rate": 4.387031408308004e-06, + "loss": 1.283, + "step": 433 + }, + { + "epoch": 0.10994300189993667, + "grad_norm": 4.886499404907227, + "learning_rate": 4.397163120567377e-06, + "loss": 1.2903, + "step": 434 + }, + { + "epoch": 0.11019632678910703, + "grad_norm": 5.554519176483154, + "learning_rate": 4.407294832826748e-06, + "loss": 1.2279, + "step": 435 + }, + { + "epoch": 0.11044965167827739, + "grad_norm": 4.756959915161133, + "learning_rate": 4.41742654508612e-06, + "loss": 1.158, + "step": 436 + }, + { + "epoch": 0.11070297656744775, + "grad_norm": 5.010580539703369, + "learning_rate": 4.4275582573454915e-06, + "loss": 1.3596, + "step": 437 + }, + { + "epoch": 0.11095630145661811, + "grad_norm": 5.439083099365234, + "learning_rate": 4.437689969604864e-06, + "loss": 1.2645, + "step": 438 + }, + { + "epoch": 0.11120962634578847, + "grad_norm": 5.501040935516357, + "learning_rate": 4.4478216818642355e-06, + "loss": 1.2239, + "step": 439 + }, + { + "epoch": 0.11146295123495883, + "grad_norm": 5.120853900909424, + "learning_rate": 4.457953394123607e-06, + "loss": 1.2607, + "step": 440 + }, + { + "epoch": 0.1117162761241292, + "grad_norm": 4.933994293212891, + "learning_rate": 4.468085106382979e-06, + "loss": 1.187, + "step": 441 + }, + { + "epoch": 0.11196960101329956, + "grad_norm": 4.760913372039795, + "learning_rate": 4.478216818642351e-06, + "loss": 1.0609, + "step": 442 + }, + { + "epoch": 0.11222292590246992, + "grad_norm": 4.908841133117676, + "learning_rate": 4.488348530901723e-06, + "loss": 1.0957, + "step": 443 + }, + { + "epoch": 0.11247625079164028, + "grad_norm": 5.275254726409912, + "learning_rate": 4.4984802431610945e-06, + "loss": 1.0603, + "step": 444 + }, + { + "epoch": 0.11272957568081064, + "grad_norm": 5.005535125732422, + "learning_rate": 4.508611955420467e-06, + "loss": 1.1966, + "step": 445 + }, + { + "epoch": 0.112982900569981, + "grad_norm": 5.133699893951416, + "learning_rate": 4.5187436676798385e-06, + "loss": 1.2276, + "step": 446 + }, + { + "epoch": 0.11323622545915137, + "grad_norm": 5.156227111816406, + "learning_rate": 4.52887537993921e-06, + "loss": 1.0842, + "step": 447 + }, + { + "epoch": 0.11348955034832173, + "grad_norm": 5.140113830566406, + "learning_rate": 4.539007092198582e-06, + "loss": 1.212, + "step": 448 + }, + { + "epoch": 0.11374287523749209, + "grad_norm": 5.005239009857178, + "learning_rate": 4.549138804457953e-06, + "loss": 1.1829, + "step": 449 + }, + { + "epoch": 0.11399620012666245, + "grad_norm": 4.534379959106445, + "learning_rate": 4.559270516717326e-06, + "loss": 1.015, + "step": 450 + }, + { + "epoch": 0.11424952501583281, + "grad_norm": 4.960865497589111, + "learning_rate": 4.5694022289766975e-06, + "loss": 1.0726, + "step": 451 + }, + { + "epoch": 0.11450284990500317, + "grad_norm": 5.407089710235596, + "learning_rate": 4.57953394123607e-06, + "loss": 1.2036, + "step": 452 + }, + { + "epoch": 0.11475617479417352, + "grad_norm": 4.970057487487793, + "learning_rate": 4.589665653495441e-06, + "loss": 1.2275, + "step": 453 + }, + { + "epoch": 0.11500949968334388, + "grad_norm": 5.319904327392578, + "learning_rate": 4.599797365754813e-06, + "loss": 1.2653, + "step": 454 + }, + { + "epoch": 0.11526282457251424, + "grad_norm": 5.534214496612549, + "learning_rate": 4.609929078014185e-06, + "loss": 1.2502, + "step": 455 + }, + { + "epoch": 0.1155161494616846, + "grad_norm": 5.3505425453186035, + "learning_rate": 4.620060790273556e-06, + "loss": 1.3581, + "step": 456 + }, + { + "epoch": 0.11576947435085497, + "grad_norm": 5.140420436859131, + "learning_rate": 4.630192502532929e-06, + "loss": 1.2164, + "step": 457 + }, + { + "epoch": 0.11602279924002533, + "grad_norm": 5.232957363128662, + "learning_rate": 4.6403242147923005e-06, + "loss": 1.2348, + "step": 458 + }, + { + "epoch": 0.11627612412919569, + "grad_norm": 5.0088324546813965, + "learning_rate": 4.650455927051672e-06, + "loss": 1.202, + "step": 459 + }, + { + "epoch": 0.11652944901836605, + "grad_norm": 5.049102306365967, + "learning_rate": 4.660587639311044e-06, + "loss": 1.3339, + "step": 460 + }, + { + "epoch": 0.11678277390753641, + "grad_norm": 5.3588995933532715, + "learning_rate": 4.670719351570416e-06, + "loss": 1.2878, + "step": 461 + }, + { + "epoch": 0.11703609879670678, + "grad_norm": 4.652127265930176, + "learning_rate": 4.680851063829788e-06, + "loss": 1.1328, + "step": 462 + }, + { + "epoch": 0.11728942368587714, + "grad_norm": 5.1378278732299805, + "learning_rate": 4.690982776089159e-06, + "loss": 1.2494, + "step": 463 + }, + { + "epoch": 0.1175427485750475, + "grad_norm": 5.077396392822266, + "learning_rate": 4.701114488348531e-06, + "loss": 1.244, + "step": 464 + }, + { + "epoch": 0.11779607346421786, + "grad_norm": 4.885287761688232, + "learning_rate": 4.711246200607903e-06, + "loss": 1.045, + "step": 465 + }, + { + "epoch": 0.11804939835338822, + "grad_norm": 4.663662433624268, + "learning_rate": 4.721377912867275e-06, + "loss": 1.1788, + "step": 466 + }, + { + "epoch": 0.11830272324255858, + "grad_norm": 5.223139762878418, + "learning_rate": 4.731509625126647e-06, + "loss": 1.2393, + "step": 467 + }, + { + "epoch": 0.11855604813172894, + "grad_norm": 4.848125457763672, + "learning_rate": 4.741641337386019e-06, + "loss": 1.1446, + "step": 468 + }, + { + "epoch": 0.1188093730208993, + "grad_norm": 5.275095462799072, + "learning_rate": 4.751773049645391e-06, + "loss": 1.2942, + "step": 469 + }, + { + "epoch": 0.11906269791006967, + "grad_norm": 4.845182418823242, + "learning_rate": 4.761904761904762e-06, + "loss": 1.2158, + "step": 470 + }, + { + "epoch": 0.11931602279924003, + "grad_norm": 5.734543800354004, + "learning_rate": 4.772036474164134e-06, + "loss": 1.3147, + "step": 471 + }, + { + "epoch": 0.11956934768841039, + "grad_norm": 5.24599027633667, + "learning_rate": 4.782168186423506e-06, + "loss": 1.2151, + "step": 472 + }, + { + "epoch": 0.11982267257758075, + "grad_norm": 4.8684163093566895, + "learning_rate": 4.792299898682878e-06, + "loss": 1.1434, + "step": 473 + }, + { + "epoch": 0.12007599746675111, + "grad_norm": 4.637145042419434, + "learning_rate": 4.80243161094225e-06, + "loss": 1.1748, + "step": 474 + }, + { + "epoch": 0.12032932235592148, + "grad_norm": 4.772747993469238, + "learning_rate": 4.812563323201621e-06, + "loss": 1.1442, + "step": 475 + }, + { + "epoch": 0.12058264724509184, + "grad_norm": 4.968971252441406, + "learning_rate": 4.822695035460993e-06, + "loss": 1.2552, + "step": 476 + }, + { + "epoch": 0.12083597213426218, + "grad_norm": 5.498548984527588, + "learning_rate": 4.832826747720365e-06, + "loss": 1.1324, + "step": 477 + }, + { + "epoch": 0.12108929702343255, + "grad_norm": 5.052052974700928, + "learning_rate": 4.842958459979737e-06, + "loss": 1.2966, + "step": 478 + }, + { + "epoch": 0.12134262191260291, + "grad_norm": 5.156652450561523, + "learning_rate": 4.853090172239109e-06, + "loss": 1.1934, + "step": 479 + }, + { + "epoch": 0.12159594680177327, + "grad_norm": 5.284748554229736, + "learning_rate": 4.863221884498481e-06, + "loss": 1.2785, + "step": 480 + }, + { + "epoch": 0.12184927169094363, + "grad_norm": 4.996010780334473, + "learning_rate": 4.873353596757853e-06, + "loss": 1.1932, + "step": 481 + }, + { + "epoch": 0.12210259658011399, + "grad_norm": 5.440031051635742, + "learning_rate": 4.883485309017224e-06, + "loss": 1.2987, + "step": 482 + }, + { + "epoch": 0.12235592146928435, + "grad_norm": 5.136556148529053, + "learning_rate": 4.893617021276596e-06, + "loss": 1.0797, + "step": 483 + }, + { + "epoch": 0.12260924635845472, + "grad_norm": 5.024312496185303, + "learning_rate": 4.9037487335359675e-06, + "loss": 1.1816, + "step": 484 + }, + { + "epoch": 0.12286257124762508, + "grad_norm": 5.054494380950928, + "learning_rate": 4.91388044579534e-06, + "loss": 1.3098, + "step": 485 + }, + { + "epoch": 0.12311589613679544, + "grad_norm": 5.291783332824707, + "learning_rate": 4.924012158054712e-06, + "loss": 1.2519, + "step": 486 + }, + { + "epoch": 0.1233692210259658, + "grad_norm": 5.203293800354004, + "learning_rate": 4.934143870314084e-06, + "loss": 1.2508, + "step": 487 + }, + { + "epoch": 0.12362254591513616, + "grad_norm": 4.995335578918457, + "learning_rate": 4.944275582573455e-06, + "loss": 1.1623, + "step": 488 + }, + { + "epoch": 0.12387587080430652, + "grad_norm": 4.85042667388916, + "learning_rate": 4.954407294832827e-06, + "loss": 1.2747, + "step": 489 + }, + { + "epoch": 0.12412919569347688, + "grad_norm": 5.630333423614502, + "learning_rate": 4.964539007092199e-06, + "loss": 1.3355, + "step": 490 + }, + { + "epoch": 0.12438252058264725, + "grad_norm": 5.215515613555908, + "learning_rate": 4.9746707193515705e-06, + "loss": 1.1913, + "step": 491 + }, + { + "epoch": 0.12463584547181761, + "grad_norm": 4.697793006896973, + "learning_rate": 4.984802431610943e-06, + "loss": 1.1055, + "step": 492 + }, + { + "epoch": 0.12488917036098797, + "grad_norm": 5.270546913146973, + "learning_rate": 4.994934143870315e-06, + "loss": 1.1722, + "step": 493 + }, + { + "epoch": 0.12514249525015833, + "grad_norm": 5.559477806091309, + "learning_rate": 5.005065856129686e-06, + "loss": 1.2904, + "step": 494 + }, + { + "epoch": 0.12539582013932868, + "grad_norm": 4.968261241912842, + "learning_rate": 5.015197568389059e-06, + "loss": 1.1808, + "step": 495 + }, + { + "epoch": 0.12564914502849905, + "grad_norm": 5.030310153961182, + "learning_rate": 5.02532928064843e-06, + "loss": 1.2484, + "step": 496 + }, + { + "epoch": 0.1259024699176694, + "grad_norm": 4.8509840965271, + "learning_rate": 5.035460992907801e-06, + "loss": 1.3076, + "step": 497 + }, + { + "epoch": 0.12615579480683978, + "grad_norm": 4.936345100402832, + "learning_rate": 5.0455927051671735e-06, + "loss": 1.1693, + "step": 498 + }, + { + "epoch": 0.12640911969601012, + "grad_norm": 5.1634392738342285, + "learning_rate": 5.055724417426545e-06, + "loss": 1.2937, + "step": 499 + }, + { + "epoch": 0.1266624445851805, + "grad_norm": 4.734792709350586, + "learning_rate": 5.065856129685918e-06, + "loss": 1.1153, + "step": 500 + }, + { + "epoch": 0.1266624445851805, + "eval_loss": 1.2462952136993408, + "eval_runtime": 12.2985, + "eval_samples_per_second": 32.524, + "eval_steps_per_second": 4.066, + "step": 500 + }, + { + "epoch": 0.12691576947435085, + "grad_norm": 5.271373271942139, + "learning_rate": 5.075987841945289e-06, + "loss": 1.1751, + "step": 501 + }, + { + "epoch": 0.12716909436352122, + "grad_norm": 4.605556488037109, + "learning_rate": 5.086119554204662e-06, + "loss": 1.1828, + "step": 502 + }, + { + "epoch": 0.12742241925269157, + "grad_norm": 4.6827311515808105, + "learning_rate": 5.096251266464033e-06, + "loss": 1.1332, + "step": 503 + }, + { + "epoch": 0.12767574414186195, + "grad_norm": 6.322502613067627, + "learning_rate": 5.106382978723404e-06, + "loss": 1.1816, + "step": 504 + }, + { + "epoch": 0.1279290690310323, + "grad_norm": 4.943698883056641, + "learning_rate": 5.1165146909827765e-06, + "loss": 1.1607, + "step": 505 + }, + { + "epoch": 0.12818239392020267, + "grad_norm": 4.914384841918945, + "learning_rate": 5.126646403242148e-06, + "loss": 1.1669, + "step": 506 + }, + { + "epoch": 0.12843571880937302, + "grad_norm": 5.10693359375, + "learning_rate": 5.136778115501521e-06, + "loss": 1.2683, + "step": 507 + }, + { + "epoch": 0.1286890436985434, + "grad_norm": 5.151780128479004, + "learning_rate": 5.146909827760892e-06, + "loss": 1.2711, + "step": 508 + }, + { + "epoch": 0.12894236858771374, + "grad_norm": 4.811371803283691, + "learning_rate": 5.157041540020263e-06, + "loss": 1.2546, + "step": 509 + }, + { + "epoch": 0.12919569347688412, + "grad_norm": 4.895148277282715, + "learning_rate": 5.1671732522796354e-06, + "loss": 1.2519, + "step": 510 + }, + { + "epoch": 0.12944901836605446, + "grad_norm": 4.593327522277832, + "learning_rate": 5.177304964539007e-06, + "loss": 1.2683, + "step": 511 + }, + { + "epoch": 0.12970234325522484, + "grad_norm": 5.491243839263916, + "learning_rate": 5.1874366767983795e-06, + "loss": 1.3801, + "step": 512 + }, + { + "epoch": 0.1299556681443952, + "grad_norm": 4.669014930725098, + "learning_rate": 5.197568389057751e-06, + "loss": 1.076, + "step": 513 + }, + { + "epoch": 0.13020899303356553, + "grad_norm": 4.788550853729248, + "learning_rate": 5.207700101317124e-06, + "loss": 1.0858, + "step": 514 + }, + { + "epoch": 0.1304623179227359, + "grad_norm": 5.297513008117676, + "learning_rate": 5.217831813576495e-06, + "loss": 1.263, + "step": 515 + }, + { + "epoch": 0.13071564281190626, + "grad_norm": 4.7116923332214355, + "learning_rate": 5.227963525835866e-06, + "loss": 1.1378, + "step": 516 + }, + { + "epoch": 0.13096896770107663, + "grad_norm": 4.830559730529785, + "learning_rate": 5.2380952380952384e-06, + "loss": 1.1793, + "step": 517 + }, + { + "epoch": 0.13122229259024698, + "grad_norm": 4.668907165527344, + "learning_rate": 5.24822695035461e-06, + "loss": 1.1554, + "step": 518 + }, + { + "epoch": 0.13147561747941736, + "grad_norm": 5.034953594207764, + "learning_rate": 5.2583586626139825e-06, + "loss": 1.2435, + "step": 519 + }, + { + "epoch": 0.1317289423685877, + "grad_norm": 4.942272186279297, + "learning_rate": 5.268490374873354e-06, + "loss": 1.1792, + "step": 520 + }, + { + "epoch": 0.13198226725775808, + "grad_norm": 4.644348621368408, + "learning_rate": 5.278622087132726e-06, + "loss": 1.2066, + "step": 521 + }, + { + "epoch": 0.13223559214692843, + "grad_norm": 4.6449785232543945, + "learning_rate": 5.288753799392098e-06, + "loss": 1.1041, + "step": 522 + }, + { + "epoch": 0.1324889170360988, + "grad_norm": 4.8624186515808105, + "learning_rate": 5.298885511651469e-06, + "loss": 1.2315, + "step": 523 + }, + { + "epoch": 0.13274224192526915, + "grad_norm": 4.965691566467285, + "learning_rate": 5.3090172239108414e-06, + "loss": 1.3935, + "step": 524 + }, + { + "epoch": 0.13299556681443953, + "grad_norm": 4.97011137008667, + "learning_rate": 5.319148936170213e-06, + "loss": 1.414, + "step": 525 + }, + { + "epoch": 0.13324889170360987, + "grad_norm": 4.666852951049805, + "learning_rate": 5.3292806484295855e-06, + "loss": 1.1865, + "step": 526 + }, + { + "epoch": 0.13350221659278025, + "grad_norm": 5.110204219818115, + "learning_rate": 5.339412360688957e-06, + "loss": 1.0862, + "step": 527 + }, + { + "epoch": 0.1337555414819506, + "grad_norm": 5.102944850921631, + "learning_rate": 5.349544072948329e-06, + "loss": 1.2819, + "step": 528 + }, + { + "epoch": 0.13400886637112097, + "grad_norm": 4.5785980224609375, + "learning_rate": 5.359675785207701e-06, + "loss": 1.2022, + "step": 529 + }, + { + "epoch": 0.13426219126029132, + "grad_norm": 4.983090400695801, + "learning_rate": 5.369807497467072e-06, + "loss": 1.2425, + "step": 530 + }, + { + "epoch": 0.1345155161494617, + "grad_norm": 5.571552753448486, + "learning_rate": 5.3799392097264444e-06, + "loss": 1.334, + "step": 531 + }, + { + "epoch": 0.13476884103863204, + "grad_norm": 4.813221454620361, + "learning_rate": 5.390070921985816e-06, + "loss": 1.2187, + "step": 532 + }, + { + "epoch": 0.13502216592780242, + "grad_norm": 4.895610809326172, + "learning_rate": 5.400202634245188e-06, + "loss": 1.1801, + "step": 533 + }, + { + "epoch": 0.13527549081697277, + "grad_norm": 5.426584243774414, + "learning_rate": 5.41033434650456e-06, + "loss": 1.392, + "step": 534 + }, + { + "epoch": 0.13552881570614314, + "grad_norm": 4.887606143951416, + "learning_rate": 5.420466058763931e-06, + "loss": 1.2473, + "step": 535 + }, + { + "epoch": 0.1357821405953135, + "grad_norm": 5.3059401512146, + "learning_rate": 5.430597771023303e-06, + "loss": 1.2066, + "step": 536 + }, + { + "epoch": 0.13603546548448386, + "grad_norm": 5.063421726226807, + "learning_rate": 5.440729483282675e-06, + "loss": 1.2607, + "step": 537 + }, + { + "epoch": 0.1362887903736542, + "grad_norm": 5.2057294845581055, + "learning_rate": 5.450861195542047e-06, + "loss": 1.3135, + "step": 538 + }, + { + "epoch": 0.13654211526282456, + "grad_norm": 5.15851354598999, + "learning_rate": 5.460992907801419e-06, + "loss": 1.3632, + "step": 539 + }, + { + "epoch": 0.13679544015199493, + "grad_norm": 5.122459411621094, + "learning_rate": 5.471124620060791e-06, + "loss": 1.1665, + "step": 540 + }, + { + "epoch": 0.13704876504116528, + "grad_norm": 5.20040225982666, + "learning_rate": 5.481256332320163e-06, + "loss": 1.2699, + "step": 541 + }, + { + "epoch": 0.13730208993033566, + "grad_norm": 4.900663375854492, + "learning_rate": 5.491388044579534e-06, + "loss": 1.1101, + "step": 542 + }, + { + "epoch": 0.137555414819506, + "grad_norm": 4.9732561111450195, + "learning_rate": 5.501519756838906e-06, + "loss": 1.2512, + "step": 543 + }, + { + "epoch": 0.13780873970867638, + "grad_norm": 4.934910774230957, + "learning_rate": 5.511651469098278e-06, + "loss": 1.1755, + "step": 544 + }, + { + "epoch": 0.13806206459784673, + "grad_norm": 4.943143367767334, + "learning_rate": 5.52178318135765e-06, + "loss": 1.2349, + "step": 545 + }, + { + "epoch": 0.1383153894870171, + "grad_norm": 4.923962116241455, + "learning_rate": 5.531914893617022e-06, + "loss": 1.2429, + "step": 546 + }, + { + "epoch": 0.13856871437618745, + "grad_norm": 5.372138500213623, + "learning_rate": 5.542046605876394e-06, + "loss": 1.2674, + "step": 547 + }, + { + "epoch": 0.13882203926535783, + "grad_norm": 4.940601825714111, + "learning_rate": 5.552178318135766e-06, + "loss": 1.2771, + "step": 548 + }, + { + "epoch": 0.13907536415452817, + "grad_norm": 4.944694995880127, + "learning_rate": 5.562310030395137e-06, + "loss": 1.2695, + "step": 549 + }, + { + "epoch": 0.13932868904369855, + "grad_norm": 5.337344169616699, + "learning_rate": 5.5724417426545085e-06, + "loss": 1.1693, + "step": 550 + }, + { + "epoch": 0.1395820139328689, + "grad_norm": 4.865754127502441, + "learning_rate": 5.582573454913881e-06, + "loss": 1.2209, + "step": 551 + }, + { + "epoch": 0.13983533882203927, + "grad_norm": 4.5765228271484375, + "learning_rate": 5.592705167173253e-06, + "loss": 1.2566, + "step": 552 + }, + { + "epoch": 0.14008866371120962, + "grad_norm": 5.803656101226807, + "learning_rate": 5.602836879432625e-06, + "loss": 1.4674, + "step": 553 + }, + { + "epoch": 0.14034198860038, + "grad_norm": 4.550984859466553, + "learning_rate": 5.612968591691997e-06, + "loss": 1.1272, + "step": 554 + }, + { + "epoch": 0.14059531348955034, + "grad_norm": 4.758320331573486, + "learning_rate": 5.623100303951369e-06, + "loss": 1.0811, + "step": 555 + }, + { + "epoch": 0.14084863837872072, + "grad_norm": 4.948498725891113, + "learning_rate": 5.63323201621074e-06, + "loss": 1.2059, + "step": 556 + }, + { + "epoch": 0.14110196326789107, + "grad_norm": 4.6559858322143555, + "learning_rate": 5.6433637284701115e-06, + "loss": 1.0973, + "step": 557 + }, + { + "epoch": 0.14135528815706144, + "grad_norm": 4.768313407897949, + "learning_rate": 5.653495440729484e-06, + "loss": 1.2299, + "step": 558 + }, + { + "epoch": 0.1416086130462318, + "grad_norm": 5.055200099945068, + "learning_rate": 5.663627152988856e-06, + "loss": 1.2204, + "step": 559 + }, + { + "epoch": 0.14186193793540217, + "grad_norm": 5.64510440826416, + "learning_rate": 5.673758865248228e-06, + "loss": 1.3287, + "step": 560 + }, + { + "epoch": 0.1421152628245725, + "grad_norm": 5.069836616516113, + "learning_rate": 5.683890577507599e-06, + "loss": 1.1542, + "step": 561 + }, + { + "epoch": 0.1423685877137429, + "grad_norm": 4.7821478843688965, + "learning_rate": 5.6940222897669704e-06, + "loss": 1.3209, + "step": 562 + }, + { + "epoch": 0.14262191260291324, + "grad_norm": 5.0057148933410645, + "learning_rate": 5.704154002026343e-06, + "loss": 1.1426, + "step": 563 + }, + { + "epoch": 0.14287523749208358, + "grad_norm": 4.5058913230896, + "learning_rate": 5.7142857142857145e-06, + "loss": 1.1116, + "step": 564 + }, + { + "epoch": 0.14312856238125396, + "grad_norm": 5.375939846038818, + "learning_rate": 5.724417426545087e-06, + "loss": 1.2233, + "step": 565 + }, + { + "epoch": 0.1433818872704243, + "grad_norm": 5.549449443817139, + "learning_rate": 5.734549138804459e-06, + "loss": 1.2536, + "step": 566 + }, + { + "epoch": 0.14363521215959468, + "grad_norm": 4.731637001037598, + "learning_rate": 5.744680851063831e-06, + "loss": 1.145, + "step": 567 + }, + { + "epoch": 0.14388853704876503, + "grad_norm": 4.841800212860107, + "learning_rate": 5.754812563323202e-06, + "loss": 1.1289, + "step": 568 + }, + { + "epoch": 0.1441418619379354, + "grad_norm": 4.759994029998779, + "learning_rate": 5.7649442755825734e-06, + "loss": 1.0788, + "step": 569 + }, + { + "epoch": 0.14439518682710575, + "grad_norm": 5.397213459014893, + "learning_rate": 5.775075987841946e-06, + "loss": 1.4156, + "step": 570 + }, + { + "epoch": 0.14464851171627613, + "grad_norm": 5.066045761108398, + "learning_rate": 5.7852077001013175e-06, + "loss": 1.2685, + "step": 571 + }, + { + "epoch": 0.14490183660544648, + "grad_norm": 4.570275783538818, + "learning_rate": 5.79533941236069e-06, + "loss": 1.174, + "step": 572 + }, + { + "epoch": 0.14515516149461685, + "grad_norm": 4.592137336730957, + "learning_rate": 5.805471124620062e-06, + "loss": 1.1032, + "step": 573 + }, + { + "epoch": 0.1454084863837872, + "grad_norm": 4.847631454467773, + "learning_rate": 5.815602836879432e-06, + "loss": 1.2636, + "step": 574 + }, + { + "epoch": 0.14566181127295758, + "grad_norm": 5.709230899810791, + "learning_rate": 5.825734549138805e-06, + "loss": 1.282, + "step": 575 + }, + { + "epoch": 0.14591513616212792, + "grad_norm": 5.450544357299805, + "learning_rate": 5.8358662613981764e-06, + "loss": 1.1158, + "step": 576 + }, + { + "epoch": 0.1461684610512983, + "grad_norm": 4.97231912612915, + "learning_rate": 5.845997973657549e-06, + "loss": 1.2709, + "step": 577 + }, + { + "epoch": 0.14642178594046865, + "grad_norm": 4.959066867828369, + "learning_rate": 5.8561296859169205e-06, + "loss": 1.1452, + "step": 578 + }, + { + "epoch": 0.14667511082963902, + "grad_norm": 5.879068374633789, + "learning_rate": 5.866261398176293e-06, + "loss": 1.386, + "step": 579 + }, + { + "epoch": 0.14692843571880937, + "grad_norm": 5.1025710105896, + "learning_rate": 5.876393110435665e-06, + "loss": 1.269, + "step": 580 + }, + { + "epoch": 0.14718176060797974, + "grad_norm": 5.202458381652832, + "learning_rate": 5.886524822695035e-06, + "loss": 1.2261, + "step": 581 + }, + { + "epoch": 0.1474350854971501, + "grad_norm": 5.0866475105285645, + "learning_rate": 5.896656534954408e-06, + "loss": 1.2695, + "step": 582 + }, + { + "epoch": 0.14768841038632047, + "grad_norm": 5.054940223693848, + "learning_rate": 5.9067882472137794e-06, + "loss": 1.2914, + "step": 583 + }, + { + "epoch": 0.14794173527549082, + "grad_norm": 4.855292797088623, + "learning_rate": 5.916919959473152e-06, + "loss": 1.2319, + "step": 584 + }, + { + "epoch": 0.1481950601646612, + "grad_norm": 4.923979759216309, + "learning_rate": 5.9270516717325235e-06, + "loss": 1.228, + "step": 585 + }, + { + "epoch": 0.14844838505383154, + "grad_norm": 4.366090297698975, + "learning_rate": 5.937183383991894e-06, + "loss": 1.145, + "step": 586 + }, + { + "epoch": 0.1487017099430019, + "grad_norm": 4.820800304412842, + "learning_rate": 5.947315096251267e-06, + "loss": 1.1171, + "step": 587 + }, + { + "epoch": 0.14895503483217226, + "grad_norm": 5.232685089111328, + "learning_rate": 5.957446808510638e-06, + "loss": 1.1695, + "step": 588 + }, + { + "epoch": 0.1492083597213426, + "grad_norm": 5.733992576599121, + "learning_rate": 5.967578520770011e-06, + "loss": 1.2805, + "step": 589 + }, + { + "epoch": 0.14946168461051298, + "grad_norm": 5.02573299407959, + "learning_rate": 5.9777102330293824e-06, + "loss": 1.1743, + "step": 590 + }, + { + "epoch": 0.14971500949968333, + "grad_norm": 4.9159626960754395, + "learning_rate": 5.987841945288755e-06, + "loss": 1.2444, + "step": 591 + }, + { + "epoch": 0.1499683343888537, + "grad_norm": 4.857386112213135, + "learning_rate": 5.9979736575481265e-06, + "loss": 1.278, + "step": 592 + }, + { + "epoch": 0.15022165927802406, + "grad_norm": 5.224244594573975, + "learning_rate": 6.008105369807497e-06, + "loss": 1.436, + "step": 593 + }, + { + "epoch": 0.15047498416719443, + "grad_norm": 4.591535568237305, + "learning_rate": 6.01823708206687e-06, + "loss": 1.2383, + "step": 594 + }, + { + "epoch": 0.15072830905636478, + "grad_norm": 5.028872489929199, + "learning_rate": 6.028368794326241e-06, + "loss": 1.3694, + "step": 595 + }, + { + "epoch": 0.15098163394553515, + "grad_norm": 5.284378528594971, + "learning_rate": 6.038500506585614e-06, + "loss": 1.3386, + "step": 596 + }, + { + "epoch": 0.1512349588347055, + "grad_norm": 4.551464557647705, + "learning_rate": 6.0486322188449854e-06, + "loss": 1.2189, + "step": 597 + }, + { + "epoch": 0.15148828372387588, + "grad_norm": 5.2143778800964355, + "learning_rate": 6.058763931104357e-06, + "loss": 1.1644, + "step": 598 + }, + { + "epoch": 0.15174160861304622, + "grad_norm": 4.9922099113464355, + "learning_rate": 6.0688956433637295e-06, + "loss": 1.298, + "step": 599 + }, + { + "epoch": 0.1519949335022166, + "grad_norm": 5.037347316741943, + "learning_rate": 6.0790273556231e-06, + "loss": 1.348, + "step": 600 + }, + { + "epoch": 0.15224825839138695, + "grad_norm": 4.79486608505249, + "learning_rate": 6.089159067882473e-06, + "loss": 1.242, + "step": 601 + }, + { + "epoch": 0.15250158328055732, + "grad_norm": 4.395257949829102, + "learning_rate": 6.099290780141844e-06, + "loss": 1.1533, + "step": 602 + }, + { + "epoch": 0.15275490816972767, + "grad_norm": 4.24887228012085, + "learning_rate": 6.109422492401216e-06, + "loss": 1.1706, + "step": 603 + }, + { + "epoch": 0.15300823305889805, + "grad_norm": 4.749911308288574, + "learning_rate": 6.1195542046605884e-06, + "loss": 1.1852, + "step": 604 + }, + { + "epoch": 0.1532615579480684, + "grad_norm": 5.326765537261963, + "learning_rate": 6.12968591691996e-06, + "loss": 1.2012, + "step": 605 + }, + { + "epoch": 0.15351488283723877, + "grad_norm": 4.694608211517334, + "learning_rate": 6.1398176291793325e-06, + "loss": 1.1974, + "step": 606 + }, + { + "epoch": 0.15376820772640912, + "grad_norm": 4.689748764038086, + "learning_rate": 6.149949341438703e-06, + "loss": 1.2694, + "step": 607 + }, + { + "epoch": 0.1540215326155795, + "grad_norm": 4.834078788757324, + "learning_rate": 6.160081053698076e-06, + "loss": 1.2839, + "step": 608 + }, + { + "epoch": 0.15427485750474984, + "grad_norm": 4.931235313415527, + "learning_rate": 6.170212765957447e-06, + "loss": 1.2335, + "step": 609 + }, + { + "epoch": 0.15452818239392022, + "grad_norm": 4.4806060791015625, + "learning_rate": 6.180344478216819e-06, + "loss": 1.1619, + "step": 610 + }, + { + "epoch": 0.15478150728309056, + "grad_norm": 4.908557891845703, + "learning_rate": 6.1904761904761914e-06, + "loss": 1.2526, + "step": 611 + }, + { + "epoch": 0.1550348321722609, + "grad_norm": 4.620062828063965, + "learning_rate": 6.200607902735562e-06, + "loss": 1.2694, + "step": 612 + }, + { + "epoch": 0.1552881570614313, + "grad_norm": 4.872439384460449, + "learning_rate": 6.210739614994935e-06, + "loss": 1.1727, + "step": 613 + }, + { + "epoch": 0.15554148195060163, + "grad_norm": 4.710540294647217, + "learning_rate": 6.220871327254306e-06, + "loss": 1.1959, + "step": 614 + }, + { + "epoch": 0.155794806839772, + "grad_norm": 4.609006881713867, + "learning_rate": 6.231003039513678e-06, + "loss": 1.2663, + "step": 615 + }, + { + "epoch": 0.15604813172894236, + "grad_norm": 4.969557285308838, + "learning_rate": 6.24113475177305e-06, + "loss": 1.1245, + "step": 616 + }, + { + "epoch": 0.15630145661811273, + "grad_norm": 4.4855055809021, + "learning_rate": 6.251266464032422e-06, + "loss": 1.2075, + "step": 617 + }, + { + "epoch": 0.15655478150728308, + "grad_norm": 4.479387283325195, + "learning_rate": 6.2613981762917944e-06, + "loss": 1.254, + "step": 618 + }, + { + "epoch": 0.15680810639645346, + "grad_norm": 4.780701637268066, + "learning_rate": 6.271529888551165e-06, + "loss": 1.2976, + "step": 619 + }, + { + "epoch": 0.1570614312856238, + "grad_norm": 4.669686317443848, + "learning_rate": 6.281661600810538e-06, + "loss": 1.1554, + "step": 620 + }, + { + "epoch": 0.15731475617479418, + "grad_norm": 4.6630330085754395, + "learning_rate": 6.291793313069909e-06, + "loss": 1.1868, + "step": 621 + }, + { + "epoch": 0.15756808106396453, + "grad_norm": 4.786992073059082, + "learning_rate": 6.301925025329281e-06, + "loss": 1.1817, + "step": 622 + }, + { + "epoch": 0.1578214059531349, + "grad_norm": 4.614963054656982, + "learning_rate": 6.312056737588653e-06, + "loss": 1.0693, + "step": 623 + }, + { + "epoch": 0.15807473084230525, + "grad_norm": 4.794111728668213, + "learning_rate": 6.322188449848025e-06, + "loss": 1.2119, + "step": 624 + }, + { + "epoch": 0.15832805573147563, + "grad_norm": 4.7772440910339355, + "learning_rate": 6.3323201621073974e-06, + "loss": 1.1036, + "step": 625 + }, + { + "epoch": 0.15858138062064597, + "grad_norm": 4.855818748474121, + "learning_rate": 6.342451874366768e-06, + "loss": 1.2127, + "step": 626 + }, + { + "epoch": 0.15883470550981635, + "grad_norm": 4.64150333404541, + "learning_rate": 6.35258358662614e-06, + "loss": 1.1563, + "step": 627 + }, + { + "epoch": 0.1590880303989867, + "grad_norm": 5.098568439483643, + "learning_rate": 6.362715298885512e-06, + "loss": 1.2082, + "step": 628 + }, + { + "epoch": 0.15934135528815707, + "grad_norm": 4.789144515991211, + "learning_rate": 6.372847011144884e-06, + "loss": 1.2472, + "step": 629 + }, + { + "epoch": 0.15959468017732742, + "grad_norm": 4.904264450073242, + "learning_rate": 6.382978723404256e-06, + "loss": 1.3159, + "step": 630 + }, + { + "epoch": 0.1598480050664978, + "grad_norm": 5.040409088134766, + "learning_rate": 6.393110435663628e-06, + "loss": 1.293, + "step": 631 + }, + { + "epoch": 0.16010132995566814, + "grad_norm": 5.19047737121582, + "learning_rate": 6.403242147923e-06, + "loss": 1.258, + "step": 632 + }, + { + "epoch": 0.16035465484483852, + "grad_norm": 4.936978816986084, + "learning_rate": 6.413373860182371e-06, + "loss": 1.3897, + "step": 633 + }, + { + "epoch": 0.16060797973400887, + "grad_norm": 4.566588401794434, + "learning_rate": 6.423505572441743e-06, + "loss": 1.1514, + "step": 634 + }, + { + "epoch": 0.16086130462317924, + "grad_norm": 4.738023281097412, + "learning_rate": 6.433637284701115e-06, + "loss": 1.1792, + "step": 635 + }, + { + "epoch": 0.1611146295123496, + "grad_norm": 5.172008037567139, + "learning_rate": 6.443768996960487e-06, + "loss": 1.2696, + "step": 636 + }, + { + "epoch": 0.16136795440151994, + "grad_norm": 4.78951358795166, + "learning_rate": 6.453900709219859e-06, + "loss": 1.487, + "step": 637 + }, + { + "epoch": 0.1616212792906903, + "grad_norm": 4.4658989906311035, + "learning_rate": 6.46403242147923e-06, + "loss": 1.1498, + "step": 638 + }, + { + "epoch": 0.16187460417986066, + "grad_norm": 4.382638454437256, + "learning_rate": 6.474164133738602e-06, + "loss": 1.1615, + "step": 639 + }, + { + "epoch": 0.16212792906903103, + "grad_norm": 4.52036190032959, + "learning_rate": 6.484295845997974e-06, + "loss": 1.191, + "step": 640 + }, + { + "epoch": 0.16238125395820138, + "grad_norm": 4.304661750793457, + "learning_rate": 6.494427558257346e-06, + "loss": 1.1075, + "step": 641 + }, + { + "epoch": 0.16263457884737176, + "grad_norm": 4.753248691558838, + "learning_rate": 6.504559270516718e-06, + "loss": 1.2028, + "step": 642 + }, + { + "epoch": 0.1628879037365421, + "grad_norm": 4.911106586456299, + "learning_rate": 6.51469098277609e-06, + "loss": 1.3859, + "step": 643 + }, + { + "epoch": 0.16314122862571248, + "grad_norm": 5.116793155670166, + "learning_rate": 6.524822695035462e-06, + "loss": 1.1626, + "step": 644 + }, + { + "epoch": 0.16339455351488283, + "grad_norm": 5.212765693664551, + "learning_rate": 6.534954407294833e-06, + "loss": 1.2837, + "step": 645 + }, + { + "epoch": 0.1636478784040532, + "grad_norm": 4.747133731842041, + "learning_rate": 6.545086119554205e-06, + "loss": 1.2263, + "step": 646 + }, + { + "epoch": 0.16390120329322355, + "grad_norm": 4.83426570892334, + "learning_rate": 6.555217831813577e-06, + "loss": 1.2202, + "step": 647 + }, + { + "epoch": 0.16415452818239393, + "grad_norm": 4.844605922698975, + "learning_rate": 6.565349544072949e-06, + "loss": 1.1607, + "step": 648 + }, + { + "epoch": 0.16440785307156427, + "grad_norm": 4.556807518005371, + "learning_rate": 6.575481256332321e-06, + "loss": 1.2303, + "step": 649 + }, + { + "epoch": 0.16466117796073465, + "grad_norm": 4.369943618774414, + "learning_rate": 6.585612968591693e-06, + "loss": 1.1333, + "step": 650 + }, + { + "epoch": 0.164914502849905, + "grad_norm": 4.572859287261963, + "learning_rate": 6.595744680851064e-06, + "loss": 1.1399, + "step": 651 + }, + { + "epoch": 0.16516782773907537, + "grad_norm": 4.5565505027771, + "learning_rate": 6.605876393110436e-06, + "loss": 1.2068, + "step": 652 + }, + { + "epoch": 0.16542115262824572, + "grad_norm": 4.9332170486450195, + "learning_rate": 6.616008105369808e-06, + "loss": 1.4343, + "step": 653 + }, + { + "epoch": 0.1656744775174161, + "grad_norm": 5.112438201904297, + "learning_rate": 6.62613981762918e-06, + "loss": 1.3254, + "step": 654 + }, + { + "epoch": 0.16592780240658644, + "grad_norm": 5.191864967346191, + "learning_rate": 6.636271529888552e-06, + "loss": 1.3029, + "step": 655 + }, + { + "epoch": 0.16618112729575682, + "grad_norm": 4.797086715698242, + "learning_rate": 6.646403242147924e-06, + "loss": 1.2374, + "step": 656 + }, + { + "epoch": 0.16643445218492717, + "grad_norm": 5.420797348022461, + "learning_rate": 6.656534954407296e-06, + "loss": 1.2674, + "step": 657 + }, + { + "epoch": 0.16668777707409754, + "grad_norm": 5.221513271331787, + "learning_rate": 6.666666666666667e-06, + "loss": 1.1778, + "step": 658 + }, + { + "epoch": 0.1669411019632679, + "grad_norm": 4.723294258117676, + "learning_rate": 6.676798378926039e-06, + "loss": 1.2055, + "step": 659 + }, + { + "epoch": 0.16719442685243824, + "grad_norm": 5.161945819854736, + "learning_rate": 6.686930091185411e-06, + "loss": 1.3098, + "step": 660 + }, + { + "epoch": 0.1674477517416086, + "grad_norm": 4.428056716918945, + "learning_rate": 6.697061803444783e-06, + "loss": 1.1297, + "step": 661 + }, + { + "epoch": 0.16770107663077896, + "grad_norm": 4.702414035797119, + "learning_rate": 6.707193515704155e-06, + "loss": 1.1794, + "step": 662 + }, + { + "epoch": 0.16795440151994934, + "grad_norm": 4.583940505981445, + "learning_rate": 6.7173252279635256e-06, + "loss": 1.1729, + "step": 663 + }, + { + "epoch": 0.16820772640911968, + "grad_norm": 5.682275295257568, + "learning_rate": 6.727456940222898e-06, + "loss": 1.2655, + "step": 664 + }, + { + "epoch": 0.16846105129829006, + "grad_norm": 4.875946521759033, + "learning_rate": 6.73758865248227e-06, + "loss": 1.1625, + "step": 665 + }, + { + "epoch": 0.1687143761874604, + "grad_norm": 4.4593281745910645, + "learning_rate": 6.747720364741642e-06, + "loss": 1.1104, + "step": 666 + }, + { + "epoch": 0.16896770107663078, + "grad_norm": 5.123931407928467, + "learning_rate": 6.757852077001014e-06, + "loss": 1.3114, + "step": 667 + }, + { + "epoch": 0.16922102596580113, + "grad_norm": 4.7207255363464355, + "learning_rate": 6.767983789260385e-06, + "loss": 1.184, + "step": 668 + }, + { + "epoch": 0.1694743508549715, + "grad_norm": 4.69482946395874, + "learning_rate": 6.778115501519758e-06, + "loss": 1.1978, + "step": 669 + }, + { + "epoch": 0.16972767574414185, + "grad_norm": 4.450202941894531, + "learning_rate": 6.7882472137791286e-06, + "loss": 1.1371, + "step": 670 + }, + { + "epoch": 0.16998100063331223, + "grad_norm": 5.538049221038818, + "learning_rate": 6.798378926038501e-06, + "loss": 1.2752, + "step": 671 + }, + { + "epoch": 0.17023432552248258, + "grad_norm": 4.678607940673828, + "learning_rate": 6.808510638297873e-06, + "loss": 1.305, + "step": 672 + }, + { + "epoch": 0.17048765041165295, + "grad_norm": 4.505133152008057, + "learning_rate": 6.818642350557245e-06, + "loss": 1.1406, + "step": 673 + }, + { + "epoch": 0.1707409753008233, + "grad_norm": 5.682215213775635, + "learning_rate": 6.828774062816617e-06, + "loss": 1.3239, + "step": 674 + }, + { + "epoch": 0.17099430018999368, + "grad_norm": 4.628654479980469, + "learning_rate": 6.838905775075988e-06, + "loss": 1.0611, + "step": 675 + }, + { + "epoch": 0.17124762507916402, + "grad_norm": 4.638597011566162, + "learning_rate": 6.849037487335361e-06, + "loss": 1.3282, + "step": 676 + }, + { + "epoch": 0.1715009499683344, + "grad_norm": 4.467402458190918, + "learning_rate": 6.8591691995947316e-06, + "loss": 1.2017, + "step": 677 + }, + { + "epoch": 0.17175427485750475, + "grad_norm": 4.895129203796387, + "learning_rate": 6.869300911854104e-06, + "loss": 1.2265, + "step": 678 + }, + { + "epoch": 0.17200759974667512, + "grad_norm": 4.73948860168457, + "learning_rate": 6.879432624113476e-06, + "loss": 1.1968, + "step": 679 + }, + { + "epoch": 0.17226092463584547, + "grad_norm": 5.32872200012207, + "learning_rate": 6.889564336372847e-06, + "loss": 1.2781, + "step": 680 + }, + { + "epoch": 0.17251424952501584, + "grad_norm": 4.710712909698486, + "learning_rate": 6.89969604863222e-06, + "loss": 1.2644, + "step": 681 + }, + { + "epoch": 0.1727675744141862, + "grad_norm": 5.227484226226807, + "learning_rate": 6.9098277608915905e-06, + "loss": 1.3654, + "step": 682 + }, + { + "epoch": 0.17302089930335657, + "grad_norm": 4.663438320159912, + "learning_rate": 6.919959473150964e-06, + "loss": 1.1535, + "step": 683 + }, + { + "epoch": 0.17327422419252692, + "grad_norm": 4.927851676940918, + "learning_rate": 6.9300911854103346e-06, + "loss": 1.2334, + "step": 684 + }, + { + "epoch": 0.17352754908169726, + "grad_norm": 4.664185047149658, + "learning_rate": 6.940222897669707e-06, + "loss": 1.3305, + "step": 685 + }, + { + "epoch": 0.17378087397086764, + "grad_norm": 4.821979999542236, + "learning_rate": 6.950354609929079e-06, + "loss": 1.0947, + "step": 686 + }, + { + "epoch": 0.174034198860038, + "grad_norm": 4.539811611175537, + "learning_rate": 6.96048632218845e-06, + "loss": 1.216, + "step": 687 + }, + { + "epoch": 0.17428752374920836, + "grad_norm": 5.37496280670166, + "learning_rate": 6.970618034447823e-06, + "loss": 1.3172, + "step": 688 + }, + { + "epoch": 0.1745408486383787, + "grad_norm": 4.168117523193359, + "learning_rate": 6.9807497467071935e-06, + "loss": 1.0461, + "step": 689 + }, + { + "epoch": 0.17479417352754908, + "grad_norm": 4.736009120941162, + "learning_rate": 6.990881458966566e-06, + "loss": 1.2247, + "step": 690 + }, + { + "epoch": 0.17504749841671943, + "grad_norm": 4.574060440063477, + "learning_rate": 7.0010131712259376e-06, + "loss": 1.1369, + "step": 691 + }, + { + "epoch": 0.1753008233058898, + "grad_norm": 4.718704700469971, + "learning_rate": 7.011144883485309e-06, + "loss": 1.255, + "step": 692 + }, + { + "epoch": 0.17555414819506016, + "grad_norm": 4.60386848449707, + "learning_rate": 7.021276595744682e-06, + "loss": 1.2849, + "step": 693 + }, + { + "epoch": 0.17580747308423053, + "grad_norm": 4.837037086486816, + "learning_rate": 7.031408308004053e-06, + "loss": 1.2205, + "step": 694 + }, + { + "epoch": 0.17606079797340088, + "grad_norm": 4.803045272827148, + "learning_rate": 7.041540020263426e-06, + "loss": 1.1764, + "step": 695 + }, + { + "epoch": 0.17631412286257125, + "grad_norm": 4.914492130279541, + "learning_rate": 7.0516717325227965e-06, + "loss": 1.2196, + "step": 696 + }, + { + "epoch": 0.1765674477517416, + "grad_norm": 5.577901363372803, + "learning_rate": 7.061803444782169e-06, + "loss": 1.2251, + "step": 697 + }, + { + "epoch": 0.17682077264091198, + "grad_norm": 4.781679630279541, + "learning_rate": 7.0719351570415406e-06, + "loss": 1.1626, + "step": 698 + }, + { + "epoch": 0.17707409753008233, + "grad_norm": 4.7842535972595215, + "learning_rate": 7.082066869300912e-06, + "loss": 1.1781, + "step": 699 + }, + { + "epoch": 0.1773274224192527, + "grad_norm": 4.552501201629639, + "learning_rate": 7.092198581560285e-06, + "loss": 1.1998, + "step": 700 + }, + { + "epoch": 0.17758074730842305, + "grad_norm": 4.845871925354004, + "learning_rate": 7.102330293819656e-06, + "loss": 1.2929, + "step": 701 + }, + { + "epoch": 0.17783407219759342, + "grad_norm": 4.504978179931641, + "learning_rate": 7.112462006079029e-06, + "loss": 1.0928, + "step": 702 + }, + { + "epoch": 0.17808739708676377, + "grad_norm": 4.638769626617432, + "learning_rate": 7.1225937183383995e-06, + "loss": 1.207, + "step": 703 + }, + { + "epoch": 0.17834072197593415, + "grad_norm": 4.376689434051514, + "learning_rate": 7.132725430597771e-06, + "loss": 1.2776, + "step": 704 + }, + { + "epoch": 0.1785940468651045, + "grad_norm": 4.428815841674805, + "learning_rate": 7.1428571428571436e-06, + "loss": 1.2085, + "step": 705 + }, + { + "epoch": 0.17884737175427487, + "grad_norm": 4.936357021331787, + "learning_rate": 7.152988855116515e-06, + "loss": 1.2298, + "step": 706 + }, + { + "epoch": 0.17910069664344522, + "grad_norm": 4.868915557861328, + "learning_rate": 7.163120567375888e-06, + "loss": 1.2821, + "step": 707 + }, + { + "epoch": 0.1793540215326156, + "grad_norm": 4.827498435974121, + "learning_rate": 7.173252279635258e-06, + "loss": 1.1198, + "step": 708 + }, + { + "epoch": 0.17960734642178594, + "grad_norm": 4.9391984939575195, + "learning_rate": 7.183383991894632e-06, + "loss": 1.333, + "step": 709 + }, + { + "epoch": 0.1798606713109563, + "grad_norm": 4.812231063842773, + "learning_rate": 7.1935157041540025e-06, + "loss": 1.3199, + "step": 710 + }, + { + "epoch": 0.18011399620012666, + "grad_norm": 4.894856929779053, + "learning_rate": 7.203647416413374e-06, + "loss": 1.4395, + "step": 711 + }, + { + "epoch": 0.180367321089297, + "grad_norm": 4.840929985046387, + "learning_rate": 7.2137791286727466e-06, + "loss": 1.1412, + "step": 712 + }, + { + "epoch": 0.1806206459784674, + "grad_norm": 4.748245716094971, + "learning_rate": 7.223910840932118e-06, + "loss": 1.2242, + "step": 713 + }, + { + "epoch": 0.18087397086763773, + "grad_norm": 4.636096000671387, + "learning_rate": 7.234042553191491e-06, + "loss": 1.3063, + "step": 714 + }, + { + "epoch": 0.1811272957568081, + "grad_norm": 4.838342189788818, + "learning_rate": 7.244174265450861e-06, + "loss": 1.2305, + "step": 715 + }, + { + "epoch": 0.18138062064597846, + "grad_norm": 4.387026786804199, + "learning_rate": 7.254305977710233e-06, + "loss": 1.2357, + "step": 716 + }, + { + "epoch": 0.18163394553514883, + "grad_norm": 4.892704963684082, + "learning_rate": 7.2644376899696055e-06, + "loss": 1.2262, + "step": 717 + }, + { + "epoch": 0.18188727042431918, + "grad_norm": 4.6830058097839355, + "learning_rate": 7.274569402228977e-06, + "loss": 1.1495, + "step": 718 + }, + { + "epoch": 0.18214059531348956, + "grad_norm": 4.722542762756348, + "learning_rate": 7.2847011144883496e-06, + "loss": 1.2518, + "step": 719 + }, + { + "epoch": 0.1823939202026599, + "grad_norm": 4.444228649139404, + "learning_rate": 7.294832826747721e-06, + "loss": 1.3131, + "step": 720 + }, + { + "epoch": 0.18264724509183028, + "grad_norm": 4.4738335609436035, + "learning_rate": 7.304964539007094e-06, + "loss": 1.256, + "step": 721 + }, + { + "epoch": 0.18290056998100063, + "grad_norm": 4.794192314147949, + "learning_rate": 7.315096251266464e-06, + "loss": 1.1494, + "step": 722 + }, + { + "epoch": 0.183153894870171, + "grad_norm": 5.2765679359436035, + "learning_rate": 7.325227963525836e-06, + "loss": 1.1946, + "step": 723 + }, + { + "epoch": 0.18340721975934135, + "grad_norm": 5.034933567047119, + "learning_rate": 7.3353596757852085e-06, + "loss": 1.2822, + "step": 724 + }, + { + "epoch": 0.18366054464851173, + "grad_norm": 4.959056854248047, + "learning_rate": 7.34549138804458e-06, + "loss": 1.2693, + "step": 725 + }, + { + "epoch": 0.18391386953768207, + "grad_norm": 5.423939228057861, + "learning_rate": 7.3556231003039526e-06, + "loss": 1.3931, + "step": 726 + }, + { + "epoch": 0.18416719442685245, + "grad_norm": 4.752450466156006, + "learning_rate": 7.365754812563324e-06, + "loss": 1.3915, + "step": 727 + }, + { + "epoch": 0.1844205193160228, + "grad_norm": 4.22570276260376, + "learning_rate": 7.375886524822695e-06, + "loss": 1.1261, + "step": 728 + }, + { + "epoch": 0.18467384420519317, + "grad_norm": 4.663388729095459, + "learning_rate": 7.386018237082067e-06, + "loss": 1.1313, + "step": 729 + }, + { + "epoch": 0.18492716909436352, + "grad_norm": 4.750311374664307, + "learning_rate": 7.396149949341439e-06, + "loss": 1.218, + "step": 730 + }, + { + "epoch": 0.1851804939835339, + "grad_norm": 5.208932876586914, + "learning_rate": 7.4062816616008115e-06, + "loss": 1.5564, + "step": 731 + }, + { + "epoch": 0.18543381887270424, + "grad_norm": 4.712618350982666, + "learning_rate": 7.416413373860183e-06, + "loss": 1.2767, + "step": 732 + }, + { + "epoch": 0.18568714376187462, + "grad_norm": 5.186469078063965, + "learning_rate": 7.426545086119554e-06, + "loss": 1.1933, + "step": 733 + }, + { + "epoch": 0.18594046865104497, + "grad_norm": 4.9151930809021, + "learning_rate": 7.436676798378927e-06, + "loss": 1.2096, + "step": 734 + }, + { + "epoch": 0.1861937935402153, + "grad_norm": 4.630429744720459, + "learning_rate": 7.446808510638298e-06, + "loss": 1.1771, + "step": 735 + }, + { + "epoch": 0.1864471184293857, + "grad_norm": 4.926432132720947, + "learning_rate": 7.45694022289767e-06, + "loss": 1.2877, + "step": 736 + }, + { + "epoch": 0.18670044331855604, + "grad_norm": 4.804089546203613, + "learning_rate": 7.467071935157042e-06, + "loss": 1.2231, + "step": 737 + }, + { + "epoch": 0.1869537682077264, + "grad_norm": 4.711433410644531, + "learning_rate": 7.4772036474164145e-06, + "loss": 1.3406, + "step": 738 + }, + { + "epoch": 0.18720709309689676, + "grad_norm": 5.337911128997803, + "learning_rate": 7.487335359675786e-06, + "loss": 1.2785, + "step": 739 + }, + { + "epoch": 0.18746041798606713, + "grad_norm": 4.779917240142822, + "learning_rate": 7.497467071935157e-06, + "loss": 1.2526, + "step": 740 + }, + { + "epoch": 0.18771374287523748, + "grad_norm": 4.563136100769043, + "learning_rate": 7.507598784194529e-06, + "loss": 1.2333, + "step": 741 + }, + { + "epoch": 0.18796706776440786, + "grad_norm": 4.497562408447266, + "learning_rate": 7.517730496453901e-06, + "loss": 1.1728, + "step": 742 + }, + { + "epoch": 0.1882203926535782, + "grad_norm": 4.735845565795898, + "learning_rate": 7.527862208713273e-06, + "loss": 1.2258, + "step": 743 + }, + { + "epoch": 0.18847371754274858, + "grad_norm": 4.666139125823975, + "learning_rate": 7.537993920972645e-06, + "loss": 1.2191, + "step": 744 + }, + { + "epoch": 0.18872704243191893, + "grad_norm": 4.478661060333252, + "learning_rate": 7.548125633232017e-06, + "loss": 1.2205, + "step": 745 + }, + { + "epoch": 0.1889803673210893, + "grad_norm": 4.495870113372803, + "learning_rate": 7.558257345491389e-06, + "loss": 1.0589, + "step": 746 + }, + { + "epoch": 0.18923369221025965, + "grad_norm": 5.100306034088135, + "learning_rate": 7.56838905775076e-06, + "loss": 1.3409, + "step": 747 + }, + { + "epoch": 0.18948701709943003, + "grad_norm": 4.780535697937012, + "learning_rate": 7.578520770010132e-06, + "loss": 1.275, + "step": 748 + }, + { + "epoch": 0.18974034198860038, + "grad_norm": 4.6737189292907715, + "learning_rate": 7.588652482269504e-06, + "loss": 1.3216, + "step": 749 + }, + { + "epoch": 0.18999366687777075, + "grad_norm": 4.600954055786133, + "learning_rate": 7.598784194528876e-06, + "loss": 1.1785, + "step": 750 + }, + { + "epoch": 0.1902469917669411, + "grad_norm": 4.667325973510742, + "learning_rate": 7.608915906788248e-06, + "loss": 1.1652, + "step": 751 + }, + { + "epoch": 0.19050031665611147, + "grad_norm": 4.5473856925964355, + "learning_rate": 7.61904761904762e-06, + "loss": 1.3336, + "step": 752 + }, + { + "epoch": 0.19075364154528182, + "grad_norm": 5.475617408752441, + "learning_rate": 7.629179331306992e-06, + "loss": 1.3631, + "step": 753 + }, + { + "epoch": 0.1910069664344522, + "grad_norm": 4.32639217376709, + "learning_rate": 7.639311043566364e-06, + "loss": 1.1886, + "step": 754 + }, + { + "epoch": 0.19126029132362254, + "grad_norm": 4.698390007019043, + "learning_rate": 7.649442755825735e-06, + "loss": 1.2001, + "step": 755 + }, + { + "epoch": 0.19151361621279292, + "grad_norm": 4.61825704574585, + "learning_rate": 7.659574468085107e-06, + "loss": 1.0986, + "step": 756 + }, + { + "epoch": 0.19176694110196327, + "grad_norm": 4.3026628494262695, + "learning_rate": 7.669706180344479e-06, + "loss": 1.2281, + "step": 757 + }, + { + "epoch": 0.19202026599113362, + "grad_norm": 4.374382972717285, + "learning_rate": 7.67983789260385e-06, + "loss": 1.1682, + "step": 758 + }, + { + "epoch": 0.192273590880304, + "grad_norm": 4.806503772735596, + "learning_rate": 7.689969604863222e-06, + "loss": 1.2286, + "step": 759 + }, + { + "epoch": 0.19252691576947434, + "grad_norm": 4.930648326873779, + "learning_rate": 7.700101317122595e-06, + "loss": 1.3067, + "step": 760 + }, + { + "epoch": 0.1927802406586447, + "grad_norm": 6.1269307136535645, + "learning_rate": 7.710233029381967e-06, + "loss": 1.354, + "step": 761 + }, + { + "epoch": 0.19303356554781506, + "grad_norm": 4.767146587371826, + "learning_rate": 7.720364741641338e-06, + "loss": 1.2168, + "step": 762 + }, + { + "epoch": 0.19328689043698544, + "grad_norm": 4.531832695007324, + "learning_rate": 7.73049645390071e-06, + "loss": 1.2485, + "step": 763 + }, + { + "epoch": 0.19354021532615578, + "grad_norm": 4.542140007019043, + "learning_rate": 7.740628166160082e-06, + "loss": 1.2629, + "step": 764 + }, + { + "epoch": 0.19379354021532616, + "grad_norm": 4.538090229034424, + "learning_rate": 7.750759878419453e-06, + "loss": 1.2346, + "step": 765 + }, + { + "epoch": 0.1940468651044965, + "grad_norm": 4.264867782592773, + "learning_rate": 7.760891590678825e-06, + "loss": 1.1366, + "step": 766 + }, + { + "epoch": 0.19430018999366688, + "grad_norm": 4.4305739402771, + "learning_rate": 7.771023302938198e-06, + "loss": 1.1654, + "step": 767 + }, + { + "epoch": 0.19455351488283723, + "grad_norm": 4.640016078948975, + "learning_rate": 7.78115501519757e-06, + "loss": 1.1595, + "step": 768 + }, + { + "epoch": 0.1948068397720076, + "grad_norm": 4.859258651733398, + "learning_rate": 7.79128672745694e-06, + "loss": 1.2217, + "step": 769 + }, + { + "epoch": 0.19506016466117795, + "grad_norm": 4.299515247344971, + "learning_rate": 7.801418439716313e-06, + "loss": 1.1171, + "step": 770 + }, + { + "epoch": 0.19531348955034833, + "grad_norm": 4.902132511138916, + "learning_rate": 7.811550151975685e-06, + "loss": 1.2726, + "step": 771 + }, + { + "epoch": 0.19556681443951868, + "grad_norm": 5.054393291473389, + "learning_rate": 7.821681864235056e-06, + "loss": 1.1047, + "step": 772 + }, + { + "epoch": 0.19582013932868905, + "grad_norm": 4.509024143218994, + "learning_rate": 7.831813576494428e-06, + "loss": 1.1481, + "step": 773 + }, + { + "epoch": 0.1960734642178594, + "grad_norm": 4.73517370223999, + "learning_rate": 7.841945288753801e-06, + "loss": 1.3705, + "step": 774 + }, + { + "epoch": 0.19632678910702978, + "grad_norm": 4.514815330505371, + "learning_rate": 7.852077001013173e-06, + "loss": 1.3325, + "step": 775 + }, + { + "epoch": 0.19658011399620012, + "grad_norm": 4.597283363342285, + "learning_rate": 7.862208713272543e-06, + "loss": 1.2341, + "step": 776 + }, + { + "epoch": 0.1968334388853705, + "grad_norm": 4.144809246063232, + "learning_rate": 7.872340425531916e-06, + "loss": 1.1692, + "step": 777 + }, + { + "epoch": 0.19708676377454085, + "grad_norm": 4.425267696380615, + "learning_rate": 7.882472137791288e-06, + "loss": 1.1706, + "step": 778 + }, + { + "epoch": 0.19734008866371122, + "grad_norm": 4.391229629516602, + "learning_rate": 7.89260385005066e-06, + "loss": 1.2168, + "step": 779 + }, + { + "epoch": 0.19759341355288157, + "grad_norm": 4.830233573913574, + "learning_rate": 7.90273556231003e-06, + "loss": 1.2752, + "step": 780 + }, + { + "epoch": 0.19784673844205194, + "grad_norm": 4.596672534942627, + "learning_rate": 7.912867274569402e-06, + "loss": 1.2317, + "step": 781 + }, + { + "epoch": 0.1981000633312223, + "grad_norm": 4.901708126068115, + "learning_rate": 7.922998986828776e-06, + "loss": 1.2566, + "step": 782 + }, + { + "epoch": 0.19835338822039264, + "grad_norm": 4.467739582061768, + "learning_rate": 7.933130699088146e-06, + "loss": 1.1349, + "step": 783 + }, + { + "epoch": 0.19860671310956302, + "grad_norm": 4.667692184448242, + "learning_rate": 7.943262411347519e-06, + "loss": 1.1607, + "step": 784 + }, + { + "epoch": 0.19886003799873336, + "grad_norm": 4.813882350921631, + "learning_rate": 7.95339412360689e-06, + "loss": 1.3432, + "step": 785 + }, + { + "epoch": 0.19911336288790374, + "grad_norm": 4.331610202789307, + "learning_rate": 7.963525835866262e-06, + "loss": 1.1571, + "step": 786 + }, + { + "epoch": 0.1993666877770741, + "grad_norm": 4.513205528259277, + "learning_rate": 7.973657548125634e-06, + "loss": 1.3929, + "step": 787 + }, + { + "epoch": 0.19962001266624446, + "grad_norm": 4.2903265953063965, + "learning_rate": 7.983789260385005e-06, + "loss": 1.3334, + "step": 788 + }, + { + "epoch": 0.1998733375554148, + "grad_norm": 3.9981842041015625, + "learning_rate": 7.993920972644377e-06, + "loss": 1.0683, + "step": 789 + }, + { + "epoch": 0.20012666244458518, + "grad_norm": 4.327489376068115, + "learning_rate": 8.004052684903749e-06, + "loss": 1.0201, + "step": 790 + }, + { + "epoch": 0.20037998733375553, + "grad_norm": 4.737874984741211, + "learning_rate": 8.014184397163122e-06, + "loss": 1.4405, + "step": 791 + }, + { + "epoch": 0.2006333122229259, + "grad_norm": 4.559230804443359, + "learning_rate": 8.024316109422494e-06, + "loss": 1.2087, + "step": 792 + }, + { + "epoch": 0.20088663711209626, + "grad_norm": 4.605414390563965, + "learning_rate": 8.034447821681865e-06, + "loss": 1.2694, + "step": 793 + }, + { + "epoch": 0.20113996200126663, + "grad_norm": 5.313296794891357, + "learning_rate": 8.044579533941237e-06, + "loss": 1.3344, + "step": 794 + }, + { + "epoch": 0.20139328689043698, + "grad_norm": 5.075579643249512, + "learning_rate": 8.054711246200608e-06, + "loss": 1.361, + "step": 795 + }, + { + "epoch": 0.20164661177960735, + "grad_norm": 4.876029968261719, + "learning_rate": 8.06484295845998e-06, + "loss": 1.2475, + "step": 796 + }, + { + "epoch": 0.2018999366687777, + "grad_norm": 4.533029556274414, + "learning_rate": 8.074974670719352e-06, + "loss": 1.3967, + "step": 797 + }, + { + "epoch": 0.20215326155794808, + "grad_norm": 4.693490028381348, + "learning_rate": 8.085106382978723e-06, + "loss": 1.2355, + "step": 798 + }, + { + "epoch": 0.20240658644711843, + "grad_norm": 4.543484687805176, + "learning_rate": 8.095238095238097e-06, + "loss": 1.1849, + "step": 799 + }, + { + "epoch": 0.2026599113362888, + "grad_norm": 4.704221248626709, + "learning_rate": 8.105369807497468e-06, + "loss": 1.2655, + "step": 800 + }, + { + "epoch": 0.20291323622545915, + "grad_norm": 4.049461841583252, + "learning_rate": 8.11550151975684e-06, + "loss": 1.0146, + "step": 801 + }, + { + "epoch": 0.20316656111462952, + "grad_norm": 4.67555046081543, + "learning_rate": 8.125633232016211e-06, + "loss": 1.1409, + "step": 802 + }, + { + "epoch": 0.20341988600379987, + "grad_norm": 4.73737907409668, + "learning_rate": 8.135764944275583e-06, + "loss": 1.3214, + "step": 803 + }, + { + "epoch": 0.20367321089297025, + "grad_norm": 4.849461078643799, + "learning_rate": 8.145896656534955e-06, + "loss": 1.3731, + "step": 804 + }, + { + "epoch": 0.2039265357821406, + "grad_norm": 4.94274377822876, + "learning_rate": 8.156028368794326e-06, + "loss": 1.2043, + "step": 805 + }, + { + "epoch": 0.20417986067131097, + "grad_norm": 4.574157238006592, + "learning_rate": 8.1661600810537e-06, + "loss": 1.2201, + "step": 806 + }, + { + "epoch": 0.20443318556048132, + "grad_norm": 4.684918403625488, + "learning_rate": 8.176291793313071e-06, + "loss": 1.1782, + "step": 807 + }, + { + "epoch": 0.20468651044965167, + "grad_norm": 5.050751209259033, + "learning_rate": 8.186423505572443e-06, + "loss": 1.1306, + "step": 808 + }, + { + "epoch": 0.20493983533882204, + "grad_norm": 4.734405517578125, + "learning_rate": 8.196555217831814e-06, + "loss": 1.1486, + "step": 809 + }, + { + "epoch": 0.2051931602279924, + "grad_norm": 4.575673580169678, + "learning_rate": 8.206686930091186e-06, + "loss": 1.3207, + "step": 810 + }, + { + "epoch": 0.20544648511716276, + "grad_norm": 4.46920108795166, + "learning_rate": 8.216818642350558e-06, + "loss": 1.1536, + "step": 811 + }, + { + "epoch": 0.2056998100063331, + "grad_norm": 4.92442512512207, + "learning_rate": 8.22695035460993e-06, + "loss": 1.2989, + "step": 812 + }, + { + "epoch": 0.2059531348955035, + "grad_norm": 4.4767069816589355, + "learning_rate": 8.237082066869303e-06, + "loss": 1.182, + "step": 813 + }, + { + "epoch": 0.20620645978467383, + "grad_norm": 4.701501369476318, + "learning_rate": 8.247213779128672e-06, + "loss": 1.2302, + "step": 814 + }, + { + "epoch": 0.2064597846738442, + "grad_norm": 4.852680206298828, + "learning_rate": 8.257345491388046e-06, + "loss": 1.3965, + "step": 815 + }, + { + "epoch": 0.20671310956301456, + "grad_norm": 4.312740802764893, + "learning_rate": 8.267477203647417e-06, + "loss": 1.1641, + "step": 816 + }, + { + "epoch": 0.20696643445218493, + "grad_norm": 4.566477298736572, + "learning_rate": 8.277608915906789e-06, + "loss": 1.2108, + "step": 817 + }, + { + "epoch": 0.20721975934135528, + "grad_norm": 4.805391311645508, + "learning_rate": 8.28774062816616e-06, + "loss": 1.2754, + "step": 818 + }, + { + "epoch": 0.20747308423052566, + "grad_norm": 4.131628513336182, + "learning_rate": 8.297872340425532e-06, + "loss": 1.0912, + "step": 819 + }, + { + "epoch": 0.207726409119696, + "grad_norm": 4.7957892417907715, + "learning_rate": 8.308004052684906e-06, + "loss": 1.1674, + "step": 820 + }, + { + "epoch": 0.20797973400886638, + "grad_norm": 4.751543998718262, + "learning_rate": 8.318135764944275e-06, + "loss": 1.3456, + "step": 821 + }, + { + "epoch": 0.20823305889803673, + "grad_norm": 4.739302635192871, + "learning_rate": 8.328267477203647e-06, + "loss": 1.1692, + "step": 822 + }, + { + "epoch": 0.2084863837872071, + "grad_norm": 5.0696916580200195, + "learning_rate": 8.33839918946302e-06, + "loss": 1.2978, + "step": 823 + }, + { + "epoch": 0.20873970867637745, + "grad_norm": 4.685317516326904, + "learning_rate": 8.348530901722392e-06, + "loss": 1.226, + "step": 824 + }, + { + "epoch": 0.20899303356554783, + "grad_norm": 4.865052700042725, + "learning_rate": 8.358662613981764e-06, + "loss": 1.338, + "step": 825 + }, + { + "epoch": 0.20924635845471817, + "grad_norm": 4.250523090362549, + "learning_rate": 8.368794326241135e-06, + "loss": 1.1768, + "step": 826 + }, + { + "epoch": 0.20949968334388855, + "grad_norm": 4.162044525146484, + "learning_rate": 8.378926038500509e-06, + "loss": 1.2061, + "step": 827 + }, + { + "epoch": 0.2097530082330589, + "grad_norm": 4.625030517578125, + "learning_rate": 8.389057750759878e-06, + "loss": 1.2754, + "step": 828 + }, + { + "epoch": 0.21000633312222927, + "grad_norm": 4.660970687866211, + "learning_rate": 8.39918946301925e-06, + "loss": 1.2398, + "step": 829 + }, + { + "epoch": 0.21025965801139962, + "grad_norm": 4.666497230529785, + "learning_rate": 8.409321175278623e-06, + "loss": 1.1091, + "step": 830 + }, + { + "epoch": 0.21051298290057, + "grad_norm": 4.57522439956665, + "learning_rate": 8.419452887537995e-06, + "loss": 1.3134, + "step": 831 + }, + { + "epoch": 0.21076630778974034, + "grad_norm": 5.008798122406006, + "learning_rate": 8.429584599797367e-06, + "loss": 1.2074, + "step": 832 + }, + { + "epoch": 0.2110196326789107, + "grad_norm": 4.1695780754089355, + "learning_rate": 8.439716312056738e-06, + "loss": 1.148, + "step": 833 + }, + { + "epoch": 0.21127295756808107, + "grad_norm": 4.676490306854248, + "learning_rate": 8.44984802431611e-06, + "loss": 1.0401, + "step": 834 + }, + { + "epoch": 0.2115262824572514, + "grad_norm": 4.756280422210693, + "learning_rate": 8.459979736575481e-06, + "loss": 1.238, + "step": 835 + }, + { + "epoch": 0.2117796073464218, + "grad_norm": 4.5334601402282715, + "learning_rate": 8.470111448834853e-06, + "loss": 1.1742, + "step": 836 + }, + { + "epoch": 0.21203293223559214, + "grad_norm": 4.883600234985352, + "learning_rate": 8.480243161094226e-06, + "loss": 1.3176, + "step": 837 + }, + { + "epoch": 0.2122862571247625, + "grad_norm": 4.711753845214844, + "learning_rate": 8.490374873353598e-06, + "loss": 1.2518, + "step": 838 + }, + { + "epoch": 0.21253958201393286, + "grad_norm": 4.728085041046143, + "learning_rate": 8.50050658561297e-06, + "loss": 1.28, + "step": 839 + }, + { + "epoch": 0.21279290690310323, + "grad_norm": 4.967433929443359, + "learning_rate": 8.510638297872341e-06, + "loss": 1.3654, + "step": 840 + }, + { + "epoch": 0.21304623179227358, + "grad_norm": 4.490946292877197, + "learning_rate": 8.520770010131713e-06, + "loss": 1.2963, + "step": 841 + }, + { + "epoch": 0.21329955668144396, + "grad_norm": 4.252407073974609, + "learning_rate": 8.530901722391084e-06, + "loss": 1.2775, + "step": 842 + }, + { + "epoch": 0.2135528815706143, + "grad_norm": 4.625506401062012, + "learning_rate": 8.541033434650456e-06, + "loss": 1.2156, + "step": 843 + }, + { + "epoch": 0.21380620645978468, + "grad_norm": 4.151983737945557, + "learning_rate": 8.55116514690983e-06, + "loss": 1.3086, + "step": 844 + }, + { + "epoch": 0.21405953134895503, + "grad_norm": 4.83617639541626, + "learning_rate": 8.561296859169201e-06, + "loss": 1.2696, + "step": 845 + }, + { + "epoch": 0.2143128562381254, + "grad_norm": 4.213065147399902, + "learning_rate": 8.571428571428571e-06, + "loss": 1.0948, + "step": 846 + }, + { + "epoch": 0.21456618112729575, + "grad_norm": 4.133890151977539, + "learning_rate": 8.581560283687944e-06, + "loss": 1.3205, + "step": 847 + }, + { + "epoch": 0.21481950601646613, + "grad_norm": 4.231499195098877, + "learning_rate": 8.591691995947316e-06, + "loss": 1.1487, + "step": 848 + }, + { + "epoch": 0.21507283090563648, + "grad_norm": 4.207747459411621, + "learning_rate": 8.601823708206687e-06, + "loss": 1.2424, + "step": 849 + }, + { + "epoch": 0.21532615579480685, + "grad_norm": 4.441788673400879, + "learning_rate": 8.611955420466059e-06, + "loss": 1.2109, + "step": 850 + }, + { + "epoch": 0.2155794806839772, + "grad_norm": 4.671090602874756, + "learning_rate": 8.622087132725432e-06, + "loss": 1.3559, + "step": 851 + }, + { + "epoch": 0.21583280557314757, + "grad_norm": 4.83182430267334, + "learning_rate": 8.632218844984804e-06, + "loss": 1.3977, + "step": 852 + }, + { + "epoch": 0.21608613046231792, + "grad_norm": 4.461249351501465, + "learning_rate": 8.642350557244174e-06, + "loss": 1.2408, + "step": 853 + }, + { + "epoch": 0.2163394553514883, + "grad_norm": 4.672199726104736, + "learning_rate": 8.652482269503547e-06, + "loss": 1.3422, + "step": 854 + }, + { + "epoch": 0.21659278024065864, + "grad_norm": 4.4290900230407715, + "learning_rate": 8.662613981762919e-06, + "loss": 1.2632, + "step": 855 + }, + { + "epoch": 0.216846105129829, + "grad_norm": 4.46585750579834, + "learning_rate": 8.67274569402229e-06, + "loss": 1.1929, + "step": 856 + }, + { + "epoch": 0.21709943001899937, + "grad_norm": 4.754853248596191, + "learning_rate": 8.682877406281662e-06, + "loss": 1.2999, + "step": 857 + }, + { + "epoch": 0.21735275490816972, + "grad_norm": 4.45685338973999, + "learning_rate": 8.693009118541034e-06, + "loss": 1.2949, + "step": 858 + }, + { + "epoch": 0.2176060797973401, + "grad_norm": 4.251350402832031, + "learning_rate": 8.703140830800407e-06, + "loss": 1.3525, + "step": 859 + }, + { + "epoch": 0.21785940468651044, + "grad_norm": 4.639081954956055, + "learning_rate": 8.713272543059777e-06, + "loss": 1.2987, + "step": 860 + }, + { + "epoch": 0.2181127295756808, + "grad_norm": 4.152873992919922, + "learning_rate": 8.72340425531915e-06, + "loss": 1.3355, + "step": 861 + }, + { + "epoch": 0.21836605446485116, + "grad_norm": 4.566308975219727, + "learning_rate": 8.733535967578522e-06, + "loss": 1.2251, + "step": 862 + }, + { + "epoch": 0.21861937935402154, + "grad_norm": 5.728938579559326, + "learning_rate": 8.743667679837893e-06, + "loss": 1.3422, + "step": 863 + }, + { + "epoch": 0.21887270424319188, + "grad_norm": 5.035421848297119, + "learning_rate": 8.753799392097265e-06, + "loss": 1.3664, + "step": 864 + }, + { + "epoch": 0.21912602913236226, + "grad_norm": 5.070148468017578, + "learning_rate": 8.763931104356637e-06, + "loss": 1.3327, + "step": 865 + }, + { + "epoch": 0.2193793540215326, + "grad_norm": 4.458358287811279, + "learning_rate": 8.774062816616008e-06, + "loss": 1.3214, + "step": 866 + }, + { + "epoch": 0.21963267891070298, + "grad_norm": 4.389994144439697, + "learning_rate": 8.78419452887538e-06, + "loss": 1.1337, + "step": 867 + }, + { + "epoch": 0.21988600379987333, + "grad_norm": 4.408799171447754, + "learning_rate": 8.794326241134753e-06, + "loss": 1.2597, + "step": 868 + }, + { + "epoch": 0.2201393286890437, + "grad_norm": 4.331786632537842, + "learning_rate": 8.804457953394125e-06, + "loss": 1.3029, + "step": 869 + }, + { + "epoch": 0.22039265357821405, + "grad_norm": 4.499011039733887, + "learning_rate": 8.814589665653496e-06, + "loss": 1.217, + "step": 870 + }, + { + "epoch": 0.22064597846738443, + "grad_norm": 4.391335964202881, + "learning_rate": 8.824721377912868e-06, + "loss": 1.2315, + "step": 871 + }, + { + "epoch": 0.22089930335655478, + "grad_norm": 4.000341892242432, + "learning_rate": 8.83485309017224e-06, + "loss": 1.1479, + "step": 872 + }, + { + "epoch": 0.22115262824572515, + "grad_norm": 4.703832626342773, + "learning_rate": 8.844984802431611e-06, + "loss": 1.3473, + "step": 873 + }, + { + "epoch": 0.2214059531348955, + "grad_norm": 4.657235145568848, + "learning_rate": 8.855116514690983e-06, + "loss": 1.2664, + "step": 874 + }, + { + "epoch": 0.22165927802406588, + "grad_norm": 4.632787227630615, + "learning_rate": 8.865248226950355e-06, + "loss": 1.3598, + "step": 875 + }, + { + "epoch": 0.22191260291323622, + "grad_norm": 5.230173587799072, + "learning_rate": 8.875379939209728e-06, + "loss": 1.3767, + "step": 876 + }, + { + "epoch": 0.2221659278024066, + "grad_norm": 4.797090530395508, + "learning_rate": 8.8855116514691e-06, + "loss": 1.3005, + "step": 877 + }, + { + "epoch": 0.22241925269157695, + "grad_norm": 4.6531596183776855, + "learning_rate": 8.895643363728471e-06, + "loss": 1.2636, + "step": 878 + }, + { + "epoch": 0.22267257758074732, + "grad_norm": 4.6279473304748535, + "learning_rate": 8.905775075987843e-06, + "loss": 1.2479, + "step": 879 + }, + { + "epoch": 0.22292590246991767, + "grad_norm": 4.576147556304932, + "learning_rate": 8.915906788247214e-06, + "loss": 1.2952, + "step": 880 + }, + { + "epoch": 0.22317922735908802, + "grad_norm": 4.335544586181641, + "learning_rate": 8.926038500506586e-06, + "loss": 1.1002, + "step": 881 + }, + { + "epoch": 0.2234325522482584, + "grad_norm": 4.895113945007324, + "learning_rate": 8.936170212765958e-06, + "loss": 1.2593, + "step": 882 + }, + { + "epoch": 0.22368587713742874, + "grad_norm": 4.440051555633545, + "learning_rate": 8.94630192502533e-06, + "loss": 1.1467, + "step": 883 + }, + { + "epoch": 0.22393920202659912, + "grad_norm": 4.605652809143066, + "learning_rate": 8.956433637284702e-06, + "loss": 1.2606, + "step": 884 + }, + { + "epoch": 0.22419252691576946, + "grad_norm": 4.574369430541992, + "learning_rate": 8.966565349544074e-06, + "loss": 1.2385, + "step": 885 + }, + { + "epoch": 0.22444585180493984, + "grad_norm": 4.45790433883667, + "learning_rate": 8.976697061803446e-06, + "loss": 1.1846, + "step": 886 + }, + { + "epoch": 0.2246991766941102, + "grad_norm": 4.781072616577148, + "learning_rate": 8.986828774062817e-06, + "loss": 1.2787, + "step": 887 + }, + { + "epoch": 0.22495250158328056, + "grad_norm": 4.68171501159668, + "learning_rate": 8.996960486322189e-06, + "loss": 1.1573, + "step": 888 + }, + { + "epoch": 0.2252058264724509, + "grad_norm": 4.497115612030029, + "learning_rate": 9.00709219858156e-06, + "loss": 1.3108, + "step": 889 + }, + { + "epoch": 0.22545915136162129, + "grad_norm": 4.450427055358887, + "learning_rate": 9.017223910840934e-06, + "loss": 1.2311, + "step": 890 + }, + { + "epoch": 0.22571247625079163, + "grad_norm": 4.6777424812316895, + "learning_rate": 9.027355623100304e-06, + "loss": 1.3122, + "step": 891 + }, + { + "epoch": 0.225965801139962, + "grad_norm": 4.58901309967041, + "learning_rate": 9.037487335359677e-06, + "loss": 1.329, + "step": 892 + }, + { + "epoch": 0.22621912602913236, + "grad_norm": 4.763391494750977, + "learning_rate": 9.047619047619049e-06, + "loss": 1.2884, + "step": 893 + }, + { + "epoch": 0.22647245091830273, + "grad_norm": 4.61236047744751, + "learning_rate": 9.05775075987842e-06, + "loss": 1.3443, + "step": 894 + }, + { + "epoch": 0.22672577580747308, + "grad_norm": 4.786077499389648, + "learning_rate": 9.067882472137792e-06, + "loss": 1.2916, + "step": 895 + }, + { + "epoch": 0.22697910069664345, + "grad_norm": 4.385064601898193, + "learning_rate": 9.078014184397164e-06, + "loss": 1.2207, + "step": 896 + }, + { + "epoch": 0.2272324255858138, + "grad_norm": 4.293412208557129, + "learning_rate": 9.088145896656537e-06, + "loss": 1.1634, + "step": 897 + }, + { + "epoch": 0.22748575047498418, + "grad_norm": 4.510735511779785, + "learning_rate": 9.098277608915907e-06, + "loss": 1.2406, + "step": 898 + }, + { + "epoch": 0.22773907536415453, + "grad_norm": 5.970086097717285, + "learning_rate": 9.108409321175278e-06, + "loss": 1.2865, + "step": 899 + }, + { + "epoch": 0.2279924002533249, + "grad_norm": 4.670897960662842, + "learning_rate": 9.118541033434652e-06, + "loss": 1.2052, + "step": 900 + }, + { + "epoch": 0.22824572514249525, + "grad_norm": 4.067436695098877, + "learning_rate": 9.128672745694023e-06, + "loss": 1.1642, + "step": 901 + }, + { + "epoch": 0.22849905003166562, + "grad_norm": 4.683802127838135, + "learning_rate": 9.138804457953395e-06, + "loss": 1.4001, + "step": 902 + }, + { + "epoch": 0.22875237492083597, + "grad_norm": 4.468769073486328, + "learning_rate": 9.148936170212767e-06, + "loss": 1.2653, + "step": 903 + }, + { + "epoch": 0.22900569981000635, + "grad_norm": 4.894930839538574, + "learning_rate": 9.15906788247214e-06, + "loss": 1.2637, + "step": 904 + }, + { + "epoch": 0.2292590246991767, + "grad_norm": 4.898458003997803, + "learning_rate": 9.16919959473151e-06, + "loss": 1.2617, + "step": 905 + }, + { + "epoch": 0.22951234958834704, + "grad_norm": 4.784833908081055, + "learning_rate": 9.179331306990881e-06, + "loss": 1.3382, + "step": 906 + }, + { + "epoch": 0.22976567447751742, + "grad_norm": 4.338533878326416, + "learning_rate": 9.189463019250255e-06, + "loss": 1.2544, + "step": 907 + }, + { + "epoch": 0.23001899936668777, + "grad_norm": 4.30348539352417, + "learning_rate": 9.199594731509626e-06, + "loss": 1.2249, + "step": 908 + }, + { + "epoch": 0.23027232425585814, + "grad_norm": 4.037816524505615, + "learning_rate": 9.209726443768998e-06, + "loss": 1.2093, + "step": 909 + }, + { + "epoch": 0.2305256491450285, + "grad_norm": 4.498706340789795, + "learning_rate": 9.21985815602837e-06, + "loss": 1.2782, + "step": 910 + }, + { + "epoch": 0.23077897403419886, + "grad_norm": 3.9231648445129395, + "learning_rate": 9.229989868287741e-06, + "loss": 1.1986, + "step": 911 + }, + { + "epoch": 0.2310322989233692, + "grad_norm": 4.32187032699585, + "learning_rate": 9.240121580547113e-06, + "loss": 1.3408, + "step": 912 + }, + { + "epoch": 0.2312856238125396, + "grad_norm": 4.445515155792236, + "learning_rate": 9.250253292806484e-06, + "loss": 1.2548, + "step": 913 + }, + { + "epoch": 0.23153894870170993, + "grad_norm": 4.231760501861572, + "learning_rate": 9.260385005065858e-06, + "loss": 1.1247, + "step": 914 + }, + { + "epoch": 0.2317922735908803, + "grad_norm": 4.526363372802734, + "learning_rate": 9.27051671732523e-06, + "loss": 1.1513, + "step": 915 + }, + { + "epoch": 0.23204559848005066, + "grad_norm": 4.280642986297607, + "learning_rate": 9.280648429584601e-06, + "loss": 1.2449, + "step": 916 + }, + { + "epoch": 0.23229892336922103, + "grad_norm": 4.070103645324707, + "learning_rate": 9.290780141843973e-06, + "loss": 1.1165, + "step": 917 + }, + { + "epoch": 0.23255224825839138, + "grad_norm": 4.806093692779541, + "learning_rate": 9.300911854103344e-06, + "loss": 1.2911, + "step": 918 + }, + { + "epoch": 0.23280557314756176, + "grad_norm": 4.156966686248779, + "learning_rate": 9.311043566362716e-06, + "loss": 1.1085, + "step": 919 + }, + { + "epoch": 0.2330588980367321, + "grad_norm": 4.51497220993042, + "learning_rate": 9.321175278622087e-06, + "loss": 1.2273, + "step": 920 + }, + { + "epoch": 0.23331222292590248, + "grad_norm": 4.607082843780518, + "learning_rate": 9.33130699088146e-06, + "loss": 1.3149, + "step": 921 + }, + { + "epoch": 0.23356554781507283, + "grad_norm": 4.478603363037109, + "learning_rate": 9.341438703140832e-06, + "loss": 1.2928, + "step": 922 + }, + { + "epoch": 0.2338188727042432, + "grad_norm": 4.361640930175781, + "learning_rate": 9.351570415400202e-06, + "loss": 1.1646, + "step": 923 + }, + { + "epoch": 0.23407219759341355, + "grad_norm": 4.692184925079346, + "learning_rate": 9.361702127659576e-06, + "loss": 1.2986, + "step": 924 + }, + { + "epoch": 0.23432552248258393, + "grad_norm": 4.741754531860352, + "learning_rate": 9.371833839918947e-06, + "loss": 1.2221, + "step": 925 + }, + { + "epoch": 0.23457884737175427, + "grad_norm": 4.560026168823242, + "learning_rate": 9.381965552178319e-06, + "loss": 1.2979, + "step": 926 + }, + { + "epoch": 0.23483217226092465, + "grad_norm": 4.630263328552246, + "learning_rate": 9.39209726443769e-06, + "loss": 1.2944, + "step": 927 + }, + { + "epoch": 0.235085497150095, + "grad_norm": 4.621401309967041, + "learning_rate": 9.402228976697062e-06, + "loss": 1.304, + "step": 928 + }, + { + "epoch": 0.23533882203926534, + "grad_norm": 4.51711368560791, + "learning_rate": 9.412360688956435e-06, + "loss": 1.3174, + "step": 929 + }, + { + "epoch": 0.23559214692843572, + "grad_norm": 4.709510326385498, + "learning_rate": 9.422492401215805e-06, + "loss": 1.2371, + "step": 930 + }, + { + "epoch": 0.23584547181760607, + "grad_norm": 4.616505146026611, + "learning_rate": 9.432624113475179e-06, + "loss": 1.4215, + "step": 931 + }, + { + "epoch": 0.23609879670677644, + "grad_norm": 4.576818943023682, + "learning_rate": 9.44275582573455e-06, + "loss": 1.3146, + "step": 932 + }, + { + "epoch": 0.2363521215959468, + "grad_norm": 4.430837154388428, + "learning_rate": 9.452887537993922e-06, + "loss": 1.2977, + "step": 933 + }, + { + "epoch": 0.23660544648511717, + "grad_norm": 4.0142388343811035, + "learning_rate": 9.463019250253293e-06, + "loss": 1.1209, + "step": 934 + }, + { + "epoch": 0.2368587713742875, + "grad_norm": 4.440330505371094, + "learning_rate": 9.473150962512665e-06, + "loss": 1.2769, + "step": 935 + }, + { + "epoch": 0.2371120962634579, + "grad_norm": 4.563925743103027, + "learning_rate": 9.483282674772038e-06, + "loss": 1.2471, + "step": 936 + }, + { + "epoch": 0.23736542115262824, + "grad_norm": 4.916733741760254, + "learning_rate": 9.493414387031408e-06, + "loss": 1.4746, + "step": 937 + }, + { + "epoch": 0.2376187460417986, + "grad_norm": 4.41516637802124, + "learning_rate": 9.503546099290782e-06, + "loss": 1.3259, + "step": 938 + }, + { + "epoch": 0.23787207093096896, + "grad_norm": 4.409024238586426, + "learning_rate": 9.513677811550153e-06, + "loss": 1.2328, + "step": 939 + }, + { + "epoch": 0.23812539582013934, + "grad_norm": 4.46191930770874, + "learning_rate": 9.523809523809525e-06, + "loss": 1.3066, + "step": 940 + }, + { + "epoch": 0.23837872070930968, + "grad_norm": 4.529879093170166, + "learning_rate": 9.533941236068896e-06, + "loss": 1.1784, + "step": 941 + }, + { + "epoch": 0.23863204559848006, + "grad_norm": 4.373073101043701, + "learning_rate": 9.544072948328268e-06, + "loss": 1.3092, + "step": 942 + }, + { + "epoch": 0.2388853704876504, + "grad_norm": 4.161291599273682, + "learning_rate": 9.55420466058764e-06, + "loss": 1.1198, + "step": 943 + }, + { + "epoch": 0.23913869537682078, + "grad_norm": 4.716823577880859, + "learning_rate": 9.564336372847011e-06, + "loss": 1.2307, + "step": 944 + }, + { + "epoch": 0.23939202026599113, + "grad_norm": 4.811398983001709, + "learning_rate": 9.574468085106385e-06, + "loss": 1.3575, + "step": 945 + }, + { + "epoch": 0.2396453451551615, + "grad_norm": 4.302731037139893, + "learning_rate": 9.584599797365756e-06, + "loss": 1.1594, + "step": 946 + }, + { + "epoch": 0.23989867004433185, + "grad_norm": 4.507314205169678, + "learning_rate": 9.594731509625128e-06, + "loss": 1.2235, + "step": 947 + }, + { + "epoch": 0.24015199493350223, + "grad_norm": 4.548384666442871, + "learning_rate": 9.6048632218845e-06, + "loss": 1.346, + "step": 948 + }, + { + "epoch": 0.24040531982267258, + "grad_norm": 4.291707992553711, + "learning_rate": 9.614994934143871e-06, + "loss": 1.1325, + "step": 949 + }, + { + "epoch": 0.24065864471184295, + "grad_norm": 4.426799297332764, + "learning_rate": 9.625126646403243e-06, + "loss": 1.1833, + "step": 950 + }, + { + "epoch": 0.2409119696010133, + "grad_norm": 4.290721416473389, + "learning_rate": 9.635258358662614e-06, + "loss": 1.2496, + "step": 951 + }, + { + "epoch": 0.24116529449018367, + "grad_norm": 4.555160045623779, + "learning_rate": 9.645390070921986e-06, + "loss": 1.2221, + "step": 952 + }, + { + "epoch": 0.24141861937935402, + "grad_norm": 3.9502954483032227, + "learning_rate": 9.655521783181359e-06, + "loss": 1.175, + "step": 953 + }, + { + "epoch": 0.24167194426852437, + "grad_norm": 4.670769691467285, + "learning_rate": 9.66565349544073e-06, + "loss": 1.3957, + "step": 954 + }, + { + "epoch": 0.24192526915769474, + "grad_norm": 4.343841075897217, + "learning_rate": 9.675785207700102e-06, + "loss": 1.2519, + "step": 955 + }, + { + "epoch": 0.2421785940468651, + "grad_norm": 4.143503189086914, + "learning_rate": 9.685916919959474e-06, + "loss": 1.2741, + "step": 956 + }, + { + "epoch": 0.24243191893603547, + "grad_norm": 4.043879985809326, + "learning_rate": 9.696048632218846e-06, + "loss": 1.1817, + "step": 957 + }, + { + "epoch": 0.24268524382520582, + "grad_norm": 3.883371114730835, + "learning_rate": 9.706180344478217e-06, + "loss": 1.1797, + "step": 958 + }, + { + "epoch": 0.2429385687143762, + "grad_norm": 3.936861515045166, + "learning_rate": 9.716312056737589e-06, + "loss": 1.1455, + "step": 959 + }, + { + "epoch": 0.24319189360354654, + "grad_norm": 4.490845680236816, + "learning_rate": 9.726443768996962e-06, + "loss": 1.2841, + "step": 960 + }, + { + "epoch": 0.2434452184927169, + "grad_norm": 4.519428253173828, + "learning_rate": 9.736575481256332e-06, + "loss": 1.2878, + "step": 961 + }, + { + "epoch": 0.24369854338188726, + "grad_norm": 4.357264518737793, + "learning_rate": 9.746707193515705e-06, + "loss": 1.1434, + "step": 962 + }, + { + "epoch": 0.24395186827105764, + "grad_norm": 4.587429523468018, + "learning_rate": 9.756838905775077e-06, + "loss": 1.2539, + "step": 963 + }, + { + "epoch": 0.24420519316022798, + "grad_norm": 4.37636661529541, + "learning_rate": 9.766970618034449e-06, + "loss": 1.1944, + "step": 964 + }, + { + "epoch": 0.24445851804939836, + "grad_norm": 4.168228626251221, + "learning_rate": 9.77710233029382e-06, + "loss": 1.178, + "step": 965 + }, + { + "epoch": 0.2447118429385687, + "grad_norm": 4.678414344787598, + "learning_rate": 9.787234042553192e-06, + "loss": 1.2634, + "step": 966 + }, + { + "epoch": 0.24496516782773908, + "grad_norm": 4.3709611892700195, + "learning_rate": 9.797365754812565e-06, + "loss": 1.2131, + "step": 967 + }, + { + "epoch": 0.24521849271690943, + "grad_norm": 4.429797172546387, + "learning_rate": 9.807497467071935e-06, + "loss": 1.2301, + "step": 968 + }, + { + "epoch": 0.2454718176060798, + "grad_norm": 4.7387003898620605, + "learning_rate": 9.817629179331308e-06, + "loss": 1.1721, + "step": 969 + }, + { + "epoch": 0.24572514249525015, + "grad_norm": 4.454428672790527, + "learning_rate": 9.82776089159068e-06, + "loss": 1.3677, + "step": 970 + }, + { + "epoch": 0.24597846738442053, + "grad_norm": 4.637801647186279, + "learning_rate": 9.837892603850052e-06, + "loss": 1.2987, + "step": 971 + }, + { + "epoch": 0.24623179227359088, + "grad_norm": 4.245021820068359, + "learning_rate": 9.848024316109423e-06, + "loss": 1.3343, + "step": 972 + }, + { + "epoch": 0.24648511716276125, + "grad_norm": 4.278740882873535, + "learning_rate": 9.858156028368795e-06, + "loss": 1.2818, + "step": 973 + }, + { + "epoch": 0.2467384420519316, + "grad_norm": 4.510958671569824, + "learning_rate": 9.868287740628168e-06, + "loss": 1.3503, + "step": 974 + }, + { + "epoch": 0.24699176694110198, + "grad_norm": 4.130159854888916, + "learning_rate": 9.878419452887538e-06, + "loss": 1.2979, + "step": 975 + }, + { + "epoch": 0.24724509183027232, + "grad_norm": 4.403942108154297, + "learning_rate": 9.88855116514691e-06, + "loss": 1.2455, + "step": 976 + }, + { + "epoch": 0.2474984167194427, + "grad_norm": 4.474011421203613, + "learning_rate": 9.898682877406283e-06, + "loss": 1.2902, + "step": 977 + }, + { + "epoch": 0.24775174160861305, + "grad_norm": 4.206770896911621, + "learning_rate": 9.908814589665655e-06, + "loss": 1.2073, + "step": 978 + }, + { + "epoch": 0.2480050664977834, + "grad_norm": 4.503035068511963, + "learning_rate": 9.918946301925026e-06, + "loss": 1.2272, + "step": 979 + }, + { + "epoch": 0.24825839138695377, + "grad_norm": 4.825187683105469, + "learning_rate": 9.929078014184398e-06, + "loss": 1.3413, + "step": 980 + }, + { + "epoch": 0.24851171627612412, + "grad_norm": 4.212365627288818, + "learning_rate": 9.939209726443771e-06, + "loss": 1.2217, + "step": 981 + }, + { + "epoch": 0.2487650411652945, + "grad_norm": 4.430416107177734, + "learning_rate": 9.949341438703141e-06, + "loss": 1.1744, + "step": 982 + }, + { + "epoch": 0.24901836605446484, + "grad_norm": 5.085466384887695, + "learning_rate": 9.959473150962513e-06, + "loss": 1.3391, + "step": 983 + }, + { + "epoch": 0.24927169094363522, + "grad_norm": 4.587413787841797, + "learning_rate": 9.969604863221886e-06, + "loss": 1.2309, + "step": 984 + }, + { + "epoch": 0.24952501583280556, + "grad_norm": 4.599008083343506, + "learning_rate": 9.979736575481258e-06, + "loss": 1.204, + "step": 985 + }, + { + "epoch": 0.24977834072197594, + "grad_norm": 4.6535162925720215, + "learning_rate": 9.98986828774063e-06, + "loss": 1.2601, + "step": 986 + }, + { + "epoch": 0.2500316656111463, + "grad_norm": 3.9977779388427734, + "learning_rate": 1e-05, + "loss": 1.2025, + "step": 987 + }, + { + "epoch": 0.25028499050031666, + "grad_norm": 4.556582450866699, + "learning_rate": 9.999999929801171e-06, + "loss": 1.3582, + "step": 988 + }, + { + "epoch": 0.250538315389487, + "grad_norm": 4.729936599731445, + "learning_rate": 9.99999971920469e-06, + "loss": 1.322, + "step": 989 + }, + { + "epoch": 0.25079164027865736, + "grad_norm": 4.16456413269043, + "learning_rate": 9.999999368210558e-06, + "loss": 1.1521, + "step": 990 + }, + { + "epoch": 0.25104496516782776, + "grad_norm": 4.371050834655762, + "learning_rate": 9.999998876818787e-06, + "loss": 1.3173, + "step": 991 + }, + { + "epoch": 0.2512982900569981, + "grad_norm": 4.363968372344971, + "learning_rate": 9.999998245029391e-06, + "loss": 1.3401, + "step": 992 + }, + { + "epoch": 0.25155161494616846, + "grad_norm": 4.4262847900390625, + "learning_rate": 9.999997472842388e-06, + "loss": 1.1315, + "step": 993 + }, + { + "epoch": 0.2518049398353388, + "grad_norm": 4.121704578399658, + "learning_rate": 9.999996560257801e-06, + "loss": 1.0382, + "step": 994 + }, + { + "epoch": 0.2520582647245092, + "grad_norm": 4.098482131958008, + "learning_rate": 9.999995507275652e-06, + "loss": 1.1252, + "step": 995 + }, + { + "epoch": 0.25231158961367955, + "grad_norm": 4.604094982147217, + "learning_rate": 9.999994313895973e-06, + "loss": 1.4268, + "step": 996 + }, + { + "epoch": 0.2525649145028499, + "grad_norm": 4.778846740722656, + "learning_rate": 9.999992980118795e-06, + "loss": 1.3845, + "step": 997 + }, + { + "epoch": 0.25281823939202025, + "grad_norm": 4.112329959869385, + "learning_rate": 9.999991505944161e-06, + "loss": 1.2382, + "step": 998 + }, + { + "epoch": 0.25307156428119065, + "grad_norm": 4.718144416809082, + "learning_rate": 9.999989891372107e-06, + "loss": 1.1994, + "step": 999 + }, + { + "epoch": 0.253324889170361, + "grad_norm": 4.772528648376465, + "learning_rate": 9.99998813640268e-06, + "loss": 1.3379, + "step": 1000 + }, + { + "epoch": 0.253324889170361, + "eval_loss": 1.277957558631897, + "eval_runtime": 11.8882, + "eval_samples_per_second": 33.647, + "eval_steps_per_second": 4.206, + "step": 1000 + }, + { + "epoch": 0.25357821405953135, + "grad_norm": 4.5006232261657715, + "learning_rate": 9.99998624103593e-06, + "loss": 1.2804, + "step": 1001 + }, + { + "epoch": 0.2538315389487017, + "grad_norm": 4.349647521972656, + "learning_rate": 9.999984205271911e-06, + "loss": 1.2432, + "step": 1002 + }, + { + "epoch": 0.25408486383787204, + "grad_norm": 4.238874435424805, + "learning_rate": 9.999982029110677e-06, + "loss": 1.2355, + "step": 1003 + }, + { + "epoch": 0.25433818872704245, + "grad_norm": 4.476635456085205, + "learning_rate": 9.999979712552293e-06, + "loss": 1.3165, + "step": 1004 + }, + { + "epoch": 0.2545915136162128, + "grad_norm": 4.258846282958984, + "learning_rate": 9.99997725559682e-06, + "loss": 1.3219, + "step": 1005 + }, + { + "epoch": 0.25484483850538314, + "grad_norm": 4.392333984375, + "learning_rate": 9.99997465824433e-06, + "loss": 1.2872, + "step": 1006 + }, + { + "epoch": 0.2550981633945535, + "grad_norm": 4.222839832305908, + "learning_rate": 9.999971920494895e-06, + "loss": 1.3018, + "step": 1007 + }, + { + "epoch": 0.2553514882837239, + "grad_norm": 4.751513481140137, + "learning_rate": 9.999969042348592e-06, + "loss": 1.2831, + "step": 1008 + }, + { + "epoch": 0.25560481317289424, + "grad_norm": 5.133706092834473, + "learning_rate": 9.999966023805501e-06, + "loss": 1.3577, + "step": 1009 + }, + { + "epoch": 0.2558581380620646, + "grad_norm": 4.333564281463623, + "learning_rate": 9.999962864865708e-06, + "loss": 1.1974, + "step": 1010 + }, + { + "epoch": 0.25611146295123494, + "grad_norm": 4.265956401824951, + "learning_rate": 9.9999595655293e-06, + "loss": 1.2562, + "step": 1011 + }, + { + "epoch": 0.25636478784040534, + "grad_norm": 4.358114242553711, + "learning_rate": 9.999956125796372e-06, + "loss": 1.2141, + "step": 1012 + }, + { + "epoch": 0.2566181127295757, + "grad_norm": 4.338444232940674, + "learning_rate": 9.999952545667018e-06, + "loss": 1.1853, + "step": 1013 + }, + { + "epoch": 0.25687143761874603, + "grad_norm": 4.273314476013184, + "learning_rate": 9.999948825141342e-06, + "loss": 1.1726, + "step": 1014 + }, + { + "epoch": 0.2571247625079164, + "grad_norm": 4.453716278076172, + "learning_rate": 9.999944964219447e-06, + "loss": 1.2582, + "step": 1015 + }, + { + "epoch": 0.2573780873970868, + "grad_norm": 4.264922142028809, + "learning_rate": 9.999940962901437e-06, + "loss": 1.2207, + "step": 1016 + }, + { + "epoch": 0.25763141228625713, + "grad_norm": 4.277278423309326, + "learning_rate": 9.99993682118743e-06, + "loss": 1.3064, + "step": 1017 + }, + { + "epoch": 0.2578847371754275, + "grad_norm": 4.1954731941223145, + "learning_rate": 9.999932539077541e-06, + "loss": 1.1567, + "step": 1018 + }, + { + "epoch": 0.25813806206459783, + "grad_norm": 4.1409149169921875, + "learning_rate": 9.999928116571888e-06, + "loss": 1.223, + "step": 1019 + }, + { + "epoch": 0.25839138695376823, + "grad_norm": 3.997588634490967, + "learning_rate": 9.9999235536706e-06, + "loss": 1.2553, + "step": 1020 + }, + { + "epoch": 0.2586447118429386, + "grad_norm": 4.307750701904297, + "learning_rate": 9.9999188503738e-06, + "loss": 1.2374, + "step": 1021 + }, + { + "epoch": 0.2588980367321089, + "grad_norm": 4.207553863525391, + "learning_rate": 9.999914006681622e-06, + "loss": 1.2823, + "step": 1022 + }, + { + "epoch": 0.2591513616212793, + "grad_norm": 4.0428690910339355, + "learning_rate": 9.999909022594201e-06, + "loss": 1.215, + "step": 1023 + }, + { + "epoch": 0.2594046865104497, + "grad_norm": 4.571065902709961, + "learning_rate": 9.999903898111679e-06, + "loss": 1.398, + "step": 1024 + }, + { + "epoch": 0.25965801139962, + "grad_norm": 5.071618556976318, + "learning_rate": 9.9998986332342e-06, + "loss": 1.2586, + "step": 1025 + }, + { + "epoch": 0.2599113362887904, + "grad_norm": 4.284810543060303, + "learning_rate": 9.999893227961909e-06, + "loss": 1.2099, + "step": 1026 + }, + { + "epoch": 0.2601646611779607, + "grad_norm": 4.17232608795166, + "learning_rate": 9.99988768229496e-06, + "loss": 1.2971, + "step": 1027 + }, + { + "epoch": 0.26041798606713107, + "grad_norm": 4.449314117431641, + "learning_rate": 9.999881996233508e-06, + "loss": 1.2091, + "step": 1028 + }, + { + "epoch": 0.26067131095630147, + "grad_norm": 4.640637397766113, + "learning_rate": 9.999876169777713e-06, + "loss": 1.2349, + "step": 1029 + }, + { + "epoch": 0.2609246358454718, + "grad_norm": 4.333415985107422, + "learning_rate": 9.999870202927739e-06, + "loss": 1.3162, + "step": 1030 + }, + { + "epoch": 0.26117796073464217, + "grad_norm": 4.521099090576172, + "learning_rate": 9.999864095683753e-06, + "loss": 1.3238, + "step": 1031 + }, + { + "epoch": 0.2614312856238125, + "grad_norm": 4.749149322509766, + "learning_rate": 9.999857848045927e-06, + "loss": 1.1897, + "step": 1032 + }, + { + "epoch": 0.2616846105129829, + "grad_norm": 4.04673433303833, + "learning_rate": 9.999851460014435e-06, + "loss": 1.2942, + "step": 1033 + }, + { + "epoch": 0.26193793540215327, + "grad_norm": 4.424718379974365, + "learning_rate": 9.999844931589457e-06, + "loss": 1.2762, + "step": 1034 + }, + { + "epoch": 0.2621912602913236, + "grad_norm": 4.327094554901123, + "learning_rate": 9.999838262771178e-06, + "loss": 1.2108, + "step": 1035 + }, + { + "epoch": 0.26244458518049396, + "grad_norm": 4.391043186187744, + "learning_rate": 9.999831453559782e-06, + "loss": 1.2186, + "step": 1036 + }, + { + "epoch": 0.26269791006966436, + "grad_norm": 4.974733829498291, + "learning_rate": 9.999824503955465e-06, + "loss": 1.3035, + "step": 1037 + }, + { + "epoch": 0.2629512349588347, + "grad_norm": 4.520740509033203, + "learning_rate": 9.999817413958415e-06, + "loss": 1.2613, + "step": 1038 + }, + { + "epoch": 0.26320455984800506, + "grad_norm": 3.9201395511627197, + "learning_rate": 9.999810183568839e-06, + "loss": 1.2798, + "step": 1039 + }, + { + "epoch": 0.2634578847371754, + "grad_norm": 4.590415954589844, + "learning_rate": 9.999802812786933e-06, + "loss": 1.3429, + "step": 1040 + }, + { + "epoch": 0.2637112096263458, + "grad_norm": 4.340219497680664, + "learning_rate": 9.999795301612912e-06, + "loss": 1.2433, + "step": 1041 + }, + { + "epoch": 0.26396453451551616, + "grad_norm": 4.225920677185059, + "learning_rate": 9.99978765004698e-06, + "loss": 1.2547, + "step": 1042 + }, + { + "epoch": 0.2642178594046865, + "grad_norm": 4.53737735748291, + "learning_rate": 9.999779858089353e-06, + "loss": 1.2362, + "step": 1043 + }, + { + "epoch": 0.26447118429385685, + "grad_norm": 4.659938812255859, + "learning_rate": 9.999771925740251e-06, + "loss": 1.3427, + "step": 1044 + }, + { + "epoch": 0.26472450918302726, + "grad_norm": 4.459654331207275, + "learning_rate": 9.999763852999897e-06, + "loss": 1.309, + "step": 1045 + }, + { + "epoch": 0.2649778340721976, + "grad_norm": 4.582836151123047, + "learning_rate": 9.999755639868518e-06, + "loss": 1.2851, + "step": 1046 + }, + { + "epoch": 0.26523115896136795, + "grad_norm": 4.712745666503906, + "learning_rate": 9.999747286346342e-06, + "loss": 1.3503, + "step": 1047 + }, + { + "epoch": 0.2654844838505383, + "grad_norm": 4.448166847229004, + "learning_rate": 9.999738792433609e-06, + "loss": 1.398, + "step": 1048 + }, + { + "epoch": 0.2657378087397087, + "grad_norm": 4.9156975746154785, + "learning_rate": 9.99973015813055e-06, + "loss": 1.4148, + "step": 1049 + }, + { + "epoch": 0.26599113362887905, + "grad_norm": 4.917559623718262, + "learning_rate": 9.999721383437413e-06, + "loss": 1.3996, + "step": 1050 + }, + { + "epoch": 0.2662444585180494, + "grad_norm": 4.078757286071777, + "learning_rate": 9.999712468354444e-06, + "loss": 1.2654, + "step": 1051 + }, + { + "epoch": 0.26649778340721975, + "grad_norm": 4.17608642578125, + "learning_rate": 9.999703412881892e-06, + "loss": 1.2419, + "step": 1052 + }, + { + "epoch": 0.2667511082963901, + "grad_norm": 4.432343482971191, + "learning_rate": 9.99969421702001e-06, + "loss": 1.1326, + "step": 1053 + }, + { + "epoch": 0.2670044331855605, + "grad_norm": 4.077740669250488, + "learning_rate": 9.999684880769058e-06, + "loss": 1.0532, + "step": 1054 + }, + { + "epoch": 0.26725775807473084, + "grad_norm": 4.219796180725098, + "learning_rate": 9.999675404129299e-06, + "loss": 1.2136, + "step": 1055 + }, + { + "epoch": 0.2675110829639012, + "grad_norm": 4.376031875610352, + "learning_rate": 9.999665787100997e-06, + "loss": 1.2242, + "step": 1056 + }, + { + "epoch": 0.26776440785307154, + "grad_norm": 4.411856651306152, + "learning_rate": 9.999656029684422e-06, + "loss": 1.2607, + "step": 1057 + }, + { + "epoch": 0.26801773274224194, + "grad_norm": 4.215365886688232, + "learning_rate": 9.99964613187985e-06, + "loss": 1.2405, + "step": 1058 + }, + { + "epoch": 0.2682710576314123, + "grad_norm": 4.299322605133057, + "learning_rate": 9.999636093687556e-06, + "loss": 1.2972, + "step": 1059 + }, + { + "epoch": 0.26852438252058264, + "grad_norm": 4.521589756011963, + "learning_rate": 9.999625915107826e-06, + "loss": 1.3306, + "step": 1060 + }, + { + "epoch": 0.268777707409753, + "grad_norm": 4.32185697555542, + "learning_rate": 9.999615596140944e-06, + "loss": 1.2621, + "step": 1061 + }, + { + "epoch": 0.2690310322989234, + "grad_norm": 4.3838701248168945, + "learning_rate": 9.999605136787197e-06, + "loss": 1.2831, + "step": 1062 + }, + { + "epoch": 0.26928435718809374, + "grad_norm": 4.577671051025391, + "learning_rate": 9.999594537046882e-06, + "loss": 1.2296, + "step": 1063 + }, + { + "epoch": 0.2695376820772641, + "grad_norm": 4.250572681427002, + "learning_rate": 9.999583796920296e-06, + "loss": 1.2728, + "step": 1064 + }, + { + "epoch": 0.26979100696643443, + "grad_norm": 4.592348575592041, + "learning_rate": 9.999572916407741e-06, + "loss": 1.2855, + "step": 1065 + }, + { + "epoch": 0.27004433185560484, + "grad_norm": 4.446188449859619, + "learning_rate": 9.99956189550952e-06, + "loss": 1.2105, + "step": 1066 + }, + { + "epoch": 0.2702976567447752, + "grad_norm": 4.710268497467041, + "learning_rate": 9.999550734225945e-06, + "loss": 1.4268, + "step": 1067 + }, + { + "epoch": 0.27055098163394553, + "grad_norm": 4.610569953918457, + "learning_rate": 9.999539432557327e-06, + "loss": 1.2433, + "step": 1068 + }, + { + "epoch": 0.2708043065231159, + "grad_norm": 4.46692419052124, + "learning_rate": 9.999527990503986e-06, + "loss": 1.2776, + "step": 1069 + }, + { + "epoch": 0.2710576314122863, + "grad_norm": 4.537080764770508, + "learning_rate": 9.999516408066244e-06, + "loss": 1.3355, + "step": 1070 + }, + { + "epoch": 0.27131095630145663, + "grad_norm": 3.7139177322387695, + "learning_rate": 9.99950468524442e-06, + "loss": 1.1919, + "step": 1071 + }, + { + "epoch": 0.271564281190627, + "grad_norm": 4.5693583488464355, + "learning_rate": 9.999492822038853e-06, + "loss": 1.316, + "step": 1072 + }, + { + "epoch": 0.2718176060797973, + "grad_norm": 4.537054538726807, + "learning_rate": 9.999480818449868e-06, + "loss": 1.183, + "step": 1073 + }, + { + "epoch": 0.27207093096896773, + "grad_norm": 4.270650863647461, + "learning_rate": 9.999468674477803e-06, + "loss": 1.3077, + "step": 1074 + }, + { + "epoch": 0.2723242558581381, + "grad_norm": 3.8746337890625, + "learning_rate": 9.999456390123004e-06, + "loss": 1.1064, + "step": 1075 + }, + { + "epoch": 0.2725775807473084, + "grad_norm": 4.1109466552734375, + "learning_rate": 9.99944396538581e-06, + "loss": 1.1139, + "step": 1076 + }, + { + "epoch": 0.27283090563647877, + "grad_norm": 4.652540683746338, + "learning_rate": 9.999431400266574e-06, + "loss": 1.3425, + "step": 1077 + }, + { + "epoch": 0.2730842305256491, + "grad_norm": 4.178191184997559, + "learning_rate": 9.999418694765648e-06, + "loss": 1.3607, + "step": 1078 + }, + { + "epoch": 0.2733375554148195, + "grad_norm": 4.317570686340332, + "learning_rate": 9.999405848883387e-06, + "loss": 1.176, + "step": 1079 + }, + { + "epoch": 0.27359088030398987, + "grad_norm": 4.367639064788818, + "learning_rate": 9.999392862620154e-06, + "loss": 1.2317, + "step": 1080 + }, + { + "epoch": 0.2738442051931602, + "grad_norm": 4.147261142730713, + "learning_rate": 9.999379735976312e-06, + "loss": 1.2692, + "step": 1081 + }, + { + "epoch": 0.27409753008233056, + "grad_norm": 4.390732765197754, + "learning_rate": 9.999366468952229e-06, + "loss": 1.2009, + "step": 1082 + }, + { + "epoch": 0.27435085497150097, + "grad_norm": 4.224822521209717, + "learning_rate": 9.99935306154828e-06, + "loss": 1.2473, + "step": 1083 + }, + { + "epoch": 0.2746041798606713, + "grad_norm": 4.581904411315918, + "learning_rate": 9.999339513764838e-06, + "loss": 1.2304, + "step": 1084 + }, + { + "epoch": 0.27485750474984166, + "grad_norm": 4.850911617279053, + "learning_rate": 9.999325825602288e-06, + "loss": 1.4055, + "step": 1085 + }, + { + "epoch": 0.275110829639012, + "grad_norm": 4.441746234893799, + "learning_rate": 9.999311997061011e-06, + "loss": 1.2762, + "step": 1086 + }, + { + "epoch": 0.2753641545281824, + "grad_norm": 4.414993762969971, + "learning_rate": 9.999298028141396e-06, + "loss": 1.3163, + "step": 1087 + }, + { + "epoch": 0.27561747941735276, + "grad_norm": 4.703424453735352, + "learning_rate": 9.999283918843836e-06, + "loss": 1.2733, + "step": 1088 + }, + { + "epoch": 0.2758708043065231, + "grad_norm": 4.2780537605285645, + "learning_rate": 9.999269669168727e-06, + "loss": 1.2719, + "step": 1089 + }, + { + "epoch": 0.27612412919569346, + "grad_norm": 4.284636974334717, + "learning_rate": 9.999255279116468e-06, + "loss": 1.2018, + "step": 1090 + }, + { + "epoch": 0.27637745408486386, + "grad_norm": 3.985605478286743, + "learning_rate": 9.999240748687464e-06, + "loss": 1.2875, + "step": 1091 + }, + { + "epoch": 0.2766307789740342, + "grad_norm": 4.404812812805176, + "learning_rate": 9.999226077882123e-06, + "loss": 1.133, + "step": 1092 + }, + { + "epoch": 0.27688410386320456, + "grad_norm": 4.2787251472473145, + "learning_rate": 9.999211266700855e-06, + "loss": 1.2876, + "step": 1093 + }, + { + "epoch": 0.2771374287523749, + "grad_norm": 4.34266996383667, + "learning_rate": 9.99919631514408e-06, + "loss": 1.206, + "step": 1094 + }, + { + "epoch": 0.2773907536415453, + "grad_norm": 4.1548542976379395, + "learning_rate": 9.999181223212215e-06, + "loss": 1.2429, + "step": 1095 + }, + { + "epoch": 0.27764407853071565, + "grad_norm": 4.232419490814209, + "learning_rate": 9.999165990905684e-06, + "loss": 1.2596, + "step": 1096 + }, + { + "epoch": 0.277897403419886, + "grad_norm": 4.342401504516602, + "learning_rate": 9.999150618224914e-06, + "loss": 1.1759, + "step": 1097 + }, + { + "epoch": 0.27815072830905635, + "grad_norm": 4.437196254730225, + "learning_rate": 9.99913510517034e-06, + "loss": 1.2726, + "step": 1098 + }, + { + "epoch": 0.27840405319822675, + "grad_norm": 4.315725326538086, + "learning_rate": 9.999119451742392e-06, + "loss": 1.3566, + "step": 1099 + }, + { + "epoch": 0.2786573780873971, + "grad_norm": 4.410442352294922, + "learning_rate": 9.999103657941514e-06, + "loss": 1.3163, + "step": 1100 + }, + { + "epoch": 0.27891070297656745, + "grad_norm": 4.655491352081299, + "learning_rate": 9.999087723768149e-06, + "loss": 1.4307, + "step": 1101 + }, + { + "epoch": 0.2791640278657378, + "grad_norm": 3.777026891708374, + "learning_rate": 9.999071649222744e-06, + "loss": 1.1103, + "step": 1102 + }, + { + "epoch": 0.27941735275490814, + "grad_norm": 4.128303527832031, + "learning_rate": 9.99905543430575e-06, + "loss": 1.2228, + "step": 1103 + }, + { + "epoch": 0.27967067764407855, + "grad_norm": 4.326513767242432, + "learning_rate": 9.99903907901762e-06, + "loss": 1.2697, + "step": 1104 + }, + { + "epoch": 0.2799240025332489, + "grad_norm": 4.27681303024292, + "learning_rate": 9.999022583358816e-06, + "loss": 1.2722, + "step": 1105 + }, + { + "epoch": 0.28017732742241924, + "grad_norm": 4.046457290649414, + "learning_rate": 9.999005947329804e-06, + "loss": 1.1574, + "step": 1106 + }, + { + "epoch": 0.2804306523115896, + "grad_norm": 4.085561275482178, + "learning_rate": 9.998989170931046e-06, + "loss": 1.1658, + "step": 1107 + }, + { + "epoch": 0.28068397720076, + "grad_norm": 4.437790393829346, + "learning_rate": 9.998972254163016e-06, + "loss": 1.2576, + "step": 1108 + }, + { + "epoch": 0.28093730208993034, + "grad_norm": 4.35031795501709, + "learning_rate": 9.998955197026186e-06, + "loss": 1.1881, + "step": 1109 + }, + { + "epoch": 0.2811906269791007, + "grad_norm": 4.078453063964844, + "learning_rate": 9.99893799952104e-06, + "loss": 1.2298, + "step": 1110 + }, + { + "epoch": 0.28144395186827104, + "grad_norm": 4.690131664276123, + "learning_rate": 9.998920661648053e-06, + "loss": 1.3197, + "step": 1111 + }, + { + "epoch": 0.28169727675744144, + "grad_norm": 4.241751670837402, + "learning_rate": 9.99890318340772e-06, + "loss": 1.3322, + "step": 1112 + }, + { + "epoch": 0.2819506016466118, + "grad_norm": 4.170272350311279, + "learning_rate": 9.998885564800528e-06, + "loss": 1.2634, + "step": 1113 + }, + { + "epoch": 0.28220392653578213, + "grad_norm": 4.748134613037109, + "learning_rate": 9.998867805826973e-06, + "loss": 1.2743, + "step": 1114 + }, + { + "epoch": 0.2824572514249525, + "grad_norm": 4.64783239364624, + "learning_rate": 9.998849906487552e-06, + "loss": 1.3666, + "step": 1115 + }, + { + "epoch": 0.2827105763141229, + "grad_norm": 3.911585807800293, + "learning_rate": 9.998831866782769e-06, + "loss": 1.0517, + "step": 1116 + }, + { + "epoch": 0.28296390120329323, + "grad_norm": 4.002292633056641, + "learning_rate": 9.99881368671313e-06, + "loss": 1.2947, + "step": 1117 + }, + { + "epoch": 0.2832172260924636, + "grad_norm": 4.09478759765625, + "learning_rate": 9.998795366279145e-06, + "loss": 1.2217, + "step": 1118 + }, + { + "epoch": 0.28347055098163393, + "grad_norm": 4.193706035614014, + "learning_rate": 9.998776905481328e-06, + "loss": 1.2352, + "step": 1119 + }, + { + "epoch": 0.28372387587080433, + "grad_norm": 4.266157627105713, + "learning_rate": 9.9987583043202e-06, + "loss": 1.2766, + "step": 1120 + }, + { + "epoch": 0.2839772007599747, + "grad_norm": 4.135251522064209, + "learning_rate": 9.998739562796281e-06, + "loss": 1.3218, + "step": 1121 + }, + { + "epoch": 0.284230525649145, + "grad_norm": 3.903446674346924, + "learning_rate": 9.998720680910097e-06, + "loss": 1.2192, + "step": 1122 + }, + { + "epoch": 0.2844838505383154, + "grad_norm": 4.4483256340026855, + "learning_rate": 9.99870165866218e-06, + "loss": 1.1485, + "step": 1123 + }, + { + "epoch": 0.2847371754274858, + "grad_norm": 4.186819076538086, + "learning_rate": 9.998682496053065e-06, + "loss": 1.1963, + "step": 1124 + }, + { + "epoch": 0.2849905003166561, + "grad_norm": 4.26090145111084, + "learning_rate": 9.998663193083285e-06, + "loss": 1.2054, + "step": 1125 + }, + { + "epoch": 0.2852438252058265, + "grad_norm": 4.314788341522217, + "learning_rate": 9.998643749753387e-06, + "loss": 1.2775, + "step": 1126 + }, + { + "epoch": 0.2854971500949968, + "grad_norm": 4.19307804107666, + "learning_rate": 9.998624166063915e-06, + "loss": 1.2285, + "step": 1127 + }, + { + "epoch": 0.28575047498416717, + "grad_norm": 4.191956520080566, + "learning_rate": 9.99860444201542e-06, + "loss": 1.189, + "step": 1128 + }, + { + "epoch": 0.28600379987333757, + "grad_norm": 4.338077068328857, + "learning_rate": 9.998584577608454e-06, + "loss": 1.313, + "step": 1129 + }, + { + "epoch": 0.2862571247625079, + "grad_norm": 4.931329727172852, + "learning_rate": 9.998564572843577e-06, + "loss": 1.3063, + "step": 1130 + }, + { + "epoch": 0.28651044965167827, + "grad_norm": 4.653294086456299, + "learning_rate": 9.998544427721348e-06, + "loss": 1.358, + "step": 1131 + }, + { + "epoch": 0.2867637745408486, + "grad_norm": 4.46495246887207, + "learning_rate": 9.998524142242334e-06, + "loss": 1.2361, + "step": 1132 + }, + { + "epoch": 0.287017099430019, + "grad_norm": 4.529207229614258, + "learning_rate": 9.998503716407105e-06, + "loss": 1.2673, + "step": 1133 + }, + { + "epoch": 0.28727042431918937, + "grad_norm": 3.916558265686035, + "learning_rate": 9.998483150216234e-06, + "loss": 1.2153, + "step": 1134 + }, + { + "epoch": 0.2875237492083597, + "grad_norm": 4.207367897033691, + "learning_rate": 9.9984624436703e-06, + "loss": 1.1631, + "step": 1135 + }, + { + "epoch": 0.28777707409753006, + "grad_norm": 4.070796489715576, + "learning_rate": 9.998441596769883e-06, + "loss": 1.2328, + "step": 1136 + }, + { + "epoch": 0.28803039898670046, + "grad_norm": 4.211414337158203, + "learning_rate": 9.998420609515568e-06, + "loss": 1.1903, + "step": 1137 + }, + { + "epoch": 0.2882837238758708, + "grad_norm": 4.0857086181640625, + "learning_rate": 9.998399481907945e-06, + "loss": 1.1356, + "step": 1138 + }, + { + "epoch": 0.28853704876504116, + "grad_norm": 4.152622222900391, + "learning_rate": 9.998378213947607e-06, + "loss": 1.2288, + "step": 1139 + }, + { + "epoch": 0.2887903736542115, + "grad_norm": 4.304581642150879, + "learning_rate": 9.998356805635154e-06, + "loss": 1.2543, + "step": 1140 + }, + { + "epoch": 0.2890436985433819, + "grad_norm": 4.069146156311035, + "learning_rate": 9.998335256971183e-06, + "loss": 1.1565, + "step": 1141 + }, + { + "epoch": 0.28929702343255226, + "grad_norm": 4.224587440490723, + "learning_rate": 9.998313567956299e-06, + "loss": 1.2069, + "step": 1142 + }, + { + "epoch": 0.2895503483217226, + "grad_norm": 3.9599013328552246, + "learning_rate": 9.998291738591115e-06, + "loss": 1.1918, + "step": 1143 + }, + { + "epoch": 0.28980367321089295, + "grad_norm": 4.346947193145752, + "learning_rate": 9.998269768876239e-06, + "loss": 1.2541, + "step": 1144 + }, + { + "epoch": 0.29005699810006336, + "grad_norm": 4.332737445831299, + "learning_rate": 9.998247658812293e-06, + "loss": 1.2649, + "step": 1145 + }, + { + "epoch": 0.2903103229892337, + "grad_norm": 4.032033443450928, + "learning_rate": 9.998225408399894e-06, + "loss": 1.2889, + "step": 1146 + }, + { + "epoch": 0.29056364787840405, + "grad_norm": 4.593135356903076, + "learning_rate": 9.998203017639668e-06, + "loss": 1.3193, + "step": 1147 + }, + { + "epoch": 0.2908169727675744, + "grad_norm": 4.185672760009766, + "learning_rate": 9.998180486532242e-06, + "loss": 1.399, + "step": 1148 + }, + { + "epoch": 0.29107029765674475, + "grad_norm": 4.378150939941406, + "learning_rate": 9.998157815078253e-06, + "loss": 1.2216, + "step": 1149 + }, + { + "epoch": 0.29132362254591515, + "grad_norm": 3.8727684020996094, + "learning_rate": 9.998135003278335e-06, + "loss": 1.3562, + "step": 1150 + }, + { + "epoch": 0.2915769474350855, + "grad_norm": 4.286794662475586, + "learning_rate": 9.998112051133127e-06, + "loss": 1.2944, + "step": 1151 + }, + { + "epoch": 0.29183027232425585, + "grad_norm": 4.495805263519287, + "learning_rate": 9.998088958643277e-06, + "loss": 1.2803, + "step": 1152 + }, + { + "epoch": 0.2920835972134262, + "grad_norm": 4.211381435394287, + "learning_rate": 9.99806572580943e-06, + "loss": 1.1863, + "step": 1153 + }, + { + "epoch": 0.2923369221025966, + "grad_norm": 4.094125747680664, + "learning_rate": 9.998042352632242e-06, + "loss": 1.3885, + "step": 1154 + }, + { + "epoch": 0.29259024699176694, + "grad_norm": 4.094359874725342, + "learning_rate": 9.998018839112365e-06, + "loss": 1.2796, + "step": 1155 + }, + { + "epoch": 0.2928435718809373, + "grad_norm": 3.947622060775757, + "learning_rate": 9.997995185250462e-06, + "loss": 1.2438, + "step": 1156 + }, + { + "epoch": 0.29309689677010764, + "grad_norm": 4.144353866577148, + "learning_rate": 9.997971391047197e-06, + "loss": 1.1298, + "step": 1157 + }, + { + "epoch": 0.29335022165927804, + "grad_norm": 3.9895102977752686, + "learning_rate": 9.997947456503238e-06, + "loss": 1.1884, + "step": 1158 + }, + { + "epoch": 0.2936035465484484, + "grad_norm": 4.259244918823242, + "learning_rate": 9.997923381619257e-06, + "loss": 1.2304, + "step": 1159 + }, + { + "epoch": 0.29385687143761874, + "grad_norm": 4.923684120178223, + "learning_rate": 9.99789916639593e-06, + "loss": 1.5598, + "step": 1160 + }, + { + "epoch": 0.2941101963267891, + "grad_norm": 4.479145526885986, + "learning_rate": 9.997874810833936e-06, + "loss": 1.2207, + "step": 1161 + }, + { + "epoch": 0.2943635212159595, + "grad_norm": 5.040842533111572, + "learning_rate": 9.99785031493396e-06, + "loss": 1.2571, + "step": 1162 + }, + { + "epoch": 0.29461684610512984, + "grad_norm": 3.860297441482544, + "learning_rate": 9.997825678696688e-06, + "loss": 1.2313, + "step": 1163 + }, + { + "epoch": 0.2948701709943002, + "grad_norm": 4.079608917236328, + "learning_rate": 9.997800902122816e-06, + "loss": 1.1536, + "step": 1164 + }, + { + "epoch": 0.29512349588347053, + "grad_norm": 3.9218783378601074, + "learning_rate": 9.997775985213035e-06, + "loss": 1.1827, + "step": 1165 + }, + { + "epoch": 0.29537682077264094, + "grad_norm": 3.88308048248291, + "learning_rate": 9.99775092796805e-06, + "loss": 1.2224, + "step": 1166 + }, + { + "epoch": 0.2956301456618113, + "grad_norm": 3.903902053833008, + "learning_rate": 9.997725730388556e-06, + "loss": 1.133, + "step": 1167 + }, + { + "epoch": 0.29588347055098163, + "grad_norm": 4.177271366119385, + "learning_rate": 9.99770039247527e-06, + "loss": 1.1429, + "step": 1168 + }, + { + "epoch": 0.296136795440152, + "grad_norm": 3.8579490184783936, + "learning_rate": 9.997674914228896e-06, + "loss": 1.1676, + "step": 1169 + }, + { + "epoch": 0.2963901203293224, + "grad_norm": 4.455913543701172, + "learning_rate": 9.997649295650157e-06, + "loss": 1.2915, + "step": 1170 + }, + { + "epoch": 0.29664344521849273, + "grad_norm": 4.432326793670654, + "learning_rate": 9.997623536739764e-06, + "loss": 1.2021, + "step": 1171 + }, + { + "epoch": 0.2968967701076631, + "grad_norm": 4.448549747467041, + "learning_rate": 9.997597637498445e-06, + "loss": 1.2106, + "step": 1172 + }, + { + "epoch": 0.2971500949968334, + "grad_norm": 4.243499755859375, + "learning_rate": 9.99757159792693e-06, + "loss": 1.2227, + "step": 1173 + }, + { + "epoch": 0.2974034198860038, + "grad_norm": 4.29484748840332, + "learning_rate": 9.997545418025942e-06, + "loss": 1.3158, + "step": 1174 + }, + { + "epoch": 0.2976567447751742, + "grad_norm": 4.385229110717773, + "learning_rate": 9.997519097796224e-06, + "loss": 1.1911, + "step": 1175 + }, + { + "epoch": 0.2979100696643445, + "grad_norm": 4.231202125549316, + "learning_rate": 9.997492637238512e-06, + "loss": 1.114, + "step": 1176 + }, + { + "epoch": 0.29816339455351487, + "grad_norm": 4.25446081161499, + "learning_rate": 9.997466036353549e-06, + "loss": 1.2236, + "step": 1177 + }, + { + "epoch": 0.2984167194426852, + "grad_norm": 4.097778797149658, + "learning_rate": 9.99743929514208e-06, + "loss": 1.2686, + "step": 1178 + }, + { + "epoch": 0.2986700443318556, + "grad_norm": 4.999679088592529, + "learning_rate": 9.99741241360486e-06, + "loss": 1.189, + "step": 1179 + }, + { + "epoch": 0.29892336922102597, + "grad_norm": 3.9533159732818604, + "learning_rate": 9.99738539174264e-06, + "loss": 1.2186, + "step": 1180 + }, + { + "epoch": 0.2991766941101963, + "grad_norm": 4.128479957580566, + "learning_rate": 9.99735822955618e-06, + "loss": 1.2896, + "step": 1181 + }, + { + "epoch": 0.29943001899936666, + "grad_norm": 4.410684585571289, + "learning_rate": 9.997330927046243e-06, + "loss": 1.3052, + "step": 1182 + }, + { + "epoch": 0.29968334388853707, + "grad_norm": 4.5088372230529785, + "learning_rate": 9.997303484213597e-06, + "loss": 1.2223, + "step": 1183 + }, + { + "epoch": 0.2999366687777074, + "grad_norm": 4.644050121307373, + "learning_rate": 9.99727590105901e-06, + "loss": 1.3765, + "step": 1184 + }, + { + "epoch": 0.30018999366687776, + "grad_norm": 4.378511905670166, + "learning_rate": 9.99724817758326e-06, + "loss": 1.33, + "step": 1185 + }, + { + "epoch": 0.3004433185560481, + "grad_norm": 4.221255779266357, + "learning_rate": 9.997220313787122e-06, + "loss": 1.1759, + "step": 1186 + }, + { + "epoch": 0.3006966434452185, + "grad_norm": 4.051570415496826, + "learning_rate": 9.99719230967138e-06, + "loss": 1.2226, + "step": 1187 + }, + { + "epoch": 0.30094996833438886, + "grad_norm": 4.175117492675781, + "learning_rate": 9.997164165236819e-06, + "loss": 1.1844, + "step": 1188 + }, + { + "epoch": 0.3012032932235592, + "grad_norm": 4.066131591796875, + "learning_rate": 9.997135880484232e-06, + "loss": 1.2764, + "step": 1189 + }, + { + "epoch": 0.30145661811272956, + "grad_norm": 4.347494602203369, + "learning_rate": 9.99710745541441e-06, + "loss": 1.2386, + "step": 1190 + }, + { + "epoch": 0.30170994300189996, + "grad_norm": 4.305504322052002, + "learning_rate": 9.997078890028153e-06, + "loss": 1.4568, + "step": 1191 + }, + { + "epoch": 0.3019632678910703, + "grad_norm": 4.016643524169922, + "learning_rate": 9.997050184326263e-06, + "loss": 1.2184, + "step": 1192 + }, + { + "epoch": 0.30221659278024066, + "grad_norm": 4.105624198913574, + "learning_rate": 9.997021338309547e-06, + "loss": 1.3006, + "step": 1193 + }, + { + "epoch": 0.302469917669411, + "grad_norm": 4.551614284515381, + "learning_rate": 9.996992351978811e-06, + "loss": 1.4236, + "step": 1194 + }, + { + "epoch": 0.3027232425585814, + "grad_norm": 4.448981761932373, + "learning_rate": 9.996963225334874e-06, + "loss": 1.3243, + "step": 1195 + }, + { + "epoch": 0.30297656744775175, + "grad_norm": 4.29794454574585, + "learning_rate": 9.996933958378551e-06, + "loss": 1.3387, + "step": 1196 + }, + { + "epoch": 0.3032298923369221, + "grad_norm": 4.252987861633301, + "learning_rate": 9.996904551110664e-06, + "loss": 1.2944, + "step": 1197 + }, + { + "epoch": 0.30348321722609245, + "grad_norm": 4.383278846740723, + "learning_rate": 9.99687500353204e-06, + "loss": 1.296, + "step": 1198 + }, + { + "epoch": 0.3037365421152628, + "grad_norm": 4.380770683288574, + "learning_rate": 9.996845315643506e-06, + "loss": 1.2481, + "step": 1199 + }, + { + "epoch": 0.3039898670044332, + "grad_norm": 4.155014514923096, + "learning_rate": 9.996815487445899e-06, + "loss": 1.2227, + "step": 1200 + }, + { + "epoch": 0.30424319189360355, + "grad_norm": 4.016407489776611, + "learning_rate": 9.996785518940056e-06, + "loss": 1.4424, + "step": 1201 + }, + { + "epoch": 0.3044965167827739, + "grad_norm": 4.063757419586182, + "learning_rate": 9.996755410126815e-06, + "loss": 1.2652, + "step": 1202 + }, + { + "epoch": 0.30474984167194424, + "grad_norm": 4.2165117263793945, + "learning_rate": 9.996725161007027e-06, + "loss": 1.2779, + "step": 1203 + }, + { + "epoch": 0.30500316656111465, + "grad_norm": 3.8022258281707764, + "learning_rate": 9.996694771581535e-06, + "loss": 1.2567, + "step": 1204 + }, + { + "epoch": 0.305256491450285, + "grad_norm": 4.526707649230957, + "learning_rate": 9.996664241851197e-06, + "loss": 1.4895, + "step": 1205 + }, + { + "epoch": 0.30550981633945534, + "grad_norm": 3.676969289779663, + "learning_rate": 9.99663357181687e-06, + "loss": 1.1388, + "step": 1206 + }, + { + "epoch": 0.3057631412286257, + "grad_norm": 4.362067699432373, + "learning_rate": 9.996602761479413e-06, + "loss": 1.2873, + "step": 1207 + }, + { + "epoch": 0.3060164661177961, + "grad_norm": 4.138330936431885, + "learning_rate": 9.996571810839693e-06, + "loss": 1.1894, + "step": 1208 + }, + { + "epoch": 0.30626979100696644, + "grad_norm": 4.2472615242004395, + "learning_rate": 9.996540719898578e-06, + "loss": 1.2704, + "step": 1209 + }, + { + "epoch": 0.3065231158961368, + "grad_norm": 3.9246773719787598, + "learning_rate": 9.99650948865694e-06, + "loss": 1.2234, + "step": 1210 + }, + { + "epoch": 0.30677644078530714, + "grad_norm": 4.2852044105529785, + "learning_rate": 9.996478117115659e-06, + "loss": 1.2984, + "step": 1211 + }, + { + "epoch": 0.30702976567447754, + "grad_norm": 4.638847827911377, + "learning_rate": 9.996446605275614e-06, + "loss": 1.2492, + "step": 1212 + }, + { + "epoch": 0.3072830905636479, + "grad_norm": 4.264072418212891, + "learning_rate": 9.99641495313769e-06, + "loss": 1.3236, + "step": 1213 + }, + { + "epoch": 0.30753641545281823, + "grad_norm": 4.2414350509643555, + "learning_rate": 9.996383160702775e-06, + "loss": 1.328, + "step": 1214 + }, + { + "epoch": 0.3077897403419886, + "grad_norm": 3.8178317546844482, + "learning_rate": 9.996351227971763e-06, + "loss": 1.1453, + "step": 1215 + }, + { + "epoch": 0.308043065231159, + "grad_norm": 4.003973484039307, + "learning_rate": 9.996319154945551e-06, + "loss": 1.1736, + "step": 1216 + }, + { + "epoch": 0.30829639012032933, + "grad_norm": 4.237383842468262, + "learning_rate": 9.996286941625038e-06, + "loss": 1.377, + "step": 1217 + }, + { + "epoch": 0.3085497150094997, + "grad_norm": 4.385888576507568, + "learning_rate": 9.99625458801113e-06, + "loss": 1.3226, + "step": 1218 + }, + { + "epoch": 0.30880303989867003, + "grad_norm": 4.4331278800964355, + "learning_rate": 9.996222094104733e-06, + "loss": 1.3285, + "step": 1219 + }, + { + "epoch": 0.30905636478784043, + "grad_norm": 4.470774173736572, + "learning_rate": 9.996189459906762e-06, + "loss": 1.2614, + "step": 1220 + }, + { + "epoch": 0.3093096896770108, + "grad_norm": 4.2090229988098145, + "learning_rate": 9.996156685418133e-06, + "loss": 1.2239, + "step": 1221 + }, + { + "epoch": 0.3095630145661811, + "grad_norm": 3.836949348449707, + "learning_rate": 9.996123770639766e-06, + "loss": 1.2611, + "step": 1222 + }, + { + "epoch": 0.3098163394553515, + "grad_norm": 4.553760528564453, + "learning_rate": 9.996090715572587e-06, + "loss": 1.3546, + "step": 1223 + }, + { + "epoch": 0.3100696643445218, + "grad_norm": 4.262728691101074, + "learning_rate": 9.996057520217519e-06, + "loss": 1.2489, + "step": 1224 + }, + { + "epoch": 0.3103229892336922, + "grad_norm": 4.288120269775391, + "learning_rate": 9.996024184575497e-06, + "loss": 1.3574, + "step": 1225 + }, + { + "epoch": 0.3105763141228626, + "grad_norm": 4.323267936706543, + "learning_rate": 9.99599070864746e-06, + "loss": 1.388, + "step": 1226 + }, + { + "epoch": 0.3108296390120329, + "grad_norm": 3.9528911113739014, + "learning_rate": 9.995957092434345e-06, + "loss": 1.1117, + "step": 1227 + }, + { + "epoch": 0.31108296390120327, + "grad_norm": 4.321549415588379, + "learning_rate": 9.995923335937095e-06, + "loss": 1.2249, + "step": 1228 + }, + { + "epoch": 0.31133628879037367, + "grad_norm": 3.863180637359619, + "learning_rate": 9.995889439156661e-06, + "loss": 1.1733, + "step": 1229 + }, + { + "epoch": 0.311589613679544, + "grad_norm": 4.257288932800293, + "learning_rate": 9.995855402093991e-06, + "loss": 1.1108, + "step": 1230 + }, + { + "epoch": 0.31184293856871437, + "grad_norm": 4.414398193359375, + "learning_rate": 9.995821224750044e-06, + "loss": 1.2711, + "step": 1231 + }, + { + "epoch": 0.3120962634578847, + "grad_norm": 4.367593288421631, + "learning_rate": 9.995786907125778e-06, + "loss": 1.2059, + "step": 1232 + }, + { + "epoch": 0.3123495883470551, + "grad_norm": 4.501057147979736, + "learning_rate": 9.995752449222159e-06, + "loss": 1.2884, + "step": 1233 + }, + { + "epoch": 0.31260291323622547, + "grad_norm": 4.291118621826172, + "learning_rate": 9.99571785104015e-06, + "loss": 1.1887, + "step": 1234 + }, + { + "epoch": 0.3128562381253958, + "grad_norm": 4.475590705871582, + "learning_rate": 9.995683112580725e-06, + "loss": 1.428, + "step": 1235 + }, + { + "epoch": 0.31310956301456616, + "grad_norm": 4.3426923751831055, + "learning_rate": 9.99564823384486e-06, + "loss": 1.2912, + "step": 1236 + }, + { + "epoch": 0.31336288790373656, + "grad_norm": 4.068551540374756, + "learning_rate": 9.995613214833534e-06, + "loss": 1.2501, + "step": 1237 + }, + { + "epoch": 0.3136162127929069, + "grad_norm": 4.004886627197266, + "learning_rate": 9.995578055547732e-06, + "loss": 1.2266, + "step": 1238 + }, + { + "epoch": 0.31386953768207726, + "grad_norm": 4.359253883361816, + "learning_rate": 9.99554275598844e-06, + "loss": 1.2092, + "step": 1239 + }, + { + "epoch": 0.3141228625712476, + "grad_norm": 3.9630236625671387, + "learning_rate": 9.995507316156645e-06, + "loss": 1.0663, + "step": 1240 + }, + { + "epoch": 0.314376187460418, + "grad_norm": 4.438020706176758, + "learning_rate": 9.995471736053349e-06, + "loss": 1.2586, + "step": 1241 + }, + { + "epoch": 0.31462951234958836, + "grad_norm": 4.1128973960876465, + "learning_rate": 9.995436015679545e-06, + "loss": 1.2448, + "step": 1242 + }, + { + "epoch": 0.3148828372387587, + "grad_norm": 4.267119884490967, + "learning_rate": 9.995400155036241e-06, + "loss": 1.3054, + "step": 1243 + }, + { + "epoch": 0.31513616212792905, + "grad_norm": 4.559778213500977, + "learning_rate": 9.995364154124442e-06, + "loss": 1.3046, + "step": 1244 + }, + { + "epoch": 0.31538948701709946, + "grad_norm": 4.251293182373047, + "learning_rate": 9.995328012945158e-06, + "loss": 1.1169, + "step": 1245 + }, + { + "epoch": 0.3156428119062698, + "grad_norm": 4.158294200897217, + "learning_rate": 9.995291731499406e-06, + "loss": 1.2024, + "step": 1246 + }, + { + "epoch": 0.31589613679544015, + "grad_norm": 4.296648025512695, + "learning_rate": 9.995255309788202e-06, + "loss": 1.231, + "step": 1247 + }, + { + "epoch": 0.3161494616846105, + "grad_norm": 4.170580863952637, + "learning_rate": 9.99521874781257e-06, + "loss": 1.2121, + "step": 1248 + }, + { + "epoch": 0.31640278657378085, + "grad_norm": 4.245189666748047, + "learning_rate": 9.995182045573537e-06, + "loss": 1.2395, + "step": 1249 + }, + { + "epoch": 0.31665611146295125, + "grad_norm": 3.9477896690368652, + "learning_rate": 9.995145203072132e-06, + "loss": 1.2729, + "step": 1250 + }, + { + "epoch": 0.3169094363521216, + "grad_norm": 4.069958686828613, + "learning_rate": 9.995108220309392e-06, + "loss": 1.3298, + "step": 1251 + }, + { + "epoch": 0.31716276124129195, + "grad_norm": 3.9865245819091797, + "learning_rate": 9.995071097286355e-06, + "loss": 1.2307, + "step": 1252 + }, + { + "epoch": 0.3174160861304623, + "grad_norm": 4.537790298461914, + "learning_rate": 9.995033834004061e-06, + "loss": 1.1516, + "step": 1253 + }, + { + "epoch": 0.3176694110196327, + "grad_norm": 4.2048492431640625, + "learning_rate": 9.99499643046356e-06, + "loss": 1.4099, + "step": 1254 + }, + { + "epoch": 0.31792273590880304, + "grad_norm": 3.8523194789886475, + "learning_rate": 9.9949588866659e-06, + "loss": 1.0571, + "step": 1255 + }, + { + "epoch": 0.3181760607979734, + "grad_norm": 3.7813720703125, + "learning_rate": 9.994921202612135e-06, + "loss": 1.1963, + "step": 1256 + }, + { + "epoch": 0.31842938568714374, + "grad_norm": 3.999424934387207, + "learning_rate": 9.994883378303324e-06, + "loss": 1.0709, + "step": 1257 + }, + { + "epoch": 0.31868271057631414, + "grad_norm": 4.427967071533203, + "learning_rate": 9.994845413740529e-06, + "loss": 1.3181, + "step": 1258 + }, + { + "epoch": 0.3189360354654845, + "grad_norm": 4.221949577331543, + "learning_rate": 9.994807308924814e-06, + "loss": 1.1552, + "step": 1259 + }, + { + "epoch": 0.31918936035465484, + "grad_norm": 5.069612979888916, + "learning_rate": 9.99476906385725e-06, + "loss": 1.2541, + "step": 1260 + }, + { + "epoch": 0.3194426852438252, + "grad_norm": 4.107848644256592, + "learning_rate": 9.994730678538914e-06, + "loss": 1.2334, + "step": 1261 + }, + { + "epoch": 0.3196960101329956, + "grad_norm": 5.001278400421143, + "learning_rate": 9.994692152970882e-06, + "loss": 1.2908, + "step": 1262 + }, + { + "epoch": 0.31994933502216594, + "grad_norm": 4.346644401550293, + "learning_rate": 9.994653487154233e-06, + "loss": 1.3057, + "step": 1263 + }, + { + "epoch": 0.3202026599113363, + "grad_norm": 3.7297120094299316, + "learning_rate": 9.994614681090056e-06, + "loss": 1.0748, + "step": 1264 + }, + { + "epoch": 0.32045598480050663, + "grad_norm": 4.22995138168335, + "learning_rate": 9.994575734779439e-06, + "loss": 1.2675, + "step": 1265 + }, + { + "epoch": 0.32070930968967704, + "grad_norm": 3.8312008380889893, + "learning_rate": 9.994536648223477e-06, + "loss": 1.171, + "step": 1266 + }, + { + "epoch": 0.3209626345788474, + "grad_norm": 3.967376232147217, + "learning_rate": 9.994497421423266e-06, + "loss": 1.2821, + "step": 1267 + }, + { + "epoch": 0.32121595946801773, + "grad_norm": 4.221203327178955, + "learning_rate": 9.994458054379909e-06, + "loss": 1.2664, + "step": 1268 + }, + { + "epoch": 0.3214692843571881, + "grad_norm": 4.06121301651001, + "learning_rate": 9.994418547094511e-06, + "loss": 1.391, + "step": 1269 + }, + { + "epoch": 0.3217226092463585, + "grad_norm": 4.372332572937012, + "learning_rate": 9.99437889956818e-06, + "loss": 1.3267, + "step": 1270 + }, + { + "epoch": 0.32197593413552883, + "grad_norm": 4.301946640014648, + "learning_rate": 9.994339111802032e-06, + "loss": 1.3362, + "step": 1271 + }, + { + "epoch": 0.3222292590246992, + "grad_norm": 4.1606903076171875, + "learning_rate": 9.99429918379718e-06, + "loss": 1.3005, + "step": 1272 + }, + { + "epoch": 0.3224825839138695, + "grad_norm": 3.8666462898254395, + "learning_rate": 9.99425911555475e-06, + "loss": 1.0772, + "step": 1273 + }, + { + "epoch": 0.3227359088030399, + "grad_norm": 4.130020618438721, + "learning_rate": 9.994218907075863e-06, + "loss": 1.2813, + "step": 1274 + }, + { + "epoch": 0.3229892336922103, + "grad_norm": 3.832627296447754, + "learning_rate": 9.994178558361649e-06, + "loss": 1.2409, + "step": 1275 + }, + { + "epoch": 0.3232425585813806, + "grad_norm": 4.253661155700684, + "learning_rate": 9.994138069413244e-06, + "loss": 1.3443, + "step": 1276 + }, + { + "epoch": 0.32349588347055097, + "grad_norm": 4.230594158172607, + "learning_rate": 9.994097440231781e-06, + "loss": 1.3346, + "step": 1277 + }, + { + "epoch": 0.3237492083597213, + "grad_norm": 4.01607608795166, + "learning_rate": 9.994056670818404e-06, + "loss": 1.1863, + "step": 1278 + }, + { + "epoch": 0.3240025332488917, + "grad_norm": 4.280426979064941, + "learning_rate": 9.994015761174254e-06, + "loss": 1.3087, + "step": 1279 + }, + { + "epoch": 0.32425585813806207, + "grad_norm": 3.954005479812622, + "learning_rate": 9.993974711300485e-06, + "loss": 1.1972, + "step": 1280 + }, + { + "epoch": 0.3245091830272324, + "grad_norm": 4.103531837463379, + "learning_rate": 9.993933521198244e-06, + "loss": 1.3494, + "step": 1281 + }, + { + "epoch": 0.32476250791640277, + "grad_norm": 3.806126832962036, + "learning_rate": 9.99389219086869e-06, + "loss": 1.1307, + "step": 1282 + }, + { + "epoch": 0.32501583280557317, + "grad_norm": 4.244111061096191, + "learning_rate": 9.993850720312987e-06, + "loss": 1.326, + "step": 1283 + }, + { + "epoch": 0.3252691576947435, + "grad_norm": 4.36318826675415, + "learning_rate": 9.993809109532294e-06, + "loss": 1.2854, + "step": 1284 + }, + { + "epoch": 0.32552248258391386, + "grad_norm": 4.563369274139404, + "learning_rate": 9.993767358527781e-06, + "loss": 1.3687, + "step": 1285 + }, + { + "epoch": 0.3257758074730842, + "grad_norm": 4.065633296966553, + "learning_rate": 9.993725467300624e-06, + "loss": 1.2267, + "step": 1286 + }, + { + "epoch": 0.3260291323622546, + "grad_norm": 3.907613754272461, + "learning_rate": 9.993683435851995e-06, + "loss": 1.2013, + "step": 1287 + }, + { + "epoch": 0.32628245725142496, + "grad_norm": 4.399827480316162, + "learning_rate": 9.993641264183074e-06, + "loss": 1.3248, + "step": 1288 + }, + { + "epoch": 0.3265357821405953, + "grad_norm": 4.3525776863098145, + "learning_rate": 9.993598952295048e-06, + "loss": 1.2563, + "step": 1289 + }, + { + "epoch": 0.32678910702976566, + "grad_norm": 3.8334646224975586, + "learning_rate": 9.993556500189103e-06, + "loss": 1.1712, + "step": 1290 + }, + { + "epoch": 0.32704243191893606, + "grad_norm": 4.270079612731934, + "learning_rate": 9.993513907866432e-06, + "loss": 1.473, + "step": 1291 + }, + { + "epoch": 0.3272957568081064, + "grad_norm": 4.307861328125, + "learning_rate": 9.993471175328231e-06, + "loss": 1.2472, + "step": 1292 + }, + { + "epoch": 0.32754908169727676, + "grad_norm": 4.307832717895508, + "learning_rate": 9.9934283025757e-06, + "loss": 1.3726, + "step": 1293 + }, + { + "epoch": 0.3278024065864471, + "grad_norm": 3.930716037750244, + "learning_rate": 9.993385289610044e-06, + "loss": 1.1815, + "step": 1294 + }, + { + "epoch": 0.3280557314756175, + "grad_norm": 4.554092884063721, + "learning_rate": 9.993342136432467e-06, + "loss": 1.3118, + "step": 1295 + }, + { + "epoch": 0.32830905636478785, + "grad_norm": 3.947636365890503, + "learning_rate": 9.993298843044184e-06, + "loss": 0.9973, + "step": 1296 + }, + { + "epoch": 0.3285623812539582, + "grad_norm": 4.019169807434082, + "learning_rate": 9.99325540944641e-06, + "loss": 1.2246, + "step": 1297 + }, + { + "epoch": 0.32881570614312855, + "grad_norm": 4.576243877410889, + "learning_rate": 9.993211835640364e-06, + "loss": 1.2787, + "step": 1298 + }, + { + "epoch": 0.3290690310322989, + "grad_norm": 4.043989181518555, + "learning_rate": 9.99316812162727e-06, + "loss": 1.2234, + "step": 1299 + }, + { + "epoch": 0.3293223559214693, + "grad_norm": 4.106696605682373, + "learning_rate": 9.993124267408356e-06, + "loss": 1.1678, + "step": 1300 + }, + { + "epoch": 0.32957568081063965, + "grad_norm": 4.1464080810546875, + "learning_rate": 9.99308027298485e-06, + "loss": 1.297, + "step": 1301 + }, + { + "epoch": 0.32982900569981, + "grad_norm": 4.302769660949707, + "learning_rate": 9.993036138357993e-06, + "loss": 1.2412, + "step": 1302 + }, + { + "epoch": 0.33008233058898034, + "grad_norm": 3.8838820457458496, + "learning_rate": 9.99299186352902e-06, + "loss": 1.149, + "step": 1303 + }, + { + "epoch": 0.33033565547815075, + "grad_norm": 4.113027095794678, + "learning_rate": 9.992947448499176e-06, + "loss": 1.2752, + "step": 1304 + }, + { + "epoch": 0.3305889803673211, + "grad_norm": 4.165403366088867, + "learning_rate": 9.992902893269709e-06, + "loss": 1.2183, + "step": 1305 + }, + { + "epoch": 0.33084230525649144, + "grad_norm": 4.083942413330078, + "learning_rate": 9.992858197841866e-06, + "loss": 1.2577, + "step": 1306 + }, + { + "epoch": 0.3310956301456618, + "grad_norm": 4.231232166290283, + "learning_rate": 9.992813362216907e-06, + "loss": 1.2931, + "step": 1307 + }, + { + "epoch": 0.3313489550348322, + "grad_norm": 3.911257266998291, + "learning_rate": 9.992768386396088e-06, + "loss": 1.2065, + "step": 1308 + }, + { + "epoch": 0.33160227992400254, + "grad_norm": 4.078742027282715, + "learning_rate": 9.992723270380674e-06, + "loss": 1.2378, + "step": 1309 + }, + { + "epoch": 0.3318556048131729, + "grad_norm": 4.388514041900635, + "learning_rate": 9.992678014171928e-06, + "loss": 1.2061, + "step": 1310 + }, + { + "epoch": 0.33210892970234324, + "grad_norm": 4.175978183746338, + "learning_rate": 9.992632617771126e-06, + "loss": 1.261, + "step": 1311 + }, + { + "epoch": 0.33236225459151364, + "grad_norm": 4.308657646179199, + "learning_rate": 9.992587081179537e-06, + "loss": 1.334, + "step": 1312 + }, + { + "epoch": 0.332615579480684, + "grad_norm": 3.970914363861084, + "learning_rate": 9.992541404398445e-06, + "loss": 1.1876, + "step": 1313 + }, + { + "epoch": 0.33286890436985433, + "grad_norm": 3.9518797397613525, + "learning_rate": 9.99249558742913e-06, + "loss": 1.3064, + "step": 1314 + }, + { + "epoch": 0.3331222292590247, + "grad_norm": 4.303136348724365, + "learning_rate": 9.992449630272878e-06, + "loss": 1.3708, + "step": 1315 + }, + { + "epoch": 0.3333755541481951, + "grad_norm": 4.3481526374816895, + "learning_rate": 9.992403532930981e-06, + "loss": 1.2291, + "step": 1316 + }, + { + "epoch": 0.33362887903736543, + "grad_norm": 4.0205278396606445, + "learning_rate": 9.992357295404733e-06, + "loss": 1.2624, + "step": 1317 + }, + { + "epoch": 0.3338822039265358, + "grad_norm": 4.278614521026611, + "learning_rate": 9.99231091769543e-06, + "loss": 1.3077, + "step": 1318 + }, + { + "epoch": 0.33413552881570613, + "grad_norm": 4.188653469085693, + "learning_rate": 9.992264399804378e-06, + "loss": 1.2738, + "step": 1319 + }, + { + "epoch": 0.3343888537048765, + "grad_norm": 3.7268896102905273, + "learning_rate": 9.992217741732881e-06, + "loss": 1.1054, + "step": 1320 + }, + { + "epoch": 0.3346421785940469, + "grad_norm": 4.0296711921691895, + "learning_rate": 9.99217094348225e-06, + "loss": 1.1936, + "step": 1321 + }, + { + "epoch": 0.3348955034832172, + "grad_norm": 3.9426963329315186, + "learning_rate": 9.992124005053798e-06, + "loss": 1.2568, + "step": 1322 + }, + { + "epoch": 0.3351488283723876, + "grad_norm": 4.505002021789551, + "learning_rate": 9.992076926448844e-06, + "loss": 1.4986, + "step": 1323 + }, + { + "epoch": 0.3354021532615579, + "grad_norm": 4.16630220413208, + "learning_rate": 9.992029707668708e-06, + "loss": 1.3681, + "step": 1324 + }, + { + "epoch": 0.3356554781507283, + "grad_norm": 3.81494140625, + "learning_rate": 9.991982348714719e-06, + "loss": 1.2015, + "step": 1325 + }, + { + "epoch": 0.3359088030398987, + "grad_norm": 4.078320026397705, + "learning_rate": 9.991934849588205e-06, + "loss": 1.2564, + "step": 1326 + }, + { + "epoch": 0.336162127929069, + "grad_norm": 4.1204304695129395, + "learning_rate": 9.991887210290501e-06, + "loss": 1.1618, + "step": 1327 + }, + { + "epoch": 0.33641545281823937, + "grad_norm": 3.9501395225524902, + "learning_rate": 9.991839430822942e-06, + "loss": 1.2996, + "step": 1328 + }, + { + "epoch": 0.33666877770740977, + "grad_norm": 4.23710823059082, + "learning_rate": 9.991791511186872e-06, + "loss": 1.4021, + "step": 1329 + }, + { + "epoch": 0.3369221025965801, + "grad_norm": 4.005718231201172, + "learning_rate": 9.991743451383636e-06, + "loss": 1.1643, + "step": 1330 + }, + { + "epoch": 0.33717542748575047, + "grad_norm": 4.026569843292236, + "learning_rate": 9.991695251414584e-06, + "loss": 1.2508, + "step": 1331 + }, + { + "epoch": 0.3374287523749208, + "grad_norm": 4.557069778442383, + "learning_rate": 9.991646911281067e-06, + "loss": 1.3715, + "step": 1332 + }, + { + "epoch": 0.3376820772640912, + "grad_norm": 3.9800522327423096, + "learning_rate": 9.991598430984445e-06, + "loss": 1.213, + "step": 1333 + }, + { + "epoch": 0.33793540215326157, + "grad_norm": 4.316899299621582, + "learning_rate": 9.99154981052608e-06, + "loss": 1.2742, + "step": 1334 + }, + { + "epoch": 0.3381887270424319, + "grad_norm": 3.8521671295166016, + "learning_rate": 9.991501049907336e-06, + "loss": 1.263, + "step": 1335 + }, + { + "epoch": 0.33844205193160226, + "grad_norm": 4.271399974822998, + "learning_rate": 9.991452149129579e-06, + "loss": 1.2038, + "step": 1336 + }, + { + "epoch": 0.33869537682077266, + "grad_norm": 4.1595377922058105, + "learning_rate": 9.991403108194187e-06, + "loss": 1.2474, + "step": 1337 + }, + { + "epoch": 0.338948701709943, + "grad_norm": 4.392655372619629, + "learning_rate": 9.991353927102537e-06, + "loss": 1.3154, + "step": 1338 + }, + { + "epoch": 0.33920202659911336, + "grad_norm": 4.084737300872803, + "learning_rate": 9.991304605856006e-06, + "loss": 1.1643, + "step": 1339 + }, + { + "epoch": 0.3394553514882837, + "grad_norm": 3.8093276023864746, + "learning_rate": 9.99125514445598e-06, + "loss": 1.0724, + "step": 1340 + }, + { + "epoch": 0.3397086763774541, + "grad_norm": 4.508452892303467, + "learning_rate": 9.991205542903849e-06, + "loss": 1.2907, + "step": 1341 + }, + { + "epoch": 0.33996200126662446, + "grad_norm": 3.834325075149536, + "learning_rate": 9.991155801201006e-06, + "loss": 1.1093, + "step": 1342 + }, + { + "epoch": 0.3402153261557948, + "grad_norm": 4.575895309448242, + "learning_rate": 9.991105919348846e-06, + "loss": 1.2663, + "step": 1343 + }, + { + "epoch": 0.34046865104496515, + "grad_norm": 3.84704852104187, + "learning_rate": 9.991055897348773e-06, + "loss": 1.1895, + "step": 1344 + }, + { + "epoch": 0.3407219759341355, + "grad_norm": 4.40834379196167, + "learning_rate": 9.991005735202191e-06, + "loss": 1.3147, + "step": 1345 + }, + { + "epoch": 0.3409753008233059, + "grad_norm": 4.067554950714111, + "learning_rate": 9.990955432910504e-06, + "loss": 1.2896, + "step": 1346 + }, + { + "epoch": 0.34122862571247625, + "grad_norm": 3.7188527584075928, + "learning_rate": 9.99090499047513e-06, + "loss": 1.1266, + "step": 1347 + }, + { + "epoch": 0.3414819506016466, + "grad_norm": 4.089304447174072, + "learning_rate": 9.990854407897481e-06, + "loss": 1.2385, + "step": 1348 + }, + { + "epoch": 0.34173527549081695, + "grad_norm": 4.055856227874756, + "learning_rate": 9.99080368517898e-06, + "loss": 1.2671, + "step": 1349 + }, + { + "epoch": 0.34198860037998735, + "grad_norm": 4.142635822296143, + "learning_rate": 9.990752822321052e-06, + "loss": 1.2366, + "step": 1350 + }, + { + "epoch": 0.3422419252691577, + "grad_norm": 3.9640398025512695, + "learning_rate": 9.990701819325122e-06, + "loss": 1.2174, + "step": 1351 + }, + { + "epoch": 0.34249525015832805, + "grad_norm": 4.102181434631348, + "learning_rate": 9.990650676192626e-06, + "loss": 1.1915, + "step": 1352 + }, + { + "epoch": 0.3427485750474984, + "grad_norm": 3.8989973068237305, + "learning_rate": 9.990599392924996e-06, + "loss": 1.206, + "step": 1353 + }, + { + "epoch": 0.3430018999366688, + "grad_norm": 4.988248825073242, + "learning_rate": 9.990547969523673e-06, + "loss": 1.3651, + "step": 1354 + }, + { + "epoch": 0.34325522482583914, + "grad_norm": 3.986790418624878, + "learning_rate": 9.990496405990104e-06, + "loss": 1.0947, + "step": 1355 + }, + { + "epoch": 0.3435085497150095, + "grad_norm": 4.1601948738098145, + "learning_rate": 9.990444702325736e-06, + "loss": 1.2768, + "step": 1356 + }, + { + "epoch": 0.34376187460417984, + "grad_norm": 4.031864166259766, + "learning_rate": 9.990392858532017e-06, + "loss": 1.1873, + "step": 1357 + }, + { + "epoch": 0.34401519949335024, + "grad_norm": 4.27942419052124, + "learning_rate": 9.990340874610406e-06, + "loss": 1.338, + "step": 1358 + }, + { + "epoch": 0.3442685243825206, + "grad_norm": 3.944505214691162, + "learning_rate": 9.990288750562365e-06, + "loss": 1.2352, + "step": 1359 + }, + { + "epoch": 0.34452184927169094, + "grad_norm": 4.307021141052246, + "learning_rate": 9.99023648638935e-06, + "loss": 1.3033, + "step": 1360 + }, + { + "epoch": 0.3447751741608613, + "grad_norm": 4.350025177001953, + "learning_rate": 9.990184082092838e-06, + "loss": 1.3114, + "step": 1361 + }, + { + "epoch": 0.3450284990500317, + "grad_norm": 4.570362091064453, + "learning_rate": 9.990131537674293e-06, + "loss": 1.2122, + "step": 1362 + }, + { + "epoch": 0.34528182393920204, + "grad_norm": 4.033261299133301, + "learning_rate": 9.990078853135192e-06, + "loss": 1.2177, + "step": 1363 + }, + { + "epoch": 0.3455351488283724, + "grad_norm": 4.397818088531494, + "learning_rate": 9.990026028477018e-06, + "loss": 1.3178, + "step": 1364 + }, + { + "epoch": 0.34578847371754273, + "grad_norm": 3.95310115814209, + "learning_rate": 9.98997306370125e-06, + "loss": 1.1768, + "step": 1365 + }, + { + "epoch": 0.34604179860671314, + "grad_norm": 4.639983654022217, + "learning_rate": 9.98991995880938e-06, + "loss": 1.3302, + "step": 1366 + }, + { + "epoch": 0.3462951234958835, + "grad_norm": 4.064812660217285, + "learning_rate": 9.989866713802894e-06, + "loss": 1.3295, + "step": 1367 + }, + { + "epoch": 0.34654844838505383, + "grad_norm": 4.017191410064697, + "learning_rate": 9.98981332868329e-06, + "loss": 1.2532, + "step": 1368 + }, + { + "epoch": 0.3468017732742242, + "grad_norm": 4.332028865814209, + "learning_rate": 9.989759803452065e-06, + "loss": 1.256, + "step": 1369 + }, + { + "epoch": 0.3470550981633945, + "grad_norm": 4.250354290008545, + "learning_rate": 9.989706138110724e-06, + "loss": 1.3158, + "step": 1370 + }, + { + "epoch": 0.34730842305256493, + "grad_norm": 3.973611831665039, + "learning_rate": 9.989652332660773e-06, + "loss": 1.1776, + "step": 1371 + }, + { + "epoch": 0.3475617479417353, + "grad_norm": 4.323413848876953, + "learning_rate": 9.989598387103724e-06, + "loss": 1.2984, + "step": 1372 + }, + { + "epoch": 0.3478150728309056, + "grad_norm": 4.1420159339904785, + "learning_rate": 9.98954430144109e-06, + "loss": 1.3327, + "step": 1373 + }, + { + "epoch": 0.348068397720076, + "grad_norm": 4.249251842498779, + "learning_rate": 9.98949007567439e-06, + "loss": 1.3095, + "step": 1374 + }, + { + "epoch": 0.3483217226092464, + "grad_norm": 4.179461479187012, + "learning_rate": 9.989435709805148e-06, + "loss": 1.1735, + "step": 1375 + }, + { + "epoch": 0.3485750474984167, + "grad_norm": 4.478822231292725, + "learning_rate": 9.989381203834892e-06, + "loss": 1.3818, + "step": 1376 + }, + { + "epoch": 0.34882837238758707, + "grad_norm": 4.347557067871094, + "learning_rate": 9.989326557765147e-06, + "loss": 1.4084, + "step": 1377 + }, + { + "epoch": 0.3490816972767574, + "grad_norm": 4.458835124969482, + "learning_rate": 9.98927177159745e-06, + "loss": 1.2902, + "step": 1378 + }, + { + "epoch": 0.3493350221659278, + "grad_norm": 4.278656482696533, + "learning_rate": 9.989216845333343e-06, + "loss": 1.2257, + "step": 1379 + }, + { + "epoch": 0.34958834705509817, + "grad_norm": 4.385313034057617, + "learning_rate": 9.989161778974364e-06, + "loss": 1.2569, + "step": 1380 + }, + { + "epoch": 0.3498416719442685, + "grad_norm": 4.05797815322876, + "learning_rate": 9.98910657252206e-06, + "loss": 1.2239, + "step": 1381 + }, + { + "epoch": 0.35009499683343887, + "grad_norm": 3.951141834259033, + "learning_rate": 9.989051225977982e-06, + "loss": 1.3011, + "step": 1382 + }, + { + "epoch": 0.35034832172260927, + "grad_norm": 4.144949436187744, + "learning_rate": 9.988995739343684e-06, + "loss": 1.3402, + "step": 1383 + }, + { + "epoch": 0.3506016466117796, + "grad_norm": 4.196046829223633, + "learning_rate": 9.988940112620724e-06, + "loss": 1.2248, + "step": 1384 + }, + { + "epoch": 0.35085497150094996, + "grad_norm": 3.6633708477020264, + "learning_rate": 9.988884345810664e-06, + "loss": 1.1418, + "step": 1385 + }, + { + "epoch": 0.3511082963901203, + "grad_norm": 4.088935375213623, + "learning_rate": 9.988828438915068e-06, + "loss": 1.3156, + "step": 1386 + }, + { + "epoch": 0.3513616212792907, + "grad_norm": 3.9691689014434814, + "learning_rate": 9.98877239193551e-06, + "loss": 1.2846, + "step": 1387 + }, + { + "epoch": 0.35161494616846106, + "grad_norm": 4.316605091094971, + "learning_rate": 9.98871620487356e-06, + "loss": 1.3143, + "step": 1388 + }, + { + "epoch": 0.3518682710576314, + "grad_norm": 4.068321228027344, + "learning_rate": 9.988659877730798e-06, + "loss": 1.2556, + "step": 1389 + }, + { + "epoch": 0.35212159594680176, + "grad_norm": 4.39426326751709, + "learning_rate": 9.988603410508803e-06, + "loss": 1.2159, + "step": 1390 + }, + { + "epoch": 0.35237492083597216, + "grad_norm": 4.016197204589844, + "learning_rate": 9.988546803209164e-06, + "loss": 1.2425, + "step": 1391 + }, + { + "epoch": 0.3526282457251425, + "grad_norm": 4.415781497955322, + "learning_rate": 9.988490055833468e-06, + "loss": 1.2436, + "step": 1392 + }, + { + "epoch": 0.35288157061431286, + "grad_norm": 4.049761772155762, + "learning_rate": 9.988433168383309e-06, + "loss": 1.1914, + "step": 1393 + }, + { + "epoch": 0.3531348955034832, + "grad_norm": 4.043946743011475, + "learning_rate": 9.988376140860285e-06, + "loss": 1.3694, + "step": 1394 + }, + { + "epoch": 0.35338822039265355, + "grad_norm": 4.123354434967041, + "learning_rate": 9.988318973265998e-06, + "loss": 1.146, + "step": 1395 + }, + { + "epoch": 0.35364154528182395, + "grad_norm": 4.2967529296875, + "learning_rate": 9.98826166560205e-06, + "loss": 1.1554, + "step": 1396 + }, + { + "epoch": 0.3538948701709943, + "grad_norm": 3.7744598388671875, + "learning_rate": 9.988204217870055e-06, + "loss": 1.1237, + "step": 1397 + }, + { + "epoch": 0.35414819506016465, + "grad_norm": 4.072709560394287, + "learning_rate": 9.98814663007162e-06, + "loss": 1.2496, + "step": 1398 + }, + { + "epoch": 0.354401519949335, + "grad_norm": 3.9878129959106445, + "learning_rate": 9.988088902208367e-06, + "loss": 1.1834, + "step": 1399 + }, + { + "epoch": 0.3546548448385054, + "grad_norm": 3.452812671661377, + "learning_rate": 9.988031034281917e-06, + "loss": 1.0854, + "step": 1400 + }, + { + "epoch": 0.35490816972767575, + "grad_norm": 4.219860553741455, + "learning_rate": 9.987973026293891e-06, + "loss": 1.2147, + "step": 1401 + }, + { + "epoch": 0.3551614946168461, + "grad_norm": 4.2590012550354, + "learning_rate": 9.987914878245921e-06, + "loss": 1.3714, + "step": 1402 + }, + { + "epoch": 0.35541481950601644, + "grad_norm": 3.835472822189331, + "learning_rate": 9.98785659013964e-06, + "loss": 1.3531, + "step": 1403 + }, + { + "epoch": 0.35566814439518685, + "grad_norm": 3.726508617401123, + "learning_rate": 9.987798161976683e-06, + "loss": 1.2905, + "step": 1404 + }, + { + "epoch": 0.3559214692843572, + "grad_norm": 3.9989113807678223, + "learning_rate": 9.987739593758691e-06, + "loss": 1.2383, + "step": 1405 + }, + { + "epoch": 0.35617479417352754, + "grad_norm": 4.159228801727295, + "learning_rate": 9.98768088548731e-06, + "loss": 1.2434, + "step": 1406 + }, + { + "epoch": 0.3564281190626979, + "grad_norm": 3.9649009704589844, + "learning_rate": 9.987622037164185e-06, + "loss": 1.2097, + "step": 1407 + }, + { + "epoch": 0.3566814439518683, + "grad_norm": 3.781782865524292, + "learning_rate": 9.987563048790971e-06, + "loss": 1.2509, + "step": 1408 + }, + { + "epoch": 0.35693476884103864, + "grad_norm": 4.055093288421631, + "learning_rate": 9.987503920369326e-06, + "loss": 1.143, + "step": 1409 + }, + { + "epoch": 0.357188093730209, + "grad_norm": 4.081745624542236, + "learning_rate": 9.987444651900909e-06, + "loss": 1.1594, + "step": 1410 + }, + { + "epoch": 0.35744141861937934, + "grad_norm": 4.309524059295654, + "learning_rate": 9.987385243387381e-06, + "loss": 1.1907, + "step": 1411 + }, + { + "epoch": 0.35769474350854974, + "grad_norm": 3.8126816749572754, + "learning_rate": 9.987325694830414e-06, + "loss": 1.1785, + "step": 1412 + }, + { + "epoch": 0.3579480683977201, + "grad_norm": 4.495013236999512, + "learning_rate": 9.98726600623168e-06, + "loss": 1.3577, + "step": 1413 + }, + { + "epoch": 0.35820139328689043, + "grad_norm": 4.093703269958496, + "learning_rate": 9.987206177592852e-06, + "loss": 1.2145, + "step": 1414 + }, + { + "epoch": 0.3584547181760608, + "grad_norm": 4.1885905265808105, + "learning_rate": 9.987146208915612e-06, + "loss": 1.3387, + "step": 1415 + }, + { + "epoch": 0.3587080430652312, + "grad_norm": 4.143773078918457, + "learning_rate": 9.987086100201646e-06, + "loss": 1.1747, + "step": 1416 + }, + { + "epoch": 0.35896136795440153, + "grad_norm": 3.8252334594726562, + "learning_rate": 9.98702585145264e-06, + "loss": 1.1336, + "step": 1417 + }, + { + "epoch": 0.3592146928435719, + "grad_norm": 3.914947986602783, + "learning_rate": 9.986965462670282e-06, + "loss": 1.1176, + "step": 1418 + }, + { + "epoch": 0.35946801773274223, + "grad_norm": 3.8457250595092773, + "learning_rate": 9.986904933856274e-06, + "loss": 1.2418, + "step": 1419 + }, + { + "epoch": 0.3597213426219126, + "grad_norm": 4.563148021697998, + "learning_rate": 9.98684426501231e-06, + "loss": 1.3218, + "step": 1420 + }, + { + "epoch": 0.359974667511083, + "grad_norm": 4.02828311920166, + "learning_rate": 9.9867834561401e-06, + "loss": 1.166, + "step": 1421 + }, + { + "epoch": 0.3602279924002533, + "grad_norm": 3.606048822402954, + "learning_rate": 9.986722507241344e-06, + "loss": 1.1854, + "step": 1422 + }, + { + "epoch": 0.3604813172894237, + "grad_norm": 4.164137840270996, + "learning_rate": 9.986661418317759e-06, + "loss": 1.3286, + "step": 1423 + }, + { + "epoch": 0.360734642178594, + "grad_norm": 4.220963001251221, + "learning_rate": 9.986600189371058e-06, + "loss": 1.309, + "step": 1424 + }, + { + "epoch": 0.3609879670677644, + "grad_norm": 3.883132219314575, + "learning_rate": 9.986538820402962e-06, + "loss": 1.3193, + "step": 1425 + }, + { + "epoch": 0.3612412919569348, + "grad_norm": 4.095963478088379, + "learning_rate": 9.986477311415192e-06, + "loss": 1.3036, + "step": 1426 + }, + { + "epoch": 0.3614946168461051, + "grad_norm": 3.9755353927612305, + "learning_rate": 9.986415662409476e-06, + "loss": 1.3306, + "step": 1427 + }, + { + "epoch": 0.36174794173527547, + "grad_norm": 3.626171588897705, + "learning_rate": 9.986353873387545e-06, + "loss": 1.1595, + "step": 1428 + }, + { + "epoch": 0.36200126662444587, + "grad_norm": 3.70266056060791, + "learning_rate": 9.986291944351136e-06, + "loss": 1.2649, + "step": 1429 + }, + { + "epoch": 0.3622545915136162, + "grad_norm": 3.8242764472961426, + "learning_rate": 9.986229875301984e-06, + "loss": 1.3271, + "step": 1430 + }, + { + "epoch": 0.36250791640278657, + "grad_norm": 4.0928955078125, + "learning_rate": 9.986167666241834e-06, + "loss": 1.1441, + "step": 1431 + }, + { + "epoch": 0.3627612412919569, + "grad_norm": 4.2461957931518555, + "learning_rate": 9.986105317172434e-06, + "loss": 1.24, + "step": 1432 + }, + { + "epoch": 0.3630145661811273, + "grad_norm": 3.7550787925720215, + "learning_rate": 9.986042828095534e-06, + "loss": 1.2437, + "step": 1433 + }, + { + "epoch": 0.36326789107029767, + "grad_norm": 3.7784383296966553, + "learning_rate": 9.985980199012887e-06, + "loss": 1.1827, + "step": 1434 + }, + { + "epoch": 0.363521215959468, + "grad_norm": 3.8189046382904053, + "learning_rate": 9.985917429926253e-06, + "loss": 1.0869, + "step": 1435 + }, + { + "epoch": 0.36377454084863836, + "grad_norm": 4.422670841217041, + "learning_rate": 9.985854520837396e-06, + "loss": 1.2184, + "step": 1436 + }, + { + "epoch": 0.36402786573780876, + "grad_norm": 4.154664993286133, + "learning_rate": 9.985791471748079e-06, + "loss": 1.1978, + "step": 1437 + }, + { + "epoch": 0.3642811906269791, + "grad_norm": 3.967803955078125, + "learning_rate": 9.985728282660075e-06, + "loss": 1.2848, + "step": 1438 + }, + { + "epoch": 0.36453451551614946, + "grad_norm": 4.257202625274658, + "learning_rate": 9.985664953575157e-06, + "loss": 1.2945, + "step": 1439 + }, + { + "epoch": 0.3647878404053198, + "grad_norm": 4.032511234283447, + "learning_rate": 9.985601484495105e-06, + "loss": 1.1835, + "step": 1440 + }, + { + "epoch": 0.3650411652944902, + "grad_norm": 4.19437313079834, + "learning_rate": 9.985537875421698e-06, + "loss": 1.3775, + "step": 1441 + }, + { + "epoch": 0.36529449018366056, + "grad_norm": 4.085351943969727, + "learning_rate": 9.985474126356724e-06, + "loss": 1.2746, + "step": 1442 + }, + { + "epoch": 0.3655478150728309, + "grad_norm": 3.9030771255493164, + "learning_rate": 9.985410237301976e-06, + "loss": 1.2155, + "step": 1443 + }, + { + "epoch": 0.36580113996200125, + "grad_norm": 3.9144396781921387, + "learning_rate": 9.985346208259244e-06, + "loss": 1.1466, + "step": 1444 + }, + { + "epoch": 0.3660544648511716, + "grad_norm": 4.423661708831787, + "learning_rate": 9.985282039230326e-06, + "loss": 1.2705, + "step": 1445 + }, + { + "epoch": 0.366307789740342, + "grad_norm": 3.9243950843811035, + "learning_rate": 9.985217730217025e-06, + "loss": 1.2001, + "step": 1446 + }, + { + "epoch": 0.36656111462951235, + "grad_norm": 4.033255577087402, + "learning_rate": 9.985153281221144e-06, + "loss": 1.3083, + "step": 1447 + }, + { + "epoch": 0.3668144395186827, + "grad_norm": 4.167736053466797, + "learning_rate": 9.9850886922445e-06, + "loss": 1.1748, + "step": 1448 + }, + { + "epoch": 0.36706776440785305, + "grad_norm": 3.9712142944335938, + "learning_rate": 9.9850239632889e-06, + "loss": 1.1794, + "step": 1449 + }, + { + "epoch": 0.36732108929702345, + "grad_norm": 4.448064804077148, + "learning_rate": 9.984959094356163e-06, + "loss": 1.2823, + "step": 1450 + }, + { + "epoch": 0.3675744141861938, + "grad_norm": 4.087486743927002, + "learning_rate": 9.984894085448111e-06, + "loss": 1.4028, + "step": 1451 + }, + { + "epoch": 0.36782773907536415, + "grad_norm": 3.659010410308838, + "learning_rate": 9.98482893656657e-06, + "loss": 1.1813, + "step": 1452 + }, + { + "epoch": 0.3680810639645345, + "grad_norm": 3.814033031463623, + "learning_rate": 9.984763647713369e-06, + "loss": 1.1237, + "step": 1453 + }, + { + "epoch": 0.3683343888537049, + "grad_norm": 4.058841228485107, + "learning_rate": 9.984698218890341e-06, + "loss": 1.4169, + "step": 1454 + }, + { + "epoch": 0.36858771374287524, + "grad_norm": 3.9934372901916504, + "learning_rate": 9.984632650099322e-06, + "loss": 1.2431, + "step": 1455 + }, + { + "epoch": 0.3688410386320456, + "grad_norm": 3.9111716747283936, + "learning_rate": 9.984566941342156e-06, + "loss": 1.2912, + "step": 1456 + }, + { + "epoch": 0.36909436352121594, + "grad_norm": 4.044859409332275, + "learning_rate": 9.984501092620685e-06, + "loss": 1.2513, + "step": 1457 + }, + { + "epoch": 0.36934768841038634, + "grad_norm": 3.935584306716919, + "learning_rate": 9.98443510393676e-06, + "loss": 1.2562, + "step": 1458 + }, + { + "epoch": 0.3696010132995567, + "grad_norm": 3.825549840927124, + "learning_rate": 9.984368975292233e-06, + "loss": 1.2572, + "step": 1459 + }, + { + "epoch": 0.36985433818872704, + "grad_norm": 4.032233238220215, + "learning_rate": 9.984302706688962e-06, + "loss": 1.2745, + "step": 1460 + }, + { + "epoch": 0.3701076630778974, + "grad_norm": 4.025396823883057, + "learning_rate": 9.984236298128806e-06, + "loss": 1.224, + "step": 1461 + }, + { + "epoch": 0.3703609879670678, + "grad_norm": 4.060763359069824, + "learning_rate": 9.984169749613632e-06, + "loss": 1.1462, + "step": 1462 + }, + { + "epoch": 0.37061431285623814, + "grad_norm": 3.721524238586426, + "learning_rate": 9.984103061145306e-06, + "loss": 1.2868, + "step": 1463 + }, + { + "epoch": 0.3708676377454085, + "grad_norm": 3.8655741214752197, + "learning_rate": 9.984036232725702e-06, + "loss": 1.1302, + "step": 1464 + }, + { + "epoch": 0.37112096263457883, + "grad_norm": 3.8349742889404297, + "learning_rate": 9.983969264356697e-06, + "loss": 1.1663, + "step": 1465 + }, + { + "epoch": 0.37137428752374924, + "grad_norm": 4.160083770751953, + "learning_rate": 9.983902156040172e-06, + "loss": 1.2775, + "step": 1466 + }, + { + "epoch": 0.3716276124129196, + "grad_norm": 3.9831254482269287, + "learning_rate": 9.983834907778009e-06, + "loss": 1.3009, + "step": 1467 + }, + { + "epoch": 0.37188093730208993, + "grad_norm": 3.8674516677856445, + "learning_rate": 9.9837675195721e-06, + "loss": 1.1354, + "step": 1468 + }, + { + "epoch": 0.3721342621912603, + "grad_norm": 3.812502145767212, + "learning_rate": 9.98369999142433e-06, + "loss": 1.0914, + "step": 1469 + }, + { + "epoch": 0.3723875870804306, + "grad_norm": 3.684924840927124, + "learning_rate": 9.983632323336606e-06, + "loss": 1.1655, + "step": 1470 + }, + { + "epoch": 0.37264091196960103, + "grad_norm": 4.2497968673706055, + "learning_rate": 9.983564515310817e-06, + "loss": 1.2061, + "step": 1471 + }, + { + "epoch": 0.3728942368587714, + "grad_norm": 4.240072250366211, + "learning_rate": 9.983496567348874e-06, + "loss": 1.2387, + "step": 1472 + }, + { + "epoch": 0.3731475617479417, + "grad_norm": 4.081851482391357, + "learning_rate": 9.983428479452683e-06, + "loss": 1.2998, + "step": 1473 + }, + { + "epoch": 0.3734008866371121, + "grad_norm": 3.9202663898468018, + "learning_rate": 9.983360251624156e-06, + "loss": 1.1734, + "step": 1474 + }, + { + "epoch": 0.3736542115262825, + "grad_norm": 4.135853290557861, + "learning_rate": 9.98329188386521e-06, + "loss": 1.25, + "step": 1475 + }, + { + "epoch": 0.3739075364154528, + "grad_norm": 4.302660942077637, + "learning_rate": 9.983223376177761e-06, + "loss": 1.1406, + "step": 1476 + }, + { + "epoch": 0.37416086130462317, + "grad_norm": 4.354469299316406, + "learning_rate": 9.983154728563738e-06, + "loss": 1.2616, + "step": 1477 + }, + { + "epoch": 0.3744141861937935, + "grad_norm": 3.839205503463745, + "learning_rate": 9.983085941025063e-06, + "loss": 1.1259, + "step": 1478 + }, + { + "epoch": 0.3746675110829639, + "grad_norm": 3.928718090057373, + "learning_rate": 9.983017013563671e-06, + "loss": 1.1985, + "step": 1479 + }, + { + "epoch": 0.37492083597213427, + "grad_norm": 3.98683762550354, + "learning_rate": 9.982947946181497e-06, + "loss": 1.2515, + "step": 1480 + }, + { + "epoch": 0.3751741608613046, + "grad_norm": 4.049800872802734, + "learning_rate": 9.98287873888048e-06, + "loss": 1.2324, + "step": 1481 + }, + { + "epoch": 0.37542748575047497, + "grad_norm": 4.102850914001465, + "learning_rate": 9.982809391662563e-06, + "loss": 1.244, + "step": 1482 + }, + { + "epoch": 0.37568081063964537, + "grad_norm": 4.339122772216797, + "learning_rate": 9.982739904529695e-06, + "loss": 1.3376, + "step": 1483 + }, + { + "epoch": 0.3759341355288157, + "grad_norm": 3.813641309738159, + "learning_rate": 9.982670277483824e-06, + "loss": 1.2096, + "step": 1484 + }, + { + "epoch": 0.37618746041798606, + "grad_norm": 4.246204853057861, + "learning_rate": 9.982600510526908e-06, + "loss": 1.3492, + "step": 1485 + }, + { + "epoch": 0.3764407853071564, + "grad_norm": 3.831717014312744, + "learning_rate": 9.982530603660905e-06, + "loss": 1.2275, + "step": 1486 + }, + { + "epoch": 0.3766941101963268, + "grad_norm": 4.106499195098877, + "learning_rate": 9.982460556887776e-06, + "loss": 1.2366, + "step": 1487 + }, + { + "epoch": 0.37694743508549716, + "grad_norm": 4.112460136413574, + "learning_rate": 9.982390370209492e-06, + "loss": 1.3153, + "step": 1488 + }, + { + "epoch": 0.3772007599746675, + "grad_norm": 4.27755880355835, + "learning_rate": 9.982320043628022e-06, + "loss": 1.2236, + "step": 1489 + }, + { + "epoch": 0.37745408486383786, + "grad_norm": 3.8407411575317383, + "learning_rate": 9.982249577145338e-06, + "loss": 1.1825, + "step": 1490 + }, + { + "epoch": 0.37770740975300826, + "grad_norm": 4.075498580932617, + "learning_rate": 9.982178970763421e-06, + "loss": 1.2772, + "step": 1491 + }, + { + "epoch": 0.3779607346421786, + "grad_norm": 4.027531147003174, + "learning_rate": 9.982108224484255e-06, + "loss": 1.3049, + "step": 1492 + }, + { + "epoch": 0.37821405953134896, + "grad_norm": 4.01485013961792, + "learning_rate": 9.982037338309824e-06, + "loss": 1.3352, + "step": 1493 + }, + { + "epoch": 0.3784673844205193, + "grad_norm": 4.572307586669922, + "learning_rate": 9.98196631224212e-06, + "loss": 1.1925, + "step": 1494 + }, + { + "epoch": 0.37872070930968965, + "grad_norm": 3.935351848602295, + "learning_rate": 9.981895146283139e-06, + "loss": 1.2807, + "step": 1495 + }, + { + "epoch": 0.37897403419886005, + "grad_norm": 4.141636371612549, + "learning_rate": 9.981823840434875e-06, + "loss": 1.3289, + "step": 1496 + }, + { + "epoch": 0.3792273590880304, + "grad_norm": 3.8772614002227783, + "learning_rate": 9.98175239469933e-06, + "loss": 1.2172, + "step": 1497 + }, + { + "epoch": 0.37948068397720075, + "grad_norm": 3.867401599884033, + "learning_rate": 9.981680809078516e-06, + "loss": 1.2663, + "step": 1498 + }, + { + "epoch": 0.3797340088663711, + "grad_norm": 4.0219831466674805, + "learning_rate": 9.98160908357444e-06, + "loss": 1.3242, + "step": 1499 + }, + { + "epoch": 0.3799873337555415, + "grad_norm": 4.100152969360352, + "learning_rate": 9.981537218189113e-06, + "loss": 1.3128, + "step": 1500 + }, + { + "epoch": 0.3799873337555415, + "eval_loss": 1.2583286762237549, + "eval_runtime": 12.7009, + "eval_samples_per_second": 31.494, + "eval_steps_per_second": 3.937, + "step": 1500 + }, + { + "epoch": 0.38024065864471185, + "grad_norm": 3.9439661502838135, + "learning_rate": 9.981465212924557e-06, + "loss": 1.1865, + "step": 1501 + }, + { + "epoch": 0.3804939835338822, + "grad_norm": 3.7056007385253906, + "learning_rate": 9.981393067782793e-06, + "loss": 1.2072, + "step": 1502 + }, + { + "epoch": 0.38074730842305254, + "grad_norm": 4.4519171714782715, + "learning_rate": 9.981320782765847e-06, + "loss": 1.3596, + "step": 1503 + }, + { + "epoch": 0.38100063331222295, + "grad_norm": 3.824197769165039, + "learning_rate": 9.981248357875745e-06, + "loss": 1.24, + "step": 1504 + }, + { + "epoch": 0.3812539582013933, + "grad_norm": 4.500977039337158, + "learning_rate": 9.981175793114526e-06, + "loss": 1.2211, + "step": 1505 + }, + { + "epoch": 0.38150728309056364, + "grad_norm": 3.888195514678955, + "learning_rate": 9.981103088484226e-06, + "loss": 1.1836, + "step": 1506 + }, + { + "epoch": 0.381760607979734, + "grad_norm": 4.110970497131348, + "learning_rate": 9.981030243986885e-06, + "loss": 1.1025, + "step": 1507 + }, + { + "epoch": 0.3820139328689044, + "grad_norm": 4.05878210067749, + "learning_rate": 9.980957259624549e-06, + "loss": 1.2961, + "step": 1508 + }, + { + "epoch": 0.38226725775807474, + "grad_norm": 4.440320014953613, + "learning_rate": 9.980884135399268e-06, + "loss": 1.2095, + "step": 1509 + }, + { + "epoch": 0.3825205826472451, + "grad_norm": 3.711378574371338, + "learning_rate": 9.980810871313094e-06, + "loss": 1.1757, + "step": 1510 + }, + { + "epoch": 0.38277390753641544, + "grad_norm": 4.575596809387207, + "learning_rate": 9.980737467368086e-06, + "loss": 1.3483, + "step": 1511 + }, + { + "epoch": 0.38302723242558584, + "grad_norm": 3.8007843494415283, + "learning_rate": 9.980663923566306e-06, + "loss": 1.19, + "step": 1512 + }, + { + "epoch": 0.3832805573147562, + "grad_norm": 4.5330119132995605, + "learning_rate": 9.980590239909814e-06, + "loss": 1.1921, + "step": 1513 + }, + { + "epoch": 0.38353388220392653, + "grad_norm": 3.7246146202087402, + "learning_rate": 9.980516416400683e-06, + "loss": 1.1175, + "step": 1514 + }, + { + "epoch": 0.3837872070930969, + "grad_norm": 3.7267162799835205, + "learning_rate": 9.980442453040986e-06, + "loss": 1.1074, + "step": 1515 + }, + { + "epoch": 0.38404053198226723, + "grad_norm": 3.743905782699585, + "learning_rate": 9.980368349832799e-06, + "loss": 1.2421, + "step": 1516 + }, + { + "epoch": 0.38429385687143763, + "grad_norm": 4.078317642211914, + "learning_rate": 9.980294106778203e-06, + "loss": 1.271, + "step": 1517 + }, + { + "epoch": 0.384547181760608, + "grad_norm": 4.014880180358887, + "learning_rate": 9.980219723879283e-06, + "loss": 1.258, + "step": 1518 + }, + { + "epoch": 0.38480050664977833, + "grad_norm": 4.1032538414001465, + "learning_rate": 9.980145201138127e-06, + "loss": 1.3407, + "step": 1519 + }, + { + "epoch": 0.3850538315389487, + "grad_norm": 3.856544017791748, + "learning_rate": 9.980070538556828e-06, + "loss": 1.1389, + "step": 1520 + }, + { + "epoch": 0.3853071564281191, + "grad_norm": 4.109692573547363, + "learning_rate": 9.979995736137482e-06, + "loss": 1.3166, + "step": 1521 + }, + { + "epoch": 0.3855604813172894, + "grad_norm": 4.3691840171813965, + "learning_rate": 9.979920793882191e-06, + "loss": 1.3517, + "step": 1522 + }, + { + "epoch": 0.3858138062064598, + "grad_norm": 4.090217113494873, + "learning_rate": 9.979845711793057e-06, + "loss": 1.3177, + "step": 1523 + }, + { + "epoch": 0.3860671310956301, + "grad_norm": 4.154439449310303, + "learning_rate": 9.97977048987219e-06, + "loss": 1.3189, + "step": 1524 + }, + { + "epoch": 0.3863204559848005, + "grad_norm": 3.9512176513671875, + "learning_rate": 9.9796951281217e-06, + "loss": 1.2516, + "step": 1525 + }, + { + "epoch": 0.3865737808739709, + "grad_norm": 4.008617401123047, + "learning_rate": 9.979619626543705e-06, + "loss": 1.2444, + "step": 1526 + }, + { + "epoch": 0.3868271057631412, + "grad_norm": 4.366746425628662, + "learning_rate": 9.979543985140325e-06, + "loss": 1.2738, + "step": 1527 + }, + { + "epoch": 0.38708043065231157, + "grad_norm": 4.0556745529174805, + "learning_rate": 9.979468203913684e-06, + "loss": 1.2275, + "step": 1528 + }, + { + "epoch": 0.38733375554148197, + "grad_norm": 3.4091527462005615, + "learning_rate": 9.97939228286591e-06, + "loss": 1.1072, + "step": 1529 + }, + { + "epoch": 0.3875870804306523, + "grad_norm": 3.441343069076538, + "learning_rate": 9.979316221999133e-06, + "loss": 1.1414, + "step": 1530 + }, + { + "epoch": 0.38784040531982267, + "grad_norm": 4.0893659591674805, + "learning_rate": 9.979240021315493e-06, + "loss": 1.2421, + "step": 1531 + }, + { + "epoch": 0.388093730208993, + "grad_norm": 3.9928534030914307, + "learning_rate": 9.979163680817124e-06, + "loss": 1.2132, + "step": 1532 + }, + { + "epoch": 0.3883470550981634, + "grad_norm": 3.8956265449523926, + "learning_rate": 9.979087200506175e-06, + "loss": 1.14, + "step": 1533 + }, + { + "epoch": 0.38860037998733377, + "grad_norm": 3.679107666015625, + "learning_rate": 9.97901058038479e-06, + "loss": 1.1185, + "step": 1534 + }, + { + "epoch": 0.3888537048765041, + "grad_norm": 3.8407785892486572, + "learning_rate": 9.978933820455119e-06, + "loss": 1.1715, + "step": 1535 + }, + { + "epoch": 0.38910702976567446, + "grad_norm": 3.9353787899017334, + "learning_rate": 9.978856920719321e-06, + "loss": 1.1604, + "step": 1536 + }, + { + "epoch": 0.38936035465484486, + "grad_norm": 3.892644166946411, + "learning_rate": 9.978779881179557e-06, + "loss": 1.3091, + "step": 1537 + }, + { + "epoch": 0.3896136795440152, + "grad_norm": 3.780111789703369, + "learning_rate": 9.978702701837985e-06, + "loss": 1.2745, + "step": 1538 + }, + { + "epoch": 0.38986700443318556, + "grad_norm": 4.206931114196777, + "learning_rate": 9.978625382696773e-06, + "loss": 1.3056, + "step": 1539 + }, + { + "epoch": 0.3901203293223559, + "grad_norm": 4.062953948974609, + "learning_rate": 9.978547923758098e-06, + "loss": 1.2184, + "step": 1540 + }, + { + "epoch": 0.39037365421152626, + "grad_norm": 4.187219142913818, + "learning_rate": 9.978470325024127e-06, + "loss": 1.3464, + "step": 1541 + }, + { + "epoch": 0.39062697910069666, + "grad_norm": 4.182732105255127, + "learning_rate": 9.978392586497043e-06, + "loss": 1.2979, + "step": 1542 + }, + { + "epoch": 0.390880303989867, + "grad_norm": 4.154636383056641, + "learning_rate": 9.97831470817903e-06, + "loss": 1.2577, + "step": 1543 + }, + { + "epoch": 0.39113362887903735, + "grad_norm": 3.8434035778045654, + "learning_rate": 9.978236690072271e-06, + "loss": 1.2204, + "step": 1544 + }, + { + "epoch": 0.3913869537682077, + "grad_norm": 4.057715892791748, + "learning_rate": 9.978158532178961e-06, + "loss": 1.2932, + "step": 1545 + }, + { + "epoch": 0.3916402786573781, + "grad_norm": 4.071051120758057, + "learning_rate": 9.978080234501292e-06, + "loss": 1.2954, + "step": 1546 + }, + { + "epoch": 0.39189360354654845, + "grad_norm": 3.698793649673462, + "learning_rate": 9.978001797041464e-06, + "loss": 1.1183, + "step": 1547 + }, + { + "epoch": 0.3921469284357188, + "grad_norm": 3.4650938510894775, + "learning_rate": 9.977923219801678e-06, + "loss": 1.2295, + "step": 1548 + }, + { + "epoch": 0.39240025332488915, + "grad_norm": 4.090244770050049, + "learning_rate": 9.97784450278414e-06, + "loss": 1.4205, + "step": 1549 + }, + { + "epoch": 0.39265357821405955, + "grad_norm": 4.126779079437256, + "learning_rate": 9.977765645991062e-06, + "loss": 1.3266, + "step": 1550 + }, + { + "epoch": 0.3929069031032299, + "grad_norm": 3.527606964111328, + "learning_rate": 9.977686649424658e-06, + "loss": 1.0561, + "step": 1551 + }, + { + "epoch": 0.39316022799240025, + "grad_norm": 3.956460475921631, + "learning_rate": 9.977607513087145e-06, + "loss": 1.1791, + "step": 1552 + }, + { + "epoch": 0.3934135528815706, + "grad_norm": 3.9396400451660156, + "learning_rate": 9.977528236980746e-06, + "loss": 1.2844, + "step": 1553 + }, + { + "epoch": 0.393666877770741, + "grad_norm": 4.004495620727539, + "learning_rate": 9.977448821107686e-06, + "loss": 1.3033, + "step": 1554 + }, + { + "epoch": 0.39392020265991134, + "grad_norm": 3.8449161052703857, + "learning_rate": 9.977369265470197e-06, + "loss": 1.117, + "step": 1555 + }, + { + "epoch": 0.3941735275490817, + "grad_norm": 4.153273105621338, + "learning_rate": 9.977289570070514e-06, + "loss": 1.199, + "step": 1556 + }, + { + "epoch": 0.39442685243825204, + "grad_norm": 3.6627635955810547, + "learning_rate": 9.97720973491087e-06, + "loss": 1.2055, + "step": 1557 + }, + { + "epoch": 0.39468017732742244, + "grad_norm": 4.08154821395874, + "learning_rate": 9.977129759993511e-06, + "loss": 1.2293, + "step": 1558 + }, + { + "epoch": 0.3949335022165928, + "grad_norm": 3.6264965534210205, + "learning_rate": 9.97704964532068e-06, + "loss": 1.1469, + "step": 1559 + }, + { + "epoch": 0.39518682710576314, + "grad_norm": 4.057589054107666, + "learning_rate": 9.976969390894626e-06, + "loss": 1.2882, + "step": 1560 + }, + { + "epoch": 0.3954401519949335, + "grad_norm": 4.077418327331543, + "learning_rate": 9.976888996717606e-06, + "loss": 1.1201, + "step": 1561 + }, + { + "epoch": 0.3956934768841039, + "grad_norm": 4.358857154846191, + "learning_rate": 9.976808462791876e-06, + "loss": 1.3677, + "step": 1562 + }, + { + "epoch": 0.39594680177327424, + "grad_norm": 3.968888759613037, + "learning_rate": 9.976727789119696e-06, + "loss": 1.2792, + "step": 1563 + }, + { + "epoch": 0.3962001266624446, + "grad_norm": 4.224818706512451, + "learning_rate": 9.976646975703333e-06, + "loss": 1.2444, + "step": 1564 + }, + { + "epoch": 0.39645345155161493, + "grad_norm": 4.409234046936035, + "learning_rate": 9.976566022545053e-06, + "loss": 1.3107, + "step": 1565 + }, + { + "epoch": 0.3967067764407853, + "grad_norm": 3.9601402282714844, + "learning_rate": 9.976484929647135e-06, + "loss": 1.4012, + "step": 1566 + }, + { + "epoch": 0.3969601013299557, + "grad_norm": 3.763319253921509, + "learning_rate": 9.97640369701185e-06, + "loss": 1.156, + "step": 1567 + }, + { + "epoch": 0.39721342621912603, + "grad_norm": 3.6474928855895996, + "learning_rate": 9.976322324641482e-06, + "loss": 1.1047, + "step": 1568 + }, + { + "epoch": 0.3974667511082964, + "grad_norm": 3.506779432296753, + "learning_rate": 9.976240812538315e-06, + "loss": 1.2464, + "step": 1569 + }, + { + "epoch": 0.3977200759974667, + "grad_norm": 3.992598533630371, + "learning_rate": 9.97615916070464e-06, + "loss": 1.2044, + "step": 1570 + }, + { + "epoch": 0.39797340088663713, + "grad_norm": 4.019123077392578, + "learning_rate": 9.976077369142747e-06, + "loss": 1.3095, + "step": 1571 + }, + { + "epoch": 0.3982267257758075, + "grad_norm": 4.286922931671143, + "learning_rate": 9.975995437854933e-06, + "loss": 1.2642, + "step": 1572 + }, + { + "epoch": 0.3984800506649778, + "grad_norm": 4.605002403259277, + "learning_rate": 9.9759133668435e-06, + "loss": 1.3369, + "step": 1573 + }, + { + "epoch": 0.3987333755541482, + "grad_norm": 3.912172555923462, + "learning_rate": 9.97583115611075e-06, + "loss": 1.147, + "step": 1574 + }, + { + "epoch": 0.3989867004433186, + "grad_norm": 4.260953903198242, + "learning_rate": 9.975748805658996e-06, + "loss": 1.1082, + "step": 1575 + }, + { + "epoch": 0.3992400253324889, + "grad_norm": 3.7952542304992676, + "learning_rate": 9.975666315490547e-06, + "loss": 1.084, + "step": 1576 + }, + { + "epoch": 0.39949335022165927, + "grad_norm": 3.7086267471313477, + "learning_rate": 9.975583685607717e-06, + "loss": 1.1192, + "step": 1577 + }, + { + "epoch": 0.3997466751108296, + "grad_norm": 3.9403204917907715, + "learning_rate": 9.975500916012832e-06, + "loss": 1.1835, + "step": 1578 + }, + { + "epoch": 0.4, + "grad_norm": 4.280765533447266, + "learning_rate": 9.975418006708213e-06, + "loss": 1.3106, + "step": 1579 + }, + { + "epoch": 0.40025332488917037, + "grad_norm": 3.7051100730895996, + "learning_rate": 9.975334957696186e-06, + "loss": 1.1271, + "step": 1580 + }, + { + "epoch": 0.4005066497783407, + "grad_norm": 4.107967376708984, + "learning_rate": 9.975251768979088e-06, + "loss": 1.2167, + "step": 1581 + }, + { + "epoch": 0.40075997466751107, + "grad_norm": 3.9122486114501953, + "learning_rate": 9.97516844055925e-06, + "loss": 1.2156, + "step": 1582 + }, + { + "epoch": 0.40101329955668147, + "grad_norm": 4.168511867523193, + "learning_rate": 9.975084972439016e-06, + "loss": 1.1672, + "step": 1583 + }, + { + "epoch": 0.4012666244458518, + "grad_norm": 4.123467922210693, + "learning_rate": 9.975001364620727e-06, + "loss": 1.3482, + "step": 1584 + }, + { + "epoch": 0.40151994933502216, + "grad_norm": 3.775412082672119, + "learning_rate": 9.97491761710673e-06, + "loss": 1.1986, + "step": 1585 + }, + { + "epoch": 0.4017732742241925, + "grad_norm": 3.749708414077759, + "learning_rate": 9.974833729899378e-06, + "loss": 1.1553, + "step": 1586 + }, + { + "epoch": 0.4020265991133629, + "grad_norm": 4.15584659576416, + "learning_rate": 9.974749703001027e-06, + "loss": 1.268, + "step": 1587 + }, + { + "epoch": 0.40227992400253326, + "grad_norm": 3.7087783813476562, + "learning_rate": 9.974665536414036e-06, + "loss": 1.1035, + "step": 1588 + }, + { + "epoch": 0.4025332488917036, + "grad_norm": 4.503772258758545, + "learning_rate": 9.97458123014077e-06, + "loss": 1.4037, + "step": 1589 + }, + { + "epoch": 0.40278657378087396, + "grad_norm": 3.9420435428619385, + "learning_rate": 9.974496784183592e-06, + "loss": 1.2059, + "step": 1590 + }, + { + "epoch": 0.4030398986700443, + "grad_norm": 4.350255966186523, + "learning_rate": 9.974412198544877e-06, + "loss": 1.329, + "step": 1591 + }, + { + "epoch": 0.4032932235592147, + "grad_norm": 4.118219375610352, + "learning_rate": 9.974327473226998e-06, + "loss": 1.1194, + "step": 1592 + }, + { + "epoch": 0.40354654844838506, + "grad_norm": 3.929291248321533, + "learning_rate": 9.974242608232337e-06, + "loss": 1.2417, + "step": 1593 + }, + { + "epoch": 0.4037998733375554, + "grad_norm": 3.713636875152588, + "learning_rate": 9.974157603563273e-06, + "loss": 1.2178, + "step": 1594 + }, + { + "epoch": 0.40405319822672575, + "grad_norm": 4.204817295074463, + "learning_rate": 9.974072459222195e-06, + "loss": 1.145, + "step": 1595 + }, + { + "epoch": 0.40430652311589615, + "grad_norm": 4.219133377075195, + "learning_rate": 9.973987175211492e-06, + "loss": 1.2754, + "step": 1596 + }, + { + "epoch": 0.4045598480050665, + "grad_norm": 3.8872694969177246, + "learning_rate": 9.973901751533563e-06, + "loss": 1.1998, + "step": 1597 + }, + { + "epoch": 0.40481317289423685, + "grad_norm": 4.318929672241211, + "learning_rate": 9.973816188190803e-06, + "loss": 1.3327, + "step": 1598 + }, + { + "epoch": 0.4050664977834072, + "grad_norm": 3.7346320152282715, + "learning_rate": 9.973730485185615e-06, + "loss": 1.2627, + "step": 1599 + }, + { + "epoch": 0.4053198226725776, + "grad_norm": 4.234984874725342, + "learning_rate": 9.973644642520407e-06, + "loss": 1.385, + "step": 1600 + }, + { + "epoch": 0.40557314756174795, + "grad_norm": 3.7861108779907227, + "learning_rate": 9.973558660197588e-06, + "loss": 1.226, + "step": 1601 + }, + { + "epoch": 0.4058264724509183, + "grad_norm": 4.297764778137207, + "learning_rate": 9.973472538219573e-06, + "loss": 1.3382, + "step": 1602 + }, + { + "epoch": 0.40607979734008864, + "grad_norm": 3.9423766136169434, + "learning_rate": 9.97338627658878e-06, + "loss": 1.2161, + "step": 1603 + }, + { + "epoch": 0.40633312222925905, + "grad_norm": 3.855275869369507, + "learning_rate": 9.973299875307631e-06, + "loss": 1.3608, + "step": 1604 + }, + { + "epoch": 0.4065864471184294, + "grad_norm": 3.6971325874328613, + "learning_rate": 9.973213334378553e-06, + "loss": 1.1117, + "step": 1605 + }, + { + "epoch": 0.40683977200759974, + "grad_norm": 4.086726188659668, + "learning_rate": 9.973126653803975e-06, + "loss": 1.3681, + "step": 1606 + }, + { + "epoch": 0.4070930968967701, + "grad_norm": 3.759913682937622, + "learning_rate": 9.97303983358633e-06, + "loss": 1.1092, + "step": 1607 + }, + { + "epoch": 0.4073464217859405, + "grad_norm": 3.9194438457489014, + "learning_rate": 9.972952873728061e-06, + "loss": 1.2265, + "step": 1608 + }, + { + "epoch": 0.40759974667511084, + "grad_norm": 4.006295204162598, + "learning_rate": 9.972865774231602e-06, + "loss": 1.2321, + "step": 1609 + }, + { + "epoch": 0.4078530715642812, + "grad_norm": 3.828908920288086, + "learning_rate": 9.972778535099405e-06, + "loss": 1.1447, + "step": 1610 + }, + { + "epoch": 0.40810639645345154, + "grad_norm": 4.304439544677734, + "learning_rate": 9.972691156333917e-06, + "loss": 1.2447, + "step": 1611 + }, + { + "epoch": 0.40835972134262194, + "grad_norm": 4.169918537139893, + "learning_rate": 9.97260363793759e-06, + "loss": 1.2263, + "step": 1612 + }, + { + "epoch": 0.4086130462317923, + "grad_norm": 3.9893386363983154, + "learning_rate": 9.972515979912887e-06, + "loss": 1.2821, + "step": 1613 + }, + { + "epoch": 0.40886637112096264, + "grad_norm": 3.9322779178619385, + "learning_rate": 9.972428182262264e-06, + "loss": 1.1436, + "step": 1614 + }, + { + "epoch": 0.409119696010133, + "grad_norm": 3.978416919708252, + "learning_rate": 9.972340244988187e-06, + "loss": 1.1924, + "step": 1615 + }, + { + "epoch": 0.40937302089930333, + "grad_norm": 4.024904727935791, + "learning_rate": 9.972252168093127e-06, + "loss": 1.3191, + "step": 1616 + }, + { + "epoch": 0.40962634578847373, + "grad_norm": 3.776224374771118, + "learning_rate": 9.972163951579557e-06, + "loss": 1.2135, + "step": 1617 + }, + { + "epoch": 0.4098796706776441, + "grad_norm": 3.8125698566436768, + "learning_rate": 9.972075595449953e-06, + "loss": 1.1311, + "step": 1618 + }, + { + "epoch": 0.41013299556681443, + "grad_norm": 3.916381597518921, + "learning_rate": 9.971987099706798e-06, + "loss": 1.1718, + "step": 1619 + }, + { + "epoch": 0.4103863204559848, + "grad_norm": 3.6483993530273438, + "learning_rate": 9.971898464352574e-06, + "loss": 1.2224, + "step": 1620 + }, + { + "epoch": 0.4106396453451552, + "grad_norm": 4.058041095733643, + "learning_rate": 9.971809689389771e-06, + "loss": 1.3286, + "step": 1621 + }, + { + "epoch": 0.4108929702343255, + "grad_norm": 4.105546474456787, + "learning_rate": 9.971720774820884e-06, + "loss": 1.2571, + "step": 1622 + }, + { + "epoch": 0.4111462951234959, + "grad_norm": 3.9141669273376465, + "learning_rate": 9.971631720648406e-06, + "loss": 1.3122, + "step": 1623 + }, + { + "epoch": 0.4113996200126662, + "grad_norm": 4.1042938232421875, + "learning_rate": 9.97154252687484e-06, + "loss": 1.2634, + "step": 1624 + }, + { + "epoch": 0.4116529449018366, + "grad_norm": 3.5131354331970215, + "learning_rate": 9.971453193502689e-06, + "loss": 1.0811, + "step": 1625 + }, + { + "epoch": 0.411906269791007, + "grad_norm": 4.2376627922058105, + "learning_rate": 9.971363720534463e-06, + "loss": 1.2599, + "step": 1626 + }, + { + "epoch": 0.4121595946801773, + "grad_norm": 4.638526916503906, + "learning_rate": 9.971274107972675e-06, + "loss": 1.3102, + "step": 1627 + }, + { + "epoch": 0.41241291956934767, + "grad_norm": 3.7138400077819824, + "learning_rate": 9.971184355819839e-06, + "loss": 1.2356, + "step": 1628 + }, + { + "epoch": 0.41266624445851807, + "grad_norm": 4.112374305725098, + "learning_rate": 9.971094464078476e-06, + "loss": 1.2447, + "step": 1629 + }, + { + "epoch": 0.4129195693476884, + "grad_norm": 3.9615652561187744, + "learning_rate": 9.97100443275111e-06, + "loss": 1.1926, + "step": 1630 + }, + { + "epoch": 0.41317289423685877, + "grad_norm": 4.023500442504883, + "learning_rate": 9.97091426184027e-06, + "loss": 1.3549, + "step": 1631 + }, + { + "epoch": 0.4134262191260291, + "grad_norm": 4.214376926422119, + "learning_rate": 9.970823951348488e-06, + "loss": 1.311, + "step": 1632 + }, + { + "epoch": 0.4136795440151995, + "grad_norm": 3.8443446159362793, + "learning_rate": 9.970733501278297e-06, + "loss": 1.3935, + "step": 1633 + }, + { + "epoch": 0.41393286890436987, + "grad_norm": 3.7742702960968018, + "learning_rate": 9.970642911632241e-06, + "loss": 1.1621, + "step": 1634 + }, + { + "epoch": 0.4141861937935402, + "grad_norm": 4.038102149963379, + "learning_rate": 9.970552182412861e-06, + "loss": 1.3304, + "step": 1635 + }, + { + "epoch": 0.41443951868271056, + "grad_norm": 3.636784791946411, + "learning_rate": 9.970461313622704e-06, + "loss": 1.1665, + "step": 1636 + }, + { + "epoch": 0.41469284357188096, + "grad_norm": 3.843376398086548, + "learning_rate": 9.970370305264325e-06, + "loss": 1.2383, + "step": 1637 + }, + { + "epoch": 0.4149461684610513, + "grad_norm": 37.40401077270508, + "learning_rate": 9.970279157340274e-06, + "loss": 1.4453, + "step": 1638 + }, + { + "epoch": 0.41519949335022166, + "grad_norm": 3.8575456142425537, + "learning_rate": 9.970187869853117e-06, + "loss": 1.2007, + "step": 1639 + }, + { + "epoch": 0.415452818239392, + "grad_norm": 3.9534714221954346, + "learning_rate": 9.970096442805413e-06, + "loss": 1.3262, + "step": 1640 + }, + { + "epoch": 0.41570614312856236, + "grad_norm": 3.9382474422454834, + "learning_rate": 9.970004876199731e-06, + "loss": 1.2947, + "step": 1641 + }, + { + "epoch": 0.41595946801773276, + "grad_norm": 4.135881423950195, + "learning_rate": 9.96991317003864e-06, + "loss": 1.2837, + "step": 1642 + }, + { + "epoch": 0.4162127929069031, + "grad_norm": 3.807326078414917, + "learning_rate": 9.969821324324717e-06, + "loss": 1.1733, + "step": 1643 + }, + { + "epoch": 0.41646611779607345, + "grad_norm": 4.549855709075928, + "learning_rate": 9.969729339060541e-06, + "loss": 1.4417, + "step": 1644 + }, + { + "epoch": 0.4167194426852438, + "grad_norm": 4.065769672393799, + "learning_rate": 9.969637214248693e-06, + "loss": 1.262, + "step": 1645 + }, + { + "epoch": 0.4169727675744142, + "grad_norm": 3.8566086292266846, + "learning_rate": 9.969544949891763e-06, + "loss": 1.2035, + "step": 1646 + }, + { + "epoch": 0.41722609246358455, + "grad_norm": 4.1258697509765625, + "learning_rate": 9.969452545992339e-06, + "loss": 1.2796, + "step": 1647 + }, + { + "epoch": 0.4174794173527549, + "grad_norm": 3.7903356552124023, + "learning_rate": 9.969360002553017e-06, + "loss": 1.1092, + "step": 1648 + }, + { + "epoch": 0.41773274224192525, + "grad_norm": 3.8634912967681885, + "learning_rate": 9.969267319576394e-06, + "loss": 1.3094, + "step": 1649 + }, + { + "epoch": 0.41798606713109565, + "grad_norm": 3.896378755569458, + "learning_rate": 9.969174497065074e-06, + "loss": 1.2376, + "step": 1650 + }, + { + "epoch": 0.418239392020266, + "grad_norm": 3.868745803833008, + "learning_rate": 9.969081535021665e-06, + "loss": 1.2363, + "step": 1651 + }, + { + "epoch": 0.41849271690943635, + "grad_norm": 3.708183526992798, + "learning_rate": 9.968988433448775e-06, + "loss": 1.2199, + "step": 1652 + }, + { + "epoch": 0.4187460417986067, + "grad_norm": 4.169568061828613, + "learning_rate": 9.968895192349016e-06, + "loss": 1.263, + "step": 1653 + }, + { + "epoch": 0.4189993666877771, + "grad_norm": 3.771214008331299, + "learning_rate": 9.96880181172501e-06, + "loss": 1.2186, + "step": 1654 + }, + { + "epoch": 0.41925269157694744, + "grad_norm": 3.9816854000091553, + "learning_rate": 9.96870829157938e-06, + "loss": 1.2561, + "step": 1655 + }, + { + "epoch": 0.4195060164661178, + "grad_norm": 4.4165778160095215, + "learning_rate": 9.968614631914746e-06, + "loss": 1.2839, + "step": 1656 + }, + { + "epoch": 0.41975934135528814, + "grad_norm": 3.9381518363952637, + "learning_rate": 9.968520832733745e-06, + "loss": 1.2773, + "step": 1657 + }, + { + "epoch": 0.42001266624445854, + "grad_norm": 4.232367038726807, + "learning_rate": 9.968426894039006e-06, + "loss": 1.3495, + "step": 1658 + }, + { + "epoch": 0.4202659911336289, + "grad_norm": 3.5772526264190674, + "learning_rate": 9.96833281583317e-06, + "loss": 1.0775, + "step": 1659 + }, + { + "epoch": 0.42051931602279924, + "grad_norm": 3.7976326942443848, + "learning_rate": 9.968238598118876e-06, + "loss": 1.2518, + "step": 1660 + }, + { + "epoch": 0.4207726409119696, + "grad_norm": 4.092529773712158, + "learning_rate": 9.96814424089877e-06, + "loss": 1.2172, + "step": 1661 + }, + { + "epoch": 0.42102596580114, + "grad_norm": 4.227485656738281, + "learning_rate": 9.968049744175503e-06, + "loss": 1.2755, + "step": 1662 + }, + { + "epoch": 0.42127929069031034, + "grad_norm": 3.9120378494262695, + "learning_rate": 9.967955107951727e-06, + "loss": 1.2017, + "step": 1663 + }, + { + "epoch": 0.4215326155794807, + "grad_norm": 4.034577369689941, + "learning_rate": 9.967860332230102e-06, + "loss": 1.2581, + "step": 1664 + }, + { + "epoch": 0.42178594046865103, + "grad_norm": 4.344268321990967, + "learning_rate": 9.967765417013284e-06, + "loss": 1.1905, + "step": 1665 + }, + { + "epoch": 0.4220392653578214, + "grad_norm": 4.0689544677734375, + "learning_rate": 9.967670362303944e-06, + "loss": 1.1421, + "step": 1666 + }, + { + "epoch": 0.4222925902469918, + "grad_norm": 4.221645355224609, + "learning_rate": 9.967575168104746e-06, + "loss": 1.1829, + "step": 1667 + }, + { + "epoch": 0.42254591513616213, + "grad_norm": 3.80424165725708, + "learning_rate": 9.967479834418368e-06, + "loss": 1.3086, + "step": 1668 + }, + { + "epoch": 0.4227992400253325, + "grad_norm": 4.1412200927734375, + "learning_rate": 9.967384361247484e-06, + "loss": 1.1899, + "step": 1669 + }, + { + "epoch": 0.4230525649145028, + "grad_norm": 4.207159996032715, + "learning_rate": 9.967288748594775e-06, + "loss": 1.219, + "step": 1670 + }, + { + "epoch": 0.42330588980367323, + "grad_norm": 4.115466594696045, + "learning_rate": 9.967192996462925e-06, + "loss": 1.1965, + "step": 1671 + }, + { + "epoch": 0.4235592146928436, + "grad_norm": 3.930893659591675, + "learning_rate": 9.967097104854624e-06, + "loss": 1.2194, + "step": 1672 + }, + { + "epoch": 0.4238125395820139, + "grad_norm": 3.5521059036254883, + "learning_rate": 9.967001073772564e-06, + "loss": 1.0621, + "step": 1673 + }, + { + "epoch": 0.4240658644711843, + "grad_norm": 3.9072189331054688, + "learning_rate": 9.966904903219443e-06, + "loss": 1.1969, + "step": 1674 + }, + { + "epoch": 0.4243191893603547, + "grad_norm": 3.701289176940918, + "learning_rate": 9.966808593197959e-06, + "loss": 1.1165, + "step": 1675 + }, + { + "epoch": 0.424572514249525, + "grad_norm": 4.0461320877075195, + "learning_rate": 9.966712143710819e-06, + "loss": 1.3238, + "step": 1676 + }, + { + "epoch": 0.42482583913869537, + "grad_norm": 4.037704944610596, + "learning_rate": 9.966615554760729e-06, + "loss": 1.3101, + "step": 1677 + }, + { + "epoch": 0.4250791640278657, + "grad_norm": 4.149792671203613, + "learning_rate": 9.966518826350401e-06, + "loss": 1.2117, + "step": 1678 + }, + { + "epoch": 0.4253324889170361, + "grad_norm": 3.9528732299804688, + "learning_rate": 9.966421958482553e-06, + "loss": 1.2514, + "step": 1679 + }, + { + "epoch": 0.42558581380620647, + "grad_norm": 3.8577218055725098, + "learning_rate": 9.966324951159904e-06, + "loss": 1.1941, + "step": 1680 + }, + { + "epoch": 0.4258391386953768, + "grad_norm": 3.78157114982605, + "learning_rate": 9.966227804385177e-06, + "loss": 1.1766, + "step": 1681 + }, + { + "epoch": 0.42609246358454717, + "grad_norm": 3.9237875938415527, + "learning_rate": 9.966130518161102e-06, + "loss": 1.2366, + "step": 1682 + }, + { + "epoch": 0.42634578847371757, + "grad_norm": 3.930771589279175, + "learning_rate": 9.96603309249041e-06, + "loss": 1.2634, + "step": 1683 + }, + { + "epoch": 0.4265991133628879, + "grad_norm": 4.1034135818481445, + "learning_rate": 9.965935527375835e-06, + "loss": 1.3415, + "step": 1684 + }, + { + "epoch": 0.42685243825205826, + "grad_norm": 4.213851451873779, + "learning_rate": 9.965837822820117e-06, + "loss": 1.3057, + "step": 1685 + }, + { + "epoch": 0.4271057631412286, + "grad_norm": 4.129579544067383, + "learning_rate": 9.965739978826004e-06, + "loss": 1.1247, + "step": 1686 + }, + { + "epoch": 0.42735908803039896, + "grad_norm": 4.075704574584961, + "learning_rate": 9.965641995396235e-06, + "loss": 1.1876, + "step": 1687 + }, + { + "epoch": 0.42761241291956936, + "grad_norm": 4.015869617462158, + "learning_rate": 9.96554387253357e-06, + "loss": 1.2813, + "step": 1688 + }, + { + "epoch": 0.4278657378087397, + "grad_norm": 3.5135021209716797, + "learning_rate": 9.965445610240758e-06, + "loss": 1.165, + "step": 1689 + }, + { + "epoch": 0.42811906269791006, + "grad_norm": 3.59696102142334, + "learning_rate": 9.965347208520561e-06, + "loss": 1.2176, + "step": 1690 + }, + { + "epoch": 0.4283723875870804, + "grad_norm": 3.8628885746002197, + "learning_rate": 9.965248667375742e-06, + "loss": 1.1508, + "step": 1691 + }, + { + "epoch": 0.4286257124762508, + "grad_norm": 3.7970235347747803, + "learning_rate": 9.965149986809067e-06, + "loss": 1.2897, + "step": 1692 + }, + { + "epoch": 0.42887903736542116, + "grad_norm": 4.112331867218018, + "learning_rate": 9.965051166823308e-06, + "loss": 1.4396, + "step": 1693 + }, + { + "epoch": 0.4291323622545915, + "grad_norm": 3.932244062423706, + "learning_rate": 9.964952207421239e-06, + "loss": 1.2638, + "step": 1694 + }, + { + "epoch": 0.42938568714376185, + "grad_norm": 3.9631271362304688, + "learning_rate": 9.96485310860564e-06, + "loss": 1.209, + "step": 1695 + }, + { + "epoch": 0.42963901203293225, + "grad_norm": 4.168961048126221, + "learning_rate": 9.96475387037929e-06, + "loss": 1.3285, + "step": 1696 + }, + { + "epoch": 0.4298923369221026, + "grad_norm": 3.889246940612793, + "learning_rate": 9.96465449274498e-06, + "loss": 1.2151, + "step": 1697 + }, + { + "epoch": 0.43014566181127295, + "grad_norm": 4.106306552886963, + "learning_rate": 9.964554975705499e-06, + "loss": 1.3925, + "step": 1698 + }, + { + "epoch": 0.4303989867004433, + "grad_norm": 3.897536516189575, + "learning_rate": 9.96445531926364e-06, + "loss": 1.3023, + "step": 1699 + }, + { + "epoch": 0.4306523115896137, + "grad_norm": 3.791674852371216, + "learning_rate": 9.964355523422201e-06, + "loss": 1.1373, + "step": 1700 + }, + { + "epoch": 0.43090563647878405, + "grad_norm": 3.9845049381256104, + "learning_rate": 9.964255588183989e-06, + "loss": 1.2183, + "step": 1701 + }, + { + "epoch": 0.4311589613679544, + "grad_norm": 4.000594615936279, + "learning_rate": 9.964155513551806e-06, + "loss": 1.2421, + "step": 1702 + }, + { + "epoch": 0.43141228625712474, + "grad_norm": 4.074901580810547, + "learning_rate": 9.964055299528462e-06, + "loss": 1.2713, + "step": 1703 + }, + { + "epoch": 0.43166561114629515, + "grad_norm": 3.7564749717712402, + "learning_rate": 9.96395494611677e-06, + "loss": 1.0999, + "step": 1704 + }, + { + "epoch": 0.4319189360354655, + "grad_norm": 3.847492218017578, + "learning_rate": 9.963854453319552e-06, + "loss": 1.1781, + "step": 1705 + }, + { + "epoch": 0.43217226092463584, + "grad_norm": 4.278839111328125, + "learning_rate": 9.963753821139625e-06, + "loss": 1.2701, + "step": 1706 + }, + { + "epoch": 0.4324255858138062, + "grad_norm": 3.66853666305542, + "learning_rate": 9.96365304957982e-06, + "loss": 1.0739, + "step": 1707 + }, + { + "epoch": 0.4326789107029766, + "grad_norm": 3.588291883468628, + "learning_rate": 9.963552138642962e-06, + "loss": 1.1273, + "step": 1708 + }, + { + "epoch": 0.43293223559214694, + "grad_norm": 3.9035909175872803, + "learning_rate": 9.963451088331885e-06, + "loss": 1.1392, + "step": 1709 + }, + { + "epoch": 0.4331855604813173, + "grad_norm": 4.046663761138916, + "learning_rate": 9.96334989864943e-06, + "loss": 1.242, + "step": 1710 + }, + { + "epoch": 0.43343888537048764, + "grad_norm": 3.7607357501983643, + "learning_rate": 9.963248569598436e-06, + "loss": 1.1249, + "step": 1711 + }, + { + "epoch": 0.433692210259658, + "grad_norm": 3.775604724884033, + "learning_rate": 9.963147101181748e-06, + "loss": 1.131, + "step": 1712 + }, + { + "epoch": 0.4339455351488284, + "grad_norm": 3.9623661041259766, + "learning_rate": 9.963045493402215e-06, + "loss": 1.1045, + "step": 1713 + }, + { + "epoch": 0.43419886003799874, + "grad_norm": 4.357337951660156, + "learning_rate": 9.962943746262691e-06, + "loss": 1.3916, + "step": 1714 + }, + { + "epoch": 0.4344521849271691, + "grad_norm": 3.967344045639038, + "learning_rate": 9.962841859766032e-06, + "loss": 1.1091, + "step": 1715 + }, + { + "epoch": 0.43470550981633943, + "grad_norm": 3.9389865398406982, + "learning_rate": 9.9627398339151e-06, + "loss": 1.0805, + "step": 1716 + }, + { + "epoch": 0.43495883470550983, + "grad_norm": 4.415093421936035, + "learning_rate": 9.96263766871276e-06, + "loss": 1.3108, + "step": 1717 + }, + { + "epoch": 0.4352121595946802, + "grad_norm": 3.8508241176605225, + "learning_rate": 9.962535364161879e-06, + "loss": 1.1796, + "step": 1718 + }, + { + "epoch": 0.43546548448385053, + "grad_norm": 3.9241507053375244, + "learning_rate": 9.962432920265333e-06, + "loss": 1.2547, + "step": 1719 + }, + { + "epoch": 0.4357188093730209, + "grad_norm": 3.8462283611297607, + "learning_rate": 9.962330337025993e-06, + "loss": 1.1839, + "step": 1720 + }, + { + "epoch": 0.4359721342621913, + "grad_norm": 3.643967628479004, + "learning_rate": 9.962227614446744e-06, + "loss": 1.217, + "step": 1721 + }, + { + "epoch": 0.4362254591513616, + "grad_norm": 3.898104190826416, + "learning_rate": 9.96212475253047e-06, + "loss": 1.1689, + "step": 1722 + }, + { + "epoch": 0.436478784040532, + "grad_norm": 3.9070205688476562, + "learning_rate": 9.96202175128006e-06, + "loss": 1.2024, + "step": 1723 + }, + { + "epoch": 0.4367321089297023, + "grad_norm": 4.046167373657227, + "learning_rate": 9.961918610698403e-06, + "loss": 1.2221, + "step": 1724 + }, + { + "epoch": 0.4369854338188727, + "grad_norm": 3.691685676574707, + "learning_rate": 9.961815330788397e-06, + "loss": 1.2499, + "step": 1725 + }, + { + "epoch": 0.4372387587080431, + "grad_norm": 3.5208113193511963, + "learning_rate": 9.961711911552943e-06, + "loss": 1.1007, + "step": 1726 + }, + { + "epoch": 0.4374920835972134, + "grad_norm": 3.906390905380249, + "learning_rate": 9.961608352994943e-06, + "loss": 1.2704, + "step": 1727 + }, + { + "epoch": 0.43774540848638377, + "grad_norm": 3.5950465202331543, + "learning_rate": 9.961504655117306e-06, + "loss": 1.1376, + "step": 1728 + }, + { + "epoch": 0.4379987333755542, + "grad_norm": 3.922039031982422, + "learning_rate": 9.961400817922943e-06, + "loss": 1.2138, + "step": 1729 + }, + { + "epoch": 0.4382520582647245, + "grad_norm": 3.3800699710845947, + "learning_rate": 9.961296841414772e-06, + "loss": 1.1219, + "step": 1730 + }, + { + "epoch": 0.43850538315389487, + "grad_norm": 4.212485313415527, + "learning_rate": 9.96119272559571e-06, + "loss": 1.1339, + "step": 1731 + }, + { + "epoch": 0.4387587080430652, + "grad_norm": 3.8920695781707764, + "learning_rate": 9.961088470468681e-06, + "loss": 1.1936, + "step": 1732 + }, + { + "epoch": 0.4390120329322356, + "grad_norm": 4.168819904327393, + "learning_rate": 9.960984076036612e-06, + "loss": 1.3314, + "step": 1733 + }, + { + "epoch": 0.43926535782140597, + "grad_norm": 4.400965690612793, + "learning_rate": 9.960879542302437e-06, + "loss": 1.3004, + "step": 1734 + }, + { + "epoch": 0.4395186827105763, + "grad_norm": 3.778353452682495, + "learning_rate": 9.96077486926909e-06, + "loss": 1.2599, + "step": 1735 + }, + { + "epoch": 0.43977200759974666, + "grad_norm": 3.7821664810180664, + "learning_rate": 9.960670056939507e-06, + "loss": 1.1449, + "step": 1736 + }, + { + "epoch": 0.440025332488917, + "grad_norm": 3.8294286727905273, + "learning_rate": 9.960565105316636e-06, + "loss": 1.1239, + "step": 1737 + }, + { + "epoch": 0.4402786573780874, + "grad_norm": 3.890516996383667, + "learning_rate": 9.960460014403422e-06, + "loss": 1.2695, + "step": 1738 + }, + { + "epoch": 0.44053198226725776, + "grad_norm": 3.5491371154785156, + "learning_rate": 9.960354784202814e-06, + "loss": 1.2282, + "step": 1739 + }, + { + "epoch": 0.4407853071564281, + "grad_norm": 4.057201385498047, + "learning_rate": 9.96024941471777e-06, + "loss": 1.2313, + "step": 1740 + }, + { + "epoch": 0.44103863204559846, + "grad_norm": 3.9677681922912598, + "learning_rate": 9.960143905951247e-06, + "loss": 1.1955, + "step": 1741 + }, + { + "epoch": 0.44129195693476886, + "grad_norm": 4.036614418029785, + "learning_rate": 9.960038257906206e-06, + "loss": 1.3526, + "step": 1742 + }, + { + "epoch": 0.4415452818239392, + "grad_norm": 4.330037593841553, + "learning_rate": 9.959932470585619e-06, + "loss": 1.2349, + "step": 1743 + }, + { + "epoch": 0.44179860671310955, + "grad_norm": 3.7230772972106934, + "learning_rate": 9.959826543992448e-06, + "loss": 1.219, + "step": 1744 + }, + { + "epoch": 0.4420519316022799, + "grad_norm": 3.892216682434082, + "learning_rate": 9.959720478129677e-06, + "loss": 1.2287, + "step": 1745 + }, + { + "epoch": 0.4423052564914503, + "grad_norm": 3.636098861694336, + "learning_rate": 9.959614273000276e-06, + "loss": 1.2059, + "step": 1746 + }, + { + "epoch": 0.44255858138062065, + "grad_norm": 3.4097251892089844, + "learning_rate": 9.959507928607232e-06, + "loss": 1.2674, + "step": 1747 + }, + { + "epoch": 0.442811906269791, + "grad_norm": 3.809654951095581, + "learning_rate": 9.95940144495353e-06, + "loss": 1.2083, + "step": 1748 + }, + { + "epoch": 0.44306523115896135, + "grad_norm": 3.955592393875122, + "learning_rate": 9.95929482204216e-06, + "loss": 1.2913, + "step": 1749 + }, + { + "epoch": 0.44331855604813175, + "grad_norm": 3.4874720573425293, + "learning_rate": 9.959188059876115e-06, + "loss": 1.1605, + "step": 1750 + }, + { + "epoch": 0.4435718809373021, + "grad_norm": 3.6861040592193604, + "learning_rate": 9.959081158458393e-06, + "loss": 1.196, + "step": 1751 + }, + { + "epoch": 0.44382520582647245, + "grad_norm": 4.005224704742432, + "learning_rate": 9.958974117791998e-06, + "loss": 1.1422, + "step": 1752 + }, + { + "epoch": 0.4440785307156428, + "grad_norm": 4.103886604309082, + "learning_rate": 9.958866937879932e-06, + "loss": 1.2124, + "step": 1753 + }, + { + "epoch": 0.4443318556048132, + "grad_norm": 3.5270473957061768, + "learning_rate": 9.958759618725208e-06, + "loss": 1.1824, + "step": 1754 + }, + { + "epoch": 0.44458518049398354, + "grad_norm": 4.049485206604004, + "learning_rate": 9.958652160330837e-06, + "loss": 1.22, + "step": 1755 + }, + { + "epoch": 0.4448385053831539, + "grad_norm": 3.594125986099243, + "learning_rate": 9.958544562699838e-06, + "loss": 1.1801, + "step": 1756 + }, + { + "epoch": 0.44509183027232424, + "grad_norm": 4.060774803161621, + "learning_rate": 9.95843682583523e-06, + "loss": 1.1721, + "step": 1757 + }, + { + "epoch": 0.44534515516149464, + "grad_norm": 4.582459449768066, + "learning_rate": 9.958328949740043e-06, + "loss": 1.2746, + "step": 1758 + }, + { + "epoch": 0.445598480050665, + "grad_norm": 3.7934377193450928, + "learning_rate": 9.958220934417302e-06, + "loss": 1.1259, + "step": 1759 + }, + { + "epoch": 0.44585180493983534, + "grad_norm": 4.88258695602417, + "learning_rate": 9.95811277987004e-06, + "loss": 1.2888, + "step": 1760 + }, + { + "epoch": 0.4461051298290057, + "grad_norm": 4.136811256408691, + "learning_rate": 9.958004486101293e-06, + "loss": 1.3195, + "step": 1761 + }, + { + "epoch": 0.44635845471817603, + "grad_norm": 4.080023765563965, + "learning_rate": 9.957896053114106e-06, + "loss": 1.2883, + "step": 1762 + }, + { + "epoch": 0.44661177960734644, + "grad_norm": 3.545778274536133, + "learning_rate": 9.957787480911522e-06, + "loss": 1.113, + "step": 1763 + }, + { + "epoch": 0.4468651044965168, + "grad_norm": 4.2189178466796875, + "learning_rate": 9.957678769496587e-06, + "loss": 1.286, + "step": 1764 + }, + { + "epoch": 0.44711842938568713, + "grad_norm": 3.997009754180908, + "learning_rate": 9.957569918872359e-06, + "loss": 1.2288, + "step": 1765 + }, + { + "epoch": 0.4473717542748575, + "grad_norm": 4.109933853149414, + "learning_rate": 9.95746092904189e-06, + "loss": 1.4011, + "step": 1766 + }, + { + "epoch": 0.4476250791640279, + "grad_norm": 3.9960086345672607, + "learning_rate": 9.957351800008241e-06, + "loss": 1.319, + "step": 1767 + }, + { + "epoch": 0.44787840405319823, + "grad_norm": 4.276450157165527, + "learning_rate": 9.957242531774476e-06, + "loss": 1.1925, + "step": 1768 + }, + { + "epoch": 0.4481317289423686, + "grad_norm": 4.669538497924805, + "learning_rate": 9.957133124343666e-06, + "loss": 1.1981, + "step": 1769 + }, + { + "epoch": 0.4483850538315389, + "grad_norm": 4.093993663787842, + "learning_rate": 9.957023577718879e-06, + "loss": 1.2145, + "step": 1770 + }, + { + "epoch": 0.44863837872070933, + "grad_norm": 3.737457036972046, + "learning_rate": 9.956913891903195e-06, + "loss": 1.293, + "step": 1771 + }, + { + "epoch": 0.4488917036098797, + "grad_norm": 3.5621869564056396, + "learning_rate": 9.95680406689969e-06, + "loss": 1.0754, + "step": 1772 + }, + { + "epoch": 0.44914502849905, + "grad_norm": 3.8836936950683594, + "learning_rate": 9.956694102711452e-06, + "loss": 1.2296, + "step": 1773 + }, + { + "epoch": 0.4493983533882204, + "grad_norm": 4.1828742027282715, + "learning_rate": 9.956583999341564e-06, + "loss": 1.5065, + "step": 1774 + }, + { + "epoch": 0.4496516782773908, + "grad_norm": 4.305746555328369, + "learning_rate": 9.956473756793123e-06, + "loss": 1.3093, + "step": 1775 + }, + { + "epoch": 0.4499050031665611, + "grad_norm": 3.833472728729248, + "learning_rate": 9.956363375069222e-06, + "loss": 1.2496, + "step": 1776 + }, + { + "epoch": 0.45015832805573147, + "grad_norm": 3.845844268798828, + "learning_rate": 9.95625285417296e-06, + "loss": 1.182, + "step": 1777 + }, + { + "epoch": 0.4504116529449018, + "grad_norm": 3.8236515522003174, + "learning_rate": 9.956142194107438e-06, + "loss": 1.1313, + "step": 1778 + }, + { + "epoch": 0.4506649778340722, + "grad_norm": 3.7965993881225586, + "learning_rate": 9.95603139487577e-06, + "loss": 1.1112, + "step": 1779 + }, + { + "epoch": 0.45091830272324257, + "grad_norm": 4.976373195648193, + "learning_rate": 9.95592045648106e-06, + "loss": 1.1899, + "step": 1780 + }, + { + "epoch": 0.4511716276124129, + "grad_norm": 3.762986183166504, + "learning_rate": 9.955809378926428e-06, + "loss": 1.1653, + "step": 1781 + }, + { + "epoch": 0.45142495250158327, + "grad_norm": 3.95588755607605, + "learning_rate": 9.955698162214992e-06, + "loss": 1.2598, + "step": 1782 + }, + { + "epoch": 0.45167827739075367, + "grad_norm": 3.8318119049072266, + "learning_rate": 9.955586806349874e-06, + "loss": 1.2612, + "step": 1783 + }, + { + "epoch": 0.451931602279924, + "grad_norm": 3.6611649990081787, + "learning_rate": 9.9554753113342e-06, + "loss": 1.2329, + "step": 1784 + }, + { + "epoch": 0.45218492716909436, + "grad_norm": 4.284769058227539, + "learning_rate": 9.955363677171101e-06, + "loss": 1.2697, + "step": 1785 + }, + { + "epoch": 0.4524382520582647, + "grad_norm": 3.549837112426758, + "learning_rate": 9.955251903863713e-06, + "loss": 1.1174, + "step": 1786 + }, + { + "epoch": 0.45269157694743506, + "grad_norm": 4.23306941986084, + "learning_rate": 9.955139991415175e-06, + "loss": 1.2886, + "step": 1787 + }, + { + "epoch": 0.45294490183660546, + "grad_norm": 3.990553617477417, + "learning_rate": 9.955027939828628e-06, + "loss": 1.3615, + "step": 1788 + }, + { + "epoch": 0.4531982267257758, + "grad_norm": 4.17873477935791, + "learning_rate": 9.954915749107219e-06, + "loss": 1.3555, + "step": 1789 + }, + { + "epoch": 0.45345155161494616, + "grad_norm": 3.8110413551330566, + "learning_rate": 9.954803419254097e-06, + "loss": 1.2577, + "step": 1790 + }, + { + "epoch": 0.4537048765041165, + "grad_norm": 3.6629533767700195, + "learning_rate": 9.954690950272419e-06, + "loss": 1.0698, + "step": 1791 + }, + { + "epoch": 0.4539582013932869, + "grad_norm": 3.7844316959381104, + "learning_rate": 9.95457834216534e-06, + "loss": 1.2555, + "step": 1792 + }, + { + "epoch": 0.45421152628245726, + "grad_norm": 3.962120771408081, + "learning_rate": 9.954465594936024e-06, + "loss": 1.2444, + "step": 1793 + }, + { + "epoch": 0.4544648511716276, + "grad_norm": 3.9265973567962646, + "learning_rate": 9.954352708587636e-06, + "loss": 1.283, + "step": 1794 + }, + { + "epoch": 0.45471817606079795, + "grad_norm": 4.013422966003418, + "learning_rate": 9.954239683123344e-06, + "loss": 1.2045, + "step": 1795 + }, + { + "epoch": 0.45497150094996835, + "grad_norm": 4.067855358123779, + "learning_rate": 9.954126518546326e-06, + "loss": 1.2415, + "step": 1796 + }, + { + "epoch": 0.4552248258391387, + "grad_norm": 3.4011635780334473, + "learning_rate": 9.954013214859757e-06, + "loss": 1.0973, + "step": 1797 + }, + { + "epoch": 0.45547815072830905, + "grad_norm": 3.786022186279297, + "learning_rate": 9.953899772066817e-06, + "loss": 1.132, + "step": 1798 + }, + { + "epoch": 0.4557314756174794, + "grad_norm": 3.6727492809295654, + "learning_rate": 9.953786190170694e-06, + "loss": 1.1464, + "step": 1799 + }, + { + "epoch": 0.4559848005066498, + "grad_norm": 3.9442310333251953, + "learning_rate": 9.953672469174578e-06, + "loss": 1.1045, + "step": 1800 + }, + { + "epoch": 0.45623812539582015, + "grad_norm": 3.8506267070770264, + "learning_rate": 9.953558609081659e-06, + "loss": 1.2371, + "step": 1801 + }, + { + "epoch": 0.4564914502849905, + "grad_norm": 3.5730085372924805, + "learning_rate": 9.953444609895136e-06, + "loss": 1.1059, + "step": 1802 + }, + { + "epoch": 0.45674477517416084, + "grad_norm": 4.165727138519287, + "learning_rate": 9.953330471618211e-06, + "loss": 1.1077, + "step": 1803 + }, + { + "epoch": 0.45699810006333125, + "grad_norm": 3.7444920539855957, + "learning_rate": 9.953216194254088e-06, + "loss": 1.1051, + "step": 1804 + }, + { + "epoch": 0.4572514249525016, + "grad_norm": 4.292081832885742, + "learning_rate": 9.953101777805973e-06, + "loss": 1.3843, + "step": 1805 + }, + { + "epoch": 0.45750474984167194, + "grad_norm": 3.955099582672119, + "learning_rate": 9.952987222277084e-06, + "loss": 1.2544, + "step": 1806 + }, + { + "epoch": 0.4577580747308423, + "grad_norm": 3.6804616451263428, + "learning_rate": 9.952872527670636e-06, + "loss": 1.0769, + "step": 1807 + }, + { + "epoch": 0.4580113996200127, + "grad_norm": 4.215207576751709, + "learning_rate": 9.952757693989848e-06, + "loss": 1.3601, + "step": 1808 + }, + { + "epoch": 0.45826472450918304, + "grad_norm": 3.824826240539551, + "learning_rate": 9.952642721237945e-06, + "loss": 1.2812, + "step": 1809 + }, + { + "epoch": 0.4585180493983534, + "grad_norm": 3.581301689147949, + "learning_rate": 9.952527609418153e-06, + "loss": 1.0627, + "step": 1810 + }, + { + "epoch": 0.45877137428752374, + "grad_norm": 3.8855178356170654, + "learning_rate": 9.95241235853371e-06, + "loss": 1.3018, + "step": 1811 + }, + { + "epoch": 0.4590246991766941, + "grad_norm": 3.7887930870056152, + "learning_rate": 9.952296968587847e-06, + "loss": 1.2669, + "step": 1812 + }, + { + "epoch": 0.4592780240658645, + "grad_norm": 3.6120541095733643, + "learning_rate": 9.952181439583808e-06, + "loss": 1.0911, + "step": 1813 + }, + { + "epoch": 0.45953134895503484, + "grad_norm": 5.2798357009887695, + "learning_rate": 9.952065771524834e-06, + "loss": 1.1291, + "step": 1814 + }, + { + "epoch": 0.4597846738442052, + "grad_norm": 4.032440185546875, + "learning_rate": 9.951949964414174e-06, + "loss": 1.2002, + "step": 1815 + }, + { + "epoch": 0.46003799873337553, + "grad_norm": 3.861185073852539, + "learning_rate": 9.95183401825508e-06, + "loss": 1.2061, + "step": 1816 + }, + { + "epoch": 0.46029132362254593, + "grad_norm": 4.094289302825928, + "learning_rate": 9.951717933050808e-06, + "loss": 1.3375, + "step": 1817 + }, + { + "epoch": 0.4605446485117163, + "grad_norm": 4.029137134552002, + "learning_rate": 9.951601708804616e-06, + "loss": 1.1493, + "step": 1818 + }, + { + "epoch": 0.46079797340088663, + "grad_norm": 4.202754497528076, + "learning_rate": 9.95148534551977e-06, + "loss": 1.3218, + "step": 1819 + }, + { + "epoch": 0.461051298290057, + "grad_norm": 4.536633014678955, + "learning_rate": 9.951368843199537e-06, + "loss": 1.3656, + "step": 1820 + }, + { + "epoch": 0.4613046231792274, + "grad_norm": 3.5142319202423096, + "learning_rate": 9.951252201847185e-06, + "loss": 1.2584, + "step": 1821 + }, + { + "epoch": 0.4615579480683977, + "grad_norm": 4.182380199432373, + "learning_rate": 9.951135421465994e-06, + "loss": 1.1919, + "step": 1822 + }, + { + "epoch": 0.4618112729575681, + "grad_norm": 4.094515323638916, + "learning_rate": 9.951018502059241e-06, + "loss": 1.248, + "step": 1823 + }, + { + "epoch": 0.4620645978467384, + "grad_norm": 3.897925615310669, + "learning_rate": 9.950901443630207e-06, + "loss": 1.1969, + "step": 1824 + }, + { + "epoch": 0.4623179227359088, + "grad_norm": 3.899789571762085, + "learning_rate": 9.950784246182182e-06, + "loss": 1.3231, + "step": 1825 + }, + { + "epoch": 0.4625712476250792, + "grad_norm": 3.80564284324646, + "learning_rate": 9.950666909718455e-06, + "loss": 1.2565, + "step": 1826 + }, + { + "epoch": 0.4628245725142495, + "grad_norm": 3.5356082916259766, + "learning_rate": 9.950549434242323e-06, + "loss": 1.0738, + "step": 1827 + }, + { + "epoch": 0.46307789740341987, + "grad_norm": 3.7041282653808594, + "learning_rate": 9.950431819757082e-06, + "loss": 1.2571, + "step": 1828 + }, + { + "epoch": 0.4633312222925903, + "grad_norm": 3.9085562229156494, + "learning_rate": 9.950314066266036e-06, + "loss": 1.1865, + "step": 1829 + }, + { + "epoch": 0.4635845471817606, + "grad_norm": 4.401854038238525, + "learning_rate": 9.95019617377249e-06, + "loss": 1.303, + "step": 1830 + }, + { + "epoch": 0.46383787207093097, + "grad_norm": 3.7897353172302246, + "learning_rate": 9.950078142279756e-06, + "loss": 1.1402, + "step": 1831 + }, + { + "epoch": 0.4640911969601013, + "grad_norm": 4.0070390701293945, + "learning_rate": 9.949959971791148e-06, + "loss": 1.4373, + "step": 1832 + }, + { + "epoch": 0.4643445218492717, + "grad_norm": 3.7028911113739014, + "learning_rate": 9.949841662309984e-06, + "loss": 1.2008, + "step": 1833 + }, + { + "epoch": 0.46459784673844207, + "grad_norm": 3.7358293533325195, + "learning_rate": 9.949723213839587e-06, + "loss": 1.1822, + "step": 1834 + }, + { + "epoch": 0.4648511716276124, + "grad_norm": 3.5930306911468506, + "learning_rate": 9.94960462638328e-06, + "loss": 1.1352, + "step": 1835 + }, + { + "epoch": 0.46510449651678276, + "grad_norm": 3.7119147777557373, + "learning_rate": 9.949485899944396e-06, + "loss": 1.1535, + "step": 1836 + }, + { + "epoch": 0.4653578214059531, + "grad_norm": 3.891018867492676, + "learning_rate": 9.949367034526267e-06, + "loss": 1.2585, + "step": 1837 + }, + { + "epoch": 0.4656111462951235, + "grad_norm": 3.875239133834839, + "learning_rate": 9.94924803013223e-06, + "loss": 1.2086, + "step": 1838 + }, + { + "epoch": 0.46586447118429386, + "grad_norm": 3.798842668533325, + "learning_rate": 9.94912888676563e-06, + "loss": 1.222, + "step": 1839 + }, + { + "epoch": 0.4661177960734642, + "grad_norm": 3.962372064590454, + "learning_rate": 9.949009604429811e-06, + "loss": 1.2586, + "step": 1840 + }, + { + "epoch": 0.46637112096263456, + "grad_norm": 3.7630298137664795, + "learning_rate": 9.948890183128122e-06, + "loss": 1.0473, + "step": 1841 + }, + { + "epoch": 0.46662444585180496, + "grad_norm": 3.6790220737457275, + "learning_rate": 9.948770622863914e-06, + "loss": 1.2097, + "step": 1842 + }, + { + "epoch": 0.4668777707409753, + "grad_norm": 3.9057321548461914, + "learning_rate": 9.948650923640547e-06, + "loss": 1.2565, + "step": 1843 + }, + { + "epoch": 0.46713109563014565, + "grad_norm": 4.201375961303711, + "learning_rate": 9.948531085461382e-06, + "loss": 1.1996, + "step": 1844 + }, + { + "epoch": 0.467384420519316, + "grad_norm": 3.963596820831299, + "learning_rate": 9.948411108329783e-06, + "loss": 1.2559, + "step": 1845 + }, + { + "epoch": 0.4676377454084864, + "grad_norm": 3.891873598098755, + "learning_rate": 9.94829099224912e-06, + "loss": 1.3843, + "step": 1846 + }, + { + "epoch": 0.46789107029765675, + "grad_norm": 3.9661974906921387, + "learning_rate": 9.948170737222763e-06, + "loss": 1.3839, + "step": 1847 + }, + { + "epoch": 0.4681443951868271, + "grad_norm": 3.680875062942505, + "learning_rate": 9.948050343254092e-06, + "loss": 1.2136, + "step": 1848 + }, + { + "epoch": 0.46839772007599745, + "grad_norm": 3.872366189956665, + "learning_rate": 9.947929810346486e-06, + "loss": 1.228, + "step": 1849 + }, + { + "epoch": 0.46865104496516785, + "grad_norm": 4.095637798309326, + "learning_rate": 9.94780913850333e-06, + "loss": 1.136, + "step": 1850 + }, + { + "epoch": 0.4689043698543382, + "grad_norm": 3.4010581970214844, + "learning_rate": 9.947688327728013e-06, + "loss": 1.1325, + "step": 1851 + }, + { + "epoch": 0.46915769474350855, + "grad_norm": 3.8270957469940186, + "learning_rate": 9.947567378023927e-06, + "loss": 1.1534, + "step": 1852 + }, + { + "epoch": 0.4694110196326789, + "grad_norm": 4.096489429473877, + "learning_rate": 9.947446289394466e-06, + "loss": 1.2138, + "step": 1853 + }, + { + "epoch": 0.4696643445218493, + "grad_norm": 3.8658835887908936, + "learning_rate": 9.947325061843035e-06, + "loss": 1.1772, + "step": 1854 + }, + { + "epoch": 0.46991766941101965, + "grad_norm": 4.072702407836914, + "learning_rate": 9.947203695373033e-06, + "loss": 1.2552, + "step": 1855 + }, + { + "epoch": 0.47017099430019, + "grad_norm": 3.8411388397216797, + "learning_rate": 9.94708218998787e-06, + "loss": 1.3319, + "step": 1856 + }, + { + "epoch": 0.47042431918936034, + "grad_norm": 3.4740614891052246, + "learning_rate": 9.946960545690958e-06, + "loss": 1.0957, + "step": 1857 + }, + { + "epoch": 0.4706776440785307, + "grad_norm": 3.8394386768341064, + "learning_rate": 9.946838762485712e-06, + "loss": 1.2165, + "step": 1858 + }, + { + "epoch": 0.4709309689677011, + "grad_norm": 3.7392282485961914, + "learning_rate": 9.946716840375552e-06, + "loss": 1.245, + "step": 1859 + }, + { + "epoch": 0.47118429385687144, + "grad_norm": 4.12973690032959, + "learning_rate": 9.946594779363901e-06, + "loss": 1.1534, + "step": 1860 + }, + { + "epoch": 0.4714376187460418, + "grad_norm": 3.3439900875091553, + "learning_rate": 9.946472579454188e-06, + "loss": 1.0638, + "step": 1861 + }, + { + "epoch": 0.47169094363521213, + "grad_norm": 3.2665603160858154, + "learning_rate": 9.946350240649843e-06, + "loss": 1.0374, + "step": 1862 + }, + { + "epoch": 0.47194426852438254, + "grad_norm": 3.712949752807617, + "learning_rate": 9.9462277629543e-06, + "loss": 1.2055, + "step": 1863 + }, + { + "epoch": 0.4721975934135529, + "grad_norm": 3.8193211555480957, + "learning_rate": 9.946105146371003e-06, + "loss": 1.1696, + "step": 1864 + }, + { + "epoch": 0.47245091830272323, + "grad_norm": 4.11019229888916, + "learning_rate": 9.94598239090339e-06, + "loss": 1.2141, + "step": 1865 + }, + { + "epoch": 0.4727042431918936, + "grad_norm": 4.131199359893799, + "learning_rate": 9.945859496554909e-06, + "loss": 1.2633, + "step": 1866 + }, + { + "epoch": 0.472957568081064, + "grad_norm": 4.110605716705322, + "learning_rate": 9.94573646332901e-06, + "loss": 1.3537, + "step": 1867 + }, + { + "epoch": 0.47321089297023433, + "grad_norm": 4.174281120300293, + "learning_rate": 9.945613291229152e-06, + "loss": 1.1177, + "step": 1868 + }, + { + "epoch": 0.4734642178594047, + "grad_norm": 4.247077465057373, + "learning_rate": 9.945489980258788e-06, + "loss": 1.2448, + "step": 1869 + }, + { + "epoch": 0.473717542748575, + "grad_norm": 3.9473955631256104, + "learning_rate": 9.945366530421385e-06, + "loss": 1.1886, + "step": 1870 + }, + { + "epoch": 0.47397086763774543, + "grad_norm": 3.6181344985961914, + "learning_rate": 9.945242941720408e-06, + "loss": 1.1013, + "step": 1871 + }, + { + "epoch": 0.4742241925269158, + "grad_norm": 3.9763457775115967, + "learning_rate": 9.945119214159324e-06, + "loss": 1.2935, + "step": 1872 + }, + { + "epoch": 0.4744775174160861, + "grad_norm": 3.6653854846954346, + "learning_rate": 9.944995347741613e-06, + "loss": 1.0569, + "step": 1873 + }, + { + "epoch": 0.4747308423052565, + "grad_norm": 3.494320869445801, + "learning_rate": 9.94487134247075e-06, + "loss": 1.0765, + "step": 1874 + }, + { + "epoch": 0.4749841671944269, + "grad_norm": 3.7668800354003906, + "learning_rate": 9.944747198350215e-06, + "loss": 1.1446, + "step": 1875 + }, + { + "epoch": 0.4752374920835972, + "grad_norm": 3.7044589519500732, + "learning_rate": 9.9446229153835e-06, + "loss": 1.1367, + "step": 1876 + }, + { + "epoch": 0.47549081697276757, + "grad_norm": 3.969409227371216, + "learning_rate": 9.944498493574088e-06, + "loss": 1.2635, + "step": 1877 + }, + { + "epoch": 0.4757441418619379, + "grad_norm": 3.864562511444092, + "learning_rate": 9.944373932925475e-06, + "loss": 1.318, + "step": 1878 + }, + { + "epoch": 0.4759974667511083, + "grad_norm": 3.6207752227783203, + "learning_rate": 9.944249233441162e-06, + "loss": 1.2752, + "step": 1879 + }, + { + "epoch": 0.47625079164027867, + "grad_norm": 3.924584150314331, + "learning_rate": 9.944124395124645e-06, + "loss": 1.2219, + "step": 1880 + }, + { + "epoch": 0.476504116529449, + "grad_norm": 3.9239914417266846, + "learning_rate": 9.943999417979435e-06, + "loss": 1.2603, + "step": 1881 + }, + { + "epoch": 0.47675744141861937, + "grad_norm": 3.840116500854492, + "learning_rate": 9.943874302009037e-06, + "loss": 1.1768, + "step": 1882 + }, + { + "epoch": 0.4770107663077897, + "grad_norm": 3.9847187995910645, + "learning_rate": 9.943749047216966e-06, + "loss": 1.0493, + "step": 1883 + }, + { + "epoch": 0.4772640911969601, + "grad_norm": 3.9286274909973145, + "learning_rate": 9.943623653606738e-06, + "loss": 1.3563, + "step": 1884 + }, + { + "epoch": 0.47751741608613046, + "grad_norm": 4.200928688049316, + "learning_rate": 9.943498121181877e-06, + "loss": 1.3595, + "step": 1885 + }, + { + "epoch": 0.4777707409753008, + "grad_norm": 3.5353293418884277, + "learning_rate": 9.943372449945903e-06, + "loss": 1.1743, + "step": 1886 + }, + { + "epoch": 0.47802406586447116, + "grad_norm": 4.019369602203369, + "learning_rate": 9.943246639902349e-06, + "loss": 1.1256, + "step": 1887 + }, + { + "epoch": 0.47827739075364156, + "grad_norm": 4.187241077423096, + "learning_rate": 9.943120691054745e-06, + "loss": 1.3265, + "step": 1888 + }, + { + "epoch": 0.4785307156428119, + "grad_norm": 4.001984596252441, + "learning_rate": 9.942994603406629e-06, + "loss": 1.2638, + "step": 1889 + }, + { + "epoch": 0.47878404053198226, + "grad_norm": 3.6993045806884766, + "learning_rate": 9.942868376961542e-06, + "loss": 1.1978, + "step": 1890 + }, + { + "epoch": 0.4790373654211526, + "grad_norm": 3.9563772678375244, + "learning_rate": 9.942742011723028e-06, + "loss": 1.2217, + "step": 1891 + }, + { + "epoch": 0.479290690310323, + "grad_norm": 3.6382575035095215, + "learning_rate": 9.942615507694633e-06, + "loss": 1.1652, + "step": 1892 + }, + { + "epoch": 0.47954401519949336, + "grad_norm": 4.0197834968566895, + "learning_rate": 9.942488864879912e-06, + "loss": 1.3435, + "step": 1893 + }, + { + "epoch": 0.4797973400886637, + "grad_norm": 3.5034239292144775, + "learning_rate": 9.94236208328242e-06, + "loss": 1.0797, + "step": 1894 + }, + { + "epoch": 0.48005066497783405, + "grad_norm": 3.9032034873962402, + "learning_rate": 9.942235162905719e-06, + "loss": 1.2162, + "step": 1895 + }, + { + "epoch": 0.48030398986700445, + "grad_norm": 3.9833247661590576, + "learning_rate": 9.942108103753367e-06, + "loss": 1.2685, + "step": 1896 + }, + { + "epoch": 0.4805573147561748, + "grad_norm": 3.9508931636810303, + "learning_rate": 9.941980905828939e-06, + "loss": 1.1964, + "step": 1897 + }, + { + "epoch": 0.48081063964534515, + "grad_norm": 4.35664176940918, + "learning_rate": 9.941853569136001e-06, + "loss": 1.3512, + "step": 1898 + }, + { + "epoch": 0.4810639645345155, + "grad_norm": 4.374859809875488, + "learning_rate": 9.941726093678132e-06, + "loss": 1.2175, + "step": 1899 + }, + { + "epoch": 0.4813172894236859, + "grad_norm": 3.94962215423584, + "learning_rate": 9.941598479458911e-06, + "loss": 1.3012, + "step": 1900 + }, + { + "epoch": 0.48157061431285625, + "grad_norm": 3.9118425846099854, + "learning_rate": 9.941470726481921e-06, + "loss": 1.1469, + "step": 1901 + }, + { + "epoch": 0.4818239392020266, + "grad_norm": 3.626525402069092, + "learning_rate": 9.941342834750748e-06, + "loss": 1.2502, + "step": 1902 + }, + { + "epoch": 0.48207726409119694, + "grad_norm": 4.247342109680176, + "learning_rate": 9.941214804268983e-06, + "loss": 1.2068, + "step": 1903 + }, + { + "epoch": 0.48233058898036735, + "grad_norm": 4.113563060760498, + "learning_rate": 9.941086635040225e-06, + "loss": 1.2238, + "step": 1904 + }, + { + "epoch": 0.4825839138695377, + "grad_norm": 3.7747962474823, + "learning_rate": 9.940958327068068e-06, + "loss": 1.1895, + "step": 1905 + }, + { + "epoch": 0.48283723875870804, + "grad_norm": 4.083578109741211, + "learning_rate": 9.940829880356117e-06, + "loss": 1.2219, + "step": 1906 + }, + { + "epoch": 0.4830905636478784, + "grad_norm": 3.6183180809020996, + "learning_rate": 9.940701294907979e-06, + "loss": 1.2029, + "step": 1907 + }, + { + "epoch": 0.48334388853704874, + "grad_norm": 4.016280651092529, + "learning_rate": 9.940572570727265e-06, + "loss": 1.2682, + "step": 1908 + }, + { + "epoch": 0.48359721342621914, + "grad_norm": 3.8436830043792725, + "learning_rate": 9.940443707817588e-06, + "loss": 1.33, + "step": 1909 + }, + { + "epoch": 0.4838505383153895, + "grad_norm": 3.845280408859253, + "learning_rate": 9.940314706182566e-06, + "loss": 1.2901, + "step": 1910 + }, + { + "epoch": 0.48410386320455984, + "grad_norm": 3.605034589767456, + "learning_rate": 9.940185565825824e-06, + "loss": 1.3842, + "step": 1911 + }, + { + "epoch": 0.4843571880937302, + "grad_norm": 3.5678157806396484, + "learning_rate": 9.940056286750988e-06, + "loss": 1.1572, + "step": 1912 + }, + { + "epoch": 0.4846105129829006, + "grad_norm": 4.389228343963623, + "learning_rate": 9.939926868961684e-06, + "loss": 1.195, + "step": 1913 + }, + { + "epoch": 0.48486383787207094, + "grad_norm": 3.625386953353882, + "learning_rate": 9.93979731246155e-06, + "loss": 1.2272, + "step": 1914 + }, + { + "epoch": 0.4851171627612413, + "grad_norm": 3.577895164489746, + "learning_rate": 9.939667617254222e-06, + "loss": 1.0943, + "step": 1915 + }, + { + "epoch": 0.48537048765041163, + "grad_norm": 4.115499019622803, + "learning_rate": 9.939537783343342e-06, + "loss": 1.198, + "step": 1916 + }, + { + "epoch": 0.48562381253958203, + "grad_norm": 3.993464708328247, + "learning_rate": 9.939407810732558e-06, + "loss": 1.3738, + "step": 1917 + }, + { + "epoch": 0.4858771374287524, + "grad_norm": 4.160882472991943, + "learning_rate": 9.939277699425515e-06, + "loss": 1.2951, + "step": 1918 + }, + { + "epoch": 0.48613046231792273, + "grad_norm": 3.5682358741760254, + "learning_rate": 9.939147449425873e-06, + "loss": 1.1651, + "step": 1919 + }, + { + "epoch": 0.4863837872070931, + "grad_norm": 3.803846836090088, + "learning_rate": 9.939017060737283e-06, + "loss": 1.2901, + "step": 1920 + }, + { + "epoch": 0.4866371120962635, + "grad_norm": 4.027624130249023, + "learning_rate": 9.938886533363408e-06, + "loss": 1.2616, + "step": 1921 + }, + { + "epoch": 0.4868904369854338, + "grad_norm": 3.9616568088531494, + "learning_rate": 9.938755867307915e-06, + "loss": 1.2761, + "step": 1922 + }, + { + "epoch": 0.4871437618746042, + "grad_norm": 3.555142641067505, + "learning_rate": 9.938625062574471e-06, + "loss": 1.1668, + "step": 1923 + }, + { + "epoch": 0.4873970867637745, + "grad_norm": 3.52968168258667, + "learning_rate": 9.938494119166751e-06, + "loss": 1.2268, + "step": 1924 + }, + { + "epoch": 0.4876504116529449, + "grad_norm": 3.665621519088745, + "learning_rate": 9.93836303708843e-06, + "loss": 1.003, + "step": 1925 + }, + { + "epoch": 0.4879037365421153, + "grad_norm": 3.53971004486084, + "learning_rate": 9.938231816343191e-06, + "loss": 0.9954, + "step": 1926 + }, + { + "epoch": 0.4881570614312856, + "grad_norm": 4.071761608123779, + "learning_rate": 9.938100456934716e-06, + "loss": 1.2768, + "step": 1927 + }, + { + "epoch": 0.48841038632045597, + "grad_norm": 3.6120855808258057, + "learning_rate": 9.937968958866693e-06, + "loss": 1.0892, + "step": 1928 + }, + { + "epoch": 0.4886637112096264, + "grad_norm": 3.712071657180786, + "learning_rate": 9.937837322142818e-06, + "loss": 1.2015, + "step": 1929 + }, + { + "epoch": 0.4889170360987967, + "grad_norm": 4.381753921508789, + "learning_rate": 9.937705546766784e-06, + "loss": 1.3256, + "step": 1930 + }, + { + "epoch": 0.48917036098796707, + "grad_norm": 3.739407539367676, + "learning_rate": 9.937573632742294e-06, + "loss": 1.1897, + "step": 1931 + }, + { + "epoch": 0.4894236858771374, + "grad_norm": 3.939709186553955, + "learning_rate": 9.93744158007305e-06, + "loss": 1.0873, + "step": 1932 + }, + { + "epoch": 0.48967701076630776, + "grad_norm": 3.766883373260498, + "learning_rate": 9.93730938876276e-06, + "loss": 1.1832, + "step": 1933 + }, + { + "epoch": 0.48993033565547817, + "grad_norm": 3.7371826171875, + "learning_rate": 9.937177058815134e-06, + "loss": 1.191, + "step": 1934 + }, + { + "epoch": 0.4901836605446485, + "grad_norm": 3.86498761177063, + "learning_rate": 9.937044590233895e-06, + "loss": 1.3341, + "step": 1935 + }, + { + "epoch": 0.49043698543381886, + "grad_norm": 3.599517583847046, + "learning_rate": 9.936911983022755e-06, + "loss": 1.109, + "step": 1936 + }, + { + "epoch": 0.4906903103229892, + "grad_norm": 3.963528871536255, + "learning_rate": 9.93677923718544e-06, + "loss": 1.305, + "step": 1937 + }, + { + "epoch": 0.4909436352121596, + "grad_norm": 3.53969669342041, + "learning_rate": 9.936646352725678e-06, + "loss": 1.1524, + "step": 1938 + }, + { + "epoch": 0.49119696010132996, + "grad_norm": 3.351635456085205, + "learning_rate": 9.936513329647201e-06, + "loss": 1.1303, + "step": 1939 + }, + { + "epoch": 0.4914502849905003, + "grad_norm": 3.753661870956421, + "learning_rate": 9.936380167953744e-06, + "loss": 1.1581, + "step": 1940 + }, + { + "epoch": 0.49170360987967066, + "grad_norm": 3.5876848697662354, + "learning_rate": 9.936246867649044e-06, + "loss": 1.0585, + "step": 1941 + }, + { + "epoch": 0.49195693476884106, + "grad_norm": 3.771541118621826, + "learning_rate": 9.936113428736845e-06, + "loss": 1.1871, + "step": 1942 + }, + { + "epoch": 0.4922102596580114, + "grad_norm": 3.8653862476348877, + "learning_rate": 9.935979851220895e-06, + "loss": 1.2944, + "step": 1943 + }, + { + "epoch": 0.49246358454718175, + "grad_norm": 4.140165328979492, + "learning_rate": 9.935846135104945e-06, + "loss": 1.3218, + "step": 1944 + }, + { + "epoch": 0.4927169094363521, + "grad_norm": 3.655492067337036, + "learning_rate": 9.935712280392747e-06, + "loss": 1.199, + "step": 1945 + }, + { + "epoch": 0.4929702343255225, + "grad_norm": 3.7361409664154053, + "learning_rate": 9.935578287088063e-06, + "loss": 1.2055, + "step": 1946 + }, + { + "epoch": 0.49322355921469285, + "grad_norm": 4.23722505569458, + "learning_rate": 9.935444155194654e-06, + "loss": 1.2338, + "step": 1947 + }, + { + "epoch": 0.4934768841038632, + "grad_norm": 3.802419900894165, + "learning_rate": 9.935309884716285e-06, + "loss": 1.2822, + "step": 1948 + }, + { + "epoch": 0.49373020899303355, + "grad_norm": 3.4896342754364014, + "learning_rate": 9.93517547565673e-06, + "loss": 1.1569, + "step": 1949 + }, + { + "epoch": 0.49398353388220395, + "grad_norm": 4.055016040802002, + "learning_rate": 9.935040928019756e-06, + "loss": 1.106, + "step": 1950 + }, + { + "epoch": 0.4942368587713743, + "grad_norm": 3.828601598739624, + "learning_rate": 9.93490624180915e-06, + "loss": 1.1133, + "step": 1951 + }, + { + "epoch": 0.49449018366054465, + "grad_norm": 3.522599220275879, + "learning_rate": 9.934771417028688e-06, + "loss": 1.0369, + "step": 1952 + }, + { + "epoch": 0.494743508549715, + "grad_norm": 3.7118875980377197, + "learning_rate": 9.934636453682158e-06, + "loss": 1.2343, + "step": 1953 + }, + { + "epoch": 0.4949968334388854, + "grad_norm": 3.7711949348449707, + "learning_rate": 9.93450135177335e-06, + "loss": 1.1774, + "step": 1954 + }, + { + "epoch": 0.49525015832805575, + "grad_norm": 3.6655616760253906, + "learning_rate": 9.934366111306055e-06, + "loss": 1.2547, + "step": 1955 + }, + { + "epoch": 0.4955034832172261, + "grad_norm": 3.4832236766815186, + "learning_rate": 9.934230732284072e-06, + "loss": 1.1242, + "step": 1956 + }, + { + "epoch": 0.49575680810639644, + "grad_norm": 3.6455090045928955, + "learning_rate": 9.934095214711204e-06, + "loss": 1.4028, + "step": 1957 + }, + { + "epoch": 0.4960101329955668, + "grad_norm": 3.7010371685028076, + "learning_rate": 9.933959558591254e-06, + "loss": 1.3082, + "step": 1958 + }, + { + "epoch": 0.4962634578847372, + "grad_norm": 4.023183345794678, + "learning_rate": 9.933823763928032e-06, + "loss": 1.282, + "step": 1959 + }, + { + "epoch": 0.49651678277390754, + "grad_norm": 3.851107358932495, + "learning_rate": 9.933687830725351e-06, + "loss": 1.2865, + "step": 1960 + }, + { + "epoch": 0.4967701076630779, + "grad_norm": 3.800704002380371, + "learning_rate": 9.933551758987029e-06, + "loss": 1.3246, + "step": 1961 + }, + { + "epoch": 0.49702343255224823, + "grad_norm": 3.916658401489258, + "learning_rate": 9.933415548716884e-06, + "loss": 1.2298, + "step": 1962 + }, + { + "epoch": 0.49727675744141864, + "grad_norm": 4.056824207305908, + "learning_rate": 9.933279199918743e-06, + "loss": 1.266, + "step": 1963 + }, + { + "epoch": 0.497530082330589, + "grad_norm": 3.6966655254364014, + "learning_rate": 9.933142712596435e-06, + "loss": 1.0131, + "step": 1964 + }, + { + "epoch": 0.49778340721975933, + "grad_norm": 3.6945905685424805, + "learning_rate": 9.933006086753793e-06, + "loss": 1.2248, + "step": 1965 + }, + { + "epoch": 0.4980367321089297, + "grad_norm": 3.9814541339874268, + "learning_rate": 9.93286932239465e-06, + "loss": 1.3103, + "step": 1966 + }, + { + "epoch": 0.4982900569981001, + "grad_norm": 3.521240472793579, + "learning_rate": 9.932732419522849e-06, + "loss": 1.0976, + "step": 1967 + }, + { + "epoch": 0.49854338188727043, + "grad_norm": 3.8234946727752686, + "learning_rate": 9.932595378142233e-06, + "loss": 1.3093, + "step": 1968 + }, + { + "epoch": 0.4987967067764408, + "grad_norm": 3.7615811824798584, + "learning_rate": 9.932458198256652e-06, + "loss": 1.1305, + "step": 1969 + }, + { + "epoch": 0.4990500316656111, + "grad_norm": 3.895721912384033, + "learning_rate": 9.932320879869956e-06, + "loss": 1.1736, + "step": 1970 + }, + { + "epoch": 0.49930335655478153, + "grad_norm": 3.6676993370056152, + "learning_rate": 9.932183422986e-06, + "loss": 1.2882, + "step": 1971 + }, + { + "epoch": 0.4995566814439519, + "grad_norm": 3.9136924743652344, + "learning_rate": 9.932045827608648e-06, + "loss": 1.1333, + "step": 1972 + }, + { + "epoch": 0.4998100063331222, + "grad_norm": 3.678321599960327, + "learning_rate": 9.931908093741757e-06, + "loss": 1.2515, + "step": 1973 + }, + { + "epoch": 0.5000633312222926, + "grad_norm": 3.9502320289611816, + "learning_rate": 9.931770221389201e-06, + "loss": 1.1732, + "step": 1974 + }, + { + "epoch": 0.5003166561114629, + "grad_norm": 4.168564796447754, + "learning_rate": 9.931632210554846e-06, + "loss": 1.477, + "step": 1975 + }, + { + "epoch": 0.5005699810006333, + "grad_norm": 3.4833409786224365, + "learning_rate": 9.931494061242573e-06, + "loss": 1.154, + "step": 1976 + }, + { + "epoch": 0.5008233058898037, + "grad_norm": 3.859236240386963, + "learning_rate": 9.931355773456257e-06, + "loss": 1.2578, + "step": 1977 + }, + { + "epoch": 0.501076630778974, + "grad_norm": 4.015527248382568, + "learning_rate": 9.93121734719978e-06, + "loss": 1.2787, + "step": 1978 + }, + { + "epoch": 0.5013299556681444, + "grad_norm": 3.5901248455047607, + "learning_rate": 9.931078782477033e-06, + "loss": 1.2092, + "step": 1979 + }, + { + "epoch": 0.5015832805573147, + "grad_norm": 3.4743640422821045, + "learning_rate": 9.930940079291904e-06, + "loss": 1.1874, + "step": 1980 + }, + { + "epoch": 0.5018366054464851, + "grad_norm": 3.7826244831085205, + "learning_rate": 9.93080123764829e-06, + "loss": 1.2899, + "step": 1981 + }, + { + "epoch": 0.5020899303356555, + "grad_norm": 3.508268117904663, + "learning_rate": 9.930662257550087e-06, + "loss": 1.2104, + "step": 1982 + }, + { + "epoch": 0.5023432552248258, + "grad_norm": 3.540865421295166, + "learning_rate": 9.930523139001199e-06, + "loss": 1.0924, + "step": 1983 + }, + { + "epoch": 0.5025965801139962, + "grad_norm": 4.011488914489746, + "learning_rate": 9.930383882005532e-06, + "loss": 1.3308, + "step": 1984 + }, + { + "epoch": 0.5028499050031665, + "grad_norm": 3.86618971824646, + "learning_rate": 9.930244486566996e-06, + "loss": 1.2516, + "step": 1985 + }, + { + "epoch": 0.5031032298923369, + "grad_norm": 3.611562967300415, + "learning_rate": 9.930104952689507e-06, + "loss": 1.3106, + "step": 1986 + }, + { + "epoch": 0.5033565547815073, + "grad_norm": 4.026989936828613, + "learning_rate": 9.929965280376981e-06, + "loss": 1.1976, + "step": 1987 + }, + { + "epoch": 0.5036098796706776, + "grad_norm": 3.4937264919281006, + "learning_rate": 9.929825469633338e-06, + "loss": 1.0598, + "step": 1988 + }, + { + "epoch": 0.503863204559848, + "grad_norm": 3.5130345821380615, + "learning_rate": 9.929685520462508e-06, + "loss": 1.1567, + "step": 1989 + }, + { + "epoch": 0.5041165294490184, + "grad_norm": 3.598287343978882, + "learning_rate": 9.929545432868422e-06, + "loss": 1.1503, + "step": 1990 + }, + { + "epoch": 0.5043698543381887, + "grad_norm": 3.6132335662841797, + "learning_rate": 9.929405206855008e-06, + "loss": 1.1466, + "step": 1991 + }, + { + "epoch": 0.5046231792273591, + "grad_norm": 3.887468099594116, + "learning_rate": 9.929264842426204e-06, + "loss": 1.322, + "step": 1992 + }, + { + "epoch": 0.5048765041165294, + "grad_norm": 3.5717601776123047, + "learning_rate": 9.929124339585956e-06, + "loss": 1.2215, + "step": 1993 + }, + { + "epoch": 0.5051298290056998, + "grad_norm": 3.476271390914917, + "learning_rate": 9.928983698338207e-06, + "loss": 1.1382, + "step": 1994 + }, + { + "epoch": 0.5053831538948702, + "grad_norm": 3.6219406127929688, + "learning_rate": 9.928842918686905e-06, + "loss": 1.1434, + "step": 1995 + }, + { + "epoch": 0.5056364787840405, + "grad_norm": 3.7151741981506348, + "learning_rate": 9.928702000636004e-06, + "loss": 1.3485, + "step": 1996 + }, + { + "epoch": 0.5058898036732109, + "grad_norm": 3.5612034797668457, + "learning_rate": 9.92856094418946e-06, + "loss": 1.326, + "step": 1997 + }, + { + "epoch": 0.5061431285623813, + "grad_norm": 3.819822072982788, + "learning_rate": 9.928419749351236e-06, + "loss": 1.1866, + "step": 1998 + }, + { + "epoch": 0.5063964534515516, + "grad_norm": 3.5734217166900635, + "learning_rate": 9.928278416125294e-06, + "loss": 1.1093, + "step": 1999 + }, + { + "epoch": 0.506649778340722, + "grad_norm": 3.6447010040283203, + "learning_rate": 9.928136944515605e-06, + "loss": 1.0837, + "step": 2000 + }, + { + "epoch": 0.506649778340722, + "eval_loss": 1.2390925884246826, + "eval_runtime": 12.3897, + "eval_samples_per_second": 32.285, + "eval_steps_per_second": 4.036, + "step": 2000 + }, + { + "epoch": 0.5069031032298923, + "grad_norm": 3.805245876312256, + "learning_rate": 9.927995334526139e-06, + "loss": 1.2013, + "step": 2001 + }, + { + "epoch": 0.5071564281190627, + "grad_norm": 3.6440162658691406, + "learning_rate": 9.927853586160876e-06, + "loss": 1.098, + "step": 2002 + }, + { + "epoch": 0.5074097530082331, + "grad_norm": 3.89577317237854, + "learning_rate": 9.927711699423792e-06, + "loss": 1.2889, + "step": 2003 + }, + { + "epoch": 0.5076630778974034, + "grad_norm": 4.357210636138916, + "learning_rate": 9.927569674318874e-06, + "loss": 1.4048, + "step": 2004 + }, + { + "epoch": 0.5079164027865738, + "grad_norm": 4.084841251373291, + "learning_rate": 9.927427510850107e-06, + "loss": 1.2109, + "step": 2005 + }, + { + "epoch": 0.5081697276757441, + "grad_norm": 4.151208877563477, + "learning_rate": 9.927285209021487e-06, + "loss": 1.1832, + "step": 2006 + }, + { + "epoch": 0.5084230525649145, + "grad_norm": 4.021500587463379, + "learning_rate": 9.927142768837005e-06, + "loss": 1.24, + "step": 2007 + }, + { + "epoch": 0.5086763774540849, + "grad_norm": 3.5542354583740234, + "learning_rate": 9.927000190300666e-06, + "loss": 1.3194, + "step": 2008 + }, + { + "epoch": 0.5089297023432552, + "grad_norm": 4.103028774261475, + "learning_rate": 9.926857473416469e-06, + "loss": 1.3017, + "step": 2009 + }, + { + "epoch": 0.5091830272324256, + "grad_norm": 4.074617385864258, + "learning_rate": 9.926714618188424e-06, + "loss": 1.3239, + "step": 2010 + }, + { + "epoch": 0.509436352121596, + "grad_norm": 3.6382956504821777, + "learning_rate": 9.926571624620542e-06, + "loss": 1.2841, + "step": 2011 + }, + { + "epoch": 0.5096896770107663, + "grad_norm": 3.8066420555114746, + "learning_rate": 9.926428492716838e-06, + "loss": 1.2699, + "step": 2012 + }, + { + "epoch": 0.5099430018999367, + "grad_norm": 3.729119300842285, + "learning_rate": 9.92628522248133e-06, + "loss": 1.2963, + "step": 2013 + }, + { + "epoch": 0.510196326789107, + "grad_norm": 3.840654134750366, + "learning_rate": 9.926141813918042e-06, + "loss": 1.1904, + "step": 2014 + }, + { + "epoch": 0.5104496516782774, + "grad_norm": 3.9306607246398926, + "learning_rate": 9.925998267031001e-06, + "loss": 1.2877, + "step": 2015 + }, + { + "epoch": 0.5107029765674478, + "grad_norm": 3.945796489715576, + "learning_rate": 9.925854581824236e-06, + "loss": 1.1981, + "step": 2016 + }, + { + "epoch": 0.5109563014566181, + "grad_norm": 3.8385891914367676, + "learning_rate": 9.925710758301785e-06, + "loss": 1.2959, + "step": 2017 + }, + { + "epoch": 0.5112096263457885, + "grad_norm": 4.107153415679932, + "learning_rate": 9.925566796467685e-06, + "loss": 1.2408, + "step": 2018 + }, + { + "epoch": 0.5114629512349589, + "grad_norm": 3.6010515689849854, + "learning_rate": 9.925422696325976e-06, + "loss": 1.1565, + "step": 2019 + }, + { + "epoch": 0.5117162761241292, + "grad_norm": 4.021646976470947, + "learning_rate": 9.925278457880706e-06, + "loss": 1.1205, + "step": 2020 + }, + { + "epoch": 0.5119696010132996, + "grad_norm": 3.8271992206573486, + "learning_rate": 9.925134081135925e-06, + "loss": 1.1259, + "step": 2021 + }, + { + "epoch": 0.5122229259024699, + "grad_norm": 4.050345420837402, + "learning_rate": 9.924989566095689e-06, + "loss": 1.3793, + "step": 2022 + }, + { + "epoch": 0.5124762507916403, + "grad_norm": 3.9474878311157227, + "learning_rate": 9.924844912764053e-06, + "loss": 1.3196, + "step": 2023 + }, + { + "epoch": 0.5127295756808107, + "grad_norm": 3.6200332641601562, + "learning_rate": 9.924700121145081e-06, + "loss": 1.1426, + "step": 2024 + }, + { + "epoch": 0.512982900569981, + "grad_norm": 3.7752346992492676, + "learning_rate": 9.924555191242838e-06, + "loss": 1.1998, + "step": 2025 + }, + { + "epoch": 0.5132362254591514, + "grad_norm": 3.581876277923584, + "learning_rate": 9.92441012306139e-06, + "loss": 1.2416, + "step": 2026 + }, + { + "epoch": 0.5134895503483218, + "grad_norm": 3.861626148223877, + "learning_rate": 9.924264916604817e-06, + "loss": 1.3283, + "step": 2027 + }, + { + "epoch": 0.5137428752374921, + "grad_norm": 4.046294212341309, + "learning_rate": 9.924119571877192e-06, + "loss": 1.1203, + "step": 2028 + }, + { + "epoch": 0.5139962001266625, + "grad_norm": 3.922940254211426, + "learning_rate": 9.923974088882597e-06, + "loss": 1.2, + "step": 2029 + }, + { + "epoch": 0.5142495250158328, + "grad_norm": 3.821467161178589, + "learning_rate": 9.923828467625118e-06, + "loss": 1.1233, + "step": 2030 + }, + { + "epoch": 0.5145028499050032, + "grad_norm": 3.7097089290618896, + "learning_rate": 9.923682708108844e-06, + "loss": 1.0941, + "step": 2031 + }, + { + "epoch": 0.5147561747941736, + "grad_norm": 3.8376052379608154, + "learning_rate": 9.923536810337866e-06, + "loss": 1.3278, + "step": 2032 + }, + { + "epoch": 0.5150094996833439, + "grad_norm": 3.7457728385925293, + "learning_rate": 9.923390774316282e-06, + "loss": 1.2232, + "step": 2033 + }, + { + "epoch": 0.5152628245725143, + "grad_norm": 3.587247133255005, + "learning_rate": 9.923244600048191e-06, + "loss": 1.1964, + "step": 2034 + }, + { + "epoch": 0.5155161494616846, + "grad_norm": 3.852628469467163, + "learning_rate": 9.923098287537702e-06, + "loss": 1.2074, + "step": 2035 + }, + { + "epoch": 0.515769474350855, + "grad_norm": 3.785722494125366, + "learning_rate": 9.92295183678892e-06, + "loss": 1.2066, + "step": 2036 + }, + { + "epoch": 0.5160227992400254, + "grad_norm": 4.016298770904541, + "learning_rate": 9.922805247805956e-06, + "loss": 1.1638, + "step": 2037 + }, + { + "epoch": 0.5162761241291957, + "grad_norm": 3.6992557048797607, + "learning_rate": 9.922658520592927e-06, + "loss": 1.1199, + "step": 2038 + }, + { + "epoch": 0.5165294490183661, + "grad_norm": 3.6224451065063477, + "learning_rate": 9.922511655153957e-06, + "loss": 1.1693, + "step": 2039 + }, + { + "epoch": 0.5167827739075365, + "grad_norm": 3.499828815460205, + "learning_rate": 9.922364651493165e-06, + "loss": 1.1314, + "step": 2040 + }, + { + "epoch": 0.5170360987967068, + "grad_norm": 3.856823682785034, + "learning_rate": 9.92221750961468e-06, + "loss": 1.172, + "step": 2041 + }, + { + "epoch": 0.5172894236858772, + "grad_norm": 4.1732563972473145, + "learning_rate": 9.922070229522636e-06, + "loss": 1.3302, + "step": 2042 + }, + { + "epoch": 0.5175427485750475, + "grad_norm": 3.7007861137390137, + "learning_rate": 9.921922811221166e-06, + "loss": 1.2326, + "step": 2043 + }, + { + "epoch": 0.5177960734642179, + "grad_norm": 4.095267295837402, + "learning_rate": 9.92177525471441e-06, + "loss": 1.3154, + "step": 2044 + }, + { + "epoch": 0.5180493983533883, + "grad_norm": 3.9190802574157715, + "learning_rate": 9.921627560006511e-06, + "loss": 1.0951, + "step": 2045 + }, + { + "epoch": 0.5183027232425585, + "grad_norm": 3.760322332382202, + "learning_rate": 9.921479727101619e-06, + "loss": 1.2138, + "step": 2046 + }, + { + "epoch": 0.518556048131729, + "grad_norm": 3.5061159133911133, + "learning_rate": 9.921331756003881e-06, + "loss": 1.2732, + "step": 2047 + }, + { + "epoch": 0.5188093730208994, + "grad_norm": 3.5123519897460938, + "learning_rate": 9.921183646717454e-06, + "loss": 1.1772, + "step": 2048 + }, + { + "epoch": 0.5190626979100696, + "grad_norm": 3.607067346572876, + "learning_rate": 9.921035399246497e-06, + "loss": 1.0284, + "step": 2049 + }, + { + "epoch": 0.51931602279924, + "grad_norm": 3.7096149921417236, + "learning_rate": 9.920887013595171e-06, + "loss": 1.1604, + "step": 2050 + }, + { + "epoch": 0.5195693476884103, + "grad_norm": 3.6085991859436035, + "learning_rate": 9.920738489767646e-06, + "loss": 1.2365, + "step": 2051 + }, + { + "epoch": 0.5198226725775807, + "grad_norm": 3.6953377723693848, + "learning_rate": 9.92058982776809e-06, + "loss": 1.1491, + "step": 2052 + }, + { + "epoch": 0.5200759974667511, + "grad_norm": 3.950957775115967, + "learning_rate": 9.920441027600678e-06, + "loss": 1.1382, + "step": 2053 + }, + { + "epoch": 0.5203293223559214, + "grad_norm": 4.319118499755859, + "learning_rate": 9.920292089269587e-06, + "loss": 1.4027, + "step": 2054 + }, + { + "epoch": 0.5205826472450918, + "grad_norm": 4.153669834136963, + "learning_rate": 9.920143012778999e-06, + "loss": 1.2138, + "step": 2055 + }, + { + "epoch": 0.5208359721342621, + "grad_norm": 3.893864631652832, + "learning_rate": 9.919993798133104e-06, + "loss": 1.1122, + "step": 2056 + }, + { + "epoch": 0.5210892970234325, + "grad_norm": 3.6461257934570312, + "learning_rate": 9.919844445336088e-06, + "loss": 1.1891, + "step": 2057 + }, + { + "epoch": 0.5213426219126029, + "grad_norm": 4.094683647155762, + "learning_rate": 9.919694954392145e-06, + "loss": 1.2813, + "step": 2058 + }, + { + "epoch": 0.5215959468017732, + "grad_norm": 3.255305051803589, + "learning_rate": 9.919545325305475e-06, + "loss": 1.0393, + "step": 2059 + }, + { + "epoch": 0.5218492716909436, + "grad_norm": 3.8772292137145996, + "learning_rate": 9.919395558080276e-06, + "loss": 1.1041, + "step": 2060 + }, + { + "epoch": 0.522102596580114, + "grad_norm": 3.944272994995117, + "learning_rate": 9.919245652720756e-06, + "loss": 1.2829, + "step": 2061 + }, + { + "epoch": 0.5223559214692843, + "grad_norm": 3.7375152111053467, + "learning_rate": 9.919095609231125e-06, + "loss": 1.2344, + "step": 2062 + }, + { + "epoch": 0.5226092463584547, + "grad_norm": 3.8056650161743164, + "learning_rate": 9.918945427615594e-06, + "loss": 1.2783, + "step": 2063 + }, + { + "epoch": 0.522862571247625, + "grad_norm": 3.702904224395752, + "learning_rate": 9.91879510787838e-06, + "loss": 1.2879, + "step": 2064 + }, + { + "epoch": 0.5231158961367954, + "grad_norm": 3.8003106117248535, + "learning_rate": 9.918644650023706e-06, + "loss": 1.1689, + "step": 2065 + }, + { + "epoch": 0.5233692210259658, + "grad_norm": 3.9635705947875977, + "learning_rate": 9.918494054055795e-06, + "loss": 1.2963, + "step": 2066 + }, + { + "epoch": 0.5236225459151361, + "grad_norm": 3.4321775436401367, + "learning_rate": 9.918343319978877e-06, + "loss": 1.0809, + "step": 2067 + }, + { + "epoch": 0.5238758708043065, + "grad_norm": 3.361389636993408, + "learning_rate": 9.918192447797182e-06, + "loss": 1.1926, + "step": 2068 + }, + { + "epoch": 0.5241291956934769, + "grad_norm": 3.9239723682403564, + "learning_rate": 9.918041437514948e-06, + "loss": 1.2748, + "step": 2069 + }, + { + "epoch": 0.5243825205826472, + "grad_norm": 3.5856761932373047, + "learning_rate": 9.917890289136416e-06, + "loss": 1.254, + "step": 2070 + }, + { + "epoch": 0.5246358454718176, + "grad_norm": 3.499878406524658, + "learning_rate": 9.91773900266583e-06, + "loss": 1.1641, + "step": 2071 + }, + { + "epoch": 0.5248891703609879, + "grad_norm": 3.535977840423584, + "learning_rate": 9.917587578107438e-06, + "loss": 1.2673, + "step": 2072 + }, + { + "epoch": 0.5251424952501583, + "grad_norm": 3.4507598876953125, + "learning_rate": 9.91743601546549e-06, + "loss": 1.1051, + "step": 2073 + }, + { + "epoch": 0.5253958201393287, + "grad_norm": 3.9945662021636963, + "learning_rate": 9.917284314744245e-06, + "loss": 1.1079, + "step": 2074 + }, + { + "epoch": 0.525649145028499, + "grad_norm": 3.716646671295166, + "learning_rate": 9.91713247594796e-06, + "loss": 1.2796, + "step": 2075 + }, + { + "epoch": 0.5259024699176694, + "grad_norm": 3.8714919090270996, + "learning_rate": 9.9169804990809e-06, + "loss": 1.412, + "step": 2076 + }, + { + "epoch": 0.5261557948068398, + "grad_norm": 3.7069480419158936, + "learning_rate": 9.91682838414733e-06, + "loss": 1.243, + "step": 2077 + }, + { + "epoch": 0.5264091196960101, + "grad_norm": 3.471064329147339, + "learning_rate": 9.916676131151528e-06, + "loss": 1.2679, + "step": 2078 + }, + { + "epoch": 0.5266624445851805, + "grad_norm": 3.772437334060669, + "learning_rate": 9.91652374009776e-06, + "loss": 1.1777, + "step": 2079 + }, + { + "epoch": 0.5269157694743508, + "grad_norm": 4.003936290740967, + "learning_rate": 9.916371210990313e-06, + "loss": 1.2332, + "step": 2080 + }, + { + "epoch": 0.5271690943635212, + "grad_norm": 4.371222972869873, + "learning_rate": 9.916218543833464e-06, + "loss": 1.2212, + "step": 2081 + }, + { + "epoch": 0.5274224192526916, + "grad_norm": 3.849670648574829, + "learning_rate": 9.916065738631504e-06, + "loss": 1.3082, + "step": 2082 + }, + { + "epoch": 0.5276757441418619, + "grad_norm": 3.9164035320281982, + "learning_rate": 9.915912795388722e-06, + "loss": 1.2778, + "step": 2083 + }, + { + "epoch": 0.5279290690310323, + "grad_norm": 3.9645965099334717, + "learning_rate": 9.915759714109412e-06, + "loss": 1.2734, + "step": 2084 + }, + { + "epoch": 0.5281823939202026, + "grad_norm": 3.515418529510498, + "learning_rate": 9.915606494797874e-06, + "loss": 1.1672, + "step": 2085 + }, + { + "epoch": 0.528435718809373, + "grad_norm": 3.6071736812591553, + "learning_rate": 9.915453137458409e-06, + "loss": 1.2437, + "step": 2086 + }, + { + "epoch": 0.5286890436985434, + "grad_norm": 3.903167247772217, + "learning_rate": 9.915299642095323e-06, + "loss": 1.2654, + "step": 2087 + }, + { + "epoch": 0.5289423685877137, + "grad_norm": 3.7934257984161377, + "learning_rate": 9.915146008712928e-06, + "loss": 1.3173, + "step": 2088 + }, + { + "epoch": 0.5291956934768841, + "grad_norm": 3.4777708053588867, + "learning_rate": 9.914992237315535e-06, + "loss": 1.1638, + "step": 2089 + }, + { + "epoch": 0.5294490183660545, + "grad_norm": 3.345414161682129, + "learning_rate": 9.914838327907466e-06, + "loss": 1.2188, + "step": 2090 + }, + { + "epoch": 0.5297023432552248, + "grad_norm": 3.4722683429718018, + "learning_rate": 9.914684280493039e-06, + "loss": 1.1764, + "step": 2091 + }, + { + "epoch": 0.5299556681443952, + "grad_norm": 3.456489086151123, + "learning_rate": 9.91453009507658e-06, + "loss": 1.1269, + "step": 2092 + }, + { + "epoch": 0.5302089930335655, + "grad_norm": 3.9334988594055176, + "learning_rate": 9.91437577166242e-06, + "loss": 1.2108, + "step": 2093 + }, + { + "epoch": 0.5304623179227359, + "grad_norm": 3.5916223526000977, + "learning_rate": 9.914221310254892e-06, + "loss": 1.2412, + "step": 2094 + }, + { + "epoch": 0.5307156428119063, + "grad_norm": 3.69205904006958, + "learning_rate": 9.914066710858333e-06, + "loss": 1.251, + "step": 2095 + }, + { + "epoch": 0.5309689677010766, + "grad_norm": 4.0429463386535645, + "learning_rate": 9.913911973477082e-06, + "loss": 1.2754, + "step": 2096 + }, + { + "epoch": 0.531222292590247, + "grad_norm": 3.604990243911743, + "learning_rate": 9.913757098115488e-06, + "loss": 1.314, + "step": 2097 + }, + { + "epoch": 0.5314756174794174, + "grad_norm": 3.89520001411438, + "learning_rate": 9.913602084777896e-06, + "loss": 1.2807, + "step": 2098 + }, + { + "epoch": 0.5317289423685877, + "grad_norm": 3.5424606800079346, + "learning_rate": 9.913446933468661e-06, + "loss": 1.1046, + "step": 2099 + }, + { + "epoch": 0.5319822672577581, + "grad_norm": 4.084542274475098, + "learning_rate": 9.913291644192139e-06, + "loss": 1.366, + "step": 2100 + }, + { + "epoch": 0.5322355921469284, + "grad_norm": 3.751150608062744, + "learning_rate": 9.91313621695269e-06, + "loss": 1.2879, + "step": 2101 + }, + { + "epoch": 0.5324889170360988, + "grad_norm": 3.7791383266448975, + "learning_rate": 9.91298065175468e-06, + "loss": 1.2437, + "step": 2102 + }, + { + "epoch": 0.5327422419252692, + "grad_norm": 3.40948224067688, + "learning_rate": 9.912824948602474e-06, + "loss": 1.1915, + "step": 2103 + }, + { + "epoch": 0.5329955668144395, + "grad_norm": 3.428567886352539, + "learning_rate": 9.912669107500447e-06, + "loss": 1.0372, + "step": 2104 + }, + { + "epoch": 0.5332488917036099, + "grad_norm": 3.6524455547332764, + "learning_rate": 9.912513128452974e-06, + "loss": 1.2302, + "step": 2105 + }, + { + "epoch": 0.5335022165927802, + "grad_norm": 3.7702980041503906, + "learning_rate": 9.912357011464436e-06, + "loss": 1.2113, + "step": 2106 + }, + { + "epoch": 0.5337555414819506, + "grad_norm": 3.56003999710083, + "learning_rate": 9.912200756539211e-06, + "loss": 1.1612, + "step": 2107 + }, + { + "epoch": 0.534008866371121, + "grad_norm": 3.6964781284332275, + "learning_rate": 9.912044363681695e-06, + "loss": 1.1697, + "step": 2108 + }, + { + "epoch": 0.5342621912602913, + "grad_norm": 3.6726064682006836, + "learning_rate": 9.911887832896274e-06, + "loss": 1.1919, + "step": 2109 + }, + { + "epoch": 0.5345155161494617, + "grad_norm": 3.5990684032440186, + "learning_rate": 9.911731164187345e-06, + "loss": 1.1649, + "step": 2110 + }, + { + "epoch": 0.5347688410386321, + "grad_norm": 3.431689500808716, + "learning_rate": 9.911574357559308e-06, + "loss": 1.0571, + "step": 2111 + }, + { + "epoch": 0.5350221659278024, + "grad_norm": 3.707425594329834, + "learning_rate": 9.911417413016565e-06, + "loss": 1.1837, + "step": 2112 + }, + { + "epoch": 0.5352754908169728, + "grad_norm": 3.7318520545959473, + "learning_rate": 9.911260330563522e-06, + "loss": 1.3111, + "step": 2113 + }, + { + "epoch": 0.5355288157061431, + "grad_norm": 3.8392438888549805, + "learning_rate": 9.91110311020459e-06, + "loss": 1.1554, + "step": 2114 + }, + { + "epoch": 0.5357821405953135, + "grad_norm": 3.4397387504577637, + "learning_rate": 9.910945751944185e-06, + "loss": 1.1259, + "step": 2115 + }, + { + "epoch": 0.5360354654844839, + "grad_norm": 3.693002223968506, + "learning_rate": 9.910788255786725e-06, + "loss": 1.3216, + "step": 2116 + }, + { + "epoch": 0.5362887903736542, + "grad_norm": 3.924417018890381, + "learning_rate": 9.910630621736632e-06, + "loss": 1.3065, + "step": 2117 + }, + { + "epoch": 0.5365421152628246, + "grad_norm": 4.017768383026123, + "learning_rate": 9.910472849798333e-06, + "loss": 1.2996, + "step": 2118 + }, + { + "epoch": 0.536795440151995, + "grad_norm": 3.6276843547821045, + "learning_rate": 9.910314939976257e-06, + "loss": 1.1187, + "step": 2119 + }, + { + "epoch": 0.5370487650411653, + "grad_norm": 3.8386552333831787, + "learning_rate": 9.91015689227484e-06, + "loss": 1.2779, + "step": 2120 + }, + { + "epoch": 0.5373020899303357, + "grad_norm": 4.193774700164795, + "learning_rate": 9.909998706698519e-06, + "loss": 1.1943, + "step": 2121 + }, + { + "epoch": 0.537555414819506, + "grad_norm": 3.8841090202331543, + "learning_rate": 9.909840383251735e-06, + "loss": 1.1898, + "step": 2122 + }, + { + "epoch": 0.5378087397086764, + "grad_norm": 3.499729633331299, + "learning_rate": 9.909681921938934e-06, + "loss": 1.0828, + "step": 2123 + }, + { + "epoch": 0.5380620645978468, + "grad_norm": 3.5716724395751953, + "learning_rate": 9.909523322764568e-06, + "loss": 1.1872, + "step": 2124 + }, + { + "epoch": 0.5383153894870171, + "grad_norm": 3.674340009689331, + "learning_rate": 9.909364585733085e-06, + "loss": 1.2995, + "step": 2125 + }, + { + "epoch": 0.5385687143761875, + "grad_norm": 3.615060567855835, + "learning_rate": 9.909205710848945e-06, + "loss": 1.1923, + "step": 2126 + }, + { + "epoch": 0.5388220392653578, + "grad_norm": 3.3412909507751465, + "learning_rate": 9.90904669811661e-06, + "loss": 1.1698, + "step": 2127 + }, + { + "epoch": 0.5390753641545282, + "grad_norm": 4.253294944763184, + "learning_rate": 9.908887547540546e-06, + "loss": 1.271, + "step": 2128 + }, + { + "epoch": 0.5393286890436986, + "grad_norm": 3.6456515789031982, + "learning_rate": 9.90872825912522e-06, + "loss": 1.2528, + "step": 2129 + }, + { + "epoch": 0.5395820139328689, + "grad_norm": 3.7424333095550537, + "learning_rate": 9.908568832875104e-06, + "loss": 1.2329, + "step": 2130 + }, + { + "epoch": 0.5398353388220393, + "grad_norm": 3.7191965579986572, + "learning_rate": 9.908409268794677e-06, + "loss": 1.1287, + "step": 2131 + }, + { + "epoch": 0.5400886637112097, + "grad_norm": 3.479968786239624, + "learning_rate": 9.908249566888416e-06, + "loss": 1.1933, + "step": 2132 + }, + { + "epoch": 0.54034198860038, + "grad_norm": 3.8091413974761963, + "learning_rate": 9.90808972716081e-06, + "loss": 1.2426, + "step": 2133 + }, + { + "epoch": 0.5405953134895504, + "grad_norm": 3.447298765182495, + "learning_rate": 9.907929749616345e-06, + "loss": 1.2251, + "step": 2134 + }, + { + "epoch": 0.5408486383787207, + "grad_norm": 3.6902029514312744, + "learning_rate": 9.907769634259511e-06, + "loss": 1.2301, + "step": 2135 + }, + { + "epoch": 0.5411019632678911, + "grad_norm": 3.4941325187683105, + "learning_rate": 9.907609381094807e-06, + "loss": 1.1992, + "step": 2136 + }, + { + "epoch": 0.5413552881570615, + "grad_norm": 3.628161668777466, + "learning_rate": 9.907448990126732e-06, + "loss": 1.1705, + "step": 2137 + }, + { + "epoch": 0.5416086130462318, + "grad_norm": 4.226117134094238, + "learning_rate": 9.907288461359788e-06, + "loss": 1.3543, + "step": 2138 + }, + { + "epoch": 0.5418619379354022, + "grad_norm": 3.90450382232666, + "learning_rate": 9.907127794798483e-06, + "loss": 1.2767, + "step": 2139 + }, + { + "epoch": 0.5421152628245726, + "grad_norm": 3.9196813106536865, + "learning_rate": 9.906966990447332e-06, + "loss": 1.2252, + "step": 2140 + }, + { + "epoch": 0.5423685877137429, + "grad_norm": 4.277498245239258, + "learning_rate": 9.906806048310847e-06, + "loss": 1.257, + "step": 2141 + }, + { + "epoch": 0.5426219126029133, + "grad_norm": 3.480289936065674, + "learning_rate": 9.906644968393546e-06, + "loss": 1.2312, + "step": 2142 + }, + { + "epoch": 0.5428752374920836, + "grad_norm": 3.5467991828918457, + "learning_rate": 9.906483750699955e-06, + "loss": 1.1649, + "step": 2143 + }, + { + "epoch": 0.543128562381254, + "grad_norm": 4.222132682800293, + "learning_rate": 9.906322395234601e-06, + "loss": 1.348, + "step": 2144 + }, + { + "epoch": 0.5433818872704244, + "grad_norm": 4.237131118774414, + "learning_rate": 9.906160902002013e-06, + "loss": 1.1572, + "step": 2145 + }, + { + "epoch": 0.5436352121595946, + "grad_norm": 4.146684646606445, + "learning_rate": 9.905999271006726e-06, + "loss": 1.3532, + "step": 2146 + }, + { + "epoch": 0.543888537048765, + "grad_norm": 3.6279287338256836, + "learning_rate": 9.905837502253279e-06, + "loss": 1.1156, + "step": 2147 + }, + { + "epoch": 0.5441418619379355, + "grad_norm": 3.8298258781433105, + "learning_rate": 9.905675595746214e-06, + "loss": 1.2658, + "step": 2148 + }, + { + "epoch": 0.5443951868271057, + "grad_norm": 3.568838119506836, + "learning_rate": 9.905513551490078e-06, + "loss": 1.282, + "step": 2149 + }, + { + "epoch": 0.5446485117162762, + "grad_norm": 3.4449048042297363, + "learning_rate": 9.90535136948942e-06, + "loss": 1.3066, + "step": 2150 + }, + { + "epoch": 0.5449018366054464, + "grad_norm": 3.561124801635742, + "learning_rate": 9.905189049748796e-06, + "loss": 1.0653, + "step": 2151 + }, + { + "epoch": 0.5451551614946168, + "grad_norm": 3.7354698181152344, + "learning_rate": 9.905026592272759e-06, + "loss": 1.2388, + "step": 2152 + }, + { + "epoch": 0.5454084863837872, + "grad_norm": 3.690352439880371, + "learning_rate": 9.904863997065878e-06, + "loss": 1.4449, + "step": 2153 + }, + { + "epoch": 0.5456618112729575, + "grad_norm": 3.964592218399048, + "learning_rate": 9.904701264132713e-06, + "loss": 1.329, + "step": 2154 + }, + { + "epoch": 0.545915136162128, + "grad_norm": 3.7120819091796875, + "learning_rate": 9.904538393477835e-06, + "loss": 1.1677, + "step": 2155 + }, + { + "epoch": 0.5461684610512982, + "grad_norm": 3.5068633556365967, + "learning_rate": 9.904375385105818e-06, + "loss": 1.1439, + "step": 2156 + }, + { + "epoch": 0.5464217859404686, + "grad_norm": 3.9299685955047607, + "learning_rate": 9.904212239021238e-06, + "loss": 1.1888, + "step": 2157 + }, + { + "epoch": 0.546675110829639, + "grad_norm": 4.307981967926025, + "learning_rate": 9.904048955228677e-06, + "loss": 1.3852, + "step": 2158 + }, + { + "epoch": 0.5469284357188093, + "grad_norm": 4.037630558013916, + "learning_rate": 9.903885533732722e-06, + "loss": 1.3964, + "step": 2159 + }, + { + "epoch": 0.5471817606079797, + "grad_norm": 3.4218804836273193, + "learning_rate": 9.903721974537956e-06, + "loss": 1.0959, + "step": 2160 + }, + { + "epoch": 0.5474350854971501, + "grad_norm": 3.514329671859741, + "learning_rate": 9.903558277648979e-06, + "loss": 1.1461, + "step": 2161 + }, + { + "epoch": 0.5476884103863204, + "grad_norm": 3.447237730026245, + "learning_rate": 9.903394443070381e-06, + "loss": 1.1734, + "step": 2162 + }, + { + "epoch": 0.5479417352754908, + "grad_norm": 3.6662721633911133, + "learning_rate": 9.903230470806766e-06, + "loss": 1.2235, + "step": 2163 + }, + { + "epoch": 0.5481950601646611, + "grad_norm": 3.6063079833984375, + "learning_rate": 9.903066360862736e-06, + "loss": 1.2365, + "step": 2164 + }, + { + "epoch": 0.5484483850538315, + "grad_norm": 3.9934699535369873, + "learning_rate": 9.902902113242903e-06, + "loss": 1.3201, + "step": 2165 + }, + { + "epoch": 0.5487017099430019, + "grad_norm": 3.5313923358917236, + "learning_rate": 9.902737727951876e-06, + "loss": 1.1839, + "step": 2166 + }, + { + "epoch": 0.5489550348321722, + "grad_norm": 3.5758824348449707, + "learning_rate": 9.90257320499427e-06, + "loss": 1.2516, + "step": 2167 + }, + { + "epoch": 0.5492083597213426, + "grad_norm": 3.4398324489593506, + "learning_rate": 9.902408544374706e-06, + "loss": 1.2804, + "step": 2168 + }, + { + "epoch": 0.549461684610513, + "grad_norm": 3.7697672843933105, + "learning_rate": 9.90224374609781e-06, + "loss": 1.2488, + "step": 2169 + }, + { + "epoch": 0.5497150094996833, + "grad_norm": 3.538827419281006, + "learning_rate": 9.902078810168206e-06, + "loss": 1.2351, + "step": 2170 + }, + { + "epoch": 0.5499683343888537, + "grad_norm": 3.9213740825653076, + "learning_rate": 9.901913736590527e-06, + "loss": 1.4626, + "step": 2171 + }, + { + "epoch": 0.550221659278024, + "grad_norm": 3.5800392627716064, + "learning_rate": 9.901748525369406e-06, + "loss": 1.1471, + "step": 2172 + }, + { + "epoch": 0.5504749841671944, + "grad_norm": 4.1332807540893555, + "learning_rate": 9.901583176509485e-06, + "loss": 1.3493, + "step": 2173 + }, + { + "epoch": 0.5507283090563648, + "grad_norm": 3.9190592765808105, + "learning_rate": 9.901417690015405e-06, + "loss": 1.3066, + "step": 2174 + }, + { + "epoch": 0.5509816339455351, + "grad_norm": 3.843719244003296, + "learning_rate": 9.901252065891814e-06, + "loss": 1.1864, + "step": 2175 + }, + { + "epoch": 0.5512349588347055, + "grad_norm": 3.5053625106811523, + "learning_rate": 9.90108630414336e-06, + "loss": 1.1446, + "step": 2176 + }, + { + "epoch": 0.5514882837238758, + "grad_norm": 3.4328432083129883, + "learning_rate": 9.900920404774703e-06, + "loss": 1.1201, + "step": 2177 + }, + { + "epoch": 0.5517416086130462, + "grad_norm": 4.106205463409424, + "learning_rate": 9.900754367790497e-06, + "loss": 1.3857, + "step": 2178 + }, + { + "epoch": 0.5519949335022166, + "grad_norm": 3.870600461959839, + "learning_rate": 9.900588193195405e-06, + "loss": 1.2029, + "step": 2179 + }, + { + "epoch": 0.5522482583913869, + "grad_norm": 3.6704447269439697, + "learning_rate": 9.900421880994093e-06, + "loss": 1.3875, + "step": 2180 + }, + { + "epoch": 0.5525015832805573, + "grad_norm": 3.882249593734741, + "learning_rate": 9.900255431191232e-06, + "loss": 1.2162, + "step": 2181 + }, + { + "epoch": 0.5527549081697277, + "grad_norm": 3.450150966644287, + "learning_rate": 9.900088843791494e-06, + "loss": 1.0647, + "step": 2182 + }, + { + "epoch": 0.553008233058898, + "grad_norm": 4.354090690612793, + "learning_rate": 9.899922118799559e-06, + "loss": 1.2928, + "step": 2183 + }, + { + "epoch": 0.5532615579480684, + "grad_norm": 3.895104169845581, + "learning_rate": 9.899755256220107e-06, + "loss": 1.1778, + "step": 2184 + }, + { + "epoch": 0.5535148828372387, + "grad_norm": 3.9388809204101562, + "learning_rate": 9.899588256057824e-06, + "loss": 1.234, + "step": 2185 + }, + { + "epoch": 0.5537682077264091, + "grad_norm": 3.7990224361419678, + "learning_rate": 9.899421118317399e-06, + "loss": 1.2266, + "step": 2186 + }, + { + "epoch": 0.5540215326155795, + "grad_norm": 4.169088840484619, + "learning_rate": 9.899253843003525e-06, + "loss": 1.1988, + "step": 2187 + }, + { + "epoch": 0.5542748575047498, + "grad_norm": 3.5473103523254395, + "learning_rate": 9.899086430120898e-06, + "loss": 1.1201, + "step": 2188 + }, + { + "epoch": 0.5545281823939202, + "grad_norm": 3.514629364013672, + "learning_rate": 9.898918879674223e-06, + "loss": 1.0495, + "step": 2189 + }, + { + "epoch": 0.5547815072830906, + "grad_norm": 3.4299962520599365, + "learning_rate": 9.8987511916682e-06, + "loss": 1.0817, + "step": 2190 + }, + { + "epoch": 0.5550348321722609, + "grad_norm": 3.9319610595703125, + "learning_rate": 9.898583366107539e-06, + "loss": 1.2688, + "step": 2191 + }, + { + "epoch": 0.5552881570614313, + "grad_norm": 3.759704113006592, + "learning_rate": 9.898415402996952e-06, + "loss": 1.2093, + "step": 2192 + }, + { + "epoch": 0.5555414819506016, + "grad_norm": 3.3724687099456787, + "learning_rate": 9.898247302341158e-06, + "loss": 1.1356, + "step": 2193 + }, + { + "epoch": 0.555794806839772, + "grad_norm": 3.661829948425293, + "learning_rate": 9.898079064144877e-06, + "loss": 1.2271, + "step": 2194 + }, + { + "epoch": 0.5560481317289424, + "grad_norm": 3.6634035110473633, + "learning_rate": 9.897910688412829e-06, + "loss": 1.2451, + "step": 2195 + }, + { + "epoch": 0.5563014566181127, + "grad_norm": 3.894113302230835, + "learning_rate": 9.897742175149746e-06, + "loss": 1.1522, + "step": 2196 + }, + { + "epoch": 0.5565547815072831, + "grad_norm": 3.4600610733032227, + "learning_rate": 9.897573524360357e-06, + "loss": 1.1344, + "step": 2197 + }, + { + "epoch": 0.5568081063964535, + "grad_norm": 3.658263683319092, + "learning_rate": 9.897404736049399e-06, + "loss": 1.26, + "step": 2198 + }, + { + "epoch": 0.5570614312856238, + "grad_norm": 3.899393081665039, + "learning_rate": 9.897235810221612e-06, + "loss": 1.2925, + "step": 2199 + }, + { + "epoch": 0.5573147561747942, + "grad_norm": 3.949037790298462, + "learning_rate": 9.897066746881738e-06, + "loss": 1.1514, + "step": 2200 + }, + { + "epoch": 0.5575680810639645, + "grad_norm": 3.4396145343780518, + "learning_rate": 9.896897546034524e-06, + "loss": 1.1929, + "step": 2201 + }, + { + "epoch": 0.5578214059531349, + "grad_norm": 3.182199239730835, + "learning_rate": 9.896728207684724e-06, + "loss": 1.1388, + "step": 2202 + }, + { + "epoch": 0.5580747308423053, + "grad_norm": 3.5556013584136963, + "learning_rate": 9.89655873183709e-06, + "loss": 1.202, + "step": 2203 + }, + { + "epoch": 0.5583280557314756, + "grad_norm": 3.7224278450012207, + "learning_rate": 9.896389118496381e-06, + "loss": 1.1746, + "step": 2204 + }, + { + "epoch": 0.558581380620646, + "grad_norm": 4.2328362464904785, + "learning_rate": 9.896219367667362e-06, + "loss": 1.3125, + "step": 2205 + }, + { + "epoch": 0.5588347055098163, + "grad_norm": 4.193203926086426, + "learning_rate": 9.896049479354797e-06, + "loss": 1.2244, + "step": 2206 + }, + { + "epoch": 0.5590880303989867, + "grad_norm": 3.8881187438964844, + "learning_rate": 9.895879453563457e-06, + "loss": 1.1759, + "step": 2207 + }, + { + "epoch": 0.5593413552881571, + "grad_norm": 3.405766725540161, + "learning_rate": 9.895709290298117e-06, + "loss": 1.2021, + "step": 2208 + }, + { + "epoch": 0.5595946801773274, + "grad_norm": 3.778693437576294, + "learning_rate": 9.895538989563555e-06, + "loss": 1.1735, + "step": 2209 + }, + { + "epoch": 0.5598480050664978, + "grad_norm": 4.0944929122924805, + "learning_rate": 9.895368551364551e-06, + "loss": 1.3031, + "step": 2210 + }, + { + "epoch": 0.5601013299556682, + "grad_norm": 3.705367088317871, + "learning_rate": 9.895197975705895e-06, + "loss": 1.1974, + "step": 2211 + }, + { + "epoch": 0.5603546548448385, + "grad_norm": 3.4504377841949463, + "learning_rate": 9.895027262592372e-06, + "loss": 1.1317, + "step": 2212 + }, + { + "epoch": 0.5606079797340089, + "grad_norm": 3.8827710151672363, + "learning_rate": 9.894856412028778e-06, + "loss": 1.1375, + "step": 2213 + }, + { + "epoch": 0.5608613046231792, + "grad_norm": 4.1285858154296875, + "learning_rate": 9.89468542401991e-06, + "loss": 1.3678, + "step": 2214 + }, + { + "epoch": 0.5611146295123496, + "grad_norm": 3.390394449234009, + "learning_rate": 9.89451429857057e-06, + "loss": 1.2629, + "step": 2215 + }, + { + "epoch": 0.56136795440152, + "grad_norm": 3.5470921993255615, + "learning_rate": 9.894343035685561e-06, + "loss": 1.2365, + "step": 2216 + }, + { + "epoch": 0.5616212792906903, + "grad_norm": 3.7412421703338623, + "learning_rate": 9.894171635369697e-06, + "loss": 1.3165, + "step": 2217 + }, + { + "epoch": 0.5618746041798607, + "grad_norm": 3.2183799743652344, + "learning_rate": 9.894000097627783e-06, + "loss": 1.1686, + "step": 2218 + }, + { + "epoch": 0.5621279290690311, + "grad_norm": 3.583022356033325, + "learning_rate": 9.893828422464642e-06, + "loss": 1.1512, + "step": 2219 + }, + { + "epoch": 0.5623812539582014, + "grad_norm": 3.8245561122894287, + "learning_rate": 9.893656609885092e-06, + "loss": 1.1414, + "step": 2220 + }, + { + "epoch": 0.5626345788473718, + "grad_norm": 3.6685290336608887, + "learning_rate": 9.89348465989396e-06, + "loss": 1.2011, + "step": 2221 + }, + { + "epoch": 0.5628879037365421, + "grad_norm": 3.8434760570526123, + "learning_rate": 9.89331257249607e-06, + "loss": 1.1604, + "step": 2222 + }, + { + "epoch": 0.5631412286257125, + "grad_norm": 3.6513028144836426, + "learning_rate": 9.893140347696257e-06, + "loss": 1.1783, + "step": 2223 + }, + { + "epoch": 0.5633945535148829, + "grad_norm": 3.6049387454986572, + "learning_rate": 9.892967985499356e-06, + "loss": 1.1097, + "step": 2224 + }, + { + "epoch": 0.5636478784040532, + "grad_norm": 3.929824113845825, + "learning_rate": 9.892795485910207e-06, + "loss": 1.1884, + "step": 2225 + }, + { + "epoch": 0.5639012032932236, + "grad_norm": 3.6172595024108887, + "learning_rate": 9.892622848933653e-06, + "loss": 1.186, + "step": 2226 + }, + { + "epoch": 0.5641545281823939, + "grad_norm": 3.9310948848724365, + "learning_rate": 9.892450074574545e-06, + "loss": 1.1807, + "step": 2227 + }, + { + "epoch": 0.5644078530715643, + "grad_norm": 3.604039430618286, + "learning_rate": 9.892277162837731e-06, + "loss": 1.0479, + "step": 2228 + }, + { + "epoch": 0.5646611779607347, + "grad_norm": 3.72631573677063, + "learning_rate": 9.892104113728065e-06, + "loss": 1.2094, + "step": 2229 + }, + { + "epoch": 0.564914502849905, + "grad_norm": 3.7389190196990967, + "learning_rate": 9.891930927250411e-06, + "loss": 1.1695, + "step": 2230 + }, + { + "epoch": 0.5651678277390754, + "grad_norm": 3.9570655822753906, + "learning_rate": 9.891757603409629e-06, + "loss": 1.1978, + "step": 2231 + }, + { + "epoch": 0.5654211526282458, + "grad_norm": 3.9663290977478027, + "learning_rate": 9.891584142210584e-06, + "loss": 1.2631, + "step": 2232 + }, + { + "epoch": 0.5656744775174161, + "grad_norm": 3.927863121032715, + "learning_rate": 9.89141054365815e-06, + "loss": 1.1389, + "step": 2233 + }, + { + "epoch": 0.5659278024065865, + "grad_norm": 3.885694742202759, + "learning_rate": 9.891236807757201e-06, + "loss": 1.2052, + "step": 2234 + }, + { + "epoch": 0.5661811272957568, + "grad_norm": 3.5142667293548584, + "learning_rate": 9.891062934512615e-06, + "loss": 1.1599, + "step": 2235 + }, + { + "epoch": 0.5664344521849272, + "grad_norm": 3.523827314376831, + "learning_rate": 9.890888923929274e-06, + "loss": 1.245, + "step": 2236 + }, + { + "epoch": 0.5666877770740976, + "grad_norm": 3.3272361755371094, + "learning_rate": 9.890714776012065e-06, + "loss": 1.0359, + "step": 2237 + }, + { + "epoch": 0.5669411019632679, + "grad_norm": 3.374920606613159, + "learning_rate": 9.890540490765876e-06, + "loss": 1.2326, + "step": 2238 + }, + { + "epoch": 0.5671944268524383, + "grad_norm": 3.8165230751037598, + "learning_rate": 9.890366068195603e-06, + "loss": 1.3016, + "step": 2239 + }, + { + "epoch": 0.5674477517416087, + "grad_norm": 3.5627212524414062, + "learning_rate": 9.890191508306141e-06, + "loss": 1.2669, + "step": 2240 + }, + { + "epoch": 0.567701076630779, + "grad_norm": 3.4874629974365234, + "learning_rate": 9.890016811102395e-06, + "loss": 1.1789, + "step": 2241 + }, + { + "epoch": 0.5679544015199494, + "grad_norm": 3.576786994934082, + "learning_rate": 9.889841976589268e-06, + "loss": 1.0605, + "step": 2242 + }, + { + "epoch": 0.5682077264091197, + "grad_norm": 3.454319953918457, + "learning_rate": 9.88966700477167e-06, + "loss": 1.2683, + "step": 2243 + }, + { + "epoch": 0.56846105129829, + "grad_norm": 3.5707461833953857, + "learning_rate": 9.889491895654515e-06, + "loss": 1.3246, + "step": 2244 + }, + { + "epoch": 0.5687143761874605, + "grad_norm": 3.5469753742218018, + "learning_rate": 9.88931664924272e-06, + "loss": 1.109, + "step": 2245 + }, + { + "epoch": 0.5689677010766307, + "grad_norm": 3.709312915802002, + "learning_rate": 9.889141265541202e-06, + "loss": 1.2081, + "step": 2246 + }, + { + "epoch": 0.5692210259658012, + "grad_norm": 3.820150375366211, + "learning_rate": 9.888965744554892e-06, + "loss": 1.2152, + "step": 2247 + }, + { + "epoch": 0.5694743508549716, + "grad_norm": 3.6582584381103516, + "learning_rate": 9.888790086288714e-06, + "loss": 1.2377, + "step": 2248 + }, + { + "epoch": 0.5697276757441418, + "grad_norm": 3.495410442352295, + "learning_rate": 9.8886142907476e-06, + "loss": 1.1192, + "step": 2249 + }, + { + "epoch": 0.5699810006333123, + "grad_norm": 3.2466015815734863, + "learning_rate": 9.88843835793649e-06, + "loss": 1.1125, + "step": 2250 + }, + { + "epoch": 0.5702343255224825, + "grad_norm": 3.685121536254883, + "learning_rate": 9.888262287860321e-06, + "loss": 1.1283, + "step": 2251 + }, + { + "epoch": 0.570487650411653, + "grad_norm": 3.65809965133667, + "learning_rate": 9.888086080524039e-06, + "loss": 1.3646, + "step": 2252 + }, + { + "epoch": 0.5707409753008233, + "grad_norm": 3.577664852142334, + "learning_rate": 9.88790973593259e-06, + "loss": 1.239, + "step": 2253 + }, + { + "epoch": 0.5709943001899936, + "grad_norm": 3.3043811321258545, + "learning_rate": 9.887733254090925e-06, + "loss": 1.1543, + "step": 2254 + }, + { + "epoch": 0.571247625079164, + "grad_norm": 3.625861406326294, + "learning_rate": 9.887556635004003e-06, + "loss": 1.1222, + "step": 2255 + }, + { + "epoch": 0.5715009499683343, + "grad_norm": 3.6544387340545654, + "learning_rate": 9.887379878676782e-06, + "loss": 1.2513, + "step": 2256 + }, + { + "epoch": 0.5717542748575047, + "grad_norm": 3.5436434745788574, + "learning_rate": 9.887202985114223e-06, + "loss": 1.199, + "step": 2257 + }, + { + "epoch": 0.5720075997466751, + "grad_norm": 3.650327205657959, + "learning_rate": 9.887025954321295e-06, + "loss": 1.147, + "step": 2258 + }, + { + "epoch": 0.5722609246358454, + "grad_norm": 3.910425901412964, + "learning_rate": 9.88684878630297e-06, + "loss": 1.3222, + "step": 2259 + }, + { + "epoch": 0.5725142495250158, + "grad_norm": 4.051957130432129, + "learning_rate": 9.88667148106422e-06, + "loss": 1.2572, + "step": 2260 + }, + { + "epoch": 0.5727675744141862, + "grad_norm": 3.374309778213501, + "learning_rate": 9.886494038610025e-06, + "loss": 1.1578, + "step": 2261 + }, + { + "epoch": 0.5730208993033565, + "grad_norm": 3.7524333000183105, + "learning_rate": 9.886316458945367e-06, + "loss": 1.2844, + "step": 2262 + }, + { + "epoch": 0.5732742241925269, + "grad_norm": 3.5143179893493652, + "learning_rate": 9.886138742075235e-06, + "loss": 1.1891, + "step": 2263 + }, + { + "epoch": 0.5735275490816972, + "grad_norm": 3.6519126892089844, + "learning_rate": 9.885960888004616e-06, + "loss": 1.1665, + "step": 2264 + }, + { + "epoch": 0.5737808739708676, + "grad_norm": 3.3841686248779297, + "learning_rate": 9.885782896738504e-06, + "loss": 1.2329, + "step": 2265 + }, + { + "epoch": 0.574034198860038, + "grad_norm": 3.453477621078491, + "learning_rate": 9.8856047682819e-06, + "loss": 1.0778, + "step": 2266 + }, + { + "epoch": 0.5742875237492083, + "grad_norm": 3.317171335220337, + "learning_rate": 9.885426502639803e-06, + "loss": 1.1258, + "step": 2267 + }, + { + "epoch": 0.5745408486383787, + "grad_norm": 3.9248321056365967, + "learning_rate": 9.88524809981722e-06, + "loss": 1.2575, + "step": 2268 + }, + { + "epoch": 0.5747941735275491, + "grad_norm": 3.7087948322296143, + "learning_rate": 9.88506955981916e-06, + "loss": 1.0535, + "step": 2269 + }, + { + "epoch": 0.5750474984167194, + "grad_norm": 3.3787403106689453, + "learning_rate": 9.884890882650635e-06, + "loss": 1.1185, + "step": 2270 + }, + { + "epoch": 0.5753008233058898, + "grad_norm": 3.9463608264923096, + "learning_rate": 9.884712068316665e-06, + "loss": 1.2782, + "step": 2271 + }, + { + "epoch": 0.5755541481950601, + "grad_norm": 3.6454262733459473, + "learning_rate": 9.88453311682227e-06, + "loss": 1.071, + "step": 2272 + }, + { + "epoch": 0.5758074730842305, + "grad_norm": 3.7849459648132324, + "learning_rate": 9.884354028172472e-06, + "loss": 1.0822, + "step": 2273 + }, + { + "epoch": 0.5760607979734009, + "grad_norm": 3.8799235820770264, + "learning_rate": 9.884174802372303e-06, + "loss": 1.1646, + "step": 2274 + }, + { + "epoch": 0.5763141228625712, + "grad_norm": 3.6872293949127197, + "learning_rate": 9.883995439426797e-06, + "loss": 1.2012, + "step": 2275 + }, + { + "epoch": 0.5765674477517416, + "grad_norm": 3.7613775730133057, + "learning_rate": 9.883815939340985e-06, + "loss": 1.1613, + "step": 2276 + }, + { + "epoch": 0.5768207726409119, + "grad_norm": 3.8921945095062256, + "learning_rate": 9.883636302119911e-06, + "loss": 1.4246, + "step": 2277 + }, + { + "epoch": 0.5770740975300823, + "grad_norm": 3.689805746078491, + "learning_rate": 9.88345652776862e-06, + "loss": 1.1549, + "step": 2278 + }, + { + "epoch": 0.5773274224192527, + "grad_norm": 3.6792287826538086, + "learning_rate": 9.883276616292157e-06, + "loss": 1.1565, + "step": 2279 + }, + { + "epoch": 0.577580747308423, + "grad_norm": 3.658616781234741, + "learning_rate": 9.883096567695575e-06, + "loss": 1.333, + "step": 2280 + }, + { + "epoch": 0.5778340721975934, + "grad_norm": 3.7376174926757812, + "learning_rate": 9.882916381983931e-06, + "loss": 1.2714, + "step": 2281 + }, + { + "epoch": 0.5780873970867638, + "grad_norm": 3.3016133308410645, + "learning_rate": 9.882736059162283e-06, + "loss": 1.043, + "step": 2282 + }, + { + "epoch": 0.5783407219759341, + "grad_norm": 3.7895755767822266, + "learning_rate": 9.882555599235694e-06, + "loss": 1.227, + "step": 2283 + }, + { + "epoch": 0.5785940468651045, + "grad_norm": 3.860079765319824, + "learning_rate": 9.882375002209235e-06, + "loss": 1.1046, + "step": 2284 + }, + { + "epoch": 0.5788473717542748, + "grad_norm": 3.4295051097869873, + "learning_rate": 9.882194268087973e-06, + "loss": 1.044, + "step": 2285 + }, + { + "epoch": 0.5791006966434452, + "grad_norm": 3.6967933177948, + "learning_rate": 9.882013396876983e-06, + "loss": 1.1913, + "step": 2286 + }, + { + "epoch": 0.5793540215326156, + "grad_norm": 3.7292520999908447, + "learning_rate": 9.881832388581345e-06, + "loss": 1.2879, + "step": 2287 + }, + { + "epoch": 0.5796073464217859, + "grad_norm": 3.2542896270751953, + "learning_rate": 9.881651243206142e-06, + "loss": 1.0997, + "step": 2288 + }, + { + "epoch": 0.5798606713109563, + "grad_norm": 3.7655575275421143, + "learning_rate": 9.88146996075646e-06, + "loss": 1.2617, + "step": 2289 + }, + { + "epoch": 0.5801139962001267, + "grad_norm": 3.836667060852051, + "learning_rate": 9.881288541237389e-06, + "loss": 1.2397, + "step": 2290 + }, + { + "epoch": 0.580367321089297, + "grad_norm": 3.9770896434783936, + "learning_rate": 9.881106984654025e-06, + "loss": 1.227, + "step": 2291 + }, + { + "epoch": 0.5806206459784674, + "grad_norm": 4.11192512512207, + "learning_rate": 9.880925291011463e-06, + "loss": 1.1607, + "step": 2292 + }, + { + "epoch": 0.5808739708676377, + "grad_norm": 3.859837532043457, + "learning_rate": 9.880743460314806e-06, + "loss": 1.1955, + "step": 2293 + }, + { + "epoch": 0.5811272957568081, + "grad_norm": 3.834381580352783, + "learning_rate": 9.88056149256916e-06, + "loss": 1.3425, + "step": 2294 + }, + { + "epoch": 0.5813806206459785, + "grad_norm": 3.7988929748535156, + "learning_rate": 9.880379387779637e-06, + "loss": 1.1974, + "step": 2295 + }, + { + "epoch": 0.5816339455351488, + "grad_norm": 3.9919984340667725, + "learning_rate": 9.880197145951347e-06, + "loss": 1.1925, + "step": 2296 + }, + { + "epoch": 0.5818872704243192, + "grad_norm": 3.8683440685272217, + "learning_rate": 9.880014767089407e-06, + "loss": 1.2232, + "step": 2297 + }, + { + "epoch": 0.5821405953134895, + "grad_norm": 3.4029898643493652, + "learning_rate": 9.879832251198941e-06, + "loss": 1.2023, + "step": 2298 + }, + { + "epoch": 0.5823939202026599, + "grad_norm": 3.969633102416992, + "learning_rate": 9.879649598285073e-06, + "loss": 1.2331, + "step": 2299 + }, + { + "epoch": 0.5826472450918303, + "grad_norm": 4.044780731201172, + "learning_rate": 9.879466808352928e-06, + "loss": 1.3385, + "step": 2300 + }, + { + "epoch": 0.5829005699810006, + "grad_norm": 3.8353214263916016, + "learning_rate": 9.879283881407645e-06, + "loss": 1.2206, + "step": 2301 + }, + { + "epoch": 0.583153894870171, + "grad_norm": 3.7752926349639893, + "learning_rate": 9.879100817454357e-06, + "loss": 1.2771, + "step": 2302 + }, + { + "epoch": 0.5834072197593414, + "grad_norm": 3.4767112731933594, + "learning_rate": 9.878917616498205e-06, + "loss": 1.2505, + "step": 2303 + }, + { + "epoch": 0.5836605446485117, + "grad_norm": 3.8391661643981934, + "learning_rate": 9.878734278544332e-06, + "loss": 1.3158, + "step": 2304 + }, + { + "epoch": 0.5839138695376821, + "grad_norm": 3.89296555519104, + "learning_rate": 9.878550803597888e-06, + "loss": 1.1653, + "step": 2305 + }, + { + "epoch": 0.5841671944268524, + "grad_norm": 3.8957955837249756, + "learning_rate": 9.878367191664022e-06, + "loss": 1.2505, + "step": 2306 + }, + { + "epoch": 0.5844205193160228, + "grad_norm": 3.8498334884643555, + "learning_rate": 9.878183442747892e-06, + "loss": 1.2245, + "step": 2307 + }, + { + "epoch": 0.5846738442051932, + "grad_norm": 3.7277896404266357, + "learning_rate": 9.87799955685466e-06, + "loss": 1.2585, + "step": 2308 + }, + { + "epoch": 0.5849271690943635, + "grad_norm": 3.3101165294647217, + "learning_rate": 9.877815533989484e-06, + "loss": 1.0909, + "step": 2309 + }, + { + "epoch": 0.5851804939835339, + "grad_norm": 3.9764208793640137, + "learning_rate": 9.877631374157534e-06, + "loss": 1.1819, + "step": 2310 + }, + { + "epoch": 0.5854338188727043, + "grad_norm": 3.7967846393585205, + "learning_rate": 9.877447077363983e-06, + "loss": 1.2923, + "step": 2311 + }, + { + "epoch": 0.5856871437618746, + "grad_norm": 3.9297142028808594, + "learning_rate": 9.877262643614003e-06, + "loss": 1.1912, + "step": 2312 + }, + { + "epoch": 0.585940468651045, + "grad_norm": 3.7486352920532227, + "learning_rate": 9.877078072912773e-06, + "loss": 1.121, + "step": 2313 + }, + { + "epoch": 0.5861937935402153, + "grad_norm": 3.9516937732696533, + "learning_rate": 9.876893365265477e-06, + "loss": 1.3066, + "step": 2314 + }, + { + "epoch": 0.5864471184293857, + "grad_norm": 3.692528247833252, + "learning_rate": 9.876708520677302e-06, + "loss": 1.2561, + "step": 2315 + }, + { + "epoch": 0.5867004433185561, + "grad_norm": 3.739330530166626, + "learning_rate": 9.876523539153436e-06, + "loss": 1.1427, + "step": 2316 + }, + { + "epoch": 0.5869537682077264, + "grad_norm": 3.611651659011841, + "learning_rate": 9.876338420699076e-06, + "loss": 1.2367, + "step": 2317 + }, + { + "epoch": 0.5872070930968968, + "grad_norm": 3.9292173385620117, + "learning_rate": 9.876153165319417e-06, + "loss": 1.3312, + "step": 2318 + }, + { + "epoch": 0.5874604179860672, + "grad_norm": 3.6973323822021484, + "learning_rate": 9.875967773019664e-06, + "loss": 1.2351, + "step": 2319 + }, + { + "epoch": 0.5877137428752375, + "grad_norm": 3.271484613418579, + "learning_rate": 9.875782243805019e-06, + "loss": 1.1391, + "step": 2320 + }, + { + "epoch": 0.5879670677644079, + "grad_norm": 3.175833225250244, + "learning_rate": 9.875596577680695e-06, + "loss": 1.0855, + "step": 2321 + }, + { + "epoch": 0.5882203926535782, + "grad_norm": 3.5260026454925537, + "learning_rate": 9.875410774651903e-06, + "loss": 1.1706, + "step": 2322 + }, + { + "epoch": 0.5884737175427486, + "grad_norm": 3.352823257446289, + "learning_rate": 9.875224834723865e-06, + "loss": 1.2847, + "step": 2323 + }, + { + "epoch": 0.588727042431919, + "grad_norm": 3.724398374557495, + "learning_rate": 9.875038757901795e-06, + "loss": 1.2112, + "step": 2324 + }, + { + "epoch": 0.5889803673210893, + "grad_norm": 3.6846039295196533, + "learning_rate": 9.874852544190923e-06, + "loss": 1.1993, + "step": 2325 + }, + { + "epoch": 0.5892336922102597, + "grad_norm": 3.6119465827941895, + "learning_rate": 9.874666193596476e-06, + "loss": 1.1627, + "step": 2326 + }, + { + "epoch": 0.58948701709943, + "grad_norm": 3.6697070598602295, + "learning_rate": 9.874479706123686e-06, + "loss": 1.193, + "step": 2327 + }, + { + "epoch": 0.5897403419886004, + "grad_norm": 3.8406481742858887, + "learning_rate": 9.87429308177779e-06, + "loss": 1.0613, + "step": 2328 + }, + { + "epoch": 0.5899936668777708, + "grad_norm": 3.6351852416992188, + "learning_rate": 9.874106320564033e-06, + "loss": 1.1984, + "step": 2329 + }, + { + "epoch": 0.5902469917669411, + "grad_norm": 4.594501495361328, + "learning_rate": 9.87391942248765e-06, + "loss": 1.139, + "step": 2330 + }, + { + "epoch": 0.5905003166561115, + "grad_norm": 3.439969062805176, + "learning_rate": 9.873732387553897e-06, + "loss": 1.0793, + "step": 2331 + }, + { + "epoch": 0.5907536415452819, + "grad_norm": 4.089970588684082, + "learning_rate": 9.87354521576802e-06, + "loss": 1.4473, + "step": 2332 + }, + { + "epoch": 0.5910069664344522, + "grad_norm": 3.807089328765869, + "learning_rate": 9.87335790713528e-06, + "loss": 1.2171, + "step": 2333 + }, + { + "epoch": 0.5912602913236226, + "grad_norm": 3.730450391769409, + "learning_rate": 9.873170461660934e-06, + "loss": 1.1716, + "step": 2334 + }, + { + "epoch": 0.5915136162127929, + "grad_norm": 3.7394068241119385, + "learning_rate": 9.872982879350243e-06, + "loss": 1.3105, + "step": 2335 + }, + { + "epoch": 0.5917669411019633, + "grad_norm": 3.285534143447876, + "learning_rate": 9.872795160208478e-06, + "loss": 1.1275, + "step": 2336 + }, + { + "epoch": 0.5920202659911337, + "grad_norm": 3.6585047245025635, + "learning_rate": 9.87260730424091e-06, + "loss": 1.1234, + "step": 2337 + }, + { + "epoch": 0.592273590880304, + "grad_norm": 3.874295473098755, + "learning_rate": 9.872419311452811e-06, + "loss": 1.1829, + "step": 2338 + }, + { + "epoch": 0.5925269157694744, + "grad_norm": 3.504314422607422, + "learning_rate": 9.872231181849461e-06, + "loss": 1.1766, + "step": 2339 + }, + { + "epoch": 0.5927802406586448, + "grad_norm": 3.646886110305786, + "learning_rate": 9.872042915436144e-06, + "loss": 1.2219, + "step": 2340 + }, + { + "epoch": 0.5930335655478151, + "grad_norm": 4.184945106506348, + "learning_rate": 9.871854512218145e-06, + "loss": 1.2237, + "step": 2341 + }, + { + "epoch": 0.5932868904369855, + "grad_norm": 3.251213312149048, + "learning_rate": 9.871665972200754e-06, + "loss": 0.9566, + "step": 2342 + }, + { + "epoch": 0.5935402153261558, + "grad_norm": 3.8858537673950195, + "learning_rate": 9.871477295389266e-06, + "loss": 1.2412, + "step": 2343 + }, + { + "epoch": 0.5937935402153262, + "grad_norm": 3.5049939155578613, + "learning_rate": 9.87128848178898e-06, + "loss": 1.0805, + "step": 2344 + }, + { + "epoch": 0.5940468651044966, + "grad_norm": 4.045013904571533, + "learning_rate": 9.871099531405195e-06, + "loss": 1.3981, + "step": 2345 + }, + { + "epoch": 0.5943001899936668, + "grad_norm": 3.804518461227417, + "learning_rate": 9.870910444243219e-06, + "loss": 1.1937, + "step": 2346 + }, + { + "epoch": 0.5945535148828373, + "grad_norm": 3.7914228439331055, + "learning_rate": 9.87072122030836e-06, + "loss": 1.2388, + "step": 2347 + }, + { + "epoch": 0.5948068397720075, + "grad_norm": 3.981391191482544, + "learning_rate": 9.870531859605931e-06, + "loss": 1.151, + "step": 2348 + }, + { + "epoch": 0.595060164661178, + "grad_norm": 3.6058106422424316, + "learning_rate": 9.870342362141252e-06, + "loss": 1.0759, + "step": 2349 + }, + { + "epoch": 0.5953134895503484, + "grad_norm": 3.439100742340088, + "learning_rate": 9.870152727919642e-06, + "loss": 1.196, + "step": 2350 + }, + { + "epoch": 0.5955668144395186, + "grad_norm": 3.8293299674987793, + "learning_rate": 9.869962956946426e-06, + "loss": 1.2095, + "step": 2351 + }, + { + "epoch": 0.595820139328689, + "grad_norm": 3.6827378273010254, + "learning_rate": 9.869773049226932e-06, + "loss": 1.0907, + "step": 2352 + }, + { + "epoch": 0.5960734642178595, + "grad_norm": 3.8984475135803223, + "learning_rate": 9.869583004766491e-06, + "loss": 1.1624, + "step": 2353 + }, + { + "epoch": 0.5963267891070297, + "grad_norm": 3.549410581588745, + "learning_rate": 9.869392823570445e-06, + "loss": 1.2173, + "step": 2354 + }, + { + "epoch": 0.5965801139962001, + "grad_norm": 3.6839888095855713, + "learning_rate": 9.869202505644128e-06, + "loss": 1.179, + "step": 2355 + }, + { + "epoch": 0.5968334388853704, + "grad_norm": 3.5096590518951416, + "learning_rate": 9.869012050992889e-06, + "loss": 1.1676, + "step": 2356 + }, + { + "epoch": 0.5970867637745408, + "grad_norm": 3.367306709289551, + "learning_rate": 9.868821459622071e-06, + "loss": 1.0645, + "step": 2357 + }, + { + "epoch": 0.5973400886637112, + "grad_norm": 3.5231404304504395, + "learning_rate": 9.868630731537031e-06, + "loss": 1.1651, + "step": 2358 + }, + { + "epoch": 0.5975934135528815, + "grad_norm": 3.431152820587158, + "learning_rate": 9.86843986674312e-06, + "loss": 1.1637, + "step": 2359 + }, + { + "epoch": 0.5978467384420519, + "grad_norm": 3.711580753326416, + "learning_rate": 9.868248865245702e-06, + "loss": 1.3629, + "step": 2360 + }, + { + "epoch": 0.5981000633312223, + "grad_norm": 3.830181837081909, + "learning_rate": 9.868057727050134e-06, + "loss": 1.1988, + "step": 2361 + }, + { + "epoch": 0.5983533882203926, + "grad_norm": 4.121151924133301, + "learning_rate": 9.867866452161789e-06, + "loss": 1.2656, + "step": 2362 + }, + { + "epoch": 0.598606713109563, + "grad_norm": 3.4987690448760986, + "learning_rate": 9.867675040586035e-06, + "loss": 1.1397, + "step": 2363 + }, + { + "epoch": 0.5988600379987333, + "grad_norm": 3.3809003829956055, + "learning_rate": 9.867483492328246e-06, + "loss": 1.1094, + "step": 2364 + }, + { + "epoch": 0.5991133628879037, + "grad_norm": 3.502246618270874, + "learning_rate": 9.867291807393803e-06, + "loss": 1.1703, + "step": 2365 + }, + { + "epoch": 0.5993666877770741, + "grad_norm": 3.4921200275421143, + "learning_rate": 9.867099985788087e-06, + "loss": 1.1978, + "step": 2366 + }, + { + "epoch": 0.5996200126662444, + "grad_norm": 4.000209808349609, + "learning_rate": 9.866908027516484e-06, + "loss": 1.3362, + "step": 2367 + }, + { + "epoch": 0.5998733375554148, + "grad_norm": 3.200626850128174, + "learning_rate": 9.866715932584385e-06, + "loss": 1.0594, + "step": 2368 + }, + { + "epoch": 0.6001266624445852, + "grad_norm": 3.8519949913024902, + "learning_rate": 9.866523700997183e-06, + "loss": 1.2104, + "step": 2369 + }, + { + "epoch": 0.6003799873337555, + "grad_norm": 3.4061481952667236, + "learning_rate": 9.866331332760277e-06, + "loss": 1.1132, + "step": 2370 + }, + { + "epoch": 0.6006333122229259, + "grad_norm": 3.539203405380249, + "learning_rate": 9.866138827879068e-06, + "loss": 1.2231, + "step": 2371 + }, + { + "epoch": 0.6008866371120962, + "grad_norm": 3.539919137954712, + "learning_rate": 9.865946186358962e-06, + "loss": 1.284, + "step": 2372 + }, + { + "epoch": 0.6011399620012666, + "grad_norm": 3.8084702491760254, + "learning_rate": 9.865753408205365e-06, + "loss": 1.2749, + "step": 2373 + }, + { + "epoch": 0.601393286890437, + "grad_norm": 3.732297420501709, + "learning_rate": 9.865560493423695e-06, + "loss": 1.0956, + "step": 2374 + }, + { + "epoch": 0.6016466117796073, + "grad_norm": 3.882216691970825, + "learning_rate": 9.865367442019366e-06, + "loss": 1.386, + "step": 2375 + }, + { + "epoch": 0.6018999366687777, + "grad_norm": 3.396533727645874, + "learning_rate": 9.8651742539978e-06, + "loss": 1.0325, + "step": 2376 + }, + { + "epoch": 0.602153261557948, + "grad_norm": 4.000504016876221, + "learning_rate": 9.86498092936442e-06, + "loss": 1.3059, + "step": 2377 + }, + { + "epoch": 0.6024065864471184, + "grad_norm": 3.0580947399139404, + "learning_rate": 9.864787468124658e-06, + "loss": 1.0508, + "step": 2378 + }, + { + "epoch": 0.6026599113362888, + "grad_norm": 3.613420248031616, + "learning_rate": 9.864593870283942e-06, + "loss": 1.1817, + "step": 2379 + }, + { + "epoch": 0.6029132362254591, + "grad_norm": 3.494577646255493, + "learning_rate": 9.86440013584771e-06, + "loss": 1.0924, + "step": 2380 + }, + { + "epoch": 0.6031665611146295, + "grad_norm": 3.633143424987793, + "learning_rate": 9.864206264821403e-06, + "loss": 1.2451, + "step": 2381 + }, + { + "epoch": 0.6034198860037999, + "grad_norm": 3.626067638397217, + "learning_rate": 9.864012257210462e-06, + "loss": 1.1005, + "step": 2382 + }, + { + "epoch": 0.6036732108929702, + "grad_norm": 3.796259641647339, + "learning_rate": 9.863818113020338e-06, + "loss": 1.217, + "step": 2383 + }, + { + "epoch": 0.6039265357821406, + "grad_norm": 3.874244451522827, + "learning_rate": 9.86362383225648e-06, + "loss": 1.2143, + "step": 2384 + }, + { + "epoch": 0.6041798606713109, + "grad_norm": 3.2779505252838135, + "learning_rate": 9.863429414924346e-06, + "loss": 1.1315, + "step": 2385 + }, + { + "epoch": 0.6044331855604813, + "grad_norm": 3.3644847869873047, + "learning_rate": 9.863234861029393e-06, + "loss": 1.1816, + "step": 2386 + }, + { + "epoch": 0.6046865104496517, + "grad_norm": 3.528351306915283, + "learning_rate": 9.863040170577084e-06, + "loss": 1.1545, + "step": 2387 + }, + { + "epoch": 0.604939835338822, + "grad_norm": 3.5509703159332275, + "learning_rate": 9.862845343572885e-06, + "loss": 1.1927, + "step": 2388 + }, + { + "epoch": 0.6051931602279924, + "grad_norm": 3.670538902282715, + "learning_rate": 9.86265038002227e-06, + "loss": 1.1376, + "step": 2389 + }, + { + "epoch": 0.6054464851171628, + "grad_norm": 3.4490549564361572, + "learning_rate": 9.862455279930709e-06, + "loss": 1.1018, + "step": 2390 + }, + { + "epoch": 0.6056998100063331, + "grad_norm": 3.8076932430267334, + "learning_rate": 9.862260043303685e-06, + "loss": 1.2233, + "step": 2391 + }, + { + "epoch": 0.6059531348955035, + "grad_norm": 3.9804649353027344, + "learning_rate": 9.862064670146676e-06, + "loss": 1.1906, + "step": 2392 + }, + { + "epoch": 0.6062064597846738, + "grad_norm": 3.8885498046875, + "learning_rate": 9.86186916046517e-06, + "loss": 1.2377, + "step": 2393 + }, + { + "epoch": 0.6064597846738442, + "grad_norm": 3.717961549758911, + "learning_rate": 9.861673514264658e-06, + "loss": 1.2273, + "step": 2394 + }, + { + "epoch": 0.6067131095630146, + "grad_norm": 3.9650092124938965, + "learning_rate": 9.861477731550631e-06, + "loss": 1.1808, + "step": 2395 + }, + { + "epoch": 0.6069664344521849, + "grad_norm": 3.5600218772888184, + "learning_rate": 9.861281812328587e-06, + "loss": 1.0836, + "step": 2396 + }, + { + "epoch": 0.6072197593413553, + "grad_norm": 3.285839319229126, + "learning_rate": 9.86108575660403e-06, + "loss": 1.1076, + "step": 2397 + }, + { + "epoch": 0.6074730842305256, + "grad_norm": 3.713930368423462, + "learning_rate": 9.860889564382463e-06, + "loss": 1.2391, + "step": 2398 + }, + { + "epoch": 0.607726409119696, + "grad_norm": 3.4917590618133545, + "learning_rate": 9.860693235669394e-06, + "loss": 1.1042, + "step": 2399 + }, + { + "epoch": 0.6079797340088664, + "grad_norm": 4.100406646728516, + "learning_rate": 9.860496770470338e-06, + "loss": 1.4159, + "step": 2400 + }, + { + "epoch": 0.6082330588980367, + "grad_norm": 3.681727647781372, + "learning_rate": 9.860300168790811e-06, + "loss": 1.2346, + "step": 2401 + }, + { + "epoch": 0.6084863837872071, + "grad_norm": 3.341277599334717, + "learning_rate": 9.860103430636332e-06, + "loss": 1.1499, + "step": 2402 + }, + { + "epoch": 0.6087397086763775, + "grad_norm": 3.523472309112549, + "learning_rate": 9.859906556012427e-06, + "loss": 1.0684, + "step": 2403 + }, + { + "epoch": 0.6089930335655478, + "grad_norm": 3.5669310092926025, + "learning_rate": 9.859709544924624e-06, + "loss": 1.2042, + "step": 2404 + }, + { + "epoch": 0.6092463584547182, + "grad_norm": 3.4634294509887695, + "learning_rate": 9.859512397378455e-06, + "loss": 1.224, + "step": 2405 + }, + { + "epoch": 0.6094996833438885, + "grad_norm": 3.555194139480591, + "learning_rate": 9.859315113379455e-06, + "loss": 1.2471, + "step": 2406 + }, + { + "epoch": 0.6097530082330589, + "grad_norm": 3.5009043216705322, + "learning_rate": 9.859117692933163e-06, + "loss": 1.18, + "step": 2407 + }, + { + "epoch": 0.6100063331222293, + "grad_norm": 3.4651875495910645, + "learning_rate": 9.858920136045124e-06, + "loss": 1.0705, + "step": 2408 + }, + { + "epoch": 0.6102596580113996, + "grad_norm": 3.6112053394317627, + "learning_rate": 9.858722442720885e-06, + "loss": 1.1159, + "step": 2409 + }, + { + "epoch": 0.61051298290057, + "grad_norm": 3.9805543422698975, + "learning_rate": 9.858524612965997e-06, + "loss": 1.3357, + "step": 2410 + }, + { + "epoch": 0.6107663077897404, + "grad_norm": 3.884993076324463, + "learning_rate": 9.858326646786017e-06, + "loss": 1.2623, + "step": 2411 + }, + { + "epoch": 0.6110196326789107, + "grad_norm": 3.940034866333008, + "learning_rate": 9.858128544186498e-06, + "loss": 1.2967, + "step": 2412 + }, + { + "epoch": 0.6112729575680811, + "grad_norm": 3.6648826599121094, + "learning_rate": 9.85793030517301e-06, + "loss": 1.1501, + "step": 2413 + }, + { + "epoch": 0.6115262824572514, + "grad_norm": 3.650735378265381, + "learning_rate": 9.857731929751114e-06, + "loss": 1.3022, + "step": 2414 + }, + { + "epoch": 0.6117796073464218, + "grad_norm": 3.5040194988250732, + "learning_rate": 9.857533417926382e-06, + "loss": 1.1969, + "step": 2415 + }, + { + "epoch": 0.6120329322355922, + "grad_norm": 3.9353554248809814, + "learning_rate": 9.857334769704388e-06, + "loss": 1.1229, + "step": 2416 + }, + { + "epoch": 0.6122862571247625, + "grad_norm": 3.66317081451416, + "learning_rate": 9.85713598509071e-06, + "loss": 1.252, + "step": 2417 + }, + { + "epoch": 0.6125395820139329, + "grad_norm": 3.60054349899292, + "learning_rate": 9.856937064090931e-06, + "loss": 1.1752, + "step": 2418 + }, + { + "epoch": 0.6127929069031033, + "grad_norm": 3.854161262512207, + "learning_rate": 9.856738006710636e-06, + "loss": 1.1795, + "step": 2419 + }, + { + "epoch": 0.6130462317922736, + "grad_norm": 3.4885568618774414, + "learning_rate": 9.856538812955411e-06, + "loss": 1.1409, + "step": 2420 + }, + { + "epoch": 0.613299556681444, + "grad_norm": 3.6377522945404053, + "learning_rate": 9.856339482830856e-06, + "loss": 1.2984, + "step": 2421 + }, + { + "epoch": 0.6135528815706143, + "grad_norm": 3.71474027633667, + "learning_rate": 9.856140016342562e-06, + "loss": 1.1658, + "step": 2422 + }, + { + "epoch": 0.6138062064597847, + "grad_norm": 3.5078494548797607, + "learning_rate": 9.855940413496134e-06, + "loss": 1.0478, + "step": 2423 + }, + { + "epoch": 0.6140595313489551, + "grad_norm": 3.417715072631836, + "learning_rate": 9.855740674297173e-06, + "loss": 1.0735, + "step": 2424 + }, + { + "epoch": 0.6143128562381254, + "grad_norm": 4.048703193664551, + "learning_rate": 9.855540798751292e-06, + "loss": 1.2193, + "step": 2425 + }, + { + "epoch": 0.6145661811272958, + "grad_norm": 4.096585750579834, + "learning_rate": 9.8553407868641e-06, + "loss": 1.4754, + "step": 2426 + }, + { + "epoch": 0.6148195060164661, + "grad_norm": 3.781301975250244, + "learning_rate": 9.855140638641213e-06, + "loss": 1.1606, + "step": 2427 + }, + { + "epoch": 0.6150728309056365, + "grad_norm": 4.041252136230469, + "learning_rate": 9.854940354088253e-06, + "loss": 1.2971, + "step": 2428 + }, + { + "epoch": 0.6153261557948069, + "grad_norm": 3.318225383758545, + "learning_rate": 9.854739933210846e-06, + "loss": 1.0641, + "step": 2429 + }, + { + "epoch": 0.6155794806839772, + "grad_norm": 3.967278003692627, + "learning_rate": 9.854539376014614e-06, + "loss": 1.3346, + "step": 2430 + }, + { + "epoch": 0.6158328055731476, + "grad_norm": 3.648906707763672, + "learning_rate": 9.854338682505193e-06, + "loss": 1.2795, + "step": 2431 + }, + { + "epoch": 0.616086130462318, + "grad_norm": 3.5962131023406982, + "learning_rate": 9.854137852688216e-06, + "loss": 1.1027, + "step": 2432 + }, + { + "epoch": 0.6163394553514883, + "grad_norm": 3.73262619972229, + "learning_rate": 9.853936886569324e-06, + "loss": 1.2438, + "step": 2433 + }, + { + "epoch": 0.6165927802406587, + "grad_norm": 3.7775135040283203, + "learning_rate": 9.853735784154159e-06, + "loss": 1.2827, + "step": 2434 + }, + { + "epoch": 0.616846105129829, + "grad_norm": 4.0967302322387695, + "learning_rate": 9.853534545448367e-06, + "loss": 1.2889, + "step": 2435 + }, + { + "epoch": 0.6170994300189994, + "grad_norm": 3.595000982284546, + "learning_rate": 9.8533331704576e-06, + "loss": 1.2437, + "step": 2436 + }, + { + "epoch": 0.6173527549081698, + "grad_norm": 3.4501607418060303, + "learning_rate": 9.853131659187513e-06, + "loss": 1.1672, + "step": 2437 + }, + { + "epoch": 0.6176060797973401, + "grad_norm": 3.6820144653320312, + "learning_rate": 9.852930011643763e-06, + "loss": 1.2082, + "step": 2438 + }, + { + "epoch": 0.6178594046865105, + "grad_norm": 3.65936279296875, + "learning_rate": 9.852728227832013e-06, + "loss": 1.2964, + "step": 2439 + }, + { + "epoch": 0.6181127295756809, + "grad_norm": 3.977670431137085, + "learning_rate": 9.852526307757928e-06, + "loss": 1.2392, + "step": 2440 + }, + { + "epoch": 0.6183660544648512, + "grad_norm": 3.5032083988189697, + "learning_rate": 9.85232425142718e-06, + "loss": 1.1944, + "step": 2441 + }, + { + "epoch": 0.6186193793540216, + "grad_norm": 3.7888448238372803, + "learning_rate": 9.852122058845439e-06, + "loss": 1.2319, + "step": 2442 + }, + { + "epoch": 0.6188727042431919, + "grad_norm": 3.6973185539245605, + "learning_rate": 9.851919730018386e-06, + "loss": 1.2063, + "step": 2443 + }, + { + "epoch": 0.6191260291323623, + "grad_norm": 3.6707706451416016, + "learning_rate": 9.851717264951702e-06, + "loss": 1.1027, + "step": 2444 + }, + { + "epoch": 0.6193793540215327, + "grad_norm": 3.313488721847534, + "learning_rate": 9.85151466365107e-06, + "loss": 1.1288, + "step": 2445 + }, + { + "epoch": 0.619632678910703, + "grad_norm": 3.5334272384643555, + "learning_rate": 9.851311926122179e-06, + "loss": 1.1914, + "step": 2446 + }, + { + "epoch": 0.6198860037998734, + "grad_norm": 3.603609323501587, + "learning_rate": 9.851109052370725e-06, + "loss": 1.2569, + "step": 2447 + }, + { + "epoch": 0.6201393286890436, + "grad_norm": 3.827209234237671, + "learning_rate": 9.850906042402399e-06, + "loss": 1.223, + "step": 2448 + }, + { + "epoch": 0.620392653578214, + "grad_norm": 3.4155426025390625, + "learning_rate": 9.850702896222908e-06, + "loss": 1.1548, + "step": 2449 + }, + { + "epoch": 0.6206459784673845, + "grad_norm": 3.9286139011383057, + "learning_rate": 9.850499613837952e-06, + "loss": 1.3792, + "step": 2450 + }, + { + "epoch": 0.6208993033565547, + "grad_norm": 3.6877617835998535, + "learning_rate": 9.850296195253241e-06, + "loss": 1.1077, + "step": 2451 + }, + { + "epoch": 0.6211526282457251, + "grad_norm": 3.57601261138916, + "learning_rate": 9.850092640474485e-06, + "loss": 1.2984, + "step": 2452 + }, + { + "epoch": 0.6214059531348956, + "grad_norm": 3.9190783500671387, + "learning_rate": 9.849888949507402e-06, + "loss": 1.3625, + "step": 2453 + }, + { + "epoch": 0.6216592780240658, + "grad_norm": 3.610003709793091, + "learning_rate": 9.849685122357708e-06, + "loss": 1.2229, + "step": 2454 + }, + { + "epoch": 0.6219126029132362, + "grad_norm": 3.735644578933716, + "learning_rate": 9.849481159031131e-06, + "loss": 1.1831, + "step": 2455 + }, + { + "epoch": 0.6221659278024065, + "grad_norm": 3.2859835624694824, + "learning_rate": 9.849277059533395e-06, + "loss": 1.0491, + "step": 2456 + }, + { + "epoch": 0.6224192526915769, + "grad_norm": 3.327444076538086, + "learning_rate": 9.849072823870232e-06, + "loss": 1.1315, + "step": 2457 + }, + { + "epoch": 0.6226725775807473, + "grad_norm": 3.6521694660186768, + "learning_rate": 9.848868452047378e-06, + "loss": 1.1709, + "step": 2458 + }, + { + "epoch": 0.6229259024699176, + "grad_norm": 3.5883758068084717, + "learning_rate": 9.84866394407057e-06, + "loss": 1.1678, + "step": 2459 + }, + { + "epoch": 0.623179227359088, + "grad_norm": 3.2457404136657715, + "learning_rate": 9.84845929994555e-06, + "loss": 1.0339, + "step": 2460 + }, + { + "epoch": 0.6234325522482584, + "grad_norm": 3.464756727218628, + "learning_rate": 9.848254519678064e-06, + "loss": 1.1122, + "step": 2461 + }, + { + "epoch": 0.6236858771374287, + "grad_norm": 3.6326985359191895, + "learning_rate": 9.848049603273865e-06, + "loss": 1.1961, + "step": 2462 + }, + { + "epoch": 0.6239392020265991, + "grad_norm": 3.8263144493103027, + "learning_rate": 9.847844550738706e-06, + "loss": 1.1436, + "step": 2463 + }, + { + "epoch": 0.6241925269157694, + "grad_norm": 3.556764602661133, + "learning_rate": 9.847639362078344e-06, + "loss": 1.1058, + "step": 2464 + }, + { + "epoch": 0.6244458518049398, + "grad_norm": 3.9600555896759033, + "learning_rate": 9.847434037298538e-06, + "loss": 1.2834, + "step": 2465 + }, + { + "epoch": 0.6246991766941102, + "grad_norm": 4.088186740875244, + "learning_rate": 9.847228576405058e-06, + "loss": 1.2617, + "step": 2466 + }, + { + "epoch": 0.6249525015832805, + "grad_norm": 3.8825464248657227, + "learning_rate": 9.847022979403671e-06, + "loss": 1.1537, + "step": 2467 + }, + { + "epoch": 0.6252058264724509, + "grad_norm": 3.252762794494629, + "learning_rate": 9.84681724630015e-06, + "loss": 1.1198, + "step": 2468 + }, + { + "epoch": 0.6254591513616212, + "grad_norm": 3.690809488296509, + "learning_rate": 9.846611377100274e-06, + "loss": 1.2873, + "step": 2469 + }, + { + "epoch": 0.6257124762507916, + "grad_norm": 3.537818193435669, + "learning_rate": 9.846405371809821e-06, + "loss": 1.2116, + "step": 2470 + }, + { + "epoch": 0.625965801139962, + "grad_norm": 3.7453677654266357, + "learning_rate": 9.846199230434576e-06, + "loss": 1.2925, + "step": 2471 + }, + { + "epoch": 0.6262191260291323, + "grad_norm": 3.2455244064331055, + "learning_rate": 9.84599295298033e-06, + "loss": 1.0063, + "step": 2472 + }, + { + "epoch": 0.6264724509183027, + "grad_norm": 3.628256320953369, + "learning_rate": 9.845786539452871e-06, + "loss": 1.1957, + "step": 2473 + }, + { + "epoch": 0.6267257758074731, + "grad_norm": 3.5005202293395996, + "learning_rate": 9.845579989857998e-06, + "loss": 1.1522, + "step": 2474 + }, + { + "epoch": 0.6269791006966434, + "grad_norm": 3.5334384441375732, + "learning_rate": 9.845373304201509e-06, + "loss": 1.375, + "step": 2475 + }, + { + "epoch": 0.6272324255858138, + "grad_norm": 3.3087430000305176, + "learning_rate": 9.84516648248921e-06, + "loss": 1.1753, + "step": 2476 + }, + { + "epoch": 0.6274857504749841, + "grad_norm": 3.5373950004577637, + "learning_rate": 9.844959524726907e-06, + "loss": 1.1284, + "step": 2477 + }, + { + "epoch": 0.6277390753641545, + "grad_norm": 3.617833375930786, + "learning_rate": 9.84475243092041e-06, + "loss": 1.0662, + "step": 2478 + }, + { + "epoch": 0.6279924002533249, + "grad_norm": 3.5470521450042725, + "learning_rate": 9.844545201075535e-06, + "loss": 1.1782, + "step": 2479 + }, + { + "epoch": 0.6282457251424952, + "grad_norm": 3.921121835708618, + "learning_rate": 9.844337835198102e-06, + "loss": 1.3281, + "step": 2480 + }, + { + "epoch": 0.6284990500316656, + "grad_norm": 3.5617926120758057, + "learning_rate": 9.844130333293932e-06, + "loss": 1.2355, + "step": 2481 + }, + { + "epoch": 0.628752374920836, + "grad_norm": 3.533834218978882, + "learning_rate": 9.843922695368855e-06, + "loss": 1.1639, + "step": 2482 + }, + { + "epoch": 0.6290056998100063, + "grad_norm": 3.3362550735473633, + "learning_rate": 9.843714921428698e-06, + "loss": 1.2091, + "step": 2483 + }, + { + "epoch": 0.6292590246991767, + "grad_norm": 3.5323235988616943, + "learning_rate": 9.843507011479296e-06, + "loss": 1.356, + "step": 2484 + }, + { + "epoch": 0.629512349588347, + "grad_norm": 3.518071174621582, + "learning_rate": 9.843298965526486e-06, + "loss": 1.1021, + "step": 2485 + }, + { + "epoch": 0.6297656744775174, + "grad_norm": 3.7014873027801514, + "learning_rate": 9.843090783576112e-06, + "loss": 1.3844, + "step": 2486 + }, + { + "epoch": 0.6300189993666878, + "grad_norm": 3.5969622135162354, + "learning_rate": 9.842882465634019e-06, + "loss": 1.1927, + "step": 2487 + }, + { + "epoch": 0.6302723242558581, + "grad_norm": 3.5788826942443848, + "learning_rate": 9.842674011706057e-06, + "loss": 1.2414, + "step": 2488 + }, + { + "epoch": 0.6305256491450285, + "grad_norm": 4.032215595245361, + "learning_rate": 9.842465421798074e-06, + "loss": 1.2238, + "step": 2489 + }, + { + "epoch": 0.6307789740341989, + "grad_norm": 3.575622797012329, + "learning_rate": 9.842256695915937e-06, + "loss": 1.3002, + "step": 2490 + }, + { + "epoch": 0.6310322989233692, + "grad_norm": 3.4500770568847656, + "learning_rate": 9.842047834065498e-06, + "loss": 1.1602, + "step": 2491 + }, + { + "epoch": 0.6312856238125396, + "grad_norm": 3.9447834491729736, + "learning_rate": 9.841838836252627e-06, + "loss": 1.1706, + "step": 2492 + }, + { + "epoch": 0.6315389487017099, + "grad_norm": 3.377351760864258, + "learning_rate": 9.84162970248319e-06, + "loss": 1.2073, + "step": 2493 + }, + { + "epoch": 0.6317922735908803, + "grad_norm": 3.500148296356201, + "learning_rate": 9.84142043276306e-06, + "loss": 1.2556, + "step": 2494 + }, + { + "epoch": 0.6320455984800507, + "grad_norm": 3.5882997512817383, + "learning_rate": 9.841211027098114e-06, + "loss": 1.1664, + "step": 2495 + }, + { + "epoch": 0.632298923369221, + "grad_norm": 3.531613826751709, + "learning_rate": 9.84100148549423e-06, + "loss": 1.2383, + "step": 2496 + }, + { + "epoch": 0.6325522482583914, + "grad_norm": 3.6397781372070312, + "learning_rate": 9.840791807957294e-06, + "loss": 1.2755, + "step": 2497 + }, + { + "epoch": 0.6328055731475617, + "grad_norm": 3.521218776702881, + "learning_rate": 9.840581994493193e-06, + "loss": 1.1873, + "step": 2498 + }, + { + "epoch": 0.6330588980367321, + "grad_norm": 3.705307960510254, + "learning_rate": 9.840372045107818e-06, + "loss": 1.2026, + "step": 2499 + }, + { + "epoch": 0.6333122229259025, + "grad_norm": 3.428232192993164, + "learning_rate": 9.840161959807064e-06, + "loss": 1.1304, + "step": 2500 + }, + { + "epoch": 0.6333122229259025, + "eval_loss": 1.2158032655715942, + "eval_runtime": 12.2773, + "eval_samples_per_second": 32.58, + "eval_steps_per_second": 4.073, + "step": 2500 + }, + { + "epoch": 0.6335655478150728, + "grad_norm": 3.6344573497772217, + "learning_rate": 9.839951738596831e-06, + "loss": 1.125, + "step": 2501 + }, + { + "epoch": 0.6338188727042432, + "grad_norm": 3.926832437515259, + "learning_rate": 9.839741381483021e-06, + "loss": 1.1736, + "step": 2502 + }, + { + "epoch": 0.6340721975934136, + "grad_norm": 3.35507869720459, + "learning_rate": 9.839530888471542e-06, + "loss": 1.0818, + "step": 2503 + }, + { + "epoch": 0.6343255224825839, + "grad_norm": 3.9087727069854736, + "learning_rate": 9.839320259568304e-06, + "loss": 1.2332, + "step": 2504 + }, + { + "epoch": 0.6345788473717543, + "grad_norm": 3.944812536239624, + "learning_rate": 9.839109494779222e-06, + "loss": 1.3249, + "step": 2505 + }, + { + "epoch": 0.6348321722609246, + "grad_norm": 3.5641653537750244, + "learning_rate": 9.838898594110211e-06, + "loss": 1.2282, + "step": 2506 + }, + { + "epoch": 0.635085497150095, + "grad_norm": 3.5591001510620117, + "learning_rate": 9.838687557567199e-06, + "loss": 1.2849, + "step": 2507 + }, + { + "epoch": 0.6353388220392654, + "grad_norm": 4.062427520751953, + "learning_rate": 9.838476385156103e-06, + "loss": 1.2333, + "step": 2508 + }, + { + "epoch": 0.6355921469284357, + "grad_norm": 3.5046918392181396, + "learning_rate": 9.838265076882863e-06, + "loss": 1.2505, + "step": 2509 + }, + { + "epoch": 0.6358454718176061, + "grad_norm": 3.372955083847046, + "learning_rate": 9.838053632753403e-06, + "loss": 1.2336, + "step": 2510 + }, + { + "epoch": 0.6360987967067765, + "grad_norm": 3.606010913848877, + "learning_rate": 9.837842052773667e-06, + "loss": 1.2544, + "step": 2511 + }, + { + "epoch": 0.6363521215959468, + "grad_norm": 4.002732753753662, + "learning_rate": 9.837630336949592e-06, + "loss": 1.2273, + "step": 2512 + }, + { + "epoch": 0.6366054464851172, + "grad_norm": 3.352332353591919, + "learning_rate": 9.837418485287126e-06, + "loss": 1.0451, + "step": 2513 + }, + { + "epoch": 0.6368587713742875, + "grad_norm": 3.3298330307006836, + "learning_rate": 9.837206497792216e-06, + "loss": 1.0997, + "step": 2514 + }, + { + "epoch": 0.6371120962634579, + "grad_norm": 3.6441266536712646, + "learning_rate": 9.836994374470814e-06, + "loss": 1.219, + "step": 2515 + }, + { + "epoch": 0.6373654211526283, + "grad_norm": 3.3209352493286133, + "learning_rate": 9.836782115328878e-06, + "loss": 1.0118, + "step": 2516 + }, + { + "epoch": 0.6376187460417986, + "grad_norm": 3.3625969886779785, + "learning_rate": 9.836569720372367e-06, + "loss": 1.1919, + "step": 2517 + }, + { + "epoch": 0.637872070930969, + "grad_norm": 3.6182706356048584, + "learning_rate": 9.836357189607243e-06, + "loss": 1.2894, + "step": 2518 + }, + { + "epoch": 0.6381253958201393, + "grad_norm": 3.827185869216919, + "learning_rate": 9.836144523039476e-06, + "loss": 1.2509, + "step": 2519 + }, + { + "epoch": 0.6383787207093097, + "grad_norm": 3.77411150932312, + "learning_rate": 9.83593172067504e-06, + "loss": 1.2143, + "step": 2520 + }, + { + "epoch": 0.6386320455984801, + "grad_norm": 3.397799491882324, + "learning_rate": 9.835718782519905e-06, + "loss": 1.1697, + "step": 2521 + }, + { + "epoch": 0.6388853704876504, + "grad_norm": 4.167684078216553, + "learning_rate": 9.835505708580055e-06, + "loss": 1.3649, + "step": 2522 + }, + { + "epoch": 0.6391386953768208, + "grad_norm": 3.54966402053833, + "learning_rate": 9.83529249886147e-06, + "loss": 1.1086, + "step": 2523 + }, + { + "epoch": 0.6393920202659912, + "grad_norm": 3.8356873989105225, + "learning_rate": 9.83507915337014e-06, + "loss": 1.2253, + "step": 2524 + }, + { + "epoch": 0.6396453451551615, + "grad_norm": 3.2942631244659424, + "learning_rate": 9.83486567211205e-06, + "loss": 1.0629, + "step": 2525 + }, + { + "epoch": 0.6398986700443319, + "grad_norm": 3.718168258666992, + "learning_rate": 9.8346520550932e-06, + "loss": 1.2178, + "step": 2526 + }, + { + "epoch": 0.6401519949335022, + "grad_norm": 3.384877920150757, + "learning_rate": 9.834438302319585e-06, + "loss": 1.0572, + "step": 2527 + }, + { + "epoch": 0.6404053198226726, + "grad_norm": 3.6040239334106445, + "learning_rate": 9.834224413797208e-06, + "loss": 1.2056, + "step": 2528 + }, + { + "epoch": 0.640658644711843, + "grad_norm": 3.670182943344116, + "learning_rate": 9.834010389532075e-06, + "loss": 1.2164, + "step": 2529 + }, + { + "epoch": 0.6409119696010133, + "grad_norm": 3.1873552799224854, + "learning_rate": 9.833796229530199e-06, + "loss": 1.0307, + "step": 2530 + }, + { + "epoch": 0.6411652944901837, + "grad_norm": 3.485616445541382, + "learning_rate": 9.833581933797586e-06, + "loss": 1.1692, + "step": 2531 + }, + { + "epoch": 0.6414186193793541, + "grad_norm": 3.664083957672119, + "learning_rate": 9.833367502340261e-06, + "loss": 1.1873, + "step": 2532 + }, + { + "epoch": 0.6416719442685244, + "grad_norm": 3.7422778606414795, + "learning_rate": 9.833152935164242e-06, + "loss": 1.2442, + "step": 2533 + }, + { + "epoch": 0.6419252691576948, + "grad_norm": 3.2830452919006348, + "learning_rate": 9.832938232275551e-06, + "loss": 1.1344, + "step": 2534 + }, + { + "epoch": 0.6421785940468651, + "grad_norm": 3.661639451980591, + "learning_rate": 9.832723393680222e-06, + "loss": 1.3233, + "step": 2535 + }, + { + "epoch": 0.6424319189360355, + "grad_norm": 3.579019784927368, + "learning_rate": 9.832508419384284e-06, + "loss": 1.2245, + "step": 2536 + }, + { + "epoch": 0.6426852438252059, + "grad_norm": 3.857731580734253, + "learning_rate": 9.832293309393775e-06, + "loss": 1.3119, + "step": 2537 + }, + { + "epoch": 0.6429385687143762, + "grad_norm": 3.6141717433929443, + "learning_rate": 9.832078063714733e-06, + "loss": 1.1423, + "step": 2538 + }, + { + "epoch": 0.6431918936035466, + "grad_norm": 3.8558592796325684, + "learning_rate": 9.831862682353206e-06, + "loss": 1.2059, + "step": 2539 + }, + { + "epoch": 0.643445218492717, + "grad_norm": 3.6421942710876465, + "learning_rate": 9.831647165315238e-06, + "loss": 1.1619, + "step": 2540 + }, + { + "epoch": 0.6436985433818873, + "grad_norm": 3.2293131351470947, + "learning_rate": 9.831431512606883e-06, + "loss": 1.1146, + "step": 2541 + }, + { + "epoch": 0.6439518682710577, + "grad_norm": 3.673706531524658, + "learning_rate": 9.831215724234194e-06, + "loss": 1.2253, + "step": 2542 + }, + { + "epoch": 0.644205193160228, + "grad_norm": 3.3997058868408203, + "learning_rate": 9.830999800203233e-06, + "loss": 1.2331, + "step": 2543 + }, + { + "epoch": 0.6444585180493984, + "grad_norm": 3.706429958343506, + "learning_rate": 9.830783740520063e-06, + "loss": 1.2251, + "step": 2544 + }, + { + "epoch": 0.6447118429385688, + "grad_norm": 3.5596835613250732, + "learning_rate": 9.830567545190747e-06, + "loss": 1.22, + "step": 2545 + }, + { + "epoch": 0.644965167827739, + "grad_norm": 3.6280670166015625, + "learning_rate": 9.830351214221359e-06, + "loss": 1.2266, + "step": 2546 + }, + { + "epoch": 0.6452184927169095, + "grad_norm": 3.31756329536438, + "learning_rate": 9.830134747617975e-06, + "loss": 1.0577, + "step": 2547 + }, + { + "epoch": 0.6454718176060797, + "grad_norm": 3.772573709487915, + "learning_rate": 9.829918145386668e-06, + "loss": 1.2135, + "step": 2548 + }, + { + "epoch": 0.6457251424952501, + "grad_norm": 3.4784507751464844, + "learning_rate": 9.829701407533526e-06, + "loss": 1.1664, + "step": 2549 + }, + { + "epoch": 0.6459784673844206, + "grad_norm": 3.6505179405212402, + "learning_rate": 9.829484534064628e-06, + "loss": 1.1425, + "step": 2550 + }, + { + "epoch": 0.6462317922735908, + "grad_norm": 3.907944440841675, + "learning_rate": 9.829267524986071e-06, + "loss": 1.3133, + "step": 2551 + }, + { + "epoch": 0.6464851171627612, + "grad_norm": 4.5512261390686035, + "learning_rate": 9.829050380303945e-06, + "loss": 1.2907, + "step": 2552 + }, + { + "epoch": 0.6467384420519317, + "grad_norm": 3.2760396003723145, + "learning_rate": 9.828833100024347e-06, + "loss": 1.0209, + "step": 2553 + }, + { + "epoch": 0.6469917669411019, + "grad_norm": 3.457796096801758, + "learning_rate": 9.828615684153379e-06, + "loss": 1.2081, + "step": 2554 + }, + { + "epoch": 0.6472450918302723, + "grad_norm": 3.9074411392211914, + "learning_rate": 9.828398132697146e-06, + "loss": 1.263, + "step": 2555 + }, + { + "epoch": 0.6474984167194426, + "grad_norm": 3.8507144451141357, + "learning_rate": 9.828180445661754e-06, + "loss": 1.2712, + "step": 2556 + }, + { + "epoch": 0.647751741608613, + "grad_norm": 3.44443678855896, + "learning_rate": 9.82796262305332e-06, + "loss": 1.1306, + "step": 2557 + }, + { + "epoch": 0.6480050664977834, + "grad_norm": 3.4174551963806152, + "learning_rate": 9.827744664877959e-06, + "loss": 1.2463, + "step": 2558 + }, + { + "epoch": 0.6482583913869537, + "grad_norm": 4.097899436950684, + "learning_rate": 9.82752657114179e-06, + "loss": 1.3898, + "step": 2559 + }, + { + "epoch": 0.6485117162761241, + "grad_norm": 3.9730896949768066, + "learning_rate": 9.827308341850936e-06, + "loss": 1.2523, + "step": 2560 + }, + { + "epoch": 0.6487650411652945, + "grad_norm": 3.5033178329467773, + "learning_rate": 9.827089977011528e-06, + "loss": 1.1796, + "step": 2561 + }, + { + "epoch": 0.6490183660544648, + "grad_norm": 3.499201536178589, + "learning_rate": 9.826871476629696e-06, + "loss": 1.16, + "step": 2562 + }, + { + "epoch": 0.6492716909436352, + "grad_norm": 3.558795690536499, + "learning_rate": 9.826652840711573e-06, + "loss": 1.1215, + "step": 2563 + }, + { + "epoch": 0.6495250158328055, + "grad_norm": 3.6329128742218018, + "learning_rate": 9.826434069263301e-06, + "loss": 1.2845, + "step": 2564 + }, + { + "epoch": 0.6497783407219759, + "grad_norm": 3.5413899421691895, + "learning_rate": 9.826215162291023e-06, + "loss": 1.1394, + "step": 2565 + }, + { + "epoch": 0.6500316656111463, + "grad_norm": 3.2898104190826416, + "learning_rate": 9.825996119800887e-06, + "loss": 1.075, + "step": 2566 + }, + { + "epoch": 0.6502849905003166, + "grad_norm": 3.540954113006592, + "learning_rate": 9.825776941799038e-06, + "loss": 1.1711, + "step": 2567 + }, + { + "epoch": 0.650538315389487, + "grad_norm": 3.7790684700012207, + "learning_rate": 9.825557628291637e-06, + "loss": 1.1565, + "step": 2568 + }, + { + "epoch": 0.6507916402786573, + "grad_norm": 3.7246615886688232, + "learning_rate": 9.82533817928484e-06, + "loss": 1.2207, + "step": 2569 + }, + { + "epoch": 0.6510449651678277, + "grad_norm": 3.244691848754883, + "learning_rate": 9.825118594784805e-06, + "loss": 1.1929, + "step": 2570 + }, + { + "epoch": 0.6512982900569981, + "grad_norm": 3.6630892753601074, + "learning_rate": 9.824898874797704e-06, + "loss": 1.136, + "step": 2571 + }, + { + "epoch": 0.6515516149461684, + "grad_norm": 3.627481698989868, + "learning_rate": 9.824679019329704e-06, + "loss": 1.2029, + "step": 2572 + }, + { + "epoch": 0.6518049398353388, + "grad_norm": 4.051472187042236, + "learning_rate": 9.824459028386976e-06, + "loss": 1.2067, + "step": 2573 + }, + { + "epoch": 0.6520582647245092, + "grad_norm": 4.0756001472473145, + "learning_rate": 9.824238901975703e-06, + "loss": 1.3799, + "step": 2574 + }, + { + "epoch": 0.6523115896136795, + "grad_norm": 3.7352709770202637, + "learning_rate": 9.82401864010206e-06, + "loss": 1.123, + "step": 2575 + }, + { + "epoch": 0.6525649145028499, + "grad_norm": 3.5075106620788574, + "learning_rate": 9.823798242772235e-06, + "loss": 1.0726, + "step": 2576 + }, + { + "epoch": 0.6528182393920202, + "grad_norm": 3.140592098236084, + "learning_rate": 9.823577709992417e-06, + "loss": 1.0517, + "step": 2577 + }, + { + "epoch": 0.6530715642811906, + "grad_norm": 3.906625747680664, + "learning_rate": 9.823357041768798e-06, + "loss": 1.2148, + "step": 2578 + }, + { + "epoch": 0.653324889170361, + "grad_norm": 3.9738869667053223, + "learning_rate": 9.823136238107573e-06, + "loss": 1.3406, + "step": 2579 + }, + { + "epoch": 0.6535782140595313, + "grad_norm": 3.5039169788360596, + "learning_rate": 9.822915299014941e-06, + "loss": 1.2235, + "step": 2580 + }, + { + "epoch": 0.6538315389487017, + "grad_norm": 3.6179444789886475, + "learning_rate": 9.822694224497111e-06, + "loss": 1.3082, + "step": 2581 + }, + { + "epoch": 0.6540848638378721, + "grad_norm": 3.5878806114196777, + "learning_rate": 9.822473014560285e-06, + "loss": 1.1701, + "step": 2582 + }, + { + "epoch": 0.6543381887270424, + "grad_norm": 3.4601778984069824, + "learning_rate": 9.822251669210679e-06, + "loss": 1.0544, + "step": 2583 + }, + { + "epoch": 0.6545915136162128, + "grad_norm": 3.5456607341766357, + "learning_rate": 9.822030188454506e-06, + "loss": 1.2495, + "step": 2584 + }, + { + "epoch": 0.6548448385053831, + "grad_norm": 4.021832466125488, + "learning_rate": 9.821808572297984e-06, + "loss": 1.4227, + "step": 2585 + }, + { + "epoch": 0.6550981633945535, + "grad_norm": 3.634138345718384, + "learning_rate": 9.821586820747337e-06, + "loss": 1.1106, + "step": 2586 + }, + { + "epoch": 0.6553514882837239, + "grad_norm": 3.8235223293304443, + "learning_rate": 9.821364933808793e-06, + "loss": 1.2882, + "step": 2587 + }, + { + "epoch": 0.6556048131728942, + "grad_norm": 3.4546220302581787, + "learning_rate": 9.821142911488582e-06, + "loss": 1.1963, + "step": 2588 + }, + { + "epoch": 0.6558581380620646, + "grad_norm": 3.7855916023254395, + "learning_rate": 9.820920753792935e-06, + "loss": 1.2486, + "step": 2589 + }, + { + "epoch": 0.656111462951235, + "grad_norm": 3.4730772972106934, + "learning_rate": 9.820698460728095e-06, + "loss": 1.1228, + "step": 2590 + }, + { + "epoch": 0.6563647878404053, + "grad_norm": 3.582882881164551, + "learning_rate": 9.820476032300302e-06, + "loss": 1.2618, + "step": 2591 + }, + { + "epoch": 0.6566181127295757, + "grad_norm": 3.7244527339935303, + "learning_rate": 9.8202534685158e-06, + "loss": 1.2207, + "step": 2592 + }, + { + "epoch": 0.656871437618746, + "grad_norm": 3.8072774410247803, + "learning_rate": 9.820030769380839e-06, + "loss": 1.0822, + "step": 2593 + }, + { + "epoch": 0.6571247625079164, + "grad_norm": 3.689227819442749, + "learning_rate": 9.819807934901673e-06, + "loss": 1.2445, + "step": 2594 + }, + { + "epoch": 0.6573780873970868, + "grad_norm": 3.8133480548858643, + "learning_rate": 9.81958496508456e-06, + "loss": 1.2767, + "step": 2595 + }, + { + "epoch": 0.6576314122862571, + "grad_norm": 3.3837785720825195, + "learning_rate": 9.81936185993576e-06, + "loss": 1.128, + "step": 2596 + }, + { + "epoch": 0.6578847371754275, + "grad_norm": 3.8632192611694336, + "learning_rate": 9.819138619461538e-06, + "loss": 1.281, + "step": 2597 + }, + { + "epoch": 0.6581380620645978, + "grad_norm": 3.616199254989624, + "learning_rate": 9.818915243668163e-06, + "loss": 1.1237, + "step": 2598 + }, + { + "epoch": 0.6583913869537682, + "grad_norm": 3.4493207931518555, + "learning_rate": 9.818691732561904e-06, + "loss": 1.1529, + "step": 2599 + }, + { + "epoch": 0.6586447118429386, + "grad_norm": 3.936377763748169, + "learning_rate": 9.818468086149041e-06, + "loss": 1.2162, + "step": 2600 + }, + { + "epoch": 0.6588980367321089, + "grad_norm": 3.9524548053741455, + "learning_rate": 9.818244304435853e-06, + "loss": 1.4033, + "step": 2601 + }, + { + "epoch": 0.6591513616212793, + "grad_norm": 3.612659215927124, + "learning_rate": 9.818020387428621e-06, + "loss": 1.1037, + "step": 2602 + }, + { + "epoch": 0.6594046865104497, + "grad_norm": 3.4523441791534424, + "learning_rate": 9.817796335133637e-06, + "loss": 1.1582, + "step": 2603 + }, + { + "epoch": 0.65965801139962, + "grad_norm": 3.481605052947998, + "learning_rate": 9.817572147557189e-06, + "loss": 1.1846, + "step": 2604 + }, + { + "epoch": 0.6599113362887904, + "grad_norm": 3.3964247703552246, + "learning_rate": 9.817347824705572e-06, + "loss": 1.2143, + "step": 2605 + }, + { + "epoch": 0.6601646611779607, + "grad_norm": 3.5022902488708496, + "learning_rate": 9.817123366585088e-06, + "loss": 1.1489, + "step": 2606 + }, + { + "epoch": 0.6604179860671311, + "grad_norm": 3.4793701171875, + "learning_rate": 9.816898773202037e-06, + "loss": 1.2056, + "step": 2607 + }, + { + "epoch": 0.6606713109563015, + "grad_norm": 3.454171657562256, + "learning_rate": 9.816674044562724e-06, + "loss": 1.1265, + "step": 2608 + }, + { + "epoch": 0.6609246358454718, + "grad_norm": 3.443490982055664, + "learning_rate": 9.816449180673465e-06, + "loss": 1.2059, + "step": 2609 + }, + { + "epoch": 0.6611779607346422, + "grad_norm": 3.181407928466797, + "learning_rate": 9.816224181540567e-06, + "loss": 1.0797, + "step": 2610 + }, + { + "epoch": 0.6614312856238126, + "grad_norm": 3.7836201190948486, + "learning_rate": 9.815999047170353e-06, + "loss": 1.2255, + "step": 2611 + }, + { + "epoch": 0.6616846105129829, + "grad_norm": 3.772714376449585, + "learning_rate": 9.815773777569142e-06, + "loss": 1.1066, + "step": 2612 + }, + { + "epoch": 0.6619379354021533, + "grad_norm": 3.495640754699707, + "learning_rate": 9.81554837274326e-06, + "loss": 1.3562, + "step": 2613 + }, + { + "epoch": 0.6621912602913236, + "grad_norm": 3.265284776687622, + "learning_rate": 9.815322832699036e-06, + "loss": 1.1142, + "step": 2614 + }, + { + "epoch": 0.662444585180494, + "grad_norm": 3.3903627395629883, + "learning_rate": 9.815097157442805e-06, + "loss": 1.1176, + "step": 2615 + }, + { + "epoch": 0.6626979100696644, + "grad_norm": 3.4801971912384033, + "learning_rate": 9.814871346980903e-06, + "loss": 1.3048, + "step": 2616 + }, + { + "epoch": 0.6629512349588347, + "grad_norm": 3.5183205604553223, + "learning_rate": 9.814645401319671e-06, + "loss": 1.1921, + "step": 2617 + }, + { + "epoch": 0.6632045598480051, + "grad_norm": 3.0776429176330566, + "learning_rate": 9.814419320465452e-06, + "loss": 1.1038, + "step": 2618 + }, + { + "epoch": 0.6634578847371754, + "grad_norm": 3.521026849746704, + "learning_rate": 9.814193104424593e-06, + "loss": 1.117, + "step": 2619 + }, + { + "epoch": 0.6637112096263458, + "grad_norm": 3.6802165508270264, + "learning_rate": 9.813966753203448e-06, + "loss": 1.2133, + "step": 2620 + }, + { + "epoch": 0.6639645345155162, + "grad_norm": 3.3408403396606445, + "learning_rate": 9.813740266808375e-06, + "loss": 1.211, + "step": 2621 + }, + { + "epoch": 0.6642178594046865, + "grad_norm": 3.791865110397339, + "learning_rate": 9.81351364524573e-06, + "loss": 1.2252, + "step": 2622 + }, + { + "epoch": 0.6644711842938569, + "grad_norm": 3.5978035926818848, + "learning_rate": 9.813286888521878e-06, + "loss": 1.2328, + "step": 2623 + }, + { + "epoch": 0.6647245091830273, + "grad_norm": 3.6417248249053955, + "learning_rate": 9.813059996643186e-06, + "loss": 1.2067, + "step": 2624 + }, + { + "epoch": 0.6649778340721976, + "grad_norm": 3.5736918449401855, + "learning_rate": 9.812832969616026e-06, + "loss": 1.1599, + "step": 2625 + }, + { + "epoch": 0.665231158961368, + "grad_norm": 3.390369176864624, + "learning_rate": 9.812605807446771e-06, + "loss": 1.2273, + "step": 2626 + }, + { + "epoch": 0.6654844838505383, + "grad_norm": 3.5803728103637695, + "learning_rate": 9.8123785101418e-06, + "loss": 1.1763, + "step": 2627 + }, + { + "epoch": 0.6657378087397087, + "grad_norm": 3.7515909671783447, + "learning_rate": 9.812151077707497e-06, + "loss": 1.2424, + "step": 2628 + }, + { + "epoch": 0.6659911336288791, + "grad_norm": 3.962169647216797, + "learning_rate": 9.811923510150248e-06, + "loss": 1.1588, + "step": 2629 + }, + { + "epoch": 0.6662444585180494, + "grad_norm": 3.5884110927581787, + "learning_rate": 9.81169580747644e-06, + "loss": 1.1604, + "step": 2630 + }, + { + "epoch": 0.6664977834072198, + "grad_norm": 3.8470706939697266, + "learning_rate": 9.81146796969247e-06, + "loss": 1.146, + "step": 2631 + }, + { + "epoch": 0.6667511082963902, + "grad_norm": 3.6632542610168457, + "learning_rate": 9.811239996804734e-06, + "loss": 1.1298, + "step": 2632 + }, + { + "epoch": 0.6670044331855605, + "grad_norm": 3.558820962905884, + "learning_rate": 9.811011888819632e-06, + "loss": 1.2359, + "step": 2633 + }, + { + "epoch": 0.6672577580747309, + "grad_norm": 3.3591184616088867, + "learning_rate": 9.810783645743573e-06, + "loss": 1.203, + "step": 2634 + }, + { + "epoch": 0.6675110829639012, + "grad_norm": 4.192767143249512, + "learning_rate": 9.810555267582964e-06, + "loss": 1.3851, + "step": 2635 + }, + { + "epoch": 0.6677644078530716, + "grad_norm": 3.6012911796569824, + "learning_rate": 9.810326754344217e-06, + "loss": 1.1344, + "step": 2636 + }, + { + "epoch": 0.668017732742242, + "grad_norm": 3.674480676651001, + "learning_rate": 9.81009810603375e-06, + "loss": 1.077, + "step": 2637 + }, + { + "epoch": 0.6682710576314123, + "grad_norm": 3.7993180751800537, + "learning_rate": 9.809869322657983e-06, + "loss": 1.2632, + "step": 2638 + }, + { + "epoch": 0.6685243825205827, + "grad_norm": 3.856072425842285, + "learning_rate": 9.809640404223338e-06, + "loss": 1.2595, + "step": 2639 + }, + { + "epoch": 0.668777707409753, + "grad_norm": 3.7589426040649414, + "learning_rate": 9.809411350736246e-06, + "loss": 1.2555, + "step": 2640 + }, + { + "epoch": 0.6690310322989234, + "grad_norm": 3.743173837661743, + "learning_rate": 9.809182162203137e-06, + "loss": 1.1436, + "step": 2641 + }, + { + "epoch": 0.6692843571880938, + "grad_norm": 3.676936149597168, + "learning_rate": 9.808952838630447e-06, + "loss": 1.1748, + "step": 2642 + }, + { + "epoch": 0.669537682077264, + "grad_norm": 3.525935173034668, + "learning_rate": 9.808723380024614e-06, + "loss": 1.1007, + "step": 2643 + }, + { + "epoch": 0.6697910069664345, + "grad_norm": 3.3500614166259766, + "learning_rate": 9.808493786392083e-06, + "loss": 1.1616, + "step": 2644 + }, + { + "epoch": 0.6700443318556049, + "grad_norm": 3.467604875564575, + "learning_rate": 9.8082640577393e-06, + "loss": 1.0956, + "step": 2645 + }, + { + "epoch": 0.6702976567447751, + "grad_norm": 3.592313051223755, + "learning_rate": 9.808034194072717e-06, + "loss": 1.2814, + "step": 2646 + }, + { + "epoch": 0.6705509816339456, + "grad_norm": 3.6698861122131348, + "learning_rate": 9.807804195398785e-06, + "loss": 1.1094, + "step": 2647 + }, + { + "epoch": 0.6708043065231158, + "grad_norm": 3.490830898284912, + "learning_rate": 9.807574061723966e-06, + "loss": 1.1319, + "step": 2648 + }, + { + "epoch": 0.6710576314122862, + "grad_norm": 3.7101504802703857, + "learning_rate": 9.80734379305472e-06, + "loss": 1.1763, + "step": 2649 + }, + { + "epoch": 0.6713109563014567, + "grad_norm": 3.7675065994262695, + "learning_rate": 9.807113389397514e-06, + "loss": 1.2364, + "step": 2650 + }, + { + "epoch": 0.6715642811906269, + "grad_norm": 3.990852117538452, + "learning_rate": 9.806882850758816e-06, + "loss": 1.276, + "step": 2651 + }, + { + "epoch": 0.6718176060797973, + "grad_norm": 3.472147226333618, + "learning_rate": 9.806652177145102e-06, + "loss": 1.146, + "step": 2652 + }, + { + "epoch": 0.6720709309689678, + "grad_norm": 3.1356961727142334, + "learning_rate": 9.806421368562847e-06, + "loss": 1.046, + "step": 2653 + }, + { + "epoch": 0.672324255858138, + "grad_norm": 3.3926939964294434, + "learning_rate": 9.806190425018531e-06, + "loss": 1.07, + "step": 2654 + }, + { + "epoch": 0.6725775807473084, + "grad_norm": 3.6589784622192383, + "learning_rate": 9.805959346518643e-06, + "loss": 1.1224, + "step": 2655 + }, + { + "epoch": 0.6728309056364787, + "grad_norm": 3.628971815109253, + "learning_rate": 9.805728133069667e-06, + "loss": 1.161, + "step": 2656 + }, + { + "epoch": 0.6730842305256491, + "grad_norm": 3.4346976280212402, + "learning_rate": 9.805496784678099e-06, + "loss": 1.1655, + "step": 2657 + }, + { + "epoch": 0.6733375554148195, + "grad_norm": 3.8982088565826416, + "learning_rate": 9.805265301350433e-06, + "loss": 1.1965, + "step": 2658 + }, + { + "epoch": 0.6735908803039898, + "grad_norm": 3.2711942195892334, + "learning_rate": 9.805033683093168e-06, + "loss": 1.2469, + "step": 2659 + }, + { + "epoch": 0.6738442051931602, + "grad_norm": 3.649364471435547, + "learning_rate": 9.804801929912811e-06, + "loss": 1.1161, + "step": 2660 + }, + { + "epoch": 0.6740975300823306, + "grad_norm": 3.8847033977508545, + "learning_rate": 9.804570041815866e-06, + "loss": 1.2566, + "step": 2661 + }, + { + "epoch": 0.6743508549715009, + "grad_norm": 3.7913880348205566, + "learning_rate": 9.804338018808847e-06, + "loss": 1.205, + "step": 2662 + }, + { + "epoch": 0.6746041798606713, + "grad_norm": 3.595766305923462, + "learning_rate": 9.80410586089827e-06, + "loss": 1.1123, + "step": 2663 + }, + { + "epoch": 0.6748575047498416, + "grad_norm": 3.2884340286254883, + "learning_rate": 9.80387356809065e-06, + "loss": 1.0508, + "step": 2664 + }, + { + "epoch": 0.675110829639012, + "grad_norm": 3.183701992034912, + "learning_rate": 9.80364114039251e-06, + "loss": 1.1189, + "step": 2665 + }, + { + "epoch": 0.6753641545281824, + "grad_norm": 3.781153917312622, + "learning_rate": 9.803408577810381e-06, + "loss": 1.3086, + "step": 2666 + }, + { + "epoch": 0.6756174794173527, + "grad_norm": 3.7944130897521973, + "learning_rate": 9.80317588035079e-06, + "loss": 1.1294, + "step": 2667 + }, + { + "epoch": 0.6758708043065231, + "grad_norm": 3.5396370887756348, + "learning_rate": 9.80294304802027e-06, + "loss": 1.186, + "step": 2668 + }, + { + "epoch": 0.6761241291956934, + "grad_norm": 3.7316062450408936, + "learning_rate": 9.802710080825362e-06, + "loss": 1.2823, + "step": 2669 + }, + { + "epoch": 0.6763774540848638, + "grad_norm": 3.5834367275238037, + "learning_rate": 9.802476978772604e-06, + "loss": 1.2484, + "step": 2670 + }, + { + "epoch": 0.6766307789740342, + "grad_norm": 3.470984697341919, + "learning_rate": 9.802243741868545e-06, + "loss": 1.026, + "step": 2671 + }, + { + "epoch": 0.6768841038632045, + "grad_norm": 3.6198978424072266, + "learning_rate": 9.80201037011973e-06, + "loss": 1.1535, + "step": 2672 + }, + { + "epoch": 0.6771374287523749, + "grad_norm": 3.994255304336548, + "learning_rate": 9.801776863532716e-06, + "loss": 1.1094, + "step": 2673 + }, + { + "epoch": 0.6773907536415453, + "grad_norm": 3.4337193965911865, + "learning_rate": 9.801543222114058e-06, + "loss": 1.0807, + "step": 2674 + }, + { + "epoch": 0.6776440785307156, + "grad_norm": 3.4465723037719727, + "learning_rate": 9.801309445870317e-06, + "loss": 1.1684, + "step": 2675 + }, + { + "epoch": 0.677897403419886, + "grad_norm": 3.2914836406707764, + "learning_rate": 9.801075534808058e-06, + "loss": 1.0535, + "step": 2676 + }, + { + "epoch": 0.6781507283090563, + "grad_norm": 3.766895294189453, + "learning_rate": 9.800841488933846e-06, + "loss": 1.2471, + "step": 2677 + }, + { + "epoch": 0.6784040531982267, + "grad_norm": 3.9472076892852783, + "learning_rate": 9.800607308254254e-06, + "loss": 1.2071, + "step": 2678 + }, + { + "epoch": 0.6786573780873971, + "grad_norm": 3.281208038330078, + "learning_rate": 9.800372992775862e-06, + "loss": 1.1713, + "step": 2679 + }, + { + "epoch": 0.6789107029765674, + "grad_norm": 3.465557336807251, + "learning_rate": 9.800138542505247e-06, + "loss": 1.2152, + "step": 2680 + }, + { + "epoch": 0.6791640278657378, + "grad_norm": 3.6775200366973877, + "learning_rate": 9.799903957448988e-06, + "loss": 1.1666, + "step": 2681 + }, + { + "epoch": 0.6794173527549082, + "grad_norm": 3.829551935195923, + "learning_rate": 9.799669237613678e-06, + "loss": 1.0608, + "step": 2682 + }, + { + "epoch": 0.6796706776440785, + "grad_norm": 3.5077948570251465, + "learning_rate": 9.799434383005904e-06, + "loss": 1.142, + "step": 2683 + }, + { + "epoch": 0.6799240025332489, + "grad_norm": 3.613893508911133, + "learning_rate": 9.799199393632262e-06, + "loss": 1.1488, + "step": 2684 + }, + { + "epoch": 0.6801773274224192, + "grad_norm": 3.767568826675415, + "learning_rate": 9.798964269499348e-06, + "loss": 1.343, + "step": 2685 + }, + { + "epoch": 0.6804306523115896, + "grad_norm": 3.9153528213500977, + "learning_rate": 9.79872901061377e-06, + "loss": 1.1427, + "step": 2686 + }, + { + "epoch": 0.68068397720076, + "grad_norm": 3.626418113708496, + "learning_rate": 9.79849361698213e-06, + "loss": 1.1733, + "step": 2687 + }, + { + "epoch": 0.6809373020899303, + "grad_norm": 3.3231289386749268, + "learning_rate": 9.798258088611036e-06, + "loss": 1.0894, + "step": 2688 + }, + { + "epoch": 0.6811906269791007, + "grad_norm": 3.4808497428894043, + "learning_rate": 9.798022425507106e-06, + "loss": 1.1191, + "step": 2689 + }, + { + "epoch": 0.681443951868271, + "grad_norm": 3.5405893325805664, + "learning_rate": 9.797786627676955e-06, + "loss": 1.2735, + "step": 2690 + }, + { + "epoch": 0.6816972767574414, + "grad_norm": 3.6188483238220215, + "learning_rate": 9.797550695127203e-06, + "loss": 1.2375, + "step": 2691 + }, + { + "epoch": 0.6819506016466118, + "grad_norm": 3.759868860244751, + "learning_rate": 9.797314627864477e-06, + "loss": 1.2528, + "step": 2692 + }, + { + "epoch": 0.6822039265357821, + "grad_norm": 3.7572758197784424, + "learning_rate": 9.797078425895403e-06, + "loss": 1.2812, + "step": 2693 + }, + { + "epoch": 0.6824572514249525, + "grad_norm": 3.654799222946167, + "learning_rate": 9.796842089226616e-06, + "loss": 1.2315, + "step": 2694 + }, + { + "epoch": 0.6827105763141229, + "grad_norm": 3.8572299480438232, + "learning_rate": 9.796605617864752e-06, + "loss": 1.1414, + "step": 2695 + }, + { + "epoch": 0.6829639012032932, + "grad_norm": 3.588945150375366, + "learning_rate": 9.796369011816449e-06, + "loss": 1.128, + "step": 2696 + }, + { + "epoch": 0.6832172260924636, + "grad_norm": 3.4814610481262207, + "learning_rate": 9.796132271088351e-06, + "loss": 1.1898, + "step": 2697 + }, + { + "epoch": 0.6834705509816339, + "grad_norm": 3.417701482772827, + "learning_rate": 9.79589539568711e-06, + "loss": 1.1112, + "step": 2698 + }, + { + "epoch": 0.6837238758708043, + "grad_norm": 3.6168627738952637, + "learning_rate": 9.795658385619373e-06, + "loss": 1.2118, + "step": 2699 + }, + { + "epoch": 0.6839772007599747, + "grad_norm": 3.409858226776123, + "learning_rate": 9.795421240891795e-06, + "loss": 1.0221, + "step": 2700 + }, + { + "epoch": 0.684230525649145, + "grad_norm": 3.630058765411377, + "learning_rate": 9.795183961511037e-06, + "loss": 1.1802, + "step": 2701 + }, + { + "epoch": 0.6844838505383154, + "grad_norm": 3.1784121990203857, + "learning_rate": 9.79494654748376e-06, + "loss": 1.1309, + "step": 2702 + }, + { + "epoch": 0.6847371754274858, + "grad_norm": 3.5996437072753906, + "learning_rate": 9.794708998816631e-06, + "loss": 1.2083, + "step": 2703 + }, + { + "epoch": 0.6849905003166561, + "grad_norm": 3.6962954998016357, + "learning_rate": 9.794471315516322e-06, + "loss": 1.2055, + "step": 2704 + }, + { + "epoch": 0.6852438252058265, + "grad_norm": 3.6837923526763916, + "learning_rate": 9.794233497589505e-06, + "loss": 1.1432, + "step": 2705 + }, + { + "epoch": 0.6854971500949968, + "grad_norm": 3.7618539333343506, + "learning_rate": 9.793995545042856e-06, + "loss": 1.2378, + "step": 2706 + }, + { + "epoch": 0.6857504749841672, + "grad_norm": 3.795304536819458, + "learning_rate": 9.793757457883062e-06, + "loss": 1.2583, + "step": 2707 + }, + { + "epoch": 0.6860037998733376, + "grad_norm": 3.4354283809661865, + "learning_rate": 9.793519236116804e-06, + "loss": 1.1152, + "step": 2708 + }, + { + "epoch": 0.6862571247625079, + "grad_norm": 3.2153573036193848, + "learning_rate": 9.793280879750772e-06, + "loss": 1.0758, + "step": 2709 + }, + { + "epoch": 0.6865104496516783, + "grad_norm": 3.977475881576538, + "learning_rate": 9.79304238879166e-06, + "loss": 1.2517, + "step": 2710 + }, + { + "epoch": 0.6867637745408487, + "grad_norm": 3.4537222385406494, + "learning_rate": 9.792803763246166e-06, + "loss": 1.2322, + "step": 2711 + }, + { + "epoch": 0.687017099430019, + "grad_norm": 3.520698070526123, + "learning_rate": 9.792565003120987e-06, + "loss": 1.1549, + "step": 2712 + }, + { + "epoch": 0.6872704243191894, + "grad_norm": 3.863520860671997, + "learning_rate": 9.792326108422827e-06, + "loss": 1.1825, + "step": 2713 + }, + { + "epoch": 0.6875237492083597, + "grad_norm": 3.8098387718200684, + "learning_rate": 9.792087079158399e-06, + "loss": 1.2562, + "step": 2714 + }, + { + "epoch": 0.6877770740975301, + "grad_norm": 3.588688373565674, + "learning_rate": 9.79184791533441e-06, + "loss": 1.1739, + "step": 2715 + }, + { + "epoch": 0.6880303989867005, + "grad_norm": 3.77241849899292, + "learning_rate": 9.791608616957577e-06, + "loss": 1.1563, + "step": 2716 + }, + { + "epoch": 0.6882837238758708, + "grad_norm": 3.7428314685821533, + "learning_rate": 9.79136918403462e-06, + "loss": 1.1555, + "step": 2717 + }, + { + "epoch": 0.6885370487650412, + "grad_norm": 3.3901526927948, + "learning_rate": 9.791129616572263e-06, + "loss": 1.0641, + "step": 2718 + }, + { + "epoch": 0.6887903736542115, + "grad_norm": 3.3258988857269287, + "learning_rate": 9.790889914577231e-06, + "loss": 0.9971, + "step": 2719 + }, + { + "epoch": 0.6890436985433819, + "grad_norm": 3.6803858280181885, + "learning_rate": 9.790650078056257e-06, + "loss": 1.1599, + "step": 2720 + }, + { + "epoch": 0.6892970234325523, + "grad_norm": 3.4460649490356445, + "learning_rate": 9.790410107016073e-06, + "loss": 1.068, + "step": 2721 + }, + { + "epoch": 0.6895503483217226, + "grad_norm": 3.723104476928711, + "learning_rate": 9.790170001463417e-06, + "loss": 1.1796, + "step": 2722 + }, + { + "epoch": 0.689803673210893, + "grad_norm": 3.571429491043091, + "learning_rate": 9.789929761405035e-06, + "loss": 1.1815, + "step": 2723 + }, + { + "epoch": 0.6900569981000634, + "grad_norm": 3.5266213417053223, + "learning_rate": 9.789689386847667e-06, + "loss": 1.2039, + "step": 2724 + }, + { + "epoch": 0.6903103229892337, + "grad_norm": 3.419090986251831, + "learning_rate": 9.78944887779807e-06, + "loss": 1.1417, + "step": 2725 + }, + { + "epoch": 0.6905636478784041, + "grad_norm": 3.677717447280884, + "learning_rate": 9.78920823426299e-06, + "loss": 1.2448, + "step": 2726 + }, + { + "epoch": 0.6908169727675744, + "grad_norm": 3.6117289066314697, + "learning_rate": 9.788967456249188e-06, + "loss": 1.2947, + "step": 2727 + }, + { + "epoch": 0.6910702976567448, + "grad_norm": 3.5542232990264893, + "learning_rate": 9.788726543763425e-06, + "loss": 1.1518, + "step": 2728 + }, + { + "epoch": 0.6913236225459152, + "grad_norm": 3.5767104625701904, + "learning_rate": 9.788485496812464e-06, + "loss": 1.1689, + "step": 2729 + }, + { + "epoch": 0.6915769474350855, + "grad_norm": 3.090622663497925, + "learning_rate": 9.788244315403075e-06, + "loss": 1.0362, + "step": 2730 + }, + { + "epoch": 0.6918302723242559, + "grad_norm": 3.714952230453491, + "learning_rate": 9.78800299954203e-06, + "loss": 1.3668, + "step": 2731 + }, + { + "epoch": 0.6920835972134263, + "grad_norm": 3.6238505840301514, + "learning_rate": 9.787761549236105e-06, + "loss": 1.245, + "step": 2732 + }, + { + "epoch": 0.6923369221025966, + "grad_norm": 3.8209900856018066, + "learning_rate": 9.787519964492081e-06, + "loss": 1.3525, + "step": 2733 + }, + { + "epoch": 0.692590246991767, + "grad_norm": 3.626279830932617, + "learning_rate": 9.787278245316739e-06, + "loss": 1.1803, + "step": 2734 + }, + { + "epoch": 0.6928435718809373, + "grad_norm": 3.9143478870391846, + "learning_rate": 9.787036391716866e-06, + "loss": 1.2359, + "step": 2735 + }, + { + "epoch": 0.6930968967701077, + "grad_norm": 3.4293344020843506, + "learning_rate": 9.786794403699256e-06, + "loss": 1.1852, + "step": 2736 + }, + { + "epoch": 0.6933502216592781, + "grad_norm": 3.490367889404297, + "learning_rate": 9.786552281270701e-06, + "loss": 1.1433, + "step": 2737 + }, + { + "epoch": 0.6936035465484484, + "grad_norm": 3.5443949699401855, + "learning_rate": 9.786310024438004e-06, + "loss": 1.333, + "step": 2738 + }, + { + "epoch": 0.6938568714376188, + "grad_norm": 3.587047576904297, + "learning_rate": 9.786067633207963e-06, + "loss": 1.1288, + "step": 2739 + }, + { + "epoch": 0.694110196326789, + "grad_norm": 3.672039747238159, + "learning_rate": 9.785825107587386e-06, + "loss": 1.1176, + "step": 2740 + }, + { + "epoch": 0.6943635212159595, + "grad_norm": 4.216299057006836, + "learning_rate": 9.785582447583084e-06, + "loss": 1.3564, + "step": 2741 + }, + { + "epoch": 0.6946168461051299, + "grad_norm": 3.2558181285858154, + "learning_rate": 9.785339653201869e-06, + "loss": 1.1666, + "step": 2742 + }, + { + "epoch": 0.6948701709943002, + "grad_norm": 3.5010135173797607, + "learning_rate": 9.78509672445056e-06, + "loss": 1.2109, + "step": 2743 + }, + { + "epoch": 0.6951234958834706, + "grad_norm": 3.6624481678009033, + "learning_rate": 9.784853661335976e-06, + "loss": 1.2024, + "step": 2744 + }, + { + "epoch": 0.695376820772641, + "grad_norm": 3.223618268966675, + "learning_rate": 9.784610463864946e-06, + "loss": 1.1214, + "step": 2745 + }, + { + "epoch": 0.6956301456618112, + "grad_norm": 3.5028629302978516, + "learning_rate": 9.784367132044295e-06, + "loss": 1.2009, + "step": 2746 + }, + { + "epoch": 0.6958834705509817, + "grad_norm": 3.3813838958740234, + "learning_rate": 9.784123665880858e-06, + "loss": 1.1444, + "step": 2747 + }, + { + "epoch": 0.696136795440152, + "grad_norm": 3.6025495529174805, + "learning_rate": 9.78388006538147e-06, + "loss": 1.2635, + "step": 2748 + }, + { + "epoch": 0.6963901203293223, + "grad_norm": 4.06139612197876, + "learning_rate": 9.783636330552972e-06, + "loss": 1.2937, + "step": 2749 + }, + { + "epoch": 0.6966434452184928, + "grad_norm": 3.7361483573913574, + "learning_rate": 9.783392461402208e-06, + "loss": 1.331, + "step": 2750 + }, + { + "epoch": 0.696896770107663, + "grad_norm": 3.60343074798584, + "learning_rate": 9.783148457936028e-06, + "loss": 1.1484, + "step": 2751 + }, + { + "epoch": 0.6971500949968334, + "grad_norm": 3.524378776550293, + "learning_rate": 9.782904320161278e-06, + "loss": 1.3156, + "step": 2752 + }, + { + "epoch": 0.6974034198860039, + "grad_norm": 3.6785829067230225, + "learning_rate": 9.782660048084816e-06, + "loss": 1.103, + "step": 2753 + }, + { + "epoch": 0.6976567447751741, + "grad_norm": 3.7482783794403076, + "learning_rate": 9.782415641713503e-06, + "loss": 1.207, + "step": 2754 + }, + { + "epoch": 0.6979100696643445, + "grad_norm": 3.5252530574798584, + "learning_rate": 9.7821711010542e-06, + "loss": 1.2077, + "step": 2755 + }, + { + "epoch": 0.6981633945535148, + "grad_norm": 3.8628737926483154, + "learning_rate": 9.781926426113773e-06, + "loss": 1.2113, + "step": 2756 + }, + { + "epoch": 0.6984167194426852, + "grad_norm": 3.483025074005127, + "learning_rate": 9.781681616899094e-06, + "loss": 1.1702, + "step": 2757 + }, + { + "epoch": 0.6986700443318556, + "grad_norm": 3.4358625411987305, + "learning_rate": 9.781436673417035e-06, + "loss": 1.1687, + "step": 2758 + }, + { + "epoch": 0.6989233692210259, + "grad_norm": 3.8206863403320312, + "learning_rate": 9.781191595674476e-06, + "loss": 1.3216, + "step": 2759 + }, + { + "epoch": 0.6991766941101963, + "grad_norm": 3.408923625946045, + "learning_rate": 9.780946383678297e-06, + "loss": 1.0955, + "step": 2760 + }, + { + "epoch": 0.6994300189993667, + "grad_norm": 3.7028443813323975, + "learning_rate": 9.780701037435386e-06, + "loss": 1.2207, + "step": 2761 + }, + { + "epoch": 0.699683343888537, + "grad_norm": 3.5919699668884277, + "learning_rate": 9.78045555695263e-06, + "loss": 1.2388, + "step": 2762 + }, + { + "epoch": 0.6999366687777074, + "grad_norm": 3.6181886196136475, + "learning_rate": 9.780209942236923e-06, + "loss": 1.2301, + "step": 2763 + }, + { + "epoch": 0.7001899936668777, + "grad_norm": 3.928760528564453, + "learning_rate": 9.77996419329516e-06, + "loss": 1.3502, + "step": 2764 + }, + { + "epoch": 0.7004433185560481, + "grad_norm": 3.586662769317627, + "learning_rate": 9.779718310134242e-06, + "loss": 1.2067, + "step": 2765 + }, + { + "epoch": 0.7006966434452185, + "grad_norm": 3.925246000289917, + "learning_rate": 9.779472292761075e-06, + "loss": 1.3005, + "step": 2766 + }, + { + "epoch": 0.7009499683343888, + "grad_norm": 3.557842493057251, + "learning_rate": 9.779226141182566e-06, + "loss": 1.1391, + "step": 2767 + }, + { + "epoch": 0.7012032932235592, + "grad_norm": 3.4727470874786377, + "learning_rate": 9.778979855405627e-06, + "loss": 1.0853, + "step": 2768 + }, + { + "epoch": 0.7014566181127295, + "grad_norm": 3.460678815841675, + "learning_rate": 9.778733435437174e-06, + "loss": 1.2103, + "step": 2769 + }, + { + "epoch": 0.7017099430018999, + "grad_norm": 3.586535930633545, + "learning_rate": 9.778486881284123e-06, + "loss": 1.2026, + "step": 2770 + }, + { + "epoch": 0.7019632678910703, + "grad_norm": 3.348304033279419, + "learning_rate": 9.778240192953402e-06, + "loss": 1.1491, + "step": 2771 + }, + { + "epoch": 0.7022165927802406, + "grad_norm": 3.8354218006134033, + "learning_rate": 9.777993370451938e-06, + "loss": 1.2024, + "step": 2772 + }, + { + "epoch": 0.702469917669411, + "grad_norm": 3.896153211593628, + "learning_rate": 9.777746413786657e-06, + "loss": 1.0675, + "step": 2773 + }, + { + "epoch": 0.7027232425585814, + "grad_norm": 3.4157509803771973, + "learning_rate": 9.777499322964496e-06, + "loss": 1.1816, + "step": 2774 + }, + { + "epoch": 0.7029765674477517, + "grad_norm": 3.68452525138855, + "learning_rate": 9.777252097992394e-06, + "loss": 1.3039, + "step": 2775 + }, + { + "epoch": 0.7032298923369221, + "grad_norm": 3.712641954421997, + "learning_rate": 9.777004738877291e-06, + "loss": 1.3429, + "step": 2776 + }, + { + "epoch": 0.7034832172260924, + "grad_norm": 3.383347272872925, + "learning_rate": 9.776757245626134e-06, + "loss": 1.072, + "step": 2777 + }, + { + "epoch": 0.7037365421152628, + "grad_norm": 3.540350914001465, + "learning_rate": 9.776509618245874e-06, + "loss": 1.2046, + "step": 2778 + }, + { + "epoch": 0.7039898670044332, + "grad_norm": 3.4994916915893555, + "learning_rate": 9.776261856743462e-06, + "loss": 1.1806, + "step": 2779 + }, + { + "epoch": 0.7042431918936035, + "grad_norm": 3.762624740600586, + "learning_rate": 9.776013961125852e-06, + "loss": 1.1689, + "step": 2780 + }, + { + "epoch": 0.7044965167827739, + "grad_norm": 3.590169668197632, + "learning_rate": 9.775765931400012e-06, + "loss": 1.2333, + "step": 2781 + }, + { + "epoch": 0.7047498416719443, + "grad_norm": 3.553405284881592, + "learning_rate": 9.775517767572901e-06, + "loss": 1.1121, + "step": 2782 + }, + { + "epoch": 0.7050031665611146, + "grad_norm": 3.6617019176483154, + "learning_rate": 9.775269469651492e-06, + "loss": 1.2301, + "step": 2783 + }, + { + "epoch": 0.705256491450285, + "grad_norm": 3.711812973022461, + "learning_rate": 9.775021037642752e-06, + "loss": 1.2415, + "step": 2784 + }, + { + "epoch": 0.7055098163394553, + "grad_norm": 4.336576461791992, + "learning_rate": 9.77477247155366e-06, + "loss": 1.2206, + "step": 2785 + }, + { + "epoch": 0.7057631412286257, + "grad_norm": 3.625121831893921, + "learning_rate": 9.774523771391196e-06, + "loss": 1.1155, + "step": 2786 + }, + { + "epoch": 0.7060164661177961, + "grad_norm": 3.3995773792266846, + "learning_rate": 9.77427493716234e-06, + "loss": 1.0699, + "step": 2787 + }, + { + "epoch": 0.7062697910069664, + "grad_norm": 3.3976480960845947, + "learning_rate": 9.774025968874083e-06, + "loss": 1.0927, + "step": 2788 + }, + { + "epoch": 0.7065231158961368, + "grad_norm": 3.8355326652526855, + "learning_rate": 9.773776866533413e-06, + "loss": 1.2444, + "step": 2789 + }, + { + "epoch": 0.7067764407853071, + "grad_norm": 3.6447906494140625, + "learning_rate": 9.773527630147326e-06, + "loss": 1.1609, + "step": 2790 + }, + { + "epoch": 0.7070297656744775, + "grad_norm": 3.718270778656006, + "learning_rate": 9.773278259722822e-06, + "loss": 1.1833, + "step": 2791 + }, + { + "epoch": 0.7072830905636479, + "grad_norm": 3.87910795211792, + "learning_rate": 9.7730287552669e-06, + "loss": 1.3377, + "step": 2792 + }, + { + "epoch": 0.7075364154528182, + "grad_norm": 3.5926077365875244, + "learning_rate": 9.772779116786568e-06, + "loss": 1.1916, + "step": 2793 + }, + { + "epoch": 0.7077897403419886, + "grad_norm": 3.551798105239868, + "learning_rate": 9.772529344288836e-06, + "loss": 1.1198, + "step": 2794 + }, + { + "epoch": 0.708043065231159, + "grad_norm": 3.351592779159546, + "learning_rate": 9.772279437780716e-06, + "loss": 1.1733, + "step": 2795 + }, + { + "epoch": 0.7082963901203293, + "grad_norm": 3.3235034942626953, + "learning_rate": 9.772029397269226e-06, + "loss": 1.0887, + "step": 2796 + }, + { + "epoch": 0.7085497150094997, + "grad_norm": 3.4611363410949707, + "learning_rate": 9.771779222761389e-06, + "loss": 1.1947, + "step": 2797 + }, + { + "epoch": 0.70880303989867, + "grad_norm": 3.4118261337280273, + "learning_rate": 9.771528914264225e-06, + "loss": 1.1529, + "step": 2798 + }, + { + "epoch": 0.7090563647878404, + "grad_norm": 3.7440123558044434, + "learning_rate": 9.771278471784767e-06, + "loss": 1.1056, + "step": 2799 + }, + { + "epoch": 0.7093096896770108, + "grad_norm": 3.0236973762512207, + "learning_rate": 9.771027895330045e-06, + "loss": 1.0729, + "step": 2800 + }, + { + "epoch": 0.7095630145661811, + "grad_norm": 3.7565345764160156, + "learning_rate": 9.770777184907096e-06, + "loss": 1.2416, + "step": 2801 + }, + { + "epoch": 0.7098163394553515, + "grad_norm": 3.542041063308716, + "learning_rate": 9.770526340522959e-06, + "loss": 1.1375, + "step": 2802 + }, + { + "epoch": 0.7100696643445219, + "grad_norm": 3.2771825790405273, + "learning_rate": 9.77027536218468e-06, + "loss": 1.1273, + "step": 2803 + }, + { + "epoch": 0.7103229892336922, + "grad_norm": 3.5278711318969727, + "learning_rate": 9.770024249899302e-06, + "loss": 1.0621, + "step": 2804 + }, + { + "epoch": 0.7105763141228626, + "grad_norm": 3.2327733039855957, + "learning_rate": 9.769773003673882e-06, + "loss": 1.1363, + "step": 2805 + }, + { + "epoch": 0.7108296390120329, + "grad_norm": 3.598832130432129, + "learning_rate": 9.769521623515468e-06, + "loss": 1.1317, + "step": 2806 + }, + { + "epoch": 0.7110829639012033, + "grad_norm": 3.827223300933838, + "learning_rate": 9.769270109431123e-06, + "loss": 1.2074, + "step": 2807 + }, + { + "epoch": 0.7113362887903737, + "grad_norm": 3.5710222721099854, + "learning_rate": 9.76901846142791e-06, + "loss": 1.1924, + "step": 2808 + }, + { + "epoch": 0.711589613679544, + "grad_norm": 3.319575786590576, + "learning_rate": 9.768766679512894e-06, + "loss": 1.1732, + "step": 2809 + }, + { + "epoch": 0.7118429385687144, + "grad_norm": 3.6563777923583984, + "learning_rate": 9.768514763693143e-06, + "loss": 1.2641, + "step": 2810 + }, + { + "epoch": 0.7120962634578847, + "grad_norm": 3.330498695373535, + "learning_rate": 9.768262713975734e-06, + "loss": 1.1456, + "step": 2811 + }, + { + "epoch": 0.7123495883470551, + "grad_norm": 3.920604705810547, + "learning_rate": 9.768010530367741e-06, + "loss": 1.3083, + "step": 2812 + }, + { + "epoch": 0.7126029132362255, + "grad_norm": 3.7744476795196533, + "learning_rate": 9.767758212876247e-06, + "loss": 1.1667, + "step": 2813 + }, + { + "epoch": 0.7128562381253958, + "grad_norm": 3.3494551181793213, + "learning_rate": 9.767505761508338e-06, + "loss": 1.1604, + "step": 2814 + }, + { + "epoch": 0.7131095630145662, + "grad_norm": 3.6803066730499268, + "learning_rate": 9.767253176271104e-06, + "loss": 1.1327, + "step": 2815 + }, + { + "epoch": 0.7133628879037366, + "grad_norm": 3.979126214981079, + "learning_rate": 9.767000457171632e-06, + "loss": 1.2729, + "step": 2816 + }, + { + "epoch": 0.7136162127929069, + "grad_norm": 3.2043516635894775, + "learning_rate": 9.76674760421702e-06, + "loss": 1.1035, + "step": 2817 + }, + { + "epoch": 0.7138695376820773, + "grad_norm": 3.6564838886260986, + "learning_rate": 9.766494617414373e-06, + "loss": 1.2312, + "step": 2818 + }, + { + "epoch": 0.7141228625712476, + "grad_norm": 3.200101137161255, + "learning_rate": 9.766241496770787e-06, + "loss": 1.1007, + "step": 2819 + }, + { + "epoch": 0.714376187460418, + "grad_norm": 3.8117692470550537, + "learning_rate": 9.765988242293378e-06, + "loss": 1.2331, + "step": 2820 + }, + { + "epoch": 0.7146295123495884, + "grad_norm": 3.5853116512298584, + "learning_rate": 9.765734853989251e-06, + "loss": 1.2502, + "step": 2821 + }, + { + "epoch": 0.7148828372387587, + "grad_norm": 3.3093082904815674, + "learning_rate": 9.765481331865521e-06, + "loss": 1.1271, + "step": 2822 + }, + { + "epoch": 0.7151361621279291, + "grad_norm": 3.530261516571045, + "learning_rate": 9.76522767592931e-06, + "loss": 1.3105, + "step": 2823 + }, + { + "epoch": 0.7153894870170995, + "grad_norm": 3.4714860916137695, + "learning_rate": 9.764973886187741e-06, + "loss": 1.1998, + "step": 2824 + }, + { + "epoch": 0.7156428119062698, + "grad_norm": 3.621464252471924, + "learning_rate": 9.764719962647937e-06, + "loss": 1.2733, + "step": 2825 + }, + { + "epoch": 0.7158961367954402, + "grad_norm": 3.5986428260803223, + "learning_rate": 9.764465905317029e-06, + "loss": 1.3084, + "step": 2826 + }, + { + "epoch": 0.7161494616846105, + "grad_norm": 3.1292741298675537, + "learning_rate": 9.764211714202152e-06, + "loss": 1.1816, + "step": 2827 + }, + { + "epoch": 0.7164027865737809, + "grad_norm": 3.384155511856079, + "learning_rate": 9.763957389310443e-06, + "loss": 1.1063, + "step": 2828 + }, + { + "epoch": 0.7166561114629513, + "grad_norm": 3.7090487480163574, + "learning_rate": 9.763702930649045e-06, + "loss": 1.1882, + "step": 2829 + }, + { + "epoch": 0.7169094363521216, + "grad_norm": 3.265096426010132, + "learning_rate": 9.763448338225098e-06, + "loss": 1.059, + "step": 2830 + }, + { + "epoch": 0.717162761241292, + "grad_norm": 3.665614604949951, + "learning_rate": 9.763193612045756e-06, + "loss": 1.1052, + "step": 2831 + }, + { + "epoch": 0.7174160861304624, + "grad_norm": 3.465980291366577, + "learning_rate": 9.762938752118169e-06, + "loss": 1.1833, + "step": 2832 + }, + { + "epoch": 0.7176694110196327, + "grad_norm": 3.3577919006347656, + "learning_rate": 9.762683758449495e-06, + "loss": 1.0934, + "step": 2833 + }, + { + "epoch": 0.7179227359088031, + "grad_norm": 3.931393623352051, + "learning_rate": 9.762428631046893e-06, + "loss": 1.255, + "step": 2834 + }, + { + "epoch": 0.7181760607979734, + "grad_norm": 3.6076884269714355, + "learning_rate": 9.762173369917527e-06, + "loss": 1.2592, + "step": 2835 + }, + { + "epoch": 0.7184293856871438, + "grad_norm": 3.668667793273926, + "learning_rate": 9.761917975068564e-06, + "loss": 1.2569, + "step": 2836 + }, + { + "epoch": 0.7186827105763142, + "grad_norm": 3.767733097076416, + "learning_rate": 9.761662446507177e-06, + "loss": 1.1592, + "step": 2837 + }, + { + "epoch": 0.7189360354654845, + "grad_norm": 3.57222056388855, + "learning_rate": 9.761406784240539e-06, + "loss": 1.242, + "step": 2838 + }, + { + "epoch": 0.7191893603546549, + "grad_norm": 4.031567573547363, + "learning_rate": 9.76115098827583e-06, + "loss": 1.2613, + "step": 2839 + }, + { + "epoch": 0.7194426852438252, + "grad_norm": 3.574033260345459, + "learning_rate": 9.760895058620236e-06, + "loss": 1.3366, + "step": 2840 + }, + { + "epoch": 0.7196960101329956, + "grad_norm": 3.2835404872894287, + "learning_rate": 9.760638995280938e-06, + "loss": 1.0849, + "step": 2841 + }, + { + "epoch": 0.719949335022166, + "grad_norm": 3.594667911529541, + "learning_rate": 9.760382798265127e-06, + "loss": 1.2336, + "step": 2842 + }, + { + "epoch": 0.7202026599113363, + "grad_norm": 3.377202272415161, + "learning_rate": 9.76012646758e-06, + "loss": 1.1906, + "step": 2843 + }, + { + "epoch": 0.7204559848005067, + "grad_norm": 3.599886178970337, + "learning_rate": 9.759870003232751e-06, + "loss": 1.3008, + "step": 2844 + }, + { + "epoch": 0.7207093096896771, + "grad_norm": 3.5252554416656494, + "learning_rate": 9.759613405230583e-06, + "loss": 1.2417, + "step": 2845 + }, + { + "epoch": 0.7209626345788474, + "grad_norm": 3.7451412677764893, + "learning_rate": 9.759356673580703e-06, + "loss": 1.2919, + "step": 2846 + }, + { + "epoch": 0.7212159594680178, + "grad_norm": 3.5595405101776123, + "learning_rate": 9.759099808290318e-06, + "loss": 1.0684, + "step": 2847 + }, + { + "epoch": 0.721469284357188, + "grad_norm": 3.7966434955596924, + "learning_rate": 9.75884280936664e-06, + "loss": 1.2092, + "step": 2848 + }, + { + "epoch": 0.7217226092463584, + "grad_norm": 3.775515556335449, + "learning_rate": 9.758585676816888e-06, + "loss": 1.2583, + "step": 2849 + }, + { + "epoch": 0.7219759341355289, + "grad_norm": 3.613001585006714, + "learning_rate": 9.758328410648277e-06, + "loss": 1.2235, + "step": 2850 + }, + { + "epoch": 0.7222292590246991, + "grad_norm": 3.4368202686309814, + "learning_rate": 9.758071010868037e-06, + "loss": 1.2107, + "step": 2851 + }, + { + "epoch": 0.7224825839138695, + "grad_norm": 3.6963143348693848, + "learning_rate": 9.757813477483393e-06, + "loss": 1.1236, + "step": 2852 + }, + { + "epoch": 0.72273590880304, + "grad_norm": 3.0793519020080566, + "learning_rate": 9.757555810501575e-06, + "loss": 1.1591, + "step": 2853 + }, + { + "epoch": 0.7229892336922102, + "grad_norm": 3.4738950729370117, + "learning_rate": 9.757298009929822e-06, + "loss": 1.2413, + "step": 2854 + }, + { + "epoch": 0.7232425585813806, + "grad_norm": 3.767167568206787, + "learning_rate": 9.757040075775369e-06, + "loss": 1.1664, + "step": 2855 + }, + { + "epoch": 0.7234958834705509, + "grad_norm": 3.5329854488372803, + "learning_rate": 9.75678200804546e-06, + "loss": 1.1411, + "step": 2856 + }, + { + "epoch": 0.7237492083597213, + "grad_norm": 3.2329213619232178, + "learning_rate": 9.75652380674734e-06, + "loss": 1.0596, + "step": 2857 + }, + { + "epoch": 0.7240025332488917, + "grad_norm": 3.5320141315460205, + "learning_rate": 9.756265471888263e-06, + "loss": 1.2004, + "step": 2858 + }, + { + "epoch": 0.724255858138062, + "grad_norm": 3.182819366455078, + "learning_rate": 9.75600700347548e-06, + "loss": 1.0744, + "step": 2859 + }, + { + "epoch": 0.7245091830272324, + "grad_norm": 3.5611660480499268, + "learning_rate": 9.755748401516252e-06, + "loss": 1.1452, + "step": 2860 + }, + { + "epoch": 0.7247625079164027, + "grad_norm": 3.2804763317108154, + "learning_rate": 9.755489666017834e-06, + "loss": 1.0568, + "step": 2861 + }, + { + "epoch": 0.7250158328055731, + "grad_norm": 3.84488582611084, + "learning_rate": 9.755230796987496e-06, + "loss": 1.2335, + "step": 2862 + }, + { + "epoch": 0.7252691576947435, + "grad_norm": 4.201401233673096, + "learning_rate": 9.754971794432506e-06, + "loss": 1.1728, + "step": 2863 + }, + { + "epoch": 0.7255224825839138, + "grad_norm": 3.7625200748443604, + "learning_rate": 9.754712658360137e-06, + "loss": 1.3335, + "step": 2864 + }, + { + "epoch": 0.7257758074730842, + "grad_norm": 3.607170820236206, + "learning_rate": 9.754453388777665e-06, + "loss": 1.3213, + "step": 2865 + }, + { + "epoch": 0.7260291323622546, + "grad_norm": 3.8149611949920654, + "learning_rate": 9.754193985692371e-06, + "loss": 1.1848, + "step": 2866 + }, + { + "epoch": 0.7262824572514249, + "grad_norm": 3.745016098022461, + "learning_rate": 9.753934449111535e-06, + "loss": 1.1727, + "step": 2867 + }, + { + "epoch": 0.7265357821405953, + "grad_norm": 3.7872586250305176, + "learning_rate": 9.753674779042451e-06, + "loss": 1.1979, + "step": 2868 + }, + { + "epoch": 0.7267891070297656, + "grad_norm": 3.3696279525756836, + "learning_rate": 9.753414975492406e-06, + "loss": 1.0847, + "step": 2869 + }, + { + "epoch": 0.727042431918936, + "grad_norm": 3.3734793663024902, + "learning_rate": 9.753155038468696e-06, + "loss": 1.1775, + "step": 2870 + }, + { + "epoch": 0.7272957568081064, + "grad_norm": 3.693936347961426, + "learning_rate": 9.752894967978622e-06, + "loss": 1.2687, + "step": 2871 + }, + { + "epoch": 0.7275490816972767, + "grad_norm": 3.2961838245391846, + "learning_rate": 9.752634764029483e-06, + "loss": 1.1288, + "step": 2872 + }, + { + "epoch": 0.7278024065864471, + "grad_norm": 3.4125046730041504, + "learning_rate": 9.752374426628587e-06, + "loss": 1.1906, + "step": 2873 + }, + { + "epoch": 0.7280557314756175, + "grad_norm": 3.271693468093872, + "learning_rate": 9.752113955783245e-06, + "loss": 1.1654, + "step": 2874 + }, + { + "epoch": 0.7283090563647878, + "grad_norm": 3.64925479888916, + "learning_rate": 9.75185335150077e-06, + "loss": 1.2178, + "step": 2875 + }, + { + "epoch": 0.7285623812539582, + "grad_norm": 3.1804234981536865, + "learning_rate": 9.751592613788481e-06, + "loss": 0.9787, + "step": 2876 + }, + { + "epoch": 0.7288157061431285, + "grad_norm": 3.6267011165618896, + "learning_rate": 9.751331742653698e-06, + "loss": 1.0725, + "step": 2877 + }, + { + "epoch": 0.7290690310322989, + "grad_norm": 3.5391435623168945, + "learning_rate": 9.751070738103745e-06, + "loss": 1.1544, + "step": 2878 + }, + { + "epoch": 0.7293223559214693, + "grad_norm": 3.3422415256500244, + "learning_rate": 9.750809600145955e-06, + "loss": 1.1848, + "step": 2879 + }, + { + "epoch": 0.7295756808106396, + "grad_norm": 3.2491321563720703, + "learning_rate": 9.750548328787657e-06, + "loss": 1.1193, + "step": 2880 + }, + { + "epoch": 0.72982900569981, + "grad_norm": 3.7429699897766113, + "learning_rate": 9.750286924036188e-06, + "loss": 1.2345, + "step": 2881 + }, + { + "epoch": 0.7300823305889804, + "grad_norm": 3.055997133255005, + "learning_rate": 9.750025385898887e-06, + "loss": 1.1035, + "step": 2882 + }, + { + "epoch": 0.7303356554781507, + "grad_norm": 3.5209739208221436, + "learning_rate": 9.749763714383102e-06, + "loss": 1.2324, + "step": 2883 + }, + { + "epoch": 0.7305889803673211, + "grad_norm": 3.2700893878936768, + "learning_rate": 9.749501909496177e-06, + "loss": 1.0234, + "step": 2884 + }, + { + "epoch": 0.7308423052564914, + "grad_norm": 3.586759328842163, + "learning_rate": 9.749239971245463e-06, + "loss": 1.3001, + "step": 2885 + }, + { + "epoch": 0.7310956301456618, + "grad_norm": 3.6156978607177734, + "learning_rate": 9.74897789963832e-06, + "loss": 1.2048, + "step": 2886 + }, + { + "epoch": 0.7313489550348322, + "grad_norm": 3.196514844894409, + "learning_rate": 9.748715694682101e-06, + "loss": 1.0979, + "step": 2887 + }, + { + "epoch": 0.7316022799240025, + "grad_norm": 3.7128384113311768, + "learning_rate": 9.74845335638417e-06, + "loss": 1.1782, + "step": 2888 + }, + { + "epoch": 0.7318556048131729, + "grad_norm": 3.2188334465026855, + "learning_rate": 9.748190884751896e-06, + "loss": 1.0616, + "step": 2889 + }, + { + "epoch": 0.7321089297023432, + "grad_norm": 4.012807846069336, + "learning_rate": 9.747928279792647e-06, + "loss": 1.2046, + "step": 2890 + }, + { + "epoch": 0.7323622545915136, + "grad_norm": 3.4193642139434814, + "learning_rate": 9.747665541513795e-06, + "loss": 1.2761, + "step": 2891 + }, + { + "epoch": 0.732615579480684, + "grad_norm": 3.9362986087799072, + "learning_rate": 9.747402669922723e-06, + "loss": 1.1693, + "step": 2892 + }, + { + "epoch": 0.7328689043698543, + "grad_norm": 3.805783987045288, + "learning_rate": 9.747139665026807e-06, + "loss": 1.3688, + "step": 2893 + }, + { + "epoch": 0.7331222292590247, + "grad_norm": 3.6252715587615967, + "learning_rate": 9.746876526833435e-06, + "loss": 1.214, + "step": 2894 + }, + { + "epoch": 0.7333755541481951, + "grad_norm": 3.744133234024048, + "learning_rate": 9.746613255349994e-06, + "loss": 1.1945, + "step": 2895 + }, + { + "epoch": 0.7336288790373654, + "grad_norm": 3.763392210006714, + "learning_rate": 9.746349850583878e-06, + "loss": 1.1511, + "step": 2896 + }, + { + "epoch": 0.7338822039265358, + "grad_norm": 3.8932294845581055, + "learning_rate": 9.746086312542482e-06, + "loss": 1.2599, + "step": 2897 + }, + { + "epoch": 0.7341355288157061, + "grad_norm": 3.539275884628296, + "learning_rate": 9.745822641233209e-06, + "loss": 1.0907, + "step": 2898 + }, + { + "epoch": 0.7343888537048765, + "grad_norm": 3.518880844116211, + "learning_rate": 9.745558836663459e-06, + "loss": 1.0641, + "step": 2899 + }, + { + "epoch": 0.7346421785940469, + "grad_norm": 3.6900782585144043, + "learning_rate": 9.74529489884064e-06, + "loss": 1.2881, + "step": 2900 + }, + { + "epoch": 0.7348955034832172, + "grad_norm": 3.250474214553833, + "learning_rate": 9.745030827772165e-06, + "loss": 1.0688, + "step": 2901 + }, + { + "epoch": 0.7351488283723876, + "grad_norm": 3.5695221424102783, + "learning_rate": 9.744766623465449e-06, + "loss": 1.1915, + "step": 2902 + }, + { + "epoch": 0.735402153261558, + "grad_norm": 3.4501566886901855, + "learning_rate": 9.744502285927908e-06, + "loss": 1.0965, + "step": 2903 + }, + { + "epoch": 0.7356554781507283, + "grad_norm": 3.3264076709747314, + "learning_rate": 9.744237815166968e-06, + "loss": 1.1917, + "step": 2904 + }, + { + "epoch": 0.7359088030398987, + "grad_norm": 3.6294021606445312, + "learning_rate": 9.743973211190054e-06, + "loss": 1.1603, + "step": 2905 + }, + { + "epoch": 0.736162127929069, + "grad_norm": 3.9763689041137695, + "learning_rate": 9.743708474004594e-06, + "loss": 1.1595, + "step": 2906 + }, + { + "epoch": 0.7364154528182394, + "grad_norm": 3.597679853439331, + "learning_rate": 9.743443603618026e-06, + "loss": 1.1928, + "step": 2907 + }, + { + "epoch": 0.7366687777074098, + "grad_norm": 3.656299591064453, + "learning_rate": 9.74317860003778e-06, + "loss": 1.1791, + "step": 2908 + }, + { + "epoch": 0.7369221025965801, + "grad_norm": 3.7492666244506836, + "learning_rate": 9.742913463271308e-06, + "loss": 1.1845, + "step": 2909 + }, + { + "epoch": 0.7371754274857505, + "grad_norm": 3.099785089492798, + "learning_rate": 9.742648193326044e-06, + "loss": 1.023, + "step": 2910 + }, + { + "epoch": 0.7374287523749208, + "grad_norm": 3.5840766429901123, + "learning_rate": 9.742382790209445e-06, + "loss": 1.1468, + "step": 2911 + }, + { + "epoch": 0.7376820772640912, + "grad_norm": 3.1212668418884277, + "learning_rate": 9.742117253928957e-06, + "loss": 1.0754, + "step": 2912 + }, + { + "epoch": 0.7379354021532616, + "grad_norm": 3.5566112995147705, + "learning_rate": 9.741851584492041e-06, + "loss": 1.2996, + "step": 2913 + }, + { + "epoch": 0.7381887270424319, + "grad_norm": 3.684170961380005, + "learning_rate": 9.741585781906155e-06, + "loss": 1.2737, + "step": 2914 + }, + { + "epoch": 0.7384420519316023, + "grad_norm": 3.6651434898376465, + "learning_rate": 9.741319846178762e-06, + "loss": 1.1011, + "step": 2915 + }, + { + "epoch": 0.7386953768207727, + "grad_norm": 3.631960391998291, + "learning_rate": 9.741053777317328e-06, + "loss": 1.1804, + "step": 2916 + }, + { + "epoch": 0.738948701709943, + "grad_norm": 3.5752904415130615, + "learning_rate": 9.74078757532933e-06, + "loss": 1.1606, + "step": 2917 + }, + { + "epoch": 0.7392020265991134, + "grad_norm": 3.315155029296875, + "learning_rate": 9.740521240222235e-06, + "loss": 1.0884, + "step": 2918 + }, + { + "epoch": 0.7394553514882837, + "grad_norm": 3.4121146202087402, + "learning_rate": 9.740254772003527e-06, + "loss": 1.2873, + "step": 2919 + }, + { + "epoch": 0.7397086763774541, + "grad_norm": 3.9900479316711426, + "learning_rate": 9.739988170680687e-06, + "loss": 1.2129, + "step": 2920 + }, + { + "epoch": 0.7399620012666245, + "grad_norm": 3.557452440261841, + "learning_rate": 9.7397214362612e-06, + "loss": 1.2383, + "step": 2921 + }, + { + "epoch": 0.7402153261557948, + "grad_norm": 3.6594953536987305, + "learning_rate": 9.739454568752556e-06, + "loss": 1.2351, + "step": 2922 + }, + { + "epoch": 0.7404686510449652, + "grad_norm": 3.352383852005005, + "learning_rate": 9.73918756816225e-06, + "loss": 1.0694, + "step": 2923 + }, + { + "epoch": 0.7407219759341356, + "grad_norm": 3.5007543563842773, + "learning_rate": 9.738920434497777e-06, + "loss": 1.2174, + "step": 2924 + }, + { + "epoch": 0.7409753008233059, + "grad_norm": 3.6896722316741943, + "learning_rate": 9.73865316776664e-06, + "loss": 1.2207, + "step": 2925 + }, + { + "epoch": 0.7412286257124763, + "grad_norm": 3.894541025161743, + "learning_rate": 9.738385767976344e-06, + "loss": 1.2838, + "step": 2926 + }, + { + "epoch": 0.7414819506016466, + "grad_norm": 3.4676342010498047, + "learning_rate": 9.738118235134395e-06, + "loss": 1.2436, + "step": 2927 + }, + { + "epoch": 0.741735275490817, + "grad_norm": 3.736973285675049, + "learning_rate": 9.737850569248308e-06, + "loss": 1.0835, + "step": 2928 + }, + { + "epoch": 0.7419886003799874, + "grad_norm": 3.6173088550567627, + "learning_rate": 9.737582770325595e-06, + "loss": 1.2395, + "step": 2929 + }, + { + "epoch": 0.7422419252691577, + "grad_norm": 3.260591506958008, + "learning_rate": 9.737314838373781e-06, + "loss": 1.0065, + "step": 2930 + }, + { + "epoch": 0.7424952501583281, + "grad_norm": 3.230839252471924, + "learning_rate": 9.737046773400384e-06, + "loss": 1.1127, + "step": 2931 + }, + { + "epoch": 0.7427485750474985, + "grad_norm": 3.7157719135284424, + "learning_rate": 9.736778575412935e-06, + "loss": 1.1872, + "step": 2932 + }, + { + "epoch": 0.7430018999366688, + "grad_norm": 3.453279495239258, + "learning_rate": 9.736510244418965e-06, + "loss": 1.0701, + "step": 2933 + }, + { + "epoch": 0.7432552248258392, + "grad_norm": 3.4291446208953857, + "learning_rate": 9.736241780426005e-06, + "loss": 1.0969, + "step": 2934 + }, + { + "epoch": 0.7435085497150095, + "grad_norm": 3.3004000186920166, + "learning_rate": 9.735973183441598e-06, + "loss": 1.1602, + "step": 2935 + }, + { + "epoch": 0.7437618746041799, + "grad_norm": 3.5002408027648926, + "learning_rate": 9.735704453473281e-06, + "loss": 1.1217, + "step": 2936 + }, + { + "epoch": 0.7440151994933503, + "grad_norm": 3.2200589179992676, + "learning_rate": 9.735435590528603e-06, + "loss": 1.065, + "step": 2937 + }, + { + "epoch": 0.7442685243825206, + "grad_norm": 3.6667120456695557, + "learning_rate": 9.735166594615115e-06, + "loss": 1.208, + "step": 2938 + }, + { + "epoch": 0.744521849271691, + "grad_norm": 3.5607337951660156, + "learning_rate": 9.734897465740367e-06, + "loss": 1.1361, + "step": 2939 + }, + { + "epoch": 0.7447751741608613, + "grad_norm": 3.6154489517211914, + "learning_rate": 9.734628203911916e-06, + "loss": 1.1662, + "step": 2940 + }, + { + "epoch": 0.7450284990500317, + "grad_norm": 3.8687450885772705, + "learning_rate": 9.734358809137325e-06, + "loss": 1.2813, + "step": 2941 + }, + { + "epoch": 0.7452818239392021, + "grad_norm": 3.4730746746063232, + "learning_rate": 9.73408928142416e-06, + "loss": 1.1927, + "step": 2942 + }, + { + "epoch": 0.7455351488283724, + "grad_norm": 3.4987568855285645, + "learning_rate": 9.733819620779983e-06, + "loss": 1.263, + "step": 2943 + }, + { + "epoch": 0.7457884737175428, + "grad_norm": 4.5206074714660645, + "learning_rate": 9.733549827212371e-06, + "loss": 1.5617, + "step": 2944 + }, + { + "epoch": 0.7460417986067132, + "grad_norm": 3.6188478469848633, + "learning_rate": 9.7332799007289e-06, + "loss": 1.2127, + "step": 2945 + }, + { + "epoch": 0.7462951234958835, + "grad_norm": 3.842108964920044, + "learning_rate": 9.733009841337145e-06, + "loss": 1.1828, + "step": 2946 + }, + { + "epoch": 0.7465484483850539, + "grad_norm": 3.4908463954925537, + "learning_rate": 9.732739649044694e-06, + "loss": 1.1087, + "step": 2947 + }, + { + "epoch": 0.7468017732742241, + "grad_norm": 3.5372354984283447, + "learning_rate": 9.732469323859131e-06, + "loss": 1.1885, + "step": 2948 + }, + { + "epoch": 0.7470550981633945, + "grad_norm": 3.726322889328003, + "learning_rate": 9.732198865788047e-06, + "loss": 1.1577, + "step": 2949 + }, + { + "epoch": 0.747308423052565, + "grad_norm": 3.757126808166504, + "learning_rate": 9.731928274839038e-06, + "loss": 1.3131, + "step": 2950 + }, + { + "epoch": 0.7475617479417352, + "grad_norm": 3.244915246963501, + "learning_rate": 9.7316575510197e-06, + "loss": 1.139, + "step": 2951 + }, + { + "epoch": 0.7478150728309056, + "grad_norm": 3.993474245071411, + "learning_rate": 9.731386694337635e-06, + "loss": 1.2573, + "step": 2952 + }, + { + "epoch": 0.748068397720076, + "grad_norm": 3.6339077949523926, + "learning_rate": 9.73111570480045e-06, + "loss": 1.3047, + "step": 2953 + }, + { + "epoch": 0.7483217226092463, + "grad_norm": 3.532299518585205, + "learning_rate": 9.730844582415752e-06, + "loss": 1.2002, + "step": 2954 + }, + { + "epoch": 0.7485750474984167, + "grad_norm": 3.2947475910186768, + "learning_rate": 9.730573327191158e-06, + "loss": 1.0655, + "step": 2955 + }, + { + "epoch": 0.748828372387587, + "grad_norm": 3.738976240158081, + "learning_rate": 9.73030193913428e-06, + "loss": 1.1959, + "step": 2956 + }, + { + "epoch": 0.7490816972767574, + "grad_norm": 3.7513082027435303, + "learning_rate": 9.73003041825274e-06, + "loss": 1.1723, + "step": 2957 + }, + { + "epoch": 0.7493350221659278, + "grad_norm": 3.2699825763702393, + "learning_rate": 9.729758764554164e-06, + "loss": 1.1177, + "step": 2958 + }, + { + "epoch": 0.7495883470550981, + "grad_norm": 3.1440470218658447, + "learning_rate": 9.729486978046178e-06, + "loss": 1.0667, + "step": 2959 + }, + { + "epoch": 0.7498416719442685, + "grad_norm": 3.0764389038085938, + "learning_rate": 9.729215058736417e-06, + "loss": 1.0814, + "step": 2960 + }, + { + "epoch": 0.7500949968334388, + "grad_norm": 3.6139206886291504, + "learning_rate": 9.72894300663251e-06, + "loss": 1.1392, + "step": 2961 + }, + { + "epoch": 0.7503483217226092, + "grad_norm": 3.397728443145752, + "learning_rate": 9.7286708217421e-06, + "loss": 1.201, + "step": 2962 + }, + { + "epoch": 0.7506016466117796, + "grad_norm": 3.492866039276123, + "learning_rate": 9.728398504072832e-06, + "loss": 1.0876, + "step": 2963 + }, + { + "epoch": 0.7508549715009499, + "grad_norm": 3.6088707447052, + "learning_rate": 9.728126053632348e-06, + "loss": 1.1156, + "step": 2964 + }, + { + "epoch": 0.7511082963901203, + "grad_norm": 3.5072734355926514, + "learning_rate": 9.727853470428301e-06, + "loss": 1.1877, + "step": 2965 + }, + { + "epoch": 0.7513616212792907, + "grad_norm": 3.3493130207061768, + "learning_rate": 9.727580754468345e-06, + "loss": 1.0638, + "step": 2966 + }, + { + "epoch": 0.751614946168461, + "grad_norm": 3.3511264324188232, + "learning_rate": 9.727307905760137e-06, + "loss": 1.0617, + "step": 2967 + }, + { + "epoch": 0.7518682710576314, + "grad_norm": 3.3056769371032715, + "learning_rate": 9.727034924311337e-06, + "loss": 0.9994, + "step": 2968 + }, + { + "epoch": 0.7521215959468017, + "grad_norm": 3.7182092666625977, + "learning_rate": 9.726761810129614e-06, + "loss": 1.2064, + "step": 2969 + }, + { + "epoch": 0.7523749208359721, + "grad_norm": 4.280904769897461, + "learning_rate": 9.726488563222633e-06, + "loss": 1.5103, + "step": 2970 + }, + { + "epoch": 0.7526282457251425, + "grad_norm": 3.617830991744995, + "learning_rate": 9.726215183598069e-06, + "loss": 1.1906, + "step": 2971 + }, + { + "epoch": 0.7528815706143128, + "grad_norm": 3.109330177307129, + "learning_rate": 9.725941671263597e-06, + "loss": 1.0851, + "step": 2972 + }, + { + "epoch": 0.7531348955034832, + "grad_norm": 3.392521381378174, + "learning_rate": 9.7256680262269e-06, + "loss": 1.2138, + "step": 2973 + }, + { + "epoch": 0.7533882203926536, + "grad_norm": 3.6669206619262695, + "learning_rate": 9.725394248495657e-06, + "loss": 1.1091, + "step": 2974 + }, + { + "epoch": 0.7536415452818239, + "grad_norm": 3.238131523132324, + "learning_rate": 9.72512033807756e-06, + "loss": 1.1352, + "step": 2975 + }, + { + "epoch": 0.7538948701709943, + "grad_norm": 3.459148645401001, + "learning_rate": 9.724846294980298e-06, + "loss": 1.2146, + "step": 2976 + }, + { + "epoch": 0.7541481950601646, + "grad_norm": 3.5758607387542725, + "learning_rate": 9.724572119211566e-06, + "loss": 1.1659, + "step": 2977 + }, + { + "epoch": 0.754401519949335, + "grad_norm": 3.6841254234313965, + "learning_rate": 9.724297810779064e-06, + "loss": 1.2396, + "step": 2978 + }, + { + "epoch": 0.7546548448385054, + "grad_norm": 3.3286004066467285, + "learning_rate": 9.724023369690493e-06, + "loss": 1.1295, + "step": 2979 + }, + { + "epoch": 0.7549081697276757, + "grad_norm": 3.5271801948547363, + "learning_rate": 9.72374879595356e-06, + "loss": 1.2116, + "step": 2980 + }, + { + "epoch": 0.7551614946168461, + "grad_norm": 3.4255433082580566, + "learning_rate": 9.723474089575975e-06, + "loss": 1.1897, + "step": 2981 + }, + { + "epoch": 0.7554148195060165, + "grad_norm": 3.3398475646972656, + "learning_rate": 9.723199250565449e-06, + "loss": 1.0863, + "step": 2982 + }, + { + "epoch": 0.7556681443951868, + "grad_norm": 3.2170045375823975, + "learning_rate": 9.722924278929705e-06, + "loss": 0.9774, + "step": 2983 + }, + { + "epoch": 0.7559214692843572, + "grad_norm": 3.3909413814544678, + "learning_rate": 9.722649174676459e-06, + "loss": 1.081, + "step": 2984 + }, + { + "epoch": 0.7561747941735275, + "grad_norm": 3.376862049102783, + "learning_rate": 9.722373937813439e-06, + "loss": 1.1951, + "step": 2985 + }, + { + "epoch": 0.7564281190626979, + "grad_norm": 3.3181986808776855, + "learning_rate": 9.72209856834837e-06, + "loss": 1.1478, + "step": 2986 + }, + { + "epoch": 0.7566814439518683, + "grad_norm": 3.5607292652130127, + "learning_rate": 9.721823066288988e-06, + "loss": 1.0894, + "step": 2987 + }, + { + "epoch": 0.7569347688410386, + "grad_norm": 3.5868828296661377, + "learning_rate": 9.721547431643027e-06, + "loss": 1.1139, + "step": 2988 + }, + { + "epoch": 0.757188093730209, + "grad_norm": 4.153842449188232, + "learning_rate": 9.721271664418226e-06, + "loss": 1.2214, + "step": 2989 + }, + { + "epoch": 0.7574414186193793, + "grad_norm": 3.2191667556762695, + "learning_rate": 9.72099576462233e-06, + "loss": 1.1246, + "step": 2990 + }, + { + "epoch": 0.7576947435085497, + "grad_norm": 3.7205142974853516, + "learning_rate": 9.720719732263087e-06, + "loss": 1.188, + "step": 2991 + }, + { + "epoch": 0.7579480683977201, + "grad_norm": 3.6832919120788574, + "learning_rate": 9.720443567348245e-06, + "loss": 1.2336, + "step": 2992 + }, + { + "epoch": 0.7582013932868904, + "grad_norm": 3.2360286712646484, + "learning_rate": 9.720167269885561e-06, + "loss": 1.0151, + "step": 2993 + }, + { + "epoch": 0.7584547181760608, + "grad_norm": 3.7540252208709717, + "learning_rate": 9.71989083988279e-06, + "loss": 1.0989, + "step": 2994 + }, + { + "epoch": 0.7587080430652312, + "grad_norm": 3.3956358432769775, + "learning_rate": 9.719614277347697e-06, + "loss": 1.1714, + "step": 2995 + }, + { + "epoch": 0.7589613679544015, + "grad_norm": 3.719648838043213, + "learning_rate": 9.71933758228805e-06, + "loss": 1.0855, + "step": 2996 + }, + { + "epoch": 0.7592146928435719, + "grad_norm": 3.271644353866577, + "learning_rate": 9.719060754711613e-06, + "loss": 1.0492, + "step": 2997 + }, + { + "epoch": 0.7594680177327422, + "grad_norm": 3.6907081604003906, + "learning_rate": 9.718783794626163e-06, + "loss": 1.2695, + "step": 2998 + }, + { + "epoch": 0.7597213426219126, + "grad_norm": 3.3592963218688965, + "learning_rate": 9.718506702039474e-06, + "loss": 1.1613, + "step": 2999 + }, + { + "epoch": 0.759974667511083, + "grad_norm": 3.1232833862304688, + "learning_rate": 9.718229476959329e-06, + "loss": 1.0907, + "step": 3000 + }, + { + "epoch": 0.759974667511083, + "eval_loss": 1.1983312368392944, + "eval_runtime": 11.8746, + "eval_samples_per_second": 33.685, + "eval_steps_per_second": 4.211, + "step": 3000 + }, + { + "epoch": 0.7602279924002533, + "grad_norm": 3.976891279220581, + "learning_rate": 9.717952119393512e-06, + "loss": 1.3394, + "step": 3001 + }, + { + "epoch": 0.7604813172894237, + "grad_norm": 3.916884183883667, + "learning_rate": 9.717674629349809e-06, + "loss": 1.1595, + "step": 3002 + }, + { + "epoch": 0.7607346421785941, + "grad_norm": 3.557054281234741, + "learning_rate": 9.717397006836016e-06, + "loss": 1.2948, + "step": 3003 + }, + { + "epoch": 0.7609879670677644, + "grad_norm": 3.4140894412994385, + "learning_rate": 9.717119251859925e-06, + "loss": 1.1628, + "step": 3004 + }, + { + "epoch": 0.7612412919569348, + "grad_norm": 3.865302085876465, + "learning_rate": 9.716841364429334e-06, + "loss": 1.2314, + "step": 3005 + }, + { + "epoch": 0.7614946168461051, + "grad_norm": 3.5563416481018066, + "learning_rate": 9.716563344552052e-06, + "loss": 1.1763, + "step": 3006 + }, + { + "epoch": 0.7617479417352755, + "grad_norm": 3.6717288494110107, + "learning_rate": 9.716285192235878e-06, + "loss": 1.2297, + "step": 3007 + }, + { + "epoch": 0.7620012666244459, + "grad_norm": 3.1846835613250732, + "learning_rate": 9.716006907488629e-06, + "loss": 1.1082, + "step": 3008 + }, + { + "epoch": 0.7622545915136162, + "grad_norm": 3.318401575088501, + "learning_rate": 9.715728490318117e-06, + "loss": 1.0138, + "step": 3009 + }, + { + "epoch": 0.7625079164027866, + "grad_norm": 3.2839062213897705, + "learning_rate": 9.715449940732158e-06, + "loss": 1.0879, + "step": 3010 + }, + { + "epoch": 0.7627612412919569, + "grad_norm": 3.480377435684204, + "learning_rate": 9.715171258738574e-06, + "loss": 1.2739, + "step": 3011 + }, + { + "epoch": 0.7630145661811273, + "grad_norm": 3.6452996730804443, + "learning_rate": 9.71489244434519e-06, + "loss": 1.1813, + "step": 3012 + }, + { + "epoch": 0.7632678910702977, + "grad_norm": 3.612215042114258, + "learning_rate": 9.714613497559839e-06, + "loss": 1.2545, + "step": 3013 + }, + { + "epoch": 0.763521215959468, + "grad_norm": 3.9662351608276367, + "learning_rate": 9.714334418390348e-06, + "loss": 1.3321, + "step": 3014 + }, + { + "epoch": 0.7637745408486384, + "grad_norm": 3.423196792602539, + "learning_rate": 9.714055206844557e-06, + "loss": 1.2222, + "step": 3015 + }, + { + "epoch": 0.7640278657378088, + "grad_norm": 3.5687761306762695, + "learning_rate": 9.713775862930306e-06, + "loss": 1.2045, + "step": 3016 + }, + { + "epoch": 0.7642811906269791, + "grad_norm": 3.524186849594116, + "learning_rate": 9.713496386655436e-06, + "loss": 1.1724, + "step": 3017 + }, + { + "epoch": 0.7645345155161495, + "grad_norm": 3.4388513565063477, + "learning_rate": 9.713216778027798e-06, + "loss": 1.2327, + "step": 3018 + }, + { + "epoch": 0.7647878404053198, + "grad_norm": 3.0279359817504883, + "learning_rate": 9.712937037055241e-06, + "loss": 1.1481, + "step": 3019 + }, + { + "epoch": 0.7650411652944902, + "grad_norm": 3.2815046310424805, + "learning_rate": 9.712657163745623e-06, + "loss": 1.1834, + "step": 3020 + }, + { + "epoch": 0.7652944901836606, + "grad_norm": 3.692152976989746, + "learning_rate": 9.712377158106798e-06, + "loss": 1.3233, + "step": 3021 + }, + { + "epoch": 0.7655478150728309, + "grad_norm": 3.515498399734497, + "learning_rate": 9.712097020146631e-06, + "loss": 1.2536, + "step": 3022 + }, + { + "epoch": 0.7658011399620013, + "grad_norm": 3.546562671661377, + "learning_rate": 9.711816749872989e-06, + "loss": 1.1545, + "step": 3023 + }, + { + "epoch": 0.7660544648511717, + "grad_norm": 3.4301865100860596, + "learning_rate": 9.711536347293742e-06, + "loss": 1.1617, + "step": 3024 + }, + { + "epoch": 0.766307789740342, + "grad_norm": 3.3776097297668457, + "learning_rate": 9.711255812416762e-06, + "loss": 1.158, + "step": 3025 + }, + { + "epoch": 0.7665611146295124, + "grad_norm": 3.4715332984924316, + "learning_rate": 9.710975145249925e-06, + "loss": 1.0201, + "step": 3026 + }, + { + "epoch": 0.7668144395186827, + "grad_norm": 3.1569130420684814, + "learning_rate": 9.710694345801116e-06, + "loss": 1.0537, + "step": 3027 + }, + { + "epoch": 0.7670677644078531, + "grad_norm": 3.8341550827026367, + "learning_rate": 9.710413414078218e-06, + "loss": 1.2747, + "step": 3028 + }, + { + "epoch": 0.7673210892970235, + "grad_norm": 4.095462799072266, + "learning_rate": 9.710132350089117e-06, + "loss": 1.2228, + "step": 3029 + }, + { + "epoch": 0.7675744141861938, + "grad_norm": 3.852132797241211, + "learning_rate": 9.709851153841708e-06, + "loss": 1.2327, + "step": 3030 + }, + { + "epoch": 0.7678277390753642, + "grad_norm": 3.2577106952667236, + "learning_rate": 9.709569825343886e-06, + "loss": 1.061, + "step": 3031 + }, + { + "epoch": 0.7680810639645345, + "grad_norm": 3.1022679805755615, + "learning_rate": 9.709288364603551e-06, + "loss": 1.0671, + "step": 3032 + }, + { + "epoch": 0.7683343888537049, + "grad_norm": 3.4080400466918945, + "learning_rate": 9.709006771628605e-06, + "loss": 1.0825, + "step": 3033 + }, + { + "epoch": 0.7685877137428753, + "grad_norm": 3.4398090839385986, + "learning_rate": 9.708725046426957e-06, + "loss": 1.1026, + "step": 3034 + }, + { + "epoch": 0.7688410386320456, + "grad_norm": 3.5645711421966553, + "learning_rate": 9.708443189006516e-06, + "loss": 1.0719, + "step": 3035 + }, + { + "epoch": 0.769094363521216, + "grad_norm": 3.712003231048584, + "learning_rate": 9.708161199375198e-06, + "loss": 1.3359, + "step": 3036 + }, + { + "epoch": 0.7693476884103864, + "grad_norm": 3.857830762863159, + "learning_rate": 9.707879077540918e-06, + "loss": 1.3231, + "step": 3037 + }, + { + "epoch": 0.7696010132995567, + "grad_norm": 3.32967472076416, + "learning_rate": 9.7075968235116e-06, + "loss": 1.0123, + "step": 3038 + }, + { + "epoch": 0.7698543381887271, + "grad_norm": 3.3485281467437744, + "learning_rate": 9.70731443729517e-06, + "loss": 1.1482, + "step": 3039 + }, + { + "epoch": 0.7701076630778974, + "grad_norm": 3.2829980850219727, + "learning_rate": 9.707031918899558e-06, + "loss": 1.0776, + "step": 3040 + }, + { + "epoch": 0.7703609879670678, + "grad_norm": 3.3828749656677246, + "learning_rate": 9.706749268332694e-06, + "loss": 1.2159, + "step": 3041 + }, + { + "epoch": 0.7706143128562382, + "grad_norm": 3.7079780101776123, + "learning_rate": 9.70646648560252e-06, + "loss": 1.0764, + "step": 3042 + }, + { + "epoch": 0.7708676377454085, + "grad_norm": 3.597647190093994, + "learning_rate": 9.70618357071697e-06, + "loss": 1.2618, + "step": 3043 + }, + { + "epoch": 0.7711209626345789, + "grad_norm": 3.4452812671661377, + "learning_rate": 9.705900523683991e-06, + "loss": 1.1679, + "step": 3044 + }, + { + "epoch": 0.7713742875237493, + "grad_norm": 3.8126072883605957, + "learning_rate": 9.705617344511531e-06, + "loss": 1.3734, + "step": 3045 + }, + { + "epoch": 0.7716276124129196, + "grad_norm": 3.520598888397217, + "learning_rate": 9.705334033207542e-06, + "loss": 1.0812, + "step": 3046 + }, + { + "epoch": 0.77188093730209, + "grad_norm": 3.625828981399536, + "learning_rate": 9.705050589779979e-06, + "loss": 1.2304, + "step": 3047 + }, + { + "epoch": 0.7721342621912602, + "grad_norm": 3.4245150089263916, + "learning_rate": 9.7047670142368e-06, + "loss": 1.0356, + "step": 3048 + }, + { + "epoch": 0.7723875870804306, + "grad_norm": 3.5724411010742188, + "learning_rate": 9.704483306585967e-06, + "loss": 1.1305, + "step": 3049 + }, + { + "epoch": 0.772640911969601, + "grad_norm": 3.7136075496673584, + "learning_rate": 9.70419946683545e-06, + "loss": 1.2277, + "step": 3050 + }, + { + "epoch": 0.7728942368587713, + "grad_norm": 3.5142602920532227, + "learning_rate": 9.703915494993215e-06, + "loss": 1.2297, + "step": 3051 + }, + { + "epoch": 0.7731475617479417, + "grad_norm": 3.6985182762145996, + "learning_rate": 9.703631391067239e-06, + "loss": 1.1701, + "step": 3052 + }, + { + "epoch": 0.7734008866371122, + "grad_norm": 3.6202380657196045, + "learning_rate": 9.703347155065496e-06, + "loss": 1.1452, + "step": 3053 + }, + { + "epoch": 0.7736542115262824, + "grad_norm": 3.7811033725738525, + "learning_rate": 9.703062786995972e-06, + "loss": 1.2575, + "step": 3054 + }, + { + "epoch": 0.7739075364154528, + "grad_norm": 3.7027955055236816, + "learning_rate": 9.702778286866647e-06, + "loss": 1.1537, + "step": 3055 + }, + { + "epoch": 0.7741608613046231, + "grad_norm": 3.3426177501678467, + "learning_rate": 9.702493654685512e-06, + "loss": 1.2052, + "step": 3056 + }, + { + "epoch": 0.7744141861937935, + "grad_norm": 3.939532518386841, + "learning_rate": 9.702208890460559e-06, + "loss": 1.22, + "step": 3057 + }, + { + "epoch": 0.7746675110829639, + "grad_norm": 3.596379280090332, + "learning_rate": 9.701923994199784e-06, + "loss": 1.1168, + "step": 3058 + }, + { + "epoch": 0.7749208359721342, + "grad_norm": 3.7136929035186768, + "learning_rate": 9.701638965911188e-06, + "loss": 1.2842, + "step": 3059 + }, + { + "epoch": 0.7751741608613046, + "grad_norm": 3.8441104888916016, + "learning_rate": 9.701353805602773e-06, + "loss": 1.3659, + "step": 3060 + }, + { + "epoch": 0.7754274857504749, + "grad_norm": 3.539217472076416, + "learning_rate": 9.701068513282547e-06, + "loss": 1.1411, + "step": 3061 + }, + { + "epoch": 0.7756808106396453, + "grad_norm": 3.299401044845581, + "learning_rate": 9.70078308895852e-06, + "loss": 1.1182, + "step": 3062 + }, + { + "epoch": 0.7759341355288157, + "grad_norm": 3.4412131309509277, + "learning_rate": 9.700497532638707e-06, + "loss": 1.1753, + "step": 3063 + }, + { + "epoch": 0.776187460417986, + "grad_norm": 3.202305793762207, + "learning_rate": 9.700211844331126e-06, + "loss": 1.128, + "step": 3064 + }, + { + "epoch": 0.7764407853071564, + "grad_norm": 3.5436625480651855, + "learning_rate": 9.6999260240438e-06, + "loss": 1.2065, + "step": 3065 + }, + { + "epoch": 0.7766941101963268, + "grad_norm": 3.8974997997283936, + "learning_rate": 9.699640071784752e-06, + "loss": 1.1363, + "step": 3066 + }, + { + "epoch": 0.7769474350854971, + "grad_norm": 3.397515058517456, + "learning_rate": 9.699353987562017e-06, + "loss": 1.0489, + "step": 3067 + }, + { + "epoch": 0.7772007599746675, + "grad_norm": 3.2989842891693115, + "learning_rate": 9.699067771383621e-06, + "loss": 1.2765, + "step": 3068 + }, + { + "epoch": 0.7774540848638378, + "grad_norm": 3.317469596862793, + "learning_rate": 9.698781423257606e-06, + "loss": 1.1563, + "step": 3069 + }, + { + "epoch": 0.7777074097530082, + "grad_norm": 3.4486303329467773, + "learning_rate": 9.698494943192011e-06, + "loss": 1.265, + "step": 3070 + }, + { + "epoch": 0.7779607346421786, + "grad_norm": 3.5988612174987793, + "learning_rate": 9.69820833119488e-06, + "loss": 1.1572, + "step": 3071 + }, + { + "epoch": 0.7782140595313489, + "grad_norm": 3.2691543102264404, + "learning_rate": 9.697921587274261e-06, + "loss": 1.1893, + "step": 3072 + }, + { + "epoch": 0.7784673844205193, + "grad_norm": 3.7137327194213867, + "learning_rate": 9.697634711438205e-06, + "loss": 1.1896, + "step": 3073 + }, + { + "epoch": 0.7787207093096897, + "grad_norm": 3.6938259601593018, + "learning_rate": 9.697347703694769e-06, + "loss": 1.1371, + "step": 3074 + }, + { + "epoch": 0.77897403419886, + "grad_norm": 3.327613353729248, + "learning_rate": 9.697060564052009e-06, + "loss": 1.2833, + "step": 3075 + }, + { + "epoch": 0.7792273590880304, + "grad_norm": 3.5304393768310547, + "learning_rate": 9.696773292517991e-06, + "loss": 1.152, + "step": 3076 + }, + { + "epoch": 0.7794806839772007, + "grad_norm": 3.6134355068206787, + "learning_rate": 9.696485889100781e-06, + "loss": 1.2214, + "step": 3077 + }, + { + "epoch": 0.7797340088663711, + "grad_norm": 3.2280755043029785, + "learning_rate": 9.696198353808449e-06, + "loss": 1.1215, + "step": 3078 + }, + { + "epoch": 0.7799873337555415, + "grad_norm": 3.1979382038116455, + "learning_rate": 9.695910686649067e-06, + "loss": 1.1125, + "step": 3079 + }, + { + "epoch": 0.7802406586447118, + "grad_norm": 3.7828009128570557, + "learning_rate": 9.695622887630714e-06, + "loss": 1.2805, + "step": 3080 + }, + { + "epoch": 0.7804939835338822, + "grad_norm": 3.492016077041626, + "learning_rate": 9.69533495676147e-06, + "loss": 1.2535, + "step": 3081 + }, + { + "epoch": 0.7807473084230525, + "grad_norm": 3.493319034576416, + "learning_rate": 9.695046894049422e-06, + "loss": 1.1868, + "step": 3082 + }, + { + "epoch": 0.7810006333122229, + "grad_norm": 3.061150312423706, + "learning_rate": 9.694758699502658e-06, + "loss": 1.0062, + "step": 3083 + }, + { + "epoch": 0.7812539582013933, + "grad_norm": 3.4540719985961914, + "learning_rate": 9.694470373129268e-06, + "loss": 1.1865, + "step": 3084 + }, + { + "epoch": 0.7815072830905636, + "grad_norm": 3.6647727489471436, + "learning_rate": 9.694181914937353e-06, + "loss": 1.2265, + "step": 3085 + }, + { + "epoch": 0.781760607979734, + "grad_norm": 3.4149343967437744, + "learning_rate": 9.693893324935008e-06, + "loss": 1.2191, + "step": 3086 + }, + { + "epoch": 0.7820139328689044, + "grad_norm": 3.4975831508636475, + "learning_rate": 9.69360460313034e-06, + "loss": 1.1281, + "step": 3087 + }, + { + "epoch": 0.7822672577580747, + "grad_norm": 3.4812533855438232, + "learning_rate": 9.693315749531452e-06, + "loss": 1.0671, + "step": 3088 + }, + { + "epoch": 0.7825205826472451, + "grad_norm": 3.6105175018310547, + "learning_rate": 9.69302676414646e-06, + "loss": 1.2766, + "step": 3089 + }, + { + "epoch": 0.7827739075364154, + "grad_norm": 3.2064576148986816, + "learning_rate": 9.692737646983475e-06, + "loss": 1.2355, + "step": 3090 + }, + { + "epoch": 0.7830272324255858, + "grad_norm": 3.4227678775787354, + "learning_rate": 9.692448398050616e-06, + "loss": 1.1434, + "step": 3091 + }, + { + "epoch": 0.7832805573147562, + "grad_norm": 3.613091468811035, + "learning_rate": 9.692159017356005e-06, + "loss": 1.2833, + "step": 3092 + }, + { + "epoch": 0.7835338822039265, + "grad_norm": 3.376112937927246, + "learning_rate": 9.691869504907768e-06, + "loss": 1.3671, + "step": 3093 + }, + { + "epoch": 0.7837872070930969, + "grad_norm": 3.616713047027588, + "learning_rate": 9.691579860714033e-06, + "loss": 1.1647, + "step": 3094 + }, + { + "epoch": 0.7840405319822673, + "grad_norm": 3.1492090225219727, + "learning_rate": 9.691290084782935e-06, + "loss": 0.9979, + "step": 3095 + }, + { + "epoch": 0.7842938568714376, + "grad_norm": 3.5917351245880127, + "learning_rate": 9.691000177122613e-06, + "loss": 1.2316, + "step": 3096 + }, + { + "epoch": 0.784547181760608, + "grad_norm": 3.565091609954834, + "learning_rate": 9.690710137741202e-06, + "loss": 1.2166, + "step": 3097 + }, + { + "epoch": 0.7848005066497783, + "grad_norm": 3.124160051345825, + "learning_rate": 9.690419966646849e-06, + "loss": 1.0655, + "step": 3098 + }, + { + "epoch": 0.7850538315389487, + "grad_norm": 3.484919309616089, + "learning_rate": 9.690129663847703e-06, + "loss": 1.1891, + "step": 3099 + }, + { + "epoch": 0.7853071564281191, + "grad_norm": 3.617344856262207, + "learning_rate": 9.689839229351912e-06, + "loss": 1.1958, + "step": 3100 + }, + { + "epoch": 0.7855604813172894, + "grad_norm": 3.445941686630249, + "learning_rate": 9.689548663167636e-06, + "loss": 1.2537, + "step": 3101 + }, + { + "epoch": 0.7858138062064598, + "grad_norm": 3.6886892318725586, + "learning_rate": 9.689257965303029e-06, + "loss": 1.3261, + "step": 3102 + }, + { + "epoch": 0.7860671310956302, + "grad_norm": 3.6095004081726074, + "learning_rate": 9.688967135766257e-06, + "loss": 1.1573, + "step": 3103 + }, + { + "epoch": 0.7863204559848005, + "grad_norm": 3.524705171585083, + "learning_rate": 9.688676174565486e-06, + "loss": 1.1003, + "step": 3104 + }, + { + "epoch": 0.7865737808739709, + "grad_norm": 3.440295934677124, + "learning_rate": 9.688385081708883e-06, + "loss": 1.2146, + "step": 3105 + }, + { + "epoch": 0.7868271057631412, + "grad_norm": 3.291731119155884, + "learning_rate": 9.688093857204628e-06, + "loss": 1.0812, + "step": 3106 + }, + { + "epoch": 0.7870804306523116, + "grad_norm": 3.2207679748535156, + "learning_rate": 9.687802501060893e-06, + "loss": 0.9835, + "step": 3107 + }, + { + "epoch": 0.787333755541482, + "grad_norm": 3.5070319175720215, + "learning_rate": 9.687511013285863e-06, + "loss": 1.2079, + "step": 3108 + }, + { + "epoch": 0.7875870804306523, + "grad_norm": 3.92917799949646, + "learning_rate": 9.687219393887716e-06, + "loss": 1.2231, + "step": 3109 + }, + { + "epoch": 0.7878404053198227, + "grad_norm": 3.5776121616363525, + "learning_rate": 9.686927642874648e-06, + "loss": 1.1356, + "step": 3110 + }, + { + "epoch": 0.788093730208993, + "grad_norm": 3.3320472240448, + "learning_rate": 9.68663576025485e-06, + "loss": 1.1073, + "step": 3111 + }, + { + "epoch": 0.7883470550981634, + "grad_norm": 3.826519012451172, + "learning_rate": 9.686343746036513e-06, + "loss": 1.3251, + "step": 3112 + }, + { + "epoch": 0.7886003799873338, + "grad_norm": 3.5765302181243896, + "learning_rate": 9.686051600227841e-06, + "loss": 1.1568, + "step": 3113 + }, + { + "epoch": 0.7888537048765041, + "grad_norm": 3.455152988433838, + "learning_rate": 9.685759322837039e-06, + "loss": 1.2745, + "step": 3114 + }, + { + "epoch": 0.7891070297656745, + "grad_norm": 3.5755326747894287, + "learning_rate": 9.685466913872308e-06, + "loss": 1.0886, + "step": 3115 + }, + { + "epoch": 0.7893603546548449, + "grad_norm": 3.298093795776367, + "learning_rate": 9.685174373341864e-06, + "loss": 1.2409, + "step": 3116 + }, + { + "epoch": 0.7896136795440152, + "grad_norm": 3.254582643508911, + "learning_rate": 9.684881701253917e-06, + "loss": 1.1229, + "step": 3117 + }, + { + "epoch": 0.7898670044331856, + "grad_norm": 3.6387033462524414, + "learning_rate": 9.684588897616689e-06, + "loss": 1.1904, + "step": 3118 + }, + { + "epoch": 0.7901203293223559, + "grad_norm": 3.4219889640808105, + "learning_rate": 9.6842959624384e-06, + "loss": 1.2364, + "step": 3119 + }, + { + "epoch": 0.7903736542115263, + "grad_norm": 3.6625919342041016, + "learning_rate": 9.684002895727279e-06, + "loss": 1.2403, + "step": 3120 + }, + { + "epoch": 0.7906269791006967, + "grad_norm": 3.738839864730835, + "learning_rate": 9.683709697491549e-06, + "loss": 1.1652, + "step": 3121 + }, + { + "epoch": 0.790880303989867, + "grad_norm": 3.450275421142578, + "learning_rate": 9.683416367739444e-06, + "loss": 1.0929, + "step": 3122 + }, + { + "epoch": 0.7911336288790374, + "grad_norm": 3.7725143432617188, + "learning_rate": 9.683122906479206e-06, + "loss": 1.2139, + "step": 3123 + }, + { + "epoch": 0.7913869537682078, + "grad_norm": 3.659973621368408, + "learning_rate": 9.68282931371907e-06, + "loss": 1.2173, + "step": 3124 + }, + { + "epoch": 0.7916402786573781, + "grad_norm": 3.4902517795562744, + "learning_rate": 9.68253558946728e-06, + "loss": 1.2356, + "step": 3125 + }, + { + "epoch": 0.7918936035465485, + "grad_norm": 3.636298418045044, + "learning_rate": 9.682241733732088e-06, + "loss": 1.2319, + "step": 3126 + }, + { + "epoch": 0.7921469284357188, + "grad_norm": 3.2394468784332275, + "learning_rate": 9.681947746521742e-06, + "loss": 0.9985, + "step": 3127 + }, + { + "epoch": 0.7924002533248892, + "grad_norm": 3.2394330501556396, + "learning_rate": 9.681653627844497e-06, + "loss": 1.1402, + "step": 3128 + }, + { + "epoch": 0.7926535782140596, + "grad_norm": 3.2934374809265137, + "learning_rate": 9.681359377708613e-06, + "loss": 1.0669, + "step": 3129 + }, + { + "epoch": 0.7929069031032299, + "grad_norm": 3.6023757457733154, + "learning_rate": 9.681064996122351e-06, + "loss": 1.1956, + "step": 3130 + }, + { + "epoch": 0.7931602279924003, + "grad_norm": 3.362863302230835, + "learning_rate": 9.680770483093978e-06, + "loss": 1.1133, + "step": 3131 + }, + { + "epoch": 0.7934135528815706, + "grad_norm": 3.4116978645324707, + "learning_rate": 9.680475838631764e-06, + "loss": 1.3068, + "step": 3132 + }, + { + "epoch": 0.793666877770741, + "grad_norm": 3.557642936706543, + "learning_rate": 9.680181062743981e-06, + "loss": 1.2215, + "step": 3133 + }, + { + "epoch": 0.7939202026599114, + "grad_norm": 3.188054323196411, + "learning_rate": 9.67988615543891e-06, + "loss": 1.1557, + "step": 3134 + }, + { + "epoch": 0.7941735275490817, + "grad_norm": 3.318195104598999, + "learning_rate": 9.679591116724826e-06, + "loss": 1.132, + "step": 3135 + }, + { + "epoch": 0.7944268524382521, + "grad_norm": 3.3555331230163574, + "learning_rate": 9.679295946610017e-06, + "loss": 1.1461, + "step": 3136 + }, + { + "epoch": 0.7946801773274225, + "grad_norm": 3.305663585662842, + "learning_rate": 9.679000645102771e-06, + "loss": 1.2936, + "step": 3137 + }, + { + "epoch": 0.7949335022165928, + "grad_norm": 3.2727150917053223, + "learning_rate": 9.67870521221138e-06, + "loss": 1.0754, + "step": 3138 + }, + { + "epoch": 0.7951868271057632, + "grad_norm": 3.497584581375122, + "learning_rate": 9.678409647944141e-06, + "loss": 1.1606, + "step": 3139 + }, + { + "epoch": 0.7954401519949335, + "grad_norm": 3.5710928440093994, + "learning_rate": 9.678113952309351e-06, + "loss": 1.2395, + "step": 3140 + }, + { + "epoch": 0.7956934768841039, + "grad_norm": 3.847907304763794, + "learning_rate": 9.677818125315314e-06, + "loss": 1.2105, + "step": 3141 + }, + { + "epoch": 0.7959468017732743, + "grad_norm": 3.30964732170105, + "learning_rate": 9.677522166970335e-06, + "loss": 1.1297, + "step": 3142 + }, + { + "epoch": 0.7962001266624446, + "grad_norm": 3.6384894847869873, + "learning_rate": 9.677226077282728e-06, + "loss": 1.2604, + "step": 3143 + }, + { + "epoch": 0.796453451551615, + "grad_norm": 3.5480246543884277, + "learning_rate": 9.676929856260803e-06, + "loss": 1.2013, + "step": 3144 + }, + { + "epoch": 0.7967067764407854, + "grad_norm": 3.394646167755127, + "learning_rate": 9.67663350391288e-06, + "loss": 1.0893, + "step": 3145 + }, + { + "epoch": 0.7969601013299557, + "grad_norm": 3.6400444507598877, + "learning_rate": 9.67633702024728e-06, + "loss": 1.1384, + "step": 3146 + }, + { + "epoch": 0.797213426219126, + "grad_norm": 3.373098850250244, + "learning_rate": 9.676040405272329e-06, + "loss": 1.2476, + "step": 3147 + }, + { + "epoch": 0.7974667511082963, + "grad_norm": 3.505863666534424, + "learning_rate": 9.675743658996353e-06, + "loss": 1.1156, + "step": 3148 + }, + { + "epoch": 0.7977200759974667, + "grad_norm": 3.760483980178833, + "learning_rate": 9.675446781427689e-06, + "loss": 1.3028, + "step": 3149 + }, + { + "epoch": 0.7979734008866372, + "grad_norm": 3.562391519546509, + "learning_rate": 9.675149772574669e-06, + "loss": 1.2152, + "step": 3150 + }, + { + "epoch": 0.7982267257758074, + "grad_norm": 3.8508145809173584, + "learning_rate": 9.674852632445635e-06, + "loss": 1.2636, + "step": 3151 + }, + { + "epoch": 0.7984800506649778, + "grad_norm": 3.880608081817627, + "learning_rate": 9.674555361048931e-06, + "loss": 1.1694, + "step": 3152 + }, + { + "epoch": 0.7987333755541483, + "grad_norm": 3.732882022857666, + "learning_rate": 9.674257958392901e-06, + "loss": 1.1474, + "step": 3153 + }, + { + "epoch": 0.7989867004433185, + "grad_norm": 3.0603837966918945, + "learning_rate": 9.6739604244859e-06, + "loss": 1.034, + "step": 3154 + }, + { + "epoch": 0.799240025332489, + "grad_norm": 3.607832431793213, + "learning_rate": 9.67366275933628e-06, + "loss": 1.2098, + "step": 3155 + }, + { + "epoch": 0.7994933502216592, + "grad_norm": 3.4668948650360107, + "learning_rate": 9.673364962952402e-06, + "loss": 1.0646, + "step": 3156 + }, + { + "epoch": 0.7997466751108296, + "grad_norm": 3.265519142150879, + "learning_rate": 9.673067035342625e-06, + "loss": 1.2244, + "step": 3157 + }, + { + "epoch": 0.8, + "grad_norm": 3.534250020980835, + "learning_rate": 9.672768976515314e-06, + "loss": 1.1848, + "step": 3158 + }, + { + "epoch": 0.8002533248891703, + "grad_norm": 3.676901340484619, + "learning_rate": 9.672470786478842e-06, + "loss": 1.1528, + "step": 3159 + }, + { + "epoch": 0.8005066497783407, + "grad_norm": 3.2827255725860596, + "learning_rate": 9.672172465241581e-06, + "loss": 1.1754, + "step": 3160 + }, + { + "epoch": 0.800759974667511, + "grad_norm": 3.2366580963134766, + "learning_rate": 9.671874012811905e-06, + "loss": 1.0439, + "step": 3161 + }, + { + "epoch": 0.8010132995566814, + "grad_norm": 3.3683462142944336, + "learning_rate": 9.671575429198198e-06, + "loss": 1.0951, + "step": 3162 + }, + { + "epoch": 0.8012666244458518, + "grad_norm": 3.380063772201538, + "learning_rate": 9.671276714408841e-06, + "loss": 1.3027, + "step": 3163 + }, + { + "epoch": 0.8015199493350221, + "grad_norm": 3.5820157527923584, + "learning_rate": 9.670977868452225e-06, + "loss": 1.2704, + "step": 3164 + }, + { + "epoch": 0.8017732742241925, + "grad_norm": 3.5308854579925537, + "learning_rate": 9.670678891336738e-06, + "loss": 1.1675, + "step": 3165 + }, + { + "epoch": 0.8020265991133629, + "grad_norm": 3.722160577774048, + "learning_rate": 9.670379783070776e-06, + "loss": 1.2754, + "step": 3166 + }, + { + "epoch": 0.8022799240025332, + "grad_norm": 3.3937900066375732, + "learning_rate": 9.670080543662742e-06, + "loss": 1.1771, + "step": 3167 + }, + { + "epoch": 0.8025332488917036, + "grad_norm": 3.163614511489868, + "learning_rate": 9.66978117312103e-06, + "loss": 1.0594, + "step": 3168 + }, + { + "epoch": 0.8027865737808739, + "grad_norm": 3.295989990234375, + "learning_rate": 9.669481671454055e-06, + "loss": 1.0515, + "step": 3169 + }, + { + "epoch": 0.8030398986700443, + "grad_norm": 3.417457103729248, + "learning_rate": 9.669182038670223e-06, + "loss": 1.1258, + "step": 3170 + }, + { + "epoch": 0.8032932235592147, + "grad_norm": 3.2533841133117676, + "learning_rate": 9.668882274777946e-06, + "loss": 1.1411, + "step": 3171 + }, + { + "epoch": 0.803546548448385, + "grad_norm": 3.379027843475342, + "learning_rate": 9.668582379785645e-06, + "loss": 1.241, + "step": 3172 + }, + { + "epoch": 0.8037998733375554, + "grad_norm": 3.3672876358032227, + "learning_rate": 9.668282353701737e-06, + "loss": 1.2533, + "step": 3173 + }, + { + "epoch": 0.8040531982267258, + "grad_norm": 3.446272373199463, + "learning_rate": 9.667982196534651e-06, + "loss": 1.1312, + "step": 3174 + }, + { + "epoch": 0.8043065231158961, + "grad_norm": 3.0003163814544678, + "learning_rate": 9.66768190829281e-06, + "loss": 1.0172, + "step": 3175 + }, + { + "epoch": 0.8045598480050665, + "grad_norm": 3.13234281539917, + "learning_rate": 9.66738148898465e-06, + "loss": 1.2439, + "step": 3176 + }, + { + "epoch": 0.8048131728942368, + "grad_norm": 3.442028045654297, + "learning_rate": 9.667080938618605e-06, + "loss": 1.2135, + "step": 3177 + }, + { + "epoch": 0.8050664977834072, + "grad_norm": 3.6103761196136475, + "learning_rate": 9.666780257203114e-06, + "loss": 1.1319, + "step": 3178 + }, + { + "epoch": 0.8053198226725776, + "grad_norm": 3.420414686203003, + "learning_rate": 9.666479444746622e-06, + "loss": 1.1977, + "step": 3179 + }, + { + "epoch": 0.8055731475617479, + "grad_norm": 3.5578372478485107, + "learning_rate": 9.666178501257573e-06, + "loss": 1.0822, + "step": 3180 + }, + { + "epoch": 0.8058264724509183, + "grad_norm": 3.520909547805786, + "learning_rate": 9.665877426744418e-06, + "loss": 1.2723, + "step": 3181 + }, + { + "epoch": 0.8060797973400886, + "grad_norm": 3.3848018646240234, + "learning_rate": 9.665576221215614e-06, + "loss": 1.0662, + "step": 3182 + }, + { + "epoch": 0.806333122229259, + "grad_norm": 3.620361804962158, + "learning_rate": 9.665274884679614e-06, + "loss": 1.1763, + "step": 3183 + }, + { + "epoch": 0.8065864471184294, + "grad_norm": 3.8704168796539307, + "learning_rate": 9.664973417144883e-06, + "loss": 1.3097, + "step": 3184 + }, + { + "epoch": 0.8068397720075997, + "grad_norm": 3.535381317138672, + "learning_rate": 9.664671818619884e-06, + "loss": 1.1958, + "step": 3185 + }, + { + "epoch": 0.8070930968967701, + "grad_norm": 3.782193422317505, + "learning_rate": 9.664370089113087e-06, + "loss": 1.2697, + "step": 3186 + }, + { + "epoch": 0.8073464217859405, + "grad_norm": 3.414486885070801, + "learning_rate": 9.664068228632963e-06, + "loss": 1.1252, + "step": 3187 + }, + { + "epoch": 0.8075997466751108, + "grad_norm": 3.460775852203369, + "learning_rate": 9.663766237187991e-06, + "loss": 1.1037, + "step": 3188 + }, + { + "epoch": 0.8078530715642812, + "grad_norm": 3.7717158794403076, + "learning_rate": 9.663464114786649e-06, + "loss": 1.2827, + "step": 3189 + }, + { + "epoch": 0.8081063964534515, + "grad_norm": 3.7486562728881836, + "learning_rate": 9.66316186143742e-06, + "loss": 1.2179, + "step": 3190 + }, + { + "epoch": 0.8083597213426219, + "grad_norm": 3.182420015335083, + "learning_rate": 9.662859477148789e-06, + "loss": 0.9427, + "step": 3191 + }, + { + "epoch": 0.8086130462317923, + "grad_norm": 3.69174861907959, + "learning_rate": 9.66255696192925e-06, + "loss": 1.2672, + "step": 3192 + }, + { + "epoch": 0.8088663711209626, + "grad_norm": 3.5637874603271484, + "learning_rate": 9.6622543157873e-06, + "loss": 1.2077, + "step": 3193 + }, + { + "epoch": 0.809119696010133, + "grad_norm": 3.373786211013794, + "learning_rate": 9.661951538731431e-06, + "loss": 1.1181, + "step": 3194 + }, + { + "epoch": 0.8093730208993034, + "grad_norm": 3.3571665287017822, + "learning_rate": 9.661648630770149e-06, + "loss": 1.1348, + "step": 3195 + }, + { + "epoch": 0.8096263457884737, + "grad_norm": 3.5804684162139893, + "learning_rate": 9.661345591911956e-06, + "loss": 1.13, + "step": 3196 + }, + { + "epoch": 0.8098796706776441, + "grad_norm": 3.5872457027435303, + "learning_rate": 9.661042422165366e-06, + "loss": 1.271, + "step": 3197 + }, + { + "epoch": 0.8101329955668144, + "grad_norm": 4.017947196960449, + "learning_rate": 9.66073912153889e-06, + "loss": 1.2686, + "step": 3198 + }, + { + "epoch": 0.8103863204559848, + "grad_norm": 4.318078994750977, + "learning_rate": 9.66043569004104e-06, + "loss": 1.2008, + "step": 3199 + }, + { + "epoch": 0.8106396453451552, + "grad_norm": 3.1133437156677246, + "learning_rate": 9.660132127680344e-06, + "loss": 1.1175, + "step": 3200 + }, + { + "epoch": 0.8108929702343255, + "grad_norm": 3.359477996826172, + "learning_rate": 9.65982843446532e-06, + "loss": 1.1499, + "step": 3201 + }, + { + "epoch": 0.8111462951234959, + "grad_norm": 3.635134696960449, + "learning_rate": 9.659524610404499e-06, + "loss": 1.1756, + "step": 3202 + }, + { + "epoch": 0.8113996200126662, + "grad_norm": 3.313199281692505, + "learning_rate": 9.659220655506408e-06, + "loss": 1.141, + "step": 3203 + }, + { + "epoch": 0.8116529449018366, + "grad_norm": 3.8395190238952637, + "learning_rate": 9.658916569779586e-06, + "loss": 1.275, + "step": 3204 + }, + { + "epoch": 0.811906269791007, + "grad_norm": 3.571878433227539, + "learning_rate": 9.65861235323257e-06, + "loss": 1.2219, + "step": 3205 + }, + { + "epoch": 0.8121595946801773, + "grad_norm": 3.641077995300293, + "learning_rate": 9.658308005873905e-06, + "loss": 1.2264, + "step": 3206 + }, + { + "epoch": 0.8124129195693477, + "grad_norm": 3.2891337871551514, + "learning_rate": 9.658003527712132e-06, + "loss": 1.1374, + "step": 3207 + }, + { + "epoch": 0.8126662444585181, + "grad_norm": 3.634793996810913, + "learning_rate": 9.657698918755803e-06, + "loss": 1.2506, + "step": 3208 + }, + { + "epoch": 0.8129195693476884, + "grad_norm": 3.4935338497161865, + "learning_rate": 9.657394179013471e-06, + "loss": 1.2953, + "step": 3209 + }, + { + "epoch": 0.8131728942368588, + "grad_norm": 3.2657690048217773, + "learning_rate": 9.657089308493695e-06, + "loss": 1.0885, + "step": 3210 + }, + { + "epoch": 0.8134262191260291, + "grad_norm": 3.4446094036102295, + "learning_rate": 9.656784307205033e-06, + "loss": 1.3055, + "step": 3211 + }, + { + "epoch": 0.8136795440151995, + "grad_norm": 3.265702962875366, + "learning_rate": 9.65647917515605e-06, + "loss": 0.9897, + "step": 3212 + }, + { + "epoch": 0.8139328689043699, + "grad_norm": 3.2030014991760254, + "learning_rate": 9.656173912355314e-06, + "loss": 1.1032, + "step": 3213 + }, + { + "epoch": 0.8141861937935402, + "grad_norm": 3.6671595573425293, + "learning_rate": 9.655868518811398e-06, + "loss": 1.2168, + "step": 3214 + }, + { + "epoch": 0.8144395186827106, + "grad_norm": 3.2315893173217773, + "learning_rate": 9.655562994532875e-06, + "loss": 1.0976, + "step": 3215 + }, + { + "epoch": 0.814692843571881, + "grad_norm": 3.3605079650878906, + "learning_rate": 9.655257339528325e-06, + "loss": 1.1803, + "step": 3216 + }, + { + "epoch": 0.8149461684610513, + "grad_norm": 3.871366500854492, + "learning_rate": 9.65495155380633e-06, + "loss": 1.3661, + "step": 3217 + }, + { + "epoch": 0.8151994933502217, + "grad_norm": 3.7332444190979004, + "learning_rate": 9.65464563737548e-06, + "loss": 1.1392, + "step": 3218 + }, + { + "epoch": 0.815452818239392, + "grad_norm": 4.088546276092529, + "learning_rate": 9.654339590244359e-06, + "loss": 1.3065, + "step": 3219 + }, + { + "epoch": 0.8157061431285624, + "grad_norm": 3.6960456371307373, + "learning_rate": 9.654033412421565e-06, + "loss": 1.1731, + "step": 3220 + }, + { + "epoch": 0.8159594680177328, + "grad_norm": 3.8195888996124268, + "learning_rate": 9.653727103915694e-06, + "loss": 1.0466, + "step": 3221 + }, + { + "epoch": 0.8162127929069031, + "grad_norm": 3.41788649559021, + "learning_rate": 9.653420664735348e-06, + "loss": 1.1442, + "step": 3222 + }, + { + "epoch": 0.8164661177960735, + "grad_norm": 3.5325682163238525, + "learning_rate": 9.653114094889128e-06, + "loss": 1.1169, + "step": 3223 + }, + { + "epoch": 0.8167194426852439, + "grad_norm": 3.577907085418701, + "learning_rate": 9.652807394385646e-06, + "loss": 1.1823, + "step": 3224 + }, + { + "epoch": 0.8169727675744142, + "grad_norm": 3.28833270072937, + "learning_rate": 9.652500563233513e-06, + "loss": 1.1912, + "step": 3225 + }, + { + "epoch": 0.8172260924635846, + "grad_norm": 3.2663886547088623, + "learning_rate": 9.652193601441346e-06, + "loss": 1.2565, + "step": 3226 + }, + { + "epoch": 0.8174794173527549, + "grad_norm": 3.691718101501465, + "learning_rate": 9.651886509017763e-06, + "loss": 1.2236, + "step": 3227 + }, + { + "epoch": 0.8177327422419253, + "grad_norm": 3.51438570022583, + "learning_rate": 9.651579285971386e-06, + "loss": 1.232, + "step": 3228 + }, + { + "epoch": 0.8179860671310957, + "grad_norm": 3.36354660987854, + "learning_rate": 9.651271932310843e-06, + "loss": 1.1005, + "step": 3229 + }, + { + "epoch": 0.818239392020266, + "grad_norm": 2.835439443588257, + "learning_rate": 9.650964448044763e-06, + "loss": 1.1692, + "step": 3230 + }, + { + "epoch": 0.8184927169094364, + "grad_norm": 3.549734354019165, + "learning_rate": 9.650656833181784e-06, + "loss": 1.1792, + "step": 3231 + }, + { + "epoch": 0.8187460417986067, + "grad_norm": 3.8005549907684326, + "learning_rate": 9.650349087730538e-06, + "loss": 1.0825, + "step": 3232 + }, + { + "epoch": 0.8189993666877771, + "grad_norm": 3.6286110877990723, + "learning_rate": 9.65004121169967e-06, + "loss": 1.219, + "step": 3233 + }, + { + "epoch": 0.8192526915769475, + "grad_norm": 3.3073770999908447, + "learning_rate": 9.649733205097824e-06, + "loss": 1.0917, + "step": 3234 + }, + { + "epoch": 0.8195060164661178, + "grad_norm": 3.6399085521698, + "learning_rate": 9.64942506793365e-06, + "loss": 1.3087, + "step": 3235 + }, + { + "epoch": 0.8197593413552882, + "grad_norm": 3.6974353790283203, + "learning_rate": 9.649116800215799e-06, + "loss": 1.1865, + "step": 3236 + }, + { + "epoch": 0.8200126662444586, + "grad_norm": 3.6860082149505615, + "learning_rate": 9.648808401952926e-06, + "loss": 1.239, + "step": 3237 + }, + { + "epoch": 0.8202659911336289, + "grad_norm": 3.564912796020508, + "learning_rate": 9.648499873153691e-06, + "loss": 1.1653, + "step": 3238 + }, + { + "epoch": 0.8205193160227993, + "grad_norm": 3.8110899925231934, + "learning_rate": 9.648191213826761e-06, + "loss": 1.1491, + "step": 3239 + }, + { + "epoch": 0.8207726409119696, + "grad_norm": 3.462819814682007, + "learning_rate": 9.647882423980799e-06, + "loss": 1.2072, + "step": 3240 + }, + { + "epoch": 0.82102596580114, + "grad_norm": 3.7578890323638916, + "learning_rate": 9.647573503624477e-06, + "loss": 1.2355, + "step": 3241 + }, + { + "epoch": 0.8212792906903104, + "grad_norm": 3.286555051803589, + "learning_rate": 9.647264452766468e-06, + "loss": 1.1327, + "step": 3242 + }, + { + "epoch": 0.8215326155794807, + "grad_norm": 3.6600263118743896, + "learning_rate": 9.646955271415453e-06, + "loss": 1.2181, + "step": 3243 + }, + { + "epoch": 0.821785940468651, + "grad_norm": 3.9920105934143066, + "learning_rate": 9.64664595958011e-06, + "loss": 1.3502, + "step": 3244 + }, + { + "epoch": 0.8220392653578215, + "grad_norm": 3.3619606494903564, + "learning_rate": 9.646336517269127e-06, + "loss": 1.2424, + "step": 3245 + }, + { + "epoch": 0.8222925902469918, + "grad_norm": 3.620398759841919, + "learning_rate": 9.646026944491194e-06, + "loss": 1.1896, + "step": 3246 + }, + { + "epoch": 0.8225459151361622, + "grad_norm": 3.2635204792022705, + "learning_rate": 9.645717241255e-06, + "loss": 1.1473, + "step": 3247 + }, + { + "epoch": 0.8227992400253324, + "grad_norm": 3.320261001586914, + "learning_rate": 9.645407407569244e-06, + "loss": 1.0127, + "step": 3248 + }, + { + "epoch": 0.8230525649145028, + "grad_norm": 3.4120755195617676, + "learning_rate": 9.645097443442624e-06, + "loss": 1.0719, + "step": 3249 + }, + { + "epoch": 0.8233058898036733, + "grad_norm": 3.1058967113494873, + "learning_rate": 9.644787348883846e-06, + "loss": 1.1065, + "step": 3250 + }, + { + "epoch": 0.8235592146928435, + "grad_norm": 3.425712823867798, + "learning_rate": 9.644477123901617e-06, + "loss": 1.034, + "step": 3251 + }, + { + "epoch": 0.823812539582014, + "grad_norm": 3.5326826572418213, + "learning_rate": 9.644166768504645e-06, + "loss": 1.1458, + "step": 3252 + }, + { + "epoch": 0.8240658644711842, + "grad_norm": 3.7466468811035156, + "learning_rate": 9.643856282701646e-06, + "loss": 1.3668, + "step": 3253 + }, + { + "epoch": 0.8243191893603546, + "grad_norm": 3.811091184616089, + "learning_rate": 9.643545666501342e-06, + "loss": 1.2014, + "step": 3254 + }, + { + "epoch": 0.824572514249525, + "grad_norm": 3.382639169692993, + "learning_rate": 9.64323491991245e-06, + "loss": 1.0997, + "step": 3255 + }, + { + "epoch": 0.8248258391386953, + "grad_norm": 3.555203437805176, + "learning_rate": 9.642924042943699e-06, + "loss": 1.2725, + "step": 3256 + }, + { + "epoch": 0.8250791640278657, + "grad_norm": 3.4761922359466553, + "learning_rate": 9.642613035603816e-06, + "loss": 1.1242, + "step": 3257 + }, + { + "epoch": 0.8253324889170361, + "grad_norm": 3.351353168487549, + "learning_rate": 9.642301897901537e-06, + "loss": 1.0991, + "step": 3258 + }, + { + "epoch": 0.8255858138062064, + "grad_norm": 3.5288312435150146, + "learning_rate": 9.641990629845593e-06, + "loss": 1.1853, + "step": 3259 + }, + { + "epoch": 0.8258391386953768, + "grad_norm": 3.377979040145874, + "learning_rate": 9.64167923144473e-06, + "loss": 1.1463, + "step": 3260 + }, + { + "epoch": 0.8260924635845471, + "grad_norm": 3.269613265991211, + "learning_rate": 9.64136770270769e-06, + "loss": 1.1406, + "step": 3261 + }, + { + "epoch": 0.8263457884737175, + "grad_norm": 3.4501872062683105, + "learning_rate": 9.641056043643218e-06, + "loss": 1.1931, + "step": 3262 + }, + { + "epoch": 0.8265991133628879, + "grad_norm": 3.291609764099121, + "learning_rate": 9.640744254260068e-06, + "loss": 1.082, + "step": 3263 + }, + { + "epoch": 0.8268524382520582, + "grad_norm": 3.530622720718384, + "learning_rate": 9.640432334566995e-06, + "loss": 1.1391, + "step": 3264 + }, + { + "epoch": 0.8271057631412286, + "grad_norm": 3.626964807510376, + "learning_rate": 9.640120284572757e-06, + "loss": 1.1773, + "step": 3265 + }, + { + "epoch": 0.827359088030399, + "grad_norm": 3.473663091659546, + "learning_rate": 9.639808104286118e-06, + "loss": 1.234, + "step": 3266 + }, + { + "epoch": 0.8276124129195693, + "grad_norm": 3.2221481800079346, + "learning_rate": 9.639495793715838e-06, + "loss": 0.9925, + "step": 3267 + }, + { + "epoch": 0.8278657378087397, + "grad_norm": 3.36942458152771, + "learning_rate": 9.639183352870693e-06, + "loss": 1.1158, + "step": 3268 + }, + { + "epoch": 0.82811906269791, + "grad_norm": 3.7078704833984375, + "learning_rate": 9.638870781759453e-06, + "loss": 1.1496, + "step": 3269 + }, + { + "epoch": 0.8283723875870804, + "grad_norm": 3.411898374557495, + "learning_rate": 9.638558080390895e-06, + "loss": 1.1096, + "step": 3270 + }, + { + "epoch": 0.8286257124762508, + "grad_norm": 3.7567858695983887, + "learning_rate": 9.638245248773804e-06, + "loss": 1.2055, + "step": 3271 + }, + { + "epoch": 0.8288790373654211, + "grad_norm": 3.509984016418457, + "learning_rate": 9.637932286916955e-06, + "loss": 1.1556, + "step": 3272 + }, + { + "epoch": 0.8291323622545915, + "grad_norm": 3.6461474895477295, + "learning_rate": 9.637619194829144e-06, + "loss": 1.2767, + "step": 3273 + }, + { + "epoch": 0.8293856871437619, + "grad_norm": 3.354914665222168, + "learning_rate": 9.63730597251916e-06, + "loss": 1.1551, + "step": 3274 + }, + { + "epoch": 0.8296390120329322, + "grad_norm": 3.547229766845703, + "learning_rate": 9.636992619995795e-06, + "loss": 1.1532, + "step": 3275 + }, + { + "epoch": 0.8298923369221026, + "grad_norm": 3.6905324459075928, + "learning_rate": 9.636679137267852e-06, + "loss": 1.2091, + "step": 3276 + }, + { + "epoch": 0.8301456618112729, + "grad_norm": 3.495739698410034, + "learning_rate": 9.636365524344132e-06, + "loss": 1.2042, + "step": 3277 + }, + { + "epoch": 0.8303989867004433, + "grad_norm": 3.5037713050842285, + "learning_rate": 9.636051781233443e-06, + "loss": 1.2253, + "step": 3278 + }, + { + "epoch": 0.8306523115896137, + "grad_norm": 3.280710220336914, + "learning_rate": 9.635737907944589e-06, + "loss": 1.2191, + "step": 3279 + }, + { + "epoch": 0.830905636478784, + "grad_norm": 3.6910886764526367, + "learning_rate": 9.63542390448639e-06, + "loss": 1.179, + "step": 3280 + }, + { + "epoch": 0.8311589613679544, + "grad_norm": 3.32639217376709, + "learning_rate": 9.635109770867658e-06, + "loss": 1.1353, + "step": 3281 + }, + { + "epoch": 0.8314122862571247, + "grad_norm": 3.2570884227752686, + "learning_rate": 9.634795507097217e-06, + "loss": 1.0916, + "step": 3282 + }, + { + "epoch": 0.8316656111462951, + "grad_norm": 3.7500569820404053, + "learning_rate": 9.634481113183892e-06, + "loss": 1.1985, + "step": 3283 + }, + { + "epoch": 0.8319189360354655, + "grad_norm": 3.468489646911621, + "learning_rate": 9.634166589136508e-06, + "loss": 1.1431, + "step": 3284 + }, + { + "epoch": 0.8321722609246358, + "grad_norm": 3.260855197906494, + "learning_rate": 9.633851934963899e-06, + "loss": 1.1575, + "step": 3285 + }, + { + "epoch": 0.8324255858138062, + "grad_norm": 3.2026495933532715, + "learning_rate": 9.633537150674898e-06, + "loss": 1.0722, + "step": 3286 + }, + { + "epoch": 0.8326789107029766, + "grad_norm": 3.4730541706085205, + "learning_rate": 9.633222236278346e-06, + "loss": 1.2272, + "step": 3287 + }, + { + "epoch": 0.8329322355921469, + "grad_norm": 3.312720537185669, + "learning_rate": 9.632907191783085e-06, + "loss": 1.239, + "step": 3288 + }, + { + "epoch": 0.8331855604813173, + "grad_norm": 3.187642812728882, + "learning_rate": 9.632592017197962e-06, + "loss": 1.059, + "step": 3289 + }, + { + "epoch": 0.8334388853704876, + "grad_norm": 4.059429168701172, + "learning_rate": 9.632276712531825e-06, + "loss": 1.3552, + "step": 3290 + }, + { + "epoch": 0.833692210259658, + "grad_norm": 3.5450494289398193, + "learning_rate": 9.63196127779353e-06, + "loss": 1.2119, + "step": 3291 + }, + { + "epoch": 0.8339455351488284, + "grad_norm": 3.40283465385437, + "learning_rate": 9.631645712991932e-06, + "loss": 1.0008, + "step": 3292 + }, + { + "epoch": 0.8341988600379987, + "grad_norm": 3.6970903873443604, + "learning_rate": 9.631330018135896e-06, + "loss": 1.1442, + "step": 3293 + }, + { + "epoch": 0.8344521849271691, + "grad_norm": 3.3924965858459473, + "learning_rate": 9.631014193234282e-06, + "loss": 1.1336, + "step": 3294 + }, + { + "epoch": 0.8347055098163395, + "grad_norm": 3.460174083709717, + "learning_rate": 9.630698238295959e-06, + "loss": 1.2668, + "step": 3295 + }, + { + "epoch": 0.8349588347055098, + "grad_norm": 3.0121147632598877, + "learning_rate": 9.6303821533298e-06, + "loss": 1.0716, + "step": 3296 + }, + { + "epoch": 0.8352121595946802, + "grad_norm": 3.6534409523010254, + "learning_rate": 9.630065938344682e-06, + "loss": 1.2435, + "step": 3297 + }, + { + "epoch": 0.8354654844838505, + "grad_norm": 3.2021796703338623, + "learning_rate": 9.62974959334948e-06, + "loss": 1.0484, + "step": 3298 + }, + { + "epoch": 0.8357188093730209, + "grad_norm": 4.067572116851807, + "learning_rate": 9.629433118353083e-06, + "loss": 1.3855, + "step": 3299 + }, + { + "epoch": 0.8359721342621913, + "grad_norm": 3.870067596435547, + "learning_rate": 9.62911651336437e-06, + "loss": 1.2859, + "step": 3300 + }, + { + "epoch": 0.8362254591513616, + "grad_norm": 3.280607223510742, + "learning_rate": 9.628799778392237e-06, + "loss": 1.2571, + "step": 3301 + }, + { + "epoch": 0.836478784040532, + "grad_norm": 3.271177291870117, + "learning_rate": 9.628482913445575e-06, + "loss": 0.9535, + "step": 3302 + }, + { + "epoch": 0.8367321089297023, + "grad_norm": 3.8014323711395264, + "learning_rate": 9.628165918533282e-06, + "loss": 1.2103, + "step": 3303 + }, + { + "epoch": 0.8369854338188727, + "grad_norm": 4.148789882659912, + "learning_rate": 9.627848793664258e-06, + "loss": 1.2324, + "step": 3304 + }, + { + "epoch": 0.8372387587080431, + "grad_norm": 3.280813217163086, + "learning_rate": 9.62753153884741e-06, + "loss": 1.1898, + "step": 3305 + }, + { + "epoch": 0.8374920835972134, + "grad_norm": 3.4878618717193604, + "learning_rate": 9.627214154091646e-06, + "loss": 1.2181, + "step": 3306 + }, + { + "epoch": 0.8377454084863838, + "grad_norm": 3.2640535831451416, + "learning_rate": 9.626896639405876e-06, + "loss": 1.1118, + "step": 3307 + }, + { + "epoch": 0.8379987333755542, + "grad_norm": 3.645920753479004, + "learning_rate": 9.626578994799017e-06, + "loss": 1.2364, + "step": 3308 + }, + { + "epoch": 0.8382520582647245, + "grad_norm": 3.916548490524292, + "learning_rate": 9.62626122027999e-06, + "loss": 1.2849, + "step": 3309 + }, + { + "epoch": 0.8385053831538949, + "grad_norm": 3.662060499191284, + "learning_rate": 9.625943315857713e-06, + "loss": 1.2489, + "step": 3310 + }, + { + "epoch": 0.8387587080430652, + "grad_norm": 3.293536424636841, + "learning_rate": 9.625625281541117e-06, + "loss": 1.1696, + "step": 3311 + }, + { + "epoch": 0.8390120329322356, + "grad_norm": 3.3884665966033936, + "learning_rate": 9.625307117339132e-06, + "loss": 1.3609, + "step": 3312 + }, + { + "epoch": 0.839265357821406, + "grad_norm": 3.5792012214660645, + "learning_rate": 9.62498882326069e-06, + "loss": 1.1006, + "step": 3313 + }, + { + "epoch": 0.8395186827105763, + "grad_norm": 3.2106194496154785, + "learning_rate": 9.62467039931473e-06, + "loss": 1.0974, + "step": 3314 + }, + { + "epoch": 0.8397720075997467, + "grad_norm": 3.4125583171844482, + "learning_rate": 9.624351845510192e-06, + "loss": 1.1999, + "step": 3315 + }, + { + "epoch": 0.8400253324889171, + "grad_norm": 3.447925090789795, + "learning_rate": 9.624033161856024e-06, + "loss": 1.1, + "step": 3316 + }, + { + "epoch": 0.8402786573780874, + "grad_norm": 3.7276673316955566, + "learning_rate": 9.623714348361169e-06, + "loss": 1.3541, + "step": 3317 + }, + { + "epoch": 0.8405319822672578, + "grad_norm": 3.6366593837738037, + "learning_rate": 9.623395405034584e-06, + "loss": 1.183, + "step": 3318 + }, + { + "epoch": 0.8407853071564281, + "grad_norm": 3.335683822631836, + "learning_rate": 9.623076331885222e-06, + "loss": 1.1631, + "step": 3319 + }, + { + "epoch": 0.8410386320455985, + "grad_norm": 3.652543783187866, + "learning_rate": 9.622757128922043e-06, + "loss": 1.1235, + "step": 3320 + }, + { + "epoch": 0.8412919569347689, + "grad_norm": 3.2850959300994873, + "learning_rate": 9.62243779615401e-06, + "loss": 1.0402, + "step": 3321 + }, + { + "epoch": 0.8415452818239392, + "grad_norm": 3.1912031173706055, + "learning_rate": 9.62211833359009e-06, + "loss": 1.0177, + "step": 3322 + }, + { + "epoch": 0.8417986067131096, + "grad_norm": 3.3349733352661133, + "learning_rate": 9.621798741239255e-06, + "loss": 1.1605, + "step": 3323 + }, + { + "epoch": 0.84205193160228, + "grad_norm": 3.359973907470703, + "learning_rate": 9.621479019110476e-06, + "loss": 1.1057, + "step": 3324 + }, + { + "epoch": 0.8423052564914503, + "grad_norm": 3.6060352325439453, + "learning_rate": 9.621159167212735e-06, + "loss": 1.2396, + "step": 3325 + }, + { + "epoch": 0.8425585813806207, + "grad_norm": 3.468721866607666, + "learning_rate": 9.620839185555006e-06, + "loss": 1.1335, + "step": 3326 + }, + { + "epoch": 0.842811906269791, + "grad_norm": 3.759596824645996, + "learning_rate": 9.620519074146282e-06, + "loss": 1.2117, + "step": 3327 + }, + { + "epoch": 0.8430652311589614, + "grad_norm": 3.733689546585083, + "learning_rate": 9.620198832995547e-06, + "loss": 1.1796, + "step": 3328 + }, + { + "epoch": 0.8433185560481318, + "grad_norm": 3.7993173599243164, + "learning_rate": 9.619878462111793e-06, + "loss": 1.1737, + "step": 3329 + }, + { + "epoch": 0.8435718809373021, + "grad_norm": 2.978166103363037, + "learning_rate": 9.619557961504018e-06, + "loss": 0.9284, + "step": 3330 + }, + { + "epoch": 0.8438252058264725, + "grad_norm": 3.3335835933685303, + "learning_rate": 9.619237331181221e-06, + "loss": 1.189, + "step": 3331 + }, + { + "epoch": 0.8440785307156428, + "grad_norm": 3.68396258354187, + "learning_rate": 9.618916571152403e-06, + "loss": 1.1267, + "step": 3332 + }, + { + "epoch": 0.8443318556048132, + "grad_norm": 3.612015962600708, + "learning_rate": 9.618595681426574e-06, + "loss": 1.3243, + "step": 3333 + }, + { + "epoch": 0.8445851804939836, + "grad_norm": 3.8254354000091553, + "learning_rate": 9.618274662012743e-06, + "loss": 1.3139, + "step": 3334 + }, + { + "epoch": 0.8448385053831539, + "grad_norm": 3.5294089317321777, + "learning_rate": 9.617953512919922e-06, + "loss": 1.1345, + "step": 3335 + }, + { + "epoch": 0.8450918302723243, + "grad_norm": 3.3894784450531006, + "learning_rate": 9.617632234157132e-06, + "loss": 1.0853, + "step": 3336 + }, + { + "epoch": 0.8453451551614947, + "grad_norm": 3.574516534805298, + "learning_rate": 9.617310825733395e-06, + "loss": 1.1328, + "step": 3337 + }, + { + "epoch": 0.845598480050665, + "grad_norm": 3.29719877243042, + "learning_rate": 9.616989287657731e-06, + "loss": 1.0126, + "step": 3338 + }, + { + "epoch": 0.8458518049398354, + "grad_norm": 3.667088031768799, + "learning_rate": 9.616667619939172e-06, + "loss": 1.3743, + "step": 3339 + }, + { + "epoch": 0.8461051298290057, + "grad_norm": 3.3549182415008545, + "learning_rate": 9.616345822586753e-06, + "loss": 1.1181, + "step": 3340 + }, + { + "epoch": 0.8463584547181761, + "grad_norm": 3.507493257522583, + "learning_rate": 9.616023895609503e-06, + "loss": 1.2276, + "step": 3341 + }, + { + "epoch": 0.8466117796073465, + "grad_norm": 3.775092601776123, + "learning_rate": 9.615701839016468e-06, + "loss": 1.2453, + "step": 3342 + }, + { + "epoch": 0.8468651044965168, + "grad_norm": 3.3038055896759033, + "learning_rate": 9.615379652816687e-06, + "loss": 1.2524, + "step": 3343 + }, + { + "epoch": 0.8471184293856872, + "grad_norm": 3.6604628562927246, + "learning_rate": 9.615057337019208e-06, + "loss": 1.0756, + "step": 3344 + }, + { + "epoch": 0.8473717542748576, + "grad_norm": 3.2440614700317383, + "learning_rate": 9.614734891633084e-06, + "loss": 1.1165, + "step": 3345 + }, + { + "epoch": 0.8476250791640279, + "grad_norm": 3.37060284614563, + "learning_rate": 9.614412316667367e-06, + "loss": 1.1274, + "step": 3346 + }, + { + "epoch": 0.8478784040531983, + "grad_norm": 3.8288700580596924, + "learning_rate": 9.614089612131114e-06, + "loss": 1.1934, + "step": 3347 + }, + { + "epoch": 0.8481317289423685, + "grad_norm": 3.8033788204193115, + "learning_rate": 9.613766778033387e-06, + "loss": 1.1799, + "step": 3348 + }, + { + "epoch": 0.848385053831539, + "grad_norm": 3.9271960258483887, + "learning_rate": 9.613443814383252e-06, + "loss": 1.1305, + "step": 3349 + }, + { + "epoch": 0.8486383787207094, + "grad_norm": 3.70815372467041, + "learning_rate": 9.613120721189776e-06, + "loss": 1.1846, + "step": 3350 + }, + { + "epoch": 0.8488917036098796, + "grad_norm": 3.861818790435791, + "learning_rate": 9.612797498462032e-06, + "loss": 1.1439, + "step": 3351 + }, + { + "epoch": 0.84914502849905, + "grad_norm": 3.6337101459503174, + "learning_rate": 9.612474146209097e-06, + "loss": 1.1044, + "step": 3352 + }, + { + "epoch": 0.8493983533882203, + "grad_norm": 3.3341217041015625, + "learning_rate": 9.61215066444005e-06, + "loss": 1.0371, + "step": 3353 + }, + { + "epoch": 0.8496516782773907, + "grad_norm": 3.7437963485717773, + "learning_rate": 9.611827053163973e-06, + "loss": 1.317, + "step": 3354 + }, + { + "epoch": 0.8499050031665611, + "grad_norm": 3.634654998779297, + "learning_rate": 9.611503312389953e-06, + "loss": 1.2687, + "step": 3355 + }, + { + "epoch": 0.8501583280557314, + "grad_norm": 3.5133156776428223, + "learning_rate": 9.611179442127083e-06, + "loss": 1.2252, + "step": 3356 + }, + { + "epoch": 0.8504116529449018, + "grad_norm": 3.691375255584717, + "learning_rate": 9.610855442384456e-06, + "loss": 1.2031, + "step": 3357 + }, + { + "epoch": 0.8506649778340722, + "grad_norm": 3.2914648056030273, + "learning_rate": 9.610531313171168e-06, + "loss": 1.1457, + "step": 3358 + }, + { + "epoch": 0.8509183027232425, + "grad_norm": 3.2313108444213867, + "learning_rate": 9.610207054496322e-06, + "loss": 1.101, + "step": 3359 + }, + { + "epoch": 0.8511716276124129, + "grad_norm": 3.529951333999634, + "learning_rate": 9.609882666369022e-06, + "loss": 1.1806, + "step": 3360 + }, + { + "epoch": 0.8514249525015832, + "grad_norm": 3.5383546352386475, + "learning_rate": 9.609558148798378e-06, + "loss": 1.145, + "step": 3361 + }, + { + "epoch": 0.8516782773907536, + "grad_norm": 3.3370304107666016, + "learning_rate": 9.609233501793502e-06, + "loss": 1.0638, + "step": 3362 + }, + { + "epoch": 0.851931602279924, + "grad_norm": 3.65524959564209, + "learning_rate": 9.608908725363509e-06, + "loss": 1.3135, + "step": 3363 + }, + { + "epoch": 0.8521849271690943, + "grad_norm": 3.2601053714752197, + "learning_rate": 9.608583819517519e-06, + "loss": 1.1432, + "step": 3364 + }, + { + "epoch": 0.8524382520582647, + "grad_norm": 3.9082705974578857, + "learning_rate": 9.608258784264654e-06, + "loss": 1.36, + "step": 3365 + }, + { + "epoch": 0.8526915769474351, + "grad_norm": 3.53440260887146, + "learning_rate": 9.607933619614042e-06, + "loss": 1.268, + "step": 3366 + }, + { + "epoch": 0.8529449018366054, + "grad_norm": 3.2626724243164062, + "learning_rate": 9.607608325574816e-06, + "loss": 1.1265, + "step": 3367 + }, + { + "epoch": 0.8531982267257758, + "grad_norm": 3.4613711833953857, + "learning_rate": 9.607282902156106e-06, + "loss": 1.1979, + "step": 3368 + }, + { + "epoch": 0.8534515516149461, + "grad_norm": 3.542390823364258, + "learning_rate": 9.606957349367052e-06, + "loss": 1.198, + "step": 3369 + }, + { + "epoch": 0.8537048765041165, + "grad_norm": 3.2955868244171143, + "learning_rate": 9.606631667216794e-06, + "loss": 1.2156, + "step": 3370 + }, + { + "epoch": 0.8539582013932869, + "grad_norm": 3.3685765266418457, + "learning_rate": 9.60630585571448e-06, + "loss": 1.2304, + "step": 3371 + }, + { + "epoch": 0.8542115262824572, + "grad_norm": 4.13430643081665, + "learning_rate": 9.605979914869255e-06, + "loss": 1.3528, + "step": 3372 + }, + { + "epoch": 0.8544648511716276, + "grad_norm": 3.3133087158203125, + "learning_rate": 9.605653844690273e-06, + "loss": 1.1131, + "step": 3373 + }, + { + "epoch": 0.8547181760607979, + "grad_norm": 3.21870756149292, + "learning_rate": 9.605327645186688e-06, + "loss": 1.1227, + "step": 3374 + }, + { + "epoch": 0.8549715009499683, + "grad_norm": 3.2606735229492188, + "learning_rate": 9.605001316367664e-06, + "loss": 1.2406, + "step": 3375 + }, + { + "epoch": 0.8552248258391387, + "grad_norm": 3.4934375286102295, + "learning_rate": 9.60467485824236e-06, + "loss": 1.2796, + "step": 3376 + }, + { + "epoch": 0.855478150728309, + "grad_norm": 3.60872220993042, + "learning_rate": 9.604348270819944e-06, + "loss": 1.0723, + "step": 3377 + }, + { + "epoch": 0.8557314756174794, + "grad_norm": 3.738271951675415, + "learning_rate": 9.604021554109586e-06, + "loss": 1.3091, + "step": 3378 + }, + { + "epoch": 0.8559848005066498, + "grad_norm": 3.7667062282562256, + "learning_rate": 9.60369470812046e-06, + "loss": 1.2022, + "step": 3379 + }, + { + "epoch": 0.8562381253958201, + "grad_norm": 3.4908809661865234, + "learning_rate": 9.603367732861746e-06, + "loss": 1.2256, + "step": 3380 + }, + { + "epoch": 0.8564914502849905, + "grad_norm": 3.327702522277832, + "learning_rate": 9.603040628342622e-06, + "loss": 1.2269, + "step": 3381 + }, + { + "epoch": 0.8567447751741608, + "grad_norm": 3.4891254901885986, + "learning_rate": 9.602713394572276e-06, + "loss": 1.1644, + "step": 3382 + }, + { + "epoch": 0.8569981000633312, + "grad_norm": 3.4369208812713623, + "learning_rate": 9.602386031559893e-06, + "loss": 1.1514, + "step": 3383 + }, + { + "epoch": 0.8572514249525016, + "grad_norm": 3.2946858406066895, + "learning_rate": 9.602058539314669e-06, + "loss": 1.158, + "step": 3384 + }, + { + "epoch": 0.8575047498416719, + "grad_norm": 3.3024790287017822, + "learning_rate": 9.601730917845798e-06, + "loss": 1.2727, + "step": 3385 + }, + { + "epoch": 0.8577580747308423, + "grad_norm": 3.251995325088501, + "learning_rate": 9.60140316716248e-06, + "loss": 1.0886, + "step": 3386 + }, + { + "epoch": 0.8580113996200127, + "grad_norm": 3.3264102935791016, + "learning_rate": 9.601075287273916e-06, + "loss": 1.1008, + "step": 3387 + }, + { + "epoch": 0.858264724509183, + "grad_norm": 3.395895481109619, + "learning_rate": 9.600747278189314e-06, + "loss": 1.0521, + "step": 3388 + }, + { + "epoch": 0.8585180493983534, + "grad_norm": 3.5084939002990723, + "learning_rate": 9.600419139917887e-06, + "loss": 1.1628, + "step": 3389 + }, + { + "epoch": 0.8587713742875237, + "grad_norm": 3.2303173542022705, + "learning_rate": 9.600090872468846e-06, + "loss": 1.1065, + "step": 3390 + }, + { + "epoch": 0.8590246991766941, + "grad_norm": 3.6353373527526855, + "learning_rate": 9.599762475851409e-06, + "loss": 1.2276, + "step": 3391 + }, + { + "epoch": 0.8592780240658645, + "grad_norm": 3.4630491733551025, + "learning_rate": 9.599433950074797e-06, + "loss": 1.1645, + "step": 3392 + }, + { + "epoch": 0.8595313489550348, + "grad_norm": 3.725224256515503, + "learning_rate": 9.599105295148235e-06, + "loss": 1.1279, + "step": 3393 + }, + { + "epoch": 0.8597846738442052, + "grad_norm": 3.003977060317993, + "learning_rate": 9.598776511080954e-06, + "loss": 1.0819, + "step": 3394 + }, + { + "epoch": 0.8600379987333756, + "grad_norm": 3.839994192123413, + "learning_rate": 9.598447597882181e-06, + "loss": 1.3302, + "step": 3395 + }, + { + "epoch": 0.8602913236225459, + "grad_norm": 3.295487880706787, + "learning_rate": 9.598118555561156e-06, + "loss": 1.0836, + "step": 3396 + }, + { + "epoch": 0.8605446485117163, + "grad_norm": 3.553488254547119, + "learning_rate": 9.597789384127117e-06, + "loss": 1.2063, + "step": 3397 + }, + { + "epoch": 0.8607979734008866, + "grad_norm": 3.2332942485809326, + "learning_rate": 9.597460083589307e-06, + "loss": 1.0509, + "step": 3398 + }, + { + "epoch": 0.861051298290057, + "grad_norm": 3.3764026165008545, + "learning_rate": 9.597130653956973e-06, + "loss": 1.153, + "step": 3399 + }, + { + "epoch": 0.8613046231792274, + "grad_norm": 3.295069932937622, + "learning_rate": 9.596801095239365e-06, + "loss": 1.0382, + "step": 3400 + }, + { + "epoch": 0.8615579480683977, + "grad_norm": 3.451786518096924, + "learning_rate": 9.596471407445736e-06, + "loss": 1.1697, + "step": 3401 + }, + { + "epoch": 0.8618112729575681, + "grad_norm": 3.5099878311157227, + "learning_rate": 9.596141590585344e-06, + "loss": 1.1123, + "step": 3402 + }, + { + "epoch": 0.8620645978467384, + "grad_norm": 3.3034534454345703, + "learning_rate": 9.59581164466745e-06, + "loss": 1.1429, + "step": 3403 + }, + { + "epoch": 0.8623179227359088, + "grad_norm": 3.3229706287384033, + "learning_rate": 9.595481569701319e-06, + "loss": 1.0715, + "step": 3404 + }, + { + "epoch": 0.8625712476250792, + "grad_norm": 3.2438437938690186, + "learning_rate": 9.595151365696221e-06, + "loss": 1.1276, + "step": 3405 + }, + { + "epoch": 0.8628245725142495, + "grad_norm": 3.7240092754364014, + "learning_rate": 9.594821032661425e-06, + "loss": 1.1568, + "step": 3406 + }, + { + "epoch": 0.8630778974034199, + "grad_norm": 3.175704002380371, + "learning_rate": 9.594490570606207e-06, + "loss": 1.1228, + "step": 3407 + }, + { + "epoch": 0.8633312222925903, + "grad_norm": 3.8279178142547607, + "learning_rate": 9.594159979539849e-06, + "loss": 1.2746, + "step": 3408 + }, + { + "epoch": 0.8635845471817606, + "grad_norm": 3.5796291828155518, + "learning_rate": 9.59382925947163e-06, + "loss": 1.2341, + "step": 3409 + }, + { + "epoch": 0.863837872070931, + "grad_norm": 3.5394248962402344, + "learning_rate": 9.59349841041084e-06, + "loss": 1.1411, + "step": 3410 + }, + { + "epoch": 0.8640911969601013, + "grad_norm": 3.572301149368286, + "learning_rate": 9.593167432366766e-06, + "loss": 1.1513, + "step": 3411 + }, + { + "epoch": 0.8643445218492717, + "grad_norm": 3.1407394409179688, + "learning_rate": 9.592836325348705e-06, + "loss": 1.1336, + "step": 3412 + }, + { + "epoch": 0.8645978467384421, + "grad_norm": 3.3476498126983643, + "learning_rate": 9.59250508936595e-06, + "loss": 1.1538, + "step": 3413 + }, + { + "epoch": 0.8648511716276124, + "grad_norm": 3.4111223220825195, + "learning_rate": 9.592173724427809e-06, + "loss": 1.056, + "step": 3414 + }, + { + "epoch": 0.8651044965167828, + "grad_norm": 3.2360401153564453, + "learning_rate": 9.591842230543578e-06, + "loss": 1.0997, + "step": 3415 + }, + { + "epoch": 0.8653578214059532, + "grad_norm": 3.479116678237915, + "learning_rate": 9.59151060772257e-06, + "loss": 1.294, + "step": 3416 + }, + { + "epoch": 0.8656111462951235, + "grad_norm": 3.418973922729492, + "learning_rate": 9.591178855974097e-06, + "loss": 1.2281, + "step": 3417 + }, + { + "epoch": 0.8658644711842939, + "grad_norm": 3.6004345417022705, + "learning_rate": 9.590846975307473e-06, + "loss": 1.2213, + "step": 3418 + }, + { + "epoch": 0.8661177960734642, + "grad_norm": 3.6040594577789307, + "learning_rate": 9.590514965732017e-06, + "loss": 1.0826, + "step": 3419 + }, + { + "epoch": 0.8663711209626346, + "grad_norm": 3.5817062854766846, + "learning_rate": 9.590182827257053e-06, + "loss": 1.2093, + "step": 3420 + }, + { + "epoch": 0.866624445851805, + "grad_norm": 3.6036312580108643, + "learning_rate": 9.589850559891906e-06, + "loss": 1.2953, + "step": 3421 + }, + { + "epoch": 0.8668777707409753, + "grad_norm": 3.4746053218841553, + "learning_rate": 9.589518163645908e-06, + "loss": 1.1615, + "step": 3422 + }, + { + "epoch": 0.8671310956301457, + "grad_norm": 3.4524717330932617, + "learning_rate": 9.58918563852839e-06, + "loss": 1.1426, + "step": 3423 + }, + { + "epoch": 0.867384420519316, + "grad_norm": 3.8653552532196045, + "learning_rate": 9.58885298454869e-06, + "loss": 1.1985, + "step": 3424 + }, + { + "epoch": 0.8676377454084864, + "grad_norm": 3.478992462158203, + "learning_rate": 9.588520201716149e-06, + "loss": 1.2534, + "step": 3425 + }, + { + "epoch": 0.8678910702976568, + "grad_norm": 3.3428542613983154, + "learning_rate": 9.588187290040109e-06, + "loss": 0.9823, + "step": 3426 + }, + { + "epoch": 0.8681443951868271, + "grad_norm": 3.336794137954712, + "learning_rate": 9.587854249529924e-06, + "loss": 1.2026, + "step": 3427 + }, + { + "epoch": 0.8683977200759975, + "grad_norm": 3.6661384105682373, + "learning_rate": 9.58752108019494e-06, + "loss": 1.1801, + "step": 3428 + }, + { + "epoch": 0.8686510449651679, + "grad_norm": 3.672661066055298, + "learning_rate": 9.587187782044514e-06, + "loss": 1.2013, + "step": 3429 + }, + { + "epoch": 0.8689043698543382, + "grad_norm": 3.4144299030303955, + "learning_rate": 9.586854355088006e-06, + "loss": 1.386, + "step": 3430 + }, + { + "epoch": 0.8691576947435086, + "grad_norm": 3.670971393585205, + "learning_rate": 9.586520799334776e-06, + "loss": 1.1359, + "step": 3431 + }, + { + "epoch": 0.8694110196326789, + "grad_norm": 3.665709972381592, + "learning_rate": 9.586187114794192e-06, + "loss": 1.1387, + "step": 3432 + }, + { + "epoch": 0.8696643445218493, + "grad_norm": 3.3912973403930664, + "learning_rate": 9.585853301475625e-06, + "loss": 1.1105, + "step": 3433 + }, + { + "epoch": 0.8699176694110197, + "grad_norm": 3.3353540897369385, + "learning_rate": 9.585519359388445e-06, + "loss": 1.2105, + "step": 3434 + }, + { + "epoch": 0.87017099430019, + "grad_norm": 3.507322072982788, + "learning_rate": 9.585185288542031e-06, + "loss": 1.0176, + "step": 3435 + }, + { + "epoch": 0.8704243191893604, + "grad_norm": 3.2737700939178467, + "learning_rate": 9.584851088945762e-06, + "loss": 1.0474, + "step": 3436 + }, + { + "epoch": 0.8706776440785308, + "grad_norm": 3.285423517227173, + "learning_rate": 9.584516760609024e-06, + "loss": 1.1533, + "step": 3437 + }, + { + "epoch": 0.8709309689677011, + "grad_norm": 3.5063467025756836, + "learning_rate": 9.584182303541205e-06, + "loss": 1.2178, + "step": 3438 + }, + { + "epoch": 0.8711842938568715, + "grad_norm": 3.2288620471954346, + "learning_rate": 9.583847717751694e-06, + "loss": 1.0731, + "step": 3439 + }, + { + "epoch": 0.8714376187460418, + "grad_norm": 3.816467523574829, + "learning_rate": 9.583513003249889e-06, + "loss": 1.2776, + "step": 3440 + }, + { + "epoch": 0.8716909436352122, + "grad_norm": 3.81528902053833, + "learning_rate": 9.583178160045186e-06, + "loss": 1.1239, + "step": 3441 + }, + { + "epoch": 0.8719442685243826, + "grad_norm": 3.3365797996520996, + "learning_rate": 9.582843188146987e-06, + "loss": 1.2304, + "step": 3442 + }, + { + "epoch": 0.8721975934135529, + "grad_norm": 3.1397223472595215, + "learning_rate": 9.582508087564701e-06, + "loss": 1.0956, + "step": 3443 + }, + { + "epoch": 0.8724509183027233, + "grad_norm": 3.297811985015869, + "learning_rate": 9.582172858307735e-06, + "loss": 1.1335, + "step": 3444 + }, + { + "epoch": 0.8727042431918937, + "grad_norm": 3.520543336868286, + "learning_rate": 9.581837500385503e-06, + "loss": 1.2358, + "step": 3445 + }, + { + "epoch": 0.872957568081064, + "grad_norm": 3.4179327487945557, + "learning_rate": 9.581502013807422e-06, + "loss": 1.142, + "step": 3446 + }, + { + "epoch": 0.8732108929702344, + "grad_norm": 3.337519407272339, + "learning_rate": 9.58116639858291e-06, + "loss": 1.1565, + "step": 3447 + }, + { + "epoch": 0.8734642178594046, + "grad_norm": 3.9267821311950684, + "learning_rate": 9.580830654721393e-06, + "loss": 1.1889, + "step": 3448 + }, + { + "epoch": 0.873717542748575, + "grad_norm": 3.376887559890747, + "learning_rate": 9.580494782232299e-06, + "loss": 1.0976, + "step": 3449 + }, + { + "epoch": 0.8739708676377455, + "grad_norm": 3.618590831756592, + "learning_rate": 9.580158781125058e-06, + "loss": 1.211, + "step": 3450 + }, + { + "epoch": 0.8742241925269157, + "grad_norm": 3.1070926189422607, + "learning_rate": 9.579822651409105e-06, + "loss": 1.2301, + "step": 3451 + }, + { + "epoch": 0.8744775174160861, + "grad_norm": 3.3936069011688232, + "learning_rate": 9.579486393093878e-06, + "loss": 1.1922, + "step": 3452 + }, + { + "epoch": 0.8747308423052564, + "grad_norm": 3.7701573371887207, + "learning_rate": 9.57915000618882e-06, + "loss": 1.1539, + "step": 3453 + }, + { + "epoch": 0.8749841671944268, + "grad_norm": 3.4328768253326416, + "learning_rate": 9.578813490703375e-06, + "loss": 1.1615, + "step": 3454 + }, + { + "epoch": 0.8752374920835972, + "grad_norm": 3.3606112003326416, + "learning_rate": 9.578476846646994e-06, + "loss": 1.0825, + "step": 3455 + }, + { + "epoch": 0.8754908169727675, + "grad_norm": 3.7307562828063965, + "learning_rate": 9.57814007402913e-06, + "loss": 1.2022, + "step": 3456 + }, + { + "epoch": 0.8757441418619379, + "grad_norm": 3.4997165203094482, + "learning_rate": 9.577803172859236e-06, + "loss": 1.1344, + "step": 3457 + }, + { + "epoch": 0.8759974667511083, + "grad_norm": 3.4206204414367676, + "learning_rate": 9.577466143146777e-06, + "loss": 1.2185, + "step": 3458 + }, + { + "epoch": 0.8762507916402786, + "grad_norm": 3.629504919052124, + "learning_rate": 9.577128984901212e-06, + "loss": 1.0822, + "step": 3459 + }, + { + "epoch": 0.876504116529449, + "grad_norm": 3.5562291145324707, + "learning_rate": 9.57679169813201e-06, + "loss": 1.2349, + "step": 3460 + }, + { + "epoch": 0.8767574414186193, + "grad_norm": 3.643573045730591, + "learning_rate": 9.576454282848645e-06, + "loss": 1.2338, + "step": 3461 + }, + { + "epoch": 0.8770107663077897, + "grad_norm": 3.4442660808563232, + "learning_rate": 9.576116739060585e-06, + "loss": 1.0719, + "step": 3462 + }, + { + "epoch": 0.8772640911969601, + "grad_norm": 3.5599753856658936, + "learning_rate": 9.575779066777316e-06, + "loss": 1.1835, + "step": 3463 + }, + { + "epoch": 0.8775174160861304, + "grad_norm": 3.6639044284820557, + "learning_rate": 9.575441266008312e-06, + "loss": 1.2526, + "step": 3464 + }, + { + "epoch": 0.8777707409753008, + "grad_norm": 3.6009585857391357, + "learning_rate": 9.575103336763063e-06, + "loss": 1.102, + "step": 3465 + }, + { + "epoch": 0.8780240658644712, + "grad_norm": 3.8054094314575195, + "learning_rate": 9.574765279051055e-06, + "loss": 1.1178, + "step": 3466 + }, + { + "epoch": 0.8782773907536415, + "grad_norm": 3.4370920658111572, + "learning_rate": 9.574427092881784e-06, + "loss": 1.0412, + "step": 3467 + }, + { + "epoch": 0.8785307156428119, + "grad_norm": 3.3152363300323486, + "learning_rate": 9.574088778264744e-06, + "loss": 1.1768, + "step": 3468 + }, + { + "epoch": 0.8787840405319822, + "grad_norm": 3.4283127784729004, + "learning_rate": 9.573750335209433e-06, + "loss": 1.1101, + "step": 3469 + }, + { + "epoch": 0.8790373654211526, + "grad_norm": 3.6179544925689697, + "learning_rate": 9.573411763725358e-06, + "loss": 1.2943, + "step": 3470 + }, + { + "epoch": 0.879290690310323, + "grad_norm": 3.568488359451294, + "learning_rate": 9.573073063822023e-06, + "loss": 1.2498, + "step": 3471 + }, + { + "epoch": 0.8795440151994933, + "grad_norm": 3.5222697257995605, + "learning_rate": 9.572734235508941e-06, + "loss": 1.0618, + "step": 3472 + }, + { + "epoch": 0.8797973400886637, + "grad_norm": 3.1682956218719482, + "learning_rate": 9.572395278795622e-06, + "loss": 1.0945, + "step": 3473 + }, + { + "epoch": 0.880050664977834, + "grad_norm": 3.5997045040130615, + "learning_rate": 9.57205619369159e-06, + "loss": 1.1523, + "step": 3474 + }, + { + "epoch": 0.8803039898670044, + "grad_norm": 3.4013473987579346, + "learning_rate": 9.57171698020636e-06, + "loss": 1.0699, + "step": 3475 + }, + { + "epoch": 0.8805573147561748, + "grad_norm": 3.2505035400390625, + "learning_rate": 9.571377638349462e-06, + "loss": 1.0694, + "step": 3476 + }, + { + "epoch": 0.8808106396453451, + "grad_norm": 3.628488779067993, + "learning_rate": 9.571038168130422e-06, + "loss": 1.1776, + "step": 3477 + }, + { + "epoch": 0.8810639645345155, + "grad_norm": 3.3734829425811768, + "learning_rate": 9.570698569558771e-06, + "loss": 1.102, + "step": 3478 + }, + { + "epoch": 0.8813172894236859, + "grad_norm": 6.371610641479492, + "learning_rate": 9.57035884264405e-06, + "loss": 1.1883, + "step": 3479 + }, + { + "epoch": 0.8815706143128562, + "grad_norm": 3.6515114307403564, + "learning_rate": 9.57001898739579e-06, + "loss": 1.1875, + "step": 3480 + }, + { + "epoch": 0.8818239392020266, + "grad_norm": 3.4682676792144775, + "learning_rate": 9.569679003823542e-06, + "loss": 1.1922, + "step": 3481 + }, + { + "epoch": 0.8820772640911969, + "grad_norm": 3.526710033416748, + "learning_rate": 9.56933889193685e-06, + "loss": 1.202, + "step": 3482 + }, + { + "epoch": 0.8823305889803673, + "grad_norm": 3.291552782058716, + "learning_rate": 9.56899865174526e-06, + "loss": 1.0547, + "step": 3483 + }, + { + "epoch": 0.8825839138695377, + "grad_norm": 3.222761631011963, + "learning_rate": 9.568658283258331e-06, + "loss": 1.1274, + "step": 3484 + }, + { + "epoch": 0.882837238758708, + "grad_norm": 3.328718900680542, + "learning_rate": 9.568317786485619e-06, + "loss": 1.2029, + "step": 3485 + }, + { + "epoch": 0.8830905636478784, + "grad_norm": 3.3436994552612305, + "learning_rate": 9.567977161436685e-06, + "loss": 1.2401, + "step": 3486 + }, + { + "epoch": 0.8833438885370488, + "grad_norm": 3.443335771560669, + "learning_rate": 9.567636408121092e-06, + "loss": 1.1754, + "step": 3487 + }, + { + "epoch": 0.8835972134262191, + "grad_norm": 3.813403606414795, + "learning_rate": 9.56729552654841e-06, + "loss": 1.2644, + "step": 3488 + }, + { + "epoch": 0.8838505383153895, + "grad_norm": 3.697558879852295, + "learning_rate": 9.56695451672821e-06, + "loss": 1.1746, + "step": 3489 + }, + { + "epoch": 0.8841038632045598, + "grad_norm": 3.2442736625671387, + "learning_rate": 9.566613378670068e-06, + "loss": 1.1262, + "step": 3490 + }, + { + "epoch": 0.8843571880937302, + "grad_norm": 3.6892635822296143, + "learning_rate": 9.566272112383563e-06, + "loss": 1.1524, + "step": 3491 + }, + { + "epoch": 0.8846105129829006, + "grad_norm": 3.149174928665161, + "learning_rate": 9.565930717878276e-06, + "loss": 1.0615, + "step": 3492 + }, + { + "epoch": 0.8848638378720709, + "grad_norm": 3.460679531097412, + "learning_rate": 9.565589195163796e-06, + "loss": 1.2664, + "step": 3493 + }, + { + "epoch": 0.8851171627612413, + "grad_norm": 3.4815971851348877, + "learning_rate": 9.565247544249709e-06, + "loss": 1.2627, + "step": 3494 + }, + { + "epoch": 0.8853704876504117, + "grad_norm": 3.1610448360443115, + "learning_rate": 9.564905765145611e-06, + "loss": 1.0562, + "step": 3495 + }, + { + "epoch": 0.885623812539582, + "grad_norm": 3.1781082153320312, + "learning_rate": 9.5645638578611e-06, + "loss": 1.1922, + "step": 3496 + }, + { + "epoch": 0.8858771374287524, + "grad_norm": 3.4381654262542725, + "learning_rate": 9.564221822405774e-06, + "loss": 1.1594, + "step": 3497 + }, + { + "epoch": 0.8861304623179227, + "grad_norm": 3.464570999145508, + "learning_rate": 9.563879658789239e-06, + "loss": 1.1795, + "step": 3498 + }, + { + "epoch": 0.8863837872070931, + "grad_norm": 3.122692823410034, + "learning_rate": 9.563537367021103e-06, + "loss": 1.1004, + "step": 3499 + }, + { + "epoch": 0.8866371120962635, + "grad_norm": 3.4979028701782227, + "learning_rate": 9.563194947110975e-06, + "loss": 1.2198, + "step": 3500 + }, + { + "epoch": 0.8866371120962635, + "eval_loss": 1.1842656135559082, + "eval_runtime": 12.3015, + "eval_samples_per_second": 32.516, + "eval_steps_per_second": 4.065, + "step": 3500 + }, + { + "epoch": 0.8868904369854338, + "grad_norm": 3.350447416305542, + "learning_rate": 9.562852399068472e-06, + "loss": 1.1479, + "step": 3501 + }, + { + "epoch": 0.8871437618746042, + "grad_norm": 3.1752302646636963, + "learning_rate": 9.562509722903213e-06, + "loss": 1.1348, + "step": 3502 + }, + { + "epoch": 0.8873970867637745, + "grad_norm": 3.252607583999634, + "learning_rate": 9.562166918624817e-06, + "loss": 1.1539, + "step": 3503 + }, + { + "epoch": 0.8876504116529449, + "grad_norm": 3.5568649768829346, + "learning_rate": 9.561823986242916e-06, + "loss": 1.1659, + "step": 3504 + }, + { + "epoch": 0.8879037365421153, + "grad_norm": 3.367532968521118, + "learning_rate": 9.561480925767133e-06, + "loss": 1.2785, + "step": 3505 + }, + { + "epoch": 0.8881570614312856, + "grad_norm": 3.3485682010650635, + "learning_rate": 9.561137737207103e-06, + "loss": 1.2095, + "step": 3506 + }, + { + "epoch": 0.888410386320456, + "grad_norm": 3.486233949661255, + "learning_rate": 9.560794420572464e-06, + "loss": 1.2842, + "step": 3507 + }, + { + "epoch": 0.8886637112096264, + "grad_norm": 3.257869243621826, + "learning_rate": 9.560450975872855e-06, + "loss": 1.0405, + "step": 3508 + }, + { + "epoch": 0.8889170360987967, + "grad_norm": 3.318464756011963, + "learning_rate": 9.56010740311792e-06, + "loss": 1.1972, + "step": 3509 + }, + { + "epoch": 0.8891703609879671, + "grad_norm": 3.7074925899505615, + "learning_rate": 9.559763702317306e-06, + "loss": 1.2003, + "step": 3510 + }, + { + "epoch": 0.8894236858771374, + "grad_norm": 3.423823356628418, + "learning_rate": 9.559419873480664e-06, + "loss": 1.1482, + "step": 3511 + }, + { + "epoch": 0.8896770107663078, + "grad_norm": 3.256509304046631, + "learning_rate": 9.559075916617649e-06, + "loss": 1.0957, + "step": 3512 + }, + { + "epoch": 0.8899303356554782, + "grad_norm": 3.308105707168579, + "learning_rate": 9.55873183173792e-06, + "loss": 1.0717, + "step": 3513 + }, + { + "epoch": 0.8901836605446485, + "grad_norm": 3.4461209774017334, + "learning_rate": 9.558387618851137e-06, + "loss": 1.1767, + "step": 3514 + }, + { + "epoch": 0.8904369854338189, + "grad_norm": 3.324324131011963, + "learning_rate": 9.558043277966967e-06, + "loss": 1.0425, + "step": 3515 + }, + { + "epoch": 0.8906903103229893, + "grad_norm": 3.5780386924743652, + "learning_rate": 9.557698809095076e-06, + "loss": 1.2086, + "step": 3516 + }, + { + "epoch": 0.8909436352121596, + "grad_norm": 3.3247432708740234, + "learning_rate": 9.55735421224514e-06, + "loss": 1.1104, + "step": 3517 + }, + { + "epoch": 0.89119696010133, + "grad_norm": 3.3876378536224365, + "learning_rate": 9.557009487426834e-06, + "loss": 1.2138, + "step": 3518 + }, + { + "epoch": 0.8914502849905003, + "grad_norm": 3.313143730163574, + "learning_rate": 9.556664634649837e-06, + "loss": 1.1164, + "step": 3519 + }, + { + "epoch": 0.8917036098796707, + "grad_norm": 3.8288326263427734, + "learning_rate": 9.55631965392383e-06, + "loss": 1.2153, + "step": 3520 + }, + { + "epoch": 0.8919569347688411, + "grad_norm": 3.245234966278076, + "learning_rate": 9.555974545258507e-06, + "loss": 1.1173, + "step": 3521 + }, + { + "epoch": 0.8922102596580114, + "grad_norm": 3.3028581142425537, + "learning_rate": 9.555629308663553e-06, + "loss": 1.2273, + "step": 3522 + }, + { + "epoch": 0.8924635845471818, + "grad_norm": 3.3003835678100586, + "learning_rate": 9.555283944148661e-06, + "loss": 1.1889, + "step": 3523 + }, + { + "epoch": 0.8927169094363521, + "grad_norm": 3.2953989505767822, + "learning_rate": 9.554938451723533e-06, + "loss": 1.0949, + "step": 3524 + }, + { + "epoch": 0.8929702343255225, + "grad_norm": 3.2672576904296875, + "learning_rate": 9.554592831397866e-06, + "loss": 1.1747, + "step": 3525 + }, + { + "epoch": 0.8932235592146929, + "grad_norm": 3.283121109008789, + "learning_rate": 9.554247083181369e-06, + "loss": 1.0598, + "step": 3526 + }, + { + "epoch": 0.8934768841038632, + "grad_norm": 3.359567880630493, + "learning_rate": 9.553901207083746e-06, + "loss": 1.1151, + "step": 3527 + }, + { + "epoch": 0.8937302089930336, + "grad_norm": 3.2975292205810547, + "learning_rate": 9.553555203114713e-06, + "loss": 1.0455, + "step": 3528 + }, + { + "epoch": 0.893983533882204, + "grad_norm": 3.4688210487365723, + "learning_rate": 9.553209071283984e-06, + "loss": 1.1231, + "step": 3529 + }, + { + "epoch": 0.8942368587713743, + "grad_norm": 3.655614137649536, + "learning_rate": 9.552862811601279e-06, + "loss": 1.2476, + "step": 3530 + }, + { + "epoch": 0.8944901836605447, + "grad_norm": 3.1691958904266357, + "learning_rate": 9.552516424076318e-06, + "loss": 1.1201, + "step": 3531 + }, + { + "epoch": 0.894743508549715, + "grad_norm": 3.238905191421509, + "learning_rate": 9.552169908718831e-06, + "loss": 1.1532, + "step": 3532 + }, + { + "epoch": 0.8949968334388854, + "grad_norm": 3.3464627265930176, + "learning_rate": 9.551823265538546e-06, + "loss": 1.1665, + "step": 3533 + }, + { + "epoch": 0.8952501583280558, + "grad_norm": 3.670006036758423, + "learning_rate": 9.551476494545198e-06, + "loss": 1.251, + "step": 3534 + }, + { + "epoch": 0.8955034832172261, + "grad_norm": 3.0911617279052734, + "learning_rate": 9.551129595748521e-06, + "loss": 1.1598, + "step": 3535 + }, + { + "epoch": 0.8957568081063965, + "grad_norm": 3.6140267848968506, + "learning_rate": 9.55078256915826e-06, + "loss": 1.1254, + "step": 3536 + }, + { + "epoch": 0.8960101329955669, + "grad_norm": 4.0415778160095215, + "learning_rate": 9.550435414784157e-06, + "loss": 1.4153, + "step": 3537 + }, + { + "epoch": 0.8962634578847372, + "grad_norm": 3.4572088718414307, + "learning_rate": 9.55008813263596e-06, + "loss": 1.1791, + "step": 3538 + }, + { + "epoch": 0.8965167827739076, + "grad_norm": 3.47280216217041, + "learning_rate": 9.549740722723419e-06, + "loss": 1.1283, + "step": 3539 + }, + { + "epoch": 0.8967701076630779, + "grad_norm": 3.799362897872925, + "learning_rate": 9.549393185056292e-06, + "loss": 1.2423, + "step": 3540 + }, + { + "epoch": 0.8970234325522483, + "grad_norm": 3.3728370666503906, + "learning_rate": 9.549045519644338e-06, + "loss": 1.1087, + "step": 3541 + }, + { + "epoch": 0.8972767574414187, + "grad_norm": 3.5982282161712646, + "learning_rate": 9.548697726497319e-06, + "loss": 1.2767, + "step": 3542 + }, + { + "epoch": 0.897530082330589, + "grad_norm": 3.420250654220581, + "learning_rate": 9.548349805624997e-06, + "loss": 1.213, + "step": 3543 + }, + { + "epoch": 0.8977834072197594, + "grad_norm": 3.512740135192871, + "learning_rate": 9.548001757037143e-06, + "loss": 1.1563, + "step": 3544 + }, + { + "epoch": 0.8980367321089296, + "grad_norm": 3.475491523742676, + "learning_rate": 9.547653580743534e-06, + "loss": 1.3159, + "step": 3545 + }, + { + "epoch": 0.8982900569981, + "grad_norm": 3.273994207382202, + "learning_rate": 9.547305276753942e-06, + "loss": 1.114, + "step": 3546 + }, + { + "epoch": 0.8985433818872705, + "grad_norm": 3.3316612243652344, + "learning_rate": 9.546956845078151e-06, + "loss": 1.1047, + "step": 3547 + }, + { + "epoch": 0.8987967067764407, + "grad_norm": 3.2396793365478516, + "learning_rate": 9.54660828572594e-06, + "loss": 1.2443, + "step": 3548 + }, + { + "epoch": 0.8990500316656111, + "grad_norm": 3.4848010540008545, + "learning_rate": 9.546259598707102e-06, + "loss": 1.1434, + "step": 3549 + }, + { + "epoch": 0.8993033565547816, + "grad_norm": 3.5532639026641846, + "learning_rate": 9.545910784031425e-06, + "loss": 1.2154, + "step": 3550 + }, + { + "epoch": 0.8995566814439518, + "grad_norm": 3.2470998764038086, + "learning_rate": 9.545561841708702e-06, + "loss": 1.1101, + "step": 3551 + }, + { + "epoch": 0.8998100063331222, + "grad_norm": 3.5670552253723145, + "learning_rate": 9.545212771748734e-06, + "loss": 1.2471, + "step": 3552 + }, + { + "epoch": 0.9000633312222925, + "grad_norm": 3.370518207550049, + "learning_rate": 9.544863574161322e-06, + "loss": 1.1992, + "step": 3553 + }, + { + "epoch": 0.9003166561114629, + "grad_norm": 3.3459818363189697, + "learning_rate": 9.54451424895627e-06, + "loss": 1.1867, + "step": 3554 + }, + { + "epoch": 0.9005699810006333, + "grad_norm": 3.3378310203552246, + "learning_rate": 9.544164796143386e-06, + "loss": 1.1293, + "step": 3555 + }, + { + "epoch": 0.9008233058898036, + "grad_norm": 3.452312469482422, + "learning_rate": 9.543815215732488e-06, + "loss": 1.2222, + "step": 3556 + }, + { + "epoch": 0.901076630778974, + "grad_norm": 3.7270290851593018, + "learning_rate": 9.543465507733387e-06, + "loss": 1.0793, + "step": 3557 + }, + { + "epoch": 0.9013299556681444, + "grad_norm": 3.4278666973114014, + "learning_rate": 9.543115672155903e-06, + "loss": 1.1119, + "step": 3558 + }, + { + "epoch": 0.9015832805573147, + "grad_norm": 3.2745046615600586, + "learning_rate": 9.54276570900986e-06, + "loss": 1.1795, + "step": 3559 + }, + { + "epoch": 0.9018366054464851, + "grad_norm": 3.317507743835449, + "learning_rate": 9.542415618305088e-06, + "loss": 1.1609, + "step": 3560 + }, + { + "epoch": 0.9020899303356554, + "grad_norm": 3.190793991088867, + "learning_rate": 9.542065400051412e-06, + "loss": 1.0849, + "step": 3561 + }, + { + "epoch": 0.9023432552248258, + "grad_norm": 3.4192872047424316, + "learning_rate": 9.541715054258667e-06, + "loss": 1.2198, + "step": 3562 + }, + { + "epoch": 0.9025965801139962, + "grad_norm": 3.187180757522583, + "learning_rate": 9.541364580936694e-06, + "loss": 1.0867, + "step": 3563 + }, + { + "epoch": 0.9028499050031665, + "grad_norm": 3.841717481613159, + "learning_rate": 9.541013980095331e-06, + "loss": 1.1032, + "step": 3564 + }, + { + "epoch": 0.9031032298923369, + "grad_norm": 3.3154666423797607, + "learning_rate": 9.540663251744425e-06, + "loss": 1.2579, + "step": 3565 + }, + { + "epoch": 0.9033565547815073, + "grad_norm": 3.5662498474121094, + "learning_rate": 9.540312395893823e-06, + "loss": 1.1479, + "step": 3566 + }, + { + "epoch": 0.9036098796706776, + "grad_norm": 3.458418369293213, + "learning_rate": 9.539961412553375e-06, + "loss": 1.0921, + "step": 3567 + }, + { + "epoch": 0.903863204559848, + "grad_norm": 2.8619308471679688, + "learning_rate": 9.53961030173294e-06, + "loss": 1.0249, + "step": 3568 + }, + { + "epoch": 0.9041165294490183, + "grad_norm": 3.324341297149658, + "learning_rate": 9.539259063442375e-06, + "loss": 1.1141, + "step": 3569 + }, + { + "epoch": 0.9043698543381887, + "grad_norm": 3.535216808319092, + "learning_rate": 9.538907697691542e-06, + "loss": 1.3503, + "step": 3570 + }, + { + "epoch": 0.9046231792273591, + "grad_norm": 3.4408106803894043, + "learning_rate": 9.538556204490308e-06, + "loss": 1.1379, + "step": 3571 + }, + { + "epoch": 0.9048765041165294, + "grad_norm": 3.6078426837921143, + "learning_rate": 9.538204583848544e-06, + "loss": 1.3365, + "step": 3572 + }, + { + "epoch": 0.9051298290056998, + "grad_norm": 3.498480796813965, + "learning_rate": 9.537852835776123e-06, + "loss": 1.1158, + "step": 3573 + }, + { + "epoch": 0.9053831538948701, + "grad_norm": 3.3247604370117188, + "learning_rate": 9.537500960282919e-06, + "loss": 1.1844, + "step": 3574 + }, + { + "epoch": 0.9056364787840405, + "grad_norm": 3.5207560062408447, + "learning_rate": 9.537148957378816e-06, + "loss": 1.2253, + "step": 3575 + }, + { + "epoch": 0.9058898036732109, + "grad_norm": 3.20007586479187, + "learning_rate": 9.536796827073696e-06, + "loss": 1.089, + "step": 3576 + }, + { + "epoch": 0.9061431285623812, + "grad_norm": 3.0342764854431152, + "learning_rate": 9.536444569377447e-06, + "loss": 1.0194, + "step": 3577 + }, + { + "epoch": 0.9063964534515516, + "grad_norm": 3.244046926498413, + "learning_rate": 9.536092184299963e-06, + "loss": 1.04, + "step": 3578 + }, + { + "epoch": 0.906649778340722, + "grad_norm": 3.3093714714050293, + "learning_rate": 9.535739671851134e-06, + "loss": 1.0467, + "step": 3579 + }, + { + "epoch": 0.9069031032298923, + "grad_norm": 3.905623435974121, + "learning_rate": 9.53538703204086e-06, + "loss": 1.2277, + "step": 3580 + }, + { + "epoch": 0.9071564281190627, + "grad_norm": 3.680220127105713, + "learning_rate": 9.535034264879047e-06, + "loss": 1.2586, + "step": 3581 + }, + { + "epoch": 0.907409753008233, + "grad_norm": 3.434997797012329, + "learning_rate": 9.534681370375595e-06, + "loss": 1.0123, + "step": 3582 + }, + { + "epoch": 0.9076630778974034, + "grad_norm": 3.155743360519409, + "learning_rate": 9.534328348540417e-06, + "loss": 1.172, + "step": 3583 + }, + { + "epoch": 0.9079164027865738, + "grad_norm": 3.8581161499023438, + "learning_rate": 9.533975199383423e-06, + "loss": 1.2394, + "step": 3584 + }, + { + "epoch": 0.9081697276757441, + "grad_norm": 3.4210739135742188, + "learning_rate": 9.533621922914532e-06, + "loss": 1.1136, + "step": 3585 + }, + { + "epoch": 0.9084230525649145, + "grad_norm": 3.2781097888946533, + "learning_rate": 9.53326851914366e-06, + "loss": 1.1038, + "step": 3586 + }, + { + "epoch": 0.9086763774540849, + "grad_norm": 3.327735662460327, + "learning_rate": 9.532914988080734e-06, + "loss": 1.0744, + "step": 3587 + }, + { + "epoch": 0.9089297023432552, + "grad_norm": 3.6758453845977783, + "learning_rate": 9.532561329735678e-06, + "loss": 1.1279, + "step": 3588 + }, + { + "epoch": 0.9091830272324256, + "grad_norm": 3.8179948329925537, + "learning_rate": 9.532207544118426e-06, + "loss": 1.1381, + "step": 3589 + }, + { + "epoch": 0.9094363521215959, + "grad_norm": 3.188765287399292, + "learning_rate": 9.53185363123891e-06, + "loss": 1.1702, + "step": 3590 + }, + { + "epoch": 0.9096896770107663, + "grad_norm": 4.352060317993164, + "learning_rate": 9.531499591107068e-06, + "loss": 1.5593, + "step": 3591 + }, + { + "epoch": 0.9099430018999367, + "grad_norm": 3.6396377086639404, + "learning_rate": 9.531145423732839e-06, + "loss": 1.221, + "step": 3592 + }, + { + "epoch": 0.910196326789107, + "grad_norm": 3.6435651779174805, + "learning_rate": 9.530791129126174e-06, + "loss": 1.265, + "step": 3593 + }, + { + "epoch": 0.9104496516782774, + "grad_norm": 3.252295970916748, + "learning_rate": 9.530436707297015e-06, + "loss": 1.0908, + "step": 3594 + }, + { + "epoch": 0.9107029765674477, + "grad_norm": 3.793752431869507, + "learning_rate": 9.530082158255317e-06, + "loss": 1.2533, + "step": 3595 + }, + { + "epoch": 0.9109563014566181, + "grad_norm": 3.2656681537628174, + "learning_rate": 9.529727482011036e-06, + "loss": 1.204, + "step": 3596 + }, + { + "epoch": 0.9112096263457885, + "grad_norm": 3.5367040634155273, + "learning_rate": 9.529372678574129e-06, + "loss": 1.125, + "step": 3597 + }, + { + "epoch": 0.9114629512349588, + "grad_norm": 3.358474016189575, + "learning_rate": 9.529017747954561e-06, + "loss": 1.0224, + "step": 3598 + }, + { + "epoch": 0.9117162761241292, + "grad_norm": 3.8379125595092773, + "learning_rate": 9.528662690162296e-06, + "loss": 1.2502, + "step": 3599 + }, + { + "epoch": 0.9119696010132996, + "grad_norm": 3.674384355545044, + "learning_rate": 9.528307505207307e-06, + "loss": 1.294, + "step": 3600 + }, + { + "epoch": 0.9122229259024699, + "grad_norm": 3.563316822052002, + "learning_rate": 9.527952193099564e-06, + "loss": 1.3051, + "step": 3601 + }, + { + "epoch": 0.9124762507916403, + "grad_norm": 3.5329408645629883, + "learning_rate": 9.527596753849046e-06, + "loss": 1.232, + "step": 3602 + }, + { + "epoch": 0.9127295756808106, + "grad_norm": 3.2041163444519043, + "learning_rate": 9.527241187465735e-06, + "loss": 0.9945, + "step": 3603 + }, + { + "epoch": 0.912982900569981, + "grad_norm": 3.312466859817505, + "learning_rate": 9.52688549395961e-06, + "loss": 1.1157, + "step": 3604 + }, + { + "epoch": 0.9132362254591514, + "grad_norm": 2.954261302947998, + "learning_rate": 9.526529673340665e-06, + "loss": 1.0741, + "step": 3605 + }, + { + "epoch": 0.9134895503483217, + "grad_norm": 3.649989604949951, + "learning_rate": 9.526173725618885e-06, + "loss": 1.1959, + "step": 3606 + }, + { + "epoch": 0.9137428752374921, + "grad_norm": 3.1549792289733887, + "learning_rate": 9.525817650804271e-06, + "loss": 1.1578, + "step": 3607 + }, + { + "epoch": 0.9139962001266625, + "grad_norm": 3.561000347137451, + "learning_rate": 9.525461448906817e-06, + "loss": 1.1482, + "step": 3608 + }, + { + "epoch": 0.9142495250158328, + "grad_norm": 3.3429903984069824, + "learning_rate": 9.525105119936528e-06, + "loss": 1.1744, + "step": 3609 + }, + { + "epoch": 0.9145028499050032, + "grad_norm": 3.5031373500823975, + "learning_rate": 9.524748663903408e-06, + "loss": 1.0668, + "step": 3610 + }, + { + "epoch": 0.9147561747941735, + "grad_norm": 3.495927572250366, + "learning_rate": 9.524392080817465e-06, + "loss": 1.283, + "step": 3611 + }, + { + "epoch": 0.9150094996833439, + "grad_norm": 3.4305431842803955, + "learning_rate": 9.52403537068871e-06, + "loss": 1.1616, + "step": 3612 + }, + { + "epoch": 0.9152628245725143, + "grad_norm": 3.8657901287078857, + "learning_rate": 9.523678533527166e-06, + "loss": 1.3108, + "step": 3613 + }, + { + "epoch": 0.9155161494616846, + "grad_norm": 3.1496613025665283, + "learning_rate": 9.52332156934285e-06, + "loss": 1.0218, + "step": 3614 + }, + { + "epoch": 0.915769474350855, + "grad_norm": 3.193406343460083, + "learning_rate": 9.52296447814578e-06, + "loss": 1.0279, + "step": 3615 + }, + { + "epoch": 0.9160227992400254, + "grad_norm": 3.3336079120635986, + "learning_rate": 9.52260725994599e-06, + "loss": 1.1338, + "step": 3616 + }, + { + "epoch": 0.9162761241291957, + "grad_norm": 3.353907823562622, + "learning_rate": 9.522249914753507e-06, + "loss": 1.0635, + "step": 3617 + }, + { + "epoch": 0.9165294490183661, + "grad_norm": 3.2871077060699463, + "learning_rate": 9.521892442578368e-06, + "loss": 1.1136, + "step": 3618 + }, + { + "epoch": 0.9167827739075364, + "grad_norm": 3.2008469104766846, + "learning_rate": 9.521534843430606e-06, + "loss": 1.1852, + "step": 3619 + }, + { + "epoch": 0.9170360987967068, + "grad_norm": 4.270013809204102, + "learning_rate": 9.521177117320267e-06, + "loss": 1.301, + "step": 3620 + }, + { + "epoch": 0.9172894236858772, + "grad_norm": 3.3976364135742188, + "learning_rate": 9.52081926425739e-06, + "loss": 1.1561, + "step": 3621 + }, + { + "epoch": 0.9175427485750475, + "grad_norm": 3.1098673343658447, + "learning_rate": 9.520461284252029e-06, + "loss": 1.1531, + "step": 3622 + }, + { + "epoch": 0.9177960734642179, + "grad_norm": 3.628095865249634, + "learning_rate": 9.520103177314235e-06, + "loss": 1.1812, + "step": 3623 + }, + { + "epoch": 0.9180493983533882, + "grad_norm": 3.203521251678467, + "learning_rate": 9.519744943454062e-06, + "loss": 1.016, + "step": 3624 + }, + { + "epoch": 0.9183027232425586, + "grad_norm": 3.242738962173462, + "learning_rate": 9.519386582681566e-06, + "loss": 1.2456, + "step": 3625 + }, + { + "epoch": 0.918556048131729, + "grad_norm": 3.5318028926849365, + "learning_rate": 9.519028095006817e-06, + "loss": 1.104, + "step": 3626 + }, + { + "epoch": 0.9188093730208993, + "grad_norm": 3.5023603439331055, + "learning_rate": 9.518669480439875e-06, + "loss": 1.1201, + "step": 3627 + }, + { + "epoch": 0.9190626979100697, + "grad_norm": 3.1643176078796387, + "learning_rate": 9.51831073899081e-06, + "loss": 1.1703, + "step": 3628 + }, + { + "epoch": 0.9193160227992401, + "grad_norm": 3.1051294803619385, + "learning_rate": 9.5179518706697e-06, + "loss": 1.1061, + "step": 3629 + }, + { + "epoch": 0.9195693476884104, + "grad_norm": 3.1064279079437256, + "learning_rate": 9.517592875486619e-06, + "loss": 1.0738, + "step": 3630 + }, + { + "epoch": 0.9198226725775808, + "grad_norm": 3.454463005065918, + "learning_rate": 9.517233753451646e-06, + "loss": 1.1865, + "step": 3631 + }, + { + "epoch": 0.9200759974667511, + "grad_norm": 3.2854862213134766, + "learning_rate": 9.516874504574866e-06, + "loss": 1.0756, + "step": 3632 + }, + { + "epoch": 0.9203293223559215, + "grad_norm": 3.3770976066589355, + "learning_rate": 9.516515128866368e-06, + "loss": 1.1921, + "step": 3633 + }, + { + "epoch": 0.9205826472450919, + "grad_norm": 3.346442937850952, + "learning_rate": 9.51615562633624e-06, + "loss": 1.2264, + "step": 3634 + }, + { + "epoch": 0.9208359721342622, + "grad_norm": 3.339327096939087, + "learning_rate": 9.515795996994582e-06, + "loss": 1.2777, + "step": 3635 + }, + { + "epoch": 0.9210892970234326, + "grad_norm": 3.4643490314483643, + "learning_rate": 9.515436240851486e-06, + "loss": 0.9849, + "step": 3636 + }, + { + "epoch": 0.921342621912603, + "grad_norm": 3.3188140392303467, + "learning_rate": 9.515076357917059e-06, + "loss": 1.1642, + "step": 3637 + }, + { + "epoch": 0.9215959468017733, + "grad_norm": 3.399202585220337, + "learning_rate": 9.514716348201403e-06, + "loss": 1.1187, + "step": 3638 + }, + { + "epoch": 0.9218492716909437, + "grad_norm": 3.731959581375122, + "learning_rate": 9.514356211714626e-06, + "loss": 1.1744, + "step": 3639 + }, + { + "epoch": 0.922102596580114, + "grad_norm": 3.2492988109588623, + "learning_rate": 9.513995948466844e-06, + "loss": 1.1625, + "step": 3640 + }, + { + "epoch": 0.9223559214692844, + "grad_norm": 3.396430492401123, + "learning_rate": 9.513635558468172e-06, + "loss": 1.2304, + "step": 3641 + }, + { + "epoch": 0.9226092463584548, + "grad_norm": 3.5798380374908447, + "learning_rate": 9.513275041728728e-06, + "loss": 1.1121, + "step": 3642 + }, + { + "epoch": 0.922862571247625, + "grad_norm": 3.0054409503936768, + "learning_rate": 9.512914398258637e-06, + "loss": 1.0951, + "step": 3643 + }, + { + "epoch": 0.9231158961367955, + "grad_norm": 3.5808658599853516, + "learning_rate": 9.512553628068024e-06, + "loss": 1.2808, + "step": 3644 + }, + { + "epoch": 0.9233692210259657, + "grad_norm": 3.384359359741211, + "learning_rate": 9.51219273116702e-06, + "loss": 1.1461, + "step": 3645 + }, + { + "epoch": 0.9236225459151362, + "grad_norm": 3.1516263484954834, + "learning_rate": 9.511831707565758e-06, + "loss": 1.0265, + "step": 3646 + }, + { + "epoch": 0.9238758708043066, + "grad_norm": 3.3705384731292725, + "learning_rate": 9.511470557274379e-06, + "loss": 1.1197, + "step": 3647 + }, + { + "epoch": 0.9241291956934768, + "grad_norm": 3.4362244606018066, + "learning_rate": 9.511109280303018e-06, + "loss": 1.0621, + "step": 3648 + }, + { + "epoch": 0.9243825205826472, + "grad_norm": 3.3836193084716797, + "learning_rate": 9.510747876661825e-06, + "loss": 1.2772, + "step": 3649 + }, + { + "epoch": 0.9246358454718177, + "grad_norm": 3.540104627609253, + "learning_rate": 9.510386346360945e-06, + "loss": 1.2575, + "step": 3650 + }, + { + "epoch": 0.9248891703609879, + "grad_norm": 3.4364829063415527, + "learning_rate": 9.51002468941053e-06, + "loss": 1.105, + "step": 3651 + }, + { + "epoch": 0.9251424952501583, + "grad_norm": 3.6412746906280518, + "learning_rate": 9.509662905820736e-06, + "loss": 1.2104, + "step": 3652 + }, + { + "epoch": 0.9253958201393286, + "grad_norm": 3.5720605850219727, + "learning_rate": 9.50930099560172e-06, + "loss": 1.2214, + "step": 3653 + }, + { + "epoch": 0.925649145028499, + "grad_norm": 3.4222095012664795, + "learning_rate": 9.508938958763647e-06, + "loss": 1.1668, + "step": 3654 + }, + { + "epoch": 0.9259024699176694, + "grad_norm": 3.6412322521209717, + "learning_rate": 9.50857679531668e-06, + "loss": 1.2624, + "step": 3655 + }, + { + "epoch": 0.9261557948068397, + "grad_norm": 3.187004566192627, + "learning_rate": 9.508214505270989e-06, + "loss": 1.1644, + "step": 3656 + }, + { + "epoch": 0.9264091196960101, + "grad_norm": 3.4312942028045654, + "learning_rate": 9.50785208863675e-06, + "loss": 1.1829, + "step": 3657 + }, + { + "epoch": 0.9266624445851805, + "grad_norm": 3.1643357276916504, + "learning_rate": 9.507489545424135e-06, + "loss": 1.1241, + "step": 3658 + }, + { + "epoch": 0.9269157694743508, + "grad_norm": 3.2400124073028564, + "learning_rate": 9.507126875643328e-06, + "loss": 1.2106, + "step": 3659 + }, + { + "epoch": 0.9271690943635212, + "grad_norm": 3.43381667137146, + "learning_rate": 9.506764079304508e-06, + "loss": 1.2335, + "step": 3660 + }, + { + "epoch": 0.9274224192526915, + "grad_norm": 3.0889716148376465, + "learning_rate": 9.506401156417868e-06, + "loss": 1.0938, + "step": 3661 + }, + { + "epoch": 0.9276757441418619, + "grad_norm": 3.585270404815674, + "learning_rate": 9.506038106993594e-06, + "loss": 1.1914, + "step": 3662 + }, + { + "epoch": 0.9279290690310323, + "grad_norm": 3.6273610591888428, + "learning_rate": 9.505674931041882e-06, + "loss": 1.184, + "step": 3663 + }, + { + "epoch": 0.9281823939202026, + "grad_norm": 3.4800095558166504, + "learning_rate": 9.50531162857293e-06, + "loss": 1.1435, + "step": 3664 + }, + { + "epoch": 0.928435718809373, + "grad_norm": 3.5298163890838623, + "learning_rate": 9.504948199596938e-06, + "loss": 1.2875, + "step": 3665 + }, + { + "epoch": 0.9286890436985434, + "grad_norm": 3.3472063541412354, + "learning_rate": 9.504584644124113e-06, + "loss": 1.0663, + "step": 3666 + }, + { + "epoch": 0.9289423685877137, + "grad_norm": 3.5220470428466797, + "learning_rate": 9.504220962164662e-06, + "loss": 1.1808, + "step": 3667 + }, + { + "epoch": 0.9291956934768841, + "grad_norm": 3.6811187267303467, + "learning_rate": 9.503857153728797e-06, + "loss": 1.2056, + "step": 3668 + }, + { + "epoch": 0.9294490183660544, + "grad_norm": 3.4177627563476562, + "learning_rate": 9.503493218826734e-06, + "loss": 1.1911, + "step": 3669 + }, + { + "epoch": 0.9297023432552248, + "grad_norm": 3.5350520610809326, + "learning_rate": 9.503129157468693e-06, + "loss": 1.2154, + "step": 3670 + }, + { + "epoch": 0.9299556681443952, + "grad_norm": 3.4798848628997803, + "learning_rate": 9.502764969664895e-06, + "loss": 1.3087, + "step": 3671 + }, + { + "epoch": 0.9302089930335655, + "grad_norm": 3.2628650665283203, + "learning_rate": 9.502400655425567e-06, + "loss": 1.1072, + "step": 3672 + }, + { + "epoch": 0.9304623179227359, + "grad_norm": 3.29524827003479, + "learning_rate": 9.50203621476094e-06, + "loss": 1.1156, + "step": 3673 + }, + { + "epoch": 0.9307156428119062, + "grad_norm": 3.172996997833252, + "learning_rate": 9.501671647681246e-06, + "loss": 1.0985, + "step": 3674 + }, + { + "epoch": 0.9309689677010766, + "grad_norm": 3.3947994709014893, + "learning_rate": 9.50130695419672e-06, + "loss": 1.1692, + "step": 3675 + }, + { + "epoch": 0.931222292590247, + "grad_norm": 3.2290871143341064, + "learning_rate": 9.500942134317605e-06, + "loss": 1.1601, + "step": 3676 + }, + { + "epoch": 0.9314756174794173, + "grad_norm": 3.762852907180786, + "learning_rate": 9.500577188054146e-06, + "loss": 1.3201, + "step": 3677 + }, + { + "epoch": 0.9317289423685877, + "grad_norm": 3.3253703117370605, + "learning_rate": 9.500212115416588e-06, + "loss": 1.287, + "step": 3678 + }, + { + "epoch": 0.9319822672577581, + "grad_norm": 3.639549970626831, + "learning_rate": 9.49984691641518e-06, + "loss": 1.2524, + "step": 3679 + }, + { + "epoch": 0.9322355921469284, + "grad_norm": 3.2327334880828857, + "learning_rate": 9.499481591060183e-06, + "loss": 1.1269, + "step": 3680 + }, + { + "epoch": 0.9324889170360988, + "grad_norm": 3.175328493118286, + "learning_rate": 9.499116139361852e-06, + "loss": 1.0656, + "step": 3681 + }, + { + "epoch": 0.9327422419252691, + "grad_norm": 3.2034428119659424, + "learning_rate": 9.498750561330448e-06, + "loss": 1.0962, + "step": 3682 + }, + { + "epoch": 0.9329955668144395, + "grad_norm": 3.437746286392212, + "learning_rate": 9.498384856976235e-06, + "loss": 1.1874, + "step": 3683 + }, + { + "epoch": 0.9332488917036099, + "grad_norm": 3.2602434158325195, + "learning_rate": 9.498019026309484e-06, + "loss": 1.1412, + "step": 3684 + }, + { + "epoch": 0.9335022165927802, + "grad_norm": 3.566904067993164, + "learning_rate": 9.497653069340467e-06, + "loss": 1.2866, + "step": 3685 + }, + { + "epoch": 0.9337555414819506, + "grad_norm": 3.1169989109039307, + "learning_rate": 9.497286986079462e-06, + "loss": 1.1, + "step": 3686 + }, + { + "epoch": 0.934008866371121, + "grad_norm": 3.468841314315796, + "learning_rate": 9.496920776536745e-06, + "loss": 1.0942, + "step": 3687 + }, + { + "epoch": 0.9342621912602913, + "grad_norm": 3.2861669063568115, + "learning_rate": 9.496554440722598e-06, + "loss": 1.1, + "step": 3688 + }, + { + "epoch": 0.9345155161494617, + "grad_norm": 3.364629030227661, + "learning_rate": 9.49618797864731e-06, + "loss": 1.0949, + "step": 3689 + }, + { + "epoch": 0.934768841038632, + "grad_norm": 3.7341485023498535, + "learning_rate": 9.495821390321173e-06, + "loss": 1.1573, + "step": 3690 + }, + { + "epoch": 0.9350221659278024, + "grad_norm": 3.2584400177001953, + "learning_rate": 9.49545467575448e-06, + "loss": 1.0675, + "step": 3691 + }, + { + "epoch": 0.9352754908169728, + "grad_norm": 3.204885244369507, + "learning_rate": 9.495087834957523e-06, + "loss": 1.1662, + "step": 3692 + }, + { + "epoch": 0.9355288157061431, + "grad_norm": 3.42419695854187, + "learning_rate": 9.494720867940608e-06, + "loss": 1.1485, + "step": 3693 + }, + { + "epoch": 0.9357821405953135, + "grad_norm": 3.1303205490112305, + "learning_rate": 9.494353774714036e-06, + "loss": 1.0381, + "step": 3694 + }, + { + "epoch": 0.9360354654844838, + "grad_norm": 3.7704291343688965, + "learning_rate": 9.493986555288118e-06, + "loss": 1.293, + "step": 3695 + }, + { + "epoch": 0.9362887903736542, + "grad_norm": 3.3077592849731445, + "learning_rate": 9.493619209673164e-06, + "loss": 1.0866, + "step": 3696 + }, + { + "epoch": 0.9365421152628246, + "grad_norm": 3.1638870239257812, + "learning_rate": 9.493251737879488e-06, + "loss": 1.1623, + "step": 3697 + }, + { + "epoch": 0.9367954401519949, + "grad_norm": 3.3037116527557373, + "learning_rate": 9.492884139917408e-06, + "loss": 1.0958, + "step": 3698 + }, + { + "epoch": 0.9370487650411653, + "grad_norm": 3.504441499710083, + "learning_rate": 9.492516415797249e-06, + "loss": 1.0941, + "step": 3699 + }, + { + "epoch": 0.9373020899303357, + "grad_norm": 3.4657788276672363, + "learning_rate": 9.492148565529333e-06, + "loss": 1.0397, + "step": 3700 + }, + { + "epoch": 0.937555414819506, + "grad_norm": 3.318652868270874, + "learning_rate": 9.491780589123991e-06, + "loss": 1.1455, + "step": 3701 + }, + { + "epoch": 0.9378087397086764, + "grad_norm": 3.411210775375366, + "learning_rate": 9.491412486591556e-06, + "loss": 1.1889, + "step": 3702 + }, + { + "epoch": 0.9380620645978467, + "grad_norm": 3.2670881748199463, + "learning_rate": 9.491044257942364e-06, + "loss": 1.0567, + "step": 3703 + }, + { + "epoch": 0.9383153894870171, + "grad_norm": 3.5867013931274414, + "learning_rate": 9.490675903186753e-06, + "loss": 1.2733, + "step": 3704 + }, + { + "epoch": 0.9385687143761875, + "grad_norm": 3.686847686767578, + "learning_rate": 9.490307422335068e-06, + "loss": 1.0941, + "step": 3705 + }, + { + "epoch": 0.9388220392653578, + "grad_norm": 3.306431293487549, + "learning_rate": 9.489938815397655e-06, + "loss": 1.2053, + "step": 3706 + }, + { + "epoch": 0.9390753641545282, + "grad_norm": 3.5809178352355957, + "learning_rate": 9.489570082384863e-06, + "loss": 1.1933, + "step": 3707 + }, + { + "epoch": 0.9393286890436986, + "grad_norm": 3.156501293182373, + "learning_rate": 9.489201223307048e-06, + "loss": 1.062, + "step": 3708 + }, + { + "epoch": 0.9395820139328689, + "grad_norm": 3.27620005607605, + "learning_rate": 9.488832238174568e-06, + "loss": 1.1342, + "step": 3709 + }, + { + "epoch": 0.9398353388220393, + "grad_norm": 3.3848562240600586, + "learning_rate": 9.48846312699778e-06, + "loss": 1.1495, + "step": 3710 + }, + { + "epoch": 0.9400886637112096, + "grad_norm": 3.2755961418151855, + "learning_rate": 9.488093889787053e-06, + "loss": 1.1533, + "step": 3711 + }, + { + "epoch": 0.94034198860038, + "grad_norm": 3.1494967937469482, + "learning_rate": 9.487724526552753e-06, + "loss": 1.1192, + "step": 3712 + }, + { + "epoch": 0.9405953134895504, + "grad_norm": 3.6119613647460938, + "learning_rate": 9.48735503730525e-06, + "loss": 1.1016, + "step": 3713 + }, + { + "epoch": 0.9408486383787207, + "grad_norm": 3.4080798625946045, + "learning_rate": 9.486985422054921e-06, + "loss": 1.2853, + "step": 3714 + }, + { + "epoch": 0.9411019632678911, + "grad_norm": 3.7177653312683105, + "learning_rate": 9.486615680812146e-06, + "loss": 1.1727, + "step": 3715 + }, + { + "epoch": 0.9413552881570614, + "grad_norm": 3.400573492050171, + "learning_rate": 9.486245813587305e-06, + "loss": 1.1045, + "step": 3716 + }, + { + "epoch": 0.9416086130462318, + "grad_norm": 3.753406286239624, + "learning_rate": 9.485875820390783e-06, + "loss": 1.33, + "step": 3717 + }, + { + "epoch": 0.9418619379354022, + "grad_norm": 3.4892420768737793, + "learning_rate": 9.48550570123297e-06, + "loss": 1.2081, + "step": 3718 + }, + { + "epoch": 0.9421152628245725, + "grad_norm": 3.635849952697754, + "learning_rate": 9.485135456124261e-06, + "loss": 1.128, + "step": 3719 + }, + { + "epoch": 0.9423685877137429, + "grad_norm": 3.3166277408599854, + "learning_rate": 9.48476508507505e-06, + "loss": 1.1779, + "step": 3720 + }, + { + "epoch": 0.9426219126029133, + "grad_norm": 3.4502623081207275, + "learning_rate": 9.484394588095738e-06, + "loss": 1.2057, + "step": 3721 + }, + { + "epoch": 0.9428752374920836, + "grad_norm": 3.455456256866455, + "learning_rate": 9.484023965196726e-06, + "loss": 1.2209, + "step": 3722 + }, + { + "epoch": 0.943128562381254, + "grad_norm": 3.287853717803955, + "learning_rate": 9.483653216388424e-06, + "loss": 1.1678, + "step": 3723 + }, + { + "epoch": 0.9433818872704243, + "grad_norm": 3.548009157180786, + "learning_rate": 9.48328234168124e-06, + "loss": 1.2503, + "step": 3724 + }, + { + "epoch": 0.9436352121595947, + "grad_norm": 3.4027111530303955, + "learning_rate": 9.48291134108559e-06, + "loss": 1.3112, + "step": 3725 + }, + { + "epoch": 0.9438885370487651, + "grad_norm": 3.3963234424591064, + "learning_rate": 9.482540214611888e-06, + "loss": 1.1574, + "step": 3726 + }, + { + "epoch": 0.9441418619379354, + "grad_norm": 3.3159518241882324, + "learning_rate": 9.482168962270561e-06, + "loss": 1.1946, + "step": 3727 + }, + { + "epoch": 0.9443951868271058, + "grad_norm": 3.2820098400115967, + "learning_rate": 9.48179758407203e-06, + "loss": 1.1389, + "step": 3728 + }, + { + "epoch": 0.9446485117162762, + "grad_norm": 3.4625706672668457, + "learning_rate": 9.48142608002672e-06, + "loss": 1.1033, + "step": 3729 + }, + { + "epoch": 0.9449018366054465, + "grad_norm": 3.100806951522827, + "learning_rate": 9.481054450145069e-06, + "loss": 1.0783, + "step": 3730 + }, + { + "epoch": 0.9451551614946169, + "grad_norm": 3.291769504547119, + "learning_rate": 9.480682694437508e-06, + "loss": 1.2077, + "step": 3731 + }, + { + "epoch": 0.9454084863837872, + "grad_norm": 3.1551342010498047, + "learning_rate": 9.480310812914477e-06, + "loss": 1.0391, + "step": 3732 + }, + { + "epoch": 0.9456618112729576, + "grad_norm": 3.392557382583618, + "learning_rate": 9.47993880558642e-06, + "loss": 1.2441, + "step": 3733 + }, + { + "epoch": 0.945915136162128, + "grad_norm": 3.3054678440093994, + "learning_rate": 9.47956667246378e-06, + "loss": 1.1569, + "step": 3734 + }, + { + "epoch": 0.9461684610512983, + "grad_norm": 3.2276017665863037, + "learning_rate": 9.479194413557007e-06, + "loss": 1.2043, + "step": 3735 + }, + { + "epoch": 0.9464217859404687, + "grad_norm": 3.59144926071167, + "learning_rate": 9.478822028876553e-06, + "loss": 1.2623, + "step": 3736 + }, + { + "epoch": 0.9466751108296391, + "grad_norm": 3.2537569999694824, + "learning_rate": 9.478449518432878e-06, + "loss": 1.1272, + "step": 3737 + }, + { + "epoch": 0.9469284357188094, + "grad_norm": 3.278841257095337, + "learning_rate": 9.478076882236437e-06, + "loss": 1.3562, + "step": 3738 + }, + { + "epoch": 0.9471817606079798, + "grad_norm": 3.6335675716400146, + "learning_rate": 9.477704120297698e-06, + "loss": 1.1144, + "step": 3739 + }, + { + "epoch": 0.94743508549715, + "grad_norm": 3.020799160003662, + "learning_rate": 9.477331232627124e-06, + "loss": 1.0823, + "step": 3740 + }, + { + "epoch": 0.9476884103863205, + "grad_norm": 3.571873664855957, + "learning_rate": 9.476958219235189e-06, + "loss": 1.1729, + "step": 3741 + }, + { + "epoch": 0.9479417352754909, + "grad_norm": 3.5291309356689453, + "learning_rate": 9.476585080132365e-06, + "loss": 1.1524, + "step": 3742 + }, + { + "epoch": 0.9481950601646612, + "grad_norm": 3.2429070472717285, + "learning_rate": 9.47621181532913e-06, + "loss": 1.2193, + "step": 3743 + }, + { + "epoch": 0.9484483850538316, + "grad_norm": 3.3645012378692627, + "learning_rate": 9.475838424835964e-06, + "loss": 1.1676, + "step": 3744 + }, + { + "epoch": 0.9487017099430018, + "grad_norm": 3.4241702556610107, + "learning_rate": 9.475464908663355e-06, + "loss": 1.0979, + "step": 3745 + }, + { + "epoch": 0.9489550348321723, + "grad_norm": 3.4654715061187744, + "learning_rate": 9.475091266821786e-06, + "loss": 1.1196, + "step": 3746 + }, + { + "epoch": 0.9492083597213427, + "grad_norm": 3.587653160095215, + "learning_rate": 9.474717499321754e-06, + "loss": 1.2102, + "step": 3747 + }, + { + "epoch": 0.949461684610513, + "grad_norm": 3.1175477504730225, + "learning_rate": 9.474343606173751e-06, + "loss": 1.0849, + "step": 3748 + }, + { + "epoch": 0.9497150094996833, + "grad_norm": 3.8891992568969727, + "learning_rate": 9.473969587388277e-06, + "loss": 1.2371, + "step": 3749 + }, + { + "epoch": 0.9499683343888538, + "grad_norm": 3.7875583171844482, + "learning_rate": 9.473595442975834e-06, + "loss": 1.2844, + "step": 3750 + }, + { + "epoch": 0.950221659278024, + "grad_norm": 3.4973838329315186, + "learning_rate": 9.473221172946926e-06, + "loss": 1.1984, + "step": 3751 + }, + { + "epoch": 0.9504749841671944, + "grad_norm": 3.3866705894470215, + "learning_rate": 9.472846777312065e-06, + "loss": 1.1878, + "step": 3752 + }, + { + "epoch": 0.9507283090563647, + "grad_norm": 3.5690484046936035, + "learning_rate": 9.472472256081765e-06, + "loss": 1.1684, + "step": 3753 + }, + { + "epoch": 0.9509816339455351, + "grad_norm": 3.505452871322632, + "learning_rate": 9.472097609266535e-06, + "loss": 1.1924, + "step": 3754 + }, + { + "epoch": 0.9512349588347055, + "grad_norm": 3.671903610229492, + "learning_rate": 9.471722836876905e-06, + "loss": 1.1182, + "step": 3755 + }, + { + "epoch": 0.9514882837238758, + "grad_norm": 3.6271207332611084, + "learning_rate": 9.471347938923392e-06, + "loss": 1.1628, + "step": 3756 + }, + { + "epoch": 0.9517416086130462, + "grad_norm": 3.468125104904175, + "learning_rate": 9.470972915416522e-06, + "loss": 1.0835, + "step": 3757 + }, + { + "epoch": 0.9519949335022166, + "grad_norm": 3.480942726135254, + "learning_rate": 9.47059776636683e-06, + "loss": 1.1908, + "step": 3758 + }, + { + "epoch": 0.9522482583913869, + "grad_norm": 3.083986759185791, + "learning_rate": 9.470222491784849e-06, + "loss": 1.0621, + "step": 3759 + }, + { + "epoch": 0.9525015832805573, + "grad_norm": 3.3976480960845947, + "learning_rate": 9.469847091681115e-06, + "loss": 1.0349, + "step": 3760 + }, + { + "epoch": 0.9527549081697276, + "grad_norm": 3.461347818374634, + "learning_rate": 9.46947156606617e-06, + "loss": 1.2073, + "step": 3761 + }, + { + "epoch": 0.953008233058898, + "grad_norm": 3.341705560684204, + "learning_rate": 9.469095914950556e-06, + "loss": 1.0939, + "step": 3762 + }, + { + "epoch": 0.9532615579480684, + "grad_norm": 3.6789677143096924, + "learning_rate": 9.468720138344825e-06, + "loss": 1.1137, + "step": 3763 + }, + { + "epoch": 0.9535148828372387, + "grad_norm": 3.5974793434143066, + "learning_rate": 9.468344236259528e-06, + "loss": 1.2334, + "step": 3764 + }, + { + "epoch": 0.9537682077264091, + "grad_norm": 3.2468185424804688, + "learning_rate": 9.467968208705217e-06, + "loss": 1.2708, + "step": 3765 + }, + { + "epoch": 0.9540215326155794, + "grad_norm": 3.4909257888793945, + "learning_rate": 9.467592055692454e-06, + "loss": 1.1995, + "step": 3766 + }, + { + "epoch": 0.9542748575047498, + "grad_norm": 3.160750150680542, + "learning_rate": 9.4672157772318e-06, + "loss": 0.9491, + "step": 3767 + }, + { + "epoch": 0.9545281823939202, + "grad_norm": 3.3668110370635986, + "learning_rate": 9.466839373333818e-06, + "loss": 1.1766, + "step": 3768 + }, + { + "epoch": 0.9547815072830905, + "grad_norm": 3.286818742752075, + "learning_rate": 9.466462844009083e-06, + "loss": 1.0569, + "step": 3769 + }, + { + "epoch": 0.9550348321722609, + "grad_norm": 3.1187360286712646, + "learning_rate": 9.466086189268163e-06, + "loss": 1.022, + "step": 3770 + }, + { + "epoch": 0.9552881570614313, + "grad_norm": 3.311356544494629, + "learning_rate": 9.465709409121638e-06, + "loss": 1.1193, + "step": 3771 + }, + { + "epoch": 0.9555414819506016, + "grad_norm": 3.3468382358551025, + "learning_rate": 9.465332503580083e-06, + "loss": 1.0767, + "step": 3772 + }, + { + "epoch": 0.955794806839772, + "grad_norm": 3.2264561653137207, + "learning_rate": 9.464955472654085e-06, + "loss": 1.2398, + "step": 3773 + }, + { + "epoch": 0.9560481317289423, + "grad_norm": 3.4525232315063477, + "learning_rate": 9.464578316354231e-06, + "loss": 1.289, + "step": 3774 + }, + { + "epoch": 0.9563014566181127, + "grad_norm": 3.4776265621185303, + "learning_rate": 9.46420103469111e-06, + "loss": 1.1543, + "step": 3775 + }, + { + "epoch": 0.9565547815072831, + "grad_norm": 3.537991762161255, + "learning_rate": 9.463823627675314e-06, + "loss": 1.1817, + "step": 3776 + }, + { + "epoch": 0.9568081063964534, + "grad_norm": 3.2594330310821533, + "learning_rate": 9.463446095317445e-06, + "loss": 1.107, + "step": 3777 + }, + { + "epoch": 0.9570614312856238, + "grad_norm": 3.2585158348083496, + "learning_rate": 9.463068437628102e-06, + "loss": 1.1603, + "step": 3778 + }, + { + "epoch": 0.9573147561747942, + "grad_norm": 3.54490327835083, + "learning_rate": 9.462690654617888e-06, + "loss": 1.1565, + "step": 3779 + }, + { + "epoch": 0.9575680810639645, + "grad_norm": 3.3803181648254395, + "learning_rate": 9.462312746297413e-06, + "loss": 1.082, + "step": 3780 + }, + { + "epoch": 0.9578214059531349, + "grad_norm": 3.3924155235290527, + "learning_rate": 9.461934712677286e-06, + "loss": 1.0383, + "step": 3781 + }, + { + "epoch": 0.9580747308423052, + "grad_norm": 3.3958303928375244, + "learning_rate": 9.461556553768124e-06, + "loss": 1.1977, + "step": 3782 + }, + { + "epoch": 0.9583280557314756, + "grad_norm": 3.1743788719177246, + "learning_rate": 9.461178269580546e-06, + "loss": 1.0083, + "step": 3783 + }, + { + "epoch": 0.958581380620646, + "grad_norm": 3.194161891937256, + "learning_rate": 9.460799860125171e-06, + "loss": 1.0622, + "step": 3784 + }, + { + "epoch": 0.9588347055098163, + "grad_norm": 3.3265326023101807, + "learning_rate": 9.460421325412627e-06, + "loss": 1.1222, + "step": 3785 + }, + { + "epoch": 0.9590880303989867, + "grad_norm": 3.457919120788574, + "learning_rate": 9.460042665453543e-06, + "loss": 1.103, + "step": 3786 + }, + { + "epoch": 0.9593413552881571, + "grad_norm": 3.9475507736206055, + "learning_rate": 9.459663880258554e-06, + "loss": 1.1429, + "step": 3787 + }, + { + "epoch": 0.9595946801773274, + "grad_norm": 3.5583839416503906, + "learning_rate": 9.45928496983829e-06, + "loss": 1.2336, + "step": 3788 + }, + { + "epoch": 0.9598480050664978, + "grad_norm": 3.488065242767334, + "learning_rate": 9.458905934203395e-06, + "loss": 1.272, + "step": 3789 + }, + { + "epoch": 0.9601013299556681, + "grad_norm": 3.486656427383423, + "learning_rate": 9.458526773364512e-06, + "loss": 1.1709, + "step": 3790 + }, + { + "epoch": 0.9603546548448385, + "grad_norm": 3.8320512771606445, + "learning_rate": 9.458147487332285e-06, + "loss": 1.4177, + "step": 3791 + }, + { + "epoch": 0.9606079797340089, + "grad_norm": 3.292505979537964, + "learning_rate": 9.457768076117368e-06, + "loss": 1.2246, + "step": 3792 + }, + { + "epoch": 0.9608613046231792, + "grad_norm": 3.4512226581573486, + "learning_rate": 9.457388539730411e-06, + "loss": 1.3752, + "step": 3793 + }, + { + "epoch": 0.9611146295123496, + "grad_norm": 3.332423210144043, + "learning_rate": 9.457008878182074e-06, + "loss": 1.0874, + "step": 3794 + }, + { + "epoch": 0.9613679544015199, + "grad_norm": 3.1790709495544434, + "learning_rate": 9.456629091483015e-06, + "loss": 1.0322, + "step": 3795 + }, + { + "epoch": 0.9616212792906903, + "grad_norm": 3.306790590286255, + "learning_rate": 9.456249179643901e-06, + "loss": 1.1335, + "step": 3796 + }, + { + "epoch": 0.9618746041798607, + "grad_norm": 3.4400649070739746, + "learning_rate": 9.4558691426754e-06, + "loss": 1.0695, + "step": 3797 + }, + { + "epoch": 0.962127929069031, + "grad_norm": 3.24611234664917, + "learning_rate": 9.45548898058818e-06, + "loss": 1.1332, + "step": 3798 + }, + { + "epoch": 0.9623812539582014, + "grad_norm": 3.5424106121063232, + "learning_rate": 9.455108693392918e-06, + "loss": 1.1823, + "step": 3799 + }, + { + "epoch": 0.9626345788473718, + "grad_norm": 3.662250280380249, + "learning_rate": 9.454728281100292e-06, + "loss": 1.1708, + "step": 3800 + }, + { + "epoch": 0.9628879037365421, + "grad_norm": 3.3638834953308105, + "learning_rate": 9.454347743720985e-06, + "loss": 1.1165, + "step": 3801 + }, + { + "epoch": 0.9631412286257125, + "grad_norm": 3.294171094894409, + "learning_rate": 9.453967081265679e-06, + "loss": 1.0476, + "step": 3802 + }, + { + "epoch": 0.9633945535148828, + "grad_norm": 3.36968994140625, + "learning_rate": 9.453586293745065e-06, + "loss": 0.9926, + "step": 3803 + }, + { + "epoch": 0.9636478784040532, + "grad_norm": 3.604736566543579, + "learning_rate": 9.453205381169836e-06, + "loss": 1.1909, + "step": 3804 + }, + { + "epoch": 0.9639012032932236, + "grad_norm": 3.4355592727661133, + "learning_rate": 9.452824343550686e-06, + "loss": 1.245, + "step": 3805 + }, + { + "epoch": 0.9641545281823939, + "grad_norm": 3.1951956748962402, + "learning_rate": 9.452443180898316e-06, + "loss": 1.1425, + "step": 3806 + }, + { + "epoch": 0.9644078530715643, + "grad_norm": 3.562960386276245, + "learning_rate": 9.452061893223428e-06, + "loss": 1.2107, + "step": 3807 + }, + { + "epoch": 0.9646611779607347, + "grad_norm": 3.605611801147461, + "learning_rate": 9.45168048053673e-06, + "loss": 1.0697, + "step": 3808 + }, + { + "epoch": 0.964914502849905, + "grad_norm": 3.229020357131958, + "learning_rate": 9.451298942848928e-06, + "loss": 1.0341, + "step": 3809 + }, + { + "epoch": 0.9651678277390754, + "grad_norm": 3.192826271057129, + "learning_rate": 9.45091728017074e-06, + "loss": 1.1174, + "step": 3810 + }, + { + "epoch": 0.9654211526282457, + "grad_norm": 3.4178497791290283, + "learning_rate": 9.45053549251288e-06, + "loss": 1.0368, + "step": 3811 + }, + { + "epoch": 0.9656744775174161, + "grad_norm": 3.4938018321990967, + "learning_rate": 9.45015357988607e-06, + "loss": 1.3107, + "step": 3812 + }, + { + "epoch": 0.9659278024065865, + "grad_norm": 3.2385363578796387, + "learning_rate": 9.449771542301031e-06, + "loss": 1.1503, + "step": 3813 + }, + { + "epoch": 0.9661811272957568, + "grad_norm": 3.6335394382476807, + "learning_rate": 9.449389379768495e-06, + "loss": 1.2816, + "step": 3814 + }, + { + "epoch": 0.9664344521849272, + "grad_norm": 3.1078078746795654, + "learning_rate": 9.44900709229919e-06, + "loss": 1.1068, + "step": 3815 + }, + { + "epoch": 0.9666877770740975, + "grad_norm": 3.5607919692993164, + "learning_rate": 9.44862467990385e-06, + "loss": 1.2817, + "step": 3816 + }, + { + "epoch": 0.9669411019632679, + "grad_norm": 3.2077252864837646, + "learning_rate": 9.448242142593216e-06, + "loss": 1.0986, + "step": 3817 + }, + { + "epoch": 0.9671944268524383, + "grad_norm": 3.0507113933563232, + "learning_rate": 9.447859480378025e-06, + "loss": 1.1553, + "step": 3818 + }, + { + "epoch": 0.9674477517416086, + "grad_norm": 3.2941465377807617, + "learning_rate": 9.447476693269027e-06, + "loss": 1.2877, + "step": 3819 + }, + { + "epoch": 0.967701076630779, + "grad_norm": 3.351145029067993, + "learning_rate": 9.447093781276965e-06, + "loss": 1.2199, + "step": 3820 + }, + { + "epoch": 0.9679544015199494, + "grad_norm": 3.4783740043640137, + "learning_rate": 9.446710744412595e-06, + "loss": 1.1059, + "step": 3821 + }, + { + "epoch": 0.9682077264091197, + "grad_norm": 3.4025323390960693, + "learning_rate": 9.446327582686672e-06, + "loss": 1.2659, + "step": 3822 + }, + { + "epoch": 0.9684610512982901, + "grad_norm": 3.8878393173217773, + "learning_rate": 9.445944296109954e-06, + "loss": 1.4049, + "step": 3823 + }, + { + "epoch": 0.9687143761874604, + "grad_norm": 3.270028829574585, + "learning_rate": 9.445560884693203e-06, + "loss": 1.2016, + "step": 3824 + }, + { + "epoch": 0.9689677010766308, + "grad_norm": 3.1180226802825928, + "learning_rate": 9.445177348447187e-06, + "loss": 1.1235, + "step": 3825 + }, + { + "epoch": 0.9692210259658012, + "grad_norm": 3.4842357635498047, + "learning_rate": 9.444793687382674e-06, + "loss": 1.1148, + "step": 3826 + }, + { + "epoch": 0.9694743508549715, + "grad_norm": 3.5103421211242676, + "learning_rate": 9.444409901510439e-06, + "loss": 1.1294, + "step": 3827 + }, + { + "epoch": 0.9697276757441419, + "grad_norm": 3.630034923553467, + "learning_rate": 9.444025990841254e-06, + "loss": 1.2975, + "step": 3828 + }, + { + "epoch": 0.9699810006333123, + "grad_norm": 3.3602428436279297, + "learning_rate": 9.443641955385904e-06, + "loss": 1.1799, + "step": 3829 + }, + { + "epoch": 0.9702343255224826, + "grad_norm": 3.3463146686553955, + "learning_rate": 9.44325779515517e-06, + "loss": 1.2184, + "step": 3830 + }, + { + "epoch": 0.970487650411653, + "grad_norm": 3.0899195671081543, + "learning_rate": 9.44287351015984e-06, + "loss": 0.9963, + "step": 3831 + }, + { + "epoch": 0.9707409753008233, + "grad_norm": 2.956557273864746, + "learning_rate": 9.442489100410704e-06, + "loss": 1.0106, + "step": 3832 + }, + { + "epoch": 0.9709943001899937, + "grad_norm": 3.286870002746582, + "learning_rate": 9.442104565918555e-06, + "loss": 1.2033, + "step": 3833 + }, + { + "epoch": 0.9712476250791641, + "grad_norm": 3.317532777786255, + "learning_rate": 9.441719906694194e-06, + "loss": 1.0545, + "step": 3834 + }, + { + "epoch": 0.9715009499683344, + "grad_norm": 3.214738130569458, + "learning_rate": 9.441335122748418e-06, + "loss": 1.2286, + "step": 3835 + }, + { + "epoch": 0.9717542748575048, + "grad_norm": 3.272801160812378, + "learning_rate": 9.440950214092033e-06, + "loss": 1.0557, + "step": 3836 + }, + { + "epoch": 0.9720075997466752, + "grad_norm": 3.406696319580078, + "learning_rate": 9.440565180735848e-06, + "loss": 1.1168, + "step": 3837 + }, + { + "epoch": 0.9722609246358455, + "grad_norm": 3.1612672805786133, + "learning_rate": 9.440180022690674e-06, + "loss": 1.1292, + "step": 3838 + }, + { + "epoch": 0.9725142495250159, + "grad_norm": 3.3064706325531006, + "learning_rate": 9.439794739967326e-06, + "loss": 1.1617, + "step": 3839 + }, + { + "epoch": 0.9727675744141862, + "grad_norm": 3.067596197128296, + "learning_rate": 9.439409332576624e-06, + "loss": 1.0532, + "step": 3840 + }, + { + "epoch": 0.9730208993033566, + "grad_norm": 3.350423812866211, + "learning_rate": 9.439023800529385e-06, + "loss": 1.2133, + "step": 3841 + }, + { + "epoch": 0.973274224192527, + "grad_norm": 3.269676446914673, + "learning_rate": 9.43863814383644e-06, + "loss": 1.0703, + "step": 3842 + }, + { + "epoch": 0.9735275490816973, + "grad_norm": 3.0289134979248047, + "learning_rate": 9.438252362508617e-06, + "loss": 1.0969, + "step": 3843 + }, + { + "epoch": 0.9737808739708677, + "grad_norm": 3.3679349422454834, + "learning_rate": 9.437866456556747e-06, + "loss": 1.1468, + "step": 3844 + }, + { + "epoch": 0.974034198860038, + "grad_norm": 3.594999074935913, + "learning_rate": 9.437480425991668e-06, + "loss": 1.1801, + "step": 3845 + }, + { + "epoch": 0.9742875237492084, + "grad_norm": 3.608299493789673, + "learning_rate": 9.437094270824218e-06, + "loss": 1.244, + "step": 3846 + }, + { + "epoch": 0.9745408486383788, + "grad_norm": 3.7407684326171875, + "learning_rate": 9.43670799106524e-06, + "loss": 1.186, + "step": 3847 + }, + { + "epoch": 0.974794173527549, + "grad_norm": 3.3501274585723877, + "learning_rate": 9.43632158672558e-06, + "loss": 1.1088, + "step": 3848 + }, + { + "epoch": 0.9750474984167194, + "grad_norm": 3.390684127807617, + "learning_rate": 9.43593505781609e-06, + "loss": 1.2245, + "step": 3849 + }, + { + "epoch": 0.9753008233058899, + "grad_norm": 3.3741683959960938, + "learning_rate": 9.435548404347623e-06, + "loss": 1.1435, + "step": 3850 + }, + { + "epoch": 0.9755541481950601, + "grad_norm": 3.3280506134033203, + "learning_rate": 9.435161626331034e-06, + "loss": 1.037, + "step": 3851 + }, + { + "epoch": 0.9758074730842305, + "grad_norm": 3.345827579498291, + "learning_rate": 9.434774723777187e-06, + "loss": 1.168, + "step": 3852 + }, + { + "epoch": 0.9760607979734008, + "grad_norm": 3.2142157554626465, + "learning_rate": 9.434387696696942e-06, + "loss": 1.0419, + "step": 3853 + }, + { + "epoch": 0.9763141228625712, + "grad_norm": 3.3951430320739746, + "learning_rate": 9.434000545101172e-06, + "loss": 1.1057, + "step": 3854 + }, + { + "epoch": 0.9765674477517416, + "grad_norm": 3.477745294570923, + "learning_rate": 9.433613269000743e-06, + "loss": 1.1029, + "step": 3855 + }, + { + "epoch": 0.9768207726409119, + "grad_norm": 3.197359800338745, + "learning_rate": 9.433225868406531e-06, + "loss": 1.0358, + "step": 3856 + }, + { + "epoch": 0.9770740975300823, + "grad_norm": 3.2064270973205566, + "learning_rate": 9.432838343329416e-06, + "loss": 1.1123, + "step": 3857 + }, + { + "epoch": 0.9773274224192527, + "grad_norm": 3.4815673828125, + "learning_rate": 9.432450693780275e-06, + "loss": 1.1419, + "step": 3858 + }, + { + "epoch": 0.977580747308423, + "grad_norm": 2.9322943687438965, + "learning_rate": 9.43206291977e-06, + "loss": 1.0544, + "step": 3859 + }, + { + "epoch": 0.9778340721975934, + "grad_norm": 3.3347134590148926, + "learning_rate": 9.431675021309472e-06, + "loss": 1.1945, + "step": 3860 + }, + { + "epoch": 0.9780873970867637, + "grad_norm": 3.537545919418335, + "learning_rate": 9.431286998409587e-06, + "loss": 1.0261, + "step": 3861 + }, + { + "epoch": 0.9783407219759341, + "grad_norm": 3.597611904144287, + "learning_rate": 9.43089885108124e-06, + "loss": 1.2227, + "step": 3862 + }, + { + "epoch": 0.9785940468651045, + "grad_norm": 2.9854297637939453, + "learning_rate": 9.430510579335331e-06, + "loss": 1.0479, + "step": 3863 + }, + { + "epoch": 0.9788473717542748, + "grad_norm": 3.229724645614624, + "learning_rate": 9.43012218318276e-06, + "loss": 1.0965, + "step": 3864 + }, + { + "epoch": 0.9791006966434452, + "grad_norm": 3.2314443588256836, + "learning_rate": 9.429733662634436e-06, + "loss": 1.0474, + "step": 3865 + }, + { + "epoch": 0.9793540215326155, + "grad_norm": 3.617342233657837, + "learning_rate": 9.429345017701265e-06, + "loss": 1.2463, + "step": 3866 + }, + { + "epoch": 0.9796073464217859, + "grad_norm": 3.136746644973755, + "learning_rate": 9.428956248394164e-06, + "loss": 1.1454, + "step": 3867 + }, + { + "epoch": 0.9798606713109563, + "grad_norm": 2.977908134460449, + "learning_rate": 9.428567354724047e-06, + "loss": 1.1663, + "step": 3868 + }, + { + "epoch": 0.9801139962001266, + "grad_norm": 3.4196927547454834, + "learning_rate": 9.428178336701833e-06, + "loss": 1.0333, + "step": 3869 + }, + { + "epoch": 0.980367321089297, + "grad_norm": 3.3801467418670654, + "learning_rate": 9.427789194338447e-06, + "loss": 1.0466, + "step": 3870 + }, + { + "epoch": 0.9806206459784674, + "grad_norm": 3.2432374954223633, + "learning_rate": 9.427399927644817e-06, + "loss": 1.1126, + "step": 3871 + }, + { + "epoch": 0.9808739708676377, + "grad_norm": 3.3430604934692383, + "learning_rate": 9.42701053663187e-06, + "loss": 1.3454, + "step": 3872 + }, + { + "epoch": 0.9811272957568081, + "grad_norm": 3.461338520050049, + "learning_rate": 9.426621021310542e-06, + "loss": 0.9893, + "step": 3873 + }, + { + "epoch": 0.9813806206459784, + "grad_norm": 3.28102707862854, + "learning_rate": 9.42623138169177e-06, + "loss": 1.087, + "step": 3874 + }, + { + "epoch": 0.9816339455351488, + "grad_norm": 3.4910247325897217, + "learning_rate": 9.425841617786498e-06, + "loss": 1.1913, + "step": 3875 + }, + { + "epoch": 0.9818872704243192, + "grad_norm": 3.680506706237793, + "learning_rate": 9.425451729605665e-06, + "loss": 1.234, + "step": 3876 + }, + { + "epoch": 0.9821405953134895, + "grad_norm": 3.419874668121338, + "learning_rate": 9.425061717160224e-06, + "loss": 1.1164, + "step": 3877 + }, + { + "epoch": 0.9823939202026599, + "grad_norm": 3.5245721340179443, + "learning_rate": 9.424671580461122e-06, + "loss": 1.213, + "step": 3878 + }, + { + "epoch": 0.9826472450918303, + "grad_norm": 3.2231924533843994, + "learning_rate": 9.424281319519315e-06, + "loss": 1.1113, + "step": 3879 + }, + { + "epoch": 0.9829005699810006, + "grad_norm": 3.562605857849121, + "learning_rate": 9.423890934345765e-06, + "loss": 1.2658, + "step": 3880 + }, + { + "epoch": 0.983153894870171, + "grad_norm": 3.5420823097229004, + "learning_rate": 9.423500424951428e-06, + "loss": 1.1124, + "step": 3881 + }, + { + "epoch": 0.9834072197593413, + "grad_norm": 3.412123680114746, + "learning_rate": 9.423109791347274e-06, + "loss": 1.0965, + "step": 3882 + }, + { + "epoch": 0.9836605446485117, + "grad_norm": 3.1747586727142334, + "learning_rate": 9.422719033544269e-06, + "loss": 0.9925, + "step": 3883 + }, + { + "epoch": 0.9839138695376821, + "grad_norm": 3.5493829250335693, + "learning_rate": 9.422328151553388e-06, + "loss": 1.0916, + "step": 3884 + }, + { + "epoch": 0.9841671944268524, + "grad_norm": 3.297175168991089, + "learning_rate": 9.421937145385604e-06, + "loss": 1.0271, + "step": 3885 + }, + { + "epoch": 0.9844205193160228, + "grad_norm": 3.282716989517212, + "learning_rate": 9.4215460150519e-06, + "loss": 1.0088, + "step": 3886 + }, + { + "epoch": 0.9846738442051931, + "grad_norm": 3.3660576343536377, + "learning_rate": 9.421154760563253e-06, + "loss": 1.1502, + "step": 3887 + }, + { + "epoch": 0.9849271690943635, + "grad_norm": 3.6032137870788574, + "learning_rate": 9.420763381930653e-06, + "loss": 1.1942, + "step": 3888 + }, + { + "epoch": 0.9851804939835339, + "grad_norm": 3.5022025108337402, + "learning_rate": 9.420371879165089e-06, + "loss": 1.0985, + "step": 3889 + }, + { + "epoch": 0.9854338188727042, + "grad_norm": 3.558393955230713, + "learning_rate": 9.419980252277554e-06, + "loss": 1.1885, + "step": 3890 + }, + { + "epoch": 0.9856871437618746, + "grad_norm": 3.6664602756500244, + "learning_rate": 9.419588501279047e-06, + "loss": 1.247, + "step": 3891 + }, + { + "epoch": 0.985940468651045, + "grad_norm": 3.4450783729553223, + "learning_rate": 9.419196626180565e-06, + "loss": 1.1347, + "step": 3892 + }, + { + "epoch": 0.9861937935402153, + "grad_norm": 3.165170192718506, + "learning_rate": 9.418804626993111e-06, + "loss": 1.1621, + "step": 3893 + }, + { + "epoch": 0.9864471184293857, + "grad_norm": 3.789806842803955, + "learning_rate": 9.418412503727697e-06, + "loss": 1.2786, + "step": 3894 + }, + { + "epoch": 0.986700443318556, + "grad_norm": 3.4419732093811035, + "learning_rate": 9.41802025639533e-06, + "loss": 1.2254, + "step": 3895 + }, + { + "epoch": 0.9869537682077264, + "grad_norm": 3.2875359058380127, + "learning_rate": 9.417627885007025e-06, + "loss": 1.1569, + "step": 3896 + }, + { + "epoch": 0.9872070930968968, + "grad_norm": 2.901073455810547, + "learning_rate": 9.4172353895738e-06, + "loss": 1.0742, + "step": 3897 + }, + { + "epoch": 0.9874604179860671, + "grad_norm": 3.3430562019348145, + "learning_rate": 9.416842770106673e-06, + "loss": 1.1992, + "step": 3898 + }, + { + "epoch": 0.9877137428752375, + "grad_norm": 3.46517014503479, + "learning_rate": 9.416450026616672e-06, + "loss": 1.139, + "step": 3899 + }, + { + "epoch": 0.9879670677644079, + "grad_norm": 3.2958526611328125, + "learning_rate": 9.416057159114826e-06, + "loss": 1.283, + "step": 3900 + }, + { + "epoch": 0.9882203926535782, + "grad_norm": 3.2788097858428955, + "learning_rate": 9.415664167612164e-06, + "loss": 1.1059, + "step": 3901 + }, + { + "epoch": 0.9884737175427486, + "grad_norm": 3.61752986907959, + "learning_rate": 9.415271052119721e-06, + "loss": 1.1303, + "step": 3902 + }, + { + "epoch": 0.9887270424319189, + "grad_norm": 3.313946485519409, + "learning_rate": 9.414877812648535e-06, + "loss": 1.1279, + "step": 3903 + }, + { + "epoch": 0.9889803673210893, + "grad_norm": 3.685398817062378, + "learning_rate": 9.414484449209652e-06, + "loss": 1.1137, + "step": 3904 + }, + { + "epoch": 0.9892336922102597, + "grad_norm": 3.203425168991089, + "learning_rate": 9.414090961814114e-06, + "loss": 1.1434, + "step": 3905 + }, + { + "epoch": 0.98948701709943, + "grad_norm": 3.2387688159942627, + "learning_rate": 9.413697350472968e-06, + "loss": 1.1431, + "step": 3906 + }, + { + "epoch": 0.9897403419886004, + "grad_norm": 3.46217942237854, + "learning_rate": 9.413303615197272e-06, + "loss": 1.166, + "step": 3907 + }, + { + "epoch": 0.9899936668777708, + "grad_norm": 2.9703526496887207, + "learning_rate": 9.412909755998077e-06, + "loss": 1.0074, + "step": 3908 + }, + { + "epoch": 0.9902469917669411, + "grad_norm": 3.5861029624938965, + "learning_rate": 9.412515772886446e-06, + "loss": 1.1414, + "step": 3909 + }, + { + "epoch": 0.9905003166561115, + "grad_norm": 3.4567534923553467, + "learning_rate": 9.412121665873437e-06, + "loss": 1.2878, + "step": 3910 + }, + { + "epoch": 0.9907536415452818, + "grad_norm": 3.2463295459747314, + "learning_rate": 9.411727434970121e-06, + "loss": 1.1724, + "step": 3911 + }, + { + "epoch": 0.9910069664344522, + "grad_norm": 3.3823342323303223, + "learning_rate": 9.411333080187568e-06, + "loss": 1.267, + "step": 3912 + }, + { + "epoch": 0.9912602913236226, + "grad_norm": 3.239588499069214, + "learning_rate": 9.410938601536848e-06, + "loss": 1.0305, + "step": 3913 + }, + { + "epoch": 0.9915136162127929, + "grad_norm": 3.7073850631713867, + "learning_rate": 9.41054399902904e-06, + "loss": 1.2407, + "step": 3914 + }, + { + "epoch": 0.9917669411019633, + "grad_norm": 3.311387538909912, + "learning_rate": 9.410149272675224e-06, + "loss": 1.0787, + "step": 3915 + }, + { + "epoch": 0.9920202659911336, + "grad_norm": 3.822373390197754, + "learning_rate": 9.409754422486482e-06, + "loss": 1.2348, + "step": 3916 + }, + { + "epoch": 0.992273590880304, + "grad_norm": 3.1513609886169434, + "learning_rate": 9.409359448473904e-06, + "loss": 1.1415, + "step": 3917 + }, + { + "epoch": 0.9925269157694744, + "grad_norm": 3.7293031215667725, + "learning_rate": 9.40896435064858e-06, + "loss": 1.4051, + "step": 3918 + }, + { + "epoch": 0.9927802406586447, + "grad_norm": 3.0820281505584717, + "learning_rate": 9.4085691290216e-06, + "loss": 1.0717, + "step": 3919 + }, + { + "epoch": 0.9930335655478151, + "grad_norm": 3.3282504081726074, + "learning_rate": 9.408173783604068e-06, + "loss": 1.1197, + "step": 3920 + }, + { + "epoch": 0.9932868904369855, + "grad_norm": 3.198674201965332, + "learning_rate": 9.407778314407081e-06, + "loss": 1.1, + "step": 3921 + }, + { + "epoch": 0.9935402153261558, + "grad_norm": 3.5387020111083984, + "learning_rate": 9.407382721441744e-06, + "loss": 1.1417, + "step": 3922 + }, + { + "epoch": 0.9937935402153262, + "grad_norm": 3.6065056324005127, + "learning_rate": 9.406987004719169e-06, + "loss": 1.232, + "step": 3923 + }, + { + "epoch": 0.9940468651044965, + "grad_norm": 3.1034014225006104, + "learning_rate": 9.406591164250462e-06, + "loss": 1.0804, + "step": 3924 + }, + { + "epoch": 0.9943001899936669, + "grad_norm": 3.61995005607605, + "learning_rate": 9.406195200046739e-06, + "loss": 1.2058, + "step": 3925 + }, + { + "epoch": 0.9945535148828373, + "grad_norm": 3.545271635055542, + "learning_rate": 9.405799112119123e-06, + "loss": 1.1187, + "step": 3926 + }, + { + "epoch": 0.9948068397720076, + "grad_norm": 3.5722460746765137, + "learning_rate": 9.405402900478731e-06, + "loss": 1.2531, + "step": 3927 + }, + { + "epoch": 0.995060164661178, + "grad_norm": 3.2239632606506348, + "learning_rate": 9.40500656513669e-06, + "loss": 1.1244, + "step": 3928 + }, + { + "epoch": 0.9953134895503484, + "grad_norm": 3.4617607593536377, + "learning_rate": 9.404610106104131e-06, + "loss": 1.1761, + "step": 3929 + }, + { + "epoch": 0.9955668144395187, + "grad_norm": 3.228865146636963, + "learning_rate": 9.404213523392183e-06, + "loss": 1.015, + "step": 3930 + }, + { + "epoch": 0.9958201393286891, + "grad_norm": 3.3742833137512207, + "learning_rate": 9.403816817011984e-06, + "loss": 1.1929, + "step": 3931 + }, + { + "epoch": 0.9960734642178594, + "grad_norm": 3.2746987342834473, + "learning_rate": 9.403419986974671e-06, + "loss": 1.039, + "step": 3932 + }, + { + "epoch": 0.9963267891070298, + "grad_norm": 3.239696502685547, + "learning_rate": 9.40302303329139e-06, + "loss": 1.0192, + "step": 3933 + }, + { + "epoch": 0.9965801139962002, + "grad_norm": 3.384113073348999, + "learning_rate": 9.402625955973286e-06, + "loss": 0.9779, + "step": 3934 + }, + { + "epoch": 0.9968334388853705, + "grad_norm": 2.995961904525757, + "learning_rate": 9.402228755031508e-06, + "loss": 1.0088, + "step": 3935 + }, + { + "epoch": 0.9970867637745409, + "grad_norm": 3.698370933532715, + "learning_rate": 9.401831430477211e-06, + "loss": 1.2532, + "step": 3936 + }, + { + "epoch": 0.9973400886637112, + "grad_norm": 3.454690933227539, + "learning_rate": 9.40143398232155e-06, + "loss": 1.1475, + "step": 3937 + }, + { + "epoch": 0.9975934135528816, + "grad_norm": 3.397371292114258, + "learning_rate": 9.401036410575686e-06, + "loss": 1.1621, + "step": 3938 + }, + { + "epoch": 0.997846738442052, + "grad_norm": 3.0821456909179688, + "learning_rate": 9.40063871525078e-06, + "loss": 1.0157, + "step": 3939 + }, + { + "epoch": 0.9981000633312223, + "grad_norm": 3.279803991317749, + "learning_rate": 9.400240896358003e-06, + "loss": 1.1192, + "step": 3940 + }, + { + "epoch": 0.9983533882203927, + "grad_norm": 3.269456624984741, + "learning_rate": 9.399842953908525e-06, + "loss": 1.19, + "step": 3941 + }, + { + "epoch": 0.9986067131095631, + "grad_norm": 3.564135789871216, + "learning_rate": 9.399444887913517e-06, + "loss": 1.0596, + "step": 3942 + }, + { + "epoch": 0.9988600379987334, + "grad_norm": 3.460651397705078, + "learning_rate": 9.39904669838416e-06, + "loss": 1.1363, + "step": 3943 + }, + { + "epoch": 0.9991133628879038, + "grad_norm": 3.2880966663360596, + "learning_rate": 9.398648385331632e-06, + "loss": 1.1396, + "step": 3944 + }, + { + "epoch": 0.999366687777074, + "grad_norm": 3.545489549636841, + "learning_rate": 9.39824994876712e-06, + "loss": 1.1993, + "step": 3945 + }, + { + "epoch": 0.9996200126662445, + "grad_norm": 3.4143259525299072, + "learning_rate": 9.397851388701811e-06, + "loss": 1.042, + "step": 3946 + }, + { + "epoch": 0.9998733375554149, + "grad_norm": 3.38020920753479, + "learning_rate": 9.397452705146895e-06, + "loss": 1.2462, + "step": 3947 + }, + { + "epoch": 1.0001266624445853, + "grad_norm": 3.2661054134368896, + "learning_rate": 9.397053898113569e-06, + "loss": 1.0958, + "step": 3948 + }, + { + "epoch": 1.0003799873337555, + "grad_norm": 3.2052109241485596, + "learning_rate": 9.39665496761303e-06, + "loss": 0.6504, + "step": 3949 + }, + { + "epoch": 1.0006333122229258, + "grad_norm": 3.3197989463806152, + "learning_rate": 9.39625591365648e-06, + "loss": 0.7845, + "step": 3950 + }, + { + "epoch": 1.0008866371120964, + "grad_norm": 3.427189588546753, + "learning_rate": 9.395856736255125e-06, + "loss": 0.9611, + "step": 3951 + }, + { + "epoch": 1.0011399620012666, + "grad_norm": 3.1614127159118652, + "learning_rate": 9.395457435420172e-06, + "loss": 0.7962, + "step": 3952 + }, + { + "epoch": 1.001393286890437, + "grad_norm": 3.2238099575042725, + "learning_rate": 9.395058011162835e-06, + "loss": 0.9196, + "step": 3953 + }, + { + "epoch": 1.0016466117796075, + "grad_norm": 2.831684112548828, + "learning_rate": 9.394658463494328e-06, + "loss": 0.7302, + "step": 3954 + }, + { + "epoch": 1.0018999366687777, + "grad_norm": 3.56166934967041, + "learning_rate": 9.394258792425873e-06, + "loss": 0.8171, + "step": 3955 + }, + { + "epoch": 1.002153261557948, + "grad_norm": 3.648761510848999, + "learning_rate": 9.393858997968687e-06, + "loss": 0.8623, + "step": 3956 + }, + { + "epoch": 1.0024065864471183, + "grad_norm": 3.7296135425567627, + "learning_rate": 9.393459080134003e-06, + "loss": 0.7937, + "step": 3957 + }, + { + "epoch": 1.0026599113362888, + "grad_norm": 3.5527822971343994, + "learning_rate": 9.393059038933046e-06, + "loss": 0.7415, + "step": 3958 + }, + { + "epoch": 1.0029132362254591, + "grad_norm": 4.277530193328857, + "learning_rate": 9.39265887437705e-06, + "loss": 0.9787, + "step": 3959 + }, + { + "epoch": 1.0031665611146294, + "grad_norm": 4.242221832275391, + "learning_rate": 9.392258586477252e-06, + "loss": 0.9013, + "step": 3960 + }, + { + "epoch": 1.0034198860038, + "grad_norm": 3.581080675125122, + "learning_rate": 9.391858175244892e-06, + "loss": 0.722, + "step": 3961 + }, + { + "epoch": 1.0036732108929702, + "grad_norm": 3.8707706928253174, + "learning_rate": 9.391457640691212e-06, + "loss": 0.7524, + "step": 3962 + }, + { + "epoch": 1.0039265357821405, + "grad_norm": 4.110296726226807, + "learning_rate": 9.39105698282746e-06, + "loss": 0.9165, + "step": 3963 + }, + { + "epoch": 1.004179860671311, + "grad_norm": 3.7063021659851074, + "learning_rate": 9.390656201664885e-06, + "loss": 0.7811, + "step": 3964 + }, + { + "epoch": 1.0044331855604813, + "grad_norm": 3.5401947498321533, + "learning_rate": 9.390255297214743e-06, + "loss": 0.7897, + "step": 3965 + }, + { + "epoch": 1.0046865104496516, + "grad_norm": 3.8825721740722656, + "learning_rate": 9.389854269488288e-06, + "loss": 0.9127, + "step": 3966 + }, + { + "epoch": 1.0049398353388221, + "grad_norm": 3.7624917030334473, + "learning_rate": 9.389453118496784e-06, + "loss": 0.7913, + "step": 3967 + }, + { + "epoch": 1.0051931602279924, + "grad_norm": 3.513160228729248, + "learning_rate": 9.389051844251493e-06, + "loss": 0.8007, + "step": 3968 + }, + { + "epoch": 1.0054464851171627, + "grad_norm": 3.7435648441314697, + "learning_rate": 9.388650446763685e-06, + "loss": 0.8864, + "step": 3969 + }, + { + "epoch": 1.005699810006333, + "grad_norm": 3.542984962463379, + "learning_rate": 9.388248926044627e-06, + "loss": 0.7748, + "step": 3970 + }, + { + "epoch": 1.0059531348955035, + "grad_norm": 3.4242076873779297, + "learning_rate": 9.387847282105597e-06, + "loss": 0.7556, + "step": 3971 + }, + { + "epoch": 1.0062064597846738, + "grad_norm": 3.671903610229492, + "learning_rate": 9.387445514957872e-06, + "loss": 0.8695, + "step": 3972 + }, + { + "epoch": 1.0064597846738441, + "grad_norm": 3.238417625427246, + "learning_rate": 9.387043624612733e-06, + "loss": 0.8229, + "step": 3973 + }, + { + "epoch": 1.0067131095630146, + "grad_norm": 3.5176565647125244, + "learning_rate": 9.386641611081464e-06, + "loss": 0.8807, + "step": 3974 + }, + { + "epoch": 1.006966434452185, + "grad_norm": 3.416443347930908, + "learning_rate": 9.386239474375354e-06, + "loss": 0.7816, + "step": 3975 + }, + { + "epoch": 1.0072197593413552, + "grad_norm": 3.7881276607513428, + "learning_rate": 9.385837214505697e-06, + "loss": 0.898, + "step": 3976 + }, + { + "epoch": 1.0074730842305257, + "grad_norm": 4.878559112548828, + "learning_rate": 9.385434831483787e-06, + "loss": 0.7363, + "step": 3977 + }, + { + "epoch": 1.007726409119696, + "grad_norm": 3.616326093673706, + "learning_rate": 9.385032325320921e-06, + "loss": 0.8663, + "step": 3978 + }, + { + "epoch": 1.0079797340088663, + "grad_norm": 3.512878179550171, + "learning_rate": 9.384629696028404e-06, + "loss": 0.7816, + "step": 3979 + }, + { + "epoch": 1.0082330588980368, + "grad_norm": 3.7067666053771973, + "learning_rate": 9.384226943617538e-06, + "loss": 0.7679, + "step": 3980 + }, + { + "epoch": 1.0084863837872071, + "grad_norm": 3.615403413772583, + "learning_rate": 9.383824068099637e-06, + "loss": 0.7131, + "step": 3981 + }, + { + "epoch": 1.0087397086763774, + "grad_norm": 3.7785820960998535, + "learning_rate": 9.383421069486009e-06, + "loss": 0.895, + "step": 3982 + }, + { + "epoch": 1.008993033565548, + "grad_norm": 3.4927902221679688, + "learning_rate": 9.383017947787972e-06, + "loss": 0.8078, + "step": 3983 + }, + { + "epoch": 1.0092463584547182, + "grad_norm": 3.422529458999634, + "learning_rate": 9.382614703016845e-06, + "loss": 0.7221, + "step": 3984 + }, + { + "epoch": 1.0094996833438885, + "grad_norm": 3.8402881622314453, + "learning_rate": 9.382211335183951e-06, + "loss": 0.9036, + "step": 3985 + }, + { + "epoch": 1.0097530082330588, + "grad_norm": 3.8119280338287354, + "learning_rate": 9.381807844300617e-06, + "loss": 0.7648, + "step": 3986 + }, + { + "epoch": 1.0100063331222293, + "grad_norm": 3.590049982070923, + "learning_rate": 9.381404230378171e-06, + "loss": 0.7746, + "step": 3987 + }, + { + "epoch": 1.0102596580113996, + "grad_norm": 3.670738935470581, + "learning_rate": 9.38100049342795e-06, + "loss": 0.8161, + "step": 3988 + }, + { + "epoch": 1.01051298290057, + "grad_norm": 3.62028431892395, + "learning_rate": 9.380596633461288e-06, + "loss": 0.7307, + "step": 3989 + }, + { + "epoch": 1.0107663077897404, + "grad_norm": 3.922258138656616, + "learning_rate": 9.380192650489523e-06, + "loss": 0.9206, + "step": 3990 + }, + { + "epoch": 1.0110196326789107, + "grad_norm": 3.51464581489563, + "learning_rate": 9.379788544524004e-06, + "loss": 0.825, + "step": 3991 + }, + { + "epoch": 1.011272957568081, + "grad_norm": 3.8684113025665283, + "learning_rate": 9.379384315576075e-06, + "loss": 0.9172, + "step": 3992 + }, + { + "epoch": 1.0115262824572515, + "grad_norm": 3.779548406600952, + "learning_rate": 9.378979963657087e-06, + "loss": 0.7806, + "step": 3993 + }, + { + "epoch": 1.0117796073464218, + "grad_norm": 3.2644236087799072, + "learning_rate": 9.378575488778392e-06, + "loss": 0.8051, + "step": 3994 + }, + { + "epoch": 1.012032932235592, + "grad_norm": 3.8024179935455322, + "learning_rate": 9.378170890951352e-06, + "loss": 0.8841, + "step": 3995 + }, + { + "epoch": 1.0122862571247626, + "grad_norm": 3.60563588142395, + "learning_rate": 9.377766170187324e-06, + "loss": 0.7384, + "step": 3996 + }, + { + "epoch": 1.012539582013933, + "grad_norm": 3.503539562225342, + "learning_rate": 9.377361326497673e-06, + "loss": 0.7236, + "step": 3997 + }, + { + "epoch": 1.0127929069031032, + "grad_norm": 3.932176113128662, + "learning_rate": 9.376956359893769e-06, + "loss": 0.8586, + "step": 3998 + }, + { + "epoch": 1.0130462317922735, + "grad_norm": 3.7912957668304443, + "learning_rate": 9.376551270386983e-06, + "loss": 0.8211, + "step": 3999 + }, + { + "epoch": 1.013299556681444, + "grad_norm": 3.5588326454162598, + "learning_rate": 9.376146057988686e-06, + "loss": 0.7828, + "step": 4000 + }, + { + "epoch": 1.013299556681444, + "eval_loss": 1.1961894035339355, + "eval_runtime": 13.9493, + "eval_samples_per_second": 28.675, + "eval_steps_per_second": 3.584, + "step": 4000 + }, + { + "epoch": 1.0135528815706143, + "grad_norm": 3.7038869857788086, + "learning_rate": 9.375740722710257e-06, + "loss": 0.8325, + "step": 4001 + }, + { + "epoch": 1.0138062064597846, + "grad_norm": 3.4084620475769043, + "learning_rate": 9.375335264563083e-06, + "loss": 0.6979, + "step": 4002 + }, + { + "epoch": 1.014059531348955, + "grad_norm": 3.8237576484680176, + "learning_rate": 9.374929683558545e-06, + "loss": 0.8107, + "step": 4003 + }, + { + "epoch": 1.0143128562381254, + "grad_norm": 3.8058247566223145, + "learning_rate": 9.37452397970803e-06, + "loss": 0.7667, + "step": 4004 + }, + { + "epoch": 1.0145661811272957, + "grad_norm": 3.383089065551758, + "learning_rate": 9.374118153022934e-06, + "loss": 0.7403, + "step": 4005 + }, + { + "epoch": 1.0148195060164662, + "grad_norm": 4.054951190948486, + "learning_rate": 9.373712203514649e-06, + "loss": 0.8392, + "step": 4006 + }, + { + "epoch": 1.0150728309056365, + "grad_norm": 3.5813748836517334, + "learning_rate": 9.373306131194575e-06, + "loss": 0.8302, + "step": 4007 + }, + { + "epoch": 1.0153261557948068, + "grad_norm": 3.783907651901245, + "learning_rate": 9.372899936074117e-06, + "loss": 0.7773, + "step": 4008 + }, + { + "epoch": 1.0155794806839773, + "grad_norm": 3.4219818115234375, + "learning_rate": 9.372493618164675e-06, + "loss": 0.7994, + "step": 4009 + }, + { + "epoch": 1.0158328055731476, + "grad_norm": 3.5228030681610107, + "learning_rate": 9.372087177477662e-06, + "loss": 0.7912, + "step": 4010 + }, + { + "epoch": 1.0160861304623179, + "grad_norm": 3.52055025100708, + "learning_rate": 9.371680614024493e-06, + "loss": 0.6163, + "step": 4011 + }, + { + "epoch": 1.0163394553514882, + "grad_norm": 3.9310102462768555, + "learning_rate": 9.371273927816577e-06, + "loss": 0.8431, + "step": 4012 + }, + { + "epoch": 1.0165927802406587, + "grad_norm": 3.408386468887329, + "learning_rate": 9.370867118865341e-06, + "loss": 0.7651, + "step": 4013 + }, + { + "epoch": 1.016846105129829, + "grad_norm": 3.851137161254883, + "learning_rate": 9.370460187182204e-06, + "loss": 0.8768, + "step": 4014 + }, + { + "epoch": 1.0170994300189993, + "grad_norm": 3.3667516708374023, + "learning_rate": 9.370053132778594e-06, + "loss": 0.7137, + "step": 4015 + }, + { + "epoch": 1.0173527549081698, + "grad_norm": 4.034343719482422, + "learning_rate": 9.369645955665938e-06, + "loss": 0.973, + "step": 4016 + }, + { + "epoch": 1.01760607979734, + "grad_norm": 3.683751344680786, + "learning_rate": 9.369238655855672e-06, + "loss": 0.7618, + "step": 4017 + }, + { + "epoch": 1.0178594046865104, + "grad_norm": 4.034209728240967, + "learning_rate": 9.368831233359234e-06, + "loss": 0.8932, + "step": 4018 + }, + { + "epoch": 1.0181127295756809, + "grad_norm": 3.8687524795532227, + "learning_rate": 9.368423688188062e-06, + "loss": 0.9488, + "step": 4019 + }, + { + "epoch": 1.0183660544648512, + "grad_norm": 4.022247791290283, + "learning_rate": 9.3680160203536e-06, + "loss": 0.8182, + "step": 4020 + }, + { + "epoch": 1.0186193793540215, + "grad_norm": 3.3809804916381836, + "learning_rate": 9.367608229867295e-06, + "loss": 0.7617, + "step": 4021 + }, + { + "epoch": 1.018872704243192, + "grad_norm": 3.5665273666381836, + "learning_rate": 9.367200316740597e-06, + "loss": 0.7599, + "step": 4022 + }, + { + "epoch": 1.0191260291323623, + "grad_norm": 3.5940616130828857, + "learning_rate": 9.366792280984964e-06, + "loss": 0.7481, + "step": 4023 + }, + { + "epoch": 1.0193793540215326, + "grad_norm": 3.7060680389404297, + "learning_rate": 9.366384122611846e-06, + "loss": 0.8695, + "step": 4024 + }, + { + "epoch": 1.019632678910703, + "grad_norm": 3.90010142326355, + "learning_rate": 9.365975841632712e-06, + "loss": 0.7519, + "step": 4025 + }, + { + "epoch": 1.0198860037998734, + "grad_norm": 3.875436544418335, + "learning_rate": 9.365567438059022e-06, + "loss": 0.7557, + "step": 4026 + }, + { + "epoch": 1.0201393286890437, + "grad_norm": 3.612962245941162, + "learning_rate": 9.365158911902244e-06, + "loss": 0.8537, + "step": 4027 + }, + { + "epoch": 1.020392653578214, + "grad_norm": 3.8356757164001465, + "learning_rate": 9.364750263173849e-06, + "loss": 0.816, + "step": 4028 + }, + { + "epoch": 1.0206459784673845, + "grad_norm": 3.585089683532715, + "learning_rate": 9.364341491885312e-06, + "loss": 0.7298, + "step": 4029 + }, + { + "epoch": 1.0208993033565548, + "grad_norm": 3.722064971923828, + "learning_rate": 9.363932598048112e-06, + "loss": 0.7424, + "step": 4030 + }, + { + "epoch": 1.021152628245725, + "grad_norm": 3.741924285888672, + "learning_rate": 9.363523581673731e-06, + "loss": 0.7275, + "step": 4031 + }, + { + "epoch": 1.0214059531348956, + "grad_norm": 4.047133922576904, + "learning_rate": 9.363114442773651e-06, + "loss": 0.9372, + "step": 4032 + }, + { + "epoch": 1.0216592780240659, + "grad_norm": 3.6290786266326904, + "learning_rate": 9.362705181359362e-06, + "loss": 0.716, + "step": 4033 + }, + { + "epoch": 1.0219126029132362, + "grad_norm": 3.848945379257202, + "learning_rate": 9.362295797442358e-06, + "loss": 0.7539, + "step": 4034 + }, + { + "epoch": 1.0221659278024067, + "grad_norm": 3.30218505859375, + "learning_rate": 9.361886291034132e-06, + "loss": 0.6905, + "step": 4035 + }, + { + "epoch": 1.022419252691577, + "grad_norm": 3.9459948539733887, + "learning_rate": 9.361476662146183e-06, + "loss": 0.6884, + "step": 4036 + }, + { + "epoch": 1.0226725775807473, + "grad_norm": 3.8152194023132324, + "learning_rate": 9.361066910790013e-06, + "loss": 0.8528, + "step": 4037 + }, + { + "epoch": 1.0229259024699178, + "grad_norm": 3.5387020111083984, + "learning_rate": 9.36065703697713e-06, + "loss": 0.8407, + "step": 4038 + }, + { + "epoch": 1.023179227359088, + "grad_norm": 3.615628242492676, + "learning_rate": 9.36024704071904e-06, + "loss": 0.7512, + "step": 4039 + }, + { + "epoch": 1.0234325522482584, + "grad_norm": 3.804381847381592, + "learning_rate": 9.359836922027255e-06, + "loss": 0.8224, + "step": 4040 + }, + { + "epoch": 1.0236858771374286, + "grad_norm": 3.6101276874542236, + "learning_rate": 9.359426680913295e-06, + "loss": 0.7711, + "step": 4041 + }, + { + "epoch": 1.0239392020265992, + "grad_norm": 3.5813560485839844, + "learning_rate": 9.359016317388677e-06, + "loss": 0.9614, + "step": 4042 + }, + { + "epoch": 1.0241925269157695, + "grad_norm": 3.7858824729919434, + "learning_rate": 9.358605831464921e-06, + "loss": 0.7475, + "step": 4043 + }, + { + "epoch": 1.0244458518049397, + "grad_norm": 3.810811996459961, + "learning_rate": 9.358195223153558e-06, + "loss": 0.862, + "step": 4044 + }, + { + "epoch": 1.0246991766941103, + "grad_norm": 3.565434455871582, + "learning_rate": 9.357784492466116e-06, + "loss": 0.798, + "step": 4045 + }, + { + "epoch": 1.0249525015832806, + "grad_norm": 3.3674070835113525, + "learning_rate": 9.357373639414127e-06, + "loss": 0.816, + "step": 4046 + }, + { + "epoch": 1.0252058264724508, + "grad_norm": 3.9376728534698486, + "learning_rate": 9.356962664009129e-06, + "loss": 0.7253, + "step": 4047 + }, + { + "epoch": 1.0254591513616214, + "grad_norm": 4.085208892822266, + "learning_rate": 9.356551566262661e-06, + "loss": 0.7994, + "step": 4048 + }, + { + "epoch": 1.0257124762507916, + "grad_norm": 3.700139045715332, + "learning_rate": 9.356140346186266e-06, + "loss": 0.7552, + "step": 4049 + }, + { + "epoch": 1.025965801139962, + "grad_norm": 3.817131280899048, + "learning_rate": 9.355729003791494e-06, + "loss": 0.7647, + "step": 4050 + }, + { + "epoch": 1.0262191260291325, + "grad_norm": 3.4104766845703125, + "learning_rate": 9.35531753908989e-06, + "loss": 0.6676, + "step": 4051 + }, + { + "epoch": 1.0264724509183027, + "grad_norm": 3.7993674278259277, + "learning_rate": 9.354905952093015e-06, + "loss": 0.8102, + "step": 4052 + }, + { + "epoch": 1.026725775807473, + "grad_norm": 3.708205223083496, + "learning_rate": 9.354494242812418e-06, + "loss": 0.7942, + "step": 4053 + }, + { + "epoch": 1.0269791006966436, + "grad_norm": 4.100223064422607, + "learning_rate": 9.354082411259664e-06, + "loss": 0.8309, + "step": 4054 + }, + { + "epoch": 1.0272324255858138, + "grad_norm": 3.8497891426086426, + "learning_rate": 9.353670457446318e-06, + "loss": 0.8815, + "step": 4055 + }, + { + "epoch": 1.0274857504749841, + "grad_norm": 3.486553430557251, + "learning_rate": 9.353258381383944e-06, + "loss": 0.7857, + "step": 4056 + }, + { + "epoch": 1.0277390753641544, + "grad_norm": 3.7564120292663574, + "learning_rate": 9.352846183084119e-06, + "loss": 0.7394, + "step": 4057 + }, + { + "epoch": 1.027992400253325, + "grad_norm": 3.5400476455688477, + "learning_rate": 9.35243386255841e-06, + "loss": 0.7344, + "step": 4058 + }, + { + "epoch": 1.0282457251424952, + "grad_norm": 3.2935454845428467, + "learning_rate": 9.352021419818398e-06, + "loss": 0.7182, + "step": 4059 + }, + { + "epoch": 1.0284990500316655, + "grad_norm": 3.8735382556915283, + "learning_rate": 9.351608854875665e-06, + "loss": 0.7742, + "step": 4060 + }, + { + "epoch": 1.028752374920836, + "grad_norm": 3.4187848567962646, + "learning_rate": 9.351196167741796e-06, + "loss": 0.7383, + "step": 4061 + }, + { + "epoch": 1.0290056998100063, + "grad_norm": 3.6276071071624756, + "learning_rate": 9.350783358428375e-06, + "loss": 0.7414, + "step": 4062 + }, + { + "epoch": 1.0292590246991766, + "grad_norm": 4.093981742858887, + "learning_rate": 9.350370426946998e-06, + "loss": 0.7992, + "step": 4063 + }, + { + "epoch": 1.0295123495883471, + "grad_norm": 4.762815952301025, + "learning_rate": 9.349957373309259e-06, + "loss": 0.8498, + "step": 4064 + }, + { + "epoch": 1.0297656744775174, + "grad_norm": 3.806854009628296, + "learning_rate": 9.349544197526755e-06, + "loss": 0.7386, + "step": 4065 + }, + { + "epoch": 1.0300189993666877, + "grad_norm": 3.7432122230529785, + "learning_rate": 9.349130899611088e-06, + "loss": 0.8265, + "step": 4066 + }, + { + "epoch": 1.0302723242558582, + "grad_norm": 4.310154914855957, + "learning_rate": 9.348717479573865e-06, + "loss": 0.8894, + "step": 4067 + }, + { + "epoch": 1.0305256491450285, + "grad_norm": 3.9261465072631836, + "learning_rate": 9.348303937426692e-06, + "loss": 0.9846, + "step": 4068 + }, + { + "epoch": 1.0307789740341988, + "grad_norm": 4.200754642486572, + "learning_rate": 9.347890273181183e-06, + "loss": 0.8683, + "step": 4069 + }, + { + "epoch": 1.0310322989233691, + "grad_norm": 3.166792631149292, + "learning_rate": 9.347476486848954e-06, + "loss": 0.7617, + "step": 4070 + }, + { + "epoch": 1.0312856238125396, + "grad_norm": 3.516883611679077, + "learning_rate": 9.347062578441622e-06, + "loss": 0.7223, + "step": 4071 + }, + { + "epoch": 1.03153894870171, + "grad_norm": 3.4659571647644043, + "learning_rate": 9.346648547970809e-06, + "loss": 0.7658, + "step": 4072 + }, + { + "epoch": 1.0317922735908802, + "grad_norm": 3.5516104698181152, + "learning_rate": 9.346234395448143e-06, + "loss": 0.9041, + "step": 4073 + }, + { + "epoch": 1.0320455984800507, + "grad_norm": 3.813924789428711, + "learning_rate": 9.345820120885252e-06, + "loss": 0.8464, + "step": 4074 + }, + { + "epoch": 1.032298923369221, + "grad_norm": 4.0255937576293945, + "learning_rate": 9.345405724293767e-06, + "loss": 0.9182, + "step": 4075 + }, + { + "epoch": 1.0325522482583913, + "grad_norm": 3.270711898803711, + "learning_rate": 9.34499120568533e-06, + "loss": 0.644, + "step": 4076 + }, + { + "epoch": 1.0328055731475618, + "grad_norm": 3.5418875217437744, + "learning_rate": 9.344576565071572e-06, + "loss": 0.7468, + "step": 4077 + }, + { + "epoch": 1.0330588980367321, + "grad_norm": 3.934638738632202, + "learning_rate": 9.344161802464143e-06, + "loss": 0.7684, + "step": 4078 + }, + { + "epoch": 1.0333122229259024, + "grad_norm": 3.533428430557251, + "learning_rate": 9.343746917874686e-06, + "loss": 0.8012, + "step": 4079 + }, + { + "epoch": 1.033565547815073, + "grad_norm": 3.596932888031006, + "learning_rate": 9.34333191131485e-06, + "loss": 0.7605, + "step": 4080 + }, + { + "epoch": 1.0338188727042432, + "grad_norm": 3.7631914615631104, + "learning_rate": 9.342916782796291e-06, + "loss": 0.8154, + "step": 4081 + }, + { + "epoch": 1.0340721975934135, + "grad_norm": 3.557291030883789, + "learning_rate": 9.342501532330666e-06, + "loss": 0.8532, + "step": 4082 + }, + { + "epoch": 1.0343255224825838, + "grad_norm": 3.152371406555176, + "learning_rate": 9.342086159929629e-06, + "loss": 0.7563, + "step": 4083 + }, + { + "epoch": 1.0345788473717543, + "grad_norm": 3.6007800102233887, + "learning_rate": 9.34167066560485e-06, + "loss": 0.647, + "step": 4084 + }, + { + "epoch": 1.0348321722609246, + "grad_norm": 3.5349671840667725, + "learning_rate": 9.341255049367994e-06, + "loss": 0.6733, + "step": 4085 + }, + { + "epoch": 1.035085497150095, + "grad_norm": 3.530944347381592, + "learning_rate": 9.34083931123073e-06, + "loss": 0.7737, + "step": 4086 + }, + { + "epoch": 1.0353388220392654, + "grad_norm": 3.9375393390655518, + "learning_rate": 9.340423451204733e-06, + "loss": 0.857, + "step": 4087 + }, + { + "epoch": 1.0355921469284357, + "grad_norm": 3.490175247192383, + "learning_rate": 9.34000746930168e-06, + "loss": 0.7594, + "step": 4088 + }, + { + "epoch": 1.035845471817606, + "grad_norm": 3.2738397121429443, + "learning_rate": 9.33959136553325e-06, + "loss": 0.7471, + "step": 4089 + }, + { + "epoch": 1.0360987967067765, + "grad_norm": 3.5194242000579834, + "learning_rate": 9.33917513991113e-06, + "loss": 0.6387, + "step": 4090 + }, + { + "epoch": 1.0363521215959468, + "grad_norm": 3.405498504638672, + "learning_rate": 9.338758792447005e-06, + "loss": 0.7327, + "step": 4091 + }, + { + "epoch": 1.036605446485117, + "grad_norm": 4.062540531158447, + "learning_rate": 9.338342323152566e-06, + "loss": 0.8626, + "step": 4092 + }, + { + "epoch": 1.0368587713742876, + "grad_norm": 3.594367742538452, + "learning_rate": 9.337925732039508e-06, + "loss": 0.7588, + "step": 4093 + }, + { + "epoch": 1.037112096263458, + "grad_norm": 3.48313307762146, + "learning_rate": 9.337509019119529e-06, + "loss": 0.7734, + "step": 4094 + }, + { + "epoch": 1.0373654211526282, + "grad_norm": 3.8747398853302, + "learning_rate": 9.33709218440433e-06, + "loss": 0.7602, + "step": 4095 + }, + { + "epoch": 1.0376187460417987, + "grad_norm": 3.6556296348571777, + "learning_rate": 9.336675227905614e-06, + "loss": 0.7492, + "step": 4096 + }, + { + "epoch": 1.037872070930969, + "grad_norm": 3.5307297706604004, + "learning_rate": 9.336258149635091e-06, + "loss": 0.6898, + "step": 4097 + }, + { + "epoch": 1.0381253958201393, + "grad_norm": 3.6485016345977783, + "learning_rate": 9.33584094960447e-06, + "loss": 0.8027, + "step": 4098 + }, + { + "epoch": 1.0383787207093096, + "grad_norm": 3.7099509239196777, + "learning_rate": 9.33542362782547e-06, + "loss": 0.8457, + "step": 4099 + }, + { + "epoch": 1.03863204559848, + "grad_norm": 3.612884283065796, + "learning_rate": 9.335006184309805e-06, + "loss": 0.7799, + "step": 4100 + }, + { + "epoch": 1.0388853704876504, + "grad_norm": 3.448322057723999, + "learning_rate": 9.334588619069197e-06, + "loss": 0.7126, + "step": 4101 + }, + { + "epoch": 1.0391386953768207, + "grad_norm": 3.669529914855957, + "learning_rate": 9.334170932115375e-06, + "loss": 0.6772, + "step": 4102 + }, + { + "epoch": 1.0393920202659912, + "grad_norm": 3.4435253143310547, + "learning_rate": 9.333753123460061e-06, + "loss": 0.6991, + "step": 4103 + }, + { + "epoch": 1.0396453451551615, + "grad_norm": 3.9661593437194824, + "learning_rate": 9.333335193114992e-06, + "loss": 0.8298, + "step": 4104 + }, + { + "epoch": 1.0398986700443318, + "grad_norm": 3.6540608406066895, + "learning_rate": 9.332917141091903e-06, + "loss": 0.7116, + "step": 4105 + }, + { + "epoch": 1.0401519949335023, + "grad_norm": 3.6850311756134033, + "learning_rate": 9.33249896740253e-06, + "loss": 0.6842, + "step": 4106 + }, + { + "epoch": 1.0404053198226726, + "grad_norm": 3.594804286956787, + "learning_rate": 9.332080672058617e-06, + "loss": 0.708, + "step": 4107 + }, + { + "epoch": 1.0406586447118429, + "grad_norm": 3.6079134941101074, + "learning_rate": 9.331662255071909e-06, + "loss": 0.8033, + "step": 4108 + }, + { + "epoch": 1.0409119696010134, + "grad_norm": 4.036489963531494, + "learning_rate": 9.331243716454154e-06, + "loss": 0.9127, + "step": 4109 + }, + { + "epoch": 1.0411652944901837, + "grad_norm": 4.392895221710205, + "learning_rate": 9.330825056217105e-06, + "loss": 0.7351, + "step": 4110 + }, + { + "epoch": 1.041418619379354, + "grad_norm": 3.399625062942505, + "learning_rate": 9.330406274372521e-06, + "loss": 0.7421, + "step": 4111 + }, + { + "epoch": 1.0416719442685243, + "grad_norm": 3.749479055404663, + "learning_rate": 9.329987370932157e-06, + "loss": 0.8002, + "step": 4112 + }, + { + "epoch": 1.0419252691576948, + "grad_norm": 3.7531774044036865, + "learning_rate": 9.329568345907776e-06, + "loss": 0.7491, + "step": 4113 + }, + { + "epoch": 1.042178594046865, + "grad_norm": 4.106726169586182, + "learning_rate": 9.329149199311148e-06, + "loss": 0.7408, + "step": 4114 + }, + { + "epoch": 1.0424319189360354, + "grad_norm": 3.7518279552459717, + "learning_rate": 9.328729931154036e-06, + "loss": 0.7299, + "step": 4115 + }, + { + "epoch": 1.0426852438252059, + "grad_norm": 3.997710704803467, + "learning_rate": 9.328310541448218e-06, + "loss": 0.7315, + "step": 4116 + }, + { + "epoch": 1.0429385687143762, + "grad_norm": 3.344733476638794, + "learning_rate": 9.327891030205467e-06, + "loss": 0.7416, + "step": 4117 + }, + { + "epoch": 1.0431918936035465, + "grad_norm": 3.4776430130004883, + "learning_rate": 9.327471397437567e-06, + "loss": 0.7233, + "step": 4118 + }, + { + "epoch": 1.043445218492717, + "grad_norm": 4.643021583557129, + "learning_rate": 9.327051643156295e-06, + "loss": 0.9628, + "step": 4119 + }, + { + "epoch": 1.0436985433818873, + "grad_norm": 3.613051414489746, + "learning_rate": 9.326631767373444e-06, + "loss": 0.8216, + "step": 4120 + }, + { + "epoch": 1.0439518682710576, + "grad_norm": 4.063212871551514, + "learning_rate": 9.3262117701008e-06, + "loss": 0.7857, + "step": 4121 + }, + { + "epoch": 1.044205193160228, + "grad_norm": 3.6081511974334717, + "learning_rate": 9.325791651350156e-06, + "loss": 0.6622, + "step": 4122 + }, + { + "epoch": 1.0444585180493984, + "grad_norm": 3.584641933441162, + "learning_rate": 9.325371411133309e-06, + "loss": 0.6775, + "step": 4123 + }, + { + "epoch": 1.0447118429385687, + "grad_norm": 3.308696985244751, + "learning_rate": 9.324951049462063e-06, + "loss": 0.7193, + "step": 4124 + }, + { + "epoch": 1.0449651678277392, + "grad_norm": 3.8555233478546143, + "learning_rate": 9.324530566348215e-06, + "loss": 0.857, + "step": 4125 + }, + { + "epoch": 1.0452184927169095, + "grad_norm": 3.670700788497925, + "learning_rate": 9.324109961803578e-06, + "loss": 0.8167, + "step": 4126 + }, + { + "epoch": 1.0454718176060798, + "grad_norm": 3.8492424488067627, + "learning_rate": 9.32368923583996e-06, + "loss": 0.7953, + "step": 4127 + }, + { + "epoch": 1.04572514249525, + "grad_norm": 3.4988362789154053, + "learning_rate": 9.323268388469173e-06, + "loss": 0.8068, + "step": 4128 + }, + { + "epoch": 1.0459784673844206, + "grad_norm": 3.37612247467041, + "learning_rate": 9.322847419703037e-06, + "loss": 0.7807, + "step": 4129 + }, + { + "epoch": 1.0462317922735909, + "grad_norm": 4.227519512176514, + "learning_rate": 9.322426329553371e-06, + "loss": 0.8334, + "step": 4130 + }, + { + "epoch": 1.0464851171627612, + "grad_norm": 3.745054244995117, + "learning_rate": 9.322005118032e-06, + "loss": 0.77, + "step": 4131 + }, + { + "epoch": 1.0467384420519317, + "grad_norm": 3.4740939140319824, + "learning_rate": 9.32158378515075e-06, + "loss": 0.8007, + "step": 4132 + }, + { + "epoch": 1.046991766941102, + "grad_norm": 3.788421154022217, + "learning_rate": 9.321162330921453e-06, + "loss": 0.7977, + "step": 4133 + }, + { + "epoch": 1.0472450918302723, + "grad_norm": 3.160989999771118, + "learning_rate": 9.320740755355944e-06, + "loss": 0.6265, + "step": 4134 + }, + { + "epoch": 1.0474984167194428, + "grad_norm": 3.3964905738830566, + "learning_rate": 9.32031905846606e-06, + "loss": 0.8452, + "step": 4135 + }, + { + "epoch": 1.047751741608613, + "grad_norm": 3.5575075149536133, + "learning_rate": 9.31989724026364e-06, + "loss": 0.8274, + "step": 4136 + }, + { + "epoch": 1.0480050664977834, + "grad_norm": 4.0009942054748535, + "learning_rate": 9.319475300760531e-06, + "loss": 0.9045, + "step": 4137 + }, + { + "epoch": 1.0482583913869539, + "grad_norm": 3.404421329498291, + "learning_rate": 9.319053239968581e-06, + "loss": 0.7146, + "step": 4138 + }, + { + "epoch": 1.0485117162761242, + "grad_norm": 3.66922664642334, + "learning_rate": 9.318631057899639e-06, + "loss": 0.7487, + "step": 4139 + }, + { + "epoch": 1.0487650411652945, + "grad_norm": 3.483755350112915, + "learning_rate": 9.31820875456556e-06, + "loss": 0.6974, + "step": 4140 + }, + { + "epoch": 1.0490183660544647, + "grad_norm": 3.725436210632324, + "learning_rate": 9.317786329978204e-06, + "loss": 0.8188, + "step": 4141 + }, + { + "epoch": 1.0492716909436353, + "grad_norm": 3.7148492336273193, + "learning_rate": 9.317363784149432e-06, + "loss": 0.7631, + "step": 4142 + }, + { + "epoch": 1.0495250158328056, + "grad_norm": 3.5920324325561523, + "learning_rate": 9.316941117091107e-06, + "loss": 0.7792, + "step": 4143 + }, + { + "epoch": 1.0497783407219758, + "grad_norm": 3.592470407485962, + "learning_rate": 9.3165183288151e-06, + "loss": 0.7027, + "step": 4144 + }, + { + "epoch": 1.0500316656111464, + "grad_norm": 3.8422300815582275, + "learning_rate": 9.316095419333281e-06, + "loss": 0.7744, + "step": 4145 + }, + { + "epoch": 1.0502849905003167, + "grad_norm": 3.559274196624756, + "learning_rate": 9.315672388657527e-06, + "loss": 0.7609, + "step": 4146 + }, + { + "epoch": 1.050538315389487, + "grad_norm": 3.2121517658233643, + "learning_rate": 9.315249236799713e-06, + "loss": 0.7698, + "step": 4147 + }, + { + "epoch": 1.0507916402786575, + "grad_norm": 3.810584306716919, + "learning_rate": 9.314825963771724e-06, + "loss": 0.7424, + "step": 4148 + }, + { + "epoch": 1.0510449651678277, + "grad_norm": 4.001245021820068, + "learning_rate": 9.314402569585443e-06, + "loss": 0.8759, + "step": 4149 + }, + { + "epoch": 1.051298290056998, + "grad_norm": 4.0187506675720215, + "learning_rate": 9.31397905425276e-06, + "loss": 0.8156, + "step": 4150 + }, + { + "epoch": 1.0515516149461686, + "grad_norm": 3.8050217628479004, + "learning_rate": 9.313555417785568e-06, + "loss": 0.7594, + "step": 4151 + }, + { + "epoch": 1.0518049398353388, + "grad_norm": 3.811541795730591, + "learning_rate": 9.31313166019576e-06, + "loss": 0.7524, + "step": 4152 + }, + { + "epoch": 1.0520582647245091, + "grad_norm": 3.808600664138794, + "learning_rate": 9.312707781495239e-06, + "loss": 0.869, + "step": 4153 + }, + { + "epoch": 1.0523115896136797, + "grad_norm": 3.384950876235962, + "learning_rate": 9.312283781695903e-06, + "loss": 0.6546, + "step": 4154 + }, + { + "epoch": 1.05256491450285, + "grad_norm": 3.976621389389038, + "learning_rate": 9.31185966080966e-06, + "loss": 0.8433, + "step": 4155 + }, + { + "epoch": 1.0528182393920202, + "grad_norm": 3.789478302001953, + "learning_rate": 9.311435418848419e-06, + "loss": 0.8427, + "step": 4156 + }, + { + "epoch": 1.0530715642811905, + "grad_norm": 4.111769199371338, + "learning_rate": 9.311011055824089e-06, + "loss": 0.9546, + "step": 4157 + }, + { + "epoch": 1.053324889170361, + "grad_norm": 3.746201515197754, + "learning_rate": 9.310586571748592e-06, + "loss": 0.8288, + "step": 4158 + }, + { + "epoch": 1.0535782140595313, + "grad_norm": 3.287363052368164, + "learning_rate": 9.310161966633845e-06, + "loss": 0.6317, + "step": 4159 + }, + { + "epoch": 1.0538315389487016, + "grad_norm": 3.618128538131714, + "learning_rate": 9.309737240491767e-06, + "loss": 0.8061, + "step": 4160 + }, + { + "epoch": 1.0540848638378721, + "grad_norm": 3.80902361869812, + "learning_rate": 9.30931239333429e-06, + "loss": 0.749, + "step": 4161 + }, + { + "epoch": 1.0543381887270424, + "grad_norm": 3.424065113067627, + "learning_rate": 9.308887425173339e-06, + "loss": 0.7547, + "step": 4162 + }, + { + "epoch": 1.0545915136162127, + "grad_norm": 3.779470443725586, + "learning_rate": 9.308462336020849e-06, + "loss": 0.8846, + "step": 4163 + }, + { + "epoch": 1.0548448385053832, + "grad_norm": 3.467844247817993, + "learning_rate": 9.308037125888756e-06, + "loss": 0.7141, + "step": 4164 + }, + { + "epoch": 1.0550981633945535, + "grad_norm": 3.766876697540283, + "learning_rate": 9.307611794789001e-06, + "loss": 0.8758, + "step": 4165 + }, + { + "epoch": 1.0553514882837238, + "grad_norm": 3.8177969455718994, + "learning_rate": 9.307186342733525e-06, + "loss": 0.7208, + "step": 4166 + }, + { + "epoch": 1.0556048131728943, + "grad_norm": 3.287339925765991, + "learning_rate": 9.306760769734273e-06, + "loss": 0.7097, + "step": 4167 + }, + { + "epoch": 1.0558581380620646, + "grad_norm": 3.8283634185791016, + "learning_rate": 9.3063350758032e-06, + "loss": 0.8535, + "step": 4168 + }, + { + "epoch": 1.056111462951235, + "grad_norm": 3.535898447036743, + "learning_rate": 9.305909260952255e-06, + "loss": 0.6666, + "step": 4169 + }, + { + "epoch": 1.0563647878404052, + "grad_norm": 3.6630406379699707, + "learning_rate": 9.305483325193397e-06, + "loss": 0.8016, + "step": 4170 + }, + { + "epoch": 1.0566181127295757, + "grad_norm": 4.214574337005615, + "learning_rate": 9.305057268538581e-06, + "loss": 0.8248, + "step": 4171 + }, + { + "epoch": 1.056871437618746, + "grad_norm": 3.893692970275879, + "learning_rate": 9.304631090999779e-06, + "loss": 0.8624, + "step": 4172 + }, + { + "epoch": 1.0571247625079163, + "grad_norm": 3.6482512950897217, + "learning_rate": 9.304204792588953e-06, + "loss": 0.8034, + "step": 4173 + }, + { + "epoch": 1.0573780873970868, + "grad_norm": 4.257894515991211, + "learning_rate": 9.303778373318073e-06, + "loss": 0.7821, + "step": 4174 + }, + { + "epoch": 1.0576314122862571, + "grad_norm": 3.569042682647705, + "learning_rate": 9.303351833199113e-06, + "loss": 0.8068, + "step": 4175 + }, + { + "epoch": 1.0578847371754274, + "grad_norm": 3.841977596282959, + "learning_rate": 9.30292517224405e-06, + "loss": 0.8009, + "step": 4176 + }, + { + "epoch": 1.058138062064598, + "grad_norm": 4.106107711791992, + "learning_rate": 9.302498390464868e-06, + "loss": 0.7948, + "step": 4177 + }, + { + "epoch": 1.0583913869537682, + "grad_norm": 3.645066022872925, + "learning_rate": 9.302071487873543e-06, + "loss": 0.7916, + "step": 4178 + }, + { + "epoch": 1.0586447118429385, + "grad_norm": 3.900665283203125, + "learning_rate": 9.30164446448207e-06, + "loss": 0.7928, + "step": 4179 + }, + { + "epoch": 1.058898036732109, + "grad_norm": 3.4762229919433594, + "learning_rate": 9.301217320302436e-06, + "loss": 0.7257, + "step": 4180 + }, + { + "epoch": 1.0591513616212793, + "grad_norm": 3.898374557495117, + "learning_rate": 9.300790055346634e-06, + "loss": 0.7139, + "step": 4181 + }, + { + "epoch": 1.0594046865104496, + "grad_norm": 3.7925961017608643, + "learning_rate": 9.300362669626665e-06, + "loss": 0.8422, + "step": 4182 + }, + { + "epoch": 1.0596580113996201, + "grad_norm": 3.7468175888061523, + "learning_rate": 9.299935163154527e-06, + "loss": 0.78, + "step": 4183 + }, + { + "epoch": 1.0599113362887904, + "grad_norm": 3.210975170135498, + "learning_rate": 9.299507535942224e-06, + "loss": 0.7004, + "step": 4184 + }, + { + "epoch": 1.0601646611779607, + "grad_norm": 3.7563984394073486, + "learning_rate": 9.299079788001766e-06, + "loss": 0.788, + "step": 4185 + }, + { + "epoch": 1.060417986067131, + "grad_norm": 3.920912742614746, + "learning_rate": 9.29865191934516e-06, + "loss": 0.7621, + "step": 4186 + }, + { + "epoch": 1.0606713109563015, + "grad_norm": 3.855454921722412, + "learning_rate": 9.298223929984425e-06, + "loss": 0.6967, + "step": 4187 + }, + { + "epoch": 1.0609246358454718, + "grad_norm": 4.34567403793335, + "learning_rate": 9.297795819931576e-06, + "loss": 0.962, + "step": 4188 + }, + { + "epoch": 1.061177960734642, + "grad_norm": 3.305553436279297, + "learning_rate": 9.297367589198635e-06, + "loss": 0.7294, + "step": 4189 + }, + { + "epoch": 1.0614312856238126, + "grad_norm": 3.634536027908325, + "learning_rate": 9.296939237797626e-06, + "loss": 0.8723, + "step": 4190 + }, + { + "epoch": 1.061684610512983, + "grad_norm": 3.124066114425659, + "learning_rate": 9.296510765740577e-06, + "loss": 0.7415, + "step": 4191 + }, + { + "epoch": 1.0619379354021532, + "grad_norm": 3.7073328495025635, + "learning_rate": 9.296082173039519e-06, + "loss": 0.7426, + "step": 4192 + }, + { + "epoch": 1.0621912602913237, + "grad_norm": 3.358052968978882, + "learning_rate": 9.295653459706488e-06, + "loss": 0.8083, + "step": 4193 + }, + { + "epoch": 1.062444585180494, + "grad_norm": 3.457728624343872, + "learning_rate": 9.29522462575352e-06, + "loss": 0.7909, + "step": 4194 + }, + { + "epoch": 1.0626979100696643, + "grad_norm": 3.4607529640197754, + "learning_rate": 9.294795671192657e-06, + "loss": 0.8017, + "step": 4195 + }, + { + "epoch": 1.0629512349588346, + "grad_norm": 3.8210277557373047, + "learning_rate": 9.294366596035947e-06, + "loss": 0.8498, + "step": 4196 + }, + { + "epoch": 1.063204559848005, + "grad_norm": 3.636653423309326, + "learning_rate": 9.293937400295433e-06, + "loss": 0.8401, + "step": 4197 + }, + { + "epoch": 1.0634578847371754, + "grad_norm": 3.7521755695343018, + "learning_rate": 9.293508083983171e-06, + "loss": 0.6958, + "step": 4198 + }, + { + "epoch": 1.0637112096263457, + "grad_norm": 3.7031736373901367, + "learning_rate": 9.293078647111214e-06, + "loss": 0.7144, + "step": 4199 + }, + { + "epoch": 1.0639645345155162, + "grad_norm": 3.623506546020508, + "learning_rate": 9.29264908969162e-06, + "loss": 0.6882, + "step": 4200 + }, + { + "epoch": 1.0642178594046865, + "grad_norm": 3.9294145107269287, + "learning_rate": 9.292219411736452e-06, + "loss": 0.7163, + "step": 4201 + }, + { + "epoch": 1.0644711842938568, + "grad_norm": 4.184792995452881, + "learning_rate": 9.291789613257774e-06, + "loss": 0.8392, + "step": 4202 + }, + { + "epoch": 1.0647245091830273, + "grad_norm": 4.025232791900635, + "learning_rate": 9.291359694267655e-06, + "loss": 0.6967, + "step": 4203 + }, + { + "epoch": 1.0649778340721976, + "grad_norm": 4.411888599395752, + "learning_rate": 9.290929654778168e-06, + "loss": 0.9006, + "step": 4204 + }, + { + "epoch": 1.0652311589613679, + "grad_norm": 3.92000412940979, + "learning_rate": 9.290499494801387e-06, + "loss": 0.7887, + "step": 4205 + }, + { + "epoch": 1.0654844838505384, + "grad_norm": 3.5988378524780273, + "learning_rate": 9.290069214349391e-06, + "loss": 0.7804, + "step": 4206 + }, + { + "epoch": 1.0657378087397087, + "grad_norm": 3.8604774475097656, + "learning_rate": 9.289638813434261e-06, + "loss": 0.8488, + "step": 4207 + }, + { + "epoch": 1.065991133628879, + "grad_norm": 3.5935158729553223, + "learning_rate": 9.289208292068086e-06, + "loss": 0.7512, + "step": 4208 + }, + { + "epoch": 1.0662444585180495, + "grad_norm": 3.192046880722046, + "learning_rate": 9.28877765026295e-06, + "loss": 0.69, + "step": 4209 + }, + { + "epoch": 1.0664977834072198, + "grad_norm": 3.207685708999634, + "learning_rate": 9.288346888030948e-06, + "loss": 0.7661, + "step": 4210 + }, + { + "epoch": 1.06675110829639, + "grad_norm": 3.6800575256347656, + "learning_rate": 9.287916005384177e-06, + "loss": 0.7619, + "step": 4211 + }, + { + "epoch": 1.0670044331855606, + "grad_norm": 3.293300151824951, + "learning_rate": 9.287485002334732e-06, + "loss": 0.7534, + "step": 4212 + }, + { + "epoch": 1.0672577580747309, + "grad_norm": 3.7135701179504395, + "learning_rate": 9.28705387889472e-06, + "loss": 0.8989, + "step": 4213 + }, + { + "epoch": 1.0675110829639012, + "grad_norm": 3.69555926322937, + "learning_rate": 9.286622635076242e-06, + "loss": 0.7713, + "step": 4214 + }, + { + "epoch": 1.0677644078530715, + "grad_norm": 3.442887783050537, + "learning_rate": 9.28619127089141e-06, + "loss": 0.7726, + "step": 4215 + }, + { + "epoch": 1.068017732742242, + "grad_norm": 3.5746614933013916, + "learning_rate": 9.285759786352337e-06, + "loss": 0.7489, + "step": 4216 + }, + { + "epoch": 1.0682710576314123, + "grad_norm": 3.65659499168396, + "learning_rate": 9.285328181471138e-06, + "loss": 0.7464, + "step": 4217 + }, + { + "epoch": 1.0685243825205826, + "grad_norm": 3.24165940284729, + "learning_rate": 9.28489645625993e-06, + "loss": 0.6833, + "step": 4218 + }, + { + "epoch": 1.068777707409753, + "grad_norm": 3.432204008102417, + "learning_rate": 9.284464610730842e-06, + "loss": 0.7081, + "step": 4219 + }, + { + "epoch": 1.0690310322989234, + "grad_norm": 3.7620961666107178, + "learning_rate": 9.28403264489599e-06, + "loss": 0.7481, + "step": 4220 + }, + { + "epoch": 1.0692843571880937, + "grad_norm": 3.600041389465332, + "learning_rate": 9.283600558767514e-06, + "loss": 0.753, + "step": 4221 + }, + { + "epoch": 1.0695376820772642, + "grad_norm": 3.680054187774658, + "learning_rate": 9.28316835235754e-06, + "loss": 0.765, + "step": 4222 + }, + { + "epoch": 1.0697910069664345, + "grad_norm": 4.144865036010742, + "learning_rate": 9.282736025678206e-06, + "loss": 0.9797, + "step": 4223 + }, + { + "epoch": 1.0700443318556048, + "grad_norm": 3.513779640197754, + "learning_rate": 9.282303578741652e-06, + "loss": 0.8246, + "step": 4224 + }, + { + "epoch": 1.070297656744775, + "grad_norm": 3.870331048965454, + "learning_rate": 9.28187101156002e-06, + "loss": 0.8284, + "step": 4225 + }, + { + "epoch": 1.0705509816339456, + "grad_norm": 3.7399344444274902, + "learning_rate": 9.281438324145454e-06, + "loss": 0.745, + "step": 4226 + }, + { + "epoch": 1.0708043065231159, + "grad_norm": 3.684959650039673, + "learning_rate": 9.28100551651011e-06, + "loss": 0.7379, + "step": 4227 + }, + { + "epoch": 1.0710576314122862, + "grad_norm": 3.779805898666382, + "learning_rate": 9.280572588666139e-06, + "loss": 0.8191, + "step": 4228 + }, + { + "epoch": 1.0713109563014567, + "grad_norm": 3.8943259716033936, + "learning_rate": 9.280139540625693e-06, + "loss": 0.9724, + "step": 4229 + }, + { + "epoch": 1.071564281190627, + "grad_norm": 4.2728705406188965, + "learning_rate": 9.279706372400936e-06, + "loss": 0.7738, + "step": 4230 + }, + { + "epoch": 1.0718176060797973, + "grad_norm": 3.7429397106170654, + "learning_rate": 9.27927308400403e-06, + "loss": 0.8156, + "step": 4231 + }, + { + "epoch": 1.0720709309689678, + "grad_norm": 3.7998812198638916, + "learning_rate": 9.27883967544714e-06, + "loss": 0.7416, + "step": 4232 + }, + { + "epoch": 1.072324255858138, + "grad_norm": 3.844186305999756, + "learning_rate": 9.278406146742438e-06, + "loss": 0.8633, + "step": 4233 + }, + { + "epoch": 1.0725775807473084, + "grad_norm": 3.5452141761779785, + "learning_rate": 9.277972497902097e-06, + "loss": 0.7541, + "step": 4234 + }, + { + "epoch": 1.0728309056364789, + "grad_norm": 3.764220952987671, + "learning_rate": 9.277538728938294e-06, + "loss": 0.8352, + "step": 4235 + }, + { + "epoch": 1.0730842305256492, + "grad_norm": 3.211428642272949, + "learning_rate": 9.277104839863207e-06, + "loss": 0.6962, + "step": 4236 + }, + { + "epoch": 1.0733375554148195, + "grad_norm": 3.6833016872406006, + "learning_rate": 9.276670830689021e-06, + "loss": 0.8521, + "step": 4237 + }, + { + "epoch": 1.07359088030399, + "grad_norm": 3.590811252593994, + "learning_rate": 9.276236701427924e-06, + "loss": 0.8868, + "step": 4238 + }, + { + "epoch": 1.0738442051931603, + "grad_norm": 3.729499578475952, + "learning_rate": 9.275802452092101e-06, + "loss": 0.874, + "step": 4239 + }, + { + "epoch": 1.0740975300823306, + "grad_norm": 3.695556163787842, + "learning_rate": 9.275368082693752e-06, + "loss": 0.737, + "step": 4240 + }, + { + "epoch": 1.0743508549715008, + "grad_norm": 3.678680896759033, + "learning_rate": 9.27493359324507e-06, + "loss": 0.7896, + "step": 4241 + }, + { + "epoch": 1.0746041798606714, + "grad_norm": 3.8089263439178467, + "learning_rate": 9.274498983758255e-06, + "loss": 0.8519, + "step": 4242 + }, + { + "epoch": 1.0748575047498417, + "grad_norm": 3.6422061920166016, + "learning_rate": 9.274064254245514e-06, + "loss": 0.8217, + "step": 4243 + }, + { + "epoch": 1.075110829639012, + "grad_norm": 3.469003200531006, + "learning_rate": 9.27362940471905e-06, + "loss": 0.695, + "step": 4244 + }, + { + "epoch": 1.0753641545281825, + "grad_norm": 3.448909044265747, + "learning_rate": 9.273194435191078e-06, + "loss": 0.7327, + "step": 4245 + }, + { + "epoch": 1.0756174794173528, + "grad_norm": 3.759049654006958, + "learning_rate": 9.272759345673807e-06, + "loss": 0.925, + "step": 4246 + }, + { + "epoch": 1.075870804306523, + "grad_norm": 3.563659906387329, + "learning_rate": 9.272324136179459e-06, + "loss": 0.7678, + "step": 4247 + }, + { + "epoch": 1.0761241291956936, + "grad_norm": 3.738771438598633, + "learning_rate": 9.271888806720248e-06, + "loss": 0.7844, + "step": 4248 + }, + { + "epoch": 1.0763774540848638, + "grad_norm": 3.7911908626556396, + "learning_rate": 9.271453357308401e-06, + "loss": 0.9547, + "step": 4249 + }, + { + "epoch": 1.0766307789740341, + "grad_norm": 3.996831178665161, + "learning_rate": 9.27101778795615e-06, + "loss": 0.8331, + "step": 4250 + }, + { + "epoch": 1.0768841038632047, + "grad_norm": 3.803457021713257, + "learning_rate": 9.270582098675718e-06, + "loss": 0.7729, + "step": 4251 + }, + { + "epoch": 1.077137428752375, + "grad_norm": 3.7224628925323486, + "learning_rate": 9.270146289479343e-06, + "loss": 0.7928, + "step": 4252 + }, + { + "epoch": 1.0773907536415452, + "grad_norm": 3.6929430961608887, + "learning_rate": 9.26971036037926e-06, + "loss": 0.8014, + "step": 4253 + }, + { + "epoch": 1.0776440785307155, + "grad_norm": 3.998534917831421, + "learning_rate": 9.269274311387712e-06, + "loss": 0.7043, + "step": 4254 + }, + { + "epoch": 1.077897403419886, + "grad_norm": 3.6632144451141357, + "learning_rate": 9.268838142516943e-06, + "loss": 0.7599, + "step": 4255 + }, + { + "epoch": 1.0781507283090563, + "grad_norm": 3.255547523498535, + "learning_rate": 9.2684018537792e-06, + "loss": 0.7823, + "step": 4256 + }, + { + "epoch": 1.0784040531982266, + "grad_norm": 4.060213565826416, + "learning_rate": 9.267965445186733e-06, + "loss": 0.8022, + "step": 4257 + }, + { + "epoch": 1.0786573780873971, + "grad_norm": 3.7716310024261475, + "learning_rate": 9.267528916751796e-06, + "loss": 0.8277, + "step": 4258 + }, + { + "epoch": 1.0789107029765674, + "grad_norm": 3.8046209812164307, + "learning_rate": 9.267092268486648e-06, + "loss": 0.7937, + "step": 4259 + }, + { + "epoch": 1.0791640278657377, + "grad_norm": 3.2104485034942627, + "learning_rate": 9.266655500403549e-06, + "loss": 0.6956, + "step": 4260 + }, + { + "epoch": 1.0794173527549082, + "grad_norm": 3.81565523147583, + "learning_rate": 9.266218612514763e-06, + "loss": 0.9505, + "step": 4261 + }, + { + "epoch": 1.0796706776440785, + "grad_norm": 3.8160922527313232, + "learning_rate": 9.265781604832558e-06, + "loss": 0.8061, + "step": 4262 + }, + { + "epoch": 1.0799240025332488, + "grad_norm": 3.815609931945801, + "learning_rate": 9.265344477369203e-06, + "loss": 0.7305, + "step": 4263 + }, + { + "epoch": 1.0801773274224193, + "grad_norm": 4.010029315948486, + "learning_rate": 9.264907230136977e-06, + "loss": 0.8887, + "step": 4264 + }, + { + "epoch": 1.0804306523115896, + "grad_norm": 4.082635879516602, + "learning_rate": 9.264469863148152e-06, + "loss": 0.8839, + "step": 4265 + }, + { + "epoch": 1.08068397720076, + "grad_norm": 3.54335618019104, + "learning_rate": 9.264032376415013e-06, + "loss": 0.6708, + "step": 4266 + }, + { + "epoch": 1.0809373020899304, + "grad_norm": 3.5170133113861084, + "learning_rate": 9.263594769949845e-06, + "loss": 0.7863, + "step": 4267 + }, + { + "epoch": 1.0811906269791007, + "grad_norm": 4.150489807128906, + "learning_rate": 9.263157043764932e-06, + "loss": 0.8792, + "step": 4268 + }, + { + "epoch": 1.081443951868271, + "grad_norm": 3.6395976543426514, + "learning_rate": 9.262719197872569e-06, + "loss": 0.7188, + "step": 4269 + }, + { + "epoch": 1.0816972767574413, + "grad_norm": 3.2906672954559326, + "learning_rate": 9.262281232285048e-06, + "loss": 0.7369, + "step": 4270 + }, + { + "epoch": 1.0819506016466118, + "grad_norm": 3.4226391315460205, + "learning_rate": 9.261843147014666e-06, + "loss": 0.7741, + "step": 4271 + }, + { + "epoch": 1.0822039265357821, + "grad_norm": 3.590555191040039, + "learning_rate": 9.261404942073729e-06, + "loss": 0.7568, + "step": 4272 + }, + { + "epoch": 1.0824572514249524, + "grad_norm": 3.4189095497131348, + "learning_rate": 9.260966617474535e-06, + "loss": 0.7826, + "step": 4273 + }, + { + "epoch": 1.082710576314123, + "grad_norm": 4.2814130783081055, + "learning_rate": 9.260528173229399e-06, + "loss": 0.7474, + "step": 4274 + }, + { + "epoch": 1.0829639012032932, + "grad_norm": 3.827186346054077, + "learning_rate": 9.260089609350626e-06, + "loss": 0.7962, + "step": 4275 + }, + { + "epoch": 1.0832172260924635, + "grad_norm": 4.3773298263549805, + "learning_rate": 9.259650925850534e-06, + "loss": 0.9521, + "step": 4276 + }, + { + "epoch": 1.083470550981634, + "grad_norm": 3.643171548843384, + "learning_rate": 9.259212122741441e-06, + "loss": 0.8232, + "step": 4277 + }, + { + "epoch": 1.0837238758708043, + "grad_norm": 4.062769412994385, + "learning_rate": 9.258773200035666e-06, + "loss": 0.9115, + "step": 4278 + }, + { + "epoch": 1.0839772007599746, + "grad_norm": 3.6062681674957275, + "learning_rate": 9.258334157745538e-06, + "loss": 0.8954, + "step": 4279 + }, + { + "epoch": 1.0842305256491451, + "grad_norm": 3.7051634788513184, + "learning_rate": 9.257894995883382e-06, + "loss": 0.7412, + "step": 4280 + }, + { + "epoch": 1.0844838505383154, + "grad_norm": 3.7709696292877197, + "learning_rate": 9.25745571446153e-06, + "loss": 0.7234, + "step": 4281 + }, + { + "epoch": 1.0847371754274857, + "grad_norm": 3.7264342308044434, + "learning_rate": 9.257016313492317e-06, + "loss": 0.7977, + "step": 4282 + }, + { + "epoch": 1.084990500316656, + "grad_norm": 3.64595103263855, + "learning_rate": 9.25657679298808e-06, + "loss": 0.8048, + "step": 4283 + }, + { + "epoch": 1.0852438252058265, + "grad_norm": 3.6637768745422363, + "learning_rate": 9.256137152961162e-06, + "loss": 0.8779, + "step": 4284 + }, + { + "epoch": 1.0854971500949968, + "grad_norm": 3.79441499710083, + "learning_rate": 9.255697393423907e-06, + "loss": 0.71, + "step": 4285 + }, + { + "epoch": 1.085750474984167, + "grad_norm": 4.033945560455322, + "learning_rate": 9.255257514388665e-06, + "loss": 0.8279, + "step": 4286 + }, + { + "epoch": 1.0860037998733376, + "grad_norm": 3.7677478790283203, + "learning_rate": 9.254817515867786e-06, + "loss": 0.8403, + "step": 4287 + }, + { + "epoch": 1.086257124762508, + "grad_norm": 3.4312496185302734, + "learning_rate": 9.254377397873626e-06, + "loss": 0.6923, + "step": 4288 + }, + { + "epoch": 1.0865104496516782, + "grad_norm": 3.560711622238159, + "learning_rate": 9.253937160418542e-06, + "loss": 0.8237, + "step": 4289 + }, + { + "epoch": 1.0867637745408487, + "grad_norm": 3.5195810794830322, + "learning_rate": 9.253496803514896e-06, + "loss": 0.6966, + "step": 4290 + }, + { + "epoch": 1.087017099430019, + "grad_norm": 4.035468101501465, + "learning_rate": 9.253056327175054e-06, + "loss": 0.7802, + "step": 4291 + }, + { + "epoch": 1.0872704243191893, + "grad_norm": 3.76127552986145, + "learning_rate": 9.252615731411382e-06, + "loss": 0.8538, + "step": 4292 + }, + { + "epoch": 1.0875237492083598, + "grad_norm": 3.775036334991455, + "learning_rate": 9.252175016236254e-06, + "loss": 0.787, + "step": 4293 + }, + { + "epoch": 1.08777707409753, + "grad_norm": 3.646313190460205, + "learning_rate": 9.251734181662046e-06, + "loss": 0.7934, + "step": 4294 + }, + { + "epoch": 1.0880303989867004, + "grad_norm": 3.2216339111328125, + "learning_rate": 9.251293227701133e-06, + "loss": 0.65, + "step": 4295 + }, + { + "epoch": 1.088283723875871, + "grad_norm": 3.8053488731384277, + "learning_rate": 9.250852154365898e-06, + "loss": 0.8331, + "step": 4296 + }, + { + "epoch": 1.0885370487650412, + "grad_norm": 3.5517189502716064, + "learning_rate": 9.25041096166873e-06, + "loss": 0.7879, + "step": 4297 + }, + { + "epoch": 1.0887903736542115, + "grad_norm": 3.8531367778778076, + "learning_rate": 9.249969649622013e-06, + "loss": 0.8339, + "step": 4298 + }, + { + "epoch": 1.0890436985433818, + "grad_norm": 3.5429439544677734, + "learning_rate": 9.249528218238139e-06, + "loss": 0.7619, + "step": 4299 + }, + { + "epoch": 1.0892970234325523, + "grad_norm": 3.2794110774993896, + "learning_rate": 9.249086667529504e-06, + "loss": 0.7274, + "step": 4300 + }, + { + "epoch": 1.0895503483217226, + "grad_norm": 3.6612181663513184, + "learning_rate": 9.248644997508506e-06, + "loss": 0.7675, + "step": 4301 + }, + { + "epoch": 1.0898036732108929, + "grad_norm": 3.957714557647705, + "learning_rate": 9.248203208187551e-06, + "loss": 0.9582, + "step": 4302 + }, + { + "epoch": 1.0900569981000634, + "grad_norm": 3.532169818878174, + "learning_rate": 9.24776129957904e-06, + "loss": 0.6714, + "step": 4303 + }, + { + "epoch": 1.0903103229892337, + "grad_norm": 3.4459142684936523, + "learning_rate": 9.247319271695382e-06, + "loss": 0.7545, + "step": 4304 + }, + { + "epoch": 1.090563647878404, + "grad_norm": 3.7273051738739014, + "learning_rate": 9.246877124548988e-06, + "loss": 0.6319, + "step": 4305 + }, + { + "epoch": 1.0908169727675745, + "grad_norm": 4.333641529083252, + "learning_rate": 9.246434858152277e-06, + "loss": 1.0639, + "step": 4306 + }, + { + "epoch": 1.0910702976567448, + "grad_norm": 3.7943003177642822, + "learning_rate": 9.245992472517664e-06, + "loss": 0.7914, + "step": 4307 + }, + { + "epoch": 1.091323622545915, + "grad_norm": 3.623751163482666, + "learning_rate": 9.245549967657572e-06, + "loss": 0.7618, + "step": 4308 + }, + { + "epoch": 1.0915769474350856, + "grad_norm": 3.7345826625823975, + "learning_rate": 9.245107343584427e-06, + "loss": 0.8078, + "step": 4309 + }, + { + "epoch": 1.091830272324256, + "grad_norm": 3.5183663368225098, + "learning_rate": 9.244664600310659e-06, + "loss": 0.7537, + "step": 4310 + }, + { + "epoch": 1.0920835972134262, + "grad_norm": 3.544497489929199, + "learning_rate": 9.244221737848697e-06, + "loss": 0.7847, + "step": 4311 + }, + { + "epoch": 1.0923369221025965, + "grad_norm": 3.587022304534912, + "learning_rate": 9.243778756210979e-06, + "loss": 0.8602, + "step": 4312 + }, + { + "epoch": 1.092590246991767, + "grad_norm": 3.5137152671813965, + "learning_rate": 9.243335655409941e-06, + "loss": 0.7009, + "step": 4313 + }, + { + "epoch": 1.0928435718809373, + "grad_norm": 3.9993653297424316, + "learning_rate": 9.242892435458026e-06, + "loss": 0.7765, + "step": 4314 + }, + { + "epoch": 1.0930968967701076, + "grad_norm": 3.4785187244415283, + "learning_rate": 9.242449096367683e-06, + "loss": 0.6808, + "step": 4315 + }, + { + "epoch": 1.093350221659278, + "grad_norm": 3.496853828430176, + "learning_rate": 9.242005638151354e-06, + "loss": 0.7718, + "step": 4316 + }, + { + "epoch": 1.0936035465484484, + "grad_norm": 3.3387272357940674, + "learning_rate": 9.241562060821498e-06, + "loss": 0.7598, + "step": 4317 + }, + { + "epoch": 1.0938568714376187, + "grad_norm": 4.0735859870910645, + "learning_rate": 9.241118364390565e-06, + "loss": 0.9256, + "step": 4318 + }, + { + "epoch": 1.0941101963267892, + "grad_norm": 3.7407758235931396, + "learning_rate": 9.240674548871017e-06, + "loss": 0.6623, + "step": 4319 + }, + { + "epoch": 1.0943635212159595, + "grad_norm": 3.656137228012085, + "learning_rate": 9.240230614275316e-06, + "loss": 0.7394, + "step": 4320 + }, + { + "epoch": 1.0946168461051298, + "grad_norm": 3.55597186088562, + "learning_rate": 9.239786560615925e-06, + "loss": 0.7031, + "step": 4321 + }, + { + "epoch": 1.0948701709943003, + "grad_norm": 4.085122108459473, + "learning_rate": 9.239342387905314e-06, + "loss": 0.768, + "step": 4322 + }, + { + "epoch": 1.0951234958834706, + "grad_norm": 4.219598293304443, + "learning_rate": 9.238898096155958e-06, + "loss": 0.8081, + "step": 4323 + }, + { + "epoch": 1.0953768207726409, + "grad_norm": 3.7133283615112305, + "learning_rate": 9.238453685380329e-06, + "loss": 0.6618, + "step": 4324 + }, + { + "epoch": 1.0956301456618114, + "grad_norm": 3.9537534713745117, + "learning_rate": 9.238009155590906e-06, + "loss": 0.8014, + "step": 4325 + }, + { + "epoch": 1.0958834705509817, + "grad_norm": 3.4699196815490723, + "learning_rate": 9.237564506800174e-06, + "loss": 0.7975, + "step": 4326 + }, + { + "epoch": 1.096136795440152, + "grad_norm": 3.6257779598236084, + "learning_rate": 9.237119739020616e-06, + "loss": 0.7796, + "step": 4327 + }, + { + "epoch": 1.0963901203293223, + "grad_norm": 3.7349250316619873, + "learning_rate": 9.23667485226472e-06, + "loss": 0.7903, + "step": 4328 + }, + { + "epoch": 1.0966434452184928, + "grad_norm": 3.5463314056396484, + "learning_rate": 9.236229846544983e-06, + "loss": 0.8036, + "step": 4329 + }, + { + "epoch": 1.096896770107663, + "grad_norm": 3.45389461517334, + "learning_rate": 9.235784721873895e-06, + "loss": 0.8413, + "step": 4330 + }, + { + "epoch": 1.0971500949968334, + "grad_norm": 3.915783166885376, + "learning_rate": 9.235339478263958e-06, + "loss": 0.7464, + "step": 4331 + }, + { + "epoch": 1.0974034198860039, + "grad_norm": 3.667205810546875, + "learning_rate": 9.234894115727673e-06, + "loss": 0.8264, + "step": 4332 + }, + { + "epoch": 1.0976567447751742, + "grad_norm": 3.0400261878967285, + "learning_rate": 9.234448634277547e-06, + "loss": 0.6946, + "step": 4333 + }, + { + "epoch": 1.0979100696643445, + "grad_norm": 3.7669754028320312, + "learning_rate": 9.234003033926087e-06, + "loss": 0.7198, + "step": 4334 + }, + { + "epoch": 1.098163394553515, + "grad_norm": 3.7670071125030518, + "learning_rate": 9.233557314685806e-06, + "loss": 0.7183, + "step": 4335 + }, + { + "epoch": 1.0984167194426853, + "grad_norm": 4.155233860015869, + "learning_rate": 9.23311147656922e-06, + "loss": 0.8613, + "step": 4336 + }, + { + "epoch": 1.0986700443318556, + "grad_norm": 3.7832155227661133, + "learning_rate": 9.232665519588848e-06, + "loss": 0.7233, + "step": 4337 + }, + { + "epoch": 1.098923369221026, + "grad_norm": 3.5161235332489014, + "learning_rate": 9.232219443757212e-06, + "loss": 0.808, + "step": 4338 + }, + { + "epoch": 1.0991766941101964, + "grad_norm": 3.6883702278137207, + "learning_rate": 9.231773249086838e-06, + "loss": 0.7893, + "step": 4339 + }, + { + "epoch": 1.0994300189993667, + "grad_norm": 3.724613904953003, + "learning_rate": 9.231326935590252e-06, + "loss": 0.7927, + "step": 4340 + }, + { + "epoch": 1.099683343888537, + "grad_norm": 4.20599365234375, + "learning_rate": 9.230880503279991e-06, + "loss": 0.8809, + "step": 4341 + }, + { + "epoch": 1.0999366687777075, + "grad_norm": 3.3758602142333984, + "learning_rate": 9.230433952168588e-06, + "loss": 0.7751, + "step": 4342 + }, + { + "epoch": 1.1001899936668778, + "grad_norm": 4.082566261291504, + "learning_rate": 9.229987282268582e-06, + "loss": 0.9636, + "step": 4343 + }, + { + "epoch": 1.100443318556048, + "grad_norm": 3.775946617126465, + "learning_rate": 9.229540493592517e-06, + "loss": 0.762, + "step": 4344 + }, + { + "epoch": 1.1006966434452186, + "grad_norm": 3.816842794418335, + "learning_rate": 9.229093586152936e-06, + "loss": 0.811, + "step": 4345 + }, + { + "epoch": 1.1009499683343889, + "grad_norm": 3.738156795501709, + "learning_rate": 9.228646559962388e-06, + "loss": 0.7242, + "step": 4346 + }, + { + "epoch": 1.1012032932235591, + "grad_norm": 3.452763557434082, + "learning_rate": 9.22819941503343e-06, + "loss": 0.7588, + "step": 4347 + }, + { + "epoch": 1.1014566181127297, + "grad_norm": 3.708622932434082, + "learning_rate": 9.22775215137861e-06, + "loss": 0.8296, + "step": 4348 + }, + { + "epoch": 1.1017099430019, + "grad_norm": 3.7021007537841797, + "learning_rate": 9.227304769010494e-06, + "loss": 0.8233, + "step": 4349 + }, + { + "epoch": 1.1019632678910702, + "grad_norm": 3.600891351699829, + "learning_rate": 9.22685726794164e-06, + "loss": 0.6824, + "step": 4350 + }, + { + "epoch": 1.1022165927802408, + "grad_norm": 3.5252370834350586, + "learning_rate": 9.226409648184616e-06, + "loss": 0.7261, + "step": 4351 + }, + { + "epoch": 1.102469917669411, + "grad_norm": 3.488820791244507, + "learning_rate": 9.225961909751987e-06, + "loss": 0.6659, + "step": 4352 + }, + { + "epoch": 1.1027232425585813, + "grad_norm": 3.602959394454956, + "learning_rate": 9.225514052656332e-06, + "loss": 0.7595, + "step": 4353 + }, + { + "epoch": 1.1029765674477519, + "grad_norm": 3.727651834487915, + "learning_rate": 9.22506607691022e-06, + "loss": 0.8803, + "step": 4354 + }, + { + "epoch": 1.1032298923369221, + "grad_norm": 4.054435729980469, + "learning_rate": 9.224617982526236e-06, + "loss": 0.8313, + "step": 4355 + }, + { + "epoch": 1.1034832172260924, + "grad_norm": 3.574852466583252, + "learning_rate": 9.224169769516957e-06, + "loss": 0.7235, + "step": 4356 + }, + { + "epoch": 1.1037365421152627, + "grad_norm": 3.9324018955230713, + "learning_rate": 9.22372143789497e-06, + "loss": 0.7576, + "step": 4357 + }, + { + "epoch": 1.1039898670044332, + "grad_norm": 3.902390718460083, + "learning_rate": 9.223272987672865e-06, + "loss": 0.9231, + "step": 4358 + }, + { + "epoch": 1.1042431918936035, + "grad_norm": 3.8140463829040527, + "learning_rate": 9.222824418863234e-06, + "loss": 0.6814, + "step": 4359 + }, + { + "epoch": 1.1044965167827738, + "grad_norm": 4.1273722648620605, + "learning_rate": 9.222375731478673e-06, + "loss": 0.8463, + "step": 4360 + }, + { + "epoch": 1.1047498416719443, + "grad_norm": 3.649075746536255, + "learning_rate": 9.221926925531778e-06, + "loss": 0.6866, + "step": 4361 + }, + { + "epoch": 1.1050031665611146, + "grad_norm": 3.250063180923462, + "learning_rate": 9.221478001035158e-06, + "loss": 0.8044, + "step": 4362 + }, + { + "epoch": 1.105256491450285, + "grad_norm": 3.6262922286987305, + "learning_rate": 9.221028958001411e-06, + "loss": 0.7421, + "step": 4363 + }, + { + "epoch": 1.1055098163394554, + "grad_norm": 3.5227932929992676, + "learning_rate": 9.22057979644315e-06, + "loss": 0.7195, + "step": 4364 + }, + { + "epoch": 1.1057631412286257, + "grad_norm": 3.579219102859497, + "learning_rate": 9.220130516372986e-06, + "loss": 0.8423, + "step": 4365 + }, + { + "epoch": 1.106016466117796, + "grad_norm": 3.35093355178833, + "learning_rate": 9.219681117803537e-06, + "loss": 0.7214, + "step": 4366 + }, + { + "epoch": 1.1062697910069663, + "grad_norm": 3.8864123821258545, + "learning_rate": 9.219231600747418e-06, + "loss": 0.686, + "step": 4367 + }, + { + "epoch": 1.1065231158961368, + "grad_norm": 3.799591302871704, + "learning_rate": 9.218781965217252e-06, + "loss": 0.7165, + "step": 4368 + }, + { + "epoch": 1.1067764407853071, + "grad_norm": 3.962235450744629, + "learning_rate": 9.218332211225667e-06, + "loss": 0.8037, + "step": 4369 + }, + { + "epoch": 1.1070297656744774, + "grad_norm": 3.679168939590454, + "learning_rate": 9.21788233878529e-06, + "loss": 0.7558, + "step": 4370 + }, + { + "epoch": 1.107283090563648, + "grad_norm": 3.9936811923980713, + "learning_rate": 9.217432347908754e-06, + "loss": 0.8641, + "step": 4371 + }, + { + "epoch": 1.1075364154528182, + "grad_norm": 3.7278389930725098, + "learning_rate": 9.216982238608696e-06, + "loss": 0.8261, + "step": 4372 + }, + { + "epoch": 1.1077897403419885, + "grad_norm": 4.024977684020996, + "learning_rate": 9.216532010897751e-06, + "loss": 0.8757, + "step": 4373 + }, + { + "epoch": 1.108043065231159, + "grad_norm": 3.846114158630371, + "learning_rate": 9.216081664788565e-06, + "loss": 0.8498, + "step": 4374 + }, + { + "epoch": 1.1082963901203293, + "grad_norm": 3.6417713165283203, + "learning_rate": 9.21563120029378e-06, + "loss": 0.7203, + "step": 4375 + }, + { + "epoch": 1.1085497150094996, + "grad_norm": 3.7245826721191406, + "learning_rate": 9.215180617426047e-06, + "loss": 0.8541, + "step": 4376 + }, + { + "epoch": 1.1088030398986701, + "grad_norm": 3.698324680328369, + "learning_rate": 9.21472991619802e-06, + "loss": 0.8451, + "step": 4377 + }, + { + "epoch": 1.1090563647878404, + "grad_norm": 4.14930534362793, + "learning_rate": 9.21427909662235e-06, + "loss": 0.8199, + "step": 4378 + }, + { + "epoch": 1.1093096896770107, + "grad_norm": 3.709798574447632, + "learning_rate": 9.213828158711699e-06, + "loss": 0.755, + "step": 4379 + }, + { + "epoch": 1.1095630145661812, + "grad_norm": 3.5829548835754395, + "learning_rate": 9.213377102478728e-06, + "loss": 0.7758, + "step": 4380 + }, + { + "epoch": 1.1098163394553515, + "grad_norm": 4.226644992828369, + "learning_rate": 9.212925927936103e-06, + "loss": 1.0089, + "step": 4381 + }, + { + "epoch": 1.1100696643445218, + "grad_norm": 3.9692909717559814, + "learning_rate": 9.21247463509649e-06, + "loss": 0.8052, + "step": 4382 + }, + { + "epoch": 1.1103229892336923, + "grad_norm": 4.0409111976623535, + "learning_rate": 9.212023223972568e-06, + "loss": 0.8921, + "step": 4383 + }, + { + "epoch": 1.1105763141228626, + "grad_norm": 3.563741445541382, + "learning_rate": 9.211571694577006e-06, + "loss": 0.7763, + "step": 4384 + }, + { + "epoch": 1.110829639012033, + "grad_norm": 4.491901397705078, + "learning_rate": 9.211120046922483e-06, + "loss": 0.731, + "step": 4385 + }, + { + "epoch": 1.1110829639012032, + "grad_norm": 3.8411054611206055, + "learning_rate": 9.210668281021683e-06, + "loss": 0.8005, + "step": 4386 + }, + { + "epoch": 1.1113362887903737, + "grad_norm": 4.127683639526367, + "learning_rate": 9.210216396887293e-06, + "loss": 0.8026, + "step": 4387 + }, + { + "epoch": 1.111589613679544, + "grad_norm": 3.6287953853607178, + "learning_rate": 9.209764394532e-06, + "loss": 0.7264, + "step": 4388 + }, + { + "epoch": 1.1118429385687143, + "grad_norm": 4.0084733963012695, + "learning_rate": 9.209312273968493e-06, + "loss": 0.8431, + "step": 4389 + }, + { + "epoch": 1.1120962634578848, + "grad_norm": 3.1336419582366943, + "learning_rate": 9.208860035209472e-06, + "loss": 0.687, + "step": 4390 + }, + { + "epoch": 1.112349588347055, + "grad_norm": 3.6700150966644287, + "learning_rate": 9.208407678267635e-06, + "loss": 0.8033, + "step": 4391 + }, + { + "epoch": 1.1126029132362254, + "grad_norm": 3.8876240253448486, + "learning_rate": 9.207955203155681e-06, + "loss": 0.7619, + "step": 4392 + }, + { + "epoch": 1.112856238125396, + "grad_norm": 3.584474802017212, + "learning_rate": 9.20750260988632e-06, + "loss": 0.7602, + "step": 4393 + }, + { + "epoch": 1.1131095630145662, + "grad_norm": 3.9350109100341797, + "learning_rate": 9.207049898472253e-06, + "loss": 0.8195, + "step": 4394 + }, + { + "epoch": 1.1133628879037365, + "grad_norm": 3.685990571975708, + "learning_rate": 9.2065970689262e-06, + "loss": 0.8204, + "step": 4395 + }, + { + "epoch": 1.1136162127929068, + "grad_norm": 4.11929178237915, + "learning_rate": 9.206144121260871e-06, + "loss": 0.755, + "step": 4396 + }, + { + "epoch": 1.1138695376820773, + "grad_norm": 3.9196510314941406, + "learning_rate": 9.205691055488987e-06, + "loss": 0.8139, + "step": 4397 + }, + { + "epoch": 1.1141228625712476, + "grad_norm": 3.9029970169067383, + "learning_rate": 9.20523787162327e-06, + "loss": 0.7376, + "step": 4398 + }, + { + "epoch": 1.1143761874604179, + "grad_norm": 3.4818239212036133, + "learning_rate": 9.204784569676444e-06, + "loss": 0.7489, + "step": 4399 + }, + { + "epoch": 1.1146295123495884, + "grad_norm": 3.872526168823242, + "learning_rate": 9.20433114966124e-06, + "loss": 0.6817, + "step": 4400 + }, + { + "epoch": 1.1148828372387587, + "grad_norm": 3.580820083618164, + "learning_rate": 9.203877611590384e-06, + "loss": 0.833, + "step": 4401 + }, + { + "epoch": 1.115136162127929, + "grad_norm": 3.9937806129455566, + "learning_rate": 9.203423955476616e-06, + "loss": 0.7673, + "step": 4402 + }, + { + "epoch": 1.1153894870170995, + "grad_norm": 3.680983066558838, + "learning_rate": 9.202970181332674e-06, + "loss": 0.6896, + "step": 4403 + }, + { + "epoch": 1.1156428119062698, + "grad_norm": 3.7662012577056885, + "learning_rate": 9.2025162891713e-06, + "loss": 0.8438, + "step": 4404 + }, + { + "epoch": 1.11589613679544, + "grad_norm": 3.911691188812256, + "learning_rate": 9.202062279005237e-06, + "loss": 0.9176, + "step": 4405 + }, + { + "epoch": 1.1161494616846106, + "grad_norm": 3.595184326171875, + "learning_rate": 9.201608150847234e-06, + "loss": 0.7462, + "step": 4406 + }, + { + "epoch": 1.116402786573781, + "grad_norm": 4.015063762664795, + "learning_rate": 9.201153904710045e-06, + "loss": 0.8581, + "step": 4407 + }, + { + "epoch": 1.1166561114629512, + "grad_norm": 3.923367500305176, + "learning_rate": 9.200699540606423e-06, + "loss": 0.7719, + "step": 4408 + }, + { + "epoch": 1.1169094363521217, + "grad_norm": 3.704188823699951, + "learning_rate": 9.200245058549127e-06, + "loss": 0.7193, + "step": 4409 + }, + { + "epoch": 1.117162761241292, + "grad_norm": 4.123978137969971, + "learning_rate": 9.199790458550917e-06, + "loss": 0.7962, + "step": 4410 + }, + { + "epoch": 1.1174160861304623, + "grad_norm": 3.5524094104766846, + "learning_rate": 9.19933574062456e-06, + "loss": 0.7907, + "step": 4411 + }, + { + "epoch": 1.1176694110196326, + "grad_norm": 3.885824203491211, + "learning_rate": 9.198880904782823e-06, + "loss": 0.7299, + "step": 4412 + }, + { + "epoch": 1.117922735908803, + "grad_norm": 3.6361563205718994, + "learning_rate": 9.19842595103848e-06, + "loss": 0.7897, + "step": 4413 + }, + { + "epoch": 1.1181760607979734, + "grad_norm": 3.3709397315979004, + "learning_rate": 9.1979708794043e-06, + "loss": 0.6722, + "step": 4414 + }, + { + "epoch": 1.1184293856871437, + "grad_norm": 3.6834933757781982, + "learning_rate": 9.197515689893068e-06, + "loss": 0.7568, + "step": 4415 + }, + { + "epoch": 1.1186827105763142, + "grad_norm": 3.6808292865753174, + "learning_rate": 9.197060382517563e-06, + "loss": 0.682, + "step": 4416 + }, + { + "epoch": 1.1189360354654845, + "grad_norm": 3.3640832901000977, + "learning_rate": 9.196604957290569e-06, + "loss": 0.7307, + "step": 4417 + }, + { + "epoch": 1.1191893603546548, + "grad_norm": 3.6205482482910156, + "learning_rate": 9.196149414224875e-06, + "loss": 0.7453, + "step": 4418 + }, + { + "epoch": 1.1194426852438253, + "grad_norm": 3.865307092666626, + "learning_rate": 9.195693753333271e-06, + "loss": 0.8446, + "step": 4419 + }, + { + "epoch": 1.1196960101329956, + "grad_norm": 3.702802896499634, + "learning_rate": 9.195237974628555e-06, + "loss": 0.6828, + "step": 4420 + }, + { + "epoch": 1.1199493350221659, + "grad_norm": 3.967543125152588, + "learning_rate": 9.19478207812352e-06, + "loss": 0.8518, + "step": 4421 + }, + { + "epoch": 1.1202026599113364, + "grad_norm": 3.5002949237823486, + "learning_rate": 9.194326063830972e-06, + "loss": 0.6606, + "step": 4422 + }, + { + "epoch": 1.1204559848005067, + "grad_norm": 3.9500465393066406, + "learning_rate": 9.193869931763715e-06, + "loss": 0.8085, + "step": 4423 + }, + { + "epoch": 1.120709309689677, + "grad_norm": 3.9784748554229736, + "learning_rate": 9.193413681934553e-06, + "loss": 0.6886, + "step": 4424 + }, + { + "epoch": 1.1209626345788473, + "grad_norm": 3.8072926998138428, + "learning_rate": 9.192957314356303e-06, + "loss": 0.8054, + "step": 4425 + }, + { + "epoch": 1.1212159594680178, + "grad_norm": 3.774982452392578, + "learning_rate": 9.192500829041775e-06, + "loss": 0.8279, + "step": 4426 + }, + { + "epoch": 1.121469284357188, + "grad_norm": 3.39636492729187, + "learning_rate": 9.19204422600379e-06, + "loss": 0.7936, + "step": 4427 + }, + { + "epoch": 1.1217226092463584, + "grad_norm": 3.933150053024292, + "learning_rate": 9.191587505255166e-06, + "loss": 0.7482, + "step": 4428 + }, + { + "epoch": 1.1219759341355289, + "grad_norm": 3.645341634750366, + "learning_rate": 9.19113066680873e-06, + "loss": 0.8449, + "step": 4429 + }, + { + "epoch": 1.1222292590246992, + "grad_norm": 3.5464110374450684, + "learning_rate": 9.190673710677308e-06, + "loss": 0.7274, + "step": 4430 + }, + { + "epoch": 1.1224825839138695, + "grad_norm": 3.369464635848999, + "learning_rate": 9.190216636873735e-06, + "loss": 0.749, + "step": 4431 + }, + { + "epoch": 1.12273590880304, + "grad_norm": 3.8639848232269287, + "learning_rate": 9.189759445410841e-06, + "loss": 0.8226, + "step": 4432 + }, + { + "epoch": 1.1229892336922103, + "grad_norm": 3.737210988998413, + "learning_rate": 9.189302136301466e-06, + "loss": 0.7601, + "step": 4433 + }, + { + "epoch": 1.1232425585813806, + "grad_norm": 4.071779251098633, + "learning_rate": 9.18884470955845e-06, + "loss": 0.9538, + "step": 4434 + }, + { + "epoch": 1.123495883470551, + "grad_norm": 3.3898377418518066, + "learning_rate": 9.18838716519464e-06, + "loss": 0.7252, + "step": 4435 + }, + { + "epoch": 1.1237492083597214, + "grad_norm": 3.7422585487365723, + "learning_rate": 9.187929503222879e-06, + "loss": 0.7957, + "step": 4436 + }, + { + "epoch": 1.1240025332488917, + "grad_norm": 3.3443500995635986, + "learning_rate": 9.18747172365602e-06, + "loss": 0.7858, + "step": 4437 + }, + { + "epoch": 1.1242558581380622, + "grad_norm": 3.7862918376922607, + "learning_rate": 9.187013826506919e-06, + "loss": 0.7939, + "step": 4438 + }, + { + "epoch": 1.1245091830272325, + "grad_norm": 3.5818467140197754, + "learning_rate": 9.186555811788431e-06, + "loss": 0.8862, + "step": 4439 + }, + { + "epoch": 1.1247625079164028, + "grad_norm": 3.5962002277374268, + "learning_rate": 9.186097679513419e-06, + "loss": 0.8394, + "step": 4440 + }, + { + "epoch": 1.1250158328055733, + "grad_norm": 3.585381031036377, + "learning_rate": 9.185639429694744e-06, + "loss": 0.6962, + "step": 4441 + }, + { + "epoch": 1.1252691576947436, + "grad_norm": 3.802293300628662, + "learning_rate": 9.185181062345278e-06, + "loss": 0.8747, + "step": 4442 + }, + { + "epoch": 1.1255224825839139, + "grad_norm": 3.661857843399048, + "learning_rate": 9.184722577477889e-06, + "loss": 0.7727, + "step": 4443 + }, + { + "epoch": 1.1257758074730841, + "grad_norm": 3.70066237449646, + "learning_rate": 9.18426397510545e-06, + "loss": 0.7525, + "step": 4444 + }, + { + "epoch": 1.1260291323622547, + "grad_norm": 3.3943498134613037, + "learning_rate": 9.183805255240841e-06, + "loss": 0.6573, + "step": 4445 + }, + { + "epoch": 1.126282457251425, + "grad_norm": 3.671624183654785, + "learning_rate": 9.183346417896938e-06, + "loss": 0.8727, + "step": 4446 + }, + { + "epoch": 1.1265357821405952, + "grad_norm": 4.023936748504639, + "learning_rate": 9.182887463086633e-06, + "loss": 0.8415, + "step": 4447 + }, + { + "epoch": 1.1267891070297658, + "grad_norm": 3.890685796737671, + "learning_rate": 9.182428390822806e-06, + "loss": 0.9273, + "step": 4448 + }, + { + "epoch": 1.127042431918936, + "grad_norm": 3.540053129196167, + "learning_rate": 9.18196920111835e-06, + "loss": 0.7377, + "step": 4449 + }, + { + "epoch": 1.1272957568081063, + "grad_norm": 3.9170687198638916, + "learning_rate": 9.181509893986158e-06, + "loss": 0.7624, + "step": 4450 + }, + { + "epoch": 1.1275490816972766, + "grad_norm": 3.3918919563293457, + "learning_rate": 9.18105046943913e-06, + "loss": 0.7004, + "step": 4451 + }, + { + "epoch": 1.1278024065864471, + "grad_norm": 3.777207136154175, + "learning_rate": 9.180590927490163e-06, + "loss": 0.8214, + "step": 4452 + }, + { + "epoch": 1.1280557314756174, + "grad_norm": 3.8879199028015137, + "learning_rate": 9.180131268152164e-06, + "loss": 0.8458, + "step": 4453 + }, + { + "epoch": 1.1283090563647877, + "grad_norm": 4.480652332305908, + "learning_rate": 9.179671491438035e-06, + "loss": 0.8267, + "step": 4454 + }, + { + "epoch": 1.1285623812539582, + "grad_norm": 3.8782782554626465, + "learning_rate": 9.179211597360693e-06, + "loss": 0.8176, + "step": 4455 + }, + { + "epoch": 1.1288157061431285, + "grad_norm": 4.272160530090332, + "learning_rate": 9.178751585933046e-06, + "loss": 0.8025, + "step": 4456 + }, + { + "epoch": 1.1290690310322988, + "grad_norm": 3.7379496097564697, + "learning_rate": 9.178291457168012e-06, + "loss": 0.8301, + "step": 4457 + }, + { + "epoch": 1.1293223559214693, + "grad_norm": 3.911961317062378, + "learning_rate": 9.177831211078513e-06, + "loss": 0.6796, + "step": 4458 + }, + { + "epoch": 1.1295756808106396, + "grad_norm": 3.614490032196045, + "learning_rate": 9.177370847677472e-06, + "loss": 0.7909, + "step": 4459 + }, + { + "epoch": 1.12982900569981, + "grad_norm": 4.423027038574219, + "learning_rate": 9.176910366977816e-06, + "loss": 0.7512, + "step": 4460 + }, + { + "epoch": 1.1300823305889804, + "grad_norm": 3.891252279281616, + "learning_rate": 9.176449768992474e-06, + "loss": 0.9182, + "step": 4461 + }, + { + "epoch": 1.1303356554781507, + "grad_norm": 4.018182754516602, + "learning_rate": 9.175989053734379e-06, + "loss": 0.8079, + "step": 4462 + }, + { + "epoch": 1.130588980367321, + "grad_norm": 3.9636101722717285, + "learning_rate": 9.17552822121647e-06, + "loss": 0.8613, + "step": 4463 + }, + { + "epoch": 1.1308423052564915, + "grad_norm": 3.582052230834961, + "learning_rate": 9.175067271451685e-06, + "loss": 0.9291, + "step": 4464 + }, + { + "epoch": 1.1310956301456618, + "grad_norm": 3.36541748046875, + "learning_rate": 9.174606204452967e-06, + "loss": 0.7971, + "step": 4465 + }, + { + "epoch": 1.1313489550348321, + "grad_norm": 3.6572256088256836, + "learning_rate": 9.174145020233265e-06, + "loss": 0.8778, + "step": 4466 + }, + { + "epoch": 1.1316022799240026, + "grad_norm": 3.8592522144317627, + "learning_rate": 9.173683718805525e-06, + "loss": 0.9217, + "step": 4467 + }, + { + "epoch": 1.131855604813173, + "grad_norm": 3.251784563064575, + "learning_rate": 9.173222300182706e-06, + "loss": 0.7041, + "step": 4468 + }, + { + "epoch": 1.1321089297023432, + "grad_norm": 3.740300178527832, + "learning_rate": 9.172760764377755e-06, + "loss": 0.8344, + "step": 4469 + }, + { + "epoch": 1.1323622545915137, + "grad_norm": 3.6480960845947266, + "learning_rate": 9.172299111403643e-06, + "loss": 0.7285, + "step": 4470 + }, + { + "epoch": 1.132615579480684, + "grad_norm": 3.844107151031494, + "learning_rate": 9.171837341273324e-06, + "loss": 0.7676, + "step": 4471 + }, + { + "epoch": 1.1328689043698543, + "grad_norm": 3.5058791637420654, + "learning_rate": 9.17137545399977e-06, + "loss": 0.8908, + "step": 4472 + }, + { + "epoch": 1.1331222292590246, + "grad_norm": 4.316401958465576, + "learning_rate": 9.170913449595945e-06, + "loss": 0.9873, + "step": 4473 + }, + { + "epoch": 1.1333755541481951, + "grad_norm": 3.805889368057251, + "learning_rate": 9.170451328074828e-06, + "loss": 0.8489, + "step": 4474 + }, + { + "epoch": 1.1336288790373654, + "grad_norm": 4.286030292510986, + "learning_rate": 9.16998908944939e-06, + "loss": 0.9839, + "step": 4475 + }, + { + "epoch": 1.1338822039265357, + "grad_norm": 3.9902102947235107, + "learning_rate": 9.169526733732614e-06, + "loss": 0.7551, + "step": 4476 + }, + { + "epoch": 1.1341355288157062, + "grad_norm": 3.488490581512451, + "learning_rate": 9.169064260937483e-06, + "loss": 0.8311, + "step": 4477 + }, + { + "epoch": 1.1343888537048765, + "grad_norm": 4.064438343048096, + "learning_rate": 9.16860167107698e-06, + "loss": 0.885, + "step": 4478 + }, + { + "epoch": 1.1346421785940468, + "grad_norm": 3.749305248260498, + "learning_rate": 9.168138964164096e-06, + "loss": 0.8578, + "step": 4479 + }, + { + "epoch": 1.134895503483217, + "grad_norm": 3.4141809940338135, + "learning_rate": 9.167676140211823e-06, + "loss": 0.8509, + "step": 4480 + }, + { + "epoch": 1.1351488283723876, + "grad_norm": 4.0905303955078125, + "learning_rate": 9.167213199233159e-06, + "loss": 0.905, + "step": 4481 + }, + { + "epoch": 1.135402153261558, + "grad_norm": 4.127016544342041, + "learning_rate": 9.166750141241099e-06, + "loss": 0.8783, + "step": 4482 + }, + { + "epoch": 1.1356554781507282, + "grad_norm": 3.519977569580078, + "learning_rate": 9.166286966248648e-06, + "loss": 0.7849, + "step": 4483 + }, + { + "epoch": 1.1359088030398987, + "grad_norm": 3.529370069503784, + "learning_rate": 9.165823674268812e-06, + "loss": 0.7462, + "step": 4484 + }, + { + "epoch": 1.136162127929069, + "grad_norm": 3.660217761993408, + "learning_rate": 9.165360265314601e-06, + "loss": 0.6998, + "step": 4485 + }, + { + "epoch": 1.1364154528182393, + "grad_norm": 3.851372718811035, + "learning_rate": 9.164896739399026e-06, + "loss": 0.7012, + "step": 4486 + }, + { + "epoch": 1.1366687777074098, + "grad_norm": 3.846855401992798, + "learning_rate": 9.164433096535102e-06, + "loss": 0.6293, + "step": 4487 + }, + { + "epoch": 1.13692210259658, + "grad_norm": 3.5173442363739014, + "learning_rate": 9.163969336735847e-06, + "loss": 0.7445, + "step": 4488 + }, + { + "epoch": 1.1371754274857504, + "grad_norm": 3.500772714614868, + "learning_rate": 9.163505460014286e-06, + "loss": 0.8659, + "step": 4489 + }, + { + "epoch": 1.137428752374921, + "grad_norm": 4.18075704574585, + "learning_rate": 9.163041466383443e-06, + "loss": 0.7625, + "step": 4490 + }, + { + "epoch": 1.1376820772640912, + "grad_norm": 3.931058168411255, + "learning_rate": 9.162577355856346e-06, + "loss": 0.862, + "step": 4491 + }, + { + "epoch": 1.1379354021532615, + "grad_norm": 3.5401864051818848, + "learning_rate": 9.162113128446028e-06, + "loss": 0.6734, + "step": 4492 + }, + { + "epoch": 1.138188727042432, + "grad_norm": 3.6016790866851807, + "learning_rate": 9.161648784165525e-06, + "loss": 0.7529, + "step": 4493 + }, + { + "epoch": 1.1384420519316023, + "grad_norm": 4.87125301361084, + "learning_rate": 9.161184323027874e-06, + "loss": 0.8357, + "step": 4494 + }, + { + "epoch": 1.1386953768207726, + "grad_norm": 3.4067635536193848, + "learning_rate": 9.160719745046117e-06, + "loss": 0.7535, + "step": 4495 + }, + { + "epoch": 1.1389487017099431, + "grad_norm": 3.5701889991760254, + "learning_rate": 9.1602550502333e-06, + "loss": 0.8978, + "step": 4496 + }, + { + "epoch": 1.1392020265991134, + "grad_norm": 3.7475717067718506, + "learning_rate": 9.15979023860247e-06, + "loss": 0.7395, + "step": 4497 + }, + { + "epoch": 1.1394553514882837, + "grad_norm": 3.598468542098999, + "learning_rate": 9.159325310166683e-06, + "loss": 0.7488, + "step": 4498 + }, + { + "epoch": 1.139708676377454, + "grad_norm": 3.6979329586029053, + "learning_rate": 9.158860264938987e-06, + "loss": 0.7688, + "step": 4499 + }, + { + "epoch": 1.1399620012666245, + "grad_norm": 4.062190055847168, + "learning_rate": 9.158395102932445e-06, + "loss": 0.9437, + "step": 4500 + }, + { + "epoch": 1.1399620012666245, + "eval_loss": 1.1921050548553467, + "eval_runtime": 13.9602, + "eval_samples_per_second": 28.653, + "eval_steps_per_second": 3.582, + "step": 4500 + }, + { + "epoch": 1.1402153261557948, + "grad_norm": 3.9083962440490723, + "learning_rate": 9.157929824160117e-06, + "loss": 0.7845, + "step": 4501 + }, + { + "epoch": 1.140468651044965, + "grad_norm": 4.207871437072754, + "learning_rate": 9.15746442863507e-06, + "loss": 0.9516, + "step": 4502 + }, + { + "epoch": 1.1407219759341356, + "grad_norm": 4.152109622955322, + "learning_rate": 9.15699891637037e-06, + "loss": 0.9171, + "step": 4503 + }, + { + "epoch": 1.140975300823306, + "grad_norm": 3.7536070346832275, + "learning_rate": 9.156533287379088e-06, + "loss": 0.7167, + "step": 4504 + }, + { + "epoch": 1.1412286257124762, + "grad_norm": 4.014231204986572, + "learning_rate": 9.1560675416743e-06, + "loss": 0.7548, + "step": 4505 + }, + { + "epoch": 1.1414819506016467, + "grad_norm": 3.6225619316101074, + "learning_rate": 9.155601679269082e-06, + "loss": 0.7183, + "step": 4506 + }, + { + "epoch": 1.141735275490817, + "grad_norm": 3.1658689975738525, + "learning_rate": 9.155135700176517e-06, + "loss": 0.7276, + "step": 4507 + }, + { + "epoch": 1.1419886003799873, + "grad_norm": 3.6831867694854736, + "learning_rate": 9.15466960440969e-06, + "loss": 0.7494, + "step": 4508 + }, + { + "epoch": 1.1422419252691576, + "grad_norm": 3.932875871658325, + "learning_rate": 9.154203391981687e-06, + "loss": 0.9567, + "step": 4509 + }, + { + "epoch": 1.142495250158328, + "grad_norm": 3.699507236480713, + "learning_rate": 9.1537370629056e-06, + "loss": 0.6713, + "step": 4510 + }, + { + "epoch": 1.1427485750474984, + "grad_norm": 3.869405746459961, + "learning_rate": 9.15327061719452e-06, + "loss": 0.7468, + "step": 4511 + }, + { + "epoch": 1.1430018999366687, + "grad_norm": 3.7737667560577393, + "learning_rate": 9.152804054861552e-06, + "loss": 0.884, + "step": 4512 + }, + { + "epoch": 1.1432552248258392, + "grad_norm": 4.137895107269287, + "learning_rate": 9.152337375919792e-06, + "loss": 0.7774, + "step": 4513 + }, + { + "epoch": 1.1435085497150095, + "grad_norm": 3.4484589099884033, + "learning_rate": 9.151870580382343e-06, + "loss": 0.7318, + "step": 4514 + }, + { + "epoch": 1.1437618746041798, + "grad_norm": 3.6685168743133545, + "learning_rate": 9.151403668262314e-06, + "loss": 0.7481, + "step": 4515 + }, + { + "epoch": 1.1440151994933503, + "grad_norm": 3.924276351928711, + "learning_rate": 9.150936639572816e-06, + "loss": 0.8262, + "step": 4516 + }, + { + "epoch": 1.1442685243825206, + "grad_norm": 3.643343210220337, + "learning_rate": 9.150469494326961e-06, + "loss": 0.8521, + "step": 4517 + }, + { + "epoch": 1.1445218492716909, + "grad_norm": 3.7538487911224365, + "learning_rate": 9.15000223253787e-06, + "loss": 0.856, + "step": 4518 + }, + { + "epoch": 1.1447751741608614, + "grad_norm": 3.8249545097351074, + "learning_rate": 9.14953485421866e-06, + "loss": 0.7416, + "step": 4519 + }, + { + "epoch": 1.1450284990500317, + "grad_norm": 3.8859763145446777, + "learning_rate": 9.149067359382457e-06, + "loss": 0.8388, + "step": 4520 + }, + { + "epoch": 1.145281823939202, + "grad_norm": 3.4407546520233154, + "learning_rate": 9.148599748042388e-06, + "loss": 0.8163, + "step": 4521 + }, + { + "epoch": 1.1455351488283725, + "grad_norm": 3.633143186569214, + "learning_rate": 9.148132020211582e-06, + "loss": 0.7831, + "step": 4522 + }, + { + "epoch": 1.1457884737175428, + "grad_norm": 4.0248188972473145, + "learning_rate": 9.147664175903172e-06, + "loss": 0.9102, + "step": 4523 + }, + { + "epoch": 1.146041798606713, + "grad_norm": 3.8170149326324463, + "learning_rate": 9.147196215130295e-06, + "loss": 0.7716, + "step": 4524 + }, + { + "epoch": 1.1462951234958836, + "grad_norm": 4.042696475982666, + "learning_rate": 9.146728137906093e-06, + "loss": 0.8385, + "step": 4525 + }, + { + "epoch": 1.1465484483850539, + "grad_norm": 3.540112257003784, + "learning_rate": 9.146259944243709e-06, + "loss": 0.7392, + "step": 4526 + }, + { + "epoch": 1.1468017732742242, + "grad_norm": 3.671898126602173, + "learning_rate": 9.14579163415629e-06, + "loss": 0.7941, + "step": 4527 + }, + { + "epoch": 1.1470550981633945, + "grad_norm": 4.141158103942871, + "learning_rate": 9.145323207656983e-06, + "loss": 0.8339, + "step": 4528 + }, + { + "epoch": 1.147308423052565, + "grad_norm": 3.43936824798584, + "learning_rate": 9.144854664758943e-06, + "loss": 0.6344, + "step": 4529 + }, + { + "epoch": 1.1475617479417353, + "grad_norm": 3.8091511726379395, + "learning_rate": 9.144386005475328e-06, + "loss": 0.8165, + "step": 4530 + }, + { + "epoch": 1.1478150728309056, + "grad_norm": 3.5215532779693604, + "learning_rate": 9.143917229819296e-06, + "loss": 0.7855, + "step": 4531 + }, + { + "epoch": 1.148068397720076, + "grad_norm": 3.872860908508301, + "learning_rate": 9.143448337804011e-06, + "loss": 0.8541, + "step": 4532 + }, + { + "epoch": 1.1483217226092464, + "grad_norm": 3.8704535961151123, + "learning_rate": 9.142979329442636e-06, + "loss": 0.8286, + "step": 4533 + }, + { + "epoch": 1.1485750474984167, + "grad_norm": 3.385845422744751, + "learning_rate": 9.142510204748345e-06, + "loss": 0.6775, + "step": 4534 + }, + { + "epoch": 1.1488283723875872, + "grad_norm": 3.9064671993255615, + "learning_rate": 9.142040963734308e-06, + "loss": 0.7115, + "step": 4535 + }, + { + "epoch": 1.1490816972767575, + "grad_norm": 3.4068045616149902, + "learning_rate": 9.141571606413704e-06, + "loss": 0.7802, + "step": 4536 + }, + { + "epoch": 1.1493350221659278, + "grad_norm": 3.667557716369629, + "learning_rate": 9.141102132799708e-06, + "loss": 0.704, + "step": 4537 + }, + { + "epoch": 1.149588347055098, + "grad_norm": 3.244607925415039, + "learning_rate": 9.140632542905508e-06, + "loss": 0.7116, + "step": 4538 + }, + { + "epoch": 1.1498416719442686, + "grad_norm": 3.388017416000366, + "learning_rate": 9.140162836744284e-06, + "loss": 0.6418, + "step": 4539 + }, + { + "epoch": 1.1500949968334389, + "grad_norm": 3.6708273887634277, + "learning_rate": 9.13969301432923e-06, + "loss": 0.7388, + "step": 4540 + }, + { + "epoch": 1.1503483217226091, + "grad_norm": 3.588341236114502, + "learning_rate": 9.139223075673534e-06, + "loss": 0.7893, + "step": 4541 + }, + { + "epoch": 1.1506016466117797, + "grad_norm": 3.6004638671875, + "learning_rate": 9.138753020790396e-06, + "loss": 0.7852, + "step": 4542 + }, + { + "epoch": 1.15085497150095, + "grad_norm": 4.104678630828857, + "learning_rate": 9.138282849693013e-06, + "loss": 0.8434, + "step": 4543 + }, + { + "epoch": 1.1511082963901202, + "grad_norm": 4.082339286804199, + "learning_rate": 9.137812562394585e-06, + "loss": 0.7289, + "step": 4544 + }, + { + "epoch": 1.1513616212792908, + "grad_norm": 3.8769257068634033, + "learning_rate": 9.13734215890832e-06, + "loss": 0.7936, + "step": 4545 + }, + { + "epoch": 1.151614946168461, + "grad_norm": 4.094728469848633, + "learning_rate": 9.136871639247428e-06, + "loss": 0.8328, + "step": 4546 + }, + { + "epoch": 1.1518682710576313, + "grad_norm": 3.725374698638916, + "learning_rate": 9.136401003425117e-06, + "loss": 0.7151, + "step": 4547 + }, + { + "epoch": 1.1521215959468019, + "grad_norm": 3.8494014739990234, + "learning_rate": 9.135930251454607e-06, + "loss": 0.8312, + "step": 4548 + }, + { + "epoch": 1.1523749208359721, + "grad_norm": 3.7849738597869873, + "learning_rate": 9.135459383349113e-06, + "loss": 0.8568, + "step": 4549 + }, + { + "epoch": 1.1526282457251424, + "grad_norm": 3.453423261642456, + "learning_rate": 9.134988399121856e-06, + "loss": 0.7342, + "step": 4550 + }, + { + "epoch": 1.152881570614313, + "grad_norm": 3.2565577030181885, + "learning_rate": 9.134517298786065e-06, + "loss": 0.8274, + "step": 4551 + }, + { + "epoch": 1.1531348955034832, + "grad_norm": 3.9076638221740723, + "learning_rate": 9.134046082354965e-06, + "loss": 0.9612, + "step": 4552 + }, + { + "epoch": 1.1533882203926535, + "grad_norm": 3.7145302295684814, + "learning_rate": 9.133574749841789e-06, + "loss": 0.8045, + "step": 4553 + }, + { + "epoch": 1.153641545281824, + "grad_norm": 3.959188222885132, + "learning_rate": 9.133103301259772e-06, + "loss": 0.7974, + "step": 4554 + }, + { + "epoch": 1.1538948701709943, + "grad_norm": 3.694349765777588, + "learning_rate": 9.132631736622148e-06, + "loss": 0.8629, + "step": 4555 + }, + { + "epoch": 1.1541481950601646, + "grad_norm": 3.4247512817382812, + "learning_rate": 9.132160055942165e-06, + "loss": 0.8294, + "step": 4556 + }, + { + "epoch": 1.154401519949335, + "grad_norm": 3.6549019813537598, + "learning_rate": 9.131688259233063e-06, + "loss": 0.7506, + "step": 4557 + }, + { + "epoch": 1.1546548448385054, + "grad_norm": 3.674513339996338, + "learning_rate": 9.131216346508092e-06, + "loss": 0.7861, + "step": 4558 + }, + { + "epoch": 1.1549081697276757, + "grad_norm": 3.335111379623413, + "learning_rate": 9.130744317780503e-06, + "loss": 0.6282, + "step": 4559 + }, + { + "epoch": 1.155161494616846, + "grad_norm": 3.6455440521240234, + "learning_rate": 9.130272173063547e-06, + "loss": 0.8454, + "step": 4560 + }, + { + "epoch": 1.1554148195060165, + "grad_norm": 3.5580334663391113, + "learning_rate": 9.129799912370485e-06, + "loss": 0.8475, + "step": 4561 + }, + { + "epoch": 1.1556681443951868, + "grad_norm": 3.539486885070801, + "learning_rate": 9.129327535714578e-06, + "loss": 0.6811, + "step": 4562 + }, + { + "epoch": 1.1559214692843571, + "grad_norm": 3.76601243019104, + "learning_rate": 9.128855043109088e-06, + "loss": 0.8203, + "step": 4563 + }, + { + "epoch": 1.1561747941735276, + "grad_norm": 3.333263397216797, + "learning_rate": 9.128382434567285e-06, + "loss": 0.7483, + "step": 4564 + }, + { + "epoch": 1.156428119062698, + "grad_norm": 3.8397083282470703, + "learning_rate": 9.127909710102435e-06, + "loss": 0.8516, + "step": 4565 + }, + { + "epoch": 1.1566814439518682, + "grad_norm": 3.8690736293792725, + "learning_rate": 9.12743686972782e-06, + "loss": 0.7916, + "step": 4566 + }, + { + "epoch": 1.1569347688410385, + "grad_norm": 3.472472906112671, + "learning_rate": 9.126963913456708e-06, + "loss": 0.7204, + "step": 4567 + }, + { + "epoch": 1.157188093730209, + "grad_norm": 3.4159295558929443, + "learning_rate": 9.126490841302384e-06, + "loss": 0.7895, + "step": 4568 + }, + { + "epoch": 1.1574414186193793, + "grad_norm": 3.65435528755188, + "learning_rate": 9.126017653278132e-06, + "loss": 0.855, + "step": 4569 + }, + { + "epoch": 1.1576947435085496, + "grad_norm": 3.797349214553833, + "learning_rate": 9.125544349397238e-06, + "loss": 0.791, + "step": 4570 + }, + { + "epoch": 1.1579480683977201, + "grad_norm": 3.4813103675842285, + "learning_rate": 9.125070929672993e-06, + "loss": 0.8287, + "step": 4571 + }, + { + "epoch": 1.1582013932868904, + "grad_norm": 3.6932687759399414, + "learning_rate": 9.124597394118688e-06, + "loss": 0.7619, + "step": 4572 + }, + { + "epoch": 1.1584547181760607, + "grad_norm": 3.941201686859131, + "learning_rate": 9.124123742747622e-06, + "loss": 0.8806, + "step": 4573 + }, + { + "epoch": 1.1587080430652312, + "grad_norm": 3.5929315090179443, + "learning_rate": 9.123649975573095e-06, + "loss": 0.7398, + "step": 4574 + }, + { + "epoch": 1.1589613679544015, + "grad_norm": 3.7371160984039307, + "learning_rate": 9.123176092608408e-06, + "loss": 0.7426, + "step": 4575 + }, + { + "epoch": 1.1592146928435718, + "grad_norm": 3.800302267074585, + "learning_rate": 9.12270209386687e-06, + "loss": 0.866, + "step": 4576 + }, + { + "epoch": 1.1594680177327423, + "grad_norm": 3.735410213470459, + "learning_rate": 9.122227979361789e-06, + "loss": 0.9066, + "step": 4577 + }, + { + "epoch": 1.1597213426219126, + "grad_norm": 3.0655813217163086, + "learning_rate": 9.121753749106478e-06, + "loss": 0.6971, + "step": 4578 + }, + { + "epoch": 1.159974667511083, + "grad_norm": 4.177846431732178, + "learning_rate": 9.121279403114253e-06, + "loss": 0.8673, + "step": 4579 + }, + { + "epoch": 1.1602279924002534, + "grad_norm": 4.221340656280518, + "learning_rate": 9.120804941398435e-06, + "loss": 0.7972, + "step": 4580 + }, + { + "epoch": 1.1604813172894237, + "grad_norm": 3.9534647464752197, + "learning_rate": 9.120330363972345e-06, + "loss": 0.7697, + "step": 4581 + }, + { + "epoch": 1.160734642178594, + "grad_norm": 3.3120903968811035, + "learning_rate": 9.11985567084931e-06, + "loss": 0.7504, + "step": 4582 + }, + { + "epoch": 1.1609879670677645, + "grad_norm": 3.8544650077819824, + "learning_rate": 9.119380862042659e-06, + "loss": 0.794, + "step": 4583 + }, + { + "epoch": 1.1612412919569348, + "grad_norm": 4.032211780548096, + "learning_rate": 9.118905937565723e-06, + "loss": 0.8554, + "step": 4584 + }, + { + "epoch": 1.161494616846105, + "grad_norm": 3.4280614852905273, + "learning_rate": 9.118430897431839e-06, + "loss": 0.7677, + "step": 4585 + }, + { + "epoch": 1.1617479417352754, + "grad_norm": 3.9021902084350586, + "learning_rate": 9.117955741654346e-06, + "loss": 0.758, + "step": 4586 + }, + { + "epoch": 1.162001266624446, + "grad_norm": 3.649503469467163, + "learning_rate": 9.117480470246584e-06, + "loss": 0.7399, + "step": 4587 + }, + { + "epoch": 1.1622545915136162, + "grad_norm": 3.409973382949829, + "learning_rate": 9.117005083221903e-06, + "loss": 0.7951, + "step": 4588 + }, + { + "epoch": 1.1625079164027865, + "grad_norm": 3.909235715866089, + "learning_rate": 9.116529580593647e-06, + "loss": 0.9294, + "step": 4589 + }, + { + "epoch": 1.162761241291957, + "grad_norm": 4.342448711395264, + "learning_rate": 9.11605396237517e-06, + "loss": 0.8511, + "step": 4590 + }, + { + "epoch": 1.1630145661811273, + "grad_norm": 3.296447515487671, + "learning_rate": 9.115578228579826e-06, + "loss": 0.6998, + "step": 4591 + }, + { + "epoch": 1.1632678910702976, + "grad_norm": 3.938145160675049, + "learning_rate": 9.115102379220978e-06, + "loss": 0.9381, + "step": 4592 + }, + { + "epoch": 1.1635212159594681, + "grad_norm": 3.656583786010742, + "learning_rate": 9.11462641431198e-06, + "loss": 0.907, + "step": 4593 + }, + { + "epoch": 1.1637745408486384, + "grad_norm": 4.242737293243408, + "learning_rate": 9.114150333866201e-06, + "loss": 0.7964, + "step": 4594 + }, + { + "epoch": 1.1640278657378087, + "grad_norm": 3.5653722286224365, + "learning_rate": 9.11367413789701e-06, + "loss": 0.795, + "step": 4595 + }, + { + "epoch": 1.164281190626979, + "grad_norm": 3.39385724067688, + "learning_rate": 9.113197826417776e-06, + "loss": 0.7885, + "step": 4596 + }, + { + "epoch": 1.1645345155161495, + "grad_norm": 3.6165449619293213, + "learning_rate": 9.112721399441877e-06, + "loss": 0.6814, + "step": 4597 + }, + { + "epoch": 1.1647878404053198, + "grad_norm": 4.119375705718994, + "learning_rate": 9.112244856982687e-06, + "loss": 0.8861, + "step": 4598 + }, + { + "epoch": 1.16504116529449, + "grad_norm": 3.5786070823669434, + "learning_rate": 9.111768199053588e-06, + "loss": 0.7515, + "step": 4599 + }, + { + "epoch": 1.1652944901836606, + "grad_norm": 3.670085906982422, + "learning_rate": 9.111291425667967e-06, + "loss": 0.844, + "step": 4600 + }, + { + "epoch": 1.165547815072831, + "grad_norm": 3.777029514312744, + "learning_rate": 9.110814536839208e-06, + "loss": 1.0063, + "step": 4601 + }, + { + "epoch": 1.1658011399620012, + "grad_norm": 3.5752599239349365, + "learning_rate": 9.110337532580705e-06, + "loss": 0.9175, + "step": 4602 + }, + { + "epoch": 1.1660544648511717, + "grad_norm": 4.313035488128662, + "learning_rate": 9.109860412905849e-06, + "loss": 0.8021, + "step": 4603 + }, + { + "epoch": 1.166307789740342, + "grad_norm": 3.4963371753692627, + "learning_rate": 9.10938317782804e-06, + "loss": 0.7787, + "step": 4604 + }, + { + "epoch": 1.1665611146295123, + "grad_norm": 3.543499708175659, + "learning_rate": 9.108905827360677e-06, + "loss": 0.8232, + "step": 4605 + }, + { + "epoch": 1.1668144395186828, + "grad_norm": 4.013868808746338, + "learning_rate": 9.108428361517163e-06, + "loss": 0.8854, + "step": 4606 + }, + { + "epoch": 1.167067764407853, + "grad_norm": 3.909543037414551, + "learning_rate": 9.107950780310908e-06, + "loss": 0.8123, + "step": 4607 + }, + { + "epoch": 1.1673210892970234, + "grad_norm": 3.912917137145996, + "learning_rate": 9.107473083755317e-06, + "loss": 0.9191, + "step": 4608 + }, + { + "epoch": 1.167574414186194, + "grad_norm": 3.570385694503784, + "learning_rate": 9.10699527186381e-06, + "loss": 0.7581, + "step": 4609 + }, + { + "epoch": 1.1678277390753642, + "grad_norm": 3.5049171447753906, + "learning_rate": 9.106517344649802e-06, + "loss": 0.8042, + "step": 4610 + }, + { + "epoch": 1.1680810639645345, + "grad_norm": 3.562222957611084, + "learning_rate": 9.106039302126709e-06, + "loss": 0.8154, + "step": 4611 + }, + { + "epoch": 1.168334388853705, + "grad_norm": 3.671994686126709, + "learning_rate": 9.105561144307958e-06, + "loss": 0.809, + "step": 4612 + }, + { + "epoch": 1.1685877137428753, + "grad_norm": 3.3304800987243652, + "learning_rate": 9.105082871206972e-06, + "loss": 0.77, + "step": 4613 + }, + { + "epoch": 1.1688410386320456, + "grad_norm": 3.5379252433776855, + "learning_rate": 9.104604482837184e-06, + "loss": 0.7443, + "step": 4614 + }, + { + "epoch": 1.1690943635212159, + "grad_norm": 3.332242965698242, + "learning_rate": 9.104125979212027e-06, + "loss": 0.8041, + "step": 4615 + }, + { + "epoch": 1.1693476884103864, + "grad_norm": 3.415736675262451, + "learning_rate": 9.103647360344937e-06, + "loss": 0.6914, + "step": 4616 + }, + { + "epoch": 1.1696010132995567, + "grad_norm": 3.767634153366089, + "learning_rate": 9.10316862624935e-06, + "loss": 0.809, + "step": 4617 + }, + { + "epoch": 1.169854338188727, + "grad_norm": 3.434682846069336, + "learning_rate": 9.102689776938711e-06, + "loss": 0.698, + "step": 4618 + }, + { + "epoch": 1.1701076630778975, + "grad_norm": 4.115676403045654, + "learning_rate": 9.102210812426467e-06, + "loss": 0.834, + "step": 4619 + }, + { + "epoch": 1.1703609879670678, + "grad_norm": 3.654546022415161, + "learning_rate": 9.101731732726066e-06, + "loss": 0.7559, + "step": 4620 + }, + { + "epoch": 1.170614312856238, + "grad_norm": 3.954991340637207, + "learning_rate": 9.10125253785096e-06, + "loss": 0.8116, + "step": 4621 + }, + { + "epoch": 1.1708676377454084, + "grad_norm": 3.3564887046813965, + "learning_rate": 9.100773227814606e-06, + "loss": 0.7457, + "step": 4622 + }, + { + "epoch": 1.1711209626345789, + "grad_norm": 4.064774036407471, + "learning_rate": 9.10029380263046e-06, + "loss": 0.8447, + "step": 4623 + }, + { + "epoch": 1.1713742875237492, + "grad_norm": 3.9697160720825195, + "learning_rate": 9.099814262311986e-06, + "loss": 0.856, + "step": 4624 + }, + { + "epoch": 1.1716276124129195, + "grad_norm": 3.4555749893188477, + "learning_rate": 9.09933460687265e-06, + "loss": 0.7495, + "step": 4625 + }, + { + "epoch": 1.17188093730209, + "grad_norm": 4.025459289550781, + "learning_rate": 9.098854836325918e-06, + "loss": 0.7282, + "step": 4626 + }, + { + "epoch": 1.1721342621912603, + "grad_norm": 3.783754587173462, + "learning_rate": 9.098374950685265e-06, + "loss": 0.8051, + "step": 4627 + }, + { + "epoch": 1.1723875870804306, + "grad_norm": 3.386687755584717, + "learning_rate": 9.097894949964164e-06, + "loss": 0.7492, + "step": 4628 + }, + { + "epoch": 1.172640911969601, + "grad_norm": 3.3886213302612305, + "learning_rate": 9.097414834176092e-06, + "loss": 0.8315, + "step": 4629 + }, + { + "epoch": 1.1728942368587714, + "grad_norm": 3.9223880767822266, + "learning_rate": 9.096934603334533e-06, + "loss": 0.7318, + "step": 4630 + }, + { + "epoch": 1.1731475617479417, + "grad_norm": 3.5498759746551514, + "learning_rate": 9.096454257452968e-06, + "loss": 0.7495, + "step": 4631 + }, + { + "epoch": 1.1734008866371122, + "grad_norm": 3.227283239364624, + "learning_rate": 9.095973796544891e-06, + "loss": 0.7463, + "step": 4632 + }, + { + "epoch": 1.1736542115262825, + "grad_norm": 3.749796152114868, + "learning_rate": 9.095493220623787e-06, + "loss": 0.7447, + "step": 4633 + }, + { + "epoch": 1.1739075364154528, + "grad_norm": 3.7740070819854736, + "learning_rate": 9.095012529703156e-06, + "loss": 0.8366, + "step": 4634 + }, + { + "epoch": 1.1741608613046233, + "grad_norm": 3.5309395790100098, + "learning_rate": 9.09453172379649e-06, + "loss": 0.7738, + "step": 4635 + }, + { + "epoch": 1.1744141861937936, + "grad_norm": 3.762718677520752, + "learning_rate": 9.094050802917291e-06, + "loss": 0.8424, + "step": 4636 + }, + { + "epoch": 1.1746675110829639, + "grad_norm": 3.685856580734253, + "learning_rate": 9.093569767079065e-06, + "loss": 0.7974, + "step": 4637 + }, + { + "epoch": 1.1749208359721344, + "grad_norm": 3.5668835639953613, + "learning_rate": 9.093088616295321e-06, + "loss": 0.7287, + "step": 4638 + }, + { + "epoch": 1.1751741608613047, + "grad_norm": 3.8816702365875244, + "learning_rate": 9.092607350579563e-06, + "loss": 0.7434, + "step": 4639 + }, + { + "epoch": 1.175427485750475, + "grad_norm": 3.4125139713287354, + "learning_rate": 9.092125969945311e-06, + "loss": 0.7396, + "step": 4640 + }, + { + "epoch": 1.1756808106396455, + "grad_norm": 3.3155534267425537, + "learning_rate": 9.09164447440608e-06, + "loss": 0.6941, + "step": 4641 + }, + { + "epoch": 1.1759341355288158, + "grad_norm": 3.691645383834839, + "learning_rate": 9.09116286397539e-06, + "loss": 0.7019, + "step": 4642 + }, + { + "epoch": 1.176187460417986, + "grad_norm": 3.39418363571167, + "learning_rate": 9.090681138666763e-06, + "loss": 0.6927, + "step": 4643 + }, + { + "epoch": 1.1764407853071563, + "grad_norm": 3.94716739654541, + "learning_rate": 9.090199298493727e-06, + "loss": 0.7232, + "step": 4644 + }, + { + "epoch": 1.1766941101963269, + "grad_norm": 4.030582904815674, + "learning_rate": 9.089717343469812e-06, + "loss": 0.9173, + "step": 4645 + }, + { + "epoch": 1.1769474350854972, + "grad_norm": 3.814612865447998, + "learning_rate": 9.089235273608552e-06, + "loss": 0.7907, + "step": 4646 + }, + { + "epoch": 1.1772007599746674, + "grad_norm": 3.907712697982788, + "learning_rate": 9.08875308892348e-06, + "loss": 0.7244, + "step": 4647 + }, + { + "epoch": 1.177454084863838, + "grad_norm": 3.4128739833831787, + "learning_rate": 9.08827078942814e-06, + "loss": 0.7244, + "step": 4648 + }, + { + "epoch": 1.1777074097530082, + "grad_norm": 3.8643686771392822, + "learning_rate": 9.08778837513607e-06, + "loss": 0.7758, + "step": 4649 + }, + { + "epoch": 1.1779607346421785, + "grad_norm": 3.5134050846099854, + "learning_rate": 9.087305846060818e-06, + "loss": 0.7238, + "step": 4650 + }, + { + "epoch": 1.1782140595313488, + "grad_norm": 3.5021824836730957, + "learning_rate": 9.086823202215935e-06, + "loss": 0.7189, + "step": 4651 + }, + { + "epoch": 1.1784673844205193, + "grad_norm": 3.862809181213379, + "learning_rate": 9.086340443614972e-06, + "loss": 0.7798, + "step": 4652 + }, + { + "epoch": 1.1787207093096896, + "grad_norm": 3.533787727355957, + "learning_rate": 9.085857570271484e-06, + "loss": 0.6933, + "step": 4653 + }, + { + "epoch": 1.17897403419886, + "grad_norm": 3.475367307662964, + "learning_rate": 9.08537458219903e-06, + "loss": 0.7725, + "step": 4654 + }, + { + "epoch": 1.1792273590880304, + "grad_norm": 3.678919792175293, + "learning_rate": 9.084891479411172e-06, + "loss": 0.8077, + "step": 4655 + }, + { + "epoch": 1.1794806839772007, + "grad_norm": 3.676244020462036, + "learning_rate": 9.084408261921477e-06, + "loss": 0.6353, + "step": 4656 + }, + { + "epoch": 1.179734008866371, + "grad_norm": 3.4303646087646484, + "learning_rate": 9.083924929743512e-06, + "loss": 0.8347, + "step": 4657 + }, + { + "epoch": 1.1799873337555415, + "grad_norm": 4.191209316253662, + "learning_rate": 9.08344148289085e-06, + "loss": 0.8738, + "step": 4658 + }, + { + "epoch": 1.1802406586447118, + "grad_norm": 4.154456615447998, + "learning_rate": 9.082957921377063e-06, + "loss": 0.8625, + "step": 4659 + }, + { + "epoch": 1.1804939835338821, + "grad_norm": 3.559074878692627, + "learning_rate": 9.08247424521573e-06, + "loss": 0.7643, + "step": 4660 + }, + { + "epoch": 1.1807473084230526, + "grad_norm": 3.6135287284851074, + "learning_rate": 9.081990454420437e-06, + "loss": 0.8673, + "step": 4661 + }, + { + "epoch": 1.181000633312223, + "grad_norm": 3.6605186462402344, + "learning_rate": 9.081506549004763e-06, + "loss": 0.6727, + "step": 4662 + }, + { + "epoch": 1.1812539582013932, + "grad_norm": 4.238001346588135, + "learning_rate": 9.081022528982298e-06, + "loss": 0.8279, + "step": 4663 + }, + { + "epoch": 1.1815072830905637, + "grad_norm": 3.378873586654663, + "learning_rate": 9.080538394366636e-06, + "loss": 0.7547, + "step": 4664 + }, + { + "epoch": 1.181760607979734, + "grad_norm": 3.2367618083953857, + "learning_rate": 9.080054145171365e-06, + "loss": 0.7417, + "step": 4665 + }, + { + "epoch": 1.1820139328689043, + "grad_norm": 3.37278413772583, + "learning_rate": 9.079569781410087e-06, + "loss": 0.7176, + "step": 4666 + }, + { + "epoch": 1.1822672577580748, + "grad_norm": 3.4264819622039795, + "learning_rate": 9.079085303096401e-06, + "loss": 0.7502, + "step": 4667 + }, + { + "epoch": 1.1825205826472451, + "grad_norm": 3.859360933303833, + "learning_rate": 9.078600710243912e-06, + "loss": 0.7934, + "step": 4668 + }, + { + "epoch": 1.1827739075364154, + "grad_norm": 3.580442428588867, + "learning_rate": 9.078116002866226e-06, + "loss": 0.8133, + "step": 4669 + }, + { + "epoch": 1.1830272324255857, + "grad_norm": 3.8393847942352295, + "learning_rate": 9.077631180976955e-06, + "loss": 0.758, + "step": 4670 + }, + { + "epoch": 1.1832805573147562, + "grad_norm": 3.4573380947113037, + "learning_rate": 9.077146244589712e-06, + "loss": 0.6716, + "step": 4671 + }, + { + "epoch": 1.1835338822039265, + "grad_norm": 3.5827207565307617, + "learning_rate": 9.076661193718111e-06, + "loss": 0.8539, + "step": 4672 + }, + { + "epoch": 1.1837872070930968, + "grad_norm": 3.2626802921295166, + "learning_rate": 9.076176028375775e-06, + "loss": 0.6498, + "step": 4673 + }, + { + "epoch": 1.1840405319822673, + "grad_norm": 3.4087109565734863, + "learning_rate": 9.075690748576327e-06, + "loss": 0.6383, + "step": 4674 + }, + { + "epoch": 1.1842938568714376, + "grad_norm": 3.3773810863494873, + "learning_rate": 9.075205354333394e-06, + "loss": 0.6406, + "step": 4675 + }, + { + "epoch": 1.184547181760608, + "grad_norm": 3.88681960105896, + "learning_rate": 9.074719845660605e-06, + "loss": 0.79, + "step": 4676 + }, + { + "epoch": 1.1848005066497784, + "grad_norm": 3.5406460762023926, + "learning_rate": 9.07423422257159e-06, + "loss": 0.7606, + "step": 4677 + }, + { + "epoch": 1.1850538315389487, + "grad_norm": 3.60103178024292, + "learning_rate": 9.07374848507999e-06, + "loss": 0.8023, + "step": 4678 + }, + { + "epoch": 1.185307156428119, + "grad_norm": 3.9661853313446045, + "learning_rate": 9.07326263319944e-06, + "loss": 0.8699, + "step": 4679 + }, + { + "epoch": 1.1855604813172893, + "grad_norm": 4.237826347351074, + "learning_rate": 9.072776666943583e-06, + "loss": 0.7779, + "step": 4680 + }, + { + "epoch": 1.1858138062064598, + "grad_norm": 3.834801435470581, + "learning_rate": 9.07229058632607e-06, + "loss": 0.8709, + "step": 4681 + }, + { + "epoch": 1.1860671310956301, + "grad_norm": 3.58707332611084, + "learning_rate": 9.071804391360544e-06, + "loss": 0.8227, + "step": 4682 + }, + { + "epoch": 1.1863204559848004, + "grad_norm": 3.898444414138794, + "learning_rate": 9.071318082060659e-06, + "loss": 0.8405, + "step": 4683 + }, + { + "epoch": 1.186573780873971, + "grad_norm": 3.8865280151367188, + "learning_rate": 9.070831658440068e-06, + "loss": 0.8815, + "step": 4684 + }, + { + "epoch": 1.1868271057631412, + "grad_norm": 3.698413848876953, + "learning_rate": 9.070345120512436e-06, + "loss": 0.8733, + "step": 4685 + }, + { + "epoch": 1.1870804306523115, + "grad_norm": 4.0502800941467285, + "learning_rate": 9.069858468291417e-06, + "loss": 0.9704, + "step": 4686 + }, + { + "epoch": 1.187333755541482, + "grad_norm": 3.8300623893737793, + "learning_rate": 9.069371701790684e-06, + "loss": 0.8485, + "step": 4687 + }, + { + "epoch": 1.1875870804306523, + "grad_norm": 3.7541427612304688, + "learning_rate": 9.068884821023898e-06, + "loss": 0.8075, + "step": 4688 + }, + { + "epoch": 1.1878404053198226, + "grad_norm": 4.094725131988525, + "learning_rate": 9.068397826004734e-06, + "loss": 0.8, + "step": 4689 + }, + { + "epoch": 1.1880937302089931, + "grad_norm": 3.4738681316375732, + "learning_rate": 9.067910716746863e-06, + "loss": 0.8071, + "step": 4690 + }, + { + "epoch": 1.1883470550981634, + "grad_norm": 3.5837690830230713, + "learning_rate": 9.067423493263969e-06, + "loss": 0.7731, + "step": 4691 + }, + { + "epoch": 1.1886003799873337, + "grad_norm": 4.020441055297852, + "learning_rate": 9.066936155569728e-06, + "loss": 0.7003, + "step": 4692 + }, + { + "epoch": 1.1888537048765042, + "grad_norm": 3.8502564430236816, + "learning_rate": 9.066448703677828e-06, + "loss": 0.7282, + "step": 4693 + }, + { + "epoch": 1.1891070297656745, + "grad_norm": 3.1409623622894287, + "learning_rate": 9.065961137601953e-06, + "loss": 0.6211, + "step": 4694 + }, + { + "epoch": 1.1893603546548448, + "grad_norm": 3.890843152999878, + "learning_rate": 9.065473457355793e-06, + "loss": 0.8454, + "step": 4695 + }, + { + "epoch": 1.1896136795440153, + "grad_norm": 3.9443559646606445, + "learning_rate": 9.064985662953046e-06, + "loss": 0.7721, + "step": 4696 + }, + { + "epoch": 1.1898670044331856, + "grad_norm": 3.839155673980713, + "learning_rate": 9.064497754407407e-06, + "loss": 0.8109, + "step": 4697 + }, + { + "epoch": 1.190120329322356, + "grad_norm": 3.730426549911499, + "learning_rate": 9.064009731732574e-06, + "loss": 0.7671, + "step": 4698 + }, + { + "epoch": 1.1903736542115262, + "grad_norm": 3.659146308898926, + "learning_rate": 9.063521594942254e-06, + "loss": 0.7655, + "step": 4699 + }, + { + "epoch": 1.1906269791006967, + "grad_norm": 3.4877068996429443, + "learning_rate": 9.063033344050151e-06, + "loss": 0.7731, + "step": 4700 + }, + { + "epoch": 1.190880303989867, + "grad_norm": 3.3288068771362305, + "learning_rate": 9.062544979069977e-06, + "loss": 0.65, + "step": 4701 + }, + { + "epoch": 1.1911336288790373, + "grad_norm": 3.840134382247925, + "learning_rate": 9.062056500015443e-06, + "loss": 0.8196, + "step": 4702 + }, + { + "epoch": 1.1913869537682078, + "grad_norm": 3.9276678562164307, + "learning_rate": 9.061567906900268e-06, + "loss": 0.721, + "step": 4703 + }, + { + "epoch": 1.191640278657378, + "grad_norm": 4.0413360595703125, + "learning_rate": 9.061079199738168e-06, + "loss": 0.807, + "step": 4704 + }, + { + "epoch": 1.1918936035465484, + "grad_norm": 3.896489381790161, + "learning_rate": 9.060590378542868e-06, + "loss": 0.6801, + "step": 4705 + }, + { + "epoch": 1.192146928435719, + "grad_norm": 3.962916612625122, + "learning_rate": 9.060101443328092e-06, + "loss": 0.8777, + "step": 4706 + }, + { + "epoch": 1.1924002533248892, + "grad_norm": 3.965481758117676, + "learning_rate": 9.059612394107573e-06, + "loss": 0.7118, + "step": 4707 + }, + { + "epoch": 1.1926535782140595, + "grad_norm": 3.788297653198242, + "learning_rate": 9.05912323089504e-06, + "loss": 0.7165, + "step": 4708 + }, + { + "epoch": 1.1929069031032298, + "grad_norm": 3.574528217315674, + "learning_rate": 9.058633953704228e-06, + "loss": 0.8802, + "step": 4709 + }, + { + "epoch": 1.1931602279924003, + "grad_norm": 3.7018349170684814, + "learning_rate": 9.058144562548876e-06, + "loss": 0.8945, + "step": 4710 + }, + { + "epoch": 1.1934135528815706, + "grad_norm": 3.8939034938812256, + "learning_rate": 9.057655057442728e-06, + "loss": 0.7497, + "step": 4711 + }, + { + "epoch": 1.1936668777707409, + "grad_norm": 3.595820188522339, + "learning_rate": 9.057165438399528e-06, + "loss": 0.8388, + "step": 4712 + }, + { + "epoch": 1.1939202026599114, + "grad_norm": 3.4211912155151367, + "learning_rate": 9.056675705433024e-06, + "loss": 0.6714, + "step": 4713 + }, + { + "epoch": 1.1941735275490817, + "grad_norm": 3.458573341369629, + "learning_rate": 9.056185858556966e-06, + "loss": 0.6966, + "step": 4714 + }, + { + "epoch": 1.194426852438252, + "grad_norm": 3.368644952774048, + "learning_rate": 9.055695897785113e-06, + "loss": 0.8037, + "step": 4715 + }, + { + "epoch": 1.1946801773274225, + "grad_norm": 3.442203998565674, + "learning_rate": 9.055205823131217e-06, + "loss": 0.7374, + "step": 4716 + }, + { + "epoch": 1.1949335022165928, + "grad_norm": 3.0957603454589844, + "learning_rate": 9.054715634609043e-06, + "loss": 0.7183, + "step": 4717 + }, + { + "epoch": 1.195186827105763, + "grad_norm": 4.103257179260254, + "learning_rate": 9.054225332232355e-06, + "loss": 0.8436, + "step": 4718 + }, + { + "epoch": 1.1954401519949336, + "grad_norm": 3.9489400386810303, + "learning_rate": 9.053734916014918e-06, + "loss": 0.8258, + "step": 4719 + }, + { + "epoch": 1.1956934768841039, + "grad_norm": 4.136053562164307, + "learning_rate": 9.053244385970507e-06, + "loss": 0.719, + "step": 4720 + }, + { + "epoch": 1.1959468017732742, + "grad_norm": 3.3901286125183105, + "learning_rate": 9.052753742112893e-06, + "loss": 0.7553, + "step": 4721 + }, + { + "epoch": 1.1962001266624447, + "grad_norm": 3.6780190467834473, + "learning_rate": 9.052262984455851e-06, + "loss": 0.8364, + "step": 4722 + }, + { + "epoch": 1.196453451551615, + "grad_norm": 3.696352481842041, + "learning_rate": 9.051772113013166e-06, + "loss": 0.8566, + "step": 4723 + }, + { + "epoch": 1.1967067764407853, + "grad_norm": 3.751959800720215, + "learning_rate": 9.051281127798617e-06, + "loss": 0.8296, + "step": 4724 + }, + { + "epoch": 1.1969601013299558, + "grad_norm": 4.118260383605957, + "learning_rate": 9.050790028825994e-06, + "loss": 0.849, + "step": 4725 + }, + { + "epoch": 1.197213426219126, + "grad_norm": 3.5884218215942383, + "learning_rate": 9.050298816109085e-06, + "loss": 0.8572, + "step": 4726 + }, + { + "epoch": 1.1974667511082964, + "grad_norm": 3.55743145942688, + "learning_rate": 9.049807489661683e-06, + "loss": 0.823, + "step": 4727 + }, + { + "epoch": 1.1977200759974667, + "grad_norm": 4.042413234710693, + "learning_rate": 9.049316049497587e-06, + "loss": 0.9174, + "step": 4728 + }, + { + "epoch": 1.1979734008866372, + "grad_norm": 3.994415044784546, + "learning_rate": 9.048824495630593e-06, + "loss": 0.9475, + "step": 4729 + }, + { + "epoch": 1.1982267257758075, + "grad_norm": 3.970856189727783, + "learning_rate": 9.048332828074504e-06, + "loss": 0.8147, + "step": 4730 + }, + { + "epoch": 1.1984800506649778, + "grad_norm": 3.8265199661254883, + "learning_rate": 9.047841046843126e-06, + "loss": 0.7644, + "step": 4731 + }, + { + "epoch": 1.1987333755541483, + "grad_norm": 3.4341936111450195, + "learning_rate": 9.047349151950272e-06, + "loss": 0.7143, + "step": 4732 + }, + { + "epoch": 1.1989867004433186, + "grad_norm": 3.607229232788086, + "learning_rate": 9.046857143409746e-06, + "loss": 0.65, + "step": 4733 + }, + { + "epoch": 1.1992400253324889, + "grad_norm": 3.93404221534729, + "learning_rate": 9.04636502123537e-06, + "loss": 0.9532, + "step": 4734 + }, + { + "epoch": 1.1994933502216594, + "grad_norm": 4.027116298675537, + "learning_rate": 9.045872785440961e-06, + "loss": 0.8919, + "step": 4735 + }, + { + "epoch": 1.1997466751108297, + "grad_norm": 3.4875340461730957, + "learning_rate": 9.04538043604034e-06, + "loss": 0.7551, + "step": 4736 + }, + { + "epoch": 1.2, + "grad_norm": 4.019157886505127, + "learning_rate": 9.044887973047335e-06, + "loss": 0.858, + "step": 4737 + }, + { + "epoch": 1.2002533248891702, + "grad_norm": 3.2707157135009766, + "learning_rate": 9.044395396475767e-06, + "loss": 0.7807, + "step": 4738 + }, + { + "epoch": 1.2005066497783408, + "grad_norm": 4.049706935882568, + "learning_rate": 9.043902706339474e-06, + "loss": 0.9211, + "step": 4739 + }, + { + "epoch": 1.200759974667511, + "grad_norm": 3.1721930503845215, + "learning_rate": 9.043409902652288e-06, + "loss": 0.7179, + "step": 4740 + }, + { + "epoch": 1.2010132995566813, + "grad_norm": 4.242766857147217, + "learning_rate": 9.042916985428048e-06, + "loss": 0.7944, + "step": 4741 + }, + { + "epoch": 1.2012666244458519, + "grad_norm": 3.860823154449463, + "learning_rate": 9.042423954680592e-06, + "loss": 0.8658, + "step": 4742 + }, + { + "epoch": 1.2015199493350222, + "grad_norm": 4.302134990692139, + "learning_rate": 9.041930810423768e-06, + "loss": 0.75, + "step": 4743 + }, + { + "epoch": 1.2017732742241924, + "grad_norm": 3.4918928146362305, + "learning_rate": 9.041437552671421e-06, + "loss": 0.7826, + "step": 4744 + }, + { + "epoch": 1.202026599113363, + "grad_norm": 3.3742682933807373, + "learning_rate": 9.0409441814374e-06, + "loss": 0.8313, + "step": 4745 + }, + { + "epoch": 1.2022799240025333, + "grad_norm": 3.8758504390716553, + "learning_rate": 9.04045069673556e-06, + "loss": 0.8295, + "step": 4746 + }, + { + "epoch": 1.2025332488917035, + "grad_norm": 3.509568214416504, + "learning_rate": 9.039957098579762e-06, + "loss": 0.8013, + "step": 4747 + }, + { + "epoch": 1.202786573780874, + "grad_norm": 3.639634609222412, + "learning_rate": 9.039463386983857e-06, + "loss": 0.8307, + "step": 4748 + }, + { + "epoch": 1.2030398986700443, + "grad_norm": 3.486271619796753, + "learning_rate": 9.038969561961716e-06, + "loss": 0.6925, + "step": 4749 + }, + { + "epoch": 1.2032932235592146, + "grad_norm": 3.6544554233551025, + "learning_rate": 9.038475623527204e-06, + "loss": 0.8655, + "step": 4750 + }, + { + "epoch": 1.2035465484483852, + "grad_norm": 3.636603832244873, + "learning_rate": 9.037981571694187e-06, + "loss": 0.7629, + "step": 4751 + }, + { + "epoch": 1.2037998733375554, + "grad_norm": 3.749607801437378, + "learning_rate": 9.037487406476541e-06, + "loss": 0.9063, + "step": 4752 + }, + { + "epoch": 1.2040531982267257, + "grad_norm": 4.07143497467041, + "learning_rate": 9.03699312788814e-06, + "loss": 0.8602, + "step": 4753 + }, + { + "epoch": 1.2043065231158963, + "grad_norm": 3.489014148712158, + "learning_rate": 9.036498735942865e-06, + "loss": 0.8314, + "step": 4754 + }, + { + "epoch": 1.2045598480050665, + "grad_norm": 3.5965168476104736, + "learning_rate": 9.036004230654595e-06, + "loss": 0.8041, + "step": 4755 + }, + { + "epoch": 1.2048131728942368, + "grad_norm": 3.958611011505127, + "learning_rate": 9.03550961203722e-06, + "loss": 0.9175, + "step": 4756 + }, + { + "epoch": 1.2050664977834071, + "grad_norm": 4.254213333129883, + "learning_rate": 9.035014880104626e-06, + "loss": 0.9666, + "step": 4757 + }, + { + "epoch": 1.2053198226725776, + "grad_norm": 3.5343542098999023, + "learning_rate": 9.034520034870706e-06, + "loss": 0.7728, + "step": 4758 + }, + { + "epoch": 1.205573147561748, + "grad_norm": 4.204711437225342, + "learning_rate": 9.034025076349353e-06, + "loss": 0.8475, + "step": 4759 + }, + { + "epoch": 1.2058264724509182, + "grad_norm": 3.3257641792297363, + "learning_rate": 9.033530004554467e-06, + "loss": 0.7982, + "step": 4760 + }, + { + "epoch": 1.2060797973400887, + "grad_norm": 3.382373332977295, + "learning_rate": 9.033034819499951e-06, + "loss": 0.6749, + "step": 4761 + }, + { + "epoch": 1.206333122229259, + "grad_norm": 3.4364678859710693, + "learning_rate": 9.032539521199705e-06, + "loss": 0.8166, + "step": 4762 + }, + { + "epoch": 1.2065864471184293, + "grad_norm": 3.706695556640625, + "learning_rate": 9.032044109667639e-06, + "loss": 0.8283, + "step": 4763 + }, + { + "epoch": 1.2068397720075998, + "grad_norm": 3.9466264247894287, + "learning_rate": 9.031548584917666e-06, + "loss": 0.7703, + "step": 4764 + }, + { + "epoch": 1.2070930968967701, + "grad_norm": 3.926689624786377, + "learning_rate": 9.031052946963697e-06, + "loss": 0.7708, + "step": 4765 + }, + { + "epoch": 1.2073464217859404, + "grad_norm": 4.062969207763672, + "learning_rate": 9.030557195819649e-06, + "loss": 0.8716, + "step": 4766 + }, + { + "epoch": 1.2075997466751107, + "grad_norm": 4.117637634277344, + "learning_rate": 9.030061331499446e-06, + "loss": 0.7546, + "step": 4767 + }, + { + "epoch": 1.2078530715642812, + "grad_norm": 3.3397328853607178, + "learning_rate": 9.029565354017009e-06, + "loss": 0.753, + "step": 4768 + }, + { + "epoch": 1.2081063964534515, + "grad_norm": 4.444040298461914, + "learning_rate": 9.029069263386267e-06, + "loss": 0.7658, + "step": 4769 + }, + { + "epoch": 1.2083597213426218, + "grad_norm": 3.76849627494812, + "learning_rate": 9.028573059621146e-06, + "loss": 0.7475, + "step": 4770 + }, + { + "epoch": 1.2086130462317923, + "grad_norm": 3.8576104640960693, + "learning_rate": 9.028076742735583e-06, + "loss": 0.7425, + "step": 4771 + }, + { + "epoch": 1.2088663711209626, + "grad_norm": 3.9090235233306885, + "learning_rate": 9.027580312743512e-06, + "loss": 0.8225, + "step": 4772 + }, + { + "epoch": 1.209119696010133, + "grad_norm": 3.612471103668213, + "learning_rate": 9.027083769658875e-06, + "loss": 0.7807, + "step": 4773 + }, + { + "epoch": 1.2093730208993034, + "grad_norm": 3.902747631072998, + "learning_rate": 9.026587113495612e-06, + "loss": 0.7209, + "step": 4774 + }, + { + "epoch": 1.2096263457884737, + "grad_norm": 3.997373342514038, + "learning_rate": 9.026090344267669e-06, + "loss": 0.8836, + "step": 4775 + }, + { + "epoch": 1.209879670677644, + "grad_norm": 3.82385516166687, + "learning_rate": 9.025593461988998e-06, + "loss": 0.7881, + "step": 4776 + }, + { + "epoch": 1.2101329955668145, + "grad_norm": 3.9499361515045166, + "learning_rate": 9.025096466673548e-06, + "loss": 0.8651, + "step": 4777 + }, + { + "epoch": 1.2103863204559848, + "grad_norm": 4.139962196350098, + "learning_rate": 9.024599358335278e-06, + "loss": 0.9294, + "step": 4778 + }, + { + "epoch": 1.2106396453451551, + "grad_norm": 3.218672037124634, + "learning_rate": 9.024102136988141e-06, + "loss": 0.6617, + "step": 4779 + }, + { + "epoch": 1.2108929702343256, + "grad_norm": 3.538665294647217, + "learning_rate": 9.023604802646104e-06, + "loss": 0.686, + "step": 4780 + }, + { + "epoch": 1.211146295123496, + "grad_norm": 3.94869065284729, + "learning_rate": 9.02310735532313e-06, + "loss": 0.8399, + "step": 4781 + }, + { + "epoch": 1.2113996200126662, + "grad_norm": 3.490705966949463, + "learning_rate": 9.022609795033187e-06, + "loss": 0.7787, + "step": 4782 + }, + { + "epoch": 1.2116529449018367, + "grad_norm": 3.956037998199463, + "learning_rate": 9.022112121790243e-06, + "loss": 0.8172, + "step": 4783 + }, + { + "epoch": 1.211906269791007, + "grad_norm": 2.635892868041992, + "learning_rate": 9.021614335608279e-06, + "loss": 0.6711, + "step": 4784 + }, + { + "epoch": 1.2121595946801773, + "grad_norm": 3.9293594360351562, + "learning_rate": 9.02111643650127e-06, + "loss": 0.7689, + "step": 4785 + }, + { + "epoch": 1.2124129195693476, + "grad_norm": 3.5471110343933105, + "learning_rate": 9.020618424483195e-06, + "loss": 0.6563, + "step": 4786 + }, + { + "epoch": 1.2126662444585181, + "grad_norm": 4.197956562042236, + "learning_rate": 9.020120299568038e-06, + "loss": 0.9316, + "step": 4787 + }, + { + "epoch": 1.2129195693476884, + "grad_norm": 3.879873037338257, + "learning_rate": 9.019622061769789e-06, + "loss": 0.9289, + "step": 4788 + }, + { + "epoch": 1.2131728942368587, + "grad_norm": 3.908384323120117, + "learning_rate": 9.019123711102434e-06, + "loss": 0.7307, + "step": 4789 + }, + { + "epoch": 1.2134262191260292, + "grad_norm": 3.6532509326934814, + "learning_rate": 9.01862524757997e-06, + "loss": 0.731, + "step": 4790 + }, + { + "epoch": 1.2136795440151995, + "grad_norm": 3.716970205307007, + "learning_rate": 9.018126671216392e-06, + "loss": 0.7126, + "step": 4791 + }, + { + "epoch": 1.2139328689043698, + "grad_norm": 3.8923962116241455, + "learning_rate": 9.017627982025701e-06, + "loss": 0.7515, + "step": 4792 + }, + { + "epoch": 1.21418619379354, + "grad_norm": 4.065270900726318, + "learning_rate": 9.0171291800219e-06, + "loss": 0.9012, + "step": 4793 + }, + { + "epoch": 1.2144395186827106, + "grad_norm": 3.6236536502838135, + "learning_rate": 9.016630265218994e-06, + "loss": 0.7596, + "step": 4794 + }, + { + "epoch": 1.214692843571881, + "grad_norm": 3.9246580600738525, + "learning_rate": 9.016131237630992e-06, + "loss": 0.9081, + "step": 4795 + }, + { + "epoch": 1.2149461684610512, + "grad_norm": 3.6482737064361572, + "learning_rate": 9.015632097271906e-06, + "loss": 0.7758, + "step": 4796 + }, + { + "epoch": 1.2151994933502217, + "grad_norm": 3.962167978286743, + "learning_rate": 9.015132844155755e-06, + "loss": 0.7765, + "step": 4797 + }, + { + "epoch": 1.215452818239392, + "grad_norm": 3.6573903560638428, + "learning_rate": 9.014633478296554e-06, + "loss": 0.8502, + "step": 4798 + }, + { + "epoch": 1.2157061431285623, + "grad_norm": 3.6809349060058594, + "learning_rate": 9.014133999708328e-06, + "loss": 0.8705, + "step": 4799 + }, + { + "epoch": 1.2159594680177328, + "grad_norm": 4.255880832672119, + "learning_rate": 9.013634408405098e-06, + "loss": 0.7849, + "step": 4800 + }, + { + "epoch": 1.216212792906903, + "grad_norm": 4.187831401824951, + "learning_rate": 9.013134704400898e-06, + "loss": 0.8874, + "step": 4801 + }, + { + "epoch": 1.2164661177960734, + "grad_norm": 3.6912200450897217, + "learning_rate": 9.012634887709755e-06, + "loss": 0.7232, + "step": 4802 + }, + { + "epoch": 1.216719442685244, + "grad_norm": 3.335015296936035, + "learning_rate": 9.012134958345703e-06, + "loss": 0.7511, + "step": 4803 + }, + { + "epoch": 1.2169727675744142, + "grad_norm": 3.792112112045288, + "learning_rate": 9.011634916322785e-06, + "loss": 0.7904, + "step": 4804 + }, + { + "epoch": 1.2172260924635845, + "grad_norm": 4.308746814727783, + "learning_rate": 9.011134761655039e-06, + "loss": 0.8286, + "step": 4805 + }, + { + "epoch": 1.217479417352755, + "grad_norm": 3.910618782043457, + "learning_rate": 9.010634494356507e-06, + "loss": 0.8875, + "step": 4806 + }, + { + "epoch": 1.2177327422419253, + "grad_norm": 3.9517557621002197, + "learning_rate": 9.010134114441237e-06, + "loss": 0.8079, + "step": 4807 + }, + { + "epoch": 1.2179860671310956, + "grad_norm": 3.4831385612487793, + "learning_rate": 9.009633621923282e-06, + "loss": 0.7058, + "step": 4808 + }, + { + "epoch": 1.218239392020266, + "grad_norm": 3.7121262550354004, + "learning_rate": 9.009133016816694e-06, + "loss": 0.7318, + "step": 4809 + }, + { + "epoch": 1.2184927169094364, + "grad_norm": 3.54034161567688, + "learning_rate": 9.00863229913553e-06, + "loss": 0.7555, + "step": 4810 + }, + { + "epoch": 1.2187460417986067, + "grad_norm": 3.896744966506958, + "learning_rate": 9.00813146889385e-06, + "loss": 0.908, + "step": 4811 + }, + { + "epoch": 1.2189993666877772, + "grad_norm": 3.977553129196167, + "learning_rate": 9.007630526105718e-06, + "loss": 0.6372, + "step": 4812 + }, + { + "epoch": 1.2192526915769475, + "grad_norm": 5.474034786224365, + "learning_rate": 9.007129470785196e-06, + "loss": 0.9058, + "step": 4813 + }, + { + "epoch": 1.2195060164661178, + "grad_norm": 3.76747989654541, + "learning_rate": 9.00662830294636e-06, + "loss": 0.8183, + "step": 4814 + }, + { + "epoch": 1.219759341355288, + "grad_norm": 3.620443105697632, + "learning_rate": 9.006127022603276e-06, + "loss": 0.6893, + "step": 4815 + }, + { + "epoch": 1.2200126662444586, + "grad_norm": 3.8057548999786377, + "learning_rate": 9.005625629770024e-06, + "loss": 0.8761, + "step": 4816 + }, + { + "epoch": 1.2202659911336289, + "grad_norm": 3.5878427028656006, + "learning_rate": 9.005124124460682e-06, + "loss": 0.7336, + "step": 4817 + }, + { + "epoch": 1.2205193160227992, + "grad_norm": 4.016486167907715, + "learning_rate": 9.004622506689331e-06, + "loss": 0.7536, + "step": 4818 + }, + { + "epoch": 1.2207726409119697, + "grad_norm": 3.7963743209838867, + "learning_rate": 9.004120776470058e-06, + "loss": 0.7734, + "step": 4819 + }, + { + "epoch": 1.22102596580114, + "grad_norm": 3.6075387001037598, + "learning_rate": 9.003618933816948e-06, + "loss": 0.7353, + "step": 4820 + }, + { + "epoch": 1.2212792906903103, + "grad_norm": 4.058751583099365, + "learning_rate": 9.003116978744098e-06, + "loss": 0.8964, + "step": 4821 + }, + { + "epoch": 1.2215326155794806, + "grad_norm": 3.674062967300415, + "learning_rate": 9.002614911265598e-06, + "loss": 0.6853, + "step": 4822 + }, + { + "epoch": 1.221785940468651, + "grad_norm": 3.7696738243103027, + "learning_rate": 9.002112731395544e-06, + "loss": 0.7808, + "step": 4823 + }, + { + "epoch": 1.2220392653578214, + "grad_norm": 4.296994686126709, + "learning_rate": 9.001610439148046e-06, + "loss": 0.9033, + "step": 4824 + }, + { + "epoch": 1.2222925902469917, + "grad_norm": 3.666095018386841, + "learning_rate": 9.001108034537199e-06, + "loss": 0.6923, + "step": 4825 + }, + { + "epoch": 1.2225459151361622, + "grad_norm": 3.87882137298584, + "learning_rate": 9.000605517577113e-06, + "loss": 0.7728, + "step": 4826 + }, + { + "epoch": 1.2227992400253325, + "grad_norm": 3.883906126022339, + "learning_rate": 9.000102888281901e-06, + "loss": 0.7317, + "step": 4827 + }, + { + "epoch": 1.2230525649145028, + "grad_norm": 4.187375068664551, + "learning_rate": 8.999600146665672e-06, + "loss": 0.8786, + "step": 4828 + }, + { + "epoch": 1.2233058898036733, + "grad_norm": 3.488924741744995, + "learning_rate": 8.999097292742549e-06, + "loss": 0.8264, + "step": 4829 + }, + { + "epoch": 1.2235592146928436, + "grad_norm": 3.888960599899292, + "learning_rate": 8.998594326526647e-06, + "loss": 0.8025, + "step": 4830 + }, + { + "epoch": 1.2238125395820139, + "grad_norm": 3.666673183441162, + "learning_rate": 8.998091248032089e-06, + "loss": 0.7531, + "step": 4831 + }, + { + "epoch": 1.2240658644711844, + "grad_norm": 3.860265016555786, + "learning_rate": 8.997588057273004e-06, + "loss": 0.7896, + "step": 4832 + }, + { + "epoch": 1.2243191893603547, + "grad_norm": 3.5526556968688965, + "learning_rate": 8.997084754263519e-06, + "loss": 0.7484, + "step": 4833 + }, + { + "epoch": 1.224572514249525, + "grad_norm": 3.7557437419891357, + "learning_rate": 8.996581339017767e-06, + "loss": 0.7255, + "step": 4834 + }, + { + "epoch": 1.2248258391386955, + "grad_norm": 3.388437271118164, + "learning_rate": 8.996077811549886e-06, + "loss": 0.8126, + "step": 4835 + }, + { + "epoch": 1.2250791640278658, + "grad_norm": 3.607306718826294, + "learning_rate": 8.995574171874011e-06, + "loss": 0.8345, + "step": 4836 + }, + { + "epoch": 1.225332488917036, + "grad_norm": 3.809396982192993, + "learning_rate": 8.995070420004286e-06, + "loss": 0.8228, + "step": 4837 + }, + { + "epoch": 1.2255858138062066, + "grad_norm": 4.179544448852539, + "learning_rate": 8.994566555954858e-06, + "loss": 0.8792, + "step": 4838 + }, + { + "epoch": 1.2258391386953769, + "grad_norm": 3.894911766052246, + "learning_rate": 8.994062579739871e-06, + "loss": 0.7968, + "step": 4839 + }, + { + "epoch": 1.2260924635845472, + "grad_norm": 3.6074442863464355, + "learning_rate": 8.993558491373479e-06, + "loss": 0.7674, + "step": 4840 + }, + { + "epoch": 1.2263457884737174, + "grad_norm": 3.7121362686157227, + "learning_rate": 8.993054290869838e-06, + "loss": 0.9287, + "step": 4841 + }, + { + "epoch": 1.226599113362888, + "grad_norm": 3.6276259422302246, + "learning_rate": 8.992549978243104e-06, + "loss": 0.7449, + "step": 4842 + }, + { + "epoch": 1.2268524382520583, + "grad_norm": 3.604879856109619, + "learning_rate": 8.992045553507436e-06, + "loss": 0.858, + "step": 4843 + }, + { + "epoch": 1.2271057631412285, + "grad_norm": 3.6814112663269043, + "learning_rate": 8.991541016677002e-06, + "loss": 0.8183, + "step": 4844 + }, + { + "epoch": 1.227359088030399, + "grad_norm": 3.4473230838775635, + "learning_rate": 8.991036367765964e-06, + "loss": 0.7232, + "step": 4845 + }, + { + "epoch": 1.2276124129195694, + "grad_norm": 3.860246181488037, + "learning_rate": 8.990531606788497e-06, + "loss": 0.773, + "step": 4846 + }, + { + "epoch": 1.2278657378087396, + "grad_norm": 3.4309897422790527, + "learning_rate": 8.990026733758772e-06, + "loss": 0.7406, + "step": 4847 + }, + { + "epoch": 1.2281190626979102, + "grad_norm": 3.4666717052459717, + "learning_rate": 8.989521748690969e-06, + "loss": 0.8216, + "step": 4848 + }, + { + "epoch": 1.2283723875870804, + "grad_norm": 3.6101233959198, + "learning_rate": 8.989016651599262e-06, + "loss": 0.7905, + "step": 4849 + }, + { + "epoch": 1.2286257124762507, + "grad_norm": 3.8484609127044678, + "learning_rate": 8.988511442497839e-06, + "loss": 0.8098, + "step": 4850 + }, + { + "epoch": 1.228879037365421, + "grad_norm": 3.559965133666992, + "learning_rate": 8.988006121400881e-06, + "loss": 0.7643, + "step": 4851 + }, + { + "epoch": 1.2291323622545915, + "grad_norm": 3.5645501613616943, + "learning_rate": 8.987500688322583e-06, + "loss": 0.6875, + "step": 4852 + }, + { + "epoch": 1.2293856871437618, + "grad_norm": 3.5796780586242676, + "learning_rate": 8.986995143277134e-06, + "loss": 0.741, + "step": 4853 + }, + { + "epoch": 1.2296390120329321, + "grad_norm": 3.296039581298828, + "learning_rate": 8.98648948627873e-06, + "loss": 0.7354, + "step": 4854 + }, + { + "epoch": 1.2298923369221026, + "grad_norm": 3.3527204990386963, + "learning_rate": 8.985983717341568e-06, + "loss": 0.7294, + "step": 4855 + }, + { + "epoch": 1.230145661811273, + "grad_norm": 4.831778526306152, + "learning_rate": 8.985477836479855e-06, + "loss": 1.0119, + "step": 4856 + }, + { + "epoch": 1.2303989867004432, + "grad_norm": 3.175151824951172, + "learning_rate": 8.984971843707787e-06, + "loss": 0.7934, + "step": 4857 + }, + { + "epoch": 1.2306523115896137, + "grad_norm": 3.8539159297943115, + "learning_rate": 8.984465739039583e-06, + "loss": 0.9443, + "step": 4858 + }, + { + "epoch": 1.230905636478784, + "grad_norm": 3.8513283729553223, + "learning_rate": 8.983959522489445e-06, + "loss": 0.7991, + "step": 4859 + }, + { + "epoch": 1.2311589613679543, + "grad_norm": 3.5532569885253906, + "learning_rate": 8.983453194071592e-06, + "loss": 0.883, + "step": 4860 + }, + { + "epoch": 1.2314122862571248, + "grad_norm": 3.6331706047058105, + "learning_rate": 8.982946753800238e-06, + "loss": 0.8583, + "step": 4861 + }, + { + "epoch": 1.2316656111462951, + "grad_norm": 3.5020289421081543, + "learning_rate": 8.982440201689609e-06, + "loss": 0.7938, + "step": 4862 + }, + { + "epoch": 1.2319189360354654, + "grad_norm": 3.7837648391723633, + "learning_rate": 8.981933537753925e-06, + "loss": 0.7653, + "step": 4863 + }, + { + "epoch": 1.232172260924636, + "grad_norm": 3.6381020545959473, + "learning_rate": 8.981426762007412e-06, + "loss": 0.8849, + "step": 4864 + }, + { + "epoch": 1.2324255858138062, + "grad_norm": 3.292473077774048, + "learning_rate": 8.980919874464302e-06, + "loss": 0.7163, + "step": 4865 + }, + { + "epoch": 1.2326789107029765, + "grad_norm": 3.657729148864746, + "learning_rate": 8.98041287513883e-06, + "loss": 0.7659, + "step": 4866 + }, + { + "epoch": 1.232932235592147, + "grad_norm": 3.368623971939087, + "learning_rate": 8.979905764045227e-06, + "loss": 0.7878, + "step": 4867 + }, + { + "epoch": 1.2331855604813173, + "grad_norm": 3.797635793685913, + "learning_rate": 8.979398541197736e-06, + "loss": 0.7507, + "step": 4868 + }, + { + "epoch": 1.2334388853704876, + "grad_norm": 3.610055446624756, + "learning_rate": 8.9788912066106e-06, + "loss": 0.9107, + "step": 4869 + }, + { + "epoch": 1.233692210259658, + "grad_norm": 3.4671168327331543, + "learning_rate": 8.978383760298063e-06, + "loss": 0.7952, + "step": 4870 + }, + { + "epoch": 1.2339455351488284, + "grad_norm": 3.547825336456299, + "learning_rate": 8.977876202274377e-06, + "loss": 0.7668, + "step": 4871 + }, + { + "epoch": 1.2341988600379987, + "grad_norm": 3.617056369781494, + "learning_rate": 8.977368532553787e-06, + "loss": 0.7, + "step": 4872 + }, + { + "epoch": 1.234452184927169, + "grad_norm": 3.9896669387817383, + "learning_rate": 8.976860751150555e-06, + "loss": 0.822, + "step": 4873 + }, + { + "epoch": 1.2347055098163395, + "grad_norm": 3.564788341522217, + "learning_rate": 8.976352858078938e-06, + "loss": 0.8184, + "step": 4874 + }, + { + "epoch": 1.2349588347055098, + "grad_norm": 3.5641210079193115, + "learning_rate": 8.975844853353195e-06, + "loss": 0.848, + "step": 4875 + }, + { + "epoch": 1.2352121595946801, + "grad_norm": 3.7378146648406982, + "learning_rate": 8.975336736987593e-06, + "loss": 0.7395, + "step": 4876 + }, + { + "epoch": 1.2354654844838506, + "grad_norm": 3.9268436431884766, + "learning_rate": 8.974828508996398e-06, + "loss": 0.7369, + "step": 4877 + }, + { + "epoch": 1.235718809373021, + "grad_norm": 3.9070422649383545, + "learning_rate": 8.974320169393882e-06, + "loss": 0.7026, + "step": 4878 + }, + { + "epoch": 1.2359721342621912, + "grad_norm": 3.6587164402008057, + "learning_rate": 8.973811718194317e-06, + "loss": 0.8198, + "step": 4879 + }, + { + "epoch": 1.2362254591513615, + "grad_norm": 3.9976251125335693, + "learning_rate": 8.973303155411981e-06, + "loss": 0.847, + "step": 4880 + }, + { + "epoch": 1.236478784040532, + "grad_norm": 3.8499391078948975, + "learning_rate": 8.972794481061156e-06, + "loss": 0.8002, + "step": 4881 + }, + { + "epoch": 1.2367321089297023, + "grad_norm": 3.5910487174987793, + "learning_rate": 8.972285695156124e-06, + "loss": 0.8595, + "step": 4882 + }, + { + "epoch": 1.2369854338188726, + "grad_norm": 3.729759931564331, + "learning_rate": 8.971776797711171e-06, + "loss": 0.7493, + "step": 4883 + }, + { + "epoch": 1.2372387587080431, + "grad_norm": 3.6986725330352783, + "learning_rate": 8.971267788740587e-06, + "loss": 0.8213, + "step": 4884 + }, + { + "epoch": 1.2374920835972134, + "grad_norm": 4.1502909660339355, + "learning_rate": 8.970758668258665e-06, + "loss": 0.7147, + "step": 4885 + }, + { + "epoch": 1.2377454084863837, + "grad_norm": 3.77043080329895, + "learning_rate": 8.970249436279702e-06, + "loss": 0.7631, + "step": 4886 + }, + { + "epoch": 1.2379987333755542, + "grad_norm": 3.1170456409454346, + "learning_rate": 8.969740092817992e-06, + "loss": 0.691, + "step": 4887 + }, + { + "epoch": 1.2382520582647245, + "grad_norm": 3.482490301132202, + "learning_rate": 8.969230637887842e-06, + "loss": 0.7393, + "step": 4888 + }, + { + "epoch": 1.2385053831538948, + "grad_norm": 4.178841590881348, + "learning_rate": 8.968721071503557e-06, + "loss": 0.9059, + "step": 4889 + }, + { + "epoch": 1.2387587080430653, + "grad_norm": 4.210383415222168, + "learning_rate": 8.968211393679445e-06, + "loss": 0.8748, + "step": 4890 + }, + { + "epoch": 1.2390120329322356, + "grad_norm": 4.279092311859131, + "learning_rate": 8.967701604429814e-06, + "loss": 0.8227, + "step": 4891 + }, + { + "epoch": 1.239265357821406, + "grad_norm": 3.8279335498809814, + "learning_rate": 8.967191703768984e-06, + "loss": 0.7862, + "step": 4892 + }, + { + "epoch": 1.2395186827105764, + "grad_norm": 3.838911533355713, + "learning_rate": 8.96668169171127e-06, + "loss": 0.7087, + "step": 4893 + }, + { + "epoch": 1.2397720075997467, + "grad_norm": 3.7137227058410645, + "learning_rate": 8.966171568270994e-06, + "loss": 0.7594, + "step": 4894 + }, + { + "epoch": 1.240025332488917, + "grad_norm": 3.868379831314087, + "learning_rate": 8.965661333462478e-06, + "loss": 0.6866, + "step": 4895 + }, + { + "epoch": 1.2402786573780875, + "grad_norm": 3.4717066287994385, + "learning_rate": 8.965150987300052e-06, + "loss": 0.7789, + "step": 4896 + }, + { + "epoch": 1.2405319822672578, + "grad_norm": 3.9209249019622803, + "learning_rate": 8.964640529798041e-06, + "loss": 0.8094, + "step": 4897 + }, + { + "epoch": 1.240785307156428, + "grad_norm": 3.8062753677368164, + "learning_rate": 8.964129960970785e-06, + "loss": 0.8163, + "step": 4898 + }, + { + "epoch": 1.2410386320455984, + "grad_norm": 3.9788119792938232, + "learning_rate": 8.963619280832617e-06, + "loss": 0.8038, + "step": 4899 + }, + { + "epoch": 1.241291956934769, + "grad_norm": 3.948587417602539, + "learning_rate": 8.963108489397875e-06, + "loss": 0.9206, + "step": 4900 + }, + { + "epoch": 1.2415452818239392, + "grad_norm": 3.2996327877044678, + "learning_rate": 8.962597586680908e-06, + "loss": 0.8708, + "step": 4901 + }, + { + "epoch": 1.2417986067131095, + "grad_norm": 3.3904151916503906, + "learning_rate": 8.962086572696055e-06, + "loss": 0.7771, + "step": 4902 + }, + { + "epoch": 1.24205193160228, + "grad_norm": 3.5035574436187744, + "learning_rate": 8.961575447457669e-06, + "loss": 0.8709, + "step": 4903 + }, + { + "epoch": 1.2423052564914503, + "grad_norm": 3.2244651317596436, + "learning_rate": 8.961064210980101e-06, + "loss": 0.7146, + "step": 4904 + }, + { + "epoch": 1.2425585813806206, + "grad_norm": 3.9018938541412354, + "learning_rate": 8.960552863277707e-06, + "loss": 0.8699, + "step": 4905 + }, + { + "epoch": 1.242811906269791, + "grad_norm": 3.8801350593566895, + "learning_rate": 8.960041404364845e-06, + "loss": 0.866, + "step": 4906 + }, + { + "epoch": 1.2430652311589614, + "grad_norm": 3.586246967315674, + "learning_rate": 8.959529834255876e-06, + "loss": 0.7806, + "step": 4907 + }, + { + "epoch": 1.2433185560481317, + "grad_norm": 3.703272581100464, + "learning_rate": 8.959018152965164e-06, + "loss": 0.7762, + "step": 4908 + }, + { + "epoch": 1.243571880937302, + "grad_norm": 3.905339479446411, + "learning_rate": 8.958506360507077e-06, + "loss": 0.7887, + "step": 4909 + }, + { + "epoch": 1.2438252058264725, + "grad_norm": 3.5449533462524414, + "learning_rate": 8.957994456895989e-06, + "loss": 0.6952, + "step": 4910 + }, + { + "epoch": 1.2440785307156428, + "grad_norm": 3.859621524810791, + "learning_rate": 8.957482442146271e-06, + "loss": 0.8361, + "step": 4911 + }, + { + "epoch": 1.244331855604813, + "grad_norm": 3.862034797668457, + "learning_rate": 8.956970316272301e-06, + "loss": 0.7858, + "step": 4912 + }, + { + "epoch": 1.2445851804939836, + "grad_norm": 3.644862651824951, + "learning_rate": 8.956458079288459e-06, + "loss": 0.8152, + "step": 4913 + }, + { + "epoch": 1.2448385053831539, + "grad_norm": 3.7374160289764404, + "learning_rate": 8.955945731209128e-06, + "loss": 0.7904, + "step": 4914 + }, + { + "epoch": 1.2450918302723242, + "grad_norm": 3.565932035446167, + "learning_rate": 8.955433272048694e-06, + "loss": 0.7909, + "step": 4915 + }, + { + "epoch": 1.2453451551614947, + "grad_norm": 3.4232900142669678, + "learning_rate": 8.95492070182155e-06, + "loss": 0.7504, + "step": 4916 + }, + { + "epoch": 1.245598480050665, + "grad_norm": 3.5261430740356445, + "learning_rate": 8.954408020542084e-06, + "loss": 0.6876, + "step": 4917 + }, + { + "epoch": 1.2458518049398353, + "grad_norm": 3.4949758052825928, + "learning_rate": 8.953895228224697e-06, + "loss": 0.6847, + "step": 4918 + }, + { + "epoch": 1.2461051298290058, + "grad_norm": 3.655601739883423, + "learning_rate": 8.953382324883782e-06, + "loss": 0.7402, + "step": 4919 + }, + { + "epoch": 1.246358454718176, + "grad_norm": 3.472031593322754, + "learning_rate": 8.952869310533744e-06, + "loss": 0.6989, + "step": 4920 + }, + { + "epoch": 1.2466117796073464, + "grad_norm": 3.8667638301849365, + "learning_rate": 8.95235618518899e-06, + "loss": 0.8203, + "step": 4921 + }, + { + "epoch": 1.2468651044965169, + "grad_norm": 3.60306978225708, + "learning_rate": 8.951842948863927e-06, + "loss": 0.8294, + "step": 4922 + }, + { + "epoch": 1.2471184293856872, + "grad_norm": 3.7473983764648438, + "learning_rate": 8.951329601572965e-06, + "loss": 0.7854, + "step": 4923 + }, + { + "epoch": 1.2473717542748575, + "grad_norm": 3.660061836242676, + "learning_rate": 8.95081614333052e-06, + "loss": 0.8648, + "step": 4924 + }, + { + "epoch": 1.247625079164028, + "grad_norm": 3.6376841068267822, + "learning_rate": 8.95030257415101e-06, + "loss": 0.8261, + "step": 4925 + }, + { + "epoch": 1.2478784040531983, + "grad_norm": 3.5895543098449707, + "learning_rate": 8.949788894048853e-06, + "loss": 0.811, + "step": 4926 + }, + { + "epoch": 1.2481317289423686, + "grad_norm": 3.393707752227783, + "learning_rate": 8.949275103038479e-06, + "loss": 0.8807, + "step": 4927 + }, + { + "epoch": 1.2483850538315389, + "grad_norm": 3.668567657470703, + "learning_rate": 8.948761201134309e-06, + "loss": 0.6899, + "step": 4928 + }, + { + "epoch": 1.2486383787207094, + "grad_norm": 3.5369865894317627, + "learning_rate": 8.948247188350773e-06, + "loss": 0.8388, + "step": 4929 + }, + { + "epoch": 1.2488917036098797, + "grad_norm": 3.3084373474121094, + "learning_rate": 8.947733064702308e-06, + "loss": 0.7367, + "step": 4930 + }, + { + "epoch": 1.24914502849905, + "grad_norm": 3.5315730571746826, + "learning_rate": 8.94721883020335e-06, + "loss": 0.7929, + "step": 4931 + }, + { + "epoch": 1.2493983533882205, + "grad_norm": 3.397585391998291, + "learning_rate": 8.946704484868337e-06, + "loss": 0.8362, + "step": 4932 + }, + { + "epoch": 1.2496516782773908, + "grad_norm": 4.010646343231201, + "learning_rate": 8.946190028711712e-06, + "loss": 0.7752, + "step": 4933 + }, + { + "epoch": 1.249905003166561, + "grad_norm": 4.345476150512695, + "learning_rate": 8.945675461747919e-06, + "loss": 0.7777, + "step": 4934 + }, + { + "epoch": 1.2501583280557313, + "grad_norm": 3.873446464538574, + "learning_rate": 8.94516078399141e-06, + "loss": 0.7596, + "step": 4935 + }, + { + "epoch": 1.2504116529449019, + "grad_norm": 3.6672351360321045, + "learning_rate": 8.944645995456634e-06, + "loss": 0.7857, + "step": 4936 + }, + { + "epoch": 1.2506649778340722, + "grad_norm": 3.8857362270355225, + "learning_rate": 8.944131096158046e-06, + "loss": 0.7407, + "step": 4937 + }, + { + "epoch": 1.2509183027232424, + "grad_norm": 4.123960494995117, + "learning_rate": 8.943616086110107e-06, + "loss": 0.8535, + "step": 4938 + }, + { + "epoch": 1.251171627612413, + "grad_norm": 4.322494983673096, + "learning_rate": 8.943100965327276e-06, + "loss": 0.7959, + "step": 4939 + }, + { + "epoch": 1.2514249525015833, + "grad_norm": 3.6760008335113525, + "learning_rate": 8.942585733824018e-06, + "loss": 0.7764, + "step": 4940 + }, + { + "epoch": 1.2516782773907535, + "grad_norm": 3.91406512260437, + "learning_rate": 8.9420703916148e-06, + "loss": 0.9002, + "step": 4941 + }, + { + "epoch": 1.251931602279924, + "grad_norm": 3.5883524417877197, + "learning_rate": 8.941554938714094e-06, + "loss": 0.65, + "step": 4942 + }, + { + "epoch": 1.2521849271690944, + "grad_norm": 3.617281675338745, + "learning_rate": 8.94103937513637e-06, + "loss": 0.7746, + "step": 4943 + }, + { + "epoch": 1.2524382520582646, + "grad_norm": 3.4269120693206787, + "learning_rate": 8.940523700896111e-06, + "loss": 0.8321, + "step": 4944 + }, + { + "epoch": 1.2526915769474352, + "grad_norm": 3.3844025135040283, + "learning_rate": 8.940007916007792e-06, + "loss": 0.8338, + "step": 4945 + }, + { + "epoch": 1.2529449018366055, + "grad_norm": 4.153983116149902, + "learning_rate": 8.939492020485898e-06, + "loss": 0.9274, + "step": 4946 + }, + { + "epoch": 1.2531982267257757, + "grad_norm": 3.826216459274292, + "learning_rate": 8.938976014344913e-06, + "loss": 0.6986, + "step": 4947 + }, + { + "epoch": 1.2534515516149463, + "grad_norm": 3.4436357021331787, + "learning_rate": 8.938459897599327e-06, + "loss": 0.6969, + "step": 4948 + }, + { + "epoch": 1.2537048765041165, + "grad_norm": 4.002293586730957, + "learning_rate": 8.937943670263636e-06, + "loss": 0.6836, + "step": 4949 + }, + { + "epoch": 1.2539582013932868, + "grad_norm": 3.919435501098633, + "learning_rate": 8.93742733235233e-06, + "loss": 0.7949, + "step": 4950 + }, + { + "epoch": 1.2542115262824574, + "grad_norm": 3.728658437728882, + "learning_rate": 8.93691088387991e-06, + "loss": 0.7893, + "step": 4951 + }, + { + "epoch": 1.2544648511716276, + "grad_norm": 3.820829153060913, + "learning_rate": 8.93639432486088e-06, + "loss": 0.8745, + "step": 4952 + }, + { + "epoch": 1.254718176060798, + "grad_norm": 3.7555582523345947, + "learning_rate": 8.935877655309739e-06, + "loss": 0.7299, + "step": 4953 + }, + { + "epoch": 1.2549715009499685, + "grad_norm": 3.7295260429382324, + "learning_rate": 8.935360875241e-06, + "loss": 0.8915, + "step": 4954 + }, + { + "epoch": 1.2552248258391387, + "grad_norm": 3.7932441234588623, + "learning_rate": 8.934843984669171e-06, + "loss": 0.7672, + "step": 4955 + }, + { + "epoch": 1.255478150728309, + "grad_norm": 4.009216785430908, + "learning_rate": 8.934326983608768e-06, + "loss": 0.8522, + "step": 4956 + }, + { + "epoch": 1.2557314756174796, + "grad_norm": 3.5970778465270996, + "learning_rate": 8.933809872074306e-06, + "loss": 0.8272, + "step": 4957 + }, + { + "epoch": 1.2559848005066498, + "grad_norm": 3.852548599243164, + "learning_rate": 8.933292650080307e-06, + "loss": 0.7751, + "step": 4958 + }, + { + "epoch": 1.2562381253958201, + "grad_norm": 3.8101701736450195, + "learning_rate": 8.932775317641294e-06, + "loss": 0.7083, + "step": 4959 + }, + { + "epoch": 1.2564914502849904, + "grad_norm": 4.030754566192627, + "learning_rate": 8.932257874771792e-06, + "loss": 0.9273, + "step": 4960 + }, + { + "epoch": 1.256744775174161, + "grad_norm": 4.145448684692383, + "learning_rate": 8.931740321486335e-06, + "loss": 0.7546, + "step": 4961 + }, + { + "epoch": 1.2569981000633312, + "grad_norm": 3.3041367530822754, + "learning_rate": 8.93122265779945e-06, + "loss": 0.857, + "step": 4962 + }, + { + "epoch": 1.2572514249525015, + "grad_norm": 3.5868442058563232, + "learning_rate": 8.930704883725676e-06, + "loss": 0.7319, + "step": 4963 + }, + { + "epoch": 1.2575047498416718, + "grad_norm": 3.6959526538848877, + "learning_rate": 8.93018699927955e-06, + "loss": 0.7746, + "step": 4964 + }, + { + "epoch": 1.2577580747308423, + "grad_norm": 3.8091931343078613, + "learning_rate": 8.929669004475616e-06, + "loss": 0.8816, + "step": 4965 + }, + { + "epoch": 1.2580113996200126, + "grad_norm": 3.836026430130005, + "learning_rate": 8.929150899328418e-06, + "loss": 0.7411, + "step": 4966 + }, + { + "epoch": 1.258264724509183, + "grad_norm": 3.737752914428711, + "learning_rate": 8.928632683852504e-06, + "loss": 0.7715, + "step": 4967 + }, + { + "epoch": 1.2585180493983534, + "grad_norm": 3.351428270339966, + "learning_rate": 8.928114358062428e-06, + "loss": 0.7465, + "step": 4968 + }, + { + "epoch": 1.2587713742875237, + "grad_norm": 4.012657165527344, + "learning_rate": 8.927595921972738e-06, + "loss": 0.7875, + "step": 4969 + }, + { + "epoch": 1.259024699176694, + "grad_norm": 3.795178174972534, + "learning_rate": 8.927077375597997e-06, + "loss": 0.8054, + "step": 4970 + }, + { + "epoch": 1.2592780240658645, + "grad_norm": 3.275935173034668, + "learning_rate": 8.926558718952765e-06, + "loss": 0.7714, + "step": 4971 + }, + { + "epoch": 1.2595313489550348, + "grad_norm": 4.029940128326416, + "learning_rate": 8.926039952051603e-06, + "loss": 0.8743, + "step": 4972 + }, + { + "epoch": 1.2597846738442051, + "grad_norm": 3.473362922668457, + "learning_rate": 8.925521074909078e-06, + "loss": 0.6853, + "step": 4973 + }, + { + "epoch": 1.2600379987333756, + "grad_norm": 3.401608467102051, + "learning_rate": 8.925002087539763e-06, + "loss": 0.7765, + "step": 4974 + }, + { + "epoch": 1.260291323622546, + "grad_norm": 4.135184288024902, + "learning_rate": 8.92448298995823e-06, + "loss": 0.7321, + "step": 4975 + }, + { + "epoch": 1.2605446485117162, + "grad_norm": 3.2951271533966064, + "learning_rate": 8.923963782179051e-06, + "loss": 0.7111, + "step": 4976 + }, + { + "epoch": 1.2607979734008867, + "grad_norm": 3.373471975326538, + "learning_rate": 8.92344446421681e-06, + "loss": 0.7699, + "step": 4977 + }, + { + "epoch": 1.261051298290057, + "grad_norm": 3.8345279693603516, + "learning_rate": 8.922925036086086e-06, + "loss": 0.7885, + "step": 4978 + }, + { + "epoch": 1.2613046231792273, + "grad_norm": 3.6304073333740234, + "learning_rate": 8.922405497801468e-06, + "loss": 0.8356, + "step": 4979 + }, + { + "epoch": 1.2615579480683978, + "grad_norm": 3.9937851428985596, + "learning_rate": 8.921885849377539e-06, + "loss": 0.8689, + "step": 4980 + }, + { + "epoch": 1.2618112729575681, + "grad_norm": 4.083383560180664, + "learning_rate": 8.921366090828896e-06, + "loss": 0.8164, + "step": 4981 + }, + { + "epoch": 1.2620645978467384, + "grad_norm": 3.7671871185302734, + "learning_rate": 8.920846222170129e-06, + "loss": 0.7979, + "step": 4982 + }, + { + "epoch": 1.262317922735909, + "grad_norm": 4.072494029998779, + "learning_rate": 8.920326243415839e-06, + "loss": 0.8004, + "step": 4983 + }, + { + "epoch": 1.2625712476250792, + "grad_norm": 3.728637933731079, + "learning_rate": 8.919806154580623e-06, + "loss": 0.8023, + "step": 4984 + }, + { + "epoch": 1.2628245725142495, + "grad_norm": 3.6757984161376953, + "learning_rate": 8.919285955679092e-06, + "loss": 0.8087, + "step": 4985 + }, + { + "epoch": 1.26307789740342, + "grad_norm": 3.940227508544922, + "learning_rate": 8.918765646725845e-06, + "loss": 0.7942, + "step": 4986 + }, + { + "epoch": 1.2633312222925903, + "grad_norm": 3.649754762649536, + "learning_rate": 8.918245227735494e-06, + "loss": 0.727, + "step": 4987 + }, + { + "epoch": 1.2635845471817606, + "grad_norm": 3.6697895526885986, + "learning_rate": 8.917724698722657e-06, + "loss": 0.683, + "step": 4988 + }, + { + "epoch": 1.263837872070931, + "grad_norm": 4.008494853973389, + "learning_rate": 8.917204059701942e-06, + "loss": 0.7574, + "step": 4989 + }, + { + "epoch": 1.2640911969601012, + "grad_norm": 3.700213670730591, + "learning_rate": 8.916683310687977e-06, + "loss": 0.6426, + "step": 4990 + }, + { + "epoch": 1.2643445218492717, + "grad_norm": 4.02217435836792, + "learning_rate": 8.916162451695378e-06, + "loss": 0.7463, + "step": 4991 + }, + { + "epoch": 1.264597846738442, + "grad_norm": 3.6228909492492676, + "learning_rate": 8.915641482738775e-06, + "loss": 0.7692, + "step": 4992 + }, + { + "epoch": 1.2648511716276123, + "grad_norm": 3.844886541366577, + "learning_rate": 8.915120403832793e-06, + "loss": 0.7234, + "step": 4993 + }, + { + "epoch": 1.2651044965167828, + "grad_norm": 3.5308997631073, + "learning_rate": 8.914599214992065e-06, + "loss": 0.8154, + "step": 4994 + }, + { + "epoch": 1.265357821405953, + "grad_norm": 4.345712184906006, + "learning_rate": 8.914077916231225e-06, + "loss": 0.8576, + "step": 4995 + }, + { + "epoch": 1.2656111462951234, + "grad_norm": 3.661726474761963, + "learning_rate": 8.913556507564914e-06, + "loss": 0.6412, + "step": 4996 + }, + { + "epoch": 1.265864471184294, + "grad_norm": 3.592749834060669, + "learning_rate": 8.913034989007767e-06, + "loss": 0.8385, + "step": 4997 + }, + { + "epoch": 1.2661177960734642, + "grad_norm": 3.512347459793091, + "learning_rate": 8.912513360574435e-06, + "loss": 0.8836, + "step": 4998 + }, + { + "epoch": 1.2663711209626345, + "grad_norm": 3.67021107673645, + "learning_rate": 8.911991622279559e-06, + "loss": 0.7278, + "step": 4999 + }, + { + "epoch": 1.266624445851805, + "grad_norm": 3.180130958557129, + "learning_rate": 8.911469774137793e-06, + "loss": 0.7645, + "step": 5000 + }, + { + "epoch": 1.266624445851805, + "eval_loss": 1.1878291368484497, + "eval_runtime": 13.9366, + "eval_samples_per_second": 28.701, + "eval_steps_per_second": 3.588, + "step": 5000 + }, + { + "epoch": 1.2668777707409753, + "grad_norm": 3.49995493888855, + "learning_rate": 8.910947816163787e-06, + "loss": 0.7172, + "step": 5001 + }, + { + "epoch": 1.2671310956301456, + "grad_norm": 3.2964305877685547, + "learning_rate": 8.910425748372202e-06, + "loss": 0.7972, + "step": 5002 + }, + { + "epoch": 1.267384420519316, + "grad_norm": 4.145137310028076, + "learning_rate": 8.909903570777692e-06, + "loss": 0.8347, + "step": 5003 + }, + { + "epoch": 1.2676377454084864, + "grad_norm": 3.467320442199707, + "learning_rate": 8.909381283394925e-06, + "loss": 0.8494, + "step": 5004 + }, + { + "epoch": 1.2678910702976567, + "grad_norm": 3.2528982162475586, + "learning_rate": 8.908858886238562e-06, + "loss": 0.7902, + "step": 5005 + }, + { + "epoch": 1.2681443951868272, + "grad_norm": 3.5220718383789062, + "learning_rate": 8.908336379323274e-06, + "loss": 0.7353, + "step": 5006 + }, + { + "epoch": 1.2683977200759975, + "grad_norm": 3.4364192485809326, + "learning_rate": 8.907813762663731e-06, + "loss": 0.8121, + "step": 5007 + }, + { + "epoch": 1.2686510449651678, + "grad_norm": 3.683168411254883, + "learning_rate": 8.907291036274612e-06, + "loss": 0.7455, + "step": 5008 + }, + { + "epoch": 1.2689043698543383, + "grad_norm": 3.633958101272583, + "learning_rate": 8.906768200170589e-06, + "loss": 0.6622, + "step": 5009 + }, + { + "epoch": 1.2691576947435086, + "grad_norm": 3.8055920600891113, + "learning_rate": 8.906245254366348e-06, + "loss": 0.8261, + "step": 5010 + }, + { + "epoch": 1.2694110196326789, + "grad_norm": 3.2654600143432617, + "learning_rate": 8.90572219887657e-06, + "loss": 0.6898, + "step": 5011 + }, + { + "epoch": 1.2696643445218494, + "grad_norm": 4.933192253112793, + "learning_rate": 8.905199033715943e-06, + "loss": 0.8274, + "step": 5012 + }, + { + "epoch": 1.2699176694110197, + "grad_norm": 3.869718074798584, + "learning_rate": 8.904675758899157e-06, + "loss": 0.7575, + "step": 5013 + }, + { + "epoch": 1.27017099430019, + "grad_norm": 3.6533114910125732, + "learning_rate": 8.904152374440908e-06, + "loss": 0.8566, + "step": 5014 + }, + { + "epoch": 1.2704243191893603, + "grad_norm": 3.694594383239746, + "learning_rate": 8.903628880355888e-06, + "loss": 0.6968, + "step": 5015 + }, + { + "epoch": 1.2706776440785308, + "grad_norm": 3.6560215950012207, + "learning_rate": 8.903105276658799e-06, + "loss": 0.8285, + "step": 5016 + }, + { + "epoch": 1.270930968967701, + "grad_norm": 4.171489238739014, + "learning_rate": 8.902581563364344e-06, + "loss": 0.8613, + "step": 5017 + }, + { + "epoch": 1.2711842938568714, + "grad_norm": 3.6669111251831055, + "learning_rate": 8.902057740487227e-06, + "loss": 0.6709, + "step": 5018 + }, + { + "epoch": 1.2714376187460417, + "grad_norm": 3.866795301437378, + "learning_rate": 8.901533808042157e-06, + "loss": 0.8865, + "step": 5019 + }, + { + "epoch": 1.2716909436352122, + "grad_norm": 4.075090408325195, + "learning_rate": 8.901009766043846e-06, + "loss": 0.7882, + "step": 5020 + }, + { + "epoch": 1.2719442685243825, + "grad_norm": 3.5593883991241455, + "learning_rate": 8.90048561450701e-06, + "loss": 0.8008, + "step": 5021 + }, + { + "epoch": 1.2721975934135528, + "grad_norm": 4.1571478843688965, + "learning_rate": 8.899961353446367e-06, + "loss": 0.6434, + "step": 5022 + }, + { + "epoch": 1.2724509183027233, + "grad_norm": 3.5136566162109375, + "learning_rate": 8.899436982876635e-06, + "loss": 0.753, + "step": 5023 + }, + { + "epoch": 1.2727042431918936, + "grad_norm": 4.385815620422363, + "learning_rate": 8.89891250281254e-06, + "loss": 0.814, + "step": 5024 + }, + { + "epoch": 1.2729575680810639, + "grad_norm": 4.039702892303467, + "learning_rate": 8.898387913268812e-06, + "loss": 0.9063, + "step": 5025 + }, + { + "epoch": 1.2732108929702344, + "grad_norm": 3.526564121246338, + "learning_rate": 8.897863214260176e-06, + "loss": 0.7967, + "step": 5026 + }, + { + "epoch": 1.2734642178594047, + "grad_norm": 3.5580739974975586, + "learning_rate": 8.897338405801368e-06, + "loss": 0.8186, + "step": 5027 + }, + { + "epoch": 1.273717542748575, + "grad_norm": 3.797952175140381, + "learning_rate": 8.896813487907125e-06, + "loss": 0.6974, + "step": 5028 + }, + { + "epoch": 1.2739708676377455, + "grad_norm": 3.9370381832122803, + "learning_rate": 8.896288460592187e-06, + "loss": 0.7742, + "step": 5029 + }, + { + "epoch": 1.2742241925269158, + "grad_norm": 3.9475274085998535, + "learning_rate": 8.895763323871292e-06, + "loss": 0.8672, + "step": 5030 + }, + { + "epoch": 1.274477517416086, + "grad_norm": 3.5705692768096924, + "learning_rate": 8.89523807775919e-06, + "loss": 0.7926, + "step": 5031 + }, + { + "epoch": 1.2747308423052566, + "grad_norm": 5.121907711029053, + "learning_rate": 8.89471272227063e-06, + "loss": 0.7513, + "step": 5032 + }, + { + "epoch": 1.2749841671944269, + "grad_norm": 3.21358060836792, + "learning_rate": 8.89418725742036e-06, + "loss": 0.8127, + "step": 5033 + }, + { + "epoch": 1.2752374920835972, + "grad_norm": 3.715261220932007, + "learning_rate": 8.893661683223136e-06, + "loss": 0.8201, + "step": 5034 + }, + { + "epoch": 1.2754908169727677, + "grad_norm": 3.8979718685150146, + "learning_rate": 8.89313599969372e-06, + "loss": 0.8807, + "step": 5035 + }, + { + "epoch": 1.275744141861938, + "grad_norm": 3.51139497756958, + "learning_rate": 8.892610206846868e-06, + "loss": 0.7374, + "step": 5036 + }, + { + "epoch": 1.2759974667511083, + "grad_norm": 3.727140188217163, + "learning_rate": 8.892084304697347e-06, + "loss": 0.8441, + "step": 5037 + }, + { + "epoch": 1.2762507916402788, + "grad_norm": 3.341991424560547, + "learning_rate": 8.891558293259921e-06, + "loss": 0.6944, + "step": 5038 + }, + { + "epoch": 1.276504116529449, + "grad_norm": 3.6908040046691895, + "learning_rate": 8.891032172549362e-06, + "loss": 0.7233, + "step": 5039 + }, + { + "epoch": 1.2767574414186194, + "grad_norm": 3.654240608215332, + "learning_rate": 8.890505942580443e-06, + "loss": 0.8648, + "step": 5040 + }, + { + "epoch": 1.2770107663077899, + "grad_norm": 3.500528335571289, + "learning_rate": 8.889979603367942e-06, + "loss": 0.7325, + "step": 5041 + }, + { + "epoch": 1.2772640911969602, + "grad_norm": 3.7596962451934814, + "learning_rate": 8.889453154926637e-06, + "loss": 0.96, + "step": 5042 + }, + { + "epoch": 1.2775174160861305, + "grad_norm": 3.9248828887939453, + "learning_rate": 8.88892659727131e-06, + "loss": 0.8253, + "step": 5043 + }, + { + "epoch": 1.2777707409753007, + "grad_norm": 3.842301845550537, + "learning_rate": 8.888399930416746e-06, + "loss": 0.7854, + "step": 5044 + }, + { + "epoch": 1.2780240658644713, + "grad_norm": 3.546661138534546, + "learning_rate": 8.887873154377736e-06, + "loss": 0.7607, + "step": 5045 + }, + { + "epoch": 1.2782773907536416, + "grad_norm": 3.448086977005005, + "learning_rate": 8.887346269169067e-06, + "loss": 0.7556, + "step": 5046 + }, + { + "epoch": 1.2785307156428118, + "grad_norm": 3.7830214500427246, + "learning_rate": 8.88681927480554e-06, + "loss": 0.8122, + "step": 5047 + }, + { + "epoch": 1.2787840405319821, + "grad_norm": 3.8307809829711914, + "learning_rate": 8.886292171301947e-06, + "loss": 0.9047, + "step": 5048 + }, + { + "epoch": 1.2790373654211526, + "grad_norm": 3.741891622543335, + "learning_rate": 8.885764958673093e-06, + "loss": 0.7657, + "step": 5049 + }, + { + "epoch": 1.279290690310323, + "grad_norm": 3.4128377437591553, + "learning_rate": 8.885237636933779e-06, + "loss": 0.7411, + "step": 5050 + }, + { + "epoch": 1.2795440151994932, + "grad_norm": 3.633951187133789, + "learning_rate": 8.884710206098814e-06, + "loss": 0.8994, + "step": 5051 + }, + { + "epoch": 1.2797973400886637, + "grad_norm": 3.897345542907715, + "learning_rate": 8.884182666183005e-06, + "loss": 0.8265, + "step": 5052 + }, + { + "epoch": 1.280050664977834, + "grad_norm": 3.6153879165649414, + "learning_rate": 8.883655017201169e-06, + "loss": 0.8037, + "step": 5053 + }, + { + "epoch": 1.2803039898670043, + "grad_norm": 4.0165791511535645, + "learning_rate": 8.883127259168121e-06, + "loss": 0.8108, + "step": 5054 + }, + { + "epoch": 1.2805573147561748, + "grad_norm": 3.8056552410125732, + "learning_rate": 8.882599392098677e-06, + "loss": 0.7588, + "step": 5055 + }, + { + "epoch": 1.2808106396453451, + "grad_norm": 4.42263126373291, + "learning_rate": 8.882071416007664e-06, + "loss": 1.0186, + "step": 5056 + }, + { + "epoch": 1.2810639645345154, + "grad_norm": 3.960272789001465, + "learning_rate": 8.881543330909905e-06, + "loss": 0.7764, + "step": 5057 + }, + { + "epoch": 1.281317289423686, + "grad_norm": 4.283044338226318, + "learning_rate": 8.881015136820227e-06, + "loss": 0.7066, + "step": 5058 + }, + { + "epoch": 1.2815706143128562, + "grad_norm": 3.494903087615967, + "learning_rate": 8.880486833753464e-06, + "loss": 0.6419, + "step": 5059 + }, + { + "epoch": 1.2818239392020265, + "grad_norm": 3.93373441696167, + "learning_rate": 8.879958421724448e-06, + "loss": 0.6975, + "step": 5060 + }, + { + "epoch": 1.282077264091197, + "grad_norm": 3.516568422317505, + "learning_rate": 8.87942990074802e-06, + "loss": 0.6778, + "step": 5061 + }, + { + "epoch": 1.2823305889803673, + "grad_norm": 3.7583367824554443, + "learning_rate": 8.878901270839017e-06, + "loss": 0.772, + "step": 5062 + }, + { + "epoch": 1.2825839138695376, + "grad_norm": 3.836733102798462, + "learning_rate": 8.878372532012285e-06, + "loss": 0.7258, + "step": 5063 + }, + { + "epoch": 1.2828372387587081, + "grad_norm": 3.661405086517334, + "learning_rate": 8.87784368428267e-06, + "loss": 0.7599, + "step": 5064 + }, + { + "epoch": 1.2830905636478784, + "grad_norm": 3.767148733139038, + "learning_rate": 8.877314727665021e-06, + "loss": 0.7297, + "step": 5065 + }, + { + "epoch": 1.2833438885370487, + "grad_norm": 4.096794605255127, + "learning_rate": 8.876785662174193e-06, + "loss": 0.7756, + "step": 5066 + }, + { + "epoch": 1.2835972134262192, + "grad_norm": 3.8847033977508545, + "learning_rate": 8.876256487825041e-06, + "loss": 0.7152, + "step": 5067 + }, + { + "epoch": 1.2838505383153895, + "grad_norm": 3.8769469261169434, + "learning_rate": 8.875727204632422e-06, + "loss": 0.8622, + "step": 5068 + }, + { + "epoch": 1.2841038632045598, + "grad_norm": 3.176262378692627, + "learning_rate": 8.8751978126112e-06, + "loss": 0.7312, + "step": 5069 + }, + { + "epoch": 1.2843571880937303, + "grad_norm": 3.7866973876953125, + "learning_rate": 8.874668311776239e-06, + "loss": 0.8017, + "step": 5070 + }, + { + "epoch": 1.2846105129829006, + "grad_norm": 3.927074670791626, + "learning_rate": 8.87413870214241e-06, + "loss": 0.8997, + "step": 5071 + }, + { + "epoch": 1.284863837872071, + "grad_norm": 3.751995086669922, + "learning_rate": 8.87360898372458e-06, + "loss": 0.8129, + "step": 5072 + }, + { + "epoch": 1.2851171627612412, + "grad_norm": 3.669092893600464, + "learning_rate": 8.873079156537625e-06, + "loss": 0.7774, + "step": 5073 + }, + { + "epoch": 1.2853704876504117, + "grad_norm": 3.3392302989959717, + "learning_rate": 8.872549220596422e-06, + "loss": 0.7465, + "step": 5074 + }, + { + "epoch": 1.285623812539582, + "grad_norm": 3.6528210639953613, + "learning_rate": 8.872019175915854e-06, + "loss": 0.8081, + "step": 5075 + }, + { + "epoch": 1.2858771374287523, + "grad_norm": 3.4234163761138916, + "learning_rate": 8.871489022510801e-06, + "loss": 0.7314, + "step": 5076 + }, + { + "epoch": 1.2861304623179226, + "grad_norm": 3.5174355506896973, + "learning_rate": 8.870958760396151e-06, + "loss": 0.7822, + "step": 5077 + }, + { + "epoch": 1.2863837872070931, + "grad_norm": 3.6532506942749023, + "learning_rate": 8.870428389586794e-06, + "loss": 0.766, + "step": 5078 + }, + { + "epoch": 1.2866371120962634, + "grad_norm": 3.323868989944458, + "learning_rate": 8.86989791009762e-06, + "loss": 0.7974, + "step": 5079 + }, + { + "epoch": 1.2868904369854337, + "grad_norm": 3.9011785984039307, + "learning_rate": 8.869367321943527e-06, + "loss": 0.6893, + "step": 5080 + }, + { + "epoch": 1.2871437618746042, + "grad_norm": 3.767474412918091, + "learning_rate": 8.868836625139415e-06, + "loss": 0.8577, + "step": 5081 + }, + { + "epoch": 1.2873970867637745, + "grad_norm": 4.177369594573975, + "learning_rate": 8.868305819700181e-06, + "loss": 0.7859, + "step": 5082 + }, + { + "epoch": 1.2876504116529448, + "grad_norm": 3.96870493888855, + "learning_rate": 8.867774905640733e-06, + "loss": 0.7837, + "step": 5083 + }, + { + "epoch": 1.2879037365421153, + "grad_norm": 3.806058645248413, + "learning_rate": 8.86724388297598e-06, + "loss": 0.6973, + "step": 5084 + }, + { + "epoch": 1.2881570614312856, + "grad_norm": 3.4279656410217285, + "learning_rate": 8.866712751720831e-06, + "loss": 0.8016, + "step": 5085 + }, + { + "epoch": 1.288410386320456, + "grad_norm": 3.970402479171753, + "learning_rate": 8.8661815118902e-06, + "loss": 0.7277, + "step": 5086 + }, + { + "epoch": 1.2886637112096264, + "grad_norm": 3.720019817352295, + "learning_rate": 8.865650163499004e-06, + "loss": 0.8066, + "step": 5087 + }, + { + "epoch": 1.2889170360987967, + "grad_norm": 4.123616695404053, + "learning_rate": 8.865118706562164e-06, + "loss": 0.8142, + "step": 5088 + }, + { + "epoch": 1.289170360987967, + "grad_norm": 4.003778457641602, + "learning_rate": 8.8645871410946e-06, + "loss": 0.8571, + "step": 5089 + }, + { + "epoch": 1.2894236858771375, + "grad_norm": 3.4247050285339355, + "learning_rate": 8.86405546711124e-06, + "loss": 0.8019, + "step": 5090 + }, + { + "epoch": 1.2896770107663078, + "grad_norm": 3.3717615604400635, + "learning_rate": 8.863523684627018e-06, + "loss": 0.727, + "step": 5091 + }, + { + "epoch": 1.289930335655478, + "grad_norm": 3.3375675678253174, + "learning_rate": 8.862991793656858e-06, + "loss": 0.753, + "step": 5092 + }, + { + "epoch": 1.2901836605446486, + "grad_norm": 3.5768208503723145, + "learning_rate": 8.8624597942157e-06, + "loss": 0.8196, + "step": 5093 + }, + { + "epoch": 1.290436985433819, + "grad_norm": 3.681488275527954, + "learning_rate": 8.86192768631848e-06, + "loss": 0.8417, + "step": 5094 + }, + { + "epoch": 1.2906903103229892, + "grad_norm": 3.708117961883545, + "learning_rate": 8.861395469980142e-06, + "loss": 0.8578, + "step": 5095 + }, + { + "epoch": 1.2909436352121597, + "grad_norm": 3.418780565261841, + "learning_rate": 8.860863145215627e-06, + "loss": 0.7713, + "step": 5096 + }, + { + "epoch": 1.29119696010133, + "grad_norm": 3.7200613021850586, + "learning_rate": 8.860330712039886e-06, + "loss": 0.7198, + "step": 5097 + }, + { + "epoch": 1.2914502849905003, + "grad_norm": 3.6819093227386475, + "learning_rate": 8.859798170467868e-06, + "loss": 0.8028, + "step": 5098 + }, + { + "epoch": 1.2917036098796708, + "grad_norm": 3.46396541595459, + "learning_rate": 8.859265520514525e-06, + "loss": 0.8006, + "step": 5099 + }, + { + "epoch": 1.291956934768841, + "grad_norm": 3.258542537689209, + "learning_rate": 8.858732762194816e-06, + "loss": 0.6876, + "step": 5100 + }, + { + "epoch": 1.2922102596580114, + "grad_norm": 3.6014840602874756, + "learning_rate": 8.8581998955237e-06, + "loss": 0.6743, + "step": 5101 + }, + { + "epoch": 1.2924635845471817, + "grad_norm": 4.297863006591797, + "learning_rate": 8.857666920516139e-06, + "loss": 0.8732, + "step": 5102 + }, + { + "epoch": 1.2927169094363522, + "grad_norm": 3.884458065032959, + "learning_rate": 8.8571338371871e-06, + "loss": 0.9417, + "step": 5103 + }, + { + "epoch": 1.2929702343255225, + "grad_norm": 3.376234769821167, + "learning_rate": 8.856600645551549e-06, + "loss": 0.6684, + "step": 5104 + }, + { + "epoch": 1.2932235592146928, + "grad_norm": 3.6533520221710205, + "learning_rate": 8.85606734562446e-06, + "loss": 0.7386, + "step": 5105 + }, + { + "epoch": 1.293476884103863, + "grad_norm": 3.8629558086395264, + "learning_rate": 8.855533937420806e-06, + "loss": 0.7538, + "step": 5106 + }, + { + "epoch": 1.2937302089930336, + "grad_norm": 3.934384346008301, + "learning_rate": 8.855000420955567e-06, + "loss": 0.8641, + "step": 5107 + }, + { + "epoch": 1.2939835338822039, + "grad_norm": 3.7146103382110596, + "learning_rate": 8.854466796243722e-06, + "loss": 0.8123, + "step": 5108 + }, + { + "epoch": 1.2942368587713742, + "grad_norm": 3.9950599670410156, + "learning_rate": 8.853933063300258e-06, + "loss": 0.6525, + "step": 5109 + }, + { + "epoch": 1.2944901836605447, + "grad_norm": 3.913001537322998, + "learning_rate": 8.853399222140158e-06, + "loss": 0.8264, + "step": 5110 + }, + { + "epoch": 1.294743508549715, + "grad_norm": 3.609727382659912, + "learning_rate": 8.852865272778413e-06, + "loss": 0.7613, + "step": 5111 + }, + { + "epoch": 1.2949968334388853, + "grad_norm": 3.5861222743988037, + "learning_rate": 8.85233121523002e-06, + "loss": 0.9255, + "step": 5112 + }, + { + "epoch": 1.2952501583280558, + "grad_norm": 4.302942276000977, + "learning_rate": 8.85179704950997e-06, + "loss": 0.8833, + "step": 5113 + }, + { + "epoch": 1.295503483217226, + "grad_norm": 3.647034168243408, + "learning_rate": 8.851262775633263e-06, + "loss": 0.7407, + "step": 5114 + }, + { + "epoch": 1.2957568081063964, + "grad_norm": 3.4388327598571777, + "learning_rate": 8.850728393614903e-06, + "loss": 0.8952, + "step": 5115 + }, + { + "epoch": 1.2960101329955669, + "grad_norm": 3.5317115783691406, + "learning_rate": 8.850193903469895e-06, + "loss": 0.8687, + "step": 5116 + }, + { + "epoch": 1.2962634578847372, + "grad_norm": 3.7548828125, + "learning_rate": 8.849659305213248e-06, + "loss": 0.7272, + "step": 5117 + }, + { + "epoch": 1.2965167827739075, + "grad_norm": 3.6353676319122314, + "learning_rate": 8.84912459885997e-06, + "loss": 0.7473, + "step": 5118 + }, + { + "epoch": 1.296770107663078, + "grad_norm": 3.826050043106079, + "learning_rate": 8.848589784425076e-06, + "loss": 0.8157, + "step": 5119 + }, + { + "epoch": 1.2970234325522483, + "grad_norm": 4.017717361450195, + "learning_rate": 8.848054861923587e-06, + "loss": 0.9376, + "step": 5120 + }, + { + "epoch": 1.2972767574414186, + "grad_norm": 3.9414191246032715, + "learning_rate": 8.847519831370522e-06, + "loss": 0.7703, + "step": 5121 + }, + { + "epoch": 1.297530082330589, + "grad_norm": 3.7812705039978027, + "learning_rate": 8.8469846927809e-06, + "loss": 0.7178, + "step": 5122 + }, + { + "epoch": 1.2977834072197594, + "grad_norm": 3.833151340484619, + "learning_rate": 8.846449446169754e-06, + "loss": 0.7842, + "step": 5123 + }, + { + "epoch": 1.2980367321089297, + "grad_norm": 4.485802173614502, + "learning_rate": 8.845914091552108e-06, + "loss": 0.9194, + "step": 5124 + }, + { + "epoch": 1.2982900569981002, + "grad_norm": 3.8566842079162598, + "learning_rate": 8.845378628942998e-06, + "loss": 0.7548, + "step": 5125 + }, + { + "epoch": 1.2985433818872705, + "grad_norm": 3.787710666656494, + "learning_rate": 8.844843058357458e-06, + "loss": 0.7981, + "step": 5126 + }, + { + "epoch": 1.2987967067764408, + "grad_norm": 3.9272103309631348, + "learning_rate": 8.844307379810526e-06, + "loss": 0.7384, + "step": 5127 + }, + { + "epoch": 1.2990500316656113, + "grad_norm": 4.102965831756592, + "learning_rate": 8.843771593317246e-06, + "loss": 0.7695, + "step": 5128 + }, + { + "epoch": 1.2993033565547816, + "grad_norm": 4.810306072235107, + "learning_rate": 8.843235698892661e-06, + "loss": 0.8306, + "step": 5129 + }, + { + "epoch": 1.2995566814439519, + "grad_norm": 3.666740655899048, + "learning_rate": 8.842699696551818e-06, + "loss": 0.7999, + "step": 5130 + }, + { + "epoch": 1.2998100063331222, + "grad_norm": 3.9185757637023926, + "learning_rate": 8.84216358630977e-06, + "loss": 0.7595, + "step": 5131 + }, + { + "epoch": 1.3000633312222927, + "grad_norm": 3.7936949729919434, + "learning_rate": 8.841627368181569e-06, + "loss": 0.8162, + "step": 5132 + }, + { + "epoch": 1.300316656111463, + "grad_norm": 3.8769707679748535, + "learning_rate": 8.84109104218227e-06, + "loss": 0.7763, + "step": 5133 + }, + { + "epoch": 1.3005699810006333, + "grad_norm": 3.700146436691284, + "learning_rate": 8.840554608326935e-06, + "loss": 0.7458, + "step": 5134 + }, + { + "epoch": 1.3008233058898035, + "grad_norm": 3.6501848697662354, + "learning_rate": 8.84001806663063e-06, + "loss": 0.7223, + "step": 5135 + }, + { + "epoch": 1.301076630778974, + "grad_norm": 3.628479242324829, + "learning_rate": 8.839481417108414e-06, + "loss": 0.8136, + "step": 5136 + }, + { + "epoch": 1.3013299556681444, + "grad_norm": 3.7592058181762695, + "learning_rate": 8.838944659775358e-06, + "loss": 0.7518, + "step": 5137 + }, + { + "epoch": 1.3015832805573146, + "grad_norm": 3.9640681743621826, + "learning_rate": 8.83840779464654e-06, + "loss": 0.8824, + "step": 5138 + }, + { + "epoch": 1.3018366054464852, + "grad_norm": 3.3016247749328613, + "learning_rate": 8.837870821737027e-06, + "loss": 0.7037, + "step": 5139 + }, + { + "epoch": 1.3020899303356555, + "grad_norm": 3.74817156791687, + "learning_rate": 8.8373337410619e-06, + "loss": 0.7651, + "step": 5140 + }, + { + "epoch": 1.3023432552248257, + "grad_norm": 3.2207884788513184, + "learning_rate": 8.83679655263624e-06, + "loss": 0.6062, + "step": 5141 + }, + { + "epoch": 1.3025965801139963, + "grad_norm": 3.3142166137695312, + "learning_rate": 8.836259256475132e-06, + "loss": 0.6988, + "step": 5142 + }, + { + "epoch": 1.3028499050031666, + "grad_norm": 4.2149434089660645, + "learning_rate": 8.835721852593661e-06, + "loss": 0.8357, + "step": 5143 + }, + { + "epoch": 1.3031032298923368, + "grad_norm": 4.146454334259033, + "learning_rate": 8.835184341006919e-06, + "loss": 0.7967, + "step": 5144 + }, + { + "epoch": 1.3033565547815074, + "grad_norm": 4.046872138977051, + "learning_rate": 8.834646721729999e-06, + "loss": 0.817, + "step": 5145 + }, + { + "epoch": 1.3036098796706777, + "grad_norm": 3.705402374267578, + "learning_rate": 8.834108994777995e-06, + "loss": 0.7499, + "step": 5146 + }, + { + "epoch": 1.303863204559848, + "grad_norm": 3.4352595806121826, + "learning_rate": 8.833571160166007e-06, + "loss": 0.7402, + "step": 5147 + }, + { + "epoch": 1.3041165294490185, + "grad_norm": 3.3411202430725098, + "learning_rate": 8.83303321790914e-06, + "loss": 0.8052, + "step": 5148 + }, + { + "epoch": 1.3043698543381888, + "grad_norm": 3.522878408432007, + "learning_rate": 8.832495168022494e-06, + "loss": 0.6031, + "step": 5149 + }, + { + "epoch": 1.304623179227359, + "grad_norm": 4.035628795623779, + "learning_rate": 8.83195701052118e-06, + "loss": 0.7687, + "step": 5150 + }, + { + "epoch": 1.3048765041165296, + "grad_norm": 3.939291477203369, + "learning_rate": 8.83141874542031e-06, + "loss": 0.8159, + "step": 5151 + }, + { + "epoch": 1.3051298290056998, + "grad_norm": 3.952495574951172, + "learning_rate": 8.830880372734998e-06, + "loss": 0.7899, + "step": 5152 + }, + { + "epoch": 1.3053831538948701, + "grad_norm": 3.5980591773986816, + "learning_rate": 8.830341892480359e-06, + "loss": 0.8068, + "step": 5153 + }, + { + "epoch": 1.3056364787840407, + "grad_norm": 3.4994215965270996, + "learning_rate": 8.829803304671515e-06, + "loss": 0.6223, + "step": 5154 + }, + { + "epoch": 1.305889803673211, + "grad_norm": 4.123564720153809, + "learning_rate": 8.82926460932359e-06, + "loss": 0.8317, + "step": 5155 + }, + { + "epoch": 1.3061431285623812, + "grad_norm": 4.1981987953186035, + "learning_rate": 8.82872580645171e-06, + "loss": 0.779, + "step": 5156 + }, + { + "epoch": 1.3063964534515518, + "grad_norm": 3.631875514984131, + "learning_rate": 8.828186896071003e-06, + "loss": 0.785, + "step": 5157 + }, + { + "epoch": 1.306649778340722, + "grad_norm": 3.7511484622955322, + "learning_rate": 8.8276478781966e-06, + "loss": 0.6887, + "step": 5158 + }, + { + "epoch": 1.3069031032298923, + "grad_norm": 4.104414939880371, + "learning_rate": 8.827108752843642e-06, + "loss": 0.8932, + "step": 5159 + }, + { + "epoch": 1.3071564281190626, + "grad_norm": 3.691878318786621, + "learning_rate": 8.826569520027262e-06, + "loss": 0.7989, + "step": 5160 + }, + { + "epoch": 1.307409753008233, + "grad_norm": 3.4031450748443604, + "learning_rate": 8.826030179762604e-06, + "loss": 0.6762, + "step": 5161 + }, + { + "epoch": 1.3076630778974034, + "grad_norm": 4.122935771942139, + "learning_rate": 8.825490732064813e-06, + "loss": 0.7819, + "step": 5162 + }, + { + "epoch": 1.3079164027865737, + "grad_norm": 3.300018787384033, + "learning_rate": 8.824951176949032e-06, + "loss": 0.6875, + "step": 5163 + }, + { + "epoch": 1.308169727675744, + "grad_norm": 3.7323172092437744, + "learning_rate": 8.824411514430417e-06, + "loss": 0.8033, + "step": 5164 + }, + { + "epoch": 1.3084230525649145, + "grad_norm": 3.7336745262145996, + "learning_rate": 8.823871744524118e-06, + "loss": 0.7597, + "step": 5165 + }, + { + "epoch": 1.3086763774540848, + "grad_norm": 3.9146523475646973, + "learning_rate": 8.823331867245293e-06, + "loss": 0.7806, + "step": 5166 + }, + { + "epoch": 1.3089297023432551, + "grad_norm": 3.8385565280914307, + "learning_rate": 8.8227918826091e-06, + "loss": 0.7252, + "step": 5167 + }, + { + "epoch": 1.3091830272324256, + "grad_norm": 4.111937999725342, + "learning_rate": 8.822251790630705e-06, + "loss": 0.9118, + "step": 5168 + }, + { + "epoch": 1.309436352121596, + "grad_norm": 3.8348045349121094, + "learning_rate": 8.82171159132527e-06, + "loss": 0.7847, + "step": 5169 + }, + { + "epoch": 1.3096896770107662, + "grad_norm": 4.485629558563232, + "learning_rate": 8.821171284707964e-06, + "loss": 0.829, + "step": 5170 + }, + { + "epoch": 1.3099430018999367, + "grad_norm": 3.9793319702148438, + "learning_rate": 8.82063087079396e-06, + "loss": 0.8615, + "step": 5171 + }, + { + "epoch": 1.310196326789107, + "grad_norm": 3.95802903175354, + "learning_rate": 8.82009034959843e-06, + "loss": 0.7396, + "step": 5172 + }, + { + "epoch": 1.3104496516782773, + "grad_norm": 3.820927143096924, + "learning_rate": 8.819549721136555e-06, + "loss": 0.6987, + "step": 5173 + }, + { + "epoch": 1.3107029765674478, + "grad_norm": 3.222822427749634, + "learning_rate": 8.819008985423514e-06, + "loss": 0.6992, + "step": 5174 + }, + { + "epoch": 1.3109563014566181, + "grad_norm": 3.582071542739868, + "learning_rate": 8.818468142474492e-06, + "loss": 0.8175, + "step": 5175 + }, + { + "epoch": 1.3112096263457884, + "grad_norm": 3.913431167602539, + "learning_rate": 8.817927192304671e-06, + "loss": 0.8305, + "step": 5176 + }, + { + "epoch": 1.311462951234959, + "grad_norm": 3.6144256591796875, + "learning_rate": 8.817386134929246e-06, + "loss": 0.7904, + "step": 5177 + }, + { + "epoch": 1.3117162761241292, + "grad_norm": 4.031813144683838, + "learning_rate": 8.816844970363408e-06, + "loss": 0.8826, + "step": 5178 + }, + { + "epoch": 1.3119696010132995, + "grad_norm": 3.829005002975464, + "learning_rate": 8.81630369862235e-06, + "loss": 0.763, + "step": 5179 + }, + { + "epoch": 1.31222292590247, + "grad_norm": 3.531787633895874, + "learning_rate": 8.815762319721274e-06, + "loss": 0.8515, + "step": 5180 + }, + { + "epoch": 1.3124762507916403, + "grad_norm": 3.567216634750366, + "learning_rate": 8.81522083367538e-06, + "loss": 0.7158, + "step": 5181 + }, + { + "epoch": 1.3127295756808106, + "grad_norm": 3.6887264251708984, + "learning_rate": 8.814679240499872e-06, + "loss": 0.7349, + "step": 5182 + }, + { + "epoch": 1.3129829005699811, + "grad_norm": 3.498737335205078, + "learning_rate": 8.814137540209962e-06, + "loss": 0.7118, + "step": 5183 + }, + { + "epoch": 1.3132362254591514, + "grad_norm": 3.7857372760772705, + "learning_rate": 8.813595732820854e-06, + "loss": 0.8331, + "step": 5184 + }, + { + "epoch": 1.3134895503483217, + "grad_norm": 3.5319156646728516, + "learning_rate": 8.813053818347768e-06, + "loss": 0.735, + "step": 5185 + }, + { + "epoch": 1.313742875237492, + "grad_norm": 3.691115617752075, + "learning_rate": 8.812511796805915e-06, + "loss": 0.7919, + "step": 5186 + }, + { + "epoch": 1.3139962001266625, + "grad_norm": 4.09224271774292, + "learning_rate": 8.811969668210521e-06, + "loss": 0.8109, + "step": 5187 + }, + { + "epoch": 1.3142495250158328, + "grad_norm": 3.2587223052978516, + "learning_rate": 8.811427432576803e-06, + "loss": 0.6939, + "step": 5188 + }, + { + "epoch": 1.314502849905003, + "grad_norm": 3.6646640300750732, + "learning_rate": 8.810885089919989e-06, + "loss": 0.8137, + "step": 5189 + }, + { + "epoch": 1.3147561747941734, + "grad_norm": 3.475882053375244, + "learning_rate": 8.810342640255307e-06, + "loss": 0.7515, + "step": 5190 + }, + { + "epoch": 1.315009499683344, + "grad_norm": 3.985877752304077, + "learning_rate": 8.809800083597991e-06, + "loss": 0.8788, + "step": 5191 + }, + { + "epoch": 1.3152628245725142, + "grad_norm": 3.800947666168213, + "learning_rate": 8.809257419963275e-06, + "loss": 0.8266, + "step": 5192 + }, + { + "epoch": 1.3155161494616845, + "grad_norm": 3.7874348163604736, + "learning_rate": 8.808714649366395e-06, + "loss": 0.8686, + "step": 5193 + }, + { + "epoch": 1.315769474350855, + "grad_norm": 3.6544885635375977, + "learning_rate": 8.808171771822592e-06, + "loss": 0.6952, + "step": 5194 + }, + { + "epoch": 1.3160227992400253, + "grad_norm": 3.4128148555755615, + "learning_rate": 8.807628787347111e-06, + "loss": 0.7234, + "step": 5195 + }, + { + "epoch": 1.3162761241291956, + "grad_norm": 3.8209598064422607, + "learning_rate": 8.807085695955197e-06, + "loss": 0.8171, + "step": 5196 + }, + { + "epoch": 1.316529449018366, + "grad_norm": 4.04181432723999, + "learning_rate": 8.806542497662103e-06, + "loss": 0.7815, + "step": 5197 + }, + { + "epoch": 1.3167827739075364, + "grad_norm": 3.8541886806488037, + "learning_rate": 8.80599919248308e-06, + "loss": 0.8588, + "step": 5198 + }, + { + "epoch": 1.3170360987967067, + "grad_norm": 3.6714396476745605, + "learning_rate": 8.805455780433382e-06, + "loss": 0.8267, + "step": 5199 + }, + { + "epoch": 1.3172894236858772, + "grad_norm": 4.040272235870361, + "learning_rate": 8.804912261528269e-06, + "loss": 0.7836, + "step": 5200 + }, + { + "epoch": 1.3175427485750475, + "grad_norm": 3.6184239387512207, + "learning_rate": 8.804368635783002e-06, + "loss": 0.7391, + "step": 5201 + }, + { + "epoch": 1.3177960734642178, + "grad_norm": 3.2228989601135254, + "learning_rate": 8.80382490321285e-06, + "loss": 0.7524, + "step": 5202 + }, + { + "epoch": 1.3180493983533883, + "grad_norm": 3.5631792545318604, + "learning_rate": 8.803281063833076e-06, + "loss": 0.708, + "step": 5203 + }, + { + "epoch": 1.3183027232425586, + "grad_norm": 3.6151304244995117, + "learning_rate": 8.802737117658952e-06, + "loss": 0.8097, + "step": 5204 + }, + { + "epoch": 1.3185560481317289, + "grad_norm": 3.557908535003662, + "learning_rate": 8.802193064705753e-06, + "loss": 0.9571, + "step": 5205 + }, + { + "epoch": 1.3188093730208994, + "grad_norm": 3.8364086151123047, + "learning_rate": 8.801648904988753e-06, + "loss": 0.6698, + "step": 5206 + }, + { + "epoch": 1.3190626979100697, + "grad_norm": 3.4950175285339355, + "learning_rate": 8.801104638523233e-06, + "loss": 0.7809, + "step": 5207 + }, + { + "epoch": 1.31931602279924, + "grad_norm": 3.430807113647461, + "learning_rate": 8.800560265324477e-06, + "loss": 0.7195, + "step": 5208 + }, + { + "epoch": 1.3195693476884105, + "grad_norm": 4.011314868927002, + "learning_rate": 8.800015785407771e-06, + "loss": 0.8731, + "step": 5209 + }, + { + "epoch": 1.3198226725775808, + "grad_norm": 3.559040069580078, + "learning_rate": 8.799471198788402e-06, + "loss": 0.6956, + "step": 5210 + }, + { + "epoch": 1.320075997466751, + "grad_norm": 3.3630099296569824, + "learning_rate": 8.798926505481664e-06, + "loss": 0.6953, + "step": 5211 + }, + { + "epoch": 1.3203293223559216, + "grad_norm": 3.2390987873077393, + "learning_rate": 8.798381705502847e-06, + "loss": 0.7014, + "step": 5212 + }, + { + "epoch": 1.320582647245092, + "grad_norm": 3.863812208175659, + "learning_rate": 8.797836798867255e-06, + "loss": 0.7841, + "step": 5213 + }, + { + "epoch": 1.3208359721342622, + "grad_norm": 3.636995792388916, + "learning_rate": 8.797291785590184e-06, + "loss": 0.7871, + "step": 5214 + }, + { + "epoch": 1.3210892970234325, + "grad_norm": 3.817232608795166, + "learning_rate": 8.79674666568694e-06, + "loss": 0.8858, + "step": 5215 + }, + { + "epoch": 1.321342621912603, + "grad_norm": 3.6206257343292236, + "learning_rate": 8.796201439172829e-06, + "loss": 0.7299, + "step": 5216 + }, + { + "epoch": 1.3215959468017733, + "grad_norm": 3.469076156616211, + "learning_rate": 8.795656106063161e-06, + "loss": 0.7902, + "step": 5217 + }, + { + "epoch": 1.3218492716909436, + "grad_norm": 3.588515281677246, + "learning_rate": 8.795110666373248e-06, + "loss": 0.7992, + "step": 5218 + }, + { + "epoch": 1.3221025965801139, + "grad_norm": 3.2227203845977783, + "learning_rate": 8.794565120118406e-06, + "loss": 0.6846, + "step": 5219 + }, + { + "epoch": 1.3223559214692844, + "grad_norm": 3.3197062015533447, + "learning_rate": 8.794019467313955e-06, + "loss": 0.7482, + "step": 5220 + }, + { + "epoch": 1.3226092463584547, + "grad_norm": 3.9905765056610107, + "learning_rate": 8.793473707975214e-06, + "loss": 0.8468, + "step": 5221 + }, + { + "epoch": 1.322862571247625, + "grad_norm": 3.633476734161377, + "learning_rate": 8.792927842117509e-06, + "loss": 0.6167, + "step": 5222 + }, + { + "epoch": 1.3231158961367955, + "grad_norm": 3.5059194564819336, + "learning_rate": 8.792381869756168e-06, + "loss": 0.8105, + "step": 5223 + }, + { + "epoch": 1.3233692210259658, + "grad_norm": 3.969255208969116, + "learning_rate": 8.791835790906525e-06, + "loss": 0.92, + "step": 5224 + }, + { + "epoch": 1.323622545915136, + "grad_norm": 4.204358100891113, + "learning_rate": 8.791289605583907e-06, + "loss": 0.8562, + "step": 5225 + }, + { + "epoch": 1.3238758708043066, + "grad_norm": 4.095635414123535, + "learning_rate": 8.790743313803653e-06, + "loss": 0.9031, + "step": 5226 + }, + { + "epoch": 1.3241291956934769, + "grad_norm": 3.7705609798431396, + "learning_rate": 8.790196915581104e-06, + "loss": 0.8469, + "step": 5227 + }, + { + "epoch": 1.3243825205826472, + "grad_norm": 3.862409830093384, + "learning_rate": 8.789650410931603e-06, + "loss": 0.7578, + "step": 5228 + }, + { + "epoch": 1.3246358454718177, + "grad_norm": 3.427957057952881, + "learning_rate": 8.789103799870493e-06, + "loss": 0.6602, + "step": 5229 + }, + { + "epoch": 1.324889170360988, + "grad_norm": 4.275664806365967, + "learning_rate": 8.788557082413128e-06, + "loss": 0.8078, + "step": 5230 + }, + { + "epoch": 1.3251424952501583, + "grad_norm": 3.465346097946167, + "learning_rate": 8.788010258574851e-06, + "loss": 0.6734, + "step": 5231 + }, + { + "epoch": 1.3253958201393288, + "grad_norm": 3.960186719894409, + "learning_rate": 8.787463328371023e-06, + "loss": 0.6724, + "step": 5232 + }, + { + "epoch": 1.325649145028499, + "grad_norm": 3.8265140056610107, + "learning_rate": 8.786916291817e-06, + "loss": 0.8207, + "step": 5233 + }, + { + "epoch": 1.3259024699176694, + "grad_norm": 3.872192859649658, + "learning_rate": 8.786369148928143e-06, + "loss": 0.7123, + "step": 5234 + }, + { + "epoch": 1.3261557948068399, + "grad_norm": 4.1361188888549805, + "learning_rate": 8.785821899719815e-06, + "loss": 0.866, + "step": 5235 + }, + { + "epoch": 1.3264091196960102, + "grad_norm": 3.589270830154419, + "learning_rate": 8.785274544207382e-06, + "loss": 0.8116, + "step": 5236 + }, + { + "epoch": 1.3266624445851805, + "grad_norm": 3.4907267093658447, + "learning_rate": 8.784727082406214e-06, + "loss": 0.7012, + "step": 5237 + }, + { + "epoch": 1.326915769474351, + "grad_norm": 3.7468090057373047, + "learning_rate": 8.784179514331683e-06, + "loss": 0.883, + "step": 5238 + }, + { + "epoch": 1.3271690943635213, + "grad_norm": 3.560201644897461, + "learning_rate": 8.783631839999163e-06, + "loss": 0.7044, + "step": 5239 + }, + { + "epoch": 1.3274224192526916, + "grad_norm": 3.9083845615386963, + "learning_rate": 8.783084059424037e-06, + "loss": 0.7542, + "step": 5240 + }, + { + "epoch": 1.327675744141862, + "grad_norm": 3.9807448387145996, + "learning_rate": 8.782536172621682e-06, + "loss": 0.8834, + "step": 5241 + }, + { + "epoch": 1.3279290690310324, + "grad_norm": 3.6231117248535156, + "learning_rate": 8.781988179607486e-06, + "loss": 0.8607, + "step": 5242 + }, + { + "epoch": 1.3281823939202027, + "grad_norm": 4.24287748336792, + "learning_rate": 8.781440080396833e-06, + "loss": 0.9153, + "step": 5243 + }, + { + "epoch": 1.328435718809373, + "grad_norm": 3.889406442642212, + "learning_rate": 8.780891875005116e-06, + "loss": 0.7577, + "step": 5244 + }, + { + "epoch": 1.3286890436985435, + "grad_norm": 3.5610406398773193, + "learning_rate": 8.780343563447725e-06, + "loss": 0.7354, + "step": 5245 + }, + { + "epoch": 1.3289423685877138, + "grad_norm": 3.858607769012451, + "learning_rate": 8.77979514574006e-06, + "loss": 0.8152, + "step": 5246 + }, + { + "epoch": 1.329195693476884, + "grad_norm": 3.9125075340270996, + "learning_rate": 8.779246621897519e-06, + "loss": 0.7909, + "step": 5247 + }, + { + "epoch": 1.3294490183660543, + "grad_norm": 3.6272919178009033, + "learning_rate": 8.778697991935503e-06, + "loss": 0.6938, + "step": 5248 + }, + { + "epoch": 1.3297023432552249, + "grad_norm": 3.786247491836548, + "learning_rate": 8.77814925586942e-06, + "loss": 0.7825, + "step": 5249 + }, + { + "epoch": 1.3299556681443951, + "grad_norm": 4.02630615234375, + "learning_rate": 8.777600413714675e-06, + "loss": 0.765, + "step": 5250 + }, + { + "epoch": 1.3302089930335654, + "grad_norm": 3.590681791305542, + "learning_rate": 8.77705146548668e-06, + "loss": 0.7976, + "step": 5251 + }, + { + "epoch": 1.330462317922736, + "grad_norm": 3.867377996444702, + "learning_rate": 8.776502411200852e-06, + "loss": 0.7073, + "step": 5252 + }, + { + "epoch": 1.3307156428119062, + "grad_norm": 3.9471781253814697, + "learning_rate": 8.775953250872606e-06, + "loss": 0.7559, + "step": 5253 + }, + { + "epoch": 1.3309689677010765, + "grad_norm": 3.415630340576172, + "learning_rate": 8.775403984517363e-06, + "loss": 0.7754, + "step": 5254 + }, + { + "epoch": 1.331222292590247, + "grad_norm": 3.7324411869049072, + "learning_rate": 8.774854612150544e-06, + "loss": 0.8234, + "step": 5255 + }, + { + "epoch": 1.3314756174794173, + "grad_norm": 3.692962408065796, + "learning_rate": 8.774305133787577e-06, + "loss": 0.7491, + "step": 5256 + }, + { + "epoch": 1.3317289423685876, + "grad_norm": 4.279314041137695, + "learning_rate": 8.773755549443892e-06, + "loss": 0.928, + "step": 5257 + }, + { + "epoch": 1.3319822672577581, + "grad_norm": 3.6146011352539062, + "learning_rate": 8.773205859134918e-06, + "loss": 0.7835, + "step": 5258 + }, + { + "epoch": 1.3322355921469284, + "grad_norm": 3.7248826026916504, + "learning_rate": 8.772656062876094e-06, + "loss": 0.8181, + "step": 5259 + }, + { + "epoch": 1.3324889170360987, + "grad_norm": 4.029184818267822, + "learning_rate": 8.772106160682853e-06, + "loss": 0.8218, + "step": 5260 + }, + { + "epoch": 1.3327422419252692, + "grad_norm": 3.4886903762817383, + "learning_rate": 8.771556152570643e-06, + "loss": 0.8564, + "step": 5261 + }, + { + "epoch": 1.3329955668144395, + "grad_norm": 4.029396057128906, + "learning_rate": 8.771006038554901e-06, + "loss": 0.716, + "step": 5262 + }, + { + "epoch": 1.3332488917036098, + "grad_norm": 3.870342254638672, + "learning_rate": 8.770455818651078e-06, + "loss": 0.8832, + "step": 5263 + }, + { + "epoch": 1.3335022165927803, + "grad_norm": 3.8330764770507812, + "learning_rate": 8.769905492874622e-06, + "loss": 0.6664, + "step": 5264 + }, + { + "epoch": 1.3337555414819506, + "grad_norm": 3.253298044204712, + "learning_rate": 8.769355061240987e-06, + "loss": 0.7199, + "step": 5265 + }, + { + "epoch": 1.334008866371121, + "grad_norm": 4.0499467849731445, + "learning_rate": 8.768804523765629e-06, + "loss": 0.8866, + "step": 5266 + }, + { + "epoch": 1.3342621912602914, + "grad_norm": 3.5482170581817627, + "learning_rate": 8.768253880464005e-06, + "loss": 0.6303, + "step": 5267 + }, + { + "epoch": 1.3345155161494617, + "grad_norm": 4.1437249183654785, + "learning_rate": 8.76770313135158e-06, + "loss": 0.7959, + "step": 5268 + }, + { + "epoch": 1.334768841038632, + "grad_norm": 3.7345595359802246, + "learning_rate": 8.767152276443814e-06, + "loss": 0.7115, + "step": 5269 + }, + { + "epoch": 1.3350221659278025, + "grad_norm": 3.7965118885040283, + "learning_rate": 8.766601315756181e-06, + "loss": 0.7337, + "step": 5270 + }, + { + "epoch": 1.3352754908169728, + "grad_norm": 3.638084888458252, + "learning_rate": 8.766050249304145e-06, + "loss": 0.8228, + "step": 5271 + }, + { + "epoch": 1.3355288157061431, + "grad_norm": 3.803407669067383, + "learning_rate": 8.765499077103185e-06, + "loss": 0.8366, + "step": 5272 + }, + { + "epoch": 1.3357821405953134, + "grad_norm": 4.4520134925842285, + "learning_rate": 8.764947799168777e-06, + "loss": 0.824, + "step": 5273 + }, + { + "epoch": 1.336035465484484, + "grad_norm": 3.487675189971924, + "learning_rate": 8.764396415516396e-06, + "loss": 0.6718, + "step": 5274 + }, + { + "epoch": 1.3362887903736542, + "grad_norm": 3.448404312133789, + "learning_rate": 8.763844926161531e-06, + "loss": 0.721, + "step": 5275 + }, + { + "epoch": 1.3365421152628245, + "grad_norm": 3.6722261905670166, + "learning_rate": 8.763293331119664e-06, + "loss": 0.854, + "step": 5276 + }, + { + "epoch": 1.3367954401519948, + "grad_norm": 3.83449125289917, + "learning_rate": 8.762741630406282e-06, + "loss": 0.7718, + "step": 5277 + }, + { + "epoch": 1.3370487650411653, + "grad_norm": 4.610817909240723, + "learning_rate": 8.76218982403688e-06, + "loss": 0.9348, + "step": 5278 + }, + { + "epoch": 1.3373020899303356, + "grad_norm": 3.6970901489257812, + "learning_rate": 8.761637912026951e-06, + "loss": 0.7857, + "step": 5279 + }, + { + "epoch": 1.337555414819506, + "grad_norm": 3.252974033355713, + "learning_rate": 8.761085894391991e-06, + "loss": 0.6503, + "step": 5280 + }, + { + "epoch": 1.3378087397086764, + "grad_norm": 3.4152369499206543, + "learning_rate": 8.760533771147505e-06, + "loss": 0.6993, + "step": 5281 + }, + { + "epoch": 1.3380620645978467, + "grad_norm": 3.610792636871338, + "learning_rate": 8.759981542308991e-06, + "loss": 0.7445, + "step": 5282 + }, + { + "epoch": 1.338315389487017, + "grad_norm": 3.5054173469543457, + "learning_rate": 8.759429207891959e-06, + "loss": 0.7702, + "step": 5283 + }, + { + "epoch": 1.3385687143761875, + "grad_norm": 3.4513771533966064, + "learning_rate": 8.758876767911914e-06, + "loss": 0.6192, + "step": 5284 + }, + { + "epoch": 1.3388220392653578, + "grad_norm": 4.195075035095215, + "learning_rate": 8.758324222384373e-06, + "loss": 0.8552, + "step": 5285 + }, + { + "epoch": 1.339075364154528, + "grad_norm": 3.7252817153930664, + "learning_rate": 8.75777157132485e-06, + "loss": 0.743, + "step": 5286 + }, + { + "epoch": 1.3393286890436986, + "grad_norm": 3.722015142440796, + "learning_rate": 8.75721881474886e-06, + "loss": 0.8452, + "step": 5287 + }, + { + "epoch": 1.339582013932869, + "grad_norm": 3.226900577545166, + "learning_rate": 8.756665952671928e-06, + "loss": 0.7867, + "step": 5288 + }, + { + "epoch": 1.3398353388220392, + "grad_norm": 3.866802930831909, + "learning_rate": 8.756112985109577e-06, + "loss": 0.7965, + "step": 5289 + }, + { + "epoch": 1.3400886637112097, + "grad_norm": 3.816505193710327, + "learning_rate": 8.755559912077333e-06, + "loss": 0.7942, + "step": 5290 + }, + { + "epoch": 1.34034198860038, + "grad_norm": 3.6338398456573486, + "learning_rate": 8.755006733590729e-06, + "loss": 0.803, + "step": 5291 + }, + { + "epoch": 1.3405953134895503, + "grad_norm": 3.7093474864959717, + "learning_rate": 8.754453449665294e-06, + "loss": 0.8079, + "step": 5292 + }, + { + "epoch": 1.3408486383787208, + "grad_norm": 3.889493227005005, + "learning_rate": 8.753900060316565e-06, + "loss": 0.9149, + "step": 5293 + }, + { + "epoch": 1.341101963267891, + "grad_norm": 3.9428391456604004, + "learning_rate": 8.753346565560084e-06, + "loss": 0.7434, + "step": 5294 + }, + { + "epoch": 1.3413552881570614, + "grad_norm": 3.623548984527588, + "learning_rate": 8.752792965411389e-06, + "loss": 0.702, + "step": 5295 + }, + { + "epoch": 1.341608613046232, + "grad_norm": 3.752361536026001, + "learning_rate": 8.752239259886026e-06, + "loss": 0.7456, + "step": 5296 + }, + { + "epoch": 1.3418619379354022, + "grad_norm": 3.6794497966766357, + "learning_rate": 8.751685448999545e-06, + "loss": 0.6938, + "step": 5297 + }, + { + "epoch": 1.3421152628245725, + "grad_norm": 3.465989351272583, + "learning_rate": 8.751131532767491e-06, + "loss": 0.7443, + "step": 5298 + }, + { + "epoch": 1.342368587713743, + "grad_norm": 3.6318061351776123, + "learning_rate": 8.750577511205425e-06, + "loss": 0.7404, + "step": 5299 + }, + { + "epoch": 1.3426219126029133, + "grad_norm": 3.705566167831421, + "learning_rate": 8.750023384328901e-06, + "loss": 0.7912, + "step": 5300 + }, + { + "epoch": 1.3428752374920836, + "grad_norm": 3.9725582599639893, + "learning_rate": 8.749469152153475e-06, + "loss": 0.8353, + "step": 5301 + }, + { + "epoch": 1.3431285623812539, + "grad_norm": 3.7491159439086914, + "learning_rate": 8.748914814694715e-06, + "loss": 0.7185, + "step": 5302 + }, + { + "epoch": 1.3433818872704244, + "grad_norm": 3.601637125015259, + "learning_rate": 8.748360371968183e-06, + "loss": 0.774, + "step": 5303 + }, + { + "epoch": 1.3436352121595947, + "grad_norm": 3.9110145568847656, + "learning_rate": 8.747805823989448e-06, + "loss": 0.8749, + "step": 5304 + }, + { + "epoch": 1.343888537048765, + "grad_norm": 3.7703335285186768, + "learning_rate": 8.747251170774083e-06, + "loss": 0.8924, + "step": 5305 + }, + { + "epoch": 1.3441418619379353, + "grad_norm": 3.8879683017730713, + "learning_rate": 8.746696412337661e-06, + "loss": 0.9428, + "step": 5306 + }, + { + "epoch": 1.3443951868271058, + "grad_norm": 3.670351505279541, + "learning_rate": 8.74614154869576e-06, + "loss": 0.7602, + "step": 5307 + }, + { + "epoch": 1.344648511716276, + "grad_norm": 3.466641426086426, + "learning_rate": 8.745586579863958e-06, + "loss": 0.7725, + "step": 5308 + }, + { + "epoch": 1.3449018366054464, + "grad_norm": 3.7397964000701904, + "learning_rate": 8.745031505857842e-06, + "loss": 0.7807, + "step": 5309 + }, + { + "epoch": 1.345155161494617, + "grad_norm": 3.730457067489624, + "learning_rate": 8.744476326692998e-06, + "loss": 0.7872, + "step": 5310 + }, + { + "epoch": 1.3454084863837872, + "grad_norm": 3.326808452606201, + "learning_rate": 8.743921042385012e-06, + "loss": 0.7866, + "step": 5311 + }, + { + "epoch": 1.3456618112729575, + "grad_norm": 3.538274049758911, + "learning_rate": 8.743365652949479e-06, + "loss": 0.7308, + "step": 5312 + }, + { + "epoch": 1.345915136162128, + "grad_norm": 4.167849063873291, + "learning_rate": 8.742810158401991e-06, + "loss": 0.9237, + "step": 5313 + }, + { + "epoch": 1.3461684610512983, + "grad_norm": 3.679926872253418, + "learning_rate": 8.742254558758147e-06, + "loss": 0.7973, + "step": 5314 + }, + { + "epoch": 1.3464217859404686, + "grad_norm": 3.9718079566955566, + "learning_rate": 8.741698854033552e-06, + "loss": 0.732, + "step": 5315 + }, + { + "epoch": 1.346675110829639, + "grad_norm": 3.6564199924468994, + "learning_rate": 8.741143044243805e-06, + "loss": 0.7546, + "step": 5316 + }, + { + "epoch": 1.3469284357188094, + "grad_norm": 3.748065710067749, + "learning_rate": 8.740587129404517e-06, + "loss": 0.751, + "step": 5317 + }, + { + "epoch": 1.3471817606079797, + "grad_norm": 4.218088150024414, + "learning_rate": 8.740031109531293e-06, + "loss": 0.8092, + "step": 5318 + }, + { + "epoch": 1.3474350854971502, + "grad_norm": 3.18046236038208, + "learning_rate": 8.739474984639749e-06, + "loss": 0.671, + "step": 5319 + }, + { + "epoch": 1.3476884103863205, + "grad_norm": 4.337594032287598, + "learning_rate": 8.7389187547455e-06, + "loss": 0.9311, + "step": 5320 + }, + { + "epoch": 1.3479417352754908, + "grad_norm": 3.796656370162964, + "learning_rate": 8.738362419864164e-06, + "loss": 0.8098, + "step": 5321 + }, + { + "epoch": 1.3481950601646613, + "grad_norm": 3.755025863647461, + "learning_rate": 8.737805980011365e-06, + "loss": 0.7505, + "step": 5322 + }, + { + "epoch": 1.3484483850538316, + "grad_norm": 3.7076237201690674, + "learning_rate": 8.737249435202725e-06, + "loss": 0.8071, + "step": 5323 + }, + { + "epoch": 1.3487017099430019, + "grad_norm": 3.968423843383789, + "learning_rate": 8.736692785453873e-06, + "loss": 0.9095, + "step": 5324 + }, + { + "epoch": 1.3489550348321724, + "grad_norm": 3.9761288166046143, + "learning_rate": 8.736136030780438e-06, + "loss": 0.7993, + "step": 5325 + }, + { + "epoch": 1.3492083597213427, + "grad_norm": 4.578857421875, + "learning_rate": 8.735579171198055e-06, + "loss": 0.9795, + "step": 5326 + }, + { + "epoch": 1.349461684610513, + "grad_norm": 3.433751106262207, + "learning_rate": 8.735022206722359e-06, + "loss": 0.7931, + "step": 5327 + }, + { + "epoch": 1.3497150094996835, + "grad_norm": 3.4457149505615234, + "learning_rate": 8.73446513736899e-06, + "loss": 0.7807, + "step": 5328 + }, + { + "epoch": 1.3499683343888538, + "grad_norm": 4.20115852355957, + "learning_rate": 8.733907963153592e-06, + "loss": 0.8967, + "step": 5329 + }, + { + "epoch": 1.350221659278024, + "grad_norm": 3.8374433517456055, + "learning_rate": 8.733350684091806e-06, + "loss": 0.7224, + "step": 5330 + }, + { + "epoch": 1.3504749841671944, + "grad_norm": 3.5178754329681396, + "learning_rate": 8.732793300199284e-06, + "loss": 0.7479, + "step": 5331 + }, + { + "epoch": 1.3507283090563647, + "grad_norm": 3.638697624206543, + "learning_rate": 8.732235811491675e-06, + "loss": 0.8221, + "step": 5332 + }, + { + "epoch": 1.3509816339455352, + "grad_norm": 4.061330795288086, + "learning_rate": 8.731678217984633e-06, + "loss": 0.7904, + "step": 5333 + }, + { + "epoch": 1.3512349588347055, + "grad_norm": 3.571578025817871, + "learning_rate": 8.731120519693817e-06, + "loss": 0.6903, + "step": 5334 + }, + { + "epoch": 1.3514882837238757, + "grad_norm": 3.828092575073242, + "learning_rate": 8.730562716634885e-06, + "loss": 0.8504, + "step": 5335 + }, + { + "epoch": 1.3517416086130463, + "grad_norm": 3.6241204738616943, + "learning_rate": 8.7300048088235e-06, + "loss": 0.7688, + "step": 5336 + }, + { + "epoch": 1.3519949335022166, + "grad_norm": 3.739471435546875, + "learning_rate": 8.729446796275329e-06, + "loss": 0.8968, + "step": 5337 + }, + { + "epoch": 1.3522482583913868, + "grad_norm": 3.1405632495880127, + "learning_rate": 8.728888679006038e-06, + "loss": 0.705, + "step": 5338 + }, + { + "epoch": 1.3525015832805574, + "grad_norm": 4.076598167419434, + "learning_rate": 8.728330457031302e-06, + "loss": 0.8274, + "step": 5339 + }, + { + "epoch": 1.3527549081697277, + "grad_norm": 3.6819043159484863, + "learning_rate": 8.727772130366793e-06, + "loss": 0.6646, + "step": 5340 + }, + { + "epoch": 1.353008233058898, + "grad_norm": 4.0459442138671875, + "learning_rate": 8.727213699028193e-06, + "loss": 0.8875, + "step": 5341 + }, + { + "epoch": 1.3532615579480685, + "grad_norm": 3.763627052307129, + "learning_rate": 8.726655163031175e-06, + "loss": 0.771, + "step": 5342 + }, + { + "epoch": 1.3535148828372388, + "grad_norm": 3.540144920349121, + "learning_rate": 8.72609652239143e-06, + "loss": 0.7245, + "step": 5343 + }, + { + "epoch": 1.353768207726409, + "grad_norm": 3.6217358112335205, + "learning_rate": 8.725537777124639e-06, + "loss": 0.7704, + "step": 5344 + }, + { + "epoch": 1.3540215326155796, + "grad_norm": 3.867257595062256, + "learning_rate": 8.724978927246493e-06, + "loss": 0.7022, + "step": 5345 + }, + { + "epoch": 1.3542748575047499, + "grad_norm": 3.4698808193206787, + "learning_rate": 8.724419972772685e-06, + "loss": 0.8088, + "step": 5346 + }, + { + "epoch": 1.3545281823939201, + "grad_norm": 3.513871669769287, + "learning_rate": 8.72386091371891e-06, + "loss": 0.6515, + "step": 5347 + }, + { + "epoch": 1.3547815072830907, + "grad_norm": 3.530122756958008, + "learning_rate": 8.723301750100866e-06, + "loss": 0.7009, + "step": 5348 + }, + { + "epoch": 1.355034832172261, + "grad_norm": 3.7477927207946777, + "learning_rate": 8.722742481934253e-06, + "loss": 0.7698, + "step": 5349 + }, + { + "epoch": 1.3552881570614312, + "grad_norm": 3.7715811729431152, + "learning_rate": 8.722183109234776e-06, + "loss": 0.7976, + "step": 5350 + }, + { + "epoch": 1.3555414819506018, + "grad_norm": 3.772736072540283, + "learning_rate": 8.721623632018144e-06, + "loss": 0.8296, + "step": 5351 + }, + { + "epoch": 1.355794806839772, + "grad_norm": 4.373363971710205, + "learning_rate": 8.721064050300062e-06, + "loss": 0.8761, + "step": 5352 + }, + { + "epoch": 1.3560481317289423, + "grad_norm": 4.46722936630249, + "learning_rate": 8.720504364096247e-06, + "loss": 0.8954, + "step": 5353 + }, + { + "epoch": 1.3563014566181129, + "grad_norm": 3.9622766971588135, + "learning_rate": 8.719944573422413e-06, + "loss": 0.8905, + "step": 5354 + }, + { + "epoch": 1.3565547815072831, + "grad_norm": 4.295718669891357, + "learning_rate": 8.719384678294278e-06, + "loss": 0.8181, + "step": 5355 + }, + { + "epoch": 1.3568081063964534, + "grad_norm": 3.521519184112549, + "learning_rate": 8.718824678727564e-06, + "loss": 0.6659, + "step": 5356 + }, + { + "epoch": 1.3570614312856237, + "grad_norm": 3.8578474521636963, + "learning_rate": 8.718264574737998e-06, + "loss": 0.7713, + "step": 5357 + }, + { + "epoch": 1.3573147561747942, + "grad_norm": 3.9559175968170166, + "learning_rate": 8.717704366341305e-06, + "loss": 0.9289, + "step": 5358 + }, + { + "epoch": 1.3575680810639645, + "grad_norm": 3.6543405055999756, + "learning_rate": 8.717144053553213e-06, + "loss": 0.7082, + "step": 5359 + }, + { + "epoch": 1.3578214059531348, + "grad_norm": 3.4608185291290283, + "learning_rate": 8.71658363638946e-06, + "loss": 0.8287, + "step": 5360 + }, + { + "epoch": 1.3580747308423051, + "grad_norm": 3.783193349838257, + "learning_rate": 8.716023114865781e-06, + "loss": 0.8052, + "step": 5361 + }, + { + "epoch": 1.3583280557314756, + "grad_norm": 4.195127964019775, + "learning_rate": 8.715462488997915e-06, + "loss": 0.8677, + "step": 5362 + }, + { + "epoch": 1.358581380620646, + "grad_norm": 3.9031248092651367, + "learning_rate": 8.714901758801602e-06, + "loss": 0.887, + "step": 5363 + }, + { + "epoch": 1.3588347055098162, + "grad_norm": 3.688333034515381, + "learning_rate": 8.71434092429259e-06, + "loss": 0.8154, + "step": 5364 + }, + { + "epoch": 1.3590880303989867, + "grad_norm": 4.0218071937561035, + "learning_rate": 8.713779985486624e-06, + "loss": 0.7567, + "step": 5365 + }, + { + "epoch": 1.359341355288157, + "grad_norm": 3.3400766849517822, + "learning_rate": 8.71321894239946e-06, + "loss": 0.7612, + "step": 5366 + }, + { + "epoch": 1.3595946801773273, + "grad_norm": 3.4791815280914307, + "learning_rate": 8.712657795046843e-06, + "loss": 0.742, + "step": 5367 + }, + { + "epoch": 1.3598480050664978, + "grad_norm": 3.78662109375, + "learning_rate": 8.71209654344454e-06, + "loss": 0.8712, + "step": 5368 + }, + { + "epoch": 1.3601013299556681, + "grad_norm": 3.7294909954071045, + "learning_rate": 8.711535187608303e-06, + "loss": 0.8447, + "step": 5369 + }, + { + "epoch": 1.3603546548448384, + "grad_norm": 3.913294792175293, + "learning_rate": 8.710973727553898e-06, + "loss": 0.816, + "step": 5370 + }, + { + "epoch": 1.360607979734009, + "grad_norm": 2.970475435256958, + "learning_rate": 8.710412163297092e-06, + "loss": 0.6601, + "step": 5371 + }, + { + "epoch": 1.3608613046231792, + "grad_norm": 3.540560245513916, + "learning_rate": 8.709850494853649e-06, + "loss": 0.8807, + "step": 5372 + }, + { + "epoch": 1.3611146295123495, + "grad_norm": 3.5531113147735596, + "learning_rate": 8.709288722239345e-06, + "loss": 0.6864, + "step": 5373 + }, + { + "epoch": 1.36136795440152, + "grad_norm": 3.8616247177124023, + "learning_rate": 8.70872684546995e-06, + "loss": 0.7388, + "step": 5374 + }, + { + "epoch": 1.3616212792906903, + "grad_norm": 3.792313575744629, + "learning_rate": 8.708164864561244e-06, + "loss": 0.7444, + "step": 5375 + }, + { + "epoch": 1.3618746041798606, + "grad_norm": 4.210550308227539, + "learning_rate": 8.707602779529007e-06, + "loss": 0.9281, + "step": 5376 + }, + { + "epoch": 1.3621279290690311, + "grad_norm": 3.5212748050689697, + "learning_rate": 8.707040590389023e-06, + "loss": 0.7365, + "step": 5377 + }, + { + "epoch": 1.3623812539582014, + "grad_norm": 3.9914774894714355, + "learning_rate": 8.706478297157075e-06, + "loss": 0.8508, + "step": 5378 + }, + { + "epoch": 1.3626345788473717, + "grad_norm": 3.8686389923095703, + "learning_rate": 8.705915899848952e-06, + "loss": 0.7543, + "step": 5379 + }, + { + "epoch": 1.3628879037365422, + "grad_norm": 3.6494946479797363, + "learning_rate": 8.70535339848045e-06, + "loss": 0.7657, + "step": 5380 + }, + { + "epoch": 1.3631412286257125, + "grad_norm": 4.332513332366943, + "learning_rate": 8.70479079306736e-06, + "loss": 0.8377, + "step": 5381 + }, + { + "epoch": 1.3633945535148828, + "grad_norm": 3.740971326828003, + "learning_rate": 8.70422808362548e-06, + "loss": 0.7013, + "step": 5382 + }, + { + "epoch": 1.3636478784040533, + "grad_norm": 3.3125898838043213, + "learning_rate": 8.703665270170613e-06, + "loss": 0.7548, + "step": 5383 + }, + { + "epoch": 1.3639012032932236, + "grad_norm": 3.634493589401245, + "learning_rate": 8.70310235271856e-06, + "loss": 0.7967, + "step": 5384 + }, + { + "epoch": 1.364154528182394, + "grad_norm": 3.581219434738159, + "learning_rate": 8.70253933128513e-06, + "loss": 0.8396, + "step": 5385 + }, + { + "epoch": 1.3644078530715642, + "grad_norm": 3.6488876342773438, + "learning_rate": 8.701976205886128e-06, + "loss": 0.7908, + "step": 5386 + }, + { + "epoch": 1.3646611779607347, + "grad_norm": 3.6243093013763428, + "learning_rate": 8.701412976537374e-06, + "loss": 0.7404, + "step": 5387 + }, + { + "epoch": 1.364914502849905, + "grad_norm": 3.7664456367492676, + "learning_rate": 8.700849643254674e-06, + "loss": 0.7708, + "step": 5388 + }, + { + "epoch": 1.3651678277390753, + "grad_norm": 3.6289851665496826, + "learning_rate": 8.700286206053851e-06, + "loss": 0.8135, + "step": 5389 + }, + { + "epoch": 1.3654211526282456, + "grad_norm": 4.124152183532715, + "learning_rate": 8.699722664950728e-06, + "loss": 0.8847, + "step": 5390 + }, + { + "epoch": 1.365674477517416, + "grad_norm": 4.026323318481445, + "learning_rate": 8.699159019961125e-06, + "loss": 0.962, + "step": 5391 + }, + { + "epoch": 1.3659278024065864, + "grad_norm": 3.9823355674743652, + "learning_rate": 8.698595271100872e-06, + "loss": 0.752, + "step": 5392 + }, + { + "epoch": 1.3661811272957567, + "grad_norm": 3.1845438480377197, + "learning_rate": 8.698031418385795e-06, + "loss": 0.733, + "step": 5393 + }, + { + "epoch": 1.3664344521849272, + "grad_norm": 3.4990522861480713, + "learning_rate": 8.69746746183173e-06, + "loss": 0.6656, + "step": 5394 + }, + { + "epoch": 1.3666877770740975, + "grad_norm": 3.5090506076812744, + "learning_rate": 8.69690340145451e-06, + "loss": 0.753, + "step": 5395 + }, + { + "epoch": 1.3669411019632678, + "grad_norm": 3.798260450363159, + "learning_rate": 8.696339237269976e-06, + "loss": 0.868, + "step": 5396 + }, + { + "epoch": 1.3671944268524383, + "grad_norm": 3.562178134918213, + "learning_rate": 8.69577496929397e-06, + "loss": 0.9206, + "step": 5397 + }, + { + "epoch": 1.3674477517416086, + "grad_norm": 3.871551752090454, + "learning_rate": 8.695210597542335e-06, + "loss": 0.831, + "step": 5398 + }, + { + "epoch": 1.367701076630779, + "grad_norm": 3.3575351238250732, + "learning_rate": 8.694646122030918e-06, + "loss": 0.7636, + "step": 5399 + }, + { + "epoch": 1.3679544015199494, + "grad_norm": 4.550872802734375, + "learning_rate": 8.694081542775568e-06, + "loss": 0.7773, + "step": 5400 + }, + { + "epoch": 1.3682077264091197, + "grad_norm": 3.6302905082702637, + "learning_rate": 8.693516859792141e-06, + "loss": 0.763, + "step": 5401 + }, + { + "epoch": 1.36846105129829, + "grad_norm": 3.4151811599731445, + "learning_rate": 8.69295207309649e-06, + "loss": 0.7456, + "step": 5402 + }, + { + "epoch": 1.3687143761874605, + "grad_norm": 3.618662118911743, + "learning_rate": 8.692387182704478e-06, + "loss": 0.7602, + "step": 5403 + }, + { + "epoch": 1.3689677010766308, + "grad_norm": 3.916361093521118, + "learning_rate": 8.691822188631963e-06, + "loss": 0.7612, + "step": 5404 + }, + { + "epoch": 1.369221025965801, + "grad_norm": 3.935793161392212, + "learning_rate": 8.69125709089481e-06, + "loss": 0.7893, + "step": 5405 + }, + { + "epoch": 1.3694743508549716, + "grad_norm": 4.025719165802002, + "learning_rate": 8.690691889508892e-06, + "loss": 0.7205, + "step": 5406 + }, + { + "epoch": 1.369727675744142, + "grad_norm": 3.435882806777954, + "learning_rate": 8.690126584490072e-06, + "loss": 0.7244, + "step": 5407 + }, + { + "epoch": 1.3699810006333122, + "grad_norm": 3.6226847171783447, + "learning_rate": 8.689561175854227e-06, + "loss": 0.7393, + "step": 5408 + }, + { + "epoch": 1.3702343255224827, + "grad_norm": 3.647475004196167, + "learning_rate": 8.688995663617235e-06, + "loss": 0.8259, + "step": 5409 + }, + { + "epoch": 1.370487650411653, + "grad_norm": 3.8193819522857666, + "learning_rate": 8.688430047794974e-06, + "loss": 0.8489, + "step": 5410 + }, + { + "epoch": 1.3707409753008233, + "grad_norm": 3.629404306411743, + "learning_rate": 8.687864328403324e-06, + "loss": 0.7589, + "step": 5411 + }, + { + "epoch": 1.3709943001899938, + "grad_norm": 3.832561492919922, + "learning_rate": 8.687298505458173e-06, + "loss": 0.82, + "step": 5412 + }, + { + "epoch": 1.371247625079164, + "grad_norm": 3.3676552772521973, + "learning_rate": 8.686732578975407e-06, + "loss": 0.6939, + "step": 5413 + }, + { + "epoch": 1.3715009499683344, + "grad_norm": 3.587935447692871, + "learning_rate": 8.68616654897092e-06, + "loss": 0.6813, + "step": 5414 + }, + { + "epoch": 1.3717542748575047, + "grad_norm": 3.995159864425659, + "learning_rate": 8.685600415460603e-06, + "loss": 0.774, + "step": 5415 + }, + { + "epoch": 1.3720075997466752, + "grad_norm": 3.79481840133667, + "learning_rate": 8.685034178460354e-06, + "loss": 0.7407, + "step": 5416 + }, + { + "epoch": 1.3722609246358455, + "grad_norm": 4.003698348999023, + "learning_rate": 8.684467837986072e-06, + "loss": 0.8186, + "step": 5417 + }, + { + "epoch": 1.3725142495250158, + "grad_norm": 4.8956708908081055, + "learning_rate": 8.68390139405366e-06, + "loss": 0.7143, + "step": 5418 + }, + { + "epoch": 1.372767574414186, + "grad_norm": 3.907975196838379, + "learning_rate": 8.683334846679023e-06, + "loss": 0.7404, + "step": 5419 + }, + { + "epoch": 1.3730208993033566, + "grad_norm": 3.841737747192383, + "learning_rate": 8.68276819587807e-06, + "loss": 0.8271, + "step": 5420 + }, + { + "epoch": 1.3732742241925269, + "grad_norm": 3.39402437210083, + "learning_rate": 8.682201441666713e-06, + "loss": 0.7675, + "step": 5421 + }, + { + "epoch": 1.3735275490816972, + "grad_norm": 3.759916305541992, + "learning_rate": 8.681634584060865e-06, + "loss": 0.7823, + "step": 5422 + }, + { + "epoch": 1.3737808739708677, + "grad_norm": 3.8228251934051514, + "learning_rate": 8.681067623076442e-06, + "loss": 0.8867, + "step": 5423 + }, + { + "epoch": 1.374034198860038, + "grad_norm": 3.8812716007232666, + "learning_rate": 8.680500558729367e-06, + "loss": 0.7029, + "step": 5424 + }, + { + "epoch": 1.3742875237492083, + "grad_norm": 3.287914991378784, + "learning_rate": 8.67993339103556e-06, + "loss": 0.7383, + "step": 5425 + }, + { + "epoch": 1.3745408486383788, + "grad_norm": 3.307253837585449, + "learning_rate": 8.67936612001095e-06, + "loss": 0.7984, + "step": 5426 + }, + { + "epoch": 1.374794173527549, + "grad_norm": 3.8354382514953613, + "learning_rate": 8.678798745671462e-06, + "loss": 0.9244, + "step": 5427 + }, + { + "epoch": 1.3750474984167194, + "grad_norm": 3.7706069946289062, + "learning_rate": 8.67823126803303e-06, + "loss": 0.7722, + "step": 5428 + }, + { + "epoch": 1.3753008233058899, + "grad_norm": 4.056471347808838, + "learning_rate": 8.677663687111589e-06, + "loss": 0.6343, + "step": 5429 + }, + { + "epoch": 1.3755541481950602, + "grad_norm": 3.8904762268066406, + "learning_rate": 8.677096002923073e-06, + "loss": 0.8716, + "step": 5430 + }, + { + "epoch": 1.3758074730842305, + "grad_norm": 3.622706174850464, + "learning_rate": 8.676528215483426e-06, + "loss": 0.7528, + "step": 5431 + }, + { + "epoch": 1.376060797973401, + "grad_norm": 3.5586719512939453, + "learning_rate": 8.67596032480859e-06, + "loss": 0.6346, + "step": 5432 + }, + { + "epoch": 1.3763141228625713, + "grad_norm": 3.39465069770813, + "learning_rate": 8.67539233091451e-06, + "loss": 0.8285, + "step": 5433 + }, + { + "epoch": 1.3765674477517416, + "grad_norm": 3.934678554534912, + "learning_rate": 8.674824233817136e-06, + "loss": 0.7948, + "step": 5434 + }, + { + "epoch": 1.376820772640912, + "grad_norm": 3.390430450439453, + "learning_rate": 8.67425603353242e-06, + "loss": 0.8003, + "step": 5435 + }, + { + "epoch": 1.3770740975300824, + "grad_norm": 3.2829158306121826, + "learning_rate": 8.673687730076317e-06, + "loss": 0.6213, + "step": 5436 + }, + { + "epoch": 1.3773274224192527, + "grad_norm": 3.317941427230835, + "learning_rate": 8.673119323464785e-06, + "loss": 0.7675, + "step": 5437 + }, + { + "epoch": 1.3775807473084232, + "grad_norm": 3.8105340003967285, + "learning_rate": 8.672550813713782e-06, + "loss": 0.7542, + "step": 5438 + }, + { + "epoch": 1.3778340721975935, + "grad_norm": 3.4200289249420166, + "learning_rate": 8.671982200839276e-06, + "loss": 0.7855, + "step": 5439 + }, + { + "epoch": 1.3780873970867638, + "grad_norm": 4.118893146514893, + "learning_rate": 8.67141348485723e-06, + "loss": 0.8599, + "step": 5440 + }, + { + "epoch": 1.3783407219759343, + "grad_norm": 3.455399990081787, + "learning_rate": 8.670844665783613e-06, + "loss": 0.8014, + "step": 5441 + }, + { + "epoch": 1.3785940468651046, + "grad_norm": 3.6838300228118896, + "learning_rate": 8.670275743634398e-06, + "loss": 0.8718, + "step": 5442 + }, + { + "epoch": 1.3788473717542749, + "grad_norm": 3.4987316131591797, + "learning_rate": 8.66970671842556e-06, + "loss": 0.7655, + "step": 5443 + }, + { + "epoch": 1.3791006966434451, + "grad_norm": 3.2173705101013184, + "learning_rate": 8.669137590173078e-06, + "loss": 0.7933, + "step": 5444 + }, + { + "epoch": 1.3793540215326157, + "grad_norm": 3.6370177268981934, + "learning_rate": 8.668568358892933e-06, + "loss": 0.8318, + "step": 5445 + }, + { + "epoch": 1.379607346421786, + "grad_norm": 3.6624679565429688, + "learning_rate": 8.667999024601106e-06, + "loss": 0.8758, + "step": 5446 + }, + { + "epoch": 1.3798606713109562, + "grad_norm": 3.8879568576812744, + "learning_rate": 8.667429587313588e-06, + "loss": 0.8166, + "step": 5447 + }, + { + "epoch": 1.3801139962001265, + "grad_norm": 3.623021125793457, + "learning_rate": 8.666860047046364e-06, + "loss": 0.8983, + "step": 5448 + }, + { + "epoch": 1.380367321089297, + "grad_norm": 3.325852632522583, + "learning_rate": 8.666290403815429e-06, + "loss": 0.7732, + "step": 5449 + }, + { + "epoch": 1.3806206459784673, + "grad_norm": 3.675074815750122, + "learning_rate": 8.665720657636779e-06, + "loss": 0.9026, + "step": 5450 + }, + { + "epoch": 1.3808739708676376, + "grad_norm": 3.3649652004241943, + "learning_rate": 8.665150808526407e-06, + "loss": 0.7851, + "step": 5451 + }, + { + "epoch": 1.3811272957568081, + "grad_norm": 4.415377140045166, + "learning_rate": 8.664580856500321e-06, + "loss": 1.0493, + "step": 5452 + }, + { + "epoch": 1.3813806206459784, + "grad_norm": 3.6024553775787354, + "learning_rate": 8.664010801574523e-06, + "loss": 0.8162, + "step": 5453 + }, + { + "epoch": 1.3816339455351487, + "grad_norm": 3.582939386367798, + "learning_rate": 8.663440643765018e-06, + "loss": 0.6934, + "step": 5454 + }, + { + "epoch": 1.3818872704243192, + "grad_norm": 3.8280136585235596, + "learning_rate": 8.662870383087816e-06, + "loss": 0.9264, + "step": 5455 + }, + { + "epoch": 1.3821405953134895, + "grad_norm": 3.5490760803222656, + "learning_rate": 8.662300019558931e-06, + "loss": 0.6925, + "step": 5456 + }, + { + "epoch": 1.3823939202026598, + "grad_norm": 3.7981576919555664, + "learning_rate": 8.661729553194378e-06, + "loss": 0.7711, + "step": 5457 + }, + { + "epoch": 1.3826472450918303, + "grad_norm": 3.8355305194854736, + "learning_rate": 8.661158984010177e-06, + "loss": 0.9728, + "step": 5458 + }, + { + "epoch": 1.3829005699810006, + "grad_norm": 4.154177665710449, + "learning_rate": 8.660588312022345e-06, + "loss": 0.798, + "step": 5459 + }, + { + "epoch": 1.383153894870171, + "grad_norm": 3.3235461711883545, + "learning_rate": 8.660017537246908e-06, + "loss": 0.6501, + "step": 5460 + }, + { + "epoch": 1.3834072197593414, + "grad_norm": 4.393592357635498, + "learning_rate": 8.659446659699896e-06, + "loss": 0.8839, + "step": 5461 + }, + { + "epoch": 1.3836605446485117, + "grad_norm": 4.016276836395264, + "learning_rate": 8.658875679397335e-06, + "loss": 0.7344, + "step": 5462 + }, + { + "epoch": 1.383913869537682, + "grad_norm": 3.751662492752075, + "learning_rate": 8.658304596355263e-06, + "loss": 0.7765, + "step": 5463 + }, + { + "epoch": 1.3841671944268525, + "grad_norm": 3.7409374713897705, + "learning_rate": 8.657733410589711e-06, + "loss": 0.7355, + "step": 5464 + }, + { + "epoch": 1.3844205193160228, + "grad_norm": 3.8616786003112793, + "learning_rate": 8.657162122116718e-06, + "loss": 0.7349, + "step": 5465 + }, + { + "epoch": 1.3846738442051931, + "grad_norm": 4.051181793212891, + "learning_rate": 8.656590730952327e-06, + "loss": 0.7971, + "step": 5466 + }, + { + "epoch": 1.3849271690943636, + "grad_norm": 3.5221352577209473, + "learning_rate": 8.656019237112583e-06, + "loss": 0.6992, + "step": 5467 + }, + { + "epoch": 1.385180493983534, + "grad_norm": 3.9331021308898926, + "learning_rate": 8.655447640613532e-06, + "loss": 0.928, + "step": 5468 + }, + { + "epoch": 1.3854338188727042, + "grad_norm": 3.5726869106292725, + "learning_rate": 8.654875941471223e-06, + "loss": 0.6088, + "step": 5469 + }, + { + "epoch": 1.3856871437618747, + "grad_norm": 3.962658405303955, + "learning_rate": 8.654304139701712e-06, + "loss": 0.811, + "step": 5470 + }, + { + "epoch": 1.385940468651045, + "grad_norm": 3.7248291969299316, + "learning_rate": 8.653732235321054e-06, + "loss": 0.719, + "step": 5471 + }, + { + "epoch": 1.3861937935402153, + "grad_norm": 4.063478469848633, + "learning_rate": 8.653160228345306e-06, + "loss": 0.8795, + "step": 5472 + }, + { + "epoch": 1.3864471184293856, + "grad_norm": 3.4277076721191406, + "learning_rate": 8.652588118790531e-06, + "loss": 0.7127, + "step": 5473 + }, + { + "epoch": 1.3867004433185561, + "grad_norm": 3.921884298324585, + "learning_rate": 8.652015906672795e-06, + "loss": 0.8055, + "step": 5474 + }, + { + "epoch": 1.3869537682077264, + "grad_norm": 3.5158169269561768, + "learning_rate": 8.651443592008162e-06, + "loss": 0.7718, + "step": 5475 + }, + { + "epoch": 1.3872070930968967, + "grad_norm": 3.964005470275879, + "learning_rate": 8.650871174812706e-06, + "loss": 0.8302, + "step": 5476 + }, + { + "epoch": 1.387460417986067, + "grad_norm": 3.9888625144958496, + "learning_rate": 8.650298655102498e-06, + "loss": 0.8448, + "step": 5477 + }, + { + "epoch": 1.3877137428752375, + "grad_norm": 3.528465747833252, + "learning_rate": 8.649726032893614e-06, + "loss": 0.7621, + "step": 5478 + }, + { + "epoch": 1.3879670677644078, + "grad_norm": 3.5892975330352783, + "learning_rate": 8.649153308202133e-06, + "loss": 0.7764, + "step": 5479 + }, + { + "epoch": 1.388220392653578, + "grad_norm": 3.736577033996582, + "learning_rate": 8.648580481044138e-06, + "loss": 0.69, + "step": 5480 + }, + { + "epoch": 1.3884737175427486, + "grad_norm": 4.016088008880615, + "learning_rate": 8.648007551435713e-06, + "loss": 0.9458, + "step": 5481 + }, + { + "epoch": 1.388727042431919, + "grad_norm": 4.121703624725342, + "learning_rate": 8.647434519392947e-06, + "loss": 0.8649, + "step": 5482 + }, + { + "epoch": 1.3889803673210892, + "grad_norm": 3.927715539932251, + "learning_rate": 8.646861384931928e-06, + "loss": 0.7293, + "step": 5483 + }, + { + "epoch": 1.3892336922102597, + "grad_norm": 3.837312936782837, + "learning_rate": 8.646288148068751e-06, + "loss": 0.8717, + "step": 5484 + }, + { + "epoch": 1.38948701709943, + "grad_norm": 4.266937255859375, + "learning_rate": 8.64571480881951e-06, + "loss": 1.0201, + "step": 5485 + }, + { + "epoch": 1.3897403419886003, + "grad_norm": 3.5972275733947754, + "learning_rate": 8.645141367200307e-06, + "loss": 0.8153, + "step": 5486 + }, + { + "epoch": 1.3899936668777708, + "grad_norm": 4.050802230834961, + "learning_rate": 8.644567823227244e-06, + "loss": 0.7836, + "step": 5487 + }, + { + "epoch": 1.390246991766941, + "grad_norm": 3.532475233078003, + "learning_rate": 8.643994176916423e-06, + "loss": 0.8349, + "step": 5488 + }, + { + "epoch": 1.3905003166561114, + "grad_norm": 3.3756234645843506, + "learning_rate": 8.643420428283955e-06, + "loss": 0.7766, + "step": 5489 + }, + { + "epoch": 1.390753641545282, + "grad_norm": 3.807551860809326, + "learning_rate": 8.642846577345948e-06, + "loss": 0.8755, + "step": 5490 + }, + { + "epoch": 1.3910069664344522, + "grad_norm": 3.5619986057281494, + "learning_rate": 8.642272624118516e-06, + "loss": 0.8169, + "step": 5491 + }, + { + "epoch": 1.3912602913236225, + "grad_norm": 3.6280767917633057, + "learning_rate": 8.641698568617776e-06, + "loss": 0.9237, + "step": 5492 + }, + { + "epoch": 1.391513616212793, + "grad_norm": 3.6135356426239014, + "learning_rate": 8.641124410859846e-06, + "loss": 0.7958, + "step": 5493 + }, + { + "epoch": 1.3917669411019633, + "grad_norm": 3.8960044384002686, + "learning_rate": 8.640550150860852e-06, + "loss": 0.8649, + "step": 5494 + }, + { + "epoch": 1.3920202659911336, + "grad_norm": 3.3087053298950195, + "learning_rate": 8.639975788636914e-06, + "loss": 0.7286, + "step": 5495 + }, + { + "epoch": 1.3922735908803041, + "grad_norm": 3.6514222621917725, + "learning_rate": 8.639401324204161e-06, + "loss": 0.8522, + "step": 5496 + }, + { + "epoch": 1.3925269157694744, + "grad_norm": 3.496098279953003, + "learning_rate": 8.638826757578727e-06, + "loss": 0.701, + "step": 5497 + }, + { + "epoch": 1.3927802406586447, + "grad_norm": 3.282693386077881, + "learning_rate": 8.638252088776743e-06, + "loss": 0.6856, + "step": 5498 + }, + { + "epoch": 1.3930335655478152, + "grad_norm": 3.991990804672241, + "learning_rate": 8.637677317814342e-06, + "loss": 0.8066, + "step": 5499 + }, + { + "epoch": 1.3932868904369855, + "grad_norm": 4.0754218101501465, + "learning_rate": 8.63710244470767e-06, + "loss": 0.9027, + "step": 5500 + }, + { + "epoch": 1.3932868904369855, + "eval_loss": 1.1785508394241333, + "eval_runtime": 13.9529, + "eval_samples_per_second": 28.668, + "eval_steps_per_second": 3.583, + "step": 5500 + }, + { + "epoch": 1.3935402153261558, + "grad_norm": 3.994100332260132, + "learning_rate": 8.636527469472864e-06, + "loss": 0.8649, + "step": 5501 + }, + { + "epoch": 1.393793540215326, + "grad_norm": 3.872138738632202, + "learning_rate": 8.635952392126072e-06, + "loss": 0.8447, + "step": 5502 + }, + { + "epoch": 1.3940468651044964, + "grad_norm": 4.0767059326171875, + "learning_rate": 8.635377212683442e-06, + "loss": 0.737, + "step": 5503 + }, + { + "epoch": 1.394300189993667, + "grad_norm": 3.502206325531006, + "learning_rate": 8.634801931161122e-06, + "loss": 0.6714, + "step": 5504 + }, + { + "epoch": 1.3945535148828372, + "grad_norm": 3.764141082763672, + "learning_rate": 8.634226547575268e-06, + "loss": 0.7611, + "step": 5505 + }, + { + "epoch": 1.3948068397720075, + "grad_norm": 3.837657928466797, + "learning_rate": 8.633651061942035e-06, + "loss": 0.7611, + "step": 5506 + }, + { + "epoch": 1.395060164661178, + "grad_norm": 3.842958450317383, + "learning_rate": 8.633075474277585e-06, + "loss": 0.7583, + "step": 5507 + }, + { + "epoch": 1.3953134895503483, + "grad_norm": 3.8636865615844727, + "learning_rate": 8.632499784598076e-06, + "loss": 0.8368, + "step": 5508 + }, + { + "epoch": 1.3955668144395186, + "grad_norm": 3.5453438758850098, + "learning_rate": 8.631923992919677e-06, + "loss": 0.7305, + "step": 5509 + }, + { + "epoch": 1.395820139328689, + "grad_norm": 3.8817951679229736, + "learning_rate": 8.631348099258556e-06, + "loss": 0.7234, + "step": 5510 + }, + { + "epoch": 1.3960734642178594, + "grad_norm": 3.8134021759033203, + "learning_rate": 8.63077210363088e-06, + "loss": 0.8138, + "step": 5511 + }, + { + "epoch": 1.3963267891070297, + "grad_norm": 3.794355630874634, + "learning_rate": 8.630196006052826e-06, + "loss": 0.7811, + "step": 5512 + }, + { + "epoch": 1.3965801139962002, + "grad_norm": 3.3116185665130615, + "learning_rate": 8.629619806540568e-06, + "loss": 0.7572, + "step": 5513 + }, + { + "epoch": 1.3968334388853705, + "grad_norm": 3.5003790855407715, + "learning_rate": 8.629043505110288e-06, + "loss": 0.761, + "step": 5514 + }, + { + "epoch": 1.3970867637745408, + "grad_norm": 3.635342597961426, + "learning_rate": 8.628467101778168e-06, + "loss": 0.6976, + "step": 5515 + }, + { + "epoch": 1.3973400886637113, + "grad_norm": 3.8918471336364746, + "learning_rate": 8.627890596560392e-06, + "loss": 0.8624, + "step": 5516 + }, + { + "epoch": 1.3975934135528816, + "grad_norm": 4.08282470703125, + "learning_rate": 8.627313989473147e-06, + "loss": 0.8639, + "step": 5517 + }, + { + "epoch": 1.3978467384420519, + "grad_norm": 3.855114221572876, + "learning_rate": 8.626737280532626e-06, + "loss": 0.8408, + "step": 5518 + }, + { + "epoch": 1.3981000633312224, + "grad_norm": 3.85269832611084, + "learning_rate": 8.626160469755024e-06, + "loss": 0.9654, + "step": 5519 + }, + { + "epoch": 1.3983533882203927, + "grad_norm": 4.240458965301514, + "learning_rate": 8.625583557156534e-06, + "loss": 0.888, + "step": 5520 + }, + { + "epoch": 1.398606713109563, + "grad_norm": 3.8642497062683105, + "learning_rate": 8.625006542753355e-06, + "loss": 0.7675, + "step": 5521 + }, + { + "epoch": 1.3988600379987335, + "grad_norm": 3.7889339923858643, + "learning_rate": 8.624429426561694e-06, + "loss": 0.7812, + "step": 5522 + }, + { + "epoch": 1.3991133628879038, + "grad_norm": 3.741114854812622, + "learning_rate": 8.623852208597751e-06, + "loss": 0.7559, + "step": 5523 + }, + { + "epoch": 1.399366687777074, + "grad_norm": 3.518021821975708, + "learning_rate": 8.62327488887774e-06, + "loss": 0.8203, + "step": 5524 + }, + { + "epoch": 1.3996200126662446, + "grad_norm": 3.691943883895874, + "learning_rate": 8.622697467417864e-06, + "loss": 0.7384, + "step": 5525 + }, + { + "epoch": 1.3998733375554149, + "grad_norm": 3.277179718017578, + "learning_rate": 8.622119944234344e-06, + "loss": 0.7238, + "step": 5526 + }, + { + "epoch": 1.4001266624445852, + "grad_norm": 3.4786086082458496, + "learning_rate": 8.621542319343392e-06, + "loss": 0.8367, + "step": 5527 + }, + { + "epoch": 1.4003799873337555, + "grad_norm": 3.466938018798828, + "learning_rate": 8.62096459276123e-06, + "loss": 0.7996, + "step": 5528 + }, + { + "epoch": 1.400633312222926, + "grad_norm": 4.1230549812316895, + "learning_rate": 8.620386764504079e-06, + "loss": 0.9145, + "step": 5529 + }, + { + "epoch": 1.4008866371120963, + "grad_norm": 3.90274977684021, + "learning_rate": 8.619808834588163e-06, + "loss": 0.7999, + "step": 5530 + }, + { + "epoch": 1.4011399620012666, + "grad_norm": 3.6755170822143555, + "learning_rate": 8.619230803029715e-06, + "loss": 0.747, + "step": 5531 + }, + { + "epoch": 1.4013932868904369, + "grad_norm": 3.832758665084839, + "learning_rate": 8.618652669844959e-06, + "loss": 0.7941, + "step": 5532 + }, + { + "epoch": 1.4016466117796074, + "grad_norm": 3.6194913387298584, + "learning_rate": 8.618074435050134e-06, + "loss": 0.8249, + "step": 5533 + }, + { + "epoch": 1.4018999366687777, + "grad_norm": 3.7348620891571045, + "learning_rate": 8.617496098661474e-06, + "loss": 0.8321, + "step": 5534 + }, + { + "epoch": 1.402153261557948, + "grad_norm": 4.187718391418457, + "learning_rate": 8.616917660695218e-06, + "loss": 0.9079, + "step": 5535 + }, + { + "epoch": 1.4024065864471185, + "grad_norm": 3.6806905269622803, + "learning_rate": 8.61633912116761e-06, + "loss": 0.6829, + "step": 5536 + }, + { + "epoch": 1.4026599113362888, + "grad_norm": 3.9087767601013184, + "learning_rate": 8.615760480094893e-06, + "loss": 0.7842, + "step": 5537 + }, + { + "epoch": 1.402913236225459, + "grad_norm": 3.9356274604797363, + "learning_rate": 8.615181737493318e-06, + "loss": 0.9304, + "step": 5538 + }, + { + "epoch": 1.4031665611146296, + "grad_norm": 3.6604814529418945, + "learning_rate": 8.614602893379134e-06, + "loss": 0.7697, + "step": 5539 + }, + { + "epoch": 1.4034198860037999, + "grad_norm": 3.9226155281066895, + "learning_rate": 8.614023947768596e-06, + "loss": 0.766, + "step": 5540 + }, + { + "epoch": 1.4036732108929701, + "grad_norm": 4.183846950531006, + "learning_rate": 8.613444900677956e-06, + "loss": 0.8535, + "step": 5541 + }, + { + "epoch": 1.4039265357821407, + "grad_norm": 3.5136399269104004, + "learning_rate": 8.61286575212348e-06, + "loss": 0.7414, + "step": 5542 + }, + { + "epoch": 1.404179860671311, + "grad_norm": 3.9637434482574463, + "learning_rate": 8.612286502121425e-06, + "loss": 0.7569, + "step": 5543 + }, + { + "epoch": 1.4044331855604812, + "grad_norm": 4.18505859375, + "learning_rate": 8.61170715068806e-06, + "loss": 0.886, + "step": 5544 + }, + { + "epoch": 1.4046865104496518, + "grad_norm": 3.609485626220703, + "learning_rate": 8.611127697839649e-06, + "loss": 0.8163, + "step": 5545 + }, + { + "epoch": 1.404939835338822, + "grad_norm": 4.104346752166748, + "learning_rate": 8.610548143592465e-06, + "loss": 0.7772, + "step": 5546 + }, + { + "epoch": 1.4051931602279923, + "grad_norm": 3.5270981788635254, + "learning_rate": 8.60996848796278e-06, + "loss": 0.78, + "step": 5547 + }, + { + "epoch": 1.4054464851171629, + "grad_norm": 3.4089438915252686, + "learning_rate": 8.609388730966875e-06, + "loss": 0.7123, + "step": 5548 + }, + { + "epoch": 1.4056998100063332, + "grad_norm": 3.5410640239715576, + "learning_rate": 8.608808872621025e-06, + "loss": 0.7352, + "step": 5549 + }, + { + "epoch": 1.4059531348955034, + "grad_norm": 3.7271721363067627, + "learning_rate": 8.608228912941513e-06, + "loss": 0.7688, + "step": 5550 + }, + { + "epoch": 1.406206459784674, + "grad_norm": 4.498554706573486, + "learning_rate": 8.607648851944624e-06, + "loss": 0.798, + "step": 5551 + }, + { + "epoch": 1.4064597846738442, + "grad_norm": 3.568249225616455, + "learning_rate": 8.607068689646645e-06, + "loss": 0.8086, + "step": 5552 + }, + { + "epoch": 1.4067131095630145, + "grad_norm": 3.853929042816162, + "learning_rate": 8.606488426063868e-06, + "loss": 0.7863, + "step": 5553 + }, + { + "epoch": 1.406966434452185, + "grad_norm": 4.166558742523193, + "learning_rate": 8.605908061212588e-06, + "loss": 0.8253, + "step": 5554 + }, + { + "epoch": 1.4072197593413553, + "grad_norm": 3.6892173290252686, + "learning_rate": 8.605327595109099e-06, + "loss": 0.7393, + "step": 5555 + }, + { + "epoch": 1.4074730842305256, + "grad_norm": 3.8383421897888184, + "learning_rate": 8.6047470277697e-06, + "loss": 0.7559, + "step": 5556 + }, + { + "epoch": 1.407726409119696, + "grad_norm": 3.5189883708953857, + "learning_rate": 8.604166359210695e-06, + "loss": 0.8178, + "step": 5557 + }, + { + "epoch": 1.4079797340088664, + "grad_norm": 4.024250030517578, + "learning_rate": 8.603585589448387e-06, + "loss": 0.8893, + "step": 5558 + }, + { + "epoch": 1.4082330588980367, + "grad_norm": 4.513950824737549, + "learning_rate": 8.603004718499084e-06, + "loss": 0.8941, + "step": 5559 + }, + { + "epoch": 1.408486383787207, + "grad_norm": 3.8571219444274902, + "learning_rate": 8.602423746379098e-06, + "loss": 0.7141, + "step": 5560 + }, + { + "epoch": 1.4087397086763773, + "grad_norm": 3.1437816619873047, + "learning_rate": 8.601842673104743e-06, + "loss": 0.7115, + "step": 5561 + }, + { + "epoch": 1.4089930335655478, + "grad_norm": 3.529634714126587, + "learning_rate": 8.601261498692332e-06, + "loss": 0.8655, + "step": 5562 + }, + { + "epoch": 1.4092463584547181, + "grad_norm": 3.311220645904541, + "learning_rate": 8.600680223158186e-06, + "loss": 0.7382, + "step": 5563 + }, + { + "epoch": 1.4094996833438884, + "grad_norm": 3.498736619949341, + "learning_rate": 8.600098846518628e-06, + "loss": 0.7842, + "step": 5564 + }, + { + "epoch": 1.409753008233059, + "grad_norm": 4.130739688873291, + "learning_rate": 8.599517368789981e-06, + "loss": 0.83, + "step": 5565 + }, + { + "epoch": 1.4100063331222292, + "grad_norm": 3.8548920154571533, + "learning_rate": 8.598935789988572e-06, + "loss": 0.9314, + "step": 5566 + }, + { + "epoch": 1.4102596580113995, + "grad_norm": 3.650334358215332, + "learning_rate": 8.598354110130734e-06, + "loss": 0.7953, + "step": 5567 + }, + { + "epoch": 1.41051298290057, + "grad_norm": 3.7701852321624756, + "learning_rate": 8.597772329232799e-06, + "loss": 0.8699, + "step": 5568 + }, + { + "epoch": 1.4107663077897403, + "grad_norm": 4.091150760650635, + "learning_rate": 8.597190447311104e-06, + "loss": 0.8185, + "step": 5569 + }, + { + "epoch": 1.4110196326789106, + "grad_norm": 3.8292407989501953, + "learning_rate": 8.596608464381987e-06, + "loss": 0.7758, + "step": 5570 + }, + { + "epoch": 1.4112729575680811, + "grad_norm": 3.3151817321777344, + "learning_rate": 8.596026380461789e-06, + "loss": 0.7212, + "step": 5571 + }, + { + "epoch": 1.4115262824572514, + "grad_norm": 3.7377889156341553, + "learning_rate": 8.595444195566856e-06, + "loss": 0.7684, + "step": 5572 + }, + { + "epoch": 1.4117796073464217, + "grad_norm": 3.6139307022094727, + "learning_rate": 8.594861909713534e-06, + "loss": 0.7257, + "step": 5573 + }, + { + "epoch": 1.4120329322355922, + "grad_norm": 3.971952438354492, + "learning_rate": 8.594279522918176e-06, + "loss": 0.8327, + "step": 5574 + }, + { + "epoch": 1.4122862571247625, + "grad_norm": 4.215395927429199, + "learning_rate": 8.593697035197133e-06, + "loss": 0.8302, + "step": 5575 + }, + { + "epoch": 1.4125395820139328, + "grad_norm": 3.487239122390747, + "learning_rate": 8.593114446566762e-06, + "loss": 0.6953, + "step": 5576 + }, + { + "epoch": 1.4127929069031033, + "grad_norm": 3.808400869369507, + "learning_rate": 8.59253175704342e-06, + "loss": 0.7871, + "step": 5577 + }, + { + "epoch": 1.4130462317922736, + "grad_norm": 3.6702208518981934, + "learning_rate": 8.59194896664347e-06, + "loss": 0.8086, + "step": 5578 + }, + { + "epoch": 1.413299556681444, + "grad_norm": 3.7358663082122803, + "learning_rate": 8.591366075383278e-06, + "loss": 0.7254, + "step": 5579 + }, + { + "epoch": 1.4135528815706144, + "grad_norm": 3.666933059692383, + "learning_rate": 8.59078308327921e-06, + "loss": 0.8223, + "step": 5580 + }, + { + "epoch": 1.4138062064597847, + "grad_norm": 3.9592697620391846, + "learning_rate": 8.590199990347634e-06, + "loss": 0.8267, + "step": 5581 + }, + { + "epoch": 1.414059531348955, + "grad_norm": 3.7985219955444336, + "learning_rate": 8.589616796604927e-06, + "loss": 0.8266, + "step": 5582 + }, + { + "epoch": 1.4143128562381255, + "grad_norm": 3.6899030208587646, + "learning_rate": 8.589033502067462e-06, + "loss": 0.7936, + "step": 5583 + }, + { + "epoch": 1.4145661811272958, + "grad_norm": 3.7015511989593506, + "learning_rate": 8.588450106751617e-06, + "loss": 0.7876, + "step": 5584 + }, + { + "epoch": 1.414819506016466, + "grad_norm": 3.870482921600342, + "learning_rate": 8.587866610673777e-06, + "loss": 0.8474, + "step": 5585 + }, + { + "epoch": 1.4150728309056364, + "grad_norm": 4.06734037399292, + "learning_rate": 8.587283013850322e-06, + "loss": 0.7302, + "step": 5586 + }, + { + "epoch": 1.415326155794807, + "grad_norm": 3.912153959274292, + "learning_rate": 8.586699316297645e-06, + "loss": 0.8961, + "step": 5587 + }, + { + "epoch": 1.4155794806839772, + "grad_norm": 3.841379165649414, + "learning_rate": 8.586115518032128e-06, + "loss": 0.8071, + "step": 5588 + }, + { + "epoch": 1.4158328055731475, + "grad_norm": 3.9042487144470215, + "learning_rate": 8.585531619070168e-06, + "loss": 0.8927, + "step": 5589 + }, + { + "epoch": 1.4160861304623178, + "grad_norm": 3.9281656742095947, + "learning_rate": 8.584947619428164e-06, + "loss": 0.6475, + "step": 5590 + }, + { + "epoch": 1.4163394553514883, + "grad_norm": 3.717174768447876, + "learning_rate": 8.584363519122508e-06, + "loss": 0.7899, + "step": 5591 + }, + { + "epoch": 1.4165927802406586, + "grad_norm": 3.817960500717163, + "learning_rate": 8.583779318169605e-06, + "loss": 0.7954, + "step": 5592 + }, + { + "epoch": 1.416846105129829, + "grad_norm": 3.9455277919769287, + "learning_rate": 8.58319501658586e-06, + "loss": 0.7176, + "step": 5593 + }, + { + "epoch": 1.4170994300189994, + "grad_norm": 3.388749361038208, + "learning_rate": 8.582610614387676e-06, + "loss": 0.8285, + "step": 5594 + }, + { + "epoch": 1.4173527549081697, + "grad_norm": 3.77699613571167, + "learning_rate": 8.582026111591468e-06, + "loss": 0.7809, + "step": 5595 + }, + { + "epoch": 1.41760607979734, + "grad_norm": 3.567117929458618, + "learning_rate": 8.581441508213644e-06, + "loss": 0.7515, + "step": 5596 + }, + { + "epoch": 1.4178594046865105, + "grad_norm": 3.967083215713501, + "learning_rate": 8.58085680427062e-06, + "loss": 0.9218, + "step": 5597 + }, + { + "epoch": 1.4181127295756808, + "grad_norm": 3.3822312355041504, + "learning_rate": 8.580271999778816e-06, + "loss": 0.7725, + "step": 5598 + }, + { + "epoch": 1.418366054464851, + "grad_norm": 3.7598750591278076, + "learning_rate": 8.579687094754651e-06, + "loss": 0.8146, + "step": 5599 + }, + { + "epoch": 1.4186193793540216, + "grad_norm": 3.576932668685913, + "learning_rate": 8.579102089214553e-06, + "loss": 0.7237, + "step": 5600 + }, + { + "epoch": 1.418872704243192, + "grad_norm": 3.6534159183502197, + "learning_rate": 8.578516983174943e-06, + "loss": 0.6223, + "step": 5601 + }, + { + "epoch": 1.4191260291323622, + "grad_norm": 3.956620216369629, + "learning_rate": 8.577931776652256e-06, + "loss": 0.8515, + "step": 5602 + }, + { + "epoch": 1.4193793540215327, + "grad_norm": 3.514312505722046, + "learning_rate": 8.57734646966292e-06, + "loss": 0.7917, + "step": 5603 + }, + { + "epoch": 1.419632678910703, + "grad_norm": 3.4261457920074463, + "learning_rate": 8.576761062223371e-06, + "loss": 0.7618, + "step": 5604 + }, + { + "epoch": 1.4198860037998733, + "grad_norm": 3.548288106918335, + "learning_rate": 8.576175554350048e-06, + "loss": 0.7737, + "step": 5605 + }, + { + "epoch": 1.4201393286890438, + "grad_norm": 4.057507038116455, + "learning_rate": 8.57558994605939e-06, + "loss": 0.7519, + "step": 5606 + }, + { + "epoch": 1.420392653578214, + "grad_norm": 3.3332347869873047, + "learning_rate": 8.575004237367845e-06, + "loss": 0.7509, + "step": 5607 + }, + { + "epoch": 1.4206459784673844, + "grad_norm": 3.819678783416748, + "learning_rate": 8.574418428291857e-06, + "loss": 0.7421, + "step": 5608 + }, + { + "epoch": 1.420899303356555, + "grad_norm": 3.7176005840301514, + "learning_rate": 8.573832518847874e-06, + "loss": 0.7297, + "step": 5609 + }, + { + "epoch": 1.4211526282457252, + "grad_norm": 4.009670734405518, + "learning_rate": 8.573246509052346e-06, + "loss": 0.8571, + "step": 5610 + }, + { + "epoch": 1.4214059531348955, + "grad_norm": 3.8512253761291504, + "learning_rate": 8.572660398921734e-06, + "loss": 0.7957, + "step": 5611 + }, + { + "epoch": 1.421659278024066, + "grad_norm": 4.28138542175293, + "learning_rate": 8.572074188472492e-06, + "loss": 0.8452, + "step": 5612 + }, + { + "epoch": 1.4219126029132363, + "grad_norm": 3.7458267211914062, + "learning_rate": 8.571487877721082e-06, + "loss": 0.7774, + "step": 5613 + }, + { + "epoch": 1.4221659278024066, + "grad_norm": 3.1351423263549805, + "learning_rate": 8.570901466683963e-06, + "loss": 0.7704, + "step": 5614 + }, + { + "epoch": 1.4224192526915769, + "grad_norm": 3.2749578952789307, + "learning_rate": 8.570314955377606e-06, + "loss": 0.6954, + "step": 5615 + }, + { + "epoch": 1.4226725775807474, + "grad_norm": 3.894059419631958, + "learning_rate": 8.569728343818478e-06, + "loss": 0.8329, + "step": 5616 + }, + { + "epoch": 1.4229259024699177, + "grad_norm": 3.810101270675659, + "learning_rate": 8.569141632023052e-06, + "loss": 0.8064, + "step": 5617 + }, + { + "epoch": 1.423179227359088, + "grad_norm": 4.048166751861572, + "learning_rate": 8.5685548200078e-06, + "loss": 0.8891, + "step": 5618 + }, + { + "epoch": 1.4234325522482583, + "grad_norm": 3.691485643386841, + "learning_rate": 8.567967907789202e-06, + "loss": 0.7965, + "step": 5619 + }, + { + "epoch": 1.4236858771374288, + "grad_norm": 3.894789457321167, + "learning_rate": 8.567380895383738e-06, + "loss": 0.8681, + "step": 5620 + }, + { + "epoch": 1.423939202026599, + "grad_norm": 3.653616428375244, + "learning_rate": 8.566793782807892e-06, + "loss": 0.7101, + "step": 5621 + }, + { + "epoch": 1.4241925269157694, + "grad_norm": 3.6345016956329346, + "learning_rate": 8.566206570078147e-06, + "loss": 0.8477, + "step": 5622 + }, + { + "epoch": 1.4244458518049399, + "grad_norm": 4.286646366119385, + "learning_rate": 8.56561925721099e-06, + "loss": 0.8333, + "step": 5623 + }, + { + "epoch": 1.4246991766941102, + "grad_norm": 3.6579551696777344, + "learning_rate": 8.56503184422292e-06, + "loss": 0.7263, + "step": 5624 + }, + { + "epoch": 1.4249525015832805, + "grad_norm": 3.4948604106903076, + "learning_rate": 8.564444331130423e-06, + "loss": 0.7977, + "step": 5625 + }, + { + "epoch": 1.425205826472451, + "grad_norm": 3.829361915588379, + "learning_rate": 8.563856717950002e-06, + "loss": 0.8717, + "step": 5626 + }, + { + "epoch": 1.4254591513616213, + "grad_norm": 3.8146557807922363, + "learning_rate": 8.563269004698153e-06, + "loss": 0.8735, + "step": 5627 + }, + { + "epoch": 1.4257124762507916, + "grad_norm": 3.6773152351379395, + "learning_rate": 8.562681191391382e-06, + "loss": 0.7744, + "step": 5628 + }, + { + "epoch": 1.425965801139962, + "grad_norm": 4.1448564529418945, + "learning_rate": 8.562093278046193e-06, + "loss": 0.8276, + "step": 5629 + }, + { + "epoch": 1.4262191260291324, + "grad_norm": 4.072507381439209, + "learning_rate": 8.561505264679093e-06, + "loss": 0.7953, + "step": 5630 + }, + { + "epoch": 1.4264724509183027, + "grad_norm": 3.472202777862549, + "learning_rate": 8.560917151306594e-06, + "loss": 0.8121, + "step": 5631 + }, + { + "epoch": 1.4267257758074732, + "grad_norm": 3.6753032207489014, + "learning_rate": 8.560328937945212e-06, + "loss": 0.929, + "step": 5632 + }, + { + "epoch": 1.4269791006966435, + "grad_norm": 3.6754982471466064, + "learning_rate": 8.559740624611462e-06, + "loss": 0.8513, + "step": 5633 + }, + { + "epoch": 1.4272324255858138, + "grad_norm": 3.8619563579559326, + "learning_rate": 8.559152211321862e-06, + "loss": 0.7544, + "step": 5634 + }, + { + "epoch": 1.4274857504749843, + "grad_norm": 3.8075153827667236, + "learning_rate": 8.558563698092937e-06, + "loss": 0.7834, + "step": 5635 + }, + { + "epoch": 1.4277390753641546, + "grad_norm": 3.598496437072754, + "learning_rate": 8.557975084941211e-06, + "loss": 0.7952, + "step": 5636 + }, + { + "epoch": 1.4279924002533249, + "grad_norm": 3.260812759399414, + "learning_rate": 8.557386371883212e-06, + "loss": 0.8042, + "step": 5637 + }, + { + "epoch": 1.4282457251424954, + "grad_norm": 3.1854093074798584, + "learning_rate": 8.55679755893547e-06, + "loss": 0.7317, + "step": 5638 + }, + { + "epoch": 1.4284990500316657, + "grad_norm": 3.2150797843933105, + "learning_rate": 8.556208646114521e-06, + "loss": 0.6752, + "step": 5639 + }, + { + "epoch": 1.428752374920836, + "grad_norm": 3.4865453243255615, + "learning_rate": 8.5556196334369e-06, + "loss": 0.6955, + "step": 5640 + }, + { + "epoch": 1.4290056998100065, + "grad_norm": 3.760668992996216, + "learning_rate": 8.555030520919146e-06, + "loss": 0.9463, + "step": 5641 + }, + { + "epoch": 1.4292590246991768, + "grad_norm": 3.4559175968170166, + "learning_rate": 8.554441308577799e-06, + "loss": 0.7351, + "step": 5642 + }, + { + "epoch": 1.429512349588347, + "grad_norm": 4.000383377075195, + "learning_rate": 8.553851996429407e-06, + "loss": 0.8208, + "step": 5643 + }, + { + "epoch": 1.4297656744775173, + "grad_norm": 3.9102861881256104, + "learning_rate": 8.553262584490517e-06, + "loss": 0.7239, + "step": 5644 + }, + { + "epoch": 1.4300189993666879, + "grad_norm": 3.3569469451904297, + "learning_rate": 8.55267307277768e-06, + "loss": 0.7529, + "step": 5645 + }, + { + "epoch": 1.4302723242558582, + "grad_norm": 3.330298662185669, + "learning_rate": 8.552083461307447e-06, + "loss": 0.7734, + "step": 5646 + }, + { + "epoch": 1.4305256491450284, + "grad_norm": 4.039393901824951, + "learning_rate": 8.551493750096376e-06, + "loss": 0.7744, + "step": 5647 + }, + { + "epoch": 1.4307789740341987, + "grad_norm": 3.752800941467285, + "learning_rate": 8.550903939161024e-06, + "loss": 0.8754, + "step": 5648 + }, + { + "epoch": 1.4310322989233693, + "grad_norm": 3.302180051803589, + "learning_rate": 8.550314028517956e-06, + "loss": 0.8537, + "step": 5649 + }, + { + "epoch": 1.4312856238125395, + "grad_norm": 3.8774683475494385, + "learning_rate": 8.549724018183732e-06, + "loss": 0.7596, + "step": 5650 + }, + { + "epoch": 1.4315389487017098, + "grad_norm": 3.963120698928833, + "learning_rate": 8.54913390817492e-06, + "loss": 0.8536, + "step": 5651 + }, + { + "epoch": 1.4317922735908803, + "grad_norm": 3.766277551651001, + "learning_rate": 8.548543698508094e-06, + "loss": 0.7581, + "step": 5652 + }, + { + "epoch": 1.4320455984800506, + "grad_norm": 3.777329683303833, + "learning_rate": 8.547953389199823e-06, + "loss": 0.769, + "step": 5653 + }, + { + "epoch": 1.432298923369221, + "grad_norm": 3.928480625152588, + "learning_rate": 8.547362980266683e-06, + "loss": 0.8342, + "step": 5654 + }, + { + "epoch": 1.4325522482583914, + "grad_norm": 3.643662452697754, + "learning_rate": 8.546772471725253e-06, + "loss": 0.7035, + "step": 5655 + }, + { + "epoch": 1.4328055731475617, + "grad_norm": 3.8660430908203125, + "learning_rate": 8.546181863592115e-06, + "loss": 0.823, + "step": 5656 + }, + { + "epoch": 1.433058898036732, + "grad_norm": 3.334833860397339, + "learning_rate": 8.545591155883853e-06, + "loss": 0.7723, + "step": 5657 + }, + { + "epoch": 1.4333122229259025, + "grad_norm": 3.6953554153442383, + "learning_rate": 8.545000348617052e-06, + "loss": 0.808, + "step": 5658 + }, + { + "epoch": 1.4335655478150728, + "grad_norm": 3.6601805686950684, + "learning_rate": 8.544409441808303e-06, + "loss": 0.8103, + "step": 5659 + }, + { + "epoch": 1.4338188727042431, + "grad_norm": 3.3238003253936768, + "learning_rate": 8.543818435474199e-06, + "loss": 0.723, + "step": 5660 + }, + { + "epoch": 1.4340721975934136, + "grad_norm": 3.308959722518921, + "learning_rate": 8.543227329631332e-06, + "loss": 0.7424, + "step": 5661 + }, + { + "epoch": 1.434325522482584, + "grad_norm": 3.794067144393921, + "learning_rate": 8.542636124296303e-06, + "loss": 0.8461, + "step": 5662 + }, + { + "epoch": 1.4345788473717542, + "grad_norm": 3.4544320106506348, + "learning_rate": 8.542044819485714e-06, + "loss": 0.7192, + "step": 5663 + }, + { + "epoch": 1.4348321722609247, + "grad_norm": 4.148547649383545, + "learning_rate": 8.541453415216165e-06, + "loss": 0.9252, + "step": 5664 + }, + { + "epoch": 1.435085497150095, + "grad_norm": 3.8484859466552734, + "learning_rate": 8.540861911504264e-06, + "loss": 0.7667, + "step": 5665 + }, + { + "epoch": 1.4353388220392653, + "grad_norm": 3.4362473487854004, + "learning_rate": 8.540270308366618e-06, + "loss": 0.6631, + "step": 5666 + }, + { + "epoch": 1.4355921469284358, + "grad_norm": 3.6026856899261475, + "learning_rate": 8.539678605819843e-06, + "loss": 0.6789, + "step": 5667 + }, + { + "epoch": 1.4358454718176061, + "grad_norm": 3.5054380893707275, + "learning_rate": 8.539086803880552e-06, + "loss": 0.7149, + "step": 5668 + }, + { + "epoch": 1.4360987967067764, + "grad_norm": 3.5280587673187256, + "learning_rate": 8.538494902565362e-06, + "loss": 0.8204, + "step": 5669 + }, + { + "epoch": 1.436352121595947, + "grad_norm": 3.6347293853759766, + "learning_rate": 8.537902901890893e-06, + "loss": 0.7337, + "step": 5670 + }, + { + "epoch": 1.4366054464851172, + "grad_norm": 3.789095401763916, + "learning_rate": 8.53731080187377e-06, + "loss": 0.8534, + "step": 5671 + }, + { + "epoch": 1.4368587713742875, + "grad_norm": 3.7971978187561035, + "learning_rate": 8.536718602530616e-06, + "loss": 0.7479, + "step": 5672 + }, + { + "epoch": 1.4371120962634578, + "grad_norm": 3.743420124053955, + "learning_rate": 8.536126303878063e-06, + "loss": 0.8196, + "step": 5673 + }, + { + "epoch": 1.437365421152628, + "grad_norm": 3.237114191055298, + "learning_rate": 8.535533905932739e-06, + "loss": 0.718, + "step": 5674 + }, + { + "epoch": 1.4376187460417986, + "grad_norm": 3.5328898429870605, + "learning_rate": 8.534941408711279e-06, + "loss": 0.6758, + "step": 5675 + }, + { + "epoch": 1.437872070930969, + "grad_norm": 3.87256121635437, + "learning_rate": 8.534348812230324e-06, + "loss": 0.7721, + "step": 5676 + }, + { + "epoch": 1.4381253958201392, + "grad_norm": 3.472590923309326, + "learning_rate": 8.533756116506508e-06, + "loss": 0.7263, + "step": 5677 + }, + { + "epoch": 1.4383787207093097, + "grad_norm": 3.7455976009368896, + "learning_rate": 8.533163321556479e-06, + "loss": 0.7487, + "step": 5678 + }, + { + "epoch": 1.43863204559848, + "grad_norm": 4.153329849243164, + "learning_rate": 8.532570427396877e-06, + "loss": 0.8924, + "step": 5679 + }, + { + "epoch": 1.4388853704876503, + "grad_norm": 4.1146240234375, + "learning_rate": 8.531977434044356e-06, + "loss": 0.7607, + "step": 5680 + }, + { + "epoch": 1.4391386953768208, + "grad_norm": 3.8720202445983887, + "learning_rate": 8.531384341515562e-06, + "loss": 0.7506, + "step": 5681 + }, + { + "epoch": 1.4393920202659911, + "grad_norm": 3.8613243103027344, + "learning_rate": 8.530791149827152e-06, + "loss": 0.8279, + "step": 5682 + }, + { + "epoch": 1.4396453451551614, + "grad_norm": 3.2245066165924072, + "learning_rate": 8.53019785899578e-06, + "loss": 0.7867, + "step": 5683 + }, + { + "epoch": 1.439898670044332, + "grad_norm": 3.619999647140503, + "learning_rate": 8.529604469038106e-06, + "loss": 0.6425, + "step": 5684 + }, + { + "epoch": 1.4401519949335022, + "grad_norm": 3.314887762069702, + "learning_rate": 8.529010979970796e-06, + "loss": 0.8123, + "step": 5685 + }, + { + "epoch": 1.4404053198226725, + "grad_norm": 3.6747682094573975, + "learning_rate": 8.528417391810508e-06, + "loss": 0.6957, + "step": 5686 + }, + { + "epoch": 1.440658644711843, + "grad_norm": 4.630380153656006, + "learning_rate": 8.527823704573916e-06, + "loss": 0.8388, + "step": 5687 + }, + { + "epoch": 1.4409119696010133, + "grad_norm": 3.862455368041992, + "learning_rate": 8.527229918277688e-06, + "loss": 0.761, + "step": 5688 + }, + { + "epoch": 1.4411652944901836, + "grad_norm": 3.640131950378418, + "learning_rate": 8.526636032938495e-06, + "loss": 0.7136, + "step": 5689 + }, + { + "epoch": 1.4414186193793541, + "grad_norm": 3.6640071868896484, + "learning_rate": 8.526042048573016e-06, + "loss": 0.8372, + "step": 5690 + }, + { + "epoch": 1.4416719442685244, + "grad_norm": 4.051027774810791, + "learning_rate": 8.525447965197928e-06, + "loss": 0.7735, + "step": 5691 + }, + { + "epoch": 1.4419252691576947, + "grad_norm": 3.6243577003479004, + "learning_rate": 8.524853782829915e-06, + "loss": 0.8937, + "step": 5692 + }, + { + "epoch": 1.4421785940468652, + "grad_norm": 3.4633452892303467, + "learning_rate": 8.524259501485658e-06, + "loss": 0.7356, + "step": 5693 + }, + { + "epoch": 1.4424319189360355, + "grad_norm": 4.155950546264648, + "learning_rate": 8.523665121181848e-06, + "loss": 0.8695, + "step": 5694 + }, + { + "epoch": 1.4426852438252058, + "grad_norm": 3.9396297931671143, + "learning_rate": 8.523070641935172e-06, + "loss": 0.7235, + "step": 5695 + }, + { + "epoch": 1.4429385687143763, + "grad_norm": 3.9359302520751953, + "learning_rate": 8.522476063762322e-06, + "loss": 0.7997, + "step": 5696 + }, + { + "epoch": 1.4431918936035466, + "grad_norm": 3.7869248390197754, + "learning_rate": 8.521881386679997e-06, + "loss": 0.8, + "step": 5697 + }, + { + "epoch": 1.443445218492717, + "grad_norm": 3.976806640625, + "learning_rate": 8.521286610704893e-06, + "loss": 0.78, + "step": 5698 + }, + { + "epoch": 1.4436985433818872, + "grad_norm": 3.9138104915618896, + "learning_rate": 8.520691735853712e-06, + "loss": 0.8162, + "step": 5699 + }, + { + "epoch": 1.4439518682710577, + "grad_norm": 3.900667667388916, + "learning_rate": 8.520096762143155e-06, + "loss": 0.7667, + "step": 5700 + }, + { + "epoch": 1.444205193160228, + "grad_norm": 3.848956823348999, + "learning_rate": 8.519501689589932e-06, + "loss": 0.9847, + "step": 5701 + }, + { + "epoch": 1.4444585180493983, + "grad_norm": 4.102713108062744, + "learning_rate": 8.518906518210751e-06, + "loss": 0.9319, + "step": 5702 + }, + { + "epoch": 1.4447118429385686, + "grad_norm": 4.555942058563232, + "learning_rate": 8.518311248022324e-06, + "loss": 0.8847, + "step": 5703 + }, + { + "epoch": 1.444965167827739, + "grad_norm": 3.8152759075164795, + "learning_rate": 8.517715879041366e-06, + "loss": 0.8646, + "step": 5704 + }, + { + "epoch": 1.4452184927169094, + "grad_norm": 3.4794349670410156, + "learning_rate": 8.517120411284594e-06, + "loss": 0.8137, + "step": 5705 + }, + { + "epoch": 1.4454718176060797, + "grad_norm": 3.4123058319091797, + "learning_rate": 8.516524844768733e-06, + "loss": 0.7919, + "step": 5706 + }, + { + "epoch": 1.4457251424952502, + "grad_norm": 3.657371997833252, + "learning_rate": 8.515929179510498e-06, + "loss": 0.8292, + "step": 5707 + }, + { + "epoch": 1.4459784673844205, + "grad_norm": 3.533569574356079, + "learning_rate": 8.515333415526622e-06, + "loss": 0.7663, + "step": 5708 + }, + { + "epoch": 1.4462317922735908, + "grad_norm": 3.5966460704803467, + "learning_rate": 8.51473755283383e-06, + "loss": 0.7478, + "step": 5709 + }, + { + "epoch": 1.4464851171627613, + "grad_norm": 3.9291951656341553, + "learning_rate": 8.514141591448854e-06, + "loss": 0.7542, + "step": 5710 + }, + { + "epoch": 1.4467384420519316, + "grad_norm": 3.6165096759796143, + "learning_rate": 8.513545531388432e-06, + "loss": 0.7914, + "step": 5711 + }, + { + "epoch": 1.4469917669411019, + "grad_norm": 3.74674391746521, + "learning_rate": 8.512949372669298e-06, + "loss": 0.7878, + "step": 5712 + }, + { + "epoch": 1.4472450918302724, + "grad_norm": 3.324605703353882, + "learning_rate": 8.512353115308189e-06, + "loss": 0.7061, + "step": 5713 + }, + { + "epoch": 1.4474984167194427, + "grad_norm": 3.780344247817993, + "learning_rate": 8.511756759321852e-06, + "loss": 0.8317, + "step": 5714 + }, + { + "epoch": 1.447751741608613, + "grad_norm": 3.839493751525879, + "learning_rate": 8.511160304727031e-06, + "loss": 0.8069, + "step": 5715 + }, + { + "epoch": 1.4480050664977835, + "grad_norm": 3.744105339050293, + "learning_rate": 8.510563751540475e-06, + "loss": 0.7982, + "step": 5716 + }, + { + "epoch": 1.4482583913869538, + "grad_norm": 3.512763261795044, + "learning_rate": 8.509967099778934e-06, + "loss": 0.8294, + "step": 5717 + }, + { + "epoch": 1.448511716276124, + "grad_norm": 3.56489634513855, + "learning_rate": 8.509370349459161e-06, + "loss": 0.8668, + "step": 5718 + }, + { + "epoch": 1.4487650411652946, + "grad_norm": 3.4126763343811035, + "learning_rate": 8.508773500597915e-06, + "loss": 0.6083, + "step": 5719 + }, + { + "epoch": 1.4490183660544649, + "grad_norm": 3.7884819507598877, + "learning_rate": 8.508176553211951e-06, + "loss": 0.7594, + "step": 5720 + }, + { + "epoch": 1.4492716909436352, + "grad_norm": 3.8706231117248535, + "learning_rate": 8.507579507318036e-06, + "loss": 0.7924, + "step": 5721 + }, + { + "epoch": 1.4495250158328057, + "grad_norm": 4.001307010650635, + "learning_rate": 8.506982362932932e-06, + "loss": 0.8956, + "step": 5722 + }, + { + "epoch": 1.449778340721976, + "grad_norm": 3.834406852722168, + "learning_rate": 8.506385120073406e-06, + "loss": 0.7862, + "step": 5723 + }, + { + "epoch": 1.4500316656111463, + "grad_norm": 3.5482423305511475, + "learning_rate": 8.50578777875623e-06, + "loss": 0.7288, + "step": 5724 + }, + { + "epoch": 1.4502849905003168, + "grad_norm": 3.502732515335083, + "learning_rate": 8.505190338998176e-06, + "loss": 0.7459, + "step": 5725 + }, + { + "epoch": 1.450538315389487, + "grad_norm": 3.8053665161132812, + "learning_rate": 8.50459280081602e-06, + "loss": 0.7643, + "step": 5726 + }, + { + "epoch": 1.4507916402786574, + "grad_norm": 4.070937633514404, + "learning_rate": 8.503995164226542e-06, + "loss": 0.7852, + "step": 5727 + }, + { + "epoch": 1.4510449651678277, + "grad_norm": 3.704869270324707, + "learning_rate": 8.50339742924652e-06, + "loss": 0.6839, + "step": 5728 + }, + { + "epoch": 1.4512982900569982, + "grad_norm": 3.3609752655029297, + "learning_rate": 8.50279959589274e-06, + "loss": 0.7276, + "step": 5729 + }, + { + "epoch": 1.4515516149461685, + "grad_norm": 3.966762065887451, + "learning_rate": 8.50220166418199e-06, + "loss": 0.9289, + "step": 5730 + }, + { + "epoch": 1.4518049398353388, + "grad_norm": 3.9899070262908936, + "learning_rate": 8.50160363413106e-06, + "loss": 0.7779, + "step": 5731 + }, + { + "epoch": 1.452058264724509, + "grad_norm": 3.5301764011383057, + "learning_rate": 8.501005505756738e-06, + "loss": 0.7488, + "step": 5732 + }, + { + "epoch": 1.4523115896136796, + "grad_norm": 3.3429617881774902, + "learning_rate": 8.500407279075824e-06, + "loss": 0.7595, + "step": 5733 + }, + { + "epoch": 1.4525649145028499, + "grad_norm": 3.5351319313049316, + "learning_rate": 8.499808954105115e-06, + "loss": 0.8505, + "step": 5734 + }, + { + "epoch": 1.4528182393920201, + "grad_norm": 3.7764217853546143, + "learning_rate": 8.499210530861409e-06, + "loss": 0.9221, + "step": 5735 + }, + { + "epoch": 1.4530715642811907, + "grad_norm": 4.354663848876953, + "learning_rate": 8.498612009361513e-06, + "loss": 0.9962, + "step": 5736 + }, + { + "epoch": 1.453324889170361, + "grad_norm": 3.265813112258911, + "learning_rate": 8.498013389622231e-06, + "loss": 0.7001, + "step": 5737 + }, + { + "epoch": 1.4535782140595312, + "grad_norm": 4.211230278015137, + "learning_rate": 8.497414671660372e-06, + "loss": 0.8179, + "step": 5738 + }, + { + "epoch": 1.4538315389487018, + "grad_norm": 3.9012246131896973, + "learning_rate": 8.496815855492749e-06, + "loss": 0.8726, + "step": 5739 + }, + { + "epoch": 1.454084863837872, + "grad_norm": 3.661397695541382, + "learning_rate": 8.496216941136174e-06, + "loss": 0.774, + "step": 5740 + }, + { + "epoch": 1.4543381887270423, + "grad_norm": 3.973367929458618, + "learning_rate": 8.495617928607467e-06, + "loss": 0.7984, + "step": 5741 + }, + { + "epoch": 1.4545915136162129, + "grad_norm": 3.723512887954712, + "learning_rate": 8.495018817923445e-06, + "loss": 0.9125, + "step": 5742 + }, + { + "epoch": 1.4548448385053832, + "grad_norm": 3.5836737155914307, + "learning_rate": 8.494419609100936e-06, + "loss": 0.7135, + "step": 5743 + }, + { + "epoch": 1.4550981633945534, + "grad_norm": 3.5797481536865234, + "learning_rate": 8.49382030215676e-06, + "loss": 0.6496, + "step": 5744 + }, + { + "epoch": 1.455351488283724, + "grad_norm": 3.581573009490967, + "learning_rate": 8.493220897107748e-06, + "loss": 0.7338, + "step": 5745 + }, + { + "epoch": 1.4556048131728943, + "grad_norm": 3.822969675064087, + "learning_rate": 8.492621393970731e-06, + "loss": 0.8432, + "step": 5746 + }, + { + "epoch": 1.4558581380620645, + "grad_norm": 3.8776535987854004, + "learning_rate": 8.492021792762542e-06, + "loss": 0.7807, + "step": 5747 + }, + { + "epoch": 1.456111462951235, + "grad_norm": 3.600105047225952, + "learning_rate": 8.491422093500016e-06, + "loss": 0.7384, + "step": 5748 + }, + { + "epoch": 1.4563647878404054, + "grad_norm": 3.469967842102051, + "learning_rate": 8.490822296199995e-06, + "loss": 0.7266, + "step": 5749 + }, + { + "epoch": 1.4566181127295756, + "grad_norm": 4.049229621887207, + "learning_rate": 8.49022240087932e-06, + "loss": 0.7418, + "step": 5750 + }, + { + "epoch": 1.4568714376187462, + "grad_norm": 3.541820764541626, + "learning_rate": 8.489622407554837e-06, + "loss": 0.8721, + "step": 5751 + }, + { + "epoch": 1.4571247625079164, + "grad_norm": 3.556708812713623, + "learning_rate": 8.489022316243391e-06, + "loss": 0.72, + "step": 5752 + }, + { + "epoch": 1.4573780873970867, + "grad_norm": 3.6263678073883057, + "learning_rate": 8.488422126961833e-06, + "loss": 0.6651, + "step": 5753 + }, + { + "epoch": 1.4576314122862573, + "grad_norm": 3.4090118408203125, + "learning_rate": 8.487821839727018e-06, + "loss": 0.7079, + "step": 5754 + }, + { + "epoch": 1.4578847371754275, + "grad_norm": 3.7343039512634277, + "learning_rate": 8.4872214545558e-06, + "loss": 0.7699, + "step": 5755 + }, + { + "epoch": 1.4581380620645978, + "grad_norm": 3.486272096633911, + "learning_rate": 8.486620971465039e-06, + "loss": 0.7504, + "step": 5756 + }, + { + "epoch": 1.4583913869537681, + "grad_norm": 3.899477005004883, + "learning_rate": 8.486020390471593e-06, + "loss": 0.7199, + "step": 5757 + }, + { + "epoch": 1.4586447118429386, + "grad_norm": 3.771094799041748, + "learning_rate": 8.485419711592329e-06, + "loss": 0.7408, + "step": 5758 + }, + { + "epoch": 1.458898036732109, + "grad_norm": 3.6506221294403076, + "learning_rate": 8.484818934844112e-06, + "loss": 0.7438, + "step": 5759 + }, + { + "epoch": 1.4591513616212792, + "grad_norm": 3.7030067443847656, + "learning_rate": 8.484218060243816e-06, + "loss": 0.7507, + "step": 5760 + }, + { + "epoch": 1.4594046865104495, + "grad_norm": 3.648280143737793, + "learning_rate": 8.483617087808307e-06, + "loss": 0.6922, + "step": 5761 + }, + { + "epoch": 1.45965801139962, + "grad_norm": 3.9507155418395996, + "learning_rate": 8.483016017554463e-06, + "loss": 0.8309, + "step": 5762 + }, + { + "epoch": 1.4599113362887903, + "grad_norm": 3.9942426681518555, + "learning_rate": 8.482414849499164e-06, + "loss": 0.8634, + "step": 5763 + }, + { + "epoch": 1.4601646611779606, + "grad_norm": 4.629600524902344, + "learning_rate": 8.481813583659285e-06, + "loss": 0.8879, + "step": 5764 + }, + { + "epoch": 1.4604179860671311, + "grad_norm": 3.8053503036499023, + "learning_rate": 8.481212220051713e-06, + "loss": 0.7588, + "step": 5765 + }, + { + "epoch": 1.4606713109563014, + "grad_norm": 3.998711347579956, + "learning_rate": 8.480610758693335e-06, + "loss": 0.743, + "step": 5766 + }, + { + "epoch": 1.4609246358454717, + "grad_norm": 4.426061153411865, + "learning_rate": 8.480009199601039e-06, + "loss": 0.8845, + "step": 5767 + }, + { + "epoch": 1.4611779607346422, + "grad_norm": 3.610963821411133, + "learning_rate": 8.479407542791712e-06, + "loss": 0.6508, + "step": 5768 + }, + { + "epoch": 1.4614312856238125, + "grad_norm": 4.172120571136475, + "learning_rate": 8.478805788282254e-06, + "loss": 0.8132, + "step": 5769 + }, + { + "epoch": 1.4616846105129828, + "grad_norm": 3.924996852874756, + "learning_rate": 8.47820393608956e-06, + "loss": 0.8374, + "step": 5770 + }, + { + "epoch": 1.4619379354021533, + "grad_norm": 3.486543655395508, + "learning_rate": 8.47760198623053e-06, + "loss": 0.7782, + "step": 5771 + }, + { + "epoch": 1.4621912602913236, + "grad_norm": 3.5728392601013184, + "learning_rate": 8.476999938722066e-06, + "loss": 0.8747, + "step": 5772 + }, + { + "epoch": 1.462444585180494, + "grad_norm": 3.4813034534454346, + "learning_rate": 8.476397793581073e-06, + "loss": 0.7351, + "step": 5773 + }, + { + "epoch": 1.4626979100696644, + "grad_norm": 3.6758840084075928, + "learning_rate": 8.475795550824459e-06, + "loss": 0.8178, + "step": 5774 + }, + { + "epoch": 1.4629512349588347, + "grad_norm": 3.9572718143463135, + "learning_rate": 8.475193210469135e-06, + "loss": 0.8281, + "step": 5775 + }, + { + "epoch": 1.463204559848005, + "grad_norm": 3.5994718074798584, + "learning_rate": 8.474590772532015e-06, + "loss": 0.7396, + "step": 5776 + }, + { + "epoch": 1.4634578847371755, + "grad_norm": 3.2121098041534424, + "learning_rate": 8.473988237030013e-06, + "loss": 0.8072, + "step": 5777 + }, + { + "epoch": 1.4637112096263458, + "grad_norm": 3.8271384239196777, + "learning_rate": 8.473385603980051e-06, + "loss": 1.0376, + "step": 5778 + }, + { + "epoch": 1.4639645345155161, + "grad_norm": 3.357984781265259, + "learning_rate": 8.472782873399049e-06, + "loss": 0.6981, + "step": 5779 + }, + { + "epoch": 1.4642178594046866, + "grad_norm": 3.8599822521209717, + "learning_rate": 8.472180045303932e-06, + "loss": 0.8763, + "step": 5780 + }, + { + "epoch": 1.464471184293857, + "grad_norm": 3.2101244926452637, + "learning_rate": 8.471577119711625e-06, + "loss": 0.6959, + "step": 5781 + }, + { + "epoch": 1.4647245091830272, + "grad_norm": 3.952605724334717, + "learning_rate": 8.470974096639061e-06, + "loss": 0.786, + "step": 5782 + }, + { + "epoch": 1.4649778340721977, + "grad_norm": 3.574605703353882, + "learning_rate": 8.470370976103171e-06, + "loss": 0.7881, + "step": 5783 + }, + { + "epoch": 1.465231158961368, + "grad_norm": 4.299081802368164, + "learning_rate": 8.469767758120888e-06, + "loss": 0.8145, + "step": 5784 + }, + { + "epoch": 1.4654844838505383, + "grad_norm": 3.7053792476654053, + "learning_rate": 8.469164442709156e-06, + "loss": 0.8087, + "step": 5785 + }, + { + "epoch": 1.4657378087397086, + "grad_norm": 3.570216417312622, + "learning_rate": 8.468561029884912e-06, + "loss": 0.7369, + "step": 5786 + }, + { + "epoch": 1.4659911336288791, + "grad_norm": 3.463874340057373, + "learning_rate": 8.467957519665098e-06, + "loss": 0.8314, + "step": 5787 + }, + { + "epoch": 1.4662444585180494, + "grad_norm": 3.9500622749328613, + "learning_rate": 8.467353912066662e-06, + "loss": 0.914, + "step": 5788 + }, + { + "epoch": 1.4664977834072197, + "grad_norm": 3.8794491291046143, + "learning_rate": 8.466750207106555e-06, + "loss": 0.7971, + "step": 5789 + }, + { + "epoch": 1.46675110829639, + "grad_norm": 3.631758451461792, + "learning_rate": 8.466146404801727e-06, + "loss": 0.7149, + "step": 5790 + }, + { + "epoch": 1.4670044331855605, + "grad_norm": 3.9623847007751465, + "learning_rate": 8.46554250516913e-06, + "loss": 0.7581, + "step": 5791 + }, + { + "epoch": 1.4672577580747308, + "grad_norm": 3.570362091064453, + "learning_rate": 8.464938508225726e-06, + "loss": 0.6783, + "step": 5792 + }, + { + "epoch": 1.467511082963901, + "grad_norm": 4.077816009521484, + "learning_rate": 8.464334413988474e-06, + "loss": 0.7831, + "step": 5793 + }, + { + "epoch": 1.4677644078530716, + "grad_norm": 4.263125419616699, + "learning_rate": 8.463730222474332e-06, + "loss": 0.9054, + "step": 5794 + }, + { + "epoch": 1.468017732742242, + "grad_norm": 3.4157943725585938, + "learning_rate": 8.463125933700271e-06, + "loss": 0.674, + "step": 5795 + }, + { + "epoch": 1.4682710576314122, + "grad_norm": 3.7338168621063232, + "learning_rate": 8.462521547683255e-06, + "loss": 0.8226, + "step": 5796 + }, + { + "epoch": 1.4685243825205827, + "grad_norm": 3.9859232902526855, + "learning_rate": 8.461917064440258e-06, + "loss": 0.8197, + "step": 5797 + }, + { + "epoch": 1.468777707409753, + "grad_norm": 3.4105770587921143, + "learning_rate": 8.461312483988252e-06, + "loss": 0.7145, + "step": 5798 + }, + { + "epoch": 1.4690310322989233, + "grad_norm": 4.011654376983643, + "learning_rate": 8.460707806344215e-06, + "loss": 0.909, + "step": 5799 + }, + { + "epoch": 1.4692843571880938, + "grad_norm": 3.96620774269104, + "learning_rate": 8.460103031525123e-06, + "loss": 0.8062, + "step": 5800 + }, + { + "epoch": 1.469537682077264, + "grad_norm": 4.274330139160156, + "learning_rate": 8.45949815954796e-06, + "loss": 0.901, + "step": 5801 + }, + { + "epoch": 1.4697910069664344, + "grad_norm": 4.059905052185059, + "learning_rate": 8.458893190429709e-06, + "loss": 0.7766, + "step": 5802 + }, + { + "epoch": 1.470044331855605, + "grad_norm": 3.70166277885437, + "learning_rate": 8.45828812418736e-06, + "loss": 0.7859, + "step": 5803 + }, + { + "epoch": 1.4702976567447752, + "grad_norm": 3.623785972595215, + "learning_rate": 8.457682960837901e-06, + "loss": 0.866, + "step": 5804 + }, + { + "epoch": 1.4705509816339455, + "grad_norm": 3.6237847805023193, + "learning_rate": 8.457077700398325e-06, + "loss": 0.763, + "step": 5805 + }, + { + "epoch": 1.470804306523116, + "grad_norm": 3.97985577583313, + "learning_rate": 8.456472342885626e-06, + "loss": 0.9793, + "step": 5806 + }, + { + "epoch": 1.4710576314122863, + "grad_norm": 3.4481773376464844, + "learning_rate": 8.455866888316806e-06, + "loss": 0.7253, + "step": 5807 + }, + { + "epoch": 1.4713109563014566, + "grad_norm": 3.6168854236602783, + "learning_rate": 8.455261336708861e-06, + "loss": 0.8586, + "step": 5808 + }, + { + "epoch": 1.471564281190627, + "grad_norm": 3.81908917427063, + "learning_rate": 8.4546556880788e-06, + "loss": 0.7193, + "step": 5809 + }, + { + "epoch": 1.4718176060797974, + "grad_norm": 3.9625723361968994, + "learning_rate": 8.454049942443624e-06, + "loss": 0.7025, + "step": 5810 + }, + { + "epoch": 1.4720709309689677, + "grad_norm": 3.6989450454711914, + "learning_rate": 8.453444099820346e-06, + "loss": 0.805, + "step": 5811 + }, + { + "epoch": 1.4723242558581382, + "grad_norm": 4.027205944061279, + "learning_rate": 8.452838160225974e-06, + "loss": 0.7952, + "step": 5812 + }, + { + "epoch": 1.4725775807473085, + "grad_norm": 3.91361665725708, + "learning_rate": 8.452232123677526e-06, + "loss": 0.8229, + "step": 5813 + }, + { + "epoch": 1.4728309056364788, + "grad_norm": 3.8778324127197266, + "learning_rate": 8.451625990192019e-06, + "loss": 0.8737, + "step": 5814 + }, + { + "epoch": 1.473084230525649, + "grad_norm": 4.140817642211914, + "learning_rate": 8.451019759786471e-06, + "loss": 0.8426, + "step": 5815 + }, + { + "epoch": 1.4733375554148196, + "grad_norm": 3.918105125427246, + "learning_rate": 8.450413432477904e-06, + "loss": 0.7913, + "step": 5816 + }, + { + "epoch": 1.4735908803039899, + "grad_norm": 3.3620104789733887, + "learning_rate": 8.449807008283348e-06, + "loss": 0.6683, + "step": 5817 + }, + { + "epoch": 1.4738442051931602, + "grad_norm": 3.3116416931152344, + "learning_rate": 8.449200487219826e-06, + "loss": 0.7992, + "step": 5818 + }, + { + "epoch": 1.4740975300823305, + "grad_norm": 3.637850046157837, + "learning_rate": 8.44859386930437e-06, + "loss": 0.8076, + "step": 5819 + }, + { + "epoch": 1.474350854971501, + "grad_norm": 3.6628856658935547, + "learning_rate": 8.447987154554018e-06, + "loss": 0.6968, + "step": 5820 + }, + { + "epoch": 1.4746041798606713, + "grad_norm": 3.631540060043335, + "learning_rate": 8.4473803429858e-06, + "loss": 0.7686, + "step": 5821 + }, + { + "epoch": 1.4748575047498416, + "grad_norm": 3.455829620361328, + "learning_rate": 8.446773434616757e-06, + "loss": 0.7153, + "step": 5822 + }, + { + "epoch": 1.475110829639012, + "grad_norm": 3.7421679496765137, + "learning_rate": 8.446166429463933e-06, + "loss": 0.7868, + "step": 5823 + }, + { + "epoch": 1.4753641545281824, + "grad_norm": 4.175968170166016, + "learning_rate": 8.445559327544372e-06, + "loss": 0.8762, + "step": 5824 + }, + { + "epoch": 1.4756174794173527, + "grad_norm": 4.218408584594727, + "learning_rate": 8.444952128875118e-06, + "loss": 1.0124, + "step": 5825 + }, + { + "epoch": 1.4758708043065232, + "grad_norm": 3.955867290496826, + "learning_rate": 8.444344833473222e-06, + "loss": 0.7887, + "step": 5826 + }, + { + "epoch": 1.4761241291956935, + "grad_norm": 4.2899651527404785, + "learning_rate": 8.44373744135574e-06, + "loss": 0.8016, + "step": 5827 + }, + { + "epoch": 1.4763774540848638, + "grad_norm": 3.8998396396636963, + "learning_rate": 8.443129952539722e-06, + "loss": 0.7813, + "step": 5828 + }, + { + "epoch": 1.4766307789740343, + "grad_norm": 3.8248023986816406, + "learning_rate": 8.44252236704223e-06, + "loss": 0.7968, + "step": 5829 + }, + { + "epoch": 1.4768841038632046, + "grad_norm": 3.8769917488098145, + "learning_rate": 8.441914684880324e-06, + "loss": 0.7902, + "step": 5830 + }, + { + "epoch": 1.4771374287523749, + "grad_norm": 4.380746841430664, + "learning_rate": 8.441306906071065e-06, + "loss": 0.8351, + "step": 5831 + }, + { + "epoch": 1.4773907536415454, + "grad_norm": 3.37375545501709, + "learning_rate": 8.440699030631523e-06, + "loss": 0.7702, + "step": 5832 + }, + { + "epoch": 1.4776440785307157, + "grad_norm": 3.931594133377075, + "learning_rate": 8.440091058578763e-06, + "loss": 0.8683, + "step": 5833 + }, + { + "epoch": 1.477897403419886, + "grad_norm": 3.6735098361968994, + "learning_rate": 8.439482989929859e-06, + "loss": 0.8508, + "step": 5834 + }, + { + "epoch": 1.4781507283090565, + "grad_norm": 3.7484383583068848, + "learning_rate": 8.438874824701884e-06, + "loss": 0.7568, + "step": 5835 + }, + { + "epoch": 1.4784040531982268, + "grad_norm": 4.0925211906433105, + "learning_rate": 8.438266562911917e-06, + "loss": 0.8033, + "step": 5836 + }, + { + "epoch": 1.478657378087397, + "grad_norm": 3.8534984588623047, + "learning_rate": 8.437658204577035e-06, + "loss": 0.7125, + "step": 5837 + }, + { + "epoch": 1.4789107029765676, + "grad_norm": 3.55655837059021, + "learning_rate": 8.437049749714323e-06, + "loss": 0.7976, + "step": 5838 + }, + { + "epoch": 1.4791640278657379, + "grad_norm": 3.5970618724823, + "learning_rate": 8.436441198340864e-06, + "loss": 0.8552, + "step": 5839 + }, + { + "epoch": 1.4794173527549082, + "grad_norm": 3.8830974102020264, + "learning_rate": 8.435832550473748e-06, + "loss": 0.7632, + "step": 5840 + }, + { + "epoch": 1.4796706776440787, + "grad_norm": 3.6924026012420654, + "learning_rate": 8.435223806130063e-06, + "loss": 0.8214, + "step": 5841 + }, + { + "epoch": 1.479924002533249, + "grad_norm": 3.2904129028320312, + "learning_rate": 8.434614965326904e-06, + "loss": 0.6991, + "step": 5842 + }, + { + "epoch": 1.4801773274224193, + "grad_norm": 3.5483806133270264, + "learning_rate": 8.434006028081365e-06, + "loss": 0.7987, + "step": 5843 + }, + { + "epoch": 1.4804306523115895, + "grad_norm": 3.480278730392456, + "learning_rate": 8.433396994410548e-06, + "loss": 0.8048, + "step": 5844 + }, + { + "epoch": 1.4806839772007598, + "grad_norm": 4.472566604614258, + "learning_rate": 8.432787864331553e-06, + "loss": 0.9373, + "step": 5845 + }, + { + "epoch": 1.4809373020899304, + "grad_norm": 3.7041289806365967, + "learning_rate": 8.432178637861483e-06, + "loss": 0.6865, + "step": 5846 + }, + { + "epoch": 1.4811906269791006, + "grad_norm": 3.6985971927642822, + "learning_rate": 8.431569315017444e-06, + "loss": 0.7535, + "step": 5847 + }, + { + "epoch": 1.481443951868271, + "grad_norm": 3.5373966693878174, + "learning_rate": 8.430959895816548e-06, + "loss": 0.8311, + "step": 5848 + }, + { + "epoch": 1.4816972767574415, + "grad_norm": 3.5233519077301025, + "learning_rate": 8.430350380275907e-06, + "loss": 0.8306, + "step": 5849 + }, + { + "epoch": 1.4819506016466117, + "grad_norm": 3.967841625213623, + "learning_rate": 8.429740768412636e-06, + "loss": 0.734, + "step": 5850 + }, + { + "epoch": 1.482203926535782, + "grad_norm": 3.6912450790405273, + "learning_rate": 8.42913106024385e-06, + "loss": 0.7144, + "step": 5851 + }, + { + "epoch": 1.4824572514249525, + "grad_norm": 3.5270895957946777, + "learning_rate": 8.42852125578667e-06, + "loss": 0.7088, + "step": 5852 + }, + { + "epoch": 1.4827105763141228, + "grad_norm": 3.1713204383850098, + "learning_rate": 8.42791135505822e-06, + "loss": 0.7399, + "step": 5853 + }, + { + "epoch": 1.4829639012032931, + "grad_norm": 4.049830436706543, + "learning_rate": 8.427301358075627e-06, + "loss": 0.8691, + "step": 5854 + }, + { + "epoch": 1.4832172260924636, + "grad_norm": 3.835407018661499, + "learning_rate": 8.426691264856019e-06, + "loss": 0.7234, + "step": 5855 + }, + { + "epoch": 1.483470550981634, + "grad_norm": 3.9003965854644775, + "learning_rate": 8.426081075416524e-06, + "loss": 0.881, + "step": 5856 + }, + { + "epoch": 1.4837238758708042, + "grad_norm": 3.7081782817840576, + "learning_rate": 8.42547078977428e-06, + "loss": 0.8501, + "step": 5857 + }, + { + "epoch": 1.4839772007599747, + "grad_norm": 4.005224227905273, + "learning_rate": 8.424860407946421e-06, + "loss": 0.8632, + "step": 5858 + }, + { + "epoch": 1.484230525649145, + "grad_norm": 4.061598300933838, + "learning_rate": 8.424249929950087e-06, + "loss": 0.7495, + "step": 5859 + }, + { + "epoch": 1.4844838505383153, + "grad_norm": 3.4367449283599854, + "learning_rate": 8.42363935580242e-06, + "loss": 0.7512, + "step": 5860 + }, + { + "epoch": 1.4847371754274858, + "grad_norm": 3.680394411087036, + "learning_rate": 8.423028685520565e-06, + "loss": 0.6946, + "step": 5861 + }, + { + "epoch": 1.4849905003166561, + "grad_norm": 3.8052899837493896, + "learning_rate": 8.422417919121666e-06, + "loss": 0.911, + "step": 5862 + }, + { + "epoch": 1.4852438252058264, + "grad_norm": 3.931506633758545, + "learning_rate": 8.421807056622879e-06, + "loss": 0.9362, + "step": 5863 + }, + { + "epoch": 1.485497150094997, + "grad_norm": 4.07252836227417, + "learning_rate": 8.421196098041352e-06, + "loss": 0.8866, + "step": 5864 + }, + { + "epoch": 1.4857504749841672, + "grad_norm": 3.3964853286743164, + "learning_rate": 8.420585043394243e-06, + "loss": 0.8341, + "step": 5865 + }, + { + "epoch": 1.4860037998733375, + "grad_norm": 3.8817386627197266, + "learning_rate": 8.419973892698708e-06, + "loss": 0.7812, + "step": 5866 + }, + { + "epoch": 1.486257124762508, + "grad_norm": 3.665032386779785, + "learning_rate": 8.419362645971909e-06, + "loss": 0.6833, + "step": 5867 + }, + { + "epoch": 1.4865104496516783, + "grad_norm": 3.324069023132324, + "learning_rate": 8.41875130323101e-06, + "loss": 0.856, + "step": 5868 + }, + { + "epoch": 1.4867637745408486, + "grad_norm": 3.504530906677246, + "learning_rate": 8.418139864493178e-06, + "loss": 0.6517, + "step": 5869 + }, + { + "epoch": 1.487017099430019, + "grad_norm": 3.9483985900878906, + "learning_rate": 8.41752832977558e-06, + "loss": 0.699, + "step": 5870 + }, + { + "epoch": 1.4872704243191894, + "grad_norm": 3.775846481323242, + "learning_rate": 8.416916699095385e-06, + "loss": 0.8915, + "step": 5871 + }, + { + "epoch": 1.4875237492083597, + "grad_norm": 3.772552728652954, + "learning_rate": 8.416304972469774e-06, + "loss": 0.7129, + "step": 5872 + }, + { + "epoch": 1.48777707409753, + "grad_norm": 4.275998115539551, + "learning_rate": 8.41569314991592e-06, + "loss": 0.8379, + "step": 5873 + }, + { + "epoch": 1.4880303989867003, + "grad_norm": 3.6327340602874756, + "learning_rate": 8.415081231451004e-06, + "loss": 0.6871, + "step": 5874 + }, + { + "epoch": 1.4882837238758708, + "grad_norm": 3.631073236465454, + "learning_rate": 8.414469217092206e-06, + "loss": 0.7939, + "step": 5875 + }, + { + "epoch": 1.4885370487650411, + "grad_norm": 3.3416197299957275, + "learning_rate": 8.413857106856711e-06, + "loss": 0.7443, + "step": 5876 + }, + { + "epoch": 1.4887903736542114, + "grad_norm": 3.8389337062835693, + "learning_rate": 8.41324490076171e-06, + "loss": 0.8646, + "step": 5877 + }, + { + "epoch": 1.489043698543382, + "grad_norm": 3.729714870452881, + "learning_rate": 8.412632598824395e-06, + "loss": 0.753, + "step": 5878 + }, + { + "epoch": 1.4892970234325522, + "grad_norm": 3.532261848449707, + "learning_rate": 8.412020201061952e-06, + "loss": 0.7507, + "step": 5879 + }, + { + "epoch": 1.4895503483217225, + "grad_norm": 3.9118940830230713, + "learning_rate": 8.411407707491584e-06, + "loss": 0.8067, + "step": 5880 + }, + { + "epoch": 1.489803673210893, + "grad_norm": 3.997044324874878, + "learning_rate": 8.410795118130483e-06, + "loss": 0.756, + "step": 5881 + }, + { + "epoch": 1.4900569981000633, + "grad_norm": 3.8973376750946045, + "learning_rate": 8.410182432995855e-06, + "loss": 0.7656, + "step": 5882 + }, + { + "epoch": 1.4903103229892336, + "grad_norm": 3.140139579772949, + "learning_rate": 8.409569652104905e-06, + "loss": 0.7283, + "step": 5883 + }, + { + "epoch": 1.4905636478784041, + "grad_norm": 3.708293914794922, + "learning_rate": 8.408956775474835e-06, + "loss": 0.7921, + "step": 5884 + }, + { + "epoch": 1.4908169727675744, + "grad_norm": 3.818350315093994, + "learning_rate": 8.408343803122856e-06, + "loss": 0.8596, + "step": 5885 + }, + { + "epoch": 1.4910702976567447, + "grad_norm": 3.35256028175354, + "learning_rate": 8.407730735066179e-06, + "loss": 0.7102, + "step": 5886 + }, + { + "epoch": 1.4913236225459152, + "grad_norm": 3.895524740219116, + "learning_rate": 8.407117571322023e-06, + "loss": 0.7286, + "step": 5887 + }, + { + "epoch": 1.4915769474350855, + "grad_norm": 3.782482624053955, + "learning_rate": 8.406504311907602e-06, + "loss": 0.8667, + "step": 5888 + }, + { + "epoch": 1.4918302723242558, + "grad_norm": 3.7339634895324707, + "learning_rate": 8.405890956840136e-06, + "loss": 0.8495, + "step": 5889 + }, + { + "epoch": 1.4920835972134263, + "grad_norm": 3.5503053665161133, + "learning_rate": 8.40527750613685e-06, + "loss": 0.8218, + "step": 5890 + }, + { + "epoch": 1.4923369221025966, + "grad_norm": 3.9296751022338867, + "learning_rate": 8.404663959814964e-06, + "loss": 0.8036, + "step": 5891 + }, + { + "epoch": 1.492590246991767, + "grad_norm": 4.208617687225342, + "learning_rate": 8.40405031789171e-06, + "loss": 0.8858, + "step": 5892 + }, + { + "epoch": 1.4928435718809374, + "grad_norm": 3.8549439907073975, + "learning_rate": 8.403436580384321e-06, + "loss": 0.7757, + "step": 5893 + }, + { + "epoch": 1.4930968967701077, + "grad_norm": 3.352830410003662, + "learning_rate": 8.402822747310026e-06, + "loss": 0.7826, + "step": 5894 + }, + { + "epoch": 1.493350221659278, + "grad_norm": 3.9335927963256836, + "learning_rate": 8.402208818686066e-06, + "loss": 0.8549, + "step": 5895 + }, + { + "epoch": 1.4936035465484485, + "grad_norm": 3.6129844188690186, + "learning_rate": 8.401594794529674e-06, + "loss": 0.6835, + "step": 5896 + }, + { + "epoch": 1.4938568714376188, + "grad_norm": 3.4007370471954346, + "learning_rate": 8.400980674858095e-06, + "loss": 0.8272, + "step": 5897 + }, + { + "epoch": 1.494110196326789, + "grad_norm": 3.628884792327881, + "learning_rate": 8.400366459688574e-06, + "loss": 0.7141, + "step": 5898 + }, + { + "epoch": 1.4943635212159594, + "grad_norm": 3.4113211631774902, + "learning_rate": 8.399752149038355e-06, + "loss": 0.6891, + "step": 5899 + }, + { + "epoch": 1.49461684610513, + "grad_norm": 4.004037857055664, + "learning_rate": 8.39913774292469e-06, + "loss": 0.7637, + "step": 5900 + }, + { + "epoch": 1.4948701709943002, + "grad_norm": 3.8062174320220947, + "learning_rate": 8.398523241364829e-06, + "loss": 0.7391, + "step": 5901 + }, + { + "epoch": 1.4951234958834705, + "grad_norm": 3.5870893001556396, + "learning_rate": 8.397908644376029e-06, + "loss": 0.8925, + "step": 5902 + }, + { + "epoch": 1.4953768207726408, + "grad_norm": 3.6207046508789062, + "learning_rate": 8.397293951975546e-06, + "loss": 0.7037, + "step": 5903 + }, + { + "epoch": 1.4956301456618113, + "grad_norm": 3.3794548511505127, + "learning_rate": 8.396679164180641e-06, + "loss": 0.6752, + "step": 5904 + }, + { + "epoch": 1.4958834705509816, + "grad_norm": 3.95609974861145, + "learning_rate": 8.39606428100858e-06, + "loss": 0.7661, + "step": 5905 + }, + { + "epoch": 1.4961367954401519, + "grad_norm": 3.7931015491485596, + "learning_rate": 8.395449302476623e-06, + "loss": 0.8, + "step": 5906 + }, + { + "epoch": 1.4963901203293224, + "grad_norm": 3.8280766010284424, + "learning_rate": 8.39483422860204e-06, + "loss": 0.8512, + "step": 5907 + }, + { + "epoch": 1.4966434452184927, + "grad_norm": 4.196775913238525, + "learning_rate": 8.394219059402106e-06, + "loss": 0.7962, + "step": 5908 + }, + { + "epoch": 1.496896770107663, + "grad_norm": 3.526451587677002, + "learning_rate": 8.39360379489409e-06, + "loss": 0.7187, + "step": 5909 + }, + { + "epoch": 1.4971500949968335, + "grad_norm": 3.7106826305389404, + "learning_rate": 8.392988435095268e-06, + "loss": 0.7334, + "step": 5910 + }, + { + "epoch": 1.4974034198860038, + "grad_norm": 3.939286231994629, + "learning_rate": 8.392372980022923e-06, + "loss": 0.8712, + "step": 5911 + }, + { + "epoch": 1.497656744775174, + "grad_norm": 3.819657802581787, + "learning_rate": 8.391757429694336e-06, + "loss": 0.7607, + "step": 5912 + }, + { + "epoch": 1.4979100696643446, + "grad_norm": 4.027267932891846, + "learning_rate": 8.391141784126789e-06, + "loss": 0.7891, + "step": 5913 + }, + { + "epoch": 1.4981633945535149, + "grad_norm": 3.8232924938201904, + "learning_rate": 8.390526043337568e-06, + "loss": 0.8584, + "step": 5914 + }, + { + "epoch": 1.4984167194426852, + "grad_norm": 4.053662300109863, + "learning_rate": 8.389910207343967e-06, + "loss": 0.8404, + "step": 5915 + }, + { + "epoch": 1.4986700443318557, + "grad_norm": 4.3264360427856445, + "learning_rate": 8.389294276163276e-06, + "loss": 0.8765, + "step": 5916 + }, + { + "epoch": 1.498923369221026, + "grad_norm": 3.9155824184417725, + "learning_rate": 8.388678249812789e-06, + "loss": 0.8662, + "step": 5917 + }, + { + "epoch": 1.4991766941101963, + "grad_norm": 3.5717077255249023, + "learning_rate": 8.388062128309806e-06, + "loss": 0.7999, + "step": 5918 + }, + { + "epoch": 1.4994300189993668, + "grad_norm": 3.320936918258667, + "learning_rate": 8.387445911671626e-06, + "loss": 0.7039, + "step": 5919 + }, + { + "epoch": 1.499683343888537, + "grad_norm": 3.697159767150879, + "learning_rate": 8.38682959991555e-06, + "loss": 0.7215, + "step": 5920 + }, + { + "epoch": 1.4999366687777074, + "grad_norm": 3.7197892665863037, + "learning_rate": 8.38621319305889e-06, + "loss": 0.7379, + "step": 5921 + }, + { + "epoch": 1.5001899936668779, + "grad_norm": 3.9652554988861084, + "learning_rate": 8.38559669111895e-06, + "loss": 0.6469, + "step": 5922 + }, + { + "epoch": 1.5004433185560482, + "grad_norm": 3.6747124195098877, + "learning_rate": 8.384980094113042e-06, + "loss": 0.7645, + "step": 5923 + }, + { + "epoch": 1.5006966434452185, + "grad_norm": 3.62958025932312, + "learning_rate": 8.384363402058477e-06, + "loss": 0.7681, + "step": 5924 + }, + { + "epoch": 1.500949968334389, + "grad_norm": 3.7773592472076416, + "learning_rate": 8.383746614972574e-06, + "loss": 0.8069, + "step": 5925 + }, + { + "epoch": 1.501203293223559, + "grad_norm": 3.7049543857574463, + "learning_rate": 8.383129732872654e-06, + "loss": 0.8527, + "step": 5926 + }, + { + "epoch": 1.5014566181127296, + "grad_norm": 3.6561789512634277, + "learning_rate": 8.382512755776036e-06, + "loss": 0.728, + "step": 5927 + }, + { + "epoch": 1.5017099430019, + "grad_norm": 3.594363212585449, + "learning_rate": 8.381895683700045e-06, + "loss": 0.7722, + "step": 5928 + }, + { + "epoch": 1.5019632678910702, + "grad_norm": 3.8756494522094727, + "learning_rate": 8.381278516662009e-06, + "loss": 0.7587, + "step": 5929 + }, + { + "epoch": 1.5022165927802407, + "grad_norm": 3.8373332023620605, + "learning_rate": 8.380661254679254e-06, + "loss": 0.8258, + "step": 5930 + }, + { + "epoch": 1.5024699176694112, + "grad_norm": 3.442307233810425, + "learning_rate": 8.380043897769118e-06, + "loss": 0.7129, + "step": 5931 + }, + { + "epoch": 1.5027232425585813, + "grad_norm": 4.300326824188232, + "learning_rate": 8.379426445948933e-06, + "loss": 0.8127, + "step": 5932 + }, + { + "epoch": 1.5029765674477518, + "grad_norm": 4.1712422370910645, + "learning_rate": 8.378808899236037e-06, + "loss": 0.7044, + "step": 5933 + }, + { + "epoch": 1.503229892336922, + "grad_norm": 3.6050972938537598, + "learning_rate": 8.378191257647772e-06, + "loss": 0.7865, + "step": 5934 + }, + { + "epoch": 1.5034832172260923, + "grad_norm": 4.093749046325684, + "learning_rate": 8.377573521201479e-06, + "loss": 0.8434, + "step": 5935 + }, + { + "epoch": 1.5037365421152629, + "grad_norm": 3.5932419300079346, + "learning_rate": 8.376955689914503e-06, + "loss": 0.8215, + "step": 5936 + }, + { + "epoch": 1.5039898670044332, + "grad_norm": 3.8248860836029053, + "learning_rate": 8.376337763804196e-06, + "loss": 0.7794, + "step": 5937 + }, + { + "epoch": 1.5042431918936034, + "grad_norm": 3.567612648010254, + "learning_rate": 8.375719742887906e-06, + "loss": 0.6918, + "step": 5938 + }, + { + "epoch": 1.504496516782774, + "grad_norm": 3.5386483669281006, + "learning_rate": 8.375101627182986e-06, + "loss": 0.8891, + "step": 5939 + }, + { + "epoch": 1.5047498416719443, + "grad_norm": 3.645420551300049, + "learning_rate": 8.374483416706797e-06, + "loss": 0.8245, + "step": 5940 + }, + { + "epoch": 1.5050031665611145, + "grad_norm": 3.855353593826294, + "learning_rate": 8.373865111476694e-06, + "loss": 0.9123, + "step": 5941 + }, + { + "epoch": 1.505256491450285, + "grad_norm": 3.625429391860962, + "learning_rate": 8.373246711510041e-06, + "loss": 0.7567, + "step": 5942 + }, + { + "epoch": 1.5055098163394554, + "grad_norm": 3.689708948135376, + "learning_rate": 8.372628216824202e-06, + "loss": 0.845, + "step": 5943 + }, + { + "epoch": 1.5057631412286256, + "grad_norm": 3.7238686084747314, + "learning_rate": 8.372009627436539e-06, + "loss": 0.7627, + "step": 5944 + }, + { + "epoch": 1.5060164661177962, + "grad_norm": 3.5374462604522705, + "learning_rate": 8.37139094336443e-06, + "loss": 0.7364, + "step": 5945 + }, + { + "epoch": 1.5062697910069665, + "grad_norm": 3.7947185039520264, + "learning_rate": 8.370772164625242e-06, + "loss": 0.726, + "step": 5946 + }, + { + "epoch": 1.5065231158961367, + "grad_norm": 3.910332441329956, + "learning_rate": 8.370153291236351e-06, + "loss": 0.7871, + "step": 5947 + }, + { + "epoch": 1.5067764407853073, + "grad_norm": 4.113910675048828, + "learning_rate": 8.369534323215136e-06, + "loss": 0.841, + "step": 5948 + }, + { + "epoch": 1.5070297656744776, + "grad_norm": 3.6673974990844727, + "learning_rate": 8.368915260578976e-06, + "loss": 0.8295, + "step": 5949 + }, + { + "epoch": 1.5072830905636478, + "grad_norm": 3.6641221046447754, + "learning_rate": 8.368296103345255e-06, + "loss": 0.7011, + "step": 5950 + }, + { + "epoch": 1.5075364154528184, + "grad_norm": 3.5324556827545166, + "learning_rate": 8.367676851531359e-06, + "loss": 0.8896, + "step": 5951 + }, + { + "epoch": 1.5077897403419886, + "grad_norm": 3.4693355560302734, + "learning_rate": 8.367057505154674e-06, + "loss": 0.8551, + "step": 5952 + }, + { + "epoch": 1.508043065231159, + "grad_norm": 3.685776710510254, + "learning_rate": 8.366438064232591e-06, + "loss": 0.7481, + "step": 5953 + }, + { + "epoch": 1.5082963901203295, + "grad_norm": 3.870347738265991, + "learning_rate": 8.365818528782506e-06, + "loss": 0.9026, + "step": 5954 + }, + { + "epoch": 1.5085497150094995, + "grad_norm": 3.93721079826355, + "learning_rate": 8.365198898821816e-06, + "loss": 0.6967, + "step": 5955 + }, + { + "epoch": 1.50880303989867, + "grad_norm": 4.287343978881836, + "learning_rate": 8.364579174367914e-06, + "loss": 0.7866, + "step": 5956 + }, + { + "epoch": 1.5090563647878406, + "grad_norm": 3.6782515048980713, + "learning_rate": 8.36395935543821e-06, + "loss": 0.819, + "step": 5957 + }, + { + "epoch": 1.5093096896770106, + "grad_norm": 3.3889825344085693, + "learning_rate": 8.363339442050102e-06, + "loss": 0.7005, + "step": 5958 + }, + { + "epoch": 1.5095630145661811, + "grad_norm": 3.817948341369629, + "learning_rate": 8.362719434220999e-06, + "loss": 0.77, + "step": 5959 + }, + { + "epoch": 1.5098163394553517, + "grad_norm": 3.713702440261841, + "learning_rate": 8.36209933196831e-06, + "loss": 0.7629, + "step": 5960 + }, + { + "epoch": 1.5100696643445217, + "grad_norm": 3.784099817276001, + "learning_rate": 8.361479135309448e-06, + "loss": 0.9719, + "step": 5961 + }, + { + "epoch": 1.5103229892336922, + "grad_norm": 3.7557411193847656, + "learning_rate": 8.360858844261828e-06, + "loss": 0.7447, + "step": 5962 + }, + { + "epoch": 1.5105763141228625, + "grad_norm": 4.17030668258667, + "learning_rate": 8.360238458842866e-06, + "loss": 0.7602, + "step": 5963 + }, + { + "epoch": 1.5108296390120328, + "grad_norm": 3.6455342769622803, + "learning_rate": 8.359617979069983e-06, + "loss": 0.7874, + "step": 5964 + }, + { + "epoch": 1.5110829639012033, + "grad_norm": 4.094343662261963, + "learning_rate": 8.3589974049606e-06, + "loss": 0.7528, + "step": 5965 + }, + { + "epoch": 1.5113362887903736, + "grad_norm": 3.721818685531616, + "learning_rate": 8.358376736532147e-06, + "loss": 0.7711, + "step": 5966 + }, + { + "epoch": 1.511589613679544, + "grad_norm": 3.379783868789673, + "learning_rate": 8.357755973802048e-06, + "loss": 0.8169, + "step": 5967 + }, + { + "epoch": 1.5118429385687144, + "grad_norm": 3.173121213912964, + "learning_rate": 8.357135116787736e-06, + "loss": 0.8007, + "step": 5968 + }, + { + "epoch": 1.5120962634578847, + "grad_norm": 3.5667548179626465, + "learning_rate": 8.356514165506642e-06, + "loss": 0.7001, + "step": 5969 + }, + { + "epoch": 1.512349588347055, + "grad_norm": 3.823620557785034, + "learning_rate": 8.355893119976203e-06, + "loss": 0.7559, + "step": 5970 + }, + { + "epoch": 1.5126029132362255, + "grad_norm": 4.1985931396484375, + "learning_rate": 8.355271980213859e-06, + "loss": 0.756, + "step": 5971 + }, + { + "epoch": 1.5128562381253958, + "grad_norm": 4.071039199829102, + "learning_rate": 8.35465074623705e-06, + "loss": 0.7873, + "step": 5972 + }, + { + "epoch": 1.5131095630145661, + "grad_norm": 4.075260639190674, + "learning_rate": 8.35402941806322e-06, + "loss": 0.7944, + "step": 5973 + }, + { + "epoch": 1.5133628879037366, + "grad_norm": 3.681140899658203, + "learning_rate": 8.353407995709818e-06, + "loss": 0.8534, + "step": 5974 + }, + { + "epoch": 1.513616212792907, + "grad_norm": 3.6451416015625, + "learning_rate": 8.352786479194288e-06, + "loss": 0.831, + "step": 5975 + }, + { + "epoch": 1.5138695376820772, + "grad_norm": 3.862677574157715, + "learning_rate": 8.352164868534085e-06, + "loss": 0.7449, + "step": 5976 + }, + { + "epoch": 1.5141228625712477, + "grad_norm": 3.7428784370422363, + "learning_rate": 8.351543163746667e-06, + "loss": 0.7445, + "step": 5977 + }, + { + "epoch": 1.514376187460418, + "grad_norm": 4.085680961608887, + "learning_rate": 8.350921364849485e-06, + "loss": 0.857, + "step": 5978 + }, + { + "epoch": 1.5146295123495883, + "grad_norm": 3.952937364578247, + "learning_rate": 8.350299471860003e-06, + "loss": 0.7919, + "step": 5979 + }, + { + "epoch": 1.5148828372387588, + "grad_norm": 3.6056969165802, + "learning_rate": 8.34967748479568e-06, + "loss": 0.8, + "step": 5980 + }, + { + "epoch": 1.5151361621279291, + "grad_norm": 3.8927102088928223, + "learning_rate": 8.349055403673984e-06, + "loss": 0.8612, + "step": 5981 + }, + { + "epoch": 1.5153894870170994, + "grad_norm": 4.043025970458984, + "learning_rate": 8.348433228512382e-06, + "loss": 0.7934, + "step": 5982 + }, + { + "epoch": 1.51564281190627, + "grad_norm": 3.595097780227661, + "learning_rate": 8.347810959328346e-06, + "loss": 0.878, + "step": 5983 + }, + { + "epoch": 1.51589613679544, + "grad_norm": 3.671607732772827, + "learning_rate": 8.347188596139346e-06, + "loss": 0.7493, + "step": 5984 + }, + { + "epoch": 1.5161494616846105, + "grad_norm": 3.481942892074585, + "learning_rate": 8.34656613896286e-06, + "loss": 0.7861, + "step": 5985 + }, + { + "epoch": 1.516402786573781, + "grad_norm": 3.783560037612915, + "learning_rate": 8.345943587816363e-06, + "loss": 0.7773, + "step": 5986 + }, + { + "epoch": 1.516656111462951, + "grad_norm": 3.979797840118408, + "learning_rate": 8.345320942717339e-06, + "loss": 0.814, + "step": 5987 + }, + { + "epoch": 1.5169094363521216, + "grad_norm": 3.455920934677124, + "learning_rate": 8.344698203683273e-06, + "loss": 0.7721, + "step": 5988 + }, + { + "epoch": 1.517162761241292, + "grad_norm": 4.146459579467773, + "learning_rate": 8.344075370731646e-06, + "loss": 0.9018, + "step": 5989 + }, + { + "epoch": 1.5174160861304622, + "grad_norm": 4.218571662902832, + "learning_rate": 8.343452443879951e-06, + "loss": 0.9181, + "step": 5990 + }, + { + "epoch": 1.5176694110196327, + "grad_norm": 3.3880550861358643, + "learning_rate": 8.34282942314568e-06, + "loss": 0.784, + "step": 5991 + }, + { + "epoch": 1.517922735908803, + "grad_norm": 3.512906789779663, + "learning_rate": 8.342206308546323e-06, + "loss": 0.8124, + "step": 5992 + }, + { + "epoch": 1.5181760607979733, + "grad_norm": 4.195394515991211, + "learning_rate": 8.341583100099379e-06, + "loss": 0.8566, + "step": 5993 + }, + { + "epoch": 1.5184293856871438, + "grad_norm": 3.700171947479248, + "learning_rate": 8.34095979782235e-06, + "loss": 0.7759, + "step": 5994 + }, + { + "epoch": 1.518682710576314, + "grad_norm": 3.8103458881378174, + "learning_rate": 8.340336401732733e-06, + "loss": 0.8734, + "step": 5995 + }, + { + "epoch": 1.5189360354654844, + "grad_norm": 3.7539734840393066, + "learning_rate": 8.339712911848039e-06, + "loss": 0.8815, + "step": 5996 + }, + { + "epoch": 1.519189360354655, + "grad_norm": 3.4617977142333984, + "learning_rate": 8.33908932818577e-06, + "loss": 0.7813, + "step": 5997 + }, + { + "epoch": 1.5194426852438252, + "grad_norm": 3.9092464447021484, + "learning_rate": 8.338465650763437e-06, + "loss": 0.7633, + "step": 5998 + }, + { + "epoch": 1.5196960101329955, + "grad_norm": 3.7686939239501953, + "learning_rate": 8.337841879598554e-06, + "loss": 0.8092, + "step": 5999 + }, + { + "epoch": 1.519949335022166, + "grad_norm": 3.54663348197937, + "learning_rate": 8.337218014708635e-06, + "loss": 0.7971, + "step": 6000 + }, + { + "epoch": 1.519949335022166, + "eval_loss": 1.1638351678848267, + "eval_runtime": 13.1002, + "eval_samples_per_second": 30.534, + "eval_steps_per_second": 3.817, + "step": 6000 + }, + { + "epoch": 1.5202026599113363, + "grad_norm": 3.7645788192749023, + "learning_rate": 8.336594056111197e-06, + "loss": 0.9124, + "step": 6001 + }, + { + "epoch": 1.5204559848005066, + "grad_norm": 4.3909149169921875, + "learning_rate": 8.335970003823763e-06, + "loss": 0.8832, + "step": 6002 + }, + { + "epoch": 1.520709309689677, + "grad_norm": 3.7304556369781494, + "learning_rate": 8.335345857863855e-06, + "loss": 0.706, + "step": 6003 + }, + { + "epoch": 1.5209626345788474, + "grad_norm": 3.730750560760498, + "learning_rate": 8.334721618248998e-06, + "loss": 0.9056, + "step": 6004 + }, + { + "epoch": 1.5212159594680177, + "grad_norm": 3.7754292488098145, + "learning_rate": 8.334097284996721e-06, + "loss": 0.799, + "step": 6005 + }, + { + "epoch": 1.5214692843571882, + "grad_norm": 3.810028076171875, + "learning_rate": 8.333472858124557e-06, + "loss": 0.8023, + "step": 6006 + }, + { + "epoch": 1.5217226092463585, + "grad_norm": 3.8349390029907227, + "learning_rate": 8.332848337650034e-06, + "loss": 0.8124, + "step": 6007 + }, + { + "epoch": 1.5219759341355288, + "grad_norm": 3.920043468475342, + "learning_rate": 8.332223723590693e-06, + "loss": 0.8803, + "step": 6008 + }, + { + "epoch": 1.5222292590246993, + "grad_norm": 3.2182974815368652, + "learning_rate": 8.331599015964071e-06, + "loss": 0.6727, + "step": 6009 + }, + { + "epoch": 1.5224825839138696, + "grad_norm": 3.548264265060425, + "learning_rate": 8.330974214787712e-06, + "loss": 0.7973, + "step": 6010 + }, + { + "epoch": 1.5227359088030399, + "grad_norm": 3.722736358642578, + "learning_rate": 8.330349320079156e-06, + "loss": 0.8003, + "step": 6011 + }, + { + "epoch": 1.5229892336922104, + "grad_norm": 3.8040225505828857, + "learning_rate": 8.329724331855953e-06, + "loss": 0.6691, + "step": 6012 + }, + { + "epoch": 1.5232425585813805, + "grad_norm": 3.4361138343811035, + "learning_rate": 8.329099250135652e-06, + "loss": 0.7329, + "step": 6013 + }, + { + "epoch": 1.523495883470551, + "grad_norm": 4.150000095367432, + "learning_rate": 8.328474074935803e-06, + "loss": 0.7398, + "step": 6014 + }, + { + "epoch": 1.5237492083597215, + "grad_norm": 4.133837699890137, + "learning_rate": 8.327848806273962e-06, + "loss": 0.8963, + "step": 6015 + }, + { + "epoch": 1.5240025332488916, + "grad_norm": 3.950253486633301, + "learning_rate": 8.327223444167688e-06, + "loss": 0.8378, + "step": 6016 + }, + { + "epoch": 1.524255858138062, + "grad_norm": 3.9877703189849854, + "learning_rate": 8.326597988634538e-06, + "loss": 0.8598, + "step": 6017 + }, + { + "epoch": 1.5245091830272324, + "grad_norm": 3.686001777648926, + "learning_rate": 8.325972439692075e-06, + "loss": 0.7589, + "step": 6018 + }, + { + "epoch": 1.5247625079164027, + "grad_norm": 3.9073050022125244, + "learning_rate": 8.325346797357865e-06, + "loss": 0.9342, + "step": 6019 + }, + { + "epoch": 1.5250158328055732, + "grad_norm": 3.502190113067627, + "learning_rate": 8.324721061649475e-06, + "loss": 0.6362, + "step": 6020 + }, + { + "epoch": 1.5252691576947435, + "grad_norm": 3.5541303157806396, + "learning_rate": 8.324095232584477e-06, + "loss": 0.6861, + "step": 6021 + }, + { + "epoch": 1.5255224825839138, + "grad_norm": 3.7686498165130615, + "learning_rate": 8.323469310180442e-06, + "loss": 0.8424, + "step": 6022 + }, + { + "epoch": 1.5257758074730843, + "grad_norm": 3.8931078910827637, + "learning_rate": 8.322843294454946e-06, + "loss": 0.8232, + "step": 6023 + }, + { + "epoch": 1.5260291323622546, + "grad_norm": 4.055556774139404, + "learning_rate": 8.322217185425568e-06, + "loss": 0.8232, + "step": 6024 + }, + { + "epoch": 1.5262824572514249, + "grad_norm": 3.690748453140259, + "learning_rate": 8.321590983109889e-06, + "loss": 0.7097, + "step": 6025 + }, + { + "epoch": 1.5265357821405954, + "grad_norm": 3.429391622543335, + "learning_rate": 8.320964687525492e-06, + "loss": 0.8544, + "step": 6026 + }, + { + "epoch": 1.5267891070297657, + "grad_norm": 3.9555764198303223, + "learning_rate": 8.320338298689963e-06, + "loss": 0.7345, + "step": 6027 + }, + { + "epoch": 1.527042431918936, + "grad_norm": 3.7725577354431152, + "learning_rate": 8.31971181662089e-06, + "loss": 0.6701, + "step": 6028 + }, + { + "epoch": 1.5272957568081065, + "grad_norm": 3.3939309120178223, + "learning_rate": 8.319085241335865e-06, + "loss": 0.7715, + "step": 6029 + }, + { + "epoch": 1.5275490816972768, + "grad_norm": 3.706507682800293, + "learning_rate": 8.318458572852484e-06, + "loss": 0.7827, + "step": 6030 + }, + { + "epoch": 1.527802406586447, + "grad_norm": 3.487157106399536, + "learning_rate": 8.317831811188339e-06, + "loss": 0.7933, + "step": 6031 + }, + { + "epoch": 1.5280557314756176, + "grad_norm": 4.3159708976745605, + "learning_rate": 8.317204956361033e-06, + "loss": 0.7806, + "step": 6032 + }, + { + "epoch": 1.5283090563647879, + "grad_norm": 3.9915390014648438, + "learning_rate": 8.316578008388165e-06, + "loss": 0.7466, + "step": 6033 + }, + { + "epoch": 1.5285623812539582, + "grad_norm": 3.8348469734191895, + "learning_rate": 8.315950967287343e-06, + "loss": 0.8585, + "step": 6034 + }, + { + "epoch": 1.5288157061431287, + "grad_norm": 4.7072672843933105, + "learning_rate": 8.315323833076171e-06, + "loss": 0.7936, + "step": 6035 + }, + { + "epoch": 1.529069031032299, + "grad_norm": 3.6637141704559326, + "learning_rate": 8.31469660577226e-06, + "loss": 0.7369, + "step": 6036 + }, + { + "epoch": 1.5293223559214693, + "grad_norm": 3.2571606636047363, + "learning_rate": 8.314069285393222e-06, + "loss": 0.7446, + "step": 6037 + }, + { + "epoch": 1.5295756808106398, + "grad_norm": 3.9346470832824707, + "learning_rate": 8.313441871956671e-06, + "loss": 0.6638, + "step": 6038 + }, + { + "epoch": 1.5298290056998098, + "grad_norm": 3.5464725494384766, + "learning_rate": 8.312814365480225e-06, + "loss": 0.8284, + "step": 6039 + }, + { + "epoch": 1.5300823305889804, + "grad_norm": 3.887022018432617, + "learning_rate": 8.312186765981504e-06, + "loss": 0.7729, + "step": 6040 + }, + { + "epoch": 1.5303356554781509, + "grad_norm": 3.848708152770996, + "learning_rate": 8.311559073478133e-06, + "loss": 0.8094, + "step": 6041 + }, + { + "epoch": 1.530588980367321, + "grad_norm": 3.7966063022613525, + "learning_rate": 8.310931287987733e-06, + "loss": 0.8061, + "step": 6042 + }, + { + "epoch": 1.5308423052564915, + "grad_norm": 3.894991636276245, + "learning_rate": 8.310303409527935e-06, + "loss": 0.9106, + "step": 6043 + }, + { + "epoch": 1.531095630145662, + "grad_norm": 3.733093023300171, + "learning_rate": 8.30967543811637e-06, + "loss": 0.7332, + "step": 6044 + }, + { + "epoch": 1.531348955034832, + "grad_norm": 3.7139980792999268, + "learning_rate": 8.309047373770669e-06, + "loss": 0.7842, + "step": 6045 + }, + { + "epoch": 1.5316022799240026, + "grad_norm": 3.51163911819458, + "learning_rate": 8.308419216508467e-06, + "loss": 0.7031, + "step": 6046 + }, + { + "epoch": 1.5318556048131728, + "grad_norm": 4.045239448547363, + "learning_rate": 8.307790966347407e-06, + "loss": 0.8218, + "step": 6047 + }, + { + "epoch": 1.5321089297023431, + "grad_norm": 3.847369909286499, + "learning_rate": 8.307162623305125e-06, + "loss": 0.6955, + "step": 6048 + }, + { + "epoch": 1.5323622545915137, + "grad_norm": 3.456996440887451, + "learning_rate": 8.306534187399267e-06, + "loss": 0.8216, + "step": 6049 + }, + { + "epoch": 1.532615579480684, + "grad_norm": 3.546422243118286, + "learning_rate": 8.305905658647478e-06, + "loss": 0.7855, + "step": 6050 + }, + { + "epoch": 1.5328689043698542, + "grad_norm": 3.4630866050720215, + "learning_rate": 8.305277037067409e-06, + "loss": 0.7698, + "step": 6051 + }, + { + "epoch": 1.5331222292590247, + "grad_norm": 3.5826759338378906, + "learning_rate": 8.304648322676708e-06, + "loss": 0.7885, + "step": 6052 + }, + { + "epoch": 1.533375554148195, + "grad_norm": 3.8534812927246094, + "learning_rate": 8.304019515493031e-06, + "loss": 0.7569, + "step": 6053 + }, + { + "epoch": 1.5336288790373653, + "grad_norm": 3.715991973876953, + "learning_rate": 8.303390615534037e-06, + "loss": 0.8131, + "step": 6054 + }, + { + "epoch": 1.5338822039265358, + "grad_norm": 4.065611839294434, + "learning_rate": 8.302761622817381e-06, + "loss": 0.8437, + "step": 6055 + }, + { + "epoch": 1.5341355288157061, + "grad_norm": 3.571732521057129, + "learning_rate": 8.302132537360726e-06, + "loss": 0.7383, + "step": 6056 + }, + { + "epoch": 1.5343888537048764, + "grad_norm": 3.355422258377075, + "learning_rate": 8.301503359181738e-06, + "loss": 0.6935, + "step": 6057 + }, + { + "epoch": 1.534642178594047, + "grad_norm": 3.804828643798828, + "learning_rate": 8.300874088298083e-06, + "loss": 0.7726, + "step": 6058 + }, + { + "epoch": 1.5348955034832172, + "grad_norm": 3.91565203666687, + "learning_rate": 8.30024472472743e-06, + "loss": 0.9187, + "step": 6059 + }, + { + "epoch": 1.5351488283723875, + "grad_norm": 3.788347005844116, + "learning_rate": 8.299615268487454e-06, + "loss": 0.6832, + "step": 6060 + }, + { + "epoch": 1.535402153261558, + "grad_norm": 3.871217966079712, + "learning_rate": 8.298985719595824e-06, + "loss": 0.8161, + "step": 6061 + }, + { + "epoch": 1.5356554781507283, + "grad_norm": 3.481473445892334, + "learning_rate": 8.298356078070223e-06, + "loss": 0.7086, + "step": 6062 + }, + { + "epoch": 1.5359088030398986, + "grad_norm": 3.7261691093444824, + "learning_rate": 8.29772634392833e-06, + "loss": 0.7234, + "step": 6063 + }, + { + "epoch": 1.5361621279290691, + "grad_norm": 3.566100835800171, + "learning_rate": 8.297096517187826e-06, + "loss": 0.6692, + "step": 6064 + }, + { + "epoch": 1.5364154528182394, + "grad_norm": 3.87302827835083, + "learning_rate": 8.296466597866398e-06, + "loss": 0.8576, + "step": 6065 + }, + { + "epoch": 1.5366687777074097, + "grad_norm": 4.200307369232178, + "learning_rate": 8.295836585981731e-06, + "loss": 0.8331, + "step": 6066 + }, + { + "epoch": 1.5369221025965802, + "grad_norm": 3.3144524097442627, + "learning_rate": 8.295206481551518e-06, + "loss": 0.6906, + "step": 6067 + }, + { + "epoch": 1.5371754274857503, + "grad_norm": 3.8662407398223877, + "learning_rate": 8.294576284593453e-06, + "loss": 0.8347, + "step": 6068 + }, + { + "epoch": 1.5374287523749208, + "grad_norm": 3.577904224395752, + "learning_rate": 8.293945995125228e-06, + "loss": 0.7337, + "step": 6069 + }, + { + "epoch": 1.5376820772640913, + "grad_norm": 3.6100378036499023, + "learning_rate": 8.293315613164545e-06, + "loss": 0.8172, + "step": 6070 + }, + { + "epoch": 1.5379354021532614, + "grad_norm": 3.3867650032043457, + "learning_rate": 8.292685138729103e-06, + "loss": 0.7596, + "step": 6071 + }, + { + "epoch": 1.538188727042432, + "grad_norm": 3.6087770462036133, + "learning_rate": 8.292054571836604e-06, + "loss": 0.7493, + "step": 6072 + }, + { + "epoch": 1.5384420519316024, + "grad_norm": 3.534525156021118, + "learning_rate": 8.291423912504755e-06, + "loss": 0.676, + "step": 6073 + }, + { + "epoch": 1.5386953768207725, + "grad_norm": 3.5308618545532227, + "learning_rate": 8.290793160751267e-06, + "loss": 0.9538, + "step": 6074 + }, + { + "epoch": 1.538948701709943, + "grad_norm": 3.7343785762786865, + "learning_rate": 8.290162316593848e-06, + "loss": 0.8, + "step": 6075 + }, + { + "epoch": 1.5392020265991133, + "grad_norm": 4.010598182678223, + "learning_rate": 8.289531380050215e-06, + "loss": 0.7769, + "step": 6076 + }, + { + "epoch": 1.5394553514882836, + "grad_norm": 3.695286989212036, + "learning_rate": 8.28890035113808e-06, + "loss": 0.7746, + "step": 6077 + }, + { + "epoch": 1.5397086763774541, + "grad_norm": 3.6864147186279297, + "learning_rate": 8.288269229875167e-06, + "loss": 0.8323, + "step": 6078 + }, + { + "epoch": 1.5399620012666244, + "grad_norm": 3.7015504837036133, + "learning_rate": 8.287638016279193e-06, + "loss": 0.7643, + "step": 6079 + }, + { + "epoch": 1.5402153261557947, + "grad_norm": 4.204253196716309, + "learning_rate": 8.287006710367888e-06, + "loss": 0.9048, + "step": 6080 + }, + { + "epoch": 1.5404686510449652, + "grad_norm": 3.3711719512939453, + "learning_rate": 8.286375312158972e-06, + "loss": 0.733, + "step": 6081 + }, + { + "epoch": 1.5407219759341355, + "grad_norm": 4.128683090209961, + "learning_rate": 8.285743821670177e-06, + "loss": 0.8307, + "step": 6082 + }, + { + "epoch": 1.5409753008233058, + "grad_norm": 4.091029644012451, + "learning_rate": 8.285112238919237e-06, + "loss": 0.8763, + "step": 6083 + }, + { + "epoch": 1.5412286257124763, + "grad_norm": 3.8250246047973633, + "learning_rate": 8.284480563923884e-06, + "loss": 0.9446, + "step": 6084 + }, + { + "epoch": 1.5414819506016466, + "grad_norm": 3.400613784790039, + "learning_rate": 8.283848796701858e-06, + "loss": 0.8054, + "step": 6085 + }, + { + "epoch": 1.541735275490817, + "grad_norm": 3.6402640342712402, + "learning_rate": 8.283216937270895e-06, + "loss": 0.8743, + "step": 6086 + }, + { + "epoch": 1.5419886003799874, + "grad_norm": 3.620732307434082, + "learning_rate": 8.282584985648741e-06, + "loss": 0.8085, + "step": 6087 + }, + { + "epoch": 1.5422419252691577, + "grad_norm": 3.1914310455322266, + "learning_rate": 8.281952941853137e-06, + "loss": 0.7268, + "step": 6088 + }, + { + "epoch": 1.542495250158328, + "grad_norm": 3.587397813796997, + "learning_rate": 8.281320805901833e-06, + "loss": 0.7668, + "step": 6089 + }, + { + "epoch": 1.5427485750474985, + "grad_norm": 3.9668045043945312, + "learning_rate": 8.28068857781258e-06, + "loss": 0.7671, + "step": 6090 + }, + { + "epoch": 1.5430018999366688, + "grad_norm": 4.050612926483154, + "learning_rate": 8.280056257603128e-06, + "loss": 0.8197, + "step": 6091 + }, + { + "epoch": 1.543255224825839, + "grad_norm": 3.5672447681427, + "learning_rate": 8.279423845291234e-06, + "loss": 0.8322, + "step": 6092 + }, + { + "epoch": 1.5435085497150096, + "grad_norm": 3.5721328258514404, + "learning_rate": 8.278791340894657e-06, + "loss": 0.7212, + "step": 6093 + }, + { + "epoch": 1.54376187460418, + "grad_norm": 3.526211738586426, + "learning_rate": 8.278158744431153e-06, + "loss": 0.7312, + "step": 6094 + }, + { + "epoch": 1.5440151994933502, + "grad_norm": 3.6647751331329346, + "learning_rate": 8.27752605591849e-06, + "loss": 0.7607, + "step": 6095 + }, + { + "epoch": 1.5442685243825207, + "grad_norm": 3.6663804054260254, + "learning_rate": 8.27689327537443e-06, + "loss": 0.7713, + "step": 6096 + }, + { + "epoch": 1.5445218492716908, + "grad_norm": 4.1124162673950195, + "learning_rate": 8.276260402816743e-06, + "loss": 0.8451, + "step": 6097 + }, + { + "epoch": 1.5447751741608613, + "grad_norm": 4.174565315246582, + "learning_rate": 8.2756274382632e-06, + "loss": 0.8722, + "step": 6098 + }, + { + "epoch": 1.5450284990500318, + "grad_norm": 3.2694077491760254, + "learning_rate": 8.274994381731574e-06, + "loss": 0.6446, + "step": 6099 + }, + { + "epoch": 1.5452818239392019, + "grad_norm": 3.4651026725769043, + "learning_rate": 8.27436123323964e-06, + "loss": 0.7838, + "step": 6100 + }, + { + "epoch": 1.5455351488283724, + "grad_norm": 3.81234073638916, + "learning_rate": 8.273727992805177e-06, + "loss": 0.797, + "step": 6101 + }, + { + "epoch": 1.545788473717543, + "grad_norm": 3.965603828430176, + "learning_rate": 8.273094660445966e-06, + "loss": 0.6945, + "step": 6102 + }, + { + "epoch": 1.546041798606713, + "grad_norm": 3.6192595958709717, + "learning_rate": 8.272461236179792e-06, + "loss": 0.8607, + "step": 6103 + }, + { + "epoch": 1.5462951234958835, + "grad_norm": 4.0152764320373535, + "learning_rate": 8.27182772002444e-06, + "loss": 0.8548, + "step": 6104 + }, + { + "epoch": 1.5465484483850538, + "grad_norm": 3.598031520843506, + "learning_rate": 8.271194111997698e-06, + "loss": 0.8291, + "step": 6105 + }, + { + "epoch": 1.546801773274224, + "grad_norm": 3.3997223377227783, + "learning_rate": 8.270560412117359e-06, + "loss": 0.7448, + "step": 6106 + }, + { + "epoch": 1.5470550981633946, + "grad_norm": 4.2239556312561035, + "learning_rate": 8.269926620401216e-06, + "loss": 0.887, + "step": 6107 + }, + { + "epoch": 1.5473084230525649, + "grad_norm": 3.8402645587921143, + "learning_rate": 8.269292736867067e-06, + "loss": 0.7522, + "step": 6108 + }, + { + "epoch": 1.5475617479417352, + "grad_norm": 3.5056710243225098, + "learning_rate": 8.26865876153271e-06, + "loss": 0.8097, + "step": 6109 + }, + { + "epoch": 1.5478150728309057, + "grad_norm": 4.089310169219971, + "learning_rate": 8.268024694415949e-06, + "loss": 0.8353, + "step": 6110 + }, + { + "epoch": 1.548068397720076, + "grad_norm": 3.8327584266662598, + "learning_rate": 8.267390535534581e-06, + "loss": 0.8954, + "step": 6111 + }, + { + "epoch": 1.5483217226092463, + "grad_norm": 3.519998550415039, + "learning_rate": 8.266756284906421e-06, + "loss": 0.7823, + "step": 6112 + }, + { + "epoch": 1.5485750474984168, + "grad_norm": 3.7734134197235107, + "learning_rate": 8.266121942549276e-06, + "loss": 0.7635, + "step": 6113 + }, + { + "epoch": 1.548828372387587, + "grad_norm": 3.833378553390503, + "learning_rate": 8.265487508480958e-06, + "loss": 0.8667, + "step": 6114 + }, + { + "epoch": 1.5490816972767574, + "grad_norm": 3.733335018157959, + "learning_rate": 8.264852982719282e-06, + "loss": 0.8155, + "step": 6115 + }, + { + "epoch": 1.5493350221659279, + "grad_norm": 3.489326238632202, + "learning_rate": 8.264218365282061e-06, + "loss": 0.7451, + "step": 6116 + }, + { + "epoch": 1.5495883470550982, + "grad_norm": 4.047409534454346, + "learning_rate": 8.263583656187122e-06, + "loss": 0.9377, + "step": 6117 + }, + { + "epoch": 1.5498416719442685, + "grad_norm": 3.5313308238983154, + "learning_rate": 8.26294885545228e-06, + "loss": 0.7151, + "step": 6118 + }, + { + "epoch": 1.550094996833439, + "grad_norm": 3.8570024967193604, + "learning_rate": 8.262313963095366e-06, + "loss": 0.9263, + "step": 6119 + }, + { + "epoch": 1.5503483217226093, + "grad_norm": 4.1115899085998535, + "learning_rate": 8.261678979134204e-06, + "loss": 0.797, + "step": 6120 + }, + { + "epoch": 1.5506016466117796, + "grad_norm": 3.700599193572998, + "learning_rate": 8.261043903586625e-06, + "loss": 0.8243, + "step": 6121 + }, + { + "epoch": 1.55085497150095, + "grad_norm": 3.876948595046997, + "learning_rate": 8.260408736470462e-06, + "loss": 0.8056, + "step": 6122 + }, + { + "epoch": 1.5511082963901204, + "grad_norm": 3.469296932220459, + "learning_rate": 8.259773477803548e-06, + "loss": 0.7443, + "step": 6123 + }, + { + "epoch": 1.5513616212792907, + "grad_norm": 3.2719638347625732, + "learning_rate": 8.259138127603725e-06, + "loss": 0.7492, + "step": 6124 + }, + { + "epoch": 1.5516149461684612, + "grad_norm": 3.869877576828003, + "learning_rate": 8.25850268588883e-06, + "loss": 0.8034, + "step": 6125 + }, + { + "epoch": 1.5518682710576313, + "grad_norm": 3.4584498405456543, + "learning_rate": 8.257867152676705e-06, + "loss": 0.6768, + "step": 6126 + }, + { + "epoch": 1.5521215959468018, + "grad_norm": 4.024600982666016, + "learning_rate": 8.257231527985198e-06, + "loss": 0.7455, + "step": 6127 + }, + { + "epoch": 1.5523749208359723, + "grad_norm": 3.710503339767456, + "learning_rate": 8.256595811832158e-06, + "loss": 0.7653, + "step": 6128 + }, + { + "epoch": 1.5526282457251424, + "grad_norm": 3.544790267944336, + "learning_rate": 8.255960004235433e-06, + "loss": 0.7088, + "step": 6129 + }, + { + "epoch": 1.5528815706143129, + "grad_norm": 3.613865613937378, + "learning_rate": 8.255324105212876e-06, + "loss": 0.7456, + "step": 6130 + }, + { + "epoch": 1.5531348955034834, + "grad_norm": 3.8291051387786865, + "learning_rate": 8.254688114782346e-06, + "loss": 0.86, + "step": 6131 + }, + { + "epoch": 1.5533882203926535, + "grad_norm": 3.9924874305725098, + "learning_rate": 8.254052032961697e-06, + "loss": 0.8344, + "step": 6132 + }, + { + "epoch": 1.553641545281824, + "grad_norm": 4.155417442321777, + "learning_rate": 8.253415859768791e-06, + "loss": 0.705, + "step": 6133 + }, + { + "epoch": 1.5538948701709943, + "grad_norm": 3.7591300010681152, + "learning_rate": 8.252779595221496e-06, + "loss": 0.863, + "step": 6134 + }, + { + "epoch": 1.5541481950601646, + "grad_norm": 4.381962776184082, + "learning_rate": 8.252143239337673e-06, + "loss": 0.8352, + "step": 6135 + }, + { + "epoch": 1.554401519949335, + "grad_norm": 3.820098876953125, + "learning_rate": 8.25150679213519e-06, + "loss": 0.7966, + "step": 6136 + }, + { + "epoch": 1.5546548448385054, + "grad_norm": 4.8242926597595215, + "learning_rate": 8.250870253631924e-06, + "loss": 0.7907, + "step": 6137 + }, + { + "epoch": 1.5549081697276756, + "grad_norm": 4.002001762390137, + "learning_rate": 8.250233623845742e-06, + "loss": 0.7635, + "step": 6138 + }, + { + "epoch": 1.5551614946168462, + "grad_norm": 3.7615208625793457, + "learning_rate": 8.249596902794526e-06, + "loss": 0.8191, + "step": 6139 + }, + { + "epoch": 1.5554148195060165, + "grad_norm": 3.6249327659606934, + "learning_rate": 8.24896009049615e-06, + "loss": 0.833, + "step": 6140 + }, + { + "epoch": 1.5556681443951867, + "grad_norm": 3.682542324066162, + "learning_rate": 8.248323186968496e-06, + "loss": 0.8265, + "step": 6141 + }, + { + "epoch": 1.5559214692843573, + "grad_norm": 4.007565975189209, + "learning_rate": 8.24768619222945e-06, + "loss": 0.968, + "step": 6142 + }, + { + "epoch": 1.5561747941735276, + "grad_norm": 4.109283924102783, + "learning_rate": 8.2470491062969e-06, + "loss": 0.9266, + "step": 6143 + }, + { + "epoch": 1.5564281190626978, + "grad_norm": 3.45259428024292, + "learning_rate": 8.24641192918873e-06, + "loss": 0.7521, + "step": 6144 + }, + { + "epoch": 1.5566814439518684, + "grad_norm": 3.5808463096618652, + "learning_rate": 8.245774660922838e-06, + "loss": 0.796, + "step": 6145 + }, + { + "epoch": 1.5569347688410387, + "grad_norm": 3.752659797668457, + "learning_rate": 8.245137301517112e-06, + "loss": 0.8555, + "step": 6146 + }, + { + "epoch": 1.557188093730209, + "grad_norm": 3.394653797149658, + "learning_rate": 8.244499850989453e-06, + "loss": 0.669, + "step": 6147 + }, + { + "epoch": 1.5574414186193795, + "grad_norm": 3.9455931186676025, + "learning_rate": 8.243862309357757e-06, + "loss": 0.7548, + "step": 6148 + }, + { + "epoch": 1.5576947435085498, + "grad_norm": 3.751262664794922, + "learning_rate": 8.243224676639929e-06, + "loss": 0.7901, + "step": 6149 + }, + { + "epoch": 1.55794806839772, + "grad_norm": 3.648343324661255, + "learning_rate": 8.242586952853872e-06, + "loss": 0.8097, + "step": 6150 + }, + { + "epoch": 1.5582013932868906, + "grad_norm": 3.5928211212158203, + "learning_rate": 8.241949138017494e-06, + "loss": 0.7535, + "step": 6151 + }, + { + "epoch": 1.5584547181760608, + "grad_norm": 3.3643546104431152, + "learning_rate": 8.2413112321487e-06, + "loss": 0.6916, + "step": 6152 + }, + { + "epoch": 1.5587080430652311, + "grad_norm": 3.8847146034240723, + "learning_rate": 8.24067323526541e-06, + "loss": 0.7521, + "step": 6153 + }, + { + "epoch": 1.5589613679544017, + "grad_norm": 3.9309847354888916, + "learning_rate": 8.240035147385532e-06, + "loss": 0.9314, + "step": 6154 + }, + { + "epoch": 1.5592146928435717, + "grad_norm": 3.60764741897583, + "learning_rate": 8.239396968526988e-06, + "loss": 0.7827, + "step": 6155 + }, + { + "epoch": 1.5594680177327422, + "grad_norm": 4.254142761230469, + "learning_rate": 8.238758698707693e-06, + "loss": 0.8276, + "step": 6156 + }, + { + "epoch": 1.5597213426219128, + "grad_norm": 3.733480930328369, + "learning_rate": 8.238120337945573e-06, + "loss": 0.8402, + "step": 6157 + }, + { + "epoch": 1.5599746675110828, + "grad_norm": 3.5674009323120117, + "learning_rate": 8.237481886258552e-06, + "loss": 0.7259, + "step": 6158 + }, + { + "epoch": 1.5602279924002533, + "grad_norm": 3.84652042388916, + "learning_rate": 8.236843343664555e-06, + "loss": 0.709, + "step": 6159 + }, + { + "epoch": 1.5604813172894236, + "grad_norm": 3.706516742706299, + "learning_rate": 8.236204710181515e-06, + "loss": 0.7761, + "step": 6160 + }, + { + "epoch": 1.560734642178594, + "grad_norm": 3.8853776454925537, + "learning_rate": 8.235565985827363e-06, + "loss": 0.7936, + "step": 6161 + }, + { + "epoch": 1.5609879670677644, + "grad_norm": 3.6674692630767822, + "learning_rate": 8.234927170620034e-06, + "loss": 0.7314, + "step": 6162 + }, + { + "epoch": 1.5612412919569347, + "grad_norm": 3.5735464096069336, + "learning_rate": 8.234288264577469e-06, + "loss": 0.7055, + "step": 6163 + }, + { + "epoch": 1.561494616846105, + "grad_norm": 3.875180721282959, + "learning_rate": 8.233649267717602e-06, + "loss": 0.7637, + "step": 6164 + }, + { + "epoch": 1.5617479417352755, + "grad_norm": 3.7974531650543213, + "learning_rate": 8.23301018005838e-06, + "loss": 0.7814, + "step": 6165 + }, + { + "epoch": 1.5620012666244458, + "grad_norm": 3.64311146736145, + "learning_rate": 8.232371001617748e-06, + "loss": 0.7853, + "step": 6166 + }, + { + "epoch": 1.5622545915136161, + "grad_norm": 3.3769659996032715, + "learning_rate": 8.231731732413653e-06, + "loss": 0.7777, + "step": 6167 + }, + { + "epoch": 1.5625079164027866, + "grad_norm": 3.6891138553619385, + "learning_rate": 8.231092372464048e-06, + "loss": 0.8313, + "step": 6168 + }, + { + "epoch": 1.562761241291957, + "grad_norm": 3.456343412399292, + "learning_rate": 8.230452921786878e-06, + "loss": 0.8007, + "step": 6169 + }, + { + "epoch": 1.5630145661811272, + "grad_norm": 3.2791130542755127, + "learning_rate": 8.229813380400109e-06, + "loss": 0.695, + "step": 6170 + }, + { + "epoch": 1.5632678910702977, + "grad_norm": 4.603332042694092, + "learning_rate": 8.229173748321691e-06, + "loss": 0.8945, + "step": 6171 + }, + { + "epoch": 1.563521215959468, + "grad_norm": 3.6999411582946777, + "learning_rate": 8.228534025569589e-06, + "loss": 0.7581, + "step": 6172 + }, + { + "epoch": 1.5637745408486383, + "grad_norm": 3.513685464859009, + "learning_rate": 8.227894212161765e-06, + "loss": 0.7587, + "step": 6173 + }, + { + "epoch": 1.5640278657378088, + "grad_norm": 3.9109554290771484, + "learning_rate": 8.227254308116184e-06, + "loss": 0.7102, + "step": 6174 + }, + { + "epoch": 1.5642811906269791, + "grad_norm": 3.712085247039795, + "learning_rate": 8.226614313450814e-06, + "loss": 0.8107, + "step": 6175 + }, + { + "epoch": 1.5645345155161494, + "grad_norm": 3.6505749225616455, + "learning_rate": 8.225974228183626e-06, + "loss": 0.7926, + "step": 6176 + }, + { + "epoch": 1.56478784040532, + "grad_norm": 4.02791166305542, + "learning_rate": 8.225334052332596e-06, + "loss": 0.8007, + "step": 6177 + }, + { + "epoch": 1.5650411652944902, + "grad_norm": 4.112579345703125, + "learning_rate": 8.224693785915697e-06, + "loss": 0.8168, + "step": 6178 + }, + { + "epoch": 1.5652944901836605, + "grad_norm": 3.6569886207580566, + "learning_rate": 8.224053428950904e-06, + "loss": 0.8175, + "step": 6179 + }, + { + "epoch": 1.565547815072831, + "grad_norm": 3.2978391647338867, + "learning_rate": 8.223412981456205e-06, + "loss": 0.8381, + "step": 6180 + }, + { + "epoch": 1.5658011399620013, + "grad_norm": 4.3896894454956055, + "learning_rate": 8.22277244344958e-06, + "loss": 0.8294, + "step": 6181 + }, + { + "epoch": 1.5660544648511716, + "grad_norm": 3.775959014892578, + "learning_rate": 8.222131814949015e-06, + "loss": 0.8364, + "step": 6182 + }, + { + "epoch": 1.5663077897403421, + "grad_norm": 4.301949501037598, + "learning_rate": 8.221491095972498e-06, + "loss": 0.7441, + "step": 6183 + }, + { + "epoch": 1.5665611146295122, + "grad_norm": 3.5109403133392334, + "learning_rate": 8.220850286538022e-06, + "loss": 0.7974, + "step": 6184 + }, + { + "epoch": 1.5668144395186827, + "grad_norm": 3.6153130531311035, + "learning_rate": 8.22020938666358e-06, + "loss": 0.6844, + "step": 6185 + }, + { + "epoch": 1.5670677644078532, + "grad_norm": 3.187896251678467, + "learning_rate": 8.219568396367166e-06, + "loss": 0.8435, + "step": 6186 + }, + { + "epoch": 1.5673210892970233, + "grad_norm": 3.3308053016662598, + "learning_rate": 8.21892731566678e-06, + "loss": 0.7454, + "step": 6187 + }, + { + "epoch": 1.5675744141861938, + "grad_norm": 3.4108846187591553, + "learning_rate": 8.218286144580425e-06, + "loss": 0.7689, + "step": 6188 + }, + { + "epoch": 1.567827739075364, + "grad_norm": 3.6168429851531982, + "learning_rate": 8.217644883126103e-06, + "loss": 0.7955, + "step": 6189 + }, + { + "epoch": 1.5680810639645344, + "grad_norm": 3.7692012786865234, + "learning_rate": 8.21700353132182e-06, + "loss": 0.9237, + "step": 6190 + }, + { + "epoch": 1.568334388853705, + "grad_norm": 3.907435894012451, + "learning_rate": 8.216362089185587e-06, + "loss": 0.7422, + "step": 6191 + }, + { + "epoch": 1.5685877137428752, + "grad_norm": 4.282109260559082, + "learning_rate": 8.215720556735413e-06, + "loss": 0.9337, + "step": 6192 + }, + { + "epoch": 1.5688410386320455, + "grad_norm": 3.630056142807007, + "learning_rate": 8.215078933989314e-06, + "loss": 0.714, + "step": 6193 + }, + { + "epoch": 1.569094363521216, + "grad_norm": 3.63059663772583, + "learning_rate": 8.214437220965305e-06, + "loss": 0.7876, + "step": 6194 + }, + { + "epoch": 1.5693476884103863, + "grad_norm": 3.2414753437042236, + "learning_rate": 8.213795417681405e-06, + "loss": 0.6832, + "step": 6195 + }, + { + "epoch": 1.5696010132995566, + "grad_norm": 4.0775299072265625, + "learning_rate": 8.213153524155635e-06, + "loss": 0.9201, + "step": 6196 + }, + { + "epoch": 1.569854338188727, + "grad_norm": 3.5848569869995117, + "learning_rate": 8.212511540406022e-06, + "loss": 0.7547, + "step": 6197 + }, + { + "epoch": 1.5701076630778974, + "grad_norm": 3.9914603233337402, + "learning_rate": 8.211869466450589e-06, + "loss": 0.8342, + "step": 6198 + }, + { + "epoch": 1.5703609879670677, + "grad_norm": 3.6283152103424072, + "learning_rate": 8.211227302307367e-06, + "loss": 0.7353, + "step": 6199 + }, + { + "epoch": 1.5706143128562382, + "grad_norm": 3.5175790786743164, + "learning_rate": 8.210585047994389e-06, + "loss": 0.745, + "step": 6200 + }, + { + "epoch": 1.5708676377454085, + "grad_norm": 3.6858301162719727, + "learning_rate": 8.209942703529685e-06, + "loss": 0.7077, + "step": 6201 + }, + { + "epoch": 1.5711209626345788, + "grad_norm": 3.3568482398986816, + "learning_rate": 8.209300268931295e-06, + "loss": 0.7556, + "step": 6202 + }, + { + "epoch": 1.5713742875237493, + "grad_norm": 3.8684773445129395, + "learning_rate": 8.20865774421726e-06, + "loss": 0.8231, + "step": 6203 + }, + { + "epoch": 1.5716276124129196, + "grad_norm": 3.443450450897217, + "learning_rate": 8.208015129405615e-06, + "loss": 0.6694, + "step": 6204 + }, + { + "epoch": 1.5718809373020899, + "grad_norm": 4.031293869018555, + "learning_rate": 8.207372424514413e-06, + "loss": 0.8379, + "step": 6205 + }, + { + "epoch": 1.5721342621912604, + "grad_norm": 3.6145505905151367, + "learning_rate": 8.206729629561693e-06, + "loss": 0.7973, + "step": 6206 + }, + { + "epoch": 1.5723875870804307, + "grad_norm": 3.6999881267547607, + "learning_rate": 8.206086744565509e-06, + "loss": 0.9395, + "step": 6207 + }, + { + "epoch": 1.572640911969601, + "grad_norm": 3.5318551063537598, + "learning_rate": 8.205443769543914e-06, + "loss": 0.7197, + "step": 6208 + }, + { + "epoch": 1.5728942368587715, + "grad_norm": 3.9561591148376465, + "learning_rate": 8.20480070451496e-06, + "loss": 0.7936, + "step": 6209 + }, + { + "epoch": 1.5731475617479416, + "grad_norm": 4.161046028137207, + "learning_rate": 8.204157549496701e-06, + "loss": 0.9315, + "step": 6210 + }, + { + "epoch": 1.573400886637112, + "grad_norm": 4.216246128082275, + "learning_rate": 8.203514304507201e-06, + "loss": 0.839, + "step": 6211 + }, + { + "epoch": 1.5736542115262826, + "grad_norm": 4.26318883895874, + "learning_rate": 8.202870969564522e-06, + "loss": 0.8246, + "step": 6212 + }, + { + "epoch": 1.5739075364154527, + "grad_norm": 3.482059955596924, + "learning_rate": 8.202227544686727e-06, + "loss": 0.8972, + "step": 6213 + }, + { + "epoch": 1.5741608613046232, + "grad_norm": 4.002997875213623, + "learning_rate": 8.201584029891883e-06, + "loss": 0.7151, + "step": 6214 + }, + { + "epoch": 1.5744141861937937, + "grad_norm": 4.160106658935547, + "learning_rate": 8.20094042519806e-06, + "loss": 0.8671, + "step": 6215 + }, + { + "epoch": 1.5746675110829638, + "grad_norm": 3.74727725982666, + "learning_rate": 8.20029673062333e-06, + "loss": 0.7904, + "step": 6216 + }, + { + "epoch": 1.5749208359721343, + "grad_norm": 4.062234878540039, + "learning_rate": 8.199652946185768e-06, + "loss": 0.775, + "step": 6217 + }, + { + "epoch": 1.5751741608613046, + "grad_norm": 3.474189281463623, + "learning_rate": 8.19900907190345e-06, + "loss": 0.7235, + "step": 6218 + }, + { + "epoch": 1.5754274857504749, + "grad_norm": 3.7074568271636963, + "learning_rate": 8.198365107794457e-06, + "loss": 0.7201, + "step": 6219 + }, + { + "epoch": 1.5756808106396454, + "grad_norm": 4.050851345062256, + "learning_rate": 8.197721053876871e-06, + "loss": 0.9168, + "step": 6220 + }, + { + "epoch": 1.5759341355288157, + "grad_norm": 3.330204486846924, + "learning_rate": 8.197076910168777e-06, + "loss": 0.82, + "step": 6221 + }, + { + "epoch": 1.576187460417986, + "grad_norm": 3.733776569366455, + "learning_rate": 8.196432676688261e-06, + "loss": 0.7382, + "step": 6222 + }, + { + "epoch": 1.5764407853071565, + "grad_norm": 4.167757034301758, + "learning_rate": 8.195788353453412e-06, + "loss": 0.8078, + "step": 6223 + }, + { + "epoch": 1.5766941101963268, + "grad_norm": 3.423680305480957, + "learning_rate": 8.195143940482326e-06, + "loss": 0.7227, + "step": 6224 + }, + { + "epoch": 1.576947435085497, + "grad_norm": 3.5550029277801514, + "learning_rate": 8.194499437793097e-06, + "loss": 0.7647, + "step": 6225 + }, + { + "epoch": 1.5772007599746676, + "grad_norm": 3.564436912536621, + "learning_rate": 8.193854845403819e-06, + "loss": 0.7744, + "step": 6226 + }, + { + "epoch": 1.5774540848638379, + "grad_norm": 3.8896231651306152, + "learning_rate": 8.193210163332595e-06, + "loss": 0.8559, + "step": 6227 + }, + { + "epoch": 1.5777074097530082, + "grad_norm": 3.517395257949829, + "learning_rate": 8.192565391597524e-06, + "loss": 0.7062, + "step": 6228 + }, + { + "epoch": 1.5779607346421787, + "grad_norm": 3.6545779705047607, + "learning_rate": 8.191920530216714e-06, + "loss": 0.9066, + "step": 6229 + }, + { + "epoch": 1.578214059531349, + "grad_norm": 3.967564105987549, + "learning_rate": 8.19127557920827e-06, + "loss": 0.799, + "step": 6230 + }, + { + "epoch": 1.5784673844205193, + "grad_norm": 3.537055015563965, + "learning_rate": 8.190630538590307e-06, + "loss": 0.7337, + "step": 6231 + }, + { + "epoch": 1.5787207093096898, + "grad_norm": 3.7698004245758057, + "learning_rate": 8.189985408380934e-06, + "loss": 0.9251, + "step": 6232 + }, + { + "epoch": 1.57897403419886, + "grad_norm": 3.4278464317321777, + "learning_rate": 8.189340188598263e-06, + "loss": 0.7252, + "step": 6233 + }, + { + "epoch": 1.5792273590880304, + "grad_norm": 3.5584816932678223, + "learning_rate": 8.188694879260415e-06, + "loss": 0.7443, + "step": 6234 + }, + { + "epoch": 1.5794806839772009, + "grad_norm": 3.652059555053711, + "learning_rate": 8.18804948038551e-06, + "loss": 0.766, + "step": 6235 + }, + { + "epoch": 1.5797340088663712, + "grad_norm": 3.8366751670837402, + "learning_rate": 8.187403991991668e-06, + "loss": 0.7452, + "step": 6236 + }, + { + "epoch": 1.5799873337555415, + "grad_norm": 3.522740602493286, + "learning_rate": 8.186758414097018e-06, + "loss": 0.763, + "step": 6237 + }, + { + "epoch": 1.580240658644712, + "grad_norm": 3.8008928298950195, + "learning_rate": 8.186112746719683e-06, + "loss": 0.8408, + "step": 6238 + }, + { + "epoch": 1.580493983533882, + "grad_norm": 3.2887730598449707, + "learning_rate": 8.185466989877797e-06, + "loss": 0.652, + "step": 6239 + }, + { + "epoch": 1.5807473084230526, + "grad_norm": 3.574138879776001, + "learning_rate": 8.18482114358949e-06, + "loss": 0.7602, + "step": 6240 + }, + { + "epoch": 1.581000633312223, + "grad_norm": 4.546648025512695, + "learning_rate": 8.184175207872899e-06, + "loss": 0.9222, + "step": 6241 + }, + { + "epoch": 1.5812539582013931, + "grad_norm": 3.4243948459625244, + "learning_rate": 8.183529182746159e-06, + "loss": 0.8364, + "step": 6242 + }, + { + "epoch": 1.5815072830905637, + "grad_norm": 3.651503324508667, + "learning_rate": 8.182883068227412e-06, + "loss": 0.7696, + "step": 6243 + }, + { + "epoch": 1.5817606079797342, + "grad_norm": 3.653803825378418, + "learning_rate": 8.182236864334801e-06, + "loss": 0.7628, + "step": 6244 + }, + { + "epoch": 1.5820139328689042, + "grad_norm": 3.7829039096832275, + "learning_rate": 8.181590571086471e-06, + "loss": 0.8652, + "step": 6245 + }, + { + "epoch": 1.5822672577580748, + "grad_norm": 3.639207601547241, + "learning_rate": 8.180944188500567e-06, + "loss": 0.8595, + "step": 6246 + }, + { + "epoch": 1.582520582647245, + "grad_norm": 3.445230484008789, + "learning_rate": 8.180297716595242e-06, + "loss": 0.7211, + "step": 6247 + }, + { + "epoch": 1.5827739075364153, + "grad_norm": 3.762030601501465, + "learning_rate": 8.179651155388648e-06, + "loss": 0.7447, + "step": 6248 + }, + { + "epoch": 1.5830272324255859, + "grad_norm": 4.305011749267578, + "learning_rate": 8.17900450489894e-06, + "loss": 0.9536, + "step": 6249 + }, + { + "epoch": 1.5832805573147561, + "grad_norm": 3.5451323986053467, + "learning_rate": 8.178357765144274e-06, + "loss": 0.8557, + "step": 6250 + }, + { + "epoch": 1.5835338822039264, + "grad_norm": 3.3520631790161133, + "learning_rate": 8.177710936142813e-06, + "loss": 0.7228, + "step": 6251 + }, + { + "epoch": 1.583787207093097, + "grad_norm": 3.792992115020752, + "learning_rate": 8.177064017912717e-06, + "loss": 0.7737, + "step": 6252 + }, + { + "epoch": 1.5840405319822672, + "grad_norm": 3.613748788833618, + "learning_rate": 8.176417010472153e-06, + "loss": 0.8975, + "step": 6253 + }, + { + "epoch": 1.5842938568714375, + "grad_norm": 3.6810131072998047, + "learning_rate": 8.175769913839289e-06, + "loss": 0.7641, + "step": 6254 + }, + { + "epoch": 1.584547181760608, + "grad_norm": 3.514597177505493, + "learning_rate": 8.175122728032292e-06, + "loss": 0.6976, + "step": 6255 + }, + { + "epoch": 1.5848005066497783, + "grad_norm": 3.624741554260254, + "learning_rate": 8.174475453069339e-06, + "loss": 0.7989, + "step": 6256 + }, + { + "epoch": 1.5850538315389486, + "grad_norm": 3.5870413780212402, + "learning_rate": 8.173828088968603e-06, + "loss": 0.7174, + "step": 6257 + }, + { + "epoch": 1.5853071564281191, + "grad_norm": 3.642335891723633, + "learning_rate": 8.17318063574826e-06, + "loss": 0.7536, + "step": 6258 + }, + { + "epoch": 1.5855604813172894, + "grad_norm": 3.785813331604004, + "learning_rate": 8.172533093426493e-06, + "loss": 0.718, + "step": 6259 + }, + { + "epoch": 1.5858138062064597, + "grad_norm": 4.186957359313965, + "learning_rate": 8.171885462021485e-06, + "loss": 0.8191, + "step": 6260 + }, + { + "epoch": 1.5860671310956302, + "grad_norm": 4.091099739074707, + "learning_rate": 8.171237741551416e-06, + "loss": 0.9345, + "step": 6261 + }, + { + "epoch": 1.5863204559848005, + "grad_norm": 4.292994976043701, + "learning_rate": 8.170589932034482e-06, + "loss": 0.788, + "step": 6262 + }, + { + "epoch": 1.5865737808739708, + "grad_norm": 3.8370442390441895, + "learning_rate": 8.169942033488867e-06, + "loss": 0.7674, + "step": 6263 + }, + { + "epoch": 1.5868271057631413, + "grad_norm": 4.250424385070801, + "learning_rate": 8.169294045932764e-06, + "loss": 0.8521, + "step": 6264 + }, + { + "epoch": 1.5870804306523116, + "grad_norm": 4.1435227394104, + "learning_rate": 8.16864596938437e-06, + "loss": 1.0227, + "step": 6265 + }, + { + "epoch": 1.587333755541482, + "grad_norm": 3.793520927429199, + "learning_rate": 8.167997803861882e-06, + "loss": 0.86, + "step": 6266 + }, + { + "epoch": 1.5875870804306524, + "grad_norm": 3.6536591053009033, + "learning_rate": 8.167349549383502e-06, + "loss": 0.8048, + "step": 6267 + }, + { + "epoch": 1.5878404053198225, + "grad_norm": 3.6870744228363037, + "learning_rate": 8.16670120596743e-06, + "loss": 0.8393, + "step": 6268 + }, + { + "epoch": 1.588093730208993, + "grad_norm": 3.8530001640319824, + "learning_rate": 8.166052773631874e-06, + "loss": 0.8902, + "step": 6269 + }, + { + "epoch": 1.5883470550981635, + "grad_norm": 3.391432523727417, + "learning_rate": 8.165404252395038e-06, + "loss": 0.7476, + "step": 6270 + }, + { + "epoch": 1.5886003799873336, + "grad_norm": 4.023676872253418, + "learning_rate": 8.164755642275135e-06, + "loss": 0.7463, + "step": 6271 + }, + { + "epoch": 1.5888537048765041, + "grad_norm": 3.3576619625091553, + "learning_rate": 8.164106943290378e-06, + "loss": 0.7238, + "step": 6272 + }, + { + "epoch": 1.5891070297656746, + "grad_norm": 3.752878189086914, + "learning_rate": 8.163458155458978e-06, + "loss": 0.7281, + "step": 6273 + }, + { + "epoch": 1.5893603546548447, + "grad_norm": 3.7723312377929688, + "learning_rate": 8.162809278799157e-06, + "loss": 0.8379, + "step": 6274 + }, + { + "epoch": 1.5896136795440152, + "grad_norm": 3.6335339546203613, + "learning_rate": 8.162160313329136e-06, + "loss": 0.749, + "step": 6275 + }, + { + "epoch": 1.5898670044331855, + "grad_norm": 3.7029337882995605, + "learning_rate": 8.161511259067132e-06, + "loss": 0.8376, + "step": 6276 + }, + { + "epoch": 1.5901203293223558, + "grad_norm": 3.6301651000976562, + "learning_rate": 8.160862116031377e-06, + "loss": 0.8992, + "step": 6277 + }, + { + "epoch": 1.5903736542115263, + "grad_norm": 3.716869592666626, + "learning_rate": 8.160212884240092e-06, + "loss": 0.922, + "step": 6278 + }, + { + "epoch": 1.5906269791006966, + "grad_norm": 3.329904317855835, + "learning_rate": 8.159563563711512e-06, + "loss": 0.7718, + "step": 6279 + }, + { + "epoch": 1.590880303989867, + "grad_norm": 3.7120020389556885, + "learning_rate": 8.158914154463867e-06, + "loss": 0.7568, + "step": 6280 + }, + { + "epoch": 1.5911336288790374, + "grad_norm": 3.8210768699645996, + "learning_rate": 8.158264656515394e-06, + "loss": 0.7399, + "step": 6281 + }, + { + "epoch": 1.5913869537682077, + "grad_norm": 3.99299955368042, + "learning_rate": 8.157615069884329e-06, + "loss": 0.8497, + "step": 6282 + }, + { + "epoch": 1.591640278657378, + "grad_norm": 3.907954692840576, + "learning_rate": 8.156965394588912e-06, + "loss": 0.8495, + "step": 6283 + }, + { + "epoch": 1.5918936035465485, + "grad_norm": 4.188817977905273, + "learning_rate": 8.156315630647388e-06, + "loss": 0.7751, + "step": 6284 + }, + { + "epoch": 1.5921469284357188, + "grad_norm": 3.5710716247558594, + "learning_rate": 8.155665778077999e-06, + "loss": 0.6729, + "step": 6285 + }, + { + "epoch": 1.592400253324889, + "grad_norm": 3.619842529296875, + "learning_rate": 8.155015836898996e-06, + "loss": 0.7356, + "step": 6286 + }, + { + "epoch": 1.5926535782140596, + "grad_norm": 3.6929092407226562, + "learning_rate": 8.154365807128626e-06, + "loss": 0.7076, + "step": 6287 + }, + { + "epoch": 1.59290690310323, + "grad_norm": 3.8376362323760986, + "learning_rate": 8.153715688785142e-06, + "loss": 0.7177, + "step": 6288 + }, + { + "epoch": 1.5931602279924002, + "grad_norm": 4.196521282196045, + "learning_rate": 8.153065481886799e-06, + "loss": 0.8779, + "step": 6289 + }, + { + "epoch": 1.5934135528815707, + "grad_norm": 3.584399700164795, + "learning_rate": 8.152415186451855e-06, + "loss": 0.6938, + "step": 6290 + }, + { + "epoch": 1.593666877770741, + "grad_norm": 3.694575071334839, + "learning_rate": 8.151764802498573e-06, + "loss": 0.8249, + "step": 6291 + }, + { + "epoch": 1.5939202026599113, + "grad_norm": 3.6853747367858887, + "learning_rate": 8.15111433004521e-06, + "loss": 0.7948, + "step": 6292 + }, + { + "epoch": 1.5941735275490818, + "grad_norm": 3.983799934387207, + "learning_rate": 8.150463769110032e-06, + "loss": 0.916, + "step": 6293 + }, + { + "epoch": 1.594426852438252, + "grad_norm": 3.8763108253479004, + "learning_rate": 8.149813119711309e-06, + "loss": 0.8828, + "step": 6294 + }, + { + "epoch": 1.5946801773274224, + "grad_norm": 3.5073606967926025, + "learning_rate": 8.14916238186731e-06, + "loss": 0.7702, + "step": 6295 + }, + { + "epoch": 1.594933502216593, + "grad_norm": 3.7843570709228516, + "learning_rate": 8.14851155559631e-06, + "loss": 0.7596, + "step": 6296 + }, + { + "epoch": 1.595186827105763, + "grad_norm": 3.7611119747161865, + "learning_rate": 8.147860640916578e-06, + "loss": 0.7774, + "step": 6297 + }, + { + "epoch": 1.5954401519949335, + "grad_norm": 3.4019343852996826, + "learning_rate": 8.147209637846396e-06, + "loss": 0.8387, + "step": 6298 + }, + { + "epoch": 1.595693476884104, + "grad_norm": 4.175518989562988, + "learning_rate": 8.14655854640404e-06, + "loss": 0.9184, + "step": 6299 + }, + { + "epoch": 1.595946801773274, + "grad_norm": 3.809389591217041, + "learning_rate": 8.145907366607798e-06, + "loss": 0.848, + "step": 6300 + }, + { + "epoch": 1.5962001266624446, + "grad_norm": 3.6634557247161865, + "learning_rate": 8.145256098475952e-06, + "loss": 0.7103, + "step": 6301 + }, + { + "epoch": 1.596453451551615, + "grad_norm": 3.7986724376678467, + "learning_rate": 8.144604742026785e-06, + "loss": 0.7568, + "step": 6302 + }, + { + "epoch": 1.5967067764407852, + "grad_norm": 3.6229984760284424, + "learning_rate": 8.143953297278593e-06, + "loss": 0.7285, + "step": 6303 + }, + { + "epoch": 1.5969601013299557, + "grad_norm": 3.87339186668396, + "learning_rate": 8.143301764249664e-06, + "loss": 0.735, + "step": 6304 + }, + { + "epoch": 1.597213426219126, + "grad_norm": 3.9502570629119873, + "learning_rate": 8.142650142958296e-06, + "loss": 0.7562, + "step": 6305 + }, + { + "epoch": 1.5974667511082963, + "grad_norm": 4.399330139160156, + "learning_rate": 8.141998433422787e-06, + "loss": 0.7972, + "step": 6306 + }, + { + "epoch": 1.5977200759974668, + "grad_norm": 4.016336917877197, + "learning_rate": 8.141346635661432e-06, + "loss": 0.8932, + "step": 6307 + }, + { + "epoch": 1.597973400886637, + "grad_norm": 3.827622413635254, + "learning_rate": 8.140694749692538e-06, + "loss": 0.8481, + "step": 6308 + }, + { + "epoch": 1.5982267257758074, + "grad_norm": 3.6018435955047607, + "learning_rate": 8.140042775534405e-06, + "loss": 0.7741, + "step": 6309 + }, + { + "epoch": 1.598480050664978, + "grad_norm": 3.784566879272461, + "learning_rate": 8.139390713205341e-06, + "loss": 0.932, + "step": 6310 + }, + { + "epoch": 1.5987333755541482, + "grad_norm": 4.132288455963135, + "learning_rate": 8.138738562723661e-06, + "loss": 0.8902, + "step": 6311 + }, + { + "epoch": 1.5989867004433185, + "grad_norm": 3.89188289642334, + "learning_rate": 8.138086324107673e-06, + "loss": 0.8423, + "step": 6312 + }, + { + "epoch": 1.599240025332489, + "grad_norm": 4.1564788818359375, + "learning_rate": 8.13743399737569e-06, + "loss": 0.8726, + "step": 6313 + }, + { + "epoch": 1.5994933502216593, + "grad_norm": 4.011749267578125, + "learning_rate": 8.136781582546031e-06, + "loss": 0.9034, + "step": 6314 + }, + { + "epoch": 1.5997466751108296, + "grad_norm": 3.8636090755462646, + "learning_rate": 8.136129079637015e-06, + "loss": 0.8528, + "step": 6315 + }, + { + "epoch": 1.6, + "grad_norm": 3.402066469192505, + "learning_rate": 8.135476488666964e-06, + "loss": 0.704, + "step": 6316 + }, + { + "epoch": 1.6002533248891704, + "grad_norm": 4.022968769073486, + "learning_rate": 8.134823809654205e-06, + "loss": 0.8638, + "step": 6317 + }, + { + "epoch": 1.6005066497783407, + "grad_norm": 3.585280179977417, + "learning_rate": 8.13417104261706e-06, + "loss": 0.7055, + "step": 6318 + }, + { + "epoch": 1.6007599746675112, + "grad_norm": 3.2944531440734863, + "learning_rate": 8.133518187573864e-06, + "loss": 0.7464, + "step": 6319 + }, + { + "epoch": 1.6010132995566815, + "grad_norm": 3.7818500995635986, + "learning_rate": 8.132865244542942e-06, + "loss": 0.8499, + "step": 6320 + }, + { + "epoch": 1.6012666244458518, + "grad_norm": 3.902923822402954, + "learning_rate": 8.132212213542636e-06, + "loss": 0.97, + "step": 6321 + }, + { + "epoch": 1.6015199493350223, + "grad_norm": 3.6301355361938477, + "learning_rate": 8.131559094591278e-06, + "loss": 0.7429, + "step": 6322 + }, + { + "epoch": 1.6017732742241926, + "grad_norm": 3.7234926223754883, + "learning_rate": 8.130905887707208e-06, + "loss": 0.7835, + "step": 6323 + }, + { + "epoch": 1.6020265991133629, + "grad_norm": 3.9502055644989014, + "learning_rate": 8.130252592908766e-06, + "loss": 0.77, + "step": 6324 + }, + { + "epoch": 1.6022799240025334, + "grad_norm": 4.068994045257568, + "learning_rate": 8.1295992102143e-06, + "loss": 0.7259, + "step": 6325 + }, + { + "epoch": 1.6025332488917035, + "grad_norm": 3.827765703201294, + "learning_rate": 8.128945739642155e-06, + "loss": 0.7152, + "step": 6326 + }, + { + "epoch": 1.602786573780874, + "grad_norm": 3.77705717086792, + "learning_rate": 8.128292181210681e-06, + "loss": 0.7546, + "step": 6327 + }, + { + "epoch": 1.6030398986700445, + "grad_norm": 3.294593334197998, + "learning_rate": 8.127638534938227e-06, + "loss": 0.7279, + "step": 6328 + }, + { + "epoch": 1.6032932235592146, + "grad_norm": 3.7223188877105713, + "learning_rate": 8.12698480084315e-06, + "loss": 0.6827, + "step": 6329 + }, + { + "epoch": 1.603546548448385, + "grad_norm": 3.2967376708984375, + "learning_rate": 8.126330978943806e-06, + "loss": 0.7499, + "step": 6330 + }, + { + "epoch": 1.6037998733375554, + "grad_norm": 4.234163284301758, + "learning_rate": 8.125677069258551e-06, + "loss": 0.8274, + "step": 6331 + }, + { + "epoch": 1.6040531982267257, + "grad_norm": 3.841139078140259, + "learning_rate": 8.125023071805752e-06, + "loss": 0.8236, + "step": 6332 + }, + { + "epoch": 1.6043065231158962, + "grad_norm": 3.4479053020477295, + "learning_rate": 8.124368986603767e-06, + "loss": 0.7997, + "step": 6333 + }, + { + "epoch": 1.6045598480050665, + "grad_norm": 4.069573402404785, + "learning_rate": 8.123714813670967e-06, + "loss": 0.7907, + "step": 6334 + }, + { + "epoch": 1.6048131728942368, + "grad_norm": 3.8663175106048584, + "learning_rate": 8.123060553025716e-06, + "loss": 0.7616, + "step": 6335 + }, + { + "epoch": 1.6050664977834073, + "grad_norm": 3.6568849086761475, + "learning_rate": 8.122406204686391e-06, + "loss": 0.7551, + "step": 6336 + }, + { + "epoch": 1.6053198226725776, + "grad_norm": 3.8331940174102783, + "learning_rate": 8.121751768671363e-06, + "loss": 0.7646, + "step": 6337 + }, + { + "epoch": 1.6055731475617478, + "grad_norm": 3.666332244873047, + "learning_rate": 8.121097244999007e-06, + "loss": 0.7805, + "step": 6338 + }, + { + "epoch": 1.6058264724509184, + "grad_norm": 3.944143056869507, + "learning_rate": 8.120442633687705e-06, + "loss": 0.7167, + "step": 6339 + }, + { + "epoch": 1.6060797973400887, + "grad_norm": 3.5900228023529053, + "learning_rate": 8.119787934755836e-06, + "loss": 0.7489, + "step": 6340 + }, + { + "epoch": 1.606333122229259, + "grad_norm": 3.81634259223938, + "learning_rate": 8.119133148221784e-06, + "loss": 0.768, + "step": 6341 + }, + { + "epoch": 1.6065864471184295, + "grad_norm": 3.515068531036377, + "learning_rate": 8.118478274103934e-06, + "loss": 0.7691, + "step": 6342 + }, + { + "epoch": 1.6068397720075998, + "grad_norm": 3.412266731262207, + "learning_rate": 8.117823312420676e-06, + "loss": 0.6523, + "step": 6343 + }, + { + "epoch": 1.60709309689677, + "grad_norm": 3.9870100021362305, + "learning_rate": 8.117168263190401e-06, + "loss": 0.8259, + "step": 6344 + }, + { + "epoch": 1.6073464217859406, + "grad_norm": 3.4756460189819336, + "learning_rate": 8.116513126431504e-06, + "loss": 0.7214, + "step": 6345 + }, + { + "epoch": 1.6075997466751109, + "grad_norm": 4.4472126960754395, + "learning_rate": 8.115857902162377e-06, + "loss": 0.8546, + "step": 6346 + }, + { + "epoch": 1.6078530715642811, + "grad_norm": 3.385911226272583, + "learning_rate": 8.11520259040142e-06, + "loss": 0.5968, + "step": 6347 + }, + { + "epoch": 1.6081063964534517, + "grad_norm": 3.713108777999878, + "learning_rate": 8.114547191167034e-06, + "loss": 0.8377, + "step": 6348 + }, + { + "epoch": 1.608359721342622, + "grad_norm": 3.6968376636505127, + "learning_rate": 8.113891704477623e-06, + "loss": 0.8044, + "step": 6349 + }, + { + "epoch": 1.6086130462317922, + "grad_norm": 3.7799293994903564, + "learning_rate": 8.113236130351593e-06, + "loss": 0.8189, + "step": 6350 + }, + { + "epoch": 1.6088663711209628, + "grad_norm": 4.2639265060424805, + "learning_rate": 8.112580468807352e-06, + "loss": 0.8377, + "step": 6351 + }, + { + "epoch": 1.609119696010133, + "grad_norm": 4.169806957244873, + "learning_rate": 8.111924719863308e-06, + "loss": 0.8916, + "step": 6352 + }, + { + "epoch": 1.6093730208993033, + "grad_norm": 3.6500370502471924, + "learning_rate": 8.111268883537879e-06, + "loss": 0.8548, + "step": 6353 + }, + { + "epoch": 1.6096263457884739, + "grad_norm": 3.3989417552948, + "learning_rate": 8.110612959849477e-06, + "loss": 0.7529, + "step": 6354 + }, + { + "epoch": 1.609879670677644, + "grad_norm": 3.633368492126465, + "learning_rate": 8.10995694881652e-06, + "loss": 0.6994, + "step": 6355 + }, + { + "epoch": 1.6101329955668144, + "grad_norm": 3.9261820316314697, + "learning_rate": 8.10930085045743e-06, + "loss": 0.8472, + "step": 6356 + }, + { + "epoch": 1.610386320455985, + "grad_norm": 3.8959484100341797, + "learning_rate": 8.10864466479063e-06, + "loss": 0.7549, + "step": 6357 + }, + { + "epoch": 1.610639645345155, + "grad_norm": 3.712040424346924, + "learning_rate": 8.107988391834544e-06, + "loss": 0.8342, + "step": 6358 + }, + { + "epoch": 1.6108929702343255, + "grad_norm": 3.6048924922943115, + "learning_rate": 8.107332031607602e-06, + "loss": 0.7578, + "step": 6359 + }, + { + "epoch": 1.6111462951234958, + "grad_norm": 4.19912052154541, + "learning_rate": 8.106675584128232e-06, + "loss": 0.8127, + "step": 6360 + }, + { + "epoch": 1.6113996200126661, + "grad_norm": 3.623387575149536, + "learning_rate": 8.106019049414867e-06, + "loss": 0.7879, + "step": 6361 + }, + { + "epoch": 1.6116529449018366, + "grad_norm": 3.098299503326416, + "learning_rate": 8.105362427485942e-06, + "loss": 0.6986, + "step": 6362 + }, + { + "epoch": 1.611906269791007, + "grad_norm": 3.926645040512085, + "learning_rate": 8.104705718359897e-06, + "loss": 0.7917, + "step": 6363 + }, + { + "epoch": 1.6121595946801772, + "grad_norm": 3.5370430946350098, + "learning_rate": 8.10404892205517e-06, + "loss": 0.8019, + "step": 6364 + }, + { + "epoch": 1.6124129195693477, + "grad_norm": 3.5520987510681152, + "learning_rate": 8.103392038590205e-06, + "loss": 0.8157, + "step": 6365 + }, + { + "epoch": 1.612666244458518, + "grad_norm": 3.9524483680725098, + "learning_rate": 8.102735067983446e-06, + "loss": 0.8213, + "step": 6366 + }, + { + "epoch": 1.6129195693476883, + "grad_norm": 4.112865447998047, + "learning_rate": 8.10207801025334e-06, + "loss": 0.955, + "step": 6367 + }, + { + "epoch": 1.6131728942368588, + "grad_norm": 3.4801909923553467, + "learning_rate": 8.101420865418338e-06, + "loss": 0.7196, + "step": 6368 + }, + { + "epoch": 1.6134262191260291, + "grad_norm": 3.5850558280944824, + "learning_rate": 8.100763633496889e-06, + "loss": 0.8016, + "step": 6369 + }, + { + "epoch": 1.6136795440151994, + "grad_norm": 3.788353204727173, + "learning_rate": 8.100106314507454e-06, + "loss": 0.9478, + "step": 6370 + }, + { + "epoch": 1.61393286890437, + "grad_norm": 3.492480993270874, + "learning_rate": 8.099448908468486e-06, + "loss": 0.7154, + "step": 6371 + }, + { + "epoch": 1.6141861937935402, + "grad_norm": 3.3306331634521484, + "learning_rate": 8.098791415398442e-06, + "loss": 0.805, + "step": 6372 + }, + { + "epoch": 1.6144395186827105, + "grad_norm": 3.7816762924194336, + "learning_rate": 8.09813383531579e-06, + "loss": 0.8916, + "step": 6373 + }, + { + "epoch": 1.614692843571881, + "grad_norm": 3.386228322982788, + "learning_rate": 8.097476168238991e-06, + "loss": 0.7459, + "step": 6374 + }, + { + "epoch": 1.6149461684610513, + "grad_norm": 3.881789207458496, + "learning_rate": 8.096818414186515e-06, + "loss": 0.8246, + "step": 6375 + }, + { + "epoch": 1.6151994933502216, + "grad_norm": 3.399620771408081, + "learning_rate": 8.096160573176827e-06, + "loss": 0.7961, + "step": 6376 + }, + { + "epoch": 1.6154528182393921, + "grad_norm": 3.8142077922821045, + "learning_rate": 8.095502645228402e-06, + "loss": 0.8436, + "step": 6377 + }, + { + "epoch": 1.6157061431285624, + "grad_norm": 3.5735161304473877, + "learning_rate": 8.094844630359713e-06, + "loss": 0.7599, + "step": 6378 + }, + { + "epoch": 1.6159594680177327, + "grad_norm": 3.9081273078918457, + "learning_rate": 8.094186528589238e-06, + "loss": 0.7871, + "step": 6379 + }, + { + "epoch": 1.6162127929069032, + "grad_norm": 3.8461575508117676, + "learning_rate": 8.093528339935456e-06, + "loss": 0.7515, + "step": 6380 + }, + { + "epoch": 1.6164661177960733, + "grad_norm": 4.0900750160217285, + "learning_rate": 8.092870064416847e-06, + "loss": 0.7977, + "step": 6381 + }, + { + "epoch": 1.6167194426852438, + "grad_norm": 3.374234676361084, + "learning_rate": 8.092211702051898e-06, + "loss": 0.7515, + "step": 6382 + }, + { + "epoch": 1.6169727675744143, + "grad_norm": 3.309140682220459, + "learning_rate": 8.091553252859091e-06, + "loss": 0.7617, + "step": 6383 + }, + { + "epoch": 1.6172260924635844, + "grad_norm": 3.498622179031372, + "learning_rate": 8.090894716856918e-06, + "loss": 0.7781, + "step": 6384 + }, + { + "epoch": 1.617479417352755, + "grad_norm": 3.208578109741211, + "learning_rate": 8.090236094063872e-06, + "loss": 0.6439, + "step": 6385 + }, + { + "epoch": 1.6177327422419254, + "grad_norm": 3.7767622470855713, + "learning_rate": 8.089577384498443e-06, + "loss": 0.7293, + "step": 6386 + }, + { + "epoch": 1.6179860671310955, + "grad_norm": 3.8664450645446777, + "learning_rate": 8.08891858817913e-06, + "loss": 0.7885, + "step": 6387 + }, + { + "epoch": 1.618239392020266, + "grad_norm": 4.010339736938477, + "learning_rate": 8.08825970512443e-06, + "loss": 0.8411, + "step": 6388 + }, + { + "epoch": 1.6184927169094363, + "grad_norm": 4.171139717102051, + "learning_rate": 8.087600735352844e-06, + "loss": 0.85, + "step": 6389 + }, + { + "epoch": 1.6187460417986066, + "grad_norm": 3.5802998542785645, + "learning_rate": 8.08694167888288e-06, + "loss": 0.6371, + "step": 6390 + }, + { + "epoch": 1.618999366687777, + "grad_norm": 4.302485466003418, + "learning_rate": 8.086282535733037e-06, + "loss": 0.7943, + "step": 6391 + }, + { + "epoch": 1.6192526915769474, + "grad_norm": 3.7403008937835693, + "learning_rate": 8.085623305921828e-06, + "loss": 0.6183, + "step": 6392 + }, + { + "epoch": 1.6195060164661177, + "grad_norm": 3.924921751022339, + "learning_rate": 8.084963989467761e-06, + "loss": 0.7468, + "step": 6393 + }, + { + "epoch": 1.6197593413552882, + "grad_norm": 3.962587594985962, + "learning_rate": 8.084304586389355e-06, + "loss": 0.8031, + "step": 6394 + }, + { + "epoch": 1.6200126662444585, + "grad_norm": 3.9985625743865967, + "learning_rate": 8.08364509670512e-06, + "loss": 0.7817, + "step": 6395 + }, + { + "epoch": 1.6202659911336288, + "grad_norm": 3.9099555015563965, + "learning_rate": 8.082985520433575e-06, + "loss": 0.817, + "step": 6396 + }, + { + "epoch": 1.6205193160227993, + "grad_norm": 4.005500793457031, + "learning_rate": 8.082325857593241e-06, + "loss": 0.8748, + "step": 6397 + }, + { + "epoch": 1.6207726409119696, + "grad_norm": 3.6754150390625, + "learning_rate": 8.081666108202643e-06, + "loss": 0.8098, + "step": 6398 + }, + { + "epoch": 1.62102596580114, + "grad_norm": 3.8412888050079346, + "learning_rate": 8.081006272280305e-06, + "loss": 0.7834, + "step": 6399 + }, + { + "epoch": 1.6212792906903104, + "grad_norm": 3.7987236976623535, + "learning_rate": 8.080346349844755e-06, + "loss": 0.8147, + "step": 6400 + }, + { + "epoch": 1.6215326155794807, + "grad_norm": 4.28433084487915, + "learning_rate": 8.07968634091452e-06, + "loss": 0.9466, + "step": 6401 + }, + { + "epoch": 1.621785940468651, + "grad_norm": 3.7046642303466797, + "learning_rate": 8.079026245508139e-06, + "loss": 0.7772, + "step": 6402 + }, + { + "epoch": 1.6220392653578215, + "grad_norm": 3.5088951587677, + "learning_rate": 8.078366063644144e-06, + "loss": 0.6949, + "step": 6403 + }, + { + "epoch": 1.6222925902469918, + "grad_norm": 3.5445315837860107, + "learning_rate": 8.077705795341074e-06, + "loss": 0.7836, + "step": 6404 + }, + { + "epoch": 1.622545915136162, + "grad_norm": 3.655200719833374, + "learning_rate": 8.077045440617465e-06, + "loss": 0.7902, + "step": 6405 + }, + { + "epoch": 1.6227992400253326, + "grad_norm": 3.1639909744262695, + "learning_rate": 8.076384999491862e-06, + "loss": 0.7129, + "step": 6406 + }, + { + "epoch": 1.623052564914503, + "grad_norm": 3.459355592727661, + "learning_rate": 8.075724471982811e-06, + "loss": 0.8008, + "step": 6407 + }, + { + "epoch": 1.6233058898036732, + "grad_norm": 3.36025071144104, + "learning_rate": 8.075063858108859e-06, + "loss": 0.7163, + "step": 6408 + }, + { + "epoch": 1.6235592146928437, + "grad_norm": 3.429908275604248, + "learning_rate": 8.074403157888556e-06, + "loss": 0.7261, + "step": 6409 + }, + { + "epoch": 1.6238125395820138, + "grad_norm": 4.139374256134033, + "learning_rate": 8.07374237134045e-06, + "loss": 0.9176, + "step": 6410 + }, + { + "epoch": 1.6240658644711843, + "grad_norm": 4.374108791351318, + "learning_rate": 8.073081498483101e-06, + "loss": 0.7473, + "step": 6411 + }, + { + "epoch": 1.6243191893603548, + "grad_norm": 3.589346408843994, + "learning_rate": 8.072420539335063e-06, + "loss": 0.7826, + "step": 6412 + }, + { + "epoch": 1.6245725142495249, + "grad_norm": 3.336552858352661, + "learning_rate": 8.071759493914897e-06, + "loss": 0.7313, + "step": 6413 + }, + { + "epoch": 1.6248258391386954, + "grad_norm": 3.78987193107605, + "learning_rate": 8.071098362241164e-06, + "loss": 0.6873, + "step": 6414 + }, + { + "epoch": 1.625079164027866, + "grad_norm": 4.215850830078125, + "learning_rate": 8.070437144332428e-06, + "loss": 0.7653, + "step": 6415 + }, + { + "epoch": 1.625332488917036, + "grad_norm": 3.558863878250122, + "learning_rate": 8.069775840207256e-06, + "loss": 0.7366, + "step": 6416 + }, + { + "epoch": 1.6255858138062065, + "grad_norm": 3.667942762374878, + "learning_rate": 8.069114449884217e-06, + "loss": 0.7321, + "step": 6417 + }, + { + "epoch": 1.6258391386953768, + "grad_norm": 3.6310346126556396, + "learning_rate": 8.068452973381883e-06, + "loss": 0.8066, + "step": 6418 + }, + { + "epoch": 1.626092463584547, + "grad_norm": 3.59124755859375, + "learning_rate": 8.067791410718829e-06, + "loss": 0.845, + "step": 6419 + }, + { + "epoch": 1.6263457884737176, + "grad_norm": 3.835885763168335, + "learning_rate": 8.067129761913628e-06, + "loss": 0.7575, + "step": 6420 + }, + { + "epoch": 1.6265991133628879, + "grad_norm": 3.6241557598114014, + "learning_rate": 8.066468026984861e-06, + "loss": 0.8036, + "step": 6421 + }, + { + "epoch": 1.6268524382520582, + "grad_norm": 3.6531600952148438, + "learning_rate": 8.065806205951109e-06, + "loss": 0.7127, + "step": 6422 + }, + { + "epoch": 1.6271057631412287, + "grad_norm": 3.900205373764038, + "learning_rate": 8.065144298830956e-06, + "loss": 0.7207, + "step": 6423 + }, + { + "epoch": 1.627359088030399, + "grad_norm": 4.193485736846924, + "learning_rate": 8.064482305642989e-06, + "loss": 0.8922, + "step": 6424 + }, + { + "epoch": 1.6276124129195693, + "grad_norm": 4.118162155151367, + "learning_rate": 8.063820226405793e-06, + "loss": 0.8109, + "step": 6425 + }, + { + "epoch": 1.6278657378087398, + "grad_norm": 3.196044921875, + "learning_rate": 8.063158061137962e-06, + "loss": 0.7352, + "step": 6426 + }, + { + "epoch": 1.62811906269791, + "grad_norm": 3.3466603755950928, + "learning_rate": 8.062495809858088e-06, + "loss": 0.7719, + "step": 6427 + }, + { + "epoch": 1.6283723875870804, + "grad_norm": 4.01998233795166, + "learning_rate": 8.061833472584765e-06, + "loss": 0.7421, + "step": 6428 + }, + { + "epoch": 1.6286257124762509, + "grad_norm": 3.8521294593811035, + "learning_rate": 8.061171049336595e-06, + "loss": 0.8427, + "step": 6429 + }, + { + "epoch": 1.6288790373654212, + "grad_norm": 4.203442573547363, + "learning_rate": 8.060508540132179e-06, + "loss": 0.8202, + "step": 6430 + }, + { + "epoch": 1.6291323622545915, + "grad_norm": 3.664254903793335, + "learning_rate": 8.059845944990114e-06, + "loss": 0.7878, + "step": 6431 + }, + { + "epoch": 1.629385687143762, + "grad_norm": 3.6004889011383057, + "learning_rate": 8.05918326392901e-06, + "loss": 0.6552, + "step": 6432 + }, + { + "epoch": 1.6296390120329323, + "grad_norm": 3.683966636657715, + "learning_rate": 8.058520496967475e-06, + "loss": 0.6749, + "step": 6433 + }, + { + "epoch": 1.6298923369221026, + "grad_norm": 3.49242901802063, + "learning_rate": 8.057857644124116e-06, + "loss": 0.8431, + "step": 6434 + }, + { + "epoch": 1.630145661811273, + "grad_norm": 3.3450429439544678, + "learning_rate": 8.057194705417548e-06, + "loss": 0.7828, + "step": 6435 + }, + { + "epoch": 1.6303989867004434, + "grad_norm": 3.3921709060668945, + "learning_rate": 8.056531680866386e-06, + "loss": 0.8418, + "step": 6436 + }, + { + "epoch": 1.6306523115896137, + "grad_norm": 4.301143646240234, + "learning_rate": 8.055868570489247e-06, + "loss": 0.9744, + "step": 6437 + }, + { + "epoch": 1.6309056364787842, + "grad_norm": 3.922924280166626, + "learning_rate": 8.055205374304751e-06, + "loss": 0.7169, + "step": 6438 + }, + { + "epoch": 1.6311589613679542, + "grad_norm": 3.5931310653686523, + "learning_rate": 8.054542092331518e-06, + "loss": 0.643, + "step": 6439 + }, + { + "epoch": 1.6314122862571248, + "grad_norm": 4.096027851104736, + "learning_rate": 8.053878724588178e-06, + "loss": 0.7857, + "step": 6440 + }, + { + "epoch": 1.6316656111462953, + "grad_norm": 3.529952049255371, + "learning_rate": 8.053215271093353e-06, + "loss": 0.7983, + "step": 6441 + }, + { + "epoch": 1.6319189360354653, + "grad_norm": 3.8757197856903076, + "learning_rate": 8.052551731865673e-06, + "loss": 0.8223, + "step": 6442 + }, + { + "epoch": 1.6321722609246359, + "grad_norm": 3.756049394607544, + "learning_rate": 8.051888106923773e-06, + "loss": 0.7831, + "step": 6443 + }, + { + "epoch": 1.6324255858138064, + "grad_norm": 3.713315010070801, + "learning_rate": 8.051224396286283e-06, + "loss": 0.7794, + "step": 6444 + }, + { + "epoch": 1.6326789107029764, + "grad_norm": 3.53013277053833, + "learning_rate": 8.050560599971844e-06, + "loss": 0.7869, + "step": 6445 + }, + { + "epoch": 1.632932235592147, + "grad_norm": 3.8916704654693604, + "learning_rate": 8.049896717999094e-06, + "loss": 0.8467, + "step": 6446 + }, + { + "epoch": 1.6331855604813172, + "grad_norm": 4.025676727294922, + "learning_rate": 8.049232750386671e-06, + "loss": 0.8094, + "step": 6447 + }, + { + "epoch": 1.6334388853704875, + "grad_norm": 3.3451850414276123, + "learning_rate": 8.048568697153222e-06, + "loss": 0.8509, + "step": 6448 + }, + { + "epoch": 1.633692210259658, + "grad_norm": 3.587831974029541, + "learning_rate": 8.047904558317394e-06, + "loss": 0.6801, + "step": 6449 + }, + { + "epoch": 1.6339455351488283, + "grad_norm": 3.618785858154297, + "learning_rate": 8.047240333897834e-06, + "loss": 0.7699, + "step": 6450 + }, + { + "epoch": 1.6341988600379986, + "grad_norm": 3.751980781555176, + "learning_rate": 8.046576023913193e-06, + "loss": 0.8147, + "step": 6451 + }, + { + "epoch": 1.6344521849271691, + "grad_norm": 3.8179969787597656, + "learning_rate": 8.045911628382126e-06, + "loss": 0.8151, + "step": 6452 + }, + { + "epoch": 1.6347055098163394, + "grad_norm": 4.683689117431641, + "learning_rate": 8.045247147323288e-06, + "loss": 0.8582, + "step": 6453 + }, + { + "epoch": 1.6349588347055097, + "grad_norm": 3.9231245517730713, + "learning_rate": 8.044582580755336e-06, + "loss": 0.7611, + "step": 6454 + }, + { + "epoch": 1.6352121595946802, + "grad_norm": 3.878051280975342, + "learning_rate": 8.043917928696933e-06, + "loss": 0.7537, + "step": 6455 + }, + { + "epoch": 1.6354654844838505, + "grad_norm": 3.9709792137145996, + "learning_rate": 8.04325319116674e-06, + "loss": 0.8344, + "step": 6456 + }, + { + "epoch": 1.6357188093730208, + "grad_norm": 4.03284215927124, + "learning_rate": 8.042588368183425e-06, + "loss": 0.8706, + "step": 6457 + }, + { + "epoch": 1.6359721342621913, + "grad_norm": 3.7195112705230713, + "learning_rate": 8.041923459765655e-06, + "loss": 0.7358, + "step": 6458 + }, + { + "epoch": 1.6362254591513616, + "grad_norm": 3.867770195007324, + "learning_rate": 8.0412584659321e-06, + "loss": 0.8893, + "step": 6459 + }, + { + "epoch": 1.636478784040532, + "grad_norm": 3.8629443645477295, + "learning_rate": 8.040593386701431e-06, + "loss": 0.8314, + "step": 6460 + }, + { + "epoch": 1.6367321089297024, + "grad_norm": 3.434809923171997, + "learning_rate": 8.039928222092326e-06, + "loss": 0.6819, + "step": 6461 + }, + { + "epoch": 1.6369854338188727, + "grad_norm": 3.4478530883789062, + "learning_rate": 8.039262972123461e-06, + "loss": 0.8779, + "step": 6462 + }, + { + "epoch": 1.637238758708043, + "grad_norm": 4.03153133392334, + "learning_rate": 8.038597636813517e-06, + "loss": 0.8377, + "step": 6463 + }, + { + "epoch": 1.6374920835972135, + "grad_norm": 3.9537808895111084, + "learning_rate": 8.037932216181174e-06, + "loss": 0.7889, + "step": 6464 + }, + { + "epoch": 1.6377454084863838, + "grad_norm": 3.6047708988189697, + "learning_rate": 8.03726671024512e-06, + "loss": 0.8148, + "step": 6465 + }, + { + "epoch": 1.6379987333755541, + "grad_norm": 3.5980916023254395, + "learning_rate": 8.036601119024036e-06, + "loss": 0.7766, + "step": 6466 + }, + { + "epoch": 1.6382520582647246, + "grad_norm": 3.7234902381896973, + "learning_rate": 8.03593544253662e-06, + "loss": 0.6899, + "step": 6467 + }, + { + "epoch": 1.6385053831538947, + "grad_norm": 3.416440725326538, + "learning_rate": 8.03526968080156e-06, + "loss": 0.778, + "step": 6468 + }, + { + "epoch": 1.6387587080430652, + "grad_norm": 3.2809247970581055, + "learning_rate": 8.034603833837547e-06, + "loss": 0.8297, + "step": 6469 + }, + { + "epoch": 1.6390120329322357, + "grad_norm": 3.8302032947540283, + "learning_rate": 8.033937901663283e-06, + "loss": 0.734, + "step": 6470 + }, + { + "epoch": 1.6392653578214058, + "grad_norm": 3.3262205123901367, + "learning_rate": 8.033271884297463e-06, + "loss": 0.6733, + "step": 6471 + }, + { + "epoch": 1.6395186827105763, + "grad_norm": 3.435903549194336, + "learning_rate": 8.032605781758791e-06, + "loss": 0.7515, + "step": 6472 + }, + { + "epoch": 1.6397720075997468, + "grad_norm": 3.7613158226013184, + "learning_rate": 8.03193959406597e-06, + "loss": 0.8775, + "step": 6473 + }, + { + "epoch": 1.640025332488917, + "grad_norm": 3.747976064682007, + "learning_rate": 8.031273321237706e-06, + "loss": 0.9734, + "step": 6474 + }, + { + "epoch": 1.6402786573780874, + "grad_norm": 3.3887627124786377, + "learning_rate": 8.030606963292709e-06, + "loss": 0.8202, + "step": 6475 + }, + { + "epoch": 1.6405319822672577, + "grad_norm": 3.9756176471710205, + "learning_rate": 8.029940520249686e-06, + "loss": 0.8735, + "step": 6476 + }, + { + "epoch": 1.640785307156428, + "grad_norm": 3.6156015396118164, + "learning_rate": 8.029273992127356e-06, + "loss": 0.7155, + "step": 6477 + }, + { + "epoch": 1.6410386320455985, + "grad_norm": 4.137899875640869, + "learning_rate": 8.028607378944432e-06, + "loss": 0.689, + "step": 6478 + }, + { + "epoch": 1.6412919569347688, + "grad_norm": 3.173203229904175, + "learning_rate": 8.02794068071963e-06, + "loss": 0.9062, + "step": 6479 + }, + { + "epoch": 1.641545281823939, + "grad_norm": 3.9908053874969482, + "learning_rate": 8.027273897471674e-06, + "loss": 0.7302, + "step": 6480 + }, + { + "epoch": 1.6417986067131096, + "grad_norm": 3.3780364990234375, + "learning_rate": 8.026607029219285e-06, + "loss": 0.6795, + "step": 6481 + }, + { + "epoch": 1.64205193160228, + "grad_norm": 4.1177520751953125, + "learning_rate": 8.02594007598119e-06, + "loss": 0.7486, + "step": 6482 + }, + { + "epoch": 1.6423052564914502, + "grad_norm": 4.135778427124023, + "learning_rate": 8.025273037776116e-06, + "loss": 0.8212, + "step": 6483 + }, + { + "epoch": 1.6425585813806207, + "grad_norm": 3.945587396621704, + "learning_rate": 8.024605914622793e-06, + "loss": 0.6565, + "step": 6484 + }, + { + "epoch": 1.642811906269791, + "grad_norm": 3.3609964847564697, + "learning_rate": 8.023938706539953e-06, + "loss": 0.7832, + "step": 6485 + }, + { + "epoch": 1.6430652311589613, + "grad_norm": 3.868649959564209, + "learning_rate": 8.023271413546332e-06, + "loss": 0.8281, + "step": 6486 + }, + { + "epoch": 1.6433185560481318, + "grad_norm": 3.7476377487182617, + "learning_rate": 8.022604035660666e-06, + "loss": 0.7855, + "step": 6487 + }, + { + "epoch": 1.643571880937302, + "grad_norm": 3.978663682937622, + "learning_rate": 8.021936572901697e-06, + "loss": 0.9225, + "step": 6488 + }, + { + "epoch": 1.6438252058264724, + "grad_norm": 3.6580278873443604, + "learning_rate": 8.021269025288163e-06, + "loss": 0.8147, + "step": 6489 + }, + { + "epoch": 1.644078530715643, + "grad_norm": 3.498389959335327, + "learning_rate": 8.020601392838812e-06, + "loss": 0.7242, + "step": 6490 + }, + { + "epoch": 1.6443318556048132, + "grad_norm": 3.400737762451172, + "learning_rate": 8.019933675572389e-06, + "loss": 0.728, + "step": 6491 + }, + { + "epoch": 1.6445851804939835, + "grad_norm": 3.63218092918396, + "learning_rate": 8.019265873507644e-06, + "loss": 0.7941, + "step": 6492 + }, + { + "epoch": 1.644838505383154, + "grad_norm": 3.3384480476379395, + "learning_rate": 8.018597986663328e-06, + "loss": 0.7823, + "step": 6493 + }, + { + "epoch": 1.6450918302723243, + "grad_norm": 3.6739799976348877, + "learning_rate": 8.017930015058195e-06, + "loss": 0.8028, + "step": 6494 + }, + { + "epoch": 1.6453451551614946, + "grad_norm": 3.4653425216674805, + "learning_rate": 8.017261958711003e-06, + "loss": 0.7367, + "step": 6495 + }, + { + "epoch": 1.6455984800506651, + "grad_norm": 3.514143228530884, + "learning_rate": 8.016593817640507e-06, + "loss": 0.7542, + "step": 6496 + }, + { + "epoch": 1.6458518049398352, + "grad_norm": 4.184511184692383, + "learning_rate": 8.015925591865474e-06, + "loss": 0.7791, + "step": 6497 + }, + { + "epoch": 1.6461051298290057, + "grad_norm": 4.014858722686768, + "learning_rate": 8.015257281404662e-06, + "loss": 0.8363, + "step": 6498 + }, + { + "epoch": 1.6463584547181762, + "grad_norm": 3.3786003589630127, + "learning_rate": 8.01458888627684e-06, + "loss": 0.7193, + "step": 6499 + }, + { + "epoch": 1.6466117796073463, + "grad_norm": 3.679405927658081, + "learning_rate": 8.013920406500772e-06, + "loss": 0.7936, + "step": 6500 + }, + { + "epoch": 1.6466117796073463, + "eval_loss": 1.1629008054733276, + "eval_runtime": 13.7409, + "eval_samples_per_second": 29.11, + "eval_steps_per_second": 3.639, + "step": 6500 + }, + { + "epoch": 1.6468651044965168, + "grad_norm": 3.7600491046905518, + "learning_rate": 8.013251842095234e-06, + "loss": 0.6361, + "step": 6501 + }, + { + "epoch": 1.647118429385687, + "grad_norm": 3.616704225540161, + "learning_rate": 8.012583193078994e-06, + "loss": 0.7376, + "step": 6502 + }, + { + "epoch": 1.6473717542748574, + "grad_norm": 3.8548691272735596, + "learning_rate": 8.011914459470832e-06, + "loss": 0.8461, + "step": 6503 + }, + { + "epoch": 1.647625079164028, + "grad_norm": 3.9855406284332275, + "learning_rate": 8.011245641289522e-06, + "loss": 0.7625, + "step": 6504 + }, + { + "epoch": 1.6478784040531982, + "grad_norm": 3.7066471576690674, + "learning_rate": 8.010576738553848e-06, + "loss": 0.7952, + "step": 6505 + }, + { + "epoch": 1.6481317289423685, + "grad_norm": 3.8302483558654785, + "learning_rate": 8.009907751282588e-06, + "loss": 0.7149, + "step": 6506 + }, + { + "epoch": 1.648385053831539, + "grad_norm": 4.178551197052002, + "learning_rate": 8.009238679494531e-06, + "loss": 0.9022, + "step": 6507 + }, + { + "epoch": 1.6486383787207093, + "grad_norm": 3.9367873668670654, + "learning_rate": 8.00856952320846e-06, + "loss": 0.7939, + "step": 6508 + }, + { + "epoch": 1.6488917036098796, + "grad_norm": 3.814330816268921, + "learning_rate": 8.007900282443168e-06, + "loss": 0.7212, + "step": 6509 + }, + { + "epoch": 1.64914502849905, + "grad_norm": 4.1374125480651855, + "learning_rate": 8.007230957217447e-06, + "loss": 0.8048, + "step": 6510 + }, + { + "epoch": 1.6493983533882204, + "grad_norm": 4.204483985900879, + "learning_rate": 8.006561547550089e-06, + "loss": 0.9392, + "step": 6511 + }, + { + "epoch": 1.6496516782773907, + "grad_norm": 4.255924701690674, + "learning_rate": 8.005892053459892e-06, + "loss": 0.9636, + "step": 6512 + }, + { + "epoch": 1.6499050031665612, + "grad_norm": 3.670743942260742, + "learning_rate": 8.005222474965654e-06, + "loss": 0.7437, + "step": 6513 + }, + { + "epoch": 1.6501583280557315, + "grad_norm": 3.753647804260254, + "learning_rate": 8.004552812086179e-06, + "loss": 0.7605, + "step": 6514 + }, + { + "epoch": 1.6504116529449018, + "grad_norm": 3.7480428218841553, + "learning_rate": 8.003883064840267e-06, + "loss": 0.7204, + "step": 6515 + }, + { + "epoch": 1.6506649778340723, + "grad_norm": 3.120974540710449, + "learning_rate": 8.003213233246728e-06, + "loss": 0.6214, + "step": 6516 + }, + { + "epoch": 1.6509183027232426, + "grad_norm": 4.047748565673828, + "learning_rate": 8.002543317324369e-06, + "loss": 0.8907, + "step": 6517 + }, + { + "epoch": 1.6511716276124129, + "grad_norm": 4.058399200439453, + "learning_rate": 8.001873317092001e-06, + "loss": 0.8473, + "step": 6518 + }, + { + "epoch": 1.6514249525015834, + "grad_norm": 3.572179079055786, + "learning_rate": 8.001203232568436e-06, + "loss": 0.8149, + "step": 6519 + }, + { + "epoch": 1.6516782773907537, + "grad_norm": 3.811018943786621, + "learning_rate": 8.000533063772492e-06, + "loss": 0.7781, + "step": 6520 + }, + { + "epoch": 1.651931602279924, + "grad_norm": 4.158339977264404, + "learning_rate": 7.999862810722985e-06, + "loss": 0.7893, + "step": 6521 + }, + { + "epoch": 1.6521849271690945, + "grad_norm": 3.688552141189575, + "learning_rate": 7.999192473438737e-06, + "loss": 0.8393, + "step": 6522 + }, + { + "epoch": 1.6524382520582648, + "grad_norm": 3.995978355407715, + "learning_rate": 7.99852205193857e-06, + "loss": 0.8735, + "step": 6523 + }, + { + "epoch": 1.652691576947435, + "grad_norm": 3.4606595039367676, + "learning_rate": 7.997851546241308e-06, + "loss": 0.8132, + "step": 6524 + }, + { + "epoch": 1.6529449018366056, + "grad_norm": 4.029937267303467, + "learning_rate": 7.99718095636578e-06, + "loss": 0.83, + "step": 6525 + }, + { + "epoch": 1.6531982267257757, + "grad_norm": 4.23452615737915, + "learning_rate": 7.996510282330816e-06, + "loss": 0.7501, + "step": 6526 + }, + { + "epoch": 1.6534515516149462, + "grad_norm": 3.6601123809814453, + "learning_rate": 7.995839524155249e-06, + "loss": 0.8571, + "step": 6527 + }, + { + "epoch": 1.6537048765041167, + "grad_norm": 3.5844037532806396, + "learning_rate": 7.99516868185791e-06, + "loss": 0.6626, + "step": 6528 + }, + { + "epoch": 1.6539582013932868, + "grad_norm": 4.13832950592041, + "learning_rate": 7.99449775545764e-06, + "loss": 0.7667, + "step": 6529 + }, + { + "epoch": 1.6542115262824573, + "grad_norm": 3.335920810699463, + "learning_rate": 7.993826744973274e-06, + "loss": 0.8313, + "step": 6530 + }, + { + "epoch": 1.6544648511716276, + "grad_norm": 4.0323333740234375, + "learning_rate": 7.993155650423658e-06, + "loss": 0.8563, + "step": 6531 + }, + { + "epoch": 1.6547181760607979, + "grad_norm": 4.23358678817749, + "learning_rate": 7.992484471827634e-06, + "loss": 0.8968, + "step": 6532 + }, + { + "epoch": 1.6549715009499684, + "grad_norm": 3.3698556423187256, + "learning_rate": 7.991813209204047e-06, + "loss": 0.6257, + "step": 6533 + }, + { + "epoch": 1.6552248258391387, + "grad_norm": 3.3595569133758545, + "learning_rate": 7.991141862571749e-06, + "loss": 0.8493, + "step": 6534 + }, + { + "epoch": 1.655478150728309, + "grad_norm": 4.0302910804748535, + "learning_rate": 7.990470431949588e-06, + "loss": 0.9936, + "step": 6535 + }, + { + "epoch": 1.6557314756174795, + "grad_norm": 3.870558023452759, + "learning_rate": 7.989798917356422e-06, + "loss": 0.7429, + "step": 6536 + }, + { + "epoch": 1.6559848005066498, + "grad_norm": 3.3610174655914307, + "learning_rate": 7.9891273188111e-06, + "loss": 0.8919, + "step": 6537 + }, + { + "epoch": 1.65623812539582, + "grad_norm": 3.419107675552368, + "learning_rate": 7.988455636332487e-06, + "loss": 0.8021, + "step": 6538 + }, + { + "epoch": 1.6564914502849906, + "grad_norm": 3.5197911262512207, + "learning_rate": 7.987783869939439e-06, + "loss": 0.7365, + "step": 6539 + }, + { + "epoch": 1.6567447751741609, + "grad_norm": 3.383847236633301, + "learning_rate": 7.987112019650818e-06, + "loss": 0.7167, + "step": 6540 + }, + { + "epoch": 1.6569981000633311, + "grad_norm": 4.0686235427856445, + "learning_rate": 7.986440085485494e-06, + "loss": 0.9421, + "step": 6541 + }, + { + "epoch": 1.6572514249525017, + "grad_norm": 3.9128847122192383, + "learning_rate": 7.985768067462332e-06, + "loss": 0.7651, + "step": 6542 + }, + { + "epoch": 1.657504749841672, + "grad_norm": 3.5898830890655518, + "learning_rate": 7.9850959656002e-06, + "loss": 0.8104, + "step": 6543 + }, + { + "epoch": 1.6577580747308422, + "grad_norm": 3.5501456260681152, + "learning_rate": 7.984423779917974e-06, + "loss": 0.7083, + "step": 6544 + }, + { + "epoch": 1.6580113996200128, + "grad_norm": 3.715851306915283, + "learning_rate": 7.983751510434528e-06, + "loss": 0.8172, + "step": 6545 + }, + { + "epoch": 1.658264724509183, + "grad_norm": 3.674283981323242, + "learning_rate": 7.983079157168736e-06, + "loss": 0.829, + "step": 6546 + }, + { + "epoch": 1.6585180493983533, + "grad_norm": 3.1403985023498535, + "learning_rate": 7.98240672013948e-06, + "loss": 0.7097, + "step": 6547 + }, + { + "epoch": 1.6587713742875239, + "grad_norm": 4.111692905426025, + "learning_rate": 7.98173419936564e-06, + "loss": 0.8722, + "step": 6548 + }, + { + "epoch": 1.6590246991766942, + "grad_norm": 3.416438102722168, + "learning_rate": 7.981061594866105e-06, + "loss": 0.7898, + "step": 6549 + }, + { + "epoch": 1.6592780240658644, + "grad_norm": 3.63065242767334, + "learning_rate": 7.980388906659753e-06, + "loss": 0.7435, + "step": 6550 + }, + { + "epoch": 1.659531348955035, + "grad_norm": 3.939948797225952, + "learning_rate": 7.979716134765481e-06, + "loss": 0.7948, + "step": 6551 + }, + { + "epoch": 1.659784673844205, + "grad_norm": 3.4028239250183105, + "learning_rate": 7.979043279202175e-06, + "loss": 0.7807, + "step": 6552 + }, + { + "epoch": 1.6600379987333755, + "grad_norm": 3.9967074394226074, + "learning_rate": 7.978370339988728e-06, + "loss": 0.7236, + "step": 6553 + }, + { + "epoch": 1.660291323622546, + "grad_norm": 3.631662607192993, + "learning_rate": 7.97769731714404e-06, + "loss": 0.7455, + "step": 6554 + }, + { + "epoch": 1.6605446485117161, + "grad_norm": 3.5003862380981445, + "learning_rate": 7.977024210687005e-06, + "loss": 0.756, + "step": 6555 + }, + { + "epoch": 1.6607979734008866, + "grad_norm": 3.761894464492798, + "learning_rate": 7.976351020636528e-06, + "loss": 0.7335, + "step": 6556 + }, + { + "epoch": 1.6610512982900572, + "grad_norm": 3.5644938945770264, + "learning_rate": 7.975677747011508e-06, + "loss": 0.7435, + "step": 6557 + }, + { + "epoch": 1.6613046231792272, + "grad_norm": 3.92643141746521, + "learning_rate": 7.97500438983085e-06, + "loss": 0.8846, + "step": 6558 + }, + { + "epoch": 1.6615579480683977, + "grad_norm": 3.3626163005828857, + "learning_rate": 7.974330949113466e-06, + "loss": 0.6866, + "step": 6559 + }, + { + "epoch": 1.661811272957568, + "grad_norm": 3.531470537185669, + "learning_rate": 7.97365742487826e-06, + "loss": 0.809, + "step": 6560 + }, + { + "epoch": 1.6620645978467383, + "grad_norm": 4.272540092468262, + "learning_rate": 7.972983817144151e-06, + "loss": 0.8819, + "step": 6561 + }, + { + "epoch": 1.6623179227359088, + "grad_norm": 3.836256980895996, + "learning_rate": 7.972310125930047e-06, + "loss": 0.8513, + "step": 6562 + }, + { + "epoch": 1.6625712476250791, + "grad_norm": 3.676304578781128, + "learning_rate": 7.971636351254868e-06, + "loss": 0.7966, + "step": 6563 + }, + { + "epoch": 1.6628245725142494, + "grad_norm": 3.809302806854248, + "learning_rate": 7.970962493137533e-06, + "loss": 0.8453, + "step": 6564 + }, + { + "epoch": 1.66307789740342, + "grad_norm": 3.648832082748413, + "learning_rate": 7.970288551596964e-06, + "loss": 0.7627, + "step": 6565 + }, + { + "epoch": 1.6633312222925902, + "grad_norm": 3.469412088394165, + "learning_rate": 7.969614526652085e-06, + "loss": 0.7832, + "step": 6566 + }, + { + "epoch": 1.6635845471817605, + "grad_norm": 4.007893085479736, + "learning_rate": 7.96894041832182e-06, + "loss": 0.8051, + "step": 6567 + }, + { + "epoch": 1.663837872070931, + "grad_norm": 3.579482078552246, + "learning_rate": 7.968266226625102e-06, + "loss": 0.7833, + "step": 6568 + }, + { + "epoch": 1.6640911969601013, + "grad_norm": 4.226022720336914, + "learning_rate": 7.967591951580857e-06, + "loss": 0.7823, + "step": 6569 + }, + { + "epoch": 1.6643445218492716, + "grad_norm": 3.9336376190185547, + "learning_rate": 7.966917593208023e-06, + "loss": 0.8148, + "step": 6570 + }, + { + "epoch": 1.6645978467384421, + "grad_norm": 3.4499406814575195, + "learning_rate": 7.966243151525534e-06, + "loss": 0.7168, + "step": 6571 + }, + { + "epoch": 1.6648511716276124, + "grad_norm": 3.765089750289917, + "learning_rate": 7.965568626552324e-06, + "loss": 0.8394, + "step": 6572 + }, + { + "epoch": 1.6651044965167827, + "grad_norm": 3.9018733501434326, + "learning_rate": 7.96489401830734e-06, + "loss": 0.8357, + "step": 6573 + }, + { + "epoch": 1.6653578214059532, + "grad_norm": 3.840449810028076, + "learning_rate": 7.964219326809522e-06, + "loss": 0.8492, + "step": 6574 + }, + { + "epoch": 1.6656111462951235, + "grad_norm": 3.6411032676696777, + "learning_rate": 7.963544552077813e-06, + "loss": 0.8185, + "step": 6575 + }, + { + "epoch": 1.6658644711842938, + "grad_norm": 3.478626251220703, + "learning_rate": 7.96286969413116e-06, + "loss": 0.7777, + "step": 6576 + }, + { + "epoch": 1.6661177960734643, + "grad_norm": 3.901601791381836, + "learning_rate": 7.962194752988519e-06, + "loss": 0.8673, + "step": 6577 + }, + { + "epoch": 1.6663711209626346, + "grad_norm": 3.6975276470184326, + "learning_rate": 7.961519728668834e-06, + "loss": 0.8515, + "step": 6578 + }, + { + "epoch": 1.666624445851805, + "grad_norm": 4.250957489013672, + "learning_rate": 7.960844621191065e-06, + "loss": 0.7823, + "step": 6579 + }, + { + "epoch": 1.6668777707409754, + "grad_norm": 4.216465950012207, + "learning_rate": 7.960169430574166e-06, + "loss": 0.8317, + "step": 6580 + }, + { + "epoch": 1.6671310956301455, + "grad_norm": 3.7026209831237793, + "learning_rate": 7.959494156837097e-06, + "loss": 0.8145, + "step": 6581 + }, + { + "epoch": 1.667384420519316, + "grad_norm": 4.24608039855957, + "learning_rate": 7.95881879999882e-06, + "loss": 0.9671, + "step": 6582 + }, + { + "epoch": 1.6676377454084865, + "grad_norm": 3.7046186923980713, + "learning_rate": 7.958143360078297e-06, + "loss": 0.7755, + "step": 6583 + }, + { + "epoch": 1.6678910702976566, + "grad_norm": 3.856071949005127, + "learning_rate": 7.957467837094494e-06, + "loss": 0.7805, + "step": 6584 + }, + { + "epoch": 1.6681443951868271, + "grad_norm": 3.9666085243225098, + "learning_rate": 7.95679223106638e-06, + "loss": 0.7719, + "step": 6585 + }, + { + "epoch": 1.6683977200759976, + "grad_norm": 3.8352930545806885, + "learning_rate": 7.956116542012927e-06, + "loss": 0.8631, + "step": 6586 + }, + { + "epoch": 1.6686510449651677, + "grad_norm": 3.7137701511383057, + "learning_rate": 7.955440769953108e-06, + "loss": 0.9008, + "step": 6587 + }, + { + "epoch": 1.6689043698543382, + "grad_norm": 3.8792643547058105, + "learning_rate": 7.954764914905896e-06, + "loss": 0.8702, + "step": 6588 + }, + { + "epoch": 1.6691576947435085, + "grad_norm": 3.808335781097412, + "learning_rate": 7.95408897689027e-06, + "loss": 0.7988, + "step": 6589 + }, + { + "epoch": 1.6694110196326788, + "grad_norm": 3.4590396881103516, + "learning_rate": 7.95341295592521e-06, + "loss": 0.7599, + "step": 6590 + }, + { + "epoch": 1.6696643445218493, + "grad_norm": 4.048331260681152, + "learning_rate": 7.952736852029699e-06, + "loss": 0.7402, + "step": 6591 + }, + { + "epoch": 1.6699176694110196, + "grad_norm": 3.6097097396850586, + "learning_rate": 7.952060665222721e-06, + "loss": 0.7074, + "step": 6592 + }, + { + "epoch": 1.67017099430019, + "grad_norm": 3.667442798614502, + "learning_rate": 7.951384395523262e-06, + "loss": 0.7995, + "step": 6593 + }, + { + "epoch": 1.6704243191893604, + "grad_norm": 3.7107222080230713, + "learning_rate": 7.950708042950313e-06, + "loss": 0.8229, + "step": 6594 + }, + { + "epoch": 1.6706776440785307, + "grad_norm": 3.692390203475952, + "learning_rate": 7.950031607522865e-06, + "loss": 0.7444, + "step": 6595 + }, + { + "epoch": 1.670930968967701, + "grad_norm": 3.5283172130584717, + "learning_rate": 7.949355089259914e-06, + "loss": 0.6734, + "step": 6596 + }, + { + "epoch": 1.6711842938568715, + "grad_norm": 3.4671082496643066, + "learning_rate": 7.948678488180452e-06, + "loss": 0.8705, + "step": 6597 + }, + { + "epoch": 1.6714376187460418, + "grad_norm": 3.5219898223876953, + "learning_rate": 7.94800180430348e-06, + "loss": 0.7039, + "step": 6598 + }, + { + "epoch": 1.671690943635212, + "grad_norm": 3.6108665466308594, + "learning_rate": 7.947325037648e-06, + "loss": 0.7648, + "step": 6599 + }, + { + "epoch": 1.6719442685243826, + "grad_norm": 3.6292128562927246, + "learning_rate": 7.946648188233016e-06, + "loss": 0.6964, + "step": 6600 + }, + { + "epoch": 1.672197593413553, + "grad_norm": 3.8417787551879883, + "learning_rate": 7.945971256077529e-06, + "loss": 0.6973, + "step": 6601 + }, + { + "epoch": 1.6724509183027232, + "grad_norm": 3.655839204788208, + "learning_rate": 7.94529424120055e-06, + "loss": 0.7429, + "step": 6602 + }, + { + "epoch": 1.6727042431918937, + "grad_norm": 3.953800678253174, + "learning_rate": 7.94461714362109e-06, + "loss": 0.7063, + "step": 6603 + }, + { + "epoch": 1.672957568081064, + "grad_norm": 3.713174343109131, + "learning_rate": 7.94393996335816e-06, + "loss": 0.7668, + "step": 6604 + }, + { + "epoch": 1.6732108929702343, + "grad_norm": 4.089044570922852, + "learning_rate": 7.943262700430777e-06, + "loss": 0.8985, + "step": 6605 + }, + { + "epoch": 1.6734642178594048, + "grad_norm": 3.4259583950042725, + "learning_rate": 7.942585354857956e-06, + "loss": 0.7928, + "step": 6606 + }, + { + "epoch": 1.673717542748575, + "grad_norm": 3.7505693435668945, + "learning_rate": 7.941907926658718e-06, + "loss": 0.7894, + "step": 6607 + }, + { + "epoch": 1.6739708676377454, + "grad_norm": 3.6627004146575928, + "learning_rate": 7.941230415852084e-06, + "loss": 0.7854, + "step": 6608 + }, + { + "epoch": 1.674224192526916, + "grad_norm": 3.8761916160583496, + "learning_rate": 7.940552822457078e-06, + "loss": 0.8486, + "step": 6609 + }, + { + "epoch": 1.674477517416086, + "grad_norm": 3.4008986949920654, + "learning_rate": 7.939875146492725e-06, + "loss": 0.7706, + "step": 6610 + }, + { + "epoch": 1.6747308423052565, + "grad_norm": 3.642361640930176, + "learning_rate": 7.93919738797806e-06, + "loss": 0.7847, + "step": 6611 + }, + { + "epoch": 1.674984167194427, + "grad_norm": 3.589169502258301, + "learning_rate": 7.938519546932107e-06, + "loss": 0.8092, + "step": 6612 + }, + { + "epoch": 1.675237492083597, + "grad_norm": 3.996886730194092, + "learning_rate": 7.937841623373904e-06, + "loss": 0.8737, + "step": 6613 + }, + { + "epoch": 1.6754908169727676, + "grad_norm": 3.890979051589966, + "learning_rate": 7.937163617322484e-06, + "loss": 0.8329, + "step": 6614 + }, + { + "epoch": 1.675744141861938, + "grad_norm": 3.8833699226379395, + "learning_rate": 7.936485528796884e-06, + "loss": 0.8497, + "step": 6615 + }, + { + "epoch": 1.6759974667511082, + "grad_norm": 3.4291770458221436, + "learning_rate": 7.93580735781615e-06, + "loss": 0.7488, + "step": 6616 + }, + { + "epoch": 1.6762507916402787, + "grad_norm": 3.5235791206359863, + "learning_rate": 7.93512910439932e-06, + "loss": 0.7153, + "step": 6617 + }, + { + "epoch": 1.676504116529449, + "grad_norm": 3.9646575450897217, + "learning_rate": 7.934450768565441e-06, + "loss": 0.7636, + "step": 6618 + }, + { + "epoch": 1.6767574414186193, + "grad_norm": 3.3547985553741455, + "learning_rate": 7.933772350333559e-06, + "loss": 0.6881, + "step": 6619 + }, + { + "epoch": 1.6770107663077898, + "grad_norm": 4.283260345458984, + "learning_rate": 7.933093849722724e-06, + "loss": 0.8028, + "step": 6620 + }, + { + "epoch": 1.67726409119696, + "grad_norm": 3.5597147941589355, + "learning_rate": 7.93241526675199e-06, + "loss": 0.8416, + "step": 6621 + }, + { + "epoch": 1.6775174160861304, + "grad_norm": 3.506744384765625, + "learning_rate": 7.931736601440407e-06, + "loss": 0.7056, + "step": 6622 + }, + { + "epoch": 1.6777707409753009, + "grad_norm": 3.943193197250366, + "learning_rate": 7.931057853807037e-06, + "loss": 0.8271, + "step": 6623 + }, + { + "epoch": 1.6780240658644712, + "grad_norm": 4.149829864501953, + "learning_rate": 7.930379023870936e-06, + "loss": 0.7197, + "step": 6624 + }, + { + "epoch": 1.6782773907536415, + "grad_norm": 3.963623046875, + "learning_rate": 7.929700111651165e-06, + "loss": 0.7888, + "step": 6625 + }, + { + "epoch": 1.678530715642812, + "grad_norm": 3.586610794067383, + "learning_rate": 7.929021117166787e-06, + "loss": 0.712, + "step": 6626 + }, + { + "epoch": 1.6787840405319823, + "grad_norm": 3.788907289505005, + "learning_rate": 7.92834204043687e-06, + "loss": 0.8079, + "step": 6627 + }, + { + "epoch": 1.6790373654211526, + "grad_norm": 3.8412363529205322, + "learning_rate": 7.92766288148048e-06, + "loss": 0.7998, + "step": 6628 + }, + { + "epoch": 1.679290690310323, + "grad_norm": 3.252941370010376, + "learning_rate": 7.926983640316688e-06, + "loss": 0.7125, + "step": 6629 + }, + { + "epoch": 1.6795440151994934, + "grad_norm": 3.6767332553863525, + "learning_rate": 7.926304316964569e-06, + "loss": 0.6974, + "step": 6630 + }, + { + "epoch": 1.6797973400886637, + "grad_norm": 3.9207980632781982, + "learning_rate": 7.925624911443194e-06, + "loss": 0.7083, + "step": 6631 + }, + { + "epoch": 1.6800506649778342, + "grad_norm": 3.551576852798462, + "learning_rate": 7.924945423771643e-06, + "loss": 0.839, + "step": 6632 + }, + { + "epoch": 1.6803039898670045, + "grad_norm": 3.6196115016937256, + "learning_rate": 7.924265853968996e-06, + "loss": 0.8434, + "step": 6633 + }, + { + "epoch": 1.6805573147561748, + "grad_norm": 3.624392509460449, + "learning_rate": 7.923586202054336e-06, + "loss": 0.6152, + "step": 6634 + }, + { + "epoch": 1.6808106396453453, + "grad_norm": 3.9037556648254395, + "learning_rate": 7.922906468046744e-06, + "loss": 0.9458, + "step": 6635 + }, + { + "epoch": 1.6810639645345156, + "grad_norm": 3.74167537689209, + "learning_rate": 7.922226651965308e-06, + "loss": 0.879, + "step": 6636 + }, + { + "epoch": 1.6813172894236859, + "grad_norm": 3.045506477355957, + "learning_rate": 7.921546753829117e-06, + "loss": 0.6792, + "step": 6637 + }, + { + "epoch": 1.6815706143128564, + "grad_norm": 3.6199963092803955, + "learning_rate": 7.920866773657264e-06, + "loss": 0.7662, + "step": 6638 + }, + { + "epoch": 1.6818239392020264, + "grad_norm": 3.473907232284546, + "learning_rate": 7.92018671146884e-06, + "loss": 0.7948, + "step": 6639 + }, + { + "epoch": 1.682077264091197, + "grad_norm": 4.178169250488281, + "learning_rate": 7.919506567282941e-06, + "loss": 0.776, + "step": 6640 + }, + { + "epoch": 1.6823305889803675, + "grad_norm": 4.206727504730225, + "learning_rate": 7.918826341118668e-06, + "loss": 0.8756, + "step": 6641 + }, + { + "epoch": 1.6825839138695375, + "grad_norm": 3.7250514030456543, + "learning_rate": 7.918146032995117e-06, + "loss": 0.7612, + "step": 6642 + }, + { + "epoch": 1.682837238758708, + "grad_norm": 3.7533860206604004, + "learning_rate": 7.917465642931395e-06, + "loss": 0.7584, + "step": 6643 + }, + { + "epoch": 1.6830905636478786, + "grad_norm": 3.769502639770508, + "learning_rate": 7.916785170946603e-06, + "loss": 0.8779, + "step": 6644 + }, + { + "epoch": 1.6833438885370486, + "grad_norm": 4.405693531036377, + "learning_rate": 7.916104617059853e-06, + "loss": 0.8019, + "step": 6645 + }, + { + "epoch": 1.6835972134262192, + "grad_norm": 3.6255602836608887, + "learning_rate": 7.915423981290251e-06, + "loss": 0.7301, + "step": 6646 + }, + { + "epoch": 1.6838505383153894, + "grad_norm": 3.3909103870391846, + "learning_rate": 7.91474326365691e-06, + "loss": 0.6843, + "step": 6647 + }, + { + "epoch": 1.6841038632045597, + "grad_norm": 3.8130462169647217, + "learning_rate": 7.914062464178943e-06, + "loss": 0.7176, + "step": 6648 + }, + { + "epoch": 1.6843571880937303, + "grad_norm": 4.036614418029785, + "learning_rate": 7.913381582875468e-06, + "loss": 0.7759, + "step": 6649 + }, + { + "epoch": 1.6846105129829005, + "grad_norm": 3.8046438694000244, + "learning_rate": 7.912700619765605e-06, + "loss": 0.7424, + "step": 6650 + }, + { + "epoch": 1.6848638378720708, + "grad_norm": 3.53322172164917, + "learning_rate": 7.912019574868473e-06, + "loss": 0.7357, + "step": 6651 + }, + { + "epoch": 1.6851171627612413, + "grad_norm": 3.907158851623535, + "learning_rate": 7.911338448203197e-06, + "loss": 0.8641, + "step": 6652 + }, + { + "epoch": 1.6853704876504116, + "grad_norm": 3.558422327041626, + "learning_rate": 7.9106572397889e-06, + "loss": 0.797, + "step": 6653 + }, + { + "epoch": 1.685623812539582, + "grad_norm": 3.729743480682373, + "learning_rate": 7.909975949644713e-06, + "loss": 0.8328, + "step": 6654 + }, + { + "epoch": 1.6858771374287524, + "grad_norm": 3.8090381622314453, + "learning_rate": 7.909294577789765e-06, + "loss": 0.7349, + "step": 6655 + }, + { + "epoch": 1.6861304623179227, + "grad_norm": 3.698411703109741, + "learning_rate": 7.908613124243189e-06, + "loss": 0.7703, + "step": 6656 + }, + { + "epoch": 1.686383787207093, + "grad_norm": 3.94551944732666, + "learning_rate": 7.907931589024119e-06, + "loss": 0.742, + "step": 6657 + }, + { + "epoch": 1.6866371120962635, + "grad_norm": 3.8150436878204346, + "learning_rate": 7.907249972151694e-06, + "loss": 0.8785, + "step": 6658 + }, + { + "epoch": 1.6868904369854338, + "grad_norm": 3.8971149921417236, + "learning_rate": 7.906568273645051e-06, + "loss": 0.7242, + "step": 6659 + }, + { + "epoch": 1.6871437618746041, + "grad_norm": 3.75036883354187, + "learning_rate": 7.905886493523333e-06, + "loss": 0.8107, + "step": 6660 + }, + { + "epoch": 1.6873970867637746, + "grad_norm": 3.9910454750061035, + "learning_rate": 7.905204631805686e-06, + "loss": 0.8148, + "step": 6661 + }, + { + "epoch": 1.687650411652945, + "grad_norm": 3.8019652366638184, + "learning_rate": 7.904522688511253e-06, + "loss": 0.817, + "step": 6662 + }, + { + "epoch": 1.6879037365421152, + "grad_norm": 3.8441343307495117, + "learning_rate": 7.903840663659186e-06, + "loss": 0.85, + "step": 6663 + }, + { + "epoch": 1.6881570614312857, + "grad_norm": 3.426865577697754, + "learning_rate": 7.903158557268633e-06, + "loss": 0.8892, + "step": 6664 + }, + { + "epoch": 1.688410386320456, + "grad_norm": 3.5533223152160645, + "learning_rate": 7.902476369358748e-06, + "loss": 0.7583, + "step": 6665 + }, + { + "epoch": 1.6886637112096263, + "grad_norm": 3.424185276031494, + "learning_rate": 7.901794099948686e-06, + "loss": 0.6838, + "step": 6666 + }, + { + "epoch": 1.6889170360987968, + "grad_norm": 3.9086058139801025, + "learning_rate": 7.901111749057606e-06, + "loss": 0.8324, + "step": 6667 + }, + { + "epoch": 1.689170360987967, + "grad_norm": 3.792022943496704, + "learning_rate": 7.90042931670467e-06, + "loss": 0.8141, + "step": 6668 + }, + { + "epoch": 1.6894236858771374, + "grad_norm": 4.026228904724121, + "learning_rate": 7.899746802909037e-06, + "loss": 0.7859, + "step": 6669 + }, + { + "epoch": 1.689677010766308, + "grad_norm": 4.015260696411133, + "learning_rate": 7.899064207689873e-06, + "loss": 0.8104, + "step": 6670 + }, + { + "epoch": 1.689930335655478, + "grad_norm": 4.160395622253418, + "learning_rate": 7.898381531066344e-06, + "loss": 0.7034, + "step": 6671 + }, + { + "epoch": 1.6901836605446485, + "grad_norm": 3.6861190795898438, + "learning_rate": 7.89769877305762e-06, + "loss": 0.6766, + "step": 6672 + }, + { + "epoch": 1.6904369854338188, + "grad_norm": 4.139413356781006, + "learning_rate": 7.897015933682873e-06, + "loss": 0.7244, + "step": 6673 + }, + { + "epoch": 1.690690310322989, + "grad_norm": 3.5917487144470215, + "learning_rate": 7.896333012961277e-06, + "loss": 0.7738, + "step": 6674 + }, + { + "epoch": 1.6909436352121596, + "grad_norm": 3.593790054321289, + "learning_rate": 7.895650010912007e-06, + "loss": 0.7097, + "step": 6675 + }, + { + "epoch": 1.69119696010133, + "grad_norm": 3.585502862930298, + "learning_rate": 7.894966927554239e-06, + "loss": 0.7099, + "step": 6676 + }, + { + "epoch": 1.6914502849905002, + "grad_norm": 3.7600560188293457, + "learning_rate": 7.89428376290716e-06, + "loss": 0.8947, + "step": 6677 + }, + { + "epoch": 1.6917036098796707, + "grad_norm": 3.956672191619873, + "learning_rate": 7.893600516989948e-06, + "loss": 0.8116, + "step": 6678 + }, + { + "epoch": 1.691956934768841, + "grad_norm": 4.237945079803467, + "learning_rate": 7.892917189821791e-06, + "loss": 0.8744, + "step": 6679 + }, + { + "epoch": 1.6922102596580113, + "grad_norm": 3.645401954650879, + "learning_rate": 7.892233781421874e-06, + "loss": 0.7532, + "step": 6680 + }, + { + "epoch": 1.6924635845471818, + "grad_norm": 3.4173827171325684, + "learning_rate": 7.891550291809388e-06, + "loss": 0.7708, + "step": 6681 + }, + { + "epoch": 1.6927169094363521, + "grad_norm": 3.700458288192749, + "learning_rate": 7.890866721003525e-06, + "loss": 0.8057, + "step": 6682 + }, + { + "epoch": 1.6929702343255224, + "grad_norm": 3.7289626598358154, + "learning_rate": 7.890183069023478e-06, + "loss": 0.8639, + "step": 6683 + }, + { + "epoch": 1.693223559214693, + "grad_norm": 3.5245094299316406, + "learning_rate": 7.889499335888449e-06, + "loss": 0.7177, + "step": 6684 + }, + { + "epoch": 1.6934768841038632, + "grad_norm": 3.8554348945617676, + "learning_rate": 7.88881552161763e-06, + "loss": 0.8164, + "step": 6685 + }, + { + "epoch": 1.6937302089930335, + "grad_norm": 3.527906894683838, + "learning_rate": 7.888131626230224e-06, + "loss": 0.8275, + "step": 6686 + }, + { + "epoch": 1.693983533882204, + "grad_norm": 3.72906756401062, + "learning_rate": 7.887447649745437e-06, + "loss": 0.8171, + "step": 6687 + }, + { + "epoch": 1.6942368587713743, + "grad_norm": 3.4595229625701904, + "learning_rate": 7.886763592182474e-06, + "loss": 0.7817, + "step": 6688 + }, + { + "epoch": 1.6944901836605446, + "grad_norm": 3.7904465198516846, + "learning_rate": 7.88607945356054e-06, + "loss": 0.9005, + "step": 6689 + }, + { + "epoch": 1.6947435085497151, + "grad_norm": 3.807465076446533, + "learning_rate": 7.88539523389885e-06, + "loss": 0.8108, + "step": 6690 + }, + { + "epoch": 1.6949968334388854, + "grad_norm": 3.802807569503784, + "learning_rate": 7.884710933216613e-06, + "loss": 0.8049, + "step": 6691 + }, + { + "epoch": 1.6952501583280557, + "grad_norm": 3.7613372802734375, + "learning_rate": 7.884026551533046e-06, + "loss": 0.8984, + "step": 6692 + }, + { + "epoch": 1.6955034832172262, + "grad_norm": 3.503157615661621, + "learning_rate": 7.883342088867364e-06, + "loss": 0.7345, + "step": 6693 + }, + { + "epoch": 1.6957568081063965, + "grad_norm": 3.8023993968963623, + "learning_rate": 7.882657545238788e-06, + "loss": 0.728, + "step": 6694 + }, + { + "epoch": 1.6960101329955668, + "grad_norm": 3.663755178451538, + "learning_rate": 7.881972920666538e-06, + "loss": 0.7559, + "step": 6695 + }, + { + "epoch": 1.6962634578847373, + "grad_norm": 3.451871633529663, + "learning_rate": 7.88128821516984e-06, + "loss": 0.8482, + "step": 6696 + }, + { + "epoch": 1.6965167827739074, + "grad_norm": 3.7571277618408203, + "learning_rate": 7.88060342876792e-06, + "loss": 0.8549, + "step": 6697 + }, + { + "epoch": 1.696770107663078, + "grad_norm": 3.491603374481201, + "learning_rate": 7.879918561480006e-06, + "loss": 0.7404, + "step": 6698 + }, + { + "epoch": 1.6970234325522484, + "grad_norm": 3.8406331539154053, + "learning_rate": 7.879233613325328e-06, + "loss": 0.831, + "step": 6699 + }, + { + "epoch": 1.6972767574414185, + "grad_norm": 3.4100091457366943, + "learning_rate": 7.87854858432312e-06, + "loss": 0.7306, + "step": 6700 + }, + { + "epoch": 1.697530082330589, + "grad_norm": 3.4049994945526123, + "learning_rate": 7.877863474492616e-06, + "loss": 0.6876, + "step": 6701 + }, + { + "epoch": 1.6977834072197593, + "grad_norm": 3.7596635818481445, + "learning_rate": 7.877178283853053e-06, + "loss": 0.8153, + "step": 6702 + }, + { + "epoch": 1.6980367321089296, + "grad_norm": 4.205677509307861, + "learning_rate": 7.876493012423674e-06, + "loss": 0.7863, + "step": 6703 + }, + { + "epoch": 1.6982900569981, + "grad_norm": 3.8486273288726807, + "learning_rate": 7.87580766022372e-06, + "loss": 0.7443, + "step": 6704 + }, + { + "epoch": 1.6985433818872704, + "grad_norm": 3.5464746952056885, + "learning_rate": 7.875122227272435e-06, + "loss": 0.7064, + "step": 6705 + }, + { + "epoch": 1.6987967067764407, + "grad_norm": 3.5107100009918213, + "learning_rate": 7.874436713589065e-06, + "loss": 0.7617, + "step": 6706 + }, + { + "epoch": 1.6990500316656112, + "grad_norm": 4.054915904998779, + "learning_rate": 7.873751119192857e-06, + "loss": 0.9486, + "step": 6707 + }, + { + "epoch": 1.6993033565547815, + "grad_norm": 3.6590044498443604, + "learning_rate": 7.873065444103066e-06, + "loss": 0.6986, + "step": 6708 + }, + { + "epoch": 1.6995566814439518, + "grad_norm": 3.610285758972168, + "learning_rate": 7.872379688338945e-06, + "loss": 0.8146, + "step": 6709 + }, + { + "epoch": 1.6998100063331223, + "grad_norm": 3.8275539875030518, + "learning_rate": 7.871693851919747e-06, + "loss": 0.8404, + "step": 6710 + }, + { + "epoch": 1.7000633312222926, + "grad_norm": 3.4518802165985107, + "learning_rate": 7.871007934864732e-06, + "loss": 0.7698, + "step": 6711 + }, + { + "epoch": 1.7003166561114629, + "grad_norm": 3.5977344512939453, + "learning_rate": 7.87032193719316e-06, + "loss": 0.6872, + "step": 6712 + }, + { + "epoch": 1.7005699810006334, + "grad_norm": 3.663902521133423, + "learning_rate": 7.869635858924293e-06, + "loss": 0.8268, + "step": 6713 + }, + { + "epoch": 1.7008233058898037, + "grad_norm": 3.621511220932007, + "learning_rate": 7.868949700077396e-06, + "loss": 0.7852, + "step": 6714 + }, + { + "epoch": 1.701076630778974, + "grad_norm": 3.7081730365753174, + "learning_rate": 7.868263460671737e-06, + "loss": 0.7897, + "step": 6715 + }, + { + "epoch": 1.7013299556681445, + "grad_norm": 3.8370795249938965, + "learning_rate": 7.867577140726584e-06, + "loss": 0.8091, + "step": 6716 + }, + { + "epoch": 1.7015832805573148, + "grad_norm": 3.473798990249634, + "learning_rate": 7.866890740261205e-06, + "loss": 0.6665, + "step": 6717 + }, + { + "epoch": 1.701836605446485, + "grad_norm": 3.8767638206481934, + "learning_rate": 7.866204259294883e-06, + "loss": 0.7544, + "step": 6718 + }, + { + "epoch": 1.7020899303356556, + "grad_norm": 3.3869128227233887, + "learning_rate": 7.865517697846887e-06, + "loss": 0.8296, + "step": 6719 + }, + { + "epoch": 1.7023432552248259, + "grad_norm": 3.7934815883636475, + "learning_rate": 7.864831055936497e-06, + "loss": 0.733, + "step": 6720 + }, + { + "epoch": 1.7025965801139962, + "grad_norm": 4.060279846191406, + "learning_rate": 7.864144333582993e-06, + "loss": 0.8434, + "step": 6721 + }, + { + "epoch": 1.7028499050031667, + "grad_norm": 3.429332733154297, + "learning_rate": 7.863457530805659e-06, + "loss": 0.8536, + "step": 6722 + }, + { + "epoch": 1.7031032298923368, + "grad_norm": 3.738725185394287, + "learning_rate": 7.86277064762378e-06, + "loss": 0.8061, + "step": 6723 + }, + { + "epoch": 1.7033565547815073, + "grad_norm": 3.696758270263672, + "learning_rate": 7.862083684056641e-06, + "loss": 0.7556, + "step": 6724 + }, + { + "epoch": 1.7036098796706778, + "grad_norm": 3.453564167022705, + "learning_rate": 7.861396640123535e-06, + "loss": 0.9341, + "step": 6725 + }, + { + "epoch": 1.7038632045598479, + "grad_norm": 3.4374876022338867, + "learning_rate": 7.860709515843751e-06, + "loss": 0.7263, + "step": 6726 + }, + { + "epoch": 1.7041165294490184, + "grad_norm": 3.612027168273926, + "learning_rate": 7.860022311236588e-06, + "loss": 0.7434, + "step": 6727 + }, + { + "epoch": 1.7043698543381889, + "grad_norm": 3.5617215633392334, + "learning_rate": 7.859335026321336e-06, + "loss": 0.729, + "step": 6728 + }, + { + "epoch": 1.704623179227359, + "grad_norm": 3.6373393535614014, + "learning_rate": 7.858647661117298e-06, + "loss": 0.7424, + "step": 6729 + }, + { + "epoch": 1.7048765041165295, + "grad_norm": 4.018084526062012, + "learning_rate": 7.857960215643772e-06, + "loss": 0.8158, + "step": 6730 + }, + { + "epoch": 1.7051298290056998, + "grad_norm": 3.5702357292175293, + "learning_rate": 7.857272689920064e-06, + "loss": 0.7383, + "step": 6731 + }, + { + "epoch": 1.70538315389487, + "grad_norm": 3.8847665786743164, + "learning_rate": 7.856585083965477e-06, + "loss": 0.7452, + "step": 6732 + }, + { + "epoch": 1.7056364787840406, + "grad_norm": 3.6506965160369873, + "learning_rate": 7.85589739779932e-06, + "loss": 0.7257, + "step": 6733 + }, + { + "epoch": 1.7058898036732109, + "grad_norm": 3.6071321964263916, + "learning_rate": 7.855209631440904e-06, + "loss": 0.8079, + "step": 6734 + }, + { + "epoch": 1.7061431285623812, + "grad_norm": 3.5854055881500244, + "learning_rate": 7.854521784909537e-06, + "loss": 0.8451, + "step": 6735 + }, + { + "epoch": 1.7063964534515517, + "grad_norm": 4.117568492889404, + "learning_rate": 7.853833858224537e-06, + "loss": 0.8651, + "step": 6736 + }, + { + "epoch": 1.706649778340722, + "grad_norm": 3.4071497917175293, + "learning_rate": 7.853145851405222e-06, + "loss": 0.6812, + "step": 6737 + }, + { + "epoch": 1.7069031032298922, + "grad_norm": 3.7544307708740234, + "learning_rate": 7.852457764470907e-06, + "loss": 0.7665, + "step": 6738 + }, + { + "epoch": 1.7071564281190628, + "grad_norm": 3.820354700088501, + "learning_rate": 7.851769597440915e-06, + "loss": 0.8057, + "step": 6739 + }, + { + "epoch": 1.707409753008233, + "grad_norm": 3.7876126766204834, + "learning_rate": 7.851081350334568e-06, + "loss": 0.8439, + "step": 6740 + }, + { + "epoch": 1.7076630778974033, + "grad_norm": 3.3007097244262695, + "learning_rate": 7.850393023171194e-06, + "loss": 0.8229, + "step": 6741 + }, + { + "epoch": 1.7079164027865739, + "grad_norm": 3.5454154014587402, + "learning_rate": 7.84970461597012e-06, + "loss": 0.7838, + "step": 6742 + }, + { + "epoch": 1.7081697276757442, + "grad_norm": 3.9048492908477783, + "learning_rate": 7.849016128750676e-06, + "loss": 0.7753, + "step": 6743 + }, + { + "epoch": 1.7084230525649144, + "grad_norm": 3.585662841796875, + "learning_rate": 7.848327561532194e-06, + "loss": 0.7363, + "step": 6744 + }, + { + "epoch": 1.708676377454085, + "grad_norm": 3.464667558670044, + "learning_rate": 7.84763891433401e-06, + "loss": 0.7679, + "step": 6745 + }, + { + "epoch": 1.7089297023432553, + "grad_norm": 3.897409200668335, + "learning_rate": 7.846950187175458e-06, + "loss": 0.6852, + "step": 6746 + }, + { + "epoch": 1.7091830272324255, + "grad_norm": 3.634979009628296, + "learning_rate": 7.84626138007588e-06, + "loss": 0.7761, + "step": 6747 + }, + { + "epoch": 1.709436352121596, + "grad_norm": 3.673316240310669, + "learning_rate": 7.84557249305462e-06, + "loss": 0.8161, + "step": 6748 + }, + { + "epoch": 1.7096896770107664, + "grad_norm": 3.373833656311035, + "learning_rate": 7.844883526131014e-06, + "loss": 0.7044, + "step": 6749 + }, + { + "epoch": 1.7099430018999366, + "grad_norm": 3.8875572681427, + "learning_rate": 7.84419447932441e-06, + "loss": 0.8219, + "step": 6750 + }, + { + "epoch": 1.7101963267891072, + "grad_norm": 3.942722797393799, + "learning_rate": 7.843505352654162e-06, + "loss": 0.7947, + "step": 6751 + }, + { + "epoch": 1.7104496516782772, + "grad_norm": 3.7861433029174805, + "learning_rate": 7.842816146139613e-06, + "loss": 0.721, + "step": 6752 + }, + { + "epoch": 1.7107029765674477, + "grad_norm": 3.775155544281006, + "learning_rate": 7.842126859800123e-06, + "loss": 0.789, + "step": 6753 + }, + { + "epoch": 1.7109563014566183, + "grad_norm": 3.84187388420105, + "learning_rate": 7.841437493655039e-06, + "loss": 0.7874, + "step": 6754 + }, + { + "epoch": 1.7112096263457883, + "grad_norm": 3.837515354156494, + "learning_rate": 7.840748047723726e-06, + "loss": 0.7639, + "step": 6755 + }, + { + "epoch": 1.7114629512349588, + "grad_norm": 3.7836356163024902, + "learning_rate": 7.840058522025536e-06, + "loss": 0.7773, + "step": 6756 + }, + { + "epoch": 1.7117162761241294, + "grad_norm": 3.677025318145752, + "learning_rate": 7.839368916579835e-06, + "loss": 0.6485, + "step": 6757 + }, + { + "epoch": 1.7119696010132994, + "grad_norm": 3.7167978286743164, + "learning_rate": 7.838679231405985e-06, + "loss": 0.7052, + "step": 6758 + }, + { + "epoch": 1.71222292590247, + "grad_norm": 3.762216567993164, + "learning_rate": 7.837989466523352e-06, + "loss": 0.8725, + "step": 6759 + }, + { + "epoch": 1.7124762507916402, + "grad_norm": 3.482800006866455, + "learning_rate": 7.837299621951307e-06, + "loss": 0.6542, + "step": 6760 + }, + { + "epoch": 1.7127295756808105, + "grad_norm": 3.6897640228271484, + "learning_rate": 7.836609697709216e-06, + "loss": 0.7791, + "step": 6761 + }, + { + "epoch": 1.712982900569981, + "grad_norm": 3.613353729248047, + "learning_rate": 7.835919693816457e-06, + "loss": 0.7184, + "step": 6762 + }, + { + "epoch": 1.7132362254591513, + "grad_norm": 3.526890516281128, + "learning_rate": 7.835229610292399e-06, + "loss": 0.791, + "step": 6763 + }, + { + "epoch": 1.7134895503483216, + "grad_norm": 3.9536054134368896, + "learning_rate": 7.834539447156424e-06, + "loss": 0.7518, + "step": 6764 + }, + { + "epoch": 1.7137428752374921, + "grad_norm": 3.7822437286376953, + "learning_rate": 7.833849204427909e-06, + "loss": 0.7601, + "step": 6765 + }, + { + "epoch": 1.7139962001266624, + "grad_norm": 3.176074266433716, + "learning_rate": 7.833158882126237e-06, + "loss": 0.6864, + "step": 6766 + }, + { + "epoch": 1.7142495250158327, + "grad_norm": 3.7379746437072754, + "learning_rate": 7.832468480270792e-06, + "loss": 0.859, + "step": 6767 + }, + { + "epoch": 1.7145028499050032, + "grad_norm": 3.548861503601074, + "learning_rate": 7.831777998880958e-06, + "loss": 0.7918, + "step": 6768 + }, + { + "epoch": 1.7147561747941735, + "grad_norm": 4.13444185256958, + "learning_rate": 7.831087437976127e-06, + "loss": 0.8243, + "step": 6769 + }, + { + "epoch": 1.7150094996833438, + "grad_norm": 3.840388774871826, + "learning_rate": 7.830396797575687e-06, + "loss": 0.8015, + "step": 6770 + }, + { + "epoch": 1.7152628245725143, + "grad_norm": 3.7381751537323, + "learning_rate": 7.82970607769903e-06, + "loss": 0.8026, + "step": 6771 + }, + { + "epoch": 1.7155161494616846, + "grad_norm": 3.682285785675049, + "learning_rate": 7.829015278365555e-06, + "loss": 0.7376, + "step": 6772 + }, + { + "epoch": 1.715769474350855, + "grad_norm": 3.669313430786133, + "learning_rate": 7.828324399594655e-06, + "loss": 0.7427, + "step": 6773 + }, + { + "epoch": 1.7160227992400254, + "grad_norm": 3.579893112182617, + "learning_rate": 7.827633441405733e-06, + "loss": 0.8783, + "step": 6774 + }, + { + "epoch": 1.7162761241291957, + "grad_norm": 3.3315069675445557, + "learning_rate": 7.826942403818187e-06, + "loss": 0.7473, + "step": 6775 + }, + { + "epoch": 1.716529449018366, + "grad_norm": 3.5394914150238037, + "learning_rate": 7.826251286851425e-06, + "loss": 0.758, + "step": 6776 + }, + { + "epoch": 1.7167827739075365, + "grad_norm": 3.693305253982544, + "learning_rate": 7.825560090524851e-06, + "loss": 0.7425, + "step": 6777 + }, + { + "epoch": 1.7170360987967068, + "grad_norm": 3.6263012886047363, + "learning_rate": 7.824868814857873e-06, + "loss": 0.7584, + "step": 6778 + }, + { + "epoch": 1.7172894236858771, + "grad_norm": 3.2938084602355957, + "learning_rate": 7.824177459869904e-06, + "loss": 0.7826, + "step": 6779 + }, + { + "epoch": 1.7175427485750476, + "grad_norm": 3.781751871109009, + "learning_rate": 7.823486025580355e-06, + "loss": 0.8017, + "step": 6780 + }, + { + "epoch": 1.7177960734642177, + "grad_norm": 3.938286781311035, + "learning_rate": 7.822794512008643e-06, + "loss": 0.7905, + "step": 6781 + }, + { + "epoch": 1.7180493983533882, + "grad_norm": 3.3979578018188477, + "learning_rate": 7.822102919174182e-06, + "loss": 0.8073, + "step": 6782 + }, + { + "epoch": 1.7183027232425587, + "grad_norm": 3.630275011062622, + "learning_rate": 7.821411247096395e-06, + "loss": 0.7673, + "step": 6783 + }, + { + "epoch": 1.7185560481317288, + "grad_norm": 3.248453140258789, + "learning_rate": 7.820719495794701e-06, + "loss": 0.7601, + "step": 6784 + }, + { + "epoch": 1.7188093730208993, + "grad_norm": 3.203981876373291, + "learning_rate": 7.820027665288527e-06, + "loss": 0.7745, + "step": 6785 + }, + { + "epoch": 1.7190626979100698, + "grad_norm": 3.7932839393615723, + "learning_rate": 7.819335755597296e-06, + "loss": 0.7866, + "step": 6786 + }, + { + "epoch": 1.71931602279924, + "grad_norm": 3.8405110836029053, + "learning_rate": 7.818643766740442e-06, + "loss": 0.6981, + "step": 6787 + }, + { + "epoch": 1.7195693476884104, + "grad_norm": 3.728456735610962, + "learning_rate": 7.81795169873739e-06, + "loss": 0.6513, + "step": 6788 + }, + { + "epoch": 1.7198226725775807, + "grad_norm": 4.308320045471191, + "learning_rate": 7.817259551607574e-06, + "loss": 0.846, + "step": 6789 + }, + { + "epoch": 1.720075997466751, + "grad_norm": 3.710430145263672, + "learning_rate": 7.816567325370431e-06, + "loss": 0.7239, + "step": 6790 + }, + { + "epoch": 1.7203293223559215, + "grad_norm": 3.8289363384246826, + "learning_rate": 7.815875020045398e-06, + "loss": 0.8095, + "step": 6791 + }, + { + "epoch": 1.7205826472450918, + "grad_norm": 3.556525707244873, + "learning_rate": 7.815182635651913e-06, + "loss": 0.8069, + "step": 6792 + }, + { + "epoch": 1.720835972134262, + "grad_norm": 3.635162115097046, + "learning_rate": 7.81449017220942e-06, + "loss": 0.7704, + "step": 6793 + }, + { + "epoch": 1.7210892970234326, + "grad_norm": 3.9452712535858154, + "learning_rate": 7.813797629737361e-06, + "loss": 0.792, + "step": 6794 + }, + { + "epoch": 1.721342621912603, + "grad_norm": 4.019476413726807, + "learning_rate": 7.813105008255185e-06, + "loss": 0.8714, + "step": 6795 + }, + { + "epoch": 1.7215959468017732, + "grad_norm": 3.9796302318573, + "learning_rate": 7.812412307782338e-06, + "loss": 0.8975, + "step": 6796 + }, + { + "epoch": 1.7218492716909437, + "grad_norm": 3.6698226928710938, + "learning_rate": 7.811719528338273e-06, + "loss": 0.6774, + "step": 6797 + }, + { + "epoch": 1.722102596580114, + "grad_norm": 3.5384151935577393, + "learning_rate": 7.811026669942439e-06, + "loss": 0.8, + "step": 6798 + }, + { + "epoch": 1.7223559214692843, + "grad_norm": 3.7324986457824707, + "learning_rate": 7.810333732614294e-06, + "loss": 0.7856, + "step": 6799 + }, + { + "epoch": 1.7226092463584548, + "grad_norm": 3.710684299468994, + "learning_rate": 7.809640716373294e-06, + "loss": 0.8713, + "step": 6800 + }, + { + "epoch": 1.722862571247625, + "grad_norm": 3.9690101146698, + "learning_rate": 7.808947621238903e-06, + "loss": 0.9102, + "step": 6801 + }, + { + "epoch": 1.7231158961367954, + "grad_norm": 3.204129457473755, + "learning_rate": 7.808254447230576e-06, + "loss": 0.7127, + "step": 6802 + }, + { + "epoch": 1.723369221025966, + "grad_norm": 3.3978586196899414, + "learning_rate": 7.807561194367783e-06, + "loss": 0.6568, + "step": 6803 + }, + { + "epoch": 1.7236225459151362, + "grad_norm": 4.004883766174316, + "learning_rate": 7.806867862669985e-06, + "loss": 0.9047, + "step": 6804 + }, + { + "epoch": 1.7238758708043065, + "grad_norm": 3.607483148574829, + "learning_rate": 7.806174452156654e-06, + "loss": 0.6987, + "step": 6805 + }, + { + "epoch": 1.724129195693477, + "grad_norm": 3.898294448852539, + "learning_rate": 7.80548096284726e-06, + "loss": 0.7756, + "step": 6806 + }, + { + "epoch": 1.7243825205826473, + "grad_norm": 3.365597724914551, + "learning_rate": 7.804787394761275e-06, + "loss": 0.6364, + "step": 6807 + }, + { + "epoch": 1.7246358454718176, + "grad_norm": 3.672537088394165, + "learning_rate": 7.804093747918174e-06, + "loss": 0.8562, + "step": 6808 + }, + { + "epoch": 1.724889170360988, + "grad_norm": 3.7284512519836426, + "learning_rate": 7.803400022337435e-06, + "loss": 0.7594, + "step": 6809 + }, + { + "epoch": 1.7251424952501582, + "grad_norm": 3.423053503036499, + "learning_rate": 7.802706218038538e-06, + "loss": 0.7733, + "step": 6810 + }, + { + "epoch": 1.7253958201393287, + "grad_norm": 4.533360481262207, + "learning_rate": 7.802012335040962e-06, + "loss": 0.744, + "step": 6811 + }, + { + "epoch": 1.7256491450284992, + "grad_norm": 3.670292377471924, + "learning_rate": 7.801318373364195e-06, + "loss": 0.6766, + "step": 6812 + }, + { + "epoch": 1.7259024699176693, + "grad_norm": 3.8066697120666504, + "learning_rate": 7.80062433302772e-06, + "loss": 0.7875, + "step": 6813 + }, + { + "epoch": 1.7261557948068398, + "grad_norm": 3.981668710708618, + "learning_rate": 7.799930214051028e-06, + "loss": 0.865, + "step": 6814 + }, + { + "epoch": 1.7264091196960103, + "grad_norm": 3.672137975692749, + "learning_rate": 7.799236016453606e-06, + "loss": 0.7883, + "step": 6815 + }, + { + "epoch": 1.7266624445851804, + "grad_norm": 3.7523081302642822, + "learning_rate": 7.798541740254948e-06, + "loss": 0.6858, + "step": 6816 + }, + { + "epoch": 1.7269157694743509, + "grad_norm": 3.5492067337036133, + "learning_rate": 7.797847385474552e-06, + "loss": 0.7101, + "step": 6817 + }, + { + "epoch": 1.7271690943635212, + "grad_norm": 3.3900222778320312, + "learning_rate": 7.79715295213191e-06, + "loss": 0.6167, + "step": 6818 + }, + { + "epoch": 1.7274224192526915, + "grad_norm": 3.545079231262207, + "learning_rate": 7.796458440246525e-06, + "loss": 0.8016, + "step": 6819 + }, + { + "epoch": 1.727675744141862, + "grad_norm": 3.9680957794189453, + "learning_rate": 7.795763849837898e-06, + "loss": 0.7968, + "step": 6820 + }, + { + "epoch": 1.7279290690310323, + "grad_norm": 3.618556261062622, + "learning_rate": 7.795069180925532e-06, + "loss": 0.7006, + "step": 6821 + }, + { + "epoch": 1.7281823939202026, + "grad_norm": 3.432621955871582, + "learning_rate": 7.794374433528935e-06, + "loss": 0.6631, + "step": 6822 + }, + { + "epoch": 1.728435718809373, + "grad_norm": 3.838395833969116, + "learning_rate": 7.793679607667612e-06, + "loss": 0.9043, + "step": 6823 + }, + { + "epoch": 1.7286890436985434, + "grad_norm": 3.751610040664673, + "learning_rate": 7.792984703361076e-06, + "loss": 0.7957, + "step": 6824 + }, + { + "epoch": 1.7289423685877137, + "grad_norm": 3.9810147285461426, + "learning_rate": 7.792289720628838e-06, + "loss": 0.7891, + "step": 6825 + }, + { + "epoch": 1.7291956934768842, + "grad_norm": 3.6827688217163086, + "learning_rate": 7.791594659490414e-06, + "loss": 0.7953, + "step": 6826 + }, + { + "epoch": 1.7294490183660545, + "grad_norm": 4.050821304321289, + "learning_rate": 7.79089951996532e-06, + "loss": 0.9719, + "step": 6827 + }, + { + "epoch": 1.7297023432552248, + "grad_norm": 3.864492893218994, + "learning_rate": 7.790204302073074e-06, + "loss": 0.773, + "step": 6828 + }, + { + "epoch": 1.7299556681443953, + "grad_norm": 3.3809754848480225, + "learning_rate": 7.789509005833201e-06, + "loss": 0.7368, + "step": 6829 + }, + { + "epoch": 1.7302089930335656, + "grad_norm": 3.8033883571624756, + "learning_rate": 7.78881363126522e-06, + "loss": 0.6927, + "step": 6830 + }, + { + "epoch": 1.7304623179227359, + "grad_norm": 4.084654331207275, + "learning_rate": 7.78811817838866e-06, + "loss": 0.8477, + "step": 6831 + }, + { + "epoch": 1.7307156428119064, + "grad_norm": 3.4256844520568848, + "learning_rate": 7.787422647223052e-06, + "loss": 0.7813, + "step": 6832 + }, + { + "epoch": 1.7309689677010767, + "grad_norm": 4.229806423187256, + "learning_rate": 7.786727037787919e-06, + "loss": 0.8722, + "step": 6833 + }, + { + "epoch": 1.731222292590247, + "grad_norm": 3.398742437362671, + "learning_rate": 7.786031350102796e-06, + "loss": 0.7392, + "step": 6834 + }, + { + "epoch": 1.7314756174794175, + "grad_norm": 3.697896957397461, + "learning_rate": 7.78533558418722e-06, + "loss": 0.8457, + "step": 6835 + }, + { + "epoch": 1.7317289423685878, + "grad_norm": 3.491771936416626, + "learning_rate": 7.784639740060726e-06, + "loss": 0.8108, + "step": 6836 + }, + { + "epoch": 1.731982267257758, + "grad_norm": 4.137701034545898, + "learning_rate": 7.783943817742852e-06, + "loss": 0.7165, + "step": 6837 + }, + { + "epoch": 1.7322355921469286, + "grad_norm": 3.52500581741333, + "learning_rate": 7.783247817253143e-06, + "loss": 0.7965, + "step": 6838 + }, + { + "epoch": 1.7324889170360986, + "grad_norm": 3.7605082988739014, + "learning_rate": 7.782551738611138e-06, + "loss": 0.8274, + "step": 6839 + }, + { + "epoch": 1.7327422419252692, + "grad_norm": 3.7687313556671143, + "learning_rate": 7.781855581836384e-06, + "loss": 0.9359, + "step": 6840 + }, + { + "epoch": 1.7329955668144397, + "grad_norm": 3.346748113632202, + "learning_rate": 7.781159346948431e-06, + "loss": 0.7889, + "step": 6841 + }, + { + "epoch": 1.7332488917036097, + "grad_norm": 3.6077446937561035, + "learning_rate": 7.780463033966824e-06, + "loss": 0.7087, + "step": 6842 + }, + { + "epoch": 1.7335022165927803, + "grad_norm": 4.077489852905273, + "learning_rate": 7.779766642911119e-06, + "loss": 0.7461, + "step": 6843 + }, + { + "epoch": 1.7337555414819505, + "grad_norm": 3.5096306800842285, + "learning_rate": 7.77907017380087e-06, + "loss": 0.7826, + "step": 6844 + }, + { + "epoch": 1.7340088663711208, + "grad_norm": 3.6896841526031494, + "learning_rate": 7.778373626655635e-06, + "loss": 0.8887, + "step": 6845 + }, + { + "epoch": 1.7342621912602914, + "grad_norm": 3.148482322692871, + "learning_rate": 7.77767700149497e-06, + "loss": 0.6511, + "step": 6846 + }, + { + "epoch": 1.7345155161494616, + "grad_norm": 3.640227794647217, + "learning_rate": 7.776980298338435e-06, + "loss": 0.7533, + "step": 6847 + }, + { + "epoch": 1.734768841038632, + "grad_norm": 4.133090496063232, + "learning_rate": 7.776283517205596e-06, + "loss": 0.9398, + "step": 6848 + }, + { + "epoch": 1.7350221659278025, + "grad_norm": 3.8263661861419678, + "learning_rate": 7.775586658116015e-06, + "loss": 0.895, + "step": 6849 + }, + { + "epoch": 1.7352754908169727, + "grad_norm": 3.7483906745910645, + "learning_rate": 7.774889721089262e-06, + "loss": 0.8356, + "step": 6850 + }, + { + "epoch": 1.735528815706143, + "grad_norm": 3.824211359024048, + "learning_rate": 7.77419270614491e-06, + "loss": 0.7526, + "step": 6851 + }, + { + "epoch": 1.7357821405953135, + "grad_norm": 3.5242981910705566, + "learning_rate": 7.773495613302522e-06, + "loss": 0.7361, + "step": 6852 + }, + { + "epoch": 1.7360354654844838, + "grad_norm": 4.070830345153809, + "learning_rate": 7.77279844258168e-06, + "loss": 0.8171, + "step": 6853 + }, + { + "epoch": 1.7362887903736541, + "grad_norm": 3.4309775829315186, + "learning_rate": 7.772101194001955e-06, + "loss": 0.7072, + "step": 6854 + }, + { + "epoch": 1.7365421152628246, + "grad_norm": 4.016650199890137, + "learning_rate": 7.77140386758293e-06, + "loss": 0.7467, + "step": 6855 + }, + { + "epoch": 1.736795440151995, + "grad_norm": 3.6484382152557373, + "learning_rate": 7.770706463344183e-06, + "loss": 0.9306, + "step": 6856 + }, + { + "epoch": 1.7370487650411652, + "grad_norm": 3.8862953186035156, + "learning_rate": 7.770008981305295e-06, + "loss": 0.6854, + "step": 6857 + }, + { + "epoch": 1.7373020899303357, + "grad_norm": 3.322340726852417, + "learning_rate": 7.769311421485855e-06, + "loss": 0.7536, + "step": 6858 + }, + { + "epoch": 1.737555414819506, + "grad_norm": 3.605544090270996, + "learning_rate": 7.768613783905448e-06, + "loss": 0.8643, + "step": 6859 + }, + { + "epoch": 1.7378087397086763, + "grad_norm": 3.678415536880493, + "learning_rate": 7.767916068583662e-06, + "loss": 0.7121, + "step": 6860 + }, + { + "epoch": 1.7380620645978468, + "grad_norm": 3.8269083499908447, + "learning_rate": 7.767218275540092e-06, + "loss": 0.8628, + "step": 6861 + }, + { + "epoch": 1.7383153894870171, + "grad_norm": 3.2733418941497803, + "learning_rate": 7.766520404794329e-06, + "loss": 0.6492, + "step": 6862 + }, + { + "epoch": 1.7385687143761874, + "grad_norm": 4.225980281829834, + "learning_rate": 7.76582245636597e-06, + "loss": 0.8772, + "step": 6863 + }, + { + "epoch": 1.738822039265358, + "grad_norm": 3.494497537612915, + "learning_rate": 7.765124430274613e-06, + "loss": 0.7491, + "step": 6864 + }, + { + "epoch": 1.7390753641545282, + "grad_norm": 3.435108184814453, + "learning_rate": 7.764426326539855e-06, + "loss": 0.6523, + "step": 6865 + }, + { + "epoch": 1.7393286890436985, + "grad_norm": 3.816903829574585, + "learning_rate": 7.763728145181306e-06, + "loss": 0.8311, + "step": 6866 + }, + { + "epoch": 1.739582013932869, + "grad_norm": 3.6468505859375, + "learning_rate": 7.763029886218563e-06, + "loss": 0.7768, + "step": 6867 + }, + { + "epoch": 1.7398353388220391, + "grad_norm": 3.1998164653778076, + "learning_rate": 7.762331549671237e-06, + "loss": 0.6335, + "step": 6868 + }, + { + "epoch": 1.7400886637112096, + "grad_norm": 4.0369791984558105, + "learning_rate": 7.761633135558935e-06, + "loss": 0.7975, + "step": 6869 + }, + { + "epoch": 1.7403419886003801, + "grad_norm": 3.863938093185425, + "learning_rate": 7.760934643901269e-06, + "loss": 0.7428, + "step": 6870 + }, + { + "epoch": 1.7405953134895502, + "grad_norm": 3.4053783416748047, + "learning_rate": 7.760236074717853e-06, + "loss": 0.639, + "step": 6871 + }, + { + "epoch": 1.7408486383787207, + "grad_norm": 3.5806350708007812, + "learning_rate": 7.759537428028302e-06, + "loss": 0.6776, + "step": 6872 + }, + { + "epoch": 1.741101963267891, + "grad_norm": 3.7650678157806396, + "learning_rate": 7.75883870385223e-06, + "loss": 0.7025, + "step": 6873 + }, + { + "epoch": 1.7413552881570613, + "grad_norm": 3.6960256099700928, + "learning_rate": 7.758139902209262e-06, + "loss": 0.6717, + "step": 6874 + }, + { + "epoch": 1.7416086130462318, + "grad_norm": 3.6001505851745605, + "learning_rate": 7.757441023119019e-06, + "loss": 0.8484, + "step": 6875 + }, + { + "epoch": 1.7418619379354021, + "grad_norm": 3.406071662902832, + "learning_rate": 7.756742066601125e-06, + "loss": 0.7173, + "step": 6876 + }, + { + "epoch": 1.7421152628245724, + "grad_norm": 3.5370805263519287, + "learning_rate": 7.756043032675205e-06, + "loss": 0.7778, + "step": 6877 + }, + { + "epoch": 1.742368587713743, + "grad_norm": 3.511587142944336, + "learning_rate": 7.755343921360887e-06, + "loss": 0.7654, + "step": 6878 + }, + { + "epoch": 1.7426219126029132, + "grad_norm": 3.672236442565918, + "learning_rate": 7.754644732677805e-06, + "loss": 0.744, + "step": 6879 + }, + { + "epoch": 1.7428752374920835, + "grad_norm": 3.903546094894409, + "learning_rate": 7.753945466645589e-06, + "loss": 0.7901, + "step": 6880 + }, + { + "epoch": 1.743128562381254, + "grad_norm": 3.4485881328582764, + "learning_rate": 7.753246123283875e-06, + "loss": 0.7518, + "step": 6881 + }, + { + "epoch": 1.7433818872704243, + "grad_norm": 3.6792099475860596, + "learning_rate": 7.752546702612302e-06, + "loss": 0.8035, + "step": 6882 + }, + { + "epoch": 1.7436352121595946, + "grad_norm": 3.692992925643921, + "learning_rate": 7.751847204650505e-06, + "loss": 0.7208, + "step": 6883 + }, + { + "epoch": 1.7438885370487651, + "grad_norm": 3.7795684337615967, + "learning_rate": 7.75114762941813e-06, + "loss": 0.7926, + "step": 6884 + }, + { + "epoch": 1.7441418619379354, + "grad_norm": 4.182426452636719, + "learning_rate": 7.750447976934818e-06, + "loss": 0.8511, + "step": 6885 + }, + { + "epoch": 1.7443951868271057, + "grad_norm": 3.8321166038513184, + "learning_rate": 7.749748247220217e-06, + "loss": 0.862, + "step": 6886 + }, + { + "epoch": 1.7446485117162762, + "grad_norm": 3.795067310333252, + "learning_rate": 7.749048440293973e-06, + "loss": 0.8019, + "step": 6887 + }, + { + "epoch": 1.7449018366054465, + "grad_norm": 3.5560405254364014, + "learning_rate": 7.748348556175738e-06, + "loss": 0.764, + "step": 6888 + }, + { + "epoch": 1.7451551614946168, + "grad_norm": 3.8307154178619385, + "learning_rate": 7.747648594885162e-06, + "loss": 0.7913, + "step": 6889 + }, + { + "epoch": 1.7454084863837873, + "grad_norm": 3.801604747772217, + "learning_rate": 7.746948556441903e-06, + "loss": 0.8011, + "step": 6890 + }, + { + "epoch": 1.7456618112729576, + "grad_norm": 3.8079586029052734, + "learning_rate": 7.746248440865616e-06, + "loss": 0.6707, + "step": 6891 + }, + { + "epoch": 1.745915136162128, + "grad_norm": 3.6608381271362305, + "learning_rate": 7.745548248175958e-06, + "loss": 0.718, + "step": 6892 + }, + { + "epoch": 1.7461684610512984, + "grad_norm": 3.5695316791534424, + "learning_rate": 7.744847978392593e-06, + "loss": 0.6785, + "step": 6893 + }, + { + "epoch": 1.7464217859404685, + "grad_norm": 4.088541030883789, + "learning_rate": 7.744147631535183e-06, + "loss": 0.7726, + "step": 6894 + }, + { + "epoch": 1.746675110829639, + "grad_norm": 3.5368990898132324, + "learning_rate": 7.743447207623394e-06, + "loss": 0.8156, + "step": 6895 + }, + { + "epoch": 1.7469284357188095, + "grad_norm": 4.152857303619385, + "learning_rate": 7.742746706676893e-06, + "loss": 0.7564, + "step": 6896 + }, + { + "epoch": 1.7471817606079796, + "grad_norm": 3.8850326538085938, + "learning_rate": 7.742046128715351e-06, + "loss": 0.8044, + "step": 6897 + }, + { + "epoch": 1.74743508549715, + "grad_norm": 3.511019468307495, + "learning_rate": 7.741345473758438e-06, + "loss": 0.688, + "step": 6898 + }, + { + "epoch": 1.7476884103863206, + "grad_norm": 3.9197492599487305, + "learning_rate": 7.740644741825828e-06, + "loss": 0.9111, + "step": 6899 + }, + { + "epoch": 1.7479417352754907, + "grad_norm": 3.8701112270355225, + "learning_rate": 7.739943932937199e-06, + "loss": 0.6694, + "step": 6900 + }, + { + "epoch": 1.7481950601646612, + "grad_norm": 3.6412601470947266, + "learning_rate": 7.739243047112228e-06, + "loss": 0.6512, + "step": 6901 + }, + { + "epoch": 1.7484483850538315, + "grad_norm": 4.168801307678223, + "learning_rate": 7.738542084370598e-06, + "loss": 0.841, + "step": 6902 + }, + { + "epoch": 1.7487017099430018, + "grad_norm": 3.493496894836426, + "learning_rate": 7.737841044731987e-06, + "loss": 0.7524, + "step": 6903 + }, + { + "epoch": 1.7489550348321723, + "grad_norm": 4.134896278381348, + "learning_rate": 7.737139928216084e-06, + "loss": 0.968, + "step": 6904 + }, + { + "epoch": 1.7492083597213426, + "grad_norm": 3.665043592453003, + "learning_rate": 7.736438734842574e-06, + "loss": 0.8285, + "step": 6905 + }, + { + "epoch": 1.7494616846105129, + "grad_norm": 4.110346794128418, + "learning_rate": 7.735737464631149e-06, + "loss": 0.8057, + "step": 6906 + }, + { + "epoch": 1.7497150094996834, + "grad_norm": 3.8197755813598633, + "learning_rate": 7.735036117601495e-06, + "loss": 0.9154, + "step": 6907 + }, + { + "epoch": 1.7499683343888537, + "grad_norm": 3.5153720378875732, + "learning_rate": 7.73433469377331e-06, + "loss": 0.7457, + "step": 6908 + }, + { + "epoch": 1.750221659278024, + "grad_norm": 3.454111099243164, + "learning_rate": 7.73363319316629e-06, + "loss": 0.7152, + "step": 6909 + }, + { + "epoch": 1.7504749841671945, + "grad_norm": 3.6645593643188477, + "learning_rate": 7.73293161580013e-06, + "loss": 0.8098, + "step": 6910 + }, + { + "epoch": 1.7507283090563648, + "grad_norm": 3.816690444946289, + "learning_rate": 7.732229961694531e-06, + "loss": 0.9133, + "step": 6911 + }, + { + "epoch": 1.750981633945535, + "grad_norm": 3.9455504417419434, + "learning_rate": 7.731528230869194e-06, + "loss": 0.8381, + "step": 6912 + }, + { + "epoch": 1.7512349588347056, + "grad_norm": 3.983278751373291, + "learning_rate": 7.730826423343825e-06, + "loss": 0.8214, + "step": 6913 + }, + { + "epoch": 1.7514882837238759, + "grad_norm": 3.3927857875823975, + "learning_rate": 7.73012453913813e-06, + "loss": 0.7316, + "step": 6914 + }, + { + "epoch": 1.7517416086130462, + "grad_norm": 3.4403159618377686, + "learning_rate": 7.729422578271818e-06, + "loss": 0.7346, + "step": 6915 + }, + { + "epoch": 1.7519949335022167, + "grad_norm": 3.7879765033721924, + "learning_rate": 7.728720540764601e-06, + "loss": 0.8101, + "step": 6916 + }, + { + "epoch": 1.752248258391387, + "grad_norm": 3.7558810710906982, + "learning_rate": 7.728018426636188e-06, + "loss": 0.6966, + "step": 6917 + }, + { + "epoch": 1.7525015832805573, + "grad_norm": 3.143333911895752, + "learning_rate": 7.727316235906294e-06, + "loss": 0.7613, + "step": 6918 + }, + { + "epoch": 1.7527549081697278, + "grad_norm": 4.484152317047119, + "learning_rate": 7.726613968594642e-06, + "loss": 0.9042, + "step": 6919 + }, + { + "epoch": 1.753008233058898, + "grad_norm": 3.9656481742858887, + "learning_rate": 7.725911624720946e-06, + "loss": 0.7662, + "step": 6920 + }, + { + "epoch": 1.7532615579480684, + "grad_norm": 3.409182548522949, + "learning_rate": 7.72520920430493e-06, + "loss": 0.7463, + "step": 6921 + }, + { + "epoch": 1.7535148828372389, + "grad_norm": 3.8630242347717285, + "learning_rate": 7.724506707366317e-06, + "loss": 0.8621, + "step": 6922 + }, + { + "epoch": 1.753768207726409, + "grad_norm": 3.331718921661377, + "learning_rate": 7.72380413392483e-06, + "loss": 0.8027, + "step": 6923 + }, + { + "epoch": 1.7540215326155795, + "grad_norm": 3.4763405323028564, + "learning_rate": 7.723101484000201e-06, + "loss": 0.751, + "step": 6924 + }, + { + "epoch": 1.75427485750475, + "grad_norm": 3.9139089584350586, + "learning_rate": 7.72239875761216e-06, + "loss": 0.7899, + "step": 6925 + }, + { + "epoch": 1.75452818239392, + "grad_norm": 3.5342273712158203, + "learning_rate": 7.721695954780436e-06, + "loss": 0.7511, + "step": 6926 + }, + { + "epoch": 1.7547815072830906, + "grad_norm": 3.237011194229126, + "learning_rate": 7.720993075524768e-06, + "loss": 0.8379, + "step": 6927 + }, + { + "epoch": 1.755034832172261, + "grad_norm": 3.958935499191284, + "learning_rate": 7.720290119864887e-06, + "loss": 0.722, + "step": 6928 + }, + { + "epoch": 1.7552881570614312, + "grad_norm": 3.603102684020996, + "learning_rate": 7.719587087820534e-06, + "loss": 0.7568, + "step": 6929 + }, + { + "epoch": 1.7555414819506017, + "grad_norm": 4.152163982391357, + "learning_rate": 7.71888397941145e-06, + "loss": 0.832, + "step": 6930 + }, + { + "epoch": 1.755794806839772, + "grad_norm": 3.8672590255737305, + "learning_rate": 7.718180794657382e-06, + "loss": 0.7951, + "step": 6931 + }, + { + "epoch": 1.7560481317289423, + "grad_norm": 3.665332078933716, + "learning_rate": 7.717477533578069e-06, + "loss": 0.7543, + "step": 6932 + }, + { + "epoch": 1.7563014566181128, + "grad_norm": 3.344583749771118, + "learning_rate": 7.716774196193259e-06, + "loss": 0.7054, + "step": 6933 + }, + { + "epoch": 1.756554781507283, + "grad_norm": 3.377333164215088, + "learning_rate": 7.716070782522703e-06, + "loss": 0.6812, + "step": 6934 + }, + { + "epoch": 1.7568081063964534, + "grad_norm": 4.1590189933776855, + "learning_rate": 7.715367292586153e-06, + "loss": 0.8372, + "step": 6935 + }, + { + "epoch": 1.7570614312856239, + "grad_norm": 3.581063985824585, + "learning_rate": 7.714663726403363e-06, + "loss": 0.8029, + "step": 6936 + }, + { + "epoch": 1.7573147561747942, + "grad_norm": 3.578697443008423, + "learning_rate": 7.713960083994088e-06, + "loss": 0.7836, + "step": 6937 + }, + { + "epoch": 1.7575680810639644, + "grad_norm": 3.8199892044067383, + "learning_rate": 7.713256365378085e-06, + "loss": 0.7364, + "step": 6938 + }, + { + "epoch": 1.757821405953135, + "grad_norm": 3.588397264480591, + "learning_rate": 7.712552570575114e-06, + "loss": 0.8094, + "step": 6939 + }, + { + "epoch": 1.7580747308423053, + "grad_norm": 3.999547004699707, + "learning_rate": 7.711848699604941e-06, + "loss": 0.7968, + "step": 6940 + }, + { + "epoch": 1.7583280557314755, + "grad_norm": 3.7411811351776123, + "learning_rate": 7.711144752487325e-06, + "loss": 0.7764, + "step": 6941 + }, + { + "epoch": 1.758581380620646, + "grad_norm": 4.176236152648926, + "learning_rate": 7.710440729242034e-06, + "loss": 0.8102, + "step": 6942 + }, + { + "epoch": 1.7588347055098164, + "grad_norm": 4.13679838180542, + "learning_rate": 7.70973662988884e-06, + "loss": 0.8539, + "step": 6943 + }, + { + "epoch": 1.7590880303989866, + "grad_norm": 3.5241591930389404, + "learning_rate": 7.70903245444751e-06, + "loss": 0.7768, + "step": 6944 + }, + { + "epoch": 1.7593413552881572, + "grad_norm": 3.9646356105804443, + "learning_rate": 7.70832820293782e-06, + "loss": 0.8012, + "step": 6945 + }, + { + "epoch": 1.7595946801773275, + "grad_norm": 3.668579339981079, + "learning_rate": 7.707623875379542e-06, + "loss": 0.6945, + "step": 6946 + }, + { + "epoch": 1.7598480050664977, + "grad_norm": 3.9900786876678467, + "learning_rate": 7.706919471792455e-06, + "loss": 0.7609, + "step": 6947 + }, + { + "epoch": 1.7601013299556683, + "grad_norm": 3.8448877334594727, + "learning_rate": 7.706214992196338e-06, + "loss": 0.7881, + "step": 6948 + }, + { + "epoch": 1.7603546548448386, + "grad_norm": 3.9840433597564697, + "learning_rate": 7.705510436610973e-06, + "loss": 0.7961, + "step": 6949 + }, + { + "epoch": 1.7606079797340088, + "grad_norm": 3.714884042739868, + "learning_rate": 7.70480580505614e-06, + "loss": 0.8159, + "step": 6950 + }, + { + "epoch": 1.7608613046231794, + "grad_norm": 3.951197624206543, + "learning_rate": 7.70410109755163e-06, + "loss": 0.8104, + "step": 6951 + }, + { + "epoch": 1.7611146295123494, + "grad_norm": 3.7357263565063477, + "learning_rate": 7.703396314117229e-06, + "loss": 0.8794, + "step": 6952 + }, + { + "epoch": 1.76136795440152, + "grad_norm": 3.921632766723633, + "learning_rate": 7.702691454772727e-06, + "loss": 0.7692, + "step": 6953 + }, + { + "epoch": 1.7616212792906905, + "grad_norm": 3.5576512813568115, + "learning_rate": 7.701986519537914e-06, + "loss": 0.728, + "step": 6954 + }, + { + "epoch": 1.7618746041798605, + "grad_norm": 3.580655813217163, + "learning_rate": 7.701281508432587e-06, + "loss": 0.8122, + "step": 6955 + }, + { + "epoch": 1.762127929069031, + "grad_norm": 3.6133105754852295, + "learning_rate": 7.70057642147654e-06, + "loss": 0.8181, + "step": 6956 + }, + { + "epoch": 1.7623812539582016, + "grad_norm": 3.7095234394073486, + "learning_rate": 7.699871258689574e-06, + "loss": 0.8043, + "step": 6957 + }, + { + "epoch": 1.7626345788473716, + "grad_norm": 4.261842250823975, + "learning_rate": 7.699166020091489e-06, + "loss": 0.7951, + "step": 6958 + }, + { + "epoch": 1.7628879037365421, + "grad_norm": 3.069305896759033, + "learning_rate": 7.698460705702085e-06, + "loss": 0.7193, + "step": 6959 + }, + { + "epoch": 1.7631412286257124, + "grad_norm": 3.832467555999756, + "learning_rate": 7.69775531554117e-06, + "loss": 0.7279, + "step": 6960 + }, + { + "epoch": 1.7633945535148827, + "grad_norm": 3.7485153675079346, + "learning_rate": 7.697049849628551e-06, + "loss": 0.7454, + "step": 6961 + }, + { + "epoch": 1.7636478784040532, + "grad_norm": 3.542663335800171, + "learning_rate": 7.696344307984034e-06, + "loss": 0.755, + "step": 6962 + }, + { + "epoch": 1.7639012032932235, + "grad_norm": 3.6913812160491943, + "learning_rate": 7.695638690627435e-06, + "loss": 0.8295, + "step": 6963 + }, + { + "epoch": 1.7641545281823938, + "grad_norm": 3.591423749923706, + "learning_rate": 7.694932997578565e-06, + "loss": 0.7852, + "step": 6964 + }, + { + "epoch": 1.7644078530715643, + "grad_norm": 3.6839447021484375, + "learning_rate": 7.694227228857239e-06, + "loss": 0.7416, + "step": 6965 + }, + { + "epoch": 1.7646611779607346, + "grad_norm": 3.771757125854492, + "learning_rate": 7.693521384483274e-06, + "loss": 0.7232, + "step": 6966 + }, + { + "epoch": 1.764914502849905, + "grad_norm": 3.684927463531494, + "learning_rate": 7.692815464476491e-06, + "loss": 0.7959, + "step": 6967 + }, + { + "epoch": 1.7651678277390754, + "grad_norm": 3.483133554458618, + "learning_rate": 7.692109468856712e-06, + "loss": 0.825, + "step": 6968 + }, + { + "epoch": 1.7654211526282457, + "grad_norm": 3.5631825923919678, + "learning_rate": 7.691403397643761e-06, + "loss": 0.8089, + "step": 6969 + }, + { + "epoch": 1.765674477517416, + "grad_norm": 3.9040019512176514, + "learning_rate": 7.690697250857465e-06, + "loss": 0.8031, + "step": 6970 + }, + { + "epoch": 1.7659278024065865, + "grad_norm": 4.075888156890869, + "learning_rate": 7.68999102851765e-06, + "loss": 0.7231, + "step": 6971 + }, + { + "epoch": 1.7661811272957568, + "grad_norm": 3.605731964111328, + "learning_rate": 7.689284730644148e-06, + "loss": 0.8215, + "step": 6972 + }, + { + "epoch": 1.7664344521849271, + "grad_norm": 3.3009860515594482, + "learning_rate": 7.688578357256792e-06, + "loss": 0.7531, + "step": 6973 + }, + { + "epoch": 1.7666877770740976, + "grad_norm": 3.64111328125, + "learning_rate": 7.687871908375414e-06, + "loss": 0.8276, + "step": 6974 + }, + { + "epoch": 1.766941101963268, + "grad_norm": 3.633634567260742, + "learning_rate": 7.687165384019855e-06, + "loss": 0.7103, + "step": 6975 + }, + { + "epoch": 1.7671944268524382, + "grad_norm": 3.75003981590271, + "learning_rate": 7.68645878420995e-06, + "loss": 0.6741, + "step": 6976 + }, + { + "epoch": 1.7674477517416087, + "grad_norm": 3.559645414352417, + "learning_rate": 7.685752108965541e-06, + "loss": 0.7214, + "step": 6977 + }, + { + "epoch": 1.767701076630779, + "grad_norm": 3.410543918609619, + "learning_rate": 7.685045358306473e-06, + "loss": 0.8695, + "step": 6978 + }, + { + "epoch": 1.7679544015199493, + "grad_norm": 3.690166473388672, + "learning_rate": 7.68433853225259e-06, + "loss": 0.645, + "step": 6979 + }, + { + "epoch": 1.7682077264091198, + "grad_norm": 3.7979559898376465, + "learning_rate": 7.683631630823737e-06, + "loss": 0.9006, + "step": 6980 + }, + { + "epoch": 1.76846105129829, + "grad_norm": 3.594649076461792, + "learning_rate": 7.682924654039768e-06, + "loss": 0.7457, + "step": 6981 + }, + { + "epoch": 1.7687143761874604, + "grad_norm": 3.811100721359253, + "learning_rate": 7.682217601920529e-06, + "loss": 0.7932, + "step": 6982 + }, + { + "epoch": 1.768967701076631, + "grad_norm": 3.8262460231781006, + "learning_rate": 7.68151047448588e-06, + "loss": 0.9066, + "step": 6983 + }, + { + "epoch": 1.769221025965801, + "grad_norm": 3.9167635440826416, + "learning_rate": 7.680803271755672e-06, + "loss": 0.797, + "step": 6984 + }, + { + "epoch": 1.7694743508549715, + "grad_norm": 3.7758123874664307, + "learning_rate": 7.680095993749763e-06, + "loss": 0.7997, + "step": 6985 + }, + { + "epoch": 1.769727675744142, + "grad_norm": 3.794173240661621, + "learning_rate": 7.679388640488017e-06, + "loss": 0.7015, + "step": 6986 + }, + { + "epoch": 1.769981000633312, + "grad_norm": 3.8901114463806152, + "learning_rate": 7.678681211990293e-06, + "loss": 0.7464, + "step": 6987 + }, + { + "epoch": 1.7702343255224826, + "grad_norm": 3.6819403171539307, + "learning_rate": 7.677973708276456e-06, + "loss": 0.7785, + "step": 6988 + }, + { + "epoch": 1.770487650411653, + "grad_norm": 3.651333808898926, + "learning_rate": 7.677266129366374e-06, + "loss": 0.7664, + "step": 6989 + }, + { + "epoch": 1.7707409753008232, + "grad_norm": 3.645397663116455, + "learning_rate": 7.676558475279911e-06, + "loss": 0.923, + "step": 6990 + }, + { + "epoch": 1.7709943001899937, + "grad_norm": 3.6749892234802246, + "learning_rate": 7.675850746036942e-06, + "loss": 0.827, + "step": 6991 + }, + { + "epoch": 1.771247625079164, + "grad_norm": 3.93105411529541, + "learning_rate": 7.67514294165734e-06, + "loss": 0.8064, + "step": 6992 + }, + { + "epoch": 1.7715009499683343, + "grad_norm": 3.9308536052703857, + "learning_rate": 7.674435062160974e-06, + "loss": 0.7711, + "step": 6993 + }, + { + "epoch": 1.7717542748575048, + "grad_norm": 3.6619958877563477, + "learning_rate": 7.673727107567727e-06, + "loss": 0.8562, + "step": 6994 + }, + { + "epoch": 1.772007599746675, + "grad_norm": 3.6865439414978027, + "learning_rate": 7.673019077897474e-06, + "loss": 0.6428, + "step": 6995 + }, + { + "epoch": 1.7722609246358454, + "grad_norm": 3.309852361679077, + "learning_rate": 7.6723109731701e-06, + "loss": 0.7573, + "step": 6996 + }, + { + "epoch": 1.772514249525016, + "grad_norm": 3.565255880355835, + "learning_rate": 7.671602793405487e-06, + "loss": 0.7421, + "step": 6997 + }, + { + "epoch": 1.7727675744141862, + "grad_norm": 3.638303518295288, + "learning_rate": 7.67089453862352e-06, + "loss": 0.7285, + "step": 6998 + }, + { + "epoch": 1.7730208993033565, + "grad_norm": 3.856306552886963, + "learning_rate": 7.670186208844084e-06, + "loss": 0.8371, + "step": 6999 + }, + { + "epoch": 1.773274224192527, + "grad_norm": 3.933645009994507, + "learning_rate": 7.669477804087073e-06, + "loss": 0.7924, + "step": 7000 + }, + { + "epoch": 1.773274224192527, + "eval_loss": 1.1572972536087036, + "eval_runtime": 13.8931, + "eval_samples_per_second": 28.791, + "eval_steps_per_second": 3.599, + "step": 7000 + }, + { + "epoch": 1.7735275490816973, + "grad_norm": 3.429882526397705, + "learning_rate": 7.668769324372374e-06, + "loss": 0.7985, + "step": 7001 + }, + { + "epoch": 1.7737808739708676, + "grad_norm": 3.635460615158081, + "learning_rate": 7.668060769719885e-06, + "loss": 0.8965, + "step": 7002 + }, + { + "epoch": 1.774034198860038, + "grad_norm": 3.731748580932617, + "learning_rate": 7.6673521401495e-06, + "loss": 0.8512, + "step": 7003 + }, + { + "epoch": 1.7742875237492084, + "grad_norm": 3.781571865081787, + "learning_rate": 7.666643435681117e-06, + "loss": 0.8569, + "step": 7004 + }, + { + "epoch": 1.7745408486383787, + "grad_norm": 3.891633987426758, + "learning_rate": 7.665934656334633e-06, + "loss": 0.85, + "step": 7005 + }, + { + "epoch": 1.7747941735275492, + "grad_norm": 3.5965399742126465, + "learning_rate": 7.665225802129956e-06, + "loss": 0.7723, + "step": 7006 + }, + { + "epoch": 1.7750474984167195, + "grad_norm": 3.403123378753662, + "learning_rate": 7.664516873086987e-06, + "loss": 0.7731, + "step": 7007 + }, + { + "epoch": 1.7753008233058898, + "grad_norm": 3.796185255050659, + "learning_rate": 7.663807869225634e-06, + "loss": 0.9992, + "step": 7008 + }, + { + "epoch": 1.7755541481950603, + "grad_norm": 3.8913798332214355, + "learning_rate": 7.663098790565803e-06, + "loss": 0.7524, + "step": 7009 + }, + { + "epoch": 1.7758074730842304, + "grad_norm": 3.4342589378356934, + "learning_rate": 7.662389637127408e-06, + "loss": 0.6338, + "step": 7010 + }, + { + "epoch": 1.7760607979734009, + "grad_norm": 3.7228524684906006, + "learning_rate": 7.661680408930358e-06, + "loss": 0.8741, + "step": 7011 + }, + { + "epoch": 1.7763141228625714, + "grad_norm": 3.449136972427368, + "learning_rate": 7.66097110599457e-06, + "loss": 0.7831, + "step": 7012 + }, + { + "epoch": 1.7765674477517415, + "grad_norm": 3.6066532135009766, + "learning_rate": 7.660261728339962e-06, + "loss": 0.6429, + "step": 7013 + }, + { + "epoch": 1.776820772640912, + "grad_norm": 4.536827087402344, + "learning_rate": 7.65955227598645e-06, + "loss": 0.9473, + "step": 7014 + }, + { + "epoch": 1.7770740975300825, + "grad_norm": 3.375558853149414, + "learning_rate": 7.658842748953957e-06, + "loss": 0.7842, + "step": 7015 + }, + { + "epoch": 1.7773274224192526, + "grad_norm": 3.7360715866088867, + "learning_rate": 7.658133147262406e-06, + "loss": 0.7374, + "step": 7016 + }, + { + "epoch": 1.777580747308423, + "grad_norm": 3.8486199378967285, + "learning_rate": 7.657423470931721e-06, + "loss": 0.8768, + "step": 7017 + }, + { + "epoch": 1.7778340721975934, + "grad_norm": 4.047220706939697, + "learning_rate": 7.656713719981832e-06, + "loss": 0.8172, + "step": 7018 + }, + { + "epoch": 1.7780873970867637, + "grad_norm": 3.783325672149658, + "learning_rate": 7.656003894432666e-06, + "loss": 0.8239, + "step": 7019 + }, + { + "epoch": 1.7783407219759342, + "grad_norm": 3.9551491737365723, + "learning_rate": 7.655293994304154e-06, + "loss": 0.8671, + "step": 7020 + }, + { + "epoch": 1.7785940468651045, + "grad_norm": 3.8351070880889893, + "learning_rate": 7.654584019616234e-06, + "loss": 0.7545, + "step": 7021 + }, + { + "epoch": 1.7788473717542748, + "grad_norm": 3.5610098838806152, + "learning_rate": 7.653873970388836e-06, + "loss": 0.7242, + "step": 7022 + }, + { + "epoch": 1.7791006966434453, + "grad_norm": 3.9424612522125244, + "learning_rate": 7.653163846641903e-06, + "loss": 0.7859, + "step": 7023 + }, + { + "epoch": 1.7793540215326156, + "grad_norm": 3.8060503005981445, + "learning_rate": 7.652453648395373e-06, + "loss": 0.7809, + "step": 7024 + }, + { + "epoch": 1.7796073464217859, + "grad_norm": 3.557339668273926, + "learning_rate": 7.651743375669184e-06, + "loss": 0.7623, + "step": 7025 + }, + { + "epoch": 1.7798606713109564, + "grad_norm": 3.927065849304199, + "learning_rate": 7.651033028483287e-06, + "loss": 0.8165, + "step": 7026 + }, + { + "epoch": 1.7801139962001267, + "grad_norm": 4.229694366455078, + "learning_rate": 7.650322606857625e-06, + "loss": 0.8584, + "step": 7027 + }, + { + "epoch": 1.780367321089297, + "grad_norm": 4.551065444946289, + "learning_rate": 7.649612110812145e-06, + "loss": 0.8644, + "step": 7028 + }, + { + "epoch": 1.7806206459784675, + "grad_norm": 4.079436779022217, + "learning_rate": 7.6489015403668e-06, + "loss": 0.7658, + "step": 7029 + }, + { + "epoch": 1.7808739708676378, + "grad_norm": 3.6987128257751465, + "learning_rate": 7.64819089554154e-06, + "loss": 0.7306, + "step": 7030 + }, + { + "epoch": 1.781127295756808, + "grad_norm": 3.443990707397461, + "learning_rate": 7.647480176356321e-06, + "loss": 0.837, + "step": 7031 + }, + { + "epoch": 1.7813806206459786, + "grad_norm": 3.928731918334961, + "learning_rate": 7.6467693828311e-06, + "loss": 0.8902, + "step": 7032 + }, + { + "epoch": 1.7816339455351489, + "grad_norm": 3.5158531665802, + "learning_rate": 7.646058514985837e-06, + "loss": 0.7547, + "step": 7033 + }, + { + "epoch": 1.7818872704243192, + "grad_norm": 3.7430777549743652, + "learning_rate": 7.64534757284049e-06, + "loss": 0.8344, + "step": 7034 + }, + { + "epoch": 1.7821405953134897, + "grad_norm": 3.3484010696411133, + "learning_rate": 7.644636556415021e-06, + "loss": 0.7328, + "step": 7035 + }, + { + "epoch": 1.78239392020266, + "grad_norm": 3.6423325538635254, + "learning_rate": 7.643925465729399e-06, + "loss": 0.8076, + "step": 7036 + }, + { + "epoch": 1.7826472450918303, + "grad_norm": 3.464024305343628, + "learning_rate": 7.643214300803587e-06, + "loss": 0.8435, + "step": 7037 + }, + { + "epoch": 1.7829005699810008, + "grad_norm": 3.8605098724365234, + "learning_rate": 7.642503061657558e-06, + "loss": 0.72, + "step": 7038 + }, + { + "epoch": 1.7831538948701708, + "grad_norm": 4.1303277015686035, + "learning_rate": 7.64179174831128e-06, + "loss": 0.8559, + "step": 7039 + }, + { + "epoch": 1.7834072197593414, + "grad_norm": 3.7270262241363525, + "learning_rate": 7.64108036078473e-06, + "loss": 0.8068, + "step": 7040 + }, + { + "epoch": 1.7836605446485119, + "grad_norm": 3.541299819946289, + "learning_rate": 7.640368899097879e-06, + "loss": 0.9051, + "step": 7041 + }, + { + "epoch": 1.783913869537682, + "grad_norm": 3.303861141204834, + "learning_rate": 7.639657363270708e-06, + "loss": 0.7302, + "step": 7042 + }, + { + "epoch": 1.7841671944268525, + "grad_norm": 3.3426244258880615, + "learning_rate": 7.638945753323197e-06, + "loss": 0.7338, + "step": 7043 + }, + { + "epoch": 1.7844205193160227, + "grad_norm": 3.6744649410247803, + "learning_rate": 7.638234069275324e-06, + "loss": 0.7901, + "step": 7044 + }, + { + "epoch": 1.784673844205193, + "grad_norm": 3.590378522872925, + "learning_rate": 7.637522311147075e-06, + "loss": 0.76, + "step": 7045 + }, + { + "epoch": 1.7849271690943636, + "grad_norm": 4.428488254547119, + "learning_rate": 7.636810478958434e-06, + "loss": 0.8213, + "step": 7046 + }, + { + "epoch": 1.7851804939835338, + "grad_norm": 3.6275784969329834, + "learning_rate": 7.636098572729392e-06, + "loss": 0.6797, + "step": 7047 + }, + { + "epoch": 1.7854338188727041, + "grad_norm": 3.5639266967773438, + "learning_rate": 7.63538659247994e-06, + "loss": 0.6374, + "step": 7048 + }, + { + "epoch": 1.7856871437618747, + "grad_norm": 3.8096024990081787, + "learning_rate": 7.634674538230065e-06, + "loss": 0.8411, + "step": 7049 + }, + { + "epoch": 1.785940468651045, + "grad_norm": 3.4755046367645264, + "learning_rate": 7.633962409999765e-06, + "loss": 0.7631, + "step": 7050 + }, + { + "epoch": 1.7861937935402152, + "grad_norm": 3.6860246658325195, + "learning_rate": 7.633250207809034e-06, + "loss": 0.8002, + "step": 7051 + }, + { + "epoch": 1.7864471184293857, + "grad_norm": 3.7806756496429443, + "learning_rate": 7.632537931677871e-06, + "loss": 0.7146, + "step": 7052 + }, + { + "epoch": 1.786700443318556, + "grad_norm": 3.540724039077759, + "learning_rate": 7.631825581626278e-06, + "loss": 0.7278, + "step": 7053 + }, + { + "epoch": 1.7869537682077263, + "grad_norm": 3.7162272930145264, + "learning_rate": 7.631113157674254e-06, + "loss": 0.7267, + "step": 7054 + }, + { + "epoch": 1.7872070930968968, + "grad_norm": 4.121738910675049, + "learning_rate": 7.630400659841807e-06, + "loss": 0.9696, + "step": 7055 + }, + { + "epoch": 1.7874604179860671, + "grad_norm": 3.655700206756592, + "learning_rate": 7.629688088148943e-06, + "loss": 0.7802, + "step": 7056 + }, + { + "epoch": 1.7877137428752374, + "grad_norm": 3.860203742980957, + "learning_rate": 7.628975442615669e-06, + "loss": 0.8842, + "step": 7057 + }, + { + "epoch": 1.787967067764408, + "grad_norm": 3.4483695030212402, + "learning_rate": 7.628262723261999e-06, + "loss": 0.7747, + "step": 7058 + }, + { + "epoch": 1.7882203926535782, + "grad_norm": 3.793168544769287, + "learning_rate": 7.627549930107941e-06, + "loss": 0.8551, + "step": 7059 + }, + { + "epoch": 1.7884737175427485, + "grad_norm": 3.664818286895752, + "learning_rate": 7.626837063173514e-06, + "loss": 0.8862, + "step": 7060 + }, + { + "epoch": 1.788727042431919, + "grad_norm": 3.545707941055298, + "learning_rate": 7.626124122478731e-06, + "loss": 0.7819, + "step": 7061 + }, + { + "epoch": 1.7889803673210893, + "grad_norm": 3.70879864692688, + "learning_rate": 7.625411108043616e-06, + "loss": 0.8376, + "step": 7062 + }, + { + "epoch": 1.7892336922102596, + "grad_norm": 3.9669227600097656, + "learning_rate": 7.624698019888186e-06, + "loss": 0.9086, + "step": 7063 + }, + { + "epoch": 1.7894870170994301, + "grad_norm": 3.8559978008270264, + "learning_rate": 7.623984858032467e-06, + "loss": 0.9308, + "step": 7064 + }, + { + "epoch": 1.7897403419886002, + "grad_norm": 4.075738430023193, + "learning_rate": 7.623271622496482e-06, + "loss": 0.9201, + "step": 7065 + }, + { + "epoch": 1.7899936668777707, + "grad_norm": 3.997243642807007, + "learning_rate": 7.622558313300259e-06, + "loss": 0.7588, + "step": 7066 + }, + { + "epoch": 1.7902469917669412, + "grad_norm": 3.4122109413146973, + "learning_rate": 7.621844930463829e-06, + "loss": 0.7998, + "step": 7067 + }, + { + "epoch": 1.7905003166561113, + "grad_norm": 3.753685712814331, + "learning_rate": 7.621131474007222e-06, + "loss": 0.8078, + "step": 7068 + }, + { + "epoch": 1.7907536415452818, + "grad_norm": 3.4711172580718994, + "learning_rate": 7.620417943950469e-06, + "loss": 0.8868, + "step": 7069 + }, + { + "epoch": 1.7910069664344523, + "grad_norm": 3.7753467559814453, + "learning_rate": 7.619704340313611e-06, + "loss": 0.7757, + "step": 7070 + }, + { + "epoch": 1.7912602913236224, + "grad_norm": 3.4311575889587402, + "learning_rate": 7.618990663116681e-06, + "loss": 0.8055, + "step": 7071 + }, + { + "epoch": 1.791513616212793, + "grad_norm": 3.5160038471221924, + "learning_rate": 7.618276912379723e-06, + "loss": 0.6976, + "step": 7072 + }, + { + "epoch": 1.7917669411019632, + "grad_norm": 4.61091947555542, + "learning_rate": 7.617563088122775e-06, + "loss": 0.741, + "step": 7073 + }, + { + "epoch": 1.7920202659911335, + "grad_norm": 3.6943416595458984, + "learning_rate": 7.616849190365882e-06, + "loss": 0.7657, + "step": 7074 + }, + { + "epoch": 1.792273590880304, + "grad_norm": 3.6395230293273926, + "learning_rate": 7.616135219129093e-06, + "loss": 0.7429, + "step": 7075 + }, + { + "epoch": 1.7925269157694743, + "grad_norm": 3.929011821746826, + "learning_rate": 7.615421174432449e-06, + "loss": 0.7616, + "step": 7076 + }, + { + "epoch": 1.7927802406586446, + "grad_norm": 4.091475009918213, + "learning_rate": 7.614707056296008e-06, + "loss": 0.8471, + "step": 7077 + }, + { + "epoch": 1.7930335655478151, + "grad_norm": 4.02305793762207, + "learning_rate": 7.613992864739816e-06, + "loss": 0.7768, + "step": 7078 + }, + { + "epoch": 1.7932868904369854, + "grad_norm": 3.849973440170288, + "learning_rate": 7.613278599783929e-06, + "loss": 0.7575, + "step": 7079 + }, + { + "epoch": 1.7935402153261557, + "grad_norm": 3.536513090133667, + "learning_rate": 7.612564261448405e-06, + "loss": 0.7613, + "step": 7080 + }, + { + "epoch": 1.7937935402153262, + "grad_norm": 3.466219902038574, + "learning_rate": 7.611849849753301e-06, + "loss": 0.8575, + "step": 7081 + }, + { + "epoch": 1.7940468651044965, + "grad_norm": 3.7162365913391113, + "learning_rate": 7.611135364718677e-06, + "loss": 0.7508, + "step": 7082 + }, + { + "epoch": 1.7943001899936668, + "grad_norm": 3.8962814807891846, + "learning_rate": 7.6104208063645955e-06, + "loss": 0.823, + "step": 7083 + }, + { + "epoch": 1.7945535148828373, + "grad_norm": 3.6473841667175293, + "learning_rate": 7.609706174711122e-06, + "loss": 0.831, + "step": 7084 + }, + { + "epoch": 1.7948068397720076, + "grad_norm": 3.5618393421173096, + "learning_rate": 7.608991469778321e-06, + "loss": 0.7275, + "step": 7085 + }, + { + "epoch": 1.795060164661178, + "grad_norm": 3.5808346271514893, + "learning_rate": 7.608276691586263e-06, + "loss": 0.7537, + "step": 7086 + }, + { + "epoch": 1.7953134895503484, + "grad_norm": 3.967834949493408, + "learning_rate": 7.607561840155019e-06, + "loss": 0.8148, + "step": 7087 + }, + { + "epoch": 1.7955668144395187, + "grad_norm": 3.600773811340332, + "learning_rate": 7.6068469155046595e-06, + "loss": 0.9399, + "step": 7088 + }, + { + "epoch": 1.795820139328689, + "grad_norm": 3.679380178451538, + "learning_rate": 7.606131917655259e-06, + "loss": 0.7059, + "step": 7089 + }, + { + "epoch": 1.7960734642178595, + "grad_norm": 3.6357953548431396, + "learning_rate": 7.605416846626899e-06, + "loss": 0.662, + "step": 7090 + }, + { + "epoch": 1.7963267891070298, + "grad_norm": 3.8809664249420166, + "learning_rate": 7.604701702439652e-06, + "loss": 0.8056, + "step": 7091 + }, + { + "epoch": 1.7965801139962, + "grad_norm": 3.6901090145111084, + "learning_rate": 7.603986485113604e-06, + "loss": 0.7544, + "step": 7092 + }, + { + "epoch": 1.7968334388853706, + "grad_norm": 3.48818302154541, + "learning_rate": 7.603271194668835e-06, + "loss": 0.7084, + "step": 7093 + }, + { + "epoch": 1.7970867637745407, + "grad_norm": 3.4194297790527344, + "learning_rate": 7.60255583112543e-06, + "loss": 0.7291, + "step": 7094 + }, + { + "epoch": 1.7973400886637112, + "grad_norm": 3.7268667221069336, + "learning_rate": 7.601840394503478e-06, + "loss": 0.7406, + "step": 7095 + }, + { + "epoch": 1.7975934135528817, + "grad_norm": 3.3654115200042725, + "learning_rate": 7.601124884823067e-06, + "loss": 0.7558, + "step": 7096 + }, + { + "epoch": 1.7978467384420518, + "grad_norm": 3.6973989009857178, + "learning_rate": 7.600409302104289e-06, + "loss": 0.7835, + "step": 7097 + }, + { + "epoch": 1.7981000633312223, + "grad_norm": 3.9799880981445312, + "learning_rate": 7.5996936463672365e-06, + "loss": 0.9303, + "step": 7098 + }, + { + "epoch": 1.7983533882203928, + "grad_norm": 3.7412776947021484, + "learning_rate": 7.598977917632004e-06, + "loss": 0.7856, + "step": 7099 + }, + { + "epoch": 1.7986067131095629, + "grad_norm": 3.589317798614502, + "learning_rate": 7.59826211591869e-06, + "loss": 0.8086, + "step": 7100 + }, + { + "epoch": 1.7988600379987334, + "grad_norm": 3.5574934482574463, + "learning_rate": 7.597546241247393e-06, + "loss": 0.8053, + "step": 7101 + }, + { + "epoch": 1.7991133628879037, + "grad_norm": 3.4305031299591064, + "learning_rate": 7.596830293638217e-06, + "loss": 0.6677, + "step": 7102 + }, + { + "epoch": 1.799366687777074, + "grad_norm": 3.3890435695648193, + "learning_rate": 7.596114273111262e-06, + "loss": 0.7626, + "step": 7103 + }, + { + "epoch": 1.7996200126662445, + "grad_norm": 3.6799426078796387, + "learning_rate": 7.595398179686635e-06, + "loss": 0.6785, + "step": 7104 + }, + { + "epoch": 1.7998733375554148, + "grad_norm": 3.8996875286102295, + "learning_rate": 7.594682013384442e-06, + "loss": 0.8432, + "step": 7105 + }, + { + "epoch": 1.800126662444585, + "grad_norm": 4.103445529937744, + "learning_rate": 7.593965774224796e-06, + "loss": 0.9385, + "step": 7106 + }, + { + "epoch": 1.8003799873337556, + "grad_norm": 4.211414337158203, + "learning_rate": 7.593249462227807e-06, + "loss": 0.7408, + "step": 7107 + }, + { + "epoch": 1.8006333122229259, + "grad_norm": 3.424152135848999, + "learning_rate": 7.592533077413586e-06, + "loss": 0.7503, + "step": 7108 + }, + { + "epoch": 1.8008866371120962, + "grad_norm": 3.4662649631500244, + "learning_rate": 7.591816619802255e-06, + "loss": 0.7922, + "step": 7109 + }, + { + "epoch": 1.8011399620012667, + "grad_norm": 3.6717116832733154, + "learning_rate": 7.591100089413925e-06, + "loss": 0.7804, + "step": 7110 + }, + { + "epoch": 1.801393286890437, + "grad_norm": 3.3055622577667236, + "learning_rate": 7.59038348626872e-06, + "loss": 0.7876, + "step": 7111 + }, + { + "epoch": 1.8016466117796073, + "grad_norm": 3.6405632495880127, + "learning_rate": 7.589666810386762e-06, + "loss": 0.8317, + "step": 7112 + }, + { + "epoch": 1.8018999366687778, + "grad_norm": 3.5185089111328125, + "learning_rate": 7.5889500617881715e-06, + "loss": 0.88, + "step": 7113 + }, + { + "epoch": 1.802153261557948, + "grad_norm": 3.3841967582702637, + "learning_rate": 7.588233240493078e-06, + "loss": 0.7814, + "step": 7114 + }, + { + "epoch": 1.8024065864471184, + "grad_norm": 3.6781198978424072, + "learning_rate": 7.587516346521608e-06, + "loss": 0.7237, + "step": 7115 + }, + { + "epoch": 1.802659911336289, + "grad_norm": 3.568204641342163, + "learning_rate": 7.586799379893892e-06, + "loss": 0.8026, + "step": 7116 + }, + { + "epoch": 1.8029132362254592, + "grad_norm": 3.2144687175750732, + "learning_rate": 7.586082340630061e-06, + "loss": 0.6116, + "step": 7117 + }, + { + "epoch": 1.8031665611146295, + "grad_norm": 3.9251370429992676, + "learning_rate": 7.58536522875025e-06, + "loss": 0.941, + "step": 7118 + }, + { + "epoch": 1.8034198860038, + "grad_norm": 3.2826364040374756, + "learning_rate": 7.584648044274594e-06, + "loss": 0.7135, + "step": 7119 + }, + { + "epoch": 1.8036732108929703, + "grad_norm": 3.7196340560913086, + "learning_rate": 7.583930787223233e-06, + "loss": 0.8529, + "step": 7120 + }, + { + "epoch": 1.8039265357821406, + "grad_norm": 3.5239436626434326, + "learning_rate": 7.5832134576163085e-06, + "loss": 0.6152, + "step": 7121 + }, + { + "epoch": 1.804179860671311, + "grad_norm": 4.050826072692871, + "learning_rate": 7.58249605547396e-06, + "loss": 0.7714, + "step": 7122 + }, + { + "epoch": 1.8044331855604812, + "grad_norm": 3.6415138244628906, + "learning_rate": 7.581778580816331e-06, + "loss": 0.7681, + "step": 7123 + }, + { + "epoch": 1.8046865104496517, + "grad_norm": 3.7598166465759277, + "learning_rate": 7.58106103366357e-06, + "loss": 0.7684, + "step": 7124 + }, + { + "epoch": 1.8049398353388222, + "grad_norm": 3.0675861835479736, + "learning_rate": 7.580343414035826e-06, + "loss": 0.6349, + "step": 7125 + }, + { + "epoch": 1.8051931602279923, + "grad_norm": 3.739070415496826, + "learning_rate": 7.579625721953247e-06, + "loss": 0.7287, + "step": 7126 + }, + { + "epoch": 1.8054464851171628, + "grad_norm": 3.461012125015259, + "learning_rate": 7.578907957435988e-06, + "loss": 0.666, + "step": 7127 + }, + { + "epoch": 1.8056998100063333, + "grad_norm": 3.56827712059021, + "learning_rate": 7.578190120504202e-06, + "loss": 0.7404, + "step": 7128 + }, + { + "epoch": 1.8059531348955034, + "grad_norm": 4.071817874908447, + "learning_rate": 7.5774722111780454e-06, + "loss": 0.9501, + "step": 7129 + }, + { + "epoch": 1.8062064597846739, + "grad_norm": 3.8854007720947266, + "learning_rate": 7.5767542294776765e-06, + "loss": 0.8478, + "step": 7130 + }, + { + "epoch": 1.8064597846738442, + "grad_norm": 3.4795539379119873, + "learning_rate": 7.576036175423257e-06, + "loss": 0.6753, + "step": 7131 + }, + { + "epoch": 1.8067131095630145, + "grad_norm": 4.221245765686035, + "learning_rate": 7.57531804903495e-06, + "loss": 0.7443, + "step": 7132 + }, + { + "epoch": 1.806966434452185, + "grad_norm": 4.0690813064575195, + "learning_rate": 7.574599850332917e-06, + "loss": 0.7844, + "step": 7133 + }, + { + "epoch": 1.8072197593413553, + "grad_norm": 4.071067810058594, + "learning_rate": 7.573881579337327e-06, + "loss": 0.8057, + "step": 7134 + }, + { + "epoch": 1.8074730842305256, + "grad_norm": 3.6989645957946777, + "learning_rate": 7.57316323606835e-06, + "loss": 0.7898, + "step": 7135 + }, + { + "epoch": 1.807726409119696, + "grad_norm": 3.796231985092163, + "learning_rate": 7.572444820546157e-06, + "loss": 0.6651, + "step": 7136 + }, + { + "epoch": 1.8079797340088664, + "grad_norm": 3.7054240703582764, + "learning_rate": 7.571726332790916e-06, + "loss": 0.6976, + "step": 7137 + }, + { + "epoch": 1.8082330588980366, + "grad_norm": 3.9299869537353516, + "learning_rate": 7.571007772822807e-06, + "loss": 0.8387, + "step": 7138 + }, + { + "epoch": 1.8084863837872072, + "grad_norm": 4.315001010894775, + "learning_rate": 7.570289140662004e-06, + "loss": 0.8349, + "step": 7139 + }, + { + "epoch": 1.8087397086763775, + "grad_norm": 3.631094217300415, + "learning_rate": 7.569570436328686e-06, + "loss": 0.7793, + "step": 7140 + }, + { + "epoch": 1.8089930335655477, + "grad_norm": 4.102321147918701, + "learning_rate": 7.568851659843035e-06, + "loss": 0.8238, + "step": 7141 + }, + { + "epoch": 1.8092463584547183, + "grad_norm": 3.266963243484497, + "learning_rate": 7.568132811225233e-06, + "loss": 0.6934, + "step": 7142 + }, + { + "epoch": 1.8094996833438886, + "grad_norm": 3.3723645210266113, + "learning_rate": 7.567413890495465e-06, + "loss": 0.7455, + "step": 7143 + }, + { + "epoch": 1.8097530082330588, + "grad_norm": 3.8398401737213135, + "learning_rate": 7.5666948976739175e-06, + "loss": 0.9356, + "step": 7144 + }, + { + "epoch": 1.8100063331222294, + "grad_norm": 4.05166482925415, + "learning_rate": 7.565975832780781e-06, + "loss": 0.8372, + "step": 7145 + }, + { + "epoch": 1.8102596580113997, + "grad_norm": 3.7039108276367188, + "learning_rate": 7.565256695836247e-06, + "loss": 0.8427, + "step": 7146 + }, + { + "epoch": 1.81051298290057, + "grad_norm": 3.8387906551361084, + "learning_rate": 7.564537486860506e-06, + "loss": 0.8148, + "step": 7147 + }, + { + "epoch": 1.8107663077897405, + "grad_norm": 3.8914530277252197, + "learning_rate": 7.5638182058737545e-06, + "loss": 0.8513, + "step": 7148 + }, + { + "epoch": 1.8110196326789108, + "grad_norm": 3.7207016944885254, + "learning_rate": 7.563098852896189e-06, + "loss": 0.8369, + "step": 7149 + }, + { + "epoch": 1.811272957568081, + "grad_norm": 3.7874226570129395, + "learning_rate": 7.56237942794801e-06, + "loss": 0.7269, + "step": 7150 + }, + { + "epoch": 1.8115262824572516, + "grad_norm": 3.577648878097534, + "learning_rate": 7.561659931049418e-06, + "loss": 0.8044, + "step": 7151 + }, + { + "epoch": 1.8117796073464216, + "grad_norm": 3.5661303997039795, + "learning_rate": 7.560940362220614e-06, + "loss": 0.7571, + "step": 7152 + }, + { + "epoch": 1.8120329322355921, + "grad_norm": 3.839750051498413, + "learning_rate": 7.560220721481806e-06, + "loss": 0.8257, + "step": 7153 + }, + { + "epoch": 1.8122862571247627, + "grad_norm": 3.900562047958374, + "learning_rate": 7.559501008853201e-06, + "loss": 0.8372, + "step": 7154 + }, + { + "epoch": 1.8125395820139327, + "grad_norm": 3.8204472064971924, + "learning_rate": 7.558781224355005e-06, + "loss": 0.8736, + "step": 7155 + }, + { + "epoch": 1.8127929069031032, + "grad_norm": 3.8297312259674072, + "learning_rate": 7.558061368007432e-06, + "loss": 0.7803, + "step": 7156 + }, + { + "epoch": 1.8130462317922738, + "grad_norm": 3.4910902976989746, + "learning_rate": 7.557341439830694e-06, + "loss": 0.8462, + "step": 7157 + }, + { + "epoch": 1.8132995566814438, + "grad_norm": 3.6622111797332764, + "learning_rate": 7.556621439845009e-06, + "loss": 0.7792, + "step": 7158 + }, + { + "epoch": 1.8135528815706143, + "grad_norm": 4.028879642486572, + "learning_rate": 7.555901368070591e-06, + "loss": 1.0484, + "step": 7159 + }, + { + "epoch": 1.8138062064597846, + "grad_norm": 3.211138963699341, + "learning_rate": 7.5551812245276615e-06, + "loss": 0.739, + "step": 7160 + }, + { + "epoch": 1.814059531348955, + "grad_norm": 3.726255178451538, + "learning_rate": 7.5544610092364405e-06, + "loss": 0.7369, + "step": 7161 + }, + { + "epoch": 1.8143128562381254, + "grad_norm": 3.3407065868377686, + "learning_rate": 7.553740722217151e-06, + "loss": 0.7547, + "step": 7162 + }, + { + "epoch": 1.8145661811272957, + "grad_norm": 3.7033278942108154, + "learning_rate": 7.553020363490018e-06, + "loss": 0.8514, + "step": 7163 + }, + { + "epoch": 1.814819506016466, + "grad_norm": 3.4461936950683594, + "learning_rate": 7.552299933075271e-06, + "loss": 0.7574, + "step": 7164 + }, + { + "epoch": 1.8150728309056365, + "grad_norm": 3.6751558780670166, + "learning_rate": 7.551579430993138e-06, + "loss": 0.8685, + "step": 7165 + }, + { + "epoch": 1.8153261557948068, + "grad_norm": 3.683856725692749, + "learning_rate": 7.550858857263851e-06, + "loss": 0.8602, + "step": 7166 + }, + { + "epoch": 1.8155794806839771, + "grad_norm": 3.5902152061462402, + "learning_rate": 7.550138211907643e-06, + "loss": 0.7665, + "step": 7167 + }, + { + "epoch": 1.8158328055731476, + "grad_norm": 3.73374080657959, + "learning_rate": 7.549417494944748e-06, + "loss": 0.7848, + "step": 7168 + }, + { + "epoch": 1.816086130462318, + "grad_norm": 4.09006404876709, + "learning_rate": 7.548696706395403e-06, + "loss": 0.9717, + "step": 7169 + }, + { + "epoch": 1.8163394553514882, + "grad_norm": 4.417375087738037, + "learning_rate": 7.5479758462798515e-06, + "loss": 0.8825, + "step": 7170 + }, + { + "epoch": 1.8165927802406587, + "grad_norm": 3.414821147918701, + "learning_rate": 7.547254914618332e-06, + "loss": 0.6906, + "step": 7171 + }, + { + "epoch": 1.816846105129829, + "grad_norm": 3.8782753944396973, + "learning_rate": 7.5465339114310885e-06, + "loss": 0.91, + "step": 7172 + }, + { + "epoch": 1.8170994300189993, + "grad_norm": 3.4348368644714355, + "learning_rate": 7.545812836738366e-06, + "loss": 0.7046, + "step": 7173 + }, + { + "epoch": 1.8173527549081698, + "grad_norm": 3.380103826522827, + "learning_rate": 7.545091690560411e-06, + "loss": 0.751, + "step": 7174 + }, + { + "epoch": 1.8176060797973401, + "grad_norm": 3.654703140258789, + "learning_rate": 7.544370472917477e-06, + "loss": 0.8529, + "step": 7175 + }, + { + "epoch": 1.8178594046865104, + "grad_norm": 3.8183515071868896, + "learning_rate": 7.5436491838298105e-06, + "loss": 0.8491, + "step": 7176 + }, + { + "epoch": 1.818112729575681, + "grad_norm": 3.9947562217712402, + "learning_rate": 7.542927823317667e-06, + "loss": 0.7692, + "step": 7177 + }, + { + "epoch": 1.8183660544648512, + "grad_norm": 3.634908676147461, + "learning_rate": 7.5422063914013034e-06, + "loss": 0.7804, + "step": 7178 + }, + { + "epoch": 1.8186193793540215, + "grad_norm": 4.077807426452637, + "learning_rate": 7.541484888100974e-06, + "loss": 0.8387, + "step": 7179 + }, + { + "epoch": 1.818872704243192, + "grad_norm": 3.756589889526367, + "learning_rate": 7.54076331343694e-06, + "loss": 0.7606, + "step": 7180 + }, + { + "epoch": 1.819126029132362, + "grad_norm": 3.5974605083465576, + "learning_rate": 7.5400416674294655e-06, + "loss": 0.8288, + "step": 7181 + }, + { + "epoch": 1.8193793540215326, + "grad_norm": 3.844635486602783, + "learning_rate": 7.539319950098809e-06, + "loss": 0.7886, + "step": 7182 + }, + { + "epoch": 1.8196326789107031, + "grad_norm": 3.892279863357544, + "learning_rate": 7.538598161465239e-06, + "loss": 0.756, + "step": 7183 + }, + { + "epoch": 1.8198860037998732, + "grad_norm": 3.7107455730438232, + "learning_rate": 7.537876301549023e-06, + "loss": 0.6994, + "step": 7184 + }, + { + "epoch": 1.8201393286890437, + "grad_norm": 3.5131747722625732, + "learning_rate": 7.537154370370429e-06, + "loss": 0.7482, + "step": 7185 + }, + { + "epoch": 1.8203926535782142, + "grad_norm": 3.977311611175537, + "learning_rate": 7.536432367949731e-06, + "loss": 0.8869, + "step": 7186 + }, + { + "epoch": 1.8206459784673843, + "grad_norm": 3.570775270462036, + "learning_rate": 7.535710294307199e-06, + "loss": 0.7425, + "step": 7187 + }, + { + "epoch": 1.8208993033565548, + "grad_norm": 3.5198025703430176, + "learning_rate": 7.534988149463112e-06, + "loss": 0.8329, + "step": 7188 + }, + { + "epoch": 1.821152628245725, + "grad_norm": 3.6174983978271484, + "learning_rate": 7.534265933437746e-06, + "loss": 0.7919, + "step": 7189 + }, + { + "epoch": 1.8214059531348954, + "grad_norm": 4.100408554077148, + "learning_rate": 7.53354364625138e-06, + "loss": 0.8786, + "step": 7190 + }, + { + "epoch": 1.821659278024066, + "grad_norm": 3.5623393058776855, + "learning_rate": 7.532821287924294e-06, + "loss": 0.8098, + "step": 7191 + }, + { + "epoch": 1.8219126029132362, + "grad_norm": 3.5216543674468994, + "learning_rate": 7.5320988584767755e-06, + "loss": 0.6888, + "step": 7192 + }, + { + "epoch": 1.8221659278024065, + "grad_norm": 4.106544494628906, + "learning_rate": 7.5313763579291055e-06, + "loss": 0.9342, + "step": 7193 + }, + { + "epoch": 1.822419252691577, + "grad_norm": 3.917630672454834, + "learning_rate": 7.530653786301575e-06, + "loss": 1.0038, + "step": 7194 + }, + { + "epoch": 1.8226725775807473, + "grad_norm": 3.456974744796753, + "learning_rate": 7.529931143614473e-06, + "loss": 0.7547, + "step": 7195 + }, + { + "epoch": 1.8229259024699176, + "grad_norm": 3.7777621746063232, + "learning_rate": 7.529208429888089e-06, + "loss": 0.7315, + "step": 7196 + }, + { + "epoch": 1.823179227359088, + "grad_norm": 3.515810251235962, + "learning_rate": 7.528485645142718e-06, + "loss": 0.7033, + "step": 7197 + }, + { + "epoch": 1.8234325522482584, + "grad_norm": 4.235123634338379, + "learning_rate": 7.527762789398656e-06, + "loss": 0.8681, + "step": 7198 + }, + { + "epoch": 1.8236858771374287, + "grad_norm": 3.604100227355957, + "learning_rate": 7.527039862676198e-06, + "loss": 0.7855, + "step": 7199 + }, + { + "epoch": 1.8239392020265992, + "grad_norm": 3.5925567150115967, + "learning_rate": 7.526316864995648e-06, + "loss": 0.6977, + "step": 7200 + }, + { + "epoch": 1.8241925269157695, + "grad_norm": 3.774470567703247, + "learning_rate": 7.525593796377302e-06, + "loss": 0.7816, + "step": 7201 + }, + { + "epoch": 1.8244458518049398, + "grad_norm": 3.5756449699401855, + "learning_rate": 7.524870656841466e-06, + "loss": 0.7182, + "step": 7202 + }, + { + "epoch": 1.8246991766941103, + "grad_norm": 3.6559081077575684, + "learning_rate": 7.524147446408445e-06, + "loss": 0.7439, + "step": 7203 + }, + { + "epoch": 1.8249525015832806, + "grad_norm": 3.5166289806365967, + "learning_rate": 7.523424165098547e-06, + "loss": 0.7562, + "step": 7204 + }, + { + "epoch": 1.8252058264724509, + "grad_norm": 3.5432941913604736, + "learning_rate": 7.522700812932082e-06, + "loss": 0.7382, + "step": 7205 + }, + { + "epoch": 1.8254591513616214, + "grad_norm": 4.274906158447266, + "learning_rate": 7.5219773899293605e-06, + "loss": 0.9015, + "step": 7206 + }, + { + "epoch": 1.8257124762507917, + "grad_norm": 4.043605804443359, + "learning_rate": 7.521253896110695e-06, + "loss": 0.9145, + "step": 7207 + }, + { + "epoch": 1.825965801139962, + "grad_norm": 3.766462564468384, + "learning_rate": 7.520530331496403e-06, + "loss": 0.6623, + "step": 7208 + }, + { + "epoch": 1.8262191260291325, + "grad_norm": 4.061577796936035, + "learning_rate": 7.519806696106799e-06, + "loss": 0.9031, + "step": 7209 + }, + { + "epoch": 1.8264724509183026, + "grad_norm": 3.7861196994781494, + "learning_rate": 7.5190829899622055e-06, + "loss": 0.8022, + "step": 7210 + }, + { + "epoch": 1.826725775807473, + "grad_norm": 4.028881549835205, + "learning_rate": 7.5183592130829415e-06, + "loss": 0.8434, + "step": 7211 + }, + { + "epoch": 1.8269791006966436, + "grad_norm": 3.5819966793060303, + "learning_rate": 7.517635365489331e-06, + "loss": 0.7136, + "step": 7212 + }, + { + "epoch": 1.8272324255858137, + "grad_norm": 3.858046293258667, + "learning_rate": 7.516911447201699e-06, + "loss": 0.7895, + "step": 7213 + }, + { + "epoch": 1.8274857504749842, + "grad_norm": 3.5602328777313232, + "learning_rate": 7.516187458240374e-06, + "loss": 0.7321, + "step": 7214 + }, + { + "epoch": 1.8277390753641545, + "grad_norm": 4.064763069152832, + "learning_rate": 7.5154633986256845e-06, + "loss": 0.8498, + "step": 7215 + }, + { + "epoch": 1.8279924002533248, + "grad_norm": 3.7554080486297607, + "learning_rate": 7.514739268377962e-06, + "loss": 0.811, + "step": 7216 + }, + { + "epoch": 1.8282457251424953, + "grad_norm": 3.447010040283203, + "learning_rate": 7.514015067517537e-06, + "loss": 0.6723, + "step": 7217 + }, + { + "epoch": 1.8284990500316656, + "grad_norm": 3.7167718410491943, + "learning_rate": 7.5132907960647495e-06, + "loss": 0.7881, + "step": 7218 + }, + { + "epoch": 1.8287523749208359, + "grad_norm": 4.058993816375732, + "learning_rate": 7.512566454039934e-06, + "loss": 0.6933, + "step": 7219 + }, + { + "epoch": 1.8290056998100064, + "grad_norm": 4.2340617179870605, + "learning_rate": 7.51184204146343e-06, + "loss": 0.8528, + "step": 7220 + }, + { + "epoch": 1.8292590246991767, + "grad_norm": 3.6348915100097656, + "learning_rate": 7.511117558355579e-06, + "loss": 0.7332, + "step": 7221 + }, + { + "epoch": 1.829512349588347, + "grad_norm": 3.662551164627075, + "learning_rate": 7.510393004736723e-06, + "loss": 0.7758, + "step": 7222 + }, + { + "epoch": 1.8297656744775175, + "grad_norm": 4.1006317138671875, + "learning_rate": 7.509668380627207e-06, + "loss": 0.9005, + "step": 7223 + }, + { + "epoch": 1.8300189993666878, + "grad_norm": 4.0843186378479, + "learning_rate": 7.508943686047381e-06, + "loss": 0.8336, + "step": 7224 + }, + { + "epoch": 1.830272324255858, + "grad_norm": 3.8807265758514404, + "learning_rate": 7.508218921017593e-06, + "loss": 0.8249, + "step": 7225 + }, + { + "epoch": 1.8305256491450286, + "grad_norm": 3.4240353107452393, + "learning_rate": 7.5074940855581915e-06, + "loss": 0.7612, + "step": 7226 + }, + { + "epoch": 1.8307789740341989, + "grad_norm": 3.7285470962524414, + "learning_rate": 7.506769179689531e-06, + "loss": 0.8268, + "step": 7227 + }, + { + "epoch": 1.8310322989233692, + "grad_norm": 3.384099006652832, + "learning_rate": 7.506044203431968e-06, + "loss": 0.843, + "step": 7228 + }, + { + "epoch": 1.8312856238125397, + "grad_norm": 3.5275726318359375, + "learning_rate": 7.505319156805857e-06, + "loss": 0.7429, + "step": 7229 + }, + { + "epoch": 1.83153894870171, + "grad_norm": 3.8868675231933594, + "learning_rate": 7.504594039831559e-06, + "loss": 0.8878, + "step": 7230 + }, + { + "epoch": 1.8317922735908803, + "grad_norm": 3.7927005290985107, + "learning_rate": 7.503868852529435e-06, + "loss": 0.6936, + "step": 7231 + }, + { + "epoch": 1.8320455984800508, + "grad_norm": 3.8648810386657715, + "learning_rate": 7.503143594919846e-06, + "loss": 0.7206, + "step": 7232 + }, + { + "epoch": 1.832298923369221, + "grad_norm": 4.037606239318848, + "learning_rate": 7.5024182670231586e-06, + "loss": 0.8805, + "step": 7233 + }, + { + "epoch": 1.8325522482583914, + "grad_norm": 3.4119715690612793, + "learning_rate": 7.50169286885974e-06, + "loss": 0.8197, + "step": 7234 + }, + { + "epoch": 1.8328055731475619, + "grad_norm": 3.813147783279419, + "learning_rate": 7.500967400449957e-06, + "loss": 0.7418, + "step": 7235 + }, + { + "epoch": 1.833058898036732, + "grad_norm": 3.903792142868042, + "learning_rate": 7.5002418618141815e-06, + "loss": 0.8382, + "step": 7236 + }, + { + "epoch": 1.8333122229259025, + "grad_norm": 4.018380641937256, + "learning_rate": 7.499516252972786e-06, + "loss": 0.7733, + "step": 7237 + }, + { + "epoch": 1.833565547815073, + "grad_norm": 3.7444024085998535, + "learning_rate": 7.498790573946146e-06, + "loss": 0.839, + "step": 7238 + }, + { + "epoch": 1.833818872704243, + "grad_norm": 3.6033787727355957, + "learning_rate": 7.498064824754638e-06, + "loss": 0.7652, + "step": 7239 + }, + { + "epoch": 1.8340721975934136, + "grad_norm": 3.655672073364258, + "learning_rate": 7.49733900541864e-06, + "loss": 0.7462, + "step": 7240 + }, + { + "epoch": 1.834325522482584, + "grad_norm": 3.443094491958618, + "learning_rate": 7.4966131159585344e-06, + "loss": 0.7305, + "step": 7241 + }, + { + "epoch": 1.8345788473717541, + "grad_norm": 3.951950788497925, + "learning_rate": 7.495887156394701e-06, + "loss": 0.8662, + "step": 7242 + }, + { + "epoch": 1.8348321722609247, + "grad_norm": 3.543978214263916, + "learning_rate": 7.4951611267475265e-06, + "loss": 0.6832, + "step": 7243 + }, + { + "epoch": 1.835085497150095, + "grad_norm": 3.825437307357788, + "learning_rate": 7.494435027037396e-06, + "loss": 0.8139, + "step": 7244 + }, + { + "epoch": 1.8353388220392652, + "grad_norm": 3.6683316230773926, + "learning_rate": 7.4937088572847015e-06, + "loss": 0.7841, + "step": 7245 + }, + { + "epoch": 1.8355921469284358, + "grad_norm": 3.8685595989227295, + "learning_rate": 7.49298261750983e-06, + "loss": 0.9504, + "step": 7246 + }, + { + "epoch": 1.835845471817606, + "grad_norm": 3.4584453105926514, + "learning_rate": 7.492256307733174e-06, + "loss": 0.6819, + "step": 7247 + }, + { + "epoch": 1.8360987967067763, + "grad_norm": 4.211421489715576, + "learning_rate": 7.49152992797513e-06, + "loss": 0.7561, + "step": 7248 + }, + { + "epoch": 1.8363521215959469, + "grad_norm": 3.5700526237487793, + "learning_rate": 7.490803478256095e-06, + "loss": 0.839, + "step": 7249 + }, + { + "epoch": 1.8366054464851171, + "grad_norm": 3.460155963897705, + "learning_rate": 7.490076958596463e-06, + "loss": 0.7964, + "step": 7250 + }, + { + "epoch": 1.8368587713742874, + "grad_norm": 3.575927972793579, + "learning_rate": 7.489350369016637e-06, + "loss": 0.759, + "step": 7251 + }, + { + "epoch": 1.837112096263458, + "grad_norm": 3.743602752685547, + "learning_rate": 7.488623709537021e-06, + "loss": 0.8683, + "step": 7252 + }, + { + "epoch": 1.8373654211526282, + "grad_norm": 3.7064273357391357, + "learning_rate": 7.487896980178015e-06, + "loss": 0.774, + "step": 7253 + }, + { + "epoch": 1.8376187460417985, + "grad_norm": 3.5614736080169678, + "learning_rate": 7.4871701809600304e-06, + "loss": 0.7556, + "step": 7254 + }, + { + "epoch": 1.837872070930969, + "grad_norm": 3.733701705932617, + "learning_rate": 7.486443311903472e-06, + "loss": 0.7736, + "step": 7255 + }, + { + "epoch": 1.8381253958201393, + "grad_norm": 3.7013964653015137, + "learning_rate": 7.485716373028751e-06, + "loss": 0.7774, + "step": 7256 + }, + { + "epoch": 1.8383787207093096, + "grad_norm": 3.250471353530884, + "learning_rate": 7.4849893643562784e-06, + "loss": 0.748, + "step": 7257 + }, + { + "epoch": 1.8386320455984801, + "grad_norm": 3.583418846130371, + "learning_rate": 7.484262285906469e-06, + "loss": 0.8087, + "step": 7258 + }, + { + "epoch": 1.8388853704876504, + "grad_norm": 3.2715556621551514, + "learning_rate": 7.483535137699741e-06, + "loss": 0.7186, + "step": 7259 + }, + { + "epoch": 1.8391386953768207, + "grad_norm": 4.001896381378174, + "learning_rate": 7.48280791975651e-06, + "loss": 0.9446, + "step": 7260 + }, + { + "epoch": 1.8393920202659912, + "grad_norm": 3.499667167663574, + "learning_rate": 7.482080632097194e-06, + "loss": 0.7591, + "step": 7261 + }, + { + "epoch": 1.8396453451551615, + "grad_norm": 3.9465816020965576, + "learning_rate": 7.481353274742218e-06, + "loss": 0.8264, + "step": 7262 + }, + { + "epoch": 1.8398986700443318, + "grad_norm": 3.7553086280822754, + "learning_rate": 7.480625847712005e-06, + "loss": 0.8215, + "step": 7263 + }, + { + "epoch": 1.8401519949335023, + "grad_norm": 3.6734864711761475, + "learning_rate": 7.479898351026982e-06, + "loss": 0.7447, + "step": 7264 + }, + { + "epoch": 1.8404053198226724, + "grad_norm": 3.592437982559204, + "learning_rate": 7.479170784707574e-06, + "loss": 0.8483, + "step": 7265 + }, + { + "epoch": 1.840658644711843, + "grad_norm": 3.403601884841919, + "learning_rate": 7.478443148774214e-06, + "loss": 0.7511, + "step": 7266 + }, + { + "epoch": 1.8409119696010134, + "grad_norm": 4.034810543060303, + "learning_rate": 7.477715443247331e-06, + "loss": 0.8785, + "step": 7267 + }, + { + "epoch": 1.8411652944901835, + "grad_norm": 3.768348455429077, + "learning_rate": 7.4769876681473595e-06, + "loss": 0.8305, + "step": 7268 + }, + { + "epoch": 1.841418619379354, + "grad_norm": 3.432314872741699, + "learning_rate": 7.4762598234947345e-06, + "loss": 0.8843, + "step": 7269 + }, + { + "epoch": 1.8416719442685245, + "grad_norm": 3.3588666915893555, + "learning_rate": 7.475531909309896e-06, + "loss": 0.8027, + "step": 7270 + }, + { + "epoch": 1.8419252691576946, + "grad_norm": 3.7070810794830322, + "learning_rate": 7.4748039256132795e-06, + "loss": 0.759, + "step": 7271 + }, + { + "epoch": 1.8421785940468651, + "grad_norm": 3.649130344390869, + "learning_rate": 7.474075872425331e-06, + "loss": 0.8145, + "step": 7272 + }, + { + "epoch": 1.8424319189360354, + "grad_norm": 3.6345674991607666, + "learning_rate": 7.47334774976649e-06, + "loss": 0.6566, + "step": 7273 + }, + { + "epoch": 1.8426852438252057, + "grad_norm": 3.7379961013793945, + "learning_rate": 7.4726195576572035e-06, + "loss": 0.6899, + "step": 7274 + }, + { + "epoch": 1.8429385687143762, + "grad_norm": 3.89823579788208, + "learning_rate": 7.471891296117919e-06, + "loss": 0.8179, + "step": 7275 + }, + { + "epoch": 1.8431918936035465, + "grad_norm": 3.2547085285186768, + "learning_rate": 7.4711629651690855e-06, + "loss": 0.6627, + "step": 7276 + }, + { + "epoch": 1.8434452184927168, + "grad_norm": 3.5351154804229736, + "learning_rate": 7.470434564831154e-06, + "loss": 0.7797, + "step": 7277 + }, + { + "epoch": 1.8436985433818873, + "grad_norm": 3.759066343307495, + "learning_rate": 7.469706095124578e-06, + "loss": 0.7311, + "step": 7278 + }, + { + "epoch": 1.8439518682710576, + "grad_norm": 3.729231357574463, + "learning_rate": 7.468977556069812e-06, + "loss": 0.7887, + "step": 7279 + }, + { + "epoch": 1.844205193160228, + "grad_norm": 3.930779218673706, + "learning_rate": 7.4682489476873136e-06, + "loss": 0.8708, + "step": 7280 + }, + { + "epoch": 1.8444585180493984, + "grad_norm": 3.8201029300689697, + "learning_rate": 7.467520269997541e-06, + "loss": 0.8285, + "step": 7281 + }, + { + "epoch": 1.8447118429385687, + "grad_norm": 4.230695724487305, + "learning_rate": 7.4667915230209565e-06, + "loss": 0.8228, + "step": 7282 + }, + { + "epoch": 1.844965167827739, + "grad_norm": 3.5004734992980957, + "learning_rate": 7.466062706778021e-06, + "loss": 0.7594, + "step": 7283 + }, + { + "epoch": 1.8452184927169095, + "grad_norm": 4.275425434112549, + "learning_rate": 7.4653338212892026e-06, + "loss": 0.9477, + "step": 7284 + }, + { + "epoch": 1.8454718176060798, + "grad_norm": 3.7084357738494873, + "learning_rate": 7.464604866574965e-06, + "loss": 0.7995, + "step": 7285 + }, + { + "epoch": 1.84572514249525, + "grad_norm": 4.5522871017456055, + "learning_rate": 7.463875842655776e-06, + "loss": 0.8223, + "step": 7286 + }, + { + "epoch": 1.8459784673844206, + "grad_norm": 4.052094459533691, + "learning_rate": 7.46314674955211e-06, + "loss": 0.7677, + "step": 7287 + }, + { + "epoch": 1.846231792273591, + "grad_norm": 4.135094165802002, + "learning_rate": 7.462417587284438e-06, + "loss": 0.8058, + "step": 7288 + }, + { + "epoch": 1.8464851171627612, + "grad_norm": 3.323655605316162, + "learning_rate": 7.461688355873234e-06, + "loss": 0.6835, + "step": 7289 + }, + { + "epoch": 1.8467384420519317, + "grad_norm": 3.601247549057007, + "learning_rate": 7.4609590553389746e-06, + "loss": 0.6864, + "step": 7290 + }, + { + "epoch": 1.846991766941102, + "grad_norm": 3.8734934329986572, + "learning_rate": 7.460229685702137e-06, + "loss": 0.8819, + "step": 7291 + }, + { + "epoch": 1.8472450918302723, + "grad_norm": 3.679542303085327, + "learning_rate": 7.459500246983204e-06, + "loss": 0.7914, + "step": 7292 + }, + { + "epoch": 1.8474984167194428, + "grad_norm": 3.858290433883667, + "learning_rate": 7.458770739202656e-06, + "loss": 0.854, + "step": 7293 + }, + { + "epoch": 1.8477517416086129, + "grad_norm": 3.3771471977233887, + "learning_rate": 7.458041162380979e-06, + "loss": 0.7797, + "step": 7294 + }, + { + "epoch": 1.8480050664977834, + "grad_norm": 3.539262056350708, + "learning_rate": 7.4573115165386575e-06, + "loss": 0.7457, + "step": 7295 + }, + { + "epoch": 1.848258391386954, + "grad_norm": 3.617661952972412, + "learning_rate": 7.456581801696181e-06, + "loss": 0.7938, + "step": 7296 + }, + { + "epoch": 1.848511716276124, + "grad_norm": 3.5524685382843018, + "learning_rate": 7.455852017874038e-06, + "loss": 0.7001, + "step": 7297 + }, + { + "epoch": 1.8487650411652945, + "grad_norm": 3.7686891555786133, + "learning_rate": 7.455122165092723e-06, + "loss": 0.9146, + "step": 7298 + }, + { + "epoch": 1.849018366054465, + "grad_norm": 3.892432689666748, + "learning_rate": 7.454392243372727e-06, + "loss": 0.7147, + "step": 7299 + }, + { + "epoch": 1.849271690943635, + "grad_norm": 3.7161612510681152, + "learning_rate": 7.453662252734547e-06, + "loss": 0.8717, + "step": 7300 + }, + { + "epoch": 1.8495250158328056, + "grad_norm": 3.782010078430176, + "learning_rate": 7.4529321931986805e-06, + "loss": 0.7736, + "step": 7301 + }, + { + "epoch": 1.8497783407219759, + "grad_norm": 4.072753429412842, + "learning_rate": 7.452202064785628e-06, + "loss": 0.9576, + "step": 7302 + }, + { + "epoch": 1.8500316656111462, + "grad_norm": 3.7230913639068604, + "learning_rate": 7.4514718675158925e-06, + "loss": 0.8127, + "step": 7303 + }, + { + "epoch": 1.8502849905003167, + "grad_norm": 3.960256814956665, + "learning_rate": 7.450741601409973e-06, + "loss": 0.8021, + "step": 7304 + }, + { + "epoch": 1.850538315389487, + "grad_norm": 3.41961932182312, + "learning_rate": 7.4500112664883795e-06, + "loss": 0.6983, + "step": 7305 + }, + { + "epoch": 1.8507916402786573, + "grad_norm": 3.587430000305176, + "learning_rate": 7.449280862771618e-06, + "loss": 0.7786, + "step": 7306 + }, + { + "epoch": 1.8510449651678278, + "grad_norm": 3.4382851123809814, + "learning_rate": 7.448550390280197e-06, + "loss": 0.7976, + "step": 7307 + }, + { + "epoch": 1.851298290056998, + "grad_norm": 3.8374269008636475, + "learning_rate": 7.44781984903463e-06, + "loss": 0.7545, + "step": 7308 + }, + { + "epoch": 1.8515516149461684, + "grad_norm": 4.093745708465576, + "learning_rate": 7.447089239055428e-06, + "loss": 0.753, + "step": 7309 + }, + { + "epoch": 1.851804939835339, + "grad_norm": 3.698042154312134, + "learning_rate": 7.446358560363107e-06, + "loss": 0.7663, + "step": 7310 + }, + { + "epoch": 1.8520582647245092, + "grad_norm": 3.391146183013916, + "learning_rate": 7.445627812978184e-06, + "loss": 0.6712, + "step": 7311 + }, + { + "epoch": 1.8523115896136795, + "grad_norm": 3.688525676727295, + "learning_rate": 7.444896996921179e-06, + "loss": 0.7332, + "step": 7312 + }, + { + "epoch": 1.85256491450285, + "grad_norm": 3.518315553665161, + "learning_rate": 7.444166112212613e-06, + "loss": 0.6559, + "step": 7313 + }, + { + "epoch": 1.8528182393920203, + "grad_norm": 3.89802885055542, + "learning_rate": 7.443435158873007e-06, + "loss": 0.81, + "step": 7314 + }, + { + "epoch": 1.8530715642811906, + "grad_norm": 3.615739583969116, + "learning_rate": 7.442704136922887e-06, + "loss": 0.7755, + "step": 7315 + }, + { + "epoch": 1.853324889170361, + "grad_norm": 3.5944442749023438, + "learning_rate": 7.441973046382779e-06, + "loss": 0.8504, + "step": 7316 + }, + { + "epoch": 1.8535782140595314, + "grad_norm": 3.965315818786621, + "learning_rate": 7.441241887273213e-06, + "loss": 0.7938, + "step": 7317 + }, + { + "epoch": 1.8538315389487017, + "grad_norm": 3.6452298164367676, + "learning_rate": 7.440510659614718e-06, + "loss": 0.8022, + "step": 7318 + }, + { + "epoch": 1.8540848638378722, + "grad_norm": 3.2947707176208496, + "learning_rate": 7.439779363427828e-06, + "loss": 0.7323, + "step": 7319 + }, + { + "epoch": 1.8543381887270425, + "grad_norm": 3.4181292057037354, + "learning_rate": 7.4390479987330775e-06, + "loss": 0.6592, + "step": 7320 + }, + { + "epoch": 1.8545915136162128, + "grad_norm": 4.534851551055908, + "learning_rate": 7.438316565551001e-06, + "loss": 0.7476, + "step": 7321 + }, + { + "epoch": 1.8548448385053833, + "grad_norm": 3.6873083114624023, + "learning_rate": 7.4375850639021395e-06, + "loss": 0.7622, + "step": 7322 + }, + { + "epoch": 1.8550981633945534, + "grad_norm": 3.62625789642334, + "learning_rate": 7.436853493807032e-06, + "loss": 0.7018, + "step": 7323 + }, + { + "epoch": 1.8553514882837239, + "grad_norm": 3.600980520248413, + "learning_rate": 7.43612185528622e-06, + "loss": 0.7497, + "step": 7324 + }, + { + "epoch": 1.8556048131728944, + "grad_norm": 3.5620505809783936, + "learning_rate": 7.4353901483602485e-06, + "loss": 0.7743, + "step": 7325 + }, + { + "epoch": 1.8558581380620645, + "grad_norm": 3.5479280948638916, + "learning_rate": 7.434658373049662e-06, + "loss": 0.7191, + "step": 7326 + }, + { + "epoch": 1.856111462951235, + "grad_norm": 3.9729976654052734, + "learning_rate": 7.43392652937501e-06, + "loss": 0.7144, + "step": 7327 + }, + { + "epoch": 1.8563647878404055, + "grad_norm": 3.9694910049438477, + "learning_rate": 7.433194617356844e-06, + "loss": 0.8327, + "step": 7328 + }, + { + "epoch": 1.8566181127295756, + "grad_norm": 3.4946181774139404, + "learning_rate": 7.432462637015709e-06, + "loss": 0.7549, + "step": 7329 + }, + { + "epoch": 1.856871437618746, + "grad_norm": 4.2069501876831055, + "learning_rate": 7.431730588372166e-06, + "loss": 0.912, + "step": 7330 + }, + { + "epoch": 1.8571247625079164, + "grad_norm": 3.2858669757843018, + "learning_rate": 7.430998471446767e-06, + "loss": 0.7898, + "step": 7331 + }, + { + "epoch": 1.8573780873970867, + "grad_norm": 3.9718568325042725, + "learning_rate": 7.43026628626007e-06, + "loss": 0.8559, + "step": 7332 + }, + { + "epoch": 1.8576314122862572, + "grad_norm": 3.5112857818603516, + "learning_rate": 7.429534032832637e-06, + "loss": 0.7235, + "step": 7333 + }, + { + "epoch": 1.8578847371754275, + "grad_norm": 3.4231934547424316, + "learning_rate": 7.428801711185025e-06, + "loss": 0.7495, + "step": 7334 + }, + { + "epoch": 1.8581380620645978, + "grad_norm": 3.660020351409912, + "learning_rate": 7.428069321337799e-06, + "loss": 0.7971, + "step": 7335 + }, + { + "epoch": 1.8583913869537683, + "grad_norm": 3.602024793624878, + "learning_rate": 7.4273368633115265e-06, + "loss": 0.8713, + "step": 7336 + }, + { + "epoch": 1.8586447118429386, + "grad_norm": 3.547377586364746, + "learning_rate": 7.4266043371267705e-06, + "loss": 0.7743, + "step": 7337 + }, + { + "epoch": 1.8588980367321088, + "grad_norm": 3.8899481296539307, + "learning_rate": 7.4258717428041025e-06, + "loss": 0.7327, + "step": 7338 + }, + { + "epoch": 1.8591513616212794, + "grad_norm": 4.131229877471924, + "learning_rate": 7.425139080364094e-06, + "loss": 0.7848, + "step": 7339 + }, + { + "epoch": 1.8594046865104497, + "grad_norm": 3.900611400604248, + "learning_rate": 7.424406349827315e-06, + "loss": 0.8824, + "step": 7340 + }, + { + "epoch": 1.85965801139962, + "grad_norm": 3.615609645843506, + "learning_rate": 7.4236735512143434e-06, + "loss": 0.8079, + "step": 7341 + }, + { + "epoch": 1.8599113362887905, + "grad_norm": 3.6150715351104736, + "learning_rate": 7.422940684545753e-06, + "loss": 0.8096, + "step": 7342 + }, + { + "epoch": 1.8601646611779608, + "grad_norm": 3.648995876312256, + "learning_rate": 7.422207749842124e-06, + "loss": 0.8403, + "step": 7343 + }, + { + "epoch": 1.860417986067131, + "grad_norm": 3.13474702835083, + "learning_rate": 7.421474747124038e-06, + "loss": 0.6948, + "step": 7344 + }, + { + "epoch": 1.8606713109563016, + "grad_norm": 4.048909664154053, + "learning_rate": 7.420741676412075e-06, + "loss": 0.8387, + "step": 7345 + }, + { + "epoch": 1.8609246358454719, + "grad_norm": 4.388035774230957, + "learning_rate": 7.4200085377268195e-06, + "loss": 0.7915, + "step": 7346 + }, + { + "epoch": 1.8611779607346421, + "grad_norm": 4.1641764640808105, + "learning_rate": 7.4192753310888595e-06, + "loss": 0.7966, + "step": 7347 + }, + { + "epoch": 1.8614312856238127, + "grad_norm": 3.8473355770111084, + "learning_rate": 7.4185420565187826e-06, + "loss": 0.8086, + "step": 7348 + }, + { + "epoch": 1.861684610512983, + "grad_norm": 3.894773244857788, + "learning_rate": 7.4178087140371775e-06, + "loss": 0.885, + "step": 7349 + }, + { + "epoch": 1.8619379354021532, + "grad_norm": 3.767836809158325, + "learning_rate": 7.417075303664637e-06, + "loss": 0.7168, + "step": 7350 + }, + { + "epoch": 1.8621912602913238, + "grad_norm": 4.533100128173828, + "learning_rate": 7.416341825421755e-06, + "loss": 0.7662, + "step": 7351 + }, + { + "epoch": 1.8624445851804938, + "grad_norm": 3.6592841148376465, + "learning_rate": 7.415608279329127e-06, + "loss": 0.7133, + "step": 7352 + }, + { + "epoch": 1.8626979100696643, + "grad_norm": 3.5006027221679688, + "learning_rate": 7.41487466540735e-06, + "loss": 0.7597, + "step": 7353 + }, + { + "epoch": 1.8629512349588349, + "grad_norm": 3.608720064163208, + "learning_rate": 7.414140983677024e-06, + "loss": 0.8238, + "step": 7354 + }, + { + "epoch": 1.863204559848005, + "grad_norm": 3.5064187049865723, + "learning_rate": 7.4134072341587516e-06, + "loss": 0.7651, + "step": 7355 + }, + { + "epoch": 1.8634578847371754, + "grad_norm": 3.718505859375, + "learning_rate": 7.412673416873134e-06, + "loss": 0.8108, + "step": 7356 + }, + { + "epoch": 1.863711209626346, + "grad_norm": 3.588686943054199, + "learning_rate": 7.41193953184078e-06, + "loss": 0.8105, + "step": 7357 + }, + { + "epoch": 1.863964534515516, + "grad_norm": 3.4137682914733887, + "learning_rate": 7.411205579082292e-06, + "loss": 0.6981, + "step": 7358 + }, + { + "epoch": 1.8642178594046865, + "grad_norm": 3.5827314853668213, + "learning_rate": 7.4104715586182815e-06, + "loss": 0.6481, + "step": 7359 + }, + { + "epoch": 1.8644711842938568, + "grad_norm": 3.4952003955841064, + "learning_rate": 7.409737470469361e-06, + "loss": 0.962, + "step": 7360 + }, + { + "epoch": 1.8647245091830271, + "grad_norm": 3.540165662765503, + "learning_rate": 7.40900331465614e-06, + "loss": 0.6896, + "step": 7361 + }, + { + "epoch": 1.8649778340721976, + "grad_norm": 3.630929708480835, + "learning_rate": 7.408269091199237e-06, + "loss": 0.7493, + "step": 7362 + }, + { + "epoch": 1.865231158961368, + "grad_norm": 3.743705987930298, + "learning_rate": 7.407534800119265e-06, + "loss": 0.8047, + "step": 7363 + }, + { + "epoch": 1.8654844838505382, + "grad_norm": 3.5905940532684326, + "learning_rate": 7.406800441436845e-06, + "loss": 0.6626, + "step": 7364 + }, + { + "epoch": 1.8657378087397087, + "grad_norm": 3.31368350982666, + "learning_rate": 7.406066015172596e-06, + "loss": 0.6748, + "step": 7365 + }, + { + "epoch": 1.865991133628879, + "grad_norm": 4.063304424285889, + "learning_rate": 7.405331521347141e-06, + "loss": 0.8786, + "step": 7366 + }, + { + "epoch": 1.8662444585180493, + "grad_norm": 3.6789088249206543, + "learning_rate": 7.404596959981104e-06, + "loss": 0.7786, + "step": 7367 + }, + { + "epoch": 1.8664977834072198, + "grad_norm": 3.6941704750061035, + "learning_rate": 7.403862331095112e-06, + "loss": 0.7188, + "step": 7368 + }, + { + "epoch": 1.8667511082963901, + "grad_norm": 3.8199117183685303, + "learning_rate": 7.403127634709791e-06, + "loss": 0.834, + "step": 7369 + }, + { + "epoch": 1.8670044331855604, + "grad_norm": 3.6132707595825195, + "learning_rate": 7.402392870845774e-06, + "loss": 0.7723, + "step": 7370 + }, + { + "epoch": 1.867257758074731, + "grad_norm": 3.8342978954315186, + "learning_rate": 7.4016580395236906e-06, + "loss": 0.8459, + "step": 7371 + }, + { + "epoch": 1.8675110829639012, + "grad_norm": 3.7187366485595703, + "learning_rate": 7.400923140764176e-06, + "loss": 0.7621, + "step": 7372 + }, + { + "epoch": 1.8677644078530715, + "grad_norm": 3.7745282649993896, + "learning_rate": 7.400188174587863e-06, + "loss": 0.7832, + "step": 7373 + }, + { + "epoch": 1.868017732742242, + "grad_norm": 3.4873392581939697, + "learning_rate": 7.399453141015392e-06, + "loss": 0.7528, + "step": 7374 + }, + { + "epoch": 1.8682710576314123, + "grad_norm": 3.6364986896514893, + "learning_rate": 7.398718040067401e-06, + "loss": 0.801, + "step": 7375 + }, + { + "epoch": 1.8685243825205826, + "grad_norm": 3.6586132049560547, + "learning_rate": 7.397982871764532e-06, + "loss": 0.7347, + "step": 7376 + }, + { + "epoch": 1.8687777074097531, + "grad_norm": 3.6796278953552246, + "learning_rate": 7.397247636127428e-06, + "loss": 0.7327, + "step": 7377 + }, + { + "epoch": 1.8690310322989234, + "grad_norm": 3.7249855995178223, + "learning_rate": 7.396512333176734e-06, + "loss": 0.7736, + "step": 7378 + }, + { + "epoch": 1.8692843571880937, + "grad_norm": 3.6299843788146973, + "learning_rate": 7.395776962933097e-06, + "loss": 0.7297, + "step": 7379 + }, + { + "epoch": 1.8695376820772642, + "grad_norm": 4.824265003204346, + "learning_rate": 7.395041525417164e-06, + "loss": 0.8056, + "step": 7380 + }, + { + "epoch": 1.8697910069664343, + "grad_norm": 3.879260301589966, + "learning_rate": 7.394306020649588e-06, + "loss": 0.8066, + "step": 7381 + }, + { + "epoch": 1.8700443318556048, + "grad_norm": 3.766038179397583, + "learning_rate": 7.393570448651023e-06, + "loss": 0.7338, + "step": 7382 + }, + { + "epoch": 1.8702976567447753, + "grad_norm": 3.6172983646392822, + "learning_rate": 7.39283480944212e-06, + "loss": 0.7236, + "step": 7383 + }, + { + "epoch": 1.8705509816339454, + "grad_norm": 3.471947431564331, + "learning_rate": 7.392099103043537e-06, + "loss": 0.8803, + "step": 7384 + }, + { + "epoch": 1.870804306523116, + "grad_norm": 3.7183072566986084, + "learning_rate": 7.391363329475932e-06, + "loss": 0.7875, + "step": 7385 + }, + { + "epoch": 1.8710576314122862, + "grad_norm": 3.2768454551696777, + "learning_rate": 7.390627488759967e-06, + "loss": 0.7308, + "step": 7386 + }, + { + "epoch": 1.8713109563014565, + "grad_norm": 3.899739980697632, + "learning_rate": 7.389891580916302e-06, + "loss": 0.8452, + "step": 7387 + }, + { + "epoch": 1.871564281190627, + "grad_norm": 3.696506977081299, + "learning_rate": 7.389155605965601e-06, + "loss": 0.7286, + "step": 7388 + }, + { + "epoch": 1.8718176060797973, + "grad_norm": 3.8265140056610107, + "learning_rate": 7.38841956392853e-06, + "loss": 0.7616, + "step": 7389 + }, + { + "epoch": 1.8720709309689676, + "grad_norm": 3.4128339290618896, + "learning_rate": 7.387683454825758e-06, + "loss": 0.7428, + "step": 7390 + }, + { + "epoch": 1.872324255858138, + "grad_norm": 3.8740456104278564, + "learning_rate": 7.386947278677954e-06, + "loss": 0.8434, + "step": 7391 + }, + { + "epoch": 1.8725775807473084, + "grad_norm": 3.352545976638794, + "learning_rate": 7.386211035505788e-06, + "loss": 0.8019, + "step": 7392 + }, + { + "epoch": 1.8728309056364787, + "grad_norm": 3.2622337341308594, + "learning_rate": 7.385474725329935e-06, + "loss": 0.7074, + "step": 7393 + }, + { + "epoch": 1.8730842305256492, + "grad_norm": 3.2104744911193848, + "learning_rate": 7.384738348171069e-06, + "loss": 0.7666, + "step": 7394 + }, + { + "epoch": 1.8733375554148195, + "grad_norm": 3.6817729473114014, + "learning_rate": 7.384001904049869e-06, + "loss": 0.8851, + "step": 7395 + }, + { + "epoch": 1.8735908803039898, + "grad_norm": 3.282905340194702, + "learning_rate": 7.383265392987011e-06, + "loss": 0.6493, + "step": 7396 + }, + { + "epoch": 1.8738442051931603, + "grad_norm": 3.8238494396209717, + "learning_rate": 7.382528815003181e-06, + "loss": 0.7047, + "step": 7397 + }, + { + "epoch": 1.8740975300823306, + "grad_norm": 4.017204761505127, + "learning_rate": 7.381792170119057e-06, + "loss": 0.7825, + "step": 7398 + }, + { + "epoch": 1.874350854971501, + "grad_norm": 3.700448751449585, + "learning_rate": 7.381055458355324e-06, + "loss": 0.7084, + "step": 7399 + }, + { + "epoch": 1.8746041798606714, + "grad_norm": 3.997309684753418, + "learning_rate": 7.38031867973267e-06, + "loss": 0.8363, + "step": 7400 + }, + { + "epoch": 1.8748575047498417, + "grad_norm": 3.56485652923584, + "learning_rate": 7.379581834271785e-06, + "loss": 0.7459, + "step": 7401 + }, + { + "epoch": 1.875110829639012, + "grad_norm": 3.750089168548584, + "learning_rate": 7.3788449219933555e-06, + "loss": 0.6487, + "step": 7402 + }, + { + "epoch": 1.8753641545281825, + "grad_norm": 4.218666076660156, + "learning_rate": 7.378107942918076e-06, + "loss": 0.8, + "step": 7403 + }, + { + "epoch": 1.8756174794173528, + "grad_norm": 3.992666006088257, + "learning_rate": 7.377370897066639e-06, + "loss": 0.8409, + "step": 7404 + }, + { + "epoch": 1.875870804306523, + "grad_norm": 4.031217098236084, + "learning_rate": 7.376633784459741e-06, + "loss": 0.9294, + "step": 7405 + }, + { + "epoch": 1.8761241291956936, + "grad_norm": 3.875433921813965, + "learning_rate": 7.375896605118083e-06, + "loss": 0.8327, + "step": 7406 + }, + { + "epoch": 1.8763774540848637, + "grad_norm": 3.15423583984375, + "learning_rate": 7.375159359062361e-06, + "loss": 0.6785, + "step": 7407 + }, + { + "epoch": 1.8766307789740342, + "grad_norm": 3.5996413230895996, + "learning_rate": 7.374422046313276e-06, + "loss": 0.759, + "step": 7408 + }, + { + "epoch": 1.8768841038632047, + "grad_norm": 3.8724873065948486, + "learning_rate": 7.373684666891533e-06, + "loss": 0.9113, + "step": 7409 + }, + { + "epoch": 1.8771374287523748, + "grad_norm": 3.6898205280303955, + "learning_rate": 7.372947220817837e-06, + "loss": 0.7315, + "step": 7410 + }, + { + "epoch": 1.8773907536415453, + "grad_norm": 3.7833163738250732, + "learning_rate": 7.372209708112897e-06, + "loss": 0.7906, + "step": 7411 + }, + { + "epoch": 1.8776440785307158, + "grad_norm": 3.598475694656372, + "learning_rate": 7.371472128797419e-06, + "loss": 0.8126, + "step": 7412 + }, + { + "epoch": 1.8778974034198859, + "grad_norm": 3.3231332302093506, + "learning_rate": 7.3707344828921145e-06, + "loss": 0.7783, + "step": 7413 + }, + { + "epoch": 1.8781507283090564, + "grad_norm": 3.87412428855896, + "learning_rate": 7.369996770417698e-06, + "loss": 0.7888, + "step": 7414 + }, + { + "epoch": 1.8784040531982267, + "grad_norm": 3.6098060607910156, + "learning_rate": 7.369258991394882e-06, + "loss": 0.8262, + "step": 7415 + }, + { + "epoch": 1.878657378087397, + "grad_norm": 3.879749298095703, + "learning_rate": 7.368521145844384e-06, + "loss": 0.7725, + "step": 7416 + }, + { + "epoch": 1.8789107029765675, + "grad_norm": 3.698374032974243, + "learning_rate": 7.367783233786923e-06, + "loss": 0.8039, + "step": 7417 + }, + { + "epoch": 1.8791640278657378, + "grad_norm": 3.346038579940796, + "learning_rate": 7.367045255243217e-06, + "loss": 0.663, + "step": 7418 + }, + { + "epoch": 1.879417352754908, + "grad_norm": 3.278337240219116, + "learning_rate": 7.366307210233992e-06, + "loss": 0.6843, + "step": 7419 + }, + { + "epoch": 1.8796706776440786, + "grad_norm": 3.4066765308380127, + "learning_rate": 7.365569098779968e-06, + "loss": 0.7675, + "step": 7420 + }, + { + "epoch": 1.8799240025332489, + "grad_norm": 4.109745502471924, + "learning_rate": 7.364830920901873e-06, + "loss": 0.8416, + "step": 7421 + }, + { + "epoch": 1.8801773274224192, + "grad_norm": 3.6897478103637695, + "learning_rate": 7.3640926766204335e-06, + "loss": 0.7679, + "step": 7422 + }, + { + "epoch": 1.8804306523115897, + "grad_norm": 3.8376998901367188, + "learning_rate": 7.36335436595638e-06, + "loss": 0.7612, + "step": 7423 + }, + { + "epoch": 1.88068397720076, + "grad_norm": 3.4499313831329346, + "learning_rate": 7.362615988930442e-06, + "loss": 0.6903, + "step": 7424 + }, + { + "epoch": 1.8809373020899303, + "grad_norm": 3.5727550983428955, + "learning_rate": 7.3618775455633565e-06, + "loss": 0.7162, + "step": 7425 + }, + { + "epoch": 1.8811906269791008, + "grad_norm": 3.6529769897460938, + "learning_rate": 7.3611390358758574e-06, + "loss": 0.7865, + "step": 7426 + }, + { + "epoch": 1.881443951868271, + "grad_norm": 3.797576665878296, + "learning_rate": 7.360400459888678e-06, + "loss": 0.8227, + "step": 7427 + }, + { + "epoch": 1.8816972767574414, + "grad_norm": 3.2741506099700928, + "learning_rate": 7.359661817622561e-06, + "loss": 0.7678, + "step": 7428 + }, + { + "epoch": 1.8819506016466119, + "grad_norm": 3.7047836780548096, + "learning_rate": 7.358923109098246e-06, + "loss": 0.792, + "step": 7429 + }, + { + "epoch": 1.8822039265357822, + "grad_norm": 3.4118900299072266, + "learning_rate": 7.358184334336476e-06, + "loss": 0.7368, + "step": 7430 + }, + { + "epoch": 1.8824572514249525, + "grad_norm": 3.859285831451416, + "learning_rate": 7.357445493357995e-06, + "loss": 0.7663, + "step": 7431 + }, + { + "epoch": 1.882710576314123, + "grad_norm": 3.458205223083496, + "learning_rate": 7.35670658618355e-06, + "loss": 0.7991, + "step": 7432 + }, + { + "epoch": 1.8829639012032933, + "grad_norm": 3.5803966522216797, + "learning_rate": 7.355967612833887e-06, + "loss": 0.7658, + "step": 7433 + }, + { + "epoch": 1.8832172260924636, + "grad_norm": 3.285926342010498, + "learning_rate": 7.355228573329759e-06, + "loss": 0.6576, + "step": 7434 + }, + { + "epoch": 1.883470550981634, + "grad_norm": 3.422349691390991, + "learning_rate": 7.3544894676919155e-06, + "loss": 0.7696, + "step": 7435 + }, + { + "epoch": 1.8837238758708041, + "grad_norm": 3.667968988418579, + "learning_rate": 7.353750295941113e-06, + "loss": 0.7463, + "step": 7436 + }, + { + "epoch": 1.8839772007599747, + "grad_norm": 3.90153431892395, + "learning_rate": 7.353011058098104e-06, + "loss": 0.8393, + "step": 7437 + }, + { + "epoch": 1.8842305256491452, + "grad_norm": 3.7004783153533936, + "learning_rate": 7.3522717541836475e-06, + "loss": 0.7922, + "step": 7438 + }, + { + "epoch": 1.8844838505383152, + "grad_norm": 3.6125683784484863, + "learning_rate": 7.351532384218503e-06, + "loss": 0.7269, + "step": 7439 + }, + { + "epoch": 1.8847371754274858, + "grad_norm": 3.304962396621704, + "learning_rate": 7.35079294822343e-06, + "loss": 0.8017, + "step": 7440 + }, + { + "epoch": 1.8849905003166563, + "grad_norm": 3.801954746246338, + "learning_rate": 7.350053446219194e-06, + "loss": 0.7969, + "step": 7441 + }, + { + "epoch": 1.8852438252058263, + "grad_norm": 3.468815803527832, + "learning_rate": 7.349313878226558e-06, + "loss": 0.6211, + "step": 7442 + }, + { + "epoch": 1.8854971500949969, + "grad_norm": 3.8287646770477295, + "learning_rate": 7.348574244266289e-06, + "loss": 0.9248, + "step": 7443 + }, + { + "epoch": 1.8857504749841671, + "grad_norm": 3.3916845321655273, + "learning_rate": 7.347834544359157e-06, + "loss": 0.7739, + "step": 7444 + }, + { + "epoch": 1.8860037998733374, + "grad_norm": 3.608035087585449, + "learning_rate": 7.34709477852593e-06, + "loss": 0.6815, + "step": 7445 + }, + { + "epoch": 1.886257124762508, + "grad_norm": 3.456505060195923, + "learning_rate": 7.346354946787384e-06, + "loss": 0.756, + "step": 7446 + }, + { + "epoch": 1.8865104496516782, + "grad_norm": 3.5758090019226074, + "learning_rate": 7.34561504916429e-06, + "loss": 0.883, + "step": 7447 + }, + { + "epoch": 1.8867637745408485, + "grad_norm": 4.007209777832031, + "learning_rate": 7.344875085677423e-06, + "loss": 0.8203, + "step": 7448 + }, + { + "epoch": 1.887017099430019, + "grad_norm": 4.382864475250244, + "learning_rate": 7.3441350563475645e-06, + "loss": 0.9269, + "step": 7449 + }, + { + "epoch": 1.8872704243191893, + "grad_norm": 4.218381404876709, + "learning_rate": 7.3433949611954915e-06, + "loss": 0.7211, + "step": 7450 + }, + { + "epoch": 1.8875237492083596, + "grad_norm": 4.290537357330322, + "learning_rate": 7.342654800241986e-06, + "loss": 0.8025, + "step": 7451 + }, + { + "epoch": 1.8877770740975301, + "grad_norm": 3.8483150005340576, + "learning_rate": 7.341914573507832e-06, + "loss": 0.8395, + "step": 7452 + }, + { + "epoch": 1.8880303989867004, + "grad_norm": 3.528327465057373, + "learning_rate": 7.3411742810138146e-06, + "loss": 0.7301, + "step": 7453 + }, + { + "epoch": 1.8882837238758707, + "grad_norm": 3.4256582260131836, + "learning_rate": 7.3404339227807205e-06, + "loss": 0.7408, + "step": 7454 + }, + { + "epoch": 1.8885370487650412, + "grad_norm": 3.589458465576172, + "learning_rate": 7.3396934988293386e-06, + "loss": 0.8343, + "step": 7455 + }, + { + "epoch": 1.8887903736542115, + "grad_norm": 3.743173360824585, + "learning_rate": 7.338953009180459e-06, + "loss": 0.8369, + "step": 7456 + }, + { + "epoch": 1.8890436985433818, + "grad_norm": 3.327045440673828, + "learning_rate": 7.338212453854876e-06, + "loss": 0.7343, + "step": 7457 + }, + { + "epoch": 1.8892970234325523, + "grad_norm": 3.6900317668914795, + "learning_rate": 7.337471832873383e-06, + "loss": 0.8128, + "step": 7458 + }, + { + "epoch": 1.8895503483217226, + "grad_norm": 3.558347225189209, + "learning_rate": 7.336731146256777e-06, + "loss": 0.8122, + "step": 7459 + }, + { + "epoch": 1.889803673210893, + "grad_norm": 3.3445587158203125, + "learning_rate": 7.335990394025856e-06, + "loss": 0.7065, + "step": 7460 + }, + { + "epoch": 1.8900569981000634, + "grad_norm": 3.516998767852783, + "learning_rate": 7.3352495762014184e-06, + "loss": 0.7819, + "step": 7461 + }, + { + "epoch": 1.8903103229892337, + "grad_norm": 3.9598820209503174, + "learning_rate": 7.3345086928042675e-06, + "loss": 0.8612, + "step": 7462 + }, + { + "epoch": 1.890563647878404, + "grad_norm": 3.2197012901306152, + "learning_rate": 7.333767743855207e-06, + "loss": 0.6761, + "step": 7463 + }, + { + "epoch": 1.8908169727675745, + "grad_norm": 3.510225772857666, + "learning_rate": 7.333026729375041e-06, + "loss": 0.7569, + "step": 7464 + }, + { + "epoch": 1.8910702976567446, + "grad_norm": 3.2959768772125244, + "learning_rate": 7.332285649384578e-06, + "loss": 0.6855, + "step": 7465 + }, + { + "epoch": 1.8913236225459151, + "grad_norm": 3.7513856887817383, + "learning_rate": 7.331544503904629e-06, + "loss": 0.8205, + "step": 7466 + }, + { + "epoch": 1.8915769474350856, + "grad_norm": 4.012969493865967, + "learning_rate": 7.330803292956e-06, + "loss": 0.7834, + "step": 7467 + }, + { + "epoch": 1.8918302723242557, + "grad_norm": 4.250128269195557, + "learning_rate": 7.330062016559509e-06, + "loss": 0.8426, + "step": 7468 + }, + { + "epoch": 1.8920835972134262, + "grad_norm": 3.455901861190796, + "learning_rate": 7.329320674735968e-06, + "loss": 0.8845, + "step": 7469 + }, + { + "epoch": 1.8923369221025967, + "grad_norm": 3.4566681385040283, + "learning_rate": 7.328579267506195e-06, + "loss": 0.7361, + "step": 7470 + }, + { + "epoch": 1.8925902469917668, + "grad_norm": 3.7052054405212402, + "learning_rate": 7.3278377948910076e-06, + "loss": 0.7984, + "step": 7471 + }, + { + "epoch": 1.8928435718809373, + "grad_norm": 4.076179027557373, + "learning_rate": 7.327096256911225e-06, + "loss": 0.7878, + "step": 7472 + }, + { + "epoch": 1.8930968967701076, + "grad_norm": 3.4270684719085693, + "learning_rate": 7.326354653587669e-06, + "loss": 0.75, + "step": 7473 + }, + { + "epoch": 1.893350221659278, + "grad_norm": 3.6080360412597656, + "learning_rate": 7.325612984941167e-06, + "loss": 0.8201, + "step": 7474 + }, + { + "epoch": 1.8936035465484484, + "grad_norm": 3.11811900138855, + "learning_rate": 7.324871250992543e-06, + "loss": 0.6514, + "step": 7475 + }, + { + "epoch": 1.8938568714376187, + "grad_norm": 3.600656747817993, + "learning_rate": 7.324129451762622e-06, + "loss": 0.8541, + "step": 7476 + }, + { + "epoch": 1.894110196326789, + "grad_norm": 3.697023391723633, + "learning_rate": 7.323387587272235e-06, + "loss": 0.8515, + "step": 7477 + }, + { + "epoch": 1.8943635212159595, + "grad_norm": 3.2946245670318604, + "learning_rate": 7.322645657542214e-06, + "loss": 0.7835, + "step": 7478 + }, + { + "epoch": 1.8946168461051298, + "grad_norm": 3.841540575027466, + "learning_rate": 7.321903662593391e-06, + "loss": 0.8945, + "step": 7479 + }, + { + "epoch": 1.8948701709943, + "grad_norm": 4.031865119934082, + "learning_rate": 7.321161602446601e-06, + "loss": 0.9294, + "step": 7480 + }, + { + "epoch": 1.8951234958834706, + "grad_norm": 3.5384581089019775, + "learning_rate": 7.320419477122682e-06, + "loss": 0.698, + "step": 7481 + }, + { + "epoch": 1.895376820772641, + "grad_norm": 4.3465895652771, + "learning_rate": 7.319677286642472e-06, + "loss": 0.8124, + "step": 7482 + }, + { + "epoch": 1.8956301456618112, + "grad_norm": 3.62138295173645, + "learning_rate": 7.31893503102681e-06, + "loss": 0.8068, + "step": 7483 + }, + { + "epoch": 1.8958834705509817, + "grad_norm": 3.597529888153076, + "learning_rate": 7.31819271029654e-06, + "loss": 0.7252, + "step": 7484 + }, + { + "epoch": 1.896136795440152, + "grad_norm": 3.8947126865386963, + "learning_rate": 7.317450324472506e-06, + "loss": 0.6933, + "step": 7485 + }, + { + "epoch": 1.8963901203293223, + "grad_norm": 4.336637496948242, + "learning_rate": 7.316707873575551e-06, + "loss": 0.8949, + "step": 7486 + }, + { + "epoch": 1.8966434452184928, + "grad_norm": 3.8488662242889404, + "learning_rate": 7.315965357626527e-06, + "loss": 0.8458, + "step": 7487 + }, + { + "epoch": 1.896896770107663, + "grad_norm": 3.8070075511932373, + "learning_rate": 7.315222776646279e-06, + "loss": 0.7341, + "step": 7488 + }, + { + "epoch": 1.8971500949968334, + "grad_norm": 3.8892059326171875, + "learning_rate": 7.314480130655661e-06, + "loss": 0.8279, + "step": 7489 + }, + { + "epoch": 1.897403419886004, + "grad_norm": 4.109464645385742, + "learning_rate": 7.313737419675526e-06, + "loss": 0.7159, + "step": 7490 + }, + { + "epoch": 1.8976567447751742, + "grad_norm": 3.9447784423828125, + "learning_rate": 7.312994643726728e-06, + "loss": 0.8137, + "step": 7491 + }, + { + "epoch": 1.8979100696643445, + "grad_norm": 3.6503822803497314, + "learning_rate": 7.312251802830126e-06, + "loss": 0.7035, + "step": 7492 + }, + { + "epoch": 1.898163394553515, + "grad_norm": 3.267232894897461, + "learning_rate": 7.311508897006576e-06, + "loss": 0.655, + "step": 7493 + }, + { + "epoch": 1.898416719442685, + "grad_norm": 3.844186544418335, + "learning_rate": 7.310765926276939e-06, + "loss": 0.8321, + "step": 7494 + }, + { + "epoch": 1.8986700443318556, + "grad_norm": 3.721618175506592, + "learning_rate": 7.310022890662079e-06, + "loss": 0.8112, + "step": 7495 + }, + { + "epoch": 1.8989233692210261, + "grad_norm": 3.928321123123169, + "learning_rate": 7.309279790182859e-06, + "loss": 0.7592, + "step": 7496 + }, + { + "epoch": 1.8991766941101962, + "grad_norm": 3.996778726577759, + "learning_rate": 7.3085366248601445e-06, + "loss": 0.8084, + "step": 7497 + }, + { + "epoch": 1.8994300189993667, + "grad_norm": 3.359065055847168, + "learning_rate": 7.307793394714803e-06, + "loss": 0.688, + "step": 7498 + }, + { + "epoch": 1.8996833438885372, + "grad_norm": 3.5219783782958984, + "learning_rate": 7.307050099767704e-06, + "loss": 0.7543, + "step": 7499 + }, + { + "epoch": 1.8999366687777073, + "grad_norm": 3.678074359893799, + "learning_rate": 7.306306740039722e-06, + "loss": 0.9287, + "step": 7500 + }, + { + "epoch": 1.8999366687777073, + "eval_loss": 1.1349515914916992, + "eval_runtime": 14.2346, + "eval_samples_per_second": 28.101, + "eval_steps_per_second": 3.513, + "step": 7500 + }, + { + "epoch": 1.9001899936668778, + "grad_norm": 3.832315683364868, + "learning_rate": 7.305563315551725e-06, + "loss": 0.8444, + "step": 7501 + }, + { + "epoch": 1.900443318556048, + "grad_norm": 3.518361806869507, + "learning_rate": 7.304819826324592e-06, + "loss": 0.7014, + "step": 7502 + }, + { + "epoch": 1.9006966434452184, + "grad_norm": 3.5828850269317627, + "learning_rate": 7.3040762723791984e-06, + "loss": 0.84, + "step": 7503 + }, + { + "epoch": 1.900949968334389, + "grad_norm": 3.6892452239990234, + "learning_rate": 7.303332653736421e-06, + "loss": 0.7193, + "step": 7504 + }, + { + "epoch": 1.9012032932235592, + "grad_norm": 3.7808103561401367, + "learning_rate": 7.302588970417145e-06, + "loss": 0.679, + "step": 7505 + }, + { + "epoch": 1.9014566181127295, + "grad_norm": 3.564419984817505, + "learning_rate": 7.301845222442248e-06, + "loss": 0.7407, + "step": 7506 + }, + { + "epoch": 1.9017099430019, + "grad_norm": 4.022425174713135, + "learning_rate": 7.301101409832617e-06, + "loss": 0.8077, + "step": 7507 + }, + { + "epoch": 1.9019632678910703, + "grad_norm": 3.8452892303466797, + "learning_rate": 7.300357532609137e-06, + "loss": 0.8032, + "step": 7508 + }, + { + "epoch": 1.9022165927802406, + "grad_norm": 3.6932547092437744, + "learning_rate": 7.299613590792695e-06, + "loss": 0.6808, + "step": 7509 + }, + { + "epoch": 1.902469917669411, + "grad_norm": 3.6755733489990234, + "learning_rate": 7.2988695844041816e-06, + "loss": 0.8113, + "step": 7510 + }, + { + "epoch": 1.9027232425585814, + "grad_norm": 3.6966922283172607, + "learning_rate": 7.298125513464487e-06, + "loss": 0.808, + "step": 7511 + }, + { + "epoch": 1.9029765674477517, + "grad_norm": 3.531323194503784, + "learning_rate": 7.297381377994506e-06, + "loss": 0.7983, + "step": 7512 + }, + { + "epoch": 1.9032298923369222, + "grad_norm": 3.661247730255127, + "learning_rate": 7.296637178015131e-06, + "loss": 0.7768, + "step": 7513 + }, + { + "epoch": 1.9034832172260925, + "grad_norm": 4.042456150054932, + "learning_rate": 7.295892913547264e-06, + "loss": 0.787, + "step": 7514 + }, + { + "epoch": 1.9037365421152628, + "grad_norm": 3.56840443611145, + "learning_rate": 7.295148584611796e-06, + "loss": 0.7819, + "step": 7515 + }, + { + "epoch": 1.9039898670044333, + "grad_norm": 3.881992816925049, + "learning_rate": 7.294404191229633e-06, + "loss": 0.7504, + "step": 7516 + }, + { + "epoch": 1.9042431918936036, + "grad_norm": 4.397042274475098, + "learning_rate": 7.2936597334216755e-06, + "loss": 0.8692, + "step": 7517 + }, + { + "epoch": 1.9044965167827739, + "grad_norm": 3.9434621334075928, + "learning_rate": 7.292915211208828e-06, + "loss": 0.7889, + "step": 7518 + }, + { + "epoch": 1.9047498416719444, + "grad_norm": 3.710800886154175, + "learning_rate": 7.292170624611996e-06, + "loss": 0.802, + "step": 7519 + }, + { + "epoch": 1.9050031665611147, + "grad_norm": 3.4779880046844482, + "learning_rate": 7.291425973652087e-06, + "loss": 0.6578, + "step": 7520 + }, + { + "epoch": 1.905256491450285, + "grad_norm": 4.214772701263428, + "learning_rate": 7.290681258350011e-06, + "loss": 0.8795, + "step": 7521 + }, + { + "epoch": 1.9055098163394555, + "grad_norm": 3.4735662937164307, + "learning_rate": 7.289936478726678e-06, + "loss": 0.6979, + "step": 7522 + }, + { + "epoch": 1.9057631412286256, + "grad_norm": 3.842702627182007, + "learning_rate": 7.289191634803002e-06, + "loss": 0.7598, + "step": 7523 + }, + { + "epoch": 1.906016466117796, + "grad_norm": 4.186038970947266, + "learning_rate": 7.288446726599899e-06, + "loss": 0.8704, + "step": 7524 + }, + { + "epoch": 1.9062697910069666, + "grad_norm": 4.115698337554932, + "learning_rate": 7.287701754138283e-06, + "loss": 0.7708, + "step": 7525 + }, + { + "epoch": 1.9065231158961367, + "grad_norm": 3.6517629623413086, + "learning_rate": 7.286956717439075e-06, + "loss": 0.892, + "step": 7526 + }, + { + "epoch": 1.9067764407853072, + "grad_norm": 4.028857231140137, + "learning_rate": 7.286211616523193e-06, + "loss": 0.8999, + "step": 7527 + }, + { + "epoch": 1.9070297656744777, + "grad_norm": 3.448723554611206, + "learning_rate": 7.285466451411562e-06, + "loss": 0.7555, + "step": 7528 + }, + { + "epoch": 1.9072830905636478, + "grad_norm": 3.609137535095215, + "learning_rate": 7.2847212221251025e-06, + "loss": 0.8393, + "step": 7529 + }, + { + "epoch": 1.9075364154528183, + "grad_norm": 3.8167686462402344, + "learning_rate": 7.283975928684743e-06, + "loss": 0.8018, + "step": 7530 + }, + { + "epoch": 1.9077897403419886, + "grad_norm": 3.806180715560913, + "learning_rate": 7.2832305711114094e-06, + "loss": 0.7599, + "step": 7531 + }, + { + "epoch": 1.9080430652311589, + "grad_norm": 3.578470468521118, + "learning_rate": 7.282485149426031e-06, + "loss": 0.7468, + "step": 7532 + }, + { + "epoch": 1.9082963901203294, + "grad_norm": 3.534583806991577, + "learning_rate": 7.281739663649541e-06, + "loss": 0.7641, + "step": 7533 + }, + { + "epoch": 1.9085497150094997, + "grad_norm": 3.559796094894409, + "learning_rate": 7.28099411380287e-06, + "loss": 0.7771, + "step": 7534 + }, + { + "epoch": 1.90880303989867, + "grad_norm": 3.4391186237335205, + "learning_rate": 7.280248499906952e-06, + "loss": 0.8154, + "step": 7535 + }, + { + "epoch": 1.9090563647878405, + "grad_norm": 4.066721439361572, + "learning_rate": 7.279502821982725e-06, + "loss": 0.9031, + "step": 7536 + }, + { + "epoch": 1.9093096896770108, + "grad_norm": 4.040509223937988, + "learning_rate": 7.2787570800511284e-06, + "loss": 0.7016, + "step": 7537 + }, + { + "epoch": 1.909563014566181, + "grad_norm": 3.6556410789489746, + "learning_rate": 7.278011274133101e-06, + "loss": 0.7134, + "step": 7538 + }, + { + "epoch": 1.9098163394553516, + "grad_norm": 3.554013967514038, + "learning_rate": 7.277265404249585e-06, + "loss": 0.7547, + "step": 7539 + }, + { + "epoch": 1.9100696643445219, + "grad_norm": 3.970280408859253, + "learning_rate": 7.276519470421521e-06, + "loss": 0.8431, + "step": 7540 + }, + { + "epoch": 1.9103229892336921, + "grad_norm": 3.8135056495666504, + "learning_rate": 7.275773472669859e-06, + "loss": 0.8514, + "step": 7541 + }, + { + "epoch": 1.9105763141228627, + "grad_norm": 3.77400541305542, + "learning_rate": 7.275027411015545e-06, + "loss": 0.7811, + "step": 7542 + }, + { + "epoch": 1.910829639012033, + "grad_norm": 4.023411750793457, + "learning_rate": 7.274281285479527e-06, + "loss": 0.791, + "step": 7543 + }, + { + "epoch": 1.9110829639012032, + "grad_norm": 3.69769549369812, + "learning_rate": 7.273535096082758e-06, + "loss": 0.9101, + "step": 7544 + }, + { + "epoch": 1.9113362887903738, + "grad_norm": 3.17036509513855, + "learning_rate": 7.272788842846187e-06, + "loss": 0.6548, + "step": 7545 + }, + { + "epoch": 1.911589613679544, + "grad_norm": 3.6044716835021973, + "learning_rate": 7.272042525790771e-06, + "loss": 0.8813, + "step": 7546 + }, + { + "epoch": 1.9118429385687143, + "grad_norm": 3.249027729034424, + "learning_rate": 7.271296144937465e-06, + "loss": 0.7574, + "step": 7547 + }, + { + "epoch": 1.9120962634578849, + "grad_norm": 3.897709369659424, + "learning_rate": 7.2705497003072286e-06, + "loss": 0.8049, + "step": 7548 + }, + { + "epoch": 1.9123495883470552, + "grad_norm": 4.343056678771973, + "learning_rate": 7.269803191921022e-06, + "loss": 0.9261, + "step": 7549 + }, + { + "epoch": 1.9126029132362254, + "grad_norm": 3.8631136417388916, + "learning_rate": 7.269056619799805e-06, + "loss": 0.8151, + "step": 7550 + }, + { + "epoch": 1.912856238125396, + "grad_norm": 3.8449199199676514, + "learning_rate": 7.26830998396454e-06, + "loss": 0.7861, + "step": 7551 + }, + { + "epoch": 1.913109563014566, + "grad_norm": 4.171482086181641, + "learning_rate": 7.267563284436194e-06, + "loss": 0.8838, + "step": 7552 + }, + { + "epoch": 1.9133628879037365, + "grad_norm": 4.200960159301758, + "learning_rate": 7.266816521235736e-06, + "loss": 0.7035, + "step": 7553 + }, + { + "epoch": 1.913616212792907, + "grad_norm": 3.545377254486084, + "learning_rate": 7.2660696943841304e-06, + "loss": 0.7542, + "step": 7554 + }, + { + "epoch": 1.9138695376820771, + "grad_norm": 3.7115321159362793, + "learning_rate": 7.26532280390235e-06, + "loss": 0.77, + "step": 7555 + }, + { + "epoch": 1.9141228625712476, + "grad_norm": 3.9595353603363037, + "learning_rate": 7.264575849811368e-06, + "loss": 0.87, + "step": 7556 + }, + { + "epoch": 1.914376187460418, + "grad_norm": 3.5592095851898193, + "learning_rate": 7.263828832132156e-06, + "loss": 0.7631, + "step": 7557 + }, + { + "epoch": 1.9146295123495882, + "grad_norm": 3.449580192565918, + "learning_rate": 7.2630817508856924e-06, + "loss": 0.7732, + "step": 7558 + }, + { + "epoch": 1.9148828372387587, + "grad_norm": 3.6107394695281982, + "learning_rate": 7.262334606092954e-06, + "loss": 0.664, + "step": 7559 + }, + { + "epoch": 1.915136162127929, + "grad_norm": 3.481963634490967, + "learning_rate": 7.26158739777492e-06, + "loss": 0.7425, + "step": 7560 + }, + { + "epoch": 1.9153894870170993, + "grad_norm": 3.831488847732544, + "learning_rate": 7.2608401259525705e-06, + "loss": 0.7904, + "step": 7561 + }, + { + "epoch": 1.9156428119062698, + "grad_norm": 3.188628673553467, + "learning_rate": 7.260092790646891e-06, + "loss": 0.6817, + "step": 7562 + }, + { + "epoch": 1.9158961367954401, + "grad_norm": 3.5356452465057373, + "learning_rate": 7.259345391878866e-06, + "loss": 0.8112, + "step": 7563 + }, + { + "epoch": 1.9161494616846104, + "grad_norm": 3.9766602516174316, + "learning_rate": 7.258597929669481e-06, + "loss": 0.8041, + "step": 7564 + }, + { + "epoch": 1.916402786573781, + "grad_norm": 3.374429225921631, + "learning_rate": 7.257850404039724e-06, + "loss": 0.7427, + "step": 7565 + }, + { + "epoch": 1.9166561114629512, + "grad_norm": 4.160411834716797, + "learning_rate": 7.257102815010585e-06, + "loss": 0.9573, + "step": 7566 + }, + { + "epoch": 1.9169094363521215, + "grad_norm": 3.426292657852173, + "learning_rate": 7.256355162603058e-06, + "loss": 0.6885, + "step": 7567 + }, + { + "epoch": 1.917162761241292, + "grad_norm": 3.8988962173461914, + "learning_rate": 7.255607446838136e-06, + "loss": 0.9027, + "step": 7568 + }, + { + "epoch": 1.9174160861304623, + "grad_norm": 3.6588001251220703, + "learning_rate": 7.254859667736813e-06, + "loss": 0.731, + "step": 7569 + }, + { + "epoch": 1.9176694110196326, + "grad_norm": 3.37235689163208, + "learning_rate": 7.254111825320088e-06, + "loss": 0.8461, + "step": 7570 + }, + { + "epoch": 1.9179227359088031, + "grad_norm": 3.7164995670318604, + "learning_rate": 7.25336391960896e-06, + "loss": 0.6265, + "step": 7571 + }, + { + "epoch": 1.9181760607979734, + "grad_norm": 3.629835605621338, + "learning_rate": 7.252615950624428e-06, + "loss": 0.814, + "step": 7572 + }, + { + "epoch": 1.9184293856871437, + "grad_norm": 3.7369308471679688, + "learning_rate": 7.251867918387496e-06, + "loss": 0.7857, + "step": 7573 + }, + { + "epoch": 1.9186827105763142, + "grad_norm": 3.5513267517089844, + "learning_rate": 7.25111982291917e-06, + "loss": 0.7524, + "step": 7574 + }, + { + "epoch": 1.9189360354654845, + "grad_norm": 3.8231332302093506, + "learning_rate": 7.250371664240452e-06, + "loss": 0.6639, + "step": 7575 + }, + { + "epoch": 1.9191893603546548, + "grad_norm": 3.890836477279663, + "learning_rate": 7.249623442372353e-06, + "loss": 0.8387, + "step": 7576 + }, + { + "epoch": 1.9194426852438253, + "grad_norm": 3.755363702774048, + "learning_rate": 7.248875157335883e-06, + "loss": 0.7449, + "step": 7577 + }, + { + "epoch": 1.9196960101329954, + "grad_norm": 3.4669764041900635, + "learning_rate": 7.248126809152052e-06, + "loss": 0.7403, + "step": 7578 + }, + { + "epoch": 1.919949335022166, + "grad_norm": 4.011867523193359, + "learning_rate": 7.247378397841873e-06, + "loss": 0.7917, + "step": 7579 + }, + { + "epoch": 1.9202026599113364, + "grad_norm": 3.4292800426483154, + "learning_rate": 7.246629923426363e-06, + "loss": 0.8322, + "step": 7580 + }, + { + "epoch": 1.9204559848005065, + "grad_norm": 3.6036596298217773, + "learning_rate": 7.245881385926537e-06, + "loss": 0.7647, + "step": 7581 + }, + { + "epoch": 1.920709309689677, + "grad_norm": 3.850926160812378, + "learning_rate": 7.2451327853634145e-06, + "loss": 0.7483, + "step": 7582 + }, + { + "epoch": 1.9209626345788475, + "grad_norm": 3.4227712154388428, + "learning_rate": 7.2443841217580165e-06, + "loss": 0.7001, + "step": 7583 + }, + { + "epoch": 1.9212159594680176, + "grad_norm": 3.340449094772339, + "learning_rate": 7.243635395131364e-06, + "loss": 0.6489, + "step": 7584 + }, + { + "epoch": 1.9214692843571881, + "grad_norm": 3.679422378540039, + "learning_rate": 7.242886605504481e-06, + "loss": 0.7541, + "step": 7585 + }, + { + "epoch": 1.9217226092463584, + "grad_norm": 3.835430383682251, + "learning_rate": 7.242137752898393e-06, + "loss": 0.8837, + "step": 7586 + }, + { + "epoch": 1.9219759341355287, + "grad_norm": 3.531416654586792, + "learning_rate": 7.241388837334126e-06, + "loss": 0.7448, + "step": 7587 + }, + { + "epoch": 1.9222292590246992, + "grad_norm": 4.176506042480469, + "learning_rate": 7.240639858832716e-06, + "loss": 0.8177, + "step": 7588 + }, + { + "epoch": 1.9224825839138695, + "grad_norm": 3.8392982482910156, + "learning_rate": 7.239890817415185e-06, + "loss": 0.8242, + "step": 7589 + }, + { + "epoch": 1.9227359088030398, + "grad_norm": 4.653436183929443, + "learning_rate": 7.239141713102569e-06, + "loss": 1.0268, + "step": 7590 + }, + { + "epoch": 1.9229892336922103, + "grad_norm": 3.7734992504119873, + "learning_rate": 7.238392545915905e-06, + "loss": 0.6918, + "step": 7591 + }, + { + "epoch": 1.9232425585813806, + "grad_norm": 3.7001616954803467, + "learning_rate": 7.2376433158762264e-06, + "loss": 0.856, + "step": 7592 + }, + { + "epoch": 1.923495883470551, + "grad_norm": 3.344609498977661, + "learning_rate": 7.2368940230045724e-06, + "loss": 0.7274, + "step": 7593 + }, + { + "epoch": 1.9237492083597214, + "grad_norm": 3.9480466842651367, + "learning_rate": 7.236144667321982e-06, + "loss": 0.9505, + "step": 7594 + }, + { + "epoch": 1.9240025332488917, + "grad_norm": 3.309288740158081, + "learning_rate": 7.2353952488494994e-06, + "loss": 0.7103, + "step": 7595 + }, + { + "epoch": 1.924255858138062, + "grad_norm": 3.4350290298461914, + "learning_rate": 7.2346457676081636e-06, + "loss": 0.6724, + "step": 7596 + }, + { + "epoch": 1.9245091830272325, + "grad_norm": 3.8128178119659424, + "learning_rate": 7.233896223619023e-06, + "loss": 0.8788, + "step": 7597 + }, + { + "epoch": 1.9247625079164028, + "grad_norm": 3.7690844535827637, + "learning_rate": 7.233146616903125e-06, + "loss": 0.7628, + "step": 7598 + }, + { + "epoch": 1.925015832805573, + "grad_norm": 3.6346118450164795, + "learning_rate": 7.232396947481515e-06, + "loss": 0.7439, + "step": 7599 + }, + { + "epoch": 1.9252691576947436, + "grad_norm": 3.893569231033325, + "learning_rate": 7.231647215375245e-06, + "loss": 0.7907, + "step": 7600 + }, + { + "epoch": 1.925522482583914, + "grad_norm": 3.619025945663452, + "learning_rate": 7.230897420605367e-06, + "loss": 0.7784, + "step": 7601 + }, + { + "epoch": 1.9257758074730842, + "grad_norm": 4.2273101806640625, + "learning_rate": 7.230147563192935e-06, + "loss": 0.9102, + "step": 7602 + }, + { + "epoch": 1.9260291323622547, + "grad_norm": 3.5847582817077637, + "learning_rate": 7.229397643159005e-06, + "loss": 0.7227, + "step": 7603 + }, + { + "epoch": 1.926282457251425, + "grad_norm": 3.5376057624816895, + "learning_rate": 7.228647660524634e-06, + "loss": 0.8489, + "step": 7604 + }, + { + "epoch": 1.9265357821405953, + "grad_norm": 3.898679733276367, + "learning_rate": 7.227897615310881e-06, + "loss": 0.8561, + "step": 7605 + }, + { + "epoch": 1.9267891070297658, + "grad_norm": 3.8402748107910156, + "learning_rate": 7.227147507538806e-06, + "loss": 0.6799, + "step": 7606 + }, + { + "epoch": 1.9270424319189359, + "grad_norm": 3.4418163299560547, + "learning_rate": 7.226397337229475e-06, + "loss": 0.7967, + "step": 7607 + }, + { + "epoch": 1.9272957568081064, + "grad_norm": 3.647125720977783, + "learning_rate": 7.22564710440395e-06, + "loss": 0.7052, + "step": 7608 + }, + { + "epoch": 1.927549081697277, + "grad_norm": 3.678846836090088, + "learning_rate": 7.224896809083297e-06, + "loss": 0.8326, + "step": 7609 + }, + { + "epoch": 1.927802406586447, + "grad_norm": 3.386265277862549, + "learning_rate": 7.2241464512885825e-06, + "loss": 0.6196, + "step": 7610 + }, + { + "epoch": 1.9280557314756175, + "grad_norm": 3.275310754776001, + "learning_rate": 7.22339603104088e-06, + "loss": 0.7026, + "step": 7611 + }, + { + "epoch": 1.928309056364788, + "grad_norm": 3.610837697982788, + "learning_rate": 7.222645548361259e-06, + "loss": 0.6893, + "step": 7612 + }, + { + "epoch": 1.928562381253958, + "grad_norm": 4.1723761558532715, + "learning_rate": 7.221895003270794e-06, + "loss": 0.859, + "step": 7613 + }, + { + "epoch": 1.9288157061431286, + "grad_norm": 3.472421884536743, + "learning_rate": 7.221144395790556e-06, + "loss": 0.7079, + "step": 7614 + }, + { + "epoch": 1.9290690310322989, + "grad_norm": 3.805896520614624, + "learning_rate": 7.220393725941625e-06, + "loss": 0.7836, + "step": 7615 + }, + { + "epoch": 1.9293223559214692, + "grad_norm": 3.836207389831543, + "learning_rate": 7.219642993745079e-06, + "loss": 0.8581, + "step": 7616 + }, + { + "epoch": 1.9295756808106397, + "grad_norm": 4.602386474609375, + "learning_rate": 7.218892199221997e-06, + "loss": 0.7302, + "step": 7617 + }, + { + "epoch": 1.92982900569981, + "grad_norm": 3.5737602710723877, + "learning_rate": 7.218141342393464e-06, + "loss": 0.8373, + "step": 7618 + }, + { + "epoch": 1.9300823305889803, + "grad_norm": 3.311408042907715, + "learning_rate": 7.217390423280561e-06, + "loss": 0.7424, + "step": 7619 + }, + { + "epoch": 1.9303356554781508, + "grad_norm": 4.090540885925293, + "learning_rate": 7.216639441904374e-06, + "loss": 1.0196, + "step": 7620 + }, + { + "epoch": 1.930588980367321, + "grad_norm": 3.4615042209625244, + "learning_rate": 7.215888398285991e-06, + "loss": 0.7255, + "step": 7621 + }, + { + "epoch": 1.9308423052564914, + "grad_norm": 3.5449633598327637, + "learning_rate": 7.215137292446499e-06, + "loss": 0.7636, + "step": 7622 + }, + { + "epoch": 1.9310956301456619, + "grad_norm": 3.802718162536621, + "learning_rate": 7.214386124406992e-06, + "loss": 0.6599, + "step": 7623 + }, + { + "epoch": 1.9313489550348322, + "grad_norm": 3.457990884780884, + "learning_rate": 7.213634894188559e-06, + "loss": 0.7614, + "step": 7624 + }, + { + "epoch": 1.9316022799240025, + "grad_norm": 3.6755120754241943, + "learning_rate": 7.212883601812296e-06, + "loss": 0.8009, + "step": 7625 + }, + { + "epoch": 1.931855604813173, + "grad_norm": 3.7048819065093994, + "learning_rate": 7.212132247299298e-06, + "loss": 0.8258, + "step": 7626 + }, + { + "epoch": 1.9321089297023433, + "grad_norm": 3.331568479537964, + "learning_rate": 7.211380830670663e-06, + "loss": 0.7657, + "step": 7627 + }, + { + "epoch": 1.9323622545915136, + "grad_norm": 3.7705137729644775, + "learning_rate": 7.210629351947491e-06, + "loss": 0.7385, + "step": 7628 + }, + { + "epoch": 1.932615579480684, + "grad_norm": 4.362046241760254, + "learning_rate": 7.209877811150884e-06, + "loss": 0.9805, + "step": 7629 + }, + { + "epoch": 1.9328689043698544, + "grad_norm": 3.570105791091919, + "learning_rate": 7.209126208301943e-06, + "loss": 0.7812, + "step": 7630 + }, + { + "epoch": 1.9331222292590247, + "grad_norm": 3.9807088375091553, + "learning_rate": 7.208374543421773e-06, + "loss": 0.7976, + "step": 7631 + }, + { + "epoch": 1.9333755541481952, + "grad_norm": 3.6694552898406982, + "learning_rate": 7.2076228165314835e-06, + "loss": 0.8222, + "step": 7632 + }, + { + "epoch": 1.9336288790373655, + "grad_norm": 3.9893441200256348, + "learning_rate": 7.206871027652177e-06, + "loss": 0.8547, + "step": 7633 + }, + { + "epoch": 1.9338822039265358, + "grad_norm": 3.9237453937530518, + "learning_rate": 7.206119176804966e-06, + "loss": 0.8376, + "step": 7634 + }, + { + "epoch": 1.9341355288157063, + "grad_norm": 3.7758641242980957, + "learning_rate": 7.205367264010965e-06, + "loss": 0.7581, + "step": 7635 + }, + { + "epoch": 1.9343888537048763, + "grad_norm": 3.6023716926574707, + "learning_rate": 7.204615289291283e-06, + "loss": 0.777, + "step": 7636 + }, + { + "epoch": 1.9346421785940469, + "grad_norm": 3.6123085021972656, + "learning_rate": 7.203863252667038e-06, + "loss": 0.6647, + "step": 7637 + }, + { + "epoch": 1.9348955034832174, + "grad_norm": 4.211479187011719, + "learning_rate": 7.203111154159346e-06, + "loss": 0.8433, + "step": 7638 + }, + { + "epoch": 1.9351488283723874, + "grad_norm": 3.6146914958953857, + "learning_rate": 7.202358993789323e-06, + "loss": 0.6477, + "step": 7639 + }, + { + "epoch": 1.935402153261558, + "grad_norm": 3.787353277206421, + "learning_rate": 7.201606771578092e-06, + "loss": 0.7655, + "step": 7640 + }, + { + "epoch": 1.9356554781507285, + "grad_norm": 3.9961917400360107, + "learning_rate": 7.200854487546776e-06, + "loss": 0.8435, + "step": 7641 + }, + { + "epoch": 1.9359088030398985, + "grad_norm": 3.7698097229003906, + "learning_rate": 7.200102141716498e-06, + "loss": 0.6617, + "step": 7642 + }, + { + "epoch": 1.936162127929069, + "grad_norm": 3.973029613494873, + "learning_rate": 7.199349734108383e-06, + "loss": 0.8296, + "step": 7643 + }, + { + "epoch": 1.9364154528182393, + "grad_norm": 3.907489538192749, + "learning_rate": 7.198597264743558e-06, + "loss": 0.9128, + "step": 7644 + }, + { + "epoch": 1.9366687777074096, + "grad_norm": 3.565837860107422, + "learning_rate": 7.197844733643152e-06, + "loss": 0.7948, + "step": 7645 + }, + { + "epoch": 1.9369221025965802, + "grad_norm": 3.805122137069702, + "learning_rate": 7.197092140828295e-06, + "loss": 0.8112, + "step": 7646 + }, + { + "epoch": 1.9371754274857504, + "grad_norm": 3.4625070095062256, + "learning_rate": 7.196339486320122e-06, + "loss": 0.8329, + "step": 7647 + }, + { + "epoch": 1.9374287523749207, + "grad_norm": 4.034064292907715, + "learning_rate": 7.195586770139764e-06, + "loss": 0.681, + "step": 7648 + }, + { + "epoch": 1.9376820772640913, + "grad_norm": 3.428379535675049, + "learning_rate": 7.1948339923083596e-06, + "loss": 0.7458, + "step": 7649 + }, + { + "epoch": 1.9379354021532615, + "grad_norm": 3.5189270973205566, + "learning_rate": 7.194081152847045e-06, + "loss": 0.956, + "step": 7650 + }, + { + "epoch": 1.9381887270424318, + "grad_norm": 3.7598392963409424, + "learning_rate": 7.19332825177696e-06, + "loss": 0.6577, + "step": 7651 + }, + { + "epoch": 1.9384420519316024, + "grad_norm": 4.254335880279541, + "learning_rate": 7.192575289119246e-06, + "loss": 0.8169, + "step": 7652 + }, + { + "epoch": 1.9386953768207726, + "grad_norm": 3.6726346015930176, + "learning_rate": 7.191822264895045e-06, + "loss": 0.7635, + "step": 7653 + }, + { + "epoch": 1.938948701709943, + "grad_norm": 3.438404083251953, + "learning_rate": 7.191069179125501e-06, + "loss": 0.7811, + "step": 7654 + }, + { + "epoch": 1.9392020265991134, + "grad_norm": 3.4573304653167725, + "learning_rate": 7.190316031831763e-06, + "loss": 0.8712, + "step": 7655 + }, + { + "epoch": 1.9394553514882837, + "grad_norm": 3.2838821411132812, + "learning_rate": 7.189562823034977e-06, + "loss": 0.741, + "step": 7656 + }, + { + "epoch": 1.939708676377454, + "grad_norm": 3.823986291885376, + "learning_rate": 7.188809552756294e-06, + "loss": 0.8245, + "step": 7657 + }, + { + "epoch": 1.9399620012666245, + "grad_norm": 3.4117276668548584, + "learning_rate": 7.188056221016864e-06, + "loss": 0.7209, + "step": 7658 + }, + { + "epoch": 1.9402153261557948, + "grad_norm": 3.7551310062408447, + "learning_rate": 7.187302827837841e-06, + "loss": 0.7397, + "step": 7659 + }, + { + "epoch": 1.9404686510449651, + "grad_norm": 4.005005359649658, + "learning_rate": 7.186549373240379e-06, + "loss": 0.7896, + "step": 7660 + }, + { + "epoch": 1.9407219759341356, + "grad_norm": 3.9484128952026367, + "learning_rate": 7.185795857245634e-06, + "loss": 0.8911, + "step": 7661 + }, + { + "epoch": 1.940975300823306, + "grad_norm": 4.208717346191406, + "learning_rate": 7.18504227987477e-06, + "loss": 0.8898, + "step": 7662 + }, + { + "epoch": 1.9412286257124762, + "grad_norm": 3.811157703399658, + "learning_rate": 7.1842886411489395e-06, + "loss": 0.9327, + "step": 7663 + }, + { + "epoch": 1.9414819506016467, + "grad_norm": 3.709144115447998, + "learning_rate": 7.183534941089308e-06, + "loss": 0.8782, + "step": 7664 + }, + { + "epoch": 1.9417352754908168, + "grad_norm": 3.7643768787384033, + "learning_rate": 7.182781179717038e-06, + "loss": 0.802, + "step": 7665 + }, + { + "epoch": 1.9419886003799873, + "grad_norm": 4.248831748962402, + "learning_rate": 7.182027357053297e-06, + "loss": 0.7969, + "step": 7666 + }, + { + "epoch": 1.9422419252691578, + "grad_norm": 3.9656169414520264, + "learning_rate": 7.181273473119251e-06, + "loss": 0.893, + "step": 7667 + }, + { + "epoch": 1.942495250158328, + "grad_norm": 3.2689943313598633, + "learning_rate": 7.1805195279360674e-06, + "loss": 0.6699, + "step": 7668 + }, + { + "epoch": 1.9427485750474984, + "grad_norm": 3.957042932510376, + "learning_rate": 7.179765521524917e-06, + "loss": 0.7986, + "step": 7669 + }, + { + "epoch": 1.943001899936669, + "grad_norm": 4.168601989746094, + "learning_rate": 7.1790114539069725e-06, + "loss": 0.7537, + "step": 7670 + }, + { + "epoch": 1.943255224825839, + "grad_norm": 3.1504342555999756, + "learning_rate": 7.178257325103408e-06, + "loss": 0.6949, + "step": 7671 + }, + { + "epoch": 1.9435085497150095, + "grad_norm": 3.6926889419555664, + "learning_rate": 7.177503135135399e-06, + "loss": 0.8342, + "step": 7672 + }, + { + "epoch": 1.9437618746041798, + "grad_norm": 3.5451834201812744, + "learning_rate": 7.176748884024123e-06, + "loss": 0.7534, + "step": 7673 + }, + { + "epoch": 1.94401519949335, + "grad_norm": 3.647296190261841, + "learning_rate": 7.175994571790756e-06, + "loss": 0.8815, + "step": 7674 + }, + { + "epoch": 1.9442685243825206, + "grad_norm": 3.3122143745422363, + "learning_rate": 7.175240198456484e-06, + "loss": 0.7635, + "step": 7675 + }, + { + "epoch": 1.944521849271691, + "grad_norm": 3.637890338897705, + "learning_rate": 7.174485764042485e-06, + "loss": 0.9206, + "step": 7676 + }, + { + "epoch": 1.9447751741608612, + "grad_norm": 3.588634729385376, + "learning_rate": 7.1737312685699456e-06, + "loss": 0.8351, + "step": 7677 + }, + { + "epoch": 1.9450284990500317, + "grad_norm": 3.2763001918792725, + "learning_rate": 7.17297671206005e-06, + "loss": 0.7212, + "step": 7678 + }, + { + "epoch": 1.945281823939202, + "grad_norm": 3.9934630393981934, + "learning_rate": 7.172222094533987e-06, + "loss": 0.8481, + "step": 7679 + }, + { + "epoch": 1.9455351488283723, + "grad_norm": 3.5940845012664795, + "learning_rate": 7.171467416012947e-06, + "loss": 0.7426, + "step": 7680 + }, + { + "epoch": 1.9457884737175428, + "grad_norm": 3.457130193710327, + "learning_rate": 7.17071267651812e-06, + "loss": 0.7158, + "step": 7681 + }, + { + "epoch": 1.9460417986067131, + "grad_norm": 4.111364364624023, + "learning_rate": 7.169957876070697e-06, + "loss": 0.7249, + "step": 7682 + }, + { + "epoch": 1.9462951234958834, + "grad_norm": 3.623413324356079, + "learning_rate": 7.169203014691874e-06, + "loss": 0.8585, + "step": 7683 + }, + { + "epoch": 1.946548448385054, + "grad_norm": 3.4657838344573975, + "learning_rate": 7.1684480924028466e-06, + "loss": 0.7721, + "step": 7684 + }, + { + "epoch": 1.9468017732742242, + "grad_norm": 3.9353878498077393, + "learning_rate": 7.167693109224814e-06, + "loss": 0.7129, + "step": 7685 + }, + { + "epoch": 1.9470550981633945, + "grad_norm": 3.6031713485717773, + "learning_rate": 7.166938065178976e-06, + "loss": 0.7578, + "step": 7686 + }, + { + "epoch": 1.947308423052565, + "grad_norm": 3.3692307472229004, + "learning_rate": 7.166182960286531e-06, + "loss": 0.8478, + "step": 7687 + }, + { + "epoch": 1.9475617479417353, + "grad_norm": 3.4741501808166504, + "learning_rate": 7.165427794568683e-06, + "loss": 0.7355, + "step": 7688 + }, + { + "epoch": 1.9478150728309056, + "grad_norm": 3.8740181922912598, + "learning_rate": 7.1646725680466374e-06, + "loss": 0.9451, + "step": 7689 + }, + { + "epoch": 1.9480683977200761, + "grad_norm": 3.816019058227539, + "learning_rate": 7.163917280741601e-06, + "loss": 0.8195, + "step": 7690 + }, + { + "epoch": 1.9483217226092464, + "grad_norm": 3.896796703338623, + "learning_rate": 7.1631619326747836e-06, + "loss": 0.8767, + "step": 7691 + }, + { + "epoch": 1.9485750474984167, + "grad_norm": 3.359934091567993, + "learning_rate": 7.162406523867391e-06, + "loss": 0.5411, + "step": 7692 + }, + { + "epoch": 1.9488283723875872, + "grad_norm": 3.7625036239624023, + "learning_rate": 7.161651054340637e-06, + "loss": 0.8589, + "step": 7693 + }, + { + "epoch": 1.9490816972767573, + "grad_norm": 3.353876829147339, + "learning_rate": 7.160895524115734e-06, + "loss": 0.7515, + "step": 7694 + }, + { + "epoch": 1.9493350221659278, + "grad_norm": 3.5972187519073486, + "learning_rate": 7.160139933213899e-06, + "loss": 0.8326, + "step": 7695 + }, + { + "epoch": 1.9495883470550983, + "grad_norm": 4.180943489074707, + "learning_rate": 7.159384281656346e-06, + "loss": 0.8135, + "step": 7696 + }, + { + "epoch": 1.9498416719442684, + "grad_norm": 3.5679547786712646, + "learning_rate": 7.158628569464295e-06, + "loss": 0.6982, + "step": 7697 + }, + { + "epoch": 1.950094996833439, + "grad_norm": 3.4704604148864746, + "learning_rate": 7.157872796658964e-06, + "loss": 0.7795, + "step": 7698 + }, + { + "epoch": 1.9503483217226094, + "grad_norm": 4.002320766448975, + "learning_rate": 7.157116963261579e-06, + "loss": 0.8208, + "step": 7699 + }, + { + "epoch": 1.9506016466117795, + "grad_norm": 3.9060094356536865, + "learning_rate": 7.156361069293358e-06, + "loss": 0.9398, + "step": 7700 + }, + { + "epoch": 1.95085497150095, + "grad_norm": 4.0416951179504395, + "learning_rate": 7.15560511477553e-06, + "loss": 0.7783, + "step": 7701 + }, + { + "epoch": 1.9511082963901203, + "grad_norm": 3.742638111114502, + "learning_rate": 7.15484909972932e-06, + "loss": 0.8322, + "step": 7702 + }, + { + "epoch": 1.9513616212792906, + "grad_norm": 3.8053700923919678, + "learning_rate": 7.1540930241759575e-06, + "loss": 0.8473, + "step": 7703 + }, + { + "epoch": 1.951614946168461, + "grad_norm": 3.480557918548584, + "learning_rate": 7.153336888136673e-06, + "loss": 0.8242, + "step": 7704 + }, + { + "epoch": 1.9518682710576314, + "grad_norm": 3.211710214614868, + "learning_rate": 7.152580691632697e-06, + "loss": 0.7533, + "step": 7705 + }, + { + "epoch": 1.9521215959468017, + "grad_norm": 3.7976908683776855, + "learning_rate": 7.151824434685265e-06, + "loss": 0.8369, + "step": 7706 + }, + { + "epoch": 1.9523749208359722, + "grad_norm": 3.8064568042755127, + "learning_rate": 7.1510681173156116e-06, + "loss": 0.7898, + "step": 7707 + }, + { + "epoch": 1.9526282457251425, + "grad_norm": 3.578582525253296, + "learning_rate": 7.150311739544973e-06, + "loss": 0.7215, + "step": 7708 + }, + { + "epoch": 1.9528815706143128, + "grad_norm": 3.4547502994537354, + "learning_rate": 7.149555301394588e-06, + "loss": 0.7414, + "step": 7709 + }, + { + "epoch": 1.9531348955034833, + "grad_norm": 3.6342086791992188, + "learning_rate": 7.148798802885698e-06, + "loss": 0.7902, + "step": 7710 + }, + { + "epoch": 1.9533882203926536, + "grad_norm": 3.623147964477539, + "learning_rate": 7.148042244039546e-06, + "loss": 0.6889, + "step": 7711 + }, + { + "epoch": 1.9536415452818239, + "grad_norm": 3.8836662769317627, + "learning_rate": 7.1472856248773725e-06, + "loss": 0.8143, + "step": 7712 + }, + { + "epoch": 1.9538948701709944, + "grad_norm": 3.334202766418457, + "learning_rate": 7.146528945420426e-06, + "loss": 0.7151, + "step": 7713 + }, + { + "epoch": 1.9541481950601647, + "grad_norm": 3.4288785457611084, + "learning_rate": 7.1457722056899525e-06, + "loss": 0.7372, + "step": 7714 + }, + { + "epoch": 1.954401519949335, + "grad_norm": 3.8028464317321777, + "learning_rate": 7.1450154057072e-06, + "loss": 0.8303, + "step": 7715 + }, + { + "epoch": 1.9546548448385055, + "grad_norm": 3.7775213718414307, + "learning_rate": 7.144258545493422e-06, + "loss": 0.8075, + "step": 7716 + }, + { + "epoch": 1.9549081697276758, + "grad_norm": 3.732243776321411, + "learning_rate": 7.143501625069869e-06, + "loss": 0.8185, + "step": 7717 + }, + { + "epoch": 1.955161494616846, + "grad_norm": 3.6021058559417725, + "learning_rate": 7.142744644457794e-06, + "loss": 0.7003, + "step": 7718 + }, + { + "epoch": 1.9554148195060166, + "grad_norm": 4.029145240783691, + "learning_rate": 7.141987603678454e-06, + "loss": 0.7883, + "step": 7719 + }, + { + "epoch": 1.9556681443951869, + "grad_norm": 3.088573455810547, + "learning_rate": 7.141230502753106e-06, + "loss": 0.6743, + "step": 7720 + }, + { + "epoch": 1.9559214692843572, + "grad_norm": 3.913045644760132, + "learning_rate": 7.1404733417030105e-06, + "loss": 0.8128, + "step": 7721 + }, + { + "epoch": 1.9561747941735277, + "grad_norm": 3.5408408641815186, + "learning_rate": 7.139716120549425e-06, + "loss": 0.8064, + "step": 7722 + }, + { + "epoch": 1.9564281190626978, + "grad_norm": 3.7428712844848633, + "learning_rate": 7.138958839313613e-06, + "loss": 0.6369, + "step": 7723 + }, + { + "epoch": 1.9566814439518683, + "grad_norm": 3.7903828620910645, + "learning_rate": 7.138201498016841e-06, + "loss": 0.7891, + "step": 7724 + }, + { + "epoch": 1.9569347688410388, + "grad_norm": 3.8979010581970215, + "learning_rate": 7.137444096680373e-06, + "loss": 0.9399, + "step": 7725 + }, + { + "epoch": 1.9571880937302089, + "grad_norm": 3.3378214836120605, + "learning_rate": 7.136686635325475e-06, + "loss": 0.7309, + "step": 7726 + }, + { + "epoch": 1.9574414186193794, + "grad_norm": 4.116669654846191, + "learning_rate": 7.1359291139734185e-06, + "loss": 0.8399, + "step": 7727 + }, + { + "epoch": 1.9576947435085497, + "grad_norm": 3.8506109714508057, + "learning_rate": 7.1351715326454725e-06, + "loss": 0.7953, + "step": 7728 + }, + { + "epoch": 1.95794806839772, + "grad_norm": 3.805879592895508, + "learning_rate": 7.1344138913629125e-06, + "loss": 0.8638, + "step": 7729 + }, + { + "epoch": 1.9582013932868905, + "grad_norm": 3.966623544692993, + "learning_rate": 7.13365619014701e-06, + "loss": 0.822, + "step": 7730 + }, + { + "epoch": 1.9584547181760608, + "grad_norm": 3.9578821659088135, + "learning_rate": 7.132898429019043e-06, + "loss": 0.7866, + "step": 7731 + }, + { + "epoch": 1.958708043065231, + "grad_norm": 3.6943609714508057, + "learning_rate": 7.132140608000286e-06, + "loss": 0.8091, + "step": 7732 + }, + { + "epoch": 1.9589613679544016, + "grad_norm": 3.6804726123809814, + "learning_rate": 7.131382727112021e-06, + "loss": 0.7785, + "step": 7733 + }, + { + "epoch": 1.9592146928435719, + "grad_norm": 3.286087989807129, + "learning_rate": 7.130624786375526e-06, + "loss": 0.6549, + "step": 7734 + }, + { + "epoch": 1.9594680177327422, + "grad_norm": 3.5736477375030518, + "learning_rate": 7.129866785812087e-06, + "loss": 0.6937, + "step": 7735 + }, + { + "epoch": 1.9597213426219127, + "grad_norm": 3.499422073364258, + "learning_rate": 7.129108725442988e-06, + "loss": 0.8102, + "step": 7736 + }, + { + "epoch": 1.959974667511083, + "grad_norm": 3.411736249923706, + "learning_rate": 7.128350605289512e-06, + "loss": 0.8138, + "step": 7737 + }, + { + "epoch": 1.9602279924002532, + "grad_norm": 3.406217336654663, + "learning_rate": 7.12759242537295e-06, + "loss": 0.7493, + "step": 7738 + }, + { + "epoch": 1.9604813172894238, + "grad_norm": 3.62760591506958, + "learning_rate": 7.126834185714588e-06, + "loss": 0.8138, + "step": 7739 + }, + { + "epoch": 1.960734642178594, + "grad_norm": 4.046286582946777, + "learning_rate": 7.12607588633572e-06, + "loss": 0.7405, + "step": 7740 + }, + { + "epoch": 1.9609879670677643, + "grad_norm": 3.9970147609710693, + "learning_rate": 7.125317527257638e-06, + "loss": 0.8729, + "step": 7741 + }, + { + "epoch": 1.9612412919569349, + "grad_norm": 3.835890769958496, + "learning_rate": 7.124559108501636e-06, + "loss": 0.7778, + "step": 7742 + }, + { + "epoch": 1.9614946168461052, + "grad_norm": 3.5552656650543213, + "learning_rate": 7.123800630089008e-06, + "loss": 0.6406, + "step": 7743 + }, + { + "epoch": 1.9617479417352754, + "grad_norm": 4.11686372756958, + "learning_rate": 7.123042092041056e-06, + "loss": 0.6692, + "step": 7744 + }, + { + "epoch": 1.962001266624446, + "grad_norm": 3.6176023483276367, + "learning_rate": 7.122283494379076e-06, + "loss": 0.7699, + "step": 7745 + }, + { + "epoch": 1.9622545915136163, + "grad_norm": 3.621389627456665, + "learning_rate": 7.12152483712437e-06, + "loss": 0.8314, + "step": 7746 + }, + { + "epoch": 1.9625079164027865, + "grad_norm": 3.503429412841797, + "learning_rate": 7.1207661202982416e-06, + "loss": 0.7435, + "step": 7747 + }, + { + "epoch": 1.962761241291957, + "grad_norm": 3.614459276199341, + "learning_rate": 7.120007343921994e-06, + "loss": 0.8376, + "step": 7748 + }, + { + "epoch": 1.9630145661811271, + "grad_norm": 3.402324914932251, + "learning_rate": 7.119248508016934e-06, + "loss": 0.7428, + "step": 7749 + }, + { + "epoch": 1.9632678910702976, + "grad_norm": 4.012660980224609, + "learning_rate": 7.118489612604369e-06, + "loss": 0.817, + "step": 7750 + }, + { + "epoch": 1.9635212159594682, + "grad_norm": 4.236724376678467, + "learning_rate": 7.117730657705608e-06, + "loss": 0.8299, + "step": 7751 + }, + { + "epoch": 1.9637745408486382, + "grad_norm": 3.6919877529144287, + "learning_rate": 7.116971643341964e-06, + "loss": 0.8417, + "step": 7752 + }, + { + "epoch": 1.9640278657378087, + "grad_norm": 3.5648860931396484, + "learning_rate": 7.116212569534747e-06, + "loss": 0.737, + "step": 7753 + }, + { + "epoch": 1.9642811906269793, + "grad_norm": 3.3912346363067627, + "learning_rate": 7.115453436305273e-06, + "loss": 0.7417, + "step": 7754 + }, + { + "epoch": 1.9645345155161493, + "grad_norm": 3.2913858890533447, + "learning_rate": 7.11469424367486e-06, + "loss": 0.6647, + "step": 7755 + }, + { + "epoch": 1.9647878404053198, + "grad_norm": 3.5995373725891113, + "learning_rate": 7.113934991664821e-06, + "loss": 0.7586, + "step": 7756 + }, + { + "epoch": 1.9650411652944901, + "grad_norm": 3.7692344188690186, + "learning_rate": 7.11317568029648e-06, + "loss": 0.7317, + "step": 7757 + }, + { + "epoch": 1.9652944901836604, + "grad_norm": 3.9188623428344727, + "learning_rate": 7.112416309591156e-06, + "loss": 0.7836, + "step": 7758 + }, + { + "epoch": 1.965547815072831, + "grad_norm": 3.634542942047119, + "learning_rate": 7.111656879570173e-06, + "loss": 0.761, + "step": 7759 + }, + { + "epoch": 1.9658011399620012, + "grad_norm": 3.5849297046661377, + "learning_rate": 7.110897390254853e-06, + "loss": 0.8241, + "step": 7760 + }, + { + "epoch": 1.9660544648511715, + "grad_norm": 3.7755422592163086, + "learning_rate": 7.110137841666524e-06, + "loss": 0.8205, + "step": 7761 + }, + { + "epoch": 1.966307789740342, + "grad_norm": 3.55366587638855, + "learning_rate": 7.109378233826513e-06, + "loss": 0.7559, + "step": 7762 + }, + { + "epoch": 1.9665611146295123, + "grad_norm": 4.0000834465026855, + "learning_rate": 7.10861856675615e-06, + "loss": 0.8388, + "step": 7763 + }, + { + "epoch": 1.9668144395186826, + "grad_norm": 3.51295804977417, + "learning_rate": 7.107858840476766e-06, + "loss": 0.7637, + "step": 7764 + }, + { + "epoch": 1.9670677644078531, + "grad_norm": 3.6552960872650146, + "learning_rate": 7.107099055009694e-06, + "loss": 0.8367, + "step": 7765 + }, + { + "epoch": 1.9673210892970234, + "grad_norm": 3.5162038803100586, + "learning_rate": 7.106339210376267e-06, + "loss": 0.7118, + "step": 7766 + }, + { + "epoch": 1.9675744141861937, + "grad_norm": 4.007128715515137, + "learning_rate": 7.105579306597823e-06, + "loss": 0.7799, + "step": 7767 + }, + { + "epoch": 1.9678277390753642, + "grad_norm": 3.583599805831909, + "learning_rate": 7.104819343695699e-06, + "loss": 0.8737, + "step": 7768 + }, + { + "epoch": 1.9680810639645345, + "grad_norm": 3.4805471897125244, + "learning_rate": 7.1040593216912335e-06, + "loss": 0.7568, + "step": 7769 + }, + { + "epoch": 1.9683343888537048, + "grad_norm": 3.582451581954956, + "learning_rate": 7.10329924060577e-06, + "loss": 0.7771, + "step": 7770 + }, + { + "epoch": 1.9685877137428753, + "grad_norm": 3.251368999481201, + "learning_rate": 7.102539100460648e-06, + "loss": 0.8377, + "step": 7771 + }, + { + "epoch": 1.9688410386320456, + "grad_norm": 3.5506715774536133, + "learning_rate": 7.101778901277214e-06, + "loss": 0.76, + "step": 7772 + }, + { + "epoch": 1.969094363521216, + "grad_norm": 3.9643442630767822, + "learning_rate": 7.1010186430768134e-06, + "loss": 0.8701, + "step": 7773 + }, + { + "epoch": 1.9693476884103864, + "grad_norm": 3.7198781967163086, + "learning_rate": 7.100258325880796e-06, + "loss": 0.8223, + "step": 7774 + }, + { + "epoch": 1.9696010132995567, + "grad_norm": 3.424121141433716, + "learning_rate": 7.099497949710507e-06, + "loss": 0.7835, + "step": 7775 + }, + { + "epoch": 1.969854338188727, + "grad_norm": 3.452148914337158, + "learning_rate": 7.0987375145872994e-06, + "loss": 0.826, + "step": 7776 + }, + { + "epoch": 1.9701076630778975, + "grad_norm": 3.364057779312134, + "learning_rate": 7.097977020532526e-06, + "loss": 0.6752, + "step": 7777 + }, + { + "epoch": 1.9703609879670676, + "grad_norm": 3.64872145652771, + "learning_rate": 7.097216467567542e-06, + "loss": 0.7157, + "step": 7778 + }, + { + "epoch": 1.9706143128562381, + "grad_norm": 3.607300281524658, + "learning_rate": 7.096455855713702e-06, + "loss": 0.6867, + "step": 7779 + }, + { + "epoch": 1.9708676377454086, + "grad_norm": 3.4279701709747314, + "learning_rate": 7.095695184992365e-06, + "loss": 0.69, + "step": 7780 + }, + { + "epoch": 1.9711209626345787, + "grad_norm": 3.260676383972168, + "learning_rate": 7.094934455424889e-06, + "loss": 0.7615, + "step": 7781 + }, + { + "epoch": 1.9713742875237492, + "grad_norm": 3.7134289741516113, + "learning_rate": 7.0941736670326346e-06, + "loss": 0.7148, + "step": 7782 + }, + { + "epoch": 1.9716276124129197, + "grad_norm": 3.8525795936584473, + "learning_rate": 7.093412819836966e-06, + "loss": 0.8088, + "step": 7783 + }, + { + "epoch": 1.9718809373020898, + "grad_norm": 3.6673407554626465, + "learning_rate": 7.092651913859246e-06, + "loss": 0.7594, + "step": 7784 + }, + { + "epoch": 1.9721342621912603, + "grad_norm": 3.580387592315674, + "learning_rate": 7.091890949120841e-06, + "loss": 0.6788, + "step": 7785 + }, + { + "epoch": 1.9723875870804306, + "grad_norm": 4.053237438201904, + "learning_rate": 7.091129925643119e-06, + "loss": 0.8555, + "step": 7786 + }, + { + "epoch": 1.972640911969601, + "grad_norm": 3.7616658210754395, + "learning_rate": 7.090368843447448e-06, + "loss": 0.7988, + "step": 7787 + }, + { + "epoch": 1.9728942368587714, + "grad_norm": 3.4558634757995605, + "learning_rate": 7.089607702555201e-06, + "loss": 0.7842, + "step": 7788 + }, + { + "epoch": 1.9731475617479417, + "grad_norm": 3.9048306941986084, + "learning_rate": 7.088846502987747e-06, + "loss": 0.7438, + "step": 7789 + }, + { + "epoch": 1.973400886637112, + "grad_norm": 3.3912265300750732, + "learning_rate": 7.088085244766464e-06, + "loss": 0.751, + "step": 7790 + }, + { + "epoch": 1.9736542115262825, + "grad_norm": 3.807003974914551, + "learning_rate": 7.0873239279127246e-06, + "loss": 0.7561, + "step": 7791 + }, + { + "epoch": 1.9739075364154528, + "grad_norm": 3.7926597595214844, + "learning_rate": 7.086562552447909e-06, + "loss": 0.8205, + "step": 7792 + }, + { + "epoch": 1.974160861304623, + "grad_norm": 3.7442386150360107, + "learning_rate": 7.085801118393394e-06, + "loss": 0.8445, + "step": 7793 + }, + { + "epoch": 1.9744141861937936, + "grad_norm": 3.722290515899658, + "learning_rate": 7.085039625770562e-06, + "loss": 0.8169, + "step": 7794 + }, + { + "epoch": 1.974667511082964, + "grad_norm": 3.7071523666381836, + "learning_rate": 7.084278074600794e-06, + "loss": 0.7872, + "step": 7795 + }, + { + "epoch": 1.9749208359721342, + "grad_norm": 3.783977746963501, + "learning_rate": 7.0835164649054744e-06, + "loss": 0.8322, + "step": 7796 + }, + { + "epoch": 1.9751741608613047, + "grad_norm": 3.0392532348632812, + "learning_rate": 7.0827547967059885e-06, + "loss": 0.7639, + "step": 7797 + }, + { + "epoch": 1.975427485750475, + "grad_norm": 3.8044259548187256, + "learning_rate": 7.081993070023725e-06, + "loss": 0.8509, + "step": 7798 + }, + { + "epoch": 1.9756808106396453, + "grad_norm": 3.57563853263855, + "learning_rate": 7.081231284880071e-06, + "loss": 0.7586, + "step": 7799 + }, + { + "epoch": 1.9759341355288158, + "grad_norm": 3.971006393432617, + "learning_rate": 7.080469441296418e-06, + "loss": 0.8335, + "step": 7800 + }, + { + "epoch": 1.976187460417986, + "grad_norm": 3.3764495849609375, + "learning_rate": 7.079707539294158e-06, + "loss": 0.7588, + "step": 7801 + }, + { + "epoch": 1.9764407853071564, + "grad_norm": 3.7590389251708984, + "learning_rate": 7.0789455788946845e-06, + "loss": 0.8711, + "step": 7802 + }, + { + "epoch": 1.976694110196327, + "grad_norm": 4.3643574714660645, + "learning_rate": 7.078183560119395e-06, + "loss": 0.8518, + "step": 7803 + }, + { + "epoch": 1.9769474350854972, + "grad_norm": 3.7080447673797607, + "learning_rate": 7.077421482989686e-06, + "loss": 0.8306, + "step": 7804 + }, + { + "epoch": 1.9772007599746675, + "grad_norm": 3.6435539722442627, + "learning_rate": 7.076659347526954e-06, + "loss": 0.7207, + "step": 7805 + }, + { + "epoch": 1.977454084863838, + "grad_norm": 3.583136796951294, + "learning_rate": 7.0758971537526e-06, + "loss": 0.8254, + "step": 7806 + }, + { + "epoch": 1.977707409753008, + "grad_norm": 4.014275550842285, + "learning_rate": 7.075134901688028e-06, + "loss": 0.9418, + "step": 7807 + }, + { + "epoch": 1.9779607346421786, + "grad_norm": 3.8396565914154053, + "learning_rate": 7.07437259135464e-06, + "loss": 0.7992, + "step": 7808 + }, + { + "epoch": 1.978214059531349, + "grad_norm": 3.843681573867798, + "learning_rate": 7.073610222773844e-06, + "loss": 0.7475, + "step": 7809 + }, + { + "epoch": 1.9784673844205192, + "grad_norm": 3.5276191234588623, + "learning_rate": 7.0728477959670415e-06, + "loss": 0.7265, + "step": 7810 + }, + { + "epoch": 1.9787207093096897, + "grad_norm": 3.8873448371887207, + "learning_rate": 7.072085310955645e-06, + "loss": 0.8069, + "step": 7811 + }, + { + "epoch": 1.9789740341988602, + "grad_norm": 3.556809663772583, + "learning_rate": 7.0713227677610655e-06, + "loss": 0.7934, + "step": 7812 + }, + { + "epoch": 1.9792273590880303, + "grad_norm": 3.4937283992767334, + "learning_rate": 7.070560166404713e-06, + "loss": 0.8782, + "step": 7813 + }, + { + "epoch": 1.9794806839772008, + "grad_norm": 3.6915431022644043, + "learning_rate": 7.069797506908e-06, + "loss": 0.8141, + "step": 7814 + }, + { + "epoch": 1.979734008866371, + "grad_norm": 4.639492511749268, + "learning_rate": 7.069034789292345e-06, + "loss": 0.8645, + "step": 7815 + }, + { + "epoch": 1.9799873337555414, + "grad_norm": 3.667537212371826, + "learning_rate": 7.068272013579163e-06, + "loss": 0.6504, + "step": 7816 + }, + { + "epoch": 1.9802406586447119, + "grad_norm": 4.093923091888428, + "learning_rate": 7.067509179789871e-06, + "loss": 0.8769, + "step": 7817 + }, + { + "epoch": 1.9804939835338822, + "grad_norm": 3.478240489959717, + "learning_rate": 7.06674628794589e-06, + "loss": 0.7769, + "step": 7818 + }, + { + "epoch": 1.9807473084230525, + "grad_norm": 3.2936277389526367, + "learning_rate": 7.065983338068643e-06, + "loss": 0.7441, + "step": 7819 + }, + { + "epoch": 1.981000633312223, + "grad_norm": 3.8519601821899414, + "learning_rate": 7.065220330179552e-06, + "loss": 0.802, + "step": 7820 + }, + { + "epoch": 1.9812539582013933, + "grad_norm": 3.640568494796753, + "learning_rate": 7.064457264300041e-06, + "loss": 0.7295, + "step": 7821 + }, + { + "epoch": 1.9815072830905636, + "grad_norm": 3.5209779739379883, + "learning_rate": 7.063694140451538e-06, + "loss": 0.8936, + "step": 7822 + }, + { + "epoch": 1.981760607979734, + "grad_norm": 3.9504730701446533, + "learning_rate": 7.062930958655472e-06, + "loss": 0.8196, + "step": 7823 + }, + { + "epoch": 1.9820139328689044, + "grad_norm": 3.680300712585449, + "learning_rate": 7.06216771893327e-06, + "loss": 0.7316, + "step": 7824 + }, + { + "epoch": 1.9822672577580747, + "grad_norm": 3.858332872390747, + "learning_rate": 7.061404421306365e-06, + "loss": 0.8491, + "step": 7825 + }, + { + "epoch": 1.9825205826472452, + "grad_norm": 3.9465181827545166, + "learning_rate": 7.06064106579619e-06, + "loss": 0.7303, + "step": 7826 + }, + { + "epoch": 1.9827739075364155, + "grad_norm": 3.9884231090545654, + "learning_rate": 7.059877652424181e-06, + "loss": 0.8044, + "step": 7827 + }, + { + "epoch": 1.9830272324255858, + "grad_norm": 3.5908045768737793, + "learning_rate": 7.059114181211771e-06, + "loss": 0.7816, + "step": 7828 + }, + { + "epoch": 1.9832805573147563, + "grad_norm": 3.938767910003662, + "learning_rate": 7.058350652180401e-06, + "loss": 0.7904, + "step": 7829 + }, + { + "epoch": 1.9835338822039266, + "grad_norm": 3.7275545597076416, + "learning_rate": 7.05758706535151e-06, + "loss": 0.7388, + "step": 7830 + }, + { + "epoch": 1.9837872070930969, + "grad_norm": 3.831284999847412, + "learning_rate": 7.056823420746538e-06, + "loss": 0.7877, + "step": 7831 + }, + { + "epoch": 1.9840405319822674, + "grad_norm": 3.974714994430542, + "learning_rate": 7.056059718386927e-06, + "loss": 0.8221, + "step": 7832 + }, + { + "epoch": 1.9842938568714377, + "grad_norm": 3.6349573135375977, + "learning_rate": 7.055295958294124e-06, + "loss": 0.7492, + "step": 7833 + }, + { + "epoch": 1.984547181760608, + "grad_norm": 3.9935271739959717, + "learning_rate": 7.054532140489575e-06, + "loss": 0.7947, + "step": 7834 + }, + { + "epoch": 1.9848005066497785, + "grad_norm": 3.484941005706787, + "learning_rate": 7.053768264994725e-06, + "loss": 0.6921, + "step": 7835 + }, + { + "epoch": 1.9850538315389485, + "grad_norm": 3.6672487258911133, + "learning_rate": 7.053004331831025e-06, + "loss": 0.7615, + "step": 7836 + }, + { + "epoch": 1.985307156428119, + "grad_norm": 3.7978787422180176, + "learning_rate": 7.0522403410199245e-06, + "loss": 0.762, + "step": 7837 + }, + { + "epoch": 1.9855604813172896, + "grad_norm": 3.566520929336548, + "learning_rate": 7.051476292582878e-06, + "loss": 0.7821, + "step": 7838 + }, + { + "epoch": 1.9858138062064596, + "grad_norm": 3.4677298069000244, + "learning_rate": 7.050712186541339e-06, + "loss": 0.7713, + "step": 7839 + }, + { + "epoch": 1.9860671310956302, + "grad_norm": 3.8835580348968506, + "learning_rate": 7.0499480229167615e-06, + "loss": 0.8336, + "step": 7840 + }, + { + "epoch": 1.9863204559848007, + "grad_norm": 3.6812779903411865, + "learning_rate": 7.049183801730606e-06, + "loss": 0.7183, + "step": 7841 + }, + { + "epoch": 1.9865737808739707, + "grad_norm": 3.9878032207489014, + "learning_rate": 7.048419523004328e-06, + "loss": 0.8592, + "step": 7842 + }, + { + "epoch": 1.9868271057631413, + "grad_norm": 4.390382289886475, + "learning_rate": 7.0476551867593915e-06, + "loss": 0.852, + "step": 7843 + }, + { + "epoch": 1.9870804306523115, + "grad_norm": 3.578312397003174, + "learning_rate": 7.046890793017257e-06, + "loss": 0.8225, + "step": 7844 + }, + { + "epoch": 1.9873337555414818, + "grad_norm": 3.5974812507629395, + "learning_rate": 7.046126341799387e-06, + "loss": 0.8266, + "step": 7845 + }, + { + "epoch": 1.9875870804306524, + "grad_norm": 3.3824779987335205, + "learning_rate": 7.045361833127249e-06, + "loss": 0.6321, + "step": 7846 + }, + { + "epoch": 1.9878404053198226, + "grad_norm": 3.554611921310425, + "learning_rate": 7.04459726702231e-06, + "loss": 0.8004, + "step": 7847 + }, + { + "epoch": 1.988093730208993, + "grad_norm": 3.989422559738159, + "learning_rate": 7.043832643506036e-06, + "loss": 0.8787, + "step": 7848 + }, + { + "epoch": 1.9883470550981635, + "grad_norm": 3.763200283050537, + "learning_rate": 7.0430679625999035e-06, + "loss": 0.8115, + "step": 7849 + }, + { + "epoch": 1.9886003799873337, + "grad_norm": 3.778221607208252, + "learning_rate": 7.042303224325375e-06, + "loss": 0.8372, + "step": 7850 + }, + { + "epoch": 1.988853704876504, + "grad_norm": 3.8948304653167725, + "learning_rate": 7.041538428703931e-06, + "loss": 0.756, + "step": 7851 + }, + { + "epoch": 1.9891070297656746, + "grad_norm": 3.5517871379852295, + "learning_rate": 7.040773575757045e-06, + "loss": 0.7555, + "step": 7852 + }, + { + "epoch": 1.9893603546548448, + "grad_norm": 3.535416841506958, + "learning_rate": 7.040008665506195e-06, + "loss": 0.8107, + "step": 7853 + }, + { + "epoch": 1.9896136795440151, + "grad_norm": 4.027966499328613, + "learning_rate": 7.039243697972856e-06, + "loss": 0.8943, + "step": 7854 + }, + { + "epoch": 1.9898670044331856, + "grad_norm": 3.422999382019043, + "learning_rate": 7.03847867317851e-06, + "loss": 0.7349, + "step": 7855 + }, + { + "epoch": 1.990120329322356, + "grad_norm": 3.5104379653930664, + "learning_rate": 7.03771359114464e-06, + "loss": 0.7075, + "step": 7856 + }, + { + "epoch": 1.9903736542115262, + "grad_norm": 4.1053667068481445, + "learning_rate": 7.0369484518927245e-06, + "loss": 0.8716, + "step": 7857 + }, + { + "epoch": 1.9906269791006967, + "grad_norm": 3.7057645320892334, + "learning_rate": 7.036183255444253e-06, + "loss": 0.7537, + "step": 7858 + }, + { + "epoch": 1.990880303989867, + "grad_norm": 3.6755993366241455, + "learning_rate": 7.03541800182071e-06, + "loss": 0.7499, + "step": 7859 + }, + { + "epoch": 1.9911336288790373, + "grad_norm": 3.893325090408325, + "learning_rate": 7.034652691043582e-06, + "loss": 0.7036, + "step": 7860 + }, + { + "epoch": 1.9913869537682078, + "grad_norm": 3.5237503051757812, + "learning_rate": 7.033887323134361e-06, + "loss": 0.756, + "step": 7861 + }, + { + "epoch": 1.9916402786573781, + "grad_norm": 3.12933349609375, + "learning_rate": 7.033121898114537e-06, + "loss": 0.6639, + "step": 7862 + }, + { + "epoch": 1.9918936035465484, + "grad_norm": 3.5588948726654053, + "learning_rate": 7.032356416005603e-06, + "loss": 0.7761, + "step": 7863 + }, + { + "epoch": 1.992146928435719, + "grad_norm": 4.064525127410889, + "learning_rate": 7.031590876829053e-06, + "loss": 0.8608, + "step": 7864 + }, + { + "epoch": 1.992400253324889, + "grad_norm": 3.3582210540771484, + "learning_rate": 7.030825280606384e-06, + "loss": 0.738, + "step": 7865 + }, + { + "epoch": 1.9926535782140595, + "grad_norm": 3.7255241870880127, + "learning_rate": 7.030059627359093e-06, + "loss": 0.945, + "step": 7866 + }, + { + "epoch": 1.99290690310323, + "grad_norm": 3.6472373008728027, + "learning_rate": 7.029293917108678e-06, + "loss": 0.7403, + "step": 7867 + }, + { + "epoch": 1.9931602279924001, + "grad_norm": 3.845834732055664, + "learning_rate": 7.028528149876644e-06, + "loss": 0.8059, + "step": 7868 + }, + { + "epoch": 1.9934135528815706, + "grad_norm": 3.227482318878174, + "learning_rate": 7.027762325684488e-06, + "loss": 0.7175, + "step": 7869 + }, + { + "epoch": 1.9936668777707411, + "grad_norm": 4.206416130065918, + "learning_rate": 7.026996444553716e-06, + "loss": 0.7829, + "step": 7870 + }, + { + "epoch": 1.9939202026599112, + "grad_norm": 3.581221580505371, + "learning_rate": 7.026230506505834e-06, + "loss": 0.823, + "step": 7871 + }, + { + "epoch": 1.9941735275490817, + "grad_norm": 3.8868722915649414, + "learning_rate": 7.025464511562347e-06, + "loss": 0.7844, + "step": 7872 + }, + { + "epoch": 1.994426852438252, + "grad_norm": 4.018174171447754, + "learning_rate": 7.02469845974477e-06, + "loss": 0.8255, + "step": 7873 + }, + { + "epoch": 1.9946801773274223, + "grad_norm": 4.231254577636719, + "learning_rate": 7.0239323510746074e-06, + "loss": 0.8246, + "step": 7874 + }, + { + "epoch": 1.9949335022165928, + "grad_norm": 4.085860252380371, + "learning_rate": 7.023166185573371e-06, + "loss": 0.7985, + "step": 7875 + }, + { + "epoch": 1.9951868271057631, + "grad_norm": 3.792830228805542, + "learning_rate": 7.022399963262578e-06, + "loss": 0.8093, + "step": 7876 + }, + { + "epoch": 1.9954401519949334, + "grad_norm": 3.3098559379577637, + "learning_rate": 7.021633684163742e-06, + "loss": 0.7373, + "step": 7877 + }, + { + "epoch": 1.995693476884104, + "grad_norm": 3.6087119579315186, + "learning_rate": 7.020867348298381e-06, + "loss": 0.8426, + "step": 7878 + }, + { + "epoch": 1.9959468017732742, + "grad_norm": 3.725323438644409, + "learning_rate": 7.020100955688009e-06, + "loss": 0.811, + "step": 7879 + }, + { + "epoch": 1.9962001266624445, + "grad_norm": 3.5992720127105713, + "learning_rate": 7.019334506354151e-06, + "loss": 0.7686, + "step": 7880 + }, + { + "epoch": 1.996453451551615, + "grad_norm": 4.135205268859863, + "learning_rate": 7.018568000318327e-06, + "loss": 0.7655, + "step": 7881 + }, + { + "epoch": 1.9967067764407853, + "grad_norm": 3.6513140201568604, + "learning_rate": 7.0178014376020575e-06, + "loss": 0.7065, + "step": 7882 + }, + { + "epoch": 1.9969601013299556, + "grad_norm": 3.5085322856903076, + "learning_rate": 7.017034818226871e-06, + "loss": 0.7248, + "step": 7883 + }, + { + "epoch": 1.9972134262191261, + "grad_norm": 3.4335103034973145, + "learning_rate": 7.016268142214291e-06, + "loss": 0.7372, + "step": 7884 + }, + { + "epoch": 1.9974667511082964, + "grad_norm": 3.483394145965576, + "learning_rate": 7.015501409585847e-06, + "loss": 0.6897, + "step": 7885 + }, + { + "epoch": 1.9977200759974667, + "grad_norm": 3.5033979415893555, + "learning_rate": 7.0147346203630686e-06, + "loss": 0.7084, + "step": 7886 + }, + { + "epoch": 1.9979734008866372, + "grad_norm": 4.055886268615723, + "learning_rate": 7.013967774567485e-06, + "loss": 0.8701, + "step": 7887 + }, + { + "epoch": 1.9982267257758075, + "grad_norm": 3.7454984188079834, + "learning_rate": 7.0132008722206316e-06, + "loss": 0.715, + "step": 7888 + }, + { + "epoch": 1.9984800506649778, + "grad_norm": 3.3161065578460693, + "learning_rate": 7.01243391334404e-06, + "loss": 0.7263, + "step": 7889 + }, + { + "epoch": 1.9987333755541483, + "grad_norm": 3.382779598236084, + "learning_rate": 7.0116668979592485e-06, + "loss": 0.7841, + "step": 7890 + }, + { + "epoch": 1.9989867004433186, + "grad_norm": 3.61897349357605, + "learning_rate": 7.0108998260877925e-06, + "loss": 0.8071, + "step": 7891 + }, + { + "epoch": 1.999240025332489, + "grad_norm": 3.5087099075317383, + "learning_rate": 7.010132697751212e-06, + "loss": 0.6358, + "step": 7892 + }, + { + "epoch": 1.9994933502216594, + "grad_norm": 3.284588575363159, + "learning_rate": 7.009365512971048e-06, + "loss": 0.7479, + "step": 7893 + }, + { + "epoch": 1.9997466751108295, + "grad_norm": 3.6976845264434814, + "learning_rate": 7.008598271768842e-06, + "loss": 0.7084, + "step": 7894 + }, + { + "epoch": 2.0, + "grad_norm": 3.472818613052368, + "learning_rate": 7.007830974166138e-06, + "loss": 0.7147, + "step": 7895 + } + ], + "logging_steps": 1.0, + "max_steps": 19735, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 500.0, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.462352584434778e+18, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}