{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9998000399920016, "eval_steps": 500, "global_step": 1250, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007998400319936012, "grad_norm": 4.033497738180381, "learning_rate": 0.0, "loss": -0.0349, "step": 1 }, { "epoch": 0.0015996800639872025, "grad_norm": 3.51489589336218, "learning_rate": 3.8110282485354675e-07, "loss": -0.0542, "step": 2 }, { "epoch": 0.0023995200959808036, "grad_norm": 3.185205419075282, "learning_rate": 6.040336863117743e-07, "loss": 0.0277, "step": 3 }, { "epoch": 0.003199360127974405, "grad_norm": 2.6795442503962392, "learning_rate": 7.622056497070935e-07, "loss": 0.0346, "step": 4 }, { "epoch": 0.003999200159968006, "grad_norm": 4.306548503833335, "learning_rate": 8.84893356068388e-07, "loss": -0.0123, "step": 5 }, { "epoch": 0.004799040191961607, "grad_norm": 3.232420399829724, "learning_rate": 9.85136511165321e-07, "loss": -0.0277, "step": 6 }, { "epoch": 0.005598880223955209, "grad_norm": 4.556647322380419, "learning_rate": 1.0698908911626617e-06, "loss": -0.0824, "step": 7 }, { "epoch": 0.00639872025594881, "grad_norm": 4.75835426891596, "learning_rate": 1.1433084745606403e-06, "loss": -0.0487, "step": 8 }, { "epoch": 0.007198560287942412, "grad_norm": 3.671579551985109, "learning_rate": 1.2080673726235485e-06, "loss": -0.101, "step": 9 }, { "epoch": 0.007998400319936013, "grad_norm": 5.478829821928343, "learning_rate": 1.2659961809219347e-06, "loss": 0.0194, "step": 10 }, { "epoch": 0.008798240351929614, "grad_norm": 4.697076805586441, "learning_rate": 1.318399162250352e-06, "loss": -0.0713, "step": 11 }, { "epoch": 0.009598080383923215, "grad_norm": 4.668341585855885, "learning_rate": 1.366239336018868e-06, "loss": -0.0365, "step": 12 }, { "epoch": 0.010397920415916816, "grad_norm": 4.0841275175924725, "learning_rate": 1.4102480297838326e-06, "loss": -0.0814, "step": 13 }, { "epoch": 0.011197760447910418, "grad_norm": 3.5676724545793244, "learning_rate": 1.4509937160162082e-06, "loss": -0.0815, "step": 14 }, { "epoch": 0.01199760047990402, "grad_norm": 3.467762437060136, "learning_rate": 1.4889270423801623e-06, "loss": -0.0654, "step": 15 }, { "epoch": 0.01279744051189762, "grad_norm": 2.8225474286212457, "learning_rate": 1.524411299414187e-06, "loss": -0.062, "step": 16 }, { "epoch": 0.013597280543891222, "grad_norm": 5.079798214426631, "learning_rate": 1.5577436352844088e-06, "loss": -0.1394, "step": 17 }, { "epoch": 0.014397120575884824, "grad_norm": 4.339673002027499, "learning_rate": 1.5891701974770953e-06, "loss": -0.042, "step": 18 }, { "epoch": 0.015196960607878424, "grad_norm": 4.245593535465187, "learning_rate": 1.6188971751464532e-06, "loss": 0.0082, "step": 19 }, { "epoch": 0.015996800639872025, "grad_norm": 2.975134158916166, "learning_rate": 1.6470990057754815e-06, "loss": -0.0294, "step": 20 }, { "epoch": 0.016796640671865627, "grad_norm": 3.331212004539707, "learning_rate": 1.673924577474436e-06, "loss": -0.0544, "step": 21 }, { "epoch": 0.01759648070385923, "grad_norm": 4.545480625771349, "learning_rate": 1.6995019871038986e-06, "loss": -0.0731, "step": 22 }, { "epoch": 0.01839632073585283, "grad_norm": 3.7461986348064595, "learning_rate": 1.7239422398533632e-06, "loss": -0.1278, "step": 23 }, { "epoch": 0.01919616076784643, "grad_norm": 2.8335266795291543, "learning_rate": 1.7473421608724147e-06, "loss": -0.083, "step": 24 }, { "epoch": 0.01999600079984003, "grad_norm": 3.68991966387542, "learning_rate": 1.769786712136776e-06, "loss": -0.0695, "step": 25 }, { "epoch": 0.020795840831833633, "grad_norm": 4.133751803923456, "learning_rate": 1.7913508546373795e-06, "loss": -0.0445, "step": 26 }, { "epoch": 0.021595680863827234, "grad_norm": 5.000412923928387, "learning_rate": 1.812101058935323e-06, "loss": 0.1318, "step": 27 }, { "epoch": 0.022395520895820836, "grad_norm": 3.5992183048553557, "learning_rate": 1.832096540869755e-06, "loss": -0.1249, "step": 28 }, { "epoch": 0.023195360927814438, "grad_norm": 3.427019545745099, "learning_rate": 1.8513902803279621e-06, "loss": -0.1743, "step": 29 }, { "epoch": 0.02399520095980804, "grad_norm": 4.725372315931298, "learning_rate": 1.8700298672337092e-06, "loss": 0.0091, "step": 30 }, { "epoch": 0.024795040991801638, "grad_norm": 5.254587258373332, "learning_rate": 1.888058208767457e-06, "loss": -0.0653, "step": 31 }, { "epoch": 0.02559488102379524, "grad_norm": 8.849971672055872, "learning_rate": 1.905514124267734e-06, "loss": -0.0049, "step": 32 }, { "epoch": 0.026394721055788842, "grad_norm": 3.4635929572339874, "learning_rate": 1.922432848562126e-06, "loss": -0.089, "step": 33 }, { "epoch": 0.027194561087782444, "grad_norm": 5.9169701870000715, "learning_rate": 1.9388464601379558e-06, "loss": -0.1119, "step": 34 }, { "epoch": 0.027994401119776045, "grad_norm": 5.188979707658736, "learning_rate": 1.9547842472310495e-06, "loss": -0.1121, "step": 35 }, { "epoch": 0.028794241151769647, "grad_norm": 4.896152090964042, "learning_rate": 1.970273022330642e-06, "loss": -0.0525, "step": 36 }, { "epoch": 0.02959408118376325, "grad_norm": 5.994962166737604, "learning_rate": 1.9853373935840096e-06, "loss": -0.1089, "step": 37 }, { "epoch": 0.030393921215756847, "grad_norm": 5.576677749054259, "learning_rate": 2e-06, "loss": 0.0304, "step": 38 }, { "epoch": 0.03119376124775045, "grad_norm": 4.313822225686198, "learning_rate": 2e-06, "loss": -0.1328, "step": 39 }, { "epoch": 0.03199360127974405, "grad_norm": 3.81381102986674, "learning_rate": 1.998349834983498e-06, "loss": -0.0792, "step": 40 }, { "epoch": 0.03279344131173765, "grad_norm": 4.839236808694631, "learning_rate": 1.996699669966997e-06, "loss": -0.056, "step": 41 }, { "epoch": 0.033593281343731254, "grad_norm": 3.77305458921913, "learning_rate": 1.995049504950495e-06, "loss": -0.0339, "step": 42 }, { "epoch": 0.03439312137572485, "grad_norm": 5.867440087459917, "learning_rate": 1.9933993399339932e-06, "loss": -0.0176, "step": 43 }, { "epoch": 0.03519296140771846, "grad_norm": 6.4685114572374465, "learning_rate": 1.991749174917492e-06, "loss": 0.0402, "step": 44 }, { "epoch": 0.035992801439712056, "grad_norm": 11.48483869174446, "learning_rate": 1.99009900990099e-06, "loss": -0.0702, "step": 45 }, { "epoch": 0.03679264147170566, "grad_norm": 4.2414264593452735, "learning_rate": 1.9884488448844884e-06, "loss": -0.1241, "step": 46 }, { "epoch": 0.03759248150369926, "grad_norm": 4.949735715342123, "learning_rate": 1.9867986798679866e-06, "loss": -0.0996, "step": 47 }, { "epoch": 0.03839232153569286, "grad_norm": 5.06186964090094, "learning_rate": 1.9851485148514852e-06, "loss": -0.1133, "step": 48 }, { "epoch": 0.039192161567686463, "grad_norm": 3.9056723205659183, "learning_rate": 1.9834983498349835e-06, "loss": -0.1631, "step": 49 }, { "epoch": 0.03999200159968006, "grad_norm": 5.51223763254555, "learning_rate": 1.9818481848184817e-06, "loss": 0.0191, "step": 50 }, { "epoch": 0.04079184163167367, "grad_norm": 3.7107878002289, "learning_rate": 1.98019801980198e-06, "loss": -0.0271, "step": 51 }, { "epoch": 0.041591681663667265, "grad_norm": 5.618046340756691, "learning_rate": 1.9785478547854786e-06, "loss": -0.0227, "step": 52 }, { "epoch": 0.04239152169566087, "grad_norm": 3.7602961019841468, "learning_rate": 1.976897689768977e-06, "loss": -0.126, "step": 53 }, { "epoch": 0.04319136172765447, "grad_norm": 4.322826902384424, "learning_rate": 1.975247524752475e-06, "loss": -0.1196, "step": 54 }, { "epoch": 0.04399120175964807, "grad_norm": 3.6276654934086565, "learning_rate": 1.9735973597359733e-06, "loss": -0.1098, "step": 55 }, { "epoch": 0.04479104179164167, "grad_norm": 3.729759012982189, "learning_rate": 1.971947194719472e-06, "loss": -0.2248, "step": 56 }, { "epoch": 0.04559088182363527, "grad_norm": 5.552584092947439, "learning_rate": 1.97029702970297e-06, "loss": -0.0821, "step": 57 }, { "epoch": 0.046390721855628876, "grad_norm": 6.016002296406734, "learning_rate": 1.9686468646864684e-06, "loss": -0.1302, "step": 58 }, { "epoch": 0.047190561887622474, "grad_norm": 6.7453871612622995, "learning_rate": 1.966996699669967e-06, "loss": -0.0652, "step": 59 }, { "epoch": 0.04799040191961608, "grad_norm": 4.874246979289447, "learning_rate": 1.9653465346534653e-06, "loss": -0.0409, "step": 60 }, { "epoch": 0.04879024195160968, "grad_norm": 3.894046979082966, "learning_rate": 1.9636963696369635e-06, "loss": 0.021, "step": 61 }, { "epoch": 0.049590081983603276, "grad_norm": 3.829546481539617, "learning_rate": 1.962046204620462e-06, "loss": -0.246, "step": 62 }, { "epoch": 0.05038992201559688, "grad_norm": 5.021080021581999, "learning_rate": 1.9603960396039604e-06, "loss": -0.0029, "step": 63 }, { "epoch": 0.05118976204759048, "grad_norm": 4.084832649304883, "learning_rate": 1.9587458745874586e-06, "loss": -0.1911, "step": 64 }, { "epoch": 0.051989602079584085, "grad_norm": 3.4567077830219595, "learning_rate": 1.9570957095709572e-06, "loss": 0.0388, "step": 65 }, { "epoch": 0.052789442111577684, "grad_norm": 4.523151395245523, "learning_rate": 1.9554455445544555e-06, "loss": -0.0422, "step": 66 }, { "epoch": 0.05358928214357129, "grad_norm": 4.574942149645985, "learning_rate": 1.9537953795379537e-06, "loss": 0.0052, "step": 67 }, { "epoch": 0.05438912217556489, "grad_norm": 5.884212332415378, "learning_rate": 1.952145214521452e-06, "loss": -0.0622, "step": 68 }, { "epoch": 0.055188962207558485, "grad_norm": 3.172106804857128, "learning_rate": 1.95049504950495e-06, "loss": -0.0936, "step": 69 }, { "epoch": 0.05598880223955209, "grad_norm": 4.882587885458746, "learning_rate": 1.948844884488449e-06, "loss": -0.1034, "step": 70 }, { "epoch": 0.05678864227154569, "grad_norm": 3.290096020906111, "learning_rate": 1.947194719471947e-06, "loss": -0.126, "step": 71 }, { "epoch": 0.057588482303539294, "grad_norm": 4.175847937084437, "learning_rate": 1.9455445544554453e-06, "loss": -0.0295, "step": 72 }, { "epoch": 0.05838832233553289, "grad_norm": 4.774862772782205, "learning_rate": 1.943894389438944e-06, "loss": -0.0737, "step": 73 }, { "epoch": 0.0591881623675265, "grad_norm": 4.866413673374395, "learning_rate": 1.942244224422442e-06, "loss": 0.0154, "step": 74 }, { "epoch": 0.059988002399520096, "grad_norm": 3.244110640100742, "learning_rate": 1.9405940594059404e-06, "loss": -0.0876, "step": 75 }, { "epoch": 0.060787842431513694, "grad_norm": 4.94642971249312, "learning_rate": 1.938943894389439e-06, "loss": -0.0634, "step": 76 }, { "epoch": 0.0615876824635073, "grad_norm": 3.1477348357592705, "learning_rate": 1.9372937293729373e-06, "loss": -0.0934, "step": 77 }, { "epoch": 0.0623875224955009, "grad_norm": 4.180278871715678, "learning_rate": 1.9356435643564355e-06, "loss": -0.0158, "step": 78 }, { "epoch": 0.0631873625274945, "grad_norm": 4.73751736841566, "learning_rate": 1.933993399339934e-06, "loss": 0.0419, "step": 79 }, { "epoch": 0.0639872025594881, "grad_norm": 3.7318625198178577, "learning_rate": 1.9323432343234324e-06, "loss": -0.0981, "step": 80 }, { "epoch": 0.0647870425914817, "grad_norm": 3.5344903982736016, "learning_rate": 1.9306930693069306e-06, "loss": 0.028, "step": 81 }, { "epoch": 0.0655868826234753, "grad_norm": 3.6655427915390653, "learning_rate": 1.9290429042904292e-06, "loss": -0.0343, "step": 82 }, { "epoch": 0.06638672265546891, "grad_norm": 3.8402537750787817, "learning_rate": 1.9273927392739275e-06, "loss": -0.0479, "step": 83 }, { "epoch": 0.06718656268746251, "grad_norm": 6.194776167870759, "learning_rate": 1.9257425742574257e-06, "loss": 0.0046, "step": 84 }, { "epoch": 0.06798640271945611, "grad_norm": 5.366879383554931, "learning_rate": 1.924092409240924e-06, "loss": -0.1113, "step": 85 }, { "epoch": 0.0687862427514497, "grad_norm": 3.9997732575047547, "learning_rate": 1.922442244224422e-06, "loss": 0.0446, "step": 86 }, { "epoch": 0.06958608278344332, "grad_norm": 9.73377697425672, "learning_rate": 1.920792079207921e-06, "loss": -0.0569, "step": 87 }, { "epoch": 0.07038592281543692, "grad_norm": 5.0689420802437875, "learning_rate": 1.919141914191419e-06, "loss": -0.0352, "step": 88 }, { "epoch": 0.07118576284743051, "grad_norm": 8.98640262446026, "learning_rate": 1.9174917491749173e-06, "loss": 0.041, "step": 89 }, { "epoch": 0.07198560287942411, "grad_norm": 5.63457538673664, "learning_rate": 1.9158415841584155e-06, "loss": -0.0193, "step": 90 }, { "epoch": 0.07278544291141771, "grad_norm": 4.290130537843607, "learning_rate": 1.914191419141914e-06, "loss": -0.0742, "step": 91 }, { "epoch": 0.07358528294341132, "grad_norm": 4.0945792486692465, "learning_rate": 1.9125412541254124e-06, "loss": 0.029, "step": 92 }, { "epoch": 0.07438512297540492, "grad_norm": 4.96670528541929, "learning_rate": 1.9108910891089106e-06, "loss": -0.1134, "step": 93 }, { "epoch": 0.07518496300739852, "grad_norm": 5.027466862141088, "learning_rate": 1.9092409240924093e-06, "loss": -0.0508, "step": 94 }, { "epoch": 0.07598480303939212, "grad_norm": 9.573722686429775, "learning_rate": 1.9075907590759075e-06, "loss": -0.1592, "step": 95 }, { "epoch": 0.07678464307138572, "grad_norm": 5.764961349212166, "learning_rate": 1.9059405940594057e-06, "loss": -0.0122, "step": 96 }, { "epoch": 0.07758448310337933, "grad_norm": 3.3502664260820247, "learning_rate": 1.9042904290429044e-06, "loss": -0.2179, "step": 97 }, { "epoch": 0.07838432313537293, "grad_norm": 3.967491851586746, "learning_rate": 1.9026402640264026e-06, "loss": -0.0234, "step": 98 }, { "epoch": 0.07918416316736653, "grad_norm": 4.552540817957586, "learning_rate": 1.9009900990099008e-06, "loss": -0.1591, "step": 99 }, { "epoch": 0.07998400319936012, "grad_norm": 10.823710544953496, "learning_rate": 1.8993399339933993e-06, "loss": 0.0374, "step": 100 }, { "epoch": 0.08078384323135374, "grad_norm": 4.806468007236691, "learning_rate": 1.8976897689768975e-06, "loss": -0.0565, "step": 101 }, { "epoch": 0.08158368326334733, "grad_norm": 3.634545747480329, "learning_rate": 1.896039603960396e-06, "loss": -0.0762, "step": 102 }, { "epoch": 0.08238352329534093, "grad_norm": 3.2837047295849597, "learning_rate": 1.8943894389438944e-06, "loss": -0.0491, "step": 103 }, { "epoch": 0.08318336332733453, "grad_norm": 5.176653817957751, "learning_rate": 1.8927392739273926e-06, "loss": -0.1934, "step": 104 }, { "epoch": 0.08398320335932813, "grad_norm": 6.107024303996945, "learning_rate": 1.8910891089108908e-06, "loss": -0.1129, "step": 105 }, { "epoch": 0.08478304339132174, "grad_norm": 4.489176343037952, "learning_rate": 1.8894389438943895e-06, "loss": 0.0883, "step": 106 }, { "epoch": 0.08558288342331534, "grad_norm": 4.318302618280909, "learning_rate": 1.8877887788778877e-06, "loss": -0.1609, "step": 107 }, { "epoch": 0.08638272345530894, "grad_norm": 4.634209202008312, "learning_rate": 1.886138613861386e-06, "loss": -0.0457, "step": 108 }, { "epoch": 0.08718256348730254, "grad_norm": 3.630881832190838, "learning_rate": 1.8844884488448844e-06, "loss": -0.1382, "step": 109 }, { "epoch": 0.08798240351929613, "grad_norm": 3.886065281514502, "learning_rate": 1.8828382838283828e-06, "loss": -0.0535, "step": 110 }, { "epoch": 0.08878224355128975, "grad_norm": 3.647392695144741, "learning_rate": 1.881188118811881e-06, "loss": -0.0809, "step": 111 }, { "epoch": 0.08958208358328335, "grad_norm": 3.9753438884802463, "learning_rate": 1.8795379537953795e-06, "loss": -0.0791, "step": 112 }, { "epoch": 0.09038192361527694, "grad_norm": 4.473252382488765, "learning_rate": 1.8778877887788777e-06, "loss": -0.0723, "step": 113 }, { "epoch": 0.09118176364727054, "grad_norm": 4.928253206993449, "learning_rate": 1.876237623762376e-06, "loss": 0.0125, "step": 114 }, { "epoch": 0.09198160367926415, "grad_norm": 4.557945800338486, "learning_rate": 1.8745874587458746e-06, "loss": -0.0889, "step": 115 }, { "epoch": 0.09278144371125775, "grad_norm": 5.830924417742841, "learning_rate": 1.8729372937293728e-06, "loss": 0.0504, "step": 116 }, { "epoch": 0.09358128374325135, "grad_norm": 6.4722171650631655, "learning_rate": 1.8712871287128713e-06, "loss": -0.008, "step": 117 }, { "epoch": 0.09438112377524495, "grad_norm": 3.1676413558574428, "learning_rate": 1.8696369636963695e-06, "loss": -0.0483, "step": 118 }, { "epoch": 0.09518096380723855, "grad_norm": 5.1310816710504845, "learning_rate": 1.867986798679868e-06, "loss": -0.0513, "step": 119 }, { "epoch": 0.09598080383923216, "grad_norm": 3.620114816562482, "learning_rate": 1.8663366336633664e-06, "loss": -0.1453, "step": 120 }, { "epoch": 0.09678064387122576, "grad_norm": 6.0676794834569865, "learning_rate": 1.8646864686468646e-06, "loss": -0.0194, "step": 121 }, { "epoch": 0.09758048390321936, "grad_norm": 6.414733331241253, "learning_rate": 1.8630363036303628e-06, "loss": -0.0488, "step": 122 }, { "epoch": 0.09838032393521295, "grad_norm": 4.6846628376767905, "learning_rate": 1.8613861386138615e-06, "loss": -0.0195, "step": 123 }, { "epoch": 0.09918016396720655, "grad_norm": 3.235246476419315, "learning_rate": 1.8597359735973597e-06, "loss": -0.0942, "step": 124 }, { "epoch": 0.09998000399920016, "grad_norm": 5.3470459527801495, "learning_rate": 1.858085808580858e-06, "loss": 0.0276, "step": 125 }, { "epoch": 0.10077984403119376, "grad_norm": 3.9287996597379995, "learning_rate": 1.8564356435643564e-06, "loss": 0.0306, "step": 126 }, { "epoch": 0.10157968406318736, "grad_norm": 4.995425229535215, "learning_rate": 1.8547854785478546e-06, "loss": 0.0087, "step": 127 }, { "epoch": 0.10237952409518096, "grad_norm": 4.573732944820577, "learning_rate": 1.853135313531353e-06, "loss": -0.1424, "step": 128 }, { "epoch": 0.10317936412717456, "grad_norm": 4.55020470630308, "learning_rate": 1.8514851485148515e-06, "loss": -0.0559, "step": 129 }, { "epoch": 0.10397920415916817, "grad_norm": 4.41698840906731, "learning_rate": 1.8498349834983497e-06, "loss": -0.0188, "step": 130 }, { "epoch": 0.10477904419116177, "grad_norm": 5.223224115420677, "learning_rate": 1.848184818481848e-06, "loss": -0.1098, "step": 131 }, { "epoch": 0.10557888422315537, "grad_norm": 3.8011698979898005, "learning_rate": 1.8465346534653466e-06, "loss": -0.0328, "step": 132 }, { "epoch": 0.10637872425514897, "grad_norm": 3.1746565925932835, "learning_rate": 1.8448844884488448e-06, "loss": -0.0525, "step": 133 }, { "epoch": 0.10717856428714258, "grad_norm": 3.9995360105342903, "learning_rate": 1.843234323432343e-06, "loss": -0.0665, "step": 134 }, { "epoch": 0.10797840431913618, "grad_norm": 4.722040708955319, "learning_rate": 1.8415841584158415e-06, "loss": -0.1398, "step": 135 }, { "epoch": 0.10877824435112977, "grad_norm": 4.002530013096379, "learning_rate": 1.83993399339934e-06, "loss": -0.0609, "step": 136 }, { "epoch": 0.10957808438312337, "grad_norm": 4.23131853552439, "learning_rate": 1.8382838283828382e-06, "loss": -0.0109, "step": 137 }, { "epoch": 0.11037792441511697, "grad_norm": 5.324154803758963, "learning_rate": 1.8366336633663366e-06, "loss": -0.0251, "step": 138 }, { "epoch": 0.11117776444711058, "grad_norm": 4.297402311394241, "learning_rate": 1.8349834983498348e-06, "loss": -0.0589, "step": 139 }, { "epoch": 0.11197760447910418, "grad_norm": 4.454960816079389, "learning_rate": 1.833333333333333e-06, "loss": -0.1049, "step": 140 }, { "epoch": 0.11277744451109778, "grad_norm": 4.709849875744532, "learning_rate": 1.8316831683168317e-06, "loss": -0.0279, "step": 141 }, { "epoch": 0.11357728454309138, "grad_norm": 3.9184959414442724, "learning_rate": 1.83003300330033e-06, "loss": -0.1333, "step": 142 }, { "epoch": 0.11437712457508498, "grad_norm": 4.362164005140024, "learning_rate": 1.8283828382838282e-06, "loss": -0.0821, "step": 143 }, { "epoch": 0.11517696460707859, "grad_norm": 3.814336740776002, "learning_rate": 1.8267326732673266e-06, "loss": -0.0764, "step": 144 }, { "epoch": 0.11597680463907219, "grad_norm": 4.1087265373281925, "learning_rate": 1.825082508250825e-06, "loss": 0.0595, "step": 145 }, { "epoch": 0.11677664467106579, "grad_norm": 5.05463448309474, "learning_rate": 1.8234323432343233e-06, "loss": -0.0749, "step": 146 }, { "epoch": 0.11757648470305938, "grad_norm": 7.009438010420224, "learning_rate": 1.8217821782178217e-06, "loss": -0.1623, "step": 147 }, { "epoch": 0.118376324735053, "grad_norm": 5.86862518535322, "learning_rate": 1.82013201320132e-06, "loss": -0.1914, "step": 148 }, { "epoch": 0.1191761647670466, "grad_norm": 8.568812361586986, "learning_rate": 1.8184818481848184e-06, "loss": -0.0496, "step": 149 }, { "epoch": 0.11997600479904019, "grad_norm": 9.02774053582229, "learning_rate": 1.8168316831683168e-06, "loss": -0.022, "step": 150 }, { "epoch": 0.12077584483103379, "grad_norm": 5.51491933312306, "learning_rate": 1.815181518151815e-06, "loss": -0.1322, "step": 151 }, { "epoch": 0.12157568486302739, "grad_norm": 5.304215018308479, "learning_rate": 1.8135313531353133e-06, "loss": -0.0676, "step": 152 }, { "epoch": 0.122375524895021, "grad_norm": 3.9922542678415565, "learning_rate": 1.811881188118812e-06, "loss": 0.0184, "step": 153 }, { "epoch": 0.1231753649270146, "grad_norm": 4.724197779779715, "learning_rate": 1.8102310231023102e-06, "loss": -0.1204, "step": 154 }, { "epoch": 0.1239752049590082, "grad_norm": 5.76455405608935, "learning_rate": 1.8085808580858084e-06, "loss": -0.1421, "step": 155 }, { "epoch": 0.1247750449910018, "grad_norm": 10.161753692062435, "learning_rate": 1.8069306930693068e-06, "loss": 0.0592, "step": 156 }, { "epoch": 0.1255748850229954, "grad_norm": 3.667923249601308, "learning_rate": 1.805280528052805e-06, "loss": -0.0988, "step": 157 }, { "epoch": 0.126374725054989, "grad_norm": 4.515737987543515, "learning_rate": 1.8036303630363035e-06, "loss": 0.0522, "step": 158 }, { "epoch": 0.1271745650869826, "grad_norm": 3.336996513422035, "learning_rate": 1.801980198019802e-06, "loss": -0.09, "step": 159 }, { "epoch": 0.1279744051189762, "grad_norm": 3.969953099317271, "learning_rate": 1.8003300330033002e-06, "loss": -0.0435, "step": 160 }, { "epoch": 0.1287742451509698, "grad_norm": 4.549949209747214, "learning_rate": 1.7986798679867984e-06, "loss": -0.0613, "step": 161 }, { "epoch": 0.1295740851829634, "grad_norm": 3.759639050223288, "learning_rate": 1.797029702970297e-06, "loss": -0.0784, "step": 162 }, { "epoch": 0.130373925214957, "grad_norm": 4.619365249559499, "learning_rate": 1.7953795379537953e-06, "loss": -0.0111, "step": 163 }, { "epoch": 0.1311737652469506, "grad_norm": 4.114791027895229, "learning_rate": 1.7937293729372935e-06, "loss": -0.0327, "step": 164 }, { "epoch": 0.13197360527894422, "grad_norm": 3.8956026767168836, "learning_rate": 1.792079207920792e-06, "loss": -0.1106, "step": 165 }, { "epoch": 0.13277344531093782, "grad_norm": 4.818435179721396, "learning_rate": 1.7904290429042904e-06, "loss": -0.0034, "step": 166 }, { "epoch": 0.13357328534293142, "grad_norm": 6.763152130893218, "learning_rate": 1.7887788778877888e-06, "loss": 0.0651, "step": 167 }, { "epoch": 0.13437312537492502, "grad_norm": 4.097132792098502, "learning_rate": 1.787128712871287e-06, "loss": -0.0269, "step": 168 }, { "epoch": 0.13517296540691862, "grad_norm": 4.706830462846675, "learning_rate": 1.7854785478547853e-06, "loss": 0.0558, "step": 169 }, { "epoch": 0.13597280543891221, "grad_norm": 4.254134691338051, "learning_rate": 1.783828382838284e-06, "loss": -0.0046, "step": 170 }, { "epoch": 0.1367726454709058, "grad_norm": 5.457939580250951, "learning_rate": 1.7821782178217822e-06, "loss": -0.0379, "step": 171 }, { "epoch": 0.1375724855028994, "grad_norm": 3.2577166280201544, "learning_rate": 1.7805280528052804e-06, "loss": -0.0993, "step": 172 }, { "epoch": 0.138372325534893, "grad_norm": 5.551040160162887, "learning_rate": 1.7788778877887789e-06, "loss": -0.0543, "step": 173 }, { "epoch": 0.13917216556688664, "grad_norm": 3.69149537962834, "learning_rate": 1.777227722772277e-06, "loss": -0.0443, "step": 174 }, { "epoch": 0.13997200559888023, "grad_norm": 4.4643620642536455, "learning_rate": 1.7755775577557755e-06, "loss": -0.0449, "step": 175 }, { "epoch": 0.14077184563087383, "grad_norm": 3.5240643064279977, "learning_rate": 1.773927392739274e-06, "loss": -0.0928, "step": 176 }, { "epoch": 0.14157168566286743, "grad_norm": 5.981016645991625, "learning_rate": 1.7722772277227722e-06, "loss": 0.0686, "step": 177 }, { "epoch": 0.14237152569486103, "grad_norm": 4.336791468199441, "learning_rate": 1.7706270627062704e-06, "loss": -0.0617, "step": 178 }, { "epoch": 0.14317136572685463, "grad_norm": 3.678032699373225, "learning_rate": 1.768976897689769e-06, "loss": -0.1058, "step": 179 }, { "epoch": 0.14397120575884823, "grad_norm": 8.431078918847803, "learning_rate": 1.7673267326732673e-06, "loss": -0.034, "step": 180 }, { "epoch": 0.14477104579084182, "grad_norm": 4.90238148952107, "learning_rate": 1.7656765676567655e-06, "loss": 0.0263, "step": 181 }, { "epoch": 0.14557088582283542, "grad_norm": 4.1587161441545115, "learning_rate": 1.764026402640264e-06, "loss": -0.0128, "step": 182 }, { "epoch": 0.14637072585482905, "grad_norm": 4.255313468888732, "learning_rate": 1.7623762376237624e-06, "loss": -0.0138, "step": 183 }, { "epoch": 0.14717056588682265, "grad_norm": 6.24454443290786, "learning_rate": 1.7607260726072606e-06, "loss": -0.0941, "step": 184 }, { "epoch": 0.14797040591881624, "grad_norm": 4.293655354485335, "learning_rate": 1.759075907590759e-06, "loss": -0.0621, "step": 185 }, { "epoch": 0.14877024595080984, "grad_norm": 4.224321769134034, "learning_rate": 1.7574257425742573e-06, "loss": -0.0214, "step": 186 }, { "epoch": 0.14957008598280344, "grad_norm": 3.7629471117165827, "learning_rate": 1.7557755775577555e-06, "loss": -0.0735, "step": 187 }, { "epoch": 0.15036992601479704, "grad_norm": 4.511985288731285, "learning_rate": 1.7541254125412542e-06, "loss": -0.1376, "step": 188 }, { "epoch": 0.15116976604679064, "grad_norm": 4.701449783409153, "learning_rate": 1.7524752475247524e-06, "loss": -0.1208, "step": 189 }, { "epoch": 0.15196960607878424, "grad_norm": 7.169693891516351, "learning_rate": 1.7508250825082506e-06, "loss": 0.0103, "step": 190 }, { "epoch": 0.15276944611077783, "grad_norm": 3.6302391864591126, "learning_rate": 1.749174917491749e-06, "loss": -0.036, "step": 191 }, { "epoch": 0.15356928614277143, "grad_norm": 8.15707311459662, "learning_rate": 1.7475247524752475e-06, "loss": -0.0226, "step": 192 }, { "epoch": 0.15436912617476506, "grad_norm": 4.001526302961896, "learning_rate": 1.7458745874587458e-06, "loss": -0.0587, "step": 193 }, { "epoch": 0.15516896620675866, "grad_norm": 4.468601251007179, "learning_rate": 1.7442244224422442e-06, "loss": -0.1388, "step": 194 }, { "epoch": 0.15596880623875226, "grad_norm": 4.107118632559092, "learning_rate": 1.7425742574257424e-06, "loss": -0.0961, "step": 195 }, { "epoch": 0.15676864627074585, "grad_norm": 3.4961373949789665, "learning_rate": 1.7409240924092409e-06, "loss": -0.0567, "step": 196 }, { "epoch": 0.15756848630273945, "grad_norm": 4.144654814148264, "learning_rate": 1.7392739273927393e-06, "loss": -0.1309, "step": 197 }, { "epoch": 0.15836832633473305, "grad_norm": 3.6625054473315664, "learning_rate": 1.7376237623762375e-06, "loss": -0.0208, "step": 198 }, { "epoch": 0.15916816636672665, "grad_norm": 4.664494531197071, "learning_rate": 1.7359735973597358e-06, "loss": 0.0178, "step": 199 }, { "epoch": 0.15996800639872025, "grad_norm": 6.383022272218445, "learning_rate": 1.7343234323432342e-06, "loss": -0.0616, "step": 200 }, { "epoch": 0.16076784643071385, "grad_norm": 5.505206158317875, "learning_rate": 1.7326732673267326e-06, "loss": -0.0452, "step": 201 }, { "epoch": 0.16156768646270747, "grad_norm": 3.5601606217056765, "learning_rate": 1.7310231023102309e-06, "loss": 0.0225, "step": 202 }, { "epoch": 0.16236752649470107, "grad_norm": 4.408138222273653, "learning_rate": 1.7293729372937293e-06, "loss": -0.1139, "step": 203 }, { "epoch": 0.16316736652669467, "grad_norm": 3.2562884601218087, "learning_rate": 1.7277227722772275e-06, "loss": 0.0087, "step": 204 }, { "epoch": 0.16396720655868827, "grad_norm": 4.350781355214131, "learning_rate": 1.726072607260726e-06, "loss": -0.0885, "step": 205 }, { "epoch": 0.16476704659068186, "grad_norm": 3.3568949216522475, "learning_rate": 1.7244224422442244e-06, "loss": -0.0134, "step": 206 }, { "epoch": 0.16556688662267546, "grad_norm": 6.798474914966856, "learning_rate": 1.7227722772277227e-06, "loss": -0.0945, "step": 207 }, { "epoch": 0.16636672665466906, "grad_norm": 4.577665859282248, "learning_rate": 1.7211221122112209e-06, "loss": -0.1607, "step": 208 }, { "epoch": 0.16716656668666266, "grad_norm": 6.460632243204499, "learning_rate": 1.7194719471947195e-06, "loss": -0.0235, "step": 209 }, { "epoch": 0.16796640671865626, "grad_norm": 4.306267256349224, "learning_rate": 1.7178217821782178e-06, "loss": -0.0059, "step": 210 }, { "epoch": 0.16876624675064986, "grad_norm": 3.0483507543879105, "learning_rate": 1.716171617161716e-06, "loss": -0.111, "step": 211 }, { "epoch": 0.16956608678264348, "grad_norm": 5.737336519193611, "learning_rate": 1.7145214521452144e-06, "loss": 0.018, "step": 212 }, { "epoch": 0.17036592681463708, "grad_norm": 3.7845990191052734, "learning_rate": 1.7128712871287127e-06, "loss": -0.0926, "step": 213 }, { "epoch": 0.17116576684663068, "grad_norm": 3.669531800966666, "learning_rate": 1.711221122112211e-06, "loss": -0.0776, "step": 214 }, { "epoch": 0.17196560687862428, "grad_norm": 4.005323920134325, "learning_rate": 1.7095709570957095e-06, "loss": -0.0399, "step": 215 }, { "epoch": 0.17276544691061788, "grad_norm": 2.8598435648570186, "learning_rate": 1.7079207920792078e-06, "loss": -0.0548, "step": 216 }, { "epoch": 0.17356528694261147, "grad_norm": 4.139220262158334, "learning_rate": 1.7062706270627062e-06, "loss": 0.0686, "step": 217 }, { "epoch": 0.17436512697460507, "grad_norm": 4.988425208682803, "learning_rate": 1.7046204620462046e-06, "loss": -0.0028, "step": 218 }, { "epoch": 0.17516496700659867, "grad_norm": 3.4806124639328164, "learning_rate": 1.7029702970297029e-06, "loss": -0.0914, "step": 219 }, { "epoch": 0.17596480703859227, "grad_norm": 6.013581164060899, "learning_rate": 1.7013201320132013e-06, "loss": -0.0207, "step": 220 }, { "epoch": 0.1767646470705859, "grad_norm": 6.048232130793178, "learning_rate": 1.6996699669966995e-06, "loss": 0.0301, "step": 221 }, { "epoch": 0.1775644871025795, "grad_norm": 4.206288334982141, "learning_rate": 1.698019801980198e-06, "loss": -0.0503, "step": 222 }, { "epoch": 0.1783643271345731, "grad_norm": 4.383148234898824, "learning_rate": 1.6963696369636964e-06, "loss": 0.0425, "step": 223 }, { "epoch": 0.1791641671665667, "grad_norm": 4.013900208301416, "learning_rate": 1.6947194719471947e-06, "loss": -0.0873, "step": 224 }, { "epoch": 0.1799640071985603, "grad_norm": 3.729807083009099, "learning_rate": 1.6930693069306929e-06, "loss": -0.0124, "step": 225 }, { "epoch": 0.1807638472305539, "grad_norm": 4.739805223350201, "learning_rate": 1.6914191419141915e-06, "loss": -0.0965, "step": 226 }, { "epoch": 0.18156368726254749, "grad_norm": 3.684225018193131, "learning_rate": 1.6897689768976898e-06, "loss": -0.0899, "step": 227 }, { "epoch": 0.18236352729454108, "grad_norm": 4.647773349022286, "learning_rate": 1.688118811881188e-06, "loss": -0.1433, "step": 228 }, { "epoch": 0.18316336732653468, "grad_norm": 4.314549940205055, "learning_rate": 1.6864686468646864e-06, "loss": -0.0987, "step": 229 }, { "epoch": 0.1839632073585283, "grad_norm": 6.602144366923463, "learning_rate": 1.6848184818481847e-06, "loss": -0.0855, "step": 230 }, { "epoch": 0.1847630473905219, "grad_norm": 4.611073533381248, "learning_rate": 1.683168316831683e-06, "loss": -0.1262, "step": 231 }, { "epoch": 0.1855628874225155, "grad_norm": 4.9020247032635655, "learning_rate": 1.6815181518151815e-06, "loss": -0.1706, "step": 232 }, { "epoch": 0.1863627274545091, "grad_norm": 4.16092482080365, "learning_rate": 1.6798679867986798e-06, "loss": -0.009, "step": 233 }, { "epoch": 0.1871625674865027, "grad_norm": 3.5906088992190277, "learning_rate": 1.678217821782178e-06, "loss": 0.0999, "step": 234 }, { "epoch": 0.1879624075184963, "grad_norm": 4.005270108795308, "learning_rate": 1.6765676567656767e-06, "loss": -0.0993, "step": 235 }, { "epoch": 0.1887622475504899, "grad_norm": 6.563769408476828, "learning_rate": 1.6749174917491749e-06, "loss": -0.0193, "step": 236 }, { "epoch": 0.1895620875824835, "grad_norm": 3.380070162840573, "learning_rate": 1.6732673267326731e-06, "loss": -0.0809, "step": 237 }, { "epoch": 0.1903619276144771, "grad_norm": 4.931354996369631, "learning_rate": 1.6716171617161716e-06, "loss": -0.0658, "step": 238 }, { "epoch": 0.1911617676464707, "grad_norm": 4.710207450817461, "learning_rate": 1.66996699669967e-06, "loss": -0.1167, "step": 239 }, { "epoch": 0.19196160767846432, "grad_norm": 3.361685025176525, "learning_rate": 1.6683168316831682e-06, "loss": -0.1245, "step": 240 }, { "epoch": 0.19276144771045792, "grad_norm": 3.767676589968502, "learning_rate": 1.6666666666666667e-06, "loss": -0.1638, "step": 241 }, { "epoch": 0.19356128774245152, "grad_norm": 3.7460434704410575, "learning_rate": 1.6650165016501649e-06, "loss": -0.1207, "step": 242 }, { "epoch": 0.1943611277744451, "grad_norm": 3.7655100191535413, "learning_rate": 1.6633663366336631e-06, "loss": 0.0038, "step": 243 }, { "epoch": 0.1951609678064387, "grad_norm": 4.3387270640143685, "learning_rate": 1.6617161716171618e-06, "loss": -0.0234, "step": 244 }, { "epoch": 0.1959608078384323, "grad_norm": 4.729420704117281, "learning_rate": 1.66006600660066e-06, "loss": -0.1446, "step": 245 }, { "epoch": 0.1967606478704259, "grad_norm": 11.46352939122658, "learning_rate": 1.6584158415841582e-06, "loss": -0.0447, "step": 246 }, { "epoch": 0.1975604879024195, "grad_norm": 4.6392172787916355, "learning_rate": 1.6567656765676567e-06, "loss": -0.1279, "step": 247 }, { "epoch": 0.1983603279344131, "grad_norm": 7.81945174107532, "learning_rate": 1.6551155115511551e-06, "loss": -0.1788, "step": 248 }, { "epoch": 0.19916016796640673, "grad_norm": 4.257894476705108, "learning_rate": 1.6534653465346533e-06, "loss": -0.0386, "step": 249 }, { "epoch": 0.19996000799840033, "grad_norm": 3.9255930993081094, "learning_rate": 1.6518151815181518e-06, "loss": -0.0204, "step": 250 }, { "epoch": 0.20075984803039393, "grad_norm": 8.61324493331346, "learning_rate": 1.65016501650165e-06, "loss": 0.0872, "step": 251 }, { "epoch": 0.20155968806238753, "grad_norm": 3.7965562474708525, "learning_rate": 1.6485148514851484e-06, "loss": -0.0834, "step": 252 }, { "epoch": 0.20235952809438112, "grad_norm": 4.327305685228189, "learning_rate": 1.6468646864686469e-06, "loss": -0.0639, "step": 253 }, { "epoch": 0.20315936812637472, "grad_norm": 3.461407011747761, "learning_rate": 1.6452145214521451e-06, "loss": -0.1243, "step": 254 }, { "epoch": 0.20395920815836832, "grad_norm": 5.164636623307167, "learning_rate": 1.6435643564356433e-06, "loss": -0.0318, "step": 255 }, { "epoch": 0.20475904819036192, "grad_norm": 4.411537190722961, "learning_rate": 1.641914191419142e-06, "loss": -0.1533, "step": 256 }, { "epoch": 0.20555888822235552, "grad_norm": 4.832045065537041, "learning_rate": 1.6402640264026402e-06, "loss": -0.0931, "step": 257 }, { "epoch": 0.20635872825434912, "grad_norm": 4.133203614158014, "learning_rate": 1.6386138613861385e-06, "loss": -0.0951, "step": 258 }, { "epoch": 0.20715856828634274, "grad_norm": 4.649558155027992, "learning_rate": 1.636963696369637e-06, "loss": -0.0459, "step": 259 }, { "epoch": 0.20795840831833634, "grad_norm": 3.7050574045200126, "learning_rate": 1.6353135313531351e-06, "loss": -0.1324, "step": 260 }, { "epoch": 0.20875824835032994, "grad_norm": 4.406446520163225, "learning_rate": 1.6336633663366336e-06, "loss": -0.0903, "step": 261 }, { "epoch": 0.20955808838232354, "grad_norm": 4.150658998676116, "learning_rate": 1.632013201320132e-06, "loss": 0.052, "step": 262 }, { "epoch": 0.21035792841431714, "grad_norm": 4.637643800546993, "learning_rate": 1.6303630363036302e-06, "loss": -0.1008, "step": 263 }, { "epoch": 0.21115776844631073, "grad_norm": 4.356392007316505, "learning_rate": 1.6287128712871285e-06, "loss": -0.0666, "step": 264 }, { "epoch": 0.21195760847830433, "grad_norm": 4.2232050225914675, "learning_rate": 1.6270627062706271e-06, "loss": -0.034, "step": 265 }, { "epoch": 0.21275744851029793, "grad_norm": 4.621467065766651, "learning_rate": 1.6254125412541253e-06, "loss": -0.1166, "step": 266 }, { "epoch": 0.21355728854229153, "grad_norm": 3.6996328893459385, "learning_rate": 1.6237623762376238e-06, "loss": -0.0651, "step": 267 }, { "epoch": 0.21435712857428516, "grad_norm": 6.476232651431598, "learning_rate": 1.622112211221122e-06, "loss": -0.087, "step": 268 }, { "epoch": 0.21515696860627875, "grad_norm": 3.3774511125642115, "learning_rate": 1.6204620462046205e-06, "loss": -0.0541, "step": 269 }, { "epoch": 0.21595680863827235, "grad_norm": 8.039893341875281, "learning_rate": 1.6188118811881189e-06, "loss": 0.0426, "step": 270 }, { "epoch": 0.21675664867026595, "grad_norm": 3.45288250792369, "learning_rate": 1.6171617161716171e-06, "loss": -0.071, "step": 271 }, { "epoch": 0.21755648870225955, "grad_norm": 5.813145099240533, "learning_rate": 1.6155115511551154e-06, "loss": 0.0774, "step": 272 }, { "epoch": 0.21835632873425315, "grad_norm": 3.4988010260216202, "learning_rate": 1.6138613861386138e-06, "loss": -0.0378, "step": 273 }, { "epoch": 0.21915616876624675, "grad_norm": 4.136529473287242, "learning_rate": 1.6122112211221122e-06, "loss": 0.0215, "step": 274 }, { "epoch": 0.21995600879824034, "grad_norm": 3.9538185204867884, "learning_rate": 1.6105610561056105e-06, "loss": -0.1014, "step": 275 }, { "epoch": 0.22075584883023394, "grad_norm": 4.987429074808495, "learning_rate": 1.608910891089109e-06, "loss": -0.0326, "step": 276 }, { "epoch": 0.22155568886222757, "grad_norm": 3.1011116987800595, "learning_rate": 1.6072607260726071e-06, "loss": -0.0742, "step": 277 }, { "epoch": 0.22235552889422117, "grad_norm": 4.87646247250274, "learning_rate": 1.6056105610561056e-06, "loss": -0.0488, "step": 278 }, { "epoch": 0.22315536892621476, "grad_norm": 5.406703510997709, "learning_rate": 1.603960396039604e-06, "loss": -0.0998, "step": 279 }, { "epoch": 0.22395520895820836, "grad_norm": 4.199602090060885, "learning_rate": 1.6023102310231022e-06, "loss": 0.0787, "step": 280 }, { "epoch": 0.22475504899020196, "grad_norm": 6.262166502823287, "learning_rate": 1.6006600660066005e-06, "loss": 0.0123, "step": 281 }, { "epoch": 0.22555488902219556, "grad_norm": 5.4085402736640225, "learning_rate": 1.5990099009900991e-06, "loss": -0.1219, "step": 282 }, { "epoch": 0.22635472905418916, "grad_norm": 14.35407252989058, "learning_rate": 1.5973597359735973e-06, "loss": -0.0427, "step": 283 }, { "epoch": 0.22715456908618276, "grad_norm": 5.560237467243524, "learning_rate": 1.5957095709570956e-06, "loss": -0.0363, "step": 284 }, { "epoch": 0.22795440911817635, "grad_norm": 5.376214533362693, "learning_rate": 1.594059405940594e-06, "loss": -0.1198, "step": 285 }, { "epoch": 0.22875424915016995, "grad_norm": 7.872347430401011, "learning_rate": 1.5924092409240922e-06, "loss": 0.02, "step": 286 }, { "epoch": 0.22955408918216358, "grad_norm": 4.079731942515135, "learning_rate": 1.5907590759075907e-06, "loss": -0.1465, "step": 287 }, { "epoch": 0.23035392921415718, "grad_norm": 4.054081807256331, "learning_rate": 1.5891089108910891e-06, "loss": -0.097, "step": 288 }, { "epoch": 0.23115376924615078, "grad_norm": 5.668828140611865, "learning_rate": 1.5874587458745874e-06, "loss": -0.0113, "step": 289 }, { "epoch": 0.23195360927814437, "grad_norm": 4.222209049226612, "learning_rate": 1.5858085808580856e-06, "loss": -0.0565, "step": 290 }, { "epoch": 0.23275344931013797, "grad_norm": 3.7308714963795735, "learning_rate": 1.5841584158415842e-06, "loss": -0.0311, "step": 291 }, { "epoch": 0.23355328934213157, "grad_norm": 3.9384379405107914, "learning_rate": 1.5825082508250825e-06, "loss": -0.0921, "step": 292 }, { "epoch": 0.23435312937412517, "grad_norm": 4.208635426370359, "learning_rate": 1.5808580858085807e-06, "loss": -0.1037, "step": 293 }, { "epoch": 0.23515296940611877, "grad_norm": 4.471661666164002, "learning_rate": 1.5792079207920791e-06, "loss": -0.048, "step": 294 }, { "epoch": 0.23595280943811237, "grad_norm": 5.922123322526879, "learning_rate": 1.5775577557755776e-06, "loss": -0.0822, "step": 295 }, { "epoch": 0.236752649470106, "grad_norm": 3.9336004171911596, "learning_rate": 1.5759075907590758e-06, "loss": -0.0751, "step": 296 }, { "epoch": 0.2375524895020996, "grad_norm": 2.9881202405051086, "learning_rate": 1.5742574257425742e-06, "loss": -0.0694, "step": 297 }, { "epoch": 0.2383523295340932, "grad_norm": 7.04293625200489, "learning_rate": 1.5726072607260725e-06, "loss": -0.1209, "step": 298 }, { "epoch": 0.2391521695660868, "grad_norm": 4.3791375350104165, "learning_rate": 1.5709570957095707e-06, "loss": -0.0704, "step": 299 }, { "epoch": 0.23995200959808038, "grad_norm": 4.4299869604327835, "learning_rate": 1.5693069306930694e-06, "loss": 0.0578, "step": 300 }, { "epoch": 0.24075184963007398, "grad_norm": 5.721847612449816, "learning_rate": 1.5676567656765676e-06, "loss": 0.0124, "step": 301 }, { "epoch": 0.24155168966206758, "grad_norm": 4.346519849517093, "learning_rate": 1.5660066006600658e-06, "loss": -0.0676, "step": 302 }, { "epoch": 0.24235152969406118, "grad_norm": 4.166900068739509, "learning_rate": 1.5643564356435643e-06, "loss": -0.0379, "step": 303 }, { "epoch": 0.24315136972605478, "grad_norm": 4.171740126126224, "learning_rate": 1.5627062706270627e-06, "loss": -0.0369, "step": 304 }, { "epoch": 0.2439512097580484, "grad_norm": 4.571373866809776, "learning_rate": 1.561056105610561e-06, "loss": -0.0423, "step": 305 }, { "epoch": 0.244751049790042, "grad_norm": 4.687528076087793, "learning_rate": 1.5594059405940594e-06, "loss": -0.0427, "step": 306 }, { "epoch": 0.2455508898220356, "grad_norm": 4.099266935733802, "learning_rate": 1.5577557755775576e-06, "loss": -0.1168, "step": 307 }, { "epoch": 0.2463507298540292, "grad_norm": 4.76705500925925, "learning_rate": 1.556105610561056e-06, "loss": -0.0726, "step": 308 }, { "epoch": 0.2471505698860228, "grad_norm": 7.726027050692815, "learning_rate": 1.5544554455445545e-06, "loss": -0.0858, "step": 309 }, { "epoch": 0.2479504099180164, "grad_norm": 4.588817621333979, "learning_rate": 1.5528052805280527e-06, "loss": 0.0889, "step": 310 }, { "epoch": 0.24875024995001, "grad_norm": 5.351566242300243, "learning_rate": 1.551155115511551e-06, "loss": -0.143, "step": 311 }, { "epoch": 0.2495500899820036, "grad_norm": 6.279051438632601, "learning_rate": 1.5495049504950496e-06, "loss": 0.0312, "step": 312 }, { "epoch": 0.2503499300139972, "grad_norm": 4.251123392069971, "learning_rate": 1.5478547854785478e-06, "loss": -0.0477, "step": 313 }, { "epoch": 0.2511497700459908, "grad_norm": 4.255617580398947, "learning_rate": 1.546204620462046e-06, "loss": -0.1445, "step": 314 }, { "epoch": 0.2519496100779844, "grad_norm": 3.979778076387235, "learning_rate": 1.5445544554455445e-06, "loss": -0.0937, "step": 315 }, { "epoch": 0.252749450109978, "grad_norm": 5.547095237980292, "learning_rate": 1.5429042904290427e-06, "loss": -0.0091, "step": 316 }, { "epoch": 0.2535492901419716, "grad_norm": 5.863554498962612, "learning_rate": 1.5412541254125414e-06, "loss": -0.0883, "step": 317 }, { "epoch": 0.2543491301739652, "grad_norm": 3.9832799266815533, "learning_rate": 1.5396039603960396e-06, "loss": -0.1173, "step": 318 }, { "epoch": 0.2551489702059588, "grad_norm": 4.961222194402448, "learning_rate": 1.5379537953795378e-06, "loss": -0.0647, "step": 319 }, { "epoch": 0.2559488102379524, "grad_norm": 2.7901444246654945, "learning_rate": 1.5363036303630363e-06, "loss": -0.0873, "step": 320 }, { "epoch": 0.25674865026994603, "grad_norm": 4.6616454131308265, "learning_rate": 1.5346534653465347e-06, "loss": -0.1283, "step": 321 }, { "epoch": 0.2575484903019396, "grad_norm": 4.32603696177896, "learning_rate": 1.533003300330033e-06, "loss": -0.0748, "step": 322 }, { "epoch": 0.25834833033393323, "grad_norm": 4.653928241866685, "learning_rate": 1.5313531353135314e-06, "loss": -0.1215, "step": 323 }, { "epoch": 0.2591481703659268, "grad_norm": 4.476046494175142, "learning_rate": 1.5297029702970296e-06, "loss": -0.0247, "step": 324 }, { "epoch": 0.2599480103979204, "grad_norm": 5.41605277862076, "learning_rate": 1.528052805280528e-06, "loss": 0.0055, "step": 325 }, { "epoch": 0.260747850429914, "grad_norm": 7.359939974664472, "learning_rate": 1.5264026402640265e-06, "loss": -0.0994, "step": 326 }, { "epoch": 0.2615476904619076, "grad_norm": 3.7953460503418794, "learning_rate": 1.5247524752475247e-06, "loss": 0.0777, "step": 327 }, { "epoch": 0.2623475304939012, "grad_norm": 4.375620334787856, "learning_rate": 1.523102310231023e-06, "loss": -0.1532, "step": 328 }, { "epoch": 0.2631473705258948, "grad_norm": 5.690054518936246, "learning_rate": 1.5214521452145214e-06, "loss": -0.0744, "step": 329 }, { "epoch": 0.26394721055788845, "grad_norm": 4.095859129867475, "learning_rate": 1.5198019801980198e-06, "loss": -0.1342, "step": 330 }, { "epoch": 0.264747050589882, "grad_norm": 5.261928086906211, "learning_rate": 1.518151815181518e-06, "loss": -0.0327, "step": 331 }, { "epoch": 0.26554689062187564, "grad_norm": 3.723958703243353, "learning_rate": 1.5165016501650165e-06, "loss": -0.0919, "step": 332 }, { "epoch": 0.2663467306538692, "grad_norm": 7.064342427249925, "learning_rate": 1.5148514851485147e-06, "loss": -0.3055, "step": 333 }, { "epoch": 0.26714657068586284, "grad_norm": 5.094162064249706, "learning_rate": 1.5132013201320131e-06, "loss": -0.1551, "step": 334 }, { "epoch": 0.2679464107178564, "grad_norm": 5.182464177568643, "learning_rate": 1.5115511551155116e-06, "loss": -0.1656, "step": 335 }, { "epoch": 0.26874625074985004, "grad_norm": 4.205631232130195, "learning_rate": 1.5099009900990098e-06, "loss": -0.1339, "step": 336 }, { "epoch": 0.2695460907818436, "grad_norm": 3.4595951551287234, "learning_rate": 1.508250825082508e-06, "loss": -0.0259, "step": 337 }, { "epoch": 0.27034593081383723, "grad_norm": 4.040524953973991, "learning_rate": 1.5066006600660067e-06, "loss": 0.0424, "step": 338 }, { "epoch": 0.27114577084583086, "grad_norm": 3.3792775209230044, "learning_rate": 1.504950495049505e-06, "loss": -0.0252, "step": 339 }, { "epoch": 0.27194561087782443, "grad_norm": 5.329937489556339, "learning_rate": 1.5033003300330032e-06, "loss": -0.1064, "step": 340 }, { "epoch": 0.27274545090981805, "grad_norm": 3.8366500907383, "learning_rate": 1.5016501650165016e-06, "loss": -0.1327, "step": 341 }, { "epoch": 0.2735452909418116, "grad_norm": 3.4211397121327334, "learning_rate": 1.5e-06, "loss": -0.0469, "step": 342 }, { "epoch": 0.27434513097380525, "grad_norm": 7.190396728605877, "learning_rate": 1.4983498349834983e-06, "loss": -0.0455, "step": 343 }, { "epoch": 0.2751449710057988, "grad_norm": 5.208941899667468, "learning_rate": 1.4966996699669967e-06, "loss": -0.0118, "step": 344 }, { "epoch": 0.27594481103779245, "grad_norm": 5.0666099160345635, "learning_rate": 1.495049504950495e-06, "loss": -0.08, "step": 345 }, { "epoch": 0.276744651069786, "grad_norm": 3.445240945570377, "learning_rate": 1.4933993399339932e-06, "loss": 0.0043, "step": 346 }, { "epoch": 0.27754449110177964, "grad_norm": 6.719396089938185, "learning_rate": 1.4917491749174918e-06, "loss": -0.0528, "step": 347 }, { "epoch": 0.27834433113377327, "grad_norm": 4.948551220275233, "learning_rate": 1.49009900990099e-06, "loss": -0.0351, "step": 348 }, { "epoch": 0.27914417116576684, "grad_norm": 4.198757242081244, "learning_rate": 1.4884488448844883e-06, "loss": -0.1767, "step": 349 }, { "epoch": 0.27994401119776047, "grad_norm": 4.020517893591624, "learning_rate": 1.4867986798679867e-06, "loss": -0.0777, "step": 350 }, { "epoch": 0.28074385122975404, "grad_norm": 7.665385125345826, "learning_rate": 1.4851485148514852e-06, "loss": -0.0675, "step": 351 }, { "epoch": 0.28154369126174766, "grad_norm": 4.359035902610134, "learning_rate": 1.4834983498349834e-06, "loss": 0.1022, "step": 352 }, { "epoch": 0.28234353129374123, "grad_norm": 4.515833866344382, "learning_rate": 1.4818481848184818e-06, "loss": -0.1318, "step": 353 }, { "epoch": 0.28314337132573486, "grad_norm": 10.48643307447715, "learning_rate": 1.48019801980198e-06, "loss": -0.0624, "step": 354 }, { "epoch": 0.28394321135772843, "grad_norm": 3.9055137245563167, "learning_rate": 1.4785478547854785e-06, "loss": 0.0131, "step": 355 }, { "epoch": 0.28474305138972206, "grad_norm": 5.064555563223541, "learning_rate": 1.476897689768977e-06, "loss": -0.0848, "step": 356 }, { "epoch": 0.2855428914217157, "grad_norm": 6.403904331900866, "learning_rate": 1.4752475247524752e-06, "loss": -0.1231, "step": 357 }, { "epoch": 0.28634273145370925, "grad_norm": 4.4680198659839405, "learning_rate": 1.4735973597359734e-06, "loss": -0.0751, "step": 358 }, { "epoch": 0.2871425714857029, "grad_norm": 7.88048544071049, "learning_rate": 1.4719471947194718e-06, "loss": -0.0111, "step": 359 }, { "epoch": 0.28794241151769645, "grad_norm": 4.041245481168213, "learning_rate": 1.4702970297029703e-06, "loss": -0.0219, "step": 360 }, { "epoch": 0.2887422515496901, "grad_norm": 3.2378522821181748, "learning_rate": 1.4686468646864685e-06, "loss": 0.0154, "step": 361 }, { "epoch": 0.28954209158168365, "grad_norm": 5.187324980575399, "learning_rate": 1.466996699669967e-06, "loss": 0.0768, "step": 362 }, { "epoch": 0.2903419316136773, "grad_norm": 3.892629574264858, "learning_rate": 1.4653465346534652e-06, "loss": -0.1757, "step": 363 }, { "epoch": 0.29114177164567084, "grad_norm": 4.66291997005039, "learning_rate": 1.4636963696369636e-06, "loss": -0.0897, "step": 364 }, { "epoch": 0.29194161167766447, "grad_norm": 4.490266870328807, "learning_rate": 1.462046204620462e-06, "loss": -0.1638, "step": 365 }, { "epoch": 0.2927414517096581, "grad_norm": 7.248644471878, "learning_rate": 1.4603960396039603e-06, "loss": -0.1413, "step": 366 }, { "epoch": 0.29354129174165167, "grad_norm": 13.65496051906939, "learning_rate": 1.4587458745874585e-06, "loss": -0.0144, "step": 367 }, { "epoch": 0.2943411317736453, "grad_norm": 3.0336626027850593, "learning_rate": 1.4570957095709572e-06, "loss": -0.008, "step": 368 }, { "epoch": 0.29514097180563886, "grad_norm": 7.873854054225054, "learning_rate": 1.4554455445544554e-06, "loss": 0.1034, "step": 369 }, { "epoch": 0.2959408118376325, "grad_norm": 3.727594324731175, "learning_rate": 1.4537953795379538e-06, "loss": -0.1401, "step": 370 }, { "epoch": 0.29674065186962606, "grad_norm": 5.229701446706082, "learning_rate": 1.452145214521452e-06, "loss": -0.1203, "step": 371 }, { "epoch": 0.2975404919016197, "grad_norm": 4.492128268970922, "learning_rate": 1.4504950495049503e-06, "loss": 0.0004, "step": 372 }, { "epoch": 0.29834033193361326, "grad_norm": 5.077090301738471, "learning_rate": 1.448844884488449e-06, "loss": 0.0219, "step": 373 }, { "epoch": 0.2991401719656069, "grad_norm": 4.796744776644939, "learning_rate": 1.4471947194719472e-06, "loss": -0.0064, "step": 374 }, { "epoch": 0.29994001199760045, "grad_norm": 7.42447528462134, "learning_rate": 1.4455445544554454e-06, "loss": 0.07, "step": 375 }, { "epoch": 0.3007398520295941, "grad_norm": 3.848638759590051, "learning_rate": 1.4438943894389438e-06, "loss": -0.0777, "step": 376 }, { "epoch": 0.3015396920615877, "grad_norm": 4.256980996790008, "learning_rate": 1.4422442244224423e-06, "loss": -0.1766, "step": 377 }, { "epoch": 0.3023395320935813, "grad_norm": 3.961327287203466, "learning_rate": 1.4405940594059405e-06, "loss": -0.0571, "step": 378 }, { "epoch": 0.3031393721255749, "grad_norm": 5.478690567895318, "learning_rate": 1.438943894389439e-06, "loss": 0.0013, "step": 379 }, { "epoch": 0.3039392121575685, "grad_norm": 3.8685538296119106, "learning_rate": 1.4372937293729372e-06, "loss": -0.0135, "step": 380 }, { "epoch": 0.3047390521895621, "grad_norm": 3.712350805091167, "learning_rate": 1.4356435643564356e-06, "loss": -0.0965, "step": 381 }, { "epoch": 0.30553889222155567, "grad_norm": 4.12545866294737, "learning_rate": 1.433993399339934e-06, "loss": 0.0192, "step": 382 }, { "epoch": 0.3063387322535493, "grad_norm": 3.9826126090375085, "learning_rate": 1.4323432343234323e-06, "loss": 0.0096, "step": 383 }, { "epoch": 0.30713857228554287, "grad_norm": 5.253969236088526, "learning_rate": 1.4306930693069305e-06, "loss": 0.0596, "step": 384 }, { "epoch": 0.3079384123175365, "grad_norm": 4.369221167744991, "learning_rate": 1.4290429042904292e-06, "loss": -0.0586, "step": 385 }, { "epoch": 0.3087382523495301, "grad_norm": 3.386456014084215, "learning_rate": 1.4273927392739274e-06, "loss": -0.1952, "step": 386 }, { "epoch": 0.3095380923815237, "grad_norm": 4.175162288229841, "learning_rate": 1.4257425742574256e-06, "loss": -0.1559, "step": 387 }, { "epoch": 0.3103379324135173, "grad_norm": 4.07269720996871, "learning_rate": 1.424092409240924e-06, "loss": -0.0591, "step": 388 }, { "epoch": 0.3111377724455109, "grad_norm": 3.873233515579836, "learning_rate": 1.4224422442244223e-06, "loss": -0.0649, "step": 389 }, { "epoch": 0.3119376124775045, "grad_norm": 5.33165026969968, "learning_rate": 1.4207920792079207e-06, "loss": -0.0568, "step": 390 }, { "epoch": 0.3127374525094981, "grad_norm": 5.644618937197355, "learning_rate": 1.4191419141914192e-06, "loss": -0.0425, "step": 391 }, { "epoch": 0.3135372925414917, "grad_norm": 4.609038777130941, "learning_rate": 1.4174917491749174e-06, "loss": -0.0991, "step": 392 }, { "epoch": 0.3143371325734853, "grad_norm": 5.362814464107483, "learning_rate": 1.4158415841584156e-06, "loss": -0.0377, "step": 393 }, { "epoch": 0.3151369726054789, "grad_norm": 4.1100020129716315, "learning_rate": 1.4141914191419143e-06, "loss": -0.0176, "step": 394 }, { "epoch": 0.31593681263747253, "grad_norm": 3.6462471572713198, "learning_rate": 1.4125412541254125e-06, "loss": 0.0183, "step": 395 }, { "epoch": 0.3167366526694661, "grad_norm": 3.425535847868438, "learning_rate": 1.4108910891089107e-06, "loss": -0.1166, "step": 396 }, { "epoch": 0.3175364927014597, "grad_norm": 4.023065583159361, "learning_rate": 1.4092409240924092e-06, "loss": -0.069, "step": 397 }, { "epoch": 0.3183363327334533, "grad_norm": 4.435192529053884, "learning_rate": 1.4075907590759076e-06, "loss": -0.1024, "step": 398 }, { "epoch": 0.3191361727654469, "grad_norm": 4.351874787170239, "learning_rate": 1.4059405940594058e-06, "loss": -0.1381, "step": 399 }, { "epoch": 0.3199360127974405, "grad_norm": 5.114118048590294, "learning_rate": 1.4042904290429043e-06, "loss": -0.027, "step": 400 }, { "epoch": 0.3207358528294341, "grad_norm": 6.62264310550409, "learning_rate": 1.4026402640264025e-06, "loss": -0.1372, "step": 401 }, { "epoch": 0.3215356928614277, "grad_norm": 5.515472496453124, "learning_rate": 1.4009900990099007e-06, "loss": 0.0028, "step": 402 }, { "epoch": 0.3223355328934213, "grad_norm": 5.43524070368167, "learning_rate": 1.3993399339933994e-06, "loss": -0.1192, "step": 403 }, { "epoch": 0.32313537292541494, "grad_norm": 4.309916510249054, "learning_rate": 1.3976897689768976e-06, "loss": -0.049, "step": 404 }, { "epoch": 0.3239352129574085, "grad_norm": 4.392826058059571, "learning_rate": 1.3960396039603959e-06, "loss": -0.1248, "step": 405 }, { "epoch": 0.32473505298940214, "grad_norm": 5.384606404349416, "learning_rate": 1.3943894389438943e-06, "loss": -0.0248, "step": 406 }, { "epoch": 0.3255348930213957, "grad_norm": 5.369884451931867, "learning_rate": 1.3927392739273927e-06, "loss": 0.0453, "step": 407 }, { "epoch": 0.32633473305338934, "grad_norm": 3.799887635426884, "learning_rate": 1.391089108910891e-06, "loss": 0.0924, "step": 408 }, { "epoch": 0.3271345730853829, "grad_norm": 5.151153079821819, "learning_rate": 1.3894389438943894e-06, "loss": -0.1524, "step": 409 }, { "epoch": 0.32793441311737653, "grad_norm": 4.9429474730234935, "learning_rate": 1.3877887788778876e-06, "loss": -0.0066, "step": 410 }, { "epoch": 0.3287342531493701, "grad_norm": 3.8669767688401637, "learning_rate": 1.386138613861386e-06, "loss": -0.0998, "step": 411 }, { "epoch": 0.32953409318136373, "grad_norm": 4.1249285605053165, "learning_rate": 1.3844884488448845e-06, "loss": -0.1198, "step": 412 }, { "epoch": 0.33033393321335736, "grad_norm": 4.264021911092433, "learning_rate": 1.3828382838283827e-06, "loss": -0.027, "step": 413 }, { "epoch": 0.3311337732453509, "grad_norm": 8.192155984704781, "learning_rate": 1.381188118811881e-06, "loss": 0.0037, "step": 414 }, { "epoch": 0.33193361327734455, "grad_norm": 4.842071045333458, "learning_rate": 1.3795379537953794e-06, "loss": -0.0183, "step": 415 }, { "epoch": 0.3327334533093381, "grad_norm": 5.69008602834876, "learning_rate": 1.3778877887788779e-06, "loss": -0.037, "step": 416 }, { "epoch": 0.33353329334133175, "grad_norm": 3.4506588237689044, "learning_rate": 1.376237623762376e-06, "loss": -0.1827, "step": 417 }, { "epoch": 0.3343331333733253, "grad_norm": 4.276859677588479, "learning_rate": 1.3745874587458745e-06, "loss": -0.1397, "step": 418 }, { "epoch": 0.33513297340531895, "grad_norm": 5.137955642134524, "learning_rate": 1.3729372937293728e-06, "loss": 0.0313, "step": 419 }, { "epoch": 0.3359328134373125, "grad_norm": 5.626427193889533, "learning_rate": 1.3712871287128714e-06, "loss": 0.0585, "step": 420 }, { "epoch": 0.33673265346930614, "grad_norm": 4.259015114708382, "learning_rate": 1.3696369636963696e-06, "loss": -0.1352, "step": 421 }, { "epoch": 0.3375324935012997, "grad_norm": 3.900501996524311, "learning_rate": 1.3679867986798679e-06, "loss": -0.1541, "step": 422 }, { "epoch": 0.33833233353329334, "grad_norm": 21.342155698599925, "learning_rate": 1.3663366336633663e-06, "loss": 0.0163, "step": 423 }, { "epoch": 0.33913217356528697, "grad_norm": 9.376314198251674, "learning_rate": 1.3646864686468647e-06, "loss": -0.1147, "step": 424 }, { "epoch": 0.33993201359728054, "grad_norm": 3.9556694436435773, "learning_rate": 1.363036303630363e-06, "loss": 0.0607, "step": 425 }, { "epoch": 0.34073185362927416, "grad_norm": 4.413407376716041, "learning_rate": 1.3613861386138614e-06, "loss": 0.0269, "step": 426 }, { "epoch": 0.34153169366126773, "grad_norm": 4.5745629523971285, "learning_rate": 1.3597359735973596e-06, "loss": -0.1232, "step": 427 }, { "epoch": 0.34233153369326136, "grad_norm": 6.482169711595175, "learning_rate": 1.3580858085808579e-06, "loss": -0.0933, "step": 428 }, { "epoch": 0.34313137372525493, "grad_norm": 4.614948794989073, "learning_rate": 1.3564356435643565e-06, "loss": -0.0881, "step": 429 }, { "epoch": 0.34393121375724856, "grad_norm": 4.902443350581836, "learning_rate": 1.3547854785478547e-06, "loss": -0.0492, "step": 430 }, { "epoch": 0.3447310537892421, "grad_norm": 4.293832374460016, "learning_rate": 1.353135313531353e-06, "loss": -0.0988, "step": 431 }, { "epoch": 0.34553089382123575, "grad_norm": 4.239300667652253, "learning_rate": 1.3514851485148514e-06, "loss": -0.0395, "step": 432 }, { "epoch": 0.3463307338532294, "grad_norm": 6.000658634911202, "learning_rate": 1.3498349834983499e-06, "loss": -0.0687, "step": 433 }, { "epoch": 0.34713057388522295, "grad_norm": 4.533327665512432, "learning_rate": 1.348184818481848e-06, "loss": -0.0457, "step": 434 }, { "epoch": 0.3479304139172166, "grad_norm": 4.966203144811649, "learning_rate": 1.3465346534653465e-06, "loss": -0.0891, "step": 435 }, { "epoch": 0.34873025394921014, "grad_norm": 3.160979702375991, "learning_rate": 1.3448844884488448e-06, "loss": 0.0509, "step": 436 }, { "epoch": 0.34953009398120377, "grad_norm": 7.163626654459487, "learning_rate": 1.3432343234323432e-06, "loss": -0.1026, "step": 437 }, { "epoch": 0.35032993401319734, "grad_norm": 4.10929586240042, "learning_rate": 1.3415841584158416e-06, "loss": -0.0346, "step": 438 }, { "epoch": 0.35112977404519097, "grad_norm": 7.444864169509166, "learning_rate": 1.3399339933993399e-06, "loss": -0.084, "step": 439 }, { "epoch": 0.35192961407718454, "grad_norm": 4.279436158804133, "learning_rate": 1.338283828382838e-06, "loss": -0.0753, "step": 440 }, { "epoch": 0.35272945410917816, "grad_norm": 7.0310221317242965, "learning_rate": 1.3366336633663367e-06, "loss": 0.0822, "step": 441 }, { "epoch": 0.3535292941411718, "grad_norm": 3.546380500099962, "learning_rate": 1.334983498349835e-06, "loss": -0.0826, "step": 442 }, { "epoch": 0.35432913417316536, "grad_norm": 3.978575910618056, "learning_rate": 1.3333333333333332e-06, "loss": -0.0183, "step": 443 }, { "epoch": 0.355128974205159, "grad_norm": 4.893702894351932, "learning_rate": 1.3316831683168316e-06, "loss": 0.0513, "step": 444 }, { "epoch": 0.35592881423715256, "grad_norm": 4.712476792012751, "learning_rate": 1.3300330033003299e-06, "loss": 0.0161, "step": 445 }, { "epoch": 0.3567286542691462, "grad_norm": 4.363095681693482, "learning_rate": 1.3283828382838283e-06, "loss": -0.0878, "step": 446 }, { "epoch": 0.35752849430113975, "grad_norm": 3.6779713769559206, "learning_rate": 1.3267326732673268e-06, "loss": -0.0884, "step": 447 }, { "epoch": 0.3583283343331334, "grad_norm": 4.691244638726057, "learning_rate": 1.325082508250825e-06, "loss": 0.0164, "step": 448 }, { "epoch": 0.35912817436512695, "grad_norm": 3.9918624835208574, "learning_rate": 1.3234323432343232e-06, "loss": -0.0623, "step": 449 }, { "epoch": 0.3599280143971206, "grad_norm": 4.3423857158760475, "learning_rate": 1.3217821782178219e-06, "loss": -0.0166, "step": 450 }, { "epoch": 0.3607278544291142, "grad_norm": 3.3557272335230266, "learning_rate": 1.32013201320132e-06, "loss": -0.1243, "step": 451 }, { "epoch": 0.3615276944611078, "grad_norm": 4.121010209091045, "learning_rate": 1.3184818481848183e-06, "loss": 0.0157, "step": 452 }, { "epoch": 0.3623275344931014, "grad_norm": 5.61777014754645, "learning_rate": 1.3168316831683168e-06, "loss": 0.0009, "step": 453 }, { "epoch": 0.36312737452509497, "grad_norm": 3.9497241442966673, "learning_rate": 1.3151815181518152e-06, "loss": -0.0748, "step": 454 }, { "epoch": 0.3639272145570886, "grad_norm": 3.78165099484685, "learning_rate": 1.3135313531353134e-06, "loss": 0.0137, "step": 455 }, { "epoch": 0.36472705458908217, "grad_norm": 3.8365555088656573, "learning_rate": 1.3118811881188119e-06, "loss": 0.0, "step": 456 }, { "epoch": 0.3655268946210758, "grad_norm": 3.9613296946642933, "learning_rate": 1.31023102310231e-06, "loss": -0.068, "step": 457 }, { "epoch": 0.36632673465306936, "grad_norm": 3.558717962079936, "learning_rate": 1.3085808580858083e-06, "loss": -0.1112, "step": 458 }, { "epoch": 0.367126574685063, "grad_norm": 4.93902023669042, "learning_rate": 1.306930693069307e-06, "loss": -0.0433, "step": 459 }, { "epoch": 0.3679264147170566, "grad_norm": 4.69421251966819, "learning_rate": 1.3052805280528052e-06, "loss": 0.0994, "step": 460 }, { "epoch": 0.3687262547490502, "grad_norm": 6.3721851791610336, "learning_rate": 1.3036303630363034e-06, "loss": -0.1078, "step": 461 }, { "epoch": 0.3695260947810438, "grad_norm": 3.389060929800596, "learning_rate": 1.3019801980198019e-06, "loss": -0.088, "step": 462 }, { "epoch": 0.3703259348130374, "grad_norm": 5.748513070947605, "learning_rate": 1.3003300330033003e-06, "loss": -0.029, "step": 463 }, { "epoch": 0.371125774845031, "grad_norm": 4.690511727792042, "learning_rate": 1.2986798679867985e-06, "loss": -0.0756, "step": 464 }, { "epoch": 0.3719256148770246, "grad_norm": 5.217441052047622, "learning_rate": 1.297029702970297e-06, "loss": -0.0748, "step": 465 }, { "epoch": 0.3727254549090182, "grad_norm": 4.240980113487688, "learning_rate": 1.2953795379537952e-06, "loss": -0.0008, "step": 466 }, { "epoch": 0.3735252949410118, "grad_norm": 4.743889341456478, "learning_rate": 1.2937293729372937e-06, "loss": -0.0671, "step": 467 }, { "epoch": 0.3743251349730054, "grad_norm": 4.473362389672442, "learning_rate": 1.292079207920792e-06, "loss": 0.0101, "step": 468 }, { "epoch": 0.375124975004999, "grad_norm": 4.197750015674087, "learning_rate": 1.2904290429042903e-06, "loss": 0.0175, "step": 469 }, { "epoch": 0.3759248150369926, "grad_norm": 6.425414954415456, "learning_rate": 1.2887788778877888e-06, "loss": -0.0783, "step": 470 }, { "epoch": 0.3767246550689862, "grad_norm": 2.9864850798252855, "learning_rate": 1.2871287128712872e-06, "loss": -0.0884, "step": 471 }, { "epoch": 0.3775244951009798, "grad_norm": 5.261385424958508, "learning_rate": 1.2854785478547854e-06, "loss": -0.09, "step": 472 }, { "epoch": 0.3783243351329734, "grad_norm": 4.958045993888585, "learning_rate": 1.2838283828382839e-06, "loss": -0.0682, "step": 473 }, { "epoch": 0.379124175164967, "grad_norm": 4.7190018138263605, "learning_rate": 1.282178217821782e-06, "loss": -0.1395, "step": 474 }, { "epoch": 0.3799240151969606, "grad_norm": 5.882864051380202, "learning_rate": 1.2805280528052803e-06, "loss": -0.1829, "step": 475 }, { "epoch": 0.3807238552289542, "grad_norm": 3.7556665205378352, "learning_rate": 1.278877887788779e-06, "loss": -0.021, "step": 476 }, { "epoch": 0.3815236952609478, "grad_norm": 4.086321431606577, "learning_rate": 1.2772277227722772e-06, "loss": -0.0382, "step": 477 }, { "epoch": 0.3823235352929414, "grad_norm": 4.616776862820448, "learning_rate": 1.2755775577557754e-06, "loss": -0.1779, "step": 478 }, { "epoch": 0.383123375324935, "grad_norm": 4.004332580198827, "learning_rate": 1.2739273927392739e-06, "loss": -0.0252, "step": 479 }, { "epoch": 0.38392321535692864, "grad_norm": 4.624789258949781, "learning_rate": 1.2722772277227723e-06, "loss": -0.0274, "step": 480 }, { "epoch": 0.3847230553889222, "grad_norm": 4.107644532644881, "learning_rate": 1.2706270627062705e-06, "loss": -0.0706, "step": 481 }, { "epoch": 0.38552289542091583, "grad_norm": 5.606536912327608, "learning_rate": 1.268976897689769e-06, "loss": -0.1579, "step": 482 }, { "epoch": 0.3863227354529094, "grad_norm": 3.661768864377637, "learning_rate": 1.2673267326732672e-06, "loss": -0.0483, "step": 483 }, { "epoch": 0.38712257548490303, "grad_norm": 4.163789722318428, "learning_rate": 1.2656765676567657e-06, "loss": -0.1628, "step": 484 }, { "epoch": 0.3879224155168966, "grad_norm": 5.862521290689618, "learning_rate": 1.264026402640264e-06, "loss": -0.0378, "step": 485 }, { "epoch": 0.3887222555488902, "grad_norm": 4.451191371926914, "learning_rate": 1.2623762376237623e-06, "loss": 0.041, "step": 486 }, { "epoch": 0.3895220955808838, "grad_norm": 6.554041470323983, "learning_rate": 1.2607260726072606e-06, "loss": -0.0089, "step": 487 }, { "epoch": 0.3903219356128774, "grad_norm": 4.958459911280161, "learning_rate": 1.259075907590759e-06, "loss": -0.0351, "step": 488 }, { "epoch": 0.39112177564487105, "grad_norm": 5.5754285433841595, "learning_rate": 1.2574257425742574e-06, "loss": -0.0866, "step": 489 }, { "epoch": 0.3919216156768646, "grad_norm": 4.927561354349523, "learning_rate": 1.2557755775577557e-06, "loss": 0.0114, "step": 490 }, { "epoch": 0.39272145570885825, "grad_norm": 4.275369657183623, "learning_rate": 1.2541254125412541e-06, "loss": 0.0731, "step": 491 }, { "epoch": 0.3935212957408518, "grad_norm": 4.553288397020381, "learning_rate": 1.2524752475247523e-06, "loss": -0.0366, "step": 492 }, { "epoch": 0.39432113577284544, "grad_norm": 4.3640356820358415, "learning_rate": 1.2508250825082508e-06, "loss": -0.026, "step": 493 }, { "epoch": 0.395120975804839, "grad_norm": 6.781778763227194, "learning_rate": 1.2491749174917492e-06, "loss": 0.0272, "step": 494 }, { "epoch": 0.39592081583683264, "grad_norm": 4.147600624744722, "learning_rate": 1.2475247524752474e-06, "loss": -0.0533, "step": 495 }, { "epoch": 0.3967206558688262, "grad_norm": 7.925587764087279, "learning_rate": 1.2458745874587457e-06, "loss": -0.0023, "step": 496 }, { "epoch": 0.39752049590081984, "grad_norm": 3.9471683782785267, "learning_rate": 1.2442244224422443e-06, "loss": -0.0624, "step": 497 }, { "epoch": 0.39832033593281346, "grad_norm": 5.046330000323796, "learning_rate": 1.2425742574257426e-06, "loss": -0.1152, "step": 498 }, { "epoch": 0.39912017596480703, "grad_norm": 3.797212185428219, "learning_rate": 1.2409240924092408e-06, "loss": -0.0566, "step": 499 }, { "epoch": 0.39992001599680066, "grad_norm": 5.219397955775355, "learning_rate": 1.2392739273927392e-06, "loss": -0.0473, "step": 500 }, { "epoch": 0.40071985602879423, "grad_norm": 4.888043487068187, "learning_rate": 1.2376237623762375e-06, "loss": -0.0766, "step": 501 }, { "epoch": 0.40151969606078786, "grad_norm": 4.812490488018197, "learning_rate": 1.2359735973597359e-06, "loss": -0.0063, "step": 502 }, { "epoch": 0.4023195360927814, "grad_norm": 4.137421603194797, "learning_rate": 1.2343234323432343e-06, "loss": -0.0894, "step": 503 }, { "epoch": 0.40311937612477505, "grad_norm": 4.446466976487968, "learning_rate": 1.2326732673267326e-06, "loss": -0.1032, "step": 504 }, { "epoch": 0.4039192161567686, "grad_norm": 4.517828908408806, "learning_rate": 1.2310231023102308e-06, "loss": -0.0366, "step": 505 }, { "epoch": 0.40471905618876225, "grad_norm": 4.025363379714323, "learning_rate": 1.2293729372937294e-06, "loss": -0.0322, "step": 506 }, { "epoch": 0.4055188962207559, "grad_norm": 3.0065985273378026, "learning_rate": 1.2277227722772277e-06, "loss": -0.1451, "step": 507 }, { "epoch": 0.40631873625274945, "grad_norm": 6.102992165053075, "learning_rate": 1.226072607260726e-06, "loss": -0.0947, "step": 508 }, { "epoch": 0.4071185762847431, "grad_norm": 5.373085923158729, "learning_rate": 1.2244224422442243e-06, "loss": -0.038, "step": 509 }, { "epoch": 0.40791841631673664, "grad_norm": 6.507842394701745, "learning_rate": 1.2227722772277228e-06, "loss": -0.0416, "step": 510 }, { "epoch": 0.40871825634873027, "grad_norm": 6.211682775156014, "learning_rate": 1.221122112211221e-06, "loss": -0.0155, "step": 511 }, { "epoch": 0.40951809638072384, "grad_norm": 3.253438304578216, "learning_rate": 1.2194719471947194e-06, "loss": -0.0971, "step": 512 }, { "epoch": 0.41031793641271747, "grad_norm": 3.562574820820311, "learning_rate": 1.2178217821782177e-06, "loss": -0.0982, "step": 513 }, { "epoch": 0.41111777644471104, "grad_norm": 3.5746182911507067, "learning_rate": 1.216171617161716e-06, "loss": -0.023, "step": 514 }, { "epoch": 0.41191761647670466, "grad_norm": 3.488381447372906, "learning_rate": 1.2145214521452146e-06, "loss": 0.1215, "step": 515 }, { "epoch": 0.41271745650869823, "grad_norm": 3.1641673883077788, "learning_rate": 1.2128712871287128e-06, "loss": -0.0269, "step": 516 }, { "epoch": 0.41351729654069186, "grad_norm": 4.13780306256476, "learning_rate": 1.211221122112211e-06, "loss": -0.08, "step": 517 }, { "epoch": 0.4143171365726855, "grad_norm": 5.23630954806011, "learning_rate": 1.2095709570957095e-06, "loss": 0.0683, "step": 518 }, { "epoch": 0.41511697660467906, "grad_norm": 5.08041039318514, "learning_rate": 1.207920792079208e-06, "loss": -0.0634, "step": 519 }, { "epoch": 0.4159168166366727, "grad_norm": 4.328106931793288, "learning_rate": 1.2062706270627063e-06, "loss": -0.1251, "step": 520 }, { "epoch": 0.41671665666866625, "grad_norm": 3.8685318893058978, "learning_rate": 1.2046204620462046e-06, "loss": -0.0397, "step": 521 }, { "epoch": 0.4175164967006599, "grad_norm": 4.228873306898751, "learning_rate": 1.2029702970297028e-06, "loss": -0.0097, "step": 522 }, { "epoch": 0.41831633673265345, "grad_norm": 8.545101114191558, "learning_rate": 1.2013201320132014e-06, "loss": 0.0257, "step": 523 }, { "epoch": 0.4191161767646471, "grad_norm": 4.940963303084406, "learning_rate": 1.1996699669966997e-06, "loss": -0.0218, "step": 524 }, { "epoch": 0.41991601679664065, "grad_norm": 4.486186548838174, "learning_rate": 1.198019801980198e-06, "loss": -0.1274, "step": 525 }, { "epoch": 0.42071585682863427, "grad_norm": 6.117424152809813, "learning_rate": 1.1963696369636963e-06, "loss": -0.0412, "step": 526 }, { "epoch": 0.4215156968606279, "grad_norm": 6.026088581435606, "learning_rate": 1.1947194719471948e-06, "loss": -0.1461, "step": 527 }, { "epoch": 0.42231553689262147, "grad_norm": 4.391032244166686, "learning_rate": 1.193069306930693e-06, "loss": 0.0956, "step": 528 }, { "epoch": 0.4231153769246151, "grad_norm": 5.139889742785653, "learning_rate": 1.1914191419141915e-06, "loss": -0.0998, "step": 529 }, { "epoch": 0.42391521695660866, "grad_norm": 5.63979191849408, "learning_rate": 1.1897689768976897e-06, "loss": -0.0672, "step": 530 }, { "epoch": 0.4247150569886023, "grad_norm": 8.323803093358931, "learning_rate": 1.188118811881188e-06, "loss": -0.1988, "step": 531 }, { "epoch": 0.42551489702059586, "grad_norm": 3.5224991117629263, "learning_rate": 1.1864686468646866e-06, "loss": -0.0976, "step": 532 }, { "epoch": 0.4263147370525895, "grad_norm": 3.3222865799787407, "learning_rate": 1.1848184818481848e-06, "loss": -0.1625, "step": 533 }, { "epoch": 0.42711457708458306, "grad_norm": 4.473219337166838, "learning_rate": 1.183168316831683e-06, "loss": -0.102, "step": 534 }, { "epoch": 0.4279144171165767, "grad_norm": 4.255445918061684, "learning_rate": 1.1815181518151815e-06, "loss": 0.0347, "step": 535 }, { "epoch": 0.4287142571485703, "grad_norm": 5.273596279438336, "learning_rate": 1.17986798679868e-06, "loss": -0.0233, "step": 536 }, { "epoch": 0.4295140971805639, "grad_norm": 3.2928329204452167, "learning_rate": 1.1782178217821781e-06, "loss": -0.1795, "step": 537 }, { "epoch": 0.4303139372125575, "grad_norm": 3.1943187707330676, "learning_rate": 1.1765676567656766e-06, "loss": -0.1193, "step": 538 }, { "epoch": 0.4311137772445511, "grad_norm": 4.457107636902936, "learning_rate": 1.1749174917491748e-06, "loss": -0.0256, "step": 539 }, { "epoch": 0.4319136172765447, "grad_norm": 4.508728040150466, "learning_rate": 1.1732673267326732e-06, "loss": -0.1272, "step": 540 }, { "epoch": 0.4327134573085383, "grad_norm": 4.679176366336832, "learning_rate": 1.1716171617161717e-06, "loss": -0.0107, "step": 541 }, { "epoch": 0.4335132973405319, "grad_norm": 4.572704243632147, "learning_rate": 1.16996699669967e-06, "loss": -0.0189, "step": 542 }, { "epoch": 0.43431313737252547, "grad_norm": 3.823996049360206, "learning_rate": 1.1683168316831681e-06, "loss": 0.0071, "step": 543 }, { "epoch": 0.4351129774045191, "grad_norm": 4.448417665137879, "learning_rate": 1.1666666666666668e-06, "loss": 0.0018, "step": 544 }, { "epoch": 0.4359128174365127, "grad_norm": 3.7067784825161625, "learning_rate": 1.165016501650165e-06, "loss": -0.0643, "step": 545 }, { "epoch": 0.4367126574685063, "grad_norm": 4.304960211061566, "learning_rate": 1.1633663366336632e-06, "loss": 0.0434, "step": 546 }, { "epoch": 0.4375124975004999, "grad_norm": 4.809624741919171, "learning_rate": 1.1617161716171617e-06, "loss": -0.1175, "step": 547 }, { "epoch": 0.4383123375324935, "grad_norm": 3.572402442577118, "learning_rate": 1.16006600660066e-06, "loss": 0.0641, "step": 548 }, { "epoch": 0.4391121775644871, "grad_norm": 3.1323439706728173, "learning_rate": 1.1584158415841584e-06, "loss": -0.1315, "step": 549 }, { "epoch": 0.4399120175964807, "grad_norm": 6.63310206919076, "learning_rate": 1.1567656765676568e-06, "loss": -0.1572, "step": 550 }, { "epoch": 0.4407118576284743, "grad_norm": 5.7194336862922475, "learning_rate": 1.155115511551155e-06, "loss": -0.0498, "step": 551 }, { "epoch": 0.4415116976604679, "grad_norm": 5.0355458371512976, "learning_rate": 1.1534653465346533e-06, "loss": -0.0343, "step": 552 }, { "epoch": 0.4423115376924615, "grad_norm": 4.479813545297925, "learning_rate": 1.151815181518152e-06, "loss": -0.052, "step": 553 }, { "epoch": 0.44311137772445514, "grad_norm": 5.515043665694904, "learning_rate": 1.1501650165016501e-06, "loss": -0.0962, "step": 554 }, { "epoch": 0.4439112177564487, "grad_norm": 4.35349503345848, "learning_rate": 1.1485148514851484e-06, "loss": -0.1718, "step": 555 }, { "epoch": 0.44471105778844233, "grad_norm": 4.324313582265245, "learning_rate": 1.1468646864686468e-06, "loss": -0.1059, "step": 556 }, { "epoch": 0.4455108978204359, "grad_norm": 6.603357917371442, "learning_rate": 1.1452145214521452e-06, "loss": -0.0179, "step": 557 }, { "epoch": 0.44631073785242953, "grad_norm": 4.731847308612818, "learning_rate": 1.1435643564356435e-06, "loss": -0.0285, "step": 558 }, { "epoch": 0.4471105778844231, "grad_norm": 3.2387655192879, "learning_rate": 1.141914191419142e-06, "loss": 0.0157, "step": 559 }, { "epoch": 0.4479104179164167, "grad_norm": 4.11046977381839, "learning_rate": 1.1402640264026401e-06, "loss": -0.0738, "step": 560 }, { "epoch": 0.4487102579484103, "grad_norm": 4.7767170879491765, "learning_rate": 1.1386138613861384e-06, "loss": -0.1826, "step": 561 }, { "epoch": 0.4495100979804039, "grad_norm": 4.712736226037487, "learning_rate": 1.136963696369637e-06, "loss": -0.0453, "step": 562 }, { "epoch": 0.45030993801239755, "grad_norm": 5.2823586287855795, "learning_rate": 1.1353135313531353e-06, "loss": -0.0112, "step": 563 }, { "epoch": 0.4511097780443911, "grad_norm": 4.313375854007458, "learning_rate": 1.1336633663366335e-06, "loss": -0.1529, "step": 564 }, { "epoch": 0.45190961807638474, "grad_norm": 3.661113275988152, "learning_rate": 1.132013201320132e-06, "loss": 0.0182, "step": 565 }, { "epoch": 0.4527094581083783, "grad_norm": 3.3641831166871015, "learning_rate": 1.1303630363036304e-06, "loss": -0.0967, "step": 566 }, { "epoch": 0.45350929814037194, "grad_norm": 3.5187461731044634, "learning_rate": 1.1287128712871286e-06, "loss": -0.0115, "step": 567 }, { "epoch": 0.4543091381723655, "grad_norm": 5.055238201441692, "learning_rate": 1.127062706270627e-06, "loss": -0.0701, "step": 568 }, { "epoch": 0.45510897820435914, "grad_norm": 5.048998878882335, "learning_rate": 1.1254125412541253e-06, "loss": -0.0982, "step": 569 }, { "epoch": 0.4559088182363527, "grad_norm": 6.706995417966311, "learning_rate": 1.123762376237624e-06, "loss": -0.0323, "step": 570 }, { "epoch": 0.45670865826834633, "grad_norm": 5.717361031375047, "learning_rate": 1.1221122112211221e-06, "loss": -0.0251, "step": 571 }, { "epoch": 0.4575084983003399, "grad_norm": 5.073568794377317, "learning_rate": 1.1204620462046204e-06, "loss": -0.2165, "step": 572 }, { "epoch": 0.45830833833233353, "grad_norm": 4.340499430104141, "learning_rate": 1.1188118811881188e-06, "loss": -0.1008, "step": 573 }, { "epoch": 0.45910817836432716, "grad_norm": 11.188522206922801, "learning_rate": 1.117161716171617e-06, "loss": -0.0362, "step": 574 }, { "epoch": 0.45990801839632073, "grad_norm": 8.96889120914533, "learning_rate": 1.1155115511551155e-06, "loss": 0.0492, "step": 575 }, { "epoch": 0.46070785842831435, "grad_norm": 4.7623433805729825, "learning_rate": 1.113861386138614e-06, "loss": -0.0462, "step": 576 }, { "epoch": 0.4615076984603079, "grad_norm": 4.809552169223393, "learning_rate": 1.1122112211221121e-06, "loss": -0.0298, "step": 577 }, { "epoch": 0.46230753849230155, "grad_norm": 4.199314588295882, "learning_rate": 1.1105610561056104e-06, "loss": -0.1501, "step": 578 }, { "epoch": 0.4631073785242951, "grad_norm": 3.531450924178731, "learning_rate": 1.108910891089109e-06, "loss": -0.0589, "step": 579 }, { "epoch": 0.46390721855628875, "grad_norm": 4.247443159328166, "learning_rate": 1.1072607260726073e-06, "loss": -0.0362, "step": 580 }, { "epoch": 0.4647070585882823, "grad_norm": 3.4845953217374346, "learning_rate": 1.1056105610561055e-06, "loss": -0.0171, "step": 581 }, { "epoch": 0.46550689862027594, "grad_norm": 6.269891746353069, "learning_rate": 1.103960396039604e-06, "loss": -0.0552, "step": 582 }, { "epoch": 0.46630673865226957, "grad_norm": 4.110925186237378, "learning_rate": 1.1023102310231024e-06, "loss": -0.1525, "step": 583 }, { "epoch": 0.46710657868426314, "grad_norm": 4.027706037445169, "learning_rate": 1.1006600660066006e-06, "loss": -0.1283, "step": 584 }, { "epoch": 0.46790641871625677, "grad_norm": 6.085436348609565, "learning_rate": 1.099009900990099e-06, "loss": 0.0152, "step": 585 }, { "epoch": 0.46870625874825034, "grad_norm": 4.403688809188704, "learning_rate": 1.0973597359735973e-06, "loss": -0.0462, "step": 586 }, { "epoch": 0.46950609878024396, "grad_norm": 3.7204758942669924, "learning_rate": 1.0957095709570955e-06, "loss": -0.0472, "step": 587 }, { "epoch": 0.47030593881223753, "grad_norm": 4.16282077045935, "learning_rate": 1.0940594059405941e-06, "loss": -0.1098, "step": 588 }, { "epoch": 0.47110577884423116, "grad_norm": 4.490618781800236, "learning_rate": 1.0924092409240924e-06, "loss": -0.0323, "step": 589 }, { "epoch": 0.47190561887622473, "grad_norm": 3.83450769320751, "learning_rate": 1.0907590759075906e-06, "loss": -0.0482, "step": 590 }, { "epoch": 0.47270545890821836, "grad_norm": 3.951497100856045, "learning_rate": 1.089108910891089e-06, "loss": -0.1206, "step": 591 }, { "epoch": 0.473505298940212, "grad_norm": 4.798754196622245, "learning_rate": 1.0874587458745875e-06, "loss": 0.0584, "step": 592 }, { "epoch": 0.47430513897220555, "grad_norm": 4.437199971976538, "learning_rate": 1.0858085808580857e-06, "loss": -0.0666, "step": 593 }, { "epoch": 0.4751049790041992, "grad_norm": 5.877945506525689, "learning_rate": 1.0841584158415842e-06, "loss": -0.0795, "step": 594 }, { "epoch": 0.47590481903619275, "grad_norm": 4.9827179740392165, "learning_rate": 1.0825082508250824e-06, "loss": -0.145, "step": 595 }, { "epoch": 0.4767046590681864, "grad_norm": 4.579893011650921, "learning_rate": 1.0808580858085808e-06, "loss": 0.0438, "step": 596 }, { "epoch": 0.47750449910017995, "grad_norm": 4.6741113317873975, "learning_rate": 1.0792079207920793e-06, "loss": -0.0471, "step": 597 }, { "epoch": 0.4783043391321736, "grad_norm": 4.45102253856279, "learning_rate": 1.0775577557755775e-06, "loss": -0.0705, "step": 598 }, { "epoch": 0.47910417916416714, "grad_norm": 7.735572429403455, "learning_rate": 1.0759075907590757e-06, "loss": -0.0004, "step": 599 }, { "epoch": 0.47990401919616077, "grad_norm": 4.3890790831054645, "learning_rate": 1.0742574257425744e-06, "loss": -0.11, "step": 600 }, { "epoch": 0.4807038592281544, "grad_norm": 3.750715441802859, "learning_rate": 1.0726072607260726e-06, "loss": -0.0943, "step": 601 }, { "epoch": 0.48150369926014797, "grad_norm": 3.65650652124133, "learning_rate": 1.0709570957095708e-06, "loss": -0.169, "step": 602 }, { "epoch": 0.4823035392921416, "grad_norm": 4.55201247427091, "learning_rate": 1.0693069306930693e-06, "loss": -0.0753, "step": 603 }, { "epoch": 0.48310337932413516, "grad_norm": 4.55776208974576, "learning_rate": 1.0676567656765675e-06, "loss": -0.1347, "step": 604 }, { "epoch": 0.4839032193561288, "grad_norm": 4.48864117831728, "learning_rate": 1.066006600660066e-06, "loss": -0.0974, "step": 605 }, { "epoch": 0.48470305938812236, "grad_norm": 3.9363005491507668, "learning_rate": 1.0643564356435644e-06, "loss": -0.0079, "step": 606 }, { "epoch": 0.485502899420116, "grad_norm": 4.252862078789536, "learning_rate": 1.0627062706270626e-06, "loss": -0.0136, "step": 607 }, { "epoch": 0.48630273945210956, "grad_norm": 4.543019341197776, "learning_rate": 1.0610561056105608e-06, "loss": -0.1351, "step": 608 }, { "epoch": 0.4871025794841032, "grad_norm": 5.655845163937271, "learning_rate": 1.0594059405940595e-06, "loss": -0.0266, "step": 609 }, { "epoch": 0.4879024195160968, "grad_norm": 11.681523554473147, "learning_rate": 1.0577557755775577e-06, "loss": -0.1126, "step": 610 }, { "epoch": 0.4887022595480904, "grad_norm": 4.307554643653013, "learning_rate": 1.056105610561056e-06, "loss": -0.0647, "step": 611 }, { "epoch": 0.489502099580084, "grad_norm": 5.732899988046993, "learning_rate": 1.0544554455445544e-06, "loss": -0.0934, "step": 612 }, { "epoch": 0.4903019396120776, "grad_norm": 4.150445541916088, "learning_rate": 1.0528052805280528e-06, "loss": -0.1406, "step": 613 }, { "epoch": 0.4911017796440712, "grad_norm": 5.1082669144378725, "learning_rate": 1.051155115511551e-06, "loss": 0.0558, "step": 614 }, { "epoch": 0.49190161967606477, "grad_norm": 7.67678160227598, "learning_rate": 1.0495049504950495e-06, "loss": 0.0088, "step": 615 }, { "epoch": 0.4927014597080584, "grad_norm": 3.354349182723264, "learning_rate": 1.0478547854785477e-06, "loss": -0.1359, "step": 616 }, { "epoch": 0.49350129974005197, "grad_norm": 4.725187848913473, "learning_rate": 1.046204620462046e-06, "loss": -0.0762, "step": 617 }, { "epoch": 0.4943011397720456, "grad_norm": 5.7805399690672825, "learning_rate": 1.0445544554455446e-06, "loss": 0.0578, "step": 618 }, { "epoch": 0.49510097980403917, "grad_norm": 4.179300735975294, "learning_rate": 1.0429042904290428e-06, "loss": 0.005, "step": 619 }, { "epoch": 0.4959008198360328, "grad_norm": 5.098901682751034, "learning_rate": 1.0412541254125413e-06, "loss": -0.0156, "step": 620 }, { "epoch": 0.4967006598680264, "grad_norm": 8.248543813099444, "learning_rate": 1.0396039603960395e-06, "loss": -0.0457, "step": 621 }, { "epoch": 0.49750049990002, "grad_norm": 4.864094199749138, "learning_rate": 1.037953795379538e-06, "loss": 0.0829, "step": 622 }, { "epoch": 0.4983003399320136, "grad_norm": 4.692602610129244, "learning_rate": 1.0363036303630364e-06, "loss": -0.0396, "step": 623 }, { "epoch": 0.4991001799640072, "grad_norm": 3.696934414641692, "learning_rate": 1.0346534653465346e-06, "loss": -0.1939, "step": 624 }, { "epoch": 0.4999000199960008, "grad_norm": 5.491055932283412, "learning_rate": 1.0330033003300328e-06, "loss": -0.0639, "step": 625 }, { "epoch": 0.5006998600279944, "grad_norm": 3.92108657024522, "learning_rate": 1.0313531353135315e-06, "loss": -0.0969, "step": 626 }, { "epoch": 0.501499700059988, "grad_norm": 5.243975134143899, "learning_rate": 1.0297029702970297e-06, "loss": -0.0206, "step": 627 }, { "epoch": 0.5022995400919816, "grad_norm": 4.328763949057532, "learning_rate": 1.028052805280528e-06, "loss": -0.1081, "step": 628 }, { "epoch": 0.5030993801239753, "grad_norm": 4.197218364380273, "learning_rate": 1.0264026402640264e-06, "loss": -0.0932, "step": 629 }, { "epoch": 0.5038992201559688, "grad_norm": 4.300285400855226, "learning_rate": 1.0247524752475248e-06, "loss": -0.1016, "step": 630 }, { "epoch": 0.5046990601879624, "grad_norm": 4.4977354073528675, "learning_rate": 1.023102310231023e-06, "loss": -0.0846, "step": 631 }, { "epoch": 0.505498900219956, "grad_norm": 4.303063503427107, "learning_rate": 1.0214521452145215e-06, "loss": -0.0724, "step": 632 }, { "epoch": 0.5062987402519497, "grad_norm": 6.073276027125735, "learning_rate": 1.0198019801980197e-06, "loss": -0.0326, "step": 633 }, { "epoch": 0.5070985802839432, "grad_norm": 4.973392453051079, "learning_rate": 1.018151815181518e-06, "loss": 0.0189, "step": 634 }, { "epoch": 0.5078984203159368, "grad_norm": 4.811288854611089, "learning_rate": 1.0165016501650166e-06, "loss": -0.1415, "step": 635 }, { "epoch": 0.5086982603479304, "grad_norm": 4.853499316182358, "learning_rate": 1.0148514851485148e-06, "loss": -0.0896, "step": 636 }, { "epoch": 0.509498100379924, "grad_norm": 3.839359784599905, "learning_rate": 1.013201320132013e-06, "loss": -0.0358, "step": 637 }, { "epoch": 0.5102979404119176, "grad_norm": 4.783165043297262, "learning_rate": 1.0115511551155115e-06, "loss": -0.0102, "step": 638 }, { "epoch": 0.5110977804439112, "grad_norm": 5.184943614862812, "learning_rate": 1.00990099009901e-06, "loss": -0.1354, "step": 639 }, { "epoch": 0.5118976204759048, "grad_norm": 39.49636862757923, "learning_rate": 1.0082508250825082e-06, "loss": 0.0023, "step": 640 }, { "epoch": 0.5126974605078984, "grad_norm": 4.083973866781674, "learning_rate": 1.0066006600660066e-06, "loss": -0.0357, "step": 641 }, { "epoch": 0.5134973005398921, "grad_norm": 5.817761080874723, "learning_rate": 1.0049504950495048e-06, "loss": -0.0441, "step": 642 }, { "epoch": 0.5142971405718856, "grad_norm": 8.811705641420119, "learning_rate": 1.0033003300330033e-06, "loss": 0.1013, "step": 643 }, { "epoch": 0.5150969806038792, "grad_norm": 4.211618456869653, "learning_rate": 1.0016501650165017e-06, "loss": -0.1781, "step": 644 }, { "epoch": 0.5158968206358728, "grad_norm": 6.430696021299668, "learning_rate": 1e-06, "loss": -0.0475, "step": 645 }, { "epoch": 0.5166966606678665, "grad_norm": 4.287574273625528, "learning_rate": 9.983498349834984e-07, "loss": -0.0304, "step": 646 }, { "epoch": 0.51749650069986, "grad_norm": 2.916483591782696, "learning_rate": 9.966996699669966e-07, "loss": -0.1574, "step": 647 }, { "epoch": 0.5182963407318536, "grad_norm": 5.071406876730617, "learning_rate": 9.95049504950495e-07, "loss": -0.0227, "step": 648 }, { "epoch": 0.5190961807638472, "grad_norm": 3.863107212570657, "learning_rate": 9.933993399339933e-07, "loss": -0.0452, "step": 649 }, { "epoch": 0.5198960207958409, "grad_norm": 4.234640528387632, "learning_rate": 9.917491749174917e-07, "loss": -0.0816, "step": 650 }, { "epoch": 0.5206958608278345, "grad_norm": 4.255603238118902, "learning_rate": 9.9009900990099e-07, "loss": -0.1311, "step": 651 }, { "epoch": 0.521495700859828, "grad_norm": 4.723143092604518, "learning_rate": 9.884488448844884e-07, "loss": -0.1076, "step": 652 }, { "epoch": 0.5222955408918216, "grad_norm": 4.609676855516043, "learning_rate": 9.867986798679866e-07, "loss": -0.0468, "step": 653 }, { "epoch": 0.5230953809238152, "grad_norm": 4.648497611546731, "learning_rate": 9.85148514851485e-07, "loss": 0.066, "step": 654 }, { "epoch": 0.5238952209558089, "grad_norm": 4.444890919088204, "learning_rate": 9.834983498349835e-07, "loss": -0.0954, "step": 655 }, { "epoch": 0.5246950609878024, "grad_norm": 3.789960680030435, "learning_rate": 9.818481848184817e-07, "loss": -0.0846, "step": 656 }, { "epoch": 0.525494901019796, "grad_norm": 7.20767352956141, "learning_rate": 9.801980198019802e-07, "loss": -0.0689, "step": 657 }, { "epoch": 0.5262947410517896, "grad_norm": 3.8251645221108883, "learning_rate": 9.785478547854786e-07, "loss": -0.0026, "step": 658 }, { "epoch": 0.5270945810837833, "grad_norm": 10.44191187281191, "learning_rate": 9.768976897689768e-07, "loss": -0.1835, "step": 659 }, { "epoch": 0.5278944211157769, "grad_norm": 6.355832474610427, "learning_rate": 9.75247524752475e-07, "loss": -0.0168, "step": 660 }, { "epoch": 0.5286942611477704, "grad_norm": 3.864986648832606, "learning_rate": 9.735973597359735e-07, "loss": -0.0699, "step": 661 }, { "epoch": 0.529494101179764, "grad_norm": 9.07974895718424, "learning_rate": 9.71947194719472e-07, "loss": -0.0013, "step": 662 }, { "epoch": 0.5302939412117577, "grad_norm": 10.325195266318097, "learning_rate": 9.702970297029702e-07, "loss": -0.1641, "step": 663 }, { "epoch": 0.5310937812437513, "grad_norm": 4.441568363287549, "learning_rate": 9.686468646864686e-07, "loss": 0.0397, "step": 664 }, { "epoch": 0.5318936212757448, "grad_norm": 5.1229672005813605, "learning_rate": 9.66996699669967e-07, "loss": -0.1918, "step": 665 }, { "epoch": 0.5326934613077384, "grad_norm": 4.721345907443594, "learning_rate": 9.653465346534653e-07, "loss": -0.0215, "step": 666 }, { "epoch": 0.533493301339732, "grad_norm": 7.118557074848867, "learning_rate": 9.636963696369637e-07, "loss": -0.0039, "step": 667 }, { "epoch": 0.5342931413717257, "grad_norm": 4.7832103026691755, "learning_rate": 9.62046204620462e-07, "loss": -0.0221, "step": 668 }, { "epoch": 0.5350929814037193, "grad_norm": 6.3240275362924505, "learning_rate": 9.603960396039604e-07, "loss": -0.0409, "step": 669 }, { "epoch": 0.5358928214357128, "grad_norm": 4.027979744126303, "learning_rate": 9.587458745874586e-07, "loss": -0.0503, "step": 670 }, { "epoch": 0.5366926614677064, "grad_norm": 4.74302864470449, "learning_rate": 9.57095709570957e-07, "loss": -0.0521, "step": 671 }, { "epoch": 0.5374925014997001, "grad_norm": 4.985380734947703, "learning_rate": 9.554455445544553e-07, "loss": -0.082, "step": 672 }, { "epoch": 0.5382923415316937, "grad_norm": 4.472849852523853, "learning_rate": 9.537953795379537e-07, "loss": 0.1016, "step": 673 }, { "epoch": 0.5390921815636872, "grad_norm": 3.308647938974776, "learning_rate": 9.521452145214522e-07, "loss": -0.0457, "step": 674 }, { "epoch": 0.5398920215956808, "grad_norm": 4.677054809392709, "learning_rate": 9.504950495049504e-07, "loss": -0.1658, "step": 675 }, { "epoch": 0.5406918616276745, "grad_norm": 4.067875284374342, "learning_rate": 9.488448844884487e-07, "loss": 0.0196, "step": 676 }, { "epoch": 0.5414917016596681, "grad_norm": 3.730643734647644, "learning_rate": 9.471947194719472e-07, "loss": -0.0894, "step": 677 }, { "epoch": 0.5422915416916617, "grad_norm": 5.204622803431674, "learning_rate": 9.455445544554454e-07, "loss": -0.0538, "step": 678 }, { "epoch": 0.5430913817236552, "grad_norm": 5.261361679954622, "learning_rate": 9.438943894389439e-07, "loss": -0.026, "step": 679 }, { "epoch": 0.5438912217556489, "grad_norm": 4.057248991454938, "learning_rate": 9.422442244224422e-07, "loss": -0.0785, "step": 680 }, { "epoch": 0.5446910617876425, "grad_norm": 5.580346613410825, "learning_rate": 9.405940594059405e-07, "loss": -0.0335, "step": 681 }, { "epoch": 0.5454909018196361, "grad_norm": 3.287439521262259, "learning_rate": 9.389438943894389e-07, "loss": -0.014, "step": 682 }, { "epoch": 0.5462907418516296, "grad_norm": 4.383770462305995, "learning_rate": 9.372937293729373e-07, "loss": -0.1349, "step": 683 }, { "epoch": 0.5470905818836233, "grad_norm": 3.570830393151609, "learning_rate": 9.356435643564356e-07, "loss": -0.1603, "step": 684 }, { "epoch": 0.5478904219156169, "grad_norm": 3.9301633755259076, "learning_rate": 9.33993399339934e-07, "loss": -0.0728, "step": 685 }, { "epoch": 0.5486902619476105, "grad_norm": 4.577722525237458, "learning_rate": 9.323432343234323e-07, "loss": -0.0686, "step": 686 }, { "epoch": 0.5494901019796041, "grad_norm": 3.4290177162671704, "learning_rate": 9.306930693069307e-07, "loss": -0.0002, "step": 687 }, { "epoch": 0.5502899420115976, "grad_norm": 3.7664714194362574, "learning_rate": 9.29042904290429e-07, "loss": 0.0181, "step": 688 }, { "epoch": 0.5510897820435913, "grad_norm": 3.53486758576286, "learning_rate": 9.273927392739273e-07, "loss": 0.0046, "step": 689 }, { "epoch": 0.5518896220755849, "grad_norm": 5.533353761076084, "learning_rate": 9.257425742574257e-07, "loss": -0.1142, "step": 690 }, { "epoch": 0.5526894621075785, "grad_norm": 3.618813464410366, "learning_rate": 9.24092409240924e-07, "loss": -0.0533, "step": 691 }, { "epoch": 0.553489302139572, "grad_norm": 3.2399813948203064, "learning_rate": 9.224422442244224e-07, "loss": -0.2655, "step": 692 }, { "epoch": 0.5542891421715657, "grad_norm": 4.777938394039966, "learning_rate": 9.207920792079208e-07, "loss": -0.1287, "step": 693 }, { "epoch": 0.5550889822035593, "grad_norm": 4.753607333838816, "learning_rate": 9.191419141914191e-07, "loss": -0.0197, "step": 694 }, { "epoch": 0.5558888222355529, "grad_norm": 4.574962995202333, "learning_rate": 9.174917491749174e-07, "loss": -0.0882, "step": 695 }, { "epoch": 0.5566886622675465, "grad_norm": 5.716789868568477, "learning_rate": 9.158415841584159e-07, "loss": -0.0978, "step": 696 }, { "epoch": 0.5574885022995401, "grad_norm": 5.1621141783698805, "learning_rate": 9.141914191419141e-07, "loss": -0.0492, "step": 697 }, { "epoch": 0.5582883423315337, "grad_norm": 5.066721910041668, "learning_rate": 9.125412541254125e-07, "loss": -0.1249, "step": 698 }, { "epoch": 0.5590881823635273, "grad_norm": 6.6614345364199, "learning_rate": 9.108910891089109e-07, "loss": -0.0784, "step": 699 }, { "epoch": 0.5598880223955209, "grad_norm": 7.651991800546116, "learning_rate": 9.092409240924092e-07, "loss": 0.0583, "step": 700 }, { "epoch": 0.5606878624275144, "grad_norm": 5.556872832637128, "learning_rate": 9.075907590759075e-07, "loss": -0.0337, "step": 701 }, { "epoch": 0.5614877024595081, "grad_norm": 10.302249877529787, "learning_rate": 9.05940594059406e-07, "loss": -0.0318, "step": 702 }, { "epoch": 0.5622875424915017, "grad_norm": 6.446831319626775, "learning_rate": 9.042904290429042e-07, "loss": -0.032, "step": 703 }, { "epoch": 0.5630873825234953, "grad_norm": 3.1431425262284156, "learning_rate": 9.026402640264025e-07, "loss": -0.1257, "step": 704 }, { "epoch": 0.563887222555489, "grad_norm": 6.8613589813033755, "learning_rate": 9.00990099009901e-07, "loss": -0.0899, "step": 705 }, { "epoch": 0.5646870625874825, "grad_norm": 4.727075571003651, "learning_rate": 8.993399339933992e-07, "loss": -0.0103, "step": 706 }, { "epoch": 0.5654869026194761, "grad_norm": 3.757033554841126, "learning_rate": 8.976897689768976e-07, "loss": -0.1201, "step": 707 }, { "epoch": 0.5662867426514697, "grad_norm": 3.6568326567325586, "learning_rate": 8.96039603960396e-07, "loss": -0.0442, "step": 708 }, { "epoch": 0.5670865826834633, "grad_norm": 4.422118271245446, "learning_rate": 8.943894389438944e-07, "loss": -0.1491, "step": 709 }, { "epoch": 0.5678864227154569, "grad_norm": 4.090528079313399, "learning_rate": 8.927392739273927e-07, "loss": -0.1213, "step": 710 }, { "epoch": 0.5686862627474505, "grad_norm": 6.203118235394611, "learning_rate": 8.910891089108911e-07, "loss": -0.0415, "step": 711 }, { "epoch": 0.5694861027794441, "grad_norm": 4.192997526379617, "learning_rate": 8.894389438943894e-07, "loss": -0.0378, "step": 712 }, { "epoch": 0.5702859428114377, "grad_norm": 4.532567536428998, "learning_rate": 8.877887788778878e-07, "loss": -0.0275, "step": 713 }, { "epoch": 0.5710857828434314, "grad_norm": 4.168577112275988, "learning_rate": 8.861386138613861e-07, "loss": -0.0494, "step": 714 }, { "epoch": 0.5718856228754249, "grad_norm": 7.436541583714728, "learning_rate": 8.844884488448845e-07, "loss": 0.0338, "step": 715 }, { "epoch": 0.5726854629074185, "grad_norm": 4.3341221213745555, "learning_rate": 8.828382838283828e-07, "loss": -0.1824, "step": 716 }, { "epoch": 0.5734853029394121, "grad_norm": 5.774496226610818, "learning_rate": 8.811881188118812e-07, "loss": -0.0764, "step": 717 }, { "epoch": 0.5742851429714058, "grad_norm": 4.53339855951246, "learning_rate": 8.795379537953795e-07, "loss": -0.0494, "step": 718 }, { "epoch": 0.5750849830033993, "grad_norm": 3.7011611817540118, "learning_rate": 8.778877887788778e-07, "loss": -0.1227, "step": 719 }, { "epoch": 0.5758848230353929, "grad_norm": 4.117237995569142, "learning_rate": 8.762376237623762e-07, "loss": -0.0782, "step": 720 }, { "epoch": 0.5766846630673865, "grad_norm": 3.347109381610254, "learning_rate": 8.745874587458745e-07, "loss": -0.1517, "step": 721 }, { "epoch": 0.5774845030993802, "grad_norm": 3.9587273384893447, "learning_rate": 8.729372937293729e-07, "loss": -0.0647, "step": 722 }, { "epoch": 0.5782843431313738, "grad_norm": 4.073013317525639, "learning_rate": 8.712871287128712e-07, "loss": -0.0354, "step": 723 }, { "epoch": 0.5790841831633673, "grad_norm": 4.360786018214523, "learning_rate": 8.696369636963697e-07, "loss": -0.0845, "step": 724 }, { "epoch": 0.5798840231953609, "grad_norm": 4.911325926898916, "learning_rate": 8.679867986798679e-07, "loss": -0.0581, "step": 725 }, { "epoch": 0.5806838632273545, "grad_norm": 7.374565499647674, "learning_rate": 8.663366336633663e-07, "loss": 0.0876, "step": 726 }, { "epoch": 0.5814837032593482, "grad_norm": 5.228899924882716, "learning_rate": 8.646864686468647e-07, "loss": -0.0188, "step": 727 }, { "epoch": 0.5822835432913417, "grad_norm": 5.076190474367137, "learning_rate": 8.63036303630363e-07, "loss": -0.0482, "step": 728 }, { "epoch": 0.5830833833233353, "grad_norm": 3.8391392043031067, "learning_rate": 8.613861386138613e-07, "loss": -0.0351, "step": 729 }, { "epoch": 0.5838832233553289, "grad_norm": 4.720664943150823, "learning_rate": 8.597359735973598e-07, "loss": -0.0919, "step": 730 }, { "epoch": 0.5846830633873226, "grad_norm": 3.696825036479519, "learning_rate": 8.58085808580858e-07, "loss": -0.0491, "step": 731 }, { "epoch": 0.5854829034193162, "grad_norm": 4.957633580857494, "learning_rate": 8.564356435643563e-07, "loss": -0.0084, "step": 732 }, { "epoch": 0.5862827434513097, "grad_norm": 4.145591204807576, "learning_rate": 8.547854785478548e-07, "loss": -0.0777, "step": 733 }, { "epoch": 0.5870825834833033, "grad_norm": 6.008926427229853, "learning_rate": 8.531353135313531e-07, "loss": -0.0285, "step": 734 }, { "epoch": 0.587882423515297, "grad_norm": 6.457940104008622, "learning_rate": 8.514851485148514e-07, "loss": -0.1126, "step": 735 }, { "epoch": 0.5886822635472906, "grad_norm": 4.150286303835989, "learning_rate": 8.498349834983498e-07, "loss": -0.0633, "step": 736 }, { "epoch": 0.5894821035792841, "grad_norm": 4.765817707105298, "learning_rate": 8.481848184818482e-07, "loss": -0.1274, "step": 737 }, { "epoch": 0.5902819436112777, "grad_norm": 2.944307272093047, "learning_rate": 8.465346534653464e-07, "loss": -0.1143, "step": 738 }, { "epoch": 0.5910817836432714, "grad_norm": 4.301068842918969, "learning_rate": 8.448844884488449e-07, "loss": -0.0303, "step": 739 }, { "epoch": 0.591881623675265, "grad_norm": 3.5345181895694724, "learning_rate": 8.432343234323432e-07, "loss": -0.0369, "step": 740 }, { "epoch": 0.5926814637072585, "grad_norm": 3.6758939784537477, "learning_rate": 8.415841584158416e-07, "loss": -0.0435, "step": 741 }, { "epoch": 0.5934813037392521, "grad_norm": 9.280032916175081, "learning_rate": 8.399339933993399e-07, "loss": -0.077, "step": 742 }, { "epoch": 0.5942811437712457, "grad_norm": 3.691324223005634, "learning_rate": 8.382838283828383e-07, "loss": 0.0384, "step": 743 }, { "epoch": 0.5950809838032394, "grad_norm": 5.13228312974853, "learning_rate": 8.366336633663366e-07, "loss": -0.0101, "step": 744 }, { "epoch": 0.595880823835233, "grad_norm": 3.127958499467288, "learning_rate": 8.34983498349835e-07, "loss": 0.0036, "step": 745 }, { "epoch": 0.5966806638672265, "grad_norm": 4.371581567674568, "learning_rate": 8.333333333333333e-07, "loss": -0.088, "step": 746 }, { "epoch": 0.5974805038992201, "grad_norm": 3.7498910252313786, "learning_rate": 8.316831683168316e-07, "loss": -0.0249, "step": 747 }, { "epoch": 0.5982803439312138, "grad_norm": 9.554152491664782, "learning_rate": 8.3003300330033e-07, "loss": -0.0718, "step": 748 }, { "epoch": 0.5990801839632074, "grad_norm": 4.450073267301403, "learning_rate": 8.283828382838283e-07, "loss": -0.038, "step": 749 }, { "epoch": 0.5998800239952009, "grad_norm": 4.885796549111672, "learning_rate": 8.267326732673267e-07, "loss": -0.0279, "step": 750 }, { "epoch": 0.6006798640271945, "grad_norm": 4.729710058959354, "learning_rate": 8.25082508250825e-07, "loss": -0.0415, "step": 751 }, { "epoch": 0.6014797040591882, "grad_norm": 5.310403387692374, "learning_rate": 8.234323432343234e-07, "loss": -0.1964, "step": 752 }, { "epoch": 0.6022795440911818, "grad_norm": 4.462129183196535, "learning_rate": 8.217821782178217e-07, "loss": -0.0289, "step": 753 }, { "epoch": 0.6030793841231754, "grad_norm": 4.343559206058792, "learning_rate": 8.201320132013201e-07, "loss": 0.0155, "step": 754 }, { "epoch": 0.6038792241551689, "grad_norm": 7.14111860643498, "learning_rate": 8.184818481848184e-07, "loss": 0.076, "step": 755 }, { "epoch": 0.6046790641871626, "grad_norm": 5.741261351757093, "learning_rate": 8.168316831683168e-07, "loss": -0.0684, "step": 756 }, { "epoch": 0.6054789042191562, "grad_norm": 4.841100743497433, "learning_rate": 8.151815181518151e-07, "loss": 0.0251, "step": 757 }, { "epoch": 0.6062787442511498, "grad_norm": 5.776828704222559, "learning_rate": 8.135313531353136e-07, "loss": 0.0922, "step": 758 }, { "epoch": 0.6070785842831433, "grad_norm": 5.171546395456714, "learning_rate": 8.118811881188119e-07, "loss": 0.0378, "step": 759 }, { "epoch": 0.607878424315137, "grad_norm": 4.5497593071961475, "learning_rate": 8.102310231023102e-07, "loss": -0.0091, "step": 760 }, { "epoch": 0.6086782643471306, "grad_norm": 3.4008260116242837, "learning_rate": 8.085808580858086e-07, "loss": -0.0513, "step": 761 }, { "epoch": 0.6094781043791242, "grad_norm": 5.9528170929025475, "learning_rate": 8.069306930693069e-07, "loss": -0.0846, "step": 762 }, { "epoch": 0.6102779444111178, "grad_norm": 8.59254741230532, "learning_rate": 8.052805280528052e-07, "loss": 0.0454, "step": 763 }, { "epoch": 0.6110777844431113, "grad_norm": 5.92388092933109, "learning_rate": 8.036303630363036e-07, "loss": -0.0677, "step": 764 }, { "epoch": 0.611877624475105, "grad_norm": 4.5071343981279375, "learning_rate": 8.01980198019802e-07, "loss": -0.0598, "step": 765 }, { "epoch": 0.6126774645070986, "grad_norm": 5.095952967655762, "learning_rate": 8.003300330033002e-07, "loss": -0.1077, "step": 766 }, { "epoch": 0.6134773045390922, "grad_norm": 3.686767117360266, "learning_rate": 7.986798679867987e-07, "loss": -0.0509, "step": 767 }, { "epoch": 0.6142771445710857, "grad_norm": 4.709313867244328, "learning_rate": 7.97029702970297e-07, "loss": -0.0384, "step": 768 }, { "epoch": 0.6150769846030794, "grad_norm": 4.606485786769665, "learning_rate": 7.953795379537953e-07, "loss": -0.0639, "step": 769 }, { "epoch": 0.615876824635073, "grad_norm": 4.126308811511686, "learning_rate": 7.937293729372937e-07, "loss": -0.1454, "step": 770 }, { "epoch": 0.6166766646670666, "grad_norm": 5.0891862328923985, "learning_rate": 7.920792079207921e-07, "loss": 0.0552, "step": 771 }, { "epoch": 0.6174765046990602, "grad_norm": 4.348447825302712, "learning_rate": 7.904290429042903e-07, "loss": -0.0914, "step": 772 }, { "epoch": 0.6182763447310538, "grad_norm": 4.069062786121328, "learning_rate": 7.887788778877888e-07, "loss": -0.0, "step": 773 }, { "epoch": 0.6190761847630474, "grad_norm": 4.293180451800697, "learning_rate": 7.871287128712871e-07, "loss": -0.2021, "step": 774 }, { "epoch": 0.619876024795041, "grad_norm": 6.832958614340714, "learning_rate": 7.854785478547854e-07, "loss": -0.049, "step": 775 }, { "epoch": 0.6206758648270346, "grad_norm": 4.62295713929861, "learning_rate": 7.838283828382838e-07, "loss": -0.0099, "step": 776 }, { "epoch": 0.6214757048590281, "grad_norm": 16.021103293232393, "learning_rate": 7.821782178217821e-07, "loss": -0.0229, "step": 777 }, { "epoch": 0.6222755448910218, "grad_norm": 6.878826117062365, "learning_rate": 7.805280528052805e-07, "loss": 0.0852, "step": 778 }, { "epoch": 0.6230753849230154, "grad_norm": 3.5925006851057595, "learning_rate": 7.788778877887788e-07, "loss": -0.0838, "step": 779 }, { "epoch": 0.623875224955009, "grad_norm": 4.312199015410901, "learning_rate": 7.772277227722772e-07, "loss": -0.1938, "step": 780 }, { "epoch": 0.6246750649870026, "grad_norm": 4.427437777822434, "learning_rate": 7.755775577557755e-07, "loss": -0.1088, "step": 781 }, { "epoch": 0.6254749050189962, "grad_norm": 3.391720502521415, "learning_rate": 7.739273927392739e-07, "loss": -0.1204, "step": 782 }, { "epoch": 0.6262747450509898, "grad_norm": 3.3125718306904512, "learning_rate": 7.722772277227722e-07, "loss": -0.0828, "step": 783 }, { "epoch": 0.6270745850829834, "grad_norm": 5.402117257600779, "learning_rate": 7.706270627062707e-07, "loss": -0.054, "step": 784 }, { "epoch": 0.627874425114977, "grad_norm": 4.489840589382479, "learning_rate": 7.689768976897689e-07, "loss": -0.0368, "step": 785 }, { "epoch": 0.6286742651469706, "grad_norm": 3.848546702562119, "learning_rate": 7.673267326732673e-07, "loss": -0.0842, "step": 786 }, { "epoch": 0.6294741051789642, "grad_norm": 4.3003301981629765, "learning_rate": 7.656765676567657e-07, "loss": -0.1575, "step": 787 }, { "epoch": 0.6302739452109578, "grad_norm": 3.234095759477404, "learning_rate": 7.64026402640264e-07, "loss": -0.1448, "step": 788 }, { "epoch": 0.6310737852429514, "grad_norm": 4.587363650091271, "learning_rate": 7.623762376237624e-07, "loss": -0.0806, "step": 789 }, { "epoch": 0.6318736252749451, "grad_norm": 4.91701458923025, "learning_rate": 7.607260726072607e-07, "loss": 0.0289, "step": 790 }, { "epoch": 0.6326734653069386, "grad_norm": 4.26117605640356, "learning_rate": 7.59075907590759e-07, "loss": -0.0442, "step": 791 }, { "epoch": 0.6334733053389322, "grad_norm": 4.714328336316601, "learning_rate": 7.574257425742574e-07, "loss": -0.1182, "step": 792 }, { "epoch": 0.6342731453709258, "grad_norm": 4.8043256421789975, "learning_rate": 7.557755775577558e-07, "loss": -0.0532, "step": 793 }, { "epoch": 0.6350729854029195, "grad_norm": 4.981158806362152, "learning_rate": 7.54125412541254e-07, "loss": -0.0322, "step": 794 }, { "epoch": 0.635872825434913, "grad_norm": 6.600526915815604, "learning_rate": 7.524752475247525e-07, "loss": -0.1059, "step": 795 }, { "epoch": 0.6366726654669066, "grad_norm": 4.989184316503347, "learning_rate": 7.508250825082508e-07, "loss": -0.0888, "step": 796 }, { "epoch": 0.6374725054989002, "grad_norm": 4.683126825596764, "learning_rate": 7.491749174917491e-07, "loss": -0.0425, "step": 797 }, { "epoch": 0.6382723455308938, "grad_norm": 2.835338855117646, "learning_rate": 7.475247524752475e-07, "loss": -0.1188, "step": 798 }, { "epoch": 0.6390721855628875, "grad_norm": 3.334782349034145, "learning_rate": 7.458745874587459e-07, "loss": -0.0085, "step": 799 }, { "epoch": 0.639872025594881, "grad_norm": 3.8950834045490677, "learning_rate": 7.442244224422441e-07, "loss": -0.0866, "step": 800 }, { "epoch": 0.6406718656268746, "grad_norm": 3.7954504535302047, "learning_rate": 7.425742574257426e-07, "loss": -0.1718, "step": 801 }, { "epoch": 0.6414717056588682, "grad_norm": 3.9147882759458907, "learning_rate": 7.409240924092409e-07, "loss": -0.1172, "step": 802 }, { "epoch": 0.6422715456908619, "grad_norm": 4.14195212922553, "learning_rate": 7.392739273927392e-07, "loss": -0.2055, "step": 803 }, { "epoch": 0.6430713857228554, "grad_norm": 5.564167929906255, "learning_rate": 7.376237623762376e-07, "loss": -0.0587, "step": 804 }, { "epoch": 0.643871225754849, "grad_norm": 5.104003509197404, "learning_rate": 7.359735973597359e-07, "loss": -0.1599, "step": 805 }, { "epoch": 0.6446710657868426, "grad_norm": 3.249110466926901, "learning_rate": 7.343234323432343e-07, "loss": 0.0254, "step": 806 }, { "epoch": 0.6454709058188363, "grad_norm": 5.330488201062819, "learning_rate": 7.326732673267326e-07, "loss": -0.0932, "step": 807 }, { "epoch": 0.6462707458508299, "grad_norm": 3.391881050876262, "learning_rate": 7.31023102310231e-07, "loss": -0.0045, "step": 808 }, { "epoch": 0.6470705858828234, "grad_norm": 4.729176906400958, "learning_rate": 7.293729372937293e-07, "loss": 0.0309, "step": 809 }, { "epoch": 0.647870425914817, "grad_norm": 4.873305784391995, "learning_rate": 7.277227722772277e-07, "loss": -0.016, "step": 810 }, { "epoch": 0.6486702659468107, "grad_norm": 4.737325724513948, "learning_rate": 7.26072607260726e-07, "loss": -0.0555, "step": 811 }, { "epoch": 0.6494701059788043, "grad_norm": 4.700178573137915, "learning_rate": 7.244224422442245e-07, "loss": -0.0054, "step": 812 }, { "epoch": 0.6502699460107978, "grad_norm": 3.8303301007119375, "learning_rate": 7.227722772277227e-07, "loss": -0.0237, "step": 813 }, { "epoch": 0.6510697860427914, "grad_norm": 3.9787912774229404, "learning_rate": 7.211221122112211e-07, "loss": -0.0158, "step": 814 }, { "epoch": 0.651869626074785, "grad_norm": 4.3995808661843805, "learning_rate": 7.194719471947195e-07, "loss": 0.0276, "step": 815 }, { "epoch": 0.6526694661067787, "grad_norm": 4.052268422009291, "learning_rate": 7.178217821782178e-07, "loss": -0.0178, "step": 816 }, { "epoch": 0.6534693061387723, "grad_norm": 4.162149014725009, "learning_rate": 7.161716171617161e-07, "loss": -0.0343, "step": 817 }, { "epoch": 0.6542691461707658, "grad_norm": 4.733808124710197, "learning_rate": 7.145214521452146e-07, "loss": -0.0719, "step": 818 }, { "epoch": 0.6550689862027594, "grad_norm": 4.367810691084756, "learning_rate": 7.128712871287128e-07, "loss": -0.1031, "step": 819 }, { "epoch": 0.6558688262347531, "grad_norm": 3.393663864026175, "learning_rate": 7.112211221122111e-07, "loss": -0.0469, "step": 820 }, { "epoch": 0.6566686662667467, "grad_norm": 6.593078579096907, "learning_rate": 7.095709570957096e-07, "loss": -0.0492, "step": 821 }, { "epoch": 0.6574685062987402, "grad_norm": 5.90593512770552, "learning_rate": 7.079207920792078e-07, "loss": 0.017, "step": 822 }, { "epoch": 0.6582683463307338, "grad_norm": 4.280214281867913, "learning_rate": 7.062706270627063e-07, "loss": -0.0432, "step": 823 }, { "epoch": 0.6590681863627275, "grad_norm": 8.423741404535653, "learning_rate": 7.046204620462046e-07, "loss": -0.0291, "step": 824 }, { "epoch": 0.6598680263947211, "grad_norm": 5.038317153573228, "learning_rate": 7.029702970297029e-07, "loss": -0.093, "step": 825 }, { "epoch": 0.6606678664267147, "grad_norm": 6.706344720432834, "learning_rate": 7.013201320132013e-07, "loss": -0.1315, "step": 826 }, { "epoch": 0.6614677064587082, "grad_norm": 3.716489601764274, "learning_rate": 6.996699669966997e-07, "loss": -0.0825, "step": 827 }, { "epoch": 0.6622675464907019, "grad_norm": 4.158836764412884, "learning_rate": 6.980198019801979e-07, "loss": -0.0161, "step": 828 }, { "epoch": 0.6630673865226955, "grad_norm": 3.733283485919958, "learning_rate": 6.963696369636964e-07, "loss": -0.0594, "step": 829 }, { "epoch": 0.6638672265546891, "grad_norm": 6.968567713247902, "learning_rate": 6.947194719471947e-07, "loss": -0.0441, "step": 830 }, { "epoch": 0.6646670665866826, "grad_norm": 5.008730323701448, "learning_rate": 6.93069306930693e-07, "loss": -0.1307, "step": 831 }, { "epoch": 0.6654669066186762, "grad_norm": 4.2175650733942955, "learning_rate": 6.914191419141914e-07, "loss": -0.0771, "step": 832 }, { "epoch": 0.6662667466506699, "grad_norm": 5.402322742920563, "learning_rate": 6.897689768976897e-07, "loss": -0.0841, "step": 833 }, { "epoch": 0.6670665866826635, "grad_norm": 5.646942573991696, "learning_rate": 6.88118811881188e-07, "loss": -0.0622, "step": 834 }, { "epoch": 0.667866426714657, "grad_norm": 13.649360926832344, "learning_rate": 6.864686468646864e-07, "loss": -0.0637, "step": 835 }, { "epoch": 0.6686662667466506, "grad_norm": 5.461268948386568, "learning_rate": 6.848184818481848e-07, "loss": -0.0241, "step": 836 }, { "epoch": 0.6694661067786443, "grad_norm": 4.011621520471584, "learning_rate": 6.831683168316831e-07, "loss": -0.0422, "step": 837 }, { "epoch": 0.6702659468106379, "grad_norm": 20.835014010983784, "learning_rate": 6.815181518151815e-07, "loss": -0.126, "step": 838 }, { "epoch": 0.6710657868426315, "grad_norm": 5.041368060556288, "learning_rate": 6.798679867986798e-07, "loss": -0.1016, "step": 839 }, { "epoch": 0.671865626874625, "grad_norm": 5.834292995896152, "learning_rate": 6.782178217821783e-07, "loss": -0.0738, "step": 840 }, { "epoch": 0.6726654669066187, "grad_norm": 3.5120723151753985, "learning_rate": 6.765676567656765e-07, "loss": -0.0504, "step": 841 }, { "epoch": 0.6734653069386123, "grad_norm": 2.901517891733533, "learning_rate": 6.749174917491749e-07, "loss": -0.0698, "step": 842 }, { "epoch": 0.6742651469706059, "grad_norm": 3.465841087435974, "learning_rate": 6.732673267326733e-07, "loss": -0.1227, "step": 843 }, { "epoch": 0.6750649870025994, "grad_norm": 4.536588693958206, "learning_rate": 6.716171617161716e-07, "loss": -0.0602, "step": 844 }, { "epoch": 0.675864827034593, "grad_norm": 10.903656834330391, "learning_rate": 6.699669966996699e-07, "loss": -0.1289, "step": 845 }, { "epoch": 0.6766646670665867, "grad_norm": 7.296365266758308, "learning_rate": 6.683168316831684e-07, "loss": -0.1561, "step": 846 }, { "epoch": 0.6774645070985803, "grad_norm": 4.412331570876947, "learning_rate": 6.666666666666666e-07, "loss": -0.0958, "step": 847 }, { "epoch": 0.6782643471305739, "grad_norm": 2.8672230897612345, "learning_rate": 6.650165016501649e-07, "loss": -0.1643, "step": 848 }, { "epoch": 0.6790641871625674, "grad_norm": 4.5674033793568904, "learning_rate": 6.633663366336634e-07, "loss": -0.1336, "step": 849 }, { "epoch": 0.6798640271945611, "grad_norm": 4.381540695320094, "learning_rate": 6.617161716171616e-07, "loss": -0.0132, "step": 850 }, { "epoch": 0.6806638672265547, "grad_norm": 5.664059132542103, "learning_rate": 6.6006600660066e-07, "loss": -0.0285, "step": 851 }, { "epoch": 0.6814637072585483, "grad_norm": 5.7461780617115, "learning_rate": 6.584158415841584e-07, "loss": 0.0047, "step": 852 }, { "epoch": 0.6822635472905418, "grad_norm": 4.320720191192789, "learning_rate": 6.567656765676567e-07, "loss": -0.1447, "step": 853 }, { "epoch": 0.6830633873225355, "grad_norm": 3.108737285911658, "learning_rate": 6.55115511551155e-07, "loss": -0.1611, "step": 854 }, { "epoch": 0.6838632273545291, "grad_norm": 5.023876212557061, "learning_rate": 6.534653465346535e-07, "loss": -0.1529, "step": 855 }, { "epoch": 0.6846630673865227, "grad_norm": 4.545192341611211, "learning_rate": 6.518151815181517e-07, "loss": -0.0456, "step": 856 }, { "epoch": 0.6854629074185163, "grad_norm": 5.267884265007784, "learning_rate": 6.501650165016502e-07, "loss": -0.0288, "step": 857 }, { "epoch": 0.6862627474505099, "grad_norm": 5.101939165542976, "learning_rate": 6.485148514851485e-07, "loss": -0.0036, "step": 858 }, { "epoch": 0.6870625874825035, "grad_norm": 3.55089136672625, "learning_rate": 6.468646864686468e-07, "loss": -0.0762, "step": 859 }, { "epoch": 0.6878624275144971, "grad_norm": 4.121292066096188, "learning_rate": 6.452145214521452e-07, "loss": -0.0858, "step": 860 }, { "epoch": 0.6886622675464907, "grad_norm": 4.411771304555708, "learning_rate": 6.435643564356436e-07, "loss": -0.0359, "step": 861 }, { "epoch": 0.6894621075784843, "grad_norm": 4.163024887578695, "learning_rate": 6.419141914191419e-07, "loss": -0.1036, "step": 862 }, { "epoch": 0.6902619476104779, "grad_norm": 3.9253552561550307, "learning_rate": 6.402640264026402e-07, "loss": -0.092, "step": 863 }, { "epoch": 0.6910617876424715, "grad_norm": 3.3756432096953906, "learning_rate": 6.386138613861386e-07, "loss": 0.0002, "step": 864 }, { "epoch": 0.6918616276744651, "grad_norm": 4.4942197763700245, "learning_rate": 6.369636963696369e-07, "loss": 0.0044, "step": 865 }, { "epoch": 0.6926614677064588, "grad_norm": 4.469003096662444, "learning_rate": 6.353135313531353e-07, "loss": -0.0027, "step": 866 }, { "epoch": 0.6934613077384523, "grad_norm": 3.212779461175308, "learning_rate": 6.336633663366336e-07, "loss": -0.0255, "step": 867 }, { "epoch": 0.6942611477704459, "grad_norm": 3.473986685130551, "learning_rate": 6.32013201320132e-07, "loss": -0.0171, "step": 868 }, { "epoch": 0.6950609878024395, "grad_norm": 4.216176017792983, "learning_rate": 6.303630363036303e-07, "loss": -0.0673, "step": 869 }, { "epoch": 0.6958608278344331, "grad_norm": 12.098304541476889, "learning_rate": 6.287128712871287e-07, "loss": 0.0742, "step": 870 }, { "epoch": 0.6966606678664267, "grad_norm": 8.083305542363899, "learning_rate": 6.270627062706271e-07, "loss": 0.0401, "step": 871 }, { "epoch": 0.6974605078984203, "grad_norm": 4.685734734341067, "learning_rate": 6.254125412541254e-07, "loss": -0.0235, "step": 872 }, { "epoch": 0.6982603479304139, "grad_norm": 5.85956379904162, "learning_rate": 6.237623762376237e-07, "loss": -0.1556, "step": 873 }, { "epoch": 0.6990601879624075, "grad_norm": 3.61321328588953, "learning_rate": 6.221122112211222e-07, "loss": -0.093, "step": 874 }, { "epoch": 0.6998600279944012, "grad_norm": 3.528560458155755, "learning_rate": 6.204620462046204e-07, "loss": -0.0322, "step": 875 }, { "epoch": 0.7006598680263947, "grad_norm": 6.51982029204985, "learning_rate": 6.188118811881187e-07, "loss": 0.0667, "step": 876 }, { "epoch": 0.7014597080583883, "grad_norm": 4.530213336089527, "learning_rate": 6.171617161716172e-07, "loss": -0.0434, "step": 877 }, { "epoch": 0.7022595480903819, "grad_norm": 6.616076811947681, "learning_rate": 6.155115511551154e-07, "loss": 0.0219, "step": 878 }, { "epoch": 0.7030593881223756, "grad_norm": 4.1562171877376075, "learning_rate": 6.138613861386138e-07, "loss": -0.0648, "step": 879 }, { "epoch": 0.7038592281543691, "grad_norm": 4.419647626337558, "learning_rate": 6.122112211221122e-07, "loss": -0.1112, "step": 880 }, { "epoch": 0.7046590681863627, "grad_norm": 4.940388667457944, "learning_rate": 6.105610561056105e-07, "loss": -0.0162, "step": 881 }, { "epoch": 0.7054589082183563, "grad_norm": 6.023947292183416, "learning_rate": 6.089108910891088e-07, "loss": -0.0435, "step": 882 }, { "epoch": 0.70625874825035, "grad_norm": 7.1901426018379935, "learning_rate": 6.072607260726073e-07, "loss": -0.001, "step": 883 }, { "epoch": 0.7070585882823436, "grad_norm": 5.858167518812244, "learning_rate": 6.056105610561055e-07, "loss": -0.0017, "step": 884 }, { "epoch": 0.7078584283143371, "grad_norm": 4.6643313740940835, "learning_rate": 6.03960396039604e-07, "loss": -0.0808, "step": 885 }, { "epoch": 0.7086582683463307, "grad_norm": 3.964900755824796, "learning_rate": 6.023102310231023e-07, "loss": -0.1257, "step": 886 }, { "epoch": 0.7094581083783243, "grad_norm": 3.6312620074127797, "learning_rate": 6.006600660066007e-07, "loss": -0.044, "step": 887 }, { "epoch": 0.710257948410318, "grad_norm": 4.46804227760141, "learning_rate": 5.99009900990099e-07, "loss": -0.0864, "step": 888 }, { "epoch": 0.7110577884423115, "grad_norm": 3.9877331513072884, "learning_rate": 5.973597359735974e-07, "loss": -0.0555, "step": 889 }, { "epoch": 0.7118576284743051, "grad_norm": 5.71452000001456, "learning_rate": 5.957095709570957e-07, "loss": -0.0516, "step": 890 }, { "epoch": 0.7126574685062987, "grad_norm": 4.729225707280767, "learning_rate": 5.94059405940594e-07, "loss": -0.1252, "step": 891 }, { "epoch": 0.7134573085382924, "grad_norm": 10.531702048903348, "learning_rate": 5.924092409240924e-07, "loss": -0.0143, "step": 892 }, { "epoch": 0.714257148570286, "grad_norm": 4.337061940699697, "learning_rate": 5.907590759075907e-07, "loss": -0.0256, "step": 893 }, { "epoch": 0.7150569886022795, "grad_norm": 4.553139268045056, "learning_rate": 5.891089108910891e-07, "loss": 0.0592, "step": 894 }, { "epoch": 0.7158568286342731, "grad_norm": 3.65289282929829, "learning_rate": 5.874587458745874e-07, "loss": 0.0011, "step": 895 }, { "epoch": 0.7166566686662668, "grad_norm": 2.969949901428, "learning_rate": 5.858085808580858e-07, "loss": -0.1288, "step": 896 }, { "epoch": 0.7174565086982604, "grad_norm": 5.316965178875907, "learning_rate": 5.841584158415841e-07, "loss": -0.0688, "step": 897 }, { "epoch": 0.7182563487302539, "grad_norm": 6.112915886146603, "learning_rate": 5.825082508250825e-07, "loss": -0.1923, "step": 898 }, { "epoch": 0.7190561887622475, "grad_norm": 4.072265156624673, "learning_rate": 5.808580858085808e-07, "loss": -0.1749, "step": 899 }, { "epoch": 0.7198560287942412, "grad_norm": 4.286524287381163, "learning_rate": 5.792079207920792e-07, "loss": -0.0736, "step": 900 }, { "epoch": 0.7206558688262348, "grad_norm": 6.654813369667659, "learning_rate": 5.775577557755775e-07, "loss": -0.1185, "step": 901 }, { "epoch": 0.7214557088582284, "grad_norm": 5.075962580453491, "learning_rate": 5.75907590759076e-07, "loss": 0.0889, "step": 902 }, { "epoch": 0.7222555488902219, "grad_norm": 5.581410015072146, "learning_rate": 5.742574257425742e-07, "loss": -0.0718, "step": 903 }, { "epoch": 0.7230553889222155, "grad_norm": 7.760040178489886, "learning_rate": 5.726072607260726e-07, "loss": -0.0227, "step": 904 }, { "epoch": 0.7238552289542092, "grad_norm": 4.491264765964933, "learning_rate": 5.70957095709571e-07, "loss": -0.0838, "step": 905 }, { "epoch": 0.7246550689862028, "grad_norm": 4.876358038696258, "learning_rate": 5.693069306930692e-07, "loss": -0.172, "step": 906 }, { "epoch": 0.7254549090181963, "grad_norm": 3.9354286195012422, "learning_rate": 5.676567656765676e-07, "loss": -0.0793, "step": 907 }, { "epoch": 0.7262547490501899, "grad_norm": 3.51016598192195, "learning_rate": 5.66006600660066e-07, "loss": 0.0052, "step": 908 }, { "epoch": 0.7270545890821836, "grad_norm": 3.6895775842146166, "learning_rate": 5.643564356435643e-07, "loss": -0.0167, "step": 909 }, { "epoch": 0.7278544291141772, "grad_norm": 4.773443293841103, "learning_rate": 5.627062706270626e-07, "loss": -0.0359, "step": 910 }, { "epoch": 0.7286542691461708, "grad_norm": 4.278237278247243, "learning_rate": 5.610561056105611e-07, "loss": -0.054, "step": 911 }, { "epoch": 0.7294541091781643, "grad_norm": 10.27415077431224, "learning_rate": 5.594059405940594e-07, "loss": -0.0075, "step": 912 }, { "epoch": 0.730253949210158, "grad_norm": 4.2766288821859755, "learning_rate": 5.577557755775577e-07, "loss": 0.0493, "step": 913 }, { "epoch": 0.7310537892421516, "grad_norm": 3.5281016400546275, "learning_rate": 5.561056105610561e-07, "loss": -0.0766, "step": 914 }, { "epoch": 0.7318536292741452, "grad_norm": 3.8674946364382223, "learning_rate": 5.544554455445545e-07, "loss": -0.0442, "step": 915 }, { "epoch": 0.7326534693061387, "grad_norm": 5.4465871711884395, "learning_rate": 5.528052805280527e-07, "loss": -0.0841, "step": 916 }, { "epoch": 0.7334533093381324, "grad_norm": 3.530741427097772, "learning_rate": 5.511551155115512e-07, "loss": -0.0629, "step": 917 }, { "epoch": 0.734253149370126, "grad_norm": 3.965321298788348, "learning_rate": 5.495049504950495e-07, "loss": -0.0573, "step": 918 }, { "epoch": 0.7350529894021196, "grad_norm": 4.295307109186891, "learning_rate": 5.478547854785477e-07, "loss": -0.1381, "step": 919 }, { "epoch": 0.7358528294341132, "grad_norm": 3.8500617084264257, "learning_rate": 5.462046204620462e-07, "loss": -0.049, "step": 920 }, { "epoch": 0.7366526694661067, "grad_norm": 4.8663143152337005, "learning_rate": 5.445544554455445e-07, "loss": -0.0553, "step": 921 }, { "epoch": 0.7374525094981004, "grad_norm": 3.4389426238655476, "learning_rate": 5.429042904290429e-07, "loss": -0.1896, "step": 922 }, { "epoch": 0.738252349530094, "grad_norm": 5.019665193069423, "learning_rate": 5.412541254125412e-07, "loss": 0.1007, "step": 923 }, { "epoch": 0.7390521895620876, "grad_norm": 3.9100586900916126, "learning_rate": 5.396039603960396e-07, "loss": -0.0697, "step": 924 }, { "epoch": 0.7398520295940811, "grad_norm": 3.919136094051066, "learning_rate": 5.379537953795379e-07, "loss": -0.0907, "step": 925 }, { "epoch": 0.7406518696260748, "grad_norm": 4.124863593418168, "learning_rate": 5.363036303630363e-07, "loss": -0.0569, "step": 926 }, { "epoch": 0.7414517096580684, "grad_norm": 3.9668145454046977, "learning_rate": 5.346534653465346e-07, "loss": -0.0045, "step": 927 }, { "epoch": 0.742251549690062, "grad_norm": 4.794421236275003, "learning_rate": 5.33003300330033e-07, "loss": -0.1284, "step": 928 }, { "epoch": 0.7430513897220556, "grad_norm": 5.586609443850984, "learning_rate": 5.313531353135313e-07, "loss": -0.0346, "step": 929 }, { "epoch": 0.7438512297540492, "grad_norm": 5.408030486873039, "learning_rate": 5.297029702970297e-07, "loss": -0.1473, "step": 930 }, { "epoch": 0.7446510697860428, "grad_norm": 3.506888694865617, "learning_rate": 5.28052805280528e-07, "loss": -0.0018, "step": 931 }, { "epoch": 0.7454509098180364, "grad_norm": 4.703351899310227, "learning_rate": 5.264026402640264e-07, "loss": 0.0077, "step": 932 }, { "epoch": 0.74625074985003, "grad_norm": 5.030173808558858, "learning_rate": 5.247524752475247e-07, "loss": -0.1189, "step": 933 }, { "epoch": 0.7470505898820236, "grad_norm": 6.14459022838033, "learning_rate": 5.23102310231023e-07, "loss": 0.0135, "step": 934 }, { "epoch": 0.7478504299140172, "grad_norm": 3.162063833925453, "learning_rate": 5.214521452145214e-07, "loss": -0.0648, "step": 935 }, { "epoch": 0.7486502699460108, "grad_norm": 3.322990934843452, "learning_rate": 5.198019801980198e-07, "loss": -0.0444, "step": 936 }, { "epoch": 0.7494501099780044, "grad_norm": 7.281580577762579, "learning_rate": 5.181518151815182e-07, "loss": -0.079, "step": 937 }, { "epoch": 0.750249950009998, "grad_norm": 4.995689441346887, "learning_rate": 5.165016501650164e-07, "loss": -0.0494, "step": 938 }, { "epoch": 0.7510497900419916, "grad_norm": 3.662981016059356, "learning_rate": 5.148514851485149e-07, "loss": -0.1306, "step": 939 }, { "epoch": 0.7518496300739852, "grad_norm": 4.564675844346983, "learning_rate": 5.132013201320132e-07, "loss": 0.0849, "step": 940 }, { "epoch": 0.7526494701059788, "grad_norm": 4.127114880881102, "learning_rate": 5.115511551155115e-07, "loss": -0.1003, "step": 941 }, { "epoch": 0.7534493101379725, "grad_norm": 7.580284959172436, "learning_rate": 5.099009900990099e-07, "loss": 0.0127, "step": 942 }, { "epoch": 0.754249150169966, "grad_norm": 3.8477764594959223, "learning_rate": 5.082508250825083e-07, "loss": -0.0247, "step": 943 }, { "epoch": 0.7550489902019596, "grad_norm": 5.134480523239938, "learning_rate": 5.066006600660065e-07, "loss": -0.064, "step": 944 }, { "epoch": 0.7558488302339532, "grad_norm": 4.184980403074917, "learning_rate": 5.04950495049505e-07, "loss": -0.0374, "step": 945 }, { "epoch": 0.7566486702659468, "grad_norm": 4.095109087896494, "learning_rate": 5.033003300330033e-07, "loss": -0.0667, "step": 946 }, { "epoch": 0.7574485102979404, "grad_norm": 4.493509273952016, "learning_rate": 5.016501650165016e-07, "loss": -0.046, "step": 947 }, { "epoch": 0.758248350329934, "grad_norm": 4.37283031614021, "learning_rate": 5e-07, "loss": -0.0662, "step": 948 }, { "epoch": 0.7590481903619276, "grad_norm": 4.509842387488209, "learning_rate": 4.983498349834983e-07, "loss": -0.0951, "step": 949 }, { "epoch": 0.7598480303939212, "grad_norm": 5.100099502789577, "learning_rate": 4.966996699669966e-07, "loss": -0.1086, "step": 950 }, { "epoch": 0.7606478704259149, "grad_norm": 4.034320460393353, "learning_rate": 4.95049504950495e-07, "loss": -0.0263, "step": 951 }, { "epoch": 0.7614477104579084, "grad_norm": 4.593555887493731, "learning_rate": 4.933993399339933e-07, "loss": -0.1636, "step": 952 }, { "epoch": 0.762247550489902, "grad_norm": 4.526550802808929, "learning_rate": 4.917491749174918e-07, "loss": -0.0163, "step": 953 }, { "epoch": 0.7630473905218956, "grad_norm": 5.515431862710494, "learning_rate": 4.900990099009901e-07, "loss": 0.0348, "step": 954 }, { "epoch": 0.7638472305538893, "grad_norm": 4.2009128577609145, "learning_rate": 4.884488448844884e-07, "loss": -0.2542, "step": 955 }, { "epoch": 0.7646470705858828, "grad_norm": 4.271019064733657, "learning_rate": 4.867986798679868e-07, "loss": -0.1869, "step": 956 }, { "epoch": 0.7654469106178764, "grad_norm": 2.77377368877347, "learning_rate": 4.851485148514851e-07, "loss": -0.0735, "step": 957 }, { "epoch": 0.76624675064987, "grad_norm": 6.0608792055595995, "learning_rate": 4.834983498349835e-07, "loss": -0.0099, "step": 958 }, { "epoch": 0.7670465906818636, "grad_norm": 4.371986441183748, "learning_rate": 4.818481848184819e-07, "loss": -0.0748, "step": 959 }, { "epoch": 0.7678464307138573, "grad_norm": 3.611071563569357, "learning_rate": 4.801980198019802e-07, "loss": -0.085, "step": 960 }, { "epoch": 0.7686462707458508, "grad_norm": 5.1052303944455915, "learning_rate": 4.785478547854785e-07, "loss": 0.0156, "step": 961 }, { "epoch": 0.7694461107778444, "grad_norm": 4.3911870956367505, "learning_rate": 4.768976897689769e-07, "loss": -0.0685, "step": 962 }, { "epoch": 0.770245950809838, "grad_norm": 4.259527634413816, "learning_rate": 4.752475247524752e-07, "loss": -0.0837, "step": 963 }, { "epoch": 0.7710457908418317, "grad_norm": 4.970224413650343, "learning_rate": 4.735973597359736e-07, "loss": -0.1512, "step": 964 }, { "epoch": 0.7718456308738252, "grad_norm": 5.7445596582962395, "learning_rate": 4.7194719471947193e-07, "loss": -0.0713, "step": 965 }, { "epoch": 0.7726454709058188, "grad_norm": 4.543014932858551, "learning_rate": 4.7029702970297026e-07, "loss": -0.1337, "step": 966 }, { "epoch": 0.7734453109378124, "grad_norm": 4.451634421927441, "learning_rate": 4.6864686468646865e-07, "loss": -0.1269, "step": 967 }, { "epoch": 0.7742451509698061, "grad_norm": 5.379063157263456, "learning_rate": 4.66996699669967e-07, "loss": -0.0862, "step": 968 }, { "epoch": 0.7750449910017997, "grad_norm": 10.70668137767473, "learning_rate": 4.6534653465346537e-07, "loss": 0.0881, "step": 969 }, { "epoch": 0.7758448310337932, "grad_norm": 3.7167032202960177, "learning_rate": 4.6369636963696365e-07, "loss": -0.017, "step": 970 }, { "epoch": 0.7766446710657868, "grad_norm": 4.634801839565637, "learning_rate": 4.62046204620462e-07, "loss": -0.1011, "step": 971 }, { "epoch": 0.7774445110977805, "grad_norm": 3.7911455543889354, "learning_rate": 4.603960396039604e-07, "loss": -0.1149, "step": 972 }, { "epoch": 0.7782443511297741, "grad_norm": 3.824528464380088, "learning_rate": 4.587458745874587e-07, "loss": 0.0263, "step": 973 }, { "epoch": 0.7790441911617676, "grad_norm": 3.978602397648478, "learning_rate": 4.5709570957095705e-07, "loss": -0.0273, "step": 974 }, { "epoch": 0.7798440311937612, "grad_norm": 3.9315702808225206, "learning_rate": 4.5544554455445543e-07, "loss": -0.1133, "step": 975 }, { "epoch": 0.7806438712257548, "grad_norm": 4.4594711325427845, "learning_rate": 4.5379537953795377e-07, "loss": -0.0891, "step": 976 }, { "epoch": 0.7814437112577485, "grad_norm": 4.03305817498308, "learning_rate": 4.521452145214521e-07, "loss": -0.1069, "step": 977 }, { "epoch": 0.7822435512897421, "grad_norm": 4.444308363632601, "learning_rate": 4.504950495049505e-07, "loss": -0.1274, "step": 978 }, { "epoch": 0.7830433913217356, "grad_norm": 3.8559207670610953, "learning_rate": 4.488448844884488e-07, "loss": -0.1089, "step": 979 }, { "epoch": 0.7838432313537292, "grad_norm": 6.852327449815295, "learning_rate": 4.471947194719472e-07, "loss": -0.0608, "step": 980 }, { "epoch": 0.7846430713857229, "grad_norm": 5.172264061437722, "learning_rate": 4.4554455445544555e-07, "loss": -0.0064, "step": 981 }, { "epoch": 0.7854429114177165, "grad_norm": 4.338134868672705, "learning_rate": 4.438943894389439e-07, "loss": 0.0261, "step": 982 }, { "epoch": 0.78624275144971, "grad_norm": 4.7120432908120975, "learning_rate": 4.4224422442244227e-07, "loss": -0.006, "step": 983 }, { "epoch": 0.7870425914817036, "grad_norm": 4.093388665246742, "learning_rate": 4.405940594059406e-07, "loss": -0.1195, "step": 984 }, { "epoch": 0.7878424315136973, "grad_norm": 3.9779679788074565, "learning_rate": 4.389438943894389e-07, "loss": -0.0318, "step": 985 }, { "epoch": 0.7886422715456909, "grad_norm": 5.681211915009173, "learning_rate": 4.3729372937293727e-07, "loss": -0.0715, "step": 986 }, { "epoch": 0.7894421115776845, "grad_norm": 3.6961307708427875, "learning_rate": 4.356435643564356e-07, "loss": -0.1238, "step": 987 }, { "epoch": 0.790241951609678, "grad_norm": 5.551081220452864, "learning_rate": 4.3399339933993394e-07, "loss": -0.0353, "step": 988 }, { "epoch": 0.7910417916416717, "grad_norm": 4.1540822277204725, "learning_rate": 4.3234323432343233e-07, "loss": -0.1044, "step": 989 }, { "epoch": 0.7918416316736653, "grad_norm": 5.690644788629102, "learning_rate": 4.3069306930693066e-07, "loss": -0.0824, "step": 990 }, { "epoch": 0.7926414717056589, "grad_norm": 5.090727863244342, "learning_rate": 4.29042904290429e-07, "loss": -0.0275, "step": 991 }, { "epoch": 0.7934413117376524, "grad_norm": 4.9291753971455705, "learning_rate": 4.273927392739274e-07, "loss": -0.0339, "step": 992 }, { "epoch": 0.794241151769646, "grad_norm": 6.803186332065206, "learning_rate": 4.257425742574257e-07, "loss": -0.0342, "step": 993 }, { "epoch": 0.7950409918016397, "grad_norm": 5.804775232239933, "learning_rate": 4.240924092409241e-07, "loss": -0.0866, "step": 994 }, { "epoch": 0.7958408318336333, "grad_norm": 4.750722930505078, "learning_rate": 4.2244224422442244e-07, "loss": -0.1836, "step": 995 }, { "epoch": 0.7966406718656269, "grad_norm": 6.716244820472627, "learning_rate": 4.207920792079208e-07, "loss": -0.1024, "step": 996 }, { "epoch": 0.7974405118976204, "grad_norm": 3.722729260908509, "learning_rate": 4.1914191419141916e-07, "loss": -0.0993, "step": 997 }, { "epoch": 0.7982403519296141, "grad_norm": 3.311724877987371, "learning_rate": 4.174917491749175e-07, "loss": -0.104, "step": 998 }, { "epoch": 0.7990401919616077, "grad_norm": 5.521959854449801, "learning_rate": 4.158415841584158e-07, "loss": -0.1918, "step": 999 }, { "epoch": 0.7998400319936013, "grad_norm": 5.022786099690795, "learning_rate": 4.1419141914191417e-07, "loss": -0.044, "step": 1000 }, { "epoch": 0.8006398720255948, "grad_norm": 3.85176033301189, "learning_rate": 4.125412541254125e-07, "loss": -0.1389, "step": 1001 }, { "epoch": 0.8014397120575885, "grad_norm": 3.850320054659798, "learning_rate": 4.1089108910891084e-07, "loss": -0.0621, "step": 1002 }, { "epoch": 0.8022395520895821, "grad_norm": 5.125417738846334, "learning_rate": 4.092409240924092e-07, "loss": 0.024, "step": 1003 }, { "epoch": 0.8030393921215757, "grad_norm": 5.550789325018317, "learning_rate": 4.0759075907590756e-07, "loss": -0.0596, "step": 1004 }, { "epoch": 0.8038392321535693, "grad_norm": 2.987341879008025, "learning_rate": 4.0594059405940595e-07, "loss": 0.0174, "step": 1005 }, { "epoch": 0.8046390721855629, "grad_norm": 3.840920872539408, "learning_rate": 4.042904290429043e-07, "loss": -0.1812, "step": 1006 }, { "epoch": 0.8054389122175565, "grad_norm": 4.5904346361674495, "learning_rate": 4.026402640264026e-07, "loss": -0.153, "step": 1007 }, { "epoch": 0.8062387522495501, "grad_norm": 4.716177536008103, "learning_rate": 4.00990099009901e-07, "loss": -0.0347, "step": 1008 }, { "epoch": 0.8070385922815437, "grad_norm": 3.8819770758540106, "learning_rate": 3.9933993399339934e-07, "loss": -0.0491, "step": 1009 }, { "epoch": 0.8078384323135372, "grad_norm": 4.337114713855018, "learning_rate": 3.9768976897689767e-07, "loss": -0.076, "step": 1010 }, { "epoch": 0.8086382723455309, "grad_norm": 5.085958876323165, "learning_rate": 3.9603960396039606e-07, "loss": -0.1292, "step": 1011 }, { "epoch": 0.8094381123775245, "grad_norm": 4.225428966888881, "learning_rate": 3.943894389438944e-07, "loss": -0.0349, "step": 1012 }, { "epoch": 0.8102379524095181, "grad_norm": 3.132492848210798, "learning_rate": 3.927392739273927e-07, "loss": 0.0267, "step": 1013 }, { "epoch": 0.8110377924415118, "grad_norm": 4.728427487496938, "learning_rate": 3.9108910891089106e-07, "loss": 0.0251, "step": 1014 }, { "epoch": 0.8118376324735053, "grad_norm": 3.231169950869779, "learning_rate": 3.894389438943894e-07, "loss": -0.0048, "step": 1015 }, { "epoch": 0.8126374725054989, "grad_norm": 5.4315342239443645, "learning_rate": 3.8778877887788773e-07, "loss": 0.0017, "step": 1016 }, { "epoch": 0.8134373125374925, "grad_norm": 4.1495886173643015, "learning_rate": 3.861386138613861e-07, "loss": -0.0606, "step": 1017 }, { "epoch": 0.8142371525694861, "grad_norm": 4.571814448385221, "learning_rate": 3.8448844884488445e-07, "loss": 0.0305, "step": 1018 }, { "epoch": 0.8150369926014797, "grad_norm": 5.376749508040782, "learning_rate": 3.8283828382838284e-07, "loss": -0.0529, "step": 1019 }, { "epoch": 0.8158368326334733, "grad_norm": 4.234529949886336, "learning_rate": 3.811881188118812e-07, "loss": -0.0884, "step": 1020 }, { "epoch": 0.8166366726654669, "grad_norm": 5.103075536241735, "learning_rate": 3.795379537953795e-07, "loss": -0.0217, "step": 1021 }, { "epoch": 0.8174365126974605, "grad_norm": 4.089650205096865, "learning_rate": 3.778877887788779e-07, "loss": -0.1912, "step": 1022 }, { "epoch": 0.8182363527294542, "grad_norm": 4.315512669100621, "learning_rate": 3.7623762376237623e-07, "loss": -0.1096, "step": 1023 }, { "epoch": 0.8190361927614477, "grad_norm": 5.730342268116768, "learning_rate": 3.7458745874587457e-07, "loss": -0.0613, "step": 1024 }, { "epoch": 0.8198360327934413, "grad_norm": 4.482232090208855, "learning_rate": 3.7293729372937295e-07, "loss": -0.0025, "step": 1025 }, { "epoch": 0.8206358728254349, "grad_norm": 4.242971330310737, "learning_rate": 3.712871287128713e-07, "loss": -0.0786, "step": 1026 }, { "epoch": 0.8214357128574286, "grad_norm": 4.11563596499524, "learning_rate": 3.696369636963696e-07, "loss": -0.082, "step": 1027 }, { "epoch": 0.8222355528894221, "grad_norm": 9.332422439085308, "learning_rate": 3.6798679867986796e-07, "loss": 0.0022, "step": 1028 }, { "epoch": 0.8230353929214157, "grad_norm": 4.067536269426566, "learning_rate": 3.663366336633663e-07, "loss": -0.0361, "step": 1029 }, { "epoch": 0.8238352329534093, "grad_norm": 6.044500944552922, "learning_rate": 3.6468646864686463e-07, "loss": 0.1496, "step": 1030 }, { "epoch": 0.824635072985403, "grad_norm": 4.30205099295344, "learning_rate": 3.63036303630363e-07, "loss": -0.045, "step": 1031 }, { "epoch": 0.8254349130173965, "grad_norm": 4.012282824952302, "learning_rate": 3.6138613861386135e-07, "loss": -0.1718, "step": 1032 }, { "epoch": 0.8262347530493901, "grad_norm": 4.159359564634324, "learning_rate": 3.5973597359735974e-07, "loss": -0.0454, "step": 1033 }, { "epoch": 0.8270345930813837, "grad_norm": 3.814387620245282, "learning_rate": 3.5808580858085807e-07, "loss": -0.0466, "step": 1034 }, { "epoch": 0.8278344331133773, "grad_norm": 6.904216207979962, "learning_rate": 3.564356435643564e-07, "loss": -0.1518, "step": 1035 }, { "epoch": 0.828634273145371, "grad_norm": 5.250865501632598, "learning_rate": 3.547854785478548e-07, "loss": -0.0118, "step": 1036 }, { "epoch": 0.8294341131773645, "grad_norm": 5.846194811534598, "learning_rate": 3.5313531353135313e-07, "loss": 0.0351, "step": 1037 }, { "epoch": 0.8302339532093581, "grad_norm": 4.460630924092106, "learning_rate": 3.5148514851485146e-07, "loss": -0.2443, "step": 1038 }, { "epoch": 0.8310337932413517, "grad_norm": 4.1447157895119995, "learning_rate": 3.4983498349834985e-07, "loss": -0.0134, "step": 1039 }, { "epoch": 0.8318336332733454, "grad_norm": 3.5252220895483517, "learning_rate": 3.481848184818482e-07, "loss": -0.0577, "step": 1040 }, { "epoch": 0.8326334733053389, "grad_norm": 4.361902741479118, "learning_rate": 3.465346534653465e-07, "loss": 0.0104, "step": 1041 }, { "epoch": 0.8334333133373325, "grad_norm": 4.461915879260683, "learning_rate": 3.4488448844884485e-07, "loss": -0.0156, "step": 1042 }, { "epoch": 0.8342331533693261, "grad_norm": 4.834838939615413, "learning_rate": 3.432343234323432e-07, "loss": -0.0306, "step": 1043 }, { "epoch": 0.8350329934013198, "grad_norm": 4.457492333115142, "learning_rate": 3.415841584158416e-07, "loss": -0.0158, "step": 1044 }, { "epoch": 0.8358328334333134, "grad_norm": 6.418129824325349, "learning_rate": 3.399339933993399e-07, "loss": 0.023, "step": 1045 }, { "epoch": 0.8366326734653069, "grad_norm": 5.631846859681406, "learning_rate": 3.3828382838283824e-07, "loss": -0.0842, "step": 1046 }, { "epoch": 0.8374325134973005, "grad_norm": 4.893647743608584, "learning_rate": 3.3663366336633663e-07, "loss": 0.0169, "step": 1047 }, { "epoch": 0.8382323535292941, "grad_norm": 3.327224537992695, "learning_rate": 3.3498349834983497e-07, "loss": 0.0234, "step": 1048 }, { "epoch": 0.8390321935612878, "grad_norm": 3.334769765979331, "learning_rate": 3.333333333333333e-07, "loss": -0.0878, "step": 1049 }, { "epoch": 0.8398320335932813, "grad_norm": 4.036389763362471, "learning_rate": 3.316831683168317e-07, "loss": -0.0402, "step": 1050 }, { "epoch": 0.8406318736252749, "grad_norm": 3.653874204118681, "learning_rate": 3.3003300330033e-07, "loss": -0.0631, "step": 1051 }, { "epoch": 0.8414317136572685, "grad_norm": 4.88359310166619, "learning_rate": 3.2838283828382836e-07, "loss": -0.0544, "step": 1052 }, { "epoch": 0.8422315536892622, "grad_norm": 6.462333703622296, "learning_rate": 3.2673267326732674e-07, "loss": -0.0678, "step": 1053 }, { "epoch": 0.8430313937212558, "grad_norm": 7.009021395345441, "learning_rate": 3.250825082508251e-07, "loss": 0.0099, "step": 1054 }, { "epoch": 0.8438312337532493, "grad_norm": 4.2767377032125875, "learning_rate": 3.234323432343234e-07, "loss": 0.0297, "step": 1055 }, { "epoch": 0.8446310737852429, "grad_norm": 9.79641552202019, "learning_rate": 3.217821782178218e-07, "loss": 0.0968, "step": 1056 }, { "epoch": 0.8454309138172366, "grad_norm": 4.669605737417231, "learning_rate": 3.201320132013201e-07, "loss": 0.0097, "step": 1057 }, { "epoch": 0.8462307538492302, "grad_norm": 3.1819061861624807, "learning_rate": 3.1848184818481847e-07, "loss": -0.0776, "step": 1058 }, { "epoch": 0.8470305938812237, "grad_norm": 4.1774987880629695, "learning_rate": 3.168316831683168e-07, "loss": -0.0806, "step": 1059 }, { "epoch": 0.8478304339132173, "grad_norm": 4.454569906758588, "learning_rate": 3.1518151815181514e-07, "loss": -0.0526, "step": 1060 }, { "epoch": 0.848630273945211, "grad_norm": 3.527299815228531, "learning_rate": 3.1353135313531353e-07, "loss": -0.1065, "step": 1061 }, { "epoch": 0.8494301139772046, "grad_norm": 4.481801002071373, "learning_rate": 3.1188118811881186e-07, "loss": -0.0022, "step": 1062 }, { "epoch": 0.8502299540091982, "grad_norm": 4.3147168197624755, "learning_rate": 3.102310231023102e-07, "loss": -0.011, "step": 1063 }, { "epoch": 0.8510297940411917, "grad_norm": 3.812340279657359, "learning_rate": 3.085808580858086e-07, "loss": -0.2093, "step": 1064 }, { "epoch": 0.8518296340731853, "grad_norm": 5.097295358094463, "learning_rate": 3.069306930693069e-07, "loss": -0.0048, "step": 1065 }, { "epoch": 0.852629474105179, "grad_norm": 5.088642578790314, "learning_rate": 3.0528052805280525e-07, "loss": -0.1315, "step": 1066 }, { "epoch": 0.8534293141371726, "grad_norm": 4.052723785754238, "learning_rate": 3.0363036303630364e-07, "loss": -0.1133, "step": 1067 }, { "epoch": 0.8542291541691661, "grad_norm": 5.193579179546016, "learning_rate": 3.01980198019802e-07, "loss": -0.0787, "step": 1068 }, { "epoch": 0.8550289942011597, "grad_norm": 3.030054387526671, "learning_rate": 3.0033003300330036e-07, "loss": -0.0677, "step": 1069 }, { "epoch": 0.8558288342331534, "grad_norm": 9.135373007054904, "learning_rate": 2.986798679867987e-07, "loss": -0.1171, "step": 1070 }, { "epoch": 0.856628674265147, "grad_norm": 3.3785319743939013, "learning_rate": 2.97029702970297e-07, "loss": -0.0803, "step": 1071 }, { "epoch": 0.8574285142971406, "grad_norm": 3.9077882713350762, "learning_rate": 2.9537953795379537e-07, "loss": -0.0513, "step": 1072 }, { "epoch": 0.8582283543291341, "grad_norm": 4.038560493011451, "learning_rate": 2.937293729372937e-07, "loss": -0.0518, "step": 1073 }, { "epoch": 0.8590281943611278, "grad_norm": 3.800775478942818, "learning_rate": 2.9207920792079203e-07, "loss": -0.1613, "step": 1074 }, { "epoch": 0.8598280343931214, "grad_norm": 5.948071515082444, "learning_rate": 2.904290429042904e-07, "loss": -0.0834, "step": 1075 }, { "epoch": 0.860627874425115, "grad_norm": 4.190899280558429, "learning_rate": 2.8877887788778876e-07, "loss": -0.103, "step": 1076 }, { "epoch": 0.8614277144571085, "grad_norm": 4.910742773988997, "learning_rate": 2.871287128712871e-07, "loss": -0.0388, "step": 1077 }, { "epoch": 0.8622275544891022, "grad_norm": 7.707730353888358, "learning_rate": 2.854785478547855e-07, "loss": -0.1477, "step": 1078 }, { "epoch": 0.8630273945210958, "grad_norm": 4.297638426499125, "learning_rate": 2.838283828382838e-07, "loss": 0.0134, "step": 1079 }, { "epoch": 0.8638272345530894, "grad_norm": 6.145968919540754, "learning_rate": 2.8217821782178215e-07, "loss": 0.0021, "step": 1080 }, { "epoch": 0.864627074585083, "grad_norm": 3.7854604687393296, "learning_rate": 2.8052805280528054e-07, "loss": -0.1547, "step": 1081 }, { "epoch": 0.8654269146170765, "grad_norm": 3.991674461171312, "learning_rate": 2.7887788778877887e-07, "loss": -0.0668, "step": 1082 }, { "epoch": 0.8662267546490702, "grad_norm": 4.795940892228349, "learning_rate": 2.7722772277227726e-07, "loss": 0.0146, "step": 1083 }, { "epoch": 0.8670265946810638, "grad_norm": 4.944098976058084, "learning_rate": 2.755775577557756e-07, "loss": -0.0523, "step": 1084 }, { "epoch": 0.8678264347130574, "grad_norm": 5.005504629817167, "learning_rate": 2.7392739273927387e-07, "loss": 0.0065, "step": 1085 }, { "epoch": 0.8686262747450509, "grad_norm": 4.33635141957305, "learning_rate": 2.7227722772277226e-07, "loss": 0.0051, "step": 1086 }, { "epoch": 0.8694261147770446, "grad_norm": 6.827221005304679, "learning_rate": 2.706270627062706e-07, "loss": -0.1068, "step": 1087 }, { "epoch": 0.8702259548090382, "grad_norm": 3.763012118037954, "learning_rate": 2.6897689768976893e-07, "loss": 0.0675, "step": 1088 }, { "epoch": 0.8710257948410318, "grad_norm": 3.476547412625268, "learning_rate": 2.673267326732673e-07, "loss": -0.1067, "step": 1089 }, { "epoch": 0.8718256348730254, "grad_norm": 3.622631746348685, "learning_rate": 2.6567656765676565e-07, "loss": -0.0829, "step": 1090 }, { "epoch": 0.872625474905019, "grad_norm": 4.835180762619133, "learning_rate": 2.64026402640264e-07, "loss": -0.0761, "step": 1091 }, { "epoch": 0.8734253149370126, "grad_norm": 4.188641976033946, "learning_rate": 2.623762376237624e-07, "loss": -0.114, "step": 1092 }, { "epoch": 0.8742251549690062, "grad_norm": 4.833712113544916, "learning_rate": 2.607260726072607e-07, "loss": -0.1158, "step": 1093 }, { "epoch": 0.8750249950009998, "grad_norm": 3.6494839656219935, "learning_rate": 2.590759075907591e-07, "loss": -0.048, "step": 1094 }, { "epoch": 0.8758248350329934, "grad_norm": 3.9306302162750857, "learning_rate": 2.5742574257425743e-07, "loss": -0.0928, "step": 1095 }, { "epoch": 0.876624675064987, "grad_norm": 3.7010390446563517, "learning_rate": 2.5577557755775576e-07, "loss": 0.0242, "step": 1096 }, { "epoch": 0.8774245150969806, "grad_norm": 3.641273539002507, "learning_rate": 2.5412541254125415e-07, "loss": -0.1014, "step": 1097 }, { "epoch": 0.8782243551289742, "grad_norm": 4.233409363271656, "learning_rate": 2.524752475247525e-07, "loss": -0.0404, "step": 1098 }, { "epoch": 0.8790241951609679, "grad_norm": 4.188973466495453, "learning_rate": 2.508250825082508e-07, "loss": -0.0684, "step": 1099 }, { "epoch": 0.8798240351929614, "grad_norm": 5.017584397195866, "learning_rate": 2.4917491749174916e-07, "loss": -0.0368, "step": 1100 }, { "epoch": 0.880623875224955, "grad_norm": 3.9510700176873566, "learning_rate": 2.475247524752475e-07, "loss": -0.1375, "step": 1101 }, { "epoch": 0.8814237152569486, "grad_norm": 5.84233851394486, "learning_rate": 2.458745874587459e-07, "loss": -0.1067, "step": 1102 }, { "epoch": 0.8822235552889423, "grad_norm": 5.406949565806744, "learning_rate": 2.442244224422442e-07, "loss": -0.0962, "step": 1103 }, { "epoch": 0.8830233953209358, "grad_norm": 5.315262379239265, "learning_rate": 2.4257425742574255e-07, "loss": -0.066, "step": 1104 }, { "epoch": 0.8838232353529294, "grad_norm": 4.836530658291514, "learning_rate": 2.4092409240924093e-07, "loss": -0.0646, "step": 1105 }, { "epoch": 0.884623075384923, "grad_norm": 3.293455547222145, "learning_rate": 2.3927392739273927e-07, "loss": -0.1701, "step": 1106 }, { "epoch": 0.8854229154169166, "grad_norm": 4.709525078481242, "learning_rate": 2.376237623762376e-07, "loss": -0.0504, "step": 1107 }, { "epoch": 0.8862227554489103, "grad_norm": 4.295657231556702, "learning_rate": 2.3597359735973596e-07, "loss": -0.1419, "step": 1108 }, { "epoch": 0.8870225954809038, "grad_norm": 3.682029286721376, "learning_rate": 2.3432343234323433e-07, "loss": -0.0927, "step": 1109 }, { "epoch": 0.8878224355128974, "grad_norm": 7.500929711256007, "learning_rate": 2.3267326732673269e-07, "loss": -0.001, "step": 1110 }, { "epoch": 0.888622275544891, "grad_norm": 3.370577280876358, "learning_rate": 2.31023102310231e-07, "loss": -0.0219, "step": 1111 }, { "epoch": 0.8894221155768847, "grad_norm": 4.603247549338215, "learning_rate": 2.2937293729372936e-07, "loss": -0.0407, "step": 1112 }, { "epoch": 0.8902219556088782, "grad_norm": 3.033292259385364, "learning_rate": 2.2772277227722772e-07, "loss": -0.0612, "step": 1113 }, { "epoch": 0.8910217956408718, "grad_norm": 5.654397566299044, "learning_rate": 2.2607260726072605e-07, "loss": -0.0081, "step": 1114 }, { "epoch": 0.8918216356728654, "grad_norm": 4.7463861069291235, "learning_rate": 2.244224422442244e-07, "loss": 0.0324, "step": 1115 }, { "epoch": 0.8926214757048591, "grad_norm": 3.6795751469461697, "learning_rate": 2.2277227722772277e-07, "loss": -0.082, "step": 1116 }, { "epoch": 0.8934213157368527, "grad_norm": 6.7304826361036385, "learning_rate": 2.2112211221122113e-07, "loss": -0.1421, "step": 1117 }, { "epoch": 0.8942211557688462, "grad_norm": 4.6084848352584, "learning_rate": 2.1947194719471944e-07, "loss": -0.0154, "step": 1118 }, { "epoch": 0.8950209958008398, "grad_norm": 3.3241186250160673, "learning_rate": 2.178217821782178e-07, "loss": -0.0463, "step": 1119 }, { "epoch": 0.8958208358328335, "grad_norm": 4.285630706698749, "learning_rate": 2.1617161716171616e-07, "loss": -0.0102, "step": 1120 }, { "epoch": 0.8966206758648271, "grad_norm": 3.7442923024099266, "learning_rate": 2.145214521452145e-07, "loss": -0.0737, "step": 1121 }, { "epoch": 0.8974205158968206, "grad_norm": 4.067618329578387, "learning_rate": 2.1287128712871286e-07, "loss": -0.0694, "step": 1122 }, { "epoch": 0.8982203559288142, "grad_norm": 4.012101702069505, "learning_rate": 2.1122112211221122e-07, "loss": 0.013, "step": 1123 }, { "epoch": 0.8990201959608078, "grad_norm": 3.927004556177739, "learning_rate": 2.0957095709570958e-07, "loss": 0.0032, "step": 1124 }, { "epoch": 0.8998200359928015, "grad_norm": 4.153485289274271, "learning_rate": 2.079207920792079e-07, "loss": 0.0691, "step": 1125 }, { "epoch": 0.9006198760247951, "grad_norm": 3.0852405388784936, "learning_rate": 2.0627062706270625e-07, "loss": -0.152, "step": 1126 }, { "epoch": 0.9014197160567886, "grad_norm": 5.668232907029322, "learning_rate": 2.046204620462046e-07, "loss": -0.0051, "step": 1127 }, { "epoch": 0.9022195560887822, "grad_norm": 4.016353081034237, "learning_rate": 2.0297029702970297e-07, "loss": 0.004, "step": 1128 }, { "epoch": 0.9030193961207759, "grad_norm": 5.222331571223233, "learning_rate": 2.013201320132013e-07, "loss": -0.0398, "step": 1129 }, { "epoch": 0.9038192361527695, "grad_norm": 6.050674073050146, "learning_rate": 1.9966996699669967e-07, "loss": -0.0201, "step": 1130 }, { "epoch": 0.904619076184763, "grad_norm": 3.6129712440915336, "learning_rate": 1.9801980198019803e-07, "loss": -0.0873, "step": 1131 }, { "epoch": 0.9054189162167566, "grad_norm": 3.9706646963831527, "learning_rate": 1.9636963696369634e-07, "loss": -0.0425, "step": 1132 }, { "epoch": 0.9062187562487503, "grad_norm": 3.575800911347383, "learning_rate": 1.947194719471947e-07, "loss": -0.0728, "step": 1133 }, { "epoch": 0.9070185962807439, "grad_norm": 4.9127144958415165, "learning_rate": 1.9306930693069306e-07, "loss": -0.1456, "step": 1134 }, { "epoch": 0.9078184363127374, "grad_norm": 4.630716245217967, "learning_rate": 1.9141914191419142e-07, "loss": -0.1385, "step": 1135 }, { "epoch": 0.908618276344731, "grad_norm": 3.7522928222148413, "learning_rate": 1.8976897689768976e-07, "loss": -0.0473, "step": 1136 }, { "epoch": 0.9094181163767247, "grad_norm": 3.702942791411621, "learning_rate": 1.8811881188118812e-07, "loss": -0.0115, "step": 1137 }, { "epoch": 0.9102179564087183, "grad_norm": 4.215778349737591, "learning_rate": 1.8646864686468648e-07, "loss": 0.0369, "step": 1138 }, { "epoch": 0.9110177964407119, "grad_norm": 4.312725558809124, "learning_rate": 1.848184818481848e-07, "loss": -0.0821, "step": 1139 }, { "epoch": 0.9118176364727054, "grad_norm": 4.321449833697151, "learning_rate": 1.8316831683168315e-07, "loss": -0.1315, "step": 1140 }, { "epoch": 0.912617476504699, "grad_norm": 3.2649224360601234, "learning_rate": 1.815181518151815e-07, "loss": -0.1925, "step": 1141 }, { "epoch": 0.9134173165366927, "grad_norm": 3.760790622253671, "learning_rate": 1.7986798679867987e-07, "loss": -0.0013, "step": 1142 }, { "epoch": 0.9142171565686863, "grad_norm": 4.157207407424451, "learning_rate": 1.782178217821782e-07, "loss": -0.0581, "step": 1143 }, { "epoch": 0.9150169966006798, "grad_norm": 5.763265152647982, "learning_rate": 1.7656765676567656e-07, "loss": -0.0789, "step": 1144 }, { "epoch": 0.9158168366326734, "grad_norm": 5.000391563342132, "learning_rate": 1.7491749174917492e-07, "loss": 0.017, "step": 1145 }, { "epoch": 0.9166166766646671, "grad_norm": 5.503835207911807, "learning_rate": 1.7326732673267326e-07, "loss": -0.0498, "step": 1146 }, { "epoch": 0.9174165166966607, "grad_norm": 4.892439922885906, "learning_rate": 1.716171617161716e-07, "loss": -0.0188, "step": 1147 }, { "epoch": 0.9182163567286543, "grad_norm": 4.1251081972670915, "learning_rate": 1.6996699669966995e-07, "loss": -0.1601, "step": 1148 }, { "epoch": 0.9190161967606478, "grad_norm": 5.234413078511215, "learning_rate": 1.6831683168316832e-07, "loss": -0.0751, "step": 1149 }, { "epoch": 0.9198160367926415, "grad_norm": 5.21079968866447, "learning_rate": 1.6666666666666665e-07, "loss": 0.0173, "step": 1150 }, { "epoch": 0.9206158768246351, "grad_norm": 4.287315805109782, "learning_rate": 1.65016501650165e-07, "loss": -0.0245, "step": 1151 }, { "epoch": 0.9214157168566287, "grad_norm": 2.89403897319458, "learning_rate": 1.6336633663366337e-07, "loss": -0.1234, "step": 1152 }, { "epoch": 0.9222155568886222, "grad_norm": 5.421448264794362, "learning_rate": 1.617161716171617e-07, "loss": -0.0738, "step": 1153 }, { "epoch": 0.9230153969206158, "grad_norm": 5.340841829330172, "learning_rate": 1.6006600660066004e-07, "loss": -0.0978, "step": 1154 }, { "epoch": 0.9238152369526095, "grad_norm": 5.139659477858416, "learning_rate": 1.584158415841584e-07, "loss": -0.0053, "step": 1155 }, { "epoch": 0.9246150769846031, "grad_norm": 3.5692046134784676, "learning_rate": 1.5676567656765676e-07, "loss": -0.078, "step": 1156 }, { "epoch": 0.9254149170165967, "grad_norm": 3.6214949664373624, "learning_rate": 1.551155115511551e-07, "loss": -0.1948, "step": 1157 }, { "epoch": 0.9262147570485902, "grad_norm": 3.6462100170455516, "learning_rate": 1.5346534653465346e-07, "loss": -0.1466, "step": 1158 }, { "epoch": 0.9270145970805839, "grad_norm": 4.042490583513813, "learning_rate": 1.5181518151815182e-07, "loss": -0.0494, "step": 1159 }, { "epoch": 0.9278144371125775, "grad_norm": 3.570518304081052, "learning_rate": 1.5016501650165018e-07, "loss": -0.0203, "step": 1160 }, { "epoch": 0.9286142771445711, "grad_norm": 4.860609480391736, "learning_rate": 1.485148514851485e-07, "loss": -0.0608, "step": 1161 }, { "epoch": 0.9294141171765646, "grad_norm": 3.6503782154768336, "learning_rate": 1.4686468646864685e-07, "loss": -0.1209, "step": 1162 }, { "epoch": 0.9302139572085583, "grad_norm": 5.2679977730736915, "learning_rate": 1.452145214521452e-07, "loss": -0.2231, "step": 1163 }, { "epoch": 0.9310137972405519, "grad_norm": 4.077205627405786, "learning_rate": 1.4356435643564355e-07, "loss": -0.0918, "step": 1164 }, { "epoch": 0.9318136372725455, "grad_norm": 4.7895431899614245, "learning_rate": 1.419141914191419e-07, "loss": -0.0527, "step": 1165 }, { "epoch": 0.9326134773045391, "grad_norm": 5.279018314307402, "learning_rate": 1.4026402640264027e-07, "loss": 0.0147, "step": 1166 }, { "epoch": 0.9334133173365327, "grad_norm": 4.336258829943017, "learning_rate": 1.3861386138613863e-07, "loss": -0.0938, "step": 1167 }, { "epoch": 0.9342131573685263, "grad_norm": 4.54870086400182, "learning_rate": 1.3696369636963694e-07, "loss": -0.1337, "step": 1168 }, { "epoch": 0.9350129974005199, "grad_norm": 4.5996184978678105, "learning_rate": 1.353135313531353e-07, "loss": -0.0564, "step": 1169 }, { "epoch": 0.9358128374325135, "grad_norm": 3.580175353715861, "learning_rate": 1.3366336633663366e-07, "loss": -0.0917, "step": 1170 }, { "epoch": 0.936612677464507, "grad_norm": 4.236520821746979, "learning_rate": 1.32013201320132e-07, "loss": -0.0952, "step": 1171 }, { "epoch": 0.9374125174965007, "grad_norm": 4.45059914145225, "learning_rate": 1.3036303630363035e-07, "loss": -0.0501, "step": 1172 }, { "epoch": 0.9382123575284943, "grad_norm": 3.895112295625407, "learning_rate": 1.2871287128712872e-07, "loss": -0.0248, "step": 1173 }, { "epoch": 0.9390121975604879, "grad_norm": 3.446049410323525, "learning_rate": 1.2706270627062708e-07, "loss": -0.0609, "step": 1174 }, { "epoch": 0.9398120375924816, "grad_norm": 4.440477903266653, "learning_rate": 1.254125412541254e-07, "loss": -0.1183, "step": 1175 }, { "epoch": 0.9406118776244751, "grad_norm": 8.879875026201496, "learning_rate": 1.2376237623762375e-07, "loss": 0.0741, "step": 1176 }, { "epoch": 0.9414117176564687, "grad_norm": 3.251289551995566, "learning_rate": 1.221122112211221e-07, "loss": -0.2012, "step": 1177 }, { "epoch": 0.9422115576884623, "grad_norm": 5.090866489665643, "learning_rate": 1.2046204620462047e-07, "loss": -0.0937, "step": 1178 }, { "epoch": 0.943011397720456, "grad_norm": 5.634824692335556, "learning_rate": 1.188118811881188e-07, "loss": -0.0674, "step": 1179 }, { "epoch": 0.9438112377524495, "grad_norm": 7.237946586468722, "learning_rate": 1.1716171617161716e-07, "loss": 0.0063, "step": 1180 }, { "epoch": 0.9446110777844431, "grad_norm": 5.322093424679851, "learning_rate": 1.155115511551155e-07, "loss": 0.0182, "step": 1181 }, { "epoch": 0.9454109178164367, "grad_norm": 5.685444219272491, "learning_rate": 1.1386138613861386e-07, "loss": -0.0409, "step": 1182 }, { "epoch": 0.9462107578484303, "grad_norm": 4.89343356574685, "learning_rate": 1.122112211221122e-07, "loss": -0.1669, "step": 1183 }, { "epoch": 0.947010597880424, "grad_norm": 2.976462375028197, "learning_rate": 1.1056105610561057e-07, "loss": -0.1014, "step": 1184 }, { "epoch": 0.9478104379124175, "grad_norm": 5.789816947507454, "learning_rate": 1.089108910891089e-07, "loss": -0.0268, "step": 1185 }, { "epoch": 0.9486102779444111, "grad_norm": 4.547707602704605, "learning_rate": 1.0726072607260725e-07, "loss": -0.0252, "step": 1186 }, { "epoch": 0.9494101179764047, "grad_norm": 3.669232395567316, "learning_rate": 1.0561056105610561e-07, "loss": -0.0721, "step": 1187 }, { "epoch": 0.9502099580083984, "grad_norm": 4.725560206551925, "learning_rate": 1.0396039603960394e-07, "loss": -0.0374, "step": 1188 }, { "epoch": 0.9510097980403919, "grad_norm": 5.642201380367948, "learning_rate": 1.023102310231023e-07, "loss": -0.0524, "step": 1189 }, { "epoch": 0.9518096380723855, "grad_norm": 5.1228456388588555, "learning_rate": 1.0066006600660065e-07, "loss": -0.0122, "step": 1190 }, { "epoch": 0.9526094781043791, "grad_norm": 3.5927188886776995, "learning_rate": 9.900990099009901e-08, "loss": -0.1424, "step": 1191 }, { "epoch": 0.9534093181363728, "grad_norm": 3.815501173104961, "learning_rate": 9.735973597359735e-08, "loss": -0.1578, "step": 1192 }, { "epoch": 0.9542091581683664, "grad_norm": 3.474045980664194, "learning_rate": 9.570957095709571e-08, "loss": -0.0597, "step": 1193 }, { "epoch": 0.9550089982003599, "grad_norm": 4.782969446108987, "learning_rate": 9.405940594059406e-08, "loss": -0.0957, "step": 1194 }, { "epoch": 0.9558088382323535, "grad_norm": 4.74265001734386, "learning_rate": 9.24092409240924e-08, "loss": -0.0267, "step": 1195 }, { "epoch": 0.9566086782643471, "grad_norm": 3.661437568587583, "learning_rate": 9.075907590759075e-08, "loss": -0.1227, "step": 1196 }, { "epoch": 0.9574085182963408, "grad_norm": 4.279576775134146, "learning_rate": 8.91089108910891e-08, "loss": 0.0111, "step": 1197 }, { "epoch": 0.9582083583283343, "grad_norm": 4.169597103863264, "learning_rate": 8.745874587458746e-08, "loss": 0.0377, "step": 1198 }, { "epoch": 0.9590081983603279, "grad_norm": 6.207479567569039, "learning_rate": 8.58085808580858e-08, "loss": 0.0014, "step": 1199 }, { "epoch": 0.9598080383923215, "grad_norm": 3.5423075974898586, "learning_rate": 8.415841584158416e-08, "loss": -0.1664, "step": 1200 }, { "epoch": 0.9606078784243152, "grad_norm": 3.6165897542858887, "learning_rate": 8.25082508250825e-08, "loss": -0.0884, "step": 1201 }, { "epoch": 0.9614077184563088, "grad_norm": 4.517734549618362, "learning_rate": 8.085808580858085e-08, "loss": -0.1076, "step": 1202 }, { "epoch": 0.9622075584883023, "grad_norm": 4.307866136219069, "learning_rate": 7.92079207920792e-08, "loss": -0.115, "step": 1203 }, { "epoch": 0.9630073985202959, "grad_norm": 4.922270840667124, "learning_rate": 7.755775577557755e-08, "loss": -0.055, "step": 1204 }, { "epoch": 0.9638072385522896, "grad_norm": 4.385179502669176, "learning_rate": 7.590759075907591e-08, "loss": -0.1254, "step": 1205 }, { "epoch": 0.9646070785842832, "grad_norm": 3.9178495988004443, "learning_rate": 7.425742574257424e-08, "loss": -0.1016, "step": 1206 }, { "epoch": 0.9654069186162767, "grad_norm": 3.3585705170911515, "learning_rate": 7.26072607260726e-08, "loss": -0.0209, "step": 1207 }, { "epoch": 0.9662067586482703, "grad_norm": 3.815494549495067, "learning_rate": 7.095709570957095e-08, "loss": -0.0635, "step": 1208 }, { "epoch": 0.967006598680264, "grad_norm": 8.403899931437618, "learning_rate": 6.930693069306931e-08, "loss": -0.0465, "step": 1209 }, { "epoch": 0.9678064387122576, "grad_norm": 4.29568964473992, "learning_rate": 6.765676567656765e-08, "loss": -0.0332, "step": 1210 }, { "epoch": 0.9686062787442512, "grad_norm": 3.4280415193587235, "learning_rate": 6.6006600660066e-08, "loss": -0.0803, "step": 1211 }, { "epoch": 0.9694061187762447, "grad_norm": 4.6120423955763625, "learning_rate": 6.435643564356436e-08, "loss": -0.0619, "step": 1212 }, { "epoch": 0.9702059588082383, "grad_norm": 6.424876752925553, "learning_rate": 6.27062706270627e-08, "loss": -0.1442, "step": 1213 }, { "epoch": 0.971005798840232, "grad_norm": 5.485217081397391, "learning_rate": 6.105610561056105e-08, "loss": -0.0939, "step": 1214 }, { "epoch": 0.9718056388722256, "grad_norm": 6.774111317136949, "learning_rate": 5.94059405940594e-08, "loss": -0.0439, "step": 1215 }, { "epoch": 0.9726054789042191, "grad_norm": 5.096515115630733, "learning_rate": 5.775577557755775e-08, "loss": -0.0734, "step": 1216 }, { "epoch": 0.9734053189362127, "grad_norm": 4.159248360440637, "learning_rate": 5.61056105610561e-08, "loss": -0.0121, "step": 1217 }, { "epoch": 0.9742051589682064, "grad_norm": 7.334455086425638, "learning_rate": 5.445544554455445e-08, "loss": -0.1328, "step": 1218 }, { "epoch": 0.9750049990002, "grad_norm": 5.126425754126674, "learning_rate": 5.2805280528052805e-08, "loss": -0.123, "step": 1219 }, { "epoch": 0.9758048390321936, "grad_norm": 4.215981033934002, "learning_rate": 5.115511551155115e-08, "loss": -0.0326, "step": 1220 }, { "epoch": 0.9766046790641871, "grad_norm": 5.727858996419284, "learning_rate": 4.950495049504951e-08, "loss": -0.078, "step": 1221 }, { "epoch": 0.9774045190961808, "grad_norm": 4.2278775409875475, "learning_rate": 4.7854785478547855e-08, "loss": -0.1444, "step": 1222 }, { "epoch": 0.9782043591281744, "grad_norm": 5.1475401401668455, "learning_rate": 4.62046204620462e-08, "loss": -0.0758, "step": 1223 }, { "epoch": 0.979004199160168, "grad_norm": 4.979540737683743, "learning_rate": 4.455445544554455e-08, "loss": -0.0669, "step": 1224 }, { "epoch": 0.9798040391921615, "grad_norm": 3.4515888370253385, "learning_rate": 4.29042904290429e-08, "loss": -0.0937, "step": 1225 }, { "epoch": 0.9806038792241552, "grad_norm": 6.288776500193402, "learning_rate": 4.125412541254125e-08, "loss": 0.1102, "step": 1226 }, { "epoch": 0.9814037192561488, "grad_norm": 5.48415137375722, "learning_rate": 3.96039603960396e-08, "loss": -0.0964, "step": 1227 }, { "epoch": 0.9822035592881424, "grad_norm": 4.355868334038742, "learning_rate": 3.7953795379537955e-08, "loss": -0.0328, "step": 1228 }, { "epoch": 0.9830033993201359, "grad_norm": 4.721491339476331, "learning_rate": 3.63036303630363e-08, "loss": -0.0365, "step": 1229 }, { "epoch": 0.9838032393521295, "grad_norm": 6.339518521675752, "learning_rate": 3.465346534653466e-08, "loss": -0.0573, "step": 1230 }, { "epoch": 0.9846030793841232, "grad_norm": 4.434922235230731, "learning_rate": 3.3003300330033e-08, "loss": -0.1461, "step": 1231 }, { "epoch": 0.9854029194161168, "grad_norm": 5.310987908083999, "learning_rate": 3.135313531353135e-08, "loss": -0.0746, "step": 1232 }, { "epoch": 0.9862027594481104, "grad_norm": 5.686966755780067, "learning_rate": 2.97029702970297e-08, "loss": -0.0133, "step": 1233 }, { "epoch": 0.9870025994801039, "grad_norm": 4.108463781012627, "learning_rate": 2.805280528052805e-08, "loss": 0.0749, "step": 1234 }, { "epoch": 0.9878024395120976, "grad_norm": 3.6672659008615764, "learning_rate": 2.6402640264026403e-08, "loss": -0.1047, "step": 1235 }, { "epoch": 0.9886022795440912, "grad_norm": 3.9834854628962146, "learning_rate": 2.4752475247524754e-08, "loss": -0.0921, "step": 1236 }, { "epoch": 0.9894021195760848, "grad_norm": 3.5139198067318054, "learning_rate": 2.31023102310231e-08, "loss": -0.1611, "step": 1237 }, { "epoch": 0.9902019596080783, "grad_norm": 4.1541924223616356, "learning_rate": 2.145214521452145e-08, "loss": -0.0147, "step": 1238 }, { "epoch": 0.991001799640072, "grad_norm": 6.337275466101498, "learning_rate": 1.98019801980198e-08, "loss": -0.0767, "step": 1239 }, { "epoch": 0.9918016396720656, "grad_norm": 82.20527671342789, "learning_rate": 1.815181518151815e-08, "loss": -0.0431, "step": 1240 }, { "epoch": 0.9926014797040592, "grad_norm": 2.6922514851959494, "learning_rate": 1.65016501650165e-08, "loss": -0.0195, "step": 1241 }, { "epoch": 0.9934013197360528, "grad_norm": 3.6925978256211747, "learning_rate": 1.485148514851485e-08, "loss": -0.1541, "step": 1242 }, { "epoch": 0.9942011597680463, "grad_norm": 4.79635865852686, "learning_rate": 1.3201320132013201e-08, "loss": -0.1104, "step": 1243 }, { "epoch": 0.99500099980004, "grad_norm": 4.327272847702339, "learning_rate": 1.155115511551155e-08, "loss": -0.0807, "step": 1244 }, { "epoch": 0.9958008398320336, "grad_norm": 4.256644720520306, "learning_rate": 9.9009900990099e-09, "loss": -0.0278, "step": 1245 }, { "epoch": 0.9966006798640272, "grad_norm": 3.586841344680467, "learning_rate": 8.25082508250825e-09, "loss": 0.0033, "step": 1246 }, { "epoch": 0.9974005198960207, "grad_norm": 4.084312918321821, "learning_rate": 6.600660066006601e-09, "loss": -0.0053, "step": 1247 }, { "epoch": 0.9982003599280144, "grad_norm": 3.983696015790867, "learning_rate": 4.95049504950495e-09, "loss": -0.0229, "step": 1248 }, { "epoch": 0.999000199960008, "grad_norm": 4.1770415014644104, "learning_rate": 3.3003300330033003e-09, "loss": -0.1898, "step": 1249 }, { "epoch": 0.9998000399920016, "grad_norm": 4.645732496504415, "learning_rate": 1.6501650165016502e-09, "loss": 0.0221, "step": 1250 }, { "epoch": 0.9998000399920016, "step": 1250, "total_flos": 208730583859200.0, "train_loss": -0.05900815903544426, "train_runtime": 14539.7464, "train_samples_per_second": 11.005, "train_steps_per_second": 0.086 } ], "logging_steps": 1.0, "max_steps": 1250, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "total_flos": 208730583859200.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }