{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.5048065072713825, "eval_steps": 500, "global_step": 2540, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0009859502095144195, "grad_norm": 1.3765249905290702, "learning_rate": 2.0000000000000003e-06, "loss": 0.8092, "step": 1 }, { "epoch": 0.001971900419028839, "grad_norm": 1.367289057075295, "learning_rate": 4.000000000000001e-06, "loss": 0.8, "step": 2 }, { "epoch": 0.0029578506285432584, "grad_norm": 1.3431155113747304, "learning_rate": 6e-06, "loss": 0.8049, "step": 3 }, { "epoch": 0.003943800838057678, "grad_norm": 1.151248103486504, "learning_rate": 8.000000000000001e-06, "loss": 0.8039, "step": 4 }, { "epoch": 0.004929751047572098, "grad_norm": 0.8104674792888513, "learning_rate": 1e-05, "loss": 0.7477, "step": 5 }, { "epoch": 0.005915701257086517, "grad_norm": 2.727064547227183, "learning_rate": 9.999997324837724e-06, "loss": 0.7947, "step": 6 }, { "epoch": 0.006901651466600937, "grad_norm": 1.0749417778668735, "learning_rate": 9.99998929935376e-06, "loss": 0.7307, "step": 7 }, { "epoch": 0.007887601676115356, "grad_norm": 0.7628530475064141, "learning_rate": 9.999975923556696e-06, "loss": 0.7165, "step": 8 }, { "epoch": 0.008873551885629776, "grad_norm": 0.5857783129273212, "learning_rate": 9.999957197460844e-06, "loss": 0.73, "step": 9 }, { "epoch": 0.009859502095144195, "grad_norm": 0.5407067952150867, "learning_rate": 9.99993312108624e-06, "loss": 0.6877, "step": 10 }, { "epoch": 0.010845452304658615, "grad_norm": 0.39622766885472044, "learning_rate": 9.999903694458653e-06, "loss": 0.6758, "step": 11 }, { "epoch": 0.011831402514173034, "grad_norm": 0.3931749811712902, "learning_rate": 9.999868917609565e-06, "loss": 0.6651, "step": 12 }, { "epoch": 0.012817352723687454, "grad_norm": 0.40125124272709395, "learning_rate": 9.999828790576194e-06, "loss": 0.6986, "step": 13 }, { "epoch": 0.013803302933201873, "grad_norm": 0.3370162547077992, "learning_rate": 9.999783313401478e-06, "loss": 0.6799, "step": 14 }, { "epoch": 0.014789253142716292, "grad_norm": 0.2931745441380535, "learning_rate": 9.999732486134078e-06, "loss": 0.6626, "step": 15 }, { "epoch": 0.01577520335223071, "grad_norm": 0.33086761396823533, "learning_rate": 9.999676308828385e-06, "loss": 0.6753, "step": 16 }, { "epoch": 0.01676115356174513, "grad_norm": 0.36273495900838076, "learning_rate": 9.999614781544512e-06, "loss": 0.6809, "step": 17 }, { "epoch": 0.017747103771259553, "grad_norm": 0.3269646639666865, "learning_rate": 9.999547904348294e-06, "loss": 0.6486, "step": 18 }, { "epoch": 0.018733053980773972, "grad_norm": 0.311315845954239, "learning_rate": 9.999475677311298e-06, "loss": 0.6707, "step": 19 }, { "epoch": 0.01971900419028839, "grad_norm": 0.2646645447167399, "learning_rate": 9.99939810051081e-06, "loss": 0.6517, "step": 20 }, { "epoch": 0.02070495439980281, "grad_norm": 0.24097098469055522, "learning_rate": 9.999315174029843e-06, "loss": 0.6188, "step": 21 }, { "epoch": 0.02169090460931723, "grad_norm": 0.2715478883936797, "learning_rate": 9.999226897957132e-06, "loss": 0.6572, "step": 22 }, { "epoch": 0.022676854818831648, "grad_norm": 0.2542812528029893, "learning_rate": 9.99913327238714e-06, "loss": 0.652, "step": 23 }, { "epoch": 0.023662805028346067, "grad_norm": 0.2569805794182414, "learning_rate": 9.999034297420053e-06, "loss": 0.6522, "step": 24 }, { "epoch": 0.02464875523786049, "grad_norm": 0.2396967420842906, "learning_rate": 9.998929973161777e-06, "loss": 0.635, "step": 25 }, { "epoch": 0.02563470544737491, "grad_norm": 0.24159797374059683, "learning_rate": 9.99882029972395e-06, "loss": 0.6372, "step": 26 }, { "epoch": 0.026620655656889328, "grad_norm": 0.2519427485321024, "learning_rate": 9.998705277223926e-06, "loss": 0.6374, "step": 27 }, { "epoch": 0.027606605866403747, "grad_norm": 0.23714310942142935, "learning_rate": 9.99858490578479e-06, "loss": 0.6304, "step": 28 }, { "epoch": 0.028592556075918166, "grad_norm": 0.23852507363196584, "learning_rate": 9.998459185535342e-06, "loss": 0.6167, "step": 29 }, { "epoch": 0.029578506285432585, "grad_norm": 0.2237646708710333, "learning_rate": 9.998328116610118e-06, "loss": 0.6265, "step": 30 }, { "epoch": 0.030564456494947004, "grad_norm": 0.22617135505726307, "learning_rate": 9.998191699149367e-06, "loss": 0.6232, "step": 31 }, { "epoch": 0.03155040670446142, "grad_norm": 0.2367290677204515, "learning_rate": 9.99804993329906e-06, "loss": 0.6334, "step": 32 }, { "epoch": 0.03253635691397584, "grad_norm": 0.23223879565221392, "learning_rate": 9.997902819210903e-06, "loss": 0.6479, "step": 33 }, { "epoch": 0.03352230712349026, "grad_norm": 0.21858220436116263, "learning_rate": 9.997750357042315e-06, "loss": 0.6214, "step": 34 }, { "epoch": 0.03450825733300468, "grad_norm": 0.20343274643065565, "learning_rate": 9.997592546956439e-06, "loss": 0.6142, "step": 35 }, { "epoch": 0.035494207542519106, "grad_norm": 0.2577948288273245, "learning_rate": 9.997429389122141e-06, "loss": 0.6379, "step": 36 }, { "epoch": 0.036480157752033525, "grad_norm": 0.20636422243489688, "learning_rate": 9.997260883714015e-06, "loss": 0.6348, "step": 37 }, { "epoch": 0.037466107961547944, "grad_norm": 0.20502145548371767, "learning_rate": 9.99708703091237e-06, "loss": 0.6065, "step": 38 }, { "epoch": 0.03845205817106236, "grad_norm": 0.21299232506890886, "learning_rate": 9.996907830903238e-06, "loss": 0.6263, "step": 39 }, { "epoch": 0.03943800838057678, "grad_norm": 0.20706080631505255, "learning_rate": 9.996723283878376e-06, "loss": 0.6138, "step": 40 }, { "epoch": 0.0404239585900912, "grad_norm": 0.21574359503465926, "learning_rate": 9.996533390035264e-06, "loss": 0.6359, "step": 41 }, { "epoch": 0.04140990879960562, "grad_norm": 0.2094884886807774, "learning_rate": 9.996338149577098e-06, "loss": 0.6233, "step": 42 }, { "epoch": 0.04239585900912004, "grad_norm": 0.21797785845613704, "learning_rate": 9.996137562712798e-06, "loss": 0.6074, "step": 43 }, { "epoch": 0.04338180921863446, "grad_norm": 0.20885248816596044, "learning_rate": 9.995931629657005e-06, "loss": 0.609, "step": 44 }, { "epoch": 0.04436775942814888, "grad_norm": 0.2300394406472479, "learning_rate": 9.995720350630083e-06, "loss": 0.6222, "step": 45 }, { "epoch": 0.045353709637663296, "grad_norm": 0.2083396663165689, "learning_rate": 9.99550372585811e-06, "loss": 0.6122, "step": 46 }, { "epoch": 0.046339659847177715, "grad_norm": 0.20433334433389508, "learning_rate": 9.995281755572891e-06, "loss": 0.6177, "step": 47 }, { "epoch": 0.047325610056692134, "grad_norm": 0.1973460676616573, "learning_rate": 9.99505444001195e-06, "loss": 0.6073, "step": 48 }, { "epoch": 0.04831156026620655, "grad_norm": 0.2562590619272977, "learning_rate": 9.994821779418529e-06, "loss": 0.5842, "step": 49 }, { "epoch": 0.04929751047572098, "grad_norm": 0.1988436425039502, "learning_rate": 9.994583774041588e-06, "loss": 0.6125, "step": 50 }, { "epoch": 0.0502834606852354, "grad_norm": 0.21977567705232728, "learning_rate": 9.994340424135808e-06, "loss": 0.6044, "step": 51 }, { "epoch": 0.05126941089474982, "grad_norm": 0.1940896134389713, "learning_rate": 9.994091729961593e-06, "loss": 0.6047, "step": 52 }, { "epoch": 0.052255361104264236, "grad_norm": 0.19863138435651861, "learning_rate": 9.993837691785058e-06, "loss": 0.6192, "step": 53 }, { "epoch": 0.053241311313778655, "grad_norm": 0.26855885840780924, "learning_rate": 9.99357830987804e-06, "loss": 0.613, "step": 54 }, { "epoch": 0.054227261523293074, "grad_norm": 0.2004024021114952, "learning_rate": 9.9933135845181e-06, "loss": 0.6067, "step": 55 }, { "epoch": 0.05521321173280749, "grad_norm": 0.21801464983753077, "learning_rate": 9.993043515988504e-06, "loss": 0.6115, "step": 56 }, { "epoch": 0.05619916194232191, "grad_norm": 0.19078374812598073, "learning_rate": 9.992768104578248e-06, "loss": 0.5975, "step": 57 }, { "epoch": 0.05718511215183633, "grad_norm": 0.19814776870710335, "learning_rate": 9.992487350582037e-06, "loss": 0.5711, "step": 58 }, { "epoch": 0.05817106236135075, "grad_norm": 0.20847307806857995, "learning_rate": 9.992201254300299e-06, "loss": 0.6151, "step": 59 }, { "epoch": 0.05915701257086517, "grad_norm": 0.18634768164717536, "learning_rate": 9.991909816039174e-06, "loss": 0.5866, "step": 60 }, { "epoch": 0.06014296278037959, "grad_norm": 0.22058492402253121, "learning_rate": 9.991613036110517e-06, "loss": 0.6051, "step": 61 }, { "epoch": 0.06112891298989401, "grad_norm": 0.19257108247180021, "learning_rate": 9.991310914831908e-06, "loss": 0.6024, "step": 62 }, { "epoch": 0.06211486319940843, "grad_norm": 0.196142910088349, "learning_rate": 9.991003452526632e-06, "loss": 0.6024, "step": 63 }, { "epoch": 0.06310081340892285, "grad_norm": 0.20401755542944414, "learning_rate": 9.990690649523694e-06, "loss": 0.6013, "step": 64 }, { "epoch": 0.06408676361843726, "grad_norm": 0.19636579730374507, "learning_rate": 9.990372506157813e-06, "loss": 0.5949, "step": 65 }, { "epoch": 0.06507271382795168, "grad_norm": 0.1848833418382152, "learning_rate": 9.990049022769426e-06, "loss": 0.591, "step": 66 }, { "epoch": 0.0660586640374661, "grad_norm": 0.19274419278695507, "learning_rate": 9.989720199704678e-06, "loss": 0.5972, "step": 67 }, { "epoch": 0.06704461424698052, "grad_norm": 0.1980938095739198, "learning_rate": 9.989386037315433e-06, "loss": 0.5877, "step": 68 }, { "epoch": 0.06803056445649494, "grad_norm": 0.19027491755499437, "learning_rate": 9.989046535959269e-06, "loss": 0.5849, "step": 69 }, { "epoch": 0.06901651466600936, "grad_norm": 0.20011979699788518, "learning_rate": 9.988701695999467e-06, "loss": 0.6015, "step": 70 }, { "epoch": 0.07000246487552379, "grad_norm": 0.19268639996801692, "learning_rate": 9.988351517805034e-06, "loss": 0.6134, "step": 71 }, { "epoch": 0.07098841508503821, "grad_norm": 0.19634572510337903, "learning_rate": 9.987996001750682e-06, "loss": 0.5951, "step": 72 }, { "epoch": 0.07197436529455263, "grad_norm": 0.19629312560061116, "learning_rate": 9.987635148216837e-06, "loss": 0.6058, "step": 73 }, { "epoch": 0.07296031550406705, "grad_norm": 0.20573948484183535, "learning_rate": 9.987268957589633e-06, "loss": 0.6016, "step": 74 }, { "epoch": 0.07394626571358147, "grad_norm": 0.20941911620613557, "learning_rate": 9.986897430260922e-06, "loss": 0.5855, "step": 75 }, { "epoch": 0.07493221592309589, "grad_norm": 0.19813441493160366, "learning_rate": 9.986520566628256e-06, "loss": 0.5926, "step": 76 }, { "epoch": 0.0759181661326103, "grad_norm": 0.19873514925053815, "learning_rate": 9.986138367094913e-06, "loss": 0.5963, "step": 77 }, { "epoch": 0.07690411634212473, "grad_norm": 0.18811627969165587, "learning_rate": 9.985750832069861e-06, "loss": 0.5815, "step": 78 }, { "epoch": 0.07789006655163914, "grad_norm": 0.19234106290642222, "learning_rate": 9.985357961967795e-06, "loss": 0.5949, "step": 79 }, { "epoch": 0.07887601676115356, "grad_norm": 0.20955249698840359, "learning_rate": 9.984959757209108e-06, "loss": 0.5877, "step": 80 }, { "epoch": 0.07986196697066798, "grad_norm": 0.19383207007852313, "learning_rate": 9.984556218219908e-06, "loss": 0.5966, "step": 81 }, { "epoch": 0.0808479171801824, "grad_norm": 0.1947234699228627, "learning_rate": 9.984147345432003e-06, "loss": 0.5768, "step": 82 }, { "epoch": 0.08183386738969682, "grad_norm": 0.19783757001989402, "learning_rate": 9.983733139282917e-06, "loss": 0.5843, "step": 83 }, { "epoch": 0.08281981759921124, "grad_norm": 0.21204862645501593, "learning_rate": 9.983313600215876e-06, "loss": 0.6108, "step": 84 }, { "epoch": 0.08380576780872566, "grad_norm": 0.18979667022849442, "learning_rate": 9.982888728679816e-06, "loss": 0.586, "step": 85 }, { "epoch": 0.08479171801824008, "grad_norm": 0.35843719108760985, "learning_rate": 9.982458525129377e-06, "loss": 0.5756, "step": 86 }, { "epoch": 0.0857776682277545, "grad_norm": 0.2017647897675324, "learning_rate": 9.982022990024903e-06, "loss": 0.6062, "step": 87 }, { "epoch": 0.08676361843726892, "grad_norm": 0.18829410076504116, "learning_rate": 9.981582123832443e-06, "loss": 0.583, "step": 88 }, { "epoch": 0.08774956864678334, "grad_norm": 0.20926053875571596, "learning_rate": 9.981135927023758e-06, "loss": 0.5939, "step": 89 }, { "epoch": 0.08873551885629775, "grad_norm": 0.21176312397235778, "learning_rate": 9.980684400076301e-06, "loss": 0.5802, "step": 90 }, { "epoch": 0.08972146906581217, "grad_norm": 0.1866670497645919, "learning_rate": 9.980227543473243e-06, "loss": 0.5983, "step": 91 }, { "epoch": 0.09070741927532659, "grad_norm": 0.20887738100175152, "learning_rate": 9.979765357703442e-06, "loss": 0.5711, "step": 92 }, { "epoch": 0.09169336948484101, "grad_norm": 0.18824727898046928, "learning_rate": 9.97929784326147e-06, "loss": 0.5842, "step": 93 }, { "epoch": 0.09267931969435543, "grad_norm": 0.20872137142073366, "learning_rate": 9.978825000647603e-06, "loss": 0.5526, "step": 94 }, { "epoch": 0.09366526990386985, "grad_norm": 0.20588660604802805, "learning_rate": 9.978346830367804e-06, "loss": 0.5772, "step": 95 }, { "epoch": 0.09465122011338427, "grad_norm": 0.1897213779338987, "learning_rate": 9.977863332933752e-06, "loss": 0.589, "step": 96 }, { "epoch": 0.09563717032289869, "grad_norm": 0.1938789139645962, "learning_rate": 9.97737450886282e-06, "loss": 0.5826, "step": 97 }, { "epoch": 0.0966231205324131, "grad_norm": 0.18708886016881085, "learning_rate": 9.976880358678083e-06, "loss": 0.5747, "step": 98 }, { "epoch": 0.09760907074192754, "grad_norm": 0.194768187909786, "learning_rate": 9.97638088290831e-06, "loss": 0.5743, "step": 99 }, { "epoch": 0.09859502095144196, "grad_norm": 0.20855204340244599, "learning_rate": 9.975876082087974e-06, "loss": 0.5895, "step": 100 }, { "epoch": 0.09958097116095638, "grad_norm": 0.19682699295375078, "learning_rate": 9.975365956757245e-06, "loss": 0.5815, "step": 101 }, { "epoch": 0.1005669213704708, "grad_norm": 0.19761425189672296, "learning_rate": 9.974850507461989e-06, "loss": 0.57, "step": 102 }, { "epoch": 0.10155287157998522, "grad_norm": 0.1969307392448049, "learning_rate": 9.974329734753773e-06, "loss": 0.6095, "step": 103 }, { "epoch": 0.10253882178949963, "grad_norm": 0.19238160658361544, "learning_rate": 9.973803639189857e-06, "loss": 0.5953, "step": 104 }, { "epoch": 0.10352477199901405, "grad_norm": 0.19536784964342985, "learning_rate": 9.973272221333194e-06, "loss": 0.5724, "step": 105 }, { "epoch": 0.10451072220852847, "grad_norm": 0.18177641456857388, "learning_rate": 9.972735481752438e-06, "loss": 0.5994, "step": 106 }, { "epoch": 0.10549667241804289, "grad_norm": 0.2058000188653258, "learning_rate": 9.972193421021936e-06, "loss": 0.5839, "step": 107 }, { "epoch": 0.10648262262755731, "grad_norm": 0.20116963669280283, "learning_rate": 9.971646039721727e-06, "loss": 0.5845, "step": 108 }, { "epoch": 0.10746857283707173, "grad_norm": 0.192644533678385, "learning_rate": 9.971093338437545e-06, "loss": 0.5943, "step": 109 }, { "epoch": 0.10845452304658615, "grad_norm": 0.2051913783736844, "learning_rate": 9.970535317760817e-06, "loss": 0.5874, "step": 110 }, { "epoch": 0.10944047325610057, "grad_norm": 0.18646892654806063, "learning_rate": 9.96997197828866e-06, "loss": 0.5855, "step": 111 }, { "epoch": 0.11042642346561499, "grad_norm": 0.18808130693791184, "learning_rate": 9.969403320623883e-06, "loss": 0.5762, "step": 112 }, { "epoch": 0.1114123736751294, "grad_norm": 0.18786557009902174, "learning_rate": 9.968829345374988e-06, "loss": 0.5782, "step": 113 }, { "epoch": 0.11239832388464382, "grad_norm": 0.19710354354314336, "learning_rate": 9.968250053156165e-06, "loss": 0.5636, "step": 114 }, { "epoch": 0.11338427409415824, "grad_norm": 0.19653609173201655, "learning_rate": 9.967665444587298e-06, "loss": 0.5575, "step": 115 }, { "epoch": 0.11437022430367266, "grad_norm": 0.1934871588149603, "learning_rate": 9.96707552029395e-06, "loss": 0.5856, "step": 116 }, { "epoch": 0.11535617451318708, "grad_norm": 0.21484146881712957, "learning_rate": 9.966480280907383e-06, "loss": 0.5825, "step": 117 }, { "epoch": 0.1163421247227015, "grad_norm": 0.18883422828114707, "learning_rate": 9.965879727064538e-06, "loss": 0.5587, "step": 118 }, { "epoch": 0.11732807493221592, "grad_norm": 0.18961757693952455, "learning_rate": 9.965273859408052e-06, "loss": 0.5752, "step": 119 }, { "epoch": 0.11831402514173034, "grad_norm": 0.19297702343721243, "learning_rate": 9.964662678586235e-06, "loss": 0.5645, "step": 120 }, { "epoch": 0.11929997535124476, "grad_norm": 0.2304035112033173, "learning_rate": 9.964046185253098e-06, "loss": 0.5786, "step": 121 }, { "epoch": 0.12028592556075918, "grad_norm": 0.2506785823292142, "learning_rate": 9.963424380068324e-06, "loss": 0.583, "step": 122 }, { "epoch": 0.1212718757702736, "grad_norm": 0.20430322582417668, "learning_rate": 9.962797263697286e-06, "loss": 0.596, "step": 123 }, { "epoch": 0.12225782597978802, "grad_norm": 0.19713747960925884, "learning_rate": 9.96216483681104e-06, "loss": 0.5596, "step": 124 }, { "epoch": 0.12324377618930243, "grad_norm": 0.195136749034251, "learning_rate": 9.961527100086323e-06, "loss": 0.5739, "step": 125 }, { "epoch": 0.12422972639881685, "grad_norm": 0.18181708293146226, "learning_rate": 9.960884054205556e-06, "loss": 0.5757, "step": 126 }, { "epoch": 0.12521567660833127, "grad_norm": 0.20850687238386326, "learning_rate": 9.960235699856838e-06, "loss": 0.5786, "step": 127 }, { "epoch": 0.1262016268178457, "grad_norm": 0.6952482779983634, "learning_rate": 9.959582037733952e-06, "loss": 0.5802, "step": 128 }, { "epoch": 0.1271875770273601, "grad_norm": 0.1950690815783342, "learning_rate": 9.958923068536356e-06, "loss": 0.6032, "step": 129 }, { "epoch": 0.12817352723687453, "grad_norm": 0.19926639507294414, "learning_rate": 9.958258792969195e-06, "loss": 0.5768, "step": 130 }, { "epoch": 0.12915947744638895, "grad_norm": 0.46729534031431685, "learning_rate": 9.95758921174328e-06, "loss": 0.5646, "step": 131 }, { "epoch": 0.13014542765590337, "grad_norm": 0.21133226011241232, "learning_rate": 9.956914325575114e-06, "loss": 0.5527, "step": 132 }, { "epoch": 0.1311313778654178, "grad_norm": 0.19752264024958172, "learning_rate": 9.956234135186864e-06, "loss": 0.5817, "step": 133 }, { "epoch": 0.1321173280749322, "grad_norm": 0.43869306042549744, "learning_rate": 9.955548641306379e-06, "loss": 0.5625, "step": 134 }, { "epoch": 0.13310327828444662, "grad_norm": 0.20043630296357293, "learning_rate": 9.95485784466718e-06, "loss": 0.5724, "step": 135 }, { "epoch": 0.13408922849396104, "grad_norm": 0.19166523534205976, "learning_rate": 9.954161746008468e-06, "loss": 0.578, "step": 136 }, { "epoch": 0.13507517870347546, "grad_norm": 0.1915364173625567, "learning_rate": 9.953460346075112e-06, "loss": 0.5667, "step": 137 }, { "epoch": 0.13606112891298988, "grad_norm": 0.18716037179875467, "learning_rate": 9.952753645617656e-06, "loss": 0.5699, "step": 138 }, { "epoch": 0.1370470791225043, "grad_norm": 0.19586717678531615, "learning_rate": 9.952041645392313e-06, "loss": 0.5787, "step": 139 }, { "epoch": 0.13803302933201872, "grad_norm": 0.2144468317967703, "learning_rate": 9.951324346160973e-06, "loss": 0.5878, "step": 140 }, { "epoch": 0.13901897954153317, "grad_norm": 0.24086052677356343, "learning_rate": 9.95060174869119e-06, "loss": 0.5489, "step": 141 }, { "epoch": 0.14000492975104759, "grad_norm": 0.298794931994396, "learning_rate": 9.94987385375619e-06, "loss": 0.5677, "step": 142 }, { "epoch": 0.140990879960562, "grad_norm": 0.1915597370513195, "learning_rate": 9.949140662134873e-06, "loss": 0.5816, "step": 143 }, { "epoch": 0.14197683017007642, "grad_norm": 0.18685365109490332, "learning_rate": 9.948402174611795e-06, "loss": 0.5694, "step": 144 }, { "epoch": 0.14296278037959084, "grad_norm": 0.19692674211623318, "learning_rate": 9.947658391977188e-06, "loss": 0.567, "step": 145 }, { "epoch": 0.14394873058910526, "grad_norm": 0.1874193951834725, "learning_rate": 9.94690931502695e-06, "loss": 0.5965, "step": 146 }, { "epoch": 0.14493468079861968, "grad_norm": 0.19259727731903561, "learning_rate": 9.946154944562637e-06, "loss": 0.5698, "step": 147 }, { "epoch": 0.1459206310081341, "grad_norm": 0.9856315571854328, "learning_rate": 9.945395281391478e-06, "loss": 0.5816, "step": 148 }, { "epoch": 0.14690658121764852, "grad_norm": 0.19217330617850037, "learning_rate": 9.944630326326362e-06, "loss": 0.5604, "step": 149 }, { "epoch": 0.14789253142716294, "grad_norm": 0.18772651400583082, "learning_rate": 9.94386008018584e-06, "loss": 0.573, "step": 150 }, { "epoch": 0.14887848163667736, "grad_norm": 0.5746457009545806, "learning_rate": 9.943084543794123e-06, "loss": 0.578, "step": 151 }, { "epoch": 0.14986443184619178, "grad_norm": 0.20766753738887306, "learning_rate": 9.94230371798109e-06, "loss": 0.5667, "step": 152 }, { "epoch": 0.1508503820557062, "grad_norm": 0.2004304147804513, "learning_rate": 9.94151760358227e-06, "loss": 0.5571, "step": 153 }, { "epoch": 0.1518363322652206, "grad_norm": 0.20455822399303228, "learning_rate": 9.940726201438862e-06, "loss": 0.5672, "step": 154 }, { "epoch": 0.15282228247473503, "grad_norm": 0.1838803288134749, "learning_rate": 9.939929512397715e-06, "loss": 0.5987, "step": 155 }, { "epoch": 0.15380823268424945, "grad_norm": 0.20050980222504577, "learning_rate": 9.939127537311337e-06, "loss": 0.5699, "step": 156 }, { "epoch": 0.15479418289376387, "grad_norm": 0.19441962825480735, "learning_rate": 9.938320277037893e-06, "loss": 0.5628, "step": 157 }, { "epoch": 0.1557801331032783, "grad_norm": 0.19936545332740366, "learning_rate": 9.937507732441206e-06, "loss": 0.5605, "step": 158 }, { "epoch": 0.1567660833127927, "grad_norm": 1.309212076590478, "learning_rate": 9.93668990439075e-06, "loss": 0.5693, "step": 159 }, { "epoch": 0.15775203352230713, "grad_norm": 0.18988324352303748, "learning_rate": 9.935866793761656e-06, "loss": 0.5676, "step": 160 }, { "epoch": 0.15873798373182155, "grad_norm": 0.19272518366856584, "learning_rate": 9.935038401434702e-06, "loss": 0.5371, "step": 161 }, { "epoch": 0.15972393394133597, "grad_norm": 0.18550671681247388, "learning_rate": 9.934204728296324e-06, "loss": 0.5753, "step": 162 }, { "epoch": 0.16070988415085039, "grad_norm": 0.19249215809324505, "learning_rate": 9.933365775238609e-06, "loss": 0.5609, "step": 163 }, { "epoch": 0.1616958343603648, "grad_norm": 0.20411023032602532, "learning_rate": 9.932521543159285e-06, "loss": 0.5531, "step": 164 }, { "epoch": 0.16268178456987922, "grad_norm": 0.19335472696117534, "learning_rate": 9.931672032961742e-06, "loss": 0.5469, "step": 165 }, { "epoch": 0.16366773477939364, "grad_norm": 0.22420301199053938, "learning_rate": 9.930817245555007e-06, "loss": 0.5734, "step": 166 }, { "epoch": 0.16465368498890806, "grad_norm": 0.2025719628644905, "learning_rate": 9.929957181853758e-06, "loss": 0.5653, "step": 167 }, { "epoch": 0.16563963519842248, "grad_norm": 0.17704783553055983, "learning_rate": 9.92909184277832e-06, "loss": 0.5461, "step": 168 }, { "epoch": 0.1666255854079369, "grad_norm": 0.18847786991320487, "learning_rate": 9.928221229254661e-06, "loss": 0.5734, "step": 169 }, { "epoch": 0.16761153561745132, "grad_norm": 0.19669314062323434, "learning_rate": 9.927345342214398e-06, "loss": 0.5465, "step": 170 }, { "epoch": 0.16859748582696574, "grad_norm": 0.19105027030988314, "learning_rate": 9.926464182594781e-06, "loss": 0.5818, "step": 171 }, { "epoch": 0.16958343603648016, "grad_norm": 0.1874441302590153, "learning_rate": 9.925577751338711e-06, "loss": 0.5506, "step": 172 }, { "epoch": 0.17056938624599458, "grad_norm": 0.17967781696372298, "learning_rate": 9.924686049394728e-06, "loss": 0.5244, "step": 173 }, { "epoch": 0.171555336455509, "grad_norm": 0.19009914054945548, "learning_rate": 9.923789077717007e-06, "loss": 0.5577, "step": 174 }, { "epoch": 0.1725412866650234, "grad_norm": 0.1931608565336059, "learning_rate": 9.922886837265371e-06, "loss": 0.5638, "step": 175 }, { "epoch": 0.17352723687453783, "grad_norm": 0.26666031921399536, "learning_rate": 9.921979329005271e-06, "loss": 0.5409, "step": 176 }, { "epoch": 0.17451318708405225, "grad_norm": 0.20832489811919228, "learning_rate": 9.921066553907803e-06, "loss": 0.5732, "step": 177 }, { "epoch": 0.17549913729356667, "grad_norm": 0.18434435545608718, "learning_rate": 9.920148512949697e-06, "loss": 0.5578, "step": 178 }, { "epoch": 0.1764850875030811, "grad_norm": 0.17758143999650278, "learning_rate": 9.919225207113313e-06, "loss": 0.5638, "step": 179 }, { "epoch": 0.1774710377125955, "grad_norm": 0.1904346420359084, "learning_rate": 9.918296637386648e-06, "loss": 0.5635, "step": 180 }, { "epoch": 0.17845698792210993, "grad_norm": 0.20043556886204436, "learning_rate": 9.917362804763334e-06, "loss": 0.5821, "step": 181 }, { "epoch": 0.17944293813162435, "grad_norm": 0.3301171602767553, "learning_rate": 9.91642371024263e-06, "loss": 0.5734, "step": 182 }, { "epoch": 0.18042888834113877, "grad_norm": 0.6608374464890221, "learning_rate": 9.915479354829433e-06, "loss": 0.5758, "step": 183 }, { "epoch": 0.18141483855065318, "grad_norm": 0.18220942279149702, "learning_rate": 9.91452973953426e-06, "loss": 0.5357, "step": 184 }, { "epoch": 0.1824007887601676, "grad_norm": 0.2203525160795268, "learning_rate": 9.913574865373264e-06, "loss": 0.5541, "step": 185 }, { "epoch": 0.18338673896968202, "grad_norm": 0.18496823067115975, "learning_rate": 9.912614733368218e-06, "loss": 0.5693, "step": 186 }, { "epoch": 0.18437268917919644, "grad_norm": 0.5019587689639134, "learning_rate": 9.91164934454653e-06, "loss": 0.5627, "step": 187 }, { "epoch": 0.18535863938871086, "grad_norm": 0.2151624048884641, "learning_rate": 9.910678699941227e-06, "loss": 0.5584, "step": 188 }, { "epoch": 0.18634458959822528, "grad_norm": 0.19157867884160143, "learning_rate": 9.90970280059096e-06, "loss": 0.5463, "step": 189 }, { "epoch": 0.1873305398077397, "grad_norm": 0.19102624119101116, "learning_rate": 9.90872164754001e-06, "loss": 0.5479, "step": 190 }, { "epoch": 0.18831649001725412, "grad_norm": 0.18358944790975948, "learning_rate": 9.907735241838268e-06, "loss": 0.5508, "step": 191 }, { "epoch": 0.18930244022676854, "grad_norm": 0.17766251380399775, "learning_rate": 9.906743584541256e-06, "loss": 0.5637, "step": 192 }, { "epoch": 0.19028839043628296, "grad_norm": 0.18203909053982936, "learning_rate": 9.90574667671011e-06, "loss": 0.5486, "step": 193 }, { "epoch": 0.19127434064579737, "grad_norm": 0.18077750869311915, "learning_rate": 9.904744519411588e-06, "loss": 0.5719, "step": 194 }, { "epoch": 0.1922602908553118, "grad_norm": 0.20529393815616548, "learning_rate": 9.903737113718062e-06, "loss": 0.5683, "step": 195 }, { "epoch": 0.1932462410648262, "grad_norm": 0.17841515699100288, "learning_rate": 9.90272446070752e-06, "loss": 0.5439, "step": 196 }, { "epoch": 0.19423219127434063, "grad_norm": 0.18054547942202176, "learning_rate": 9.90170656146357e-06, "loss": 0.5568, "step": 197 }, { "epoch": 0.19521814148385508, "grad_norm": 0.32374590769366285, "learning_rate": 9.900683417075427e-06, "loss": 0.5561, "step": 198 }, { "epoch": 0.1962040916933695, "grad_norm": 0.19010714932520278, "learning_rate": 9.899655028637924e-06, "loss": 0.5421, "step": 199 }, { "epoch": 0.19719004190288392, "grad_norm": 0.1754484113927627, "learning_rate": 9.898621397251503e-06, "loss": 0.5575, "step": 200 }, { "epoch": 0.19817599211239834, "grad_norm": 0.1858887349509897, "learning_rate": 9.897582524022216e-06, "loss": 0.5396, "step": 201 }, { "epoch": 0.19916194232191275, "grad_norm": 0.1737758449718213, "learning_rate": 9.896538410061724e-06, "loss": 0.5496, "step": 202 }, { "epoch": 0.20014789253142717, "grad_norm": 0.19356547299150076, "learning_rate": 9.895489056487298e-06, "loss": 0.5549, "step": 203 }, { "epoch": 0.2011338427409416, "grad_norm": 0.1776417983268236, "learning_rate": 9.894434464421817e-06, "loss": 0.5696, "step": 204 }, { "epoch": 0.202119792950456, "grad_norm": 0.18109120205294296, "learning_rate": 9.893374634993756e-06, "loss": 0.5504, "step": 205 }, { "epoch": 0.20310574315997043, "grad_norm": 0.18540548568684925, "learning_rate": 9.892309569337208e-06, "loss": 0.5532, "step": 206 }, { "epoch": 0.20409169336948485, "grad_norm": 0.17913710095312485, "learning_rate": 9.891239268591858e-06, "loss": 0.5504, "step": 207 }, { "epoch": 0.20507764357899927, "grad_norm": 0.18027765377069868, "learning_rate": 9.890163733903003e-06, "loss": 0.5569, "step": 208 }, { "epoch": 0.2060635937885137, "grad_norm": 0.18205724705660326, "learning_rate": 9.889082966421529e-06, "loss": 0.5585, "step": 209 }, { "epoch": 0.2070495439980281, "grad_norm": 0.1892943540336806, "learning_rate": 9.887996967303928e-06, "loss": 0.561, "step": 210 }, { "epoch": 0.20803549420754253, "grad_norm": 0.352372345851401, "learning_rate": 9.88690573771229e-06, "loss": 0.5594, "step": 211 }, { "epoch": 0.20902144441705695, "grad_norm": 0.17451224769434992, "learning_rate": 9.885809278814307e-06, "loss": 0.5637, "step": 212 }, { "epoch": 0.21000739462657136, "grad_norm": 0.19118016535636867, "learning_rate": 9.884707591783253e-06, "loss": 0.5532, "step": 213 }, { "epoch": 0.21099334483608578, "grad_norm": 0.17955764775368546, "learning_rate": 9.88360067779801e-06, "loss": 0.5423, "step": 214 }, { "epoch": 0.2119792950456002, "grad_norm": 0.19225335942620098, "learning_rate": 9.882488538043044e-06, "loss": 0.5498, "step": 215 }, { "epoch": 0.21296524525511462, "grad_norm": 0.17835521350051764, "learning_rate": 9.881371173708421e-06, "loss": 0.565, "step": 216 }, { "epoch": 0.21395119546462904, "grad_norm": 0.17648927779006615, "learning_rate": 9.88024858598979e-06, "loss": 0.5699, "step": 217 }, { "epoch": 0.21493714567414346, "grad_norm": 0.23186770896350944, "learning_rate": 9.879120776088396e-06, "loss": 0.5336, "step": 218 }, { "epoch": 0.21592309588365788, "grad_norm": 0.1865552812650371, "learning_rate": 9.877987745211065e-06, "loss": 0.5603, "step": 219 }, { "epoch": 0.2169090460931723, "grad_norm": 0.187354863042195, "learning_rate": 9.876849494570216e-06, "loss": 0.5811, "step": 220 }, { "epoch": 0.21789499630268672, "grad_norm": 0.17681306208150294, "learning_rate": 9.87570602538385e-06, "loss": 0.5486, "step": 221 }, { "epoch": 0.21888094651220114, "grad_norm": 0.19341784130459566, "learning_rate": 9.874557338875554e-06, "loss": 0.5473, "step": 222 }, { "epoch": 0.21986689672171555, "grad_norm": 0.18861128025094429, "learning_rate": 9.873403436274495e-06, "loss": 0.5463, "step": 223 }, { "epoch": 0.22085284693122997, "grad_norm": 0.18384453190517155, "learning_rate": 9.872244318815428e-06, "loss": 0.5435, "step": 224 }, { "epoch": 0.2218387971407444, "grad_norm": 0.18217399013944716, "learning_rate": 9.871079987738681e-06, "loss": 0.5576, "step": 225 }, { "epoch": 0.2228247473502588, "grad_norm": 0.18430432846747724, "learning_rate": 9.869910444290162e-06, "loss": 0.5578, "step": 226 }, { "epoch": 0.22381069755977323, "grad_norm": 0.18089824279506567, "learning_rate": 9.868735689721363e-06, "loss": 0.5618, "step": 227 }, { "epoch": 0.22479664776928765, "grad_norm": 0.17403827133499156, "learning_rate": 9.867555725289344e-06, "loss": 0.5654, "step": 228 }, { "epoch": 0.22578259797880207, "grad_norm": 0.18781609637012886, "learning_rate": 9.866370552256746e-06, "loss": 0.5405, "step": 229 }, { "epoch": 0.2267685481883165, "grad_norm": 0.1762793424800406, "learning_rate": 9.865180171891778e-06, "loss": 0.5504, "step": 230 }, { "epoch": 0.2277544983978309, "grad_norm": 0.18853685496248657, "learning_rate": 9.863984585468226e-06, "loss": 0.5503, "step": 231 }, { "epoch": 0.22874044860734533, "grad_norm": 0.1823533722312669, "learning_rate": 9.862783794265448e-06, "loss": 0.5607, "step": 232 }, { "epoch": 0.22972639881685974, "grad_norm": 0.18201466636582803, "learning_rate": 9.861577799568364e-06, "loss": 0.5387, "step": 233 }, { "epoch": 0.23071234902637416, "grad_norm": 0.1862180638822258, "learning_rate": 9.860366602667469e-06, "loss": 0.5594, "step": 234 }, { "epoch": 0.23169829923588858, "grad_norm": 0.1755168236218862, "learning_rate": 9.85915020485882e-06, "loss": 0.53, "step": 235 }, { "epoch": 0.232684249445403, "grad_norm": 0.1824850169156925, "learning_rate": 9.857928607444045e-06, "loss": 0.5385, "step": 236 }, { "epoch": 0.23367019965491742, "grad_norm": 0.18537873529999935, "learning_rate": 9.85670181173033e-06, "loss": 0.5407, "step": 237 }, { "epoch": 0.23465614986443184, "grad_norm": 0.20704735012455386, "learning_rate": 9.855469819030425e-06, "loss": 0.558, "step": 238 }, { "epoch": 0.23564210007394626, "grad_norm": 0.18429423614956533, "learning_rate": 9.854232630662647e-06, "loss": 0.5399, "step": 239 }, { "epoch": 0.23662805028346068, "grad_norm": 0.19422829306134085, "learning_rate": 9.852990247950863e-06, "loss": 0.5682, "step": 240 }, { "epoch": 0.2376140004929751, "grad_norm": 0.1913164213893665, "learning_rate": 9.851742672224506e-06, "loss": 0.5513, "step": 241 }, { "epoch": 0.23859995070248952, "grad_norm": 0.1788406214546875, "learning_rate": 9.850489904818561e-06, "loss": 0.5419, "step": 242 }, { "epoch": 0.23958590091200394, "grad_norm": 0.18794434492520137, "learning_rate": 9.849231947073571e-06, "loss": 0.5604, "step": 243 }, { "epoch": 0.24057185112151835, "grad_norm": 0.18685472437860745, "learning_rate": 9.847968800335635e-06, "loss": 0.5419, "step": 244 }, { "epoch": 0.24155780133103277, "grad_norm": 0.18174774064309807, "learning_rate": 9.846700465956399e-06, "loss": 0.562, "step": 245 }, { "epoch": 0.2425437515405472, "grad_norm": 0.1824025047659135, "learning_rate": 9.845426945293064e-06, "loss": 0.5365, "step": 246 }, { "epoch": 0.2435297017500616, "grad_norm": 0.23426015842358192, "learning_rate": 9.84414823970838e-06, "loss": 0.5302, "step": 247 }, { "epoch": 0.24451565195957603, "grad_norm": 0.1931454250333834, "learning_rate": 9.842864350570645e-06, "loss": 0.5161, "step": 248 }, { "epoch": 0.24550160216909045, "grad_norm": 0.18283942948337056, "learning_rate": 9.8415752792537e-06, "loss": 0.5548, "step": 249 }, { "epoch": 0.24648755237860487, "grad_norm": 0.19154911738206512, "learning_rate": 9.840281027136943e-06, "loss": 0.5597, "step": 250 }, { "epoch": 0.2474735025881193, "grad_norm": 0.18088172165095986, "learning_rate": 9.838981595605301e-06, "loss": 0.5592, "step": 251 }, { "epoch": 0.2484594527976337, "grad_norm": 0.19847882231986305, "learning_rate": 9.837676986049253e-06, "loss": 0.5424, "step": 252 }, { "epoch": 0.24944540300714813, "grad_norm": 0.190254367466553, "learning_rate": 9.836367199864814e-06, "loss": 0.5675, "step": 253 }, { "epoch": 0.25043135321666254, "grad_norm": 0.1863957829966417, "learning_rate": 9.835052238453543e-06, "loss": 0.5467, "step": 254 }, { "epoch": 0.25141730342617696, "grad_norm": 0.1895012556382713, "learning_rate": 9.833732103222531e-06, "loss": 0.5455, "step": 255 }, { "epoch": 0.2524032536356914, "grad_norm": 0.18702548146488163, "learning_rate": 9.832406795584412e-06, "loss": 0.5611, "step": 256 }, { "epoch": 0.2533892038452058, "grad_norm": 0.1918046148589585, "learning_rate": 9.831076316957348e-06, "loss": 0.5472, "step": 257 }, { "epoch": 0.2543751540547202, "grad_norm": 0.19638448364666938, "learning_rate": 9.829740668765037e-06, "loss": 0.5548, "step": 258 }, { "epoch": 0.25536110426423464, "grad_norm": 0.18213286342958498, "learning_rate": 9.828399852436714e-06, "loss": 0.5433, "step": 259 }, { "epoch": 0.25634705447374906, "grad_norm": 0.20040305338002562, "learning_rate": 9.827053869407134e-06, "loss": 0.5809, "step": 260 }, { "epoch": 0.2573330046832635, "grad_norm": 0.17823037993061608, "learning_rate": 9.825702721116587e-06, "loss": 0.5659, "step": 261 }, { "epoch": 0.2583189548927779, "grad_norm": 0.1906176349922938, "learning_rate": 9.824346409010895e-06, "loss": 0.5542, "step": 262 }, { "epoch": 0.2593049051022923, "grad_norm": 0.18236764853176807, "learning_rate": 9.822984934541393e-06, "loss": 0.5422, "step": 263 }, { "epoch": 0.26029085531180673, "grad_norm": 0.3446126733000217, "learning_rate": 9.821618299164953e-06, "loss": 0.5439, "step": 264 }, { "epoch": 0.26127680552132115, "grad_norm": 0.20311944757484177, "learning_rate": 9.820246504343958e-06, "loss": 0.5606, "step": 265 }, { "epoch": 0.2622627557308356, "grad_norm": 0.1799513865731101, "learning_rate": 9.818869551546319e-06, "loss": 0.5256, "step": 266 }, { "epoch": 0.26324870594035, "grad_norm": 0.18272493636473558, "learning_rate": 9.817487442245468e-06, "loss": 0.576, "step": 267 }, { "epoch": 0.2642346561498644, "grad_norm": 0.21962884787364956, "learning_rate": 9.816100177920349e-06, "loss": 0.5376, "step": 268 }, { "epoch": 0.26522060635937883, "grad_norm": 0.19015374963011628, "learning_rate": 9.814707760055427e-06, "loss": 0.5398, "step": 269 }, { "epoch": 0.26620655656889325, "grad_norm": 0.20512522933688232, "learning_rate": 9.813310190140676e-06, "loss": 0.5708, "step": 270 }, { "epoch": 0.26719250677840767, "grad_norm": 0.1807567800669277, "learning_rate": 9.81190746967159e-06, "loss": 0.5646, "step": 271 }, { "epoch": 0.2681784569879221, "grad_norm": 0.2891229320005016, "learning_rate": 9.810499600149166e-06, "loss": 0.5372, "step": 272 }, { "epoch": 0.2691644071974365, "grad_norm": 0.1806821909652169, "learning_rate": 9.809086583079923e-06, "loss": 0.5237, "step": 273 }, { "epoch": 0.2701503574069509, "grad_norm": 0.18982968914898915, "learning_rate": 9.807668419975876e-06, "loss": 0.5571, "step": 274 }, { "epoch": 0.27113630761646534, "grad_norm": 0.20260892553853443, "learning_rate": 9.806245112354552e-06, "loss": 0.5324, "step": 275 }, { "epoch": 0.27212225782597976, "grad_norm": 0.18353665591257465, "learning_rate": 9.804816661738984e-06, "loss": 0.5298, "step": 276 }, { "epoch": 0.2731082080354942, "grad_norm": 0.1776571164680753, "learning_rate": 9.803383069657706e-06, "loss": 0.5418, "step": 277 }, { "epoch": 0.2740941582450086, "grad_norm": 0.1870562181329872, "learning_rate": 9.801944337644755e-06, "loss": 0.5607, "step": 278 }, { "epoch": 0.275080108454523, "grad_norm": 0.20423434175503252, "learning_rate": 9.800500467239666e-06, "loss": 0.5466, "step": 279 }, { "epoch": 0.27606605866403744, "grad_norm": 1.051465429104708, "learning_rate": 9.799051459987478e-06, "loss": 0.556, "step": 280 }, { "epoch": 0.27705200887355186, "grad_norm": 0.21609582147081427, "learning_rate": 9.797597317438719e-06, "loss": 0.5555, "step": 281 }, { "epoch": 0.27803795908306633, "grad_norm": 0.1876184929215964, "learning_rate": 9.796138041149416e-06, "loss": 0.5652, "step": 282 }, { "epoch": 0.27902390929258075, "grad_norm": 0.18576235687638093, "learning_rate": 9.794673632681093e-06, "loss": 0.5323, "step": 283 }, { "epoch": 0.28000985950209517, "grad_norm": 0.18728666616264814, "learning_rate": 9.793204093600758e-06, "loss": 0.5513, "step": 284 }, { "epoch": 0.2809958097116096, "grad_norm": 0.34552956735847234, "learning_rate": 9.791729425480917e-06, "loss": 0.5436, "step": 285 }, { "epoch": 0.281981759921124, "grad_norm": 0.1892052352760262, "learning_rate": 9.790249629899555e-06, "loss": 0.5698, "step": 286 }, { "epoch": 0.28296771013063843, "grad_norm": 0.18248732304052728, "learning_rate": 9.788764708440154e-06, "loss": 0.5355, "step": 287 }, { "epoch": 0.28395366034015285, "grad_norm": 0.1985966071161362, "learning_rate": 9.787274662691677e-06, "loss": 0.5446, "step": 288 }, { "epoch": 0.28493961054966727, "grad_norm": 0.18044086283569283, "learning_rate": 9.785779494248566e-06, "loss": 0.5313, "step": 289 }, { "epoch": 0.2859255607591817, "grad_norm": 0.1827212667367703, "learning_rate": 9.784279204710751e-06, "loss": 0.5566, "step": 290 }, { "epoch": 0.2869115109686961, "grad_norm": 0.20487472968156348, "learning_rate": 9.782773795683638e-06, "loss": 0.5467, "step": 291 }, { "epoch": 0.2878974611782105, "grad_norm": 0.17915713143387146, "learning_rate": 9.781263268778112e-06, "loss": 0.5555, "step": 292 }, { "epoch": 0.28888341138772494, "grad_norm": 0.28311912335362205, "learning_rate": 9.779747625610536e-06, "loss": 0.5331, "step": 293 }, { "epoch": 0.28986936159723936, "grad_norm": 0.19046021912368227, "learning_rate": 9.778226867802748e-06, "loss": 0.5458, "step": 294 }, { "epoch": 0.2908553118067538, "grad_norm": 0.1949435939962373, "learning_rate": 9.776700996982054e-06, "loss": 0.5417, "step": 295 }, { "epoch": 0.2918412620162682, "grad_norm": 0.17508894414087878, "learning_rate": 9.775170014781235e-06, "loss": 0.5303, "step": 296 }, { "epoch": 0.2928272122257826, "grad_norm": 0.186462145746406, "learning_rate": 9.773633922838545e-06, "loss": 0.5335, "step": 297 }, { "epoch": 0.29381316243529704, "grad_norm": 0.18907878327589916, "learning_rate": 9.772092722797699e-06, "loss": 0.5417, "step": 298 }, { "epoch": 0.29479911264481146, "grad_norm": 0.18392737568551656, "learning_rate": 9.770546416307883e-06, "loss": 0.5471, "step": 299 }, { "epoch": 0.2957850628543259, "grad_norm": 0.19021871221942088, "learning_rate": 9.768995005023743e-06, "loss": 0.5446, "step": 300 }, { "epoch": 0.2967710130638403, "grad_norm": 0.18497373304429987, "learning_rate": 9.76743849060539e-06, "loss": 0.5287, "step": 301 }, { "epoch": 0.2977569632733547, "grad_norm": 0.18659210300540374, "learning_rate": 9.765876874718399e-06, "loss": 0.5639, "step": 302 }, { "epoch": 0.29874291348286913, "grad_norm": 0.17894062952145798, "learning_rate": 9.764310159033797e-06, "loss": 0.5553, "step": 303 }, { "epoch": 0.29972886369238355, "grad_norm": 0.19551203985949925, "learning_rate": 9.76273834522807e-06, "loss": 0.5806, "step": 304 }, { "epoch": 0.30071481390189797, "grad_norm": 0.18731748927278455, "learning_rate": 9.761161434983166e-06, "loss": 0.5663, "step": 305 }, { "epoch": 0.3017007641114124, "grad_norm": 0.17968426298994367, "learning_rate": 9.759579429986479e-06, "loss": 0.5437, "step": 306 }, { "epoch": 0.3026867143209268, "grad_norm": 0.22204292376132498, "learning_rate": 9.757992331930855e-06, "loss": 0.5432, "step": 307 }, { "epoch": 0.3036726645304412, "grad_norm": 0.18184753314796798, "learning_rate": 9.756400142514593e-06, "loss": 0.548, "step": 308 }, { "epoch": 0.30465861473995565, "grad_norm": 0.20647974009986644, "learning_rate": 9.754802863441441e-06, "loss": 0.5623, "step": 309 }, { "epoch": 0.30564456494947007, "grad_norm": 0.18725507073578004, "learning_rate": 9.75320049642059e-06, "loss": 0.5568, "step": 310 }, { "epoch": 0.3066305151589845, "grad_norm": 0.1800455366542249, "learning_rate": 9.751593043166673e-06, "loss": 0.5331, "step": 311 }, { "epoch": 0.3076164653684989, "grad_norm": 0.1841426520536834, "learning_rate": 9.749980505399777e-06, "loss": 0.5407, "step": 312 }, { "epoch": 0.3086024155780133, "grad_norm": 0.19659367711913434, "learning_rate": 9.748362884845417e-06, "loss": 0.5686, "step": 313 }, { "epoch": 0.30958836578752774, "grad_norm": 0.17269369509189947, "learning_rate": 9.74674018323455e-06, "loss": 0.5309, "step": 314 }, { "epoch": 0.31057431599704216, "grad_norm": 0.18860615825895338, "learning_rate": 9.745112402303577e-06, "loss": 0.5358, "step": 315 }, { "epoch": 0.3115602662065566, "grad_norm": 0.18729708383811847, "learning_rate": 9.74347954379433e-06, "loss": 0.5558, "step": 316 }, { "epoch": 0.312546216416071, "grad_norm": 0.1763617382688214, "learning_rate": 9.741841609454067e-06, "loss": 0.5358, "step": 317 }, { "epoch": 0.3135321666255854, "grad_norm": 0.17979141365552667, "learning_rate": 9.740198601035489e-06, "loss": 0.5501, "step": 318 }, { "epoch": 0.31451811683509984, "grad_norm": 0.1803838126614993, "learning_rate": 9.738550520296722e-06, "loss": 0.5325, "step": 319 }, { "epoch": 0.31550406704461426, "grad_norm": 0.18884339503707848, "learning_rate": 9.736897369001315e-06, "loss": 0.5674, "step": 320 }, { "epoch": 0.3164900172541287, "grad_norm": 0.1865791726288368, "learning_rate": 9.735239148918251e-06, "loss": 0.5402, "step": 321 }, { "epoch": 0.3174759674636431, "grad_norm": 0.18694529564515183, "learning_rate": 9.733575861821934e-06, "loss": 0.5589, "step": 322 }, { "epoch": 0.3184619176731575, "grad_norm": 0.20535318492014717, "learning_rate": 9.731907509492185e-06, "loss": 0.5424, "step": 323 }, { "epoch": 0.31944786788267193, "grad_norm": 0.2071518418750301, "learning_rate": 9.730234093714253e-06, "loss": 0.5575, "step": 324 }, { "epoch": 0.32043381809218635, "grad_norm": 0.17481253054592302, "learning_rate": 9.7285556162788e-06, "loss": 0.5409, "step": 325 }, { "epoch": 0.32141976830170077, "grad_norm": 0.18594393577387902, "learning_rate": 9.726872078981906e-06, "loss": 0.5412, "step": 326 }, { "epoch": 0.3224057185112152, "grad_norm": 0.19164197402550306, "learning_rate": 9.725183483625065e-06, "loss": 0.5555, "step": 327 }, { "epoch": 0.3233916687207296, "grad_norm": 0.186053937689582, "learning_rate": 9.723489832015183e-06, "loss": 0.5718, "step": 328 }, { "epoch": 0.324377618930244, "grad_norm": 0.1875853203551617, "learning_rate": 9.721791125964578e-06, "loss": 0.5575, "step": 329 }, { "epoch": 0.32536356913975845, "grad_norm": 0.18819870895985158, "learning_rate": 9.720087367290977e-06, "loss": 0.5328, "step": 330 }, { "epoch": 0.32634951934927287, "grad_norm": 0.18068715131695356, "learning_rate": 9.71837855781751e-06, "loss": 0.5326, "step": 331 }, { "epoch": 0.3273354695587873, "grad_norm": 0.18016311746756167, "learning_rate": 9.716664699372715e-06, "loss": 0.533, "step": 332 }, { "epoch": 0.3283214197683017, "grad_norm": 0.18207253599137066, "learning_rate": 9.714945793790534e-06, "loss": 0.5226, "step": 333 }, { "epoch": 0.3293073699778161, "grad_norm": 0.20785211521181185, "learning_rate": 9.713221842910304e-06, "loss": 0.5423, "step": 334 }, { "epoch": 0.33029332018733054, "grad_norm": 0.1845681139457901, "learning_rate": 9.711492848576765e-06, "loss": 0.536, "step": 335 }, { "epoch": 0.33127927039684496, "grad_norm": 0.21007691072498666, "learning_rate": 9.709758812640054e-06, "loss": 0.5516, "step": 336 }, { "epoch": 0.3322652206063594, "grad_norm": 0.1941130143828044, "learning_rate": 9.708019736955701e-06, "loss": 0.5592, "step": 337 }, { "epoch": 0.3332511708158738, "grad_norm": 0.18770453694072897, "learning_rate": 9.706275623384633e-06, "loss": 0.5508, "step": 338 }, { "epoch": 0.3342371210253882, "grad_norm": 0.18903620940691857, "learning_rate": 9.70452647379316e-06, "loss": 0.5311, "step": 339 }, { "epoch": 0.33522307123490264, "grad_norm": 0.18273765065473813, "learning_rate": 9.702772290052992e-06, "loss": 0.5313, "step": 340 }, { "epoch": 0.33620902144441706, "grad_norm": 0.1942003299384735, "learning_rate": 9.701013074041213e-06, "loss": 0.5437, "step": 341 }, { "epoch": 0.3371949716539315, "grad_norm": 0.3108557907368959, "learning_rate": 9.699248827640302e-06, "loss": 0.5572, "step": 342 }, { "epoch": 0.3381809218634459, "grad_norm": 0.1782316909211875, "learning_rate": 9.697479552738117e-06, "loss": 0.5283, "step": 343 }, { "epoch": 0.3391668720729603, "grad_norm": 0.18682895440603317, "learning_rate": 9.695705251227893e-06, "loss": 0.5473, "step": 344 }, { "epoch": 0.34015282228247473, "grad_norm": 0.18299858006665634, "learning_rate": 9.693925925008251e-06, "loss": 0.5547, "step": 345 }, { "epoch": 0.34113877249198915, "grad_norm": 0.19762058128533963, "learning_rate": 9.692141575983189e-06, "loss": 0.5417, "step": 346 }, { "epoch": 0.34212472270150357, "grad_norm": 0.2009208919552246, "learning_rate": 9.69035220606207e-06, "loss": 0.5515, "step": 347 }, { "epoch": 0.343110672911018, "grad_norm": 0.17113169383263752, "learning_rate": 9.68855781715964e-06, "loss": 0.5305, "step": 348 }, { "epoch": 0.3440966231205324, "grad_norm": 0.18805578103017412, "learning_rate": 9.686758411196009e-06, "loss": 0.5025, "step": 349 }, { "epoch": 0.3450825733300468, "grad_norm": 0.21176364012019433, "learning_rate": 9.68495399009666e-06, "loss": 0.5279, "step": 350 }, { "epoch": 0.34606852353956125, "grad_norm": 0.18507930814721, "learning_rate": 9.683144555792441e-06, "loss": 0.5272, "step": 351 }, { "epoch": 0.34705447374907566, "grad_norm": 0.18608103510728297, "learning_rate": 9.681330110219563e-06, "loss": 0.5352, "step": 352 }, { "epoch": 0.3480404239585901, "grad_norm": 0.1768125300825402, "learning_rate": 9.6795106553196e-06, "loss": 0.5548, "step": 353 }, { "epoch": 0.3490263741681045, "grad_norm": 0.19221618896369344, "learning_rate": 9.677686193039489e-06, "loss": 0.5351, "step": 354 }, { "epoch": 0.3500123243776189, "grad_norm": 0.17206225544086962, "learning_rate": 9.67585672533152e-06, "loss": 0.5293, "step": 355 }, { "epoch": 0.35099827458713334, "grad_norm": 0.17510162004878352, "learning_rate": 9.674022254153345e-06, "loss": 0.561, "step": 356 }, { "epoch": 0.35198422479664776, "grad_norm": 0.18027683086974733, "learning_rate": 9.672182781467967e-06, "loss": 0.5412, "step": 357 }, { "epoch": 0.3529701750061622, "grad_norm": 0.18051012010020695, "learning_rate": 9.670338309243738e-06, "loss": 0.5205, "step": 358 }, { "epoch": 0.3539561252156766, "grad_norm": 0.17824096043286206, "learning_rate": 9.668488839454367e-06, "loss": 0.5289, "step": 359 }, { "epoch": 0.354942075425191, "grad_norm": 0.1899269535230839, "learning_rate": 9.666634374078906e-06, "loss": 0.5399, "step": 360 }, { "epoch": 0.35592802563470544, "grad_norm": 0.2000896624953639, "learning_rate": 9.664774915101751e-06, "loss": 0.5316, "step": 361 }, { "epoch": 0.35691397584421986, "grad_norm": 0.17982429332089273, "learning_rate": 9.662910464512646e-06, "loss": 0.5256, "step": 362 }, { "epoch": 0.3578999260537343, "grad_norm": 0.18493042035953866, "learning_rate": 9.661041024306673e-06, "loss": 0.5602, "step": 363 }, { "epoch": 0.3588858762632487, "grad_norm": 0.17638637049875583, "learning_rate": 9.659166596484253e-06, "loss": 0.5302, "step": 364 }, { "epoch": 0.3598718264727631, "grad_norm": 0.18076729211946937, "learning_rate": 9.65728718305115e-06, "loss": 0.5502, "step": 365 }, { "epoch": 0.36085777668227753, "grad_norm": 0.17444026167804935, "learning_rate": 9.655402786018455e-06, "loss": 0.553, "step": 366 }, { "epoch": 0.36184372689179195, "grad_norm": 0.17009498146673663, "learning_rate": 9.653513407402596e-06, "loss": 0.5143, "step": 367 }, { "epoch": 0.36282967710130637, "grad_norm": 0.21593875928225242, "learning_rate": 9.651619049225328e-06, "loss": 0.548, "step": 368 }, { "epoch": 0.3638156273108208, "grad_norm": 0.1842107390428221, "learning_rate": 9.649719713513742e-06, "loss": 0.5366, "step": 369 }, { "epoch": 0.3648015775203352, "grad_norm": 0.173489560161717, "learning_rate": 9.647815402300247e-06, "loss": 0.5181, "step": 370 }, { "epoch": 0.3657875277298496, "grad_norm": 0.18390441935466698, "learning_rate": 9.645906117622581e-06, "loss": 0.5163, "step": 371 }, { "epoch": 0.36677347793936405, "grad_norm": 0.22468546223912358, "learning_rate": 9.643991861523802e-06, "loss": 0.5262, "step": 372 }, { "epoch": 0.36775942814887846, "grad_norm": 0.1900161096137712, "learning_rate": 9.64207263605229e-06, "loss": 0.5667, "step": 373 }, { "epoch": 0.3687453783583929, "grad_norm": 0.18028388695730332, "learning_rate": 9.640148443261739e-06, "loss": 0.4991, "step": 374 }, { "epoch": 0.3697313285679073, "grad_norm": 0.1808083614104452, "learning_rate": 9.63821928521116e-06, "loss": 0.5323, "step": 375 }, { "epoch": 0.3707172787774217, "grad_norm": 0.18225409843311444, "learning_rate": 9.636285163964877e-06, "loss": 0.542, "step": 376 }, { "epoch": 0.37170322898693614, "grad_norm": 0.1800024905646102, "learning_rate": 9.634346081592527e-06, "loss": 0.5369, "step": 377 }, { "epoch": 0.37268917919645056, "grad_norm": 0.1878497922667102, "learning_rate": 9.632402040169055e-06, "loss": 0.5482, "step": 378 }, { "epoch": 0.373675129405965, "grad_norm": 0.2620013530339256, "learning_rate": 9.630453041774708e-06, "loss": 0.5514, "step": 379 }, { "epoch": 0.3746610796154794, "grad_norm": 0.18784755250804422, "learning_rate": 9.628499088495043e-06, "loss": 0.5375, "step": 380 }, { "epoch": 0.3756470298249938, "grad_norm": 0.1812376528970597, "learning_rate": 9.626540182420916e-06, "loss": 0.5456, "step": 381 }, { "epoch": 0.37663298003450824, "grad_norm": 0.17846141310959251, "learning_rate": 9.624576325648485e-06, "loss": 0.5393, "step": 382 }, { "epoch": 0.37761893024402265, "grad_norm": 0.17766955549073946, "learning_rate": 9.622607520279201e-06, "loss": 0.5307, "step": 383 }, { "epoch": 0.3786048804535371, "grad_norm": 0.1744935339508359, "learning_rate": 9.620633768419819e-06, "loss": 0.562, "step": 384 }, { "epoch": 0.3795908306630515, "grad_norm": 0.18099126482002742, "learning_rate": 9.618655072182376e-06, "loss": 0.5515, "step": 385 }, { "epoch": 0.3805767808725659, "grad_norm": 0.1757976737358108, "learning_rate": 9.616671433684208e-06, "loss": 0.5389, "step": 386 }, { "epoch": 0.38156273108208033, "grad_norm": 0.19447865829681363, "learning_rate": 9.614682855047938e-06, "loss": 0.5651, "step": 387 }, { "epoch": 0.38254868129159475, "grad_norm": 0.17219214625700813, "learning_rate": 9.612689338401472e-06, "loss": 0.5256, "step": 388 }, { "epoch": 0.38353463150110917, "grad_norm": 0.17220965615808365, "learning_rate": 9.610690885878002e-06, "loss": 0.5323, "step": 389 }, { "epoch": 0.3845205817106236, "grad_norm": 0.18737760870959425, "learning_rate": 9.608687499616005e-06, "loss": 0.5327, "step": 390 }, { "epoch": 0.385506531920138, "grad_norm": 0.1852281145185945, "learning_rate": 9.606679181759233e-06, "loss": 0.5255, "step": 391 }, { "epoch": 0.3864924821296524, "grad_norm": 0.18165566102447128, "learning_rate": 9.604665934456714e-06, "loss": 0.5546, "step": 392 }, { "epoch": 0.38747843233916685, "grad_norm": 0.17861142125772, "learning_rate": 9.602647759862756e-06, "loss": 0.5472, "step": 393 }, { "epoch": 0.38846438254868126, "grad_norm": 0.18792980682702926, "learning_rate": 9.600624660136937e-06, "loss": 0.5588, "step": 394 }, { "epoch": 0.38945033275819574, "grad_norm": 0.18451074189767946, "learning_rate": 9.598596637444101e-06, "loss": 0.5338, "step": 395 }, { "epoch": 0.39043628296771016, "grad_norm": 0.17115955881842462, "learning_rate": 9.59656369395437e-06, "loss": 0.5166, "step": 396 }, { "epoch": 0.3914222331772246, "grad_norm": 0.1819762184488862, "learning_rate": 9.594525831843122e-06, "loss": 0.5351, "step": 397 }, { "epoch": 0.392408183386739, "grad_norm": 0.17745975002475423, "learning_rate": 9.592483053291002e-06, "loss": 0.5409, "step": 398 }, { "epoch": 0.3933941335962534, "grad_norm": 0.17283712194035614, "learning_rate": 9.590435360483917e-06, "loss": 0.5296, "step": 399 }, { "epoch": 0.39438008380576783, "grad_norm": 0.1789037178138004, "learning_rate": 9.588382755613029e-06, "loss": 0.5506, "step": 400 }, { "epoch": 0.39536603401528225, "grad_norm": 0.1751593495300854, "learning_rate": 9.586325240874759e-06, "loss": 0.5411, "step": 401 }, { "epoch": 0.39635198422479667, "grad_norm": 0.17753137950472841, "learning_rate": 9.584262818470781e-06, "loss": 0.5553, "step": 402 }, { "epoch": 0.3973379344343111, "grad_norm": 0.17729009012114222, "learning_rate": 9.582195490608023e-06, "loss": 0.5139, "step": 403 }, { "epoch": 0.3983238846438255, "grad_norm": 0.17404500598409495, "learning_rate": 9.580123259498658e-06, "loss": 0.5367, "step": 404 }, { "epoch": 0.39930983485333993, "grad_norm": 0.18539611447949306, "learning_rate": 9.57804612736011e-06, "loss": 0.5049, "step": 405 }, { "epoch": 0.40029578506285435, "grad_norm": 0.16766267862954987, "learning_rate": 9.575964096415042e-06, "loss": 0.4945, "step": 406 }, { "epoch": 0.40128173527236877, "grad_norm": 0.3041454463230651, "learning_rate": 9.573877168891365e-06, "loss": 0.5319, "step": 407 }, { "epoch": 0.4022676854818832, "grad_norm": 0.1921318752247221, "learning_rate": 9.571785347022225e-06, "loss": 0.5181, "step": 408 }, { "epoch": 0.4032536356913976, "grad_norm": 0.1767934643651254, "learning_rate": 9.569688633046009e-06, "loss": 0.5612, "step": 409 }, { "epoch": 0.404239585900912, "grad_norm": 0.18885240184925087, "learning_rate": 9.567587029206335e-06, "loss": 0.5214, "step": 410 }, { "epoch": 0.40522553611042644, "grad_norm": 0.199447723801938, "learning_rate": 9.565480537752057e-06, "loss": 0.5379, "step": 411 }, { "epoch": 0.40621148631994086, "grad_norm": 0.3608200962774075, "learning_rate": 9.563369160937259e-06, "loss": 0.542, "step": 412 }, { "epoch": 0.4071974365294553, "grad_norm": 0.1724817976169056, "learning_rate": 9.561252901021247e-06, "loss": 0.5299, "step": 413 }, { "epoch": 0.4081833867389697, "grad_norm": 0.19258471068149405, "learning_rate": 9.55913176026856e-06, "loss": 0.5364, "step": 414 }, { "epoch": 0.4091693369484841, "grad_norm": 0.1847703010304863, "learning_rate": 9.557005740948954e-06, "loss": 0.5597, "step": 415 }, { "epoch": 0.41015528715799854, "grad_norm": 0.17240172918677668, "learning_rate": 9.55487484533741e-06, "loss": 0.5335, "step": 416 }, { "epoch": 0.41114123736751296, "grad_norm": 0.26185902201310524, "learning_rate": 9.552739075714125e-06, "loss": 0.5244, "step": 417 }, { "epoch": 0.4121271875770274, "grad_norm": 0.19211187561141485, "learning_rate": 9.550598434364507e-06, "loss": 0.5413, "step": 418 }, { "epoch": 0.4131131377865418, "grad_norm": 0.1781929568483824, "learning_rate": 9.548452923579186e-06, "loss": 0.5464, "step": 419 }, { "epoch": 0.4140990879960562, "grad_norm": 0.2427129508527206, "learning_rate": 9.546302545653994e-06, "loss": 0.544, "step": 420 }, { "epoch": 0.41508503820557063, "grad_norm": 0.171747906030444, "learning_rate": 9.544147302889977e-06, "loss": 0.5144, "step": 421 }, { "epoch": 0.41607098841508505, "grad_norm": 0.17907222130079994, "learning_rate": 9.541987197593385e-06, "loss": 0.536, "step": 422 }, { "epoch": 0.41705693862459947, "grad_norm": 0.18469819930878412, "learning_rate": 9.539822232075669e-06, "loss": 0.5462, "step": 423 }, { "epoch": 0.4180428888341139, "grad_norm": 0.1802968701146283, "learning_rate": 9.537652408653485e-06, "loss": 0.539, "step": 424 }, { "epoch": 0.4190288390436283, "grad_norm": 0.25158217355630896, "learning_rate": 9.535477729648683e-06, "loss": 0.5467, "step": 425 }, { "epoch": 0.42001478925314273, "grad_norm": 0.19205046887586966, "learning_rate": 9.533298197388313e-06, "loss": 0.5338, "step": 426 }, { "epoch": 0.42100073946265715, "grad_norm": 0.17906515816988378, "learning_rate": 9.531113814204611e-06, "loss": 0.5444, "step": 427 }, { "epoch": 0.42198668967217157, "grad_norm": 0.1813181727528524, "learning_rate": 9.528924582435015e-06, "loss": 0.5413, "step": 428 }, { "epoch": 0.422972639881686, "grad_norm": 0.20098658554703583, "learning_rate": 9.526730504422142e-06, "loss": 0.542, "step": 429 }, { "epoch": 0.4239585900912004, "grad_norm": 0.1845545628875529, "learning_rate": 9.524531582513797e-06, "loss": 0.5407, "step": 430 }, { "epoch": 0.4249445403007148, "grad_norm": 0.17365774613172688, "learning_rate": 9.522327819062971e-06, "loss": 0.5169, "step": 431 }, { "epoch": 0.42593049051022924, "grad_norm": 0.17486639323125394, "learning_rate": 9.520119216427832e-06, "loss": 0.5432, "step": 432 }, { "epoch": 0.42691644071974366, "grad_norm": 0.19178935146833231, "learning_rate": 9.517905776971731e-06, "loss": 0.5269, "step": 433 }, { "epoch": 0.4279023909292581, "grad_norm": 0.22040389128943744, "learning_rate": 9.51568750306319e-06, "loss": 0.5503, "step": 434 }, { "epoch": 0.4288883411387725, "grad_norm": 0.3848732264146601, "learning_rate": 9.513464397075906e-06, "loss": 0.5392, "step": 435 }, { "epoch": 0.4298742913482869, "grad_norm": 0.1800777856034973, "learning_rate": 9.511236461388748e-06, "loss": 0.5442, "step": 436 }, { "epoch": 0.43086024155780134, "grad_norm": 0.17156221801999408, "learning_rate": 9.509003698385751e-06, "loss": 0.5259, "step": 437 }, { "epoch": 0.43184619176731576, "grad_norm": 0.1781982902672412, "learning_rate": 9.506766110456114e-06, "loss": 0.5427, "step": 438 }, { "epoch": 0.4328321419768302, "grad_norm": 0.19272411397386807, "learning_rate": 9.504523699994206e-06, "loss": 0.5544, "step": 439 }, { "epoch": 0.4338180921863446, "grad_norm": 0.18322894068935697, "learning_rate": 9.502276469399547e-06, "loss": 0.5324, "step": 440 }, { "epoch": 0.434804042395859, "grad_norm": 0.19942159600279194, "learning_rate": 9.500024421076825e-06, "loss": 0.536, "step": 441 }, { "epoch": 0.43578999260537343, "grad_norm": 0.17674182164248262, "learning_rate": 9.497767557435873e-06, "loss": 0.5202, "step": 442 }, { "epoch": 0.43677594281488785, "grad_norm": 0.1801472124058599, "learning_rate": 9.495505880891683e-06, "loss": 0.5397, "step": 443 }, { "epoch": 0.43776189302440227, "grad_norm": 0.1751876020459294, "learning_rate": 9.493239393864397e-06, "loss": 0.5223, "step": 444 }, { "epoch": 0.4387478432339167, "grad_norm": 0.1746613728257611, "learning_rate": 9.490968098779304e-06, "loss": 0.5099, "step": 445 }, { "epoch": 0.4397337934434311, "grad_norm": 0.18029420055134318, "learning_rate": 9.488691998066833e-06, "loss": 0.545, "step": 446 }, { "epoch": 0.44071974365294553, "grad_norm": 0.17599971146131657, "learning_rate": 9.486411094162562e-06, "loss": 0.5428, "step": 447 }, { "epoch": 0.44170569386245995, "grad_norm": 0.17011693792438803, "learning_rate": 9.484125389507206e-06, "loss": 0.5267, "step": 448 }, { "epoch": 0.44269164407197437, "grad_norm": 0.1822455591186834, "learning_rate": 9.481834886546618e-06, "loss": 0.5493, "step": 449 }, { "epoch": 0.4436775942814888, "grad_norm": 0.18965504353166301, "learning_rate": 9.479539587731788e-06, "loss": 0.5409, "step": 450 }, { "epoch": 0.4446635444910032, "grad_norm": 0.18495841838242305, "learning_rate": 9.477239495518826e-06, "loss": 0.5373, "step": 451 }, { "epoch": 0.4456494947005176, "grad_norm": 0.17962496584208223, "learning_rate": 9.474934612368989e-06, "loss": 0.5243, "step": 452 }, { "epoch": 0.44663544491003204, "grad_norm": 0.18091336056439608, "learning_rate": 9.472624940748644e-06, "loss": 0.5606, "step": 453 }, { "epoch": 0.44762139511954646, "grad_norm": 0.1947142637572019, "learning_rate": 9.470310483129298e-06, "loss": 0.539, "step": 454 }, { "epoch": 0.4486073453290609, "grad_norm": 0.1739883293626402, "learning_rate": 9.467991241987562e-06, "loss": 0.5269, "step": 455 }, { "epoch": 0.4495932955385753, "grad_norm": 0.17548396273813394, "learning_rate": 9.465667219805182e-06, "loss": 0.5452, "step": 456 }, { "epoch": 0.4505792457480897, "grad_norm": 0.17888393002480893, "learning_rate": 9.463338419069007e-06, "loss": 0.5415, "step": 457 }, { "epoch": 0.45156519595760414, "grad_norm": 0.17444896541539015, "learning_rate": 9.461004842271008e-06, "loss": 0.5213, "step": 458 }, { "epoch": 0.45255114616711856, "grad_norm": 0.17284634720146852, "learning_rate": 9.458666491908264e-06, "loss": 0.5393, "step": 459 }, { "epoch": 0.453537096376633, "grad_norm": 0.1868956394912211, "learning_rate": 9.456323370482959e-06, "loss": 0.5327, "step": 460 }, { "epoch": 0.4545230465861474, "grad_norm": 0.17853379090704663, "learning_rate": 9.453975480502387e-06, "loss": 0.5438, "step": 461 }, { "epoch": 0.4555089967956618, "grad_norm": 0.1728035132474683, "learning_rate": 9.451622824478941e-06, "loss": 0.5396, "step": 462 }, { "epoch": 0.45649494700517623, "grad_norm": 0.1759478823382504, "learning_rate": 9.44926540493012e-06, "loss": 0.5561, "step": 463 }, { "epoch": 0.45748089721469065, "grad_norm": 0.16797933840931661, "learning_rate": 9.44690322437851e-06, "loss": 0.5343, "step": 464 }, { "epoch": 0.45846684742420507, "grad_norm": 0.17310988682293296, "learning_rate": 9.444536285351803e-06, "loss": 0.5263, "step": 465 }, { "epoch": 0.4594527976337195, "grad_norm": 0.1743341428430828, "learning_rate": 9.442164590382771e-06, "loss": 0.5234, "step": 466 }, { "epoch": 0.4604387478432339, "grad_norm": 0.18157823661205635, "learning_rate": 9.43978814200929e-06, "loss": 0.5485, "step": 467 }, { "epoch": 0.4614246980527483, "grad_norm": 0.16690352875330322, "learning_rate": 9.437406942774308e-06, "loss": 0.5234, "step": 468 }, { "epoch": 0.46241064826226275, "grad_norm": 0.17294814548184448, "learning_rate": 9.435020995225863e-06, "loss": 0.5498, "step": 469 }, { "epoch": 0.46339659847177717, "grad_norm": 0.18364242361789426, "learning_rate": 9.432630301917075e-06, "loss": 0.5321, "step": 470 }, { "epoch": 0.4643825486812916, "grad_norm": 0.16871216430194655, "learning_rate": 9.43023486540614e-06, "loss": 0.5273, "step": 471 }, { "epoch": 0.465368498890806, "grad_norm": 0.1776272651026022, "learning_rate": 9.427834688256333e-06, "loss": 0.5222, "step": 472 }, { "epoch": 0.4663544491003204, "grad_norm": 0.1799030712370558, "learning_rate": 9.425429773035997e-06, "loss": 0.5303, "step": 473 }, { "epoch": 0.46734039930983484, "grad_norm": 0.17383910124854074, "learning_rate": 9.42302012231855e-06, "loss": 0.5167, "step": 474 }, { "epoch": 0.46832634951934926, "grad_norm": 0.19827846910160685, "learning_rate": 9.420605738682471e-06, "loss": 0.5152, "step": 475 }, { "epoch": 0.4693122997288637, "grad_norm": 0.17635148231459685, "learning_rate": 9.418186624711309e-06, "loss": 0.54, "step": 476 }, { "epoch": 0.4702982499383781, "grad_norm": 0.22411418332504124, "learning_rate": 9.415762782993673e-06, "loss": 0.5137, "step": 477 }, { "epoch": 0.4712842001478925, "grad_norm": 0.1840318252535409, "learning_rate": 9.413334216123233e-06, "loss": 0.5482, "step": 478 }, { "epoch": 0.47227015035740694, "grad_norm": 0.17458749642531488, "learning_rate": 9.41090092669871e-06, "loss": 0.5363, "step": 479 }, { "epoch": 0.47325610056692136, "grad_norm": 0.17585310210282468, "learning_rate": 9.408462917323882e-06, "loss": 0.5117, "step": 480 }, { "epoch": 0.4742420507764358, "grad_norm": 0.17990947019554057, "learning_rate": 9.40602019060758e-06, "loss": 0.5363, "step": 481 }, { "epoch": 0.4752280009859502, "grad_norm": 0.1907254005157449, "learning_rate": 9.403572749163675e-06, "loss": 0.5217, "step": 482 }, { "epoch": 0.4762139511954646, "grad_norm": 0.1821487814637426, "learning_rate": 9.401120595611094e-06, "loss": 0.5364, "step": 483 }, { "epoch": 0.47719990140497903, "grad_norm": 0.18945645608652886, "learning_rate": 9.398663732573798e-06, "loss": 0.5305, "step": 484 }, { "epoch": 0.47818585161449345, "grad_norm": 0.1871072551732735, "learning_rate": 9.396202162680789e-06, "loss": 0.532, "step": 485 }, { "epoch": 0.47917180182400787, "grad_norm": 0.1811390463852209, "learning_rate": 9.393735888566107e-06, "loss": 0.5293, "step": 486 }, { "epoch": 0.4801577520335223, "grad_norm": 0.18049327995553202, "learning_rate": 9.391264912868828e-06, "loss": 0.5419, "step": 487 }, { "epoch": 0.4811437022430367, "grad_norm": 0.176874912249078, "learning_rate": 9.388789238233052e-06, "loss": 0.5357, "step": 488 }, { "epoch": 0.4821296524525511, "grad_norm": 0.18204471162500285, "learning_rate": 9.386308867307915e-06, "loss": 0.5313, "step": 489 }, { "epoch": 0.48311560266206555, "grad_norm": 0.20790754915654208, "learning_rate": 9.383823802747572e-06, "loss": 0.5152, "step": 490 }, { "epoch": 0.48410155287157997, "grad_norm": 0.1920201424904351, "learning_rate": 9.381334047211208e-06, "loss": 0.5451, "step": 491 }, { "epoch": 0.4850875030810944, "grad_norm": 0.17533154689525784, "learning_rate": 9.378839603363018e-06, "loss": 0.5437, "step": 492 }, { "epoch": 0.4860734532906088, "grad_norm": 0.17772839355273848, "learning_rate": 9.376340473872221e-06, "loss": 0.5367, "step": 493 }, { "epoch": 0.4870594035001232, "grad_norm": 1.094859663460843, "learning_rate": 9.373836661413048e-06, "loss": 0.5597, "step": 494 }, { "epoch": 0.48804535370963764, "grad_norm": 0.18438873724873134, "learning_rate": 9.37132816866474e-06, "loss": 0.5432, "step": 495 }, { "epoch": 0.48903130391915206, "grad_norm": 0.18810839204318186, "learning_rate": 9.368814998311548e-06, "loss": 0.5318, "step": 496 }, { "epoch": 0.4900172541286665, "grad_norm": 0.18444665812331418, "learning_rate": 9.366297153042727e-06, "loss": 0.5326, "step": 497 }, { "epoch": 0.4910032043381809, "grad_norm": 0.1707078516753226, "learning_rate": 9.363774635552536e-06, "loss": 0.5368, "step": 498 }, { "epoch": 0.4919891545476953, "grad_norm": 0.18433209786482785, "learning_rate": 9.36124744854023e-06, "loss": 0.5153, "step": 499 }, { "epoch": 0.49297510475720974, "grad_norm": 0.18444241059848385, "learning_rate": 9.358715594710065e-06, "loss": 0.5259, "step": 500 }, { "epoch": 0.49396105496672416, "grad_norm": 0.1888150737018664, "learning_rate": 9.35617907677129e-06, "loss": 0.5166, "step": 501 }, { "epoch": 0.4949470051762386, "grad_norm": 0.1865721892028227, "learning_rate": 9.353637897438139e-06, "loss": 0.4966, "step": 502 }, { "epoch": 0.495932955385753, "grad_norm": 0.18872924962167356, "learning_rate": 9.351092059429845e-06, "loss": 0.5172, "step": 503 }, { "epoch": 0.4969189055952674, "grad_norm": 0.17940409379970196, "learning_rate": 9.348541565470614e-06, "loss": 0.5352, "step": 504 }, { "epoch": 0.49790485580478183, "grad_norm": 0.1982895133569798, "learning_rate": 9.345986418289645e-06, "loss": 0.526, "step": 505 }, { "epoch": 0.49889080601429625, "grad_norm": 0.17982686539540155, "learning_rate": 9.34342662062111e-06, "loss": 0.5341, "step": 506 }, { "epoch": 0.49987675622381067, "grad_norm": 0.17361686526600992, "learning_rate": 9.340862175204157e-06, "loss": 0.5198, "step": 507 }, { "epoch": 0.5008627064333251, "grad_norm": 0.1753173227702939, "learning_rate": 9.338293084782912e-06, "loss": 0.5207, "step": 508 }, { "epoch": 0.5018486566428395, "grad_norm": 0.1719528503287593, "learning_rate": 9.335719352106465e-06, "loss": 0.5261, "step": 509 }, { "epoch": 0.5028346068523539, "grad_norm": 0.20400584078299505, "learning_rate": 9.33314097992888e-06, "loss": 0.5389, "step": 510 }, { "epoch": 0.5038205570618683, "grad_norm": 0.17253150996749836, "learning_rate": 9.33055797100918e-06, "loss": 0.505, "step": 511 }, { "epoch": 0.5048065072713828, "grad_norm": 0.21680273811570544, "learning_rate": 9.327970328111354e-06, "loss": 0.5236, "step": 512 }, { "epoch": 0.5057924574808972, "grad_norm": 0.18983001098675922, "learning_rate": 9.325378054004346e-06, "loss": 0.5134, "step": 513 }, { "epoch": 0.5067784076904116, "grad_norm": 0.23237815078364535, "learning_rate": 9.32278115146206e-06, "loss": 0.5373, "step": 514 }, { "epoch": 0.507764357899926, "grad_norm": 0.18787252749594727, "learning_rate": 9.32017962326335e-06, "loss": 0.52, "step": 515 }, { "epoch": 0.5087503081094404, "grad_norm": 0.1778882773289981, "learning_rate": 9.317573472192018e-06, "loss": 0.5191, "step": 516 }, { "epoch": 0.5097362583189549, "grad_norm": 0.17816627152377676, "learning_rate": 9.314962701036818e-06, "loss": 0.5115, "step": 517 }, { "epoch": 0.5107222085284693, "grad_norm": 0.17526363387693264, "learning_rate": 9.31234731259144e-06, "loss": 0.513, "step": 518 }, { "epoch": 0.5117081587379837, "grad_norm": 0.18547672153027575, "learning_rate": 9.309727309654524e-06, "loss": 0.5385, "step": 519 }, { "epoch": 0.5126941089474981, "grad_norm": 0.18888072990131097, "learning_rate": 9.30710269502964e-06, "loss": 0.5421, "step": 520 }, { "epoch": 0.5136800591570125, "grad_norm": 0.1932038565992329, "learning_rate": 9.3044734715253e-06, "loss": 0.5524, "step": 521 }, { "epoch": 0.514666009366527, "grad_norm": 0.17154783813009652, "learning_rate": 9.301839641954937e-06, "loss": 0.5404, "step": 522 }, { "epoch": 0.5156519595760414, "grad_norm": 0.17065826544657794, "learning_rate": 9.299201209136927e-06, "loss": 0.5036, "step": 523 }, { "epoch": 0.5166379097855558, "grad_norm": 0.18081266482056293, "learning_rate": 9.296558175894559e-06, "loss": 0.5358, "step": 524 }, { "epoch": 0.5176238599950702, "grad_norm": 0.18409344660188967, "learning_rate": 9.293910545056054e-06, "loss": 0.5392, "step": 525 }, { "epoch": 0.5186098102045846, "grad_norm": 0.18474516901228663, "learning_rate": 9.291258319454546e-06, "loss": 0.5325, "step": 526 }, { "epoch": 0.519595760414099, "grad_norm": 0.16362804198311526, "learning_rate": 9.28860150192809e-06, "loss": 0.5267, "step": 527 }, { "epoch": 0.5205817106236135, "grad_norm": 0.1961675315869362, "learning_rate": 9.285940095319651e-06, "loss": 0.5292, "step": 528 }, { "epoch": 0.5215676608331279, "grad_norm": 0.1690370738666498, "learning_rate": 9.28327410247711e-06, "loss": 0.5319, "step": 529 }, { "epoch": 0.5225536110426423, "grad_norm": 0.17344100056077083, "learning_rate": 9.280603526253253e-06, "loss": 0.5181, "step": 530 }, { "epoch": 0.5235395612521567, "grad_norm": 0.1885771373904881, "learning_rate": 9.277928369505766e-06, "loss": 0.5287, "step": 531 }, { "epoch": 0.5245255114616711, "grad_norm": 0.17258458207149155, "learning_rate": 9.275248635097242e-06, "loss": 0.5272, "step": 532 }, { "epoch": 0.5255114616711856, "grad_norm": 0.16980858974454846, "learning_rate": 9.272564325895172e-06, "loss": 0.5222, "step": 533 }, { "epoch": 0.5264974118807, "grad_norm": 0.17303373845587997, "learning_rate": 9.269875444771941e-06, "loss": 0.541, "step": 534 }, { "epoch": 0.5274833620902144, "grad_norm": 0.17472353980786784, "learning_rate": 9.267181994604824e-06, "loss": 0.53, "step": 535 }, { "epoch": 0.5284693122997288, "grad_norm": 0.1790405001202461, "learning_rate": 9.26448397827599e-06, "loss": 0.508, "step": 536 }, { "epoch": 0.5294552625092432, "grad_norm": 0.18351127199828568, "learning_rate": 9.261781398672489e-06, "loss": 0.5603, "step": 537 }, { "epoch": 0.5304412127187577, "grad_norm": 0.18363336505242986, "learning_rate": 9.25907425868626e-06, "loss": 0.5144, "step": 538 }, { "epoch": 0.5314271629282721, "grad_norm": 0.17658005873610938, "learning_rate": 9.256362561214116e-06, "loss": 0.5267, "step": 539 }, { "epoch": 0.5324131131377865, "grad_norm": 0.18840777055635421, "learning_rate": 9.25364630915775e-06, "loss": 0.5234, "step": 540 }, { "epoch": 0.5333990633473009, "grad_norm": 0.1728518263781916, "learning_rate": 9.250925505423728e-06, "loss": 0.5265, "step": 541 }, { "epoch": 0.5343850135568153, "grad_norm": 0.17488975556875405, "learning_rate": 9.248200152923487e-06, "loss": 0.5238, "step": 542 }, { "epoch": 0.5353709637663298, "grad_norm": 0.1882758465967878, "learning_rate": 9.24547025457333e-06, "loss": 0.5564, "step": 543 }, { "epoch": 0.5363569139758442, "grad_norm": 0.19025557740461027, "learning_rate": 9.242735813294425e-06, "loss": 0.5133, "step": 544 }, { "epoch": 0.5373428641853586, "grad_norm": 0.179824214551046, "learning_rate": 9.239996832012805e-06, "loss": 0.5445, "step": 545 }, { "epoch": 0.538328814394873, "grad_norm": 0.1806716065069949, "learning_rate": 9.237253313659355e-06, "loss": 0.5231, "step": 546 }, { "epoch": 0.5393147646043874, "grad_norm": 0.18635574300948382, "learning_rate": 9.234505261169819e-06, "loss": 0.5449, "step": 547 }, { "epoch": 0.5403007148139019, "grad_norm": 0.184656164635862, "learning_rate": 9.23175267748479e-06, "loss": 0.5336, "step": 548 }, { "epoch": 0.5412866650234163, "grad_norm": 0.17411313876144827, "learning_rate": 9.228995565549712e-06, "loss": 0.5176, "step": 549 }, { "epoch": 0.5422726152329307, "grad_norm": 0.1753064976401272, "learning_rate": 9.226233928314874e-06, "loss": 0.529, "step": 550 }, { "epoch": 0.5432585654424451, "grad_norm": 0.20170952807937534, "learning_rate": 9.223467768735407e-06, "loss": 0.5297, "step": 551 }, { "epoch": 0.5442445156519595, "grad_norm": 0.18530425670711484, "learning_rate": 9.22069708977128e-06, "loss": 0.54, "step": 552 }, { "epoch": 0.545230465861474, "grad_norm": 0.20615172776597993, "learning_rate": 9.217921894387303e-06, "loss": 0.5094, "step": 553 }, { "epoch": 0.5462164160709884, "grad_norm": 0.170874507407419, "learning_rate": 9.21514218555311e-06, "loss": 0.5132, "step": 554 }, { "epoch": 0.5472023662805028, "grad_norm": 0.19009706896809436, "learning_rate": 9.212357966243176e-06, "loss": 0.5202, "step": 555 }, { "epoch": 0.5481883164900172, "grad_norm": 0.17760598662710642, "learning_rate": 9.20956923943679e-06, "loss": 0.5036, "step": 556 }, { "epoch": 0.5491742666995316, "grad_norm": 0.17949196898983627, "learning_rate": 9.206776008118075e-06, "loss": 0.5274, "step": 557 }, { "epoch": 0.550160216909046, "grad_norm": 0.1693635169054839, "learning_rate": 9.203978275275967e-06, "loss": 0.5199, "step": 558 }, { "epoch": 0.5511461671185605, "grad_norm": 0.17813196882293547, "learning_rate": 9.201176043904225e-06, "loss": 0.5278, "step": 559 }, { "epoch": 0.5521321173280749, "grad_norm": 0.22647859583966012, "learning_rate": 9.198369317001417e-06, "loss": 0.5301, "step": 560 }, { "epoch": 0.5531180675375893, "grad_norm": 0.1818989370240682, "learning_rate": 9.19555809757092e-06, "loss": 0.5204, "step": 561 }, { "epoch": 0.5541040177471037, "grad_norm": 0.18242604280358907, "learning_rate": 9.192742388620926e-06, "loss": 0.5389, "step": 562 }, { "epoch": 0.5550899679566181, "grad_norm": 0.17696765474750256, "learning_rate": 9.189922193164424e-06, "loss": 0.5295, "step": 563 }, { "epoch": 0.5560759181661327, "grad_norm": 0.1808515728973198, "learning_rate": 9.187097514219207e-06, "loss": 0.5431, "step": 564 }, { "epoch": 0.5570618683756471, "grad_norm": 0.17047869791424183, "learning_rate": 9.184268354807863e-06, "loss": 0.5272, "step": 565 }, { "epoch": 0.5580478185851615, "grad_norm": 0.1900426807328964, "learning_rate": 9.181434717957779e-06, "loss": 0.5261, "step": 566 }, { "epoch": 0.5590337687946759, "grad_norm": 0.17617470606974053, "learning_rate": 9.17859660670113e-06, "loss": 0.5311, "step": 567 }, { "epoch": 0.5600197190041903, "grad_norm": 0.20130597373832468, "learning_rate": 9.175754024074874e-06, "loss": 0.5157, "step": 568 }, { "epoch": 0.5610056692137048, "grad_norm": 0.17294261538712863, "learning_rate": 9.172906973120767e-06, "loss": 0.5228, "step": 569 }, { "epoch": 0.5619916194232192, "grad_norm": 0.1852675528253617, "learning_rate": 9.170055456885332e-06, "loss": 0.5243, "step": 570 }, { "epoch": 0.5629775696327336, "grad_norm": 0.18500105862976746, "learning_rate": 9.16719947841988e-06, "loss": 0.5399, "step": 571 }, { "epoch": 0.563963519842248, "grad_norm": 0.1756914744676518, "learning_rate": 9.164339040780492e-06, "loss": 0.5188, "step": 572 }, { "epoch": 0.5649494700517624, "grad_norm": 0.2025007793819143, "learning_rate": 9.161474147028022e-06, "loss": 0.5283, "step": 573 }, { "epoch": 0.5659354202612769, "grad_norm": 0.1808833374255632, "learning_rate": 9.158604800228094e-06, "loss": 0.5186, "step": 574 }, { "epoch": 0.5669213704707913, "grad_norm": 0.1867472385948832, "learning_rate": 9.155731003451091e-06, "loss": 0.5013, "step": 575 }, { "epoch": 0.5679073206803057, "grad_norm": 0.18062805229765697, "learning_rate": 9.152852759772167e-06, "loss": 0.5165, "step": 576 }, { "epoch": 0.5688932708898201, "grad_norm": 0.17552623211314305, "learning_rate": 9.149970072271226e-06, "loss": 0.5034, "step": 577 }, { "epoch": 0.5698792210993345, "grad_norm": 0.1919350685661946, "learning_rate": 9.147082944032934e-06, "loss": 0.5172, "step": 578 }, { "epoch": 0.570865171308849, "grad_norm": 0.1811007654992155, "learning_rate": 9.144191378146702e-06, "loss": 0.5199, "step": 579 }, { "epoch": 0.5718511215183634, "grad_norm": 0.19407623548069264, "learning_rate": 9.141295377706695e-06, "loss": 0.527, "step": 580 }, { "epoch": 0.5728370717278778, "grad_norm": 0.18908186768753915, "learning_rate": 9.138394945811825e-06, "loss": 0.5129, "step": 581 }, { "epoch": 0.5738230219373922, "grad_norm": 0.18119416709811703, "learning_rate": 9.135490085565735e-06, "loss": 0.5419, "step": 582 }, { "epoch": 0.5748089721469066, "grad_norm": 0.17536626522865337, "learning_rate": 9.132580800076819e-06, "loss": 0.5419, "step": 583 }, { "epoch": 0.575794922356421, "grad_norm": 0.17746659463522532, "learning_rate": 9.129667092458198e-06, "loss": 0.5174, "step": 584 }, { "epoch": 0.5767808725659355, "grad_norm": 0.1857542289742682, "learning_rate": 9.126748965827732e-06, "loss": 0.5035, "step": 585 }, { "epoch": 0.5777668227754499, "grad_norm": 0.17427240686444098, "learning_rate": 9.123826423308005e-06, "loss": 0.5292, "step": 586 }, { "epoch": 0.5787527729849643, "grad_norm": 0.18841192624205355, "learning_rate": 9.120899468026327e-06, "loss": 0.5384, "step": 587 }, { "epoch": 0.5797387231944787, "grad_norm": 0.18603041739827225, "learning_rate": 9.117968103114729e-06, "loss": 0.5434, "step": 588 }, { "epoch": 0.5807246734039931, "grad_norm": 0.17510196686929236, "learning_rate": 9.115032331709961e-06, "loss": 0.5213, "step": 589 }, { "epoch": 0.5817106236135076, "grad_norm": 0.1747927492804185, "learning_rate": 9.112092156953493e-06, "loss": 0.529, "step": 590 }, { "epoch": 0.582696573823022, "grad_norm": 0.17858183539184375, "learning_rate": 9.109147581991499e-06, "loss": 0.5252, "step": 591 }, { "epoch": 0.5836825240325364, "grad_norm": 0.17461562316476606, "learning_rate": 9.106198609974865e-06, "loss": 0.5383, "step": 592 }, { "epoch": 0.5846684742420508, "grad_norm": 0.18330748062867017, "learning_rate": 9.103245244059187e-06, "loss": 0.5146, "step": 593 }, { "epoch": 0.5856544244515652, "grad_norm": 0.1753610355571392, "learning_rate": 9.100287487404753e-06, "loss": 0.5159, "step": 594 }, { "epoch": 0.5866403746610797, "grad_norm": 0.17367460310203844, "learning_rate": 9.097325343176558e-06, "loss": 0.5065, "step": 595 }, { "epoch": 0.5876263248705941, "grad_norm": 0.17222303467067504, "learning_rate": 9.094358814544286e-06, "loss": 0.5376, "step": 596 }, { "epoch": 0.5886122750801085, "grad_norm": 0.1873340504183569, "learning_rate": 9.091387904682318e-06, "loss": 0.5202, "step": 597 }, { "epoch": 0.5895982252896229, "grad_norm": 0.17438861375145898, "learning_rate": 9.08841261676972e-06, "loss": 0.5212, "step": 598 }, { "epoch": 0.5905841754991373, "grad_norm": 0.1714457753169691, "learning_rate": 9.08543295399024e-06, "loss": 0.5055, "step": 599 }, { "epoch": 0.5915701257086518, "grad_norm": 0.20022070999928204, "learning_rate": 9.082448919532314e-06, "loss": 0.4946, "step": 600 }, { "epoch": 0.5925560759181662, "grad_norm": 0.178991294093694, "learning_rate": 9.07946051658905e-06, "loss": 0.5387, "step": 601 }, { "epoch": 0.5935420261276806, "grad_norm": 0.18384867168921457, "learning_rate": 9.076467748358237e-06, "loss": 0.542, "step": 602 }, { "epoch": 0.594527976337195, "grad_norm": 0.1725702905420044, "learning_rate": 9.073470618042328e-06, "loss": 0.5228, "step": 603 }, { "epoch": 0.5955139265467094, "grad_norm": 0.16585800029266956, "learning_rate": 9.070469128848447e-06, "loss": 0.5035, "step": 604 }, { "epoch": 0.5964998767562238, "grad_norm": 0.5047088349732528, "learning_rate": 9.067463283988385e-06, "loss": 0.5234, "step": 605 }, { "epoch": 0.5974858269657383, "grad_norm": 0.17584146068307602, "learning_rate": 9.064453086678587e-06, "loss": 0.5461, "step": 606 }, { "epoch": 0.5984717771752527, "grad_norm": 0.20307560060614097, "learning_rate": 9.061438540140161e-06, "loss": 0.5296, "step": 607 }, { "epoch": 0.5994577273847671, "grad_norm": 0.17851206137289966, "learning_rate": 9.05841964759887e-06, "loss": 0.5365, "step": 608 }, { "epoch": 0.6004436775942815, "grad_norm": 0.17546432011636895, "learning_rate": 9.055396412285122e-06, "loss": 0.5211, "step": 609 }, { "epoch": 0.6014296278037959, "grad_norm": 0.17844165754682811, "learning_rate": 9.052368837433978e-06, "loss": 0.52, "step": 610 }, { "epoch": 0.6024155780133104, "grad_norm": 0.1687356005922332, "learning_rate": 9.049336926285136e-06, "loss": 0.516, "step": 611 }, { "epoch": 0.6034015282228248, "grad_norm": 0.18488627019817228, "learning_rate": 9.04630068208294e-06, "loss": 0.5448, "step": 612 }, { "epoch": 0.6043874784323392, "grad_norm": 0.17424086572152608, "learning_rate": 9.043260108076369e-06, "loss": 0.5281, "step": 613 }, { "epoch": 0.6053734286418536, "grad_norm": 0.17573018876393326, "learning_rate": 9.040215207519031e-06, "loss": 0.5125, "step": 614 }, { "epoch": 0.606359378851368, "grad_norm": 0.17297871673089396, "learning_rate": 9.037165983669172e-06, "loss": 0.5335, "step": 615 }, { "epoch": 0.6073453290608825, "grad_norm": 0.18495495026508482, "learning_rate": 9.034112439789655e-06, "loss": 0.5165, "step": 616 }, { "epoch": 0.6083312792703969, "grad_norm": 0.18458607108773536, "learning_rate": 9.031054579147973e-06, "loss": 0.5222, "step": 617 }, { "epoch": 0.6093172294799113, "grad_norm": 0.18087886436501827, "learning_rate": 9.027992405016234e-06, "loss": 0.5323, "step": 618 }, { "epoch": 0.6103031796894257, "grad_norm": 0.1801917170488805, "learning_rate": 9.024925920671167e-06, "loss": 0.5426, "step": 619 }, { "epoch": 0.6112891298989401, "grad_norm": 0.17289962272813006, "learning_rate": 9.021855129394103e-06, "loss": 0.528, "step": 620 }, { "epoch": 0.6122750801084546, "grad_norm": 0.17531792307433441, "learning_rate": 9.018780034470991e-06, "loss": 0.5216, "step": 621 }, { "epoch": 0.613261030317969, "grad_norm": 0.1741054805237634, "learning_rate": 9.015700639192384e-06, "loss": 0.5243, "step": 622 }, { "epoch": 0.6142469805274834, "grad_norm": 0.1835618139824688, "learning_rate": 9.012616946853432e-06, "loss": 0.5287, "step": 623 }, { "epoch": 0.6152329307369978, "grad_norm": 0.17638053998512027, "learning_rate": 9.009528960753885e-06, "loss": 0.5331, "step": 624 }, { "epoch": 0.6162188809465122, "grad_norm": 0.17975674286395826, "learning_rate": 9.006436684198093e-06, "loss": 0.4977, "step": 625 }, { "epoch": 0.6172048311560266, "grad_norm": 0.18533891500734204, "learning_rate": 9.00334012049499e-06, "loss": 0.5129, "step": 626 }, { "epoch": 0.6181907813655411, "grad_norm": 0.18383024671858167, "learning_rate": 9.0002392729581e-06, "loss": 0.5275, "step": 627 }, { "epoch": 0.6191767315750555, "grad_norm": 0.16821043921604145, "learning_rate": 8.99713414490553e-06, "loss": 0.4997, "step": 628 }, { "epoch": 0.6201626817845699, "grad_norm": 0.1704480999997968, "learning_rate": 8.994024739659972e-06, "loss": 0.5139, "step": 629 }, { "epoch": 0.6211486319940843, "grad_norm": 0.18746952704693728, "learning_rate": 8.990911060548689e-06, "loss": 0.5194, "step": 630 }, { "epoch": 0.6221345822035987, "grad_norm": 0.17341484319326284, "learning_rate": 8.98779311090352e-06, "loss": 0.5233, "step": 631 }, { "epoch": 0.6231205324131132, "grad_norm": 0.17844220202436867, "learning_rate": 8.984670894060874e-06, "loss": 0.5087, "step": 632 }, { "epoch": 0.6241064826226276, "grad_norm": 0.16855815973950497, "learning_rate": 8.981544413361725e-06, "loss": 0.5104, "step": 633 }, { "epoch": 0.625092432832142, "grad_norm": 0.1849169795066938, "learning_rate": 8.978413672151612e-06, "loss": 0.5251, "step": 634 }, { "epoch": 0.6260783830416564, "grad_norm": 0.1773493478914938, "learning_rate": 8.97527867378063e-06, "loss": 0.5235, "step": 635 }, { "epoch": 0.6270643332511708, "grad_norm": 0.1841968528665142, "learning_rate": 8.97213942160343e-06, "loss": 0.529, "step": 636 }, { "epoch": 0.6280502834606853, "grad_norm": 0.17046732498513104, "learning_rate": 8.968995918979216e-06, "loss": 0.5131, "step": 637 }, { "epoch": 0.6290362336701997, "grad_norm": 0.17779413754694615, "learning_rate": 8.96584816927174e-06, "loss": 0.5082, "step": 638 }, { "epoch": 0.6300221838797141, "grad_norm": 0.1770139560344267, "learning_rate": 8.962696175849299e-06, "loss": 0.5194, "step": 639 }, { "epoch": 0.6310081340892285, "grad_norm": 0.23355583624196918, "learning_rate": 8.959539942084731e-06, "loss": 0.5152, "step": 640 }, { "epoch": 0.6319940842987429, "grad_norm": 0.18163116429500578, "learning_rate": 8.956379471355411e-06, "loss": 0.5312, "step": 641 }, { "epoch": 0.6329800345082573, "grad_norm": 0.17497411266633903, "learning_rate": 8.953214767043246e-06, "loss": 0.5166, "step": 642 }, { "epoch": 0.6339659847177718, "grad_norm": 0.17616177689851764, "learning_rate": 8.950045832534678e-06, "loss": 0.5048, "step": 643 }, { "epoch": 0.6349519349272862, "grad_norm": 0.17784544913925193, "learning_rate": 8.946872671220669e-06, "loss": 0.5136, "step": 644 }, { "epoch": 0.6359378851368006, "grad_norm": 0.17088153319930033, "learning_rate": 8.943695286496709e-06, "loss": 0.4895, "step": 645 }, { "epoch": 0.636923835346315, "grad_norm": 0.17832756103730774, "learning_rate": 8.940513681762807e-06, "loss": 0.5211, "step": 646 }, { "epoch": 0.6379097855558294, "grad_norm": 0.17537595009721352, "learning_rate": 8.937327860423487e-06, "loss": 0.5225, "step": 647 }, { "epoch": 0.6388957357653439, "grad_norm": 0.18717683211393102, "learning_rate": 8.93413782588778e-06, "loss": 0.5315, "step": 648 }, { "epoch": 0.6398816859748583, "grad_norm": 0.25977649967137717, "learning_rate": 8.930943581569236e-06, "loss": 0.5431, "step": 649 }, { "epoch": 0.6408676361843727, "grad_norm": 0.1800160217835687, "learning_rate": 8.927745130885902e-06, "loss": 0.5173, "step": 650 }, { "epoch": 0.6418535863938871, "grad_norm": 0.1896989176209276, "learning_rate": 8.924542477260325e-06, "loss": 0.5168, "step": 651 }, { "epoch": 0.6428395366034015, "grad_norm": 0.17510675412492296, "learning_rate": 8.921335624119555e-06, "loss": 0.5334, "step": 652 }, { "epoch": 0.643825486812916, "grad_norm": 0.1846458906517486, "learning_rate": 8.918124574895133e-06, "loss": 0.5108, "step": 653 }, { "epoch": 0.6448114370224304, "grad_norm": 0.18874230582806142, "learning_rate": 8.91490933302309e-06, "loss": 0.5174, "step": 654 }, { "epoch": 0.6457973872319448, "grad_norm": 0.19499657504014306, "learning_rate": 8.911689901943943e-06, "loss": 0.5125, "step": 655 }, { "epoch": 0.6467833374414592, "grad_norm": 0.18253219610497026, "learning_rate": 8.90846628510269e-06, "loss": 0.508, "step": 656 }, { "epoch": 0.6477692876509736, "grad_norm": 0.1744591790741951, "learning_rate": 8.905238485948815e-06, "loss": 0.5352, "step": 657 }, { "epoch": 0.648755237860488, "grad_norm": 0.18783949825896712, "learning_rate": 8.90200650793627e-06, "loss": 0.5306, "step": 658 }, { "epoch": 0.6497411880700025, "grad_norm": 0.17134031494103244, "learning_rate": 8.89877035452348e-06, "loss": 0.5014, "step": 659 }, { "epoch": 0.6507271382795169, "grad_norm": 0.17370655934781326, "learning_rate": 8.895530029173343e-06, "loss": 0.5102, "step": 660 }, { "epoch": 0.6517130884890313, "grad_norm": 0.1770600492093783, "learning_rate": 8.892285535353212e-06, "loss": 0.5314, "step": 661 }, { "epoch": 0.6526990386985457, "grad_norm": 0.18204208757325732, "learning_rate": 8.889036876534911e-06, "loss": 0.5348, "step": 662 }, { "epoch": 0.6536849889080601, "grad_norm": 0.17762241359082767, "learning_rate": 8.885784056194712e-06, "loss": 0.5296, "step": 663 }, { "epoch": 0.6546709391175746, "grad_norm": 0.1676215990546039, "learning_rate": 8.882527077813348e-06, "loss": 0.4964, "step": 664 }, { "epoch": 0.655656889327089, "grad_norm": 0.1882492905393916, "learning_rate": 8.879265944875994e-06, "loss": 0.5196, "step": 665 }, { "epoch": 0.6566428395366034, "grad_norm": 0.17397694941479602, "learning_rate": 8.876000660872274e-06, "loss": 0.5111, "step": 666 }, { "epoch": 0.6576287897461178, "grad_norm": 0.17999310124671788, "learning_rate": 8.872731229296256e-06, "loss": 0.5402, "step": 667 }, { "epoch": 0.6586147399556322, "grad_norm": 0.16951200375869988, "learning_rate": 8.869457653646443e-06, "loss": 0.5375, "step": 668 }, { "epoch": 0.6596006901651467, "grad_norm": 0.18029882359082766, "learning_rate": 8.866179937425772e-06, "loss": 0.5394, "step": 669 }, { "epoch": 0.6605866403746611, "grad_norm": 0.1869965519670085, "learning_rate": 8.862898084141614e-06, "loss": 0.5185, "step": 670 }, { "epoch": 0.6615725905841755, "grad_norm": 0.16817396755280328, "learning_rate": 8.859612097305764e-06, "loss": 0.5186, "step": 671 }, { "epoch": 0.6625585407936899, "grad_norm": 0.17438563993837336, "learning_rate": 8.85632198043444e-06, "loss": 0.5272, "step": 672 }, { "epoch": 0.6635444910032043, "grad_norm": 0.1804169563087905, "learning_rate": 8.853027737048286e-06, "loss": 0.507, "step": 673 }, { "epoch": 0.6645304412127188, "grad_norm": 0.1785577926904514, "learning_rate": 8.849729370672352e-06, "loss": 0.5215, "step": 674 }, { "epoch": 0.6655163914222332, "grad_norm": 0.16911658733520982, "learning_rate": 8.8464268848361e-06, "loss": 0.5232, "step": 675 }, { "epoch": 0.6665023416317476, "grad_norm": 0.1773874030449928, "learning_rate": 8.843120283073415e-06, "loss": 0.531, "step": 676 }, { "epoch": 0.667488291841262, "grad_norm": 0.397460651171539, "learning_rate": 8.839809568922565e-06, "loss": 0.5075, "step": 677 }, { "epoch": 0.6684742420507764, "grad_norm": 0.18509188624181927, "learning_rate": 8.836494745926234e-06, "loss": 0.5245, "step": 678 }, { "epoch": 0.6694601922602909, "grad_norm": 0.17677712795862216, "learning_rate": 8.833175817631499e-06, "loss": 0.5132, "step": 679 }, { "epoch": 0.6704461424698053, "grad_norm": 0.1915713550121279, "learning_rate": 8.829852787589824e-06, "loss": 0.5186, "step": 680 }, { "epoch": 0.6714320926793197, "grad_norm": 0.22366661643432598, "learning_rate": 8.826525659357071e-06, "loss": 0.5316, "step": 681 }, { "epoch": 0.6724180428888341, "grad_norm": 0.17281416028542135, "learning_rate": 8.823194436493483e-06, "loss": 0.4999, "step": 682 }, { "epoch": 0.6734039930983485, "grad_norm": 0.2658779409346162, "learning_rate": 8.819859122563682e-06, "loss": 0.5118, "step": 683 }, { "epoch": 0.674389943307863, "grad_norm": 0.18389819905434257, "learning_rate": 8.816519721136673e-06, "loss": 0.5159, "step": 684 }, { "epoch": 0.6753758935173774, "grad_norm": 0.1845332508473855, "learning_rate": 8.813176235785829e-06, "loss": 0.5409, "step": 685 }, { "epoch": 0.6763618437268918, "grad_norm": 0.17160868622207487, "learning_rate": 8.8098286700889e-06, "loss": 0.5265, "step": 686 }, { "epoch": 0.6773477939364062, "grad_norm": 0.17548251551800362, "learning_rate": 8.806477027627997e-06, "loss": 0.5121, "step": 687 }, { "epoch": 0.6783337441459206, "grad_norm": 0.17118384896055586, "learning_rate": 8.803121311989598e-06, "loss": 0.5327, "step": 688 }, { "epoch": 0.679319694355435, "grad_norm": 0.171583103142981, "learning_rate": 8.79976152676453e-06, "loss": 0.5305, "step": 689 }, { "epoch": 0.6803056445649495, "grad_norm": 0.17345039764589554, "learning_rate": 8.796397675547986e-06, "loss": 0.5304, "step": 690 }, { "epoch": 0.6812915947744639, "grad_norm": 0.16964715352228815, "learning_rate": 8.793029761939504e-06, "loss": 0.5196, "step": 691 }, { "epoch": 0.6822775449839783, "grad_norm": 0.19762274913482514, "learning_rate": 8.789657789542972e-06, "loss": 0.5103, "step": 692 }, { "epoch": 0.6832634951934927, "grad_norm": 0.20395516806477976, "learning_rate": 8.786281761966615e-06, "loss": 0.5261, "step": 693 }, { "epoch": 0.6842494454030071, "grad_norm": 0.18121458134398816, "learning_rate": 8.782901682823004e-06, "loss": 0.5254, "step": 694 }, { "epoch": 0.6852353956125216, "grad_norm": 0.16959004669484334, "learning_rate": 8.779517555729046e-06, "loss": 0.5196, "step": 695 }, { "epoch": 0.686221345822036, "grad_norm": 0.1845303297736926, "learning_rate": 8.776129384305973e-06, "loss": 0.5282, "step": 696 }, { "epoch": 0.6872072960315504, "grad_norm": 0.1759390137413818, "learning_rate": 8.772737172179348e-06, "loss": 0.5129, "step": 697 }, { "epoch": 0.6881932462410648, "grad_norm": 0.1689420049611807, "learning_rate": 8.769340922979062e-06, "loss": 0.5357, "step": 698 }, { "epoch": 0.6891791964505792, "grad_norm": 0.18054094111426847, "learning_rate": 8.765940640339318e-06, "loss": 0.5182, "step": 699 }, { "epoch": 0.6901651466600937, "grad_norm": 0.1678558299690107, "learning_rate": 8.76253632789864e-06, "loss": 0.5313, "step": 700 }, { "epoch": 0.6911510968696081, "grad_norm": 0.17772178459431145, "learning_rate": 8.759127989299865e-06, "loss": 0.5093, "step": 701 }, { "epoch": 0.6921370470791225, "grad_norm": 0.21720545871461938, "learning_rate": 8.755715628190136e-06, "loss": 0.5381, "step": 702 }, { "epoch": 0.6931229972886369, "grad_norm": 0.17115182309046265, "learning_rate": 8.752299248220901e-06, "loss": 0.504, "step": 703 }, { "epoch": 0.6941089474981513, "grad_norm": 0.18270723154242413, "learning_rate": 8.748878853047906e-06, "loss": 0.519, "step": 704 }, { "epoch": 0.6950948977076657, "grad_norm": 0.1702302493339554, "learning_rate": 8.7454544463312e-06, "loss": 0.5246, "step": 705 }, { "epoch": 0.6960808479171802, "grad_norm": 0.19264316148857302, "learning_rate": 8.742026031735116e-06, "loss": 0.5322, "step": 706 }, { "epoch": 0.6970667981266946, "grad_norm": 0.17481732405788628, "learning_rate": 8.738593612928283e-06, "loss": 0.5046, "step": 707 }, { "epoch": 0.698052748336209, "grad_norm": 0.17351857644214455, "learning_rate": 8.735157193583611e-06, "loss": 0.5307, "step": 708 }, { "epoch": 0.6990386985457234, "grad_norm": 0.32734345905410767, "learning_rate": 8.731716777378292e-06, "loss": 0.5222, "step": 709 }, { "epoch": 0.7000246487552378, "grad_norm": 0.18202642303027347, "learning_rate": 8.728272367993795e-06, "loss": 0.5131, "step": 710 }, { "epoch": 0.7010105989647523, "grad_norm": 0.17044576847981274, "learning_rate": 8.724823969115861e-06, "loss": 0.5321, "step": 711 }, { "epoch": 0.7019965491742667, "grad_norm": 0.18106248863996377, "learning_rate": 8.721371584434502e-06, "loss": 0.5256, "step": 712 }, { "epoch": 0.7029824993837811, "grad_norm": 0.1824247086532533, "learning_rate": 8.71791521764399e-06, "loss": 0.5178, "step": 713 }, { "epoch": 0.7039684495932955, "grad_norm": 0.16428313462842645, "learning_rate": 8.714454872442869e-06, "loss": 0.5167, "step": 714 }, { "epoch": 0.7049543998028099, "grad_norm": 0.17087528650372252, "learning_rate": 8.710990552533923e-06, "loss": 0.5316, "step": 715 }, { "epoch": 0.7059403500123244, "grad_norm": 0.17731853379074156, "learning_rate": 8.707522261624208e-06, "loss": 0.5168, "step": 716 }, { "epoch": 0.7069263002218388, "grad_norm": 0.18217115811783596, "learning_rate": 8.704050003425015e-06, "loss": 0.5193, "step": 717 }, { "epoch": 0.7079122504313532, "grad_norm": 0.17094344995648159, "learning_rate": 8.700573781651889e-06, "loss": 0.5213, "step": 718 }, { "epoch": 0.7088982006408676, "grad_norm": 0.17574602574233736, "learning_rate": 8.69709360002461e-06, "loss": 0.5281, "step": 719 }, { "epoch": 0.709884150850382, "grad_norm": 0.17241572672508582, "learning_rate": 8.693609462267202e-06, "loss": 0.5147, "step": 720 }, { "epoch": 0.7108701010598965, "grad_norm": 0.18222696431042285, "learning_rate": 8.690121372107914e-06, "loss": 0.5169, "step": 721 }, { "epoch": 0.7118560512694109, "grad_norm": 0.17050122835486803, "learning_rate": 8.686629333279233e-06, "loss": 0.5161, "step": 722 }, { "epoch": 0.7128420014789253, "grad_norm": 0.5388747937922178, "learning_rate": 8.683133349517863e-06, "loss": 0.52, "step": 723 }, { "epoch": 0.7138279516884397, "grad_norm": 0.1680344800860356, "learning_rate": 8.679633424564739e-06, "loss": 0.5104, "step": 724 }, { "epoch": 0.7148139018979541, "grad_norm": 0.17414658048357418, "learning_rate": 8.676129562165004e-06, "loss": 0.5249, "step": 725 }, { "epoch": 0.7157998521074685, "grad_norm": 0.17932147298985338, "learning_rate": 8.672621766068017e-06, "loss": 0.5428, "step": 726 }, { "epoch": 0.716785802316983, "grad_norm": 0.16685313372848734, "learning_rate": 8.66911004002735e-06, "loss": 0.523, "step": 727 }, { "epoch": 0.7177717525264974, "grad_norm": 0.1877324601151875, "learning_rate": 8.66559438780078e-06, "loss": 0.5054, "step": 728 }, { "epoch": 0.7187577027360118, "grad_norm": 0.16681356492773824, "learning_rate": 8.66207481315028e-06, "loss": 0.5048, "step": 729 }, { "epoch": 0.7197436529455262, "grad_norm": 0.1805457431900985, "learning_rate": 8.658551319842022e-06, "loss": 0.529, "step": 730 }, { "epoch": 0.7207296031550406, "grad_norm": 0.17751202813071432, "learning_rate": 8.655023911646374e-06, "loss": 0.5418, "step": 731 }, { "epoch": 0.7217155533645551, "grad_norm": 0.17157275389362875, "learning_rate": 8.651492592337895e-06, "loss": 0.5143, "step": 732 }, { "epoch": 0.7227015035740695, "grad_norm": 0.18539140436717147, "learning_rate": 8.647957365695321e-06, "loss": 0.5189, "step": 733 }, { "epoch": 0.7236874537835839, "grad_norm": 0.16796283507271556, "learning_rate": 8.644418235501576e-06, "loss": 0.5215, "step": 734 }, { "epoch": 0.7246734039930983, "grad_norm": 0.1747485196836248, "learning_rate": 8.64087520554376e-06, "loss": 0.5347, "step": 735 }, { "epoch": 0.7256593542026127, "grad_norm": 0.3195405598301076, "learning_rate": 8.637328279613143e-06, "loss": 0.5253, "step": 736 }, { "epoch": 0.7266453044121272, "grad_norm": 0.17565048687872226, "learning_rate": 8.633777461505167e-06, "loss": 0.5154, "step": 737 }, { "epoch": 0.7276312546216416, "grad_norm": 0.1733570592289281, "learning_rate": 8.630222755019437e-06, "loss": 0.502, "step": 738 }, { "epoch": 0.728617204831156, "grad_norm": 0.3577630848232857, "learning_rate": 8.626664163959722e-06, "loss": 0.5404, "step": 739 }, { "epoch": 0.7296031550406704, "grad_norm": 0.17605315644053063, "learning_rate": 8.623101692133943e-06, "loss": 0.4973, "step": 740 }, { "epoch": 0.7305891052501848, "grad_norm": 0.17239188746088818, "learning_rate": 8.61953534335418e-06, "loss": 0.5219, "step": 741 }, { "epoch": 0.7315750554596993, "grad_norm": 0.17111425984714337, "learning_rate": 8.615965121436652e-06, "loss": 0.5205, "step": 742 }, { "epoch": 0.7325610056692137, "grad_norm": 0.20812294181276428, "learning_rate": 8.61239103020173e-06, "loss": 0.5341, "step": 743 }, { "epoch": 0.7335469558787281, "grad_norm": 0.17875989631698133, "learning_rate": 8.608813073473927e-06, "loss": 0.5249, "step": 744 }, { "epoch": 0.7345329060882425, "grad_norm": 0.17716741956150453, "learning_rate": 8.605231255081885e-06, "loss": 0.5377, "step": 745 }, { "epoch": 0.7355188562977569, "grad_norm": 0.16697367846409525, "learning_rate": 8.601645578858385e-06, "loss": 0.5362, "step": 746 }, { "epoch": 0.7365048065072713, "grad_norm": 0.16740450652200117, "learning_rate": 8.598056048640331e-06, "loss": 0.516, "step": 747 }, { "epoch": 0.7374907567167858, "grad_norm": 0.17036646832008162, "learning_rate": 8.594462668268754e-06, "loss": 0.5254, "step": 748 }, { "epoch": 0.7384767069263002, "grad_norm": 0.18306208877356162, "learning_rate": 8.590865441588804e-06, "loss": 0.5133, "step": 749 }, { "epoch": 0.7394626571358146, "grad_norm": 0.16885245950372, "learning_rate": 8.58726437244975e-06, "loss": 0.5193, "step": 750 }, { "epoch": 0.740448607345329, "grad_norm": 0.1762670178152663, "learning_rate": 8.583659464704965e-06, "loss": 0.5338, "step": 751 }, { "epoch": 0.7414345575548434, "grad_norm": 0.17222086219642968, "learning_rate": 8.580050722211937e-06, "loss": 0.5196, "step": 752 }, { "epoch": 0.7424205077643579, "grad_norm": 0.17374197299439567, "learning_rate": 8.576438148832256e-06, "loss": 0.5273, "step": 753 }, { "epoch": 0.7434064579738723, "grad_norm": 0.17175106736459506, "learning_rate": 8.572821748431606e-06, "loss": 0.5182, "step": 754 }, { "epoch": 0.7443924081833867, "grad_norm": 0.1706566026586807, "learning_rate": 8.569201524879775e-06, "loss": 0.5236, "step": 755 }, { "epoch": 0.7453783583929011, "grad_norm": 0.17427390185507804, "learning_rate": 8.565577482050631e-06, "loss": 0.5223, "step": 756 }, { "epoch": 0.7463643086024155, "grad_norm": 0.18022321211747883, "learning_rate": 8.561949623822141e-06, "loss": 0.5225, "step": 757 }, { "epoch": 0.74735025881193, "grad_norm": 0.17662547715213833, "learning_rate": 8.558317954076349e-06, "loss": 0.5166, "step": 758 }, { "epoch": 0.7483362090214444, "grad_norm": 0.17273233750102043, "learning_rate": 8.554682476699372e-06, "loss": 0.5381, "step": 759 }, { "epoch": 0.7493221592309588, "grad_norm": 0.1872194206953377, "learning_rate": 8.55104319558141e-06, "loss": 0.5323, "step": 760 }, { "epoch": 0.7503081094404732, "grad_norm": 0.2527148861263636, "learning_rate": 8.547400114616728e-06, "loss": 0.5125, "step": 761 }, { "epoch": 0.7512940596499876, "grad_norm": 0.17465398129673773, "learning_rate": 8.543753237703665e-06, "loss": 0.5248, "step": 762 }, { "epoch": 0.752280009859502, "grad_norm": 0.1660106297923328, "learning_rate": 8.540102568744608e-06, "loss": 0.5109, "step": 763 }, { "epoch": 0.7532659600690165, "grad_norm": 0.17253706651070183, "learning_rate": 8.536448111646017e-06, "loss": 0.5307, "step": 764 }, { "epoch": 0.7542519102785309, "grad_norm": 0.17289757333017136, "learning_rate": 8.532789870318392e-06, "loss": 0.5077, "step": 765 }, { "epoch": 0.7552378604880453, "grad_norm": 0.1868838629606392, "learning_rate": 8.529127848676293e-06, "loss": 0.5228, "step": 766 }, { "epoch": 0.7562238106975597, "grad_norm": 0.18131235865759035, "learning_rate": 8.525462050638317e-06, "loss": 0.5178, "step": 767 }, { "epoch": 0.7572097609070741, "grad_norm": 0.17232552266199797, "learning_rate": 8.521792480127111e-06, "loss": 0.5171, "step": 768 }, { "epoch": 0.7581957111165886, "grad_norm": 0.1746415429343793, "learning_rate": 8.51811914106935e-06, "loss": 0.5182, "step": 769 }, { "epoch": 0.759181661326103, "grad_norm": 0.17857413236160963, "learning_rate": 8.514442037395747e-06, "loss": 0.5148, "step": 770 }, { "epoch": 0.7601676115356174, "grad_norm": 0.17572257432702684, "learning_rate": 8.51076117304104e-06, "loss": 0.5189, "step": 771 }, { "epoch": 0.7611535617451318, "grad_norm": 0.16957297368102137, "learning_rate": 8.507076551943993e-06, "loss": 0.5157, "step": 772 }, { "epoch": 0.7621395119546462, "grad_norm": 0.18128339179208697, "learning_rate": 8.503388178047392e-06, "loss": 0.53, "step": 773 }, { "epoch": 0.7631254621641607, "grad_norm": 0.16976166249276048, "learning_rate": 8.499696055298033e-06, "loss": 0.5065, "step": 774 }, { "epoch": 0.7641114123736751, "grad_norm": 0.2109153690670825, "learning_rate": 8.496000187646729e-06, "loss": 0.5076, "step": 775 }, { "epoch": 0.7650973625831895, "grad_norm": 0.17292177920516993, "learning_rate": 8.4923005790483e-06, "loss": 0.504, "step": 776 }, { "epoch": 0.7660833127927039, "grad_norm": 0.20856268456176974, "learning_rate": 8.488597233461563e-06, "loss": 0.5301, "step": 777 }, { "epoch": 0.7670692630022183, "grad_norm": 0.16833157301414384, "learning_rate": 8.48489015484934e-06, "loss": 0.5145, "step": 778 }, { "epoch": 0.7680552132117328, "grad_norm": 0.17347222595579526, "learning_rate": 8.48117934717845e-06, "loss": 0.516, "step": 779 }, { "epoch": 0.7690411634212472, "grad_norm": 0.24132985412414357, "learning_rate": 8.47746481441969e-06, "loss": 0.5063, "step": 780 }, { "epoch": 0.7700271136307616, "grad_norm": 0.17026387255922254, "learning_rate": 8.473746560547857e-06, "loss": 0.5172, "step": 781 }, { "epoch": 0.771013063840276, "grad_norm": 0.16638691748823983, "learning_rate": 8.470024589541724e-06, "loss": 0.5201, "step": 782 }, { "epoch": 0.7719990140497904, "grad_norm": 0.36498027012790685, "learning_rate": 8.466298905384039e-06, "loss": 0.5063, "step": 783 }, { "epoch": 0.7729849642593049, "grad_norm": 0.16813785042319146, "learning_rate": 8.462569512061526e-06, "loss": 0.5365, "step": 784 }, { "epoch": 0.7739709144688193, "grad_norm": 0.17121635835533072, "learning_rate": 8.458836413564881e-06, "loss": 0.5179, "step": 785 }, { "epoch": 0.7749568646783337, "grad_norm": 0.17475909353692448, "learning_rate": 8.45509961388876e-06, "loss": 0.5146, "step": 786 }, { "epoch": 0.7759428148878481, "grad_norm": 0.2162109626239601, "learning_rate": 8.451359117031779e-06, "loss": 0.528, "step": 787 }, { "epoch": 0.7769287650973625, "grad_norm": 0.18007170603621503, "learning_rate": 8.447614926996513e-06, "loss": 0.5175, "step": 788 }, { "epoch": 0.7779147153068771, "grad_norm": 0.17144065048679238, "learning_rate": 8.443867047789494e-06, "loss": 0.5148, "step": 789 }, { "epoch": 0.7789006655163915, "grad_norm": 0.17851513494300253, "learning_rate": 8.440115483421187e-06, "loss": 0.5311, "step": 790 }, { "epoch": 0.7798866157259059, "grad_norm": 0.16612173862337312, "learning_rate": 8.436360237906017e-06, "loss": 0.512, "step": 791 }, { "epoch": 0.7808725659354203, "grad_norm": 0.1853966280987676, "learning_rate": 8.432601315262336e-06, "loss": 0.5019, "step": 792 }, { "epoch": 0.7818585161449347, "grad_norm": 0.17683658205732966, "learning_rate": 8.428838719512437e-06, "loss": 0.484, "step": 793 }, { "epoch": 0.7828444663544492, "grad_norm": 0.536136400317554, "learning_rate": 8.425072454682543e-06, "loss": 0.5322, "step": 794 }, { "epoch": 0.7838304165639636, "grad_norm": 0.28768019540866874, "learning_rate": 8.421302524802799e-06, "loss": 0.5279, "step": 795 }, { "epoch": 0.784816366773478, "grad_norm": 0.16627609196370421, "learning_rate": 8.417528933907276e-06, "loss": 0.5076, "step": 796 }, { "epoch": 0.7858023169829924, "grad_norm": 0.17247981442273191, "learning_rate": 8.413751686033961e-06, "loss": 0.5199, "step": 797 }, { "epoch": 0.7867882671925068, "grad_norm": 0.17103119472700135, "learning_rate": 8.409970785224755e-06, "loss": 0.5059, "step": 798 }, { "epoch": 0.7877742174020212, "grad_norm": 0.18970462671162505, "learning_rate": 8.406186235525466e-06, "loss": 0.514, "step": 799 }, { "epoch": 0.7887601676115357, "grad_norm": 0.17506332006283146, "learning_rate": 8.402398040985809e-06, "loss": 0.5157, "step": 800 }, { "epoch": 0.7897461178210501, "grad_norm": 0.17254204827600841, "learning_rate": 8.398606205659397e-06, "loss": 0.5093, "step": 801 }, { "epoch": 0.7907320680305645, "grad_norm": 0.1715289364194175, "learning_rate": 8.394810733603742e-06, "loss": 0.5174, "step": 802 }, { "epoch": 0.7917180182400789, "grad_norm": 0.16717707688301195, "learning_rate": 8.391011628880243e-06, "loss": 0.5077, "step": 803 }, { "epoch": 0.7927039684495933, "grad_norm": 0.18976171328191369, "learning_rate": 8.387208895554191e-06, "loss": 0.5261, "step": 804 }, { "epoch": 0.7936899186591078, "grad_norm": 0.18091511950181954, "learning_rate": 8.383402537694755e-06, "loss": 0.5329, "step": 805 }, { "epoch": 0.7946758688686222, "grad_norm": 0.18000960716089773, "learning_rate": 8.379592559374987e-06, "loss": 0.5244, "step": 806 }, { "epoch": 0.7956618190781366, "grad_norm": 0.1713765859795502, "learning_rate": 8.37577896467181e-06, "loss": 0.5069, "step": 807 }, { "epoch": 0.796647769287651, "grad_norm": 0.1676906230612834, "learning_rate": 8.371961757666018e-06, "loss": 0.5138, "step": 808 }, { "epoch": 0.7976337194971654, "grad_norm": 0.17398939308947162, "learning_rate": 8.36814094244227e-06, "loss": 0.5008, "step": 809 }, { "epoch": 0.7986196697066799, "grad_norm": 0.16647096267780487, "learning_rate": 8.364316523089089e-06, "loss": 0.5075, "step": 810 }, { "epoch": 0.7996056199161943, "grad_norm": 0.17850069609849975, "learning_rate": 8.360488503698848e-06, "loss": 0.5286, "step": 811 }, { "epoch": 0.8005915701257087, "grad_norm": 0.17037329244238567, "learning_rate": 8.35665688836778e-06, "loss": 0.5076, "step": 812 }, { "epoch": 0.8015775203352231, "grad_norm": 0.17934894999043896, "learning_rate": 8.352821681195958e-06, "loss": 0.4978, "step": 813 }, { "epoch": 0.8025634705447375, "grad_norm": 0.17546299537076732, "learning_rate": 8.348982886287305e-06, "loss": 0.5278, "step": 814 }, { "epoch": 0.803549420754252, "grad_norm": 0.1703902987238829, "learning_rate": 8.345140507749579e-06, "loss": 0.5226, "step": 815 }, { "epoch": 0.8045353709637664, "grad_norm": 0.1684317982131377, "learning_rate": 8.341294549694379e-06, "loss": 0.5147, "step": 816 }, { "epoch": 0.8055213211732808, "grad_norm": 0.17252078349532643, "learning_rate": 8.337445016237124e-06, "loss": 0.5209, "step": 817 }, { "epoch": 0.8065072713827952, "grad_norm": 0.17367668306172382, "learning_rate": 8.333591911497069e-06, "loss": 0.5316, "step": 818 }, { "epoch": 0.8074932215923096, "grad_norm": 0.16782970866036928, "learning_rate": 8.329735239597282e-06, "loss": 0.504, "step": 819 }, { "epoch": 0.808479171801824, "grad_norm": 0.17483998670023923, "learning_rate": 8.325875004664659e-06, "loss": 0.5135, "step": 820 }, { "epoch": 0.8094651220113385, "grad_norm": 0.1708654556051924, "learning_rate": 8.322011210829895e-06, "loss": 0.5333, "step": 821 }, { "epoch": 0.8104510722208529, "grad_norm": 0.17013008093470075, "learning_rate": 8.318143862227504e-06, "loss": 0.5073, "step": 822 }, { "epoch": 0.8114370224303673, "grad_norm": 0.16511648195326395, "learning_rate": 8.314272962995796e-06, "loss": 0.5136, "step": 823 }, { "epoch": 0.8124229726398817, "grad_norm": 0.16893211551503326, "learning_rate": 8.31039851727689e-06, "loss": 0.4973, "step": 824 }, { "epoch": 0.8134089228493961, "grad_norm": 0.17199240095995227, "learning_rate": 8.30652052921669e-06, "loss": 0.5096, "step": 825 }, { "epoch": 0.8143948730589106, "grad_norm": 0.18280485626898793, "learning_rate": 8.302639002964899e-06, "loss": 0.5188, "step": 826 }, { "epoch": 0.815380823268425, "grad_norm": 0.16706532361316342, "learning_rate": 8.298753942674999e-06, "loss": 0.5116, "step": 827 }, { "epoch": 0.8163667734779394, "grad_norm": 0.1721919933141353, "learning_rate": 8.294865352504257e-06, "loss": 0.5346, "step": 828 }, { "epoch": 0.8173527236874538, "grad_norm": 0.17172107926109706, "learning_rate": 8.290973236613718e-06, "loss": 0.5342, "step": 829 }, { "epoch": 0.8183386738969682, "grad_norm": 0.21013618327969716, "learning_rate": 8.287077599168197e-06, "loss": 0.524, "step": 830 }, { "epoch": 0.8193246241064827, "grad_norm": 0.17027795744853666, "learning_rate": 8.283178444336281e-06, "loss": 0.5326, "step": 831 }, { "epoch": 0.8203105743159971, "grad_norm": 0.17343975258847022, "learning_rate": 8.279275776290316e-06, "loss": 0.5108, "step": 832 }, { "epoch": 0.8212965245255115, "grad_norm": 0.1712135908466209, "learning_rate": 8.275369599206415e-06, "loss": 0.5173, "step": 833 }, { "epoch": 0.8222824747350259, "grad_norm": 0.18313799778430767, "learning_rate": 8.271459917264435e-06, "loss": 0.5245, "step": 834 }, { "epoch": 0.8232684249445403, "grad_norm": 0.17493574661254696, "learning_rate": 8.267546734647993e-06, "loss": 0.5171, "step": 835 }, { "epoch": 0.8242543751540548, "grad_norm": 0.16832759951679077, "learning_rate": 8.263630055544447e-06, "loss": 0.5179, "step": 836 }, { "epoch": 0.8252403253635692, "grad_norm": 0.16866467872444635, "learning_rate": 8.2597098841449e-06, "loss": 0.5068, "step": 837 }, { "epoch": 0.8262262755730836, "grad_norm": 0.1808763535003984, "learning_rate": 8.25578622464419e-06, "loss": 0.4934, "step": 838 }, { "epoch": 0.827212225782598, "grad_norm": 0.1655924359407733, "learning_rate": 8.251859081240882e-06, "loss": 0.5008, "step": 839 }, { "epoch": 0.8281981759921124, "grad_norm": 0.1794886277416319, "learning_rate": 8.24792845813728e-06, "loss": 0.5236, "step": 840 }, { "epoch": 0.8291841262016268, "grad_norm": 0.18082788296448596, "learning_rate": 8.243994359539404e-06, "loss": 0.5277, "step": 841 }, { "epoch": 0.8301700764111413, "grad_norm": 0.17032445693735299, "learning_rate": 8.240056789656996e-06, "loss": 0.492, "step": 842 }, { "epoch": 0.8311560266206557, "grad_norm": 0.1818549810915091, "learning_rate": 8.23611575270351e-06, "loss": 0.5179, "step": 843 }, { "epoch": 0.8321419768301701, "grad_norm": 0.1800975573391804, "learning_rate": 8.23217125289611e-06, "loss": 0.5441, "step": 844 }, { "epoch": 0.8331279270396845, "grad_norm": 0.17835570335483725, "learning_rate": 8.228223294455668e-06, "loss": 0.5355, "step": 845 }, { "epoch": 0.8341138772491989, "grad_norm": 0.1786970391841637, "learning_rate": 8.224271881606758e-06, "loss": 0.5266, "step": 846 }, { "epoch": 0.8350998274587134, "grad_norm": 0.1719155152862329, "learning_rate": 8.220317018577645e-06, "loss": 0.5048, "step": 847 }, { "epoch": 0.8360857776682278, "grad_norm": 0.1919660610537843, "learning_rate": 8.216358709600291e-06, "loss": 0.5093, "step": 848 }, { "epoch": 0.8370717278777422, "grad_norm": 0.2929155996744783, "learning_rate": 8.212396958910343e-06, "loss": 0.5188, "step": 849 }, { "epoch": 0.8380576780872566, "grad_norm": 0.16517385158404066, "learning_rate": 8.208431770747133e-06, "loss": 0.5163, "step": 850 }, { "epoch": 0.839043628296771, "grad_norm": 0.1610551985539912, "learning_rate": 8.204463149353667e-06, "loss": 0.5115, "step": 851 }, { "epoch": 0.8400295785062855, "grad_norm": 0.17113931041207608, "learning_rate": 8.20049109897663e-06, "loss": 0.5045, "step": 852 }, { "epoch": 0.8410155287157999, "grad_norm": 0.17732907409269819, "learning_rate": 8.19651562386637e-06, "loss": 0.5134, "step": 853 }, { "epoch": 0.8420014789253143, "grad_norm": 0.16174127147822506, "learning_rate": 8.192536728276907e-06, "loss": 0.5275, "step": 854 }, { "epoch": 0.8429874291348287, "grad_norm": 0.18285945322027478, "learning_rate": 8.188554416465918e-06, "loss": 0.5036, "step": 855 }, { "epoch": 0.8439733793443431, "grad_norm": 0.17393897848244694, "learning_rate": 8.184568692694732e-06, "loss": 0.5335, "step": 856 }, { "epoch": 0.8449593295538576, "grad_norm": 0.16875773096341296, "learning_rate": 8.180579561228334e-06, "loss": 0.5056, "step": 857 }, { "epoch": 0.845945279763372, "grad_norm": 0.1708121787085384, "learning_rate": 8.176587026335354e-06, "loss": 0.5163, "step": 858 }, { "epoch": 0.8469312299728864, "grad_norm": 0.18144067908380584, "learning_rate": 8.172591092288062e-06, "loss": 0.5174, "step": 859 }, { "epoch": 0.8479171801824008, "grad_norm": 0.17396213489252815, "learning_rate": 8.168591763362369e-06, "loss": 0.532, "step": 860 }, { "epoch": 0.8489031303919152, "grad_norm": 0.17377129128530383, "learning_rate": 8.164589043837814e-06, "loss": 0.51, "step": 861 }, { "epoch": 0.8498890806014296, "grad_norm": 0.17497985775285094, "learning_rate": 8.160582937997567e-06, "loss": 0.5287, "step": 862 }, { "epoch": 0.8508750308109441, "grad_norm": 0.17391851185726995, "learning_rate": 8.156573450128425e-06, "loss": 0.5177, "step": 863 }, { "epoch": 0.8518609810204585, "grad_norm": 0.3233822011352339, "learning_rate": 8.152560584520794e-06, "loss": 0.5058, "step": 864 }, { "epoch": 0.8528469312299729, "grad_norm": 0.18345526720652805, "learning_rate": 8.148544345468707e-06, "loss": 0.5169, "step": 865 }, { "epoch": 0.8538328814394873, "grad_norm": 0.18520161454156156, "learning_rate": 8.144524737269797e-06, "loss": 0.5204, "step": 866 }, { "epoch": 0.8548188316490017, "grad_norm": 0.16136672291650989, "learning_rate": 8.140501764225304e-06, "loss": 0.4984, "step": 867 }, { "epoch": 0.8558047818585162, "grad_norm": 0.17069620807049551, "learning_rate": 8.136475430640076e-06, "loss": 0.5251, "step": 868 }, { "epoch": 0.8567907320680306, "grad_norm": 0.17689553735324884, "learning_rate": 8.132445740822546e-06, "loss": 0.5151, "step": 869 }, { "epoch": 0.857776682277545, "grad_norm": 0.18881379637189347, "learning_rate": 8.128412699084744e-06, "loss": 0.5173, "step": 870 }, { "epoch": 0.8587626324870594, "grad_norm": 0.17266036667964668, "learning_rate": 8.12437630974229e-06, "loss": 0.504, "step": 871 }, { "epoch": 0.8597485826965738, "grad_norm": 0.17363778574025837, "learning_rate": 8.120336577114382e-06, "loss": 0.5288, "step": 872 }, { "epoch": 0.8607345329060883, "grad_norm": 0.17300665468675847, "learning_rate": 8.116293505523793e-06, "loss": 0.5192, "step": 873 }, { "epoch": 0.8617204831156027, "grad_norm": 0.16167858955319575, "learning_rate": 8.112247099296873e-06, "loss": 0.5215, "step": 874 }, { "epoch": 0.8627064333251171, "grad_norm": 0.16165411639123678, "learning_rate": 8.108197362763542e-06, "loss": 0.4974, "step": 875 }, { "epoch": 0.8636923835346315, "grad_norm": 0.17052900436983048, "learning_rate": 8.104144300257277e-06, "loss": 0.5027, "step": 876 }, { "epoch": 0.8646783337441459, "grad_norm": 0.170918407922491, "learning_rate": 8.100087916115121e-06, "loss": 0.5205, "step": 877 }, { "epoch": 0.8656642839536604, "grad_norm": 0.1658525751356633, "learning_rate": 8.096028214677666e-06, "loss": 0.4945, "step": 878 }, { "epoch": 0.8666502341631748, "grad_norm": 0.1638429649649223, "learning_rate": 8.09196520028906e-06, "loss": 0.4963, "step": 879 }, { "epoch": 0.8676361843726892, "grad_norm": 0.17112577903597162, "learning_rate": 8.08789887729699e-06, "loss": 0.5219, "step": 880 }, { "epoch": 0.8686221345822036, "grad_norm": 0.17209000636907013, "learning_rate": 8.08382925005268e-06, "loss": 0.5191, "step": 881 }, { "epoch": 0.869608084791718, "grad_norm": 0.17224830148926035, "learning_rate": 8.079756322910906e-06, "loss": 0.5275, "step": 882 }, { "epoch": 0.8705940350012324, "grad_norm": 0.1736943438173366, "learning_rate": 8.075680100229957e-06, "loss": 0.5165, "step": 883 }, { "epoch": 0.8715799852107469, "grad_norm": 0.18029154396746958, "learning_rate": 8.071600586371655e-06, "loss": 0.5139, "step": 884 }, { "epoch": 0.8725659354202613, "grad_norm": 0.17136245587960056, "learning_rate": 8.067517785701352e-06, "loss": 0.5233, "step": 885 }, { "epoch": 0.8735518856297757, "grad_norm": 0.17555967885359414, "learning_rate": 8.0634317025879e-06, "loss": 0.5443, "step": 886 }, { "epoch": 0.8745378358392901, "grad_norm": 0.16247925338321514, "learning_rate": 8.059342341403683e-06, "loss": 0.5183, "step": 887 }, { "epoch": 0.8755237860488045, "grad_norm": 0.16893395689368534, "learning_rate": 8.055249706524575e-06, "loss": 0.5199, "step": 888 }, { "epoch": 0.876509736258319, "grad_norm": 0.16949385808256937, "learning_rate": 8.051153802329963e-06, "loss": 0.5016, "step": 889 }, { "epoch": 0.8774956864678334, "grad_norm": 0.1927330298612067, "learning_rate": 8.047054633202734e-06, "loss": 0.5129, "step": 890 }, { "epoch": 0.8784816366773478, "grad_norm": 0.17067432797447837, "learning_rate": 8.042952203529262e-06, "loss": 0.5326, "step": 891 }, { "epoch": 0.8794675868868622, "grad_norm": 0.1739293602849575, "learning_rate": 8.038846517699413e-06, "loss": 0.5241, "step": 892 }, { "epoch": 0.8804535370963766, "grad_norm": 0.171650640715769, "learning_rate": 8.034737580106537e-06, "loss": 0.5209, "step": 893 }, { "epoch": 0.8814394873058911, "grad_norm": 0.1757326594314937, "learning_rate": 8.030625395147467e-06, "loss": 0.53, "step": 894 }, { "epoch": 0.8824254375154055, "grad_norm": 0.1604081833605916, "learning_rate": 8.026509967222504e-06, "loss": 0.4722, "step": 895 }, { "epoch": 0.8834113877249199, "grad_norm": 0.17236074876862664, "learning_rate": 8.022391300735424e-06, "loss": 0.5274, "step": 896 }, { "epoch": 0.8843973379344343, "grad_norm": 0.1794389453398671, "learning_rate": 8.01826940009347e-06, "loss": 0.5235, "step": 897 }, { "epoch": 0.8853832881439487, "grad_norm": 0.17308007813999723, "learning_rate": 8.01414426970734e-06, "loss": 0.5361, "step": 898 }, { "epoch": 0.8863692383534632, "grad_norm": 0.18746313776847923, "learning_rate": 8.010015913991194e-06, "loss": 0.5124, "step": 899 }, { "epoch": 0.8873551885629776, "grad_norm": 0.16596205512741172, "learning_rate": 8.005884337362637e-06, "loss": 0.5024, "step": 900 }, { "epoch": 0.888341138772492, "grad_norm": 0.17098191624404319, "learning_rate": 8.001749544242728e-06, "loss": 0.5285, "step": 901 }, { "epoch": 0.8893270889820064, "grad_norm": 0.17678419257324562, "learning_rate": 7.997611539055962e-06, "loss": 0.5244, "step": 902 }, { "epoch": 0.8903130391915208, "grad_norm": 0.16721705873096537, "learning_rate": 7.993470326230274e-06, "loss": 0.5296, "step": 903 }, { "epoch": 0.8912989894010352, "grad_norm": 0.17021479193248848, "learning_rate": 7.98932591019703e-06, "loss": 0.5162, "step": 904 }, { "epoch": 0.8922849396105497, "grad_norm": 0.17301667833486167, "learning_rate": 7.985178295391023e-06, "loss": 0.5302, "step": 905 }, { "epoch": 0.8932708898200641, "grad_norm": 0.1736113746385346, "learning_rate": 7.981027486250472e-06, "loss": 0.5193, "step": 906 }, { "epoch": 0.8942568400295785, "grad_norm": 0.16199362086063365, "learning_rate": 7.976873487217011e-06, "loss": 0.5048, "step": 907 }, { "epoch": 0.8952427902390929, "grad_norm": 0.17212503754715186, "learning_rate": 7.972716302735692e-06, "loss": 0.5225, "step": 908 }, { "epoch": 0.8962287404486073, "grad_norm": 0.1645622560183533, "learning_rate": 7.968555937254967e-06, "loss": 0.5165, "step": 909 }, { "epoch": 0.8972146906581218, "grad_norm": 0.16996160353433545, "learning_rate": 7.964392395226699e-06, "loss": 0.518, "step": 910 }, { "epoch": 0.8982006408676362, "grad_norm": 0.17572896215315817, "learning_rate": 7.960225681106151e-06, "loss": 0.5247, "step": 911 }, { "epoch": 0.8991865910771506, "grad_norm": 0.1719561893313999, "learning_rate": 7.956055799351972e-06, "loss": 0.5073, "step": 912 }, { "epoch": 0.900172541286665, "grad_norm": 0.16090581754225922, "learning_rate": 7.951882754426212e-06, "loss": 0.516, "step": 913 }, { "epoch": 0.9011584914961794, "grad_norm": 0.1807242712673357, "learning_rate": 7.947706550794297e-06, "loss": 0.5059, "step": 914 }, { "epoch": 0.9021444417056939, "grad_norm": 0.17762020465728326, "learning_rate": 7.943527192925035e-06, "loss": 0.5195, "step": 915 }, { "epoch": 0.9031303919152083, "grad_norm": 0.17141091502345296, "learning_rate": 7.939344685290612e-06, "loss": 0.5204, "step": 916 }, { "epoch": 0.9041163421247227, "grad_norm": 0.17266717729532846, "learning_rate": 7.935159032366583e-06, "loss": 0.5046, "step": 917 }, { "epoch": 0.9051022923342371, "grad_norm": 0.16644553924878852, "learning_rate": 7.930970238631867e-06, "loss": 0.5212, "step": 918 }, { "epoch": 0.9060882425437515, "grad_norm": 0.1792204259622754, "learning_rate": 7.926778308568746e-06, "loss": 0.5103, "step": 919 }, { "epoch": 0.907074192753266, "grad_norm": 0.16523094967902968, "learning_rate": 7.922583246662858e-06, "loss": 0.5136, "step": 920 }, { "epoch": 0.9080601429627804, "grad_norm": 0.2119077911673333, "learning_rate": 7.918385057403188e-06, "loss": 0.5039, "step": 921 }, { "epoch": 0.9090460931722948, "grad_norm": 0.17711635746479226, "learning_rate": 7.914183745282076e-06, "loss": 0.5233, "step": 922 }, { "epoch": 0.9100320433818092, "grad_norm": 0.1660947936258377, "learning_rate": 7.909979314795195e-06, "loss": 0.5132, "step": 923 }, { "epoch": 0.9110179935913236, "grad_norm": 0.16855795483064662, "learning_rate": 7.905771770441559e-06, "loss": 0.5065, "step": 924 }, { "epoch": 0.912003943800838, "grad_norm": 0.16027620736877113, "learning_rate": 7.901561116723518e-06, "loss": 0.5091, "step": 925 }, { "epoch": 0.9129898940103525, "grad_norm": 0.16175407780430773, "learning_rate": 7.897347358146736e-06, "loss": 0.5193, "step": 926 }, { "epoch": 0.9139758442198669, "grad_norm": 0.16488932476570586, "learning_rate": 7.893130499220216e-06, "loss": 0.5123, "step": 927 }, { "epoch": 0.9149617944293813, "grad_norm": 0.1734873791504118, "learning_rate": 7.888910544456269e-06, "loss": 0.5063, "step": 928 }, { "epoch": 0.9159477446388957, "grad_norm": 0.17958456178996576, "learning_rate": 7.884687498370519e-06, "loss": 0.4909, "step": 929 }, { "epoch": 0.9169336948484101, "grad_norm": 0.16643703344223132, "learning_rate": 7.880461365481898e-06, "loss": 0.5182, "step": 930 }, { "epoch": 0.9179196450579246, "grad_norm": 0.1720747470600968, "learning_rate": 7.876232150312646e-06, "loss": 0.5151, "step": 931 }, { "epoch": 0.918905595267439, "grad_norm": 0.17776188364898196, "learning_rate": 7.871999857388295e-06, "loss": 0.5183, "step": 932 }, { "epoch": 0.9198915454769534, "grad_norm": 0.1615927283246447, "learning_rate": 7.867764491237675e-06, "loss": 0.5327, "step": 933 }, { "epoch": 0.9208774956864678, "grad_norm": 0.1677229785390601, "learning_rate": 7.863526056392904e-06, "loss": 0.5269, "step": 934 }, { "epoch": 0.9218634458959822, "grad_norm": 0.18531257466086812, "learning_rate": 7.85928455738938e-06, "loss": 0.5076, "step": 935 }, { "epoch": 0.9228493961054967, "grad_norm": 0.17682682721854523, "learning_rate": 7.855039998765781e-06, "loss": 0.5311, "step": 936 }, { "epoch": 0.9238353463150111, "grad_norm": 0.1595345354564208, "learning_rate": 7.850792385064064e-06, "loss": 0.5139, "step": 937 }, { "epoch": 0.9248212965245255, "grad_norm": 0.1621031020930392, "learning_rate": 7.846541720829448e-06, "loss": 0.5065, "step": 938 }, { "epoch": 0.9258072467340399, "grad_norm": 0.16982048292608531, "learning_rate": 7.84228801061042e-06, "loss": 0.5244, "step": 939 }, { "epoch": 0.9267931969435543, "grad_norm": 0.164667046714563, "learning_rate": 7.83803125895873e-06, "loss": 0.4993, "step": 940 }, { "epoch": 0.9277791471530688, "grad_norm": 0.1664120555152627, "learning_rate": 7.833771470429375e-06, "loss": 0.5236, "step": 941 }, { "epoch": 0.9287650973625832, "grad_norm": 0.16363510468965828, "learning_rate": 7.829508649580604e-06, "loss": 0.483, "step": 942 }, { "epoch": 0.9297510475720976, "grad_norm": 0.18300991114328144, "learning_rate": 7.825242800973915e-06, "loss": 0.5052, "step": 943 }, { "epoch": 0.930736997781612, "grad_norm": 0.16805069209354068, "learning_rate": 7.82097392917404e-06, "loss": 0.5023, "step": 944 }, { "epoch": 0.9317229479911264, "grad_norm": 0.17945312719468895, "learning_rate": 7.816702038748953e-06, "loss": 0.5206, "step": 945 }, { "epoch": 0.9327088982006408, "grad_norm": 0.16520942441751493, "learning_rate": 7.812427134269852e-06, "loss": 0.5054, "step": 946 }, { "epoch": 0.9336948484101553, "grad_norm": 0.18163830376615256, "learning_rate": 7.80814922031116e-06, "loss": 0.5371, "step": 947 }, { "epoch": 0.9346807986196697, "grad_norm": 0.16622720857375242, "learning_rate": 7.803868301450528e-06, "loss": 0.5063, "step": 948 }, { "epoch": 0.9356667488291841, "grad_norm": 0.17626958165949663, "learning_rate": 7.79958438226881e-06, "loss": 0.5154, "step": 949 }, { "epoch": 0.9366526990386985, "grad_norm": 0.17179172076094043, "learning_rate": 7.795297467350083e-06, "loss": 0.5056, "step": 950 }, { "epoch": 0.9376386492482129, "grad_norm": 0.17404831583610947, "learning_rate": 7.791007561281623e-06, "loss": 0.504, "step": 951 }, { "epoch": 0.9386245994577274, "grad_norm": 0.17341440294441673, "learning_rate": 7.786714668653907e-06, "loss": 0.4975, "step": 952 }, { "epoch": 0.9396105496672418, "grad_norm": 0.17106305253040335, "learning_rate": 7.782418794060609e-06, "loss": 0.5304, "step": 953 }, { "epoch": 0.9405964998767562, "grad_norm": 0.1657714097764547, "learning_rate": 7.778119942098594e-06, "loss": 0.5008, "step": 954 }, { "epoch": 0.9415824500862706, "grad_norm": 0.17215962243937322, "learning_rate": 7.773818117367913e-06, "loss": 0.5039, "step": 955 }, { "epoch": 0.942568400295785, "grad_norm": 0.1708309325645627, "learning_rate": 7.769513324471798e-06, "loss": 0.4907, "step": 956 }, { "epoch": 0.9435543505052995, "grad_norm": 0.16824173694397346, "learning_rate": 7.765205568016654e-06, "loss": 0.5279, "step": 957 }, { "epoch": 0.9445403007148139, "grad_norm": 0.1631371599910111, "learning_rate": 7.760894852612064e-06, "loss": 0.5017, "step": 958 }, { "epoch": 0.9455262509243283, "grad_norm": 0.1654800873573575, "learning_rate": 7.75658118287077e-06, "loss": 0.5011, "step": 959 }, { "epoch": 0.9465122011338427, "grad_norm": 0.1701294258575521, "learning_rate": 7.75226456340868e-06, "loss": 0.5028, "step": 960 }, { "epoch": 0.9474981513433571, "grad_norm": 0.17081358124511606, "learning_rate": 7.747944998844858e-06, "loss": 0.5147, "step": 961 }, { "epoch": 0.9484841015528716, "grad_norm": 0.1702922338705157, "learning_rate": 7.743622493801518e-06, "loss": 0.5247, "step": 962 }, { "epoch": 0.949470051762386, "grad_norm": 0.16354067278521262, "learning_rate": 7.739297052904018e-06, "loss": 0.508, "step": 963 }, { "epoch": 0.9504560019719004, "grad_norm": 0.1632330387681569, "learning_rate": 7.734968680780865e-06, "loss": 0.5197, "step": 964 }, { "epoch": 0.9514419521814148, "grad_norm": 0.16707232934194988, "learning_rate": 7.730637382063696e-06, "loss": 0.5168, "step": 965 }, { "epoch": 0.9524279023909292, "grad_norm": 0.1698770968536073, "learning_rate": 7.72630316138728e-06, "loss": 0.501, "step": 966 }, { "epoch": 0.9534138526004436, "grad_norm": 0.16574650052795398, "learning_rate": 7.721966023389519e-06, "loss": 0.481, "step": 967 }, { "epoch": 0.9543998028099581, "grad_norm": 0.1765087940076975, "learning_rate": 7.717625972711429e-06, "loss": 0.5018, "step": 968 }, { "epoch": 0.9553857530194725, "grad_norm": 0.16999695085713057, "learning_rate": 7.713283013997145e-06, "loss": 0.5117, "step": 969 }, { "epoch": 0.9563717032289869, "grad_norm": 0.16492375715904378, "learning_rate": 7.708937151893917e-06, "loss": 0.5295, "step": 970 }, { "epoch": 0.9573576534385013, "grad_norm": 0.1770079333459357, "learning_rate": 7.704588391052099e-06, "loss": 0.5096, "step": 971 }, { "epoch": 0.9583436036480157, "grad_norm": 0.26749772764267893, "learning_rate": 7.700236736125146e-06, "loss": 0.5036, "step": 972 }, { "epoch": 0.9593295538575302, "grad_norm": 0.16742320355306792, "learning_rate": 7.695882191769614e-06, "loss": 0.513, "step": 973 }, { "epoch": 0.9603155040670446, "grad_norm": 0.1654819848687592, "learning_rate": 7.691524762645147e-06, "loss": 0.5019, "step": 974 }, { "epoch": 0.961301454276559, "grad_norm": 0.17272585215388925, "learning_rate": 7.687164453414475e-06, "loss": 0.53, "step": 975 }, { "epoch": 0.9622874044860734, "grad_norm": 0.17361850361878714, "learning_rate": 7.682801268743413e-06, "loss": 0.5216, "step": 976 }, { "epoch": 0.9632733546955878, "grad_norm": 0.1705371849345429, "learning_rate": 7.678435213300851e-06, "loss": 0.5023, "step": 977 }, { "epoch": 0.9642593049051023, "grad_norm": 0.16294961746913053, "learning_rate": 7.674066291758756e-06, "loss": 0.4913, "step": 978 }, { "epoch": 0.9652452551146167, "grad_norm": 0.16631852074052206, "learning_rate": 7.669694508792153e-06, "loss": 0.5192, "step": 979 }, { "epoch": 0.9662312053241311, "grad_norm": 0.1703772839195891, "learning_rate": 7.665319869079136e-06, "loss": 0.5094, "step": 980 }, { "epoch": 0.9672171555336455, "grad_norm": 0.16104122981289215, "learning_rate": 7.660942377300853e-06, "loss": 0.5045, "step": 981 }, { "epoch": 0.9682031057431599, "grad_norm": 0.1687999170074842, "learning_rate": 7.656562038141502e-06, "loss": 0.518, "step": 982 }, { "epoch": 0.9691890559526743, "grad_norm": 0.1640142258647821, "learning_rate": 7.652178856288333e-06, "loss": 0.5123, "step": 983 }, { "epoch": 0.9701750061621888, "grad_norm": 0.29857681452884277, "learning_rate": 7.647792836431633e-06, "loss": 0.5014, "step": 984 }, { "epoch": 0.9711609563717032, "grad_norm": 0.166417158069653, "learning_rate": 7.643403983264733e-06, "loss": 0.5174, "step": 985 }, { "epoch": 0.9721469065812176, "grad_norm": 0.17976511665766357, "learning_rate": 7.639012301483983e-06, "loss": 0.51, "step": 986 }, { "epoch": 0.973132856790732, "grad_norm": 0.16710700103318898, "learning_rate": 7.634617795788773e-06, "loss": 0.508, "step": 987 }, { "epoch": 0.9741188070002464, "grad_norm": 0.16652682426124865, "learning_rate": 7.630220470881506e-06, "loss": 0.4879, "step": 988 }, { "epoch": 0.9751047572097609, "grad_norm": 0.18519334923187553, "learning_rate": 7.6258203314676105e-06, "loss": 0.523, "step": 989 }, { "epoch": 0.9760907074192753, "grad_norm": 0.1769398076186771, "learning_rate": 7.621417382255516e-06, "loss": 0.4924, "step": 990 }, { "epoch": 0.9770766576287897, "grad_norm": 0.1749755537077236, "learning_rate": 7.617011627956665e-06, "loss": 0.5261, "step": 991 }, { "epoch": 0.9780626078383041, "grad_norm": 0.19223145523362156, "learning_rate": 7.612603073285503e-06, "loss": 0.5277, "step": 992 }, { "epoch": 0.9790485580478185, "grad_norm": 0.16720986150018388, "learning_rate": 7.608191722959466e-06, "loss": 0.5331, "step": 993 }, { "epoch": 0.980034508257333, "grad_norm": 0.17106542111837608, "learning_rate": 7.6037775816989875e-06, "loss": 0.5117, "step": 994 }, { "epoch": 0.9810204584668474, "grad_norm": 0.17579588202811694, "learning_rate": 7.599360654227485e-06, "loss": 0.5151, "step": 995 }, { "epoch": 0.9820064086763618, "grad_norm": 0.17106435785907712, "learning_rate": 7.5949409452713585e-06, "loss": 0.5201, "step": 996 }, { "epoch": 0.9829923588858762, "grad_norm": 0.16347983018754825, "learning_rate": 7.590518459559981e-06, "loss": 0.5084, "step": 997 }, { "epoch": 0.9839783090953906, "grad_norm": 0.18181892193197213, "learning_rate": 7.586093201825702e-06, "loss": 0.5467, "step": 998 }, { "epoch": 0.984964259304905, "grad_norm": 0.16737675009795525, "learning_rate": 7.581665176803832e-06, "loss": 0.5264, "step": 999 }, { "epoch": 0.9859502095144195, "grad_norm": 0.1698415867558201, "learning_rate": 7.577234389232646e-06, "loss": 0.5005, "step": 1000 }, { "epoch": 0.9869361597239339, "grad_norm": 0.17199239759519194, "learning_rate": 7.572800843853376e-06, "loss": 0.5128, "step": 1001 }, { "epoch": 0.9879221099334483, "grad_norm": 0.18133702646379646, "learning_rate": 7.568364545410201e-06, "loss": 0.5225, "step": 1002 }, { "epoch": 0.9889080601429627, "grad_norm": 0.19334275561566608, "learning_rate": 7.563925498650248e-06, "loss": 0.5261, "step": 1003 }, { "epoch": 0.9898940103524771, "grad_norm": 0.17381612732882007, "learning_rate": 7.5594837083235894e-06, "loss": 0.5072, "step": 1004 }, { "epoch": 0.9908799605619916, "grad_norm": 0.16380698462662358, "learning_rate": 7.555039179183223e-06, "loss": 0.5049, "step": 1005 }, { "epoch": 0.991865910771506, "grad_norm": 0.17462198651108224, "learning_rate": 7.55059191598509e-06, "loss": 0.5277, "step": 1006 }, { "epoch": 0.9928518609810204, "grad_norm": 0.17953006072611008, "learning_rate": 7.546141923488045e-06, "loss": 0.5186, "step": 1007 }, { "epoch": 0.9938378111905348, "grad_norm": 0.16865358332826616, "learning_rate": 7.541689206453873e-06, "loss": 0.5244, "step": 1008 }, { "epoch": 0.9948237614000492, "grad_norm": 0.17304916526129824, "learning_rate": 7.5372337696472674e-06, "loss": 0.5171, "step": 1009 }, { "epoch": 0.9958097116095637, "grad_norm": 0.17637586538791555, "learning_rate": 7.532775617835836e-06, "loss": 0.5249, "step": 1010 }, { "epoch": 0.9967956618190781, "grad_norm": 0.16779739755098547, "learning_rate": 7.528314755790089e-06, "loss": 0.5117, "step": 1011 }, { "epoch": 0.9977816120285925, "grad_norm": 0.16941643606585818, "learning_rate": 7.523851188283442e-06, "loss": 0.4903, "step": 1012 }, { "epoch": 0.9987675622381069, "grad_norm": 0.2093863707411628, "learning_rate": 7.5193849200921986e-06, "loss": 0.537, "step": 1013 }, { "epoch": 0.9997535124476213, "grad_norm": 0.16661642473837618, "learning_rate": 7.514915955995558e-06, "loss": 0.5032, "step": 1014 }, { "epoch": 1.0007394626571358, "grad_norm": 0.17389943690378837, "learning_rate": 7.510444300775599e-06, "loss": 0.524, "step": 1015 }, { "epoch": 1.0017254128666502, "grad_norm": 0.17311914260770017, "learning_rate": 7.505969959217285e-06, "loss": 0.5028, "step": 1016 }, { "epoch": 1.0029578506285433, "grad_norm": 0.19178452526435935, "learning_rate": 7.501492936108454e-06, "loss": 0.4834, "step": 1017 }, { "epoch": 1.0039438008380577, "grad_norm": 0.16819127155730185, "learning_rate": 7.497013236239805e-06, "loss": 0.4673, "step": 1018 }, { "epoch": 1.004929751047572, "grad_norm": 0.2087088082599385, "learning_rate": 7.492530864404916e-06, "loss": 0.463, "step": 1019 }, { "epoch": 1.0059157012570865, "grad_norm": 0.17023882357913672, "learning_rate": 7.488045825400208e-06, "loss": 0.465, "step": 1020 }, { "epoch": 1.006901651466601, "grad_norm": 0.19096706186861023, "learning_rate": 7.483558124024968e-06, "loss": 0.4836, "step": 1021 }, { "epoch": 1.0078876016761154, "grad_norm": 0.17292587142561078, "learning_rate": 7.479067765081327e-06, "loss": 0.4647, "step": 1022 }, { "epoch": 1.0088735518856298, "grad_norm": 0.20970851321219616, "learning_rate": 7.4745747533742604e-06, "loss": 0.481, "step": 1023 }, { "epoch": 1.0098595020951442, "grad_norm": 0.19957414505514495, "learning_rate": 7.470079093711583e-06, "loss": 0.4819, "step": 1024 }, { "epoch": 1.0108454523046586, "grad_norm": 0.17235557104698107, "learning_rate": 7.465580790903941e-06, "loss": 0.4664, "step": 1025 }, { "epoch": 1.011831402514173, "grad_norm": 0.17099614783137973, "learning_rate": 7.461079849764812e-06, "loss": 0.4594, "step": 1026 }, { "epoch": 1.0128173527236874, "grad_norm": 0.17371113860745846, "learning_rate": 7.456576275110495e-06, "loss": 0.4627, "step": 1027 }, { "epoch": 1.0138033029332019, "grad_norm": 0.18992048708241716, "learning_rate": 7.452070071760106e-06, "loss": 0.4753, "step": 1028 }, { "epoch": 1.0147892531427163, "grad_norm": 0.17058713785573343, "learning_rate": 7.447561244535575e-06, "loss": 0.4693, "step": 1029 }, { "epoch": 1.0157752033522307, "grad_norm": 0.1696152852494023, "learning_rate": 7.443049798261643e-06, "loss": 0.4748, "step": 1030 }, { "epoch": 1.0167611535617451, "grad_norm": 0.17311444766409678, "learning_rate": 7.438535737765846e-06, "loss": 0.4738, "step": 1031 }, { "epoch": 1.0177471037712595, "grad_norm": 0.1789380413927639, "learning_rate": 7.434019067878524e-06, "loss": 0.4868, "step": 1032 }, { "epoch": 1.018733053980774, "grad_norm": 0.1776370707448934, "learning_rate": 7.429499793432806e-06, "loss": 0.4639, "step": 1033 }, { "epoch": 1.0197190041902884, "grad_norm": 0.16321989041138424, "learning_rate": 7.424977919264611e-06, "loss": 0.4646, "step": 1034 }, { "epoch": 1.0207049543998028, "grad_norm": 0.3565072795267459, "learning_rate": 7.420453450212635e-06, "loss": 0.4731, "step": 1035 }, { "epoch": 1.0216909046093172, "grad_norm": 0.1718732951805992, "learning_rate": 7.415926391118357e-06, "loss": 0.4734, "step": 1036 }, { "epoch": 1.0226768548188316, "grad_norm": 0.17702243568254172, "learning_rate": 7.41139674682602e-06, "loss": 0.4752, "step": 1037 }, { "epoch": 1.023662805028346, "grad_norm": 0.1802015150153324, "learning_rate": 7.4068645221826415e-06, "loss": 0.467, "step": 1038 }, { "epoch": 1.0246487552378605, "grad_norm": 0.17785257353690725, "learning_rate": 7.402329722037993e-06, "loss": 0.464, "step": 1039 }, { "epoch": 1.025634705447375, "grad_norm": 0.17670639846700095, "learning_rate": 7.397792351244607e-06, "loss": 0.461, "step": 1040 }, { "epoch": 1.0266206556568893, "grad_norm": 0.16971386522429677, "learning_rate": 7.393252414657762e-06, "loss": 0.4915, "step": 1041 }, { "epoch": 1.0276066058664037, "grad_norm": 0.21567966408791525, "learning_rate": 7.388709917135489e-06, "loss": 0.4647, "step": 1042 }, { "epoch": 1.0285925560759182, "grad_norm": 0.171778494469584, "learning_rate": 7.3841648635385525e-06, "loss": 0.4772, "step": 1043 }, { "epoch": 1.0295785062854326, "grad_norm": 0.17106503306799054, "learning_rate": 7.379617258730456e-06, "loss": 0.4684, "step": 1044 }, { "epoch": 1.030564456494947, "grad_norm": 0.17303586751910072, "learning_rate": 7.375067107577428e-06, "loss": 0.4829, "step": 1045 }, { "epoch": 1.0315504067044614, "grad_norm": 0.18405366877277182, "learning_rate": 7.370514414948432e-06, "loss": 0.4874, "step": 1046 }, { "epoch": 1.0325363569139758, "grad_norm": 0.1737527019460879, "learning_rate": 7.3659591857151405e-06, "loss": 0.4643, "step": 1047 }, { "epoch": 1.0335223071234902, "grad_norm": 0.38823443343321035, "learning_rate": 7.361401424751945e-06, "loss": 0.47, "step": 1048 }, { "epoch": 1.0345082573330047, "grad_norm": 0.17985440451371562, "learning_rate": 7.356841136935946e-06, "loss": 0.4926, "step": 1049 }, { "epoch": 1.035494207542519, "grad_norm": 0.1655664487500876, "learning_rate": 7.352278327146946e-06, "loss": 0.4631, "step": 1050 }, { "epoch": 1.0364801577520335, "grad_norm": 0.16699537292326075, "learning_rate": 7.347713000267451e-06, "loss": 0.4775, "step": 1051 }, { "epoch": 1.037466107961548, "grad_norm": 0.1653107310751375, "learning_rate": 7.343145161182654e-06, "loss": 0.4486, "step": 1052 }, { "epoch": 1.0384520581710623, "grad_norm": 0.20928144693777923, "learning_rate": 7.338574814780442e-06, "loss": 0.4714, "step": 1053 }, { "epoch": 1.0394380083805768, "grad_norm": 0.16832051275492646, "learning_rate": 7.33400196595138e-06, "loss": 0.4689, "step": 1054 }, { "epoch": 1.0404239585900912, "grad_norm": 0.17097271806660913, "learning_rate": 7.329426619588713e-06, "loss": 0.4737, "step": 1055 }, { "epoch": 1.0414099087996056, "grad_norm": 0.16375357742355764, "learning_rate": 7.324848780588359e-06, "loss": 0.464, "step": 1056 }, { "epoch": 1.04239585900912, "grad_norm": 0.16418865158498658, "learning_rate": 7.3202684538489056e-06, "loss": 0.4629, "step": 1057 }, { "epoch": 1.0433818092186344, "grad_norm": 0.16611259691226854, "learning_rate": 7.315685644271595e-06, "loss": 0.4667, "step": 1058 }, { "epoch": 1.0443677594281489, "grad_norm": 0.17033513470900657, "learning_rate": 7.311100356760334e-06, "loss": 0.473, "step": 1059 }, { "epoch": 1.0453537096376633, "grad_norm": 0.17518713238225492, "learning_rate": 7.306512596221678e-06, "loss": 0.4884, "step": 1060 }, { "epoch": 1.0463396598471777, "grad_norm": 0.20198021631299554, "learning_rate": 7.301922367564828e-06, "loss": 0.4778, "step": 1061 }, { "epoch": 1.0473256100566921, "grad_norm": 0.16412321917522685, "learning_rate": 7.297329675701625e-06, "loss": 0.462, "step": 1062 }, { "epoch": 1.0483115602662065, "grad_norm": 0.17500314542240664, "learning_rate": 7.29273452554655e-06, "loss": 0.4607, "step": 1063 }, { "epoch": 1.049297510475721, "grad_norm": 0.1662387855658783, "learning_rate": 7.28813692201671e-06, "loss": 0.4722, "step": 1064 }, { "epoch": 1.0502834606852354, "grad_norm": 0.1651013643582452, "learning_rate": 7.283536870031841e-06, "loss": 0.455, "step": 1065 }, { "epoch": 1.0512694108947498, "grad_norm": 0.1980382070465155, "learning_rate": 7.278934374514295e-06, "loss": 0.4727, "step": 1066 }, { "epoch": 1.0522553611042642, "grad_norm": 0.21478207995676313, "learning_rate": 7.274329440389043e-06, "loss": 0.4817, "step": 1067 }, { "epoch": 1.0532413113137786, "grad_norm": 0.1669942279540382, "learning_rate": 7.269722072583661e-06, "loss": 0.4667, "step": 1068 }, { "epoch": 1.054227261523293, "grad_norm": 0.1833073183115231, "learning_rate": 7.265112276028334e-06, "loss": 0.4729, "step": 1069 }, { "epoch": 1.0552132117328075, "grad_norm": 0.16101246541844533, "learning_rate": 7.260500055655843e-06, "loss": 0.4605, "step": 1070 }, { "epoch": 1.0561991619423219, "grad_norm": 0.1724004172085386, "learning_rate": 7.255885416401565e-06, "loss": 0.4557, "step": 1071 }, { "epoch": 1.0571851121518363, "grad_norm": 0.17652668859621282, "learning_rate": 7.251268363203458e-06, "loss": 0.4679, "step": 1072 }, { "epoch": 1.0581710623613507, "grad_norm": 0.16265501174831257, "learning_rate": 7.246648901002073e-06, "loss": 0.4623, "step": 1073 }, { "epoch": 1.0591570125708651, "grad_norm": 0.1681511080215305, "learning_rate": 7.242027034740533e-06, "loss": 0.4741, "step": 1074 }, { "epoch": 1.0601429627803796, "grad_norm": 0.16691945200969913, "learning_rate": 7.2374027693645364e-06, "loss": 0.4738, "step": 1075 }, { "epoch": 1.061128912989894, "grad_norm": 0.1753163266766736, "learning_rate": 7.232776109822346e-06, "loss": 0.4749, "step": 1076 }, { "epoch": 1.0621148631994084, "grad_norm": 0.16592219858586765, "learning_rate": 7.2281470610647885e-06, "loss": 0.476, "step": 1077 }, { "epoch": 1.0631008134089228, "grad_norm": 0.168087769220742, "learning_rate": 7.223515628045246e-06, "loss": 0.4617, "step": 1078 }, { "epoch": 1.0640867636184372, "grad_norm": 0.3051422068414839, "learning_rate": 7.218881815719651e-06, "loss": 0.4691, "step": 1079 }, { "epoch": 1.0650727138279517, "grad_norm": 0.16389945371975692, "learning_rate": 7.214245629046488e-06, "loss": 0.4565, "step": 1080 }, { "epoch": 1.066058664037466, "grad_norm": 0.1646261439122491, "learning_rate": 7.209607072986772e-06, "loss": 0.4889, "step": 1081 }, { "epoch": 1.0670446142469805, "grad_norm": 0.5052895776621189, "learning_rate": 7.204966152504064e-06, "loss": 0.4718, "step": 1082 }, { "epoch": 1.068030564456495, "grad_norm": 0.16767373688606013, "learning_rate": 7.200322872564444e-06, "loss": 0.4503, "step": 1083 }, { "epoch": 1.0690165146660093, "grad_norm": 0.17344310017547962, "learning_rate": 7.195677238136532e-06, "loss": 0.4584, "step": 1084 }, { "epoch": 1.0700024648755238, "grad_norm": 0.17012418880271177, "learning_rate": 7.1910292541914505e-06, "loss": 0.4901, "step": 1085 }, { "epoch": 1.0709884150850382, "grad_norm": 0.17520849654487555, "learning_rate": 7.186378925702847e-06, "loss": 0.4632, "step": 1086 }, { "epoch": 1.0719743652945526, "grad_norm": 0.16694682563355986, "learning_rate": 7.181726257646875e-06, "loss": 0.4632, "step": 1087 }, { "epoch": 1.072960315504067, "grad_norm": 0.16679362368659548, "learning_rate": 7.17707125500219e-06, "loss": 0.4531, "step": 1088 }, { "epoch": 1.0739462657135814, "grad_norm": 0.17233737738544838, "learning_rate": 7.172413922749949e-06, "loss": 0.4681, "step": 1089 }, { "epoch": 1.0749322159230958, "grad_norm": 0.17216500444653932, "learning_rate": 7.167754265873799e-06, "loss": 0.457, "step": 1090 }, { "epoch": 1.0759181661326103, "grad_norm": 0.1717603157377839, "learning_rate": 7.163092289359874e-06, "loss": 0.4582, "step": 1091 }, { "epoch": 1.0769041163421247, "grad_norm": 0.17624404758316678, "learning_rate": 7.158427998196794e-06, "loss": 0.4498, "step": 1092 }, { "epoch": 1.077890066551639, "grad_norm": 0.1643558553861484, "learning_rate": 7.15376139737565e-06, "loss": 0.467, "step": 1093 }, { "epoch": 1.0788760167611535, "grad_norm": 0.16438382664576945, "learning_rate": 7.149092491890012e-06, "loss": 0.4725, "step": 1094 }, { "epoch": 1.079861966970668, "grad_norm": 0.16037568324882467, "learning_rate": 7.144421286735907e-06, "loss": 0.4897, "step": 1095 }, { "epoch": 1.0808479171801824, "grad_norm": 0.1736892235376666, "learning_rate": 7.139747786911833e-06, "loss": 0.4555, "step": 1096 }, { "epoch": 1.0818338673896968, "grad_norm": 0.16573370625810585, "learning_rate": 7.135071997418733e-06, "loss": 0.4825, "step": 1097 }, { "epoch": 1.0828198175992112, "grad_norm": 0.17245174768386456, "learning_rate": 7.130393923260008e-06, "loss": 0.4726, "step": 1098 }, { "epoch": 1.0838057678087256, "grad_norm": 0.16248610358495352, "learning_rate": 7.125713569441502e-06, "loss": 0.457, "step": 1099 }, { "epoch": 1.08479171801824, "grad_norm": 0.1645857835183841, "learning_rate": 7.121030940971496e-06, "loss": 0.471, "step": 1100 }, { "epoch": 1.0857776682277545, "grad_norm": 0.17570847262336248, "learning_rate": 7.1163460428607065e-06, "loss": 0.4836, "step": 1101 }, { "epoch": 1.0867636184372689, "grad_norm": 0.16498768564272298, "learning_rate": 7.1116588801222785e-06, "loss": 0.4694, "step": 1102 }, { "epoch": 1.0877495686467833, "grad_norm": 0.16798012630933712, "learning_rate": 7.106969457771782e-06, "loss": 0.465, "step": 1103 }, { "epoch": 1.0887355188562977, "grad_norm": 0.224016566078945, "learning_rate": 7.102277780827198e-06, "loss": 0.4653, "step": 1104 }, { "epoch": 1.0897214690658121, "grad_norm": 0.17016452650065036, "learning_rate": 7.097583854308934e-06, "loss": 0.4875, "step": 1105 }, { "epoch": 1.0907074192753266, "grad_norm": 0.16271342027732072, "learning_rate": 7.092887683239786e-06, "loss": 0.4696, "step": 1106 }, { "epoch": 1.091693369484841, "grad_norm": 0.16688527743939172, "learning_rate": 7.088189272644971e-06, "loss": 0.4622, "step": 1107 }, { "epoch": 1.0926793196943554, "grad_norm": 0.16638349989284135, "learning_rate": 7.083488627552089e-06, "loss": 0.4709, "step": 1108 }, { "epoch": 1.0936652699038698, "grad_norm": 0.16461375486233376, "learning_rate": 7.078785752991134e-06, "loss": 0.4714, "step": 1109 }, { "epoch": 1.0946512201133842, "grad_norm": 0.17605419735885408, "learning_rate": 7.074080653994491e-06, "loss": 0.469, "step": 1110 }, { "epoch": 1.0956371703228986, "grad_norm": 0.16324910009751914, "learning_rate": 7.069373335596918e-06, "loss": 0.466, "step": 1111 }, { "epoch": 1.096623120532413, "grad_norm": 0.2943390226786043, "learning_rate": 7.0646638028355515e-06, "loss": 0.4672, "step": 1112 }, { "epoch": 1.0976090707419275, "grad_norm": 0.16732350138152158, "learning_rate": 7.0599520607499e-06, "loss": 0.4764, "step": 1113 }, { "epoch": 1.098595020951442, "grad_norm": 0.1692670065093426, "learning_rate": 7.0552381143818295e-06, "loss": 0.4645, "step": 1114 }, { "epoch": 1.0995809711609563, "grad_norm": 0.17387257746846765, "learning_rate": 7.050521968775574e-06, "loss": 0.4751, "step": 1115 }, { "epoch": 1.1005669213704707, "grad_norm": 0.16017857772958252, "learning_rate": 7.045803628977708e-06, "loss": 0.473, "step": 1116 }, { "epoch": 1.1015528715799852, "grad_norm": 0.16293349123194426, "learning_rate": 7.041083100037167e-06, "loss": 0.4607, "step": 1117 }, { "epoch": 1.1025388217894996, "grad_norm": 0.17002810017550496, "learning_rate": 7.036360387005223e-06, "loss": 0.4827, "step": 1118 }, { "epoch": 1.103524771999014, "grad_norm": 0.15772025383880783, "learning_rate": 7.031635494935483e-06, "loss": 0.4478, "step": 1119 }, { "epoch": 1.1045107222085284, "grad_norm": 0.16545767237670908, "learning_rate": 7.02690842888389e-06, "loss": 0.4746, "step": 1120 }, { "epoch": 1.1054966724180428, "grad_norm": 0.16915171076158128, "learning_rate": 7.02217919390871e-06, "loss": 0.4788, "step": 1121 }, { "epoch": 1.1064826226275573, "grad_norm": 0.168632485475845, "learning_rate": 7.017447795070533e-06, "loss": 0.4846, "step": 1122 }, { "epoch": 1.1074685728370717, "grad_norm": 0.16577222327335642, "learning_rate": 7.0127142374322634e-06, "loss": 0.4624, "step": 1123 }, { "epoch": 1.108454523046586, "grad_norm": 0.16593246110678658, "learning_rate": 7.007978526059113e-06, "loss": 0.4646, "step": 1124 }, { "epoch": 1.1094404732561005, "grad_norm": 0.1622469903471737, "learning_rate": 7.003240666018602e-06, "loss": 0.4593, "step": 1125 }, { "epoch": 1.110426423465615, "grad_norm": 0.16740371758851988, "learning_rate": 6.998500662380547e-06, "loss": 0.4769, "step": 1126 }, { "epoch": 1.1114123736751294, "grad_norm": 0.16729177647362864, "learning_rate": 6.993758520217059e-06, "loss": 0.4746, "step": 1127 }, { "epoch": 1.1123983238846438, "grad_norm": 0.17152389487000505, "learning_rate": 6.989014244602541e-06, "loss": 0.4617, "step": 1128 }, { "epoch": 1.1133842740941582, "grad_norm": 0.16258445009175337, "learning_rate": 6.984267840613672e-06, "loss": 0.4769, "step": 1129 }, { "epoch": 1.1143702243036726, "grad_norm": 0.17440742675904922, "learning_rate": 6.979519313329417e-06, "loss": 0.4731, "step": 1130 }, { "epoch": 1.115356174513187, "grad_norm": 0.16660447566325526, "learning_rate": 6.974768667831003e-06, "loss": 0.4585, "step": 1131 }, { "epoch": 1.1163421247227014, "grad_norm": 0.16484065818073312, "learning_rate": 6.970015909201933e-06, "loss": 0.46, "step": 1132 }, { "epoch": 1.1173280749322159, "grad_norm": 0.16154224141757775, "learning_rate": 6.965261042527967e-06, "loss": 0.4618, "step": 1133 }, { "epoch": 1.1183140251417303, "grad_norm": 0.17214462447951392, "learning_rate": 6.960504072897119e-06, "loss": 0.4623, "step": 1134 }, { "epoch": 1.1192999753512447, "grad_norm": 0.1725997618736933, "learning_rate": 6.9557450053996545e-06, "loss": 0.4755, "step": 1135 }, { "epoch": 1.1202859255607591, "grad_norm": 0.17276383842533038, "learning_rate": 6.950983845128089e-06, "loss": 0.4781, "step": 1136 }, { "epoch": 1.1212718757702735, "grad_norm": 0.16154453911839164, "learning_rate": 6.946220597177168e-06, "loss": 0.4627, "step": 1137 }, { "epoch": 1.122257825979788, "grad_norm": 0.1703763535245707, "learning_rate": 6.94145526664388e-06, "loss": 0.47, "step": 1138 }, { "epoch": 1.1232437761893024, "grad_norm": 0.28774812472020417, "learning_rate": 6.936687858627435e-06, "loss": 0.4901, "step": 1139 }, { "epoch": 1.1242297263988168, "grad_norm": 0.18344988614290553, "learning_rate": 6.931918378229272e-06, "loss": 0.479, "step": 1140 }, { "epoch": 1.1252156766083312, "grad_norm": 0.17717966653243214, "learning_rate": 6.927146830553042e-06, "loss": 0.4683, "step": 1141 }, { "epoch": 1.1262016268178456, "grad_norm": 0.16444086161857938, "learning_rate": 6.9223732207046135e-06, "loss": 0.4628, "step": 1142 }, { "epoch": 1.12718757702736, "grad_norm": 0.1820239289873006, "learning_rate": 6.917597553792056e-06, "loss": 0.4807, "step": 1143 }, { "epoch": 1.1281735272368745, "grad_norm": 0.17719076702306055, "learning_rate": 6.9128198349256425e-06, "loss": 0.4749, "step": 1144 }, { "epoch": 1.129159477446389, "grad_norm": 0.16908077087864143, "learning_rate": 6.908040069217846e-06, "loss": 0.4782, "step": 1145 }, { "epoch": 1.1301454276559033, "grad_norm": 0.1746598197608833, "learning_rate": 6.903258261783325e-06, "loss": 0.4667, "step": 1146 }, { "epoch": 1.1311313778654177, "grad_norm": 0.17432819355640805, "learning_rate": 6.898474417738921e-06, "loss": 0.4729, "step": 1147 }, { "epoch": 1.1321173280749321, "grad_norm": 0.1793286884959327, "learning_rate": 6.8936885422036605e-06, "loss": 0.4644, "step": 1148 }, { "epoch": 1.1331032782844466, "grad_norm": 0.16360993038795005, "learning_rate": 6.88890064029874e-06, "loss": 0.4765, "step": 1149 }, { "epoch": 1.134089228493961, "grad_norm": 0.21984283899142565, "learning_rate": 6.884110717147524e-06, "loss": 0.4668, "step": 1150 }, { "epoch": 1.1350751787034754, "grad_norm": 0.16589158210647748, "learning_rate": 6.879318777875545e-06, "loss": 0.4542, "step": 1151 }, { "epoch": 1.1360611289129898, "grad_norm": 0.17202803684950171, "learning_rate": 6.874524827610485e-06, "loss": 0.4911, "step": 1152 }, { "epoch": 1.1370470791225042, "grad_norm": 0.18297592552668762, "learning_rate": 6.869728871482185e-06, "loss": 0.4626, "step": 1153 }, { "epoch": 1.1380330293320187, "grad_norm": 0.1714123832013582, "learning_rate": 6.864930914622627e-06, "loss": 0.4708, "step": 1154 }, { "epoch": 1.139018979541533, "grad_norm": 0.16642010529201842, "learning_rate": 6.860130962165937e-06, "loss": 0.4646, "step": 1155 }, { "epoch": 1.1400049297510475, "grad_norm": 0.16755274074404958, "learning_rate": 6.855329019248377e-06, "loss": 0.4543, "step": 1156 }, { "epoch": 1.140990879960562, "grad_norm": 0.17173758815122947, "learning_rate": 6.850525091008337e-06, "loss": 0.4639, "step": 1157 }, { "epoch": 1.1419768301700763, "grad_norm": 0.16844137773935142, "learning_rate": 6.8457191825863305e-06, "loss": 0.4876, "step": 1158 }, { "epoch": 1.1429627803795908, "grad_norm": 0.17557928876543696, "learning_rate": 6.840911299124993e-06, "loss": 0.4696, "step": 1159 }, { "epoch": 1.1439487305891052, "grad_norm": 0.16525897262605324, "learning_rate": 6.83610144576907e-06, "loss": 0.4566, "step": 1160 }, { "epoch": 1.1449346807986196, "grad_norm": 0.16496392440325483, "learning_rate": 6.831289627665418e-06, "loss": 0.4689, "step": 1161 }, { "epoch": 1.145920631008134, "grad_norm": 0.17009314190217278, "learning_rate": 6.8264758499629966e-06, "loss": 0.4886, "step": 1162 }, { "epoch": 1.1469065812176484, "grad_norm": 0.16660570181208462, "learning_rate": 6.82166011781286e-06, "loss": 0.4759, "step": 1163 }, { "epoch": 1.1478925314271629, "grad_norm": 0.16569799034550128, "learning_rate": 6.816842436368152e-06, "loss": 0.4705, "step": 1164 }, { "epoch": 1.1488784816366773, "grad_norm": 0.16786395945049576, "learning_rate": 6.812022810784105e-06, "loss": 0.4797, "step": 1165 }, { "epoch": 1.1498644318461917, "grad_norm": 0.17044498762909724, "learning_rate": 6.807201246218032e-06, "loss": 0.4849, "step": 1166 }, { "epoch": 1.1508503820557061, "grad_norm": 0.16634613228399914, "learning_rate": 6.802377747829317e-06, "loss": 0.4802, "step": 1167 }, { "epoch": 1.1518363322652205, "grad_norm": 0.17795643655428448, "learning_rate": 6.7975523207794225e-06, "loss": 0.4794, "step": 1168 }, { "epoch": 1.152822282474735, "grad_norm": 0.1611271594226689, "learning_rate": 6.792724970231863e-06, "loss": 0.473, "step": 1169 }, { "epoch": 1.1538082326842494, "grad_norm": 0.16386488268030752, "learning_rate": 6.78789570135222e-06, "loss": 0.4656, "step": 1170 }, { "epoch": 1.1547941828937638, "grad_norm": 0.15555865771083044, "learning_rate": 6.783064519308124e-06, "loss": 0.4513, "step": 1171 }, { "epoch": 1.1557801331032782, "grad_norm": 0.16149298292896075, "learning_rate": 6.778231429269254e-06, "loss": 0.4586, "step": 1172 }, { "epoch": 1.1567660833127926, "grad_norm": 0.16270351048734, "learning_rate": 6.773396436407329e-06, "loss": 0.4597, "step": 1173 }, { "epoch": 1.157752033522307, "grad_norm": 0.16464711130538048, "learning_rate": 6.768559545896105e-06, "loss": 0.4592, "step": 1174 }, { "epoch": 1.1587379837318215, "grad_norm": 0.17311614033058068, "learning_rate": 6.763720762911369e-06, "loss": 0.4511, "step": 1175 }, { "epoch": 1.1597239339413359, "grad_norm": 0.16841372538790297, "learning_rate": 6.758880092630935e-06, "loss": 0.4743, "step": 1176 }, { "epoch": 1.1607098841508503, "grad_norm": 0.16587863597246358, "learning_rate": 6.75403754023463e-06, "loss": 0.4593, "step": 1177 }, { "epoch": 1.1616958343603647, "grad_norm": 0.1636018720957146, "learning_rate": 6.749193110904303e-06, "loss": 0.4797, "step": 1178 }, { "epoch": 1.1626817845698791, "grad_norm": 0.1740505271613513, "learning_rate": 6.744346809823807e-06, "loss": 0.4728, "step": 1179 }, { "epoch": 1.1636677347793936, "grad_norm": 0.2693536885187195, "learning_rate": 6.739498642178999e-06, "loss": 0.478, "step": 1180 }, { "epoch": 1.164653684988908, "grad_norm": 0.16577868601241794, "learning_rate": 6.734648613157732e-06, "loss": 0.4624, "step": 1181 }, { "epoch": 1.1656396351984224, "grad_norm": 0.1744766961452551, "learning_rate": 6.729796727949852e-06, "loss": 0.4894, "step": 1182 }, { "epoch": 1.1666255854079368, "grad_norm": 0.16541529119470244, "learning_rate": 6.724942991747191e-06, "loss": 0.4584, "step": 1183 }, { "epoch": 1.1676115356174512, "grad_norm": 0.17414051379949466, "learning_rate": 6.720087409743564e-06, "loss": 0.4521, "step": 1184 }, { "epoch": 1.1685974858269657, "grad_norm": 0.1702068088669824, "learning_rate": 6.715229987134757e-06, "loss": 0.444, "step": 1185 }, { "epoch": 1.16958343603648, "grad_norm": 0.16404748027845822, "learning_rate": 6.710370729118527e-06, "loss": 0.4677, "step": 1186 }, { "epoch": 1.1705693862459945, "grad_norm": 0.16056225532303414, "learning_rate": 6.705509640894597e-06, "loss": 0.46, "step": 1187 }, { "epoch": 1.171555336455509, "grad_norm": 0.16207079229514923, "learning_rate": 6.700646727664647e-06, "loss": 0.467, "step": 1188 }, { "epoch": 1.1725412866650233, "grad_norm": 0.16149954151323434, "learning_rate": 6.695781994632308e-06, "loss": 0.4465, "step": 1189 }, { "epoch": 1.1735272368745377, "grad_norm": 0.16566375036652037, "learning_rate": 6.69091544700316e-06, "loss": 0.4587, "step": 1190 }, { "epoch": 1.1745131870840522, "grad_norm": 0.16197493545476, "learning_rate": 6.686047089984728e-06, "loss": 0.4639, "step": 1191 }, { "epoch": 1.1754991372935666, "grad_norm": 0.17691894653661064, "learning_rate": 6.681176928786467e-06, "loss": 0.4581, "step": 1192 }, { "epoch": 1.176485087503081, "grad_norm": 0.16291898757439355, "learning_rate": 6.6763049686197665e-06, "loss": 0.4419, "step": 1193 }, { "epoch": 1.1774710377125954, "grad_norm": 0.16579761031452891, "learning_rate": 6.671431214697941e-06, "loss": 0.4649, "step": 1194 }, { "epoch": 1.1784569879221098, "grad_norm": 0.16811772025638116, "learning_rate": 6.666555672236222e-06, "loss": 0.4809, "step": 1195 }, { "epoch": 1.1794429381316243, "grad_norm": 0.17605395928097767, "learning_rate": 6.661678346451758e-06, "loss": 0.4801, "step": 1196 }, { "epoch": 1.1804288883411387, "grad_norm": 0.16485846721219174, "learning_rate": 6.656799242563603e-06, "loss": 0.4724, "step": 1197 }, { "epoch": 1.181414838550653, "grad_norm": 0.1631486631870248, "learning_rate": 6.651918365792715e-06, "loss": 0.4678, "step": 1198 }, { "epoch": 1.1824007887601675, "grad_norm": 0.1614771611712231, "learning_rate": 6.647035721361951e-06, "loss": 0.4919, "step": 1199 }, { "epoch": 1.183386738969682, "grad_norm": 0.1652012155178517, "learning_rate": 6.642151314496053e-06, "loss": 0.4557, "step": 1200 }, { "epoch": 1.1843726891791964, "grad_norm": 0.16882925772854435, "learning_rate": 6.637265150421658e-06, "loss": 0.4986, "step": 1201 }, { "epoch": 1.1853586393887108, "grad_norm": 0.1628492998330556, "learning_rate": 6.632377234367276e-06, "loss": 0.4596, "step": 1202 }, { "epoch": 1.1863445895982252, "grad_norm": 0.26227874201540097, "learning_rate": 6.627487571563293e-06, "loss": 0.4763, "step": 1203 }, { "epoch": 1.1873305398077396, "grad_norm": 0.16499219832675732, "learning_rate": 6.622596167241971e-06, "loss": 0.4539, "step": 1204 }, { "epoch": 1.188316490017254, "grad_norm": 0.2066692200887719, "learning_rate": 6.617703026637426e-06, "loss": 0.4734, "step": 1205 }, { "epoch": 1.1893024402267685, "grad_norm": 0.16630659374021256, "learning_rate": 6.612808154985637e-06, "loss": 0.4931, "step": 1206 }, { "epoch": 1.1902883904362829, "grad_norm": 0.17020363980871317, "learning_rate": 6.607911557524434e-06, "loss": 0.4779, "step": 1207 }, { "epoch": 1.1912743406457973, "grad_norm": 0.16697465970372505, "learning_rate": 6.603013239493495e-06, "loss": 0.4625, "step": 1208 }, { "epoch": 1.1922602908553117, "grad_norm": 0.17274524007780673, "learning_rate": 6.598113206134338e-06, "loss": 0.469, "step": 1209 }, { "epoch": 1.1932462410648261, "grad_norm": 0.15882550112893343, "learning_rate": 6.593211462690317e-06, "loss": 0.4672, "step": 1210 }, { "epoch": 1.1942321912743405, "grad_norm": 0.16581349881129095, "learning_rate": 6.5883080144066145e-06, "loss": 0.4793, "step": 1211 }, { "epoch": 1.195218141483855, "grad_norm": 0.1713090589344652, "learning_rate": 6.58340286653024e-06, "loss": 0.4696, "step": 1212 }, { "epoch": 1.1962040916933696, "grad_norm": 0.17539363763708982, "learning_rate": 6.578496024310017e-06, "loss": 0.4795, "step": 1213 }, { "epoch": 1.1971900419028838, "grad_norm": 0.16582363125647054, "learning_rate": 6.573587492996589e-06, "loss": 0.4806, "step": 1214 }, { "epoch": 1.1981759921123984, "grad_norm": 0.1749027122510544, "learning_rate": 6.568677277842401e-06, "loss": 0.459, "step": 1215 }, { "epoch": 1.1991619423219126, "grad_norm": 0.17937901370390394, "learning_rate": 6.563765384101704e-06, "loss": 0.475, "step": 1216 }, { "epoch": 1.2001478925314273, "grad_norm": 0.17045662665449884, "learning_rate": 6.558851817030541e-06, "loss": 0.4719, "step": 1217 }, { "epoch": 1.2011338427409415, "grad_norm": 0.1629843929250449, "learning_rate": 6.5539365818867474e-06, "loss": 0.4572, "step": 1218 }, { "epoch": 1.2021197929504561, "grad_norm": 0.16442995828161608, "learning_rate": 6.549019683929945e-06, "loss": 0.4921, "step": 1219 }, { "epoch": 1.2031057431599703, "grad_norm": 0.16487269651575245, "learning_rate": 6.544101128421534e-06, "loss": 0.4748, "step": 1220 }, { "epoch": 1.204091693369485, "grad_norm": 0.16386476223301738, "learning_rate": 6.539180920624687e-06, "loss": 0.478, "step": 1221 }, { "epoch": 1.2050776435789992, "grad_norm": 0.163091810991172, "learning_rate": 6.534259065804348e-06, "loss": 0.4411, "step": 1222 }, { "epoch": 1.2060635937885138, "grad_norm": 0.16456812390268993, "learning_rate": 6.5293355692272175e-06, "loss": 0.4639, "step": 1223 }, { "epoch": 1.207049543998028, "grad_norm": 0.1623734530086265, "learning_rate": 6.52441043616176e-06, "loss": 0.4813, "step": 1224 }, { "epoch": 1.2080354942075426, "grad_norm": 0.19053264204778192, "learning_rate": 6.519483671878184e-06, "loss": 0.481, "step": 1225 }, { "epoch": 1.2090214444170568, "grad_norm": 0.16596042008577788, "learning_rate": 6.514555281648451e-06, "loss": 0.4675, "step": 1226 }, { "epoch": 1.2100073946265715, "grad_norm": 0.18419164498767882, "learning_rate": 6.509625270746256e-06, "loss": 0.4763, "step": 1227 }, { "epoch": 1.2109933448360857, "grad_norm": 0.16272191942149414, "learning_rate": 6.504693644447031e-06, "loss": 0.4494, "step": 1228 }, { "epoch": 1.2119792950456003, "grad_norm": 0.16270551892164925, "learning_rate": 6.499760408027936e-06, "loss": 0.4598, "step": 1229 }, { "epoch": 1.2129652452551145, "grad_norm": 0.1847597471968673, "learning_rate": 6.494825566767855e-06, "loss": 0.4681, "step": 1230 }, { "epoch": 1.2139511954646292, "grad_norm": 0.17468692693829316, "learning_rate": 6.489889125947388e-06, "loss": 0.4928, "step": 1231 }, { "epoch": 1.2149371456741433, "grad_norm": 0.1713936637052054, "learning_rate": 6.484951090848848e-06, "loss": 0.4722, "step": 1232 }, { "epoch": 1.215923095883658, "grad_norm": 0.1610173653813777, "learning_rate": 6.480011466756251e-06, "loss": 0.4705, "step": 1233 }, { "epoch": 1.2169090460931722, "grad_norm": 0.16242661303668252, "learning_rate": 6.475070258955317e-06, "loss": 0.473, "step": 1234 }, { "epoch": 1.2178949963026868, "grad_norm": 0.17018172840938867, "learning_rate": 6.470127472733459e-06, "loss": 0.4763, "step": 1235 }, { "epoch": 1.218880946512201, "grad_norm": 0.16686134054815724, "learning_rate": 6.465183113379778e-06, "loss": 0.4659, "step": 1236 }, { "epoch": 1.2198668967217157, "grad_norm": 0.16195163014916636, "learning_rate": 6.4602371861850636e-06, "loss": 0.461, "step": 1237 }, { "epoch": 1.2208528469312299, "grad_norm": 0.1677888366893549, "learning_rate": 6.455289696441772e-06, "loss": 0.4503, "step": 1238 }, { "epoch": 1.2218387971407445, "grad_norm": 0.16238862790861802, "learning_rate": 6.450340649444045e-06, "loss": 0.454, "step": 1239 }, { "epoch": 1.2228247473502587, "grad_norm": 0.16927755272637426, "learning_rate": 6.445390050487678e-06, "loss": 0.4545, "step": 1240 }, { "epoch": 1.2238106975597733, "grad_norm": 0.1772015943207728, "learning_rate": 6.440437904870138e-06, "loss": 0.477, "step": 1241 }, { "epoch": 1.2247966477692875, "grad_norm": 0.163709112796267, "learning_rate": 6.435484217890539e-06, "loss": 0.4788, "step": 1242 }, { "epoch": 1.2257825979788022, "grad_norm": 0.16481737010694317, "learning_rate": 6.430528994849652e-06, "loss": 0.4717, "step": 1243 }, { "epoch": 1.2267685481883164, "grad_norm": 0.16548405260129903, "learning_rate": 6.425572241049883e-06, "loss": 0.4618, "step": 1244 }, { "epoch": 1.227754498397831, "grad_norm": 0.16990397077693506, "learning_rate": 6.420613961795284e-06, "loss": 0.4742, "step": 1245 }, { "epoch": 1.2287404486073452, "grad_norm": 0.16030382598931203, "learning_rate": 6.415654162391529e-06, "loss": 0.4625, "step": 1246 }, { "epoch": 1.2297263988168599, "grad_norm": 0.1674252777794334, "learning_rate": 6.410692848145934e-06, "loss": 0.4837, "step": 1247 }, { "epoch": 1.230712349026374, "grad_norm": 0.16915053152864798, "learning_rate": 6.40573002436742e-06, "loss": 0.4801, "step": 1248 }, { "epoch": 1.2316982992358887, "grad_norm": 0.16523840131205747, "learning_rate": 6.4007656963665356e-06, "loss": 0.4615, "step": 1249 }, { "epoch": 1.232684249445403, "grad_norm": 0.15743869879284614, "learning_rate": 6.395799869455433e-06, "loss": 0.4577, "step": 1250 }, { "epoch": 1.2336701996549175, "grad_norm": 0.15733935106047514, "learning_rate": 6.390832548947866e-06, "loss": 0.4599, "step": 1251 }, { "epoch": 1.2346561498644317, "grad_norm": 0.1773091500925943, "learning_rate": 6.385863740159194e-06, "loss": 0.473, "step": 1252 }, { "epoch": 1.2356421000739464, "grad_norm": 0.17114851438291118, "learning_rate": 6.3808934484063625e-06, "loss": 0.4749, "step": 1253 }, { "epoch": 1.2366280502834606, "grad_norm": 0.16869839701286612, "learning_rate": 6.3759216790079085e-06, "loss": 0.4648, "step": 1254 }, { "epoch": 1.2376140004929752, "grad_norm": 0.16990904986951733, "learning_rate": 6.370948437283944e-06, "loss": 0.4629, "step": 1255 }, { "epoch": 1.2385999507024894, "grad_norm": 0.17412698200049834, "learning_rate": 6.365973728556164e-06, "loss": 0.4851, "step": 1256 }, { "epoch": 1.239585900912004, "grad_norm": 0.16591032990859195, "learning_rate": 6.36099755814783e-06, "loss": 0.4242, "step": 1257 }, { "epoch": 1.2405718511215182, "grad_norm": 0.157269877576396, "learning_rate": 6.3560199313837646e-06, "loss": 0.4757, "step": 1258 }, { "epoch": 1.2415578013310329, "grad_norm": 0.18554747819079026, "learning_rate": 6.351040853590354e-06, "loss": 0.4481, "step": 1259 }, { "epoch": 1.242543751540547, "grad_norm": 0.1648461295752464, "learning_rate": 6.3460603300955334e-06, "loss": 0.4565, "step": 1260 }, { "epoch": 1.2435297017500617, "grad_norm": 0.18926894216889145, "learning_rate": 6.341078366228786e-06, "loss": 0.4725, "step": 1261 }, { "epoch": 1.244515651959576, "grad_norm": 0.16195265923026017, "learning_rate": 6.336094967321138e-06, "loss": 0.4685, "step": 1262 }, { "epoch": 1.2455016021690906, "grad_norm": 0.1656917211614036, "learning_rate": 6.331110138705148e-06, "loss": 0.4779, "step": 1263 }, { "epoch": 1.2464875523786048, "grad_norm": 0.16142209518173187, "learning_rate": 6.326123885714907e-06, "loss": 0.5116, "step": 1264 }, { "epoch": 1.2474735025881194, "grad_norm": 0.31997566157907265, "learning_rate": 6.32113621368603e-06, "loss": 0.4564, "step": 1265 }, { "epoch": 1.2484594527976336, "grad_norm": 0.17730394323574275, "learning_rate": 6.316147127955649e-06, "loss": 0.4687, "step": 1266 }, { "epoch": 1.2494454030071482, "grad_norm": 0.1620764753291925, "learning_rate": 6.3111566338624095e-06, "loss": 0.4594, "step": 1267 }, { "epoch": 1.2504313532166624, "grad_norm": 0.1678496281527768, "learning_rate": 6.306164736746464e-06, "loss": 0.47, "step": 1268 }, { "epoch": 1.251417303426177, "grad_norm": 0.16089754499247352, "learning_rate": 6.3011714419494655e-06, "loss": 0.4762, "step": 1269 }, { "epoch": 1.2524032536356913, "grad_norm": 0.1672739021913524, "learning_rate": 6.296176754814567e-06, "loss": 0.4712, "step": 1270 }, { "epoch": 1.253389203845206, "grad_norm": 0.16876494261501873, "learning_rate": 6.291180680686404e-06, "loss": 0.487, "step": 1271 }, { "epoch": 1.25437515405472, "grad_norm": 0.16771770316083348, "learning_rate": 6.2861832249111036e-06, "loss": 0.4762, "step": 1272 }, { "epoch": 1.2553611042642348, "grad_norm": 0.16924318609516403, "learning_rate": 6.281184392836265e-06, "loss": 0.4739, "step": 1273 }, { "epoch": 1.256347054473749, "grad_norm": 0.1619535812048183, "learning_rate": 6.276184189810964e-06, "loss": 0.4793, "step": 1274 }, { "epoch": 1.2573330046832636, "grad_norm": 0.17080622465645712, "learning_rate": 6.271182621185743e-06, "loss": 0.4599, "step": 1275 }, { "epoch": 1.2583189548927778, "grad_norm": 0.16123342241345748, "learning_rate": 6.266179692312604e-06, "loss": 0.4461, "step": 1276 }, { "epoch": 1.2593049051022924, "grad_norm": 0.16737822044918219, "learning_rate": 6.261175408545007e-06, "loss": 0.4578, "step": 1277 }, { "epoch": 1.2602908553118066, "grad_norm": 0.1703890577074199, "learning_rate": 6.256169775237858e-06, "loss": 0.4596, "step": 1278 }, { "epoch": 1.2612768055213213, "grad_norm": 0.16742321142966837, "learning_rate": 6.251162797747513e-06, "loss": 0.4679, "step": 1279 }, { "epoch": 1.2622627557308355, "grad_norm": 0.1815832800104829, "learning_rate": 6.246154481431761e-06, "loss": 0.4718, "step": 1280 }, { "epoch": 1.26324870594035, "grad_norm": 0.16788093578994845, "learning_rate": 6.241144831649825e-06, "loss": 0.4535, "step": 1281 }, { "epoch": 1.2642346561498643, "grad_norm": 0.15990959895353038, "learning_rate": 6.236133853762356e-06, "loss": 0.4618, "step": 1282 }, { "epoch": 1.265220606359379, "grad_norm": 0.16402331148916255, "learning_rate": 6.2311215531314266e-06, "loss": 0.4655, "step": 1283 }, { "epoch": 1.2662065565688931, "grad_norm": 0.1852919368904106, "learning_rate": 6.226107935120521e-06, "loss": 0.4718, "step": 1284 }, { "epoch": 1.2671925067784078, "grad_norm": 0.16327620865295686, "learning_rate": 6.22109300509454e-06, "loss": 0.4347, "step": 1285 }, { "epoch": 1.268178456987922, "grad_norm": 0.15772623902003347, "learning_rate": 6.216076768419782e-06, "loss": 0.4558, "step": 1286 }, { "epoch": 1.2691644071974366, "grad_norm": 0.15816908510441025, "learning_rate": 6.2110592304639465e-06, "loss": 0.4716, "step": 1287 }, { "epoch": 1.2701503574069508, "grad_norm": 0.16483487579973904, "learning_rate": 6.206040396596122e-06, "loss": 0.4733, "step": 1288 }, { "epoch": 1.2711363076164655, "grad_norm": 0.16045450842226178, "learning_rate": 6.2010202721867905e-06, "loss": 0.4708, "step": 1289 }, { "epoch": 1.2721222578259797, "grad_norm": 0.16640850424068243, "learning_rate": 6.195998862607808e-06, "loss": 0.4723, "step": 1290 }, { "epoch": 1.2731082080354943, "grad_norm": 0.16315540706301973, "learning_rate": 6.190976173232411e-06, "loss": 0.4691, "step": 1291 }, { "epoch": 1.2740941582450085, "grad_norm": 0.16461112623105786, "learning_rate": 6.185952209435202e-06, "loss": 0.448, "step": 1292 }, { "epoch": 1.2750801084545231, "grad_norm": 0.1612932926419046, "learning_rate": 6.180926976592149e-06, "loss": 0.4797, "step": 1293 }, { "epoch": 1.2760660586640373, "grad_norm": 0.16633090978817794, "learning_rate": 6.1759004800805745e-06, "loss": 0.4737, "step": 1294 }, { "epoch": 1.277052008873552, "grad_norm": 0.1668446080528459, "learning_rate": 6.17087272527916e-06, "loss": 0.4701, "step": 1295 }, { "epoch": 1.2780379590830664, "grad_norm": 0.16724989223783207, "learning_rate": 6.165843717567928e-06, "loss": 0.4626, "step": 1296 }, { "epoch": 1.2790239092925808, "grad_norm": 0.16331114452795373, "learning_rate": 6.160813462328243e-06, "loss": 0.4525, "step": 1297 }, { "epoch": 1.2800098595020952, "grad_norm": 0.1657302360887789, "learning_rate": 6.155781964942805e-06, "loss": 0.4708, "step": 1298 }, { "epoch": 1.2809958097116096, "grad_norm": 0.16514305117587627, "learning_rate": 6.15074923079564e-06, "loss": 0.474, "step": 1299 }, { "epoch": 1.281981759921124, "grad_norm": 0.16147707778750775, "learning_rate": 6.145715265272106e-06, "loss": 0.4713, "step": 1300 }, { "epoch": 1.2829677101306385, "grad_norm": 0.16138878314330543, "learning_rate": 6.140680073758868e-06, "loss": 0.4553, "step": 1301 }, { "epoch": 1.283953660340153, "grad_norm": 0.17315028127761717, "learning_rate": 6.135643661643909e-06, "loss": 0.4495, "step": 1302 }, { "epoch": 1.2849396105496673, "grad_norm": 0.1671419705848458, "learning_rate": 6.1306060343165175e-06, "loss": 0.4578, "step": 1303 }, { "epoch": 1.2859255607591817, "grad_norm": 0.16664147031192345, "learning_rate": 6.125567197167281e-06, "loss": 0.4685, "step": 1304 }, { "epoch": 1.2869115109686962, "grad_norm": 0.16788815671047838, "learning_rate": 6.120527155588084e-06, "loss": 0.4906, "step": 1305 }, { "epoch": 1.2878974611782106, "grad_norm": 0.1666309231159149, "learning_rate": 6.115485914972096e-06, "loss": 0.4625, "step": 1306 }, { "epoch": 1.288883411387725, "grad_norm": 0.16012722394551646, "learning_rate": 6.110443480713771e-06, "loss": 0.468, "step": 1307 }, { "epoch": 1.2898693615972394, "grad_norm": 0.15592975147241056, "learning_rate": 6.1053998582088454e-06, "loss": 0.4437, "step": 1308 }, { "epoch": 1.2908553118067538, "grad_norm": 0.16046644072262936, "learning_rate": 6.1003550528543175e-06, "loss": 0.4592, "step": 1309 }, { "epoch": 1.2918412620162683, "grad_norm": 0.1666441859359538, "learning_rate": 6.0953090700484604e-06, "loss": 0.4716, "step": 1310 }, { "epoch": 1.2928272122257827, "grad_norm": 0.16569836512630953, "learning_rate": 6.0902619151908e-06, "loss": 0.4803, "step": 1311 }, { "epoch": 1.293813162435297, "grad_norm": 0.19211531217262628, "learning_rate": 6.085213593682122e-06, "loss": 0.4692, "step": 1312 }, { "epoch": 1.2947991126448115, "grad_norm": 0.163248231523702, "learning_rate": 6.080164110924458e-06, "loss": 0.4651, "step": 1313 }, { "epoch": 1.295785062854326, "grad_norm": 0.17678443752305695, "learning_rate": 6.07511347232108e-06, "loss": 0.4803, "step": 1314 }, { "epoch": 1.2967710130638403, "grad_norm": 0.17048543008252076, "learning_rate": 6.070061683276503e-06, "loss": 0.472, "step": 1315 }, { "epoch": 1.2977569632733548, "grad_norm": 0.16168795263522812, "learning_rate": 6.065008749196465e-06, "loss": 0.4664, "step": 1316 }, { "epoch": 1.2987429134828692, "grad_norm": 0.16696176979258565, "learning_rate": 6.0599546754879355e-06, "loss": 0.4544, "step": 1317 }, { "epoch": 1.2997288636923836, "grad_norm": 0.1665789392319412, "learning_rate": 6.054899467559101e-06, "loss": 0.4628, "step": 1318 }, { "epoch": 1.300714813901898, "grad_norm": 0.16431155738959483, "learning_rate": 6.049843130819364e-06, "loss": 0.4764, "step": 1319 }, { "epoch": 1.3017007641114124, "grad_norm": 0.1631417297949849, "learning_rate": 6.044785670679331e-06, "loss": 0.4616, "step": 1320 }, { "epoch": 1.3026867143209269, "grad_norm": 0.16076143236596774, "learning_rate": 6.039727092550812e-06, "loss": 0.4719, "step": 1321 }, { "epoch": 1.3036726645304413, "grad_norm": 0.1604735184866197, "learning_rate": 6.034667401846815e-06, "loss": 0.456, "step": 1322 }, { "epoch": 1.3046586147399557, "grad_norm": 0.1699803801824112, "learning_rate": 6.02960660398154e-06, "loss": 0.4766, "step": 1323 }, { "epoch": 1.3056445649494701, "grad_norm": 0.1693299534355223, "learning_rate": 6.024544704370364e-06, "loss": 0.4537, "step": 1324 }, { "epoch": 1.3066305151589845, "grad_norm": 0.16625676705170947, "learning_rate": 6.019481708429853e-06, "loss": 0.4947, "step": 1325 }, { "epoch": 1.307616465368499, "grad_norm": 0.1897305463235135, "learning_rate": 6.014417621577737e-06, "loss": 0.4647, "step": 1326 }, { "epoch": 1.3086024155780134, "grad_norm": 0.16692803122511632, "learning_rate": 6.0093524492329216e-06, "loss": 0.4721, "step": 1327 }, { "epoch": 1.3095883657875278, "grad_norm": 0.20823083410362914, "learning_rate": 6.004286196815467e-06, "loss": 0.4639, "step": 1328 }, { "epoch": 1.3105743159970422, "grad_norm": 0.1673197758356401, "learning_rate": 5.999218869746595e-06, "loss": 0.486, "step": 1329 }, { "epoch": 1.3115602662065566, "grad_norm": 0.15998251611718828, "learning_rate": 5.994150473448672e-06, "loss": 0.4551, "step": 1330 }, { "epoch": 1.312546216416071, "grad_norm": 0.1578637250927203, "learning_rate": 5.989081013345211e-06, "loss": 0.4536, "step": 1331 }, { "epoch": 1.3135321666255855, "grad_norm": 0.16366294834189274, "learning_rate": 5.984010494860865e-06, "loss": 0.487, "step": 1332 }, { "epoch": 1.3145181168351, "grad_norm": 0.1651012646941456, "learning_rate": 5.978938923421418e-06, "loss": 0.474, "step": 1333 }, { "epoch": 1.3155040670446143, "grad_norm": 0.1721464729952915, "learning_rate": 5.973866304453778e-06, "loss": 0.4831, "step": 1334 }, { "epoch": 1.3164900172541287, "grad_norm": 0.17333625543229284, "learning_rate": 5.9687926433859785e-06, "loss": 0.4833, "step": 1335 }, { "epoch": 1.3174759674636431, "grad_norm": 0.16199175748787464, "learning_rate": 5.963717945647167e-06, "loss": 0.4589, "step": 1336 }, { "epoch": 1.3184619176731576, "grad_norm": 0.1633274934497099, "learning_rate": 5.958642216667598e-06, "loss": 0.4635, "step": 1337 }, { "epoch": 1.319447867882672, "grad_norm": 0.1647858407826564, "learning_rate": 5.953565461878633e-06, "loss": 0.4511, "step": 1338 }, { "epoch": 1.3204338180921864, "grad_norm": 0.16666391657248938, "learning_rate": 5.948487686712725e-06, "loss": 0.4759, "step": 1339 }, { "epoch": 1.3214197683017008, "grad_norm": 0.1608303742929351, "learning_rate": 5.943408896603428e-06, "loss": 0.4565, "step": 1340 }, { "epoch": 1.3224057185112152, "grad_norm": 0.16619810102543203, "learning_rate": 5.938329096985374e-06, "loss": 0.4877, "step": 1341 }, { "epoch": 1.3233916687207297, "grad_norm": 0.16861142230966175, "learning_rate": 5.933248293294278e-06, "loss": 0.4817, "step": 1342 }, { "epoch": 1.324377618930244, "grad_norm": 0.1613311991031562, "learning_rate": 5.928166490966933e-06, "loss": 0.4658, "step": 1343 }, { "epoch": 1.3253635691397585, "grad_norm": 0.16613527042960732, "learning_rate": 5.923083695441193e-06, "loss": 0.4797, "step": 1344 }, { "epoch": 1.326349519349273, "grad_norm": 0.17375547190336268, "learning_rate": 5.9179999121559816e-06, "loss": 0.4644, "step": 1345 }, { "epoch": 1.3273354695587873, "grad_norm": 0.16483341515761354, "learning_rate": 5.912915146551278e-06, "loss": 0.4826, "step": 1346 }, { "epoch": 1.3283214197683018, "grad_norm": 0.1645542396832144, "learning_rate": 5.907829404068108e-06, "loss": 0.468, "step": 1347 }, { "epoch": 1.3293073699778162, "grad_norm": 0.1671629788898561, "learning_rate": 5.902742690148551e-06, "loss": 0.486, "step": 1348 }, { "epoch": 1.3302933201873306, "grad_norm": 0.16099037162037994, "learning_rate": 5.897655010235715e-06, "loss": 0.4563, "step": 1349 }, { "epoch": 1.331279270396845, "grad_norm": 0.18211649634751956, "learning_rate": 5.892566369773753e-06, "loss": 0.4551, "step": 1350 }, { "epoch": 1.3322652206063594, "grad_norm": 0.16220967017743546, "learning_rate": 5.887476774207839e-06, "loss": 0.4642, "step": 1351 }, { "epoch": 1.3332511708158739, "grad_norm": 0.19336714192891388, "learning_rate": 5.88238622898417e-06, "loss": 0.457, "step": 1352 }, { "epoch": 1.3342371210253883, "grad_norm": 0.1625183100395215, "learning_rate": 5.87729473954996e-06, "loss": 0.4542, "step": 1353 }, { "epoch": 1.3352230712349027, "grad_norm": 0.16533821732733284, "learning_rate": 5.872202311353433e-06, "loss": 0.4669, "step": 1354 }, { "epoch": 1.336209021444417, "grad_norm": 0.16833781950905896, "learning_rate": 5.867108949843817e-06, "loss": 0.4893, "step": 1355 }, { "epoch": 1.3371949716539315, "grad_norm": 0.16855094056463374, "learning_rate": 5.8620146604713435e-06, "loss": 0.4774, "step": 1356 }, { "epoch": 1.338180921863446, "grad_norm": 0.16556996608166277, "learning_rate": 5.856919448687226e-06, "loss": 0.4715, "step": 1357 }, { "epoch": 1.3391668720729604, "grad_norm": 0.16669043040756595, "learning_rate": 5.851823319943678e-06, "loss": 0.4788, "step": 1358 }, { "epoch": 1.3401528222824748, "grad_norm": 0.16574750712062733, "learning_rate": 5.846726279693885e-06, "loss": 0.4704, "step": 1359 }, { "epoch": 1.3411387724919892, "grad_norm": 0.17279517228358982, "learning_rate": 5.841628333392011e-06, "loss": 0.472, "step": 1360 }, { "epoch": 1.3421247227015036, "grad_norm": 0.1630485863540864, "learning_rate": 5.836529486493191e-06, "loss": 0.4632, "step": 1361 }, { "epoch": 1.343110672911018, "grad_norm": 0.1633776581966279, "learning_rate": 5.831429744453519e-06, "loss": 0.4771, "step": 1362 }, { "epoch": 1.3440966231205325, "grad_norm": 0.1666344556896262, "learning_rate": 5.826329112730056e-06, "loss": 0.4446, "step": 1363 }, { "epoch": 1.3450825733300469, "grad_norm": 0.16326008063661362, "learning_rate": 5.821227596780802e-06, "loss": 0.4713, "step": 1364 }, { "epoch": 1.3460685235395613, "grad_norm": 0.16760883043753336, "learning_rate": 5.816125202064714e-06, "loss": 0.4734, "step": 1365 }, { "epoch": 1.3470544737490757, "grad_norm": 0.16263318856999043, "learning_rate": 5.811021934041685e-06, "loss": 0.4733, "step": 1366 }, { "epoch": 1.3480404239585901, "grad_norm": 0.16694734396059974, "learning_rate": 5.805917798172543e-06, "loss": 0.4625, "step": 1367 }, { "epoch": 1.3490263741681046, "grad_norm": 0.15700598417509207, "learning_rate": 5.800812799919046e-06, "loss": 0.4603, "step": 1368 }, { "epoch": 1.350012324377619, "grad_norm": 0.1640378078382763, "learning_rate": 5.795706944743871e-06, "loss": 0.4681, "step": 1369 }, { "epoch": 1.3509982745871334, "grad_norm": 0.16548375785466893, "learning_rate": 5.790600238110614e-06, "loss": 0.4625, "step": 1370 }, { "epoch": 1.3519842247966478, "grad_norm": 0.16307590431256164, "learning_rate": 5.785492685483787e-06, "loss": 0.4484, "step": 1371 }, { "epoch": 1.3529701750061622, "grad_norm": 0.16742781126574774, "learning_rate": 5.780384292328798e-06, "loss": 0.4803, "step": 1372 }, { "epoch": 1.3539561252156767, "grad_norm": 0.1613501885215972, "learning_rate": 5.775275064111962e-06, "loss": 0.4604, "step": 1373 }, { "epoch": 1.354942075425191, "grad_norm": 0.19920118243153734, "learning_rate": 5.770165006300485e-06, "loss": 0.4681, "step": 1374 }, { "epoch": 1.3559280256347055, "grad_norm": 0.1594088819002285, "learning_rate": 5.765054124362458e-06, "loss": 0.4678, "step": 1375 }, { "epoch": 1.35691397584422, "grad_norm": 0.1700686727742075, "learning_rate": 5.759942423766859e-06, "loss": 0.4842, "step": 1376 }, { "epoch": 1.3578999260537343, "grad_norm": 0.1655330450181918, "learning_rate": 5.754829909983539e-06, "loss": 0.4704, "step": 1377 }, { "epoch": 1.3588858762632487, "grad_norm": 0.16107473023925153, "learning_rate": 5.7497165884832185e-06, "loss": 0.4619, "step": 1378 }, { "epoch": 1.3598718264727632, "grad_norm": 0.16500048672225306, "learning_rate": 5.744602464737484e-06, "loss": 0.4755, "step": 1379 }, { "epoch": 1.3608577766822776, "grad_norm": 0.20843416692703762, "learning_rate": 5.739487544218779e-06, "loss": 0.4523, "step": 1380 }, { "epoch": 1.361843726891792, "grad_norm": 0.16443032992627848, "learning_rate": 5.734371832400403e-06, "loss": 0.4784, "step": 1381 }, { "epoch": 1.3628296771013064, "grad_norm": 0.17828470247210604, "learning_rate": 5.729255334756497e-06, "loss": 0.4719, "step": 1382 }, { "epoch": 1.3638156273108208, "grad_norm": 0.1649213056572908, "learning_rate": 5.7241380567620475e-06, "loss": 0.4643, "step": 1383 }, { "epoch": 1.3648015775203353, "grad_norm": 0.1633401231658992, "learning_rate": 5.719020003892873e-06, "loss": 0.4676, "step": 1384 }, { "epoch": 1.3657875277298497, "grad_norm": 0.16414710207524366, "learning_rate": 5.7139011816256215e-06, "loss": 0.4531, "step": 1385 }, { "epoch": 1.366773477939364, "grad_norm": 0.1861612278327007, "learning_rate": 5.708781595437769e-06, "loss": 0.4669, "step": 1386 }, { "epoch": 1.3677594281488785, "grad_norm": 0.16442997293411388, "learning_rate": 5.703661250807599e-06, "loss": 0.4622, "step": 1387 }, { "epoch": 1.368745378358393, "grad_norm": 0.16661472250606849, "learning_rate": 5.698540153214218e-06, "loss": 0.4687, "step": 1388 }, { "epoch": 1.3697313285679074, "grad_norm": 0.16345551302490932, "learning_rate": 5.69341830813753e-06, "loss": 0.475, "step": 1389 }, { "epoch": 1.3707172787774218, "grad_norm": 0.15839635099473848, "learning_rate": 5.688295721058242e-06, "loss": 0.4632, "step": 1390 }, { "epoch": 1.3717032289869362, "grad_norm": 0.1622382829800274, "learning_rate": 5.683172397457856e-06, "loss": 0.4537, "step": 1391 }, { "epoch": 1.3726891791964506, "grad_norm": 0.18459014148936892, "learning_rate": 5.678048342818658e-06, "loss": 0.4481, "step": 1392 }, { "epoch": 1.373675129405965, "grad_norm": 0.1697886021302356, "learning_rate": 5.672923562623722e-06, "loss": 0.4617, "step": 1393 }, { "epoch": 1.3746610796154795, "grad_norm": 0.1645550056439435, "learning_rate": 5.667798062356895e-06, "loss": 0.4577, "step": 1394 }, { "epoch": 1.3756470298249939, "grad_norm": 0.17143846073074367, "learning_rate": 5.662671847502793e-06, "loss": 0.4523, "step": 1395 }, { "epoch": 1.3766329800345083, "grad_norm": 0.16344128759827684, "learning_rate": 5.657544923546803e-06, "loss": 0.4666, "step": 1396 }, { "epoch": 1.3776189302440227, "grad_norm": 0.16425518034491454, "learning_rate": 5.65241729597506e-06, "loss": 0.4638, "step": 1397 }, { "epoch": 1.3786048804535371, "grad_norm": 0.16534170302161283, "learning_rate": 5.647288970274463e-06, "loss": 0.4511, "step": 1398 }, { "epoch": 1.3795908306630515, "grad_norm": 0.1617323018590922, "learning_rate": 5.642159951932652e-06, "loss": 0.4702, "step": 1399 }, { "epoch": 1.380576780872566, "grad_norm": 0.16083214099638724, "learning_rate": 5.63703024643801e-06, "loss": 0.4828, "step": 1400 }, { "epoch": 1.3815627310820804, "grad_norm": 0.1720696143022807, "learning_rate": 5.631899859279654e-06, "loss": 0.475, "step": 1401 }, { "epoch": 1.3825486812915948, "grad_norm": 0.17435364748236737, "learning_rate": 5.626768795947432e-06, "loss": 0.4802, "step": 1402 }, { "epoch": 1.3835346315011092, "grad_norm": 0.16437688195121364, "learning_rate": 5.6216370619319134e-06, "loss": 0.4609, "step": 1403 }, { "epoch": 1.3845205817106236, "grad_norm": 0.16548132070839128, "learning_rate": 5.61650466272439e-06, "loss": 0.484, "step": 1404 }, { "epoch": 1.385506531920138, "grad_norm": 0.1549013795519419, "learning_rate": 5.61137160381686e-06, "loss": 0.4506, "step": 1405 }, { "epoch": 1.3864924821296525, "grad_norm": 0.16939320130209762, "learning_rate": 5.606237890702028e-06, "loss": 0.4718, "step": 1406 }, { "epoch": 1.387478432339167, "grad_norm": 0.17051233592675338, "learning_rate": 5.601103528873304e-06, "loss": 0.4367, "step": 1407 }, { "epoch": 1.3884643825486813, "grad_norm": 0.15943541732452363, "learning_rate": 5.595968523824784e-06, "loss": 0.4704, "step": 1408 }, { "epoch": 1.3894503327581957, "grad_norm": 0.16329645709213123, "learning_rate": 5.590832881051262e-06, "loss": 0.4768, "step": 1409 }, { "epoch": 1.3904362829677102, "grad_norm": 0.15557702222562347, "learning_rate": 5.5856966060482024e-06, "loss": 0.4647, "step": 1410 }, { "epoch": 1.3914222331772246, "grad_norm": 0.1561786248276199, "learning_rate": 5.58055970431176e-06, "loss": 0.4519, "step": 1411 }, { "epoch": 1.392408183386739, "grad_norm": 0.16384753935726618, "learning_rate": 5.575422181338748e-06, "loss": 0.4632, "step": 1412 }, { "epoch": 1.3933941335962534, "grad_norm": 0.15968604821114685, "learning_rate": 5.570284042626651e-06, "loss": 0.4497, "step": 1413 }, { "epoch": 1.3943800838057678, "grad_norm": 0.16620749209474933, "learning_rate": 5.565145293673612e-06, "loss": 0.4656, "step": 1414 }, { "epoch": 1.3953660340152823, "grad_norm": 0.1669612046116747, "learning_rate": 5.5600059399784245e-06, "loss": 0.4662, "step": 1415 }, { "epoch": 1.3963519842247967, "grad_norm": 0.17082902317335172, "learning_rate": 5.554865987040532e-06, "loss": 0.468, "step": 1416 }, { "epoch": 1.397337934434311, "grad_norm": 0.15940364329031975, "learning_rate": 5.549725440360016e-06, "loss": 0.4485, "step": 1417 }, { "epoch": 1.3983238846438255, "grad_norm": 0.16212216172085825, "learning_rate": 5.5445843054375945e-06, "loss": 0.4583, "step": 1418 }, { "epoch": 1.39930983485334, "grad_norm": 0.163355919009208, "learning_rate": 5.53944258777462e-06, "loss": 0.4568, "step": 1419 }, { "epoch": 1.4002957850628543, "grad_norm": 0.16262161805177902, "learning_rate": 5.534300292873059e-06, "loss": 0.4512, "step": 1420 }, { "epoch": 1.4012817352723688, "grad_norm": 0.16106454629537711, "learning_rate": 5.5291574262355055e-06, "loss": 0.4613, "step": 1421 }, { "epoch": 1.4022676854818832, "grad_norm": 0.184319064463163, "learning_rate": 5.524013993365156e-06, "loss": 0.4565, "step": 1422 }, { "epoch": 1.4032536356913976, "grad_norm": 0.17067281877077936, "learning_rate": 5.518869999765821e-06, "loss": 0.4523, "step": 1423 }, { "epoch": 1.404239585900912, "grad_norm": 0.1653886506783148, "learning_rate": 5.513725450941906e-06, "loss": 0.4795, "step": 1424 }, { "epoch": 1.4052255361104264, "grad_norm": 0.1738770503738012, "learning_rate": 5.508580352398413e-06, "loss": 0.4845, "step": 1425 }, { "epoch": 1.4062114863199409, "grad_norm": 0.16058762206241256, "learning_rate": 5.503434709640929e-06, "loss": 0.4586, "step": 1426 }, { "epoch": 1.4071974365294553, "grad_norm": 0.16389304571758812, "learning_rate": 5.498288528175628e-06, "loss": 0.4505, "step": 1427 }, { "epoch": 1.4081833867389697, "grad_norm": 0.16483781266044475, "learning_rate": 5.49314181350926e-06, "loss": 0.4583, "step": 1428 }, { "epoch": 1.4091693369484841, "grad_norm": 0.16977386868298516, "learning_rate": 5.487994571149139e-06, "loss": 0.4735, "step": 1429 }, { "epoch": 1.4101552871579985, "grad_norm": 0.17373734902192936, "learning_rate": 5.482846806603153e-06, "loss": 0.4739, "step": 1430 }, { "epoch": 1.411141237367513, "grad_norm": 0.15581196161677877, "learning_rate": 5.47769852537974e-06, "loss": 0.4747, "step": 1431 }, { "epoch": 1.4121271875770274, "grad_norm": 0.18696441199681657, "learning_rate": 5.4725497329879006e-06, "loss": 0.4656, "step": 1432 }, { "epoch": 1.4131131377865418, "grad_norm": 0.16242538336812662, "learning_rate": 5.46740043493717e-06, "loss": 0.4763, "step": 1433 }, { "epoch": 1.4140990879960562, "grad_norm": 0.16208367941537546, "learning_rate": 5.462250636737638e-06, "loss": 0.4671, "step": 1434 }, { "epoch": 1.4150850382055706, "grad_norm": 0.1681451829161332, "learning_rate": 5.457100343899918e-06, "loss": 0.4628, "step": 1435 }, { "epoch": 1.416070988415085, "grad_norm": 0.1606132001352093, "learning_rate": 5.451949561935161e-06, "loss": 0.4793, "step": 1436 }, { "epoch": 1.4170569386245995, "grad_norm": 0.1601840747483185, "learning_rate": 5.4467982963550346e-06, "loss": 0.4566, "step": 1437 }, { "epoch": 1.418042888834114, "grad_norm": 0.16071872559154282, "learning_rate": 5.441646552671731e-06, "loss": 0.4583, "step": 1438 }, { "epoch": 1.4190288390436283, "grad_norm": 0.16357197680596194, "learning_rate": 5.436494336397948e-06, "loss": 0.4604, "step": 1439 }, { "epoch": 1.4200147892531427, "grad_norm": 0.168978827594285, "learning_rate": 5.431341653046893e-06, "loss": 0.4775, "step": 1440 }, { "epoch": 1.4210007394626571, "grad_norm": 0.16579442234725897, "learning_rate": 5.4261885081322685e-06, "loss": 0.4535, "step": 1441 }, { "epoch": 1.4219866896721716, "grad_norm": 0.1577143915266565, "learning_rate": 5.421034907168279e-06, "loss": 0.4651, "step": 1442 }, { "epoch": 1.422972639881686, "grad_norm": 0.16462619439183032, "learning_rate": 5.415880855669607e-06, "loss": 0.4716, "step": 1443 }, { "epoch": 1.4239585900912004, "grad_norm": 0.15661772083930173, "learning_rate": 5.410726359151426e-06, "loss": 0.4581, "step": 1444 }, { "epoch": 1.4249445403007148, "grad_norm": 0.15937762868230831, "learning_rate": 5.40557142312938e-06, "loss": 0.4811, "step": 1445 }, { "epoch": 1.4259304905102292, "grad_norm": 0.16770318868465095, "learning_rate": 5.400416053119586e-06, "loss": 0.4624, "step": 1446 }, { "epoch": 1.4269164407197437, "grad_norm": 0.15556893949406567, "learning_rate": 5.395260254638624e-06, "loss": 0.4672, "step": 1447 }, { "epoch": 1.427902390929258, "grad_norm": 0.16142586376021115, "learning_rate": 5.390104033203533e-06, "loss": 0.4537, "step": 1448 }, { "epoch": 1.4288883411387725, "grad_norm": 0.1650474674463431, "learning_rate": 5.3849473943318045e-06, "loss": 0.473, "step": 1449 }, { "epoch": 1.429874291348287, "grad_norm": 0.16021595164370292, "learning_rate": 5.379790343541376e-06, "loss": 0.456, "step": 1450 }, { "epoch": 1.4308602415578013, "grad_norm": 0.16147362832111686, "learning_rate": 5.374632886350628e-06, "loss": 0.4627, "step": 1451 }, { "epoch": 1.4318461917673158, "grad_norm": 0.16205571482787692, "learning_rate": 5.3694750282783745e-06, "loss": 0.4455, "step": 1452 }, { "epoch": 1.4328321419768302, "grad_norm": 0.15866834834604854, "learning_rate": 5.36431677484386e-06, "loss": 0.466, "step": 1453 }, { "epoch": 1.4338180921863446, "grad_norm": 0.17128940295948444, "learning_rate": 5.3591581315667465e-06, "loss": 0.4717, "step": 1454 }, { "epoch": 1.434804042395859, "grad_norm": 0.1661069463954318, "learning_rate": 5.353999103967119e-06, "loss": 0.4846, "step": 1455 }, { "epoch": 1.4357899926053734, "grad_norm": 0.16101274669849455, "learning_rate": 5.348839697565472e-06, "loss": 0.4547, "step": 1456 }, { "epoch": 1.4367759428148879, "grad_norm": 0.1576415609417486, "learning_rate": 5.343679917882707e-06, "loss": 0.4614, "step": 1457 }, { "epoch": 1.4377618930244023, "grad_norm": 0.17253401627812268, "learning_rate": 5.338519770440119e-06, "loss": 0.4699, "step": 1458 }, { "epoch": 1.4387478432339167, "grad_norm": 0.16345541147492304, "learning_rate": 5.333359260759406e-06, "loss": 0.4583, "step": 1459 }, { "epoch": 1.439733793443431, "grad_norm": 0.17320027594932633, "learning_rate": 5.3281983943626436e-06, "loss": 0.4754, "step": 1460 }, { "epoch": 1.4407197436529455, "grad_norm": 0.16554097410343635, "learning_rate": 5.3230371767722966e-06, "loss": 0.4756, "step": 1461 }, { "epoch": 1.44170569386246, "grad_norm": 0.1660296139253079, "learning_rate": 5.317875613511202e-06, "loss": 0.4574, "step": 1462 }, { "epoch": 1.4426916440719744, "grad_norm": 0.1617940600058465, "learning_rate": 5.312713710102567e-06, "loss": 0.4615, "step": 1463 }, { "epoch": 1.4436775942814888, "grad_norm": 0.16950274075793623, "learning_rate": 5.307551472069964e-06, "loss": 0.4792, "step": 1464 }, { "epoch": 1.4446635444910032, "grad_norm": 0.17120408352777172, "learning_rate": 5.302388904937323e-06, "loss": 0.4708, "step": 1465 }, { "epoch": 1.4456494947005176, "grad_norm": 0.1645352322140342, "learning_rate": 5.2972260142289255e-06, "loss": 0.4603, "step": 1466 }, { "epoch": 1.446635444910032, "grad_norm": 0.16716175595707267, "learning_rate": 5.2920628054694004e-06, "loss": 0.4691, "step": 1467 }, { "epoch": 1.4476213951195465, "grad_norm": 0.165255412064843, "learning_rate": 5.286899284183714e-06, "loss": 0.4617, "step": 1468 }, { "epoch": 1.4486073453290609, "grad_norm": 0.1605534962378199, "learning_rate": 5.281735455897172e-06, "loss": 0.457, "step": 1469 }, { "epoch": 1.4495932955385753, "grad_norm": 0.1620741209304123, "learning_rate": 5.276571326135405e-06, "loss": 0.4545, "step": 1470 }, { "epoch": 1.4505792457480897, "grad_norm": 0.15537470368008646, "learning_rate": 5.271406900424366e-06, "loss": 0.4581, "step": 1471 }, { "epoch": 1.4515651959576041, "grad_norm": 0.1606369928134782, "learning_rate": 5.266242184290327e-06, "loss": 0.4704, "step": 1472 }, { "epoch": 1.4525511461671186, "grad_norm": 0.16149958580223164, "learning_rate": 5.261077183259867e-06, "loss": 0.4487, "step": 1473 }, { "epoch": 1.453537096376633, "grad_norm": 0.1639900897443875, "learning_rate": 5.2559119028598775e-06, "loss": 0.4466, "step": 1474 }, { "epoch": 1.4545230465861474, "grad_norm": 0.16965276827089013, "learning_rate": 5.250746348617538e-06, "loss": 0.4743, "step": 1475 }, { "epoch": 1.4555089967956618, "grad_norm": 0.16501635832606454, "learning_rate": 5.245580526060331e-06, "loss": 0.446, "step": 1476 }, { "epoch": 1.4564949470051762, "grad_norm": 0.15996363943632896, "learning_rate": 5.2404144407160195e-06, "loss": 0.4678, "step": 1477 }, { "epoch": 1.4574808972146907, "grad_norm": 0.16159558576747365, "learning_rate": 5.235248098112652e-06, "loss": 0.4726, "step": 1478 }, { "epoch": 1.458466847424205, "grad_norm": 0.16461503170307748, "learning_rate": 5.230081503778548e-06, "loss": 0.4693, "step": 1479 }, { "epoch": 1.4594527976337195, "grad_norm": 0.15563785455833773, "learning_rate": 5.224914663242303e-06, "loss": 0.4559, "step": 1480 }, { "epoch": 1.460438747843234, "grad_norm": 0.1651776006276481, "learning_rate": 5.219747582032767e-06, "loss": 0.4746, "step": 1481 }, { "epoch": 1.4614246980527483, "grad_norm": 0.16077563836492886, "learning_rate": 5.214580265679055e-06, "loss": 0.46, "step": 1482 }, { "epoch": 1.4624106482622627, "grad_norm": 0.16175140377905586, "learning_rate": 5.209412719710529e-06, "loss": 0.4657, "step": 1483 }, { "epoch": 1.4633965984717772, "grad_norm": 0.1696886394681388, "learning_rate": 5.204244949656802e-06, "loss": 0.4611, "step": 1484 }, { "epoch": 1.4643825486812916, "grad_norm": 0.15941472334718274, "learning_rate": 5.19907696104772e-06, "loss": 0.487, "step": 1485 }, { "epoch": 1.465368498890806, "grad_norm": 0.16204398737122389, "learning_rate": 5.193908759413369e-06, "loss": 0.4795, "step": 1486 }, { "epoch": 1.4663544491003204, "grad_norm": 0.1658665941820137, "learning_rate": 5.188740350284058e-06, "loss": 0.4766, "step": 1487 }, { "epoch": 1.4673403993098348, "grad_norm": 0.16443166686761135, "learning_rate": 5.18357173919032e-06, "loss": 0.4696, "step": 1488 }, { "epoch": 1.4683263495193493, "grad_norm": 0.1961560666508905, "learning_rate": 5.178402931662905e-06, "loss": 0.4579, "step": 1489 }, { "epoch": 1.4693122997288637, "grad_norm": 0.16138047883085868, "learning_rate": 5.173233933232774e-06, "loss": 0.4583, "step": 1490 }, { "epoch": 1.470298249938378, "grad_norm": 0.16087565868089423, "learning_rate": 5.168064749431089e-06, "loss": 0.4507, "step": 1491 }, { "epoch": 1.4712842001478925, "grad_norm": 0.15835325164556538, "learning_rate": 5.162895385789214e-06, "loss": 0.4618, "step": 1492 }, { "epoch": 1.472270150357407, "grad_norm": 0.15972352925273195, "learning_rate": 5.157725847838702e-06, "loss": 0.4639, "step": 1493 }, { "epoch": 1.4732561005669214, "grad_norm": 0.16660573318241859, "learning_rate": 5.152556141111295e-06, "loss": 0.4719, "step": 1494 }, { "epoch": 1.4742420507764358, "grad_norm": 0.18062616563596273, "learning_rate": 5.147386271138916e-06, "loss": 0.4885, "step": 1495 }, { "epoch": 1.4752280009859502, "grad_norm": 0.1659592283655134, "learning_rate": 5.142216243453657e-06, "loss": 0.458, "step": 1496 }, { "epoch": 1.4762139511954646, "grad_norm": 0.16673357568626188, "learning_rate": 5.137046063587789e-06, "loss": 0.4655, "step": 1497 }, { "epoch": 1.477199901404979, "grad_norm": 0.16242794037498784, "learning_rate": 5.131875737073736e-06, "loss": 0.4653, "step": 1498 }, { "epoch": 1.4781858516144935, "grad_norm": 0.16336059472360087, "learning_rate": 5.126705269444084e-06, "loss": 0.4488, "step": 1499 }, { "epoch": 1.4791718018240079, "grad_norm": 0.16619479701184342, "learning_rate": 5.1215346662315705e-06, "loss": 0.4681, "step": 1500 }, { "epoch": 1.4801577520335223, "grad_norm": 0.15907452378843642, "learning_rate": 5.116363932969074e-06, "loss": 0.4701, "step": 1501 }, { "epoch": 1.4811437022430367, "grad_norm": 0.16682593799018364, "learning_rate": 5.111193075189617e-06, "loss": 0.4856, "step": 1502 }, { "epoch": 1.4821296524525511, "grad_norm": 0.1643805853475963, "learning_rate": 5.106022098426351e-06, "loss": 0.4595, "step": 1503 }, { "epoch": 1.4831156026620655, "grad_norm": 0.15870679737925164, "learning_rate": 5.100851008212557e-06, "loss": 0.4582, "step": 1504 }, { "epoch": 1.48410155287158, "grad_norm": 0.15789400323136946, "learning_rate": 5.095679810081641e-06, "loss": 0.4776, "step": 1505 }, { "epoch": 1.4850875030810944, "grad_norm": 0.15848384285661024, "learning_rate": 5.090508509567115e-06, "loss": 0.4697, "step": 1506 }, { "epoch": 1.4860734532906088, "grad_norm": 0.16505008193274287, "learning_rate": 5.08533711220261e-06, "loss": 0.4665, "step": 1507 }, { "epoch": 1.4870594035001232, "grad_norm": 0.17116248764476494, "learning_rate": 5.080165623521854e-06, "loss": 0.4541, "step": 1508 }, { "epoch": 1.4880453537096376, "grad_norm": 0.1541353248533124, "learning_rate": 5.0749940490586795e-06, "loss": 0.4711, "step": 1509 }, { "epoch": 1.489031303919152, "grad_norm": 0.1623999670338064, "learning_rate": 5.069822394347004e-06, "loss": 0.4818, "step": 1510 }, { "epoch": 1.4900172541286665, "grad_norm": 0.32249731811742466, "learning_rate": 5.064650664920834e-06, "loss": 0.475, "step": 1511 }, { "epoch": 1.491003204338181, "grad_norm": 0.16355643012459464, "learning_rate": 5.059478866314255e-06, "loss": 0.4768, "step": 1512 }, { "epoch": 1.4919891545476953, "grad_norm": 0.15994494636484732, "learning_rate": 5.05430700406143e-06, "loss": 0.4566, "step": 1513 }, { "epoch": 1.4929751047572097, "grad_norm": 0.16280872589015538, "learning_rate": 5.049135083696585e-06, "loss": 0.4881, "step": 1514 }, { "epoch": 1.4939610549667242, "grad_norm": 0.1644086039173946, "learning_rate": 5.04396311075401e-06, "loss": 0.4654, "step": 1515 }, { "epoch": 1.4949470051762386, "grad_norm": 0.16504330201013406, "learning_rate": 5.038791090768055e-06, "loss": 0.458, "step": 1516 }, { "epoch": 1.495932955385753, "grad_norm": 0.17997686772783875, "learning_rate": 5.033619029273112e-06, "loss": 0.4736, "step": 1517 }, { "epoch": 1.4969189055952674, "grad_norm": 0.15963908599115606, "learning_rate": 5.0284469318036285e-06, "loss": 0.4536, "step": 1518 }, { "epoch": 1.4979048558047818, "grad_norm": 0.15979259663425793, "learning_rate": 5.023274803894079e-06, "loss": 0.4619, "step": 1519 }, { "epoch": 1.4988908060142963, "grad_norm": 0.1565926166086123, "learning_rate": 5.018102651078981e-06, "loss": 0.4569, "step": 1520 }, { "epoch": 1.4998767562238107, "grad_norm": 0.16128313111834786, "learning_rate": 5.012930478892869e-06, "loss": 0.4893, "step": 1521 }, { "epoch": 1.500862706433325, "grad_norm": 0.16573407567036658, "learning_rate": 5.0077582928703065e-06, "loss": 0.47, "step": 1522 }, { "epoch": 1.5018486566428395, "grad_norm": 0.1587716773283982, "learning_rate": 5.002586098545867e-06, "loss": 0.4637, "step": 1523 }, { "epoch": 1.502834606852354, "grad_norm": 0.15964197854300818, "learning_rate": 4.997413901454134e-06, "loss": 0.4519, "step": 1524 }, { "epoch": 1.5038205570618683, "grad_norm": 0.15912646887006812, "learning_rate": 4.9922417071296935e-06, "loss": 0.4649, "step": 1525 }, { "epoch": 1.5048065072713828, "grad_norm": 0.17354657537387477, "learning_rate": 4.987069521107131e-06, "loss": 0.4517, "step": 1526 }, { "epoch": 1.5057924574808972, "grad_norm": 0.1669639920640026, "learning_rate": 4.981897348921021e-06, "loss": 0.4517, "step": 1527 }, { "epoch": 1.5067784076904116, "grad_norm": 0.15981049076839046, "learning_rate": 4.976725196105922e-06, "loss": 0.4684, "step": 1528 }, { "epoch": 1.507764357899926, "grad_norm": 0.16231859468822926, "learning_rate": 4.971553068196373e-06, "loss": 0.4502, "step": 1529 }, { "epoch": 1.5087503081094404, "grad_norm": 0.17331787623465963, "learning_rate": 4.966380970726889e-06, "loss": 0.4512, "step": 1530 }, { "epoch": 1.5097362583189549, "grad_norm": 0.1712976011319666, "learning_rate": 4.961208909231946e-06, "loss": 0.4866, "step": 1531 }, { "epoch": 1.5107222085284693, "grad_norm": 0.16007715604197134, "learning_rate": 4.956036889245991e-06, "loss": 0.4321, "step": 1532 }, { "epoch": 1.5117081587379837, "grad_norm": 0.16106511677415258, "learning_rate": 4.950864916303417e-06, "loss": 0.461, "step": 1533 }, { "epoch": 1.5126941089474981, "grad_norm": 0.1607939029014845, "learning_rate": 4.945692995938573e-06, "loss": 0.462, "step": 1534 }, { "epoch": 1.5136800591570125, "grad_norm": 0.16294803618833043, "learning_rate": 4.940521133685746e-06, "loss": 0.4719, "step": 1535 }, { "epoch": 1.514666009366527, "grad_norm": 0.16300678600106153, "learning_rate": 4.935349335079168e-06, "loss": 0.4754, "step": 1536 }, { "epoch": 1.5156519595760414, "grad_norm": 0.16337752099860722, "learning_rate": 4.930177605652999e-06, "loss": 0.4769, "step": 1537 }, { "epoch": 1.5166379097855558, "grad_norm": 0.17190254222704324, "learning_rate": 4.925005950941322e-06, "loss": 0.4773, "step": 1538 }, { "epoch": 1.5176238599950702, "grad_norm": 0.16075229340350197, "learning_rate": 4.919834376478147e-06, "loss": 0.4537, "step": 1539 }, { "epoch": 1.5186098102045846, "grad_norm": 0.16430904130065005, "learning_rate": 4.914662887797391e-06, "loss": 0.4771, "step": 1540 }, { "epoch": 1.519595760414099, "grad_norm": 0.1590528354097433, "learning_rate": 4.909491490432886e-06, "loss": 0.4659, "step": 1541 }, { "epoch": 1.5205817106236135, "grad_norm": 0.16323440850911192, "learning_rate": 4.904320189918362e-06, "loss": 0.4754, "step": 1542 }, { "epoch": 1.5215676608331279, "grad_norm": 0.16394665404460873, "learning_rate": 4.899148991787444e-06, "loss": 0.4664, "step": 1543 }, { "epoch": 1.5225536110426423, "grad_norm": 0.16259969161433288, "learning_rate": 4.893977901573651e-06, "loss": 0.4695, "step": 1544 }, { "epoch": 1.5235395612521567, "grad_norm": 0.1697390736728767, "learning_rate": 4.888806924810385e-06, "loss": 0.4807, "step": 1545 }, { "epoch": 1.5245255114616711, "grad_norm": 0.16879485066467068, "learning_rate": 4.883636067030927e-06, "loss": 0.4772, "step": 1546 }, { "epoch": 1.5255114616711856, "grad_norm": 0.16094250386353298, "learning_rate": 4.878465333768432e-06, "loss": 0.4771, "step": 1547 }, { "epoch": 1.5264974118807, "grad_norm": 0.16603533601441273, "learning_rate": 4.873294730555917e-06, "loss": 0.4579, "step": 1548 }, { "epoch": 1.5274833620902144, "grad_norm": 0.1662652078835514, "learning_rate": 4.868124262926266e-06, "loss": 0.4482, "step": 1549 }, { "epoch": 1.5284693122997288, "grad_norm": 0.2053258025193838, "learning_rate": 4.862953936412212e-06, "loss": 0.4744, "step": 1550 }, { "epoch": 1.5294552625092432, "grad_norm": 0.1610932388153637, "learning_rate": 4.857783756546343e-06, "loss": 0.4697, "step": 1551 }, { "epoch": 1.5304412127187577, "grad_norm": 0.16323632481812023, "learning_rate": 4.852613728861087e-06, "loss": 0.4596, "step": 1552 }, { "epoch": 1.531427162928272, "grad_norm": 0.15864551904706486, "learning_rate": 4.847443858888707e-06, "loss": 0.4642, "step": 1553 }, { "epoch": 1.5324131131377865, "grad_norm": 0.16363552393779485, "learning_rate": 4.842274152161298e-06, "loss": 0.4665, "step": 1554 }, { "epoch": 1.533399063347301, "grad_norm": 0.1606011248078566, "learning_rate": 4.8371046142107865e-06, "loss": 0.4547, "step": 1555 }, { "epoch": 1.5343850135568153, "grad_norm": 0.16610436957840455, "learning_rate": 4.831935250568911e-06, "loss": 0.4789, "step": 1556 }, { "epoch": 1.5353709637663298, "grad_norm": 0.16457826211107202, "learning_rate": 4.826766066767228e-06, "loss": 0.4577, "step": 1557 }, { "epoch": 1.5363569139758442, "grad_norm": 0.1583944339333812, "learning_rate": 4.821597068337097e-06, "loss": 0.4686, "step": 1558 }, { "epoch": 1.5373428641853586, "grad_norm": 0.2130665095886175, "learning_rate": 4.816428260809682e-06, "loss": 0.4815, "step": 1559 }, { "epoch": 1.538328814394873, "grad_norm": 0.1717835073413018, "learning_rate": 4.811259649715945e-06, "loss": 0.478, "step": 1560 }, { "epoch": 1.5393147646043874, "grad_norm": 0.16485264222155294, "learning_rate": 4.806091240586633e-06, "loss": 0.4796, "step": 1561 }, { "epoch": 1.5403007148139019, "grad_norm": 0.16782791251902787, "learning_rate": 4.800923038952282e-06, "loss": 0.4606, "step": 1562 }, { "epoch": 1.5412866650234163, "grad_norm": 0.15869786778702402, "learning_rate": 4.795755050343199e-06, "loss": 0.4613, "step": 1563 }, { "epoch": 1.5422726152329307, "grad_norm": 0.16360053404141608, "learning_rate": 4.790587280289472e-06, "loss": 0.4677, "step": 1564 }, { "epoch": 1.543258565442445, "grad_norm": 0.17053556390655156, "learning_rate": 4.785419734320946e-06, "loss": 0.4779, "step": 1565 }, { "epoch": 1.5442445156519595, "grad_norm": 0.16345973492325022, "learning_rate": 4.780252417967234e-06, "loss": 0.4794, "step": 1566 }, { "epoch": 1.545230465861474, "grad_norm": 0.1624815669558633, "learning_rate": 4.775085336757699e-06, "loss": 0.4815, "step": 1567 }, { "epoch": 1.5462164160709884, "grad_norm": 0.16126412585156985, "learning_rate": 4.7699184962214526e-06, "loss": 0.4676, "step": 1568 }, { "epoch": 1.5472023662805028, "grad_norm": 0.3017180723300519, "learning_rate": 4.764751901887349e-06, "loss": 0.4688, "step": 1569 }, { "epoch": 1.5481883164900172, "grad_norm": 0.16701523355162504, "learning_rate": 4.759585559283981e-06, "loss": 0.4846, "step": 1570 }, { "epoch": 1.5491742666995316, "grad_norm": 0.16211400835309955, "learning_rate": 4.754419473939669e-06, "loss": 0.4758, "step": 1571 }, { "epoch": 1.550160216909046, "grad_norm": 0.1564366269152217, "learning_rate": 4.7492536513824634e-06, "loss": 0.4503, "step": 1572 }, { "epoch": 1.5511461671185605, "grad_norm": 0.17540862590653875, "learning_rate": 4.744088097140125e-06, "loss": 0.4452, "step": 1573 }, { "epoch": 1.5521321173280749, "grad_norm": 0.15707201548033853, "learning_rate": 4.738922816740134e-06, "loss": 0.4548, "step": 1574 }, { "epoch": 1.5531180675375893, "grad_norm": 0.16604684393940009, "learning_rate": 4.733757815709675e-06, "loss": 0.4701, "step": 1575 }, { "epoch": 1.5541040177471037, "grad_norm": 0.16027270475518166, "learning_rate": 4.7285930995756355e-06, "loss": 0.4637, "step": 1576 }, { "epoch": 1.5550899679566181, "grad_norm": 0.16005286781324335, "learning_rate": 4.7234286738645975e-06, "loss": 0.4723, "step": 1577 }, { "epoch": 1.5560759181661328, "grad_norm": 0.16517741964187063, "learning_rate": 4.718264544102829e-06, "loss": 0.4765, "step": 1578 }, { "epoch": 1.557061868375647, "grad_norm": 0.17187802486105916, "learning_rate": 4.713100715816287e-06, "loss": 0.4653, "step": 1579 }, { "epoch": 1.5580478185851616, "grad_norm": 0.15723170870463232, "learning_rate": 4.707937194530602e-06, "loss": 0.4597, "step": 1580 }, { "epoch": 1.5590337687946758, "grad_norm": 0.16060894388992308, "learning_rate": 4.702773985771075e-06, "loss": 0.4468, "step": 1581 }, { "epoch": 1.5600197190041905, "grad_norm": 0.16123556569027944, "learning_rate": 4.697611095062679e-06, "loss": 0.4735, "step": 1582 }, { "epoch": 1.5610056692137046, "grad_norm": 0.16803448436652849, "learning_rate": 4.692448527930038e-06, "loss": 0.4645, "step": 1583 }, { "epoch": 1.5619916194232193, "grad_norm": 0.1597392579513729, "learning_rate": 4.6872862898974345e-06, "loss": 0.4621, "step": 1584 }, { "epoch": 1.5629775696327335, "grad_norm": 0.15735379441805616, "learning_rate": 4.6821243864888e-06, "loss": 0.4709, "step": 1585 }, { "epoch": 1.5639635198422481, "grad_norm": 0.15404594214049994, "learning_rate": 4.676962823227704e-06, "loss": 0.4648, "step": 1586 }, { "epoch": 1.5649494700517623, "grad_norm": 0.1615620632965602, "learning_rate": 4.67180160563736e-06, "loss": 0.4564, "step": 1587 }, { "epoch": 1.565935420261277, "grad_norm": 0.16706931355044777, "learning_rate": 4.666640739240596e-06, "loss": 0.4577, "step": 1588 }, { "epoch": 1.5669213704707912, "grad_norm": 0.16696669317402332, "learning_rate": 4.661480229559882e-06, "loss": 0.4842, "step": 1589 }, { "epoch": 1.5679073206803058, "grad_norm": 0.1579948637916331, "learning_rate": 4.656320082117295e-06, "loss": 0.4547, "step": 1590 }, { "epoch": 1.56889327088982, "grad_norm": 0.15781583927294385, "learning_rate": 4.6511603024345286e-06, "loss": 0.4647, "step": 1591 }, { "epoch": 1.5698792210993346, "grad_norm": 0.2129761427535933, "learning_rate": 4.6460008960328834e-06, "loss": 0.4487, "step": 1592 }, { "epoch": 1.5708651713088488, "grad_norm": 0.2337698011010383, "learning_rate": 4.640841868433257e-06, "loss": 0.4575, "step": 1593 }, { "epoch": 1.5718511215183635, "grad_norm": 0.16212957242189352, "learning_rate": 4.635683225156142e-06, "loss": 0.4696, "step": 1594 }, { "epoch": 1.5728370717278777, "grad_norm": 0.16507879401280473, "learning_rate": 4.630524971721626e-06, "loss": 0.4648, "step": 1595 }, { "epoch": 1.5738230219373923, "grad_norm": 0.16957907647895618, "learning_rate": 4.625367113649371e-06, "loss": 0.4868, "step": 1596 }, { "epoch": 1.5748089721469065, "grad_norm": 0.16317374423938397, "learning_rate": 4.620209656458626e-06, "loss": 0.4871, "step": 1597 }, { "epoch": 1.5757949223564212, "grad_norm": 0.16220240581447787, "learning_rate": 4.615052605668198e-06, "loss": 0.4669, "step": 1598 }, { "epoch": 1.5767808725659354, "grad_norm": 0.16935425806114593, "learning_rate": 4.609895966796469e-06, "loss": 0.4679, "step": 1599 }, { "epoch": 1.57776682277545, "grad_norm": 0.16292461772554667, "learning_rate": 4.604739745361377e-06, "loss": 0.4421, "step": 1600 }, { "epoch": 1.5787527729849642, "grad_norm": 0.1605822869671949, "learning_rate": 4.599583946880415e-06, "loss": 0.4593, "step": 1601 }, { "epoch": 1.5797387231944788, "grad_norm": 0.16032696814199088, "learning_rate": 4.594428576870622e-06, "loss": 0.467, "step": 1602 }, { "epoch": 1.580724673403993, "grad_norm": 0.16240637061318583, "learning_rate": 4.589273640848575e-06, "loss": 0.4717, "step": 1603 }, { "epoch": 1.5817106236135077, "grad_norm": 0.16418809930496206, "learning_rate": 4.584119144330394e-06, "loss": 0.478, "step": 1604 }, { "epoch": 1.5826965738230219, "grad_norm": 0.17631339108317104, "learning_rate": 4.578965092831722e-06, "loss": 0.4878, "step": 1605 }, { "epoch": 1.5836825240325365, "grad_norm": 0.16036063679879956, "learning_rate": 4.5738114918677315e-06, "loss": 0.4712, "step": 1606 }, { "epoch": 1.5846684742420507, "grad_norm": 0.1605868130676638, "learning_rate": 4.568658346953109e-06, "loss": 0.4926, "step": 1607 }, { "epoch": 1.5856544244515653, "grad_norm": 0.16388430528523226, "learning_rate": 4.563505663602054e-06, "loss": 0.4574, "step": 1608 }, { "epoch": 1.5866403746610795, "grad_norm": 0.16391718314552686, "learning_rate": 4.558353447328271e-06, "loss": 0.4617, "step": 1609 }, { "epoch": 1.5876263248705942, "grad_norm": 0.16632160236460128, "learning_rate": 4.553201703644966e-06, "loss": 0.4643, "step": 1610 }, { "epoch": 1.5886122750801084, "grad_norm": 0.17815327746175302, "learning_rate": 4.54805043806484e-06, "loss": 0.4697, "step": 1611 }, { "epoch": 1.589598225289623, "grad_norm": 0.16042376911405104, "learning_rate": 4.542899656100082e-06, "loss": 0.4638, "step": 1612 }, { "epoch": 1.5905841754991372, "grad_norm": 0.16155445892241058, "learning_rate": 4.5377493632623644e-06, "loss": 0.4662, "step": 1613 }, { "epoch": 1.5915701257086519, "grad_norm": 0.16766169662221317, "learning_rate": 4.532599565062831e-06, "loss": 0.4596, "step": 1614 }, { "epoch": 1.592556075918166, "grad_norm": 0.17423163991048965, "learning_rate": 4.527450267012101e-06, "loss": 0.4677, "step": 1615 }, { "epoch": 1.5935420261276807, "grad_norm": 0.15822696845412731, "learning_rate": 4.52230147462026e-06, "loss": 0.4441, "step": 1616 }, { "epoch": 1.594527976337195, "grad_norm": 0.16065271742379106, "learning_rate": 4.517153193396847e-06, "loss": 0.4548, "step": 1617 }, { "epoch": 1.5955139265467095, "grad_norm": 0.15326255834855834, "learning_rate": 4.5120054288508615e-06, "loss": 0.4367, "step": 1618 }, { "epoch": 1.5964998767562237, "grad_norm": 0.16989129262402686, "learning_rate": 4.506858186490743e-06, "loss": 0.4661, "step": 1619 }, { "epoch": 1.5974858269657384, "grad_norm": 0.16496856927193837, "learning_rate": 4.501711471824373e-06, "loss": 0.4677, "step": 1620 }, { "epoch": 1.5984717771752526, "grad_norm": 0.16312540567729633, "learning_rate": 4.496565290359072e-06, "loss": 0.4702, "step": 1621 }, { "epoch": 1.5994577273847672, "grad_norm": 0.15744378380707014, "learning_rate": 4.49141964760159e-06, "loss": 0.4421, "step": 1622 }, { "epoch": 1.6004436775942814, "grad_norm": 0.16246223308682423, "learning_rate": 4.486274549058097e-06, "loss": 0.4628, "step": 1623 }, { "epoch": 1.601429627803796, "grad_norm": 0.1626597411867869, "learning_rate": 4.481130000234181e-06, "loss": 0.4698, "step": 1624 }, { "epoch": 1.6024155780133102, "grad_norm": 0.16303281734251007, "learning_rate": 4.475986006634845e-06, "loss": 0.4809, "step": 1625 }, { "epoch": 1.603401528222825, "grad_norm": 0.15722472582946692, "learning_rate": 4.470842573764497e-06, "loss": 0.456, "step": 1626 }, { "epoch": 1.604387478432339, "grad_norm": 0.16625605076459507, "learning_rate": 4.465699707126941e-06, "loss": 0.4429, "step": 1627 }, { "epoch": 1.6053734286418537, "grad_norm": 0.18406674358801198, "learning_rate": 4.460557412225382e-06, "loss": 0.4699, "step": 1628 }, { "epoch": 1.606359378851368, "grad_norm": 0.1721080150370503, "learning_rate": 4.455415694562406e-06, "loss": 0.4697, "step": 1629 }, { "epoch": 1.6073453290608826, "grad_norm": 0.16948457739433334, "learning_rate": 4.450274559639985e-06, "loss": 0.4678, "step": 1630 }, { "epoch": 1.6083312792703968, "grad_norm": 0.15883455112278896, "learning_rate": 4.44513401295947e-06, "loss": 0.4687, "step": 1631 }, { "epoch": 1.6093172294799114, "grad_norm": 0.15632932325561036, "learning_rate": 4.4399940600215755e-06, "loss": 0.4517, "step": 1632 }, { "epoch": 1.6103031796894256, "grad_norm": 0.16230052278603177, "learning_rate": 4.434854706326391e-06, "loss": 0.4729, "step": 1633 }, { "epoch": 1.6112891298989402, "grad_norm": 0.15862028377618406, "learning_rate": 4.42971595737335e-06, "loss": 0.4564, "step": 1634 }, { "epoch": 1.6122750801084544, "grad_norm": 0.16736418715930995, "learning_rate": 4.424577818661255e-06, "loss": 0.4691, "step": 1635 }, { "epoch": 1.613261030317969, "grad_norm": 0.1711039646803351, "learning_rate": 4.419440295688241e-06, "loss": 0.4627, "step": 1636 }, { "epoch": 1.6142469805274833, "grad_norm": 0.15903724979653044, "learning_rate": 4.4143033939517975e-06, "loss": 0.4729, "step": 1637 }, { "epoch": 1.615232930736998, "grad_norm": 0.1607537226026667, "learning_rate": 4.409167118948742e-06, "loss": 0.4786, "step": 1638 }, { "epoch": 1.6162188809465121, "grad_norm": 5.445659751986802, "learning_rate": 4.404031476175218e-06, "loss": 0.4643, "step": 1639 }, { "epoch": 1.6172048311560268, "grad_norm": 0.16996628246894466, "learning_rate": 4.398896471126698e-06, "loss": 0.4655, "step": 1640 }, { "epoch": 1.618190781365541, "grad_norm": 0.17309182559629868, "learning_rate": 4.393762109297973e-06, "loss": 0.4658, "step": 1641 }, { "epoch": 1.6191767315750556, "grad_norm": 0.16078101698207184, "learning_rate": 4.388628396183141e-06, "loss": 0.4393, "step": 1642 }, { "epoch": 1.6201626817845698, "grad_norm": 0.1629599532326369, "learning_rate": 4.383495337275611e-06, "loss": 0.4589, "step": 1643 }, { "epoch": 1.6211486319940844, "grad_norm": 0.1550532532080627, "learning_rate": 4.378362938068087e-06, "loss": 0.457, "step": 1644 }, { "epoch": 1.6221345822035986, "grad_norm": 0.16125937412625427, "learning_rate": 4.3732312040525694e-06, "loss": 0.4645, "step": 1645 }, { "epoch": 1.6231205324131133, "grad_norm": 0.15873540121958585, "learning_rate": 4.368100140720347e-06, "loss": 0.4731, "step": 1646 }, { "epoch": 1.6241064826226275, "grad_norm": 0.17053084904384064, "learning_rate": 4.362969753561992e-06, "loss": 0.4716, "step": 1647 }, { "epoch": 1.625092432832142, "grad_norm": 0.16579599401772996, "learning_rate": 4.357840048067351e-06, "loss": 0.4747, "step": 1648 }, { "epoch": 1.6260783830416563, "grad_norm": 0.18203093078322705, "learning_rate": 4.352711029725539e-06, "loss": 0.4582, "step": 1649 }, { "epoch": 1.627064333251171, "grad_norm": 0.1687855099891574, "learning_rate": 4.347582704024942e-06, "loss": 0.4528, "step": 1650 }, { "epoch": 1.6280502834606851, "grad_norm": 0.16436232071081422, "learning_rate": 4.3424550764531995e-06, "loss": 0.4677, "step": 1651 }, { "epoch": 1.6290362336701998, "grad_norm": 0.17223817477890296, "learning_rate": 4.337328152497206e-06, "loss": 0.458, "step": 1652 }, { "epoch": 1.630022183879714, "grad_norm": 0.16785666118136983, "learning_rate": 4.332201937643107e-06, "loss": 0.4709, "step": 1653 }, { "epoch": 1.6310081340892286, "grad_norm": 0.16085219192688313, "learning_rate": 4.3270764373762796e-06, "loss": 0.4787, "step": 1654 }, { "epoch": 1.6319940842987428, "grad_norm": 0.16254146140856732, "learning_rate": 4.321951657181343e-06, "loss": 0.4763, "step": 1655 }, { "epoch": 1.6329800345082575, "grad_norm": 0.1598557744687256, "learning_rate": 4.316827602542146e-06, "loss": 0.4612, "step": 1656 }, { "epoch": 1.6339659847177717, "grad_norm": 0.15979576055073055, "learning_rate": 4.3117042789417586e-06, "loss": 0.468, "step": 1657 }, { "epoch": 1.6349519349272863, "grad_norm": 0.16708220010734257, "learning_rate": 4.306581691862471e-06, "loss": 0.4596, "step": 1658 }, { "epoch": 1.6359378851368005, "grad_norm": 0.1600582187214164, "learning_rate": 4.301459846785784e-06, "loss": 0.4488, "step": 1659 }, { "epoch": 1.6369238353463151, "grad_norm": 0.16661403488418633, "learning_rate": 4.2963387491924015e-06, "loss": 0.477, "step": 1660 }, { "epoch": 1.6379097855558293, "grad_norm": 0.15859897062400877, "learning_rate": 4.2912184045622325e-06, "loss": 0.483, "step": 1661 }, { "epoch": 1.638895735765344, "grad_norm": 0.16991212485166873, "learning_rate": 4.2860988183743785e-06, "loss": 0.4574, "step": 1662 }, { "epoch": 1.6398816859748582, "grad_norm": 0.16312737419856346, "learning_rate": 4.280979996107129e-06, "loss": 0.4533, "step": 1663 }, { "epoch": 1.6408676361843728, "grad_norm": 0.15925356100580668, "learning_rate": 4.275861943237953e-06, "loss": 0.4583, "step": 1664 }, { "epoch": 1.641853586393887, "grad_norm": 0.16127114186632893, "learning_rate": 4.270744665243504e-06, "loss": 0.4606, "step": 1665 }, { "epoch": 1.6428395366034017, "grad_norm": 0.16111245788776873, "learning_rate": 4.265628167599599e-06, "loss": 0.4588, "step": 1666 }, { "epoch": 1.6438254868129158, "grad_norm": 0.16822860303261428, "learning_rate": 4.260512455781221e-06, "loss": 0.4807, "step": 1667 }, { "epoch": 1.6448114370224305, "grad_norm": 0.15926862917951995, "learning_rate": 4.255397535262518e-06, "loss": 0.4551, "step": 1668 }, { "epoch": 1.6457973872319447, "grad_norm": 0.1592182907342588, "learning_rate": 4.250283411516784e-06, "loss": 0.4562, "step": 1669 }, { "epoch": 1.6467833374414593, "grad_norm": 0.16578470661790773, "learning_rate": 4.245170090016463e-06, "loss": 0.46, "step": 1670 }, { "epoch": 1.6477692876509735, "grad_norm": 0.16954867487540962, "learning_rate": 4.240057576233142e-06, "loss": 0.4662, "step": 1671 }, { "epoch": 1.6487552378604882, "grad_norm": 0.16085933560840476, "learning_rate": 4.234945875637543e-06, "loss": 0.4656, "step": 1672 }, { "epoch": 1.6497411880700024, "grad_norm": 0.15603353053978614, "learning_rate": 4.229834993699518e-06, "loss": 0.4515, "step": 1673 }, { "epoch": 1.650727138279517, "grad_norm": 0.1600911760925506, "learning_rate": 4.224724935888039e-06, "loss": 0.4741, "step": 1674 }, { "epoch": 1.6517130884890312, "grad_norm": 0.15705474690537036, "learning_rate": 4.219615707671204e-06, "loss": 0.454, "step": 1675 }, { "epoch": 1.6526990386985458, "grad_norm": 0.15647312431168459, "learning_rate": 4.214507314516214e-06, "loss": 0.4631, "step": 1676 }, { "epoch": 1.65368498890806, "grad_norm": 0.1686109701449764, "learning_rate": 4.2093997618893865e-06, "loss": 0.4626, "step": 1677 }, { "epoch": 1.6546709391175747, "grad_norm": 0.16045455935162395, "learning_rate": 4.204293055256131e-06, "loss": 0.4718, "step": 1678 }, { "epoch": 1.6556568893270889, "grad_norm": 0.15497677791978665, "learning_rate": 4.1991872000809566e-06, "loss": 0.4706, "step": 1679 }, { "epoch": 1.6566428395366035, "grad_norm": 0.15549738678280184, "learning_rate": 4.194082201827458e-06, "loss": 0.4404, "step": 1680 }, { "epoch": 1.6576287897461177, "grad_norm": 0.15676194115412878, "learning_rate": 4.1889780659583165e-06, "loss": 0.46, "step": 1681 }, { "epoch": 1.6586147399556324, "grad_norm": 0.1596892622244796, "learning_rate": 4.183874797935286e-06, "loss": 0.4521, "step": 1682 }, { "epoch": 1.6596006901651466, "grad_norm": 0.1597247735805591, "learning_rate": 4.1787724032192e-06, "loss": 0.4593, "step": 1683 }, { "epoch": 1.6605866403746612, "grad_norm": 0.16019979591444788, "learning_rate": 4.173670887269946e-06, "loss": 0.4445, "step": 1684 }, { "epoch": 1.6615725905841754, "grad_norm": 0.15940742116423462, "learning_rate": 4.1685702555464815e-06, "loss": 0.4675, "step": 1685 }, { "epoch": 1.66255854079369, "grad_norm": 0.15907571330710468, "learning_rate": 4.16347051350681e-06, "loss": 0.46, "step": 1686 }, { "epoch": 1.6635444910032042, "grad_norm": 0.1584986180064822, "learning_rate": 4.1583716666079894e-06, "loss": 0.4704, "step": 1687 }, { "epoch": 1.6645304412127189, "grad_norm": 0.161288876602103, "learning_rate": 4.153273720306115e-06, "loss": 0.4654, "step": 1688 }, { "epoch": 1.665516391422233, "grad_norm": 0.16426213294525213, "learning_rate": 4.148176680056323e-06, "loss": 0.459, "step": 1689 }, { "epoch": 1.6665023416317477, "grad_norm": 0.15770348258781552, "learning_rate": 4.143080551312775e-06, "loss": 0.4648, "step": 1690 }, { "epoch": 1.667488291841262, "grad_norm": 0.1624674809972469, "learning_rate": 4.137985339528658e-06, "loss": 0.4733, "step": 1691 }, { "epoch": 1.6684742420507765, "grad_norm": 0.16664291687162525, "learning_rate": 4.132891050156183e-06, "loss": 0.4672, "step": 1692 }, { "epoch": 1.6694601922602907, "grad_norm": 0.15942602998198058, "learning_rate": 4.127797688646568e-06, "loss": 0.4798, "step": 1693 }, { "epoch": 1.6704461424698054, "grad_norm": 0.15683100493840488, "learning_rate": 4.1227052604500425e-06, "loss": 0.4522, "step": 1694 }, { "epoch": 1.6714320926793196, "grad_norm": 0.1612447220048338, "learning_rate": 4.117613771015831e-06, "loss": 0.4655, "step": 1695 }, { "epoch": 1.6724180428888342, "grad_norm": 0.16599298633234355, "learning_rate": 4.112523225792162e-06, "loss": 0.4631, "step": 1696 }, { "epoch": 1.6734039930983484, "grad_norm": 0.15757793172827048, "learning_rate": 4.107433630226247e-06, "loss": 0.4472, "step": 1697 }, { "epoch": 1.674389943307863, "grad_norm": 0.16235656077954422, "learning_rate": 4.102344989764285e-06, "loss": 0.4808, "step": 1698 }, { "epoch": 1.6753758935173773, "grad_norm": 0.15966798991344844, "learning_rate": 4.097257309851452e-06, "loss": 0.4677, "step": 1699 }, { "epoch": 1.676361843726892, "grad_norm": 0.16022835814471756, "learning_rate": 4.092170595931893e-06, "loss": 0.484, "step": 1700 }, { "epoch": 1.677347793936406, "grad_norm": 0.15971748320658116, "learning_rate": 4.0870848534487236e-06, "loss": 0.4556, "step": 1701 }, { "epoch": 1.6783337441459207, "grad_norm": 0.166482079806644, "learning_rate": 4.082000087844019e-06, "loss": 0.4644, "step": 1702 }, { "epoch": 1.679319694355435, "grad_norm": 0.1583529523684524, "learning_rate": 4.076916304558807e-06, "loss": 0.4585, "step": 1703 }, { "epoch": 1.6803056445649496, "grad_norm": 0.16078954728983477, "learning_rate": 4.07183350903307e-06, "loss": 0.459, "step": 1704 }, { "epoch": 1.6812915947744638, "grad_norm": 0.16189131493190925, "learning_rate": 4.066751706705723e-06, "loss": 0.4558, "step": 1705 }, { "epoch": 1.6822775449839784, "grad_norm": 0.1610716395237498, "learning_rate": 4.061670903014629e-06, "loss": 0.4496, "step": 1706 }, { "epoch": 1.6832634951934926, "grad_norm": 0.16004742786983514, "learning_rate": 4.056591103396573e-06, "loss": 0.4647, "step": 1707 }, { "epoch": 1.6842494454030073, "grad_norm": 0.16061135639699073, "learning_rate": 4.051512313287276e-06, "loss": 0.4644, "step": 1708 }, { "epoch": 1.6852353956125214, "grad_norm": 0.15916518626370224, "learning_rate": 4.04643453812137e-06, "loss": 0.4682, "step": 1709 }, { "epoch": 1.686221345822036, "grad_norm": 0.16356830805591838, "learning_rate": 4.041357783332403e-06, "loss": 0.4627, "step": 1710 }, { "epoch": 1.6872072960315503, "grad_norm": 0.1659003580908028, "learning_rate": 4.036282054352833e-06, "loss": 0.462, "step": 1711 }, { "epoch": 1.688193246241065, "grad_norm": 0.16266201853138954, "learning_rate": 4.031207356614022e-06, "loss": 0.4461, "step": 1712 }, { "epoch": 1.6891791964505791, "grad_norm": 0.1641383969266073, "learning_rate": 4.026133695546223e-06, "loss": 0.4534, "step": 1713 }, { "epoch": 1.6901651466600938, "grad_norm": 0.1546939101720392, "learning_rate": 4.021061076578585e-06, "loss": 0.4623, "step": 1714 }, { "epoch": 1.691151096869608, "grad_norm": 0.1609467801963117, "learning_rate": 4.015989505139137e-06, "loss": 0.4575, "step": 1715 }, { "epoch": 1.6921370470791226, "grad_norm": 0.2973852373889811, "learning_rate": 4.0109189866547896e-06, "loss": 0.4637, "step": 1716 }, { "epoch": 1.6931229972886368, "grad_norm": 0.1649424082175255, "learning_rate": 4.00584952655133e-06, "loss": 0.4492, "step": 1717 }, { "epoch": 1.6941089474981514, "grad_norm": 0.1974452838131342, "learning_rate": 4.000781130253406e-06, "loss": 0.4392, "step": 1718 }, { "epoch": 1.6950948977076656, "grad_norm": 0.15973190521591996, "learning_rate": 3.995713803184535e-06, "loss": 0.4795, "step": 1719 }, { "epoch": 1.6960808479171803, "grad_norm": 0.1625275330726248, "learning_rate": 3.99064755076708e-06, "loss": 0.4526, "step": 1720 }, { "epoch": 1.6970667981266945, "grad_norm": 0.16264247139181023, "learning_rate": 3.985582378422264e-06, "loss": 0.4686, "step": 1721 }, { "epoch": 1.6980527483362091, "grad_norm": 0.16762631815091714, "learning_rate": 3.980518291570148e-06, "loss": 0.4697, "step": 1722 }, { "epoch": 1.6990386985457233, "grad_norm": 0.1592486161979069, "learning_rate": 3.9754552956296365e-06, "loss": 0.46, "step": 1723 }, { "epoch": 1.700024648755238, "grad_norm": 0.1626241317549875, "learning_rate": 3.970393396018462e-06, "loss": 0.4616, "step": 1724 }, { "epoch": 1.7010105989647522, "grad_norm": 0.15743672584041096, "learning_rate": 3.965332598153186e-06, "loss": 0.4648, "step": 1725 }, { "epoch": 1.7019965491742668, "grad_norm": 0.16591668834389786, "learning_rate": 3.9602729074491884e-06, "loss": 0.4635, "step": 1726 }, { "epoch": 1.702982499383781, "grad_norm": 0.16505167909686655, "learning_rate": 3.955214329320671e-06, "loss": 0.4673, "step": 1727 }, { "epoch": 1.7039684495932956, "grad_norm": 0.15595137138501824, "learning_rate": 3.950156869180637e-06, "loss": 0.4507, "step": 1728 }, { "epoch": 1.7049543998028098, "grad_norm": 0.16523673259698257, "learning_rate": 3.9451005324409e-06, "loss": 0.4783, "step": 1729 }, { "epoch": 1.7059403500123245, "grad_norm": 0.1526757287169731, "learning_rate": 3.940045324512066e-06, "loss": 0.4521, "step": 1730 }, { "epoch": 1.7069263002218387, "grad_norm": 0.15949669042895306, "learning_rate": 3.934991250803537e-06, "loss": 0.4742, "step": 1731 }, { "epoch": 1.7079122504313533, "grad_norm": 0.16086159798759384, "learning_rate": 3.929938316723499e-06, "loss": 0.4552, "step": 1732 }, { "epoch": 1.7088982006408675, "grad_norm": 0.16265387416782154, "learning_rate": 3.924886527678921e-06, "loss": 0.4623, "step": 1733 }, { "epoch": 1.7098841508503821, "grad_norm": 0.15920090990234076, "learning_rate": 3.919835889075545e-06, "loss": 0.4549, "step": 1734 }, { "epoch": 1.7108701010598963, "grad_norm": 0.15919083855836072, "learning_rate": 3.914786406317879e-06, "loss": 0.4507, "step": 1735 }, { "epoch": 1.711856051269411, "grad_norm": 0.15985246069413103, "learning_rate": 3.909738084809201e-06, "loss": 0.4652, "step": 1736 }, { "epoch": 1.7128420014789252, "grad_norm": 0.1625145221557171, "learning_rate": 3.90469092995154e-06, "loss": 0.4774, "step": 1737 }, { "epoch": 1.7138279516884398, "grad_norm": 0.16428618121894853, "learning_rate": 3.8996449471456825e-06, "loss": 0.4798, "step": 1738 }, { "epoch": 1.714813901897954, "grad_norm": 0.15946513268958273, "learning_rate": 3.894600141791156e-06, "loss": 0.4737, "step": 1739 }, { "epoch": 1.7157998521074687, "grad_norm": 0.15438322650671502, "learning_rate": 3.88955651928623e-06, "loss": 0.4385, "step": 1740 }, { "epoch": 1.7167858023169829, "grad_norm": 0.16197537413811997, "learning_rate": 3.884514085027905e-06, "loss": 0.4584, "step": 1741 }, { "epoch": 1.7177717525264975, "grad_norm": 0.1591918416840697, "learning_rate": 3.879472844411917e-06, "loss": 0.4707, "step": 1742 }, { "epoch": 1.7187577027360117, "grad_norm": 0.16316356543524038, "learning_rate": 3.874432802832718e-06, "loss": 0.4526, "step": 1743 }, { "epoch": 1.7197436529455263, "grad_norm": 0.15830112702696214, "learning_rate": 3.869393965683484e-06, "loss": 0.4372, "step": 1744 }, { "epoch": 1.7207296031550405, "grad_norm": 0.15868936240516976, "learning_rate": 3.864356338356092e-06, "loss": 0.4676, "step": 1745 }, { "epoch": 1.7217155533645552, "grad_norm": 0.16180924305285438, "learning_rate": 3.8593199262411335e-06, "loss": 0.4598, "step": 1746 }, { "epoch": 1.7227015035740694, "grad_norm": 0.16499470407397807, "learning_rate": 3.854284734727895e-06, "loss": 0.4629, "step": 1747 }, { "epoch": 1.723687453783584, "grad_norm": 0.15922359738365144, "learning_rate": 3.84925076920436e-06, "loss": 0.4579, "step": 1748 }, { "epoch": 1.7246734039930982, "grad_norm": 0.1666424462570064, "learning_rate": 3.8442180350571974e-06, "loss": 0.4675, "step": 1749 }, { "epoch": 1.7256593542026128, "grad_norm": 0.16124955431233737, "learning_rate": 3.839186537671758e-06, "loss": 0.4643, "step": 1750 }, { "epoch": 1.726645304412127, "grad_norm": 0.1677657662261242, "learning_rate": 3.8341562824320724e-06, "loss": 0.4664, "step": 1751 }, { "epoch": 1.7276312546216417, "grad_norm": 0.15804633567061305, "learning_rate": 3.829127274720841e-06, "loss": 0.4466, "step": 1752 }, { "epoch": 1.7286172048311559, "grad_norm": 0.15774444235500484, "learning_rate": 3.8240995199194255e-06, "loss": 0.4787, "step": 1753 }, { "epoch": 1.7296031550406705, "grad_norm": 0.1547158604668105, "learning_rate": 3.819073023407854e-06, "loss": 0.4674, "step": 1754 }, { "epoch": 1.7305891052501847, "grad_norm": 0.15481840956469606, "learning_rate": 3.8140477905648e-06, "loss": 0.4562, "step": 1755 }, { "epoch": 1.7315750554596994, "grad_norm": 0.15551552639232807, "learning_rate": 3.80902382676759e-06, "loss": 0.4487, "step": 1756 }, { "epoch": 1.7325610056692136, "grad_norm": 0.1594096173164566, "learning_rate": 3.8040011373921925e-06, "loss": 0.4629, "step": 1757 }, { "epoch": 1.7335469558787282, "grad_norm": 0.1686421203994268, "learning_rate": 3.798979727813211e-06, "loss": 0.4721, "step": 1758 }, { "epoch": 1.7345329060882424, "grad_norm": 0.1701916542255308, "learning_rate": 3.7939596034038807e-06, "loss": 0.4693, "step": 1759 }, { "epoch": 1.735518856297757, "grad_norm": 0.1604063653101608, "learning_rate": 3.7889407695360565e-06, "loss": 0.4689, "step": 1760 }, { "epoch": 1.7365048065072712, "grad_norm": 0.15803387279080147, "learning_rate": 3.78392323158022e-06, "loss": 0.4514, "step": 1761 }, { "epoch": 1.7374907567167859, "grad_norm": 0.16542809413251114, "learning_rate": 3.77890699490546e-06, "loss": 0.4697, "step": 1762 }, { "epoch": 1.7384767069263, "grad_norm": 0.16354444721337727, "learning_rate": 3.7738920648794785e-06, "loss": 0.4756, "step": 1763 }, { "epoch": 1.7394626571358147, "grad_norm": 0.1669387947207977, "learning_rate": 3.768878446868576e-06, "loss": 0.4652, "step": 1764 }, { "epoch": 1.740448607345329, "grad_norm": 0.15513379263677446, "learning_rate": 3.7638661462376464e-06, "loss": 0.4514, "step": 1765 }, { "epoch": 1.7414345575548436, "grad_norm": 0.15457732346104522, "learning_rate": 3.7588551683501767e-06, "loss": 0.4557, "step": 1766 }, { "epoch": 1.7424205077643578, "grad_norm": 0.15684459051807895, "learning_rate": 3.753845518568241e-06, "loss": 0.4675, "step": 1767 }, { "epoch": 1.7434064579738724, "grad_norm": 0.1643668072910157, "learning_rate": 3.748837202252488e-06, "loss": 0.4613, "step": 1768 }, { "epoch": 1.7443924081833866, "grad_norm": 0.16720662992421156, "learning_rate": 3.7438302247621433e-06, "loss": 0.4556, "step": 1769 }, { "epoch": 1.7453783583929012, "grad_norm": 0.16017914106674547, "learning_rate": 3.738824591454996e-06, "loss": 0.4753, "step": 1770 }, { "epoch": 1.7463643086024154, "grad_norm": 0.16304058666183993, "learning_rate": 3.733820307687398e-06, "loss": 0.4606, "step": 1771 }, { "epoch": 1.74735025881193, "grad_norm": 0.15879668350572085, "learning_rate": 3.7288173788142586e-06, "loss": 0.4689, "step": 1772 }, { "epoch": 1.7483362090214443, "grad_norm": 0.16213042148565254, "learning_rate": 3.7238158101890376e-06, "loss": 0.4811, "step": 1773 }, { "epoch": 1.749322159230959, "grad_norm": 0.162055828243607, "learning_rate": 3.718815607163736e-06, "loss": 0.4846, "step": 1774 }, { "epoch": 1.750308109440473, "grad_norm": 0.1625855396752446, "learning_rate": 3.7138167750888985e-06, "loss": 0.4623, "step": 1775 }, { "epoch": 1.7512940596499877, "grad_norm": 0.15526605487307704, "learning_rate": 3.708819319313597e-06, "loss": 0.4621, "step": 1776 }, { "epoch": 1.752280009859502, "grad_norm": 0.16113481794568882, "learning_rate": 3.703823245185434e-06, "loss": 0.4538, "step": 1777 }, { "epoch": 1.7532659600690166, "grad_norm": 0.16413413164604834, "learning_rate": 3.6988285580505345e-06, "loss": 0.4737, "step": 1778 }, { "epoch": 1.7542519102785308, "grad_norm": 0.15714016554624696, "learning_rate": 3.693835263253538e-06, "loss": 0.464, "step": 1779 }, { "epoch": 1.7552378604880454, "grad_norm": 0.15705356629935893, "learning_rate": 3.6888433661375934e-06, "loss": 0.458, "step": 1780 }, { "epoch": 1.7562238106975596, "grad_norm": 0.16381244221833122, "learning_rate": 3.683852872044353e-06, "loss": 0.467, "step": 1781 }, { "epoch": 1.7572097609070743, "grad_norm": 0.16170181326889393, "learning_rate": 3.6788637863139716e-06, "loss": 0.4634, "step": 1782 }, { "epoch": 1.7581957111165885, "grad_norm": 0.1843592684562403, "learning_rate": 3.673876114285093e-06, "loss": 0.4656, "step": 1783 }, { "epoch": 1.759181661326103, "grad_norm": 0.15988105544370507, "learning_rate": 3.668889861294852e-06, "loss": 0.4761, "step": 1784 }, { "epoch": 1.7601676115356173, "grad_norm": 0.16154518889017053, "learning_rate": 3.6639050326788637e-06, "loss": 0.4482, "step": 1785 }, { "epoch": 1.761153561745132, "grad_norm": 0.16842499238085878, "learning_rate": 3.6589216337712153e-06, "loss": 0.4444, "step": 1786 }, { "epoch": 1.7621395119546461, "grad_norm": 0.15648832899751022, "learning_rate": 3.653939669904468e-06, "loss": 0.4564, "step": 1787 }, { "epoch": 1.7631254621641608, "grad_norm": 0.1561238377660689, "learning_rate": 3.6489591464096475e-06, "loss": 0.4554, "step": 1788 }, { "epoch": 1.764111412373675, "grad_norm": 0.15793615530774216, "learning_rate": 3.6439800686162354e-06, "loss": 0.481, "step": 1789 }, { "epoch": 1.7650973625831896, "grad_norm": 0.16722508169816377, "learning_rate": 3.639002441852173e-06, "loss": 0.4704, "step": 1790 }, { "epoch": 1.7660833127927038, "grad_norm": 0.15932686815960895, "learning_rate": 3.634026271443837e-06, "loss": 0.4488, "step": 1791 }, { "epoch": 1.7670692630022184, "grad_norm": 0.16354950036583038, "learning_rate": 3.629051562716058e-06, "loss": 0.4701, "step": 1792 }, { "epoch": 1.7680552132117326, "grad_norm": 0.15870969754085226, "learning_rate": 3.624078320992094e-06, "loss": 0.4509, "step": 1793 }, { "epoch": 1.7690411634212473, "grad_norm": 0.16063785348184123, "learning_rate": 3.6191065515936387e-06, "loss": 0.4533, "step": 1794 }, { "epoch": 1.7700271136307615, "grad_norm": 0.15752721509162096, "learning_rate": 3.6141362598408087e-06, "loss": 0.4633, "step": 1795 }, { "epoch": 1.7710130638402761, "grad_norm": 0.15630592366386475, "learning_rate": 3.609167451052135e-06, "loss": 0.4639, "step": 1796 }, { "epoch": 1.7719990140497903, "grad_norm": 0.15733705201561435, "learning_rate": 3.6042001305445693e-06, "loss": 0.4666, "step": 1797 }, { "epoch": 1.772984964259305, "grad_norm": 0.15860971912053975, "learning_rate": 3.5992343036334653e-06, "loss": 0.4607, "step": 1798 }, { "epoch": 1.7739709144688192, "grad_norm": 0.16061398006664077, "learning_rate": 3.5942699756325795e-06, "loss": 0.4715, "step": 1799 }, { "epoch": 1.7749568646783338, "grad_norm": 0.15145860278327483, "learning_rate": 3.5893071518540683e-06, "loss": 0.4335, "step": 1800 }, { "epoch": 1.775942814887848, "grad_norm": 0.1592736663241254, "learning_rate": 3.5843458376084715e-06, "loss": 0.4658, "step": 1801 }, { "epoch": 1.7769287650973626, "grad_norm": 0.16288409017228925, "learning_rate": 3.5793860382047185e-06, "loss": 0.4924, "step": 1802 }, { "epoch": 1.777914715306877, "grad_norm": 0.1702474657072719, "learning_rate": 3.5744277589501174e-06, "loss": 0.4745, "step": 1803 }, { "epoch": 1.7789006655163915, "grad_norm": 0.22498696327184153, "learning_rate": 3.569471005150349e-06, "loss": 0.4575, "step": 1804 }, { "epoch": 1.779886615725906, "grad_norm": 0.15754237349064829, "learning_rate": 3.5645157821094623e-06, "loss": 0.4635, "step": 1805 }, { "epoch": 1.7808725659354203, "grad_norm": 0.16167295071553067, "learning_rate": 3.5595620951298637e-06, "loss": 0.4728, "step": 1806 }, { "epoch": 1.7818585161449347, "grad_norm": 0.1543577728513248, "learning_rate": 3.554609949512324e-06, "loss": 0.4512, "step": 1807 }, { "epoch": 1.7828444663544492, "grad_norm": 0.16074266985328764, "learning_rate": 3.5496593505559575e-06, "loss": 0.4809, "step": 1808 }, { "epoch": 1.7838304165639636, "grad_norm": 0.17896578555040946, "learning_rate": 3.5447103035582285e-06, "loss": 0.4621, "step": 1809 }, { "epoch": 1.784816366773478, "grad_norm": 0.15710714831802694, "learning_rate": 3.53976281381494e-06, "loss": 0.4446, "step": 1810 }, { "epoch": 1.7858023169829924, "grad_norm": 0.15832639388066821, "learning_rate": 3.5348168866202226e-06, "loss": 0.4598, "step": 1811 }, { "epoch": 1.7867882671925068, "grad_norm": 0.18073821236033372, "learning_rate": 3.529872527266542e-06, "loss": 0.4631, "step": 1812 }, { "epoch": 1.7877742174020212, "grad_norm": 0.15627997258945053, "learning_rate": 3.5249297410446836e-06, "loss": 0.4615, "step": 1813 }, { "epoch": 1.7887601676115357, "grad_norm": 0.15690753303724347, "learning_rate": 3.519988533243749e-06, "loss": 0.4696, "step": 1814 }, { "epoch": 1.78974611782105, "grad_norm": 0.15726083379428013, "learning_rate": 3.515048909151154e-06, "loss": 0.4839, "step": 1815 }, { "epoch": 1.7907320680305645, "grad_norm": 0.15438431777406547, "learning_rate": 3.5101108740526134e-06, "loss": 0.4549, "step": 1816 }, { "epoch": 1.791718018240079, "grad_norm": 0.15512043084625451, "learning_rate": 3.505174433232147e-06, "loss": 0.4491, "step": 1817 }, { "epoch": 1.7927039684495933, "grad_norm": 0.1633839584996681, "learning_rate": 3.500239591972065e-06, "loss": 0.4543, "step": 1818 }, { "epoch": 1.7936899186591078, "grad_norm": 0.1616122255325751, "learning_rate": 3.4953063555529703e-06, "loss": 0.486, "step": 1819 }, { "epoch": 1.7946758688686222, "grad_norm": 0.16712653497862173, "learning_rate": 3.4903747292537467e-06, "loss": 0.4659, "step": 1820 }, { "epoch": 1.7956618190781366, "grad_norm": 0.16922792213817878, "learning_rate": 3.4854447183515504e-06, "loss": 0.4615, "step": 1821 }, { "epoch": 1.796647769287651, "grad_norm": 0.16721935928538403, "learning_rate": 3.480516328121817e-06, "loss": 0.4344, "step": 1822 }, { "epoch": 1.7976337194971654, "grad_norm": 0.15720134451512144, "learning_rate": 3.4755895638382413e-06, "loss": 0.448, "step": 1823 }, { "epoch": 1.7986196697066799, "grad_norm": 0.1571325895931652, "learning_rate": 3.4706644307727833e-06, "loss": 0.4611, "step": 1824 }, { "epoch": 1.7996056199161943, "grad_norm": 0.16817426785366024, "learning_rate": 3.465740934195655e-06, "loss": 0.4626, "step": 1825 }, { "epoch": 1.8005915701257087, "grad_norm": 0.16516151765217899, "learning_rate": 3.460819079375315e-06, "loss": 0.4655, "step": 1826 }, { "epoch": 1.8015775203352231, "grad_norm": 0.16229957509756535, "learning_rate": 3.4558988715784677e-06, "loss": 0.4675, "step": 1827 }, { "epoch": 1.8025634705447375, "grad_norm": 0.15799579555997448, "learning_rate": 3.4509803160700562e-06, "loss": 0.4684, "step": 1828 }, { "epoch": 1.803549420754252, "grad_norm": 0.1662848802427933, "learning_rate": 3.4460634181132534e-06, "loss": 0.4579, "step": 1829 }, { "epoch": 1.8045353709637664, "grad_norm": 0.16554653297373703, "learning_rate": 3.4411481829694627e-06, "loss": 0.4767, "step": 1830 }, { "epoch": 1.8055213211732808, "grad_norm": 0.17551491880662762, "learning_rate": 3.4362346158982985e-06, "loss": 0.4516, "step": 1831 }, { "epoch": 1.8065072713827952, "grad_norm": 0.1599166072934886, "learning_rate": 3.4313227221576008e-06, "loss": 0.4843, "step": 1832 }, { "epoch": 1.8074932215923096, "grad_norm": 0.15895594620786174, "learning_rate": 3.4264125070034115e-06, "loss": 0.4666, "step": 1833 }, { "epoch": 1.808479171801824, "grad_norm": 0.16165619505400786, "learning_rate": 3.4215039756899836e-06, "loss": 0.455, "step": 1834 }, { "epoch": 1.8094651220113385, "grad_norm": 0.15832254009515392, "learning_rate": 3.4165971334697633e-06, "loss": 0.4558, "step": 1835 }, { "epoch": 1.8104510722208529, "grad_norm": 0.15990852763698515, "learning_rate": 3.411691985593387e-06, "loss": 0.4596, "step": 1836 }, { "epoch": 1.8114370224303673, "grad_norm": 0.15622449666788027, "learning_rate": 3.406788537309685e-06, "loss": 0.4537, "step": 1837 }, { "epoch": 1.8124229726398817, "grad_norm": 0.17240945192540147, "learning_rate": 3.401886793865663e-06, "loss": 0.4598, "step": 1838 }, { "epoch": 1.8134089228493961, "grad_norm": 0.158165938089437, "learning_rate": 3.3969867605065055e-06, "loss": 0.4489, "step": 1839 }, { "epoch": 1.8143948730589106, "grad_norm": 0.1604750780707797, "learning_rate": 3.3920884424755674e-06, "loss": 0.4818, "step": 1840 }, { "epoch": 1.815380823268425, "grad_norm": 0.15646417111600883, "learning_rate": 3.3871918450143647e-06, "loss": 0.4604, "step": 1841 }, { "epoch": 1.8163667734779394, "grad_norm": 0.15902233562082224, "learning_rate": 3.3822969733625747e-06, "loss": 0.431, "step": 1842 }, { "epoch": 1.8173527236874538, "grad_norm": 0.17308042764576753, "learning_rate": 3.37740383275803e-06, "loss": 0.4647, "step": 1843 }, { "epoch": 1.8183386738969682, "grad_norm": 0.16678974761648963, "learning_rate": 3.3725124284367074e-06, "loss": 0.461, "step": 1844 }, { "epoch": 1.8193246241064827, "grad_norm": 0.16282534173302918, "learning_rate": 3.3676227656327277e-06, "loss": 0.4458, "step": 1845 }, { "epoch": 1.820310574315997, "grad_norm": 0.15062473045003175, "learning_rate": 3.3627348495783445e-06, "loss": 0.4575, "step": 1846 }, { "epoch": 1.8212965245255115, "grad_norm": 0.18143489354081666, "learning_rate": 3.3578486855039488e-06, "loss": 0.4439, "step": 1847 }, { "epoch": 1.822282474735026, "grad_norm": 0.15234651761383963, "learning_rate": 3.352964278638051e-06, "loss": 0.4602, "step": 1848 }, { "epoch": 1.8232684249445403, "grad_norm": 0.15984031597904724, "learning_rate": 3.3480816342072853e-06, "loss": 0.4689, "step": 1849 }, { "epoch": 1.8242543751540548, "grad_norm": 0.1628495763093985, "learning_rate": 3.343200757436399e-06, "loss": 0.4642, "step": 1850 }, { "epoch": 1.8252403253635692, "grad_norm": 0.15565905371256994, "learning_rate": 3.338321653548244e-06, "loss": 0.4618, "step": 1851 }, { "epoch": 1.8262262755730836, "grad_norm": 0.16164990092885384, "learning_rate": 3.3334443277637786e-06, "loss": 0.4638, "step": 1852 }, { "epoch": 1.827212225782598, "grad_norm": 0.1577594623352456, "learning_rate": 3.3285687853020604e-06, "loss": 0.4724, "step": 1853 }, { "epoch": 1.8281981759921124, "grad_norm": 0.1625290547710255, "learning_rate": 3.3236950313802334e-06, "loss": 0.4676, "step": 1854 }, { "epoch": 1.8291841262016268, "grad_norm": 0.15682998914590174, "learning_rate": 3.318823071213534e-06, "loss": 0.4641, "step": 1855 }, { "epoch": 1.8301700764111413, "grad_norm": 0.16434356883191334, "learning_rate": 3.313952910015274e-06, "loss": 0.4738, "step": 1856 }, { "epoch": 1.8311560266206557, "grad_norm": 0.16041096783467473, "learning_rate": 3.3090845529968414e-06, "loss": 0.4554, "step": 1857 }, { "epoch": 1.83214197683017, "grad_norm": 0.15907413737106404, "learning_rate": 3.3042180053676937e-06, "loss": 0.4767, "step": 1858 }, { "epoch": 1.8331279270396845, "grad_norm": 0.18226908874849415, "learning_rate": 3.2993532723353548e-06, "loss": 0.4808, "step": 1859 }, { "epoch": 1.834113877249199, "grad_norm": 0.1570027521805479, "learning_rate": 3.2944903591054033e-06, "loss": 0.4534, "step": 1860 }, { "epoch": 1.8350998274587134, "grad_norm": 0.16304082965163516, "learning_rate": 3.2896292708814736e-06, "loss": 0.4533, "step": 1861 }, { "epoch": 1.8360857776682278, "grad_norm": 0.15870440809124775, "learning_rate": 3.284770012865245e-06, "loss": 0.4522, "step": 1862 }, { "epoch": 1.8370717278777422, "grad_norm": 0.15842408817291853, "learning_rate": 3.279912590256438e-06, "loss": 0.459, "step": 1863 }, { "epoch": 1.8380576780872566, "grad_norm": 0.15994740780885353, "learning_rate": 3.275057008252809e-06, "loss": 0.4503, "step": 1864 }, { "epoch": 1.839043628296771, "grad_norm": 0.1580316715220139, "learning_rate": 3.270203272050149e-06, "loss": 0.4585, "step": 1865 }, { "epoch": 1.8400295785062855, "grad_norm": 0.19051325191228838, "learning_rate": 3.265351386842271e-06, "loss": 0.4579, "step": 1866 }, { "epoch": 1.8410155287157999, "grad_norm": 0.1566686670213712, "learning_rate": 3.2605013578210033e-06, "loss": 0.4541, "step": 1867 }, { "epoch": 1.8420014789253143, "grad_norm": 0.1560100801294482, "learning_rate": 3.2556531901761945e-06, "loss": 0.4591, "step": 1868 }, { "epoch": 1.8429874291348287, "grad_norm": 0.1588568819489083, "learning_rate": 3.250806889095698e-06, "loss": 0.4564, "step": 1869 }, { "epoch": 1.8439733793443431, "grad_norm": 0.157291542776539, "learning_rate": 3.2459624597653703e-06, "loss": 0.4668, "step": 1870 }, { "epoch": 1.8449593295538576, "grad_norm": 0.1601965240155022, "learning_rate": 3.241119907369068e-06, "loss": 0.4768, "step": 1871 }, { "epoch": 1.845945279763372, "grad_norm": 0.157013646106603, "learning_rate": 3.2362792370886325e-06, "loss": 0.4704, "step": 1872 }, { "epoch": 1.8469312299728864, "grad_norm": 0.21012822042122972, "learning_rate": 3.231440454103896e-06, "loss": 0.467, "step": 1873 }, { "epoch": 1.8479171801824008, "grad_norm": 0.1577460006053354, "learning_rate": 3.226603563592672e-06, "loss": 0.4682, "step": 1874 }, { "epoch": 1.8489031303919152, "grad_norm": 0.1571004003202859, "learning_rate": 3.2217685707307454e-06, "loss": 0.4615, "step": 1875 }, { "epoch": 1.8498890806014296, "grad_norm": 0.1684311540483634, "learning_rate": 3.2169354806918773e-06, "loss": 0.4494, "step": 1876 }, { "epoch": 1.850875030810944, "grad_norm": 0.15444772127631406, "learning_rate": 3.21210429864778e-06, "loss": 0.4428, "step": 1877 }, { "epoch": 1.8518609810204585, "grad_norm": 0.15973429963787777, "learning_rate": 3.2072750297681375e-06, "loss": 0.4717, "step": 1878 }, { "epoch": 1.852846931229973, "grad_norm": 0.1525158645228592, "learning_rate": 3.2024476792205783e-06, "loss": 0.4578, "step": 1879 }, { "epoch": 1.8538328814394873, "grad_norm": 0.1655696267227171, "learning_rate": 3.1976222521706834e-06, "loss": 0.4763, "step": 1880 }, { "epoch": 1.8548188316490017, "grad_norm": 0.1540107823048566, "learning_rate": 3.1927987537819717e-06, "loss": 0.4506, "step": 1881 }, { "epoch": 1.8558047818585162, "grad_norm": 0.1521294452532488, "learning_rate": 3.1879771892158972e-06, "loss": 0.4464, "step": 1882 }, { "epoch": 1.8567907320680306, "grad_norm": 0.16265111807762048, "learning_rate": 3.18315756363185e-06, "loss": 0.4716, "step": 1883 }, { "epoch": 1.857776682277545, "grad_norm": 0.18365951500532943, "learning_rate": 3.178339882187142e-06, "loss": 0.4762, "step": 1884 }, { "epoch": 1.8587626324870594, "grad_norm": 0.19219541326891357, "learning_rate": 3.173524150037003e-06, "loss": 0.4549, "step": 1885 }, { "epoch": 1.8597485826965738, "grad_norm": 0.15866139316926775, "learning_rate": 3.1687103723345824e-06, "loss": 0.4625, "step": 1886 }, { "epoch": 1.8607345329060883, "grad_norm": 0.16772825255039564, "learning_rate": 3.163898554230932e-06, "loss": 0.4661, "step": 1887 }, { "epoch": 1.8617204831156027, "grad_norm": 0.1605491227208391, "learning_rate": 3.1590887008750092e-06, "loss": 0.4404, "step": 1888 }, { "epoch": 1.862706433325117, "grad_norm": 0.1495008561900215, "learning_rate": 3.154280817413672e-06, "loss": 0.4455, "step": 1889 }, { "epoch": 1.8636923835346315, "grad_norm": 0.15886459063277308, "learning_rate": 3.1494749089916652e-06, "loss": 0.4537, "step": 1890 }, { "epoch": 1.864678333744146, "grad_norm": 0.1640140958159717, "learning_rate": 3.144670980751625e-06, "loss": 0.4619, "step": 1891 }, { "epoch": 1.8656642839536604, "grad_norm": 0.15834493781763131, "learning_rate": 3.139869037834064e-06, "loss": 0.468, "step": 1892 }, { "epoch": 1.8666502341631748, "grad_norm": 0.16134168749488406, "learning_rate": 3.1350690853773746e-06, "loss": 0.4398, "step": 1893 }, { "epoch": 1.8676361843726892, "grad_norm": 0.2253324483513052, "learning_rate": 3.1302711285178156e-06, "loss": 0.4507, "step": 1894 }, { "epoch": 1.8686221345822036, "grad_norm": 0.16004373862290389, "learning_rate": 3.125475172389515e-06, "loss": 0.4777, "step": 1895 }, { "epoch": 1.869608084791718, "grad_norm": 0.15471993032254314, "learning_rate": 3.120681222124457e-06, "loss": 0.451, "step": 1896 }, { "epoch": 1.8705940350012324, "grad_norm": 0.3721044055142246, "learning_rate": 3.115889282852477e-06, "loss": 0.4748, "step": 1897 }, { "epoch": 1.8715799852107469, "grad_norm": 0.1592218948443952, "learning_rate": 3.1110993597012616e-06, "loss": 0.4709, "step": 1898 }, { "epoch": 1.8725659354202613, "grad_norm": 0.15870421761671272, "learning_rate": 3.106311457796341e-06, "loss": 0.4398, "step": 1899 }, { "epoch": 1.8735518856297757, "grad_norm": 0.15730969558231842, "learning_rate": 3.1015255822610794e-06, "loss": 0.4632, "step": 1900 }, { "epoch": 1.8745378358392901, "grad_norm": 0.16061188271156304, "learning_rate": 3.0967417382166777e-06, "loss": 0.4739, "step": 1901 }, { "epoch": 1.8755237860488045, "grad_norm": 0.15793561334312298, "learning_rate": 3.0919599307821556e-06, "loss": 0.4659, "step": 1902 }, { "epoch": 1.876509736258319, "grad_norm": 0.16235126158620328, "learning_rate": 3.0871801650743583e-06, "loss": 0.4707, "step": 1903 }, { "epoch": 1.8774956864678334, "grad_norm": 0.16356268873089355, "learning_rate": 3.082402446207946e-06, "loss": 0.4748, "step": 1904 }, { "epoch": 1.8784816366773478, "grad_norm": 0.15619435962510766, "learning_rate": 3.0776267792953886e-06, "loss": 0.4399, "step": 1905 }, { "epoch": 1.8794675868868622, "grad_norm": 0.16595363086120873, "learning_rate": 3.07285316944696e-06, "loss": 0.4562, "step": 1906 }, { "epoch": 1.8804535370963766, "grad_norm": 0.15997234378256922, "learning_rate": 3.0680816217707293e-06, "loss": 0.4632, "step": 1907 }, { "epoch": 1.881439487305891, "grad_norm": 0.16296615025499692, "learning_rate": 3.063312141372566e-06, "loss": 0.4649, "step": 1908 }, { "epoch": 1.8824254375154055, "grad_norm": 0.16153964356530928, "learning_rate": 3.0585447333561206e-06, "loss": 0.4659, "step": 1909 }, { "epoch": 1.88341138772492, "grad_norm": 0.1555280681334106, "learning_rate": 3.0537794028228327e-06, "loss": 0.4662, "step": 1910 }, { "epoch": 1.8843973379344343, "grad_norm": 0.15917860361428368, "learning_rate": 3.049016154871914e-06, "loss": 0.4555, "step": 1911 }, { "epoch": 1.8853832881439487, "grad_norm": 0.15686598908876545, "learning_rate": 3.0442549946003475e-06, "loss": 0.4671, "step": 1912 }, { "epoch": 1.8863692383534632, "grad_norm": 0.15708166799382442, "learning_rate": 3.0394959271028836e-06, "loss": 0.4767, "step": 1913 }, { "epoch": 1.8873551885629776, "grad_norm": 0.16339088997013207, "learning_rate": 3.0347389574720355e-06, "loss": 0.4668, "step": 1914 }, { "epoch": 1.888341138772492, "grad_norm": 0.15546193949380208, "learning_rate": 3.029984090798067e-06, "loss": 0.4565, "step": 1915 }, { "epoch": 1.8893270889820064, "grad_norm": 0.15540520000015712, "learning_rate": 3.025231332168999e-06, "loss": 0.4347, "step": 1916 }, { "epoch": 1.8903130391915208, "grad_norm": 0.15639257919675925, "learning_rate": 3.0204806866705847e-06, "loss": 0.4656, "step": 1917 }, { "epoch": 1.8912989894010352, "grad_norm": 0.1590228273368216, "learning_rate": 3.015732159386329e-06, "loss": 0.4743, "step": 1918 }, { "epoch": 1.8922849396105497, "grad_norm": 0.1581612910456602, "learning_rate": 3.0109857553974598e-06, "loss": 0.4666, "step": 1919 }, { "epoch": 1.893270889820064, "grad_norm": 0.18049981454409514, "learning_rate": 3.006241479782941e-06, "loss": 0.4556, "step": 1920 }, { "epoch": 1.8942568400295785, "grad_norm": 0.16070363536732327, "learning_rate": 3.0014993376194555e-06, "loss": 0.457, "step": 1921 }, { "epoch": 1.895242790239093, "grad_norm": 0.165616397602939, "learning_rate": 2.9967593339814003e-06, "loss": 0.4563, "step": 1922 }, { "epoch": 1.8962287404486073, "grad_norm": 0.1559989880531137, "learning_rate": 2.992021473940888e-06, "loss": 0.4689, "step": 1923 }, { "epoch": 1.8972146906581218, "grad_norm": 0.15719253856621765, "learning_rate": 2.9872857625677382e-06, "loss": 0.4743, "step": 1924 }, { "epoch": 1.8982006408676362, "grad_norm": 0.15945375444114104, "learning_rate": 2.982552204929467e-06, "loss": 0.4548, "step": 1925 }, { "epoch": 1.8991865910771506, "grad_norm": 0.18053807856357737, "learning_rate": 2.9778208060912915e-06, "loss": 0.4808, "step": 1926 }, { "epoch": 1.900172541286665, "grad_norm": 0.1695676212701229, "learning_rate": 2.9730915711161125e-06, "loss": 0.4611, "step": 1927 }, { "epoch": 1.9011584914961794, "grad_norm": 0.1577863548799166, "learning_rate": 2.968364505064518e-06, "loss": 0.4669, "step": 1928 }, { "epoch": 1.9021444417056939, "grad_norm": 0.16087139519802168, "learning_rate": 2.963639612994779e-06, "loss": 0.4398, "step": 1929 }, { "epoch": 1.9031303919152083, "grad_norm": 0.15507935944597756, "learning_rate": 2.9589168999628335e-06, "loss": 0.4513, "step": 1930 }, { "epoch": 1.9041163421247227, "grad_norm": 0.21589745345780148, "learning_rate": 2.9541963710222944e-06, "loss": 0.4655, "step": 1931 }, { "epoch": 1.9051022923342371, "grad_norm": 0.15433199589637409, "learning_rate": 2.9494780312244293e-06, "loss": 0.451, "step": 1932 }, { "epoch": 1.9060882425437515, "grad_norm": 0.15924290014028386, "learning_rate": 2.9447618856181713e-06, "loss": 0.4584, "step": 1933 }, { "epoch": 1.907074192753266, "grad_norm": 0.16140321856670797, "learning_rate": 2.9400479392501015e-06, "loss": 0.4595, "step": 1934 }, { "epoch": 1.9080601429627804, "grad_norm": 0.16019290065835354, "learning_rate": 2.935336197164449e-06, "loss": 0.4793, "step": 1935 }, { "epoch": 1.9090460931722948, "grad_norm": 0.16092143984089813, "learning_rate": 2.9306266644030836e-06, "loss": 0.4739, "step": 1936 }, { "epoch": 1.9100320433818092, "grad_norm": 0.1633958525890816, "learning_rate": 2.925919346005512e-06, "loss": 0.4691, "step": 1937 }, { "epoch": 1.9110179935913236, "grad_norm": 0.15500558425847583, "learning_rate": 2.9212142470088665e-06, "loss": 0.4546, "step": 1938 }, { "epoch": 1.912003943800838, "grad_norm": 0.1482116239473967, "learning_rate": 2.916511372447912e-06, "loss": 0.4374, "step": 1939 }, { "epoch": 1.9129898940103525, "grad_norm": 0.15839043230733438, "learning_rate": 2.9118107273550295e-06, "loss": 0.4529, "step": 1940 }, { "epoch": 1.9139758442198669, "grad_norm": 0.17484570335708627, "learning_rate": 2.907112316760213e-06, "loss": 0.454, "step": 1941 }, { "epoch": 1.9149617944293813, "grad_norm": 0.16851150199128412, "learning_rate": 2.9024161456910704e-06, "loss": 0.4565, "step": 1942 }, { "epoch": 1.9159477446388957, "grad_norm": 0.15688125899118022, "learning_rate": 2.8977222191728015e-06, "loss": 0.4519, "step": 1943 }, { "epoch": 1.9169336948484101, "grad_norm": 0.15899098416715704, "learning_rate": 2.89303054222822e-06, "loss": 0.4529, "step": 1944 }, { "epoch": 1.9179196450579246, "grad_norm": 0.1610388113835398, "learning_rate": 2.8883411198777224e-06, "loss": 0.4635, "step": 1945 }, { "epoch": 1.918905595267439, "grad_norm": 0.16148759270031587, "learning_rate": 2.883653957139294e-06, "loss": 0.4536, "step": 1946 }, { "epoch": 1.9198915454769534, "grad_norm": 0.1575978985313557, "learning_rate": 2.8789690590285048e-06, "loss": 0.4508, "step": 1947 }, { "epoch": 1.9208774956864678, "grad_norm": 0.16225728719964383, "learning_rate": 2.8742864305584982e-06, "loss": 0.4568, "step": 1948 }, { "epoch": 1.9218634458959822, "grad_norm": 0.15894418451652362, "learning_rate": 2.8696060767399926e-06, "loss": 0.4653, "step": 1949 }, { "epoch": 1.9228493961054967, "grad_norm": 0.15910831562107275, "learning_rate": 2.8649280025812688e-06, "loss": 0.475, "step": 1950 }, { "epoch": 1.923835346315011, "grad_norm": 0.15411775631030855, "learning_rate": 2.860252213088168e-06, "loss": 0.459, "step": 1951 }, { "epoch": 1.9248212965245255, "grad_norm": 0.15869065524896728, "learning_rate": 2.8555787132640943e-06, "loss": 0.4604, "step": 1952 }, { "epoch": 1.92580724673404, "grad_norm": 0.15371324508352271, "learning_rate": 2.8509075081099913e-06, "loss": 0.4487, "step": 1953 }, { "epoch": 1.9267931969435543, "grad_norm": 0.1663553768569787, "learning_rate": 2.84623860262435e-06, "loss": 0.4455, "step": 1954 }, { "epoch": 1.9277791471530688, "grad_norm": 0.16223804975948958, "learning_rate": 2.8415720018032066e-06, "loss": 0.4482, "step": 1955 }, { "epoch": 1.9287650973625832, "grad_norm": 0.16013186345810299, "learning_rate": 2.8369077106401266e-06, "loss": 0.4574, "step": 1956 }, { "epoch": 1.9297510475720976, "grad_norm": 0.1569134362810151, "learning_rate": 2.8322457341262044e-06, "loss": 0.4684, "step": 1957 }, { "epoch": 1.930736997781612, "grad_norm": 0.1624895711166642, "learning_rate": 2.827586077250052e-06, "loss": 0.4536, "step": 1958 }, { "epoch": 1.9317229479911264, "grad_norm": 0.1571745912877177, "learning_rate": 2.8229287449978105e-06, "loss": 0.4618, "step": 1959 }, { "epoch": 1.9327088982006408, "grad_norm": 0.15917210331188558, "learning_rate": 2.8182737423531264e-06, "loss": 0.4686, "step": 1960 }, { "epoch": 1.9336948484101553, "grad_norm": 0.1560915049196943, "learning_rate": 2.813621074297155e-06, "loss": 0.4443, "step": 1961 }, { "epoch": 1.9346807986196697, "grad_norm": 0.15844711297710645, "learning_rate": 2.808970745808551e-06, "loss": 0.4712, "step": 1962 }, { "epoch": 1.935666748829184, "grad_norm": 0.16425246698893556, "learning_rate": 2.8043227618634703e-06, "loss": 0.4548, "step": 1963 }, { "epoch": 1.9366526990386985, "grad_norm": 0.17328731147764184, "learning_rate": 2.799677127435556e-06, "loss": 0.4524, "step": 1964 }, { "epoch": 1.937638649248213, "grad_norm": 0.16153363030874587, "learning_rate": 2.7950338474959395e-06, "loss": 0.4531, "step": 1965 }, { "epoch": 1.9386245994577274, "grad_norm": 0.15840085787844096, "learning_rate": 2.790392927013228e-06, "loss": 0.4634, "step": 1966 }, { "epoch": 1.9396105496672418, "grad_norm": 0.156182280688547, "learning_rate": 2.785754370953515e-06, "loss": 0.4659, "step": 1967 }, { "epoch": 1.9405964998767562, "grad_norm": 0.1577606130078244, "learning_rate": 2.7811181842803504e-06, "loss": 0.4562, "step": 1968 }, { "epoch": 1.9415824500862706, "grad_norm": 0.16285074167980573, "learning_rate": 2.7764843719547562e-06, "loss": 0.4673, "step": 1969 }, { "epoch": 1.942568400295785, "grad_norm": 0.16210584044877782, "learning_rate": 2.7718529389352123e-06, "loss": 0.4427, "step": 1970 }, { "epoch": 1.9435543505052995, "grad_norm": 0.15264033577467623, "learning_rate": 2.7672238901776537e-06, "loss": 0.4622, "step": 1971 }, { "epoch": 1.9445403007148139, "grad_norm": 0.1553511328644889, "learning_rate": 2.7625972306354652e-06, "loss": 0.4601, "step": 1972 }, { "epoch": 1.9455262509243283, "grad_norm": 0.15749846113915345, "learning_rate": 2.7579729652594667e-06, "loss": 0.4631, "step": 1973 }, { "epoch": 1.9465122011338427, "grad_norm": 0.1523349961098614, "learning_rate": 2.7533510989979272e-06, "loss": 0.4386, "step": 1974 }, { "epoch": 1.9474981513433571, "grad_norm": 0.15286019953152136, "learning_rate": 2.7487316367965435e-06, "loss": 0.4512, "step": 1975 }, { "epoch": 1.9484841015528716, "grad_norm": 0.16000652795911166, "learning_rate": 2.7441145835984374e-06, "loss": 0.4665, "step": 1976 }, { "epoch": 1.949470051762386, "grad_norm": 0.17317625216154026, "learning_rate": 2.739499944344157e-06, "loss": 0.4704, "step": 1977 }, { "epoch": 1.9504560019719004, "grad_norm": 0.15390488605498684, "learning_rate": 2.7348877239716665e-06, "loss": 0.4502, "step": 1978 }, { "epoch": 1.9514419521814148, "grad_norm": 0.15929470966946352, "learning_rate": 2.730277927416339e-06, "loss": 0.4646, "step": 1979 }, { "epoch": 1.9524279023909292, "grad_norm": 0.16268777255975028, "learning_rate": 2.725670559610959e-06, "loss": 0.4726, "step": 1980 }, { "epoch": 1.9534138526004436, "grad_norm": 0.1701458164230504, "learning_rate": 2.721065625485705e-06, "loss": 0.451, "step": 1981 }, { "epoch": 1.954399802809958, "grad_norm": 0.15856203668413626, "learning_rate": 2.716463129968161e-06, "loss": 0.4578, "step": 1982 }, { "epoch": 1.9553857530194725, "grad_norm": 0.15688745928162184, "learning_rate": 2.7118630779832918e-06, "loss": 0.4334, "step": 1983 }, { "epoch": 1.956371703228987, "grad_norm": 0.15049989939886538, "learning_rate": 2.707265474453452e-06, "loss": 0.4439, "step": 1984 }, { "epoch": 1.9573576534385013, "grad_norm": 0.15598612242573837, "learning_rate": 2.702670324298375e-06, "loss": 0.4749, "step": 1985 }, { "epoch": 1.9583436036480157, "grad_norm": 0.15765072424731968, "learning_rate": 2.698077632435173e-06, "loss": 0.4553, "step": 1986 }, { "epoch": 1.9593295538575302, "grad_norm": 0.15897114221475683, "learning_rate": 2.6934874037783245e-06, "loss": 0.471, "step": 1987 }, { "epoch": 1.9603155040670446, "grad_norm": 0.15597262811001858, "learning_rate": 2.688899643239668e-06, "loss": 0.4539, "step": 1988 }, { "epoch": 1.961301454276559, "grad_norm": 0.15694708957284073, "learning_rate": 2.6843143557284055e-06, "loss": 0.4427, "step": 1989 }, { "epoch": 1.9622874044860734, "grad_norm": 0.16147762390646708, "learning_rate": 2.6797315461510965e-06, "loss": 0.4531, "step": 1990 }, { "epoch": 1.9632733546955878, "grad_norm": 0.16235553988010826, "learning_rate": 2.6751512194116415e-06, "loss": 0.452, "step": 1991 }, { "epoch": 1.9642593049051023, "grad_norm": 0.15833390799471556, "learning_rate": 2.6705733804112886e-06, "loss": 0.4545, "step": 1992 }, { "epoch": 1.9652452551146167, "grad_norm": 0.15819603071568658, "learning_rate": 2.6659980340486225e-06, "loss": 0.4551, "step": 1993 }, { "epoch": 1.966231205324131, "grad_norm": 0.15336847229022132, "learning_rate": 2.66142518521956e-06, "loss": 0.4269, "step": 1994 }, { "epoch": 1.9672171555336455, "grad_norm": 0.15976227175164648, "learning_rate": 2.656854838817347e-06, "loss": 0.4716, "step": 1995 }, { "epoch": 1.96820310574316, "grad_norm": 0.1568909109038371, "learning_rate": 2.6522869997325505e-06, "loss": 0.4582, "step": 1996 }, { "epoch": 1.9691890559526743, "grad_norm": 0.15471315466033486, "learning_rate": 2.6477216728530553e-06, "loss": 0.448, "step": 1997 }, { "epoch": 1.9701750061621888, "grad_norm": 0.1568241985290219, "learning_rate": 2.643158863064057e-06, "loss": 0.4525, "step": 1998 }, { "epoch": 1.9711609563717032, "grad_norm": 0.1552060004095341, "learning_rate": 2.638598575248058e-06, "loss": 0.4552, "step": 1999 }, { "epoch": 1.9721469065812176, "grad_norm": 0.15262270444362303, "learning_rate": 2.6340408142848607e-06, "loss": 0.4451, "step": 2000 }, { "epoch": 1.973132856790732, "grad_norm": 0.15713750671861487, "learning_rate": 2.6294855850515687e-06, "loss": 0.4597, "step": 2001 }, { "epoch": 1.9741188070002464, "grad_norm": 0.16171825414027022, "learning_rate": 2.6249328924225737e-06, "loss": 0.4739, "step": 2002 }, { "epoch": 1.9751047572097609, "grad_norm": 0.1564439191029133, "learning_rate": 2.6203827412695482e-06, "loss": 0.4519, "step": 2003 }, { "epoch": 1.9760907074192753, "grad_norm": 0.1555304959709648, "learning_rate": 2.6158351364614487e-06, "loss": 0.4401, "step": 2004 }, { "epoch": 1.9770766576287897, "grad_norm": 0.15741543443360018, "learning_rate": 2.6112900828645116e-06, "loss": 0.4583, "step": 2005 }, { "epoch": 1.9780626078383041, "grad_norm": 0.1590870159959881, "learning_rate": 2.606747585342238e-06, "loss": 0.4596, "step": 2006 }, { "epoch": 1.9790485580478185, "grad_norm": 0.16034416018300318, "learning_rate": 2.602207648755395e-06, "loss": 0.4688, "step": 2007 }, { "epoch": 1.980034508257333, "grad_norm": 0.15786997284929244, "learning_rate": 2.5976702779620077e-06, "loss": 0.4623, "step": 2008 }, { "epoch": 1.9810204584668474, "grad_norm": 0.15252651356961516, "learning_rate": 2.59313547781736e-06, "loss": 0.4419, "step": 2009 }, { "epoch": 1.9820064086763618, "grad_norm": 0.16291103856756803, "learning_rate": 2.5886032531739813e-06, "loss": 0.4622, "step": 2010 }, { "epoch": 1.9829923588858762, "grad_norm": 0.1583943249207326, "learning_rate": 2.584073608881645e-06, "loss": 0.4578, "step": 2011 }, { "epoch": 1.9839783090953906, "grad_norm": 0.15954374949997202, "learning_rate": 2.579546549787366e-06, "loss": 0.4709, "step": 2012 }, { "epoch": 1.984964259304905, "grad_norm": 0.17337484989325813, "learning_rate": 2.575022080735391e-06, "loss": 0.4996, "step": 2013 }, { "epoch": 1.9859502095144195, "grad_norm": 0.15914435262299276, "learning_rate": 2.5705002065671956e-06, "loss": 0.469, "step": 2014 }, { "epoch": 1.986936159723934, "grad_norm": 0.15636976490103763, "learning_rate": 2.5659809321214784e-06, "loss": 0.4543, "step": 2015 }, { "epoch": 1.9879221099334483, "grad_norm": 0.18144576680530625, "learning_rate": 2.5614642622341548e-06, "loss": 0.4543, "step": 2016 }, { "epoch": 1.9889080601429627, "grad_norm": 0.15069657104321396, "learning_rate": 2.5569502017383585e-06, "loss": 0.4472, "step": 2017 }, { "epoch": 1.9898940103524771, "grad_norm": 0.1581192101077221, "learning_rate": 2.552438755464427e-06, "loss": 0.4503, "step": 2018 }, { "epoch": 1.9908799605619916, "grad_norm": 0.15794622540757225, "learning_rate": 2.547929928239895e-06, "loss": 0.4648, "step": 2019 }, { "epoch": 1.991865910771506, "grad_norm": 0.16638480468830721, "learning_rate": 2.543423724889506e-06, "loss": 0.4699, "step": 2020 }, { "epoch": 1.9928518609810204, "grad_norm": 0.1638747412332113, "learning_rate": 2.538920150235189e-06, "loss": 0.4483, "step": 2021 }, { "epoch": 1.9938378111905348, "grad_norm": 0.15361835011940492, "learning_rate": 2.5344192090960594e-06, "loss": 0.4588, "step": 2022 }, { "epoch": 1.9948237614000492, "grad_norm": 0.17243925193115178, "learning_rate": 2.5299209062884185e-06, "loss": 0.4734, "step": 2023 }, { "epoch": 1.9958097116095637, "grad_norm": 0.15873734963755304, "learning_rate": 2.5254252466257412e-06, "loss": 0.4612, "step": 2024 }, { "epoch": 1.996795661819078, "grad_norm": 0.1613122922200011, "learning_rate": 2.5209322349186743e-06, "loss": 0.4694, "step": 2025 }, { "epoch": 1.9977816120285925, "grad_norm": 0.1580632926224322, "learning_rate": 2.5164418759750337e-06, "loss": 0.453, "step": 2026 }, { "epoch": 1.998767562238107, "grad_norm": 0.15721020580588313, "learning_rate": 2.5119541745997923e-06, "loss": 0.4544, "step": 2027 }, { "epoch": 1.9997535124476213, "grad_norm": 0.160681699855714, "learning_rate": 2.507469135595087e-06, "loss": 0.4667, "step": 2028 }, { "epoch": 2.000739462657136, "grad_norm": 0.16414460372966053, "learning_rate": 2.5029867637601955e-06, "loss": 0.4667, "step": 2029 }, { "epoch": 2.00172541286665, "grad_norm": 0.15005003920043286, "learning_rate": 2.4985070638915485e-06, "loss": 0.4632, "step": 2030 }, { "epoch": 2.0027113630761644, "grad_norm": 0.15917893988755794, "learning_rate": 2.494030040782714e-06, "loss": 0.4464, "step": 2031 }, { "epoch": 2.003697313285679, "grad_norm": 0.15354329145987627, "learning_rate": 2.489555699224401e-06, "loss": 0.4544, "step": 2032 }, { "epoch": 2.0049297510475723, "grad_norm": 0.19316274103980197, "learning_rate": 2.485084044004445e-06, "loss": 0.4115, "step": 2033 }, { "epoch": 2.0059157012570865, "grad_norm": 0.1926969418616464, "learning_rate": 2.480615079907804e-06, "loss": 0.4139, "step": 2034 }, { "epoch": 2.006901651466601, "grad_norm": 0.18292987466593616, "learning_rate": 2.476148811716559e-06, "loss": 0.4275, "step": 2035 }, { "epoch": 2.0078876016761154, "grad_norm": 0.16420682338832338, "learning_rate": 2.471685244209911e-06, "loss": 0.4048, "step": 2036 }, { "epoch": 2.00887355188563, "grad_norm": 0.1902366011581597, "learning_rate": 2.4672243821641656e-06, "loss": 0.4222, "step": 2037 }, { "epoch": 2.009859502095144, "grad_norm": 0.2254711219316246, "learning_rate": 2.4627662303527342e-06, "loss": 0.4154, "step": 2038 }, { "epoch": 2.010845452304659, "grad_norm": 0.20342286022645653, "learning_rate": 2.458310793546129e-06, "loss": 0.419, "step": 2039 }, { "epoch": 2.011831402514173, "grad_norm": 0.18952275197647123, "learning_rate": 2.4538580765119563e-06, "loss": 0.4155, "step": 2040 }, { "epoch": 2.0128173527236877, "grad_norm": 0.17381576228322404, "learning_rate": 2.449408084014912e-06, "loss": 0.4122, "step": 2041 }, { "epoch": 2.013803302933202, "grad_norm": 0.20050359828116268, "learning_rate": 2.4449608208167774e-06, "loss": 0.3976, "step": 2042 }, { "epoch": 2.0147892531427165, "grad_norm": 0.17915308930601712, "learning_rate": 2.440516291676413e-06, "loss": 0.4207, "step": 2043 }, { "epoch": 2.0157752033522307, "grad_norm": 0.19314790795510625, "learning_rate": 2.4360745013497526e-06, "loss": 0.4211, "step": 2044 }, { "epoch": 2.0167611535617453, "grad_norm": 0.4596523922508372, "learning_rate": 2.431635454589801e-06, "loss": 0.418, "step": 2045 }, { "epoch": 2.0177471037712595, "grad_norm": 0.17206936744944476, "learning_rate": 2.4271991561466254e-06, "loss": 0.4084, "step": 2046 }, { "epoch": 2.018733053980774, "grad_norm": 0.16986684157695564, "learning_rate": 2.422765610767354e-06, "loss": 0.4104, "step": 2047 }, { "epoch": 2.0197190041902884, "grad_norm": 0.1745733524015661, "learning_rate": 2.4183348231961707e-06, "loss": 0.4271, "step": 2048 }, { "epoch": 2.020704954399803, "grad_norm": 0.178360592939682, "learning_rate": 2.4139067981743014e-06, "loss": 0.4167, "step": 2049 }, { "epoch": 2.021690904609317, "grad_norm": 0.16354451493957817, "learning_rate": 2.4094815404400196e-06, "loss": 0.4219, "step": 2050 }, { "epoch": 2.022676854818832, "grad_norm": 0.16264836760005474, "learning_rate": 2.4050590547286423e-06, "loss": 0.3999, "step": 2051 }, { "epoch": 2.023662805028346, "grad_norm": 0.159353154129677, "learning_rate": 2.400639345772515e-06, "loss": 0.4171, "step": 2052 }, { "epoch": 2.0246487552378607, "grad_norm": 0.1614189992245902, "learning_rate": 2.396222418301013e-06, "loss": 0.4096, "step": 2053 }, { "epoch": 2.025634705447375, "grad_norm": 0.17812903834233001, "learning_rate": 2.3918082770405347e-06, "loss": 0.3997, "step": 2054 }, { "epoch": 2.0266206556568895, "grad_norm": 0.16519033443031572, "learning_rate": 2.3873969267144993e-06, "loss": 0.4209, "step": 2055 }, { "epoch": 2.0276066058664037, "grad_norm": 0.15897351804597687, "learning_rate": 2.382988372043336e-06, "loss": 0.4082, "step": 2056 }, { "epoch": 2.0285925560759184, "grad_norm": 0.16366118349259243, "learning_rate": 2.378582617744486e-06, "loss": 0.4199, "step": 2057 }, { "epoch": 2.0295785062854326, "grad_norm": 0.23354903751658837, "learning_rate": 2.3741796685323916e-06, "loss": 0.4159, "step": 2058 }, { "epoch": 2.030564456494947, "grad_norm": 0.16766034200865684, "learning_rate": 2.369779529118494e-06, "loss": 0.4163, "step": 2059 }, { "epoch": 2.0315504067044614, "grad_norm": 0.16722153377411042, "learning_rate": 2.365382204211229e-06, "loss": 0.4207, "step": 2060 }, { "epoch": 2.032536356913976, "grad_norm": 0.15843660439226942, "learning_rate": 2.3609876985160192e-06, "loss": 0.3986, "step": 2061 }, { "epoch": 2.0335223071234902, "grad_norm": 0.16596114170737103, "learning_rate": 2.3565960167352686e-06, "loss": 0.417, "step": 2062 }, { "epoch": 2.034508257333005, "grad_norm": 0.15935028620806083, "learning_rate": 2.352207163568368e-06, "loss": 0.3908, "step": 2063 }, { "epoch": 2.035494207542519, "grad_norm": 0.1604915683673604, "learning_rate": 2.3478211437116694e-06, "loss": 0.435, "step": 2064 }, { "epoch": 2.0364801577520337, "grad_norm": 0.161152593080013, "learning_rate": 2.3434379618584986e-06, "loss": 0.4209, "step": 2065 }, { "epoch": 2.037466107961548, "grad_norm": 0.194730826383451, "learning_rate": 2.3390576226991486e-06, "loss": 0.4152, "step": 2066 }, { "epoch": 2.0384520581710626, "grad_norm": 0.20785591489633165, "learning_rate": 2.334680130920865e-06, "loss": 0.422, "step": 2067 }, { "epoch": 2.0394380083805768, "grad_norm": 0.15622260033990645, "learning_rate": 2.3303054912078492e-06, "loss": 0.4087, "step": 2068 }, { "epoch": 2.0404239585900914, "grad_norm": 0.16398917357758674, "learning_rate": 2.3259337082412446e-06, "loss": 0.4249, "step": 2069 }, { "epoch": 2.0414099087996056, "grad_norm": 0.1588498966871557, "learning_rate": 2.3215647866991485e-06, "loss": 0.4143, "step": 2070 }, { "epoch": 2.0423958590091202, "grad_norm": 0.15823541866578061, "learning_rate": 2.3171987312565885e-06, "loss": 0.3929, "step": 2071 }, { "epoch": 2.0433818092186344, "grad_norm": 0.16394423751651, "learning_rate": 2.3128355465855273e-06, "loss": 0.4191, "step": 2072 }, { "epoch": 2.044367759428149, "grad_norm": 0.16884273307766906, "learning_rate": 2.308475237354856e-06, "loss": 0.4298, "step": 2073 }, { "epoch": 2.0453537096376633, "grad_norm": 0.1756595085447255, "learning_rate": 2.3041178082303878e-06, "loss": 0.4213, "step": 2074 }, { "epoch": 2.046339659847178, "grad_norm": 0.16665318581076202, "learning_rate": 2.2997632638748553e-06, "loss": 0.4113, "step": 2075 }, { "epoch": 2.047325610056692, "grad_norm": 0.1774429295851841, "learning_rate": 2.295411608947903e-06, "loss": 0.4081, "step": 2076 }, { "epoch": 2.0483115602662068, "grad_norm": 0.15685031607760505, "learning_rate": 2.291062848106083e-06, "loss": 0.4191, "step": 2077 }, { "epoch": 2.049297510475721, "grad_norm": 0.165764475113141, "learning_rate": 2.286716986002857e-06, "loss": 0.4302, "step": 2078 }, { "epoch": 2.0502834606852356, "grad_norm": 0.16502553641738632, "learning_rate": 2.2823740272885742e-06, "loss": 0.3987, "step": 2079 }, { "epoch": 2.05126941089475, "grad_norm": 0.16071862487005942, "learning_rate": 2.278033976610482e-06, "loss": 0.4264, "step": 2080 }, { "epoch": 2.0522553611042644, "grad_norm": 0.1656052682627577, "learning_rate": 2.2736968386127196e-06, "loss": 0.4003, "step": 2081 }, { "epoch": 2.0532413113137786, "grad_norm": 0.15904591972000107, "learning_rate": 2.2693626179363056e-06, "loss": 0.4079, "step": 2082 }, { "epoch": 2.0542272615232933, "grad_norm": 0.16097964204825316, "learning_rate": 2.265031319219138e-06, "loss": 0.4191, "step": 2083 }, { "epoch": 2.0552132117328075, "grad_norm": 0.1728884836779008, "learning_rate": 2.260702947095983e-06, "loss": 0.41, "step": 2084 }, { "epoch": 2.056199161942322, "grad_norm": 0.16295443236310594, "learning_rate": 2.2563775061984844e-06, "loss": 0.3917, "step": 2085 }, { "epoch": 2.0571851121518363, "grad_norm": 0.1671292038118426, "learning_rate": 2.2520550011551435e-06, "loss": 0.412, "step": 2086 }, { "epoch": 2.058171062361351, "grad_norm": 0.1583333241805396, "learning_rate": 2.2477354365913212e-06, "loss": 0.3907, "step": 2087 }, { "epoch": 2.059157012570865, "grad_norm": 0.16190713679960353, "learning_rate": 2.2434188171292313e-06, "loss": 0.4133, "step": 2088 }, { "epoch": 2.06014296278038, "grad_norm": 0.16277119817080535, "learning_rate": 2.239105147387938e-06, "loss": 0.4099, "step": 2089 }, { "epoch": 2.061128912989894, "grad_norm": 0.16058151549029703, "learning_rate": 2.2347944319833476e-06, "loss": 0.4168, "step": 2090 }, { "epoch": 2.0621148631994086, "grad_norm": 0.15709005185388775, "learning_rate": 2.2304866755282044e-06, "loss": 0.396, "step": 2091 }, { "epoch": 2.063100813408923, "grad_norm": 0.1662005825518739, "learning_rate": 2.226181882632087e-06, "loss": 0.4299, "step": 2092 }, { "epoch": 2.0640867636184375, "grad_norm": 0.17124812316783214, "learning_rate": 2.2218800579014076e-06, "loss": 0.4178, "step": 2093 }, { "epoch": 2.0650727138279517, "grad_norm": 0.15962448482388233, "learning_rate": 2.2175812059393926e-06, "loss": 0.4122, "step": 2094 }, { "epoch": 2.0660586640374663, "grad_norm": 0.16387594165192024, "learning_rate": 2.213285331346095e-06, "loss": 0.422, "step": 2095 }, { "epoch": 2.0670446142469805, "grad_norm": 0.16554271394019585, "learning_rate": 2.2089924387183774e-06, "loss": 0.4166, "step": 2096 }, { "epoch": 2.068030564456495, "grad_norm": 0.1713818526717728, "learning_rate": 2.204702532649917e-06, "loss": 0.4227, "step": 2097 }, { "epoch": 2.0690165146660093, "grad_norm": 0.1621404786261226, "learning_rate": 2.200415617731192e-06, "loss": 0.4115, "step": 2098 }, { "epoch": 2.070002464875524, "grad_norm": 0.17156265722786548, "learning_rate": 2.1961316985494737e-06, "loss": 0.4127, "step": 2099 }, { "epoch": 2.070988415085038, "grad_norm": 0.16477435588711076, "learning_rate": 2.19185077968884e-06, "loss": 0.4133, "step": 2100 }, { "epoch": 2.071974365294553, "grad_norm": 0.15846960416926448, "learning_rate": 2.1875728657301493e-06, "loss": 0.4071, "step": 2101 }, { "epoch": 2.072960315504067, "grad_norm": 0.17152652490165182, "learning_rate": 2.1832979612510475e-06, "loss": 0.4035, "step": 2102 }, { "epoch": 2.0739462657135816, "grad_norm": 0.16031949415577315, "learning_rate": 2.17902607082596e-06, "loss": 0.4146, "step": 2103 }, { "epoch": 2.074932215923096, "grad_norm": 0.16347258382061014, "learning_rate": 2.1747571990260867e-06, "loss": 0.4034, "step": 2104 }, { "epoch": 2.0759181661326105, "grad_norm": 0.16202541571213971, "learning_rate": 2.170491350419398e-06, "loss": 0.4035, "step": 2105 }, { "epoch": 2.0769041163421247, "grad_norm": 0.16057032837909369, "learning_rate": 2.166228529570628e-06, "loss": 0.4221, "step": 2106 }, { "epoch": 2.0778900665516393, "grad_norm": 0.17546009537245488, "learning_rate": 2.1619687410412728e-06, "loss": 0.4091, "step": 2107 }, { "epoch": 2.0788760167611535, "grad_norm": 0.16193591789438427, "learning_rate": 2.157711989389579e-06, "loss": 0.4215, "step": 2108 }, { "epoch": 2.079861966970668, "grad_norm": 0.1735122505713575, "learning_rate": 2.1534582791705545e-06, "loss": 0.415, "step": 2109 }, { "epoch": 2.0808479171801824, "grad_norm": 0.16593601833381807, "learning_rate": 2.149207614935939e-06, "loss": 0.4087, "step": 2110 }, { "epoch": 2.081833867389697, "grad_norm": 0.16634238973258567, "learning_rate": 2.1449600012342193e-06, "loss": 0.4243, "step": 2111 }, { "epoch": 2.082819817599211, "grad_norm": 0.16930759969879747, "learning_rate": 2.1407154426106214e-06, "loss": 0.4238, "step": 2112 }, { "epoch": 2.083805767808726, "grad_norm": 0.16504342842028494, "learning_rate": 2.136473943607097e-06, "loss": 0.3924, "step": 2113 }, { "epoch": 2.08479171801824, "grad_norm": 0.16081302287822755, "learning_rate": 2.1322355087623264e-06, "loss": 0.413, "step": 2114 }, { "epoch": 2.0857776682277547, "grad_norm": 0.16661725880499173, "learning_rate": 2.1280001426117053e-06, "loss": 0.42, "step": 2115 }, { "epoch": 2.086763618437269, "grad_norm": 0.1641100802727998, "learning_rate": 2.1237678496873554e-06, "loss": 0.4155, "step": 2116 }, { "epoch": 2.0877495686467835, "grad_norm": 0.16466429540472657, "learning_rate": 2.1195386345181033e-06, "loss": 0.4211, "step": 2117 }, { "epoch": 2.0887355188562977, "grad_norm": 0.16556551109315398, "learning_rate": 2.1153125016294838e-06, "loss": 0.4179, "step": 2118 }, { "epoch": 2.0897214690658124, "grad_norm": 0.17060075762383226, "learning_rate": 2.1110894555437333e-06, "loss": 0.4223, "step": 2119 }, { "epoch": 2.0907074192753266, "grad_norm": 0.16006232694808414, "learning_rate": 2.1068695007797853e-06, "loss": 0.4002, "step": 2120 }, { "epoch": 2.091693369484841, "grad_norm": 0.16500201659889524, "learning_rate": 2.102652641853265e-06, "loss": 0.4227, "step": 2121 }, { "epoch": 2.0926793196943554, "grad_norm": 0.1651185828677862, "learning_rate": 2.0984388832764853e-06, "loss": 0.4327, "step": 2122 }, { "epoch": 2.09366526990387, "grad_norm": 0.16420757197401067, "learning_rate": 2.09422822955844e-06, "loss": 0.4138, "step": 2123 }, { "epoch": 2.0946512201133842, "grad_norm": 0.1791171221018479, "learning_rate": 2.0900206852048065e-06, "loss": 0.4151, "step": 2124 }, { "epoch": 2.095637170322899, "grad_norm": 0.1624292350658065, "learning_rate": 2.085816254717926e-06, "loss": 0.4221, "step": 2125 }, { "epoch": 2.096623120532413, "grad_norm": 0.16270660939755127, "learning_rate": 2.0816149425968113e-06, "loss": 0.4024, "step": 2126 }, { "epoch": 2.0976090707419277, "grad_norm": 0.1590699419416762, "learning_rate": 2.077416753337143e-06, "loss": 0.4077, "step": 2127 }, { "epoch": 2.098595020951442, "grad_norm": 0.17500208343336374, "learning_rate": 2.073221691431254e-06, "loss": 0.4164, "step": 2128 }, { "epoch": 2.0995809711609565, "grad_norm": 0.15798359813998206, "learning_rate": 2.0690297613681348e-06, "loss": 0.4107, "step": 2129 }, { "epoch": 2.1005669213704707, "grad_norm": 0.16072774202565737, "learning_rate": 2.0648409676334173e-06, "loss": 0.4155, "step": 2130 }, { "epoch": 2.1015528715799854, "grad_norm": 0.16492565635865292, "learning_rate": 2.0606553147093883e-06, "loss": 0.4143, "step": 2131 }, { "epoch": 2.1025388217894996, "grad_norm": 0.16183912142588155, "learning_rate": 2.0564728070749657e-06, "loss": 0.414, "step": 2132 }, { "epoch": 2.103524771999014, "grad_norm": 0.1676213094264793, "learning_rate": 2.0522934492057046e-06, "loss": 0.3956, "step": 2133 }, { "epoch": 2.1045107222085284, "grad_norm": 0.16152895143010376, "learning_rate": 2.0481172455737896e-06, "loss": 0.419, "step": 2134 }, { "epoch": 2.105496672418043, "grad_norm": 0.16918238818197492, "learning_rate": 2.0439442006480288e-06, "loss": 0.4214, "step": 2135 }, { "epoch": 2.1064826226275573, "grad_norm": 0.1670519009757173, "learning_rate": 2.039774318893852e-06, "loss": 0.406, "step": 2136 }, { "epoch": 2.107468572837072, "grad_norm": 0.1632550277509182, "learning_rate": 2.0356076047733026e-06, "loss": 0.4087, "step": 2137 }, { "epoch": 2.108454523046586, "grad_norm": 0.16539817074272825, "learning_rate": 2.0314440627450333e-06, "loss": 0.4147, "step": 2138 }, { "epoch": 2.1094404732561007, "grad_norm": 0.16232307861452594, "learning_rate": 2.027283697264311e-06, "loss": 0.4224, "step": 2139 }, { "epoch": 2.110426423465615, "grad_norm": 0.15828565746283588, "learning_rate": 2.02312651278299e-06, "loss": 0.4023, "step": 2140 }, { "epoch": 2.1114123736751296, "grad_norm": 0.16119854000248965, "learning_rate": 2.01897251374953e-06, "loss": 0.4095, "step": 2141 }, { "epoch": 2.1123983238846438, "grad_norm": 0.15787852221084392, "learning_rate": 2.014821704608977e-06, "loss": 0.3925, "step": 2142 }, { "epoch": 2.1133842740941584, "grad_norm": 0.18047411849302172, "learning_rate": 2.0106740898029707e-06, "loss": 0.4135, "step": 2143 }, { "epoch": 2.1143702243036726, "grad_norm": 0.16464118652101478, "learning_rate": 2.0065296737697286e-06, "loss": 0.4184, "step": 2144 }, { "epoch": 2.1153561745131872, "grad_norm": 0.15975791419504334, "learning_rate": 2.0023884609440387e-06, "loss": 0.3998, "step": 2145 }, { "epoch": 2.1163421247227014, "grad_norm": 0.15825991418701266, "learning_rate": 1.998250455757273e-06, "loss": 0.4033, "step": 2146 }, { "epoch": 2.117328074932216, "grad_norm": 0.15782928807968785, "learning_rate": 1.994115662637364e-06, "loss": 0.4108, "step": 2147 }, { "epoch": 2.1183140251417303, "grad_norm": 0.15913137118988563, "learning_rate": 1.9899840860088075e-06, "loss": 0.4152, "step": 2148 }, { "epoch": 2.119299975351245, "grad_norm": 0.18908050771039311, "learning_rate": 1.9858557302926605e-06, "loss": 0.4168, "step": 2149 }, { "epoch": 2.120285925560759, "grad_norm": 0.16220131083638675, "learning_rate": 1.9817305999065312e-06, "loss": 0.4192, "step": 2150 }, { "epoch": 2.1212718757702738, "grad_norm": 0.15743426841699798, "learning_rate": 1.9776086992645765e-06, "loss": 0.4094, "step": 2151 }, { "epoch": 2.122257825979788, "grad_norm": 0.15562051150111653, "learning_rate": 1.9734900327774976e-06, "loss": 0.4194, "step": 2152 }, { "epoch": 2.1232437761893026, "grad_norm": 0.1671001502186119, "learning_rate": 1.969374604852535e-06, "loss": 0.4208, "step": 2153 }, { "epoch": 2.124229726398817, "grad_norm": 0.15953800531347767, "learning_rate": 1.9652624198934637e-06, "loss": 0.3975, "step": 2154 }, { "epoch": 2.1252156766083314, "grad_norm": 0.1622468362221989, "learning_rate": 1.961153482300589e-06, "loss": 0.4181, "step": 2155 }, { "epoch": 2.1262016268178456, "grad_norm": 0.16332419993400998, "learning_rate": 1.95704779647074e-06, "loss": 0.4165, "step": 2156 }, { "epoch": 2.1271875770273603, "grad_norm": 0.15946977213145044, "learning_rate": 1.9529453667972664e-06, "loss": 0.4026, "step": 2157 }, { "epoch": 2.1281735272368745, "grad_norm": 0.1592368242342468, "learning_rate": 1.948846197670036e-06, "loss": 0.4193, "step": 2158 }, { "epoch": 2.129159477446389, "grad_norm": 0.1593730327730932, "learning_rate": 1.944750293475428e-06, "loss": 0.4089, "step": 2159 }, { "epoch": 2.1301454276559033, "grad_norm": 0.15500326421754593, "learning_rate": 1.940657658596321e-06, "loss": 0.3929, "step": 2160 }, { "epoch": 2.131131377865418, "grad_norm": 0.16470379968502602, "learning_rate": 1.9365682974120996e-06, "loss": 0.404, "step": 2161 }, { "epoch": 2.132117328074932, "grad_norm": 0.19538808324042145, "learning_rate": 1.9324822142986505e-06, "loss": 0.4246, "step": 2162 }, { "epoch": 2.133103278284447, "grad_norm": 0.16156242091510378, "learning_rate": 1.928399413628345e-06, "loss": 0.417, "step": 2163 }, { "epoch": 2.134089228493961, "grad_norm": 0.17127984814270825, "learning_rate": 1.924319899770045e-06, "loss": 0.4109, "step": 2164 }, { "epoch": 2.1350751787034756, "grad_norm": 0.16546930929036938, "learning_rate": 1.9202436770890958e-06, "loss": 0.3997, "step": 2165 }, { "epoch": 2.13606112891299, "grad_norm": 0.16028629302094813, "learning_rate": 1.9161707499473196e-06, "loss": 0.4137, "step": 2166 }, { "epoch": 2.1370470791225045, "grad_norm": 0.16430777544125744, "learning_rate": 1.9121011227030127e-06, "loss": 0.404, "step": 2167 }, { "epoch": 2.1380330293320187, "grad_norm": 0.16082152748676395, "learning_rate": 1.908034799710941e-06, "loss": 0.4057, "step": 2168 }, { "epoch": 2.1390189795415333, "grad_norm": 0.16663143353116502, "learning_rate": 1.9039717853223343e-06, "loss": 0.4087, "step": 2169 }, { "epoch": 2.1400049297510475, "grad_norm": 0.16418114703833722, "learning_rate": 1.8999120838848806e-06, "loss": 0.404, "step": 2170 }, { "epoch": 2.140990879960562, "grad_norm": 0.1783395541806536, "learning_rate": 1.8958556997427247e-06, "loss": 0.4117, "step": 2171 }, { "epoch": 2.1419768301700763, "grad_norm": 0.16686800739942645, "learning_rate": 1.891802637236459e-06, "loss": 0.4068, "step": 2172 }, { "epoch": 2.142962780379591, "grad_norm": 0.16548420893496701, "learning_rate": 1.887752900703127e-06, "loss": 0.419, "step": 2173 }, { "epoch": 2.143948730589105, "grad_norm": 0.17374274058848405, "learning_rate": 1.8837064944762097e-06, "loss": 0.4032, "step": 2174 }, { "epoch": 2.14493468079862, "grad_norm": 0.16498670560317108, "learning_rate": 1.8796634228856209e-06, "loss": 0.4044, "step": 2175 }, { "epoch": 2.145920631008134, "grad_norm": 0.1654843853823051, "learning_rate": 1.8756236902577096e-06, "loss": 0.4319, "step": 2176 }, { "epoch": 2.1469065812176487, "grad_norm": 0.16025865811828457, "learning_rate": 1.8715873009152558e-06, "loss": 0.4054, "step": 2177 }, { "epoch": 2.147892531427163, "grad_norm": 0.16962396410394456, "learning_rate": 1.8675542591774554e-06, "loss": 0.4328, "step": 2178 }, { "epoch": 2.1488784816366775, "grad_norm": 0.16501085841999438, "learning_rate": 1.8635245693599275e-06, "loss": 0.4067, "step": 2179 }, { "epoch": 2.1498644318461917, "grad_norm": 0.16200967595281812, "learning_rate": 1.8594982357746965e-06, "loss": 0.4038, "step": 2180 }, { "epoch": 2.1508503820557063, "grad_norm": 0.15857804828382674, "learning_rate": 1.8554752627302052e-06, "loss": 0.401, "step": 2181 }, { "epoch": 2.1518363322652205, "grad_norm": 0.16178670261990907, "learning_rate": 1.8514556545312945e-06, "loss": 0.4118, "step": 2182 }, { "epoch": 2.152822282474735, "grad_norm": 0.16406883339538325, "learning_rate": 1.847439415479207e-06, "loss": 0.4154, "step": 2183 }, { "epoch": 2.1538082326842494, "grad_norm": 0.16252922142205084, "learning_rate": 1.8434265498715758e-06, "loss": 0.4026, "step": 2184 }, { "epoch": 2.154794182893764, "grad_norm": 0.16317163382582592, "learning_rate": 1.8394170620024337e-06, "loss": 0.4224, "step": 2185 }, { "epoch": 2.155780133103278, "grad_norm": 0.16038520344326254, "learning_rate": 1.835410956162188e-06, "loss": 0.4266, "step": 2186 }, { "epoch": 2.156766083312793, "grad_norm": 0.15798096004987605, "learning_rate": 1.8314082366376335e-06, "loss": 0.4039, "step": 2187 }, { "epoch": 2.157752033522307, "grad_norm": 0.1592308865545846, "learning_rate": 1.8274089077119378e-06, "loss": 0.4068, "step": 2188 }, { "epoch": 2.1587379837318217, "grad_norm": 0.17288103981264286, "learning_rate": 1.8234129736646461e-06, "loss": 0.4019, "step": 2189 }, { "epoch": 2.159723933941336, "grad_norm": 0.1609061792043482, "learning_rate": 1.8194204387716675e-06, "loss": 0.4181, "step": 2190 }, { "epoch": 2.1607098841508505, "grad_norm": 0.16971146139080798, "learning_rate": 1.8154313073052681e-06, "loss": 0.4265, "step": 2191 }, { "epoch": 2.1616958343603647, "grad_norm": 0.1620419097268328, "learning_rate": 1.8114455835340827e-06, "loss": 0.423, "step": 2192 }, { "epoch": 2.1626817845698794, "grad_norm": 0.16480446858563771, "learning_rate": 1.8074632717230927e-06, "loss": 0.4185, "step": 2193 }, { "epoch": 2.1636677347793936, "grad_norm": 0.1681441180898963, "learning_rate": 1.80348437613363e-06, "loss": 0.4136, "step": 2194 }, { "epoch": 2.164653684988908, "grad_norm": 0.1591729040367542, "learning_rate": 1.7995089010233718e-06, "loss": 0.4102, "step": 2195 }, { "epoch": 2.1656396351984224, "grad_norm": 0.16875051880378447, "learning_rate": 1.7955368506463338e-06, "loss": 0.3945, "step": 2196 }, { "epoch": 2.166625585407937, "grad_norm": 0.16056137879840315, "learning_rate": 1.7915682292528685e-06, "loss": 0.4148, "step": 2197 }, { "epoch": 2.1676115356174512, "grad_norm": 0.16167005788700176, "learning_rate": 1.7876030410896578e-06, "loss": 0.4118, "step": 2198 }, { "epoch": 2.168597485826966, "grad_norm": 0.16263157826789873, "learning_rate": 1.7836412903997085e-06, "loss": 0.4297, "step": 2199 }, { "epoch": 2.16958343603648, "grad_norm": 0.15948787430453046, "learning_rate": 1.7796829814223565e-06, "loss": 0.4177, "step": 2200 }, { "epoch": 2.1705693862459947, "grad_norm": 0.16878216756886477, "learning_rate": 1.7757281183932445e-06, "loss": 0.4158, "step": 2201 }, { "epoch": 2.171555336455509, "grad_norm": 0.18730391663546847, "learning_rate": 1.771776705544334e-06, "loss": 0.4062, "step": 2202 }, { "epoch": 2.1725412866650236, "grad_norm": 0.16302883810094165, "learning_rate": 1.7678287471038914e-06, "loss": 0.4336, "step": 2203 }, { "epoch": 2.1735272368745377, "grad_norm": 0.17081710177280024, "learning_rate": 1.7638842472964923e-06, "loss": 0.4158, "step": 2204 }, { "epoch": 2.1745131870840524, "grad_norm": 0.1660414253808993, "learning_rate": 1.759943210343007e-06, "loss": 0.4302, "step": 2205 }, { "epoch": 2.1754991372935666, "grad_norm": 0.16987499571555054, "learning_rate": 1.756005640460598e-06, "loss": 0.4172, "step": 2206 }, { "epoch": 2.1764850875030812, "grad_norm": 0.16993580402782144, "learning_rate": 1.7520715418627203e-06, "loss": 0.4187, "step": 2207 }, { "epoch": 2.1774710377125954, "grad_norm": 0.1642323672574239, "learning_rate": 1.7481409187591186e-06, "loss": 0.4219, "step": 2208 }, { "epoch": 2.17845698792211, "grad_norm": 0.18503526323285635, "learning_rate": 1.7442137753558126e-06, "loss": 0.4249, "step": 2209 }, { "epoch": 2.1794429381316243, "grad_norm": 0.16407691332661228, "learning_rate": 1.7402901158551006e-06, "loss": 0.4281, "step": 2210 }, { "epoch": 2.180428888341139, "grad_norm": 0.15999767519315217, "learning_rate": 1.7363699444555532e-06, "loss": 0.4025, "step": 2211 }, { "epoch": 2.181414838550653, "grad_norm": 0.1649245050856335, "learning_rate": 1.7324532653520082e-06, "loss": 0.4171, "step": 2212 }, { "epoch": 2.1824007887601677, "grad_norm": 0.16004150884273285, "learning_rate": 1.7285400827355663e-06, "loss": 0.4185, "step": 2213 }, { "epoch": 2.183386738969682, "grad_norm": 0.16081158077390012, "learning_rate": 1.7246304007935872e-06, "loss": 0.4119, "step": 2214 }, { "epoch": 2.1843726891791966, "grad_norm": 0.1810009258055357, "learning_rate": 1.7207242237096844e-06, "loss": 0.4156, "step": 2215 }, { "epoch": 2.1853586393887108, "grad_norm": 0.16472612332180087, "learning_rate": 1.7168215556637208e-06, "loss": 0.4236, "step": 2216 }, { "epoch": 2.1863445895982254, "grad_norm": 0.16022270898964686, "learning_rate": 1.7129224008318047e-06, "loss": 0.422, "step": 2217 }, { "epoch": 2.1873305398077396, "grad_norm": 0.18361054917211825, "learning_rate": 1.7090267633862822e-06, "loss": 0.4266, "step": 2218 }, { "epoch": 2.1883164900172543, "grad_norm": 0.1588835134399213, "learning_rate": 1.7051346474957432e-06, "loss": 0.3957, "step": 2219 }, { "epoch": 2.1893024402267685, "grad_norm": 0.15886974731578227, "learning_rate": 1.7012460573250034e-06, "loss": 0.4305, "step": 2220 }, { "epoch": 2.190288390436283, "grad_norm": 0.16631333224059686, "learning_rate": 1.6973609970351029e-06, "loss": 0.4093, "step": 2221 }, { "epoch": 2.1912743406457973, "grad_norm": 0.17509958145294982, "learning_rate": 1.6934794707833096e-06, "loss": 0.4082, "step": 2222 }, { "epoch": 2.192260290855312, "grad_norm": 0.1654129724979681, "learning_rate": 1.6896014827231111e-06, "loss": 0.4135, "step": 2223 }, { "epoch": 2.193246241064826, "grad_norm": 1.0116963699072126, "learning_rate": 1.6857270370042044e-06, "loss": 0.4142, "step": 2224 }, { "epoch": 2.1942321912743408, "grad_norm": 0.1593508623469355, "learning_rate": 1.6818561377725002e-06, "loss": 0.4094, "step": 2225 }, { "epoch": 2.195218141483855, "grad_norm": 0.16290684883919596, "learning_rate": 1.6779887891701068e-06, "loss": 0.426, "step": 2226 }, { "epoch": 2.1962040916933696, "grad_norm": 0.1562685873603517, "learning_rate": 1.6741249953353434e-06, "loss": 0.3952, "step": 2227 }, { "epoch": 2.197190041902884, "grad_norm": 0.15898611648713112, "learning_rate": 1.6702647604027178e-06, "loss": 0.4211, "step": 2228 }, { "epoch": 2.1981759921123984, "grad_norm": 0.17053474456190576, "learning_rate": 1.6664080885029328e-06, "loss": 0.4076, "step": 2229 }, { "epoch": 2.1991619423219126, "grad_norm": 0.16834333834852952, "learning_rate": 1.6625549837628773e-06, "loss": 0.4305, "step": 2230 }, { "epoch": 2.2001478925314273, "grad_norm": 0.15729964613924843, "learning_rate": 1.6587054503056232e-06, "loss": 0.4075, "step": 2231 }, { "epoch": 2.2011338427409415, "grad_norm": 0.1655216429147605, "learning_rate": 1.654859492250422e-06, "loss": 0.4076, "step": 2232 }, { "epoch": 2.202119792950456, "grad_norm": 0.16018527329403168, "learning_rate": 1.6510171137126974e-06, "loss": 0.4046, "step": 2233 }, { "epoch": 2.2031057431599703, "grad_norm": 0.1576112435570535, "learning_rate": 1.647178318804043e-06, "loss": 0.393, "step": 2234 }, { "epoch": 2.204091693369485, "grad_norm": 0.1563385792405461, "learning_rate": 1.6433431116322235e-06, "loss": 0.4135, "step": 2235 }, { "epoch": 2.205077643578999, "grad_norm": 0.1657235083932023, "learning_rate": 1.6395114963011538e-06, "loss": 0.4072, "step": 2236 }, { "epoch": 2.206063593788514, "grad_norm": 0.1600732025809443, "learning_rate": 1.6356834769109114e-06, "loss": 0.4054, "step": 2237 }, { "epoch": 2.207049543998028, "grad_norm": 0.16036074523541055, "learning_rate": 1.6318590575577293e-06, "loss": 0.4186, "step": 2238 }, { "epoch": 2.2080354942075426, "grad_norm": 0.15797831064291587, "learning_rate": 1.6280382423339818e-06, "loss": 0.4072, "step": 2239 }, { "epoch": 2.209021444417057, "grad_norm": 0.15994603084363623, "learning_rate": 1.6242210353281922e-06, "loss": 0.426, "step": 2240 }, { "epoch": 2.2100073946265715, "grad_norm": 0.1681290966340985, "learning_rate": 1.6204074406250136e-06, "loss": 0.4333, "step": 2241 }, { "epoch": 2.2109933448360857, "grad_norm": 0.15643437806841407, "learning_rate": 1.6165974623052455e-06, "loss": 0.4048, "step": 2242 }, { "epoch": 2.2119792950456003, "grad_norm": 0.16355208894238868, "learning_rate": 1.6127911044458106e-06, "loss": 0.4145, "step": 2243 }, { "epoch": 2.2129652452551145, "grad_norm": 0.1641335483343855, "learning_rate": 1.608988371119758e-06, "loss": 0.4042, "step": 2244 }, { "epoch": 2.213951195464629, "grad_norm": 0.16396929782208178, "learning_rate": 1.6051892663962593e-06, "loss": 0.4193, "step": 2245 }, { "epoch": 2.2149371456741433, "grad_norm": 0.16150400403270612, "learning_rate": 1.6013937943406038e-06, "loss": 0.4158, "step": 2246 }, { "epoch": 2.215923095883658, "grad_norm": 0.16093855664741763, "learning_rate": 1.5976019590141929e-06, "loss": 0.3989, "step": 2247 }, { "epoch": 2.216909046093172, "grad_norm": 0.16155833977339337, "learning_rate": 1.593813764474536e-06, "loss": 0.3997, "step": 2248 }, { "epoch": 2.217894996302687, "grad_norm": 0.16088300640877046, "learning_rate": 1.5900292147752462e-06, "loss": 0.4163, "step": 2249 }, { "epoch": 2.218880946512201, "grad_norm": 0.16078202786819562, "learning_rate": 1.5862483139660413e-06, "loss": 0.4148, "step": 2250 }, { "epoch": 2.2198668967217157, "grad_norm": 0.15925355651700746, "learning_rate": 1.5824710660927268e-06, "loss": 0.4235, "step": 2251 }, { "epoch": 2.22085284693123, "grad_norm": 0.16207978965295122, "learning_rate": 1.5786974751972033e-06, "loss": 0.4183, "step": 2252 }, { "epoch": 2.2218387971407445, "grad_norm": 0.16429033852290004, "learning_rate": 1.5749275453174584e-06, "loss": 0.3981, "step": 2253 }, { "epoch": 2.2228247473502587, "grad_norm": 0.16587177946082177, "learning_rate": 1.5711612804875632e-06, "loss": 0.4239, "step": 2254 }, { "epoch": 2.2238106975597733, "grad_norm": 0.16213834725171952, "learning_rate": 1.567398684737666e-06, "loss": 0.4227, "step": 2255 }, { "epoch": 2.2247966477692875, "grad_norm": 0.16324806110391102, "learning_rate": 1.5636397620939842e-06, "loss": 0.3984, "step": 2256 }, { "epoch": 2.225782597978802, "grad_norm": 0.16422658299514456, "learning_rate": 1.5598845165788134e-06, "loss": 0.4068, "step": 2257 }, { "epoch": 2.2267685481883164, "grad_norm": 0.16303998595459007, "learning_rate": 1.5561329522105083e-06, "loss": 0.4065, "step": 2258 }, { "epoch": 2.227754498397831, "grad_norm": 0.16431900778898972, "learning_rate": 1.5523850730034874e-06, "loss": 0.4177, "step": 2259 }, { "epoch": 2.228740448607345, "grad_norm": 0.16543433401498112, "learning_rate": 1.5486408829682232e-06, "loss": 0.4224, "step": 2260 }, { "epoch": 2.22972639881686, "grad_norm": 0.16561195089040817, "learning_rate": 1.5449003861112427e-06, "loss": 0.3978, "step": 2261 }, { "epoch": 2.230712349026374, "grad_norm": 0.1600352065459141, "learning_rate": 1.5411635864351204e-06, "loss": 0.4025, "step": 2262 }, { "epoch": 2.2316982992358887, "grad_norm": 0.16229768962021862, "learning_rate": 1.5374304879384744e-06, "loss": 0.4125, "step": 2263 }, { "epoch": 2.232684249445403, "grad_norm": 0.16295485519278402, "learning_rate": 1.5337010946159609e-06, "loss": 0.4117, "step": 2264 }, { "epoch": 2.2336701996549175, "grad_norm": 0.1663620317984901, "learning_rate": 1.5299754104582765e-06, "loss": 0.4214, "step": 2265 }, { "epoch": 2.2346561498644317, "grad_norm": 0.16820707279695954, "learning_rate": 1.526253439452144e-06, "loss": 0.4194, "step": 2266 }, { "epoch": 2.2356421000739464, "grad_norm": 0.16728411638631385, "learning_rate": 1.5225351855803117e-06, "loss": 0.4057, "step": 2267 }, { "epoch": 2.2366280502834606, "grad_norm": 0.17620979782236681, "learning_rate": 1.5188206528215522e-06, "loss": 0.4107, "step": 2268 }, { "epoch": 2.237614000492975, "grad_norm": 0.159934923213865, "learning_rate": 1.5151098451506596e-06, "loss": 0.4122, "step": 2269 }, { "epoch": 2.2385999507024894, "grad_norm": 0.16383742998598416, "learning_rate": 1.5114027665384384e-06, "loss": 0.4096, "step": 2270 }, { "epoch": 2.239585900912004, "grad_norm": 0.16534238981185112, "learning_rate": 1.5076994209517038e-06, "loss": 0.4185, "step": 2271 }, { "epoch": 2.2405718511215182, "grad_norm": 0.16226097304859072, "learning_rate": 1.5039998123532717e-06, "loss": 0.392, "step": 2272 }, { "epoch": 2.241557801331033, "grad_norm": 0.16941902784767632, "learning_rate": 1.500303944701968e-06, "loss": 0.4173, "step": 2273 }, { "epoch": 2.242543751540547, "grad_norm": 0.16557680158057791, "learning_rate": 1.4966118219526099e-06, "loss": 0.42, "step": 2274 }, { "epoch": 2.2435297017500617, "grad_norm": 0.16206832978749852, "learning_rate": 1.4929234480560078e-06, "loss": 0.4146, "step": 2275 }, { "epoch": 2.244515651959576, "grad_norm": 0.16827476886247675, "learning_rate": 1.4892388269589615e-06, "loss": 0.3898, "step": 2276 }, { "epoch": 2.2455016021690906, "grad_norm": 0.16194306645525028, "learning_rate": 1.4855579626042542e-06, "loss": 0.3905, "step": 2277 }, { "epoch": 2.2464875523786048, "grad_norm": 0.16473436355046234, "learning_rate": 1.481880858930651e-06, "loss": 0.406, "step": 2278 }, { "epoch": 2.2474735025881194, "grad_norm": 0.17270595927025267, "learning_rate": 1.47820751987289e-06, "loss": 0.4133, "step": 2279 }, { "epoch": 2.2484594527976336, "grad_norm": 0.16459987916542002, "learning_rate": 1.4745379493616817e-06, "loss": 0.4143, "step": 2280 }, { "epoch": 2.2494454030071482, "grad_norm": 0.16364511705619886, "learning_rate": 1.4708721513237096e-06, "loss": 0.4154, "step": 2281 }, { "epoch": 2.2504313532166624, "grad_norm": 0.1588633085001226, "learning_rate": 1.4672101296816099e-06, "loss": 0.4105, "step": 2282 }, { "epoch": 2.251417303426177, "grad_norm": 0.15929228670510678, "learning_rate": 1.4635518883539846e-06, "loss": 0.4025, "step": 2283 }, { "epoch": 2.2524032536356913, "grad_norm": 0.16179783135092632, "learning_rate": 1.4598974312553915e-06, "loss": 0.4163, "step": 2284 }, { "epoch": 2.253389203845206, "grad_norm": 0.1614449546919737, "learning_rate": 1.4562467622963367e-06, "loss": 0.3943, "step": 2285 }, { "epoch": 2.25437515405472, "grad_norm": 0.16651728964995471, "learning_rate": 1.4525998853832729e-06, "loss": 0.4216, "step": 2286 }, { "epoch": 2.2553611042642348, "grad_norm": 0.18272127165647753, "learning_rate": 1.4489568044185914e-06, "loss": 0.413, "step": 2287 }, { "epoch": 2.256347054473749, "grad_norm": 0.1643655873287094, "learning_rate": 1.4453175233006295e-06, "loss": 0.4149, "step": 2288 }, { "epoch": 2.2573330046832636, "grad_norm": 0.1716570649131217, "learning_rate": 1.441682045923653e-06, "loss": 0.3974, "step": 2289 }, { "epoch": 2.258318954892778, "grad_norm": 0.19409339613015558, "learning_rate": 1.4380503761778585e-06, "loss": 0.3761, "step": 2290 }, { "epoch": 2.2593049051022924, "grad_norm": 0.16359834650509225, "learning_rate": 1.4344225179493687e-06, "loss": 0.4342, "step": 2291 }, { "epoch": 2.2602908553118066, "grad_norm": 0.1741665404298848, "learning_rate": 1.4307984751202274e-06, "loss": 0.3925, "step": 2292 }, { "epoch": 2.2612768055213213, "grad_norm": 0.1614225854140192, "learning_rate": 1.4271782515683952e-06, "loss": 0.4044, "step": 2293 }, { "epoch": 2.2622627557308355, "grad_norm": 0.16395102476888754, "learning_rate": 1.4235618511677462e-06, "loss": 0.4039, "step": 2294 }, { "epoch": 2.26324870594035, "grad_norm": 0.16325766141781928, "learning_rate": 1.4199492777880624e-06, "loss": 0.4174, "step": 2295 }, { "epoch": 2.2642346561498643, "grad_norm": 0.1596825751015749, "learning_rate": 1.4163405352950365e-06, "loss": 0.4171, "step": 2296 }, { "epoch": 2.265220606359379, "grad_norm": 0.15904624178418814, "learning_rate": 1.412735627550253e-06, "loss": 0.3946, "step": 2297 }, { "epoch": 2.266206556568893, "grad_norm": 0.16666928908687545, "learning_rate": 1.4091345584111976e-06, "loss": 0.4212, "step": 2298 }, { "epoch": 2.267192506778408, "grad_norm": 0.1581487250365793, "learning_rate": 1.405537331731247e-06, "loss": 0.4085, "step": 2299 }, { "epoch": 2.268178456987922, "grad_norm": 0.18782944877358512, "learning_rate": 1.4019439513596705e-06, "loss": 0.4176, "step": 2300 }, { "epoch": 2.2691644071974366, "grad_norm": 0.16578694247998996, "learning_rate": 1.3983544211416184e-06, "loss": 0.4135, "step": 2301 }, { "epoch": 2.270150357406951, "grad_norm": 0.16377742221519453, "learning_rate": 1.3947687449181158e-06, "loss": 0.4303, "step": 2302 }, { "epoch": 2.2711363076164655, "grad_norm": 0.16748897978969077, "learning_rate": 1.391186926526074e-06, "loss": 0.4281, "step": 2303 }, { "epoch": 2.2721222578259797, "grad_norm": 0.16299536900408998, "learning_rate": 1.3876089697982704e-06, "loss": 0.4079, "step": 2304 }, { "epoch": 2.2731082080354943, "grad_norm": 0.18379707681905336, "learning_rate": 1.3840348785633494e-06, "loss": 0.4274, "step": 2305 }, { "epoch": 2.2740941582450085, "grad_norm": 0.1670828207958332, "learning_rate": 1.3804646566458225e-06, "loss": 0.4219, "step": 2306 }, { "epoch": 2.275080108454523, "grad_norm": 0.16235536756139166, "learning_rate": 1.3768983078660569e-06, "loss": 0.4287, "step": 2307 }, { "epoch": 2.2760660586640373, "grad_norm": 0.16478870974164742, "learning_rate": 1.3733358360402788e-06, "loss": 0.4162, "step": 2308 }, { "epoch": 2.277052008873552, "grad_norm": 0.15814873318539266, "learning_rate": 1.3697772449805635e-06, "loss": 0.3941, "step": 2309 }, { "epoch": 2.278037959083066, "grad_norm": 0.15759543961241237, "learning_rate": 1.366222538494833e-06, "loss": 0.4147, "step": 2310 }, { "epoch": 2.279023909292581, "grad_norm": 0.15750970138351825, "learning_rate": 1.362671720386859e-06, "loss": 0.4082, "step": 2311 }, { "epoch": 2.280009859502095, "grad_norm": 0.16639281028239625, "learning_rate": 1.3591247944562424e-06, "loss": 0.4033, "step": 2312 }, { "epoch": 2.2809958097116096, "grad_norm": 0.16318389717975754, "learning_rate": 1.3555817644984259e-06, "loss": 0.4121, "step": 2313 }, { "epoch": 2.281981759921124, "grad_norm": 0.1600569608278272, "learning_rate": 1.3520426343046794e-06, "loss": 0.4154, "step": 2314 }, { "epoch": 2.2829677101306385, "grad_norm": 0.1579638199323022, "learning_rate": 1.3485074076621063e-06, "loss": 0.4025, "step": 2315 }, { "epoch": 2.2839536603401527, "grad_norm": 0.15864404733099394, "learning_rate": 1.3449760883536266e-06, "loss": 0.4196, "step": 2316 }, { "epoch": 2.2849396105496673, "grad_norm": 0.16399490747054, "learning_rate": 1.341448680157979e-06, "loss": 0.4142, "step": 2317 }, { "epoch": 2.2859255607591815, "grad_norm": 0.16011740107155603, "learning_rate": 1.3379251868497217e-06, "loss": 0.4079, "step": 2318 }, { "epoch": 2.286911510968696, "grad_norm": 0.16148519434557543, "learning_rate": 1.334405612199221e-06, "loss": 0.4111, "step": 2319 }, { "epoch": 2.2878974611782104, "grad_norm": 0.15759777608133907, "learning_rate": 1.3308899599726493e-06, "loss": 0.4151, "step": 2320 }, { "epoch": 2.288883411387725, "grad_norm": 0.16955653152412, "learning_rate": 1.3273782339319835e-06, "loss": 0.4172, "step": 2321 }, { "epoch": 2.289869361597239, "grad_norm": 0.1644435979901736, "learning_rate": 1.3238704378349987e-06, "loss": 0.4416, "step": 2322 }, { "epoch": 2.290855311806754, "grad_norm": 0.16345034177638715, "learning_rate": 1.320366575435263e-06, "loss": 0.4164, "step": 2323 }, { "epoch": 2.291841262016268, "grad_norm": 0.16415191454126113, "learning_rate": 1.3168666504821375e-06, "loss": 0.4098, "step": 2324 }, { "epoch": 2.2928272122257827, "grad_norm": 0.16565438768318477, "learning_rate": 1.3133706667207697e-06, "loss": 0.422, "step": 2325 }, { "epoch": 2.293813162435297, "grad_norm": 0.15995448712585397, "learning_rate": 1.3098786278920877e-06, "loss": 0.4154, "step": 2326 }, { "epoch": 2.2947991126448115, "grad_norm": 0.16198130499563007, "learning_rate": 1.3063905377328006e-06, "loss": 0.4279, "step": 2327 }, { "epoch": 2.2957850628543257, "grad_norm": 0.1617429066817796, "learning_rate": 1.3029063999753916e-06, "loss": 0.4205, "step": 2328 }, { "epoch": 2.2967710130638403, "grad_norm": 0.1635844701720503, "learning_rate": 1.2994262183481121e-06, "loss": 0.4173, "step": 2329 }, { "epoch": 2.2977569632733545, "grad_norm": 0.15800385788487895, "learning_rate": 1.2959499965749855e-06, "loss": 0.3905, "step": 2330 }, { "epoch": 2.298742913482869, "grad_norm": 0.16250204875623522, "learning_rate": 1.2924777383757947e-06, "loss": 0.4118, "step": 2331 }, { "epoch": 2.2997288636923834, "grad_norm": 0.16294872877278363, "learning_rate": 1.2890094474660785e-06, "loss": 0.4215, "step": 2332 }, { "epoch": 2.300714813901898, "grad_norm": 0.15911908618348297, "learning_rate": 1.2855451275571335e-06, "loss": 0.3932, "step": 2333 }, { "epoch": 2.3017007641114122, "grad_norm": 0.16051854421822973, "learning_rate": 1.2820847823560095e-06, "loss": 0.4131, "step": 2334 }, { "epoch": 2.302686714320927, "grad_norm": 0.16169563962111952, "learning_rate": 1.2786284155654993e-06, "loss": 0.433, "step": 2335 }, { "epoch": 2.303672664530441, "grad_norm": 0.1610602524410069, "learning_rate": 1.2751760308841393e-06, "loss": 0.4312, "step": 2336 }, { "epoch": 2.3046586147399557, "grad_norm": 0.16258865600814715, "learning_rate": 1.2717276320062055e-06, "loss": 0.4178, "step": 2337 }, { "epoch": 2.30564456494947, "grad_norm": 0.16625316687523686, "learning_rate": 1.2682832226217085e-06, "loss": 0.4276, "step": 2338 }, { "epoch": 2.3066305151589845, "grad_norm": 0.17782087539321642, "learning_rate": 1.2648428064163898e-06, "loss": 0.4058, "step": 2339 }, { "epoch": 2.3076164653684987, "grad_norm": 0.16349501225562255, "learning_rate": 1.261406387071718e-06, "loss": 0.4248, "step": 2340 }, { "epoch": 2.3086024155780134, "grad_norm": 0.18761252560626943, "learning_rate": 1.257973968264885e-06, "loss": 0.3802, "step": 2341 }, { "epoch": 2.3095883657875276, "grad_norm": 0.16517967277010326, "learning_rate": 1.2545455536688022e-06, "loss": 0.4261, "step": 2342 }, { "epoch": 2.310574315997042, "grad_norm": 0.15863580873816105, "learning_rate": 1.2511211469520945e-06, "loss": 0.3891, "step": 2343 }, { "epoch": 2.3115602662065564, "grad_norm": 0.16029761268638454, "learning_rate": 1.2477007517791007e-06, "loss": 0.4195, "step": 2344 }, { "epoch": 2.312546216416071, "grad_norm": 0.16565382525985026, "learning_rate": 1.2442843718098635e-06, "loss": 0.411, "step": 2345 }, { "epoch": 2.3135321666255853, "grad_norm": 0.15986114270844246, "learning_rate": 1.2408720107001343e-06, "loss": 0.4082, "step": 2346 }, { "epoch": 2.3145181168351, "grad_norm": 0.16280923514839396, "learning_rate": 1.237463672101361e-06, "loss": 0.4141, "step": 2347 }, { "epoch": 2.315504067044614, "grad_norm": 0.1604111045926094, "learning_rate": 1.2340593596606832e-06, "loss": 0.3961, "step": 2348 }, { "epoch": 2.3164900172541287, "grad_norm": 0.16205921559293093, "learning_rate": 1.2306590770209393e-06, "loss": 0.4351, "step": 2349 }, { "epoch": 2.317475967463643, "grad_norm": 0.17346077650464367, "learning_rate": 1.2272628278206521e-06, "loss": 0.4266, "step": 2350 }, { "epoch": 2.3184619176731576, "grad_norm": 0.1689247568840012, "learning_rate": 1.2238706156940284e-06, "loss": 0.413, "step": 2351 }, { "epoch": 2.3194478678826718, "grad_norm": 0.35044658063633505, "learning_rate": 1.220482444270955e-06, "loss": 0.3969, "step": 2352 }, { "epoch": 2.3204338180921864, "grad_norm": 0.1681692999703932, "learning_rate": 1.2170983171769963e-06, "loss": 0.4279, "step": 2353 }, { "epoch": 2.3214197683017006, "grad_norm": 0.1639903546130607, "learning_rate": 1.2137182380333867e-06, "loss": 0.4029, "step": 2354 }, { "epoch": 2.3224057185112152, "grad_norm": 0.1657581841212683, "learning_rate": 1.2103422104570311e-06, "loss": 0.4144, "step": 2355 }, { "epoch": 2.3233916687207294, "grad_norm": 0.15761773790631234, "learning_rate": 1.2069702380604958e-06, "loss": 0.4151, "step": 2356 }, { "epoch": 2.324377618930244, "grad_norm": 0.16581608724165262, "learning_rate": 1.2036023244520157e-06, "loss": 0.4132, "step": 2357 }, { "epoch": 2.3253635691397583, "grad_norm": 0.16660329033788626, "learning_rate": 1.200238473235472e-06, "loss": 0.3983, "step": 2358 }, { "epoch": 2.326349519349273, "grad_norm": 0.16813796837382863, "learning_rate": 1.1968786880104049e-06, "loss": 0.417, "step": 2359 }, { "epoch": 2.327335469558787, "grad_norm": 0.1606712490218808, "learning_rate": 1.193522972372002e-06, "loss": 0.4141, "step": 2360 }, { "epoch": 2.3283214197683018, "grad_norm": 0.1662513472911015, "learning_rate": 1.1901713299110995e-06, "loss": 0.4227, "step": 2361 }, { "epoch": 2.329307369977816, "grad_norm": 0.1701130339328463, "learning_rate": 1.1868237642141723e-06, "loss": 0.4232, "step": 2362 }, { "epoch": 2.3302933201873306, "grad_norm": 0.17146127994694974, "learning_rate": 1.1834802788633288e-06, "loss": 0.4101, "step": 2363 }, { "epoch": 2.331279270396845, "grad_norm": 0.15743290116696995, "learning_rate": 1.1801408774363188e-06, "loss": 0.4103, "step": 2364 }, { "epoch": 2.3322652206063594, "grad_norm": 0.1568591856904107, "learning_rate": 1.1768055635065184e-06, "loss": 0.4191, "step": 2365 }, { "epoch": 2.3332511708158736, "grad_norm": 0.16019347204307025, "learning_rate": 1.1734743406429295e-06, "loss": 0.415, "step": 2366 }, { "epoch": 2.3342371210253883, "grad_norm": 0.16613274290188823, "learning_rate": 1.1701472124101765e-06, "loss": 0.4224, "step": 2367 }, { "epoch": 2.3352230712349025, "grad_norm": 0.1725775485591128, "learning_rate": 1.1668241823685028e-06, "loss": 0.4063, "step": 2368 }, { "epoch": 2.336209021444417, "grad_norm": 0.19590817684839829, "learning_rate": 1.1635052540737668e-06, "loss": 0.4092, "step": 2369 }, { "epoch": 2.3371949716539313, "grad_norm": 0.160213657045184, "learning_rate": 1.1601904310774364e-06, "loss": 0.4183, "step": 2370 }, { "epoch": 2.338180921863446, "grad_norm": 0.1650783394041844, "learning_rate": 1.1568797169265878e-06, "loss": 0.4101, "step": 2371 }, { "epoch": 2.33916687207296, "grad_norm": 0.16119554712600798, "learning_rate": 1.1535731151638997e-06, "loss": 0.4064, "step": 2372 }, { "epoch": 2.340152822282475, "grad_norm": 0.1879428160296416, "learning_rate": 1.1502706293276504e-06, "loss": 0.4212, "step": 2373 }, { "epoch": 2.341138772491989, "grad_norm": 0.1629396282658936, "learning_rate": 1.1469722629517156e-06, "loss": 0.4216, "step": 2374 }, { "epoch": 2.3421247227015036, "grad_norm": 0.1619454303955732, "learning_rate": 1.1436780195655583e-06, "loss": 0.3997, "step": 2375 }, { "epoch": 2.343110672911018, "grad_norm": 0.15899330964730712, "learning_rate": 1.1403879026942361e-06, "loss": 0.4201, "step": 2376 }, { "epoch": 2.3440966231205325, "grad_norm": 0.16546029939437998, "learning_rate": 1.137101915858388e-06, "loss": 0.4069, "step": 2377 }, { "epoch": 2.3450825733300467, "grad_norm": 0.15966260774396165, "learning_rate": 1.13382006257423e-06, "loss": 0.4152, "step": 2378 }, { "epoch": 2.3460685235395613, "grad_norm": 0.1628680329736944, "learning_rate": 1.130542346353558e-06, "loss": 0.4191, "step": 2379 }, { "epoch": 2.3470544737490755, "grad_norm": 0.18130856121973396, "learning_rate": 1.127268770703745e-06, "loss": 0.4224, "step": 2380 }, { "epoch": 2.34804042395859, "grad_norm": 0.15908995889307495, "learning_rate": 1.1239993391277264e-06, "loss": 0.4286, "step": 2381 }, { "epoch": 2.3490263741681043, "grad_norm": 0.1595524071803773, "learning_rate": 1.1207340551240076e-06, "loss": 0.411, "step": 2382 }, { "epoch": 2.350012324377619, "grad_norm": 0.16459379825706655, "learning_rate": 1.1174729221866532e-06, "loss": 0.4192, "step": 2383 }, { "epoch": 2.350998274587133, "grad_norm": 0.16973630430457837, "learning_rate": 1.1142159438052886e-06, "loss": 0.4406, "step": 2384 }, { "epoch": 2.351984224796648, "grad_norm": 0.16294530645656632, "learning_rate": 1.1109631234650903e-06, "loss": 0.3993, "step": 2385 }, { "epoch": 2.352970175006162, "grad_norm": 0.16294863825764402, "learning_rate": 1.107714464646789e-06, "loss": 0.4092, "step": 2386 }, { "epoch": 2.3539561252156767, "grad_norm": 0.16891054832785016, "learning_rate": 1.1044699708266594e-06, "loss": 0.4206, "step": 2387 }, { "epoch": 2.354942075425191, "grad_norm": 0.1692897374545682, "learning_rate": 1.1012296454765208e-06, "loss": 0.4202, "step": 2388 }, { "epoch": 2.3559280256347055, "grad_norm": 0.16625081299129083, "learning_rate": 1.0979934920637314e-06, "loss": 0.4097, "step": 2389 }, { "epoch": 2.3569139758442197, "grad_norm": 0.16511966654296223, "learning_rate": 1.0947615140511858e-06, "loss": 0.4032, "step": 2390 }, { "epoch": 2.3578999260537343, "grad_norm": 0.16264373202500912, "learning_rate": 1.0915337148973088e-06, "loss": 0.4199, "step": 2391 }, { "epoch": 2.3588858762632485, "grad_norm": 0.1689305005673266, "learning_rate": 1.088310098056059e-06, "loss": 0.4006, "step": 2392 }, { "epoch": 2.359871826472763, "grad_norm": 0.16546899130059672, "learning_rate": 1.0850906669769113e-06, "loss": 0.4244, "step": 2393 }, { "epoch": 2.3608577766822774, "grad_norm": 0.15428546428468326, "learning_rate": 1.0818754251048664e-06, "loss": 0.3785, "step": 2394 }, { "epoch": 2.361843726891792, "grad_norm": 0.15805192109688979, "learning_rate": 1.0786643758804444e-06, "loss": 0.4169, "step": 2395 }, { "epoch": 2.362829677101306, "grad_norm": 0.1560837158001152, "learning_rate": 1.075457522739675e-06, "loss": 0.4083, "step": 2396 }, { "epoch": 2.363815627310821, "grad_norm": 0.1626728416196687, "learning_rate": 1.072254869114101e-06, "loss": 0.4255, "step": 2397 }, { "epoch": 2.364801577520335, "grad_norm": 0.16345611835914114, "learning_rate": 1.0690564184307645e-06, "loss": 0.423, "step": 2398 }, { "epoch": 2.3657875277298497, "grad_norm": 0.16007046777382433, "learning_rate": 1.0658621741122205e-06, "loss": 0.4189, "step": 2399 }, { "epoch": 2.366773477939364, "grad_norm": 0.15778022472510203, "learning_rate": 1.062672139576516e-06, "loss": 0.4147, "step": 2400 }, { "epoch": 2.3677594281488785, "grad_norm": 0.16476560246321204, "learning_rate": 1.059486318237195e-06, "loss": 0.4369, "step": 2401 }, { "epoch": 2.3687453783583927, "grad_norm": 0.1614508122516947, "learning_rate": 1.0563047135032928e-06, "loss": 0.419, "step": 2402 }, { "epoch": 2.3697313285679074, "grad_norm": 0.15979009015839116, "learning_rate": 1.0531273287793336e-06, "loss": 0.4016, "step": 2403 }, { "epoch": 2.3707172787774216, "grad_norm": 0.19544947099284235, "learning_rate": 1.0499541674653251e-06, "loss": 0.4162, "step": 2404 }, { "epoch": 2.371703228986936, "grad_norm": 0.1671507504998902, "learning_rate": 1.0467852329567558e-06, "loss": 0.4219, "step": 2405 }, { "epoch": 2.3726891791964504, "grad_norm": 0.15983792455754875, "learning_rate": 1.0436205286445893e-06, "loss": 0.4164, "step": 2406 }, { "epoch": 2.373675129405965, "grad_norm": 0.1565821981045399, "learning_rate": 1.0404600579152702e-06, "loss": 0.3978, "step": 2407 }, { "epoch": 2.3746610796154792, "grad_norm": 0.16628224983624315, "learning_rate": 1.0373038241507017e-06, "loss": 0.4066, "step": 2408 }, { "epoch": 2.375647029824994, "grad_norm": 0.1548084367651464, "learning_rate": 1.03415183072826e-06, "loss": 0.4055, "step": 2409 }, { "epoch": 2.376632980034508, "grad_norm": 0.17412370535395244, "learning_rate": 1.031004081020785e-06, "loss": 0.4321, "step": 2410 }, { "epoch": 2.3776189302440227, "grad_norm": 0.16819021214759913, "learning_rate": 1.0278605783965712e-06, "loss": 0.4263, "step": 2411 }, { "epoch": 2.378604880453537, "grad_norm": 0.16241930999439794, "learning_rate": 1.0247213262193728e-06, "loss": 0.4172, "step": 2412 }, { "epoch": 2.3795908306630515, "grad_norm": 0.18316525935336245, "learning_rate": 1.021586327848389e-06, "loss": 0.4102, "step": 2413 }, { "epoch": 2.3805767808725657, "grad_norm": 0.16283384740375167, "learning_rate": 1.018455586638275e-06, "loss": 0.4209, "step": 2414 }, { "epoch": 2.3815627310820804, "grad_norm": 0.15909239037933473, "learning_rate": 1.0153291059391269e-06, "loss": 0.4025, "step": 2415 }, { "epoch": 2.3825486812915946, "grad_norm": 0.16703744818586627, "learning_rate": 1.012206889096481e-06, "loss": 0.4099, "step": 2416 }, { "epoch": 2.3835346315011092, "grad_norm": 0.16602750313595738, "learning_rate": 1.009088939451312e-06, "loss": 0.4203, "step": 2417 }, { "epoch": 2.3845205817106234, "grad_norm": 0.1602146542369932, "learning_rate": 1.0059752603400291e-06, "loss": 0.3774, "step": 2418 }, { "epoch": 2.385506531920138, "grad_norm": 0.15588894044415524, "learning_rate": 1.0028658550944703e-06, "loss": 0.3932, "step": 2419 }, { "epoch": 2.3864924821296523, "grad_norm": 0.16134547972178684, "learning_rate": 9.997607270419018e-07, "loss": 0.4115, "step": 2420 }, { "epoch": 2.387478432339167, "grad_norm": 0.18366412150660394, "learning_rate": 9.96659879505011e-07, "loss": 0.4022, "step": 2421 }, { "epoch": 2.388464382548681, "grad_norm": 0.15947815873855078, "learning_rate": 9.935633158019087e-07, "loss": 0.3991, "step": 2422 }, { "epoch": 2.3894503327581957, "grad_norm": 0.16201163898151466, "learning_rate": 9.90471039246116e-07, "loss": 0.4258, "step": 2423 }, { "epoch": 2.39043628296771, "grad_norm": 0.642697359816267, "learning_rate": 9.873830531465711e-07, "loss": 0.4156, "step": 2424 }, { "epoch": 2.3914222331772246, "grad_norm": 0.15781129262500931, "learning_rate": 9.842993608076174e-07, "loss": 0.4077, "step": 2425 }, { "epoch": 2.392408183386739, "grad_norm": 0.16218024963353128, "learning_rate": 9.812199655290095e-07, "loss": 0.4258, "step": 2426 }, { "epoch": 2.3933941335962534, "grad_norm": 0.16559005058587487, "learning_rate": 9.781448706058983e-07, "loss": 0.412, "step": 2427 }, { "epoch": 2.3943800838057676, "grad_norm": 0.16040452075409298, "learning_rate": 9.75074079328835e-07, "loss": 0.4281, "step": 2428 }, { "epoch": 2.3953660340152823, "grad_norm": 0.15969965446120735, "learning_rate": 9.720075949837659e-07, "loss": 0.4102, "step": 2429 }, { "epoch": 2.396351984224797, "grad_norm": 0.1760116297111136, "learning_rate": 9.689454208520276e-07, "loss": 0.4295, "step": 2430 }, { "epoch": 2.397337934434311, "grad_norm": 0.16506477568007438, "learning_rate": 9.658875602103461e-07, "loss": 0.4128, "step": 2431 }, { "epoch": 2.3983238846438253, "grad_norm": 0.1650611715314464, "learning_rate": 9.628340163308304e-07, "loss": 0.4168, "step": 2432 }, { "epoch": 2.39930983485334, "grad_norm": 0.158898169747212, "learning_rate": 9.5978479248097e-07, "loss": 0.4272, "step": 2433 }, { "epoch": 2.4002957850628546, "grad_norm": 0.16249408933862527, "learning_rate": 9.567398919236332e-07, "loss": 0.4217, "step": 2434 }, { "epoch": 2.4012817352723688, "grad_norm": 0.15661362275022744, "learning_rate": 9.536993179170612e-07, "loss": 0.4138, "step": 2435 }, { "epoch": 2.402267685481883, "grad_norm": 0.16349044834567097, "learning_rate": 9.506630737148642e-07, "loss": 0.4262, "step": 2436 }, { "epoch": 2.4032536356913976, "grad_norm": 0.15986245087226977, "learning_rate": 9.476311625660228e-07, "loss": 0.4027, "step": 2437 }, { "epoch": 2.4042395859009122, "grad_norm": 0.16044952949303962, "learning_rate": 9.446035877148785e-07, "loss": 0.4127, "step": 2438 }, { "epoch": 2.4052255361104264, "grad_norm": 0.16997371196521344, "learning_rate": 9.415803524011313e-07, "loss": 0.4071, "step": 2439 }, { "epoch": 2.4062114863199406, "grad_norm": 0.16335962168658272, "learning_rate": 9.385614598598386e-07, "loss": 0.4156, "step": 2440 }, { "epoch": 2.4071974365294553, "grad_norm": 0.161844721602425, "learning_rate": 9.35546913321414e-07, "loss": 0.4213, "step": 2441 }, { "epoch": 2.40818338673897, "grad_norm": 0.15902052235840378, "learning_rate": 9.325367160116167e-07, "loss": 0.399, "step": 2442 }, { "epoch": 2.409169336948484, "grad_norm": 0.16331959521354725, "learning_rate": 9.295308711515543e-07, "loss": 0.4076, "step": 2443 }, { "epoch": 2.4101552871579983, "grad_norm": 0.16060558877947725, "learning_rate": 9.265293819576726e-07, "loss": 0.4244, "step": 2444 }, { "epoch": 2.411141237367513, "grad_norm": 0.17080526417558123, "learning_rate": 9.235322516417633e-07, "loss": 0.4287, "step": 2445 }, { "epoch": 2.4121271875770276, "grad_norm": 0.16474200219189994, "learning_rate": 9.205394834109494e-07, "loss": 0.414, "step": 2446 }, { "epoch": 2.413113137786542, "grad_norm": 0.15616101277976135, "learning_rate": 9.175510804676868e-07, "loss": 0.4115, "step": 2447 }, { "epoch": 2.414099087996056, "grad_norm": 0.16259371580323215, "learning_rate": 9.145670460097606e-07, "loss": 0.41, "step": 2448 }, { "epoch": 2.4150850382055706, "grad_norm": 0.1605263138263877, "learning_rate": 9.115873832302818e-07, "loss": 0.403, "step": 2449 }, { "epoch": 2.4160709884150853, "grad_norm": 0.1640492641760322, "learning_rate": 9.08612095317683e-07, "loss": 0.3997, "step": 2450 }, { "epoch": 2.4170569386245995, "grad_norm": 0.15425470547068335, "learning_rate": 9.056411854557146e-07, "loss": 0.4098, "step": 2451 }, { "epoch": 2.4180428888341137, "grad_norm": 0.2353652395628097, "learning_rate": 9.026746568234424e-07, "loss": 0.4212, "step": 2452 }, { "epoch": 2.4190288390436283, "grad_norm": 0.17285285897048183, "learning_rate": 8.997125125952483e-07, "loss": 0.4289, "step": 2453 }, { "epoch": 2.420014789253143, "grad_norm": 0.1632310872062539, "learning_rate": 8.967547559408152e-07, "loss": 0.4125, "step": 2454 }, { "epoch": 2.421000739462657, "grad_norm": 0.15894065204491595, "learning_rate": 8.938013900251346e-07, "loss": 0.4224, "step": 2455 }, { "epoch": 2.4219866896721713, "grad_norm": 0.16483748236212414, "learning_rate": 8.908524180085021e-07, "loss": 0.4104, "step": 2456 }, { "epoch": 2.422972639881686, "grad_norm": 0.15904030114263468, "learning_rate": 8.879078430465082e-07, "loss": 0.41, "step": 2457 }, { "epoch": 2.4239585900912006, "grad_norm": 0.4352207499954632, "learning_rate": 8.849676682900399e-07, "loss": 0.4061, "step": 2458 }, { "epoch": 2.424944540300715, "grad_norm": 0.16021618921464745, "learning_rate": 8.82031896885272e-07, "loss": 0.4339, "step": 2459 }, { "epoch": 2.425930490510229, "grad_norm": 0.16891102494389298, "learning_rate": 8.79100531973674e-07, "loss": 0.4089, "step": 2460 }, { "epoch": 2.4269164407197437, "grad_norm": 0.15956671598032787, "learning_rate": 8.761735766919955e-07, "loss": 0.4087, "step": 2461 }, { "epoch": 2.4279023909292583, "grad_norm": 0.15840194748311953, "learning_rate": 8.732510341722678e-07, "loss": 0.4186, "step": 2462 }, { "epoch": 2.4288883411387725, "grad_norm": 0.16173441730914487, "learning_rate": 8.703329075418021e-07, "loss": 0.4245, "step": 2463 }, { "epoch": 2.4298742913482867, "grad_norm": 0.16185462013620347, "learning_rate": 8.674191999231835e-07, "loss": 0.4431, "step": 2464 }, { "epoch": 2.4308602415578013, "grad_norm": 0.15926357057296528, "learning_rate": 8.645099144342672e-07, "loss": 0.3945, "step": 2465 }, { "epoch": 2.431846191767316, "grad_norm": 0.1637089151876667, "learning_rate": 8.616050541881782e-07, "loss": 0.4168, "step": 2466 }, { "epoch": 2.43283214197683, "grad_norm": 0.1597325067081291, "learning_rate": 8.587046222933038e-07, "loss": 0.4099, "step": 2467 }, { "epoch": 2.4338180921863444, "grad_norm": 0.1652612101877314, "learning_rate": 8.55808621853299e-07, "loss": 0.4121, "step": 2468 }, { "epoch": 2.434804042395859, "grad_norm": 0.1558208278498863, "learning_rate": 8.529170559670674e-07, "loss": 0.4186, "step": 2469 }, { "epoch": 2.4357899926053737, "grad_norm": 0.16267826283367576, "learning_rate": 8.500299277287744e-07, "loss": 0.3965, "step": 2470 }, { "epoch": 2.436775942814888, "grad_norm": 0.16653164908480583, "learning_rate": 8.47147240227833e-07, "loss": 0.422, "step": 2471 }, { "epoch": 2.437761893024402, "grad_norm": 0.15965025920038586, "learning_rate": 8.442689965489087e-07, "loss": 0.4152, "step": 2472 }, { "epoch": 2.4387478432339167, "grad_norm": 0.1634090846806474, "learning_rate": 8.413951997719083e-07, "loss": 0.3924, "step": 2473 }, { "epoch": 2.4397337934434313, "grad_norm": 0.15968750027676784, "learning_rate": 8.385258529719781e-07, "loss": 0.4201, "step": 2474 }, { "epoch": 2.4407197436529455, "grad_norm": 0.16285082108863647, "learning_rate": 8.356609592195081e-07, "loss": 0.4077, "step": 2475 }, { "epoch": 2.4417056938624597, "grad_norm": 0.16079529099764747, "learning_rate": 8.328005215801205e-07, "loss": 0.4261, "step": 2476 }, { "epoch": 2.4426916440719744, "grad_norm": 0.17260608545522937, "learning_rate": 8.299445431146686e-07, "loss": 0.4062, "step": 2477 }, { "epoch": 2.443677594281489, "grad_norm": 0.16877987809000522, "learning_rate": 8.270930268792343e-07, "loss": 0.4231, "step": 2478 }, { "epoch": 2.444663544491003, "grad_norm": 0.1603976054916282, "learning_rate": 8.242459759251259e-07, "loss": 0.405, "step": 2479 }, { "epoch": 2.4456494947005174, "grad_norm": 0.16335409022031283, "learning_rate": 8.214033932988724e-07, "loss": 0.4152, "step": 2480 }, { "epoch": 2.446635444910032, "grad_norm": 0.16189146972601887, "learning_rate": 8.185652820422219e-07, "loss": 0.4255, "step": 2481 }, { "epoch": 2.4476213951195467, "grad_norm": 0.15963137364834074, "learning_rate": 8.15731645192136e-07, "loss": 0.4021, "step": 2482 }, { "epoch": 2.448607345329061, "grad_norm": 0.16265749135551338, "learning_rate": 8.129024857807943e-07, "loss": 0.437, "step": 2483 }, { "epoch": 2.449593295538575, "grad_norm": 0.16056692700621705, "learning_rate": 8.100778068355769e-07, "loss": 0.4249, "step": 2484 }, { "epoch": 2.4505792457480897, "grad_norm": 0.16598876062970291, "learning_rate": 8.072576113790754e-07, "loss": 0.4051, "step": 2485 }, { "epoch": 2.4515651959576044, "grad_norm": 0.1608088923146115, "learning_rate": 8.0444190242908e-07, "loss": 0.4102, "step": 2486 }, { "epoch": 2.4525511461671186, "grad_norm": 0.16286213988022255, "learning_rate": 8.016306829985848e-07, "loss": 0.4214, "step": 2487 }, { "epoch": 2.4535370963766328, "grad_norm": 0.16028060307774927, "learning_rate": 7.988239560957773e-07, "loss": 0.4178, "step": 2488 }, { "epoch": 2.4545230465861474, "grad_norm": 0.16714415782305406, "learning_rate": 7.960217247240342e-07, "loss": 0.4093, "step": 2489 }, { "epoch": 2.455508996795662, "grad_norm": 0.16666325491450767, "learning_rate": 7.932239918819262e-07, "loss": 0.4175, "step": 2490 }, { "epoch": 2.4564949470051762, "grad_norm": 0.17489421957601758, "learning_rate": 7.904307605632111e-07, "loss": 0.437, "step": 2491 }, { "epoch": 2.4574808972146904, "grad_norm": 0.17206936312624158, "learning_rate": 7.876420337568264e-07, "loss": 0.4237, "step": 2492 }, { "epoch": 2.458466847424205, "grad_norm": 0.1947580103630649, "learning_rate": 7.848578144468899e-07, "loss": 0.425, "step": 2493 }, { "epoch": 2.4594527976337197, "grad_norm": 0.15966031523643928, "learning_rate": 7.820781056126986e-07, "loss": 0.4273, "step": 2494 }, { "epoch": 2.460438747843234, "grad_norm": 0.15855053189533086, "learning_rate": 7.793029102287202e-07, "loss": 0.3993, "step": 2495 }, { "epoch": 2.461424698052748, "grad_norm": 0.16258951492067503, "learning_rate": 7.76532231264594e-07, "loss": 0.4226, "step": 2496 }, { "epoch": 2.4624106482622627, "grad_norm": 0.1565035184019167, "learning_rate": 7.73766071685127e-07, "loss": 0.3969, "step": 2497 }, { "epoch": 2.4633965984717774, "grad_norm": 0.1670879812733264, "learning_rate": 7.710044344502893e-07, "loss": 0.4049, "step": 2498 }, { "epoch": 2.4643825486812916, "grad_norm": 0.1611325585036167, "learning_rate": 7.682473225152115e-07, "loss": 0.41, "step": 2499 }, { "epoch": 2.465368498890806, "grad_norm": 0.16312168729718626, "learning_rate": 7.654947388301826e-07, "loss": 0.4094, "step": 2500 }, { "epoch": 2.4663544491003204, "grad_norm": 0.17710984544089278, "learning_rate": 7.627466863406446e-07, "loss": 0.3929, "step": 2501 }, { "epoch": 2.467340399309835, "grad_norm": 0.16386566596517432, "learning_rate": 7.600031679871944e-07, "loss": 0.4092, "step": 2502 }, { "epoch": 2.4683263495193493, "grad_norm": 0.17407210750463106, "learning_rate": 7.572641867055752e-07, "loss": 0.4061, "step": 2503 }, { "epoch": 2.4693122997288635, "grad_norm": 0.1586376619062199, "learning_rate": 7.54529745426672e-07, "loss": 0.4091, "step": 2504 }, { "epoch": 2.470298249938378, "grad_norm": 0.20293350660629642, "learning_rate": 7.517998470765142e-07, "loss": 0.4152, "step": 2505 }, { "epoch": 2.4712842001478927, "grad_norm": 0.15537060243547707, "learning_rate": 7.490744945762729e-07, "loss": 0.414, "step": 2506 }, { "epoch": 2.472270150357407, "grad_norm": 0.16887866356080697, "learning_rate": 7.463536908422508e-07, "loss": 0.3985, "step": 2507 }, { "epoch": 2.473256100566921, "grad_norm": 0.16196577013182265, "learning_rate": 7.436374387858863e-07, "loss": 0.4086, "step": 2508 }, { "epoch": 2.4742420507764358, "grad_norm": 0.1583935835828545, "learning_rate": 7.409257413137411e-07, "loss": 0.3896, "step": 2509 }, { "epoch": 2.4752280009859504, "grad_norm": 0.16004952218937224, "learning_rate": 7.382186013275117e-07, "loss": 0.4009, "step": 2510 }, { "epoch": 2.4762139511954646, "grad_norm": 0.16154490144280742, "learning_rate": 7.355160217240114e-07, "loss": 0.4127, "step": 2511 }, { "epoch": 2.477199901404979, "grad_norm": 0.16446533261954524, "learning_rate": 7.328180053951773e-07, "loss": 0.395, "step": 2512 }, { "epoch": 2.4781858516144935, "grad_norm": 0.15693037541777302, "learning_rate": 7.301245552280594e-07, "loss": 0.3993, "step": 2513 }, { "epoch": 2.479171801824008, "grad_norm": 0.1626364085065384, "learning_rate": 7.274356741048283e-07, "loss": 0.4094, "step": 2514 }, { "epoch": 2.4801577520335223, "grad_norm": 0.16596135636991863, "learning_rate": 7.247513649027582e-07, "loss": 0.4203, "step": 2515 }, { "epoch": 2.4811437022430365, "grad_norm": 0.20416447092391282, "learning_rate": 7.220716304942349e-07, "loss": 0.4174, "step": 2516 }, { "epoch": 2.482129652452551, "grad_norm": 0.16086469498638334, "learning_rate": 7.193964737467474e-07, "loss": 0.4303, "step": 2517 }, { "epoch": 2.4831156026620658, "grad_norm": 0.18584061035614108, "learning_rate": 7.167258975228886e-07, "loss": 0.4186, "step": 2518 }, { "epoch": 2.48410155287158, "grad_norm": 0.16116384490845706, "learning_rate": 7.140599046803492e-07, "loss": 0.4343, "step": 2519 }, { "epoch": 2.485087503081094, "grad_norm": 0.16595715949503784, "learning_rate": 7.113984980719107e-07, "loss": 0.416, "step": 2520 }, { "epoch": 2.486073453290609, "grad_norm": 0.16213255355579434, "learning_rate": 7.08741680545455e-07, "loss": 0.411, "step": 2521 }, { "epoch": 2.4870594035001234, "grad_norm": 0.5202621030835727, "learning_rate": 7.060894549439474e-07, "loss": 0.4251, "step": 2522 }, { "epoch": 2.4880453537096376, "grad_norm": 0.1626132249365722, "learning_rate": 7.034418241054414e-07, "loss": 0.4244, "step": 2523 }, { "epoch": 2.489031303919152, "grad_norm": 0.1658844835442372, "learning_rate": 7.007987908630742e-07, "loss": 0.4194, "step": 2524 }, { "epoch": 2.4900172541286665, "grad_norm": 0.16676631005694778, "learning_rate": 6.98160358045063e-07, "loss": 0.4097, "step": 2525 }, { "epoch": 2.491003204338181, "grad_norm": 0.16689683870697927, "learning_rate": 6.955265284747026e-07, "loss": 0.4123, "step": 2526 }, { "epoch": 2.4919891545476953, "grad_norm": 0.16335327159378088, "learning_rate": 6.928973049703608e-07, "loss": 0.4082, "step": 2527 }, { "epoch": 2.4929751047572095, "grad_norm": 0.16794880378266772, "learning_rate": 6.902726903454765e-07, "loss": 0.4147, "step": 2528 }, { "epoch": 2.493961054966724, "grad_norm": 0.16809494987324144, "learning_rate": 6.876526874085609e-07, "loss": 0.4094, "step": 2529 }, { "epoch": 2.494947005176239, "grad_norm": 0.161228286434843, "learning_rate": 6.850372989631842e-07, "loss": 0.4082, "step": 2530 }, { "epoch": 2.495932955385753, "grad_norm": 0.16522154052752383, "learning_rate": 6.824265278079834e-07, "loss": 0.4213, "step": 2531 }, { "epoch": 2.496918905595267, "grad_norm": 0.2903809047831653, "learning_rate": 6.798203767366507e-07, "loss": 0.4135, "step": 2532 }, { "epoch": 2.497904855804782, "grad_norm": 0.1596744108428839, "learning_rate": 6.7721884853794e-07, "loss": 0.4165, "step": 2533 }, { "epoch": 2.4988908060142965, "grad_norm": 0.15766713592881945, "learning_rate": 6.746219459956554e-07, "loss": 0.4062, "step": 2534 }, { "epoch": 2.4998767562238107, "grad_norm": 0.16199793614343647, "learning_rate": 6.720296718886488e-07, "loss": 0.4071, "step": 2535 }, { "epoch": 2.500862706433325, "grad_norm": 0.16602228019139734, "learning_rate": 6.694420289908215e-07, "loss": 0.4209, "step": 2536 }, { "epoch": 2.5018486566428395, "grad_norm": 0.15969240055857473, "learning_rate": 6.668590200711222e-07, "loss": 0.4037, "step": 2537 }, { "epoch": 2.502834606852354, "grad_norm": 0.1598104968992545, "learning_rate": 6.642806478935359e-07, "loss": 0.4161, "step": 2538 }, { "epoch": 2.5038205570618683, "grad_norm": 0.16839173517332648, "learning_rate": 6.617069152170896e-07, "loss": 0.414, "step": 2539 }, { "epoch": 2.5048065072713825, "grad_norm": 0.163374646034395, "learning_rate": 6.591378247958435e-07, "loss": 0.4239, "step": 2540 } ], "logging_steps": 1, "max_steps": 3042, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 254, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.802771677524787e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }