{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.7757731958762886, "eval_steps": 49, "global_step": 1164, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002577319587628866, "grad_norm": 3.206880709337614, "learning_rate": 5e-08, "loss": 1.772, "step": 1 }, { "epoch": 0.002577319587628866, "eval_loss": 1.6304376125335693, "eval_runtime": 78.4604, "eval_samples_per_second": 21.195, "eval_steps_per_second": 1.326, "step": 1 }, { "epoch": 0.005154639175257732, "grad_norm": 3.3587112552116953, "learning_rate": 1e-07, "loss": 1.666, "step": 2 }, { "epoch": 0.007731958762886598, "grad_norm": 3.1385995190528324, "learning_rate": 1.5e-07, "loss": 1.5471, "step": 3 }, { "epoch": 0.010309278350515464, "grad_norm": 3.531264158181801, "learning_rate": 2e-07, "loss": 1.6718, "step": 4 }, { "epoch": 0.01288659793814433, "grad_norm": 2.993529294622099, "learning_rate": 1.9999979406617412e-07, "loss": 1.6334, "step": 5 }, { "epoch": 0.015463917525773196, "grad_norm": 3.151745142356583, "learning_rate": 1.999991762655447e-07, "loss": 1.5647, "step": 6 }, { "epoch": 0.01804123711340206, "grad_norm": 3.3440809481325333, "learning_rate": 1.9999814660065617e-07, "loss": 1.7122, "step": 7 }, { "epoch": 0.020618556701030927, "grad_norm": 3.1146822679211805, "learning_rate": 1.9999670507574944e-07, "loss": 1.5921, "step": 8 }, { "epoch": 0.023195876288659795, "grad_norm": 3.345986552710787, "learning_rate": 1.9999485169676173e-07, "loss": 1.7131, "step": 9 }, { "epoch": 0.02577319587628866, "grad_norm": 2.9626668283812045, "learning_rate": 1.9999258647132644e-07, "loss": 1.6699, "step": 10 }, { "epoch": 0.028350515463917526, "grad_norm": 3.4953806538783527, "learning_rate": 1.9998990940877333e-07, "loss": 1.6785, "step": 11 }, { "epoch": 0.030927835051546393, "grad_norm": 3.3004651951030097, "learning_rate": 1.9998682052012837e-07, "loss": 1.6681, "step": 12 }, { "epoch": 0.03350515463917526, "grad_norm": 2.9639928990218802, "learning_rate": 1.9998331981811364e-07, "loss": 1.5618, "step": 13 }, { "epoch": 0.03608247422680412, "grad_norm": 3.0779182905002234, "learning_rate": 1.9997940731714744e-07, "loss": 1.7039, "step": 14 }, { "epoch": 0.03865979381443299, "grad_norm": 2.9325641285273574, "learning_rate": 1.9997508303334409e-07, "loss": 1.6219, "step": 15 }, { "epoch": 0.041237113402061855, "grad_norm": 2.8809471060714555, "learning_rate": 1.9997034698451393e-07, "loss": 1.7566, "step": 16 }, { "epoch": 0.04381443298969072, "grad_norm": 3.341100705652755, "learning_rate": 1.999651991901632e-07, "loss": 1.6958, "step": 17 }, { "epoch": 0.04639175257731959, "grad_norm": 2.8521720798434216, "learning_rate": 1.9995963967149398e-07, "loss": 1.5833, "step": 18 }, { "epoch": 0.04896907216494845, "grad_norm": 3.2447253207769338, "learning_rate": 1.9995366845140414e-07, "loss": 1.6854, "step": 19 }, { "epoch": 0.05154639175257732, "grad_norm": 3.033054116340073, "learning_rate": 1.999472855544872e-07, "loss": 1.6768, "step": 20 }, { "epoch": 0.05412371134020619, "grad_norm": 2.7106967773151665, "learning_rate": 1.9994049100703232e-07, "loss": 1.5709, "step": 21 }, { "epoch": 0.05670103092783505, "grad_norm": 2.8131217459267974, "learning_rate": 1.9993328483702392e-07, "loss": 1.5352, "step": 22 }, { "epoch": 0.059278350515463915, "grad_norm": 2.9454711288855115, "learning_rate": 1.9992566707414195e-07, "loss": 1.6292, "step": 23 }, { "epoch": 0.061855670103092786, "grad_norm": 2.719048700095618, "learning_rate": 1.9991763774976155e-07, "loss": 1.6504, "step": 24 }, { "epoch": 0.06443298969072164, "grad_norm": 2.6465097422508914, "learning_rate": 1.9990919689695282e-07, "loss": 1.6398, "step": 25 }, { "epoch": 0.06701030927835051, "grad_norm": 2.565964847805824, "learning_rate": 1.9990034455048098e-07, "loss": 1.6024, "step": 26 }, { "epoch": 0.06958762886597938, "grad_norm": 2.4151701145393787, "learning_rate": 1.9989108074680595e-07, "loss": 1.6316, "step": 27 }, { "epoch": 0.07216494845360824, "grad_norm": 2.6823187985959276, "learning_rate": 1.998814055240823e-07, "loss": 1.7421, "step": 28 }, { "epoch": 0.07474226804123711, "grad_norm": 2.6044420857485755, "learning_rate": 1.998713189221592e-07, "loss": 1.5983, "step": 29 }, { "epoch": 0.07731958762886598, "grad_norm": 2.3579361514426784, "learning_rate": 1.9986082098258008e-07, "loss": 1.5468, "step": 30 }, { "epoch": 0.07989690721649484, "grad_norm": 2.3088177834083146, "learning_rate": 1.9984991174858257e-07, "loss": 1.5852, "step": 31 }, { "epoch": 0.08247422680412371, "grad_norm": 2.5839184979450005, "learning_rate": 1.9983859126509825e-07, "loss": 1.6647, "step": 32 }, { "epoch": 0.08505154639175258, "grad_norm": 2.2905291979602844, "learning_rate": 1.9982685957875257e-07, "loss": 1.5935, "step": 33 }, { "epoch": 0.08762886597938144, "grad_norm": 2.3660300818606568, "learning_rate": 1.998147167378645e-07, "loss": 1.7655, "step": 34 }, { "epoch": 0.09020618556701031, "grad_norm": 2.269544029552125, "learning_rate": 1.9980216279244653e-07, "loss": 1.6383, "step": 35 }, { "epoch": 0.09278350515463918, "grad_norm": 2.2148823132358477, "learning_rate": 1.9978919779420423e-07, "loss": 1.7191, "step": 36 }, { "epoch": 0.09536082474226804, "grad_norm": 2.295307555280267, "learning_rate": 1.9977582179653632e-07, "loss": 1.5571, "step": 37 }, { "epoch": 0.0979381443298969, "grad_norm": 2.1570012388049262, "learning_rate": 1.9976203485453414e-07, "loss": 1.642, "step": 38 }, { "epoch": 0.10051546391752578, "grad_norm": 2.327694183291453, "learning_rate": 1.9974783702498166e-07, "loss": 1.6388, "step": 39 }, { "epoch": 0.10309278350515463, "grad_norm": 2.3531823980910382, "learning_rate": 1.9973322836635516e-07, "loss": 1.6407, "step": 40 }, { "epoch": 0.1056701030927835, "grad_norm": 2.148246998681959, "learning_rate": 1.9971820893882297e-07, "loss": 1.6316, "step": 41 }, { "epoch": 0.10824742268041238, "grad_norm": 1.824359532091145, "learning_rate": 1.9970277880424528e-07, "loss": 1.4812, "step": 42 }, { "epoch": 0.11082474226804123, "grad_norm": 1.8420872667750698, "learning_rate": 1.9968693802617374e-07, "loss": 1.6208, "step": 43 }, { "epoch": 0.1134020618556701, "grad_norm": 1.9242569129206386, "learning_rate": 1.9967068666985148e-07, "loss": 1.6866, "step": 44 }, { "epoch": 0.11597938144329897, "grad_norm": 1.7555101549111227, "learning_rate": 1.9965402480221257e-07, "loss": 1.59, "step": 45 }, { "epoch": 0.11855670103092783, "grad_norm": 1.83328616320706, "learning_rate": 1.9963695249188181e-07, "loss": 1.7787, "step": 46 }, { "epoch": 0.1211340206185567, "grad_norm": 1.5464144842738474, "learning_rate": 1.9961946980917453e-07, "loss": 1.5605, "step": 47 }, { "epoch": 0.12371134020618557, "grad_norm": 1.5700132071559665, "learning_rate": 1.9960157682609632e-07, "loss": 1.5188, "step": 48 }, { "epoch": 0.12628865979381443, "grad_norm": 1.551927803815323, "learning_rate": 1.9958327361634247e-07, "loss": 1.5921, "step": 49 }, { "epoch": 0.12628865979381443, "eval_loss": 1.5858733654022217, "eval_runtime": 78.6563, "eval_samples_per_second": 21.143, "eval_steps_per_second": 1.322, "step": 49 }, { "epoch": 0.12886597938144329, "grad_norm": 1.6459186978386617, "learning_rate": 1.9956456025529805e-07, "loss": 1.6407, "step": 50 }, { "epoch": 0.13144329896907217, "grad_norm": 1.6778367242552643, "learning_rate": 1.9954543682003732e-07, "loss": 1.5755, "step": 51 }, { "epoch": 0.13402061855670103, "grad_norm": 1.5846228635636366, "learning_rate": 1.9952590338932356e-07, "loss": 1.5236, "step": 52 }, { "epoch": 0.13659793814432988, "grad_norm": 1.530322622789531, "learning_rate": 1.9950596004360864e-07, "loss": 1.6474, "step": 53 }, { "epoch": 0.13917525773195877, "grad_norm": 1.5541727762346491, "learning_rate": 1.994856068650327e-07, "loss": 1.5926, "step": 54 }, { "epoch": 0.14175257731958762, "grad_norm": 1.5422089413059752, "learning_rate": 1.9946484393742394e-07, "loss": 1.6057, "step": 55 }, { "epoch": 0.14432989690721648, "grad_norm": 1.5086078750620586, "learning_rate": 1.994436713462982e-07, "loss": 1.6139, "step": 56 }, { "epoch": 0.14690721649484537, "grad_norm": 1.4904490748313473, "learning_rate": 1.994220891788584e-07, "loss": 1.5613, "step": 57 }, { "epoch": 0.14948453608247422, "grad_norm": 1.4446085113828102, "learning_rate": 1.9940009752399457e-07, "loss": 1.5838, "step": 58 }, { "epoch": 0.15206185567010308, "grad_norm": 1.4944945344118559, "learning_rate": 1.9937769647228327e-07, "loss": 1.6009, "step": 59 }, { "epoch": 0.15463917525773196, "grad_norm": 1.3673177038874413, "learning_rate": 1.9935488611598714e-07, "loss": 1.5295, "step": 60 }, { "epoch": 0.15721649484536082, "grad_norm": 1.489918654317649, "learning_rate": 1.9933166654905465e-07, "loss": 1.6855, "step": 61 }, { "epoch": 0.15979381443298968, "grad_norm": 1.4085364811053838, "learning_rate": 1.993080378671197e-07, "loss": 1.6171, "step": 62 }, { "epoch": 0.16237113402061856, "grad_norm": 1.4063494910858265, "learning_rate": 1.992840001675012e-07, "loss": 1.548, "step": 63 }, { "epoch": 0.16494845360824742, "grad_norm": 1.4013900053822443, "learning_rate": 1.9925955354920263e-07, "loss": 1.5674, "step": 64 }, { "epoch": 0.16752577319587628, "grad_norm": 1.3995913424696536, "learning_rate": 1.9923469811291173e-07, "loss": 1.644, "step": 65 }, { "epoch": 0.17010309278350516, "grad_norm": 1.4951716735691833, "learning_rate": 1.99209433961e-07, "loss": 1.6752, "step": 66 }, { "epoch": 0.17268041237113402, "grad_norm": 1.4354454580093134, "learning_rate": 1.9918376119752226e-07, "loss": 1.6076, "step": 67 }, { "epoch": 0.17525773195876287, "grad_norm": 1.5307588716137506, "learning_rate": 1.9915767992821639e-07, "loss": 1.6192, "step": 68 }, { "epoch": 0.17783505154639176, "grad_norm": 1.37638400966553, "learning_rate": 1.9913119026050267e-07, "loss": 1.5744, "step": 69 }, { "epoch": 0.18041237113402062, "grad_norm": 1.3694054278862016, "learning_rate": 1.9910429230348344e-07, "loss": 1.4495, "step": 70 }, { "epoch": 0.18298969072164947, "grad_norm": 1.4276322894882787, "learning_rate": 1.9907698616794276e-07, "loss": 1.6427, "step": 71 }, { "epoch": 0.18556701030927836, "grad_norm": 1.475589693442013, "learning_rate": 1.990492719663457e-07, "loss": 1.6231, "step": 72 }, { "epoch": 0.18814432989690721, "grad_norm": 1.505476760952321, "learning_rate": 1.990211498128381e-07, "loss": 1.7036, "step": 73 }, { "epoch": 0.19072164948453607, "grad_norm": 1.4498365666960409, "learning_rate": 1.9899261982324607e-07, "loss": 1.5564, "step": 74 }, { "epoch": 0.19329896907216496, "grad_norm": 1.4542099562182622, "learning_rate": 1.9896368211507535e-07, "loss": 1.6012, "step": 75 }, { "epoch": 0.1958762886597938, "grad_norm": 1.408394462248393, "learning_rate": 1.9893433680751103e-07, "loss": 1.5493, "step": 76 }, { "epoch": 0.19845360824742267, "grad_norm": 1.4023960052363178, "learning_rate": 1.9890458402141688e-07, "loss": 1.6452, "step": 77 }, { "epoch": 0.20103092783505155, "grad_norm": 1.4823050133687188, "learning_rate": 1.988744238793351e-07, "loss": 1.5991, "step": 78 }, { "epoch": 0.2036082474226804, "grad_norm": 1.32937819085943, "learning_rate": 1.9884385650548548e-07, "loss": 1.5358, "step": 79 }, { "epoch": 0.20618556701030927, "grad_norm": 1.3471888309972797, "learning_rate": 1.9881288202576517e-07, "loss": 1.5426, "step": 80 }, { "epoch": 0.20876288659793815, "grad_norm": 1.34250330197651, "learning_rate": 1.98781500567748e-07, "loss": 1.5743, "step": 81 }, { "epoch": 0.211340206185567, "grad_norm": 1.3158395928293942, "learning_rate": 1.9874971226068412e-07, "loss": 1.5914, "step": 82 }, { "epoch": 0.21391752577319587, "grad_norm": 1.3088201655236604, "learning_rate": 1.9871751723549926e-07, "loss": 1.5307, "step": 83 }, { "epoch": 0.21649484536082475, "grad_norm": 1.4622234110087462, "learning_rate": 1.9868491562479426e-07, "loss": 1.6698, "step": 84 }, { "epoch": 0.2190721649484536, "grad_norm": 1.2966036743967264, "learning_rate": 1.9865190756284464e-07, "loss": 1.6172, "step": 85 }, { "epoch": 0.22164948453608246, "grad_norm": 1.3416821729559592, "learning_rate": 1.9861849318559995e-07, "loss": 1.6395, "step": 86 }, { "epoch": 0.22422680412371135, "grad_norm": 1.4246775767306445, "learning_rate": 1.9858467263068319e-07, "loss": 1.6048, "step": 87 }, { "epoch": 0.2268041237113402, "grad_norm": 1.332606463309659, "learning_rate": 1.9855044603739028e-07, "loss": 1.6383, "step": 88 }, { "epoch": 0.22938144329896906, "grad_norm": 1.380602547288226, "learning_rate": 1.9851581354668948e-07, "loss": 1.64, "step": 89 }, { "epoch": 0.23195876288659795, "grad_norm": 1.3407177446168135, "learning_rate": 1.984807753012208e-07, "loss": 1.7039, "step": 90 }, { "epoch": 0.2345360824742268, "grad_norm": 1.338866434398542, "learning_rate": 1.9844533144529547e-07, "loss": 1.5236, "step": 91 }, { "epoch": 0.23711340206185566, "grad_norm": 1.274500058980513, "learning_rate": 1.9840948212489526e-07, "loss": 1.5713, "step": 92 }, { "epoch": 0.23969072164948454, "grad_norm": 1.3410204352377493, "learning_rate": 1.9837322748767194e-07, "loss": 1.6058, "step": 93 }, { "epoch": 0.2422680412371134, "grad_norm": 1.3188947135915765, "learning_rate": 1.983365676829466e-07, "loss": 1.6209, "step": 94 }, { "epoch": 0.24484536082474226, "grad_norm": 1.2787506674738858, "learning_rate": 1.9829950286170913e-07, "loss": 1.5984, "step": 95 }, { "epoch": 0.24742268041237114, "grad_norm": 1.3508302652980064, "learning_rate": 1.9826203317661756e-07, "loss": 1.5126, "step": 96 }, { "epoch": 0.25, "grad_norm": 1.3775203706307013, "learning_rate": 1.9822415878199737e-07, "loss": 1.5806, "step": 97 }, { "epoch": 0.25257731958762886, "grad_norm": 1.3953183701272227, "learning_rate": 1.9818587983384095e-07, "loss": 1.6391, "step": 98 }, { "epoch": 0.25257731958762886, "eval_loss": 1.5530622005462646, "eval_runtime": 78.7591, "eval_samples_per_second": 21.115, "eval_steps_per_second": 1.32, "step": 98 }, { "epoch": 0.2551546391752577, "grad_norm": 1.2639205955569304, "learning_rate": 1.981471964898069e-07, "loss": 1.6154, "step": 99 }, { "epoch": 0.25773195876288657, "grad_norm": 1.33461619126327, "learning_rate": 1.9810810890921942e-07, "loss": 1.5841, "step": 100 }, { "epoch": 0.2603092783505155, "grad_norm": 1.3223001702133927, "learning_rate": 1.980686172530676e-07, "loss": 1.6292, "step": 101 }, { "epoch": 0.26288659793814434, "grad_norm": 1.2560649642869146, "learning_rate": 1.9802872168400478e-07, "loss": 1.5673, "step": 102 }, { "epoch": 0.2654639175257732, "grad_norm": 1.2597104528650152, "learning_rate": 1.9798842236634795e-07, "loss": 1.6508, "step": 103 }, { "epoch": 0.26804123711340205, "grad_norm": 1.407282635250448, "learning_rate": 1.979477194660769e-07, "loss": 1.4872, "step": 104 }, { "epoch": 0.2706185567010309, "grad_norm": 1.2016832149108632, "learning_rate": 1.9790661315083375e-07, "loss": 1.5604, "step": 105 }, { "epoch": 0.27319587628865977, "grad_norm": 1.149030350241683, "learning_rate": 1.978651035899221e-07, "loss": 1.421, "step": 106 }, { "epoch": 0.2757731958762887, "grad_norm": 1.3215975195174274, "learning_rate": 1.9782319095430643e-07, "loss": 1.5786, "step": 107 }, { "epoch": 0.27835051546391754, "grad_norm": 1.2703092272910235, "learning_rate": 1.9778087541661131e-07, "loss": 1.484, "step": 108 }, { "epoch": 0.2809278350515464, "grad_norm": 1.2413825121259754, "learning_rate": 1.9773815715112072e-07, "loss": 1.5041, "step": 109 }, { "epoch": 0.28350515463917525, "grad_norm": 1.2972955973409976, "learning_rate": 1.9769503633377743e-07, "loss": 1.5719, "step": 110 }, { "epoch": 0.2860824742268041, "grad_norm": 1.3905442390636398, "learning_rate": 1.9765151314218209e-07, "loss": 1.5788, "step": 111 }, { "epoch": 0.28865979381443296, "grad_norm": 1.269867236059509, "learning_rate": 1.976075877555927e-07, "loss": 1.5358, "step": 112 }, { "epoch": 0.2912371134020619, "grad_norm": 1.2521107632001138, "learning_rate": 1.975632603549237e-07, "loss": 1.5908, "step": 113 }, { "epoch": 0.29381443298969073, "grad_norm": 1.2496393834141784, "learning_rate": 1.9751853112274527e-07, "loss": 1.5506, "step": 114 }, { "epoch": 0.2963917525773196, "grad_norm": 1.2871218607928567, "learning_rate": 1.974734002432827e-07, "loss": 1.5275, "step": 115 }, { "epoch": 0.29896907216494845, "grad_norm": 1.2976234741205572, "learning_rate": 1.9742786790241546e-07, "loss": 1.5444, "step": 116 }, { "epoch": 0.3015463917525773, "grad_norm": 1.2017823329368622, "learning_rate": 1.9738193428767654e-07, "loss": 1.543, "step": 117 }, { "epoch": 0.30412371134020616, "grad_norm": 1.226770431675134, "learning_rate": 1.9733559958825167e-07, "loss": 1.5397, "step": 118 }, { "epoch": 0.30670103092783507, "grad_norm": 1.3442951015324778, "learning_rate": 1.9728886399497844e-07, "loss": 1.5852, "step": 119 }, { "epoch": 0.30927835051546393, "grad_norm": 1.2017473551527889, "learning_rate": 1.9724172770034564e-07, "loss": 1.5318, "step": 120 }, { "epoch": 0.3118556701030928, "grad_norm": 1.211656114042897, "learning_rate": 1.9719419089849246e-07, "loss": 1.5028, "step": 121 }, { "epoch": 0.31443298969072164, "grad_norm": 1.400130154858166, "learning_rate": 1.9714625378520756e-07, "loss": 1.5582, "step": 122 }, { "epoch": 0.3170103092783505, "grad_norm": 1.3086898697605782, "learning_rate": 1.9709791655792847e-07, "loss": 1.6549, "step": 123 }, { "epoch": 0.31958762886597936, "grad_norm": 1.278029367300382, "learning_rate": 1.9704917941574052e-07, "loss": 1.5557, "step": 124 }, { "epoch": 0.32216494845360827, "grad_norm": 1.2356382868741678, "learning_rate": 1.9700004255937627e-07, "loss": 1.5288, "step": 125 }, { "epoch": 0.3247422680412371, "grad_norm": 1.28937440464536, "learning_rate": 1.9695050619121457e-07, "loss": 1.5266, "step": 126 }, { "epoch": 0.327319587628866, "grad_norm": 1.4414848109811116, "learning_rate": 1.9690057051527963e-07, "loss": 1.6097, "step": 127 }, { "epoch": 0.32989690721649484, "grad_norm": 1.2136781418976954, "learning_rate": 1.9685023573724035e-07, "loss": 1.4935, "step": 128 }, { "epoch": 0.3324742268041237, "grad_norm": 1.3341115569144475, "learning_rate": 1.9679950206440948e-07, "loss": 1.5987, "step": 129 }, { "epoch": 0.33505154639175255, "grad_norm": 1.329559323076734, "learning_rate": 1.967483697057425e-07, "loss": 1.5782, "step": 130 }, { "epoch": 0.33762886597938147, "grad_norm": 1.2026583523005048, "learning_rate": 1.9669683887183714e-07, "loss": 1.5482, "step": 131 }, { "epoch": 0.3402061855670103, "grad_norm": 1.230715216092296, "learning_rate": 1.966449097749322e-07, "loss": 1.637, "step": 132 }, { "epoch": 0.3427835051546392, "grad_norm": 1.3616177214331797, "learning_rate": 1.965925826289068e-07, "loss": 1.5264, "step": 133 }, { "epoch": 0.34536082474226804, "grad_norm": 1.1816372421732182, "learning_rate": 1.965398576492796e-07, "loss": 1.5349, "step": 134 }, { "epoch": 0.3479381443298969, "grad_norm": 1.3503944653975188, "learning_rate": 1.964867350532077e-07, "loss": 1.5317, "step": 135 }, { "epoch": 0.35051546391752575, "grad_norm": 1.3016847854244378, "learning_rate": 1.9643321505948584e-07, "loss": 1.6062, "step": 136 }, { "epoch": 0.35309278350515466, "grad_norm": 1.19908669818476, "learning_rate": 1.9637929788854564e-07, "loss": 1.6179, "step": 137 }, { "epoch": 0.3556701030927835, "grad_norm": 1.1945706816984818, "learning_rate": 1.9632498376245445e-07, "loss": 1.5982, "step": 138 }, { "epoch": 0.3582474226804124, "grad_norm": 1.233096157789794, "learning_rate": 1.9627027290491458e-07, "loss": 1.572, "step": 139 }, { "epoch": 0.36082474226804123, "grad_norm": 1.2228780779938433, "learning_rate": 1.9621516554126237e-07, "loss": 1.5789, "step": 140 }, { "epoch": 0.3634020618556701, "grad_norm": 1.1898193013734535, "learning_rate": 1.961596618984672e-07, "loss": 1.4511, "step": 141 }, { "epoch": 0.36597938144329895, "grad_norm": 1.25230398028528, "learning_rate": 1.9610376220513066e-07, "loss": 1.5529, "step": 142 }, { "epoch": 0.36855670103092786, "grad_norm": 1.2693796938125035, "learning_rate": 1.960474666914855e-07, "loss": 1.5403, "step": 143 }, { "epoch": 0.3711340206185567, "grad_norm": 1.3275717703634924, "learning_rate": 1.9599077558939464e-07, "loss": 1.4989, "step": 144 }, { "epoch": 0.37371134020618557, "grad_norm": 1.1489906814896371, "learning_rate": 1.959336891323505e-07, "loss": 1.5074, "step": 145 }, { "epoch": 0.37628865979381443, "grad_norm": 1.1875368070507506, "learning_rate": 1.958762075554737e-07, "loss": 1.5219, "step": 146 }, { "epoch": 0.3788659793814433, "grad_norm": 1.2013715546004073, "learning_rate": 1.9581833109551228e-07, "loss": 1.5413, "step": 147 }, { "epoch": 0.3788659793814433, "eval_loss": 1.5337220430374146, "eval_runtime": 78.6436, "eval_samples_per_second": 21.146, "eval_steps_per_second": 1.322, "step": 147 }, { "epoch": 0.38144329896907214, "grad_norm": 1.348552262306386, "learning_rate": 1.9576005999084056e-07, "loss": 1.5713, "step": 148 }, { "epoch": 0.38402061855670105, "grad_norm": 1.2579524096365415, "learning_rate": 1.9570139448145852e-07, "loss": 1.5042, "step": 149 }, { "epoch": 0.3865979381443299, "grad_norm": 1.2007903800378994, "learning_rate": 1.9564233480899028e-07, "loss": 1.4753, "step": 150 }, { "epoch": 0.38917525773195877, "grad_norm": 1.14999357355067, "learning_rate": 1.955828812166836e-07, "loss": 1.489, "step": 151 }, { "epoch": 0.3917525773195876, "grad_norm": 1.2834202884360733, "learning_rate": 1.955230339494086e-07, "loss": 1.5672, "step": 152 }, { "epoch": 0.3943298969072165, "grad_norm": 1.2110339834614112, "learning_rate": 1.9546279325365675e-07, "loss": 1.5138, "step": 153 }, { "epoch": 0.39690721649484534, "grad_norm": 1.2447583871603898, "learning_rate": 1.9540215937754007e-07, "loss": 1.5324, "step": 154 }, { "epoch": 0.39948453608247425, "grad_norm": 1.2169740146814894, "learning_rate": 1.9534113257078978e-07, "loss": 1.5228, "step": 155 }, { "epoch": 0.4020618556701031, "grad_norm": 1.3339392292279337, "learning_rate": 1.9527971308475568e-07, "loss": 1.5537, "step": 156 }, { "epoch": 0.40463917525773196, "grad_norm": 1.1629410191581253, "learning_rate": 1.952179011724047e-07, "loss": 1.4565, "step": 157 }, { "epoch": 0.4072164948453608, "grad_norm": 1.2166854685328994, "learning_rate": 1.951556970883201e-07, "loss": 1.4996, "step": 158 }, { "epoch": 0.4097938144329897, "grad_norm": 1.1864599175194743, "learning_rate": 1.9509310108870037e-07, "loss": 1.5078, "step": 159 }, { "epoch": 0.41237113402061853, "grad_norm": 1.2614891919139117, "learning_rate": 1.9503011343135826e-07, "loss": 1.6787, "step": 160 }, { "epoch": 0.41494845360824745, "grad_norm": 1.2538176997908546, "learning_rate": 1.9496673437571945e-07, "loss": 1.5567, "step": 161 }, { "epoch": 0.4175257731958763, "grad_norm": 1.2100512003350425, "learning_rate": 1.9490296418282183e-07, "loss": 1.5835, "step": 162 }, { "epoch": 0.42010309278350516, "grad_norm": 1.176294102289334, "learning_rate": 1.9483880311531423e-07, "loss": 1.4902, "step": 163 }, { "epoch": 0.422680412371134, "grad_norm": 1.2400060721796176, "learning_rate": 1.9477425143745525e-07, "loss": 1.5971, "step": 164 }, { "epoch": 0.4252577319587629, "grad_norm": 1.1621100701911136, "learning_rate": 1.9470930941511243e-07, "loss": 1.5171, "step": 165 }, { "epoch": 0.42783505154639173, "grad_norm": 1.2424661949562683, "learning_rate": 1.9464397731576091e-07, "loss": 1.4954, "step": 166 }, { "epoch": 0.43041237113402064, "grad_norm": 1.23770627068237, "learning_rate": 1.9457825540848255e-07, "loss": 1.5326, "step": 167 }, { "epoch": 0.4329896907216495, "grad_norm": 1.1862612005970397, "learning_rate": 1.9451214396396453e-07, "loss": 1.4912, "step": 168 }, { "epoch": 0.43556701030927836, "grad_norm": 1.2831749441379539, "learning_rate": 1.9444564325449853e-07, "loss": 1.6117, "step": 169 }, { "epoch": 0.4381443298969072, "grad_norm": 1.1531718726331943, "learning_rate": 1.943787535539795e-07, "loss": 1.4855, "step": 170 }, { "epoch": 0.44072164948453607, "grad_norm": 1.1826441581231952, "learning_rate": 1.9431147513790446e-07, "loss": 1.5582, "step": 171 }, { "epoch": 0.44329896907216493, "grad_norm": 1.1887449944628656, "learning_rate": 1.9424380828337143e-07, "loss": 1.5564, "step": 172 }, { "epoch": 0.44587628865979384, "grad_norm": 1.249570543310612, "learning_rate": 1.9417575326907831e-07, "loss": 1.621, "step": 173 }, { "epoch": 0.4484536082474227, "grad_norm": 1.3090306728609684, "learning_rate": 1.941073103753217e-07, "loss": 1.5282, "step": 174 }, { "epoch": 0.45103092783505155, "grad_norm": 1.2503633263430554, "learning_rate": 1.9403847988399566e-07, "loss": 1.5513, "step": 175 }, { "epoch": 0.4536082474226804, "grad_norm": 1.2018168355345367, "learning_rate": 1.9396926207859085e-07, "loss": 1.4957, "step": 176 }, { "epoch": 0.45618556701030927, "grad_norm": 1.168765093642791, "learning_rate": 1.9389965724419288e-07, "loss": 1.5004, "step": 177 }, { "epoch": 0.4587628865979381, "grad_norm": 1.250633142422843, "learning_rate": 1.9382966566748167e-07, "loss": 1.5387, "step": 178 }, { "epoch": 0.46134020618556704, "grad_norm": 1.171229347123422, "learning_rate": 1.9375928763672982e-07, "loss": 1.596, "step": 179 }, { "epoch": 0.4639175257731959, "grad_norm": 1.1693848944378227, "learning_rate": 1.9368852344180166e-07, "loss": 1.5147, "step": 180 }, { "epoch": 0.46649484536082475, "grad_norm": 1.2828987442740891, "learning_rate": 1.9361737337415204e-07, "loss": 1.5539, "step": 181 }, { "epoch": 0.4690721649484536, "grad_norm": 1.1925907017733204, "learning_rate": 1.9354583772682512e-07, "loss": 1.5752, "step": 182 }, { "epoch": 0.47164948453608246, "grad_norm": 1.321152376647017, "learning_rate": 1.93473916794453e-07, "loss": 1.5952, "step": 183 }, { "epoch": 0.4742268041237113, "grad_norm": 1.2480635026506552, "learning_rate": 1.934016108732548e-07, "loss": 1.5068, "step": 184 }, { "epoch": 0.47680412371134023, "grad_norm": 1.2890663133137021, "learning_rate": 1.9332892026103517e-07, "loss": 1.4498, "step": 185 }, { "epoch": 0.4793814432989691, "grad_norm": 1.278439525246191, "learning_rate": 1.932558452571833e-07, "loss": 1.5061, "step": 186 }, { "epoch": 0.48195876288659795, "grad_norm": 1.2481302944858157, "learning_rate": 1.931823861626714e-07, "loss": 1.5672, "step": 187 }, { "epoch": 0.4845360824742268, "grad_norm": 1.2421848632538859, "learning_rate": 1.9310854328005378e-07, "loss": 1.4985, "step": 188 }, { "epoch": 0.48711340206185566, "grad_norm": 1.1840656288458875, "learning_rate": 1.930343169134654e-07, "loss": 1.556, "step": 189 }, { "epoch": 0.4896907216494845, "grad_norm": 1.2585791993336888, "learning_rate": 1.929597073686206e-07, "loss": 1.5539, "step": 190 }, { "epoch": 0.49226804123711343, "grad_norm": 1.123656686890668, "learning_rate": 1.9288471495281203e-07, "loss": 1.5377, "step": 191 }, { "epoch": 0.4948453608247423, "grad_norm": 1.276688134117863, "learning_rate": 1.9280933997490912e-07, "loss": 1.5845, "step": 192 }, { "epoch": 0.49742268041237114, "grad_norm": 1.231953746707157, "learning_rate": 1.9273358274535702e-07, "loss": 1.6142, "step": 193 }, { "epoch": 0.5, "grad_norm": 1.3230553754067966, "learning_rate": 1.926574435761753e-07, "loss": 1.4738, "step": 194 }, { "epoch": 0.5025773195876289, "grad_norm": 1.2436732656409537, "learning_rate": 1.9258092278095657e-07, "loss": 1.5969, "step": 195 }, { "epoch": 0.5051546391752577, "grad_norm": 1.221047910828976, "learning_rate": 1.925040206748652e-07, "loss": 1.5962, "step": 196 }, { "epoch": 0.5051546391752577, "eval_loss": 1.520858883857727, "eval_runtime": 78.5683, "eval_samples_per_second": 21.166, "eval_steps_per_second": 1.324, "step": 196 }, { "epoch": 0.5077319587628866, "grad_norm": 1.2212270479150868, "learning_rate": 1.924267375746361e-07, "loss": 1.5033, "step": 197 }, { "epoch": 0.5103092783505154, "grad_norm": 1.2178250609326542, "learning_rate": 1.9234907379857334e-07, "loss": 1.577, "step": 198 }, { "epoch": 0.5128865979381443, "grad_norm": 1.1521118751035526, "learning_rate": 1.9227102966654895e-07, "loss": 1.4468, "step": 199 }, { "epoch": 0.5154639175257731, "grad_norm": 1.2132226025196962, "learning_rate": 1.9219260550000143e-07, "loss": 1.5135, "step": 200 }, { "epoch": 0.5180412371134021, "grad_norm": 1.191186345232448, "learning_rate": 1.921138016219345e-07, "loss": 1.5146, "step": 201 }, { "epoch": 0.520618556701031, "grad_norm": 1.2208830731174638, "learning_rate": 1.9203461835691592e-07, "loss": 1.5452, "step": 202 }, { "epoch": 0.5231958762886598, "grad_norm": 1.2176060346511148, "learning_rate": 1.9195505603107594e-07, "loss": 1.5144, "step": 203 }, { "epoch": 0.5257731958762887, "grad_norm": 1.1351041872872305, "learning_rate": 1.9187511497210597e-07, "loss": 1.5463, "step": 204 }, { "epoch": 0.5283505154639175, "grad_norm": 1.1782470225350157, "learning_rate": 1.9179479550925747e-07, "loss": 1.4878, "step": 205 }, { "epoch": 0.5309278350515464, "grad_norm": 1.0942788691010794, "learning_rate": 1.9171409797334025e-07, "loss": 1.5423, "step": 206 }, { "epoch": 0.5335051546391752, "grad_norm": 1.2422690533739307, "learning_rate": 1.9163302269672137e-07, "loss": 1.5543, "step": 207 }, { "epoch": 0.5360824742268041, "grad_norm": 1.187410857798478, "learning_rate": 1.9155157001332372e-07, "loss": 1.4864, "step": 208 }, { "epoch": 0.538659793814433, "grad_norm": 1.2521757262499582, "learning_rate": 1.9146974025862448e-07, "loss": 1.5678, "step": 209 }, { "epoch": 0.5412371134020618, "grad_norm": 1.1895335891190835, "learning_rate": 1.91387533769654e-07, "loss": 1.5359, "step": 210 }, { "epoch": 0.5438144329896907, "grad_norm": 1.156080510817116, "learning_rate": 1.9130495088499417e-07, "loss": 1.4179, "step": 211 }, { "epoch": 0.5463917525773195, "grad_norm": 1.2160395280121006, "learning_rate": 1.912219919447772e-07, "loss": 1.5288, "step": 212 }, { "epoch": 0.5489690721649485, "grad_norm": 1.187251015976325, "learning_rate": 1.9113865729068413e-07, "loss": 1.5829, "step": 213 }, { "epoch": 0.5515463917525774, "grad_norm": 1.2325994836421947, "learning_rate": 1.9105494726594342e-07, "loss": 1.5918, "step": 214 }, { "epoch": 0.5541237113402062, "grad_norm": 1.2136013415323126, "learning_rate": 1.9097086221532964e-07, "loss": 1.5093, "step": 215 }, { "epoch": 0.5567010309278351, "grad_norm": 1.1685027007257103, "learning_rate": 1.9088640248516185e-07, "loss": 1.5992, "step": 216 }, { "epoch": 0.5592783505154639, "grad_norm": 1.2470178729913264, "learning_rate": 1.908015684233024e-07, "loss": 1.5845, "step": 217 }, { "epoch": 0.5618556701030928, "grad_norm": 1.3342781963513264, "learning_rate": 1.9071636037915533e-07, "loss": 1.5227, "step": 218 }, { "epoch": 0.5644329896907216, "grad_norm": 1.2834111003737632, "learning_rate": 1.90630778703665e-07, "loss": 1.5278, "step": 219 }, { "epoch": 0.5670103092783505, "grad_norm": 1.2731317285054349, "learning_rate": 1.9054482374931466e-07, "loss": 1.558, "step": 220 }, { "epoch": 0.5695876288659794, "grad_norm": 1.2315820199483811, "learning_rate": 1.9045849587012496e-07, "loss": 1.5586, "step": 221 }, { "epoch": 0.5721649484536082, "grad_norm": 1.2995032591648374, "learning_rate": 1.9037179542165253e-07, "loss": 1.5726, "step": 222 }, { "epoch": 0.5747422680412371, "grad_norm": 1.2207628382258247, "learning_rate": 1.902847227609884e-07, "loss": 1.5622, "step": 223 }, { "epoch": 0.5773195876288659, "grad_norm": 1.1578307509849368, "learning_rate": 1.901972782467568e-07, "loss": 1.5029, "step": 224 }, { "epoch": 0.5798969072164949, "grad_norm": 1.2559554939477484, "learning_rate": 1.9010946223911333e-07, "loss": 1.5536, "step": 225 }, { "epoch": 0.5824742268041238, "grad_norm": 1.1912957688409214, "learning_rate": 1.9002127509974374e-07, "loss": 1.4107, "step": 226 }, { "epoch": 0.5850515463917526, "grad_norm": 1.347391803127549, "learning_rate": 1.899327171918623e-07, "loss": 1.4981, "step": 227 }, { "epoch": 0.5876288659793815, "grad_norm": 1.1735029116257494, "learning_rate": 1.8984378888021042e-07, "loss": 1.4931, "step": 228 }, { "epoch": 0.5902061855670103, "grad_norm": 1.1491563326269614, "learning_rate": 1.8975449053105503e-07, "loss": 1.439, "step": 229 }, { "epoch": 0.5927835051546392, "grad_norm": 1.1281459530728108, "learning_rate": 1.8966482251218715e-07, "loss": 1.5317, "step": 230 }, { "epoch": 0.595360824742268, "grad_norm": 1.1698523464033057, "learning_rate": 1.8957478519292032e-07, "loss": 1.533, "step": 231 }, { "epoch": 0.5979381443298969, "grad_norm": 1.2253794089203258, "learning_rate": 1.8948437894408918e-07, "loss": 1.566, "step": 232 }, { "epoch": 0.6005154639175257, "grad_norm": 1.2704578177761554, "learning_rate": 1.893936041380478e-07, "loss": 1.5496, "step": 233 }, { "epoch": 0.6030927835051546, "grad_norm": 1.270569192705897, "learning_rate": 1.8930246114866822e-07, "loss": 1.4762, "step": 234 }, { "epoch": 0.6056701030927835, "grad_norm": 1.1748786103242588, "learning_rate": 1.8921095035133896e-07, "loss": 1.5641, "step": 235 }, { "epoch": 0.6082474226804123, "grad_norm": 1.2029791452687832, "learning_rate": 1.891190721229634e-07, "loss": 1.5694, "step": 236 }, { "epoch": 0.6108247422680413, "grad_norm": 1.19680587233996, "learning_rate": 1.890268268419582e-07, "loss": 1.5538, "step": 237 }, { "epoch": 0.6134020618556701, "grad_norm": 1.1874592772095638, "learning_rate": 1.8893421488825187e-07, "loss": 1.4978, "step": 238 }, { "epoch": 0.615979381443299, "grad_norm": 1.216069233807722, "learning_rate": 1.888412366432831e-07, "loss": 1.584, "step": 239 }, { "epoch": 0.6185567010309279, "grad_norm": 1.2090175073299552, "learning_rate": 1.8874789248999913e-07, "loss": 1.5486, "step": 240 }, { "epoch": 0.6211340206185567, "grad_norm": 1.1599735542109655, "learning_rate": 1.8865418281285444e-07, "loss": 1.512, "step": 241 }, { "epoch": 0.6237113402061856, "grad_norm": 1.1508476690774565, "learning_rate": 1.885601079978088e-07, "loss": 1.4699, "step": 242 }, { "epoch": 0.6262886597938144, "grad_norm": 1.294126202956922, "learning_rate": 1.8846566843232594e-07, "loss": 1.6185, "step": 243 }, { "epoch": 0.6288659793814433, "grad_norm": 1.1538551018422412, "learning_rate": 1.883708645053719e-07, "loss": 1.5284, "step": 244 }, { "epoch": 0.6314432989690721, "grad_norm": 1.1790058528070886, "learning_rate": 1.882756966074134e-07, "loss": 1.5235, "step": 245 }, { "epoch": 0.6314432989690721, "eval_loss": 1.510589361190796, "eval_runtime": 78.6198, "eval_samples_per_second": 21.152, "eval_steps_per_second": 1.323, "step": 245 }, { "epoch": 0.634020618556701, "grad_norm": 1.1938102380471263, "learning_rate": 1.8818016513041623e-07, "loss": 1.5028, "step": 246 }, { "epoch": 0.6365979381443299, "grad_norm": 1.231310461159998, "learning_rate": 1.8808427046784362e-07, "loss": 1.5686, "step": 247 }, { "epoch": 0.6391752577319587, "grad_norm": 1.3015696329059996, "learning_rate": 1.8798801301465467e-07, "loss": 1.579, "step": 248 }, { "epoch": 0.6417525773195877, "grad_norm": 1.1482602866030465, "learning_rate": 1.8789139316730269e-07, "loss": 1.5331, "step": 249 }, { "epoch": 0.6443298969072165, "grad_norm": 1.231219314227984, "learning_rate": 1.8779441132373359e-07, "loss": 1.5366, "step": 250 }, { "epoch": 0.6469072164948454, "grad_norm": 1.2531642119413817, "learning_rate": 1.876970678833842e-07, "loss": 1.5246, "step": 251 }, { "epoch": 0.6494845360824743, "grad_norm": 1.1332607994718875, "learning_rate": 1.8759936324718066e-07, "loss": 1.5029, "step": 252 }, { "epoch": 0.6520618556701031, "grad_norm": 1.123414985710231, "learning_rate": 1.8750129781753677e-07, "loss": 1.5992, "step": 253 }, { "epoch": 0.654639175257732, "grad_norm": 1.1601574273566644, "learning_rate": 1.874028719983523e-07, "loss": 1.4271, "step": 254 }, { "epoch": 0.6572164948453608, "grad_norm": 1.2155208006708451, "learning_rate": 1.8730408619501138e-07, "loss": 1.5939, "step": 255 }, { "epoch": 0.6597938144329897, "grad_norm": 1.181434829014358, "learning_rate": 1.8720494081438076e-07, "loss": 1.5416, "step": 256 }, { "epoch": 0.6623711340206185, "grad_norm": 1.1457316456562228, "learning_rate": 1.8710543626480818e-07, "loss": 1.4854, "step": 257 }, { "epoch": 0.6649484536082474, "grad_norm": 1.1872624778137861, "learning_rate": 1.8700557295612072e-07, "loss": 1.5045, "step": 258 }, { "epoch": 0.6675257731958762, "grad_norm": 1.2856636838183533, "learning_rate": 1.8690535129962305e-07, "loss": 1.4678, "step": 259 }, { "epoch": 0.6701030927835051, "grad_norm": 1.131984435899355, "learning_rate": 1.8680477170809572e-07, "loss": 1.5706, "step": 260 }, { "epoch": 0.6726804123711341, "grad_norm": 1.2653048133418598, "learning_rate": 1.8670383459579356e-07, "loss": 1.5623, "step": 261 }, { "epoch": 0.6752577319587629, "grad_norm": 1.2245543813976405, "learning_rate": 1.8660254037844388e-07, "loss": 1.5039, "step": 262 }, { "epoch": 0.6778350515463918, "grad_norm": 1.1778675556929805, "learning_rate": 1.8650088947324475e-07, "loss": 1.5143, "step": 263 }, { "epoch": 0.6804123711340206, "grad_norm": 1.1796106429583424, "learning_rate": 1.863988822988634e-07, "loss": 1.5867, "step": 264 }, { "epoch": 0.6829896907216495, "grad_norm": 1.143095546666012, "learning_rate": 1.8629651927543443e-07, "loss": 1.4735, "step": 265 }, { "epoch": 0.6855670103092784, "grad_norm": 1.1803235220482347, "learning_rate": 1.8619380082455796e-07, "loss": 1.4606, "step": 266 }, { "epoch": 0.6881443298969072, "grad_norm": 1.2218442431344259, "learning_rate": 1.8609072736929806e-07, "loss": 1.5409, "step": 267 }, { "epoch": 0.6907216494845361, "grad_norm": 1.2044546146531363, "learning_rate": 1.85987299334181e-07, "loss": 1.5279, "step": 268 }, { "epoch": 0.6932989690721649, "grad_norm": 1.2619745333120211, "learning_rate": 1.8588351714519335e-07, "loss": 1.5244, "step": 269 }, { "epoch": 0.6958762886597938, "grad_norm": 1.256000322805203, "learning_rate": 1.8577938122978042e-07, "loss": 1.5294, "step": 270 }, { "epoch": 0.6984536082474226, "grad_norm": 1.2356982681147777, "learning_rate": 1.856748920168443e-07, "loss": 1.5036, "step": 271 }, { "epoch": 0.7010309278350515, "grad_norm": 1.2037362943983936, "learning_rate": 1.855700499367423e-07, "loss": 1.5235, "step": 272 }, { "epoch": 0.7036082474226805, "grad_norm": 1.2017143929693659, "learning_rate": 1.85464855421285e-07, "loss": 1.4204, "step": 273 }, { "epoch": 0.7061855670103093, "grad_norm": 1.1908996404734937, "learning_rate": 1.8535930890373465e-07, "loss": 1.4969, "step": 274 }, { "epoch": 0.7087628865979382, "grad_norm": 1.1577329971672512, "learning_rate": 1.8525341081880312e-07, "loss": 1.5319, "step": 275 }, { "epoch": 0.711340206185567, "grad_norm": 1.1714981246895275, "learning_rate": 1.8514716160265045e-07, "loss": 1.4177, "step": 276 }, { "epoch": 0.7139175257731959, "grad_norm": 1.1688981848930113, "learning_rate": 1.8504056169288274e-07, "loss": 1.5234, "step": 277 }, { "epoch": 0.7164948453608248, "grad_norm": 1.176710170060508, "learning_rate": 1.8493361152855057e-07, "loss": 1.499, "step": 278 }, { "epoch": 0.7190721649484536, "grad_norm": 1.1039383442864374, "learning_rate": 1.8482631155014703e-07, "loss": 1.5258, "step": 279 }, { "epoch": 0.7216494845360825, "grad_norm": 1.232497346510154, "learning_rate": 1.84718662199606e-07, "loss": 1.5564, "step": 280 }, { "epoch": 0.7242268041237113, "grad_norm": 1.1628995381634444, "learning_rate": 1.8461066392030046e-07, "loss": 1.4091, "step": 281 }, { "epoch": 0.7268041237113402, "grad_norm": 1.2777142820565022, "learning_rate": 1.8450231715704026e-07, "loss": 1.4754, "step": 282 }, { "epoch": 0.729381443298969, "grad_norm": 1.2162243240659913, "learning_rate": 1.843936223560707e-07, "loss": 1.5473, "step": 283 }, { "epoch": 0.7319587628865979, "grad_norm": 1.2147904802438685, "learning_rate": 1.8428457996507053e-07, "loss": 1.5296, "step": 284 }, { "epoch": 0.7345360824742269, "grad_norm": 1.19577901711321, "learning_rate": 1.8417519043315004e-07, "loss": 1.542, "step": 285 }, { "epoch": 0.7371134020618557, "grad_norm": 1.252475138336633, "learning_rate": 1.8406545421084938e-07, "loss": 1.5293, "step": 286 }, { "epoch": 0.7396907216494846, "grad_norm": 1.1515656379492916, "learning_rate": 1.8395537175013654e-07, "loss": 1.5272, "step": 287 }, { "epoch": 0.7422680412371134, "grad_norm": 1.1517700578396561, "learning_rate": 1.8384494350440552e-07, "loss": 1.5133, "step": 288 }, { "epoch": 0.7448453608247423, "grad_norm": 1.217323252639824, "learning_rate": 1.8373416992847458e-07, "loss": 1.5009, "step": 289 }, { "epoch": 0.7474226804123711, "grad_norm": 1.1814204725087243, "learning_rate": 1.8362305147858428e-07, "loss": 1.4538, "step": 290 }, { "epoch": 0.75, "grad_norm": 1.1842613200601082, "learning_rate": 1.835115886123955e-07, "loss": 1.3816, "step": 291 }, { "epoch": 0.7525773195876289, "grad_norm": 1.2063574196502098, "learning_rate": 1.8339978178898778e-07, "loss": 1.5965, "step": 292 }, { "epoch": 0.7551546391752577, "grad_norm": 1.2685230099116653, "learning_rate": 1.8328763146885725e-07, "loss": 1.5637, "step": 293 }, { "epoch": 0.7577319587628866, "grad_norm": 1.295213064366882, "learning_rate": 1.8317513811391476e-07, "loss": 1.5592, "step": 294 }, { "epoch": 0.7577319587628866, "eval_loss": 1.5018398761749268, "eval_runtime": 78.561, "eval_samples_per_second": 21.168, "eval_steps_per_second": 1.324, "step": 294 }, { "epoch": 0.7603092783505154, "grad_norm": 1.1669863622367527, "learning_rate": 1.830623021874841e-07, "loss": 1.5081, "step": 295 }, { "epoch": 0.7628865979381443, "grad_norm": 1.1910397422917334, "learning_rate": 1.8294912415429992e-07, "loss": 1.523, "step": 296 }, { "epoch": 0.7654639175257731, "grad_norm": 1.1665026656613802, "learning_rate": 1.8283560448050594e-07, "loss": 1.4753, "step": 297 }, { "epoch": 0.7680412371134021, "grad_norm": 1.212187645390271, "learning_rate": 1.8272174363365297e-07, "loss": 1.4983, "step": 298 }, { "epoch": 0.770618556701031, "grad_norm": 1.2227876601034444, "learning_rate": 1.8260754208269701e-07, "loss": 1.5019, "step": 299 }, { "epoch": 0.7731958762886598, "grad_norm": 1.2358555763549743, "learning_rate": 1.8249300029799733e-07, "loss": 1.5965, "step": 300 }, { "epoch": 0.7757731958762887, "grad_norm": 1.187640438130257, "learning_rate": 1.8237811875131444e-07, "loss": 1.591, "step": 301 }, { "epoch": 0.7783505154639175, "grad_norm": 1.2214707732869985, "learning_rate": 1.8226289791580828e-07, "loss": 1.5274, "step": 302 }, { "epoch": 0.7809278350515464, "grad_norm": 1.2019657180078016, "learning_rate": 1.8214733826603625e-07, "loss": 1.5021, "step": 303 }, { "epoch": 0.7835051546391752, "grad_norm": 1.16960231687607, "learning_rate": 1.820314402779511e-07, "loss": 1.5763, "step": 304 }, { "epoch": 0.7860824742268041, "grad_norm": 1.152389731802479, "learning_rate": 1.8191520442889918e-07, "loss": 1.5176, "step": 305 }, { "epoch": 0.788659793814433, "grad_norm": 1.1132515669118002, "learning_rate": 1.8179863119761833e-07, "loss": 1.4634, "step": 306 }, { "epoch": 0.7912371134020618, "grad_norm": 1.1607539313280772, "learning_rate": 1.8168172106423606e-07, "loss": 1.4798, "step": 307 }, { "epoch": 0.7938144329896907, "grad_norm": 1.2145359718563615, "learning_rate": 1.8156447451026728e-07, "loss": 1.594, "step": 308 }, { "epoch": 0.7963917525773195, "grad_norm": 1.1870844292463605, "learning_rate": 1.814468920186127e-07, "loss": 1.478, "step": 309 }, { "epoch": 0.7989690721649485, "grad_norm": 1.1233767004431354, "learning_rate": 1.8132897407355653e-07, "loss": 1.5882, "step": 310 }, { "epoch": 0.8015463917525774, "grad_norm": 1.1738330684693277, "learning_rate": 1.8121072116076464e-07, "loss": 1.4284, "step": 311 }, { "epoch": 0.8041237113402062, "grad_norm": 1.247978839030236, "learning_rate": 1.8109213376728257e-07, "loss": 1.5824, "step": 312 }, { "epoch": 0.8067010309278351, "grad_norm": 1.2318777988562417, "learning_rate": 1.8097321238153336e-07, "loss": 1.5185, "step": 313 }, { "epoch": 0.8092783505154639, "grad_norm": 1.137207160847728, "learning_rate": 1.808539574933158e-07, "loss": 1.448, "step": 314 }, { "epoch": 0.8118556701030928, "grad_norm": 1.203622066974504, "learning_rate": 1.8073436959380212e-07, "loss": 1.5003, "step": 315 }, { "epoch": 0.8144329896907216, "grad_norm": 1.1618827104260305, "learning_rate": 1.8061444917553627e-07, "loss": 1.4603, "step": 316 }, { "epoch": 0.8170103092783505, "grad_norm": 1.1455984024451822, "learning_rate": 1.8049419673243164e-07, "loss": 1.4366, "step": 317 }, { "epoch": 0.8195876288659794, "grad_norm": 1.1500253179290463, "learning_rate": 1.803736127597691e-07, "loss": 1.5403, "step": 318 }, { "epoch": 0.8221649484536082, "grad_norm": 1.2632412244799347, "learning_rate": 1.8025269775419507e-07, "loss": 1.5003, "step": 319 }, { "epoch": 0.8247422680412371, "grad_norm": 1.142698108221298, "learning_rate": 1.8013145221371934e-07, "loss": 1.4732, "step": 320 }, { "epoch": 0.8273195876288659, "grad_norm": 1.2124460871646654, "learning_rate": 1.8000987663771306e-07, "loss": 1.5311, "step": 321 }, { "epoch": 0.8298969072164949, "grad_norm": 1.2348590930541292, "learning_rate": 1.798879715269067e-07, "loss": 1.5741, "step": 322 }, { "epoch": 0.8324742268041238, "grad_norm": 1.1498349377386237, "learning_rate": 1.79765737383388e-07, "loss": 1.361, "step": 323 }, { "epoch": 0.8350515463917526, "grad_norm": 1.189403441559741, "learning_rate": 1.796431747105998e-07, "loss": 1.5002, "step": 324 }, { "epoch": 0.8376288659793815, "grad_norm": 1.2170644285030623, "learning_rate": 1.7952028401333816e-07, "loss": 1.5508, "step": 325 }, { "epoch": 0.8402061855670103, "grad_norm": 1.2305649106918, "learning_rate": 1.793970657977501e-07, "loss": 1.5185, "step": 326 }, { "epoch": 0.8427835051546392, "grad_norm": 1.1928858589906648, "learning_rate": 1.7927352057133156e-07, "loss": 1.5859, "step": 327 }, { "epoch": 0.845360824742268, "grad_norm": 1.2402447474397933, "learning_rate": 1.791496488429254e-07, "loss": 1.4482, "step": 328 }, { "epoch": 0.8479381443298969, "grad_norm": 1.3004615784711493, "learning_rate": 1.7902545112271916e-07, "loss": 1.4996, "step": 329 }, { "epoch": 0.8505154639175257, "grad_norm": 1.2029226714523475, "learning_rate": 1.7890092792224314e-07, "loss": 1.4729, "step": 330 }, { "epoch": 0.8530927835051546, "grad_norm": 1.1646016402710766, "learning_rate": 1.7877607975436803e-07, "loss": 1.511, "step": 331 }, { "epoch": 0.8556701030927835, "grad_norm": 1.1748241861140345, "learning_rate": 1.7865090713330312e-07, "loss": 1.5406, "step": 332 }, { "epoch": 0.8582474226804123, "grad_norm": 1.1988219111182623, "learning_rate": 1.785254105745939e-07, "loss": 1.5364, "step": 333 }, { "epoch": 0.8608247422680413, "grad_norm": 1.2920016906616154, "learning_rate": 1.7839959059512014e-07, "loss": 1.5188, "step": 334 }, { "epoch": 0.8634020618556701, "grad_norm": 1.1390205414249481, "learning_rate": 1.7827344771309362e-07, "loss": 1.4749, "step": 335 }, { "epoch": 0.865979381443299, "grad_norm": 1.207725667468718, "learning_rate": 1.7814698244805603e-07, "loss": 1.5144, "step": 336 }, { "epoch": 0.8685567010309279, "grad_norm": 1.2708389359824341, "learning_rate": 1.780201953208769e-07, "loss": 1.4633, "step": 337 }, { "epoch": 0.8711340206185567, "grad_norm": 1.3588744934998203, "learning_rate": 1.7789308685375146e-07, "loss": 1.5194, "step": 338 }, { "epoch": 0.8737113402061856, "grad_norm": 1.1714299642439896, "learning_rate": 1.7776565757019829e-07, "loss": 1.4378, "step": 339 }, { "epoch": 0.8762886597938144, "grad_norm": 1.2349197329756814, "learning_rate": 1.7763790799505743e-07, "loss": 1.501, "step": 340 }, { "epoch": 0.8788659793814433, "grad_norm": 1.145994840644305, "learning_rate": 1.7750983865448804e-07, "loss": 1.3569, "step": 341 }, { "epoch": 0.8814432989690721, "grad_norm": 1.147878510470048, "learning_rate": 1.773814500759663e-07, "loss": 1.4907, "step": 342 }, { "epoch": 0.884020618556701, "grad_norm": 1.2101479142325238, "learning_rate": 1.7725274278828324e-07, "loss": 1.5045, "step": 343 }, { "epoch": 0.884020618556701, "eval_loss": 1.4945380687713623, "eval_runtime": 78.6415, "eval_samples_per_second": 21.147, "eval_steps_per_second": 1.322, "step": 343 }, { "epoch": 0.8865979381443299, "grad_norm": 1.2038990843843793, "learning_rate": 1.7712371732154257e-07, "loss": 1.4554, "step": 344 }, { "epoch": 0.8891752577319587, "grad_norm": 1.1472367305664413, "learning_rate": 1.7699437420715838e-07, "loss": 1.4611, "step": 345 }, { "epoch": 0.8917525773195877, "grad_norm": 1.2170090657627353, "learning_rate": 1.768647139778532e-07, "loss": 1.4619, "step": 346 }, { "epoch": 0.8943298969072165, "grad_norm": 1.1815824919293882, "learning_rate": 1.7673473716765553e-07, "loss": 1.5022, "step": 347 }, { "epoch": 0.8969072164948454, "grad_norm": 1.1967591939256936, "learning_rate": 1.766044443118978e-07, "loss": 1.4812, "step": 348 }, { "epoch": 0.8994845360824743, "grad_norm": 1.228975686058958, "learning_rate": 1.7647383594721413e-07, "loss": 1.4943, "step": 349 }, { "epoch": 0.9020618556701031, "grad_norm": 1.2132506060158343, "learning_rate": 1.7634291261153818e-07, "loss": 1.4852, "step": 350 }, { "epoch": 0.904639175257732, "grad_norm": 1.2581183528068558, "learning_rate": 1.7621167484410076e-07, "loss": 1.5311, "step": 351 }, { "epoch": 0.9072164948453608, "grad_norm": 1.1976025658343157, "learning_rate": 1.7608012318542776e-07, "loss": 1.5623, "step": 352 }, { "epoch": 0.9097938144329897, "grad_norm": 1.2081117148971663, "learning_rate": 1.7594825817733804e-07, "loss": 1.4877, "step": 353 }, { "epoch": 0.9123711340206185, "grad_norm": 1.25102310904074, "learning_rate": 1.7581608036294074e-07, "loss": 1.5166, "step": 354 }, { "epoch": 0.9149484536082474, "grad_norm": 1.1251058107211171, "learning_rate": 1.7568359028663362e-07, "loss": 1.4818, "step": 355 }, { "epoch": 0.9175257731958762, "grad_norm": 1.162404179159399, "learning_rate": 1.7555078849410042e-07, "loss": 1.4684, "step": 356 }, { "epoch": 0.9201030927835051, "grad_norm": 1.1939177374027512, "learning_rate": 1.754176755323088e-07, "loss": 1.3906, "step": 357 }, { "epoch": 0.9226804123711341, "grad_norm": 1.2277839442625762, "learning_rate": 1.7528425194950793e-07, "loss": 1.5206, "step": 358 }, { "epoch": 0.9252577319587629, "grad_norm": 1.1589149786868607, "learning_rate": 1.7515051829522643e-07, "loss": 1.5117, "step": 359 }, { "epoch": 0.9278350515463918, "grad_norm": 1.161766915938516, "learning_rate": 1.7501647512026993e-07, "loss": 1.5142, "step": 360 }, { "epoch": 0.9304123711340206, "grad_norm": 1.1895671903848675, "learning_rate": 1.7488212297671897e-07, "loss": 1.5279, "step": 361 }, { "epoch": 0.9329896907216495, "grad_norm": 1.3331865087236399, "learning_rate": 1.7474746241792646e-07, "loss": 1.4476, "step": 362 }, { "epoch": 0.9355670103092784, "grad_norm": 1.1227191881644327, "learning_rate": 1.746124939985158e-07, "loss": 1.436, "step": 363 }, { "epoch": 0.9381443298969072, "grad_norm": 1.1453288975869358, "learning_rate": 1.7447721827437817e-07, "loss": 1.4721, "step": 364 }, { "epoch": 0.9407216494845361, "grad_norm": 1.1800301680843552, "learning_rate": 1.7434163580267056e-07, "loss": 1.4648, "step": 365 }, { "epoch": 0.9432989690721649, "grad_norm": 1.1592086626138536, "learning_rate": 1.7420574714181327e-07, "loss": 1.4645, "step": 366 }, { "epoch": 0.9458762886597938, "grad_norm": 1.1969987793516494, "learning_rate": 1.7406955285148782e-07, "loss": 1.4628, "step": 367 }, { "epoch": 0.9484536082474226, "grad_norm": 1.25319893461736, "learning_rate": 1.7393305349263432e-07, "loss": 1.5327, "step": 368 }, { "epoch": 0.9510309278350515, "grad_norm": 1.1235076122412295, "learning_rate": 1.7379624962744954e-07, "loss": 1.457, "step": 369 }, { "epoch": 0.9536082474226805, "grad_norm": 1.215770975088775, "learning_rate": 1.7365914181938438e-07, "loss": 1.4802, "step": 370 }, { "epoch": 0.9561855670103093, "grad_norm": 1.1400445439752551, "learning_rate": 1.7352173063314147e-07, "loss": 1.4078, "step": 371 }, { "epoch": 0.9587628865979382, "grad_norm": 1.219412218457137, "learning_rate": 1.7338401663467307e-07, "loss": 1.4863, "step": 372 }, { "epoch": 0.961340206185567, "grad_norm": 1.2307165231693638, "learning_rate": 1.732460003911786e-07, "loss": 1.547, "step": 373 }, { "epoch": 0.9639175257731959, "grad_norm": 1.1928743718959285, "learning_rate": 1.731076824711023e-07, "loss": 1.4681, "step": 374 }, { "epoch": 0.9664948453608248, "grad_norm": 1.2210774438706382, "learning_rate": 1.7296906344413101e-07, "loss": 1.5359, "step": 375 }, { "epoch": 0.9690721649484536, "grad_norm": 1.1755911854453769, "learning_rate": 1.7283014388119157e-07, "loss": 1.5286, "step": 376 }, { "epoch": 0.9716494845360825, "grad_norm": 1.1189926107564905, "learning_rate": 1.7269092435444878e-07, "loss": 1.4309, "step": 377 }, { "epoch": 0.9742268041237113, "grad_norm": 1.209816536244005, "learning_rate": 1.7255140543730282e-07, "loss": 1.4689, "step": 378 }, { "epoch": 0.9768041237113402, "grad_norm": 1.1866285142861848, "learning_rate": 1.7241158770438697e-07, "loss": 1.4972, "step": 379 }, { "epoch": 0.979381443298969, "grad_norm": 1.1354634757481643, "learning_rate": 1.722714717315652e-07, "loss": 1.4873, "step": 380 }, { "epoch": 0.9819587628865979, "grad_norm": 1.2944770552807037, "learning_rate": 1.7213105809593e-07, "loss": 1.4974, "step": 381 }, { "epoch": 0.9845360824742269, "grad_norm": 1.103791679895453, "learning_rate": 1.719903473757996e-07, "loss": 1.4338, "step": 382 }, { "epoch": 0.9871134020618557, "grad_norm": 1.1784721051806777, "learning_rate": 1.7184934015071594e-07, "loss": 1.4041, "step": 383 }, { "epoch": 0.9896907216494846, "grad_norm": 1.1348338130977504, "learning_rate": 1.7170803700144225e-07, "loss": 1.4413, "step": 384 }, { "epoch": 0.9922680412371134, "grad_norm": 1.2250889412679622, "learning_rate": 1.7156643850996044e-07, "loss": 1.4629, "step": 385 }, { "epoch": 0.9948453608247423, "grad_norm": 1.1045983289273678, "learning_rate": 1.7142454525946888e-07, "loss": 1.5546, "step": 386 }, { "epoch": 0.9974226804123711, "grad_norm": 1.1516418913315656, "learning_rate": 1.7128235783437998e-07, "loss": 1.5631, "step": 387 }, { "epoch": 1.0, "grad_norm": 1.252168700059035, "learning_rate": 1.7113987682031778e-07, "loss": 1.4422, "step": 388 }, { "epoch": 1.0025773195876289, "grad_norm": 1.189319163542339, "learning_rate": 1.7099710280411546e-07, "loss": 1.5383, "step": 389 }, { "epoch": 1.0051546391752577, "grad_norm": 1.2727165097128585, "learning_rate": 1.70854036373813e-07, "loss": 1.5408, "step": 390 }, { "epoch": 1.0077319587628866, "grad_norm": 1.1517050348302873, "learning_rate": 1.7071067811865473e-07, "loss": 1.5864, "step": 391 }, { "epoch": 1.0103092783505154, "grad_norm": 1.3325861122052731, "learning_rate": 1.7056702862908702e-07, "loss": 1.5524, "step": 392 }, { "epoch": 1.0103092783505154, "eval_loss": 1.4885141849517822, "eval_runtime": 78.424, "eval_samples_per_second": 21.205, "eval_steps_per_second": 1.326, "step": 392 }, { "epoch": 1.0128865979381443, "grad_norm": 1.1135739405983736, "learning_rate": 1.7042308849675554e-07, "loss": 1.5054, "step": 393 }, { "epoch": 1.0154639175257731, "grad_norm": 1.1782103759330078, "learning_rate": 1.7027885831450317e-07, "loss": 1.4809, "step": 394 }, { "epoch": 1.018041237113402, "grad_norm": 1.1307316665373648, "learning_rate": 1.701343386763674e-07, "loss": 1.4176, "step": 395 }, { "epoch": 1.0206185567010309, "grad_norm": 1.2226276517588748, "learning_rate": 1.6998953017757785e-07, "loss": 1.5829, "step": 396 }, { "epoch": 1.0231958762886597, "grad_norm": 1.2403418129653008, "learning_rate": 1.698444334145539e-07, "loss": 1.5954, "step": 397 }, { "epoch": 1.0257731958762886, "grad_norm": 1.1302836106915826, "learning_rate": 1.6969904898490212e-07, "loss": 1.4231, "step": 398 }, { "epoch": 1.0283505154639174, "grad_norm": 1.141960483416689, "learning_rate": 1.6955337748741405e-07, "loss": 1.4287, "step": 399 }, { "epoch": 1.0309278350515463, "grad_norm": 1.196477232474438, "learning_rate": 1.694074195220634e-07, "loss": 1.5239, "step": 400 }, { "epoch": 1.0335051546391754, "grad_norm": 1.183187501385808, "learning_rate": 1.692611756900038e-07, "loss": 1.497, "step": 401 }, { "epoch": 1.0360824742268042, "grad_norm": 1.150174147558412, "learning_rate": 1.691146465935663e-07, "loss": 1.5532, "step": 402 }, { "epoch": 1.038659793814433, "grad_norm": 1.2448204002333718, "learning_rate": 1.689678328362569e-07, "loss": 1.416, "step": 403 }, { "epoch": 1.041237113402062, "grad_norm": 1.1109759208202117, "learning_rate": 1.6882073502275392e-07, "loss": 1.5012, "step": 404 }, { "epoch": 1.0438144329896908, "grad_norm": 1.1567096038742686, "learning_rate": 1.6867335375890566e-07, "loss": 1.5053, "step": 405 }, { "epoch": 1.0463917525773196, "grad_norm": 1.1754138924074398, "learning_rate": 1.6852568965172792e-07, "loss": 1.5129, "step": 406 }, { "epoch": 1.0489690721649485, "grad_norm": 1.23193132568122, "learning_rate": 1.6837774330940136e-07, "loss": 1.5573, "step": 407 }, { "epoch": 1.0515463917525774, "grad_norm": 1.154132682102343, "learning_rate": 1.6822951534126908e-07, "loss": 1.4258, "step": 408 }, { "epoch": 1.0541237113402062, "grad_norm": 1.1683702220075676, "learning_rate": 1.680810063578342e-07, "loss": 1.493, "step": 409 }, { "epoch": 1.056701030927835, "grad_norm": 1.1355190434284121, "learning_rate": 1.6793221697075716e-07, "loss": 1.5119, "step": 410 }, { "epoch": 1.059278350515464, "grad_norm": 1.1992497667084585, "learning_rate": 1.6778314779285324e-07, "loss": 1.538, "step": 411 }, { "epoch": 1.0618556701030928, "grad_norm": 1.1517964539720562, "learning_rate": 1.6763379943809027e-07, "loss": 1.4665, "step": 412 }, { "epoch": 1.0644329896907216, "grad_norm": 1.0984210499840694, "learning_rate": 1.6748417252158577e-07, "loss": 1.4328, "step": 413 }, { "epoch": 1.0670103092783505, "grad_norm": 1.1299450982658101, "learning_rate": 1.6733426765960456e-07, "loss": 1.5028, "step": 414 }, { "epoch": 1.0695876288659794, "grad_norm": 1.212850591316243, "learning_rate": 1.6718408546955635e-07, "loss": 1.5834, "step": 415 }, { "epoch": 1.0721649484536082, "grad_norm": 1.187341231477269, "learning_rate": 1.6703362656999299e-07, "loss": 1.5069, "step": 416 }, { "epoch": 1.074742268041237, "grad_norm": 1.2469684651532016, "learning_rate": 1.6688289158060593e-07, "loss": 1.518, "step": 417 }, { "epoch": 1.077319587628866, "grad_norm": 1.254398054291776, "learning_rate": 1.6673188112222395e-07, "loss": 1.578, "step": 418 }, { "epoch": 1.0798969072164948, "grad_norm": 1.1499801218824168, "learning_rate": 1.665805958168102e-07, "loss": 1.4979, "step": 419 }, { "epoch": 1.0824742268041236, "grad_norm": 1.1976396691121443, "learning_rate": 1.664290362874599e-07, "loss": 1.4914, "step": 420 }, { "epoch": 1.0850515463917525, "grad_norm": 1.1348401564795523, "learning_rate": 1.662772031583978e-07, "loss": 1.3902, "step": 421 }, { "epoch": 1.0876288659793814, "grad_norm": 1.2267166932133524, "learning_rate": 1.6612509705497542e-07, "loss": 1.4352, "step": 422 }, { "epoch": 1.0902061855670102, "grad_norm": 1.2873463533597629, "learning_rate": 1.6597271860366856e-07, "loss": 1.4478, "step": 423 }, { "epoch": 1.092783505154639, "grad_norm": 1.3679857975054832, "learning_rate": 1.6582006843207478e-07, "loss": 1.5168, "step": 424 }, { "epoch": 1.0953608247422681, "grad_norm": 1.326554289290517, "learning_rate": 1.6566714716891078e-07, "loss": 1.5008, "step": 425 }, { "epoch": 1.097938144329897, "grad_norm": 1.168969016350491, "learning_rate": 1.6551395544400978e-07, "loss": 1.4917, "step": 426 }, { "epoch": 1.1005154639175259, "grad_norm": 1.2413798753485674, "learning_rate": 1.6536049388831893e-07, "loss": 1.4502, "step": 427 }, { "epoch": 1.1030927835051547, "grad_norm": 1.1635621820926023, "learning_rate": 1.652067631338967e-07, "loss": 1.557, "step": 428 }, { "epoch": 1.1056701030927836, "grad_norm": 1.1573375306268514, "learning_rate": 1.6505276381391036e-07, "loss": 1.4244, "step": 429 }, { "epoch": 1.1082474226804124, "grad_norm": 1.2312412177915255, "learning_rate": 1.6489849656263335e-07, "loss": 1.5494, "step": 430 }, { "epoch": 1.1108247422680413, "grad_norm": 1.219284880839308, "learning_rate": 1.647439620154425e-07, "loss": 1.5306, "step": 431 }, { "epoch": 1.1134020618556701, "grad_norm": 1.173558682623126, "learning_rate": 1.6458916080881563e-07, "loss": 1.4429, "step": 432 }, { "epoch": 1.0025773195876289, "grad_norm": 1.229487690642213, "learning_rate": 1.6443409358032887e-07, "loss": 1.5753, "step": 433 }, { "epoch": 1.0051546391752577, "grad_norm": 1.2105170741564812, "learning_rate": 1.6427876096865392e-07, "loss": 1.5334, "step": 434 }, { "epoch": 1.0077319587628866, "grad_norm": 1.204008054808549, "learning_rate": 1.6412316361355562e-07, "loss": 1.42, "step": 435 }, { "epoch": 1.0103092783505154, "grad_norm": 1.1326791826110472, "learning_rate": 1.6396730215588912e-07, "loss": 1.4714, "step": 436 }, { "epoch": 1.0128865979381443, "grad_norm": 1.1200550697122906, "learning_rate": 1.6381117723759734e-07, "loss": 1.514, "step": 437 }, { "epoch": 1.0154639175257731, "grad_norm": 1.1890623492712462, "learning_rate": 1.6365478950170833e-07, "loss": 1.4181, "step": 438 }, { "epoch": 1.018041237113402, "grad_norm": 1.1631198253400261, "learning_rate": 1.6349813959233255e-07, "loss": 1.4062, "step": 439 }, { "epoch": 1.0206185567010309, "grad_norm": 1.1360996622048518, "learning_rate": 1.6334122815466031e-07, "loss": 1.4486, "step": 440 }, { "epoch": 1.0231958762886597, "grad_norm": 1.1864758464899412, "learning_rate": 1.6318405583495913e-07, "loss": 1.5347, "step": 441 }, { "epoch": 1.0231958762886597, "eval_loss": 1.4830812215805054, "eval_runtime": 78.5114, "eval_samples_per_second": 21.182, "eval_steps_per_second": 1.325, "step": 441 }, { "epoch": 1.0257731958762886, "grad_norm": 1.1301160006601543, "learning_rate": 1.6302662328057085e-07, "loss": 1.4353, "step": 442 }, { "epoch": 1.0283505154639174, "grad_norm": 1.1894059515483042, "learning_rate": 1.6286893113990932e-07, "loss": 1.469, "step": 443 }, { "epoch": 1.0309278350515463, "grad_norm": 1.1496261846772073, "learning_rate": 1.627109800624574e-07, "loss": 1.5501, "step": 444 }, { "epoch": 1.0335051546391754, "grad_norm": 1.2088185832357161, "learning_rate": 1.6255277069876451e-07, "loss": 1.4899, "step": 445 }, { "epoch": 1.0360824742268042, "grad_norm": 1.1253812221554047, "learning_rate": 1.6239430370044387e-07, "loss": 1.4122, "step": 446 }, { "epoch": 1.038659793814433, "grad_norm": 1.1716232931347121, "learning_rate": 1.6223557972016973e-07, "loss": 1.439, "step": 447 }, { "epoch": 1.041237113402062, "grad_norm": 1.18342528126353, "learning_rate": 1.6207659941167485e-07, "loss": 1.5094, "step": 448 }, { "epoch": 1.0438144329896908, "grad_norm": 1.2039062898512076, "learning_rate": 1.6191736342974767e-07, "loss": 1.4619, "step": 449 }, { "epoch": 1.0463917525773196, "grad_norm": 1.2183703075903023, "learning_rate": 1.617578724302297e-07, "loss": 1.5232, "step": 450 }, { "epoch": 1.0489690721649485, "grad_norm": 1.1388070881208434, "learning_rate": 1.615981270700128e-07, "loss": 1.4638, "step": 451 }, { "epoch": 1.0515463917525774, "grad_norm": 1.0877380908149572, "learning_rate": 1.6143812800703642e-07, "loss": 1.4447, "step": 452 }, { "epoch": 1.0541237113402062, "grad_norm": 1.1716268781083103, "learning_rate": 1.6127787590028495e-07, "loss": 1.5212, "step": 453 }, { "epoch": 1.056701030927835, "grad_norm": 1.107434556978612, "learning_rate": 1.6111737140978493e-07, "loss": 1.4558, "step": 454 }, { "epoch": 1.059278350515464, "grad_norm": 1.2519354029249565, "learning_rate": 1.609566151966025e-07, "loss": 1.4528, "step": 455 }, { "epoch": 1.0618556701030928, "grad_norm": 1.1919323581174677, "learning_rate": 1.6079560792284045e-07, "loss": 1.5621, "step": 456 }, { "epoch": 1.0644329896907216, "grad_norm": 1.1817947401366415, "learning_rate": 1.6063435025163568e-07, "loss": 1.4662, "step": 457 }, { "epoch": 1.0670103092783505, "grad_norm": 1.2557632574926887, "learning_rate": 1.6047284284715642e-07, "loss": 1.4804, "step": 458 }, { "epoch": 1.0695876288659794, "grad_norm": 1.2611184908202628, "learning_rate": 1.6031108637459932e-07, "loss": 1.3898, "step": 459 }, { "epoch": 1.0721649484536082, "grad_norm": 1.2900278262304008, "learning_rate": 1.6014908150018703e-07, "loss": 1.5064, "step": 460 }, { "epoch": 1.074742268041237, "grad_norm": 1.195779708533936, "learning_rate": 1.5998682889116524e-07, "loss": 1.5224, "step": 461 }, { "epoch": 1.077319587628866, "grad_norm": 1.1566664249843968, "learning_rate": 1.5982432921579993e-07, "loss": 1.4517, "step": 462 }, { "epoch": 1.0798969072164948, "grad_norm": 1.2001020296312388, "learning_rate": 1.596615831433747e-07, "loss": 1.5602, "step": 463 }, { "epoch": 1.0824742268041236, "grad_norm": 1.1943899233375934, "learning_rate": 1.5949859134418796e-07, "loss": 1.3757, "step": 464 }, { "epoch": 1.0850515463917525, "grad_norm": 1.231964645169981, "learning_rate": 1.5933535448955027e-07, "loss": 1.4859, "step": 465 }, { "epoch": 1.0876288659793814, "grad_norm": 1.1068734683342414, "learning_rate": 1.5917187325178137e-07, "loss": 1.4629, "step": 466 }, { "epoch": 1.0902061855670102, "grad_norm": 1.1513773116941175, "learning_rate": 1.590081483042076e-07, "loss": 1.5125, "step": 467 }, { "epoch": 1.0927835051546393, "grad_norm": 1.265359820624344, "learning_rate": 1.5884418032115906e-07, "loss": 1.5204, "step": 468 }, { "epoch": 1.0953608247422681, "grad_norm": 1.1596012619544869, "learning_rate": 1.5867996997796683e-07, "loss": 1.4528, "step": 469 }, { "epoch": 1.097938144329897, "grad_norm": 1.1953930948748877, "learning_rate": 1.5851551795096025e-07, "loss": 1.404, "step": 470 }, { "epoch": 1.1005154639175259, "grad_norm": 1.1467999018042732, "learning_rate": 1.5835082491746393e-07, "loss": 1.5314, "step": 471 }, { "epoch": 1.1030927835051547, "grad_norm": 1.208554802219746, "learning_rate": 1.581858915557953e-07, "loss": 1.4632, "step": 472 }, { "epoch": 1.1056701030927836, "grad_norm": 1.210149302840143, "learning_rate": 1.580207185452614e-07, "loss": 1.4828, "step": 473 }, { "epoch": 1.1082474226804124, "grad_norm": 1.0949101750229728, "learning_rate": 1.5785530656615654e-07, "loss": 1.4612, "step": 474 }, { "epoch": 1.1108247422680413, "grad_norm": 1.1550991304470553, "learning_rate": 1.576896562997591e-07, "loss": 1.5112, "step": 475 }, { "epoch": 1.1134020618556701, "grad_norm": 1.267086705459486, "learning_rate": 1.5752376842832898e-07, "loss": 1.5086, "step": 476 }, { "epoch": 1.115979381443299, "grad_norm": 1.157659801945543, "learning_rate": 1.573576436351046e-07, "loss": 1.4721, "step": 477 }, { "epoch": 1.1185567010309279, "grad_norm": 1.1792779255646542, "learning_rate": 1.571912826043003e-07, "loss": 1.4216, "step": 478 }, { "epoch": 1.1211340206185567, "grad_norm": 1.279434721476292, "learning_rate": 1.5702468602110331e-07, "loss": 1.4098, "step": 479 }, { "epoch": 1.1237113402061856, "grad_norm": 1.2412716991217037, "learning_rate": 1.5685785457167113e-07, "loss": 1.4855, "step": 480 }, { "epoch": 1.1262886597938144, "grad_norm": 1.1878566044688987, "learning_rate": 1.5669078894312847e-07, "loss": 1.5252, "step": 481 }, { "epoch": 1.1288659793814433, "grad_norm": 1.2441727908973987, "learning_rate": 1.565234898235646e-07, "loss": 1.5462, "step": 482 }, { "epoch": 1.1314432989690721, "grad_norm": 1.176061624777031, "learning_rate": 1.5635595790203056e-07, "loss": 1.5135, "step": 483 }, { "epoch": 1.134020618556701, "grad_norm": 1.246481799384192, "learning_rate": 1.5618819386853602e-07, "loss": 1.5357, "step": 484 }, { "epoch": 1.1365979381443299, "grad_norm": 1.2042279646873306, "learning_rate": 1.5602019841404688e-07, "loss": 1.5146, "step": 485 }, { "epoch": 1.1391752577319587, "grad_norm": 1.1664753868373192, "learning_rate": 1.5585197223048202e-07, "loss": 1.5007, "step": 486 }, { "epoch": 1.1417525773195876, "grad_norm": 1.0786695822166654, "learning_rate": 1.5568351601071068e-07, "loss": 1.4637, "step": 487 }, { "epoch": 1.1443298969072164, "grad_norm": 1.1782507265833873, "learning_rate": 1.5551483044854952e-07, "loss": 1.4811, "step": 488 }, { "epoch": 1.1469072164948453, "grad_norm": 1.2326350516083906, "learning_rate": 1.5534591623875985e-07, "loss": 1.5482, "step": 489 }, { "epoch": 1.1494845360824741, "grad_norm": 1.3932475474558166, "learning_rate": 1.551767740770446e-07, "loss": 1.4994, "step": 490 }, { "epoch": 1.1494845360824741, "eval_loss": 1.4784166812896729, "eval_runtime": 78.5816, "eval_samples_per_second": 21.163, "eval_steps_per_second": 1.323, "step": 490 }, { "epoch": 1.152061855670103, "grad_norm": 1.2782842614630645, "learning_rate": 1.5500740466004562e-07, "loss": 1.4751, "step": 491 }, { "epoch": 1.1546391752577319, "grad_norm": 1.216799121655535, "learning_rate": 1.5483780868534083e-07, "loss": 1.4724, "step": 492 }, { "epoch": 1.1572164948453607, "grad_norm": 1.1868499010457458, "learning_rate": 1.546679868514411e-07, "loss": 1.4335, "step": 493 }, { "epoch": 1.1597938144329896, "grad_norm": 1.199212625101753, "learning_rate": 1.544979398577877e-07, "loss": 1.428, "step": 494 }, { "epoch": 1.1623711340206186, "grad_norm": 1.1357296953077098, "learning_rate": 1.543276684047491e-07, "loss": 1.4542, "step": 495 }, { "epoch": 1.1649484536082475, "grad_norm": 1.213487385523563, "learning_rate": 1.5415717319361846e-07, "loss": 1.4724, "step": 496 }, { "epoch": 1.1675257731958764, "grad_norm": 1.1099648565570772, "learning_rate": 1.5398645492661028e-07, "loss": 1.4254, "step": 497 }, { "epoch": 1.1701030927835052, "grad_norm": 1.1324706525701729, "learning_rate": 1.5381551430685795e-07, "loss": 1.5048, "step": 498 }, { "epoch": 1.172680412371134, "grad_norm": 1.312867551517799, "learning_rate": 1.5364435203841056e-07, "loss": 1.4713, "step": 499 }, { "epoch": 1.175257731958763, "grad_norm": 1.1933326421003594, "learning_rate": 1.5347296882623017e-07, "loss": 1.5138, "step": 500 }, { "epoch": 1.1778350515463918, "grad_norm": 1.0985685695284346, "learning_rate": 1.533013653761887e-07, "loss": 1.433, "step": 501 }, { "epoch": 1.1804123711340206, "grad_norm": 1.1149163975715322, "learning_rate": 1.5312954239506533e-07, "loss": 1.3835, "step": 502 }, { "epoch": 1.1829896907216495, "grad_norm": 1.3227767494195912, "learning_rate": 1.529575005905433e-07, "loss": 1.4895, "step": 503 }, { "epoch": 1.1855670103092784, "grad_norm": 1.214579951187228, "learning_rate": 1.5278524067120717e-07, "loss": 1.5998, "step": 504 }, { "epoch": 1.1881443298969072, "grad_norm": 1.242415460112634, "learning_rate": 1.5261276334653982e-07, "loss": 1.419, "step": 505 }, { "epoch": 1.190721649484536, "grad_norm": 1.2389773021924564, "learning_rate": 1.5244006932691953e-07, "loss": 1.4202, "step": 506 }, { "epoch": 1.193298969072165, "grad_norm": 1.2349193613971634, "learning_rate": 1.5226715932361716e-07, "loss": 1.5457, "step": 507 }, { "epoch": 1.1958762886597938, "grad_norm": 1.1148921709276238, "learning_rate": 1.5209403404879302e-07, "loss": 1.3884, "step": 508 }, { "epoch": 1.1984536082474226, "grad_norm": 1.2416754407978092, "learning_rate": 1.5192069421549416e-07, "loss": 1.4643, "step": 509 }, { "epoch": 1.2010309278350515, "grad_norm": 1.240689395283768, "learning_rate": 1.5174714053765122e-07, "loss": 1.572, "step": 510 }, { "epoch": 1.2036082474226804, "grad_norm": 1.140745518174075, "learning_rate": 1.5157337373007578e-07, "loss": 1.3663, "step": 511 }, { "epoch": 1.2061855670103092, "grad_norm": 1.1312657539313165, "learning_rate": 1.5139939450845698e-07, "loss": 1.4681, "step": 512 }, { "epoch": 1.208762886597938, "grad_norm": 1.1613965817840117, "learning_rate": 1.51225203589359e-07, "loss": 1.548, "step": 513 }, { "epoch": 1.211340206185567, "grad_norm": 1.1253218321610134, "learning_rate": 1.5105080169021789e-07, "loss": 1.4644, "step": 514 }, { "epoch": 1.2139175257731958, "grad_norm": 1.1202729459915262, "learning_rate": 1.5087618952933866e-07, "loss": 1.4874, "step": 515 }, { "epoch": 1.2164948453608249, "grad_norm": 1.1754915638068841, "learning_rate": 1.5070136782589233e-07, "loss": 1.4904, "step": 516 }, { "epoch": 1.2190721649484537, "grad_norm": 1.211459122094429, "learning_rate": 1.5052633729991294e-07, "loss": 1.4832, "step": 517 }, { "epoch": 1.2216494845360826, "grad_norm": 1.2489759850317173, "learning_rate": 1.5035109867229456e-07, "loss": 1.4464, "step": 518 }, { "epoch": 1.2242268041237114, "grad_norm": 1.2194317834170105, "learning_rate": 1.5017565266478848e-07, "loss": 1.4897, "step": 519 }, { "epoch": 1.2268041237113403, "grad_norm": 1.1036732258357687, "learning_rate": 1.5e-07, "loss": 1.491, "step": 520 }, { "epoch": 1.2293814432989691, "grad_norm": 1.1658107658884465, "learning_rate": 1.4982414140138563e-07, "loss": 1.4678, "step": 521 }, { "epoch": 1.231958762886598, "grad_norm": 1.2704801398111358, "learning_rate": 1.4964807759325008e-07, "loss": 1.3781, "step": 522 }, { "epoch": 1.2345360824742269, "grad_norm": 1.1848897409786574, "learning_rate": 1.4947180930074323e-07, "loss": 1.4799, "step": 523 }, { "epoch": 1.2371134020618557, "grad_norm": 1.2016447040520333, "learning_rate": 1.492953372498571e-07, "loss": 1.5686, "step": 524 }, { "epoch": 1.2396907216494846, "grad_norm": 1.2911746325303657, "learning_rate": 1.4911866216742307e-07, "loss": 1.5241, "step": 525 }, { "epoch": 1.2422680412371134, "grad_norm": 1.1990990248512616, "learning_rate": 1.4894178478110855e-07, "loss": 1.5357, "step": 526 }, { "epoch": 1.2448453608247423, "grad_norm": 1.149144012214145, "learning_rate": 1.4876470581941434e-07, "loss": 1.4571, "step": 527 }, { "epoch": 1.2474226804123711, "grad_norm": 1.198321859008649, "learning_rate": 1.485874260116714e-07, "loss": 1.5113, "step": 528 }, { "epoch": 1.25, "grad_norm": 1.2113266741136735, "learning_rate": 1.4840994608803788e-07, "loss": 1.4782, "step": 529 }, { "epoch": 1.2525773195876289, "grad_norm": 1.1425317175556289, "learning_rate": 1.4823226677949622e-07, "loss": 1.5012, "step": 530 }, { "epoch": 1.2551546391752577, "grad_norm": 1.268980235594048, "learning_rate": 1.4805438881784995e-07, "loss": 1.4529, "step": 531 }, { "epoch": 1.2577319587628866, "grad_norm": 1.151209820959519, "learning_rate": 1.478763129357209e-07, "loss": 1.4734, "step": 532 }, { "epoch": 1.2603092783505154, "grad_norm": 1.2260752095042977, "learning_rate": 1.4769803986654603e-07, "loss": 1.4896, "step": 533 }, { "epoch": 1.2628865979381443, "grad_norm": 1.2017887268263763, "learning_rate": 1.4751957034457445e-07, "loss": 1.4667, "step": 534 }, { "epoch": 1.2654639175257731, "grad_norm": 1.2271959233872554, "learning_rate": 1.4734090510486432e-07, "loss": 1.4888, "step": 535 }, { "epoch": 1.268041237113402, "grad_norm": 1.2197382019523413, "learning_rate": 1.4716204488328006e-07, "loss": 1.5358, "step": 536 }, { "epoch": 1.2706185567010309, "grad_norm": 1.1416105765632265, "learning_rate": 1.4698299041648902e-07, "loss": 1.4275, "step": 537 }, { "epoch": 1.2731958762886597, "grad_norm": 1.2103999127902116, "learning_rate": 1.468037424419586e-07, "loss": 1.4822, "step": 538 }, { "epoch": 1.2757731958762886, "grad_norm": 1.2127169663908728, "learning_rate": 1.4662430169795328e-07, "loss": 1.4477, "step": 539 }, { "epoch": 1.2757731958762886, "eval_loss": 1.474165678024292, "eval_runtime": 78.6592, "eval_samples_per_second": 21.142, "eval_steps_per_second": 1.322, "step": 539 }, { "epoch": 1.2783505154639174, "grad_norm": 1.2719723678439783, "learning_rate": 1.464446689235314e-07, "loss": 1.5694, "step": 540 }, { "epoch": 1.2809278350515463, "grad_norm": 1.094905461428815, "learning_rate": 1.4626484485854228e-07, "loss": 1.4405, "step": 541 }, { "epoch": 1.2835051546391751, "grad_norm": 1.1572746515704029, "learning_rate": 1.4608483024362303e-07, "loss": 1.49, "step": 542 }, { "epoch": 1.286082474226804, "grad_norm": 1.2136708668686302, "learning_rate": 1.4590462582019566e-07, "loss": 1.5488, "step": 543 }, { "epoch": 1.2886597938144329, "grad_norm": 1.1351781538641772, "learning_rate": 1.4572423233046385e-07, "loss": 1.4436, "step": 544 }, { "epoch": 1.291237113402062, "grad_norm": 1.2233902585418839, "learning_rate": 1.455436505174101e-07, "loss": 1.4752, "step": 545 }, { "epoch": 1.2938144329896908, "grad_norm": 1.2111257906769834, "learning_rate": 1.453628811247924e-07, "loss": 1.5437, "step": 546 }, { "epoch": 1.2963917525773196, "grad_norm": 1.214330730454999, "learning_rate": 1.4518192489714148e-07, "loss": 1.5874, "step": 547 }, { "epoch": 1.2989690721649485, "grad_norm": 1.1501171354212085, "learning_rate": 1.4500078257975746e-07, "loss": 1.4441, "step": 548 }, { "epoch": 1.3015463917525774, "grad_norm": 1.256132517451847, "learning_rate": 1.4481945491870692e-07, "loss": 1.4869, "step": 549 }, { "epoch": 1.3041237113402062, "grad_norm": 1.153698353782002, "learning_rate": 1.4463794266081993e-07, "loss": 1.4298, "step": 550 }, { "epoch": 1.306701030927835, "grad_norm": 1.1141900425922164, "learning_rate": 1.4445624655368672e-07, "loss": 1.4081, "step": 551 }, { "epoch": 1.309278350515464, "grad_norm": 1.1727962553732723, "learning_rate": 1.4427436734565474e-07, "loss": 1.4843, "step": 552 }, { "epoch": 1.3118556701030928, "grad_norm": 1.1904748231664284, "learning_rate": 1.4409230578582564e-07, "loss": 1.4408, "step": 553 }, { "epoch": 1.3144329896907216, "grad_norm": 1.1596562097777137, "learning_rate": 1.4391006262405212e-07, "loss": 1.5078, "step": 554 }, { "epoch": 1.3170103092783505, "grad_norm": 1.1362387372168263, "learning_rate": 1.4372763861093478e-07, "loss": 1.4596, "step": 555 }, { "epoch": 1.3195876288659794, "grad_norm": 1.2438435278065572, "learning_rate": 1.4354503449781913e-07, "loss": 1.536, "step": 556 }, { "epoch": 1.3221649484536082, "grad_norm": 1.182522665170931, "learning_rate": 1.4336225103679243e-07, "loss": 1.5611, "step": 557 }, { "epoch": 1.324742268041237, "grad_norm": 1.2822957709992338, "learning_rate": 1.4317928898068066e-07, "loss": 1.4826, "step": 558 }, { "epoch": 1.327319587628866, "grad_norm": 1.2758012985116745, "learning_rate": 1.4299614908304528e-07, "loss": 1.4543, "step": 559 }, { "epoch": 1.3298969072164948, "grad_norm": 1.164766457118801, "learning_rate": 1.4281283209818038e-07, "loss": 1.4061, "step": 560 }, { "epoch": 1.3324742268041236, "grad_norm": 1.1663065580316805, "learning_rate": 1.4262933878110923e-07, "loss": 1.5151, "step": 561 }, { "epoch": 1.3350515463917525, "grad_norm": 1.1525726704239359, "learning_rate": 1.4244566988758152e-07, "loss": 1.5209, "step": 562 }, { "epoch": 1.3376288659793816, "grad_norm": 1.194456252210575, "learning_rate": 1.4226182617406994e-07, "loss": 1.5003, "step": 563 }, { "epoch": 1.3402061855670104, "grad_norm": 1.2788205228042828, "learning_rate": 1.4207780839776734e-07, "loss": 1.5807, "step": 564 }, { "epoch": 1.3427835051546393, "grad_norm": 1.2101911204508933, "learning_rate": 1.4189361731658336e-07, "loss": 1.4851, "step": 565 }, { "epoch": 1.3453608247422681, "grad_norm": 1.143725315674112, "learning_rate": 1.417092536891415e-07, "loss": 1.5258, "step": 566 }, { "epoch": 1.347938144329897, "grad_norm": 1.1692223610404973, "learning_rate": 1.4152471827477593e-07, "loss": 1.4843, "step": 567 }, { "epoch": 1.3505154639175259, "grad_norm": 1.106947712823219, "learning_rate": 1.413400118335283e-07, "loss": 1.4339, "step": 568 }, { "epoch": 1.3530927835051547, "grad_norm": 1.27487397886756, "learning_rate": 1.4115513512614468e-07, "loss": 1.4993, "step": 569 }, { "epoch": 1.3556701030927836, "grad_norm": 1.2236429851509971, "learning_rate": 1.4097008891407245e-07, "loss": 1.4858, "step": 570 }, { "epoch": 1.3582474226804124, "grad_norm": 1.156634200386137, "learning_rate": 1.407848739594571e-07, "loss": 1.4973, "step": 571 }, { "epoch": 1.3608247422680413, "grad_norm": 1.287092803375809, "learning_rate": 1.4059949102513913e-07, "loss": 1.476, "step": 572 }, { "epoch": 1.3634020618556701, "grad_norm": 1.2572273439235049, "learning_rate": 1.404139408746508e-07, "loss": 1.4798, "step": 573 }, { "epoch": 1.365979381443299, "grad_norm": 1.2276167223192924, "learning_rate": 1.4022822427221322e-07, "loss": 1.497, "step": 574 }, { "epoch": 1.3685567010309279, "grad_norm": 1.2392858668139202, "learning_rate": 1.4004234198273302e-07, "loss": 1.5471, "step": 575 }, { "epoch": 1.3711340206185567, "grad_norm": 1.2887104141411092, "learning_rate": 1.3985629477179915e-07, "loss": 1.4953, "step": 576 }, { "epoch": 1.3737113402061856, "grad_norm": 1.2401450542055277, "learning_rate": 1.3967008340567996e-07, "loss": 1.5095, "step": 577 }, { "epoch": 1.3762886597938144, "grad_norm": 1.1989888153377388, "learning_rate": 1.3948370865131977e-07, "loss": 1.5633, "step": 578 }, { "epoch": 1.3788659793814433, "grad_norm": 1.1616958019574242, "learning_rate": 1.3929717127633597e-07, "loss": 1.5035, "step": 579 }, { "epoch": 1.3814432989690721, "grad_norm": 1.1581446950268255, "learning_rate": 1.3911047204901557e-07, "loss": 1.5232, "step": 580 }, { "epoch": 1.384020618556701, "grad_norm": 1.2240328360723358, "learning_rate": 1.3892361173831243e-07, "loss": 1.4948, "step": 581 }, { "epoch": 1.3865979381443299, "grad_norm": 1.2405325514200207, "learning_rate": 1.3873659111384362e-07, "loss": 1.4815, "step": 582 }, { "epoch": 1.3891752577319587, "grad_norm": 1.253563661932654, "learning_rate": 1.385494109458866e-07, "loss": 1.4284, "step": 583 }, { "epoch": 1.3917525773195876, "grad_norm": 1.1541355431922666, "learning_rate": 1.3836207200537596e-07, "loss": 1.4213, "step": 584 }, { "epoch": 1.3943298969072164, "grad_norm": 1.2315631871967962, "learning_rate": 1.381745750639002e-07, "loss": 1.5876, "step": 585 }, { "epoch": 1.3969072164948453, "grad_norm": 1.3294003251532183, "learning_rate": 1.3798692089369854e-07, "loss": 1.5821, "step": 586 }, { "epoch": 1.3994845360824741, "grad_norm": 1.1726722981119444, "learning_rate": 1.3779911026765784e-07, "loss": 1.4679, "step": 587 }, { "epoch": 1.402061855670103, "grad_norm": 1.1782526174868226, "learning_rate": 1.3761114395930927e-07, "loss": 1.4851, "step": 588 }, { "epoch": 1.402061855670103, "eval_loss": 1.4704606533050537, "eval_runtime": 78.4306, "eval_samples_per_second": 21.203, "eval_steps_per_second": 1.326, "step": 588 }, { "epoch": 1.4046391752577319, "grad_norm": 1.192382882455904, "learning_rate": 1.3742302274282532e-07, "loss": 1.4707, "step": 589 }, { "epoch": 1.4072164948453607, "grad_norm": 1.1364191762169735, "learning_rate": 1.3723474739301636e-07, "loss": 1.4066, "step": 590 }, { "epoch": 1.4097938144329896, "grad_norm": 1.1453269827664123, "learning_rate": 1.3704631868532767e-07, "loss": 1.4515, "step": 591 }, { "epoch": 1.4123711340206184, "grad_norm": 1.1956529180296382, "learning_rate": 1.3685773739583617e-07, "loss": 1.5102, "step": 592 }, { "epoch": 1.4149484536082475, "grad_norm": 1.2356880855065446, "learning_rate": 1.3666900430124717e-07, "loss": 1.497, "step": 593 }, { "epoch": 1.4175257731958764, "grad_norm": 1.1639642247143227, "learning_rate": 1.3648012017889121e-07, "loss": 1.485, "step": 594 }, { "epoch": 1.4201030927835052, "grad_norm": 1.3028192646105916, "learning_rate": 1.3629108580672093e-07, "loss": 1.5073, "step": 595 }, { "epoch": 1.422680412371134, "grad_norm": 1.1389634389377659, "learning_rate": 1.3610190196330775e-07, "loss": 1.4455, "step": 596 }, { "epoch": 1.425257731958763, "grad_norm": 1.2726998162356058, "learning_rate": 1.3591256942783868e-07, "loss": 1.6226, "step": 597 }, { "epoch": 1.4278350515463918, "grad_norm": 1.257001783763068, "learning_rate": 1.3572308898011326e-07, "loss": 1.4527, "step": 598 }, { "epoch": 1.4304123711340206, "grad_norm": 1.1897100853456886, "learning_rate": 1.3553346140054013e-07, "loss": 1.4748, "step": 599 }, { "epoch": 1.4329896907216495, "grad_norm": 1.144640373535268, "learning_rate": 1.3534368747013394e-07, "loss": 1.4733, "step": 600 }, { "epoch": 1.4355670103092784, "grad_norm": 1.2252518120948153, "learning_rate": 1.351537679705121e-07, "loss": 1.4539, "step": 601 }, { "epoch": 1.4381443298969072, "grad_norm": 1.1565118663607803, "learning_rate": 1.3496370368389165e-07, "loss": 1.5236, "step": 602 }, { "epoch": 1.440721649484536, "grad_norm": 1.2594818027515957, "learning_rate": 1.3477349539308584e-07, "loss": 1.4856, "step": 603 }, { "epoch": 1.443298969072165, "grad_norm": 1.1419387268061763, "learning_rate": 1.3458314388150115e-07, "loss": 1.4153, "step": 604 }, { "epoch": 1.4458762886597938, "grad_norm": 1.098148594961463, "learning_rate": 1.3439264993313385e-07, "loss": 1.4447, "step": 605 }, { "epoch": 1.4484536082474226, "grad_norm": 1.2022510861175644, "learning_rate": 1.342020143325669e-07, "loss": 1.5516, "step": 606 }, { "epoch": 1.4510309278350515, "grad_norm": 1.1444341747665796, "learning_rate": 1.3401123786496663e-07, "loss": 1.4224, "step": 607 }, { "epoch": 1.4536082474226804, "grad_norm": 1.1349715757276768, "learning_rate": 1.3382032131607965e-07, "loss": 1.3973, "step": 608 }, { "epoch": 1.4561855670103092, "grad_norm": 1.1228999228709107, "learning_rate": 1.3362926547222946e-07, "loss": 1.4149, "step": 609 }, { "epoch": 1.458762886597938, "grad_norm": 1.2396644989009444, "learning_rate": 1.3343807112031327e-07, "loss": 1.4999, "step": 610 }, { "epoch": 1.4613402061855671, "grad_norm": 1.1458789067959891, "learning_rate": 1.3324673904779874e-07, "loss": 1.4606, "step": 611 }, { "epoch": 1.463917525773196, "grad_norm": 1.1579136550048348, "learning_rate": 1.3305527004272087e-07, "loss": 1.5091, "step": 612 }, { "epoch": 1.4664948453608249, "grad_norm": 1.1065943702186947, "learning_rate": 1.3286366489367846e-07, "loss": 1.4981, "step": 613 }, { "epoch": 1.4690721649484537, "grad_norm": 1.1701708173193963, "learning_rate": 1.3267192438983117e-07, "loss": 1.4864, "step": 614 }, { "epoch": 1.4716494845360826, "grad_norm": 1.1655119326822228, "learning_rate": 1.324800493208961e-07, "loss": 1.4609, "step": 615 }, { "epoch": 1.4742268041237114, "grad_norm": 1.1668952825289185, "learning_rate": 1.322880404771446e-07, "loss": 1.5529, "step": 616 }, { "epoch": 1.4768041237113403, "grad_norm": 1.1827027818749032, "learning_rate": 1.3209589864939906e-07, "loss": 1.4712, "step": 617 }, { "epoch": 1.4793814432989691, "grad_norm": 1.16644526665677, "learning_rate": 1.3190362462902935e-07, "loss": 1.4444, "step": 618 }, { "epoch": 1.481958762886598, "grad_norm": 1.2457925422571992, "learning_rate": 1.3171121920795012e-07, "loss": 1.5042, "step": 619 }, { "epoch": 1.4845360824742269, "grad_norm": 1.1848287601135188, "learning_rate": 1.3151868317861698e-07, "loss": 1.5314, "step": 620 }, { "epoch": 1.4871134020618557, "grad_norm": 1.181022425488295, "learning_rate": 1.3132601733402355e-07, "loss": 1.5557, "step": 621 }, { "epoch": 1.4896907216494846, "grad_norm": 1.2220291945868886, "learning_rate": 1.3113322246769816e-07, "loss": 1.4743, "step": 622 }, { "epoch": 1.4922680412371134, "grad_norm": 1.23521757296614, "learning_rate": 1.3094029937370049e-07, "loss": 1.4494, "step": 623 }, { "epoch": 1.4948453608247423, "grad_norm": 1.1540829106187, "learning_rate": 1.3074724884661832e-07, "loss": 1.492, "step": 624 }, { "epoch": 1.4974226804123711, "grad_norm": 1.2734897659131177, "learning_rate": 1.3055407168156436e-07, "loss": 1.5114, "step": 625 }, { "epoch": 1.5, "grad_norm": 1.0821110483827021, "learning_rate": 1.3036076867417286e-07, "loss": 1.4899, "step": 626 }, { "epoch": 1.5025773195876289, "grad_norm": 1.1591573630093586, "learning_rate": 1.3016734062059636e-07, "loss": 1.4287, "step": 627 }, { "epoch": 1.5051546391752577, "grad_norm": 1.252040765136315, "learning_rate": 1.299737883175024e-07, "loss": 1.4215, "step": 628 }, { "epoch": 1.5077319587628866, "grad_norm": 1.122072741553452, "learning_rate": 1.2978011256207041e-07, "loss": 1.4535, "step": 629 }, { "epoch": 1.5103092783505154, "grad_norm": 1.1929144211640363, "learning_rate": 1.2958631415198813e-07, "loss": 1.4264, "step": 630 }, { "epoch": 1.5128865979381443, "grad_norm": 1.1904423534607285, "learning_rate": 1.293923938854485e-07, "loss": 1.4966, "step": 631 }, { "epoch": 1.5154639175257731, "grad_norm": 1.2142748405878527, "learning_rate": 1.2919835256114638e-07, "loss": 1.4152, "step": 632 }, { "epoch": 1.518041237113402, "grad_norm": 1.2310572109795892, "learning_rate": 1.290041909782752e-07, "loss": 1.3986, "step": 633 }, { "epoch": 1.5206185567010309, "grad_norm": 1.1532910482056786, "learning_rate": 1.2880990993652377e-07, "loss": 1.4606, "step": 634 }, { "epoch": 1.5231958762886597, "grad_norm": 1.3007475509786544, "learning_rate": 1.2861551023607276e-07, "loss": 1.5304, "step": 635 }, { "epoch": 1.5257731958762886, "grad_norm": 1.15166741332348, "learning_rate": 1.2842099267759174e-07, "loss": 1.3824, "step": 636 }, { "epoch": 1.5283505154639174, "grad_norm": 1.1988826738728366, "learning_rate": 1.2822635806223556e-07, "loss": 1.567, "step": 637 }, { "epoch": 1.5283505154639174, "eval_loss": 1.4671498537063599, "eval_runtime": 78.5049, "eval_samples_per_second": 21.183, "eval_steps_per_second": 1.325, "step": 637 }, { "epoch": 1.5309278350515463, "grad_norm": 1.2386857438447851, "learning_rate": 1.2803160719164125e-07, "loss": 1.5304, "step": 638 }, { "epoch": 1.5335051546391751, "grad_norm": 1.1597769415791235, "learning_rate": 1.2783674086792466e-07, "loss": 1.497, "step": 639 }, { "epoch": 1.536082474226804, "grad_norm": 1.2924257071547485, "learning_rate": 1.2764175989367717e-07, "loss": 1.4877, "step": 640 }, { "epoch": 1.5386597938144329, "grad_norm": 1.222248016944084, "learning_rate": 1.2744666507196224e-07, "loss": 1.5257, "step": 641 }, { "epoch": 1.5412371134020617, "grad_norm": 1.0852012266696331, "learning_rate": 1.2725145720631242e-07, "loss": 1.4657, "step": 642 }, { "epoch": 1.5438144329896906, "grad_norm": 1.2029751793520205, "learning_rate": 1.2705613710072573e-07, "loss": 1.543, "step": 643 }, { "epoch": 1.5463917525773194, "grad_norm": 1.1786774736346322, "learning_rate": 1.2686070555966252e-07, "loss": 1.4163, "step": 644 }, { "epoch": 1.5489690721649485, "grad_norm": 1.1923466397926792, "learning_rate": 1.2666516338804208e-07, "loss": 1.449, "step": 645 }, { "epoch": 1.5515463917525774, "grad_norm": 1.1491363181852474, "learning_rate": 1.2646951139123932e-07, "loss": 1.4773, "step": 646 }, { "epoch": 1.5541237113402062, "grad_norm": 1.1921001128896263, "learning_rate": 1.2627375037508162e-07, "loss": 1.4596, "step": 647 }, { "epoch": 1.556701030927835, "grad_norm": 1.2215090538297548, "learning_rate": 1.2607788114584522e-07, "loss": 1.5697, "step": 648 }, { "epoch": 1.559278350515464, "grad_norm": 1.1364987023852344, "learning_rate": 1.2588190451025208e-07, "loss": 1.4126, "step": 649 }, { "epoch": 1.5618556701030928, "grad_norm": 1.139874297388743, "learning_rate": 1.2568582127546662e-07, "loss": 1.4104, "step": 650 }, { "epoch": 1.5644329896907216, "grad_norm": 1.1273021800754177, "learning_rate": 1.2548963224909223e-07, "loss": 1.4407, "step": 651 }, { "epoch": 1.5670103092783505, "grad_norm": 1.1999146152571862, "learning_rate": 1.2529333823916806e-07, "loss": 1.4779, "step": 652 }, { "epoch": 1.5695876288659794, "grad_norm": 1.1170496605169837, "learning_rate": 1.2509694005416563e-07, "loss": 1.4368, "step": 653 }, { "epoch": 1.5721649484536082, "grad_norm": 1.099167093974349, "learning_rate": 1.2490043850298557e-07, "loss": 1.4932, "step": 654 }, { "epoch": 1.574742268041237, "grad_norm": 1.219342527534671, "learning_rate": 1.2470383439495416e-07, "loss": 1.4633, "step": 655 }, { "epoch": 1.577319587628866, "grad_norm": 1.2125741355588842, "learning_rate": 1.2450712853982014e-07, "loss": 1.5161, "step": 656 }, { "epoch": 1.579896907216495, "grad_norm": 1.2755825455134522, "learning_rate": 1.2431032174775127e-07, "loss": 1.5225, "step": 657 }, { "epoch": 1.5824742268041239, "grad_norm": 1.1521606084223124, "learning_rate": 1.2411341482933108e-07, "loss": 1.4308, "step": 658 }, { "epoch": 1.5850515463917527, "grad_norm": 1.165275382475451, "learning_rate": 1.239164085955555e-07, "loss": 1.5024, "step": 659 }, { "epoch": 1.5876288659793816, "grad_norm": 1.2609655964912305, "learning_rate": 1.2371930385782943e-07, "loss": 1.4669, "step": 660 }, { "epoch": 1.5902061855670104, "grad_norm": 1.1698575645046683, "learning_rate": 1.2352210142796356e-07, "loss": 1.4752, "step": 661 }, { "epoch": 1.5927835051546393, "grad_norm": 1.1966335794904208, "learning_rate": 1.2332480211817091e-07, "loss": 1.5478, "step": 662 }, { "epoch": 1.5953608247422681, "grad_norm": 1.081476396234954, "learning_rate": 1.2312740674106347e-07, "loss": 1.451, "step": 663 }, { "epoch": 1.597938144329897, "grad_norm": 1.2089145441748135, "learning_rate": 1.22929916109649e-07, "loss": 1.4975, "step": 664 }, { "epoch": 1.6005154639175259, "grad_norm": 1.2416284172109027, "learning_rate": 1.227323310373275e-07, "loss": 1.43, "step": 665 }, { "epoch": 1.6030927835051547, "grad_norm": 1.2758382819864167, "learning_rate": 1.2253465233788794e-07, "loss": 1.4589, "step": 666 }, { "epoch": 1.6056701030927836, "grad_norm": 1.1736803322764697, "learning_rate": 1.22336880825505e-07, "loss": 1.3896, "step": 667 }, { "epoch": 1.6082474226804124, "grad_norm": 1.1927775409437176, "learning_rate": 1.2213901731473551e-07, "loss": 1.5394, "step": 668 }, { "epoch": 1.6108247422680413, "grad_norm": 1.2264294531171918, "learning_rate": 1.219410626205153e-07, "loss": 1.4543, "step": 669 }, { "epoch": 1.6134020618556701, "grad_norm": 1.2693861374653377, "learning_rate": 1.217430175581557e-07, "loss": 1.484, "step": 670 }, { "epoch": 1.615979381443299, "grad_norm": 1.2665036241537893, "learning_rate": 1.2154488294334027e-07, "loss": 1.5607, "step": 671 }, { "epoch": 1.6185567010309279, "grad_norm": 1.1703235363860394, "learning_rate": 1.2134665959212136e-07, "loss": 1.4644, "step": 672 }, { "epoch": 1.6211340206185567, "grad_norm": 1.193069004037872, "learning_rate": 1.211483483209169e-07, "loss": 1.4888, "step": 673 }, { "epoch": 1.6237113402061856, "grad_norm": 1.2361705074035756, "learning_rate": 1.209499499465068e-07, "loss": 1.4504, "step": 674 }, { "epoch": 1.6262886597938144, "grad_norm": 1.095084009584948, "learning_rate": 1.2075146528602983e-07, "loss": 1.4828, "step": 675 }, { "epoch": 1.6288659793814433, "grad_norm": 1.1262123200952905, "learning_rate": 1.2055289515698006e-07, "loss": 1.5487, "step": 676 }, { "epoch": 1.6314432989690721, "grad_norm": 1.1378828378426857, "learning_rate": 1.2035424037720364e-07, "loss": 1.4921, "step": 677 }, { "epoch": 1.634020618556701, "grad_norm": 1.1961288239091903, "learning_rate": 1.2015550176489537e-07, "loss": 1.4421, "step": 678 }, { "epoch": 1.6365979381443299, "grad_norm": 1.1366747357584532, "learning_rate": 1.199566801385953e-07, "loss": 1.4392, "step": 679 }, { "epoch": 1.6391752577319587, "grad_norm": 1.1909816425714403, "learning_rate": 1.1975777631718532e-07, "loss": 1.5001, "step": 680 }, { "epoch": 1.6417525773195876, "grad_norm": 1.2963539362844378, "learning_rate": 1.19558791119886e-07, "loss": 1.4605, "step": 681 }, { "epoch": 1.6443298969072164, "grad_norm": 1.1580390642200817, "learning_rate": 1.19359725366253e-07, "loss": 1.5063, "step": 682 }, { "epoch": 1.6469072164948453, "grad_norm": 1.216487820544871, "learning_rate": 1.1916057987617374e-07, "loss": 1.4886, "step": 683 }, { "epoch": 1.6494845360824741, "grad_norm": 1.2218109581350323, "learning_rate": 1.1896135546986407e-07, "loss": 1.4608, "step": 684 }, { "epoch": 1.652061855670103, "grad_norm": 1.2280111906896558, "learning_rate": 1.1876205296786493e-07, "loss": 1.5096, "step": 685 }, { "epoch": 1.6546391752577319, "grad_norm": 1.2166796078055058, "learning_rate": 1.1856267319103876e-07, "loss": 1.4692, "step": 686 }, { "epoch": 1.6546391752577319, "eval_loss": 1.4642903804779053, "eval_runtime": 78.6766, "eval_samples_per_second": 21.137, "eval_steps_per_second": 1.322, "step": 686 }, { "epoch": 1.6572164948453607, "grad_norm": 1.1939355446450859, "learning_rate": 1.1836321696056645e-07, "loss": 1.4137, "step": 687 }, { "epoch": 1.6597938144329896, "grad_norm": 1.2546613486361071, "learning_rate": 1.1816368509794363e-07, "loss": 1.512, "step": 688 }, { "epoch": 1.6623711340206184, "grad_norm": 1.1366449756739982, "learning_rate": 1.1796407842497753e-07, "loss": 1.3836, "step": 689 }, { "epoch": 1.6649484536082473, "grad_norm": 1.2553355162175337, "learning_rate": 1.1776439776378351e-07, "loss": 1.4565, "step": 690 }, { "epoch": 1.6675257731958761, "grad_norm": 1.20909630643183, "learning_rate": 1.1756464393678151e-07, "loss": 1.4481, "step": 691 }, { "epoch": 1.670103092783505, "grad_norm": 1.2273438479078924, "learning_rate": 1.1736481776669305e-07, "loss": 1.4903, "step": 692 }, { "epoch": 1.672680412371134, "grad_norm": 1.1909626287045671, "learning_rate": 1.1716492007653737e-07, "loss": 1.5012, "step": 693 }, { "epoch": 1.675257731958763, "grad_norm": 1.158968259505721, "learning_rate": 1.1696495168962845e-07, "loss": 1.5465, "step": 694 }, { "epoch": 1.6778350515463918, "grad_norm": 1.1963581026774628, "learning_rate": 1.1676491342957142e-07, "loss": 1.4729, "step": 695 }, { "epoch": 1.6804123711340206, "grad_norm": 1.215536392765087, "learning_rate": 1.1656480612025911e-07, "loss": 1.4164, "step": 696 }, { "epoch": 1.6829896907216495, "grad_norm": 1.0521259077304612, "learning_rate": 1.163646305858688e-07, "loss": 1.3678, "step": 697 }, { "epoch": 1.6855670103092784, "grad_norm": 1.295543359347737, "learning_rate": 1.1616438765085881e-07, "loss": 1.57, "step": 698 }, { "epoch": 1.6881443298969072, "grad_norm": 1.1720574150387943, "learning_rate": 1.1596407813996498e-07, "loss": 1.5221, "step": 699 }, { "epoch": 1.690721649484536, "grad_norm": 1.186785802460397, "learning_rate": 1.1576370287819735e-07, "loss": 1.4673, "step": 700 }, { "epoch": 1.693298969072165, "grad_norm": 1.1589224859683183, "learning_rate": 1.155632626908369e-07, "loss": 1.3919, "step": 701 }, { "epoch": 1.6958762886597938, "grad_norm": 1.3034607577131674, "learning_rate": 1.1536275840343183e-07, "loss": 1.43, "step": 702 }, { "epoch": 1.6984536082474226, "grad_norm": 1.1721298121139936, "learning_rate": 1.1516219084179448e-07, "loss": 1.5556, "step": 703 }, { "epoch": 1.7010309278350515, "grad_norm": 1.164281783704574, "learning_rate": 1.149615608319978e-07, "loss": 1.4449, "step": 704 }, { "epoch": 1.7036082474226806, "grad_norm": 1.1144845067827036, "learning_rate": 1.1476086920037183e-07, "loss": 1.5204, "step": 705 }, { "epoch": 1.7061855670103094, "grad_norm": 1.1470381221039117, "learning_rate": 1.1456011677350051e-07, "loss": 1.4096, "step": 706 }, { "epoch": 1.7087628865979383, "grad_norm": 1.1938066626201722, "learning_rate": 1.1435930437821812e-07, "loss": 1.4299, "step": 707 }, { "epoch": 1.7113402061855671, "grad_norm": 1.389576843014182, "learning_rate": 1.1415843284160598e-07, "loss": 1.4736, "step": 708 }, { "epoch": 1.713917525773196, "grad_norm": 1.1886965701829686, "learning_rate": 1.1395750299098899e-07, "loss": 1.4972, "step": 709 }, { "epoch": 1.7164948453608249, "grad_norm": 1.1389546972088997, "learning_rate": 1.1375651565393218e-07, "loss": 1.5518, "step": 710 }, { "epoch": 1.7190721649484537, "grad_norm": 1.1743796585118387, "learning_rate": 1.1355547165823738e-07, "loss": 1.4672, "step": 711 }, { "epoch": 1.7216494845360826, "grad_norm": 1.1460525519017093, "learning_rate": 1.1335437183193979e-07, "loss": 1.5233, "step": 712 }, { "epoch": 1.7242268041237114, "grad_norm": 1.1586406558048044, "learning_rate": 1.1315321700330454e-07, "loss": 1.4686, "step": 713 }, { "epoch": 1.7268041237113403, "grad_norm": 1.1369470779252082, "learning_rate": 1.1295200800082326e-07, "loss": 1.4688, "step": 714 }, { "epoch": 1.7293814432989691, "grad_norm": 1.1705799315615684, "learning_rate": 1.1275074565321078e-07, "loss": 1.3893, "step": 715 }, { "epoch": 1.731958762886598, "grad_norm": 1.1725120595380418, "learning_rate": 1.125494307894016e-07, "loss": 1.537, "step": 716 }, { "epoch": 1.7345360824742269, "grad_norm": 1.0734797144766555, "learning_rate": 1.1234806423854653e-07, "loss": 1.4388, "step": 717 }, { "epoch": 1.7371134020618557, "grad_norm": 1.170033873518124, "learning_rate": 1.1214664683000924e-07, "loss": 1.3753, "step": 718 }, { "epoch": 1.7396907216494846, "grad_norm": 1.231373540289329, "learning_rate": 1.1194517939336287e-07, "loss": 1.5497, "step": 719 }, { "epoch": 1.7422680412371134, "grad_norm": 1.1946433920559838, "learning_rate": 1.1174366275838662e-07, "loss": 1.413, "step": 720 }, { "epoch": 1.7448453608247423, "grad_norm": 1.1418431201062664, "learning_rate": 1.115420977550624e-07, "loss": 1.4914, "step": 721 }, { "epoch": 1.7474226804123711, "grad_norm": 1.2072128707535221, "learning_rate": 1.1134048521357115e-07, "loss": 1.4836, "step": 722 }, { "epoch": 1.75, "grad_norm": 1.194692316000769, "learning_rate": 1.1113882596428976e-07, "loss": 1.4389, "step": 723 }, { "epoch": 1.7525773195876289, "grad_norm": 1.181835370102449, "learning_rate": 1.1093712083778746e-07, "loss": 1.4542, "step": 724 }, { "epoch": 1.7551546391752577, "grad_norm": 1.1310122085797796, "learning_rate": 1.1073537066482235e-07, "loss": 1.4572, "step": 725 }, { "epoch": 1.7577319587628866, "grad_norm": 1.1299559219838877, "learning_rate": 1.1053357627633821e-07, "loss": 1.5374, "step": 726 }, { "epoch": 1.7603092783505154, "grad_norm": 1.2302892939334757, "learning_rate": 1.1033173850346081e-07, "loss": 1.5156, "step": 727 }, { "epoch": 1.7628865979381443, "grad_norm": 1.1376050539784393, "learning_rate": 1.1012985817749462e-07, "loss": 1.4994, "step": 728 }, { "epoch": 1.7654639175257731, "grad_norm": 1.1912506938583958, "learning_rate": 1.0992793612991946e-07, "loss": 1.5358, "step": 729 }, { "epoch": 1.768041237113402, "grad_norm": 1.2323374068579527, "learning_rate": 1.097259731923869e-07, "loss": 1.5446, "step": 730 }, { "epoch": 1.7706185567010309, "grad_norm": 1.2255437302126448, "learning_rate": 1.0952397019671694e-07, "loss": 1.413, "step": 731 }, { "epoch": 1.7731958762886597, "grad_norm": 1.2608512214948044, "learning_rate": 1.0932192797489459e-07, "loss": 1.5306, "step": 732 }, { "epoch": 1.7757731958762886, "grad_norm": 1.187848987827898, "learning_rate": 1.0911984735906635e-07, "loss": 1.4589, "step": 733 }, { "epoch": 1.7783505154639174, "grad_norm": 1.1078353763626878, "learning_rate": 1.0891772918153694e-07, "loss": 1.5026, "step": 734 }, { "epoch": 1.7809278350515463, "grad_norm": 1.1847073079284023, "learning_rate": 1.0871557427476584e-07, "loss": 1.4819, "step": 735 }, { "epoch": 1.7809278350515463, "eval_loss": 1.4616869688034058, "eval_runtime": 78.6285, "eval_samples_per_second": 21.15, "eval_steps_per_second": 1.323, "step": 735 }, { "epoch": 1.7835051546391751, "grad_norm": 1.1649395427594373, "learning_rate": 1.0851338347136356e-07, "loss": 1.5143, "step": 736 }, { "epoch": 1.786082474226804, "grad_norm": 1.284550306447524, "learning_rate": 1.0831115760408871e-07, "loss": 1.4542, "step": 737 }, { "epoch": 1.7886597938144329, "grad_norm": 1.1925120790488934, "learning_rate": 1.0810889750584424e-07, "loss": 1.426, "step": 738 }, { "epoch": 1.7912371134020617, "grad_norm": 1.178551347790486, "learning_rate": 1.07906604009674e-07, "loss": 1.4372, "step": 739 }, { "epoch": 1.7938144329896906, "grad_norm": 1.2458332188073578, "learning_rate": 1.077042779487595e-07, "loss": 1.5252, "step": 740 }, { "epoch": 1.7963917525773194, "grad_norm": 1.2661697455131442, "learning_rate": 1.0750192015641633e-07, "loss": 1.5066, "step": 741 }, { "epoch": 1.7989690721649485, "grad_norm": 1.1069806037454215, "learning_rate": 1.0729953146609074e-07, "loss": 1.4264, "step": 742 }, { "epoch": 1.8015463917525774, "grad_norm": 1.194263854578521, "learning_rate": 1.0709711271135635e-07, "loss": 1.4339, "step": 743 }, { "epoch": 1.8041237113402062, "grad_norm": 1.2068338783635435, "learning_rate": 1.0689466472591048e-07, "loss": 1.4341, "step": 744 }, { "epoch": 1.806701030927835, "grad_norm": 1.1526056815131385, "learning_rate": 1.066921883435709e-07, "loss": 1.4382, "step": 745 }, { "epoch": 1.809278350515464, "grad_norm": 1.1526436748662838, "learning_rate": 1.0648968439827239e-07, "loss": 1.4525, "step": 746 }, { "epoch": 1.8118556701030928, "grad_norm": 1.2587407335769552, "learning_rate": 1.0628715372406309e-07, "loss": 1.4995, "step": 747 }, { "epoch": 1.8144329896907216, "grad_norm": 1.2439345895593688, "learning_rate": 1.0608459715510139e-07, "loss": 1.4172, "step": 748 }, { "epoch": 1.8170103092783505, "grad_norm": 1.2048841761527278, "learning_rate": 1.058820155256523e-07, "loss": 1.4536, "step": 749 }, { "epoch": 1.8195876288659794, "grad_norm": 1.1712052519870668, "learning_rate": 1.0567940967008396e-07, "loss": 1.4739, "step": 750 }, { "epoch": 1.8221649484536082, "grad_norm": 1.1253615480764265, "learning_rate": 1.0547678042286435e-07, "loss": 1.4362, "step": 751 }, { "epoch": 1.824742268041237, "grad_norm": 1.1941314320057088, "learning_rate": 1.0527412861855789e-07, "loss": 1.5473, "step": 752 }, { "epoch": 1.827319587628866, "grad_norm": 1.1515723933518516, "learning_rate": 1.0507145509182169e-07, "loss": 1.4095, "step": 753 }, { "epoch": 1.829896907216495, "grad_norm": 1.1459437804868953, "learning_rate": 1.0486876067740252e-07, "loss": 1.4454, "step": 754 }, { "epoch": 1.8324742268041239, "grad_norm": 1.2555188381647702, "learning_rate": 1.0466604621013306e-07, "loss": 1.5032, "step": 755 }, { "epoch": 1.8350515463917527, "grad_norm": 1.173256763665764, "learning_rate": 1.0446331252492864e-07, "loss": 1.542, "step": 756 }, { "epoch": 1.8376288659793816, "grad_norm": 1.1616854603706852, "learning_rate": 1.0426056045678375e-07, "loss": 1.3301, "step": 757 }, { "epoch": 1.8402061855670104, "grad_norm": 1.1961580743330678, "learning_rate": 1.0405779084076855e-07, "loss": 1.5125, "step": 758 }, { "epoch": 1.8427835051546393, "grad_norm": 1.153920316864521, "learning_rate": 1.0385500451202549e-07, "loss": 1.5104, "step": 759 }, { "epoch": 1.8453608247422681, "grad_norm": 1.2288872831871334, "learning_rate": 1.036522023057659e-07, "loss": 1.54, "step": 760 }, { "epoch": 1.847938144329897, "grad_norm": 1.1774978065006576, "learning_rate": 1.0344938505726641e-07, "loss": 1.4226, "step": 761 }, { "epoch": 1.8505154639175259, "grad_norm": 1.18190720576571, "learning_rate": 1.0324655360186567e-07, "loss": 1.4874, "step": 762 }, { "epoch": 1.8530927835051547, "grad_norm": 1.0881741375618583, "learning_rate": 1.0304370877496089e-07, "loss": 1.4196, "step": 763 }, { "epoch": 1.8556701030927836, "grad_norm": 1.1920925526660484, "learning_rate": 1.0284085141200423e-07, "loss": 1.4022, "step": 764 }, { "epoch": 1.8582474226804124, "grad_norm": 1.2553686949150205, "learning_rate": 1.0263798234849954e-07, "loss": 1.5411, "step": 765 }, { "epoch": 1.8608247422680413, "grad_norm": 1.1849323570576418, "learning_rate": 1.0243510241999897e-07, "loss": 1.4376, "step": 766 }, { "epoch": 1.8634020618556701, "grad_norm": 1.1748076105825112, "learning_rate": 1.0223221246209918e-07, "loss": 1.3917, "step": 767 }, { "epoch": 1.865979381443299, "grad_norm": 1.1437404458677716, "learning_rate": 1.0202931331043839e-07, "loss": 1.5412, "step": 768 }, { "epoch": 1.8685567010309279, "grad_norm": 1.1588752261265902, "learning_rate": 1.0182640580069248e-07, "loss": 1.4016, "step": 769 }, { "epoch": 1.8711340206185567, "grad_norm": 1.244615607327111, "learning_rate": 1.016234907685719e-07, "loss": 1.4501, "step": 770 }, { "epoch": 1.8737113402061856, "grad_norm": 1.1809049167530614, "learning_rate": 1.0142056904981802e-07, "loss": 1.4637, "step": 771 }, { "epoch": 1.8762886597938144, "grad_norm": 1.1101634996349434, "learning_rate": 1.0121764148019975e-07, "loss": 1.4228, "step": 772 }, { "epoch": 1.8788659793814433, "grad_norm": 1.2377079616714697, "learning_rate": 1.0101470889551012e-07, "loss": 1.4533, "step": 773 }, { "epoch": 1.8814432989690721, "grad_norm": 1.160543485045226, "learning_rate": 1.0081177213156278e-07, "loss": 1.4735, "step": 774 }, { "epoch": 1.884020618556701, "grad_norm": 1.115374473748354, "learning_rate": 1.0060883202418861e-07, "loss": 1.438, "step": 775 }, { "epoch": 1.8865979381443299, "grad_norm": 1.1305131743119878, "learning_rate": 1.004058894092323e-07, "loss": 1.4186, "step": 776 }, { "epoch": 1.8891752577319587, "grad_norm": 1.151990553361531, "learning_rate": 1.0020294512254883e-07, "loss": 1.5121, "step": 777 }, { "epoch": 1.8917525773195876, "grad_norm": 1.1278991620860568, "learning_rate": 1e-07, "loss": 1.4333, "step": 778 }, { "epoch": 1.8943298969072164, "grad_norm": 1.281137685220673, "learning_rate": 9.97970548774512e-08, "loss": 1.4416, "step": 779 }, { "epoch": 1.8969072164948453, "grad_norm": 1.1772600120424532, "learning_rate": 9.959411059076768e-08, "loss": 1.409, "step": 780 }, { "epoch": 1.8994845360824741, "grad_norm": 1.16485761208349, "learning_rate": 9.939116797581138e-08, "loss": 1.4324, "step": 781 }, { "epoch": 1.902061855670103, "grad_norm": 1.203443440232203, "learning_rate": 9.918822786843724e-08, "loss": 1.4324, "step": 782 }, { "epoch": 1.9046391752577319, "grad_norm": 1.20376421998538, "learning_rate": 9.898529110448987e-08, "loss": 1.501, "step": 783 }, { "epoch": 1.9072164948453607, "grad_norm": 1.1533270795807118, "learning_rate": 9.878235851980025e-08, "loss": 1.404, "step": 784 }, { "epoch": 1.9072164948453607, "eval_loss": 1.4594255685806274, "eval_runtime": 78.6148, "eval_samples_per_second": 21.154, "eval_steps_per_second": 1.323, "step": 784 }, { "epoch": 1.9097938144329896, "grad_norm": 1.1889743164637112, "learning_rate": 9.857943095018198e-08, "loss": 1.4652, "step": 785 }, { "epoch": 1.9123711340206184, "grad_norm": 1.162304380840768, "learning_rate": 9.837650923142809e-08, "loss": 1.3641, "step": 786 }, { "epoch": 1.9149484536082473, "grad_norm": 1.260002079711297, "learning_rate": 9.817359419930751e-08, "loss": 1.5022, "step": 787 }, { "epoch": 1.9175257731958761, "grad_norm": 1.1295427248534264, "learning_rate": 9.797068668956162e-08, "loss": 1.4553, "step": 788 }, { "epoch": 1.920103092783505, "grad_norm": 1.1730252131786578, "learning_rate": 9.77677875379008e-08, "loss": 1.4748, "step": 789 }, { "epoch": 1.922680412371134, "grad_norm": 1.2020202803132716, "learning_rate": 9.756489758000104e-08, "loss": 1.4479, "step": 790 }, { "epoch": 1.925257731958763, "grad_norm": 1.180219637473307, "learning_rate": 9.736201765150045e-08, "loss": 1.4974, "step": 791 }, { "epoch": 1.9278350515463918, "grad_norm": 1.2291944688317633, "learning_rate": 9.715914858799575e-08, "loss": 1.4228, "step": 792 }, { "epoch": 1.9304123711340206, "grad_norm": 1.1131303155372065, "learning_rate": 9.69562912250391e-08, "loss": 1.4693, "step": 793 }, { "epoch": 1.9329896907216495, "grad_norm": 1.1994615231875885, "learning_rate": 9.675344639813433e-08, "loss": 1.4745, "step": 794 }, { "epoch": 1.9355670103092784, "grad_norm": 1.115870585349576, "learning_rate": 9.655061494273362e-08, "loss": 1.4671, "step": 795 }, { "epoch": 1.9381443298969072, "grad_norm": 1.2054754001979724, "learning_rate": 9.63477976942341e-08, "loss": 1.5195, "step": 796 }, { "epoch": 1.940721649484536, "grad_norm": 1.1464295691900082, "learning_rate": 9.614499548797452e-08, "loss": 1.4402, "step": 797 }, { "epoch": 1.943298969072165, "grad_norm": 1.1914103186703613, "learning_rate": 9.594220915923148e-08, "loss": 1.5797, "step": 798 }, { "epoch": 1.9458762886597938, "grad_norm": 1.1615295842359556, "learning_rate": 9.573943954321626e-08, "loss": 1.4126, "step": 799 }, { "epoch": 1.9484536082474226, "grad_norm": 1.19026250293737, "learning_rate": 9.553668747507138e-08, "loss": 1.4332, "step": 800 }, { "epoch": 1.9510309278350515, "grad_norm": 1.1351877413773055, "learning_rate": 9.533395378986697e-08, "loss": 1.4784, "step": 801 }, { "epoch": 1.9536082474226806, "grad_norm": 1.227791339106945, "learning_rate": 9.51312393225975e-08, "loss": 1.4392, "step": 802 }, { "epoch": 1.9561855670103094, "grad_norm": 1.2100140189737674, "learning_rate": 9.492854490817833e-08, "loss": 1.4693, "step": 803 }, { "epoch": 1.9587628865979383, "grad_norm": 1.0478682320033872, "learning_rate": 9.472587138144214e-08, "loss": 1.4117, "step": 804 }, { "epoch": 1.9613402061855671, "grad_norm": 1.1920119917461085, "learning_rate": 9.452321957713563e-08, "loss": 1.556, "step": 805 }, { "epoch": 1.963917525773196, "grad_norm": 1.1902655777598523, "learning_rate": 9.432059032991606e-08, "loss": 1.5102, "step": 806 }, { "epoch": 1.9664948453608249, "grad_norm": 1.1511704775031535, "learning_rate": 9.411798447434773e-08, "loss": 1.5281, "step": 807 }, { "epoch": 1.9690721649484537, "grad_norm": 1.1636100359208144, "learning_rate": 9.39154028448986e-08, "loss": 1.4024, "step": 808 }, { "epoch": 1.9716494845360826, "grad_norm": 1.2582478560602157, "learning_rate": 9.371284627593691e-08, "loss": 1.4519, "step": 809 }, { "epoch": 1.9742268041237114, "grad_norm": 1.1608958350691665, "learning_rate": 9.351031560172764e-08, "loss": 1.4286, "step": 810 }, { "epoch": 1.9768041237113403, "grad_norm": 1.1725970187771935, "learning_rate": 9.330781165642907e-08, "loss": 1.4858, "step": 811 }, { "epoch": 1.9793814432989691, "grad_norm": 1.181405747708069, "learning_rate": 9.310533527408951e-08, "loss": 1.5193, "step": 812 }, { "epoch": 1.981958762886598, "grad_norm": 1.1949902203170548, "learning_rate": 9.290288728864365e-08, "loss": 1.3768, "step": 813 }, { "epoch": 1.9845360824742269, "grad_norm": 1.2444243036816676, "learning_rate": 9.270046853390924e-08, "loss": 1.4866, "step": 814 }, { "epoch": 1.9871134020618557, "grad_norm": 1.162040164523566, "learning_rate": 9.249807984358369e-08, "loss": 1.4277, "step": 815 }, { "epoch": 1.9896907216494846, "grad_norm": 1.3041991278727916, "learning_rate": 9.229572205124051e-08, "loss": 1.4895, "step": 816 }, { "epoch": 1.9922680412371134, "grad_norm": 1.1800946591513317, "learning_rate": 9.2093395990326e-08, "loss": 1.6118, "step": 817 }, { "epoch": 1.9948453608247423, "grad_norm": 1.120730199367575, "learning_rate": 9.189110249415576e-08, "loss": 1.4777, "step": 818 }, { "epoch": 1.9974226804123711, "grad_norm": 1.165214854260427, "learning_rate": 9.168884239591129e-08, "loss": 1.4491, "step": 819 }, { "epoch": 2.0, "grad_norm": 1.1460287106000804, "learning_rate": 9.148661652863641e-08, "loss": 1.442, "step": 820 }, { "epoch": 2.002577319587629, "grad_norm": 1.245092231884586, "learning_rate": 9.128442572523417e-08, "loss": 1.4238, "step": 821 }, { "epoch": 2.0051546391752577, "grad_norm": 1.1566295496507226, "learning_rate": 9.108227081846305e-08, "loss": 1.4313, "step": 822 }, { "epoch": 2.0077319587628866, "grad_norm": 1.2544751166156012, "learning_rate": 9.088015264093364e-08, "loss": 1.4879, "step": 823 }, { "epoch": 2.0103092783505154, "grad_norm": 1.2229877060400391, "learning_rate": 9.067807202510542e-08, "loss": 1.4781, "step": 824 }, { "epoch": 2.0128865979381443, "grad_norm": 1.1382534019879336, "learning_rate": 9.047602980328308e-08, "loss": 1.4163, "step": 825 }, { "epoch": 2.015463917525773, "grad_norm": 1.1936874170381253, "learning_rate": 9.027402680761309e-08, "loss": 1.5233, "step": 826 }, { "epoch": 2.018041237113402, "grad_norm": 1.133631677446316, "learning_rate": 9.007206387008053e-08, "loss": 1.397, "step": 827 }, { "epoch": 2.020618556701031, "grad_norm": 1.2442262218300326, "learning_rate": 8.987014182250538e-08, "loss": 1.4734, "step": 828 }, { "epoch": 2.0231958762886597, "grad_norm": 1.1593473271235548, "learning_rate": 8.966826149653922e-08, "loss": 1.5101, "step": 829 }, { "epoch": 2.0257731958762886, "grad_norm": 1.2054412501356118, "learning_rate": 8.94664237236618e-08, "loss": 1.4657, "step": 830 }, { "epoch": 2.0283505154639174, "grad_norm": 1.1696863220137095, "learning_rate": 8.926462933517765e-08, "loss": 1.4385, "step": 831 }, { "epoch": 2.0309278350515463, "grad_norm": 1.2085685291526942, "learning_rate": 8.906287916221257e-08, "loss": 1.4567, "step": 832 }, { "epoch": 2.033505154639175, "grad_norm": 1.2062684152337084, "learning_rate": 8.886117403571023e-08, "loss": 1.4903, "step": 833 }, { "epoch": 2.033505154639175, "eval_loss": 1.4574321508407593, "eval_runtime": 78.538, "eval_samples_per_second": 21.174, "eval_steps_per_second": 1.324, "step": 833 }, { "epoch": 2.036082474226804, "grad_norm": 1.2985823482438499, "learning_rate": 8.865951478642886e-08, "loss": 1.4945, "step": 834 }, { "epoch": 2.038659793814433, "grad_norm": 1.2008208109365806, "learning_rate": 8.845790224493761e-08, "loss": 1.4053, "step": 835 }, { "epoch": 2.0412371134020617, "grad_norm": 1.1173370303783305, "learning_rate": 8.825633724161334e-08, "loss": 1.437, "step": 836 }, { "epoch": 2.0438144329896906, "grad_norm": 1.2675969181316824, "learning_rate": 8.805482060663712e-08, "loss": 1.4189, "step": 837 }, { "epoch": 2.0463917525773194, "grad_norm": 1.2147757078811159, "learning_rate": 8.785335316999077e-08, "loss": 1.4214, "step": 838 }, { "epoch": 2.0489690721649483, "grad_norm": 1.09453864552264, "learning_rate": 8.765193576145346e-08, "loss": 1.4027, "step": 839 }, { "epoch": 2.051546391752577, "grad_norm": 1.216226711944593, "learning_rate": 8.745056921059839e-08, "loss": 1.5143, "step": 840 }, { "epoch": 2.054123711340206, "grad_norm": 1.2055008222540708, "learning_rate": 8.724925434678922e-08, "loss": 1.4489, "step": 841 }, { "epoch": 2.056701030927835, "grad_norm": 1.1336500080565066, "learning_rate": 8.704799199917673e-08, "loss": 1.4248, "step": 842 }, { "epoch": 2.0592783505154637, "grad_norm": 1.215103376196868, "learning_rate": 8.684678299669546e-08, "loss": 1.4463, "step": 843 }, { "epoch": 2.0618556701030926, "grad_norm": 1.1882950937372736, "learning_rate": 8.664562816806021e-08, "loss": 1.4444, "step": 844 }, { "epoch": 2.0644329896907214, "grad_norm": 1.2047730105242802, "learning_rate": 8.64445283417626e-08, "loss": 1.4514, "step": 845 }, { "epoch": 2.0670103092783503, "grad_norm": 1.1364686666884227, "learning_rate": 8.624348434606781e-08, "loss": 1.4285, "step": 846 }, { "epoch": 2.069587628865979, "grad_norm": 1.2216577804549105, "learning_rate": 8.6042497009011e-08, "loss": 1.5001, "step": 847 }, { "epoch": 2.072164948453608, "grad_norm": 1.167316107588148, "learning_rate": 8.5841567158394e-08, "loss": 1.4095, "step": 848 }, { "epoch": 2.074742268041237, "grad_norm": 1.087136320546188, "learning_rate": 8.564069562178188e-08, "loss": 1.4547, "step": 849 }, { "epoch": 2.0773195876288657, "grad_norm": 1.10777310102604, "learning_rate": 8.543988322649954e-08, "loss": 1.4905, "step": 850 }, { "epoch": 2.0798969072164946, "grad_norm": 1.2198690834759995, "learning_rate": 8.523913079962816e-08, "loss": 1.3988, "step": 851 }, { "epoch": 2.0824742268041234, "grad_norm": 1.2266366218856903, "learning_rate": 8.50384391680022e-08, "loss": 1.4972, "step": 852 }, { "epoch": 2.0850515463917523, "grad_norm": 1.1644015048600025, "learning_rate": 8.483780915820552e-08, "loss": 1.4233, "step": 853 }, { "epoch": 2.087628865979381, "grad_norm": 1.1537200560912633, "learning_rate": 8.463724159656814e-08, "loss": 1.5044, "step": 854 }, { "epoch": 2.09020618556701, "grad_norm": 1.1190956026619867, "learning_rate": 8.443673730916312e-08, "loss": 1.4284, "step": 855 }, { "epoch": 2.092783505154639, "grad_norm": 1.1476534954615265, "learning_rate": 8.423629712180264e-08, "loss": 1.4601, "step": 856 }, { "epoch": 2.095360824742268, "grad_norm": 1.2130889970169285, "learning_rate": 8.403592186003501e-08, "loss": 1.3902, "step": 857 }, { "epoch": 2.097938144329897, "grad_norm": 1.2106313562862567, "learning_rate": 8.383561234914119e-08, "loss": 1.5202, "step": 858 }, { "epoch": 2.100515463917526, "grad_norm": 1.2790874195534712, "learning_rate": 8.36353694141312e-08, "loss": 1.5241, "step": 859 }, { "epoch": 2.1030927835051547, "grad_norm": 1.1984788041581806, "learning_rate": 8.34351938797409e-08, "loss": 1.5185, "step": 860 }, { "epoch": 2.1056701030927836, "grad_norm": 1.1224530119764298, "learning_rate": 8.323508657042858e-08, "loss": 1.4387, "step": 861 }, { "epoch": 2.1082474226804124, "grad_norm": 1.1916193301815299, "learning_rate": 8.303504831037154e-08, "loss": 1.433, "step": 862 }, { "epoch": 2.1108247422680413, "grad_norm": 1.269383237065682, "learning_rate": 8.283507992346263e-08, "loss": 1.58, "step": 863 }, { "epoch": 2.002577319587629, "grad_norm": 1.0439514094170574, "learning_rate": 8.263518223330696e-08, "loss": 1.3774, "step": 864 }, { "epoch": 2.0051546391752577, "grad_norm": 1.1249347513631904, "learning_rate": 8.243535606321848e-08, "loss": 1.4098, "step": 865 }, { "epoch": 2.0077319587628866, "grad_norm": 1.375007615993654, "learning_rate": 8.22356022362165e-08, "loss": 1.4725, "step": 866 }, { "epoch": 2.0103092783505154, "grad_norm": 1.1571951227795978, "learning_rate": 8.203592157502244e-08, "loss": 1.4642, "step": 867 }, { "epoch": 2.0128865979381443, "grad_norm": 1.1725964239389173, "learning_rate": 8.183631490205636e-08, "loss": 1.4317, "step": 868 }, { "epoch": 2.015463917525773, "grad_norm": 1.1131141063076042, "learning_rate": 8.163678303943356e-08, "loss": 1.4534, "step": 869 }, { "epoch": 2.018041237113402, "grad_norm": 1.174599695198473, "learning_rate": 8.143732680896123e-08, "loss": 1.4076, "step": 870 }, { "epoch": 2.020618556701031, "grad_norm": 1.1730868356762598, "learning_rate": 8.123794703213509e-08, "loss": 1.457, "step": 871 }, { "epoch": 2.0231958762886597, "grad_norm": 1.194870586046834, "learning_rate": 8.103864453013592e-08, "loss": 1.5082, "step": 872 }, { "epoch": 2.0257731958762886, "grad_norm": 1.1351876585089653, "learning_rate": 8.083942012382625e-08, "loss": 1.4886, "step": 873 }, { "epoch": 2.0283505154639174, "grad_norm": 1.1792650671176743, "learning_rate": 8.064027463374701e-08, "loss": 1.4118, "step": 874 }, { "epoch": 2.0309278350515463, "grad_norm": 1.153547305161426, "learning_rate": 8.0441208880114e-08, "loss": 1.4064, "step": 875 }, { "epoch": 2.033505154639175, "grad_norm": 1.2783578209502229, "learning_rate": 8.024222368281469e-08, "loss": 1.4816, "step": 876 }, { "epoch": 2.036082474226804, "grad_norm": 1.240844307809194, "learning_rate": 8.004331986140473e-08, "loss": 1.4598, "step": 877 }, { "epoch": 2.038659793814433, "grad_norm": 1.1295638200937268, "learning_rate": 7.984449823510467e-08, "loss": 1.4081, "step": 878 }, { "epoch": 2.0412371134020617, "grad_norm": 1.1888063217054325, "learning_rate": 7.964575962279634e-08, "loss": 1.4618, "step": 879 }, { "epoch": 2.0438144329896906, "grad_norm": 1.2357228980469037, "learning_rate": 7.944710484301995e-08, "loss": 1.3963, "step": 880 }, { "epoch": 2.0463917525773194, "grad_norm": 1.0786846944064847, "learning_rate": 7.92485347139702e-08, "loss": 1.4514, "step": 881 }, { "epoch": 2.0489690721649483, "grad_norm": 1.1666214344742663, "learning_rate": 7.90500500534932e-08, "loss": 1.4389, "step": 882 }, { "epoch": 2.0489690721649483, "eval_loss": 1.4557408094406128, "eval_runtime": 78.6008, "eval_samples_per_second": 21.158, "eval_steps_per_second": 1.323, "step": 882 }, { "epoch": 2.051546391752577, "grad_norm": 1.1265923768111081, "learning_rate": 7.88516516790831e-08, "loss": 1.4401, "step": 883 }, { "epoch": 2.054123711340206, "grad_norm": 1.2322020489966297, "learning_rate": 7.865334040787866e-08, "loss": 1.5326, "step": 884 }, { "epoch": 2.056701030927835, "grad_norm": 1.1620543990403278, "learning_rate": 7.845511705665973e-08, "loss": 1.4151, "step": 885 }, { "epoch": 2.0592783505154637, "grad_norm": 1.2532645521350043, "learning_rate": 7.82569824418443e-08, "loss": 1.485, "step": 886 }, { "epoch": 2.0618556701030926, "grad_norm": 1.2322746000056972, "learning_rate": 7.805893737948472e-08, "loss": 1.439, "step": 887 }, { "epoch": 2.0644329896907214, "grad_norm": 1.1992705537386268, "learning_rate": 7.786098268526446e-08, "loss": 1.4927, "step": 888 }, { "epoch": 2.0670103092783507, "grad_norm": 1.219061389377471, "learning_rate": 7.7663119174495e-08, "loss": 1.5607, "step": 889 }, { "epoch": 2.0695876288659796, "grad_norm": 1.2161975840628703, "learning_rate": 7.746534766211206e-08, "loss": 1.5666, "step": 890 }, { "epoch": 2.0721649484536084, "grad_norm": 1.296835674200516, "learning_rate": 7.726766896267253e-08, "loss": 1.4738, "step": 891 }, { "epoch": 2.0747422680412373, "grad_norm": 1.1913191108570989, "learning_rate": 7.7070083890351e-08, "loss": 1.4345, "step": 892 }, { "epoch": 2.077319587628866, "grad_norm": 1.1697890394016621, "learning_rate": 7.687259325893654e-08, "loss": 1.4431, "step": 893 }, { "epoch": 2.079896907216495, "grad_norm": 1.2354727439582665, "learning_rate": 7.667519788182912e-08, "loss": 1.4302, "step": 894 }, { "epoch": 2.082474226804124, "grad_norm": 1.1445036968078774, "learning_rate": 7.647789857203644e-08, "loss": 1.4532, "step": 895 }, { "epoch": 2.0850515463917527, "grad_norm": 1.196595545836434, "learning_rate": 7.628069614217058e-08, "loss": 1.3915, "step": 896 }, { "epoch": 2.0876288659793816, "grad_norm": 1.2451954556034555, "learning_rate": 7.608359140444453e-08, "loss": 1.502, "step": 897 }, { "epoch": 2.0902061855670104, "grad_norm": 1.1198448743060805, "learning_rate": 7.588658517066892e-08, "loss": 1.4182, "step": 898 }, { "epoch": 2.0927835051546393, "grad_norm": 1.178128381993088, "learning_rate": 7.568967825224875e-08, "loss": 1.5009, "step": 899 }, { "epoch": 2.095360824742268, "grad_norm": 1.1493716638910112, "learning_rate": 7.549287146017988e-08, "loss": 1.4575, "step": 900 }, { "epoch": 2.097938144329897, "grad_norm": 1.2133662857011498, "learning_rate": 7.529616560504583e-08, "loss": 1.5579, "step": 901 }, { "epoch": 2.100515463917526, "grad_norm": 1.3854933572472905, "learning_rate": 7.509956149701444e-08, "loss": 1.4113, "step": 902 }, { "epoch": 2.1030927835051547, "grad_norm": 1.263798951148438, "learning_rate": 7.490305994583435e-08, "loss": 1.4258, "step": 903 }, { "epoch": 2.1056701030927836, "grad_norm": 1.1393321990385807, "learning_rate": 7.470666176083191e-08, "loss": 1.4943, "step": 904 }, { "epoch": 2.1082474226804124, "grad_norm": 1.1741861811520338, "learning_rate": 7.451036775090775e-08, "loss": 1.3918, "step": 905 }, { "epoch": 2.1108247422680413, "grad_norm": 1.222621280727268, "learning_rate": 7.431417872453338e-08, "loss": 1.513, "step": 906 }, { "epoch": 2.11340206185567, "grad_norm": 1.1452645437770688, "learning_rate": 7.411809548974791e-08, "loss": 1.4496, "step": 907 }, { "epoch": 2.115979381443299, "grad_norm": 1.1804026334318425, "learning_rate": 7.39221188541548e-08, "loss": 1.4644, "step": 908 }, { "epoch": 2.118556701030928, "grad_norm": 1.1527370569507815, "learning_rate": 7.372624962491841e-08, "loss": 1.4698, "step": 909 }, { "epoch": 2.1211340206185567, "grad_norm": 1.211563683201349, "learning_rate": 7.353048860876063e-08, "loss": 1.4671, "step": 910 }, { "epoch": 2.1237113402061856, "grad_norm": 1.1550395362954822, "learning_rate": 7.333483661195792e-08, "loss": 1.3627, "step": 911 }, { "epoch": 2.1262886597938144, "grad_norm": 1.1772438114561363, "learning_rate": 7.31392944403375e-08, "loss": 1.4349, "step": 912 }, { "epoch": 2.1288659793814433, "grad_norm": 1.1316430782314122, "learning_rate": 7.294386289927424e-08, "loss": 1.4892, "step": 913 }, { "epoch": 2.131443298969072, "grad_norm": 1.2166109017309248, "learning_rate": 7.274854279368758e-08, "loss": 1.4753, "step": 914 }, { "epoch": 2.134020618556701, "grad_norm": 1.2508664732495605, "learning_rate": 7.255333492803777e-08, "loss": 1.3593, "step": 915 }, { "epoch": 2.13659793814433, "grad_norm": 1.1270294993138392, "learning_rate": 7.235824010632283e-08, "loss": 1.5031, "step": 916 }, { "epoch": 2.1391752577319587, "grad_norm": 1.142323203849277, "learning_rate": 7.216325913207534e-08, "loss": 1.4747, "step": 917 }, { "epoch": 2.1417525773195876, "grad_norm": 1.198388386752302, "learning_rate": 7.196839280835875e-08, "loss": 1.4787, "step": 918 }, { "epoch": 2.1443298969072164, "grad_norm": 1.288933637399068, "learning_rate": 7.17736419377644e-08, "loss": 1.458, "step": 919 }, { "epoch": 2.1469072164948453, "grad_norm": 1.2342213116469787, "learning_rate": 7.157900732240826e-08, "loss": 1.4902, "step": 920 }, { "epoch": 2.149484536082474, "grad_norm": 1.2300130857871707, "learning_rate": 7.138448976392724e-08, "loss": 1.4835, "step": 921 }, { "epoch": 2.152061855670103, "grad_norm": 1.169125520832618, "learning_rate": 7.119009006347624e-08, "loss": 1.413, "step": 922 }, { "epoch": 2.154639175257732, "grad_norm": 1.1702489758289347, "learning_rate": 7.09958090217248e-08, "loss": 1.4857, "step": 923 }, { "epoch": 2.1572164948453607, "grad_norm": 1.179155067994331, "learning_rate": 7.080164743885362e-08, "loss": 1.507, "step": 924 }, { "epoch": 2.1597938144329896, "grad_norm": 1.149588572227629, "learning_rate": 7.060760611455151e-08, "loss": 1.3957, "step": 925 }, { "epoch": 2.1623711340206184, "grad_norm": 1.1269730428089064, "learning_rate": 7.041368584801186e-08, "loss": 1.515, "step": 926 }, { "epoch": 2.1649484536082473, "grad_norm": 1.2614734844469966, "learning_rate": 7.021988743792958e-08, "loss": 1.4752, "step": 927 }, { "epoch": 2.167525773195876, "grad_norm": 1.26049546725807, "learning_rate": 7.002621168249758e-08, "loss": 1.4222, "step": 928 }, { "epoch": 2.170103092783505, "grad_norm": 1.2122490418432295, "learning_rate": 6.983265937940365e-08, "loss": 1.5258, "step": 929 }, { "epoch": 2.172680412371134, "grad_norm": 1.163933149699957, "learning_rate": 6.963923132582715e-08, "loss": 1.4406, "step": 930 }, { "epoch": 2.1752577319587627, "grad_norm": 1.2117410126905865, "learning_rate": 6.944592831843566e-08, "loss": 1.4541, "step": 931 }, { "epoch": 2.1752577319587627, "eval_loss": 1.4543218612670898, "eval_runtime": 78.6219, "eval_samples_per_second": 21.152, "eval_steps_per_second": 1.323, "step": 931 }, { "epoch": 2.1778350515463916, "grad_norm": 1.2898700377788812, "learning_rate": 6.925275115338167e-08, "loss": 1.458, "step": 932 }, { "epoch": 2.1804123711340204, "grad_norm": 1.1426836123172524, "learning_rate": 6.90597006262995e-08, "loss": 1.3469, "step": 933 }, { "epoch": 2.1829896907216493, "grad_norm": 1.224441134115869, "learning_rate": 6.886677753230183e-08, "loss": 1.4027, "step": 934 }, { "epoch": 2.1855670103092786, "grad_norm": 1.387271519204012, "learning_rate": 6.867398266597642e-08, "loss": 1.4359, "step": 935 }, { "epoch": 2.1881443298969074, "grad_norm": 1.2243550754367374, "learning_rate": 6.848131682138303e-08, "loss": 1.4891, "step": 936 }, { "epoch": 2.1907216494845363, "grad_norm": 1.2282484095681934, "learning_rate": 6.82887807920499e-08, "loss": 1.4571, "step": 937 }, { "epoch": 2.193298969072165, "grad_norm": 1.252437764569184, "learning_rate": 6.809637537097061e-08, "loss": 1.4845, "step": 938 }, { "epoch": 2.195876288659794, "grad_norm": 1.2033826306564712, "learning_rate": 6.790410135060096e-08, "loss": 1.3981, "step": 939 }, { "epoch": 2.198453608247423, "grad_norm": 1.2730733273660004, "learning_rate": 6.77119595228554e-08, "loss": 1.5428, "step": 940 }, { "epoch": 2.2010309278350517, "grad_norm": 1.1145258448772917, "learning_rate": 6.751995067910388e-08, "loss": 1.4391, "step": 941 }, { "epoch": 2.2036082474226806, "grad_norm": 1.2423736700157595, "learning_rate": 6.732807561016884e-08, "loss": 1.3461, "step": 942 }, { "epoch": 2.2061855670103094, "grad_norm": 1.2567446761007774, "learning_rate": 6.713633510632157e-08, "loss": 1.4424, "step": 943 }, { "epoch": 2.2087628865979383, "grad_norm": 1.1962904231989222, "learning_rate": 6.694472995727913e-08, "loss": 1.5211, "step": 944 }, { "epoch": 2.211340206185567, "grad_norm": 1.2697071279271324, "learning_rate": 6.675326095220124e-08, "loss": 1.5138, "step": 945 }, { "epoch": 2.213917525773196, "grad_norm": 1.1182813975437969, "learning_rate": 6.656192887968674e-08, "loss": 1.4643, "step": 946 }, { "epoch": 2.216494845360825, "grad_norm": 1.2209457066901777, "learning_rate": 6.637073452777051e-08, "loss": 1.4646, "step": 947 }, { "epoch": 2.2190721649484537, "grad_norm": 1.2364207179496447, "learning_rate": 6.617967868392035e-08, "loss": 1.4531, "step": 948 }, { "epoch": 2.2216494845360826, "grad_norm": 1.1596958099892627, "learning_rate": 6.598876213503339e-08, "loss": 1.3596, "step": 949 }, { "epoch": 2.2242268041237114, "grad_norm": 1.1861584749981382, "learning_rate": 6.579798566743313e-08, "loss": 1.4605, "step": 950 }, { "epoch": 2.2268041237113403, "grad_norm": 1.2713750509697457, "learning_rate": 6.560735006686617e-08, "loss": 1.5169, "step": 951 }, { "epoch": 2.229381443298969, "grad_norm": 1.166290536481266, "learning_rate": 6.541685611849887e-08, "loss": 1.4436, "step": 952 }, { "epoch": 2.231958762886598, "grad_norm": 1.1735876550775757, "learning_rate": 6.522650460691415e-08, "loss": 1.4548, "step": 953 }, { "epoch": 2.234536082474227, "grad_norm": 1.2477782864575375, "learning_rate": 6.503629631610836e-08, "loss": 1.4534, "step": 954 }, { "epoch": 2.2371134020618557, "grad_norm": 1.2173622340437633, "learning_rate": 6.48462320294879e-08, "loss": 1.4595, "step": 955 }, { "epoch": 2.2396907216494846, "grad_norm": 1.1869675634283399, "learning_rate": 6.465631252986608e-08, "loss": 1.4451, "step": 956 }, { "epoch": 2.2422680412371134, "grad_norm": 1.1456159400412829, "learning_rate": 6.446653859945986e-08, "loss": 1.4064, "step": 957 }, { "epoch": 2.2448453608247423, "grad_norm": 1.2491020198374654, "learning_rate": 6.427691101988673e-08, "loss": 1.4949, "step": 958 }, { "epoch": 2.247422680412371, "grad_norm": 1.2282744468510673, "learning_rate": 6.40874305721613e-08, "loss": 1.4545, "step": 959 }, { "epoch": 2.25, "grad_norm": 1.0996865259394428, "learning_rate": 6.389809803669226e-08, "loss": 1.3342, "step": 960 }, { "epoch": 2.252577319587629, "grad_norm": 1.230550939339635, "learning_rate": 6.370891419327906e-08, "loss": 1.5121, "step": 961 }, { "epoch": 2.2551546391752577, "grad_norm": 1.2652568339180974, "learning_rate": 6.351987982110879e-08, "loss": 1.5533, "step": 962 }, { "epoch": 2.2577319587628866, "grad_norm": 1.173180731192026, "learning_rate": 6.333099569875284e-08, "loss": 1.4439, "step": 963 }, { "epoch": 2.2603092783505154, "grad_norm": 1.1001923514400465, "learning_rate": 6.314226260416382e-08, "loss": 1.4376, "step": 964 }, { "epoch": 2.2628865979381443, "grad_norm": 1.1389700541958854, "learning_rate": 6.295368131467235e-08, "loss": 1.4357, "step": 965 }, { "epoch": 2.265463917525773, "grad_norm": 1.1695985290298057, "learning_rate": 6.276525260698363e-08, "loss": 1.5309, "step": 966 }, { "epoch": 2.268041237113402, "grad_norm": 1.2012587244050719, "learning_rate": 6.257697725717468e-08, "loss": 1.5271, "step": 967 }, { "epoch": 2.270618556701031, "grad_norm": 1.2116419761383141, "learning_rate": 6.238885604069075e-08, "loss": 1.4536, "step": 968 }, { "epoch": 2.2731958762886597, "grad_norm": 1.169258658026815, "learning_rate": 6.220088973234215e-08, "loss": 1.4662, "step": 969 }, { "epoch": 2.2757731958762886, "grad_norm": 1.1455385835708687, "learning_rate": 6.201307910630145e-08, "loss": 1.4339, "step": 970 }, { "epoch": 2.2783505154639174, "grad_norm": 1.1833257380384377, "learning_rate": 6.182542493609984e-08, "loss": 1.3253, "step": 971 }, { "epoch": 2.2809278350515463, "grad_norm": 1.28784815413645, "learning_rate": 6.163792799462403e-08, "loss": 1.4603, "step": 972 }, { "epoch": 2.283505154639175, "grad_norm": 1.1970928590978123, "learning_rate": 6.145058905411342e-08, "loss": 1.4683, "step": 973 }, { "epoch": 2.286082474226804, "grad_norm": 1.149098853897877, "learning_rate": 6.126340888615641e-08, "loss": 1.4729, "step": 974 }, { "epoch": 2.288659793814433, "grad_norm": 1.209952156325127, "learning_rate": 6.107638826168756e-08, "loss": 1.5063, "step": 975 }, { "epoch": 2.2912371134020617, "grad_norm": 1.093427620169618, "learning_rate": 6.088952795098441e-08, "loss": 1.4402, "step": 976 }, { "epoch": 2.2938144329896906, "grad_norm": 1.1277798916215127, "learning_rate": 6.070282872366406e-08, "loss": 1.5049, "step": 977 }, { "epoch": 2.2963917525773194, "grad_norm": 1.1497157702484186, "learning_rate": 6.05162913486802e-08, "loss": 1.4331, "step": 978 }, { "epoch": 2.2989690721649483, "grad_norm": 1.2127687421273623, "learning_rate": 6.032991659432006e-08, "loss": 1.464, "step": 979 }, { "epoch": 2.301546391752577, "grad_norm": 1.2091736243527582, "learning_rate": 6.014370522820084e-08, "loss": 1.4257, "step": 980 }, { "epoch": 2.301546391752577, "eval_loss": 1.4530315399169922, "eval_runtime": 78.4954, "eval_samples_per_second": 21.186, "eval_steps_per_second": 1.325, "step": 980 }, { "epoch": 2.304123711340206, "grad_norm": 1.1621649511934278, "learning_rate": 5.995765801726698e-08, "loss": 1.4808, "step": 981 }, { "epoch": 2.306701030927835, "grad_norm": 1.1581272698070357, "learning_rate": 5.977177572778678e-08, "loss": 1.3401, "step": 982 }, { "epoch": 2.3092783505154637, "grad_norm": 1.1599391051626198, "learning_rate": 5.958605912534921e-08, "loss": 1.4917, "step": 983 }, { "epoch": 2.3118556701030926, "grad_norm": 1.3034698067830743, "learning_rate": 5.9400508974860885e-08, "loss": 1.4841, "step": 984 }, { "epoch": 2.3144329896907214, "grad_norm": 1.2060359148709237, "learning_rate": 5.9215126040542886e-08, "loss": 1.4479, "step": 985 }, { "epoch": 2.3170103092783503, "grad_norm": 1.2258119330781094, "learning_rate": 5.902991108592754e-08, "loss": 1.4949, "step": 986 }, { "epoch": 2.319587628865979, "grad_norm": 1.2150702094703367, "learning_rate": 5.8844864873855296e-08, "loss": 1.4329, "step": 987 }, { "epoch": 2.3221649484536084, "grad_norm": 1.1354804163624515, "learning_rate": 5.8659988166471706e-08, "loss": 1.3683, "step": 988 }, { "epoch": 2.3247422680412373, "grad_norm": 1.1304878710380117, "learning_rate": 5.847528172522407e-08, "loss": 1.4345, "step": 989 }, { "epoch": 2.327319587628866, "grad_norm": 1.2388489587800555, "learning_rate": 5.829074631085852e-08, "loss": 1.5177, "step": 990 }, { "epoch": 2.329896907216495, "grad_norm": 1.2418385155763394, "learning_rate": 5.8106382683416636e-08, "loss": 1.5666, "step": 991 }, { "epoch": 2.332474226804124, "grad_norm": 1.2067656028810445, "learning_rate": 5.7922191602232675e-08, "loss": 1.501, "step": 992 }, { "epoch": 2.3350515463917527, "grad_norm": 1.2443124436097661, "learning_rate": 5.773817382593007e-08, "loss": 1.4516, "step": 993 }, { "epoch": 2.3376288659793816, "grad_norm": 1.2589938629670394, "learning_rate": 5.7554330112418504e-08, "loss": 1.4955, "step": 994 }, { "epoch": 2.3402061855670104, "grad_norm": 1.1979526509329819, "learning_rate": 5.737066121889078e-08, "loss": 1.4224, "step": 995 }, { "epoch": 2.3427835051546393, "grad_norm": 1.1895398966073056, "learning_rate": 5.718716790181965e-08, "loss": 1.4243, "step": 996 }, { "epoch": 2.345360824742268, "grad_norm": 1.1828652518517522, "learning_rate": 5.70038509169547e-08, "loss": 1.4559, "step": 997 }, { "epoch": 2.347938144329897, "grad_norm": 1.2201556733969088, "learning_rate": 5.682071101931936e-08, "loss": 1.5799, "step": 998 }, { "epoch": 2.350515463917526, "grad_norm": 1.2211801179218442, "learning_rate": 5.6637748963207566e-08, "loss": 1.4684, "step": 999 }, { "epoch": 2.3530927835051547, "grad_norm": 1.2453622614111477, "learning_rate": 5.6454965502180884e-08, "loss": 1.4854, "step": 1000 }, { "epoch": 2.3556701030927836, "grad_norm": 1.1220592371624576, "learning_rate": 5.627236138906524e-08, "loss": 1.5089, "step": 1001 }, { "epoch": 2.3582474226804124, "grad_norm": 1.1369675384518176, "learning_rate": 5.60899373759479e-08, "loss": 1.4088, "step": 1002 }, { "epoch": 2.3608247422680413, "grad_norm": 1.1583531710119257, "learning_rate": 5.590769421417434e-08, "loss": 1.4299, "step": 1003 }, { "epoch": 2.36340206185567, "grad_norm": 1.2204630482972216, "learning_rate": 5.572563265434527e-08, "loss": 1.421, "step": 1004 }, { "epoch": 2.365979381443299, "grad_norm": 1.1654233558024554, "learning_rate": 5.55437534463133e-08, "loss": 1.4153, "step": 1005 }, { "epoch": 2.368556701030928, "grad_norm": 1.1255124035829496, "learning_rate": 5.536205733918007e-08, "loss": 1.4196, "step": 1006 }, { "epoch": 2.3711340206185567, "grad_norm": 1.1998683282168985, "learning_rate": 5.5180545081293074e-08, "loss": 1.4067, "step": 1007 }, { "epoch": 2.3737113402061856, "grad_norm": 1.2097328179188533, "learning_rate": 5.4999217420242574e-08, "loss": 1.4221, "step": 1008 }, { "epoch": 2.3762886597938144, "grad_norm": 1.2465777328454615, "learning_rate": 5.481807510285852e-08, "loss": 1.5432, "step": 1009 }, { "epoch": 2.3788659793814433, "grad_norm": 1.1017326736009339, "learning_rate": 5.4637118875207585e-08, "loss": 1.4498, "step": 1010 }, { "epoch": 2.381443298969072, "grad_norm": 1.1894534742510336, "learning_rate": 5.445634948258991e-08, "loss": 1.4779, "step": 1011 }, { "epoch": 2.384020618556701, "grad_norm": 1.2240426429209377, "learning_rate": 5.4275767669536145e-08, "loss": 1.4643, "step": 1012 }, { "epoch": 2.38659793814433, "grad_norm": 1.1865338401108185, "learning_rate": 5.4095374179804365e-08, "loss": 1.4218, "step": 1013 }, { "epoch": 2.3891752577319587, "grad_norm": 1.1332962977107732, "learning_rate": 5.391516975637699e-08, "loss": 1.4893, "step": 1014 }, { "epoch": 2.3917525773195876, "grad_norm": 1.1749099925869624, "learning_rate": 5.373515514145771e-08, "loss": 1.4223, "step": 1015 }, { "epoch": 2.3943298969072164, "grad_norm": 1.2704273457918143, "learning_rate": 5.355533107646858e-08, "loss": 1.4625, "step": 1016 }, { "epoch": 2.3969072164948453, "grad_norm": 1.2661897531951014, "learning_rate": 5.3375698302046745e-08, "loss": 1.4886, "step": 1017 }, { "epoch": 2.399484536082474, "grad_norm": 1.1604729483093374, "learning_rate": 5.319625755804138e-08, "loss": 1.433, "step": 1018 }, { "epoch": 2.402061855670103, "grad_norm": 1.1177913422918446, "learning_rate": 5.301700958351098e-08, "loss": 1.3745, "step": 1019 }, { "epoch": 2.404639175257732, "grad_norm": 1.350758760981664, "learning_rate": 5.283795511671994e-08, "loss": 1.5148, "step": 1020 }, { "epoch": 2.4072164948453607, "grad_norm": 1.1721177815291475, "learning_rate": 5.265909489513567e-08, "loss": 1.4789, "step": 1021 }, { "epoch": 2.4097938144329896, "grad_norm": 1.1121369880829992, "learning_rate": 5.248042965542558e-08, "loss": 1.4492, "step": 1022 }, { "epoch": 2.4123711340206184, "grad_norm": 1.172764927678444, "learning_rate": 5.230196013345398e-08, "loss": 1.495, "step": 1023 }, { "epoch": 2.4149484536082473, "grad_norm": 1.2211219953558563, "learning_rate": 5.212368706427912e-08, "loss": 1.4839, "step": 1024 }, { "epoch": 2.417525773195876, "grad_norm": 1.2134922811527864, "learning_rate": 5.194561118215004e-08, "loss": 1.4247, "step": 1025 }, { "epoch": 2.420103092783505, "grad_norm": 1.1269911256995855, "learning_rate": 5.176773322050381e-08, "loss": 1.4484, "step": 1026 }, { "epoch": 2.422680412371134, "grad_norm": 1.119051207691081, "learning_rate": 5.1590053911962127e-08, "loss": 1.3717, "step": 1027 }, { "epoch": 2.4252577319587627, "grad_norm": 1.1877122575741303, "learning_rate": 5.141257398832862e-08, "loss": 1.416, "step": 1028 }, { "epoch": 2.4278350515463916, "grad_norm": 1.1267435950520672, "learning_rate": 5.1235294180585674e-08, "loss": 1.4357, "step": 1029 }, { "epoch": 2.4278350515463916, "eval_loss": 1.4520158767700195, "eval_runtime": 78.5953, "eval_samples_per_second": 21.159, "eval_steps_per_second": 1.323, "step": 1029 }, { "epoch": 2.430412371134021, "grad_norm": 1.0857318983382882, "learning_rate": 5.1058215218891464e-08, "loss": 1.4512, "step": 1030 }, { "epoch": 2.4329896907216497, "grad_norm": 1.155498319174195, "learning_rate": 5.088133783257693e-08, "loss": 1.5014, "step": 1031 }, { "epoch": 2.4355670103092786, "grad_norm": 1.2379699109090305, "learning_rate": 5.070466275014287e-08, "loss": 1.5288, "step": 1032 }, { "epoch": 2.4381443298969074, "grad_norm": 1.3260836529994613, "learning_rate": 5.0528190699256756e-08, "loss": 1.456, "step": 1033 }, { "epoch": 2.4407216494845363, "grad_norm": 1.1737794063785383, "learning_rate": 5.03519224067499e-08, "loss": 1.4514, "step": 1034 }, { "epoch": 2.443298969072165, "grad_norm": 1.183113595964214, "learning_rate": 5.0175858598614363e-08, "loss": 1.4507, "step": 1035 }, { "epoch": 2.445876288659794, "grad_norm": 1.1143164931619889, "learning_rate": 5.000000000000002e-08, "loss": 1.3849, "step": 1036 }, { "epoch": 2.448453608247423, "grad_norm": 1.1724349277334387, "learning_rate": 4.9824347335211514e-08, "loss": 1.4424, "step": 1037 }, { "epoch": 2.4510309278350517, "grad_norm": 1.1212216527840104, "learning_rate": 4.964890132770543e-08, "loss": 1.4082, "step": 1038 }, { "epoch": 2.4536082474226806, "grad_norm": 1.1522290603715333, "learning_rate": 4.947366270008707e-08, "loss": 1.4314, "step": 1039 }, { "epoch": 2.4561855670103094, "grad_norm": 1.1633774724561892, "learning_rate": 4.929863217410767e-08, "loss": 1.4865, "step": 1040 }, { "epoch": 2.4587628865979383, "grad_norm": 1.1406335428126368, "learning_rate": 4.912381047066133e-08, "loss": 1.4458, "step": 1041 }, { "epoch": 2.461340206185567, "grad_norm": 1.1104681920852408, "learning_rate": 4.894919830978211e-08, "loss": 1.397, "step": 1042 }, { "epoch": 2.463917525773196, "grad_norm": 1.2181204959510732, "learning_rate": 4.8774796410640983e-08, "loss": 1.4955, "step": 1043 }, { "epoch": 2.466494845360825, "grad_norm": 1.15471572592744, "learning_rate": 4.860060549154301e-08, "loss": 1.3996, "step": 1044 }, { "epoch": 2.4690721649484537, "grad_norm": 1.19065290512176, "learning_rate": 4.842662626992426e-08, "loss": 1.4755, "step": 1045 }, { "epoch": 2.4716494845360826, "grad_norm": 1.351223096851913, "learning_rate": 4.825285946234874e-08, "loss": 1.4747, "step": 1046 }, { "epoch": 2.4742268041237114, "grad_norm": 1.141166837825934, "learning_rate": 4.807930578450584e-08, "loss": 1.4063, "step": 1047 }, { "epoch": 2.4768041237113403, "grad_norm": 1.1861721992764764, "learning_rate": 4.7905965951206986e-08, "loss": 1.4967, "step": 1048 }, { "epoch": 2.479381443298969, "grad_norm": 1.2595851597755765, "learning_rate": 4.773284067638281e-08, "loss": 1.4877, "step": 1049 }, { "epoch": 2.481958762886598, "grad_norm": 1.1088230107238257, "learning_rate": 4.755993067308047e-08, "loss": 1.4385, "step": 1050 }, { "epoch": 2.484536082474227, "grad_norm": 1.2852932163080484, "learning_rate": 4.7387236653460205e-08, "loss": 1.4141, "step": 1051 }, { "epoch": 2.4871134020618557, "grad_norm": 1.244645084527039, "learning_rate": 4.721475932879282e-08, "loss": 1.482, "step": 1052 }, { "epoch": 2.4896907216494846, "grad_norm": 1.2466688875419663, "learning_rate": 4.7042499409456695e-08, "loss": 1.4382, "step": 1053 }, { "epoch": 2.4922680412371134, "grad_norm": 1.2462831105011571, "learning_rate": 4.687045760493468e-08, "loss": 1.536, "step": 1054 }, { "epoch": 2.4948453608247423, "grad_norm": 1.1482492444378036, "learning_rate": 4.6698634623811307e-08, "loss": 1.4406, "step": 1055 }, { "epoch": 2.497422680412371, "grad_norm": 1.1978027196822072, "learning_rate": 4.652703117376986e-08, "loss": 1.4288, "step": 1056 }, { "epoch": 2.5, "grad_norm": 1.205112527214404, "learning_rate": 4.635564796158945e-08, "loss": 1.4066, "step": 1057 }, { "epoch": 2.502577319587629, "grad_norm": 1.1958287831198664, "learning_rate": 4.618448569314206e-08, "loss": 1.4194, "step": 1058 }, { "epoch": 2.5051546391752577, "grad_norm": 1.0972900424361671, "learning_rate": 4.60135450733897e-08, "loss": 1.4838, "step": 1059 }, { "epoch": 2.5077319587628866, "grad_norm": 1.2508036239600449, "learning_rate": 4.584282680638154e-08, "loss": 1.4443, "step": 1060 }, { "epoch": 2.5103092783505154, "grad_norm": 1.1703232750822017, "learning_rate": 4.567233159525088e-08, "loss": 1.434, "step": 1061 }, { "epoch": 2.5128865979381443, "grad_norm": 1.1666987794362405, "learning_rate": 4.550206014221232e-08, "loss": 1.4857, "step": 1062 }, { "epoch": 2.515463917525773, "grad_norm": 1.118899379693407, "learning_rate": 4.53320131485589e-08, "loss": 1.4753, "step": 1063 }, { "epoch": 2.518041237113402, "grad_norm": 1.2072619010906969, "learning_rate": 4.516219131465919e-08, "loss": 1.461, "step": 1064 }, { "epoch": 2.520618556701031, "grad_norm": 1.1330825353202136, "learning_rate": 4.499259533995434e-08, "loss": 1.3632, "step": 1065 }, { "epoch": 2.5231958762886597, "grad_norm": 1.087244159516567, "learning_rate": 4.48232259229554e-08, "loss": 1.4907, "step": 1066 }, { "epoch": 2.5257731958762886, "grad_norm": 1.113783698087956, "learning_rate": 4.465408376124016e-08, "loss": 1.425, "step": 1067 }, { "epoch": 2.5283505154639174, "grad_norm": 1.2174392360989843, "learning_rate": 4.448516955145047e-08, "loss": 1.5075, "step": 1068 }, { "epoch": 2.5309278350515463, "grad_norm": 1.2580642720936182, "learning_rate": 4.431648398928932e-08, "loss": 1.4312, "step": 1069 }, { "epoch": 2.533505154639175, "grad_norm": 1.2608189792754003, "learning_rate": 4.414802776951798e-08, "loss": 1.4614, "step": 1070 }, { "epoch": 2.536082474226804, "grad_norm": 1.1608489532256927, "learning_rate": 4.3979801585953094e-08, "loss": 1.4286, "step": 1071 }, { "epoch": 2.538659793814433, "grad_norm": 1.241756886612098, "learning_rate": 4.381180613146395e-08, "loss": 1.4545, "step": 1072 }, { "epoch": 2.5412371134020617, "grad_norm": 1.1267401284402057, "learning_rate": 4.364404209796948e-08, "loss": 1.4289, "step": 1073 }, { "epoch": 2.5438144329896906, "grad_norm": 1.1675743288280764, "learning_rate": 4.347651017643539e-08, "loss": 1.4545, "step": 1074 }, { "epoch": 2.5463917525773194, "grad_norm": 1.1014672234344964, "learning_rate": 4.3309211056871544e-08, "loss": 1.4588, "step": 1075 }, { "epoch": 2.5489690721649483, "grad_norm": 1.1537126237371678, "learning_rate": 4.314214542832888e-08, "loss": 1.4922, "step": 1076 }, { "epoch": 2.551546391752577, "grad_norm": 1.0803879548258355, "learning_rate": 4.2975313978896644e-08, "loss": 1.4505, "step": 1077 }, { "epoch": 2.554123711340206, "grad_norm": 1.1135211277789598, "learning_rate": 4.280871739569971e-08, "loss": 1.4256, "step": 1078 }, { "epoch": 2.554123711340206, "eval_loss": 1.4510596990585327, "eval_runtime": 78.5321, "eval_samples_per_second": 21.176, "eval_steps_per_second": 1.324, "step": 1078 }, { "epoch": 2.556701030927835, "grad_norm": 1.1587956973540048, "learning_rate": 4.2642356364895414e-08, "loss": 1.3874, "step": 1079 }, { "epoch": 2.5592783505154637, "grad_norm": 1.2208237784983438, "learning_rate": 4.247623157167102e-08, "loss": 1.4828, "step": 1080 }, { "epoch": 2.5618556701030926, "grad_norm": 1.1970857349297972, "learning_rate": 4.231034370024088e-08, "loss": 1.4412, "step": 1081 }, { "epoch": 2.5644329896907214, "grad_norm": 1.1543756364647166, "learning_rate": 4.214469343384346e-08, "loss": 1.4448, "step": 1082 }, { "epoch": 2.5670103092783503, "grad_norm": 1.125316478876826, "learning_rate": 4.197928145473856e-08, "loss": 1.3943, "step": 1083 }, { "epoch": 2.569587628865979, "grad_norm": 1.1220973164280506, "learning_rate": 4.181410844420473e-08, "loss": 1.4221, "step": 1084 }, { "epoch": 2.572164948453608, "grad_norm": 1.1654590544487953, "learning_rate": 4.164917508253607e-08, "loss": 1.433, "step": 1085 }, { "epoch": 2.574742268041237, "grad_norm": 1.1709294745599472, "learning_rate": 4.148448204903977e-08, "loss": 1.3952, "step": 1086 }, { "epoch": 2.5773195876288657, "grad_norm": 1.1679647806294131, "learning_rate": 4.132003002203314e-08, "loss": 1.4641, "step": 1087 }, { "epoch": 2.579896907216495, "grad_norm": 1.3695549935841669, "learning_rate": 4.115581967884093e-08, "loss": 1.5259, "step": 1088 }, { "epoch": 2.582474226804124, "grad_norm": 1.1307837909317393, "learning_rate": 4.099185169579241e-08, "loss": 1.4012, "step": 1089 }, { "epoch": 2.5850515463917527, "grad_norm": 1.1501589873026261, "learning_rate": 4.0828126748218647e-08, "loss": 1.4582, "step": 1090 }, { "epoch": 2.5876288659793816, "grad_norm": 1.1069474546473044, "learning_rate": 4.0664645510449745e-08, "loss": 1.4335, "step": 1091 }, { "epoch": 2.5902061855670104, "grad_norm": 1.1910808093335385, "learning_rate": 4.050140865581204e-08, "loss": 1.458, "step": 1092 }, { "epoch": 2.5927835051546393, "grad_norm": 1.1210216135242885, "learning_rate": 4.033841685662529e-08, "loss": 1.4671, "step": 1093 }, { "epoch": 2.595360824742268, "grad_norm": 1.1392325814801574, "learning_rate": 4.0175670784200066e-08, "loss": 1.4687, "step": 1094 }, { "epoch": 2.597938144329897, "grad_norm": 1.2066331988995807, "learning_rate": 4.001317110883477e-08, "loss": 1.6142, "step": 1095 }, { "epoch": 2.600515463917526, "grad_norm": 1.120036816028406, "learning_rate": 3.985091849981297e-08, "loss": 1.4617, "step": 1096 }, { "epoch": 2.6030927835051547, "grad_norm": 1.1171460565708284, "learning_rate": 3.96889136254007e-08, "loss": 1.459, "step": 1097 }, { "epoch": 2.6056701030927836, "grad_norm": 1.2472238722902789, "learning_rate": 3.952715715284363e-08, "loss": 1.5456, "step": 1098 }, { "epoch": 2.6082474226804124, "grad_norm": 1.2133346933773341, "learning_rate": 3.93656497483643e-08, "loss": 1.5134, "step": 1099 }, { "epoch": 2.6108247422680413, "grad_norm": 1.1470733566590117, "learning_rate": 3.9204392077159544e-08, "loss": 1.4653, "step": 1100 }, { "epoch": 2.61340206185567, "grad_norm": 1.1608282166724524, "learning_rate": 3.904338480339755e-08, "loss": 1.479, "step": 1101 }, { "epoch": 2.615979381443299, "grad_norm": 1.1508782189162872, "learning_rate": 3.888262859021507e-08, "loss": 1.4025, "step": 1102 }, { "epoch": 2.618556701030928, "grad_norm": 1.178209399181694, "learning_rate": 3.872212409971507e-08, "loss": 1.2948, "step": 1103 }, { "epoch": 2.6211340206185567, "grad_norm": 1.32807190899102, "learning_rate": 3.856187199296358e-08, "loss": 1.5456, "step": 1104 }, { "epoch": 2.6237113402061856, "grad_norm": 1.2185169437161736, "learning_rate": 3.8401872929987166e-08, "loss": 1.429, "step": 1105 }, { "epoch": 2.6262886597938144, "grad_norm": 1.2304397213352538, "learning_rate": 3.824212756977027e-08, "loss": 1.4558, "step": 1106 }, { "epoch": 2.6288659793814433, "grad_norm": 1.1724306586240414, "learning_rate": 3.8082636570252346e-08, "loss": 1.4984, "step": 1107 }, { "epoch": 2.631443298969072, "grad_norm": 1.1298977167004856, "learning_rate": 3.7923400588325147e-08, "loss": 1.4417, "step": 1108 }, { "epoch": 2.634020618556701, "grad_norm": 1.1784947581476026, "learning_rate": 3.7764420279830266e-08, "loss": 1.4164, "step": 1109 }, { "epoch": 2.63659793814433, "grad_norm": 1.155170570736418, "learning_rate": 3.7605696299556135e-08, "loss": 1.4371, "step": 1110 }, { "epoch": 2.6391752577319587, "grad_norm": 1.1663523776289366, "learning_rate": 3.744722930123544e-08, "loss": 1.4747, "step": 1111 }, { "epoch": 2.6417525773195876, "grad_norm": 1.2126168901096435, "learning_rate": 3.72890199375426e-08, "loss": 1.5058, "step": 1112 }, { "epoch": 2.6443298969072164, "grad_norm": 1.2017176914352923, "learning_rate": 3.71310688600907e-08, "loss": 1.4733, "step": 1113 }, { "epoch": 2.6469072164948453, "grad_norm": 1.1119469160793427, "learning_rate": 3.6973376719429125e-08, "loss": 1.476, "step": 1114 }, { "epoch": 2.649484536082474, "grad_norm": 1.130792424586462, "learning_rate": 3.681594416504088e-08, "loss": 1.4494, "step": 1115 }, { "epoch": 2.652061855670103, "grad_norm": 1.222509795849272, "learning_rate": 3.6658771845339676e-08, "loss": 1.4999, "step": 1116 }, { "epoch": 2.654639175257732, "grad_norm": 1.1385228914334713, "learning_rate": 3.650186040766746e-08, "loss": 1.4402, "step": 1117 }, { "epoch": 2.6572164948453607, "grad_norm": 1.1448576075492045, "learning_rate": 3.634521049829169e-08, "loss": 1.4132, "step": 1118 }, { "epoch": 2.6597938144329896, "grad_norm": 1.139064959062427, "learning_rate": 3.618882276240267e-08, "loss": 1.3994, "step": 1119 }, { "epoch": 2.6623711340206184, "grad_norm": 1.161606746690635, "learning_rate": 3.603269784411089e-08, "loss": 1.4385, "step": 1120 }, { "epoch": 2.6649484536082473, "grad_norm": 1.1300734708150515, "learning_rate": 3.587683638644437e-08, "loss": 1.4228, "step": 1121 }, { "epoch": 2.667525773195876, "grad_norm": 1.1979334493577922, "learning_rate": 3.572123903134606e-08, "loss": 1.3946, "step": 1122 }, { "epoch": 2.670103092783505, "grad_norm": 1.2108873546484593, "learning_rate": 3.556590641967114e-08, "loss": 1.4019, "step": 1123 }, { "epoch": 2.6726804123711343, "grad_norm": 1.252184087003669, "learning_rate": 3.5410839191184386e-08, "loss": 1.4863, "step": 1124 }, { "epoch": 2.675257731958763, "grad_norm": 1.1268238345165822, "learning_rate": 3.525603798455753e-08, "loss": 1.4624, "step": 1125 }, { "epoch": 2.677835051546392, "grad_norm": 1.2410354943951132, "learning_rate": 3.5101503437366676e-08, "loss": 1.5426, "step": 1126 }, { "epoch": 2.680412371134021, "grad_norm": 1.2054964281688654, "learning_rate": 3.49472361860896e-08, "loss": 1.4182, "step": 1127 }, { "epoch": 2.680412371134021, "eval_loss": 1.4503966569900513, "eval_runtime": 78.5776, "eval_samples_per_second": 21.164, "eval_steps_per_second": 1.324, "step": 1127 }, { "epoch": 2.6829896907216497, "grad_norm": 1.18692856703466, "learning_rate": 3.4793236866103294e-08, "loss": 1.5021, "step": 1128 }, { "epoch": 2.6855670103092786, "grad_norm": 1.099606075968585, "learning_rate": 3.463950611168111e-08, "loss": 1.4051, "step": 1129 }, { "epoch": 2.6881443298969074, "grad_norm": 1.1712675559534376, "learning_rate": 3.448604455599021e-08, "loss": 1.4565, "step": 1130 }, { "epoch": 2.6907216494845363, "grad_norm": 1.2365327819201322, "learning_rate": 3.43328528310892e-08, "loss": 1.4418, "step": 1131 }, { "epoch": 2.693298969072165, "grad_norm": 1.1186618547215839, "learning_rate": 3.4179931567925215e-08, "loss": 1.4987, "step": 1132 }, { "epoch": 2.695876288659794, "grad_norm": 1.2081208242761923, "learning_rate": 3.402728139633142e-08, "loss": 1.441, "step": 1133 }, { "epoch": 2.698453608247423, "grad_norm": 1.218636962355054, "learning_rate": 3.387490294502457e-08, "loss": 1.4067, "step": 1134 }, { "epoch": 2.7010309278350517, "grad_norm": 1.1637394002772754, "learning_rate": 3.372279684160221e-08, "loss": 1.5326, "step": 1135 }, { "epoch": 2.7036082474226806, "grad_norm": 1.2353156557559488, "learning_rate": 3.357096371254008e-08, "loss": 1.472, "step": 1136 }, { "epoch": 2.7061855670103094, "grad_norm": 1.19587166321243, "learning_rate": 3.3419404183189813e-08, "loss": 1.4886, "step": 1137 }, { "epoch": 2.7087628865979383, "grad_norm": 1.1730315855085072, "learning_rate": 3.326811887777606e-08, "loss": 1.3887, "step": 1138 }, { "epoch": 2.711340206185567, "grad_norm": 1.2017905489788439, "learning_rate": 3.3117108419394036e-08, "loss": 1.4376, "step": 1139 }, { "epoch": 2.713917525773196, "grad_norm": 1.223875153650053, "learning_rate": 3.2966373430007044e-08, "loss": 1.4841, "step": 1140 }, { "epoch": 2.716494845360825, "grad_norm": 1.163982928943064, "learning_rate": 3.2815914530443656e-08, "loss": 1.5057, "step": 1141 }, { "epoch": 2.7190721649484537, "grad_norm": 1.1065194981403395, "learning_rate": 3.2665732340395413e-08, "loss": 1.5145, "step": 1142 }, { "epoch": 2.7216494845360826, "grad_norm": 1.1802479694554426, "learning_rate": 3.2515827478414227e-08, "loss": 1.4639, "step": 1143 }, { "epoch": 2.7242268041237114, "grad_norm": 1.1042272626565486, "learning_rate": 3.236620056190972e-08, "loss": 1.3944, "step": 1144 }, { "epoch": 2.7268041237113403, "grad_norm": 1.2114102979959467, "learning_rate": 3.221685220714674e-08, "loss": 1.4298, "step": 1145 }, { "epoch": 2.729381443298969, "grad_norm": 1.1393577034048052, "learning_rate": 3.2067783029242866e-08, "loss": 1.3856, "step": 1146 }, { "epoch": 2.731958762886598, "grad_norm": 1.1037036354008587, "learning_rate": 3.1918993642165804e-08, "loss": 1.3889, "step": 1147 }, { "epoch": 2.734536082474227, "grad_norm": 1.2272871402765764, "learning_rate": 3.177048465873089e-08, "loss": 1.4043, "step": 1148 }, { "epoch": 2.7371134020618557, "grad_norm": 1.210586273197648, "learning_rate": 3.1622256690598633e-08, "loss": 1.4999, "step": 1149 }, { "epoch": 2.7396907216494846, "grad_norm": 1.1746574581016895, "learning_rate": 3.147431034827208e-08, "loss": 1.4216, "step": 1150 }, { "epoch": 2.7422680412371134, "grad_norm": 1.1586070909228363, "learning_rate": 3.1326646241094336e-08, "loss": 1.4696, "step": 1151 }, { "epoch": 2.7448453608247423, "grad_norm": 1.1312629920265729, "learning_rate": 3.11792649772461e-08, "loss": 1.5172, "step": 1152 }, { "epoch": 2.747422680412371, "grad_norm": 1.181603470826963, "learning_rate": 3.1032167163743115e-08, "loss": 1.4453, "step": 1153 }, { "epoch": 2.75, "grad_norm": 1.1958639955584416, "learning_rate": 3.0885353406433703e-08, "loss": 1.5075, "step": 1154 }, { "epoch": 2.752577319587629, "grad_norm": 1.200258914978432, "learning_rate": 3.073882430999619e-08, "loss": 1.409, "step": 1155 }, { "epoch": 2.7551546391752577, "grad_norm": 1.1425311029684388, "learning_rate": 3.05925804779366e-08, "loss": 1.4537, "step": 1156 }, { "epoch": 2.7577319587628866, "grad_norm": 1.1441189180372324, "learning_rate": 3.044662251258595e-08, "loss": 1.567, "step": 1157 }, { "epoch": 2.7603092783505154, "grad_norm": 1.1519696479164119, "learning_rate": 3.030095101509786e-08, "loss": 1.4678, "step": 1158 }, { "epoch": 2.7628865979381443, "grad_norm": 1.2588291000562302, "learning_rate": 3.0155566585446114e-08, "loss": 1.5141, "step": 1159 }, { "epoch": 2.765463917525773, "grad_norm": 1.1712961770904633, "learning_rate": 3.0010469822422156e-08, "loss": 1.4298, "step": 1160 }, { "epoch": 2.768041237113402, "grad_norm": 1.2155090578526457, "learning_rate": 2.986566132363259e-08, "loss": 1.5341, "step": 1161 }, { "epoch": 2.770618556701031, "grad_norm": 1.1558741286842076, "learning_rate": 2.972114168549682e-08, "loss": 1.4089, "step": 1162 }, { "epoch": 2.7731958762886597, "grad_norm": 1.281655267971227, "learning_rate": 2.9576911503244494e-08, "loss": 1.3596, "step": 1163 }, { "epoch": 2.7757731958762886, "grad_norm": 1.1885614767244468, "learning_rate": 2.9432971370912995e-08, "loss": 1.4181, "step": 1164 } ], "logging_steps": 1, "max_steps": 1552, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 388, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 305116087320576.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }