{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.7757731958762886,
"eval_steps": 49,
"global_step": 1164,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002577319587628866,
"grad_norm": 3.206880709337614,
"learning_rate": 5e-08,
"loss": 1.772,
"step": 1
},
{
"epoch": 0.002577319587628866,
"eval_loss": 1.6304376125335693,
"eval_runtime": 78.4604,
"eval_samples_per_second": 21.195,
"eval_steps_per_second": 1.326,
"step": 1
},
{
"epoch": 0.005154639175257732,
"grad_norm": 3.3587112552116953,
"learning_rate": 1e-07,
"loss": 1.666,
"step": 2
},
{
"epoch": 0.007731958762886598,
"grad_norm": 3.1385995190528324,
"learning_rate": 1.5e-07,
"loss": 1.5471,
"step": 3
},
{
"epoch": 0.010309278350515464,
"grad_norm": 3.531264158181801,
"learning_rate": 2e-07,
"loss": 1.6718,
"step": 4
},
{
"epoch": 0.01288659793814433,
"grad_norm": 2.993529294622099,
"learning_rate": 1.9999979406617412e-07,
"loss": 1.6334,
"step": 5
},
{
"epoch": 0.015463917525773196,
"grad_norm": 3.151745142356583,
"learning_rate": 1.999991762655447e-07,
"loss": 1.5647,
"step": 6
},
{
"epoch": 0.01804123711340206,
"grad_norm": 3.3440809481325333,
"learning_rate": 1.9999814660065617e-07,
"loss": 1.7122,
"step": 7
},
{
"epoch": 0.020618556701030927,
"grad_norm": 3.1146822679211805,
"learning_rate": 1.9999670507574944e-07,
"loss": 1.5921,
"step": 8
},
{
"epoch": 0.023195876288659795,
"grad_norm": 3.345986552710787,
"learning_rate": 1.9999485169676173e-07,
"loss": 1.7131,
"step": 9
},
{
"epoch": 0.02577319587628866,
"grad_norm": 2.9626668283812045,
"learning_rate": 1.9999258647132644e-07,
"loss": 1.6699,
"step": 10
},
{
"epoch": 0.028350515463917526,
"grad_norm": 3.4953806538783527,
"learning_rate": 1.9998990940877333e-07,
"loss": 1.6785,
"step": 11
},
{
"epoch": 0.030927835051546393,
"grad_norm": 3.3004651951030097,
"learning_rate": 1.9998682052012837e-07,
"loss": 1.6681,
"step": 12
},
{
"epoch": 0.03350515463917526,
"grad_norm": 2.9639928990218802,
"learning_rate": 1.9998331981811364e-07,
"loss": 1.5618,
"step": 13
},
{
"epoch": 0.03608247422680412,
"grad_norm": 3.0779182905002234,
"learning_rate": 1.9997940731714744e-07,
"loss": 1.7039,
"step": 14
},
{
"epoch": 0.03865979381443299,
"grad_norm": 2.9325641285273574,
"learning_rate": 1.9997508303334409e-07,
"loss": 1.6219,
"step": 15
},
{
"epoch": 0.041237113402061855,
"grad_norm": 2.8809471060714555,
"learning_rate": 1.9997034698451393e-07,
"loss": 1.7566,
"step": 16
},
{
"epoch": 0.04381443298969072,
"grad_norm": 3.341100705652755,
"learning_rate": 1.999651991901632e-07,
"loss": 1.6958,
"step": 17
},
{
"epoch": 0.04639175257731959,
"grad_norm": 2.8521720798434216,
"learning_rate": 1.9995963967149398e-07,
"loss": 1.5833,
"step": 18
},
{
"epoch": 0.04896907216494845,
"grad_norm": 3.2447253207769338,
"learning_rate": 1.9995366845140414e-07,
"loss": 1.6854,
"step": 19
},
{
"epoch": 0.05154639175257732,
"grad_norm": 3.033054116340073,
"learning_rate": 1.999472855544872e-07,
"loss": 1.6768,
"step": 20
},
{
"epoch": 0.05412371134020619,
"grad_norm": 2.7106967773151665,
"learning_rate": 1.9994049100703232e-07,
"loss": 1.5709,
"step": 21
},
{
"epoch": 0.05670103092783505,
"grad_norm": 2.8131217459267974,
"learning_rate": 1.9993328483702392e-07,
"loss": 1.5352,
"step": 22
},
{
"epoch": 0.059278350515463915,
"grad_norm": 2.9454711288855115,
"learning_rate": 1.9992566707414195e-07,
"loss": 1.6292,
"step": 23
},
{
"epoch": 0.061855670103092786,
"grad_norm": 2.719048700095618,
"learning_rate": 1.9991763774976155e-07,
"loss": 1.6504,
"step": 24
},
{
"epoch": 0.06443298969072164,
"grad_norm": 2.6465097422508914,
"learning_rate": 1.9990919689695282e-07,
"loss": 1.6398,
"step": 25
},
{
"epoch": 0.06701030927835051,
"grad_norm": 2.565964847805824,
"learning_rate": 1.9990034455048098e-07,
"loss": 1.6024,
"step": 26
},
{
"epoch": 0.06958762886597938,
"grad_norm": 2.4151701145393787,
"learning_rate": 1.9989108074680595e-07,
"loss": 1.6316,
"step": 27
},
{
"epoch": 0.07216494845360824,
"grad_norm": 2.6823187985959276,
"learning_rate": 1.998814055240823e-07,
"loss": 1.7421,
"step": 28
},
{
"epoch": 0.07474226804123711,
"grad_norm": 2.6044420857485755,
"learning_rate": 1.998713189221592e-07,
"loss": 1.5983,
"step": 29
},
{
"epoch": 0.07731958762886598,
"grad_norm": 2.3579361514426784,
"learning_rate": 1.9986082098258008e-07,
"loss": 1.5468,
"step": 30
},
{
"epoch": 0.07989690721649484,
"grad_norm": 2.3088177834083146,
"learning_rate": 1.9984991174858257e-07,
"loss": 1.5852,
"step": 31
},
{
"epoch": 0.08247422680412371,
"grad_norm": 2.5839184979450005,
"learning_rate": 1.9983859126509825e-07,
"loss": 1.6647,
"step": 32
},
{
"epoch": 0.08505154639175258,
"grad_norm": 2.2905291979602844,
"learning_rate": 1.9982685957875257e-07,
"loss": 1.5935,
"step": 33
},
{
"epoch": 0.08762886597938144,
"grad_norm": 2.3660300818606568,
"learning_rate": 1.998147167378645e-07,
"loss": 1.7655,
"step": 34
},
{
"epoch": 0.09020618556701031,
"grad_norm": 2.269544029552125,
"learning_rate": 1.9980216279244653e-07,
"loss": 1.6383,
"step": 35
},
{
"epoch": 0.09278350515463918,
"grad_norm": 2.2148823132358477,
"learning_rate": 1.9978919779420423e-07,
"loss": 1.7191,
"step": 36
},
{
"epoch": 0.09536082474226804,
"grad_norm": 2.295307555280267,
"learning_rate": 1.9977582179653632e-07,
"loss": 1.5571,
"step": 37
},
{
"epoch": 0.0979381443298969,
"grad_norm": 2.1570012388049262,
"learning_rate": 1.9976203485453414e-07,
"loss": 1.642,
"step": 38
},
{
"epoch": 0.10051546391752578,
"grad_norm": 2.327694183291453,
"learning_rate": 1.9974783702498166e-07,
"loss": 1.6388,
"step": 39
},
{
"epoch": 0.10309278350515463,
"grad_norm": 2.3531823980910382,
"learning_rate": 1.9973322836635516e-07,
"loss": 1.6407,
"step": 40
},
{
"epoch": 0.1056701030927835,
"grad_norm": 2.148246998681959,
"learning_rate": 1.9971820893882297e-07,
"loss": 1.6316,
"step": 41
},
{
"epoch": 0.10824742268041238,
"grad_norm": 1.824359532091145,
"learning_rate": 1.9970277880424528e-07,
"loss": 1.4812,
"step": 42
},
{
"epoch": 0.11082474226804123,
"grad_norm": 1.8420872667750698,
"learning_rate": 1.9968693802617374e-07,
"loss": 1.6208,
"step": 43
},
{
"epoch": 0.1134020618556701,
"grad_norm": 1.9242569129206386,
"learning_rate": 1.9967068666985148e-07,
"loss": 1.6866,
"step": 44
},
{
"epoch": 0.11597938144329897,
"grad_norm": 1.7555101549111227,
"learning_rate": 1.9965402480221257e-07,
"loss": 1.59,
"step": 45
},
{
"epoch": 0.11855670103092783,
"grad_norm": 1.83328616320706,
"learning_rate": 1.9963695249188181e-07,
"loss": 1.7787,
"step": 46
},
{
"epoch": 0.1211340206185567,
"grad_norm": 1.5464144842738474,
"learning_rate": 1.9961946980917453e-07,
"loss": 1.5605,
"step": 47
},
{
"epoch": 0.12371134020618557,
"grad_norm": 1.5700132071559665,
"learning_rate": 1.9960157682609632e-07,
"loss": 1.5188,
"step": 48
},
{
"epoch": 0.12628865979381443,
"grad_norm": 1.551927803815323,
"learning_rate": 1.9958327361634247e-07,
"loss": 1.5921,
"step": 49
},
{
"epoch": 0.12628865979381443,
"eval_loss": 1.5858733654022217,
"eval_runtime": 78.6563,
"eval_samples_per_second": 21.143,
"eval_steps_per_second": 1.322,
"step": 49
},
{
"epoch": 0.12886597938144329,
"grad_norm": 1.6459186978386617,
"learning_rate": 1.9956456025529805e-07,
"loss": 1.6407,
"step": 50
},
{
"epoch": 0.13144329896907217,
"grad_norm": 1.6778367242552643,
"learning_rate": 1.9954543682003732e-07,
"loss": 1.5755,
"step": 51
},
{
"epoch": 0.13402061855670103,
"grad_norm": 1.5846228635636366,
"learning_rate": 1.9952590338932356e-07,
"loss": 1.5236,
"step": 52
},
{
"epoch": 0.13659793814432988,
"grad_norm": 1.530322622789531,
"learning_rate": 1.9950596004360864e-07,
"loss": 1.6474,
"step": 53
},
{
"epoch": 0.13917525773195877,
"grad_norm": 1.5541727762346491,
"learning_rate": 1.994856068650327e-07,
"loss": 1.5926,
"step": 54
},
{
"epoch": 0.14175257731958762,
"grad_norm": 1.5422089413059752,
"learning_rate": 1.9946484393742394e-07,
"loss": 1.6057,
"step": 55
},
{
"epoch": 0.14432989690721648,
"grad_norm": 1.5086078750620586,
"learning_rate": 1.994436713462982e-07,
"loss": 1.6139,
"step": 56
},
{
"epoch": 0.14690721649484537,
"grad_norm": 1.4904490748313473,
"learning_rate": 1.994220891788584e-07,
"loss": 1.5613,
"step": 57
},
{
"epoch": 0.14948453608247422,
"grad_norm": 1.4446085113828102,
"learning_rate": 1.9940009752399457e-07,
"loss": 1.5838,
"step": 58
},
{
"epoch": 0.15206185567010308,
"grad_norm": 1.4944945344118559,
"learning_rate": 1.9937769647228327e-07,
"loss": 1.6009,
"step": 59
},
{
"epoch": 0.15463917525773196,
"grad_norm": 1.3673177038874413,
"learning_rate": 1.9935488611598714e-07,
"loss": 1.5295,
"step": 60
},
{
"epoch": 0.15721649484536082,
"grad_norm": 1.489918654317649,
"learning_rate": 1.9933166654905465e-07,
"loss": 1.6855,
"step": 61
},
{
"epoch": 0.15979381443298968,
"grad_norm": 1.4085364811053838,
"learning_rate": 1.993080378671197e-07,
"loss": 1.6171,
"step": 62
},
{
"epoch": 0.16237113402061856,
"grad_norm": 1.4063494910858265,
"learning_rate": 1.992840001675012e-07,
"loss": 1.548,
"step": 63
},
{
"epoch": 0.16494845360824742,
"grad_norm": 1.4013900053822443,
"learning_rate": 1.9925955354920263e-07,
"loss": 1.5674,
"step": 64
},
{
"epoch": 0.16752577319587628,
"grad_norm": 1.3995913424696536,
"learning_rate": 1.9923469811291173e-07,
"loss": 1.644,
"step": 65
},
{
"epoch": 0.17010309278350516,
"grad_norm": 1.4951716735691833,
"learning_rate": 1.99209433961e-07,
"loss": 1.6752,
"step": 66
},
{
"epoch": 0.17268041237113402,
"grad_norm": 1.4354454580093134,
"learning_rate": 1.9918376119752226e-07,
"loss": 1.6076,
"step": 67
},
{
"epoch": 0.17525773195876287,
"grad_norm": 1.5307588716137506,
"learning_rate": 1.9915767992821639e-07,
"loss": 1.6192,
"step": 68
},
{
"epoch": 0.17783505154639176,
"grad_norm": 1.37638400966553,
"learning_rate": 1.9913119026050267e-07,
"loss": 1.5744,
"step": 69
},
{
"epoch": 0.18041237113402062,
"grad_norm": 1.3694054278862016,
"learning_rate": 1.9910429230348344e-07,
"loss": 1.4495,
"step": 70
},
{
"epoch": 0.18298969072164947,
"grad_norm": 1.4276322894882787,
"learning_rate": 1.9907698616794276e-07,
"loss": 1.6427,
"step": 71
},
{
"epoch": 0.18556701030927836,
"grad_norm": 1.475589693442013,
"learning_rate": 1.990492719663457e-07,
"loss": 1.6231,
"step": 72
},
{
"epoch": 0.18814432989690721,
"grad_norm": 1.505476760952321,
"learning_rate": 1.990211498128381e-07,
"loss": 1.7036,
"step": 73
},
{
"epoch": 0.19072164948453607,
"grad_norm": 1.4498365666960409,
"learning_rate": 1.9899261982324607e-07,
"loss": 1.5564,
"step": 74
},
{
"epoch": 0.19329896907216496,
"grad_norm": 1.4542099562182622,
"learning_rate": 1.9896368211507535e-07,
"loss": 1.6012,
"step": 75
},
{
"epoch": 0.1958762886597938,
"grad_norm": 1.408394462248393,
"learning_rate": 1.9893433680751103e-07,
"loss": 1.5493,
"step": 76
},
{
"epoch": 0.19845360824742267,
"grad_norm": 1.4023960052363178,
"learning_rate": 1.9890458402141688e-07,
"loss": 1.6452,
"step": 77
},
{
"epoch": 0.20103092783505155,
"grad_norm": 1.4823050133687188,
"learning_rate": 1.988744238793351e-07,
"loss": 1.5991,
"step": 78
},
{
"epoch": 0.2036082474226804,
"grad_norm": 1.32937819085943,
"learning_rate": 1.9884385650548548e-07,
"loss": 1.5358,
"step": 79
},
{
"epoch": 0.20618556701030927,
"grad_norm": 1.3471888309972797,
"learning_rate": 1.9881288202576517e-07,
"loss": 1.5426,
"step": 80
},
{
"epoch": 0.20876288659793815,
"grad_norm": 1.34250330197651,
"learning_rate": 1.98781500567748e-07,
"loss": 1.5743,
"step": 81
},
{
"epoch": 0.211340206185567,
"grad_norm": 1.3158395928293942,
"learning_rate": 1.9874971226068412e-07,
"loss": 1.5914,
"step": 82
},
{
"epoch": 0.21391752577319587,
"grad_norm": 1.3088201655236604,
"learning_rate": 1.9871751723549926e-07,
"loss": 1.5307,
"step": 83
},
{
"epoch": 0.21649484536082475,
"grad_norm": 1.4622234110087462,
"learning_rate": 1.9868491562479426e-07,
"loss": 1.6698,
"step": 84
},
{
"epoch": 0.2190721649484536,
"grad_norm": 1.2966036743967264,
"learning_rate": 1.9865190756284464e-07,
"loss": 1.6172,
"step": 85
},
{
"epoch": 0.22164948453608246,
"grad_norm": 1.3416821729559592,
"learning_rate": 1.9861849318559995e-07,
"loss": 1.6395,
"step": 86
},
{
"epoch": 0.22422680412371135,
"grad_norm": 1.4246775767306445,
"learning_rate": 1.9858467263068319e-07,
"loss": 1.6048,
"step": 87
},
{
"epoch": 0.2268041237113402,
"grad_norm": 1.332606463309659,
"learning_rate": 1.9855044603739028e-07,
"loss": 1.6383,
"step": 88
},
{
"epoch": 0.22938144329896906,
"grad_norm": 1.380602547288226,
"learning_rate": 1.9851581354668948e-07,
"loss": 1.64,
"step": 89
},
{
"epoch": 0.23195876288659795,
"grad_norm": 1.3407177446168135,
"learning_rate": 1.984807753012208e-07,
"loss": 1.7039,
"step": 90
},
{
"epoch": 0.2345360824742268,
"grad_norm": 1.338866434398542,
"learning_rate": 1.9844533144529547e-07,
"loss": 1.5236,
"step": 91
},
{
"epoch": 0.23711340206185566,
"grad_norm": 1.274500058980513,
"learning_rate": 1.9840948212489526e-07,
"loss": 1.5713,
"step": 92
},
{
"epoch": 0.23969072164948454,
"grad_norm": 1.3410204352377493,
"learning_rate": 1.9837322748767194e-07,
"loss": 1.6058,
"step": 93
},
{
"epoch": 0.2422680412371134,
"grad_norm": 1.3188947135915765,
"learning_rate": 1.983365676829466e-07,
"loss": 1.6209,
"step": 94
},
{
"epoch": 0.24484536082474226,
"grad_norm": 1.2787506674738858,
"learning_rate": 1.9829950286170913e-07,
"loss": 1.5984,
"step": 95
},
{
"epoch": 0.24742268041237114,
"grad_norm": 1.3508302652980064,
"learning_rate": 1.9826203317661756e-07,
"loss": 1.5126,
"step": 96
},
{
"epoch": 0.25,
"grad_norm": 1.3775203706307013,
"learning_rate": 1.9822415878199737e-07,
"loss": 1.5806,
"step": 97
},
{
"epoch": 0.25257731958762886,
"grad_norm": 1.3953183701272227,
"learning_rate": 1.9818587983384095e-07,
"loss": 1.6391,
"step": 98
},
{
"epoch": 0.25257731958762886,
"eval_loss": 1.5530622005462646,
"eval_runtime": 78.7591,
"eval_samples_per_second": 21.115,
"eval_steps_per_second": 1.32,
"step": 98
},
{
"epoch": 0.2551546391752577,
"grad_norm": 1.2639205955569304,
"learning_rate": 1.981471964898069e-07,
"loss": 1.6154,
"step": 99
},
{
"epoch": 0.25773195876288657,
"grad_norm": 1.33461619126327,
"learning_rate": 1.9810810890921942e-07,
"loss": 1.5841,
"step": 100
},
{
"epoch": 0.2603092783505155,
"grad_norm": 1.3223001702133927,
"learning_rate": 1.980686172530676e-07,
"loss": 1.6292,
"step": 101
},
{
"epoch": 0.26288659793814434,
"grad_norm": 1.2560649642869146,
"learning_rate": 1.9802872168400478e-07,
"loss": 1.5673,
"step": 102
},
{
"epoch": 0.2654639175257732,
"grad_norm": 1.2597104528650152,
"learning_rate": 1.9798842236634795e-07,
"loss": 1.6508,
"step": 103
},
{
"epoch": 0.26804123711340205,
"grad_norm": 1.407282635250448,
"learning_rate": 1.979477194660769e-07,
"loss": 1.4872,
"step": 104
},
{
"epoch": 0.2706185567010309,
"grad_norm": 1.2016832149108632,
"learning_rate": 1.9790661315083375e-07,
"loss": 1.5604,
"step": 105
},
{
"epoch": 0.27319587628865977,
"grad_norm": 1.149030350241683,
"learning_rate": 1.978651035899221e-07,
"loss": 1.421,
"step": 106
},
{
"epoch": 0.2757731958762887,
"grad_norm": 1.3215975195174274,
"learning_rate": 1.9782319095430643e-07,
"loss": 1.5786,
"step": 107
},
{
"epoch": 0.27835051546391754,
"grad_norm": 1.2703092272910235,
"learning_rate": 1.9778087541661131e-07,
"loss": 1.484,
"step": 108
},
{
"epoch": 0.2809278350515464,
"grad_norm": 1.2413825121259754,
"learning_rate": 1.9773815715112072e-07,
"loss": 1.5041,
"step": 109
},
{
"epoch": 0.28350515463917525,
"grad_norm": 1.2972955973409976,
"learning_rate": 1.9769503633377743e-07,
"loss": 1.5719,
"step": 110
},
{
"epoch": 0.2860824742268041,
"grad_norm": 1.3905442390636398,
"learning_rate": 1.9765151314218209e-07,
"loss": 1.5788,
"step": 111
},
{
"epoch": 0.28865979381443296,
"grad_norm": 1.269867236059509,
"learning_rate": 1.976075877555927e-07,
"loss": 1.5358,
"step": 112
},
{
"epoch": 0.2912371134020619,
"grad_norm": 1.2521107632001138,
"learning_rate": 1.975632603549237e-07,
"loss": 1.5908,
"step": 113
},
{
"epoch": 0.29381443298969073,
"grad_norm": 1.2496393834141784,
"learning_rate": 1.9751853112274527e-07,
"loss": 1.5506,
"step": 114
},
{
"epoch": 0.2963917525773196,
"grad_norm": 1.2871218607928567,
"learning_rate": 1.974734002432827e-07,
"loss": 1.5275,
"step": 115
},
{
"epoch": 0.29896907216494845,
"grad_norm": 1.2976234741205572,
"learning_rate": 1.9742786790241546e-07,
"loss": 1.5444,
"step": 116
},
{
"epoch": 0.3015463917525773,
"grad_norm": 1.2017823329368622,
"learning_rate": 1.9738193428767654e-07,
"loss": 1.543,
"step": 117
},
{
"epoch": 0.30412371134020616,
"grad_norm": 1.226770431675134,
"learning_rate": 1.9733559958825167e-07,
"loss": 1.5397,
"step": 118
},
{
"epoch": 0.30670103092783507,
"grad_norm": 1.3442951015324778,
"learning_rate": 1.9728886399497844e-07,
"loss": 1.5852,
"step": 119
},
{
"epoch": 0.30927835051546393,
"grad_norm": 1.2017473551527889,
"learning_rate": 1.9724172770034564e-07,
"loss": 1.5318,
"step": 120
},
{
"epoch": 0.3118556701030928,
"grad_norm": 1.211656114042897,
"learning_rate": 1.9719419089849246e-07,
"loss": 1.5028,
"step": 121
},
{
"epoch": 0.31443298969072164,
"grad_norm": 1.400130154858166,
"learning_rate": 1.9714625378520756e-07,
"loss": 1.5582,
"step": 122
},
{
"epoch": 0.3170103092783505,
"grad_norm": 1.3086898697605782,
"learning_rate": 1.9709791655792847e-07,
"loss": 1.6549,
"step": 123
},
{
"epoch": 0.31958762886597936,
"grad_norm": 1.278029367300382,
"learning_rate": 1.9704917941574052e-07,
"loss": 1.5557,
"step": 124
},
{
"epoch": 0.32216494845360827,
"grad_norm": 1.2356382868741678,
"learning_rate": 1.9700004255937627e-07,
"loss": 1.5288,
"step": 125
},
{
"epoch": 0.3247422680412371,
"grad_norm": 1.28937440464536,
"learning_rate": 1.9695050619121457e-07,
"loss": 1.5266,
"step": 126
},
{
"epoch": 0.327319587628866,
"grad_norm": 1.4414848109811116,
"learning_rate": 1.9690057051527963e-07,
"loss": 1.6097,
"step": 127
},
{
"epoch": 0.32989690721649484,
"grad_norm": 1.2136781418976954,
"learning_rate": 1.9685023573724035e-07,
"loss": 1.4935,
"step": 128
},
{
"epoch": 0.3324742268041237,
"grad_norm": 1.3341115569144475,
"learning_rate": 1.9679950206440948e-07,
"loss": 1.5987,
"step": 129
},
{
"epoch": 0.33505154639175255,
"grad_norm": 1.329559323076734,
"learning_rate": 1.967483697057425e-07,
"loss": 1.5782,
"step": 130
},
{
"epoch": 0.33762886597938147,
"grad_norm": 1.2026583523005048,
"learning_rate": 1.9669683887183714e-07,
"loss": 1.5482,
"step": 131
},
{
"epoch": 0.3402061855670103,
"grad_norm": 1.230715216092296,
"learning_rate": 1.966449097749322e-07,
"loss": 1.637,
"step": 132
},
{
"epoch": 0.3427835051546392,
"grad_norm": 1.3616177214331797,
"learning_rate": 1.965925826289068e-07,
"loss": 1.5264,
"step": 133
},
{
"epoch": 0.34536082474226804,
"grad_norm": 1.1816372421732182,
"learning_rate": 1.965398576492796e-07,
"loss": 1.5349,
"step": 134
},
{
"epoch": 0.3479381443298969,
"grad_norm": 1.3503944653975188,
"learning_rate": 1.964867350532077e-07,
"loss": 1.5317,
"step": 135
},
{
"epoch": 0.35051546391752575,
"grad_norm": 1.3016847854244378,
"learning_rate": 1.9643321505948584e-07,
"loss": 1.6062,
"step": 136
},
{
"epoch": 0.35309278350515466,
"grad_norm": 1.19908669818476,
"learning_rate": 1.9637929788854564e-07,
"loss": 1.6179,
"step": 137
},
{
"epoch": 0.3556701030927835,
"grad_norm": 1.1945706816984818,
"learning_rate": 1.9632498376245445e-07,
"loss": 1.5982,
"step": 138
},
{
"epoch": 0.3582474226804124,
"grad_norm": 1.233096157789794,
"learning_rate": 1.9627027290491458e-07,
"loss": 1.572,
"step": 139
},
{
"epoch": 0.36082474226804123,
"grad_norm": 1.2228780779938433,
"learning_rate": 1.9621516554126237e-07,
"loss": 1.5789,
"step": 140
},
{
"epoch": 0.3634020618556701,
"grad_norm": 1.1898193013734535,
"learning_rate": 1.961596618984672e-07,
"loss": 1.4511,
"step": 141
},
{
"epoch": 0.36597938144329895,
"grad_norm": 1.25230398028528,
"learning_rate": 1.9610376220513066e-07,
"loss": 1.5529,
"step": 142
},
{
"epoch": 0.36855670103092786,
"grad_norm": 1.2693796938125035,
"learning_rate": 1.960474666914855e-07,
"loss": 1.5403,
"step": 143
},
{
"epoch": 0.3711340206185567,
"grad_norm": 1.3275717703634924,
"learning_rate": 1.9599077558939464e-07,
"loss": 1.4989,
"step": 144
},
{
"epoch": 0.37371134020618557,
"grad_norm": 1.1489906814896371,
"learning_rate": 1.959336891323505e-07,
"loss": 1.5074,
"step": 145
},
{
"epoch": 0.37628865979381443,
"grad_norm": 1.1875368070507506,
"learning_rate": 1.958762075554737e-07,
"loss": 1.5219,
"step": 146
},
{
"epoch": 0.3788659793814433,
"grad_norm": 1.2013715546004073,
"learning_rate": 1.9581833109551228e-07,
"loss": 1.5413,
"step": 147
},
{
"epoch": 0.3788659793814433,
"eval_loss": 1.5337220430374146,
"eval_runtime": 78.6436,
"eval_samples_per_second": 21.146,
"eval_steps_per_second": 1.322,
"step": 147
},
{
"epoch": 0.38144329896907214,
"grad_norm": 1.348552262306386,
"learning_rate": 1.9576005999084056e-07,
"loss": 1.5713,
"step": 148
},
{
"epoch": 0.38402061855670105,
"grad_norm": 1.2579524096365415,
"learning_rate": 1.9570139448145852e-07,
"loss": 1.5042,
"step": 149
},
{
"epoch": 0.3865979381443299,
"grad_norm": 1.2007903800378994,
"learning_rate": 1.9564233480899028e-07,
"loss": 1.4753,
"step": 150
},
{
"epoch": 0.38917525773195877,
"grad_norm": 1.14999357355067,
"learning_rate": 1.955828812166836e-07,
"loss": 1.489,
"step": 151
},
{
"epoch": 0.3917525773195876,
"grad_norm": 1.2834202884360733,
"learning_rate": 1.955230339494086e-07,
"loss": 1.5672,
"step": 152
},
{
"epoch": 0.3943298969072165,
"grad_norm": 1.2110339834614112,
"learning_rate": 1.9546279325365675e-07,
"loss": 1.5138,
"step": 153
},
{
"epoch": 0.39690721649484534,
"grad_norm": 1.2447583871603898,
"learning_rate": 1.9540215937754007e-07,
"loss": 1.5324,
"step": 154
},
{
"epoch": 0.39948453608247425,
"grad_norm": 1.2169740146814894,
"learning_rate": 1.9534113257078978e-07,
"loss": 1.5228,
"step": 155
},
{
"epoch": 0.4020618556701031,
"grad_norm": 1.3339392292279337,
"learning_rate": 1.9527971308475568e-07,
"loss": 1.5537,
"step": 156
},
{
"epoch": 0.40463917525773196,
"grad_norm": 1.1629410191581253,
"learning_rate": 1.952179011724047e-07,
"loss": 1.4565,
"step": 157
},
{
"epoch": 0.4072164948453608,
"grad_norm": 1.2166854685328994,
"learning_rate": 1.951556970883201e-07,
"loss": 1.4996,
"step": 158
},
{
"epoch": 0.4097938144329897,
"grad_norm": 1.1864599175194743,
"learning_rate": 1.9509310108870037e-07,
"loss": 1.5078,
"step": 159
},
{
"epoch": 0.41237113402061853,
"grad_norm": 1.2614891919139117,
"learning_rate": 1.9503011343135826e-07,
"loss": 1.6787,
"step": 160
},
{
"epoch": 0.41494845360824745,
"grad_norm": 1.2538176997908546,
"learning_rate": 1.9496673437571945e-07,
"loss": 1.5567,
"step": 161
},
{
"epoch": 0.4175257731958763,
"grad_norm": 1.2100512003350425,
"learning_rate": 1.9490296418282183e-07,
"loss": 1.5835,
"step": 162
},
{
"epoch": 0.42010309278350516,
"grad_norm": 1.176294102289334,
"learning_rate": 1.9483880311531423e-07,
"loss": 1.4902,
"step": 163
},
{
"epoch": 0.422680412371134,
"grad_norm": 1.2400060721796176,
"learning_rate": 1.9477425143745525e-07,
"loss": 1.5971,
"step": 164
},
{
"epoch": 0.4252577319587629,
"grad_norm": 1.1621100701911136,
"learning_rate": 1.9470930941511243e-07,
"loss": 1.5171,
"step": 165
},
{
"epoch": 0.42783505154639173,
"grad_norm": 1.2424661949562683,
"learning_rate": 1.9464397731576091e-07,
"loss": 1.4954,
"step": 166
},
{
"epoch": 0.43041237113402064,
"grad_norm": 1.23770627068237,
"learning_rate": 1.9457825540848255e-07,
"loss": 1.5326,
"step": 167
},
{
"epoch": 0.4329896907216495,
"grad_norm": 1.1862612005970397,
"learning_rate": 1.9451214396396453e-07,
"loss": 1.4912,
"step": 168
},
{
"epoch": 0.43556701030927836,
"grad_norm": 1.2831749441379539,
"learning_rate": 1.9444564325449853e-07,
"loss": 1.6117,
"step": 169
},
{
"epoch": 0.4381443298969072,
"grad_norm": 1.1531718726331943,
"learning_rate": 1.943787535539795e-07,
"loss": 1.4855,
"step": 170
},
{
"epoch": 0.44072164948453607,
"grad_norm": 1.1826441581231952,
"learning_rate": 1.9431147513790446e-07,
"loss": 1.5582,
"step": 171
},
{
"epoch": 0.44329896907216493,
"grad_norm": 1.1887449944628656,
"learning_rate": 1.9424380828337143e-07,
"loss": 1.5564,
"step": 172
},
{
"epoch": 0.44587628865979384,
"grad_norm": 1.249570543310612,
"learning_rate": 1.9417575326907831e-07,
"loss": 1.621,
"step": 173
},
{
"epoch": 0.4484536082474227,
"grad_norm": 1.3090306728609684,
"learning_rate": 1.941073103753217e-07,
"loss": 1.5282,
"step": 174
},
{
"epoch": 0.45103092783505155,
"grad_norm": 1.2503633263430554,
"learning_rate": 1.9403847988399566e-07,
"loss": 1.5513,
"step": 175
},
{
"epoch": 0.4536082474226804,
"grad_norm": 1.2018168355345367,
"learning_rate": 1.9396926207859085e-07,
"loss": 1.4957,
"step": 176
},
{
"epoch": 0.45618556701030927,
"grad_norm": 1.168765093642791,
"learning_rate": 1.9389965724419288e-07,
"loss": 1.5004,
"step": 177
},
{
"epoch": 0.4587628865979381,
"grad_norm": 1.250633142422843,
"learning_rate": 1.9382966566748167e-07,
"loss": 1.5387,
"step": 178
},
{
"epoch": 0.46134020618556704,
"grad_norm": 1.171229347123422,
"learning_rate": 1.9375928763672982e-07,
"loss": 1.596,
"step": 179
},
{
"epoch": 0.4639175257731959,
"grad_norm": 1.1693848944378227,
"learning_rate": 1.9368852344180166e-07,
"loss": 1.5147,
"step": 180
},
{
"epoch": 0.46649484536082475,
"grad_norm": 1.2828987442740891,
"learning_rate": 1.9361737337415204e-07,
"loss": 1.5539,
"step": 181
},
{
"epoch": 0.4690721649484536,
"grad_norm": 1.1925907017733204,
"learning_rate": 1.9354583772682512e-07,
"loss": 1.5752,
"step": 182
},
{
"epoch": 0.47164948453608246,
"grad_norm": 1.321152376647017,
"learning_rate": 1.93473916794453e-07,
"loss": 1.5952,
"step": 183
},
{
"epoch": 0.4742268041237113,
"grad_norm": 1.2480635026506552,
"learning_rate": 1.934016108732548e-07,
"loss": 1.5068,
"step": 184
},
{
"epoch": 0.47680412371134023,
"grad_norm": 1.2890663133137021,
"learning_rate": 1.9332892026103517e-07,
"loss": 1.4498,
"step": 185
},
{
"epoch": 0.4793814432989691,
"grad_norm": 1.278439525246191,
"learning_rate": 1.932558452571833e-07,
"loss": 1.5061,
"step": 186
},
{
"epoch": 0.48195876288659795,
"grad_norm": 1.2481302944858157,
"learning_rate": 1.931823861626714e-07,
"loss": 1.5672,
"step": 187
},
{
"epoch": 0.4845360824742268,
"grad_norm": 1.2421848632538859,
"learning_rate": 1.9310854328005378e-07,
"loss": 1.4985,
"step": 188
},
{
"epoch": 0.48711340206185566,
"grad_norm": 1.1840656288458875,
"learning_rate": 1.930343169134654e-07,
"loss": 1.556,
"step": 189
},
{
"epoch": 0.4896907216494845,
"grad_norm": 1.2585791993336888,
"learning_rate": 1.929597073686206e-07,
"loss": 1.5539,
"step": 190
},
{
"epoch": 0.49226804123711343,
"grad_norm": 1.123656686890668,
"learning_rate": 1.9288471495281203e-07,
"loss": 1.5377,
"step": 191
},
{
"epoch": 0.4948453608247423,
"grad_norm": 1.276688134117863,
"learning_rate": 1.9280933997490912e-07,
"loss": 1.5845,
"step": 192
},
{
"epoch": 0.49742268041237114,
"grad_norm": 1.231953746707157,
"learning_rate": 1.9273358274535702e-07,
"loss": 1.6142,
"step": 193
},
{
"epoch": 0.5,
"grad_norm": 1.3230553754067966,
"learning_rate": 1.926574435761753e-07,
"loss": 1.4738,
"step": 194
},
{
"epoch": 0.5025773195876289,
"grad_norm": 1.2436732656409537,
"learning_rate": 1.9258092278095657e-07,
"loss": 1.5969,
"step": 195
},
{
"epoch": 0.5051546391752577,
"grad_norm": 1.221047910828976,
"learning_rate": 1.925040206748652e-07,
"loss": 1.5962,
"step": 196
},
{
"epoch": 0.5051546391752577,
"eval_loss": 1.520858883857727,
"eval_runtime": 78.5683,
"eval_samples_per_second": 21.166,
"eval_steps_per_second": 1.324,
"step": 196
},
{
"epoch": 0.5077319587628866,
"grad_norm": 1.2212270479150868,
"learning_rate": 1.924267375746361e-07,
"loss": 1.5033,
"step": 197
},
{
"epoch": 0.5103092783505154,
"grad_norm": 1.2178250609326542,
"learning_rate": 1.9234907379857334e-07,
"loss": 1.577,
"step": 198
},
{
"epoch": 0.5128865979381443,
"grad_norm": 1.1521118751035526,
"learning_rate": 1.9227102966654895e-07,
"loss": 1.4468,
"step": 199
},
{
"epoch": 0.5154639175257731,
"grad_norm": 1.2132226025196962,
"learning_rate": 1.9219260550000143e-07,
"loss": 1.5135,
"step": 200
},
{
"epoch": 0.5180412371134021,
"grad_norm": 1.191186345232448,
"learning_rate": 1.921138016219345e-07,
"loss": 1.5146,
"step": 201
},
{
"epoch": 0.520618556701031,
"grad_norm": 1.2208830731174638,
"learning_rate": 1.9203461835691592e-07,
"loss": 1.5452,
"step": 202
},
{
"epoch": 0.5231958762886598,
"grad_norm": 1.2176060346511148,
"learning_rate": 1.9195505603107594e-07,
"loss": 1.5144,
"step": 203
},
{
"epoch": 0.5257731958762887,
"grad_norm": 1.1351041872872305,
"learning_rate": 1.9187511497210597e-07,
"loss": 1.5463,
"step": 204
},
{
"epoch": 0.5283505154639175,
"grad_norm": 1.1782470225350157,
"learning_rate": 1.9179479550925747e-07,
"loss": 1.4878,
"step": 205
},
{
"epoch": 0.5309278350515464,
"grad_norm": 1.0942788691010794,
"learning_rate": 1.9171409797334025e-07,
"loss": 1.5423,
"step": 206
},
{
"epoch": 0.5335051546391752,
"grad_norm": 1.2422690533739307,
"learning_rate": 1.9163302269672137e-07,
"loss": 1.5543,
"step": 207
},
{
"epoch": 0.5360824742268041,
"grad_norm": 1.187410857798478,
"learning_rate": 1.9155157001332372e-07,
"loss": 1.4864,
"step": 208
},
{
"epoch": 0.538659793814433,
"grad_norm": 1.2521757262499582,
"learning_rate": 1.9146974025862448e-07,
"loss": 1.5678,
"step": 209
},
{
"epoch": 0.5412371134020618,
"grad_norm": 1.1895335891190835,
"learning_rate": 1.91387533769654e-07,
"loss": 1.5359,
"step": 210
},
{
"epoch": 0.5438144329896907,
"grad_norm": 1.156080510817116,
"learning_rate": 1.9130495088499417e-07,
"loss": 1.4179,
"step": 211
},
{
"epoch": 0.5463917525773195,
"grad_norm": 1.2160395280121006,
"learning_rate": 1.912219919447772e-07,
"loss": 1.5288,
"step": 212
},
{
"epoch": 0.5489690721649485,
"grad_norm": 1.187251015976325,
"learning_rate": 1.9113865729068413e-07,
"loss": 1.5829,
"step": 213
},
{
"epoch": 0.5515463917525774,
"grad_norm": 1.2325994836421947,
"learning_rate": 1.9105494726594342e-07,
"loss": 1.5918,
"step": 214
},
{
"epoch": 0.5541237113402062,
"grad_norm": 1.2136013415323126,
"learning_rate": 1.9097086221532964e-07,
"loss": 1.5093,
"step": 215
},
{
"epoch": 0.5567010309278351,
"grad_norm": 1.1685027007257103,
"learning_rate": 1.9088640248516185e-07,
"loss": 1.5992,
"step": 216
},
{
"epoch": 0.5592783505154639,
"grad_norm": 1.2470178729913264,
"learning_rate": 1.908015684233024e-07,
"loss": 1.5845,
"step": 217
},
{
"epoch": 0.5618556701030928,
"grad_norm": 1.3342781963513264,
"learning_rate": 1.9071636037915533e-07,
"loss": 1.5227,
"step": 218
},
{
"epoch": 0.5644329896907216,
"grad_norm": 1.2834111003737632,
"learning_rate": 1.90630778703665e-07,
"loss": 1.5278,
"step": 219
},
{
"epoch": 0.5670103092783505,
"grad_norm": 1.2731317285054349,
"learning_rate": 1.9054482374931466e-07,
"loss": 1.558,
"step": 220
},
{
"epoch": 0.5695876288659794,
"grad_norm": 1.2315820199483811,
"learning_rate": 1.9045849587012496e-07,
"loss": 1.5586,
"step": 221
},
{
"epoch": 0.5721649484536082,
"grad_norm": 1.2995032591648374,
"learning_rate": 1.9037179542165253e-07,
"loss": 1.5726,
"step": 222
},
{
"epoch": 0.5747422680412371,
"grad_norm": 1.2207628382258247,
"learning_rate": 1.902847227609884e-07,
"loss": 1.5622,
"step": 223
},
{
"epoch": 0.5773195876288659,
"grad_norm": 1.1578307509849368,
"learning_rate": 1.901972782467568e-07,
"loss": 1.5029,
"step": 224
},
{
"epoch": 0.5798969072164949,
"grad_norm": 1.2559554939477484,
"learning_rate": 1.9010946223911333e-07,
"loss": 1.5536,
"step": 225
},
{
"epoch": 0.5824742268041238,
"grad_norm": 1.1912957688409214,
"learning_rate": 1.9002127509974374e-07,
"loss": 1.4107,
"step": 226
},
{
"epoch": 0.5850515463917526,
"grad_norm": 1.347391803127549,
"learning_rate": 1.899327171918623e-07,
"loss": 1.4981,
"step": 227
},
{
"epoch": 0.5876288659793815,
"grad_norm": 1.1735029116257494,
"learning_rate": 1.8984378888021042e-07,
"loss": 1.4931,
"step": 228
},
{
"epoch": 0.5902061855670103,
"grad_norm": 1.1491563326269614,
"learning_rate": 1.8975449053105503e-07,
"loss": 1.439,
"step": 229
},
{
"epoch": 0.5927835051546392,
"grad_norm": 1.1281459530728108,
"learning_rate": 1.8966482251218715e-07,
"loss": 1.5317,
"step": 230
},
{
"epoch": 0.595360824742268,
"grad_norm": 1.1698523464033057,
"learning_rate": 1.8957478519292032e-07,
"loss": 1.533,
"step": 231
},
{
"epoch": 0.5979381443298969,
"grad_norm": 1.2253794089203258,
"learning_rate": 1.8948437894408918e-07,
"loss": 1.566,
"step": 232
},
{
"epoch": 0.6005154639175257,
"grad_norm": 1.2704578177761554,
"learning_rate": 1.893936041380478e-07,
"loss": 1.5496,
"step": 233
},
{
"epoch": 0.6030927835051546,
"grad_norm": 1.270569192705897,
"learning_rate": 1.8930246114866822e-07,
"loss": 1.4762,
"step": 234
},
{
"epoch": 0.6056701030927835,
"grad_norm": 1.1748786103242588,
"learning_rate": 1.8921095035133896e-07,
"loss": 1.5641,
"step": 235
},
{
"epoch": 0.6082474226804123,
"grad_norm": 1.2029791452687832,
"learning_rate": 1.891190721229634e-07,
"loss": 1.5694,
"step": 236
},
{
"epoch": 0.6108247422680413,
"grad_norm": 1.19680587233996,
"learning_rate": 1.890268268419582e-07,
"loss": 1.5538,
"step": 237
},
{
"epoch": 0.6134020618556701,
"grad_norm": 1.1874592772095638,
"learning_rate": 1.8893421488825187e-07,
"loss": 1.4978,
"step": 238
},
{
"epoch": 0.615979381443299,
"grad_norm": 1.216069233807722,
"learning_rate": 1.888412366432831e-07,
"loss": 1.584,
"step": 239
},
{
"epoch": 0.6185567010309279,
"grad_norm": 1.2090175073299552,
"learning_rate": 1.8874789248999913e-07,
"loss": 1.5486,
"step": 240
},
{
"epoch": 0.6211340206185567,
"grad_norm": 1.1599735542109655,
"learning_rate": 1.8865418281285444e-07,
"loss": 1.512,
"step": 241
},
{
"epoch": 0.6237113402061856,
"grad_norm": 1.1508476690774565,
"learning_rate": 1.885601079978088e-07,
"loss": 1.4699,
"step": 242
},
{
"epoch": 0.6262886597938144,
"grad_norm": 1.294126202956922,
"learning_rate": 1.8846566843232594e-07,
"loss": 1.6185,
"step": 243
},
{
"epoch": 0.6288659793814433,
"grad_norm": 1.1538551018422412,
"learning_rate": 1.883708645053719e-07,
"loss": 1.5284,
"step": 244
},
{
"epoch": 0.6314432989690721,
"grad_norm": 1.1790058528070886,
"learning_rate": 1.882756966074134e-07,
"loss": 1.5235,
"step": 245
},
{
"epoch": 0.6314432989690721,
"eval_loss": 1.510589361190796,
"eval_runtime": 78.6198,
"eval_samples_per_second": 21.152,
"eval_steps_per_second": 1.323,
"step": 245
},
{
"epoch": 0.634020618556701,
"grad_norm": 1.1938102380471263,
"learning_rate": 1.8818016513041623e-07,
"loss": 1.5028,
"step": 246
},
{
"epoch": 0.6365979381443299,
"grad_norm": 1.231310461159998,
"learning_rate": 1.8808427046784362e-07,
"loss": 1.5686,
"step": 247
},
{
"epoch": 0.6391752577319587,
"grad_norm": 1.3015696329059996,
"learning_rate": 1.8798801301465467e-07,
"loss": 1.579,
"step": 248
},
{
"epoch": 0.6417525773195877,
"grad_norm": 1.1482602866030465,
"learning_rate": 1.8789139316730269e-07,
"loss": 1.5331,
"step": 249
},
{
"epoch": 0.6443298969072165,
"grad_norm": 1.231219314227984,
"learning_rate": 1.8779441132373359e-07,
"loss": 1.5366,
"step": 250
},
{
"epoch": 0.6469072164948454,
"grad_norm": 1.2531642119413817,
"learning_rate": 1.876970678833842e-07,
"loss": 1.5246,
"step": 251
},
{
"epoch": 0.6494845360824743,
"grad_norm": 1.1332607994718875,
"learning_rate": 1.8759936324718066e-07,
"loss": 1.5029,
"step": 252
},
{
"epoch": 0.6520618556701031,
"grad_norm": 1.123414985710231,
"learning_rate": 1.8750129781753677e-07,
"loss": 1.5992,
"step": 253
},
{
"epoch": 0.654639175257732,
"grad_norm": 1.1601574273566644,
"learning_rate": 1.874028719983523e-07,
"loss": 1.4271,
"step": 254
},
{
"epoch": 0.6572164948453608,
"grad_norm": 1.2155208006708451,
"learning_rate": 1.8730408619501138e-07,
"loss": 1.5939,
"step": 255
},
{
"epoch": 0.6597938144329897,
"grad_norm": 1.181434829014358,
"learning_rate": 1.8720494081438076e-07,
"loss": 1.5416,
"step": 256
},
{
"epoch": 0.6623711340206185,
"grad_norm": 1.1457316456562228,
"learning_rate": 1.8710543626480818e-07,
"loss": 1.4854,
"step": 257
},
{
"epoch": 0.6649484536082474,
"grad_norm": 1.1872624778137861,
"learning_rate": 1.8700557295612072e-07,
"loss": 1.5045,
"step": 258
},
{
"epoch": 0.6675257731958762,
"grad_norm": 1.2856636838183533,
"learning_rate": 1.8690535129962305e-07,
"loss": 1.4678,
"step": 259
},
{
"epoch": 0.6701030927835051,
"grad_norm": 1.131984435899355,
"learning_rate": 1.8680477170809572e-07,
"loss": 1.5706,
"step": 260
},
{
"epoch": 0.6726804123711341,
"grad_norm": 1.2653048133418598,
"learning_rate": 1.8670383459579356e-07,
"loss": 1.5623,
"step": 261
},
{
"epoch": 0.6752577319587629,
"grad_norm": 1.2245543813976405,
"learning_rate": 1.8660254037844388e-07,
"loss": 1.5039,
"step": 262
},
{
"epoch": 0.6778350515463918,
"grad_norm": 1.1778675556929805,
"learning_rate": 1.8650088947324475e-07,
"loss": 1.5143,
"step": 263
},
{
"epoch": 0.6804123711340206,
"grad_norm": 1.1796106429583424,
"learning_rate": 1.863988822988634e-07,
"loss": 1.5867,
"step": 264
},
{
"epoch": 0.6829896907216495,
"grad_norm": 1.143095546666012,
"learning_rate": 1.8629651927543443e-07,
"loss": 1.4735,
"step": 265
},
{
"epoch": 0.6855670103092784,
"grad_norm": 1.1803235220482347,
"learning_rate": 1.8619380082455796e-07,
"loss": 1.4606,
"step": 266
},
{
"epoch": 0.6881443298969072,
"grad_norm": 1.2218442431344259,
"learning_rate": 1.8609072736929806e-07,
"loss": 1.5409,
"step": 267
},
{
"epoch": 0.6907216494845361,
"grad_norm": 1.2044546146531363,
"learning_rate": 1.85987299334181e-07,
"loss": 1.5279,
"step": 268
},
{
"epoch": 0.6932989690721649,
"grad_norm": 1.2619745333120211,
"learning_rate": 1.8588351714519335e-07,
"loss": 1.5244,
"step": 269
},
{
"epoch": 0.6958762886597938,
"grad_norm": 1.256000322805203,
"learning_rate": 1.8577938122978042e-07,
"loss": 1.5294,
"step": 270
},
{
"epoch": 0.6984536082474226,
"grad_norm": 1.2356982681147777,
"learning_rate": 1.856748920168443e-07,
"loss": 1.5036,
"step": 271
},
{
"epoch": 0.7010309278350515,
"grad_norm": 1.2037362943983936,
"learning_rate": 1.855700499367423e-07,
"loss": 1.5235,
"step": 272
},
{
"epoch": 0.7036082474226805,
"grad_norm": 1.2017143929693659,
"learning_rate": 1.85464855421285e-07,
"loss": 1.4204,
"step": 273
},
{
"epoch": 0.7061855670103093,
"grad_norm": 1.1908996404734937,
"learning_rate": 1.8535930890373465e-07,
"loss": 1.4969,
"step": 274
},
{
"epoch": 0.7087628865979382,
"grad_norm": 1.1577329971672512,
"learning_rate": 1.8525341081880312e-07,
"loss": 1.5319,
"step": 275
},
{
"epoch": 0.711340206185567,
"grad_norm": 1.1714981246895275,
"learning_rate": 1.8514716160265045e-07,
"loss": 1.4177,
"step": 276
},
{
"epoch": 0.7139175257731959,
"grad_norm": 1.1688981848930113,
"learning_rate": 1.8504056169288274e-07,
"loss": 1.5234,
"step": 277
},
{
"epoch": 0.7164948453608248,
"grad_norm": 1.176710170060508,
"learning_rate": 1.8493361152855057e-07,
"loss": 1.499,
"step": 278
},
{
"epoch": 0.7190721649484536,
"grad_norm": 1.1039383442864374,
"learning_rate": 1.8482631155014703e-07,
"loss": 1.5258,
"step": 279
},
{
"epoch": 0.7216494845360825,
"grad_norm": 1.232497346510154,
"learning_rate": 1.84718662199606e-07,
"loss": 1.5564,
"step": 280
},
{
"epoch": 0.7242268041237113,
"grad_norm": 1.1628995381634444,
"learning_rate": 1.8461066392030046e-07,
"loss": 1.4091,
"step": 281
},
{
"epoch": 0.7268041237113402,
"grad_norm": 1.2777142820565022,
"learning_rate": 1.8450231715704026e-07,
"loss": 1.4754,
"step": 282
},
{
"epoch": 0.729381443298969,
"grad_norm": 1.2162243240659913,
"learning_rate": 1.843936223560707e-07,
"loss": 1.5473,
"step": 283
},
{
"epoch": 0.7319587628865979,
"grad_norm": 1.2147904802438685,
"learning_rate": 1.8428457996507053e-07,
"loss": 1.5296,
"step": 284
},
{
"epoch": 0.7345360824742269,
"grad_norm": 1.19577901711321,
"learning_rate": 1.8417519043315004e-07,
"loss": 1.542,
"step": 285
},
{
"epoch": 0.7371134020618557,
"grad_norm": 1.252475138336633,
"learning_rate": 1.8406545421084938e-07,
"loss": 1.5293,
"step": 286
},
{
"epoch": 0.7396907216494846,
"grad_norm": 1.1515656379492916,
"learning_rate": 1.8395537175013654e-07,
"loss": 1.5272,
"step": 287
},
{
"epoch": 0.7422680412371134,
"grad_norm": 1.1517700578396561,
"learning_rate": 1.8384494350440552e-07,
"loss": 1.5133,
"step": 288
},
{
"epoch": 0.7448453608247423,
"grad_norm": 1.217323252639824,
"learning_rate": 1.8373416992847458e-07,
"loss": 1.5009,
"step": 289
},
{
"epoch": 0.7474226804123711,
"grad_norm": 1.1814204725087243,
"learning_rate": 1.8362305147858428e-07,
"loss": 1.4538,
"step": 290
},
{
"epoch": 0.75,
"grad_norm": 1.1842613200601082,
"learning_rate": 1.835115886123955e-07,
"loss": 1.3816,
"step": 291
},
{
"epoch": 0.7525773195876289,
"grad_norm": 1.2063574196502098,
"learning_rate": 1.8339978178898778e-07,
"loss": 1.5965,
"step": 292
},
{
"epoch": 0.7551546391752577,
"grad_norm": 1.2685230099116653,
"learning_rate": 1.8328763146885725e-07,
"loss": 1.5637,
"step": 293
},
{
"epoch": 0.7577319587628866,
"grad_norm": 1.295213064366882,
"learning_rate": 1.8317513811391476e-07,
"loss": 1.5592,
"step": 294
},
{
"epoch": 0.7577319587628866,
"eval_loss": 1.5018398761749268,
"eval_runtime": 78.561,
"eval_samples_per_second": 21.168,
"eval_steps_per_second": 1.324,
"step": 294
},
{
"epoch": 0.7603092783505154,
"grad_norm": 1.1669863622367527,
"learning_rate": 1.830623021874841e-07,
"loss": 1.5081,
"step": 295
},
{
"epoch": 0.7628865979381443,
"grad_norm": 1.1910397422917334,
"learning_rate": 1.8294912415429992e-07,
"loss": 1.523,
"step": 296
},
{
"epoch": 0.7654639175257731,
"grad_norm": 1.1665026656613802,
"learning_rate": 1.8283560448050594e-07,
"loss": 1.4753,
"step": 297
},
{
"epoch": 0.7680412371134021,
"grad_norm": 1.212187645390271,
"learning_rate": 1.8272174363365297e-07,
"loss": 1.4983,
"step": 298
},
{
"epoch": 0.770618556701031,
"grad_norm": 1.2227876601034444,
"learning_rate": 1.8260754208269701e-07,
"loss": 1.5019,
"step": 299
},
{
"epoch": 0.7731958762886598,
"grad_norm": 1.2358555763549743,
"learning_rate": 1.8249300029799733e-07,
"loss": 1.5965,
"step": 300
},
{
"epoch": 0.7757731958762887,
"grad_norm": 1.187640438130257,
"learning_rate": 1.8237811875131444e-07,
"loss": 1.591,
"step": 301
},
{
"epoch": 0.7783505154639175,
"grad_norm": 1.2214707732869985,
"learning_rate": 1.8226289791580828e-07,
"loss": 1.5274,
"step": 302
},
{
"epoch": 0.7809278350515464,
"grad_norm": 1.2019657180078016,
"learning_rate": 1.8214733826603625e-07,
"loss": 1.5021,
"step": 303
},
{
"epoch": 0.7835051546391752,
"grad_norm": 1.16960231687607,
"learning_rate": 1.820314402779511e-07,
"loss": 1.5763,
"step": 304
},
{
"epoch": 0.7860824742268041,
"grad_norm": 1.152389731802479,
"learning_rate": 1.8191520442889918e-07,
"loss": 1.5176,
"step": 305
},
{
"epoch": 0.788659793814433,
"grad_norm": 1.1132515669118002,
"learning_rate": 1.8179863119761833e-07,
"loss": 1.4634,
"step": 306
},
{
"epoch": 0.7912371134020618,
"grad_norm": 1.1607539313280772,
"learning_rate": 1.8168172106423606e-07,
"loss": 1.4798,
"step": 307
},
{
"epoch": 0.7938144329896907,
"grad_norm": 1.2145359718563615,
"learning_rate": 1.8156447451026728e-07,
"loss": 1.594,
"step": 308
},
{
"epoch": 0.7963917525773195,
"grad_norm": 1.1870844292463605,
"learning_rate": 1.814468920186127e-07,
"loss": 1.478,
"step": 309
},
{
"epoch": 0.7989690721649485,
"grad_norm": 1.1233767004431354,
"learning_rate": 1.8132897407355653e-07,
"loss": 1.5882,
"step": 310
},
{
"epoch": 0.8015463917525774,
"grad_norm": 1.1738330684693277,
"learning_rate": 1.8121072116076464e-07,
"loss": 1.4284,
"step": 311
},
{
"epoch": 0.8041237113402062,
"grad_norm": 1.247978839030236,
"learning_rate": 1.8109213376728257e-07,
"loss": 1.5824,
"step": 312
},
{
"epoch": 0.8067010309278351,
"grad_norm": 1.2318777988562417,
"learning_rate": 1.8097321238153336e-07,
"loss": 1.5185,
"step": 313
},
{
"epoch": 0.8092783505154639,
"grad_norm": 1.137207160847728,
"learning_rate": 1.808539574933158e-07,
"loss": 1.448,
"step": 314
},
{
"epoch": 0.8118556701030928,
"grad_norm": 1.203622066974504,
"learning_rate": 1.8073436959380212e-07,
"loss": 1.5003,
"step": 315
},
{
"epoch": 0.8144329896907216,
"grad_norm": 1.1618827104260305,
"learning_rate": 1.8061444917553627e-07,
"loss": 1.4603,
"step": 316
},
{
"epoch": 0.8170103092783505,
"grad_norm": 1.1455984024451822,
"learning_rate": 1.8049419673243164e-07,
"loss": 1.4366,
"step": 317
},
{
"epoch": 0.8195876288659794,
"grad_norm": 1.1500253179290463,
"learning_rate": 1.803736127597691e-07,
"loss": 1.5403,
"step": 318
},
{
"epoch": 0.8221649484536082,
"grad_norm": 1.2632412244799347,
"learning_rate": 1.8025269775419507e-07,
"loss": 1.5003,
"step": 319
},
{
"epoch": 0.8247422680412371,
"grad_norm": 1.142698108221298,
"learning_rate": 1.8013145221371934e-07,
"loss": 1.4732,
"step": 320
},
{
"epoch": 0.8273195876288659,
"grad_norm": 1.2124460871646654,
"learning_rate": 1.8000987663771306e-07,
"loss": 1.5311,
"step": 321
},
{
"epoch": 0.8298969072164949,
"grad_norm": 1.2348590930541292,
"learning_rate": 1.798879715269067e-07,
"loss": 1.5741,
"step": 322
},
{
"epoch": 0.8324742268041238,
"grad_norm": 1.1498349377386237,
"learning_rate": 1.79765737383388e-07,
"loss": 1.361,
"step": 323
},
{
"epoch": 0.8350515463917526,
"grad_norm": 1.189403441559741,
"learning_rate": 1.796431747105998e-07,
"loss": 1.5002,
"step": 324
},
{
"epoch": 0.8376288659793815,
"grad_norm": 1.2170644285030623,
"learning_rate": 1.7952028401333816e-07,
"loss": 1.5508,
"step": 325
},
{
"epoch": 0.8402061855670103,
"grad_norm": 1.2305649106918,
"learning_rate": 1.793970657977501e-07,
"loss": 1.5185,
"step": 326
},
{
"epoch": 0.8427835051546392,
"grad_norm": 1.1928858589906648,
"learning_rate": 1.7927352057133156e-07,
"loss": 1.5859,
"step": 327
},
{
"epoch": 0.845360824742268,
"grad_norm": 1.2402447474397933,
"learning_rate": 1.791496488429254e-07,
"loss": 1.4482,
"step": 328
},
{
"epoch": 0.8479381443298969,
"grad_norm": 1.3004615784711493,
"learning_rate": 1.7902545112271916e-07,
"loss": 1.4996,
"step": 329
},
{
"epoch": 0.8505154639175257,
"grad_norm": 1.2029226714523475,
"learning_rate": 1.7890092792224314e-07,
"loss": 1.4729,
"step": 330
},
{
"epoch": 0.8530927835051546,
"grad_norm": 1.1646016402710766,
"learning_rate": 1.7877607975436803e-07,
"loss": 1.511,
"step": 331
},
{
"epoch": 0.8556701030927835,
"grad_norm": 1.1748241861140345,
"learning_rate": 1.7865090713330312e-07,
"loss": 1.5406,
"step": 332
},
{
"epoch": 0.8582474226804123,
"grad_norm": 1.1988219111182623,
"learning_rate": 1.785254105745939e-07,
"loss": 1.5364,
"step": 333
},
{
"epoch": 0.8608247422680413,
"grad_norm": 1.2920016906616154,
"learning_rate": 1.7839959059512014e-07,
"loss": 1.5188,
"step": 334
},
{
"epoch": 0.8634020618556701,
"grad_norm": 1.1390205414249481,
"learning_rate": 1.7827344771309362e-07,
"loss": 1.4749,
"step": 335
},
{
"epoch": 0.865979381443299,
"grad_norm": 1.207725667468718,
"learning_rate": 1.7814698244805603e-07,
"loss": 1.5144,
"step": 336
},
{
"epoch": 0.8685567010309279,
"grad_norm": 1.2708389359824341,
"learning_rate": 1.780201953208769e-07,
"loss": 1.4633,
"step": 337
},
{
"epoch": 0.8711340206185567,
"grad_norm": 1.3588744934998203,
"learning_rate": 1.7789308685375146e-07,
"loss": 1.5194,
"step": 338
},
{
"epoch": 0.8737113402061856,
"grad_norm": 1.1714299642439896,
"learning_rate": 1.7776565757019829e-07,
"loss": 1.4378,
"step": 339
},
{
"epoch": 0.8762886597938144,
"grad_norm": 1.2349197329756814,
"learning_rate": 1.7763790799505743e-07,
"loss": 1.501,
"step": 340
},
{
"epoch": 0.8788659793814433,
"grad_norm": 1.145994840644305,
"learning_rate": 1.7750983865448804e-07,
"loss": 1.3569,
"step": 341
},
{
"epoch": 0.8814432989690721,
"grad_norm": 1.147878510470048,
"learning_rate": 1.773814500759663e-07,
"loss": 1.4907,
"step": 342
},
{
"epoch": 0.884020618556701,
"grad_norm": 1.2101479142325238,
"learning_rate": 1.7725274278828324e-07,
"loss": 1.5045,
"step": 343
},
{
"epoch": 0.884020618556701,
"eval_loss": 1.4945380687713623,
"eval_runtime": 78.6415,
"eval_samples_per_second": 21.147,
"eval_steps_per_second": 1.322,
"step": 343
},
{
"epoch": 0.8865979381443299,
"grad_norm": 1.2038990843843793,
"learning_rate": 1.7712371732154257e-07,
"loss": 1.4554,
"step": 344
},
{
"epoch": 0.8891752577319587,
"grad_norm": 1.1472367305664413,
"learning_rate": 1.7699437420715838e-07,
"loss": 1.4611,
"step": 345
},
{
"epoch": 0.8917525773195877,
"grad_norm": 1.2170090657627353,
"learning_rate": 1.768647139778532e-07,
"loss": 1.4619,
"step": 346
},
{
"epoch": 0.8943298969072165,
"grad_norm": 1.1815824919293882,
"learning_rate": 1.7673473716765553e-07,
"loss": 1.5022,
"step": 347
},
{
"epoch": 0.8969072164948454,
"grad_norm": 1.1967591939256936,
"learning_rate": 1.766044443118978e-07,
"loss": 1.4812,
"step": 348
},
{
"epoch": 0.8994845360824743,
"grad_norm": 1.228975686058958,
"learning_rate": 1.7647383594721413e-07,
"loss": 1.4943,
"step": 349
},
{
"epoch": 0.9020618556701031,
"grad_norm": 1.2132506060158343,
"learning_rate": 1.7634291261153818e-07,
"loss": 1.4852,
"step": 350
},
{
"epoch": 0.904639175257732,
"grad_norm": 1.2581183528068558,
"learning_rate": 1.7621167484410076e-07,
"loss": 1.5311,
"step": 351
},
{
"epoch": 0.9072164948453608,
"grad_norm": 1.1976025658343157,
"learning_rate": 1.7608012318542776e-07,
"loss": 1.5623,
"step": 352
},
{
"epoch": 0.9097938144329897,
"grad_norm": 1.2081117148971663,
"learning_rate": 1.7594825817733804e-07,
"loss": 1.4877,
"step": 353
},
{
"epoch": 0.9123711340206185,
"grad_norm": 1.25102310904074,
"learning_rate": 1.7581608036294074e-07,
"loss": 1.5166,
"step": 354
},
{
"epoch": 0.9149484536082474,
"grad_norm": 1.1251058107211171,
"learning_rate": 1.7568359028663362e-07,
"loss": 1.4818,
"step": 355
},
{
"epoch": 0.9175257731958762,
"grad_norm": 1.162404179159399,
"learning_rate": 1.7555078849410042e-07,
"loss": 1.4684,
"step": 356
},
{
"epoch": 0.9201030927835051,
"grad_norm": 1.1939177374027512,
"learning_rate": 1.754176755323088e-07,
"loss": 1.3906,
"step": 357
},
{
"epoch": 0.9226804123711341,
"grad_norm": 1.2277839442625762,
"learning_rate": 1.7528425194950793e-07,
"loss": 1.5206,
"step": 358
},
{
"epoch": 0.9252577319587629,
"grad_norm": 1.1589149786868607,
"learning_rate": 1.7515051829522643e-07,
"loss": 1.5117,
"step": 359
},
{
"epoch": 0.9278350515463918,
"grad_norm": 1.161766915938516,
"learning_rate": 1.7501647512026993e-07,
"loss": 1.5142,
"step": 360
},
{
"epoch": 0.9304123711340206,
"grad_norm": 1.1895671903848675,
"learning_rate": 1.7488212297671897e-07,
"loss": 1.5279,
"step": 361
},
{
"epoch": 0.9329896907216495,
"grad_norm": 1.3331865087236399,
"learning_rate": 1.7474746241792646e-07,
"loss": 1.4476,
"step": 362
},
{
"epoch": 0.9355670103092784,
"grad_norm": 1.1227191881644327,
"learning_rate": 1.746124939985158e-07,
"loss": 1.436,
"step": 363
},
{
"epoch": 0.9381443298969072,
"grad_norm": 1.1453288975869358,
"learning_rate": 1.7447721827437817e-07,
"loss": 1.4721,
"step": 364
},
{
"epoch": 0.9407216494845361,
"grad_norm": 1.1800301680843552,
"learning_rate": 1.7434163580267056e-07,
"loss": 1.4648,
"step": 365
},
{
"epoch": 0.9432989690721649,
"grad_norm": 1.1592086626138536,
"learning_rate": 1.7420574714181327e-07,
"loss": 1.4645,
"step": 366
},
{
"epoch": 0.9458762886597938,
"grad_norm": 1.1969987793516494,
"learning_rate": 1.7406955285148782e-07,
"loss": 1.4628,
"step": 367
},
{
"epoch": 0.9484536082474226,
"grad_norm": 1.25319893461736,
"learning_rate": 1.7393305349263432e-07,
"loss": 1.5327,
"step": 368
},
{
"epoch": 0.9510309278350515,
"grad_norm": 1.1235076122412295,
"learning_rate": 1.7379624962744954e-07,
"loss": 1.457,
"step": 369
},
{
"epoch": 0.9536082474226805,
"grad_norm": 1.215770975088775,
"learning_rate": 1.7365914181938438e-07,
"loss": 1.4802,
"step": 370
},
{
"epoch": 0.9561855670103093,
"grad_norm": 1.1400445439752551,
"learning_rate": 1.7352173063314147e-07,
"loss": 1.4078,
"step": 371
},
{
"epoch": 0.9587628865979382,
"grad_norm": 1.219412218457137,
"learning_rate": 1.7338401663467307e-07,
"loss": 1.4863,
"step": 372
},
{
"epoch": 0.961340206185567,
"grad_norm": 1.2307165231693638,
"learning_rate": 1.732460003911786e-07,
"loss": 1.547,
"step": 373
},
{
"epoch": 0.9639175257731959,
"grad_norm": 1.1928743718959285,
"learning_rate": 1.731076824711023e-07,
"loss": 1.4681,
"step": 374
},
{
"epoch": 0.9664948453608248,
"grad_norm": 1.2210774438706382,
"learning_rate": 1.7296906344413101e-07,
"loss": 1.5359,
"step": 375
},
{
"epoch": 0.9690721649484536,
"grad_norm": 1.1755911854453769,
"learning_rate": 1.7283014388119157e-07,
"loss": 1.5286,
"step": 376
},
{
"epoch": 0.9716494845360825,
"grad_norm": 1.1189926107564905,
"learning_rate": 1.7269092435444878e-07,
"loss": 1.4309,
"step": 377
},
{
"epoch": 0.9742268041237113,
"grad_norm": 1.209816536244005,
"learning_rate": 1.7255140543730282e-07,
"loss": 1.4689,
"step": 378
},
{
"epoch": 0.9768041237113402,
"grad_norm": 1.1866285142861848,
"learning_rate": 1.7241158770438697e-07,
"loss": 1.4972,
"step": 379
},
{
"epoch": 0.979381443298969,
"grad_norm": 1.1354634757481643,
"learning_rate": 1.722714717315652e-07,
"loss": 1.4873,
"step": 380
},
{
"epoch": 0.9819587628865979,
"grad_norm": 1.2944770552807037,
"learning_rate": 1.7213105809593e-07,
"loss": 1.4974,
"step": 381
},
{
"epoch": 0.9845360824742269,
"grad_norm": 1.103791679895453,
"learning_rate": 1.719903473757996e-07,
"loss": 1.4338,
"step": 382
},
{
"epoch": 0.9871134020618557,
"grad_norm": 1.1784721051806777,
"learning_rate": 1.7184934015071594e-07,
"loss": 1.4041,
"step": 383
},
{
"epoch": 0.9896907216494846,
"grad_norm": 1.1348338130977504,
"learning_rate": 1.7170803700144225e-07,
"loss": 1.4413,
"step": 384
},
{
"epoch": 0.9922680412371134,
"grad_norm": 1.2250889412679622,
"learning_rate": 1.7156643850996044e-07,
"loss": 1.4629,
"step": 385
},
{
"epoch": 0.9948453608247423,
"grad_norm": 1.1045983289273678,
"learning_rate": 1.7142454525946888e-07,
"loss": 1.5546,
"step": 386
},
{
"epoch": 0.9974226804123711,
"grad_norm": 1.1516418913315656,
"learning_rate": 1.7128235783437998e-07,
"loss": 1.5631,
"step": 387
},
{
"epoch": 1.0,
"grad_norm": 1.252168700059035,
"learning_rate": 1.7113987682031778e-07,
"loss": 1.4422,
"step": 388
},
{
"epoch": 1.0025773195876289,
"grad_norm": 1.189319163542339,
"learning_rate": 1.7099710280411546e-07,
"loss": 1.5383,
"step": 389
},
{
"epoch": 1.0051546391752577,
"grad_norm": 1.2727165097128585,
"learning_rate": 1.70854036373813e-07,
"loss": 1.5408,
"step": 390
},
{
"epoch": 1.0077319587628866,
"grad_norm": 1.1517050348302873,
"learning_rate": 1.7071067811865473e-07,
"loss": 1.5864,
"step": 391
},
{
"epoch": 1.0103092783505154,
"grad_norm": 1.3325861122052731,
"learning_rate": 1.7056702862908702e-07,
"loss": 1.5524,
"step": 392
},
{
"epoch": 1.0103092783505154,
"eval_loss": 1.4885141849517822,
"eval_runtime": 78.424,
"eval_samples_per_second": 21.205,
"eval_steps_per_second": 1.326,
"step": 392
},
{
"epoch": 1.0128865979381443,
"grad_norm": 1.1135739405983736,
"learning_rate": 1.7042308849675554e-07,
"loss": 1.5054,
"step": 393
},
{
"epoch": 1.0154639175257731,
"grad_norm": 1.1782103759330078,
"learning_rate": 1.7027885831450317e-07,
"loss": 1.4809,
"step": 394
},
{
"epoch": 1.018041237113402,
"grad_norm": 1.1307316665373648,
"learning_rate": 1.701343386763674e-07,
"loss": 1.4176,
"step": 395
},
{
"epoch": 1.0206185567010309,
"grad_norm": 1.2226276517588748,
"learning_rate": 1.6998953017757785e-07,
"loss": 1.5829,
"step": 396
},
{
"epoch": 1.0231958762886597,
"grad_norm": 1.2403418129653008,
"learning_rate": 1.698444334145539e-07,
"loss": 1.5954,
"step": 397
},
{
"epoch": 1.0257731958762886,
"grad_norm": 1.1302836106915826,
"learning_rate": 1.6969904898490212e-07,
"loss": 1.4231,
"step": 398
},
{
"epoch": 1.0283505154639174,
"grad_norm": 1.141960483416689,
"learning_rate": 1.6955337748741405e-07,
"loss": 1.4287,
"step": 399
},
{
"epoch": 1.0309278350515463,
"grad_norm": 1.196477232474438,
"learning_rate": 1.694074195220634e-07,
"loss": 1.5239,
"step": 400
},
{
"epoch": 1.0335051546391754,
"grad_norm": 1.183187501385808,
"learning_rate": 1.692611756900038e-07,
"loss": 1.497,
"step": 401
},
{
"epoch": 1.0360824742268042,
"grad_norm": 1.150174147558412,
"learning_rate": 1.691146465935663e-07,
"loss": 1.5532,
"step": 402
},
{
"epoch": 1.038659793814433,
"grad_norm": 1.2448204002333718,
"learning_rate": 1.689678328362569e-07,
"loss": 1.416,
"step": 403
},
{
"epoch": 1.041237113402062,
"grad_norm": 1.1109759208202117,
"learning_rate": 1.6882073502275392e-07,
"loss": 1.5012,
"step": 404
},
{
"epoch": 1.0438144329896908,
"grad_norm": 1.1567096038742686,
"learning_rate": 1.6867335375890566e-07,
"loss": 1.5053,
"step": 405
},
{
"epoch": 1.0463917525773196,
"grad_norm": 1.1754138924074398,
"learning_rate": 1.6852568965172792e-07,
"loss": 1.5129,
"step": 406
},
{
"epoch": 1.0489690721649485,
"grad_norm": 1.23193132568122,
"learning_rate": 1.6837774330940136e-07,
"loss": 1.5573,
"step": 407
},
{
"epoch": 1.0515463917525774,
"grad_norm": 1.154132682102343,
"learning_rate": 1.6822951534126908e-07,
"loss": 1.4258,
"step": 408
},
{
"epoch": 1.0541237113402062,
"grad_norm": 1.1683702220075676,
"learning_rate": 1.680810063578342e-07,
"loss": 1.493,
"step": 409
},
{
"epoch": 1.056701030927835,
"grad_norm": 1.1355190434284121,
"learning_rate": 1.6793221697075716e-07,
"loss": 1.5119,
"step": 410
},
{
"epoch": 1.059278350515464,
"grad_norm": 1.1992497667084585,
"learning_rate": 1.6778314779285324e-07,
"loss": 1.538,
"step": 411
},
{
"epoch": 1.0618556701030928,
"grad_norm": 1.1517964539720562,
"learning_rate": 1.6763379943809027e-07,
"loss": 1.4665,
"step": 412
},
{
"epoch": 1.0644329896907216,
"grad_norm": 1.0984210499840694,
"learning_rate": 1.6748417252158577e-07,
"loss": 1.4328,
"step": 413
},
{
"epoch": 1.0670103092783505,
"grad_norm": 1.1299450982658101,
"learning_rate": 1.6733426765960456e-07,
"loss": 1.5028,
"step": 414
},
{
"epoch": 1.0695876288659794,
"grad_norm": 1.212850591316243,
"learning_rate": 1.6718408546955635e-07,
"loss": 1.5834,
"step": 415
},
{
"epoch": 1.0721649484536082,
"grad_norm": 1.187341231477269,
"learning_rate": 1.6703362656999299e-07,
"loss": 1.5069,
"step": 416
},
{
"epoch": 1.074742268041237,
"grad_norm": 1.2469684651532016,
"learning_rate": 1.6688289158060593e-07,
"loss": 1.518,
"step": 417
},
{
"epoch": 1.077319587628866,
"grad_norm": 1.254398054291776,
"learning_rate": 1.6673188112222395e-07,
"loss": 1.578,
"step": 418
},
{
"epoch": 1.0798969072164948,
"grad_norm": 1.1499801218824168,
"learning_rate": 1.665805958168102e-07,
"loss": 1.4979,
"step": 419
},
{
"epoch": 1.0824742268041236,
"grad_norm": 1.1976396691121443,
"learning_rate": 1.664290362874599e-07,
"loss": 1.4914,
"step": 420
},
{
"epoch": 1.0850515463917525,
"grad_norm": 1.1348401564795523,
"learning_rate": 1.662772031583978e-07,
"loss": 1.3902,
"step": 421
},
{
"epoch": 1.0876288659793814,
"grad_norm": 1.2267166932133524,
"learning_rate": 1.6612509705497542e-07,
"loss": 1.4352,
"step": 422
},
{
"epoch": 1.0902061855670102,
"grad_norm": 1.2873463533597629,
"learning_rate": 1.6597271860366856e-07,
"loss": 1.4478,
"step": 423
},
{
"epoch": 1.092783505154639,
"grad_norm": 1.3679857975054832,
"learning_rate": 1.6582006843207478e-07,
"loss": 1.5168,
"step": 424
},
{
"epoch": 1.0953608247422681,
"grad_norm": 1.326554289290517,
"learning_rate": 1.6566714716891078e-07,
"loss": 1.5008,
"step": 425
},
{
"epoch": 1.097938144329897,
"grad_norm": 1.168969016350491,
"learning_rate": 1.6551395544400978e-07,
"loss": 1.4917,
"step": 426
},
{
"epoch": 1.1005154639175259,
"grad_norm": 1.2413798753485674,
"learning_rate": 1.6536049388831893e-07,
"loss": 1.4502,
"step": 427
},
{
"epoch": 1.1030927835051547,
"grad_norm": 1.1635621820926023,
"learning_rate": 1.652067631338967e-07,
"loss": 1.557,
"step": 428
},
{
"epoch": 1.1056701030927836,
"grad_norm": 1.1573375306268514,
"learning_rate": 1.6505276381391036e-07,
"loss": 1.4244,
"step": 429
},
{
"epoch": 1.1082474226804124,
"grad_norm": 1.2312412177915255,
"learning_rate": 1.6489849656263335e-07,
"loss": 1.5494,
"step": 430
},
{
"epoch": 1.1108247422680413,
"grad_norm": 1.219284880839308,
"learning_rate": 1.647439620154425e-07,
"loss": 1.5306,
"step": 431
},
{
"epoch": 1.1134020618556701,
"grad_norm": 1.173558682623126,
"learning_rate": 1.6458916080881563e-07,
"loss": 1.4429,
"step": 432
},
{
"epoch": 1.0025773195876289,
"grad_norm": 1.229487690642213,
"learning_rate": 1.6443409358032887e-07,
"loss": 1.5753,
"step": 433
},
{
"epoch": 1.0051546391752577,
"grad_norm": 1.2105170741564812,
"learning_rate": 1.6427876096865392e-07,
"loss": 1.5334,
"step": 434
},
{
"epoch": 1.0077319587628866,
"grad_norm": 1.204008054808549,
"learning_rate": 1.6412316361355562e-07,
"loss": 1.42,
"step": 435
},
{
"epoch": 1.0103092783505154,
"grad_norm": 1.1326791826110472,
"learning_rate": 1.6396730215588912e-07,
"loss": 1.4714,
"step": 436
},
{
"epoch": 1.0128865979381443,
"grad_norm": 1.1200550697122906,
"learning_rate": 1.6381117723759734e-07,
"loss": 1.514,
"step": 437
},
{
"epoch": 1.0154639175257731,
"grad_norm": 1.1890623492712462,
"learning_rate": 1.6365478950170833e-07,
"loss": 1.4181,
"step": 438
},
{
"epoch": 1.018041237113402,
"grad_norm": 1.1631198253400261,
"learning_rate": 1.6349813959233255e-07,
"loss": 1.4062,
"step": 439
},
{
"epoch": 1.0206185567010309,
"grad_norm": 1.1360996622048518,
"learning_rate": 1.6334122815466031e-07,
"loss": 1.4486,
"step": 440
},
{
"epoch": 1.0231958762886597,
"grad_norm": 1.1864758464899412,
"learning_rate": 1.6318405583495913e-07,
"loss": 1.5347,
"step": 441
},
{
"epoch": 1.0231958762886597,
"eval_loss": 1.4830812215805054,
"eval_runtime": 78.5114,
"eval_samples_per_second": 21.182,
"eval_steps_per_second": 1.325,
"step": 441
},
{
"epoch": 1.0257731958762886,
"grad_norm": 1.1301160006601543,
"learning_rate": 1.6302662328057085e-07,
"loss": 1.4353,
"step": 442
},
{
"epoch": 1.0283505154639174,
"grad_norm": 1.1894059515483042,
"learning_rate": 1.6286893113990932e-07,
"loss": 1.469,
"step": 443
},
{
"epoch": 1.0309278350515463,
"grad_norm": 1.1496261846772073,
"learning_rate": 1.627109800624574e-07,
"loss": 1.5501,
"step": 444
},
{
"epoch": 1.0335051546391754,
"grad_norm": 1.2088185832357161,
"learning_rate": 1.6255277069876451e-07,
"loss": 1.4899,
"step": 445
},
{
"epoch": 1.0360824742268042,
"grad_norm": 1.1253812221554047,
"learning_rate": 1.6239430370044387e-07,
"loss": 1.4122,
"step": 446
},
{
"epoch": 1.038659793814433,
"grad_norm": 1.1716232931347121,
"learning_rate": 1.6223557972016973e-07,
"loss": 1.439,
"step": 447
},
{
"epoch": 1.041237113402062,
"grad_norm": 1.18342528126353,
"learning_rate": 1.6207659941167485e-07,
"loss": 1.5094,
"step": 448
},
{
"epoch": 1.0438144329896908,
"grad_norm": 1.2039062898512076,
"learning_rate": 1.6191736342974767e-07,
"loss": 1.4619,
"step": 449
},
{
"epoch": 1.0463917525773196,
"grad_norm": 1.2183703075903023,
"learning_rate": 1.617578724302297e-07,
"loss": 1.5232,
"step": 450
},
{
"epoch": 1.0489690721649485,
"grad_norm": 1.1388070881208434,
"learning_rate": 1.615981270700128e-07,
"loss": 1.4638,
"step": 451
},
{
"epoch": 1.0515463917525774,
"grad_norm": 1.0877380908149572,
"learning_rate": 1.6143812800703642e-07,
"loss": 1.4447,
"step": 452
},
{
"epoch": 1.0541237113402062,
"grad_norm": 1.1716268781083103,
"learning_rate": 1.6127787590028495e-07,
"loss": 1.5212,
"step": 453
},
{
"epoch": 1.056701030927835,
"grad_norm": 1.107434556978612,
"learning_rate": 1.6111737140978493e-07,
"loss": 1.4558,
"step": 454
},
{
"epoch": 1.059278350515464,
"grad_norm": 1.2519354029249565,
"learning_rate": 1.609566151966025e-07,
"loss": 1.4528,
"step": 455
},
{
"epoch": 1.0618556701030928,
"grad_norm": 1.1919323581174677,
"learning_rate": 1.6079560792284045e-07,
"loss": 1.5621,
"step": 456
},
{
"epoch": 1.0644329896907216,
"grad_norm": 1.1817947401366415,
"learning_rate": 1.6063435025163568e-07,
"loss": 1.4662,
"step": 457
},
{
"epoch": 1.0670103092783505,
"grad_norm": 1.2557632574926887,
"learning_rate": 1.6047284284715642e-07,
"loss": 1.4804,
"step": 458
},
{
"epoch": 1.0695876288659794,
"grad_norm": 1.2611184908202628,
"learning_rate": 1.6031108637459932e-07,
"loss": 1.3898,
"step": 459
},
{
"epoch": 1.0721649484536082,
"grad_norm": 1.2900278262304008,
"learning_rate": 1.6014908150018703e-07,
"loss": 1.5064,
"step": 460
},
{
"epoch": 1.074742268041237,
"grad_norm": 1.195779708533936,
"learning_rate": 1.5998682889116524e-07,
"loss": 1.5224,
"step": 461
},
{
"epoch": 1.077319587628866,
"grad_norm": 1.1566664249843968,
"learning_rate": 1.5982432921579993e-07,
"loss": 1.4517,
"step": 462
},
{
"epoch": 1.0798969072164948,
"grad_norm": 1.2001020296312388,
"learning_rate": 1.596615831433747e-07,
"loss": 1.5602,
"step": 463
},
{
"epoch": 1.0824742268041236,
"grad_norm": 1.1943899233375934,
"learning_rate": 1.5949859134418796e-07,
"loss": 1.3757,
"step": 464
},
{
"epoch": 1.0850515463917525,
"grad_norm": 1.231964645169981,
"learning_rate": 1.5933535448955027e-07,
"loss": 1.4859,
"step": 465
},
{
"epoch": 1.0876288659793814,
"grad_norm": 1.1068734683342414,
"learning_rate": 1.5917187325178137e-07,
"loss": 1.4629,
"step": 466
},
{
"epoch": 1.0902061855670102,
"grad_norm": 1.1513773116941175,
"learning_rate": 1.590081483042076e-07,
"loss": 1.5125,
"step": 467
},
{
"epoch": 1.0927835051546393,
"grad_norm": 1.265359820624344,
"learning_rate": 1.5884418032115906e-07,
"loss": 1.5204,
"step": 468
},
{
"epoch": 1.0953608247422681,
"grad_norm": 1.1596012619544869,
"learning_rate": 1.5867996997796683e-07,
"loss": 1.4528,
"step": 469
},
{
"epoch": 1.097938144329897,
"grad_norm": 1.1953930948748877,
"learning_rate": 1.5851551795096025e-07,
"loss": 1.404,
"step": 470
},
{
"epoch": 1.1005154639175259,
"grad_norm": 1.1467999018042732,
"learning_rate": 1.5835082491746393e-07,
"loss": 1.5314,
"step": 471
},
{
"epoch": 1.1030927835051547,
"grad_norm": 1.208554802219746,
"learning_rate": 1.581858915557953e-07,
"loss": 1.4632,
"step": 472
},
{
"epoch": 1.1056701030927836,
"grad_norm": 1.210149302840143,
"learning_rate": 1.580207185452614e-07,
"loss": 1.4828,
"step": 473
},
{
"epoch": 1.1082474226804124,
"grad_norm": 1.0949101750229728,
"learning_rate": 1.5785530656615654e-07,
"loss": 1.4612,
"step": 474
},
{
"epoch": 1.1108247422680413,
"grad_norm": 1.1550991304470553,
"learning_rate": 1.576896562997591e-07,
"loss": 1.5112,
"step": 475
},
{
"epoch": 1.1134020618556701,
"grad_norm": 1.267086705459486,
"learning_rate": 1.5752376842832898e-07,
"loss": 1.5086,
"step": 476
},
{
"epoch": 1.115979381443299,
"grad_norm": 1.157659801945543,
"learning_rate": 1.573576436351046e-07,
"loss": 1.4721,
"step": 477
},
{
"epoch": 1.1185567010309279,
"grad_norm": 1.1792779255646542,
"learning_rate": 1.571912826043003e-07,
"loss": 1.4216,
"step": 478
},
{
"epoch": 1.1211340206185567,
"grad_norm": 1.279434721476292,
"learning_rate": 1.5702468602110331e-07,
"loss": 1.4098,
"step": 479
},
{
"epoch": 1.1237113402061856,
"grad_norm": 1.2412716991217037,
"learning_rate": 1.5685785457167113e-07,
"loss": 1.4855,
"step": 480
},
{
"epoch": 1.1262886597938144,
"grad_norm": 1.1878566044688987,
"learning_rate": 1.5669078894312847e-07,
"loss": 1.5252,
"step": 481
},
{
"epoch": 1.1288659793814433,
"grad_norm": 1.2441727908973987,
"learning_rate": 1.565234898235646e-07,
"loss": 1.5462,
"step": 482
},
{
"epoch": 1.1314432989690721,
"grad_norm": 1.176061624777031,
"learning_rate": 1.5635595790203056e-07,
"loss": 1.5135,
"step": 483
},
{
"epoch": 1.134020618556701,
"grad_norm": 1.246481799384192,
"learning_rate": 1.5618819386853602e-07,
"loss": 1.5357,
"step": 484
},
{
"epoch": 1.1365979381443299,
"grad_norm": 1.2042279646873306,
"learning_rate": 1.5602019841404688e-07,
"loss": 1.5146,
"step": 485
},
{
"epoch": 1.1391752577319587,
"grad_norm": 1.1664753868373192,
"learning_rate": 1.5585197223048202e-07,
"loss": 1.5007,
"step": 486
},
{
"epoch": 1.1417525773195876,
"grad_norm": 1.0786695822166654,
"learning_rate": 1.5568351601071068e-07,
"loss": 1.4637,
"step": 487
},
{
"epoch": 1.1443298969072164,
"grad_norm": 1.1782507265833873,
"learning_rate": 1.5551483044854952e-07,
"loss": 1.4811,
"step": 488
},
{
"epoch": 1.1469072164948453,
"grad_norm": 1.2326350516083906,
"learning_rate": 1.5534591623875985e-07,
"loss": 1.5482,
"step": 489
},
{
"epoch": 1.1494845360824741,
"grad_norm": 1.3932475474558166,
"learning_rate": 1.551767740770446e-07,
"loss": 1.4994,
"step": 490
},
{
"epoch": 1.1494845360824741,
"eval_loss": 1.4784166812896729,
"eval_runtime": 78.5816,
"eval_samples_per_second": 21.163,
"eval_steps_per_second": 1.323,
"step": 490
},
{
"epoch": 1.152061855670103,
"grad_norm": 1.2782842614630645,
"learning_rate": 1.5500740466004562e-07,
"loss": 1.4751,
"step": 491
},
{
"epoch": 1.1546391752577319,
"grad_norm": 1.216799121655535,
"learning_rate": 1.5483780868534083e-07,
"loss": 1.4724,
"step": 492
},
{
"epoch": 1.1572164948453607,
"grad_norm": 1.1868499010457458,
"learning_rate": 1.546679868514411e-07,
"loss": 1.4335,
"step": 493
},
{
"epoch": 1.1597938144329896,
"grad_norm": 1.199212625101753,
"learning_rate": 1.544979398577877e-07,
"loss": 1.428,
"step": 494
},
{
"epoch": 1.1623711340206186,
"grad_norm": 1.1357296953077098,
"learning_rate": 1.543276684047491e-07,
"loss": 1.4542,
"step": 495
},
{
"epoch": 1.1649484536082475,
"grad_norm": 1.213487385523563,
"learning_rate": 1.5415717319361846e-07,
"loss": 1.4724,
"step": 496
},
{
"epoch": 1.1675257731958764,
"grad_norm": 1.1099648565570772,
"learning_rate": 1.5398645492661028e-07,
"loss": 1.4254,
"step": 497
},
{
"epoch": 1.1701030927835052,
"grad_norm": 1.1324706525701729,
"learning_rate": 1.5381551430685795e-07,
"loss": 1.5048,
"step": 498
},
{
"epoch": 1.172680412371134,
"grad_norm": 1.312867551517799,
"learning_rate": 1.5364435203841056e-07,
"loss": 1.4713,
"step": 499
},
{
"epoch": 1.175257731958763,
"grad_norm": 1.1933326421003594,
"learning_rate": 1.5347296882623017e-07,
"loss": 1.5138,
"step": 500
},
{
"epoch": 1.1778350515463918,
"grad_norm": 1.0985685695284346,
"learning_rate": 1.533013653761887e-07,
"loss": 1.433,
"step": 501
},
{
"epoch": 1.1804123711340206,
"grad_norm": 1.1149163975715322,
"learning_rate": 1.5312954239506533e-07,
"loss": 1.3835,
"step": 502
},
{
"epoch": 1.1829896907216495,
"grad_norm": 1.3227767494195912,
"learning_rate": 1.529575005905433e-07,
"loss": 1.4895,
"step": 503
},
{
"epoch": 1.1855670103092784,
"grad_norm": 1.214579951187228,
"learning_rate": 1.5278524067120717e-07,
"loss": 1.5998,
"step": 504
},
{
"epoch": 1.1881443298969072,
"grad_norm": 1.242415460112634,
"learning_rate": 1.5261276334653982e-07,
"loss": 1.419,
"step": 505
},
{
"epoch": 1.190721649484536,
"grad_norm": 1.2389773021924564,
"learning_rate": 1.5244006932691953e-07,
"loss": 1.4202,
"step": 506
},
{
"epoch": 1.193298969072165,
"grad_norm": 1.2349193613971634,
"learning_rate": 1.5226715932361716e-07,
"loss": 1.5457,
"step": 507
},
{
"epoch": 1.1958762886597938,
"grad_norm": 1.1148921709276238,
"learning_rate": 1.5209403404879302e-07,
"loss": 1.3884,
"step": 508
},
{
"epoch": 1.1984536082474226,
"grad_norm": 1.2416754407978092,
"learning_rate": 1.5192069421549416e-07,
"loss": 1.4643,
"step": 509
},
{
"epoch": 1.2010309278350515,
"grad_norm": 1.240689395283768,
"learning_rate": 1.5174714053765122e-07,
"loss": 1.572,
"step": 510
},
{
"epoch": 1.2036082474226804,
"grad_norm": 1.140745518174075,
"learning_rate": 1.5157337373007578e-07,
"loss": 1.3663,
"step": 511
},
{
"epoch": 1.2061855670103092,
"grad_norm": 1.1312657539313165,
"learning_rate": 1.5139939450845698e-07,
"loss": 1.4681,
"step": 512
},
{
"epoch": 1.208762886597938,
"grad_norm": 1.1613965817840117,
"learning_rate": 1.51225203589359e-07,
"loss": 1.548,
"step": 513
},
{
"epoch": 1.211340206185567,
"grad_norm": 1.1253218321610134,
"learning_rate": 1.5105080169021789e-07,
"loss": 1.4644,
"step": 514
},
{
"epoch": 1.2139175257731958,
"grad_norm": 1.1202729459915262,
"learning_rate": 1.5087618952933866e-07,
"loss": 1.4874,
"step": 515
},
{
"epoch": 1.2164948453608249,
"grad_norm": 1.1754915638068841,
"learning_rate": 1.5070136782589233e-07,
"loss": 1.4904,
"step": 516
},
{
"epoch": 1.2190721649484537,
"grad_norm": 1.211459122094429,
"learning_rate": 1.5052633729991294e-07,
"loss": 1.4832,
"step": 517
},
{
"epoch": 1.2216494845360826,
"grad_norm": 1.2489759850317173,
"learning_rate": 1.5035109867229456e-07,
"loss": 1.4464,
"step": 518
},
{
"epoch": 1.2242268041237114,
"grad_norm": 1.2194317834170105,
"learning_rate": 1.5017565266478848e-07,
"loss": 1.4897,
"step": 519
},
{
"epoch": 1.2268041237113403,
"grad_norm": 1.1036732258357687,
"learning_rate": 1.5e-07,
"loss": 1.491,
"step": 520
},
{
"epoch": 1.2293814432989691,
"grad_norm": 1.1658107658884465,
"learning_rate": 1.4982414140138563e-07,
"loss": 1.4678,
"step": 521
},
{
"epoch": 1.231958762886598,
"grad_norm": 1.2704801398111358,
"learning_rate": 1.4964807759325008e-07,
"loss": 1.3781,
"step": 522
},
{
"epoch": 1.2345360824742269,
"grad_norm": 1.1848897409786574,
"learning_rate": 1.4947180930074323e-07,
"loss": 1.4799,
"step": 523
},
{
"epoch": 1.2371134020618557,
"grad_norm": 1.2016447040520333,
"learning_rate": 1.492953372498571e-07,
"loss": 1.5686,
"step": 524
},
{
"epoch": 1.2396907216494846,
"grad_norm": 1.2911746325303657,
"learning_rate": 1.4911866216742307e-07,
"loss": 1.5241,
"step": 525
},
{
"epoch": 1.2422680412371134,
"grad_norm": 1.1990990248512616,
"learning_rate": 1.4894178478110855e-07,
"loss": 1.5357,
"step": 526
},
{
"epoch": 1.2448453608247423,
"grad_norm": 1.149144012214145,
"learning_rate": 1.4876470581941434e-07,
"loss": 1.4571,
"step": 527
},
{
"epoch": 1.2474226804123711,
"grad_norm": 1.198321859008649,
"learning_rate": 1.485874260116714e-07,
"loss": 1.5113,
"step": 528
},
{
"epoch": 1.25,
"grad_norm": 1.2113266741136735,
"learning_rate": 1.4840994608803788e-07,
"loss": 1.4782,
"step": 529
},
{
"epoch": 1.2525773195876289,
"grad_norm": 1.1425317175556289,
"learning_rate": 1.4823226677949622e-07,
"loss": 1.5012,
"step": 530
},
{
"epoch": 1.2551546391752577,
"grad_norm": 1.268980235594048,
"learning_rate": 1.4805438881784995e-07,
"loss": 1.4529,
"step": 531
},
{
"epoch": 1.2577319587628866,
"grad_norm": 1.151209820959519,
"learning_rate": 1.478763129357209e-07,
"loss": 1.4734,
"step": 532
},
{
"epoch": 1.2603092783505154,
"grad_norm": 1.2260752095042977,
"learning_rate": 1.4769803986654603e-07,
"loss": 1.4896,
"step": 533
},
{
"epoch": 1.2628865979381443,
"grad_norm": 1.2017887268263763,
"learning_rate": 1.4751957034457445e-07,
"loss": 1.4667,
"step": 534
},
{
"epoch": 1.2654639175257731,
"grad_norm": 1.2271959233872554,
"learning_rate": 1.4734090510486432e-07,
"loss": 1.4888,
"step": 535
},
{
"epoch": 1.268041237113402,
"grad_norm": 1.2197382019523413,
"learning_rate": 1.4716204488328006e-07,
"loss": 1.5358,
"step": 536
},
{
"epoch": 1.2706185567010309,
"grad_norm": 1.1416105765632265,
"learning_rate": 1.4698299041648902e-07,
"loss": 1.4275,
"step": 537
},
{
"epoch": 1.2731958762886597,
"grad_norm": 1.2103999127902116,
"learning_rate": 1.468037424419586e-07,
"loss": 1.4822,
"step": 538
},
{
"epoch": 1.2757731958762886,
"grad_norm": 1.2127169663908728,
"learning_rate": 1.4662430169795328e-07,
"loss": 1.4477,
"step": 539
},
{
"epoch": 1.2757731958762886,
"eval_loss": 1.474165678024292,
"eval_runtime": 78.6592,
"eval_samples_per_second": 21.142,
"eval_steps_per_second": 1.322,
"step": 539
},
{
"epoch": 1.2783505154639174,
"grad_norm": 1.2719723678439783,
"learning_rate": 1.464446689235314e-07,
"loss": 1.5694,
"step": 540
},
{
"epoch": 1.2809278350515463,
"grad_norm": 1.094905461428815,
"learning_rate": 1.4626484485854228e-07,
"loss": 1.4405,
"step": 541
},
{
"epoch": 1.2835051546391751,
"grad_norm": 1.1572746515704029,
"learning_rate": 1.4608483024362303e-07,
"loss": 1.49,
"step": 542
},
{
"epoch": 1.286082474226804,
"grad_norm": 1.2136708668686302,
"learning_rate": 1.4590462582019566e-07,
"loss": 1.5488,
"step": 543
},
{
"epoch": 1.2886597938144329,
"grad_norm": 1.1351781538641772,
"learning_rate": 1.4572423233046385e-07,
"loss": 1.4436,
"step": 544
},
{
"epoch": 1.291237113402062,
"grad_norm": 1.2233902585418839,
"learning_rate": 1.455436505174101e-07,
"loss": 1.4752,
"step": 545
},
{
"epoch": 1.2938144329896908,
"grad_norm": 1.2111257906769834,
"learning_rate": 1.453628811247924e-07,
"loss": 1.5437,
"step": 546
},
{
"epoch": 1.2963917525773196,
"grad_norm": 1.214330730454999,
"learning_rate": 1.4518192489714148e-07,
"loss": 1.5874,
"step": 547
},
{
"epoch": 1.2989690721649485,
"grad_norm": 1.1501171354212085,
"learning_rate": 1.4500078257975746e-07,
"loss": 1.4441,
"step": 548
},
{
"epoch": 1.3015463917525774,
"grad_norm": 1.256132517451847,
"learning_rate": 1.4481945491870692e-07,
"loss": 1.4869,
"step": 549
},
{
"epoch": 1.3041237113402062,
"grad_norm": 1.153698353782002,
"learning_rate": 1.4463794266081993e-07,
"loss": 1.4298,
"step": 550
},
{
"epoch": 1.306701030927835,
"grad_norm": 1.1141900425922164,
"learning_rate": 1.4445624655368672e-07,
"loss": 1.4081,
"step": 551
},
{
"epoch": 1.309278350515464,
"grad_norm": 1.1727962553732723,
"learning_rate": 1.4427436734565474e-07,
"loss": 1.4843,
"step": 552
},
{
"epoch": 1.3118556701030928,
"grad_norm": 1.1904748231664284,
"learning_rate": 1.4409230578582564e-07,
"loss": 1.4408,
"step": 553
},
{
"epoch": 1.3144329896907216,
"grad_norm": 1.1596562097777137,
"learning_rate": 1.4391006262405212e-07,
"loss": 1.5078,
"step": 554
},
{
"epoch": 1.3170103092783505,
"grad_norm": 1.1362387372168263,
"learning_rate": 1.4372763861093478e-07,
"loss": 1.4596,
"step": 555
},
{
"epoch": 1.3195876288659794,
"grad_norm": 1.2438435278065572,
"learning_rate": 1.4354503449781913e-07,
"loss": 1.536,
"step": 556
},
{
"epoch": 1.3221649484536082,
"grad_norm": 1.182522665170931,
"learning_rate": 1.4336225103679243e-07,
"loss": 1.5611,
"step": 557
},
{
"epoch": 1.324742268041237,
"grad_norm": 1.2822957709992338,
"learning_rate": 1.4317928898068066e-07,
"loss": 1.4826,
"step": 558
},
{
"epoch": 1.327319587628866,
"grad_norm": 1.2758012985116745,
"learning_rate": 1.4299614908304528e-07,
"loss": 1.4543,
"step": 559
},
{
"epoch": 1.3298969072164948,
"grad_norm": 1.164766457118801,
"learning_rate": 1.4281283209818038e-07,
"loss": 1.4061,
"step": 560
},
{
"epoch": 1.3324742268041236,
"grad_norm": 1.1663065580316805,
"learning_rate": 1.4262933878110923e-07,
"loss": 1.5151,
"step": 561
},
{
"epoch": 1.3350515463917525,
"grad_norm": 1.1525726704239359,
"learning_rate": 1.4244566988758152e-07,
"loss": 1.5209,
"step": 562
},
{
"epoch": 1.3376288659793816,
"grad_norm": 1.194456252210575,
"learning_rate": 1.4226182617406994e-07,
"loss": 1.5003,
"step": 563
},
{
"epoch": 1.3402061855670104,
"grad_norm": 1.2788205228042828,
"learning_rate": 1.4207780839776734e-07,
"loss": 1.5807,
"step": 564
},
{
"epoch": 1.3427835051546393,
"grad_norm": 1.2101911204508933,
"learning_rate": 1.4189361731658336e-07,
"loss": 1.4851,
"step": 565
},
{
"epoch": 1.3453608247422681,
"grad_norm": 1.143725315674112,
"learning_rate": 1.417092536891415e-07,
"loss": 1.5258,
"step": 566
},
{
"epoch": 1.347938144329897,
"grad_norm": 1.1692223610404973,
"learning_rate": 1.4152471827477593e-07,
"loss": 1.4843,
"step": 567
},
{
"epoch": 1.3505154639175259,
"grad_norm": 1.106947712823219,
"learning_rate": 1.413400118335283e-07,
"loss": 1.4339,
"step": 568
},
{
"epoch": 1.3530927835051547,
"grad_norm": 1.27487397886756,
"learning_rate": 1.4115513512614468e-07,
"loss": 1.4993,
"step": 569
},
{
"epoch": 1.3556701030927836,
"grad_norm": 1.2236429851509971,
"learning_rate": 1.4097008891407245e-07,
"loss": 1.4858,
"step": 570
},
{
"epoch": 1.3582474226804124,
"grad_norm": 1.156634200386137,
"learning_rate": 1.407848739594571e-07,
"loss": 1.4973,
"step": 571
},
{
"epoch": 1.3608247422680413,
"grad_norm": 1.287092803375809,
"learning_rate": 1.4059949102513913e-07,
"loss": 1.476,
"step": 572
},
{
"epoch": 1.3634020618556701,
"grad_norm": 1.2572273439235049,
"learning_rate": 1.404139408746508e-07,
"loss": 1.4798,
"step": 573
},
{
"epoch": 1.365979381443299,
"grad_norm": 1.2276167223192924,
"learning_rate": 1.4022822427221322e-07,
"loss": 1.497,
"step": 574
},
{
"epoch": 1.3685567010309279,
"grad_norm": 1.2392858668139202,
"learning_rate": 1.4004234198273302e-07,
"loss": 1.5471,
"step": 575
},
{
"epoch": 1.3711340206185567,
"grad_norm": 1.2887104141411092,
"learning_rate": 1.3985629477179915e-07,
"loss": 1.4953,
"step": 576
},
{
"epoch": 1.3737113402061856,
"grad_norm": 1.2401450542055277,
"learning_rate": 1.3967008340567996e-07,
"loss": 1.5095,
"step": 577
},
{
"epoch": 1.3762886597938144,
"grad_norm": 1.1989888153377388,
"learning_rate": 1.3948370865131977e-07,
"loss": 1.5633,
"step": 578
},
{
"epoch": 1.3788659793814433,
"grad_norm": 1.1616958019574242,
"learning_rate": 1.3929717127633597e-07,
"loss": 1.5035,
"step": 579
},
{
"epoch": 1.3814432989690721,
"grad_norm": 1.1581446950268255,
"learning_rate": 1.3911047204901557e-07,
"loss": 1.5232,
"step": 580
},
{
"epoch": 1.384020618556701,
"grad_norm": 1.2240328360723358,
"learning_rate": 1.3892361173831243e-07,
"loss": 1.4948,
"step": 581
},
{
"epoch": 1.3865979381443299,
"grad_norm": 1.2405325514200207,
"learning_rate": 1.3873659111384362e-07,
"loss": 1.4815,
"step": 582
},
{
"epoch": 1.3891752577319587,
"grad_norm": 1.253563661932654,
"learning_rate": 1.385494109458866e-07,
"loss": 1.4284,
"step": 583
},
{
"epoch": 1.3917525773195876,
"grad_norm": 1.1541355431922666,
"learning_rate": 1.3836207200537596e-07,
"loss": 1.4213,
"step": 584
},
{
"epoch": 1.3943298969072164,
"grad_norm": 1.2315631871967962,
"learning_rate": 1.381745750639002e-07,
"loss": 1.5876,
"step": 585
},
{
"epoch": 1.3969072164948453,
"grad_norm": 1.3294003251532183,
"learning_rate": 1.3798692089369854e-07,
"loss": 1.5821,
"step": 586
},
{
"epoch": 1.3994845360824741,
"grad_norm": 1.1726722981119444,
"learning_rate": 1.3779911026765784e-07,
"loss": 1.4679,
"step": 587
},
{
"epoch": 1.402061855670103,
"grad_norm": 1.1782526174868226,
"learning_rate": 1.3761114395930927e-07,
"loss": 1.4851,
"step": 588
},
{
"epoch": 1.402061855670103,
"eval_loss": 1.4704606533050537,
"eval_runtime": 78.4306,
"eval_samples_per_second": 21.203,
"eval_steps_per_second": 1.326,
"step": 588
},
{
"epoch": 1.4046391752577319,
"grad_norm": 1.192382882455904,
"learning_rate": 1.3742302274282532e-07,
"loss": 1.4707,
"step": 589
},
{
"epoch": 1.4072164948453607,
"grad_norm": 1.1364191762169735,
"learning_rate": 1.3723474739301636e-07,
"loss": 1.4066,
"step": 590
},
{
"epoch": 1.4097938144329896,
"grad_norm": 1.1453269827664123,
"learning_rate": 1.3704631868532767e-07,
"loss": 1.4515,
"step": 591
},
{
"epoch": 1.4123711340206184,
"grad_norm": 1.1956529180296382,
"learning_rate": 1.3685773739583617e-07,
"loss": 1.5102,
"step": 592
},
{
"epoch": 1.4149484536082475,
"grad_norm": 1.2356880855065446,
"learning_rate": 1.3666900430124717e-07,
"loss": 1.497,
"step": 593
},
{
"epoch": 1.4175257731958764,
"grad_norm": 1.1639642247143227,
"learning_rate": 1.3648012017889121e-07,
"loss": 1.485,
"step": 594
},
{
"epoch": 1.4201030927835052,
"grad_norm": 1.3028192646105916,
"learning_rate": 1.3629108580672093e-07,
"loss": 1.5073,
"step": 595
},
{
"epoch": 1.422680412371134,
"grad_norm": 1.1389634389377659,
"learning_rate": 1.3610190196330775e-07,
"loss": 1.4455,
"step": 596
},
{
"epoch": 1.425257731958763,
"grad_norm": 1.2726998162356058,
"learning_rate": 1.3591256942783868e-07,
"loss": 1.6226,
"step": 597
},
{
"epoch": 1.4278350515463918,
"grad_norm": 1.257001783763068,
"learning_rate": 1.3572308898011326e-07,
"loss": 1.4527,
"step": 598
},
{
"epoch": 1.4304123711340206,
"grad_norm": 1.1897100853456886,
"learning_rate": 1.3553346140054013e-07,
"loss": 1.4748,
"step": 599
},
{
"epoch": 1.4329896907216495,
"grad_norm": 1.144640373535268,
"learning_rate": 1.3534368747013394e-07,
"loss": 1.4733,
"step": 600
},
{
"epoch": 1.4355670103092784,
"grad_norm": 1.2252518120948153,
"learning_rate": 1.351537679705121e-07,
"loss": 1.4539,
"step": 601
},
{
"epoch": 1.4381443298969072,
"grad_norm": 1.1565118663607803,
"learning_rate": 1.3496370368389165e-07,
"loss": 1.5236,
"step": 602
},
{
"epoch": 1.440721649484536,
"grad_norm": 1.2594818027515957,
"learning_rate": 1.3477349539308584e-07,
"loss": 1.4856,
"step": 603
},
{
"epoch": 1.443298969072165,
"grad_norm": 1.1419387268061763,
"learning_rate": 1.3458314388150115e-07,
"loss": 1.4153,
"step": 604
},
{
"epoch": 1.4458762886597938,
"grad_norm": 1.098148594961463,
"learning_rate": 1.3439264993313385e-07,
"loss": 1.4447,
"step": 605
},
{
"epoch": 1.4484536082474226,
"grad_norm": 1.2022510861175644,
"learning_rate": 1.342020143325669e-07,
"loss": 1.5516,
"step": 606
},
{
"epoch": 1.4510309278350515,
"grad_norm": 1.1444341747665796,
"learning_rate": 1.3401123786496663e-07,
"loss": 1.4224,
"step": 607
},
{
"epoch": 1.4536082474226804,
"grad_norm": 1.1349715757276768,
"learning_rate": 1.3382032131607965e-07,
"loss": 1.3973,
"step": 608
},
{
"epoch": 1.4561855670103092,
"grad_norm": 1.1228999228709107,
"learning_rate": 1.3362926547222946e-07,
"loss": 1.4149,
"step": 609
},
{
"epoch": 1.458762886597938,
"grad_norm": 1.2396644989009444,
"learning_rate": 1.3343807112031327e-07,
"loss": 1.4999,
"step": 610
},
{
"epoch": 1.4613402061855671,
"grad_norm": 1.1458789067959891,
"learning_rate": 1.3324673904779874e-07,
"loss": 1.4606,
"step": 611
},
{
"epoch": 1.463917525773196,
"grad_norm": 1.1579136550048348,
"learning_rate": 1.3305527004272087e-07,
"loss": 1.5091,
"step": 612
},
{
"epoch": 1.4664948453608249,
"grad_norm": 1.1065943702186947,
"learning_rate": 1.3286366489367846e-07,
"loss": 1.4981,
"step": 613
},
{
"epoch": 1.4690721649484537,
"grad_norm": 1.1701708173193963,
"learning_rate": 1.3267192438983117e-07,
"loss": 1.4864,
"step": 614
},
{
"epoch": 1.4716494845360826,
"grad_norm": 1.1655119326822228,
"learning_rate": 1.324800493208961e-07,
"loss": 1.4609,
"step": 615
},
{
"epoch": 1.4742268041237114,
"grad_norm": 1.1668952825289185,
"learning_rate": 1.322880404771446e-07,
"loss": 1.5529,
"step": 616
},
{
"epoch": 1.4768041237113403,
"grad_norm": 1.1827027818749032,
"learning_rate": 1.3209589864939906e-07,
"loss": 1.4712,
"step": 617
},
{
"epoch": 1.4793814432989691,
"grad_norm": 1.16644526665677,
"learning_rate": 1.3190362462902935e-07,
"loss": 1.4444,
"step": 618
},
{
"epoch": 1.481958762886598,
"grad_norm": 1.2457925422571992,
"learning_rate": 1.3171121920795012e-07,
"loss": 1.5042,
"step": 619
},
{
"epoch": 1.4845360824742269,
"grad_norm": 1.1848287601135188,
"learning_rate": 1.3151868317861698e-07,
"loss": 1.5314,
"step": 620
},
{
"epoch": 1.4871134020618557,
"grad_norm": 1.181022425488295,
"learning_rate": 1.3132601733402355e-07,
"loss": 1.5557,
"step": 621
},
{
"epoch": 1.4896907216494846,
"grad_norm": 1.2220291945868886,
"learning_rate": 1.3113322246769816e-07,
"loss": 1.4743,
"step": 622
},
{
"epoch": 1.4922680412371134,
"grad_norm": 1.23521757296614,
"learning_rate": 1.3094029937370049e-07,
"loss": 1.4494,
"step": 623
},
{
"epoch": 1.4948453608247423,
"grad_norm": 1.1540829106187,
"learning_rate": 1.3074724884661832e-07,
"loss": 1.492,
"step": 624
},
{
"epoch": 1.4974226804123711,
"grad_norm": 1.2734897659131177,
"learning_rate": 1.3055407168156436e-07,
"loss": 1.5114,
"step": 625
},
{
"epoch": 1.5,
"grad_norm": 1.0821110483827021,
"learning_rate": 1.3036076867417286e-07,
"loss": 1.4899,
"step": 626
},
{
"epoch": 1.5025773195876289,
"grad_norm": 1.1591573630093586,
"learning_rate": 1.3016734062059636e-07,
"loss": 1.4287,
"step": 627
},
{
"epoch": 1.5051546391752577,
"grad_norm": 1.252040765136315,
"learning_rate": 1.299737883175024e-07,
"loss": 1.4215,
"step": 628
},
{
"epoch": 1.5077319587628866,
"grad_norm": 1.122072741553452,
"learning_rate": 1.2978011256207041e-07,
"loss": 1.4535,
"step": 629
},
{
"epoch": 1.5103092783505154,
"grad_norm": 1.1929144211640363,
"learning_rate": 1.2958631415198813e-07,
"loss": 1.4264,
"step": 630
},
{
"epoch": 1.5128865979381443,
"grad_norm": 1.1904423534607285,
"learning_rate": 1.293923938854485e-07,
"loss": 1.4966,
"step": 631
},
{
"epoch": 1.5154639175257731,
"grad_norm": 1.2142748405878527,
"learning_rate": 1.2919835256114638e-07,
"loss": 1.4152,
"step": 632
},
{
"epoch": 1.518041237113402,
"grad_norm": 1.2310572109795892,
"learning_rate": 1.290041909782752e-07,
"loss": 1.3986,
"step": 633
},
{
"epoch": 1.5206185567010309,
"grad_norm": 1.1532910482056786,
"learning_rate": 1.2880990993652377e-07,
"loss": 1.4606,
"step": 634
},
{
"epoch": 1.5231958762886597,
"grad_norm": 1.3007475509786544,
"learning_rate": 1.2861551023607276e-07,
"loss": 1.5304,
"step": 635
},
{
"epoch": 1.5257731958762886,
"grad_norm": 1.15166741332348,
"learning_rate": 1.2842099267759174e-07,
"loss": 1.3824,
"step": 636
},
{
"epoch": 1.5283505154639174,
"grad_norm": 1.1988826738728366,
"learning_rate": 1.2822635806223556e-07,
"loss": 1.567,
"step": 637
},
{
"epoch": 1.5283505154639174,
"eval_loss": 1.4671498537063599,
"eval_runtime": 78.5049,
"eval_samples_per_second": 21.183,
"eval_steps_per_second": 1.325,
"step": 637
},
{
"epoch": 1.5309278350515463,
"grad_norm": 1.2386857438447851,
"learning_rate": 1.2803160719164125e-07,
"loss": 1.5304,
"step": 638
},
{
"epoch": 1.5335051546391751,
"grad_norm": 1.1597769415791235,
"learning_rate": 1.2783674086792466e-07,
"loss": 1.497,
"step": 639
},
{
"epoch": 1.536082474226804,
"grad_norm": 1.2924257071547485,
"learning_rate": 1.2764175989367717e-07,
"loss": 1.4877,
"step": 640
},
{
"epoch": 1.5386597938144329,
"grad_norm": 1.222248016944084,
"learning_rate": 1.2744666507196224e-07,
"loss": 1.5257,
"step": 641
},
{
"epoch": 1.5412371134020617,
"grad_norm": 1.0852012266696331,
"learning_rate": 1.2725145720631242e-07,
"loss": 1.4657,
"step": 642
},
{
"epoch": 1.5438144329896906,
"grad_norm": 1.2029751793520205,
"learning_rate": 1.2705613710072573e-07,
"loss": 1.543,
"step": 643
},
{
"epoch": 1.5463917525773194,
"grad_norm": 1.1786774736346322,
"learning_rate": 1.2686070555966252e-07,
"loss": 1.4163,
"step": 644
},
{
"epoch": 1.5489690721649485,
"grad_norm": 1.1923466397926792,
"learning_rate": 1.2666516338804208e-07,
"loss": 1.449,
"step": 645
},
{
"epoch": 1.5515463917525774,
"grad_norm": 1.1491363181852474,
"learning_rate": 1.2646951139123932e-07,
"loss": 1.4773,
"step": 646
},
{
"epoch": 1.5541237113402062,
"grad_norm": 1.1921001128896263,
"learning_rate": 1.2627375037508162e-07,
"loss": 1.4596,
"step": 647
},
{
"epoch": 1.556701030927835,
"grad_norm": 1.2215090538297548,
"learning_rate": 1.2607788114584522e-07,
"loss": 1.5697,
"step": 648
},
{
"epoch": 1.559278350515464,
"grad_norm": 1.1364987023852344,
"learning_rate": 1.2588190451025208e-07,
"loss": 1.4126,
"step": 649
},
{
"epoch": 1.5618556701030928,
"grad_norm": 1.139874297388743,
"learning_rate": 1.2568582127546662e-07,
"loss": 1.4104,
"step": 650
},
{
"epoch": 1.5644329896907216,
"grad_norm": 1.1273021800754177,
"learning_rate": 1.2548963224909223e-07,
"loss": 1.4407,
"step": 651
},
{
"epoch": 1.5670103092783505,
"grad_norm": 1.1999146152571862,
"learning_rate": 1.2529333823916806e-07,
"loss": 1.4779,
"step": 652
},
{
"epoch": 1.5695876288659794,
"grad_norm": 1.1170496605169837,
"learning_rate": 1.2509694005416563e-07,
"loss": 1.4368,
"step": 653
},
{
"epoch": 1.5721649484536082,
"grad_norm": 1.099167093974349,
"learning_rate": 1.2490043850298557e-07,
"loss": 1.4932,
"step": 654
},
{
"epoch": 1.574742268041237,
"grad_norm": 1.219342527534671,
"learning_rate": 1.2470383439495416e-07,
"loss": 1.4633,
"step": 655
},
{
"epoch": 1.577319587628866,
"grad_norm": 1.2125741355588842,
"learning_rate": 1.2450712853982014e-07,
"loss": 1.5161,
"step": 656
},
{
"epoch": 1.579896907216495,
"grad_norm": 1.2755825455134522,
"learning_rate": 1.2431032174775127e-07,
"loss": 1.5225,
"step": 657
},
{
"epoch": 1.5824742268041239,
"grad_norm": 1.1521606084223124,
"learning_rate": 1.2411341482933108e-07,
"loss": 1.4308,
"step": 658
},
{
"epoch": 1.5850515463917527,
"grad_norm": 1.165275382475451,
"learning_rate": 1.239164085955555e-07,
"loss": 1.5024,
"step": 659
},
{
"epoch": 1.5876288659793816,
"grad_norm": 1.2609655964912305,
"learning_rate": 1.2371930385782943e-07,
"loss": 1.4669,
"step": 660
},
{
"epoch": 1.5902061855670104,
"grad_norm": 1.1698575645046683,
"learning_rate": 1.2352210142796356e-07,
"loss": 1.4752,
"step": 661
},
{
"epoch": 1.5927835051546393,
"grad_norm": 1.1966335794904208,
"learning_rate": 1.2332480211817091e-07,
"loss": 1.5478,
"step": 662
},
{
"epoch": 1.5953608247422681,
"grad_norm": 1.081476396234954,
"learning_rate": 1.2312740674106347e-07,
"loss": 1.451,
"step": 663
},
{
"epoch": 1.597938144329897,
"grad_norm": 1.2089145441748135,
"learning_rate": 1.22929916109649e-07,
"loss": 1.4975,
"step": 664
},
{
"epoch": 1.6005154639175259,
"grad_norm": 1.2416284172109027,
"learning_rate": 1.227323310373275e-07,
"loss": 1.43,
"step": 665
},
{
"epoch": 1.6030927835051547,
"grad_norm": 1.2758382819864167,
"learning_rate": 1.2253465233788794e-07,
"loss": 1.4589,
"step": 666
},
{
"epoch": 1.6056701030927836,
"grad_norm": 1.1736803322764697,
"learning_rate": 1.22336880825505e-07,
"loss": 1.3896,
"step": 667
},
{
"epoch": 1.6082474226804124,
"grad_norm": 1.1927775409437176,
"learning_rate": 1.2213901731473551e-07,
"loss": 1.5394,
"step": 668
},
{
"epoch": 1.6108247422680413,
"grad_norm": 1.2264294531171918,
"learning_rate": 1.219410626205153e-07,
"loss": 1.4543,
"step": 669
},
{
"epoch": 1.6134020618556701,
"grad_norm": 1.2693861374653377,
"learning_rate": 1.217430175581557e-07,
"loss": 1.484,
"step": 670
},
{
"epoch": 1.615979381443299,
"grad_norm": 1.2665036241537893,
"learning_rate": 1.2154488294334027e-07,
"loss": 1.5607,
"step": 671
},
{
"epoch": 1.6185567010309279,
"grad_norm": 1.1703235363860394,
"learning_rate": 1.2134665959212136e-07,
"loss": 1.4644,
"step": 672
},
{
"epoch": 1.6211340206185567,
"grad_norm": 1.193069004037872,
"learning_rate": 1.211483483209169e-07,
"loss": 1.4888,
"step": 673
},
{
"epoch": 1.6237113402061856,
"grad_norm": 1.2361705074035756,
"learning_rate": 1.209499499465068e-07,
"loss": 1.4504,
"step": 674
},
{
"epoch": 1.6262886597938144,
"grad_norm": 1.095084009584948,
"learning_rate": 1.2075146528602983e-07,
"loss": 1.4828,
"step": 675
},
{
"epoch": 1.6288659793814433,
"grad_norm": 1.1262123200952905,
"learning_rate": 1.2055289515698006e-07,
"loss": 1.5487,
"step": 676
},
{
"epoch": 1.6314432989690721,
"grad_norm": 1.1378828378426857,
"learning_rate": 1.2035424037720364e-07,
"loss": 1.4921,
"step": 677
},
{
"epoch": 1.634020618556701,
"grad_norm": 1.1961288239091903,
"learning_rate": 1.2015550176489537e-07,
"loss": 1.4421,
"step": 678
},
{
"epoch": 1.6365979381443299,
"grad_norm": 1.1366747357584532,
"learning_rate": 1.199566801385953e-07,
"loss": 1.4392,
"step": 679
},
{
"epoch": 1.6391752577319587,
"grad_norm": 1.1909816425714403,
"learning_rate": 1.1975777631718532e-07,
"loss": 1.5001,
"step": 680
},
{
"epoch": 1.6417525773195876,
"grad_norm": 1.2963539362844378,
"learning_rate": 1.19558791119886e-07,
"loss": 1.4605,
"step": 681
},
{
"epoch": 1.6443298969072164,
"grad_norm": 1.1580390642200817,
"learning_rate": 1.19359725366253e-07,
"loss": 1.5063,
"step": 682
},
{
"epoch": 1.6469072164948453,
"grad_norm": 1.216487820544871,
"learning_rate": 1.1916057987617374e-07,
"loss": 1.4886,
"step": 683
},
{
"epoch": 1.6494845360824741,
"grad_norm": 1.2218109581350323,
"learning_rate": 1.1896135546986407e-07,
"loss": 1.4608,
"step": 684
},
{
"epoch": 1.652061855670103,
"grad_norm": 1.2280111906896558,
"learning_rate": 1.1876205296786493e-07,
"loss": 1.5096,
"step": 685
},
{
"epoch": 1.6546391752577319,
"grad_norm": 1.2166796078055058,
"learning_rate": 1.1856267319103876e-07,
"loss": 1.4692,
"step": 686
},
{
"epoch": 1.6546391752577319,
"eval_loss": 1.4642903804779053,
"eval_runtime": 78.6766,
"eval_samples_per_second": 21.137,
"eval_steps_per_second": 1.322,
"step": 686
},
{
"epoch": 1.6572164948453607,
"grad_norm": 1.1939355446450859,
"learning_rate": 1.1836321696056645e-07,
"loss": 1.4137,
"step": 687
},
{
"epoch": 1.6597938144329896,
"grad_norm": 1.2546613486361071,
"learning_rate": 1.1816368509794363e-07,
"loss": 1.512,
"step": 688
},
{
"epoch": 1.6623711340206184,
"grad_norm": 1.1366449756739982,
"learning_rate": 1.1796407842497753e-07,
"loss": 1.3836,
"step": 689
},
{
"epoch": 1.6649484536082473,
"grad_norm": 1.2553355162175337,
"learning_rate": 1.1776439776378351e-07,
"loss": 1.4565,
"step": 690
},
{
"epoch": 1.6675257731958761,
"grad_norm": 1.20909630643183,
"learning_rate": 1.1756464393678151e-07,
"loss": 1.4481,
"step": 691
},
{
"epoch": 1.670103092783505,
"grad_norm": 1.2273438479078924,
"learning_rate": 1.1736481776669305e-07,
"loss": 1.4903,
"step": 692
},
{
"epoch": 1.672680412371134,
"grad_norm": 1.1909626287045671,
"learning_rate": 1.1716492007653737e-07,
"loss": 1.5012,
"step": 693
},
{
"epoch": 1.675257731958763,
"grad_norm": 1.158968259505721,
"learning_rate": 1.1696495168962845e-07,
"loss": 1.5465,
"step": 694
},
{
"epoch": 1.6778350515463918,
"grad_norm": 1.1963581026774628,
"learning_rate": 1.1676491342957142e-07,
"loss": 1.4729,
"step": 695
},
{
"epoch": 1.6804123711340206,
"grad_norm": 1.215536392765087,
"learning_rate": 1.1656480612025911e-07,
"loss": 1.4164,
"step": 696
},
{
"epoch": 1.6829896907216495,
"grad_norm": 1.0521259077304612,
"learning_rate": 1.163646305858688e-07,
"loss": 1.3678,
"step": 697
},
{
"epoch": 1.6855670103092784,
"grad_norm": 1.295543359347737,
"learning_rate": 1.1616438765085881e-07,
"loss": 1.57,
"step": 698
},
{
"epoch": 1.6881443298969072,
"grad_norm": 1.1720574150387943,
"learning_rate": 1.1596407813996498e-07,
"loss": 1.5221,
"step": 699
},
{
"epoch": 1.690721649484536,
"grad_norm": 1.186785802460397,
"learning_rate": 1.1576370287819735e-07,
"loss": 1.4673,
"step": 700
},
{
"epoch": 1.693298969072165,
"grad_norm": 1.1589224859683183,
"learning_rate": 1.155632626908369e-07,
"loss": 1.3919,
"step": 701
},
{
"epoch": 1.6958762886597938,
"grad_norm": 1.3034607577131674,
"learning_rate": 1.1536275840343183e-07,
"loss": 1.43,
"step": 702
},
{
"epoch": 1.6984536082474226,
"grad_norm": 1.1721298121139936,
"learning_rate": 1.1516219084179448e-07,
"loss": 1.5556,
"step": 703
},
{
"epoch": 1.7010309278350515,
"grad_norm": 1.164281783704574,
"learning_rate": 1.149615608319978e-07,
"loss": 1.4449,
"step": 704
},
{
"epoch": 1.7036082474226806,
"grad_norm": 1.1144845067827036,
"learning_rate": 1.1476086920037183e-07,
"loss": 1.5204,
"step": 705
},
{
"epoch": 1.7061855670103094,
"grad_norm": 1.1470381221039117,
"learning_rate": 1.1456011677350051e-07,
"loss": 1.4096,
"step": 706
},
{
"epoch": 1.7087628865979383,
"grad_norm": 1.1938066626201722,
"learning_rate": 1.1435930437821812e-07,
"loss": 1.4299,
"step": 707
},
{
"epoch": 1.7113402061855671,
"grad_norm": 1.389576843014182,
"learning_rate": 1.1415843284160598e-07,
"loss": 1.4736,
"step": 708
},
{
"epoch": 1.713917525773196,
"grad_norm": 1.1886965701829686,
"learning_rate": 1.1395750299098899e-07,
"loss": 1.4972,
"step": 709
},
{
"epoch": 1.7164948453608249,
"grad_norm": 1.1389546972088997,
"learning_rate": 1.1375651565393218e-07,
"loss": 1.5518,
"step": 710
},
{
"epoch": 1.7190721649484537,
"grad_norm": 1.1743796585118387,
"learning_rate": 1.1355547165823738e-07,
"loss": 1.4672,
"step": 711
},
{
"epoch": 1.7216494845360826,
"grad_norm": 1.1460525519017093,
"learning_rate": 1.1335437183193979e-07,
"loss": 1.5233,
"step": 712
},
{
"epoch": 1.7242268041237114,
"grad_norm": 1.1586406558048044,
"learning_rate": 1.1315321700330454e-07,
"loss": 1.4686,
"step": 713
},
{
"epoch": 1.7268041237113403,
"grad_norm": 1.1369470779252082,
"learning_rate": 1.1295200800082326e-07,
"loss": 1.4688,
"step": 714
},
{
"epoch": 1.7293814432989691,
"grad_norm": 1.1705799315615684,
"learning_rate": 1.1275074565321078e-07,
"loss": 1.3893,
"step": 715
},
{
"epoch": 1.731958762886598,
"grad_norm": 1.1725120595380418,
"learning_rate": 1.125494307894016e-07,
"loss": 1.537,
"step": 716
},
{
"epoch": 1.7345360824742269,
"grad_norm": 1.0734797144766555,
"learning_rate": 1.1234806423854653e-07,
"loss": 1.4388,
"step": 717
},
{
"epoch": 1.7371134020618557,
"grad_norm": 1.170033873518124,
"learning_rate": 1.1214664683000924e-07,
"loss": 1.3753,
"step": 718
},
{
"epoch": 1.7396907216494846,
"grad_norm": 1.231373540289329,
"learning_rate": 1.1194517939336287e-07,
"loss": 1.5497,
"step": 719
},
{
"epoch": 1.7422680412371134,
"grad_norm": 1.1946433920559838,
"learning_rate": 1.1174366275838662e-07,
"loss": 1.413,
"step": 720
},
{
"epoch": 1.7448453608247423,
"grad_norm": 1.1418431201062664,
"learning_rate": 1.115420977550624e-07,
"loss": 1.4914,
"step": 721
},
{
"epoch": 1.7474226804123711,
"grad_norm": 1.2072128707535221,
"learning_rate": 1.1134048521357115e-07,
"loss": 1.4836,
"step": 722
},
{
"epoch": 1.75,
"grad_norm": 1.194692316000769,
"learning_rate": 1.1113882596428976e-07,
"loss": 1.4389,
"step": 723
},
{
"epoch": 1.7525773195876289,
"grad_norm": 1.181835370102449,
"learning_rate": 1.1093712083778746e-07,
"loss": 1.4542,
"step": 724
},
{
"epoch": 1.7551546391752577,
"grad_norm": 1.1310122085797796,
"learning_rate": 1.1073537066482235e-07,
"loss": 1.4572,
"step": 725
},
{
"epoch": 1.7577319587628866,
"grad_norm": 1.1299559219838877,
"learning_rate": 1.1053357627633821e-07,
"loss": 1.5374,
"step": 726
},
{
"epoch": 1.7603092783505154,
"grad_norm": 1.2302892939334757,
"learning_rate": 1.1033173850346081e-07,
"loss": 1.5156,
"step": 727
},
{
"epoch": 1.7628865979381443,
"grad_norm": 1.1376050539784393,
"learning_rate": 1.1012985817749462e-07,
"loss": 1.4994,
"step": 728
},
{
"epoch": 1.7654639175257731,
"grad_norm": 1.1912506938583958,
"learning_rate": 1.0992793612991946e-07,
"loss": 1.5358,
"step": 729
},
{
"epoch": 1.768041237113402,
"grad_norm": 1.2323374068579527,
"learning_rate": 1.097259731923869e-07,
"loss": 1.5446,
"step": 730
},
{
"epoch": 1.7706185567010309,
"grad_norm": 1.2255437302126448,
"learning_rate": 1.0952397019671694e-07,
"loss": 1.413,
"step": 731
},
{
"epoch": 1.7731958762886597,
"grad_norm": 1.2608512214948044,
"learning_rate": 1.0932192797489459e-07,
"loss": 1.5306,
"step": 732
},
{
"epoch": 1.7757731958762886,
"grad_norm": 1.187848987827898,
"learning_rate": 1.0911984735906635e-07,
"loss": 1.4589,
"step": 733
},
{
"epoch": 1.7783505154639174,
"grad_norm": 1.1078353763626878,
"learning_rate": 1.0891772918153694e-07,
"loss": 1.5026,
"step": 734
},
{
"epoch": 1.7809278350515463,
"grad_norm": 1.1847073079284023,
"learning_rate": 1.0871557427476584e-07,
"loss": 1.4819,
"step": 735
},
{
"epoch": 1.7809278350515463,
"eval_loss": 1.4616869688034058,
"eval_runtime": 78.6285,
"eval_samples_per_second": 21.15,
"eval_steps_per_second": 1.323,
"step": 735
},
{
"epoch": 1.7835051546391751,
"grad_norm": 1.1649395427594373,
"learning_rate": 1.0851338347136356e-07,
"loss": 1.5143,
"step": 736
},
{
"epoch": 1.786082474226804,
"grad_norm": 1.284550306447524,
"learning_rate": 1.0831115760408871e-07,
"loss": 1.4542,
"step": 737
},
{
"epoch": 1.7886597938144329,
"grad_norm": 1.1925120790488934,
"learning_rate": 1.0810889750584424e-07,
"loss": 1.426,
"step": 738
},
{
"epoch": 1.7912371134020617,
"grad_norm": 1.178551347790486,
"learning_rate": 1.07906604009674e-07,
"loss": 1.4372,
"step": 739
},
{
"epoch": 1.7938144329896906,
"grad_norm": 1.2458332188073578,
"learning_rate": 1.077042779487595e-07,
"loss": 1.5252,
"step": 740
},
{
"epoch": 1.7963917525773194,
"grad_norm": 1.2661697455131442,
"learning_rate": 1.0750192015641633e-07,
"loss": 1.5066,
"step": 741
},
{
"epoch": 1.7989690721649485,
"grad_norm": 1.1069806037454215,
"learning_rate": 1.0729953146609074e-07,
"loss": 1.4264,
"step": 742
},
{
"epoch": 1.8015463917525774,
"grad_norm": 1.194263854578521,
"learning_rate": 1.0709711271135635e-07,
"loss": 1.4339,
"step": 743
},
{
"epoch": 1.8041237113402062,
"grad_norm": 1.2068338783635435,
"learning_rate": 1.0689466472591048e-07,
"loss": 1.4341,
"step": 744
},
{
"epoch": 1.806701030927835,
"grad_norm": 1.1526056815131385,
"learning_rate": 1.066921883435709e-07,
"loss": 1.4382,
"step": 745
},
{
"epoch": 1.809278350515464,
"grad_norm": 1.1526436748662838,
"learning_rate": 1.0648968439827239e-07,
"loss": 1.4525,
"step": 746
},
{
"epoch": 1.8118556701030928,
"grad_norm": 1.2587407335769552,
"learning_rate": 1.0628715372406309e-07,
"loss": 1.4995,
"step": 747
},
{
"epoch": 1.8144329896907216,
"grad_norm": 1.2439345895593688,
"learning_rate": 1.0608459715510139e-07,
"loss": 1.4172,
"step": 748
},
{
"epoch": 1.8170103092783505,
"grad_norm": 1.2048841761527278,
"learning_rate": 1.058820155256523e-07,
"loss": 1.4536,
"step": 749
},
{
"epoch": 1.8195876288659794,
"grad_norm": 1.1712052519870668,
"learning_rate": 1.0567940967008396e-07,
"loss": 1.4739,
"step": 750
},
{
"epoch": 1.8221649484536082,
"grad_norm": 1.1253615480764265,
"learning_rate": 1.0547678042286435e-07,
"loss": 1.4362,
"step": 751
},
{
"epoch": 1.824742268041237,
"grad_norm": 1.1941314320057088,
"learning_rate": 1.0527412861855789e-07,
"loss": 1.5473,
"step": 752
},
{
"epoch": 1.827319587628866,
"grad_norm": 1.1515723933518516,
"learning_rate": 1.0507145509182169e-07,
"loss": 1.4095,
"step": 753
},
{
"epoch": 1.829896907216495,
"grad_norm": 1.1459437804868953,
"learning_rate": 1.0486876067740252e-07,
"loss": 1.4454,
"step": 754
},
{
"epoch": 1.8324742268041239,
"grad_norm": 1.2555188381647702,
"learning_rate": 1.0466604621013306e-07,
"loss": 1.5032,
"step": 755
},
{
"epoch": 1.8350515463917527,
"grad_norm": 1.173256763665764,
"learning_rate": 1.0446331252492864e-07,
"loss": 1.542,
"step": 756
},
{
"epoch": 1.8376288659793816,
"grad_norm": 1.1616854603706852,
"learning_rate": 1.0426056045678375e-07,
"loss": 1.3301,
"step": 757
},
{
"epoch": 1.8402061855670104,
"grad_norm": 1.1961580743330678,
"learning_rate": 1.0405779084076855e-07,
"loss": 1.5125,
"step": 758
},
{
"epoch": 1.8427835051546393,
"grad_norm": 1.153920316864521,
"learning_rate": 1.0385500451202549e-07,
"loss": 1.5104,
"step": 759
},
{
"epoch": 1.8453608247422681,
"grad_norm": 1.2288872831871334,
"learning_rate": 1.036522023057659e-07,
"loss": 1.54,
"step": 760
},
{
"epoch": 1.847938144329897,
"grad_norm": 1.1774978065006576,
"learning_rate": 1.0344938505726641e-07,
"loss": 1.4226,
"step": 761
},
{
"epoch": 1.8505154639175259,
"grad_norm": 1.18190720576571,
"learning_rate": 1.0324655360186567e-07,
"loss": 1.4874,
"step": 762
},
{
"epoch": 1.8530927835051547,
"grad_norm": 1.0881741375618583,
"learning_rate": 1.0304370877496089e-07,
"loss": 1.4196,
"step": 763
},
{
"epoch": 1.8556701030927836,
"grad_norm": 1.1920925526660484,
"learning_rate": 1.0284085141200423e-07,
"loss": 1.4022,
"step": 764
},
{
"epoch": 1.8582474226804124,
"grad_norm": 1.2553686949150205,
"learning_rate": 1.0263798234849954e-07,
"loss": 1.5411,
"step": 765
},
{
"epoch": 1.8608247422680413,
"grad_norm": 1.1849323570576418,
"learning_rate": 1.0243510241999897e-07,
"loss": 1.4376,
"step": 766
},
{
"epoch": 1.8634020618556701,
"grad_norm": 1.1748076105825112,
"learning_rate": 1.0223221246209918e-07,
"loss": 1.3917,
"step": 767
},
{
"epoch": 1.865979381443299,
"grad_norm": 1.1437404458677716,
"learning_rate": 1.0202931331043839e-07,
"loss": 1.5412,
"step": 768
},
{
"epoch": 1.8685567010309279,
"grad_norm": 1.1588752261265902,
"learning_rate": 1.0182640580069248e-07,
"loss": 1.4016,
"step": 769
},
{
"epoch": 1.8711340206185567,
"grad_norm": 1.244615607327111,
"learning_rate": 1.016234907685719e-07,
"loss": 1.4501,
"step": 770
},
{
"epoch": 1.8737113402061856,
"grad_norm": 1.1809049167530614,
"learning_rate": 1.0142056904981802e-07,
"loss": 1.4637,
"step": 771
},
{
"epoch": 1.8762886597938144,
"grad_norm": 1.1101634996349434,
"learning_rate": 1.0121764148019975e-07,
"loss": 1.4228,
"step": 772
},
{
"epoch": 1.8788659793814433,
"grad_norm": 1.2377079616714697,
"learning_rate": 1.0101470889551012e-07,
"loss": 1.4533,
"step": 773
},
{
"epoch": 1.8814432989690721,
"grad_norm": 1.160543485045226,
"learning_rate": 1.0081177213156278e-07,
"loss": 1.4735,
"step": 774
},
{
"epoch": 1.884020618556701,
"grad_norm": 1.115374473748354,
"learning_rate": 1.0060883202418861e-07,
"loss": 1.438,
"step": 775
},
{
"epoch": 1.8865979381443299,
"grad_norm": 1.1305131743119878,
"learning_rate": 1.004058894092323e-07,
"loss": 1.4186,
"step": 776
},
{
"epoch": 1.8891752577319587,
"grad_norm": 1.151990553361531,
"learning_rate": 1.0020294512254883e-07,
"loss": 1.5121,
"step": 777
},
{
"epoch": 1.8917525773195876,
"grad_norm": 1.1278991620860568,
"learning_rate": 1e-07,
"loss": 1.4333,
"step": 778
},
{
"epoch": 1.8943298969072164,
"grad_norm": 1.281137685220673,
"learning_rate": 9.97970548774512e-08,
"loss": 1.4416,
"step": 779
},
{
"epoch": 1.8969072164948453,
"grad_norm": 1.1772600120424532,
"learning_rate": 9.959411059076768e-08,
"loss": 1.409,
"step": 780
},
{
"epoch": 1.8994845360824741,
"grad_norm": 1.16485761208349,
"learning_rate": 9.939116797581138e-08,
"loss": 1.4324,
"step": 781
},
{
"epoch": 1.902061855670103,
"grad_norm": 1.203443440232203,
"learning_rate": 9.918822786843724e-08,
"loss": 1.4324,
"step": 782
},
{
"epoch": 1.9046391752577319,
"grad_norm": 1.20376421998538,
"learning_rate": 9.898529110448987e-08,
"loss": 1.501,
"step": 783
},
{
"epoch": 1.9072164948453607,
"grad_norm": 1.1533270795807118,
"learning_rate": 9.878235851980025e-08,
"loss": 1.404,
"step": 784
},
{
"epoch": 1.9072164948453607,
"eval_loss": 1.4594255685806274,
"eval_runtime": 78.6148,
"eval_samples_per_second": 21.154,
"eval_steps_per_second": 1.323,
"step": 784
},
{
"epoch": 1.9097938144329896,
"grad_norm": 1.1889743164637112,
"learning_rate": 9.857943095018198e-08,
"loss": 1.4652,
"step": 785
},
{
"epoch": 1.9123711340206184,
"grad_norm": 1.162304380840768,
"learning_rate": 9.837650923142809e-08,
"loss": 1.3641,
"step": 786
},
{
"epoch": 1.9149484536082473,
"grad_norm": 1.260002079711297,
"learning_rate": 9.817359419930751e-08,
"loss": 1.5022,
"step": 787
},
{
"epoch": 1.9175257731958761,
"grad_norm": 1.1295427248534264,
"learning_rate": 9.797068668956162e-08,
"loss": 1.4553,
"step": 788
},
{
"epoch": 1.920103092783505,
"grad_norm": 1.1730252131786578,
"learning_rate": 9.77677875379008e-08,
"loss": 1.4748,
"step": 789
},
{
"epoch": 1.922680412371134,
"grad_norm": 1.2020202803132716,
"learning_rate": 9.756489758000104e-08,
"loss": 1.4479,
"step": 790
},
{
"epoch": 1.925257731958763,
"grad_norm": 1.180219637473307,
"learning_rate": 9.736201765150045e-08,
"loss": 1.4974,
"step": 791
},
{
"epoch": 1.9278350515463918,
"grad_norm": 1.2291944688317633,
"learning_rate": 9.715914858799575e-08,
"loss": 1.4228,
"step": 792
},
{
"epoch": 1.9304123711340206,
"grad_norm": 1.1131303155372065,
"learning_rate": 9.69562912250391e-08,
"loss": 1.4693,
"step": 793
},
{
"epoch": 1.9329896907216495,
"grad_norm": 1.1994615231875885,
"learning_rate": 9.675344639813433e-08,
"loss": 1.4745,
"step": 794
},
{
"epoch": 1.9355670103092784,
"grad_norm": 1.115870585349576,
"learning_rate": 9.655061494273362e-08,
"loss": 1.4671,
"step": 795
},
{
"epoch": 1.9381443298969072,
"grad_norm": 1.2054754001979724,
"learning_rate": 9.63477976942341e-08,
"loss": 1.5195,
"step": 796
},
{
"epoch": 1.940721649484536,
"grad_norm": 1.1464295691900082,
"learning_rate": 9.614499548797452e-08,
"loss": 1.4402,
"step": 797
},
{
"epoch": 1.943298969072165,
"grad_norm": 1.1914103186703613,
"learning_rate": 9.594220915923148e-08,
"loss": 1.5797,
"step": 798
},
{
"epoch": 1.9458762886597938,
"grad_norm": 1.1615295842359556,
"learning_rate": 9.573943954321626e-08,
"loss": 1.4126,
"step": 799
},
{
"epoch": 1.9484536082474226,
"grad_norm": 1.19026250293737,
"learning_rate": 9.553668747507138e-08,
"loss": 1.4332,
"step": 800
},
{
"epoch": 1.9510309278350515,
"grad_norm": 1.1351877413773055,
"learning_rate": 9.533395378986697e-08,
"loss": 1.4784,
"step": 801
},
{
"epoch": 1.9536082474226806,
"grad_norm": 1.227791339106945,
"learning_rate": 9.51312393225975e-08,
"loss": 1.4392,
"step": 802
},
{
"epoch": 1.9561855670103094,
"grad_norm": 1.2100140189737674,
"learning_rate": 9.492854490817833e-08,
"loss": 1.4693,
"step": 803
},
{
"epoch": 1.9587628865979383,
"grad_norm": 1.0478682320033872,
"learning_rate": 9.472587138144214e-08,
"loss": 1.4117,
"step": 804
},
{
"epoch": 1.9613402061855671,
"grad_norm": 1.1920119917461085,
"learning_rate": 9.452321957713563e-08,
"loss": 1.556,
"step": 805
},
{
"epoch": 1.963917525773196,
"grad_norm": 1.1902655777598523,
"learning_rate": 9.432059032991606e-08,
"loss": 1.5102,
"step": 806
},
{
"epoch": 1.9664948453608249,
"grad_norm": 1.1511704775031535,
"learning_rate": 9.411798447434773e-08,
"loss": 1.5281,
"step": 807
},
{
"epoch": 1.9690721649484537,
"grad_norm": 1.1636100359208144,
"learning_rate": 9.39154028448986e-08,
"loss": 1.4024,
"step": 808
},
{
"epoch": 1.9716494845360826,
"grad_norm": 1.2582478560602157,
"learning_rate": 9.371284627593691e-08,
"loss": 1.4519,
"step": 809
},
{
"epoch": 1.9742268041237114,
"grad_norm": 1.1608958350691665,
"learning_rate": 9.351031560172764e-08,
"loss": 1.4286,
"step": 810
},
{
"epoch": 1.9768041237113403,
"grad_norm": 1.1725970187771935,
"learning_rate": 9.330781165642907e-08,
"loss": 1.4858,
"step": 811
},
{
"epoch": 1.9793814432989691,
"grad_norm": 1.181405747708069,
"learning_rate": 9.310533527408951e-08,
"loss": 1.5193,
"step": 812
},
{
"epoch": 1.981958762886598,
"grad_norm": 1.1949902203170548,
"learning_rate": 9.290288728864365e-08,
"loss": 1.3768,
"step": 813
},
{
"epoch": 1.9845360824742269,
"grad_norm": 1.2444243036816676,
"learning_rate": 9.270046853390924e-08,
"loss": 1.4866,
"step": 814
},
{
"epoch": 1.9871134020618557,
"grad_norm": 1.162040164523566,
"learning_rate": 9.249807984358369e-08,
"loss": 1.4277,
"step": 815
},
{
"epoch": 1.9896907216494846,
"grad_norm": 1.3041991278727916,
"learning_rate": 9.229572205124051e-08,
"loss": 1.4895,
"step": 816
},
{
"epoch": 1.9922680412371134,
"grad_norm": 1.1800946591513317,
"learning_rate": 9.2093395990326e-08,
"loss": 1.6118,
"step": 817
},
{
"epoch": 1.9948453608247423,
"grad_norm": 1.120730199367575,
"learning_rate": 9.189110249415576e-08,
"loss": 1.4777,
"step": 818
},
{
"epoch": 1.9974226804123711,
"grad_norm": 1.165214854260427,
"learning_rate": 9.168884239591129e-08,
"loss": 1.4491,
"step": 819
},
{
"epoch": 2.0,
"grad_norm": 1.1460287106000804,
"learning_rate": 9.148661652863641e-08,
"loss": 1.442,
"step": 820
},
{
"epoch": 2.002577319587629,
"grad_norm": 1.245092231884586,
"learning_rate": 9.128442572523417e-08,
"loss": 1.4238,
"step": 821
},
{
"epoch": 2.0051546391752577,
"grad_norm": 1.1566295496507226,
"learning_rate": 9.108227081846305e-08,
"loss": 1.4313,
"step": 822
},
{
"epoch": 2.0077319587628866,
"grad_norm": 1.2544751166156012,
"learning_rate": 9.088015264093364e-08,
"loss": 1.4879,
"step": 823
},
{
"epoch": 2.0103092783505154,
"grad_norm": 1.2229877060400391,
"learning_rate": 9.067807202510542e-08,
"loss": 1.4781,
"step": 824
},
{
"epoch": 2.0128865979381443,
"grad_norm": 1.1382534019879336,
"learning_rate": 9.047602980328308e-08,
"loss": 1.4163,
"step": 825
},
{
"epoch": 2.015463917525773,
"grad_norm": 1.1936874170381253,
"learning_rate": 9.027402680761309e-08,
"loss": 1.5233,
"step": 826
},
{
"epoch": 2.018041237113402,
"grad_norm": 1.133631677446316,
"learning_rate": 9.007206387008053e-08,
"loss": 1.397,
"step": 827
},
{
"epoch": 2.020618556701031,
"grad_norm": 1.2442262218300326,
"learning_rate": 8.987014182250538e-08,
"loss": 1.4734,
"step": 828
},
{
"epoch": 2.0231958762886597,
"grad_norm": 1.1593473271235548,
"learning_rate": 8.966826149653922e-08,
"loss": 1.5101,
"step": 829
},
{
"epoch": 2.0257731958762886,
"grad_norm": 1.2054412501356118,
"learning_rate": 8.94664237236618e-08,
"loss": 1.4657,
"step": 830
},
{
"epoch": 2.0283505154639174,
"grad_norm": 1.1696863220137095,
"learning_rate": 8.926462933517765e-08,
"loss": 1.4385,
"step": 831
},
{
"epoch": 2.0309278350515463,
"grad_norm": 1.2085685291526942,
"learning_rate": 8.906287916221257e-08,
"loss": 1.4567,
"step": 832
},
{
"epoch": 2.033505154639175,
"grad_norm": 1.2062684152337084,
"learning_rate": 8.886117403571023e-08,
"loss": 1.4903,
"step": 833
},
{
"epoch": 2.033505154639175,
"eval_loss": 1.4574321508407593,
"eval_runtime": 78.538,
"eval_samples_per_second": 21.174,
"eval_steps_per_second": 1.324,
"step": 833
},
{
"epoch": 2.036082474226804,
"grad_norm": 1.2985823482438499,
"learning_rate": 8.865951478642886e-08,
"loss": 1.4945,
"step": 834
},
{
"epoch": 2.038659793814433,
"grad_norm": 1.2008208109365806,
"learning_rate": 8.845790224493761e-08,
"loss": 1.4053,
"step": 835
},
{
"epoch": 2.0412371134020617,
"grad_norm": 1.1173370303783305,
"learning_rate": 8.825633724161334e-08,
"loss": 1.437,
"step": 836
},
{
"epoch": 2.0438144329896906,
"grad_norm": 1.2675969181316824,
"learning_rate": 8.805482060663712e-08,
"loss": 1.4189,
"step": 837
},
{
"epoch": 2.0463917525773194,
"grad_norm": 1.2147757078811159,
"learning_rate": 8.785335316999077e-08,
"loss": 1.4214,
"step": 838
},
{
"epoch": 2.0489690721649483,
"grad_norm": 1.09453864552264,
"learning_rate": 8.765193576145346e-08,
"loss": 1.4027,
"step": 839
},
{
"epoch": 2.051546391752577,
"grad_norm": 1.216226711944593,
"learning_rate": 8.745056921059839e-08,
"loss": 1.5143,
"step": 840
},
{
"epoch": 2.054123711340206,
"grad_norm": 1.2055008222540708,
"learning_rate": 8.724925434678922e-08,
"loss": 1.4489,
"step": 841
},
{
"epoch": 2.056701030927835,
"grad_norm": 1.1336500080565066,
"learning_rate": 8.704799199917673e-08,
"loss": 1.4248,
"step": 842
},
{
"epoch": 2.0592783505154637,
"grad_norm": 1.215103376196868,
"learning_rate": 8.684678299669546e-08,
"loss": 1.4463,
"step": 843
},
{
"epoch": 2.0618556701030926,
"grad_norm": 1.1882950937372736,
"learning_rate": 8.664562816806021e-08,
"loss": 1.4444,
"step": 844
},
{
"epoch": 2.0644329896907214,
"grad_norm": 1.2047730105242802,
"learning_rate": 8.64445283417626e-08,
"loss": 1.4514,
"step": 845
},
{
"epoch": 2.0670103092783503,
"grad_norm": 1.1364686666884227,
"learning_rate": 8.624348434606781e-08,
"loss": 1.4285,
"step": 846
},
{
"epoch": 2.069587628865979,
"grad_norm": 1.2216577804549105,
"learning_rate": 8.6042497009011e-08,
"loss": 1.5001,
"step": 847
},
{
"epoch": 2.072164948453608,
"grad_norm": 1.167316107588148,
"learning_rate": 8.5841567158394e-08,
"loss": 1.4095,
"step": 848
},
{
"epoch": 2.074742268041237,
"grad_norm": 1.087136320546188,
"learning_rate": 8.564069562178188e-08,
"loss": 1.4547,
"step": 849
},
{
"epoch": 2.0773195876288657,
"grad_norm": 1.10777310102604,
"learning_rate": 8.543988322649954e-08,
"loss": 1.4905,
"step": 850
},
{
"epoch": 2.0798969072164946,
"grad_norm": 1.2198690834759995,
"learning_rate": 8.523913079962816e-08,
"loss": 1.3988,
"step": 851
},
{
"epoch": 2.0824742268041234,
"grad_norm": 1.2266366218856903,
"learning_rate": 8.50384391680022e-08,
"loss": 1.4972,
"step": 852
},
{
"epoch": 2.0850515463917523,
"grad_norm": 1.1644015048600025,
"learning_rate": 8.483780915820552e-08,
"loss": 1.4233,
"step": 853
},
{
"epoch": 2.087628865979381,
"grad_norm": 1.1537200560912633,
"learning_rate": 8.463724159656814e-08,
"loss": 1.5044,
"step": 854
},
{
"epoch": 2.09020618556701,
"grad_norm": 1.1190956026619867,
"learning_rate": 8.443673730916312e-08,
"loss": 1.4284,
"step": 855
},
{
"epoch": 2.092783505154639,
"grad_norm": 1.1476534954615265,
"learning_rate": 8.423629712180264e-08,
"loss": 1.4601,
"step": 856
},
{
"epoch": 2.095360824742268,
"grad_norm": 1.2130889970169285,
"learning_rate": 8.403592186003501e-08,
"loss": 1.3902,
"step": 857
},
{
"epoch": 2.097938144329897,
"grad_norm": 1.2106313562862567,
"learning_rate": 8.383561234914119e-08,
"loss": 1.5202,
"step": 858
},
{
"epoch": 2.100515463917526,
"grad_norm": 1.2790874195534712,
"learning_rate": 8.36353694141312e-08,
"loss": 1.5241,
"step": 859
},
{
"epoch": 2.1030927835051547,
"grad_norm": 1.1984788041581806,
"learning_rate": 8.34351938797409e-08,
"loss": 1.5185,
"step": 860
},
{
"epoch": 2.1056701030927836,
"grad_norm": 1.1224530119764298,
"learning_rate": 8.323508657042858e-08,
"loss": 1.4387,
"step": 861
},
{
"epoch": 2.1082474226804124,
"grad_norm": 1.1916193301815299,
"learning_rate": 8.303504831037154e-08,
"loss": 1.433,
"step": 862
},
{
"epoch": 2.1108247422680413,
"grad_norm": 1.269383237065682,
"learning_rate": 8.283507992346263e-08,
"loss": 1.58,
"step": 863
},
{
"epoch": 2.002577319587629,
"grad_norm": 1.0439514094170574,
"learning_rate": 8.263518223330696e-08,
"loss": 1.3774,
"step": 864
},
{
"epoch": 2.0051546391752577,
"grad_norm": 1.1249347513631904,
"learning_rate": 8.243535606321848e-08,
"loss": 1.4098,
"step": 865
},
{
"epoch": 2.0077319587628866,
"grad_norm": 1.375007615993654,
"learning_rate": 8.22356022362165e-08,
"loss": 1.4725,
"step": 866
},
{
"epoch": 2.0103092783505154,
"grad_norm": 1.1571951227795978,
"learning_rate": 8.203592157502244e-08,
"loss": 1.4642,
"step": 867
},
{
"epoch": 2.0128865979381443,
"grad_norm": 1.1725964239389173,
"learning_rate": 8.183631490205636e-08,
"loss": 1.4317,
"step": 868
},
{
"epoch": 2.015463917525773,
"grad_norm": 1.1131141063076042,
"learning_rate": 8.163678303943356e-08,
"loss": 1.4534,
"step": 869
},
{
"epoch": 2.018041237113402,
"grad_norm": 1.174599695198473,
"learning_rate": 8.143732680896123e-08,
"loss": 1.4076,
"step": 870
},
{
"epoch": 2.020618556701031,
"grad_norm": 1.1730868356762598,
"learning_rate": 8.123794703213509e-08,
"loss": 1.457,
"step": 871
},
{
"epoch": 2.0231958762886597,
"grad_norm": 1.194870586046834,
"learning_rate": 8.103864453013592e-08,
"loss": 1.5082,
"step": 872
},
{
"epoch": 2.0257731958762886,
"grad_norm": 1.1351876585089653,
"learning_rate": 8.083942012382625e-08,
"loss": 1.4886,
"step": 873
},
{
"epoch": 2.0283505154639174,
"grad_norm": 1.1792650671176743,
"learning_rate": 8.064027463374701e-08,
"loss": 1.4118,
"step": 874
},
{
"epoch": 2.0309278350515463,
"grad_norm": 1.153547305161426,
"learning_rate": 8.0441208880114e-08,
"loss": 1.4064,
"step": 875
},
{
"epoch": 2.033505154639175,
"grad_norm": 1.2783578209502229,
"learning_rate": 8.024222368281469e-08,
"loss": 1.4816,
"step": 876
},
{
"epoch": 2.036082474226804,
"grad_norm": 1.240844307809194,
"learning_rate": 8.004331986140473e-08,
"loss": 1.4598,
"step": 877
},
{
"epoch": 2.038659793814433,
"grad_norm": 1.1295638200937268,
"learning_rate": 7.984449823510467e-08,
"loss": 1.4081,
"step": 878
},
{
"epoch": 2.0412371134020617,
"grad_norm": 1.1888063217054325,
"learning_rate": 7.964575962279634e-08,
"loss": 1.4618,
"step": 879
},
{
"epoch": 2.0438144329896906,
"grad_norm": 1.2357228980469037,
"learning_rate": 7.944710484301995e-08,
"loss": 1.3963,
"step": 880
},
{
"epoch": 2.0463917525773194,
"grad_norm": 1.0786846944064847,
"learning_rate": 7.92485347139702e-08,
"loss": 1.4514,
"step": 881
},
{
"epoch": 2.0489690721649483,
"grad_norm": 1.1666214344742663,
"learning_rate": 7.90500500534932e-08,
"loss": 1.4389,
"step": 882
},
{
"epoch": 2.0489690721649483,
"eval_loss": 1.4557408094406128,
"eval_runtime": 78.6008,
"eval_samples_per_second": 21.158,
"eval_steps_per_second": 1.323,
"step": 882
},
{
"epoch": 2.051546391752577,
"grad_norm": 1.1265923768111081,
"learning_rate": 7.88516516790831e-08,
"loss": 1.4401,
"step": 883
},
{
"epoch": 2.054123711340206,
"grad_norm": 1.2322020489966297,
"learning_rate": 7.865334040787866e-08,
"loss": 1.5326,
"step": 884
},
{
"epoch": 2.056701030927835,
"grad_norm": 1.1620543990403278,
"learning_rate": 7.845511705665973e-08,
"loss": 1.4151,
"step": 885
},
{
"epoch": 2.0592783505154637,
"grad_norm": 1.2532645521350043,
"learning_rate": 7.82569824418443e-08,
"loss": 1.485,
"step": 886
},
{
"epoch": 2.0618556701030926,
"grad_norm": 1.2322746000056972,
"learning_rate": 7.805893737948472e-08,
"loss": 1.439,
"step": 887
},
{
"epoch": 2.0644329896907214,
"grad_norm": 1.1992705537386268,
"learning_rate": 7.786098268526446e-08,
"loss": 1.4927,
"step": 888
},
{
"epoch": 2.0670103092783507,
"grad_norm": 1.219061389377471,
"learning_rate": 7.7663119174495e-08,
"loss": 1.5607,
"step": 889
},
{
"epoch": 2.0695876288659796,
"grad_norm": 1.2161975840628703,
"learning_rate": 7.746534766211206e-08,
"loss": 1.5666,
"step": 890
},
{
"epoch": 2.0721649484536084,
"grad_norm": 1.296835674200516,
"learning_rate": 7.726766896267253e-08,
"loss": 1.4738,
"step": 891
},
{
"epoch": 2.0747422680412373,
"grad_norm": 1.1913191108570989,
"learning_rate": 7.7070083890351e-08,
"loss": 1.4345,
"step": 892
},
{
"epoch": 2.077319587628866,
"grad_norm": 1.1697890394016621,
"learning_rate": 7.687259325893654e-08,
"loss": 1.4431,
"step": 893
},
{
"epoch": 2.079896907216495,
"grad_norm": 1.2354727439582665,
"learning_rate": 7.667519788182912e-08,
"loss": 1.4302,
"step": 894
},
{
"epoch": 2.082474226804124,
"grad_norm": 1.1445036968078774,
"learning_rate": 7.647789857203644e-08,
"loss": 1.4532,
"step": 895
},
{
"epoch": 2.0850515463917527,
"grad_norm": 1.196595545836434,
"learning_rate": 7.628069614217058e-08,
"loss": 1.3915,
"step": 896
},
{
"epoch": 2.0876288659793816,
"grad_norm": 1.2451954556034555,
"learning_rate": 7.608359140444453e-08,
"loss": 1.502,
"step": 897
},
{
"epoch": 2.0902061855670104,
"grad_norm": 1.1198448743060805,
"learning_rate": 7.588658517066892e-08,
"loss": 1.4182,
"step": 898
},
{
"epoch": 2.0927835051546393,
"grad_norm": 1.178128381993088,
"learning_rate": 7.568967825224875e-08,
"loss": 1.5009,
"step": 899
},
{
"epoch": 2.095360824742268,
"grad_norm": 1.1493716638910112,
"learning_rate": 7.549287146017988e-08,
"loss": 1.4575,
"step": 900
},
{
"epoch": 2.097938144329897,
"grad_norm": 1.2133662857011498,
"learning_rate": 7.529616560504583e-08,
"loss": 1.5579,
"step": 901
},
{
"epoch": 2.100515463917526,
"grad_norm": 1.3854933572472905,
"learning_rate": 7.509956149701444e-08,
"loss": 1.4113,
"step": 902
},
{
"epoch": 2.1030927835051547,
"grad_norm": 1.263798951148438,
"learning_rate": 7.490305994583435e-08,
"loss": 1.4258,
"step": 903
},
{
"epoch": 2.1056701030927836,
"grad_norm": 1.1393321990385807,
"learning_rate": 7.470666176083191e-08,
"loss": 1.4943,
"step": 904
},
{
"epoch": 2.1082474226804124,
"grad_norm": 1.1741861811520338,
"learning_rate": 7.451036775090775e-08,
"loss": 1.3918,
"step": 905
},
{
"epoch": 2.1108247422680413,
"grad_norm": 1.222621280727268,
"learning_rate": 7.431417872453338e-08,
"loss": 1.513,
"step": 906
},
{
"epoch": 2.11340206185567,
"grad_norm": 1.1452645437770688,
"learning_rate": 7.411809548974791e-08,
"loss": 1.4496,
"step": 907
},
{
"epoch": 2.115979381443299,
"grad_norm": 1.1804026334318425,
"learning_rate": 7.39221188541548e-08,
"loss": 1.4644,
"step": 908
},
{
"epoch": 2.118556701030928,
"grad_norm": 1.1527370569507815,
"learning_rate": 7.372624962491841e-08,
"loss": 1.4698,
"step": 909
},
{
"epoch": 2.1211340206185567,
"grad_norm": 1.211563683201349,
"learning_rate": 7.353048860876063e-08,
"loss": 1.4671,
"step": 910
},
{
"epoch": 2.1237113402061856,
"grad_norm": 1.1550395362954822,
"learning_rate": 7.333483661195792e-08,
"loss": 1.3627,
"step": 911
},
{
"epoch": 2.1262886597938144,
"grad_norm": 1.1772438114561363,
"learning_rate": 7.31392944403375e-08,
"loss": 1.4349,
"step": 912
},
{
"epoch": 2.1288659793814433,
"grad_norm": 1.1316430782314122,
"learning_rate": 7.294386289927424e-08,
"loss": 1.4892,
"step": 913
},
{
"epoch": 2.131443298969072,
"grad_norm": 1.2166109017309248,
"learning_rate": 7.274854279368758e-08,
"loss": 1.4753,
"step": 914
},
{
"epoch": 2.134020618556701,
"grad_norm": 1.2508664732495605,
"learning_rate": 7.255333492803777e-08,
"loss": 1.3593,
"step": 915
},
{
"epoch": 2.13659793814433,
"grad_norm": 1.1270294993138392,
"learning_rate": 7.235824010632283e-08,
"loss": 1.5031,
"step": 916
},
{
"epoch": 2.1391752577319587,
"grad_norm": 1.142323203849277,
"learning_rate": 7.216325913207534e-08,
"loss": 1.4747,
"step": 917
},
{
"epoch": 2.1417525773195876,
"grad_norm": 1.198388386752302,
"learning_rate": 7.196839280835875e-08,
"loss": 1.4787,
"step": 918
},
{
"epoch": 2.1443298969072164,
"grad_norm": 1.288933637399068,
"learning_rate": 7.17736419377644e-08,
"loss": 1.458,
"step": 919
},
{
"epoch": 2.1469072164948453,
"grad_norm": 1.2342213116469787,
"learning_rate": 7.157900732240826e-08,
"loss": 1.4902,
"step": 920
},
{
"epoch": 2.149484536082474,
"grad_norm": 1.2300130857871707,
"learning_rate": 7.138448976392724e-08,
"loss": 1.4835,
"step": 921
},
{
"epoch": 2.152061855670103,
"grad_norm": 1.169125520832618,
"learning_rate": 7.119009006347624e-08,
"loss": 1.413,
"step": 922
},
{
"epoch": 2.154639175257732,
"grad_norm": 1.1702489758289347,
"learning_rate": 7.09958090217248e-08,
"loss": 1.4857,
"step": 923
},
{
"epoch": 2.1572164948453607,
"grad_norm": 1.179155067994331,
"learning_rate": 7.080164743885362e-08,
"loss": 1.507,
"step": 924
},
{
"epoch": 2.1597938144329896,
"grad_norm": 1.149588572227629,
"learning_rate": 7.060760611455151e-08,
"loss": 1.3957,
"step": 925
},
{
"epoch": 2.1623711340206184,
"grad_norm": 1.1269730428089064,
"learning_rate": 7.041368584801186e-08,
"loss": 1.515,
"step": 926
},
{
"epoch": 2.1649484536082473,
"grad_norm": 1.2614734844469966,
"learning_rate": 7.021988743792958e-08,
"loss": 1.4752,
"step": 927
},
{
"epoch": 2.167525773195876,
"grad_norm": 1.26049546725807,
"learning_rate": 7.002621168249758e-08,
"loss": 1.4222,
"step": 928
},
{
"epoch": 2.170103092783505,
"grad_norm": 1.2122490418432295,
"learning_rate": 6.983265937940365e-08,
"loss": 1.5258,
"step": 929
},
{
"epoch": 2.172680412371134,
"grad_norm": 1.163933149699957,
"learning_rate": 6.963923132582715e-08,
"loss": 1.4406,
"step": 930
},
{
"epoch": 2.1752577319587627,
"grad_norm": 1.2117410126905865,
"learning_rate": 6.944592831843566e-08,
"loss": 1.4541,
"step": 931
},
{
"epoch": 2.1752577319587627,
"eval_loss": 1.4543218612670898,
"eval_runtime": 78.6219,
"eval_samples_per_second": 21.152,
"eval_steps_per_second": 1.323,
"step": 931
},
{
"epoch": 2.1778350515463916,
"grad_norm": 1.2898700377788812,
"learning_rate": 6.925275115338167e-08,
"loss": 1.458,
"step": 932
},
{
"epoch": 2.1804123711340204,
"grad_norm": 1.1426836123172524,
"learning_rate": 6.90597006262995e-08,
"loss": 1.3469,
"step": 933
},
{
"epoch": 2.1829896907216493,
"grad_norm": 1.224441134115869,
"learning_rate": 6.886677753230183e-08,
"loss": 1.4027,
"step": 934
},
{
"epoch": 2.1855670103092786,
"grad_norm": 1.387271519204012,
"learning_rate": 6.867398266597642e-08,
"loss": 1.4359,
"step": 935
},
{
"epoch": 2.1881443298969074,
"grad_norm": 1.2243550754367374,
"learning_rate": 6.848131682138303e-08,
"loss": 1.4891,
"step": 936
},
{
"epoch": 2.1907216494845363,
"grad_norm": 1.2282484095681934,
"learning_rate": 6.82887807920499e-08,
"loss": 1.4571,
"step": 937
},
{
"epoch": 2.193298969072165,
"grad_norm": 1.252437764569184,
"learning_rate": 6.809637537097061e-08,
"loss": 1.4845,
"step": 938
},
{
"epoch": 2.195876288659794,
"grad_norm": 1.2033826306564712,
"learning_rate": 6.790410135060096e-08,
"loss": 1.3981,
"step": 939
},
{
"epoch": 2.198453608247423,
"grad_norm": 1.2730733273660004,
"learning_rate": 6.77119595228554e-08,
"loss": 1.5428,
"step": 940
},
{
"epoch": 2.2010309278350517,
"grad_norm": 1.1145258448772917,
"learning_rate": 6.751995067910388e-08,
"loss": 1.4391,
"step": 941
},
{
"epoch": 2.2036082474226806,
"grad_norm": 1.2423736700157595,
"learning_rate": 6.732807561016884e-08,
"loss": 1.3461,
"step": 942
},
{
"epoch": 2.2061855670103094,
"grad_norm": 1.2567446761007774,
"learning_rate": 6.713633510632157e-08,
"loss": 1.4424,
"step": 943
},
{
"epoch": 2.2087628865979383,
"grad_norm": 1.1962904231989222,
"learning_rate": 6.694472995727913e-08,
"loss": 1.5211,
"step": 944
},
{
"epoch": 2.211340206185567,
"grad_norm": 1.2697071279271324,
"learning_rate": 6.675326095220124e-08,
"loss": 1.5138,
"step": 945
},
{
"epoch": 2.213917525773196,
"grad_norm": 1.1182813975437969,
"learning_rate": 6.656192887968674e-08,
"loss": 1.4643,
"step": 946
},
{
"epoch": 2.216494845360825,
"grad_norm": 1.2209457066901777,
"learning_rate": 6.637073452777051e-08,
"loss": 1.4646,
"step": 947
},
{
"epoch": 2.2190721649484537,
"grad_norm": 1.2364207179496447,
"learning_rate": 6.617967868392035e-08,
"loss": 1.4531,
"step": 948
},
{
"epoch": 2.2216494845360826,
"grad_norm": 1.1596958099892627,
"learning_rate": 6.598876213503339e-08,
"loss": 1.3596,
"step": 949
},
{
"epoch": 2.2242268041237114,
"grad_norm": 1.1861584749981382,
"learning_rate": 6.579798566743313e-08,
"loss": 1.4605,
"step": 950
},
{
"epoch": 2.2268041237113403,
"grad_norm": 1.2713750509697457,
"learning_rate": 6.560735006686617e-08,
"loss": 1.5169,
"step": 951
},
{
"epoch": 2.229381443298969,
"grad_norm": 1.166290536481266,
"learning_rate": 6.541685611849887e-08,
"loss": 1.4436,
"step": 952
},
{
"epoch": 2.231958762886598,
"grad_norm": 1.1735876550775757,
"learning_rate": 6.522650460691415e-08,
"loss": 1.4548,
"step": 953
},
{
"epoch": 2.234536082474227,
"grad_norm": 1.2477782864575375,
"learning_rate": 6.503629631610836e-08,
"loss": 1.4534,
"step": 954
},
{
"epoch": 2.2371134020618557,
"grad_norm": 1.2173622340437633,
"learning_rate": 6.48462320294879e-08,
"loss": 1.4595,
"step": 955
},
{
"epoch": 2.2396907216494846,
"grad_norm": 1.1869675634283399,
"learning_rate": 6.465631252986608e-08,
"loss": 1.4451,
"step": 956
},
{
"epoch": 2.2422680412371134,
"grad_norm": 1.1456159400412829,
"learning_rate": 6.446653859945986e-08,
"loss": 1.4064,
"step": 957
},
{
"epoch": 2.2448453608247423,
"grad_norm": 1.2491020198374654,
"learning_rate": 6.427691101988673e-08,
"loss": 1.4949,
"step": 958
},
{
"epoch": 2.247422680412371,
"grad_norm": 1.2282744468510673,
"learning_rate": 6.40874305721613e-08,
"loss": 1.4545,
"step": 959
},
{
"epoch": 2.25,
"grad_norm": 1.0996865259394428,
"learning_rate": 6.389809803669226e-08,
"loss": 1.3342,
"step": 960
},
{
"epoch": 2.252577319587629,
"grad_norm": 1.230550939339635,
"learning_rate": 6.370891419327906e-08,
"loss": 1.5121,
"step": 961
},
{
"epoch": 2.2551546391752577,
"grad_norm": 1.2652568339180974,
"learning_rate": 6.351987982110879e-08,
"loss": 1.5533,
"step": 962
},
{
"epoch": 2.2577319587628866,
"grad_norm": 1.173180731192026,
"learning_rate": 6.333099569875284e-08,
"loss": 1.4439,
"step": 963
},
{
"epoch": 2.2603092783505154,
"grad_norm": 1.1001923514400465,
"learning_rate": 6.314226260416382e-08,
"loss": 1.4376,
"step": 964
},
{
"epoch": 2.2628865979381443,
"grad_norm": 1.1389700541958854,
"learning_rate": 6.295368131467235e-08,
"loss": 1.4357,
"step": 965
},
{
"epoch": 2.265463917525773,
"grad_norm": 1.1695985290298057,
"learning_rate": 6.276525260698363e-08,
"loss": 1.5309,
"step": 966
},
{
"epoch": 2.268041237113402,
"grad_norm": 1.2012587244050719,
"learning_rate": 6.257697725717468e-08,
"loss": 1.5271,
"step": 967
},
{
"epoch": 2.270618556701031,
"grad_norm": 1.2116419761383141,
"learning_rate": 6.238885604069075e-08,
"loss": 1.4536,
"step": 968
},
{
"epoch": 2.2731958762886597,
"grad_norm": 1.169258658026815,
"learning_rate": 6.220088973234215e-08,
"loss": 1.4662,
"step": 969
},
{
"epoch": 2.2757731958762886,
"grad_norm": 1.1455385835708687,
"learning_rate": 6.201307910630145e-08,
"loss": 1.4339,
"step": 970
},
{
"epoch": 2.2783505154639174,
"grad_norm": 1.1833257380384377,
"learning_rate": 6.182542493609984e-08,
"loss": 1.3253,
"step": 971
},
{
"epoch": 2.2809278350515463,
"grad_norm": 1.28784815413645,
"learning_rate": 6.163792799462403e-08,
"loss": 1.4603,
"step": 972
},
{
"epoch": 2.283505154639175,
"grad_norm": 1.1970928590978123,
"learning_rate": 6.145058905411342e-08,
"loss": 1.4683,
"step": 973
},
{
"epoch": 2.286082474226804,
"grad_norm": 1.149098853897877,
"learning_rate": 6.126340888615641e-08,
"loss": 1.4729,
"step": 974
},
{
"epoch": 2.288659793814433,
"grad_norm": 1.209952156325127,
"learning_rate": 6.107638826168756e-08,
"loss": 1.5063,
"step": 975
},
{
"epoch": 2.2912371134020617,
"grad_norm": 1.093427620169618,
"learning_rate": 6.088952795098441e-08,
"loss": 1.4402,
"step": 976
},
{
"epoch": 2.2938144329896906,
"grad_norm": 1.1277798916215127,
"learning_rate": 6.070282872366406e-08,
"loss": 1.5049,
"step": 977
},
{
"epoch": 2.2963917525773194,
"grad_norm": 1.1497157702484186,
"learning_rate": 6.05162913486802e-08,
"loss": 1.4331,
"step": 978
},
{
"epoch": 2.2989690721649483,
"grad_norm": 1.2127687421273623,
"learning_rate": 6.032991659432006e-08,
"loss": 1.464,
"step": 979
},
{
"epoch": 2.301546391752577,
"grad_norm": 1.2091736243527582,
"learning_rate": 6.014370522820084e-08,
"loss": 1.4257,
"step": 980
},
{
"epoch": 2.301546391752577,
"eval_loss": 1.4530315399169922,
"eval_runtime": 78.4954,
"eval_samples_per_second": 21.186,
"eval_steps_per_second": 1.325,
"step": 980
},
{
"epoch": 2.304123711340206,
"grad_norm": 1.1621649511934278,
"learning_rate": 5.995765801726698e-08,
"loss": 1.4808,
"step": 981
},
{
"epoch": 2.306701030927835,
"grad_norm": 1.1581272698070357,
"learning_rate": 5.977177572778678e-08,
"loss": 1.3401,
"step": 982
},
{
"epoch": 2.3092783505154637,
"grad_norm": 1.1599391051626198,
"learning_rate": 5.958605912534921e-08,
"loss": 1.4917,
"step": 983
},
{
"epoch": 2.3118556701030926,
"grad_norm": 1.3034698067830743,
"learning_rate": 5.9400508974860885e-08,
"loss": 1.4841,
"step": 984
},
{
"epoch": 2.3144329896907214,
"grad_norm": 1.2060359148709237,
"learning_rate": 5.9215126040542886e-08,
"loss": 1.4479,
"step": 985
},
{
"epoch": 2.3170103092783503,
"grad_norm": 1.2258119330781094,
"learning_rate": 5.902991108592754e-08,
"loss": 1.4949,
"step": 986
},
{
"epoch": 2.319587628865979,
"grad_norm": 1.2150702094703367,
"learning_rate": 5.8844864873855296e-08,
"loss": 1.4329,
"step": 987
},
{
"epoch": 2.3221649484536084,
"grad_norm": 1.1354804163624515,
"learning_rate": 5.8659988166471706e-08,
"loss": 1.3683,
"step": 988
},
{
"epoch": 2.3247422680412373,
"grad_norm": 1.1304878710380117,
"learning_rate": 5.847528172522407e-08,
"loss": 1.4345,
"step": 989
},
{
"epoch": 2.327319587628866,
"grad_norm": 1.2388489587800555,
"learning_rate": 5.829074631085852e-08,
"loss": 1.5177,
"step": 990
},
{
"epoch": 2.329896907216495,
"grad_norm": 1.2418385155763394,
"learning_rate": 5.8106382683416636e-08,
"loss": 1.5666,
"step": 991
},
{
"epoch": 2.332474226804124,
"grad_norm": 1.2067656028810445,
"learning_rate": 5.7922191602232675e-08,
"loss": 1.501,
"step": 992
},
{
"epoch": 2.3350515463917527,
"grad_norm": 1.2443124436097661,
"learning_rate": 5.773817382593007e-08,
"loss": 1.4516,
"step": 993
},
{
"epoch": 2.3376288659793816,
"grad_norm": 1.2589938629670394,
"learning_rate": 5.7554330112418504e-08,
"loss": 1.4955,
"step": 994
},
{
"epoch": 2.3402061855670104,
"grad_norm": 1.1979526509329819,
"learning_rate": 5.737066121889078e-08,
"loss": 1.4224,
"step": 995
},
{
"epoch": 2.3427835051546393,
"grad_norm": 1.1895398966073056,
"learning_rate": 5.718716790181965e-08,
"loss": 1.4243,
"step": 996
},
{
"epoch": 2.345360824742268,
"grad_norm": 1.1828652518517522,
"learning_rate": 5.70038509169547e-08,
"loss": 1.4559,
"step": 997
},
{
"epoch": 2.347938144329897,
"grad_norm": 1.2201556733969088,
"learning_rate": 5.682071101931936e-08,
"loss": 1.5799,
"step": 998
},
{
"epoch": 2.350515463917526,
"grad_norm": 1.2211801179218442,
"learning_rate": 5.6637748963207566e-08,
"loss": 1.4684,
"step": 999
},
{
"epoch": 2.3530927835051547,
"grad_norm": 1.2453622614111477,
"learning_rate": 5.6454965502180884e-08,
"loss": 1.4854,
"step": 1000
},
{
"epoch": 2.3556701030927836,
"grad_norm": 1.1220592371624576,
"learning_rate": 5.627236138906524e-08,
"loss": 1.5089,
"step": 1001
},
{
"epoch": 2.3582474226804124,
"grad_norm": 1.1369675384518176,
"learning_rate": 5.60899373759479e-08,
"loss": 1.4088,
"step": 1002
},
{
"epoch": 2.3608247422680413,
"grad_norm": 1.1583531710119257,
"learning_rate": 5.590769421417434e-08,
"loss": 1.4299,
"step": 1003
},
{
"epoch": 2.36340206185567,
"grad_norm": 1.2204630482972216,
"learning_rate": 5.572563265434527e-08,
"loss": 1.421,
"step": 1004
},
{
"epoch": 2.365979381443299,
"grad_norm": 1.1654233558024554,
"learning_rate": 5.55437534463133e-08,
"loss": 1.4153,
"step": 1005
},
{
"epoch": 2.368556701030928,
"grad_norm": 1.1255124035829496,
"learning_rate": 5.536205733918007e-08,
"loss": 1.4196,
"step": 1006
},
{
"epoch": 2.3711340206185567,
"grad_norm": 1.1998683282168985,
"learning_rate": 5.5180545081293074e-08,
"loss": 1.4067,
"step": 1007
},
{
"epoch": 2.3737113402061856,
"grad_norm": 1.2097328179188533,
"learning_rate": 5.4999217420242574e-08,
"loss": 1.4221,
"step": 1008
},
{
"epoch": 2.3762886597938144,
"grad_norm": 1.2465777328454615,
"learning_rate": 5.481807510285852e-08,
"loss": 1.5432,
"step": 1009
},
{
"epoch": 2.3788659793814433,
"grad_norm": 1.1017326736009339,
"learning_rate": 5.4637118875207585e-08,
"loss": 1.4498,
"step": 1010
},
{
"epoch": 2.381443298969072,
"grad_norm": 1.1894534742510336,
"learning_rate": 5.445634948258991e-08,
"loss": 1.4779,
"step": 1011
},
{
"epoch": 2.384020618556701,
"grad_norm": 1.2240426429209377,
"learning_rate": 5.4275767669536145e-08,
"loss": 1.4643,
"step": 1012
},
{
"epoch": 2.38659793814433,
"grad_norm": 1.1865338401108185,
"learning_rate": 5.4095374179804365e-08,
"loss": 1.4218,
"step": 1013
},
{
"epoch": 2.3891752577319587,
"grad_norm": 1.1332962977107732,
"learning_rate": 5.391516975637699e-08,
"loss": 1.4893,
"step": 1014
},
{
"epoch": 2.3917525773195876,
"grad_norm": 1.1749099925869624,
"learning_rate": 5.373515514145771e-08,
"loss": 1.4223,
"step": 1015
},
{
"epoch": 2.3943298969072164,
"grad_norm": 1.2704273457918143,
"learning_rate": 5.355533107646858e-08,
"loss": 1.4625,
"step": 1016
},
{
"epoch": 2.3969072164948453,
"grad_norm": 1.2661897531951014,
"learning_rate": 5.3375698302046745e-08,
"loss": 1.4886,
"step": 1017
},
{
"epoch": 2.399484536082474,
"grad_norm": 1.1604729483093374,
"learning_rate": 5.319625755804138e-08,
"loss": 1.433,
"step": 1018
},
{
"epoch": 2.402061855670103,
"grad_norm": 1.1177913422918446,
"learning_rate": 5.301700958351098e-08,
"loss": 1.3745,
"step": 1019
},
{
"epoch": 2.404639175257732,
"grad_norm": 1.350758760981664,
"learning_rate": 5.283795511671994e-08,
"loss": 1.5148,
"step": 1020
},
{
"epoch": 2.4072164948453607,
"grad_norm": 1.1721177815291475,
"learning_rate": 5.265909489513567e-08,
"loss": 1.4789,
"step": 1021
},
{
"epoch": 2.4097938144329896,
"grad_norm": 1.1121369880829992,
"learning_rate": 5.248042965542558e-08,
"loss": 1.4492,
"step": 1022
},
{
"epoch": 2.4123711340206184,
"grad_norm": 1.172764927678444,
"learning_rate": 5.230196013345398e-08,
"loss": 1.495,
"step": 1023
},
{
"epoch": 2.4149484536082473,
"grad_norm": 1.2211219953558563,
"learning_rate": 5.212368706427912e-08,
"loss": 1.4839,
"step": 1024
},
{
"epoch": 2.417525773195876,
"grad_norm": 1.2134922811527864,
"learning_rate": 5.194561118215004e-08,
"loss": 1.4247,
"step": 1025
},
{
"epoch": 2.420103092783505,
"grad_norm": 1.1269911256995855,
"learning_rate": 5.176773322050381e-08,
"loss": 1.4484,
"step": 1026
},
{
"epoch": 2.422680412371134,
"grad_norm": 1.119051207691081,
"learning_rate": 5.1590053911962127e-08,
"loss": 1.3717,
"step": 1027
},
{
"epoch": 2.4252577319587627,
"grad_norm": 1.1877122575741303,
"learning_rate": 5.141257398832862e-08,
"loss": 1.416,
"step": 1028
},
{
"epoch": 2.4278350515463916,
"grad_norm": 1.1267435950520672,
"learning_rate": 5.1235294180585674e-08,
"loss": 1.4357,
"step": 1029
},
{
"epoch": 2.4278350515463916,
"eval_loss": 1.4520158767700195,
"eval_runtime": 78.5953,
"eval_samples_per_second": 21.159,
"eval_steps_per_second": 1.323,
"step": 1029
},
{
"epoch": 2.430412371134021,
"grad_norm": 1.0857318983382882,
"learning_rate": 5.1058215218891464e-08,
"loss": 1.4512,
"step": 1030
},
{
"epoch": 2.4329896907216497,
"grad_norm": 1.155498319174195,
"learning_rate": 5.088133783257693e-08,
"loss": 1.5014,
"step": 1031
},
{
"epoch": 2.4355670103092786,
"grad_norm": 1.2379699109090305,
"learning_rate": 5.070466275014287e-08,
"loss": 1.5288,
"step": 1032
},
{
"epoch": 2.4381443298969074,
"grad_norm": 1.3260836529994613,
"learning_rate": 5.0528190699256756e-08,
"loss": 1.456,
"step": 1033
},
{
"epoch": 2.4407216494845363,
"grad_norm": 1.1737794063785383,
"learning_rate": 5.03519224067499e-08,
"loss": 1.4514,
"step": 1034
},
{
"epoch": 2.443298969072165,
"grad_norm": 1.183113595964214,
"learning_rate": 5.0175858598614363e-08,
"loss": 1.4507,
"step": 1035
},
{
"epoch": 2.445876288659794,
"grad_norm": 1.1143164931619889,
"learning_rate": 5.000000000000002e-08,
"loss": 1.3849,
"step": 1036
},
{
"epoch": 2.448453608247423,
"grad_norm": 1.1724349277334387,
"learning_rate": 4.9824347335211514e-08,
"loss": 1.4424,
"step": 1037
},
{
"epoch": 2.4510309278350517,
"grad_norm": 1.1212216527840104,
"learning_rate": 4.964890132770543e-08,
"loss": 1.4082,
"step": 1038
},
{
"epoch": 2.4536082474226806,
"grad_norm": 1.1522290603715333,
"learning_rate": 4.947366270008707e-08,
"loss": 1.4314,
"step": 1039
},
{
"epoch": 2.4561855670103094,
"grad_norm": 1.1633774724561892,
"learning_rate": 4.929863217410767e-08,
"loss": 1.4865,
"step": 1040
},
{
"epoch": 2.4587628865979383,
"grad_norm": 1.1406335428126368,
"learning_rate": 4.912381047066133e-08,
"loss": 1.4458,
"step": 1041
},
{
"epoch": 2.461340206185567,
"grad_norm": 1.1104681920852408,
"learning_rate": 4.894919830978211e-08,
"loss": 1.397,
"step": 1042
},
{
"epoch": 2.463917525773196,
"grad_norm": 1.2181204959510732,
"learning_rate": 4.8774796410640983e-08,
"loss": 1.4955,
"step": 1043
},
{
"epoch": 2.466494845360825,
"grad_norm": 1.15471572592744,
"learning_rate": 4.860060549154301e-08,
"loss": 1.3996,
"step": 1044
},
{
"epoch": 2.4690721649484537,
"grad_norm": 1.19065290512176,
"learning_rate": 4.842662626992426e-08,
"loss": 1.4755,
"step": 1045
},
{
"epoch": 2.4716494845360826,
"grad_norm": 1.351223096851913,
"learning_rate": 4.825285946234874e-08,
"loss": 1.4747,
"step": 1046
},
{
"epoch": 2.4742268041237114,
"grad_norm": 1.141166837825934,
"learning_rate": 4.807930578450584e-08,
"loss": 1.4063,
"step": 1047
},
{
"epoch": 2.4768041237113403,
"grad_norm": 1.1861721992764764,
"learning_rate": 4.7905965951206986e-08,
"loss": 1.4967,
"step": 1048
},
{
"epoch": 2.479381443298969,
"grad_norm": 1.2595851597755765,
"learning_rate": 4.773284067638281e-08,
"loss": 1.4877,
"step": 1049
},
{
"epoch": 2.481958762886598,
"grad_norm": 1.1088230107238257,
"learning_rate": 4.755993067308047e-08,
"loss": 1.4385,
"step": 1050
},
{
"epoch": 2.484536082474227,
"grad_norm": 1.2852932163080484,
"learning_rate": 4.7387236653460205e-08,
"loss": 1.4141,
"step": 1051
},
{
"epoch": 2.4871134020618557,
"grad_norm": 1.244645084527039,
"learning_rate": 4.721475932879282e-08,
"loss": 1.482,
"step": 1052
},
{
"epoch": 2.4896907216494846,
"grad_norm": 1.2466688875419663,
"learning_rate": 4.7042499409456695e-08,
"loss": 1.4382,
"step": 1053
},
{
"epoch": 2.4922680412371134,
"grad_norm": 1.2462831105011571,
"learning_rate": 4.687045760493468e-08,
"loss": 1.536,
"step": 1054
},
{
"epoch": 2.4948453608247423,
"grad_norm": 1.1482492444378036,
"learning_rate": 4.6698634623811307e-08,
"loss": 1.4406,
"step": 1055
},
{
"epoch": 2.497422680412371,
"grad_norm": 1.1978027196822072,
"learning_rate": 4.652703117376986e-08,
"loss": 1.4288,
"step": 1056
},
{
"epoch": 2.5,
"grad_norm": 1.205112527214404,
"learning_rate": 4.635564796158945e-08,
"loss": 1.4066,
"step": 1057
},
{
"epoch": 2.502577319587629,
"grad_norm": 1.1958287831198664,
"learning_rate": 4.618448569314206e-08,
"loss": 1.4194,
"step": 1058
},
{
"epoch": 2.5051546391752577,
"grad_norm": 1.0972900424361671,
"learning_rate": 4.60135450733897e-08,
"loss": 1.4838,
"step": 1059
},
{
"epoch": 2.5077319587628866,
"grad_norm": 1.2508036239600449,
"learning_rate": 4.584282680638154e-08,
"loss": 1.4443,
"step": 1060
},
{
"epoch": 2.5103092783505154,
"grad_norm": 1.1703232750822017,
"learning_rate": 4.567233159525088e-08,
"loss": 1.434,
"step": 1061
},
{
"epoch": 2.5128865979381443,
"grad_norm": 1.1666987794362405,
"learning_rate": 4.550206014221232e-08,
"loss": 1.4857,
"step": 1062
},
{
"epoch": 2.515463917525773,
"grad_norm": 1.118899379693407,
"learning_rate": 4.53320131485589e-08,
"loss": 1.4753,
"step": 1063
},
{
"epoch": 2.518041237113402,
"grad_norm": 1.2072619010906969,
"learning_rate": 4.516219131465919e-08,
"loss": 1.461,
"step": 1064
},
{
"epoch": 2.520618556701031,
"grad_norm": 1.1330825353202136,
"learning_rate": 4.499259533995434e-08,
"loss": 1.3632,
"step": 1065
},
{
"epoch": 2.5231958762886597,
"grad_norm": 1.087244159516567,
"learning_rate": 4.48232259229554e-08,
"loss": 1.4907,
"step": 1066
},
{
"epoch": 2.5257731958762886,
"grad_norm": 1.113783698087956,
"learning_rate": 4.465408376124016e-08,
"loss": 1.425,
"step": 1067
},
{
"epoch": 2.5283505154639174,
"grad_norm": 1.2174392360989843,
"learning_rate": 4.448516955145047e-08,
"loss": 1.5075,
"step": 1068
},
{
"epoch": 2.5309278350515463,
"grad_norm": 1.2580642720936182,
"learning_rate": 4.431648398928932e-08,
"loss": 1.4312,
"step": 1069
},
{
"epoch": 2.533505154639175,
"grad_norm": 1.2608189792754003,
"learning_rate": 4.414802776951798e-08,
"loss": 1.4614,
"step": 1070
},
{
"epoch": 2.536082474226804,
"grad_norm": 1.1608489532256927,
"learning_rate": 4.3979801585953094e-08,
"loss": 1.4286,
"step": 1071
},
{
"epoch": 2.538659793814433,
"grad_norm": 1.241756886612098,
"learning_rate": 4.381180613146395e-08,
"loss": 1.4545,
"step": 1072
},
{
"epoch": 2.5412371134020617,
"grad_norm": 1.1267401284402057,
"learning_rate": 4.364404209796948e-08,
"loss": 1.4289,
"step": 1073
},
{
"epoch": 2.5438144329896906,
"grad_norm": 1.1675743288280764,
"learning_rate": 4.347651017643539e-08,
"loss": 1.4545,
"step": 1074
},
{
"epoch": 2.5463917525773194,
"grad_norm": 1.1014672234344964,
"learning_rate": 4.3309211056871544e-08,
"loss": 1.4588,
"step": 1075
},
{
"epoch": 2.5489690721649483,
"grad_norm": 1.1537126237371678,
"learning_rate": 4.314214542832888e-08,
"loss": 1.4922,
"step": 1076
},
{
"epoch": 2.551546391752577,
"grad_norm": 1.0803879548258355,
"learning_rate": 4.2975313978896644e-08,
"loss": 1.4505,
"step": 1077
},
{
"epoch": 2.554123711340206,
"grad_norm": 1.1135211277789598,
"learning_rate": 4.280871739569971e-08,
"loss": 1.4256,
"step": 1078
},
{
"epoch": 2.554123711340206,
"eval_loss": 1.4510596990585327,
"eval_runtime": 78.5321,
"eval_samples_per_second": 21.176,
"eval_steps_per_second": 1.324,
"step": 1078
},
{
"epoch": 2.556701030927835,
"grad_norm": 1.1587956973540048,
"learning_rate": 4.2642356364895414e-08,
"loss": 1.3874,
"step": 1079
},
{
"epoch": 2.5592783505154637,
"grad_norm": 1.2208237784983438,
"learning_rate": 4.247623157167102e-08,
"loss": 1.4828,
"step": 1080
},
{
"epoch": 2.5618556701030926,
"grad_norm": 1.1970857349297972,
"learning_rate": 4.231034370024088e-08,
"loss": 1.4412,
"step": 1081
},
{
"epoch": 2.5644329896907214,
"grad_norm": 1.1543756364647166,
"learning_rate": 4.214469343384346e-08,
"loss": 1.4448,
"step": 1082
},
{
"epoch": 2.5670103092783503,
"grad_norm": 1.125316478876826,
"learning_rate": 4.197928145473856e-08,
"loss": 1.3943,
"step": 1083
},
{
"epoch": 2.569587628865979,
"grad_norm": 1.1220973164280506,
"learning_rate": 4.181410844420473e-08,
"loss": 1.4221,
"step": 1084
},
{
"epoch": 2.572164948453608,
"grad_norm": 1.1654590544487953,
"learning_rate": 4.164917508253607e-08,
"loss": 1.433,
"step": 1085
},
{
"epoch": 2.574742268041237,
"grad_norm": 1.1709294745599472,
"learning_rate": 4.148448204903977e-08,
"loss": 1.3952,
"step": 1086
},
{
"epoch": 2.5773195876288657,
"grad_norm": 1.1679647806294131,
"learning_rate": 4.132003002203314e-08,
"loss": 1.4641,
"step": 1087
},
{
"epoch": 2.579896907216495,
"grad_norm": 1.3695549935841669,
"learning_rate": 4.115581967884093e-08,
"loss": 1.5259,
"step": 1088
},
{
"epoch": 2.582474226804124,
"grad_norm": 1.1307837909317393,
"learning_rate": 4.099185169579241e-08,
"loss": 1.4012,
"step": 1089
},
{
"epoch": 2.5850515463917527,
"grad_norm": 1.1501589873026261,
"learning_rate": 4.0828126748218647e-08,
"loss": 1.4582,
"step": 1090
},
{
"epoch": 2.5876288659793816,
"grad_norm": 1.1069474546473044,
"learning_rate": 4.0664645510449745e-08,
"loss": 1.4335,
"step": 1091
},
{
"epoch": 2.5902061855670104,
"grad_norm": 1.1910808093335385,
"learning_rate": 4.050140865581204e-08,
"loss": 1.458,
"step": 1092
},
{
"epoch": 2.5927835051546393,
"grad_norm": 1.1210216135242885,
"learning_rate": 4.033841685662529e-08,
"loss": 1.4671,
"step": 1093
},
{
"epoch": 2.595360824742268,
"grad_norm": 1.1392325814801574,
"learning_rate": 4.0175670784200066e-08,
"loss": 1.4687,
"step": 1094
},
{
"epoch": 2.597938144329897,
"grad_norm": 1.2066331988995807,
"learning_rate": 4.001317110883477e-08,
"loss": 1.6142,
"step": 1095
},
{
"epoch": 2.600515463917526,
"grad_norm": 1.120036816028406,
"learning_rate": 3.985091849981297e-08,
"loss": 1.4617,
"step": 1096
},
{
"epoch": 2.6030927835051547,
"grad_norm": 1.1171460565708284,
"learning_rate": 3.96889136254007e-08,
"loss": 1.459,
"step": 1097
},
{
"epoch": 2.6056701030927836,
"grad_norm": 1.2472238722902789,
"learning_rate": 3.952715715284363e-08,
"loss": 1.5456,
"step": 1098
},
{
"epoch": 2.6082474226804124,
"grad_norm": 1.2133346933773341,
"learning_rate": 3.93656497483643e-08,
"loss": 1.5134,
"step": 1099
},
{
"epoch": 2.6108247422680413,
"grad_norm": 1.1470733566590117,
"learning_rate": 3.9204392077159544e-08,
"loss": 1.4653,
"step": 1100
},
{
"epoch": 2.61340206185567,
"grad_norm": 1.1608282166724524,
"learning_rate": 3.904338480339755e-08,
"loss": 1.479,
"step": 1101
},
{
"epoch": 2.615979381443299,
"grad_norm": 1.1508782189162872,
"learning_rate": 3.888262859021507e-08,
"loss": 1.4025,
"step": 1102
},
{
"epoch": 2.618556701030928,
"grad_norm": 1.178209399181694,
"learning_rate": 3.872212409971507e-08,
"loss": 1.2948,
"step": 1103
},
{
"epoch": 2.6211340206185567,
"grad_norm": 1.32807190899102,
"learning_rate": 3.856187199296358e-08,
"loss": 1.5456,
"step": 1104
},
{
"epoch": 2.6237113402061856,
"grad_norm": 1.2185169437161736,
"learning_rate": 3.8401872929987166e-08,
"loss": 1.429,
"step": 1105
},
{
"epoch": 2.6262886597938144,
"grad_norm": 1.2304397213352538,
"learning_rate": 3.824212756977027e-08,
"loss": 1.4558,
"step": 1106
},
{
"epoch": 2.6288659793814433,
"grad_norm": 1.1724306586240414,
"learning_rate": 3.8082636570252346e-08,
"loss": 1.4984,
"step": 1107
},
{
"epoch": 2.631443298969072,
"grad_norm": 1.1298977167004856,
"learning_rate": 3.7923400588325147e-08,
"loss": 1.4417,
"step": 1108
},
{
"epoch": 2.634020618556701,
"grad_norm": 1.1784947581476026,
"learning_rate": 3.7764420279830266e-08,
"loss": 1.4164,
"step": 1109
},
{
"epoch": 2.63659793814433,
"grad_norm": 1.155170570736418,
"learning_rate": 3.7605696299556135e-08,
"loss": 1.4371,
"step": 1110
},
{
"epoch": 2.6391752577319587,
"grad_norm": 1.1663523776289366,
"learning_rate": 3.744722930123544e-08,
"loss": 1.4747,
"step": 1111
},
{
"epoch": 2.6417525773195876,
"grad_norm": 1.2126168901096435,
"learning_rate": 3.72890199375426e-08,
"loss": 1.5058,
"step": 1112
},
{
"epoch": 2.6443298969072164,
"grad_norm": 1.2017176914352923,
"learning_rate": 3.71310688600907e-08,
"loss": 1.4733,
"step": 1113
},
{
"epoch": 2.6469072164948453,
"grad_norm": 1.1119469160793427,
"learning_rate": 3.6973376719429125e-08,
"loss": 1.476,
"step": 1114
},
{
"epoch": 2.649484536082474,
"grad_norm": 1.130792424586462,
"learning_rate": 3.681594416504088e-08,
"loss": 1.4494,
"step": 1115
},
{
"epoch": 2.652061855670103,
"grad_norm": 1.222509795849272,
"learning_rate": 3.6658771845339676e-08,
"loss": 1.4999,
"step": 1116
},
{
"epoch": 2.654639175257732,
"grad_norm": 1.1385228914334713,
"learning_rate": 3.650186040766746e-08,
"loss": 1.4402,
"step": 1117
},
{
"epoch": 2.6572164948453607,
"grad_norm": 1.1448576075492045,
"learning_rate": 3.634521049829169e-08,
"loss": 1.4132,
"step": 1118
},
{
"epoch": 2.6597938144329896,
"grad_norm": 1.139064959062427,
"learning_rate": 3.618882276240267e-08,
"loss": 1.3994,
"step": 1119
},
{
"epoch": 2.6623711340206184,
"grad_norm": 1.161606746690635,
"learning_rate": 3.603269784411089e-08,
"loss": 1.4385,
"step": 1120
},
{
"epoch": 2.6649484536082473,
"grad_norm": 1.1300734708150515,
"learning_rate": 3.587683638644437e-08,
"loss": 1.4228,
"step": 1121
},
{
"epoch": 2.667525773195876,
"grad_norm": 1.1979334493577922,
"learning_rate": 3.572123903134606e-08,
"loss": 1.3946,
"step": 1122
},
{
"epoch": 2.670103092783505,
"grad_norm": 1.2108873546484593,
"learning_rate": 3.556590641967114e-08,
"loss": 1.4019,
"step": 1123
},
{
"epoch": 2.6726804123711343,
"grad_norm": 1.252184087003669,
"learning_rate": 3.5410839191184386e-08,
"loss": 1.4863,
"step": 1124
},
{
"epoch": 2.675257731958763,
"grad_norm": 1.1268238345165822,
"learning_rate": 3.525603798455753e-08,
"loss": 1.4624,
"step": 1125
},
{
"epoch": 2.677835051546392,
"grad_norm": 1.2410354943951132,
"learning_rate": 3.5101503437366676e-08,
"loss": 1.5426,
"step": 1126
},
{
"epoch": 2.680412371134021,
"grad_norm": 1.2054964281688654,
"learning_rate": 3.49472361860896e-08,
"loss": 1.4182,
"step": 1127
},
{
"epoch": 2.680412371134021,
"eval_loss": 1.4503966569900513,
"eval_runtime": 78.5776,
"eval_samples_per_second": 21.164,
"eval_steps_per_second": 1.324,
"step": 1127
},
{
"epoch": 2.6829896907216497,
"grad_norm": 1.18692856703466,
"learning_rate": 3.4793236866103294e-08,
"loss": 1.5021,
"step": 1128
},
{
"epoch": 2.6855670103092786,
"grad_norm": 1.099606075968585,
"learning_rate": 3.463950611168111e-08,
"loss": 1.4051,
"step": 1129
},
{
"epoch": 2.6881443298969074,
"grad_norm": 1.1712675559534376,
"learning_rate": 3.448604455599021e-08,
"loss": 1.4565,
"step": 1130
},
{
"epoch": 2.6907216494845363,
"grad_norm": 1.2365327819201322,
"learning_rate": 3.43328528310892e-08,
"loss": 1.4418,
"step": 1131
},
{
"epoch": 2.693298969072165,
"grad_norm": 1.1186618547215839,
"learning_rate": 3.4179931567925215e-08,
"loss": 1.4987,
"step": 1132
},
{
"epoch": 2.695876288659794,
"grad_norm": 1.2081208242761923,
"learning_rate": 3.402728139633142e-08,
"loss": 1.441,
"step": 1133
},
{
"epoch": 2.698453608247423,
"grad_norm": 1.218636962355054,
"learning_rate": 3.387490294502457e-08,
"loss": 1.4067,
"step": 1134
},
{
"epoch": 2.7010309278350517,
"grad_norm": 1.1637394002772754,
"learning_rate": 3.372279684160221e-08,
"loss": 1.5326,
"step": 1135
},
{
"epoch": 2.7036082474226806,
"grad_norm": 1.2353156557559488,
"learning_rate": 3.357096371254008e-08,
"loss": 1.472,
"step": 1136
},
{
"epoch": 2.7061855670103094,
"grad_norm": 1.19587166321243,
"learning_rate": 3.3419404183189813e-08,
"loss": 1.4886,
"step": 1137
},
{
"epoch": 2.7087628865979383,
"grad_norm": 1.1730315855085072,
"learning_rate": 3.326811887777606e-08,
"loss": 1.3887,
"step": 1138
},
{
"epoch": 2.711340206185567,
"grad_norm": 1.2017905489788439,
"learning_rate": 3.3117108419394036e-08,
"loss": 1.4376,
"step": 1139
},
{
"epoch": 2.713917525773196,
"grad_norm": 1.223875153650053,
"learning_rate": 3.2966373430007044e-08,
"loss": 1.4841,
"step": 1140
},
{
"epoch": 2.716494845360825,
"grad_norm": 1.163982928943064,
"learning_rate": 3.2815914530443656e-08,
"loss": 1.5057,
"step": 1141
},
{
"epoch": 2.7190721649484537,
"grad_norm": 1.1065194981403395,
"learning_rate": 3.2665732340395413e-08,
"loss": 1.5145,
"step": 1142
},
{
"epoch": 2.7216494845360826,
"grad_norm": 1.1802479694554426,
"learning_rate": 3.2515827478414227e-08,
"loss": 1.4639,
"step": 1143
},
{
"epoch": 2.7242268041237114,
"grad_norm": 1.1042272626565486,
"learning_rate": 3.236620056190972e-08,
"loss": 1.3944,
"step": 1144
},
{
"epoch": 2.7268041237113403,
"grad_norm": 1.2114102979959467,
"learning_rate": 3.221685220714674e-08,
"loss": 1.4298,
"step": 1145
},
{
"epoch": 2.729381443298969,
"grad_norm": 1.1393577034048052,
"learning_rate": 3.2067783029242866e-08,
"loss": 1.3856,
"step": 1146
},
{
"epoch": 2.731958762886598,
"grad_norm": 1.1037036354008587,
"learning_rate": 3.1918993642165804e-08,
"loss": 1.3889,
"step": 1147
},
{
"epoch": 2.734536082474227,
"grad_norm": 1.2272871402765764,
"learning_rate": 3.177048465873089e-08,
"loss": 1.4043,
"step": 1148
},
{
"epoch": 2.7371134020618557,
"grad_norm": 1.210586273197648,
"learning_rate": 3.1622256690598633e-08,
"loss": 1.4999,
"step": 1149
},
{
"epoch": 2.7396907216494846,
"grad_norm": 1.1746574581016895,
"learning_rate": 3.147431034827208e-08,
"loss": 1.4216,
"step": 1150
},
{
"epoch": 2.7422680412371134,
"grad_norm": 1.1586070909228363,
"learning_rate": 3.1326646241094336e-08,
"loss": 1.4696,
"step": 1151
},
{
"epoch": 2.7448453608247423,
"grad_norm": 1.1312629920265729,
"learning_rate": 3.11792649772461e-08,
"loss": 1.5172,
"step": 1152
},
{
"epoch": 2.747422680412371,
"grad_norm": 1.181603470826963,
"learning_rate": 3.1032167163743115e-08,
"loss": 1.4453,
"step": 1153
},
{
"epoch": 2.75,
"grad_norm": 1.1958639955584416,
"learning_rate": 3.0885353406433703e-08,
"loss": 1.5075,
"step": 1154
},
{
"epoch": 2.752577319587629,
"grad_norm": 1.200258914978432,
"learning_rate": 3.073882430999619e-08,
"loss": 1.409,
"step": 1155
},
{
"epoch": 2.7551546391752577,
"grad_norm": 1.1425311029684388,
"learning_rate": 3.05925804779366e-08,
"loss": 1.4537,
"step": 1156
},
{
"epoch": 2.7577319587628866,
"grad_norm": 1.1441189180372324,
"learning_rate": 3.044662251258595e-08,
"loss": 1.567,
"step": 1157
},
{
"epoch": 2.7603092783505154,
"grad_norm": 1.1519696479164119,
"learning_rate": 3.030095101509786e-08,
"loss": 1.4678,
"step": 1158
},
{
"epoch": 2.7628865979381443,
"grad_norm": 1.2588291000562302,
"learning_rate": 3.0155566585446114e-08,
"loss": 1.5141,
"step": 1159
},
{
"epoch": 2.765463917525773,
"grad_norm": 1.1712961770904633,
"learning_rate": 3.0010469822422156e-08,
"loss": 1.4298,
"step": 1160
},
{
"epoch": 2.768041237113402,
"grad_norm": 1.2155090578526457,
"learning_rate": 2.986566132363259e-08,
"loss": 1.5341,
"step": 1161
},
{
"epoch": 2.770618556701031,
"grad_norm": 1.1558741286842076,
"learning_rate": 2.972114168549682e-08,
"loss": 1.4089,
"step": 1162
},
{
"epoch": 2.7731958762886597,
"grad_norm": 1.281655267971227,
"learning_rate": 2.9576911503244494e-08,
"loss": 1.3596,
"step": 1163
},
{
"epoch": 2.7757731958762886,
"grad_norm": 1.1885614767244468,
"learning_rate": 2.9432971370912995e-08,
"loss": 1.4181,
"step": 1164
}
],
"logging_steps": 1,
"max_steps": 1552,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 388,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 305116087320576.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}