trace-ft-qvhighlights / trainer_state.json
Yongxin-Guo's picture
Upload 13 files
aa2278b verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 19.591836734693878,
"eval_steps": 500,
"global_step": 840,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.023323615160349854,
"grad_norm": 481.58927975506265,
"learning_rate": 1.9230769230769234e-07,
"loss": 4.1544,
"step": 1
},
{
"epoch": 0.04664723032069971,
"grad_norm": 480.04985092696364,
"learning_rate": 3.846153846153847e-07,
"loss": 4.1893,
"step": 2
},
{
"epoch": 0.06997084548104957,
"grad_norm": 464.6078319400137,
"learning_rate": 5.76923076923077e-07,
"loss": 3.9817,
"step": 3
},
{
"epoch": 0.09329446064139942,
"grad_norm": 313.6186353479334,
"learning_rate": 7.692307692307694e-07,
"loss": 3.5082,
"step": 4
},
{
"epoch": 0.11661807580174927,
"grad_norm": 130.28683050533158,
"learning_rate": 9.615384615384617e-07,
"loss": 3.1032,
"step": 5
},
{
"epoch": 0.13994169096209913,
"grad_norm": 108.72214082812204,
"learning_rate": 1.153846153846154e-06,
"loss": 2.9264,
"step": 6
},
{
"epoch": 0.16326530612244897,
"grad_norm": 90.00734833282364,
"learning_rate": 1.3461538461538462e-06,
"loss": 2.5326,
"step": 7
},
{
"epoch": 0.18658892128279883,
"grad_norm": 80.17560377949299,
"learning_rate": 1.5384615384615387e-06,
"loss": 2.2113,
"step": 8
},
{
"epoch": 0.2099125364431487,
"grad_norm": 256.89238079884734,
"learning_rate": 1.7307692307692308e-06,
"loss": 2.3292,
"step": 9
},
{
"epoch": 0.23323615160349853,
"grad_norm": 40.345129720791576,
"learning_rate": 1.9230769230769234e-06,
"loss": 1.8276,
"step": 10
},
{
"epoch": 0.2565597667638484,
"grad_norm": 37.352128701784686,
"learning_rate": 2.1153846153846155e-06,
"loss": 1.732,
"step": 11
},
{
"epoch": 0.27988338192419826,
"grad_norm": 39.91551747323652,
"learning_rate": 2.307692307692308e-06,
"loss": 1.558,
"step": 12
},
{
"epoch": 0.3032069970845481,
"grad_norm": 46.301839613156844,
"learning_rate": 2.5e-06,
"loss": 1.6342,
"step": 13
},
{
"epoch": 0.32653061224489793,
"grad_norm": 40.14073751930512,
"learning_rate": 2.6923076923076923e-06,
"loss": 1.4451,
"step": 14
},
{
"epoch": 0.3498542274052478,
"grad_norm": 42.4956914176308,
"learning_rate": 2.8846153846153845e-06,
"loss": 1.2808,
"step": 15
},
{
"epoch": 0.37317784256559766,
"grad_norm": 33.67740673818837,
"learning_rate": 3.0769230769230774e-06,
"loss": 1.0825,
"step": 16
},
{
"epoch": 0.3965014577259475,
"grad_norm": 129.23064107923477,
"learning_rate": 3.2692307692307696e-06,
"loss": 1.0782,
"step": 17
},
{
"epoch": 0.4198250728862974,
"grad_norm": 36.39439855381251,
"learning_rate": 3.4615384615384617e-06,
"loss": 0.6994,
"step": 18
},
{
"epoch": 0.44314868804664725,
"grad_norm": 15.512433879259516,
"learning_rate": 3.653846153846154e-06,
"loss": 0.5511,
"step": 19
},
{
"epoch": 0.46647230320699706,
"grad_norm": 18.30849165549196,
"learning_rate": 3.846153846153847e-06,
"loss": 0.5391,
"step": 20
},
{
"epoch": 0.4897959183673469,
"grad_norm": 20.61130656343632,
"learning_rate": 4.0384615384615385e-06,
"loss": 0.5451,
"step": 21
},
{
"epoch": 0.5131195335276968,
"grad_norm": 14.982450264158022,
"learning_rate": 4.230769230769231e-06,
"loss": 0.4925,
"step": 22
},
{
"epoch": 0.5364431486880467,
"grad_norm": 13.209296703805986,
"learning_rate": 4.423076923076924e-06,
"loss": 0.4292,
"step": 23
},
{
"epoch": 0.5597667638483965,
"grad_norm": 8.1438483301815,
"learning_rate": 4.615384615384616e-06,
"loss": 0.3799,
"step": 24
},
{
"epoch": 0.5830903790087464,
"grad_norm": 19.941972093465644,
"learning_rate": 4.807692307692308e-06,
"loss": 0.3865,
"step": 25
},
{
"epoch": 0.6064139941690962,
"grad_norm": 11.016048515240694,
"learning_rate": 5e-06,
"loss": 0.3728,
"step": 26
},
{
"epoch": 0.6297376093294461,
"grad_norm": 15.074313900645539,
"learning_rate": 4.999981380826043e-06,
"loss": 0.3462,
"step": 27
},
{
"epoch": 0.6530612244897959,
"grad_norm": 8.543797917151428,
"learning_rate": 4.999925523581508e-06,
"loss": 0.3476,
"step": 28
},
{
"epoch": 0.6763848396501457,
"grad_norm": 13.753135185003897,
"learning_rate": 4.99983242909841e-06,
"loss": 0.3278,
"step": 29
},
{
"epoch": 0.6997084548104956,
"grad_norm": 8.608638115725363,
"learning_rate": 4.999702098763422e-06,
"loss": 0.2821,
"step": 30
},
{
"epoch": 0.7230320699708455,
"grad_norm": 6.19614153747254,
"learning_rate": 4.9995345345178584e-06,
"loss": 0.2994,
"step": 31
},
{
"epoch": 0.7463556851311953,
"grad_norm": 7.678155942302579,
"learning_rate": 4.999329738857645e-06,
"loss": 0.2916,
"step": 32
},
{
"epoch": 0.7696793002915452,
"grad_norm": 8.051557345234135,
"learning_rate": 4.999087714833284e-06,
"loss": 0.277,
"step": 33
},
{
"epoch": 0.793002915451895,
"grad_norm": 6.334602262299757,
"learning_rate": 4.998808466049803e-06,
"loss": 0.2597,
"step": 34
},
{
"epoch": 0.8163265306122449,
"grad_norm": 5.092340192351995,
"learning_rate": 4.998491996666711e-06,
"loss": 0.2621,
"step": 35
},
{
"epoch": 0.8396501457725948,
"grad_norm": 12.200374434931867,
"learning_rate": 4.998138311397924e-06,
"loss": 0.2351,
"step": 36
},
{
"epoch": 0.8629737609329446,
"grad_norm": 5.732076823622836,
"learning_rate": 4.997747415511705e-06,
"loss": 0.202,
"step": 37
},
{
"epoch": 0.8862973760932945,
"grad_norm": 5.650767667488515,
"learning_rate": 4.9973193148305784e-06,
"loss": 0.1888,
"step": 38
},
{
"epoch": 0.9096209912536443,
"grad_norm": 8.206628251201462,
"learning_rate": 4.996854015731253e-06,
"loss": 0.1775,
"step": 39
},
{
"epoch": 0.9329446064139941,
"grad_norm": 8.145125128228061,
"learning_rate": 4.996351525144515e-06,
"loss": 0.1743,
"step": 40
},
{
"epoch": 0.956268221574344,
"grad_norm": 9.558683543841621,
"learning_rate": 4.995811850555131e-06,
"loss": 0.1706,
"step": 41
},
{
"epoch": 0.9795918367346939,
"grad_norm": 21.870848517891986,
"learning_rate": 4.995235000001739e-06,
"loss": 0.192,
"step": 42
},
{
"epoch": 1.0029154518950438,
"grad_norm": 258.46039046147865,
"learning_rate": 4.994620982076721e-06,
"loss": 0.4032,
"step": 43
},
{
"epoch": 1.0262390670553936,
"grad_norm": 26.567296212210675,
"learning_rate": 4.993969805926085e-06,
"loss": 0.5322,
"step": 44
},
{
"epoch": 1.0495626822157433,
"grad_norm": 25.494805498804556,
"learning_rate": 4.99328148124932e-06,
"loss": 0.6008,
"step": 45
},
{
"epoch": 1.0728862973760933,
"grad_norm": 15.470490452118884,
"learning_rate": 4.992556018299255e-06,
"loss": 0.4806,
"step": 46
},
{
"epoch": 1.096209912536443,
"grad_norm": 14.107985790378562,
"learning_rate": 4.9917934278819055e-06,
"loss": 0.4627,
"step": 47
},
{
"epoch": 1.119533527696793,
"grad_norm": 12.124582814383823,
"learning_rate": 4.990993721356317e-06,
"loss": 0.4231,
"step": 48
},
{
"epoch": 1.1428571428571428,
"grad_norm": 8.12927884627824,
"learning_rate": 4.990156910634387e-06,
"loss": 0.3367,
"step": 49
},
{
"epoch": 1.1661807580174928,
"grad_norm": 8.363524360811459,
"learning_rate": 4.989283008180697e-06,
"loss": 0.3325,
"step": 50
},
{
"epoch": 1.1895043731778425,
"grad_norm": 8.263812437870355,
"learning_rate": 4.988372027012319e-06,
"loss": 0.3007,
"step": 51
},
{
"epoch": 1.2128279883381925,
"grad_norm": 7.277527999797857,
"learning_rate": 4.987423980698627e-06,
"loss": 0.294,
"step": 52
},
{
"epoch": 1.2361516034985423,
"grad_norm": 12.255979985711312,
"learning_rate": 4.986438883361092e-06,
"loss": 0.3,
"step": 53
},
{
"epoch": 1.259475218658892,
"grad_norm": 20.381703777578387,
"learning_rate": 4.985416749673075e-06,
"loss": 0.2891,
"step": 54
},
{
"epoch": 1.282798833819242,
"grad_norm": 11.379464750926473,
"learning_rate": 4.9843575948596e-06,
"loss": 0.279,
"step": 55
},
{
"epoch": 1.306122448979592,
"grad_norm": 3.571428587777274,
"learning_rate": 4.983261434697141e-06,
"loss": 0.2591,
"step": 56
},
{
"epoch": 1.3294460641399417,
"grad_norm": 3.424764867354762,
"learning_rate": 4.982128285513373e-06,
"loss": 0.2327,
"step": 57
},
{
"epoch": 1.3527696793002915,
"grad_norm": 4.582571324669013,
"learning_rate": 4.98095816418694e-06,
"loss": 0.243,
"step": 58
},
{
"epoch": 1.3760932944606414,
"grad_norm": 6.075638838721734,
"learning_rate": 4.979751088147192e-06,
"loss": 0.2754,
"step": 59
},
{
"epoch": 1.3994169096209912,
"grad_norm": 6.412677796633595,
"learning_rate": 4.97850707537394e-06,
"loss": 0.261,
"step": 60
},
{
"epoch": 1.4227405247813412,
"grad_norm": 5.315104126793953,
"learning_rate": 4.977226144397174e-06,
"loss": 0.2357,
"step": 61
},
{
"epoch": 1.446064139941691,
"grad_norm": 6.211582673349952,
"learning_rate": 4.9759083142967965e-06,
"loss": 0.2632,
"step": 62
},
{
"epoch": 1.469387755102041,
"grad_norm": 5.980458863173571,
"learning_rate": 4.974553604702332e-06,
"loss": 0.251,
"step": 63
},
{
"epoch": 1.4927113702623906,
"grad_norm": 5.652339700672357,
"learning_rate": 4.973162035792641e-06,
"loss": 0.2476,
"step": 64
},
{
"epoch": 1.5160349854227406,
"grad_norm": 4.840728604985413,
"learning_rate": 4.971733628295614e-06,
"loss": 0.235,
"step": 65
},
{
"epoch": 1.5393586005830904,
"grad_norm": 3.658991641610298,
"learning_rate": 4.970268403487866e-06,
"loss": 0.2321,
"step": 66
},
{
"epoch": 1.5626822157434401,
"grad_norm": 3.487525781242212,
"learning_rate": 4.9687663831944156e-06,
"loss": 0.2429,
"step": 67
},
{
"epoch": 1.58600583090379,
"grad_norm": 3.749615204460126,
"learning_rate": 4.967227589788365e-06,
"loss": 0.2394,
"step": 68
},
{
"epoch": 1.60932944606414,
"grad_norm": 4.290007606979775,
"learning_rate": 4.965652046190565e-06,
"loss": 0.2362,
"step": 69
},
{
"epoch": 1.6326530612244898,
"grad_norm": 4.69637810774778,
"learning_rate": 4.964039775869271e-06,
"loss": 0.2208,
"step": 70
},
{
"epoch": 1.6559766763848396,
"grad_norm": 3.6428908472078065,
"learning_rate": 4.962390802839797e-06,
"loss": 0.234,
"step": 71
},
{
"epoch": 1.6793002915451893,
"grad_norm": 2.7063967799319495,
"learning_rate": 4.960705151664155e-06,
"loss": 0.2171,
"step": 72
},
{
"epoch": 1.7026239067055393,
"grad_norm": 2.9521513448960213,
"learning_rate": 4.9589828474506906e-06,
"loss": 0.2166,
"step": 73
},
{
"epoch": 1.7259475218658893,
"grad_norm": 3.575771040565363,
"learning_rate": 4.9572239158537095e-06,
"loss": 0.2117,
"step": 74
},
{
"epoch": 1.749271137026239,
"grad_norm": 4.218830249164411,
"learning_rate": 4.955428383073094e-06,
"loss": 0.2067,
"step": 75
},
{
"epoch": 1.7725947521865888,
"grad_norm": 5.195007590617856,
"learning_rate": 4.9535962758539155e-06,
"loss": 0.2222,
"step": 76
},
{
"epoch": 1.7959183673469388,
"grad_norm": 4.3676585044114,
"learning_rate": 4.951727621486031e-06,
"loss": 0.1933,
"step": 77
},
{
"epoch": 1.8192419825072887,
"grad_norm": 3.8250350003105953,
"learning_rate": 4.949822447803681e-06,
"loss": 0.1821,
"step": 78
},
{
"epoch": 1.8425655976676385,
"grad_norm": 3.7621739192349652,
"learning_rate": 4.947880783185074e-06,
"loss": 0.1907,
"step": 79
},
{
"epoch": 1.8658892128279883,
"grad_norm": 3.2426278056631124,
"learning_rate": 4.945902656551964e-06,
"loss": 0.1862,
"step": 80
},
{
"epoch": 1.8892128279883382,
"grad_norm": 3.0607040744667753,
"learning_rate": 4.943888097369216e-06,
"loss": 0.1903,
"step": 81
},
{
"epoch": 1.9125364431486882,
"grad_norm": 1.9997107177378481,
"learning_rate": 4.941837135644374e-06,
"loss": 0.1699,
"step": 82
},
{
"epoch": 1.935860058309038,
"grad_norm": 2.9208613794769716,
"learning_rate": 4.939749801927208e-06,
"loss": 0.1733,
"step": 83
},
{
"epoch": 1.9591836734693877,
"grad_norm": 2.446872104141256,
"learning_rate": 4.9376261273092614e-06,
"loss": 0.1755,
"step": 84
},
{
"epoch": 1.9825072886297375,
"grad_norm": 2.3668168284457636,
"learning_rate": 4.935466143423389e-06,
"loss": 0.1675,
"step": 85
},
{
"epoch": 2.0058309037900877,
"grad_norm": 2.2402757742265083,
"learning_rate": 4.933269882443281e-06,
"loss": 0.1691,
"step": 86
},
{
"epoch": 2.0291545189504374,
"grad_norm": 1.3263354642981466,
"learning_rate": 4.9310373770829925e-06,
"loss": 0.1523,
"step": 87
},
{
"epoch": 2.052478134110787,
"grad_norm": 2.836622255323569,
"learning_rate": 4.928768660596446e-06,
"loss": 0.156,
"step": 88
},
{
"epoch": 2.075801749271137,
"grad_norm": 2.479270537521849,
"learning_rate": 4.926463766776945e-06,
"loss": 0.1613,
"step": 89
},
{
"epoch": 2.0991253644314867,
"grad_norm": 2.975995828348743,
"learning_rate": 4.924122729956662e-06,
"loss": 0.1495,
"step": 90
},
{
"epoch": 2.122448979591837,
"grad_norm": 1.9541545138221803,
"learning_rate": 4.921745585006135e-06,
"loss": 0.1527,
"step": 91
},
{
"epoch": 2.1457725947521866,
"grad_norm": 4.856180692789397,
"learning_rate": 4.919332367333748e-06,
"loss": 0.1459,
"step": 92
},
{
"epoch": 2.1690962099125364,
"grad_norm": 4.550523899643916,
"learning_rate": 4.916883112885192e-06,
"loss": 0.1442,
"step": 93
},
{
"epoch": 2.192419825072886,
"grad_norm": 4.080918564709067,
"learning_rate": 4.914397858142945e-06,
"loss": 0.145,
"step": 94
},
{
"epoch": 2.2157434402332363,
"grad_norm": 3.177257762352267,
"learning_rate": 4.911876640125719e-06,
"loss": 0.1464,
"step": 95
},
{
"epoch": 2.239067055393586,
"grad_norm": 3.2919310612465007,
"learning_rate": 4.909319496387911e-06,
"loss": 0.1405,
"step": 96
},
{
"epoch": 2.262390670553936,
"grad_norm": 2.4487248703045426,
"learning_rate": 4.9067264650190436e-06,
"loss": 0.1401,
"step": 97
},
{
"epoch": 2.2857142857142856,
"grad_norm": 3.2548940929053702,
"learning_rate": 4.904097584643201e-06,
"loss": 0.1425,
"step": 98
},
{
"epoch": 2.3090379008746353,
"grad_norm": 2.3394224777927675,
"learning_rate": 4.901432894418446e-06,
"loss": 0.144,
"step": 99
},
{
"epoch": 2.3323615160349855,
"grad_norm": 1.987074930092349,
"learning_rate": 4.8987324340362445e-06,
"loss": 0.1271,
"step": 100
},
{
"epoch": 2.3556851311953353,
"grad_norm": 2.269137691444028,
"learning_rate": 4.895996243720868e-06,
"loss": 0.1273,
"step": 101
},
{
"epoch": 2.379008746355685,
"grad_norm": 3.6904521698177315,
"learning_rate": 4.8932243642288e-06,
"loss": 0.1418,
"step": 102
},
{
"epoch": 2.402332361516035,
"grad_norm": 7.437555393044772,
"learning_rate": 4.890416836848128e-06,
"loss": 0.1359,
"step": 103
},
{
"epoch": 2.425655976676385,
"grad_norm": 6.088458615126284,
"learning_rate": 4.887573703397921e-06,
"loss": 0.1386,
"step": 104
},
{
"epoch": 2.4489795918367347,
"grad_norm": 1.9457975954726117,
"learning_rate": 4.884695006227619e-06,
"loss": 0.1227,
"step": 105
},
{
"epoch": 2.4723032069970845,
"grad_norm": 2.127917923004136,
"learning_rate": 4.8817807882163904e-06,
"loss": 0.1284,
"step": 106
},
{
"epoch": 2.4956268221574343,
"grad_norm": 4.11663293451448,
"learning_rate": 4.878831092772501e-06,
"loss": 0.129,
"step": 107
},
{
"epoch": 2.518950437317784,
"grad_norm": 6.08617442775337,
"learning_rate": 4.875845963832667e-06,
"loss": 0.1305,
"step": 108
},
{
"epoch": 2.542274052478134,
"grad_norm": 1.6921004000832196,
"learning_rate": 4.872825445861395e-06,
"loss": 0.1356,
"step": 109
},
{
"epoch": 2.565597667638484,
"grad_norm": 10.251083316775865,
"learning_rate": 4.869769583850324e-06,
"loss": 0.1247,
"step": 110
},
{
"epoch": 2.5889212827988337,
"grad_norm": 8.60721468259536,
"learning_rate": 4.8666784233175566e-06,
"loss": 0.1315,
"step": 111
},
{
"epoch": 2.612244897959184,
"grad_norm": 6.1560755671076235,
"learning_rate": 4.863552010306976e-06,
"loss": 0.1321,
"step": 112
},
{
"epoch": 2.6355685131195337,
"grad_norm": 9.37865041235299,
"learning_rate": 4.860390391387566e-06,
"loss": 0.1327,
"step": 113
},
{
"epoch": 2.6588921282798834,
"grad_norm": 4.762581475629758,
"learning_rate": 4.857193613652711e-06,
"loss": 0.1255,
"step": 114
},
{
"epoch": 2.682215743440233,
"grad_norm": 6.745590182330897,
"learning_rate": 4.8539617247195e-06,
"loss": 0.1446,
"step": 115
},
{
"epoch": 2.705539358600583,
"grad_norm": 3.3151863289735295,
"learning_rate": 4.850694772728015e-06,
"loss": 0.125,
"step": 116
},
{
"epoch": 2.7288629737609327,
"grad_norm": 3.6648399975915424,
"learning_rate": 4.847392806340615e-06,
"loss": 0.1308,
"step": 117
},
{
"epoch": 2.752186588921283,
"grad_norm": 3.9684370022599276,
"learning_rate": 4.844055874741208e-06,
"loss": 0.1276,
"step": 118
},
{
"epoch": 2.7755102040816326,
"grad_norm": 3.9099707621386135,
"learning_rate": 4.8406840276345215e-06,
"loss": 0.1353,
"step": 119
},
{
"epoch": 2.7988338192419824,
"grad_norm": 2.4159538489348478,
"learning_rate": 4.837277315245364e-06,
"loss": 0.121,
"step": 120
},
{
"epoch": 2.8221574344023326,
"grad_norm": 3.3742191683006513,
"learning_rate": 4.833835788317869e-06,
"loss": 0.1338,
"step": 121
},
{
"epoch": 2.8454810495626823,
"grad_norm": 2.2056302375331316,
"learning_rate": 4.830359498114749e-06,
"loss": 0.1257,
"step": 122
},
{
"epoch": 2.868804664723032,
"grad_norm": 2.4034843900335123,
"learning_rate": 4.826848496416526e-06,
"loss": 0.1348,
"step": 123
},
{
"epoch": 2.892128279883382,
"grad_norm": 2.2255115906707745,
"learning_rate": 4.82330283552076e-06,
"loss": 0.137,
"step": 124
},
{
"epoch": 2.9154518950437316,
"grad_norm": 3.043279002757231,
"learning_rate": 4.819722568241274e-06,
"loss": 0.1339,
"step": 125
},
{
"epoch": 2.938775510204082,
"grad_norm": 2.172615748264224,
"learning_rate": 4.816107747907362e-06,
"loss": 0.1245,
"step": 126
},
{
"epoch": 2.9620991253644315,
"grad_norm": 3.9756197608178,
"learning_rate": 4.812458428362999e-06,
"loss": 0.1256,
"step": 127
},
{
"epoch": 2.9854227405247813,
"grad_norm": 1.6228193910931026,
"learning_rate": 4.808774663966039e-06,
"loss": 0.1268,
"step": 128
},
{
"epoch": 3.008746355685131,
"grad_norm": 2.666938281134312,
"learning_rate": 4.8050565095874e-06,
"loss": 0.1239,
"step": 129
},
{
"epoch": 3.0320699708454812,
"grad_norm": 2.427622711977247,
"learning_rate": 4.801304020610255e-06,
"loss": 0.1271,
"step": 130
},
{
"epoch": 3.055393586005831,
"grad_norm": 3.238407882588861,
"learning_rate": 4.7975172529291965e-06,
"loss": 0.1279,
"step": 131
},
{
"epoch": 3.0787172011661808,
"grad_norm": 4.385283020351036,
"learning_rate": 4.793696262949417e-06,
"loss": 0.1246,
"step": 132
},
{
"epoch": 3.1020408163265305,
"grad_norm": 1.7220244687210526,
"learning_rate": 4.789841107585855e-06,
"loss": 0.1151,
"step": 133
},
{
"epoch": 3.1253644314868803,
"grad_norm": 2.19739951848995,
"learning_rate": 4.785951844262361e-06,
"loss": 0.1215,
"step": 134
},
{
"epoch": 3.1486880466472305,
"grad_norm": 3.1169982699749537,
"learning_rate": 4.782028530910827e-06,
"loss": 0.1241,
"step": 135
},
{
"epoch": 3.17201166180758,
"grad_norm": 3.16924293190244,
"learning_rate": 4.77807122597034e-06,
"loss": 0.1209,
"step": 136
},
{
"epoch": 3.19533527696793,
"grad_norm": 3.8621927083561602,
"learning_rate": 4.7740799883862966e-06,
"loss": 0.1173,
"step": 137
},
{
"epoch": 3.2186588921282797,
"grad_norm": 2.9142580350261293,
"learning_rate": 4.770054877609534e-06,
"loss": 0.1171,
"step": 138
},
{
"epoch": 3.24198250728863,
"grad_norm": 5.773960446017771,
"learning_rate": 4.765995953595446e-06,
"loss": 0.1298,
"step": 139
},
{
"epoch": 3.2653061224489797,
"grad_norm": 3.8584285919316486,
"learning_rate": 4.761903276803078e-06,
"loss": 0.1254,
"step": 140
},
{
"epoch": 3.2886297376093294,
"grad_norm": 5.500769127907038,
"learning_rate": 4.7577769081942425e-06,
"loss": 0.1245,
"step": 141
},
{
"epoch": 3.311953352769679,
"grad_norm": 3.679145141285145,
"learning_rate": 4.753616909232597e-06,
"loss": 0.124,
"step": 142
},
{
"epoch": 3.335276967930029,
"grad_norm": 4.3867652734342855,
"learning_rate": 4.749423341882738e-06,
"loss": 0.1182,
"step": 143
},
{
"epoch": 3.358600583090379,
"grad_norm": 2.816608564541258,
"learning_rate": 4.745196268609274e-06,
"loss": 0.1267,
"step": 144
},
{
"epoch": 3.381924198250729,
"grad_norm": 11.096184949547764,
"learning_rate": 4.740935752375893e-06,
"loss": 0.1297,
"step": 145
},
{
"epoch": 3.4052478134110786,
"grad_norm": 1.649738611844767,
"learning_rate": 4.736641856644431e-06,
"loss": 0.1211,
"step": 146
},
{
"epoch": 3.4285714285714284,
"grad_norm": 8.186265226248914,
"learning_rate": 4.732314645373922e-06,
"loss": 0.1353,
"step": 147
},
{
"epoch": 3.4518950437317786,
"grad_norm": 5.788547837910415,
"learning_rate": 4.727954183019644e-06,
"loss": 0.125,
"step": 148
},
{
"epoch": 3.4752186588921283,
"grad_norm": 7.463202678882185,
"learning_rate": 4.723560534532163e-06,
"loss": 0.1214,
"step": 149
},
{
"epoch": 3.498542274052478,
"grad_norm": 9.046619892641317,
"learning_rate": 4.7191337653563635e-06,
"loss": 0.1272,
"step": 150
},
{
"epoch": 3.521865889212828,
"grad_norm": 4.648476069955531,
"learning_rate": 4.714673941430474e-06,
"loss": 0.1204,
"step": 151
},
{
"epoch": 3.5451895043731776,
"grad_norm": 6.417386827886447,
"learning_rate": 4.710181129185085e-06,
"loss": 0.1299,
"step": 152
},
{
"epoch": 3.568513119533528,
"grad_norm": 5.286844754933566,
"learning_rate": 4.705655395542158e-06,
"loss": 0.1263,
"step": 153
},
{
"epoch": 3.5918367346938775,
"grad_norm": 2.578749240092962,
"learning_rate": 4.70109680791403e-06,
"loss": 0.1209,
"step": 154
},
{
"epoch": 3.6151603498542273,
"grad_norm": 9.466600835150404,
"learning_rate": 4.696505434202411e-06,
"loss": 0.1281,
"step": 155
},
{
"epoch": 3.6384839650145775,
"grad_norm": 1.9708225301025741,
"learning_rate": 4.691881342797368e-06,
"loss": 0.1187,
"step": 156
},
{
"epoch": 3.6618075801749272,
"grad_norm": 3.655745903542407,
"learning_rate": 4.6872246025763135e-06,
"loss": 0.1158,
"step": 157
},
{
"epoch": 3.685131195335277,
"grad_norm": 4.730977164178507,
"learning_rate": 4.6825352829029705e-06,
"loss": 0.1267,
"step": 158
},
{
"epoch": 3.7084548104956268,
"grad_norm": 4.191588270744791,
"learning_rate": 4.677813453626347e-06,
"loss": 0.1249,
"step": 159
},
{
"epoch": 3.7317784256559765,
"grad_norm": 2.8625748870249113,
"learning_rate": 4.67305918507969e-06,
"loss": 0.1103,
"step": 160
},
{
"epoch": 3.7551020408163263,
"grad_norm": 7.161616682671691,
"learning_rate": 4.668272548079445e-06,
"loss": 0.1147,
"step": 161
},
{
"epoch": 3.7784256559766765,
"grad_norm": 1.9603442628349634,
"learning_rate": 4.66345361392419e-06,
"loss": 0.1146,
"step": 162
},
{
"epoch": 3.801749271137026,
"grad_norm": 3.434376077417512,
"learning_rate": 4.658602454393584e-06,
"loss": 0.1186,
"step": 163
},
{
"epoch": 3.825072886297376,
"grad_norm": 5.614931038002465,
"learning_rate": 4.653719141747297e-06,
"loss": 0.1335,
"step": 164
},
{
"epoch": 3.848396501457726,
"grad_norm": 3.7351012222076014,
"learning_rate": 4.648803748723925e-06,
"loss": 0.1182,
"step": 165
},
{
"epoch": 3.871720116618076,
"grad_norm": 4.285033398991045,
"learning_rate": 4.643856348539913e-06,
"loss": 0.1285,
"step": 166
},
{
"epoch": 3.8950437317784257,
"grad_norm": 5.9516731153780995,
"learning_rate": 4.638877014888468e-06,
"loss": 0.1204,
"step": 167
},
{
"epoch": 3.9183673469387754,
"grad_norm": 2.98542040345268,
"learning_rate": 4.633865821938449e-06,
"loss": 0.1184,
"step": 168
},
{
"epoch": 3.941690962099125,
"grad_norm": 3.50350218349349,
"learning_rate": 4.6288228443332786e-06,
"loss": 0.1166,
"step": 169
},
{
"epoch": 3.9650145772594754,
"grad_norm": 4.810586127656542,
"learning_rate": 4.623748157189817e-06,
"loss": 0.1217,
"step": 170
},
{
"epoch": 3.988338192419825,
"grad_norm": 2.569654107918901,
"learning_rate": 4.61864183609725e-06,
"loss": 0.1212,
"step": 171
},
{
"epoch": 4.011661807580175,
"grad_norm": 1.9366280839570846,
"learning_rate": 4.613503957115963e-06,
"loss": 0.1188,
"step": 172
},
{
"epoch": 4.034985422740525,
"grad_norm": 3.4762352264320815,
"learning_rate": 4.608334596776406e-06,
"loss": 0.1222,
"step": 173
},
{
"epoch": 4.058309037900875,
"grad_norm": 4.344331551462011,
"learning_rate": 4.603133832077953e-06,
"loss": 0.1239,
"step": 174
},
{
"epoch": 4.081632653061225,
"grad_norm": 2.095141275963881,
"learning_rate": 4.597901740487761e-06,
"loss": 0.1167,
"step": 175
},
{
"epoch": 4.104956268221574,
"grad_norm": 2.8477141867605917,
"learning_rate": 4.592638399939606e-06,
"loss": 0.1131,
"step": 176
},
{
"epoch": 4.128279883381924,
"grad_norm": 3.481611858306002,
"learning_rate": 4.587343888832732e-06,
"loss": 0.117,
"step": 177
},
{
"epoch": 4.151603498542274,
"grad_norm": 2.8284834323753216,
"learning_rate": 4.582018286030677e-06,
"loss": 0.1093,
"step": 178
},
{
"epoch": 4.174927113702624,
"grad_norm": 2.1278869988762494,
"learning_rate": 4.576661670860102e-06,
"loss": 0.1205,
"step": 179
},
{
"epoch": 4.198250728862973,
"grad_norm": 4.183833780535433,
"learning_rate": 4.571274123109606e-06,
"loss": 0.1121,
"step": 180
},
{
"epoch": 4.221574344023324,
"grad_norm": 2.1166025177053305,
"learning_rate": 4.565855723028539e-06,
"loss": 0.1044,
"step": 181
},
{
"epoch": 4.244897959183674,
"grad_norm": 2.479368816009341,
"learning_rate": 4.560406551325811e-06,
"loss": 0.1161,
"step": 182
},
{
"epoch": 4.2682215743440235,
"grad_norm": 2.987320649577329,
"learning_rate": 4.55492668916868e-06,
"loss": 0.1211,
"step": 183
},
{
"epoch": 4.291545189504373,
"grad_norm": 1.9760419439041783,
"learning_rate": 4.5494162181815515e-06,
"loss": 0.1204,
"step": 184
},
{
"epoch": 4.314868804664723,
"grad_norm": 4.117652128521018,
"learning_rate": 4.543875220444761e-06,
"loss": 0.1175,
"step": 185
},
{
"epoch": 4.338192419825073,
"grad_norm": 3.226732595915632,
"learning_rate": 4.5383037784933494e-06,
"loss": 0.115,
"step": 186
},
{
"epoch": 4.3615160349854225,
"grad_norm": 2.9884740031788435,
"learning_rate": 4.532701975315832e-06,
"loss": 0.1163,
"step": 187
},
{
"epoch": 4.384839650145772,
"grad_norm": 4.084097628525921,
"learning_rate": 4.52706989435297e-06,
"loss": 0.1098,
"step": 188
},
{
"epoch": 4.408163265306122,
"grad_norm": 2.097393907448701,
"learning_rate": 4.521407619496517e-06,
"loss": 0.1168,
"step": 189
},
{
"epoch": 4.431486880466473,
"grad_norm": 2.798836132392157,
"learning_rate": 4.515715235087979e-06,
"loss": 0.1085,
"step": 190
},
{
"epoch": 4.454810495626822,
"grad_norm": 5.007862394325352,
"learning_rate": 4.509992825917352e-06,
"loss": 0.1186,
"step": 191
},
{
"epoch": 4.478134110787172,
"grad_norm": 1.7983696733373176,
"learning_rate": 4.504240477221861e-06,
"loss": 0.1163,
"step": 192
},
{
"epoch": 4.501457725947522,
"grad_norm": 3.8700602640590405,
"learning_rate": 4.498458274684691e-06,
"loss": 0.1175,
"step": 193
},
{
"epoch": 4.524781341107872,
"grad_norm": 2.6360213197995006,
"learning_rate": 4.492646304433711e-06,
"loss": 0.1214,
"step": 194
},
{
"epoch": 4.548104956268221,
"grad_norm": 1.6210900731791886,
"learning_rate": 4.486804653040187e-06,
"loss": 0.1175,
"step": 195
},
{
"epoch": 4.571428571428571,
"grad_norm": 4.3135489694582185,
"learning_rate": 4.4809334075175e-06,
"loss": 0.1199,
"step": 196
},
{
"epoch": 4.594752186588921,
"grad_norm": 1.5436096405592212,
"learning_rate": 4.475032655319842e-06,
"loss": 0.1111,
"step": 197
},
{
"epoch": 4.618075801749271,
"grad_norm": 2.1458534931112387,
"learning_rate": 4.469102484340919e-06,
"loss": 0.113,
"step": 198
},
{
"epoch": 4.641399416909621,
"grad_norm": 1.8801712567873288,
"learning_rate": 4.463142982912638e-06,
"loss": 0.1092,
"step": 199
},
{
"epoch": 4.664723032069971,
"grad_norm": 2.376405199853165,
"learning_rate": 4.457154239803796e-06,
"loss": 0.1186,
"step": 200
},
{
"epoch": 4.688046647230321,
"grad_norm": 2.029847820196563,
"learning_rate": 4.451136344218751e-06,
"loss": 0.1197,
"step": 201
},
{
"epoch": 4.711370262390671,
"grad_norm": 1.947337066043897,
"learning_rate": 4.445089385796099e-06,
"loss": 0.1178,
"step": 202
},
{
"epoch": 4.73469387755102,
"grad_norm": 4.016814890510145,
"learning_rate": 4.439013454607338e-06,
"loss": 0.1127,
"step": 203
},
{
"epoch": 4.75801749271137,
"grad_norm": 3.0189359998562453,
"learning_rate": 4.432908641155522e-06,
"loss": 0.1215,
"step": 204
},
{
"epoch": 4.78134110787172,
"grad_norm": 4.180011943453285,
"learning_rate": 4.4267750363739195e-06,
"loss": 0.1166,
"step": 205
},
{
"epoch": 4.80466472303207,
"grad_norm": 2.289589362328887,
"learning_rate": 4.420612731624652e-06,
"loss": 0.108,
"step": 206
},
{
"epoch": 4.827988338192419,
"grad_norm": 6.041807431740632,
"learning_rate": 4.414421818697341e-06,
"loss": 0.1171,
"step": 207
},
{
"epoch": 4.85131195335277,
"grad_norm": 2.4860177037352154,
"learning_rate": 4.408202389807733e-06,
"loss": 0.119,
"step": 208
},
{
"epoch": 4.87463556851312,
"grad_norm": 2.635976458014569,
"learning_rate": 4.401954537596332e-06,
"loss": 0.109,
"step": 209
},
{
"epoch": 4.8979591836734695,
"grad_norm": 3.1423969489220687,
"learning_rate": 4.395678355127013e-06,
"loss": 0.1217,
"step": 210
},
{
"epoch": 4.921282798833819,
"grad_norm": 2.382465975219246,
"learning_rate": 4.3893739358856465e-06,
"loss": 0.1074,
"step": 211
},
{
"epoch": 4.944606413994169,
"grad_norm": 2.057841386063691,
"learning_rate": 4.383041373778691e-06,
"loss": 0.1147,
"step": 212
},
{
"epoch": 4.967930029154519,
"grad_norm": 3.155060950172627,
"learning_rate": 4.376680763131811e-06,
"loss": 0.116,
"step": 213
},
{
"epoch": 4.9912536443148685,
"grad_norm": 2.865124701004675,
"learning_rate": 4.3702921986884576e-06,
"loss": 0.1147,
"step": 214
},
{
"epoch": 5.014577259475218,
"grad_norm": 1.9051999811650222,
"learning_rate": 4.363875775608464e-06,
"loss": 0.1101,
"step": 215
},
{
"epoch": 5.037900874635569,
"grad_norm": 2.938649054269692,
"learning_rate": 4.357431589466629e-06,
"loss": 0.1107,
"step": 216
},
{
"epoch": 5.061224489795919,
"grad_norm": 3.6046541064451407,
"learning_rate": 4.350959736251291e-06,
"loss": 0.1098,
"step": 217
},
{
"epoch": 5.084548104956268,
"grad_norm": 2.1379404428463866,
"learning_rate": 4.344460312362899e-06,
"loss": 0.1066,
"step": 218
},
{
"epoch": 5.107871720116618,
"grad_norm": 4.314344150035587,
"learning_rate": 4.337933414612576e-06,
"loss": 0.1089,
"step": 219
},
{
"epoch": 5.131195335276968,
"grad_norm": 3.577256942798804,
"learning_rate": 4.3313791402206765e-06,
"loss": 0.1202,
"step": 220
},
{
"epoch": 5.154518950437318,
"grad_norm": 3.4475350682589148,
"learning_rate": 4.324797586815342e-06,
"loss": 0.1112,
"step": 221
},
{
"epoch": 5.177842565597667,
"grad_norm": 4.006896987066902,
"learning_rate": 4.318188852431043e-06,
"loss": 0.1069,
"step": 222
},
{
"epoch": 5.201166180758017,
"grad_norm": 3.903254344498221,
"learning_rate": 4.311553035507118e-06,
"loss": 0.1081,
"step": 223
},
{
"epoch": 5.224489795918367,
"grad_norm": 3.22338459454092,
"learning_rate": 4.3048902348863116e-06,
"loss": 0.1146,
"step": 224
},
{
"epoch": 5.247813411078718,
"grad_norm": 3.776843948198735,
"learning_rate": 4.2982005498133e-06,
"loss": 0.1108,
"step": 225
},
{
"epoch": 5.271137026239067,
"grad_norm": 2.350737873867638,
"learning_rate": 4.291484079933208e-06,
"loss": 0.0966,
"step": 226
},
{
"epoch": 5.294460641399417,
"grad_norm": 3.09101608162593,
"learning_rate": 4.284740925290136e-06,
"loss": 0.1102,
"step": 227
},
{
"epoch": 5.317784256559767,
"grad_norm": 4.080414695608127,
"learning_rate": 4.277971186325658e-06,
"loss": 0.1114,
"step": 228
},
{
"epoch": 5.341107871720117,
"grad_norm": 2.4749981966879084,
"learning_rate": 4.271174963877331e-06,
"loss": 0.1114,
"step": 229
},
{
"epoch": 5.364431486880466,
"grad_norm": 6.2340152034335485,
"learning_rate": 4.264352359177196e-06,
"loss": 0.1203,
"step": 230
},
{
"epoch": 5.387755102040816,
"grad_norm": 2.998631324089385,
"learning_rate": 4.2575034738502615e-06,
"loss": 0.104,
"step": 231
},
{
"epoch": 5.411078717201166,
"grad_norm": 4.196171450636836,
"learning_rate": 4.250628409912998e-06,
"loss": 0.1154,
"step": 232
},
{
"epoch": 5.4344023323615165,
"grad_norm": 2.2916532573885626,
"learning_rate": 4.243727269771815e-06,
"loss": 0.1098,
"step": 233
},
{
"epoch": 5.457725947521866,
"grad_norm": 3.5946716453629075,
"learning_rate": 4.236800156221536e-06,
"loss": 0.1124,
"step": 234
},
{
"epoch": 5.481049562682216,
"grad_norm": 2.0708027960855855,
"learning_rate": 4.229847172443866e-06,
"loss": 0.1147,
"step": 235
},
{
"epoch": 5.504373177842566,
"grad_norm": 3.110015343140659,
"learning_rate": 4.222868422005856e-06,
"loss": 0.1092,
"step": 236
},
{
"epoch": 5.5276967930029155,
"grad_norm": 1.9738246969785502,
"learning_rate": 4.2158640088583625e-06,
"loss": 0.1106,
"step": 237
},
{
"epoch": 5.551020408163265,
"grad_norm": 2.3185763558818153,
"learning_rate": 4.208834037334494e-06,
"loss": 0.1114,
"step": 238
},
{
"epoch": 5.574344023323615,
"grad_norm": 3.0247875422752126,
"learning_rate": 4.20177861214806e-06,
"loss": 0.111,
"step": 239
},
{
"epoch": 5.597667638483965,
"grad_norm": 1.4817220169187448,
"learning_rate": 4.194697838392013e-06,
"loss": 0.1082,
"step": 240
},
{
"epoch": 5.6209912536443145,
"grad_norm": 3.4112665874767045,
"learning_rate": 4.1875918215368785e-06,
"loss": 0.1087,
"step": 241
},
{
"epoch": 5.644314868804665,
"grad_norm": 1.788023601218338,
"learning_rate": 4.180460667429188e-06,
"loss": 0.1158,
"step": 242
},
{
"epoch": 5.667638483965015,
"grad_norm": 2.4010457820126123,
"learning_rate": 4.1733044822899016e-06,
"loss": 0.1082,
"step": 243
},
{
"epoch": 5.690962099125365,
"grad_norm": 1.9745487615109616,
"learning_rate": 4.166123372712823e-06,
"loss": 0.1077,
"step": 244
},
{
"epoch": 5.714285714285714,
"grad_norm": 2.7086581891198156,
"learning_rate": 4.158917445663014e-06,
"loss": 0.1158,
"step": 245
},
{
"epoch": 5.737609329446064,
"grad_norm": 3.2519661751333495,
"learning_rate": 4.151686808475204e-06,
"loss": 0.1075,
"step": 246
},
{
"epoch": 5.760932944606414,
"grad_norm": 2.5247034257513272,
"learning_rate": 4.144431568852185e-06,
"loss": 0.1098,
"step": 247
},
{
"epoch": 5.784256559766764,
"grad_norm": 2.1809660193845204,
"learning_rate": 4.137151834863213e-06,
"loss": 0.1041,
"step": 248
},
{
"epoch": 5.807580174927113,
"grad_norm": 3.8326422994514417,
"learning_rate": 4.1298477149423935e-06,
"loss": 0.1004,
"step": 249
},
{
"epoch": 5.830903790087463,
"grad_norm": 1.598467781622798,
"learning_rate": 4.122519317887072e-06,
"loss": 0.1085,
"step": 250
},
{
"epoch": 5.854227405247814,
"grad_norm": 2.765625862078344,
"learning_rate": 4.115166752856206e-06,
"loss": 0.1046,
"step": 251
},
{
"epoch": 5.877551020408164,
"grad_norm": 1.6978018630207297,
"learning_rate": 4.1077901293687464e-06,
"loss": 0.1068,
"step": 252
},
{
"epoch": 5.900874635568513,
"grad_norm": 1.7221455411691755,
"learning_rate": 4.100389557302001e-06,
"loss": 0.108,
"step": 253
},
{
"epoch": 5.924198250728863,
"grad_norm": 3.352657539369917,
"learning_rate": 4.092965146890002e-06,
"loss": 0.1098,
"step": 254
},
{
"epoch": 5.947521865889213,
"grad_norm": 2.23141689157229,
"learning_rate": 4.085517008721861e-06,
"loss": 0.1066,
"step": 255
},
{
"epoch": 5.970845481049563,
"grad_norm": 2.1918226856159384,
"learning_rate": 4.078045253740121e-06,
"loss": 0.1126,
"step": 256
},
{
"epoch": 5.994169096209912,
"grad_norm": 2.4817818597720085,
"learning_rate": 4.070549993239106e-06,
"loss": 0.1117,
"step": 257
},
{
"epoch": 6.017492711370262,
"grad_norm": 2.701936405356819,
"learning_rate": 4.0630313388632645e-06,
"loss": 0.0974,
"step": 258
},
{
"epoch": 6.040816326530612,
"grad_norm": 2.094145552436028,
"learning_rate": 4.055489402605504e-06,
"loss": 0.1021,
"step": 259
},
{
"epoch": 6.0641399416909625,
"grad_norm": 2.539235740033433,
"learning_rate": 4.047924296805522e-06,
"loss": 0.0984,
"step": 260
},
{
"epoch": 6.087463556851312,
"grad_norm": 2.5824401808658113,
"learning_rate": 4.040336134148136e-06,
"loss": 0.1089,
"step": 261
},
{
"epoch": 6.110787172011662,
"grad_norm": 3.596662585020769,
"learning_rate": 4.032725027661601e-06,
"loss": 0.1055,
"step": 262
},
{
"epoch": 6.134110787172012,
"grad_norm": 2.457646473335156,
"learning_rate": 4.025091090715931e-06,
"loss": 0.1096,
"step": 263
},
{
"epoch": 6.1574344023323615,
"grad_norm": 1.7432745035698631,
"learning_rate": 4.017434437021206e-06,
"loss": 0.0979,
"step": 264
},
{
"epoch": 6.180758017492711,
"grad_norm": 2.936327111671702,
"learning_rate": 4.009755180625878e-06,
"loss": 0.1071,
"step": 265
},
{
"epoch": 6.204081632653061,
"grad_norm": 1.7878964551347618,
"learning_rate": 4.002053435915078e-06,
"loss": 0.094,
"step": 266
},
{
"epoch": 6.227405247813411,
"grad_norm": 3.0620629134937523,
"learning_rate": 3.9943293176089036e-06,
"loss": 0.1029,
"step": 267
},
{
"epoch": 6.2507288629737605,
"grad_norm": 1.9573953560132995,
"learning_rate": 3.986582940760717e-06,
"loss": 0.1013,
"step": 268
},
{
"epoch": 6.274052478134111,
"grad_norm": 3.61949766140591,
"learning_rate": 3.978814420755429e-06,
"loss": 0.1105,
"step": 269
},
{
"epoch": 6.297376093294461,
"grad_norm": 2.0103439818635613,
"learning_rate": 3.971023873307781e-06,
"loss": 0.0997,
"step": 270
},
{
"epoch": 6.320699708454811,
"grad_norm": 3.672405448096318,
"learning_rate": 3.963211414460618e-06,
"loss": 0.1008,
"step": 271
},
{
"epoch": 6.34402332361516,
"grad_norm": 2.144377933990508,
"learning_rate": 3.955377160583165e-06,
"loss": 0.1027,
"step": 272
},
{
"epoch": 6.36734693877551,
"grad_norm": 2.7273008554626452,
"learning_rate": 3.94752122836929e-06,
"loss": 0.1015,
"step": 273
},
{
"epoch": 6.39067055393586,
"grad_norm": 2.13308524040233,
"learning_rate": 3.939643734835768e-06,
"loss": 0.1033,
"step": 274
},
{
"epoch": 6.41399416909621,
"grad_norm": 2.0125298207602946,
"learning_rate": 3.931744797320538e-06,
"loss": 0.1083,
"step": 275
},
{
"epoch": 6.437317784256559,
"grad_norm": 1.7281564802782643,
"learning_rate": 3.9238245334809525e-06,
"loss": 0.1013,
"step": 276
},
{
"epoch": 6.460641399416909,
"grad_norm": 2.7281331345129924,
"learning_rate": 3.915883061292027e-06,
"loss": 0.1001,
"step": 277
},
{
"epoch": 6.48396501457726,
"grad_norm": 1.9362860537497555,
"learning_rate": 3.907920499044684e-06,
"loss": 0.1033,
"step": 278
},
{
"epoch": 6.50728862973761,
"grad_norm": 3.094024646012348,
"learning_rate": 3.899936965343989e-06,
"loss": 0.1088,
"step": 279
},
{
"epoch": 6.530612244897959,
"grad_norm": 3.0693157412634937,
"learning_rate": 3.891932579107384e-06,
"loss": 0.1002,
"step": 280
},
{
"epoch": 6.553935860058309,
"grad_norm": 1.811589768832765,
"learning_rate": 3.883907459562916e-06,
"loss": 0.097,
"step": 281
},
{
"epoch": 6.577259475218659,
"grad_norm": 2.5573629665858912,
"learning_rate": 3.875861726247464e-06,
"loss": 0.1061,
"step": 282
},
{
"epoch": 6.600583090379009,
"grad_norm": 3.1377447025271676,
"learning_rate": 3.867795499004954e-06,
"loss": 0.1006,
"step": 283
},
{
"epoch": 6.623906705539358,
"grad_norm": 2.754614339929124,
"learning_rate": 3.859708897984575e-06,
"loss": 0.105,
"step": 284
},
{
"epoch": 6.647230320699708,
"grad_norm": 2.6639057012599148,
"learning_rate": 3.8516020436389945e-06,
"loss": 0.1049,
"step": 285
},
{
"epoch": 6.670553935860058,
"grad_norm": 3.6023054370319465,
"learning_rate": 3.843475056722555e-06,
"loss": 0.1028,
"step": 286
},
{
"epoch": 6.6938775510204085,
"grad_norm": 3.266025062404138,
"learning_rate": 3.835328058289486e-06,
"loss": 0.1078,
"step": 287
},
{
"epoch": 6.717201166180758,
"grad_norm": 4.453189889535449,
"learning_rate": 3.82716116969209e-06,
"loss": 0.1047,
"step": 288
},
{
"epoch": 6.740524781341108,
"grad_norm": 5.333045812645013,
"learning_rate": 3.818974512578943e-06,
"loss": 0.1071,
"step": 289
},
{
"epoch": 6.763848396501458,
"grad_norm": 3.2006720850408077,
"learning_rate": 3.8107682088930797e-06,
"loss": 0.1065,
"step": 290
},
{
"epoch": 6.7871720116618075,
"grad_norm": 5.145686385595828,
"learning_rate": 3.802542380870177e-06,
"loss": 0.1045,
"step": 291
},
{
"epoch": 6.810495626822157,
"grad_norm": 3.6186906562643433,
"learning_rate": 3.794297151036732e-06,
"loss": 0.0996,
"step": 292
},
{
"epoch": 6.833819241982507,
"grad_norm": 5.063709820898958,
"learning_rate": 3.7860326422082417e-06,
"loss": 0.107,
"step": 293
},
{
"epoch": 6.857142857142857,
"grad_norm": 4.260079716397743,
"learning_rate": 3.777748977487366e-06,
"loss": 0.1068,
"step": 294
},
{
"epoch": 6.8804664723032065,
"grad_norm": 2.772148240995808,
"learning_rate": 3.7694462802621025e-06,
"loss": 0.1039,
"step": 295
},
{
"epoch": 6.903790087463557,
"grad_norm": 2.468481411386976,
"learning_rate": 3.76112467420394e-06,
"loss": 0.101,
"step": 296
},
{
"epoch": 6.927113702623907,
"grad_norm": 2.7704651284383757,
"learning_rate": 3.7527842832660244e-06,
"loss": 0.107,
"step": 297
},
{
"epoch": 6.950437317784257,
"grad_norm": 3.261999572725812,
"learning_rate": 3.744425231681308e-06,
"loss": 0.1054,
"step": 298
},
{
"epoch": 6.973760932944606,
"grad_norm": 3.8333346048988775,
"learning_rate": 3.7360476439606984e-06,
"loss": 0.1059,
"step": 299
},
{
"epoch": 6.997084548104956,
"grad_norm": 3.781950704421763,
"learning_rate": 3.727651644891207e-06,
"loss": 0.1038,
"step": 300
},
{
"epoch": 7.020408163265306,
"grad_norm": 3.670002951140088,
"learning_rate": 3.719237359534087e-06,
"loss": 0.1056,
"step": 301
},
{
"epoch": 7.043731778425656,
"grad_norm": 4.066117070274146,
"learning_rate": 3.710804913222972e-06,
"loss": 0.0988,
"step": 302
},
{
"epoch": 7.067055393586005,
"grad_norm": 3.10789584915452,
"learning_rate": 3.702354431562011e-06,
"loss": 0.1054,
"step": 303
},
{
"epoch": 7.090379008746356,
"grad_norm": 2.740386719387549,
"learning_rate": 3.693886040423994e-06,
"loss": 0.1001,
"step": 304
},
{
"epoch": 7.113702623906706,
"grad_norm": 3.473383377895931,
"learning_rate": 3.6853998659484784e-06,
"loss": 0.0988,
"step": 305
},
{
"epoch": 7.137026239067056,
"grad_norm": 3.3568900920476814,
"learning_rate": 3.676896034539913e-06,
"loss": 0.1002,
"step": 306
},
{
"epoch": 7.160349854227405,
"grad_norm": 5.428025332926568,
"learning_rate": 3.668374672865749e-06,
"loss": 0.1043,
"step": 307
},
{
"epoch": 7.183673469387755,
"grad_norm": 4.99524787142711,
"learning_rate": 3.6598359078545597e-06,
"loss": 0.0998,
"step": 308
},
{
"epoch": 7.206997084548105,
"grad_norm": 4.265821291700417,
"learning_rate": 3.6512798666941457e-06,
"loss": 0.0973,
"step": 309
},
{
"epoch": 7.230320699708455,
"grad_norm": 4.835380559104649,
"learning_rate": 3.6427066768296425e-06,
"loss": 0.1047,
"step": 310
},
{
"epoch": 7.253644314868804,
"grad_norm": 3.131377312362004,
"learning_rate": 3.634116465961621e-06,
"loss": 0.0918,
"step": 311
},
{
"epoch": 7.276967930029155,
"grad_norm": 3.518209092623886,
"learning_rate": 3.6255093620441835e-06,
"loss": 0.0972,
"step": 312
},
{
"epoch": 7.300291545189505,
"grad_norm": 4.999014757360847,
"learning_rate": 3.6168854932830643e-06,
"loss": 0.0989,
"step": 313
},
{
"epoch": 7.3236151603498545,
"grad_norm": 2.90469532063187,
"learning_rate": 3.6082449881337132e-06,
"loss": 0.102,
"step": 314
},
{
"epoch": 7.346938775510204,
"grad_norm": 1.9017338019692047,
"learning_rate": 3.5995879752993846e-06,
"loss": 0.0919,
"step": 315
},
{
"epoch": 7.370262390670554,
"grad_norm": 4.74281611039079,
"learning_rate": 3.5909145837292207e-06,
"loss": 0.1062,
"step": 316
},
{
"epoch": 7.393586005830904,
"grad_norm": 2.7998177162508595,
"learning_rate": 3.5822249426163302e-06,
"loss": 0.0975,
"step": 317
},
{
"epoch": 7.4169096209912535,
"grad_norm": 5.268248676962246,
"learning_rate": 3.5735191813958657e-06,
"loss": 0.0952,
"step": 318
},
{
"epoch": 7.440233236151603,
"grad_norm": 2.514032938174696,
"learning_rate": 3.564797429743093e-06,
"loss": 0.0954,
"step": 319
},
{
"epoch": 7.463556851311953,
"grad_norm": 4.346782915358994,
"learning_rate": 3.5560598175714617e-06,
"loss": 0.0905,
"step": 320
},
{
"epoch": 7.486880466472304,
"grad_norm": 2.662355743225369,
"learning_rate": 3.5473064750306675e-06,
"loss": 0.1002,
"step": 321
},
{
"epoch": 7.510204081632653,
"grad_norm": 2.6526705690815224,
"learning_rate": 3.5385375325047167e-06,
"loss": 0.0941,
"step": 322
},
{
"epoch": 7.533527696793003,
"grad_norm": 3.1377681055301125,
"learning_rate": 3.529753120609982e-06,
"loss": 0.1013,
"step": 323
},
{
"epoch": 7.556851311953353,
"grad_norm": 2.8516907545536796,
"learning_rate": 3.520953370193259e-06,
"loss": 0.0943,
"step": 324
},
{
"epoch": 7.580174927113703,
"grad_norm": 2.2692649277972192,
"learning_rate": 3.5121384123298137e-06,
"loss": 0.0987,
"step": 325
},
{
"epoch": 7.603498542274052,
"grad_norm": 3.201499414823122,
"learning_rate": 3.5033083783214334e-06,
"loss": 0.0941,
"step": 326
},
{
"epoch": 7.626822157434402,
"grad_norm": 2.3873360143137075,
"learning_rate": 3.4944633996944705e-06,
"loss": 0.0975,
"step": 327
},
{
"epoch": 7.650145772594752,
"grad_norm": 3.9439172676028953,
"learning_rate": 3.48560360819788e-06,
"loss": 0.0965,
"step": 328
},
{
"epoch": 7.673469387755102,
"grad_norm": 3.0373265032413754,
"learning_rate": 3.4767291358012623e-06,
"loss": 0.0925,
"step": 329
},
{
"epoch": 7.696793002915452,
"grad_norm": 2.539015737249666,
"learning_rate": 3.467840114692894e-06,
"loss": 0.0926,
"step": 330
},
{
"epoch": 7.720116618075802,
"grad_norm": 3.7437097563196575,
"learning_rate": 3.4589366772777578e-06,
"loss": 0.0946,
"step": 331
},
{
"epoch": 7.743440233236152,
"grad_norm": 2.9377775669851274,
"learning_rate": 3.450018956175575e-06,
"loss": 0.0987,
"step": 332
},
{
"epoch": 7.766763848396502,
"grad_norm": 2.98638049741519,
"learning_rate": 3.441087084218826e-06,
"loss": 0.0954,
"step": 333
},
{
"epoch": 7.790087463556851,
"grad_norm": 3.609350559432606,
"learning_rate": 3.432141194450772e-06,
"loss": 0.1007,
"step": 334
},
{
"epoch": 7.813411078717201,
"grad_norm": 3.2372500407812317,
"learning_rate": 3.4231814201234763e-06,
"loss": 0.0984,
"step": 335
},
{
"epoch": 7.836734693877551,
"grad_norm": 4.127296964202223,
"learning_rate": 3.414207894695816e-06,
"loss": 0.1012,
"step": 336
},
{
"epoch": 7.860058309037901,
"grad_norm": 3.1118203520319603,
"learning_rate": 3.405220751831495e-06,
"loss": 0.1003,
"step": 337
},
{
"epoch": 7.88338192419825,
"grad_norm": 3.507953190690225,
"learning_rate": 3.3962201253970563e-06,
"loss": 0.1025,
"step": 338
},
{
"epoch": 7.906705539358601,
"grad_norm": 2.2012231071081474,
"learning_rate": 3.387206149459882e-06,
"loss": 0.0965,
"step": 339
},
{
"epoch": 7.930029154518951,
"grad_norm": 2.2871527403019467,
"learning_rate": 3.3781789582862e-06,
"loss": 0.0998,
"step": 340
},
{
"epoch": 7.9533527696793005,
"grad_norm": 2.4730155888616405,
"learning_rate": 3.369138686339087e-06,
"loss": 0.0945,
"step": 341
},
{
"epoch": 7.97667638483965,
"grad_norm": 2.1207877653231675,
"learning_rate": 3.3600854682764585e-06,
"loss": 0.0938,
"step": 342
},
{
"epoch": 8.0,
"grad_norm": 2.6214986792515047,
"learning_rate": 3.351019438949068e-06,
"loss": 0.0963,
"step": 343
},
{
"epoch": 8.02332361516035,
"grad_norm": 3.084487613063804,
"learning_rate": 3.3419407333984972e-06,
"loss": 0.0839,
"step": 344
},
{
"epoch": 8.0466472303207,
"grad_norm": 2.0003748780822432,
"learning_rate": 3.3328494868551444e-06,
"loss": 0.0971,
"step": 345
},
{
"epoch": 8.06997084548105,
"grad_norm": 3.204166745374767,
"learning_rate": 3.3237458347362106e-06,
"loss": 0.0931,
"step": 346
},
{
"epoch": 8.093294460641399,
"grad_norm": 2.2486814238019472,
"learning_rate": 3.314629912643682e-06,
"loss": 0.0896,
"step": 347
},
{
"epoch": 8.11661807580175,
"grad_norm": 2.2590096543685405,
"learning_rate": 3.3055018563623085e-06,
"loss": 0.0857,
"step": 348
},
{
"epoch": 8.139941690962099,
"grad_norm": 3.2766070259645264,
"learning_rate": 3.296361801857587e-06,
"loss": 0.0916,
"step": 349
},
{
"epoch": 8.16326530612245,
"grad_norm": 2.2460306971448745,
"learning_rate": 3.2872098852737274e-06,
"loss": 0.0839,
"step": 350
},
{
"epoch": 8.186588921282798,
"grad_norm": 2.7558318636228667,
"learning_rate": 3.2780462429316317e-06,
"loss": 0.0878,
"step": 351
},
{
"epoch": 8.209912536443149,
"grad_norm": 3.1544483443701856,
"learning_rate": 3.268871011326861e-06,
"loss": 0.0867,
"step": 352
},
{
"epoch": 8.2332361516035,
"grad_norm": 2.236180676259461,
"learning_rate": 3.2596843271276003e-06,
"loss": 0.0888,
"step": 353
},
{
"epoch": 8.256559766763848,
"grad_norm": 4.011686657384414,
"learning_rate": 3.2504863271726284e-06,
"loss": 0.0821,
"step": 354
},
{
"epoch": 8.279883381924199,
"grad_norm": 2.5203855027411937,
"learning_rate": 3.241277148469274e-06,
"loss": 0.0942,
"step": 355
},
{
"epoch": 8.303206997084548,
"grad_norm": 3.540099945597143,
"learning_rate": 3.232056928191376e-06,
"loss": 0.0866,
"step": 356
},
{
"epoch": 8.326530612244898,
"grad_norm": 3.43864726948626,
"learning_rate": 3.2228258036772443e-06,
"loss": 0.0894,
"step": 357
},
{
"epoch": 8.349854227405247,
"grad_norm": 4.373034444419023,
"learning_rate": 3.213583912427609e-06,
"loss": 0.089,
"step": 358
},
{
"epoch": 8.373177842565598,
"grad_norm": 3.7748671956733397,
"learning_rate": 3.2043313921035747e-06,
"loss": 0.0857,
"step": 359
},
{
"epoch": 8.396501457725947,
"grad_norm": 4.151535917347032,
"learning_rate": 3.195068380524569e-06,
"loss": 0.0915,
"step": 360
},
{
"epoch": 8.419825072886297,
"grad_norm": 4.7744553100527005,
"learning_rate": 3.185795015666292e-06,
"loss": 0.0932,
"step": 361
},
{
"epoch": 8.443148688046648,
"grad_norm": 3.8892753742055044,
"learning_rate": 3.1765114356586592e-06,
"loss": 0.0848,
"step": 362
},
{
"epoch": 8.466472303206997,
"grad_norm": 3.993048468689458,
"learning_rate": 3.1672177787837404e-06,
"loss": 0.0863,
"step": 363
},
{
"epoch": 8.489795918367347,
"grad_norm": 3.4051738753996252,
"learning_rate": 3.1579141834737106e-06,
"loss": 0.0843,
"step": 364
},
{
"epoch": 8.513119533527696,
"grad_norm": 4.425378712111029,
"learning_rate": 3.1486007883087754e-06,
"loss": 0.0951,
"step": 365
},
{
"epoch": 8.536443148688047,
"grad_norm": 2.674656541112342,
"learning_rate": 3.139277732015114e-06,
"loss": 0.087,
"step": 366
},
{
"epoch": 8.559766763848396,
"grad_norm": 4.3155606231196995,
"learning_rate": 3.1299451534628134e-06,
"loss": 0.0886,
"step": 367
},
{
"epoch": 8.583090379008746,
"grad_norm": 2.4973769732503333,
"learning_rate": 3.1206031916637956e-06,
"loss": 0.0886,
"step": 368
},
{
"epoch": 8.606413994169095,
"grad_norm": 4.782817708224153,
"learning_rate": 3.1112519857697505e-06,
"loss": 0.0923,
"step": 369
},
{
"epoch": 8.629737609329446,
"grad_norm": 2.5062794976151435,
"learning_rate": 3.1018916750700618e-06,
"loss": 0.0911,
"step": 370
},
{
"epoch": 8.653061224489797,
"grad_norm": 5.905723588685558,
"learning_rate": 3.092522398989732e-06,
"loss": 0.0986,
"step": 371
},
{
"epoch": 8.676384839650146,
"grad_norm": 3.567470095214398,
"learning_rate": 3.0831442970873044e-06,
"loss": 0.0909,
"step": 372
},
{
"epoch": 8.699708454810496,
"grad_norm": 3.8365983879273293,
"learning_rate": 3.0737575090527892e-06,
"loss": 0.0881,
"step": 373
},
{
"epoch": 8.723032069970845,
"grad_norm": 5.570185292036708,
"learning_rate": 3.0643621747055785e-06,
"loss": 0.0901,
"step": 374
},
{
"epoch": 8.746355685131196,
"grad_norm": 4.025989027119222,
"learning_rate": 3.054958433992362e-06,
"loss": 0.0855,
"step": 375
},
{
"epoch": 8.769679300291545,
"grad_norm": 4.030985505646047,
"learning_rate": 3.045546426985049e-06,
"loss": 0.0852,
"step": 376
},
{
"epoch": 8.793002915451895,
"grad_norm": 4.022543561567968,
"learning_rate": 3.036126293878674e-06,
"loss": 0.093,
"step": 377
},
{
"epoch": 8.816326530612244,
"grad_norm": 4.828598838689659,
"learning_rate": 3.026698174989316e-06,
"loss": 0.0953,
"step": 378
},
{
"epoch": 8.839650145772595,
"grad_norm": 4.2944264596746455,
"learning_rate": 3.0172622107520033e-06,
"loss": 0.089,
"step": 379
},
{
"epoch": 8.862973760932945,
"grad_norm": 4.049298242123209,
"learning_rate": 3.0078185417186245e-06,
"loss": 0.0978,
"step": 380
},
{
"epoch": 8.886297376093294,
"grad_norm": 3.046707696478407,
"learning_rate": 2.9983673085558306e-06,
"loss": 0.0924,
"step": 381
},
{
"epoch": 8.909620991253645,
"grad_norm": 4.541947988960665,
"learning_rate": 2.988908652042948e-06,
"loss": 0.0902,
"step": 382
},
{
"epoch": 8.932944606413994,
"grad_norm": 2.7874887337906,
"learning_rate": 2.979442713069871e-06,
"loss": 0.0892,
"step": 383
},
{
"epoch": 8.956268221574344,
"grad_norm": 4.904390407872027,
"learning_rate": 2.9699696326349726e-06,
"loss": 0.0928,
"step": 384
},
{
"epoch": 8.979591836734693,
"grad_norm": 4.009865753514206,
"learning_rate": 2.960489551842998e-06,
"loss": 0.0899,
"step": 385
},
{
"epoch": 9.002915451895044,
"grad_norm": 3.724178039187428,
"learning_rate": 2.9510026119029655e-06,
"loss": 0.0899,
"step": 386
},
{
"epoch": 9.026239067055394,
"grad_norm": 4.384561172850345,
"learning_rate": 2.9415089541260645e-06,
"loss": 0.0854,
"step": 387
},
{
"epoch": 9.049562682215743,
"grad_norm": 2.1881553758061685,
"learning_rate": 2.9320087199235463e-06,
"loss": 0.0771,
"step": 388
},
{
"epoch": 9.072886297376094,
"grad_norm": 4.409817597299811,
"learning_rate": 2.9225020508046233e-06,
"loss": 0.0832,
"step": 389
},
{
"epoch": 9.096209912536443,
"grad_norm": 3.108449827027684,
"learning_rate": 2.9129890883743544e-06,
"loss": 0.0796,
"step": 390
},
{
"epoch": 9.119533527696793,
"grad_norm": 3.7741277785995506,
"learning_rate": 2.9034699743315418e-06,
"loss": 0.0751,
"step": 391
},
{
"epoch": 9.142857142857142,
"grad_norm": 5.538458658079067,
"learning_rate": 2.893944850466619e-06,
"loss": 0.0891,
"step": 392
},
{
"epoch": 9.166180758017493,
"grad_norm": 4.107660784555083,
"learning_rate": 2.8844138586595354e-06,
"loss": 0.0809,
"step": 393
},
{
"epoch": 9.189504373177842,
"grad_norm": 5.161974324578949,
"learning_rate": 2.8748771408776467e-06,
"loss": 0.0766,
"step": 394
},
{
"epoch": 9.212827988338192,
"grad_norm": 3.5131532923575977,
"learning_rate": 2.8653348391735996e-06,
"loss": 0.0808,
"step": 395
},
{
"epoch": 9.236151603498543,
"grad_norm": 7.6199699445316975,
"learning_rate": 2.8557870956832135e-06,
"loss": 0.0853,
"step": 396
},
{
"epoch": 9.259475218658892,
"grad_norm": 4.24358624464005,
"learning_rate": 2.8462340526233657e-06,
"loss": 0.082,
"step": 397
},
{
"epoch": 9.282798833819243,
"grad_norm": 5.131989899420963,
"learning_rate": 2.8366758522898737e-06,
"loss": 0.0797,
"step": 398
},
{
"epoch": 9.306122448979592,
"grad_norm": 4.042970637076854,
"learning_rate": 2.8271126370553727e-06,
"loss": 0.0766,
"step": 399
},
{
"epoch": 9.329446064139942,
"grad_norm": 6.451199962567526,
"learning_rate": 2.817544549367197e-06,
"loss": 0.0812,
"step": 400
},
{
"epoch": 9.352769679300291,
"grad_norm": 4.460602275057708,
"learning_rate": 2.8079717317452582e-06,
"loss": 0.0835,
"step": 401
},
{
"epoch": 9.376093294460642,
"grad_norm": 4.474716365072937,
"learning_rate": 2.798394326779922e-06,
"loss": 0.0828,
"step": 402
},
{
"epoch": 9.39941690962099,
"grad_norm": 5.622483601900293,
"learning_rate": 2.788812477129883e-06,
"loss": 0.0843,
"step": 403
},
{
"epoch": 9.422740524781341,
"grad_norm": 3.953290973541459,
"learning_rate": 2.7792263255200406e-06,
"loss": 0.0844,
"step": 404
},
{
"epoch": 9.446064139941692,
"grad_norm": 5.3449177497837255,
"learning_rate": 2.769636014739377e-06,
"loss": 0.0873,
"step": 405
},
{
"epoch": 9.46938775510204,
"grad_norm": 4.633708487675383,
"learning_rate": 2.760041687638822e-06,
"loss": 0.0816,
"step": 406
},
{
"epoch": 9.492711370262391,
"grad_norm": 3.38289326842269,
"learning_rate": 2.7504434871291317e-06,
"loss": 0.083,
"step": 407
},
{
"epoch": 9.51603498542274,
"grad_norm": 5.207711063087272,
"learning_rate": 2.7408415561787587e-06,
"loss": 0.0797,
"step": 408
},
{
"epoch": 9.53935860058309,
"grad_norm": 4.7282328530949,
"learning_rate": 2.7312360378117214e-06,
"loss": 0.0791,
"step": 409
},
{
"epoch": 9.56268221574344,
"grad_norm": 3.9839978806287037,
"learning_rate": 2.721627075105473e-06,
"loss": 0.0789,
"step": 410
},
{
"epoch": 9.58600583090379,
"grad_norm": 4.795922597965258,
"learning_rate": 2.7120148111887732e-06,
"loss": 0.078,
"step": 411
},
{
"epoch": 9.60932944606414,
"grad_norm": 3.7271565537628444,
"learning_rate": 2.7023993892395523e-06,
"loss": 0.0809,
"step": 412
},
{
"epoch": 9.63265306122449,
"grad_norm": 4.174873788719055,
"learning_rate": 2.6927809524827815e-06,
"loss": 0.0769,
"step": 413
},
{
"epoch": 9.65597667638484,
"grad_norm": 4.259366205196579,
"learning_rate": 2.6831596441883388e-06,
"loss": 0.0825,
"step": 414
},
{
"epoch": 9.67930029154519,
"grad_norm": 5.37835783645601,
"learning_rate": 2.6735356076688744e-06,
"loss": 0.0827,
"step": 415
},
{
"epoch": 9.70262390670554,
"grad_norm": 3.7147173132975277,
"learning_rate": 2.6639089862776763e-06,
"loss": 0.0821,
"step": 416
},
{
"epoch": 9.725947521865889,
"grad_norm": 5.772966378570507,
"learning_rate": 2.6542799234065354e-06,
"loss": 0.0811,
"step": 417
},
{
"epoch": 9.74927113702624,
"grad_norm": 4.34521503579229,
"learning_rate": 2.644648562483608e-06,
"loss": 0.0874,
"step": 418
},
{
"epoch": 9.772594752186588,
"grad_norm": 3.6215374951442034,
"learning_rate": 2.6350150469712824e-06,
"loss": 0.0815,
"step": 419
},
{
"epoch": 9.795918367346939,
"grad_norm": 3.4450949256342502,
"learning_rate": 2.625379520364041e-06,
"loss": 0.0768,
"step": 420
},
{
"epoch": 9.819241982507288,
"grad_norm": 3.95042321074671,
"learning_rate": 2.615742126186319e-06,
"loss": 0.0776,
"step": 421
},
{
"epoch": 9.842565597667638,
"grad_norm": 3.0236686992714827,
"learning_rate": 2.606103007990371e-06,
"loss": 0.0796,
"step": 422
},
{
"epoch": 9.86588921282799,
"grad_norm": 4.361007111450639,
"learning_rate": 2.5964623093541326e-06,
"loss": 0.0857,
"step": 423
},
{
"epoch": 9.889212827988338,
"grad_norm": 3.0417992201536803,
"learning_rate": 2.5868201738790798e-06,
"loss": 0.0793,
"step": 424
},
{
"epoch": 9.912536443148689,
"grad_norm": 4.234482429521092,
"learning_rate": 2.5771767451880908e-06,
"loss": 0.0844,
"step": 425
},
{
"epoch": 9.935860058309038,
"grad_norm": 3.2532366261674546,
"learning_rate": 2.567532166923306e-06,
"loss": 0.0845,
"step": 426
},
{
"epoch": 9.959183673469388,
"grad_norm": 5.25009409502266,
"learning_rate": 2.557886582743991e-06,
"loss": 0.085,
"step": 427
},
{
"epoch": 9.982507288629737,
"grad_norm": 3.3120422046939297,
"learning_rate": 2.548240136324392e-06,
"loss": 0.0855,
"step": 428
},
{
"epoch": 10.005830903790088,
"grad_norm": 3.6242571102413876,
"learning_rate": 2.5385929713516006e-06,
"loss": 0.0795,
"step": 429
},
{
"epoch": 10.029154518950437,
"grad_norm": 3.484857500137509,
"learning_rate": 2.5289452315234125e-06,
"loss": 0.0743,
"step": 430
},
{
"epoch": 10.052478134110787,
"grad_norm": 2.9146847303493293,
"learning_rate": 2.5192970605461835e-06,
"loss": 0.0716,
"step": 431
},
{
"epoch": 10.075801749271138,
"grad_norm": 4.090638820313027,
"learning_rate": 2.509648602132692e-06,
"loss": 0.071,
"step": 432
},
{
"epoch": 10.099125364431487,
"grad_norm": 2.7367985039975364,
"learning_rate": 2.5e-06,
"loss": 0.0653,
"step": 433
},
{
"epoch": 10.122448979591837,
"grad_norm": 3.1514711365816175,
"learning_rate": 2.4903513978673076e-06,
"loss": 0.0724,
"step": 434
},
{
"epoch": 10.145772594752186,
"grad_norm": 3.1212207066540483,
"learning_rate": 2.480702939453818e-06,
"loss": 0.066,
"step": 435
},
{
"epoch": 10.169096209912537,
"grad_norm": 3.5360489366260692,
"learning_rate": 2.4710547684765884e-06,
"loss": 0.0726,
"step": 436
},
{
"epoch": 10.192419825072886,
"grad_norm": 4.199846655453183,
"learning_rate": 2.4614070286483994e-06,
"loss": 0.0751,
"step": 437
},
{
"epoch": 10.215743440233236,
"grad_norm": 4.2344916633778835,
"learning_rate": 2.451759863675609e-06,
"loss": 0.0673,
"step": 438
},
{
"epoch": 10.239067055393585,
"grad_norm": 5.237854487183302,
"learning_rate": 2.44211341725601e-06,
"loss": 0.0785,
"step": 439
},
{
"epoch": 10.262390670553936,
"grad_norm": 4.301828643181056,
"learning_rate": 2.432467833076694e-06,
"loss": 0.0773,
"step": 440
},
{
"epoch": 10.285714285714286,
"grad_norm": 4.474031968078861,
"learning_rate": 2.42282325481191e-06,
"loss": 0.0737,
"step": 441
},
{
"epoch": 10.309037900874635,
"grad_norm": 6.017173349843788,
"learning_rate": 2.4131798261209206e-06,
"loss": 0.078,
"step": 442
},
{
"epoch": 10.332361516034986,
"grad_norm": 3.7621797495056417,
"learning_rate": 2.4035376906458674e-06,
"loss": 0.0704,
"step": 443
},
{
"epoch": 10.355685131195335,
"grad_norm": 5.9349864257040235,
"learning_rate": 2.39389699200963e-06,
"loss": 0.0719,
"step": 444
},
{
"epoch": 10.379008746355685,
"grad_norm": 4.943076738697602,
"learning_rate": 2.3842578738136816e-06,
"loss": 0.07,
"step": 445
},
{
"epoch": 10.402332361516034,
"grad_norm": 4.132956720442613,
"learning_rate": 2.374620479635959e-06,
"loss": 0.0707,
"step": 446
},
{
"epoch": 10.425655976676385,
"grad_norm": 3.519182634605719,
"learning_rate": 2.364984953028718e-06,
"loss": 0.0669,
"step": 447
},
{
"epoch": 10.448979591836734,
"grad_norm": 4.880062116008721,
"learning_rate": 2.3553514375163926e-06,
"loss": 0.0716,
"step": 448
},
{
"epoch": 10.472303206997085,
"grad_norm": 3.6519392636148584,
"learning_rate": 2.3457200765934654e-06,
"loss": 0.0702,
"step": 449
},
{
"epoch": 10.495626822157435,
"grad_norm": 3.9433212845253585,
"learning_rate": 2.3360910137223246e-06,
"loss": 0.0711,
"step": 450
},
{
"epoch": 10.518950437317784,
"grad_norm": 4.1091062508662635,
"learning_rate": 2.326464392331126e-06,
"loss": 0.0719,
"step": 451
},
{
"epoch": 10.542274052478135,
"grad_norm": 4.652424924512233,
"learning_rate": 2.3168403558116612e-06,
"loss": 0.0723,
"step": 452
},
{
"epoch": 10.565597667638484,
"grad_norm": 3.4455140070040255,
"learning_rate": 2.3072190475172194e-06,
"loss": 0.07,
"step": 453
},
{
"epoch": 10.588921282798834,
"grad_norm": 4.649570871876516,
"learning_rate": 2.297600610760448e-06,
"loss": 0.0696,
"step": 454
},
{
"epoch": 10.612244897959183,
"grad_norm": 4.559645922784148,
"learning_rate": 2.287985188811228e-06,
"loss": 0.0746,
"step": 455
},
{
"epoch": 10.635568513119534,
"grad_norm": 3.5444974618957277,
"learning_rate": 2.2783729248945275e-06,
"loss": 0.0684,
"step": 456
},
{
"epoch": 10.658892128279884,
"grad_norm": 3.993887164367774,
"learning_rate": 2.2687639621882795e-06,
"loss": 0.0712,
"step": 457
},
{
"epoch": 10.682215743440233,
"grad_norm": 3.1108405718415604,
"learning_rate": 2.259158443821242e-06,
"loss": 0.0744,
"step": 458
},
{
"epoch": 10.705539358600584,
"grad_norm": 3.3980853018828223,
"learning_rate": 2.249556512870869e-06,
"loss": 0.0686,
"step": 459
},
{
"epoch": 10.728862973760933,
"grad_norm": 4.093233731162892,
"learning_rate": 2.2399583123611788e-06,
"loss": 0.0788,
"step": 460
},
{
"epoch": 10.752186588921283,
"grad_norm": 3.8432637969379626,
"learning_rate": 2.2303639852606244e-06,
"loss": 0.0694,
"step": 461
},
{
"epoch": 10.775510204081632,
"grad_norm": 3.883530609839273,
"learning_rate": 2.2207736744799598e-06,
"loss": 0.0703,
"step": 462
},
{
"epoch": 10.798833819241983,
"grad_norm": 4.4013742294915374,
"learning_rate": 2.2111875228701175e-06,
"loss": 0.0765,
"step": 463
},
{
"epoch": 10.822157434402332,
"grad_norm": 4.486608288079545,
"learning_rate": 2.2016056732200794e-06,
"loss": 0.0733,
"step": 464
},
{
"epoch": 10.845481049562682,
"grad_norm": 3.687365383988495,
"learning_rate": 2.192028268254742e-06,
"loss": 0.0683,
"step": 465
},
{
"epoch": 10.868804664723033,
"grad_norm": 4.698752566431009,
"learning_rate": 2.1824554506328033e-06,
"loss": 0.0715,
"step": 466
},
{
"epoch": 10.892128279883382,
"grad_norm": 3.1651675708156075,
"learning_rate": 2.1728873629446277e-06,
"loss": 0.0674,
"step": 467
},
{
"epoch": 10.915451895043732,
"grad_norm": 3.9151451456147153,
"learning_rate": 2.163324147710127e-06,
"loss": 0.0784,
"step": 468
},
{
"epoch": 10.938775510204081,
"grad_norm": 3.5638025144670378,
"learning_rate": 2.1537659473766347e-06,
"loss": 0.0713,
"step": 469
},
{
"epoch": 10.962099125364432,
"grad_norm": 3.6805884033912597,
"learning_rate": 2.1442129043167877e-06,
"loss": 0.0732,
"step": 470
},
{
"epoch": 10.98542274052478,
"grad_norm": 4.068187078159665,
"learning_rate": 2.1346651608264012e-06,
"loss": 0.0717,
"step": 471
},
{
"epoch": 11.008746355685131,
"grad_norm": 3.5401808969431285,
"learning_rate": 2.1251228591223533e-06,
"loss": 0.0697,
"step": 472
},
{
"epoch": 11.03206997084548,
"grad_norm": 4.771005463583638,
"learning_rate": 2.1155861413404655e-06,
"loss": 0.0684,
"step": 473
},
{
"epoch": 11.055393586005831,
"grad_norm": 4.2335670516810175,
"learning_rate": 2.1060551495333816e-06,
"loss": 0.0652,
"step": 474
},
{
"epoch": 11.078717201166182,
"grad_norm": 3.978324814398319,
"learning_rate": 2.096530025668458e-06,
"loss": 0.0653,
"step": 475
},
{
"epoch": 11.10204081632653,
"grad_norm": 3.45924049949899,
"learning_rate": 2.087010911625647e-06,
"loss": 0.0601,
"step": 476
},
{
"epoch": 11.125364431486881,
"grad_norm": 3.9286305881356,
"learning_rate": 2.077497949195378e-06,
"loss": 0.0602,
"step": 477
},
{
"epoch": 11.14868804664723,
"grad_norm": 4.2209211625918135,
"learning_rate": 2.0679912800764537e-06,
"loss": 0.0682,
"step": 478
},
{
"epoch": 11.17201166180758,
"grad_norm": 4.0900836863268735,
"learning_rate": 2.0584910458739367e-06,
"loss": 0.0628,
"step": 479
},
{
"epoch": 11.19533527696793,
"grad_norm": 4.350925145481364,
"learning_rate": 2.048997388097035e-06,
"loss": 0.0586,
"step": 480
},
{
"epoch": 11.21865889212828,
"grad_norm": 4.567833696107266,
"learning_rate": 2.0395104481570025e-06,
"loss": 0.0649,
"step": 481
},
{
"epoch": 11.241982507288629,
"grad_norm": 4.3906584497446906,
"learning_rate": 2.0300303673650286e-06,
"loss": 0.0573,
"step": 482
},
{
"epoch": 11.26530612244898,
"grad_norm": 4.47237052171397,
"learning_rate": 2.0205572869301295e-06,
"loss": 0.0574,
"step": 483
},
{
"epoch": 11.28862973760933,
"grad_norm": 4.007831536260833,
"learning_rate": 2.0110913479570525e-06,
"loss": 0.061,
"step": 484
},
{
"epoch": 11.31195335276968,
"grad_norm": 4.571950452681269,
"learning_rate": 2.00163269144417e-06,
"loss": 0.0579,
"step": 485
},
{
"epoch": 11.33527696793003,
"grad_norm": 5.218817430620227,
"learning_rate": 1.9921814582813763e-06,
"loss": 0.0669,
"step": 486
},
{
"epoch": 11.358600583090379,
"grad_norm": 3.9881175696012776,
"learning_rate": 1.9827377892479967e-06,
"loss": 0.0601,
"step": 487
},
{
"epoch": 11.38192419825073,
"grad_norm": 4.1143174999008325,
"learning_rate": 1.973301825010685e-06,
"loss": 0.062,
"step": 488
},
{
"epoch": 11.405247813411078,
"grad_norm": 3.9485244216076754,
"learning_rate": 1.963873706121327e-06,
"loss": 0.061,
"step": 489
},
{
"epoch": 11.428571428571429,
"grad_norm": 3.873171959654352,
"learning_rate": 1.9544535730149524e-06,
"loss": 0.0673,
"step": 490
},
{
"epoch": 11.451895043731778,
"grad_norm": 5.007394091761698,
"learning_rate": 1.9450415660076388e-06,
"loss": 0.0666,
"step": 491
},
{
"epoch": 11.475218658892128,
"grad_norm": 3.8132922490669965,
"learning_rate": 1.9356378252944223e-06,
"loss": 0.0653,
"step": 492
},
{
"epoch": 11.498542274052479,
"grad_norm": 4.60560810890988,
"learning_rate": 1.9262424909472103e-06,
"loss": 0.0672,
"step": 493
},
{
"epoch": 11.521865889212828,
"grad_norm": 4.209183987798544,
"learning_rate": 1.9168557029126965e-06,
"loss": 0.0644,
"step": 494
},
{
"epoch": 11.545189504373178,
"grad_norm": 5.595398388299023,
"learning_rate": 1.9074776010102693e-06,
"loss": 0.0666,
"step": 495
},
{
"epoch": 11.568513119533527,
"grad_norm": 4.004167531490067,
"learning_rate": 1.8981083249299393e-06,
"loss": 0.0661,
"step": 496
},
{
"epoch": 11.591836734693878,
"grad_norm": 4.686857255738534,
"learning_rate": 1.88874801423025e-06,
"loss": 0.0578,
"step": 497
},
{
"epoch": 11.615160349854227,
"grad_norm": 4.644190590495262,
"learning_rate": 1.879396808336205e-06,
"loss": 0.0681,
"step": 498
},
{
"epoch": 11.638483965014577,
"grad_norm": 5.0449793417652256,
"learning_rate": 1.8700548465371877e-06,
"loss": 0.0604,
"step": 499
},
{
"epoch": 11.661807580174926,
"grad_norm": 4.855446751760304,
"learning_rate": 1.860722267984887e-06,
"loss": 0.0648,
"step": 500
},
{
"epoch": 11.685131195335277,
"grad_norm": 5.429052345943996,
"learning_rate": 1.8513992116912254e-06,
"loss": 0.0625,
"step": 501
},
{
"epoch": 11.708454810495628,
"grad_norm": 4.2187151872116875,
"learning_rate": 1.8420858165262905e-06,
"loss": 0.0598,
"step": 502
},
{
"epoch": 11.731778425655977,
"grad_norm": 6.771490853490556,
"learning_rate": 1.8327822212162598e-06,
"loss": 0.0674,
"step": 503
},
{
"epoch": 11.755102040816327,
"grad_norm": 5.9975504643367215,
"learning_rate": 1.823488564341342e-06,
"loss": 0.0599,
"step": 504
},
{
"epoch": 11.778425655976676,
"grad_norm": 4.1364213442135265,
"learning_rate": 1.814204984333709e-06,
"loss": 0.067,
"step": 505
},
{
"epoch": 11.801749271137027,
"grad_norm": 5.912170026737122,
"learning_rate": 1.8049316194754319e-06,
"loss": 0.0668,
"step": 506
},
{
"epoch": 11.825072886297376,
"grad_norm": 5.308482008228855,
"learning_rate": 1.7956686078964257e-06,
"loss": 0.0626,
"step": 507
},
{
"epoch": 11.848396501457726,
"grad_norm": 6.158712911027675,
"learning_rate": 1.7864160875723916e-06,
"loss": 0.0675,
"step": 508
},
{
"epoch": 11.871720116618075,
"grad_norm": 7.051006045152656,
"learning_rate": 1.7771741963227563e-06,
"loss": 0.0678,
"step": 509
},
{
"epoch": 11.895043731778426,
"grad_norm": 4.332760088246268,
"learning_rate": 1.7679430718086244e-06,
"loss": 0.0633,
"step": 510
},
{
"epoch": 11.918367346938776,
"grad_norm": 4.451530090621097,
"learning_rate": 1.7587228515307273e-06,
"loss": 0.0623,
"step": 511
},
{
"epoch": 11.941690962099125,
"grad_norm": 4.5406241430713665,
"learning_rate": 1.7495136728273722e-06,
"loss": 0.0629,
"step": 512
},
{
"epoch": 11.965014577259476,
"grad_norm": 3.310623933238498,
"learning_rate": 1.7403156728724003e-06,
"loss": 0.0532,
"step": 513
},
{
"epoch": 11.988338192419825,
"grad_norm": 5.673693320795326,
"learning_rate": 1.7311289886731408e-06,
"loss": 0.0648,
"step": 514
},
{
"epoch": 12.011661807580175,
"grad_norm": 3.6331444373300714,
"learning_rate": 1.7219537570683692e-06,
"loss": 0.0605,
"step": 515
},
{
"epoch": 12.034985422740524,
"grad_norm": 4.178597167218763,
"learning_rate": 1.7127901147262732e-06,
"loss": 0.0533,
"step": 516
},
{
"epoch": 12.058309037900875,
"grad_norm": 4.026501838603133,
"learning_rate": 1.7036381981424141e-06,
"loss": 0.0542,
"step": 517
},
{
"epoch": 12.081632653061224,
"grad_norm": 5.886567435949283,
"learning_rate": 1.6944981436376917e-06,
"loss": 0.0576,
"step": 518
},
{
"epoch": 12.104956268221574,
"grad_norm": 4.151810189217474,
"learning_rate": 1.6853700873563188e-06,
"loss": 0.0551,
"step": 519
},
{
"epoch": 12.128279883381925,
"grad_norm": 5.477245076043875,
"learning_rate": 1.6762541652637904e-06,
"loss": 0.0581,
"step": 520
},
{
"epoch": 12.151603498542274,
"grad_norm": 5.0774749809336175,
"learning_rate": 1.6671505131448562e-06,
"loss": 0.056,
"step": 521
},
{
"epoch": 12.174927113702624,
"grad_norm": 3.601653058616095,
"learning_rate": 1.6580592666015034e-06,
"loss": 0.052,
"step": 522
},
{
"epoch": 12.198250728862973,
"grad_norm": 4.176157150264005,
"learning_rate": 1.648980561050933e-06,
"loss": 0.0528,
"step": 523
},
{
"epoch": 12.221574344023324,
"grad_norm": 3.556338687588662,
"learning_rate": 1.6399145317235422e-06,
"loss": 0.0538,
"step": 524
},
{
"epoch": 12.244897959183673,
"grad_norm": 3.880914911372885,
"learning_rate": 1.630861313660913e-06,
"loss": 0.0493,
"step": 525
},
{
"epoch": 12.268221574344023,
"grad_norm": 4.8554685535875866,
"learning_rate": 1.6218210417138007e-06,
"loss": 0.0491,
"step": 526
},
{
"epoch": 12.291545189504372,
"grad_norm": 4.2655409360993195,
"learning_rate": 1.6127938505401192e-06,
"loss": 0.0531,
"step": 527
},
{
"epoch": 12.314868804664723,
"grad_norm": 3.9912333981353574,
"learning_rate": 1.6037798746029444e-06,
"loss": 0.0565,
"step": 528
},
{
"epoch": 12.338192419825074,
"grad_norm": 3.8208239628390213,
"learning_rate": 1.5947792481685054e-06,
"loss": 0.0479,
"step": 529
},
{
"epoch": 12.361516034985423,
"grad_norm": 4.109928456447175,
"learning_rate": 1.5857921053041852e-06,
"loss": 0.0555,
"step": 530
},
{
"epoch": 12.384839650145773,
"grad_norm": 4.093633984967496,
"learning_rate": 1.5768185798765246e-06,
"loss": 0.0497,
"step": 531
},
{
"epoch": 12.408163265306122,
"grad_norm": 4.939878832300203,
"learning_rate": 1.5678588055492289e-06,
"loss": 0.0547,
"step": 532
},
{
"epoch": 12.431486880466473,
"grad_norm": 5.319609608201244,
"learning_rate": 1.558912915781175e-06,
"loss": 0.0528,
"step": 533
},
{
"epoch": 12.454810495626822,
"grad_norm": 3.872044205066462,
"learning_rate": 1.5499810438244251e-06,
"loss": 0.0563,
"step": 534
},
{
"epoch": 12.478134110787172,
"grad_norm": 5.036716500588434,
"learning_rate": 1.5410633227222433e-06,
"loss": 0.0552,
"step": 535
},
{
"epoch": 12.501457725947521,
"grad_norm": 4.09470175824976,
"learning_rate": 1.5321598853071068e-06,
"loss": 0.0513,
"step": 536
},
{
"epoch": 12.524781341107872,
"grad_norm": 4.375511139574889,
"learning_rate": 1.5232708641987388e-06,
"loss": 0.0491,
"step": 537
},
{
"epoch": 12.548104956268222,
"grad_norm": 5.0017332887433765,
"learning_rate": 1.514396391802121e-06,
"loss": 0.0571,
"step": 538
},
{
"epoch": 12.571428571428571,
"grad_norm": 3.908119425718553,
"learning_rate": 1.5055366003055305e-06,
"loss": 0.0503,
"step": 539
},
{
"epoch": 12.594752186588922,
"grad_norm": 4.2271482164051255,
"learning_rate": 1.496691621678567e-06,
"loss": 0.0564,
"step": 540
},
{
"epoch": 12.61807580174927,
"grad_norm": 6.6548142295032715,
"learning_rate": 1.487861587670187e-06,
"loss": 0.0576,
"step": 541
},
{
"epoch": 12.641399416909621,
"grad_norm": 6.231794774644079,
"learning_rate": 1.4790466298067415e-06,
"loss": 0.0573,
"step": 542
},
{
"epoch": 12.66472303206997,
"grad_norm": 4.344863419575536,
"learning_rate": 1.4702468793900187e-06,
"loss": 0.0534,
"step": 543
},
{
"epoch": 12.68804664723032,
"grad_norm": 4.213349000402689,
"learning_rate": 1.4614624674952843e-06,
"loss": 0.0518,
"step": 544
},
{
"epoch": 12.71137026239067,
"grad_norm": 7.569114629654876,
"learning_rate": 1.4526935249693336e-06,
"loss": 0.0531,
"step": 545
},
{
"epoch": 12.73469387755102,
"grad_norm": 6.067248185900788,
"learning_rate": 1.4439401824285387e-06,
"loss": 0.0548,
"step": 546
},
{
"epoch": 12.758017492711371,
"grad_norm": 4.958285270051922,
"learning_rate": 1.4352025702569066e-06,
"loss": 0.0546,
"step": 547
},
{
"epoch": 12.78134110787172,
"grad_norm": 5.615875642730984,
"learning_rate": 1.426480818604134e-06,
"loss": 0.0594,
"step": 548
},
{
"epoch": 12.80466472303207,
"grad_norm": 4.328415403022571,
"learning_rate": 1.4177750573836702e-06,
"loss": 0.0533,
"step": 549
},
{
"epoch": 12.82798833819242,
"grad_norm": 5.138897730272723,
"learning_rate": 1.4090854162707801e-06,
"loss": 0.0496,
"step": 550
},
{
"epoch": 12.85131195335277,
"grad_norm": 4.839504029895119,
"learning_rate": 1.4004120247006159e-06,
"loss": 0.0501,
"step": 551
},
{
"epoch": 12.874635568513119,
"grad_norm": 3.8022100697475603,
"learning_rate": 1.3917550118662876e-06,
"loss": 0.0513,
"step": 552
},
{
"epoch": 12.89795918367347,
"grad_norm": 6.104926711883666,
"learning_rate": 1.383114506716936e-06,
"loss": 0.0523,
"step": 553
},
{
"epoch": 12.921282798833818,
"grad_norm": 3.748559929588339,
"learning_rate": 1.3744906379558165e-06,
"loss": 0.0473,
"step": 554
},
{
"epoch": 12.944606413994169,
"grad_norm": 5.338422115817136,
"learning_rate": 1.3658835340383809e-06,
"loss": 0.0556,
"step": 555
},
{
"epoch": 12.96793002915452,
"grad_norm": 4.161310313444585,
"learning_rate": 1.3572933231703582e-06,
"loss": 0.0544,
"step": 556
},
{
"epoch": 12.991253644314869,
"grad_norm": 6.0576026248464725,
"learning_rate": 1.3487201333058543e-06,
"loss": 0.0564,
"step": 557
},
{
"epoch": 13.01457725947522,
"grad_norm": 3.6315615233113174,
"learning_rate": 1.3401640921454411e-06,
"loss": 0.0414,
"step": 558
},
{
"epoch": 13.037900874635568,
"grad_norm": 3.9125541171392704,
"learning_rate": 1.3316253271342517e-06,
"loss": 0.0441,
"step": 559
},
{
"epoch": 13.061224489795919,
"grad_norm": 3.563470139501611,
"learning_rate": 1.3231039654600875e-06,
"loss": 0.0451,
"step": 560
},
{
"epoch": 13.084548104956268,
"grad_norm": 4.0771176785890635,
"learning_rate": 1.3146001340515224e-06,
"loss": 0.0468,
"step": 561
},
{
"epoch": 13.107871720116618,
"grad_norm": 4.509589468124622,
"learning_rate": 1.306113959576007e-06,
"loss": 0.0473,
"step": 562
},
{
"epoch": 13.131195335276967,
"grad_norm": 3.9113416523129456,
"learning_rate": 1.2976455684379895e-06,
"loss": 0.0473,
"step": 563
},
{
"epoch": 13.154518950437318,
"grad_norm": 4.7449900658586905,
"learning_rate": 1.2891950867770291e-06,
"loss": 0.0413,
"step": 564
},
{
"epoch": 13.177842565597668,
"grad_norm": 4.4217248897886625,
"learning_rate": 1.2807626404659144e-06,
"loss": 0.0472,
"step": 565
},
{
"epoch": 13.201166180758017,
"grad_norm": 4.584718298458554,
"learning_rate": 1.272348355108794e-06,
"loss": 0.0462,
"step": 566
},
{
"epoch": 13.224489795918368,
"grad_norm": 5.466004298175304,
"learning_rate": 1.263952356039302e-06,
"loss": 0.0476,
"step": 567
},
{
"epoch": 13.247813411078717,
"grad_norm": 4.81734965320916,
"learning_rate": 1.2555747683186922e-06,
"loss": 0.0455,
"step": 568
},
{
"epoch": 13.271137026239067,
"grad_norm": 5.756185107533244,
"learning_rate": 1.2472157167339754e-06,
"loss": 0.0453,
"step": 569
},
{
"epoch": 13.294460641399416,
"grad_norm": 3.822016985198266,
"learning_rate": 1.2388753257960611e-06,
"loss": 0.041,
"step": 570
},
{
"epoch": 13.317784256559767,
"grad_norm": 4.691368439559332,
"learning_rate": 1.2305537197378985e-06,
"loss": 0.0426,
"step": 571
},
{
"epoch": 13.341107871720117,
"grad_norm": 5.311689068104946,
"learning_rate": 1.222251022512634e-06,
"loss": 0.0445,
"step": 572
},
{
"epoch": 13.364431486880466,
"grad_norm": 4.8844713978886185,
"learning_rate": 1.2139673577917594e-06,
"loss": 0.0496,
"step": 573
},
{
"epoch": 13.387755102040817,
"grad_norm": 5.524811871808131,
"learning_rate": 1.2057028489632682e-06,
"loss": 0.0471,
"step": 574
},
{
"epoch": 13.411078717201166,
"grad_norm": 3.8990317830084007,
"learning_rate": 1.1974576191298235e-06,
"loss": 0.0474,
"step": 575
},
{
"epoch": 13.434402332361516,
"grad_norm": 5.0817642083111325,
"learning_rate": 1.1892317911069212e-06,
"loss": 0.0437,
"step": 576
},
{
"epoch": 13.457725947521865,
"grad_norm": 6.175434409239747,
"learning_rate": 1.1810254874210578e-06,
"loss": 0.0415,
"step": 577
},
{
"epoch": 13.481049562682216,
"grad_norm": 4.105620202665507,
"learning_rate": 1.1728388303079114e-06,
"loss": 0.0398,
"step": 578
},
{
"epoch": 13.504373177842565,
"grad_norm": 5.399997160169066,
"learning_rate": 1.1646719417105152e-06,
"loss": 0.0443,
"step": 579
},
{
"epoch": 13.527696793002915,
"grad_norm": 5.360588270045755,
"learning_rate": 1.156524943277445e-06,
"loss": 0.0477,
"step": 580
},
{
"epoch": 13.551020408163264,
"grad_norm": 3.9622517649396145,
"learning_rate": 1.148397956361007e-06,
"loss": 0.0414,
"step": 581
},
{
"epoch": 13.574344023323615,
"grad_norm": 4.5985837663622045,
"learning_rate": 1.1402911020154258e-06,
"loss": 0.0444,
"step": 582
},
{
"epoch": 13.597667638483966,
"grad_norm": 4.261820119741356,
"learning_rate": 1.1322045009950475e-06,
"loss": 0.0423,
"step": 583
},
{
"epoch": 13.620991253644315,
"grad_norm": 3.7606986971242136,
"learning_rate": 1.1241382737525367e-06,
"loss": 0.0445,
"step": 584
},
{
"epoch": 13.644314868804665,
"grad_norm": 5.001124064931982,
"learning_rate": 1.1160925404370851e-06,
"loss": 0.0416,
"step": 585
},
{
"epoch": 13.667638483965014,
"grad_norm": 4.126856299517844,
"learning_rate": 1.1080674208926173e-06,
"loss": 0.0398,
"step": 586
},
{
"epoch": 13.690962099125365,
"grad_norm": 4.164557622563336,
"learning_rate": 1.1000630346560118e-06,
"loss": 0.0428,
"step": 587
},
{
"epoch": 13.714285714285714,
"grad_norm": 4.26993463256259,
"learning_rate": 1.0920795009553164e-06,
"loss": 0.0417,
"step": 588
},
{
"epoch": 13.737609329446064,
"grad_norm": 4.325947899111832,
"learning_rate": 1.0841169387079734e-06,
"loss": 0.044,
"step": 589
},
{
"epoch": 13.760932944606415,
"grad_norm": 5.762006083937098,
"learning_rate": 1.0761754665190486e-06,
"loss": 0.0447,
"step": 590
},
{
"epoch": 13.784256559766764,
"grad_norm": 3.5402602301601247,
"learning_rate": 1.0682552026794624e-06,
"loss": 0.0431,
"step": 591
},
{
"epoch": 13.807580174927114,
"grad_norm": 5.245561994585335,
"learning_rate": 1.0603562651642318e-06,
"loss": 0.0443,
"step": 592
},
{
"epoch": 13.830903790087463,
"grad_norm": 4.92187887524649,
"learning_rate": 1.0524787716307107e-06,
"loss": 0.0395,
"step": 593
},
{
"epoch": 13.854227405247814,
"grad_norm": 5.371003905498567,
"learning_rate": 1.0446228394168356e-06,
"loss": 0.0436,
"step": 594
},
{
"epoch": 13.877551020408163,
"grad_norm": 4.753556175162423,
"learning_rate": 1.0367885855393823e-06,
"loss": 0.048,
"step": 595
},
{
"epoch": 13.900874635568513,
"grad_norm": 4.399646978955097,
"learning_rate": 1.02897612669222e-06,
"loss": 0.0404,
"step": 596
},
{
"epoch": 13.924198250728862,
"grad_norm": 4.0872273205438185,
"learning_rate": 1.0211855792445712e-06,
"loss": 0.0421,
"step": 597
},
{
"epoch": 13.947521865889213,
"grad_norm": 4.404736630969894,
"learning_rate": 1.0134170592392837e-06,
"loss": 0.0445,
"step": 598
},
{
"epoch": 13.970845481049563,
"grad_norm": 4.508551419683562,
"learning_rate": 1.005670682391098e-06,
"loss": 0.0425,
"step": 599
},
{
"epoch": 13.994169096209912,
"grad_norm": 5.648490346236622,
"learning_rate": 9.97946564084923e-07,
"loss": 0.0513,
"step": 600
},
{
"epoch": 14.017492711370263,
"grad_norm": 3.6058075684685664,
"learning_rate": 9.90244819374122e-07,
"loss": 0.0382,
"step": 601
},
{
"epoch": 14.040816326530612,
"grad_norm": 3.4872878690349833,
"learning_rate": 9.825655629787952e-07,
"loss": 0.0408,
"step": 602
},
{
"epoch": 14.064139941690962,
"grad_norm": 3.9437709705018924,
"learning_rate": 9.749089092840694e-07,
"loss": 0.0358,
"step": 603
},
{
"epoch": 14.087463556851311,
"grad_norm": 3.098542479921476,
"learning_rate": 9.672749723383993e-07,
"loss": 0.0314,
"step": 604
},
{
"epoch": 14.110787172011662,
"grad_norm": 4.3034719422876035,
"learning_rate": 9.596638658518644e-07,
"loss": 0.0403,
"step": 605
},
{
"epoch": 14.13411078717201,
"grad_norm": 3.9472960669595554,
"learning_rate": 9.520757031944786e-07,
"loss": 0.0371,
"step": 606
},
{
"epoch": 14.157434402332362,
"grad_norm": 3.634289513039296,
"learning_rate": 9.445105973944962e-07,
"loss": 0.0357,
"step": 607
},
{
"epoch": 14.180758017492712,
"grad_norm": 4.147703633888312,
"learning_rate": 9.369686611367354e-07,
"loss": 0.0341,
"step": 608
},
{
"epoch": 14.204081632653061,
"grad_norm": 4.857802746929539,
"learning_rate": 9.294500067608941e-07,
"loss": 0.0345,
"step": 609
},
{
"epoch": 14.227405247813412,
"grad_norm": 4.887916863266706,
"learning_rate": 9.219547462598796e-07,
"loss": 0.0376,
"step": 610
},
{
"epoch": 14.25072886297376,
"grad_norm": 4.601389028391382,
"learning_rate": 9.1448299127814e-07,
"loss": 0.0362,
"step": 611
},
{
"epoch": 14.274052478134111,
"grad_norm": 4.6368102761212,
"learning_rate": 9.070348531099982e-07,
"loss": 0.0363,
"step": 612
},
{
"epoch": 14.29737609329446,
"grad_norm": 5.32549464834129,
"learning_rate": 8.99610442697999e-07,
"loss": 0.0361,
"step": 613
},
{
"epoch": 14.32069970845481,
"grad_norm": 4.278979290973843,
"learning_rate": 8.922098706312548e-07,
"loss": 0.0356,
"step": 614
},
{
"epoch": 14.34402332361516,
"grad_norm": 5.638938419185062,
"learning_rate": 8.848332471437948e-07,
"loss": 0.0369,
"step": 615
},
{
"epoch": 14.36734693877551,
"grad_norm": 4.43808624263548,
"learning_rate": 8.774806821129286e-07,
"loss": 0.0362,
"step": 616
},
{
"epoch": 14.39067055393586,
"grad_norm": 5.872231987967421,
"learning_rate": 8.701522850576069e-07,
"loss": 0.037,
"step": 617
},
{
"epoch": 14.41399416909621,
"grad_norm": 5.542707570154857,
"learning_rate": 8.628481651367876e-07,
"loss": 0.0342,
"step": 618
},
{
"epoch": 14.43731778425656,
"grad_norm": 3.9740951467872248,
"learning_rate": 8.555684311478158e-07,
"loss": 0.0301,
"step": 619
},
{
"epoch": 14.46064139941691,
"grad_norm": 4.520913165862787,
"learning_rate": 8.483131915247969e-07,
"loss": 0.0369,
"step": 620
},
{
"epoch": 14.48396501457726,
"grad_norm": 6.336511249646333,
"learning_rate": 8.410825543369866e-07,
"loss": 0.0381,
"step": 621
},
{
"epoch": 14.507288629737609,
"grad_norm": 6.351571809168301,
"learning_rate": 8.33876627287179e-07,
"loss": 0.0411,
"step": 622
},
{
"epoch": 14.53061224489796,
"grad_norm": 5.359115371865932,
"learning_rate": 8.266955177100997e-07,
"loss": 0.0302,
"step": 623
},
{
"epoch": 14.55393586005831,
"grad_norm": 4.855515888749458,
"learning_rate": 8.195393325708123e-07,
"loss": 0.0349,
"step": 624
},
{
"epoch": 14.577259475218659,
"grad_norm": 5.699262591396659,
"learning_rate": 8.124081784631219e-07,
"loss": 0.0293,
"step": 625
},
{
"epoch": 14.60058309037901,
"grad_norm": 4.567521558688794,
"learning_rate": 8.053021616079873e-07,
"loss": 0.0328,
"step": 626
},
{
"epoch": 14.623906705539358,
"grad_norm": 4.609803260692903,
"learning_rate": 7.982213878519405e-07,
"loss": 0.0317,
"step": 627
},
{
"epoch": 14.647230320699709,
"grad_norm": 6.458495632306298,
"learning_rate": 7.911659626655066e-07,
"loss": 0.0354,
"step": 628
},
{
"epoch": 14.670553935860058,
"grad_norm": 4.150607071561942,
"learning_rate": 7.841359911416377e-07,
"loss": 0.0388,
"step": 629
},
{
"epoch": 14.693877551020408,
"grad_norm": 5.1220569999529095,
"learning_rate": 7.771315779941435e-07,
"loss": 0.0347,
"step": 630
},
{
"epoch": 14.717201166180757,
"grad_norm": 5.132376038270946,
"learning_rate": 7.701528275561349e-07,
"loss": 0.0365,
"step": 631
},
{
"epoch": 14.740524781341108,
"grad_norm": 3.5918722803395284,
"learning_rate": 7.631998437784644e-07,
"loss": 0.035,
"step": 632
},
{
"epoch": 14.763848396501459,
"grad_norm": 4.178545815525575,
"learning_rate": 7.56272730228185e-07,
"loss": 0.0327,
"step": 633
},
{
"epoch": 14.787172011661808,
"grad_norm": 4.4614095087413235,
"learning_rate": 7.493715900870027e-07,
"loss": 0.0368,
"step": 634
},
{
"epoch": 14.810495626822158,
"grad_norm": 4.519648571024788,
"learning_rate": 7.424965261497394e-07,
"loss": 0.0372,
"step": 635
},
{
"epoch": 14.833819241982507,
"grad_norm": 5.416749669930699,
"learning_rate": 7.356476408228047e-07,
"loss": 0.0355,
"step": 636
},
{
"epoch": 14.857142857142858,
"grad_norm": 5.91713297264535,
"learning_rate": 7.288250361226692e-07,
"loss": 0.0398,
"step": 637
},
{
"epoch": 14.880466472303207,
"grad_norm": 4.330248150135053,
"learning_rate": 7.220288136743428e-07,
"loss": 0.0349,
"step": 638
},
{
"epoch": 14.903790087463557,
"grad_norm": 5.328988729592788,
"learning_rate": 7.152590747098645e-07,
"loss": 0.0396,
"step": 639
},
{
"epoch": 14.927113702623906,
"grad_norm": 5.698975775535172,
"learning_rate": 7.085159200667929e-07,
"loss": 0.0371,
"step": 640
},
{
"epoch": 14.950437317784257,
"grad_norm": 4.510406963992824,
"learning_rate": 7.017994501867017e-07,
"loss": 0.032,
"step": 641
},
{
"epoch": 14.973760932944607,
"grad_norm": 5.494979127214271,
"learning_rate": 6.95109765113689e-07,
"loss": 0.0347,
"step": 642
},
{
"epoch": 14.997084548104956,
"grad_norm": 5.2319532571156975,
"learning_rate": 6.884469644928834e-07,
"loss": 0.0392,
"step": 643
},
{
"epoch": 15.020408163265307,
"grad_norm": 3.3408845406351566,
"learning_rate": 6.818111475689585e-07,
"loss": 0.0305,
"step": 644
},
{
"epoch": 15.043731778425656,
"grad_norm": 3.5179721711116914,
"learning_rate": 6.752024131846586e-07,
"loss": 0.0263,
"step": 645
},
{
"epoch": 15.067055393586006,
"grad_norm": 4.349263595862402,
"learning_rate": 6.68620859779324e-07,
"loss": 0.0315,
"step": 646
},
{
"epoch": 15.090379008746355,
"grad_norm": 3.617996290180757,
"learning_rate": 6.620665853874252e-07,
"loss": 0.0286,
"step": 647
},
{
"epoch": 15.113702623906706,
"grad_norm": 3.675480407662625,
"learning_rate": 6.555396876371017e-07,
"loss": 0.0277,
"step": 648
},
{
"epoch": 15.137026239067055,
"grad_norm": 4.156256424748324,
"learning_rate": 6.490402637487093e-07,
"loss": 0.0293,
"step": 649
},
{
"epoch": 15.160349854227405,
"grad_norm": 4.293695334540458,
"learning_rate": 6.425684105333713e-07,
"loss": 0.029,
"step": 650
},
{
"epoch": 15.183673469387756,
"grad_norm": 4.138107513358523,
"learning_rate": 6.361242243915364e-07,
"loss": 0.0243,
"step": 651
},
{
"epoch": 15.206997084548105,
"grad_norm": 3.686422675568326,
"learning_rate": 6.297078013115435e-07,
"loss": 0.0265,
"step": 652
},
{
"epoch": 15.230320699708455,
"grad_norm": 4.378657093156888,
"learning_rate": 6.23319236868189e-07,
"loss": 0.0292,
"step": 653
},
{
"epoch": 15.253644314868804,
"grad_norm": 4.339007965833144,
"learning_rate": 6.169586262213081e-07,
"loss": 0.0302,
"step": 654
},
{
"epoch": 15.276967930029155,
"grad_norm": 6.16875137282707,
"learning_rate": 6.106260641143547e-07,
"loss": 0.0297,
"step": 655
},
{
"epoch": 15.300291545189504,
"grad_norm": 4.435156404818784,
"learning_rate": 6.043216448729869e-07,
"loss": 0.0255,
"step": 656
},
{
"epoch": 15.323615160349854,
"grad_norm": 3.8681138903824595,
"learning_rate": 5.980454624036696e-07,
"loss": 0.0286,
"step": 657
},
{
"epoch": 15.346938775510203,
"grad_norm": 3.945328695433962,
"learning_rate": 5.917976101922676e-07,
"loss": 0.0309,
"step": 658
},
{
"epoch": 15.370262390670554,
"grad_norm": 3.941723934561367,
"learning_rate": 5.855781813026595e-07,
"loss": 0.0242,
"step": 659
},
{
"epoch": 15.393586005830905,
"grad_norm": 3.615879555645832,
"learning_rate": 5.79387268375349e-07,
"loss": 0.0322,
"step": 660
},
{
"epoch": 15.416909620991254,
"grad_norm": 4.258040507799856,
"learning_rate": 5.73224963626082e-07,
"loss": 0.0253,
"step": 661
},
{
"epoch": 15.440233236151604,
"grad_norm": 4.208961490294516,
"learning_rate": 5.670913588444788e-07,
"loss": 0.0344,
"step": 662
},
{
"epoch": 15.463556851311953,
"grad_norm": 4.070003446535343,
"learning_rate": 5.609865453926635e-07,
"loss": 0.0286,
"step": 663
},
{
"epoch": 15.486880466472304,
"grad_norm": 3.942814222935444,
"learning_rate": 5.549106142039018e-07,
"loss": 0.0305,
"step": 664
},
{
"epoch": 15.510204081632653,
"grad_norm": 3.9970858210363684,
"learning_rate": 5.488636557812499e-07,
"loss": 0.0239,
"step": 665
},
{
"epoch": 15.533527696793003,
"grad_norm": 4.50085123276231,
"learning_rate": 5.428457601962048e-07,
"loss": 0.0274,
"step": 666
},
{
"epoch": 15.556851311953352,
"grad_norm": 3.6513940892295973,
"learning_rate": 5.368570170873618e-07,
"loss": 0.0237,
"step": 667
},
{
"epoch": 15.580174927113703,
"grad_norm": 3.110499959666367,
"learning_rate": 5.30897515659082e-07,
"loss": 0.0251,
"step": 668
},
{
"epoch": 15.603498542274053,
"grad_norm": 4.673667130206186,
"learning_rate": 5.249673446801584e-07,
"loss": 0.0309,
"step": 669
},
{
"epoch": 15.626822157434402,
"grad_norm": 4.138706900217531,
"learning_rate": 5.190665924825005e-07,
"loss": 0.027,
"step": 670
},
{
"epoch": 15.650145772594753,
"grad_norm": 3.9353264758772837,
"learning_rate": 5.13195346959813e-07,
"loss": 0.0272,
"step": 671
},
{
"epoch": 15.673469387755102,
"grad_norm": 4.734505931693013,
"learning_rate": 5.0735369556629e-07,
"loss": 0.0279,
"step": 672
},
{
"epoch": 15.696793002915452,
"grad_norm": 4.469816067386837,
"learning_rate": 5.015417253153093e-07,
"loss": 0.0272,
"step": 673
},
{
"epoch": 15.720116618075801,
"grad_norm": 4.5790698017199345,
"learning_rate": 4.957595227781395e-07,
"loss": 0.0325,
"step": 674
},
{
"epoch": 15.743440233236152,
"grad_norm": 3.9610773848110963,
"learning_rate": 4.90007174082649e-07,
"loss": 0.0267,
"step": 675
},
{
"epoch": 15.7667638483965,
"grad_norm": 4.231649387279768,
"learning_rate": 4.842847649120216e-07,
"loss": 0.0262,
"step": 676
},
{
"epoch": 15.790087463556851,
"grad_norm": 4.758712007026724,
"learning_rate": 4.785923805034831e-07,
"loss": 0.0278,
"step": 677
},
{
"epoch": 15.813411078717202,
"grad_norm": 3.245691083958514,
"learning_rate": 4.729301056470309e-07,
"loss": 0.0244,
"step": 678
},
{
"epoch": 15.83673469387755,
"grad_norm": 3.6139691332125388,
"learning_rate": 4.67298024684168e-07,
"loss": 0.0246,
"step": 679
},
{
"epoch": 15.860058309037901,
"grad_norm": 4.715830525039419,
"learning_rate": 4.616962215066512e-07,
"loss": 0.0262,
"step": 680
},
{
"epoch": 15.88338192419825,
"grad_norm": 4.003030344359909,
"learning_rate": 4.5612477955523953e-07,
"loss": 0.0293,
"step": 681
},
{
"epoch": 15.906705539358601,
"grad_norm": 4.0002567685684305,
"learning_rate": 4.505837818184489e-07,
"loss": 0.0265,
"step": 682
},
{
"epoch": 15.93002915451895,
"grad_norm": 4.240186693651765,
"learning_rate": 4.4507331083132074e-07,
"loss": 0.0335,
"step": 683
},
{
"epoch": 15.9533527696793,
"grad_norm": 3.8489095902442956,
"learning_rate": 4.3959344867419015e-07,
"loss": 0.0259,
"step": 684
},
{
"epoch": 15.97667638483965,
"grad_norm": 5.316228836989733,
"learning_rate": 4.34144276971461e-07,
"loss": 0.0323,
"step": 685
},
{
"epoch": 16.0,
"grad_norm": 4.182654694810523,
"learning_rate": 4.2872587689039486e-07,
"loss": 0.0235,
"step": 686
},
{
"epoch": 16.02332361516035,
"grad_norm": 4.422215964176046,
"learning_rate": 4.2333832913989845e-07,
"loss": 0.0249,
"step": 687
},
{
"epoch": 16.0466472303207,
"grad_norm": 5.1425082111061275,
"learning_rate": 4.179817139693232e-07,
"loss": 0.0241,
"step": 688
},
{
"epoch": 16.06997084548105,
"grad_norm": 5.364149630400351,
"learning_rate": 4.126561111672689e-07,
"loss": 0.0253,
"step": 689
},
{
"epoch": 16.0932944606414,
"grad_norm": 3.1119300657519378,
"learning_rate": 4.073616000603947e-07,
"loss": 0.021,
"step": 690
},
{
"epoch": 16.11661807580175,
"grad_norm": 4.5740999097233175,
"learning_rate": 4.0209825951223996e-07,
"loss": 0.0229,
"step": 691
},
{
"epoch": 16.1399416909621,
"grad_norm": 4.6084190282505135,
"learning_rate": 3.9686616792204677e-07,
"loss": 0.0211,
"step": 692
},
{
"epoch": 16.163265306122447,
"grad_norm": 5.429734571513587,
"learning_rate": 3.91665403223595e-07,
"loss": 0.0238,
"step": 693
},
{
"epoch": 16.186588921282798,
"grad_norm": 3.273517325436302,
"learning_rate": 3.864960428840375e-07,
"loss": 0.0273,
"step": 694
},
{
"epoch": 16.20991253644315,
"grad_norm": 4.565062756812091,
"learning_rate": 3.8135816390275027e-07,
"loss": 0.0228,
"step": 695
},
{
"epoch": 16.2332361516035,
"grad_norm": 5.08354503420065,
"learning_rate": 3.7625184281018385e-07,
"loss": 0.0253,
"step": 696
},
{
"epoch": 16.25655976676385,
"grad_norm": 4.670042973714779,
"learning_rate": 3.711771556667218e-07,
"loss": 0.0255,
"step": 697
},
{
"epoch": 16.279883381924197,
"grad_norm": 4.075983998200786,
"learning_rate": 3.6613417806155126e-07,
"loss": 0.0236,
"step": 698
},
{
"epoch": 16.303206997084548,
"grad_norm": 4.274402795314428,
"learning_rate": 3.611229851115333e-07,
"loss": 0.0225,
"step": 699
},
{
"epoch": 16.3265306122449,
"grad_norm": 4.209429808941616,
"learning_rate": 3.561436514600866e-07,
"loss": 0.0203,
"step": 700
},
{
"epoch": 16.34985422740525,
"grad_norm": 4.145068738710229,
"learning_rate": 3.511962512760758e-07,
"loss": 0.0196,
"step": 701
},
{
"epoch": 16.373177842565596,
"grad_norm": 3.934032790714013,
"learning_rate": 3.46280858252703e-07,
"loss": 0.0205,
"step": 702
},
{
"epoch": 16.396501457725947,
"grad_norm": 3.836792145595831,
"learning_rate": 3.413975456064153e-07,
"loss": 0.0235,
"step": 703
},
{
"epoch": 16.419825072886297,
"grad_norm": 4.405125467719681,
"learning_rate": 3.365463860758114e-07,
"loss": 0.0234,
"step": 704
},
{
"epoch": 16.443148688046648,
"grad_norm": 3.49860367850944,
"learning_rate": 3.3172745192055656e-07,
"loss": 0.0224,
"step": 705
},
{
"epoch": 16.466472303207,
"grad_norm": 4.330738586794788,
"learning_rate": 3.2694081492031017e-07,
"loss": 0.0239,
"step": 706
},
{
"epoch": 16.489795918367346,
"grad_norm": 4.006534960248732,
"learning_rate": 3.2218654637365373e-07,
"loss": 0.0197,
"step": 707
},
{
"epoch": 16.513119533527696,
"grad_norm": 3.823420159690705,
"learning_rate": 3.1746471709702963e-07,
"loss": 0.0246,
"step": 708
},
{
"epoch": 16.536443148688047,
"grad_norm": 4.483000541351714,
"learning_rate": 3.1277539742368636e-07,
"loss": 0.0203,
"step": 709
},
{
"epoch": 16.559766763848398,
"grad_norm": 4.425099413773361,
"learning_rate": 3.081186572026318e-07,
"loss": 0.0204,
"step": 710
},
{
"epoch": 16.583090379008745,
"grad_norm": 5.020044774011588,
"learning_rate": 3.034945657975899e-07,
"loss": 0.0211,
"step": 711
},
{
"epoch": 16.606413994169095,
"grad_norm": 3.481525436271193,
"learning_rate": 2.9890319208597056e-07,
"loss": 0.0209,
"step": 712
},
{
"epoch": 16.629737609329446,
"grad_norm": 5.084619307319498,
"learning_rate": 2.943446044578432e-07,
"loss": 0.0204,
"step": 713
},
{
"epoch": 16.653061224489797,
"grad_norm": 3.942713963829196,
"learning_rate": 2.898188708149158e-07,
"loss": 0.0231,
"step": 714
},
{
"epoch": 16.676384839650147,
"grad_norm": 3.8706390698936537,
"learning_rate": 2.8532605856952613e-07,
"loss": 0.023,
"step": 715
},
{
"epoch": 16.699708454810494,
"grad_norm": 4.072339627624961,
"learning_rate": 2.808662346436372e-07,
"loss": 0.0247,
"step": 716
},
{
"epoch": 16.723032069970845,
"grad_norm": 3.7024539387765794,
"learning_rate": 2.764394654678379e-07,
"loss": 0.0228,
"step": 717
},
{
"epoch": 16.746355685131196,
"grad_norm": 3.2166705775525393,
"learning_rate": 2.720458169803569e-07,
"loss": 0.02,
"step": 718
},
{
"epoch": 16.769679300291546,
"grad_norm": 3.80832130435794,
"learning_rate": 2.676853546260791e-07,
"loss": 0.025,
"step": 719
},
{
"epoch": 16.793002915451893,
"grad_norm": 2.807860298577788,
"learning_rate": 2.6335814335556933e-07,
"loss": 0.0177,
"step": 720
},
{
"epoch": 16.816326530612244,
"grad_norm": 4.087349117189053,
"learning_rate": 2.590642476241076e-07,
"loss": 0.0219,
"step": 721
},
{
"epoch": 16.839650145772595,
"grad_norm": 4.537123457993102,
"learning_rate": 2.5480373139072784e-07,
"loss": 0.0225,
"step": 722
},
{
"epoch": 16.862973760932945,
"grad_norm": 4.516624049775102,
"learning_rate": 2.505766581172628e-07,
"loss": 0.0219,
"step": 723
},
{
"epoch": 16.886297376093296,
"grad_norm": 3.8164973331073773,
"learning_rate": 2.4638309076740364e-07,
"loss": 0.0202,
"step": 724
},
{
"epoch": 16.909620991253643,
"grad_norm": 3.655723859963662,
"learning_rate": 2.422230918057586e-07,
"loss": 0.024,
"step": 725
},
{
"epoch": 16.932944606413994,
"grad_norm": 4.001000040929099,
"learning_rate": 2.3809672319692224e-07,
"loss": 0.0208,
"step": 726
},
{
"epoch": 16.956268221574344,
"grad_norm": 3.858770160925135,
"learning_rate": 2.340040464045548e-07,
"loss": 0.0224,
"step": 727
},
{
"epoch": 16.979591836734695,
"grad_norm": 3.482575640490216,
"learning_rate": 2.2994512239046557e-07,
"loss": 0.0198,
"step": 728
},
{
"epoch": 17.002915451895042,
"grad_norm": 3.323685241274939,
"learning_rate": 2.2592001161370392e-07,
"loss": 0.0207,
"step": 729
},
{
"epoch": 17.026239067055393,
"grad_norm": 3.4118874471688594,
"learning_rate": 2.219287740296605e-07,
"loss": 0.0165,
"step": 730
},
{
"epoch": 17.049562682215743,
"grad_norm": 2.917633078186951,
"learning_rate": 2.1797146908917295e-07,
"loss": 0.019,
"step": 731
},
{
"epoch": 17.072886297376094,
"grad_norm": 3.0512810565097785,
"learning_rate": 2.1404815573764004e-07,
"loss": 0.0189,
"step": 732
},
{
"epoch": 17.096209912536445,
"grad_norm": 2.9573534206720615,
"learning_rate": 2.1015889241414456e-07,
"loss": 0.0175,
"step": 733
},
{
"epoch": 17.11953352769679,
"grad_norm": 3.6324930030036007,
"learning_rate": 2.0630373705058408e-07,
"loss": 0.0148,
"step": 734
},
{
"epoch": 17.142857142857142,
"grad_norm": 3.430174912422043,
"learning_rate": 2.0248274707080378e-07,
"loss": 0.0184,
"step": 735
},
{
"epoch": 17.166180758017493,
"grad_norm": 3.3850899093610507,
"learning_rate": 1.9869597938974611e-07,
"loss": 0.0162,
"step": 736
},
{
"epoch": 17.189504373177844,
"grad_norm": 3.042707047575919,
"learning_rate": 1.9494349041260018e-07,
"loss": 0.0204,
"step": 737
},
{
"epoch": 17.21282798833819,
"grad_norm": 3.596583037536939,
"learning_rate": 1.9122533603396165e-07,
"loss": 0.0188,
"step": 738
},
{
"epoch": 17.23615160349854,
"grad_norm": 2.6161715685027778,
"learning_rate": 1.875415716370013e-07,
"loss": 0.0158,
"step": 739
},
{
"epoch": 17.259475218658892,
"grad_norm": 3.1256243272829085,
"learning_rate": 1.8389225209263867e-07,
"loss": 0.0194,
"step": 740
},
{
"epoch": 17.282798833819243,
"grad_norm": 3.3380374775697406,
"learning_rate": 1.8027743175872663e-07,
"loss": 0.026,
"step": 741
},
{
"epoch": 17.306122448979593,
"grad_norm": 2.8879281325234887,
"learning_rate": 1.7669716447924062e-07,
"loss": 0.0155,
"step": 742
},
{
"epoch": 17.32944606413994,
"grad_norm": 3.3041476627170545,
"learning_rate": 1.7315150358347492e-07,
"loss": 0.0199,
"step": 743
},
{
"epoch": 17.35276967930029,
"grad_norm": 3.36979570737397,
"learning_rate": 1.696405018852515e-07,
"loss": 0.0186,
"step": 744
},
{
"epoch": 17.37609329446064,
"grad_norm": 3.0199209655120867,
"learning_rate": 1.6616421168213215e-07,
"loss": 0.0174,
"step": 745
},
{
"epoch": 17.399416909620992,
"grad_norm": 2.929456045544688,
"learning_rate": 1.627226847546376e-07,
"loss": 0.0197,
"step": 746
},
{
"epoch": 17.42274052478134,
"grad_norm": 2.806790809090712,
"learning_rate": 1.5931597236547886e-07,
"loss": 0.0173,
"step": 747
},
{
"epoch": 17.44606413994169,
"grad_norm": 3.7535778779294233,
"learning_rate": 1.5594412525879288e-07,
"loss": 0.0131,
"step": 748
},
{
"epoch": 17.46938775510204,
"grad_norm": 3.469150966804552,
"learning_rate": 1.5260719365938537e-07,
"loss": 0.0151,
"step": 749
},
{
"epoch": 17.49271137026239,
"grad_norm": 3.916355337119822,
"learning_rate": 1.4930522727198472e-07,
"loss": 0.0187,
"step": 750
},
{
"epoch": 17.516034985422742,
"grad_norm": 3.5350399778012864,
"learning_rate": 1.4603827528050035e-07,
"loss": 0.0172,
"step": 751
},
{
"epoch": 17.53935860058309,
"grad_norm": 3.898385148853154,
"learning_rate": 1.428063863472895e-07,
"loss": 0.0276,
"step": 752
},
{
"epoch": 17.56268221574344,
"grad_norm": 3.3704370807229593,
"learning_rate": 1.396096086124346e-07,
"loss": 0.0218,
"step": 753
},
{
"epoch": 17.58600583090379,
"grad_norm": 3.9975639316201232,
"learning_rate": 1.3644798969302403e-07,
"loss": 0.0155,
"step": 754
},
{
"epoch": 17.60932944606414,
"grad_norm": 4.277717636698487,
"learning_rate": 1.333215766824436e-07,
"loss": 0.0206,
"step": 755
},
{
"epoch": 17.632653061224488,
"grad_norm": 3.5163862294354113,
"learning_rate": 1.302304161496759e-07,
"loss": 0.0216,
"step": 756
},
{
"epoch": 17.65597667638484,
"grad_norm": 3.3342619715001827,
"learning_rate": 1.2717455413860574e-07,
"loss": 0.0222,
"step": 757
},
{
"epoch": 17.67930029154519,
"grad_norm": 3.3972292100389683,
"learning_rate": 1.2415403616733322e-07,
"loss": 0.0167,
"step": 758
},
{
"epoch": 17.70262390670554,
"grad_norm": 3.6025702996578777,
"learning_rate": 1.211689072274988e-07,
"loss": 0.015,
"step": 759
},
{
"epoch": 17.72594752186589,
"grad_norm": 3.587962524158811,
"learning_rate": 1.1821921178361062e-07,
"loss": 0.0199,
"step": 760
},
{
"epoch": 17.749271137026238,
"grad_norm": 3.2190849352151916,
"learning_rate": 1.1530499377238213e-07,
"loss": 0.0181,
"step": 761
},
{
"epoch": 17.77259475218659,
"grad_norm": 3.2480067963458583,
"learning_rate": 1.1242629660207922e-07,
"loss": 0.0191,
"step": 762
},
{
"epoch": 17.79591836734694,
"grad_norm": 4.435559560404031,
"learning_rate": 1.0958316315187289e-07,
"loss": 0.0167,
"step": 763
},
{
"epoch": 17.81924198250729,
"grad_norm": 3.296628825321176,
"learning_rate": 1.0677563577119965e-07,
"loss": 0.0144,
"step": 764
},
{
"epoch": 17.842565597667637,
"grad_norm": 3.718101380786653,
"learning_rate": 1.0400375627913268e-07,
"loss": 0.0201,
"step": 765
},
{
"epoch": 17.865889212827987,
"grad_norm": 3.313696447261319,
"learning_rate": 1.0126756596375687e-07,
"loss": 0.0139,
"step": 766
},
{
"epoch": 17.889212827988338,
"grad_norm": 2.9657404050088982,
"learning_rate": 9.856710558155458e-08,
"loss": 0.022,
"step": 767
},
{
"epoch": 17.91253644314869,
"grad_norm": 3.339676273898552,
"learning_rate": 9.590241535679973e-08,
"loss": 0.0191,
"step": 768
},
{
"epoch": 17.93586005830904,
"grad_norm": 3.3890064788873495,
"learning_rate": 9.32735349809566e-08,
"loss": 0.0173,
"step": 769
},
{
"epoch": 17.959183673469386,
"grad_norm": 3.733632544607117,
"learning_rate": 9.068050361209002e-08,
"loss": 0.0165,
"step": 770
},
{
"epoch": 17.982507288629737,
"grad_norm": 3.167836742731714,
"learning_rate": 8.8123359874282e-08,
"loss": 0.019,
"step": 771
},
{
"epoch": 18.005830903790088,
"grad_norm": 3.1147726930617843,
"learning_rate": 8.560214185705573e-08,
"loss": 0.0201,
"step": 772
},
{
"epoch": 18.02915451895044,
"grad_norm": 2.7408804971223417,
"learning_rate": 8.311688711480858e-08,
"loss": 0.0202,
"step": 773
},
{
"epoch": 18.05247813411079,
"grad_norm": 2.573615274038812,
"learning_rate": 8.066763266625283e-08,
"loss": 0.0157,
"step": 774
},
{
"epoch": 18.075801749271136,
"grad_norm": 3.2240784622186216,
"learning_rate": 7.825441499386437e-08,
"loss": 0.0156,
"step": 775
},
{
"epoch": 18.099125364431487,
"grad_norm": 3.053222148681091,
"learning_rate": 7.587727004333856e-08,
"loss": 0.0148,
"step": 776
},
{
"epoch": 18.122448979591837,
"grad_norm": 3.3140019304710755,
"learning_rate": 7.353623322305603e-08,
"loss": 0.0164,
"step": 777
},
{
"epoch": 18.145772594752188,
"grad_norm": 3.4693247001133947,
"learning_rate": 7.123133940355409e-08,
"loss": 0.0174,
"step": 778
},
{
"epoch": 18.169096209912535,
"grad_norm": 2.8945333915836375,
"learning_rate": 6.89626229170079e-08,
"loss": 0.0195,
"step": 779
},
{
"epoch": 18.192419825072886,
"grad_norm": 3.957731796869534,
"learning_rate": 6.673011755671921e-08,
"loss": 0.0156,
"step": 780
},
{
"epoch": 18.215743440233236,
"grad_norm": 3.5887708586136795,
"learning_rate": 6.45338565766121e-08,
"loss": 0.0185,
"step": 781
},
{
"epoch": 18.239067055393587,
"grad_norm": 2.484618744799056,
"learning_rate": 6.237387269073913e-08,
"loss": 0.0172,
"step": 782
},
{
"epoch": 18.262390670553934,
"grad_norm": 2.7343133537972837,
"learning_rate": 6.025019807279292e-08,
"loss": 0.0135,
"step": 783
},
{
"epoch": 18.285714285714285,
"grad_norm": 3.6152761859835745,
"learning_rate": 5.8162864355626736e-08,
"loss": 0.0171,
"step": 784
},
{
"epoch": 18.309037900874635,
"grad_norm": 3.003376649917478,
"learning_rate": 5.6111902630784643e-08,
"loss": 0.0164,
"step": 785
},
{
"epoch": 18.332361516034986,
"grad_norm": 3.015679551654015,
"learning_rate": 5.409734344803713e-08,
"loss": 0.0142,
"step": 786
},
{
"epoch": 18.355685131195337,
"grad_norm": 2.8544846703309332,
"learning_rate": 5.211921681492649e-08,
"loss": 0.0155,
"step": 787
},
{
"epoch": 18.379008746355684,
"grad_norm": 3.284336491294599,
"learning_rate": 5.017755219631964e-08,
"loss": 0.013,
"step": 788
},
{
"epoch": 18.402332361516034,
"grad_norm": 2.8660825008489135,
"learning_rate": 4.827237851396965e-08,
"loss": 0.0184,
"step": 789
},
{
"epoch": 18.425655976676385,
"grad_norm": 2.5316419886612014,
"learning_rate": 4.640372414608491e-08,
"loss": 0.0113,
"step": 790
},
{
"epoch": 18.448979591836736,
"grad_norm": 3.9166951415196976,
"learning_rate": 4.457161692690587e-08,
"loss": 0.0156,
"step": 791
},
{
"epoch": 18.472303206997086,
"grad_norm": 2.7355307616015683,
"learning_rate": 4.277608414629097e-08,
"loss": 0.0165,
"step": 792
},
{
"epoch": 18.495626822157433,
"grad_norm": 2.8313800773465614,
"learning_rate": 4.101715254930999e-08,
"loss": 0.018,
"step": 793
},
{
"epoch": 18.518950437317784,
"grad_norm": 2.7990361393935856,
"learning_rate": 3.929484833584546e-08,
"loss": 0.0145,
"step": 794
},
{
"epoch": 18.542274052478135,
"grad_norm": 2.7540460645747307,
"learning_rate": 3.760919716020328e-08,
"loss": 0.015,
"step": 795
},
{
"epoch": 18.565597667638485,
"grad_norm": 2.940018568434434,
"learning_rate": 3.596022413072886e-08,
"loss": 0.0201,
"step": 796
},
{
"epoch": 18.588921282798832,
"grad_norm": 3.1727857362746867,
"learning_rate": 3.4347953809434894e-08,
"loss": 0.0175,
"step": 797
},
{
"epoch": 18.612244897959183,
"grad_norm": 3.5560591178845913,
"learning_rate": 3.277241021163502e-08,
"loss": 0.0166,
"step": 798
},
{
"epoch": 18.635568513119534,
"grad_norm": 3.203525425142765,
"learning_rate": 3.1233616805584896e-08,
"loss": 0.0167,
"step": 799
},
{
"epoch": 18.658892128279884,
"grad_norm": 2.797864600345931,
"learning_rate": 2.9731596512134753e-08,
"loss": 0.0165,
"step": 800
},
{
"epoch": 18.682215743440235,
"grad_norm": 3.571162483657132,
"learning_rate": 2.826637170438601e-08,
"loss": 0.0176,
"step": 801
},
{
"epoch": 18.705539358600582,
"grad_norm": 3.4360879945778358,
"learning_rate": 2.683796420735907e-08,
"loss": 0.0158,
"step": 802
},
{
"epoch": 18.728862973760933,
"grad_norm": 3.3669790393708277,
"learning_rate": 2.544639529766829e-08,
"loss": 0.0165,
"step": 803
},
{
"epoch": 18.752186588921283,
"grad_norm": 3.344597824849103,
"learning_rate": 2.409168570320447e-08,
"loss": 0.0145,
"step": 804
},
{
"epoch": 18.775510204081634,
"grad_norm": 3.0129341414147985,
"learning_rate": 2.2773855602826188e-08,
"loss": 0.018,
"step": 805
},
{
"epoch": 18.79883381924198,
"grad_norm": 3.8055786091894377,
"learning_rate": 2.149292462606034e-08,
"loss": 0.0189,
"step": 806
},
{
"epoch": 18.82215743440233,
"grad_norm": 2.7134696282155604,
"learning_rate": 2.0248911852807918e-08,
"loss": 0.0152,
"step": 807
},
{
"epoch": 18.845481049562682,
"grad_norm": 3.205023821475907,
"learning_rate": 1.9041835813061182e-08,
"loss": 0.0174,
"step": 808
},
{
"epoch": 18.868804664723033,
"grad_norm": 3.0937133170371,
"learning_rate": 1.7871714486626944e-08,
"loss": 0.0188,
"step": 809
},
{
"epoch": 18.892128279883384,
"grad_norm": 3.3376923352088217,
"learning_rate": 1.6738565302859276e-08,
"loss": 0.0151,
"step": 810
},
{
"epoch": 18.91545189504373,
"grad_norm": 2.8935337587308427,
"learning_rate": 1.5642405140399998e-08,
"loss": 0.0147,
"step": 811
},
{
"epoch": 18.93877551020408,
"grad_norm": 2.940204917098264,
"learning_rate": 1.458325032692609e-08,
"loss": 0.019,
"step": 812
},
{
"epoch": 18.962099125364432,
"grad_norm": 2.970374375711716,
"learning_rate": 1.3561116638907968e-08,
"loss": 0.0153,
"step": 813
},
{
"epoch": 18.985422740524783,
"grad_norm": 3.290703370989595,
"learning_rate": 1.2576019301373533e-08,
"loss": 0.0182,
"step": 814
},
{
"epoch": 19.00874635568513,
"grad_norm": 3.8527255826600477,
"learning_rate": 1.1627972987681702e-08,
"loss": 0.0153,
"step": 815
},
{
"epoch": 19.03206997084548,
"grad_norm": 2.452166133013113,
"learning_rate": 1.071699181930369e-08,
"loss": 0.0116,
"step": 816
},
{
"epoch": 19.05539358600583,
"grad_norm": 3.2554283291545842,
"learning_rate": 9.843089365613178e-09,
"loss": 0.0182,
"step": 817
},
{
"epoch": 19.07871720116618,
"grad_norm": 3.575695829098365,
"learning_rate": 9.006278643683697e-09,
"loss": 0.0195,
"step": 818
},
{
"epoch": 19.102040816326532,
"grad_norm": 2.3868383199705696,
"learning_rate": 8.206572118094614e-09,
"loss": 0.0116,
"step": 819
},
{
"epoch": 19.12536443148688,
"grad_norm": 3.0586927453946693,
"learning_rate": 7.4439817007460056e-09,
"loss": 0.0161,
"step": 820
},
{
"epoch": 19.14868804664723,
"grad_norm": 2.6031425191557647,
"learning_rate": 6.7185187506804695e-09,
"loss": 0.0139,
"step": 821
},
{
"epoch": 19.17201166180758,
"grad_norm": 2.897274472903437,
"learning_rate": 6.030194073914919e-09,
"loss": 0.0166,
"step": 822
},
{
"epoch": 19.19533527696793,
"grad_norm": 2.6842476438213736,
"learning_rate": 5.379017923278773e-09,
"loss": 0.0152,
"step": 823
},
{
"epoch": 19.21865889212828,
"grad_norm": 3.2130258270849827,
"learning_rate": 4.764999998262132e-09,
"loss": 0.0194,
"step": 824
},
{
"epoch": 19.24198250728863,
"grad_norm": 2.6256504161306493,
"learning_rate": 4.1881494448695046e-09,
"loss": 0.0198,
"step": 825
},
{
"epoch": 19.26530612244898,
"grad_norm": 3.0907940855917286,
"learning_rate": 3.6484748554857508e-09,
"loss": 0.0157,
"step": 826
},
{
"epoch": 19.28862973760933,
"grad_norm": 3.0455037479230396,
"learning_rate": 3.145984268747293e-09,
"loss": 0.0171,
"step": 827
},
{
"epoch": 19.31195335276968,
"grad_norm": 2.913053854478876,
"learning_rate": 2.680685169421382e-09,
"loss": 0.0153,
"step": 828
},
{
"epoch": 19.335276967930028,
"grad_norm": 2.841583600512857,
"learning_rate": 2.252584488296461e-09,
"loss": 0.0173,
"step": 829
},
{
"epoch": 19.35860058309038,
"grad_norm": 3.5529614628152104,
"learning_rate": 1.8616886020766944e-09,
"loss": 0.0152,
"step": 830
},
{
"epoch": 19.38192419825073,
"grad_norm": 2.7916265076742643,
"learning_rate": 1.5080033332892652e-09,
"loss": 0.0139,
"step": 831
},
{
"epoch": 19.40524781341108,
"grad_norm": 2.8798786731171506,
"learning_rate": 1.1915339501963885e-09,
"loss": 0.0159,
"step": 832
},
{
"epoch": 19.428571428571427,
"grad_norm": 2.3887949402642494,
"learning_rate": 9.122851667164867e-10,
"loss": 0.0145,
"step": 833
},
{
"epoch": 19.451895043731778,
"grad_norm": 3.258576154970029,
"learning_rate": 6.702611423550775e-10,
"loss": 0.0132,
"step": 834
},
{
"epoch": 19.47521865889213,
"grad_norm": 3.171290301471705,
"learning_rate": 4.654654821420468e-10,
"loss": 0.0151,
"step": 835
},
{
"epoch": 19.49854227405248,
"grad_norm": 2.941285636222306,
"learning_rate": 2.9790123657835733e-10,
"loss": 0.0196,
"step": 836
},
{
"epoch": 19.52186588921283,
"grad_norm": 2.9155196159591314,
"learning_rate": 1.6757090159025268e-10,
"loss": 0.0169,
"step": 837
},
{
"epoch": 19.545189504373177,
"grad_norm": 3.4301587869352153,
"learning_rate": 7.447641849206433e-11,
"loss": 0.0165,
"step": 838
},
{
"epoch": 19.568513119533527,
"grad_norm": 3.040206732998616,
"learning_rate": 1.8619173957623583e-11,
"loss": 0.0166,
"step": 839
},
{
"epoch": 19.591836734693878,
"grad_norm": 2.972491827489143,
"learning_rate": 0.0,
"loss": 0.016,
"step": 840
}
],
"logging_steps": 1.0,
"max_steps": 840,
"num_input_tokens_seen": 0,
"num_train_epochs": 20,
"save_steps": 5000.0,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 623803706515456.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}