InstructAR / trainer_state.json
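The JSON below is the Hugging Face Trainer state saved with the InstructAR checkpoint: a few top-level run fields (epoch, global_step, eval_steps, process flags) followed by a log_history list with one record per logging step, each carrying the epoch fraction, grad_norm, learning_rate, loss, and step. A minimal sketch, assuming Python with matplotlib installed and this file saved locally as trainer_state.json (both are assumptions, not part of the original file), of how one might read it back and plot the loss curve:

import json

import matplotlib.pyplot as plt

# Load the trainer state written by the Hugging Face Trainer.
with open("trainer_state.json") as f:
    state = json.load(f)

# Each log_history entry with a "loss" key corresponds to one logged training step.
records = [entry for entry in state["log_history"] if "loss" in entry]
steps = [entry["step"] for entry in records]
losses = [entry["loss"] for entry in records]

plt.plot(steps, losses)
plt.xlabel("step")
plt.ylabel("training loss")
plt.title("InstructAR training loss (from trainer_state.json)")
plt.show()

The same pattern works for grad_norm or learning_rate; swap the key pulled from each record.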
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9997828447339848,
"eval_steps": 500,
"global_step": 1151,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0008686210640608035,
"grad_norm": 17.493449382349226,
"learning_rate": 3.3333333333333335e-07,
"loss": 8.7443,
"step": 1
},
{
"epoch": 0.001737242128121607,
"grad_norm": 14.990450099593053,
"learning_rate": 6.666666666666667e-07,
"loss": 8.6957,
"step": 2
},
{
"epoch": 0.0026058631921824105,
"grad_norm": 16.423535165287575,
"learning_rate": 1.0000000000000002e-06,
"loss": 8.692,
"step": 3
},
{
"epoch": 0.003474484256243214,
"grad_norm": 13.143676841573155,
"learning_rate": 1.3333333333333334e-06,
"loss": 8.6702,
"step": 4
},
{
"epoch": 0.004343105320304018,
"grad_norm": 8.964338427609599,
"learning_rate": 1.6666666666666667e-06,
"loss": 8.6436,
"step": 5
},
{
"epoch": 0.005211726384364821,
"grad_norm": 5.711486770337647,
"learning_rate": 2.0000000000000003e-06,
"loss": 8.6472,
"step": 6
},
{
"epoch": 0.006080347448425625,
"grad_norm": 8.527323895494213,
"learning_rate": 2.3333333333333336e-06,
"loss": 8.7054,
"step": 7
},
{
"epoch": 0.006948968512486428,
"grad_norm": 9.12482767652379,
"learning_rate": 2.666666666666667e-06,
"loss": 8.7084,
"step": 8
},
{
"epoch": 0.007817589576547232,
"grad_norm": 5.337796907422001,
"learning_rate": 3e-06,
"loss": 8.5973,
"step": 9
},
{
"epoch": 0.008686210640608035,
"grad_norm": 3.563435103977848,
"learning_rate": 3.3333333333333333e-06,
"loss": 8.594,
"step": 10
},
{
"epoch": 0.009554831704668839,
"grad_norm": 2.8337150046011836,
"learning_rate": 3.6666666666666666e-06,
"loss": 8.562,
"step": 11
},
{
"epoch": 0.010423452768729642,
"grad_norm": 5.305490346258808,
"learning_rate": 4.000000000000001e-06,
"loss": 8.5552,
"step": 12
},
{
"epoch": 0.011292073832790446,
"grad_norm": 3.8824803788805204,
"learning_rate": 4.333333333333334e-06,
"loss": 8.5319,
"step": 13
},
{
"epoch": 0.01216069489685125,
"grad_norm": 2.779267764600503,
"learning_rate": 4.666666666666667e-06,
"loss": 8.5131,
"step": 14
},
{
"epoch": 0.013029315960912053,
"grad_norm": 2.8850795560965943,
"learning_rate": 5e-06,
"loss": 8.4689,
"step": 15
},
{
"epoch": 0.013897937024972856,
"grad_norm": 2.7572282850785186,
"learning_rate": 5.333333333333334e-06,
"loss": 8.574,
"step": 16
},
{
"epoch": 0.01476655808903366,
"grad_norm": 2.573141583562948,
"learning_rate": 5.666666666666667e-06,
"loss": 8.4873,
"step": 17
},
{
"epoch": 0.015635179153094463,
"grad_norm": 3.180322337220607,
"learning_rate": 6e-06,
"loss": 8.4521,
"step": 18
},
{
"epoch": 0.016503800217155265,
"grad_norm": 3.491770308520874,
"learning_rate": 6.333333333333333e-06,
"loss": 8.3846,
"step": 19
},
{
"epoch": 0.01737242128121607,
"grad_norm": 2.540124589419276,
"learning_rate": 6.666666666666667e-06,
"loss": 8.4451,
"step": 20
},
{
"epoch": 0.018241042345276872,
"grad_norm": 3.121998583339167,
"learning_rate": 7e-06,
"loss": 8.2468,
"step": 21
},
{
"epoch": 0.019109663409337677,
"grad_norm": 4.290808656004097,
"learning_rate": 7.333333333333333e-06,
"loss": 8.3109,
"step": 22
},
{
"epoch": 0.01997828447339848,
"grad_norm": 4.450357829892193,
"learning_rate": 7.666666666666667e-06,
"loss": 8.2419,
"step": 23
},
{
"epoch": 0.020846905537459284,
"grad_norm": 3.6215464038427188,
"learning_rate": 8.000000000000001e-06,
"loss": 8.2952,
"step": 24
},
{
"epoch": 0.021715526601520086,
"grad_norm": 4.442035772364645,
"learning_rate": 8.333333333333334e-06,
"loss": 8.377,
"step": 25
},
{
"epoch": 0.02258414766558089,
"grad_norm": 4.843618321464985,
"learning_rate": 8.666666666666668e-06,
"loss": 8.2759,
"step": 26
},
{
"epoch": 0.023452768729641693,
"grad_norm": 2.3677599699211487,
"learning_rate": 9e-06,
"loss": 8.2335,
"step": 27
},
{
"epoch": 0.0243213897937025,
"grad_norm": 2.7402708582612223,
"learning_rate": 9.333333333333334e-06,
"loss": 8.1205,
"step": 28
},
{
"epoch": 0.0251900108577633,
"grad_norm": 3.367587996037935,
"learning_rate": 9.666666666666667e-06,
"loss": 8.1712,
"step": 29
},
{
"epoch": 0.026058631921824105,
"grad_norm": 3.7838438130817647,
"learning_rate": 1e-05,
"loss": 8.155,
"step": 30
},
{
"epoch": 0.026927252985884907,
"grad_norm": 4.375332206336344,
"learning_rate": 9.999982328608275e-06,
"loss": 8.143,
"step": 31
},
{
"epoch": 0.027795874049945712,
"grad_norm": 6.575802280327978,
"learning_rate": 9.999929314571892e-06,
"loss": 8.1258,
"step": 32
},
{
"epoch": 0.028664495114006514,
"grad_norm": 9.035297148846244,
"learning_rate": 9.99984095830722e-06,
"loss": 8.0348,
"step": 33
},
{
"epoch": 0.02953311617806732,
"grad_norm": 6.634311793226191,
"learning_rate": 9.999717260508202e-06,
"loss": 8.0355,
"step": 34
},
{
"epoch": 0.03040173724212812,
"grad_norm": 8.124508838108945,
"learning_rate": 9.999558222146359e-06,
"loss": 7.9736,
"step": 35
},
{
"epoch": 0.031270358306188926,
"grad_norm": 5.8285745211892515,
"learning_rate": 9.999363844470767e-06,
"loss": 7.9337,
"step": 36
},
{
"epoch": 0.03213897937024973,
"grad_norm": 9.094040714600302,
"learning_rate": 9.999134129008061e-06,
"loss": 7.8325,
"step": 37
},
{
"epoch": 0.03300760043431053,
"grad_norm": 8.468740095393665,
"learning_rate": 9.998869077562416e-06,
"loss": 7.8086,
"step": 38
},
{
"epoch": 0.033876221498371335,
"grad_norm": 6.732505858210668,
"learning_rate": 9.998568692215532e-06,
"loss": 7.6726,
"step": 39
},
{
"epoch": 0.03474484256243214,
"grad_norm": 21.57246390769929,
"learning_rate": 9.998232975326619e-06,
"loss": 7.8017,
"step": 40
},
{
"epoch": 0.035613463626492946,
"grad_norm": 12.672871887843293,
"learning_rate": 9.997861929532384e-06,
"loss": 7.6013,
"step": 41
},
{
"epoch": 0.036482084690553744,
"grad_norm": 9.395510661131551,
"learning_rate": 9.997455557747002e-06,
"loss": 7.4226,
"step": 42
},
{
"epoch": 0.03735070575461455,
"grad_norm": 12.735314205401677,
"learning_rate": 9.9970138631621e-06,
"loss": 7.3855,
"step": 43
},
{
"epoch": 0.038219326818675355,
"grad_norm": 10.285907675522205,
"learning_rate": 9.99653684924672e-06,
"loss": 7.3198,
"step": 44
},
{
"epoch": 0.03908794788273615,
"grad_norm": 9.965187203305367,
"learning_rate": 9.996024519747312e-06,
"loss": 7.1805,
"step": 45
},
{
"epoch": 0.03995656894679696,
"grad_norm": 17.926836165323984,
"learning_rate": 9.995476878687687e-06,
"loss": 6.9857,
"step": 46
},
{
"epoch": 0.04082519001085776,
"grad_norm": 23.24576845075726,
"learning_rate": 9.994893930368987e-06,
"loss": 7.0586,
"step": 47
},
{
"epoch": 0.04169381107491857,
"grad_norm": 10.789226700023386,
"learning_rate": 9.994275679369664e-06,
"loss": 6.8635,
"step": 48
},
{
"epoch": 0.04256243213897937,
"grad_norm": 20.819248588210748,
"learning_rate": 9.99362213054543e-06,
"loss": 6.8014,
"step": 49
},
{
"epoch": 0.04343105320304017,
"grad_norm": 13.054140096829588,
"learning_rate": 9.992933289029225e-06,
"loss": 6.6718,
"step": 50
},
{
"epoch": 0.04429967426710098,
"grad_norm": 31.11350513451145,
"learning_rate": 9.992209160231182e-06,
"loss": 6.7444,
"step": 51
},
{
"epoch": 0.04516829533116178,
"grad_norm": 18.66011619667928,
"learning_rate": 9.991449749838567e-06,
"loss": 6.4907,
"step": 52
},
{
"epoch": 0.04603691639522258,
"grad_norm": 30.236445483678942,
"learning_rate": 9.990655063815758e-06,
"loss": 6.4475,
"step": 53
},
{
"epoch": 0.046905537459283386,
"grad_norm": 21.993099100665887,
"learning_rate": 9.989825108404178e-06,
"loss": 6.4676,
"step": 54
},
{
"epoch": 0.04777415852334419,
"grad_norm": 19.83551768600084,
"learning_rate": 9.988959890122257e-06,
"loss": 6.2759,
"step": 55
},
{
"epoch": 0.048642779587405,
"grad_norm": 13.170118097146572,
"learning_rate": 9.988059415765378e-06,
"loss": 6.1095,
"step": 56
},
{
"epoch": 0.049511400651465795,
"grad_norm": 22.67994571341824,
"learning_rate": 9.987123692405825e-06,
"loss": 6.0883,
"step": 57
},
{
"epoch": 0.0503800217155266,
"grad_norm": 13.038280028112723,
"learning_rate": 9.986152727392721e-06,
"loss": 6.0249,
"step": 58
},
{
"epoch": 0.051248642779587406,
"grad_norm": 14.901505032251393,
"learning_rate": 9.985146528351983e-06,
"loss": 6.1762,
"step": 59
},
{
"epoch": 0.05211726384364821,
"grad_norm": 26.832853386529305,
"learning_rate": 9.984105103186245e-06,
"loss": 6.08,
"step": 60
},
{
"epoch": 0.05298588490770901,
"grad_norm": 15.17632623872157,
"learning_rate": 9.983028460074811e-06,
"loss": 5.9952,
"step": 61
},
{
"epoch": 0.053854505971769814,
"grad_norm": 24.418647213861092,
"learning_rate": 9.981916607473589e-06,
"loss": 5.9961,
"step": 62
},
{
"epoch": 0.05472312703583062,
"grad_norm": 18.746932536918226,
"learning_rate": 9.98076955411501e-06,
"loss": 5.8426,
"step": 63
},
{
"epoch": 0.055591748099891425,
"grad_norm": 21.558720998266843,
"learning_rate": 9.97958730900798e-06,
"loss": 5.7789,
"step": 64
},
{
"epoch": 0.05646036916395222,
"grad_norm": 28.510478407780514,
"learning_rate": 9.97836988143779e-06,
"loss": 5.7467,
"step": 65
},
{
"epoch": 0.05732899022801303,
"grad_norm": 15.832636672012686,
"learning_rate": 9.977117280966065e-06,
"loss": 5.8336,
"step": 66
},
{
"epoch": 0.058197611292073834,
"grad_norm": 29.18156046853244,
"learning_rate": 9.975829517430662e-06,
"loss": 5.8591,
"step": 67
},
{
"epoch": 0.05906623235613464,
"grad_norm": 18.247831823792946,
"learning_rate": 9.974506600945618e-06,
"loss": 5.7356,
"step": 68
},
{
"epoch": 0.05993485342019544,
"grad_norm": 23.96714109884484,
"learning_rate": 9.973148541901053e-06,
"loss": 5.7721,
"step": 69
},
{
"epoch": 0.06080347448425624,
"grad_norm": 18.93369316823687,
"learning_rate": 9.9717553509631e-06,
"loss": 5.7324,
"step": 70
},
{
"epoch": 0.06167209554831705,
"grad_norm": 22.87674608218244,
"learning_rate": 9.97032703907381e-06,
"loss": 5.7795,
"step": 71
},
{
"epoch": 0.06254071661237785,
"grad_norm": 22.61900902318279,
"learning_rate": 9.968863617451078e-06,
"loss": 5.7367,
"step": 72
},
{
"epoch": 0.06340933767643865,
"grad_norm": 17.381975236661205,
"learning_rate": 9.967365097588548e-06,
"loss": 5.4779,
"step": 73
},
{
"epoch": 0.06427795874049946,
"grad_norm": 16.21661033530993,
"learning_rate": 9.965831491255521e-06,
"loss": 5.7334,
"step": 74
},
{
"epoch": 0.06514657980456026,
"grad_norm": 13.791459531600026,
"learning_rate": 9.964262810496867e-06,
"loss": 5.5777,
"step": 75
},
{
"epoch": 0.06601520086862106,
"grad_norm": 20.390991279830846,
"learning_rate": 9.962659067632933e-06,
"loss": 5.5614,
"step": 76
},
{
"epoch": 0.06688382193268187,
"grad_norm": 16.095238532163904,
"learning_rate": 9.961020275259433e-06,
"loss": 5.6062,
"step": 77
},
{
"epoch": 0.06775244299674267,
"grad_norm": 9.877965182839311,
"learning_rate": 9.959346446247367e-06,
"loss": 5.6125,
"step": 78
},
{
"epoch": 0.06862106406080347,
"grad_norm": 17.028948309558636,
"learning_rate": 9.957637593742905e-06,
"loss": 5.5812,
"step": 79
},
{
"epoch": 0.06948968512486428,
"grad_norm": 8.979583407927759,
"learning_rate": 9.955893731167295e-06,
"loss": 5.5746,
"step": 80
},
{
"epoch": 0.07035830618892508,
"grad_norm": 12.03923638076693,
"learning_rate": 9.95411487221675e-06,
"loss": 5.6653,
"step": 81
},
{
"epoch": 0.07122692725298589,
"grad_norm": 12.390003162693889,
"learning_rate": 9.952301030862337e-06,
"loss": 5.4735,
"step": 82
},
{
"epoch": 0.07209554831704669,
"grad_norm": 9.315040920552768,
"learning_rate": 9.950452221349887e-06,
"loss": 5.5041,
"step": 83
},
{
"epoch": 0.07296416938110749,
"grad_norm": 10.904257522944858,
"learning_rate": 9.948568458199856e-06,
"loss": 5.4586,
"step": 84
},
{
"epoch": 0.0738327904451683,
"grad_norm": 12.975062616231535,
"learning_rate": 9.94664975620723e-06,
"loss": 5.3053,
"step": 85
},
{
"epoch": 0.0747014115092291,
"grad_norm": 20.113827062713174,
"learning_rate": 9.944696130441399e-06,
"loss": 5.5913,
"step": 86
},
{
"epoch": 0.0755700325732899,
"grad_norm": 12.517008621704704,
"learning_rate": 9.942707596246051e-06,
"loss": 5.337,
"step": 87
},
{
"epoch": 0.07643865363735071,
"grad_norm": 9.758313745988033,
"learning_rate": 9.940684169239035e-06,
"loss": 5.3173,
"step": 88
},
{
"epoch": 0.07730727470141151,
"grad_norm": 10.515920909580231,
"learning_rate": 9.938625865312252e-06,
"loss": 5.4537,
"step": 89
},
{
"epoch": 0.0781758957654723,
"grad_norm": 14.20391683761756,
"learning_rate": 9.936532700631518e-06,
"loss": 5.5486,
"step": 90
},
{
"epoch": 0.07904451682953312,
"grad_norm": 6.514270269870405,
"learning_rate": 9.93440469163645e-06,
"loss": 5.538,
"step": 91
},
{
"epoch": 0.07991313789359392,
"grad_norm": 24.656805213333442,
"learning_rate": 9.932241855040328e-06,
"loss": 5.3992,
"step": 92
},
{
"epoch": 0.08078175895765473,
"grad_norm": 15.24785447513823,
"learning_rate": 9.930044207829966e-06,
"loss": 5.4491,
"step": 93
},
{
"epoch": 0.08165038002171553,
"grad_norm": 17.034838291407535,
"learning_rate": 9.927811767265581e-06,
"loss": 5.5654,
"step": 94
},
{
"epoch": 0.08251900108577633,
"grad_norm": 16.66779749233834,
"learning_rate": 9.925544550880653e-06,
"loss": 5.3037,
"step": 95
},
{
"epoch": 0.08338762214983714,
"grad_norm": 13.53801819272121,
"learning_rate": 9.92324257648179e-06,
"loss": 5.1831,
"step": 96
},
{
"epoch": 0.08425624321389794,
"grad_norm": 13.925819342029348,
"learning_rate": 9.920905862148586e-06,
"loss": 5.3506,
"step": 97
},
{
"epoch": 0.08512486427795873,
"grad_norm": 11.595896262884168,
"learning_rate": 9.918534426233486e-06,
"loss": 5.3379,
"step": 98
},
{
"epoch": 0.08599348534201955,
"grad_norm": 10.793238449505045,
"learning_rate": 9.916128287361634e-06,
"loss": 5.2807,
"step": 99
},
{
"epoch": 0.08686210640608034,
"grad_norm": 9.100694836617444,
"learning_rate": 9.913687464430727e-06,
"loss": 5.3927,
"step": 100
},
{
"epoch": 0.08773072747014116,
"grad_norm": 10.570344665746836,
"learning_rate": 9.91121197661087e-06,
"loss": 5.3416,
"step": 101
},
{
"epoch": 0.08859934853420195,
"grad_norm": 8.054172083905852,
"learning_rate": 9.908701843344427e-06,
"loss": 5.4551,
"step": 102
},
{
"epoch": 0.08946796959826275,
"grad_norm": 9.349388343495786,
"learning_rate": 9.906157084345865e-06,
"loss": 5.3192,
"step": 103
},
{
"epoch": 0.09033659066232357,
"grad_norm": 7.475662385329425,
"learning_rate": 9.903577719601597e-06,
"loss": 5.2733,
"step": 104
},
{
"epoch": 0.09120521172638436,
"grad_norm": 7.943968157184266,
"learning_rate": 9.900963769369827e-06,
"loss": 5.3985,
"step": 105
},
{
"epoch": 0.09207383279044516,
"grad_norm": 7.531045700331419,
"learning_rate": 9.8983152541804e-06,
"loss": 5.3622,
"step": 106
},
{
"epoch": 0.09294245385450597,
"grad_norm": 15.575311318718667,
"learning_rate": 9.895632194834625e-06,
"loss": 5.1931,
"step": 107
},
{
"epoch": 0.09381107491856677,
"grad_norm": 11.029594288829802,
"learning_rate": 9.892914612405117e-06,
"loss": 5.2938,
"step": 108
},
{
"epoch": 0.09467969598262758,
"grad_norm": 11.455193260474546,
"learning_rate": 9.890162528235641e-06,
"loss": 5.1901,
"step": 109
},
{
"epoch": 0.09554831704668838,
"grad_norm": 11.086046323655495,
"learning_rate": 9.887375963940936e-06,
"loss": 5.4614,
"step": 110
},
{
"epoch": 0.09641693811074918,
"grad_norm": 7.999590835458638,
"learning_rate": 9.884554941406539e-06,
"loss": 5.2932,
"step": 111
},
{
"epoch": 0.09728555917481,
"grad_norm": 11.326641474544164,
"learning_rate": 9.881699482788627e-06,
"loss": 5.1322,
"step": 112
},
{
"epoch": 0.09815418023887079,
"grad_norm": 7.633538464939616,
"learning_rate": 9.878809610513836e-06,
"loss": 5.1649,
"step": 113
},
{
"epoch": 0.09902280130293159,
"grad_norm": 9.626046071760667,
"learning_rate": 9.87588534727908e-06,
"loss": 5.3233,
"step": 114
},
{
"epoch": 0.0998914223669924,
"grad_norm": 9.133312938258387,
"learning_rate": 9.872926716051387e-06,
"loss": 5.1132,
"step": 115
},
{
"epoch": 0.1007600434310532,
"grad_norm": 9.505514994492872,
"learning_rate": 9.869933740067703e-06,
"loss": 5.1579,
"step": 116
},
{
"epoch": 0.10162866449511401,
"grad_norm": 8.791259591754573,
"learning_rate": 9.866906442834713e-06,
"loss": 5.2137,
"step": 117
},
{
"epoch": 0.10249728555917481,
"grad_norm": 10.053006259051758,
"learning_rate": 9.863844848128668e-06,
"loss": 5.2787,
"step": 118
},
{
"epoch": 0.10336590662323561,
"grad_norm": 13.131490348069805,
"learning_rate": 9.860748979995183e-06,
"loss": 4.9206,
"step": 119
},
{
"epoch": 0.10423452768729642,
"grad_norm": 6.631095589009018,
"learning_rate": 9.857618862749062e-06,
"loss": 5.2466,
"step": 120
},
{
"epoch": 0.10510314875135722,
"grad_norm": 13.217875916209636,
"learning_rate": 9.85445452097409e-06,
"loss": 5.2414,
"step": 121
},
{
"epoch": 0.10597176981541802,
"grad_norm": 8.029250043406146,
"learning_rate": 9.851255979522856e-06,
"loss": 5.2792,
"step": 122
},
{
"epoch": 0.10684039087947883,
"grad_norm": 17.54497296410541,
"learning_rate": 9.848023263516552e-06,
"loss": 5.4227,
"step": 123
},
{
"epoch": 0.10770901194353963,
"grad_norm": 11.789384484790114,
"learning_rate": 9.844756398344773e-06,
"loss": 5.2564,
"step": 124
},
{
"epoch": 0.10857763300760044,
"grad_norm": 12.89494048195122,
"learning_rate": 9.841455409665322e-06,
"loss": 5.173,
"step": 125
},
{
"epoch": 0.10944625407166124,
"grad_norm": 10.08813144239029,
"learning_rate": 9.838120323404004e-06,
"loss": 5.2173,
"step": 126
},
{
"epoch": 0.11031487513572204,
"grad_norm": 8.840110668733463,
"learning_rate": 9.834751165754428e-06,
"loss": 5.237,
"step": 127
},
{
"epoch": 0.11118349619978285,
"grad_norm": 10.088550995951381,
"learning_rate": 9.831347963177794e-06,
"loss": 5.2767,
"step": 128
},
{
"epoch": 0.11205211726384365,
"grad_norm": 8.195942675919266,
"learning_rate": 9.827910742402693e-06,
"loss": 5.2096,
"step": 129
},
{
"epoch": 0.11292073832790445,
"grad_norm": 11.245032884796654,
"learning_rate": 9.824439530424888e-06,
"loss": 5.1405,
"step": 130
},
{
"epoch": 0.11378935939196526,
"grad_norm": 10.023313323111374,
"learning_rate": 9.820934354507113e-06,
"loss": 5.2706,
"step": 131
},
{
"epoch": 0.11465798045602606,
"grad_norm": 5.266854203910289,
"learning_rate": 9.817395242178854e-06,
"loss": 5.2976,
"step": 132
},
{
"epoch": 0.11552660152008687,
"grad_norm": 13.356121226198322,
"learning_rate": 9.813822221236125e-06,
"loss": 5.1784,
"step": 133
},
{
"epoch": 0.11639522258414767,
"grad_norm": 7.209004502332046,
"learning_rate": 9.810215319741262e-06,
"loss": 5.2083,
"step": 134
},
{
"epoch": 0.11726384364820847,
"grad_norm": 9.476007633968416,
"learning_rate": 9.806574566022693e-06,
"loss": 5.1428,
"step": 135
},
{
"epoch": 0.11813246471226928,
"grad_norm": 8.330598001164512,
"learning_rate": 9.802899988674728e-06,
"loss": 5.0473,
"step": 136
},
{
"epoch": 0.11900108577633008,
"grad_norm": 6.96008730119411,
"learning_rate": 9.799191616557317e-06,
"loss": 5.2599,
"step": 137
},
{
"epoch": 0.11986970684039087,
"grad_norm": 7.736962765205919,
"learning_rate": 9.795449478795836e-06,
"loss": 5.2652,
"step": 138
},
{
"epoch": 0.12073832790445169,
"grad_norm": 6.749695837587577,
"learning_rate": 9.791673604780856e-06,
"loss": 5.238,
"step": 139
},
{
"epoch": 0.12160694896851248,
"grad_norm": 10.329369152037573,
"learning_rate": 9.787864024167911e-06,
"loss": 5.1537,
"step": 140
},
{
"epoch": 0.12247557003257328,
"grad_norm": 6.905477249008306,
"learning_rate": 9.78402076687726e-06,
"loss": 5.2558,
"step": 141
},
{
"epoch": 0.1233441910966341,
"grad_norm": 11.627003258310488,
"learning_rate": 9.780143863093663e-06,
"loss": 5.2708,
"step": 142
},
{
"epoch": 0.1242128121606949,
"grad_norm": 7.001100481839455,
"learning_rate": 9.776233343266138e-06,
"loss": 5.1064,
"step": 143
},
{
"epoch": 0.1250814332247557,
"grad_norm": 11.285622964490454,
"learning_rate": 9.772289238107717e-06,
"loss": 4.9967,
"step": 144
},
{
"epoch": 0.1259500542888165,
"grad_norm": 7.568164194677143,
"learning_rate": 9.768311578595212e-06,
"loss": 5.014,
"step": 145
},
{
"epoch": 0.1268186753528773,
"grad_norm": 9.381543499879124,
"learning_rate": 9.764300395968969e-06,
"loss": 5.0742,
"step": 146
},
{
"epoch": 0.1276872964169381,
"grad_norm": 8.76903592042971,
"learning_rate": 9.760255721732626e-06,
"loss": 4.9568,
"step": 147
},
{
"epoch": 0.12855591748099893,
"grad_norm": 7.998187265875589,
"learning_rate": 9.756177587652857e-06,
"loss": 5.1517,
"step": 148
},
{
"epoch": 0.12942453854505973,
"grad_norm": 10.119399253599614,
"learning_rate": 9.752066025759132e-06,
"loss": 5.0338,
"step": 149
},
{
"epoch": 0.13029315960912052,
"grad_norm": 6.547018882815891,
"learning_rate": 9.74792106834346e-06,
"loss": 5.0788,
"step": 150
},
{
"epoch": 0.13116178067318132,
"grad_norm": 8.201656095763651,
"learning_rate": 9.743742747960138e-06,
"loss": 5.193,
"step": 151
},
{
"epoch": 0.13203040173724212,
"grad_norm": 6.751595098502055,
"learning_rate": 9.739531097425493e-06,
"loss": 5.0545,
"step": 152
},
{
"epoch": 0.13289902280130292,
"grad_norm": 11.073060727681726,
"learning_rate": 9.735286149817623e-06,
"loss": 4.9874,
"step": 153
},
{
"epoch": 0.13376764386536374,
"grad_norm": 6.413004734395765,
"learning_rate": 9.731007938476145e-06,
"loss": 5.1605,
"step": 154
},
{
"epoch": 0.13463626492942454,
"grad_norm": 9.902861982329972,
"learning_rate": 9.726696497001923e-06,
"loss": 5.0762,
"step": 155
},
{
"epoch": 0.13550488599348534,
"grad_norm": 9.135620944817346,
"learning_rate": 9.722351859256815e-06,
"loss": 5.1326,
"step": 156
},
{
"epoch": 0.13637350705754614,
"grad_norm": 5.094932628448521,
"learning_rate": 9.717974059363392e-06,
"loss": 5.0871,
"step": 157
},
{
"epoch": 0.13724212812160694,
"grad_norm": 13.8159898342583,
"learning_rate": 9.713563131704685e-06,
"loss": 5.0887,
"step": 158
},
{
"epoch": 0.13811074918566776,
"grad_norm": 7.776188941220426,
"learning_rate": 9.709119110923911e-06,
"loss": 5.0136,
"step": 159
},
{
"epoch": 0.13897937024972856,
"grad_norm": 14.140298714913962,
"learning_rate": 9.70464203192419e-06,
"loss": 5.1441,
"step": 160
},
{
"epoch": 0.13984799131378936,
"grad_norm": 7.397703190515995,
"learning_rate": 9.700131929868289e-06,
"loss": 5.0998,
"step": 161
},
{
"epoch": 0.14071661237785016,
"grad_norm": 13.939460570037966,
"learning_rate": 9.695588840178331e-06,
"loss": 5.0197,
"step": 162
},
{
"epoch": 0.14158523344191096,
"grad_norm": 10.270022957942269,
"learning_rate": 9.691012798535524e-06,
"loss": 4.8067,
"step": 163
},
{
"epoch": 0.14245385450597178,
"grad_norm": 10.393393548384978,
"learning_rate": 9.686403840879877e-06,
"loss": 5.1063,
"step": 164
},
{
"epoch": 0.14332247557003258,
"grad_norm": 9.993662702844784,
"learning_rate": 9.681762003409926e-06,
"loss": 5.1959,
"step": 165
},
{
"epoch": 0.14419109663409338,
"grad_norm": 7.693407344731637,
"learning_rate": 9.677087322582434e-06,
"loss": 5.0605,
"step": 166
},
{
"epoch": 0.14505971769815418,
"grad_norm": 9.965954257327015,
"learning_rate": 9.672379835112124e-06,
"loss": 5.1052,
"step": 167
},
{
"epoch": 0.14592833876221498,
"grad_norm": 6.623907098912022,
"learning_rate": 9.667639577971372e-06,
"loss": 5.2321,
"step": 168
},
{
"epoch": 0.14679695982627577,
"grad_norm": 12.30970638465487,
"learning_rate": 9.662866588389931e-06,
"loss": 4.9307,
"step": 169
},
{
"epoch": 0.1476655808903366,
"grad_norm": 9.854445105895712,
"learning_rate": 9.658060903854633e-06,
"loss": 5.2573,
"step": 170
},
{
"epoch": 0.1485342019543974,
"grad_norm": 9.733745744797229,
"learning_rate": 9.653222562109093e-06,
"loss": 5.1312,
"step": 171
},
{
"epoch": 0.1494028230184582,
"grad_norm": 7.148106476376709,
"learning_rate": 9.64835160115341e-06,
"loss": 4.9459,
"step": 172
},
{
"epoch": 0.150271444082519,
"grad_norm": 9.910271107591148,
"learning_rate": 9.643448059243881e-06,
"loss": 4.9896,
"step": 173
},
{
"epoch": 0.1511400651465798,
"grad_norm": 6.743151416523807,
"learning_rate": 9.638511974892689e-06,
"loss": 5.0992,
"step": 174
},
{
"epoch": 0.15200868621064062,
"grad_norm": 9.278378594694823,
"learning_rate": 9.633543386867599e-06,
"loss": 5.0205,
"step": 175
},
{
"epoch": 0.15287730727470142,
"grad_norm": 8.006244545055509,
"learning_rate": 9.628542334191665e-06,
"loss": 5.1466,
"step": 176
},
{
"epoch": 0.15374592833876222,
"grad_norm": 6.372417255435819,
"learning_rate": 9.623508856142914e-06,
"loss": 5.078,
"step": 177
},
{
"epoch": 0.15461454940282301,
"grad_norm": 8.38636496529513,
"learning_rate": 9.61844299225404e-06,
"loss": 5.1724,
"step": 178
},
{
"epoch": 0.1554831704668838,
"grad_norm": 7.43768649121031,
"learning_rate": 9.613344782312093e-06,
"loss": 4.8699,
"step": 179
},
{
"epoch": 0.1563517915309446,
"grad_norm": 12.808038040940353,
"learning_rate": 9.608214266358171e-06,
"loss": 5.0929,
"step": 180
},
{
"epoch": 0.15722041259500544,
"grad_norm": 10.021409419376392,
"learning_rate": 9.603051484687096e-06,
"loss": 4.9412,
"step": 181
},
{
"epoch": 0.15808903365906624,
"grad_norm": 8.383987107054914,
"learning_rate": 9.597856477847111e-06,
"loss": 5.0407,
"step": 182
},
{
"epoch": 0.15895765472312703,
"grad_norm": 11.839795913237127,
"learning_rate": 9.592629286639545e-06,
"loss": 4.9284,
"step": 183
},
{
"epoch": 0.15982627578718783,
"grad_norm": 7.10734457748747,
"learning_rate": 9.58736995211851e-06,
"loss": 5.058,
"step": 184
},
{
"epoch": 0.16069489685124863,
"grad_norm": 9.11261944633933,
"learning_rate": 9.582078515590565e-06,
"loss": 4.8771,
"step": 185
},
{
"epoch": 0.16156351791530946,
"grad_norm": 9.824025734718676,
"learning_rate": 9.5767550186144e-06,
"loss": 4.8962,
"step": 186
},
{
"epoch": 0.16243213897937026,
"grad_norm": 8.437193122982974,
"learning_rate": 9.571399503000499e-06,
"loss": 4.9084,
"step": 187
},
{
"epoch": 0.16330076004343105,
"grad_norm": 5.687318397304989,
"learning_rate": 9.566012010810826e-06,
"loss": 5.08,
"step": 188
},
{
"epoch": 0.16416938110749185,
"grad_norm": 14.433519675550484,
"learning_rate": 9.560592584358489e-06,
"loss": 4.8611,
"step": 189
},
{
"epoch": 0.16503800217155265,
"grad_norm": 6.794400971865543,
"learning_rate": 9.555141266207398e-06,
"loss": 4.936,
"step": 190
},
{
"epoch": 0.16590662323561348,
"grad_norm": 14.402134569349881,
"learning_rate": 9.549658099171945e-06,
"loss": 4.8904,
"step": 191
},
{
"epoch": 0.16677524429967427,
"grad_norm": 9.083194622310913,
"learning_rate": 9.544143126316661e-06,
"loss": 5.0294,
"step": 192
},
{
"epoch": 0.16764386536373507,
"grad_norm": 10.639070515809655,
"learning_rate": 9.538596390955876e-06,
"loss": 4.9204,
"step": 193
},
{
"epoch": 0.16851248642779587,
"grad_norm": 7.267347749331012,
"learning_rate": 9.533017936653385e-06,
"loss": 5.0803,
"step": 194
},
{
"epoch": 0.16938110749185667,
"grad_norm": 8.011322892386996,
"learning_rate": 9.527407807222096e-06,
"loss": 5.0817,
"step": 195
},
{
"epoch": 0.17024972855591747,
"grad_norm": 8.33354841480774,
"learning_rate": 9.521766046723699e-06,
"loss": 5.0201,
"step": 196
},
{
"epoch": 0.1711183496199783,
"grad_norm": 4.646575968820735,
"learning_rate": 9.516092699468305e-06,
"loss": 5.1351,
"step": 197
},
{
"epoch": 0.1719869706840391,
"grad_norm": 9.81795639706777,
"learning_rate": 9.510387810014115e-06,
"loss": 4.8496,
"step": 198
},
{
"epoch": 0.1728555917480999,
"grad_norm": 8.118086500928438,
"learning_rate": 9.504651423167056e-06,
"loss": 4.7814,
"step": 199
},
{
"epoch": 0.1737242128121607,
"grad_norm": 8.90163622203598,
"learning_rate": 9.49888358398043e-06,
"loss": 4.9746,
"step": 200
},
{
"epoch": 0.1745928338762215,
"grad_norm": 9.581229012117431,
"learning_rate": 9.493084337754573e-06,
"loss": 4.9412,
"step": 201
},
{
"epoch": 0.1754614549402823,
"grad_norm": 5.4797873048345185,
"learning_rate": 9.487253730036484e-06,
"loss": 5.0215,
"step": 202
},
{
"epoch": 0.1763300760043431,
"grad_norm": 13.799396637492386,
"learning_rate": 9.481391806619475e-06,
"loss": 5.1105,
"step": 203
},
{
"epoch": 0.1771986970684039,
"grad_norm": 10.196454355029717,
"learning_rate": 9.475498613542808e-06,
"loss": 5.0285,
"step": 204
},
{
"epoch": 0.1780673181324647,
"grad_norm": 13.961082886424126,
"learning_rate": 9.469574197091345e-06,
"loss": 4.986,
"step": 205
},
{
"epoch": 0.1789359391965255,
"grad_norm": 9.989829598419863,
"learning_rate": 9.46361860379516e-06,
"loss": 5.021,
"step": 206
},
{
"epoch": 0.17980456026058633,
"grad_norm": 12.997591407480371,
"learning_rate": 9.4576318804292e-06,
"loss": 4.9919,
"step": 207
},
{
"epoch": 0.18067318132464713,
"grad_norm": 9.372234356116795,
"learning_rate": 9.451614074012905e-06,
"loss": 5.1485,
"step": 208
},
{
"epoch": 0.18154180238870793,
"grad_norm": 10.142709831697546,
"learning_rate": 9.445565231809832e-06,
"loss": 5.0171,
"step": 209
},
{
"epoch": 0.18241042345276873,
"grad_norm": 9.226181067395016,
"learning_rate": 9.439485401327296e-06,
"loss": 5.0061,
"step": 210
},
{
"epoch": 0.18327904451682953,
"grad_norm": 8.279353289891953,
"learning_rate": 9.433374630315997e-06,
"loss": 4.9115,
"step": 211
},
{
"epoch": 0.18414766558089032,
"grad_norm": 11.929279731379703,
"learning_rate": 9.427232966769634e-06,
"loss": 4.6847,
"step": 212
},
{
"epoch": 0.18501628664495115,
"grad_norm": 5.482572771970624,
"learning_rate": 9.421060458924539e-06,
"loss": 4.9443,
"step": 213
},
{
"epoch": 0.18588490770901195,
"grad_norm": 7.427326598312704,
"learning_rate": 9.414857155259289e-06,
"loss": 4.9023,
"step": 214
},
{
"epoch": 0.18675352877307275,
"grad_norm": 8.360757486256176,
"learning_rate": 9.408623104494336e-06,
"loss": 4.984,
"step": 215
},
{
"epoch": 0.18762214983713354,
"grad_norm": 7.208982202085643,
"learning_rate": 9.402358355591609e-06,
"loss": 4.963,
"step": 216
},
{
"epoch": 0.18849077090119434,
"grad_norm": 5.301087365446961,
"learning_rate": 9.39606295775415e-06,
"loss": 4.998,
"step": 217
},
{
"epoch": 0.18935939196525517,
"grad_norm": 7.799837692221157,
"learning_rate": 9.389736960425708e-06,
"loss": 4.9741,
"step": 218
},
{
"epoch": 0.19022801302931597,
"grad_norm": 7.151033616802984,
"learning_rate": 9.383380413290361e-06,
"loss": 4.8871,
"step": 219
},
{
"epoch": 0.19109663409337677,
"grad_norm": 5.787982552484058,
"learning_rate": 9.376993366272128e-06,
"loss": 4.9864,
"step": 220
},
{
"epoch": 0.19196525515743756,
"grad_norm": 7.905445175850379,
"learning_rate": 9.370575869534565e-06,
"loss": 5.0129,
"step": 221
},
{
"epoch": 0.19283387622149836,
"grad_norm": 4.339162197971099,
"learning_rate": 9.364127973480385e-06,
"loss": 5.0006,
"step": 222
},
{
"epoch": 0.1937024972855592,
"grad_norm": 9.706062286895799,
"learning_rate": 9.35764972875105e-06,
"loss": 4.8678,
"step": 223
},
{
"epoch": 0.19457111834962,
"grad_norm": 6.364386062061041,
"learning_rate": 9.351141186226387e-06,
"loss": 4.8405,
"step": 224
},
{
"epoch": 0.19543973941368079,
"grad_norm": 8.97734562182805,
"learning_rate": 9.344602397024172e-06,
"loss": 4.9155,
"step": 225
},
{
"epoch": 0.19630836047774158,
"grad_norm": 7.362245536098451,
"learning_rate": 9.338033412499743e-06,
"loss": 4.7595,
"step": 226
},
{
"epoch": 0.19717698154180238,
"grad_norm": 7.54540953421405,
"learning_rate": 9.331434284245585e-06,
"loss": 4.8956,
"step": 227
},
{
"epoch": 0.19804560260586318,
"grad_norm": 7.423100629581046,
"learning_rate": 9.324805064090939e-06,
"loss": 4.9622,
"step": 228
},
{
"epoch": 0.198914223669924,
"grad_norm": 6.020905403406819,
"learning_rate": 9.318145804101377e-06,
"loss": 4.959,
"step": 229
},
{
"epoch": 0.1997828447339848,
"grad_norm": 4.670252852600942,
"learning_rate": 9.31145655657841e-06,
"loss": 5.0506,
"step": 230
},
{
"epoch": 0.2006514657980456,
"grad_norm": 7.795934999989379,
"learning_rate": 9.30473737405906e-06,
"loss": 4.9765,
"step": 231
},
{
"epoch": 0.2015200868621064,
"grad_norm": 6.01240795364037,
"learning_rate": 9.29798830931547e-06,
"loss": 4.9211,
"step": 232
},
{
"epoch": 0.2023887079261672,
"grad_norm": 5.698601126510405,
"learning_rate": 9.291209415354466e-06,
"loss": 4.7963,
"step": 233
},
{
"epoch": 0.20325732899022803,
"grad_norm": 12.382329963742128,
"learning_rate": 9.284400745417154e-06,
"loss": 4.8329,
"step": 234
},
{
"epoch": 0.20412595005428882,
"grad_norm": 6.0887020552234645,
"learning_rate": 9.277562352978504e-06,
"loss": 4.9987,
"step": 235
},
{
"epoch": 0.20499457111834962,
"grad_norm": 9.258532830886514,
"learning_rate": 9.270694291746918e-06,
"loss": 4.7606,
"step": 236
},
{
"epoch": 0.20586319218241042,
"grad_norm": 5.467188874710136,
"learning_rate": 9.26379661566382e-06,
"loss": 4.8058,
"step": 237
},
{
"epoch": 0.20673181324647122,
"grad_norm": 8.326138039007624,
"learning_rate": 9.256869378903226e-06,
"loss": 4.8709,
"step": 238
},
{
"epoch": 0.20760043431053202,
"grad_norm": 6.953137100468736,
"learning_rate": 9.249912635871317e-06,
"loss": 4.877,
"step": 239
},
{
"epoch": 0.20846905537459284,
"grad_norm": 7.746260501101959,
"learning_rate": 9.242926441206024e-06,
"loss": 4.866,
"step": 240
},
{
"epoch": 0.20933767643865364,
"grad_norm": 5.662860561699009,
"learning_rate": 9.235910849776578e-06,
"loss": 4.9476,
"step": 241
},
{
"epoch": 0.21020629750271444,
"grad_norm": 9.471074354024374,
"learning_rate": 9.2288659166831e-06,
"loss": 4.8759,
"step": 242
},
{
"epoch": 0.21107491856677524,
"grad_norm": 5.294803045493394,
"learning_rate": 9.221791697256152e-06,
"loss": 4.8172,
"step": 243
},
{
"epoch": 0.21194353963083604,
"grad_norm": 9.913380058373116,
"learning_rate": 9.214688247056316e-06,
"loss": 4.9476,
"step": 244
},
{
"epoch": 0.21281216069489686,
"grad_norm": 6.941540338932768,
"learning_rate": 9.207555621873748e-06,
"loss": 4.8194,
"step": 245
},
{
"epoch": 0.21368078175895766,
"grad_norm": 9.460762282346524,
"learning_rate": 9.20039387772774e-06,
"loss": 4.9644,
"step": 246
},
{
"epoch": 0.21454940282301846,
"grad_norm": 7.830249212526504,
"learning_rate": 9.19320307086629e-06,
"loss": 4.9704,
"step": 247
},
{
"epoch": 0.21541802388707926,
"grad_norm": 6.925252918501548,
"learning_rate": 9.185983257765648e-06,
"loss": 4.9168,
"step": 248
},
{
"epoch": 0.21628664495114006,
"grad_norm": 7.233949141897864,
"learning_rate": 9.178734495129876e-06,
"loss": 4.8646,
"step": 249
},
{
"epoch": 0.21715526601520088,
"grad_norm": 5.897422894008084,
"learning_rate": 9.171456839890408e-06,
"loss": 5.0017,
"step": 250
},
{
"epoch": 0.21802388707926168,
"grad_norm": 8.435251394805453,
"learning_rate": 9.1641503492056e-06,
"loss": 4.8114,
"step": 251
},
{
"epoch": 0.21889250814332248,
"grad_norm": 5.760910243649614,
"learning_rate": 9.156815080460277e-06,
"loss": 4.8976,
"step": 252
},
{
"epoch": 0.21976112920738328,
"grad_norm": 7.520437903960479,
"learning_rate": 9.149451091265286e-06,
"loss": 4.7124,
"step": 253
},
{
"epoch": 0.22062975027144408,
"grad_norm": 7.013652737318022,
"learning_rate": 9.142058439457044e-06,
"loss": 4.9533,
"step": 254
},
{
"epoch": 0.22149837133550487,
"grad_norm": 7.159149521810479,
"learning_rate": 9.134637183097083e-06,
"loss": 4.8566,
"step": 255
},
{
"epoch": 0.2223669923995657,
"grad_norm": 6.499343085246989,
"learning_rate": 9.127187380471595e-06,
"loss": 4.6676,
"step": 256
},
{
"epoch": 0.2232356134636265,
"grad_norm": 5.21337701054524,
"learning_rate": 9.11970909009097e-06,
"loss": 4.922,
"step": 257
},
{
"epoch": 0.2241042345276873,
"grad_norm": 6.349567353113073,
"learning_rate": 9.112202370689337e-06,
"loss": 4.8532,
"step": 258
},
{
"epoch": 0.2249728555917481,
"grad_norm": 6.818694420622126,
"learning_rate": 9.104667281224114e-06,
"loss": 4.9436,
"step": 259
},
{
"epoch": 0.2258414766558089,
"grad_norm": 7.214238229211862,
"learning_rate": 9.09710388087553e-06,
"loss": 5.0028,
"step": 260
},
{
"epoch": 0.22671009771986972,
"grad_norm": 5.209070263470396,
"learning_rate": 9.089512229046167e-06,
"loss": 4.5818,
"step": 261
},
{
"epoch": 0.22757871878393052,
"grad_norm": 10.360050755845188,
"learning_rate": 9.08189238536049e-06,
"loss": 4.8978,
"step": 262
},
{
"epoch": 0.22844733984799132,
"grad_norm": 6.643043793388568,
"learning_rate": 9.07424440966439e-06,
"loss": 4.7172,
"step": 263
},
{
"epoch": 0.2293159609120521,
"grad_norm": 7.816417260568394,
"learning_rate": 9.066568362024697e-06,
"loss": 4.7677,
"step": 264
},
{
"epoch": 0.2301845819761129,
"grad_norm": 7.24101938038275,
"learning_rate": 9.058864302728722e-06,
"loss": 4.9232,
"step": 265
},
{
"epoch": 0.23105320304017374,
"grad_norm": 5.882755828325563,
"learning_rate": 9.051132292283772e-06,
"loss": 4.8006,
"step": 266
},
{
"epoch": 0.23192182410423454,
"grad_norm": 7.033058546114696,
"learning_rate": 9.043372391416687e-06,
"loss": 4.8018,
"step": 267
},
{
"epoch": 0.23279044516829533,
"grad_norm": 6.638958562180367,
"learning_rate": 9.035584661073357e-06,
"loss": 4.8614,
"step": 268
},
{
"epoch": 0.23365906623235613,
"grad_norm": 5.164804077792831,
"learning_rate": 9.02776916241824e-06,
"loss": 4.6965,
"step": 269
},
{
"epoch": 0.23452768729641693,
"grad_norm": 7.8053882165369055,
"learning_rate": 9.019925956833884e-06,
"loss": 4.8867,
"step": 270
},
{
"epoch": 0.23539630836047773,
"grad_norm": 5.302096025761552,
"learning_rate": 9.012055105920452e-06,
"loss": 4.895,
"step": 271
},
{
"epoch": 0.23626492942453856,
"grad_norm": 5.851233842251707,
"learning_rate": 9.004156671495224e-06,
"loss": 4.9754,
"step": 272
},
{
"epoch": 0.23713355048859935,
"grad_norm": 6.290732292956187,
"learning_rate": 8.996230715592129e-06,
"loss": 4.8171,
"step": 273
},
{
"epoch": 0.23800217155266015,
"grad_norm": 6.760963585753358,
"learning_rate": 8.988277300461238e-06,
"loss": 4.6128,
"step": 274
},
{
"epoch": 0.23887079261672095,
"grad_norm": 5.437936675855679,
"learning_rate": 8.980296488568296e-06,
"loss": 4.8612,
"step": 275
},
{
"epoch": 0.23973941368078175,
"grad_norm": 8.546817288458861,
"learning_rate": 8.972288342594211e-06,
"loss": 4.7358,
"step": 276
},
{
"epoch": 0.24060803474484257,
"grad_norm": 5.168018360375808,
"learning_rate": 8.96425292543458e-06,
"loss": 4.8366,
"step": 277
},
{
"epoch": 0.24147665580890337,
"grad_norm": 8.339946348434717,
"learning_rate": 8.95619030019918e-06,
"loss": 4.9408,
"step": 278
},
{
"epoch": 0.24234527687296417,
"grad_norm": 4.812823437728942,
"learning_rate": 8.94810053021148e-06,
"loss": 4.8325,
"step": 279
},
{
"epoch": 0.24321389793702497,
"grad_norm": 9.00150446795358,
"learning_rate": 8.939983679008147e-06,
"loss": 4.766,
"step": 280
},
{
"epoch": 0.24408251900108577,
"grad_norm": 6.85159808721181,
"learning_rate": 8.931839810338541e-06,
"loss": 4.9044,
"step": 281
},
{
"epoch": 0.24495114006514657,
"grad_norm": 5.419252004137471,
"learning_rate": 8.923668988164213e-06,
"loss": 4.7154,
"step": 282
},
{
"epoch": 0.2458197611292074,
"grad_norm": 8.042101516011865,
"learning_rate": 8.915471276658405e-06,
"loss": 4.8391,
"step": 283
},
{
"epoch": 0.2466883821932682,
"grad_norm": 6.113748424857356,
"learning_rate": 8.907246740205553e-06,
"loss": 4.7499,
"step": 284
},
{
"epoch": 0.247557003257329,
"grad_norm": 7.989470062325969,
"learning_rate": 8.898995443400767e-06,
"loss": 4.8469,
"step": 285
},
{
"epoch": 0.2484256243213898,
"grad_norm": 4.646955530626333,
"learning_rate": 8.890717451049335e-06,
"loss": 4.7358,
"step": 286
},
{
"epoch": 0.24929424538545059,
"grad_norm": 7.744655477388046,
"learning_rate": 8.882412828166213e-06,
"loss": 4.7807,
"step": 287
},
{
"epoch": 0.2501628664495114,
"grad_norm": 5.999315404456719,
"learning_rate": 8.874081639975508e-06,
"loss": 4.7731,
"step": 288
},
{
"epoch": 0.2510314875135722,
"grad_norm": 7.366537198808325,
"learning_rate": 8.865723951909972e-06,
"loss": 4.9502,
"step": 289
},
{
"epoch": 0.251900108577633,
"grad_norm": 5.915226962483226,
"learning_rate": 8.857339829610483e-06,
"loss": 4.7959,
"step": 290
},
{
"epoch": 0.25276872964169383,
"grad_norm": 9.035720639467963,
"learning_rate": 8.848929338925536e-06,
"loss": 4.7853,
"step": 291
},
{
"epoch": 0.2536373507057546,
"grad_norm": 5.777665004211188,
"learning_rate": 8.84049254591072e-06,
"loss": 4.8299,
"step": 292
},
{
"epoch": 0.25450597176981543,
"grad_norm": 9.211239169808724,
"learning_rate": 8.8320295168282e-06,
"loss": 4.6564,
"step": 293
},
{
"epoch": 0.2553745928338762,
"grad_norm": 6.121012460613405,
"learning_rate": 8.8235403181462e-06,
"loss": 4.6625,
"step": 294
},
{
"epoch": 0.256243213897937,
"grad_norm": 7.943211261154958,
"learning_rate": 8.815025016538477e-06,
"loss": 4.9183,
"step": 295
},
{
"epoch": 0.25711183496199785,
"grad_norm": 7.520085515355943,
"learning_rate": 8.806483678883803e-06,
"loss": 4.7658,
"step": 296
},
{
"epoch": 0.2579804560260586,
"grad_norm": 6.255511074855573,
"learning_rate": 8.79791637226543e-06,
"loss": 4.8817,
"step": 297
},
{
"epoch": 0.25884907709011945,
"grad_norm": 8.574508707280637,
"learning_rate": 8.789323163970573e-06,
"loss": 4.8496,
"step": 298
},
{
"epoch": 0.2597176981541802,
"grad_norm": 5.222979665146013,
"learning_rate": 8.780704121489876e-06,
"loss": 4.7925,
"step": 299
},
{
"epoch": 0.26058631921824105,
"grad_norm": 9.382159121398075,
"learning_rate": 8.772059312516883e-06,
"loss": 4.88,
"step": 300
},
{
"epoch": 0.2614549402823019,
"grad_norm": 6.8357894514157245,
"learning_rate": 8.76338880494751e-06,
"loss": 4.6685,
"step": 301
},
{
"epoch": 0.26232356134636264,
"grad_norm": 6.517300496516373,
"learning_rate": 8.754692666879504e-06,
"loss": 4.8814,
"step": 302
},
{
"epoch": 0.26319218241042347,
"grad_norm": 9.702116008549124,
"learning_rate": 8.745970966611917e-06,
"loss": 4.6584,
"step": 303
},
{
"epoch": 0.26406080347448424,
"grad_norm": 6.260530698612518,
"learning_rate": 8.737223772644562e-06,
"loss": 4.6473,
"step": 304
},
{
"epoch": 0.26492942453854507,
"grad_norm": 9.829705946999965,
"learning_rate": 8.72845115367748e-06,
"loss": 4.771,
"step": 305
},
{
"epoch": 0.26579804560260584,
"grad_norm": 6.69025005399437,
"learning_rate": 8.719653178610396e-06,
"loss": 4.778,
"step": 306
},
{
"epoch": 0.26666666666666666,
"grad_norm": 6.70318175356247,
"learning_rate": 8.710829916542184e-06,
"loss": 4.8116,
"step": 307
},
{
"epoch": 0.2675352877307275,
"grad_norm": 7.751822349780793,
"learning_rate": 8.701981436770322e-06,
"loss": 4.6695,
"step": 308
},
{
"epoch": 0.26840390879478826,
"grad_norm": 7.108916006280296,
"learning_rate": 8.69310780879034e-06,
"loss": 4.7317,
"step": 309
},
{
"epoch": 0.2692725298588491,
"grad_norm": 4.328672737830481,
"learning_rate": 8.684209102295292e-06,
"loss": 4.6794,
"step": 310
},
{
"epoch": 0.27014115092290986,
"grad_norm": 8.322594402892245,
"learning_rate": 8.675285387175183e-06,
"loss": 4.8747,
"step": 311
},
{
"epoch": 0.2710097719869707,
"grad_norm": 4.508165025699175,
"learning_rate": 8.666336733516447e-06,
"loss": 4.9213,
"step": 312
},
{
"epoch": 0.2718783930510315,
"grad_norm": 6.5835338898059055,
"learning_rate": 8.657363211601375e-06,
"loss": 4.7728,
"step": 313
},
{
"epoch": 0.2727470141150923,
"grad_norm": 8.537697102858328,
"learning_rate": 8.64836489190758e-06,
"loss": 4.7095,
"step": 314
},
{
"epoch": 0.2736156351791531,
"grad_norm": 8.365612338039389,
"learning_rate": 8.639341845107432e-06,
"loss": 4.8467,
"step": 315
},
{
"epoch": 0.2744842562432139,
"grad_norm": 6.665043705485368,
"learning_rate": 8.630294142067505e-06,
"loss": 4.8167,
"step": 316
},
{
"epoch": 0.2753528773072747,
"grad_norm": 7.82967483334624,
"learning_rate": 8.621221853848022e-06,
"loss": 4.7616,
"step": 317
},
{
"epoch": 0.2762214983713355,
"grad_norm": 4.138871108394252,
"learning_rate": 8.6121250517023e-06,
"loss": 4.7277,
"step": 318
},
{
"epoch": 0.2770901194353963,
"grad_norm": 5.604431351464519,
"learning_rate": 8.603003807076184e-06,
"loss": 4.6664,
"step": 319
},
{
"epoch": 0.2779587404994571,
"grad_norm": 8.601258868804189,
"learning_rate": 8.593858191607492e-06,
"loss": 4.8013,
"step": 320
},
{
"epoch": 0.2788273615635179,
"grad_norm": 4.906802688431626,
"learning_rate": 8.584688277125446e-06,
"loss": 4.6898,
"step": 321
},
{
"epoch": 0.2796959826275787,
"grad_norm": 10.116257183106686,
"learning_rate": 8.575494135650115e-06,
"loss": 4.8905,
"step": 322
},
{
"epoch": 0.28056460369163955,
"grad_norm": 5.885384217150105,
"learning_rate": 8.566275839391842e-06,
"loss": 4.8431,
"step": 323
},
{
"epoch": 0.2814332247557003,
"grad_norm": 7.268094086044695,
"learning_rate": 8.557033460750685e-06,
"loss": 4.8863,
"step": 324
},
{
"epoch": 0.28230184581976114,
"grad_norm": 7.115813734582664,
"learning_rate": 8.547767072315835e-06,
"loss": 4.8921,
"step": 325
},
{
"epoch": 0.2831704668838219,
"grad_norm": 5.851149266761853,
"learning_rate": 8.538476746865066e-06,
"loss": 4.6311,
"step": 326
},
{
"epoch": 0.28403908794788274,
"grad_norm": 11.032345343462614,
"learning_rate": 8.529162557364148e-06,
"loss": 4.751,
"step": 327
},
{
"epoch": 0.28490770901194357,
"grad_norm": 5.0859521313383045,
"learning_rate": 8.519824576966274e-06,
"loss": 4.7293,
"step": 328
},
{
"epoch": 0.28577633007600434,
"grad_norm": 13.803163681395665,
"learning_rate": 8.510462879011492e-06,
"loss": 4.8309,
"step": 329
},
{
"epoch": 0.28664495114006516,
"grad_norm": 10.999396408035773,
"learning_rate": 8.50107753702613e-06,
"loss": 4.7266,
"step": 330
},
{
"epoch": 0.28751357220412593,
"grad_norm": 9.25286217238097,
"learning_rate": 8.49166862472221e-06,
"loss": 4.7683,
"step": 331
},
{
"epoch": 0.28838219326818676,
"grad_norm": 6.4323672461710135,
"learning_rate": 8.482236215996881e-06,
"loss": 4.7786,
"step": 332
},
{
"epoch": 0.28925081433224753,
"grad_norm": 11.742176348664781,
"learning_rate": 8.47278038493182e-06,
"loss": 4.7423,
"step": 333
},
{
"epoch": 0.29011943539630836,
"grad_norm": 7.418561647305186,
"learning_rate": 8.463301205792675e-06,
"loss": 4.7125,
"step": 334
},
{
"epoch": 0.2909880564603692,
"grad_norm": 13.421309741389427,
"learning_rate": 8.45379875302846e-06,
"loss": 4.7378,
"step": 335
},
{
"epoch": 0.29185667752442995,
"grad_norm": 12.62311847789775,
"learning_rate": 8.444273101270982e-06,
"loss": 4.6425,
"step": 336
},
{
"epoch": 0.2927252985884908,
"grad_norm": 7.036866908269808,
"learning_rate": 8.434724325334252e-06,
"loss": 4.7556,
"step": 337
},
{
"epoch": 0.29359391965255155,
"grad_norm": 8.910330385460947,
"learning_rate": 8.425152500213898e-06,
"loss": 4.6621,
"step": 338
},
{
"epoch": 0.2944625407166124,
"grad_norm": 6.680041055622366,
"learning_rate": 8.415557701086572e-06,
"loss": 4.58,
"step": 339
},
{
"epoch": 0.2953311617806732,
"grad_norm": 5.917224901051049,
"learning_rate": 8.405940003309366e-06,
"loss": 4.764,
"step": 340
},
{
"epoch": 0.29619978284473397,
"grad_norm": 6.025660992004857,
"learning_rate": 8.396299482419213e-06,
"loss": 4.9539,
"step": 341
},
{
"epoch": 0.2970684039087948,
"grad_norm": 6.8290925321533855,
"learning_rate": 8.386636214132303e-06,
"loss": 4.7968,
"step": 342
},
{
"epoch": 0.29793702497285557,
"grad_norm": 6.766388648711616,
"learning_rate": 8.376950274343476e-06,
"loss": 4.7316,
"step": 343
},
{
"epoch": 0.2988056460369164,
"grad_norm": 5.199438881706867,
"learning_rate": 8.367241739125645e-06,
"loss": 4.6717,
"step": 344
},
{
"epoch": 0.2996742671009772,
"grad_norm": 5.537976824790018,
"learning_rate": 8.35751068472917e-06,
"loss": 4.5794,
"step": 345
},
{
"epoch": 0.300542888165038,
"grad_norm": 7.434218743498804,
"learning_rate": 8.347757187581288e-06,
"loss": 4.5797,
"step": 346
},
{
"epoch": 0.3014115092290988,
"grad_norm": 3.7410734562690102,
"learning_rate": 8.337981324285495e-06,
"loss": 4.6053,
"step": 347
},
{
"epoch": 0.3022801302931596,
"grad_norm": 7.494539433625744,
"learning_rate": 8.328183171620953e-06,
"loss": 4.6358,
"step": 348
},
{
"epoch": 0.3031487513572204,
"grad_norm": 8.577268240575261,
"learning_rate": 8.318362806541878e-06,
"loss": 4.6568,
"step": 349
},
{
"epoch": 0.30401737242128124,
"grad_norm": 5.870073757957237,
"learning_rate": 8.308520306176948e-06,
"loss": 4.8527,
"step": 350
},
{
"epoch": 0.304885993485342,
"grad_norm": 10.403543599580171,
"learning_rate": 8.298655747828685e-06,
"loss": 4.6002,
"step": 351
},
{
"epoch": 0.30575461454940284,
"grad_norm": 5.381680003218319,
"learning_rate": 8.288769208972858e-06,
"loss": 4.7809,
"step": 352
},
{
"epoch": 0.3066232356134636,
"grad_norm": 9.877174040443599,
"learning_rate": 8.278860767257865e-06,
"loss": 4.7145,
"step": 353
},
{
"epoch": 0.30749185667752443,
"grad_norm": 6.336637930519997,
"learning_rate": 8.26893050050413e-06,
"loss": 4.8645,
"step": 354
},
{
"epoch": 0.30836047774158526,
"grad_norm": 9.02647409091144,
"learning_rate": 8.258978486703493e-06,
"loss": 4.6869,
"step": 355
},
{
"epoch": 0.30922909880564603,
"grad_norm": 7.279729496607072,
"learning_rate": 8.24900480401859e-06,
"loss": 4.883,
"step": 356
},
{
"epoch": 0.31009771986970686,
"grad_norm": 6.615700438254423,
"learning_rate": 8.239009530782244e-06,
"loss": 4.774,
"step": 357
},
{
"epoch": 0.3109663409337676,
"grad_norm": 8.777698668573509,
"learning_rate": 8.228992745496851e-06,
"loss": 4.9262,
"step": 358
},
{
"epoch": 0.31183496199782845,
"grad_norm": 5.9973840840887505,
"learning_rate": 8.21895452683376e-06,
"loss": 4.7127,
"step": 359
},
{
"epoch": 0.3127035830618892,
"grad_norm": 10.806900681675172,
"learning_rate": 8.20889495363266e-06,
"loss": 4.7254,
"step": 360
},
{
"epoch": 0.31357220412595005,
"grad_norm": 8.826153430021765,
"learning_rate": 8.198814104900951e-06,
"loss": 4.5981,
"step": 361
},
{
"epoch": 0.3144408251900109,
"grad_norm": 9.306758043660382,
"learning_rate": 8.188712059813135e-06,
"loss": 4.6332,
"step": 362
},
{
"epoch": 0.31530944625407165,
"grad_norm": 5.858792200617635,
"learning_rate": 8.178588897710189e-06,
"loss": 4.58,
"step": 363
},
{
"epoch": 0.31617806731813247,
"grad_norm": 6.68100229149933,
"learning_rate": 8.16844469809894e-06,
"loss": 4.7159,
"step": 364
},
{
"epoch": 0.31704668838219324,
"grad_norm": 5.735283629069225,
"learning_rate": 8.158279540651446e-06,
"loss": 4.8563,
"step": 365
},
{
"epoch": 0.31791530944625407,
"grad_norm": 6.584386859485198,
"learning_rate": 8.14809350520436e-06,
"loss": 4.751,
"step": 366
},
{
"epoch": 0.3187839305103149,
"grad_norm": 6.582333473666524,
"learning_rate": 8.137886671758317e-06,
"loss": 4.7886,
"step": 367
},
{
"epoch": 0.31965255157437567,
"grad_norm": 6.4883303364815275,
"learning_rate": 8.127659120477296e-06,
"loss": 4.6657,
"step": 368
},
{
"epoch": 0.3205211726384365,
"grad_norm": 5.918831827629274,
"learning_rate": 8.117410931687992e-06,
"loss": 4.8691,
"step": 369
},
{
"epoch": 0.32138979370249726,
"grad_norm": 5.283964365819055,
"learning_rate": 8.107142185879185e-06,
"loss": 4.8074,
"step": 370
},
{
"epoch": 0.3222584147665581,
"grad_norm": 3.8890066265932184,
"learning_rate": 8.096852963701113e-06,
"loss": 4.5405,
"step": 371
},
{
"epoch": 0.3231270358306189,
"grad_norm": 6.990269124593145,
"learning_rate": 8.086543345964833e-06,
"loss": 4.6027,
"step": 372
},
{
"epoch": 0.3239956568946797,
"grad_norm": 8.43721076775689,
"learning_rate": 8.07621341364158e-06,
"loss": 4.647,
"step": 373
},
{
"epoch": 0.3248642779587405,
"grad_norm": 5.58990373211778,
"learning_rate": 8.065863247862153e-06,
"loss": 4.6015,
"step": 374
},
{
"epoch": 0.3257328990228013,
"grad_norm": 6.97196709970691,
"learning_rate": 8.05549292991625e-06,
"loss": 4.6307,
"step": 375
},
{
"epoch": 0.3266015200868621,
"grad_norm": 6.1054642976234215,
"learning_rate": 8.045102541251855e-06,
"loss": 4.5934,
"step": 376
},
{
"epoch": 0.32747014115092293,
"grad_norm": 5.762512151008954,
"learning_rate": 8.034692163474576e-06,
"loss": 4.5073,
"step": 377
},
{
"epoch": 0.3283387622149837,
"grad_norm": 7.3544384303060575,
"learning_rate": 8.02426187834702e-06,
"loss": 4.7555,
"step": 378
},
{
"epoch": 0.32920738327904453,
"grad_norm": 4.541115018669931,
"learning_rate": 8.013811767788144e-06,
"loss": 4.5251,
"step": 379
},
{
"epoch": 0.3300760043431053,
"grad_norm": 7.061819858205978,
"learning_rate": 8.003341913872616e-06,
"loss": 4.6566,
"step": 380
},
{
"epoch": 0.3309446254071661,
"grad_norm": 5.949506061458709,
"learning_rate": 7.992852398830164e-06,
"loss": 4.6047,
"step": 381
},
{
"epoch": 0.33181324647122695,
"grad_norm": 8.186210627662106,
"learning_rate": 7.982343305044932e-06,
"loss": 4.5416,
"step": 382
},
{
"epoch": 0.3326818675352877,
"grad_norm": 5.5358466371451645,
"learning_rate": 7.971814715054837e-06,
"loss": 4.6173,
"step": 383
},
{
"epoch": 0.33355048859934855,
"grad_norm": 7.682761597446245,
"learning_rate": 7.961266711550922e-06,
"loss": 4.5791,
"step": 384
},
{
"epoch": 0.3344191096634093,
"grad_norm": 5.430465140999319,
"learning_rate": 7.950699377376696e-06,
"loss": 4.4514,
"step": 385
},
{
"epoch": 0.33528773072747015,
"grad_norm": 7.022253684816494,
"learning_rate": 7.940112795527493e-06,
"loss": 4.7673,
"step": 386
},
{
"epoch": 0.33615635179153097,
"grad_norm": 6.175962941485549,
"learning_rate": 7.929507049149817e-06,
"loss": 4.5196,
"step": 387
},
{
"epoch": 0.33702497285559174,
"grad_norm": 7.343612570596667,
"learning_rate": 7.918882221540692e-06,
"loss": 4.6791,
"step": 388
},
{
"epoch": 0.33789359391965257,
"grad_norm": 5.998391268508065,
"learning_rate": 7.908238396147002e-06,
"loss": 4.8548,
"step": 389
},
{
"epoch": 0.33876221498371334,
"grad_norm": 7.019440378415661,
"learning_rate": 7.897575656564836e-06,
"loss": 4.6303,
"step": 390
},
{
"epoch": 0.33963083604777417,
"grad_norm": 6.839076558244222,
"learning_rate": 7.886894086538841e-06,
"loss": 4.6871,
"step": 391
},
{
"epoch": 0.34049945711183494,
"grad_norm": 5.40567329714043,
"learning_rate": 7.876193769961555e-06,
"loss": 4.6996,
"step": 392
},
{
"epoch": 0.34136807817589576,
"grad_norm": 7.504268933018013,
"learning_rate": 7.865474790872749e-06,
"loss": 4.6356,
"step": 393
},
{
"epoch": 0.3422366992399566,
"grad_norm": 4.823115754529949,
"learning_rate": 7.854737233458764e-06,
"loss": 4.5891,
"step": 394
},
{
"epoch": 0.34310532030401736,
"grad_norm": 7.282830148401436,
"learning_rate": 7.843981182051866e-06,
"loss": 4.5814,
"step": 395
},
{
"epoch": 0.3439739413680782,
"grad_norm": 7.366691458186102,
"learning_rate": 7.83320672112956e-06,
"loss": 4.6604,
"step": 396
},
{
"epoch": 0.34484256243213895,
"grad_norm": 4.92486295778556,
"learning_rate": 7.822413935313947e-06,
"loss": 4.6913,
"step": 397
},
{
"epoch": 0.3457111834961998,
"grad_norm": 8.210710382886061,
"learning_rate": 7.811602909371044e-06,
"loss": 4.5104,
"step": 398
},
{
"epoch": 0.3465798045602606,
"grad_norm": 5.177887473136741,
"learning_rate": 7.800773728210133e-06,
"loss": 4.7158,
"step": 399
},
{
"epoch": 0.3474484256243214,
"grad_norm": 6.985161948689458,
"learning_rate": 7.789926476883079e-06,
"loss": 4.5611,
"step": 400
},
{
"epoch": 0.3483170466883822,
"grad_norm": 6.148668513601841,
"learning_rate": 7.779061240583669e-06,
"loss": 4.7723,
"step": 401
},
{
"epoch": 0.349185667752443,
"grad_norm": 5.763986287210963,
"learning_rate": 7.768178104646953e-06,
"loss": 4.6559,
"step": 402
},
{
"epoch": 0.3500542888165038,
"grad_norm": 5.7385002364185915,
"learning_rate": 7.757277154548552e-06,
"loss": 4.6866,
"step": 403
},
{
"epoch": 0.3509229098805646,
"grad_norm": 5.490491868927748,
"learning_rate": 7.746358475904006e-06,
"loss": 4.4739,
"step": 404
},
{
"epoch": 0.3517915309446254,
"grad_norm": 7.746007698882787,
"learning_rate": 7.735422154468087e-06,
"loss": 4.6623,
"step": 405
},
{
"epoch": 0.3526601520086862,
"grad_norm": 5.140117986756119,
"learning_rate": 7.724468276134143e-06,
"loss": 4.5921,
"step": 406
},
{
"epoch": 0.353528773072747,
"grad_norm": 5.822784492855372,
"learning_rate": 7.713496926933405e-06,
"loss": 4.5497,
"step": 407
},
{
"epoch": 0.3543973941368078,
"grad_norm": 4.504836312173092,
"learning_rate": 7.70250819303432e-06,
"loss": 4.6658,
"step": 408
},
{
"epoch": 0.35526601520086865,
"grad_norm": 10.535092606649714,
"learning_rate": 7.691502160741879e-06,
"loss": 4.5037,
"step": 409
},
{
"epoch": 0.3561346362649294,
"grad_norm": 6.1030400833165555,
"learning_rate": 7.680478916496927e-06,
"loss": 4.7045,
"step": 410
},
{
"epoch": 0.35700325732899024,
"grad_norm": 7.459890595821586,
"learning_rate": 7.669438546875495e-06,
"loss": 4.7035,
"step": 411
},
{
"epoch": 0.357871878393051,
"grad_norm": 8.136016337416066,
"learning_rate": 7.658381138588111e-06,
"loss": 4.5822,
"step": 412
},
{
"epoch": 0.35874049945711184,
"grad_norm": 7.256847852406638,
"learning_rate": 7.647306778479135e-06,
"loss": 4.5594,
"step": 413
},
{
"epoch": 0.35960912052117266,
"grad_norm": 7.032905775099178,
"learning_rate": 7.636215553526054e-06,
"loss": 4.6578,
"step": 414
},
{
"epoch": 0.36047774158523344,
"grad_norm": 6.984223271346296,
"learning_rate": 7.625107550838813e-06,
"loss": 4.5949,
"step": 415
},
{
"epoch": 0.36134636264929426,
"grad_norm": 8.223177636534274,
"learning_rate": 7.613982857659134e-06,
"loss": 4.5468,
"step": 416
},
{
"epoch": 0.36221498371335503,
"grad_norm": 5.980657620361127,
"learning_rate": 7.602841561359822e-06,
"loss": 4.495,
"step": 417
},
{
"epoch": 0.36308360477741586,
"grad_norm": 6.056077053853974,
"learning_rate": 7.591683749444077e-06,
"loss": 4.7391,
"step": 418
},
{
"epoch": 0.36395222584147663,
"grad_norm": 7.884243252755605,
"learning_rate": 7.5805095095448245e-06,
"loss": 4.5782,
"step": 419
},
{
"epoch": 0.36482084690553745,
"grad_norm": 8.363070322855709,
"learning_rate": 7.569318929424002e-06,
"loss": 4.5914,
"step": 420
},
{
"epoch": 0.3656894679695983,
"grad_norm": 6.163289948356193,
"learning_rate": 7.558112096971889e-06,
"loss": 4.6895,
"step": 421
},
{
"epoch": 0.36655808903365905,
"grad_norm": 6.6531465448851135,
"learning_rate": 7.5468891002064045e-06,
"loss": 4.5072,
"step": 422
},
{
"epoch": 0.3674267100977199,
"grad_norm": 6.718432675278386,
"learning_rate": 7.535650027272432e-06,
"loss": 4.5402,
"step": 423
},
{
"epoch": 0.36829533116178065,
"grad_norm": 6.071906565126357,
"learning_rate": 7.5243949664411035e-06,
"loss": 4.6823,
"step": 424
},
{
"epoch": 0.3691639522258415,
"grad_norm": 7.151230840698435,
"learning_rate": 7.5131240061091285e-06,
"loss": 4.5005,
"step": 425
},
{
"epoch": 0.3700325732899023,
"grad_norm": 5.338683591643838,
"learning_rate": 7.501837234798084e-06,
"loss": 4.5487,
"step": 426
},
{
"epoch": 0.37090119435396307,
"grad_norm": 6.10815615028733,
"learning_rate": 7.490534741153733e-06,
"loss": 4.6367,
"step": 427
},
{
"epoch": 0.3717698154180239,
"grad_norm": 6.115503279804057,
"learning_rate": 7.47921661394531e-06,
"loss": 4.5094,
"step": 428
},
{
"epoch": 0.37263843648208467,
"grad_norm": 5.269106785049928,
"learning_rate": 7.46788294206485e-06,
"loss": 4.5126,
"step": 429
},
{
"epoch": 0.3735070575461455,
"grad_norm": 7.458849192557889,
"learning_rate": 7.4565338145264595e-06,
"loss": 4.7316,
"step": 430
},
{
"epoch": 0.3743756786102063,
"grad_norm": 4.724897099377871,
"learning_rate": 7.445169320465645e-06,
"loss": 4.7243,
"step": 431
},
{
"epoch": 0.3752442996742671,
"grad_norm": 7.229325419079616,
"learning_rate": 7.433789549138592e-06,
"loss": 4.4991,
"step": 432
},
{
"epoch": 0.3761129207383279,
"grad_norm": 6.540185537119028,
"learning_rate": 7.42239458992148e-06,
"loss": 4.4751,
"step": 433
},
{
"epoch": 0.3769815418023887,
"grad_norm": 7.538652093884085,
"learning_rate": 7.410984532309768e-06,
"loss": 4.5819,
"step": 434
},
{
"epoch": 0.3778501628664495,
"grad_norm": 10.09025026929928,
"learning_rate": 7.399559465917499e-06,
"loss": 4.5811,
"step": 435
},
{
"epoch": 0.37871878393051034,
"grad_norm": 5.99150737185514,
"learning_rate": 7.3881194804765975e-06,
"loss": 4.6508,
"step": 436
},
{
"epoch": 0.3795874049945711,
"grad_norm": 10.294753301023693,
"learning_rate": 7.376664665836156e-06,
"loss": 4.6694,
"step": 437
},
{
"epoch": 0.38045602605863194,
"grad_norm": 10.09163312709563,
"learning_rate": 7.3651951119617415e-06,
"loss": 4.515,
"step": 438
},
{
"epoch": 0.3813246471226927,
"grad_norm": 8.660200561326562,
"learning_rate": 7.353710908934672e-06,
"loss": 4.7109,
"step": 439
},
{
"epoch": 0.38219326818675353,
"grad_norm": 5.980585172331498,
"learning_rate": 7.342212146951329e-06,
"loss": 4.62,
"step": 440
},
{
"epoch": 0.38306188925081436,
"grad_norm": 7.2092542635747705,
"learning_rate": 7.3306989163224365e-06,
"loss": 4.6266,
"step": 441
},
{
"epoch": 0.38393051031487513,
"grad_norm": 8.245957827229953,
"learning_rate": 7.319171307472355e-06,
"loss": 4.5792,
"step": 442
},
{
"epoch": 0.38479913137893595,
"grad_norm": 4.686151630126687,
"learning_rate": 7.307629410938364e-06,
"loss": 4.6665,
"step": 443
},
{
"epoch": 0.3856677524429967,
"grad_norm": 7.408155624716384,
"learning_rate": 7.296073317369967e-06,
"loss": 4.3742,
"step": 444
},
{
"epoch": 0.38653637350705755,
"grad_norm": 7.497505710273664,
"learning_rate": 7.284503117528167e-06,
"loss": 4.5901,
"step": 445
},
{
"epoch": 0.3874049945711184,
"grad_norm": 4.224052938744046,
"learning_rate": 7.272918902284758e-06,
"loss": 4.625,
"step": 446
},
{
"epoch": 0.38827361563517915,
"grad_norm": 9.130148186812361,
"learning_rate": 7.261320762621605e-06,
"loss": 4.5953,
"step": 447
},
{
"epoch": 0.38914223669924,
"grad_norm": 5.04037207826654,
"learning_rate": 7.249708789629944e-06,
"loss": 4.3398,
"step": 448
},
{
"epoch": 0.39001085776330074,
"grad_norm": 10.934078227299317,
"learning_rate": 7.2380830745096474e-06,
"loss": 4.8164,
"step": 449
},
{
"epoch": 0.39087947882736157,
"grad_norm": 5.288203351054606,
"learning_rate": 7.226443708568525e-06,
"loss": 4.6176,
"step": 450
},
{
"epoch": 0.39174809989142234,
"grad_norm": 13.494419416996744,
"learning_rate": 7.214790783221596e-06,
"loss": 4.4502,
"step": 451
},
{
"epoch": 0.39261672095548317,
"grad_norm": 5.947583732503299,
"learning_rate": 7.2031243899903755e-06,
"loss": 4.6519,
"step": 452
},
{
"epoch": 0.393485342019544,
"grad_norm": 13.629266769082431,
"learning_rate": 7.191444620502154e-06,
"loss": 4.6178,
"step": 453
},
{
"epoch": 0.39435396308360476,
"grad_norm": 8.435112519451463,
"learning_rate": 7.17975156648928e-06,
"loss": 4.3477,
"step": 454
},
{
"epoch": 0.3952225841476656,
"grad_norm": 13.097395961934241,
"learning_rate": 7.168045319788436e-06,
"loss": 4.5158,
"step": 455
},
{
"epoch": 0.39609120521172636,
"grad_norm": 9.326564191975844,
"learning_rate": 7.1563259723399204e-06,
"loss": 4.6681,
"step": 456
},
{
"epoch": 0.3969598262757872,
"grad_norm": 12.31102530036286,
"learning_rate": 7.144593616186925e-06,
"loss": 4.4425,
"step": 457
},
{
"epoch": 0.397828447339848,
"grad_norm": 9.61855296186821,
"learning_rate": 7.13284834347481e-06,
"loss": 4.755,
"step": 458
},
{
"epoch": 0.3986970684039088,
"grad_norm": 11.132702885829065,
"learning_rate": 7.121090246450381e-06,
"loss": 4.6984,
"step": 459
},
{
"epoch": 0.3995656894679696,
"grad_norm": 9.438660704133007,
"learning_rate": 7.1093194174611665e-06,
"loss": 4.5854,
"step": 460
},
{
"epoch": 0.4004343105320304,
"grad_norm": 10.460987068330683,
"learning_rate": 7.0975359489546914e-06,
"loss": 4.6308,
"step": 461
},
{
"epoch": 0.4013029315960912,
"grad_norm": 8.135876347998378,
"learning_rate": 7.0857399334777525e-06,
"loss": 4.5529,
"step": 462
},
{
"epoch": 0.40217155266015203,
"grad_norm": 12.65605978727452,
"learning_rate": 7.073931463675685e-06,
"loss": 4.5816,
"step": 463
},
{
"epoch": 0.4030401737242128,
"grad_norm": 10.229434009840299,
"learning_rate": 7.062110632291641e-06,
"loss": 4.5845,
"step": 464
},
{
"epoch": 0.40390879478827363,
"grad_norm": 9.117061503179459,
"learning_rate": 7.0502775321658655e-06,
"loss": 4.5246,
"step": 465
},
{
"epoch": 0.4047774158523344,
"grad_norm": 8.569940050056474,
"learning_rate": 7.038432256234956e-06,
"loss": 4.5735,
"step": 466
},
{
"epoch": 0.4056460369163952,
"grad_norm": 7.752296760070429,
"learning_rate": 7.026574897531137e-06,
"loss": 4.3389,
"step": 467
},
{
"epoch": 0.40651465798045605,
"grad_norm": 6.650253151223029,
"learning_rate": 7.014705549181537e-06,
"loss": 4.559,
"step": 468
},
{
"epoch": 0.4073832790445168,
"grad_norm": 6.044551087774109,
"learning_rate": 7.0028243044074425e-06,
"loss": 4.565,
"step": 469
},
{
"epoch": 0.40825190010857765,
"grad_norm": 7.013354857314451,
"learning_rate": 6.990931256523583e-06,
"loss": 4.4699,
"step": 470
},
{
"epoch": 0.4091205211726384,
"grad_norm": 7.694869076287624,
"learning_rate": 6.97902649893738e-06,
"loss": 4.5624,
"step": 471
},
{
"epoch": 0.40998914223669924,
"grad_norm": 4.8526449692385905,
"learning_rate": 6.96711012514823e-06,
"loss": 4.5316,
"step": 472
},
{
"epoch": 0.41085776330076007,
"grad_norm": 6.982180222415533,
"learning_rate": 6.955182228746757e-06,
"loss": 4.4836,
"step": 473
},
{
"epoch": 0.41172638436482084,
"grad_norm": 5.126018865451303,
"learning_rate": 6.943242903414087e-06,
"loss": 4.6034,
"step": 474
},
{
"epoch": 0.41259500542888167,
"grad_norm": 5.649991787266421,
"learning_rate": 6.9312922429211065e-06,
"loss": 4.576,
"step": 475
},
{
"epoch": 0.41346362649294244,
"grad_norm": 6.484179499960972,
"learning_rate": 6.9193303411277265e-06,
"loss": 4.4123,
"step": 476
},
{
"epoch": 0.41433224755700326,
"grad_norm": 6.697110498760649,
"learning_rate": 6.907357291982148e-06,
"loss": 4.5219,
"step": 477
},
{
"epoch": 0.41520086862106403,
"grad_norm": 5.161776605561941,
"learning_rate": 6.895373189520124e-06,
"loss": 4.6172,
"step": 478
},
{
"epoch": 0.41606948968512486,
"grad_norm": 6.744775916139352,
"learning_rate": 6.883378127864218e-06,
"loss": 4.4805,
"step": 479
},
{
"epoch": 0.4169381107491857,
"grad_norm": 5.164686600631932,
"learning_rate": 6.871372201223068e-06,
"loss": 4.5713,
"step": 480
},
{
"epoch": 0.41780673181324646,
"grad_norm": 6.684936475739929,
"learning_rate": 6.859355503890643e-06,
"loss": 4.5636,
"step": 481
},
{
"epoch": 0.4186753528773073,
"grad_norm": 4.963603500277838,
"learning_rate": 6.847328130245506e-06,
"loss": 4.4979,
"step": 482
},
{
"epoch": 0.41954397394136805,
"grad_norm": 6.455524904363139,
"learning_rate": 6.83529017475007e-06,
"loss": 4.5704,
"step": 483
},
{
"epoch": 0.4204125950054289,
"grad_norm": 5.327238544989892,
"learning_rate": 6.8232417319498585e-06,
"loss": 4.5729,
"step": 484
},
{
"epoch": 0.4212812160694897,
"grad_norm": 7.5408057979505525,
"learning_rate": 6.811182896472764e-06,
"loss": 4.5542,
"step": 485
},
{
"epoch": 0.4221498371335505,
"grad_norm": 5.148215274061414,
"learning_rate": 6.799113763028296e-06,
"loss": 4.5727,
"step": 486
},
{
"epoch": 0.4230184581976113,
"grad_norm": 5.126262165864149,
"learning_rate": 6.78703442640685e-06,
"loss": 4.392,
"step": 487
},
{
"epoch": 0.4238870792616721,
"grad_norm": 6.5704973294653986,
"learning_rate": 6.774944981478953e-06,
"loss": 4.7496,
"step": 488
},
{
"epoch": 0.4247557003257329,
"grad_norm": 7.6270736279053715,
"learning_rate": 6.762845523194527e-06,
"loss": 4.4043,
"step": 489
},
{
"epoch": 0.4256243213897937,
"grad_norm": 4.7832965680424975,
"learning_rate": 6.750736146582129e-06,
"loss": 4.4942,
"step": 490
},
{
"epoch": 0.4264929424538545,
"grad_norm": 8.637265182760746,
"learning_rate": 6.738616946748229e-06,
"loss": 4.6536,
"step": 491
},
{
"epoch": 0.4273615635179153,
"grad_norm": 5.5509816327572,
"learning_rate": 6.726488018876431e-06,
"loss": 4.5646,
"step": 492
},
{
"epoch": 0.4282301845819761,
"grad_norm": 7.976660262443447,
"learning_rate": 6.7143494582267565e-06,
"loss": 4.3377,
"step": 493
},
{
"epoch": 0.4290988056460369,
"grad_norm": 5.8635253190574925,
"learning_rate": 6.702201360134874e-06,
"loss": 4.6456,
"step": 494
},
{
"epoch": 0.42996742671009774,
"grad_norm": 6.191768041893744,
"learning_rate": 6.690043820011362e-06,
"loss": 4.4138,
"step": 495
},
{
"epoch": 0.4308360477741585,
"grad_norm": 6.21848167204258,
"learning_rate": 6.677876933340952e-06,
"loss": 4.5893,
"step": 496
},
{
"epoch": 0.43170466883821934,
"grad_norm": 6.974213096926323,
"learning_rate": 6.665700795681795e-06,
"loss": 4.488,
"step": 497
},
{
"epoch": 0.4325732899022801,
"grad_norm": 4.90870178244335,
"learning_rate": 6.65351550266468e-06,
"loss": 4.4874,
"step": 498
},
{
"epoch": 0.43344191096634094,
"grad_norm": 8.263746621881054,
"learning_rate": 6.64132114999232e-06,
"loss": 4.3805,
"step": 499
},
{
"epoch": 0.43431053203040176,
"grad_norm": 7.323957132091585,
"learning_rate": 6.6291178334385695e-06,
"loss": 4.4165,
"step": 500
},
{
"epoch": 0.43517915309446253,
"grad_norm": 6.480701522877974,
"learning_rate": 6.616905648847693e-06,
"loss": 4.3516,
"step": 501
},
{
"epoch": 0.43604777415852336,
"grad_norm": 7.079757216369442,
"learning_rate": 6.604684692133597e-06,
"loss": 4.6046,
"step": 502
},
{
"epoch": 0.43691639522258413,
"grad_norm": 7.375345908223872,
"learning_rate": 6.5924550592790894e-06,
"loss": 4.5464,
"step": 503
},
{
"epoch": 0.43778501628664496,
"grad_norm": 7.222616053445717,
"learning_rate": 6.580216846335118e-06,
"loss": 4.5919,
"step": 504
},
{
"epoch": 0.4386536373507057,
"grad_norm": 4.689789965685166,
"learning_rate": 6.567970149420018e-06,
"loss": 4.3752,
"step": 505
},
{
"epoch": 0.43952225841476655,
"grad_norm": 5.714230313496368,
"learning_rate": 6.555715064718756e-06,
"loss": 4.5501,
"step": 506
},
{
"epoch": 0.4403908794788274,
"grad_norm": 7.038923018037996,
"learning_rate": 6.543451688482182e-06,
"loss": 4.4103,
"step": 507
},
{
"epoch": 0.44125950054288815,
"grad_norm": 7.943178319434065,
"learning_rate": 6.531180117026258e-06,
"loss": 4.5398,
"step": 508
},
{
"epoch": 0.442128121606949,
"grad_norm": 5.455050769628667,
"learning_rate": 6.518900446731319e-06,
"loss": 4.5569,
"step": 509
},
{
"epoch": 0.44299674267100975,
"grad_norm": 9.260245349881815,
"learning_rate": 6.506612774041302e-06,
"loss": 4.6123,
"step": 510
},
{
"epoch": 0.4438653637350706,
"grad_norm": 8.258092780806443,
"learning_rate": 6.494317195462999e-06,
"loss": 4.4937,
"step": 511
},
{
"epoch": 0.4447339847991314,
"grad_norm": 7.575744472833341,
"learning_rate": 6.482013807565292e-06,
"loss": 4.479,
"step": 512
},
{
"epoch": 0.44560260586319217,
"grad_norm": 6.848928494852384,
"learning_rate": 6.469702706978397e-06,
"loss": 4.6304,
"step": 513
},
{
"epoch": 0.446471226927253,
"grad_norm": 6.429893016072357,
"learning_rate": 6.457383990393105e-06,
"loss": 4.4752,
"step": 514
},
{
"epoch": 0.44733984799131377,
"grad_norm": 6.645971348847954,
"learning_rate": 6.445057754560025e-06,
"loss": 4.4823,
"step": 515
},
{
"epoch": 0.4482084690553746,
"grad_norm": 7.649197834317788,
"learning_rate": 6.432724096288818e-06,
"loss": 4.4119,
"step": 516
},
{
"epoch": 0.4490770901194354,
"grad_norm": 4.915321016306434,
"learning_rate": 6.420383112447446e-06,
"loss": 4.5927,
"step": 517
},
{
"epoch": 0.4499457111834962,
"grad_norm": 11.680853631510265,
"learning_rate": 6.408034899961398e-06,
"loss": 4.6962,
"step": 518
},
{
"epoch": 0.450814332247557,
"grad_norm": 8.30571918662033,
"learning_rate": 6.395679555812942e-06,
"loss": 4.4227,
"step": 519
},
{
"epoch": 0.4516829533116178,
"grad_norm": 9.811824442503674,
"learning_rate": 6.383317177040357e-06,
"loss": 4.5434,
"step": 520
},
{
"epoch": 0.4525515743756786,
"grad_norm": 8.160240990996138,
"learning_rate": 6.370947860737173e-06,
"loss": 4.5247,
"step": 521
},
{
"epoch": 0.45342019543973944,
"grad_norm": 9.499135895845495,
"learning_rate": 6.358571704051401e-06,
"loss": 4.5605,
"step": 522
},
{
"epoch": 0.4542888165038002,
"grad_norm": 8.5708728035736,
"learning_rate": 6.346188804184782e-06,
"loss": 4.4378,
"step": 523
},
{
"epoch": 0.45515743756786103,
"grad_norm": 8.020265826405886,
"learning_rate": 6.333799258392015e-06,
"loss": 4.3991,
"step": 524
},
{
"epoch": 0.4560260586319218,
"grad_norm": 8.224933821515378,
"learning_rate": 6.3214031639799975e-06,
"loss": 4.5,
"step": 525
},
{
"epoch": 0.45689467969598263,
"grad_norm": 7.476113586399829,
"learning_rate": 6.309000618307058e-06,
"loss": 4.5767,
"step": 526
},
{
"epoch": 0.45776330076004346,
"grad_norm": 7.324699664493708,
"learning_rate": 6.296591718782193e-06,
"loss": 4.6139,
"step": 527
},
{
"epoch": 0.4586319218241042,
"grad_norm": 8.277908852422534,
"learning_rate": 6.284176562864303e-06,
"loss": 4.4586,
"step": 528
},
{
"epoch": 0.45950054288816505,
"grad_norm": 6.4211734472674005,
"learning_rate": 6.271755248061425e-06,
"loss": 4.4396,
"step": 529
},
{
"epoch": 0.4603691639522258,
"grad_norm": 8.333565942506247,
"learning_rate": 6.259327871929968e-06,
"loss": 4.4319,
"step": 530
},
{
"epoch": 0.46123778501628665,
"grad_norm": 5.765983896353204,
"learning_rate": 6.246894532073945e-06,
"loss": 4.5144,
"step": 531
},
{
"epoch": 0.4621064060803475,
"grad_norm": 8.432302157463148,
"learning_rate": 6.234455326144208e-06,
"loss": 4.4091,
"step": 532
},
{
"epoch": 0.46297502714440825,
"grad_norm": 6.536784604924449,
"learning_rate": 6.222010351837684e-06,
"loss": 4.438,
"step": 533
},
{
"epoch": 0.4638436482084691,
"grad_norm": 7.775568222041834,
"learning_rate": 6.209559706896603e-06,
"loss": 4.4694,
"step": 534
},
{
"epoch": 0.46471226927252984,
"grad_norm": 7.053360240600179,
"learning_rate": 6.197103489107726e-06,
"loss": 4.5039,
"step": 535
},
{
"epoch": 0.46558089033659067,
"grad_norm": 6.449780887258137,
"learning_rate": 6.184641796301596e-06,
"loss": 4.4826,
"step": 536
},
{
"epoch": 0.46644951140065144,
"grad_norm": 6.784307145245734,
"learning_rate": 6.172174726351743e-06,
"loss": 4.6193,
"step": 537
},
{
"epoch": 0.46731813246471227,
"grad_norm": 6.2467877899636575,
"learning_rate": 6.159702377173935e-06,
"loss": 4.4363,
"step": 538
},
{
"epoch": 0.4681867535287731,
"grad_norm": 7.5393857303754155,
"learning_rate": 6.147224846725402e-06,
"loss": 4.4544,
"step": 539
},
{
"epoch": 0.46905537459283386,
"grad_norm": 6.6525514703420585,
"learning_rate": 6.134742233004073e-06,
"loss": 4.5408,
"step": 540
},
{
"epoch": 0.4699239956568947,
"grad_norm": 7.973280585624998,
"learning_rate": 6.122254634047787e-06,
"loss": 4.5086,
"step": 541
},
{
"epoch": 0.47079261672095546,
"grad_norm": 6.6800439107137635,
"learning_rate": 6.109762147933553e-06,
"loss": 4.3572,
"step": 542
},
{
"epoch": 0.4716612377850163,
"grad_norm": 6.076538888634451,
"learning_rate": 6.097264872776749e-06,
"loss": 4.5751,
"step": 543
},
{
"epoch": 0.4725298588490771,
"grad_norm": 8.835225489595787,
"learning_rate": 6.084762906730379e-06,
"loss": 4.5004,
"step": 544
},
{
"epoch": 0.4733984799131379,
"grad_norm": 4.829662247046767,
"learning_rate": 6.0722563479842764e-06,
"loss": 4.3311,
"step": 545
},
{
"epoch": 0.4742671009771987,
"grad_norm": 6.986298812339827,
"learning_rate": 6.059745294764359e-06,
"loss": 4.365,
"step": 546
},
{
"epoch": 0.4751357220412595,
"grad_norm": 7.574390765696185,
"learning_rate": 6.04722984533183e-06,
"loss": 4.4365,
"step": 547
},
{
"epoch": 0.4760043431053203,
"grad_norm": 5.7402675424404395,
"learning_rate": 6.034710097982432e-06,
"loss": 4.2018,
"step": 548
},
{
"epoch": 0.47687296416938113,
"grad_norm": 7.703977261086262,
"learning_rate": 6.022186151045652e-06,
"loss": 4.4048,
"step": 549
},
{
"epoch": 0.4777415852334419,
"grad_norm": 6.289071645181934,
"learning_rate": 6.009658102883974e-06,
"loss": 4.5359,
"step": 550
},
{
"epoch": 0.4786102062975027,
"grad_norm": 6.972078426295843,
"learning_rate": 5.997126051892082e-06,
"loss": 4.4349,
"step": 551
},
{
"epoch": 0.4794788273615635,
"grad_norm": 7.66830398869564,
"learning_rate": 5.984590096496099e-06,
"loss": 4.5563,
"step": 552
},
{
"epoch": 0.4803474484256243,
"grad_norm": 6.0227616340295365,
"learning_rate": 5.972050335152819e-06,
"loss": 4.2982,
"step": 553
},
{
"epoch": 0.48121606948968515,
"grad_norm": 8.678277229194649,
"learning_rate": 5.959506866348924e-06,
"loss": 4.4519,
"step": 554
},
{
"epoch": 0.4820846905537459,
"grad_norm": 4.787842386763273,
"learning_rate": 5.94695978860021e-06,
"loss": 4.438,
"step": 555
},
{
"epoch": 0.48295331161780675,
"grad_norm": 10.597087001950817,
"learning_rate": 5.934409200450828e-06,
"loss": 4.3361,
"step": 556
},
{
"epoch": 0.4838219326818675,
"grad_norm": 6.769261685937347,
"learning_rate": 5.9218552004724895e-06,
"loss": 4.2451,
"step": 557
},
{
"epoch": 0.48469055374592834,
"grad_norm": 8.774187421231344,
"learning_rate": 5.909297887263708e-06,
"loss": 4.4513,
"step": 558
},
{
"epoch": 0.48555917480998917,
"grad_norm": 6.831684880334275,
"learning_rate": 5.896737359449015e-06,
"loss": 4.5108,
"step": 559
},
{
"epoch": 0.48642779587404994,
"grad_norm": 7.414601371689213,
"learning_rate": 5.884173715678193e-06,
"loss": 4.4822,
"step": 560
},
{
"epoch": 0.48729641693811077,
"grad_norm": 6.332769305151126,
"learning_rate": 5.871607054625497e-06,
"loss": 4.3,
"step": 561
},
{
"epoch": 0.48816503800217154,
"grad_norm": 7.518053075962911,
"learning_rate": 5.859037474988875e-06,
"loss": 4.2958,
"step": 562
},
{
"epoch": 0.48903365906623236,
"grad_norm": 7.791263799758281,
"learning_rate": 5.846465075489202e-06,
"loss": 4.6028,
"step": 563
},
{
"epoch": 0.48990228013029313,
"grad_norm": 7.99797189120995,
"learning_rate": 5.8338899548695004e-06,
"loss": 4.362,
"step": 564
},
{
"epoch": 0.49077090119435396,
"grad_norm": 7.553380214852644,
"learning_rate": 5.821312211894159e-06,
"loss": 4.5026,
"step": 565
},
{
"epoch": 0.4916395222584148,
"grad_norm": 5.392709057685767,
"learning_rate": 5.808731945348168e-06,
"loss": 4.3386,
"step": 566
},
{
"epoch": 0.49250814332247556,
"grad_norm": 7.863510875655094,
"learning_rate": 5.7961492540363365e-06,
"loss": 4.564,
"step": 567
},
{
"epoch": 0.4933767643865364,
"grad_norm": 7.618164313464631,
"learning_rate": 5.783564236782514e-06,
"loss": 4.4442,
"step": 568
},
{
"epoch": 0.49424538545059715,
"grad_norm": 8.728366265826342,
"learning_rate": 5.770976992428821e-06,
"loss": 4.4746,
"step": 569
},
{
"epoch": 0.495114006514658,
"grad_norm": 7.8924314705792025,
"learning_rate": 5.758387619834872e-06,
"loss": 4.4972,
"step": 570
},
{
"epoch": 0.4959826275787188,
"grad_norm": 6.344676137880645,
"learning_rate": 5.74579621787699e-06,
"loss": 4.4232,
"step": 571
},
{
"epoch": 0.4968512486427796,
"grad_norm": 5.552402668916067,
"learning_rate": 5.73320288544744e-06,
"loss": 4.46,
"step": 572
},
{
"epoch": 0.4977198697068404,
"grad_norm": 7.611624637980265,
"learning_rate": 5.720607721453651e-06,
"loss": 4.5376,
"step": 573
},
{
"epoch": 0.49858849077090117,
"grad_norm": 7.077190569309273,
"learning_rate": 5.708010824817432e-06,
"loss": 4.3802,
"step": 574
},
{
"epoch": 0.499457111834962,
"grad_norm": 6.943034504994782,
"learning_rate": 5.695412294474208e-06,
"loss": 4.4052,
"step": 575
},
{
"epoch": 0.5003257328990228,
"grad_norm": 7.847100098403116,
"learning_rate": 5.682812229372225e-06,
"loss": 4.3222,
"step": 576
},
{
"epoch": 0.5011943539630836,
"grad_norm": 4.926170240297532,
"learning_rate": 5.67021072847179e-06,
"loss": 4.3407,
"step": 577
},
{
"epoch": 0.5020629750271444,
"grad_norm": 4.81827513225177,
"learning_rate": 5.657607890744485e-06,
"loss": 4.5482,
"step": 578
},
{
"epoch": 0.5029315960912052,
"grad_norm": 9.69284798196995,
"learning_rate": 5.64500381517239e-06,
"loss": 4.5489,
"step": 579
},
{
"epoch": 0.503800217155266,
"grad_norm": 6.4397567914279525,
"learning_rate": 5.632398600747307e-06,
"loss": 4.5095,
"step": 580
},
{
"epoch": 0.5046688382193268,
"grad_norm": 7.564842572267876,
"learning_rate": 5.619792346469988e-06,
"loss": 4.3857,
"step": 581
},
{
"epoch": 0.5055374592833877,
"grad_norm": 5.717273086517985,
"learning_rate": 5.607185151349342e-06,
"loss": 4.601,
"step": 582
},
{
"epoch": 0.5064060803474484,
"grad_norm": 5.765808842291691,
"learning_rate": 5.594577114401677e-06,
"loss": 4.4892,
"step": 583
},
{
"epoch": 0.5072747014115092,
"grad_norm": 8.975257143861999,
"learning_rate": 5.581968334649906e-06,
"loss": 4.3199,
"step": 584
},
{
"epoch": 0.50814332247557,
"grad_norm": 5.004707622689579,
"learning_rate": 5.56935891112278e-06,
"loss": 4.5351,
"step": 585
},
{
"epoch": 0.5090119435396309,
"grad_norm": 12.075805744578952,
"learning_rate": 5.5567489428541035e-06,
"loss": 4.5282,
"step": 586
},
{
"epoch": 0.5098805646036917,
"grad_norm": 7.78389463909419,
"learning_rate": 5.54413852888196e-06,
"loss": 4.3581,
"step": 587
},
{
"epoch": 0.5107491856677524,
"grad_norm": 7.658427280297901,
"learning_rate": 5.531527768247935e-06,
"loss": 4.2974,
"step": 588
},
{
"epoch": 0.5116178067318132,
"grad_norm": 7.231975397294993,
"learning_rate": 5.518916759996337e-06,
"loss": 4.3415,
"step": 589
},
{
"epoch": 0.512486427795874,
"grad_norm": 6.751836330853958,
"learning_rate": 5.506305603173414e-06,
"loss": 4.3004,
"step": 590
},
{
"epoch": 0.5133550488599349,
"grad_norm": 8.355115868210659,
"learning_rate": 5.493694396826589e-06,
"loss": 4.4837,
"step": 591
},
{
"epoch": 0.5142236699239957,
"grad_norm": 6.193051769061237,
"learning_rate": 5.481083240003665e-06,
"loss": 4.4574,
"step": 592
},
{
"epoch": 0.5150922909880564,
"grad_norm": 8.721016177095263,
"learning_rate": 5.468472231752065e-06,
"loss": 4.5605,
"step": 593
},
{
"epoch": 0.5159609120521172,
"grad_norm": 8.690619765588671,
"learning_rate": 5.455861471118041e-06,
"loss": 4.4022,
"step": 594
},
{
"epoch": 0.5168295331161781,
"grad_norm": 6.149157342377077,
"learning_rate": 5.443251057145899e-06,
"loss": 4.3339,
"step": 595
},
{
"epoch": 0.5176981541802389,
"grad_norm": 9.505811782297348,
"learning_rate": 5.430641088877221e-06,
"loss": 4.2946,
"step": 596
},
{
"epoch": 0.5185667752442997,
"grad_norm": 7.387109532510519,
"learning_rate": 5.418031665350096e-06,
"loss": 4.4647,
"step": 597
},
{
"epoch": 0.5194353963083604,
"grad_norm": 8.365180622825434,
"learning_rate": 5.405422885598324e-06,
"loss": 4.376,
"step": 598
},
{
"epoch": 0.5203040173724213,
"grad_norm": 7.489996252960465,
"learning_rate": 5.3928148486506584e-06,
"loss": 4.452,
"step": 599
},
{
"epoch": 0.5211726384364821,
"grad_norm": 7.038197518326514,
"learning_rate": 5.380207653530014e-06,
"loss": 4.433,
"step": 600
},
{
"epoch": 0.5220412595005429,
"grad_norm": 6.5440132590329725,
"learning_rate": 5.367601399252694e-06,
"loss": 4.339,
"step": 601
},
{
"epoch": 0.5229098805646037,
"grad_norm": 7.590698363002852,
"learning_rate": 5.354996184827612e-06,
"loss": 4.5249,
"step": 602
},
{
"epoch": 0.5237785016286645,
"grad_norm": 5.330185549680669,
"learning_rate": 5.3423921092555184e-06,
"loss": 4.2862,
"step": 603
},
{
"epoch": 0.5246471226927253,
"grad_norm": 11.75232584431798,
"learning_rate": 5.329789271528212e-06,
"loss": 4.5025,
"step": 604
},
{
"epoch": 0.5255157437567861,
"grad_norm": 7.036744842599782,
"learning_rate": 5.3171877706277785e-06,
"loss": 4.5778,
"step": 605
},
{
"epoch": 0.5263843648208469,
"grad_norm": 14.767373768502868,
"learning_rate": 5.304587705525795e-06,
"loss": 4.6429,
"step": 606
},
{
"epoch": 0.5272529858849077,
"grad_norm": 13.155922558039405,
"learning_rate": 5.291989175182569e-06,
"loss": 4.308,
"step": 607
},
{
"epoch": 0.5281216069489685,
"grad_norm": 10.112650168109823,
"learning_rate": 5.2793922785463515e-06,
"loss": 4.484,
"step": 608
},
{
"epoch": 0.5289902280130293,
"grad_norm": 11.115807165127777,
"learning_rate": 5.266797114552562e-06,
"loss": 4.2531,
"step": 609
},
{
"epoch": 0.5298588490770901,
"grad_norm": 8.789007652727864,
"learning_rate": 5.254203782123013e-06,
"loss": 4.4873,
"step": 610
},
{
"epoch": 0.530727470141151,
"grad_norm": 8.584208445591985,
"learning_rate": 5.241612380165131e-06,
"loss": 4.474,
"step": 611
},
{
"epoch": 0.5315960912052117,
"grad_norm": 8.322437053520234,
"learning_rate": 5.229023007571179e-06,
"loss": 4.3145,
"step": 612
},
{
"epoch": 0.5324647122692725,
"grad_norm": 7.166099901687578,
"learning_rate": 5.216435763217487e-06,
"loss": 4.3139,
"step": 613
},
{
"epoch": 0.5333333333333333,
"grad_norm": 10.11706910820565,
"learning_rate": 5.203850745963666e-06,
"loss": 4.4996,
"step": 614
},
{
"epoch": 0.5342019543973942,
"grad_norm": 7.536732454472386,
"learning_rate": 5.191268054651833e-06,
"loss": 4.4032,
"step": 615
},
{
"epoch": 0.535070575461455,
"grad_norm": 9.963574957880162,
"learning_rate": 5.178687788105842e-06,
"loss": 4.5332,
"step": 616
},
{
"epoch": 0.5359391965255157,
"grad_norm": 7.28030489177068,
"learning_rate": 5.166110045130503e-06,
"loss": 4.4073,
"step": 617
},
{
"epoch": 0.5368078175895765,
"grad_norm": 9.51169465858539,
"learning_rate": 5.153534924510799e-06,
"loss": 4.3795,
"step": 618
},
{
"epoch": 0.5376764386536373,
"grad_norm": 6.905433659455053,
"learning_rate": 5.1409625250111265e-06,
"loss": 4.4542,
"step": 619
},
{
"epoch": 0.5385450597176982,
"grad_norm": 9.924412663417344,
"learning_rate": 5.1283929453745055e-06,
"loss": 4.2425,
"step": 620
},
{
"epoch": 0.539413680781759,
"grad_norm": 7.187624451476708,
"learning_rate": 5.1158262843218076e-06,
"loss": 4.509,
"step": 621
},
{
"epoch": 0.5402823018458197,
"grad_norm": 7.551854215214278,
"learning_rate": 5.103262640550986e-06,
"loss": 4.4906,
"step": 622
},
{
"epoch": 0.5411509229098805,
"grad_norm": 7.066809081049507,
"learning_rate": 5.090702112736295e-06,
"loss": 4.5479,
"step": 623
},
{
"epoch": 0.5420195439739414,
"grad_norm": 6.869001279033655,
"learning_rate": 5.078144799527513e-06,
"loss": 4.335,
"step": 624
},
{
"epoch": 0.5428881650380022,
"grad_norm": 6.184389891480793,
"learning_rate": 5.0655907995491726e-06,
"loss": 4.4417,
"step": 625
},
{
"epoch": 0.543756786102063,
"grad_norm": 5.725304760538053,
"learning_rate": 5.053040211399792e-06,
"loss": 4.4285,
"step": 626
},
{
"epoch": 0.5446254071661237,
"grad_norm": 6.858465748404722,
"learning_rate": 5.0404931336510785e-06,
"loss": 4.5386,
"step": 627
},
{
"epoch": 0.5454940282301846,
"grad_norm": 5.26607902680547,
"learning_rate": 5.027949664847182e-06,
"loss": 4.3599,
"step": 628
},
{
"epoch": 0.5463626492942454,
"grad_norm": 7.76854318501132,
"learning_rate": 5.015409903503903e-06,
"loss": 4.4496,
"step": 629
},
{
"epoch": 0.5472312703583062,
"grad_norm": 6.476156741121861,
"learning_rate": 5.00287394810792e-06,
"loss": 4.5308,
"step": 630
},
{
"epoch": 0.548099891422367,
"grad_norm": 5.759923281782965,
"learning_rate": 4.9903418971160276e-06,
"loss": 4.3798,
"step": 631
},
{
"epoch": 0.5489685124864278,
"grad_norm": 7.291747973328974,
"learning_rate": 4.977813848954349e-06,
"loss": 4.4205,
"step": 632
},
{
"epoch": 0.5498371335504886,
"grad_norm": 6.271882053028793,
"learning_rate": 4.9652899020175706e-06,
"loss": 4.4581,
"step": 633
},
{
"epoch": 0.5507057546145494,
"grad_norm": 7.284670931214164,
"learning_rate": 4.952770154668173e-06,
"loss": 4.5707,
"step": 634
},
{
"epoch": 0.5515743756786102,
"grad_norm": 4.328191558276064,
"learning_rate": 4.940254705235643e-06,
"loss": 4.5398,
"step": 635
},
{
"epoch": 0.552442996742671,
"grad_norm": 5.190845151993087,
"learning_rate": 4.927743652015723e-06,
"loss": 4.4066,
"step": 636
},
{
"epoch": 0.5533116178067318,
"grad_norm": 5.037059057279867,
"learning_rate": 4.915237093269624e-06,
"loss": 4.4995,
"step": 637
},
{
"epoch": 0.5541802388707926,
"grad_norm": 5.593175005468861,
"learning_rate": 4.902735127223251e-06,
"loss": 4.4209,
"step": 638
},
{
"epoch": 0.5550488599348534,
"grad_norm": 7.109606325525756,
"learning_rate": 4.890237852066449e-06,
"loss": 4.4592,
"step": 639
},
{
"epoch": 0.5559174809989142,
"grad_norm": 7.103955309366861,
"learning_rate": 4.877745365952214e-06,
"loss": 4.3799,
"step": 640
},
{
"epoch": 0.5567861020629751,
"grad_norm": 6.9509815611789785,
"learning_rate": 4.865257766995929e-06,
"loss": 4.4195,
"step": 641
},
{
"epoch": 0.5576547231270358,
"grad_norm": 5.206700465162741,
"learning_rate": 4.852775153274597e-06,
"loss": 4.4101,
"step": 642
},
{
"epoch": 0.5585233441910966,
"grad_norm": 9.493231366877055,
"learning_rate": 4.8402976228260665e-06,
"loss": 4.4044,
"step": 643
},
{
"epoch": 0.5593919652551574,
"grad_norm": 7.354474990771557,
"learning_rate": 4.827825273648259e-06,
"loss": 4.4041,
"step": 644
},
{
"epoch": 0.5602605863192183,
"grad_norm": 7.477143176038921,
"learning_rate": 4.8153582036984055e-06,
"loss": 4.2882,
"step": 645
},
{
"epoch": 0.5611292073832791,
"grad_norm": 6.891158599557778,
"learning_rate": 4.802896510892274e-06,
"loss": 4.4556,
"step": 646
},
{
"epoch": 0.5619978284473398,
"grad_norm": 6.181299599357454,
"learning_rate": 4.790440293103399e-06,
"loss": 4.5159,
"step": 647
},
{
"epoch": 0.5628664495114006,
"grad_norm": 6.219602356707123,
"learning_rate": 4.7779896481623165e-06,
"loss": 4.305,
"step": 648
},
{
"epoch": 0.5637350705754615,
"grad_norm": 7.689806502498994,
"learning_rate": 4.765544673855793e-06,
"loss": 4.588,
"step": 649
},
{
"epoch": 0.5646036916395223,
"grad_norm": 5.222351629918664,
"learning_rate": 4.753105467926058e-06,
"loss": 4.5114,
"step": 650
},
{
"epoch": 0.5654723127035831,
"grad_norm": 6.0422096833801255,
"learning_rate": 4.740672128070033e-06,
"loss": 4.4576,
"step": 651
},
{
"epoch": 0.5663409337676438,
"grad_norm": 6.35521507705001,
"learning_rate": 4.728244751938576e-06,
"loss": 4.2865,
"step": 652
},
{
"epoch": 0.5672095548317047,
"grad_norm": 5.486777370518167,
"learning_rate": 4.715823437135698e-06,
"loss": 4.3724,
"step": 653
},
{
"epoch": 0.5680781758957655,
"grad_norm": 6.005852070933836,
"learning_rate": 4.703408281217808e-06,
"loss": 4.3402,
"step": 654
},
{
"epoch": 0.5689467969598263,
"grad_norm": 5.5827526578094595,
"learning_rate": 4.690999381692943e-06,
"loss": 4.38,
"step": 655
},
{
"epoch": 0.5698154180238871,
"grad_norm": 4.834434200631698,
"learning_rate": 4.678596836020003e-06,
"loss": 4.3628,
"step": 656
},
{
"epoch": 0.5706840390879478,
"grad_norm": 5.54557315267325,
"learning_rate": 4.666200741607986e-06,
"loss": 4.423,
"step": 657
},
{
"epoch": 0.5715526601520087,
"grad_norm": 5.839997452850654,
"learning_rate": 4.6538111958152195e-06,
"loss": 4.4357,
"step": 658
},
{
"epoch": 0.5724212812160695,
"grad_norm": 5.117404141092577,
"learning_rate": 4.6414282959486015e-06,
"loss": 4.2396,
"step": 659
},
{
"epoch": 0.5732899022801303,
"grad_norm": 4.960346982680198,
"learning_rate": 4.62905213926283e-06,
"loss": 4.299,
"step": 660
},
{
"epoch": 0.5741585233441912,
"grad_norm": 5.666130352112717,
"learning_rate": 4.616682822959644e-06,
"loss": 4.3401,
"step": 661
},
{
"epoch": 0.5750271444082519,
"grad_norm": 8.838421793842883,
"learning_rate": 4.604320444187058e-06,
"loss": 4.5139,
"step": 662
},
{
"epoch": 0.5758957654723127,
"grad_norm": 6.818030373290299,
"learning_rate": 4.591965100038604e-06,
"loss": 4.3741,
"step": 663
},
{
"epoch": 0.5767643865363735,
"grad_norm": 5.238826628516235,
"learning_rate": 4.579616887552556e-06,
"loss": 4.3466,
"step": 664
},
{
"epoch": 0.5776330076004343,
"grad_norm": 6.774064034828669,
"learning_rate": 4.567275903711182e-06,
"loss": 4.5341,
"step": 665
},
{
"epoch": 0.5785016286644951,
"grad_norm": 10.5839617130325,
"learning_rate": 4.554942245439977e-06,
"loss": 4.55,
"step": 666
},
{
"epoch": 0.5793702497285559,
"grad_norm": 4.4556947282323875,
"learning_rate": 4.542616009606896e-06,
"loss": 4.4169,
"step": 667
},
{
"epoch": 0.5802388707926167,
"grad_norm": 7.295340591592647,
"learning_rate": 4.5302972930216035e-06,
"loss": 4.1999,
"step": 668
},
{
"epoch": 0.5811074918566775,
"grad_norm": 6.167555266777589,
"learning_rate": 4.5179861924347105e-06,
"loss": 4.4135,
"step": 669
},
{
"epoch": 0.5819761129207384,
"grad_norm": 4.262024109429587,
"learning_rate": 4.505682804537002e-06,
"loss": 4.3303,
"step": 670
},
{
"epoch": 0.5828447339847991,
"grad_norm": 5.436615646031177,
"learning_rate": 4.493387225958698e-06,
"loss": 4.2423,
"step": 671
},
{
"epoch": 0.5837133550488599,
"grad_norm": 7.121365514253581,
"learning_rate": 4.481099553268683e-06,
"loss": 4.3157,
"step": 672
},
{
"epoch": 0.5845819761129207,
"grad_norm": 4.638764375663413,
"learning_rate": 4.468819882973743e-06,
"loss": 4.591,
"step": 673
},
{
"epoch": 0.5854505971769816,
"grad_norm": 5.718150177859018,
"learning_rate": 4.456548311517818e-06,
"loss": 4.4703,
"step": 674
},
{
"epoch": 0.5863192182410424,
"grad_norm": 6.5924354876426445,
"learning_rate": 4.444284935281245e-06,
"loss": 4.4582,
"step": 675
},
{
"epoch": 0.5871878393051031,
"grad_norm": 5.619242120861036,
"learning_rate": 4.432029850579983e-06,
"loss": 4.2845,
"step": 676
},
{
"epoch": 0.5880564603691639,
"grad_norm": 8.851755293802896,
"learning_rate": 4.419783153664885e-06,
"loss": 4.4555,
"step": 677
},
{
"epoch": 0.5889250814332248,
"grad_norm": 4.981757116839891,
"learning_rate": 4.407544940720912e-06,
"loss": 4.4267,
"step": 678
},
{
"epoch": 0.5897937024972856,
"grad_norm": 8.597721909600045,
"learning_rate": 4.395315307866404e-06,
"loss": 4.3179,
"step": 679
},
{
"epoch": 0.5906623235613464,
"grad_norm": 6.495128819820371,
"learning_rate": 4.383094351152309e-06,
"loss": 4.4839,
"step": 680
},
{
"epoch": 0.5915309446254071,
"grad_norm": 7.608989760198819,
"learning_rate": 4.370882166561432e-06,
"loss": 4.4416,
"step": 681
},
{
"epoch": 0.5923995656894679,
"grad_norm": 5.854433397474583,
"learning_rate": 4.358678850007681e-06,
"loss": 4.3324,
"step": 682
},
{
"epoch": 0.5932681867535288,
"grad_norm": 7.407730937666916,
"learning_rate": 4.3464844973353215e-06,
"loss": 4.3833,
"step": 683
},
{
"epoch": 0.5941368078175896,
"grad_norm": 4.968500771960475,
"learning_rate": 4.334299204318208e-06,
"loss": 4.2341,
"step": 684
},
{
"epoch": 0.5950054288816504,
"grad_norm": 7.558518400906316,
"learning_rate": 4.322123066659048e-06,
"loss": 4.4444,
"step": 685
},
{
"epoch": 0.5958740499457111,
"grad_norm": 4.962150548530693,
"learning_rate": 4.309956179988641e-06,
"loss": 4.3576,
"step": 686
},
{
"epoch": 0.596742671009772,
"grad_norm": 6.938240211755133,
"learning_rate": 4.2977986398651285e-06,
"loss": 4.3146,
"step": 687
},
{
"epoch": 0.5976112920738328,
"grad_norm": 7.587763936061048,
"learning_rate": 4.285650541773243e-06,
"loss": 4.4289,
"step": 688
},
{
"epoch": 0.5984799131378936,
"grad_norm": 5.969538038161447,
"learning_rate": 4.273511981123569e-06,
"loss": 4.3759,
"step": 689
},
{
"epoch": 0.5993485342019544,
"grad_norm": 7.899549221564559,
"learning_rate": 4.261383053251773e-06,
"loss": 4.3071,
"step": 690
},
{
"epoch": 0.6002171552660152,
"grad_norm": 5.2445353058223505,
"learning_rate": 4.2492638534178695e-06,
"loss": 4.281,
"step": 691
},
{
"epoch": 0.601085776330076,
"grad_norm": 7.6245739463456985,
"learning_rate": 4.237154476805475e-06,
"loss": 4.2956,
"step": 692
},
{
"epoch": 0.6019543973941368,
"grad_norm": 5.470116425098481,
"learning_rate": 4.225055018521048e-06,
"loss": 4.2829,
"step": 693
},
{
"epoch": 0.6028230184581976,
"grad_norm": 8.639572149060786,
"learning_rate": 4.2129655735931514e-06,
"loss": 4.4738,
"step": 694
},
{
"epoch": 0.6036916395222585,
"grad_norm": 6.152417088405707,
"learning_rate": 4.200886236971707e-06,
"loss": 4.4568,
"step": 695
},
{
"epoch": 0.6045602605863192,
"grad_norm": 9.3108808124201,
"learning_rate": 4.188817103527238e-06,
"loss": 4.6181,
"step": 696
},
{
"epoch": 0.60542888165038,
"grad_norm": 6.812741933127771,
"learning_rate": 4.176758268050141e-06,
"loss": 4.3743,
"step": 697
},
{
"epoch": 0.6062975027144408,
"grad_norm": 7.946624143488305,
"learning_rate": 4.164709825249931e-06,
"loss": 4.3947,
"step": 698
},
{
"epoch": 0.6071661237785017,
"grad_norm": 7.52895947435545,
"learning_rate": 4.152671869754496e-06,
"loss": 4.4524,
"step": 699
},
{
"epoch": 0.6080347448425625,
"grad_norm": 6.78338934618417,
"learning_rate": 4.140644496109358e-06,
"loss": 4.4503,
"step": 700
},
{
"epoch": 0.6089033659066232,
"grad_norm": 7.448080226128123,
"learning_rate": 4.128627798776933e-06,
"loss": 4.353,
"step": 701
},
{
"epoch": 0.609771986970684,
"grad_norm": 5.9613691312207475,
"learning_rate": 4.116621872135782e-06,
"loss": 4.1222,
"step": 702
},
{
"epoch": 0.6106406080347448,
"grad_norm": 7.5790067996481625,
"learning_rate": 4.104626810479878e-06,
"loss": 4.3692,
"step": 703
},
{
"epoch": 0.6115092290988057,
"grad_norm": 5.453670887522505,
"learning_rate": 4.092642708017853e-06,
"loss": 4.491,
"step": 704
},
{
"epoch": 0.6123778501628665,
"grad_norm": 9.305782789439998,
"learning_rate": 4.080669658872275e-06,
"loss": 4.5958,
"step": 705
},
{
"epoch": 0.6132464712269272,
"grad_norm": 5.699148615360036,
"learning_rate": 4.068707757078895e-06,
"loss": 4.2548,
"step": 706
},
{
"epoch": 0.614115092290988,
"grad_norm": 7.911694729153124,
"learning_rate": 4.056757096585914e-06,
"loss": 4.4262,
"step": 707
},
{
"epoch": 0.6149837133550489,
"grad_norm": 6.872921512476096,
"learning_rate": 4.044817771253243e-06,
"loss": 4.3627,
"step": 708
},
{
"epoch": 0.6158523344191097,
"grad_norm": 7.476546715366341,
"learning_rate": 4.0328898748517715e-06,
"loss": 4.3003,
"step": 709
},
{
"epoch": 0.6167209554831705,
"grad_norm": 7.118852675240264,
"learning_rate": 4.020973501062621e-06,
"loss": 4.2213,
"step": 710
},
{
"epoch": 0.6175895765472312,
"grad_norm": 6.777324416098178,
"learning_rate": 4.009068743476418e-06,
"loss": 4.4026,
"step": 711
},
{
"epoch": 0.6184581976112921,
"grad_norm": 6.267261071143136,
"learning_rate": 3.997175695592558e-06,
"loss": 4.3549,
"step": 712
},
{
"epoch": 0.6193268186753529,
"grad_norm": 5.469188635248625,
"learning_rate": 3.985294450818465e-06,
"loss": 4.5069,
"step": 713
},
{
"epoch": 0.6201954397394137,
"grad_norm": 5.5464442937088645,
"learning_rate": 3.973425102468864e-06,
"loss": 4.3273,
"step": 714
},
{
"epoch": 0.6210640608034745,
"grad_norm": 6.3365698216315485,
"learning_rate": 3.961567743765047e-06,
"loss": 4.2548,
"step": 715
},
{
"epoch": 0.6219326818675353,
"grad_norm": 6.762739461810207,
"learning_rate": 3.949722467834136e-06,
"loss": 4.2804,
"step": 716
},
{
"epoch": 0.6228013029315961,
"grad_norm": 6.867786850485367,
"learning_rate": 3.9378893677083585e-06,
"loss": 4.4824,
"step": 717
},
{
"epoch": 0.6236699239956569,
"grad_norm": 4.426890287359104,
"learning_rate": 3.926068536324318e-06,
"loss": 4.2629,
"step": 718
},
{
"epoch": 0.6245385450597177,
"grad_norm": 5.764947147703928,
"learning_rate": 3.914260066522249e-06,
"loss": 4.4748,
"step": 719
},
{
"epoch": 0.6254071661237784,
"grad_norm": 6.449173453157844,
"learning_rate": 3.902464051045308e-06,
"loss": 4.2288,
"step": 720
},
{
"epoch": 0.6262757871878393,
"grad_norm": 7.156025732803798,
"learning_rate": 3.890680582538835e-06,
"loss": 4.471,
"step": 721
},
{
"epoch": 0.6271444082519001,
"grad_norm": 6.044472987310762,
"learning_rate": 3.878909753549621e-06,
"loss": 4.3782,
"step": 722
},
{
"epoch": 0.6280130293159609,
"grad_norm": 6.041979384384623,
"learning_rate": 3.867151656525191e-06,
"loss": 4.3686,
"step": 723
},
{
"epoch": 0.6288816503800218,
"grad_norm": 4.965615098879875,
"learning_rate": 3.8554063838130774e-06,
"loss": 4.3014,
"step": 724
},
{
"epoch": 0.6297502714440825,
"grad_norm": 6.077620537282477,
"learning_rate": 3.84367402766008e-06,
"loss": 4.5319,
"step": 725
},
{
"epoch": 0.6306188925081433,
"grad_norm": 7.348008590308795,
"learning_rate": 3.831954680211567e-06,
"loss": 4.4329,
"step": 726
},
{
"epoch": 0.6314875135722041,
"grad_norm": 4.148156488850514,
"learning_rate": 3.820248433510721e-06,
"loss": 4.3825,
"step": 727
},
{
"epoch": 0.6323561346362649,
"grad_norm": 5.868839788368692,
"learning_rate": 3.8085553794978464e-06,
"loss": 4.4061,
"step": 728
},
{
"epoch": 0.6332247557003258,
"grad_norm": 5.4956452972488465,
"learning_rate": 3.7968756100096264e-06,
"loss": 4.4363,
"step": 729
},
{
"epoch": 0.6340933767643865,
"grad_norm": 6.885854053993367,
"learning_rate": 3.7852092167784057e-06,
"loss": 4.4167,
"step": 730
},
{
"epoch": 0.6349619978284473,
"grad_norm": 3.9881851549507723,
"learning_rate": 3.7735562914314753e-06,
"loss": 4.2648,
"step": 731
},
{
"epoch": 0.6358306188925081,
"grad_norm": 3.4678484500146927,
"learning_rate": 3.761916925490355e-06,
"loss": 4.3543,
"step": 732
},
{
"epoch": 0.636699239956569,
"grad_norm": 5.980995299387367,
"learning_rate": 3.7502912103700573e-06,
"loss": 4.346,
"step": 733
},
{
"epoch": 0.6375678610206298,
"grad_norm": 6.151689157843326,
"learning_rate": 3.738679237378395e-06,
"loss": 4.4269,
"step": 734
},
{
"epoch": 0.6384364820846905,
"grad_norm": 5.540758455448802,
"learning_rate": 3.7270810977152437e-06,
"loss": 4.5648,
"step": 735
},
{
"epoch": 0.6393051031487513,
"grad_norm": 4.261398640280459,
"learning_rate": 3.7154968824718335e-06,
"loss": 4.1601,
"step": 736
},
{
"epoch": 0.6401737242128122,
"grad_norm": 5.048852018013335,
"learning_rate": 3.703926682630034e-06,
"loss": 4.1836,
"step": 737
},
{
"epoch": 0.641042345276873,
"grad_norm": 6.223488653795751,
"learning_rate": 3.6923705890616385e-06,
"loss": 4.2932,
"step": 738
},
{
"epoch": 0.6419109663409338,
"grad_norm": 4.00296364061148,
"learning_rate": 3.6808286925276476e-06,
"loss": 4.5271,
"step": 739
},
{
"epoch": 0.6427795874049945,
"grad_norm": 7.6337502276759,
"learning_rate": 3.669301083677563e-06,
"loss": 4.3391,
"step": 740
},
{
"epoch": 0.6436482084690553,
"grad_norm": 3.648965509349653,
"learning_rate": 3.657787853048671e-06,
"loss": 4.3708,
"step": 741
},
{
"epoch": 0.6445168295331162,
"grad_norm": 6.430179601615156,
"learning_rate": 3.6462890910653287e-06,
"loss": 4.2363,
"step": 742
},
{
"epoch": 0.645385450597177,
"grad_norm": 3.8810246258525414,
"learning_rate": 3.6348048880382603e-06,
"loss": 4.4739,
"step": 743
},
{
"epoch": 0.6462540716612378,
"grad_norm": 6.8458334527081615,
"learning_rate": 3.6233353341638434e-06,
"loss": 4.348,
"step": 744
},
{
"epoch": 0.6471226927252985,
"grad_norm": 3.560922995570656,
"learning_rate": 3.611880519523403e-06,
"loss": 4.3628,
"step": 745
},
{
"epoch": 0.6479913137893594,
"grad_norm": 5.703893391333462,
"learning_rate": 3.600440534082501e-06,
"loss": 4.2112,
"step": 746
},
{
"epoch": 0.6488599348534202,
"grad_norm": 5.21895941854659,
"learning_rate": 3.5890154676902346e-06,
"loss": 4.3305,
"step": 747
},
{
"epoch": 0.649728555917481,
"grad_norm": 4.447049515180316,
"learning_rate": 3.5776054100785223e-06,
"loss": 4.3878,
"step": 748
},
{
"epoch": 0.6505971769815418,
"grad_norm": 4.949990794413519,
"learning_rate": 3.56621045086141e-06,
"loss": 4.2244,
"step": 749
},
{
"epoch": 0.6514657980456026,
"grad_norm": 5.258986448566508,
"learning_rate": 3.554830679534357e-06,
"loss": 4.4186,
"step": 750
},
{
"epoch": 0.6523344191096634,
"grad_norm": 5.064550853101523,
"learning_rate": 3.5434661854735406e-06,
"loss": 4.3671,
"step": 751
},
{
"epoch": 0.6532030401737242,
"grad_norm": 5.6966971937871635,
"learning_rate": 3.532117057935151e-06,
"loss": 4.1835,
"step": 752
},
{
"epoch": 0.654071661237785,
"grad_norm": 5.913092142311183,
"learning_rate": 3.520783386054689e-06,
"loss": 4.414,
"step": 753
},
{
"epoch": 0.6549402823018459,
"grad_norm": 4.6289425679321985,
"learning_rate": 3.5094652588462685e-06,
"loss": 4.3397,
"step": 754
},
{
"epoch": 0.6558089033659066,
"grad_norm": 4.196557296390773,
"learning_rate": 3.498162765201918e-06,
"loss": 4.4643,
"step": 755
},
{
"epoch": 0.6566775244299674,
"grad_norm": 4.276537515669203,
"learning_rate": 3.486875993890874e-06,
"loss": 4.316,
"step": 756
},
{
"epoch": 0.6575461454940282,
"grad_norm": 3.638394175512923,
"learning_rate": 3.475605033558896e-06,
"loss": 4.2535,
"step": 757
},
{
"epoch": 0.6584147665580891,
"grad_norm": 3.7547136720071044,
"learning_rate": 3.4643499727275704e-06,
"loss": 4.4355,
"step": 758
},
{
"epoch": 0.6592833876221499,
"grad_norm": 4.529779878719585,
"learning_rate": 3.4531108997935956e-06,
"loss": 4.3798,
"step": 759
},
{
"epoch": 0.6601520086862106,
"grad_norm": 4.5346637615839205,
"learning_rate": 3.4418879030281133e-06,
"loss": 4.4147,
"step": 760
},
{
"epoch": 0.6610206297502714,
"grad_norm": 5.652353695284788,
"learning_rate": 3.430681070575999e-06,
"loss": 4.3372,
"step": 761
},
{
"epoch": 0.6618892508143323,
"grad_norm": 7.170934682558012,
"learning_rate": 3.419490490455176e-06,
"loss": 4.3881,
"step": 762
},
{
"epoch": 0.6627578718783931,
"grad_norm": 5.047705198747088,
"learning_rate": 3.408316250555922e-06,
"loss": 4.3122,
"step": 763
},
{
"epoch": 0.6636264929424539,
"grad_norm": 7.07536095984844,
"learning_rate": 3.3971584386401816e-06,
"loss": 4.3548,
"step": 764
},
{
"epoch": 0.6644951140065146,
"grad_norm": 3.3336263913275572,
"learning_rate": 3.386017142340867e-06,
"loss": 4.3743,
"step": 765
},
{
"epoch": 0.6653637350705754,
"grad_norm": 7.065332392195253,
"learning_rate": 3.374892449161187e-06,
"loss": 4.4953,
"step": 766
},
{
"epoch": 0.6662323561346363,
"grad_norm": 3.4501029280252937,
"learning_rate": 3.3637844464739492e-06,
"loss": 4.3351,
"step": 767
},
{
"epoch": 0.6671009771986971,
"grad_norm": 6.687004143766434,
"learning_rate": 3.352693221520867e-06,
"loss": 4.4546,
"step": 768
},
{
"epoch": 0.6679695982627579,
"grad_norm": 4.50307043857389,
"learning_rate": 3.341618861411887e-06,
"loss": 4.3398,
"step": 769
},
{
"epoch": 0.6688382193268186,
"grad_norm": 7.8703135527519,
"learning_rate": 3.3305614531245077e-06,
"loss": 4.2638,
"step": 770
},
{
"epoch": 0.6697068403908795,
"grad_norm": 3.922590217758762,
"learning_rate": 3.319521083503075e-06,
"loss": 4.2913,
"step": 771
},
{
"epoch": 0.6705754614549403,
"grad_norm": 8.346093676191689,
"learning_rate": 3.3084978392581223e-06,
"loss": 4.4285,
"step": 772
},
{
"epoch": 0.6714440825190011,
"grad_norm": 6.650194245577298,
"learning_rate": 3.2974918069656797e-06,
"loss": 4.2699,
"step": 773
},
{
"epoch": 0.6723127035830619,
"grad_norm": 6.560011908235111,
"learning_rate": 3.286503073066596e-06,
"loss": 4.4655,
"step": 774
},
{
"epoch": 0.6731813246471227,
"grad_norm": 7.055749636651488,
"learning_rate": 3.2755317238658585e-06,
"loss": 4.4159,
"step": 775
},
{
"epoch": 0.6740499457111835,
"grad_norm": 5.354746730373342,
"learning_rate": 3.2645778455319143e-06,
"loss": 4.3549,
"step": 776
},
{
"epoch": 0.6749185667752443,
"grad_norm": 5.146550880864029,
"learning_rate": 3.2536415240959954e-06,
"loss": 4.4694,
"step": 777
},
{
"epoch": 0.6757871878393051,
"grad_norm": 5.831672070421321,
"learning_rate": 3.2427228454514496e-06,
"loss": 4.1214,
"step": 778
},
{
"epoch": 0.6766558089033659,
"grad_norm": 4.521596862758205,
"learning_rate": 3.2318218953530485e-06,
"loss": 4.2677,
"step": 779
},
{
"epoch": 0.6775244299674267,
"grad_norm": 6.314265698622917,
"learning_rate": 3.2209387594163316e-06,
"loss": 4.412,
"step": 780
},
{
"epoch": 0.6783930510314875,
"grad_norm": 4.866756540705389,
"learning_rate": 3.2100735231169238e-06,
"loss": 4.4095,
"step": 781
},
{
"epoch": 0.6792616720955483,
"grad_norm": 6.493844920886384,
"learning_rate": 3.1992262717898687e-06,
"loss": 4.4263,
"step": 782
},
{
"epoch": 0.6801302931596092,
"grad_norm": 7.051427175831958,
"learning_rate": 3.1883970906289568e-06,
"loss": 4.3525,
"step": 783
},
{
"epoch": 0.6809989142236699,
"grad_norm": 6.86360370814285,
"learning_rate": 3.1775860646860566e-06,
"loss": 4.2127,
"step": 784
},
{
"epoch": 0.6818675352877307,
"grad_norm": 5.647923819665503,
"learning_rate": 3.1667932788704414e-06,
"loss": 4.252,
"step": 785
},
{
"epoch": 0.6827361563517915,
"grad_norm": 4.838224572668243,
"learning_rate": 3.1560188179481356e-06,
"loss": 4.3531,
"step": 786
},
{
"epoch": 0.6836047774158523,
"grad_norm": 6.70434806859771,
"learning_rate": 3.1452627665412384e-06,
"loss": 4.2778,
"step": 787
},
{
"epoch": 0.6844733984799132,
"grad_norm": 3.8436097880252174,
"learning_rate": 3.134525209127255e-06,
"loss": 4.2968,
"step": 788
},
{
"epoch": 0.6853420195439739,
"grad_norm": 6.343428669639347,
"learning_rate": 3.1238062300384464e-06,
"loss": 4.2696,
"step": 789
},
{
"epoch": 0.6862106406080347,
"grad_norm": 5.042121553913117,
"learning_rate": 3.1131059134611595e-06,
"loss": 4.3773,
"step": 790
},
{
"epoch": 0.6870792616720955,
"grad_norm": 5.935535940636119,
"learning_rate": 3.1024243434351653e-06,
"loss": 4.3472,
"step": 791
},
{
"epoch": 0.6879478827361564,
"grad_norm": 4.211630727724998,
"learning_rate": 3.0917616038530006e-06,
"loss": 4.3975,
"step": 792
},
{
"epoch": 0.6888165038002172,
"grad_norm": 5.39953776616512,
"learning_rate": 3.0811177784593086e-06,
"loss": 4.2212,
"step": 793
},
{
"epoch": 0.6896851248642779,
"grad_norm": 6.2670789214250355,
"learning_rate": 3.0704929508501836e-06,
"loss": 4.2709,
"step": 794
},
{
"epoch": 0.6905537459283387,
"grad_norm": 5.3978340661584285,
"learning_rate": 3.059887204472508e-06,
"loss": 4.3752,
"step": 795
},
{
"epoch": 0.6914223669923996,
"grad_norm": 4.222514400176097,
"learning_rate": 3.0493006226233067e-06,
"loss": 4.283,
"step": 796
},
{
"epoch": 0.6922909880564604,
"grad_norm": 6.363789170503971,
"learning_rate": 3.03873328844908e-06,
"loss": 4.2572,
"step": 797
},
{
"epoch": 0.6931596091205212,
"grad_norm": 4.996564390552643,
"learning_rate": 3.028185284945164e-06,
"loss": 4.2321,
"step": 798
},
{
"epoch": 0.6940282301845819,
"grad_norm": 3.7088045379057424,
"learning_rate": 3.01765669495507e-06,
"loss": 4.5402,
"step": 799
},
{
"epoch": 0.6948968512486428,
"grad_norm": 4.7675482950474475,
"learning_rate": 3.0071476011698387e-06,
"loss": 4.247,
"step": 800
},
{
"epoch": 0.6957654723127036,
"grad_norm": 5.639346110912645,
"learning_rate": 2.9966580861273847e-06,
"loss": 4.4099,
"step": 801
},
{
"epoch": 0.6966340933767644,
"grad_norm": 4.770187045539365,
"learning_rate": 2.9861882322118565e-06,
"loss": 4.1644,
"step": 802
},
{
"epoch": 0.6975027144408252,
"grad_norm": 3.6627706854689106,
"learning_rate": 2.9757381216529814e-06,
"loss": 4.3601,
"step": 803
},
{
"epoch": 0.698371335504886,
"grad_norm": 3.9453583597610256,
"learning_rate": 2.9653078365254267e-06,
"loss": 4.3775,
"step": 804
},
{
"epoch": 0.6992399565689468,
"grad_norm": 4.121043954930214,
"learning_rate": 2.954897458748147e-06,
"loss": 4.2391,
"step": 805
},
{
"epoch": 0.7001085776330076,
"grad_norm": 3.5024329761434974,
"learning_rate": 2.9445070700837486e-06,
"loss": 4.4102,
"step": 806
},
{
"epoch": 0.7009771986970684,
"grad_norm": 5.1118475735166085,
"learning_rate": 2.934136752137849e-06,
"loss": 4.3506,
"step": 807
},
{
"epoch": 0.7018458197611293,
"grad_norm": 4.68701671319949,
"learning_rate": 2.9237865863584204e-06,
"loss": 4.273,
"step": 808
},
{
"epoch": 0.70271444082519,
"grad_norm": 4.207677749842511,
"learning_rate": 2.9134566540351695e-06,
"loss": 4.4052,
"step": 809
},
{
"epoch": 0.7035830618892508,
"grad_norm": 5.588319045300857,
"learning_rate": 2.903147036298888e-06,
"loss": 4.3446,
"step": 810
},
{
"epoch": 0.7044516829533116,
"grad_norm": 4.694144532657754,
"learning_rate": 2.892857814120815e-06,
"loss": 4.3963,
"step": 811
},
{
"epoch": 0.7053203040173724,
"grad_norm": 5.261041999651582,
"learning_rate": 2.8825890683120087e-06,
"loss": 4.1755,
"step": 812
},
{
"epoch": 0.7061889250814333,
"grad_norm": 3.537979646262889,
"learning_rate": 2.8723408795227063e-06,
"loss": 4.3071,
"step": 813
},
{
"epoch": 0.707057546145494,
"grad_norm": 3.8989008087932753,
"learning_rate": 2.8621133282416836e-06,
"loss": 4.262,
"step": 814
},
{
"epoch": 0.7079261672095548,
"grad_norm": 3.9902100422123175,
"learning_rate": 2.8519064947956403e-06,
"loss": 4.4904,
"step": 815
},
{
"epoch": 0.7087947882736156,
"grad_norm": 5.206036718500678,
"learning_rate": 2.8417204593485566e-06,
"loss": 4.3229,
"step": 816
},
{
"epoch": 0.7096634093376765,
"grad_norm": 5.794935718355797,
"learning_rate": 2.831555301901061e-06,
"loss": 4.3783,
"step": 817
},
{
"epoch": 0.7105320304017373,
"grad_norm": 4.0421897856037114,
"learning_rate": 2.82141110228981e-06,
"loss": 4.3945,
"step": 818
},
{
"epoch": 0.711400651465798,
"grad_norm": 4.6514330236834,
"learning_rate": 2.811287940186866e-06,
"loss": 4.3617,
"step": 819
},
{
"epoch": 0.7122692725298588,
"grad_norm": 3.6423689253885194,
"learning_rate": 2.80118589509905e-06,
"loss": 4.2173,
"step": 820
},
{
"epoch": 0.7131378935939197,
"grad_norm": 3.6396624178466896,
"learning_rate": 2.791105046367341e-06,
"loss": 4.3947,
"step": 821
},
{
"epoch": 0.7140065146579805,
"grad_norm": 3.8347475901940054,
"learning_rate": 2.781045473166239e-06,
"loss": 4.1539,
"step": 822
},
{
"epoch": 0.7148751357220413,
"grad_norm": 3.9705888799663027,
"learning_rate": 2.771007254503149e-06,
"loss": 4.4397,
"step": 823
},
{
"epoch": 0.715743756786102,
"grad_norm": 3.6196867847154164,
"learning_rate": 2.7609904692177573e-06,
"loss": 4.0627,
"step": 824
},
{
"epoch": 0.7166123778501629,
"grad_norm": 3.1130920798740362,
"learning_rate": 2.750995195981412e-06,
"loss": 4.2884,
"step": 825
},
{
"epoch": 0.7174809989142237,
"grad_norm": 3.288307548785416,
"learning_rate": 2.7410215132965074e-06,
"loss": 4.3433,
"step": 826
},
{
"epoch": 0.7183496199782845,
"grad_norm": 3.5010329845706596,
"learning_rate": 2.7310694994958713e-06,
"loss": 4.2592,
"step": 827
},
{
"epoch": 0.7192182410423453,
"grad_norm": 3.0547168595737504,
"learning_rate": 2.721139232742137e-06,
"loss": 4.3653,
"step": 828
},
{
"epoch": 0.720086862106406,
"grad_norm": 3.1039092159267074,
"learning_rate": 2.711230791027144e-06,
"loss": 4.319,
"step": 829
},
{
"epoch": 0.7209554831704669,
"grad_norm": 3.000221643527686,
"learning_rate": 2.7013442521713157e-06,
"loss": 4.3168,
"step": 830
},
{
"epoch": 0.7218241042345277,
"grad_norm": 3.56612108454193,
"learning_rate": 2.691479693823053e-06,
"loss": 4.2912,
"step": 831
},
{
"epoch": 0.7226927252985885,
"grad_norm": 4.045486521817849,
"learning_rate": 2.6816371934581224e-06,
"loss": 4.4497,
"step": 832
},
{
"epoch": 0.7235613463626493,
"grad_norm": 5.06756019279877,
"learning_rate": 2.6718168283790502e-06,
"loss": 4.2895,
"step": 833
},
{
"epoch": 0.7244299674267101,
"grad_norm": 5.209971490598599,
"learning_rate": 2.6620186757145055e-06,
"loss": 4.3888,
"step": 834
},
{
"epoch": 0.7252985884907709,
"grad_norm": 5.759379090394831,
"learning_rate": 2.652242812418712e-06,
"loss": 4.3862,
"step": 835
},
{
"epoch": 0.7261672095548317,
"grad_norm": 4.223016994293722,
"learning_rate": 2.642489315270832e-06,
"loss": 4.3214,
"step": 836
},
{
"epoch": 0.7270358306188925,
"grad_norm": 4.5544158674627955,
"learning_rate": 2.632758260874358e-06,
"loss": 4.2358,
"step": 837
},
{
"epoch": 0.7279044516829533,
"grad_norm": 4.589085818499639,
"learning_rate": 2.6230497256565234e-06,
"loss": 4.1757,
"step": 838
},
{
"epoch": 0.7287730727470141,
"grad_norm": 4.716082178521095,
"learning_rate": 2.613363785867699e-06,
"loss": 4.2912,
"step": 839
},
{
"epoch": 0.7296416938110749,
"grad_norm": 4.0122133897679975,
"learning_rate": 2.6037005175807883e-06,
"loss": 4.2088,
"step": 840
},
{
"epoch": 0.7305103148751357,
"grad_norm": 4.519123777136473,
"learning_rate": 2.594059996690636e-06,
"loss": 4.3983,
"step": 841
},
{
"epoch": 0.7313789359391966,
"grad_norm": 4.366992105875965,
"learning_rate": 2.5844422989134294e-06,
"loss": 4.3502,
"step": 842
},
{
"epoch": 0.7322475570032573,
"grad_norm": 4.902796136843425,
"learning_rate": 2.574847499786103e-06,
"loss": 4.3827,
"step": 843
},
{
"epoch": 0.7331161780673181,
"grad_norm": 3.7309923778017335,
"learning_rate": 2.5652756746657474e-06,
"loss": 4.1343,
"step": 844
},
{
"epoch": 0.7339847991313789,
"grad_norm": 4.633964698796071,
"learning_rate": 2.5557268987290196e-06,
"loss": 4.2807,
"step": 845
},
{
"epoch": 0.7348534201954398,
"grad_norm": 3.158778304410855,
"learning_rate": 2.546201246971542e-06,
"loss": 4.1983,
"step": 846
},
{
"epoch": 0.7357220412595006,
"grad_norm": 4.280110117256575,
"learning_rate": 2.536698794207327e-06,
"loss": 4.1502,
"step": 847
},
{
"epoch": 0.7365906623235613,
"grad_norm": 4.019320202479935,
"learning_rate": 2.527219615068181e-06,
"loss": 4.269,
"step": 848
},
{
"epoch": 0.7374592833876221,
"grad_norm": 4.684956718222349,
"learning_rate": 2.517763784003121e-06,
"loss": 4.2774,
"step": 849
},
{
"epoch": 0.738327904451683,
"grad_norm": 4.72042770271221,
"learning_rate": 2.5083313752777893e-06,
"loss": 4.3012,
"step": 850
},
{
"epoch": 0.7391965255157438,
"grad_norm": 3.9457131710176028,
"learning_rate": 2.4989224629738705e-06,
"loss": 4.3457,
"step": 851
},
{
"epoch": 0.7400651465798046,
"grad_norm": 3.86871572131023,
"learning_rate": 2.4895371209885082e-06,
"loss": 4.2702,
"step": 852
},
{
"epoch": 0.7409337676438653,
"grad_norm": 4.162164409779829,
"learning_rate": 2.4801754230337287e-06,
"loss": 4.2957,
"step": 853
},
{
"epoch": 0.7418023887079261,
"grad_norm": 5.002185565803835,
"learning_rate": 2.4708374426358543e-06,
"loss": 4.2632,
"step": 854
},
{
"epoch": 0.742671009771987,
"grad_norm": 5.4293250327800004,
"learning_rate": 2.4615232531349332e-06,
"loss": 4.2441,
"step": 855
},
{
"epoch": 0.7435396308360478,
"grad_norm": 4.995810113032375,
"learning_rate": 2.452232927684166e-06,
"loss": 4.2673,
"step": 856
},
{
"epoch": 0.7444082519001086,
"grad_norm": 3.2917198811992803,
"learning_rate": 2.442966539249318e-06,
"loss": 4.1662,
"step": 857
},
{
"epoch": 0.7452768729641693,
"grad_norm": 4.814530155931897,
"learning_rate": 2.4337241606081587e-06,
"loss": 4.2989,
"step": 858
},
{
"epoch": 0.7461454940282302,
"grad_norm": 5.299603931940639,
"learning_rate": 2.424505864349886e-06,
"loss": 4.3474,
"step": 859
},
{
"epoch": 0.747014115092291,
"grad_norm": 5.282979213255703,
"learning_rate": 2.4153117228745543e-06,
"loss": 4.2909,
"step": 860
},
{
"epoch": 0.7478827361563518,
"grad_norm": 4.536156542709376,
"learning_rate": 2.4061418083925085e-06,
"loss": 4.3692,
"step": 861
},
{
"epoch": 0.7487513572204126,
"grad_norm": 5.026058475783841,
"learning_rate": 2.396996192923818e-06,
"loss": 4.3141,
"step": 862
},
{
"epoch": 0.7496199782844734,
"grad_norm": 5.465713560303808,
"learning_rate": 2.387874948297701e-06,
"loss": 4.3735,
"step": 863
},
{
"epoch": 0.7504885993485342,
"grad_norm": 3.9399735021333,
"learning_rate": 2.3787781461519786e-06,
"loss": 4.2175,
"step": 864
},
{
"epoch": 0.751357220412595,
"grad_norm": 5.576777900877066,
"learning_rate": 2.3697058579324976e-06,
"loss": 4.1978,
"step": 865
},
{
"epoch": 0.7522258414766558,
"grad_norm": 5.197291281490897,
"learning_rate": 2.3606581548925696e-06,
"loss": 4.3048,
"step": 866
},
{
"epoch": 0.7530944625407167,
"grad_norm": 3.767610507301271,
"learning_rate": 2.3516351080924206e-06,
"loss": 4.257,
"step": 867
},
{
"epoch": 0.7539630836047774,
"grad_norm": 5.305437866922308,
"learning_rate": 2.3426367883986254e-06,
"loss": 4.3791,
"step": 868
},
{
"epoch": 0.7548317046688382,
"grad_norm": 4.698829389303069,
"learning_rate": 2.333663266483555e-06,
"loss": 4.4556,
"step": 869
},
{
"epoch": 0.755700325732899,
"grad_norm": 4.351560277566826,
"learning_rate": 2.3247146128248183e-06,
"loss": 4.3409,
"step": 870
},
{
"epoch": 0.7565689467969599,
"grad_norm": 3.103240864167305,
"learning_rate": 2.3157908977047096e-06,
"loss": 4.1706,
"step": 871
},
{
"epoch": 0.7574375678610207,
"grad_norm": 5.956940596200794,
"learning_rate": 2.3068921912096585e-06,
"loss": 4.3036,
"step": 872
},
{
"epoch": 0.7583061889250814,
"grad_norm": 4.030266382079339,
"learning_rate": 2.2980185632296797e-06,
"loss": 4.3916,
"step": 873
},
{
"epoch": 0.7591748099891422,
"grad_norm": 4.907585920293589,
"learning_rate": 2.2891700834578175e-06,
"loss": 4.1033,
"step": 874
},
{
"epoch": 0.760043431053203,
"grad_norm": 3.297402834871266,
"learning_rate": 2.2803468213896063e-06,
"loss": 4.3624,
"step": 875
},
{
"epoch": 0.7609120521172639,
"grad_norm": 4.662439533859811,
"learning_rate": 2.2715488463225228e-06,
"loss": 4.3034,
"step": 876
},
{
"epoch": 0.7617806731813247,
"grad_norm": 5.165185490895554,
"learning_rate": 2.262776227355439e-06,
"loss": 4.2157,
"step": 877
},
{
"epoch": 0.7626492942453854,
"grad_norm": 7.506559487228573,
"learning_rate": 2.254029033388084e-06,
"loss": 4.4301,
"step": 878
},
{
"epoch": 0.7635179153094462,
"grad_norm": 3.9252824887426554,
"learning_rate": 2.2453073331204957e-06,
"loss": 4.2351,
"step": 879
},
{
"epoch": 0.7643865363735071,
"grad_norm": 5.290959184706457,
"learning_rate": 2.2366111950524906e-06,
"loss": 4.2538,
"step": 880
},
{
"epoch": 0.7652551574375679,
"grad_norm": 4.7171299694367175,
"learning_rate": 2.2279406874831164e-06,
"loss": 4.4357,
"step": 881
},
{
"epoch": 0.7661237785016287,
"grad_norm": 29.979748449798375,
"learning_rate": 2.2192958785101258e-06,
"loss": 4.3621,
"step": 882
},
{
"epoch": 0.7669923995656894,
"grad_norm": 7.860730063348475,
"learning_rate": 2.210676836029429e-06,
"loss": 4.3751,
"step": 883
},
{
"epoch": 0.7678610206297503,
"grad_norm": 4.334356696505269,
"learning_rate": 2.20208362773457e-06,
"loss": 4.2966,
"step": 884
},
{
"epoch": 0.7687296416938111,
"grad_norm": 7.332346221259919,
"learning_rate": 2.193516321116198e-06,
"loss": 4.2862,
"step": 885
},
{
"epoch": 0.7695982627578719,
"grad_norm": 5.9624071677426125,
"learning_rate": 2.1849749834615235e-06,
"loss": 4.3632,
"step": 886
},
{
"epoch": 0.7704668838219327,
"grad_norm": 4.954161674367398,
"learning_rate": 2.176459681853801e-06,
"loss": 4.2059,
"step": 887
},
{
"epoch": 0.7713355048859935,
"grad_norm": 6.378179042127337,
"learning_rate": 2.167970483171801e-06,
"loss": 4.166,
"step": 888
},
{
"epoch": 0.7722041259500543,
"grad_norm": 4.9618200288070575,
"learning_rate": 2.1595074540892815e-06,
"loss": 4.3356,
"step": 889
},
{
"epoch": 0.7730727470141151,
"grad_norm": 6.856421677907798,
"learning_rate": 2.1510706610744654e-06,
"loss": 4.3511,
"step": 890
},
{
"epoch": 0.7739413680781759,
"grad_norm": 4.410108580533019,
"learning_rate": 2.1426601703895195e-06,
"loss": 4.2937,
"step": 891
},
{
"epoch": 0.7748099891422368,
"grad_norm": 7.583217716536857,
"learning_rate": 2.13427604809003e-06,
"loss": 4.3183,
"step": 892
},
{
"epoch": 0.7756786102062975,
"grad_norm": 5.9361144172296685,
"learning_rate": 2.125918360024493e-06,
"loss": 4.374,
"step": 893
},
{
"epoch": 0.7765472312703583,
"grad_norm": 6.875470826753328,
"learning_rate": 2.117587171833789e-06,
"loss": 4.4312,
"step": 894
},
{
"epoch": 0.7774158523344191,
"grad_norm": 5.096832067096056,
"learning_rate": 2.109282548950667e-06,
"loss": 4.278,
"step": 895
},
{
"epoch": 0.77828447339848,
"grad_norm": 5.729522521702186,
"learning_rate": 2.1010045565992363e-06,
"loss": 4.2937,
"step": 896
},
{
"epoch": 0.7791530944625407,
"grad_norm": 6.742811437853955,
"learning_rate": 2.0927532597944496e-06,
"loss": 4.2172,
"step": 897
},
{
"epoch": 0.7800217155266015,
"grad_norm": 5.6301478715572975,
"learning_rate": 2.0845287233415963e-06,
"loss": 4.176,
"step": 898
},
{
"epoch": 0.7808903365906623,
"grad_norm": 5.235841496697818,
"learning_rate": 2.0763310118357893e-06,
"loss": 4.3747,
"step": 899
},
{
"epoch": 0.7817589576547231,
"grad_norm": 5.049217344169785,
"learning_rate": 2.06816018966146e-06,
"loss": 4.2626,
"step": 900
},
{
"epoch": 0.782627578718784,
"grad_norm": 4.5457661427037195,
"learning_rate": 2.060016320991853e-06,
"loss": 4.2773,
"step": 901
},
{
"epoch": 0.7834961997828447,
"grad_norm": 4.117834715315518,
"learning_rate": 2.051899469788522e-06,
"loss": 4.3088,
"step": 902
},
{
"epoch": 0.7843648208469055,
"grad_norm": 5.135053442236565,
"learning_rate": 2.043809699800824e-06,
"loss": 4.2918,
"step": 903
},
{
"epoch": 0.7852334419109663,
"grad_norm": 7.82388906554306,
"learning_rate": 2.0357470745654213e-06,
"loss": 4.3529,
"step": 904
},
{
"epoch": 0.7861020629750272,
"grad_norm": 4.837908722893104,
"learning_rate": 2.0277116574057905e-06,
"loss": 4.1562,
"step": 905
},
{
"epoch": 0.786970684039088,
"grad_norm": 3.68904796267135,
"learning_rate": 2.0197035114317056e-06,
"loss": 4.2128,
"step": 906
},
{
"epoch": 0.7878393051031487,
"grad_norm": 4.534016411952663,
"learning_rate": 2.0117226995387625e-06,
"loss": 4.3172,
"step": 907
},
{
"epoch": 0.7887079261672095,
"grad_norm": 3.526628215688476,
"learning_rate": 2.0037692844078728e-06,
"loss": 4.2893,
"step": 908
},
{
"epoch": 0.7895765472312704,
"grad_norm": 3.8435396264813457,
"learning_rate": 1.9958433285047766e-06,
"loss": 4.2629,
"step": 909
},
{
"epoch": 0.7904451682953312,
"grad_norm": 12.416597855589442,
"learning_rate": 1.9879448940795496e-06,
"loss": 4.1928,
"step": 910
},
{
"epoch": 0.791313789359392,
"grad_norm": 3.4260543752536354,
"learning_rate": 1.980074043166118e-06,
"loss": 4.3879,
"step": 911
},
{
"epoch": 0.7921824104234527,
"grad_norm": 4.024334100744361,
"learning_rate": 1.9722308375817616e-06,
"loss": 4.3358,
"step": 912
},
{
"epoch": 0.7930510314875135,
"grad_norm": 3.8937964884137704,
"learning_rate": 1.9644153389266428e-06,
"loss": 4.2585,
"step": 913
},
{
"epoch": 0.7939196525515744,
"grad_norm": 4.112423538802947,
"learning_rate": 1.9566276085833137e-06,
"loss": 4.3666,
"step": 914
},
{
"epoch": 0.7947882736156352,
"grad_norm": 3.361914528770081,
"learning_rate": 1.9488677077162294e-06,
"loss": 4.3395,
"step": 915
},
{
"epoch": 0.795656894679696,
"grad_norm": 3.201261785374445,
"learning_rate": 1.9411356972712802e-06,
"loss": 4.1024,
"step": 916
},
{
"epoch": 0.7965255157437567,
"grad_norm": 3.1715636963506193,
"learning_rate": 1.9334316379753037e-06,
"loss": 4.3268,
"step": 917
},
{
"epoch": 0.7973941368078176,
"grad_norm": 4.702760089867412,
"learning_rate": 1.92575559033561e-06,
"loss": 4.2872,
"step": 918
},
{
"epoch": 0.7982627578718784,
"grad_norm": 4.301084643411801,
"learning_rate": 1.91810761463951e-06,
"loss": 4.2491,
"step": 919
},
{
"epoch": 0.7991313789359392,
"grad_norm": 4.368620526838832,
"learning_rate": 1.9104877709538346e-06,
"loss": 4.1825,
"step": 920
},
{
"epoch": 0.8,
"grad_norm": 4.541552734260731,
"learning_rate": 1.902896119124471e-06,
"loss": 4.2936,
"step": 921
},
{
"epoch": 0.8008686210640608,
"grad_norm": 3.753504642364837,
"learning_rate": 1.8953327187758872e-06,
"loss": 4.2807,
"step": 922
},
{
"epoch": 0.8017372421281216,
"grad_norm": 4.1076335957004035,
"learning_rate": 1.8877976293106645e-06,
"loss": 4.3054,
"step": 923
},
{
"epoch": 0.8026058631921824,
"grad_norm": 5.022940647754388,
"learning_rate": 1.8802909099090328e-06,
"loss": 4.2605,
"step": 924
},
{
"epoch": 0.8034744842562432,
"grad_norm": 4.601415915720413,
"learning_rate": 1.8728126195284063e-06,
"loss": 4.2974,
"step": 925
},
{
"epoch": 0.8043431053203041,
"grad_norm": 3.5937684601582505,
"learning_rate": 1.8653628169029172e-06,
"loss": 4.4372,
"step": 926
},
{
"epoch": 0.8052117263843648,
"grad_norm": 4.087141421454951,
"learning_rate": 1.8579415605429566e-06,
"loss": 4.3578,
"step": 927
},
{
"epoch": 0.8060803474484256,
"grad_norm": 3.768303031259501,
"learning_rate": 1.850548908734715e-06,
"loss": 4.3954,
"step": 928
},
{
"epoch": 0.8069489685124864,
"grad_norm": 4.631014557901584,
"learning_rate": 1.843184919539724e-06,
"loss": 4.3692,
"step": 929
},
{
"epoch": 0.8078175895765473,
"grad_norm": 5.588991832935504,
"learning_rate": 1.8358496507944004e-06,
"loss": 4.3679,
"step": 930
},
{
"epoch": 0.8086862106406081,
"grad_norm": 4.760700300967851,
"learning_rate": 1.8285431601095932e-06,
"loss": 4.2824,
"step": 931
},
{
"epoch": 0.8095548317046688,
"grad_norm": 7.110512990765382,
"learning_rate": 1.8212655048701263e-06,
"loss": 4.276,
"step": 932
},
{
"epoch": 0.8104234527687296,
"grad_norm": 4.84933914886446,
"learning_rate": 1.8140167422343536e-06,
"loss": 4.3264,
"step": 933
},
{
"epoch": 0.8112920738327905,
"grad_norm": 6.250495292898315,
"learning_rate": 1.8067969291337111e-06,
"loss": 4.2783,
"step": 934
},
{
"epoch": 0.8121606948968513,
"grad_norm": 5.081084041830173,
"learning_rate": 1.7996061222722602e-06,
"loss": 4.1333,
"step": 935
},
{
"epoch": 0.8130293159609121,
"grad_norm": 4.764801399944141,
"learning_rate": 1.7924443781262537e-06,
"loss": 4.258,
"step": 936
},
{
"epoch": 0.8138979370249728,
"grad_norm": 12.564644718015563,
"learning_rate": 1.7853117529436853e-06,
"loss": 4.4282,
"step": 937
},
{
"epoch": 0.8147665580890336,
"grad_norm": 9.219100670998142,
"learning_rate": 1.7782083027438493e-06,
"loss": 4.4036,
"step": 938
},
{
"epoch": 0.8156351791530945,
"grad_norm": 3.513961026121951,
"learning_rate": 1.7711340833169027e-06,
"loss": 4.2192,
"step": 939
},
{
"epoch": 0.8165038002171553,
"grad_norm": 7.5648391009618985,
"learning_rate": 1.7640891502234242e-06,
"loss": 4.3509,
"step": 940
},
{
"epoch": 0.8173724212812161,
"grad_norm": 7.457786533569347,
"learning_rate": 1.7570735587939774e-06,
"loss": 4.4761,
"step": 941
},
{
"epoch": 0.8182410423452768,
"grad_norm": 6.24994158926114,
"learning_rate": 1.7500873641286822e-06,
"loss": 4.3524,
"step": 942
},
{
"epoch": 0.8191096634093377,
"grad_norm": 4.706302961561861,
"learning_rate": 1.7431306210967757e-06,
"loss": 4.4049,
"step": 943
},
{
"epoch": 0.8199782844733985,
"grad_norm": 5.2319379724074375,
"learning_rate": 1.7362033843361808e-06,
"loss": 4.3184,
"step": 944
},
{
"epoch": 0.8208469055374593,
"grad_norm": 3.9931677964283336,
"learning_rate": 1.7293057082530823e-06,
"loss": 4.3872,
"step": 945
},
{
"epoch": 0.8217155266015201,
"grad_norm": 5.122675213766239,
"learning_rate": 1.7224376470214965e-06,
"loss": 4.2414,
"step": 946
},
{
"epoch": 0.8225841476655809,
"grad_norm": 4.8271711783672595,
"learning_rate": 1.7155992545828459e-06,
"loss": 4.3155,
"step": 947
},
{
"epoch": 0.8234527687296417,
"grad_norm": 5.08549920131417,
"learning_rate": 1.708790584645536e-06,
"loss": 4.3494,
"step": 948
},
{
"epoch": 0.8243213897937025,
"grad_norm": 3.2828478853516265,
"learning_rate": 1.7020116906845314e-06,
"loss": 4.1676,
"step": 949
},
{
"epoch": 0.8251900108577633,
"grad_norm": 4.217032686568317,
"learning_rate": 1.6952626259409403e-06,
"loss": 4.2612,
"step": 950
},
{
"epoch": 0.826058631921824,
"grad_norm": 3.3093948746453967,
"learning_rate": 1.6885434434215928e-06,
"loss": 4.2619,
"step": 951
},
{
"epoch": 0.8269272529858849,
"grad_norm": 3.4071908680921816,
"learning_rate": 1.681854195898624e-06,
"loss": 4.3616,
"step": 952
},
{
"epoch": 0.8277958740499457,
"grad_norm": 3.950423518377903,
"learning_rate": 1.6751949359090608e-06,
"loss": 4.4618,
"step": 953
},
{
"epoch": 0.8286644951140065,
"grad_norm": 3.113460449412214,
"learning_rate": 1.6685657157544152e-06,
"loss": 4.2883,
"step": 954
},
{
"epoch": 0.8295331161780674,
"grad_norm": 3.978159856987336,
"learning_rate": 1.6619665875002589e-06,
"loss": 4.2919,
"step": 955
},
{
"epoch": 0.8304017372421281,
"grad_norm": 3.627190156440042,
"learning_rate": 1.655397602975829e-06,
"loss": 4.4444,
"step": 956
},
{
"epoch": 0.8312703583061889,
"grad_norm": 3.501429507504989,
"learning_rate": 1.6488588137736142e-06,
"loss": 4.3162,
"step": 957
},
{
"epoch": 0.8321389793702497,
"grad_norm": 3.912898082889782,
"learning_rate": 1.6423502712489498e-06,
"loss": 4.1927,
"step": 958
},
{
"epoch": 0.8330076004343105,
"grad_norm": 2.9544066919092002,
"learning_rate": 1.6358720265196162e-06,
"loss": 4.2873,
"step": 959
},
{
"epoch": 0.8338762214983714,
"grad_norm": 3.7766990336064423,
"learning_rate": 1.629424130465436e-06,
"loss": 4.2311,
"step": 960
},
{
"epoch": 0.8347448425624321,
"grad_norm": 3.187557348988122,
"learning_rate": 1.6230066337278721e-06,
"loss": 4.3319,
"step": 961
},
{
"epoch": 0.8356134636264929,
"grad_norm": 2.9035478405536184,
"learning_rate": 1.6166195867096379e-06,
"loss": 4.2798,
"step": 962
},
{
"epoch": 0.8364820846905537,
"grad_norm": 3.581073588055323,
"learning_rate": 1.6102630395742936e-06,
"loss": 4.2378,
"step": 963
},
{
"epoch": 0.8373507057546146,
"grad_norm": 2.9717432186991313,
"learning_rate": 1.603937042245851e-06,
"loss": 4.1643,
"step": 964
},
{
"epoch": 0.8382193268186754,
"grad_norm": 3.4682548739965195,
"learning_rate": 1.5976416444083919e-06,
"loss": 4.2895,
"step": 965
},
{
"epoch": 0.8390879478827361,
"grad_norm": 2.8257749424306917,
"learning_rate": 1.5913768955056669e-06,
"loss": 4.2387,
"step": 966
},
{
"epoch": 0.8399565689467969,
"grad_norm": 3.4045296792179585,
"learning_rate": 1.585142844740712e-06,
"loss": 4.2493,
"step": 967
},
{
"epoch": 0.8408251900108578,
"grad_norm": 3.632548936442941,
"learning_rate": 1.5789395410754624e-06,
"loss": 4.0125,
"step": 968
},
{
"epoch": 0.8416938110749186,
"grad_norm": 3.458367314452796,
"learning_rate": 1.5727670332303662e-06,
"loss": 4.2027,
"step": 969
},
{
"epoch": 0.8425624321389794,
"grad_norm": 3.5230657596459563,
"learning_rate": 1.5666253696840039e-06,
"loss": 4.2408,
"step": 970
},
{
"epoch": 0.8434310532030401,
"grad_norm": 3.0474045364092475,
"learning_rate": 1.5605145986727055e-06,
"loss": 4.1706,
"step": 971
},
{
"epoch": 0.844299674267101,
"grad_norm": 2.9068561874294243,
"learning_rate": 1.5544347681901708e-06,
"loss": 4.2883,
"step": 972
},
{
"epoch": 0.8451682953311618,
"grad_norm": 3.2628088807621385,
"learning_rate": 1.548385925987097e-06,
"loss": 4.2064,
"step": 973
},
{
"epoch": 0.8460369163952226,
"grad_norm": 2.877609574460853,
"learning_rate": 1.5423681195707995e-06,
"loss": 4.2439,
"step": 974
},
{
"epoch": 0.8469055374592834,
"grad_norm": 3.6963360738924007,
"learning_rate": 1.5363813962048404e-06,
"loss": 4.4093,
"step": 975
},
{
"epoch": 0.8477741585233441,
"grad_norm": 3.6396595948485686,
"learning_rate": 1.530425802908657e-06,
"loss": 4.2486,
"step": 976
},
{
"epoch": 0.848642779587405,
"grad_norm": 3.0025316303653593,
"learning_rate": 1.5245013864571915e-06,
"loss": 4.2436,
"step": 977
},
{
"epoch": 0.8495114006514658,
"grad_norm": 3.075143454138421,
"learning_rate": 1.518608193380527e-06,
"loss": 4.1952,
"step": 978
},
{
"epoch": 0.8503800217155266,
"grad_norm": 3.837382455029135,
"learning_rate": 1.5127462699635175e-06,
"loss": 4.3256,
"step": 979
},
{
"epoch": 0.8512486427795874,
"grad_norm": 3.912314406379558,
"learning_rate": 1.5069156622454286e-06,
"loss": 4.1805,
"step": 980
},
{
"epoch": 0.8521172638436482,
"grad_norm": 3.556579266906855,
"learning_rate": 1.5011164160195713e-06,
"loss": 4.187,
"step": 981
},
{
"epoch": 0.852985884907709,
"grad_norm": 4.767048077361486,
"learning_rate": 1.495348576832945e-06,
"loss": 4.3401,
"step": 982
},
{
"epoch": 0.8538545059717698,
"grad_norm": 3.34570276142316,
"learning_rate": 1.4896121899858855e-06,
"loss": 4.3152,
"step": 983
},
{
"epoch": 0.8547231270358306,
"grad_norm": 3.928790482520009,
"learning_rate": 1.4839073005316954e-06,
"loss": 4.4026,
"step": 984
},
{
"epoch": 0.8555917480998915,
"grad_norm": 3.896817042687782,
"learning_rate": 1.4782339532763035e-06,
"loss": 4.3404,
"step": 985
},
{
"epoch": 0.8564603691639522,
"grad_norm": 3.5320973494543972,
"learning_rate": 1.4725921927779053e-06,
"loss": 4.2046,
"step": 986
},
{
"epoch": 0.857328990228013,
"grad_norm": 4.983206804496437,
"learning_rate": 1.466982063346617e-06,
"loss": 4.2724,
"step": 987
},
{
"epoch": 0.8581976112920738,
"grad_norm": 2.6576025845538602,
"learning_rate": 1.4614036090441242e-06,
"loss": 4.2774,
"step": 988
},
{
"epoch": 0.8590662323561347,
"grad_norm": 3.890820577234005,
"learning_rate": 1.4558568736833403e-06,
"loss": 4.3634,
"step": 989
},
{
"epoch": 0.8599348534201955,
"grad_norm": 4.047293622617586,
"learning_rate": 1.450341900828055e-06,
"loss": 4.2452,
"step": 990
},
{
"epoch": 0.8608034744842562,
"grad_norm": 3.082152289805242,
"learning_rate": 1.4448587337926029e-06,
"loss": 4.2638,
"step": 991
},
{
"epoch": 0.861672095548317,
"grad_norm": 3.279358197940446,
"learning_rate": 1.4394074156415127e-06,
"loss": 4.4121,
"step": 992
},
{
"epoch": 0.8625407166123779,
"grad_norm": 3.527792553396294,
"learning_rate": 1.4339879891891745e-06,
"loss": 4.1801,
"step": 993
},
{
"epoch": 0.8634093376764387,
"grad_norm": 3.505923077019055,
"learning_rate": 1.4286004969995026e-06,
"loss": 4.2853,
"step": 994
},
{
"epoch": 0.8642779587404995,
"grad_norm": 3.286205070282355,
"learning_rate": 1.4232449813856024e-06,
"loss": 4.375,
"step": 995
},
{
"epoch": 0.8651465798045602,
"grad_norm": 3.931680228410995,
"learning_rate": 1.4179214844094354e-06,
"loss": 4.2502,
"step": 996
},
{
"epoch": 0.866015200868621,
"grad_norm": 2.4039398272576897,
"learning_rate": 1.4126300478814912e-06,
"loss": 4.208,
"step": 997
},
{
"epoch": 0.8668838219326819,
"grad_norm": 3.7568390710068518,
"learning_rate": 1.4073707133604553e-06,
"loss": 4.1628,
"step": 998
},
{
"epoch": 0.8677524429967427,
"grad_norm": 3.3414877541755748,
"learning_rate": 1.4021435221528907e-06,
"loss": 4.1127,
"step": 999
},
{
"epoch": 0.8686210640608035,
"grad_norm": 3.458776188975169,
"learning_rate": 1.3969485153129052e-06,
"loss": 4.1204,
"step": 1000
},
{
"epoch": 0.8694896851248642,
"grad_norm": 4.919567300721427,
"learning_rate": 1.3917857336418311e-06,
"loss": 4.3408,
"step": 1001
},
{
"epoch": 0.8703583061889251,
"grad_norm": 3.065606325345494,
"learning_rate": 1.3866552176879073e-06,
"loss": 3.9601,
"step": 1002
},
{
"epoch": 0.8712269272529859,
"grad_norm": 3.761007586350327,
"learning_rate": 1.3815570077459616e-06,
"loss": 4.2438,
"step": 1003
},
{
"epoch": 0.8720955483170467,
"grad_norm": 2.885986718619506,
"learning_rate": 1.3764911438570873e-06,
"loss": 4.26,
"step": 1004
},
{
"epoch": 0.8729641693811075,
"grad_norm": 3.826731301340684,
"learning_rate": 1.3714576658083356e-06,
"loss": 4.3967,
"step": 1005
},
{
"epoch": 0.8738327904451683,
"grad_norm": 2.7458285420431805,
"learning_rate": 1.366456613132402e-06,
"loss": 4.3103,
"step": 1006
},
{
"epoch": 0.8747014115092291,
"grad_norm": 3.0626194989917996,
"learning_rate": 1.3614880251073126e-06,
"loss": 4.3979,
"step": 1007
},
{
"epoch": 0.8755700325732899,
"grad_norm": 2.904930109177129,
"learning_rate": 1.356551940756119e-06,
"loss": 4.1057,
"step": 1008
},
{
"epoch": 0.8764386536373507,
"grad_norm": 2.8060273606690864,
"learning_rate": 1.3516483988465911e-06,
"loss": 4.2066,
"step": 1009
},
{
"epoch": 0.8773072747014115,
"grad_norm": 2.9968484430903852,
"learning_rate": 1.3467774378909088e-06,
"loss": 4.2804,
"step": 1010
},
{
"epoch": 0.8781758957654723,
"grad_norm": 2.4009421672150264,
"learning_rate": 1.3419390961453673e-06,
"loss": 4.2054,
"step": 1011
},
{
"epoch": 0.8790445168295331,
"grad_norm": 2.9054455101472914,
"learning_rate": 1.3371334116100692e-06,
"loss": 4.2161,
"step": 1012
},
{
"epoch": 0.8799131378935939,
"grad_norm": 2.7097147432315474,
"learning_rate": 1.332360422028629e-06,
"loss": 4.2635,
"step": 1013
},
{
"epoch": 0.8807817589576548,
"grad_norm": 3.4351233135843895,
"learning_rate": 1.3276201648878778e-06,
"loss": 4.235,
"step": 1014
},
{
"epoch": 0.8816503800217155,
"grad_norm": 3.021256304492239,
"learning_rate": 1.3229126774175663e-06,
"loss": 4.1398,
"step": 1015
},
{
"epoch": 0.8825190010857763,
"grad_norm": 4.561864873305928,
"learning_rate": 1.3182379965900755e-06,
"loss": 4.2196,
"step": 1016
},
{
"epoch": 0.8833876221498371,
"grad_norm": 2.25054567072425,
"learning_rate": 1.3135961591201234e-06,
"loss": 4.3461,
"step": 1017
},
{
"epoch": 0.884256243213898,
"grad_norm": 4.349538372833775,
"learning_rate": 1.3089872014644772e-06,
"loss": 4.1902,
"step": 1018
},
{
"epoch": 0.8851248642779588,
"grad_norm": 5.211240766793744,
"learning_rate": 1.3044111598216697e-06,
"loss": 4.127,
"step": 1019
},
{
"epoch": 0.8859934853420195,
"grad_norm": 3.915181006258746,
"learning_rate": 1.2998680701317116e-06,
"loss": 4.3165,
"step": 1020
},
{
"epoch": 0.8868621064060803,
"grad_norm": 2.634220197635655,
"learning_rate": 1.2953579680758102e-06,
"loss": 4.0896,
"step": 1021
},
{
"epoch": 0.8877307274701411,
"grad_norm": 4.444464060721951,
"learning_rate": 1.2908808890760898e-06,
"loss": 4.3389,
"step": 1022
},
{
"epoch": 0.888599348534202,
"grad_norm": 3.6060226328219223,
"learning_rate": 1.2864368682953144e-06,
"loss": 4.3551,
"step": 1023
},
{
"epoch": 0.8894679695982628,
"grad_norm": 5.166782973498534,
"learning_rate": 1.2820259406366086e-06,
"loss": 4.343,
"step": 1024
},
{
"epoch": 0.8903365906623235,
"grad_norm": 2.8773053428878397,
"learning_rate": 1.2776481407431858e-06,
"loss": 4.2031,
"step": 1025
},
{
"epoch": 0.8912052117263843,
"grad_norm": 4.339800538926299,
"learning_rate": 1.2733035029980764e-06,
"loss": 4.348,
"step": 1026
},
{
"epoch": 0.8920738327904452,
"grad_norm": 3.237109745257262,
"learning_rate": 1.2689920615238564e-06,
"loss": 4.4058,
"step": 1027
},
{
"epoch": 0.892942453854506,
"grad_norm": 3.5663716818909514,
"learning_rate": 1.2647138501823787e-06,
"loss": 4.3298,
"step": 1028
},
{
"epoch": 0.8938110749185668,
"grad_norm": 3.7872661083743613,
"learning_rate": 1.2604689025745097e-06,
"loss": 4.3178,
"step": 1029
},
{
"epoch": 0.8946796959826275,
"grad_norm": 3.555878208510988,
"learning_rate": 1.2562572520398636e-06,
"loss": 4.429,
"step": 1030
},
{
"epoch": 0.8955483170466884,
"grad_norm": 3.548292136180123,
"learning_rate": 1.2520789316565407e-06,
"loss": 4.2436,
"step": 1031
},
{
"epoch": 0.8964169381107492,
"grad_norm": 2.8754548971872453,
"learning_rate": 1.247933974240869e-06,
"loss": 4.2454,
"step": 1032
},
{
"epoch": 0.89728555917481,
"grad_norm": 3.1391372015122534,
"learning_rate": 1.2438224123471442e-06,
"loss": 4.2209,
"step": 1033
},
{
"epoch": 0.8981541802388708,
"grad_norm": 4.08265544167836,
"learning_rate": 1.2397442782673751e-06,
"loss": 4.2561,
"step": 1034
},
{
"epoch": 0.8990228013029316,
"grad_norm": 2.8484362391754097,
"learning_rate": 1.2356996040310312e-06,
"loss": 4.2269,
"step": 1035
},
{
"epoch": 0.8998914223669924,
"grad_norm": 3.802284151154688,
"learning_rate": 1.231688421404789e-06,
"loss": 4.149,
"step": 1036
},
{
"epoch": 0.9007600434310532,
"grad_norm": 2.8793289723328175,
"learning_rate": 1.2277107618922843e-06,
"loss": 4.2458,
"step": 1037
},
{
"epoch": 0.901628664495114,
"grad_norm": 3.515343781948761,
"learning_rate": 1.2237666567338632e-06,
"loss": 4.0914,
"step": 1038
},
{
"epoch": 0.9024972855591749,
"grad_norm": 3.5760188126248926,
"learning_rate": 1.2198561369063366e-06,
"loss": 4.1833,
"step": 1039
},
{
"epoch": 0.9033659066232356,
"grad_norm": 2.8127761697413787,
"learning_rate": 1.2159792331227404e-06,
"loss": 4.1005,
"step": 1040
},
{
"epoch": 0.9042345276872964,
"grad_norm": 3.4802083558560195,
"learning_rate": 1.212135975832091e-06,
"loss": 4.1616,
"step": 1041
},
{
"epoch": 0.9051031487513572,
"grad_norm": 3.3333225446939205,
"learning_rate": 1.2083263952191446e-06,
"loss": 4.3845,
"step": 1042
},
{
"epoch": 0.905971769815418,
"grad_norm": 2.88436141121272,
"learning_rate": 1.2045505212041644e-06,
"loss": 4.3108,
"step": 1043
},
{
"epoch": 0.9068403908794789,
"grad_norm": 3.6801662883359194,
"learning_rate": 1.200808383442684e-06,
"loss": 4.2905,
"step": 1044
},
{
"epoch": 0.9077090119435396,
"grad_norm": 3.019629347981631,
"learning_rate": 1.1971000113252726e-06,
"loss": 4.1987,
"step": 1045
},
{
"epoch": 0.9085776330076004,
"grad_norm": 4.409221785598703,
"learning_rate": 1.1934254339773074e-06,
"loss": 4.3084,
"step": 1046
},
{
"epoch": 0.9094462540716612,
"grad_norm": 2.709116535172474,
"learning_rate": 1.1897846802587395e-06,
"loss": 4.1717,
"step": 1047
},
{
"epoch": 0.9103148751357221,
"grad_norm": 3.8179262996054235,
"learning_rate": 1.1861777787638762e-06,
"loss": 4.3074,
"step": 1048
},
{
"epoch": 0.9111834961997829,
"grad_norm": 3.281229721724311,
"learning_rate": 1.1826047578211473e-06,
"loss": 4.5075,
"step": 1049
},
{
"epoch": 0.9120521172638436,
"grad_norm": 3.0456938909447113,
"learning_rate": 1.1790656454928866e-06,
"loss": 4.4835,
"step": 1050
},
{
"epoch": 0.9129207383279044,
"grad_norm": 3.2615534456726123,
"learning_rate": 1.1755604695751134e-06,
"loss": 4.3037,
"step": 1051
},
{
"epoch": 0.9137893593919653,
"grad_norm": 2.9749791444621163,
"learning_rate": 1.1720892575973095e-06,
"loss": 4.3342,
"step": 1052
},
{
"epoch": 0.9146579804560261,
"grad_norm": 3.3527992702942577,
"learning_rate": 1.1686520368222066e-06,
"loss": 4.2419,
"step": 1053
},
{
"epoch": 0.9155266015200869,
"grad_norm": 2.4945121180562797,
"learning_rate": 1.1652488342455726e-06,
"loss": 4.1137,
"step": 1054
},
{
"epoch": 0.9163952225841476,
"grad_norm": 3.2304134555988897,
"learning_rate": 1.161879676595996e-06,
"loss": 4.3529,
"step": 1055
},
{
"epoch": 0.9172638436482085,
"grad_norm": 6.349732062570278,
"learning_rate": 1.1585445903346784e-06,
"loss": 4.1944,
"step": 1056
},
{
"epoch": 0.9181324647122693,
"grad_norm": 2.9360851966959536,
"learning_rate": 1.1552436016552273e-06,
"loss": 4.1959,
"step": 1057
},
{
"epoch": 0.9190010857763301,
"grad_norm": 3.205644542692188,
"learning_rate": 1.1519767364834494e-06,
"loss": 4.3528,
"step": 1058
},
{
"epoch": 0.9198697068403909,
"grad_norm": 3.2880341446977126,
"learning_rate": 1.1487440204771454e-06,
"loss": 4.3303,
"step": 1059
},
{
"epoch": 0.9207383279044516,
"grad_norm": 3.6622193093061615,
"learning_rate": 1.1455454790259118e-06,
"loss": 4.2606,
"step": 1060
},
{
"epoch": 0.9216069489685125,
"grad_norm": 3.0436045507003873,
"learning_rate": 1.14238113725094e-06,
"loss": 4.3319,
"step": 1061
},
{
"epoch": 0.9224755700325733,
"grad_norm": 2.946350336011715,
"learning_rate": 1.1392510200048167e-06,
"loss": 4.3378,
"step": 1062
},
{
"epoch": 0.9233441910966341,
"grad_norm": 2.864144280137868,
"learning_rate": 1.1361551518713331e-06,
"loss": 4.3061,
"step": 1063
},
{
"epoch": 0.924212812160695,
"grad_norm": 2.742188676823669,
"learning_rate": 1.133093557165288e-06,
"loss": 4.4552,
"step": 1064
},
{
"epoch": 0.9250814332247557,
"grad_norm": 2.961292413358245,
"learning_rate": 1.1300662599322992e-06,
"loss": 4.2678,
"step": 1065
},
{
"epoch": 0.9259500542888165,
"grad_norm": 3.237248386245042,
"learning_rate": 1.1270732839486137e-06,
"loss": 4.3326,
"step": 1066
},
{
"epoch": 0.9268186753528773,
"grad_norm": 3.4990748997732877,
"learning_rate": 1.1241146527209192e-06,
"loss": 4.3578,
"step": 1067
},
{
"epoch": 0.9276872964169381,
"grad_norm": 2.895322090554506,
"learning_rate": 1.1211903894861655e-06,
"loss": 4.2311,
"step": 1068
},
{
"epoch": 0.9285559174809989,
"grad_norm": 3.687438443063813,
"learning_rate": 1.1183005172113743e-06,
"loss": 4.2228,
"step": 1069
},
{
"epoch": 0.9294245385450597,
"grad_norm": 2.6961355880795748,
"learning_rate": 1.1154450585934625e-06,
"loss": 4.0498,
"step": 1070
},
{
"epoch": 0.9302931596091205,
"grad_norm": 3.688749799792538,
"learning_rate": 1.1126240360590658e-06,
"loss": 4.1798,
"step": 1071
},
{
"epoch": 0.9311617806731813,
"grad_norm": 2.8110003232896625,
"learning_rate": 1.1098374717643587e-06,
"loss": 4.3087,
"step": 1072
},
{
"epoch": 0.9320304017372422,
"grad_norm": 2.5887554177535943,
"learning_rate": 1.1070853875948837e-06,
"loss": 4.2262,
"step": 1073
},
{
"epoch": 0.9328990228013029,
"grad_norm": 2.847071399167459,
"learning_rate": 1.1043678051653768e-06,
"loss": 4.2708,
"step": 1074
},
{
"epoch": 0.9337676438653637,
"grad_norm": 4.142401135834692,
"learning_rate": 1.1016847458195999e-06,
"loss": 4.1147,
"step": 1075
},
{
"epoch": 0.9346362649294245,
"grad_norm": 2.9501471418720557,
"learning_rate": 1.0990362306301725e-06,
"loss": 4.2384,
"step": 1076
},
{
"epoch": 0.9355048859934854,
"grad_norm": 2.501434571608401,
"learning_rate": 1.0964222803984048e-06,
"loss": 4.3386,
"step": 1077
},
{
"epoch": 0.9363735070575462,
"grad_norm": 3.0424764452901956,
"learning_rate": 1.0938429156541364e-06,
"loss": 4.3864,
"step": 1078
},
{
"epoch": 0.9372421281216069,
"grad_norm": 3.904570325899996,
"learning_rate": 1.0912981566555736e-06,
"loss": 4.1481,
"step": 1079
},
{
"epoch": 0.9381107491856677,
"grad_norm": 3.3812480967472127,
"learning_rate": 1.0887880233891307e-06,
"loss": 4.4023,
"step": 1080
},
{
"epoch": 0.9389793702497286,
"grad_norm": 3.5320678055096018,
"learning_rate": 1.0863125355692749e-06,
"loss": 4.1932,
"step": 1081
},
{
"epoch": 0.9398479913137894,
"grad_norm": 3.2937159160580904,
"learning_rate": 1.0838717126383676e-06,
"loss": 4.1515,
"step": 1082
},
{
"epoch": 0.9407166123778502,
"grad_norm": 3.9965990047935973,
"learning_rate": 1.081465573766515e-06,
"loss": 4.2552,
"step": 1083
},
{
"epoch": 0.9415852334419109,
"grad_norm": 2.6702475745108014,
"learning_rate": 1.079094137851415e-06,
"loss": 4.2498,
"step": 1084
},
{
"epoch": 0.9424538545059717,
"grad_norm": 4.745534492210504,
"learning_rate": 1.0767574235182125e-06,
"loss": 4.2509,
"step": 1085
},
{
"epoch": 0.9433224755700326,
"grad_norm": 2.632042835693663,
"learning_rate": 1.0744554491193483e-06,
"loss": 4.2512,
"step": 1086
},
{
"epoch": 0.9441910966340934,
"grad_norm": 4.902218185640278,
"learning_rate": 1.0721882327344199e-06,
"loss": 4.2946,
"step": 1087
},
{
"epoch": 0.9450597176981542,
"grad_norm": 3.611290466221663,
"learning_rate": 1.0699557921700337e-06,
"loss": 4.3242,
"step": 1088
},
{
"epoch": 0.9459283387622149,
"grad_norm": 4.159732582837829,
"learning_rate": 1.0677581449596724e-06,
"loss": 4.3335,
"step": 1089
},
{
"epoch": 0.9467969598262758,
"grad_norm": 3.3609545882421,
"learning_rate": 1.0655953083635507e-06,
"loss": 4.3966,
"step": 1090
},
{
"epoch": 0.9476655808903366,
"grad_norm": 3.43860026090556,
"learning_rate": 1.0634672993684828e-06,
"loss": 4.2277,
"step": 1091
},
{
"epoch": 0.9485342019543974,
"grad_norm": 3.0705569428179884,
"learning_rate": 1.0613741346877496e-06,
"loss": 4.2567,
"step": 1092
},
{
"epoch": 0.9494028230184582,
"grad_norm": 3.3793501615647195,
"learning_rate": 1.0593158307609649e-06,
"loss": 4.237,
"step": 1093
},
{
"epoch": 0.950271444082519,
"grad_norm": 2.929105704044937,
"learning_rate": 1.0572924037539494e-06,
"loss": 4.2117,
"step": 1094
},
{
"epoch": 0.9511400651465798,
"grad_norm": 2.71792555170425,
"learning_rate": 1.0553038695586018e-06,
"loss": 4.1614,
"step": 1095
},
{
"epoch": 0.9520086862106406,
"grad_norm": 3.78382633594629,
"learning_rate": 1.0533502437927722e-06,
"loss": 4.3529,
"step": 1096
},
{
"epoch": 0.9528773072747014,
"grad_norm": 2.607501362679155,
"learning_rate": 1.0514315418001456e-06,
"loss": 4.3052,
"step": 1097
},
{
"epoch": 0.9537459283387623,
"grad_norm": 3.663360468883577,
"learning_rate": 1.0495477786501138e-06,
"loss": 4.1874,
"step": 1098
},
{
"epoch": 0.954614549402823,
"grad_norm": 2.630120856617326,
"learning_rate": 1.0476989691376622e-06,
"loss": 4.242,
"step": 1099
},
{
"epoch": 0.9554831704668838,
"grad_norm": 3.823941956325378,
"learning_rate": 1.045885127783252e-06,
"loss": 4.2043,
"step": 1100
},
{
"epoch": 0.9563517915309446,
"grad_norm": 2.312883220474864,
"learning_rate": 1.0441062688327051e-06,
"loss": 4.1473,
"step": 1101
},
{
"epoch": 0.9572204125950055,
"grad_norm": 4.094343889130672,
"learning_rate": 1.0423624062570952e-06,
"loss": 4.222,
"step": 1102
},
{
"epoch": 0.9580890336590663,
"grad_norm": 2.209599527243729,
"learning_rate": 1.0406535537526343e-06,
"loss": 4.1423,
"step": 1103
},
{
"epoch": 0.958957654723127,
"grad_norm": 3.382906333029422,
"learning_rate": 1.0389797247405677e-06,
"loss": 4.2021,
"step": 1104
},
{
"epoch": 0.9598262757871878,
"grad_norm": 2.1618780172948004,
"learning_rate": 1.0373409323670688e-06,
"loss": 4.2185,
"step": 1105
},
{
"epoch": 0.9606948968512486,
"grad_norm": 2.734044937681216,
"learning_rate": 1.0357371895031331e-06,
"loss": 4.1892,
"step": 1106
},
{
"epoch": 0.9615635179153095,
"grad_norm": 3.0779816158117237,
"learning_rate": 1.0341685087444804e-06,
"loss": 4.1711,
"step": 1107
},
{
"epoch": 0.9624321389793703,
"grad_norm": 3.4747018762690973,
"learning_rate": 1.0326349024114533e-06,
"loss": 4.1973,
"step": 1108
},
{
"epoch": 0.963300760043431,
"grad_norm": 36.83010937108777,
"learning_rate": 1.0311363825489225e-06,
"loss": 4.3603,
"step": 1109
},
{
"epoch": 0.9641693811074918,
"grad_norm": 4.281579688730939,
"learning_rate": 1.0296729609261914e-06,
"loss": 4.2369,
"step": 1110
},
{
"epoch": 0.9650380021715527,
"grad_norm": 3.6658084904084647,
"learning_rate": 1.0282446490369017e-06,
"loss": 4.328,
"step": 1111
},
{
"epoch": 0.9659066232356135,
"grad_norm": 3.1538713927243083,
"learning_rate": 1.0268514580989476e-06,
"loss": 4.224,
"step": 1112
},
{
"epoch": 0.9667752442996743,
"grad_norm": 2.9277894965124895,
"learning_rate": 1.025493399054383e-06,
"loss": 4.1622,
"step": 1113
},
{
"epoch": 0.967643865363735,
"grad_norm": 2.792456827835999,
"learning_rate": 1.0241704825693384e-06,
"loss": 4.1866,
"step": 1114
},
{
"epoch": 0.9685124864277959,
"grad_norm": 3.0405461234372178,
"learning_rate": 1.0228827190339365e-06,
"loss": 4.2563,
"step": 1115
},
{
"epoch": 0.9693811074918567,
"grad_norm": 2.456925962421864,
"learning_rate": 1.0216301185622093e-06,
"loss": 4.2406,
"step": 1116
},
{
"epoch": 0.9702497285559175,
"grad_norm": 2.556980498653978,
"learning_rate": 1.0204126909920216e-06,
"loss": 4.2689,
"step": 1117
},
{
"epoch": 0.9711183496199783,
"grad_norm": 2.494568998681425,
"learning_rate": 1.0192304458849905e-06,
"loss": 4.1788,
"step": 1118
},
{
"epoch": 0.971986970684039,
"grad_norm": 2.414367486391019,
"learning_rate": 1.0180833925264123e-06,
"loss": 4.3123,
"step": 1119
},
{
"epoch": 0.9728555917480999,
"grad_norm": 2.5637239329618797,
"learning_rate": 1.0169715399251886e-06,
"loss": 4.1474,
"step": 1120
},
{
"epoch": 0.9737242128121607,
"grad_norm": 2.1036887441385,
"learning_rate": 1.0158948968137562e-06,
"loss": 4.1269,
"step": 1121
},
{
"epoch": 0.9745928338762215,
"grad_norm": 2.922835583562659,
"learning_rate": 1.0148534716480189e-06,
"loss": 4.3625,
"step": 1122
},
{
"epoch": 0.9754614549402822,
"grad_norm": 9.43946034741677,
"learning_rate": 1.013847272607279e-06,
"loss": 4.1683,
"step": 1123
},
{
"epoch": 0.9763300760043431,
"grad_norm": 4.165841700664373,
"learning_rate": 1.0128763075941765e-06,
"loss": 4.2542,
"step": 1124
},
{
"epoch": 0.9771986970684039,
"grad_norm": 2.839928915059915,
"learning_rate": 1.0119405842346225e-06,
"loss": 4.3801,
"step": 1125
},
{
"epoch": 0.9780673181324647,
"grad_norm": 2.8834509246127547,
"learning_rate": 1.0110401098777443e-06,
"loss": 4.1791,
"step": 1126
},
{
"epoch": 0.9789359391965256,
"grad_norm": 3.323021602743116,
"learning_rate": 1.010174891595824e-06,
"loss": 4.2308,
"step": 1127
},
{
"epoch": 0.9798045602605863,
"grad_norm": 2.8595797575000574,
"learning_rate": 1.0093449361842436e-06,
"loss": 4.163,
"step": 1128
},
{
"epoch": 0.9806731813246471,
"grad_norm": 2.6315144637856416,
"learning_rate": 1.0085502501614336e-06,
"loss": 4.2709,
"step": 1129
},
{
"epoch": 0.9815418023887079,
"grad_norm": 3.743912990675417,
"learning_rate": 1.00779083976882e-06,
"loss": 4.378,
"step": 1130
},
{
"epoch": 0.9824104234527687,
"grad_norm": 2.5760124356799627,
"learning_rate": 1.007066710970775e-06,
"loss": 4.3711,
"step": 1131
},
{
"epoch": 0.9832790445168296,
"grad_norm": 3.24899714712198,
"learning_rate": 1.0063778694545713e-06,
"loss": 4.413,
"step": 1132
},
{
"epoch": 0.9841476655808903,
"grad_norm": 2.651957802148301,
"learning_rate": 1.0057243206303377e-06,
"loss": 4.1993,
"step": 1133
},
{
"epoch": 0.9850162866449511,
"grad_norm": 2.2387986170054557,
"learning_rate": 1.0051060696310137e-06,
"loss": 4.0838,
"step": 1134
},
{
"epoch": 0.9858849077090119,
"grad_norm": 3.0411821712384324,
"learning_rate": 1.0045231213123148e-06,
"loss": 4.3682,
"step": 1135
},
{
"epoch": 0.9867535287730728,
"grad_norm": 4.325765653367129,
"learning_rate": 1.0039754802526882e-06,
"loss": 4.2435,
"step": 1136
},
{
"epoch": 0.9876221498371336,
"grad_norm": 2.974916977057162,
"learning_rate": 1.00346315075328e-06,
"loss": 4.1753,
"step": 1137
},
{
"epoch": 0.9884907709011943,
"grad_norm": 2.953803498662762,
"learning_rate": 1.002986136837902e-06,
"loss": 4.1284,
"step": 1138
},
{
"epoch": 0.9893593919652551,
"grad_norm": 3.8781438570753104,
"learning_rate": 1.0025444422529981e-06,
"loss": 4.2912,
"step": 1139
},
{
"epoch": 0.990228013029316,
"grad_norm": 2.751967293645226,
"learning_rate": 1.0021380704676165e-06,
"loss": 4.2889,
"step": 1140
},
{
"epoch": 0.9910966340933768,
"grad_norm": 3.6019606680877354,
"learning_rate": 1.001767024673382e-06,
"loss": 4.3346,
"step": 1141
},
{
"epoch": 0.9919652551574376,
"grad_norm": 4.860956653468777,
"learning_rate": 1.00143130778447e-06,
"loss": 4.4012,
"step": 1142
},
{
"epoch": 0.9928338762214983,
"grad_norm": 2.978905973214082,
"learning_rate": 1.0011309224375852e-06,
"loss": 4.2442,
"step": 1143
},
{
"epoch": 0.9937024972855591,
"grad_norm": 3.8701647090673466,
"learning_rate": 1.0008658709919392e-06,
"loss": 4.1964,
"step": 1144
},
{
"epoch": 0.99457111834962,
"grad_norm": 3.288978549917795,
"learning_rate": 1.0006361555292336e-06,
"loss": 4.3262,
"step": 1145
},
{
"epoch": 0.9954397394136808,
"grad_norm": 2.960844019977527,
"learning_rate": 1.0004417778536422e-06,
"loss": 4.3462,
"step": 1146
},
{
"epoch": 0.9963083604777416,
"grad_norm": 3.7853750361182774,
"learning_rate": 1.0002827394917987e-06,
"loss": 4.1935,
"step": 1147
},
{
"epoch": 0.9971769815418023,
"grad_norm": 3.018359866548487,
"learning_rate": 1.000159041692782e-06,
"loss": 4.2258,
"step": 1148
},
{
"epoch": 0.9980456026058632,
"grad_norm": 3.5491363826838827,
"learning_rate": 1.0000706854281087e-06,
"loss": 4.3101,
"step": 1149
},
{
"epoch": 0.998914223669924,
"grad_norm": 2.819477984506924,
"learning_rate": 1.000017671391725e-06,
"loss": 4.2138,
"step": 1150
},
{
"epoch": 0.9997828447339848,
"grad_norm": 4.035987734141358,
"learning_rate": 1e-06,
"loss": 4.2367,
"step": 1151
},
{
"epoch": 0.9997828447339848,
"step": 1151,
"total_flos": 995560529133568.0,
"train_loss": 4.727850101182608,
"train_runtime": 34373.7603,
"train_samples_per_second": 2.143,
"train_steps_per_second": 0.033
}
],
"logging_steps": 1.0,
"max_steps": 1151,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 995560529133568.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}