{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 4325,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00023121387283236994,
"grad_norm": 0.6965160472814916,
"learning_rate": 4.6189376443418015e-07,
"loss": 1.086,
"step": 1
},
{
"epoch": 0.0011560693641618498,
"grad_norm": 0.6835787858040867,
"learning_rate": 2.309468822170901e-06,
"loss": 1.128,
"step": 5
},
{
"epoch": 0.0023121387283236996,
"grad_norm": 0.6184758767742636,
"learning_rate": 4.618937644341802e-06,
"loss": 1.1188,
"step": 10
},
{
"epoch": 0.003468208092485549,
"grad_norm": 0.6078620387437725,
"learning_rate": 6.928406466512702e-06,
"loss": 1.1243,
"step": 15
},
{
"epoch": 0.004624277456647399,
"grad_norm": 0.5222130979899404,
"learning_rate": 9.237875288683604e-06,
"loss": 1.1013,
"step": 20
},
{
"epoch": 0.005780346820809248,
"grad_norm": 0.4982345960164489,
"learning_rate": 1.1547344110854504e-05,
"loss": 1.0928,
"step": 25
},
{
"epoch": 0.006936416184971098,
"grad_norm": 0.40171891380185876,
"learning_rate": 1.3856812933025404e-05,
"loss": 1.0736,
"step": 30
},
{
"epoch": 0.008092485549132947,
"grad_norm": 0.41106693801534305,
"learning_rate": 1.6166281755196306e-05,
"loss": 1.0672,
"step": 35
},
{
"epoch": 0.009248554913294798,
"grad_norm": 0.4109497681037648,
"learning_rate": 1.8475750577367208e-05,
"loss": 1.0258,
"step": 40
},
{
"epoch": 0.010404624277456647,
"grad_norm": 0.379459396453402,
"learning_rate": 2.0785219399538107e-05,
"loss": 1.0102,
"step": 45
},
{
"epoch": 0.011560693641618497,
"grad_norm": 0.3721675211937845,
"learning_rate": 2.309468822170901e-05,
"loss": 1.0234,
"step": 50
},
{
"epoch": 0.012716763005780347,
"grad_norm": 0.373222147309766,
"learning_rate": 2.540415704387991e-05,
"loss": 0.992,
"step": 55
},
{
"epoch": 0.013872832369942197,
"grad_norm": 0.3702598509393352,
"learning_rate": 2.771362586605081e-05,
"loss": 1.0326,
"step": 60
},
{
"epoch": 0.015028901734104046,
"grad_norm": 0.35490433661120324,
"learning_rate": 3.0023094688221707e-05,
"loss": 1.0246,
"step": 65
},
{
"epoch": 0.016184971098265895,
"grad_norm": 0.34072939280119285,
"learning_rate": 3.233256351039261e-05,
"loss": 1.0115,
"step": 70
},
{
"epoch": 0.017341040462427744,
"grad_norm": 0.3831809542290894,
"learning_rate": 3.464203233256351e-05,
"loss": 1.0331,
"step": 75
},
{
"epoch": 0.018497109826589597,
"grad_norm": 0.3623596879709559,
"learning_rate": 3.6951501154734416e-05,
"loss": 1.0022,
"step": 80
},
{
"epoch": 0.019653179190751446,
"grad_norm": 0.3931577119185935,
"learning_rate": 3.9260969976905315e-05,
"loss": 0.9997,
"step": 85
},
{
"epoch": 0.020809248554913295,
"grad_norm": 0.4109841798218973,
"learning_rate": 4.1570438799076213e-05,
"loss": 0.9914,
"step": 90
},
{
"epoch": 0.021965317919075144,
"grad_norm": 0.40213543229926174,
"learning_rate": 4.387990762124711e-05,
"loss": 0.9643,
"step": 95
},
{
"epoch": 0.023121387283236993,
"grad_norm": 0.3657003831875091,
"learning_rate": 4.618937644341802e-05,
"loss": 0.9957,
"step": 100
},
{
"epoch": 0.024277456647398842,
"grad_norm": 0.38609844859658143,
"learning_rate": 4.8498845265588916e-05,
"loss": 1.0111,
"step": 105
},
{
"epoch": 0.025433526011560695,
"grad_norm": 0.3583592530014769,
"learning_rate": 5.080831408775982e-05,
"loss": 0.9963,
"step": 110
},
{
"epoch": 0.026589595375722544,
"grad_norm": 0.3554977804638662,
"learning_rate": 5.311778290993071e-05,
"loss": 0.9817,
"step": 115
},
{
"epoch": 0.027745664739884393,
"grad_norm": 0.3840657570276599,
"learning_rate": 5.542725173210162e-05,
"loss": 1.0232,
"step": 120
},
{
"epoch": 0.028901734104046242,
"grad_norm": 0.3774759292695808,
"learning_rate": 5.7736720554272516e-05,
"loss": 1.0139,
"step": 125
},
{
"epoch": 0.03005780346820809,
"grad_norm": 0.360754142577289,
"learning_rate": 6.0046189376443415e-05,
"loss": 0.9982,
"step": 130
},
{
"epoch": 0.03121387283236994,
"grad_norm": 0.3811963374124325,
"learning_rate": 6.235565819861431e-05,
"loss": 1.0088,
"step": 135
},
{
"epoch": 0.03236994219653179,
"grad_norm": 0.3667939507833288,
"learning_rate": 6.466512702078523e-05,
"loss": 0.9887,
"step": 140
},
{
"epoch": 0.03352601156069364,
"grad_norm": 0.3988058302280759,
"learning_rate": 6.697459584295612e-05,
"loss": 0.9513,
"step": 145
},
{
"epoch": 0.03468208092485549,
"grad_norm": 0.3603023427371918,
"learning_rate": 6.928406466512702e-05,
"loss": 1.0239,
"step": 150
},
{
"epoch": 0.035838150289017344,
"grad_norm": 0.321539455275297,
"learning_rate": 7.159353348729792e-05,
"loss": 0.9821,
"step": 155
},
{
"epoch": 0.03699421965317919,
"grad_norm": 0.3548185437448054,
"learning_rate": 7.390300230946883e-05,
"loss": 1.0028,
"step": 160
},
{
"epoch": 0.03815028901734104,
"grad_norm": 0.33174112797447736,
"learning_rate": 7.621247113163973e-05,
"loss": 0.9852,
"step": 165
},
{
"epoch": 0.03930635838150289,
"grad_norm": 0.3428952862407286,
"learning_rate": 7.852193995381063e-05,
"loss": 1.0035,
"step": 170
},
{
"epoch": 0.04046242774566474,
"grad_norm": 0.3164096194909282,
"learning_rate": 8.083140877598153e-05,
"loss": 0.9594,
"step": 175
},
{
"epoch": 0.04161849710982659,
"grad_norm": 0.3210772711947753,
"learning_rate": 8.314087759815243e-05,
"loss": 0.9751,
"step": 180
},
{
"epoch": 0.04277456647398844,
"grad_norm": 0.31913209612903376,
"learning_rate": 8.545034642032334e-05,
"loss": 0.9816,
"step": 185
},
{
"epoch": 0.04393063583815029,
"grad_norm": 0.32476583892299626,
"learning_rate": 8.775981524249422e-05,
"loss": 1.0382,
"step": 190
},
{
"epoch": 0.04508670520231214,
"grad_norm": 0.33804033601960015,
"learning_rate": 9.006928406466512e-05,
"loss": 1.0278,
"step": 195
},
{
"epoch": 0.046242774566473986,
"grad_norm": 0.3034185595284857,
"learning_rate": 9.237875288683603e-05,
"loss": 0.9209,
"step": 200
},
{
"epoch": 0.047398843930635835,
"grad_norm": 0.30966667195251285,
"learning_rate": 9.468822170900693e-05,
"loss": 0.9929,
"step": 205
},
{
"epoch": 0.048554913294797684,
"grad_norm": 0.3110271670167101,
"learning_rate": 9.699769053117783e-05,
"loss": 0.999,
"step": 210
},
{
"epoch": 0.04971098265895954,
"grad_norm": 0.3168339881753396,
"learning_rate": 9.930715935334873e-05,
"loss": 0.9801,
"step": 215
},
{
"epoch": 0.05086705202312139,
"grad_norm": 0.2980534767723837,
"learning_rate": 0.00010161662817551964,
"loss": 0.964,
"step": 220
},
{
"epoch": 0.05202312138728324,
"grad_norm": 0.29439975733255125,
"learning_rate": 0.00010392609699769054,
"loss": 1.0141,
"step": 225
},
{
"epoch": 0.05317919075144509,
"grad_norm": 0.2939300503984728,
"learning_rate": 0.00010623556581986143,
"loss": 1.029,
"step": 230
},
{
"epoch": 0.05433526011560694,
"grad_norm": 0.2921065506824694,
"learning_rate": 0.00010854503464203234,
"loss": 1.0008,
"step": 235
},
{
"epoch": 0.055491329479768786,
"grad_norm": 0.29281624873588324,
"learning_rate": 0.00011085450346420324,
"loss": 1.0286,
"step": 240
},
{
"epoch": 0.056647398843930635,
"grad_norm": 0.30008365055838865,
"learning_rate": 0.00011316397228637415,
"loss": 1.0009,
"step": 245
},
{
"epoch": 0.057803468208092484,
"grad_norm": 0.2821304520615669,
"learning_rate": 0.00011547344110854503,
"loss": 1.0172,
"step": 250
},
{
"epoch": 0.058959537572254334,
"grad_norm": 0.28925572801814625,
"learning_rate": 0.00011778290993071594,
"loss": 1.0106,
"step": 255
},
{
"epoch": 0.06011560693641618,
"grad_norm": 0.2979979142081887,
"learning_rate": 0.00012009237875288683,
"loss": 0.9851,
"step": 260
},
{
"epoch": 0.06127167630057803,
"grad_norm": 0.27608874909832487,
"learning_rate": 0.00012240184757505776,
"loss": 0.9386,
"step": 265
},
{
"epoch": 0.06242774566473988,
"grad_norm": 0.270653606190121,
"learning_rate": 0.00012471131639722863,
"loss": 0.9823,
"step": 270
},
{
"epoch": 0.06358381502890173,
"grad_norm": 0.2889956899438749,
"learning_rate": 0.00012702078521939955,
"loss": 1.0096,
"step": 275
},
{
"epoch": 0.06473988439306358,
"grad_norm": 0.30488051626437424,
"learning_rate": 0.00012933025404157045,
"loss": 0.9819,
"step": 280
},
{
"epoch": 0.06589595375722543,
"grad_norm": 0.2884447508894878,
"learning_rate": 0.00013163972286374135,
"loss": 0.9468,
"step": 285
},
{
"epoch": 0.06705202312138728,
"grad_norm": 0.2755861252646099,
"learning_rate": 0.00013394919168591225,
"loss": 0.9828,
"step": 290
},
{
"epoch": 0.06820809248554913,
"grad_norm": 0.3025742735794824,
"learning_rate": 0.00013625866050808315,
"loss": 1.0178,
"step": 295
},
{
"epoch": 0.06936416184971098,
"grad_norm": 0.28423295151616335,
"learning_rate": 0.00013856812933025404,
"loss": 0.9945,
"step": 300
},
{
"epoch": 0.07052023121387284,
"grad_norm": 0.27585086154702476,
"learning_rate": 0.00014087759815242494,
"loss": 0.9695,
"step": 305
},
{
"epoch": 0.07167630057803469,
"grad_norm": 0.3038442212974576,
"learning_rate": 0.00014318706697459584,
"loss": 0.9808,
"step": 310
},
{
"epoch": 0.07283236994219654,
"grad_norm": 0.27277935404650916,
"learning_rate": 0.00014549653579676674,
"loss": 0.9846,
"step": 315
},
{
"epoch": 0.07398843930635839,
"grad_norm": 0.29591421870168255,
"learning_rate": 0.00014780600461893767,
"loss": 0.9954,
"step": 320
},
{
"epoch": 0.07514450867052024,
"grad_norm": 0.2823015918956852,
"learning_rate": 0.00015011547344110854,
"loss": 0.9674,
"step": 325
},
{
"epoch": 0.07630057803468208,
"grad_norm": 0.27344790533462154,
"learning_rate": 0.00015242494226327946,
"loss": 1.0236,
"step": 330
},
{
"epoch": 0.07745664739884393,
"grad_norm": 0.2949347092864385,
"learning_rate": 0.00015473441108545036,
"loss": 0.9813,
"step": 335
},
{
"epoch": 0.07861271676300578,
"grad_norm": 0.272235850063355,
"learning_rate": 0.00015704387990762126,
"loss": 0.987,
"step": 340
},
{
"epoch": 0.07976878612716763,
"grad_norm": 0.28565982534338485,
"learning_rate": 0.00015935334872979216,
"loss": 0.9716,
"step": 345
},
{
"epoch": 0.08092485549132948,
"grad_norm": 0.28274111136822716,
"learning_rate": 0.00016166281755196306,
"loss": 0.9779,
"step": 350
},
{
"epoch": 0.08208092485549133,
"grad_norm": 0.2878620812403313,
"learning_rate": 0.00016397228637413396,
"loss": 1.0209,
"step": 355
},
{
"epoch": 0.08323699421965318,
"grad_norm": 0.2865629307216328,
"learning_rate": 0.00016628175519630485,
"loss": 0.9921,
"step": 360
},
{
"epoch": 0.08439306358381503,
"grad_norm": 0.2826816337246457,
"learning_rate": 0.00016859122401847575,
"loss": 0.9348,
"step": 365
},
{
"epoch": 0.08554913294797688,
"grad_norm": 0.2881907360895622,
"learning_rate": 0.00017090069284064668,
"loss": 1.0231,
"step": 370
},
{
"epoch": 0.08670520231213873,
"grad_norm": 0.27638308021239116,
"learning_rate": 0.00017321016166281755,
"loss": 1.0079,
"step": 375
},
{
"epoch": 0.08786127167630058,
"grad_norm": 0.28798387568151884,
"learning_rate": 0.00017551963048498845,
"loss": 0.9165,
"step": 380
},
{
"epoch": 0.08901734104046242,
"grad_norm": 0.2767437621962458,
"learning_rate": 0.00017782909930715937,
"loss": 0.9489,
"step": 385
},
{
"epoch": 0.09017341040462427,
"grad_norm": 0.2877748564441354,
"learning_rate": 0.00018013856812933024,
"loss": 1.0022,
"step": 390
},
{
"epoch": 0.09132947976878612,
"grad_norm": 0.28925322685576144,
"learning_rate": 0.00018244803695150117,
"loss": 1.004,
"step": 395
},
{
"epoch": 0.09248554913294797,
"grad_norm": 0.31070512225405156,
"learning_rate": 0.00018475750577367207,
"loss": 0.9753,
"step": 400
},
{
"epoch": 0.09364161849710982,
"grad_norm": 0.31735644814371816,
"learning_rate": 0.00018706697459584297,
"loss": 0.9781,
"step": 405
},
{
"epoch": 0.09479768786127167,
"grad_norm": 0.2974104968051762,
"learning_rate": 0.00018937644341801387,
"loss": 0.9659,
"step": 410
},
{
"epoch": 0.09595375722543352,
"grad_norm": 0.2701025540904289,
"learning_rate": 0.00019168591224018476,
"loss": 0.9294,
"step": 415
},
{
"epoch": 0.09710982658959537,
"grad_norm": 0.27428411358071536,
"learning_rate": 0.00019399538106235566,
"loss": 0.9513,
"step": 420
},
{
"epoch": 0.09826589595375723,
"grad_norm": 0.2745240121214777,
"learning_rate": 0.0001963048498845266,
"loss": 1.0152,
"step": 425
},
{
"epoch": 0.09942196531791908,
"grad_norm": 0.274405426645142,
"learning_rate": 0.00019861431870669746,
"loss": 0.9863,
"step": 430
},
{
"epoch": 0.10057803468208093,
"grad_norm": 0.9048729498402529,
"learning_rate": 0.00019999986968812804,
"loss": 0.9992,
"step": 435
},
{
"epoch": 0.10173410404624278,
"grad_norm": 0.29062903824052616,
"learning_rate": 0.00019999840368346898,
"loss": 1.0509,
"step": 440
},
{
"epoch": 0.10289017341040463,
"grad_norm": 0.28317998532870003,
"learning_rate": 0.0001999953088082702,
"loss": 0.9774,
"step": 445
},
{
"epoch": 0.10404624277456648,
"grad_norm": 0.3000365098515979,
"learning_rate": 0.000199990585112944,
"loss": 0.9822,
"step": 450
},
{
"epoch": 0.10520231213872833,
"grad_norm": 0.28508661433222776,
"learning_rate": 0.00019998423267443454,
"loss": 0.9991,
"step": 455
},
{
"epoch": 0.10635838150289018,
"grad_norm": 0.300390972643917,
"learning_rate": 0.00019997625159621642,
"loss": 0.9411,
"step": 460
},
{
"epoch": 0.10751445086705202,
"grad_norm": 0.3065580188577536,
"learning_rate": 0.0001999666420082932,
"loss": 1.0153,
"step": 465
},
{
"epoch": 0.10867052023121387,
"grad_norm": 0.30176770344761106,
"learning_rate": 0.00019995540406719507,
"loss": 0.9451,
"step": 470
},
{
"epoch": 0.10982658959537572,
"grad_norm": 0.28363462288539226,
"learning_rate": 0.0001999425379559765,
"loss": 1.0229,
"step": 475
},
{
"epoch": 0.11098265895953757,
"grad_norm": 0.2979648349669768,
"learning_rate": 0.00019992804388421312,
"loss": 0.9615,
"step": 480
},
{
"epoch": 0.11213872832369942,
"grad_norm": 0.3533699255705701,
"learning_rate": 0.00019991192208799837,
"loss": 0.9945,
"step": 485
},
{
"epoch": 0.11329479768786127,
"grad_norm": 0.29374690183301444,
"learning_rate": 0.0001998941728299396,
"loss": 0.9481,
"step": 490
},
{
"epoch": 0.11445086705202312,
"grad_norm": 0.27638233259638606,
"learning_rate": 0.0001998747963991539,
"loss": 0.976,
"step": 495
},
{
"epoch": 0.11560693641618497,
"grad_norm": 0.29591583372167063,
"learning_rate": 0.00019985379311126327,
"loss": 0.9776,
"step": 500
},
{
"epoch": 0.11676300578034682,
"grad_norm": 0.28596466519406494,
"learning_rate": 0.00019983116330838955,
"loss": 1.0003,
"step": 505
},
{
"epoch": 0.11791907514450867,
"grad_norm": 0.27570121782043344,
"learning_rate": 0.00019980690735914877,
"loss": 0.9797,
"step": 510
},
{
"epoch": 0.11907514450867052,
"grad_norm": 0.30038501431153675,
"learning_rate": 0.0001997810256586453,
"loss": 0.9865,
"step": 515
},
{
"epoch": 0.12023121387283237,
"grad_norm": 0.31132305182282943,
"learning_rate": 0.00019975351862846523,
"loss": 1.0071,
"step": 520
},
{
"epoch": 0.12138728323699421,
"grad_norm": 0.28621069496048757,
"learning_rate": 0.00019972438671666967,
"loss": 0.9877,
"step": 525
},
{
"epoch": 0.12254335260115606,
"grad_norm": 0.3030051142967184,
"learning_rate": 0.00019969363039778728,
"loss": 0.9894,
"step": 530
},
{
"epoch": 0.12369942196531791,
"grad_norm": 0.2866405334142299,
"learning_rate": 0.0001996612501728067,
"loss": 0.9893,
"step": 535
},
{
"epoch": 0.12485549132947976,
"grad_norm": 0.2867249309375898,
"learning_rate": 0.00019962724656916826,
"loss": 0.9765,
"step": 540
},
{
"epoch": 0.1260115606936416,
"grad_norm": 0.31798324367678865,
"learning_rate": 0.00019959162014075553,
"loss": 0.9465,
"step": 545
},
{
"epoch": 0.12716763005780346,
"grad_norm": 0.30253196958089823,
"learning_rate": 0.0001995543714678861,
"loss": 0.9635,
"step": 550
},
{
"epoch": 0.1283236994219653,
"grad_norm": 0.28954658171708875,
"learning_rate": 0.00019951550115730244,
"loss": 1.0041,
"step": 555
},
{
"epoch": 0.12947976878612716,
"grad_norm": 0.28359238422516453,
"learning_rate": 0.00019947500984216157,
"loss": 0.9837,
"step": 560
},
{
"epoch": 0.130635838150289,
"grad_norm": 0.29624734780777734,
"learning_rate": 0.00019943289818202519,
"loss": 0.9375,
"step": 565
},
{
"epoch": 0.13179190751445086,
"grad_norm": 0.291198302886137,
"learning_rate": 0.0001993891668628486,
"loss": 0.9665,
"step": 570
},
{
"epoch": 0.1329479768786127,
"grad_norm": 0.31056892991094237,
"learning_rate": 0.00019934381659696989,
"loss": 0.9414,
"step": 575
},
{
"epoch": 0.13410404624277455,
"grad_norm": 0.3136680287460065,
"learning_rate": 0.0001992968481230978,
"loss": 1.0442,
"step": 580
},
{
"epoch": 0.1352601156069364,
"grad_norm": 0.30128304774644027,
"learning_rate": 0.0001992482622063003,
"loss": 0.9916,
"step": 585
},
{
"epoch": 0.13641618497109825,
"grad_norm": 0.29097714851626455,
"learning_rate": 0.00019919805963799166,
"loss": 0.9947,
"step": 590
},
{
"epoch": 0.1375722543352601,
"grad_norm": 0.280417582339227,
"learning_rate": 0.0001991462412359198,
"loss": 0.9825,
"step": 595
},
{
"epoch": 0.13872832369942195,
"grad_norm": 0.28842456996684646,
"learning_rate": 0.00019909280784415287,
"loss": 1.0237,
"step": 600
},
{
"epoch": 0.13988439306358383,
"grad_norm": 0.2874190385868597,
"learning_rate": 0.00019903776033306555,
"loss": 0.9611,
"step": 605
},
{
"epoch": 0.14104046242774568,
"grad_norm": 0.2854599682323198,
"learning_rate": 0.00019898109959932478,
"loss": 0.9879,
"step": 610
},
{
"epoch": 0.14219653179190753,
"grad_norm": 0.2986976522432421,
"learning_rate": 0.0001989228265658754,
"loss": 0.9911,
"step": 615
},
{
"epoch": 0.14335260115606938,
"grad_norm": 0.329057703574734,
"learning_rate": 0.00019886294218192477,
"loss": 0.9714,
"step": 620
},
{
"epoch": 0.14450867052023122,
"grad_norm": 0.29538740567126964,
"learning_rate": 0.00019880144742292753,
"loss": 0.955,
"step": 625
},
{
"epoch": 0.14566473988439307,
"grad_norm": 0.32615973261215037,
"learning_rate": 0.00019873834329056975,
"loss": 0.9789,
"step": 630
},
{
"epoch": 0.14682080924855492,
"grad_norm": 0.26819671057976713,
"learning_rate": 0.00019867363081275242,
"loss": 0.9471,
"step": 635
},
{
"epoch": 0.14797687861271677,
"grad_norm": 0.28488339516835476,
"learning_rate": 0.00019860731104357485,
"loss": 0.9779,
"step": 640
},
{
"epoch": 0.14913294797687862,
"grad_norm": 0.27998012828783303,
"learning_rate": 0.00019853938506331749,
"loss": 0.951,
"step": 645
},
{
"epoch": 0.15028901734104047,
"grad_norm": 0.29388395642325527,
"learning_rate": 0.00019846985397842427,
"loss": 0.9554,
"step": 650
},
{
"epoch": 0.15144508670520232,
"grad_norm": 0.2896808457234832,
"learning_rate": 0.0001983987189214846,
"loss": 0.9651,
"step": 655
},
{
"epoch": 0.15260115606936417,
"grad_norm": 0.30032384976542736,
"learning_rate": 0.000198325981051215,
"loss": 0.943,
"step": 660
},
{
"epoch": 0.15375722543352602,
"grad_norm": 0.2977312487124492,
"learning_rate": 0.00019825164155244012,
"loss": 0.9887,
"step": 665
},
{
"epoch": 0.15491329479768787,
"grad_norm": 0.341662364901083,
"learning_rate": 0.00019817570163607347,
"loss": 1.0059,
"step": 670
},
{
"epoch": 0.15606936416184972,
"grad_norm": 0.2996891760902693,
"learning_rate": 0.00019809816253909773,
"loss": 0.9673,
"step": 675
},
{
"epoch": 0.15722543352601157,
"grad_norm": 0.2882952073089595,
"learning_rate": 0.00019801902552454454,
"loss": 0.9558,
"step": 680
},
{
"epoch": 0.15838150289017341,
"grad_norm": 0.3203389676438109,
"learning_rate": 0.00019793829188147406,
"loss": 1.0122,
"step": 685
},
{
"epoch": 0.15953757225433526,
"grad_norm": 0.30270844487268483,
"learning_rate": 0.00019785596292495376,
"loss": 0.9822,
"step": 690
},
{
"epoch": 0.1606936416184971,
"grad_norm": 0.27999850496563145,
"learning_rate": 0.00019777203999603717,
"loss": 0.9841,
"step": 695
},
{
"epoch": 0.16184971098265896,
"grad_norm": 0.29383524324706056,
"learning_rate": 0.000197686524461742,
"loss": 1.0269,
"step": 700
},
{
"epoch": 0.1630057803468208,
"grad_norm": 0.28066838492867907,
"learning_rate": 0.0001975994177150278,
"loss": 0.9927,
"step": 705
},
{
"epoch": 0.16416184971098266,
"grad_norm": 0.31122106212206363,
"learning_rate": 0.0001975107211747734,
"loss": 0.9632,
"step": 710
},
{
"epoch": 0.1653179190751445,
"grad_norm": 0.30120413493928255,
"learning_rate": 0.00019742043628575364,
"loss": 0.9739,
"step": 715
},
{
"epoch": 0.16647398843930636,
"grad_norm": 0.30660646504263267,
"learning_rate": 0.00019732856451861594,
"loss": 0.976,
"step": 720
},
{
"epoch": 0.1676300578034682,
"grad_norm": 0.2856850255835453,
"learning_rate": 0.0001972351073698564,
"loss": 0.9848,
"step": 725
},
{
"epoch": 0.16878612716763006,
"grad_norm": 0.3147493299962656,
"learning_rate": 0.0001971400663617952,
"loss": 0.9921,
"step": 730
},
{
"epoch": 0.1699421965317919,
"grad_norm": 0.29251745238857035,
"learning_rate": 0.0001970434430425521,
"loss": 0.9398,
"step": 735
},
{
"epoch": 0.17109826589595376,
"grad_norm": 0.2938440274420457,
"learning_rate": 0.000196945238986021,
"loss": 1.0153,
"step": 740
},
{
"epoch": 0.1722543352601156,
"grad_norm": 0.2898363733115959,
"learning_rate": 0.00019684545579184433,
"loss": 0.9576,
"step": 745
},
{
"epoch": 0.17341040462427745,
"grad_norm": 0.3088888037796155,
"learning_rate": 0.00019674409508538718,
"loss": 0.9696,
"step": 750
},
{
"epoch": 0.1745664739884393,
"grad_norm": 0.2942346284280998,
"learning_rate": 0.0001966411585177105,
"loss": 1.0203,
"step": 755
},
{
"epoch": 0.17572254335260115,
"grad_norm": 0.2852125727236794,
"learning_rate": 0.00019653664776554455,
"loss": 0.9556,
"step": 760
},
{
"epoch": 0.176878612716763,
"grad_norm": 0.3016262447783914,
"learning_rate": 0.0001964305645312613,
"loss": 0.9896,
"step": 765
},
{
"epoch": 0.17803468208092485,
"grad_norm": 0.30255168675186533,
"learning_rate": 0.00019632291054284693,
"loss": 0.9839,
"step": 770
},
{
"epoch": 0.1791907514450867,
"grad_norm": 0.2806238558760804,
"learning_rate": 0.0001962136875538735,
"loss": 0.9748,
"step": 775
},
{
"epoch": 0.18034682080924855,
"grad_norm": 0.2945672906624874,
"learning_rate": 0.00019610289734347053,
"loss": 0.9479,
"step": 780
},
{
"epoch": 0.1815028901734104,
"grad_norm": 0.3012247410303452,
"learning_rate": 0.00019599054171629595,
"loss": 1.0132,
"step": 785
},
{
"epoch": 0.18265895953757225,
"grad_norm": 0.2944499117709193,
"learning_rate": 0.0001958766225025066,
"loss": 0.9336,
"step": 790
},
{
"epoch": 0.1838150289017341,
"grad_norm": 0.2989322109974369,
"learning_rate": 0.0001957611415577287,
"loss": 0.9857,
"step": 795
},
{
"epoch": 0.18497109826589594,
"grad_norm": 0.2859334477998094,
"learning_rate": 0.0001956441007630273,
"loss": 0.9831,
"step": 800
},
{
"epoch": 0.1861271676300578,
"grad_norm": 0.27928427673168016,
"learning_rate": 0.0001955255020248759,
"loss": 0.9582,
"step": 805
},
{
"epoch": 0.18728323699421964,
"grad_norm": 0.3044716011707441,
"learning_rate": 0.00019540534727512522,
"loss": 1.0061,
"step": 810
},
{
"epoch": 0.1884393063583815,
"grad_norm": 0.30123009109430604,
"learning_rate": 0.00019528363847097185,
"loss": 1.0015,
"step": 815
},
{
"epoch": 0.18959537572254334,
"grad_norm": 0.2983431588431708,
"learning_rate": 0.00019516037759492627,
"loss": 0.9917,
"step": 820
},
{
"epoch": 0.1907514450867052,
"grad_norm": 0.3045474200889231,
"learning_rate": 0.00019503556665478067,
"loss": 0.9924,
"step": 825
},
{
"epoch": 0.19190751445086704,
"grad_norm": 0.3023598788495744,
"learning_rate": 0.00019490920768357607,
"loss": 0.9824,
"step": 830
},
{
"epoch": 0.1930635838150289,
"grad_norm": 0.288004788186244,
"learning_rate": 0.00019478130273956943,
"loss": 0.9756,
"step": 835
},
{
"epoch": 0.19421965317919074,
"grad_norm": 0.28774556008482255,
"learning_rate": 0.00019465185390619996,
"loss": 0.9292,
"step": 840
},
{
"epoch": 0.19537572254335261,
"grad_norm": 0.29766392431820693,
"learning_rate": 0.00019452086329205522,
"loss": 1.008,
"step": 845
},
{
"epoch": 0.19653179190751446,
"grad_norm": 0.2968523784854959,
"learning_rate": 0.00019438833303083678,
"loss": 0.9469,
"step": 850
},
{
"epoch": 0.1976878612716763,
"grad_norm": 0.32050710908212127,
"learning_rate": 0.00019425426528132546,
"loss": 0.9584,
"step": 855
},
{
"epoch": 0.19884393063583816,
"grad_norm": 0.30039833468190924,
"learning_rate": 0.00019411866222734627,
"loss": 0.9345,
"step": 860
},
{
"epoch": 0.2,
"grad_norm": 0.3003418210003542,
"learning_rate": 0.00019398152607773264,
"loss": 1.0149,
"step": 865
},
{
"epoch": 0.20115606936416186,
"grad_norm": 0.28740303119481114,
"learning_rate": 0.00019384285906629055,
"loss": 0.9453,
"step": 870
},
{
"epoch": 0.2023121387283237,
"grad_norm": 0.3022020472256075,
"learning_rate": 0.00019370266345176214,
"loss": 0.9964,
"step": 875
},
{
"epoch": 0.20346820809248556,
"grad_norm": 0.2887054624910223,
"learning_rate": 0.00019356094151778895,
"loss": 0.9857,
"step": 880
},
{
"epoch": 0.2046242774566474,
"grad_norm": 0.38190567119224494,
"learning_rate": 0.00019341769557287467,
"loss": 0.9554,
"step": 885
},
{
"epoch": 0.20578034682080926,
"grad_norm": 0.29810678010523667,
"learning_rate": 0.00019327292795034753,
"loss": 0.9138,
"step": 890
},
{
"epoch": 0.2069364161849711,
"grad_norm": 0.30694177853911697,
"learning_rate": 0.00019312664100832233,
"loss": 0.9504,
"step": 895
},
{
"epoch": 0.20809248554913296,
"grad_norm": 0.31819590085942273,
"learning_rate": 0.00019297883712966204,
"loss": 0.9969,
"step": 900
},
{
"epoch": 0.2092485549132948,
"grad_norm": 0.2915158769948663,
"learning_rate": 0.00019282951872193885,
"loss": 1.0207,
"step": 905
},
{
"epoch": 0.21040462427745665,
"grad_norm": 0.3020525391391881,
"learning_rate": 0.0001926786882173952,
"loss": 0.9424,
"step": 910
},
{
"epoch": 0.2115606936416185,
"grad_norm": 0.3095938351879466,
"learning_rate": 0.000192526348072904,
"loss": 0.9762,
"step": 915
},
{
"epoch": 0.21271676300578035,
"grad_norm": 0.32067080864793646,
"learning_rate": 0.0001923725007699285,
"loss": 0.9423,
"step": 920
},
{
"epoch": 0.2138728323699422,
"grad_norm": 0.3144763229931657,
"learning_rate": 0.00019221714881448217,
"loss": 0.9824,
"step": 925
},
{
"epoch": 0.21502890173410405,
"grad_norm": 0.3179959968229521,
"learning_rate": 0.0001920602947370876,
"loss": 0.9829,
"step": 930
},
{
"epoch": 0.2161849710982659,
"grad_norm": 0.30113560950220297,
"learning_rate": 0.00019190194109273544,
"loss": 0.9463,
"step": 935
},
{
"epoch": 0.21734104046242775,
"grad_norm": 0.2896899865821755,
"learning_rate": 0.00019174209046084276,
"loss": 0.9786,
"step": 940
},
{
"epoch": 0.2184971098265896,
"grad_norm": 0.2969332481357098,
"learning_rate": 0.00019158074544521094,
"loss": 1.0183,
"step": 945
},
{
"epoch": 0.21965317919075145,
"grad_norm": 0.29984068010125464,
"learning_rate": 0.0001914179086739834,
"loss": 0.9688,
"step": 950
},
{
"epoch": 0.2208092485549133,
"grad_norm": 0.3179298483108417,
"learning_rate": 0.0001912535827996026,
"loss": 1.0213,
"step": 955
},
{
"epoch": 0.22196531791907514,
"grad_norm": 0.30071667875773894,
"learning_rate": 0.0001910877704987671,
"loss": 0.9947,
"step": 960
},
{
"epoch": 0.223121387283237,
"grad_norm": 0.3162017726279279,
"learning_rate": 0.00019092047447238773,
"loss": 0.9765,
"step": 965
},
{
"epoch": 0.22427745664739884,
"grad_norm": 0.31999805840625895,
"learning_rate": 0.0001907516974455436,
"loss": 0.9956,
"step": 970
},
{
"epoch": 0.2254335260115607,
"grad_norm": 0.2931503664377131,
"learning_rate": 0.00019058144216743797,
"loss": 1.002,
"step": 975
},
{
"epoch": 0.22658959537572254,
"grad_norm": 0.30298932952701096,
"learning_rate": 0.0001904097114113531,
"loss": 1.0159,
"step": 980
},
{
"epoch": 0.2277456647398844,
"grad_norm": 0.29804869616385626,
"learning_rate": 0.0001902365079746054,
"loss": 0.9771,
"step": 985
},
{
"epoch": 0.22890173410404624,
"grad_norm": 0.30761841093550274,
"learning_rate": 0.00019006183467849957,
"loss": 1.0157,
"step": 990
},
{
"epoch": 0.2300578034682081,
"grad_norm": 0.3080044180069082,
"learning_rate": 0.000189885694368283,
"loss": 0.9463,
"step": 995
},
{
"epoch": 0.23121387283236994,
"grad_norm": 0.29852967122969754,
"learning_rate": 0.00018970808991309904,
"loss": 1.0021,
"step": 1000
},
{
"epoch": 0.2323699421965318,
"grad_norm": 0.2987990831868652,
"learning_rate": 0.00018952902420594058,
"loss": 0.9492,
"step": 1005
},
{
"epoch": 0.23352601156069364,
"grad_norm": 0.30446826555699585,
"learning_rate": 0.0001893485001636026,
"loss": 1.0019,
"step": 1010
},
{
"epoch": 0.23468208092485549,
"grad_norm": 0.30998357590060016,
"learning_rate": 0.00018916652072663515,
"loss": 0.9478,
"step": 1015
},
{
"epoch": 0.23583815028901733,
"grad_norm": 0.3109448921090665,
"learning_rate": 0.0001889830888592949,
"loss": 0.9797,
"step": 1020
},
{
"epoch": 0.23699421965317918,
"grad_norm": 0.30658444282489306,
"learning_rate": 0.00018879820754949718,
"loss": 0.9976,
"step": 1025
},
{
"epoch": 0.23815028901734103,
"grad_norm": 0.32337481964353393,
"learning_rate": 0.0001886118798087673,
"loss": 0.9622,
"step": 1030
},
{
"epoch": 0.23930635838150288,
"grad_norm": 0.30934913326885294,
"learning_rate": 0.00018842410867219136,
"loss": 1.0095,
"step": 1035
},
{
"epoch": 0.24046242774566473,
"grad_norm": 0.32554022935815935,
"learning_rate": 0.0001882348971983669,
"loss": 1.0082,
"step": 1040
},
{
"epoch": 0.24161849710982658,
"grad_norm": 0.28387403686918444,
"learning_rate": 0.0001880442484693531,
"loss": 0.9433,
"step": 1045
},
{
"epoch": 0.24277456647398843,
"grad_norm": 0.2889713456813008,
"learning_rate": 0.0001878521655906205,
"loss": 0.994,
"step": 1050
},
{
"epoch": 0.24393063583815028,
"grad_norm": 0.3038933468668014,
"learning_rate": 0.00018765865169100048,
"loss": 0.966,
"step": 1055
},
{
"epoch": 0.24508670520231213,
"grad_norm": 0.294597892145829,
"learning_rate": 0.00018746370992263423,
"loss": 0.9501,
"step": 1060
},
{
"epoch": 0.24624277456647398,
"grad_norm": 0.30895343503048994,
"learning_rate": 0.00018726734346092148,
"loss": 0.9663,
"step": 1065
},
{
"epoch": 0.24739884393063583,
"grad_norm": 0.32867090123543136,
"learning_rate": 0.00018706955550446878,
"loss": 1.0,
"step": 1070
},
{
"epoch": 0.24855491329479767,
"grad_norm": 0.2860732876796574,
"learning_rate": 0.00018687034927503728,
"loss": 0.9282,
"step": 1075
},
{
"epoch": 0.24971098265895952,
"grad_norm": 0.3622792683930793,
"learning_rate": 0.00018666972801749035,
"loss": 0.9534,
"step": 1080
},
{
"epoch": 0.2508670520231214,
"grad_norm": 0.3215073797028641,
"learning_rate": 0.00018646769499974076,
"loss": 1.0177,
"step": 1085
},
{
"epoch": 0.2520231213872832,
"grad_norm": 0.30391881014243827,
"learning_rate": 0.00018626425351269733,
"loss": 1.0213,
"step": 1090
},
{
"epoch": 0.25317919075144507,
"grad_norm": 0.2959182253635083,
"learning_rate": 0.00018605940687021133,
"loss": 0.9265,
"step": 1095
},
{
"epoch": 0.2543352601156069,
"grad_norm": 0.3054358776400607,
"learning_rate": 0.00018585315840902275,
"loss": 0.9566,
"step": 1100
},
{
"epoch": 0.25549132947976877,
"grad_norm": 0.3038159760611022,
"learning_rate": 0.00018564551148870563,
"loss": 0.9728,
"step": 1105
},
{
"epoch": 0.2566473988439306,
"grad_norm": 0.295408221908172,
"learning_rate": 0.0001854364694916134,
"loss": 0.9769,
"step": 1110
},
{
"epoch": 0.25780346820809247,
"grad_norm": 0.3040191717007843,
"learning_rate": 0.00018522603582282396,
"loss": 0.9745,
"step": 1115
},
{
"epoch": 0.2589595375722543,
"grad_norm": 0.3276795968564327,
"learning_rate": 0.0001850142139100841,
"loss": 0.9843,
"step": 1120
},
{
"epoch": 0.26011560693641617,
"grad_norm": 0.30521432519770036,
"learning_rate": 0.0001848010072037536,
"loss": 0.9958,
"step": 1125
},
{
"epoch": 0.261271676300578,
"grad_norm": 0.3074723851239691,
"learning_rate": 0.0001845864191767491,
"loss": 1.0061,
"step": 1130
},
{
"epoch": 0.26242774566473986,
"grad_norm": 0.38542353780888683,
"learning_rate": 0.0001843704533244876,
"loss": 0.9527,
"step": 1135
},
{
"epoch": 0.2635838150289017,
"grad_norm": 0.31655887417225065,
"learning_rate": 0.00018415311316482934,
"loss": 0.9677,
"step": 1140
},
{
"epoch": 0.26473988439306356,
"grad_norm": 0.296622948449929,
"learning_rate": 0.00018393440223802077,
"loss": 1.0277,
"step": 1145
},
{
"epoch": 0.2658959537572254,
"grad_norm": 0.30339302168936083,
"learning_rate": 0.0001837143241066365,
"loss": 0.9996,
"step": 1150
},
{
"epoch": 0.26705202312138726,
"grad_norm": 0.321536892508796,
"learning_rate": 0.00018349288235552168,
"loss": 0.9599,
"step": 1155
},
{
"epoch": 0.2682080924855491,
"grad_norm": 0.3220388229874098,
"learning_rate": 0.0001832700805917333,
"loss": 0.9957,
"step": 1160
},
{
"epoch": 0.26936416184971096,
"grad_norm": 0.2951235948754925,
"learning_rate": 0.00018304592244448156,
"loss": 1.0261,
"step": 1165
},
{
"epoch": 0.2705202312138728,
"grad_norm": 0.3044130581874429,
"learning_rate": 0.0001828204115650708,
"loss": 0.9871,
"step": 1170
},
{
"epoch": 0.27167630057803466,
"grad_norm": 0.3009718363326464,
"learning_rate": 0.00018259355162684,
"loss": 0.9963,
"step": 1175
},
{
"epoch": 0.2728323699421965,
"grad_norm": 0.28905525049692726,
"learning_rate": 0.00018236534632510277,
"loss": 0.9848,
"step": 1180
},
{
"epoch": 0.27398843930635836,
"grad_norm": 0.35621478263055945,
"learning_rate": 0.00018213579937708735,
"loss": 0.9529,
"step": 1185
},
{
"epoch": 0.2751445086705202,
"grad_norm": 0.3185821009153887,
"learning_rate": 0.00018190491452187613,
"loss": 1.0045,
"step": 1190
},
{
"epoch": 0.27630057803468205,
"grad_norm": 0.3174337839488537,
"learning_rate": 0.00018167269552034446,
"loss": 0.9549,
"step": 1195
},
{
"epoch": 0.2774566473988439,
"grad_norm": 0.2929697771476232,
"learning_rate": 0.00018143914615509967,
"loss": 0.9463,
"step": 1200
},
{
"epoch": 0.2786127167630058,
"grad_norm": 0.3218921944038643,
"learning_rate": 0.00018120427023041925,
"loss": 1.0432,
"step": 1205
},
{
"epoch": 0.27976878612716766,
"grad_norm": 0.29522034854514784,
"learning_rate": 0.00018096807157218909,
"loss": 0.9354,
"step": 1210
},
{
"epoch": 0.2809248554913295,
"grad_norm": 0.31103148068930697,
"learning_rate": 0.0001807305540278409,
"loss": 0.9275,
"step": 1215
},
{
"epoch": 0.28208092485549136,
"grad_norm": 0.3268759634134707,
"learning_rate": 0.00018049172146628975,
"loss": 1.0122,
"step": 1220
},
{
"epoch": 0.2832369942196532,
"grad_norm": 0.2992667041917287,
"learning_rate": 0.00018025157777787102,
"loss": 0.9422,
"step": 1225
},
{
"epoch": 0.28439306358381505,
"grad_norm": 0.31617502406730474,
"learning_rate": 0.00018001012687427688,
"loss": 0.9909,
"step": 1230
},
{
"epoch": 0.2855491329479769,
"grad_norm": 0.31251457847228237,
"learning_rate": 0.0001797673726884928,
"loss": 0.9885,
"step": 1235
},
{
"epoch": 0.28670520231213875,
"grad_norm": 0.32021979333618866,
"learning_rate": 0.00017952331917473336,
"loss": 0.9396,
"step": 1240
},
{
"epoch": 0.2878612716763006,
"grad_norm": 0.34841403249467495,
"learning_rate": 0.00017927797030837768,
"loss": 0.9188,
"step": 1245
},
{
"epoch": 0.28901734104046245,
"grad_norm": 0.40965133837398776,
"learning_rate": 0.0001790313300859051,
"loss": 0.9582,
"step": 1250
},
{
"epoch": 0.2901734104046243,
"grad_norm": 0.3128265434514317,
"learning_rate": 0.00017878340252482956,
"loss": 0.9891,
"step": 1255
},
{
"epoch": 0.29132947976878615,
"grad_norm": 0.3762731976913158,
"learning_rate": 0.00017853419166363458,
"loss": 0.973,
"step": 1260
},
{
"epoch": 0.292485549132948,
"grad_norm": 0.3150565577316399,
"learning_rate": 0.00017828370156170727,
"loss": 0.9777,
"step": 1265
},
{
"epoch": 0.29364161849710985,
"grad_norm": 0.3124944800404496,
"learning_rate": 0.00017803193629927223,
"loss": 0.984,
"step": 1270
},
{
"epoch": 0.2947976878612717,
"grad_norm": 0.3456192268358289,
"learning_rate": 0.0001777788999773251,
"loss": 0.9881,
"step": 1275
},
{
"epoch": 0.29595375722543354,
"grad_norm": 0.309939339205246,
"learning_rate": 0.0001775245967175658,
"loss": 0.9483,
"step": 1280
},
{
"epoch": 0.2971098265895954,
"grad_norm": 0.29097211271370754,
"learning_rate": 0.00017726903066233134,
"loss": 0.9438,
"step": 1285
},
{
"epoch": 0.29826589595375724,
"grad_norm": 0.29586840025056343,
"learning_rate": 0.00017701220597452833,
"loss": 0.9754,
"step": 1290
},
{
"epoch": 0.2994219653179191,
"grad_norm": 0.3017342795593241,
"learning_rate": 0.0001767541268375652,
"loss": 0.9805,
"step": 1295
},
{
"epoch": 0.30057803468208094,
"grad_norm": 0.32061970031794484,
"learning_rate": 0.00017649479745528417,
"loss": 0.9818,
"step": 1300
},
{
"epoch": 0.3017341040462428,
"grad_norm": 0.3011478634925178,
"learning_rate": 0.00017623422205189252,
"loss": 0.9815,
"step": 1305
},
{
"epoch": 0.30289017341040464,
"grad_norm": 0.3086640164396148,
"learning_rate": 0.000175972404871894,
"loss": 0.9756,
"step": 1310
},
{
"epoch": 0.3040462427745665,
"grad_norm": 0.31272520758866784,
"learning_rate": 0.0001757093501800196,
"loss": 0.9993,
"step": 1315
},
{
"epoch": 0.30520231213872834,
"grad_norm": 0.3574223169536535,
"learning_rate": 0.0001754450622611581,
"loss": 0.9468,
"step": 1320
},
{
"epoch": 0.3063583815028902,
"grad_norm": 0.3147796469526059,
"learning_rate": 0.0001751795454202863,
"loss": 0.9848,
"step": 1325
},
{
"epoch": 0.30751445086705204,
"grad_norm": 0.3118480322453381,
"learning_rate": 0.0001749128039823988,
"loss": 1.0249,
"step": 1330
},
{
"epoch": 0.3086705202312139,
"grad_norm": 0.3023964221368843,
"learning_rate": 0.00017464484229243768,
"loss": 0.9232,
"step": 1335
},
{
"epoch": 0.30982658959537573,
"grad_norm": 0.3310385612654826,
"learning_rate": 0.0001743756647152216,
"loss": 0.9975,
"step": 1340
},
{
"epoch": 0.3109826589595376,
"grad_norm": 0.3318179078521207,
"learning_rate": 0.00017410527563537488,
"loss": 0.9776,
"step": 1345
},
{
"epoch": 0.31213872832369943,
"grad_norm": 0.31582505608794464,
"learning_rate": 0.00017383367945725584,
"loss": 0.9191,
"step": 1350
},
{
"epoch": 0.3132947976878613,
"grad_norm": 0.30919652895574773,
"learning_rate": 0.00017356088060488525,
"loss": 0.9813,
"step": 1355
},
{
"epoch": 0.31445086705202313,
"grad_norm": 0.30484218478884034,
"learning_rate": 0.00017328688352187416,
"loss": 0.9791,
"step": 1360
},
{
"epoch": 0.315606936416185,
"grad_norm": 0.3016535900655947,
"learning_rate": 0.00017301169267135163,
"loss": 0.9918,
"step": 1365
},
{
"epoch": 0.31676300578034683,
"grad_norm": 0.2828981498735541,
"learning_rate": 0.00017273531253589187,
"loss": 0.9266,
"step": 1370
},
{
"epoch": 0.3179190751445087,
"grad_norm": 0.3077602888091621,
"learning_rate": 0.00017245774761744134,
"loss": 0.9674,
"step": 1375
},
{
"epoch": 0.3190751445086705,
"grad_norm": 0.3099506136784924,
"learning_rate": 0.00017217900243724543,
"loss": 0.9836,
"step": 1380
},
{
"epoch": 0.3202312138728324,
"grad_norm": 0.31771107668630955,
"learning_rate": 0.00017189908153577473,
"loss": 0.9387,
"step": 1385
},
{
"epoch": 0.3213872832369942,
"grad_norm": 0.3121605825107148,
"learning_rate": 0.0001716179894726511,
"loss": 1.0108,
"step": 1390
},
{
"epoch": 0.3225433526011561,
"grad_norm": 0.32262006217730343,
"learning_rate": 0.0001713357308265735,
"loss": 1.0374,
"step": 1395
},
{
"epoch": 0.3236994219653179,
"grad_norm": 0.3209655049858561,
"learning_rate": 0.0001710523101952432,
"loss": 0.9936,
"step": 1400
},
{
"epoch": 0.3248554913294798,
"grad_norm": 0.32077514063057966,
"learning_rate": 0.00017076773219528905,
"loss": 0.9704,
"step": 1405
},
{
"epoch": 0.3260115606936416,
"grad_norm": 0.30932663652684755,
"learning_rate": 0.0001704820014621923,
"loss": 1.003,
"step": 1410
},
{
"epoch": 0.32716763005780347,
"grad_norm": 0.32273320904553404,
"learning_rate": 0.00017019512265021097,
"loss": 1.0388,
"step": 1415
},
{
"epoch": 0.3283236994219653,
"grad_norm": 0.31422034130330717,
"learning_rate": 0.00016990710043230406,
"loss": 0.9556,
"step": 1420
},
{
"epoch": 0.32947976878612717,
"grad_norm": 0.31168769800585655,
"learning_rate": 0.00016961793950005558,
"loss": 0.9746,
"step": 1425
},
{
"epoch": 0.330635838150289,
"grad_norm": 0.3052533915264079,
"learning_rate": 0.00016932764456359793,
"loss": 0.9542,
"step": 1430
},
{
"epoch": 0.33179190751445087,
"grad_norm": 0.2905803213236271,
"learning_rate": 0.0001690362203515353,
"loss": 0.97,
"step": 1435
},
{
"epoch": 0.3329479768786127,
"grad_norm": 0.3350090558904066,
"learning_rate": 0.00016874367161086662,
"loss": 1.0443,
"step": 1440
},
{
"epoch": 0.33410404624277457,
"grad_norm": 0.3060737171676248,
"learning_rate": 0.00016845000310690815,
"loss": 1.043,
"step": 1445
},
{
"epoch": 0.3352601156069364,
"grad_norm": 0.3129317388012455,
"learning_rate": 0.00016815521962321604,
"loss": 0.9288,
"step": 1450
},
{
"epoch": 0.33641618497109826,
"grad_norm": 0.3197265364198292,
"learning_rate": 0.00016785932596150827,
"loss": 0.984,
"step": 1455
},
{
"epoch": 0.3375722543352601,
"grad_norm": 0.29604487460555096,
"learning_rate": 0.0001675623269415864,
"loss": 1.0016,
"step": 1460
},
{
"epoch": 0.33872832369942196,
"grad_norm": 0.30262162872771553,
"learning_rate": 0.00016726422740125728,
"loss": 0.9856,
"step": 1465
},
{
"epoch": 0.3398843930635838,
"grad_norm": 0.3087826779620012,
"learning_rate": 0.000166965032196254,
"loss": 0.9957,
"step": 1470
},
{
"epoch": 0.34104046242774566,
"grad_norm": 0.3024417730810271,
"learning_rate": 0.00016666474620015686,
"loss": 0.939,
"step": 1475
},
{
"epoch": 0.3421965317919075,
"grad_norm": 0.3138788978371571,
"learning_rate": 0.0001663633743043141,
"loss": 0.9363,
"step": 1480
},
{
"epoch": 0.34335260115606936,
"grad_norm": 0.30022727252981385,
"learning_rate": 0.0001660609214177621,
"loss": 1.0356,
"step": 1485
},
{
"epoch": 0.3445086705202312,
"grad_norm": 0.3554930404416277,
"learning_rate": 0.00016575739246714547,
"loss": 0.9741,
"step": 1490
},
{
"epoch": 0.34566473988439306,
"grad_norm": 0.29207690202141207,
"learning_rate": 0.00016545279239663682,
"loss": 0.9914,
"step": 1495
},
{
"epoch": 0.3468208092485549,
"grad_norm": 0.3029574756031942,
"learning_rate": 0.00016514712616785612,
"loss": 0.9421,
"step": 1500
},
{
"epoch": 0.34797687861271676,
"grad_norm": 0.3003131478682294,
"learning_rate": 0.00016484039875979005,
"loss": 0.9536,
"step": 1505
},
{
"epoch": 0.3491329479768786,
"grad_norm": 0.31513567646148855,
"learning_rate": 0.00016453261516871068,
"loss": 0.9426,
"step": 1510
},
{
"epoch": 0.35028901734104045,
"grad_norm": 0.2996325149723125,
"learning_rate": 0.00016422378040809437,
"loss": 1.0104,
"step": 1515
},
{
"epoch": 0.3514450867052023,
"grad_norm": 0.31000327701469227,
"learning_rate": 0.00016391389950853977,
"loss": 0.9899,
"step": 1520
},
{
"epoch": 0.35260115606936415,
"grad_norm": 0.3077861834938761,
"learning_rate": 0.0001636029775176862,
"loss": 0.9865,
"step": 1525
},
{
"epoch": 0.353757225433526,
"grad_norm": 0.3065445301834393,
"learning_rate": 0.00016329101950013122,
"loss": 0.9833,
"step": 1530
},
{
"epoch": 0.35491329479768785,
"grad_norm": 0.3317987511497053,
"learning_rate": 0.00016297803053734816,
"loss": 0.9549,
"step": 1535
},
{
"epoch": 0.3560693641618497,
"grad_norm": 0.3196645968421778,
"learning_rate": 0.0001626640157276034,
"loss": 0.9675,
"step": 1540
},
{
"epoch": 0.35722543352601155,
"grad_norm": 0.3134584834564055,
"learning_rate": 0.00016234898018587337,
"loss": 0.9516,
"step": 1545
},
{
"epoch": 0.3583815028901734,
"grad_norm": 0.3117834490142485,
"learning_rate": 0.00016203292904376105,
"loss": 0.9846,
"step": 1550
},
{
"epoch": 0.35953757225433525,
"grad_norm": 0.3217830266213364,
"learning_rate": 0.00016171586744941264,
"loss": 0.967,
"step": 1555
},
{
"epoch": 0.3606936416184971,
"grad_norm": 0.30934961569422764,
"learning_rate": 0.00016139780056743342,
"loss": 0.9649,
"step": 1560
},
{
"epoch": 0.36184971098265895,
"grad_norm": 0.3292849028870402,
"learning_rate": 0.00016107873357880384,
"loss": 1.0175,
"step": 1565
},
{
"epoch": 0.3630057803468208,
"grad_norm": 0.31062236776235347,
"learning_rate": 0.00016075867168079507,
"loss": 0.9696,
"step": 1570
},
{
"epoch": 0.36416184971098264,
"grad_norm": 0.318894494073892,
"learning_rate": 0.00016043762008688433,
"loss": 0.9286,
"step": 1575
},
{
"epoch": 0.3653179190751445,
"grad_norm": 0.3029890310755476,
"learning_rate": 0.00016011558402666983,
"loss": 0.9594,
"step": 1580
},
{
"epoch": 0.36647398843930634,
"grad_norm": 0.3081821304700694,
"learning_rate": 0.00015979256874578594,
"loss": 0.987,
"step": 1585
},
{
"epoch": 0.3676300578034682,
"grad_norm": 0.29222596605397133,
"learning_rate": 0.00015946857950581734,
"loss": 0.9919,
"step": 1590
},
{
"epoch": 0.36878612716763004,
"grad_norm": 0.2955787813149893,
"learning_rate": 0.0001591436215842135,
"loss": 0.9653,
"step": 1595
},
{
"epoch": 0.3699421965317919,
"grad_norm": 0.2991599664341822,
"learning_rate": 0.0001588177002742029,
"loss": 0.9874,
"step": 1600
},
{
"epoch": 0.37109826589595374,
"grad_norm": 0.44600654437638615,
"learning_rate": 0.00015849082088470638,
"loss": 0.9504,
"step": 1605
},
{
"epoch": 0.3722543352601156,
"grad_norm": 0.33315550683583905,
"learning_rate": 0.00015816298874025102,
"loss": 1.0328,
"step": 1610
},
{
"epoch": 0.37341040462427744,
"grad_norm": 0.292061245143086,
"learning_rate": 0.00015783420918088337,
"loss": 0.9762,
"step": 1615
},
{
"epoch": 0.3745664739884393,
"grad_norm": 0.31976882089395187,
"learning_rate": 0.0001575044875620822,
"loss": 1.026,
"step": 1620
},
{
"epoch": 0.37572254335260113,
"grad_norm": 0.3238783922087859,
"learning_rate": 0.0001571738292546716,
"loss": 0.9496,
"step": 1625
},
{
"epoch": 0.376878612716763,
"grad_norm": 0.2959033885199569,
"learning_rate": 0.00015684223964473337,
"loss": 0.9656,
"step": 1630
},
{
"epoch": 0.37803468208092483,
"grad_norm": 0.3043686833702477,
"learning_rate": 0.0001565097241335191,
"loss": 0.954,
"step": 1635
},
{
"epoch": 0.3791907514450867,
"grad_norm": 0.3015192978319062,
"learning_rate": 0.00015617628813736247,
"loss": 0.9908,
"step": 1640
},
{
"epoch": 0.38034682080924853,
"grad_norm": 0.3082482576635595,
"learning_rate": 0.00015584193708759094,
"loss": 0.9477,
"step": 1645
},
{
"epoch": 0.3815028901734104,
"grad_norm": 0.30690197747994147,
"learning_rate": 0.00015550667643043716,
"loss": 0.9547,
"step": 1650
},
{
"epoch": 0.38265895953757223,
"grad_norm": 0.32095813086552044,
"learning_rate": 0.0001551705116269504,
"loss": 0.9946,
"step": 1655
},
{
"epoch": 0.3838150289017341,
"grad_norm": 0.3102088056503803,
"learning_rate": 0.0001548334481529075,
"loss": 0.9755,
"step": 1660
},
{
"epoch": 0.38497109826589593,
"grad_norm": 0.31740424344750273,
"learning_rate": 0.00015449549149872376,
"loss": 0.986,
"step": 1665
},
{
"epoch": 0.3861271676300578,
"grad_norm": 0.29760230294640583,
"learning_rate": 0.00015415664716936345,
"loss": 0.9736,
"step": 1670
},
{
"epoch": 0.3872832369942196,
"grad_norm": 0.33696332230509884,
"learning_rate": 0.00015381692068425004,
"loss": 0.9833,
"step": 1675
},
{
"epoch": 0.3884393063583815,
"grad_norm": 0.2971442840267895,
"learning_rate": 0.0001534763175771766,
"loss": 0.9787,
"step": 1680
},
{
"epoch": 0.3895953757225434,
"grad_norm": 0.2985513113352265,
"learning_rate": 0.00015313484339621534,
"loss": 0.9586,
"step": 1685
},
{
"epoch": 0.39075144508670523,
"grad_norm": 0.3070566835370781,
"learning_rate": 0.00015279250370362735,
"loss": 0.9878,
"step": 1690
},
{
"epoch": 0.3919075144508671,
"grad_norm": 0.3277524257534511,
"learning_rate": 0.00015244930407577205,
"loss": 1.0016,
"step": 1695
},
{
"epoch": 0.3930635838150289,
"grad_norm": 0.30050526813256595,
"learning_rate": 0.00015210525010301638,
"loss": 0.9553,
"step": 1700
},
{
"epoch": 0.3942196531791908,
"grad_norm": 0.30810810818757023,
"learning_rate": 0.0001517603473896435,
"loss": 0.9559,
"step": 1705
},
{
"epoch": 0.3953757225433526,
"grad_norm": 0.299950087627466,
"learning_rate": 0.00015141460155376182,
"loss": 0.9609,
"step": 1710
},
{
"epoch": 0.3965317919075145,
"grad_norm": 0.2925418247045739,
"learning_rate": 0.00015106801822721338,
"loss": 0.9763,
"step": 1715
},
{
"epoch": 0.3976878612716763,
"grad_norm": 0.30196182567060115,
"learning_rate": 0.00015072060305548187,
"loss": 0.9959,
"step": 1720
},
{
"epoch": 0.3988439306358382,
"grad_norm": 0.30989108180452857,
"learning_rate": 0.0001503723616976011,
"loss": 1.0003,
"step": 1725
},
{
"epoch": 0.4,
"grad_norm": 0.3039991146331331,
"learning_rate": 0.00015002329982606255,
"loss": 1.0345,
"step": 1730
},
{
"epoch": 0.40115606936416187,
"grad_norm": 0.3211973783178471,
"learning_rate": 0.00014967342312672283,
"loss": 0.9384,
"step": 1735
},
{
"epoch": 0.4023121387283237,
"grad_norm": 0.29388068969488124,
"learning_rate": 0.00014932273729871152,
"loss": 0.9051,
"step": 1740
},
{
"epoch": 0.40346820809248557,
"grad_norm": 0.3265810444042218,
"learning_rate": 0.0001489712480543379,
"loss": 0.9835,
"step": 1745
},
{
"epoch": 0.4046242774566474,
"grad_norm": 0.303317950576793,
"learning_rate": 0.0001486189611189981,
"loss": 0.9446,
"step": 1750
},
{
"epoch": 0.40578034682080927,
"grad_norm": 0.2967360524243329,
"learning_rate": 0.00014826588223108185,
"loss": 0.9908,
"step": 1755
},
{
"epoch": 0.4069364161849711,
"grad_norm": 0.30908569033672595,
"learning_rate": 0.00014791201714187897,
"loss": 0.9118,
"step": 1760
},
{
"epoch": 0.40809248554913297,
"grad_norm": 0.2921540136185523,
"learning_rate": 0.0001475573716154856,
"loss": 1.0177,
"step": 1765
},
{
"epoch": 0.4092485549132948,
"grad_norm": 0.30891513255558445,
"learning_rate": 0.00014720195142871054,
"loss": 0.9528,
"step": 1770
},
{
"epoch": 0.41040462427745666,
"grad_norm": 0.3116110416347837,
"learning_rate": 0.00014684576237098082,
"loss": 1.0153,
"step": 1775
},
{
"epoch": 0.4115606936416185,
"grad_norm": 0.29853304778547163,
"learning_rate": 0.00014648881024424774,
"loss": 0.9607,
"step": 1780
},
{
"epoch": 0.41271676300578036,
"grad_norm": 0.29621019258375375,
"learning_rate": 0.00014613110086289218,
"loss": 1.0178,
"step": 1785
},
{
"epoch": 0.4138728323699422,
"grad_norm": 0.32202847876721696,
"learning_rate": 0.00014577264005362985,
"loss": 0.9274,
"step": 1790
},
{
"epoch": 0.41502890173410406,
"grad_norm": 0.30619120168198916,
"learning_rate": 0.00014541343365541645,
"loss": 0.9435,
"step": 1795
},
{
"epoch": 0.4161849710982659,
"grad_norm": 0.3001554889949122,
"learning_rate": 0.00014505348751935263,
"loss": 0.9738,
"step": 1800
},
{
"epoch": 0.41734104046242776,
"grad_norm": 0.3065569332725715,
"learning_rate": 0.00014469280750858854,
"loss": 0.9627,
"step": 1805
},
{
"epoch": 0.4184971098265896,
"grad_norm": 0.3346803326675102,
"learning_rate": 0.00014433139949822837,
"loss": 1.0008,
"step": 1810
},
{
"epoch": 0.41965317919075146,
"grad_norm": 0.30285946590074797,
"learning_rate": 0.00014396926937523477,
"loss": 0.9681,
"step": 1815
},
{
"epoch": 0.4208092485549133,
"grad_norm": 0.323926429665197,
"learning_rate": 0.0001436064230383327,
"loss": 0.9883,
"step": 1820
},
{
"epoch": 0.42196531791907516,
"grad_norm": 0.30822208654391275,
"learning_rate": 0.00014324286639791367,
"loss": 0.9471,
"step": 1825
},
{
"epoch": 0.423121387283237,
"grad_norm": 0.3043728994137006,
"learning_rate": 0.00014287860537593917,
"loss": 0.9837,
"step": 1830
},
{
"epoch": 0.42427745664739885,
"grad_norm": 0.3042147218697011,
"learning_rate": 0.00014251364590584444,
"loss": 0.9576,
"step": 1835
},
{
"epoch": 0.4254335260115607,
"grad_norm": 0.32773524321463,
"learning_rate": 0.00014214799393244166,
"loss": 0.9356,
"step": 1840
},
{
"epoch": 0.42658959537572255,
"grad_norm": 0.30439292824288355,
"learning_rate": 0.00014178165541182312,
"loss": 0.9421,
"step": 1845
},
{
"epoch": 0.4277456647398844,
"grad_norm": 0.321382456218326,
"learning_rate": 0.00014141463631126442,
"loss": 0.9515,
"step": 1850
},
{
"epoch": 0.42890173410404625,
"grad_norm": 0.3130786437336031,
"learning_rate": 0.0001410469426091269,
"loss": 0.9715,
"step": 1855
},
{
"epoch": 0.4300578034682081,
"grad_norm": 0.3135399317680074,
"learning_rate": 0.00014067858029476063,
"loss": 0.9474,
"step": 1860
},
{
"epoch": 0.43121387283236995,
"grad_norm": 0.31368812931362966,
"learning_rate": 0.00014030955536840656,
"loss": 1.0225,
"step": 1865
},
{
"epoch": 0.4323699421965318,
"grad_norm": 0.3332336708705887,
"learning_rate": 0.00013993987384109898,
"loss": 1.0098,
"step": 1870
},
{
"epoch": 0.43352601156069365,
"grad_norm": 0.2950594206550405,
"learning_rate": 0.00013956954173456747,
"loss": 0.9846,
"step": 1875
},
{
"epoch": 0.4346820809248555,
"grad_norm": 0.3001574947847011,
"learning_rate": 0.000139198565081139,
"loss": 0.9853,
"step": 1880
},
{
"epoch": 0.43583815028901735,
"grad_norm": 0.29311121484460284,
"learning_rate": 0.00013882694992363936,
"loss": 1.0175,
"step": 1885
},
{
"epoch": 0.4369942196531792,
"grad_norm": 0.3286518580874968,
"learning_rate": 0.00013845470231529502,
"loss": 0.9845,
"step": 1890
},
{
"epoch": 0.43815028901734104,
"grad_norm": 0.31669776510548286,
"learning_rate": 0.00013808182831963442,
"loss": 1.0096,
"step": 1895
},
{
"epoch": 0.4393063583815029,
"grad_norm": 0.3230500170692119,
"learning_rate": 0.00013770833401038912,
"loss": 0.9652,
"step": 1900
},
{
"epoch": 0.44046242774566474,
"grad_norm": 0.314774115101565,
"learning_rate": 0.0001373342254713951,
"loss": 0.9884,
"step": 1905
},
{
"epoch": 0.4416184971098266,
"grad_norm": 0.309673258526753,
"learning_rate": 0.00013695950879649338,
"loss": 0.9617,
"step": 1910
},
{
"epoch": 0.44277456647398844,
"grad_norm": 0.31688401964004287,
"learning_rate": 0.00013658419008943088,
"loss": 1.0007,
"step": 1915
},
{
"epoch": 0.4439306358381503,
"grad_norm": 0.3115831394799577,
"learning_rate": 0.00013620827546376112,
"loss": 0.9837,
"step": 1920
},
{
"epoch": 0.44508670520231214,
"grad_norm": 0.3134574472279371,
"learning_rate": 0.00013583177104274435,
"loss": 0.9748,
"step": 1925
},
{
"epoch": 0.446242774566474,
"grad_norm": 0.3388852510725773,
"learning_rate": 0.00013545468295924812,
"loss": 0.9825,
"step": 1930
},
{
"epoch": 0.44739884393063584,
"grad_norm": 0.33068272114069625,
"learning_rate": 0.00013507701735564716,
"loss": 0.9552,
"step": 1935
},
{
"epoch": 0.4485549132947977,
"grad_norm": 0.3070498417777377,
"learning_rate": 0.00013469878038372348,
"loss": 0.9842,
"step": 1940
},
{
"epoch": 0.44971098265895953,
"grad_norm": 0.32088136083821,
"learning_rate": 0.00013431997820456592,
"loss": 0.9635,
"step": 1945
},
{
"epoch": 0.4508670520231214,
"grad_norm": 0.317988833081837,
"learning_rate": 0.00013394061698847022,
"loss": 0.9922,
"step": 1950
},
{
"epoch": 0.45202312138728323,
"grad_norm": 0.29873515381181037,
"learning_rate": 0.000133560702914838,
"loss": 0.9808,
"step": 1955
},
{
"epoch": 0.4531791907514451,
"grad_norm": 0.334307242594275,
"learning_rate": 0.00013318024217207652,
"loss": 0.9285,
"step": 1960
},
{
"epoch": 0.45433526011560693,
"grad_norm": 0.3161167520009514,
"learning_rate": 0.00013279924095749768,
"loss": 0.9721,
"step": 1965
},
{
"epoch": 0.4554913294797688,
"grad_norm": 0.3219859959358273,
"learning_rate": 0.00013241770547721703,
"loss": 1.008,
"step": 1970
},
{
"epoch": 0.45664739884393063,
"grad_norm": 0.27935961874420406,
"learning_rate": 0.00013203564194605284,
"loss": 0.9502,
"step": 1975
},
{
"epoch": 0.4578034682080925,
"grad_norm": 0.29913347538052254,
"learning_rate": 0.0001316530565874248,
"loss": 0.9791,
"step": 1980
},
{
"epoch": 0.45895953757225433,
"grad_norm": 0.30294269390330414,
"learning_rate": 0.00013126995563325254,
"loss": 0.9763,
"step": 1985
},
{
"epoch": 0.4601156069364162,
"grad_norm": 0.32096224239126736,
"learning_rate": 0.00013088634532385424,
"loss": 0.9238,
"step": 1990
},
{
"epoch": 0.461271676300578,
"grad_norm": 0.2960586714061201,
"learning_rate": 0.000130502231907845,
"loss": 0.9533,
"step": 1995
},
{
"epoch": 0.4624277456647399,
"grad_norm": 0.29803737045431256,
"learning_rate": 0.000130117621642035,
"loss": 0.9526,
"step": 2000
},
{
"epoch": 0.4635838150289017,
"grad_norm": 0.31720967984437226,
"learning_rate": 0.00012973252079132749,
"loss": 0.9566,
"step": 2005
},
{
"epoch": 0.4647398843930636,
"grad_norm": 0.31204560106706253,
"learning_rate": 0.00012934693562861692,
"loss": 0.9821,
"step": 2010
},
{
"epoch": 0.4658959537572254,
"grad_norm": 0.3452586478382497,
"learning_rate": 0.00012896087243468673,
"loss": 0.9866,
"step": 2015
},
{
"epoch": 0.46705202312138727,
"grad_norm": 0.30419391343270963,
"learning_rate": 0.00012857433749810691,
"loss": 0.9465,
"step": 2020
},
{
"epoch": 0.4682080924855491,
"grad_norm": 0.302216494494177,
"learning_rate": 0.00012818733711513164,
"loss": 0.9928,
"step": 2025
},
{
"epoch": 0.46936416184971097,
"grad_norm": 0.29660145267520094,
"learning_rate": 0.00012779987758959683,
"loss": 0.9714,
"step": 2030
},
{
"epoch": 0.4705202312138728,
"grad_norm": 0.3375993332751583,
"learning_rate": 0.00012741196523281728,
"loss": 1.004,
"step": 2035
},
{
"epoch": 0.47167630057803467,
"grad_norm": 0.31685124172490736,
"learning_rate": 0.0001270236063634839,
"loss": 0.9686,
"step": 2040
},
{
"epoch": 0.4728323699421965,
"grad_norm": 0.30517277761996336,
"learning_rate": 0.00012663480730756095,
"loss": 0.97,
"step": 2045
},
{
"epoch": 0.47398843930635837,
"grad_norm": 0.3075134986191579,
"learning_rate": 0.00012624557439818275,
"loss": 0.9535,
"step": 2050
},
{
"epoch": 0.4751445086705202,
"grad_norm": 0.2914116111037525,
"learning_rate": 0.00012585591397555078,
"loss": 0.9549,
"step": 2055
},
{
"epoch": 0.47630057803468207,
"grad_norm": 0.3065733883077486,
"learning_rate": 0.00012546583238683015,
"loss": 0.9694,
"step": 2060
},
{
"epoch": 0.4774566473988439,
"grad_norm": 0.30076466916700556,
"learning_rate": 0.00012507533598604632,
"loss": 0.9802,
"step": 2065
},
{
"epoch": 0.47861271676300576,
"grad_norm": 0.29670240314259055,
"learning_rate": 0.00012468443113398175,
"loss": 0.9366,
"step": 2070
},
{
"epoch": 0.4797687861271676,
"grad_norm": 0.31183074363125884,
"learning_rate": 0.00012429312419807198,
"loss": 0.966,
"step": 2075
},
{
"epoch": 0.48092485549132946,
"grad_norm": 0.31278790481596425,
"learning_rate": 0.00012390142155230217,
"loss": 0.9893,
"step": 2080
},
{
"epoch": 0.4820809248554913,
"grad_norm": 0.28207826631174193,
"learning_rate": 0.0001235093295771032,
"loss": 0.9472,
"step": 2085
},
{
"epoch": 0.48323699421965316,
"grad_norm": 0.32635523820738965,
"learning_rate": 0.00012311685465924774,
"loss": 0.9089,
"step": 2090
},
{
"epoch": 0.484393063583815,
"grad_norm": 0.2977916871662523,
"learning_rate": 0.00012272400319174607,
"loss": 0.9834,
"step": 2095
},
{
"epoch": 0.48554913294797686,
"grad_norm": 0.31990291106992935,
"learning_rate": 0.00012233078157374217,
"loss": 0.9312,
"step": 2100
},
{
"epoch": 0.4867052023121387,
"grad_norm": 0.33073575866363214,
"learning_rate": 0.00012193719621040942,
"loss": 0.9795,
"step": 2105
},
{
"epoch": 0.48786127167630056,
"grad_norm": 0.2915838062263623,
"learning_rate": 0.00012154325351284618,
"loss": 0.9789,
"step": 2110
},
{
"epoch": 0.4890173410404624,
"grad_norm": 0.33822747094942934,
"learning_rate": 0.00012114895989797144,
"loss": 0.9304,
"step": 2115
},
{
"epoch": 0.49017341040462425,
"grad_norm": 0.32370602693562334,
"learning_rate": 0.00012075432178842021,
"loss": 0.9428,
"step": 2120
},
{
"epoch": 0.4913294797687861,
"grad_norm": 0.32375527811459415,
"learning_rate": 0.00012035934561243905,
"loss": 0.9718,
"step": 2125
},
{
"epoch": 0.49248554913294795,
"grad_norm": 0.2943351586407433,
"learning_rate": 0.00011996403780378123,
"loss": 0.9712,
"step": 2130
},
{
"epoch": 0.4936416184971098,
"grad_norm": 0.3223304229208655,
"learning_rate": 0.00011956840480160194,
"loss": 1.0046,
"step": 2135
},
{
"epoch": 0.49479768786127165,
"grad_norm": 0.3032804365412004,
"learning_rate": 0.00011917245305035354,
"loss": 0.9596,
"step": 2140
},
{
"epoch": 0.4959537572254335,
"grad_norm": 0.3086608217360584,
"learning_rate": 0.00011877618899968037,
"loss": 0.9473,
"step": 2145
},
{
"epoch": 0.49710982658959535,
"grad_norm": 0.3131187277547376,
"learning_rate": 0.00011837961910431383,
"loss": 1.0065,
"step": 2150
},
{
"epoch": 0.4982658959537572,
"grad_norm": 0.2975605931413944,
"learning_rate": 0.00011798274982396726,
"loss": 0.9481,
"step": 2155
},
{
"epoch": 0.49942196531791905,
"grad_norm": 0.29498275298057963,
"learning_rate": 0.00011758558762323067,
"loss": 0.9884,
"step": 2160
},
{
"epoch": 0.500578034682081,
"grad_norm": 0.2906355713880922,
"learning_rate": 0.00011718813897146535,
"loss": 0.9643,
"step": 2165
},
{
"epoch": 0.5017341040462427,
"grad_norm": 0.2879177716708955,
"learning_rate": 0.00011679041034269869,
"loss": 0.9496,
"step": 2170
},
{
"epoch": 0.5028901734104047,
"grad_norm": 0.3107002014222183,
"learning_rate": 0.00011639240821551858,
"loss": 0.9489,
"step": 2175
},
{
"epoch": 0.5040462427745664,
"grad_norm": 0.30854297451303886,
"learning_rate": 0.00011599413907296785,
"loss": 0.9887,
"step": 2180
},
{
"epoch": 0.5052023121387283,
"grad_norm": 0.3250596343211611,
"learning_rate": 0.00011559560940243888,
"loss": 0.9421,
"step": 2185
},
{
"epoch": 0.5063583815028901,
"grad_norm": 0.303124134082483,
"learning_rate": 0.00011519682569556758,
"loss": 0.967,
"step": 2190
},
{
"epoch": 0.507514450867052,
"grad_norm": 0.29292499319855175,
"learning_rate": 0.00011479779444812808,
"loss": 0.9679,
"step": 2195
},
{
"epoch": 0.5086705202312138,
"grad_norm": 0.30291810874235703,
"learning_rate": 0.00011439852215992647,
"loss": 0.997,
"step": 2200
},
{
"epoch": 0.5098265895953757,
"grad_norm": 0.3234308605182878,
"learning_rate": 0.0001139990153346953,
"loss": 0.9876,
"step": 2205
},
{
"epoch": 0.5109826589595375,
"grad_norm": 0.3137214941805028,
"learning_rate": 0.00011359928047998744,
"loss": 1.0407,
"step": 2210
},
{
"epoch": 0.5121387283236994,
"grad_norm": 0.32171251618436913,
"learning_rate": 0.0001131993241070701,
"loss": 0.9783,
"step": 2215
},
{
"epoch": 0.5132947976878612,
"grad_norm": 0.2971713793214721,
"learning_rate": 0.00011279915273081876,
"loss": 0.9678,
"step": 2220
},
{
"epoch": 0.5144508670520231,
"grad_norm": 0.30876666041737444,
"learning_rate": 0.00011239877286961122,
"loss": 0.9717,
"step": 2225
},
{
"epoch": 0.5156069364161849,
"grad_norm": 0.31611543232380335,
"learning_rate": 0.00011199819104522114,
"loss": 0.9611,
"step": 2230
},
{
"epoch": 0.5167630057803468,
"grad_norm": 0.31365800007794736,
"learning_rate": 0.000111597413782712,
"loss": 0.986,
"step": 2235
},
{
"epoch": 0.5179190751445086,
"grad_norm": 0.3387876838248837,
"learning_rate": 0.00011119644761033078,
"loss": 0.9865,
"step": 2240
},
{
"epoch": 0.5190751445086705,
"grad_norm": 0.3090392049931908,
"learning_rate": 0.00011079529905940163,
"loss": 0.9264,
"step": 2245
},
{
"epoch": 0.5202312138728323,
"grad_norm": 0.30547601371038785,
"learning_rate": 0.0001103939746642194,
"loss": 0.9293,
"step": 2250
},
{
"epoch": 0.5213872832369942,
"grad_norm": 0.30920860300711217,
"learning_rate": 0.00010999248096194326,
"loss": 0.9759,
"step": 2255
},
{
"epoch": 0.522543352601156,
"grad_norm": 0.30207153503695156,
"learning_rate": 0.00010959082449249026,
"loss": 0.9557,
"step": 2260
},
{
"epoch": 0.5236994219653179,
"grad_norm": 0.29504681849985004,
"learning_rate": 0.00010918901179842877,
"loss": 0.9686,
"step": 2265
},
{
"epoch": 0.5248554913294797,
"grad_norm": 0.29286119267320176,
"learning_rate": 0.00010878704942487183,
"loss": 1.0042,
"step": 2270
},
{
"epoch": 0.5260115606936416,
"grad_norm": 0.3062243965654378,
"learning_rate": 0.00010838494391937064,
"loss": 0.9784,
"step": 2275
},
{
"epoch": 0.5271676300578034,
"grad_norm": 0.3079900592590067,
"learning_rate": 0.00010798270183180794,
"loss": 0.9503,
"step": 2280
},
{
"epoch": 0.5283236994219653,
"grad_norm": 0.3154259872984876,
"learning_rate": 0.0001075803297142911,
"loss": 0.9509,
"step": 2285
},
{
"epoch": 0.5294797687861271,
"grad_norm": 0.29229323286742326,
"learning_rate": 0.00010717783412104568,
"loss": 0.9557,
"step": 2290
},
{
"epoch": 0.530635838150289,
"grad_norm": 0.31601404726519217,
"learning_rate": 0.00010677522160830848,
"loss": 0.9042,
"step": 2295
},
{
"epoch": 0.5317919075144508,
"grad_norm": 0.28505318471127816,
"learning_rate": 0.00010637249873422077,
"loss": 0.9692,
"step": 2300
},
{
"epoch": 0.5329479768786127,
"grad_norm": 0.32485305181812835,
"learning_rate": 0.00010596967205872154,
"loss": 1.0065,
"step": 2305
},
{
"epoch": 0.5341040462427745,
"grad_norm": 0.3078039739362063,
"learning_rate": 0.00010556674814344059,
"loss": 0.9284,
"step": 2310
},
{
"epoch": 0.5352601156069364,
"grad_norm": 0.3208549342399588,
"learning_rate": 0.00010516373355159159,
"loss": 0.9477,
"step": 2315
},
{
"epoch": 0.5364161849710982,
"grad_norm": 0.30748525424656054,
"learning_rate": 0.00010476063484786535,
"loss": 0.9629,
"step": 2320
},
{
"epoch": 0.5375722543352601,
"grad_norm": 0.35275688331568944,
"learning_rate": 0.0001043574585983227,
"loss": 1.0113,
"step": 2325
},
{
"epoch": 0.5387283236994219,
"grad_norm": 0.3253346439794717,
"learning_rate": 0.00010395421137028761,
"loss": 1.0346,
"step": 2330
},
{
"epoch": 0.5398843930635838,
"grad_norm": 0.3095697055293057,
"learning_rate": 0.00010355089973224026,
"loss": 0.9546,
"step": 2335
},
{
"epoch": 0.5410404624277456,
"grad_norm": 0.307599147895419,
"learning_rate": 0.00010314753025370991,
"loss": 0.9836,
"step": 2340
},
{
"epoch": 0.5421965317919075,
"grad_norm": 0.31630009518917934,
"learning_rate": 0.00010274410950516815,
"loss": 1.0071,
"step": 2345
},
{
"epoch": 0.5433526011560693,
"grad_norm": 0.2834312244927633,
"learning_rate": 0.00010234064405792154,
"loss": 0.9489,
"step": 2350
},
{
"epoch": 0.5445086705202312,
"grad_norm": 0.2970129068571585,
"learning_rate": 0.0001019371404840048,
"loss": 0.9351,
"step": 2355
},
{
"epoch": 0.545664739884393,
"grad_norm": 0.3025660920447834,
"learning_rate": 0.0001015336053560737,
"loss": 0.99,
"step": 2360
},
{
"epoch": 0.5468208092485549,
"grad_norm": 0.30633618369872967,
"learning_rate": 0.00010113004524729799,
"loss": 0.954,
"step": 2365
},
{
"epoch": 0.5479768786127167,
"grad_norm": 0.28629344395996326,
"learning_rate": 0.00010072646673125432,
"loss": 0.9783,
"step": 2370
},
{
"epoch": 0.5491329479768786,
"grad_norm": 0.3097159968894246,
"learning_rate": 0.00010032287638181919,
"loss": 1.0303,
"step": 2375
},
{
"epoch": 0.5502890173410404,
"grad_norm": 0.30322928513172714,
"learning_rate": 9.991928077306183e-05,
"loss": 1.0,
"step": 2380
},
{
"epoch": 0.5514450867052023,
"grad_norm": 0.4644501347952281,
"learning_rate": 9.951568647913718e-05,
"loss": 0.9294,
"step": 2385
},
{
"epoch": 0.5526011560693641,
"grad_norm": 0.3124185608435605,
"learning_rate": 9.911210007417869e-05,
"loss": 0.9847,
"step": 2390
},
{
"epoch": 0.553757225433526,
"grad_norm": 0.32991063517383223,
"learning_rate": 9.870852813219143e-05,
"loss": 0.9755,
"step": 2395
},
{
"epoch": 0.5549132947976878,
"grad_norm": 0.31527757887354024,
"learning_rate": 9.830497722694478e-05,
"loss": 0.9819,
"step": 2400
},
{
"epoch": 0.5560693641618497,
"grad_norm": 0.3094199650313883,
"learning_rate": 9.790145393186541e-05,
"loss": 0.9409,
"step": 2405
},
{
"epoch": 0.5572254335260116,
"grad_norm": 0.3055144703983288,
"learning_rate": 9.749796481993042e-05,
"loss": 0.9674,
"step": 2410
},
{
"epoch": 0.5583815028901734,
"grad_norm": 0.31607857296755776,
"learning_rate": 9.709451646355996e-05,
"loss": 1.0174,
"step": 2415
},
{
"epoch": 0.5595375722543353,
"grad_norm": 0.2975421743209585,
"learning_rate": 9.669111543451033e-05,
"loss": 0.9683,
"step": 2420
},
{
"epoch": 0.5606936416184971,
"grad_norm": 0.30435556233851097,
"learning_rate": 9.628776830376698e-05,
"loss": 0.9539,
"step": 2425
},
{
"epoch": 0.561849710982659,
"grad_norm": 0.31164699307502913,
"learning_rate": 9.588448164143739e-05,
"loss": 0.9716,
"step": 2430
},
{
"epoch": 0.5630057803468208,
"grad_norm": 0.3029837051614153,
"learning_rate": 9.548126201664398e-05,
"loss": 0.9488,
"step": 2435
},
{
"epoch": 0.5641618497109827,
"grad_norm": 0.30375216989550236,
"learning_rate": 9.507811599741735e-05,
"loss": 0.9241,
"step": 2440
},
{
"epoch": 0.5653179190751445,
"grad_norm": 0.3044100811963364,
"learning_rate": 9.467505015058901e-05,
"loss": 1.0212,
"step": 2445
},
{
"epoch": 0.5664739884393064,
"grad_norm": 0.3079385379059537,
"learning_rate": 9.427207104168467e-05,
"loss": 0.9341,
"step": 2450
},
{
"epoch": 0.5676300578034682,
"grad_norm": 0.3012241533223519,
"learning_rate": 9.386918523481709e-05,
"loss": 0.9533,
"step": 2455
},
{
"epoch": 0.5687861271676301,
"grad_norm": 0.32392283471513106,
"learning_rate": 9.346639929257916e-05,
"loss": 0.9888,
"step": 2460
},
{
"epoch": 0.5699421965317919,
"grad_norm": 0.28835183242553575,
"learning_rate": 9.306371977593726e-05,
"loss": 0.9847,
"step": 2465
},
{
"epoch": 0.5710982658959538,
"grad_norm": 0.29592049768177997,
"learning_rate": 9.26611532441241e-05,
"loss": 0.9075,
"step": 2470
},
{
"epoch": 0.5722543352601156,
"grad_norm": 0.30510640464990846,
"learning_rate": 9.225870625453192e-05,
"loss": 0.9276,
"step": 2475
},
{
"epoch": 0.5734104046242775,
"grad_norm": 0.32217649840985024,
"learning_rate": 9.18563853626059e-05,
"loss": 0.9936,
"step": 2480
},
{
"epoch": 0.5745664739884393,
"grad_norm": 0.2945778811908368,
"learning_rate": 9.145419712173713e-05,
"loss": 0.9775,
"step": 2485
},
{
"epoch": 0.5757225433526012,
"grad_norm": 0.2937161972776823,
"learning_rate": 9.105214808315588e-05,
"loss": 0.9293,
"step": 2490
},
{
"epoch": 0.576878612716763,
"grad_norm": 0.34400282833354867,
"learning_rate": 9.065024479582513e-05,
"loss": 1.0045,
"step": 2495
},
{
"epoch": 0.5780346820809249,
"grad_norm": 0.30233509314710183,
"learning_rate": 9.024849380633359e-05,
"loss": 0.9786,
"step": 2500
},
{
"epoch": 0.5791907514450867,
"grad_norm": 0.29454389871868664,
"learning_rate": 8.984690165878921e-05,
"loss": 0.9584,
"step": 2505
},
{
"epoch": 0.5803468208092486,
"grad_norm": 0.30071228762049557,
"learning_rate": 8.944547489471265e-05,
"loss": 0.955,
"step": 2510
},
{
"epoch": 0.5815028901734104,
"grad_norm": 0.3169803125653554,
"learning_rate": 8.904422005293052e-05,
"loss": 1.0198,
"step": 2515
},
{
"epoch": 0.5826589595375723,
"grad_norm": 0.31831872317303483,
"learning_rate": 8.864314366946913e-05,
"loss": 0.9781,
"step": 2520
},
{
"epoch": 0.5838150289017341,
"grad_norm": 0.30646509617401063,
"learning_rate": 8.824225227744782e-05,
"loss": 0.9556,
"step": 2525
},
{
"epoch": 0.584971098265896,
"grad_norm": 0.28426966152836436,
"learning_rate": 8.784155240697254e-05,
"loss": 0.9811,
"step": 2530
},
{
"epoch": 0.5861271676300578,
"grad_norm": 0.29076631427987554,
"learning_rate": 8.74410505850297e-05,
"loss": 0.9653,
"step": 2535
},
{
"epoch": 0.5872832369942197,
"grad_norm": 0.2993968457362386,
"learning_rate": 8.704075333537963e-05,
"loss": 0.9267,
"step": 2540
},
{
"epoch": 0.5884393063583815,
"grad_norm": 0.28060562517633875,
"learning_rate": 8.66406671784503e-05,
"loss": 0.9767,
"step": 2545
},
{
"epoch": 0.5895953757225434,
"grad_norm": 0.29394163379907895,
"learning_rate": 8.624079863123135e-05,
"loss": 0.9692,
"step": 2550
},
{
"epoch": 0.5907514450867052,
"grad_norm": 0.29752047867631554,
"learning_rate": 8.584115420716777e-05,
"loss": 1.0218,
"step": 2555
},
{
"epoch": 0.5919075144508671,
"grad_norm": 0.2960489971126818,
"learning_rate": 8.544174041605363e-05,
"loss": 0.9386,
"step": 2560
},
{
"epoch": 0.5930635838150289,
"grad_norm": 0.2991003815095411,
"learning_rate": 8.504256376392647e-05,
"loss": 0.951,
"step": 2565
},
{
"epoch": 0.5942196531791908,
"grad_norm": 0.2875329275503883,
"learning_rate": 8.464363075296095e-05,
"loss": 0.9595,
"step": 2570
},
{
"epoch": 0.5953757225433526,
"grad_norm": 0.29629340379738117,
"learning_rate": 8.424494788136303e-05,
"loss": 0.946,
"step": 2575
},
{
"epoch": 0.5965317919075145,
"grad_norm": 0.3039710490176798,
"learning_rate": 8.384652164326432e-05,
"loss": 0.9297,
"step": 2580
},
{
"epoch": 0.5976878612716763,
"grad_norm": 0.2867197601664578,
"learning_rate": 8.344835852861595e-05,
"loss": 0.9655,
"step": 2585
},
{
"epoch": 0.5988439306358382,
"grad_norm": 0.3021141932586307,
"learning_rate": 8.305046502308319e-05,
"loss": 0.9388,
"step": 2590
},
{
"epoch": 0.6,
"grad_norm": 0.32608595836134247,
"learning_rate": 8.265284760793957e-05,
"loss": 0.948,
"step": 2595
},
{
"epoch": 0.6011560693641619,
"grad_norm": 0.3077259465134535,
"learning_rate": 8.225551275996138e-05,
"loss": 1.0123,
"step": 2600
},
{
"epoch": 0.6023121387283237,
"grad_norm": 0.28311149702035393,
"learning_rate": 8.185846695132227e-05,
"loss": 0.9456,
"step": 2605
},
{
"epoch": 0.6034682080924856,
"grad_norm": 0.31276946079278556,
"learning_rate": 8.146171664948769e-05,
"loss": 0.9755,
"step": 2610
},
{
"epoch": 0.6046242774566474,
"grad_norm": 0.3264513747288462,
"learning_rate": 8.10652683171095e-05,
"loss": 0.9619,
"step": 2615
},
{
"epoch": 0.6057803468208093,
"grad_norm": 0.30285428517053464,
"learning_rate": 8.066912841192099e-05,
"loss": 0.9344,
"step": 2620
},
{
"epoch": 0.6069364161849711,
"grad_norm": 0.3644501014383341,
"learning_rate": 8.027330338663132e-05,
"loss": 0.9794,
"step": 2625
},
{
"epoch": 0.608092485549133,
"grad_norm": 0.28904372054279964,
"learning_rate": 7.987779968882061e-05,
"loss": 0.941,
"step": 2630
},
{
"epoch": 0.6092485549132948,
"grad_norm": 0.2895705859655337,
"learning_rate": 7.9482623760835e-05,
"loss": 0.951,
"step": 2635
},
{
"epoch": 0.6104046242774567,
"grad_norm": 0.29749813183970186,
"learning_rate": 7.908778203968146e-05,
"loss": 0.9244,
"step": 2640
},
{
"epoch": 0.6115606936416185,
"grad_norm": 0.31971185227728377,
"learning_rate": 7.869328095692312e-05,
"loss": 0.9645,
"step": 2645
},
{
"epoch": 0.6127167630057804,
"grad_norm": 0.2873929264676909,
"learning_rate": 7.829912693857454e-05,
"loss": 0.9739,
"step": 2650
},
{
"epoch": 0.6138728323699422,
"grad_norm": 0.2962385037449908,
"learning_rate": 7.79053264049968e-05,
"loss": 1.0025,
"step": 2655
},
{
"epoch": 0.6150289017341041,
"grad_norm": 0.2889472954071748,
"learning_rate": 7.751188577079327e-05,
"loss": 0.9764,
"step": 2660
},
{
"epoch": 0.6161849710982659,
"grad_norm": 0.2914115227852295,
"learning_rate": 7.711881144470481e-05,
"loss": 0.9575,
"step": 2665
},
{
"epoch": 0.6173410404624278,
"grad_norm": 0.31913799827615985,
"learning_rate": 7.672610982950546e-05,
"loss": 0.9684,
"step": 2670
},
{
"epoch": 0.6184971098265896,
"grad_norm": 0.3005365331976947,
"learning_rate": 7.633378732189833e-05,
"loss": 0.941,
"step": 2675
},
{
"epoch": 0.6196531791907515,
"grad_norm": 0.29855395413135466,
"learning_rate": 7.594185031241115e-05,
"loss": 0.9504,
"step": 2680
},
{
"epoch": 0.6208092485549133,
"grad_norm": 0.30079166608986607,
"learning_rate": 7.555030518529227e-05,
"loss": 0.9489,
"step": 2685
},
{
"epoch": 0.6219653179190752,
"grad_norm": 0.2835216761875895,
"learning_rate": 7.515915831840682e-05,
"loss": 1.0283,
"step": 2690
},
{
"epoch": 0.623121387283237,
"grad_norm": 0.296763030995749,
"learning_rate": 7.476841608313253e-05,
"loss": 0.9495,
"step": 2695
},
{
"epoch": 0.6242774566473989,
"grad_norm": 0.2965441757959662,
"learning_rate": 7.437808484425614e-05,
"loss": 0.9207,
"step": 2700
},
{
"epoch": 0.6254335260115607,
"grad_norm": 0.28563226791333124,
"learning_rate": 7.398817095986978e-05,
"loss": 0.9529,
"step": 2705
},
{
"epoch": 0.6265895953757226,
"grad_norm": 0.2958876716229884,
"learning_rate": 7.359868078126714e-05,
"loss": 0.9415,
"step": 2710
},
{
"epoch": 0.6277456647398844,
"grad_norm": 0.29947263183760775,
"learning_rate": 7.320962065284032e-05,
"loss": 0.9153,
"step": 2715
},
{
"epoch": 0.6289017341040463,
"grad_norm": 0.2921294166805471,
"learning_rate": 7.282099691197632e-05,
"loss": 1.0061,
"step": 2720
},
{
"epoch": 0.630057803468208,
"grad_norm": 0.3065806154787742,
"learning_rate": 7.243281588895374e-05,
"loss": 0.9713,
"step": 2725
},
{
"epoch": 0.63121387283237,
"grad_norm": 0.29670093707685785,
"learning_rate": 7.204508390683991e-05,
"loss": 0.9152,
"step": 2730
},
{
"epoch": 0.6323699421965318,
"grad_norm": 0.28334644508416545,
"learning_rate": 7.165780728138769e-05,
"loss": 0.9216,
"step": 2735
},
{
"epoch": 0.6335260115606937,
"grad_norm": 0.2983458223496864,
"learning_rate": 7.127099232093252e-05,
"loss": 0.9684,
"step": 2740
},
{
"epoch": 0.6346820809248555,
"grad_norm": 0.297707639389454,
"learning_rate": 7.08846453262901e-05,
"loss": 0.9677,
"step": 2745
},
{
"epoch": 0.6358381502890174,
"grad_norm": 0.3030445642630547,
"learning_rate": 7.049877259065312e-05,
"loss": 0.991,
"step": 2750
},
{
"epoch": 0.6369942196531792,
"grad_norm": 0.2967929219140181,
"learning_rate": 7.011338039948925e-05,
"loss": 0.9331,
"step": 2755
},
{
"epoch": 0.638150289017341,
"grad_norm": 0.30678542010472665,
"learning_rate": 6.972847503043864e-05,
"loss": 0.9952,
"step": 2760
},
{
"epoch": 0.6393063583815028,
"grad_norm": 0.2869522581890146,
"learning_rate": 6.934406275321147e-05,
"loss": 0.9817,
"step": 2765
},
{
"epoch": 0.6404624277456648,
"grad_norm": 0.30227078806254565,
"learning_rate": 6.896014982948602e-05,
"loss": 0.9713,
"step": 2770
},
{
"epoch": 0.6416184971098265,
"grad_norm": 0.29642633769940174,
"learning_rate": 6.857674251280671e-05,
"loss": 0.9495,
"step": 2775
},
{
"epoch": 0.6427745664739885,
"grad_norm": 0.3324961966488245,
"learning_rate": 6.819384704848199e-05,
"loss": 1.0679,
"step": 2780
},
{
"epoch": 0.6439306358381502,
"grad_norm": 0.2888690447316259,
"learning_rate": 6.781146967348284e-05,
"loss": 0.941,
"step": 2785
},
{
"epoch": 0.6450867052023121,
"grad_norm": 0.29579888471073373,
"learning_rate": 6.742961661634115e-05,
"loss": 1.0323,
"step": 2790
},
{
"epoch": 0.6462427745664739,
"grad_norm": 0.2989300466555907,
"learning_rate": 6.704829409704809e-05,
"loss": 0.9723,
"step": 2795
},
{
"epoch": 0.6473988439306358,
"grad_norm": 0.27150342274150757,
"learning_rate": 6.666750832695306e-05,
"loss": 0.9397,
"step": 2800
},
{
"epoch": 0.6485549132947976,
"grad_norm": 0.29998837345134693,
"learning_rate": 6.628726550866227e-05,
"loss": 0.9835,
"step": 2805
},
{
"epoch": 0.6497109826589595,
"grad_norm": 0.28399979161419353,
"learning_rate": 6.59075718359378e-05,
"loss": 0.9509,
"step": 2810
},
{
"epoch": 0.6508670520231213,
"grad_norm": 0.30370562141401386,
"learning_rate": 6.552843349359688e-05,
"loss": 0.967,
"step": 2815
},
{
"epoch": 0.6520231213872832,
"grad_norm": 0.3363515271195273,
"learning_rate": 6.514985665741073e-05,
"loss": 1.0054,
"step": 2820
},
{
"epoch": 0.653179190751445,
"grad_norm": 0.2941850764424752,
"learning_rate": 6.477184749400438e-05,
"loss": 0.9958,
"step": 2825
},
{
"epoch": 0.6543352601156069,
"grad_norm": 0.2845955247940358,
"learning_rate": 6.439441216075605e-05,
"loss": 0.9199,
"step": 2830
},
{
"epoch": 0.6554913294797687,
"grad_norm": 0.30264607648319775,
"learning_rate": 6.401755680569683e-05,
"loss": 0.9974,
"step": 2835
},
{
"epoch": 0.6566473988439306,
"grad_norm": 0.3049902419448789,
"learning_rate": 6.36412875674105e-05,
"loss": 0.9412,
"step": 2840
},
{
"epoch": 0.6578034682080924,
"grad_norm": 0.28427694016822674,
"learning_rate": 6.326561057493376e-05,
"loss": 0.9618,
"step": 2845
},
{
"epoch": 0.6589595375722543,
"grad_norm": 0.2923557343391648,
"learning_rate": 6.28905319476561e-05,
"loss": 0.9738,
"step": 2850
},
{
"epoch": 0.6601156069364161,
"grad_norm": 0.3011922931000182,
"learning_rate": 6.251605779522032e-05,
"loss": 0.9547,
"step": 2855
},
{
"epoch": 0.661271676300578,
"grad_norm": 0.2778141274186397,
"learning_rate": 6.214219421742295e-05,
"loss": 1.0062,
"step": 2860
},
{
"epoch": 0.6624277456647398,
"grad_norm": 0.28425833137988876,
"learning_rate": 6.176894730411483e-05,
"loss": 0.9529,
"step": 2865
},
{
"epoch": 0.6635838150289017,
"grad_norm": 0.2914968347637112,
"learning_rate": 6.139632313510212e-05,
"loss": 0.944,
"step": 2870
},
{
"epoch": 0.6647398843930635,
"grad_norm": 0.2910244571300189,
"learning_rate": 6.1024327780046944e-05,
"loss": 1.0063,
"step": 2875
},
{
"epoch": 0.6658959537572254,
"grad_norm": 0.28730856037506486,
"learning_rate": 6.065296729836879e-05,
"loss": 0.978,
"step": 2880
},
{
"epoch": 0.6670520231213873,
"grad_norm": 0.26692983480937005,
"learning_rate": 6.028224773914575e-05,
"loss": 0.953,
"step": 2885
},
{
"epoch": 0.6682080924855491,
"grad_norm": 0.28871757324400055,
"learning_rate": 5.991217514101586e-05,
"loss": 0.9275,
"step": 2890
},
{
"epoch": 0.669364161849711,
"grad_norm": 0.29493246673699086,
"learning_rate": 5.9542755532078856e-05,
"loss": 1.0001,
"step": 2895
},
{
"epoch": 0.6705202312138728,
"grad_norm": 0.3100505532341507,
"learning_rate": 5.917399492979805e-05,
"loss": 0.9716,
"step": 2900
},
{
"epoch": 0.6716763005780347,
"grad_norm": 0.296746629997667,
"learning_rate": 5.880589934090206e-05,
"loss": 0.987,
"step": 2905
},
{
"epoch": 0.6728323699421965,
"grad_norm": 0.3175494758298602,
"learning_rate": 5.843847476128722e-05,
"loss": 0.9643,
"step": 2910
},
{
"epoch": 0.6739884393063584,
"grad_norm": 0.310484738297623,
"learning_rate": 5.807172717591984e-05,
"loss": 0.9322,
"step": 2915
},
{
"epoch": 0.6751445086705202,
"grad_norm": 0.29571778911697455,
"learning_rate": 5.770566255873866e-05,
"loss": 0.929,
"step": 2920
},
{
"epoch": 0.6763005780346821,
"grad_norm": 0.2876079644684902,
"learning_rate": 5.734028687255751e-05,
"loss": 0.9644,
"step": 2925
},
{
"epoch": 0.6774566473988439,
"grad_norm": 0.3002944071515893,
"learning_rate": 5.697560606896839e-05,
"loss": 0.9987,
"step": 2930
},
{
"epoch": 0.6786127167630058,
"grad_norm": 0.2931767870621979,
"learning_rate": 5.6611626088244194e-05,
"loss": 0.9474,
"step": 2935
},
{
"epoch": 0.6797687861271676,
"grad_norm": 0.28720352206351696,
"learning_rate": 5.6248352859242314e-05,
"loss": 1.0355,
"step": 2940
},
{
"epoch": 0.6809248554913295,
"grad_norm": 0.3191718472968375,
"learning_rate": 5.588579229930784e-05,
"loss": 0.9699,
"step": 2945
},
{
"epoch": 0.6820809248554913,
"grad_norm": 0.30266420054049115,
"learning_rate": 5.552395031417712e-05,
"loss": 0.9358,
"step": 2950
},
{
"epoch": 0.6832369942196532,
"grad_norm": 0.30130721873082394,
"learning_rate": 5.516283279788183e-05,
"loss": 0.9496,
"step": 2955
},
{
"epoch": 0.684393063583815,
"grad_norm": 0.2929120042219513,
"learning_rate": 5.4802445632652634e-05,
"loss": 0.9248,
"step": 2960
},
{
"epoch": 0.6855491329479769,
"grad_norm": 0.31437058476192303,
"learning_rate": 5.444279468882358e-05,
"loss": 0.9745,
"step": 2965
},
{
"epoch": 0.6867052023121387,
"grad_norm": 0.2878711246279315,
"learning_rate": 5.408388582473651e-05,
"loss": 1.0008,
"step": 2970
},
{
"epoch": 0.6878612716763006,
"grad_norm": 0.2808303288729568,
"learning_rate": 5.3725724886645526e-05,
"loss": 0.9381,
"step": 2975
},
{
"epoch": 0.6890173410404624,
"grad_norm": 0.3006592351672636,
"learning_rate": 5.3368317708621674e-05,
"loss": 1.0076,
"step": 2980
},
{
"epoch": 0.6901734104046243,
"grad_norm": 0.30870133430305197,
"learning_rate": 5.3011670112458224e-05,
"loss": 0.9616,
"step": 2985
},
{
"epoch": 0.6913294797687861,
"grad_norm": 0.29489554268370943,
"learning_rate": 5.2655787907575436e-05,
"loss": 0.908,
"step": 2990
},
{
"epoch": 0.692485549132948,
"grad_norm": 0.30595244192322063,
"learning_rate": 5.230067689092629e-05,
"loss": 0.9902,
"step": 2995
},
{
"epoch": 0.6936416184971098,
"grad_norm": 0.3007538157511167,
"learning_rate": 5.19463428469019e-05,
"loss": 1.0259,
"step": 3000
},
{
"epoch": 0.6947976878612717,
"grad_norm": 0.28815906202526853,
"learning_rate": 5.159279154723715e-05,
"loss": 0.9497,
"step": 3005
},
{
"epoch": 0.6959537572254335,
"grad_norm": 0.2752889596438775,
"learning_rate": 5.124002875091704e-05,
"loss": 0.9586,
"step": 3010
},
{
"epoch": 0.6971098265895954,
"grad_norm": 0.2850467130558078,
"learning_rate": 5.088806020408252e-05,
"loss": 0.9049,
"step": 3015
},
{
"epoch": 0.6982658959537572,
"grad_norm": 0.2750258968098293,
"learning_rate": 5.053689163993703e-05,
"loss": 0.937,
"step": 3020
},
{
"epoch": 0.6994219653179191,
"grad_norm": 0.29519726643012756,
"learning_rate": 5.018652877865322e-05,
"loss": 0.9325,
"step": 3025
},
{
"epoch": 0.7005780346820809,
"grad_norm": 0.31869143160605945,
"learning_rate": 4.983697732727964e-05,
"loss": 0.9484,
"step": 3030
},
{
"epoch": 0.7017341040462428,
"grad_norm": 0.30604777371620057,
"learning_rate": 4.948824297964774e-05,
"loss": 0.9497,
"step": 3035
},
{
"epoch": 0.7028901734104046,
"grad_norm": 0.2781484111225009,
"learning_rate": 4.914033141627931e-05,
"loss": 0.9732,
"step": 3040
},
{
"epoch": 0.7040462427745665,
"grad_norm": 0.287979860552131,
"learning_rate": 4.87932483042937e-05,
"loss": 0.9513,
"step": 3045
},
{
"epoch": 0.7052023121387283,
"grad_norm": 0.28553452094386605,
"learning_rate": 4.8446999297315764e-05,
"loss": 0.9528,
"step": 3050
},
{
"epoch": 0.7063583815028902,
"grad_norm": 0.26876115159064556,
"learning_rate": 4.810159003538365e-05,
"loss": 0.9513,
"step": 3055
},
{
"epoch": 0.707514450867052,
"grad_norm": 0.29148448420091455,
"learning_rate": 4.775702614485678e-05,
"loss": 0.9732,
"step": 3060
},
{
"epoch": 0.7086705202312139,
"grad_norm": 0.2858967419780267,
"learning_rate": 4.7413313238324556e-05,
"loss": 0.9874,
"step": 3065
},
{
"epoch": 0.7098265895953757,
"grad_norm": 0.2853440571230289,
"learning_rate": 4.707045691451456e-05,
"loss": 0.9365,
"step": 3070
},
{
"epoch": 0.7109826589595376,
"grad_norm": 0.2756362587234919,
"learning_rate": 4.6728462758201574e-05,
"loss": 0.918,
"step": 3075
},
{
"epoch": 0.7121387283236994,
"grad_norm": 0.2933985662205089,
"learning_rate": 4.638733634011663e-05,
"loss": 0.9652,
"step": 3080
},
{
"epoch": 0.7132947976878613,
"grad_norm": 0.3088785959772121,
"learning_rate": 4.604708321685618e-05,
"loss": 0.9468,
"step": 3085
},
{
"epoch": 0.7144508670520231,
"grad_norm": 0.2897414636494522,
"learning_rate": 4.5707708930791514e-05,
"loss": 0.9136,
"step": 3090
},
{
"epoch": 0.715606936416185,
"grad_norm": 0.2977453761954692,
"learning_rate": 4.536921900997872e-05,
"loss": 0.9684,
"step": 3095
},
{
"epoch": 0.7167630057803468,
"grad_norm": 0.31568624217549895,
"learning_rate": 4.5031618968068325e-05,
"loss": 0.9804,
"step": 3100
},
{
"epoch": 0.7179190751445087,
"grad_norm": 0.2937121832237494,
"learning_rate": 4.4694914304215796e-05,
"loss": 0.9923,
"step": 3105
},
{
"epoch": 0.7190751445086705,
"grad_norm": 0.31223322912252777,
"learning_rate": 4.4359110502991773e-05,
"loss": 0.9493,
"step": 3110
},
{
"epoch": 0.7202312138728324,
"grad_norm": 0.2948338805474285,
"learning_rate": 4.402421303429274e-05,
"loss": 1.0058,
"step": 3115
},
{
"epoch": 0.7213872832369942,
"grad_norm": 0.2998471688331991,
"learning_rate": 4.3690227353251944e-05,
"loss": 0.9793,
"step": 3120
},
{
"epoch": 0.7225433526011561,
"grad_norm": 0.30105760378227486,
"learning_rate": 4.335715890015067e-05,
"loss": 0.9636,
"step": 3125
},
{
"epoch": 0.7236994219653179,
"grad_norm": 0.298969154483284,
"learning_rate": 4.302501310032937e-05,
"loss": 0.9849,
"step": 3130
},
{
"epoch": 0.7248554913294798,
"grad_norm": 0.30686646202523543,
"learning_rate": 4.26937953640995e-05,
"loss": 0.943,
"step": 3135
},
{
"epoch": 0.7260115606936416,
"grad_norm": 0.2906008606262198,
"learning_rate": 4.236351108665537e-05,
"loss": 0.9584,
"step": 3140
},
{
"epoch": 0.7271676300578035,
"grad_norm": 0.2965620261809724,
"learning_rate": 4.203416564798608e-05,
"loss": 0.9376,
"step": 3145
},
{
"epoch": 0.7283236994219653,
"grad_norm": 0.29569547507554944,
"learning_rate": 4.170576441278815e-05,
"loss": 1.0158,
"step": 3150
},
{
"epoch": 0.7294797687861272,
"grad_norm": 0.2873361360138618,
"learning_rate": 4.137831273037793e-05,
"loss": 0.956,
"step": 3155
},
{
"epoch": 0.730635838150289,
"grad_norm": 0.28863435820628536,
"learning_rate": 4.1051815934604465e-05,
"loss": 0.9456,
"step": 3160
},
{
"epoch": 0.7317919075144509,
"grad_norm": 0.30820662139709337,
"learning_rate": 4.072627934376292e-05,
"loss": 0.9627,
"step": 3165
},
{
"epoch": 0.7329479768786127,
"grad_norm": 0.2733290689658499,
"learning_rate": 4.0401708260507495e-05,
"loss": 0.9896,
"step": 3170
},
{
"epoch": 0.7341040462427746,
"grad_norm": 0.29448552437974784,
"learning_rate": 4.00781079717653e-05,
"loss": 0.9817,
"step": 3175
},
{
"epoch": 0.7352601156069364,
"grad_norm": 0.31792876220744604,
"learning_rate": 3.975548374865034e-05,
"loss": 0.98,
"step": 3180
},
{
"epoch": 0.7364161849710983,
"grad_norm": 0.2867123384035597,
"learning_rate": 3.943384084637732e-05,
"loss": 0.9845,
"step": 3185
},
{
"epoch": 0.7375722543352601,
"grad_norm": 0.29809867628254705,
"learning_rate": 3.9113184504176426e-05,
"loss": 0.9589,
"step": 3190
},
{
"epoch": 0.738728323699422,
"grad_norm": 0.2962633412794635,
"learning_rate": 3.879351994520774e-05,
"loss": 0.9644,
"step": 3195
},
{
"epoch": 0.7398843930635838,
"grad_norm": 0.3173965169375924,
"learning_rate": 3.847485237647614e-05,
"loss": 0.9243,
"step": 3200
},
{
"epoch": 0.7410404624277457,
"grad_norm": 0.2802160296625643,
"learning_rate": 3.815718698874672e-05,
"loss": 0.9627,
"step": 3205
},
{
"epoch": 0.7421965317919075,
"grad_norm": 0.30606251064274426,
"learning_rate": 3.7840528956459956e-05,
"loss": 1.0133,
"step": 3210
},
{
"epoch": 0.7433526011560694,
"grad_norm": 0.29567079239489624,
"learning_rate": 3.752488343764751e-05,
"loss": 1.0064,
"step": 3215
},
{
"epoch": 0.7445086705202312,
"grad_norm": 0.2908365758289181,
"learning_rate": 3.721025557384845e-05,
"loss": 0.939,
"step": 3220
},
{
"epoch": 0.7456647398843931,
"grad_norm": 0.2821772422176111,
"learning_rate": 3.689665049002513e-05,
"loss": 0.9176,
"step": 3225
},
{
"epoch": 0.7468208092485549,
"grad_norm": 0.28323988627298013,
"learning_rate": 3.658407329447986e-05,
"loss": 0.9504,
"step": 3230
},
{
"epoch": 0.7479768786127168,
"grad_norm": 0.3052577077686943,
"learning_rate": 3.627252907877184e-05,
"loss": 0.963,
"step": 3235
},
{
"epoch": 0.7491329479768786,
"grad_norm": 0.29287895420892424,
"learning_rate": 3.5962022917633976e-05,
"loss": 0.9758,
"step": 3240
},
{
"epoch": 0.7502890173410405,
"grad_norm": 0.26219424592081725,
"learning_rate": 3.56525598688904e-05,
"loss": 0.9501,
"step": 3245
},
{
"epoch": 0.7514450867052023,
"grad_norm": 0.2999575076011054,
"learning_rate": 3.534414497337406e-05,
"loss": 0.9979,
"step": 3250
},
{
"epoch": 0.7526011560693642,
"grad_norm": 0.30401739845864295,
"learning_rate": 3.503678325484448e-05,
"loss": 0.96,
"step": 3255
},
{
"epoch": 0.753757225433526,
"grad_norm": 0.2861448274685923,
"learning_rate": 3.473047971990605e-05,
"loss": 0.9956,
"step": 3260
},
{
"epoch": 0.7549132947976879,
"grad_norm": 0.2749463140020902,
"learning_rate": 3.442523935792651e-05,
"loss": 1.0207,
"step": 3265
},
{
"epoch": 0.7560693641618497,
"grad_norm": 0.2987193247584773,
"learning_rate": 3.4121067140955455e-05,
"loss": 0.9469,
"step": 3270
},
{
"epoch": 0.7572254335260116,
"grad_norm": 0.29389883736035816,
"learning_rate": 3.3817968023643766e-05,
"loss": 0.9934,
"step": 3275
},
{
"epoch": 0.7583815028901734,
"grad_norm": 0.28653966462371155,
"learning_rate": 3.351594694316239e-05,
"loss": 0.9634,
"step": 3280
},
{
"epoch": 0.7595375722543353,
"grad_norm": 0.29816187415165,
"learning_rate": 3.321500881912225e-05,
"loss": 0.9398,
"step": 3285
},
{
"epoch": 0.7606936416184971,
"grad_norm": 0.289319657394509,
"learning_rate": 3.29151585534941e-05,
"loss": 0.8928,
"step": 3290
},
{
"epoch": 0.761849710982659,
"grad_norm": 0.29521868021236264,
"learning_rate": 3.261640103052849e-05,
"loss": 0.9653,
"step": 3295
},
{
"epoch": 0.7630057803468208,
"grad_norm": 0.29934248191757035,
"learning_rate": 3.23187411166764e-05,
"loss": 0.97,
"step": 3300
},
{
"epoch": 0.7641618497109827,
"grad_norm": 0.32270679766275234,
"learning_rate": 3.2022183660509916e-05,
"loss": 0.9495,
"step": 3305
},
{
"epoch": 0.7653179190751445,
"grad_norm": 0.30814076544629976,
"learning_rate": 3.172673349264316e-05,
"loss": 0.9897,
"step": 3310
},
{
"epoch": 0.7664739884393064,
"grad_norm": 0.2731527495101352,
"learning_rate": 3.143239542565365e-05,
"loss": 0.9922,
"step": 3315
},
{
"epoch": 0.7676300578034682,
"grad_norm": 0.29822125577085,
"learning_rate": 3.113917425400406e-05,
"loss": 0.974,
"step": 3320
},
{
"epoch": 0.7687861271676301,
"grad_norm": 0.27831179956286994,
"learning_rate": 3.084707475396385e-05,
"loss": 0.9309,
"step": 3325
},
{
"epoch": 0.7699421965317919,
"grad_norm": 0.30061057019675924,
"learning_rate": 3.05561016835317e-05,
"loss": 0.9889,
"step": 3330
},
{
"epoch": 0.7710982658959538,
"grad_norm": 0.2839150865870422,
"learning_rate": 3.026625978235793e-05,
"loss": 0.9279,
"step": 3335
},
{
"epoch": 0.7722543352601156,
"grad_norm": 0.28191298234209533,
"learning_rate": 2.9977553771667178e-05,
"loss": 0.9043,
"step": 3340
},
{
"epoch": 0.7734104046242775,
"grad_norm": 0.2839643289974997,
"learning_rate": 2.968998835418174e-05,
"loss": 0.958,
"step": 3345
},
{
"epoch": 0.7745664739884393,
"grad_norm": 0.2727626163667981,
"learning_rate": 2.9403568214044687e-05,
"loss": 1.0045,
"step": 3350
},
{
"epoch": 0.7757225433526012,
"grad_norm": 0.2829586617421094,
"learning_rate": 2.9118298016743815e-05,
"loss": 0.9477,
"step": 3355
},
{
"epoch": 0.776878612716763,
"grad_norm": 0.2863725325553883,
"learning_rate": 2.8834182409035527e-05,
"loss": 1.0095,
"step": 3360
},
{
"epoch": 0.7780346820809249,
"grad_norm": 0.2981971073978521,
"learning_rate": 2.8551226018869105e-05,
"loss": 1.0128,
"step": 3365
},
{
"epoch": 0.7791907514450868,
"grad_norm": 0.2783020472744863,
"learning_rate": 2.8269433455311378e-05,
"loss": 0.9581,
"step": 3370
},
{
"epoch": 0.7803468208092486,
"grad_norm": 0.29194953070328256,
"learning_rate": 2.798880930847173e-05,
"loss": 0.9402,
"step": 3375
},
{
"epoch": 0.7815028901734105,
"grad_norm": 0.31785104915649315,
"learning_rate": 2.7709358149427113e-05,
"loss": 0.9912,
"step": 3380
},
{
"epoch": 0.7826589595375723,
"grad_norm": 0.27416031521109446,
"learning_rate": 2.7431084530147834e-05,
"loss": 0.9413,
"step": 3385
},
{
"epoch": 0.7838150289017342,
"grad_norm": 0.3023857012875686,
"learning_rate": 2.7153992983423283e-05,
"loss": 1.0101,
"step": 3390
},
{
"epoch": 0.784971098265896,
"grad_norm": 0.27883119977909965,
"learning_rate": 2.687808802278805e-05,
"loss": 0.9699,
"step": 3395
},
{
"epoch": 0.7861271676300579,
"grad_norm": 0.28656925239378356,
"learning_rate": 2.6603374142448467e-05,
"loss": 0.9459,
"step": 3400
},
{
"epoch": 0.7872832369942196,
"grad_norm": 0.2805856931300408,
"learning_rate": 2.632985581720947e-05,
"loss": 0.9347,
"step": 3405
},
{
"epoch": 0.7884393063583816,
"grad_norm": 0.28925155839879374,
"learning_rate": 2.6057537502401598e-05,
"loss": 0.9549,
"step": 3410
},
{
"epoch": 0.7895953757225433,
"grad_norm": 0.30326180151228377,
"learning_rate": 2.5786423633808487e-05,
"loss": 0.9954,
"step": 3415
},
{
"epoch": 0.7907514450867053,
"grad_norm": 0.310861152386562,
"learning_rate": 2.5516518627594542e-05,
"loss": 0.981,
"step": 3420
},
{
"epoch": 0.791907514450867,
"grad_norm": 0.29730609595809665,
"learning_rate": 2.524782688023305e-05,
"loss": 0.9933,
"step": 3425
},
{
"epoch": 0.793063583815029,
"grad_norm": 0.29011083826669604,
"learning_rate": 2.4980352768434643e-05,
"loss": 1.0396,
"step": 3430
},
{
"epoch": 0.7942196531791907,
"grad_norm": 0.2941491077374515,
"learning_rate": 2.4714100649075833e-05,
"loss": 0.9676,
"step": 3435
},
{
"epoch": 0.7953757225433526,
"grad_norm": 0.28795285220192574,
"learning_rate": 2.4449074859128197e-05,
"loss": 0.9545,
"step": 3440
},
{
"epoch": 0.7965317919075144,
"grad_norm": 0.2973713924271482,
"learning_rate": 2.4185279715587704e-05,
"loss": 0.925,
"step": 3445
},
{
"epoch": 0.7976878612716763,
"grad_norm": 0.28132269243331093,
"learning_rate": 2.39227195154043e-05,
"loss": 0.974,
"step": 3450
},
{
"epoch": 0.7988439306358381,
"grad_norm": 0.3130593570967486,
"learning_rate": 2.366139853541197e-05,
"loss": 0.9429,
"step": 3455
},
{
"epoch": 0.8,
"grad_norm": 0.28725553924621783,
"learning_rate": 2.340132103225916e-05,
"loss": 0.9609,
"step": 3460
},
{
"epoch": 0.8011560693641618,
"grad_norm": 0.2927313493050735,
"learning_rate": 2.3142491242339338e-05,
"loss": 0.9801,
"step": 3465
},
{
"epoch": 0.8023121387283237,
"grad_norm": 0.2959751791171696,
"learning_rate": 2.288491338172196e-05,
"loss": 0.9983,
"step": 3470
},
{
"epoch": 0.8034682080924855,
"grad_norm": 0.29009100429119816,
"learning_rate": 2.262859164608393e-05,
"loss": 0.9435,
"step": 3475
},
{
"epoch": 0.8046242774566474,
"grad_norm": 0.28065545174572615,
"learning_rate": 2.2373530210641103e-05,
"loss": 0.9664,
"step": 3480
},
{
"epoch": 0.8057803468208092,
"grad_norm": 0.3079082233043266,
"learning_rate": 2.2119733230080408e-05,
"loss": 0.9339,
"step": 3485
},
{
"epoch": 0.8069364161849711,
"grad_norm": 0.2994856214599522,
"learning_rate": 2.186720483849206e-05,
"loss": 0.9642,
"step": 3490
},
{
"epoch": 0.8080924855491329,
"grad_norm": 0.3150825598363239,
"learning_rate": 2.1615949149302305e-05,
"loss": 0.9391,
"step": 3495
},
{
"epoch": 0.8092485549132948,
"grad_norm": 0.28663989647188765,
"learning_rate": 2.1365970255206402e-05,
"loss": 0.9533,
"step": 3500
},
{
"epoch": 0.8104046242774566,
"grad_norm": 0.3064963810102998,
"learning_rate": 2.1117272228101902e-05,
"loss": 0.9269,
"step": 3505
},
{
"epoch": 0.8115606936416185,
"grad_norm": 0.2888662994868131,
"learning_rate": 2.0869859119022328e-05,
"loss": 0.9684,
"step": 3510
},
{
"epoch": 0.8127167630057803,
"grad_norm": 0.29449459192953004,
"learning_rate": 2.0623734958071296e-05,
"loss": 0.9588,
"step": 3515
},
{
"epoch": 0.8138728323699422,
"grad_norm": 0.28505307744008485,
"learning_rate": 2.037890375435677e-05,
"loss": 0.969,
"step": 3520
},
{
"epoch": 0.815028901734104,
"grad_norm": 0.264553046638561,
"learning_rate": 2.0135369495925714e-05,
"loss": 0.8865,
"step": 3525
},
{
"epoch": 0.8161849710982659,
"grad_norm": 0.2854615552898905,
"learning_rate": 1.9893136149699287e-05,
"loss": 0.9606,
"step": 3530
},
{
"epoch": 0.8173410404624277,
"grad_norm": 0.30132043472669484,
"learning_rate": 1.9652207661408073e-05,
"loss": 0.9783,
"step": 3535
},
{
"epoch": 0.8184971098265896,
"grad_norm": 0.28158707938088035,
"learning_rate": 1.941258795552785e-05,
"loss": 0.9702,
"step": 3540
},
{
"epoch": 0.8196531791907514,
"grad_norm": 0.3117678798282903,
"learning_rate": 1.917428093521576e-05,
"loss": 0.9381,
"step": 3545
},
{
"epoch": 0.8208092485549133,
"grad_norm": 0.2721446216347946,
"learning_rate": 1.8937290482246606e-05,
"loss": 0.9294,
"step": 3550
},
{
"epoch": 0.8219653179190751,
"grad_norm": 0.274947743569076,
"learning_rate": 1.870162045694971e-05,
"loss": 0.9874,
"step": 3555
},
{
"epoch": 0.823121387283237,
"grad_norm": 0.29164661845373113,
"learning_rate": 1.8467274698145942e-05,
"loss": 0.9641,
"step": 3560
},
{
"epoch": 0.8242774566473988,
"grad_norm": 0.2857254550756751,
"learning_rate": 1.8234257023085234e-05,
"loss": 0.9629,
"step": 3565
},
{
"epoch": 0.8254335260115607,
"grad_norm": 0.28344620289201494,
"learning_rate": 1.8002571227384467e-05,
"loss": 0.9213,
"step": 3570
},
{
"epoch": 0.8265895953757225,
"grad_norm": 0.2940307676845195,
"learning_rate": 1.777222108496558e-05,
"loss": 0.9567,
"step": 3575
},
{
"epoch": 0.8277456647398844,
"grad_norm": 0.29951372831586726,
"learning_rate": 1.7543210347994022e-05,
"loss": 0.9813,
"step": 3580
},
{
"epoch": 0.8289017341040462,
"grad_norm": 0.2878040658280256,
"learning_rate": 1.7315542746817825e-05,
"loss": 0.9152,
"step": 3585
},
{
"epoch": 0.8300578034682081,
"grad_norm": 0.30016211006512195,
"learning_rate": 1.7089221989906633e-05,
"loss": 0.9656,
"step": 3590
},
{
"epoch": 0.8312138728323699,
"grad_norm": 0.2744245197404456,
"learning_rate": 1.6864251763791428e-05,
"loss": 1.0234,
"step": 3595
},
{
"epoch": 0.8323699421965318,
"grad_norm": 0.28064881967639976,
"learning_rate": 1.664063573300446e-05,
"loss": 1.0006,
"step": 3600
},
{
"epoch": 0.8335260115606936,
"grad_norm": 0.28500326120194347,
"learning_rate": 1.6418377540019536e-05,
"loss": 0.9442,
"step": 3605
},
{
"epoch": 0.8346820809248555,
"grad_norm": 0.27741047130235014,
"learning_rate": 1.6197480805192634e-05,
"loss": 0.9173,
"step": 3610
},
{
"epoch": 0.8358381502890173,
"grad_norm": 0.2952544695479881,
"learning_rate": 1.5977949126703084e-05,
"loss": 0.9546,
"step": 3615
},
{
"epoch": 0.8369942196531792,
"grad_norm": 0.2753923526465453,
"learning_rate": 1.5759786080494743e-05,
"loss": 0.9922,
"step": 3620
},
{
"epoch": 0.838150289017341,
"grad_norm": 0.2713440737201456,
"learning_rate": 1.554299522021796e-05,
"loss": 0.9215,
"step": 3625
},
{
"epoch": 0.8393063583815029,
"grad_norm": 0.2924976737824655,
"learning_rate": 1.5327580077171587e-05,
"loss": 0.9553,
"step": 3630
},
{
"epoch": 0.8404624277456647,
"grad_norm": 0.28640397728398614,
"learning_rate": 1.5113544160245397e-05,
"loss": 0.9395,
"step": 3635
},
{
"epoch": 0.8416184971098266,
"grad_norm": 0.2859545519050961,
"learning_rate": 1.4900890955863067e-05,
"loss": 0.9711,
"step": 3640
},
{
"epoch": 0.8427745664739884,
"grad_norm": 0.3089713704921713,
"learning_rate": 1.4689623927925289e-05,
"loss": 1.0092,
"step": 3645
},
{
"epoch": 0.8439306358381503,
"grad_norm": 0.2837926097401027,
"learning_rate": 1.4479746517753335e-05,
"loss": 1.0052,
"step": 3650
},
{
"epoch": 0.8450867052023121,
"grad_norm": 0.30261005244224143,
"learning_rate": 1.4271262144033116e-05,
"loss": 0.9991,
"step": 3655
},
{
"epoch": 0.846242774566474,
"grad_norm": 0.2674381336270983,
"learning_rate": 1.4064174202759407e-05,
"loss": 0.9552,
"step": 3660
},
{
"epoch": 0.8473988439306358,
"grad_norm": 0.2767756118448426,
"learning_rate": 1.3858486067180465e-05,
"loss": 0.9573,
"step": 3665
},
{
"epoch": 0.8485549132947977,
"grad_norm": 0.2836915452259556,
"learning_rate": 1.3654201087743279e-05,
"loss": 0.9466,
"step": 3670
},
{
"epoch": 0.8497109826589595,
"grad_norm": 0.30898236268087276,
"learning_rate": 1.3451322592038774e-05,
"loss": 0.9379,
"step": 3675
},
{
"epoch": 0.8508670520231214,
"grad_norm": 0.30463841544406706,
"learning_rate": 1.3249853884747753e-05,
"loss": 0.9506,
"step": 3680
},
{
"epoch": 0.8520231213872832,
"grad_norm": 0.29760105531487135,
"learning_rate": 1.3049798247587064e-05,
"loss": 0.9603,
"step": 3685
},
{
"epoch": 0.8531791907514451,
"grad_norm": 0.2910112980743013,
"learning_rate": 1.2851158939256002e-05,
"loss": 0.9903,
"step": 3690
},
{
"epoch": 0.8543352601156069,
"grad_norm": 0.28051123703308095,
"learning_rate": 1.2653939195383446e-05,
"loss": 0.9552,
"step": 3695
},
{
"epoch": 0.8554913294797688,
"grad_norm": 0.32807409572083474,
"learning_rate": 1.2458142228474967e-05,
"loss": 0.9391,
"step": 3700
},
{
"epoch": 0.8566473988439306,
"grad_norm": 0.2857511643393787,
"learning_rate": 1.2263771227860555e-05,
"loss": 0.9249,
"step": 3705
},
{
"epoch": 0.8578034682080925,
"grad_norm": 0.3130878799401169,
"learning_rate": 1.2070829359642743e-05,
"loss": 0.9801,
"step": 3710
},
{
"epoch": 0.8589595375722543,
"grad_norm": 0.2857834049135047,
"learning_rate": 1.1879319766644969e-05,
"loss": 0.9881,
"step": 3715
},
{
"epoch": 0.8601156069364162,
"grad_norm": 0.28295250626185514,
"learning_rate": 1.168924556836034e-05,
"loss": 0.9212,
"step": 3720
},
{
"epoch": 0.861271676300578,
"grad_norm": 0.270720741833164,
"learning_rate": 1.1500609860900934e-05,
"loss": 0.9208,
"step": 3725
},
{
"epoch": 0.8624277456647399,
"grad_norm": 0.3310546829847928,
"learning_rate": 1.131341571694724e-05,
"loss": 0.9134,
"step": 3730
},
{
"epoch": 0.8635838150289017,
"grad_norm": 0.2906866125100536,
"learning_rate": 1.1127666185698183e-05,
"loss": 0.9292,
"step": 3735
},
{
"epoch": 0.8647398843930636,
"grad_norm": 0.2857402985603114,
"learning_rate": 1.0943364292821478e-05,
"loss": 0.9782,
"step": 3740
},
{
"epoch": 0.8658959537572254,
"grad_norm": 0.30621748628878964,
"learning_rate": 1.0760513040404275e-05,
"loss": 0.9538,
"step": 3745
},
{
"epoch": 0.8670520231213873,
"grad_norm": 0.3076325870742748,
"learning_rate": 1.0579115406904327e-05,
"loss": 0.9814,
"step": 3750
},
{
"epoch": 0.8682080924855491,
"grad_norm": 0.2875679417313269,
"learning_rate": 1.0399174347101404e-05,
"loss": 0.9682,
"step": 3755
},
{
"epoch": 0.869364161849711,
"grad_norm": 0.2827349143068363,
"learning_rate": 1.0220692792049169e-05,
"loss": 0.9715,
"step": 3760
},
{
"epoch": 0.8705202312138728,
"grad_norm": 0.2898950516764476,
"learning_rate": 1.0043673649027518e-05,
"loss": 0.9809,
"step": 3765
},
{
"epoch": 0.8716763005780347,
"grad_norm": 0.29127323235770597,
"learning_rate": 9.86811980149519e-06,
"loss": 0.9738,
"step": 3770
},
{
"epoch": 0.8728323699421965,
"grad_norm": 0.289489961872677,
"learning_rate": 9.694034109042694e-06,
"loss": 1.0206,
"step": 3775
},
{
"epoch": 0.8739884393063584,
"grad_norm": 0.2820602963126248,
"learning_rate": 9.521419407345878e-06,
"loss": 0.8967,
"step": 3780
},
{
"epoch": 0.8751445086705202,
"grad_norm": 0.31061259437004146,
"learning_rate": 9.350278508119636e-06,
"loss": 0.9786,
"step": 3785
},
{
"epoch": 0.8763005780346821,
"grad_norm": 0.2815496774647879,
"learning_rate": 9.180614199072146e-06,
"loss": 0.9485,
"step": 3790
},
{
"epoch": 0.8774566473988439,
"grad_norm": 0.3518473115040322,
"learning_rate": 9.012429243859487e-06,
"loss": 0.9903,
"step": 3795
},
{
"epoch": 0.8786127167630058,
"grad_norm": 0.2916712847778055,
"learning_rate": 8.845726382040597e-06,
"loss": 0.9397,
"step": 3800
},
{
"epoch": 0.8797687861271676,
"grad_norm": 0.29392008724784624,
"learning_rate": 8.680508329032589e-06,
"loss": 0.9629,
"step": 3805
},
{
"epoch": 0.8809248554913295,
"grad_norm": 0.28601993585254176,
"learning_rate": 8.516777776066643e-06,
"loss": 0.982,
"step": 3810
},
{
"epoch": 0.8820809248554913,
"grad_norm": 0.28346541614728327,
"learning_rate": 8.354537390144057e-06,
"loss": 0.9446,
"step": 3815
},
{
"epoch": 0.8832369942196532,
"grad_norm": 0.289284064978837,
"learning_rate": 8.19378981399287e-06,
"loss": 0.9647,
"step": 3820
},
{
"epoch": 0.884393063583815,
"grad_norm": 0.2867377684057541,
"learning_rate": 8.034537666024822e-06,
"loss": 0.9455,
"step": 3825
},
{
"epoch": 0.8855491329479769,
"grad_norm": 0.2756548624705791,
"learning_rate": 7.876783540292599e-06,
"loss": 1.0273,
"step": 3830
},
{
"epoch": 0.8867052023121387,
"grad_norm": 0.2936449029949148,
"learning_rate": 7.720530006447736e-06,
"loss": 0.9742,
"step": 3835
},
{
"epoch": 0.8878612716763006,
"grad_norm": 0.297967509970897,
"learning_rate": 7.565779609698631e-06,
"loss": 0.9329,
"step": 3840
},
{
"epoch": 0.8890173410404625,
"grad_norm": 0.2942735618729235,
"learning_rate": 7.412534870769116e-06,
"loss": 0.9581,
"step": 3845
},
{
"epoch": 0.8901734104046243,
"grad_norm": 0.28806737344976613,
"learning_rate": 7.260798285857484e-06,
"loss": 1.0075,
"step": 3850
},
{
"epoch": 0.8913294797687862,
"grad_norm": 0.2831951777045236,
"learning_rate": 7.110572326595711e-06,
"loss": 0.9971,
"step": 3855
},
{
"epoch": 0.892485549132948,
"grad_norm": 0.28520928967193043,
"learning_rate": 6.961859440009233e-06,
"loss": 1.0001,
"step": 3860
},
{
"epoch": 0.8936416184971099,
"grad_norm": 0.27689057228854386,
"learning_rate": 6.8146620484771495e-06,
"loss": 0.9889,
"step": 3865
},
{
"epoch": 0.8947976878612717,
"grad_norm": 0.285699685858991,
"learning_rate": 6.668982549692649e-06,
"loss": 0.9478,
"step": 3870
},
{
"epoch": 0.8959537572254336,
"grad_norm": 0.2754348125812895,
"learning_rate": 6.524823316624063e-06,
"loss": 0.9343,
"step": 3875
},
{
"epoch": 0.8971098265895954,
"grad_norm": 0.29989809168873266,
"learning_rate": 6.382186697476167e-06,
"loss": 0.9789,
"step": 3880
},
{
"epoch": 0.8982658959537573,
"grad_norm": 0.2828138414197748,
"learning_rate": 6.2410750156518985e-06,
"loss": 1.0062,
"step": 3885
},
{
"epoch": 0.8994219653179191,
"grad_norm": 0.2889085637993367,
"learning_rate": 6.101490569714574e-06,
"loss": 0.9696,
"step": 3890
},
{
"epoch": 0.900578034682081,
"grad_norm": 0.29960325302848234,
"learning_rate": 5.963435633350412e-06,
"loss": 0.9162,
"step": 3895
},
{
"epoch": 0.9017341040462428,
"grad_norm": 0.3005589289503204,
"learning_rate": 5.826912455331468e-06,
"loss": 0.9903,
"step": 3900
},
{
"epoch": 0.9028901734104047,
"grad_norm": 0.2928261089403607,
"learning_rate": 5.691923259479093e-06,
"loss": 0.9736,
"step": 3905
},
{
"epoch": 0.9040462427745665,
"grad_norm": 0.28429998893881614,
"learning_rate": 5.558470244627634e-06,
"loss": 0.9711,
"step": 3910
},
{
"epoch": 0.9052023121387284,
"grad_norm": 0.2784468816364227,
"learning_rate": 5.4265555845886215e-06,
"loss": 0.9811,
"step": 3915
},
{
"epoch": 0.9063583815028902,
"grad_norm": 0.2872050965542848,
"learning_rate": 5.29618142811541e-06,
"loss": 0.954,
"step": 3920
},
{
"epoch": 0.9075144508670521,
"grad_norm": 0.2948554706499198,
"learning_rate": 5.16734989886809e-06,
"loss": 0.9811,
"step": 3925
},
{
"epoch": 0.9086705202312139,
"grad_norm": 0.28399320997388067,
"learning_rate": 5.040063095379011e-06,
"loss": 0.9175,
"step": 3930
},
{
"epoch": 0.9098265895953758,
"grad_norm": 0.2855808882463355,
"learning_rate": 4.914323091018535e-06,
"loss": 0.9901,
"step": 3935
},
{
"epoch": 0.9109826589595376,
"grad_norm": 0.3021329131025513,
"learning_rate": 4.790131933961206e-06,
"loss": 0.9613,
"step": 3940
},
{
"epoch": 0.9121387283236995,
"grad_norm": 0.2862589716791116,
"learning_rate": 4.6674916471524995e-06,
"loss": 0.9961,
"step": 3945
},
{
"epoch": 0.9132947976878613,
"grad_norm": 0.2998472712073513,
"learning_rate": 4.546404228275824e-06,
"loss": 0.9845,
"step": 3950
},
{
"epoch": 0.9144508670520232,
"grad_norm": 0.28603652316011496,
"learning_rate": 4.426871649719932e-06,
"loss": 0.9765,
"step": 3955
},
{
"epoch": 0.915606936416185,
"grad_norm": 0.2872761079408887,
"learning_rate": 4.3088958585468686e-06,
"loss": 0.9997,
"step": 3960
},
{
"epoch": 0.9167630057803469,
"grad_norm": 0.2873199733628149,
"learning_rate": 4.192478776460229e-06,
"loss": 1.0064,
"step": 3965
},
{
"epoch": 0.9179190751445087,
"grad_norm": 0.27515435582137365,
"learning_rate": 4.077622299773831e-06,
"loss": 0.9189,
"step": 3970
},
{
"epoch": 0.9190751445086706,
"grad_norm": 0.27999010046883294,
"learning_rate": 3.96432829938086e-06,
"loss": 0.9885,
"step": 3975
},
{
"epoch": 0.9202312138728324,
"grad_norm": 0.2635891865916151,
"learning_rate": 3.8525986207233465e-06,
"loss": 0.9352,
"step": 3980
},
{
"epoch": 0.9213872832369943,
"grad_norm": 0.28633688125843376,
"learning_rate": 3.742435083762186e-06,
"loss": 0.9599,
"step": 3985
},
{
"epoch": 0.922543352601156,
"grad_norm": 0.2826637970887437,
"learning_rate": 3.633839482947421e-06,
"loss": 0.9948,
"step": 3990
},
{
"epoch": 0.923699421965318,
"grad_norm": 0.30153590546133957,
"learning_rate": 3.526813587189026e-06,
"loss": 0.9804,
"step": 3995
},
{
"epoch": 0.9248554913294798,
"grad_norm": 0.2858269823618572,
"learning_rate": 3.4213591398281175e-06,
"loss": 0.974,
"step": 4000
},
{
"epoch": 0.9260115606936417,
"grad_norm": 0.2820688133929364,
"learning_rate": 3.3174778586085643e-06,
"loss": 0.9566,
"step": 4005
},
{
"epoch": 0.9271676300578034,
"grad_norm": 0.3000844480451751,
"learning_rate": 3.2151714356489225e-06,
"loss": 1.0118,
"step": 4010
},
{
"epoch": 0.9283236994219654,
"grad_norm": 0.2790666100533931,
"learning_rate": 3.114441537415014e-06,
"loss": 0.9533,
"step": 4015
},
{
"epoch": 0.9294797687861271,
"grad_norm": 0.29749566810305794,
"learning_rate": 3.0152898046926557e-06,
"loss": 0.9569,
"step": 4020
},
{
"epoch": 0.930635838150289,
"grad_norm": 0.28219907290129675,
"learning_rate": 2.917717852560997e-06,
"loss": 0.9512,
"step": 4025
},
{
"epoch": 0.9317919075144508,
"grad_norm": 0.29888043306041123,
"learning_rate": 2.8217272703661923e-06,
"loss": 1.0231,
"step": 4030
},
{
"epoch": 0.9329479768786128,
"grad_norm": 0.26919643142805166,
"learning_rate": 2.727319621695501e-06,
"loss": 0.9873,
"step": 4035
},
{
"epoch": 0.9341040462427745,
"grad_norm": 0.29380224096591,
"learning_rate": 2.6344964443518526e-06,
"loss": 0.9385,
"step": 4040
},
{
"epoch": 0.9352601156069364,
"grad_norm": 0.28299962979190113,
"learning_rate": 2.5432592503288e-06,
"loss": 0.9292,
"step": 4045
},
{
"epoch": 0.9364161849710982,
"grad_norm": 0.29962326178158855,
"learning_rate": 2.453609525785816e-06,
"loss": 0.9773,
"step": 4050
},
{
"epoch": 0.9375722543352601,
"grad_norm": 0.32360192327986553,
"learning_rate": 2.3655487310241585e-06,
"loss": 0.9529,
"step": 4055
},
{
"epoch": 0.9387283236994219,
"grad_norm": 0.30410924441522585,
"learning_rate": 2.279078300463089e-06,
"loss": 1.0106,
"step": 4060
},
{
"epoch": 0.9398843930635838,
"grad_norm": 0.30431377413637145,
"learning_rate": 2.1941996426164344e-06,
"loss": 0.9735,
"step": 4065
},
{
"epoch": 0.9410404624277456,
"grad_norm": 0.2903468628517127,
"learning_rate": 2.1109141400697418e-06,
"loss": 0.972,
"step": 4070
},
{
"epoch": 0.9421965317919075,
"grad_norm": 0.2954528106164249,
"learning_rate": 2.029223149457682e-06,
"loss": 0.9662,
"step": 4075
},
{
"epoch": 0.9433526011560693,
"grad_norm": 0.27439734246785746,
"learning_rate": 1.949128001441969e-06,
"loss": 0.9509,
"step": 4080
},
{
"epoch": 0.9445086705202312,
"grad_norm": 0.27172305484407105,
"learning_rate": 1.8706300006896882e-06,
"loss": 0.8628,
"step": 4085
},
{
"epoch": 0.945664739884393,
"grad_norm": 0.27216092212065524,
"learning_rate": 1.7937304258520692e-06,
"loss": 0.9785,
"step": 4090
},
{
"epoch": 0.9468208092485549,
"grad_norm": 0.2918352349773759,
"learning_rate": 1.718430529543613e-06,
"loss": 0.9571,
"step": 4095
},
{
"epoch": 0.9479768786127167,
"grad_norm": 0.272999376680257,
"learning_rate": 1.6447315383217643e-06,
"loss": 0.918,
"step": 4100
},
{
"epoch": 0.9491329479768786,
"grad_norm": 0.27901699608664005,
"learning_rate": 1.5726346526668156e-06,
"loss": 0.9528,
"step": 4105
},
{
"epoch": 0.9502890173410404,
"grad_norm": 0.28289047639247433,
"learning_rate": 1.5021410469624465e-06,
"loss": 0.9067,
"step": 4110
},
{
"epoch": 0.9514450867052023,
"grad_norm": 0.2825322282067992,
"learning_rate": 1.4332518694765707e-06,
"loss": 0.8735,
"step": 4115
},
{
"epoch": 0.9526011560693641,
"grad_norm": 0.28837647678016226,
"learning_rate": 1.3659682423425968e-06,
"loss": 0.9432,
"step": 4120
},
{
"epoch": 0.953757225433526,
"grad_norm": 0.2876129817030531,
"learning_rate": 1.30029126154122e-06,
"loss": 0.9268,
"step": 4125
},
{
"epoch": 0.9549132947976878,
"grad_norm": 0.2792371887058164,
"learning_rate": 1.236221996882514e-06,
"loss": 0.9726,
"step": 4130
},
{
"epoch": 0.9560693641618497,
"grad_norm": 0.29099718253951457,
"learning_rate": 1.1737614919885008e-06,
"loss": 0.9209,
"step": 4135
},
{
"epoch": 0.9572254335260115,
"grad_norm": 0.2995617326772327,
"learning_rate": 1.1129107642761872e-06,
"loss": 0.9187,
"step": 4140
},
{
"epoch": 0.9583815028901734,
"grad_norm": 0.2762400721384854,
"learning_rate": 1.053670804940987e-06,
"loss": 0.9142,
"step": 4145
},
{
"epoch": 0.9595375722543352,
"grad_norm": 0.2931863784593866,
"learning_rate": 9.960425789405258e-07,
"loss": 0.9084,
"step": 4150
},
{
"epoch": 0.9606936416184971,
"grad_norm": 0.2880567882354411,
"learning_rate": 9.400270249789955e-07,
"loss": 0.9868,
"step": 4155
},
{
"epoch": 0.9618497109826589,
"grad_norm": 0.2871636561882668,
"learning_rate": 8.856250554918344e-07,
"loss": 0.9152,
"step": 4160
},
{
"epoch": 0.9630057803468208,
"grad_norm": 0.28517050924588744,
"learning_rate": 8.328375566308166e-07,
"loss": 0.9757,
"step": 4165
},
{
"epoch": 0.9641618497109826,
"grad_norm": 0.28587044682235146,
"learning_rate": 7.816653882496971e-07,
"loss": 0.9537,
"step": 4170
},
{
"epoch": 0.9653179190751445,
"grad_norm": 0.29706875055091925,
"learning_rate": 7.321093838901449e-07,
"loss": 0.9247,
"step": 4175
},
{
"epoch": 0.9664739884393063,
"grad_norm": 0.2861635930847044,
"learning_rate": 6.841703507682206e-07,
"loss": 0.8905,
"step": 4180
},
{
"epoch": 0.9676300578034682,
"grad_norm": 0.2813627882340819,
"learning_rate": 6.378490697611761e-07,
"loss": 0.9749,
"step": 4185
},
{
"epoch": 0.96878612716763,
"grad_norm": 0.2865848575770194,
"learning_rate": 5.93146295394742e-07,
"loss": 0.9443,
"step": 4190
},
{
"epoch": 0.9699421965317919,
"grad_norm": 0.27882077648739195,
"learning_rate": 5.500627558308713e-07,
"loss": 0.9398,
"step": 4195
},
{
"epoch": 0.9710982658959537,
"grad_norm": 0.3065313177380281,
"learning_rate": 5.085991528558487e-07,
"loss": 0.9236,
"step": 4200
},
{
"epoch": 0.9722543352601156,
"grad_norm": 0.2871839420940439,
"learning_rate": 4.687561618688663e-07,
"loss": 0.9676,
"step": 4205
},
{
"epoch": 0.9734104046242774,
"grad_norm": 0.2637381920338272,
"learning_rate": 4.3053443187103247e-07,
"loss": 0.8815,
"step": 4210
},
{
"epoch": 0.9745664739884393,
"grad_norm": 0.285755856830912,
"learning_rate": 3.9393458545479157e-07,
"loss": 0.9926,
"step": 4215
},
{
"epoch": 0.9757225433526011,
"grad_norm": 0.30359812477643444,
"learning_rate": 3.589572187937651e-07,
"loss": 0.9604,
"step": 4220
},
{
"epoch": 0.976878612716763,
"grad_norm": 0.29778116804881444,
"learning_rate": 3.2560290163307083e-07,
"loss": 0.9788,
"step": 4225
},
{
"epoch": 0.9780346820809248,
"grad_norm": 0.2794224102916479,
"learning_rate": 2.93872177280019e-07,
"loss": 0.9427,
"step": 4230
},
{
"epoch": 0.9791907514450867,
"grad_norm": 0.3008883017130643,
"learning_rate": 2.637655625952973e-07,
"loss": 0.9475,
"step": 4235
},
{
"epoch": 0.9803468208092485,
"grad_norm": 0.28456190197348324,
"learning_rate": 2.3528354798451058e-07,
"loss": 0.9461,
"step": 4240
},
{
"epoch": 0.9815028901734104,
"grad_norm": 0.2829229588306782,
"learning_rate": 2.0842659739019887e-07,
"loss": 0.9524,
"step": 4245
},
{
"epoch": 0.9826589595375722,
"grad_norm": 0.283105235302632,
"learning_rate": 1.8319514828430973e-07,
"loss": 0.9474,
"step": 4250
},
{
"epoch": 0.9838150289017341,
"grad_norm": 0.27690208166745106,
"learning_rate": 1.5958961166104847e-07,
"loss": 0.9351,
"step": 4255
},
{
"epoch": 0.9849710982658959,
"grad_norm": 0.25712842285219684,
"learning_rate": 1.3761037203017245e-07,
"loss": 0.914,
"step": 4260
},
{
"epoch": 0.9861271676300578,
"grad_norm": 0.2735837511043363,
"learning_rate": 1.1725778741076276e-07,
"loss": 0.8924,
"step": 4265
},
{
"epoch": 0.9872832369942196,
"grad_norm": 0.2820464817023967,
"learning_rate": 9.853218932536212e-08,
"loss": 0.9359,
"step": 4270
},
{
"epoch": 0.9884393063583815,
"grad_norm": 0.27653871874823904,
"learning_rate": 8.143388279460151e-08,
"loss": 0.9704,
"step": 4275
},
{
"epoch": 0.9895953757225433,
"grad_norm": 0.2836276601553615,
"learning_rate": 6.596314633219303e-08,
"loss": 0.9966,
"step": 4280
},
{
"epoch": 0.9907514450867052,
"grad_norm": 0.27880904921583405,
"learning_rate": 5.2120231940433474e-08,
"loss": 0.9227,
"step": 4285
},
{
"epoch": 0.991907514450867,
"grad_norm": 0.2657505634085022,
"learning_rate": 3.990536510608544e-08,
"loss": 0.9657,
"step": 4290
},
{
"epoch": 0.9930635838150289,
"grad_norm": 0.2908473414800192,
"learning_rate": 2.9318744796669183e-08,
"loss": 0.9206,
"step": 4295
},
{
"epoch": 0.9942196531791907,
"grad_norm": 0.2957217704373017,
"learning_rate": 2.036054345729843e-08,
"loss": 0.9528,
"step": 4300
},
{
"epoch": 0.9953757225433526,
"grad_norm": 0.2910406474246415,
"learning_rate": 1.3030907007793857e-08,
"loss": 1.0005,
"step": 4305
},
{
"epoch": 0.9965317919075144,
"grad_norm": 0.2796254503042765,
"learning_rate": 7.329954840362696e-09,
"loss": 1.0259,
"step": 4310
},
{
"epoch": 0.9976878612716763,
"grad_norm": 0.2844197423108749,
"learning_rate": 3.257779817600337e-09,
"loss": 0.9363,
"step": 4315
},
{
"epoch": 0.9988439306358381,
"grad_norm": 0.3388694141182201,
"learning_rate": 8.144482710248369e-10,
"loss": 1.0117,
"step": 4320
},
{
"epoch": 1.0,
"grad_norm": 0.29406054557823574,
"learning_rate": 0.0,
"loss": 0.9703,
"step": 4325
},
{
"epoch": 1.0,
"eval_runtime": 3.3948,
"eval_samples_per_second": 2.946,
"eval_steps_per_second": 0.884,
"step": 4325
},
{
"epoch": 1.0,
"step": 4325,
"total_flos": 1.71674607550464e+16,
"train_loss": 0.9719910388461427,
"train_runtime": 20047.9922,
"train_samples_per_second": 3.452,
"train_steps_per_second": 0.216
}
],
"logging_steps": 5,
"max_steps": 4325,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.71674607550464e+16,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}