{
"best_metric": 1.208633542060852,
"best_model_checkpoint": "/mnt/users/n3thakur/vectara/huggingface-dpo/trained_models/v3/Mistral-7B-Instruct-v0.2-miracl-raft-sft-v2.0/checkpoint-2000",
"epoch": 0.9997531473710195,
"eval_steps": 200,
"global_step": 2025,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0004937052579609973,
"grad_norm": 3.3225639243513934,
"learning_rate": 4.926108374384237e-08,
"loss": 1.6639,
"step": 1
},
{
"epoch": 0.0024685262898049864,
"grad_norm": 2.953462068009019,
"learning_rate": 2.4630541871921185e-07,
"loss": 1.7388,
"step": 5
},
{
"epoch": 0.004937052579609973,
"grad_norm": 2.4057426602070646,
"learning_rate": 4.926108374384237e-07,
"loss": 1.5798,
"step": 10
},
{
"epoch": 0.00740557886941496,
"grad_norm": 3.186500367639329,
"learning_rate": 7.389162561576356e-07,
"loss": 1.659,
"step": 15
},
{
"epoch": 0.009874105159219946,
"grad_norm": 2.396847476828635,
"learning_rate": 9.852216748768474e-07,
"loss": 1.6374,
"step": 20
},
{
"epoch": 0.012342631449024932,
"grad_norm": 2.5391731431636106,
"learning_rate": 1.2315270935960593e-06,
"loss": 1.6314,
"step": 25
},
{
"epoch": 0.01481115773882992,
"grad_norm": 2.2574409610476462,
"learning_rate": 1.4778325123152712e-06,
"loss": 1.5888,
"step": 30
},
{
"epoch": 0.017279684028634903,
"grad_norm": 1.8342813542038656,
"learning_rate": 1.724137931034483e-06,
"loss": 1.5412,
"step": 35
},
{
"epoch": 0.01974821031843989,
"grad_norm": 1.8380484598711315,
"learning_rate": 1.970443349753695e-06,
"loss": 1.4889,
"step": 40
},
{
"epoch": 0.02221673660824488,
"grad_norm": 1.613058249947001,
"learning_rate": 2.2167487684729067e-06,
"loss": 1.5403,
"step": 45
},
{
"epoch": 0.024685262898049863,
"grad_norm": 1.8920016704605567,
"learning_rate": 2.4630541871921186e-06,
"loss": 1.4831,
"step": 50
},
{
"epoch": 0.02715378918785485,
"grad_norm": 1.2203038504329438,
"learning_rate": 2.70935960591133e-06,
"loss": 1.4233,
"step": 55
},
{
"epoch": 0.02962231547765984,
"grad_norm": 1.095184752565883,
"learning_rate": 2.9556650246305424e-06,
"loss": 1.4255,
"step": 60
},
{
"epoch": 0.03209084176746482,
"grad_norm": 0.8448900941377993,
"learning_rate": 3.201970443349754e-06,
"loss": 1.4892,
"step": 65
},
{
"epoch": 0.03455936805726981,
"grad_norm": 0.9405655862570454,
"learning_rate": 3.448275862068966e-06,
"loss": 1.4673,
"step": 70
},
{
"epoch": 0.0370278943470748,
"grad_norm": 0.6713923929675227,
"learning_rate": 3.6945812807881777e-06,
"loss": 1.4148,
"step": 75
},
{
"epoch": 0.03949642063687978,
"grad_norm": 0.7755902373813679,
"learning_rate": 3.94088669950739e-06,
"loss": 1.4867,
"step": 80
},
{
"epoch": 0.04196494692668477,
"grad_norm": 0.7400218273582495,
"learning_rate": 4.1871921182266015e-06,
"loss": 1.3834,
"step": 85
},
{
"epoch": 0.04443347321648976,
"grad_norm": 0.7245551973919236,
"learning_rate": 4.4334975369458135e-06,
"loss": 1.44,
"step": 90
},
{
"epoch": 0.04690199950629474,
"grad_norm": 0.6731482962094358,
"learning_rate": 4.6798029556650245e-06,
"loss": 1.362,
"step": 95
},
{
"epoch": 0.049370525796099726,
"grad_norm": 0.7105341248736622,
"learning_rate": 4.926108374384237e-06,
"loss": 1.3716,
"step": 100
},
{
"epoch": 0.05183905208590472,
"grad_norm": 0.6774223469533757,
"learning_rate": 5.172413793103449e-06,
"loss": 1.4056,
"step": 105
},
{
"epoch": 0.0543075783757097,
"grad_norm": 0.6745745206164803,
"learning_rate": 5.41871921182266e-06,
"loss": 1.337,
"step": 110
},
{
"epoch": 0.056776104665514686,
"grad_norm": 0.5935854583319804,
"learning_rate": 5.665024630541872e-06,
"loss": 1.3615,
"step": 115
},
{
"epoch": 0.05924463095531968,
"grad_norm": 0.556924082351685,
"learning_rate": 5.911330049261085e-06,
"loss": 1.4248,
"step": 120
},
{
"epoch": 0.06171315724512466,
"grad_norm": 0.5427807889259738,
"learning_rate": 6.157635467980296e-06,
"loss": 1.3286,
"step": 125
},
{
"epoch": 0.06418168353492965,
"grad_norm": 0.6063519166723843,
"learning_rate": 6.403940886699508e-06,
"loss": 1.3176,
"step": 130
},
{
"epoch": 0.06665020982473463,
"grad_norm": 0.5670363529677274,
"learning_rate": 6.65024630541872e-06,
"loss": 1.3273,
"step": 135
},
{
"epoch": 0.06911873611453961,
"grad_norm": 0.5846835420330245,
"learning_rate": 6.896551724137932e-06,
"loss": 1.326,
"step": 140
},
{
"epoch": 0.07158726240434461,
"grad_norm": 0.5686293276495719,
"learning_rate": 7.1428571428571436e-06,
"loss": 1.376,
"step": 145
},
{
"epoch": 0.0740557886941496,
"grad_norm": 0.5275299029056365,
"learning_rate": 7.3891625615763555e-06,
"loss": 1.3364,
"step": 150
},
{
"epoch": 0.07652431498395458,
"grad_norm": 0.5464387846115857,
"learning_rate": 7.635467980295567e-06,
"loss": 1.3654,
"step": 155
},
{
"epoch": 0.07899284127375956,
"grad_norm": 0.5229203264129956,
"learning_rate": 7.88177339901478e-06,
"loss": 1.3009,
"step": 160
},
{
"epoch": 0.08146136756356455,
"grad_norm": 0.5746356087172889,
"learning_rate": 8.12807881773399e-06,
"loss": 1.2611,
"step": 165
},
{
"epoch": 0.08392989385336953,
"grad_norm": 0.5922232695792946,
"learning_rate": 8.374384236453203e-06,
"loss": 1.3643,
"step": 170
},
{
"epoch": 0.08639842014317453,
"grad_norm": 0.5295655281983137,
"learning_rate": 8.620689655172414e-06,
"loss": 1.3165,
"step": 175
},
{
"epoch": 0.08886694643297952,
"grad_norm": 0.5850731545805168,
"learning_rate": 8.866995073891627e-06,
"loss": 1.3105,
"step": 180
},
{
"epoch": 0.0913354727227845,
"grad_norm": 0.5551320012809824,
"learning_rate": 9.113300492610838e-06,
"loss": 1.278,
"step": 185
},
{
"epoch": 0.09380399901258948,
"grad_norm": 0.5711739398485313,
"learning_rate": 9.359605911330049e-06,
"loss": 1.3197,
"step": 190
},
{
"epoch": 0.09627252530239447,
"grad_norm": 0.5559894427552352,
"learning_rate": 9.605911330049262e-06,
"loss": 1.3409,
"step": 195
},
{
"epoch": 0.09874105159219945,
"grad_norm": 0.5671580477100892,
"learning_rate": 9.852216748768475e-06,
"loss": 1.3095,
"step": 200
},
{
"epoch": 0.09874105159219945,
"eval_loss": 1.2800045013427734,
"eval_runtime": 2727.7043,
"eval_samples_per_second": 1.466,
"eval_steps_per_second": 0.122,
"step": 200
},
{
"epoch": 0.10120957788200444,
"grad_norm": 0.5279833943359168,
"learning_rate": 9.999970269475589e-06,
"loss": 1.2966,
"step": 205
},
{
"epoch": 0.10367810417180943,
"grad_norm": 0.5644566149733632,
"learning_rate": 9.99963580513638e-06,
"loss": 1.2874,
"step": 210
},
{
"epoch": 0.10614663046161442,
"grad_norm": 0.6258961143244912,
"learning_rate": 9.998929738244678e-06,
"loss": 1.3209,
"step": 215
},
{
"epoch": 0.1086151567514194,
"grad_norm": 0.4834621448531187,
"learning_rate": 9.997852121279563e-06,
"loss": 1.3313,
"step": 220
},
{
"epoch": 0.11108368304122439,
"grad_norm": 0.5481752837030147,
"learning_rate": 9.996403034335912e-06,
"loss": 1.2738,
"step": 225
},
{
"epoch": 0.11355220933102937,
"grad_norm": 0.5886589355414898,
"learning_rate": 9.994582585118449e-06,
"loss": 1.2758,
"step": 230
},
{
"epoch": 0.11602073562083436,
"grad_norm": 0.5757139355018718,
"learning_rate": 9.992390908933746e-06,
"loss": 1.3187,
"step": 235
},
{
"epoch": 0.11848926191063935,
"grad_norm": 0.5464825333851621,
"learning_rate": 9.989828168680164e-06,
"loss": 1.3677,
"step": 240
},
{
"epoch": 0.12095778820044434,
"grad_norm": 0.6372982363888493,
"learning_rate": 9.986894554835735e-06,
"loss": 1.2668,
"step": 245
},
{
"epoch": 0.12342631449024932,
"grad_norm": 0.5445141174147589,
"learning_rate": 9.983590285444025e-06,
"loss": 1.2917,
"step": 250
},
{
"epoch": 0.1258948407800543,
"grad_norm": 0.6832031232821291,
"learning_rate": 9.979915606097907e-06,
"loss": 1.2675,
"step": 255
},
{
"epoch": 0.1283633670698593,
"grad_norm": 0.62128138673847,
"learning_rate": 9.975870789921322e-06,
"loss": 1.3187,
"step": 260
},
{
"epoch": 0.13083189335966428,
"grad_norm": 0.5161196413352727,
"learning_rate": 9.971456137548971e-06,
"loss": 1.3031,
"step": 265
},
{
"epoch": 0.13330041964946926,
"grad_norm": 0.5524745641605668,
"learning_rate": 9.966671977103972e-06,
"loss": 1.2749,
"step": 270
},
{
"epoch": 0.13576894593927424,
"grad_norm": 0.6669242272051678,
"learning_rate": 9.961518664173473e-06,
"loss": 1.3409,
"step": 275
},
{
"epoch": 0.13823747222907923,
"grad_norm": 0.5555562003933405,
"learning_rate": 9.955996581782218e-06,
"loss": 1.2468,
"step": 280
},
{
"epoch": 0.14070599851888424,
"grad_norm": 0.6244202172570701,
"learning_rate": 9.950106140364089e-06,
"loss": 1.3318,
"step": 285
},
{
"epoch": 0.14317452480868922,
"grad_norm": 0.5100271270558925,
"learning_rate": 9.943847777731584e-06,
"loss": 1.2522,
"step": 290
},
{
"epoch": 0.1456430510984942,
"grad_norm": 0.5482368116306139,
"learning_rate": 9.937221959043294e-06,
"loss": 1.3044,
"step": 295
},
{
"epoch": 0.1481115773882992,
"grad_norm": 0.5919271032213149,
"learning_rate": 9.93022917676932e-06,
"loss": 1.3131,
"step": 300
},
{
"epoch": 0.15058010367810418,
"grad_norm": 0.5428829828459178,
"learning_rate": 9.922869950654662e-06,
"loss": 1.2306,
"step": 305
},
{
"epoch": 0.15304862996790916,
"grad_norm": 0.5461192699131175,
"learning_rate": 9.915144827680606e-06,
"loss": 1.3151,
"step": 310
},
{
"epoch": 0.15551715625771415,
"grad_norm": 0.5113904915941117,
"learning_rate": 9.907054382024058e-06,
"loss": 1.2813,
"step": 315
},
{
"epoch": 0.15798568254751913,
"grad_norm": 0.6272053783824121,
"learning_rate": 9.898599215014868e-06,
"loss": 1.3064,
"step": 320
},
{
"epoch": 0.1604542088373241,
"grad_norm": 0.5671094073178861,
"learning_rate": 9.889779955091142e-06,
"loss": 1.2734,
"step": 325
},
{
"epoch": 0.1629227351271291,
"grad_norm": 0.582371136771928,
"learning_rate": 9.880597257752522e-06,
"loss": 1.3075,
"step": 330
},
{
"epoch": 0.16539126141693408,
"grad_norm": 0.5520015589132342,
"learning_rate": 9.87105180551148e-06,
"loss": 1.2802,
"step": 335
},
{
"epoch": 0.16785978770673907,
"grad_norm": 0.5937587353133906,
"learning_rate": 9.861144307842574e-06,
"loss": 1.2893,
"step": 340
},
{
"epoch": 0.17032831399654405,
"grad_norm": 0.5371728696508287,
"learning_rate": 9.850875501129726e-06,
"loss": 1.219,
"step": 345
},
{
"epoch": 0.17279684028634906,
"grad_norm": 0.5892603164875664,
"learning_rate": 9.840246148611485e-06,
"loss": 1.3094,
"step": 350
},
{
"epoch": 0.17526536657615405,
"grad_norm": 0.5502008403202052,
"learning_rate": 9.829257040324308e-06,
"loss": 1.2543,
"step": 355
},
{
"epoch": 0.17773389286595903,
"grad_norm": 0.6273336128612022,
"learning_rate": 9.817908993043819e-06,
"loss": 1.3107,
"step": 360
},
{
"epoch": 0.18020241915576402,
"grad_norm": 0.5761032807193177,
"learning_rate": 9.806202850224123e-06,
"loss": 1.2657,
"step": 365
},
{
"epoch": 0.182670945445569,
"grad_norm": 0.5628854954179761,
"learning_rate": 9.794139481935108e-06,
"loss": 1.258,
"step": 370
},
{
"epoch": 0.18513947173537398,
"grad_norm": 0.5637909618250402,
"learning_rate": 9.781719784797773e-06,
"loss": 1.2406,
"step": 375
},
{
"epoch": 0.18760799802517897,
"grad_norm": 0.5212794091813217,
"learning_rate": 9.768944681917582e-06,
"loss": 1.2391,
"step": 380
},
{
"epoch": 0.19007652431498395,
"grad_norm": 0.6416799620777229,
"learning_rate": 9.755815122815871e-06,
"loss": 1.3188,
"step": 385
},
{
"epoch": 0.19254505060478894,
"grad_norm": 0.5487444911675088,
"learning_rate": 9.742332083359252e-06,
"loss": 1.2884,
"step": 390
},
{
"epoch": 0.19501357689459392,
"grad_norm": 0.5697317991057302,
"learning_rate": 9.728496565687096e-06,
"loss": 1.2798,
"step": 395
},
{
"epoch": 0.1974821031843989,
"grad_norm": 0.6703007559314436,
"learning_rate": 9.714309598137045e-06,
"loss": 1.249,
"step": 400
},
{
"epoch": 0.1974821031843989,
"eval_loss": 1.2516121864318848,
"eval_runtime": 2575.7168,
"eval_samples_per_second": 1.553,
"eval_steps_per_second": 0.13,
"step": 400
},
{
"epoch": 0.1999506294742039,
"grad_norm": 0.526231295870319,
"learning_rate": 9.699772235168572e-06,
"loss": 1.2554,
"step": 405
},
{
"epoch": 0.20241915576400887,
"grad_norm": 0.5513334850915074,
"learning_rate": 9.68488555728462e-06,
"loss": 1.2753,
"step": 410
},
{
"epoch": 0.20488768205381389,
"grad_norm": 0.5979774809603526,
"learning_rate": 9.669650670951282e-06,
"loss": 1.2562,
"step": 415
},
{
"epoch": 0.20735620834361887,
"grad_norm": 0.5596269907913185,
"learning_rate": 9.654068708515564e-06,
"loss": 1.2829,
"step": 420
},
{
"epoch": 0.20982473463342385,
"grad_norm": 0.5593282633769885,
"learning_rate": 9.638140828121232e-06,
"loss": 1.2843,
"step": 425
},
{
"epoch": 0.21229326092322884,
"grad_norm": 0.5775937654131708,
"learning_rate": 9.621868213622713e-06,
"loss": 1.3001,
"step": 430
},
{
"epoch": 0.21476178721303382,
"grad_norm": 0.5661901033745343,
"learning_rate": 9.605252074497125e-06,
"loss": 1.3038,
"step": 435
},
{
"epoch": 0.2172303135028388,
"grad_norm": 0.6132749209816828,
"learning_rate": 9.588293645754363e-06,
"loss": 1.2843,
"step": 440
},
{
"epoch": 0.2196988397926438,
"grad_norm": 0.5624360623535388,
"learning_rate": 9.570994187845323e-06,
"loss": 1.2342,
"step": 445
},
{
"epoch": 0.22216736608244878,
"grad_norm": 0.5567610470805882,
"learning_rate": 9.553354986568201e-06,
"loss": 1.2955,
"step": 450
},
{
"epoch": 0.22463589237225376,
"grad_norm": 0.6255724221196046,
"learning_rate": 9.53537735297294e-06,
"loss": 1.2921,
"step": 455
},
{
"epoch": 0.22710441866205874,
"grad_norm": 0.5322242379012073,
"learning_rate": 9.517062623263768e-06,
"loss": 1.3011,
"step": 460
},
{
"epoch": 0.22957294495186373,
"grad_norm": 0.5444205798338807,
"learning_rate": 9.498412158699905e-06,
"loss": 1.2733,
"step": 465
},
{
"epoch": 0.2320414712416687,
"grad_norm": 0.5426713243893322,
"learning_rate": 9.479427345494366e-06,
"loss": 1.2312,
"step": 470
},
{
"epoch": 0.23450999753147372,
"grad_norm": 0.5871783813919782,
"learning_rate": 9.460109594710942e-06,
"loss": 1.3655,
"step": 475
},
{
"epoch": 0.2369785238212787,
"grad_norm": 0.574852380091512,
"learning_rate": 9.440460342159314e-06,
"loss": 1.2915,
"step": 480
},
{
"epoch": 0.2394470501110837,
"grad_norm": 0.5336092545421678,
"learning_rate": 9.42048104828834e-06,
"loss": 1.2963,
"step": 485
},
{
"epoch": 0.24191557640088868,
"grad_norm": 0.5998428802300876,
"learning_rate": 9.40017319807751e-06,
"loss": 1.3058,
"step": 490
},
{
"epoch": 0.24438410269069366,
"grad_norm": 0.5421507806800733,
"learning_rate": 9.379538300926553e-06,
"loss": 1.2881,
"step": 495
},
{
"epoch": 0.24685262898049865,
"grad_norm": 0.5358621498972941,
"learning_rate": 9.358577890543277e-06,
"loss": 1.2602,
"step": 500
},
{
"epoch": 0.24932115527030363,
"grad_norm": 0.564112204428148,
"learning_rate": 9.33729352482956e-06,
"loss": 1.279,
"step": 505
},
{
"epoch": 0.2517896815601086,
"grad_norm": 0.6382679375882034,
"learning_rate": 9.315686785765556e-06,
"loss": 1.2534,
"step": 510
},
{
"epoch": 0.2542582078499136,
"grad_norm": 0.5744585475791394,
"learning_rate": 9.293759279292116e-06,
"loss": 1.2744,
"step": 515
},
{
"epoch": 0.2567267341397186,
"grad_norm": 0.615942623926986,
"learning_rate": 9.271512635191427e-06,
"loss": 1.3055,
"step": 520
},
{
"epoch": 0.25919526042952357,
"grad_norm": 0.5780670121734512,
"learning_rate": 9.248948506965877e-06,
"loss": 1.3175,
"step": 525
},
{
"epoch": 0.26166378671932855,
"grad_norm": 0.5777138377025286,
"learning_rate": 9.22606857171515e-06,
"loss": 1.2869,
"step": 530
},
{
"epoch": 0.26413231300913353,
"grad_norm": 0.5611724611846367,
"learning_rate": 9.202874530011583e-06,
"loss": 1.3199,
"step": 535
},
{
"epoch": 0.2666008392989385,
"grad_norm": 0.540794710590132,
"learning_rate": 9.179368105773768e-06,
"loss": 1.208,
"step": 540
},
{
"epoch": 0.2690693655887435,
"grad_norm": 0.5581497544995145,
"learning_rate": 9.155551046138408e-06,
"loss": 1.2638,
"step": 545
},
{
"epoch": 0.2715378918785485,
"grad_norm": 0.560865648598851,
"learning_rate": 9.131425121330477e-06,
"loss": 1.2629,
"step": 550
},
{
"epoch": 0.27400641816835347,
"grad_norm": 0.5458754463390333,
"learning_rate": 9.10699212453164e-06,
"loss": 1.2578,
"step": 555
},
{
"epoch": 0.27647494445815846,
"grad_norm": 0.5468153448281193,
"learning_rate": 9.082253871746962e-06,
"loss": 1.2488,
"step": 560
},
{
"epoch": 0.27894347074796344,
"grad_norm": 0.6168084406611584,
"learning_rate": 9.057212201669952e-06,
"loss": 1.2931,
"step": 565
},
{
"epoch": 0.2814119970377685,
"grad_norm": 0.5767023372783159,
"learning_rate": 9.031868975545884e-06,
"loss": 1.2267,
"step": 570
},
{
"epoch": 0.28388052332757346,
"grad_norm": 0.5315895904457054,
"learning_rate": 9.006226077033464e-06,
"loss": 1.2463,
"step": 575
},
{
"epoch": 0.28634904961737845,
"grad_norm": 0.5616058952533509,
"learning_rate": 8.980285412064827e-06,
"loss": 1.287,
"step": 580
},
{
"epoch": 0.28881757590718343,
"grad_norm": 0.5746998443271042,
"learning_rate": 8.954048908703873e-06,
"loss": 1.2929,
"step": 585
},
{
"epoch": 0.2912861021969884,
"grad_norm": 0.5551746835964705,
"learning_rate": 8.92751851700297e-06,
"loss": 1.298,
"step": 590
},
{
"epoch": 0.2937546284867934,
"grad_norm": 0.578564867995815,
"learning_rate": 8.900696208857996e-06,
"loss": 1.2973,
"step": 595
},
{
"epoch": 0.2962231547765984,
"grad_norm": 0.5925663520696334,
"learning_rate": 8.873583977861802e-06,
"loss": 1.2514,
"step": 600
},
{
"epoch": 0.2962231547765984,
"eval_loss": 1.2368682622909546,
"eval_runtime": 2566.7596,
"eval_samples_per_second": 1.558,
"eval_steps_per_second": 0.13,
"step": 600
},
{
"epoch": 0.29869168106640337,
"grad_norm": 0.5605310856508363,
"learning_rate": 8.846183839156015e-06,
"loss": 1.286,
"step": 605
},
{
"epoch": 0.30116020735620835,
"grad_norm": 0.6632798685747615,
"learning_rate": 8.818497829281272e-06,
"loss": 1.2916,
"step": 610
},
{
"epoch": 0.30362873364601334,
"grad_norm": 0.6145012170463651,
"learning_rate": 8.790528006025848e-06,
"loss": 1.2788,
"step": 615
},
{
"epoch": 0.3060972599358183,
"grad_norm": 0.6017170291600934,
"learning_rate": 8.762276448272709e-06,
"loss": 1.3156,
"step": 620
},
{
"epoch": 0.3085657862256233,
"grad_norm": 0.5728547538871892,
"learning_rate": 8.733745255844996e-06,
"loss": 1.2592,
"step": 625
},
{
"epoch": 0.3110343125154283,
"grad_norm": 0.558142508046803,
"learning_rate": 8.70493654934996e-06,
"loss": 1.309,
"step": 630
},
{
"epoch": 0.3135028388052333,
"grad_norm": 0.5596812007471911,
"learning_rate": 8.675852470021344e-06,
"loss": 1.2746,
"step": 635
},
{
"epoch": 0.31597136509503826,
"grad_norm": 0.5909265132847957,
"learning_rate": 8.646495179560221e-06,
"loss": 1.2686,
"step": 640
},
{
"epoch": 0.31843989138484324,
"grad_norm": 0.6185942591784858,
"learning_rate": 8.616866859974344e-06,
"loss": 1.2759,
"step": 645
},
{
"epoch": 0.3209084176746482,
"grad_norm": 0.6157204431679958,
"learning_rate": 8.586969713415949e-06,
"loss": 1.2957,
"step": 650
},
{
"epoch": 0.3233769439644532,
"grad_norm": 0.5974197754755597,
"learning_rate": 8.556805962018091e-06,
"loss": 1.27,
"step": 655
},
{
"epoch": 0.3258454702542582,
"grad_norm": 0.5389440161380957,
"learning_rate": 8.526377847729475e-06,
"loss": 1.2925,
"step": 660
},
{
"epoch": 0.3283139965440632,
"grad_norm": 0.5370983741740369,
"learning_rate": 8.495687632147817e-06,
"loss": 1.2522,
"step": 665
},
{
"epoch": 0.33078252283386816,
"grad_norm": 0.5639132359450145,
"learning_rate": 8.46473759635176e-06,
"loss": 1.2595,
"step": 670
},
{
"epoch": 0.33325104912367315,
"grad_norm": 0.5598705018251675,
"learning_rate": 8.433530040731321e-06,
"loss": 1.2746,
"step": 675
},
{
"epoch": 0.33571957541347813,
"grad_norm": 0.6303186487688077,
"learning_rate": 8.402067284816919e-06,
"loss": 1.2701,
"step": 680
},
{
"epoch": 0.3381881017032831,
"grad_norm": 0.562747309348665,
"learning_rate": 8.370351667106969e-06,
"loss": 1.2305,
"step": 685
},
{
"epoch": 0.3406566279930881,
"grad_norm": 0.5720387765798051,
"learning_rate": 8.338385544894073e-06,
"loss": 1.2047,
"step": 690
},
{
"epoch": 0.3431251542828931,
"grad_norm": 0.5465830505695308,
"learning_rate": 8.306171294089808e-06,
"loss": 1.2507,
"step": 695
},
{
"epoch": 0.3455936805726981,
"grad_norm": 0.5572297207326813,
"learning_rate": 8.273711309048145e-06,
"loss": 1.2599,
"step": 700
},
{
"epoch": 0.3480622068625031,
"grad_norm": 0.5916945311296786,
"learning_rate": 8.241008002387474e-06,
"loss": 1.2615,
"step": 705
},
{
"epoch": 0.3505307331523081,
"grad_norm": 0.6326075200444886,
"learning_rate": 8.208063804811293e-06,
"loss": 1.2559,
"step": 710
},
{
"epoch": 0.3529992594421131,
"grad_norm": 0.6229843020575793,
"learning_rate": 8.174881164927535e-06,
"loss": 1.2652,
"step": 715
},
{
"epoch": 0.35546778573191806,
"grad_norm": 0.5926153932237264,
"learning_rate": 8.141462549066581e-06,
"loss": 1.2423,
"step": 720
},
{
"epoch": 0.35793631202172305,
"grad_norm": 0.5293071287095781,
"learning_rate": 8.107810441097948e-06,
"loss": 1.2185,
"step": 725
},
{
"epoch": 0.36040483831152803,
"grad_norm": 0.5950082298726722,
"learning_rate": 8.073927342245663e-06,
"loss": 1.2458,
"step": 730
},
{
"epoch": 0.362873364601333,
"grad_norm": 0.5437872955630408,
"learning_rate": 8.039815770902368e-06,
"loss": 1.2699,
"step": 735
},
{
"epoch": 0.365341890891138,
"grad_norm": 0.5842632003875607,
"learning_rate": 8.005478262442132e-06,
"loss": 1.2489,
"step": 740
},
{
"epoch": 0.367810417180943,
"grad_norm": 0.5957543279120926,
"learning_rate": 7.970917369032011e-06,
"loss": 1.2808,
"step": 745
},
{
"epoch": 0.37027894347074797,
"grad_norm": 0.5573632520708609,
"learning_rate": 7.936135659442355e-06,
"loss": 1.2394,
"step": 750
},
{
"epoch": 0.37274746976055295,
"grad_norm": 0.5383442104756702,
"learning_rate": 7.901135718855877e-06,
"loss": 1.2584,
"step": 755
},
{
"epoch": 0.37521599605035794,
"grad_norm": 0.5269547291918393,
"learning_rate": 7.86592014867551e-06,
"loss": 1.32,
"step": 760
},
{
"epoch": 0.3776845223401629,
"grad_norm": 0.6059173481615415,
"learning_rate": 7.830491566331063e-06,
"loss": 1.2705,
"step": 765
},
{
"epoch": 0.3801530486299679,
"grad_norm": 0.5905241537228486,
"learning_rate": 7.794852605084661e-06,
"loss": 1.2661,
"step": 770
},
{
"epoch": 0.3826215749197729,
"grad_norm": 0.6119492506708828,
"learning_rate": 7.759005913835048e-06,
"loss": 1.2573,
"step": 775
},
{
"epoch": 0.3850901012095779,
"grad_norm": 0.6449864393640712,
"learning_rate": 7.722954156920675e-06,
"loss": 1.2681,
"step": 780
},
{
"epoch": 0.38755862749938286,
"grad_norm": 0.5777516112864801,
"learning_rate": 7.686700013921704e-06,
"loss": 1.2999,
"step": 785
},
{
"epoch": 0.39002715378918784,
"grad_norm": 0.5818063096150684,
"learning_rate": 7.650246179460826e-06,
"loss": 1.2842,
"step": 790
},
{
"epoch": 0.3924956800789928,
"grad_norm": 0.5844315528318011,
"learning_rate": 7.613595363002977e-06,
"loss": 1.2995,
"step": 795
},
{
"epoch": 0.3949642063687978,
"grad_norm": 0.5560255613889942,
"learning_rate": 7.57675028865397e-06,
"loss": 1.275,
"step": 800
},
{
"epoch": 0.3949642063687978,
"eval_loss": 1.2263342142105103,
"eval_runtime": 2463.6634,
"eval_samples_per_second": 1.624,
"eval_steps_per_second": 0.136,
"step": 800
},
{
"epoch": 0.3974327326586028,
"grad_norm": 0.5523940138743026,
"learning_rate": 7.539713694958013e-06,
"loss": 1.2202,
"step": 805
},
{
"epoch": 0.3999012589484078,
"grad_norm": 0.5936001183365429,
"learning_rate": 7.502488334694167e-06,
"loss": 1.2444,
"step": 810
},
{
"epoch": 0.40236978523821276,
"grad_norm": 0.6143038376732798,
"learning_rate": 7.465076974671739e-06,
"loss": 1.2032,
"step": 815
},
{
"epoch": 0.40483831152801775,
"grad_norm": 0.5865451493919344,
"learning_rate": 7.427482395524646e-06,
"loss": 1.2733,
"step": 820
},
{
"epoch": 0.4073068378178228,
"grad_norm": 0.5980943581114722,
"learning_rate": 7.389707391504728e-06,
"loss": 1.2732,
"step": 825
},
{
"epoch": 0.40977536410762777,
"grad_norm": 0.6323487686008166,
"learning_rate": 7.35175477027408e-06,
"loss": 1.244,
"step": 830
},
{
"epoch": 0.41224389039743276,
"grad_norm": 0.6562081554973773,
"learning_rate": 7.313627352696353e-06,
"loss": 1.2642,
"step": 835
},
{
"epoch": 0.41471241668723774,
"grad_norm": 0.5554470118072983,
"learning_rate": 7.2753279726271e-06,
"loss": 1.2556,
"step": 840
},
{
"epoch": 0.4171809429770427,
"grad_norm": 0.5740654163988275,
"learning_rate": 7.236859476703148e-06,
"loss": 1.2292,
"step": 845
},
{
"epoch": 0.4196494692668477,
"grad_norm": 0.6062582969566837,
"learning_rate": 7.198224724131012e-06,
"loss": 1.235,
"step": 850
},
{
"epoch": 0.4221179955566527,
"grad_norm": 0.5434614048201878,
"learning_rate": 7.159426586474388e-06,
"loss": 1.2224,
"step": 855
},
{
"epoch": 0.4245865218464577,
"grad_norm": 0.5254561702235886,
"learning_rate": 7.120467947440719e-06,
"loss": 1.2557,
"step": 860
},
{
"epoch": 0.42705504813626266,
"grad_norm": 0.5713031391494172,
"learning_rate": 7.081351702666863e-06,
"loss": 1.2063,
"step": 865
},
{
"epoch": 0.42952357442606764,
"grad_norm": 0.5969980245366532,
"learning_rate": 7.042080759503866e-06,
"loss": 1.2418,
"step": 870
},
{
"epoch": 0.43199210071587263,
"grad_norm": 0.5718940130718101,
"learning_rate": 7.00265803680088e-06,
"loss": 1.2108,
"step": 875
},
{
"epoch": 0.4344606270056776,
"grad_norm": 0.6045555591926912,
"learning_rate": 6.963086464688209e-06,
"loss": 1.2597,
"step": 880
},
{
"epoch": 0.4369291532954826,
"grad_norm": 0.5566709780037437,
"learning_rate": 6.923368984359526e-06,
"loss": 1.2174,
"step": 885
},
{
"epoch": 0.4393976795852876,
"grad_norm": 0.5630200258106689,
"learning_rate": 6.883508547853268e-06,
"loss": 1.2244,
"step": 890
},
{
"epoch": 0.44186620587509257,
"grad_norm": 0.5348314552481888,
"learning_rate": 6.843508117833224e-06,
"loss": 1.2687,
"step": 895
},
{
"epoch": 0.44433473216489755,
"grad_norm": 0.49625311943608336,
"learning_rate": 6.8033706673683276e-06,
"loss": 1.1986,
"step": 900
},
{
"epoch": 0.44680325845470253,
"grad_norm": 0.5542218838145379,
"learning_rate": 6.763099179711685e-06,
"loss": 1.2286,
"step": 905
},
{
"epoch": 0.4492717847445075,
"grad_norm": 0.594098893943127,
"learning_rate": 6.722696648078838e-06,
"loss": 1.2335,
"step": 910
},
{
"epoch": 0.4517403110343125,
"grad_norm": 0.5478077068384012,
"learning_rate": 6.682166075425298e-06,
"loss": 1.264,
"step": 915
},
{
"epoch": 0.4542088373241175,
"grad_norm": 0.5727528301850252,
"learning_rate": 6.641510474223338e-06,
"loss": 1.226,
"step": 920
},
{
"epoch": 0.45667736361392247,
"grad_norm": 0.5888269073825134,
"learning_rate": 6.600732866238097e-06,
"loss": 1.212,
"step": 925
},
{
"epoch": 0.45914588990372746,
"grad_norm": 0.5736288265128395,
"learning_rate": 6.559836282302984e-06,
"loss": 1.25,
"step": 930
},
{
"epoch": 0.46161441619353244,
"grad_norm": 0.6651036803926929,
"learning_rate": 6.5188237620943965e-06,
"loss": 1.2672,
"step": 935
},
{
"epoch": 0.4640829424833374,
"grad_norm": 0.5547382454730273,
"learning_rate": 6.477698353905808e-06,
"loss": 1.2887,
"step": 940
},
{
"epoch": 0.4665514687731424,
"grad_norm": 0.5627833712727636,
"learning_rate": 6.436463114421199e-06,
"loss": 1.2674,
"step": 945
},
{
"epoch": 0.46901999506294745,
"grad_norm": 0.5562108977867529,
"learning_rate": 6.395121108487855e-06,
"loss": 1.2973,
"step": 950
},
{
"epoch": 0.47148852135275243,
"grad_norm": 0.5940300188918287,
"learning_rate": 6.353675408888582e-06,
"loss": 1.278,
"step": 955
},
{
"epoch": 0.4739570476425574,
"grad_norm": 0.6499724681591359,
"learning_rate": 6.312129096113313e-06,
"loss": 1.242,
"step": 960
},
{
"epoch": 0.4764255739323624,
"grad_norm": 0.5794092582819724,
"learning_rate": 6.270485258130146e-06,
"loss": 1.2263,
"step": 965
},
{
"epoch": 0.4788941002221674,
"grad_norm": 0.5810005883829364,
"learning_rate": 6.228746990155831e-06,
"loss": 1.2166,
"step": 970
},
{
"epoch": 0.48136262651197237,
"grad_norm": 0.5523321758038612,
"learning_rate": 6.186917394425715e-06,
"loss": 1.2666,
"step": 975
},
{
"epoch": 0.48383115280177735,
"grad_norm": 0.5353766340095819,
"learning_rate": 6.144999579963164e-06,
"loss": 1.2332,
"step": 980
},
{
"epoch": 0.48629967909158234,
"grad_norm": 0.5962559333577797,
"learning_rate": 6.102996662348485e-06,
"loss": 1.2985,
"step": 985
},
{
"epoch": 0.4887682053813873,
"grad_norm": 0.573508927377536,
"learning_rate": 6.060911763487353e-06,
"loss": 1.2353,
"step": 990
},
{
"epoch": 0.4912367316711923,
"grad_norm": 0.6190411186907346,
"learning_rate": 6.0187480113787765e-06,
"loss": 1.2668,
"step": 995
},
{
"epoch": 0.4937052579609973,
"grad_norm": 0.537107101144104,
"learning_rate": 5.976508539882604e-06,
"loss": 1.1984,
"step": 1000
},
{
"epoch": 0.4937052579609973,
"eval_loss": 1.2196881771087646,
"eval_runtime": 2373.8686,
"eval_samples_per_second": 1.685,
"eval_steps_per_second": 0.141,
"step": 1000
},
{
"epoch": 0.4961737842508023,
"grad_norm": 0.5673334311067016,
"learning_rate": 5.934196488486594e-06,
"loss": 1.2573,
"step": 1005
},
{
"epoch": 0.49864231054060726,
"grad_norm": 0.6141102747872601,
"learning_rate": 5.891815002073081e-06,
"loss": 1.2776,
"step": 1010
},
{
"epoch": 0.5011108368304122,
"grad_norm": 0.5866475421501153,
"learning_rate": 5.849367230685214e-06,
"loss": 1.2139,
"step": 1015
},
{
"epoch": 0.5035793631202172,
"grad_norm": 0.5973223110810923,
"learning_rate": 5.806856329292839e-06,
"loss": 1.2809,
"step": 1020
},
{
"epoch": 0.5060478894100222,
"grad_norm": 0.6385978269750231,
"learning_rate": 5.764285457557994e-06,
"loss": 1.2511,
"step": 1025
},
{
"epoch": 0.5085164156998272,
"grad_norm": 0.5607340345191899,
"learning_rate": 5.721657779600071e-06,
"loss": 1.2421,
"step": 1030
},
{
"epoch": 0.5109849419896322,
"grad_norm": 0.5444555426859482,
"learning_rate": 5.678976463760635e-06,
"loss": 1.2561,
"step": 1035
},
{
"epoch": 0.5134534682794372,
"grad_norm": 0.5663913305474535,
"learning_rate": 5.636244682367937e-06,
"loss": 1.2324,
"step": 1040
},
{
"epoch": 0.5159219945692421,
"grad_norm": 0.6001697304401695,
"learning_rate": 5.593465611501127e-06,
"loss": 1.2206,
"step": 1045
},
{
"epoch": 0.5183905208590471,
"grad_norm": 0.5922209574486257,
"learning_rate": 5.5506424307541895e-06,
"loss": 1.2777,
"step": 1050
},
{
"epoch": 0.5208590471488521,
"grad_norm": 0.5810845811643376,
"learning_rate": 5.507778322999615e-06,
"loss": 1.2186,
"step": 1055
},
{
"epoch": 0.5233275734386571,
"grad_norm": 0.5661815755139697,
"learning_rate": 5.464876474151835e-06,
"loss": 1.2465,
"step": 1060
},
{
"epoch": 0.5257960997284621,
"grad_norm": 0.6016645517449551,
"learning_rate": 5.421940072930415e-06,
"loss": 1.2269,
"step": 1065
},
{
"epoch": 0.5282646260182671,
"grad_norm": 0.6268744087157316,
"learning_rate": 5.3789723106230675e-06,
"loss": 1.2089,
"step": 1070
},
{
"epoch": 0.530733152308072,
"grad_norm": 0.5374231313658383,
"learning_rate": 5.3359763808484396e-06,
"loss": 1.2371,
"step": 1075
},
{
"epoch": 0.533201678597877,
"grad_norm": 0.5696825743006079,
"learning_rate": 5.292955479318756e-06,
"loss": 1.2288,
"step": 1080
},
{
"epoch": 0.535670204887682,
"grad_norm": 0.5474403893705062,
"learning_rate": 5.249912803602287e-06,
"loss": 1.2631,
"step": 1085
},
{
"epoch": 0.538138731177487,
"grad_norm": 0.611438366860115,
"learning_rate": 5.206851552885691e-06,
"loss": 1.2395,
"step": 1090
},
{
"epoch": 0.540607257467292,
"grad_norm": 0.6437738368971478,
"learning_rate": 5.163774927736228e-06,
"loss": 1.3132,
"step": 1095
},
{
"epoch": 0.543075783757097,
"grad_norm": 0.5438676695949717,
"learning_rate": 5.120686129863882e-06,
"loss": 1.2807,
"step": 1100
},
{
"epoch": 0.545544310046902,
"grad_norm": 0.6135072081701597,
"learning_rate": 5.077588361883379e-06,
"loss": 1.2239,
"step": 1105
},
{
"epoch": 0.5480128363367069,
"grad_norm": 0.546701645842348,
"learning_rate": 5.0344848270761635e-06,
"loss": 1.2121,
"step": 1110
},
{
"epoch": 0.5504813626265119,
"grad_norm": 0.6153049309551597,
"learning_rate": 4.9913787291523e-06,
"loss": 1.2832,
"step": 1115
},
{
"epoch": 0.5529498889163169,
"grad_norm": 0.6148368644966669,
"learning_rate": 4.948273272012363e-06,
"loss": 1.2536,
"step": 1120
},
{
"epoch": 0.5554184152061219,
"grad_norm": 0.5911800001869699,
"learning_rate": 4.905171659509294e-06,
"loss": 1.2789,
"step": 1125
},
{
"epoch": 0.5578869414959269,
"grad_norm": 0.5450128065258734,
"learning_rate": 4.862077095210284e-06,
"loss": 1.1595,
"step": 1130
},
{
"epoch": 0.5603554677857319,
"grad_norm": 0.5629093671549396,
"learning_rate": 4.818992782158658e-06,
"loss": 1.2854,
"step": 1135
},
{
"epoch": 0.562823994075537,
"grad_norm": 0.6634778146032412,
"learning_rate": 4.775921922635806e-06,
"loss": 1.2405,
"step": 1140
},
{
"epoch": 0.5652925203653419,
"grad_norm": 0.5439361692157106,
"learning_rate": 4.732867717923174e-06,
"loss": 1.265,
"step": 1145
},
{
"epoch": 0.5677610466551469,
"grad_norm": 0.5860651769650387,
"learning_rate": 4.689833368064326e-06,
"loss": 1.2511,
"step": 1150
},
{
"epoch": 0.5702295729449519,
"grad_norm": 0.627265270599233,
"learning_rate": 4.646822071627089e-06,
"loss": 1.2813,
"step": 1155
},
{
"epoch": 0.5726980992347569,
"grad_norm": 0.5634927900565491,
"learning_rate": 4.603837025465829e-06,
"loss": 1.22,
"step": 1160
},
{
"epoch": 0.5751666255245619,
"grad_norm": 0.6482363315867818,
"learning_rate": 4.560881424483833e-06,
"loss": 1.3095,
"step": 1165
},
{
"epoch": 0.5776351518143669,
"grad_norm": 0.4805380958857345,
"learning_rate": 4.517958461395846e-06,
"loss": 1.2737,
"step": 1170
},
{
"epoch": 0.5801036781041718,
"grad_norm": 0.5854150858325277,
"learning_rate": 4.475071326490781e-06,
"loss": 1.2282,
"step": 1175
},
{
"epoch": 0.5825722043939768,
"grad_norm": 0.554230131541799,
"learning_rate": 4.432223207394577e-06,
"loss": 1.178,
"step": 1180
},
{
"epoch": 0.5850407306837818,
"grad_norm": 0.6930360615517788,
"learning_rate": 4.389417288833292e-06,
"loss": 1.2781,
"step": 1185
},
{
"epoch": 0.5875092569735868,
"grad_norm": 0.6042088339838697,
"learning_rate": 4.346656752396388e-06,
"loss": 1.2813,
"step": 1190
},
{
"epoch": 0.5899777832633918,
"grad_norm": 0.6280387565672664,
"learning_rate": 4.303944776300262e-06,
"loss": 1.2433,
"step": 1195
},
{
"epoch": 0.5924463095531968,
"grad_norm": 0.5502891803034431,
"learning_rate": 4.261284535152016e-06,
"loss": 1.1556,
"step": 1200
},
{
"epoch": 0.5924463095531968,
"eval_loss": 1.2148913145065308,
"eval_runtime": 2558.7024,
"eval_samples_per_second": 1.563,
"eval_steps_per_second": 0.131,
"step": 1200
},
{
"epoch": 0.5949148358430018,
"grad_norm": 0.5429417971755677,
"learning_rate": 4.218679199713505e-06,
"loss": 1.2398,
"step": 1205
},
{
"epoch": 0.5973833621328067,
"grad_norm": 0.5573592415141271,
"learning_rate": 4.176131936665669e-06,
"loss": 1.2348,
"step": 1210
},
{
"epoch": 0.5998518884226117,
"grad_norm": 0.5662130620287456,
"learning_rate": 4.133645908373159e-06,
"loss": 1.1894,
"step": 1215
},
{
"epoch": 0.6023204147124167,
"grad_norm": 0.5330337777111593,
"learning_rate": 4.0912242726493e-06,
"loss": 1.267,
"step": 1220
},
{
"epoch": 0.6047889410022217,
"grad_norm": 0.589763462299109,
"learning_rate": 4.048870182521374e-06,
"loss": 1.2461,
"step": 1225
},
{
"epoch": 0.6072574672920267,
"grad_norm": 0.5798241574940401,
"learning_rate": 4.006586785996285e-06,
"loss": 1.2503,
"step": 1230
},
{
"epoch": 0.6097259935818317,
"grad_norm": 0.5714021679563045,
"learning_rate": 3.96437722582656e-06,
"loss": 1.2322,
"step": 1235
},
{
"epoch": 0.6121945198716366,
"grad_norm": 0.5926307509257247,
"learning_rate": 3.922244639276773e-06,
"loss": 1.2692,
"step": 1240
},
{
"epoch": 0.6146630461614416,
"grad_norm": 0.6016557090563102,
"learning_rate": 3.880192157890365e-06,
"loss": 1.2642,
"step": 1245
},
{
"epoch": 0.6171315724512466,
"grad_norm": 0.5454381088492659,
"learning_rate": 3.838222907256884e-06,
"loss": 1.239,
"step": 1250
},
{
"epoch": 0.6196000987410516,
"grad_norm": 0.5582749852816064,
"learning_rate": 3.7963400067796774e-06,
"loss": 1.2851,
"step": 1255
},
{
"epoch": 0.6220686250308566,
"grad_norm": 0.5562967849735465,
"learning_rate": 3.7545465694440363e-06,
"loss": 1.2432,
"step": 1260
},
{
"epoch": 0.6245371513206616,
"grad_norm": 0.5419669962437569,
"learning_rate": 3.7128457015858198e-06,
"loss": 1.2103,
"step": 1265
},
{
"epoch": 0.6270056776104665,
"grad_norm": 0.558873424565738,
"learning_rate": 3.6712405026605792e-06,
"loss": 1.2388,
"step": 1270
},
{
"epoch": 0.6294742039002715,
"grad_norm": 0.5712282397945332,
"learning_rate": 3.6297340650131785e-06,
"loss": 1.2819,
"step": 1275
},
{
"epoch": 0.6319427301900765,
"grad_norm": 0.5643697726223241,
"learning_rate": 3.5883294736479612e-06,
"loss": 1.2386,
"step": 1280
},
{
"epoch": 0.6344112564798815,
"grad_norm": 0.6332020317807455,
"learning_rate": 3.5470298059994545e-06,
"loss": 1.2677,
"step": 1285
},
{
"epoch": 0.6368797827696865,
"grad_norm": 0.6276157822500693,
"learning_rate": 3.5058381317036285e-06,
"loss": 1.2137,
"step": 1290
},
{
"epoch": 0.6393483090594915,
"grad_norm": 0.5139753708360036,
"learning_rate": 3.46475751236975e-06,
"loss": 1.2436,
"step": 1295
},
{
"epoch": 0.6418168353492965,
"grad_norm": 0.5868933304811402,
"learning_rate": 3.423791001352823e-06,
"loss": 1.1681,
"step": 1300
},
{
"epoch": 0.6442853616391014,
"grad_norm": 0.5592137564928078,
"learning_rate": 3.382941643526644e-06,
"loss": 1.2443,
"step": 1305
},
{
"epoch": 0.6467538879289064,
"grad_norm": 0.567548616583169,
"learning_rate": 3.3422124750574902e-06,
"loss": 1.2604,
"step": 1310
},
{
"epoch": 0.6492224142187114,
"grad_norm": 0.568882999500645,
"learning_rate": 3.3016065231784587e-06,
"loss": 1.1595,
"step": 1315
},
{
"epoch": 0.6516909405085164,
"grad_norm": 0.628304707671549,
"learning_rate": 3.2611268059644535e-06,
"loss": 1.2841,
"step": 1320
},
{
"epoch": 0.6541594667983214,
"grad_norm": 0.5686219665932154,
"learning_rate": 3.2207763321078737e-06,
"loss": 1.2347,
"step": 1325
},
{
"epoch": 0.6566279930881264,
"grad_norm": 0.6424587872522304,
"learning_rate": 3.1805581006949856e-06,
"loss": 1.2329,
"step": 1330
},
{
"epoch": 0.6590965193779313,
"grad_norm": 0.6654374856920555,
"learning_rate": 3.1404751009830124e-06,
"loss": 1.2423,
"step": 1335
},
{
"epoch": 0.6615650456677363,
"grad_norm": 0.5206675422652753,
"learning_rate": 3.100530312177956e-06,
"loss": 1.2329,
"step": 1340
},
{
"epoch": 0.6640335719575413,
"grad_norm": 0.6656795155578475,
"learning_rate": 3.0607267032131704e-06,
"loss": 1.3062,
"step": 1345
},
{
"epoch": 0.6665020982473463,
"grad_norm": 0.6071844948708964,
"learning_rate": 3.0210672325286806e-06,
"loss": 1.2656,
"step": 1350
},
{
"epoch": 0.6689706245371513,
"grad_norm": 0.6211025479318184,
"learning_rate": 2.9815548478513034e-06,
"loss": 1.2167,
"step": 1355
},
{
"epoch": 0.6714391508269563,
"grad_norm": 0.5801456765244887,
"learning_rate": 2.9421924859755525e-06,
"loss": 1.2249,
"step": 1360
},
{
"epoch": 0.6739076771167613,
"grad_norm": 0.564862030285346,
"learning_rate": 2.9029830725453545e-06,
"loss": 1.2414,
"step": 1365
},
{
"epoch": 0.6763762034065662,
"grad_norm": 0.5538133203567932,
"learning_rate": 2.8639295218366115e-06,
"loss": 1.2191,
"step": 1370
},
{
"epoch": 0.6788447296963712,
"grad_norm": 0.5925104037633543,
"learning_rate": 2.8250347365405737e-06,
"loss": 1.2318,
"step": 1375
},
{
"epoch": 0.6813132559861762,
"grad_norm": 0.6173909875052214,
"learning_rate": 2.78630160754811e-06,
"loss": 1.2555,
"step": 1380
},
{
"epoch": 0.6837817822759812,
"grad_norm": 0.6579800769123958,
"learning_rate": 2.747733013734835e-06,
"loss": 1.2553,
"step": 1385
},
{
"epoch": 0.6862503085657862,
"grad_norm": 0.6097488788659552,
"learning_rate": 2.709331821747133e-06,
"loss": 1.2482,
"step": 1390
},
{
"epoch": 0.6887188348555913,
"grad_norm": 0.5717544066297715,
"learning_rate": 2.6711008857890928e-06,
"loss": 1.2477,
"step": 1395
},
{
"epoch": 0.6911873611453963,
"grad_norm": 0.5675063300875494,
"learning_rate": 2.63304304741037e-06,
"loss": 1.2386,
"step": 1400
},
{
"epoch": 0.6911873611453963,
"eval_loss": 1.211606740951538,
"eval_runtime": 2914.6181,
"eval_samples_per_second": 1.372,
"eval_steps_per_second": 0.115,
"step": 1400
},
{
"epoch": 0.6936558874352012,
"grad_norm": 0.623871781326139,
"learning_rate": 2.595161135294978e-06,
"loss": 1.2484,
"step": 1405
},
{
"epoch": 0.6961244137250062,
"grad_norm": 0.5967791678571923,
"learning_rate": 2.55745796505105e-06,
"loss": 1.2816,
"step": 1410
},
{
"epoch": 0.6985929400148112,
"grad_norm": 0.5958918786737188,
"learning_rate": 2.5199363390015645e-06,
"loss": 1.2518,
"step": 1415
},
{
"epoch": 0.7010614663046162,
"grad_norm": 0.5716469845277612,
"learning_rate": 2.482599045976059e-06,
"loss": 1.2518,
"step": 1420
},
{
"epoch": 0.7035299925944212,
"grad_norm": 0.5601354887821722,
"learning_rate": 2.445448861103348e-06,
"loss": 1.2114,
"step": 1425
},
{
"epoch": 0.7059985188842262,
"grad_norm": 0.5783618487395104,
"learning_rate": 2.408488545605265e-06,
"loss": 1.2801,
"step": 1430
},
{
"epoch": 0.7084670451740311,
"grad_norm": 0.600120666255256,
"learning_rate": 2.3717208465914193e-06,
"loss": 1.2928,
"step": 1435
},
{
"epoch": 0.7109355714638361,
"grad_norm": 0.6823362059514299,
"learning_rate": 2.3351484968550264e-06,
"loss": 1.2306,
"step": 1440
},
{
"epoch": 0.7134040977536411,
"grad_norm": 0.5869728269343567,
"learning_rate": 2.298774214669785e-06,
"loss": 1.2417,
"step": 1445
},
{
"epoch": 0.7158726240434461,
"grad_norm": 0.597629982893601,
"learning_rate": 2.2626007035878377e-06,
"loss": 1.1912,
"step": 1450
},
{
"epoch": 0.7183411503332511,
"grad_norm": 0.6222473980576229,
"learning_rate": 2.226630652238836e-06,
"loss": 1.2083,
"step": 1455
},
{
"epoch": 0.7208096766230561,
"grad_norm": 0.5978767327421509,
"learning_rate": 2.1908667341300923e-06,
"loss": 1.2577,
"step": 1460
},
{
"epoch": 0.723278202912861,
"grad_norm": 0.6156905912164004,
"learning_rate": 2.155311607447877e-06,
"loss": 1.2922,
"step": 1465
},
{
"epoch": 0.725746729202666,
"grad_norm": 0.6341472520929511,
"learning_rate": 2.1199679148598434e-06,
"loss": 1.2667,
"step": 1470
},
{
"epoch": 0.728215255492471,
"grad_norm": 0.5655996654676207,
"learning_rate": 2.084838283318616e-06,
"loss": 1.1939,
"step": 1475
},
{
"epoch": 0.730683781782276,
"grad_norm": 0.5824088027115487,
"learning_rate": 2.0499253238665284e-06,
"loss": 1.242,
"step": 1480
},
{
"epoch": 0.733152308072081,
"grad_norm": 0.6063388402546945,
"learning_rate": 2.0152316314415602e-06,
"loss": 1.2482,
"step": 1485
},
{
"epoch": 0.735620834361886,
"grad_norm": 0.6226805122487513,
"learning_rate": 1.9807597846844737e-06,
"loss": 1.255,
"step": 1490
},
{
"epoch": 0.738089360651691,
"grad_norm": 0.5854379294811827,
"learning_rate": 1.9465123457471395e-06,
"loss": 1.1786,
"step": 1495
},
{
"epoch": 0.7405578869414959,
"grad_norm": 0.5577052246580572,
"learning_rate": 1.9124918601021124e-06,
"loss": 1.2358,
"step": 1500
},
{
"epoch": 0.7430264132313009,
"grad_norm": 0.5754079743445688,
"learning_rate": 1.8787008563534326e-06,
"loss": 1.1945,
"step": 1505
},
{
"epoch": 0.7454949395211059,
"grad_norm": 0.6099556355269008,
"learning_rate": 1.845141846048691e-06,
"loss": 1.2379,
"step": 1510
},
{
"epoch": 0.7479634658109109,
"grad_norm": 0.5782704010521243,
"learning_rate": 1.8118173234923447e-06,
"loss": 1.2542,
"step": 1515
},
{
"epoch": 0.7504319921007159,
"grad_norm": 0.5382858254483444,
"learning_rate": 1.778729765560337e-06,
"loss": 1.2327,
"step": 1520
},
{
"epoch": 0.7529005183905209,
"grad_norm": 0.6082642317550977,
"learning_rate": 1.7458816315159937e-06,
"loss": 1.2631,
"step": 1525
},
{
"epoch": 0.7553690446803258,
"grad_norm": 0.6120502232540203,
"learning_rate": 1.7132753628272403e-06,
"loss": 1.2687,
"step": 1530
},
{
"epoch": 0.7578375709701308,
"grad_norm": 0.5800190917782422,
"learning_rate": 1.6809133829851344e-06,
"loss": 1.1809,
"step": 1535
},
{
"epoch": 0.7603060972599358,
"grad_norm": 0.6248767795672576,
"learning_rate": 1.6487980973237434e-06,
"loss": 1.2102,
"step": 1540
},
{
"epoch": 0.7627746235497408,
"grad_norm": 0.6214869106372124,
"learning_rate": 1.6169318928413574e-06,
"loss": 1.3183,
"step": 1545
},
{
"epoch": 0.7652431498395458,
"grad_norm": 0.6509287986960063,
"learning_rate": 1.5853171380230791e-06,
"loss": 1.2394,
"step": 1550
},
{
"epoch": 0.7677116761293508,
"grad_norm": 0.5548564286839581,
"learning_rate": 1.5539561826647832e-06,
"loss": 1.2278,
"step": 1555
},
{
"epoch": 0.7701802024191557,
"grad_norm": 0.5873399173100068,
"learning_rate": 1.5228513576984633e-06,
"loss": 1.2419,
"step": 1560
},
{
"epoch": 0.7726487287089607,
"grad_norm": 0.5698526241039991,
"learning_rate": 1.4920049750189852e-06,
"loss": 1.2134,
"step": 1565
},
{
"epoch": 0.7751172549987657,
"grad_norm": 0.5462525752885333,
"learning_rate": 1.4614193273122562e-06,
"loss": 1.2013,
"step": 1570
},
{
"epoch": 0.7775857812885707,
"grad_norm": 0.5604406125512932,
"learning_rate": 1.4310966878848116e-06,
"loss": 1.2319,
"step": 1575
},
{
"epoch": 0.7800543075783757,
"grad_norm": 0.5512496837811336,
"learning_rate": 1.401039310494855e-06,
"loss": 1.2436,
"step": 1580
},
{
"epoch": 0.7825228338681807,
"grad_norm": 0.6804998312407946,
"learning_rate": 1.3712494291847416e-06,
"loss": 1.2567,
"step": 1585
},
{
"epoch": 0.7849913601579857,
"grad_norm": 0.6655723000722049,
"learning_rate": 1.3417292581149388e-06,
"loss": 1.2682,
"step": 1590
},
{
"epoch": 0.7874598864477906,
"grad_norm": 0.539222744257867,
"learning_rate": 1.3124809913994458e-06,
"loss": 1.2009,
"step": 1595
},
{
"epoch": 0.7899284127375956,
"grad_norm": 0.622721298212167,
"learning_rate": 1.2835068029427188e-06,
"loss": 1.2661,
"step": 1600
},
{
"epoch": 0.7899284127375956,
"eval_loss": 1.2096235752105713,
"eval_runtime": 2576.8943,
"eval_samples_per_second": 1.552,
"eval_steps_per_second": 0.13,
"step": 1600
},
{
"epoch": 0.7923969390274006,
"grad_norm": 0.5470842930259888,
"learning_rate": 1.2548088462781006e-06,
"loss": 1.2244,
"step": 1605
},
{
"epoch": 0.7948654653172056,
"grad_norm": 0.5718801309412294,
"learning_rate": 1.2263892544077439e-06,
"loss": 1.2498,
"step": 1610
},
{
"epoch": 0.7973339916070106,
"grad_norm": 0.5818869817428877,
"learning_rate": 1.1982501396440831e-06,
"loss": 1.2044,
"step": 1615
},
{
"epoch": 0.7998025178968156,
"grad_norm": 0.5534354350847027,
"learning_rate": 1.1703935934528327e-06,
"loss": 1.2328,
"step": 1620
},
{
"epoch": 0.8022710441866205,
"grad_norm": 0.5862274808604895,
"learning_rate": 1.1428216862975383e-06,
"loss": 1.2741,
"step": 1625
},
{
"epoch": 0.8047395704764255,
"grad_norm": 0.5781950796979888,
"learning_rate": 1.1155364674856834e-06,
"loss": 1.2679,
"step": 1630
},
{
"epoch": 0.8072080967662305,
"grad_norm": 0.5751302301159884,
"learning_rate": 1.088539965016377e-06,
"loss": 1.2153,
"step": 1635
},
{
"epoch": 0.8096766230560355,
"grad_norm": 0.6150065644184977,
"learning_rate": 1.0618341854296176e-06,
"loss": 1.2245,
"step": 1640
},
{
"epoch": 0.8121451493458405,
"grad_norm": 0.5893743060234344,
"learning_rate": 1.0354211136571586e-06,
"loss": 1.2091,
"step": 1645
},
{
"epoch": 0.8146136756356456,
"grad_norm": 0.554001627193442,
"learning_rate": 1.0093027128749722e-06,
"loss": 1.22,
"step": 1650
},
{
"epoch": 0.8170822019254506,
"grad_norm": 0.5554016650617593,
"learning_rate": 9.834809243573406e-07,
"loss": 1.2736,
"step": 1655
},
{
"epoch": 0.8195507282152555,
"grad_norm": 0.6467820952863279,
"learning_rate": 9.57957667332562e-07,
"loss": 1.2504,
"step": 1660
},
{
"epoch": 0.8220192545050605,
"grad_norm": 0.5388841867240308,
"learning_rate": 9.327348388403063e-07,
"loss": 1.2134,
"step": 1665
},
{
"epoch": 0.8244877807948655,
"grad_norm": 0.5511949198965124,
"learning_rate": 9.078143135906154e-07,
"loss": 1.2373,
"step": 1670
},
{
"epoch": 0.8269563070846705,
"grad_norm": 0.5662492648467455,
"learning_rate": 8.831979438245619e-07,
"loss": 1.2379,
"step": 1675
},
{
"epoch": 0.8294248333744755,
"grad_norm": 0.6308948625824087,
"learning_rate": 8.588875591765838e-07,
"loss": 1.1868,
"step": 1680
},
{
"epoch": 0.8318933596642805,
"grad_norm": 0.576660126030343,
"learning_rate": 8.348849665384906e-07,
"loss": 1.2891,
"step": 1685
},
{
"epoch": 0.8343618859540854,
"grad_norm": 0.556606789107177,
"learning_rate": 8.111919499251653e-07,
"loss": 1.2021,
"step": 1690
},
{
"epoch": 0.8368304122438904,
"grad_norm": 0.5661740275037651,
"learning_rate": 7.878102703419683e-07,
"loss": 1.2536,
"step": 1695
},
{
"epoch": 0.8392989385336954,
"grad_norm": 0.5967205392911274,
"learning_rate": 7.647416656538464e-07,
"loss": 1.2373,
"step": 1700
},
{
"epoch": 0.8417674648235004,
"grad_norm": 0.5528061162446166,
"learning_rate": 7.419878504561651e-07,
"loss": 1.2199,
"step": 1705
},
{
"epoch": 0.8442359911133054,
"grad_norm": 0.6479872928308008,
"learning_rate": 7.195505159472726e-07,
"loss": 1.2368,
"step": 1710
},
{
"epoch": 0.8467045174031104,
"grad_norm": 0.594834011459554,
"learning_rate": 6.974313298027946e-07,
"loss": 1.1997,
"step": 1715
},
{
"epoch": 0.8491730436929154,
"grad_norm": 0.5442970599231537,
"learning_rate": 6.756319360516856e-07,
"loss": 1.2037,
"step": 1720
},
{
"epoch": 0.8516415699827203,
"grad_norm": 0.6655980946948994,
"learning_rate": 6.541539549540383e-07,
"loss": 1.3013,
"step": 1725
},
{
"epoch": 0.8541100962725253,
"grad_norm": 0.599651741019629,
"learning_rate": 6.329989828806482e-07,
"loss": 1.2454,
"step": 1730
},
{
"epoch": 0.8565786225623303,
"grad_norm": 0.7507415296204425,
"learning_rate": 6.121685921943688e-07,
"loss": 1.2347,
"step": 1735
},
{
"epoch": 0.8590471488521353,
"grad_norm": 0.5883088948787556,
"learning_rate": 5.916643311332438e-07,
"loss": 1.2566,
"step": 1740
},
{
"epoch": 0.8615156751419403,
"grad_norm": 0.5844649067792757,
"learning_rate": 5.71487723695427e-07,
"loss": 1.2176,
"step": 1745
},
{
"epoch": 0.8639842014317453,
"grad_norm": 0.570757598339604,
"learning_rate": 5.516402695259165e-07,
"loss": 1.2111,
"step": 1750
},
{
"epoch": 0.8664527277215502,
"grad_norm": 0.6101964731318252,
"learning_rate": 5.321234438050893e-07,
"loss": 1.2552,
"step": 1755
},
{
"epoch": 0.8689212540113552,
"grad_norm": 0.6114031483570134,
"learning_rate": 5.12938697139056e-07,
"loss": 1.2339,
"step": 1760
},
{
"epoch": 0.8713897803011602,
"grad_norm": 0.5640524033820485,
"learning_rate": 4.940874554518465e-07,
"loss": 1.2594,
"step": 1765
},
{
"epoch": 0.8738583065909652,
"grad_norm": 0.6433079417694005,
"learning_rate": 4.755711198794233e-07,
"loss": 1.2854,
"step": 1770
},
{
"epoch": 0.8763268328807702,
"grad_norm": 0.604973387553276,
"learning_rate": 4.573910666655429e-07,
"loss": 1.3237,
"step": 1775
},
{
"epoch": 0.8787953591705752,
"grad_norm": 0.5628418770325067,
"learning_rate": 4.395486470594645e-07,
"loss": 1.1982,
"step": 1780
},
{
"epoch": 0.8812638854603801,
"grad_norm": 0.6659219563445046,
"learning_rate": 4.220451872155179e-07,
"loss": 1.2309,
"step": 1785
},
{
"epoch": 0.8837324117501851,
"grad_norm": 0.5361789546629312,
"learning_rate": 4.048819880945337e-07,
"loss": 1.199,
"step": 1790
},
{
"epoch": 0.8862009380399901,
"grad_norm": 0.5558192723511216,
"learning_rate": 3.880603253671522e-07,
"loss": 1.2263,
"step": 1795
},
{
"epoch": 0.8886694643297951,
"grad_norm": 0.5634804859248715,
"learning_rate": 3.7158144931900395e-07,
"loss": 1.2752,
"step": 1800
},
{
"epoch": 0.8886694643297951,
"eval_loss": 1.2087970972061157,
"eval_runtime": 2557.7862,
"eval_samples_per_second": 1.564,
"eval_steps_per_second": 0.131,
"step": 1800
},
{
"epoch": 0.8911379906196001,
"grad_norm": 0.6032610406878897,
"learning_rate": 3.5544658475778317e-07,
"loss": 1.1999,
"step": 1805
},
{
"epoch": 0.8936065169094051,
"grad_norm": 0.6216254522630721,
"learning_rate": 3.396569309222114e-07,
"loss": 1.2339,
"step": 1810
},
{
"epoch": 0.89607504319921,
"grad_norm": 0.5807256981071689,
"learning_rate": 3.2421366139290423e-07,
"loss": 1.3057,
"step": 1815
},
{
"epoch": 0.898543569489015,
"grad_norm": 0.5211008570948544,
"learning_rate": 3.091179240051462e-07,
"loss": 1.2022,
"step": 1820
},
{
"epoch": 0.90101209577882,
"grad_norm": 0.5525058863296126,
"learning_rate": 2.943708407635704e-07,
"loss": 1.2048,
"step": 1825
},
{
"epoch": 0.903480622068625,
"grad_norm": 0.6377145176064325,
"learning_rate": 2.799735077587695e-07,
"loss": 1.213,
"step": 1830
},
{
"epoch": 0.90594914835843,
"grad_norm": 0.5813161900855606,
"learning_rate": 2.659269950858273e-07,
"loss": 1.33,
"step": 1835
},
{
"epoch": 0.908417674648235,
"grad_norm": 0.6256712692686102,
"learning_rate": 2.5223234676478193e-07,
"loss": 1.2418,
"step": 1840
},
{
"epoch": 0.91088620093804,
"grad_norm": 0.598042344925788,
"learning_rate": 2.3889058066302873e-07,
"loss": 1.2928,
"step": 1845
},
{
"epoch": 0.9133547272278449,
"grad_norm": 0.6144058961581507,
"learning_rate": 2.2590268841966357e-07,
"loss": 1.2522,
"step": 1850
},
{
"epoch": 0.9158232535176499,
"grad_norm": 0.6086868817654493,
"learning_rate": 2.132696353717839e-07,
"loss": 1.2275,
"step": 1855
},
{
"epoch": 0.9182917798074549,
"grad_norm": 0.6193803813904503,
"learning_rate": 2.0099236048273407e-07,
"loss": 1.2102,
"step": 1860
},
{
"epoch": 0.9207603060972599,
"grad_norm": 0.6206660621687174,
"learning_rate": 1.890717762723182e-07,
"loss": 1.2413,
"step": 1865
},
{
"epoch": 0.9232288323870649,
"grad_norm": 0.5195254310690817,
"learning_rate": 1.7750876874897627e-07,
"loss": 1.2536,
"step": 1870
},
{
"epoch": 0.9256973586768699,
"grad_norm": 0.6172193600635592,
"learning_rate": 1.6630419734393e-07,
"loss": 1.1877,
"step": 1875
},
{
"epoch": 0.9281658849666748,
"grad_norm": 0.5854056073690375,
"learning_rate": 1.554588948473068e-07,
"loss": 1.2694,
"step": 1880
},
{
"epoch": 0.9306344112564798,
"grad_norm": 0.5939692455470944,
"learning_rate": 1.4497366734623874e-07,
"loss": 1.2223,
"step": 1885
},
{
"epoch": 0.9331029375462848,
"grad_norm": 0.558716522853661,
"learning_rate": 1.3484929416495096e-07,
"loss": 1.1465,
"step": 1890
},
{
"epoch": 0.9355714638360898,
"grad_norm": 0.601778856283905,
"learning_rate": 1.2508652780683916e-07,
"loss": 1.2618,
"step": 1895
},
{
"epoch": 0.9380399901258949,
"grad_norm": 0.5724230357863298,
"learning_rate": 1.1568609389853546e-07,
"loss": 1.199,
"step": 1900
},
{
"epoch": 0.9405085164156999,
"grad_norm": 0.5858685464797397,
"learning_rate": 1.0664869113598097e-07,
"loss": 1.2416,
"step": 1905
},
{
"epoch": 0.9429770427055049,
"grad_norm": 0.5955002776535666,
"learning_rate": 9.7974991232489e-08,
"loss": 1.2621,
"step": 1910
},
{
"epoch": 0.9454455689953098,
"grad_norm": 0.6031053768787782,
"learning_rate": 8.966563886882107e-08,
"loss": 1.2966,
"step": 1915
},
{
"epoch": 0.9479140952851148,
"grad_norm": 0.5626513433181811,
"learning_rate": 8.172125164527312e-08,
"loss": 1.197,
"step": 1920
},
{
"epoch": 0.9503826215749198,
"grad_norm": 0.6147790631492948,
"learning_rate": 7.414242003576876e-08,
"loss": 1.2476,
"step": 1925
},
{
"epoch": 0.9528511478647248,
"grad_norm": 0.6387128598756113,
"learning_rate": 6.692970734397176e-08,
"loss": 1.2717,
"step": 1930
},
{
"epoch": 0.9553196741545298,
"grad_norm": 0.58519229057596,
"learning_rate": 6.0083649661421e-08,
"loss": 1.2427,
"step": 1935
},
{
"epoch": 0.9577882004443348,
"grad_norm": 0.5732049204953203,
"learning_rate": 5.360475582768088e-08,
"loss": 1.2499,
"step": 1940
},
{
"epoch": 0.9602567267341398,
"grad_norm": 0.5510115335869762,
"learning_rate": 4.7493507392524226e-08,
"loss": 1.1837,
"step": 1945
},
{
"epoch": 0.9627252530239447,
"grad_norm": 0.5959129330379044,
"learning_rate": 4.175035858013987e-08,
"loss": 1.2595,
"step": 1950
},
{
"epoch": 0.9651937793137497,
"grad_norm": 0.6525575790551825,
"learning_rate": 3.637573625537183e-08,
"loss": 1.3283,
"step": 1955
},
{
"epoch": 0.9676623056035547,
"grad_norm": 0.6761446719619785,
"learning_rate": 3.13700398919925e-08,
"loss": 1.2633,
"step": 1960
},
{
"epoch": 0.9701308318933597,
"grad_norm": 0.5705669812908541,
"learning_rate": 2.673364154301028e-08,
"loss": 1.2446,
"step": 1965
},
{
"epoch": 0.9725993581831647,
"grad_norm": 0.6197155608101478,
"learning_rate": 2.2466885813018925e-08,
"loss": 1.2492,
"step": 1970
},
{
"epoch": 0.9750678844729697,
"grad_norm": 0.5667304098455904,
"learning_rate": 1.857008983258135e-08,
"loss": 1.2485,
"step": 1975
},
{
"epoch": 0.9775364107627746,
"grad_norm": 0.6113665999543747,
"learning_rate": 1.504354323466073e-08,
"loss": 1.2573,
"step": 1980
},
{
"epoch": 0.9800049370525796,
"grad_norm": 0.5726714283406965,
"learning_rate": 1.188750813309214e-08,
"loss": 1.2264,
"step": 1985
},
{
"epoch": 0.9824734633423846,
"grad_norm": 0.5521047354644366,
"learning_rate": 9.102219103103161e-09,
"loss": 1.2194,
"step": 1990
},
{
"epoch": 0.9849419896321896,
"grad_norm": 0.6819693929722572,
"learning_rate": 6.687883163873921e-09,
"loss": 1.244,
"step": 1995
},
{
"epoch": 0.9874105159219946,
"grad_norm": 0.6016814387388122,
"learning_rate": 4.644679763155524e-09,
"loss": 1.2701,
"step": 2000
},
{
"epoch": 0.9874105159219946,
"eval_loss": 1.208633542060852,
"eval_runtime": 2553.7159,
"eval_samples_per_second": 1.566,
"eval_steps_per_second": 0.131,
"step": 2000
},
{
"epoch": 0.9898790422117996,
"grad_norm": 0.5854483828292536,
"learning_rate": 2.97276076392905e-09,
"loss": 1.2735,
"step": 2005
},
{
"epoch": 0.9923475685016045,
"grad_norm": 0.6149856349841143,
"learning_rate": 1.6722504331195822e-09,
"loss": 1.1829,
"step": 2010
},
{
"epoch": 0.9948160947914095,
"grad_norm": 0.5776580228856067,
"learning_rate": 7.432454323597071e-10,
"loss": 1.2584,
"step": 2015
},
{
"epoch": 0.9972846210812145,
"grad_norm": 0.5955477076581019,
"learning_rate": 1.8581481080415242e-10,
"loss": 1.1737,
"step": 2020
},
{
"epoch": 0.9997531473710195,
"grad_norm": 0.6070167910291095,
"learning_rate": 0.0,
"loss": 1.1858,
"step": 2025
},
{
"epoch": 0.9997531473710195,
"step": 2025,
"total_flos": 4526278881050624.0,
"train_loss": 1.270192005722611,
"train_runtime": 113933.3906,
"train_samples_per_second": 0.427,
"train_steps_per_second": 0.018
}
],
"logging_steps": 5,
"max_steps": 2025,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 200,
"total_flos": 4526278881050624.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}