qwen2-vl-7b-instruct-ogiri / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 2000,
"global_step": 9033,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0011070519207350825,
"grad_norm": 55.698127642695866,
"learning_rate": 1.1061946902654869e-07,
"loss": 4.4914,
"step": 10
},
{
"epoch": 0.002214103841470165,
"grad_norm": 41.723334569659066,
"learning_rate": 2.2123893805309737e-07,
"loss": 4.4954,
"step": 20
},
{
"epoch": 0.0033211557622052474,
"grad_norm": 43.79884898454788,
"learning_rate": 3.318584070796461e-07,
"loss": 4.296,
"step": 30
},
{
"epoch": 0.00442820768294033,
"grad_norm": 31.30694571868551,
"learning_rate": 4.4247787610619474e-07,
"loss": 3.9204,
"step": 40
},
{
"epoch": 0.0055352596036754124,
"grad_norm": 30.761018090708117,
"learning_rate": 5.530973451327435e-07,
"loss": 3.4951,
"step": 50
},
{
"epoch": 0.006642311524410495,
"grad_norm": 32.180378162249326,
"learning_rate": 6.637168141592922e-07,
"loss": 3.4177,
"step": 60
},
{
"epoch": 0.007749363445145577,
"grad_norm": 24.157005825357814,
"learning_rate": 7.743362831858408e-07,
"loss": 3.3864,
"step": 70
},
{
"epoch": 0.00885641536588066,
"grad_norm": 29.036937778457148,
"learning_rate": 8.849557522123895e-07,
"loss": 3.1996,
"step": 80
},
{
"epoch": 0.009963467286615742,
"grad_norm": 23.44273601537366,
"learning_rate": 9.95575221238938e-07,
"loss": 3.2652,
"step": 90
},
{
"epoch": 0.011070519207350825,
"grad_norm": 31.552666658205744,
"learning_rate": 1.106194690265487e-06,
"loss": 3.2654,
"step": 100
},
{
"epoch": 0.012177571128085908,
"grad_norm": 23.158016097255164,
"learning_rate": 1.2168141592920355e-06,
"loss": 3.1954,
"step": 110
},
{
"epoch": 0.01328462304882099,
"grad_norm": 31.510121445270936,
"learning_rate": 1.3274336283185843e-06,
"loss": 3.2016,
"step": 120
},
{
"epoch": 0.014391674969556073,
"grad_norm": 29.02944479343648,
"learning_rate": 1.438053097345133e-06,
"loss": 3.202,
"step": 130
},
{
"epoch": 0.015498726890291154,
"grad_norm": 24.737793045013742,
"learning_rate": 1.5486725663716816e-06,
"loss": 3.054,
"step": 140
},
{
"epoch": 0.016605778811026237,
"grad_norm": 25.982953823095542,
"learning_rate": 1.6592920353982304e-06,
"loss": 3.1637,
"step": 150
},
{
"epoch": 0.01771283073176132,
"grad_norm": 24.62246443187751,
"learning_rate": 1.769911504424779e-06,
"loss": 3.0422,
"step": 160
},
{
"epoch": 0.018819882652496404,
"grad_norm": 24.996096258559348,
"learning_rate": 1.8805309734513274e-06,
"loss": 2.9983,
"step": 170
},
{
"epoch": 0.019926934573231483,
"grad_norm": 30.197182446028002,
"learning_rate": 1.991150442477876e-06,
"loss": 3.0625,
"step": 180
},
{
"epoch": 0.021033986493966567,
"grad_norm": 25.648689604176077,
"learning_rate": 2.101769911504425e-06,
"loss": 3.2172,
"step": 190
},
{
"epoch": 0.02214103841470165,
"grad_norm": 30.003866974191762,
"learning_rate": 2.212389380530974e-06,
"loss": 3.0895,
"step": 200
},
{
"epoch": 0.023248090335436733,
"grad_norm": 27.11239518865144,
"learning_rate": 2.3230088495575224e-06,
"loss": 2.9847,
"step": 210
},
{
"epoch": 0.024355142256171816,
"grad_norm": 24.532976628512625,
"learning_rate": 2.433628318584071e-06,
"loss": 3.0439,
"step": 220
},
{
"epoch": 0.025462194176906896,
"grad_norm": 25.595931432489675,
"learning_rate": 2.5442477876106196e-06,
"loss": 2.9722,
"step": 230
},
{
"epoch": 0.02656924609764198,
"grad_norm": 24.927511131743852,
"learning_rate": 2.6548672566371687e-06,
"loss": 3.0965,
"step": 240
},
{
"epoch": 0.027676298018377062,
"grad_norm": 21.477421706673375,
"learning_rate": 2.765486725663717e-06,
"loss": 2.9589,
"step": 250
},
{
"epoch": 0.028783349939112145,
"grad_norm": 26.975518213759347,
"learning_rate": 2.876106194690266e-06,
"loss": 2.9878,
"step": 260
},
{
"epoch": 0.029890401859847225,
"grad_norm": 25.199714957692397,
"learning_rate": 2.9867256637168145e-06,
"loss": 3.0718,
"step": 270
},
{
"epoch": 0.03099745378058231,
"grad_norm": 25.911079775392583,
"learning_rate": 3.097345132743363e-06,
"loss": 3.1289,
"step": 280
},
{
"epoch": 0.03210450570131739,
"grad_norm": 22.333848218066173,
"learning_rate": 3.2079646017699117e-06,
"loss": 3.019,
"step": 290
},
{
"epoch": 0.033211557622052475,
"grad_norm": 27.77592678094656,
"learning_rate": 3.3185840707964607e-06,
"loss": 3.0679,
"step": 300
},
{
"epoch": 0.03431860954278756,
"grad_norm": 25.520987098358336,
"learning_rate": 3.429203539823009e-06,
"loss": 3.0455,
"step": 310
},
{
"epoch": 0.03542566146352264,
"grad_norm": 30.694095405699013,
"learning_rate": 3.539823008849558e-06,
"loss": 2.9686,
"step": 320
},
{
"epoch": 0.036532713384257724,
"grad_norm": 34.07278189685705,
"learning_rate": 3.6504424778761066e-06,
"loss": 3.0074,
"step": 330
},
{
"epoch": 0.03763976530499281,
"grad_norm": 23.233595027296616,
"learning_rate": 3.7610619469026547e-06,
"loss": 2.9906,
"step": 340
},
{
"epoch": 0.038746817225727884,
"grad_norm": 21.65008179710679,
"learning_rate": 3.871681415929203e-06,
"loss": 2.9965,
"step": 350
},
{
"epoch": 0.03985386914646297,
"grad_norm": 25.432398948327197,
"learning_rate": 3.982300884955752e-06,
"loss": 2.9583,
"step": 360
},
{
"epoch": 0.04096092106719805,
"grad_norm": 24.118552348993813,
"learning_rate": 4.092920353982301e-06,
"loss": 2.9629,
"step": 370
},
{
"epoch": 0.04206797298793313,
"grad_norm": 28.535820173184682,
"learning_rate": 4.20353982300885e-06,
"loss": 3.0437,
"step": 380
},
{
"epoch": 0.043175024908668216,
"grad_norm": 27.574173741552002,
"learning_rate": 4.314159292035399e-06,
"loss": 2.9642,
"step": 390
},
{
"epoch": 0.0442820768294033,
"grad_norm": 27.270408053929884,
"learning_rate": 4.424778761061948e-06,
"loss": 3.0859,
"step": 400
},
{
"epoch": 0.04538912875013838,
"grad_norm": 27.57691676783791,
"learning_rate": 4.535398230088496e-06,
"loss": 3.009,
"step": 410
},
{
"epoch": 0.046496180670873466,
"grad_norm": 23.96155441996071,
"learning_rate": 4.646017699115045e-06,
"loss": 2.9363,
"step": 420
},
{
"epoch": 0.04760323259160855,
"grad_norm": 24.279797643812547,
"learning_rate": 4.756637168141594e-06,
"loss": 3.061,
"step": 430
},
{
"epoch": 0.04871028451234363,
"grad_norm": 25.19191068246207,
"learning_rate": 4.867256637168142e-06,
"loss": 2.9153,
"step": 440
},
{
"epoch": 0.04981733643307871,
"grad_norm": 28.82056510425203,
"learning_rate": 4.97787610619469e-06,
"loss": 3.1449,
"step": 450
},
{
"epoch": 0.05092438835381379,
"grad_norm": 23.600325136397633,
"learning_rate": 5.088495575221239e-06,
"loss": 3.0081,
"step": 460
},
{
"epoch": 0.052031440274548875,
"grad_norm": 23.484025108009725,
"learning_rate": 5.1991150442477875e-06,
"loss": 3.0463,
"step": 470
},
{
"epoch": 0.05313849219528396,
"grad_norm": 19.983575881103242,
"learning_rate": 5.309734513274337e-06,
"loss": 3.0811,
"step": 480
},
{
"epoch": 0.05424554411601904,
"grad_norm": 21.728377820671874,
"learning_rate": 5.4203539823008855e-06,
"loss": 3.1061,
"step": 490
},
{
"epoch": 0.055352596036754124,
"grad_norm": 34.21680389185442,
"learning_rate": 5.530973451327434e-06,
"loss": 3.0082,
"step": 500
},
{
"epoch": 0.05645964795748921,
"grad_norm": 28.98744881318429,
"learning_rate": 5.641592920353984e-06,
"loss": 2.9659,
"step": 510
},
{
"epoch": 0.05756669987822429,
"grad_norm": 21.719020231412607,
"learning_rate": 5.752212389380532e-06,
"loss": 2.9689,
"step": 520
},
{
"epoch": 0.058673751798959374,
"grad_norm": 26.343484772343533,
"learning_rate": 5.86283185840708e-06,
"loss": 3.0181,
"step": 530
},
{
"epoch": 0.05978080371969445,
"grad_norm": 26.674266585106718,
"learning_rate": 5.973451327433629e-06,
"loss": 2.9782,
"step": 540
},
{
"epoch": 0.06088785564042953,
"grad_norm": 24.29263386559663,
"learning_rate": 6.084070796460177e-06,
"loss": 3.0291,
"step": 550
},
{
"epoch": 0.06199490756116462,
"grad_norm": 27.260031480591426,
"learning_rate": 6.194690265486726e-06,
"loss": 3.0252,
"step": 560
},
{
"epoch": 0.0631019594818997,
"grad_norm": 20.957832212139657,
"learning_rate": 6.305309734513275e-06,
"loss": 3.0388,
"step": 570
},
{
"epoch": 0.06420901140263478,
"grad_norm": 26.9583565130981,
"learning_rate": 6.415929203539823e-06,
"loss": 2.9987,
"step": 580
},
{
"epoch": 0.06531606332336987,
"grad_norm": 23.667021249704298,
"learning_rate": 6.526548672566372e-06,
"loss": 2.9786,
"step": 590
},
{
"epoch": 0.06642311524410495,
"grad_norm": 24.584436820766868,
"learning_rate": 6.6371681415929215e-06,
"loss": 3.0082,
"step": 600
},
{
"epoch": 0.06753016716484003,
"grad_norm": 28.424068265725914,
"learning_rate": 6.74778761061947e-06,
"loss": 3.1212,
"step": 610
},
{
"epoch": 0.06863721908557512,
"grad_norm": 21.704948850763316,
"learning_rate": 6.858407079646018e-06,
"loss": 3.0008,
"step": 620
},
{
"epoch": 0.0697442710063102,
"grad_norm": 25.82364197800952,
"learning_rate": 6.969026548672567e-06,
"loss": 2.9993,
"step": 630
},
{
"epoch": 0.07085132292704528,
"grad_norm": 23.887813042264725,
"learning_rate": 7.079646017699116e-06,
"loss": 2.9319,
"step": 640
},
{
"epoch": 0.07195837484778037,
"grad_norm": 26.62784975319365,
"learning_rate": 7.190265486725664e-06,
"loss": 2.9158,
"step": 650
},
{
"epoch": 0.07306542676851545,
"grad_norm": 25.598475891481986,
"learning_rate": 7.300884955752213e-06,
"loss": 3.0746,
"step": 660
},
{
"epoch": 0.07417247868925053,
"grad_norm": 19.384106471975148,
"learning_rate": 7.411504424778761e-06,
"loss": 2.9683,
"step": 670
},
{
"epoch": 0.07527953060998561,
"grad_norm": 22.58174336593009,
"learning_rate": 7.5221238938053095e-06,
"loss": 2.9548,
"step": 680
},
{
"epoch": 0.07638658253072068,
"grad_norm": 23.253222270880613,
"learning_rate": 7.632743362831859e-06,
"loss": 2.9424,
"step": 690
},
{
"epoch": 0.07749363445145577,
"grad_norm": 24.53761264564241,
"learning_rate": 7.743362831858407e-06,
"loss": 2.9999,
"step": 700
},
{
"epoch": 0.07860068637219085,
"grad_norm": 22.215828742213887,
"learning_rate": 7.853982300884957e-06,
"loss": 2.9638,
"step": 710
},
{
"epoch": 0.07970773829292593,
"grad_norm": 23.926002163576186,
"learning_rate": 7.964601769911505e-06,
"loss": 2.9937,
"step": 720
},
{
"epoch": 0.08081479021366102,
"grad_norm": 24.414616790878906,
"learning_rate": 8.075221238938053e-06,
"loss": 2.9732,
"step": 730
},
{
"epoch": 0.0819218421343961,
"grad_norm": 23.388402347902353,
"learning_rate": 8.185840707964603e-06,
"loss": 2.9107,
"step": 740
},
{
"epoch": 0.08302889405513118,
"grad_norm": 24.124270687360198,
"learning_rate": 8.296460176991151e-06,
"loss": 2.9869,
"step": 750
},
{
"epoch": 0.08413594597586627,
"grad_norm": 21.86924086571945,
"learning_rate": 8.4070796460177e-06,
"loss": 3.0616,
"step": 760
},
{
"epoch": 0.08524299789660135,
"grad_norm": 29.125772286493696,
"learning_rate": 8.517699115044249e-06,
"loss": 2.9174,
"step": 770
},
{
"epoch": 0.08635004981733643,
"grad_norm": 25.471433455609642,
"learning_rate": 8.628318584070797e-06,
"loss": 3.0338,
"step": 780
},
{
"epoch": 0.08745710173807152,
"grad_norm": 24.06665529849035,
"learning_rate": 8.738938053097345e-06,
"loss": 3.0321,
"step": 790
},
{
"epoch": 0.0885641536588066,
"grad_norm": 18.292126722435007,
"learning_rate": 8.849557522123895e-06,
"loss": 2.9429,
"step": 800
},
{
"epoch": 0.08967120557954168,
"grad_norm": 22.110943430558972,
"learning_rate": 8.960176991150443e-06,
"loss": 2.8389,
"step": 810
},
{
"epoch": 0.09077825750027677,
"grad_norm": 24.83606400487908,
"learning_rate": 9.070796460176992e-06,
"loss": 2.9753,
"step": 820
},
{
"epoch": 0.09188530942101185,
"grad_norm": 25.95232011272635,
"learning_rate": 9.181415929203542e-06,
"loss": 3.043,
"step": 830
},
{
"epoch": 0.09299236134174693,
"grad_norm": 20.659162690961626,
"learning_rate": 9.29203539823009e-06,
"loss": 2.9343,
"step": 840
},
{
"epoch": 0.09409941326248202,
"grad_norm": 25.45459790239467,
"learning_rate": 9.402654867256638e-06,
"loss": 3.0285,
"step": 850
},
{
"epoch": 0.0952064651832171,
"grad_norm": 24.920778384975627,
"learning_rate": 9.513274336283188e-06,
"loss": 3.008,
"step": 860
},
{
"epoch": 0.09631351710395218,
"grad_norm": 23.893946109752218,
"learning_rate": 9.623893805309736e-06,
"loss": 3.0592,
"step": 870
},
{
"epoch": 0.09742056902468726,
"grad_norm": 23.476211841831407,
"learning_rate": 9.734513274336284e-06,
"loss": 3.0277,
"step": 880
},
{
"epoch": 0.09852762094542233,
"grad_norm": 32.349500707592966,
"learning_rate": 9.845132743362832e-06,
"loss": 3.0481,
"step": 890
},
{
"epoch": 0.09963467286615742,
"grad_norm": 23.23199056988143,
"learning_rate": 9.95575221238938e-06,
"loss": 2.9412,
"step": 900
},
{
"epoch": 0.1007417247868925,
"grad_norm": 23.773772040090407,
"learning_rate": 9.999986557878607e-06,
"loss": 2.911,
"step": 910
},
{
"epoch": 0.10184877670762758,
"grad_norm": 21.93860193010473,
"learning_rate": 9.999904411842942e-06,
"loss": 3.0976,
"step": 920
},
{
"epoch": 0.10295582862836267,
"grad_norm": 26.162102122715467,
"learning_rate": 9.999747588842252e-06,
"loss": 2.8653,
"step": 930
},
{
"epoch": 0.10406288054909775,
"grad_norm": 24.79079368386082,
"learning_rate": 9.999516091218793e-06,
"loss": 3.0475,
"step": 940
},
{
"epoch": 0.10516993246983283,
"grad_norm": 22.659978907939497,
"learning_rate": 9.999209922430137e-06,
"loss": 2.9725,
"step": 950
},
{
"epoch": 0.10627698439056792,
"grad_norm": 24.881751031281876,
"learning_rate": 9.99882908704913e-06,
"loss": 2.9832,
"step": 960
},
{
"epoch": 0.107384036311303,
"grad_norm": 19.289141804905892,
"learning_rate": 9.998373590763798e-06,
"loss": 2.9333,
"step": 970
},
{
"epoch": 0.10849108823203808,
"grad_norm": 25.233514814542282,
"learning_rate": 9.997843440377293e-06,
"loss": 3.1247,
"step": 980
},
{
"epoch": 0.10959814015277317,
"grad_norm": 22.529791978010802,
"learning_rate": 9.997238643807768e-06,
"loss": 3.0009,
"step": 990
},
{
"epoch": 0.11070519207350825,
"grad_norm": 26.994079486651124,
"learning_rate": 9.996559210088272e-06,
"loss": 3.0359,
"step": 1000
},
{
"epoch": 0.11181224399424333,
"grad_norm": 23.04614956134999,
"learning_rate": 9.995805149366607e-06,
"loss": 2.9097,
"step": 1010
},
{
"epoch": 0.11291929591497842,
"grad_norm": 26.498780600053372,
"learning_rate": 9.994976472905184e-06,
"loss": 3.045,
"step": 1020
},
{
"epoch": 0.1140263478357135,
"grad_norm": 20.370834631825762,
"learning_rate": 9.994073193080844e-06,
"loss": 2.9198,
"step": 1030
},
{
"epoch": 0.11513339975644858,
"grad_norm": 18.931594900754984,
"learning_rate": 9.993095323384688e-06,
"loss": 2.9937,
"step": 1040
},
{
"epoch": 0.11624045167718366,
"grad_norm": 25.23074417182063,
"learning_rate": 9.992042878421862e-06,
"loss": 2.9846,
"step": 1050
},
{
"epoch": 0.11734750359791875,
"grad_norm": 22.748633865558133,
"learning_rate": 9.990915873911346e-06,
"loss": 3.0222,
"step": 1060
},
{
"epoch": 0.11845455551865383,
"grad_norm": 20.500684992552053,
"learning_rate": 9.989714326685715e-06,
"loss": 3.0954,
"step": 1070
},
{
"epoch": 0.1195616074393889,
"grad_norm": 18.438452324633406,
"learning_rate": 9.988438254690896e-06,
"loss": 2.9079,
"step": 1080
},
{
"epoch": 0.12066865936012398,
"grad_norm": 20.9380161875694,
"learning_rate": 9.987087676985886e-06,
"loss": 3.042,
"step": 1090
},
{
"epoch": 0.12177571128085907,
"grad_norm": 21.96145242279915,
"learning_rate": 9.985662613742483e-06,
"loss": 3.0928,
"step": 1100
},
{
"epoch": 0.12288276320159415,
"grad_norm": 22.04573397808545,
"learning_rate": 9.984163086244971e-06,
"loss": 3.1986,
"step": 1110
},
{
"epoch": 0.12398981512232923,
"grad_norm": 22.85309097193917,
"learning_rate": 9.982589116889811e-06,
"loss": 3.0349,
"step": 1120
},
{
"epoch": 0.12509686704306433,
"grad_norm": 22.5326581537924,
"learning_rate": 9.980940729185305e-06,
"loss": 3.0092,
"step": 1130
},
{
"epoch": 0.1262039189637994,
"grad_norm": 23.18944415182163,
"learning_rate": 9.97921794775124e-06,
"loss": 2.952,
"step": 1140
},
{
"epoch": 0.1273109708845345,
"grad_norm": 21.06330535416755,
"learning_rate": 9.977420798318527e-06,
"loss": 2.9854,
"step": 1150
},
{
"epoch": 0.12841802280526957,
"grad_norm": 20.966301469920268,
"learning_rate": 9.975549307728812e-06,
"loss": 2.9179,
"step": 1160
},
{
"epoch": 0.12952507472600466,
"grad_norm": 20.657024404262373,
"learning_rate": 9.973603503934077e-06,
"loss": 2.9828,
"step": 1170
},
{
"epoch": 0.13063212664673973,
"grad_norm": 23.18808360085381,
"learning_rate": 9.97158341599622e-06,
"loss": 2.8795,
"step": 1180
},
{
"epoch": 0.1317391785674748,
"grad_norm": 19.586003898036246,
"learning_rate": 9.969489074086626e-06,
"loss": 2.9715,
"step": 1190
},
{
"epoch": 0.1328462304882099,
"grad_norm": 23.666001778535268,
"learning_rate": 9.967320509485715e-06,
"loss": 3.0556,
"step": 1200
},
{
"epoch": 0.13395328240894497,
"grad_norm": 20.020096724757796,
"learning_rate": 9.965077754582468e-06,
"loss": 2.925,
"step": 1210
},
{
"epoch": 0.13506033432968007,
"grad_norm": 24.015238653225634,
"learning_rate": 9.962760842873952e-06,
"loss": 2.9019,
"step": 1220
},
{
"epoch": 0.13616738625041513,
"grad_norm": 30.05960379166683,
"learning_rate": 9.960369808964816e-06,
"loss": 2.984,
"step": 1230
},
{
"epoch": 0.13727443817115023,
"grad_norm": 19.296451414183455,
"learning_rate": 9.957904688566774e-06,
"loss": 2.9919,
"step": 1240
},
{
"epoch": 0.1383814900918853,
"grad_norm": 20.11171003157378,
"learning_rate": 9.95536551849807e-06,
"loss": 2.939,
"step": 1250
},
{
"epoch": 0.1394885420126204,
"grad_norm": 24.97381642097054,
"learning_rate": 9.952752336682933e-06,
"loss": 3.0819,
"step": 1260
},
{
"epoch": 0.14059559393335547,
"grad_norm": 19.11189833758423,
"learning_rate": 9.950065182151007e-06,
"loss": 2.9558,
"step": 1270
},
{
"epoch": 0.14170264585409056,
"grad_norm": 22.339401966128147,
"learning_rate": 9.947304095036768e-06,
"loss": 2.971,
"step": 1280
},
{
"epoch": 0.14280969777482563,
"grad_norm": 20.896158530865005,
"learning_rate": 9.944469116578925e-06,
"loss": 2.9734,
"step": 1290
},
{
"epoch": 0.14391674969556073,
"grad_norm": 21.80035128152707,
"learning_rate": 9.941560289119808e-06,
"loss": 3.0756,
"step": 1300
},
{
"epoch": 0.1450238016162958,
"grad_norm": 22.803461112332005,
"learning_rate": 9.938577656104725e-06,
"loss": 2.8886,
"step": 1310
},
{
"epoch": 0.1461308535370309,
"grad_norm": 19.045841307524757,
"learning_rate": 9.935521262081324e-06,
"loss": 2.9949,
"step": 1320
},
{
"epoch": 0.14723790545776597,
"grad_norm": 21.269082405436986,
"learning_rate": 9.932391152698926e-06,
"loss": 3.1047,
"step": 1330
},
{
"epoch": 0.14834495737850106,
"grad_norm": 24.520690144049905,
"learning_rate": 9.929187374707836e-06,
"loss": 2.9404,
"step": 1340
},
{
"epoch": 0.14945200929923613,
"grad_norm": 22.56252212345693,
"learning_rate": 9.925909975958655e-06,
"loss": 2.9609,
"step": 1350
},
{
"epoch": 0.15055906121997123,
"grad_norm": 18.509241308235815,
"learning_rate": 9.922559005401555e-06,
"loss": 2.9581,
"step": 1360
},
{
"epoch": 0.1516661131407063,
"grad_norm": 21.078754308555286,
"learning_rate": 9.919134513085557e-06,
"loss": 3.0338,
"step": 1370
},
{
"epoch": 0.15277316506144137,
"grad_norm": 19.364617917203557,
"learning_rate": 9.915636550157776e-06,
"loss": 3.0394,
"step": 1380
},
{
"epoch": 0.15388021698217647,
"grad_norm": 12.87341952454837,
"learning_rate": 9.912065168862661e-06,
"loss": 2.8927,
"step": 1390
},
{
"epoch": 0.15498726890291153,
"grad_norm": 21.353822481564322,
"learning_rate": 9.908420422541216e-06,
"loss": 2.9264,
"step": 1400
},
{
"epoch": 0.15609432082364663,
"grad_norm": 25.61409358483238,
"learning_rate": 9.9047023656302e-06,
"loss": 3.0722,
"step": 1410
},
{
"epoch": 0.1572013727443817,
"grad_norm": 18.98168487984158,
"learning_rate": 9.90091105366132e-06,
"loss": 3.0422,
"step": 1420
},
{
"epoch": 0.1583084246651168,
"grad_norm": 18.90201248838335,
"learning_rate": 9.897046543260384e-06,
"loss": 2.9686,
"step": 1430
},
{
"epoch": 0.15941547658585187,
"grad_norm": 19.145516912456003,
"learning_rate": 9.893108892146487e-06,
"loss": 2.9299,
"step": 1440
},
{
"epoch": 0.16052252850658696,
"grad_norm": 21.131608116342832,
"learning_rate": 9.889098159131112e-06,
"loss": 2.9767,
"step": 1450
},
{
"epoch": 0.16162958042732203,
"grad_norm": 23.100589010259966,
"learning_rate": 9.88501440411728e-06,
"loss": 2.9711,
"step": 1460
},
{
"epoch": 0.16273663234805713,
"grad_norm": 23.844195002755608,
"learning_rate": 9.88085768809865e-06,
"loss": 3.0006,
"step": 1470
},
{
"epoch": 0.1638436842687922,
"grad_norm": 21.595484978633603,
"learning_rate": 9.876628073158586e-06,
"loss": 2.8897,
"step": 1480
},
{
"epoch": 0.1649507361895273,
"grad_norm": 19.91645782320423,
"learning_rate": 9.872325622469263e-06,
"loss": 2.9626,
"step": 1490
},
{
"epoch": 0.16605778811026237,
"grad_norm": 22.954047655684626,
"learning_rate": 9.8679504002907e-06,
"loss": 2.9654,
"step": 1500
},
{
"epoch": 0.16716484003099746,
"grad_norm": 19.01781845067502,
"learning_rate": 9.863502471969811e-06,
"loss": 2.9689,
"step": 1510
},
{
"epoch": 0.16827189195173253,
"grad_norm": 23.51295361703636,
"learning_rate": 9.858981903939419e-06,
"loss": 2.9714,
"step": 1520
},
{
"epoch": 0.16937894387246763,
"grad_norm": 22.715802630980665,
"learning_rate": 9.85438876371728e-06,
"loss": 2.9433,
"step": 1530
},
{
"epoch": 0.1704859957932027,
"grad_norm": 19.235667821528295,
"learning_rate": 9.849723119905055e-06,
"loss": 2.8702,
"step": 1540
},
{
"epoch": 0.1715930477139378,
"grad_norm": 20.997083855056253,
"learning_rate": 9.844985042187305e-06,
"loss": 2.9613,
"step": 1550
},
{
"epoch": 0.17270009963467287,
"grad_norm": 19.327650896289015,
"learning_rate": 9.840174601330434e-06,
"loss": 2.9561,
"step": 1560
},
{
"epoch": 0.17380715155540793,
"grad_norm": 23.743647417758826,
"learning_rate": 9.835291869181638e-06,
"loss": 2.9465,
"step": 1570
},
{
"epoch": 0.17491420347614303,
"grad_norm": 21.24076642916138,
"learning_rate": 9.830336918667838e-06,
"loss": 2.9089,
"step": 1580
},
{
"epoch": 0.1760212553968781,
"grad_norm": 18.18531438353361,
"learning_rate": 9.82530982379458e-06,
"loss": 2.925,
"step": 1590
},
{
"epoch": 0.1771283073176132,
"grad_norm": 18.941367135114337,
"learning_rate": 9.820210659644938e-06,
"loss": 2.8847,
"step": 1600
},
{
"epoch": 0.17823535923834827,
"grad_norm": 21.6741338404853,
"learning_rate": 9.815039502378387e-06,
"loss": 2.8948,
"step": 1610
},
{
"epoch": 0.17934241115908336,
"grad_norm": 20.193862408863023,
"learning_rate": 9.80979642922967e-06,
"loss": 3.0728,
"step": 1620
},
{
"epoch": 0.18044946307981843,
"grad_norm": 18.820011578655564,
"learning_rate": 9.804481518507645e-06,
"loss": 2.9551,
"step": 1630
},
{
"epoch": 0.18155651500055353,
"grad_norm": 21.501952619775196,
"learning_rate": 9.799094849594107e-06,
"loss": 2.9621,
"step": 1640
},
{
"epoch": 0.1826635669212886,
"grad_norm": 25.610574065149102,
"learning_rate": 9.793636502942611e-06,
"loss": 2.8723,
"step": 1650
},
{
"epoch": 0.1837706188420237,
"grad_norm": 20.593228441794714,
"learning_rate": 9.78810656007727e-06,
"loss": 2.8278,
"step": 1660
},
{
"epoch": 0.18487767076275877,
"grad_norm": 19.172777347075332,
"learning_rate": 9.782505103591533e-06,
"loss": 2.9767,
"step": 1670
},
{
"epoch": 0.18598472268349386,
"grad_norm": 21.02145151466687,
"learning_rate": 9.776832217146952e-06,
"loss": 2.8362,
"step": 1680
},
{
"epoch": 0.18709177460422893,
"grad_norm": 20.621602691872784,
"learning_rate": 9.771087985471936e-06,
"loss": 3.0292,
"step": 1690
},
{
"epoch": 0.18819882652496403,
"grad_norm": 17.865789195071134,
"learning_rate": 9.765272494360483e-06,
"loss": 2.8839,
"step": 1700
},
{
"epoch": 0.1893058784456991,
"grad_norm": 18.637077859127157,
"learning_rate": 9.759385830670897e-06,
"loss": 2.8975,
"step": 1710
},
{
"epoch": 0.1904129303664342,
"grad_norm": 20.242466511532335,
"learning_rate": 9.753428082324496e-06,
"loss": 2.8949,
"step": 1720
},
{
"epoch": 0.19151998228716927,
"grad_norm": 19.93215544071779,
"learning_rate": 9.747399338304295e-06,
"loss": 3.0225,
"step": 1730
},
{
"epoch": 0.19262703420790436,
"grad_norm": 24.762070259664206,
"learning_rate": 9.741299688653676e-06,
"loss": 2.9459,
"step": 1740
},
{
"epoch": 0.19373408612863943,
"grad_norm": 19.63500693742026,
"learning_rate": 9.735129224475044e-06,
"loss": 2.8765,
"step": 1750
},
{
"epoch": 0.19484113804937453,
"grad_norm": 21.82483127686805,
"learning_rate": 9.72888803792847e-06,
"loss": 2.8684,
"step": 1760
},
{
"epoch": 0.1959481899701096,
"grad_norm": 19.049243439574713,
"learning_rate": 9.72257622223031e-06,
"loss": 2.9594,
"step": 1770
},
{
"epoch": 0.19705524189084467,
"grad_norm": 21.414348061773953,
"learning_rate": 9.716193871651814e-06,
"loss": 2.9053,
"step": 1780
},
{
"epoch": 0.19816229381157976,
"grad_norm": 17.876253403312774,
"learning_rate": 9.709741081517717e-06,
"loss": 2.8154,
"step": 1790
},
{
"epoch": 0.19926934573231483,
"grad_norm": 20.116361008310705,
"learning_rate": 9.703217948204821e-06,
"loss": 2.9732,
"step": 1800
},
{
"epoch": 0.20037639765304993,
"grad_norm": 18.744377270113645,
"learning_rate": 9.696624569140547e-06,
"loss": 2.8966,
"step": 1810
},
{
"epoch": 0.201483449573785,
"grad_norm": 19.280130238929477,
"learning_rate": 9.689961042801483e-06,
"loss": 2.8611,
"step": 1820
},
{
"epoch": 0.2025905014945201,
"grad_norm": 19.224045203920024,
"learning_rate": 9.68322746871192e-06,
"loss": 2.8985,
"step": 1830
},
{
"epoch": 0.20369755341525517,
"grad_norm": 23.351196637887085,
"learning_rate": 9.676423947442353e-06,
"loss": 2.9592,
"step": 1840
},
{
"epoch": 0.20480460533599026,
"grad_norm": 17.4707572244572,
"learning_rate": 9.66955058060799e-06,
"loss": 2.9347,
"step": 1850
},
{
"epoch": 0.20591165725672533,
"grad_norm": 20.54442740488183,
"learning_rate": 9.662607470867229e-06,
"loss": 2.8642,
"step": 1860
},
{
"epoch": 0.20701870917746043,
"grad_norm": 19.78074079042836,
"learning_rate": 9.655594721920124e-06,
"loss": 2.8779,
"step": 1870
},
{
"epoch": 0.2081257610981955,
"grad_norm": 19.640164221789888,
"learning_rate": 9.648512438506841e-06,
"loss": 3.0375,
"step": 1880
},
{
"epoch": 0.2092328130189306,
"grad_norm": 17.525300444795423,
"learning_rate": 9.641360726406087e-06,
"loss": 2.9689,
"step": 1890
},
{
"epoch": 0.21033986493966567,
"grad_norm": 16.921755221767196,
"learning_rate": 9.634139692433534e-06,
"loss": 2.9311,
"step": 1900
},
{
"epoch": 0.21144691686040076,
"grad_norm": 23.573635502822672,
"learning_rate": 9.626849444440223e-06,
"loss": 3.1791,
"step": 1910
},
{
"epoch": 0.21255396878113583,
"grad_norm": 21.608288648771143,
"learning_rate": 9.619490091310959e-06,
"loss": 2.9152,
"step": 1920
},
{
"epoch": 0.21366102070187093,
"grad_norm": 21.984519688812558,
"learning_rate": 9.612061742962672e-06,
"loss": 2.8558,
"step": 1930
},
{
"epoch": 0.214768072622606,
"grad_norm": 20.401130440641623,
"learning_rate": 9.604564510342785e-06,
"loss": 2.8631,
"step": 1940
},
{
"epoch": 0.2158751245433411,
"grad_norm": 20.05203124505054,
"learning_rate": 9.596998505427556e-06,
"loss": 2.987,
"step": 1950
},
{
"epoch": 0.21698217646407617,
"grad_norm": 20.868561748558378,
"learning_rate": 9.589363841220398e-06,
"loss": 2.7379,
"step": 1960
},
{
"epoch": 0.21808922838481123,
"grad_norm": 22.537403308642126,
"learning_rate": 9.581660631750205e-06,
"loss": 2.9491,
"step": 1970
},
{
"epoch": 0.21919628030554633,
"grad_norm": 18.786633581936144,
"learning_rate": 9.573888992069635e-06,
"loss": 3.0325,
"step": 1980
},
{
"epoch": 0.2203033322262814,
"grad_norm": 20.183050798106528,
"learning_rate": 9.566049038253404e-06,
"loss": 2.8613,
"step": 1990
},
{
"epoch": 0.2214103841470165,
"grad_norm": 19.889860560476563,
"learning_rate": 9.558140887396539e-06,
"loss": 3.0076,
"step": 2000
},
{
"epoch": 0.2214103841470165,
"eval_loss": 2.899467945098877,
"eval_runtime": 2402.2319,
"eval_samples_per_second": 4.178,
"eval_steps_per_second": 0.418,
"step": 2000
},
{
"epoch": 0.22251743606775157,
"grad_norm": 20.918414604698,
"learning_rate": 9.55016465761264e-06,
"loss": 2.8974,
"step": 2010
},
{
"epoch": 0.22362448798848666,
"grad_norm": 18.12895807311221,
"learning_rate": 9.542120468032108e-06,
"loss": 2.8925,
"step": 2020
},
{
"epoch": 0.22473153990922173,
"grad_norm": 20.68008214687689,
"learning_rate": 9.534008438800378e-06,
"loss": 2.8954,
"step": 2030
},
{
"epoch": 0.22583859182995683,
"grad_norm": 19.62578662683229,
"learning_rate": 9.525828691076107e-06,
"loss": 2.9672,
"step": 2040
},
{
"epoch": 0.2269456437506919,
"grad_norm": 18.137624721398762,
"learning_rate": 9.517581347029378e-06,
"loss": 2.7592,
"step": 2050
},
{
"epoch": 0.228052695671427,
"grad_norm": 18.753830138125636,
"learning_rate": 9.509266529839872e-06,
"loss": 2.7837,
"step": 2060
},
{
"epoch": 0.22915974759216207,
"grad_norm": 17.672344029095868,
"learning_rate": 9.500884363695025e-06,
"loss": 2.8959,
"step": 2070
},
{
"epoch": 0.23026679951289716,
"grad_norm": 17.952725451957562,
"learning_rate": 9.492434973788176e-06,
"loss": 2.9146,
"step": 2080
},
{
"epoch": 0.23137385143363223,
"grad_norm": 21.49636205616348,
"learning_rate": 9.483918486316694e-06,
"loss": 2.9972,
"step": 2090
},
{
"epoch": 0.23248090335436733,
"grad_norm": 17.872259773823583,
"learning_rate": 9.475335028480104e-06,
"loss": 2.9048,
"step": 2100
},
{
"epoch": 0.2335879552751024,
"grad_norm": 18.304493955091758,
"learning_rate": 9.466684728478167e-06,
"loss": 2.8832,
"step": 2110
},
{
"epoch": 0.2346950071958375,
"grad_norm": 20.521104550808733,
"learning_rate": 9.457967715508986e-06,
"loss": 2.9132,
"step": 2120
},
{
"epoch": 0.23580205911657257,
"grad_norm": 21.959898523340325,
"learning_rate": 9.449184119767066e-06,
"loss": 2.8827,
"step": 2130
},
{
"epoch": 0.23690911103730766,
"grad_norm": 17.838849413370237,
"learning_rate": 9.440334072441364e-06,
"loss": 2.9918,
"step": 2140
},
{
"epoch": 0.23801616295804273,
"grad_norm": 19.92878332099444,
"learning_rate": 9.431417705713348e-06,
"loss": 2.9768,
"step": 2150
},
{
"epoch": 0.2391232148787778,
"grad_norm": 22.052024784352827,
"learning_rate": 9.422435152755003e-06,
"loss": 2.7936,
"step": 2160
},
{
"epoch": 0.2402302667995129,
"grad_norm": 18.832979591486268,
"learning_rate": 9.41338654772685e-06,
"loss": 2.8846,
"step": 2170
},
{
"epoch": 0.24133731872024797,
"grad_norm": 20.56672086138257,
"learning_rate": 9.40427202577595e-06,
"loss": 2.9381,
"step": 2180
},
{
"epoch": 0.24244437064098306,
"grad_norm": 19.022342343144167,
"learning_rate": 9.39509172303387e-06,
"loss": 2.7231,
"step": 2190
},
{
"epoch": 0.24355142256171813,
"grad_norm": 18.365037787301343,
"learning_rate": 9.385845776614659e-06,
"loss": 2.8299,
"step": 2200
},
{
"epoch": 0.24465847448245323,
"grad_norm": 16.086771712151563,
"learning_rate": 9.3765343246128e-06,
"loss": 2.8833,
"step": 2210
},
{
"epoch": 0.2457655264031883,
"grad_norm": 17.286906565742285,
"learning_rate": 9.367157506101152e-06,
"loss": 2.8471,
"step": 2220
},
{
"epoch": 0.2468725783239234,
"grad_norm": 16.860767812467355,
"learning_rate": 9.35771546112886e-06,
"loss": 2.7524,
"step": 2230
},
{
"epoch": 0.24797963024465847,
"grad_norm": 22.69662113190212,
"learning_rate": 9.348208330719269e-06,
"loss": 2.9083,
"step": 2240
},
{
"epoch": 0.24908668216539356,
"grad_norm": 17.900886651161414,
"learning_rate": 9.338636256867826e-06,
"loss": 2.8428,
"step": 2250
},
{
"epoch": 0.25019373408612866,
"grad_norm": 16.680552099827924,
"learning_rate": 9.328999382539948e-06,
"loss": 2.8914,
"step": 2260
},
{
"epoch": 0.25130078600686373,
"grad_norm": 18.313897702064246,
"learning_rate": 9.319297851668893e-06,
"loss": 2.9034,
"step": 2270
},
{
"epoch": 0.2524078379275988,
"grad_norm": 16.947671537858998,
"learning_rate": 9.309531809153606e-06,
"loss": 2.8502,
"step": 2280
},
{
"epoch": 0.25351488984833387,
"grad_norm": 18.710427396365873,
"learning_rate": 9.29970140085656e-06,
"loss": 2.8524,
"step": 2290
},
{
"epoch": 0.254621941769069,
"grad_norm": 19.2567190717822,
"learning_rate": 9.28980677360157e-06,
"loss": 2.9991,
"step": 2300
},
{
"epoch": 0.25572899368980406,
"grad_norm": 18.050406635894987,
"learning_rate": 9.279848075171613e-06,
"loss": 2.8717,
"step": 2310
},
{
"epoch": 0.25683604561053913,
"grad_norm": 22.127493631791086,
"learning_rate": 9.269825454306605e-06,
"loss": 2.8977,
"step": 2320
},
{
"epoch": 0.2579430975312742,
"grad_norm": 18.821085236072186,
"learning_rate": 9.259739060701189e-06,
"loss": 2.9116,
"step": 2330
},
{
"epoch": 0.2590501494520093,
"grad_norm": 19.277291605575755,
"learning_rate": 9.249589045002497e-06,
"loss": 2.9024,
"step": 2340
},
{
"epoch": 0.2601572013727444,
"grad_norm": 18.176543407022002,
"learning_rate": 9.239375558807901e-06,
"loss": 2.9065,
"step": 2350
},
{
"epoch": 0.26126425329347946,
"grad_norm": 17.55658292047273,
"learning_rate": 9.229098754662748e-06,
"loss": 2.7598,
"step": 2360
},
{
"epoch": 0.26237130521421453,
"grad_norm": 19.0666485097006,
"learning_rate": 9.218758786058084e-06,
"loss": 2.8376,
"step": 2370
},
{
"epoch": 0.2634783571349496,
"grad_norm": 19.066879018665727,
"learning_rate": 9.208355807428351e-06,
"loss": 2.8766,
"step": 2380
},
{
"epoch": 0.26458540905568473,
"grad_norm": 22.160566724834183,
"learning_rate": 9.197889974149096e-06,
"loss": 2.9115,
"step": 2390
},
{
"epoch": 0.2656924609764198,
"grad_norm": 18.716069674957527,
"learning_rate": 9.187361442534641e-06,
"loss": 2.913,
"step": 2400
},
{
"epoch": 0.26679951289715487,
"grad_norm": 21.86386868859532,
"learning_rate": 9.176770369835748e-06,
"loss": 3.0737,
"step": 2410
},
{
"epoch": 0.26790656481788994,
"grad_norm": 19.87740412211485,
"learning_rate": 9.166116914237277e-06,
"loss": 2.827,
"step": 2420
},
{
"epoch": 0.26901361673862506,
"grad_norm": 20.48966032173197,
"learning_rate": 9.155401234855814e-06,
"loss": 2.8279,
"step": 2430
},
{
"epoch": 0.27012066865936013,
"grad_norm": 18.939462945596684,
"learning_rate": 9.144623491737303e-06,
"loss": 2.8827,
"step": 2440
},
{
"epoch": 0.2712277205800952,
"grad_norm": 16.511411706489035,
"learning_rate": 9.133783845854649e-06,
"loss": 2.8858,
"step": 2450
},
{
"epoch": 0.27233477250083027,
"grad_norm": 17.12242102699232,
"learning_rate": 9.12288245910532e-06,
"loss": 3.0051,
"step": 2460
},
{
"epoch": 0.2734418244215654,
"grad_norm": 21.739631249295055,
"learning_rate": 9.111919494308921e-06,
"loss": 2.8119,
"step": 2470
},
{
"epoch": 0.27454887634230046,
"grad_norm": 19.136040590653046,
"learning_rate": 9.100895115204776e-06,
"loss": 2.9821,
"step": 2480
},
{
"epoch": 0.27565592826303553,
"grad_norm": 18.511511436982243,
"learning_rate": 9.08980948644946e-06,
"loss": 2.8592,
"step": 2490
},
{
"epoch": 0.2767629801837706,
"grad_norm": 20.212663617382482,
"learning_rate": 9.078662773614367e-06,
"loss": 2.9192,
"step": 2500
},
{
"epoch": 0.2778700321045057,
"grad_norm": 20.727838354887886,
"learning_rate": 9.067455143183213e-06,
"loss": 2.8882,
"step": 2510
},
{
"epoch": 0.2789770840252408,
"grad_norm": 20.387190015826864,
"learning_rate": 9.056186762549564e-06,
"loss": 2.8964,
"step": 2520
},
{
"epoch": 0.28008413594597587,
"grad_norm": 21.001687858734584,
"learning_rate": 9.04485780001433e-06,
"loss": 3.0001,
"step": 2530
},
{
"epoch": 0.28119118786671093,
"grad_norm": 15.842781499171902,
"learning_rate": 9.033468424783255e-06,
"loss": 2.8406,
"step": 2540
},
{
"epoch": 0.282298239787446,
"grad_norm": 21.453283495940212,
"learning_rate": 9.022018806964388e-06,
"loss": 2.7475,
"step": 2550
},
{
"epoch": 0.28340529170818113,
"grad_norm": 16.60678323210403,
"learning_rate": 9.010509117565538e-06,
"loss": 2.789,
"step": 2560
},
{
"epoch": 0.2845123436289162,
"grad_norm": 21.22156270449788,
"learning_rate": 8.998939528491724e-06,
"loss": 2.8132,
"step": 2570
},
{
"epoch": 0.28561939554965127,
"grad_norm": 20.029298510004143,
"learning_rate": 8.987310212542613e-06,
"loss": 2.8848,
"step": 2580
},
{
"epoch": 0.28672644747038634,
"grad_norm": 17.416215394194477,
"learning_rate": 8.975621343409927e-06,
"loss": 2.8099,
"step": 2590
},
{
"epoch": 0.28783349939112146,
"grad_norm": 17.8983008619953,
"learning_rate": 8.963873095674858e-06,
"loss": 2.8862,
"step": 2600
},
{
"epoch": 0.28894055131185653,
"grad_norm": 17.34578619148897,
"learning_rate": 8.95206564480546e-06,
"loss": 2.7672,
"step": 2610
},
{
"epoch": 0.2900476032325916,
"grad_norm": 20.307382487515195,
"learning_rate": 8.94019916715402e-06,
"loss": 2.9254,
"step": 2620
},
{
"epoch": 0.29115465515332667,
"grad_norm": 15.542065735556422,
"learning_rate": 8.928273839954437e-06,
"loss": 2.7188,
"step": 2630
},
{
"epoch": 0.2922617070740618,
"grad_norm": 15.573521475112441,
"learning_rate": 8.916289841319564e-06,
"loss": 2.8667,
"step": 2640
},
{
"epoch": 0.29336875899479686,
"grad_norm": 19.410117591693684,
"learning_rate": 8.904247350238551e-06,
"loss": 2.8341,
"step": 2650
},
{
"epoch": 0.29447581091553193,
"grad_norm": 19.84765614341061,
"learning_rate": 8.892146546574172e-06,
"loss": 2.7139,
"step": 2660
},
{
"epoch": 0.295582862836267,
"grad_norm": 17.983856647490622,
"learning_rate": 8.879987611060143e-06,
"loss": 2.6931,
"step": 2670
},
{
"epoch": 0.2966899147570021,
"grad_norm": 16.62072011844082,
"learning_rate": 8.867770725298417e-06,
"loss": 2.8986,
"step": 2680
},
{
"epoch": 0.2977969666777372,
"grad_norm": 22.537941135987385,
"learning_rate": 8.855496071756472e-06,
"loss": 2.9275,
"step": 2690
},
{
"epoch": 0.29890401859847227,
"grad_norm": 19.624324641621538,
"learning_rate": 8.843163833764585e-06,
"loss": 2.8609,
"step": 2700
},
{
"epoch": 0.30001107051920733,
"grad_norm": 14.826694832266885,
"learning_rate": 8.8307741955131e-06,
"loss": 2.832,
"step": 2710
},
{
"epoch": 0.30111812243994246,
"grad_norm": 21.084123058139465,
"learning_rate": 8.818327342049672e-06,
"loss": 2.9927,
"step": 2720
},
{
"epoch": 0.30222517436067753,
"grad_norm": 17.156557696514646,
"learning_rate": 8.805823459276501e-06,
"loss": 2.7874,
"step": 2730
},
{
"epoch": 0.3033322262814126,
"grad_norm": 21.616600083840076,
"learning_rate": 8.793262733947564e-06,
"loss": 2.9143,
"step": 2740
},
{
"epoch": 0.30443927820214767,
"grad_norm": 17.849582075052787,
"learning_rate": 8.780645353665814e-06,
"loss": 2.9265,
"step": 2750
},
{
"epoch": 0.30554633012288274,
"grad_norm": 16.907525943766586,
"learning_rate": 8.767971506880388e-06,
"loss": 2.8079,
"step": 2760
},
{
"epoch": 0.30665338204361786,
"grad_norm": 21.80594816789924,
"learning_rate": 8.755241382883786e-06,
"loss": 2.8586,
"step": 2770
},
{
"epoch": 0.30776043396435293,
"grad_norm": 17.786988703153124,
"learning_rate": 8.74245517180905e-06,
"loss": 2.7957,
"step": 2780
},
{
"epoch": 0.308867485885088,
"grad_norm": 18.535816164863746,
"learning_rate": 8.729613064626916e-06,
"loss": 2.9017,
"step": 2790
},
{
"epoch": 0.30997453780582307,
"grad_norm": 16.811716242078795,
"learning_rate": 8.71671525314297e-06,
"loss": 2.8474,
"step": 2800
},
{
"epoch": 0.3110815897265582,
"grad_norm": 18.305914523882734,
"learning_rate": 8.703761929994779e-06,
"loss": 2.9573,
"step": 2810
},
{
"epoch": 0.31218864164729326,
"grad_norm": 18.579915296564323,
"learning_rate": 8.690753288649013e-06,
"loss": 2.8964,
"step": 2820
},
{
"epoch": 0.31329569356802833,
"grad_norm": 18.539697958237422,
"learning_rate": 8.677689523398556e-06,
"loss": 2.7703,
"step": 2830
},
{
"epoch": 0.3144027454887634,
"grad_norm": 17.915697068912802,
"learning_rate": 8.664570829359608e-06,
"loss": 2.8693,
"step": 2840
},
{
"epoch": 0.3155097974094985,
"grad_norm": 18.898905292436613,
"learning_rate": 8.651397402468765e-06,
"loss": 2.8371,
"step": 2850
},
{
"epoch": 0.3166168493302336,
"grad_norm": 22.702920044801495,
"learning_rate": 8.638169439480097e-06,
"loss": 2.8705,
"step": 2860
},
{
"epoch": 0.31772390125096867,
"grad_norm": 14.669145969089513,
"learning_rate": 8.624887137962206e-06,
"loss": 2.7689,
"step": 2870
},
{
"epoch": 0.31883095317170373,
"grad_norm": 20.31679832956785,
"learning_rate": 8.61155069629528e-06,
"loss": 2.8442,
"step": 2880
},
{
"epoch": 0.31993800509243886,
"grad_norm": 17.50251569058274,
"learning_rate": 8.59816031366812e-06,
"loss": 2.8204,
"step": 2890
},
{
"epoch": 0.32104505701317393,
"grad_norm": 14.301977043806207,
"learning_rate": 8.584716190075182e-06,
"loss": 2.7507,
"step": 2900
},
{
"epoch": 0.322152108933909,
"grad_norm": 16.501447600831984,
"learning_rate": 8.571218526313572e-06,
"loss": 2.847,
"step": 2910
},
{
"epoch": 0.32325916085464407,
"grad_norm": 15.819764582641644,
"learning_rate": 8.557667523980054e-06,
"loss": 2.7269,
"step": 2920
},
{
"epoch": 0.3243662127753792,
"grad_norm": 19.79726490914286,
"learning_rate": 8.544063385468047e-06,
"loss": 2.8579,
"step": 2930
},
{
"epoch": 0.32547326469611426,
"grad_norm": 13.946259262777874,
"learning_rate": 8.530406313964588e-06,
"loss": 2.8433,
"step": 2940
},
{
"epoch": 0.32658031661684933,
"grad_norm": 18.300981068446877,
"learning_rate": 8.516696513447308e-06,
"loss": 2.8518,
"step": 2950
},
{
"epoch": 0.3276873685375844,
"grad_norm": 18.862858354575344,
"learning_rate": 8.502934188681382e-06,
"loss": 2.7097,
"step": 2960
},
{
"epoch": 0.32879442045831947,
"grad_norm": 17.293876429758797,
"learning_rate": 8.489119545216465e-06,
"loss": 2.8865,
"step": 2970
},
{
"epoch": 0.3299014723790546,
"grad_norm": 16.410769414507325,
"learning_rate": 8.475252789383634e-06,
"loss": 2.7419,
"step": 2980
},
{
"epoch": 0.33100852429978966,
"grad_norm": 16.157207346564473,
"learning_rate": 8.461334128292296e-06,
"loss": 2.8566,
"step": 2990
},
{
"epoch": 0.33211557622052473,
"grad_norm": 17.97405966664622,
"learning_rate": 8.447363769827097e-06,
"loss": 2.8409,
"step": 3000
},
{
"epoch": 0.3332226281412598,
"grad_norm": 18.040888448056503,
"learning_rate": 8.43334192264482e-06,
"loss": 2.7078,
"step": 3010
},
{
"epoch": 0.3343296800619949,
"grad_norm": 17.401311897099646,
"learning_rate": 8.41926879617127e-06,
"loss": 2.8375,
"step": 3020
},
{
"epoch": 0.33543673198273,
"grad_norm": 18.971972878515558,
"learning_rate": 8.405144600598136e-06,
"loss": 2.7534,
"step": 3030
},
{
"epoch": 0.33654378390346507,
"grad_norm": 17.56044316128444,
"learning_rate": 8.390969546879868e-06,
"loss": 2.8017,
"step": 3040
},
{
"epoch": 0.33765083582420014,
"grad_norm": 18.9191689174584,
"learning_rate": 8.376743846730506e-06,
"loss": 2.8735,
"step": 3050
},
{
"epoch": 0.33875788774493526,
"grad_norm": 16.159522966531355,
"learning_rate": 8.36246771262054e-06,
"loss": 2.7277,
"step": 3060
},
{
"epoch": 0.33986493966567033,
"grad_norm": 17.732911671191786,
"learning_rate": 8.348141357773714e-06,
"loss": 2.7975,
"step": 3070
},
{
"epoch": 0.3409719915864054,
"grad_norm": 17.580686476759546,
"learning_rate": 8.333764996163863e-06,
"loss": 2.7285,
"step": 3080
},
{
"epoch": 0.34207904350714047,
"grad_norm": 20.220871787654826,
"learning_rate": 8.319338842511701e-06,
"loss": 2.7638,
"step": 3090
},
{
"epoch": 0.3431860954278756,
"grad_norm": 15.421883005921854,
"learning_rate": 8.30486311228162e-06,
"loss": 2.7664,
"step": 3100
},
{
"epoch": 0.34429314734861066,
"grad_norm": 22.52292422020666,
"learning_rate": 8.290338021678478e-06,
"loss": 2.7415,
"step": 3110
},
{
"epoch": 0.34540019926934573,
"grad_norm": 17.773426663788022,
"learning_rate": 8.275763787644354e-06,
"loss": 2.7612,
"step": 3120
},
{
"epoch": 0.3465072511900808,
"grad_norm": 17.313609438292495,
"learning_rate": 8.261140627855326e-06,
"loss": 2.6789,
"step": 3130
},
{
"epoch": 0.34761430311081587,
"grad_norm": 19.92121017478009,
"learning_rate": 8.246468760718205e-06,
"loss": 2.9528,
"step": 3140
},
{
"epoch": 0.348721355031551,
"grad_norm": 20.3829374368461,
"learning_rate": 8.231748405367284e-06,
"loss": 2.7307,
"step": 3150
},
{
"epoch": 0.34982840695228606,
"grad_norm": 17.20183231133198,
"learning_rate": 8.216979781661059e-06,
"loss": 2.7799,
"step": 3160
},
{
"epoch": 0.35093545887302113,
"grad_norm": 17.179059431154894,
"learning_rate": 8.202163110178945e-06,
"loss": 2.7417,
"step": 3170
},
{
"epoch": 0.3520425107937562,
"grad_norm": 17.829683364789567,
"learning_rate": 8.187298612217984e-06,
"loss": 2.7268,
"step": 3180
},
{
"epoch": 0.3531495627144913,
"grad_norm": 20.35885213396436,
"learning_rate": 8.172386509789539e-06,
"loss": 2.8759,
"step": 3190
},
{
"epoch": 0.3542566146352264,
"grad_norm": 18.210319395606284,
"learning_rate": 8.157427025615979e-06,
"loss": 2.7603,
"step": 3200
},
{
"epoch": 0.35536366655596147,
"grad_norm": 20.180991639281267,
"learning_rate": 8.14242038312735e-06,
"loss": 2.6385,
"step": 3210
},
{
"epoch": 0.35647071847669654,
"grad_norm": 13.997589668763045,
"learning_rate": 8.127366806458043e-06,
"loss": 2.6638,
"step": 3220
},
{
"epoch": 0.35757777039743166,
"grad_norm": 16.552842345785916,
"learning_rate": 8.112266520443437e-06,
"loss": 2.8545,
"step": 3230
},
{
"epoch": 0.35868482231816673,
"grad_norm": 22.63458529594302,
"learning_rate": 8.097119750616552e-06,
"loss": 2.9072,
"step": 3240
},
{
"epoch": 0.3597918742389018,
"grad_norm": 20.351123072545064,
"learning_rate": 8.08192672320467e-06,
"loss": 2.8104,
"step": 3250
},
{
"epoch": 0.36089892615963687,
"grad_norm": 18.012402171983243,
"learning_rate": 8.066687665125965e-06,
"loss": 2.8857,
"step": 3260
},
{
"epoch": 0.362005978080372,
"grad_norm": 14.813109416518861,
"learning_rate": 8.051402803986112e-06,
"loss": 2.7149,
"step": 3270
},
{
"epoch": 0.36311303000110706,
"grad_norm": 19.48150839228793,
"learning_rate": 8.036072368074883e-06,
"loss": 2.7073,
"step": 3280
},
{
"epoch": 0.36422008192184213,
"grad_norm": 19.11749404734295,
"learning_rate": 8.020696586362739e-06,
"loss": 2.6653,
"step": 3290
},
{
"epoch": 0.3653271338425772,
"grad_norm": 22.934472507487648,
"learning_rate": 8.005275688497415e-06,
"loss": 2.813,
"step": 3300
},
{
"epoch": 0.3664341857633123,
"grad_norm": 14.997032892515483,
"learning_rate": 7.989809904800483e-06,
"loss": 2.7371,
"step": 3310
},
{
"epoch": 0.3675412376840474,
"grad_norm": 15.5742880306809,
"learning_rate": 7.974299466263919e-06,
"loss": 2.8341,
"step": 3320
},
{
"epoch": 0.36864828960478246,
"grad_norm": 20.142912914493085,
"learning_rate": 7.958744604546641e-06,
"loss": 2.8141,
"step": 3330
},
{
"epoch": 0.36975534152551753,
"grad_norm": 18.86513832413105,
"learning_rate": 7.94314555197107e-06,
"loss": 2.7812,
"step": 3340
},
{
"epoch": 0.3708623934462526,
"grad_norm": 22.49228437600144,
"learning_rate": 7.927502541519637e-06,
"loss": 2.825,
"step": 3350
},
{
"epoch": 0.3719694453669877,
"grad_norm": 22.419596048754094,
"learning_rate": 7.91181580683132e-06,
"loss": 2.8135,
"step": 3360
},
{
"epoch": 0.3730764972877228,
"grad_norm": 16.9758949814327,
"learning_rate": 7.896085582198143e-06,
"loss": 2.7589,
"step": 3370
},
{
"epoch": 0.37418354920845787,
"grad_norm": 17.427893990910892,
"learning_rate": 7.880312102561688e-06,
"loss": 2.8191,
"step": 3380
},
{
"epoch": 0.37529060112919294,
"grad_norm": 16.881634487817756,
"learning_rate": 7.864495603509571e-06,
"loss": 2.7757,
"step": 3390
},
{
"epoch": 0.37639765304992806,
"grad_norm": 17.644413976791455,
"learning_rate": 7.848636321271943e-06,
"loss": 2.8439,
"step": 3400
},
{
"epoch": 0.37750470497066313,
"grad_norm": 17.371658704562304,
"learning_rate": 7.83273449271794e-06,
"loss": 2.8163,
"step": 3410
},
{
"epoch": 0.3786117568913982,
"grad_norm": 17.681733503092357,
"learning_rate": 7.816790355352167e-06,
"loss": 2.7568,
"step": 3420
},
{
"epoch": 0.37971880881213327,
"grad_norm": 18.455389219089255,
"learning_rate": 7.80080414731113e-06,
"loss": 2.6985,
"step": 3430
},
{
"epoch": 0.3808258607328684,
"grad_norm": 16.157025548622848,
"learning_rate": 7.784776107359696e-06,
"loss": 2.7969,
"step": 3440
},
{
"epoch": 0.38193291265360346,
"grad_norm": 14.768944382636816,
"learning_rate": 7.768706474887516e-06,
"loss": 2.7339,
"step": 3450
},
{
"epoch": 0.38303996457433853,
"grad_norm": 18.48084069219429,
"learning_rate": 7.752595489905456e-06,
"loss": 2.7754,
"step": 3460
},
{
"epoch": 0.3841470164950736,
"grad_norm": 19.156514520004468,
"learning_rate": 7.736443393042007e-06,
"loss": 2.847,
"step": 3470
},
{
"epoch": 0.3852540684158087,
"grad_norm": 16.446763048779168,
"learning_rate": 7.720250425539698e-06,
"loss": 2.6395,
"step": 3480
},
{
"epoch": 0.3863611203365438,
"grad_norm": 14.192958419140753,
"learning_rate": 7.704016829251484e-06,
"loss": 2.7273,
"step": 3490
},
{
"epoch": 0.38746817225727886,
"grad_norm": 14.358834052259523,
"learning_rate": 7.687742846637141e-06,
"loss": 2.705,
"step": 3500
},
{
"epoch": 0.38857522417801393,
"grad_norm": 17.950732691617667,
"learning_rate": 7.671428720759641e-06,
"loss": 2.7615,
"step": 3510
},
{
"epoch": 0.38968227609874906,
"grad_norm": 18.082782880469356,
"learning_rate": 7.655074695281526e-06,
"loss": 2.7389,
"step": 3520
},
{
"epoch": 0.39078932801948413,
"grad_norm": 17.001645765491634,
"learning_rate": 7.638681014461263e-06,
"loss": 2.7623,
"step": 3530
},
{
"epoch": 0.3918963799402192,
"grad_norm": 16.148791106439415,
"learning_rate": 7.622247923149597e-06,
"loss": 2.771,
"step": 3540
},
{
"epoch": 0.39300343186095427,
"grad_norm": 16.319755028507952,
"learning_rate": 7.6057756667859e-06,
"loss": 2.745,
"step": 3550
},
{
"epoch": 0.39411048378168934,
"grad_norm": 18.249081210470003,
"learning_rate": 7.589264491394497e-06,
"loss": 2.7631,
"step": 3560
},
{
"epoch": 0.39521753570242446,
"grad_norm": 17.114757273903603,
"learning_rate": 7.572714643580993e-06,
"loss": 2.5916,
"step": 3570
},
{
"epoch": 0.39632458762315953,
"grad_norm": 15.74515478345217,
"learning_rate": 7.556126370528598e-06,
"loss": 2.7441,
"step": 3580
},
{
"epoch": 0.3974316395438946,
"grad_norm": 17.521251320931118,
"learning_rate": 7.539499919994425e-06,
"loss": 2.7365,
"step": 3590
},
{
"epoch": 0.39853869146462967,
"grad_norm": 19.23187701802523,
"learning_rate": 7.522835540305795e-06,
"loss": 2.7919,
"step": 3600
},
{
"epoch": 0.3996457433853648,
"grad_norm": 14.994960528554826,
"learning_rate": 7.506133480356523e-06,
"loss": 2.8063,
"step": 3610
},
{
"epoch": 0.40075279530609986,
"grad_norm": 19.43636713958746,
"learning_rate": 7.489393989603213e-06,
"loss": 2.8291,
"step": 3620
},
{
"epoch": 0.40185984722683493,
"grad_norm": 19.96902221880387,
"learning_rate": 7.472617318061515e-06,
"loss": 2.6574,
"step": 3630
},
{
"epoch": 0.40296689914757,
"grad_norm": 15.764432388205172,
"learning_rate": 7.4558037163023986e-06,
"loss": 2.8279,
"step": 3640
},
{
"epoch": 0.4040739510683051,
"grad_norm": 17.00988346435618,
"learning_rate": 7.438953435448422e-06,
"loss": 2.8606,
"step": 3650
},
{
"epoch": 0.4051810029890402,
"grad_norm": 20.528609879722282,
"learning_rate": 7.422066727169956e-06,
"loss": 2.803,
"step": 3660
},
{
"epoch": 0.40628805490977526,
"grad_norm": 24.117540486267707,
"learning_rate": 7.405143843681453e-06,
"loss": 2.8901,
"step": 3670
},
{
"epoch": 0.40739510683051033,
"grad_norm": 15.932815366392553,
"learning_rate": 7.388185037737656e-06,
"loss": 2.6042,
"step": 3680
},
{
"epoch": 0.40850215875124546,
"grad_norm": 16.494705800421944,
"learning_rate": 7.371190562629842e-06,
"loss": 2.7918,
"step": 3690
},
{
"epoch": 0.40960921067198053,
"grad_norm": 21.567108547663295,
"learning_rate": 7.354160672182027e-06,
"loss": 2.7606,
"step": 3700
},
{
"epoch": 0.4107162625927156,
"grad_norm": 21.48414979932869,
"learning_rate": 7.337095620747181e-06,
"loss": 2.6994,
"step": 3710
},
{
"epoch": 0.41182331451345067,
"grad_norm": 13.807319945171502,
"learning_rate": 7.319995663203425e-06,
"loss": 2.7346,
"step": 3720
},
{
"epoch": 0.41293036643418574,
"grad_norm": 18.456828860891658,
"learning_rate": 7.302861054950231e-06,
"loss": 2.6429,
"step": 3730
},
{
"epoch": 0.41403741835492086,
"grad_norm": 18.493884527191277,
"learning_rate": 7.285692051904596e-06,
"loss": 2.7264,
"step": 3740
},
{
"epoch": 0.41514447027565593,
"grad_norm": 15.443965108568486,
"learning_rate": 7.2684889104972335e-06,
"loss": 2.7915,
"step": 3750
},
{
"epoch": 0.416251522196391,
"grad_norm": 15.970560252697705,
"learning_rate": 7.2512518876687325e-06,
"loss": 2.7585,
"step": 3760
},
{
"epoch": 0.41735857411712607,
"grad_norm": 16.483755053972125,
"learning_rate": 7.233981240865723e-06,
"loss": 2.7225,
"step": 3770
},
{
"epoch": 0.4184656260378612,
"grad_norm": 15.927243910629507,
"learning_rate": 7.2166772280370355e-06,
"loss": 2.7053,
"step": 3780
},
{
"epoch": 0.41957267795859626,
"grad_norm": 16.30824749754582,
"learning_rate": 7.199340107629843e-06,
"loss": 2.7531,
"step": 3790
},
{
"epoch": 0.42067972987933133,
"grad_norm": 17.94048283670358,
"learning_rate": 7.1819701385858045e-06,
"loss": 2.643,
"step": 3800
},
{
"epoch": 0.4217867818000664,
"grad_norm": 18.8081266409834,
"learning_rate": 7.164567580337191e-06,
"loss": 2.759,
"step": 3810
},
{
"epoch": 0.4228938337208015,
"grad_norm": 19.93408221633125,
"learning_rate": 7.147132692803018e-06,
"loss": 2.8159,
"step": 3820
},
{
"epoch": 0.4240008856415366,
"grad_norm": 14.119638307817269,
"learning_rate": 7.1296657363851644e-06,
"loss": 2.5886,
"step": 3830
},
{
"epoch": 0.42510793756227166,
"grad_norm": 14.700749001625018,
"learning_rate": 7.112166971964472e-06,
"loss": 2.7577,
"step": 3840
},
{
"epoch": 0.42621498948300673,
"grad_norm": 16.876997824156497,
"learning_rate": 7.094636660896865e-06,
"loss": 2.7068,
"step": 3850
},
{
"epoch": 0.42732204140374186,
"grad_norm": 17.677042560229854,
"learning_rate": 7.0770750650094335e-06,
"loss": 2.7139,
"step": 3860
},
{
"epoch": 0.42842909332447693,
"grad_norm": 22.903911635500307,
"learning_rate": 7.059482446596525e-06,
"loss": 2.6586,
"step": 3870
},
{
"epoch": 0.429536145245212,
"grad_norm": 17.15359853143299,
"learning_rate": 7.041859068415836e-06,
"loss": 2.7196,
"step": 3880
},
{
"epoch": 0.43064319716594707,
"grad_norm": 18.265015720893867,
"learning_rate": 7.024205193684479e-06,
"loss": 2.795,
"step": 3890
},
{
"epoch": 0.4317502490866822,
"grad_norm": 17.416460348542884,
"learning_rate": 7.006521086075049e-06,
"loss": 2.8018,
"step": 3900
},
{
"epoch": 0.43285730100741726,
"grad_norm": 15.06159976676458,
"learning_rate": 6.9888070097116926e-06,
"loss": 2.6702,
"step": 3910
},
{
"epoch": 0.43396435292815233,
"grad_norm": 14.916257340220586,
"learning_rate": 6.971063229166162e-06,
"loss": 2.667,
"step": 3920
},
{
"epoch": 0.4350714048488874,
"grad_norm": 16.946369105743727,
"learning_rate": 6.953290009453857e-06,
"loss": 2.6547,
"step": 3930
},
{
"epoch": 0.43617845676962247,
"grad_norm": 17.606162667161975,
"learning_rate": 6.9354876160298764e-06,
"loss": 2.7565,
"step": 3940
},
{
"epoch": 0.4372855086903576,
"grad_norm": 15.792356606039535,
"learning_rate": 6.917656314785044e-06,
"loss": 2.7603,
"step": 3950
},
{
"epoch": 0.43839256061109266,
"grad_norm": 17.519385710278783,
"learning_rate": 6.899796372041943e-06,
"loss": 2.5908,
"step": 3960
},
{
"epoch": 0.43949961253182773,
"grad_norm": 18.175539572977502,
"learning_rate": 6.881908054550939e-06,
"loss": 2.7189,
"step": 3970
},
{
"epoch": 0.4406066644525628,
"grad_norm": 16.78341459760071,
"learning_rate": 6.863991629486191e-06,
"loss": 2.7457,
"step": 3980
},
{
"epoch": 0.4417137163732979,
"grad_norm": 16.307893535865905,
"learning_rate": 6.846047364441661e-06,
"loss": 2.7664,
"step": 3990
},
{
"epoch": 0.442820768294033,
"grad_norm": 17.795046718057446,
"learning_rate": 6.828075527427127e-06,
"loss": 2.7682,
"step": 4000
},
{
"epoch": 0.442820768294033,
"eval_loss": 2.715528726577759,
"eval_runtime": 2400.8491,
"eval_samples_per_second": 4.181,
"eval_steps_per_second": 0.418,
"step": 4000
},
{
"epoch": 0.44392782021476807,
"grad_norm": 17.177561938405823,
"learning_rate": 6.810076386864168e-06,
"loss": 2.7353,
"step": 4010
},
{
"epoch": 0.44503487213550313,
"grad_norm": 18.717792449825087,
"learning_rate": 6.792050211582164e-06,
"loss": 2.6284,
"step": 4020
},
{
"epoch": 0.44614192405623826,
"grad_norm": 20.629160666920065,
"learning_rate": 6.77399727081427e-06,
"loss": 2.7808,
"step": 4030
},
{
"epoch": 0.44724897597697333,
"grad_norm": 16.300381610488234,
"learning_rate": 6.755917834193408e-06,
"loss": 2.6976,
"step": 4040
},
{
"epoch": 0.4483560278977084,
"grad_norm": 18.995902150808703,
"learning_rate": 6.737812171748234e-06,
"loss": 2.7441,
"step": 4050
},
{
"epoch": 0.44946307981844347,
"grad_norm": 18.261637709522596,
"learning_rate": 6.719680553899097e-06,
"loss": 2.6822,
"step": 4060
},
{
"epoch": 0.4505701317391786,
"grad_norm": 20.659710982739558,
"learning_rate": 6.701523251454017e-06,
"loss": 2.6978,
"step": 4070
},
{
"epoch": 0.45167718365991366,
"grad_norm": 19.963369393203255,
"learning_rate": 6.683340535604624e-06,
"loss": 2.7391,
"step": 4080
},
{
"epoch": 0.45278423558064873,
"grad_norm": 17.272615462239525,
"learning_rate": 6.665132677922118e-06,
"loss": 2.6982,
"step": 4090
},
{
"epoch": 0.4538912875013838,
"grad_norm": 17.102697486895753,
"learning_rate": 6.646899950353208e-06,
"loss": 2.7443,
"step": 4100
},
{
"epoch": 0.4549983394221189,
"grad_norm": 16.731640547098063,
"learning_rate": 6.628642625216053e-06,
"loss": 2.7825,
"step": 4110
},
{
"epoch": 0.456105391342854,
"grad_norm": 16.86948389308186,
"learning_rate": 6.61036097519619e-06,
"loss": 2.6986,
"step": 4120
},
{
"epoch": 0.45721244326358906,
"grad_norm": 20.677217100728953,
"learning_rate": 6.592055273342467e-06,
"loss": 2.8304,
"step": 4130
},
{
"epoch": 0.45831949518432413,
"grad_norm": 16.821661815243136,
"learning_rate": 6.573725793062965e-06,
"loss": 2.6678,
"step": 4140
},
{
"epoch": 0.4594265471050592,
"grad_norm": 18.45134731193715,
"learning_rate": 6.555372808120907e-06,
"loss": 2.823,
"step": 4150
},
{
"epoch": 0.4605335990257943,
"grad_norm": 17.57852954660428,
"learning_rate": 6.536996592630578e-06,
"loss": 2.7795,
"step": 4160
},
{
"epoch": 0.4616406509465294,
"grad_norm": 17.253221141789883,
"learning_rate": 6.518597421053223e-06,
"loss": 2.7,
"step": 4170
},
{
"epoch": 0.46274770286726447,
"grad_norm": 16.206089784799936,
"learning_rate": 6.5001755681929545e-06,
"loss": 2.7196,
"step": 4180
},
{
"epoch": 0.46385475478799953,
"grad_norm": 18.947069414032423,
"learning_rate": 6.481731309192647e-06,
"loss": 2.7542,
"step": 4190
},
{
"epoch": 0.46496180670873466,
"grad_norm": 16.548697201774296,
"learning_rate": 6.463264919529823e-06,
"loss": 2.7531,
"step": 4200
},
{
"epoch": 0.46606885862946973,
"grad_norm": 17.605153791162124,
"learning_rate": 6.444776675012542e-06,
"loss": 2.7248,
"step": 4210
},
{
"epoch": 0.4671759105502048,
"grad_norm": 18.42367136884591,
"learning_rate": 6.42626685177528e-06,
"loss": 2.6742,
"step": 4220
},
{
"epoch": 0.46828296247093987,
"grad_norm": 21.057012768405876,
"learning_rate": 6.407735726274809e-06,
"loss": 2.7067,
"step": 4230
},
{
"epoch": 0.469390014391675,
"grad_norm": 17.878193605338524,
"learning_rate": 6.38918357528606e-06,
"loss": 2.8213,
"step": 4240
},
{
"epoch": 0.47049706631241006,
"grad_norm": 15.251101561882258,
"learning_rate": 6.370610675897997e-06,
"loss": 2.767,
"step": 4250
},
{
"epoch": 0.47160411823314513,
"grad_norm": 16.35077680470725,
"learning_rate": 6.352017305509475e-06,
"loss": 2.5496,
"step": 4260
},
{
"epoch": 0.4727111701538802,
"grad_norm": 20.78692237253247,
"learning_rate": 6.3334037418250975e-06,
"loss": 2.5517,
"step": 4270
},
{
"epoch": 0.4738182220746153,
"grad_norm": 16.49688836558597,
"learning_rate": 6.314770262851069e-06,
"loss": 2.7365,
"step": 4280
},
{
"epoch": 0.4749252739953504,
"grad_norm": 17.75918198378233,
"learning_rate": 6.296117146891039e-06,
"loss": 2.651,
"step": 4290
},
{
"epoch": 0.47603232591608546,
"grad_norm": 15.289950571080979,
"learning_rate": 6.277444672541953e-06,
"loss": 2.7015,
"step": 4300
},
{
"epoch": 0.47713937783682053,
"grad_norm": 15.010585688125417,
"learning_rate": 6.258753118689887e-06,
"loss": 2.6344,
"step": 4310
},
{
"epoch": 0.4782464297575556,
"grad_norm": 16.384237830668948,
"learning_rate": 6.240042764505877e-06,
"loss": 2.7013,
"step": 4320
},
{
"epoch": 0.4793534816782907,
"grad_norm": 15.761472924874809,
"learning_rate": 6.2213138894417615e-06,
"loss": 2.7414,
"step": 4330
},
{
"epoch": 0.4804605335990258,
"grad_norm": 17.457264405530225,
"learning_rate": 6.202566773225995e-06,
"loss": 2.7923,
"step": 4340
},
{
"epoch": 0.48156758551976087,
"grad_norm": 20.03913075692092,
"learning_rate": 6.1838016958594825e-06,
"loss": 2.7145,
"step": 4350
},
{
"epoch": 0.48267463744049593,
"grad_norm": 14.687794264132354,
"learning_rate": 6.165018937611385e-06,
"loss": 2.6172,
"step": 4360
},
{
"epoch": 0.48378168936123106,
"grad_norm": 15.026413038595793,
"learning_rate": 6.146218779014942e-06,
"loss": 2.6804,
"step": 4370
},
{
"epoch": 0.48488874128196613,
"grad_norm": 17.378458618834472,
"learning_rate": 6.127401500863281e-06,
"loss": 2.5838,
"step": 4380
},
{
"epoch": 0.4859957932027012,
"grad_norm": 16.495531002493667,
"learning_rate": 6.108567384205214e-06,
"loss": 2.5008,
"step": 4390
},
{
"epoch": 0.48710284512343627,
"grad_norm": 15.612526961187054,
"learning_rate": 6.089716710341058e-06,
"loss": 2.5134,
"step": 4400
},
{
"epoch": 0.4882098970441714,
"grad_norm": 17.829542612600722,
"learning_rate": 6.070849760818417e-06,
"loss": 2.6932,
"step": 4410
},
{
"epoch": 0.48931694896490646,
"grad_norm": 18.397184297289453,
"learning_rate": 6.051966817427983e-06,
"loss": 2.664,
"step": 4420
},
{
"epoch": 0.49042400088564153,
"grad_norm": 15.139678235200124,
"learning_rate": 6.03306816219933e-06,
"loss": 2.6431,
"step": 4430
},
{
"epoch": 0.4915310528063766,
"grad_norm": 19.13733604850318,
"learning_rate": 6.014154077396695e-06,
"loss": 2.7429,
"step": 4440
},
{
"epoch": 0.4926381047271117,
"grad_norm": 19.88327633299528,
"learning_rate": 5.995224845514771e-06,
"loss": 2.6894,
"step": 4450
},
{
"epoch": 0.4937451566478468,
"grad_norm": 16.78819908723115,
"learning_rate": 5.97628074927448e-06,
"loss": 2.712,
"step": 4460
},
{
"epoch": 0.49485220856858186,
"grad_norm": 15.34943286541028,
"learning_rate": 5.957322071618753e-06,
"loss": 2.652,
"step": 4470
},
{
"epoch": 0.49595926048931693,
"grad_norm": 14.718777663127804,
"learning_rate": 5.9383490957083045e-06,
"loss": 2.6708,
"step": 4480
},
{
"epoch": 0.49706631241005206,
"grad_norm": 14.06128807028094,
"learning_rate": 5.919362104917403e-06,
"loss": 2.6022,
"step": 4490
},
{
"epoch": 0.4981733643307871,
"grad_norm": 16.565786742803958,
"learning_rate": 5.90036138282964e-06,
"loss": 2.6252,
"step": 4500
},
{
"epoch": 0.4992804162515222,
"grad_norm": 15.757898844662668,
"learning_rate": 5.8813472132336955e-06,
"loss": 2.6229,
"step": 4510
},
{
"epoch": 0.5003874681722573,
"grad_norm": 21.10749621990984,
"learning_rate": 5.862319880119092e-06,
"loss": 2.709,
"step": 4520
},
{
"epoch": 0.5014945200929923,
"grad_norm": 18.080937909773763,
"learning_rate": 5.8432796676719585e-06,
"loss": 2.5919,
"step": 4530
},
{
"epoch": 0.5026015720137275,
"grad_norm": 15.309930072347084,
"learning_rate": 5.824226860270791e-06,
"loss": 2.7639,
"step": 4540
},
{
"epoch": 0.5037086239344625,
"grad_norm": 17.326512802033673,
"learning_rate": 5.805161742482194e-06,
"loss": 2.6954,
"step": 4550
},
{
"epoch": 0.5048156758551976,
"grad_norm": 20.016766712775652,
"learning_rate": 5.786084599056637e-06,
"loss": 2.6651,
"step": 4560
},
{
"epoch": 0.5059227277759327,
"grad_norm": 15.39976054839859,
"learning_rate": 5.766995714924204e-06,
"loss": 2.7208,
"step": 4570
},
{
"epoch": 0.5070297796966677,
"grad_norm": 15.56824968714477,
"learning_rate": 5.747895375190331e-06,
"loss": 2.6959,
"step": 4580
},
{
"epoch": 0.5081368316174029,
"grad_norm": 19.043556423880098,
"learning_rate": 5.728783865131554e-06,
"loss": 2.7182,
"step": 4590
},
{
"epoch": 0.509243883538138,
"grad_norm": 18.533491761930883,
"learning_rate": 5.709661470191241e-06,
"loss": 2.6474,
"step": 4600
},
{
"epoch": 0.510350935458873,
"grad_norm": 17.576811873751446,
"learning_rate": 5.6905284759753365e-06,
"loss": 2.6864,
"step": 4610
},
{
"epoch": 0.5114579873796081,
"grad_norm": 18.79796869282816,
"learning_rate": 5.6713851682480926e-06,
"loss": 2.5302,
"step": 4620
},
{
"epoch": 0.5125650393003431,
"grad_norm": 17.510899102111733,
"learning_rate": 5.6522318329278e-06,
"loss": 2.6672,
"step": 4630
},
{
"epoch": 0.5136720912210783,
"grad_norm": 15.707692417808088,
"learning_rate": 5.633068756082517e-06,
"loss": 2.6229,
"step": 4640
},
{
"epoch": 0.5147791431418134,
"grad_norm": 14.427966106685423,
"learning_rate": 5.613896223925799e-06,
"loss": 2.6565,
"step": 4650
},
{
"epoch": 0.5158861950625484,
"grad_norm": 17.13890386270487,
"learning_rate": 5.594714522812422e-06,
"loss": 2.738,
"step": 4660
},
{
"epoch": 0.5169932469832835,
"grad_norm": 15.344124561854793,
"learning_rate": 5.575523939234111e-06,
"loss": 2.7876,
"step": 4670
},
{
"epoch": 0.5181002989040187,
"grad_norm": 16.79964161196015,
"learning_rate": 5.556324759815252e-06,
"loss": 2.6692,
"step": 4680
},
{
"epoch": 0.5192073508247537,
"grad_norm": 19.56356390380519,
"learning_rate": 5.537117271308615e-06,
"loss": 2.7151,
"step": 4690
},
{
"epoch": 0.5203144027454888,
"grad_norm": 18.641775052939003,
"learning_rate": 5.5179017605910754e-06,
"loss": 2.8004,
"step": 4700
},
{
"epoch": 0.5214214546662238,
"grad_norm": 15.272957986365086,
"learning_rate": 5.4986785146593255e-06,
"loss": 2.7083,
"step": 4710
},
{
"epoch": 0.5225285065869589,
"grad_norm": 15.949027616558995,
"learning_rate": 5.479447820625585e-06,
"loss": 2.6865,
"step": 4720
},
{
"epoch": 0.523635558507694,
"grad_norm": 15.67762021450724,
"learning_rate": 5.46020996571332e-06,
"loss": 2.7183,
"step": 4730
},
{
"epoch": 0.5247426104284291,
"grad_norm": 19.95294125446329,
"learning_rate": 5.4409652372529444e-06,
"loss": 2.7927,
"step": 4740
},
{
"epoch": 0.5258496623491642,
"grad_norm": 13.488762906306286,
"learning_rate": 5.421713922677539e-06,
"loss": 2.5992,
"step": 4750
},
{
"epoch": 0.5269567142698992,
"grad_norm": 16.599798214798543,
"learning_rate": 5.402456309518547e-06,
"loss": 2.5732,
"step": 4760
},
{
"epoch": 0.5280637661906343,
"grad_norm": 14.764833460888406,
"learning_rate": 5.383192685401492e-06,
"loss": 2.5634,
"step": 4770
},
{
"epoch": 0.5291708181113695,
"grad_norm": 17.816571873254308,
"learning_rate": 5.363923338041667e-06,
"loss": 2.64,
"step": 4780
},
{
"epoch": 0.5302778700321045,
"grad_norm": 14.543241263642692,
"learning_rate": 5.344648555239854e-06,
"loss": 2.6637,
"step": 4790
},
{
"epoch": 0.5313849219528396,
"grad_norm": 16.519933702897138,
"learning_rate": 5.325368624878009e-06,
"loss": 2.747,
"step": 4800
},
{
"epoch": 0.5324919738735747,
"grad_norm": 17.67293620152496,
"learning_rate": 5.306083834914977e-06,
"loss": 2.6096,
"step": 4810
},
{
"epoch": 0.5335990257943097,
"grad_norm": 17.919095046156233,
"learning_rate": 5.286794473382178e-06,
"loss": 2.6526,
"step": 4820
},
{
"epoch": 0.5347060777150449,
"grad_norm": 14.567289996672956,
"learning_rate": 5.267500828379319e-06,
"loss": 2.7698,
"step": 4830
},
{
"epoch": 0.5358131296357799,
"grad_norm": 17.34975497496579,
"learning_rate": 5.248203188070078e-06,
"loss": 2.6932,
"step": 4840
},
{
"epoch": 0.536920181556515,
"grad_norm": 14.383043710837034,
"learning_rate": 5.228901840677808e-06,
"loss": 2.533,
"step": 4850
},
{
"epoch": 0.5380272334772501,
"grad_norm": 19.4814620431374,
"learning_rate": 5.209597074481228e-06,
"loss": 2.7526,
"step": 4860
},
{
"epoch": 0.5391342853979851,
"grad_norm": 17.294271003058864,
"learning_rate": 5.19028917781012e-06,
"loss": 2.7006,
"step": 4870
},
{
"epoch": 0.5402413373187203,
"grad_norm": 13.454761500494456,
"learning_rate": 5.170978439041023e-06,
"loss": 2.5453,
"step": 4880
},
{
"epoch": 0.5413483892394554,
"grad_norm": 17.855933800763392,
"learning_rate": 5.151665146592924e-06,
"loss": 2.6315,
"step": 4890
},
{
"epoch": 0.5424554411601904,
"grad_norm": 17.427924222975562,
"learning_rate": 5.132349588922949e-06,
"loss": 2.6539,
"step": 4900
},
{
"epoch": 0.5435624930809255,
"grad_norm": 20.073145834110875,
"learning_rate": 5.113032054522058e-06,
"loss": 2.5488,
"step": 4910
},
{
"epoch": 0.5446695450016605,
"grad_norm": 12.357803208105327,
"learning_rate": 5.093712831910736e-06,
"loss": 2.5557,
"step": 4920
},
{
"epoch": 0.5457765969223957,
"grad_norm": 15.692479347879283,
"learning_rate": 5.0743922096346836e-06,
"loss": 2.7068,
"step": 4930
},
{
"epoch": 0.5468836488431308,
"grad_norm": 14.866689448660685,
"learning_rate": 5.055070476260501e-06,
"loss": 2.576,
"step": 4940
},
{
"epoch": 0.5479907007638658,
"grad_norm": 15.129308088501134,
"learning_rate": 5.0357479203713885e-06,
"loss": 2.3914,
"step": 4950
},
{
"epoch": 0.5490977526846009,
"grad_norm": 14.162687417076338,
"learning_rate": 5.0164248305628284e-06,
"loss": 2.6796,
"step": 4960
},
{
"epoch": 0.5502048046053359,
"grad_norm": 19.323858139882816,
"learning_rate": 4.997101495438277e-06,
"loss": 2.4771,
"step": 4970
},
{
"epoch": 0.5513118565260711,
"grad_norm": 17.540498070177875,
"learning_rate": 4.97777820360486e-06,
"loss": 2.572,
"step": 4980
},
{
"epoch": 0.5524189084468062,
"grad_norm": 19.393507393902457,
"learning_rate": 4.958455243669051e-06,
"loss": 2.6577,
"step": 4990
},
{
"epoch": 0.5535259603675412,
"grad_norm": 17.365811060415265,
"learning_rate": 4.939132904232366e-06,
"loss": 2.6571,
"step": 5000
},
{
"epoch": 0.5546330122882763,
"grad_norm": 14.882734972778014,
"learning_rate": 4.91981147388706e-06,
"loss": 2.5927,
"step": 5010
},
{
"epoch": 0.5557400642090115,
"grad_norm": 18.498227060413406,
"learning_rate": 4.900491241211799e-06,
"loss": 2.6215,
"step": 5020
},
{
"epoch": 0.5568471161297465,
"grad_norm": 16.424230672284246,
"learning_rate": 4.881172494767372e-06,
"loss": 2.738,
"step": 5030
},
{
"epoch": 0.5579541680504816,
"grad_norm": 14.449267161706716,
"learning_rate": 4.861855523092366e-06,
"loss": 2.6883,
"step": 5040
},
{
"epoch": 0.5590612199712166,
"grad_norm": 15.748250902231145,
"learning_rate": 4.84254061469886e-06,
"loss": 2.6369,
"step": 5050
},
{
"epoch": 0.5601682718919517,
"grad_norm": 21.423066740561787,
"learning_rate": 4.823228058068113e-06,
"loss": 2.7159,
"step": 5060
},
{
"epoch": 0.5612753238126869,
"grad_norm": 14.22388926392383,
"learning_rate": 4.803918141646268e-06,
"loss": 2.5795,
"step": 5070
},
{
"epoch": 0.5623823757334219,
"grad_norm": 14.83696241654988,
"learning_rate": 4.784611153840027e-06,
"loss": 2.5612,
"step": 5080
},
{
"epoch": 0.563489427654157,
"grad_norm": 14.263900210157331,
"learning_rate": 4.765307383012352e-06,
"loss": 2.5602,
"step": 5090
},
{
"epoch": 0.564596479574892,
"grad_norm": 17.257310107919768,
"learning_rate": 4.746007117478162e-06,
"loss": 2.611,
"step": 5100
},
{
"epoch": 0.5657035314956271,
"grad_norm": 16.708351070999512,
"learning_rate": 4.726710645500014e-06,
"loss": 2.6106,
"step": 5110
},
{
"epoch": 0.5668105834163623,
"grad_norm": 16.979878309390095,
"learning_rate": 4.707418255283817e-06,
"loss": 2.7961,
"step": 5120
},
{
"epoch": 0.5679176353370973,
"grad_norm": 16.81810768550359,
"learning_rate": 4.6881302349745015e-06,
"loss": 2.5536,
"step": 5130
},
{
"epoch": 0.5690246872578324,
"grad_norm": 16.369198159186666,
"learning_rate": 4.668846872651745e-06,
"loss": 2.7049,
"step": 5140
},
{
"epoch": 0.5701317391785675,
"grad_norm": 14.5307901204883,
"learning_rate": 4.649568456325645e-06,
"loss": 2.6538,
"step": 5150
},
{
"epoch": 0.5712387910993025,
"grad_norm": 13.505347462632475,
"learning_rate": 4.630295273932435e-06,
"loss": 2.5944,
"step": 5160
},
{
"epoch": 0.5723458430200377,
"grad_norm": 14.683292804609174,
"learning_rate": 4.611027613330166e-06,
"loss": 2.6914,
"step": 5170
},
{
"epoch": 0.5734528949407727,
"grad_norm": 17.13643381283879,
"learning_rate": 4.5917657622944235e-06,
"loss": 2.6462,
"step": 5180
},
{
"epoch": 0.5745599468615078,
"grad_norm": 16.94159128538117,
"learning_rate": 4.572510008514027e-06,
"loss": 2.6447,
"step": 5190
},
{
"epoch": 0.5756669987822429,
"grad_norm": 18.068429687848685,
"learning_rate": 4.55326063958672e-06,
"loss": 2.7705,
"step": 5200
},
{
"epoch": 0.5767740507029779,
"grad_norm": 14.55412168781434,
"learning_rate": 4.534017943014895e-06,
"loss": 2.6824,
"step": 5210
},
{
"epoch": 0.5778811026237131,
"grad_norm": 14.837147206944774,
"learning_rate": 4.514782206201274e-06,
"loss": 2.5857,
"step": 5220
},
{
"epoch": 0.5789881545444482,
"grad_norm": 15.433613293909772,
"learning_rate": 4.495553716444647e-06,
"loss": 2.6309,
"step": 5230
},
{
"epoch": 0.5800952064651832,
"grad_norm": 15.838049703049755,
"learning_rate": 4.4763327609355505e-06,
"loss": 2.5826,
"step": 5240
},
{
"epoch": 0.5812022583859183,
"grad_norm": 17.013462581069046,
"learning_rate": 4.457119626751998e-06,
"loss": 2.6681,
"step": 5250
},
{
"epoch": 0.5823093103066533,
"grad_norm": 18.074417040094673,
"learning_rate": 4.437914600855187e-06,
"loss": 2.6364,
"step": 5260
},
{
"epoch": 0.5834163622273885,
"grad_norm": 17.194945416385185,
"learning_rate": 4.4187179700852084e-06,
"loss": 2.6663,
"step": 5270
},
{
"epoch": 0.5845234141481236,
"grad_norm": 17.09566869966539,
"learning_rate": 4.399530021156771e-06,
"loss": 2.5621,
"step": 5280
},
{
"epoch": 0.5856304660688586,
"grad_norm": 18.10182865444287,
"learning_rate": 4.38035104065491e-06,
"loss": 2.6451,
"step": 5290
},
{
"epoch": 0.5867375179895937,
"grad_norm": 13.726610338766326,
"learning_rate": 4.361181315030714e-06,
"loss": 2.6154,
"step": 5300
},
{
"epoch": 0.5878445699103287,
"grad_norm": 13.396115971130266,
"learning_rate": 4.342021130597041e-06,
"loss": 2.6552,
"step": 5310
},
{
"epoch": 0.5889516218310639,
"grad_norm": 18.072129861449454,
"learning_rate": 4.3228707735242485e-06,
"loss": 2.6323,
"step": 5320
},
{
"epoch": 0.590058673751799,
"grad_norm": 16.101311253082667,
"learning_rate": 4.303730529835913e-06,
"loss": 2.5936,
"step": 5330
},
{
"epoch": 0.591165725672534,
"grad_norm": 17.959725344836784,
"learning_rate": 4.28460068540456e-06,
"loss": 2.6568,
"step": 5340
},
{
"epoch": 0.5922727775932691,
"grad_norm": 14.558411141104697,
"learning_rate": 4.2654815259473994e-06,
"loss": 2.599,
"step": 5350
},
{
"epoch": 0.5933798295140043,
"grad_norm": 15.020557260142786,
"learning_rate": 4.2463733370220464e-06,
"loss": 2.6193,
"step": 5360
},
{
"epoch": 0.5944868814347393,
"grad_norm": 16.367462970526052,
"learning_rate": 4.2272764040222724e-06,
"loss": 2.5572,
"step": 5370
},
{
"epoch": 0.5955939333554744,
"grad_norm": 17.24930565347666,
"learning_rate": 4.208191012173728e-06,
"loss": 2.7591,
"step": 5380
},
{
"epoch": 0.5967009852762094,
"grad_norm": 16.29148295415015,
"learning_rate": 4.189117446529692e-06,
"loss": 2.6654,
"step": 5390
},
{
"epoch": 0.5978080371969445,
"grad_norm": 14.636816803347672,
"learning_rate": 4.170055991966808e-06,
"loss": 2.6481,
"step": 5400
},
{
"epoch": 0.5989150891176797,
"grad_norm": 15.770080307849732,
"learning_rate": 4.1510069331808324e-06,
"loss": 2.637,
"step": 5410
},
{
"epoch": 0.6000221410384147,
"grad_norm": 15.398178191768253,
"learning_rate": 4.131970554682387e-06,
"loss": 2.6958,
"step": 5420
},
{
"epoch": 0.6011291929591498,
"grad_norm": 15.861210008610465,
"learning_rate": 4.1129471407926995e-06,
"loss": 2.5836,
"step": 5430
},
{
"epoch": 0.6022362448798849,
"grad_norm": 14.510344904474643,
"learning_rate": 4.093936975639367e-06,
"loss": 2.6514,
"step": 5440
},
{
"epoch": 0.6033432968006199,
"grad_norm": 19.34752243925819,
"learning_rate": 4.0749403431521e-06,
"loss": 2.6221,
"step": 5450
},
{
"epoch": 0.6044503487213551,
"grad_norm": 14.169326871610396,
"learning_rate": 4.055957527058501e-06,
"loss": 2.5109,
"step": 5460
},
{
"epoch": 0.6055574006420901,
"grad_norm": 15.469257875046958,
"learning_rate": 4.036988810879804e-06,
"loss": 2.6436,
"step": 5470
},
{
"epoch": 0.6066644525628252,
"grad_norm": 15.484848198531239,
"learning_rate": 4.018034477926661e-06,
"loss": 2.4906,
"step": 5480
},
{
"epoch": 0.6077715044835603,
"grad_norm": 15.378784092462407,
"learning_rate": 3.9990948112948914e-06,
"loss": 2.6171,
"step": 5490
},
{
"epoch": 0.6088785564042953,
"grad_norm": 14.686645639856618,
"learning_rate": 3.9801700938612685e-06,
"loss": 2.6579,
"step": 5500
},
{
"epoch": 0.6099856083250305,
"grad_norm": 13.215751426102292,
"learning_rate": 3.96126060827929e-06,
"loss": 2.5402,
"step": 5510
},
{
"epoch": 0.6110926602457655,
"grad_norm": 14.135003798272539,
"learning_rate": 3.942366636974954e-06,
"loss": 2.622,
"step": 5520
},
{
"epoch": 0.6121997121665006,
"grad_norm": 17.459175088951138,
"learning_rate": 3.923488462142541e-06,
"loss": 2.5552,
"step": 5530
},
{
"epoch": 0.6133067640872357,
"grad_norm": 15.87291748509675,
"learning_rate": 3.9046263657404005e-06,
"loss": 2.6628,
"step": 5540
},
{
"epoch": 0.6144138160079707,
"grad_norm": 17.77834550937652,
"learning_rate": 3.885780629486744e-06,
"loss": 2.5962,
"step": 5550
},
{
"epoch": 0.6155208679287059,
"grad_norm": 14.623260869268544,
"learning_rate": 3.866951534855429e-06,
"loss": 2.5216,
"step": 5560
},
{
"epoch": 0.616627919849441,
"grad_norm": 18.782526592973454,
"learning_rate": 3.848139363071759e-06,
"loss": 2.5408,
"step": 5570
},
{
"epoch": 0.617734971770176,
"grad_norm": 15.484929469465394,
"learning_rate": 3.8293443951082865e-06,
"loss": 2.5616,
"step": 5580
},
{
"epoch": 0.6188420236909111,
"grad_norm": 17.313043224092755,
"learning_rate": 3.810566911680607e-06,
"loss": 2.6196,
"step": 5590
},
{
"epoch": 0.6199490756116461,
"grad_norm": 14.974425571993558,
"learning_rate": 3.7918071932431823e-06,
"loss": 2.5633,
"step": 5600
},
{
"epoch": 0.6210561275323813,
"grad_norm": 14.593381904858223,
"learning_rate": 3.773065519985132e-06,
"loss": 2.6227,
"step": 5610
},
{
"epoch": 0.6221631794531164,
"grad_norm": 19.67519437375815,
"learning_rate": 3.7543421718260663e-06,
"loss": 2.666,
"step": 5620
},
{
"epoch": 0.6232702313738514,
"grad_norm": 13.058989186832509,
"learning_rate": 3.7356374284118906e-06,
"loss": 2.5616,
"step": 5630
},
{
"epoch": 0.6243772832945865,
"grad_norm": 19.30534098144351,
"learning_rate": 3.716951569110645e-06,
"loss": 2.551,
"step": 5640
},
{
"epoch": 0.6254843352153217,
"grad_norm": 15.614374371487665,
"learning_rate": 3.6982848730083144e-06,
"loss": 2.495,
"step": 5650
},
{
"epoch": 0.6265913871360567,
"grad_norm": 21.218331844105535,
"learning_rate": 3.67963761890467e-06,
"loss": 2.7439,
"step": 5660
},
{
"epoch": 0.6276984390567918,
"grad_norm": 17.01930866391004,
"learning_rate": 3.6610100853091067e-06,
"loss": 2.5619,
"step": 5670
},
{
"epoch": 0.6288054909775268,
"grad_norm": 16.548611978624205,
"learning_rate": 3.642402550436476e-06,
"loss": 2.5517,
"step": 5680
},
{
"epoch": 0.6299125428982619,
"grad_norm": 16.350659146252166,
"learning_rate": 3.6238152922029414e-06,
"loss": 2.6533,
"step": 5690
},
{
"epoch": 0.631019594818997,
"grad_norm": 16.295428081442413,
"learning_rate": 3.6052485882218124e-06,
"loss": 2.5341,
"step": 5700
},
{
"epoch": 0.6321266467397321,
"grad_norm": 16.161944221815478,
"learning_rate": 3.5867027157994137e-06,
"loss": 2.4661,
"step": 5710
},
{
"epoch": 0.6332336986604672,
"grad_norm": 18.192390922499364,
"learning_rate": 3.568177951930932e-06,
"loss": 2.5499,
"step": 5720
},
{
"epoch": 0.6343407505812022,
"grad_norm": 18.154938030310817,
"learning_rate": 3.54967457329629e-06,
"loss": 2.671,
"step": 5730
},
{
"epoch": 0.6354478025019373,
"grad_norm": 17.50231046259661,
"learning_rate": 3.5311928562559984e-06,
"loss": 2.5161,
"step": 5740
},
{
"epoch": 0.6365548544226725,
"grad_norm": 15.071570236507409,
"learning_rate": 3.5127330768470414e-06,
"loss": 2.638,
"step": 5750
},
{
"epoch": 0.6376619063434075,
"grad_norm": 17.638180874471615,
"learning_rate": 3.4942955107787534e-06,
"loss": 2.5672,
"step": 5760
},
{
"epoch": 0.6387689582641426,
"grad_norm": 17.092873285184194,
"learning_rate": 3.4758804334286924e-06,
"loss": 2.6012,
"step": 5770
},
{
"epoch": 0.6398760101848777,
"grad_norm": 14.564343624825167,
"learning_rate": 3.457488119838535e-06,
"loss": 2.5989,
"step": 5780
},
{
"epoch": 0.6409830621056127,
"grad_norm": 16.413821117561785,
"learning_rate": 3.4391188447099614e-06,
"loss": 2.506,
"step": 5790
},
{
"epoch": 0.6420901140263479,
"grad_norm": 18.393396650855887,
"learning_rate": 3.4207728824005653e-06,
"loss": 2.5685,
"step": 5800
},
{
"epoch": 0.6431971659470829,
"grad_norm": 16.91734370623325,
"learning_rate": 3.4024505069197387e-06,
"loss": 2.4561,
"step": 5810
},
{
"epoch": 0.644304217867818,
"grad_norm": 15.98240569506593,
"learning_rate": 3.3841519919245925e-06,
"loss": 2.6473,
"step": 5820
},
{
"epoch": 0.6454112697885531,
"grad_norm": 16.326289119567278,
"learning_rate": 3.3658776107158654e-06,
"loss": 2.4694,
"step": 5830
},
{
"epoch": 0.6465183217092881,
"grad_norm": 18.501828998717585,
"learning_rate": 3.347627636233837e-06,
"loss": 2.6163,
"step": 5840
},
{
"epoch": 0.6476253736300233,
"grad_norm": 17.230377910119174,
"learning_rate": 3.329402341054265e-06,
"loss": 2.5839,
"step": 5850
},
{
"epoch": 0.6487324255507584,
"grad_norm": 15.353383433670851,
"learning_rate": 3.311201997384295e-06,
"loss": 2.6337,
"step": 5860
},
{
"epoch": 0.6498394774714934,
"grad_norm": 16.881261849081998,
"learning_rate": 3.2930268770584127e-06,
"loss": 2.5865,
"step": 5870
},
{
"epoch": 0.6509465293922285,
"grad_norm": 18.123650428151265,
"learning_rate": 3.2748772515343697e-06,
"loss": 2.6292,
"step": 5880
},
{
"epoch": 0.6520535813129635,
"grad_norm": 21.517681336714,
"learning_rate": 3.2567533918891414e-06,
"loss": 2.641,
"step": 5890
},
{
"epoch": 0.6531606332336987,
"grad_norm": 19.398238179320135,
"learning_rate": 3.238655568814868e-06,
"loss": 2.6626,
"step": 5900
},
{
"epoch": 0.6542676851544338,
"grad_norm": 16.094985895672867,
"learning_rate": 3.2205840526148158e-06,
"loss": 2.5219,
"step": 5910
},
{
"epoch": 0.6553747370751688,
"grad_norm": 15.058326544356623,
"learning_rate": 3.2025391131993443e-06,
"loss": 2.5849,
"step": 5920
},
{
"epoch": 0.6564817889959039,
"grad_norm": 15.860339323392015,
"learning_rate": 3.184521020081864e-06,
"loss": 2.3947,
"step": 5930
},
{
"epoch": 0.6575888409166389,
"grad_norm": 17.03657583580592,
"learning_rate": 3.1665300423748256e-06,
"loss": 2.6228,
"step": 5940
},
{
"epoch": 0.6586958928373741,
"grad_norm": 16.449779619145687,
"learning_rate": 3.148566448785687e-06,
"loss": 2.6434,
"step": 5950
},
{
"epoch": 0.6598029447581092,
"grad_norm": 18.51817609745207,
"learning_rate": 3.1306305076129083e-06,
"loss": 2.5301,
"step": 5960
},
{
"epoch": 0.6609099966788442,
"grad_norm": 17.17970665475141,
"learning_rate": 3.112722486741941e-06,
"loss": 2.5608,
"step": 5970
},
{
"epoch": 0.6620170485995793,
"grad_norm": 15.220359891812148,
"learning_rate": 3.094842653641225e-06,
"loss": 2.5432,
"step": 5980
},
{
"epoch": 0.6631241005203145,
"grad_norm": 15.940169495180179,
"learning_rate": 3.076991275358205e-06,
"loss": 2.5147,
"step": 5990
},
{
"epoch": 0.6642311524410495,
"grad_norm": 13.94891949646219,
"learning_rate": 3.059168618515325e-06,
"loss": 2.5043,
"step": 6000
},
{
"epoch": 0.6642311524410495,
"eval_loss": 2.562150716781616,
"eval_runtime": 2394.5594,
"eval_samples_per_second": 4.192,
"eval_steps_per_second": 0.419,
"step": 6000
},
{
"epoch": 0.6653382043617846,
"grad_norm": 17.7531887306566,
"learning_rate": 3.0413749493060596e-06,
"loss": 2.6127,
"step": 6010
},
{
"epoch": 0.6664452562825196,
"grad_norm": 12.808942551796036,
"learning_rate": 3.0236105334909303e-06,
"loss": 2.5683,
"step": 6020
},
{
"epoch": 0.6675523082032547,
"grad_norm": 16.672861233647524,
"learning_rate": 3.0058756363935447e-06,
"loss": 2.5315,
"step": 6030
},
{
"epoch": 0.6686593601239899,
"grad_norm": 15.135037228190633,
"learning_rate": 2.9881705228966217e-06,
"loss": 2.4304,
"step": 6040
},
{
"epoch": 0.6697664120447249,
"grad_norm": 19.201710928838462,
"learning_rate": 2.9704954574380474e-06,
"loss": 2.6006,
"step": 6050
},
{
"epoch": 0.67087346396546,
"grad_norm": 16.780831760906963,
"learning_rate": 2.9528507040069165e-06,
"loss": 2.5291,
"step": 6060
},
{
"epoch": 0.6719805158861951,
"grad_norm": 15.110403344711688,
"learning_rate": 2.935236526139592e-06,
"loss": 2.6148,
"step": 6070
},
{
"epoch": 0.6730875678069301,
"grad_norm": 14.691795830412493,
"learning_rate": 2.9176531869157776e-06,
"loss": 2.623,
"step": 6080
},
{
"epoch": 0.6741946197276653,
"grad_norm": 20.694910027119413,
"learning_rate": 2.900100948954568e-06,
"loss": 2.4261,
"step": 6090
},
{
"epoch": 0.6753016716484003,
"grad_norm": 20.153947600154126,
"learning_rate": 2.8825800744105553e-06,
"loss": 2.5051,
"step": 6100
},
{
"epoch": 0.6764087235691354,
"grad_norm": 16.844446676245752,
"learning_rate": 2.8650908249698837e-06,
"loss": 2.4725,
"step": 6110
},
{
"epoch": 0.6775157754898705,
"grad_norm": 15.629536784931664,
"learning_rate": 2.847633461846363e-06,
"loss": 2.4676,
"step": 6120
},
{
"epoch": 0.6786228274106055,
"grad_norm": 15.244942371558702,
"learning_rate": 2.830208245777556e-06,
"loss": 2.4867,
"step": 6130
},
{
"epoch": 0.6797298793313407,
"grad_norm": 18.15276563682713,
"learning_rate": 2.8128154370208895e-06,
"loss": 2.6125,
"step": 6140
},
{
"epoch": 0.6808369312520757,
"grad_norm": 14.866692854122116,
"learning_rate": 2.7954552953497648e-06,
"loss": 2.4709,
"step": 6150
},
{
"epoch": 0.6819439831728108,
"grad_norm": 15.710254262687716,
"learning_rate": 2.778128080049674e-06,
"loss": 2.5593,
"step": 6160
},
{
"epoch": 0.6830510350935459,
"grad_norm": 16.32088369390469,
"learning_rate": 2.760834049914337e-06,
"loss": 2.5904,
"step": 6170
},
{
"epoch": 0.6841580870142809,
"grad_norm": 17.297718496475216,
"learning_rate": 2.7435734632418286e-06,
"loss": 2.6322,
"step": 6180
},
{
"epoch": 0.6852651389350161,
"grad_norm": 16.18993238219759,
"learning_rate": 2.726346577830722e-06,
"loss": 2.4723,
"step": 6190
},
{
"epoch": 0.6863721908557512,
"grad_norm": 13.340569639729669,
"learning_rate": 2.7091536509762407e-06,
"loss": 2.5087,
"step": 6200
},
{
"epoch": 0.6874792427764862,
"grad_norm": 17.20103511645342,
"learning_rate": 2.691994939466415e-06,
"loss": 2.575,
"step": 6210
},
{
"epoch": 0.6885862946972213,
"grad_norm": 15.066807611711438,
"learning_rate": 2.6748706995782407e-06,
"loss": 2.5264,
"step": 6220
},
{
"epoch": 0.6896933466179563,
"grad_norm": 21.941135059717368,
"learning_rate": 2.657781187073861e-06,
"loss": 2.5012,
"step": 6230
},
{
"epoch": 0.6908003985386915,
"grad_norm": 16.278833357503192,
"learning_rate": 2.640726657196743e-06,
"loss": 2.5817,
"step": 6240
},
{
"epoch": 0.6919074504594266,
"grad_norm": 13.836955054815277,
"learning_rate": 2.6237073646678596e-06,
"loss": 2.5257,
"step": 6250
},
{
"epoch": 0.6930145023801616,
"grad_norm": 17.42891079955518,
"learning_rate": 2.6067235636818975e-06,
"loss": 2.4827,
"step": 6260
},
{
"epoch": 0.6941215543008967,
"grad_norm": 16.66766719981607,
"learning_rate": 2.5897755079034415e-06,
"loss": 2.734,
"step": 6270
},
{
"epoch": 0.6952286062216317,
"grad_norm": 18.01524504020241,
"learning_rate": 2.5728634504632132e-06,
"loss": 2.4481,
"step": 6280
},
{
"epoch": 0.6963356581423669,
"grad_norm": 15.361507532173055,
"learning_rate": 2.555987643954259e-06,
"loss": 2.5952,
"step": 6290
},
{
"epoch": 0.697442710063102,
"grad_norm": 12.548971546748055,
"learning_rate": 2.539148340428203e-06,
"loss": 2.4955,
"step": 6300
},
{
"epoch": 0.698549761983837,
"grad_norm": 16.013770363195505,
"learning_rate": 2.5223457913914713e-06,
"loss": 2.5667,
"step": 6310
},
{
"epoch": 0.6996568139045721,
"grad_norm": 18.08109296107942,
"learning_rate": 2.505580247801529e-06,
"loss": 2.6721,
"step": 6320
},
{
"epoch": 0.7007638658253073,
"grad_norm": 18.233567782447306,
"learning_rate": 2.488851960063153e-06,
"loss": 2.5413,
"step": 6330
},
{
"epoch": 0.7018709177460423,
"grad_norm": 20.185450776651432,
"learning_rate": 2.4721611780246662e-06,
"loss": 2.5205,
"step": 6340
},
{
"epoch": 0.7029779696667774,
"grad_norm": 17.322044563032186,
"learning_rate": 2.4555081509742257e-06,
"loss": 2.6061,
"step": 6350
},
{
"epoch": 0.7040850215875124,
"grad_norm": 16.69861708188076,
"learning_rate": 2.4388931276360898e-06,
"loss": 2.5733,
"step": 6360
},
{
"epoch": 0.7051920735082475,
"grad_norm": 14.9415194058973,
"learning_rate": 2.4223163561669084e-06,
"loss": 2.4084,
"step": 6370
},
{
"epoch": 0.7062991254289827,
"grad_norm": 15.070279374628573,
"learning_rate": 2.4057780841520073e-06,
"loss": 2.4201,
"step": 6380
},
{
"epoch": 0.7074061773497177,
"grad_norm": 16.92425944088654,
"learning_rate": 2.389278558601703e-06,
"loss": 2.674,
"step": 6390
},
{
"epoch": 0.7085132292704528,
"grad_norm": 15.873359974625208,
"learning_rate": 2.3728180259476054e-06,
"loss": 2.5413,
"step": 6400
},
{
"epoch": 0.7096202811911879,
"grad_norm": 17.077658381322358,
"learning_rate": 2.356396732038938e-06,
"loss": 2.5189,
"step": 6410
},
{
"epoch": 0.7107273331119229,
"grad_norm": 15.86795681834881,
"learning_rate": 2.34001492213887e-06,
"loss": 2.6101,
"step": 6420
},
{
"epoch": 0.7118343850326581,
"grad_norm": 13.564052898106056,
"learning_rate": 2.323672840920843e-06,
"loss": 2.5059,
"step": 6430
},
{
"epoch": 0.7129414369533931,
"grad_norm": 16.387911586785865,
"learning_rate": 2.307370732464936e-06,
"loss": 2.4656,
"step": 6440
},
{
"epoch": 0.7140484888741282,
"grad_norm": 15.397100789766657,
"learning_rate": 2.291108840254194e-06,
"loss": 2.5474,
"step": 6450
},
{
"epoch": 0.7151555407948633,
"grad_norm": 20.180668201574875,
"learning_rate": 2.274887407171015e-06,
"loss": 2.6061,
"step": 6460
},
{
"epoch": 0.7162625927155983,
"grad_norm": 16.932276461623562,
"learning_rate": 2.2587066754935088e-06,
"loss": 2.6172,
"step": 6470
},
{
"epoch": 0.7173696446363335,
"grad_norm": 15.85444224400965,
"learning_rate": 2.242566886891878e-06,
"loss": 2.4546,
"step": 6480
},
{
"epoch": 0.7184766965570685,
"grad_norm": 16.024831283317745,
"learning_rate": 2.2264682824248244e-06,
"loss": 2.5442,
"step": 6490
},
{
"epoch": 0.7195837484778036,
"grad_norm": 15.983284722901772,
"learning_rate": 2.210411102535923e-06,
"loss": 2.5027,
"step": 6500
},
{
"epoch": 0.7206908003985387,
"grad_norm": 18.522789630055893,
"learning_rate": 2.194395587050053e-06,
"loss": 2.5553,
"step": 6510
},
{
"epoch": 0.7217978523192737,
"grad_norm": 14.14639815951338,
"learning_rate": 2.178421975169806e-06,
"loss": 2.5721,
"step": 6520
},
{
"epoch": 0.7229049042400089,
"grad_norm": 14.492302660298277,
"learning_rate": 2.1624905054719136e-06,
"loss": 2.4938,
"step": 6530
},
{
"epoch": 0.724011956160744,
"grad_norm": 19.363838132408695,
"learning_rate": 2.146601415903685e-06,
"loss": 2.4218,
"step": 6540
},
{
"epoch": 0.725119008081479,
"grad_norm": 15.90076642116056,
"learning_rate": 2.1307549437794576e-06,
"loss": 2.448,
"step": 6550
},
{
"epoch": 0.7262260600022141,
"grad_norm": 17.3475722033809,
"learning_rate": 2.114951325777041e-06,
"loss": 2.5259,
"step": 6560
},
{
"epoch": 0.7273331119229491,
"grad_norm": 17.081131808882112,
"learning_rate": 2.0991907979341945e-06,
"loss": 2.6131,
"step": 6570
},
{
"epoch": 0.7284401638436843,
"grad_norm": 19.24726121813359,
"learning_rate": 2.083473595645096e-06,
"loss": 2.5176,
"step": 6580
},
{
"epoch": 0.7295472157644194,
"grad_norm": 18.22671174512495,
"learning_rate": 2.067799953656827e-06,
"loss": 2.6385,
"step": 6590
},
{
"epoch": 0.7306542676851544,
"grad_norm": 19.51577253516203,
"learning_rate": 2.052170106065867e-06,
"loss": 2.5878,
"step": 6600
},
{
"epoch": 0.7317613196058895,
"grad_norm": 14.740255840350805,
"learning_rate": 2.0365842863145902e-06,
"loss": 2.6232,
"step": 6610
},
{
"epoch": 0.7328683715266247,
"grad_norm": 17.153524931988514,
"learning_rate": 2.021042727187797e-06,
"loss": 2.4545,
"step": 6620
},
{
"epoch": 0.7339754234473597,
"grad_norm": 16.978859837487686,
"learning_rate": 2.0055456608092135e-06,
"loss": 2.4822,
"step": 6630
},
{
"epoch": 0.7350824753680948,
"grad_norm": 15.507136512277452,
"learning_rate": 1.9900933186380427e-06,
"loss": 2.4757,
"step": 6640
},
{
"epoch": 0.7361895272888298,
"grad_norm": 15.113892086099645,
"learning_rate": 1.9746859314655024e-06,
"loss": 2.4577,
"step": 6650
},
{
"epoch": 0.7372965792095649,
"grad_norm": 19.298868896417396,
"learning_rate": 1.9593237294113688e-06,
"loss": 2.5047,
"step": 6660
},
{
"epoch": 0.7384036311303,
"grad_norm": 13.267678704003732,
"learning_rate": 1.944006941920561e-06,
"loss": 2.5715,
"step": 6670
},
{
"epoch": 0.7395106830510351,
"grad_norm": 14.87293193958646,
"learning_rate": 1.928735797759687e-06,
"loss": 2.5132,
"step": 6680
},
{
"epoch": 0.7406177349717702,
"grad_norm": 16.569655196515217,
"learning_rate": 1.91351052501365e-06,
"loss": 2.5578,
"step": 6690
},
{
"epoch": 0.7417247868925052,
"grad_norm": 18.641862537777396,
"learning_rate": 1.8983313510822283e-06,
"loss": 2.5117,
"step": 6700
},
{
"epoch": 0.7428318388132403,
"grad_norm": 16.649411387878974,
"learning_rate": 1.8831985026766848e-06,
"loss": 2.555,
"step": 6710
},
{
"epoch": 0.7439388907339755,
"grad_norm": 17.113555470969906,
"learning_rate": 1.8681122058163797e-06,
"loss": 2.4762,
"step": 6720
},
{
"epoch": 0.7450459426547105,
"grad_norm": 13.60243042756901,
"learning_rate": 1.853072685825391e-06,
"loss": 2.4798,
"step": 6730
},
{
"epoch": 0.7461529945754456,
"grad_norm": 14.062228805408685,
"learning_rate": 1.8380801673291555e-06,
"loss": 2.5991,
"step": 6740
},
{
"epoch": 0.7472600464961807,
"grad_norm": 12.81974531182581,
"learning_rate": 1.8231348742511102e-06,
"loss": 2.3543,
"step": 6750
},
{
"epoch": 0.7483670984169157,
"grad_norm": 16.835322913216885,
"learning_rate": 1.8082370298093483e-06,
"loss": 2.4387,
"step": 6760
},
{
"epoch": 0.7494741503376509,
"grad_norm": 14.330012440741553,
"learning_rate": 1.7933868565132857e-06,
"loss": 2.6009,
"step": 6770
},
{
"epoch": 0.7505812022583859,
"grad_norm": 15.204347320060766,
"learning_rate": 1.7785845761603376e-06,
"loss": 2.5466,
"step": 6780
},
{
"epoch": 0.751688254179121,
"grad_norm": 17.028609074434605,
"learning_rate": 1.7638304098326025e-06,
"loss": 2.4657,
"step": 6790
},
{
"epoch": 0.7527953060998561,
"grad_norm": 13.259346842026316,
"learning_rate": 1.7491245778935673e-06,
"loss": 2.6145,
"step": 6800
},
{
"epoch": 0.7539023580205911,
"grad_norm": 21.625831350357682,
"learning_rate": 1.7344672999848106e-06,
"loss": 2.5143,
"step": 6810
},
{
"epoch": 0.7550094099413263,
"grad_norm": 19.536045749121886,
"learning_rate": 1.7198587950227235e-06,
"loss": 2.4776,
"step": 6820
},
{
"epoch": 0.7561164618620614,
"grad_norm": 17.421699829582213,
"learning_rate": 1.7052992811952411e-06,
"loss": 2.4593,
"step": 6830
},
{
"epoch": 0.7572235137827964,
"grad_norm": 16.49786576509242,
"learning_rate": 1.6907889759585778e-06,
"loss": 2.6817,
"step": 6840
},
{
"epoch": 0.7583305657035315,
"grad_norm": 14.275882435397286,
"learning_rate": 1.676328096033994e-06,
"loss": 2.4542,
"step": 6850
},
{
"epoch": 0.7594376176242665,
"grad_norm": 17.493762248570647,
"learning_rate": 1.6619168574045385e-06,
"loss": 2.4719,
"step": 6860
},
{
"epoch": 0.7605446695450017,
"grad_norm": 16.007658419129143,
"learning_rate": 1.6475554753118412e-06,
"loss": 2.4291,
"step": 6870
},
{
"epoch": 0.7616517214657368,
"grad_norm": 14.774826021297706,
"learning_rate": 1.6332441642528895e-06,
"loss": 2.6003,
"step": 6880
},
{
"epoch": 0.7627587733864718,
"grad_norm": 15.975567591762553,
"learning_rate": 1.6189831379768206e-06,
"loss": 2.5704,
"step": 6890
},
{
"epoch": 0.7638658253072069,
"grad_norm": 17.406951035088184,
"learning_rate": 1.604772609481744e-06,
"loss": 2.5381,
"step": 6900
},
{
"epoch": 0.7649728772279419,
"grad_norm": 15.245412833911804,
"learning_rate": 1.5906127910115414e-06,
"loss": 2.5041,
"step": 6910
},
{
"epoch": 0.7660799291486771,
"grad_norm": 18.14500430607472,
"learning_rate": 1.576503894052711e-06,
"loss": 2.4126,
"step": 6920
},
{
"epoch": 0.7671869810694122,
"grad_norm": 15.112940123243304,
"learning_rate": 1.5624461293312022e-06,
"loss": 2.4729,
"step": 6930
},
{
"epoch": 0.7682940329901472,
"grad_norm": 14.628425372895773,
"learning_rate": 1.548439706809271e-06,
"loss": 2.4399,
"step": 6940
},
{
"epoch": 0.7694010849108823,
"grad_norm": 14.955427356230805,
"learning_rate": 1.5344848356823395e-06,
"loss": 2.4849,
"step": 6950
},
{
"epoch": 0.7705081368316175,
"grad_norm": 15.352858996367999,
"learning_rate": 1.5205817243758775e-06,
"loss": 2.5061,
"step": 6960
},
{
"epoch": 0.7716151887523525,
"grad_norm": 15.531771804427523,
"learning_rate": 1.506730580542287e-06,
"loss": 2.5352,
"step": 6970
},
{
"epoch": 0.7727222406730876,
"grad_norm": 14.802901269445874,
"learning_rate": 1.4929316110577991e-06,
"loss": 2.4606,
"step": 6980
},
{
"epoch": 0.7738292925938226,
"grad_norm": 13.834503126554017,
"learning_rate": 1.4791850220193882e-06,
"loss": 2.4114,
"step": 6990
},
{
"epoch": 0.7749363445145577,
"grad_norm": 17.626871971044736,
"learning_rate": 1.4654910187416843e-06,
"loss": 2.4443,
"step": 7000
},
{
"epoch": 0.7760433964352929,
"grad_norm": 15.72586832532517,
"learning_rate": 1.451849805753925e-06,
"loss": 2.5959,
"step": 7010
},
{
"epoch": 0.7771504483560279,
"grad_norm": 19.63625622564935,
"learning_rate": 1.4382615867968768e-06,
"loss": 2.577,
"step": 7020
},
{
"epoch": 0.778257500276763,
"grad_norm": 16.259423437860036,
"learning_rate": 1.4247265648198122e-06,
"loss": 2.4003,
"step": 7030
},
{
"epoch": 0.7793645521974981,
"grad_norm": 14.868240052692464,
"learning_rate": 1.4112449419774699e-06,
"loss": 2.4374,
"step": 7040
},
{
"epoch": 0.7804716041182331,
"grad_norm": 17.680915858091048,
"learning_rate": 1.3978169196270297e-06,
"loss": 2.4477,
"step": 7050
},
{
"epoch": 0.7815786560389683,
"grad_norm": 18.788763019346266,
"learning_rate": 1.3844426983251242e-06,
"loss": 2.6663,
"step": 7060
},
{
"epoch": 0.7826857079597033,
"grad_norm": 17.443967486074488,
"learning_rate": 1.3711224778248178e-06,
"loss": 2.4001,
"step": 7070
},
{
"epoch": 0.7837927598804384,
"grad_norm": 14.104765296687267,
"learning_rate": 1.3578564570726437e-06,
"loss": 2.5499,
"step": 7080
},
{
"epoch": 0.7848998118011735,
"grad_norm": 14.938982184936348,
"learning_rate": 1.344644834205624e-06,
"loss": 2.6234,
"step": 7090
},
{
"epoch": 0.7860068637219085,
"grad_norm": 16.601186409737505,
"learning_rate": 1.3314878065483106e-06,
"loss": 2.4678,
"step": 7100
},
{
"epoch": 0.7871139156426437,
"grad_norm": 16.126461328991052,
"learning_rate": 1.318385570609838e-06,
"loss": 2.5181,
"step": 7110
},
{
"epoch": 0.7882209675633787,
"grad_norm": 14.264212101115474,
"learning_rate": 1.3053383220809934e-06,
"loss": 2.5319,
"step": 7120
},
{
"epoch": 0.7893280194841138,
"grad_norm": 16.674084788709003,
"learning_rate": 1.2923462558312827e-06,
"loss": 2.5588,
"step": 7130
},
{
"epoch": 0.7904350714048489,
"grad_norm": 14.125047804457926,
"learning_rate": 1.2794095659060335e-06,
"loss": 2.495,
"step": 7140
},
{
"epoch": 0.7915421233255839,
"grad_norm": 13.689321540078824,
"learning_rate": 1.2665284455234867e-06,
"loss": 2.6346,
"step": 7150
},
{
"epoch": 0.7926491752463191,
"grad_norm": 17.491763233443507,
"learning_rate": 1.2537030870719159e-06,
"loss": 2.3638,
"step": 7160
},
{
"epoch": 0.7937562271670542,
"grad_norm": 14.712500473982459,
"learning_rate": 1.2409336821067535e-06,
"loss": 2.4199,
"step": 7170
},
{
"epoch": 0.7948632790877892,
"grad_norm": 13.97965354212977,
"learning_rate": 1.2282204213477233e-06,
"loss": 2.4273,
"step": 7180
},
{
"epoch": 0.7959703310085243,
"grad_norm": 15.125599625889896,
"learning_rate": 1.215563494676007e-06,
"loss": 2.5639,
"step": 7190
},
{
"epoch": 0.7970773829292593,
"grad_norm": 15.308235089960142,
"learning_rate": 1.2029630911313877e-06,
"loss": 2.4943,
"step": 7200
},
{
"epoch": 0.7981844348499945,
"grad_norm": 14.243073168806442,
"learning_rate": 1.1904193989094442e-06,
"loss": 2.6061,
"step": 7210
},
{
"epoch": 0.7992914867707296,
"grad_norm": 14.898872151747849,
"learning_rate": 1.1779326053587326e-06,
"loss": 2.6109,
"step": 7220
},
{
"epoch": 0.8003985386914646,
"grad_norm": 15.213968169737058,
"learning_rate": 1.165502896977983e-06,
"loss": 2.5029,
"step": 7230
},
{
"epoch": 0.8015055906121997,
"grad_norm": 17.57190386080436,
"learning_rate": 1.1531304594133297e-06,
"loss": 2.5218,
"step": 7240
},
{
"epoch": 0.8026126425329347,
"grad_norm": 14.718334901930403,
"learning_rate": 1.1408154774555185e-06,
"loss": 2.5644,
"step": 7250
},
{
"epoch": 0.8037196944536699,
"grad_norm": 14.70466300668309,
"learning_rate": 1.1285581350371633e-06,
"loss": 2.5673,
"step": 7260
},
{
"epoch": 0.804826746374405,
"grad_norm": 16.523083604536307,
"learning_rate": 1.11635861522999e-06,
"loss": 2.6119,
"step": 7270
},
{
"epoch": 0.80593379829514,
"grad_norm": 16.087233648796555,
"learning_rate": 1.1042171002421038e-06,
"loss": 2.3668,
"step": 7280
},
{
"epoch": 0.8070408502158751,
"grad_norm": 18.219483436423715,
"learning_rate": 1.092133771415272e-06,
"loss": 2.5108,
"step": 7290
},
{
"epoch": 0.8081479021366103,
"grad_norm": 14.23626021764468,
"learning_rate": 1.0801088092222067e-06,
"loss": 2.5161,
"step": 7300
},
{
"epoch": 0.8092549540573453,
"grad_norm": 17.579234694984372,
"learning_rate": 1.0681423932638784e-06,
"loss": 2.472,
"step": 7310
},
{
"epoch": 0.8103620059780804,
"grad_norm": 17.509613972476572,
"learning_rate": 1.05623470226683e-06,
"loss": 2.5078,
"step": 7320
},
{
"epoch": 0.8114690578988154,
"grad_norm": 16.567966169697417,
"learning_rate": 1.0443859140805063e-06,
"loss": 2.5549,
"step": 7330
},
{
"epoch": 0.8125761098195505,
"grad_norm": 13.228102448828993,
"learning_rate": 1.032596205674598e-06,
"loss": 2.5958,
"step": 7340
},
{
"epoch": 0.8136831617402857,
"grad_norm": 14.33253011909644,
"learning_rate": 1.020865753136402e-06,
"loss": 2.4304,
"step": 7350
},
{
"epoch": 0.8147902136610207,
"grad_norm": 16.763970324305024,
"learning_rate": 1.0091947316681833e-06,
"loss": 2.5536,
"step": 7360
},
{
"epoch": 0.8158972655817558,
"grad_norm": 16.082943781448364,
"learning_rate": 9.975833155845687e-07,
"loss": 2.4768,
"step": 7370
},
{
"epoch": 0.8170043175024909,
"grad_norm": 15.909337215300724,
"learning_rate": 9.860316783099356e-07,
"loss": 2.4912,
"step": 7380
},
{
"epoch": 0.8181113694232259,
"grad_norm": 17.194058825674805,
"learning_rate": 9.74539992375826e-07,
"loss": 2.4761,
"step": 7390
},
{
"epoch": 0.8192184213439611,
"grad_norm": 15.251099269067993,
"learning_rate": 9.631084294183668e-07,
"loss": 2.538,
"step": 7400
},
{
"epoch": 0.8203254732646961,
"grad_norm": 14.28790996742064,
"learning_rate": 9.517371601757042e-07,
"loss": 2.536,
"step": 7410
},
{
"epoch": 0.8214325251854312,
"grad_norm": 17.000395820091192,
"learning_rate": 9.404263544854658e-07,
"loss": 2.4934,
"step": 7420
},
{
"epoch": 0.8225395771061663,
"grad_norm": 14.025873757437632,
"learning_rate": 9.291761812822054e-07,
"loss": 2.4447,
"step": 7430
},
{
"epoch": 0.8236466290269013,
"grad_norm": 20.369511420071024,
"learning_rate": 9.179868085948946e-07,
"loss": 2.5157,
"step": 7440
},
{
"epoch": 0.8247536809476365,
"grad_norm": 16.887509510072285,
"learning_rate": 9.068584035444083e-07,
"loss": 2.4785,
"step": 7450
},
{
"epoch": 0.8258607328683715,
"grad_norm": 15.952259196345977,
"learning_rate": 8.957911323410229e-07,
"loss": 2.4653,
"step": 7460
},
{
"epoch": 0.8269677847891066,
"grad_norm": 16.24199510067374,
"learning_rate": 8.847851602819485e-07,
"loss": 2.5294,
"step": 7470
},
{
"epoch": 0.8280748367098417,
"grad_norm": 16.976947365156782,
"learning_rate": 8.738406517488423e-07,
"loss": 2.5297,
"step": 7480
},
{
"epoch": 0.8291818886305767,
"grad_norm": 17.934378030024483,
"learning_rate": 8.629577702053671e-07,
"loss": 2.6052,
"step": 7490
},
{
"epoch": 0.8302889405513119,
"grad_norm": 15.407244538769637,
"learning_rate": 8.521366781947426e-07,
"loss": 2.4532,
"step": 7500
},
{
"epoch": 0.831395992472047,
"grad_norm": 15.400477059234891,
"learning_rate": 8.413775373373206e-07,
"loss": 2.4579,
"step": 7510
},
{
"epoch": 0.832503044392782,
"grad_norm": 17.39392388174797,
"learning_rate": 8.306805083281705e-07,
"loss": 2.6138,
"step": 7520
},
{
"epoch": 0.8336100963135171,
"grad_norm": 14.342293383217136,
"learning_rate": 8.200457509346798e-07,
"loss": 2.3725,
"step": 7530
},
{
"epoch": 0.8347171482342521,
"grad_norm": 15.847161214149653,
"learning_rate": 8.094734239941642e-07,
"loss": 2.3768,
"step": 7540
},
{
"epoch": 0.8358242001549873,
"grad_norm": 17.63332070962175,
"learning_rate": 7.989636854115018e-07,
"loss": 2.4585,
"step": 7550
},
{
"epoch": 0.8369312520757224,
"grad_norm": 16.531198506312407,
"learning_rate": 7.885166921567705e-07,
"loss": 2.4787,
"step": 7560
},
{
"epoch": 0.8380383039964574,
"grad_norm": 14.28759893561945,
"learning_rate": 7.781326002628991e-07,
"loss": 2.4685,
"step": 7570
},
{
"epoch": 0.8391453559171925,
"grad_norm": 14.826430399325979,
"learning_rate": 7.678115648233514e-07,
"loss": 2.4173,
"step": 7580
},
{
"epoch": 0.8402524078379277,
"grad_norm": 14.87587504335515,
"learning_rate": 7.57553739989792e-07,
"loss": 2.51,
"step": 7590
},
{
"epoch": 0.8413594597586627,
"grad_norm": 17.574559912620376,
"learning_rate": 7.473592789697947e-07,
"loss": 2.4794,
"step": 7600
},
{
"epoch": 0.8424665116793978,
"grad_norm": 17.140986686992314,
"learning_rate": 7.37228334024555e-07,
"loss": 2.416,
"step": 7610
},
{
"epoch": 0.8435735636001328,
"grad_norm": 15.506861252303242,
"learning_rate": 7.271610564666054e-07,
"loss": 2.3907,
"step": 7620
},
{
"epoch": 0.8446806155208679,
"grad_norm": 15.538508359449784,
"learning_rate": 7.171575966575722e-07,
"loss": 2.5462,
"step": 7630
},
{
"epoch": 0.845787667441603,
"grad_norm": 16.810003606583724,
"learning_rate": 7.072181040059123e-07,
"loss": 2.486,
"step": 7640
},
{
"epoch": 0.8468947193623381,
"grad_norm": 17.523279420449594,
"learning_rate": 6.973427269646932e-07,
"loss": 2.4714,
"step": 7650
},
{
"epoch": 0.8480017712830732,
"grad_norm": 14.739045055561698,
"learning_rate": 6.875316130293724e-07,
"loss": 2.5424,
"step": 7660
},
{
"epoch": 0.8491088232038082,
"grad_norm": 15.925664585980916,
"learning_rate": 6.777849087355932e-07,
"loss": 2.4951,
"step": 7670
},
{
"epoch": 0.8502158751245433,
"grad_norm": 14.15278086352724,
"learning_rate": 6.681027596569988e-07,
"loss": 2.4984,
"step": 7680
},
{
"epoch": 0.8513229270452785,
"grad_norm": 14.613485875082265,
"learning_rate": 6.584853104030553e-07,
"loss": 2.415,
"step": 7690
},
{
"epoch": 0.8524299789660135,
"grad_norm": 13.79991123891203,
"learning_rate": 6.48932704616892e-07,
"loss": 2.4957,
"step": 7700
},
{
"epoch": 0.8535370308867486,
"grad_norm": 16.538555088229636,
"learning_rate": 6.394450849731587e-07,
"loss": 2.5322,
"step": 7710
},
{
"epoch": 0.8546440828074837,
"grad_norm": 17.641076622043553,
"learning_rate": 6.300225931758924e-07,
"loss": 2.4296,
"step": 7720
},
{
"epoch": 0.8557511347282187,
"grad_norm": 17.606467927789563,
"learning_rate": 6.206653699564014e-07,
"loss": 2.5163,
"step": 7730
},
{
"epoch": 0.8568581866489539,
"grad_norm": 17.809260161423225,
"learning_rate": 6.113735550711658e-07,
"loss": 2.4642,
"step": 7740
},
{
"epoch": 0.8579652385696889,
"grad_norm": 13.623839785347023,
"learning_rate": 6.021472872997419e-07,
"loss": 2.512,
"step": 7750
},
{
"epoch": 0.859072290490424,
"grad_norm": 18.78017884173273,
"learning_rate": 5.929867044427035e-07,
"loss": 2.4144,
"step": 7760
},
{
"epoch": 0.8601793424111591,
"grad_norm": 16.837093504212152,
"learning_rate": 5.838919433195678e-07,
"loss": 2.5047,
"step": 7770
},
{
"epoch": 0.8612863943318941,
"grad_norm": 16.87004336022709,
"learning_rate": 5.748631397667654e-07,
"loss": 2.5213,
"step": 7780
},
{
"epoch": 0.8623934462526293,
"grad_norm": 15.69091627736047,
"learning_rate": 5.659004286356045e-07,
"loss": 2.5533,
"step": 7790
},
{
"epoch": 0.8635004981733644,
"grad_norm": 14.187307779530673,
"learning_rate": 5.570039437902536e-07,
"loss": 2.441,
"step": 7800
},
{
"epoch": 0.8646075500940994,
"grad_norm": 17.93288869083588,
"learning_rate": 5.481738181057556e-07,
"loss": 2.5006,
"step": 7810
},
{
"epoch": 0.8657146020148345,
"grad_norm": 15.826634381411255,
"learning_rate": 5.394101834660253e-07,
"loss": 2.4135,
"step": 7820
},
{
"epoch": 0.8668216539355695,
"grad_norm": 16.596251661361375,
"learning_rate": 5.307131707618934e-07,
"loss": 2.4909,
"step": 7830
},
{
"epoch": 0.8679287058563047,
"grad_norm": 15.129013018674039,
"learning_rate": 5.220829098891472e-07,
"loss": 2.4429,
"step": 7840
},
{
"epoch": 0.8690357577770398,
"grad_norm": 14.305450352981211,
"learning_rate": 5.135195297465878e-07,
"loss": 2.4862,
"step": 7850
},
{
"epoch": 0.8701428096977748,
"grad_norm": 12.863234686905033,
"learning_rate": 5.050231582341092e-07,
"loss": 2.4616,
"step": 7860
},
{
"epoch": 0.8712498616185099,
"grad_norm": 13.900921327498637,
"learning_rate": 4.965939222507832e-07,
"loss": 2.5505,
"step": 7870
},
{
"epoch": 0.8723569135392449,
"grad_norm": 15.774427990260946,
"learning_rate": 4.882319476929698e-07,
"loss": 2.4643,
"step": 7880
},
{
"epoch": 0.8734639654599801,
"grad_norm": 18.695971386290847,
"learning_rate": 4.799373594524332e-07,
"loss": 2.4695,
"step": 7890
},
{
"epoch": 0.8745710173807152,
"grad_norm": 15.398391843940924,
"learning_rate": 4.7171028141447693e-07,
"loss": 2.5612,
"step": 7900
},
{
"epoch": 0.8756780693014502,
"grad_norm": 13.563010738327588,
"learning_rate": 4.635508364560937e-07,
"loss": 2.4357,
"step": 7910
},
{
"epoch": 0.8767851212221853,
"grad_norm": 14.212010150425057,
"learning_rate": 4.5545914644413103e-07,
"loss": 2.4529,
"step": 7920
},
{
"epoch": 0.8778921731429205,
"grad_norm": 13.857542112005609,
"learning_rate": 4.474353322334679e-07,
"loss": 2.4963,
"step": 7930
},
{
"epoch": 0.8789992250636555,
"grad_norm": 14.666024515134973,
"learning_rate": 4.394795136652169e-07,
"loss": 2.4512,
"step": 7940
},
{
"epoch": 0.8801062769843906,
"grad_norm": 16.841948685566276,
"learning_rate": 4.315918095649246e-07,
"loss": 2.5056,
"step": 7950
},
{
"epoch": 0.8812133289051256,
"grad_norm": 15.413187142241657,
"learning_rate": 4.2377233774080427e-07,
"loss": 2.5528,
"step": 7960
},
{
"epoch": 0.8823203808258607,
"grad_norm": 13.784700431727842,
"learning_rate": 4.1602121498197477e-07,
"loss": 2.4622,
"step": 7970
},
{
"epoch": 0.8834274327465959,
"grad_norm": 14.844903872123188,
"learning_rate": 4.0833855705671057e-07,
"loss": 2.4508,
"step": 7980
},
{
"epoch": 0.8845344846673309,
"grad_norm": 16.047205147717147,
"learning_rate": 4.0072447871072507e-07,
"loss": 2.4968,
"step": 7990
},
{
"epoch": 0.885641536588066,
"grad_norm": 12.9721170754397,
"learning_rate": 3.931790936654417e-07,
"loss": 2.3906,
"step": 8000
},
{
"epoch": 0.885641536588066,
"eval_loss": 2.48763370513916,
"eval_runtime": 2402.0825,
"eval_samples_per_second": 4.178,
"eval_steps_per_second": 0.418,
"step": 8000
},
{
"epoch": 0.8867485885088011,
"grad_norm": 15.854557624198474,
"learning_rate": 3.8570251461630735e-07,
"loss": 2.4579,
"step": 8010
},
{
"epoch": 0.8878556404295361,
"grad_norm": 16.026725672049096,
"learning_rate": 3.7829485323110316e-07,
"loss": 2.3463,
"step": 8020
},
{
"epoch": 0.8889626923502713,
"grad_norm": 16.073422441115532,
"learning_rate": 3.709562201482769e-07,
"loss": 2.4243,
"step": 8030
},
{
"epoch": 0.8900697442710063,
"grad_norm": 15.38779771086279,
"learning_rate": 3.636867249752962e-07,
"loss": 2.3858,
"step": 8040
},
{
"epoch": 0.8911767961917414,
"grad_norm": 16.258826268938925,
"learning_rate": 3.564864762870013e-07,
"loss": 2.5358,
"step": 8050
},
{
"epoch": 0.8922838481124765,
"grad_norm": 15.02798068624606,
"learning_rate": 3.49355581623993e-07,
"loss": 2.4421,
"step": 8060
},
{
"epoch": 0.8933909000332115,
"grad_norm": 16.654143045304426,
"learning_rate": 3.4229414749102186e-07,
"loss": 2.5125,
"step": 8070
},
{
"epoch": 0.8944979519539467,
"grad_norm": 13.762735453146883,
"learning_rate": 3.353022793553978e-07,
"loss": 2.6232,
"step": 8080
},
{
"epoch": 0.8956050038746817,
"grad_norm": 11.721658803005548,
"learning_rate": 3.2838008164541577e-07,
"loss": 2.4208,
"step": 8090
},
{
"epoch": 0.8967120557954168,
"grad_norm": 15.661791327346446,
"learning_rate": 3.215276577487969e-07,
"loss": 2.5037,
"step": 8100
},
{
"epoch": 0.8978191077161519,
"grad_norm": 14.437374220759548,
"learning_rate": 3.1474511001113926e-07,
"loss": 2.453,
"step": 8110
},
{
"epoch": 0.8989261596368869,
"grad_norm": 23.96541891259206,
"learning_rate": 3.080325397343969e-07,
"loss": 2.4866,
"step": 8120
},
{
"epoch": 0.9000332115576221,
"grad_norm": 14.703254905186904,
"learning_rate": 3.013900471753628e-07,
"loss": 2.5269,
"step": 8130
},
{
"epoch": 0.9011402634783572,
"grad_norm": 17.763519535077947,
"learning_rate": 2.948177315441669e-07,
"loss": 2.5009,
"step": 8140
},
{
"epoch": 0.9022473153990922,
"grad_norm": 18.50559050540985,
"learning_rate": 2.883156910028073e-07,
"loss": 2.4501,
"step": 8150
},
{
"epoch": 0.9033543673198273,
"grad_norm": 13.811835537975867,
"learning_rate": 2.818840226636671e-07,
"loss": 2.3126,
"step": 8160
},
{
"epoch": 0.9044614192405623,
"grad_norm": 18.988992451266952,
"learning_rate": 2.7552282258808125e-07,
"loss": 2.4317,
"step": 8170
},
{
"epoch": 0.9055684711612975,
"grad_norm": 16.031786166509363,
"learning_rate": 2.6923218578488674e-07,
"loss": 2.4247,
"step": 8180
},
{
"epoch": 0.9066755230820326,
"grad_norm": 18.728666251016826,
"learning_rate": 2.630122062090118e-07,
"loss": 2.3527,
"step": 8190
},
{
"epoch": 0.9077825750027676,
"grad_norm": 19.825199377152217,
"learning_rate": 2.568629767600744e-07,
"loss": 2.6088,
"step": 8200
},
{
"epoch": 0.9088896269235027,
"grad_norm": 16.87396488382408,
"learning_rate": 2.507845892809868e-07,
"loss": 2.3591,
"step": 8210
},
{
"epoch": 0.9099966788442378,
"grad_norm": 14.693971646543563,
"learning_rate": 2.4477713455659136e-07,
"loss": 2.4239,
"step": 8220
},
{
"epoch": 0.9111037307649729,
"grad_norm": 15.947418670710583,
"learning_rate": 2.388407023123007e-07,
"loss": 2.4616,
"step": 8230
},
{
"epoch": 0.912210782685708,
"grad_norm": 16.454200712334917,
"learning_rate": 2.329753812127583e-07,
"loss": 2.4244,
"step": 8240
},
{
"epoch": 0.913317834606443,
"grad_norm": 16.497222276931957,
"learning_rate": 2.2718125886051433e-07,
"loss": 2.5867,
"step": 8250
},
{
"epoch": 0.9144248865271781,
"grad_norm": 16.43228833811835,
"learning_rate": 2.214584217947191e-07,
"loss": 2.4391,
"step": 8260
},
{
"epoch": 0.9155319384479133,
"grad_norm": 16.78472579386922,
"learning_rate": 2.1580695548982567e-07,
"loss": 2.4242,
"step": 8270
},
{
"epoch": 0.9166389903686483,
"grad_norm": 16.546048611992425,
"learning_rate": 2.1022694435431868e-07,
"loss": 2.4872,
"step": 8280
},
{
"epoch": 0.9177460422893834,
"grad_norm": 16.770801344250373,
"learning_rate": 2.0471847172945036e-07,
"loss": 2.4296,
"step": 8290
},
{
"epoch": 0.9188530942101184,
"grad_norm": 16.27109240174247,
"learning_rate": 1.9928161988799765e-07,
"loss": 2.5068,
"step": 8300
},
{
"epoch": 0.9199601461308535,
"grad_norm": 12.512458168250634,
"learning_rate": 1.939164700330326e-07,
"loss": 2.4175,
"step": 8310
},
{
"epoch": 0.9210671980515887,
"grad_norm": 14.798188108228695,
"learning_rate": 1.8862310229670612e-07,
"loss": 2.5059,
"step": 8320
},
{
"epoch": 0.9221742499723237,
"grad_norm": 12.936659113537381,
"learning_rate": 1.8340159573906058e-07,
"loss": 2.447,
"step": 8330
},
{
"epoch": 0.9232813018930588,
"grad_norm": 15.624562309086738,
"learning_rate": 1.782520283468364e-07,
"loss": 2.4359,
"step": 8340
},
{
"epoch": 0.9243883538137939,
"grad_norm": 17.36536742613116,
"learning_rate": 1.7317447703231849e-07,
"loss": 2.5658,
"step": 8350
},
{
"epoch": 0.9254954057345289,
"grad_norm": 15.391131130821295,
"learning_rate": 1.6816901763218152e-07,
"loss": 2.5091,
"step": 8360
},
{
"epoch": 0.9266024576552641,
"grad_norm": 15.684736857963308,
"learning_rate": 1.6323572490635543e-07,
"loss": 2.4168,
"step": 8370
},
{
"epoch": 0.9277095095759991,
"grad_norm": 18.001498021183778,
"learning_rate": 1.5837467253691784e-07,
"loss": 2.5202,
"step": 8380
},
{
"epoch": 0.9288165614967342,
"grad_norm": 15.913434285236699,
"learning_rate": 1.5358593312698178e-07,
"loss": 2.6434,
"step": 8390
},
{
"epoch": 0.9299236134174693,
"grad_norm": 15.844539221853895,
"learning_rate": 1.4886957819962077e-07,
"loss": 2.4848,
"step": 8400
},
{
"epoch": 0.9310306653382043,
"grad_norm": 14.883294572064472,
"learning_rate": 1.4422567819679546e-07,
"loss": 2.4281,
"step": 8410
},
{
"epoch": 0.9321377172589395,
"grad_norm": 14.583778182281327,
"learning_rate": 1.3965430247830426e-07,
"loss": 2.4246,
"step": 8420
},
{
"epoch": 0.9332447691796745,
"grad_norm": 16.084883433598268,
"learning_rate": 1.3515551932074488e-07,
"loss": 2.506,
"step": 8430
},
{
"epoch": 0.9343518211004096,
"grad_norm": 13.726377273149337,
"learning_rate": 1.307293959164957e-07,
"loss": 2.5495,
"step": 8440
},
{
"epoch": 0.9354588730211447,
"grad_norm": 17.060608694253016,
"learning_rate": 1.263759983727142e-07,
"loss": 2.337,
"step": 8450
},
{
"epoch": 0.9365659249418797,
"grad_norm": 14.801435939071636,
"learning_rate": 1.2209539171034623e-07,
"loss": 2.5042,
"step": 8460
},
{
"epoch": 0.9376729768626149,
"grad_norm": 15.589161221895887,
"learning_rate": 1.1788763986315621e-07,
"loss": 2.5061,
"step": 8470
},
{
"epoch": 0.93878002878335,
"grad_norm": 16.33836597070153,
"learning_rate": 1.1375280567677393e-07,
"loss": 2.3671,
"step": 8480
},
{
"epoch": 0.939887080704085,
"grad_norm": 18.905885083448613,
"learning_rate": 1.0969095090775428e-07,
"loss": 2.6181,
"step": 8490
},
{
"epoch": 0.9409941326248201,
"grad_norm": 16.762390629046585,
"learning_rate": 1.0570213622265236e-07,
"loss": 2.4327,
"step": 8500
},
{
"epoch": 0.9421011845455551,
"grad_norm": 16.525181960248243,
"learning_rate": 1.0178642119712368e-07,
"loss": 2.4993,
"step": 8510
},
{
"epoch": 0.9432082364662903,
"grad_norm": 16.7132011851729,
"learning_rate": 9.794386431502822e-08,
"loss": 2.5366,
"step": 8520
},
{
"epoch": 0.9443152883870254,
"grad_norm": 13.853046661215803,
"learning_rate": 9.417452296756114e-08,
"loss": 2.4832,
"step": 8530
},
{
"epoch": 0.9454223403077604,
"grad_norm": 15.293104772530375,
"learning_rate": 9.04784534523928e-08,
"loss": 2.3633,
"step": 8540
},
{
"epoch": 0.9465293922284955,
"grad_norm": 14.980540551389215,
"learning_rate": 8.685571097282852e-08,
"loss": 2.4849,
"step": 8550
},
{
"epoch": 0.9476364441492307,
"grad_norm": 18.693304270023244,
"learning_rate": 8.33063496369868e-08,
"loss": 2.5602,
"step": 8560
},
{
"epoch": 0.9487434960699657,
"grad_norm": 15.253297927027766,
"learning_rate": 7.98304224569868e-08,
"loss": 2.4879,
"step": 8570
},
{
"epoch": 0.9498505479907008,
"grad_norm": 20.092545101378285,
"learning_rate": 7.642798134815943e-08,
"loss": 2.5095,
"step": 8580
},
{
"epoch": 0.9509575999114358,
"grad_norm": 16.041421606524025,
"learning_rate": 7.309907712827192e-08,
"loss": 2.4647,
"step": 8590
},
{
"epoch": 0.9520646518321709,
"grad_norm": 15.859909299358135,
"learning_rate": 6.984375951676614e-08,
"loss": 2.5593,
"step": 8600
},
{
"epoch": 0.953171703752906,
"grad_norm": 19.216229700494758,
"learning_rate": 6.66620771340215e-08,
"loss": 2.3626,
"step": 8610
},
{
"epoch": 0.9542787556736411,
"grad_norm": 17.889324656581575,
"learning_rate": 6.355407750062215e-08,
"loss": 2.6562,
"step": 8620
},
{
"epoch": 0.9553858075943762,
"grad_norm": 13.458822428770242,
"learning_rate": 6.051980703665138e-08,
"loss": 2.3909,
"step": 8630
},
{
"epoch": 0.9564928595151112,
"grad_norm": 17.008353277644698,
"learning_rate": 5.755931106099788e-08,
"loss": 2.4223,
"step": 8640
},
{
"epoch": 0.9575999114358463,
"grad_norm": 16.78426968156743,
"learning_rate": 5.4672633790677775e-08,
"loss": 2.6265,
"step": 8650
},
{
"epoch": 0.9587069633565815,
"grad_norm": 17.958386496220644,
"learning_rate": 5.185981834017473e-08,
"loss": 2.5093,
"step": 8660
},
{
"epoch": 0.9598140152773165,
"grad_norm": 17.46930815569884,
"learning_rate": 4.91209067207965e-08,
"loss": 2.4249,
"step": 8670
},
{
"epoch": 0.9609210671980516,
"grad_norm": 17.891927563958056,
"learning_rate": 4.645593984004604e-08,
"loss": 2.533,
"step": 8680
},
{
"epoch": 0.9620281191187867,
"grad_norm": 13.675101972798346,
"learning_rate": 4.386495750101194e-08,
"loss": 2.4507,
"step": 8690
},
{
"epoch": 0.9631351710395217,
"grad_norm": 16.01872970692231,
"learning_rate": 4.1347998401773945e-08,
"loss": 2.4702,
"step": 8700
},
{
"epoch": 0.9642422229602569,
"grad_norm": 17.620120107441487,
"learning_rate": 3.890510013482396e-08,
"loss": 2.3592,
"step": 8710
},
{
"epoch": 0.9653492748809919,
"grad_norm": 13.329706465049831,
"learning_rate": 3.653629918650536e-08,
"loss": 2.4662,
"step": 8720
},
{
"epoch": 0.966456326801727,
"grad_norm": 14.570283074571352,
"learning_rate": 3.424163093646682e-08,
"loss": 2.3495,
"step": 8730
},
{
"epoch": 0.9675633787224621,
"grad_norm": 13.873984864746625,
"learning_rate": 3.202112965713655e-08,
"loss": 2.367,
"step": 8740
},
{
"epoch": 0.9686704306431971,
"grad_norm": 13.467781119638207,
"learning_rate": 2.987482851320778e-08,
"loss": 2.3987,
"step": 8750
},
{
"epoch": 0.9697774825639323,
"grad_norm": 15.489672705466763,
"learning_rate": 2.7802759561144088e-08,
"loss": 2.425,
"step": 8760
},
{
"epoch": 0.9708845344846674,
"grad_norm": 20.126982289743573,
"learning_rate": 2.580495374870151e-08,
"loss": 2.5085,
"step": 8770
},
{
"epoch": 0.9719915864054024,
"grad_norm": 16.778268839880404,
"learning_rate": 2.388144091446498e-08,
"loss": 2.463,
"step": 8780
},
{
"epoch": 0.9730986383261375,
"grad_norm": 20.49911635255473,
"learning_rate": 2.2032249787404258e-08,
"loss": 2.5278,
"step": 8790
},
{
"epoch": 0.9742056902468725,
"grad_norm": 16.182685372782867,
"learning_rate": 2.0257407986443713e-08,
"loss": 2.4702,
"step": 8800
},
{
"epoch": 0.9753127421676077,
"grad_norm": 14.885149821948326,
"learning_rate": 1.8556942020049872e-08,
"loss": 2.5026,
"step": 8810
},
{
"epoch": 0.9764197940883428,
"grad_norm": 18.03209004223668,
"learning_rate": 1.6930877285835644e-08,
"loss": 2.5576,
"step": 8820
},
{
"epoch": 0.9775268460090778,
"grad_norm": 15.861290907259685,
"learning_rate": 1.5379238070181158e-08,
"loss": 2.5681,
"step": 8830
},
{
"epoch": 0.9786338979298129,
"grad_norm": 16.532161800217157,
"learning_rate": 1.3902047547871278e-08,
"loss": 2.4926,
"step": 8840
},
{
"epoch": 0.9797409498505479,
"grad_norm": 14.626301967154978,
"learning_rate": 1.2499327781748116e-08,
"loss": 2.4547,
"step": 8850
},
{
"epoch": 0.9808480017712831,
"grad_norm": 18.74363118033889,
"learning_rate": 1.1171099722383506e-08,
"loss": 2.5054,
"step": 8860
},
{
"epoch": 0.9819550536920182,
"grad_norm": 17.476707949807594,
"learning_rate": 9.917383207765363e-09,
"loss": 2.4136,
"step": 8870
},
{
"epoch": 0.9830621056127532,
"grad_norm": 15.362525353056075,
"learning_rate": 8.738196962999601e-09,
"loss": 2.5267,
"step": 8880
},
{
"epoch": 0.9841691575334883,
"grad_norm": 15.763132841414992,
"learning_rate": 7.633558600033675e-09,
"loss": 2.4059,
"step": 8890
},
{
"epoch": 0.9852762094542235,
"grad_norm": 16.971970282791062,
"learning_rate": 6.603484617390688e-09,
"loss": 2.5169,
"step": 8900
},
{
"epoch": 0.9863832613749585,
"grad_norm": 16.309961470302227,
"learning_rate": 5.647990399924031e-09,
"loss": 2.4272,
"step": 8910
},
{
"epoch": 0.9874903132956936,
"grad_norm": 16.05736958282543,
"learning_rate": 4.767090218589232e-09,
"loss": 2.5884,
"step": 8920
},
{
"epoch": 0.9885973652164286,
"grad_norm": 15.10152496394929,
"learning_rate": 3.960797230227465e-09,
"loss": 2.5573,
"step": 8930
},
{
"epoch": 0.9897044171371637,
"grad_norm": 14.741682018743976,
"learning_rate": 3.2291234773718093e-09,
"loss": 2.3819,
"step": 8940
},
{
"epoch": 0.9908114690578989,
"grad_norm": 15.440675776317423,
"learning_rate": 2.5720798880662922e-09,
"loss": 2.4611,
"step": 8950
},
{
"epoch": 0.9919185209786339,
"grad_norm": 13.979973708890682,
"learning_rate": 1.989676275702679e-09,
"loss": 2.4037,
"step": 8960
},
{
"epoch": 0.993025572899369,
"grad_norm": 19.373337099900795,
"learning_rate": 1.4819213388744814e-09,
"loss": 2.4966,
"step": 8970
},
{
"epoch": 0.9941326248201041,
"grad_norm": 17.103724133893802,
"learning_rate": 1.0488226612459517e-09,
"loss": 2.505,
"step": 8980
},
{
"epoch": 0.9952396767408391,
"grad_norm": 16.90633557993371,
"learning_rate": 6.903867114393947e-10,
"loss": 2.5781,
"step": 8990
},
{
"epoch": 0.9963467286615743,
"grad_norm": 16.59692250103923,
"learning_rate": 4.0661884293913266e-10,
"loss": 2.5521,
"step": 9000
},
{
"epoch": 0.9974537805823093,
"grad_norm": 15.318767567204494,
"learning_rate": 1.97523294011015e-10,
"loss": 2.488,
"step": 9010
},
{
"epoch": 0.9985608325030444,
"grad_norm": 14.806481544474932,
"learning_rate": 6.310318763858014e-11,
"loss": 2.4538,
"step": 9020
},
{
"epoch": 0.9996678844237795,
"grad_norm": 16.39588765945003,
"learning_rate": 3.360531477536455e-12,
"loss": 2.4834,
"step": 9030
},
{
"epoch": 1.0,
"step": 9033,
"total_flos": 227316538671104.0,
"train_loss": 2.714383109890978,
"train_runtime": 83244.0657,
"train_samples_per_second": 1.085,
"train_steps_per_second": 0.109
}
],
"logging_steps": 10,
"max_steps": 9033,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 2000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 227316538671104.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}