{
  "best_metric": 0.0710952952504158,
  "best_model_checkpoint": "./vit-base-trash-demo-v5/checkpoint-4000",
  "epoch": 4.0,
  "eval_steps": 1000,
  "global_step": 4476,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.008936550491510277,
      "grad_norm": 2.2266733646392822,
      "learning_rate": 0.00019955317247542448,
      "loss": 1.7283,
      "step": 10
    },
    {
      "epoch": 0.017873100983020553,
      "grad_norm": 1.7498841285705566,
      "learning_rate": 0.000199106344950849,
      "loss": 1.2864,
      "step": 20
    },
    {
      "epoch": 0.02680965147453083,
      "grad_norm": 2.307732582092285,
      "learning_rate": 0.00019865951742627347,
      "loss": 0.9102,
      "step": 30
    },
    {
      "epoch": 0.035746201966041107,
      "grad_norm": 4.274341106414795,
      "learning_rate": 0.00019821268990169794,
      "loss": 0.7496,
      "step": 40
    },
    {
      "epoch": 0.044682752457551385,
      "grad_norm": 2.586291551589966,
      "learning_rate": 0.00019776586237712246,
      "loss": 0.7103,
      "step": 50
    },
    {
      "epoch": 0.05361930294906166,
      "grad_norm": 1.1641569137573242,
      "learning_rate": 0.00019731903485254693,
      "loss": 0.5292,
      "step": 60
    },
    {
      "epoch": 0.06255585344057193,
      "grad_norm": 5.71007776260376,
      "learning_rate": 0.0001968722073279714,
      "loss": 0.5169,
      "step": 70
    },
    {
      "epoch": 0.07149240393208221,
      "grad_norm": 4.907001495361328,
      "learning_rate": 0.0001964253798033959,
      "loss": 0.5468,
      "step": 80
    },
    {
      "epoch": 0.08042895442359249,
      "grad_norm": 4.788897514343262,
      "learning_rate": 0.00019597855227882039,
      "loss": 0.5174,
      "step": 90
    },
    {
      "epoch": 0.08936550491510277,
      "grad_norm": 3.1065032482147217,
      "learning_rate": 0.00019553172475424485,
      "loss": 0.4598,
      "step": 100
    },
    {
      "epoch": 0.09830205540661305,
      "grad_norm": 2.299734115600586,
      "learning_rate": 0.00019508489722966935,
      "loss": 0.3711,
      "step": 110
    },
    {
      "epoch": 0.10723860589812333,
      "grad_norm": 1.3453820943832397,
      "learning_rate": 0.00019463806970509384,
      "loss": 0.4316,
      "step": 120
    },
    {
      "epoch": 0.1161751563896336,
      "grad_norm": 2.797497034072876,
      "learning_rate": 0.00019419124218051834,
      "loss": 0.5008,
      "step": 130
    },
    {
      "epoch": 0.12511170688114387,
      "grad_norm": 3.164219856262207,
      "learning_rate": 0.0001937444146559428,
      "loss": 0.371,
      "step": 140
    },
    {
      "epoch": 0.13404825737265416,
      "grad_norm": 5.14756965637207,
      "learning_rate": 0.0001932975871313673,
      "loss": 0.5617,
      "step": 150
    },
    {
      "epoch": 0.14298480786416443,
      "grad_norm": 1.478502869606018,
      "learning_rate": 0.0001928507596067918,
      "loss": 0.3317,
      "step": 160
    },
    {
      "epoch": 0.15192135835567472,
      "grad_norm": 1.3914997577667236,
      "learning_rate": 0.00019240393208221627,
      "loss": 0.3637,
      "step": 170
    },
    {
      "epoch": 0.16085790884718498,
      "grad_norm": 2.963843822479248,
      "learning_rate": 0.00019195710455764076,
      "loss": 0.3628,
      "step": 180
    },
    {
      "epoch": 0.16979445933869527,
      "grad_norm": 4.369801044464111,
      "learning_rate": 0.00019151027703306526,
      "loss": 0.4182,
      "step": 190
    },
    {
      "epoch": 0.17873100983020554,
      "grad_norm": 2.109142541885376,
      "learning_rate": 0.00019106344950848973,
      "loss": 0.279,
      "step": 200
    },
    {
      "epoch": 0.1876675603217158,
      "grad_norm": 5.093620777130127,
      "learning_rate": 0.00019061662198391422,
      "loss": 0.4936,
      "step": 210
    },
    {
      "epoch": 0.1966041108132261,
      "grad_norm": 2.2940657138824463,
      "learning_rate": 0.00019016979445933872,
      "loss": 0.532,
      "step": 220
    },
    {
      "epoch": 0.20554066130473636,
      "grad_norm": 3.415463924407959,
      "learning_rate": 0.00018972296693476319,
      "loss": 0.3475,
      "step": 230
    },
    {
      "epoch": 0.21447721179624665,
      "grad_norm": 1.158109426498413,
      "learning_rate": 0.00018927613941018768,
      "loss": 0.386,
      "step": 240
    },
    {
      "epoch": 0.22341376228775692,
      "grad_norm": 0.44666406512260437,
      "learning_rate": 0.00018882931188561218,
      "loss": 0.4165,
      "step": 250
    },
    {
      "epoch": 0.2323503127792672,
      "grad_norm": 2.626112461090088,
      "learning_rate": 0.00018838248436103664,
      "loss": 0.3478,
      "step": 260
    },
    {
      "epoch": 0.24128686327077747,
      "grad_norm": 3.9105656147003174,
      "learning_rate": 0.00018793565683646114,
      "loss": 0.293,
      "step": 270
    },
    {
      "epoch": 0.25022341376228774,
      "grad_norm": 6.075140953063965,
      "learning_rate": 0.00018748882931188563,
      "loss": 0.4013,
      "step": 280
    },
    {
      "epoch": 0.25915996425379806,
      "grad_norm": 1.2540974617004395,
      "learning_rate": 0.0001870420017873101,
      "loss": 0.5419,
      "step": 290
    },
    {
      "epoch": 0.2680965147453083,
      "grad_norm": 2.7805850505828857,
      "learning_rate": 0.00018659517426273457,
      "loss": 0.373,
      "step": 300
    },
    {
      "epoch": 0.2770330652368186,
      "grad_norm": 1.9444429874420166,
      "learning_rate": 0.0001861483467381591,
      "loss": 0.2447,
      "step": 310
    },
    {
      "epoch": 0.28596961572832885,
      "grad_norm": 2.164452075958252,
      "learning_rate": 0.00018570151921358356,
      "loss": 0.4141,
      "step": 320
    },
    {
      "epoch": 0.2949061662198391,
      "grad_norm": 4.439435958862305,
      "learning_rate": 0.00018525469168900803,
      "loss": 0.3183,
      "step": 330
    },
    {
      "epoch": 0.30384271671134944,
      "grad_norm": 3.332730770111084,
      "learning_rate": 0.00018480786416443255,
      "loss": 0.3255,
      "step": 340
    },
    {
      "epoch": 0.3127792672028597,
      "grad_norm": 4.461299419403076,
      "learning_rate": 0.00018436103663985702,
      "loss": 0.2733,
      "step": 350
    },
    {
      "epoch": 0.32171581769436997,
      "grad_norm": 4.637039661407471,
      "learning_rate": 0.0001839142091152815,
      "loss": 0.2486,
      "step": 360
    },
    {
      "epoch": 0.33065236818588023,
      "grad_norm": 1.985630989074707,
      "learning_rate": 0.000183467381590706,
      "loss": 0.2978,
      "step": 370
    },
    {
      "epoch": 0.33958891867739055,
      "grad_norm": 2.7530932426452637,
      "learning_rate": 0.00018302055406613048,
      "loss": 0.3722,
      "step": 380
    },
    {
      "epoch": 0.3485254691689008,
      "grad_norm": 0.20281237363815308,
      "learning_rate": 0.00018257372654155497,
      "loss": 0.3897,
      "step": 390
    },
    {
      "epoch": 0.3574620196604111,
      "grad_norm": 1.9220471382141113,
      "learning_rate": 0.00018212689901697947,
      "loss": 0.2564,
      "step": 400
    },
    {
      "epoch": 0.36639857015192134,
      "grad_norm": 4.841084957122803,
      "learning_rate": 0.00018168007149240394,
      "loss": 0.3892,
      "step": 410
    },
    {
      "epoch": 0.3753351206434316,
      "grad_norm": 5.499583721160889,
      "learning_rate": 0.00018123324396782843,
      "loss": 0.3462,
      "step": 420
    },
    {
      "epoch": 0.38427167113494193,
      "grad_norm": 1.4320725202560425,
      "learning_rate": 0.0001807864164432529,
      "loss": 0.2658,
      "step": 430
    },
    {
      "epoch": 0.3932082216264522,
      "grad_norm": 2.9739625453948975,
      "learning_rate": 0.0001803395889186774,
      "loss": 0.269,
      "step": 440
    },
    {
      "epoch": 0.40214477211796246,
      "grad_norm": 2.0247411727905273,
      "learning_rate": 0.0001798927613941019,
      "loss": 0.3355,
      "step": 450
    },
    {
      "epoch": 0.4110813226094727,
      "grad_norm": 0.25784507393836975,
      "learning_rate": 0.00017944593386952636,
      "loss": 0.2589,
      "step": 460
    },
    {
      "epoch": 0.42001787310098304,
      "grad_norm": 8.2475004196167,
      "learning_rate": 0.00017899910634495086,
      "loss": 0.3034,
      "step": 470
    },
    {
      "epoch": 0.4289544235924933,
      "grad_norm": 2.1054959297180176,
      "learning_rate": 0.00017855227882037535,
      "loss": 0.3784,
      "step": 480
    },
    {
      "epoch": 0.43789097408400357,
      "grad_norm": 2.6620118618011475,
      "learning_rate": 0.00017810545129579982,
      "loss": 0.3303,
      "step": 490
    },
    {
      "epoch": 0.44682752457551383,
      "grad_norm": 2.4308674335479736,
      "learning_rate": 0.00017765862377122431,
      "loss": 0.4428,
      "step": 500
    },
    {
      "epoch": 0.45576407506702415,
      "grad_norm": 4.620336532592773,
      "learning_rate": 0.0001772117962466488,
      "loss": 0.4007,
      "step": 510
    },
    {
      "epoch": 0.4647006255585344,
      "grad_norm": 0.6208035945892334,
      "learning_rate": 0.00017676496872207328,
      "loss": 0.4641,
      "step": 520
    },
    {
      "epoch": 0.4736371760500447,
      "grad_norm": 3.478276252746582,
      "learning_rate": 0.00017631814119749777,
      "loss": 0.3107,
      "step": 530
    },
    {
      "epoch": 0.48257372654155495,
      "grad_norm": 2.8934295177459717,
      "learning_rate": 0.00017587131367292227,
      "loss": 0.2283,
      "step": 540
    },
    {
      "epoch": 0.4915102770330652,
      "grad_norm": 2.323265552520752,
      "learning_rate": 0.00017542448614834674,
      "loss": 0.253,
      "step": 550
    },
    {
      "epoch": 0.5004468275245755,
      "grad_norm": 0.8551294207572937,
      "learning_rate": 0.00017497765862377123,
      "loss": 0.2903,
      "step": 560
    },
    {
      "epoch": 0.5093833780160858,
      "grad_norm": 3.454586982727051,
      "learning_rate": 0.00017453083109919573,
      "loss": 0.2434,
      "step": 570
    },
    {
      "epoch": 0.5183199285075961,
      "grad_norm": 4.937902927398682,
      "learning_rate": 0.0001740840035746202,
      "loss": 0.254,
      "step": 580
    },
    {
      "epoch": 0.5272564789991063,
      "grad_norm": 7.016292095184326,
      "learning_rate": 0.0001736371760500447,
      "loss": 0.3365,
      "step": 590
    },
    {
      "epoch": 0.5361930294906166,
      "grad_norm": 4.799339294433594,
      "learning_rate": 0.0001731903485254692,
      "loss": 0.2639,
      "step": 600
    },
    {
      "epoch": 0.5451295799821269,
      "grad_norm": 3.4884583950042725,
      "learning_rate": 0.00017274352100089365,
      "loss": 0.2255,
      "step": 610
    },
    {
      "epoch": 0.5540661304736372,
      "grad_norm": 1.6748002767562866,
      "learning_rate": 0.00017229669347631815,
      "loss": 0.2678,
      "step": 620
    },
    {
      "epoch": 0.5630026809651475,
      "grad_norm": 4.145753383636475,
      "learning_rate": 0.00017184986595174265,
      "loss": 0.3435,
      "step": 630
    },
    {
      "epoch": 0.5719392314566577,
      "grad_norm": 3.8941946029663086,
      "learning_rate": 0.00017140303842716711,
      "loss": 0.3448,
      "step": 640
    },
    {
      "epoch": 0.580875781948168,
      "grad_norm": 2.7980730533599854,
      "learning_rate": 0.0001709562109025916,
      "loss": 0.1776,
      "step": 650
    },
    {
      "epoch": 0.5898123324396782,
      "grad_norm": 2.6846330165863037,
      "learning_rate": 0.0001705093833780161,
      "loss": 0.1698,
      "step": 660
    },
    {
      "epoch": 0.5987488829311886,
      "grad_norm": 0.6026754379272461,
      "learning_rate": 0.00017006255585344057,
      "loss": 0.2683,
      "step": 670
    },
    {
      "epoch": 0.6076854334226989,
      "grad_norm": 1.7795771360397339,
      "learning_rate": 0.00016961572832886507,
      "loss": 0.1734,
      "step": 680
    },
    {
      "epoch": 0.6166219839142091,
      "grad_norm": 2.9793999195098877,
      "learning_rate": 0.00016916890080428956,
      "loss": 0.2341,
      "step": 690
    },
    {
      "epoch": 0.6255585344057194,
      "grad_norm": 2.885993480682373,
      "learning_rate": 0.00016872207327971403,
      "loss": 0.1787,
      "step": 700
    },
    {
      "epoch": 0.6344950848972297,
      "grad_norm": 0.12324349582195282,
      "learning_rate": 0.00016827524575513853,
      "loss": 0.1716,
      "step": 710
    },
    {
      "epoch": 0.6434316353887399,
      "grad_norm": 3.5133144855499268,
      "learning_rate": 0.00016782841823056302,
      "loss": 0.248,
      "step": 720
    },
    {
      "epoch": 0.6523681858802503,
      "grad_norm": 0.16779197752475739,
      "learning_rate": 0.0001673815907059875,
      "loss": 0.1969,
      "step": 730
    },
    {
      "epoch": 0.6613047363717605,
      "grad_norm": 6.696399688720703,
      "learning_rate": 0.00016693476318141199,
      "loss": 0.2659,
      "step": 740
    },
    {
      "epoch": 0.6702412868632708,
      "grad_norm": 3.4363462924957275,
      "learning_rate": 0.00016648793565683648,
      "loss": 0.1933,
      "step": 750
    },
    {
      "epoch": 0.6791778373547811,
      "grad_norm": 2.9766454696655273,
      "learning_rate": 0.00016604110813226095,
      "loss": 0.3646,
      "step": 760
    },
    {
      "epoch": 0.6881143878462913,
      "grad_norm": 3.3751492500305176,
      "learning_rate": 0.00016559428060768544,
      "loss": 0.2365,
      "step": 770
    },
    {
      "epoch": 0.6970509383378016,
      "grad_norm": 1.6829200983047485,
      "learning_rate": 0.00016514745308310994,
      "loss": 0.3954,
      "step": 780
    },
    {
      "epoch": 0.7059874888293118,
      "grad_norm": 3.473019599914551,
      "learning_rate": 0.0001647006255585344,
      "loss": 0.2257,
      "step": 790
    },
    {
      "epoch": 0.7149240393208222,
      "grad_norm": 2.1625287532806396,
      "learning_rate": 0.0001642537980339589,
      "loss": 0.2984,
      "step": 800
    },
    {
      "epoch": 0.7238605898123325,
      "grad_norm": 1.2086807489395142,
      "learning_rate": 0.00016380697050938337,
      "loss": 0.2888,
      "step": 810
    },
    {
      "epoch": 0.7327971403038427,
      "grad_norm": 0.19183319807052612,
      "learning_rate": 0.00016336014298480787,
      "loss": 0.2884,
      "step": 820
    },
    {
      "epoch": 0.741733690795353,
      "grad_norm": 4.687781810760498,
      "learning_rate": 0.00016291331546023236,
      "loss": 0.2456,
      "step": 830
    },
    {
      "epoch": 0.7506702412868632,
      "grad_norm": 1.6150999069213867,
      "learning_rate": 0.00016246648793565683,
      "loss": 0.2231,
      "step": 840
    },
    {
      "epoch": 0.7596067917783735,
      "grad_norm": 2.592801809310913,
      "learning_rate": 0.00016201966041108133,
      "loss": 0.2425,
      "step": 850
    },
    {
      "epoch": 0.7685433422698839,
      "grad_norm": 5.782228469848633,
      "learning_rate": 0.00016157283288650582,
      "loss": 0.2184,
      "step": 860
    },
    {
      "epoch": 0.7774798927613941,
      "grad_norm": 4.794034957885742,
      "learning_rate": 0.0001611260053619303,
      "loss": 0.1862,
      "step": 870
    },
    {
      "epoch": 0.7864164432529044,
      "grad_norm": 6.517756462097168,
      "learning_rate": 0.0001606791778373548,
      "loss": 0.3502,
      "step": 880
    },
    {
      "epoch": 0.7953529937444147,
      "grad_norm": 6.479066848754883,
      "learning_rate": 0.00016023235031277928,
      "loss": 0.1433,
      "step": 890
    },
    {
      "epoch": 0.8042895442359249,
      "grad_norm": 1.539117455482483,
      "learning_rate": 0.00015978552278820375,
      "loss": 0.3306,
      "step": 900
    },
    {
      "epoch": 0.8132260947274352,
      "grad_norm": 2.0679945945739746,
      "learning_rate": 0.00015933869526362827,
      "loss": 0.2325,
      "step": 910
    },
    {
      "epoch": 0.8221626452189454,
      "grad_norm": 4.1405558586120605,
      "learning_rate": 0.00015889186773905274,
      "loss": 0.2126,
      "step": 920
    },
    {
      "epoch": 0.8310991957104558,
      "grad_norm": 3.7805371284484863,
      "learning_rate": 0.0001584450402144772,
      "loss": 0.2418,
      "step": 930
    },
    {
      "epoch": 0.8400357462019661,
      "grad_norm": 4.6036248207092285,
      "learning_rate": 0.0001579982126899017,
      "loss": 0.3191,
      "step": 940
    },
    {
      "epoch": 0.8489722966934763,
      "grad_norm": 0.8650698661804199,
      "learning_rate": 0.0001575513851653262,
      "loss": 0.0849,
      "step": 950
    },
    {
      "epoch": 0.8579088471849866,
      "grad_norm": 0.4226575791835785,
      "learning_rate": 0.00015710455764075067,
      "loss": 0.1689,
      "step": 960
    },
    {
      "epoch": 0.8668453976764968,
      "grad_norm": 4.508443355560303,
      "learning_rate": 0.00015665773011617516,
      "loss": 0.1381,
      "step": 970
    },
    {
      "epoch": 0.8757819481680071,
      "grad_norm": 5.323261260986328,
      "learning_rate": 0.00015621090259159966,
      "loss": 0.1799,
      "step": 980
    },
    {
      "epoch": 0.8847184986595175,
      "grad_norm": 4.80311393737793,
      "learning_rate": 0.00015576407506702412,
      "loss": 0.1616,
      "step": 990
    },
    {
      "epoch": 0.8936550491510277,
      "grad_norm": 2.0073227882385254,
      "learning_rate": 0.00015531724754244862,
      "loss": 0.1814,
      "step": 1000
    },
    {
      "epoch": 0.8936550491510277,
      "eval_accuracy": 0.9487437185929648,
      "eval_loss": 0.17145079374313354,
      "eval_runtime": 56.2937,
      "eval_samples_per_second": 35.35,
      "eval_steps_per_second": 4.423,
      "step": 1000
    },
    {
      "epoch": 0.902591599642538,
      "grad_norm": 3.7729084491729736,
      "learning_rate": 0.00015487042001787312,
      "loss": 0.1352,
      "step": 1010
    },
    {
      "epoch": 0.9115281501340483,
      "grad_norm": 6.341058254241943,
      "learning_rate": 0.00015442359249329758,
      "loss": 0.2095,
      "step": 1020
    },
    {
      "epoch": 0.9204647006255585,
      "grad_norm": 0.42624279856681824,
      "learning_rate": 0.00015397676496872208,
      "loss": 0.2741,
      "step": 1030
    },
    {
      "epoch": 0.9294012511170688,
      "grad_norm": 4.386059761047363,
      "learning_rate": 0.00015352993744414657,
      "loss": 0.1688,
      "step": 1040
    },
    {
      "epoch": 0.938337801608579,
      "grad_norm": 0.07118342816829681,
      "learning_rate": 0.00015308310991957104,
      "loss": 0.2251,
      "step": 1050
    },
    {
      "epoch": 0.9472743521000894,
      "grad_norm": 1.6763445138931274,
      "learning_rate": 0.00015263628239499554,
      "loss": 0.2107,
      "step": 1060
    },
    {
      "epoch": 0.9562109025915997,
      "grad_norm": 5.161765098571777,
      "learning_rate": 0.00015218945487042003,
      "loss": 0.1552,
      "step": 1070
    },
    {
      "epoch": 0.9651474530831099,
      "grad_norm": 1.8887020349502563,
      "learning_rate": 0.0001517426273458445,
      "loss": 0.259,
      "step": 1080
    },
    {
      "epoch": 0.9740840035746202,
      "grad_norm": 0.29733049869537354,
      "learning_rate": 0.000151295799821269,
      "loss": 0.1136,
      "step": 1090
    },
    {
      "epoch": 0.9830205540661304,
      "grad_norm": 3.253506660461426,
      "learning_rate": 0.0001508489722966935,
      "loss": 0.1211,
      "step": 1100
    },
    {
      "epoch": 0.9919571045576407,
      "grad_norm": 2.1613495349884033,
      "learning_rate": 0.00015040214477211796,
      "loss": 0.1925,
      "step": 1110
    },
    {
      "epoch": 1.000893655049151,
      "grad_norm": 0.23403897881507874,
      "learning_rate": 0.00014995531724754246,
      "loss": 0.1099,
      "step": 1120
    },
    {
      "epoch": 1.0098302055406614,
      "grad_norm": 0.7746050357818604,
      "learning_rate": 0.00014950848972296695,
      "loss": 0.0627,
      "step": 1130
    },
    {
      "epoch": 1.0187667560321716,
      "grad_norm": 0.3406558930873871,
      "learning_rate": 0.00014906166219839145,
      "loss": 0.063,
      "step": 1140
    },
    {
      "epoch": 1.0277033065236818,
      "grad_norm": 0.12071269750595093,
      "learning_rate": 0.00014861483467381591,
      "loss": 0.1432,
      "step": 1150
    },
    {
      "epoch": 1.0366398570151922,
      "grad_norm": 0.4978802800178528,
      "learning_rate": 0.00014816800714924038,
      "loss": 0.0746,
      "step": 1160
    },
    {
      "epoch": 1.0455764075067024,
      "grad_norm": 1.3803187608718872,
      "learning_rate": 0.0001477211796246649,
      "loss": 0.0929,
      "step": 1170
    },
    {
      "epoch": 1.0545129579982127,
      "grad_norm": 0.8612852692604065,
      "learning_rate": 0.00014727435210008937,
      "loss": 0.0949,
      "step": 1180
    },
    {
      "epoch": 1.063449508489723,
      "grad_norm": 0.3493196666240692,
      "learning_rate": 0.00014682752457551384,
      "loss": 0.168,
      "step": 1190
    },
    {
      "epoch": 1.0723860589812333,
      "grad_norm": 0.18564535677433014,
      "learning_rate": 0.00014638069705093836,
      "loss": 0.1344,
      "step": 1200
    },
    {
      "epoch": 1.0813226094727435,
      "grad_norm": 5.853936672210693,
      "learning_rate": 0.00014593386952636283,
      "loss": 0.0874,
      "step": 1210
    },
    {
      "epoch": 1.0902591599642537,
      "grad_norm": 5.366926670074463,
      "learning_rate": 0.0001454870420017873,
      "loss": 0.0847,
      "step": 1220
    },
    {
      "epoch": 1.0991957104557641,
      "grad_norm": 0.04646310582756996,
      "learning_rate": 0.00014504021447721182,
      "loss": 0.0947,
      "step": 1230
    },
    {
      "epoch": 1.1081322609472744,
      "grad_norm": 0.7593271732330322,
      "learning_rate": 0.0001445933869526363,
      "loss": 0.1208,
      "step": 1240
    },
    {
      "epoch": 1.1170688114387846,
      "grad_norm": 6.3901472091674805,
      "learning_rate": 0.00014414655942806076,
      "loss": 0.0977,
      "step": 1250
    },
    {
      "epoch": 1.126005361930295,
      "grad_norm": 1.7897100448608398,
      "learning_rate": 0.00014369973190348528,
      "loss": 0.1107,
      "step": 1260
    },
    {
      "epoch": 1.1349419124218052,
      "grad_norm": 3.0502521991729736,
      "learning_rate": 0.00014325290437890975,
      "loss": 0.1854,
      "step": 1270
    },
    {
      "epoch": 1.1438784629133154,
      "grad_norm": 0.18816685676574707,
      "learning_rate": 0.00014280607685433422,
      "loss": 0.0863,
      "step": 1280
    },
    {
      "epoch": 1.1528150134048256,
      "grad_norm": 0.05062058940529823,
      "learning_rate": 0.0001423592493297587,
      "loss": 0.1339,
      "step": 1290
    },
    {
      "epoch": 1.161751563896336,
      "grad_norm": 0.23230992257595062,
      "learning_rate": 0.0001419124218051832,
      "loss": 0.187,
      "step": 1300
    },
    {
      "epoch": 1.1706881143878463,
      "grad_norm": 3.0492892265319824,
      "learning_rate": 0.00014146559428060768,
      "loss": 0.0701,
      "step": 1310
    },
    {
      "epoch": 1.1796246648793565,
      "grad_norm": 0.03424559161067009,
      "learning_rate": 0.00014101876675603217,
      "loss": 0.0428,
      "step": 1320
    },
    {
      "epoch": 1.188561215370867,
      "grad_norm": 3.6026527881622314,
      "learning_rate": 0.00014057193923145667,
      "loss": 0.1337,
      "step": 1330
    },
    {
      "epoch": 1.197497765862377,
      "grad_norm": 0.09644579142332077,
      "learning_rate": 0.00014012511170688114,
      "loss": 0.1271,
      "step": 1340
    },
    {
      "epoch": 1.2064343163538873,
      "grad_norm": 0.22322706878185272,
      "learning_rate": 0.00013967828418230563,
      "loss": 0.055,
      "step": 1350
    },
    {
      "epoch": 1.2153708668453977,
      "grad_norm": 0.05372155085206032,
      "learning_rate": 0.00013923145665773013,
      "loss": 0.1251,
      "step": 1360
    },
    {
      "epoch": 1.224307417336908,
      "grad_norm": 0.4156355857849121,
      "learning_rate": 0.00013878462913315462,
      "loss": 0.0405,
      "step": 1370
    },
    {
      "epoch": 1.2332439678284182,
      "grad_norm": 0.030704261735081673,
      "learning_rate": 0.0001383378016085791,
      "loss": 0.1225,
      "step": 1380
    },
    {
      "epoch": 1.2421805183199286,
      "grad_norm": 0.09552694112062454,
      "learning_rate": 0.00013789097408400359,
      "loss": 0.0495,
      "step": 1390
    },
    {
      "epoch": 1.2511170688114388,
      "grad_norm": 2.1124463081359863,
      "learning_rate": 0.00013744414655942808,
      "loss": 0.0651,
      "step": 1400
    },
    {
      "epoch": 1.260053619302949,
      "grad_norm": 4.6221232414245605,
      "learning_rate": 0.00013699731903485255,
      "loss": 0.2366,
      "step": 1410
    },
    {
      "epoch": 1.2689901697944594,
      "grad_norm": 0.054540861397981644,
      "learning_rate": 0.00013655049151027704,
      "loss": 0.1915,
      "step": 1420
    },
    {
      "epoch": 1.2779267202859697,
      "grad_norm": 0.6603236198425293,
      "learning_rate": 0.00013610366398570154,
      "loss": 0.0386,
      "step": 1430
    },
    {
      "epoch": 1.2868632707774799,
      "grad_norm": 4.419101715087891,
      "learning_rate": 0.000135656836461126,
      "loss": 0.1288,
      "step": 1440
    },
    {
      "epoch": 1.2957998212689903,
      "grad_norm": 1.6491079330444336,
      "learning_rate": 0.0001352100089365505,
      "loss": 0.077,
      "step": 1450
    },
    {
      "epoch": 1.3047363717605005,
      "grad_norm": 0.904062807559967,
      "learning_rate": 0.000134763181411975,
      "loss": 0.2083,
      "step": 1460
    },
    {
      "epoch": 1.3136729222520107,
      "grad_norm": 3.4404361248016357,
      "learning_rate": 0.00013431635388739947,
      "loss": 0.1846,
      "step": 1470
    },
    {
      "epoch": 1.322609472743521,
      "grad_norm": 0.2096666842699051,
      "learning_rate": 0.00013386952636282396,
      "loss": 0.0354,
      "step": 1480
    },
    {
      "epoch": 1.3315460232350314,
      "grad_norm": 4.2826128005981445,
      "learning_rate": 0.00013342269883824846,
      "loss": 0.1438,
      "step": 1490
    },
    {
      "epoch": 1.3404825737265416,
      "grad_norm": 4.742111682891846,
      "learning_rate": 0.00013297587131367293,
      "loss": 0.0994,
      "step": 1500
    },
    {
      "epoch": 1.3494191242180518,
      "grad_norm": 6.2931952476501465,
      "learning_rate": 0.0001325290437890974,
      "loss": 0.0754,
      "step": 1510
    },
    {
      "epoch": 1.358355674709562,
      "grad_norm": 1.523571491241455,
      "learning_rate": 0.00013208221626452192,
      "loss": 0.1283,
      "step": 1520
    },
    {
      "epoch": 1.3672922252010724,
      "grad_norm": 8.253166198730469,
      "learning_rate": 0.00013163538873994638,
      "loss": 0.1718,
      "step": 1530
    },
    {
      "epoch": 1.3762287756925826,
      "grad_norm": 2.4168646335601807,
      "learning_rate": 0.00013118856121537085,
      "loss": 0.1285,
      "step": 1540
    },
    {
      "epoch": 1.3851653261840928,
      "grad_norm": 4.069122314453125,
      "learning_rate": 0.00013074173369079537,
      "loss": 0.1165,
      "step": 1550
    },
    {
      "epoch": 1.3941018766756033,
      "grad_norm": 0.2789513170719147,
      "learning_rate": 0.00013029490616621984,
      "loss": 0.0795,
      "step": 1560
    },
    {
      "epoch": 1.4030384271671135,
      "grad_norm": 0.5609318017959595,
      "learning_rate": 0.0001298480786416443,
      "loss": 0.1187,
      "step": 1570
    },
    {
      "epoch": 1.4119749776586237,
      "grad_norm": 0.34373611211776733,
      "learning_rate": 0.00012940125111706883,
      "loss": 0.0872,
      "step": 1580
    },
    {
      "epoch": 1.420911528150134,
      "grad_norm": 4.596048355102539,
      "learning_rate": 0.0001289544235924933,
      "loss": 0.1354,
      "step": 1590
    },
    {
      "epoch": 1.4298480786416443,
      "grad_norm": 0.06107456609606743,
      "learning_rate": 0.00012850759606791777,
      "loss": 0.119,
      "step": 1600
    },
    {
      "epoch": 1.4387846291331545,
      "grad_norm": 0.08292512595653534,
      "learning_rate": 0.0001280607685433423,
      "loss": 0.1075,
      "step": 1610
    },
    {
      "epoch": 1.447721179624665,
      "grad_norm": 0.04113980755209923,
      "learning_rate": 0.00012761394101876676,
      "loss": 0.099,
      "step": 1620
    },
    {
      "epoch": 1.4566577301161752,
      "grad_norm": 3.1171679496765137,
      "learning_rate": 0.00012716711349419126,
      "loss": 0.0476,
      "step": 1630
    },
    {
      "epoch": 1.4655942806076854,
      "grad_norm": 0.03248828276991844,
      "learning_rate": 0.00012672028596961572,
      "loss": 0.1217,
      "step": 1640
    },
    {
      "epoch": 1.4745308310991958,
      "grad_norm": 0.14615251123905182,
      "learning_rate": 0.00012627345844504022,
      "loss": 0.0845,
      "step": 1650
    },
    {
      "epoch": 1.483467381590706,
      "grad_norm": 0.8569982647895813,
      "learning_rate": 0.00012582663092046471,
      "loss": 0.0933,
      "step": 1660
    },
    {
      "epoch": 1.4924039320822162,
      "grad_norm": 0.030800212174654007,
      "learning_rate": 0.00012537980339588918,
      "loss": 0.0555,
      "step": 1670
    },
    {
      "epoch": 1.5013404825737267,
      "grad_norm": 0.9634251594543457,
      "learning_rate": 0.00012493297587131368,
      "loss": 0.1249,
      "step": 1680
    },
    {
      "epoch": 1.5102770330652369,
      "grad_norm": 0.06999039649963379,
      "learning_rate": 0.00012448614834673817,
      "loss": 0.0727,
      "step": 1690
    },
    {
      "epoch": 1.519213583556747,
      "grad_norm": 0.0438673160970211,
      "learning_rate": 0.00012403932082216264,
      "loss": 0.0595,
      "step": 1700
    },
    {
      "epoch": 1.5281501340482575,
      "grad_norm": 0.030631419271230698,
      "learning_rate": 0.00012359249329758714,
      "loss": 0.0641,
      "step": 1710
    },
    {
      "epoch": 1.5370866845397675,
      "grad_norm": 0.09066120535135269,
      "learning_rate": 0.00012314566577301163,
      "loss": 0.0689,
      "step": 1720
    },
    {
      "epoch": 1.546023235031278,
      "grad_norm": 1.1478157043457031,
      "learning_rate": 0.0001226988382484361,
      "loss": 0.0427,
      "step": 1730
    },
    {
      "epoch": 1.5549597855227884,
      "grad_norm": 0.5382466912269592,
      "learning_rate": 0.0001222520107238606,
      "loss": 0.1211,
      "step": 1740
    },
    {
      "epoch": 1.5638963360142983,
      "grad_norm": 0.15291939675807953,
      "learning_rate": 0.00012180518319928509,
      "loss": 0.1934,
      "step": 1750
    },
    {
      "epoch": 1.5728328865058088,
      "grad_norm": 0.07158921658992767,
      "learning_rate": 0.00012135835567470957,
      "loss": 0.045,
      "step": 1760
    },
    {
      "epoch": 1.5817694369973192,
      "grad_norm": 1.416129469871521,
      "learning_rate": 0.00012091152815013404,
      "loss": 0.0822,
      "step": 1770
    },
    {
      "epoch": 1.5907059874888292,
      "grad_norm": 3.2841928005218506,
      "learning_rate": 0.00012046470062555855,
      "loss": 0.0685,
      "step": 1780
    },
    {
      "epoch": 1.5996425379803396,
      "grad_norm": 5.683614730834961,
      "learning_rate": 0.00012001787310098302,
      "loss": 0.1512,
      "step": 1790
    },
    {
      "epoch": 1.6085790884718498,
      "grad_norm": 0.054330743849277496,
      "learning_rate": 0.0001195710455764075,
      "loss": 0.1381,
      "step": 1800
    },
    {
      "epoch": 1.61751563896336,
      "grad_norm": 0.05368073284626007,
      "learning_rate": 0.00011912421805183201,
      "loss": 0.1118,
      "step": 1810
    },
    {
      "epoch": 1.6264521894548705,
      "grad_norm": 17.735898971557617,
      "learning_rate": 0.00011867739052725648,
      "loss": 0.1704,
      "step": 1820
    },
    {
      "epoch": 1.6353887399463807,
      "grad_norm": 3.4387574195861816,
      "learning_rate": 0.00011823056300268096,
      "loss": 0.1498,
      "step": 1830
    },
    {
      "epoch": 1.6443252904378909,
      "grad_norm": 3.4959723949432373,
      "learning_rate": 0.00011778373547810547,
      "loss": 0.0667,
      "step": 1840
    },
    {
      "epoch": 1.6532618409294013,
      "grad_norm": 1.4753037691116333,
      "learning_rate": 0.00011733690795352994,
      "loss": 0.0445,
      "step": 1850
    },
    {
      "epoch": 1.6621983914209115,
      "grad_norm": 0.24579989910125732,
      "learning_rate": 0.00011689008042895442,
      "loss": 0.0377,
      "step": 1860
    },
    {
      "epoch": 1.6711349419124217,
      "grad_norm": 3.813619375228882,
      "learning_rate": 0.00011644325290437891,
      "loss": 0.1004,
      "step": 1870
    },
    {
      "epoch": 1.6800714924039322,
      "grad_norm": 0.808028519153595,
      "learning_rate": 0.0001159964253798034,
      "loss": 0.0679,
      "step": 1880
    },
    {
      "epoch": 1.6890080428954424,
      "grad_norm": 0.277228444814682,
      "learning_rate": 0.0001155495978552279,
      "loss": 0.1096,
      "step": 1890
    },
    {
      "epoch": 1.6979445933869526,
      "grad_norm": 2.485595703125,
      "learning_rate": 0.00011510277033065237,
      "loss": 0.0738,
      "step": 1900
    },
    {
      "epoch": 1.706881143878463,
      "grad_norm": 0.35362759232521057,
      "learning_rate": 0.00011465594280607685,
      "loss": 0.0807,
      "step": 1910
    },
    {
      "epoch": 1.7158176943699732,
      "grad_norm": 1.7707135677337646,
      "learning_rate": 0.00011420911528150135,
      "loss": 0.0603,
      "step": 1920
    },
    {
      "epoch": 1.7247542448614834,
      "grad_norm": 0.010053984820842743,
      "learning_rate": 0.00011376228775692583,
      "loss": 0.0142,
      "step": 1930
    },
    {
      "epoch": 1.7336907953529939,
      "grad_norm": 11.442891120910645,
      "learning_rate": 0.00011331546023235031,
      "loss": 0.0509,
      "step": 1940
    },
    {
      "epoch": 1.742627345844504,
      "grad_norm": 2.5633316040039062,
      "learning_rate": 0.00011286863270777481,
      "loss": 0.0204,
      "step": 1950
    },
    {
      "epoch": 1.7515638963360143,
      "grad_norm": 0.9002701044082642,
      "learning_rate": 0.00011242180518319929,
      "loss": 0.0822,
      "step": 1960
    },
    {
      "epoch": 1.7605004468275247,
      "grad_norm": 0.03169967234134674,
      "learning_rate": 0.00011197497765862377,
      "loss": 0.0951,
      "step": 1970
    },
    {
      "epoch": 1.7694369973190347,
      "grad_norm": 0.07693292945623398,
      "learning_rate": 0.00011152815013404827,
      "loss": 0.11,
      "step": 1980
    },
    {
      "epoch": 1.7783735478105451,
      "grad_norm": 0.06315601617097855,
      "learning_rate": 0.00011108132260947275,
      "loss": 0.1217,
      "step": 1990
    },
    {
      "epoch": 1.7873100983020556,
      "grad_norm": 0.26389381289482117,
      "learning_rate": 0.00011063449508489723,
      "loss": 0.1077,
      "step": 2000
    },
    {
      "epoch": 1.7873100983020556,
      "eval_accuracy": 0.9668341708542714,
      "eval_loss": 0.12829196453094482,
      "eval_runtime": 56.3081,
      "eval_samples_per_second": 35.341,
      "eval_steps_per_second": 4.422,
      "step": 2000
    },
    {
      "epoch": 1.7962466487935655,
      "grad_norm": 0.14058926701545715,
      "learning_rate": 0.00011018766756032173,
      "loss": 0.051,
      "step": 2010
    },
    {
      "epoch": 1.805183199285076,
      "grad_norm": 0.8464193940162659,
      "learning_rate": 0.00010974084003574621,
      "loss": 0.0557,
      "step": 2020
    },
    {
      "epoch": 1.8141197497765862,
      "grad_norm": 0.5524567365646362,
      "learning_rate": 0.00010929401251117069,
      "loss": 0.0327,
      "step": 2030
    },
    {
      "epoch": 1.8230563002680964,
      "grad_norm": 4.706042289733887,
      "learning_rate": 0.00010884718498659518,
      "loss": 0.0815,
      "step": 2040
    },
    {
      "epoch": 1.8319928507596068,
      "grad_norm": 5.365744113922119,
      "learning_rate": 0.00010840035746201967,
      "loss": 0.0617,
      "step": 2050
    },
    {
      "epoch": 1.840929401251117,
      "grad_norm": 1.1039865016937256,
      "learning_rate": 0.00010795352993744415,
      "loss": 0.0528,
      "step": 2060
    },
    {
      "epoch": 1.8498659517426272,
      "grad_norm": 2.8230929374694824,
      "learning_rate": 0.00010750670241286864,
      "loss": 0.0534,
      "step": 2070
    },
    {
      "epoch": 1.8588025022341377,
      "grad_norm": 0.02104310691356659,
      "learning_rate": 0.00010705987488829313,
      "loss": 0.1058,
      "step": 2080
    },
    {
      "epoch": 1.8677390527256479,
      "grad_norm": 0.030116664245724678,
      "learning_rate": 0.0001066130473637176,
      "loss": 0.0971,
      "step": 2090
    },
    {
      "epoch": 1.876675603217158,
      "grad_norm": 0.5036576986312866,
      "learning_rate": 0.0001061662198391421,
      "loss": 0.0693,
      "step": 2100
    },
    {
      "epoch": 1.8856121537086685,
      "grad_norm": 4.131002426147461,
      "learning_rate": 0.00010571939231456658,
      "loss": 0.0933,
      "step": 2110
    },
    {
      "epoch": 1.8945487042001787,
      "grad_norm": 5.004481792449951,
      "learning_rate": 0.00010527256478999108,
      "loss": 0.0698,
      "step": 2120
    },
    {
      "epoch": 1.903485254691689,
      "grad_norm": 0.014153541065752506,
      "learning_rate": 0.00010482573726541556,
      "loss": 0.0598,
      "step": 2130
    },
    {
      "epoch": 1.9124218051831994,
      "grad_norm": 0.39952540397644043,
      "learning_rate": 0.00010437890974084004,
      "loss": 0.1169,
      "step": 2140
    },
    {
      "epoch": 1.9213583556747096,
      "grad_norm": 5.047325611114502,
      "learning_rate": 0.00010393208221626454,
      "loss": 0.1492,
      "step": 2150
    },
    {
      "epoch": 1.9302949061662198,
      "grad_norm": 0.045367881655693054,
      "learning_rate": 0.00010348525469168902,
      "loss": 0.081,
      "step": 2160
    },
    {
      "epoch": 1.9392314566577302,
      "grad_norm": 0.02820589952170849,
      "learning_rate": 0.00010303842716711349,
      "loss": 0.1456,
      "step": 2170
    },
    {
      "epoch": 1.9481680071492404,
      "grad_norm": 0.15606756508350372,
      "learning_rate": 0.000102591599642538,
      "loss": 0.0484,
      "step": 2180
    },
    {
      "epoch": 1.9571045576407506,
      "grad_norm": 4.374292850494385,
      "learning_rate": 0.00010214477211796248,
      "loss": 0.1133,
      "step": 2190
    },
    {
      "epoch": 1.966041108132261,
      "grad_norm": 0.6300436854362488,
      "learning_rate": 0.00010169794459338695,
      "loss": 0.0159,
      "step": 2200
    },
    {
      "epoch": 1.974977658623771,
      "grad_norm": 0.011597417294979095,
      "learning_rate": 0.00010125111706881146,
      "loss": 0.019,
      "step": 2210
    },
    {
      "epoch": 1.9839142091152815,
      "grad_norm": 0.013629280962049961,
      "learning_rate": 0.00010080428954423592,
      "loss": 0.0953,
      "step": 2220
    },
    {
      "epoch": 1.992850759606792,
      "grad_norm": 4.461750030517578,
      "learning_rate": 0.0001003574620196604,
      "loss": 0.1169,
      "step": 2230
    },
    {
      "epoch": 2.001787310098302,
      "grad_norm": 0.2028690129518509,
      "learning_rate": 9.99106344950849e-05,
      "loss": 0.0515,
      "step": 2240
    },
    {
      "epoch": 2.0107238605898123,
      "grad_norm": 0.683179497718811,
      "learning_rate": 9.946380697050938e-05,
      "loss": 0.0414,
      "step": 2250
    },
    {
      "epoch": 2.0196604110813228,
      "grad_norm": 0.3097274601459503,
      "learning_rate": 9.901697944593388e-05,
      "loss": 0.013,
      "step": 2260
    },
    {
      "epoch": 2.0285969615728328,
      "grad_norm": 0.02391964942216873,
      "learning_rate": 9.857015192135836e-05,
      "loss": 0.0143,
      "step": 2270
    },
    {
      "epoch": 2.037533512064343,
      "grad_norm": 0.02549424022436142,
      "learning_rate": 9.812332439678284e-05,
      "loss": 0.0321,
      "step": 2280
    },
    {
      "epoch": 2.0464700625558536,
      "grad_norm": 0.015907390043139458,
      "learning_rate": 9.767649687220734e-05,
      "loss": 0.0905,
      "step": 2290
    },
    {
      "epoch": 2.0554066130473636,
      "grad_norm": 0.04600854963064194,
      "learning_rate": 9.722966934763182e-05,
      "loss": 0.0055,
      "step": 2300
    },
    {
      "epoch": 2.064343163538874,
      "grad_norm": 0.17837274074554443,
      "learning_rate": 9.67828418230563e-05,
      "loss": 0.0792,
      "step": 2310
    },
    {
      "epoch": 2.0732797140303845,
      "grad_norm": 0.678176760673523,
      "learning_rate": 9.63360142984808e-05,
      "loss": 0.1025,
      "step": 2320
    },
    {
      "epoch": 2.0822162645218945,
      "grad_norm": 0.047438375651836395,
      "learning_rate": 9.588918677390528e-05,
      "loss": 0.0037,
      "step": 2330
    },
    {
      "epoch": 2.091152815013405,
      "grad_norm": 0.3825267553329468,
      "learning_rate": 9.544235924932976e-05,
      "loss": 0.0271,
      "step": 2340
    },
    {
      "epoch": 2.1000893655049153,
      "grad_norm": 0.022976990789175034,
      "learning_rate": 9.499553172475425e-05,
      "loss": 0.0055,
      "step": 2350
    },
    {
      "epoch": 2.1090259159964253,
      "grad_norm": 0.21945427358150482,
      "learning_rate": 9.454870420017874e-05,
      "loss": 0.0072,
      "step": 2360
    },
    {
      "epoch": 2.1179624664879357,
      "grad_norm": 0.020401885733008385,
      "learning_rate": 9.410187667560322e-05,
      "loss": 0.0045,
      "step": 2370
    },
    {
      "epoch": 2.126899016979446,
      "grad_norm": 0.3614647388458252,
      "learning_rate": 9.365504915102771e-05,
      "loss": 0.0292,
      "step": 2380
    },
    {
      "epoch": 2.135835567470956,
      "grad_norm": 0.01699133589863777,
      "learning_rate": 9.32082216264522e-05,
      "loss": 0.0728,
      "step": 2390
    },
    {
      "epoch": 2.1447721179624666,
      "grad_norm": 0.012751326896250248,
      "learning_rate": 9.276139410187668e-05,
      "loss": 0.0358,
      "step": 2400
    },
    {
      "epoch": 2.1537086684539766,
      "grad_norm": 0.009738125838339329,
      "learning_rate": 9.231456657730116e-05,
      "loss": 0.0415,
      "step": 2410
    },
    {
      "epoch": 2.162645218945487,
      "grad_norm": 0.012577983550727367,
      "learning_rate": 9.186773905272565e-05,
      "loss": 0.0204,
      "step": 2420
    },
    {
      "epoch": 2.1715817694369974,
      "grad_norm": 0.022706875577569008,
      "learning_rate": 9.142091152815015e-05,
      "loss": 0.0391,
      "step": 2430
    },
    {
      "epoch": 2.1805183199285074,
      "grad_norm": 1.2650375366210938,
      "learning_rate": 9.097408400357462e-05,
      "loss": 0.005,
      "step": 2440
    },
    {
      "epoch": 2.189454870420018,
      "grad_norm": 0.012098530307412148,
      "learning_rate": 9.052725647899911e-05,
      "loss": 0.0631,
      "step": 2450
    },
    {
      "epoch": 2.1983914209115283,
      "grad_norm": 0.014217260293662548,
      "learning_rate": 9.00804289544236e-05,
      "loss": 0.0158,
      "step": 2460
    },
    {
      "epoch": 2.2073279714030383,
      "grad_norm": 9.968586921691895,
      "learning_rate": 8.963360142984808e-05,
      "loss": 0.0338,
      "step": 2470
    },
    {
      "epoch": 2.2162645218945487,
      "grad_norm": 0.008608737029135227,
      "learning_rate": 8.918677390527257e-05,
      "loss": 0.0344,
      "step": 2480
    },
    {
      "epoch": 2.225201072386059,
      "grad_norm": 0.0957435816526413,
      "learning_rate": 8.873994638069705e-05,
      "loss": 0.0346,
      "step": 2490
    },
    {
      "epoch": 2.234137622877569,
      "grad_norm": 0.009171651676297188,
      "learning_rate": 8.829311885612154e-05,
      "loss": 0.0534,
      "step": 2500
    },
    {
      "epoch": 2.2430741733690795,
      "grad_norm": 0.025571748614311218,
      "learning_rate": 8.784629133154603e-05,
      "loss": 0.0046,
      "step": 2510
    },
    {
      "epoch": 2.25201072386059,
      "grad_norm": 0.008803543634712696,
      "learning_rate": 8.739946380697051e-05,
      "loss": 0.0104,
      "step": 2520
    },
    {
      "epoch": 2.2609472743521,
      "grad_norm": 0.009746580384671688,
      "learning_rate": 8.6952636282395e-05,
      "loss": 0.0194,
      "step": 2530
    },
    {
      "epoch": 2.2698838248436104,
      "grad_norm": 4.104613780975342,
      "learning_rate": 8.650580875781949e-05,
      "loss": 0.0155,
      "step": 2540
    },
    {
      "epoch": 2.278820375335121,
      "grad_norm": 0.01826513558626175,
      "learning_rate": 8.605898123324397e-05,
      "loss": 0.0072,
      "step": 2550
    },
    {
      "epoch": 2.287756925826631,
      "grad_norm": 0.03380773961544037,
      "learning_rate": 8.561215370866847e-05,
      "loss": 0.0515,
      "step": 2560
    },
    {
      "epoch": 2.2966934763181412,
      "grad_norm": 0.13917675614356995,
      "learning_rate": 8.516532618409293e-05,
      "loss": 0.0553,
      "step": 2570
    },
    {
      "epoch": 2.3056300268096512,
      "grad_norm": 3.9170970916748047,
      "learning_rate": 8.471849865951743e-05,
      "loss": 0.0252,
      "step": 2580
    },
    {
      "epoch": 2.3145665773011617,
      "grad_norm": 0.02010478265583515,
      "learning_rate": 8.427167113494193e-05,
      "loss": 0.0212,
      "step": 2590
    },
    {
      "epoch": 2.323503127792672,
      "grad_norm": 0.008358313702046871,
      "learning_rate": 8.38248436103664e-05,
      "loss": 0.1032,
      "step": 2600
    },
    {
      "epoch": 2.3324396782841825,
      "grad_norm": 0.08038530498743057,
      "learning_rate": 8.337801608579089e-05,
      "loss": 0.0445,
      "step": 2610
    },
    {
      "epoch": 2.3413762287756925,
      "grad_norm": 0.03653928264975548,
      "learning_rate": 8.293118856121538e-05,
      "loss": 0.0396,
      "step": 2620
    },
    {
      "epoch": 2.350312779267203,
      "grad_norm": 0.027160342782735825,
      "learning_rate": 8.248436103663985e-05,
      "loss": 0.0305,
      "step": 2630
    },
    {
      "epoch": 2.359249329758713,
      "grad_norm": 0.015198041684925556,
      "learning_rate": 8.203753351206435e-05,
      "loss": 0.0377,
      "step": 2640
    },
    {
      "epoch": 2.3681858802502234,
      "grad_norm": 0.03799434006214142,
      "learning_rate": 8.159070598748883e-05,
      "loss": 0.0057,
      "step": 2650
    },
    {
      "epoch": 2.377122430741734,
      "grad_norm": 0.008046945556998253,
      "learning_rate": 8.114387846291331e-05,
      "loss": 0.0249,
      "step": 2660
    },
    {
      "epoch": 2.386058981233244,
      "grad_norm": 8.727446556091309,
      "learning_rate": 8.069705093833781e-05,
      "loss": 0.0466,
      "step": 2670
    },
    {
      "epoch": 2.394995531724754,
      "grad_norm": 0.01986142434179783,
      "learning_rate": 8.025022341376229e-05,
      "loss": 0.0357,
      "step": 2680
    },
    {
      "epoch": 2.4039320822162646,
      "grad_norm": 7.71134614944458,
      "learning_rate": 7.980339588918678e-05,
      "loss": 0.015,
      "step": 2690
    },
    {
      "epoch": 2.4128686327077746,
      "grad_norm": 0.04247535765171051,
      "learning_rate": 7.935656836461127e-05,
      "loss": 0.0165,
      "step": 2700
    },
    {
      "epoch": 2.421805183199285,
      "grad_norm": 0.008588094264268875,
      "learning_rate": 7.890974084003575e-05,
      "loss": 0.0039,
      "step": 2710
    },
    {
      "epoch": 2.4307417336907955,
      "grad_norm": 0.11789193749427795,
      "learning_rate": 7.846291331546024e-05,
      "loss": 0.0344,
      "step": 2720
    },
    {
      "epoch": 2.4396782841823055,
      "grad_norm": 0.02231294848024845,
      "learning_rate": 7.801608579088472e-05,
      "loss": 0.0248,
      "step": 2730
    },
    {
      "epoch": 2.448614834673816,
      "grad_norm": 0.017268147319555283,
      "learning_rate": 7.75692582663092e-05,
      "loss": 0.0716,
      "step": 2740
    },
    {
      "epoch": 2.4575513851653263,
      "grad_norm": 8.963982582092285,
      "learning_rate": 7.71224307417337e-05,
      "loss": 0.0282,
      "step": 2750
    },
    {
      "epoch": 2.4664879356568363,
      "grad_norm": 0.799085259437561,
      "learning_rate": 7.667560321715817e-05,
      "loss": 0.0416,
      "step": 2760
    },
    {
      "epoch": 2.4754244861483468,
      "grad_norm": 0.15468931198120117,
      "learning_rate": 7.622877569258267e-05,
      "loss": 0.0669,
      "step": 2770
    },
    {
      "epoch": 2.484361036639857,
      "grad_norm": 3.4924068450927734,
      "learning_rate": 7.578194816800716e-05,
      "loss": 0.0477,
      "step": 2780
    },
    {
      "epoch": 2.493297587131367,
      "grad_norm": 0.012834394350647926,
      "learning_rate": 7.533512064343163e-05,
      "loss": 0.0174,
      "step": 2790
    },
    {
      "epoch": 2.5022341376228776,
      "grad_norm": 0.039204515516757965,
      "learning_rate": 7.488829311885612e-05,
      "loss": 0.0699,
      "step": 2800
    },
    {
      "epoch": 2.5111706881143876,
      "grad_norm": 0.08284445852041245,
      "learning_rate": 7.444146559428062e-05,
      "loss": 0.0445,
      "step": 2810
    },
    {
      "epoch": 2.520107238605898,
      "grad_norm": 0.010827134363353252,
      "learning_rate": 7.39946380697051e-05,
      "loss": 0.043,
      "step": 2820
    },
    {
      "epoch": 2.5290437890974085,
      "grad_norm": 3.5454938411712646,
      "learning_rate": 7.354781054512958e-05,
      "loss": 0.0339,
      "step": 2830
    },
    {
      "epoch": 2.537980339588919,
      "grad_norm": 0.006842234171926975,
      "learning_rate": 7.310098302055406e-05,
      "loss": 0.0029,
      "step": 2840
    },
    {
      "epoch": 2.546916890080429,
      "grad_norm": 0.7790193557739258,
      "learning_rate": 7.265415549597856e-05,
      "loss": 0.0055,
      "step": 2850
    },
    {
      "epoch": 2.5558534405719393,
      "grad_norm": 0.022239111363887787,
      "learning_rate": 7.220732797140304e-05,
      "loss": 0.008,
      "step": 2860
    },
    {
      "epoch": 2.5647899910634493,
      "grad_norm": 0.05403418838977814,
      "learning_rate": 7.176050044682752e-05,
      "loss": 0.057,
      "step": 2870
    },
    {
      "epoch": 2.5737265415549597,
      "grad_norm": 0.008923870511353016,
      "learning_rate": 7.131367292225202e-05,
      "loss": 0.0045,
      "step": 2880
    },
    {
      "epoch": 2.58266309204647,
      "grad_norm": 0.02668040059506893,
      "learning_rate": 7.08668453976765e-05,
      "loss": 0.0551,
      "step": 2890
    },
    {
      "epoch": 2.5915996425379806,
      "grad_norm": 0.049835577607154846,
      "learning_rate": 7.042001787310098e-05,
      "loss": 0.0255,
      "step": 2900
    },
    {
      "epoch": 2.6005361930294906,
      "grad_norm": 0.19334334135055542,
      "learning_rate": 6.997319034852548e-05,
      "loss": 0.0434,
      "step": 2910
    },
    {
      "epoch": 2.609472743521001,
      "grad_norm": 2.9139554500579834,
      "learning_rate": 6.952636282394996e-05,
      "loss": 0.0069,
      "step": 2920
    },
    {
      "epoch": 2.618409294012511,
      "grad_norm": 0.006679228041321039,
      "learning_rate": 6.907953529937444e-05,
      "loss": 0.0021,
      "step": 2930
    },
    {
      "epoch": 2.6273458445040214,
      "grad_norm": 0.1680416613817215,
      "learning_rate": 6.863270777479894e-05,
      "loss": 0.0249,
      "step": 2940
    },
    {
      "epoch": 2.636282394995532,
      "grad_norm": 0.08290654420852661,
      "learning_rate": 6.818588025022342e-05,
      "loss": 0.029,
      "step": 2950
    },
    {
      "epoch": 2.645218945487042,
      "grad_norm": 0.013707391917705536,
      "learning_rate": 6.77390527256479e-05,
      "loss": 0.0124,
      "step": 2960
    },
    {
      "epoch": 2.6541554959785523,
      "grad_norm": 0.2275378704071045,
      "learning_rate": 6.72922252010724e-05,
      "loss": 0.035,
      "step": 2970
    },
    {
      "epoch": 2.6630920464700627,
      "grad_norm": 0.5669155716896057,
      "learning_rate": 6.684539767649688e-05,
      "loss": 0.0288,
      "step": 2980
    },
    {
      "epoch": 2.6720285969615727,
      "grad_norm": 0.01488091703504324,
      "learning_rate": 6.639857015192136e-05,
      "loss": 0.0438,
      "step": 2990
    },
    {
      "epoch": 2.680965147453083,
      "grad_norm": 3.9659953117370605,
      "learning_rate": 6.595174262734584e-05,
      "loss": 0.0652,
      "step": 3000
    },
    {
      "epoch": 2.680965147453083,
      "eval_accuracy": 0.9793969849246231,
      "eval_loss": 0.08239442110061646,
      "eval_runtime": 56.1213,
      "eval_samples_per_second": 35.459,
      "eval_steps_per_second": 4.437,
      "step": 3000
    },
    {
      "epoch": 2.6899016979445936,
      "grad_norm": 8.31395149230957,
      "learning_rate": 6.550491510277034e-05,
      "loss": 0.0098,
      "step": 3010
    },
    {
      "epoch": 2.6988382484361035,
      "grad_norm": 0.008468572981655598,
      "learning_rate": 6.505808757819482e-05,
      "loss": 0.1056,
      "step": 3020
    },
    {
      "epoch": 2.707774798927614,
      "grad_norm": 0.9328808188438416,
      "learning_rate": 6.46112600536193e-05,
      "loss": 0.0769,
      "step": 3030
    },
    {
      "epoch": 2.716711349419124,
      "grad_norm": 0.6114912629127502,
      "learning_rate": 6.41644325290438e-05,
      "loss": 0.0434,
      "step": 3040
    },
    {
      "epoch": 2.7256478999106344,
      "grad_norm": 0.03709472343325615,
      "learning_rate": 6.371760500446829e-05,
      "loss": 0.0166,
      "step": 3050
    },
    {
      "epoch": 2.734584450402145,
      "grad_norm": 0.1086587980389595,
      "learning_rate": 6.327077747989276e-05,
      "loss": 0.0047,
      "step": 3060
    },
    {
      "epoch": 2.7435210008936552,
      "grad_norm": 0.12008140981197357,
      "learning_rate": 6.282394995531725e-05,
      "loss": 0.0069,
      "step": 3070
    },
    {
      "epoch": 2.7524575513851652,
      "grad_norm": 0.017355024814605713,
      "learning_rate": 6.237712243074174e-05,
      "loss": 0.0033,
      "step": 3080
    },
    {
      "epoch": 2.7613941018766757,
      "grad_norm": 0.15070508420467377,
      "learning_rate": 6.193029490616622e-05,
      "loss": 0.0476,
      "step": 3090
    },
    {
      "epoch": 2.7703306523681857,
      "grad_norm": 0.022527649998664856,
      "learning_rate": 6.148346738159071e-05,
      "loss": 0.0243,
      "step": 3100
    },
    {
      "epoch": 2.779267202859696,
      "grad_norm": 0.37779930233955383,
      "learning_rate": 6.10366398570152e-05,
      "loss": 0.0058,
      "step": 3110
    },
    {
      "epoch": 2.7882037533512065,
      "grad_norm": 0.029893942177295685,
      "learning_rate": 6.0589812332439676e-05,
      "loss": 0.0208,
      "step": 3120
    },
    {
      "epoch": 2.797140303842717,
      "grad_norm": 0.01635076478123665,
      "learning_rate": 6.0142984807864165e-05,
      "loss": 0.0026,
      "step": 3130
    },
    {
      "epoch": 2.806076854334227,
      "grad_norm": 0.011868173256516457,
      "learning_rate": 5.969615728328865e-05,
      "loss": 0.0257,
      "step": 3140
    },
    {
      "epoch": 2.8150134048257374,
      "grad_norm": 0.02559722028672695,
      "learning_rate": 5.9249329758713135e-05,
      "loss": 0.0666,
      "step": 3150
    },
    {
      "epoch": 2.8239499553172474,
      "grad_norm": 0.01763424649834633,
      "learning_rate": 5.8802502234137623e-05,
      "loss": 0.0611,
      "step": 3160
    },
    {
      "epoch": 2.832886505808758,
      "grad_norm": 0.02686423808336258,
      "learning_rate": 5.835567470956211e-05,
      "loss": 0.0039,
      "step": 3170
    },
    {
      "epoch": 2.841823056300268,
      "grad_norm": 0.04632404074072838,
      "learning_rate": 5.79088471849866e-05,
      "loss": 0.0122,
      "step": 3180
    },
    {
      "epoch": 2.8507596067917786,
      "grad_norm": 0.1586790531873703,
      "learning_rate": 5.746201966041108e-05,
      "loss": 0.0026,
      "step": 3190
    },
    {
      "epoch": 2.8596961572832886,
      "grad_norm": 5.425605297088623,
      "learning_rate": 5.701519213583557e-05,
      "loss": 0.0622,
      "step": 3200
    },
    {
      "epoch": 2.868632707774799,
      "grad_norm": 0.006181008648127317,
      "learning_rate": 5.656836461126006e-05,
      "loss": 0.0028,
      "step": 3210
    },
    {
      "epoch": 2.877569258266309,
      "grad_norm": 0.09517185389995575,
      "learning_rate": 5.612153708668454e-05,
      "loss": 0.0035,
      "step": 3220
    },
    {
      "epoch": 2.8865058087578195,
      "grad_norm": 0.015022194012999535,
      "learning_rate": 5.567470956210903e-05,
      "loss": 0.0285,
      "step": 3230
    },
    {
      "epoch": 2.89544235924933,
      "grad_norm": 4.772485256195068,
      "learning_rate": 5.522788203753352e-05,
      "loss": 0.0279,
      "step": 3240
    },
    {
      "epoch": 2.90437890974084,
      "grad_norm": 3.1032145023345947,
      "learning_rate": 5.478105451295799e-05,
      "loss": 0.0049,
      "step": 3250
    },
    {
      "epoch": 2.9133154602323503,
      "grad_norm": 0.05868244543671608,
      "learning_rate": 5.433422698838249e-05,
      "loss": 0.0029,
      "step": 3260
    },
    {
      "epoch": 2.9222520107238603,
      "grad_norm": 0.008307090029120445,
      "learning_rate": 5.388739946380698e-05,
      "loss": 0.0099,
      "step": 3270
    },
    {
      "epoch": 2.9311885612153707,
      "grad_norm": 0.010392882861196995,
      "learning_rate": 5.344057193923145e-05,
      "loss": 0.002,
      "step": 3280
    },
    {
      "epoch": 2.940125111706881,
      "grad_norm": 0.005523020401597023,
      "learning_rate": 5.299374441465594e-05,
      "loss": 0.0035,
      "step": 3290
    },
    {
      "epoch": 2.9490616621983916,
      "grad_norm": 0.06098335236310959,
      "learning_rate": 5.2546916890080436e-05,
      "loss": 0.0056,
      "step": 3300
    },
    {
      "epoch": 2.9579982126899016,
      "grad_norm": 0.013083376921713352,
      "learning_rate": 5.2100089365504925e-05,
      "loss": 0.023,
      "step": 3310
    },
    {
      "epoch": 2.966934763181412,
      "grad_norm": 0.01605415530502796,
      "learning_rate": 5.16532618409294e-05,
      "loss": 0.0396,
      "step": 3320
    },
    {
      "epoch": 2.975871313672922,
      "grad_norm": 0.013243346475064754,
      "learning_rate": 5.120643431635389e-05,
      "loss": 0.0083,
      "step": 3330
    },
    {
      "epoch": 2.9848078641644324,
      "grad_norm": 1.4108890295028687,
      "learning_rate": 5.0759606791778383e-05,
      "loss": 0.0468,
      "step": 3340
    },
    {
      "epoch": 2.993744414655943,
      "grad_norm": 0.5704414248466492,
      "learning_rate": 5.031277926720286e-05,
      "loss": 0.0209,
      "step": 3350
    },
    {
      "epoch": 3.002680965147453,
      "grad_norm": 0.03908452019095421,
      "learning_rate": 4.986595174262735e-05,
      "loss": 0.0779,
      "step": 3360
    },
    {
      "epoch": 3.0116175156389633,
      "grad_norm": 0.010959290899336338,
      "learning_rate": 4.9419124218051835e-05,
      "loss": 0.0109,
      "step": 3370
    },
    {
      "epoch": 3.0205540661304737,
      "grad_norm": 0.028490547090768814,
      "learning_rate": 4.8972296693476324e-05,
      "loss": 0.0025,
      "step": 3380
    },
    {
      "epoch": 3.0294906166219837,
      "grad_norm": 0.00491972966119647,
      "learning_rate": 4.8525469168900806e-05,
      "loss": 0.0214,
      "step": 3390
    },
    {
      "epoch": 3.038427167113494,
      "grad_norm": 0.014270992018282413,
      "learning_rate": 4.8078641644325294e-05,
      "loss": 0.0048,
      "step": 3400
    },
    {
      "epoch": 3.0473637176050046,
      "grad_norm": 0.00458119623363018,
      "learning_rate": 4.7631814119749776e-05,
      "loss": 0.0308,
      "step": 3410
    },
    {
      "epoch": 3.0563002680965146,
      "grad_norm": 0.00890402402728796,
      "learning_rate": 4.7184986595174265e-05,
      "loss": 0.0019,
      "step": 3420
    },
    {
      "epoch": 3.065236818588025,
      "grad_norm": 0.004751246422529221,
      "learning_rate": 4.673815907059875e-05,
      "loss": 0.0028,
      "step": 3430
    },
    {
      "epoch": 3.0741733690795354,
      "grad_norm": 0.008143426850438118,
      "learning_rate": 4.6291331546023235e-05,
      "loss": 0.0015,
      "step": 3440
    },
    {
      "epoch": 3.0831099195710454,
      "grad_norm": 0.035306982696056366,
      "learning_rate": 4.5844504021447723e-05,
      "loss": 0.0561,
      "step": 3450
    },
    {
      "epoch": 3.092046470062556,
      "grad_norm": 0.006312028504908085,
      "learning_rate": 4.539767649687221e-05,
      "loss": 0.0407,
      "step": 3460
    },
    {
      "epoch": 3.1009830205540663,
      "grad_norm": 0.012918233871459961,
      "learning_rate": 4.4950848972296694e-05,
      "loss": 0.0204,
      "step": 3470
    },
    {
      "epoch": 3.1099195710455763,
      "grad_norm": 0.03429726883769035,
      "learning_rate": 4.450402144772118e-05,
      "loss": 0.0138,
      "step": 3480
    },
    {
      "epoch": 3.1188561215370867,
      "grad_norm": 0.032142043113708496,
      "learning_rate": 4.405719392314567e-05,
      "loss": 0.0074,
      "step": 3490
    },
    {
      "epoch": 3.127792672028597,
      "grad_norm": 0.11621160060167313,
      "learning_rate": 4.361036639857015e-05,
      "loss": 0.007,
      "step": 3500
    },
    {
      "epoch": 3.136729222520107,
      "grad_norm": 0.010225760750472546,
      "learning_rate": 4.316353887399464e-05,
      "loss": 0.0371,
      "step": 3510
    },
    {
      "epoch": 3.1456657730116175,
      "grad_norm": 0.0270242840051651,
      "learning_rate": 4.271671134941912e-05,
      "loss": 0.0024,
      "step": 3520
    },
    {
      "epoch": 3.154602323503128,
      "grad_norm": 0.561730146408081,
      "learning_rate": 4.226988382484361e-05,
      "loss": 0.0322,
      "step": 3530
    },
    {
      "epoch": 3.163538873994638,
      "grad_norm": 3.7698066234588623,
      "learning_rate": 4.18230563002681e-05,
      "loss": 0.0061,
      "step": 3540
    },
    {
      "epoch": 3.1724754244861484,
      "grad_norm": 0.08852257579565048,
      "learning_rate": 4.137622877569258e-05,
      "loss": 0.002,
      "step": 3550
    },
    {
      "epoch": 3.181411974977659,
      "grad_norm": 0.010241570882499218,
      "learning_rate": 4.092940125111707e-05,
      "loss": 0.0032,
      "step": 3560
    },
    {
      "epoch": 3.190348525469169,
      "grad_norm": 0.02900160290300846,
      "learning_rate": 4.048257372654156e-05,
      "loss": 0.0021,
      "step": 3570
    },
    {
      "epoch": 3.1992850759606792,
      "grad_norm": 0.012413430958986282,
      "learning_rate": 4.003574620196605e-05,
      "loss": 0.0016,
      "step": 3580
    },
    {
      "epoch": 3.2082216264521897,
      "grad_norm": 0.011820780113339424,
      "learning_rate": 3.958891867739053e-05,
      "loss": 0.0156,
      "step": 3590
    },
    {
      "epoch": 3.2171581769436997,
      "grad_norm": 0.0063424003310501575,
      "learning_rate": 3.914209115281501e-05,
      "loss": 0.0066,
      "step": 3600
    },
    {
      "epoch": 3.22609472743521,
      "grad_norm": 0.014534726738929749,
      "learning_rate": 3.8695263628239506e-05,
      "loss": 0.0023,
      "step": 3610
    },
    {
      "epoch": 3.23503127792672,
      "grad_norm": 0.0037305313162505627,
      "learning_rate": 3.824843610366399e-05,
      "loss": 0.0014,
      "step": 3620
    },
    {
      "epoch": 3.2439678284182305,
      "grad_norm": 0.004174220375716686,
      "learning_rate": 3.780160857908847e-05,
      "loss": 0.0022,
      "step": 3630
    },
    {
      "epoch": 3.252904378909741,
      "grad_norm": 0.02620732970535755,
      "learning_rate": 3.735478105451296e-05,
      "loss": 0.0033,
      "step": 3640
    },
    {
      "epoch": 3.2618409294012514,
      "grad_norm": 0.008887135423719883,
      "learning_rate": 3.690795352993745e-05,
      "loss": 0.0137,
      "step": 3650
    },
    {
      "epoch": 3.2707774798927614,
      "grad_norm": 0.0036694956943392754,
      "learning_rate": 3.6461126005361935e-05,
      "loss": 0.0016,
      "step": 3660
    },
    {
      "epoch": 3.279714030384272,
      "grad_norm": 0.005121259950101376,
      "learning_rate": 3.601429848078642e-05,
      "loss": 0.0023,
      "step": 3670
    },
    {
      "epoch": 3.2886505808757818,
      "grad_norm": 0.005332967732101679,
      "learning_rate": 3.55674709562109e-05,
      "loss": 0.0508,
      "step": 3680
    },
    {
      "epoch": 3.297587131367292,
      "grad_norm": 0.008636276237666607,
      "learning_rate": 3.5120643431635394e-05,
      "loss": 0.0015,
      "step": 3690
    },
    {
      "epoch": 3.3065236818588026,
      "grad_norm": 0.004048788454383612,
      "learning_rate": 3.4673815907059876e-05,
      "loss": 0.0015,
      "step": 3700
    },
    {
      "epoch": 3.3154602323503126,
      "grad_norm": 0.013148046098649502,
      "learning_rate": 3.4226988382484365e-05,
      "loss": 0.0021,
      "step": 3710
    },
    {
      "epoch": 3.324396782841823,
      "grad_norm": 0.003611048450693488,
      "learning_rate": 3.3780160857908846e-05,
      "loss": 0.0018,
      "step": 3720
    },
    {
      "epoch": 3.3333333333333335,
      "grad_norm": 0.0047615463845431805,
      "learning_rate": 3.3333333333333335e-05,
      "loss": 0.002,
      "step": 3730
    },
    {
      "epoch": 3.3422698838248435,
      "grad_norm": 0.052058279514312744,
      "learning_rate": 3.2886505808757823e-05,
      "loss": 0.0017,
      "step": 3740
    },
    {
      "epoch": 3.351206434316354,
      "grad_norm": 0.004867528565227985,
      "learning_rate": 3.2439678284182305e-05,
      "loss": 0.0015,
      "step": 3750
    },
    {
      "epoch": 3.3601429848078643,
      "grad_norm": 0.005437952931970358,
      "learning_rate": 3.1992850759606794e-05,
      "loss": 0.0027,
      "step": 3760
    },
    {
      "epoch": 3.3690795352993743,
      "grad_norm": 0.08657950907945633,
      "learning_rate": 3.154602323503128e-05,
      "loss": 0.0012,
      "step": 3770
    },
    {
      "epoch": 3.3780160857908847,
      "grad_norm": 0.003917807713150978,
      "learning_rate": 3.1099195710455764e-05,
      "loss": 0.0016,
      "step": 3780
    },
    {
      "epoch": 3.386952636282395,
      "grad_norm": 0.03561088442802429,
      "learning_rate": 3.065236818588025e-05,
      "loss": 0.0063,
      "step": 3790
    },
    {
      "epoch": 3.395889186773905,
      "grad_norm": 0.021922320127487183,
      "learning_rate": 3.0205540661304738e-05,
      "loss": 0.0195,
      "step": 3800
    },
    {
      "epoch": 3.4048257372654156,
      "grad_norm": 0.009989120066165924,
      "learning_rate": 2.9758713136729223e-05,
      "loss": 0.0027,
      "step": 3810
    },
    {
      "epoch": 3.413762287756926,
      "grad_norm": 0.004757929127663374,
      "learning_rate": 2.931188561215371e-05,
      "loss": 0.0213,
      "step": 3820
    },
    {
      "epoch": 3.422698838248436,
      "grad_norm": 0.005087022669613361,
      "learning_rate": 2.8865058087578197e-05,
      "loss": 0.012,
      "step": 3830
    },
    {
      "epoch": 3.4316353887399464,
      "grad_norm": 0.04014687240123749,
      "learning_rate": 2.8418230563002685e-05,
      "loss": 0.0016,
      "step": 3840
    },
    {
      "epoch": 3.4405719392314564,
      "grad_norm": 0.008556324057281017,
      "learning_rate": 2.7971403038427167e-05,
      "loss": 0.0466,
      "step": 3850
    },
    {
      "epoch": 3.449508489722967,
      "grad_norm": 0.0066629331558942795,
      "learning_rate": 2.7524575513851652e-05,
      "loss": 0.0013,
      "step": 3860
    },
    {
      "epoch": 3.4584450402144773,
      "grad_norm": 0.007047568913549185,
      "learning_rate": 2.707774798927614e-05,
      "loss": 0.0015,
      "step": 3870
    },
    {
      "epoch": 3.4673815907059877,
      "grad_norm": 0.0033304065000265837,
      "learning_rate": 2.6630920464700626e-05,
      "loss": 0.0186,
      "step": 3880
    },
    {
      "epoch": 3.4763181411974977,
      "grad_norm": 0.043915342539548874,
      "learning_rate": 2.6184092940125114e-05,
      "loss": 0.0013,
      "step": 3890
    },
    {
      "epoch": 3.485254691689008,
      "grad_norm": 0.005252317525446415,
      "learning_rate": 2.57372654155496e-05,
      "loss": 0.0036,
      "step": 3900
    },
    {
      "epoch": 3.494191242180518,
      "grad_norm": 0.005055012181401253,
      "learning_rate": 2.5290437890974085e-05,
      "loss": 0.0012,
      "step": 3910
    },
    {
      "epoch": 3.5031277926720286,
      "grad_norm": 0.0049805790185928345,
      "learning_rate": 2.484361036639857e-05,
      "loss": 0.0157,
      "step": 3920
    },
    {
      "epoch": 3.512064343163539,
      "grad_norm": 0.009514909237623215,
      "learning_rate": 2.439678284182306e-05,
      "loss": 0.0109,
      "step": 3930
    },
    {
      "epoch": 3.5210008936550494,
      "grad_norm": 0.03643026947975159,
      "learning_rate": 2.3949955317247544e-05,
      "loss": 0.0019,
      "step": 3940
    },
    {
      "epoch": 3.5299374441465594,
      "grad_norm": 0.056902140378952026,
      "learning_rate": 2.3503127792672032e-05,
      "loss": 0.0016,
      "step": 3950
    },
    {
      "epoch": 3.53887399463807,
      "grad_norm": 0.10358071327209473,
      "learning_rate": 2.3056300268096514e-05,
      "loss": 0.005,
      "step": 3960
    },
    {
      "epoch": 3.54781054512958,
      "grad_norm": 0.005386151373386383,
      "learning_rate": 2.2609472743521002e-05,
      "loss": 0.0021,
      "step": 3970
    },
    {
      "epoch": 3.5567470956210903,
      "grad_norm": 0.007350238971412182,
      "learning_rate": 2.2162645218945488e-05,
      "loss": 0.0016,
      "step": 3980
    },
    {
      "epoch": 3.5656836461126007,
      "grad_norm": 0.07326429337263107,
      "learning_rate": 2.1715817694369976e-05,
      "loss": 0.0084,
      "step": 3990
    },
    {
      "epoch": 3.5746201966041107,
      "grad_norm": 0.005603461060672998,
      "learning_rate": 2.126899016979446e-05,
      "loss": 0.0011,
      "step": 4000
    },
    {
      "epoch": 3.5746201966041107,
      "eval_accuracy": 0.9814070351758793,
      "eval_loss": 0.0710952952504158,
      "eval_runtime": 56.3405,
      "eval_samples_per_second": 35.321,
      "eval_steps_per_second": 4.42,
      "step": 4000
    },
    {
      "epoch": 3.583556747095621,
      "grad_norm": 0.010818341746926308,
      "learning_rate": 2.0822162645218946e-05,
      "loss": 0.0038,
      "step": 4010
    },
    {
      "epoch": 3.592493297587131,
      "grad_norm": 0.003599151037633419,
      "learning_rate": 2.037533512064343e-05,
      "loss": 0.0065,
      "step": 4020
    },
    {
      "epoch": 3.6014298480786415,
      "grad_norm": 4.198567867279053,
      "learning_rate": 1.992850759606792e-05,
      "loss": 0.0083,
      "step": 4030
    },
    {
      "epoch": 3.610366398570152,
      "grad_norm": 0.013494855724275112,
      "learning_rate": 1.9481680071492405e-05,
      "loss": 0.0045,
      "step": 4040
    },
    {
      "epoch": 3.6193029490616624,
      "grad_norm": 0.0036234534345567226,
      "learning_rate": 1.903485254691689e-05,
      "loss": 0.0016,
      "step": 4050
    },
    {
      "epoch": 3.6282394995531724,
      "grad_norm": 0.021920403465628624,
      "learning_rate": 1.8588025022341376e-05,
      "loss": 0.0012,
      "step": 4060
    },
    {
      "epoch": 3.637176050044683,
      "grad_norm": 0.004384295083582401,
      "learning_rate": 1.8141197497765864e-05,
      "loss": 0.001,
      "step": 4070
    },
    {
      "epoch": 3.646112600536193,
      "grad_norm": 0.03161391615867615,
      "learning_rate": 1.769436997319035e-05,
      "loss": 0.0026,
      "step": 4080
    },
    {
      "epoch": 3.6550491510277032,
      "grad_norm": 0.0033394452184438705,
      "learning_rate": 1.7247542448614838e-05,
      "loss": 0.0267,
      "step": 4090
    },
    {
      "epoch": 3.6639857015192137,
      "grad_norm": 0.01090541947633028,
      "learning_rate": 1.680071492403932e-05,
      "loss": 0.0014,
      "step": 4100
    },
    {
      "epoch": 3.672922252010724,
      "grad_norm": 0.0053653959184885025,
      "learning_rate": 1.6353887399463808e-05,
      "loss": 0.0095,
      "step": 4110
    },
    {
      "epoch": 3.681858802502234,
      "grad_norm": 0.032379720360040665,
      "learning_rate": 1.5907059874888293e-05,
      "loss": 0.0011,
      "step": 4120
    },
    {
      "epoch": 3.6907953529937445,
      "grad_norm": 0.05944305285811424,
      "learning_rate": 1.5460232350312782e-05,
      "loss": 0.0015,
      "step": 4130
    },
    {
      "epoch": 3.6997319034852545,
      "grad_norm": 0.0054045203141868114,
      "learning_rate": 1.5013404825737265e-05,
      "loss": 0.0012,
      "step": 4140
    },
    {
      "epoch": 3.708668453976765,
      "grad_norm": 0.003021675394847989,
      "learning_rate": 1.4566577301161752e-05,
      "loss": 0.0023,
      "step": 4150
    },
    {
      "epoch": 3.7176050044682754,
      "grad_norm": 0.007955508306622505,
      "learning_rate": 1.4119749776586239e-05,
      "loss": 0.0031,
      "step": 4160
    },
    {
      "epoch": 3.726541554959786,
      "grad_norm": 0.005485454574227333,
      "learning_rate": 1.3672922252010726e-05,
      "loss": 0.0014,
      "step": 4170
    },
    {
      "epoch": 3.7354781054512958,
      "grad_norm": 0.007910342887043953,
      "learning_rate": 1.322609472743521e-05,
      "loss": 0.0014,
      "step": 4180
    },
    {
      "epoch": 3.744414655942806,
      "grad_norm": 0.011793126352131367,
      "learning_rate": 1.2779267202859696e-05,
      "loss": 0.001,
      "step": 4190
    },
    {
      "epoch": 3.753351206434316,
      "grad_norm": 0.005442539695650339,
      "learning_rate": 1.2332439678284183e-05,
      "loss": 0.0015,
      "step": 4200
    },
    {
      "epoch": 3.7622877569258266,
      "grad_norm": 1.1986395120620728,
      "learning_rate": 1.188561215370867e-05,
      "loss": 0.002,
      "step": 4210
    },
    {
      "epoch": 3.771224307417337,
      "grad_norm": 0.006608502473682165,
      "learning_rate": 1.1438784629133155e-05,
      "loss": 0.0009,
      "step": 4220
    },
    {
      "epoch": 3.780160857908847,
      "grad_norm": 0.0039040117990225554,
      "learning_rate": 1.0991957104557642e-05,
      "loss": 0.0013,
      "step": 4230
    },
    {
      "epoch": 3.7890974084003575,
      "grad_norm": 0.0041880221106112,
      "learning_rate": 1.0545129579982127e-05,
      "loss": 0.0012,
      "step": 4240
    },
    {
      "epoch": 3.798033958891868,
      "grad_norm": 0.003776776837185025,
      "learning_rate": 1.0098302055406614e-05,
      "loss": 0.0013,
      "step": 4250
    },
    {
      "epoch": 3.806970509383378,
      "grad_norm": 0.2970888614654541,
      "learning_rate": 9.651474530831099e-06,
      "loss": 0.0014,
      "step": 4260
    },
    {
      "epoch": 3.8159070598748883,
      "grad_norm": 0.003879937343299389,
      "learning_rate": 9.204647006255586e-06,
      "loss": 0.0013,
      "step": 4270
    },
    {
      "epoch": 3.8248436103663987,
      "grad_norm": 3.5169312953948975,
      "learning_rate": 8.757819481680071e-06,
      "loss": 0.0035,
      "step": 4280
    },
    {
      "epoch": 3.8337801608579087,
      "grad_norm": 0.004920534789562225,
      "learning_rate": 8.310991957104558e-06,
      "loss": 0.001,
      "step": 4290
    },
    {
      "epoch": 3.842716711349419,
      "grad_norm": 0.0035059740766882896,
      "learning_rate": 7.864164432529045e-06,
      "loss": 0.0094,
      "step": 4300
    },
    {
      "epoch": 3.851653261840929,
      "grad_norm": 0.004144645761698484,
      "learning_rate": 7.41733690795353e-06,
      "loss": 0.001,
      "step": 4310
    },
    {
      "epoch": 3.8605898123324396,
      "grad_norm": 0.006385812535881996,
      "learning_rate": 6.970509383378017e-06,
      "loss": 0.001,
      "step": 4320
    },
    {
      "epoch": 3.86952636282395,
      "grad_norm": 0.003677819389849901,
      "learning_rate": 6.523681858802503e-06,
      "loss": 0.0009,
      "step": 4330
    },
    {
      "epoch": 3.8784629133154604,
      "grad_norm": 0.003563833888620138,
      "learning_rate": 6.076854334226989e-06,
      "loss": 0.0251,
      "step": 4340
    },
    {
      "epoch": 3.8873994638069704,
      "grad_norm": 0.012182756327092648,
      "learning_rate": 5.630026809651475e-06,
      "loss": 0.0422,
      "step": 4350
    },
    {
      "epoch": 3.896336014298481,
      "grad_norm": 0.004781792871654034,
      "learning_rate": 5.1831992850759615e-06,
      "loss": 0.0021,
      "step": 4360
    },
    {
      "epoch": 3.905272564789991,
      "grad_norm": 0.003455075901001692,
      "learning_rate": 4.7363717605004475e-06,
      "loss": 0.0182,
      "step": 4370
    },
    {
      "epoch": 3.9142091152815013,
      "grad_norm": 0.00627366965636611,
      "learning_rate": 4.2895442359249335e-06,
      "loss": 0.0148,
      "step": 4380
    },
    {
      "epoch": 3.9231456657730117,
      "grad_norm": 0.015941530466079712,
      "learning_rate": 3.8427167113494195e-06,
      "loss": 0.0015,
      "step": 4390
    },
    {
      "epoch": 3.932082216264522,
      "grad_norm": 0.004724125377833843,
      "learning_rate": 3.3958891867739055e-06,
      "loss": 0.0093,
      "step": 4400
    },
    {
      "epoch": 3.941018766756032,
      "grad_norm": 0.0062377783469855785,
      "learning_rate": 2.9490616621983915e-06,
      "loss": 0.0011,
      "step": 4410
    },
    {
      "epoch": 3.9499553172475426,
      "grad_norm": 0.0042613474652171135,
      "learning_rate": 2.502234137622878e-06,
      "loss": 0.0141,
      "step": 4420
    },
    {
      "epoch": 3.9588918677390526,
      "grad_norm": 0.005040575284510851,
      "learning_rate": 2.055406613047364e-06,
      "loss": 0.0061,
      "step": 4430
    },
    {
      "epoch": 3.967828418230563,
      "grad_norm": 0.004678263328969479,
      "learning_rate": 1.60857908847185e-06,
      "loss": 0.0011,
      "step": 4440
    },
    {
      "epoch": 3.9767649687220734,
      "grad_norm": 0.0033705062232911587,
      "learning_rate": 1.161751563896336e-06,
      "loss": 0.0065,
      "step": 4450
    },
    {
      "epoch": 3.9857015192135834,
      "grad_norm": 0.004543005023151636,
      "learning_rate": 7.149240393208222e-07,
      "loss": 0.0011,
      "step": 4460
    },
    {
      "epoch": 3.994638069705094,
      "grad_norm": 0.01603855937719345,
      "learning_rate": 2.6809651474530835e-07,
      "loss": 0.001,
      "step": 4470
    },
    {
      "epoch": 4.0,
      "step": 4476,
      "total_flos": 5.549295064059888e+18,
      "train_loss": 0.1165861947240576,
      "train_runtime": 2488.5837,
      "train_samples_per_second": 28.775,
      "train_steps_per_second": 1.799
    }
  ],
  "logging_steps": 10,
  "max_steps": 4476,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 4,
  "save_steps": 2000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 5.549295064059888e+18,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}