{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9806451612903224,
  "eval_steps": 500,
  "global_step": 231,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.012903225806451613,
      "grad_norm": 0.882150089808769,
      "learning_rate": 8.333333333333334e-06,
      "loss": 1.3191,
      "step": 1
    },
    {
      "epoch": 0.025806451612903226,
      "grad_norm": 0.8369153094823952,
      "learning_rate": 1.6666666666666667e-05,
      "loss": 1.249,
      "step": 2
    },
    {
      "epoch": 0.03870967741935484,
      "grad_norm": 0.8525103918091212,
      "learning_rate": 2.5e-05,
      "loss": 1.2775,
      "step": 3
    },
    {
      "epoch": 0.05161290322580645,
      "grad_norm": 0.8113130093304075,
      "learning_rate": 3.3333333333333335e-05,
      "loss": 1.2577,
      "step": 4
    },
    {
      "epoch": 0.06451612903225806,
      "grad_norm": 0.7691226782403744,
      "learning_rate": 4.166666666666667e-05,
      "loss": 1.2275,
      "step": 5
    },
    {
      "epoch": 0.07741935483870968,
      "grad_norm": 0.5954210054804412,
      "learning_rate": 5e-05,
      "loss": 1.1159,
      "step": 6
    },
    {
      "epoch": 0.09032258064516129,
      "grad_norm": 0.48189256930049384,
      "learning_rate": 5.833333333333334e-05,
      "loss": 1.0593,
      "step": 7
    },
    {
      "epoch": 0.1032258064516129,
      "grad_norm": 0.5241879927945232,
      "learning_rate": 6.666666666666667e-05,
      "loss": 1.0031,
      "step": 8
    },
    {
      "epoch": 0.11612903225806452,
      "grad_norm": 0.5751865259411146,
      "learning_rate": 7.500000000000001e-05,
      "loss": 0.9263,
      "step": 9
    },
    {
      "epoch": 0.12903225806451613,
      "grad_norm": 0.5686526755807603,
      "learning_rate": 8.333333333333334e-05,
      "loss": 0.8146,
      "step": 10
    },
    {
      "epoch": 0.14193548387096774,
      "grad_norm": 0.5156906474251192,
      "learning_rate": 9.166666666666667e-05,
      "loss": 0.7583,
      "step": 11
    },
    {
      "epoch": 0.15483870967741936,
      "grad_norm": 0.4901634328534619,
      "learning_rate": 0.0001,
      "loss": 0.6686,
      "step": 12
    },
    {
      "epoch": 0.16774193548387098,
      "grad_norm": 0.376084270046461,
      "learning_rate": 0.00010833333333333333,
      "loss": 0.6005,
      "step": 13
    },
    {
      "epoch": 0.18064516129032257,
      "grad_norm": 0.2761318809240614,
      "learning_rate": 0.00011666666666666668,
      "loss": 0.5741,
      "step": 14
    },
    {
      "epoch": 0.1935483870967742,
      "grad_norm": 0.25038763704461725,
      "learning_rate": 0.000125,
      "loss": 0.5465,
      "step": 15
    },
    {
      "epoch": 0.2064516129032258,
      "grad_norm": 0.2214903977106201,
      "learning_rate": 0.00013333333333333334,
      "loss": 0.5138,
      "step": 16
    },
    {
      "epoch": 0.21935483870967742,
      "grad_norm": 0.28905541505099525,
      "learning_rate": 0.00014166666666666668,
      "loss": 0.5247,
      "step": 17
    },
    {
      "epoch": 0.23225806451612904,
      "grad_norm": 0.20699066633757193,
      "learning_rate": 0.00015000000000000001,
      "loss": 0.4978,
      "step": 18
    },
    {
      "epoch": 0.24516129032258063,
      "grad_norm": 0.219457528851344,
      "learning_rate": 0.00015833333333333332,
      "loss": 0.4924,
      "step": 19
    },
    {
      "epoch": 0.25806451612903225,
      "grad_norm": 0.16596853789220767,
      "learning_rate": 0.0001666666666666667,
      "loss": 0.4759,
      "step": 20
    },
    {
      "epoch": 0.2709677419354839,
      "grad_norm": 0.13228412371333673,
      "learning_rate": 0.000175,
      "loss": 0.4613,
      "step": 21
    },
    {
      "epoch": 0.2838709677419355,
      "grad_norm": 0.1421107856190867,
      "learning_rate": 0.00018333333333333334,
      "loss": 0.4852,
      "step": 22
    },
    {
      "epoch": 0.2967741935483871,
      "grad_norm": 0.12552928984887968,
      "learning_rate": 0.00019166666666666667,
      "loss": 0.4786,
      "step": 23
    },
    {
      "epoch": 0.3096774193548387,
      "grad_norm": 0.11489463060846784,
      "learning_rate": 0.0002,
      "loss": 0.4532,
      "step": 24
    },
    {
      "epoch": 0.3225806451612903,
      "grad_norm": 0.11476879539402507,
      "learning_rate": 0.00019998848349441062,
      "loss": 0.4454,
      "step": 25
    },
    {
      "epoch": 0.33548387096774196,
      "grad_norm": 0.1256602270101812,
      "learning_rate": 0.00019995393663024054,
      "loss": 0.4513,
      "step": 26
    },
    {
      "epoch": 0.34838709677419355,
      "grad_norm": 0.11833482485698336,
      "learning_rate": 0.00019989636736467278,
      "loss": 0.44,
      "step": 27
    },
    {
      "epoch": 0.36129032258064514,
      "grad_norm": 0.11124019681377781,
      "learning_rate": 0.00019981578895764273,
      "loss": 0.4439,
      "step": 28
    },
    {
      "epoch": 0.3741935483870968,
      "grad_norm": 0.10954971384477814,
      "learning_rate": 0.00019971221996878394,
      "loss": 0.4274,
      "step": 29
    },
    {
      "epoch": 0.3870967741935484,
      "grad_norm": 0.11422715129880294,
      "learning_rate": 0.00019958568425315314,
      "loss": 0.4254,
      "step": 30
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.11262310014016527,
      "learning_rate": 0.00019943621095573586,
      "loss": 0.4204,
      "step": 31
    },
    {
      "epoch": 0.4129032258064516,
      "grad_norm": 0.11143099554463408,
      "learning_rate": 0.00019926383450473344,
      "loss": 0.4105,
      "step": 32
    },
    {
      "epoch": 0.4258064516129032,
      "grad_norm": 0.1088260973247734,
      "learning_rate": 0.00019906859460363307,
      "loss": 0.4136,
      "step": 33
    },
    {
      "epoch": 0.43870967741935485,
      "grad_norm": 0.10400753996611788,
      "learning_rate": 0.00019885053622206304,
      "loss": 0.4213,
      "step": 34
    },
    {
      "epoch": 0.45161290322580644,
      "grad_norm": 0.09587900896302251,
      "learning_rate": 0.0001986097095854347,
      "loss": 0.4085,
      "step": 35
    },
    {
      "epoch": 0.4645161290322581,
      "grad_norm": 0.10119603747308556,
      "learning_rate": 0.0001983461701633742,
      "loss": 0.4181,
      "step": 36
    },
    {
      "epoch": 0.4774193548387097,
      "grad_norm": 0.10062413136253176,
      "learning_rate": 0.00019805997865694614,
      "loss": 0.4098,
      "step": 37
    },
    {
      "epoch": 0.49032258064516127,
      "grad_norm": 0.09162394941720846,
      "learning_rate": 0.0001977512009846721,
      "loss": 0.4085,
      "step": 38
    },
    {
      "epoch": 0.5032258064516129,
      "grad_norm": 0.09269316443279575,
      "learning_rate": 0.00019741990826734794,
      "loss": 0.3994,
      "step": 39
    },
    {
      "epoch": 0.5161290322580645,
      "grad_norm": 0.08782581803238095,
      "learning_rate": 0.00019706617681166218,
      "loss": 0.3983,
      "step": 40
    },
    {
      "epoch": 0.5290322580645161,
      "grad_norm": 0.08665646987756218,
      "learning_rate": 0.00019669008809262062,
      "loss": 0.3938,
      "step": 41
    },
    {
      "epoch": 0.5419354838709678,
      "grad_norm": 0.09289388957990503,
      "learning_rate": 0.00019629172873477995,
      "loss": 0.396,
      "step": 42
    },
    {
      "epoch": 0.5548387096774193,
      "grad_norm": 0.09203344649472522,
      "learning_rate": 0.00019587119049229557,
      "loss": 0.4052,
      "step": 43
    },
    {
      "epoch": 0.567741935483871,
      "grad_norm": 0.08209774194723368,
      "learning_rate": 0.0001954285702277879,
      "loss": 0.3959,
      "step": 44
    },
    {
      "epoch": 0.5806451612903226,
      "grad_norm": 0.08595872863630391,
      "learning_rate": 0.00019496396989003193,
      "loss": 0.397,
      "step": 45
    },
    {
      "epoch": 0.5935483870967742,
      "grad_norm": 0.09041908237644536,
      "learning_rate": 0.00019447749649047542,
      "loss": 0.3992,
      "step": 46
    },
    {
      "epoch": 0.6064516129032258,
      "grad_norm": 0.08321976348844515,
      "learning_rate": 0.00019396926207859084,
      "loss": 0.4095,
      "step": 47
    },
    {
      "epoch": 0.6193548387096774,
      "grad_norm": 0.07887604040253807,
      "learning_rate": 0.00019343938371606712,
      "loss": 0.3866,
      "step": 48
    },
    {
      "epoch": 0.632258064516129,
      "grad_norm": 0.08329265943906447,
      "learning_rate": 0.00019288798344984672,
      "loss": 0.3985,
      "step": 49
    },
    {
      "epoch": 0.6451612903225806,
      "grad_norm": 0.08661703211305888,
      "learning_rate": 0.00019231518828401458,
      "loss": 0.3925,
      "step": 50
    },
    {
      "epoch": 0.6580645161290323,
      "grad_norm": 0.08382217550700771,
      "learning_rate": 0.00019172113015054532,
      "loss": 0.3862,
      "step": 51
    },
    {
      "epoch": 0.6709677419354839,
      "grad_norm": 0.08245124856491458,
      "learning_rate": 0.00019110594587891519,
      "loss": 0.3847,
      "step": 52
    },
    {
      "epoch": 0.6838709677419355,
      "grad_norm": 0.08319716279149986,
      "learning_rate": 0.00019046977716458626,
      "loss": 0.3775,
      "step": 53
    },
    {
      "epoch": 0.6967741935483871,
      "grad_norm": 0.08074648144423298,
      "learning_rate": 0.0001898127705363696,
      "loss": 0.3786,
      "step": 54
    },
    {
      "epoch": 0.7096774193548387,
      "grad_norm": 0.08472762376284584,
      "learning_rate": 0.0001891350773226754,
      "loss": 0.3923,
      "step": 55
    },
    {
      "epoch": 0.7225806451612903,
      "grad_norm": 0.08398076059437376,
      "learning_rate": 0.00018843685361665723,
      "loss": 0.3709,
      "step": 56
    },
    {
      "epoch": 0.7354838709677419,
      "grad_norm": 0.08465216102770419,
      "learning_rate": 0.00018771826024025946,
      "loss": 0.3818,
      "step": 57
    },
    {
      "epoch": 0.7483870967741936,
      "grad_norm": 0.09145572810056589,
      "learning_rate": 0.00018697946270717467,
      "loss": 0.39,
      "step": 58
    },
    {
      "epoch": 0.7612903225806451,
      "grad_norm": 0.08415188367023674,
      "learning_rate": 0.00018622063118472134,
      "loss": 0.3733,
      "step": 59
    },
    {
      "epoch": 0.7741935483870968,
      "grad_norm": 0.08576290382509591,
      "learning_rate": 0.00018544194045464886,
      "loss": 0.3878,
      "step": 60
    },
    {
      "epoch": 0.7870967741935484,
      "grad_norm": 0.0844142047859298,
      "learning_rate": 0.00018464356987288013,
      "loss": 0.3637,
      "step": 61
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.08918487261557899,
      "learning_rate": 0.00018382570332820043,
      "loss": 0.3775,
      "step": 62
    },
    {
      "epoch": 0.8129032258064516,
      "grad_norm": 0.0795181880669878,
      "learning_rate": 0.00018298852919990252,
      "loss": 0.3853,
      "step": 63
    },
    {
      "epoch": 0.8258064516129032,
      "grad_norm": 0.08173055996583302,
      "learning_rate": 0.0001821322403143969,
      "loss": 0.38,
      "step": 64
    },
    {
      "epoch": 0.8387096774193549,
      "grad_norm": 0.08525070031165603,
      "learning_rate": 0.0001812570339007983,
      "loss": 0.3778,
      "step": 65
    },
    {
      "epoch": 0.8516129032258064,
      "grad_norm": 0.08531235204546653,
      "learning_rate": 0.00018036311154549784,
      "loss": 0.3727,
      "step": 66
    },
    {
      "epoch": 0.864516129032258,
      "grad_norm": 0.08169851479895494,
      "learning_rate": 0.00017945067914573146,
      "loss": 0.365,
      "step": 67
    },
    {
      "epoch": 0.8774193548387097,
      "grad_norm": 0.08463789046916101,
      "learning_rate": 0.0001785199468621559,
      "loss": 0.3752,
      "step": 68
    },
    {
      "epoch": 0.8903225806451613,
      "grad_norm": 0.09441843624235378,
      "learning_rate": 0.000177571129070442,
      "loss": 0.3665,
      "step": 69
    },
    {
      "epoch": 0.9032258064516129,
      "grad_norm": 0.08530939476149231,
      "learning_rate": 0.0001766044443118978,
      "loss": 0.3926,
      "step": 70
    },
    {
      "epoch": 0.9161290322580645,
      "grad_norm": 0.0836606457284625,
      "learning_rate": 0.00017562011524313185,
      "loss": 0.3844,
      "step": 71
    },
    {
      "epoch": 0.9290322580645162,
      "grad_norm": 0.09868625782773943,
      "learning_rate": 0.00017461836858476856,
      "loss": 0.3835,
      "step": 72
    },
    {
      "epoch": 0.9419354838709677,
      "grad_norm": 0.082132336261239,
      "learning_rate": 0.00017359943506922774,
      "loss": 0.3792,
      "step": 73
    },
    {
      "epoch": 0.9548387096774194,
      "grad_norm": 0.08948965393301354,
      "learning_rate": 0.0001725635493875799,
      "loss": 0.3813,
      "step": 74
    },
    {
      "epoch": 0.967741935483871,
      "grad_norm": 0.08539410389371488,
      "learning_rate": 0.00017151095013548994,
      "loss": 0.3774,
      "step": 75
    },
    {
      "epoch": 0.9806451612903225,
      "grad_norm": 0.08690404790165682,
      "learning_rate": 0.00017044187975826124,
      "loss": 0.3762,
      "step": 76
    },
    {
      "epoch": 0.9935483870967742,
      "grad_norm": 0.09039522496805455,
      "learning_rate": 0.0001693565844949933,
      "loss": 0.3733,
      "step": 77
    },
    {
      "epoch": 0.9935483870967742,
      "eval_loss": 0.3743511736392975,
      "eval_runtime": 42.1339,
      "eval_samples_per_second": 24.66,
      "eval_steps_per_second": 0.783,
      "step": 77
    },
    {
      "epoch": 1.0064516129032257,
      "grad_norm": 0.09165665911792642,
      "learning_rate": 0.00016825531432186543,
      "loss": 0.3532,
      "step": 78
    },
    {
      "epoch": 1.0193548387096774,
      "grad_norm": 0.0801922544260219,
      "learning_rate": 0.0001671383228945597,
      "loss": 0.347,
      "step": 79
    },
    {
      "epoch": 1.032258064516129,
      "grad_norm": 0.08352186065175837,
      "learning_rate": 0.00016600586748983641,
      "loss": 0.3566,
      "step": 80
    },
    {
      "epoch": 1.0451612903225806,
      "grad_norm": 0.08793176795367076,
      "learning_rate": 0.0001648582089462756,
      "loss": 0.3473,
      "step": 81
    },
    {
      "epoch": 1.0580645161290323,
      "grad_norm": 0.08913951531063671,
      "learning_rate": 0.00016369561160419784,
      "loss": 0.342,
      "step": 82
    },
    {
      "epoch": 1.070967741935484,
      "grad_norm": 0.08309712335786672,
      "learning_rate": 0.0001625183432447789,
      "loss": 0.345,
      "step": 83
    },
    {
      "epoch": 1.0838709677419356,
      "grad_norm": 0.08725330804483407,
      "learning_rate": 0.00016132667502837165,
      "loss": 0.3523,
      "step": 84
    },
    {
      "epoch": 1.096774193548387,
      "grad_norm": 0.08680862762413778,
      "learning_rate": 0.00016012088143204953,
      "loss": 0.3554,
      "step": 85
    },
    {
      "epoch": 1.1096774193548387,
      "grad_norm": 0.0863782848559528,
      "learning_rate": 0.00015890124018638638,
      "loss": 0.364,
      "step": 86
    },
    {
      "epoch": 1.1225806451612903,
      "grad_norm": 0.08388848992116194,
      "learning_rate": 0.00015766803221148673,
      "loss": 0.3568,
      "step": 87
    },
    {
      "epoch": 1.135483870967742,
      "grad_norm": 0.08226994751114965,
      "learning_rate": 0.00015642154155228122,
      "loss": 0.3489,
      "step": 88
    },
    {
      "epoch": 1.1483870967741936,
      "grad_norm": 0.08575965994905438,
      "learning_rate": 0.00015516205531310273,
      "loss": 0.3466,
      "step": 89
    },
    {
      "epoch": 1.1612903225806452,
      "grad_norm": 0.0895747440427046,
      "learning_rate": 0.00015388986359155758,
      "loss": 0.3488,
      "step": 90
    },
    {
      "epoch": 1.1741935483870969,
      "grad_norm": 0.08403222320010312,
      "learning_rate": 0.00015260525941170712,
      "loss": 0.356,
      "step": 91
    },
    {
      "epoch": 1.1870967741935483,
      "grad_norm": 0.08627434364043794,
      "learning_rate": 0.0001513085386565758,
      "loss": 0.3519,
      "step": 92
    },
    {
      "epoch": 1.2,
      "grad_norm": 0.08925414655300028,
      "learning_rate": 0.00015000000000000001,
      "loss": 0.3523,
      "step": 93
    },
    {
      "epoch": 1.2129032258064516,
      "grad_norm": 0.09120079741968923,
      "learning_rate": 0.00014867994483783485,
      "loss": 0.3555,
      "step": 94
    },
    {
      "epoch": 1.2258064516129032,
      "grad_norm": 0.08519037826685563,
      "learning_rate": 0.0001473486772185334,
      "loss": 0.3551,
      "step": 95
    },
    {
      "epoch": 1.238709677419355,
      "grad_norm": 0.08814591743170447,
      "learning_rate": 0.00014600650377311522,
      "loss": 0.3535,
      "step": 96
    },
    {
      "epoch": 1.2516129032258063,
      "grad_norm": 0.08812877093082108,
      "learning_rate": 0.00014465373364454001,
      "loss": 0.3498,
      "step": 97
    },
    {
      "epoch": 1.2645161290322582,
      "grad_norm": 0.08596197743921638,
      "learning_rate": 0.00014329067841650274,
      "loss": 0.3484,
      "step": 98
    },
    {
      "epoch": 1.2774193548387096,
      "grad_norm": 0.09025513346881896,
      "learning_rate": 0.00014191765204166643,
      "loss": 0.3465,
      "step": 99
    },
    {
      "epoch": 1.2903225806451613,
      "grad_norm": 0.08665409616008209,
      "learning_rate": 0.00014053497076934948,
      "loss": 0.35,
      "step": 100
    },
    {
      "epoch": 1.303225806451613,
      "grad_norm": 0.09012608398761074,
      "learning_rate": 0.00013914295307268396,
      "loss": 0.3516,
      "step": 101
    },
    {
      "epoch": 1.3161290322580645,
      "grad_norm": 0.09456407877563842,
      "learning_rate": 0.00013774191957526143,
      "loss": 0.3639,
      "step": 102
    },
    {
      "epoch": 1.3290322580645162,
      "grad_norm": 0.0888376260234129,
      "learning_rate": 0.00013633219297728416,
      "loss": 0.3396,
      "step": 103
    },
    {
      "epoch": 1.3419354838709676,
      "grad_norm": 0.08652600639054038,
      "learning_rate": 0.00013491409798123687,
      "loss": 0.3445,
      "step": 104
    },
    {
      "epoch": 1.3548387096774195,
      "grad_norm": 0.09269194410505097,
      "learning_rate": 0.00013348796121709862,
      "loss": 0.3555,
      "step": 105
    },
    {
      "epoch": 1.367741935483871,
      "grad_norm": 0.09421096011594207,
      "learning_rate": 0.00013205411116710972,
      "loss": 0.3508,
      "step": 106
    },
    {
      "epoch": 1.3806451612903226,
      "grad_norm": 0.09286783444235318,
      "learning_rate": 0.00013061287809011242,
      "loss": 0.3571,
      "step": 107
    },
    {
      "epoch": 1.3935483870967742,
      "grad_norm": 0.08172852976047028,
      "learning_rate": 0.0001291645939454825,
      "loss": 0.3488,
      "step": 108
    },
    {
      "epoch": 1.4064516129032258,
      "grad_norm": 0.09033973727962885,
      "learning_rate": 0.0001277095923166689,
      "loss": 0.3498,
      "step": 109
    },
    {
      "epoch": 1.4193548387096775,
      "grad_norm": 0.09628933362833343,
      "learning_rate": 0.00012624820833435937,
      "loss": 0.3472,
      "step": 110
    },
    {
      "epoch": 1.432258064516129,
      "grad_norm": 0.08471497514674803,
      "learning_rate": 0.00012478077859929,
      "loss": 0.3353,
      "step": 111
    },
    {
      "epoch": 1.4451612903225808,
      "grad_norm": 0.08976133324522119,
      "learning_rate": 0.00012330764110471566,
      "loss": 0.3468,
      "step": 112
    },
    {
      "epoch": 1.4580645161290322,
      "grad_norm": 0.09634877556737409,
      "learning_rate": 0.00012182913515856015,
      "loss": 0.3541,
      "step": 113
    },
    {
      "epoch": 1.4709677419354839,
      "grad_norm": 0.09348923296138459,
      "learning_rate": 0.0001203456013052634,
      "loss": 0.3521,
      "step": 114
    },
    {
      "epoch": 1.4838709677419355,
      "grad_norm": 0.09437711091684706,
      "learning_rate": 0.00011885738124734358,
      "loss": 0.3566,
      "step": 115
    },
    {
      "epoch": 1.4967741935483871,
      "grad_norm": 0.08916702937111011,
      "learning_rate": 0.00011736481776669306,
      "loss": 0.3458,
      "step": 116
    },
    {
      "epoch": 1.5096774193548388,
      "grad_norm": 0.09100601467580355,
      "learning_rate": 0.00011586825464562514,
      "loss": 0.3593,
      "step": 117
    },
    {
      "epoch": 1.5225806451612902,
      "grad_norm": 0.08990470683690902,
      "learning_rate": 0.00011436803658769082,
      "loss": 0.3434,
      "step": 118
    },
    {
      "epoch": 1.535483870967742,
      "grad_norm": 0.0932653393737011,
      "learning_rate": 0.00011286450913828312,
      "loss": 0.342,
      "step": 119
    },
    {
      "epoch": 1.5483870967741935,
      "grad_norm": 0.08960531773257623,
      "learning_rate": 0.00011135801860504749,
      "loss": 0.3628,
      "step": 120
    },
    {
      "epoch": 1.5612903225806452,
      "grad_norm": 0.09275069273094473,
      "learning_rate": 0.00010984891197811687,
      "loss": 0.3513,
      "step": 121
    },
    {
      "epoch": 1.5741935483870968,
      "grad_norm": 0.09527469311088294,
      "learning_rate": 0.00010833753685018935,
      "loss": 0.3556,
      "step": 122
    },
    {
      "epoch": 1.5870967741935482,
      "grad_norm": 0.09323849659154124,
      "learning_rate": 0.0001068242413364671,
      "loss": 0.3448,
      "step": 123
    },
    {
      "epoch": 1.6,
      "grad_norm": 0.08474554028292876,
      "learning_rate": 0.00010530937399447496,
      "loss": 0.3499,
      "step": 124
    },
    {
      "epoch": 1.6129032258064515,
      "grad_norm": 0.09382059811382143,
      "learning_rate": 0.00010379328374377715,
      "loss": 0.3384,
      "step": 125
    },
    {
      "epoch": 1.6258064516129034,
      "grad_norm": 0.09276702527842776,
      "learning_rate": 0.00010227631978561056,
      "loss": 0.3444,
      "step": 126
    },
    {
      "epoch": 1.6387096774193548,
      "grad_norm": 0.08750152088472078,
      "learning_rate": 0.00010075883152245334,
      "loss": 0.3569,
      "step": 127
    },
    {
      "epoch": 1.6516129032258065,
      "grad_norm": 0.08714445180642569,
      "learning_rate": 9.92411684775467e-05,
      "loss": 0.342,
      "step": 128
    },
    {
      "epoch": 1.664516129032258,
      "grad_norm": 0.08469902272466831,
      "learning_rate": 9.772368021438943e-05,
      "loss": 0.3342,
      "step": 129
    },
    {
      "epoch": 1.6774193548387095,
      "grad_norm": 0.08724585745005611,
      "learning_rate": 9.620671625622288e-05,
      "loss": 0.3335,
      "step": 130
    },
    {
      "epoch": 1.6903225806451614,
      "grad_norm": 0.09087336723016343,
      "learning_rate": 9.469062600552509e-05,
      "loss": 0.3447,
      "step": 131
    },
    {
      "epoch": 1.7032258064516128,
      "grad_norm": 0.08863278083042062,
      "learning_rate": 9.317575866353292e-05,
      "loss": 0.3487,
      "step": 132
    },
    {
      "epoch": 1.7161290322580647,
      "grad_norm": 0.08343459715762,
      "learning_rate": 9.166246314981066e-05,
      "loss": 0.3454,
      "step": 133
    },
    {
      "epoch": 1.729032258064516,
      "grad_norm": 0.08837483796029806,
      "learning_rate": 9.015108802188313e-05,
      "loss": 0.3484,
      "step": 134
    },
    {
      "epoch": 1.7419354838709677,
      "grad_norm": 0.08762249376974672,
      "learning_rate": 8.86419813949525e-05,
      "loss": 0.3447,
      "step": 135
    },
    {
      "epoch": 1.7548387096774194,
      "grad_norm": 0.08446853010895118,
      "learning_rate": 8.713549086171691e-05,
      "loss": 0.3466,
      "step": 136
    },
    {
      "epoch": 1.7677419354838708,
      "grad_norm": 0.08897676787603495,
      "learning_rate": 8.563196341230919e-05,
      "loss": 0.3434,
      "step": 137
    },
    {
      "epoch": 1.7806451612903227,
      "grad_norm": 0.09210810174866911,
      "learning_rate": 8.413174535437487e-05,
      "loss": 0.355,
      "step": 138
    },
    {
      "epoch": 1.793548387096774,
      "grad_norm": 0.0877098792555575,
      "learning_rate": 8.263518223330697e-05,
      "loss": 0.3392,
      "step": 139
    },
    {
      "epoch": 1.8064516129032258,
      "grad_norm": 0.09059259587839792,
      "learning_rate": 8.114261875265643e-05,
      "loss": 0.3465,
      "step": 140
    },
    {
      "epoch": 1.8193548387096774,
      "grad_norm": 0.09043152099082513,
      "learning_rate": 7.965439869473664e-05,
      "loss": 0.3409,
      "step": 141
    },
    {
      "epoch": 1.832258064516129,
      "grad_norm": 0.08863483273837267,
      "learning_rate": 7.817086484143986e-05,
      "loss": 0.3497,
      "step": 142
    },
    {
      "epoch": 1.8451612903225807,
      "grad_norm": 0.08351509862847174,
      "learning_rate": 7.669235889528436e-05,
      "loss": 0.3484,
      "step": 143
    },
    {
      "epoch": 1.8580645161290321,
      "grad_norm": 0.08881689002413959,
      "learning_rate": 7.521922140071002e-05,
      "loss": 0.3428,
      "step": 144
    },
    {
      "epoch": 1.870967741935484,
      "grad_norm": 0.08962413300366581,
      "learning_rate": 7.375179166564063e-05,
      "loss": 0.3353,
      "step": 145
    },
    {
      "epoch": 1.8838709677419354,
      "grad_norm": 0.08991947191225944,
      "learning_rate": 7.229040768333115e-05,
      "loss": 0.3366,
      "step": 146
    },
    {
      "epoch": 1.896774193548387,
      "grad_norm": 0.0890545628104281,
      "learning_rate": 7.08354060545175e-05,
      "loss": 0.3381,
      "step": 147
    },
    {
      "epoch": 1.9096774193548387,
      "grad_norm": 0.09306016588414409,
      "learning_rate": 6.93871219098876e-05,
      "loss": 0.3356,
      "step": 148
    },
    {
      "epoch": 1.9225806451612903,
      "grad_norm": 0.08816048934545212,
      "learning_rate": 6.79458888328903e-05,
      "loss": 0.3412,
      "step": 149
    },
    {
      "epoch": 1.935483870967742,
      "grad_norm": 0.09006593042575502,
      "learning_rate": 6.651203878290139e-05,
      "loss": 0.3471,
      "step": 150
    },
    {
      "epoch": 1.9483870967741934,
      "grad_norm": 0.08499237638300171,
      "learning_rate": 6.508590201876317e-05,
      "loss": 0.335,
      "step": 151
    },
    {
      "epoch": 1.9612903225806453,
      "grad_norm": 0.09566747308379261,
      "learning_rate": 6.366780702271589e-05,
      "loss": 0.3395,
      "step": 152
    },
    {
      "epoch": 1.9741935483870967,
      "grad_norm": 0.0915253754596643,
      "learning_rate": 6.225808042473858e-05,
      "loss": 0.3488,
      "step": 153
    },
    {
      "epoch": 1.9870967741935484,
      "grad_norm": 0.08657357278603872,
      "learning_rate": 6.085704692731609e-05,
      "loss": 0.3344,
      "step": 154
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.08950726731743963,
      "learning_rate": 5.9465029230650534e-05,
      "loss": 0.33,
      "step": 155
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.35439133644104004,
      "eval_runtime": 36.1469,
      "eval_samples_per_second": 28.744,
      "eval_steps_per_second": 0.913,
      "step": 155
    },
    {
      "epoch": 2.0129032258064514,
      "grad_norm": 0.08961232668946545,
      "learning_rate": 5.8082347958333625e-05,
      "loss": 0.3273,
      "step": 156
    },
    {
      "epoch": 2.0258064516129033,
      "grad_norm": 0.09402916213349197,
      "learning_rate": 5.670932158349731e-05,
      "loss": 0.3218,
      "step": 157
    },
    {
      "epoch": 2.0387096774193547,
      "grad_norm": 0.08520247695821515,
      "learning_rate": 5.5346266355459995e-05,
      "loss": 0.3089,
      "step": 158
    },
    {
      "epoch": 2.0516129032258066,
      "grad_norm": 0.08637288183919145,
      "learning_rate": 5.399349622688479e-05,
      "loss": 0.3266,
      "step": 159
    },
    {
      "epoch": 2.064516129032258,
      "grad_norm": 0.08823864345930746,
      "learning_rate": 5.26513227814666e-05,
      "loss": 0.329,
      "step": 160
    },
    {
      "epoch": 2.07741935483871,
      "grad_norm": 0.09384371931382793,
      "learning_rate": 5.1320055162165115e-05,
      "loss": 0.3275,
      "step": 161
    },
    {
      "epoch": 2.0903225806451613,
      "grad_norm": 0.09516405744887674,
      "learning_rate": 5.000000000000002e-05,
      "loss": 0.332,
      "step": 162
    },
    {
      "epoch": 2.1032258064516127,
      "grad_norm": 0.08966279182804247,
      "learning_rate": 4.869146134342426e-05,
      "loss": 0.3247,
      "step": 163
    },
    {
      "epoch": 2.1161290322580646,
      "grad_norm": 0.08700940402163973,
      "learning_rate": 4.739474058829289e-05,
      "loss": 0.3221,
      "step": 164
    },
    {
      "epoch": 2.129032258064516,
      "grad_norm": 0.08984677102800173,
      "learning_rate": 4.611013640844245e-05,
      "loss": 0.3272,
      "step": 165
    },
    {
      "epoch": 2.141935483870968,
      "grad_norm": 0.08964202186304891,
      "learning_rate": 4.483794468689728e-05,
      "loss": 0.3188,
      "step": 166
    },
    {
      "epoch": 2.1548387096774193,
      "grad_norm": 0.09997697429798251,
      "learning_rate": 4.357845844771881e-05,
      "loss": 0.3383,
      "step": 167
    },
    {
      "epoch": 2.167741935483871,
      "grad_norm": 0.09510073376177604,
      "learning_rate": 4.2331967788513295e-05,
      "loss": 0.3252,
      "step": 168
    },
    {
      "epoch": 2.1806451612903226,
      "grad_norm": 0.09107612709336496,
      "learning_rate": 4.109875981361363e-05,
      "loss": 0.3217,
      "step": 169
    },
    {
      "epoch": 2.193548387096774,
      "grad_norm": 0.08804927379783276,
      "learning_rate": 3.987911856795047e-05,
      "loss": 0.3173,
      "step": 170
    },
    {
      "epoch": 2.206451612903226,
      "grad_norm": 0.0916081059987062,
      "learning_rate": 3.8673324971628357e-05,
      "loss": 0.3285,
      "step": 171
    },
    {
      "epoch": 2.2193548387096773,
      "grad_norm": 0.09226628432750343,
      "learning_rate": 3.7481656755221125e-05,
      "loss": 0.3154,
      "step": 172
    },
    {
      "epoch": 2.232258064516129,
      "grad_norm": 0.09145015878266409,
      "learning_rate": 3.630438839580217e-05,
      "loss": 0.3087,
      "step": 173
    },
    {
      "epoch": 2.2451612903225806,
      "grad_norm": 0.08786201399591659,
      "learning_rate": 3.5141791053724405e-05,
      "loss": 0.3151,
      "step": 174
    },
    {
      "epoch": 2.258064516129032,
      "grad_norm": 0.09259402512083086,
      "learning_rate": 3.399413251016359e-05,
      "loss": 0.3369,
      "step": 175
    },
    {
      "epoch": 2.270967741935484,
      "grad_norm": 0.09311260751337232,
      "learning_rate": 3.2861677105440336e-05,
      "loss": 0.3051,
      "step": 176
    },
    {
      "epoch": 2.2838709677419353,
      "grad_norm": 0.09217712904693832,
      "learning_rate": 3.174468567813461e-05,
      "loss": 0.3199,
      "step": 177
    },
    {
      "epoch": 2.296774193548387,
      "grad_norm": 0.09141877592974519,
      "learning_rate": 3.0643415505006735e-05,
      "loss": 0.3229,
      "step": 178
    },
    {
      "epoch": 2.3096774193548386,
      "grad_norm": 0.09528833689903496,
      "learning_rate": 2.9558120241738784e-05,
      "loss": 0.3286,
      "step": 179
    },
    {
      "epoch": 2.3225806451612905,
      "grad_norm": 0.09070636787107308,
      "learning_rate": 2.8489049864510054e-05,
      "loss": 0.3348,
      "step": 180
    },
    {
      "epoch": 2.335483870967742,
      "grad_norm": 0.09307512327341362,
      "learning_rate": 2.7436450612420095e-05,
      "loss": 0.3256,
      "step": 181
    },
    {
      "epoch": 2.3483870967741938,
      "grad_norm": 0.09127823479306682,
      "learning_rate": 2.640056493077231e-05,
      "loss": 0.3181,
      "step": 182
    },
    {
      "epoch": 2.361290322580645,
      "grad_norm": 0.09246009256113925,
      "learning_rate": 2.5381631415231454e-05,
      "loss": 0.3391,
      "step": 183
    },
    {
      "epoch": 2.3741935483870966,
      "grad_norm": 0.09095352379758655,
      "learning_rate": 2.4379884756868167e-05,
      "loss": 0.3172,
      "step": 184
    },
    {
      "epoch": 2.3870967741935485,
      "grad_norm": 0.0926880163626768,
      "learning_rate": 2.339555568810221e-05,
      "loss": 0.3177,
      "step": 185
    },
    {
      "epoch": 2.4,
      "grad_norm": 0.09094474131194094,
      "learning_rate": 2.242887092955801e-05,
      "loss": 0.3199,
      "step": 186
    },
    {
      "epoch": 2.412903225806452,
      "grad_norm": 0.09106546035353981,
      "learning_rate": 2.1480053137844115e-05,
      "loss": 0.3222,
      "step": 187
    },
    {
      "epoch": 2.425806451612903,
      "grad_norm": 0.08873018715134598,
      "learning_rate": 2.054932085426856e-05,
      "loss": 0.3118,
      "step": 188
    },
    {
      "epoch": 2.4387096774193546,
      "grad_norm": 0.0932765377498955,
      "learning_rate": 1.9636888454502178e-05,
      "loss": 0.3358,
      "step": 189
    },
    {
      "epoch": 2.4516129032258065,
      "grad_norm": 0.09181586534157822,
      "learning_rate": 1.8742966099201697e-05,
      "loss": 0.3157,
      "step": 190
    },
    {
      "epoch": 2.464516129032258,
      "grad_norm": 0.0929486436457203,
      "learning_rate": 1.7867759685603114e-05,
      "loss": 0.3154,
      "step": 191
    },
    {
      "epoch": 2.47741935483871,
      "grad_norm": 0.09188630220285351,
      "learning_rate": 1.7011470800097496e-05,
      "loss": 0.3181,
      "step": 192
    },
    {
      "epoch": 2.490322580645161,
      "grad_norm": 0.09574286894431329,
      "learning_rate": 1.6174296671799572e-05,
      "loss": 0.3222,
      "step": 193
    },
    {
      "epoch": 2.5032258064516126,
      "grad_norm": 0.09145354457132104,
      "learning_rate": 1.5356430127119913e-05,
      "loss": 0.3222,
      "step": 194
    },
    {
      "epoch": 2.5161290322580645,
      "grad_norm": 0.09039580690260736,
      "learning_rate": 1.4558059545351143e-05,
      "loss": 0.324,
      "step": 195
    },
    {
      "epoch": 2.5290322580645164,
      "grad_norm": 0.08979381831653434,
      "learning_rate": 1.3779368815278647e-05,
      "loss": 0.3107,
      "step": 196
    },
    {
      "epoch": 2.541935483870968,
      "grad_norm": 0.09526292697431937,
      "learning_rate": 1.302053729282533e-05,
      "loss": 0.3219,
      "step": 197
    },
    {
      "epoch": 2.554838709677419,
      "grad_norm": 0.09310358146453943,
      "learning_rate": 1.2281739759740574e-05,
      "loss": 0.3214,
      "step": 198
    },
    {
      "epoch": 2.567741935483871,
      "grad_norm": 0.09212645063531479,
      "learning_rate": 1.1563146383342772e-05,
      "loss": 0.3154,
      "step": 199
    },
    {
      "epoch": 2.5806451612903225,
      "grad_norm": 0.09533681862557382,
      "learning_rate": 1.0864922677324618e-05,
      "loss": 0.319,
      "step": 200
    },
    {
      "epoch": 2.5935483870967744,
      "grad_norm": 0.09551418366783314,
      "learning_rate": 1.01872294636304e-05,
      "loss": 0.3333,
      "step": 201
    },
    {
      "epoch": 2.606451612903226,
      "grad_norm": 0.08930212325894361,
      "learning_rate": 9.530222835413738e-06,
      "loss": 0.3048,
      "step": 202
    },
    {
      "epoch": 2.6193548387096772,
      "grad_norm": 0.09220378121771236,
      "learning_rate": 8.894054121084838e-06,
      "loss": 0.3146,
      "step": 203
    },
    {
      "epoch": 2.632258064516129,
      "grad_norm": 0.09150774720724307,
      "learning_rate": 8.278869849454718e-06,
      "loss": 0.3311,
      "step": 204
    },
    {
      "epoch": 2.6451612903225805,
      "grad_norm": 0.09261513270619316,
      "learning_rate": 7.684811715985429e-06,
      "loss": 0.3172,
      "step": 205
    },
    {
      "epoch": 2.6580645161290324,
      "grad_norm": 0.0941004102909483,
      "learning_rate": 7.1120165501533e-06,
      "loss": 0.3347,
      "step": 206
    },
    {
      "epoch": 2.670967741935484,
      "grad_norm": 0.08707518610128166,
      "learning_rate": 6.560616283932897e-06,
      "loss": 0.3116,
      "step": 207
    },
    {
      "epoch": 2.6838709677419352,
      "grad_norm": 0.08648707636296159,
      "learning_rate": 6.030737921409169e-06,
      "loss": 0.3144,
      "step": 208
    },
    {
      "epoch": 2.696774193548387,
      "grad_norm": 0.09169150101119816,
      "learning_rate": 5.52250350952459e-06,
      "loss": 0.3255,
      "step": 209
    },
    {
      "epoch": 2.709677419354839,
      "grad_norm": 0.09060072523264334,
      "learning_rate": 5.036030109968082e-06,
      "loss": 0.3183,
      "step": 210
    },
    {
      "epoch": 2.7225806451612904,
      "grad_norm": 0.09077216490604942,
      "learning_rate": 4.5714297722121106e-06,
      "loss": 0.321,
      "step": 211
    },
    {
      "epoch": 2.735483870967742,
      "grad_norm": 0.09088968433443333,
      "learning_rate": 4.128809507704445e-06,
      "loss": 0.3172,
      "step": 212
    },
    {
      "epoch": 2.7483870967741937,
      "grad_norm": 0.09191902683388614,
      "learning_rate": 3.7082712652200867e-06,
      "loss": 0.3261,
      "step": 213
    },
    {
      "epoch": 2.761290322580645,
      "grad_norm": 0.08843215800144302,
      "learning_rate": 3.3099119073793928e-06,
      "loss": 0.3158,
      "step": 214
    },
    {
      "epoch": 2.774193548387097,
      "grad_norm": 0.09079938334868655,
      "learning_rate": 2.9338231883378366e-06,
      "loss": 0.3178,
      "step": 215
    },
    {
      "epoch": 2.7870967741935484,
      "grad_norm": 0.09122789808454786,
      "learning_rate": 2.580091732652101e-06,
      "loss": 0.3282,
      "step": 216
    },
    {
      "epoch": 2.8,
      "grad_norm": 0.09380292374109117,
      "learning_rate": 2.248799015327907e-06,
      "loss": 0.3359,
      "step": 217
    },
    {
      "epoch": 2.8129032258064517,
      "grad_norm": 0.09035917420929797,
      "learning_rate": 1.9400213430538773e-06,
      "loss": 0.3169,
      "step": 218
    },
    {
      "epoch": 2.825806451612903,
      "grad_norm": 0.09195121657817087,
      "learning_rate": 1.6538298366257976e-06,
      "loss": 0.3314,
      "step": 219
    },
    {
      "epoch": 2.838709677419355,
      "grad_norm": 0.09166102367139951,
      "learning_rate": 1.3902904145653096e-06,
      "loss": 0.3258,
      "step": 220
    },
    {
      "epoch": 2.8516129032258064,
      "grad_norm": 0.0921992572010057,
      "learning_rate": 1.1494637779369766e-06,
      "loss": 0.3298,
      "step": 221
    },
    {
      "epoch": 2.864516129032258,
      "grad_norm": 0.09068261067988724,
      "learning_rate": 9.314053963669245e-07,
      "loss": 0.3214,
      "step": 222
    },
    {
      "epoch": 2.8774193548387097,
      "grad_norm": 0.09417924199778298,
      "learning_rate": 7.361654952665609e-07,
      "loss": 0.3134,
      "step": 223
    },
    {
      "epoch": 2.8903225806451616,
      "grad_norm": 0.0901765977296441,
      "learning_rate": 5.637890442641402e-07,
      "loss": 0.3221,
      "step": 224
    },
    {
      "epoch": 2.903225806451613,
      "grad_norm": 0.09094506589085496,
      "learning_rate": 4.143157468468717e-07,
      "loss": 0.3128,
      "step": 225
    },
    {
      "epoch": 2.9161290322580644,
      "grad_norm": 0.08772549933058231,
      "learning_rate": 2.877800312160783e-07,
      "loss": 0.3248,
      "step": 226
    },
    {
      "epoch": 2.9290322580645163,
      "grad_norm": 0.09191883931659987,
      "learning_rate": 1.8421104235727405e-07,
      "loss": 0.3114,
      "step": 227
    },
    {
      "epoch": 2.9419354838709677,
      "grad_norm": 0.08876137430429,
      "learning_rate": 1.0363263532724432e-07,
      "loss": 0.3127,
      "step": 228
    },
    {
      "epoch": 2.9548387096774196,
      "grad_norm": 0.09157045134043748,
      "learning_rate": 4.606336975948589e-08,
      "loss": 0.3275,
      "step": 229
    },
    {
      "epoch": 2.967741935483871,
      "grad_norm": 0.08940213355520302,
      "learning_rate": 1.1516505589381776e-08,
      "loss": 0.3246,
      "step": 230
    },
    {
      "epoch": 2.9806451612903224,
      "grad_norm": 0.0895898052255747,
      "learning_rate": 0.0,
      "loss": 0.3079,
      "step": 231
    },
    {
      "epoch": 2.9806451612903224,
      "eval_loss": 0.3507891595363617,
      "eval_runtime": 36.0777,
      "eval_samples_per_second": 28.799,
      "eval_steps_per_second": 0.915,
      "step": 231
    },
    {
      "epoch": 2.9806451612903224,
      "step": 231,
      "total_flos": 9.324729662937498e+16,
      "train_loss": 0.3951803825118325,
      "train_runtime": 2997.4381,
      "train_samples_per_second": 9.871,
      "train_steps_per_second": 0.077
    }
  ],
  "logging_steps": 1,
  "max_steps": 231,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 9.324729662937498e+16,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}