{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 10.0,
  "eval_steps": 500,
  "global_step": 2560,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.00390625,
      "grad_norm": 1.813705325126648,
      "learning_rate": 7.8125e-07,
      "loss": 1.9071,
      "step": 1
    },
    {
      "epoch": 0.01953125,
      "grad_norm": 1.431990385055542,
      "learning_rate": 3.90625e-06,
      "loss": 1.8608,
      "step": 5
    },
    {
      "epoch": 0.0390625,
      "grad_norm": 1.281330943107605,
      "learning_rate": 7.8125e-06,
      "loss": 1.8263,
      "step": 10
    },
    {
      "epoch": 0.05859375,
      "grad_norm": 1.310953140258789,
      "learning_rate": 1.171875e-05,
      "loss": 1.8193,
      "step": 15
    },
    {
      "epoch": 0.078125,
      "grad_norm": 1.296993374824524,
      "learning_rate": 1.5625e-05,
      "loss": 1.7463,
      "step": 20
    },
    {
      "epoch": 0.09765625,
      "grad_norm": 1.1856365203857422,
      "learning_rate": 1.953125e-05,
      "loss": 1.6844,
      "step": 25
    },
    {
      "epoch": 0.1171875,
      "grad_norm": 3.376720905303955,
      "learning_rate": 2.34375e-05,
      "loss": 1.5861,
      "step": 30
    },
    {
      "epoch": 0.13671875,
      "grad_norm": 3.182882785797119,
      "learning_rate": 2.734375e-05,
      "loss": 1.4328,
      "step": 35
    },
    {
      "epoch": 0.15625,
      "grad_norm": 0.682467520236969,
      "learning_rate": 3.125e-05,
      "loss": 1.2702,
      "step": 40
    },
    {
      "epoch": 0.17578125,
      "grad_norm": 0.9865962266921997,
      "learning_rate": 3.5156250000000004e-05,
      "loss": 1.1671,
      "step": 45
    },
    {
      "epoch": 0.1953125,
      "grad_norm": 0.42747607827186584,
      "learning_rate": 3.90625e-05,
      "loss": 1.1303,
      "step": 50
    },
    {
      "epoch": 0.21484375,
      "grad_norm": 0.42581626772880554,
      "learning_rate": 4.2968750000000004e-05,
      "loss": 1.101,
      "step": 55
    },
    {
      "epoch": 0.234375,
      "grad_norm": 0.4914548099040985,
      "learning_rate": 4.6875e-05,
      "loss": 1.0586,
      "step": 60
    },
    {
      "epoch": 0.25390625,
      "grad_norm": 0.39272716641426086,
      "learning_rate": 5.0781250000000004e-05,
      "loss": 1.0308,
      "step": 65
    },
    {
      "epoch": 0.2734375,
      "grad_norm": 0.34394437074661255,
      "learning_rate": 5.46875e-05,
      "loss": 0.9998,
      "step": 70
    },
    {
      "epoch": 0.29296875,
      "grad_norm": 0.3009032607078552,
      "learning_rate": 5.8593750000000005e-05,
      "loss": 0.9784,
      "step": 75
    },
    {
      "epoch": 0.3125,
      "grad_norm": 0.27089548110961914,
      "learning_rate": 6.25e-05,
      "loss": 0.9653,
      "step": 80
    },
    {
      "epoch": 0.33203125,
      "grad_norm": 0.25717490911483765,
      "learning_rate": 6.640625e-05,
      "loss": 0.9434,
      "step": 85
    },
    {
      "epoch": 0.3515625,
      "grad_norm": 0.3018302917480469,
      "learning_rate": 7.031250000000001e-05,
      "loss": 0.9372,
      "step": 90
    },
    {
      "epoch": 0.37109375,
      "grad_norm": 0.2254215031862259,
      "learning_rate": 7.421875e-05,
      "loss": 0.9236,
      "step": 95
    },
    {
      "epoch": 0.390625,
      "grad_norm": 0.2384410947561264,
      "learning_rate": 7.8125e-05,
      "loss": 0.9145,
      "step": 100
    },
    {
      "epoch": 0.41015625,
      "grad_norm": 0.2905459403991699,
      "learning_rate": 8.203125e-05,
      "loss": 0.9177,
      "step": 105
    },
    {
      "epoch": 0.4296875,
      "grad_norm": 0.27646884322166443,
      "learning_rate": 8.593750000000001e-05,
      "loss": 0.9103,
      "step": 110
    },
    {
      "epoch": 0.44921875,
      "grad_norm": 0.23843346536159515,
      "learning_rate": 8.984375e-05,
      "loss": 0.8911,
      "step": 115
    },
    {
      "epoch": 0.46875,
      "grad_norm": 0.3110702931880951,
      "learning_rate": 9.375e-05,
      "loss": 0.8961,
      "step": 120
    },
    {
      "epoch": 0.48828125,
      "grad_norm": 0.2591000199317932,
      "learning_rate": 9.765625e-05,
      "loss": 0.8911,
      "step": 125
    },
    {
      "epoch": 0.5078125,
      "grad_norm": 0.2314710170030594,
      "learning_rate": 0.00010156250000000001,
      "loss": 0.8765,
      "step": 130
    },
    {
      "epoch": 0.52734375,
      "grad_norm": 0.268370658159256,
      "learning_rate": 0.00010546875,
      "loss": 0.8759,
      "step": 135
    },
    {
      "epoch": 0.546875,
      "grad_norm": 0.24689124524593353,
      "learning_rate": 0.000109375,
      "loss": 0.8714,
      "step": 140
    },
    {
      "epoch": 0.56640625,
      "grad_norm": 0.28693222999572754,
      "learning_rate": 0.00011328125,
      "loss": 0.882,
      "step": 145
    },
    {
      "epoch": 0.5859375,
      "grad_norm": 0.26165568828582764,
      "learning_rate": 0.00011718750000000001,
      "loss": 0.8638,
      "step": 150
    },
    {
      "epoch": 0.60546875,
      "grad_norm": 0.2968839406967163,
      "learning_rate": 0.00012109375,
      "loss": 0.8562,
      "step": 155
    },
    {
      "epoch": 0.625,
      "grad_norm": 0.2954418957233429,
      "learning_rate": 0.000125,
      "loss": 0.8569,
      "step": 160
    },
    {
      "epoch": 0.64453125,
      "grad_norm": 0.30811259150505066,
      "learning_rate": 0.00012890625,
      "loss": 0.8455,
      "step": 165
    },
    {
      "epoch": 0.6640625,
      "grad_norm": 0.2631295323371887,
      "learning_rate": 0.0001328125,
      "loss": 0.8574,
      "step": 170
    },
    {
      "epoch": 0.68359375,
      "grad_norm": 0.25627005100250244,
      "learning_rate": 0.00013671875,
      "loss": 0.851,
      "step": 175
    },
    {
      "epoch": 0.703125,
      "grad_norm": 0.28598853945732117,
      "learning_rate": 0.00014062500000000002,
      "loss": 0.8385,
      "step": 180
    },
    {
      "epoch": 0.72265625,
      "grad_norm": 0.2502932548522949,
      "learning_rate": 0.00014453125000000002,
      "loss": 0.8457,
      "step": 185
    },
    {
      "epoch": 0.7421875,
      "grad_norm": 0.3177507817745209,
      "learning_rate": 0.0001484375,
      "loss": 0.8319,
      "step": 190
    },
    {
      "epoch": 0.76171875,
      "grad_norm": 0.27309176325798035,
      "learning_rate": 0.00015234375,
      "loss": 0.8511,
      "step": 195
    },
    {
      "epoch": 0.78125,
      "grad_norm": 0.29295653104782104,
      "learning_rate": 0.00015625,
      "loss": 0.8373,
      "step": 200
    },
    {
      "epoch": 0.80078125,
      "grad_norm": 0.27028167247772217,
      "learning_rate": 0.00016015625,
      "loss": 0.8319,
      "step": 205
    },
    {
      "epoch": 0.8203125,
      "grad_norm": 0.40336114168167114,
      "learning_rate": 0.0001640625,
      "loss": 0.8245,
      "step": 210
    },
    {
      "epoch": 0.83984375,
      "grad_norm": 0.3044915795326233,
      "learning_rate": 0.00016796875000000001,
      "loss": 0.8283,
      "step": 215
    },
    {
      "epoch": 0.859375,
      "grad_norm": 0.29535970091819763,
      "learning_rate": 0.00017187500000000002,
      "loss": 0.8119,
      "step": 220
    },
    {
      "epoch": 0.87890625,
      "grad_norm": 0.28554800152778625,
      "learning_rate": 0.00017578125000000002,
      "loss": 0.8091,
      "step": 225
    },
    {
      "epoch": 0.8984375,
      "grad_norm": 0.26689431071281433,
      "learning_rate": 0.0001796875,
      "loss": 0.8189,
      "step": 230
    },
    {
      "epoch": 0.91796875,
      "grad_norm": 0.29758790135383606,
      "learning_rate": 0.00018359375,
      "loss": 0.8122,
      "step": 235
    },
    {
      "epoch": 0.9375,
      "grad_norm": 0.40431731939315796,
      "learning_rate": 0.0001875,
      "loss": 0.8155,
      "step": 240
    },
    {
      "epoch": 0.95703125,
      "grad_norm": 0.27242639660835266,
      "learning_rate": 0.00019140625,
      "loss": 0.8119,
      "step": 245
    },
    {
      "epoch": 0.9765625,
      "grad_norm": 0.3094847500324249,
      "learning_rate": 0.0001953125,
      "loss": 0.8058,
      "step": 250
    },
    {
      "epoch": 0.99609375,
      "grad_norm": 0.32299983501434326,
      "learning_rate": 0.00019921875000000001,
      "loss": 0.8026,
      "step": 255
    },
    {
      "epoch": 1.0,
      "eval_loss": 2.045611619949341,
      "eval_runtime": 0.5394,
      "eval_samples_per_second": 11.124,
      "eval_steps_per_second": 1.854,
      "step": 256
    },
    {
      "epoch": 1.015625,
      "grad_norm": 0.305078387260437,
      "learning_rate": 0.00019999851261394218,
      "loss": 0.7941,
      "step": 260
    },
    {
      "epoch": 1.03515625,
      "grad_norm": 0.2842113673686981,
      "learning_rate": 0.00019999247018391447,
      "loss": 0.798,
      "step": 265
    },
    {
      "epoch": 1.0546875,
      "grad_norm": 0.27524590492248535,
      "learning_rate": 0.0001999817800289289,
      "loss": 0.7911,
      "step": 270
    },
    {
      "epoch": 1.07421875,
      "grad_norm": 0.2549247145652771,
      "learning_rate": 0.00019996644264587193,
      "loss": 0.7963,
      "step": 275
    },
    {
      "epoch": 1.09375,
      "grad_norm": 0.253353089094162,
      "learning_rate": 0.00019994645874763658,
      "loss": 0.7904,
      "step": 280
    },
    {
      "epoch": 1.11328125,
      "grad_norm": 0.23945719003677368,
      "learning_rate": 0.00019992182926308942,
      "loss": 0.7921,
      "step": 285
    },
    {
      "epoch": 1.1328125,
      "grad_norm": 0.29668208956718445,
      "learning_rate": 0.00019989255533702736,
      "loss": 0.7943,
      "step": 290
    },
    {
      "epoch": 1.15234375,
      "grad_norm": 0.26419156789779663,
      "learning_rate": 0.0001998586383301244,
      "loss": 0.7819,
      "step": 295
    },
    {
      "epoch": 1.171875,
      "grad_norm": 0.3054077625274658,
      "learning_rate": 0.00019982007981886847,
      "loss": 0.7917,
      "step": 300
    },
    {
      "epoch": 1.19140625,
      "grad_norm": 0.27965638041496277,
      "learning_rate": 0.00019977688159548808,
      "loss": 0.7854,
      "step": 305
    },
    {
      "epoch": 1.2109375,
      "grad_norm": 0.23229017853736877,
      "learning_rate": 0.00019972904566786903,
      "loss": 0.7865,
      "step": 310
    },
    {
      "epoch": 1.23046875,
      "grad_norm": 0.2789019048213959,
      "learning_rate": 0.00019967657425946106,
      "loss": 0.7821,
      "step": 315
    },
    {
      "epoch": 1.25,
      "grad_norm": 0.24402114748954773,
      "learning_rate": 0.00019961946980917456,
      "loss": 0.7899,
      "step": 320
    },
    {
      "epoch": 1.26953125,
      "grad_norm": 0.2749808132648468,
      "learning_rate": 0.0001995577349712672,
      "loss": 0.7783,
      "step": 325
    },
    {
      "epoch": 1.2890625,
      "grad_norm": 0.2676057815551758,
      "learning_rate": 0.00019949137261522052,
      "loss": 0.7788,
      "step": 330
    },
    {
      "epoch": 1.30859375,
      "grad_norm": 0.24829885363578796,
      "learning_rate": 0.0001994203858256065,
      "loss": 0.7714,
      "step": 335
    },
    {
      "epoch": 1.328125,
      "grad_norm": 0.24872945249080658,
      "learning_rate": 0.00019934477790194445,
      "loss": 0.7832,
      "step": 340
    },
    {
      "epoch": 1.34765625,
      "grad_norm": 0.2914537489414215,
      "learning_rate": 0.00019926455235854724,
      "loss": 0.7791,
      "step": 345
    },
    {
      "epoch": 1.3671875,
      "grad_norm": 0.2692899703979492,
      "learning_rate": 0.00019917971292435826,
      "loss": 0.7739,
      "step": 350
    },
    {
      "epoch": 1.38671875,
      "grad_norm": 0.2605401873588562,
      "learning_rate": 0.000199090263542778,
      "loss": 0.7717,
      "step": 355
    },
    {
      "epoch": 1.40625,
      "grad_norm": 0.24468782544136047,
      "learning_rate": 0.00019899620837148077,
      "loss": 0.7694,
      "step": 360
    },
    {
      "epoch": 1.42578125,
      "grad_norm": 0.2542877197265625,
      "learning_rate": 0.00019889755178222147,
      "loss": 0.7653,
      "step": 365
    },
    {
      "epoch": 1.4453125,
      "grad_norm": 0.21375133097171783,
      "learning_rate": 0.00019879429836063226,
      "loss": 0.7854,
      "step": 370
    },
    {
      "epoch": 1.46484375,
      "grad_norm": 0.24711847305297852,
      "learning_rate": 0.00019868645290600955,
      "loss": 0.773,
      "step": 375
    },
    {
      "epoch": 1.484375,
      "grad_norm": 0.2352401316165924,
      "learning_rate": 0.0001985740204310909,
      "loss": 0.7641,
      "step": 380
    },
    {
      "epoch": 1.50390625,
      "grad_norm": 0.2681073844432831,
      "learning_rate": 0.00019845700616182206,
      "loss": 0.7755,
      "step": 385
    },
    {
      "epoch": 1.5234375,
      "grad_norm": 0.2394329458475113,
      "learning_rate": 0.00019833541553711395,
      "loss": 0.7635,
      "step": 390
    },
    {
      "epoch": 1.54296875,
      "grad_norm": 0.27736565470695496,
      "learning_rate": 0.00019820925420858991,
      "loss": 0.7744,
      "step": 395
    },
    {
      "epoch": 1.5625,
      "grad_norm": 0.2736864984035492,
      "learning_rate": 0.00019807852804032305,
      "loss": 0.7564,
      "step": 400
    },
    {
      "epoch": 1.58203125,
      "grad_norm": 0.22882600128650665,
      "learning_rate": 0.00019794324310856367,
      "loss": 0.7703,
      "step": 405
    },
    {
      "epoch": 1.6015625,
      "grad_norm": 0.2372276782989502,
      "learning_rate": 0.0001978034057014568,
      "loss": 0.7642,
      "step": 410
    },
    {
      "epoch": 1.62109375,
      "grad_norm": 0.23550736904144287,
      "learning_rate": 0.00019765902231874992,
      "loss": 0.7513,
      "step": 415
    },
    {
      "epoch": 1.640625,
      "grad_norm": 0.23483717441558838,
      "learning_rate": 0.00019751009967149087,
      "loss": 0.7485,
      "step": 420
    },
    {
      "epoch": 1.66015625,
      "grad_norm": 0.23124265670776367,
      "learning_rate": 0.00019735664468171587,
      "loss": 0.7712,
      "step": 425
    },
    {
      "epoch": 1.6796875,
      "grad_norm": 0.25672388076782227,
      "learning_rate": 0.00019719866448212795,
      "loss": 0.7635,
      "step": 430
    },
    {
      "epoch": 1.69921875,
      "grad_norm": 0.2655965983867645,
      "learning_rate": 0.00019703616641576514,
      "loss": 0.7614,
      "step": 435
    },
    {
      "epoch": 1.71875,
      "grad_norm": 0.22875700891017914,
      "learning_rate": 0.00019686915803565934,
      "loss": 0.7597,
      "step": 440
    },
    {
      "epoch": 1.73828125,
      "grad_norm": 0.24324467778205872,
      "learning_rate": 0.00019669764710448522,
      "loss": 0.7592,
      "step": 445
    },
    {
      "epoch": 1.7578125,
      "grad_norm": 0.23085905611515045,
      "learning_rate": 0.00019652164159419946,
      "loss": 0.7582,
      "step": 450
    },
    {
      "epoch": 1.77734375,
      "grad_norm": 0.24821893870830536,
      "learning_rate": 0.00019634114968567005,
      "loss": 0.7565,
      "step": 455
    },
    {
      "epoch": 1.796875,
      "grad_norm": 0.24690982699394226,
      "learning_rate": 0.0001961561797682962,
      "loss": 0.75,
      "step": 460
    },
    {
      "epoch": 1.81640625,
      "grad_norm": 0.21277934312820435,
      "learning_rate": 0.00019596674043961828,
      "loss": 0.7499,
      "step": 465
    },
    {
      "epoch": 1.8359375,
      "grad_norm": 0.2045515477657318,
      "learning_rate": 0.0001957728405049183,
      "loss": 0.7476,
      "step": 470
    },
    {
      "epoch": 1.85546875,
      "grad_norm": 0.22809946537017822,
      "learning_rate": 0.00019557448897681057,
      "loss": 0.7554,
      "step": 475
    },
    {
      "epoch": 1.875,
      "grad_norm": 0.2747824788093567,
      "learning_rate": 0.0001953716950748227,
      "loss": 0.7481,
      "step": 480
    },
    {
      "epoch": 1.89453125,
      "grad_norm": 0.23395125567913055,
      "learning_rate": 0.00019516446822496732,
      "loss": 0.7579,
      "step": 485
    },
    {
      "epoch": 1.9140625,
      "grad_norm": 0.2263769805431366,
      "learning_rate": 0.00019495281805930367,
      "loss": 0.7493,
      "step": 490
    },
    {
      "epoch": 1.93359375,
      "grad_norm": 0.23396165668964386,
      "learning_rate": 0.00019473675441549013,
      "loss": 0.7523,
      "step": 495
    },
    {
      "epoch": 1.953125,
      "grad_norm": 0.23420800268650055,
      "learning_rate": 0.0001945162873363268,
      "loss": 0.7469,
      "step": 500
    },
    {
      "epoch": 1.97265625,
      "grad_norm": 0.19923944771289825,
      "learning_rate": 0.00019429142706928868,
      "loss": 0.7535,
      "step": 505
    },
    {
      "epoch": 1.9921875,
      "grad_norm": 0.2181696891784668,
      "learning_rate": 0.00019406218406604965,
      "loss": 0.7532,
      "step": 510
    },
    {
      "epoch": 2.0,
      "eval_loss": 2.031317949295044,
      "eval_runtime": 0.5375,
      "eval_samples_per_second": 11.164,
      "eval_steps_per_second": 1.861,
      "step": 512
    },
    {
      "epoch": 2.01171875,
      "grad_norm": 0.2611521780490875,
      "learning_rate": 0.0001938285689819962,
      "loss": 0.7349,
      "step": 515
    },
    {
      "epoch": 2.03125,
      "grad_norm": 0.22077465057373047,
      "learning_rate": 0.0001935905926757326,
      "loss": 0.7309,
      "step": 520
    },
    {
      "epoch": 2.05078125,
      "grad_norm": 0.2502357065677643,
      "learning_rate": 0.00019334826620857583,
      "loss": 0.7402,
      "step": 525
    },
    {
      "epoch": 2.0703125,
      "grad_norm": 0.21151328086853027,
      "learning_rate": 0.00019310160084404186,
      "loss": 0.7263,
      "step": 530
    },
    {
      "epoch": 2.08984375,
      "grad_norm": 0.22730891406536102,
      "learning_rate": 0.00019285060804732158,
      "loss": 0.7393,
      "step": 535
    },
    {
      "epoch": 2.109375,
      "grad_norm": 0.29608404636383057,
      "learning_rate": 0.00019259529948474833,
      "loss": 0.7359,
      "step": 540
    },
    {
      "epoch": 2.12890625,
      "grad_norm": 0.2048954963684082,
      "learning_rate": 0.00019233568702325547,
      "loss": 0.7327,
      "step": 545
    },
    {
      "epoch": 2.1484375,
      "grad_norm": 0.24332541227340698,
      "learning_rate": 0.0001920717827298248,
      "loss": 0.723,
      "step": 550
    },
    {
      "epoch": 2.16796875,
      "grad_norm": 0.27370956540107727,
      "learning_rate": 0.0001918035988709256,
      "loss": 0.7346,
      "step": 555
    },
    {
      "epoch": 2.1875,
      "grad_norm": 0.27345338463783264,
      "learning_rate": 0.00019153114791194473,
      "loss": 0.7216,
      "step": 560
    },
    {
      "epoch": 2.20703125,
      "grad_norm": 0.21915854513645172,
      "learning_rate": 0.0001912544425166069,
      "loss": 0.7297,
      "step": 565
    },
    {
      "epoch": 2.2265625,
      "grad_norm": 0.23517705500125885,
      "learning_rate": 0.0001909734955463863,
      "loss": 0.7277,
      "step": 570
    },
    {
      "epoch": 2.24609375,
      "grad_norm": 0.2082410454750061,
      "learning_rate": 0.00019068832005990867,
      "loss": 0.7274,
      "step": 575
    },
    {
      "epoch": 2.265625,
      "grad_norm": 0.25212010741233826,
      "learning_rate": 0.00019039892931234435,
      "loss": 0.7388,
      "step": 580
    },
    {
      "epoch": 2.28515625,
      "grad_norm": 0.22077186405658722,
      "learning_rate": 0.0001901053367547922,
      "loss": 0.7356,
      "step": 585
    },
    {
      "epoch": 2.3046875,
      "grad_norm": 0.24918216466903687,
      "learning_rate": 0.0001898075560336543,
      "loss": 0.7283,
      "step": 590
    },
    {
      "epoch": 2.32421875,
      "grad_norm": 0.2168445587158203,
      "learning_rate": 0.00018950560099000182,
      "loss": 0.7276,
      "step": 595
    },
    {
      "epoch": 2.34375,
      "grad_norm": 0.3361542522907257,
      "learning_rate": 0.00018919948565893142,
      "loss": 0.7394,
      "step": 600
    },
    {
      "epoch": 2.36328125,
      "grad_norm": 0.30473312735557556,
      "learning_rate": 0.0001888892242689132,
      "loss": 0.7214,
      "step": 605
    },
    {
      "epoch": 2.3828125,
      "grad_norm": 0.22810065746307373,
      "learning_rate": 0.00018857483124112907,
      "loss": 0.7389,
      "step": 610
    },
    {
      "epoch": 2.40234375,
      "grad_norm": 0.22486305236816406,
      "learning_rate": 0.00018825632118880259,
      "loss": 0.7382,
      "step": 615
    },
    {
      "epoch": 2.421875,
      "grad_norm": 0.23797857761383057,
      "learning_rate": 0.00018793370891651972,
      "loss": 0.7352,
      "step": 620
    },
    {
      "epoch": 2.44140625,
      "grad_norm": 0.22012600302696228,
      "learning_rate": 0.00018760700941954065,
      "loss": 0.7323,
      "step": 625
    },
    {
      "epoch": 2.4609375,
      "grad_norm": 0.2505754232406616,
      "learning_rate": 0.00018727623788310292,
      "loss": 0.7319,
      "step": 630
    },
    {
      "epoch": 2.48046875,
      "grad_norm": 0.23932820558547974,
      "learning_rate": 0.0001869414096817154,
      "loss": 0.7166,
      "step": 635
    },
    {
      "epoch": 2.5,
      "grad_norm": 0.22623002529144287,
      "learning_rate": 0.00018660254037844388,
      "loss": 0.7254,
      "step": 640
    },
    {
      "epoch": 2.51953125,
      "grad_norm": 0.24143099784851074,
      "learning_rate": 0.0001862596457241875,
      "loss": 0.7374,
      "step": 645
    },
    {
      "epoch": 2.5390625,
      "grad_norm": 0.25545206665992737,
      "learning_rate": 0.00018591274165694687,
      "loss": 0.7268,
      "step": 650
    },
    {
      "epoch": 2.55859375,
      "grad_norm": 0.27690452337265015,
      "learning_rate": 0.00018556184430108293,
      "loss": 0.7318,
      "step": 655
    },
    {
      "epoch": 2.578125,
      "grad_norm": 0.21064211428165436,
      "learning_rate": 0.00018520696996656788,
      "loss": 0.7365,
      "step": 660
    },
    {
      "epoch": 2.59765625,
      "grad_norm": 0.2418980747461319,
      "learning_rate": 0.0001848481351482267,
      "loss": 0.7252,
      "step": 665
    },
    {
      "epoch": 2.6171875,
      "grad_norm": 0.21725673973560333,
      "learning_rate": 0.00018448535652497073,
      "loss": 0.7438,
      "step": 670
    },
    {
      "epoch": 2.63671875,
      "grad_norm": 0.2051118165254593,
      "learning_rate": 0.00018411865095902224,
      "loss": 0.7272,
      "step": 675
    },
    {
      "epoch": 2.65625,
      "grad_norm": 0.20715655386447906,
      "learning_rate": 0.0001837480354951308,
      "loss": 0.7189,
      "step": 680
    },
    {
      "epoch": 2.67578125,
      "grad_norm": 0.224945530295372,
      "learning_rate": 0.00018337352735978095,
      "loss": 0.7283,
      "step": 685
    },
    {
      "epoch": 2.6953125,
      "grad_norm": 0.2353772222995758,
      "learning_rate": 0.0001829951439603915,
      "loss": 0.7172,
      "step": 690
    },
    {
      "epoch": 2.71484375,
      "grad_norm": 0.21377775073051453,
      "learning_rate": 0.00018261290288450646,
      "loss": 0.7245,
      "step": 695
    },
    {
      "epoch": 2.734375,
      "grad_norm": 0.20290276408195496,
      "learning_rate": 0.00018222682189897752,
      "loss": 0.732,
      "step": 700
    },
    {
      "epoch": 2.75390625,
      "grad_norm": 0.21785806119441986,
      "learning_rate": 0.00018183691894913825,
      "loss": 0.7142,
      "step": 705
    },
    {
      "epoch": 2.7734375,
      "grad_norm": 0.21216203272342682,
      "learning_rate": 0.00018144321215797,
      "loss": 0.7163,
      "step": 710
    },
    {
      "epoch": 2.79296875,
      "grad_norm": 0.20187579095363617,
      "learning_rate": 0.0001810457198252595,
      "loss": 0.7196,
      "step": 715
    },
    {
      "epoch": 2.8125,
      "grad_norm": 0.21112394332885742,
      "learning_rate": 0.00018064446042674828,
      "loss": 0.7255,
      "step": 720
    },
    {
      "epoch": 2.83203125,
      "grad_norm": 0.21814604103565216,
      "learning_rate": 0.00018023945261327393,
      "loss": 0.7244,
      "step": 725
    },
    {
      "epoch": 2.8515625,
      "grad_norm": 0.2388346940279007,
      "learning_rate": 0.00017983071520990315,
      "loss": 0.719,
      "step": 730
    },
    {
      "epoch": 2.87109375,
      "grad_norm": 0.2274855226278305,
      "learning_rate": 0.00017941826721505684,
      "loss": 0.7092,
      "step": 735
    },
    {
      "epoch": 2.890625,
      "grad_norm": 0.2171526700258255,
      "learning_rate": 0.0001790021277996269,
      "loss": 0.7177,
      "step": 740
    },
    {
      "epoch": 2.91015625,
      "grad_norm": 0.2128465622663498,
      "learning_rate": 0.00017858231630608527,
      "loss": 0.7245,
      "step": 745
    },
    {
      "epoch": 2.9296875,
      "grad_norm": 0.2257278561592102,
      "learning_rate": 0.0001781588522475848,
      "loss": 0.7172,
      "step": 750
    },
    {
      "epoch": 2.94921875,
      "grad_norm": 0.21227267384529114,
      "learning_rate": 0.00017773175530705232,
      "loss": 0.7208,
      "step": 755
    },
    {
      "epoch": 2.96875,
      "grad_norm": 0.23267419636249542,
      "learning_rate": 0.0001773010453362737,
      "loss": 0.7188,
      "step": 760
    },
    {
      "epoch": 2.98828125,
      "grad_norm": 0.21279846131801605,
      "learning_rate": 0.00017686674235497125,
      "loss": 0.7198,
      "step": 765
    },
    {
      "epoch": 3.0,
      "eval_loss": 2.0403969287872314,
      "eval_runtime": 0.5399,
      "eval_samples_per_second": 11.113,
      "eval_steps_per_second": 1.852,
      "step": 768
    },
    {
      "epoch": 3.0078125,
      "grad_norm": 0.20591868460178375,
      "learning_rate": 0.000176428866549873,
      "loss": 0.7092,
      "step": 770
    },
    {
      "epoch": 3.02734375,
      "grad_norm": 0.21006809175014496,
      "learning_rate": 0.0001759874382737746,
      "loss": 0.6982,
      "step": 775
    },
    {
      "epoch": 3.046875,
      "grad_norm": 0.20914091169834137,
      "learning_rate": 0.00017554247804459316,
      "loss": 0.6986,
      "step": 780
    },
    {
      "epoch": 3.06640625,
      "grad_norm": 0.21207676827907562,
      "learning_rate": 0.0001750940065444136,
      "loss": 0.7024,
      "step": 785
    },
    {
      "epoch": 3.0859375,
      "grad_norm": 0.24130572378635406,
      "learning_rate": 0.00017464204461852738,
      "loss": 0.7011,
      "step": 790
    },
    {
      "epoch": 3.10546875,
      "grad_norm": 0.22464986145496368,
      "learning_rate": 0.0001741866132744636,
      "loss": 0.6998,
      "step": 795
    },
    {
      "epoch": 3.125,
      "grad_norm": 0.20956657826900482,
      "learning_rate": 0.0001737277336810124,
      "loss": 0.7068,
      "step": 800
    },
    {
      "epoch": 3.14453125,
      "grad_norm": 0.21382799744606018,
      "learning_rate": 0.00017326542716724128,
      "loss": 0.6997,
      "step": 805
    },
    {
      "epoch": 3.1640625,
      "grad_norm": 0.2018394023180008,
      "learning_rate": 0.00017279971522150348,
      "loss": 0.7057,
      "step": 810
    },
    {
      "epoch": 3.18359375,
      "grad_norm": 0.20716731250286102,
      "learning_rate": 0.00017233061949043928,
      "loss": 0.6957,
      "step": 815
    },
    {
      "epoch": 3.203125,
      "grad_norm": 0.21063964068889618,
      "learning_rate": 0.0001718581617779698,
      "loss": 0.6989,
      "step": 820
    },
    {
      "epoch": 3.22265625,
      "grad_norm": 0.21001911163330078,
      "learning_rate": 0.0001713823640442837,
      "loss": 0.7065,
      "step": 825
    },
    {
      "epoch": 3.2421875,
      "grad_norm": 0.21537743508815765,
      "learning_rate": 0.0001709032484048162,
      "loss": 0.7001,
      "step": 830
    },
    {
      "epoch": 3.26171875,
      "grad_norm": 0.21781504154205322,
      "learning_rate": 0.00017042083712922131,
      "loss": 0.7076,
      "step": 835
    },
    {
      "epoch": 3.28125,
      "grad_norm": 0.21302708983421326,
      "learning_rate": 0.00016993515264033672,
      "loss": 0.6965,
      "step": 840
    },
    {
      "epoch": 3.30078125,
      "grad_norm": 0.2185572385787964,
      "learning_rate": 0.00016944621751314144,
      "loss": 0.7046,
      "step": 845
    },
    {
      "epoch": 3.3203125,
      "grad_norm": 0.21651025116443634,
      "learning_rate": 0.0001689540544737067,
      "loss": 0.7042,
      "step": 850
    },
    {
      "epoch": 3.33984375,
      "grad_norm": 0.22459545731544495,
      "learning_rate": 0.0001684586863981394,
      "loss": 0.7133,
      "step": 855
    },
    {
      "epoch": 3.359375,
      "grad_norm": 0.21320843696594238,
      "learning_rate": 0.00016796013631151897,
      "loss": 0.7106,
      "step": 860
    },
    {
      "epoch": 3.37890625,
      "grad_norm": 0.22854122519493103,
      "learning_rate": 0.00016745842738682712,
      "loss": 0.6987,
      "step": 865
    },
    {
      "epoch": 3.3984375,
      "grad_norm": 0.22366014122962952,
      "learning_rate": 0.00016695358294387065,
      "loss": 0.7078,
      "step": 870
    },
    {
      "epoch": 3.41796875,
      "grad_norm": 0.21049249172210693,
      "learning_rate": 0.00016644562644819771,
      "loss": 0.6926,
      "step": 875
    },
    {
      "epoch": 3.4375,
      "grad_norm": 0.216139018535614,
      "learning_rate": 0.00016593458151000688,
      "loss": 0.7073,
      "step": 880
    },
    {
      "epoch": 3.45703125,
      "grad_norm": 0.22321297228336334,
      "learning_rate": 0.00016542047188304997,
      "loss": 0.7063,
      "step": 885
    },
    {
      "epoch": 3.4765625,
      "grad_norm": 0.21834047138690948,
      "learning_rate": 0.0001649033214635277,
      "loss": 0.7007,
      "step": 890
    },
    {
      "epoch": 3.49609375,
      "grad_norm": 0.2148895114660263,
      "learning_rate": 0.00016438315428897915,
      "loss": 0.709,
      "step": 895
    },
    {
      "epoch": 3.515625,
      "grad_norm": 0.2145809829235077,
      "learning_rate": 0.00016385999453716454,
      "loss": 0.7073,
      "step": 900
    },
    {
      "epoch": 3.53515625,
      "grad_norm": 0.21147432923316956,
      "learning_rate": 0.00016333386652494117,
      "loss": 0.6915,
      "step": 905
    },
    {
      "epoch": 3.5546875,
      "grad_norm": 0.21884699165821075,
      "learning_rate": 0.00016280479470713344,
      "loss": 0.7026,
      "step": 910
    },
    {
      "epoch": 3.57421875,
      "grad_norm": 0.20934432744979858,
      "learning_rate": 0.0001622728036753959,
      "loss": 0.6908,
      "step": 915
    },
    {
      "epoch": 3.59375,
      "grad_norm": 0.20113444328308105,
      "learning_rate": 0.00016173791815707051,
      "loss": 0.7101,
      "step": 920
    },
    {
      "epoch": 3.61328125,
      "grad_norm": 0.2057623565196991,
      "learning_rate": 0.000161200163014037,
      "loss": 0.7179,
      "step": 925
    },
    {
      "epoch": 3.6328125,
      "grad_norm": 0.21178101003170013,
      "learning_rate": 0.00016065956324155746,
      "loss": 0.7015,
      "step": 930
    },
    {
      "epoch": 3.65234375,
      "grad_norm": 0.21164196729660034,
      "learning_rate": 0.0001601161439671145,
      "loss": 0.6955,
      "step": 935
    },
    {
      "epoch": 3.671875,
      "grad_norm": 0.21989427506923676,
      "learning_rate": 0.00015956993044924334,
      "loss": 0.6972,
      "step": 940
    },
    {
      "epoch": 3.69140625,
      "grad_norm": 0.20968452095985413,
      "learning_rate": 0.0001590209480763576,
      "loss": 0.6986,
      "step": 945
    },
    {
      "epoch": 3.7109375,
      "grad_norm": 0.20064401626586914,
      "learning_rate": 0.00015846922236556946,
      "loss": 0.7073,
      "step": 950
    },
    {
      "epoch": 3.73046875,
      "grad_norm": 0.2390391230583191,
      "learning_rate": 0.00015791477896150347,
      "loss": 0.6958,
      "step": 955
    },
    {
      "epoch": 3.75,
      "grad_norm": 0.21184207499027252,
      "learning_rate": 0.0001573576436351046,
      "loss": 0.7008,
      "step": 960
    },
    {
      "epoch": 3.76953125,
      "grad_norm": 0.21932272613048553,
      "learning_rate": 0.00015679784228244043,
      "loss": 0.6904,
      "step": 965
    },
    {
      "epoch": 3.7890625,
      "grad_norm": 0.19908711314201355,
      "learning_rate": 0.00015623540092349732,
      "loss": 0.6991,
      "step": 970
    },
    {
      "epoch": 3.80859375,
      "grad_norm": 0.22039274871349335,
      "learning_rate": 0.00015567034570097125,
      "loss": 0.6959,
      "step": 975
    },
    {
      "epoch": 3.828125,
      "grad_norm": 0.21224038302898407,
      "learning_rate": 0.0001551027028790524,
      "loss": 0.6976,
      "step": 980
    },
    {
      "epoch": 3.84765625,
      "grad_norm": 0.21021129190921783,
      "learning_rate": 0.00015453249884220464,
      "loss": 0.6976,
      "step": 985
    },
    {
      "epoch": 3.8671875,
      "grad_norm": 0.2202974110841751,
      "learning_rate": 0.00015395976009393894,
      "loss": 0.6995,
      "step": 990
    },
    {
      "epoch": 3.88671875,
      "grad_norm": 0.21578259766101837,
      "learning_rate": 0.0001533845132555816,
      "loss": 0.6882,
      "step": 995
    },
    {
      "epoch": 3.90625,
      "grad_norm": 0.1979641318321228,
      "learning_rate": 0.0001528067850650368,
      "loss": 0.6961,
      "step": 1000
    },
    {
      "epoch": 3.92578125,
      "grad_norm": 0.20889665186405182,
      "learning_rate": 0.00015222660237554383,
      "loss": 0.7,
      "step": 1005
    },
    {
      "epoch": 3.9453125,
      "grad_norm": 0.20623871684074402,
      "learning_rate": 0.00015164399215442898,
      "loss": 0.6985,
      "step": 1010
    },
    {
      "epoch": 3.96484375,
      "grad_norm": 0.2109537273645401,
      "learning_rate": 0.00015105898148185193,
      "loss": 0.7026,
      "step": 1015
    },
    {
      "epoch": 3.984375,
      "grad_norm": 0.20740477740764618,
      "learning_rate": 0.0001504715975495472,
      "loss": 0.7053,
      "step": 1020
    },
    {
      "epoch": 4.0,
      "eval_loss": 2.0418636798858643,
      "eval_runtime": 0.5376,
      "eval_samples_per_second": 11.162,
      "eval_steps_per_second": 1.86,
      "step": 1024
    },
    {
      "epoch": 4.00390625,
      "grad_norm": 0.2116871029138565,
      "learning_rate": 0.00014988186765956029,
      "loss": 0.6923,
      "step": 1025
    },
    {
      "epoch": 4.0234375,
      "grad_norm": 0.20054052770137787,
      "learning_rate": 0.00014928981922297842,
      "loss": 0.6717,
      "step": 1030
    },
    {
      "epoch": 4.04296875,
      "grad_norm": 0.2238766998052597,
      "learning_rate": 0.00014869547975865664,
      "loss": 0.6719,
      "step": 1035
    },
    {
      "epoch": 4.0625,
      "grad_norm": 0.2156434804201126,
      "learning_rate": 0.00014809887689193877,
      "loss": 0.6718,
      "step": 1040
    },
    {
      "epoch": 4.08203125,
      "grad_norm": 0.2189694195985794,
      "learning_rate": 0.00014750003835337316,
      "loss": 0.677,
      "step": 1045
    },
    {
      "epoch": 4.1015625,
      "grad_norm": 0.2283412218093872,
      "learning_rate": 0.0001468989919774239,
      "loss": 0.6724,
      "step": 1050
    },
    {
      "epoch": 4.12109375,
      "grad_norm": 0.2534675598144531,
      "learning_rate": 0.00014629576570117709,
      "loss": 0.6842,
      "step": 1055
    },
    {
      "epoch": 4.140625,
      "grad_norm": 0.24277372658252716,
      "learning_rate": 0.00014569038756304207,
      "loss": 0.676,
      "step": 1060
    },
    {
      "epoch": 4.16015625,
      "grad_norm": 0.2335975170135498,
      "learning_rate": 0.0001450828857014485,
      "loss": 0.6861,
      "step": 1065
    },
    {
      "epoch": 4.1796875,
      "grad_norm": 0.22338411211967468,
      "learning_rate": 0.0001444732883535382,
      "loss": 0.6784,
      "step": 1070
    },
    {
      "epoch": 4.19921875,
      "grad_norm": 0.22138862311840057,
      "learning_rate": 0.00014386162385385278,
      "loss": 0.6765,
      "step": 1075
    },
    {
      "epoch": 4.21875,
      "grad_norm": 0.20274129509925842,
      "learning_rate": 0.00014324792063301662,
      "loss": 0.6762,
      "step": 1080
    },
    {
      "epoch": 4.23828125,
      "grad_norm": 0.20809794962406158,
      "learning_rate": 0.00014263220721641543,
      "loss": 0.6954,
      "step": 1085
    },
    {
      "epoch": 4.2578125,
      "grad_norm": 0.21727928519248962,
      "learning_rate": 0.00014201451222287025,
      "loss": 0.682,
      "step": 1090
    },
    {
      "epoch": 4.27734375,
      "grad_norm": 0.21408621966838837,
      "learning_rate": 0.00014139486436330736,
      "loss": 0.6817,
      "step": 1095
    },
    {
      "epoch": 4.296875,
      "grad_norm": 0.2173791378736496,
      "learning_rate": 0.00014077329243942369,
      "loss": 0.6775,
      "step": 1100
    },
    {
      "epoch": 4.31640625,
      "grad_norm": 0.21154190599918365,
      "learning_rate": 0.0001401498253423481,
      "loss": 0.6793,
      "step": 1105
    },
    {
      "epoch": 4.3359375,
      "grad_norm": 0.2106465995311737,
      "learning_rate": 0.00013952449205129855,
      "loss": 0.6736,
      "step": 1110
    },
    {
      "epoch": 4.35546875,
      "grad_norm": 0.20029598474502563,
      "learning_rate": 0.00013889732163223516,
      "loss": 0.6759,
      "step": 1115
    },
    {
      "epoch": 4.375,
      "grad_norm": 0.21185144782066345,
      "learning_rate": 0.000138268343236509,
      "loss": 0.6777,
      "step": 1120
    },
    {
      "epoch": 4.39453125,
      "grad_norm": 0.2037803679704666,
      "learning_rate": 0.0001376375860995073,
      "loss": 0.6818,
      "step": 1125
    },
    {
      "epoch": 4.4140625,
      "grad_norm": 0.21110603213310242,
      "learning_rate": 0.00013700507953929463,
      "loss": 0.675,
      "step": 1130
    },
    {
      "epoch": 4.43359375,
      "grad_norm": 0.2060796022415161,
      "learning_rate": 0.00013637085295524988,
      "loss": 0.679,
      "step": 1135
    },
    {
      "epoch": 4.453125,
      "grad_norm": 0.2184733897447586,
      "learning_rate": 0.00013573493582670003,
      "loss": 0.6859,
      "step": 1140
    },
    {
      "epoch": 4.47265625,
      "grad_norm": 0.21656639873981476,
      "learning_rate": 0.00013509735771154987,
      "loss": 0.685,
      "step": 1145
    },
    {
      "epoch": 4.4921875,
      "grad_norm": 0.219607412815094,
      "learning_rate": 0.00013445814824490805,
      "loss": 0.6814,
      "step": 1150
    },
    {
      "epoch": 4.51171875,
      "grad_norm": 0.2204212099313736,
      "learning_rate": 0.00013381733713770967,
      "loss": 0.6845,
      "step": 1155
    },
    {
      "epoch": 4.53125,
      "grad_norm": 0.2118123322725296,
      "learning_rate": 0.00013317495417533524,
      "loss": 0.6751,
      "step": 1160
    },
    {
      "epoch": 4.55078125,
      "grad_norm": 0.2175564020872116,
      "learning_rate": 0.0001325310292162263,
      "loss": 0.6813,
      "step": 1165
    },
    {
      "epoch": 4.5703125,
      "grad_norm": 0.2186279296875,
      "learning_rate": 0.0001318855921904976,
      "loss": 0.6869,
      "step": 1170
    },
    {
      "epoch": 4.58984375,
      "grad_norm": 0.21257956326007843,
      "learning_rate": 0.0001312386730985459,
      "loss": 0.6834,
      "step": 1175
    },
    {
      "epoch": 4.609375,
      "grad_norm": 0.20661357045173645,
      "learning_rate": 0.00013059030200965536,
      "loss": 0.7001,
      "step": 1180
    },
    {
      "epoch": 4.62890625,
      "grad_norm": 0.22517681121826172,
      "learning_rate": 0.00012994050906060017,
      "loss": 0.6717,
      "step": 1185
    },
    {
      "epoch": 4.6484375,
      "grad_norm": 0.22090637683868408,
      "learning_rate": 0.00012928932445424365,
      "loss": 0.678,
      "step": 1190
    },
    {
      "epoch": 4.66796875,
      "grad_norm": 0.21545428037643433,
      "learning_rate": 0.00012863677845813433,
      "loss": 0.6819,
      "step": 1195
    },
    {
      "epoch": 4.6875,
      "grad_norm": 0.209136962890625,
      "learning_rate": 0.00012798290140309923,
      "loss": 0.6862,
      "step": 1200
    },
    {
      "epoch": 4.70703125,
      "grad_norm": 0.20853549242019653,
      "learning_rate": 0.00012732772368183388,
      "loss": 0.6719,
      "step": 1205
    },
    {
      "epoch": 4.7265625,
      "grad_norm": 0.2124202698469162,
      "learning_rate": 0.00012667127574748986,
      "loss": 0.6819,
      "step": 1210
    },
    {
      "epoch": 4.74609375,
      "grad_norm": 0.2243855744600296,
      "learning_rate": 0.00012601358811225913,
      "loss": 0.6743,
      "step": 1215
    },
    {
      "epoch": 4.765625,
      "grad_norm": 0.21978437900543213,
      "learning_rate": 0.00012535469134595595,
      "loss": 0.6924,
      "step": 1220
    },
    {
      "epoch": 4.78515625,
      "grad_norm": 0.20108923316001892,
      "learning_rate": 0.00012469461607459583,
      "loss": 0.6836,
      "step": 1225
    },
    {
      "epoch": 4.8046875,
      "grad_norm": 0.21921634674072266,
      "learning_rate": 0.0001240333929789721,
      "loss": 0.6764,
      "step": 1230
    },
    {
      "epoch": 4.82421875,
      "grad_norm": 0.21365371346473694,
      "learning_rate": 0.00012337105279322988,
      "loss": 0.6843,
      "step": 1235
    },
    {
      "epoch": 4.84375,
      "grad_norm": 0.20987005531787872,
      "learning_rate": 0.00012270762630343734,
      "loss": 0.6746,
      "step": 1240
    },
    {
      "epoch": 4.86328125,
      "grad_norm": 0.20794980227947235,
      "learning_rate": 0.00012204314434615501,
      "loss": 0.6815,
      "step": 1245
    },
    {
      "epoch": 4.8828125,
      "grad_norm": 0.21553441882133484,
      "learning_rate": 0.00012137763780700227,
      "loss": 0.6795,
      "step": 1250
    },
    {
      "epoch": 4.90234375,
      "grad_norm": 0.2035866528749466,
      "learning_rate": 0.00012071113761922186,
      "loss": 0.6828,
      "step": 1255
    },
    {
      "epoch": 4.921875,
      "grad_norm": 0.2061247080564499,
      "learning_rate": 0.00012004367476224206,
      "loss": 0.6838,
      "step": 1260
    },
    {
      "epoch": 4.94140625,
      "grad_norm": 0.21384355425834656,
      "learning_rate": 0.0001193752802602367,
      "loss": 0.6902,
      "step": 1265
    },
    {
      "epoch": 4.9609375,
      "grad_norm": 0.21918757259845734,
      "learning_rate": 0.0001187059851806832,
      "loss": 0.6853,
      "step": 1270
    },
    {
      "epoch": 4.98046875,
      "grad_norm": 0.20853689312934875,
      "learning_rate": 0.00011803582063291849,
      "loss": 0.6693,
      "step": 1275
    },
    {
      "epoch": 5.0,
      "grad_norm": 0.2089415341615677,
      "learning_rate": 0.00011736481776669306,
      "loss": 0.6831,
      "step": 1280
    },
    {
      "epoch": 5.0,
      "eval_loss": 2.05405592918396,
      "eval_runtime": 0.5395,
      "eval_samples_per_second": 11.122,
      "eval_steps_per_second": 1.854,
      "step": 1280
    },
    {
      "epoch": 5.01953125,
      "grad_norm": 0.21040305495262146,
      "learning_rate": 0.00011669300777072298,
      "loss": 0.6597,
      "step": 1285
    },
    {
      "epoch": 5.0390625,
      "grad_norm": 0.2179408222436905,
      "learning_rate": 0.00011602042187124045,
      "loss": 0.6675,
      "step": 1290
    },
    {
      "epoch": 5.05859375,
      "grad_norm": 0.20846475660800934,
      "learning_rate": 0.0001153470913305421,
      "loss": 0.6643,
      "step": 1295
    },
    {
      "epoch": 5.078125,
      "grad_norm": 0.2074786126613617,
      "learning_rate": 0.00011467304744553618,
      "loss": 0.6656,
      "step": 1300
    },
    {
      "epoch": 5.09765625,
      "grad_norm": 0.2094477117061615,
      "learning_rate": 0.00011399832154628767,
      "loss": 0.6544,
      "step": 1305
    },
    {
      "epoch": 5.1171875,
      "grad_norm": 0.21982310712337494,
      "learning_rate": 0.000113322944994562,
      "loss": 0.6549,
      "step": 1310
    },
    {
      "epoch": 5.13671875,
      "grad_norm": 0.23372633755207062,
      "learning_rate": 0.00011264694918236753,
      "loss": 0.6567,
      "step": 1315
    },
    {
      "epoch": 5.15625,
      "grad_norm": 0.21253670752048492,
      "learning_rate": 0.00011197036553049625,
      "loss": 0.657,
      "step": 1320
    },
    {
      "epoch": 5.17578125,
      "grad_norm": 0.21819843351840973,
      "learning_rate": 0.00011129322548706342,
      "loss": 0.6624,
      "step": 1325
    },
    {
      "epoch": 5.1953125,
      "grad_norm": 0.22048228979110718,
      "learning_rate": 0.00011061556052604578,
      "loss": 0.6617,
      "step": 1330
    },
    {
      "epoch": 5.21484375,
      "grad_norm": 0.21444514393806458,
      "learning_rate": 0.00010993740214581856,
      "loss": 0.6714,
      "step": 1335
    },
    {
      "epoch": 5.234375,
      "grad_norm": 0.20963872969150543,
      "learning_rate": 0.00010925878186769158,
      "loss": 0.6554,
      "step": 1340
    },
    {
      "epoch": 5.25390625,
      "grad_norm": 0.21605953574180603,
      "learning_rate": 0.000108579731234444,
      "loss": 0.6625,
      "step": 1345
    },
    {
      "epoch": 5.2734375,
      "grad_norm": 0.2186332494020462,
      "learning_rate": 0.00010790028180885821,
      "loss": 0.659,
      "step": 1350
    },
    {
      "epoch": 5.29296875,
      "grad_norm": 0.20879332721233368,
      "learning_rate": 0.00010722046517225271,
      "loss": 0.6574,
      "step": 1355
    },
    {
      "epoch": 5.3125,
      "grad_norm": 0.20964272320270538,
      "learning_rate": 0.00010654031292301432,
      "loss": 0.6495,
      "step": 1360
    },
    {
      "epoch": 5.33203125,
      "grad_norm": 0.22066867351531982,
      "learning_rate": 0.00010585985667512934,
      "loss": 0.6657,
      "step": 1365
    },
    {
      "epoch": 5.3515625,
      "grad_norm": 0.21919472515583038,
      "learning_rate": 0.00010517912805671419,
      "loss": 0.6663,
      "step": 1370
    },
    {
      "epoch": 5.37109375,
      "grad_norm": 0.20911991596221924,
      "learning_rate": 0.00010449815870854525,
      "loss": 0.6655,
      "step": 1375
    },
    {
      "epoch": 5.390625,
      "grad_norm": 0.21343956887722015,
      "learning_rate": 0.00010381698028258817,
      "loss": 0.6538,
      "step": 1380
    },
    {
      "epoch": 5.41015625,
      "grad_norm": 0.23448581993579865,
      "learning_rate": 0.00010313562444052677,
      "loss": 0.6745,
      "step": 1385
    },
    {
      "epoch": 5.4296875,
      "grad_norm": 0.2224402278661728,
      "learning_rate": 0.00010245412285229124,
      "loss": 0.6659,
      "step": 1390
    },
    {
      "epoch": 5.44921875,
      "grad_norm": 0.21760495007038116,
      "learning_rate": 0.0001017725071945862,
      "loss": 0.6574,
      "step": 1395
    },
    {
      "epoch": 5.46875,
      "grad_norm": 0.21981921792030334,
      "learning_rate": 0.00010109080914941824,
      "loss": 0.6639,
      "step": 1400
    },
    {
      "epoch": 5.48828125,
      "grad_norm": 0.22708064317703247,
      "learning_rate": 0.00010040906040262348,
      "loss": 0.6601,
      "step": 1405
    },
    {
      "epoch": 5.5078125,
      "grad_norm": 0.21901877224445343,
      "learning_rate": 9.972729264239461e-05,
      "loss": 0.6708,
      "step": 1410
    },
    {
      "epoch": 5.52734375,
      "grad_norm": 0.21920931339263916,
      "learning_rate": 9.904553755780815e-05,
      "loss": 0.6588,
      "step": 1415
    },
    {
      "epoch": 5.546875,
      "grad_norm": 0.2086167186498642,
      "learning_rate": 9.836382683735132e-05,
      "loss": 0.6689,
      "step": 1420
    },
    {
      "epoch": 5.56640625,
      "grad_norm": 0.2135404795408249,
      "learning_rate": 9.768219216744942e-05,
      "loss": 0.6709,
      "step": 1425
    },
    {
      "epoch": 5.5859375,
      "grad_norm": 0.2296486496925354,
      "learning_rate": 9.700066523099273e-05,
      "loss": 0.6768,
      "step": 1430
    },
    {
      "epoch": 5.60546875,
      "grad_norm": 0.22231514751911163,
      "learning_rate": 9.631927770586412e-05,
      "loss": 0.6662,
      "step": 1435
    },
    {
      "epoch": 5.625,
      "grad_norm": 0.21092720329761505,
      "learning_rate": 9.563806126346642e-05,
      "loss": 0.6563,
      "step": 1440
    },
    {
      "epoch": 5.64453125,
      "grad_norm": 0.2081764191389084,
      "learning_rate": 9.495704756725041e-05,
      "loss": 0.6599,
      "step": 1445
    },
    {
      "epoch": 5.6640625,
      "grad_norm": 0.21930693089962006,
      "learning_rate": 9.427626827124317e-05,
      "loss": 0.6645,
      "step": 1450
    },
    {
      "epoch": 5.68359375,
      "grad_norm": 0.22238822281360626,
      "learning_rate": 9.359575501857651e-05,
      "loss": 0.6653,
      "step": 1455
    },
    {
      "epoch": 5.703125,
      "grad_norm": 0.21201257407665253,
      "learning_rate": 9.29155394400166e-05,
      "loss": 0.675,
      "step": 1460
    },
    {
      "epoch": 5.72265625,
      "grad_norm": 0.21970124542713165,
      "learning_rate": 9.223565315249325e-05,
      "loss": 0.6719,
      "step": 1465
    },
    {
      "epoch": 5.7421875,
      "grad_norm": 0.20852448046207428,
      "learning_rate": 9.155612775763069e-05,
      "loss": 0.6701,
      "step": 1470
    },
    {
      "epoch": 5.76171875,
      "grad_norm": 0.2180168330669403,
      "learning_rate": 9.087699484027857e-05,
      "loss": 0.658,
      "step": 1475
    },
    {
      "epoch": 5.78125,
      "grad_norm": 0.211044043302536,
      "learning_rate": 9.019828596704394e-05,
      "loss": 0.6526,
      "step": 1480
    },
    {
      "epoch": 5.80078125,
      "grad_norm": 0.20980176329612732,
      "learning_rate": 8.95200326848239e-05,
      "loss": 0.6548,
      "step": 1485
    },
    {
      "epoch": 5.8203125,
      "grad_norm": 0.20603534579277039,
      "learning_rate": 8.884226651933927e-05,
      "loss": 0.6644,
      "step": 1490
    },
    {
      "epoch": 5.83984375,
      "grad_norm": 0.20811837911605835,
      "learning_rate": 8.816501897366953e-05,
      "loss": 0.6703,
      "step": 1495
    },
    {
      "epoch": 5.859375,
      "grad_norm": 0.2105432003736496,
      "learning_rate": 8.74883215267881e-05,
      "loss": 0.6649,
      "step": 1500
    },
    {
      "epoch": 5.87890625,
      "grad_norm": 0.22339750826358795,
      "learning_rate": 8.681220563209955e-05,
      "loss": 0.6687,
      "step": 1505
    },
    {
      "epoch": 5.8984375,
      "grad_norm": 0.20943927764892578,
      "learning_rate": 8.613670271597733e-05,
      "loss": 0.663,
      "step": 1510
    },
    {
      "epoch": 5.91796875,
      "grad_norm": 0.20441389083862305,
      "learning_rate": 8.546184417630338e-05,
      "loss": 0.6663,
      "step": 1515
    },
    {
      "epoch": 5.9375,
      "grad_norm": 0.21287420392036438,
      "learning_rate": 8.478766138100834e-05,
      "loss": 0.6727,
      "step": 1520
    },
    {
      "epoch": 5.95703125,
      "grad_norm": 0.21163299679756165,
      "learning_rate": 8.411418566661388e-05,
      "loss": 0.6643,
      "step": 1525
    },
    {
      "epoch": 5.9765625,
      "grad_norm": 0.20541082322597504,
      "learning_rate": 8.344144833677594e-05,
      "loss": 0.6605,
      "step": 1530
    },
    {
      "epoch": 5.99609375,
      "grad_norm": 0.21405570209026337,
      "learning_rate": 8.27694806608298e-05,
      "loss": 0.6633,
      "step": 1535
    },
    {
      "epoch": 6.0,
      "eval_loss": 2.0744192600250244,
      "eval_runtime": 0.5398,
      "eval_samples_per_second": 11.115,
      "eval_steps_per_second": 1.853,
      "step": 1536
    },
    {
      "epoch": 6.015625,
      "grad_norm": 0.21526320278644562,
      "learning_rate": 8.209831387233676e-05,
      "loss": 0.6479,
      "step": 1540
    },
    {
      "epoch": 6.03515625,
      "grad_norm": 0.217779740691185,
      "learning_rate": 8.142797916763209e-05,
      "loss": 0.6536,
      "step": 1545
    },
    {
      "epoch": 6.0546875,
      "grad_norm": 0.22583958506584167,
      "learning_rate": 8.075850770437534e-05,
      "loss": 0.6532,
      "step": 1550
    },
    {
      "epoch": 6.07421875,
      "grad_norm": 0.24157458543777466,
      "learning_rate": 8.008993060010183e-05,
      "loss": 0.6426,
      "step": 1555
    },
    {
      "epoch": 6.09375,
      "grad_norm": 0.2280224710702896,
      "learning_rate": 7.942227893077652e-05,
      "loss": 0.6482,
      "step": 1560
    },
    {
      "epoch": 6.11328125,
      "grad_norm": 0.21372312307357788,
      "learning_rate": 7.875558372934936e-05,
      "loss": 0.6448,
      "step": 1565
    },
    {
      "epoch": 6.1328125,
      "grad_norm": 0.22514766454696655,
      "learning_rate": 7.808987598431303e-05,
      "loss": 0.6506,
      "step": 1570
    },
    {
      "epoch": 6.15234375,
      "grad_norm": 0.22178982198238373,
      "learning_rate": 7.742518663826246e-05,
      "loss": 0.6404,
      "step": 1575
    },
    {
      "epoch": 6.171875,
      "grad_norm": 0.21459142863750458,
      "learning_rate": 7.676154658645656e-05,
      "loss": 0.6557,
      "step": 1580
    },
    {
      "epoch": 6.19140625,
      "grad_norm": 0.22397801280021667,
      "learning_rate": 7.609898667538243e-05,
      "loss": 0.6445,
      "step": 1585
    },
    {
      "epoch": 6.2109375,
      "grad_norm": 0.22123484313488007,
      "learning_rate": 7.543753770132127e-05,
      "loss": 0.6375,
      "step": 1590
    },
    {
      "epoch": 6.23046875,
      "grad_norm": 0.2259218543767929,
      "learning_rate": 7.477723040891717e-05,
      "loss": 0.6486,
      "step": 1595
    },
    {
      "epoch": 6.25,
      "grad_norm": 0.21872185170650482,
      "learning_rate": 7.411809548974792e-05,
      "loss": 0.6546,
      "step": 1600
    },
    {
      "epoch": 6.26953125,
      "grad_norm": 0.2340991348028183,
      "learning_rate": 7.346016358089867e-05,
      "loss": 0.6573,
      "step": 1605
    },
    {
      "epoch": 6.2890625,
      "grad_norm": 0.2258559614419937,
      "learning_rate": 7.280346526353759e-05,
      "loss": 0.6485,
      "step": 1610
    },
    {
      "epoch": 6.30859375,
      "grad_norm": 0.21842586994171143,
      "learning_rate": 7.21480310614947e-05,
      "loss": 0.6452,
      "step": 1615
    },
    {
      "epoch": 6.328125,
      "grad_norm": 0.22392797470092773,
      "learning_rate": 7.149389143984295e-05,
      "loss": 0.6467,
      "step": 1620
    },
    {
      "epoch": 6.34765625,
      "grad_norm": 0.21205224096775055,
      "learning_rate": 7.084107680348218e-05,
      "loss": 0.6502,
      "step": 1625
    },
    {
      "epoch": 6.3671875,
      "grad_norm": 0.22041639685630798,
      "learning_rate": 7.018961749572604e-05,
      "loss": 0.6502,
      "step": 1630
    },
    {
      "epoch": 6.38671875,
      "grad_norm": 0.21791093051433563,
      "learning_rate": 6.953954379689136e-05,
      "loss": 0.6553,
      "step": 1635
    },
    {
      "epoch": 6.40625,
      "grad_norm": 0.22223076224327087,
      "learning_rate": 6.889088592289093e-05,
      "loss": 0.639,
      "step": 1640
    },
    {
      "epoch": 6.42578125,
      "grad_norm": 0.2151210606098175,
      "learning_rate": 6.824367402382885e-05,
      "loss": 0.655,
      "step": 1645
    },
    {
      "epoch": 6.4453125,
      "grad_norm": 0.2196204513311386,
      "learning_rate": 6.759793818259933e-05,
      "loss": 0.6549,
      "step": 1650
    },
    {
      "epoch": 6.46484375,
      "grad_norm": 0.21881859004497528,
      "learning_rate": 6.69537084134882e-05,
      "loss": 0.6516,
      "step": 1655
    },
    {
      "epoch": 6.484375,
      "grad_norm": 0.21970680356025696,
      "learning_rate": 6.6311014660778e-05,
      "loss": 0.6531,
      "step": 1660
    },
    {
      "epoch": 6.50390625,
      "grad_norm": 0.21640105545520782,
      "learning_rate": 6.566988679735606e-05,
      "loss": 0.6474,
      "step": 1665
    },
    {
      "epoch": 6.5234375,
      "grad_norm": 0.225670725107193,
      "learning_rate": 6.503035462332592e-05,
      "loss": 0.6437,
      "step": 1670
    },
    {
      "epoch": 6.54296875,
      "grad_norm": 0.20938833057880402,
      "learning_rate": 6.439244786462245e-05,
      "loss": 0.6526,
      "step": 1675
    },
    {
      "epoch": 6.5625,
      "grad_norm": 0.21592438220977783,
      "learning_rate": 6.375619617162985e-05,
      "loss": 0.6528,
      "step": 1680
    },
    {
      "epoch": 6.58203125,
      "grad_norm": 0.22665540874004364,
      "learning_rate": 6.312162911780368e-05,
      "loss": 0.6502,
      "step": 1685
    },
    {
      "epoch": 6.6015625,
      "grad_norm": 0.2195620834827423,
      "learning_rate": 6.248877619829619e-05,
      "loss": 0.6469,
      "step": 1690
    },
    {
      "epoch": 6.62109375,
      "grad_norm": 0.22165308892726898,
      "learning_rate": 6.185766682858546e-05,
      "loss": 0.6518,
      "step": 1695
    },
    {
      "epoch": 6.640625,
      "grad_norm": 0.22840096056461334,
      "learning_rate": 6.122833034310793e-05,
      "loss": 0.6506,
      "step": 1700
    },
    {
      "epoch": 6.66015625,
      "grad_norm": 0.22422266006469727,
      "learning_rate": 6.060079599389521e-05,
      "loss": 0.6559,
      "step": 1705
    },
    {
      "epoch": 6.6796875,
      "grad_norm": 0.22363343834877014,
      "learning_rate": 5.9975092949214116e-05,
      "loss": 0.6449,
      "step": 1710
    },
    {
      "epoch": 6.69921875,
      "grad_norm": 0.2213827222585678,
      "learning_rate": 5.935125029221111e-05,
      "loss": 0.65,
      "step": 1715
    },
    {
      "epoch": 6.71875,
      "grad_norm": 0.2290297895669937,
      "learning_rate": 5.872929701956054e-05,
      "loss": 0.6476,
      "step": 1720
    },
    {
      "epoch": 6.73828125,
      "grad_norm": 0.23118211328983307,
      "learning_rate": 5.810926204011658e-05,
      "loss": 0.6511,
      "step": 1725
    },
    {
      "epoch": 6.7578125,
      "grad_norm": 0.22112269699573517,
      "learning_rate": 5.749117417356988e-05,
      "loss": 0.6481,
      "step": 1730
    },
    {
      "epoch": 6.77734375,
      "grad_norm": 0.21454501152038574,
      "learning_rate": 5.687506214910765e-05,
      "loss": 0.6492,
      "step": 1735
    },
    {
      "epoch": 6.796875,
      "grad_norm": 0.22518618404865265,
      "learning_rate": 5.6260954604078585e-05,
      "loss": 0.6515,
      "step": 1740
    },
    {
      "epoch": 6.81640625,
      "grad_norm": 0.23013541102409363,
      "learning_rate": 5.564888008266165e-05,
      "loss": 0.6563,
      "step": 1745
    },
    {
      "epoch": 6.8359375,
      "grad_norm": 0.21959349513053894,
      "learning_rate": 5.503886703453933e-05,
      "loss": 0.6504,
      "step": 1750
    },
    {
      "epoch": 6.85546875,
      "grad_norm": 0.23238404095172882,
      "learning_rate": 5.4430943813575375e-05,
      "loss": 0.6575,
      "step": 1755
    },
    {
      "epoch": 6.875,
      "grad_norm": 0.21891681849956512,
      "learning_rate": 5.382513867649663e-05,
      "loss": 0.6415,
      "step": 1760
    },
    {
      "epoch": 6.89453125,
      "grad_norm": 0.2155328243970871,
      "learning_rate": 5.3221479781579955e-05,
      "loss": 0.6498,
      "step": 1765
    },
    {
      "epoch": 6.9140625,
      "grad_norm": 0.21803325414657593,
      "learning_rate": 5.261999518734322e-05,
      "loss": 0.6439,
      "step": 1770
    },
    {
      "epoch": 6.93359375,
      "grad_norm": 0.21531429886817932,
      "learning_rate": 5.202071285124119e-05,
      "loss": 0.6486,
      "step": 1775
    },
    {
      "epoch": 6.953125,
      "grad_norm": 0.22126588225364685,
      "learning_rate": 5.142366062836599e-05,
      "loss": 0.6453,
      "step": 1780
    },
    {
      "epoch": 6.97265625,
      "grad_norm": 0.21690168976783752,
      "learning_rate": 5.082886627015246e-05,
      "loss": 0.6564,
      "step": 1785
    },
    {
      "epoch": 6.9921875,
      "grad_norm": 0.22704558074474335,
      "learning_rate": 5.023635742308807e-05,
      "loss": 0.6595,
      "step": 1790
    },
    {
      "epoch": 7.0,
      "eval_loss": 2.0813868045806885,
      "eval_runtime": 0.5387,
      "eval_samples_per_second": 11.138,
      "eval_steps_per_second": 1.856,
      "step": 1792
    },
    {
      "epoch": 7.01171875,
      "grad_norm": 0.21671408414840698,
      "learning_rate": 4.964616162742826e-05,
      "loss": 0.6478,
      "step": 1795
    },
    {
      "epoch": 7.03125,
      "grad_norm": 0.2322429120540619,
      "learning_rate": 4.9058306315915826e-05,
      "loss": 0.6355,
      "step": 1800
    },
    {
      "epoch": 7.05078125,
      "grad_norm": 0.22516188025474548,
      "learning_rate": 4.84728188125063e-05,
      "loss": 0.6343,
      "step": 1805
    },
    {
      "epoch": 7.0703125,
      "grad_norm": 0.22370575368404388,
      "learning_rate": 4.7889726331097686e-05,
      "loss": 0.6388,
      "step": 1810
    },
    {
      "epoch": 7.08984375,
      "grad_norm": 0.22702112793922424,
      "learning_rate": 4.7309055974265435e-05,
      "loss": 0.6405,
      "step": 1815
    },
    {
      "epoch": 7.109375,
      "grad_norm": 0.2213263362646103,
      "learning_rate": 4.6730834732003104e-05,
      "loss": 0.6369,
      "step": 1820
    },
    {
      "epoch": 7.12890625,
      "grad_norm": 0.2283063679933548,
      "learning_rate": 4.615508948046726e-05,
      "loss": 0.6406,
      "step": 1825
    },
    {
      "epoch": 7.1484375,
      "grad_norm": 0.22583836317062378,
      "learning_rate": 4.5581846980728794e-05,
      "loss": 0.6396,
      "step": 1830
    },
    {
      "epoch": 7.16796875,
      "grad_norm": 0.223560631275177,
      "learning_rate": 4.50111338775287e-05,
      "loss": 0.6487,
      "step": 1835
    },
    {
      "epoch": 7.1875,
      "grad_norm": 0.2752554714679718,
      "learning_rate": 4.444297669803981e-05,
      "loss": 0.6399,
      "step": 1840
    },
    {
      "epoch": 7.20703125,
      "grad_norm": 0.22124579548835754,
      "learning_rate": 4.387740185063358e-05,
      "loss": 0.6413,
      "step": 1845
    },
    {
      "epoch": 7.2265625,
      "grad_norm": 0.22053855657577515,
      "learning_rate": 4.331443562365285e-05,
      "loss": 0.6377,
      "step": 1850
    },
    {
      "epoch": 7.24609375,
      "grad_norm": 0.22650252282619476,
      "learning_rate": 4.275410418418979e-05,
      "loss": 0.6441,
      "step": 1855
    },
    {
      "epoch": 7.265625,
      "grad_norm": 0.2277732640504837,
      "learning_rate": 4.219643357686967e-05,
      "loss": 0.6472,
      "step": 1860
    },
    {
      "epoch": 7.28515625,
      "grad_norm": 0.21958424150943756,
      "learning_rate": 4.1641449722640336e-05,
      "loss": 0.6434,
      "step": 1865
    },
    {
      "epoch": 7.3046875,
      "grad_norm": 0.22781191766262054,
      "learning_rate": 4.1089178417567164e-05,
      "loss": 0.6436,
      "step": 1870
    },
    {
      "epoch": 7.32421875,
      "grad_norm": 0.22724145650863647,
      "learning_rate": 4.0539645331634504e-05,
      "loss": 0.6365,
      "step": 1875
    },
    {
      "epoch": 7.34375,
      "grad_norm": 0.22402629256248474,
      "learning_rate": 3.999287600755192e-05,
      "loss": 0.6404,
      "step": 1880
    },
    {
      "epoch": 7.36328125,
      "grad_norm": 0.22256724536418915,
      "learning_rate": 3.944889585956746e-05,
      "loss": 0.6385,
      "step": 1885
    },
    {
      "epoch": 7.3828125,
      "grad_norm": 0.2245977371931076,
      "learning_rate": 3.8907730172286124e-05,
      "loss": 0.6402,
      "step": 1890
    },
    {
      "epoch": 7.40234375,
      "grad_norm": 0.2223842293024063,
      "learning_rate": 3.8369404099494574e-05,
      "loss": 0.6401,
      "step": 1895
    },
    {
      "epoch": 7.421875,
      "grad_norm": 0.228043794631958,
      "learning_rate": 3.783394266299228e-05,
      "loss": 0.6456,
      "step": 1900
    },
    {
      "epoch": 7.44140625,
      "grad_norm": 0.22321034967899323,
      "learning_rate": 3.730137075142802e-05,
      "loss": 0.6461,
      "step": 1905
    },
    {
      "epoch": 7.4609375,
      "grad_norm": 0.2202451378107071,
      "learning_rate": 3.677171311914346e-05,
      "loss": 0.6404,
      "step": 1910
    },
    {
      "epoch": 7.48046875,
      "grad_norm": 0.23069259524345398,
      "learning_rate": 3.624499438502229e-05,
      "loss": 0.6399,
      "step": 1915
    },
    {
      "epoch": 7.5,
      "grad_norm": 0.22767633199691772,
      "learning_rate": 3.5721239031346066e-05,
      "loss": 0.6365,
      "step": 1920
    },
    {
      "epoch": 7.51953125,
      "grad_norm": 0.223536416888237,
      "learning_rate": 3.520047140265618e-05,
      "loss": 0.6398,
      "step": 1925
    },
    {
      "epoch": 7.5390625,
      "grad_norm": 0.2236379086971283,
      "learning_rate": 3.468271570462235e-05,
      "loss": 0.6374,
      "step": 1930
    },
    {
      "epoch": 7.55859375,
      "grad_norm": 0.22322149574756622,
      "learning_rate": 3.41679960029174e-05,
      "loss": 0.6411,
      "step": 1935
    },
    {
      "epoch": 7.578125,
      "grad_norm": 0.22714544832706451,
      "learning_rate": 3.365633622209891e-05,
      "loss": 0.6281,
      "step": 1940
    },
    {
      "epoch": 7.59765625,
      "grad_norm": 0.23407664895057678,
      "learning_rate": 3.314776014449694e-05,
      "loss": 0.6342,
      "step": 1945
    },
    {
      "epoch": 7.6171875,
      "grad_norm": 0.2269096076488495,
      "learning_rate": 3.2642291409108775e-05,
      "loss": 0.6462,
      "step": 1950
    },
    {
      "epoch": 7.63671875,
      "grad_norm": 0.21775776147842407,
      "learning_rate": 3.213995351050011e-05,
      "loss": 0.6442,
      "step": 1955
    },
    {
      "epoch": 7.65625,
      "grad_norm": 0.21870321035385132,
      "learning_rate": 3.164076979771287e-05,
      "loss": 0.6391,
      "step": 1960
    },
    {
      "epoch": 7.67578125,
      "grad_norm": 0.24278177320957184,
      "learning_rate": 3.1144763473180285e-05,
      "loss": 0.6351,
      "step": 1965
    },
    {
      "epoch": 7.6953125,
      "grad_norm": 0.222146674990654,
      "learning_rate": 3.065195759164797e-05,
      "loss": 0.6442,
      "step": 1970
    },
    {
      "epoch": 7.71484375,
      "grad_norm": 0.23037941753864288,
      "learning_rate": 3.016237505910272e-05,
      "loss": 0.6391,
      "step": 1975
    },
    {
      "epoch": 7.734375,
      "grad_norm": 0.22653505206108093,
      "learning_rate": 2.9676038631707593e-05,
      "loss": 0.6364,
      "step": 1980
    },
    {
      "epoch": 7.75390625,
      "grad_norm": 0.22071927785873413,
      "learning_rate": 2.9192970914744132e-05,
      "loss": 0.6436,
      "step": 1985
    },
    {
      "epoch": 7.7734375,
      "grad_norm": 0.2352590709924698,
      "learning_rate": 2.8713194361562036e-05,
      "loss": 0.6389,
      "step": 1990
    },
    {
      "epoch": 7.79296875,
      "grad_norm": 0.23165152966976166,
      "learning_rate": 2.8236731272534967e-05,
      "loss": 0.6359,
      "step": 1995
    },
    {
      "epoch": 7.8125,
      "grad_norm": 0.22592546045780182,
      "learning_rate": 2.776360379402445e-05,
      "loss": 0.6452,
      "step": 2000
    },
    {
      "epoch": 7.83203125,
      "grad_norm": 0.22005808353424072,
      "learning_rate": 2.72938339173503e-05,
      "loss": 0.6362,
      "step": 2005
    },
    {
      "epoch": 7.8515625,
      "grad_norm": 0.22496894001960754,
      "learning_rate": 2.6827443477768454e-05,
      "loss": 0.6363,
      "step": 2010
    },
    {
      "epoch": 7.87109375,
      "grad_norm": 0.23299238085746765,
      "learning_rate": 2.6364454153456108e-05,
      "loss": 0.6376,
      "step": 2015
    },
    {
      "epoch": 7.890625,
      "grad_norm": 0.21800798177719116,
      "learning_rate": 2.5904887464504114e-05,
      "loss": 0.6316,
      "step": 2020
    },
    {
      "epoch": 7.91015625,
      "grad_norm": 0.22942836582660675,
      "learning_rate": 2.544876477191652e-05,
      "loss": 0.6408,
      "step": 2025
    },
    {
      "epoch": 7.9296875,
      "grad_norm": 0.22502020001411438,
      "learning_rate": 2.4996107276618008e-05,
      "loss": 0.6281,
      "step": 2030
    },
    {
      "epoch": 7.94921875,
      "grad_norm": 0.22493688762187958,
      "learning_rate": 2.454693601846819e-05,
      "loss": 0.6374,
      "step": 2035
    },
    {
      "epoch": 7.96875,
      "grad_norm": 0.22121860086917877,
      "learning_rate": 2.4101271875283817e-05,
      "loss": 0.6301,
      "step": 2040
    },
    {
      "epoch": 7.98828125,
      "grad_norm": 0.22293226420879364,
      "learning_rate": 2.3659135561868305e-05,
      "loss": 0.6374,
      "step": 2045
    },
    {
      "epoch": 8.0,
      "eval_loss": 2.093949556350708,
      "eval_runtime": 0.5398,
      "eval_samples_per_second": 11.115,
      "eval_steps_per_second": 1.852,
      "step": 2048
    },
    {
      "epoch": 8.0078125,
      "grad_norm": 0.22147591412067413,
      "learning_rate": 2.3220547629048796e-05,
      "loss": 0.6318,
      "step": 2050
    },
    {
      "epoch": 8.02734375,
      "grad_norm": 0.22781990468502045,
      "learning_rate": 2.2785528462721238e-05,
      "loss": 0.6301,
      "step": 2055
    },
    {
      "epoch": 8.046875,
      "grad_norm": 0.22302427887916565,
      "learning_rate": 2.2354098282902446e-05,
      "loss": 0.6194,
      "step": 2060
    },
    {
      "epoch": 8.06640625,
      "grad_norm": 0.2345212697982788,
      "learning_rate": 2.1926277142790552e-05,
      "loss": 0.6284,
      "step": 2065
    },
    {
      "epoch": 8.0859375,
      "grad_norm": 0.22880584001541138,
      "learning_rate": 2.1502084927832845e-05,
      "loss": 0.6394,
      "step": 2070
    },
    {
      "epoch": 8.10546875,
      "grad_norm": 0.23197947442531586,
      "learning_rate": 2.1081541354801292e-05,
      "loss": 0.6414,
      "step": 2075
    },
    {
      "epoch": 8.125,
      "grad_norm": 0.2195805162191391,
      "learning_rate": 2.0664665970876496e-05,
      "loss": 0.6274,
      "step": 2080
    },
    {
      "epoch": 8.14453125,
      "grad_norm": 0.2231413722038269,
      "learning_rate": 2.025147815273867e-05,
      "loss": 0.6325,
      "step": 2085
    },
    {
      "epoch": 8.1640625,
      "grad_norm": 0.22956664860248566,
      "learning_rate": 1.9841997105667275e-05,
      "loss": 0.6345,
      "step": 2090
    },
    {
      "epoch": 8.18359375,
      "grad_norm": 0.22590646147727966,
      "learning_rate": 1.943624186264832e-05,
      "loss": 0.6276,
      "step": 2095
    },
    {
      "epoch": 8.203125,
      "grad_norm": 0.2267957627773285,
      "learning_rate": 1.903423128348959e-05,
      "loss": 0.6243,
      "step": 2100
    },
    {
      "epoch": 8.22265625,
      "grad_norm": 0.22633960843086243,
      "learning_rate": 1.8635984053944122e-05,
      "loss": 0.6279,
      "step": 2105
    },
    {
      "epoch": 8.2421875,
      "grad_norm": 0.22983397543430328,
      "learning_rate": 1.824151868484164e-05,
      "loss": 0.6347,
      "step": 2110
    },
    {
      "epoch": 8.26171875,
      "grad_norm": 0.21901904046535492,
      "learning_rate": 1.7850853511228115e-05,
      "loss": 0.6364,
      "step": 2115
    },
    {
      "epoch": 8.28125,
      "grad_norm": 0.2256007343530655,
      "learning_rate": 1.7464006691513623e-05,
      "loss": 0.628,
      "step": 2120
    },
    {
      "epoch": 8.30078125,
      "grad_norm": 0.2304702252149582,
      "learning_rate": 1.7080996206628307e-05,
      "loss": 0.6202,
      "step": 2125
    },
    {
      "epoch": 8.3203125,
      "grad_norm": 0.22724899649620056,
      "learning_rate": 1.6701839859186542e-05,
      "loss": 0.6401,
      "step": 2130
    },
    {
      "epoch": 8.33984375,
      "grad_norm": 0.22017619013786316,
      "learning_rate": 1.632655527265958e-05,
      "loss": 0.6348,
      "step": 2135
    },
    {
      "epoch": 8.359375,
      "grad_norm": 0.221891850233078,
      "learning_rate": 1.595515989055618e-05,
      "loss": 0.6306,
      "step": 2140
    },
    {
      "epoch": 8.37890625,
      "grad_norm": 0.2255999892950058,
      "learning_rate": 1.558767097561219e-05,
      "loss": 0.6436,
      "step": 2145
    },
    {
      "epoch": 8.3984375,
      "grad_norm": 0.2337878942489624,
      "learning_rate": 1.5224105608987704e-05,
      "loss": 0.6256,
      "step": 2150
    },
    {
      "epoch": 8.41796875,
      "grad_norm": 0.2235851138830185,
      "learning_rate": 1.486448068947348e-05,
      "loss": 0.6328,
      "step": 2155
    },
    {
      "epoch": 8.4375,
      "grad_norm": 0.2308977097272873,
      "learning_rate": 1.4508812932705363e-05,
      "loss": 0.6353,
      "step": 2160
    },
    {
      "epoch": 8.45703125,
      "grad_norm": 0.22785401344299316,
      "learning_rate": 1.4157118870387155e-05,
      "loss": 0.6375,
      "step": 2165
    },
    {
      "epoch": 8.4765625,
      "grad_norm": 0.24056580662727356,
      "learning_rate": 1.3809414849522584e-05,
      "loss": 0.6343,
      "step": 2170
    },
    {
      "epoch": 8.49609375,
      "grad_norm": 0.22777673602104187,
      "learning_rate": 1.3465717031655056e-05,
      "loss": 0.6336,
      "step": 2175
    },
    {
      "epoch": 8.515625,
      "grad_norm": 0.23098915815353394,
      "learning_rate": 1.3126041392116772e-05,
      "loss": 0.6296,
      "step": 2180
    },
    {
      "epoch": 8.53515625,
      "grad_norm": 0.2298251986503601,
      "learning_rate": 1.2790403719286049e-05,
      "loss": 0.6305,
      "step": 2185
    },
    {
      "epoch": 8.5546875,
      "grad_norm": 0.22145819664001465,
      "learning_rate": 1.2458819613853468e-05,
      "loss": 0.6262,
      "step": 2190
    },
    {
      "epoch": 8.57421875,
      "grad_norm": 0.2244306206703186,
      "learning_rate": 1.2131304488096772e-05,
      "loss": 0.6225,
      "step": 2195
    },
    {
      "epoch": 8.59375,
      "grad_norm": 0.22416800260543823,
      "learning_rate": 1.1807873565164506e-05,
      "loss": 0.6309,
      "step": 2200
    },
    {
      "epoch": 8.61328125,
      "grad_norm": 0.22584258019924164,
      "learning_rate": 1.148854187836833e-05,
      "loss": 0.6318,
      "step": 2205
    },
    {
      "epoch": 8.6328125,
      "grad_norm": 0.2320922613143921,
      "learning_rate": 1.1173324270484397e-05,
      "loss": 0.6352,
      "step": 2210
    },
    {
      "epoch": 8.65234375,
      "grad_norm": 0.2240631878376007,
      "learning_rate": 1.0862235393063413e-05,
      "loss": 0.6279,
      "step": 2215
    },
    {
      "epoch": 8.671875,
      "grad_norm": 0.2261231392621994,
      "learning_rate": 1.0555289705749483e-05,
      "loss": 0.6299,
      "step": 2220
    },
    {
      "epoch": 8.69140625,
      "grad_norm": 0.22478684782981873,
      "learning_rate": 1.025250147560829e-05,
      "loss": 0.639,
      "step": 2225
    },
    {
      "epoch": 8.7109375,
      "grad_norm": 0.22566542029380798,
      "learning_rate": 9.953884776463652e-06,
      "loss": 0.63,
      "step": 2230
    },
    {
      "epoch": 8.73046875,
      "grad_norm": 0.23023688793182373,
      "learning_rate": 9.659453488243575e-06,
      "loss": 0.6439,
      "step": 2235
    },
    {
      "epoch": 8.75,
      "grad_norm": 0.22487542033195496,
      "learning_rate": 9.369221296335006e-06,
      "loss": 0.6421,
      "step": 2240
    },
    {
      "epoch": 8.76953125,
      "grad_norm": 0.22670140862464905,
      "learning_rate": 9.083201690947763e-06,
      "loss": 0.6331,
      "step": 2245
    },
    {
      "epoch": 8.7890625,
      "grad_norm": 0.2248082160949707,
      "learning_rate": 8.801407966487486e-06,
      "loss": 0.6216,
      "step": 2250
    },
    {
      "epoch": 8.80859375,
      "grad_norm": 0.23012250661849976,
      "learning_rate": 8.52385322093765e-06,
      "loss": 0.6452,
      "step": 2255
    },
    {
      "epoch": 8.828125,
      "grad_norm": 0.22810766100883484,
      "learning_rate": 8.250550355250875e-06,
      "loss": 0.6395,
      "step": 2260
    },
    {
      "epoch": 8.84765625,
      "grad_norm": 0.22482182085514069,
      "learning_rate": 7.981512072749198e-06,
      "loss": 0.6316,
      "step": 2265
    },
    {
      "epoch": 8.8671875,
      "grad_norm": 0.22704395651817322,
      "learning_rate": 7.71675087853364e-06,
      "loss": 0.6389,
      "step": 2270
    },
    {
      "epoch": 8.88671875,
      "grad_norm": 0.2339123636484146,
      "learning_rate": 7.456279078902928e-06,
      "loss": 0.639,
      "step": 2275
    },
    {
      "epoch": 8.90625,
      "grad_norm": 0.2283734679222107,
      "learning_rate": 7.200108780781556e-06,
      "loss": 0.6312,
      "step": 2280
    },
    {
      "epoch": 8.92578125,
      "grad_norm": 0.23632891476154327,
      "learning_rate": 6.948251891156932e-06,
      "loss": 0.6336,
      "step": 2285
    },
    {
      "epoch": 8.9453125,
      "grad_norm": 0.22593176364898682,
      "learning_rate": 6.700720116526116e-06,
      "loss": 0.6382,
      "step": 2290
    },
    {
      "epoch": 8.96484375,
      "grad_norm": 0.2195340245962143,
      "learning_rate": 6.457524962351469e-06,
      "loss": 0.627,
      "step": 2295
    },
    {
      "epoch": 8.984375,
      "grad_norm": 0.2304958701133728,
      "learning_rate": 6.218677732526035e-06,
      "loss": 0.6277,
      "step": 2300
    },
    {
      "epoch": 9.0,
      "eval_loss": 2.0994203090667725,
      "eval_runtime": 0.5356,
      "eval_samples_per_second": 11.202,
      "eval_steps_per_second": 1.867,
      "step": 2304
    },
    {
      "epoch": 9.00390625,
      "grad_norm": 0.2239326387643814,
      "learning_rate": 5.984189528848095e-06,
      "loss": 0.6333,
      "step": 2305
    },
    {
      "epoch": 9.0234375,
      "grad_norm": 0.21830931305885315,
      "learning_rate": 5.7540712505050444e-06,
      "loss": 0.6303,
      "step": 2310
    },
    {
      "epoch": 9.04296875,
      "grad_norm": 0.2230663150548935,
      "learning_rate": 5.528333593567014e-06,
      "loss": 0.6266,
      "step": 2315
    },
    {
      "epoch": 9.0625,
      "grad_norm": 0.22621068358421326,
      "learning_rate": 5.306987050489442e-06,
      "loss": 0.6273,
      "step": 2320
    },
    {
      "epoch": 9.08203125,
      "grad_norm": 0.2257871776819229,
      "learning_rate": 5.090041909625542e-06,
      "loss": 0.6171,
      "step": 2325
    },
    {
      "epoch": 9.1015625,
      "grad_norm": 0.22467824816703796,
      "learning_rate": 4.877508254748076e-06,
      "loss": 0.6256,
      "step": 2330
    },
    {
      "epoch": 9.12109375,
      "grad_norm": 0.22441822290420532,
      "learning_rate": 4.669395964580614e-06,
      "loss": 0.6247,
      "step": 2335
    },
    {
      "epoch": 9.140625,
      "grad_norm": 0.22599612176418304,
      "learning_rate": 4.465714712338398e-06,
      "loss": 0.6204,
      "step": 2340
    },
    {
      "epoch": 9.16015625,
      "grad_norm": 0.22301939129829407,
      "learning_rate": 4.26647396527865e-06,
      "loss": 0.634,
      "step": 2345
    },
    {
      "epoch": 9.1796875,
      "grad_norm": 0.23274029791355133,
      "learning_rate": 4.071682984260638e-06,
      "loss": 0.6256,
      "step": 2350
    },
    {
      "epoch": 9.19921875,
      "grad_norm": 0.23097610473632812,
      "learning_rate": 3.881350823315177e-06,
      "loss": 0.6293,
      "step": 2355
    },
    {
      "epoch": 9.21875,
      "grad_norm": 0.23166796565055847,
      "learning_rate": 3.6954863292237297e-06,
      "loss": 0.6294,
      "step": 2360
    },
    {
      "epoch": 9.23828125,
      "grad_norm": 0.22876545786857605,
      "learning_rate": 3.514098141107314e-06,
      "loss": 0.6298,
      "step": 2365
    },
    {
      "epoch": 9.2578125,
      "grad_norm": 0.22338230907917023,
      "learning_rate": 3.3371946900248473e-06,
      "loss": 0.6264,
      "step": 2370
    },
    {
      "epoch": 9.27734375,
      "grad_norm": 0.2302178293466568,
      "learning_rate": 3.1647841985813164e-06,
      "loss": 0.627,
      "step": 2375
    },
    {
      "epoch": 9.296875,
      "grad_norm": 0.2242288738489151,
      "learning_rate": 2.996874680545603e-06,
      "loss": 0.6336,
      "step": 2380
    },
    {
      "epoch": 9.31640625,
      "grad_norm": 0.22500120103359222,
      "learning_rate": 2.8334739404779375e-06,
      "loss": 0.6264,
      "step": 2385
    },
    {
      "epoch": 9.3359375,
      "grad_norm": 0.23554645478725433,
      "learning_rate": 2.674589573367192e-06,
      "loss": 0.6213,
      "step": 2390
    },
    {
      "epoch": 9.35546875,
      "grad_norm": 0.2254471480846405,
      "learning_rate": 2.5202289642778375e-06,
      "loss": 0.6348,
      "step": 2395
    },
    {
      "epoch": 9.375,
      "grad_norm": 0.22407911717891693,
      "learning_rate": 2.3703992880066638e-06,
      "loss": 0.6294,
      "step": 2400
    },
    {
      "epoch": 9.39453125,
      "grad_norm": 0.22965936362743378,
      "learning_rate": 2.2251075087493355e-06,
      "loss": 0.64,
      "step": 2405
    },
    {
      "epoch": 9.4140625,
      "grad_norm": 0.22874490916728973,
      "learning_rate": 2.0843603797766287e-06,
      "loss": 0.6313,
      "step": 2410
    },
    {
      "epoch": 9.43359375,
      "grad_norm": 0.22413046658039093,
      "learning_rate": 1.9481644431206036e-06,
      "loss": 0.6229,
      "step": 2415
    },
    {
      "epoch": 9.453125,
      "grad_norm": 0.2280588150024414,
      "learning_rate": 1.8165260292704711e-06,
      "loss": 0.6265,
      "step": 2420
    },
    {
      "epoch": 9.47265625,
      "grad_norm": 0.22689659893512726,
      "learning_rate": 1.6894512568783716e-06,
      "loss": 0.6272,
      "step": 2425
    },
    {
      "epoch": 9.4921875,
      "grad_norm": 0.23052698373794556,
      "learning_rate": 1.5669460324749586e-06,
      "loss": 0.6408,
      "step": 2430
    },
    {
      "epoch": 9.51171875,
      "grad_norm": 0.22765642404556274,
      "learning_rate": 1.4490160501948735e-06,
      "loss": 0.644,
      "step": 2435
    },
    {
      "epoch": 9.53125,
      "grad_norm": 0.22766034305095673,
      "learning_rate": 1.3356667915121025e-06,
      "loss": 0.6249,
      "step": 2440
    },
    {
      "epoch": 9.55078125,
      "grad_norm": 0.22794398665428162,
      "learning_rate": 1.2269035249851236e-06,
      "loss": 0.6318,
      "step": 2445
    },
    {
      "epoch": 9.5703125,
      "grad_norm": 0.22712871432304382,
      "learning_rate": 1.1227313060120926e-06,
      "loss": 0.6359,
      "step": 2450
    },
    {
      "epoch": 9.58984375,
      "grad_norm": 0.22914738953113556,
      "learning_rate": 1.0231549765958192e-06,
      "loss": 0.6389,
      "step": 2455
    },
    {
      "epoch": 9.609375,
      "grad_norm": 0.22300153970718384,
      "learning_rate": 9.281791651187366e-07,
      "loss": 0.6356,
      "step": 2460
    },
    {
      "epoch": 9.62890625,
      "grad_norm": 0.232873797416687,
      "learning_rate": 8.378082861277281e-07,
      "loss": 0.6272,
      "step": 2465
    },
    {
      "epoch": 9.6484375,
      "grad_norm": 0.227997824549675,
      "learning_rate": 7.520465401290033e-07,
      "loss": 0.633,
      "step": 2470
    },
    {
      "epoch": 9.66796875,
      "grad_norm": 0.21839286386966705,
      "learning_rate": 6.708979133927762e-07,
      "loss": 0.6215,
      "step": 2475
    },
    {
      "epoch": 9.6875,
      "grad_norm": 0.22753040492534637,
      "learning_rate": 5.943661777680354e-07,
      "loss": 0.6272,
      "step": 2480
    },
    {
      "epoch": 9.70703125,
      "grad_norm": 0.22866863012313843,
      "learning_rate": 5.224548905072402e-07,
      "loss": 0.6357,
      "step": 2485
    },
    {
      "epoch": 9.7265625,
      "grad_norm": 0.2306712120771408,
      "learning_rate": 4.5516739410087494e-07,
      "loss": 0.6244,
      "step": 2490
    },
    {
      "epoch": 9.74609375,
      "grad_norm": 0.22779209911823273,
      "learning_rate": 3.9250681612225116e-07,
      "loss": 0.6309,
      "step": 2495
    },
    {
      "epoch": 9.765625,
      "grad_norm": 0.22719816863536835,
      "learning_rate": 3.3447606908196817e-07,
      "loss": 0.628,
      "step": 2500
    },
    {
      "epoch": 9.78515625,
      "grad_norm": 0.23172929883003235,
      "learning_rate": 2.8107785029265476e-07,
      "loss": 0.6293,
      "step": 2505
    },
    {
      "epoch": 9.8046875,
      "grad_norm": 0.22468186914920807,
      "learning_rate": 2.3231464174352512e-07,
      "loss": 0.6368,
      "step": 2510
    },
    {
      "epoch": 9.82421875,
      "grad_norm": 0.22247561812400818,
      "learning_rate": 1.8818870998508208e-07,
      "loss": 0.6222,
      "step": 2515
    },
    {
      "epoch": 9.84375,
      "grad_norm": 0.22515320777893066,
      "learning_rate": 1.487021060236904e-07,
      "loss": 0.6266,
      "step": 2520
    },
    {
      "epoch": 9.86328125,
      "grad_norm": 0.23118971288204193,
      "learning_rate": 1.1385666522630845e-07,
      "loss": 0.6308,
      "step": 2525
    },
    {
      "epoch": 9.8828125,
      "grad_norm": 0.22416307032108307,
      "learning_rate": 8.365400723512328e-08,
      "loss": 0.6239,
      "step": 2530
    },
    {
      "epoch": 9.90234375,
      "grad_norm": 0.22984710335731506,
      "learning_rate": 5.8095535892332964e-08,
      "loss": 0.6362,
      "step": 2535
    },
    {
      "epoch": 9.921875,
      "grad_norm": 0.23102597892284393,
      "learning_rate": 3.7182439174832106e-08,
      "loss": 0.6365,
      "step": 2540
    },
    {
      "epoch": 9.94140625,
      "grad_norm": 0.2295123189687729,
      "learning_rate": 2.091568913904496e-08,
      "loss": 0.6397,
      "step": 2545
    },
    {
      "epoch": 9.9609375,
      "grad_norm": 0.22766011953353882,
      "learning_rate": 9.296041875683781e-09,
      "loss": 0.6274,
      "step": 2550
    },
    {
      "epoch": 9.98046875,
      "grad_norm": 0.2338954210281372,
      "learning_rate": 2.3240374746658077e-09,
      "loss": 0.6212,
      "step": 2555
    },
    {
      "epoch": 10.0,
      "grad_norm": 0.22291633486747742,
      "learning_rate": 0.0,
      "loss": 0.616,
      "step": 2560
    },
    {
      "epoch": 10.0,
      "eval_loss": 2.1007895469665527,
      "eval_runtime": 0.5705,
      "eval_samples_per_second": 10.518,
      "eval_steps_per_second": 1.753,
      "step": 2560
    },
    {
      "epoch": 10.0,
      "step": 2560,
      "total_flos": 7.568434414263206e+18,
      "train_loss": 0.7105431989766657,
      "train_runtime": 14792.6859,
      "train_samples_per_second": 11.056,
      "train_steps_per_second": 0.173
    }
  ],
  "logging_steps": 5,
  "max_steps": 2560,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 7.568434414263206e+18,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}