llama3-8b-closedqa-gpt4o-100k / trainer_state.json

Model save

9060310 verified 4 months ago

87.5 kB

	{
	"best_metric": null,
	"best_model_checkpoint": null,
	"epoch": 10.0,
	"eval_steps": 500,
	"global_step": 2560,
	"is_hyper_param_search": false,
	"is_local_process_zero": true,
	"is_world_process_zero": true,
	"log_history": [
	{
	"epoch": 0.00390625,
	"grad_norm": 1.813705325126648,
	"learning_rate": 7.8125e-07,
	"loss": 1.9071,
	"step": 1
	},
	{
	"epoch": 0.01953125,
	"grad_norm": 1.431990385055542,
	"learning_rate": 3.90625e-06,
	"loss": 1.8608,
	"step": 5
	},
	{
	"epoch": 0.0390625,
	"grad_norm": 1.281330943107605,
	"learning_rate": 7.8125e-06,
	"loss": 1.8263,
	"step": 10
	},
	{
	"epoch": 0.05859375,
	"grad_norm": 1.310953140258789,
	"learning_rate": 1.171875e-05,
	"loss": 1.8193,
	"step": 15
	},
	{
	"epoch": 0.078125,
	"grad_norm": 1.296993374824524,
	"learning_rate": 1.5625e-05,
	"loss": 1.7463,
	"step": 20
	},
	{
	"epoch": 0.09765625,
	"grad_norm": 1.1856365203857422,
	"learning_rate": 1.953125e-05,
	"loss": 1.6844,
	"step": 25
	},
	{
	"epoch": 0.1171875,
	"grad_norm": 3.376720905303955,
	"learning_rate": 2.34375e-05,
	"loss": 1.5861,
	"step": 30
	},
	{
	"epoch": 0.13671875,
	"grad_norm": 3.182882785797119,
	"learning_rate": 2.734375e-05,
	"loss": 1.4328,
	"step": 35
	},
	{
	"epoch": 0.15625,
	"grad_norm": 0.682467520236969,
	"learning_rate": 3.125e-05,
	"loss": 1.2702,
	"step": 40
	},
	{
	"epoch": 0.17578125,
	"grad_norm": 0.9865962266921997,
	"learning_rate": 3.5156250000000004e-05,
	"loss": 1.1671,
	"step": 45
	},
	{
	"epoch": 0.1953125,
	"grad_norm": 0.42747607827186584,
	"learning_rate": 3.90625e-05,
	"loss": 1.1303,
	"step": 50
	},
	{
	"epoch": 0.21484375,
	"grad_norm": 0.42581626772880554,
	"learning_rate": 4.2968750000000004e-05,
	"loss": 1.101,
	"step": 55
	},
	{
	"epoch": 0.234375,
	"grad_norm": 0.4914548099040985,
	"learning_rate": 4.6875e-05,
	"loss": 1.0586,
	"step": 60
	},
	{
	"epoch": 0.25390625,
	"grad_norm": 0.39272716641426086,
	"learning_rate": 5.0781250000000004e-05,
	"loss": 1.0308,
	"step": 65
	},
	{
	"epoch": 0.2734375,
	"grad_norm": 0.34394437074661255,
	"learning_rate": 5.46875e-05,
	"loss": 0.9998,
	"step": 70
	},
	{
	"epoch": 0.29296875,
	"grad_norm": 0.3009032607078552,
	"learning_rate": 5.8593750000000005e-05,
	"loss": 0.9784,
	"step": 75
	},
	{
	"epoch": 0.3125,
	"grad_norm": 0.27089548110961914,
	"learning_rate": 6.25e-05,
	"loss": 0.9653,
	"step": 80
	},
	{
	"epoch": 0.33203125,
	"grad_norm": 0.25717490911483765,
	"learning_rate": 6.640625e-05,
	"loss": 0.9434,
	"step": 85
	},
	{
	"epoch": 0.3515625,
	"grad_norm": 0.3018302917480469,
	"learning_rate": 7.031250000000001e-05,
	"loss": 0.9372,
	"step": 90
	},
	{
	"epoch": 0.37109375,
	"grad_norm": 0.2254215031862259,
	"learning_rate": 7.421875e-05,
	"loss": 0.9236,
	"step": 95
	},
	{
	"epoch": 0.390625,
	"grad_norm": 0.2384410947561264,
	"learning_rate": 7.8125e-05,
	"loss": 0.9145,
	"step": 100
	},
	{
	"epoch": 0.41015625,
	"grad_norm": 0.2905459403991699,
	"learning_rate": 8.203125e-05,
	"loss": 0.9177,
	"step": 105
	},
	{
	"epoch": 0.4296875,
	"grad_norm": 0.27646884322166443,
	"learning_rate": 8.593750000000001e-05,
	"loss": 0.9103,
	"step": 110
	},
	{
	"epoch": 0.44921875,
	"grad_norm": 0.23843346536159515,
	"learning_rate": 8.984375e-05,
	"loss": 0.8911,
	"step": 115
	},
	{
	"epoch": 0.46875,
	"grad_norm": 0.3110702931880951,
	"learning_rate": 9.375e-05,
	"loss": 0.8961,
	"step": 120
	},
	{
	"epoch": 0.48828125,
	"grad_norm": 0.2591000199317932,
	"learning_rate": 9.765625e-05,
	"loss": 0.8911,
	"step": 125
	},
	{
	"epoch": 0.5078125,
	"grad_norm": 0.2314710170030594,
	"learning_rate": 0.00010156250000000001,
	"loss": 0.8765,
	"step": 130
	},
	{
	"epoch": 0.52734375,
	"grad_norm": 0.268370658159256,
	"learning_rate": 0.00010546875,
	"loss": 0.8759,
	"step": 135
	},
	{
	"epoch": 0.546875,
	"grad_norm": 0.24689124524593353,
	"learning_rate": 0.000109375,
	"loss": 0.8714,
	"step": 140
	},
	{
	"epoch": 0.56640625,
	"grad_norm": 0.28693222999572754,
	"learning_rate": 0.00011328125,
	"loss": 0.882,
	"step": 145
	},
	{
	"epoch": 0.5859375,
	"grad_norm": 0.26165568828582764,
	"learning_rate": 0.00011718750000000001,
	"loss": 0.8638,
	"step": 150
	},
	{
	"epoch": 0.60546875,
	"grad_norm": 0.2968839406967163,
	"learning_rate": 0.00012109375,
	"loss": 0.8562,
	"step": 155
	},
	{
	"epoch": 0.625,
	"grad_norm": 0.2954418957233429,
	"learning_rate": 0.000125,
	"loss": 0.8569,
	"step": 160
	},
	{
	"epoch": 0.64453125,
	"grad_norm": 0.30811259150505066,
	"learning_rate": 0.00012890625,
	"loss": 0.8455,
	"step": 165
	},
	{
	"epoch": 0.6640625,
	"grad_norm": 0.2631295323371887,
	"learning_rate": 0.0001328125,
	"loss": 0.8574,
	"step": 170
	},
	{
	"epoch": 0.68359375,
	"grad_norm": 0.25627005100250244,
	"learning_rate": 0.00013671875,
	"loss": 0.851,
	"step": 175
	},
	{
	"epoch": 0.703125,
	"grad_norm": 0.28598853945732117,
	"learning_rate": 0.00014062500000000002,
	"loss": 0.8385,
	"step": 180
	},
	{
	"epoch": 0.72265625,
	"grad_norm": 0.2502932548522949,
	"learning_rate": 0.00014453125000000002,
	"loss": 0.8457,
	"step": 185
	},
	{
	"epoch": 0.7421875,
	"grad_norm": 0.3177507817745209,
	"learning_rate": 0.0001484375,
	"loss": 0.8319,
	"step": 190
	},
	{
	"epoch": 0.76171875,
	"grad_norm": 0.27309176325798035,
	"learning_rate": 0.00015234375,
	"loss": 0.8511,
	"step": 195
	},
	{
	"epoch": 0.78125,
	"grad_norm": 0.29295653104782104,
	"learning_rate": 0.00015625,
	"loss": 0.8373,
	"step": 200
	},
	{
	"epoch": 0.80078125,
	"grad_norm": 0.27028167247772217,
	"learning_rate": 0.00016015625,
	"loss": 0.8319,
	"step": 205
	},
	{
	"epoch": 0.8203125,
	"grad_norm": 0.40336114168167114,
	"learning_rate": 0.0001640625,
	"loss": 0.8245,
	"step": 210
	},
	{
	"epoch": 0.83984375,
	"grad_norm": 0.3044915795326233,
	"learning_rate": 0.00016796875000000001,
	"loss": 0.8283,
	"step": 215
	},
	{
	"epoch": 0.859375,
	"grad_norm": 0.29535970091819763,
	"learning_rate": 0.00017187500000000002,
	"loss": 0.8119,
	"step": 220
	},
	{
	"epoch": 0.87890625,
	"grad_norm": 0.28554800152778625,
	"learning_rate": 0.00017578125000000002,
	"loss": 0.8091,
	"step": 225
	},
	{
	"epoch": 0.8984375,
	"grad_norm": 0.26689431071281433,
	"learning_rate": 0.0001796875,
	"loss": 0.8189,
	"step": 230
	},
	{
	"epoch": 0.91796875,
	"grad_norm": 0.29758790135383606,
	"learning_rate": 0.00018359375,
	"loss": 0.8122,
	"step": 235
	},
	{
	"epoch": 0.9375,
	"grad_norm": 0.40431731939315796,
	"learning_rate": 0.0001875,
	"loss": 0.8155,
	"step": 240
	},
	{
	"epoch": 0.95703125,
	"grad_norm": 0.27242639660835266,
	"learning_rate": 0.00019140625,
	"loss": 0.8119,
	"step": 245
	},
	{
	"epoch": 0.9765625,
	"grad_norm": 0.3094847500324249,
	"learning_rate": 0.0001953125,
	"loss": 0.8058,
	"step": 250
	},
	{
	"epoch": 0.99609375,
	"grad_norm": 0.32299983501434326,
	"learning_rate": 0.00019921875000000001,
	"loss": 0.8026,
	"step": 255
	},
	{
	"epoch": 1.0,
	"eval_loss": 2.045611619949341,
	"eval_runtime": 0.5394,
	"eval_samples_per_second": 11.124,
	"eval_steps_per_second": 1.854,
	"step": 256
	},
	{
	"epoch": 1.015625,
	"grad_norm": 0.305078387260437,
	"learning_rate": 0.00019999851261394218,
	"loss": 0.7941,
	"step": 260
	},
	{
	"epoch": 1.03515625,
	"grad_norm": 0.2842113673686981,
	"learning_rate": 0.00019999247018391447,
	"loss": 0.798,
	"step": 265
	},
	{
	"epoch": 1.0546875,
	"grad_norm": 0.27524590492248535,
	"learning_rate": 0.0001999817800289289,
	"loss": 0.7911,
	"step": 270
	},
	{
	"epoch": 1.07421875,
	"grad_norm": 0.2549247145652771,
	"learning_rate": 0.00019996644264587193,
	"loss": 0.7963,
	"step": 275
	},
	{
	"epoch": 1.09375,
	"grad_norm": 0.253353089094162,
	"learning_rate": 0.00019994645874763658,
	"loss": 0.7904,
	"step": 280
	},
	{
	"epoch": 1.11328125,
	"grad_norm": 0.23945719003677368,
	"learning_rate": 0.00019992182926308942,
	"loss": 0.7921,
	"step": 285
	},
	{
	"epoch": 1.1328125,
	"grad_norm": 0.29668208956718445,
	"learning_rate": 0.00019989255533702736,
	"loss": 0.7943,
	"step": 290
	},
	{
	"epoch": 1.15234375,
	"grad_norm": 0.26419156789779663,
	"learning_rate": 0.0001998586383301244,
	"loss": 0.7819,
	"step": 295
	},
	{
	"epoch": 1.171875,
	"grad_norm": 0.3054077625274658,
	"learning_rate": 0.00019982007981886847,
	"loss": 0.7917,
	"step": 300
	},
	{
	"epoch": 1.19140625,
	"grad_norm": 0.27965638041496277,
	"learning_rate": 0.00019977688159548808,
	"loss": 0.7854,
	"step": 305
	},
	{
	"epoch": 1.2109375,
	"grad_norm": 0.23229017853736877,
	"learning_rate": 0.00019972904566786903,
	"loss": 0.7865,
	"step": 310
	},
	{
	"epoch": 1.23046875,
	"grad_norm": 0.2789019048213959,
	"learning_rate": 0.00019967657425946106,
	"loss": 0.7821,
	"step": 315
	},
	{
	"epoch": 1.25,
	"grad_norm": 0.24402114748954773,
	"learning_rate": 0.00019961946980917456,
	"loss": 0.7899,
	"step": 320
	},
	{
	"epoch": 1.26953125,
	"grad_norm": 0.2749808132648468,
	"learning_rate": 0.0001995577349712672,
	"loss": 0.7783,
	"step": 325
	},
	{
	"epoch": 1.2890625,
	"grad_norm": 0.2676057815551758,
	"learning_rate": 0.00019949137261522052,
	"loss": 0.7788,
	"step": 330
	},
	{
	"epoch": 1.30859375,
	"grad_norm": 0.24829885363578796,
	"learning_rate": 0.0001994203858256065,
	"loss": 0.7714,
	"step": 335
	},
	{
	"epoch": 1.328125,
	"grad_norm": 0.24872945249080658,
	"learning_rate": 0.00019934477790194445,
	"loss": 0.7832,
	"step": 340
	},
	{
	"epoch": 1.34765625,
	"grad_norm": 0.2914537489414215,
	"learning_rate": 0.00019926455235854724,
	"loss": 0.7791,
	"step": 345
	},
	{
	"epoch": 1.3671875,
	"grad_norm": 0.2692899703979492,
	"learning_rate": 0.00019917971292435826,
	"loss": 0.7739,
	"step": 350
	},
	{
	"epoch": 1.38671875,
	"grad_norm": 0.2605401873588562,
	"learning_rate": 0.000199090263542778,
	"loss": 0.7717,
	"step": 355
	},
	{
	"epoch": 1.40625,
	"grad_norm": 0.24468782544136047,
	"learning_rate": 0.00019899620837148077,
	"loss": 0.7694,
	"step": 360
	},
	{
	"epoch": 1.42578125,
	"grad_norm": 0.2542877197265625,
	"learning_rate": 0.00019889755178222147,
	"loss": 0.7653,
	"step": 365
	},
	{
	"epoch": 1.4453125,
	"grad_norm": 0.21375133097171783,
	"learning_rate": 0.00019879429836063226,
	"loss": 0.7854,
	"step": 370
	},
	{
	"epoch": 1.46484375,
	"grad_norm": 0.24711847305297852,
	"learning_rate": 0.00019868645290600955,
	"loss": 0.773,
	"step": 375
	},
	{
	"epoch": 1.484375,
	"grad_norm": 0.2352401316165924,
	"learning_rate": 0.0001985740204310909,
	"loss": 0.7641,
	"step": 380
	},
	{
	"epoch": 1.50390625,
	"grad_norm": 0.2681073844432831,
	"learning_rate": 0.00019845700616182206,
	"loss": 0.7755,
	"step": 385
	},
	{
	"epoch": 1.5234375,
	"grad_norm": 0.2394329458475113,
	"learning_rate": 0.00019833541553711395,
	"loss": 0.7635,
	"step": 390
	},
	{
	"epoch": 1.54296875,
	"grad_norm": 0.27736565470695496,
	"learning_rate": 0.00019820925420858991,
	"loss": 0.7744,
	"step": 395
	},
	{
	"epoch": 1.5625,
	"grad_norm": 0.2736864984035492,
	"learning_rate": 0.00019807852804032305,
	"loss": 0.7564,
	"step": 400
	},
	{
	"epoch": 1.58203125,
	"grad_norm": 0.22882600128650665,
	"learning_rate": 0.00019794324310856367,
	"loss": 0.7703,
	"step": 405
	},
	{
	"epoch": 1.6015625,
	"grad_norm": 0.2372276782989502,
	"learning_rate": 0.0001978034057014568,
	"loss": 0.7642,
	"step": 410
	},
	{
	"epoch": 1.62109375,
	"grad_norm": 0.23550736904144287,
	"learning_rate": 0.00019765902231874992,
	"loss": 0.7513,
	"step": 415
	},
	{
	"epoch": 1.640625,
	"grad_norm": 0.23483717441558838,
	"learning_rate": 0.00019751009967149087,
	"loss": 0.7485,
	"step": 420
	},
	{
	"epoch": 1.66015625,
	"grad_norm": 0.23124265670776367,
	"learning_rate": 0.00019735664468171587,
	"loss": 0.7712,
	"step": 425
	},
	{
	"epoch": 1.6796875,
	"grad_norm": 0.25672388076782227,
	"learning_rate": 0.00019719866448212795,
	"loss": 0.7635,
	"step": 430
	},
	{
	"epoch": 1.69921875,
	"grad_norm": 0.2655965983867645,
	"learning_rate": 0.00019703616641576514,
	"loss": 0.7614,
	"step": 435
	},
	{
	"epoch": 1.71875,
	"grad_norm": 0.22875700891017914,
	"learning_rate": 0.00019686915803565934,
	"loss": 0.7597,
	"step": 440
	},
	{
	"epoch": 1.73828125,
	"grad_norm": 0.24324467778205872,
	"learning_rate": 0.00019669764710448522,
	"loss": 0.7592,
	"step": 445
	},
	{
	"epoch": 1.7578125,
	"grad_norm": 0.23085905611515045,
	"learning_rate": 0.00019652164159419946,
	"loss": 0.7582,
	"step": 450
	},
	{
	"epoch": 1.77734375,
	"grad_norm": 0.24821893870830536,
	"learning_rate": 0.00019634114968567005,
	"loss": 0.7565,
	"step": 455
	},
	{
	"epoch": 1.796875,
	"grad_norm": 0.24690982699394226,
	"learning_rate": 0.0001961561797682962,
	"loss": 0.75,
	"step": 460
	},
	{
	"epoch": 1.81640625,
	"grad_norm": 0.21277934312820435,
	"learning_rate": 0.00019596674043961828,
	"loss": 0.7499,
	"step": 465
	},
	{
	"epoch": 1.8359375,
	"grad_norm": 0.2045515477657318,
	"learning_rate": 0.0001957728405049183,
	"loss": 0.7476,
	"step": 470
	},
	{
	"epoch": 1.85546875,
	"grad_norm": 0.22809946537017822,
	"learning_rate": 0.00019557448897681057,
	"loss": 0.7554,
	"step": 475
	},
	{
	"epoch": 1.875,
	"grad_norm": 0.2747824788093567,
	"learning_rate": 0.0001953716950748227,
	"loss": 0.7481,
	"step": 480
	},
	{
	"epoch": 1.89453125,
	"grad_norm": 0.23395125567913055,
	"learning_rate": 0.00019516446822496732,
	"loss": 0.7579,
	"step": 485
	},
	{
	"epoch": 1.9140625,
	"grad_norm": 0.2263769805431366,
	"learning_rate": 0.00019495281805930367,
	"loss": 0.7493,
	"step": 490
	},
	{
	"epoch": 1.93359375,
	"grad_norm": 0.23396165668964386,
	"learning_rate": 0.00019473675441549013,
	"loss": 0.7523,
	"step": 495
	},
	{
	"epoch": 1.953125,
	"grad_norm": 0.23420800268650055,
	"learning_rate": 0.0001945162873363268,
	"loss": 0.7469,
	"step": 500
	},
	{
	"epoch": 1.97265625,
	"grad_norm": 0.19923944771289825,
	"learning_rate": 0.00019429142706928868,
	"loss": 0.7535,
	"step": 505
	},
	{
	"epoch": 1.9921875,
	"grad_norm": 0.2181696891784668,
	"learning_rate": 0.00019406218406604965,
	"loss": 0.7532,
	"step": 510
	},
	{
	"epoch": 2.0,
	"eval_loss": 2.031317949295044,
	"eval_runtime": 0.5375,
	"eval_samples_per_second": 11.164,
	"eval_steps_per_second": 1.861,
	"step": 512
	},
	{
	"epoch": 2.01171875,
	"grad_norm": 0.2611521780490875,
	"learning_rate": 0.0001938285689819962,
	"loss": 0.7349,
	"step": 515
	},
	{
	"epoch": 2.03125,
	"grad_norm": 0.22077465057373047,
	"learning_rate": 0.0001935905926757326,
	"loss": 0.7309,
	"step": 520
	},
	{
	"epoch": 2.05078125,
	"grad_norm": 0.2502357065677643,
	"learning_rate": 0.00019334826620857583,
	"loss": 0.7402,
	"step": 525
	},
	{
	"epoch": 2.0703125,
	"grad_norm": 0.21151328086853027,
	"learning_rate": 0.00019310160084404186,
	"loss": 0.7263,
	"step": 530
	},
	{
	"epoch": 2.08984375,
	"grad_norm": 0.22730891406536102,
	"learning_rate": 0.00019285060804732158,
	"loss": 0.7393,
	"step": 535
	},
	{
	"epoch": 2.109375,
	"grad_norm": 0.29608404636383057,
	"learning_rate": 0.00019259529948474833,
	"loss": 0.7359,
	"step": 540
	},
	{
	"epoch": 2.12890625,
	"grad_norm": 0.2048954963684082,
	"learning_rate": 0.00019233568702325547,
	"loss": 0.7327,
	"step": 545
	},
	{
	"epoch": 2.1484375,
	"grad_norm": 0.24332541227340698,
	"learning_rate": 0.0001920717827298248,
	"loss": 0.723,
	"step": 550
	},
	{
	"epoch": 2.16796875,
	"grad_norm": 0.27370956540107727,
	"learning_rate": 0.0001918035988709256,
	"loss": 0.7346,
	"step": 555
	},
	{
	"epoch": 2.1875,
	"grad_norm": 0.27345338463783264,
	"learning_rate": 0.00019153114791194473,
	"loss": 0.7216,
	"step": 560
	},
	{
	"epoch": 2.20703125,
	"grad_norm": 0.21915854513645172,
	"learning_rate": 0.0001912544425166069,
	"loss": 0.7297,
	"step": 565
	},
	{
	"epoch": 2.2265625,
	"grad_norm": 0.23517705500125885,
	"learning_rate": 0.0001909734955463863,
	"loss": 0.7277,
	"step": 570
	},
	{
	"epoch": 2.24609375,
	"grad_norm": 0.2082410454750061,
	"learning_rate": 0.00019068832005990867,
	"loss": 0.7274,
	"step": 575
	},
	{
	"epoch": 2.265625,
	"grad_norm": 0.25212010741233826,
	"learning_rate": 0.00019039892931234435,
	"loss": 0.7388,
	"step": 580
	},
	{
	"epoch": 2.28515625,
	"grad_norm": 0.22077186405658722,
	"learning_rate": 0.0001901053367547922,
	"loss": 0.7356,
	"step": 585
	},
	{
	"epoch": 2.3046875,
	"grad_norm": 0.24918216466903687,
	"learning_rate": 0.0001898075560336543,
	"loss": 0.7283,
	"step": 590
	},
	{
	"epoch": 2.32421875,
	"grad_norm": 0.2168445587158203,
	"learning_rate": 0.00018950560099000182,
	"loss": 0.7276,
	"step": 595
	},
	{
	"epoch": 2.34375,
	"grad_norm": 0.3361542522907257,
	"learning_rate": 0.00018919948565893142,
	"loss": 0.7394,
	"step": 600
	},
	{
	"epoch": 2.36328125,
	"grad_norm": 0.30473312735557556,
	"learning_rate": 0.0001888892242689132,
	"loss": 0.7214,
	"step": 605
	},
	{
	"epoch": 2.3828125,
	"grad_norm": 0.22810065746307373,
	"learning_rate": 0.00018857483124112907,
	"loss": 0.7389,
	"step": 610
	},
	{
	"epoch": 2.40234375,
	"grad_norm": 0.22486305236816406,
	"learning_rate": 0.00018825632118880259,
	"loss": 0.7382,
	"step": 615
	},
	{
	"epoch": 2.421875,
	"grad_norm": 0.23797857761383057,
	"learning_rate": 0.00018793370891651972,
	"loss": 0.7352,
	"step": 620
	},
	{
	"epoch": 2.44140625,
	"grad_norm": 0.22012600302696228,
	"learning_rate": 0.00018760700941954065,
	"loss": 0.7323,
	"step": 625
	},
	{
	"epoch": 2.4609375,
	"grad_norm": 0.2505754232406616,
	"learning_rate": 0.00018727623788310292,
	"loss": 0.7319,
	"step": 630
	},
	{
	"epoch": 2.48046875,
	"grad_norm": 0.23932820558547974,
	"learning_rate": 0.0001869414096817154,
	"loss": 0.7166,
	"step": 635
	},
	{
	"epoch": 2.5,
	"grad_norm": 0.22623002529144287,
	"learning_rate": 0.00018660254037844388,
	"loss": 0.7254,
	"step": 640
	},
	{
	"epoch": 2.51953125,
	"grad_norm": 0.24143099784851074,
	"learning_rate": 0.0001862596457241875,
	"loss": 0.7374,
	"step": 645
	},
	{
	"epoch": 2.5390625,
	"grad_norm": 0.25545206665992737,
	"learning_rate": 0.00018591274165694687,
	"loss": 0.7268,
	"step": 650
	},
	{
	"epoch": 2.55859375,
	"grad_norm": 0.27690452337265015,
	"learning_rate": 0.00018556184430108293,
	"loss": 0.7318,
	"step": 655
	},
	{
	"epoch": 2.578125,
	"grad_norm": 0.21064211428165436,
	"learning_rate": 0.00018520696996656788,
	"loss": 0.7365,
	"step": 660
	},
	{
	"epoch": 2.59765625,
	"grad_norm": 0.2418980747461319,
	"learning_rate": 0.0001848481351482267,
	"loss": 0.7252,
	"step": 665
	},
	{
	"epoch": 2.6171875,
	"grad_norm": 0.21725673973560333,
	"learning_rate": 0.00018448535652497073,
	"loss": 0.7438,
	"step": 670
	},
	{
	"epoch": 2.63671875,
	"grad_norm": 0.2051118165254593,
	"learning_rate": 0.00018411865095902224,
	"loss": 0.7272,
	"step": 675
	},
	{
	"epoch": 2.65625,
	"grad_norm": 0.20715655386447906,
	"learning_rate": 0.0001837480354951308,
	"loss": 0.7189,
	"step": 680
	},
	{
	"epoch": 2.67578125,
	"grad_norm": 0.224945530295372,
	"learning_rate": 0.00018337352735978095,
	"loss": 0.7283,
	"step": 685
	},
	{
	"epoch": 2.6953125,
	"grad_norm": 0.2353772222995758,
	"learning_rate": 0.0001829951439603915,
	"loss": 0.7172,
	"step": 690
	},
	{
	"epoch": 2.71484375,
	"grad_norm": 0.21377775073051453,
	"learning_rate": 0.00018261290288450646,
	"loss": 0.7245,
	"step": 695
	},
	{
	"epoch": 2.734375,
	"grad_norm": 0.20290276408195496,
	"learning_rate": 0.00018222682189897752,
	"loss": 0.732,
	"step": 700
	},
	{
	"epoch": 2.75390625,
	"grad_norm": 0.21785806119441986,
	"learning_rate": 0.00018183691894913825,
	"loss": 0.7142,
	"step": 705
	},
	{
	"epoch": 2.7734375,
	"grad_norm": 0.21216203272342682,
	"learning_rate": 0.00018144321215797,
	"loss": 0.7163,
	"step": 710
	},
	{
	"epoch": 2.79296875,
	"grad_norm": 0.20187579095363617,
	"learning_rate": 0.0001810457198252595,
	"loss": 0.7196,
	"step": 715
	},
	{
	"epoch": 2.8125,
	"grad_norm": 0.21112394332885742,
	"learning_rate": 0.00018064446042674828,
	"loss": 0.7255,
	"step": 720
	},
	{
	"epoch": 2.83203125,
	"grad_norm": 0.21814604103565216,
	"learning_rate": 0.00018023945261327393,
	"loss": 0.7244,
	"step": 725
	},
	{
	"epoch": 2.8515625,
	"grad_norm": 0.2388346940279007,
	"learning_rate": 0.00017983071520990315,
	"loss": 0.719,
	"step": 730
	},
	{
	"epoch": 2.87109375,
	"grad_norm": 0.2274855226278305,
	"learning_rate": 0.00017941826721505684,
	"loss": 0.7092,
	"step": 735
	},
	{
	"epoch": 2.890625,
	"grad_norm": 0.2171526700258255,
	"learning_rate": 0.0001790021277996269,
	"loss": 0.7177,
	"step": 740
	},
	{
	"epoch": 2.91015625,
	"grad_norm": 0.2128465622663498,
	"learning_rate": 0.00017858231630608527,
	"loss": 0.7245,
	"step": 745
	},
	{
	"epoch": 2.9296875,
	"grad_norm": 0.2257278561592102,
	"learning_rate": 0.0001781588522475848,
	"loss": 0.7172,
	"step": 750
	},
	{
	"epoch": 2.94921875,
	"grad_norm": 0.21227267384529114,
	"learning_rate": 0.00017773175530705232,
	"loss": 0.7208,
	"step": 755
	},
	{
	"epoch": 2.96875,
	"grad_norm": 0.23267419636249542,
	"learning_rate": 0.0001773010453362737,
	"loss": 0.7188,
	"step": 760
	},
	{
	"epoch": 2.98828125,
	"grad_norm": 0.21279846131801605,
	"learning_rate": 0.00017686674235497125,
	"loss": 0.7198,
	"step": 765
	},
	{
	"epoch": 3.0,
	"eval_loss": 2.0403969287872314,
	"eval_runtime": 0.5399,
	"eval_samples_per_second": 11.113,
	"eval_steps_per_second": 1.852,
	"step": 768
	},
	{
	"epoch": 3.0078125,
	"grad_norm": 0.20591868460178375,
	"learning_rate": 0.000176428866549873,
	"loss": 0.7092,
	"step": 770
	},
	{
	"epoch": 3.02734375,
	"grad_norm": 0.21006809175014496,
	"learning_rate": 0.0001759874382737746,
	"loss": 0.6982,
	"step": 775
	},
	{
	"epoch": 3.046875,
	"grad_norm": 0.20914091169834137,
	"learning_rate": 0.00017554247804459316,
	"loss": 0.6986,
	"step": 780
	},
	{
	"epoch": 3.06640625,
	"grad_norm": 0.21207676827907562,
	"learning_rate": 0.0001750940065444136,
	"loss": 0.7024,
	"step": 785
	},
	{
	"epoch": 3.0859375,
	"grad_norm": 0.24130572378635406,
	"learning_rate": 0.00017464204461852738,
	"loss": 0.7011,
	"step": 790
	},
	{
	"epoch": 3.10546875,
	"grad_norm": 0.22464986145496368,
	"learning_rate": 0.0001741866132744636,
	"loss": 0.6998,
	"step": 795
	},
	{
	"epoch": 3.125,
	"grad_norm": 0.20956657826900482,
	"learning_rate": 0.0001737277336810124,
	"loss": 0.7068,
	"step": 800
	},
	{
	"epoch": 3.14453125,
	"grad_norm": 0.21382799744606018,
	"learning_rate": 0.00017326542716724128,
	"loss": 0.6997,
	"step": 805
	},
	{
	"epoch": 3.1640625,
	"grad_norm": 0.2018394023180008,
	"learning_rate": 0.00017279971522150348,
	"loss": 0.7057,
	"step": 810
	},
	{
	"epoch": 3.18359375,
	"grad_norm": 0.20716731250286102,
	"learning_rate": 0.00017233061949043928,
	"loss": 0.6957,
	"step": 815
	},
	{
	"epoch": 3.203125,
	"grad_norm": 0.21063964068889618,
	"learning_rate": 0.0001718581617779698,
	"loss": 0.6989,
	"step": 820
	},
	{
	"epoch": 3.22265625,
	"grad_norm": 0.21001911163330078,
	"learning_rate": 0.0001713823640442837,
	"loss": 0.7065,
	"step": 825
	},
	{
	"epoch": 3.2421875,
	"grad_norm": 0.21537743508815765,
	"learning_rate": 0.0001709032484048162,
	"loss": 0.7001,
	"step": 830
	},
	{
	"epoch": 3.26171875,
	"grad_norm": 0.21781504154205322,
	"learning_rate": 0.00017042083712922131,
	"loss": 0.7076,
	"step": 835
	},
	{
	"epoch": 3.28125,
	"grad_norm": 0.21302708983421326,
	"learning_rate": 0.00016993515264033672,
	"loss": 0.6965,
	"step": 840
	},
	{
	"epoch": 3.30078125,
	"grad_norm": 0.2185572385787964,
	"learning_rate": 0.00016944621751314144,
	"loss": 0.7046,
	"step": 845
	},
	{
	"epoch": 3.3203125,
	"grad_norm": 0.21651025116443634,
	"learning_rate": 0.0001689540544737067,
	"loss": 0.7042,
	"step": 850
	},
	{
	"epoch": 3.33984375,
	"grad_norm": 0.22459545731544495,
	"learning_rate": 0.0001684586863981394,
	"loss": 0.7133,
	"step": 855
	},
	{
	"epoch": 3.359375,
	"grad_norm": 0.21320843696594238,
	"learning_rate": 0.00016796013631151897,
	"loss": 0.7106,
	"step": 860
	},
	{
	"epoch": 3.37890625,
	"grad_norm": 0.22854122519493103,
	"learning_rate": 0.00016745842738682712,
	"loss": 0.6987,
	"step": 865
	},
	{
	"epoch": 3.3984375,
	"grad_norm": 0.22366014122962952,
	"learning_rate": 0.00016695358294387065,
	"loss": 0.7078,
	"step": 870
	},
	{
	"epoch": 3.41796875,
	"grad_norm": 0.21049249172210693,
	"learning_rate": 0.00016644562644819771,
	"loss": 0.6926,
	"step": 875
	},
	{
	"epoch": 3.4375,
	"grad_norm": 0.216139018535614,
	"learning_rate": 0.00016593458151000688,
	"loss": 0.7073,
	"step": 880
	},
	{
	"epoch": 3.45703125,
	"grad_norm": 0.22321297228336334,
	"learning_rate": 0.00016542047188304997,
	"loss": 0.7063,
	"step": 885
	},
	{
	"epoch": 3.4765625,
	"grad_norm": 0.21834047138690948,
	"learning_rate": 0.0001649033214635277,
	"loss": 0.7007,
	"step": 890
	},
	{
	"epoch": 3.49609375,
	"grad_norm": 0.2148895114660263,
	"learning_rate": 0.00016438315428897915,
	"loss": 0.709,
	"step": 895
	},
	{
	"epoch": 3.515625,
	"grad_norm": 0.2145809829235077,
	"learning_rate": 0.00016385999453716454,
	"loss": 0.7073,
	"step": 900
	},
	{
	"epoch": 3.53515625,
	"grad_norm": 0.21147432923316956,
	"learning_rate": 0.00016333386652494117,
	"loss": 0.6915,
	"step": 905
	},
	{
	"epoch": 3.5546875,
	"grad_norm": 0.21884699165821075,
	"learning_rate": 0.00016280479470713344,
	"loss": 0.7026,
	"step": 910
	},
	{
	"epoch": 3.57421875,
	"grad_norm": 0.20934432744979858,
	"learning_rate": 0.0001622728036753959,
	"loss": 0.6908,
	"step": 915
	},
	{
	"epoch": 3.59375,
	"grad_norm": 0.20113444328308105,
	"learning_rate": 0.00016173791815707051,
	"loss": 0.7101,
	"step": 920
	},
	{
	"epoch": 3.61328125,
	"grad_norm": 0.2057623565196991,
	"learning_rate": 0.000161200163014037,
	"loss": 0.7179,
	"step": 925
	},
	{
	"epoch": 3.6328125,
	"grad_norm": 0.21178101003170013,
	"learning_rate": 0.00016065956324155746,
	"loss": 0.7015,
	"step": 930
	},
	{
	"epoch": 3.65234375,
	"grad_norm": 0.21164196729660034,
	"learning_rate": 0.0001601161439671145,
	"loss": 0.6955,
	"step": 935
	},
	{
	"epoch": 3.671875,
	"grad_norm": 0.21989427506923676,
	"learning_rate": 0.00015956993044924334,
	"loss": 0.6972,
	"step": 940
	},
	{
	"epoch": 3.69140625,
	"grad_norm": 0.20968452095985413,
	"learning_rate": 0.0001590209480763576,
	"loss": 0.6986,
	"step": 945
	},
	{
	"epoch": 3.7109375,
	"grad_norm": 0.20064401626586914,
	"learning_rate": 0.00015846922236556946,
	"loss": 0.7073,
	"step": 950
	},
	{
	"epoch": 3.73046875,
	"grad_norm": 0.2390391230583191,
	"learning_rate": 0.00015791477896150347,
	"loss": 0.6958,
	"step": 955
	},
	{
	"epoch": 3.75,
	"grad_norm": 0.21184207499027252,
	"learning_rate": 0.0001573576436351046,
	"loss": 0.7008,
	"step": 960
	},
	{
	"epoch": 3.76953125,
	"grad_norm": 0.21932272613048553,
	"learning_rate": 0.00015679784228244043,
	"loss": 0.6904,
	"step": 965
	},
	{
	"epoch": 3.7890625,
	"grad_norm": 0.19908711314201355,
	"learning_rate": 0.00015623540092349732,
	"loss": 0.6991,
	"step": 970
	},
	{
	"epoch": 3.80859375,
	"grad_norm": 0.22039274871349335,
	"learning_rate": 0.00015567034570097125,
	"loss": 0.6959,
	"step": 975
	},
	{
	"epoch": 3.828125,
	"grad_norm": 0.21224038302898407,
	"learning_rate": 0.0001551027028790524,
	"loss": 0.6976,
	"step": 980
	},
	{
	"epoch": 3.84765625,
	"grad_norm": 0.21021129190921783,
	"learning_rate": 0.00015453249884220464,
	"loss": 0.6976,
	"step": 985
	},
	{
	"epoch": 3.8671875,
	"grad_norm": 0.2202974110841751,
	"learning_rate": 0.00015395976009393894,
	"loss": 0.6995,
	"step": 990
	},
	{
	"epoch": 3.88671875,
	"grad_norm": 0.21578259766101837,
	"learning_rate": 0.0001533845132555816,
	"loss": 0.6882,
	"step": 995
	},
	{
	"epoch": 3.90625,
	"grad_norm": 0.1979641318321228,
	"learning_rate": 0.0001528067850650368,
	"loss": 0.6961,
	"step": 1000
	},
	{
	"epoch": 3.92578125,
	"grad_norm": 0.20889665186405182,
	"learning_rate": 0.00015222660237554383,
	"loss": 0.7,
	"step": 1005
	},
	{
	"epoch": 3.9453125,
	"grad_norm": 0.20623871684074402,
	"learning_rate": 0.00015164399215442898,
	"loss": 0.6985,
	"step": 1010
	},
	{
	"epoch": 3.96484375,
	"grad_norm": 0.2109537273645401,
	"learning_rate": 0.00015105898148185193,
	"loss": 0.7026,
	"step": 1015
	},
	{
	"epoch": 3.984375,
	"grad_norm": 0.20740477740764618,
	"learning_rate": 0.0001504715975495472,
	"loss": 0.7053,
	"step": 1020
	},
	{
	"epoch": 4.0,
	"eval_loss": 2.0418636798858643,
	"eval_runtime": 0.5376,
	"eval_samples_per_second": 11.162,
	"eval_steps_per_second": 1.86,
	"step": 1024
	},
	{
	"epoch": 4.00390625,
	"grad_norm": 0.2116871029138565,
	"learning_rate": 0.00014988186765956029,
	"loss": 0.6923,
	"step": 1025
	},
	{
	"epoch": 4.0234375,
	"grad_norm": 0.20054052770137787,
	"learning_rate": 0.00014928981922297842,
	"loss": 0.6717,
	"step": 1030
	},
	{
	"epoch": 4.04296875,
	"grad_norm": 0.2238766998052597,
	"learning_rate": 0.00014869547975865664,
	"loss": 0.6719,
	"step": 1035
	},
	{
	"epoch": 4.0625,
	"grad_norm": 0.2156434804201126,
	"learning_rate": 0.00014809887689193877,
	"loss": 0.6718,
	"step": 1040
	},
	{
	"epoch": 4.08203125,
	"grad_norm": 0.2189694195985794,
	"learning_rate": 0.00014750003835337316,
	"loss": 0.677,
	"step": 1045
	},
	{
	"epoch": 4.1015625,
	"grad_norm": 0.2283412218093872,
	"learning_rate": 0.0001468989919774239,
	"loss": 0.6724,
	"step": 1050
	},
	{
	"epoch": 4.12109375,
	"grad_norm": 0.2534675598144531,
	"learning_rate": 0.00014629576570117709,
	"loss": 0.6842,
	"step": 1055
	},
	{
	"epoch": 4.140625,
	"grad_norm": 0.24277372658252716,
	"learning_rate": 0.00014569038756304207,
	"loss": 0.676,
	"step": 1060
	},
	{
	"epoch": 4.16015625,
	"grad_norm": 0.2335975170135498,
	"learning_rate": 0.0001450828857014485,
	"loss": 0.6861,
	"step": 1065
	},
	{
	"epoch": 4.1796875,
	"grad_norm": 0.22338411211967468,
	"learning_rate": 0.0001444732883535382,
	"loss": 0.6784,
	"step": 1070
	},
	{
	"epoch": 4.19921875,
	"grad_norm": 0.22138862311840057,
	"learning_rate": 0.00014386162385385278,
	"loss": 0.6765,
	"step": 1075
	},
	{
	"epoch": 4.21875,
	"grad_norm": 0.20274129509925842,
	"learning_rate": 0.00014324792063301662,
	"loss": 0.6762,
	"step": 1080
	},
	{
	"epoch": 4.23828125,
	"grad_norm": 0.20809794962406158,
	"learning_rate": 0.00014263220721641543,
	"loss": 0.6954,
	"step": 1085
	},
	{
	"epoch": 4.2578125,
	"grad_norm": 0.21727928519248962,
	"learning_rate": 0.00014201451222287025,
	"loss": 0.682,
	"step": 1090
	},
	{
	"epoch": 4.27734375,
	"grad_norm": 0.21408621966838837,
	"learning_rate": 0.00014139486436330736,
	"loss": 0.6817,
	"step": 1095
	},
	{
	"epoch": 4.296875,
	"grad_norm": 0.2173791378736496,
	"learning_rate": 0.00014077329243942369,
	"loss": 0.6775,
	"step": 1100
	},
	{
	"epoch": 4.31640625,
	"grad_norm": 0.21154190599918365,
	"learning_rate": 0.0001401498253423481,
	"loss": 0.6793,
	"step": 1105
	},
	{
	"epoch": 4.3359375,
	"grad_norm": 0.2106465995311737,
	"learning_rate": 0.00013952449205129855,
	"loss": 0.6736,
	"step": 1110
	},
	{
	"epoch": 4.35546875,
	"grad_norm": 0.20029598474502563,
	"learning_rate": 0.00013889732163223516,
	"loss": 0.6759,
	"step": 1115
	},
	{
	"epoch": 4.375,
	"grad_norm": 0.21185144782066345,
	"learning_rate": 0.000138268343236509,
	"loss": 0.6777,
	"step": 1120
	},
	{
	"epoch": 4.39453125,
	"grad_norm": 0.2037803679704666,
	"learning_rate": 0.0001376375860995073,
	"loss": 0.6818,
	"step": 1125
	},
	{
	"epoch": 4.4140625,
	"grad_norm": 0.21110603213310242,
	"learning_rate": 0.00013700507953929463,
	"loss": 0.675,
	"step": 1130
	},
	{
	"epoch": 4.43359375,
	"grad_norm": 0.2060796022415161,
	"learning_rate": 0.00013637085295524988,
	"loss": 0.679,
	"step": 1135
	},
	{
	"epoch": 4.453125,
	"grad_norm": 0.2184733897447586,
	"learning_rate": 0.00013573493582670003,
	"loss": 0.6859,
	"step": 1140
	},
	{
	"epoch": 4.47265625,
	"grad_norm": 0.21656639873981476,
	"learning_rate": 0.00013509735771154987,
	"loss": 0.685,
	"step": 1145
	},
	{
	"epoch": 4.4921875,
	"grad_norm": 0.219607412815094,
	"learning_rate": 0.00013445814824490805,
	"loss": 0.6814,
	"step": 1150
	},
	{
	"epoch": 4.51171875,
	"grad_norm": 0.2204212099313736,
	"learning_rate": 0.00013381733713770967,
	"loss": 0.6845,
	"step": 1155
	},
	{
	"epoch": 4.53125,
	"grad_norm": 0.2118123322725296,
	"learning_rate": 0.00013317495417533524,
	"loss": 0.6751,
	"step": 1160
	},
	{
	"epoch": 4.55078125,
	"grad_norm": 0.2175564020872116,
	"learning_rate": 0.0001325310292162263,
	"loss": 0.6813,
	"step": 1165
	},
	{
	"epoch": 4.5703125,
	"grad_norm": 0.2186279296875,
	"learning_rate": 0.0001318855921904976,
	"loss": 0.6869,
	"step": 1170
	},
	{
	"epoch": 4.58984375,
	"grad_norm": 0.21257956326007843,
	"learning_rate": 0.0001312386730985459,
	"loss": 0.6834,
	"step": 1175
	},
	{
	"epoch": 4.609375,
	"grad_norm": 0.20661357045173645,
	"learning_rate": 0.00013059030200965536,
	"loss": 0.7001,
	"step": 1180
	},
	{
	"epoch": 4.62890625,
	"grad_norm": 0.22517681121826172,
	"learning_rate": 0.00012994050906060017,
	"loss": 0.6717,
	"step": 1185
	},
	{
	"epoch": 4.6484375,
	"grad_norm": 0.22090637683868408,
	"learning_rate": 0.00012928932445424365,
	"loss": 0.678,
	"step": 1190
	},
	{
	"epoch": 4.66796875,
	"grad_norm": 0.21545428037643433,
	"learning_rate": 0.00012863677845813433,
	"loss": 0.6819,
	"step": 1195
	},
	{
	"epoch": 4.6875,
	"grad_norm": 0.209136962890625,
	"learning_rate": 0.00012798290140309923,
	"loss": 0.6862,
	"step": 1200
	},
	{
	"epoch": 4.70703125,
	"grad_norm": 0.20853549242019653,
	"learning_rate": 0.00012732772368183388,
	"loss": 0.6719,
	"step": 1205
	},
	{
	"epoch": 4.7265625,
	"grad_norm": 0.2124202698469162,
	"learning_rate": 0.00012667127574748986,
	"loss": 0.6819,
	"step": 1210
	},
	{
	"epoch": 4.74609375,
	"grad_norm": 0.2243855744600296,
	"learning_rate": 0.00012601358811225913,
	"loss": 0.6743,
	"step": 1215
	},
	{
	"epoch": 4.765625,
	"grad_norm": 0.21978437900543213,
	"learning_rate": 0.00012535469134595595,
	"loss": 0.6924,
	"step": 1220
	},
	{
	"epoch": 4.78515625,
	"grad_norm": 0.20108923316001892,
	"learning_rate": 0.00012469461607459583,
	"loss": 0.6836,
	"step": 1225
	},
	{
	"epoch": 4.8046875,
	"grad_norm": 0.21921634674072266,
	"learning_rate": 0.0001240333929789721,
	"loss": 0.6764,
	"step": 1230
	},
	{
	"epoch": 4.82421875,
	"grad_norm": 0.21365371346473694,
	"learning_rate": 0.00012337105279322988,
	"loss": 0.6843,
	"step": 1235
	},
	{
	"epoch": 4.84375,
	"grad_norm": 0.20987005531787872,
	"learning_rate": 0.00012270762630343734,
	"loss": 0.6746,
	"step": 1240
	},
	{
	"epoch": 4.86328125,
	"grad_norm": 0.20794980227947235,
	"learning_rate": 0.00012204314434615501,
	"loss": 0.6815,
	"step": 1245
	},
	{
	"epoch": 4.8828125,
	"grad_norm": 0.21553441882133484,
	"learning_rate": 0.00012137763780700227,
	"loss": 0.6795,
	"step": 1250
	},
	{
	"epoch": 4.90234375,
	"grad_norm": 0.2035866528749466,
	"learning_rate": 0.00012071113761922186,
	"loss": 0.6828,
	"step": 1255
	},
	{
	"epoch": 4.921875,
	"grad_norm": 0.2061247080564499,
	"learning_rate": 0.00012004367476224206,
	"loss": 0.6838,
	"step": 1260
	},
	{
	"epoch": 4.94140625,
	"grad_norm": 0.21384355425834656,
	"learning_rate": 0.0001193752802602367,
	"loss": 0.6902,
	"step": 1265
	},
	{
	"epoch": 4.9609375,
	"grad_norm": 0.21918757259845734,
	"learning_rate": 0.0001187059851806832,
	"loss": 0.6853,
	"step": 1270
	},
	{
	"epoch": 4.98046875,
	"grad_norm": 0.20853689312934875,
	"learning_rate": 0.00011803582063291849,
	"loss": 0.6693,
	"step": 1275
	},
	{
	"epoch": 5.0,
	"grad_norm": 0.2089415341615677,
	"learning_rate": 0.00011736481776669306,
	"loss": 0.6831,
	"step": 1280
	},
	{
	"epoch": 5.0,
	"eval_loss": 2.05405592918396,
	"eval_runtime": 0.5395,
	"eval_samples_per_second": 11.122,
	"eval_steps_per_second": 1.854,
	"step": 1280
	},
	{
	"epoch": 5.01953125,
	"grad_norm": 0.21040305495262146,
	"learning_rate": 0.00011669300777072298,
	"loss": 0.6597,
	"step": 1285
	},
	{
	"epoch": 5.0390625,
	"grad_norm": 0.2179408222436905,
	"learning_rate": 0.00011602042187124045,
	"loss": 0.6675,
	"step": 1290
	},
	{
	"epoch": 5.05859375,
	"grad_norm": 0.20846475660800934,
	"learning_rate": 0.0001153470913305421,
	"loss": 0.6643,
	"step": 1295
	},
	{
	"epoch": 5.078125,
	"grad_norm": 0.2074786126613617,
	"learning_rate": 0.00011467304744553618,
	"loss": 0.6656,
	"step": 1300
	},
	{
	"epoch": 5.09765625,
	"grad_norm": 0.2094477117061615,
	"learning_rate": 0.00011399832154628767,
	"loss": 0.6544,
	"step": 1305
	},
	{
	"epoch": 5.1171875,
	"grad_norm": 0.21982310712337494,
	"learning_rate": 0.000113322944994562,
	"loss": 0.6549,
	"step": 1310
	},
	{
	"epoch": 5.13671875,
	"grad_norm": 0.23372633755207062,
	"learning_rate": 0.00011264694918236753,
	"loss": 0.6567,
	"step": 1315
	},
	{
	"epoch": 5.15625,
	"grad_norm": 0.21253670752048492,
	"learning_rate": 0.00011197036553049625,
	"loss": 0.657,
	"step": 1320
	},
	{
	"epoch": 5.17578125,
	"grad_norm": 0.21819843351840973,
	"learning_rate": 0.00011129322548706342,
	"loss": 0.6624,
	"step": 1325
	},
	{
	"epoch": 5.1953125,
	"grad_norm": 0.22048228979110718,
	"learning_rate": 0.00011061556052604578,
	"loss": 0.6617,
	"step": 1330
	},
	{
	"epoch": 5.21484375,
	"grad_norm": 0.21444514393806458,
	"learning_rate": 0.00010993740214581856,
	"loss": 0.6714,
	"step": 1335
	},
	{
	"epoch": 5.234375,
	"grad_norm": 0.20963872969150543,
	"learning_rate": 0.00010925878186769158,
	"loss": 0.6554,
	"step": 1340
	},
	{
	"epoch": 5.25390625,
	"grad_norm": 0.21605953574180603,
	"learning_rate": 0.000108579731234444,
	"loss": 0.6625,
	"step": 1345
	},
	{
	"epoch": 5.2734375,
	"grad_norm": 0.2186332494020462,
	"learning_rate": 0.00010790028180885821,
	"loss": 0.659,
	"step": 1350
	},
	{
	"epoch": 5.29296875,
	"grad_norm": 0.20879332721233368,
	"learning_rate": 0.00010722046517225271,
	"loss": 0.6574,
	"step": 1355
	},
	{
	"epoch": 5.3125,
	"grad_norm": 0.20964272320270538,
	"learning_rate": 0.00010654031292301432,
	"loss": 0.6495,
	"step": 1360
	},
	{
	"epoch": 5.33203125,
	"grad_norm": 0.22066867351531982,
	"learning_rate": 0.00010585985667512934,
	"loss": 0.6657,
	"step": 1365
	},
	{
	"epoch": 5.3515625,
	"grad_norm": 0.21919472515583038,
	"learning_rate": 0.00010517912805671419,
	"loss": 0.6663,
	"step": 1370
	},
	{
	"epoch": 5.37109375,
	"grad_norm": 0.20911991596221924,
	"learning_rate": 0.00010449815870854525,
	"loss": 0.6655,
	"step": 1375
	},
	{
	"epoch": 5.390625,
	"grad_norm": 0.21343956887722015,
	"learning_rate": 0.00010381698028258817,
	"loss": 0.6538,
	"step": 1380
	},
	{
	"epoch": 5.41015625,
	"grad_norm": 0.23448581993579865,
	"learning_rate": 0.00010313562444052677,
	"loss": 0.6745,
	"step": 1385
	},
	{
	"epoch": 5.4296875,
	"grad_norm": 0.2224402278661728,
	"learning_rate": 0.00010245412285229124,
	"loss": 0.6659,
	"step": 1390
	},
	{
	"epoch": 5.44921875,
	"grad_norm": 0.21760495007038116,
	"learning_rate": 0.0001017725071945862,
	"loss": 0.6574,
	"step": 1395
	},
	{
	"epoch": 5.46875,
	"grad_norm": 0.21981921792030334,
	"learning_rate": 0.00010109080914941824,
	"loss": 0.6639,
	"step": 1400
	},
	{
	"epoch": 5.48828125,
	"grad_norm": 0.22708064317703247,
	"learning_rate": 0.00010040906040262348,
	"loss": 0.6601,
	"step": 1405
	},
	{
	"epoch": 5.5078125,
	"grad_norm": 0.21901877224445343,
	"learning_rate": 9.972729264239461e-05,
	"loss": 0.6708,
	"step": 1410
	},
	{
	"epoch": 5.52734375,
	"grad_norm": 0.21920931339263916,
	"learning_rate": 9.904553755780815e-05,
	"loss": 0.6588,
	"step": 1415
	},
	{
	"epoch": 5.546875,
	"grad_norm": 0.2086167186498642,
	"learning_rate": 9.836382683735132e-05,
	"loss": 0.6689,
	"step": 1420
	},
	{
	"epoch": 5.56640625,
	"grad_norm": 0.2135404795408249,
	"learning_rate": 9.768219216744942e-05,
	"loss": 0.6709,
	"step": 1425
	},
	{
	"epoch": 5.5859375,
	"grad_norm": 0.2296486496925354,
	"learning_rate": 9.700066523099273e-05,
	"loss": 0.6768,
	"step": 1430
	},
	{
	"epoch": 5.60546875,
	"grad_norm": 0.22231514751911163,
	"learning_rate": 9.631927770586412e-05,
	"loss": 0.6662,
	"step": 1435
	},
	{
	"epoch": 5.625,
	"grad_norm": 0.21092720329761505,
	"learning_rate": 9.563806126346642e-05,
	"loss": 0.6563,
	"step": 1440
	},
	{
	"epoch": 5.64453125,
	"grad_norm": 0.2081764191389084,
	"learning_rate": 9.495704756725041e-05,
	"loss": 0.6599,
	"step": 1445
	},
	{
	"epoch": 5.6640625,
	"grad_norm": 0.21930693089962006,
	"learning_rate": 9.427626827124317e-05,
	"loss": 0.6645,
	"step": 1450
	},
	{
	"epoch": 5.68359375,
	"grad_norm": 0.22238822281360626,
	"learning_rate": 9.359575501857651e-05,
	"loss": 0.6653,
	"step": 1455
	},
	{
	"epoch": 5.703125,
	"grad_norm": 0.21201257407665253,
	"learning_rate": 9.29155394400166e-05,
	"loss": 0.675,
	"step": 1460
	},
	{
	"epoch": 5.72265625,
	"grad_norm": 0.21970124542713165,
	"learning_rate": 9.223565315249325e-05,
	"loss": 0.6719,
	"step": 1465
	},
	{
	"epoch": 5.7421875,
	"grad_norm": 0.20852448046207428,
	"learning_rate": 9.155612775763069e-05,
	"loss": 0.6701,
	"step": 1470
	},
	{
	"epoch": 5.76171875,
	"grad_norm": 0.2180168330669403,
	"learning_rate": 9.087699484027857e-05,
	"loss": 0.658,
	"step": 1475
	},
	{
	"epoch": 5.78125,
	"grad_norm": 0.211044043302536,
	"learning_rate": 9.019828596704394e-05,
	"loss": 0.6526,
	"step": 1480
	},
	{
	"epoch": 5.80078125,
	"grad_norm": 0.20980176329612732,
	"learning_rate": 8.95200326848239e-05,
	"loss": 0.6548,
	"step": 1485
	},
	{
	"epoch": 5.8203125,
	"grad_norm": 0.20603534579277039,
	"learning_rate": 8.884226651933927e-05,
	"loss": 0.6644,
	"step": 1490
	},
	{
	"epoch": 5.83984375,
	"grad_norm": 0.20811837911605835,
	"learning_rate": 8.816501897366953e-05,
	"loss": 0.6703,
	"step": 1495
	},
	{
	"epoch": 5.859375,
	"grad_norm": 0.2105432003736496,
	"learning_rate": 8.74883215267881e-05,
	"loss": 0.6649,
	"step": 1500
	},
	{
	"epoch": 5.87890625,
	"grad_norm": 0.22339750826358795,
	"learning_rate": 8.681220563209955e-05,
	"loss": 0.6687,
	"step": 1505
	},
	{
	"epoch": 5.8984375,
	"grad_norm": 0.20943927764892578,
	"learning_rate": 8.613670271597733e-05,
	"loss": 0.663,
	"step": 1510
	},
	{
	"epoch": 5.91796875,
	"grad_norm": 0.20441389083862305,
	"learning_rate": 8.546184417630338e-05,
	"loss": 0.6663,
	"step": 1515
	},
	{
	"epoch": 5.9375,
	"grad_norm": 0.21287420392036438,
	"learning_rate": 8.478766138100834e-05,
	"loss": 0.6727,
	"step": 1520
	},
	{
	"epoch": 5.95703125,
	"grad_norm": 0.21163299679756165,
	"learning_rate": 8.411418566661388e-05,
	"loss": 0.6643,
	"step": 1525
	},
	{
	"epoch": 5.9765625,
	"grad_norm": 0.20541082322597504,
	"learning_rate": 8.344144833677594e-05,
	"loss": 0.6605,
	"step": 1530
	},
	{
	"epoch": 5.99609375,
	"grad_norm": 0.21405570209026337,
	"learning_rate": 8.27694806608298e-05,
	"loss": 0.6633,
	"step": 1535
	},
	{
	"epoch": 6.0,
	"eval_loss": 2.0744192600250244,
	"eval_runtime": 0.5398,
	"eval_samples_per_second": 11.115,
	"eval_steps_per_second": 1.853,
	"step": 1536
	},
	{
	"epoch": 6.015625,
	"grad_norm": 0.21526320278644562,
	"learning_rate": 8.209831387233676e-05,
	"loss": 0.6479,
	"step": 1540
	},
	{
	"epoch": 6.03515625,
	"grad_norm": 0.217779740691185,
	"learning_rate": 8.142797916763209e-05,
	"loss": 0.6536,
	"step": 1545
	},
	{
	"epoch": 6.0546875,
	"grad_norm": 0.22583958506584167,
	"learning_rate": 8.075850770437534e-05,
	"loss": 0.6532,
	"step": 1550
	},
	{
	"epoch": 6.07421875,
	"grad_norm": 0.24157458543777466,
	"learning_rate": 8.008993060010183e-05,
	"loss": 0.6426,
	"step": 1555
	},
	{
	"epoch": 6.09375,
	"grad_norm": 0.2280224710702896,
	"learning_rate": 7.942227893077652e-05,
	"loss": 0.6482,
	"step": 1560
	},
	{
	"epoch": 6.11328125,
	"grad_norm": 0.21372312307357788,
	"learning_rate": 7.875558372934936e-05,
	"loss": 0.6448,
	"step": 1565
	},
	{
	"epoch": 6.1328125,
	"grad_norm": 0.22514766454696655,
	"learning_rate": 7.808987598431303e-05,
	"loss": 0.6506,
	"step": 1570
	},
	{
	"epoch": 6.15234375,
	"grad_norm": 0.22178982198238373,
	"learning_rate": 7.742518663826246e-05,
	"loss": 0.6404,
	"step": 1575
	},
	{
	"epoch": 6.171875,
	"grad_norm": 0.21459142863750458,
	"learning_rate": 7.676154658645656e-05,
	"loss": 0.6557,
	"step": 1580
	},
	{
	"epoch": 6.19140625,
	"grad_norm": 0.22397801280021667,
	"learning_rate": 7.609898667538243e-05,
	"loss": 0.6445,
	"step": 1585
	},
	{
	"epoch": 6.2109375,
	"grad_norm": 0.22123484313488007,
	"learning_rate": 7.543753770132127e-05,
	"loss": 0.6375,
	"step": 1590
	},
	{
	"epoch": 6.23046875,
	"grad_norm": 0.2259218543767929,
	"learning_rate": 7.477723040891717e-05,
	"loss": 0.6486,
	"step": 1595
	},
	{
	"epoch": 6.25,
	"grad_norm": 0.21872185170650482,
	"learning_rate": 7.411809548974792e-05,
	"loss": 0.6546,
	"step": 1600
	},
	{
	"epoch": 6.26953125,
	"grad_norm": 0.2340991348028183,
	"learning_rate": 7.346016358089867e-05,
	"loss": 0.6573,
	"step": 1605
	},
	{
	"epoch": 6.2890625,
	"grad_norm": 0.2258559614419937,
	"learning_rate": 7.280346526353759e-05,
	"loss": 0.6485,
	"step": 1610
	},
	{
	"epoch": 6.30859375,
	"grad_norm": 0.21842586994171143,
	"learning_rate": 7.21480310614947e-05,
	"loss": 0.6452,
	"step": 1615
	},
	{
	"epoch": 6.328125,
	"grad_norm": 0.22392797470092773,
	"learning_rate": 7.149389143984295e-05,
	"loss": 0.6467,
	"step": 1620
	},
	{
	"epoch": 6.34765625,
	"grad_norm": 0.21205224096775055,
	"learning_rate": 7.084107680348218e-05,
	"loss": 0.6502,
	"step": 1625
	},
	{
	"epoch": 6.3671875,
	"grad_norm": 0.22041639685630798,
	"learning_rate": 7.018961749572604e-05,
	"loss": 0.6502,
	"step": 1630
	},
	{
	"epoch": 6.38671875,
	"grad_norm": 0.21791093051433563,
	"learning_rate": 6.953954379689136e-05,
	"loss": 0.6553,
	"step": 1635
	},
	{
	"epoch": 6.40625,
	"grad_norm": 0.22223076224327087,
	"learning_rate": 6.889088592289093e-05,
	"loss": 0.639,
	"step": 1640
	},
	{
	"epoch": 6.42578125,
	"grad_norm": 0.2151210606098175,
	"learning_rate": 6.824367402382885e-05,
	"loss": 0.655,
	"step": 1645
	},
	{
	"epoch": 6.4453125,
	"grad_norm": 0.2196204513311386,
	"learning_rate": 6.759793818259933e-05,
	"loss": 0.6549,
	"step": 1650
	},
	{
	"epoch": 6.46484375,
	"grad_norm": 0.21881859004497528,
	"learning_rate": 6.69537084134882e-05,
	"loss": 0.6516,
	"step": 1655
	},
	{
	"epoch": 6.484375,
	"grad_norm": 0.21970680356025696,
	"learning_rate": 6.6311014660778e-05,
	"loss": 0.6531,
	"step": 1660
	},
	{
	"epoch": 6.50390625,
	"grad_norm": 0.21640105545520782,
	"learning_rate": 6.566988679735606e-05,
	"loss": 0.6474,
	"step": 1665
	},
	{
	"epoch": 6.5234375,
	"grad_norm": 0.225670725107193,
	"learning_rate": 6.503035462332592e-05,
	"loss": 0.6437,
	"step": 1670
	},
	{
	"epoch": 6.54296875,
	"grad_norm": 0.20938833057880402,
	"learning_rate": 6.439244786462245e-05,
	"loss": 0.6526,
	"step": 1675
	},
	{
	"epoch": 6.5625,
	"grad_norm": 0.21592438220977783,
	"learning_rate": 6.375619617162985e-05,
	"loss": 0.6528,
	"step": 1680
	},
	{
	"epoch": 6.58203125,
	"grad_norm": 0.22665540874004364,
	"learning_rate": 6.312162911780368e-05,
	"loss": 0.6502,
	"step": 1685
	},
	{
	"epoch": 6.6015625,
	"grad_norm": 0.2195620834827423,
	"learning_rate": 6.248877619829619e-05,
	"loss": 0.6469,
	"step": 1690
	},
	{
	"epoch": 6.62109375,
	"grad_norm": 0.22165308892726898,
	"learning_rate": 6.185766682858546e-05,
	"loss": 0.6518,
	"step": 1695
	},
	{
	"epoch": 6.640625,
	"grad_norm": 0.22840096056461334,
	"learning_rate": 6.122833034310793e-05,
	"loss": 0.6506,
	"step": 1700
	},
	{
	"epoch": 6.66015625,
	"grad_norm": 0.22422266006469727,
	"learning_rate": 6.060079599389521e-05,
	"loss": 0.6559,
	"step": 1705
	},
	{
	"epoch": 6.6796875,
	"grad_norm": 0.22363343834877014,
	"learning_rate": 5.9975092949214116e-05,
	"loss": 0.6449,
	"step": 1710
	},
	{
	"epoch": 6.69921875,
	"grad_norm": 0.2213827222585678,
	"learning_rate": 5.935125029221111e-05,
	"loss": 0.65,
	"step": 1715
	},
	{
	"epoch": 6.71875,
	"grad_norm": 0.2290297895669937,
	"learning_rate": 5.872929701956054e-05,
	"loss": 0.6476,
	"step": 1720
	},
	{
	"epoch": 6.73828125,
	"grad_norm": 0.23118211328983307,
	"learning_rate": 5.810926204011658e-05,
	"loss": 0.6511,
	"step": 1725
	},
	{
	"epoch": 6.7578125,
	"grad_norm": 0.22112269699573517,
	"learning_rate": 5.749117417356988e-05,
	"loss": 0.6481,
	"step": 1730
	},
	{
	"epoch": 6.77734375,
	"grad_norm": 0.21454501152038574,
	"learning_rate": 5.687506214910765e-05,
	"loss": 0.6492,
	"step": 1735
	},
	{
	"epoch": 6.796875,
	"grad_norm": 0.22518618404865265,
	"learning_rate": 5.6260954604078585e-05,
	"loss": 0.6515,
	"step": 1740
	},
	{
	"epoch": 6.81640625,
	"grad_norm": 0.23013541102409363,
	"learning_rate": 5.564888008266165e-05,
	"loss": 0.6563,
	"step": 1745
	},
	{
	"epoch": 6.8359375,
	"grad_norm": 0.21959349513053894,
	"learning_rate": 5.503886703453933e-05,
	"loss": 0.6504,
	"step": 1750
	},
	{
	"epoch": 6.85546875,
	"grad_norm": 0.23238404095172882,
	"learning_rate": 5.4430943813575375e-05,
	"loss": 0.6575,
	"step": 1755
	},
	{
	"epoch": 6.875,
	"grad_norm": 0.21891681849956512,
	"learning_rate": 5.382513867649663e-05,
	"loss": 0.6415,
	"step": 1760
	},
	{
	"epoch": 6.89453125,
	"grad_norm": 0.2155328243970871,
	"learning_rate": 5.3221479781579955e-05,
	"loss": 0.6498,
	"step": 1765
	},
	{
	"epoch": 6.9140625,
	"grad_norm": 0.21803325414657593,
	"learning_rate": 5.261999518734322e-05,
	"loss": 0.6439,
	"step": 1770
	},
	{
	"epoch": 6.93359375,
	"grad_norm": 0.21531429886817932,
	"learning_rate": 5.202071285124119e-05,
	"loss": 0.6486,
	"step": 1775
	},
	{
	"epoch": 6.953125,
	"grad_norm": 0.22126588225364685,
	"learning_rate": 5.142366062836599e-05,
	"loss": 0.6453,
	"step": 1780
	},
	{
	"epoch": 6.97265625,
	"grad_norm": 0.21690168976783752,
	"learning_rate": 5.082886627015246e-05,
	"loss": 0.6564,
	"step": 1785
	},
	{
	"epoch": 6.9921875,
	"grad_norm": 0.22704558074474335,
	"learning_rate": 5.023635742308807e-05,
	"loss": 0.6595,
	"step": 1790
	},
	{
	"epoch": 7.0,
	"eval_loss": 2.0813868045806885,
	"eval_runtime": 0.5387,
	"eval_samples_per_second": 11.138,
	"eval_steps_per_second": 1.856,
	"step": 1792
	},
	{
	"epoch": 7.01171875,
	"grad_norm": 0.21671408414840698,
	"learning_rate": 4.964616162742826e-05,
	"loss": 0.6478,
	"step": 1795
	},
	{
	"epoch": 7.03125,
	"grad_norm": 0.2322429120540619,
	"learning_rate": 4.9058306315915826e-05,
	"loss": 0.6355,
	"step": 1800
	},
	{
	"epoch": 7.05078125,
	"grad_norm": 0.22516188025474548,
	"learning_rate": 4.84728188125063e-05,
	"loss": 0.6343,
	"step": 1805
	},
	{
	"epoch": 7.0703125,
	"grad_norm": 0.22370575368404388,
	"learning_rate": 4.7889726331097686e-05,
	"loss": 0.6388,
	"step": 1810
	},
	{
	"epoch": 7.08984375,
	"grad_norm": 0.22702112793922424,
	"learning_rate": 4.7309055974265435e-05,
	"loss": 0.6405,
	"step": 1815
	},
	{
	"epoch": 7.109375,
	"grad_norm": 0.2213263362646103,
	"learning_rate": 4.6730834732003104e-05,
	"loss": 0.6369,
	"step": 1820
	},
	{
	"epoch": 7.12890625,
	"grad_norm": 0.2283063679933548,
	"learning_rate": 4.615508948046726e-05,
	"loss": 0.6406,
	"step": 1825
	},
	{
	"epoch": 7.1484375,
	"grad_norm": 0.22583836317062378,
	"learning_rate": 4.5581846980728794e-05,
	"loss": 0.6396,
	"step": 1830
	},
	{
	"epoch": 7.16796875,
	"grad_norm": 0.223560631275177,
	"learning_rate": 4.50111338775287e-05,
	"loss": 0.6487,
	"step": 1835
	},
	{
	"epoch": 7.1875,
	"grad_norm": 0.2752554714679718,
	"learning_rate": 4.444297669803981e-05,
	"loss": 0.6399,
	"step": 1840
	},
	{
	"epoch": 7.20703125,
	"grad_norm": 0.22124579548835754,
	"learning_rate": 4.387740185063358e-05,
	"loss": 0.6413,
	"step": 1845
	},
	{
	"epoch": 7.2265625,
	"grad_norm": 0.22053855657577515,
	"learning_rate": 4.331443562365285e-05,
	"loss": 0.6377,
	"step": 1850
	},
	{
	"epoch": 7.24609375,
	"grad_norm": 0.22650252282619476,
	"learning_rate": 4.275410418418979e-05,
	"loss": 0.6441,
	"step": 1855
	},
	{
	"epoch": 7.265625,
	"grad_norm": 0.2277732640504837,
	"learning_rate": 4.219643357686967e-05,
	"loss": 0.6472,
	"step": 1860
	},
	{
	"epoch": 7.28515625,
	"grad_norm": 0.21958424150943756,
	"learning_rate": 4.1641449722640336e-05,
	"loss": 0.6434,
	"step": 1865
	},
	{
	"epoch": 7.3046875,
	"grad_norm": 0.22781191766262054,
	"learning_rate": 4.1089178417567164e-05,
	"loss": 0.6436,
	"step": 1870
	},
	{
	"epoch": 7.32421875,
	"grad_norm": 0.22724145650863647,
	"learning_rate": 4.0539645331634504e-05,
	"loss": 0.6365,
	"step": 1875
	},
	{
	"epoch": 7.34375,
	"grad_norm": 0.22402629256248474,
	"learning_rate": 3.999287600755192e-05,
	"loss": 0.6404,
	"step": 1880
	},
	{
	"epoch": 7.36328125,
	"grad_norm": 0.22256724536418915,
	"learning_rate": 3.944889585956746e-05,
	"loss": 0.6385,
	"step": 1885
	},
	{
	"epoch": 7.3828125,
	"grad_norm": 0.2245977371931076,
	"learning_rate": 3.8907730172286124e-05,
	"loss": 0.6402,
	"step": 1890
	},
	{
	"epoch": 7.40234375,
	"grad_norm": 0.2223842293024063,
	"learning_rate": 3.8369404099494574e-05,
	"loss": 0.6401,
	"step": 1895
	},
	{
	"epoch": 7.421875,
	"grad_norm": 0.228043794631958,
	"learning_rate": 3.783394266299228e-05,
	"loss": 0.6456,
	"step": 1900
	},
	{
	"epoch": 7.44140625,
	"grad_norm": 0.22321034967899323,
	"learning_rate": 3.730137075142802e-05,
	"loss": 0.6461,
	"step": 1905
	},
	{
	"epoch": 7.4609375,
	"grad_norm": 0.2202451378107071,
	"learning_rate": 3.677171311914346e-05,
	"loss": 0.6404,
	"step": 1910
	},
	{
	"epoch": 7.48046875,
	"grad_norm": 0.23069259524345398,
	"learning_rate": 3.624499438502229e-05,
	"loss": 0.6399,
	"step": 1915
	},
	{
	"epoch": 7.5,
	"grad_norm": 0.22767633199691772,
	"learning_rate": 3.5721239031346066e-05,
	"loss": 0.6365,
	"step": 1920
	},
	{
	"epoch": 7.51953125,
	"grad_norm": 0.223536416888237,
	"learning_rate": 3.520047140265618e-05,
	"loss": 0.6398,
	"step": 1925
	},
	{
	"epoch": 7.5390625,
	"grad_norm": 0.2236379086971283,
	"learning_rate": 3.468271570462235e-05,
	"loss": 0.6374,
	"step": 1930
	},
	{
	"epoch": 7.55859375,
	"grad_norm": 0.22322149574756622,
	"learning_rate": 3.41679960029174e-05,
	"loss": 0.6411,
	"step": 1935
	},
	{
	"epoch": 7.578125,
	"grad_norm": 0.22714544832706451,
	"learning_rate": 3.365633622209891e-05,
	"loss": 0.6281,
	"step": 1940
	},
	{
	"epoch": 7.59765625,
	"grad_norm": 0.23407664895057678,
	"learning_rate": 3.314776014449694e-05,
	"loss": 0.6342,
	"step": 1945
	},
	{
	"epoch": 7.6171875,
	"grad_norm": 0.2269096076488495,
	"learning_rate": 3.2642291409108775e-05,
	"loss": 0.6462,
	"step": 1950
	},
	{
	"epoch": 7.63671875,
	"grad_norm": 0.21775776147842407,
	"learning_rate": 3.213995351050011e-05,
	"loss": 0.6442,
	"step": 1955
	},
	{
	"epoch": 7.65625,
	"grad_norm": 0.21870321035385132,
	"learning_rate": 3.164076979771287e-05,
	"loss": 0.6391,
	"step": 1960
	},
	{
	"epoch": 7.67578125,
	"grad_norm": 0.24278177320957184,
	"learning_rate": 3.1144763473180285e-05,
	"loss": 0.6351,
	"step": 1965
	},
	{
	"epoch": 7.6953125,
	"grad_norm": 0.222146674990654,
	"learning_rate": 3.065195759164797e-05,
	"loss": 0.6442,
	"step": 1970
	},
	{
	"epoch": 7.71484375,
	"grad_norm": 0.23037941753864288,
	"learning_rate": 3.016237505910272e-05,
	"loss": 0.6391,
	"step": 1975
	},
	{
	"epoch": 7.734375,
	"grad_norm": 0.22653505206108093,
	"learning_rate": 2.9676038631707593e-05,
	"loss": 0.6364,
	"step": 1980
	},
	{
	"epoch": 7.75390625,
	"grad_norm": 0.22071927785873413,
	"learning_rate": 2.9192970914744132e-05,
	"loss": 0.6436,
	"step": 1985
	},
	{
	"epoch": 7.7734375,
	"grad_norm": 0.2352590709924698,
	"learning_rate": 2.8713194361562036e-05,
	"loss": 0.6389,
	"step": 1990
	},
	{
	"epoch": 7.79296875,
	"grad_norm": 0.23165152966976166,
	"learning_rate": 2.8236731272534967e-05,
	"loss": 0.6359,
	"step": 1995
	},
	{
	"epoch": 7.8125,
	"grad_norm": 0.22592546045780182,
	"learning_rate": 2.776360379402445e-05,
	"loss": 0.6452,
	"step": 2000
	},
	{
	"epoch": 7.83203125,
	"grad_norm": 0.22005808353424072,
	"learning_rate": 2.72938339173503e-05,
	"loss": 0.6362,
	"step": 2005
	},
	{
	"epoch": 7.8515625,
	"grad_norm": 0.22496894001960754,
	"learning_rate": 2.6827443477768454e-05,
	"loss": 0.6363,
	"step": 2010
	},
	{
	"epoch": 7.87109375,
	"grad_norm": 0.23299238085746765,
	"learning_rate": 2.6364454153456108e-05,
	"loss": 0.6376,
	"step": 2015
	},
	{
	"epoch": 7.890625,
	"grad_norm": 0.21800798177719116,
	"learning_rate": 2.5904887464504114e-05,
	"loss": 0.6316,
	"step": 2020
	},
	{
	"epoch": 7.91015625,
	"grad_norm": 0.22942836582660675,
	"learning_rate": 2.544876477191652e-05,
	"loss": 0.6408,
	"step": 2025
	},
	{
	"epoch": 7.9296875,
	"grad_norm": 0.22502020001411438,
	"learning_rate": 2.4996107276618008e-05,
	"loss": 0.6281,
	"step": 2030
	},
	{
	"epoch": 7.94921875,
	"grad_norm": 0.22493688762187958,
	"learning_rate": 2.454693601846819e-05,
	"loss": 0.6374,
	"step": 2035
	},
	{
	"epoch": 7.96875,
	"grad_norm": 0.22121860086917877,
	"learning_rate": 2.4101271875283817e-05,
	"loss": 0.6301,
	"step": 2040
	},
	{
	"epoch": 7.98828125,
	"grad_norm": 0.22293226420879364,
	"learning_rate": 2.3659135561868305e-05,
	"loss": 0.6374,
	"step": 2045
	},
	{
	"epoch": 8.0,
	"eval_loss": 2.093949556350708,
	"eval_runtime": 0.5398,
	"eval_samples_per_second": 11.115,
	"eval_steps_per_second": 1.852,
	"step": 2048
	},
	{
	"epoch": 8.0078125,
	"grad_norm": 0.22147591412067413,
	"learning_rate": 2.3220547629048796e-05,
	"loss": 0.6318,
	"step": 2050
	},
	{
	"epoch": 8.02734375,
	"grad_norm": 0.22781990468502045,
	"learning_rate": 2.2785528462721238e-05,
	"loss": 0.6301,
	"step": 2055
	},
	{
	"epoch": 8.046875,
	"grad_norm": 0.22302427887916565,
	"learning_rate": 2.2354098282902446e-05,
	"loss": 0.6194,
	"step": 2060
	},
	{
	"epoch": 8.06640625,
	"grad_norm": 0.2345212697982788,
	"learning_rate": 2.1926277142790552e-05,
	"loss": 0.6284,
	"step": 2065
	},
	{
	"epoch": 8.0859375,
	"grad_norm": 0.22880584001541138,
	"learning_rate": 2.1502084927832845e-05,
	"loss": 0.6394,
	"step": 2070
	},
	{
	"epoch": 8.10546875,
	"grad_norm": 0.23197947442531586,
	"learning_rate": 2.1081541354801292e-05,
	"loss": 0.6414,
	"step": 2075
	},
	{
	"epoch": 8.125,
	"grad_norm": 0.2195805162191391,
	"learning_rate": 2.0664665970876496e-05,
	"loss": 0.6274,
	"step": 2080
	},
	{
	"epoch": 8.14453125,
	"grad_norm": 0.2231413722038269,
	"learning_rate": 2.025147815273867e-05,
	"loss": 0.6325,
	"step": 2085
	},
	{
	"epoch": 8.1640625,
	"grad_norm": 0.22956664860248566,
	"learning_rate": 1.9841997105667275e-05,
	"loss": 0.6345,
	"step": 2090
	},
	{
	"epoch": 8.18359375,
	"grad_norm": 0.22590646147727966,
	"learning_rate": 1.943624186264832e-05,
	"loss": 0.6276,
	"step": 2095
	},
	{
	"epoch": 8.203125,
	"grad_norm": 0.2267957627773285,
	"learning_rate": 1.903423128348959e-05,
	"loss": 0.6243,
	"step": 2100
	},
	{
	"epoch": 8.22265625,
	"grad_norm": 0.22633960843086243,
	"learning_rate": 1.8635984053944122e-05,
	"loss": 0.6279,
	"step": 2105
	},
	{
	"epoch": 8.2421875,
	"grad_norm": 0.22983397543430328,
	"learning_rate": 1.824151868484164e-05,
	"loss": 0.6347,
	"step": 2110
	},
	{
	"epoch": 8.26171875,
	"grad_norm": 0.21901904046535492,
	"learning_rate": 1.7850853511228115e-05,
	"loss": 0.6364,
	"step": 2115
	},
	{
	"epoch": 8.28125,
	"grad_norm": 0.2256007343530655,
	"learning_rate": 1.7464006691513623e-05,
	"loss": 0.628,
	"step": 2120
	},
	{
	"epoch": 8.30078125,
	"grad_norm": 0.2304702252149582,
	"learning_rate": 1.7080996206628307e-05,
	"loss": 0.6202,
	"step": 2125
	},
	{
	"epoch": 8.3203125,
	"grad_norm": 0.22724899649620056,
	"learning_rate": 1.6701839859186542e-05,
	"loss": 0.6401,
	"step": 2130
	},
	{
	"epoch": 8.33984375,
	"grad_norm": 0.22017619013786316,
	"learning_rate": 1.632655527265958e-05,
	"loss": 0.6348,
	"step": 2135
	},
	{
	"epoch": 8.359375,
	"grad_norm": 0.221891850233078,
	"learning_rate": 1.595515989055618e-05,
	"loss": 0.6306,
	"step": 2140
	},
	{
	"epoch": 8.37890625,
	"grad_norm": 0.2255999892950058,
	"learning_rate": 1.558767097561219e-05,
	"loss": 0.6436,
	"step": 2145
	},
	{
	"epoch": 8.3984375,
	"grad_norm": 0.2337878942489624,
	"learning_rate": 1.5224105608987704e-05,
	"loss": 0.6256,
	"step": 2150
	},
	{
	"epoch": 8.41796875,
	"grad_norm": 0.2235851138830185,
	"learning_rate": 1.486448068947348e-05,
	"loss": 0.6328,
	"step": 2155
	},
	{
	"epoch": 8.4375,
	"grad_norm": 0.2308977097272873,
	"learning_rate": 1.4508812932705363e-05,
	"loss": 0.6353,
	"step": 2160
	},
	{
	"epoch": 8.45703125,
	"grad_norm": 0.22785401344299316,
	"learning_rate": 1.4157118870387155e-05,
	"loss": 0.6375,
	"step": 2165
	},
	{
	"epoch": 8.4765625,
	"grad_norm": 0.24056580662727356,
	"learning_rate": 1.3809414849522584e-05,
	"loss": 0.6343,
	"step": 2170
	},
	{
	"epoch": 8.49609375,
	"grad_norm": 0.22777673602104187,
	"learning_rate": 1.3465717031655056e-05,
	"loss": 0.6336,
	"step": 2175
	},
	{
	"epoch": 8.515625,
	"grad_norm": 0.23098915815353394,
	"learning_rate": 1.3126041392116772e-05,
	"loss": 0.6296,
	"step": 2180
	},
	{
	"epoch": 8.53515625,
	"grad_norm": 0.2298251986503601,
	"learning_rate": 1.2790403719286049e-05,
	"loss": 0.6305,
	"step": 2185
	},
	{
	"epoch": 8.5546875,
	"grad_norm": 0.22145819664001465,
	"learning_rate": 1.2458819613853468e-05,
	"loss": 0.6262,
	"step": 2190
	},
	{
	"epoch": 8.57421875,
	"grad_norm": 0.2244306206703186,
	"learning_rate": 1.2131304488096772e-05,
	"loss": 0.6225,
	"step": 2195
	},
	{
	"epoch": 8.59375,
	"grad_norm": 0.22416800260543823,
	"learning_rate": 1.1807873565164506e-05,
	"loss": 0.6309,
	"step": 2200
	},
	{
	"epoch": 8.61328125,
	"grad_norm": 0.22584258019924164,
	"learning_rate": 1.148854187836833e-05,
	"loss": 0.6318,
	"step": 2205
	},
	{
	"epoch": 8.6328125,
	"grad_norm": 0.2320922613143921,
	"learning_rate": 1.1173324270484397e-05,
	"loss": 0.6352,
	"step": 2210
	},
	{
	"epoch": 8.65234375,
	"grad_norm": 0.2240631878376007,
	"learning_rate": 1.0862235393063413e-05,
	"loss": 0.6279,
	"step": 2215
	},
	{
	"epoch": 8.671875,
	"grad_norm": 0.2261231392621994,
	"learning_rate": 1.0555289705749483e-05,
	"loss": 0.6299,
	"step": 2220
	},
	{
	"epoch": 8.69140625,
	"grad_norm": 0.22478684782981873,
	"learning_rate": 1.025250147560829e-05,
	"loss": 0.639,
	"step": 2225
	},
	{
	"epoch": 8.7109375,
	"grad_norm": 0.22566542029380798,
	"learning_rate": 9.953884776463652e-06,
	"loss": 0.63,
	"step": 2230
	},
	{
	"epoch": 8.73046875,
	"grad_norm": 0.23023688793182373,
	"learning_rate": 9.659453488243575e-06,
	"loss": 0.6439,
	"step": 2235
	},
	{
	"epoch": 8.75,
	"grad_norm": 0.22487542033195496,
	"learning_rate": 9.369221296335006e-06,
	"loss": 0.6421,
	"step": 2240
	},
	{
	"epoch": 8.76953125,
	"grad_norm": 0.22670140862464905,
	"learning_rate": 9.083201690947763e-06,
	"loss": 0.6331,
	"step": 2245
	},
	{
	"epoch": 8.7890625,
	"grad_norm": 0.2248082160949707,
	"learning_rate": 8.801407966487486e-06,
	"loss": 0.6216,
	"step": 2250
	},
	{
	"epoch": 8.80859375,
	"grad_norm": 0.23012250661849976,
	"learning_rate": 8.52385322093765e-06,
	"loss": 0.6452,
	"step": 2255
	},
	{
	"epoch": 8.828125,
	"grad_norm": 0.22810766100883484,
	"learning_rate": 8.250550355250875e-06,
	"loss": 0.6395,
	"step": 2260
	},
	{
	"epoch": 8.84765625,
	"grad_norm": 0.22482182085514069,
	"learning_rate": 7.981512072749198e-06,
	"loss": 0.6316,
	"step": 2265
	},
	{
	"epoch": 8.8671875,
	"grad_norm": 0.22704395651817322,
	"learning_rate": 7.71675087853364e-06,
	"loss": 0.6389,
	"step": 2270
	},
	{
	"epoch": 8.88671875,
	"grad_norm": 0.2339123636484146,
	"learning_rate": 7.456279078902928e-06,
	"loss": 0.639,
	"step": 2275
	},
	{
	"epoch": 8.90625,
	"grad_norm": 0.2283734679222107,
	"learning_rate": 7.200108780781556e-06,
	"loss": 0.6312,
	"step": 2280
	},
	{
	"epoch": 8.92578125,
	"grad_norm": 0.23632891476154327,
	"learning_rate": 6.948251891156932e-06,
	"loss": 0.6336,
	"step": 2285
	},
	{
	"epoch": 8.9453125,
	"grad_norm": 0.22593176364898682,
	"learning_rate": 6.700720116526116e-06,
	"loss": 0.6382,
	"step": 2290
	},
	{
	"epoch": 8.96484375,
	"grad_norm": 0.2195340245962143,
	"learning_rate": 6.457524962351469e-06,
	"loss": 0.627,
	"step": 2295
	},
	{
	"epoch": 8.984375,
	"grad_norm": 0.2304958701133728,
	"learning_rate": 6.218677732526035e-06,
	"loss": 0.6277,
	"step": 2300
	},
	{
	"epoch": 9.0,
	"eval_loss": 2.0994203090667725,
	"eval_runtime": 0.5356,
	"eval_samples_per_second": 11.202,
	"eval_steps_per_second": 1.867,
	"step": 2304
	},
	{
	"epoch": 9.00390625,
	"grad_norm": 0.2239326387643814,
	"learning_rate": 5.984189528848095e-06,
	"loss": 0.6333,
	"step": 2305
	},
	{
	"epoch": 9.0234375,
	"grad_norm": 0.21830931305885315,
	"learning_rate": 5.7540712505050444e-06,
	"loss": 0.6303,
	"step": 2310
	},
	{
	"epoch": 9.04296875,
	"grad_norm": 0.2230663150548935,
	"learning_rate": 5.528333593567014e-06,
	"loss": 0.6266,
	"step": 2315
	},
	{
	"epoch": 9.0625,
	"grad_norm": 0.22621068358421326,
	"learning_rate": 5.306987050489442e-06,
	"loss": 0.6273,
	"step": 2320
	},
	{
	"epoch": 9.08203125,
	"grad_norm": 0.2257871776819229,
	"learning_rate": 5.090041909625542e-06,
	"loss": 0.6171,
	"step": 2325
	},
	{
	"epoch": 9.1015625,
	"grad_norm": 0.22467824816703796,
	"learning_rate": 4.877508254748076e-06,
	"loss": 0.6256,
	"step": 2330
	},
	{
	"epoch": 9.12109375,
	"grad_norm": 0.22441822290420532,
	"learning_rate": 4.669395964580614e-06,
	"loss": 0.6247,
	"step": 2335
	},
	{
	"epoch": 9.140625,
	"grad_norm": 0.22599612176418304,
	"learning_rate": 4.465714712338398e-06,
	"loss": 0.6204,
	"step": 2340
	},
	{
	"epoch": 9.16015625,
	"grad_norm": 0.22301939129829407,
	"learning_rate": 4.26647396527865e-06,
	"loss": 0.634,
	"step": 2345
	},
	{
	"epoch": 9.1796875,
	"grad_norm": 0.23274029791355133,
	"learning_rate": 4.071682984260638e-06,
	"loss": 0.6256,
	"step": 2350
	},
	{
	"epoch": 9.19921875,
	"grad_norm": 0.23097610473632812,
	"learning_rate": 3.881350823315177e-06,
	"loss": 0.6293,
	"step": 2355
	},
	{
	"epoch": 9.21875,
	"grad_norm": 0.23166796565055847,
	"learning_rate": 3.6954863292237297e-06,
	"loss": 0.6294,
	"step": 2360
	},
	{
	"epoch": 9.23828125,
	"grad_norm": 0.22876545786857605,
	"learning_rate": 3.514098141107314e-06,
	"loss": 0.6298,
	"step": 2365
	},
	{
	"epoch": 9.2578125,
	"grad_norm": 0.22338230907917023,
	"learning_rate": 3.3371946900248473e-06,
	"loss": 0.6264,
	"step": 2370
	},
	{
	"epoch": 9.27734375,
	"grad_norm": 0.2302178293466568,
	"learning_rate": 3.1647841985813164e-06,
	"loss": 0.627,
	"step": 2375
	},
	{
	"epoch": 9.296875,
	"grad_norm": 0.2242288738489151,
	"learning_rate": 2.996874680545603e-06,
	"loss": 0.6336,
	"step": 2380
	},
	{
	"epoch": 9.31640625,
	"grad_norm": 0.22500120103359222,
	"learning_rate": 2.8334739404779375e-06,
	"loss": 0.6264,
	"step": 2385
	},
	{
	"epoch": 9.3359375,
	"grad_norm": 0.23554645478725433,
	"learning_rate": 2.674589573367192e-06,
	"loss": 0.6213,
	"step": 2390
	},
	{
	"epoch": 9.35546875,
	"grad_norm": 0.2254471480846405,
	"learning_rate": 2.5202289642778375e-06,
	"loss": 0.6348,
	"step": 2395
	},
	{
	"epoch": 9.375,
	"grad_norm": 0.22407911717891693,
	"learning_rate": 2.3703992880066638e-06,
	"loss": 0.6294,
	"step": 2400
	},
	{
	"epoch": 9.39453125,
	"grad_norm": 0.22965936362743378,
	"learning_rate": 2.2251075087493355e-06,
	"loss": 0.64,
	"step": 2405
	},
	{
	"epoch": 9.4140625,
	"grad_norm": 0.22874490916728973,
	"learning_rate": 2.0843603797766287e-06,
	"loss": 0.6313,
	"step": 2410
	},
	{
	"epoch": 9.43359375,
	"grad_norm": 0.22413046658039093,
	"learning_rate": 1.9481644431206036e-06,
	"loss": 0.6229,
	"step": 2415
	},
	{
	"epoch": 9.453125,
	"grad_norm": 0.2280588150024414,
	"learning_rate": 1.8165260292704711e-06,
	"loss": 0.6265,
	"step": 2420
	},
	{
	"epoch": 9.47265625,
	"grad_norm": 0.22689659893512726,
	"learning_rate": 1.6894512568783716e-06,
	"loss": 0.6272,
	"step": 2425
	},
	{
	"epoch": 9.4921875,
	"grad_norm": 0.23052698373794556,
	"learning_rate": 1.5669460324749586e-06,
	"loss": 0.6408,
	"step": 2430
	},
	{
	"epoch": 9.51171875,
	"grad_norm": 0.22765642404556274,
	"learning_rate": 1.4490160501948735e-06,
	"loss": 0.644,
	"step": 2435
	},
	{
	"epoch": 9.53125,
	"grad_norm": 0.22766034305095673,
	"learning_rate": 1.3356667915121025e-06,
	"loss": 0.6249,
	"step": 2440
	},
	{
	"epoch": 9.55078125,
	"grad_norm": 0.22794398665428162,
	"learning_rate": 1.2269035249851236e-06,
	"loss": 0.6318,
	"step": 2445
	},
	{
	"epoch": 9.5703125,
	"grad_norm": 0.22712871432304382,
	"learning_rate": 1.1227313060120926e-06,
	"loss": 0.6359,
	"step": 2450
	},
	{
	"epoch": 9.58984375,
	"grad_norm": 0.22914738953113556,
	"learning_rate": 1.0231549765958192e-06,
	"loss": 0.6389,
	"step": 2455
	},
	{
	"epoch": 9.609375,
	"grad_norm": 0.22300153970718384,
	"learning_rate": 9.281791651187366e-07,
	"loss": 0.6356,
	"step": 2460
	},
	{
	"epoch": 9.62890625,
	"grad_norm": 0.232873797416687,
	"learning_rate": 8.378082861277281e-07,
	"loss": 0.6272,
	"step": 2465
	},
	{
	"epoch": 9.6484375,
	"grad_norm": 0.227997824549675,
	"learning_rate": 7.520465401290033e-07,
	"loss": 0.633,
	"step": 2470
	},
	{
	"epoch": 9.66796875,
	"grad_norm": 0.21839286386966705,
	"learning_rate": 6.708979133927762e-07,
	"loss": 0.6215,
	"step": 2475
	},
	{
	"epoch": 9.6875,
	"grad_norm": 0.22753040492534637,
	"learning_rate": 5.943661777680354e-07,
	"loss": 0.6272,
	"step": 2480
	},
	{
	"epoch": 9.70703125,
	"grad_norm": 0.22866863012313843,
	"learning_rate": 5.224548905072402e-07,
	"loss": 0.6357,
	"step": 2485
	},
	{
	"epoch": 9.7265625,
	"grad_norm": 0.2306712120771408,
	"learning_rate": 4.5516739410087494e-07,
	"loss": 0.6244,
	"step": 2490
	},
	{
	"epoch": 9.74609375,
	"grad_norm": 0.22779209911823273,
	"learning_rate": 3.9250681612225116e-07,
	"loss": 0.6309,
	"step": 2495
	},
	{
	"epoch": 9.765625,
	"grad_norm": 0.22719816863536835,
	"learning_rate": 3.3447606908196817e-07,
	"loss": 0.628,
	"step": 2500
	},
	{
	"epoch": 9.78515625,
	"grad_norm": 0.23172929883003235,
	"learning_rate": 2.8107785029265476e-07,
	"loss": 0.6293,
	"step": 2505
	},
	{
	"epoch": 9.8046875,
	"grad_norm": 0.22468186914920807,
	"learning_rate": 2.3231464174352512e-07,
	"loss": 0.6368,
	"step": 2510
	},
	{
	"epoch": 9.82421875,
	"grad_norm": 0.22247561812400818,
	"learning_rate": 1.8818870998508208e-07,
	"loss": 0.6222,
	"step": 2515
	},
	{
	"epoch": 9.84375,
	"grad_norm": 0.22515320777893066,
	"learning_rate": 1.487021060236904e-07,
	"loss": 0.6266,
	"step": 2520
	},
	{
	"epoch": 9.86328125,
	"grad_norm": 0.23118971288204193,
	"learning_rate": 1.1385666522630845e-07,
	"loss": 0.6308,
	"step": 2525
	},
	{
	"epoch": 9.8828125,
	"grad_norm": 0.22416307032108307,
	"learning_rate": 8.365400723512328e-08,
	"loss": 0.6239,
	"step": 2530
	},
	{
	"epoch": 9.90234375,
	"grad_norm": 0.22984710335731506,
	"learning_rate": 5.8095535892332964e-08,
	"loss": 0.6362,
	"step": 2535
	},
	{
	"epoch": 9.921875,
	"grad_norm": 0.23102597892284393,
	"learning_rate": 3.7182439174832106e-08,
	"loss": 0.6365,
	"step": 2540
	},
	{
	"epoch": 9.94140625,
	"grad_norm": 0.2295123189687729,
	"learning_rate": 2.091568913904496e-08,
	"loss": 0.6397,
	"step": 2545
	},
	{
	"epoch": 9.9609375,
	"grad_norm": 0.22766011953353882,
	"learning_rate": 9.296041875683781e-09,
	"loss": 0.6274,
	"step": 2550
	},
	{
	"epoch": 9.98046875,
	"grad_norm": 0.2338954210281372,
	"learning_rate": 2.3240374746658077e-09,
	"loss": 0.6212,
	"step": 2555
	},
	{
	"epoch": 10.0,
	"grad_norm": 0.22291633486747742,
	"learning_rate": 0.0,
	"loss": 0.616,
	"step": 2560
	},
	{
	"epoch": 10.0,
	"eval_loss": 2.1007895469665527,
	"eval_runtime": 0.5705,
	"eval_samples_per_second": 10.518,
	"eval_steps_per_second": 1.753,
	"step": 2560
	},
	{
	"epoch": 10.0,
	"step": 2560,
	"total_flos": 7.568434414263206e+18,
	"train_loss": 0.7105431989766657,
	"train_runtime": 14792.6859,
	"train_samples_per_second": 11.056,
	"train_steps_per_second": 0.173
	}
	],
	"logging_steps": 5,
	"max_steps": 2560,
	"num_input_tokens_seen": 0,
	"num_train_epochs": 10,
	"save_steps": 100,
	"stateful_callbacks": {
	"TrainerControl": {
	"args": {
	"should_epoch_stop": false,
	"should_evaluate": false,
	"should_log": false,
	"should_save": true,
	"should_training_stop": true
	},
	"attributes": {}
	}
	},
	"total_flos": 7.568434414263206e+18,
	"train_batch_size": 8,
	"trial_name": null,
	"trial_params": null
	}