llama3-8b-closedqa-gpt4o-100k / trainer_state.json
chansung's picture
Model save
9060310 verified
raw
history blame
87.5 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 10.0,
"eval_steps": 500,
"global_step": 2560,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00390625,
"grad_norm": 1.813705325126648,
"learning_rate": 7.8125e-07,
"loss": 1.9071,
"step": 1
},
{
"epoch": 0.01953125,
"grad_norm": 1.431990385055542,
"learning_rate": 3.90625e-06,
"loss": 1.8608,
"step": 5
},
{
"epoch": 0.0390625,
"grad_norm": 1.281330943107605,
"learning_rate": 7.8125e-06,
"loss": 1.8263,
"step": 10
},
{
"epoch": 0.05859375,
"grad_norm": 1.310953140258789,
"learning_rate": 1.171875e-05,
"loss": 1.8193,
"step": 15
},
{
"epoch": 0.078125,
"grad_norm": 1.296993374824524,
"learning_rate": 1.5625e-05,
"loss": 1.7463,
"step": 20
},
{
"epoch": 0.09765625,
"grad_norm": 1.1856365203857422,
"learning_rate": 1.953125e-05,
"loss": 1.6844,
"step": 25
},
{
"epoch": 0.1171875,
"grad_norm": 3.376720905303955,
"learning_rate": 2.34375e-05,
"loss": 1.5861,
"step": 30
},
{
"epoch": 0.13671875,
"grad_norm": 3.182882785797119,
"learning_rate": 2.734375e-05,
"loss": 1.4328,
"step": 35
},
{
"epoch": 0.15625,
"grad_norm": 0.682467520236969,
"learning_rate": 3.125e-05,
"loss": 1.2702,
"step": 40
},
{
"epoch": 0.17578125,
"grad_norm": 0.9865962266921997,
"learning_rate": 3.5156250000000004e-05,
"loss": 1.1671,
"step": 45
},
{
"epoch": 0.1953125,
"grad_norm": 0.42747607827186584,
"learning_rate": 3.90625e-05,
"loss": 1.1303,
"step": 50
},
{
"epoch": 0.21484375,
"grad_norm": 0.42581626772880554,
"learning_rate": 4.2968750000000004e-05,
"loss": 1.101,
"step": 55
},
{
"epoch": 0.234375,
"grad_norm": 0.4914548099040985,
"learning_rate": 4.6875e-05,
"loss": 1.0586,
"step": 60
},
{
"epoch": 0.25390625,
"grad_norm": 0.39272716641426086,
"learning_rate": 5.0781250000000004e-05,
"loss": 1.0308,
"step": 65
},
{
"epoch": 0.2734375,
"grad_norm": 0.34394437074661255,
"learning_rate": 5.46875e-05,
"loss": 0.9998,
"step": 70
},
{
"epoch": 0.29296875,
"grad_norm": 0.3009032607078552,
"learning_rate": 5.8593750000000005e-05,
"loss": 0.9784,
"step": 75
},
{
"epoch": 0.3125,
"grad_norm": 0.27089548110961914,
"learning_rate": 6.25e-05,
"loss": 0.9653,
"step": 80
},
{
"epoch": 0.33203125,
"grad_norm": 0.25717490911483765,
"learning_rate": 6.640625e-05,
"loss": 0.9434,
"step": 85
},
{
"epoch": 0.3515625,
"grad_norm": 0.3018302917480469,
"learning_rate": 7.031250000000001e-05,
"loss": 0.9372,
"step": 90
},
{
"epoch": 0.37109375,
"grad_norm": 0.2254215031862259,
"learning_rate": 7.421875e-05,
"loss": 0.9236,
"step": 95
},
{
"epoch": 0.390625,
"grad_norm": 0.2384410947561264,
"learning_rate": 7.8125e-05,
"loss": 0.9145,
"step": 100
},
{
"epoch": 0.41015625,
"grad_norm": 0.2905459403991699,
"learning_rate": 8.203125e-05,
"loss": 0.9177,
"step": 105
},
{
"epoch": 0.4296875,
"grad_norm": 0.27646884322166443,
"learning_rate": 8.593750000000001e-05,
"loss": 0.9103,
"step": 110
},
{
"epoch": 0.44921875,
"grad_norm": 0.23843346536159515,
"learning_rate": 8.984375e-05,
"loss": 0.8911,
"step": 115
},
{
"epoch": 0.46875,
"grad_norm": 0.3110702931880951,
"learning_rate": 9.375e-05,
"loss": 0.8961,
"step": 120
},
{
"epoch": 0.48828125,
"grad_norm": 0.2591000199317932,
"learning_rate": 9.765625e-05,
"loss": 0.8911,
"step": 125
},
{
"epoch": 0.5078125,
"grad_norm": 0.2314710170030594,
"learning_rate": 0.00010156250000000001,
"loss": 0.8765,
"step": 130
},
{
"epoch": 0.52734375,
"grad_norm": 0.268370658159256,
"learning_rate": 0.00010546875,
"loss": 0.8759,
"step": 135
},
{
"epoch": 0.546875,
"grad_norm": 0.24689124524593353,
"learning_rate": 0.000109375,
"loss": 0.8714,
"step": 140
},
{
"epoch": 0.56640625,
"grad_norm": 0.28693222999572754,
"learning_rate": 0.00011328125,
"loss": 0.882,
"step": 145
},
{
"epoch": 0.5859375,
"grad_norm": 0.26165568828582764,
"learning_rate": 0.00011718750000000001,
"loss": 0.8638,
"step": 150
},
{
"epoch": 0.60546875,
"grad_norm": 0.2968839406967163,
"learning_rate": 0.00012109375,
"loss": 0.8562,
"step": 155
},
{
"epoch": 0.625,
"grad_norm": 0.2954418957233429,
"learning_rate": 0.000125,
"loss": 0.8569,
"step": 160
},
{
"epoch": 0.64453125,
"grad_norm": 0.30811259150505066,
"learning_rate": 0.00012890625,
"loss": 0.8455,
"step": 165
},
{
"epoch": 0.6640625,
"grad_norm": 0.2631295323371887,
"learning_rate": 0.0001328125,
"loss": 0.8574,
"step": 170
},
{
"epoch": 0.68359375,
"grad_norm": 0.25627005100250244,
"learning_rate": 0.00013671875,
"loss": 0.851,
"step": 175
},
{
"epoch": 0.703125,
"grad_norm": 0.28598853945732117,
"learning_rate": 0.00014062500000000002,
"loss": 0.8385,
"step": 180
},
{
"epoch": 0.72265625,
"grad_norm": 0.2502932548522949,
"learning_rate": 0.00014453125000000002,
"loss": 0.8457,
"step": 185
},
{
"epoch": 0.7421875,
"grad_norm": 0.3177507817745209,
"learning_rate": 0.0001484375,
"loss": 0.8319,
"step": 190
},
{
"epoch": 0.76171875,
"grad_norm": 0.27309176325798035,
"learning_rate": 0.00015234375,
"loss": 0.8511,
"step": 195
},
{
"epoch": 0.78125,
"grad_norm": 0.29295653104782104,
"learning_rate": 0.00015625,
"loss": 0.8373,
"step": 200
},
{
"epoch": 0.80078125,
"grad_norm": 0.27028167247772217,
"learning_rate": 0.00016015625,
"loss": 0.8319,
"step": 205
},
{
"epoch": 0.8203125,
"grad_norm": 0.40336114168167114,
"learning_rate": 0.0001640625,
"loss": 0.8245,
"step": 210
},
{
"epoch": 0.83984375,
"grad_norm": 0.3044915795326233,
"learning_rate": 0.00016796875000000001,
"loss": 0.8283,
"step": 215
},
{
"epoch": 0.859375,
"grad_norm": 0.29535970091819763,
"learning_rate": 0.00017187500000000002,
"loss": 0.8119,
"step": 220
},
{
"epoch": 0.87890625,
"grad_norm": 0.28554800152778625,
"learning_rate": 0.00017578125000000002,
"loss": 0.8091,
"step": 225
},
{
"epoch": 0.8984375,
"grad_norm": 0.26689431071281433,
"learning_rate": 0.0001796875,
"loss": 0.8189,
"step": 230
},
{
"epoch": 0.91796875,
"grad_norm": 0.29758790135383606,
"learning_rate": 0.00018359375,
"loss": 0.8122,
"step": 235
},
{
"epoch": 0.9375,
"grad_norm": 0.40431731939315796,
"learning_rate": 0.0001875,
"loss": 0.8155,
"step": 240
},
{
"epoch": 0.95703125,
"grad_norm": 0.27242639660835266,
"learning_rate": 0.00019140625,
"loss": 0.8119,
"step": 245
},
{
"epoch": 0.9765625,
"grad_norm": 0.3094847500324249,
"learning_rate": 0.0001953125,
"loss": 0.8058,
"step": 250
},
{
"epoch": 0.99609375,
"grad_norm": 0.32299983501434326,
"learning_rate": 0.00019921875000000001,
"loss": 0.8026,
"step": 255
},
{
"epoch": 1.0,
"eval_loss": 2.045611619949341,
"eval_runtime": 0.5394,
"eval_samples_per_second": 11.124,
"eval_steps_per_second": 1.854,
"step": 256
},
{
"epoch": 1.015625,
"grad_norm": 0.305078387260437,
"learning_rate": 0.00019999851261394218,
"loss": 0.7941,
"step": 260
},
{
"epoch": 1.03515625,
"grad_norm": 0.2842113673686981,
"learning_rate": 0.00019999247018391447,
"loss": 0.798,
"step": 265
},
{
"epoch": 1.0546875,
"grad_norm": 0.27524590492248535,
"learning_rate": 0.0001999817800289289,
"loss": 0.7911,
"step": 270
},
{
"epoch": 1.07421875,
"grad_norm": 0.2549247145652771,
"learning_rate": 0.00019996644264587193,
"loss": 0.7963,
"step": 275
},
{
"epoch": 1.09375,
"grad_norm": 0.253353089094162,
"learning_rate": 0.00019994645874763658,
"loss": 0.7904,
"step": 280
},
{
"epoch": 1.11328125,
"grad_norm": 0.23945719003677368,
"learning_rate": 0.00019992182926308942,
"loss": 0.7921,
"step": 285
},
{
"epoch": 1.1328125,
"grad_norm": 0.29668208956718445,
"learning_rate": 0.00019989255533702736,
"loss": 0.7943,
"step": 290
},
{
"epoch": 1.15234375,
"grad_norm": 0.26419156789779663,
"learning_rate": 0.0001998586383301244,
"loss": 0.7819,
"step": 295
},
{
"epoch": 1.171875,
"grad_norm": 0.3054077625274658,
"learning_rate": 0.00019982007981886847,
"loss": 0.7917,
"step": 300
},
{
"epoch": 1.19140625,
"grad_norm": 0.27965638041496277,
"learning_rate": 0.00019977688159548808,
"loss": 0.7854,
"step": 305
},
{
"epoch": 1.2109375,
"grad_norm": 0.23229017853736877,
"learning_rate": 0.00019972904566786903,
"loss": 0.7865,
"step": 310
},
{
"epoch": 1.23046875,
"grad_norm": 0.2789019048213959,
"learning_rate": 0.00019967657425946106,
"loss": 0.7821,
"step": 315
},
{
"epoch": 1.25,
"grad_norm": 0.24402114748954773,
"learning_rate": 0.00019961946980917456,
"loss": 0.7899,
"step": 320
},
{
"epoch": 1.26953125,
"grad_norm": 0.2749808132648468,
"learning_rate": 0.0001995577349712672,
"loss": 0.7783,
"step": 325
},
{
"epoch": 1.2890625,
"grad_norm": 0.2676057815551758,
"learning_rate": 0.00019949137261522052,
"loss": 0.7788,
"step": 330
},
{
"epoch": 1.30859375,
"grad_norm": 0.24829885363578796,
"learning_rate": 0.0001994203858256065,
"loss": 0.7714,
"step": 335
},
{
"epoch": 1.328125,
"grad_norm": 0.24872945249080658,
"learning_rate": 0.00019934477790194445,
"loss": 0.7832,
"step": 340
},
{
"epoch": 1.34765625,
"grad_norm": 0.2914537489414215,
"learning_rate": 0.00019926455235854724,
"loss": 0.7791,
"step": 345
},
{
"epoch": 1.3671875,
"grad_norm": 0.2692899703979492,
"learning_rate": 0.00019917971292435826,
"loss": 0.7739,
"step": 350
},
{
"epoch": 1.38671875,
"grad_norm": 0.2605401873588562,
"learning_rate": 0.000199090263542778,
"loss": 0.7717,
"step": 355
},
{
"epoch": 1.40625,
"grad_norm": 0.24468782544136047,
"learning_rate": 0.00019899620837148077,
"loss": 0.7694,
"step": 360
},
{
"epoch": 1.42578125,
"grad_norm": 0.2542877197265625,
"learning_rate": 0.00019889755178222147,
"loss": 0.7653,
"step": 365
},
{
"epoch": 1.4453125,
"grad_norm": 0.21375133097171783,
"learning_rate": 0.00019879429836063226,
"loss": 0.7854,
"step": 370
},
{
"epoch": 1.46484375,
"grad_norm": 0.24711847305297852,
"learning_rate": 0.00019868645290600955,
"loss": 0.773,
"step": 375
},
{
"epoch": 1.484375,
"grad_norm": 0.2352401316165924,
"learning_rate": 0.0001985740204310909,
"loss": 0.7641,
"step": 380
},
{
"epoch": 1.50390625,
"grad_norm": 0.2681073844432831,
"learning_rate": 0.00019845700616182206,
"loss": 0.7755,
"step": 385
},
{
"epoch": 1.5234375,
"grad_norm": 0.2394329458475113,
"learning_rate": 0.00019833541553711395,
"loss": 0.7635,
"step": 390
},
{
"epoch": 1.54296875,
"grad_norm": 0.27736565470695496,
"learning_rate": 0.00019820925420858991,
"loss": 0.7744,
"step": 395
},
{
"epoch": 1.5625,
"grad_norm": 0.2736864984035492,
"learning_rate": 0.00019807852804032305,
"loss": 0.7564,
"step": 400
},
{
"epoch": 1.58203125,
"grad_norm": 0.22882600128650665,
"learning_rate": 0.00019794324310856367,
"loss": 0.7703,
"step": 405
},
{
"epoch": 1.6015625,
"grad_norm": 0.2372276782989502,
"learning_rate": 0.0001978034057014568,
"loss": 0.7642,
"step": 410
},
{
"epoch": 1.62109375,
"grad_norm": 0.23550736904144287,
"learning_rate": 0.00019765902231874992,
"loss": 0.7513,
"step": 415
},
{
"epoch": 1.640625,
"grad_norm": 0.23483717441558838,
"learning_rate": 0.00019751009967149087,
"loss": 0.7485,
"step": 420
},
{
"epoch": 1.66015625,
"grad_norm": 0.23124265670776367,
"learning_rate": 0.00019735664468171587,
"loss": 0.7712,
"step": 425
},
{
"epoch": 1.6796875,
"grad_norm": 0.25672388076782227,
"learning_rate": 0.00019719866448212795,
"loss": 0.7635,
"step": 430
},
{
"epoch": 1.69921875,
"grad_norm": 0.2655965983867645,
"learning_rate": 0.00019703616641576514,
"loss": 0.7614,
"step": 435
},
{
"epoch": 1.71875,
"grad_norm": 0.22875700891017914,
"learning_rate": 0.00019686915803565934,
"loss": 0.7597,
"step": 440
},
{
"epoch": 1.73828125,
"grad_norm": 0.24324467778205872,
"learning_rate": 0.00019669764710448522,
"loss": 0.7592,
"step": 445
},
{
"epoch": 1.7578125,
"grad_norm": 0.23085905611515045,
"learning_rate": 0.00019652164159419946,
"loss": 0.7582,
"step": 450
},
{
"epoch": 1.77734375,
"grad_norm": 0.24821893870830536,
"learning_rate": 0.00019634114968567005,
"loss": 0.7565,
"step": 455
},
{
"epoch": 1.796875,
"grad_norm": 0.24690982699394226,
"learning_rate": 0.0001961561797682962,
"loss": 0.75,
"step": 460
},
{
"epoch": 1.81640625,
"grad_norm": 0.21277934312820435,
"learning_rate": 0.00019596674043961828,
"loss": 0.7499,
"step": 465
},
{
"epoch": 1.8359375,
"grad_norm": 0.2045515477657318,
"learning_rate": 0.0001957728405049183,
"loss": 0.7476,
"step": 470
},
{
"epoch": 1.85546875,
"grad_norm": 0.22809946537017822,
"learning_rate": 0.00019557448897681057,
"loss": 0.7554,
"step": 475
},
{
"epoch": 1.875,
"grad_norm": 0.2747824788093567,
"learning_rate": 0.0001953716950748227,
"loss": 0.7481,
"step": 480
},
{
"epoch": 1.89453125,
"grad_norm": 0.23395125567913055,
"learning_rate": 0.00019516446822496732,
"loss": 0.7579,
"step": 485
},
{
"epoch": 1.9140625,
"grad_norm": 0.2263769805431366,
"learning_rate": 0.00019495281805930367,
"loss": 0.7493,
"step": 490
},
{
"epoch": 1.93359375,
"grad_norm": 0.23396165668964386,
"learning_rate": 0.00019473675441549013,
"loss": 0.7523,
"step": 495
},
{
"epoch": 1.953125,
"grad_norm": 0.23420800268650055,
"learning_rate": 0.0001945162873363268,
"loss": 0.7469,
"step": 500
},
{
"epoch": 1.97265625,
"grad_norm": 0.19923944771289825,
"learning_rate": 0.00019429142706928868,
"loss": 0.7535,
"step": 505
},
{
"epoch": 1.9921875,
"grad_norm": 0.2181696891784668,
"learning_rate": 0.00019406218406604965,
"loss": 0.7532,
"step": 510
},
{
"epoch": 2.0,
"eval_loss": 2.031317949295044,
"eval_runtime": 0.5375,
"eval_samples_per_second": 11.164,
"eval_steps_per_second": 1.861,
"step": 512
},
{
"epoch": 2.01171875,
"grad_norm": 0.2611521780490875,
"learning_rate": 0.0001938285689819962,
"loss": 0.7349,
"step": 515
},
{
"epoch": 2.03125,
"grad_norm": 0.22077465057373047,
"learning_rate": 0.0001935905926757326,
"loss": 0.7309,
"step": 520
},
{
"epoch": 2.05078125,
"grad_norm": 0.2502357065677643,
"learning_rate": 0.00019334826620857583,
"loss": 0.7402,
"step": 525
},
{
"epoch": 2.0703125,
"grad_norm": 0.21151328086853027,
"learning_rate": 0.00019310160084404186,
"loss": 0.7263,
"step": 530
},
{
"epoch": 2.08984375,
"grad_norm": 0.22730891406536102,
"learning_rate": 0.00019285060804732158,
"loss": 0.7393,
"step": 535
},
{
"epoch": 2.109375,
"grad_norm": 0.29608404636383057,
"learning_rate": 0.00019259529948474833,
"loss": 0.7359,
"step": 540
},
{
"epoch": 2.12890625,
"grad_norm": 0.2048954963684082,
"learning_rate": 0.00019233568702325547,
"loss": 0.7327,
"step": 545
},
{
"epoch": 2.1484375,
"grad_norm": 0.24332541227340698,
"learning_rate": 0.0001920717827298248,
"loss": 0.723,
"step": 550
},
{
"epoch": 2.16796875,
"grad_norm": 0.27370956540107727,
"learning_rate": 0.0001918035988709256,
"loss": 0.7346,
"step": 555
},
{
"epoch": 2.1875,
"grad_norm": 0.27345338463783264,
"learning_rate": 0.00019153114791194473,
"loss": 0.7216,
"step": 560
},
{
"epoch": 2.20703125,
"grad_norm": 0.21915854513645172,
"learning_rate": 0.0001912544425166069,
"loss": 0.7297,
"step": 565
},
{
"epoch": 2.2265625,
"grad_norm": 0.23517705500125885,
"learning_rate": 0.0001909734955463863,
"loss": 0.7277,
"step": 570
},
{
"epoch": 2.24609375,
"grad_norm": 0.2082410454750061,
"learning_rate": 0.00019068832005990867,
"loss": 0.7274,
"step": 575
},
{
"epoch": 2.265625,
"grad_norm": 0.25212010741233826,
"learning_rate": 0.00019039892931234435,
"loss": 0.7388,
"step": 580
},
{
"epoch": 2.28515625,
"grad_norm": 0.22077186405658722,
"learning_rate": 0.0001901053367547922,
"loss": 0.7356,
"step": 585
},
{
"epoch": 2.3046875,
"grad_norm": 0.24918216466903687,
"learning_rate": 0.0001898075560336543,
"loss": 0.7283,
"step": 590
},
{
"epoch": 2.32421875,
"grad_norm": 0.2168445587158203,
"learning_rate": 0.00018950560099000182,
"loss": 0.7276,
"step": 595
},
{
"epoch": 2.34375,
"grad_norm": 0.3361542522907257,
"learning_rate": 0.00018919948565893142,
"loss": 0.7394,
"step": 600
},
{
"epoch": 2.36328125,
"grad_norm": 0.30473312735557556,
"learning_rate": 0.0001888892242689132,
"loss": 0.7214,
"step": 605
},
{
"epoch": 2.3828125,
"grad_norm": 0.22810065746307373,
"learning_rate": 0.00018857483124112907,
"loss": 0.7389,
"step": 610
},
{
"epoch": 2.40234375,
"grad_norm": 0.22486305236816406,
"learning_rate": 0.00018825632118880259,
"loss": 0.7382,
"step": 615
},
{
"epoch": 2.421875,
"grad_norm": 0.23797857761383057,
"learning_rate": 0.00018793370891651972,
"loss": 0.7352,
"step": 620
},
{
"epoch": 2.44140625,
"grad_norm": 0.22012600302696228,
"learning_rate": 0.00018760700941954065,
"loss": 0.7323,
"step": 625
},
{
"epoch": 2.4609375,
"grad_norm": 0.2505754232406616,
"learning_rate": 0.00018727623788310292,
"loss": 0.7319,
"step": 630
},
{
"epoch": 2.48046875,
"grad_norm": 0.23932820558547974,
"learning_rate": 0.0001869414096817154,
"loss": 0.7166,
"step": 635
},
{
"epoch": 2.5,
"grad_norm": 0.22623002529144287,
"learning_rate": 0.00018660254037844388,
"loss": 0.7254,
"step": 640
},
{
"epoch": 2.51953125,
"grad_norm": 0.24143099784851074,
"learning_rate": 0.0001862596457241875,
"loss": 0.7374,
"step": 645
},
{
"epoch": 2.5390625,
"grad_norm": 0.25545206665992737,
"learning_rate": 0.00018591274165694687,
"loss": 0.7268,
"step": 650
},
{
"epoch": 2.55859375,
"grad_norm": 0.27690452337265015,
"learning_rate": 0.00018556184430108293,
"loss": 0.7318,
"step": 655
},
{
"epoch": 2.578125,
"grad_norm": 0.21064211428165436,
"learning_rate": 0.00018520696996656788,
"loss": 0.7365,
"step": 660
},
{
"epoch": 2.59765625,
"grad_norm": 0.2418980747461319,
"learning_rate": 0.0001848481351482267,
"loss": 0.7252,
"step": 665
},
{
"epoch": 2.6171875,
"grad_norm": 0.21725673973560333,
"learning_rate": 0.00018448535652497073,
"loss": 0.7438,
"step": 670
},
{
"epoch": 2.63671875,
"grad_norm": 0.2051118165254593,
"learning_rate": 0.00018411865095902224,
"loss": 0.7272,
"step": 675
},
{
"epoch": 2.65625,
"grad_norm": 0.20715655386447906,
"learning_rate": 0.0001837480354951308,
"loss": 0.7189,
"step": 680
},
{
"epoch": 2.67578125,
"grad_norm": 0.224945530295372,
"learning_rate": 0.00018337352735978095,
"loss": 0.7283,
"step": 685
},
{
"epoch": 2.6953125,
"grad_norm": 0.2353772222995758,
"learning_rate": 0.0001829951439603915,
"loss": 0.7172,
"step": 690
},
{
"epoch": 2.71484375,
"grad_norm": 0.21377775073051453,
"learning_rate": 0.00018261290288450646,
"loss": 0.7245,
"step": 695
},
{
"epoch": 2.734375,
"grad_norm": 0.20290276408195496,
"learning_rate": 0.00018222682189897752,
"loss": 0.732,
"step": 700
},
{
"epoch": 2.75390625,
"grad_norm": 0.21785806119441986,
"learning_rate": 0.00018183691894913825,
"loss": 0.7142,
"step": 705
},
{
"epoch": 2.7734375,
"grad_norm": 0.21216203272342682,
"learning_rate": 0.00018144321215797,
"loss": 0.7163,
"step": 710
},
{
"epoch": 2.79296875,
"grad_norm": 0.20187579095363617,
"learning_rate": 0.0001810457198252595,
"loss": 0.7196,
"step": 715
},
{
"epoch": 2.8125,
"grad_norm": 0.21112394332885742,
"learning_rate": 0.00018064446042674828,
"loss": 0.7255,
"step": 720
},
{
"epoch": 2.83203125,
"grad_norm": 0.21814604103565216,
"learning_rate": 0.00018023945261327393,
"loss": 0.7244,
"step": 725
},
{
"epoch": 2.8515625,
"grad_norm": 0.2388346940279007,
"learning_rate": 0.00017983071520990315,
"loss": 0.719,
"step": 730
},
{
"epoch": 2.87109375,
"grad_norm": 0.2274855226278305,
"learning_rate": 0.00017941826721505684,
"loss": 0.7092,
"step": 735
},
{
"epoch": 2.890625,
"grad_norm": 0.2171526700258255,
"learning_rate": 0.0001790021277996269,
"loss": 0.7177,
"step": 740
},
{
"epoch": 2.91015625,
"grad_norm": 0.2128465622663498,
"learning_rate": 0.00017858231630608527,
"loss": 0.7245,
"step": 745
},
{
"epoch": 2.9296875,
"grad_norm": 0.2257278561592102,
"learning_rate": 0.0001781588522475848,
"loss": 0.7172,
"step": 750
},
{
"epoch": 2.94921875,
"grad_norm": 0.21227267384529114,
"learning_rate": 0.00017773175530705232,
"loss": 0.7208,
"step": 755
},
{
"epoch": 2.96875,
"grad_norm": 0.23267419636249542,
"learning_rate": 0.0001773010453362737,
"loss": 0.7188,
"step": 760
},
{
"epoch": 2.98828125,
"grad_norm": 0.21279846131801605,
"learning_rate": 0.00017686674235497125,
"loss": 0.7198,
"step": 765
},
{
"epoch": 3.0,
"eval_loss": 2.0403969287872314,
"eval_runtime": 0.5399,
"eval_samples_per_second": 11.113,
"eval_steps_per_second": 1.852,
"step": 768
},
{
"epoch": 3.0078125,
"grad_norm": 0.20591868460178375,
"learning_rate": 0.000176428866549873,
"loss": 0.7092,
"step": 770
},
{
"epoch": 3.02734375,
"grad_norm": 0.21006809175014496,
"learning_rate": 0.0001759874382737746,
"loss": 0.6982,
"step": 775
},
{
"epoch": 3.046875,
"grad_norm": 0.20914091169834137,
"learning_rate": 0.00017554247804459316,
"loss": 0.6986,
"step": 780
},
{
"epoch": 3.06640625,
"grad_norm": 0.21207676827907562,
"learning_rate": 0.0001750940065444136,
"loss": 0.7024,
"step": 785
},
{
"epoch": 3.0859375,
"grad_norm": 0.24130572378635406,
"learning_rate": 0.00017464204461852738,
"loss": 0.7011,
"step": 790
},
{
"epoch": 3.10546875,
"grad_norm": 0.22464986145496368,
"learning_rate": 0.0001741866132744636,
"loss": 0.6998,
"step": 795
},
{
"epoch": 3.125,
"grad_norm": 0.20956657826900482,
"learning_rate": 0.0001737277336810124,
"loss": 0.7068,
"step": 800
},
{
"epoch": 3.14453125,
"grad_norm": 0.21382799744606018,
"learning_rate": 0.00017326542716724128,
"loss": 0.6997,
"step": 805
},
{
"epoch": 3.1640625,
"grad_norm": 0.2018394023180008,
"learning_rate": 0.00017279971522150348,
"loss": 0.7057,
"step": 810
},
{
"epoch": 3.18359375,
"grad_norm": 0.20716731250286102,
"learning_rate": 0.00017233061949043928,
"loss": 0.6957,
"step": 815
},
{
"epoch": 3.203125,
"grad_norm": 0.21063964068889618,
"learning_rate": 0.0001718581617779698,
"loss": 0.6989,
"step": 820
},
{
"epoch": 3.22265625,
"grad_norm": 0.21001911163330078,
"learning_rate": 0.0001713823640442837,
"loss": 0.7065,
"step": 825
},
{
"epoch": 3.2421875,
"grad_norm": 0.21537743508815765,
"learning_rate": 0.0001709032484048162,
"loss": 0.7001,
"step": 830
},
{
"epoch": 3.26171875,
"grad_norm": 0.21781504154205322,
"learning_rate": 0.00017042083712922131,
"loss": 0.7076,
"step": 835
},
{
"epoch": 3.28125,
"grad_norm": 0.21302708983421326,
"learning_rate": 0.00016993515264033672,
"loss": 0.6965,
"step": 840
},
{
"epoch": 3.30078125,
"grad_norm": 0.2185572385787964,
"learning_rate": 0.00016944621751314144,
"loss": 0.7046,
"step": 845
},
{
"epoch": 3.3203125,
"grad_norm": 0.21651025116443634,
"learning_rate": 0.0001689540544737067,
"loss": 0.7042,
"step": 850
},
{
"epoch": 3.33984375,
"grad_norm": 0.22459545731544495,
"learning_rate": 0.0001684586863981394,
"loss": 0.7133,
"step": 855
},
{
"epoch": 3.359375,
"grad_norm": 0.21320843696594238,
"learning_rate": 0.00016796013631151897,
"loss": 0.7106,
"step": 860
},
{
"epoch": 3.37890625,
"grad_norm": 0.22854122519493103,
"learning_rate": 0.00016745842738682712,
"loss": 0.6987,
"step": 865
},
{
"epoch": 3.3984375,
"grad_norm": 0.22366014122962952,
"learning_rate": 0.00016695358294387065,
"loss": 0.7078,
"step": 870
},
{
"epoch": 3.41796875,
"grad_norm": 0.21049249172210693,
"learning_rate": 0.00016644562644819771,
"loss": 0.6926,
"step": 875
},
{
"epoch": 3.4375,
"grad_norm": 0.216139018535614,
"learning_rate": 0.00016593458151000688,
"loss": 0.7073,
"step": 880
},
{
"epoch": 3.45703125,
"grad_norm": 0.22321297228336334,
"learning_rate": 0.00016542047188304997,
"loss": 0.7063,
"step": 885
},
{
"epoch": 3.4765625,
"grad_norm": 0.21834047138690948,
"learning_rate": 0.0001649033214635277,
"loss": 0.7007,
"step": 890
},
{
"epoch": 3.49609375,
"grad_norm": 0.2148895114660263,
"learning_rate": 0.00016438315428897915,
"loss": 0.709,
"step": 895
},
{
"epoch": 3.515625,
"grad_norm": 0.2145809829235077,
"learning_rate": 0.00016385999453716454,
"loss": 0.7073,
"step": 900
},
{
"epoch": 3.53515625,
"grad_norm": 0.21147432923316956,
"learning_rate": 0.00016333386652494117,
"loss": 0.6915,
"step": 905
},
{
"epoch": 3.5546875,
"grad_norm": 0.21884699165821075,
"learning_rate": 0.00016280479470713344,
"loss": 0.7026,
"step": 910
},
{
"epoch": 3.57421875,
"grad_norm": 0.20934432744979858,
"learning_rate": 0.0001622728036753959,
"loss": 0.6908,
"step": 915
},
{
"epoch": 3.59375,
"grad_norm": 0.20113444328308105,
"learning_rate": 0.00016173791815707051,
"loss": 0.7101,
"step": 920
},
{
"epoch": 3.61328125,
"grad_norm": 0.2057623565196991,
"learning_rate": 0.000161200163014037,
"loss": 0.7179,
"step": 925
},
{
"epoch": 3.6328125,
"grad_norm": 0.21178101003170013,
"learning_rate": 0.00016065956324155746,
"loss": 0.7015,
"step": 930
},
{
"epoch": 3.65234375,
"grad_norm": 0.21164196729660034,
"learning_rate": 0.0001601161439671145,
"loss": 0.6955,
"step": 935
},
{
"epoch": 3.671875,
"grad_norm": 0.21989427506923676,
"learning_rate": 0.00015956993044924334,
"loss": 0.6972,
"step": 940
},
{
"epoch": 3.69140625,
"grad_norm": 0.20968452095985413,
"learning_rate": 0.0001590209480763576,
"loss": 0.6986,
"step": 945
},
{
"epoch": 3.7109375,
"grad_norm": 0.20064401626586914,
"learning_rate": 0.00015846922236556946,
"loss": 0.7073,
"step": 950
},
{
"epoch": 3.73046875,
"grad_norm": 0.2390391230583191,
"learning_rate": 0.00015791477896150347,
"loss": 0.6958,
"step": 955
},
{
"epoch": 3.75,
"grad_norm": 0.21184207499027252,
"learning_rate": 0.0001573576436351046,
"loss": 0.7008,
"step": 960
},
{
"epoch": 3.76953125,
"grad_norm": 0.21932272613048553,
"learning_rate": 0.00015679784228244043,
"loss": 0.6904,
"step": 965
},
{
"epoch": 3.7890625,
"grad_norm": 0.19908711314201355,
"learning_rate": 0.00015623540092349732,
"loss": 0.6991,
"step": 970
},
{
"epoch": 3.80859375,
"grad_norm": 0.22039274871349335,
"learning_rate": 0.00015567034570097125,
"loss": 0.6959,
"step": 975
},
{
"epoch": 3.828125,
"grad_norm": 0.21224038302898407,
"learning_rate": 0.0001551027028790524,
"loss": 0.6976,
"step": 980
},
{
"epoch": 3.84765625,
"grad_norm": 0.21021129190921783,
"learning_rate": 0.00015453249884220464,
"loss": 0.6976,
"step": 985
},
{
"epoch": 3.8671875,
"grad_norm": 0.2202974110841751,
"learning_rate": 0.00015395976009393894,
"loss": 0.6995,
"step": 990
},
{
"epoch": 3.88671875,
"grad_norm": 0.21578259766101837,
"learning_rate": 0.0001533845132555816,
"loss": 0.6882,
"step": 995
},
{
"epoch": 3.90625,
"grad_norm": 0.1979641318321228,
"learning_rate": 0.0001528067850650368,
"loss": 0.6961,
"step": 1000
},
{
"epoch": 3.92578125,
"grad_norm": 0.20889665186405182,
"learning_rate": 0.00015222660237554383,
"loss": 0.7,
"step": 1005
},
{
"epoch": 3.9453125,
"grad_norm": 0.20623871684074402,
"learning_rate": 0.00015164399215442898,
"loss": 0.6985,
"step": 1010
},
{
"epoch": 3.96484375,
"grad_norm": 0.2109537273645401,
"learning_rate": 0.00015105898148185193,
"loss": 0.7026,
"step": 1015
},
{
"epoch": 3.984375,
"grad_norm": 0.20740477740764618,
"learning_rate": 0.0001504715975495472,
"loss": 0.7053,
"step": 1020
},
{
"epoch": 4.0,
"eval_loss": 2.0418636798858643,
"eval_runtime": 0.5376,
"eval_samples_per_second": 11.162,
"eval_steps_per_second": 1.86,
"step": 1024
},
{
"epoch": 4.00390625,
"grad_norm": 0.2116871029138565,
"learning_rate": 0.00014988186765956029,
"loss": 0.6923,
"step": 1025
},
{
"epoch": 4.0234375,
"grad_norm": 0.20054052770137787,
"learning_rate": 0.00014928981922297842,
"loss": 0.6717,
"step": 1030
},
{
"epoch": 4.04296875,
"grad_norm": 0.2238766998052597,
"learning_rate": 0.00014869547975865664,
"loss": 0.6719,
"step": 1035
},
{
"epoch": 4.0625,
"grad_norm": 0.2156434804201126,
"learning_rate": 0.00014809887689193877,
"loss": 0.6718,
"step": 1040
},
{
"epoch": 4.08203125,
"grad_norm": 0.2189694195985794,
"learning_rate": 0.00014750003835337316,
"loss": 0.677,
"step": 1045
},
{
"epoch": 4.1015625,
"grad_norm": 0.2283412218093872,
"learning_rate": 0.0001468989919774239,
"loss": 0.6724,
"step": 1050
},
{
"epoch": 4.12109375,
"grad_norm": 0.2534675598144531,
"learning_rate": 0.00014629576570117709,
"loss": 0.6842,
"step": 1055
},
{
"epoch": 4.140625,
"grad_norm": 0.24277372658252716,
"learning_rate": 0.00014569038756304207,
"loss": 0.676,
"step": 1060
},
{
"epoch": 4.16015625,
"grad_norm": 0.2335975170135498,
"learning_rate": 0.0001450828857014485,
"loss": 0.6861,
"step": 1065
},
{
"epoch": 4.1796875,
"grad_norm": 0.22338411211967468,
"learning_rate": 0.0001444732883535382,
"loss": 0.6784,
"step": 1070
},
{
"epoch": 4.19921875,
"grad_norm": 0.22138862311840057,
"learning_rate": 0.00014386162385385278,
"loss": 0.6765,
"step": 1075
},
{
"epoch": 4.21875,
"grad_norm": 0.20274129509925842,
"learning_rate": 0.00014324792063301662,
"loss": 0.6762,
"step": 1080
},
{
"epoch": 4.23828125,
"grad_norm": 0.20809794962406158,
"learning_rate": 0.00014263220721641543,
"loss": 0.6954,
"step": 1085
},
{
"epoch": 4.2578125,
"grad_norm": 0.21727928519248962,
"learning_rate": 0.00014201451222287025,
"loss": 0.682,
"step": 1090
},
{
"epoch": 4.27734375,
"grad_norm": 0.21408621966838837,
"learning_rate": 0.00014139486436330736,
"loss": 0.6817,
"step": 1095
},
{
"epoch": 4.296875,
"grad_norm": 0.2173791378736496,
"learning_rate": 0.00014077329243942369,
"loss": 0.6775,
"step": 1100
},
{
"epoch": 4.31640625,
"grad_norm": 0.21154190599918365,
"learning_rate": 0.0001401498253423481,
"loss": 0.6793,
"step": 1105
},
{
"epoch": 4.3359375,
"grad_norm": 0.2106465995311737,
"learning_rate": 0.00013952449205129855,
"loss": 0.6736,
"step": 1110
},
{
"epoch": 4.35546875,
"grad_norm": 0.20029598474502563,
"learning_rate": 0.00013889732163223516,
"loss": 0.6759,
"step": 1115
},
{
"epoch": 4.375,
"grad_norm": 0.21185144782066345,
"learning_rate": 0.000138268343236509,
"loss": 0.6777,
"step": 1120
},
{
"epoch": 4.39453125,
"grad_norm": 0.2037803679704666,
"learning_rate": 0.0001376375860995073,
"loss": 0.6818,
"step": 1125
},
{
"epoch": 4.4140625,
"grad_norm": 0.21110603213310242,
"learning_rate": 0.00013700507953929463,
"loss": 0.675,
"step": 1130
},
{
"epoch": 4.43359375,
"grad_norm": 0.2060796022415161,
"learning_rate": 0.00013637085295524988,
"loss": 0.679,
"step": 1135
},
{
"epoch": 4.453125,
"grad_norm": 0.2184733897447586,
"learning_rate": 0.00013573493582670003,
"loss": 0.6859,
"step": 1140
},
{
"epoch": 4.47265625,
"grad_norm": 0.21656639873981476,
"learning_rate": 0.00013509735771154987,
"loss": 0.685,
"step": 1145
},
{
"epoch": 4.4921875,
"grad_norm": 0.219607412815094,
"learning_rate": 0.00013445814824490805,
"loss": 0.6814,
"step": 1150
},
{
"epoch": 4.51171875,
"grad_norm": 0.2204212099313736,
"learning_rate": 0.00013381733713770967,
"loss": 0.6845,
"step": 1155
},
{
"epoch": 4.53125,
"grad_norm": 0.2118123322725296,
"learning_rate": 0.00013317495417533524,
"loss": 0.6751,
"step": 1160
},
{
"epoch": 4.55078125,
"grad_norm": 0.2175564020872116,
"learning_rate": 0.0001325310292162263,
"loss": 0.6813,
"step": 1165
},
{
"epoch": 4.5703125,
"grad_norm": 0.2186279296875,
"learning_rate": 0.0001318855921904976,
"loss": 0.6869,
"step": 1170
},
{
"epoch": 4.58984375,
"grad_norm": 0.21257956326007843,
"learning_rate": 0.0001312386730985459,
"loss": 0.6834,
"step": 1175
},
{
"epoch": 4.609375,
"grad_norm": 0.20661357045173645,
"learning_rate": 0.00013059030200965536,
"loss": 0.7001,
"step": 1180
},
{
"epoch": 4.62890625,
"grad_norm": 0.22517681121826172,
"learning_rate": 0.00012994050906060017,
"loss": 0.6717,
"step": 1185
},
{
"epoch": 4.6484375,
"grad_norm": 0.22090637683868408,
"learning_rate": 0.00012928932445424365,
"loss": 0.678,
"step": 1190
},
{
"epoch": 4.66796875,
"grad_norm": 0.21545428037643433,
"learning_rate": 0.00012863677845813433,
"loss": 0.6819,
"step": 1195
},
{
"epoch": 4.6875,
"grad_norm": 0.209136962890625,
"learning_rate": 0.00012798290140309923,
"loss": 0.6862,
"step": 1200
},
{
"epoch": 4.70703125,
"grad_norm": 0.20853549242019653,
"learning_rate": 0.00012732772368183388,
"loss": 0.6719,
"step": 1205
},
{
"epoch": 4.7265625,
"grad_norm": 0.2124202698469162,
"learning_rate": 0.00012667127574748986,
"loss": 0.6819,
"step": 1210
},
{
"epoch": 4.74609375,
"grad_norm": 0.2243855744600296,
"learning_rate": 0.00012601358811225913,
"loss": 0.6743,
"step": 1215
},
{
"epoch": 4.765625,
"grad_norm": 0.21978437900543213,
"learning_rate": 0.00012535469134595595,
"loss": 0.6924,
"step": 1220
},
{
"epoch": 4.78515625,
"grad_norm": 0.20108923316001892,
"learning_rate": 0.00012469461607459583,
"loss": 0.6836,
"step": 1225
},
{
"epoch": 4.8046875,
"grad_norm": 0.21921634674072266,
"learning_rate": 0.0001240333929789721,
"loss": 0.6764,
"step": 1230
},
{
"epoch": 4.82421875,
"grad_norm": 0.21365371346473694,
"learning_rate": 0.00012337105279322988,
"loss": 0.6843,
"step": 1235
},
{
"epoch": 4.84375,
"grad_norm": 0.20987005531787872,
"learning_rate": 0.00012270762630343734,
"loss": 0.6746,
"step": 1240
},
{
"epoch": 4.86328125,
"grad_norm": 0.20794980227947235,
"learning_rate": 0.00012204314434615501,
"loss": 0.6815,
"step": 1245
},
{
"epoch": 4.8828125,
"grad_norm": 0.21553441882133484,
"learning_rate": 0.00012137763780700227,
"loss": 0.6795,
"step": 1250
},
{
"epoch": 4.90234375,
"grad_norm": 0.2035866528749466,
"learning_rate": 0.00012071113761922186,
"loss": 0.6828,
"step": 1255
},
{
"epoch": 4.921875,
"grad_norm": 0.2061247080564499,
"learning_rate": 0.00012004367476224206,
"loss": 0.6838,
"step": 1260
},
{
"epoch": 4.94140625,
"grad_norm": 0.21384355425834656,
"learning_rate": 0.0001193752802602367,
"loss": 0.6902,
"step": 1265
},
{
"epoch": 4.9609375,
"grad_norm": 0.21918757259845734,
"learning_rate": 0.0001187059851806832,
"loss": 0.6853,
"step": 1270
},
{
"epoch": 4.98046875,
"grad_norm": 0.20853689312934875,
"learning_rate": 0.00011803582063291849,
"loss": 0.6693,
"step": 1275
},
{
"epoch": 5.0,
"grad_norm": 0.2089415341615677,
"learning_rate": 0.00011736481776669306,
"loss": 0.6831,
"step": 1280
},
{
"epoch": 5.0,
"eval_loss": 2.05405592918396,
"eval_runtime": 0.5395,
"eval_samples_per_second": 11.122,
"eval_steps_per_second": 1.854,
"step": 1280
},
{
"epoch": 5.01953125,
"grad_norm": 0.21040305495262146,
"learning_rate": 0.00011669300777072298,
"loss": 0.6597,
"step": 1285
},
{
"epoch": 5.0390625,
"grad_norm": 0.2179408222436905,
"learning_rate": 0.00011602042187124045,
"loss": 0.6675,
"step": 1290
},
{
"epoch": 5.05859375,
"grad_norm": 0.20846475660800934,
"learning_rate": 0.0001153470913305421,
"loss": 0.6643,
"step": 1295
},
{
"epoch": 5.078125,
"grad_norm": 0.2074786126613617,
"learning_rate": 0.00011467304744553618,
"loss": 0.6656,
"step": 1300
},
{
"epoch": 5.09765625,
"grad_norm": 0.2094477117061615,
"learning_rate": 0.00011399832154628767,
"loss": 0.6544,
"step": 1305
},
{
"epoch": 5.1171875,
"grad_norm": 0.21982310712337494,
"learning_rate": 0.000113322944994562,
"loss": 0.6549,
"step": 1310
},
{
"epoch": 5.13671875,
"grad_norm": 0.23372633755207062,
"learning_rate": 0.00011264694918236753,
"loss": 0.6567,
"step": 1315
},
{
"epoch": 5.15625,
"grad_norm": 0.21253670752048492,
"learning_rate": 0.00011197036553049625,
"loss": 0.657,
"step": 1320
},
{
"epoch": 5.17578125,
"grad_norm": 0.21819843351840973,
"learning_rate": 0.00011129322548706342,
"loss": 0.6624,
"step": 1325
},
{
"epoch": 5.1953125,
"grad_norm": 0.22048228979110718,
"learning_rate": 0.00011061556052604578,
"loss": 0.6617,
"step": 1330
},
{
"epoch": 5.21484375,
"grad_norm": 0.21444514393806458,
"learning_rate": 0.00010993740214581856,
"loss": 0.6714,
"step": 1335
},
{
"epoch": 5.234375,
"grad_norm": 0.20963872969150543,
"learning_rate": 0.00010925878186769158,
"loss": 0.6554,
"step": 1340
},
{
"epoch": 5.25390625,
"grad_norm": 0.21605953574180603,
"learning_rate": 0.000108579731234444,
"loss": 0.6625,
"step": 1345
},
{
"epoch": 5.2734375,
"grad_norm": 0.2186332494020462,
"learning_rate": 0.00010790028180885821,
"loss": 0.659,
"step": 1350
},
{
"epoch": 5.29296875,
"grad_norm": 0.20879332721233368,
"learning_rate": 0.00010722046517225271,
"loss": 0.6574,
"step": 1355
},
{
"epoch": 5.3125,
"grad_norm": 0.20964272320270538,
"learning_rate": 0.00010654031292301432,
"loss": 0.6495,
"step": 1360
},
{
"epoch": 5.33203125,
"grad_norm": 0.22066867351531982,
"learning_rate": 0.00010585985667512934,
"loss": 0.6657,
"step": 1365
},
{
"epoch": 5.3515625,
"grad_norm": 0.21919472515583038,
"learning_rate": 0.00010517912805671419,
"loss": 0.6663,
"step": 1370
},
{
"epoch": 5.37109375,
"grad_norm": 0.20911991596221924,
"learning_rate": 0.00010449815870854525,
"loss": 0.6655,
"step": 1375
},
{
"epoch": 5.390625,
"grad_norm": 0.21343956887722015,
"learning_rate": 0.00010381698028258817,
"loss": 0.6538,
"step": 1380
},
{
"epoch": 5.41015625,
"grad_norm": 0.23448581993579865,
"learning_rate": 0.00010313562444052677,
"loss": 0.6745,
"step": 1385
},
{
"epoch": 5.4296875,
"grad_norm": 0.2224402278661728,
"learning_rate": 0.00010245412285229124,
"loss": 0.6659,
"step": 1390
},
{
"epoch": 5.44921875,
"grad_norm": 0.21760495007038116,
"learning_rate": 0.0001017725071945862,
"loss": 0.6574,
"step": 1395
},
{
"epoch": 5.46875,
"grad_norm": 0.21981921792030334,
"learning_rate": 0.00010109080914941824,
"loss": 0.6639,
"step": 1400
},
{
"epoch": 5.48828125,
"grad_norm": 0.22708064317703247,
"learning_rate": 0.00010040906040262348,
"loss": 0.6601,
"step": 1405
},
{
"epoch": 5.5078125,
"grad_norm": 0.21901877224445343,
"learning_rate": 9.972729264239461e-05,
"loss": 0.6708,
"step": 1410
},
{
"epoch": 5.52734375,
"grad_norm": 0.21920931339263916,
"learning_rate": 9.904553755780815e-05,
"loss": 0.6588,
"step": 1415
},
{
"epoch": 5.546875,
"grad_norm": 0.2086167186498642,
"learning_rate": 9.836382683735132e-05,
"loss": 0.6689,
"step": 1420
},
{
"epoch": 5.56640625,
"grad_norm": 0.2135404795408249,
"learning_rate": 9.768219216744942e-05,
"loss": 0.6709,
"step": 1425
},
{
"epoch": 5.5859375,
"grad_norm": 0.2296486496925354,
"learning_rate": 9.700066523099273e-05,
"loss": 0.6768,
"step": 1430
},
{
"epoch": 5.60546875,
"grad_norm": 0.22231514751911163,
"learning_rate": 9.631927770586412e-05,
"loss": 0.6662,
"step": 1435
},
{
"epoch": 5.625,
"grad_norm": 0.21092720329761505,
"learning_rate": 9.563806126346642e-05,
"loss": 0.6563,
"step": 1440
},
{
"epoch": 5.64453125,
"grad_norm": 0.2081764191389084,
"learning_rate": 9.495704756725041e-05,
"loss": 0.6599,
"step": 1445
},
{
"epoch": 5.6640625,
"grad_norm": 0.21930693089962006,
"learning_rate": 9.427626827124317e-05,
"loss": 0.6645,
"step": 1450
},
{
"epoch": 5.68359375,
"grad_norm": 0.22238822281360626,
"learning_rate": 9.359575501857651e-05,
"loss": 0.6653,
"step": 1455
},
{
"epoch": 5.703125,
"grad_norm": 0.21201257407665253,
"learning_rate": 9.29155394400166e-05,
"loss": 0.675,
"step": 1460
},
{
"epoch": 5.72265625,
"grad_norm": 0.21970124542713165,
"learning_rate": 9.223565315249325e-05,
"loss": 0.6719,
"step": 1465
},
{
"epoch": 5.7421875,
"grad_norm": 0.20852448046207428,
"learning_rate": 9.155612775763069e-05,
"loss": 0.6701,
"step": 1470
},
{
"epoch": 5.76171875,
"grad_norm": 0.2180168330669403,
"learning_rate": 9.087699484027857e-05,
"loss": 0.658,
"step": 1475
},
{
"epoch": 5.78125,
"grad_norm": 0.211044043302536,
"learning_rate": 9.019828596704394e-05,
"loss": 0.6526,
"step": 1480
},
{
"epoch": 5.80078125,
"grad_norm": 0.20980176329612732,
"learning_rate": 8.95200326848239e-05,
"loss": 0.6548,
"step": 1485
},
{
"epoch": 5.8203125,
"grad_norm": 0.20603534579277039,
"learning_rate": 8.884226651933927e-05,
"loss": 0.6644,
"step": 1490
},
{
"epoch": 5.83984375,
"grad_norm": 0.20811837911605835,
"learning_rate": 8.816501897366953e-05,
"loss": 0.6703,
"step": 1495
},
{
"epoch": 5.859375,
"grad_norm": 0.2105432003736496,
"learning_rate": 8.74883215267881e-05,
"loss": 0.6649,
"step": 1500
},
{
"epoch": 5.87890625,
"grad_norm": 0.22339750826358795,
"learning_rate": 8.681220563209955e-05,
"loss": 0.6687,
"step": 1505
},
{
"epoch": 5.8984375,
"grad_norm": 0.20943927764892578,
"learning_rate": 8.613670271597733e-05,
"loss": 0.663,
"step": 1510
},
{
"epoch": 5.91796875,
"grad_norm": 0.20441389083862305,
"learning_rate": 8.546184417630338e-05,
"loss": 0.6663,
"step": 1515
},
{
"epoch": 5.9375,
"grad_norm": 0.21287420392036438,
"learning_rate": 8.478766138100834e-05,
"loss": 0.6727,
"step": 1520
},
{
"epoch": 5.95703125,
"grad_norm": 0.21163299679756165,
"learning_rate": 8.411418566661388e-05,
"loss": 0.6643,
"step": 1525
},
{
"epoch": 5.9765625,
"grad_norm": 0.20541082322597504,
"learning_rate": 8.344144833677594e-05,
"loss": 0.6605,
"step": 1530
},
{
"epoch": 5.99609375,
"grad_norm": 0.21405570209026337,
"learning_rate": 8.27694806608298e-05,
"loss": 0.6633,
"step": 1535
},
{
"epoch": 6.0,
"eval_loss": 2.0744192600250244,
"eval_runtime": 0.5398,
"eval_samples_per_second": 11.115,
"eval_steps_per_second": 1.853,
"step": 1536
},
{
"epoch": 6.015625,
"grad_norm": 0.21526320278644562,
"learning_rate": 8.209831387233676e-05,
"loss": 0.6479,
"step": 1540
},
{
"epoch": 6.03515625,
"grad_norm": 0.217779740691185,
"learning_rate": 8.142797916763209e-05,
"loss": 0.6536,
"step": 1545
},
{
"epoch": 6.0546875,
"grad_norm": 0.22583958506584167,
"learning_rate": 8.075850770437534e-05,
"loss": 0.6532,
"step": 1550
},
{
"epoch": 6.07421875,
"grad_norm": 0.24157458543777466,
"learning_rate": 8.008993060010183e-05,
"loss": 0.6426,
"step": 1555
},
{
"epoch": 6.09375,
"grad_norm": 0.2280224710702896,
"learning_rate": 7.942227893077652e-05,
"loss": 0.6482,
"step": 1560
},
{
"epoch": 6.11328125,
"grad_norm": 0.21372312307357788,
"learning_rate": 7.875558372934936e-05,
"loss": 0.6448,
"step": 1565
},
{
"epoch": 6.1328125,
"grad_norm": 0.22514766454696655,
"learning_rate": 7.808987598431303e-05,
"loss": 0.6506,
"step": 1570
},
{
"epoch": 6.15234375,
"grad_norm": 0.22178982198238373,
"learning_rate": 7.742518663826246e-05,
"loss": 0.6404,
"step": 1575
},
{
"epoch": 6.171875,
"grad_norm": 0.21459142863750458,
"learning_rate": 7.676154658645656e-05,
"loss": 0.6557,
"step": 1580
},
{
"epoch": 6.19140625,
"grad_norm": 0.22397801280021667,
"learning_rate": 7.609898667538243e-05,
"loss": 0.6445,
"step": 1585
},
{
"epoch": 6.2109375,
"grad_norm": 0.22123484313488007,
"learning_rate": 7.543753770132127e-05,
"loss": 0.6375,
"step": 1590
},
{
"epoch": 6.23046875,
"grad_norm": 0.2259218543767929,
"learning_rate": 7.477723040891717e-05,
"loss": 0.6486,
"step": 1595
},
{
"epoch": 6.25,
"grad_norm": 0.21872185170650482,
"learning_rate": 7.411809548974792e-05,
"loss": 0.6546,
"step": 1600
},
{
"epoch": 6.26953125,
"grad_norm": 0.2340991348028183,
"learning_rate": 7.346016358089867e-05,
"loss": 0.6573,
"step": 1605
},
{
"epoch": 6.2890625,
"grad_norm": 0.2258559614419937,
"learning_rate": 7.280346526353759e-05,
"loss": 0.6485,
"step": 1610
},
{
"epoch": 6.30859375,
"grad_norm": 0.21842586994171143,
"learning_rate": 7.21480310614947e-05,
"loss": 0.6452,
"step": 1615
},
{
"epoch": 6.328125,
"grad_norm": 0.22392797470092773,
"learning_rate": 7.149389143984295e-05,
"loss": 0.6467,
"step": 1620
},
{
"epoch": 6.34765625,
"grad_norm": 0.21205224096775055,
"learning_rate": 7.084107680348218e-05,
"loss": 0.6502,
"step": 1625
},
{
"epoch": 6.3671875,
"grad_norm": 0.22041639685630798,
"learning_rate": 7.018961749572604e-05,
"loss": 0.6502,
"step": 1630
},
{
"epoch": 6.38671875,
"grad_norm": 0.21791093051433563,
"learning_rate": 6.953954379689136e-05,
"loss": 0.6553,
"step": 1635
},
{
"epoch": 6.40625,
"grad_norm": 0.22223076224327087,
"learning_rate": 6.889088592289093e-05,
"loss": 0.639,
"step": 1640
},
{
"epoch": 6.42578125,
"grad_norm": 0.2151210606098175,
"learning_rate": 6.824367402382885e-05,
"loss": 0.655,
"step": 1645
},
{
"epoch": 6.4453125,
"grad_norm": 0.2196204513311386,
"learning_rate": 6.759793818259933e-05,
"loss": 0.6549,
"step": 1650
},
{
"epoch": 6.46484375,
"grad_norm": 0.21881859004497528,
"learning_rate": 6.69537084134882e-05,
"loss": 0.6516,
"step": 1655
},
{
"epoch": 6.484375,
"grad_norm": 0.21970680356025696,
"learning_rate": 6.6311014660778e-05,
"loss": 0.6531,
"step": 1660
},
{
"epoch": 6.50390625,
"grad_norm": 0.21640105545520782,
"learning_rate": 6.566988679735606e-05,
"loss": 0.6474,
"step": 1665
},
{
"epoch": 6.5234375,
"grad_norm": 0.225670725107193,
"learning_rate": 6.503035462332592e-05,
"loss": 0.6437,
"step": 1670
},
{
"epoch": 6.54296875,
"grad_norm": 0.20938833057880402,
"learning_rate": 6.439244786462245e-05,
"loss": 0.6526,
"step": 1675
},
{
"epoch": 6.5625,
"grad_norm": 0.21592438220977783,
"learning_rate": 6.375619617162985e-05,
"loss": 0.6528,
"step": 1680
},
{
"epoch": 6.58203125,
"grad_norm": 0.22665540874004364,
"learning_rate": 6.312162911780368e-05,
"loss": 0.6502,
"step": 1685
},
{
"epoch": 6.6015625,
"grad_norm": 0.2195620834827423,
"learning_rate": 6.248877619829619e-05,
"loss": 0.6469,
"step": 1690
},
{
"epoch": 6.62109375,
"grad_norm": 0.22165308892726898,
"learning_rate": 6.185766682858546e-05,
"loss": 0.6518,
"step": 1695
},
{
"epoch": 6.640625,
"grad_norm": 0.22840096056461334,
"learning_rate": 6.122833034310793e-05,
"loss": 0.6506,
"step": 1700
},
{
"epoch": 6.66015625,
"grad_norm": 0.22422266006469727,
"learning_rate": 6.060079599389521e-05,
"loss": 0.6559,
"step": 1705
},
{
"epoch": 6.6796875,
"grad_norm": 0.22363343834877014,
"learning_rate": 5.9975092949214116e-05,
"loss": 0.6449,
"step": 1710
},
{
"epoch": 6.69921875,
"grad_norm": 0.2213827222585678,
"learning_rate": 5.935125029221111e-05,
"loss": 0.65,
"step": 1715
},
{
"epoch": 6.71875,
"grad_norm": 0.2290297895669937,
"learning_rate": 5.872929701956054e-05,
"loss": 0.6476,
"step": 1720
},
{
"epoch": 6.73828125,
"grad_norm": 0.23118211328983307,
"learning_rate": 5.810926204011658e-05,
"loss": 0.6511,
"step": 1725
},
{
"epoch": 6.7578125,
"grad_norm": 0.22112269699573517,
"learning_rate": 5.749117417356988e-05,
"loss": 0.6481,
"step": 1730
},
{
"epoch": 6.77734375,
"grad_norm": 0.21454501152038574,
"learning_rate": 5.687506214910765e-05,
"loss": 0.6492,
"step": 1735
},
{
"epoch": 6.796875,
"grad_norm": 0.22518618404865265,
"learning_rate": 5.6260954604078585e-05,
"loss": 0.6515,
"step": 1740
},
{
"epoch": 6.81640625,
"grad_norm": 0.23013541102409363,
"learning_rate": 5.564888008266165e-05,
"loss": 0.6563,
"step": 1745
},
{
"epoch": 6.8359375,
"grad_norm": 0.21959349513053894,
"learning_rate": 5.503886703453933e-05,
"loss": 0.6504,
"step": 1750
},
{
"epoch": 6.85546875,
"grad_norm": 0.23238404095172882,
"learning_rate": 5.4430943813575375e-05,
"loss": 0.6575,
"step": 1755
},
{
"epoch": 6.875,
"grad_norm": 0.21891681849956512,
"learning_rate": 5.382513867649663e-05,
"loss": 0.6415,
"step": 1760
},
{
"epoch": 6.89453125,
"grad_norm": 0.2155328243970871,
"learning_rate": 5.3221479781579955e-05,
"loss": 0.6498,
"step": 1765
},
{
"epoch": 6.9140625,
"grad_norm": 0.21803325414657593,
"learning_rate": 5.261999518734322e-05,
"loss": 0.6439,
"step": 1770
},
{
"epoch": 6.93359375,
"grad_norm": 0.21531429886817932,
"learning_rate": 5.202071285124119e-05,
"loss": 0.6486,
"step": 1775
},
{
"epoch": 6.953125,
"grad_norm": 0.22126588225364685,
"learning_rate": 5.142366062836599e-05,
"loss": 0.6453,
"step": 1780
},
{
"epoch": 6.97265625,
"grad_norm": 0.21690168976783752,
"learning_rate": 5.082886627015246e-05,
"loss": 0.6564,
"step": 1785
},
{
"epoch": 6.9921875,
"grad_norm": 0.22704558074474335,
"learning_rate": 5.023635742308807e-05,
"loss": 0.6595,
"step": 1790
},
{
"epoch": 7.0,
"eval_loss": 2.0813868045806885,
"eval_runtime": 0.5387,
"eval_samples_per_second": 11.138,
"eval_steps_per_second": 1.856,
"step": 1792
},
{
"epoch": 7.01171875,
"grad_norm": 0.21671408414840698,
"learning_rate": 4.964616162742826e-05,
"loss": 0.6478,
"step": 1795
},
{
"epoch": 7.03125,
"grad_norm": 0.2322429120540619,
"learning_rate": 4.9058306315915826e-05,
"loss": 0.6355,
"step": 1800
},
{
"epoch": 7.05078125,
"grad_norm": 0.22516188025474548,
"learning_rate": 4.84728188125063e-05,
"loss": 0.6343,
"step": 1805
},
{
"epoch": 7.0703125,
"grad_norm": 0.22370575368404388,
"learning_rate": 4.7889726331097686e-05,
"loss": 0.6388,
"step": 1810
},
{
"epoch": 7.08984375,
"grad_norm": 0.22702112793922424,
"learning_rate": 4.7309055974265435e-05,
"loss": 0.6405,
"step": 1815
},
{
"epoch": 7.109375,
"grad_norm": 0.2213263362646103,
"learning_rate": 4.6730834732003104e-05,
"loss": 0.6369,
"step": 1820
},
{
"epoch": 7.12890625,
"grad_norm": 0.2283063679933548,
"learning_rate": 4.615508948046726e-05,
"loss": 0.6406,
"step": 1825
},
{
"epoch": 7.1484375,
"grad_norm": 0.22583836317062378,
"learning_rate": 4.5581846980728794e-05,
"loss": 0.6396,
"step": 1830
},
{
"epoch": 7.16796875,
"grad_norm": 0.223560631275177,
"learning_rate": 4.50111338775287e-05,
"loss": 0.6487,
"step": 1835
},
{
"epoch": 7.1875,
"grad_norm": 0.2752554714679718,
"learning_rate": 4.444297669803981e-05,
"loss": 0.6399,
"step": 1840
},
{
"epoch": 7.20703125,
"grad_norm": 0.22124579548835754,
"learning_rate": 4.387740185063358e-05,
"loss": 0.6413,
"step": 1845
},
{
"epoch": 7.2265625,
"grad_norm": 0.22053855657577515,
"learning_rate": 4.331443562365285e-05,
"loss": 0.6377,
"step": 1850
},
{
"epoch": 7.24609375,
"grad_norm": 0.22650252282619476,
"learning_rate": 4.275410418418979e-05,
"loss": 0.6441,
"step": 1855
},
{
"epoch": 7.265625,
"grad_norm": 0.2277732640504837,
"learning_rate": 4.219643357686967e-05,
"loss": 0.6472,
"step": 1860
},
{
"epoch": 7.28515625,
"grad_norm": 0.21958424150943756,
"learning_rate": 4.1641449722640336e-05,
"loss": 0.6434,
"step": 1865
},
{
"epoch": 7.3046875,
"grad_norm": 0.22781191766262054,
"learning_rate": 4.1089178417567164e-05,
"loss": 0.6436,
"step": 1870
},
{
"epoch": 7.32421875,
"grad_norm": 0.22724145650863647,
"learning_rate": 4.0539645331634504e-05,
"loss": 0.6365,
"step": 1875
},
{
"epoch": 7.34375,
"grad_norm": 0.22402629256248474,
"learning_rate": 3.999287600755192e-05,
"loss": 0.6404,
"step": 1880
},
{
"epoch": 7.36328125,
"grad_norm": 0.22256724536418915,
"learning_rate": 3.944889585956746e-05,
"loss": 0.6385,
"step": 1885
},
{
"epoch": 7.3828125,
"grad_norm": 0.2245977371931076,
"learning_rate": 3.8907730172286124e-05,
"loss": 0.6402,
"step": 1890
},
{
"epoch": 7.40234375,
"grad_norm": 0.2223842293024063,
"learning_rate": 3.8369404099494574e-05,
"loss": 0.6401,
"step": 1895
},
{
"epoch": 7.421875,
"grad_norm": 0.228043794631958,
"learning_rate": 3.783394266299228e-05,
"loss": 0.6456,
"step": 1900
},
{
"epoch": 7.44140625,
"grad_norm": 0.22321034967899323,
"learning_rate": 3.730137075142802e-05,
"loss": 0.6461,
"step": 1905
},
{
"epoch": 7.4609375,
"grad_norm": 0.2202451378107071,
"learning_rate": 3.677171311914346e-05,
"loss": 0.6404,
"step": 1910
},
{
"epoch": 7.48046875,
"grad_norm": 0.23069259524345398,
"learning_rate": 3.624499438502229e-05,
"loss": 0.6399,
"step": 1915
},
{
"epoch": 7.5,
"grad_norm": 0.22767633199691772,
"learning_rate": 3.5721239031346066e-05,
"loss": 0.6365,
"step": 1920
},
{
"epoch": 7.51953125,
"grad_norm": 0.223536416888237,
"learning_rate": 3.520047140265618e-05,
"loss": 0.6398,
"step": 1925
},
{
"epoch": 7.5390625,
"grad_norm": 0.2236379086971283,
"learning_rate": 3.468271570462235e-05,
"loss": 0.6374,
"step": 1930
},
{
"epoch": 7.55859375,
"grad_norm": 0.22322149574756622,
"learning_rate": 3.41679960029174e-05,
"loss": 0.6411,
"step": 1935
},
{
"epoch": 7.578125,
"grad_norm": 0.22714544832706451,
"learning_rate": 3.365633622209891e-05,
"loss": 0.6281,
"step": 1940
},
{
"epoch": 7.59765625,
"grad_norm": 0.23407664895057678,
"learning_rate": 3.314776014449694e-05,
"loss": 0.6342,
"step": 1945
},
{
"epoch": 7.6171875,
"grad_norm": 0.2269096076488495,
"learning_rate": 3.2642291409108775e-05,
"loss": 0.6462,
"step": 1950
},
{
"epoch": 7.63671875,
"grad_norm": 0.21775776147842407,
"learning_rate": 3.213995351050011e-05,
"loss": 0.6442,
"step": 1955
},
{
"epoch": 7.65625,
"grad_norm": 0.21870321035385132,
"learning_rate": 3.164076979771287e-05,
"loss": 0.6391,
"step": 1960
},
{
"epoch": 7.67578125,
"grad_norm": 0.24278177320957184,
"learning_rate": 3.1144763473180285e-05,
"loss": 0.6351,
"step": 1965
},
{
"epoch": 7.6953125,
"grad_norm": 0.222146674990654,
"learning_rate": 3.065195759164797e-05,
"loss": 0.6442,
"step": 1970
},
{
"epoch": 7.71484375,
"grad_norm": 0.23037941753864288,
"learning_rate": 3.016237505910272e-05,
"loss": 0.6391,
"step": 1975
},
{
"epoch": 7.734375,
"grad_norm": 0.22653505206108093,
"learning_rate": 2.9676038631707593e-05,
"loss": 0.6364,
"step": 1980
},
{
"epoch": 7.75390625,
"grad_norm": 0.22071927785873413,
"learning_rate": 2.9192970914744132e-05,
"loss": 0.6436,
"step": 1985
},
{
"epoch": 7.7734375,
"grad_norm": 0.2352590709924698,
"learning_rate": 2.8713194361562036e-05,
"loss": 0.6389,
"step": 1990
},
{
"epoch": 7.79296875,
"grad_norm": 0.23165152966976166,
"learning_rate": 2.8236731272534967e-05,
"loss": 0.6359,
"step": 1995
},
{
"epoch": 7.8125,
"grad_norm": 0.22592546045780182,
"learning_rate": 2.776360379402445e-05,
"loss": 0.6452,
"step": 2000
},
{
"epoch": 7.83203125,
"grad_norm": 0.22005808353424072,
"learning_rate": 2.72938339173503e-05,
"loss": 0.6362,
"step": 2005
},
{
"epoch": 7.8515625,
"grad_norm": 0.22496894001960754,
"learning_rate": 2.6827443477768454e-05,
"loss": 0.6363,
"step": 2010
},
{
"epoch": 7.87109375,
"grad_norm": 0.23299238085746765,
"learning_rate": 2.6364454153456108e-05,
"loss": 0.6376,
"step": 2015
},
{
"epoch": 7.890625,
"grad_norm": 0.21800798177719116,
"learning_rate": 2.5904887464504114e-05,
"loss": 0.6316,
"step": 2020
},
{
"epoch": 7.91015625,
"grad_norm": 0.22942836582660675,
"learning_rate": 2.544876477191652e-05,
"loss": 0.6408,
"step": 2025
},
{
"epoch": 7.9296875,
"grad_norm": 0.22502020001411438,
"learning_rate": 2.4996107276618008e-05,
"loss": 0.6281,
"step": 2030
},
{
"epoch": 7.94921875,
"grad_norm": 0.22493688762187958,
"learning_rate": 2.454693601846819e-05,
"loss": 0.6374,
"step": 2035
},
{
"epoch": 7.96875,
"grad_norm": 0.22121860086917877,
"learning_rate": 2.4101271875283817e-05,
"loss": 0.6301,
"step": 2040
},
{
"epoch": 7.98828125,
"grad_norm": 0.22293226420879364,
"learning_rate": 2.3659135561868305e-05,
"loss": 0.6374,
"step": 2045
},
{
"epoch": 8.0,
"eval_loss": 2.093949556350708,
"eval_runtime": 0.5398,
"eval_samples_per_second": 11.115,
"eval_steps_per_second": 1.852,
"step": 2048
},
{
"epoch": 8.0078125,
"grad_norm": 0.22147591412067413,
"learning_rate": 2.3220547629048796e-05,
"loss": 0.6318,
"step": 2050
},
{
"epoch": 8.02734375,
"grad_norm": 0.22781990468502045,
"learning_rate": 2.2785528462721238e-05,
"loss": 0.6301,
"step": 2055
},
{
"epoch": 8.046875,
"grad_norm": 0.22302427887916565,
"learning_rate": 2.2354098282902446e-05,
"loss": 0.6194,
"step": 2060
},
{
"epoch": 8.06640625,
"grad_norm": 0.2345212697982788,
"learning_rate": 2.1926277142790552e-05,
"loss": 0.6284,
"step": 2065
},
{
"epoch": 8.0859375,
"grad_norm": 0.22880584001541138,
"learning_rate": 2.1502084927832845e-05,
"loss": 0.6394,
"step": 2070
},
{
"epoch": 8.10546875,
"grad_norm": 0.23197947442531586,
"learning_rate": 2.1081541354801292e-05,
"loss": 0.6414,
"step": 2075
},
{
"epoch": 8.125,
"grad_norm": 0.2195805162191391,
"learning_rate": 2.0664665970876496e-05,
"loss": 0.6274,
"step": 2080
},
{
"epoch": 8.14453125,
"grad_norm": 0.2231413722038269,
"learning_rate": 2.025147815273867e-05,
"loss": 0.6325,
"step": 2085
},
{
"epoch": 8.1640625,
"grad_norm": 0.22956664860248566,
"learning_rate": 1.9841997105667275e-05,
"loss": 0.6345,
"step": 2090
},
{
"epoch": 8.18359375,
"grad_norm": 0.22590646147727966,
"learning_rate": 1.943624186264832e-05,
"loss": 0.6276,
"step": 2095
},
{
"epoch": 8.203125,
"grad_norm": 0.2267957627773285,
"learning_rate": 1.903423128348959e-05,
"loss": 0.6243,
"step": 2100
},
{
"epoch": 8.22265625,
"grad_norm": 0.22633960843086243,
"learning_rate": 1.8635984053944122e-05,
"loss": 0.6279,
"step": 2105
},
{
"epoch": 8.2421875,
"grad_norm": 0.22983397543430328,
"learning_rate": 1.824151868484164e-05,
"loss": 0.6347,
"step": 2110
},
{
"epoch": 8.26171875,
"grad_norm": 0.21901904046535492,
"learning_rate": 1.7850853511228115e-05,
"loss": 0.6364,
"step": 2115
},
{
"epoch": 8.28125,
"grad_norm": 0.2256007343530655,
"learning_rate": 1.7464006691513623e-05,
"loss": 0.628,
"step": 2120
},
{
"epoch": 8.30078125,
"grad_norm": 0.2304702252149582,
"learning_rate": 1.7080996206628307e-05,
"loss": 0.6202,
"step": 2125
},
{
"epoch": 8.3203125,
"grad_norm": 0.22724899649620056,
"learning_rate": 1.6701839859186542e-05,
"loss": 0.6401,
"step": 2130
},
{
"epoch": 8.33984375,
"grad_norm": 0.22017619013786316,
"learning_rate": 1.632655527265958e-05,
"loss": 0.6348,
"step": 2135
},
{
"epoch": 8.359375,
"grad_norm": 0.221891850233078,
"learning_rate": 1.595515989055618e-05,
"loss": 0.6306,
"step": 2140
},
{
"epoch": 8.37890625,
"grad_norm": 0.2255999892950058,
"learning_rate": 1.558767097561219e-05,
"loss": 0.6436,
"step": 2145
},
{
"epoch": 8.3984375,
"grad_norm": 0.2337878942489624,
"learning_rate": 1.5224105608987704e-05,
"loss": 0.6256,
"step": 2150
},
{
"epoch": 8.41796875,
"grad_norm": 0.2235851138830185,
"learning_rate": 1.486448068947348e-05,
"loss": 0.6328,
"step": 2155
},
{
"epoch": 8.4375,
"grad_norm": 0.2308977097272873,
"learning_rate": 1.4508812932705363e-05,
"loss": 0.6353,
"step": 2160
},
{
"epoch": 8.45703125,
"grad_norm": 0.22785401344299316,
"learning_rate": 1.4157118870387155e-05,
"loss": 0.6375,
"step": 2165
},
{
"epoch": 8.4765625,
"grad_norm": 0.24056580662727356,
"learning_rate": 1.3809414849522584e-05,
"loss": 0.6343,
"step": 2170
},
{
"epoch": 8.49609375,
"grad_norm": 0.22777673602104187,
"learning_rate": 1.3465717031655056e-05,
"loss": 0.6336,
"step": 2175
},
{
"epoch": 8.515625,
"grad_norm": 0.23098915815353394,
"learning_rate": 1.3126041392116772e-05,
"loss": 0.6296,
"step": 2180
},
{
"epoch": 8.53515625,
"grad_norm": 0.2298251986503601,
"learning_rate": 1.2790403719286049e-05,
"loss": 0.6305,
"step": 2185
},
{
"epoch": 8.5546875,
"grad_norm": 0.22145819664001465,
"learning_rate": 1.2458819613853468e-05,
"loss": 0.6262,
"step": 2190
},
{
"epoch": 8.57421875,
"grad_norm": 0.2244306206703186,
"learning_rate": 1.2131304488096772e-05,
"loss": 0.6225,
"step": 2195
},
{
"epoch": 8.59375,
"grad_norm": 0.22416800260543823,
"learning_rate": 1.1807873565164506e-05,
"loss": 0.6309,
"step": 2200
},
{
"epoch": 8.61328125,
"grad_norm": 0.22584258019924164,
"learning_rate": 1.148854187836833e-05,
"loss": 0.6318,
"step": 2205
},
{
"epoch": 8.6328125,
"grad_norm": 0.2320922613143921,
"learning_rate": 1.1173324270484397e-05,
"loss": 0.6352,
"step": 2210
},
{
"epoch": 8.65234375,
"grad_norm": 0.2240631878376007,
"learning_rate": 1.0862235393063413e-05,
"loss": 0.6279,
"step": 2215
},
{
"epoch": 8.671875,
"grad_norm": 0.2261231392621994,
"learning_rate": 1.0555289705749483e-05,
"loss": 0.6299,
"step": 2220
},
{
"epoch": 8.69140625,
"grad_norm": 0.22478684782981873,
"learning_rate": 1.025250147560829e-05,
"loss": 0.639,
"step": 2225
},
{
"epoch": 8.7109375,
"grad_norm": 0.22566542029380798,
"learning_rate": 9.953884776463652e-06,
"loss": 0.63,
"step": 2230
},
{
"epoch": 8.73046875,
"grad_norm": 0.23023688793182373,
"learning_rate": 9.659453488243575e-06,
"loss": 0.6439,
"step": 2235
},
{
"epoch": 8.75,
"grad_norm": 0.22487542033195496,
"learning_rate": 9.369221296335006e-06,
"loss": 0.6421,
"step": 2240
},
{
"epoch": 8.76953125,
"grad_norm": 0.22670140862464905,
"learning_rate": 9.083201690947763e-06,
"loss": 0.6331,
"step": 2245
},
{
"epoch": 8.7890625,
"grad_norm": 0.2248082160949707,
"learning_rate": 8.801407966487486e-06,
"loss": 0.6216,
"step": 2250
},
{
"epoch": 8.80859375,
"grad_norm": 0.23012250661849976,
"learning_rate": 8.52385322093765e-06,
"loss": 0.6452,
"step": 2255
},
{
"epoch": 8.828125,
"grad_norm": 0.22810766100883484,
"learning_rate": 8.250550355250875e-06,
"loss": 0.6395,
"step": 2260
},
{
"epoch": 8.84765625,
"grad_norm": 0.22482182085514069,
"learning_rate": 7.981512072749198e-06,
"loss": 0.6316,
"step": 2265
},
{
"epoch": 8.8671875,
"grad_norm": 0.22704395651817322,
"learning_rate": 7.71675087853364e-06,
"loss": 0.6389,
"step": 2270
},
{
"epoch": 8.88671875,
"grad_norm": 0.2339123636484146,
"learning_rate": 7.456279078902928e-06,
"loss": 0.639,
"step": 2275
},
{
"epoch": 8.90625,
"grad_norm": 0.2283734679222107,
"learning_rate": 7.200108780781556e-06,
"loss": 0.6312,
"step": 2280
},
{
"epoch": 8.92578125,
"grad_norm": 0.23632891476154327,
"learning_rate": 6.948251891156932e-06,
"loss": 0.6336,
"step": 2285
},
{
"epoch": 8.9453125,
"grad_norm": 0.22593176364898682,
"learning_rate": 6.700720116526116e-06,
"loss": 0.6382,
"step": 2290
},
{
"epoch": 8.96484375,
"grad_norm": 0.2195340245962143,
"learning_rate": 6.457524962351469e-06,
"loss": 0.627,
"step": 2295
},
{
"epoch": 8.984375,
"grad_norm": 0.2304958701133728,
"learning_rate": 6.218677732526035e-06,
"loss": 0.6277,
"step": 2300
},
{
"epoch": 9.0,
"eval_loss": 2.0994203090667725,
"eval_runtime": 0.5356,
"eval_samples_per_second": 11.202,
"eval_steps_per_second": 1.867,
"step": 2304
},
{
"epoch": 9.00390625,
"grad_norm": 0.2239326387643814,
"learning_rate": 5.984189528848095e-06,
"loss": 0.6333,
"step": 2305
},
{
"epoch": 9.0234375,
"grad_norm": 0.21830931305885315,
"learning_rate": 5.7540712505050444e-06,
"loss": 0.6303,
"step": 2310
},
{
"epoch": 9.04296875,
"grad_norm": 0.2230663150548935,
"learning_rate": 5.528333593567014e-06,
"loss": 0.6266,
"step": 2315
},
{
"epoch": 9.0625,
"grad_norm": 0.22621068358421326,
"learning_rate": 5.306987050489442e-06,
"loss": 0.6273,
"step": 2320
},
{
"epoch": 9.08203125,
"grad_norm": 0.2257871776819229,
"learning_rate": 5.090041909625542e-06,
"loss": 0.6171,
"step": 2325
},
{
"epoch": 9.1015625,
"grad_norm": 0.22467824816703796,
"learning_rate": 4.877508254748076e-06,
"loss": 0.6256,
"step": 2330
},
{
"epoch": 9.12109375,
"grad_norm": 0.22441822290420532,
"learning_rate": 4.669395964580614e-06,
"loss": 0.6247,
"step": 2335
},
{
"epoch": 9.140625,
"grad_norm": 0.22599612176418304,
"learning_rate": 4.465714712338398e-06,
"loss": 0.6204,
"step": 2340
},
{
"epoch": 9.16015625,
"grad_norm": 0.22301939129829407,
"learning_rate": 4.26647396527865e-06,
"loss": 0.634,
"step": 2345
},
{
"epoch": 9.1796875,
"grad_norm": 0.23274029791355133,
"learning_rate": 4.071682984260638e-06,
"loss": 0.6256,
"step": 2350
},
{
"epoch": 9.19921875,
"grad_norm": 0.23097610473632812,
"learning_rate": 3.881350823315177e-06,
"loss": 0.6293,
"step": 2355
},
{
"epoch": 9.21875,
"grad_norm": 0.23166796565055847,
"learning_rate": 3.6954863292237297e-06,
"loss": 0.6294,
"step": 2360
},
{
"epoch": 9.23828125,
"grad_norm": 0.22876545786857605,
"learning_rate": 3.514098141107314e-06,
"loss": 0.6298,
"step": 2365
},
{
"epoch": 9.2578125,
"grad_norm": 0.22338230907917023,
"learning_rate": 3.3371946900248473e-06,
"loss": 0.6264,
"step": 2370
},
{
"epoch": 9.27734375,
"grad_norm": 0.2302178293466568,
"learning_rate": 3.1647841985813164e-06,
"loss": 0.627,
"step": 2375
},
{
"epoch": 9.296875,
"grad_norm": 0.2242288738489151,
"learning_rate": 2.996874680545603e-06,
"loss": 0.6336,
"step": 2380
},
{
"epoch": 9.31640625,
"grad_norm": 0.22500120103359222,
"learning_rate": 2.8334739404779375e-06,
"loss": 0.6264,
"step": 2385
},
{
"epoch": 9.3359375,
"grad_norm": 0.23554645478725433,
"learning_rate": 2.674589573367192e-06,
"loss": 0.6213,
"step": 2390
},
{
"epoch": 9.35546875,
"grad_norm": 0.2254471480846405,
"learning_rate": 2.5202289642778375e-06,
"loss": 0.6348,
"step": 2395
},
{
"epoch": 9.375,
"grad_norm": 0.22407911717891693,
"learning_rate": 2.3703992880066638e-06,
"loss": 0.6294,
"step": 2400
},
{
"epoch": 9.39453125,
"grad_norm": 0.22965936362743378,
"learning_rate": 2.2251075087493355e-06,
"loss": 0.64,
"step": 2405
},
{
"epoch": 9.4140625,
"grad_norm": 0.22874490916728973,
"learning_rate": 2.0843603797766287e-06,
"loss": 0.6313,
"step": 2410
},
{
"epoch": 9.43359375,
"grad_norm": 0.22413046658039093,
"learning_rate": 1.9481644431206036e-06,
"loss": 0.6229,
"step": 2415
},
{
"epoch": 9.453125,
"grad_norm": 0.2280588150024414,
"learning_rate": 1.8165260292704711e-06,
"loss": 0.6265,
"step": 2420
},
{
"epoch": 9.47265625,
"grad_norm": 0.22689659893512726,
"learning_rate": 1.6894512568783716e-06,
"loss": 0.6272,
"step": 2425
},
{
"epoch": 9.4921875,
"grad_norm": 0.23052698373794556,
"learning_rate": 1.5669460324749586e-06,
"loss": 0.6408,
"step": 2430
},
{
"epoch": 9.51171875,
"grad_norm": 0.22765642404556274,
"learning_rate": 1.4490160501948735e-06,
"loss": 0.644,
"step": 2435
},
{
"epoch": 9.53125,
"grad_norm": 0.22766034305095673,
"learning_rate": 1.3356667915121025e-06,
"loss": 0.6249,
"step": 2440
},
{
"epoch": 9.55078125,
"grad_norm": 0.22794398665428162,
"learning_rate": 1.2269035249851236e-06,
"loss": 0.6318,
"step": 2445
},
{
"epoch": 9.5703125,
"grad_norm": 0.22712871432304382,
"learning_rate": 1.1227313060120926e-06,
"loss": 0.6359,
"step": 2450
},
{
"epoch": 9.58984375,
"grad_norm": 0.22914738953113556,
"learning_rate": 1.0231549765958192e-06,
"loss": 0.6389,
"step": 2455
},
{
"epoch": 9.609375,
"grad_norm": 0.22300153970718384,
"learning_rate": 9.281791651187366e-07,
"loss": 0.6356,
"step": 2460
},
{
"epoch": 9.62890625,
"grad_norm": 0.232873797416687,
"learning_rate": 8.378082861277281e-07,
"loss": 0.6272,
"step": 2465
},
{
"epoch": 9.6484375,
"grad_norm": 0.227997824549675,
"learning_rate": 7.520465401290033e-07,
"loss": 0.633,
"step": 2470
},
{
"epoch": 9.66796875,
"grad_norm": 0.21839286386966705,
"learning_rate": 6.708979133927762e-07,
"loss": 0.6215,
"step": 2475
},
{
"epoch": 9.6875,
"grad_norm": 0.22753040492534637,
"learning_rate": 5.943661777680354e-07,
"loss": 0.6272,
"step": 2480
},
{
"epoch": 9.70703125,
"grad_norm": 0.22866863012313843,
"learning_rate": 5.224548905072402e-07,
"loss": 0.6357,
"step": 2485
},
{
"epoch": 9.7265625,
"grad_norm": 0.2306712120771408,
"learning_rate": 4.5516739410087494e-07,
"loss": 0.6244,
"step": 2490
},
{
"epoch": 9.74609375,
"grad_norm": 0.22779209911823273,
"learning_rate": 3.9250681612225116e-07,
"loss": 0.6309,
"step": 2495
},
{
"epoch": 9.765625,
"grad_norm": 0.22719816863536835,
"learning_rate": 3.3447606908196817e-07,
"loss": 0.628,
"step": 2500
},
{
"epoch": 9.78515625,
"grad_norm": 0.23172929883003235,
"learning_rate": 2.8107785029265476e-07,
"loss": 0.6293,
"step": 2505
},
{
"epoch": 9.8046875,
"grad_norm": 0.22468186914920807,
"learning_rate": 2.3231464174352512e-07,
"loss": 0.6368,
"step": 2510
},
{
"epoch": 9.82421875,
"grad_norm": 0.22247561812400818,
"learning_rate": 1.8818870998508208e-07,
"loss": 0.6222,
"step": 2515
},
{
"epoch": 9.84375,
"grad_norm": 0.22515320777893066,
"learning_rate": 1.487021060236904e-07,
"loss": 0.6266,
"step": 2520
},
{
"epoch": 9.86328125,
"grad_norm": 0.23118971288204193,
"learning_rate": 1.1385666522630845e-07,
"loss": 0.6308,
"step": 2525
},
{
"epoch": 9.8828125,
"grad_norm": 0.22416307032108307,
"learning_rate": 8.365400723512328e-08,
"loss": 0.6239,
"step": 2530
},
{
"epoch": 9.90234375,
"grad_norm": 0.22984710335731506,
"learning_rate": 5.8095535892332964e-08,
"loss": 0.6362,
"step": 2535
},
{
"epoch": 9.921875,
"grad_norm": 0.23102597892284393,
"learning_rate": 3.7182439174832106e-08,
"loss": 0.6365,
"step": 2540
},
{
"epoch": 9.94140625,
"grad_norm": 0.2295123189687729,
"learning_rate": 2.091568913904496e-08,
"loss": 0.6397,
"step": 2545
},
{
"epoch": 9.9609375,
"grad_norm": 0.22766011953353882,
"learning_rate": 9.296041875683781e-09,
"loss": 0.6274,
"step": 2550
},
{
"epoch": 9.98046875,
"grad_norm": 0.2338954210281372,
"learning_rate": 2.3240374746658077e-09,
"loss": 0.6212,
"step": 2555
},
{
"epoch": 10.0,
"grad_norm": 0.22291633486747742,
"learning_rate": 0.0,
"loss": 0.616,
"step": 2560
},
{
"epoch": 10.0,
"eval_loss": 2.1007895469665527,
"eval_runtime": 0.5705,
"eval_samples_per_second": 10.518,
"eval_steps_per_second": 1.753,
"step": 2560
},
{
"epoch": 10.0,
"step": 2560,
"total_flos": 7.568434414263206e+18,
"train_loss": 0.7105431989766657,
"train_runtime": 14792.6859,
"train_samples_per_second": 11.056,
"train_steps_per_second": 0.173
}
],
"logging_steps": 5,
"max_steps": 2560,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 7.568434414263206e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}