{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 2560, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00390625, "grad_norm": 1.813705325126648, "learning_rate": 7.8125e-07, "loss": 1.9071, "step": 1 }, { "epoch": 0.01953125, "grad_norm": 1.431990385055542, "learning_rate": 3.90625e-06, "loss": 1.8608, "step": 5 }, { "epoch": 0.0390625, "grad_norm": 1.281330943107605, "learning_rate": 7.8125e-06, "loss": 1.8263, "step": 10 }, { "epoch": 0.05859375, "grad_norm": 1.310953140258789, "learning_rate": 1.171875e-05, "loss": 1.8193, "step": 15 }, { "epoch": 0.078125, "grad_norm": 1.296993374824524, "learning_rate": 1.5625e-05, "loss": 1.7463, "step": 20 }, { "epoch": 0.09765625, "grad_norm": 1.1856365203857422, "learning_rate": 1.953125e-05, "loss": 1.6844, "step": 25 }, { "epoch": 0.1171875, "grad_norm": 3.376720905303955, "learning_rate": 2.34375e-05, "loss": 1.5861, "step": 30 }, { "epoch": 0.13671875, "grad_norm": 3.182882785797119, "learning_rate": 2.734375e-05, "loss": 1.4328, "step": 35 }, { "epoch": 0.15625, "grad_norm": 0.682467520236969, "learning_rate": 3.125e-05, "loss": 1.2702, "step": 40 }, { "epoch": 0.17578125, "grad_norm": 0.9865962266921997, "learning_rate": 3.5156250000000004e-05, "loss": 1.1671, "step": 45 }, { "epoch": 0.1953125, "grad_norm": 0.42747607827186584, "learning_rate": 3.90625e-05, "loss": 1.1303, "step": 50 }, { "epoch": 0.21484375, "grad_norm": 0.42581626772880554, "learning_rate": 4.2968750000000004e-05, "loss": 1.101, "step": 55 }, { "epoch": 0.234375, "grad_norm": 0.4914548099040985, "learning_rate": 4.6875e-05, "loss": 1.0586, "step": 60 }, { "epoch": 0.25390625, "grad_norm": 0.39272716641426086, "learning_rate": 5.0781250000000004e-05, "loss": 1.0308, "step": 65 }, { "epoch": 0.2734375, "grad_norm": 0.34394437074661255, "learning_rate": 5.46875e-05, "loss": 0.9998, "step": 70 }, { "epoch": 0.29296875, "grad_norm": 0.3009032607078552, "learning_rate": 5.8593750000000005e-05, "loss": 0.9784, "step": 75 }, { "epoch": 0.3125, "grad_norm": 0.27089548110961914, "learning_rate": 6.25e-05, "loss": 0.9653, "step": 80 }, { "epoch": 0.33203125, "grad_norm": 0.25717490911483765, "learning_rate": 6.640625e-05, "loss": 0.9434, "step": 85 }, { "epoch": 0.3515625, "grad_norm": 0.3018302917480469, "learning_rate": 7.031250000000001e-05, "loss": 0.9372, "step": 90 }, { "epoch": 0.37109375, "grad_norm": 0.2254215031862259, "learning_rate": 7.421875e-05, "loss": 0.9236, "step": 95 }, { "epoch": 0.390625, "grad_norm": 0.2384410947561264, "learning_rate": 7.8125e-05, "loss": 0.9145, "step": 100 }, { "epoch": 0.41015625, "grad_norm": 0.2905459403991699, "learning_rate": 8.203125e-05, "loss": 0.9177, "step": 105 }, { "epoch": 0.4296875, "grad_norm": 0.27646884322166443, "learning_rate": 8.593750000000001e-05, "loss": 0.9103, "step": 110 }, { "epoch": 0.44921875, "grad_norm": 0.23843346536159515, "learning_rate": 8.984375e-05, "loss": 0.8911, "step": 115 }, { "epoch": 0.46875, "grad_norm": 0.3110702931880951, "learning_rate": 9.375e-05, "loss": 0.8961, "step": 120 }, { "epoch": 0.48828125, "grad_norm": 0.2591000199317932, "learning_rate": 9.765625e-05, "loss": 0.8911, "step": 125 }, { "epoch": 0.5078125, "grad_norm": 0.2314710170030594, "learning_rate": 0.00010156250000000001, "loss": 0.8765, "step": 130 }, { "epoch": 0.52734375, "grad_norm": 0.268370658159256, "learning_rate": 0.00010546875, "loss": 0.8759, "step": 135 }, { "epoch": 0.546875, "grad_norm": 0.24689124524593353, "learning_rate": 0.000109375, "loss": 0.8714, "step": 140 }, { "epoch": 0.56640625, "grad_norm": 0.28693222999572754, "learning_rate": 0.00011328125, "loss": 0.882, "step": 145 }, { "epoch": 0.5859375, "grad_norm": 0.26165568828582764, "learning_rate": 0.00011718750000000001, "loss": 0.8638, "step": 150 }, { "epoch": 0.60546875, "grad_norm": 0.2968839406967163, "learning_rate": 0.00012109375, "loss": 0.8562, "step": 155 }, { "epoch": 0.625, "grad_norm": 0.2954418957233429, "learning_rate": 0.000125, "loss": 0.8569, "step": 160 }, { "epoch": 0.64453125, "grad_norm": 0.30811259150505066, "learning_rate": 0.00012890625, "loss": 0.8455, "step": 165 }, { "epoch": 0.6640625, "grad_norm": 0.2631295323371887, "learning_rate": 0.0001328125, "loss": 0.8574, "step": 170 }, { "epoch": 0.68359375, "grad_norm": 0.25627005100250244, "learning_rate": 0.00013671875, "loss": 0.851, "step": 175 }, { "epoch": 0.703125, "grad_norm": 0.28598853945732117, "learning_rate": 0.00014062500000000002, "loss": 0.8385, "step": 180 }, { "epoch": 0.72265625, "grad_norm": 0.2502932548522949, "learning_rate": 0.00014453125000000002, "loss": 0.8457, "step": 185 }, { "epoch": 0.7421875, "grad_norm": 0.3177507817745209, "learning_rate": 0.0001484375, "loss": 0.8319, "step": 190 }, { "epoch": 0.76171875, "grad_norm": 0.27309176325798035, "learning_rate": 0.00015234375, "loss": 0.8511, "step": 195 }, { "epoch": 0.78125, "grad_norm": 0.29295653104782104, "learning_rate": 0.00015625, "loss": 0.8373, "step": 200 }, { "epoch": 0.80078125, "grad_norm": 0.27028167247772217, "learning_rate": 0.00016015625, "loss": 0.8319, "step": 205 }, { "epoch": 0.8203125, "grad_norm": 0.40336114168167114, "learning_rate": 0.0001640625, "loss": 0.8245, "step": 210 }, { "epoch": 0.83984375, "grad_norm": 0.3044915795326233, "learning_rate": 0.00016796875000000001, "loss": 0.8283, "step": 215 }, { "epoch": 0.859375, "grad_norm": 0.29535970091819763, "learning_rate": 0.00017187500000000002, "loss": 0.8119, "step": 220 }, { "epoch": 0.87890625, "grad_norm": 0.28554800152778625, "learning_rate": 0.00017578125000000002, "loss": 0.8091, "step": 225 }, { "epoch": 0.8984375, "grad_norm": 0.26689431071281433, "learning_rate": 0.0001796875, "loss": 0.8189, "step": 230 }, { "epoch": 0.91796875, "grad_norm": 0.29758790135383606, "learning_rate": 0.00018359375, "loss": 0.8122, "step": 235 }, { "epoch": 0.9375, "grad_norm": 0.40431731939315796, "learning_rate": 0.0001875, "loss": 0.8155, "step": 240 }, { "epoch": 0.95703125, "grad_norm": 0.27242639660835266, "learning_rate": 0.00019140625, "loss": 0.8119, "step": 245 }, { "epoch": 0.9765625, "grad_norm": 0.3094847500324249, "learning_rate": 0.0001953125, "loss": 0.8058, "step": 250 }, { "epoch": 0.99609375, "grad_norm": 0.32299983501434326, "learning_rate": 0.00019921875000000001, "loss": 0.8026, "step": 255 }, { "epoch": 1.0, "eval_loss": 2.045611619949341, "eval_runtime": 0.5394, "eval_samples_per_second": 11.124, "eval_steps_per_second": 1.854, "step": 256 }, { "epoch": 1.015625, "grad_norm": 0.305078387260437, "learning_rate": 0.00019999851261394218, "loss": 0.7941, "step": 260 }, { "epoch": 1.03515625, "grad_norm": 0.2842113673686981, "learning_rate": 0.00019999247018391447, "loss": 0.798, "step": 265 }, { "epoch": 1.0546875, "grad_norm": 0.27524590492248535, "learning_rate": 0.0001999817800289289, "loss": 0.7911, "step": 270 }, { "epoch": 1.07421875, "grad_norm": 0.2549247145652771, "learning_rate": 0.00019996644264587193, "loss": 0.7963, "step": 275 }, { "epoch": 1.09375, "grad_norm": 0.253353089094162, "learning_rate": 0.00019994645874763658, "loss": 0.7904, "step": 280 }, { "epoch": 1.11328125, "grad_norm": 0.23945719003677368, "learning_rate": 0.00019992182926308942, "loss": 0.7921, "step": 285 }, { "epoch": 1.1328125, "grad_norm": 0.29668208956718445, "learning_rate": 0.00019989255533702736, "loss": 0.7943, "step": 290 }, { "epoch": 1.15234375, "grad_norm": 0.26419156789779663, "learning_rate": 0.0001998586383301244, "loss": 0.7819, "step": 295 }, { "epoch": 1.171875, "grad_norm": 0.3054077625274658, "learning_rate": 0.00019982007981886847, "loss": 0.7917, "step": 300 }, { "epoch": 1.19140625, "grad_norm": 0.27965638041496277, "learning_rate": 0.00019977688159548808, "loss": 0.7854, "step": 305 }, { "epoch": 1.2109375, "grad_norm": 0.23229017853736877, "learning_rate": 0.00019972904566786903, "loss": 0.7865, "step": 310 }, { "epoch": 1.23046875, "grad_norm": 0.2789019048213959, "learning_rate": 0.00019967657425946106, "loss": 0.7821, "step": 315 }, { "epoch": 1.25, "grad_norm": 0.24402114748954773, "learning_rate": 0.00019961946980917456, "loss": 0.7899, "step": 320 }, { "epoch": 1.26953125, "grad_norm": 0.2749808132648468, "learning_rate": 0.0001995577349712672, "loss": 0.7783, "step": 325 }, { "epoch": 1.2890625, "grad_norm": 0.2676057815551758, "learning_rate": 0.00019949137261522052, "loss": 0.7788, "step": 330 }, { "epoch": 1.30859375, "grad_norm": 0.24829885363578796, "learning_rate": 0.0001994203858256065, "loss": 0.7714, "step": 335 }, { "epoch": 1.328125, "grad_norm": 0.24872945249080658, "learning_rate": 0.00019934477790194445, "loss": 0.7832, "step": 340 }, { "epoch": 1.34765625, "grad_norm": 0.2914537489414215, "learning_rate": 0.00019926455235854724, "loss": 0.7791, "step": 345 }, { "epoch": 1.3671875, "grad_norm": 0.2692899703979492, "learning_rate": 0.00019917971292435826, "loss": 0.7739, "step": 350 }, { "epoch": 1.38671875, "grad_norm": 0.2605401873588562, "learning_rate": 0.000199090263542778, "loss": 0.7717, "step": 355 }, { "epoch": 1.40625, "grad_norm": 0.24468782544136047, "learning_rate": 0.00019899620837148077, "loss": 0.7694, "step": 360 }, { "epoch": 1.42578125, "grad_norm": 0.2542877197265625, "learning_rate": 0.00019889755178222147, "loss": 0.7653, "step": 365 }, { "epoch": 1.4453125, "grad_norm": 0.21375133097171783, "learning_rate": 0.00019879429836063226, "loss": 0.7854, "step": 370 }, { "epoch": 1.46484375, "grad_norm": 0.24711847305297852, "learning_rate": 0.00019868645290600955, "loss": 0.773, "step": 375 }, { "epoch": 1.484375, "grad_norm": 0.2352401316165924, "learning_rate": 0.0001985740204310909, "loss": 0.7641, "step": 380 }, { "epoch": 1.50390625, "grad_norm": 0.2681073844432831, "learning_rate": 0.00019845700616182206, "loss": 0.7755, "step": 385 }, { "epoch": 1.5234375, "grad_norm": 0.2394329458475113, "learning_rate": 0.00019833541553711395, "loss": 0.7635, "step": 390 }, { "epoch": 1.54296875, "grad_norm": 0.27736565470695496, "learning_rate": 0.00019820925420858991, "loss": 0.7744, "step": 395 }, { "epoch": 1.5625, "grad_norm": 0.2736864984035492, "learning_rate": 0.00019807852804032305, "loss": 0.7564, "step": 400 }, { "epoch": 1.58203125, "grad_norm": 0.22882600128650665, "learning_rate": 0.00019794324310856367, "loss": 0.7703, "step": 405 }, { "epoch": 1.6015625, "grad_norm": 0.2372276782989502, "learning_rate": 0.0001978034057014568, "loss": 0.7642, "step": 410 }, { "epoch": 1.62109375, "grad_norm": 0.23550736904144287, "learning_rate": 0.00019765902231874992, "loss": 0.7513, "step": 415 }, { "epoch": 1.640625, "grad_norm": 0.23483717441558838, "learning_rate": 0.00019751009967149087, "loss": 0.7485, "step": 420 }, { "epoch": 1.66015625, "grad_norm": 0.23124265670776367, "learning_rate": 0.00019735664468171587, "loss": 0.7712, "step": 425 }, { "epoch": 1.6796875, "grad_norm": 0.25672388076782227, "learning_rate": 0.00019719866448212795, "loss": 0.7635, "step": 430 }, { "epoch": 1.69921875, "grad_norm": 0.2655965983867645, "learning_rate": 0.00019703616641576514, "loss": 0.7614, "step": 435 }, { "epoch": 1.71875, "grad_norm": 0.22875700891017914, "learning_rate": 0.00019686915803565934, "loss": 0.7597, "step": 440 }, { "epoch": 1.73828125, "grad_norm": 0.24324467778205872, "learning_rate": 0.00019669764710448522, "loss": 0.7592, "step": 445 }, { "epoch": 1.7578125, "grad_norm": 0.23085905611515045, "learning_rate": 0.00019652164159419946, "loss": 0.7582, "step": 450 }, { "epoch": 1.77734375, "grad_norm": 0.24821893870830536, "learning_rate": 0.00019634114968567005, "loss": 0.7565, "step": 455 }, { "epoch": 1.796875, "grad_norm": 0.24690982699394226, "learning_rate": 0.0001961561797682962, "loss": 0.75, "step": 460 }, { "epoch": 1.81640625, "grad_norm": 0.21277934312820435, "learning_rate": 0.00019596674043961828, "loss": 0.7499, "step": 465 }, { "epoch": 1.8359375, "grad_norm": 0.2045515477657318, "learning_rate": 0.0001957728405049183, "loss": 0.7476, "step": 470 }, { "epoch": 1.85546875, "grad_norm": 0.22809946537017822, "learning_rate": 0.00019557448897681057, "loss": 0.7554, "step": 475 }, { "epoch": 1.875, "grad_norm": 0.2747824788093567, "learning_rate": 0.0001953716950748227, "loss": 0.7481, "step": 480 }, { "epoch": 1.89453125, "grad_norm": 0.23395125567913055, "learning_rate": 0.00019516446822496732, "loss": 0.7579, "step": 485 }, { "epoch": 1.9140625, "grad_norm": 0.2263769805431366, "learning_rate": 0.00019495281805930367, "loss": 0.7493, "step": 490 }, { "epoch": 1.93359375, "grad_norm": 0.23396165668964386, "learning_rate": 0.00019473675441549013, "loss": 0.7523, "step": 495 }, { "epoch": 1.953125, "grad_norm": 0.23420800268650055, "learning_rate": 0.0001945162873363268, "loss": 0.7469, "step": 500 }, { "epoch": 1.97265625, "grad_norm": 0.19923944771289825, "learning_rate": 0.00019429142706928868, "loss": 0.7535, "step": 505 }, { "epoch": 1.9921875, "grad_norm": 0.2181696891784668, "learning_rate": 0.00019406218406604965, "loss": 0.7532, "step": 510 }, { "epoch": 2.0, "eval_loss": 2.031317949295044, "eval_runtime": 0.5375, "eval_samples_per_second": 11.164, "eval_steps_per_second": 1.861, "step": 512 }, { "epoch": 2.01171875, "grad_norm": 0.2611521780490875, "learning_rate": 0.0001938285689819962, "loss": 0.7349, "step": 515 }, { "epoch": 2.03125, "grad_norm": 0.22077465057373047, "learning_rate": 0.0001935905926757326, "loss": 0.7309, "step": 520 }, { "epoch": 2.05078125, "grad_norm": 0.2502357065677643, "learning_rate": 0.00019334826620857583, "loss": 0.7402, "step": 525 }, { "epoch": 2.0703125, "grad_norm": 0.21151328086853027, "learning_rate": 0.00019310160084404186, "loss": 0.7263, "step": 530 }, { "epoch": 2.08984375, "grad_norm": 0.22730891406536102, "learning_rate": 0.00019285060804732158, "loss": 0.7393, "step": 535 }, { "epoch": 2.109375, "grad_norm": 0.29608404636383057, "learning_rate": 0.00019259529948474833, "loss": 0.7359, "step": 540 }, { "epoch": 2.12890625, "grad_norm": 0.2048954963684082, "learning_rate": 0.00019233568702325547, "loss": 0.7327, "step": 545 }, { "epoch": 2.1484375, "grad_norm": 0.24332541227340698, "learning_rate": 0.0001920717827298248, "loss": 0.723, "step": 550 }, { "epoch": 2.16796875, "grad_norm": 0.27370956540107727, "learning_rate": 0.0001918035988709256, "loss": 0.7346, "step": 555 }, { "epoch": 2.1875, "grad_norm": 0.27345338463783264, "learning_rate": 0.00019153114791194473, "loss": 0.7216, "step": 560 }, { "epoch": 2.20703125, "grad_norm": 0.21915854513645172, "learning_rate": 0.0001912544425166069, "loss": 0.7297, "step": 565 }, { "epoch": 2.2265625, "grad_norm": 0.23517705500125885, "learning_rate": 0.0001909734955463863, "loss": 0.7277, "step": 570 }, { "epoch": 2.24609375, "grad_norm": 0.2082410454750061, "learning_rate": 0.00019068832005990867, "loss": 0.7274, "step": 575 }, { "epoch": 2.265625, "grad_norm": 0.25212010741233826, "learning_rate": 0.00019039892931234435, "loss": 0.7388, "step": 580 }, { "epoch": 2.28515625, "grad_norm": 0.22077186405658722, "learning_rate": 0.0001901053367547922, "loss": 0.7356, "step": 585 }, { "epoch": 2.3046875, "grad_norm": 0.24918216466903687, "learning_rate": 0.0001898075560336543, "loss": 0.7283, "step": 590 }, { "epoch": 2.32421875, "grad_norm": 0.2168445587158203, "learning_rate": 0.00018950560099000182, "loss": 0.7276, "step": 595 }, { "epoch": 2.34375, "grad_norm": 0.3361542522907257, "learning_rate": 0.00018919948565893142, "loss": 0.7394, "step": 600 }, { "epoch": 2.36328125, "grad_norm": 0.30473312735557556, "learning_rate": 0.0001888892242689132, "loss": 0.7214, "step": 605 }, { "epoch": 2.3828125, "grad_norm": 0.22810065746307373, "learning_rate": 0.00018857483124112907, "loss": 0.7389, "step": 610 }, { "epoch": 2.40234375, "grad_norm": 0.22486305236816406, "learning_rate": 0.00018825632118880259, "loss": 0.7382, "step": 615 }, { "epoch": 2.421875, "grad_norm": 0.23797857761383057, "learning_rate": 0.00018793370891651972, "loss": 0.7352, "step": 620 }, { "epoch": 2.44140625, "grad_norm": 0.22012600302696228, "learning_rate": 0.00018760700941954065, "loss": 0.7323, "step": 625 }, { "epoch": 2.4609375, "grad_norm": 0.2505754232406616, "learning_rate": 0.00018727623788310292, "loss": 0.7319, "step": 630 }, { "epoch": 2.48046875, "grad_norm": 0.23932820558547974, "learning_rate": 0.0001869414096817154, "loss": 0.7166, "step": 635 }, { "epoch": 2.5, "grad_norm": 0.22623002529144287, "learning_rate": 0.00018660254037844388, "loss": 0.7254, "step": 640 }, { "epoch": 2.51953125, "grad_norm": 0.24143099784851074, "learning_rate": 0.0001862596457241875, "loss": 0.7374, "step": 645 }, { "epoch": 2.5390625, "grad_norm": 0.25545206665992737, "learning_rate": 0.00018591274165694687, "loss": 0.7268, "step": 650 }, { "epoch": 2.55859375, "grad_norm": 0.27690452337265015, "learning_rate": 0.00018556184430108293, "loss": 0.7318, "step": 655 }, { "epoch": 2.578125, "grad_norm": 0.21064211428165436, "learning_rate": 0.00018520696996656788, "loss": 0.7365, "step": 660 }, { "epoch": 2.59765625, "grad_norm": 0.2418980747461319, "learning_rate": 0.0001848481351482267, "loss": 0.7252, "step": 665 }, { "epoch": 2.6171875, "grad_norm": 0.21725673973560333, "learning_rate": 0.00018448535652497073, "loss": 0.7438, "step": 670 }, { "epoch": 2.63671875, "grad_norm": 0.2051118165254593, "learning_rate": 0.00018411865095902224, "loss": 0.7272, "step": 675 }, { "epoch": 2.65625, "grad_norm": 0.20715655386447906, "learning_rate": 0.0001837480354951308, "loss": 0.7189, "step": 680 }, { "epoch": 2.67578125, "grad_norm": 0.224945530295372, "learning_rate": 0.00018337352735978095, "loss": 0.7283, "step": 685 }, { "epoch": 2.6953125, "grad_norm": 0.2353772222995758, "learning_rate": 0.0001829951439603915, "loss": 0.7172, "step": 690 }, { "epoch": 2.71484375, "grad_norm": 0.21377775073051453, "learning_rate": 0.00018261290288450646, "loss": 0.7245, "step": 695 }, { "epoch": 2.734375, "grad_norm": 0.20290276408195496, "learning_rate": 0.00018222682189897752, "loss": 0.732, "step": 700 }, { "epoch": 2.75390625, "grad_norm": 0.21785806119441986, "learning_rate": 0.00018183691894913825, "loss": 0.7142, "step": 705 }, { "epoch": 2.7734375, "grad_norm": 0.21216203272342682, "learning_rate": 0.00018144321215797, "loss": 0.7163, "step": 710 }, { "epoch": 2.79296875, "grad_norm": 0.20187579095363617, "learning_rate": 0.0001810457198252595, "loss": 0.7196, "step": 715 }, { "epoch": 2.8125, "grad_norm": 0.21112394332885742, "learning_rate": 0.00018064446042674828, "loss": 0.7255, "step": 720 }, { "epoch": 2.83203125, "grad_norm": 0.21814604103565216, "learning_rate": 0.00018023945261327393, "loss": 0.7244, "step": 725 }, { "epoch": 2.8515625, "grad_norm": 0.2388346940279007, "learning_rate": 0.00017983071520990315, "loss": 0.719, "step": 730 }, { "epoch": 2.87109375, "grad_norm": 0.2274855226278305, "learning_rate": 0.00017941826721505684, "loss": 0.7092, "step": 735 }, { "epoch": 2.890625, "grad_norm": 0.2171526700258255, "learning_rate": 0.0001790021277996269, "loss": 0.7177, "step": 740 }, { "epoch": 2.91015625, "grad_norm": 0.2128465622663498, "learning_rate": 0.00017858231630608527, "loss": 0.7245, "step": 745 }, { "epoch": 2.9296875, "grad_norm": 0.2257278561592102, "learning_rate": 0.0001781588522475848, "loss": 0.7172, "step": 750 }, { "epoch": 2.94921875, "grad_norm": 0.21227267384529114, "learning_rate": 0.00017773175530705232, "loss": 0.7208, "step": 755 }, { "epoch": 2.96875, "grad_norm": 0.23267419636249542, "learning_rate": 0.0001773010453362737, "loss": 0.7188, "step": 760 }, { "epoch": 2.98828125, "grad_norm": 0.21279846131801605, "learning_rate": 0.00017686674235497125, "loss": 0.7198, "step": 765 }, { "epoch": 3.0, "eval_loss": 2.0403969287872314, "eval_runtime": 0.5399, "eval_samples_per_second": 11.113, "eval_steps_per_second": 1.852, "step": 768 }, { "epoch": 3.0078125, "grad_norm": 0.20591868460178375, "learning_rate": 0.000176428866549873, "loss": 0.7092, "step": 770 }, { "epoch": 3.02734375, "grad_norm": 0.21006809175014496, "learning_rate": 0.0001759874382737746, "loss": 0.6982, "step": 775 }, { "epoch": 3.046875, "grad_norm": 0.20914091169834137, "learning_rate": 0.00017554247804459316, "loss": 0.6986, "step": 780 }, { "epoch": 3.06640625, "grad_norm": 0.21207676827907562, "learning_rate": 0.0001750940065444136, "loss": 0.7024, "step": 785 }, { "epoch": 3.0859375, "grad_norm": 0.24130572378635406, "learning_rate": 0.00017464204461852738, "loss": 0.7011, "step": 790 }, { "epoch": 3.10546875, "grad_norm": 0.22464986145496368, "learning_rate": 0.0001741866132744636, "loss": 0.6998, "step": 795 }, { "epoch": 3.125, "grad_norm": 0.20956657826900482, "learning_rate": 0.0001737277336810124, "loss": 0.7068, "step": 800 }, { "epoch": 3.14453125, "grad_norm": 0.21382799744606018, "learning_rate": 0.00017326542716724128, "loss": 0.6997, "step": 805 }, { "epoch": 3.1640625, "grad_norm": 0.2018394023180008, "learning_rate": 0.00017279971522150348, "loss": 0.7057, "step": 810 }, { "epoch": 3.18359375, "grad_norm": 0.20716731250286102, "learning_rate": 0.00017233061949043928, "loss": 0.6957, "step": 815 }, { "epoch": 3.203125, "grad_norm": 0.21063964068889618, "learning_rate": 0.0001718581617779698, "loss": 0.6989, "step": 820 }, { "epoch": 3.22265625, "grad_norm": 0.21001911163330078, "learning_rate": 0.0001713823640442837, "loss": 0.7065, "step": 825 }, { "epoch": 3.2421875, "grad_norm": 0.21537743508815765, "learning_rate": 0.0001709032484048162, "loss": 0.7001, "step": 830 }, { "epoch": 3.26171875, "grad_norm": 0.21781504154205322, "learning_rate": 0.00017042083712922131, "loss": 0.7076, "step": 835 }, { "epoch": 3.28125, "grad_norm": 0.21302708983421326, "learning_rate": 0.00016993515264033672, "loss": 0.6965, "step": 840 }, { "epoch": 3.30078125, "grad_norm": 0.2185572385787964, "learning_rate": 0.00016944621751314144, "loss": 0.7046, "step": 845 }, { "epoch": 3.3203125, "grad_norm": 0.21651025116443634, "learning_rate": 0.0001689540544737067, "loss": 0.7042, "step": 850 }, { "epoch": 3.33984375, "grad_norm": 0.22459545731544495, "learning_rate": 0.0001684586863981394, "loss": 0.7133, "step": 855 }, { "epoch": 3.359375, "grad_norm": 0.21320843696594238, "learning_rate": 0.00016796013631151897, "loss": 0.7106, "step": 860 }, { "epoch": 3.37890625, "grad_norm": 0.22854122519493103, "learning_rate": 0.00016745842738682712, "loss": 0.6987, "step": 865 }, { "epoch": 3.3984375, "grad_norm": 0.22366014122962952, "learning_rate": 0.00016695358294387065, "loss": 0.7078, "step": 870 }, { "epoch": 3.41796875, "grad_norm": 0.21049249172210693, "learning_rate": 0.00016644562644819771, "loss": 0.6926, "step": 875 }, { "epoch": 3.4375, "grad_norm": 0.216139018535614, "learning_rate": 0.00016593458151000688, "loss": 0.7073, "step": 880 }, { "epoch": 3.45703125, "grad_norm": 0.22321297228336334, "learning_rate": 0.00016542047188304997, "loss": 0.7063, "step": 885 }, { "epoch": 3.4765625, "grad_norm": 0.21834047138690948, "learning_rate": 0.0001649033214635277, "loss": 0.7007, "step": 890 }, { "epoch": 3.49609375, "grad_norm": 0.2148895114660263, "learning_rate": 0.00016438315428897915, "loss": 0.709, "step": 895 }, { "epoch": 3.515625, "grad_norm": 0.2145809829235077, "learning_rate": 0.00016385999453716454, "loss": 0.7073, "step": 900 }, { "epoch": 3.53515625, "grad_norm": 0.21147432923316956, "learning_rate": 0.00016333386652494117, "loss": 0.6915, "step": 905 }, { "epoch": 3.5546875, "grad_norm": 0.21884699165821075, "learning_rate": 0.00016280479470713344, "loss": 0.7026, "step": 910 }, { "epoch": 3.57421875, "grad_norm": 0.20934432744979858, "learning_rate": 0.0001622728036753959, "loss": 0.6908, "step": 915 }, { "epoch": 3.59375, "grad_norm": 0.20113444328308105, "learning_rate": 0.00016173791815707051, "loss": 0.7101, "step": 920 }, { "epoch": 3.61328125, "grad_norm": 0.2057623565196991, "learning_rate": 0.000161200163014037, "loss": 0.7179, "step": 925 }, { "epoch": 3.6328125, "grad_norm": 0.21178101003170013, "learning_rate": 0.00016065956324155746, "loss": 0.7015, "step": 930 }, { "epoch": 3.65234375, "grad_norm": 0.21164196729660034, "learning_rate": 0.0001601161439671145, "loss": 0.6955, "step": 935 }, { "epoch": 3.671875, "grad_norm": 0.21989427506923676, "learning_rate": 0.00015956993044924334, "loss": 0.6972, "step": 940 }, { "epoch": 3.69140625, "grad_norm": 0.20968452095985413, "learning_rate": 0.0001590209480763576, "loss": 0.6986, "step": 945 }, { "epoch": 3.7109375, "grad_norm": 0.20064401626586914, "learning_rate": 0.00015846922236556946, "loss": 0.7073, "step": 950 }, { "epoch": 3.73046875, "grad_norm": 0.2390391230583191, "learning_rate": 0.00015791477896150347, "loss": 0.6958, "step": 955 }, { "epoch": 3.75, "grad_norm": 0.21184207499027252, "learning_rate": 0.0001573576436351046, "loss": 0.7008, "step": 960 }, { "epoch": 3.76953125, "grad_norm": 0.21932272613048553, "learning_rate": 0.00015679784228244043, "loss": 0.6904, "step": 965 }, { "epoch": 3.7890625, "grad_norm": 0.19908711314201355, "learning_rate": 0.00015623540092349732, "loss": 0.6991, "step": 970 }, { "epoch": 3.80859375, "grad_norm": 0.22039274871349335, "learning_rate": 0.00015567034570097125, "loss": 0.6959, "step": 975 }, { "epoch": 3.828125, "grad_norm": 0.21224038302898407, "learning_rate": 0.0001551027028790524, "loss": 0.6976, "step": 980 }, { "epoch": 3.84765625, "grad_norm": 0.21021129190921783, "learning_rate": 0.00015453249884220464, "loss": 0.6976, "step": 985 }, { "epoch": 3.8671875, "grad_norm": 0.2202974110841751, "learning_rate": 0.00015395976009393894, "loss": 0.6995, "step": 990 }, { "epoch": 3.88671875, "grad_norm": 0.21578259766101837, "learning_rate": 0.0001533845132555816, "loss": 0.6882, "step": 995 }, { "epoch": 3.90625, "grad_norm": 0.1979641318321228, "learning_rate": 0.0001528067850650368, "loss": 0.6961, "step": 1000 }, { "epoch": 3.92578125, "grad_norm": 0.20889665186405182, "learning_rate": 0.00015222660237554383, "loss": 0.7, "step": 1005 }, { "epoch": 3.9453125, "grad_norm": 0.20623871684074402, "learning_rate": 0.00015164399215442898, "loss": 0.6985, "step": 1010 }, { "epoch": 3.96484375, "grad_norm": 0.2109537273645401, "learning_rate": 0.00015105898148185193, "loss": 0.7026, "step": 1015 }, { "epoch": 3.984375, "grad_norm": 0.20740477740764618, "learning_rate": 0.0001504715975495472, "loss": 0.7053, "step": 1020 }, { "epoch": 4.0, "eval_loss": 2.0418636798858643, "eval_runtime": 0.5376, "eval_samples_per_second": 11.162, "eval_steps_per_second": 1.86, "step": 1024 }, { "epoch": 4.00390625, "grad_norm": 0.2116871029138565, "learning_rate": 0.00014988186765956029, "loss": 0.6923, "step": 1025 }, { "epoch": 4.0234375, "grad_norm": 0.20054052770137787, "learning_rate": 0.00014928981922297842, "loss": 0.6717, "step": 1030 }, { "epoch": 4.04296875, "grad_norm": 0.2238766998052597, "learning_rate": 0.00014869547975865664, "loss": 0.6719, "step": 1035 }, { "epoch": 4.0625, "grad_norm": 0.2156434804201126, "learning_rate": 0.00014809887689193877, "loss": 0.6718, "step": 1040 }, { "epoch": 4.08203125, "grad_norm": 0.2189694195985794, "learning_rate": 0.00014750003835337316, "loss": 0.677, "step": 1045 }, { "epoch": 4.1015625, "grad_norm": 0.2283412218093872, "learning_rate": 0.0001468989919774239, "loss": 0.6724, "step": 1050 }, { "epoch": 4.12109375, "grad_norm": 0.2534675598144531, "learning_rate": 0.00014629576570117709, "loss": 0.6842, "step": 1055 }, { "epoch": 4.140625, "grad_norm": 0.24277372658252716, "learning_rate": 0.00014569038756304207, "loss": 0.676, "step": 1060 }, { "epoch": 4.16015625, "grad_norm": 0.2335975170135498, "learning_rate": 0.0001450828857014485, "loss": 0.6861, "step": 1065 }, { "epoch": 4.1796875, "grad_norm": 0.22338411211967468, "learning_rate": 0.0001444732883535382, "loss": 0.6784, "step": 1070 }, { "epoch": 4.19921875, "grad_norm": 0.22138862311840057, "learning_rate": 0.00014386162385385278, "loss": 0.6765, "step": 1075 }, { "epoch": 4.21875, "grad_norm": 0.20274129509925842, "learning_rate": 0.00014324792063301662, "loss": 0.6762, "step": 1080 }, { "epoch": 4.23828125, "grad_norm": 0.20809794962406158, "learning_rate": 0.00014263220721641543, "loss": 0.6954, "step": 1085 }, { "epoch": 4.2578125, "grad_norm": 0.21727928519248962, "learning_rate": 0.00014201451222287025, "loss": 0.682, "step": 1090 }, { "epoch": 4.27734375, "grad_norm": 0.21408621966838837, "learning_rate": 0.00014139486436330736, "loss": 0.6817, "step": 1095 }, { "epoch": 4.296875, "grad_norm": 0.2173791378736496, "learning_rate": 0.00014077329243942369, "loss": 0.6775, "step": 1100 }, { "epoch": 4.31640625, "grad_norm": 0.21154190599918365, "learning_rate": 0.0001401498253423481, "loss": 0.6793, "step": 1105 }, { "epoch": 4.3359375, "grad_norm": 0.2106465995311737, "learning_rate": 0.00013952449205129855, "loss": 0.6736, "step": 1110 }, { "epoch": 4.35546875, "grad_norm": 0.20029598474502563, "learning_rate": 0.00013889732163223516, "loss": 0.6759, "step": 1115 }, { "epoch": 4.375, "grad_norm": 0.21185144782066345, "learning_rate": 0.000138268343236509, "loss": 0.6777, "step": 1120 }, { "epoch": 4.39453125, "grad_norm": 0.2037803679704666, "learning_rate": 0.0001376375860995073, "loss": 0.6818, "step": 1125 }, { "epoch": 4.4140625, "grad_norm": 0.21110603213310242, "learning_rate": 0.00013700507953929463, "loss": 0.675, "step": 1130 }, { "epoch": 4.43359375, "grad_norm": 0.2060796022415161, "learning_rate": 0.00013637085295524988, "loss": 0.679, "step": 1135 }, { "epoch": 4.453125, "grad_norm": 0.2184733897447586, "learning_rate": 0.00013573493582670003, "loss": 0.6859, "step": 1140 }, { "epoch": 4.47265625, "grad_norm": 0.21656639873981476, "learning_rate": 0.00013509735771154987, "loss": 0.685, "step": 1145 }, { "epoch": 4.4921875, "grad_norm": 0.219607412815094, "learning_rate": 0.00013445814824490805, "loss": 0.6814, "step": 1150 }, { "epoch": 4.51171875, "grad_norm": 0.2204212099313736, "learning_rate": 0.00013381733713770967, "loss": 0.6845, "step": 1155 }, { "epoch": 4.53125, "grad_norm": 0.2118123322725296, "learning_rate": 0.00013317495417533524, "loss": 0.6751, "step": 1160 }, { "epoch": 4.55078125, "grad_norm": 0.2175564020872116, "learning_rate": 0.0001325310292162263, "loss": 0.6813, "step": 1165 }, { "epoch": 4.5703125, "grad_norm": 0.2186279296875, "learning_rate": 0.0001318855921904976, "loss": 0.6869, "step": 1170 }, { "epoch": 4.58984375, "grad_norm": 0.21257956326007843, "learning_rate": 0.0001312386730985459, "loss": 0.6834, "step": 1175 }, { "epoch": 4.609375, "grad_norm": 0.20661357045173645, "learning_rate": 0.00013059030200965536, "loss": 0.7001, "step": 1180 }, { "epoch": 4.62890625, "grad_norm": 0.22517681121826172, "learning_rate": 0.00012994050906060017, "loss": 0.6717, "step": 1185 }, { "epoch": 4.6484375, "grad_norm": 0.22090637683868408, "learning_rate": 0.00012928932445424365, "loss": 0.678, "step": 1190 }, { "epoch": 4.66796875, "grad_norm": 0.21545428037643433, "learning_rate": 0.00012863677845813433, "loss": 0.6819, "step": 1195 }, { "epoch": 4.6875, "grad_norm": 0.209136962890625, "learning_rate": 0.00012798290140309923, "loss": 0.6862, "step": 1200 }, { "epoch": 4.70703125, "grad_norm": 0.20853549242019653, "learning_rate": 0.00012732772368183388, "loss": 0.6719, "step": 1205 }, { "epoch": 4.7265625, "grad_norm": 0.2124202698469162, "learning_rate": 0.00012667127574748986, "loss": 0.6819, "step": 1210 }, { "epoch": 4.74609375, "grad_norm": 0.2243855744600296, "learning_rate": 0.00012601358811225913, "loss": 0.6743, "step": 1215 }, { "epoch": 4.765625, "grad_norm": 0.21978437900543213, "learning_rate": 0.00012535469134595595, "loss": 0.6924, "step": 1220 }, { "epoch": 4.78515625, "grad_norm": 0.20108923316001892, "learning_rate": 0.00012469461607459583, "loss": 0.6836, "step": 1225 }, { "epoch": 4.8046875, "grad_norm": 0.21921634674072266, "learning_rate": 0.0001240333929789721, "loss": 0.6764, "step": 1230 }, { "epoch": 4.82421875, "grad_norm": 0.21365371346473694, "learning_rate": 0.00012337105279322988, "loss": 0.6843, "step": 1235 }, { "epoch": 4.84375, "grad_norm": 0.20987005531787872, "learning_rate": 0.00012270762630343734, "loss": 0.6746, "step": 1240 }, { "epoch": 4.86328125, "grad_norm": 0.20794980227947235, "learning_rate": 0.00012204314434615501, "loss": 0.6815, "step": 1245 }, { "epoch": 4.8828125, "grad_norm": 0.21553441882133484, "learning_rate": 0.00012137763780700227, "loss": 0.6795, "step": 1250 }, { "epoch": 4.90234375, "grad_norm": 0.2035866528749466, "learning_rate": 0.00012071113761922186, "loss": 0.6828, "step": 1255 }, { "epoch": 4.921875, "grad_norm": 0.2061247080564499, "learning_rate": 0.00012004367476224206, "loss": 0.6838, "step": 1260 }, { "epoch": 4.94140625, "grad_norm": 0.21384355425834656, "learning_rate": 0.0001193752802602367, "loss": 0.6902, "step": 1265 }, { "epoch": 4.9609375, "grad_norm": 0.21918757259845734, "learning_rate": 0.0001187059851806832, "loss": 0.6853, "step": 1270 }, { "epoch": 4.98046875, "grad_norm": 0.20853689312934875, "learning_rate": 0.00011803582063291849, "loss": 0.6693, "step": 1275 }, { "epoch": 5.0, "grad_norm": 0.2089415341615677, "learning_rate": 0.00011736481776669306, "loss": 0.6831, "step": 1280 }, { "epoch": 5.0, "eval_loss": 2.05405592918396, "eval_runtime": 0.5395, "eval_samples_per_second": 11.122, "eval_steps_per_second": 1.854, "step": 1280 }, { "epoch": 5.01953125, "grad_norm": 0.21040305495262146, "learning_rate": 0.00011669300777072298, "loss": 0.6597, "step": 1285 }, { "epoch": 5.0390625, "grad_norm": 0.2179408222436905, "learning_rate": 0.00011602042187124045, "loss": 0.6675, "step": 1290 }, { "epoch": 5.05859375, "grad_norm": 0.20846475660800934, "learning_rate": 0.0001153470913305421, "loss": 0.6643, "step": 1295 }, { "epoch": 5.078125, "grad_norm": 0.2074786126613617, "learning_rate": 0.00011467304744553618, "loss": 0.6656, "step": 1300 }, { "epoch": 5.09765625, "grad_norm": 0.2094477117061615, "learning_rate": 0.00011399832154628767, "loss": 0.6544, "step": 1305 }, { "epoch": 5.1171875, "grad_norm": 0.21982310712337494, "learning_rate": 0.000113322944994562, "loss": 0.6549, "step": 1310 }, { "epoch": 5.13671875, "grad_norm": 0.23372633755207062, "learning_rate": 0.00011264694918236753, "loss": 0.6567, "step": 1315 }, { "epoch": 5.15625, "grad_norm": 0.21253670752048492, "learning_rate": 0.00011197036553049625, "loss": 0.657, "step": 1320 }, { "epoch": 5.17578125, "grad_norm": 0.21819843351840973, "learning_rate": 0.00011129322548706342, "loss": 0.6624, "step": 1325 }, { "epoch": 5.1953125, "grad_norm": 0.22048228979110718, "learning_rate": 0.00011061556052604578, "loss": 0.6617, "step": 1330 }, { "epoch": 5.21484375, "grad_norm": 0.21444514393806458, "learning_rate": 0.00010993740214581856, "loss": 0.6714, "step": 1335 }, { "epoch": 5.234375, "grad_norm": 0.20963872969150543, "learning_rate": 0.00010925878186769158, "loss": 0.6554, "step": 1340 }, { "epoch": 5.25390625, "grad_norm": 0.21605953574180603, "learning_rate": 0.000108579731234444, "loss": 0.6625, "step": 1345 }, { "epoch": 5.2734375, "grad_norm": 0.2186332494020462, "learning_rate": 0.00010790028180885821, "loss": 0.659, "step": 1350 }, { "epoch": 5.29296875, "grad_norm": 0.20879332721233368, "learning_rate": 0.00010722046517225271, "loss": 0.6574, "step": 1355 }, { "epoch": 5.3125, "grad_norm": 0.20964272320270538, "learning_rate": 0.00010654031292301432, "loss": 0.6495, "step": 1360 }, { "epoch": 5.33203125, "grad_norm": 0.22066867351531982, "learning_rate": 0.00010585985667512934, "loss": 0.6657, "step": 1365 }, { "epoch": 5.3515625, "grad_norm": 0.21919472515583038, "learning_rate": 0.00010517912805671419, "loss": 0.6663, "step": 1370 }, { "epoch": 5.37109375, "grad_norm": 0.20911991596221924, "learning_rate": 0.00010449815870854525, "loss": 0.6655, "step": 1375 }, { "epoch": 5.390625, "grad_norm": 0.21343956887722015, "learning_rate": 0.00010381698028258817, "loss": 0.6538, "step": 1380 }, { "epoch": 5.41015625, "grad_norm": 0.23448581993579865, "learning_rate": 0.00010313562444052677, "loss": 0.6745, "step": 1385 }, { "epoch": 5.4296875, "grad_norm": 0.2224402278661728, "learning_rate": 0.00010245412285229124, "loss": 0.6659, "step": 1390 }, { "epoch": 5.44921875, "grad_norm": 0.21760495007038116, "learning_rate": 0.0001017725071945862, "loss": 0.6574, "step": 1395 }, { "epoch": 5.46875, "grad_norm": 0.21981921792030334, "learning_rate": 0.00010109080914941824, "loss": 0.6639, "step": 1400 }, { "epoch": 5.48828125, "grad_norm": 0.22708064317703247, "learning_rate": 0.00010040906040262348, "loss": 0.6601, "step": 1405 }, { "epoch": 5.5078125, "grad_norm": 0.21901877224445343, "learning_rate": 9.972729264239461e-05, "loss": 0.6708, "step": 1410 }, { "epoch": 5.52734375, "grad_norm": 0.21920931339263916, "learning_rate": 9.904553755780815e-05, "loss": 0.6588, "step": 1415 }, { "epoch": 5.546875, "grad_norm": 0.2086167186498642, "learning_rate": 9.836382683735132e-05, "loss": 0.6689, "step": 1420 }, { "epoch": 5.56640625, "grad_norm": 0.2135404795408249, "learning_rate": 9.768219216744942e-05, "loss": 0.6709, "step": 1425 }, { "epoch": 5.5859375, "grad_norm": 0.2296486496925354, "learning_rate": 9.700066523099273e-05, "loss": 0.6768, "step": 1430 }, { "epoch": 5.60546875, "grad_norm": 0.22231514751911163, "learning_rate": 9.631927770586412e-05, "loss": 0.6662, "step": 1435 }, { "epoch": 5.625, "grad_norm": 0.21092720329761505, "learning_rate": 9.563806126346642e-05, "loss": 0.6563, "step": 1440 }, { "epoch": 5.64453125, "grad_norm": 0.2081764191389084, "learning_rate": 9.495704756725041e-05, "loss": 0.6599, "step": 1445 }, { "epoch": 5.6640625, "grad_norm": 0.21930693089962006, "learning_rate": 9.427626827124317e-05, "loss": 0.6645, "step": 1450 }, { "epoch": 5.68359375, "grad_norm": 0.22238822281360626, "learning_rate": 9.359575501857651e-05, "loss": 0.6653, "step": 1455 }, { "epoch": 5.703125, "grad_norm": 0.21201257407665253, "learning_rate": 9.29155394400166e-05, "loss": 0.675, "step": 1460 }, { "epoch": 5.72265625, "grad_norm": 0.21970124542713165, "learning_rate": 9.223565315249325e-05, "loss": 0.6719, "step": 1465 }, { "epoch": 5.7421875, "grad_norm": 0.20852448046207428, "learning_rate": 9.155612775763069e-05, "loss": 0.6701, "step": 1470 }, { "epoch": 5.76171875, "grad_norm": 0.2180168330669403, "learning_rate": 9.087699484027857e-05, "loss": 0.658, "step": 1475 }, { "epoch": 5.78125, "grad_norm": 0.211044043302536, "learning_rate": 9.019828596704394e-05, "loss": 0.6526, "step": 1480 }, { "epoch": 5.80078125, "grad_norm": 0.20980176329612732, "learning_rate": 8.95200326848239e-05, "loss": 0.6548, "step": 1485 }, { "epoch": 5.8203125, "grad_norm": 0.20603534579277039, "learning_rate": 8.884226651933927e-05, "loss": 0.6644, "step": 1490 }, { "epoch": 5.83984375, "grad_norm": 0.20811837911605835, "learning_rate": 8.816501897366953e-05, "loss": 0.6703, "step": 1495 }, { "epoch": 5.859375, "grad_norm": 0.2105432003736496, "learning_rate": 8.74883215267881e-05, "loss": 0.6649, "step": 1500 }, { "epoch": 5.87890625, "grad_norm": 0.22339750826358795, "learning_rate": 8.681220563209955e-05, "loss": 0.6687, "step": 1505 }, { "epoch": 5.8984375, "grad_norm": 0.20943927764892578, "learning_rate": 8.613670271597733e-05, "loss": 0.663, "step": 1510 }, { "epoch": 5.91796875, "grad_norm": 0.20441389083862305, "learning_rate": 8.546184417630338e-05, "loss": 0.6663, "step": 1515 }, { "epoch": 5.9375, "grad_norm": 0.21287420392036438, "learning_rate": 8.478766138100834e-05, "loss": 0.6727, "step": 1520 }, { "epoch": 5.95703125, "grad_norm": 0.21163299679756165, "learning_rate": 8.411418566661388e-05, "loss": 0.6643, "step": 1525 }, { "epoch": 5.9765625, "grad_norm": 0.20541082322597504, "learning_rate": 8.344144833677594e-05, "loss": 0.6605, "step": 1530 }, { "epoch": 5.99609375, "grad_norm": 0.21405570209026337, "learning_rate": 8.27694806608298e-05, "loss": 0.6633, "step": 1535 }, { "epoch": 6.0, "eval_loss": 2.0744192600250244, "eval_runtime": 0.5398, "eval_samples_per_second": 11.115, "eval_steps_per_second": 1.853, "step": 1536 }, { "epoch": 6.015625, "grad_norm": 0.21526320278644562, "learning_rate": 8.209831387233676e-05, "loss": 0.6479, "step": 1540 }, { "epoch": 6.03515625, "grad_norm": 0.217779740691185, "learning_rate": 8.142797916763209e-05, "loss": 0.6536, "step": 1545 }, { "epoch": 6.0546875, "grad_norm": 0.22583958506584167, "learning_rate": 8.075850770437534e-05, "loss": 0.6532, "step": 1550 }, { "epoch": 6.07421875, "grad_norm": 0.24157458543777466, "learning_rate": 8.008993060010183e-05, "loss": 0.6426, "step": 1555 }, { "epoch": 6.09375, "grad_norm": 0.2280224710702896, "learning_rate": 7.942227893077652e-05, "loss": 0.6482, "step": 1560 }, { "epoch": 6.11328125, "grad_norm": 0.21372312307357788, "learning_rate": 7.875558372934936e-05, "loss": 0.6448, "step": 1565 }, { "epoch": 6.1328125, "grad_norm": 0.22514766454696655, "learning_rate": 7.808987598431303e-05, "loss": 0.6506, "step": 1570 }, { "epoch": 6.15234375, "grad_norm": 0.22178982198238373, "learning_rate": 7.742518663826246e-05, "loss": 0.6404, "step": 1575 }, { "epoch": 6.171875, "grad_norm": 0.21459142863750458, "learning_rate": 7.676154658645656e-05, "loss": 0.6557, "step": 1580 }, { "epoch": 6.19140625, "grad_norm": 0.22397801280021667, "learning_rate": 7.609898667538243e-05, "loss": 0.6445, "step": 1585 }, { "epoch": 6.2109375, "grad_norm": 0.22123484313488007, "learning_rate": 7.543753770132127e-05, "loss": 0.6375, "step": 1590 }, { "epoch": 6.23046875, "grad_norm": 0.2259218543767929, "learning_rate": 7.477723040891717e-05, "loss": 0.6486, "step": 1595 }, { "epoch": 6.25, "grad_norm": 0.21872185170650482, "learning_rate": 7.411809548974792e-05, "loss": 0.6546, "step": 1600 }, { "epoch": 6.26953125, "grad_norm": 0.2340991348028183, "learning_rate": 7.346016358089867e-05, "loss": 0.6573, "step": 1605 }, { "epoch": 6.2890625, "grad_norm": 0.2258559614419937, "learning_rate": 7.280346526353759e-05, "loss": 0.6485, "step": 1610 }, { "epoch": 6.30859375, "grad_norm": 0.21842586994171143, "learning_rate": 7.21480310614947e-05, "loss": 0.6452, "step": 1615 }, { "epoch": 6.328125, "grad_norm": 0.22392797470092773, "learning_rate": 7.149389143984295e-05, "loss": 0.6467, "step": 1620 }, { "epoch": 6.34765625, "grad_norm": 0.21205224096775055, "learning_rate": 7.084107680348218e-05, "loss": 0.6502, "step": 1625 }, { "epoch": 6.3671875, "grad_norm": 0.22041639685630798, "learning_rate": 7.018961749572604e-05, "loss": 0.6502, "step": 1630 }, { "epoch": 6.38671875, "grad_norm": 0.21791093051433563, "learning_rate": 6.953954379689136e-05, "loss": 0.6553, "step": 1635 }, { "epoch": 6.40625, "grad_norm": 0.22223076224327087, "learning_rate": 6.889088592289093e-05, "loss": 0.639, "step": 1640 }, { "epoch": 6.42578125, "grad_norm": 0.2151210606098175, "learning_rate": 6.824367402382885e-05, "loss": 0.655, "step": 1645 }, { "epoch": 6.4453125, "grad_norm": 0.2196204513311386, "learning_rate": 6.759793818259933e-05, "loss": 0.6549, "step": 1650 }, { "epoch": 6.46484375, "grad_norm": 0.21881859004497528, "learning_rate": 6.69537084134882e-05, "loss": 0.6516, "step": 1655 }, { "epoch": 6.484375, "grad_norm": 0.21970680356025696, "learning_rate": 6.6311014660778e-05, "loss": 0.6531, "step": 1660 }, { "epoch": 6.50390625, "grad_norm": 0.21640105545520782, "learning_rate": 6.566988679735606e-05, "loss": 0.6474, "step": 1665 }, { "epoch": 6.5234375, "grad_norm": 0.225670725107193, "learning_rate": 6.503035462332592e-05, "loss": 0.6437, "step": 1670 }, { "epoch": 6.54296875, "grad_norm": 0.20938833057880402, "learning_rate": 6.439244786462245e-05, "loss": 0.6526, "step": 1675 }, { "epoch": 6.5625, "grad_norm": 0.21592438220977783, "learning_rate": 6.375619617162985e-05, "loss": 0.6528, "step": 1680 }, { "epoch": 6.58203125, "grad_norm": 0.22665540874004364, "learning_rate": 6.312162911780368e-05, "loss": 0.6502, "step": 1685 }, { "epoch": 6.6015625, "grad_norm": 0.2195620834827423, "learning_rate": 6.248877619829619e-05, "loss": 0.6469, "step": 1690 }, { "epoch": 6.62109375, "grad_norm": 0.22165308892726898, "learning_rate": 6.185766682858546e-05, "loss": 0.6518, "step": 1695 }, { "epoch": 6.640625, "grad_norm": 0.22840096056461334, "learning_rate": 6.122833034310793e-05, "loss": 0.6506, "step": 1700 }, { "epoch": 6.66015625, "grad_norm": 0.22422266006469727, "learning_rate": 6.060079599389521e-05, "loss": 0.6559, "step": 1705 }, { "epoch": 6.6796875, "grad_norm": 0.22363343834877014, "learning_rate": 5.9975092949214116e-05, "loss": 0.6449, "step": 1710 }, { "epoch": 6.69921875, "grad_norm": 0.2213827222585678, "learning_rate": 5.935125029221111e-05, "loss": 0.65, "step": 1715 }, { "epoch": 6.71875, "grad_norm": 0.2290297895669937, "learning_rate": 5.872929701956054e-05, "loss": 0.6476, "step": 1720 }, { "epoch": 6.73828125, "grad_norm": 0.23118211328983307, "learning_rate": 5.810926204011658e-05, "loss": 0.6511, "step": 1725 }, { "epoch": 6.7578125, "grad_norm": 0.22112269699573517, "learning_rate": 5.749117417356988e-05, "loss": 0.6481, "step": 1730 }, { "epoch": 6.77734375, "grad_norm": 0.21454501152038574, "learning_rate": 5.687506214910765e-05, "loss": 0.6492, "step": 1735 }, { "epoch": 6.796875, "grad_norm": 0.22518618404865265, "learning_rate": 5.6260954604078585e-05, "loss": 0.6515, "step": 1740 }, { "epoch": 6.81640625, "grad_norm": 0.23013541102409363, "learning_rate": 5.564888008266165e-05, "loss": 0.6563, "step": 1745 }, { "epoch": 6.8359375, "grad_norm": 0.21959349513053894, "learning_rate": 5.503886703453933e-05, "loss": 0.6504, "step": 1750 }, { "epoch": 6.85546875, "grad_norm": 0.23238404095172882, "learning_rate": 5.4430943813575375e-05, "loss": 0.6575, "step": 1755 }, { "epoch": 6.875, "grad_norm": 0.21891681849956512, "learning_rate": 5.382513867649663e-05, "loss": 0.6415, "step": 1760 }, { "epoch": 6.89453125, "grad_norm": 0.2155328243970871, "learning_rate": 5.3221479781579955e-05, "loss": 0.6498, "step": 1765 }, { "epoch": 6.9140625, "grad_norm": 0.21803325414657593, "learning_rate": 5.261999518734322e-05, "loss": 0.6439, "step": 1770 }, { "epoch": 6.93359375, "grad_norm": 0.21531429886817932, "learning_rate": 5.202071285124119e-05, "loss": 0.6486, "step": 1775 }, { "epoch": 6.953125, "grad_norm": 0.22126588225364685, "learning_rate": 5.142366062836599e-05, "loss": 0.6453, "step": 1780 }, { "epoch": 6.97265625, "grad_norm": 0.21690168976783752, "learning_rate": 5.082886627015246e-05, "loss": 0.6564, "step": 1785 }, { "epoch": 6.9921875, "grad_norm": 0.22704558074474335, "learning_rate": 5.023635742308807e-05, "loss": 0.6595, "step": 1790 }, { "epoch": 7.0, "eval_loss": 2.0813868045806885, "eval_runtime": 0.5387, "eval_samples_per_second": 11.138, "eval_steps_per_second": 1.856, "step": 1792 }, { "epoch": 7.01171875, "grad_norm": 0.21671408414840698, "learning_rate": 4.964616162742826e-05, "loss": 0.6478, "step": 1795 }, { "epoch": 7.03125, "grad_norm": 0.2322429120540619, "learning_rate": 4.9058306315915826e-05, "loss": 0.6355, "step": 1800 }, { "epoch": 7.05078125, "grad_norm": 0.22516188025474548, "learning_rate": 4.84728188125063e-05, "loss": 0.6343, "step": 1805 }, { "epoch": 7.0703125, "grad_norm": 0.22370575368404388, "learning_rate": 4.7889726331097686e-05, "loss": 0.6388, "step": 1810 }, { "epoch": 7.08984375, "grad_norm": 0.22702112793922424, "learning_rate": 4.7309055974265435e-05, "loss": 0.6405, "step": 1815 }, { "epoch": 7.109375, "grad_norm": 0.2213263362646103, "learning_rate": 4.6730834732003104e-05, "loss": 0.6369, "step": 1820 }, { "epoch": 7.12890625, "grad_norm": 0.2283063679933548, "learning_rate": 4.615508948046726e-05, "loss": 0.6406, "step": 1825 }, { "epoch": 7.1484375, "grad_norm": 0.22583836317062378, "learning_rate": 4.5581846980728794e-05, "loss": 0.6396, "step": 1830 }, { "epoch": 7.16796875, "grad_norm": 0.223560631275177, "learning_rate": 4.50111338775287e-05, "loss": 0.6487, "step": 1835 }, { "epoch": 7.1875, "grad_norm": 0.2752554714679718, "learning_rate": 4.444297669803981e-05, "loss": 0.6399, "step": 1840 }, { "epoch": 7.20703125, "grad_norm": 0.22124579548835754, "learning_rate": 4.387740185063358e-05, "loss": 0.6413, "step": 1845 }, { "epoch": 7.2265625, "grad_norm": 0.22053855657577515, "learning_rate": 4.331443562365285e-05, "loss": 0.6377, "step": 1850 }, { "epoch": 7.24609375, "grad_norm": 0.22650252282619476, "learning_rate": 4.275410418418979e-05, "loss": 0.6441, "step": 1855 }, { "epoch": 7.265625, "grad_norm": 0.2277732640504837, "learning_rate": 4.219643357686967e-05, "loss": 0.6472, "step": 1860 }, { "epoch": 7.28515625, "grad_norm": 0.21958424150943756, "learning_rate": 4.1641449722640336e-05, "loss": 0.6434, "step": 1865 }, { "epoch": 7.3046875, "grad_norm": 0.22781191766262054, "learning_rate": 4.1089178417567164e-05, "loss": 0.6436, "step": 1870 }, { "epoch": 7.32421875, "grad_norm": 0.22724145650863647, "learning_rate": 4.0539645331634504e-05, "loss": 0.6365, "step": 1875 }, { "epoch": 7.34375, "grad_norm": 0.22402629256248474, "learning_rate": 3.999287600755192e-05, "loss": 0.6404, "step": 1880 }, { "epoch": 7.36328125, "grad_norm": 0.22256724536418915, "learning_rate": 3.944889585956746e-05, "loss": 0.6385, "step": 1885 }, { "epoch": 7.3828125, "grad_norm": 0.2245977371931076, "learning_rate": 3.8907730172286124e-05, "loss": 0.6402, "step": 1890 }, { "epoch": 7.40234375, "grad_norm": 0.2223842293024063, "learning_rate": 3.8369404099494574e-05, "loss": 0.6401, "step": 1895 }, { "epoch": 7.421875, "grad_norm": 0.228043794631958, "learning_rate": 3.783394266299228e-05, "loss": 0.6456, "step": 1900 }, { "epoch": 7.44140625, "grad_norm": 0.22321034967899323, "learning_rate": 3.730137075142802e-05, "loss": 0.6461, "step": 1905 }, { "epoch": 7.4609375, "grad_norm": 0.2202451378107071, "learning_rate": 3.677171311914346e-05, "loss": 0.6404, "step": 1910 }, { "epoch": 7.48046875, "grad_norm": 0.23069259524345398, "learning_rate": 3.624499438502229e-05, "loss": 0.6399, "step": 1915 }, { "epoch": 7.5, "grad_norm": 0.22767633199691772, "learning_rate": 3.5721239031346066e-05, "loss": 0.6365, "step": 1920 }, { "epoch": 7.51953125, "grad_norm": 0.223536416888237, "learning_rate": 3.520047140265618e-05, "loss": 0.6398, "step": 1925 }, { "epoch": 7.5390625, "grad_norm": 0.2236379086971283, "learning_rate": 3.468271570462235e-05, "loss": 0.6374, "step": 1930 }, { "epoch": 7.55859375, "grad_norm": 0.22322149574756622, "learning_rate": 3.41679960029174e-05, "loss": 0.6411, "step": 1935 }, { "epoch": 7.578125, "grad_norm": 0.22714544832706451, "learning_rate": 3.365633622209891e-05, "loss": 0.6281, "step": 1940 }, { "epoch": 7.59765625, "grad_norm": 0.23407664895057678, "learning_rate": 3.314776014449694e-05, "loss": 0.6342, "step": 1945 }, { "epoch": 7.6171875, "grad_norm": 0.2269096076488495, "learning_rate": 3.2642291409108775e-05, "loss": 0.6462, "step": 1950 }, { "epoch": 7.63671875, "grad_norm": 0.21775776147842407, "learning_rate": 3.213995351050011e-05, "loss": 0.6442, "step": 1955 }, { "epoch": 7.65625, "grad_norm": 0.21870321035385132, "learning_rate": 3.164076979771287e-05, "loss": 0.6391, "step": 1960 }, { "epoch": 7.67578125, "grad_norm": 0.24278177320957184, "learning_rate": 3.1144763473180285e-05, "loss": 0.6351, "step": 1965 }, { "epoch": 7.6953125, "grad_norm": 0.222146674990654, "learning_rate": 3.065195759164797e-05, "loss": 0.6442, "step": 1970 }, { "epoch": 7.71484375, "grad_norm": 0.23037941753864288, "learning_rate": 3.016237505910272e-05, "loss": 0.6391, "step": 1975 }, { "epoch": 7.734375, "grad_norm": 0.22653505206108093, "learning_rate": 2.9676038631707593e-05, "loss": 0.6364, "step": 1980 }, { "epoch": 7.75390625, "grad_norm": 0.22071927785873413, "learning_rate": 2.9192970914744132e-05, "loss": 0.6436, "step": 1985 }, { "epoch": 7.7734375, "grad_norm": 0.2352590709924698, "learning_rate": 2.8713194361562036e-05, "loss": 0.6389, "step": 1990 }, { "epoch": 7.79296875, "grad_norm": 0.23165152966976166, "learning_rate": 2.8236731272534967e-05, "loss": 0.6359, "step": 1995 }, { "epoch": 7.8125, "grad_norm": 0.22592546045780182, "learning_rate": 2.776360379402445e-05, "loss": 0.6452, "step": 2000 }, { "epoch": 7.83203125, "grad_norm": 0.22005808353424072, "learning_rate": 2.72938339173503e-05, "loss": 0.6362, "step": 2005 }, { "epoch": 7.8515625, "grad_norm": 0.22496894001960754, "learning_rate": 2.6827443477768454e-05, "loss": 0.6363, "step": 2010 }, { "epoch": 7.87109375, "grad_norm": 0.23299238085746765, "learning_rate": 2.6364454153456108e-05, "loss": 0.6376, "step": 2015 }, { "epoch": 7.890625, "grad_norm": 0.21800798177719116, "learning_rate": 2.5904887464504114e-05, "loss": 0.6316, "step": 2020 }, { "epoch": 7.91015625, "grad_norm": 0.22942836582660675, "learning_rate": 2.544876477191652e-05, "loss": 0.6408, "step": 2025 }, { "epoch": 7.9296875, "grad_norm": 0.22502020001411438, "learning_rate": 2.4996107276618008e-05, "loss": 0.6281, "step": 2030 }, { "epoch": 7.94921875, "grad_norm": 0.22493688762187958, "learning_rate": 2.454693601846819e-05, "loss": 0.6374, "step": 2035 }, { "epoch": 7.96875, "grad_norm": 0.22121860086917877, "learning_rate": 2.4101271875283817e-05, "loss": 0.6301, "step": 2040 }, { "epoch": 7.98828125, "grad_norm": 0.22293226420879364, "learning_rate": 2.3659135561868305e-05, "loss": 0.6374, "step": 2045 }, { "epoch": 8.0, "eval_loss": 2.093949556350708, "eval_runtime": 0.5398, "eval_samples_per_second": 11.115, "eval_steps_per_second": 1.852, "step": 2048 }, { "epoch": 8.0078125, "grad_norm": 0.22147591412067413, "learning_rate": 2.3220547629048796e-05, "loss": 0.6318, "step": 2050 }, { "epoch": 8.02734375, "grad_norm": 0.22781990468502045, "learning_rate": 2.2785528462721238e-05, "loss": 0.6301, "step": 2055 }, { "epoch": 8.046875, "grad_norm": 0.22302427887916565, "learning_rate": 2.2354098282902446e-05, "loss": 0.6194, "step": 2060 }, { "epoch": 8.06640625, "grad_norm": 0.2345212697982788, "learning_rate": 2.1926277142790552e-05, "loss": 0.6284, "step": 2065 }, { "epoch": 8.0859375, "grad_norm": 0.22880584001541138, "learning_rate": 2.1502084927832845e-05, "loss": 0.6394, "step": 2070 }, { "epoch": 8.10546875, "grad_norm": 0.23197947442531586, "learning_rate": 2.1081541354801292e-05, "loss": 0.6414, "step": 2075 }, { "epoch": 8.125, "grad_norm": 0.2195805162191391, "learning_rate": 2.0664665970876496e-05, "loss": 0.6274, "step": 2080 }, { "epoch": 8.14453125, "grad_norm": 0.2231413722038269, "learning_rate": 2.025147815273867e-05, "loss": 0.6325, "step": 2085 }, { "epoch": 8.1640625, "grad_norm": 0.22956664860248566, "learning_rate": 1.9841997105667275e-05, "loss": 0.6345, "step": 2090 }, { "epoch": 8.18359375, "grad_norm": 0.22590646147727966, "learning_rate": 1.943624186264832e-05, "loss": 0.6276, "step": 2095 }, { "epoch": 8.203125, "grad_norm": 0.2267957627773285, "learning_rate": 1.903423128348959e-05, "loss": 0.6243, "step": 2100 }, { "epoch": 8.22265625, "grad_norm": 0.22633960843086243, "learning_rate": 1.8635984053944122e-05, "loss": 0.6279, "step": 2105 }, { "epoch": 8.2421875, "grad_norm": 0.22983397543430328, "learning_rate": 1.824151868484164e-05, "loss": 0.6347, "step": 2110 }, { "epoch": 8.26171875, "grad_norm": 0.21901904046535492, "learning_rate": 1.7850853511228115e-05, "loss": 0.6364, "step": 2115 }, { "epoch": 8.28125, "grad_norm": 0.2256007343530655, "learning_rate": 1.7464006691513623e-05, "loss": 0.628, "step": 2120 }, { "epoch": 8.30078125, "grad_norm": 0.2304702252149582, "learning_rate": 1.7080996206628307e-05, "loss": 0.6202, "step": 2125 }, { "epoch": 8.3203125, "grad_norm": 0.22724899649620056, "learning_rate": 1.6701839859186542e-05, "loss": 0.6401, "step": 2130 }, { "epoch": 8.33984375, "grad_norm": 0.22017619013786316, "learning_rate": 1.632655527265958e-05, "loss": 0.6348, "step": 2135 }, { "epoch": 8.359375, "grad_norm": 0.221891850233078, "learning_rate": 1.595515989055618e-05, "loss": 0.6306, "step": 2140 }, { "epoch": 8.37890625, "grad_norm": 0.2255999892950058, "learning_rate": 1.558767097561219e-05, "loss": 0.6436, "step": 2145 }, { "epoch": 8.3984375, "grad_norm": 0.2337878942489624, "learning_rate": 1.5224105608987704e-05, "loss": 0.6256, "step": 2150 }, { "epoch": 8.41796875, "grad_norm": 0.2235851138830185, "learning_rate": 1.486448068947348e-05, "loss": 0.6328, "step": 2155 }, { "epoch": 8.4375, "grad_norm": 0.2308977097272873, "learning_rate": 1.4508812932705363e-05, "loss": 0.6353, "step": 2160 }, { "epoch": 8.45703125, "grad_norm": 0.22785401344299316, "learning_rate": 1.4157118870387155e-05, "loss": 0.6375, "step": 2165 }, { "epoch": 8.4765625, "grad_norm": 0.24056580662727356, "learning_rate": 1.3809414849522584e-05, "loss": 0.6343, "step": 2170 }, { "epoch": 8.49609375, "grad_norm": 0.22777673602104187, "learning_rate": 1.3465717031655056e-05, "loss": 0.6336, "step": 2175 }, { "epoch": 8.515625, "grad_norm": 0.23098915815353394, "learning_rate": 1.3126041392116772e-05, "loss": 0.6296, "step": 2180 }, { "epoch": 8.53515625, "grad_norm": 0.2298251986503601, "learning_rate": 1.2790403719286049e-05, "loss": 0.6305, "step": 2185 }, { "epoch": 8.5546875, "grad_norm": 0.22145819664001465, "learning_rate": 1.2458819613853468e-05, "loss": 0.6262, "step": 2190 }, { "epoch": 8.57421875, "grad_norm": 0.2244306206703186, "learning_rate": 1.2131304488096772e-05, "loss": 0.6225, "step": 2195 }, { "epoch": 8.59375, "grad_norm": 0.22416800260543823, "learning_rate": 1.1807873565164506e-05, "loss": 0.6309, "step": 2200 }, { "epoch": 8.61328125, "grad_norm": 0.22584258019924164, "learning_rate": 1.148854187836833e-05, "loss": 0.6318, "step": 2205 }, { "epoch": 8.6328125, "grad_norm": 0.2320922613143921, "learning_rate": 1.1173324270484397e-05, "loss": 0.6352, "step": 2210 }, { "epoch": 8.65234375, "grad_norm": 0.2240631878376007, "learning_rate": 1.0862235393063413e-05, "loss": 0.6279, "step": 2215 }, { "epoch": 8.671875, "grad_norm": 0.2261231392621994, "learning_rate": 1.0555289705749483e-05, "loss": 0.6299, "step": 2220 }, { "epoch": 8.69140625, "grad_norm": 0.22478684782981873, "learning_rate": 1.025250147560829e-05, "loss": 0.639, "step": 2225 }, { "epoch": 8.7109375, "grad_norm": 0.22566542029380798, "learning_rate": 9.953884776463652e-06, "loss": 0.63, "step": 2230 }, { "epoch": 8.73046875, "grad_norm": 0.23023688793182373, "learning_rate": 9.659453488243575e-06, "loss": 0.6439, "step": 2235 }, { "epoch": 8.75, "grad_norm": 0.22487542033195496, "learning_rate": 9.369221296335006e-06, "loss": 0.6421, "step": 2240 }, { "epoch": 8.76953125, "grad_norm": 0.22670140862464905, "learning_rate": 9.083201690947763e-06, "loss": 0.6331, "step": 2245 }, { "epoch": 8.7890625, "grad_norm": 0.2248082160949707, "learning_rate": 8.801407966487486e-06, "loss": 0.6216, "step": 2250 }, { "epoch": 8.80859375, "grad_norm": 0.23012250661849976, "learning_rate": 8.52385322093765e-06, "loss": 0.6452, "step": 2255 }, { "epoch": 8.828125, "grad_norm": 0.22810766100883484, "learning_rate": 8.250550355250875e-06, "loss": 0.6395, "step": 2260 }, { "epoch": 8.84765625, "grad_norm": 0.22482182085514069, "learning_rate": 7.981512072749198e-06, "loss": 0.6316, "step": 2265 }, { "epoch": 8.8671875, "grad_norm": 0.22704395651817322, "learning_rate": 7.71675087853364e-06, "loss": 0.6389, "step": 2270 }, { "epoch": 8.88671875, "grad_norm": 0.2339123636484146, "learning_rate": 7.456279078902928e-06, "loss": 0.639, "step": 2275 }, { "epoch": 8.90625, "grad_norm": 0.2283734679222107, "learning_rate": 7.200108780781556e-06, "loss": 0.6312, "step": 2280 }, { "epoch": 8.92578125, "grad_norm": 0.23632891476154327, "learning_rate": 6.948251891156932e-06, "loss": 0.6336, "step": 2285 }, { "epoch": 8.9453125, "grad_norm": 0.22593176364898682, "learning_rate": 6.700720116526116e-06, "loss": 0.6382, "step": 2290 }, { "epoch": 8.96484375, "grad_norm": 0.2195340245962143, "learning_rate": 6.457524962351469e-06, "loss": 0.627, "step": 2295 }, { "epoch": 8.984375, "grad_norm": 0.2304958701133728, "learning_rate": 6.218677732526035e-06, "loss": 0.6277, "step": 2300 }, { "epoch": 9.0, "eval_loss": 2.0994203090667725, "eval_runtime": 0.5356, "eval_samples_per_second": 11.202, "eval_steps_per_second": 1.867, "step": 2304 }, { "epoch": 9.00390625, "grad_norm": 0.2239326387643814, "learning_rate": 5.984189528848095e-06, "loss": 0.6333, "step": 2305 }, { "epoch": 9.0234375, "grad_norm": 0.21830931305885315, "learning_rate": 5.7540712505050444e-06, "loss": 0.6303, "step": 2310 }, { "epoch": 9.04296875, "grad_norm": 0.2230663150548935, "learning_rate": 5.528333593567014e-06, "loss": 0.6266, "step": 2315 }, { "epoch": 9.0625, "grad_norm": 0.22621068358421326, "learning_rate": 5.306987050489442e-06, "loss": 0.6273, "step": 2320 }, { "epoch": 9.08203125, "grad_norm": 0.2257871776819229, "learning_rate": 5.090041909625542e-06, "loss": 0.6171, "step": 2325 }, { "epoch": 9.1015625, "grad_norm": 0.22467824816703796, "learning_rate": 4.877508254748076e-06, "loss": 0.6256, "step": 2330 }, { "epoch": 9.12109375, "grad_norm": 0.22441822290420532, "learning_rate": 4.669395964580614e-06, "loss": 0.6247, "step": 2335 }, { "epoch": 9.140625, "grad_norm": 0.22599612176418304, "learning_rate": 4.465714712338398e-06, "loss": 0.6204, "step": 2340 }, { "epoch": 9.16015625, "grad_norm": 0.22301939129829407, "learning_rate": 4.26647396527865e-06, "loss": 0.634, "step": 2345 }, { "epoch": 9.1796875, "grad_norm": 0.23274029791355133, "learning_rate": 4.071682984260638e-06, "loss": 0.6256, "step": 2350 }, { "epoch": 9.19921875, "grad_norm": 0.23097610473632812, "learning_rate": 3.881350823315177e-06, "loss": 0.6293, "step": 2355 }, { "epoch": 9.21875, "grad_norm": 0.23166796565055847, "learning_rate": 3.6954863292237297e-06, "loss": 0.6294, "step": 2360 }, { "epoch": 9.23828125, "grad_norm": 0.22876545786857605, "learning_rate": 3.514098141107314e-06, "loss": 0.6298, "step": 2365 }, { "epoch": 9.2578125, "grad_norm": 0.22338230907917023, "learning_rate": 3.3371946900248473e-06, "loss": 0.6264, "step": 2370 }, { "epoch": 9.27734375, "grad_norm": 0.2302178293466568, "learning_rate": 3.1647841985813164e-06, "loss": 0.627, "step": 2375 }, { "epoch": 9.296875, "grad_norm": 0.2242288738489151, "learning_rate": 2.996874680545603e-06, "loss": 0.6336, "step": 2380 }, { "epoch": 9.31640625, "grad_norm": 0.22500120103359222, "learning_rate": 2.8334739404779375e-06, "loss": 0.6264, "step": 2385 }, { "epoch": 9.3359375, "grad_norm": 0.23554645478725433, "learning_rate": 2.674589573367192e-06, "loss": 0.6213, "step": 2390 }, { "epoch": 9.35546875, "grad_norm": 0.2254471480846405, "learning_rate": 2.5202289642778375e-06, "loss": 0.6348, "step": 2395 }, { "epoch": 9.375, "grad_norm": 0.22407911717891693, "learning_rate": 2.3703992880066638e-06, "loss": 0.6294, "step": 2400 }, { "epoch": 9.39453125, "grad_norm": 0.22965936362743378, "learning_rate": 2.2251075087493355e-06, "loss": 0.64, "step": 2405 }, { "epoch": 9.4140625, "grad_norm": 0.22874490916728973, "learning_rate": 2.0843603797766287e-06, "loss": 0.6313, "step": 2410 }, { "epoch": 9.43359375, "grad_norm": 0.22413046658039093, "learning_rate": 1.9481644431206036e-06, "loss": 0.6229, "step": 2415 }, { "epoch": 9.453125, "grad_norm": 0.2280588150024414, "learning_rate": 1.8165260292704711e-06, "loss": 0.6265, "step": 2420 }, { "epoch": 9.47265625, "grad_norm": 0.22689659893512726, "learning_rate": 1.6894512568783716e-06, "loss": 0.6272, "step": 2425 }, { "epoch": 9.4921875, "grad_norm": 0.23052698373794556, "learning_rate": 1.5669460324749586e-06, "loss": 0.6408, "step": 2430 }, { "epoch": 9.51171875, "grad_norm": 0.22765642404556274, "learning_rate": 1.4490160501948735e-06, "loss": 0.644, "step": 2435 }, { "epoch": 9.53125, "grad_norm": 0.22766034305095673, "learning_rate": 1.3356667915121025e-06, "loss": 0.6249, "step": 2440 }, { "epoch": 9.55078125, "grad_norm": 0.22794398665428162, "learning_rate": 1.2269035249851236e-06, "loss": 0.6318, "step": 2445 }, { "epoch": 9.5703125, "grad_norm": 0.22712871432304382, "learning_rate": 1.1227313060120926e-06, "loss": 0.6359, "step": 2450 }, { "epoch": 9.58984375, "grad_norm": 0.22914738953113556, "learning_rate": 1.0231549765958192e-06, "loss": 0.6389, "step": 2455 }, { "epoch": 9.609375, "grad_norm": 0.22300153970718384, "learning_rate": 9.281791651187366e-07, "loss": 0.6356, "step": 2460 }, { "epoch": 9.62890625, "grad_norm": 0.232873797416687, "learning_rate": 8.378082861277281e-07, "loss": 0.6272, "step": 2465 }, { "epoch": 9.6484375, "grad_norm": 0.227997824549675, "learning_rate": 7.520465401290033e-07, "loss": 0.633, "step": 2470 }, { "epoch": 9.66796875, "grad_norm": 0.21839286386966705, "learning_rate": 6.708979133927762e-07, "loss": 0.6215, "step": 2475 }, { "epoch": 9.6875, "grad_norm": 0.22753040492534637, "learning_rate": 5.943661777680354e-07, "loss": 0.6272, "step": 2480 }, { "epoch": 9.70703125, "grad_norm": 0.22866863012313843, "learning_rate": 5.224548905072402e-07, "loss": 0.6357, "step": 2485 }, { "epoch": 9.7265625, "grad_norm": 0.2306712120771408, "learning_rate": 4.5516739410087494e-07, "loss": 0.6244, "step": 2490 }, { "epoch": 9.74609375, "grad_norm": 0.22779209911823273, "learning_rate": 3.9250681612225116e-07, "loss": 0.6309, "step": 2495 }, { "epoch": 9.765625, "grad_norm": 0.22719816863536835, "learning_rate": 3.3447606908196817e-07, "loss": 0.628, "step": 2500 }, { "epoch": 9.78515625, "grad_norm": 0.23172929883003235, "learning_rate": 2.8107785029265476e-07, "loss": 0.6293, "step": 2505 }, { "epoch": 9.8046875, "grad_norm": 0.22468186914920807, "learning_rate": 2.3231464174352512e-07, "loss": 0.6368, "step": 2510 }, { "epoch": 9.82421875, "grad_norm": 0.22247561812400818, "learning_rate": 1.8818870998508208e-07, "loss": 0.6222, "step": 2515 }, { "epoch": 9.84375, "grad_norm": 0.22515320777893066, "learning_rate": 1.487021060236904e-07, "loss": 0.6266, "step": 2520 }, { "epoch": 9.86328125, "grad_norm": 0.23118971288204193, "learning_rate": 1.1385666522630845e-07, "loss": 0.6308, "step": 2525 }, { "epoch": 9.8828125, "grad_norm": 0.22416307032108307, "learning_rate": 8.365400723512328e-08, "loss": 0.6239, "step": 2530 }, { "epoch": 9.90234375, "grad_norm": 0.22984710335731506, "learning_rate": 5.8095535892332964e-08, "loss": 0.6362, "step": 2535 }, { "epoch": 9.921875, "grad_norm": 0.23102597892284393, "learning_rate": 3.7182439174832106e-08, "loss": 0.6365, "step": 2540 }, { "epoch": 9.94140625, "grad_norm": 0.2295123189687729, "learning_rate": 2.091568913904496e-08, "loss": 0.6397, "step": 2545 }, { "epoch": 9.9609375, "grad_norm": 0.22766011953353882, "learning_rate": 9.296041875683781e-09, "loss": 0.6274, "step": 2550 }, { "epoch": 9.98046875, "grad_norm": 0.2338954210281372, "learning_rate": 2.3240374746658077e-09, "loss": 0.6212, "step": 2555 }, { "epoch": 10.0, "grad_norm": 0.22291633486747742, "learning_rate": 0.0, "loss": 0.616, "step": 2560 }, { "epoch": 10.0, "eval_loss": 2.1007895469665527, "eval_runtime": 0.5705, "eval_samples_per_second": 10.518, "eval_steps_per_second": 1.753, "step": 2560 }, { "epoch": 10.0, "step": 2560, "total_flos": 7.568434414263206e+18, "train_loss": 0.7105431989766657, "train_runtime": 14792.6859, "train_samples_per_second": 11.056, "train_steps_per_second": 0.173 } ], "logging_steps": 5, "max_steps": 2560, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.568434414263206e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }