{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.8706251088281386, "eval_steps": 5000, "global_step": 2500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008706251088281386, "grad_norm": 4.625, "learning_rate": 9.91292232671543e-06, "loss": 1.8999, "step": 25 }, { "epoch": 0.01741250217656277, "grad_norm": 4.34375, "learning_rate": 9.825844653430862e-06, "loss": 1.4397, "step": 50 }, { "epoch": 0.02611875326484416, "grad_norm": 3.25, "learning_rate": 9.738766980146292e-06, "loss": 1.3437, "step": 75 }, { "epoch": 0.03482500435312554, "grad_norm": 3.171875, "learning_rate": 9.651689306861722e-06, "loss": 1.3747, "step": 100 }, { "epoch": 0.04353125544140693, "grad_norm": 2.90625, "learning_rate": 9.564611633577151e-06, "loss": 1.3207, "step": 125 }, { "epoch": 0.05223750652968832, "grad_norm": 2.984375, "learning_rate": 9.477533960292583e-06, "loss": 1.2347, "step": 150 }, { "epoch": 0.060943757617969704, "grad_norm": 7.0, "learning_rate": 9.390456287008013e-06, "loss": 1.3941, "step": 175 }, { "epoch": 0.06965000870625108, "grad_norm": 3.453125, "learning_rate": 9.303378613723442e-06, "loss": 1.2748, "step": 200 }, { "epoch": 0.07835625979453248, "grad_norm": 2.84375, "learning_rate": 9.216300940438872e-06, "loss": 1.3089, "step": 225 }, { "epoch": 0.08706251088281386, "grad_norm": 3.015625, "learning_rate": 9.129223267154302e-06, "loss": 1.2282, "step": 250 }, { "epoch": 0.09576876197109524, "grad_norm": 3.234375, "learning_rate": 9.042145593869732e-06, "loss": 1.2018, "step": 275 }, { "epoch": 0.10447501305937663, "grad_norm": 3.3125, "learning_rate": 8.955067920585163e-06, "loss": 1.2438, "step": 300 }, { "epoch": 0.11318126414765801, "grad_norm": 2.984375, "learning_rate": 8.867990247300593e-06, "loss": 1.2704, "step": 325 }, { "epoch": 0.12188751523593941, "grad_norm": 3.609375, "learning_rate": 8.780912574016023e-06, "loss": 1.2005, "step": 350 }, { "epoch": 0.1305937663242208, "grad_norm": 3.03125, "learning_rate": 8.693834900731452e-06, "loss": 1.3047, "step": 375 }, { "epoch": 0.13930001741250217, "grad_norm": 3.109375, "learning_rate": 8.606757227446884e-06, "loss": 1.271, "step": 400 }, { "epoch": 0.14800626850078355, "grad_norm": 2.875, "learning_rate": 8.519679554162314e-06, "loss": 1.2901, "step": 425 }, { "epoch": 0.15671251958906496, "grad_norm": 2.578125, "learning_rate": 8.432601880877743e-06, "loss": 1.1314, "step": 450 }, { "epoch": 0.16541877067734634, "grad_norm": 2.84375, "learning_rate": 8.345524207593175e-06, "loss": 1.2264, "step": 475 }, { "epoch": 0.17412502176562772, "grad_norm": 3.9375, "learning_rate": 8.258446534308605e-06, "loss": 1.2414, "step": 500 }, { "epoch": 0.1828312728539091, "grad_norm": 3.78125, "learning_rate": 8.171368861024034e-06, "loss": 1.2092, "step": 525 }, { "epoch": 0.19153752394219048, "grad_norm": 3.515625, "learning_rate": 8.084291187739464e-06, "loss": 1.2098, "step": 550 }, { "epoch": 0.2002437750304719, "grad_norm": 3.0625, "learning_rate": 7.997213514454894e-06, "loss": 1.1714, "step": 575 }, { "epoch": 0.20895002611875327, "grad_norm": 3.03125, "learning_rate": 7.910135841170324e-06, "loss": 1.2548, "step": 600 }, { "epoch": 0.21765627720703465, "grad_norm": 3.484375, "learning_rate": 7.823058167885753e-06, "loss": 1.1677, "step": 625 }, { "epoch": 0.22636252829531603, "grad_norm": 3.96875, "learning_rate": 7.735980494601185e-06, "loss": 1.1563, "step": 650 }, { "epoch": 0.2350687793835974, "grad_norm": 2.890625, "learning_rate": 7.648902821316615e-06, "loss": 1.1172, "step": 675 }, { "epoch": 0.24377503047187882, "grad_norm": 2.875, "learning_rate": 7.5618251480320444e-06, "loss": 1.1941, "step": 700 }, { "epoch": 0.2524812815601602, "grad_norm": 3.4375, "learning_rate": 7.474747474747476e-06, "loss": 1.1228, "step": 725 }, { "epoch": 0.2611875326484416, "grad_norm": 4.34375, "learning_rate": 7.387669801462906e-06, "loss": 1.2008, "step": 750 }, { "epoch": 0.26989378373672296, "grad_norm": 3.765625, "learning_rate": 7.3005921281783354e-06, "loss": 1.1465, "step": 775 }, { "epoch": 0.27860003482500434, "grad_norm": 2.75, "learning_rate": 7.213514454893766e-06, "loss": 1.0869, "step": 800 }, { "epoch": 0.2873062859132857, "grad_norm": 2.78125, "learning_rate": 7.126436781609196e-06, "loss": 1.14, "step": 825 }, { "epoch": 0.2960125370015671, "grad_norm": 3.390625, "learning_rate": 7.039359108324626e-06, "loss": 1.1878, "step": 850 }, { "epoch": 0.30471878808984854, "grad_norm": 4.625, "learning_rate": 6.952281435040056e-06, "loss": 1.1763, "step": 875 }, { "epoch": 0.3134250391781299, "grad_norm": 2.828125, "learning_rate": 6.865203761755487e-06, "loss": 1.1286, "step": 900 }, { "epoch": 0.3221312902664113, "grad_norm": 3.796875, "learning_rate": 6.778126088470917e-06, "loss": 1.1759, "step": 925 }, { "epoch": 0.3308375413546927, "grad_norm": 3.15625, "learning_rate": 6.691048415186346e-06, "loss": 1.098, "step": 950 }, { "epoch": 0.33954379244297406, "grad_norm": 4.0625, "learning_rate": 6.603970741901777e-06, "loss": 1.1277, "step": 975 }, { "epoch": 0.34825004353125544, "grad_norm": 3.828125, "learning_rate": 6.5168930686172076e-06, "loss": 1.178, "step": 1000 }, { "epoch": 0.3569562946195368, "grad_norm": 2.875, "learning_rate": 6.429815395332637e-06, "loss": 1.0851, "step": 1025 }, { "epoch": 0.3656625457078182, "grad_norm": 2.5625, "learning_rate": 6.342737722048068e-06, "loss": 1.1963, "step": 1050 }, { "epoch": 0.3743687967960996, "grad_norm": 2.625, "learning_rate": 6.255660048763498e-06, "loss": 1.243, "step": 1075 }, { "epoch": 0.38307504788438096, "grad_norm": 3.765625, "learning_rate": 6.1685823754789275e-06, "loss": 1.1569, "step": 1100 }, { "epoch": 0.3917812989726624, "grad_norm": 4.25, "learning_rate": 6.081504702194357e-06, "loss": 1.1653, "step": 1125 }, { "epoch": 0.4004875500609438, "grad_norm": 3.6875, "learning_rate": 5.994427028909789e-06, "loss": 1.1118, "step": 1150 }, { "epoch": 0.40919380114922516, "grad_norm": 3.8125, "learning_rate": 5.9073493556252185e-06, "loss": 1.1483, "step": 1175 }, { "epoch": 0.41790005223750654, "grad_norm": 3.4375, "learning_rate": 5.820271682340648e-06, "loss": 1.1805, "step": 1200 }, { "epoch": 0.4266063033257879, "grad_norm": 3.5625, "learning_rate": 5.733194009056079e-06, "loss": 1.2287, "step": 1225 }, { "epoch": 0.4353125544140693, "grad_norm": 3.25, "learning_rate": 5.646116335771509e-06, "loss": 1.142, "step": 1250 }, { "epoch": 0.4440188055023507, "grad_norm": 3.21875, "learning_rate": 5.559038662486938e-06, "loss": 1.1072, "step": 1275 }, { "epoch": 0.45272505659063206, "grad_norm": 3.015625, "learning_rate": 5.47196098920237e-06, "loss": 1.1299, "step": 1300 }, { "epoch": 0.46143130767891344, "grad_norm": 3.46875, "learning_rate": 5.3848833159178e-06, "loss": 1.0985, "step": 1325 }, { "epoch": 0.4701375587671948, "grad_norm": 3.1875, "learning_rate": 5.297805642633229e-06, "loss": 1.1516, "step": 1350 }, { "epoch": 0.47884380985547625, "grad_norm": 2.890625, "learning_rate": 5.210727969348659e-06, "loss": 1.1192, "step": 1375 }, { "epoch": 0.48755006094375763, "grad_norm": 3.265625, "learning_rate": 5.12365029606409e-06, "loss": 1.1711, "step": 1400 }, { "epoch": 0.496256312032039, "grad_norm": 3.09375, "learning_rate": 5.0365726227795195e-06, "loss": 1.1778, "step": 1425 }, { "epoch": 0.5049625631203204, "grad_norm": 3.40625, "learning_rate": 4.94949494949495e-06, "loss": 1.1201, "step": 1450 }, { "epoch": 0.5136688142086018, "grad_norm": 3.5625, "learning_rate": 4.86241727621038e-06, "loss": 1.0224, "step": 1475 }, { "epoch": 0.5223750652968832, "grad_norm": 4.0, "learning_rate": 4.7753396029258105e-06, "loss": 1.1114, "step": 1500 }, { "epoch": 0.5310813163851645, "grad_norm": 3.234375, "learning_rate": 4.68826192964124e-06, "loss": 1.2059, "step": 1525 }, { "epoch": 0.5397875674734459, "grad_norm": 3.546875, "learning_rate": 4.601184256356671e-06, "loss": 1.1431, "step": 1550 }, { "epoch": 0.5484938185617273, "grad_norm": 3.328125, "learning_rate": 4.514106583072101e-06, "loss": 1.1475, "step": 1575 }, { "epoch": 0.5572000696500087, "grad_norm": 3.890625, "learning_rate": 4.4270289097875304e-06, "loss": 1.1649, "step": 1600 }, { "epoch": 0.5659063207382901, "grad_norm": 3.984375, "learning_rate": 4.339951236502961e-06, "loss": 1.1171, "step": 1625 }, { "epoch": 0.5746125718265714, "grad_norm": 3.53125, "learning_rate": 4.252873563218391e-06, "loss": 1.1305, "step": 1650 }, { "epoch": 0.5833188229148528, "grad_norm": 3.625, "learning_rate": 4.1657958899338214e-06, "loss": 1.1498, "step": 1675 }, { "epoch": 0.5920250740031342, "grad_norm": 3.203125, "learning_rate": 4.078718216649252e-06, "loss": 1.2036, "step": 1700 }, { "epoch": 0.6007313250914157, "grad_norm": 3.28125, "learning_rate": 3.991640543364682e-06, "loss": 1.2106, "step": 1725 }, { "epoch": 0.6094375761796971, "grad_norm": 3.265625, "learning_rate": 3.904562870080112e-06, "loss": 1.1543, "step": 1750 }, { "epoch": 0.6181438272679785, "grad_norm": 3.21875, "learning_rate": 3.817485196795541e-06, "loss": 1.0685, "step": 1775 }, { "epoch": 0.6268500783562598, "grad_norm": 3.015625, "learning_rate": 3.730407523510972e-06, "loss": 1.1425, "step": 1800 }, { "epoch": 0.6355563294445412, "grad_norm": 2.546875, "learning_rate": 3.6433298502264026e-06, "loss": 1.0915, "step": 1825 }, { "epoch": 0.6442625805328226, "grad_norm": 3.375, "learning_rate": 3.5562521769418323e-06, "loss": 1.0676, "step": 1850 }, { "epoch": 0.652968831621104, "grad_norm": 3.390625, "learning_rate": 3.4691745036572625e-06, "loss": 1.1378, "step": 1875 }, { "epoch": 0.6616750827093854, "grad_norm": 3.328125, "learning_rate": 3.3820968303726927e-06, "loss": 1.1678, "step": 1900 }, { "epoch": 0.6703813337976667, "grad_norm": 3.125, "learning_rate": 3.295019157088123e-06, "loss": 1.1436, "step": 1925 }, { "epoch": 0.6790875848859481, "grad_norm": 4.09375, "learning_rate": 3.207941483803553e-06, "loss": 1.1242, "step": 1950 }, { "epoch": 0.6877938359742295, "grad_norm": 2.9375, "learning_rate": 3.1208638105189833e-06, "loss": 1.092, "step": 1975 }, { "epoch": 0.6965000870625109, "grad_norm": 4.84375, "learning_rate": 3.0337861372344135e-06, "loss": 1.127, "step": 2000 }, { "epoch": 0.7052063381507923, "grad_norm": 3.109375, "learning_rate": 2.9467084639498432e-06, "loss": 1.084, "step": 2025 }, { "epoch": 0.7139125892390736, "grad_norm": 3.09375, "learning_rate": 2.859630790665274e-06, "loss": 1.1275, "step": 2050 }, { "epoch": 0.722618840327355, "grad_norm": 3.75, "learning_rate": 2.7725531173807036e-06, "loss": 1.1879, "step": 2075 }, { "epoch": 0.7313250914156364, "grad_norm": 4.21875, "learning_rate": 2.685475444096134e-06, "loss": 1.0525, "step": 2100 }, { "epoch": 0.7400313425039178, "grad_norm": 3.375, "learning_rate": 2.5983977708115644e-06, "loss": 1.1301, "step": 2125 }, { "epoch": 0.7487375935921992, "grad_norm": 3.140625, "learning_rate": 2.511320097526994e-06, "loss": 1.1141, "step": 2150 }, { "epoch": 0.7574438446804805, "grad_norm": 4.375, "learning_rate": 2.4242424242424244e-06, "loss": 1.2052, "step": 2175 }, { "epoch": 0.7661500957687619, "grad_norm": 3.296875, "learning_rate": 2.3371647509578546e-06, "loss": 1.0469, "step": 2200 }, { "epoch": 0.7748563468570434, "grad_norm": 3.53125, "learning_rate": 2.2500870776732848e-06, "loss": 1.1642, "step": 2225 }, { "epoch": 0.7835625979453248, "grad_norm": 3.03125, "learning_rate": 2.163009404388715e-06, "loss": 1.1541, "step": 2250 }, { "epoch": 0.7922688490336062, "grad_norm": 3.5625, "learning_rate": 2.075931731104145e-06, "loss": 1.1254, "step": 2275 }, { "epoch": 0.8009751001218876, "grad_norm": 3.09375, "learning_rate": 1.9888540578195753e-06, "loss": 1.0916, "step": 2300 }, { "epoch": 0.8096813512101689, "grad_norm": 3.0625, "learning_rate": 1.9017763845350053e-06, "loss": 1.088, "step": 2325 }, { "epoch": 0.8183876022984503, "grad_norm": 3.734375, "learning_rate": 1.8146987112504355e-06, "loss": 1.0664, "step": 2350 }, { "epoch": 0.8270938533867317, "grad_norm": 3.671875, "learning_rate": 1.7276210379658657e-06, "loss": 1.092, "step": 2375 }, { "epoch": 0.8358001044750131, "grad_norm": 3.0625, "learning_rate": 1.6405433646812957e-06, "loss": 1.0523, "step": 2400 }, { "epoch": 0.8445063555632945, "grad_norm": 4.21875, "learning_rate": 1.553465691396726e-06, "loss": 1.0969, "step": 2425 }, { "epoch": 0.8532126066515758, "grad_norm": 3.828125, "learning_rate": 1.4663880181121563e-06, "loss": 1.2036, "step": 2450 }, { "epoch": 0.8619188577398572, "grad_norm": 3.578125, "learning_rate": 1.3793103448275862e-06, "loss": 1.1054, "step": 2475 }, { "epoch": 0.8706251088281386, "grad_norm": 2.984375, "learning_rate": 1.2922326715430164e-06, "loss": 1.0945, "step": 2500 } ], "logging_steps": 25, "max_steps": 2871, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "total_flos": 8.52116678853673e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }