{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.8310652178429703, "eval_steps": 500, "global_step": 2500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003324260871371881, "grad_norm": 2.5143792629241943, "learning_rate": 1.6622340425531916e-08, "loss": 9.0836, "step": 10 }, { "epoch": 0.006648521742743762, "grad_norm": 2.6173431873321533, "learning_rate": 3.324468085106383e-08, "loss": 9.0829, "step": 20 }, { "epoch": 0.009972782614115643, "grad_norm": 2.5349628925323486, "learning_rate": 4.9867021276595746e-08, "loss": 9.0061, "step": 30 }, { "epoch": 0.013297043485487523, "grad_norm": 2.3916308879852295, "learning_rate": 6.648936170212767e-08, "loss": 8.9747, "step": 40 }, { "epoch": 0.016621304356859403, "grad_norm": 2.5103342533111572, "learning_rate": 8.311170212765958e-08, "loss": 9.0057, "step": 50 }, { "epoch": 0.019945565228231286, "grad_norm": 2.421079397201538, "learning_rate": 9.973404255319149e-08, "loss": 8.9885, "step": 60 }, { "epoch": 0.023269826099603166, "grad_norm": 2.6052393913269043, "learning_rate": 1.163563829787234e-07, "loss": 8.9706, "step": 70 }, { "epoch": 0.026594086970975046, "grad_norm": 2.376847505569458, "learning_rate": 1.3297872340425533e-07, "loss": 9.0211, "step": 80 }, { "epoch": 0.02991834784234693, "grad_norm": 2.6200971603393555, "learning_rate": 1.4960106382978723e-07, "loss": 8.9903, "step": 90 }, { "epoch": 0.033242608713718806, "grad_norm": 2.515320301055908, "learning_rate": 1.6622340425531916e-07, "loss": 8.9643, "step": 100 }, { "epoch": 0.03656686958509069, "grad_norm": 2.4840102195739746, "learning_rate": 1.8284574468085108e-07, "loss": 8.9761, "step": 110 }, { "epoch": 0.03989113045646257, "grad_norm": 2.5950074195861816, "learning_rate": 1.9946808510638298e-07, "loss": 9.0101, "step": 120 }, { "epoch": 0.04321539132783445, "grad_norm": 2.530604839324951, "learning_rate": 2.160904255319149e-07, "loss": 8.961, "step": 130 }, { "epoch": 0.04653965219920633, "grad_norm": 2.5579464435577393, "learning_rate": 2.327127659574468e-07, "loss": 8.8733, "step": 140 }, { "epoch": 0.04986391307057821, "grad_norm": 2.638901472091675, "learning_rate": 2.4933510638297876e-07, "loss": 8.9534, "step": 150 }, { "epoch": 0.05318817394195009, "grad_norm": 2.6817493438720703, "learning_rate": 2.6595744680851066e-07, "loss": 9.0014, "step": 160 }, { "epoch": 0.05651243481332197, "grad_norm": 2.6700024604797363, "learning_rate": 2.8257978723404256e-07, "loss": 8.8832, "step": 170 }, { "epoch": 0.05983669568469386, "grad_norm": 2.794243335723877, "learning_rate": 2.9920212765957446e-07, "loss": 8.9012, "step": 180 }, { "epoch": 0.06316095655606574, "grad_norm": 3.000873327255249, "learning_rate": 3.1582446808510636e-07, "loss": 8.7874, "step": 190 }, { "epoch": 0.06648521742743761, "grad_norm": 2.872612714767456, "learning_rate": 3.324468085106383e-07, "loss": 8.8558, "step": 200 }, { "epoch": 0.0698094782988095, "grad_norm": 2.9133315086364746, "learning_rate": 3.490691489361702e-07, "loss": 8.8333, "step": 210 }, { "epoch": 0.07313373917018139, "grad_norm": 3.1017534732818604, "learning_rate": 3.6569148936170217e-07, "loss": 8.8199, "step": 220 }, { "epoch": 0.07645800004155326, "grad_norm": 2.9153056144714355, "learning_rate": 3.8231382978723407e-07, "loss": 8.8266, "step": 230 }, { "epoch": 0.07978226091292515, "grad_norm": 3.0996434688568115, "learning_rate": 3.9893617021276597e-07, "loss": 8.7202, "step": 240 }, { "epoch": 0.08310652178429702, "grad_norm": 3.257809638977051, "learning_rate": 4.1555851063829787e-07, "loss": 8.6149, "step": 250 }, { "epoch": 0.0864307826556689, "grad_norm": 3.378631353378296, "learning_rate": 4.321808510638298e-07, "loss": 8.5092, "step": 260 }, { "epoch": 0.08975504352704078, "grad_norm": 3.3546876907348633, "learning_rate": 4.488031914893618e-07, "loss": 8.477, "step": 270 }, { "epoch": 0.09307930439841267, "grad_norm": 3.292569637298584, "learning_rate": 4.654255319148936e-07, "loss": 8.4594, "step": 280 }, { "epoch": 0.09640356526978455, "grad_norm": 3.190239906311035, "learning_rate": 4.820478723404255e-07, "loss": 8.3134, "step": 290 }, { "epoch": 0.09972782614115643, "grad_norm": 3.2521212100982666, "learning_rate": 4.986702127659575e-07, "loss": 8.2896, "step": 300 }, { "epoch": 0.10305208701252831, "grad_norm": 3.399919033050537, "learning_rate": 5.152925531914893e-07, "loss": 8.1656, "step": 310 }, { "epoch": 0.10637634788390019, "grad_norm": 3.412688970565796, "learning_rate": 5.319148936170213e-07, "loss": 7.9764, "step": 320 }, { "epoch": 0.10970060875527207, "grad_norm": 3.2669174671173096, "learning_rate": 5.485372340425532e-07, "loss": 7.9755, "step": 330 }, { "epoch": 0.11302486962664395, "grad_norm": 3.405444383621216, "learning_rate": 5.651595744680851e-07, "loss": 7.8587, "step": 340 }, { "epoch": 0.11634913049801583, "grad_norm": 3.2224161624908447, "learning_rate": 5.81781914893617e-07, "loss": 7.7357, "step": 350 }, { "epoch": 0.11967339136938772, "grad_norm": 3.230048894882202, "learning_rate": 5.984042553191489e-07, "loss": 7.5667, "step": 360 }, { "epoch": 0.12299765224075959, "grad_norm": 3.2728312015533447, "learning_rate": 6.150265957446809e-07, "loss": 7.4844, "step": 370 }, { "epoch": 0.12632191311213148, "grad_norm": 3.200800895690918, "learning_rate": 6.316489361702127e-07, "loss": 7.3059, "step": 380 }, { "epoch": 0.12964617398350337, "grad_norm": 3.075329065322876, "learning_rate": 6.482712765957447e-07, "loss": 7.1618, "step": 390 }, { "epoch": 0.13297043485487522, "grad_norm": 3.1853721141815186, "learning_rate": 6.648936170212766e-07, "loss": 7.1392, "step": 400 }, { "epoch": 0.1362946957262471, "grad_norm": 3.0336828231811523, "learning_rate": 6.815159574468085e-07, "loss": 6.974, "step": 410 }, { "epoch": 0.139618956597619, "grad_norm": 3.027355670928955, "learning_rate": 6.981382978723404e-07, "loss": 6.7714, "step": 420 }, { "epoch": 0.14294321746899089, "grad_norm": 2.99857497215271, "learning_rate": 7.147606382978723e-07, "loss": 6.6538, "step": 430 }, { "epoch": 0.14626747834036277, "grad_norm": 2.840437650680542, "learning_rate": 7.313829787234043e-07, "loss": 6.4758, "step": 440 }, { "epoch": 0.14959173921173463, "grad_norm": 3.076049566268921, "learning_rate": 7.480053191489362e-07, "loss": 6.3225, "step": 450 }, { "epoch": 0.15291600008310652, "grad_norm": 2.8588602542877197, "learning_rate": 7.646276595744681e-07, "loss": 6.1646, "step": 460 }, { "epoch": 0.1562402609544784, "grad_norm": 2.9168858528137207, "learning_rate": 7.8125e-07, "loss": 6.0828, "step": 470 }, { "epoch": 0.1595645218258503, "grad_norm": 2.795363187789917, "learning_rate": 7.978723404255319e-07, "loss": 5.9192, "step": 480 }, { "epoch": 0.16288878269722215, "grad_norm": 2.3897600173950195, "learning_rate": 8.144946808510639e-07, "loss": 5.6675, "step": 490 }, { "epoch": 0.16621304356859404, "grad_norm": 2.279939651489258, "learning_rate": 8.311170212765957e-07, "loss": 5.5443, "step": 500 }, { "epoch": 0.16953730443996592, "grad_norm": 2.394994020462036, "learning_rate": 8.477393617021276e-07, "loss": 5.4127, "step": 510 }, { "epoch": 0.1728615653113378, "grad_norm": 2.3148529529571533, "learning_rate": 8.643617021276596e-07, "loss": 5.2102, "step": 520 }, { "epoch": 0.1761858261827097, "grad_norm": 2.053243637084961, "learning_rate": 8.809840425531915e-07, "loss": 5.1204, "step": 530 }, { "epoch": 0.17951008705408156, "grad_norm": 2.0687060356140137, "learning_rate": 8.976063829787235e-07, "loss": 4.9786, "step": 540 }, { "epoch": 0.18283434792545344, "grad_norm": 1.8042306900024414, "learning_rate": 9.142287234042553e-07, "loss": 4.7735, "step": 550 }, { "epoch": 0.18615860879682533, "grad_norm": 1.8975441455841064, "learning_rate": 9.308510638297872e-07, "loss": 4.6749, "step": 560 }, { "epoch": 0.18948286966819722, "grad_norm": 1.6640989780426025, "learning_rate": 9.474734042553192e-07, "loss": 4.529, "step": 570 }, { "epoch": 0.1928071305395691, "grad_norm": 1.9563913345336914, "learning_rate": 9.64095744680851e-07, "loss": 4.4255, "step": 580 }, { "epoch": 0.19613139141094096, "grad_norm": 1.433192253112793, "learning_rate": 9.80718085106383e-07, "loss": 4.3369, "step": 590 }, { "epoch": 0.19945565228231285, "grad_norm": 1.6940258741378784, "learning_rate": 9.97340425531915e-07, "loss": 4.2264, "step": 600 }, { "epoch": 0.20277991315368474, "grad_norm": 1.3721990585327148, "learning_rate": 1.0139627659574467e-06, "loss": 4.0771, "step": 610 }, { "epoch": 0.20610417402505662, "grad_norm": 1.3481799364089966, "learning_rate": 1.0305851063829786e-06, "loss": 3.9652, "step": 620 }, { "epoch": 0.20942843489642848, "grad_norm": 1.3010597229003906, "learning_rate": 1.0472074468085108e-06, "loss": 3.9205, "step": 630 }, { "epoch": 0.21275269576780037, "grad_norm": 1.551216721534729, "learning_rate": 1.0638297872340427e-06, "loss": 3.8234, "step": 640 }, { "epoch": 0.21607695663917226, "grad_norm": 1.3280216455459595, "learning_rate": 1.0804521276595746e-06, "loss": 3.6898, "step": 650 }, { "epoch": 0.21940121751054414, "grad_norm": 1.0909334421157837, "learning_rate": 1.0970744680851065e-06, "loss": 3.6489, "step": 660 }, { "epoch": 0.22272547838191603, "grad_norm": 1.345831036567688, "learning_rate": 1.1136968085106384e-06, "loss": 3.5296, "step": 670 }, { "epoch": 0.2260497392532879, "grad_norm": 1.0882962942123413, "learning_rate": 1.1303191489361703e-06, "loss": 3.489, "step": 680 }, { "epoch": 0.22937400012465978, "grad_norm": 0.9840554594993591, "learning_rate": 1.1469414893617022e-06, "loss": 3.4164, "step": 690 }, { "epoch": 0.23269826099603166, "grad_norm": 1.0956693887710571, "learning_rate": 1.163563829787234e-06, "loss": 3.3211, "step": 700 }, { "epoch": 0.23602252186740355, "grad_norm": 0.8875247240066528, "learning_rate": 1.1801861702127662e-06, "loss": 3.2647, "step": 710 }, { "epoch": 0.23934678273877544, "grad_norm": 1.268930196762085, "learning_rate": 1.1968085106382979e-06, "loss": 3.2033, "step": 720 }, { "epoch": 0.2426710436101473, "grad_norm": 0.9430990815162659, "learning_rate": 1.2134308510638298e-06, "loss": 3.1317, "step": 730 }, { "epoch": 0.24599530448151918, "grad_norm": 0.9196615219116211, "learning_rate": 1.2300531914893619e-06, "loss": 3.0706, "step": 740 }, { "epoch": 0.24931956535289107, "grad_norm": 0.7046869993209839, "learning_rate": 1.2466755319148936e-06, "loss": 3.0142, "step": 750 }, { "epoch": 0.25264382622426296, "grad_norm": 0.9173153638839722, "learning_rate": 1.2632978723404255e-06, "loss": 2.949, "step": 760 }, { "epoch": 0.25596808709563484, "grad_norm": 0.8014841675758362, "learning_rate": 1.2799202127659576e-06, "loss": 2.9325, "step": 770 }, { "epoch": 0.25929234796700673, "grad_norm": 0.9520502686500549, "learning_rate": 1.2965425531914895e-06, "loss": 2.859, "step": 780 }, { "epoch": 0.2626166088383786, "grad_norm": 0.7679387331008911, "learning_rate": 1.3131648936170214e-06, "loss": 2.8509, "step": 790 }, { "epoch": 0.26594086970975045, "grad_norm": 0.7660825252532959, "learning_rate": 1.3297872340425533e-06, "loss": 2.7896, "step": 800 }, { "epoch": 0.26926513058112234, "grad_norm": 0.7754834294319153, "learning_rate": 1.3464095744680852e-06, "loss": 2.736, "step": 810 }, { "epoch": 0.2725893914524942, "grad_norm": 0.5802922248840332, "learning_rate": 1.363031914893617e-06, "loss": 2.6962, "step": 820 }, { "epoch": 0.2759136523238661, "grad_norm": 0.6394158601760864, "learning_rate": 1.379654255319149e-06, "loss": 2.6656, "step": 830 }, { "epoch": 0.279237913195238, "grad_norm": 0.6503139138221741, "learning_rate": 1.3962765957446809e-06, "loss": 2.616, "step": 840 }, { "epoch": 0.2825621740666099, "grad_norm": 0.6165557503700256, "learning_rate": 1.412898936170213e-06, "loss": 2.5971, "step": 850 }, { "epoch": 0.28588643493798177, "grad_norm": 0.6192012429237366, "learning_rate": 1.4295212765957447e-06, "loss": 2.5536, "step": 860 }, { "epoch": 0.28921069580935366, "grad_norm": 0.6266525983810425, "learning_rate": 1.4461436170212766e-06, "loss": 2.5036, "step": 870 }, { "epoch": 0.29253495668072554, "grad_norm": 0.5376760363578796, "learning_rate": 1.4627659574468087e-06, "loss": 2.5136, "step": 880 }, { "epoch": 0.2958592175520974, "grad_norm": 0.6490041613578796, "learning_rate": 1.4793882978723404e-06, "loss": 2.4638, "step": 890 }, { "epoch": 0.29918347842346926, "grad_norm": 0.6368073225021362, "learning_rate": 1.4960106382978725e-06, "loss": 2.4258, "step": 900 }, { "epoch": 0.30250773929484115, "grad_norm": 0.5121726989746094, "learning_rate": 1.5126329787234044e-06, "loss": 2.4016, "step": 910 }, { "epoch": 0.30583200016621304, "grad_norm": 0.5835744738578796, "learning_rate": 1.5292553191489363e-06, "loss": 2.4192, "step": 920 }, { "epoch": 0.3091562610375849, "grad_norm": 0.5275241732597351, "learning_rate": 1.5458776595744682e-06, "loss": 2.3687, "step": 930 }, { "epoch": 0.3124805219089568, "grad_norm": 0.4900510609149933, "learning_rate": 1.5625e-06, "loss": 2.3208, "step": 940 }, { "epoch": 0.3158047827803287, "grad_norm": 0.4609052240848541, "learning_rate": 1.5791223404255322e-06, "loss": 2.3363, "step": 950 }, { "epoch": 0.3191290436517006, "grad_norm": 0.461566299200058, "learning_rate": 1.5957446808510639e-06, "loss": 2.2793, "step": 960 }, { "epoch": 0.32245330452307247, "grad_norm": 0.49795401096343994, "learning_rate": 1.6123670212765958e-06, "loss": 2.2845, "step": 970 }, { "epoch": 0.3257775653944443, "grad_norm": 0.4422404170036316, "learning_rate": 1.6289893617021279e-06, "loss": 2.2744, "step": 980 }, { "epoch": 0.3291018262658162, "grad_norm": 0.4161861538887024, "learning_rate": 1.6456117021276596e-06, "loss": 2.2463, "step": 990 }, { "epoch": 0.3324260871371881, "grad_norm": 0.46071523427963257, "learning_rate": 1.6622340425531915e-06, "loss": 2.2271, "step": 1000 }, { "epoch": 0.33575034800855996, "grad_norm": 0.3772067129611969, "learning_rate": 1.6788563829787236e-06, "loss": 2.2119, "step": 1010 }, { "epoch": 0.33907460887993185, "grad_norm": 0.44782117009162903, "learning_rate": 1.6954787234042553e-06, "loss": 2.2022, "step": 1020 }, { "epoch": 0.34239886975130374, "grad_norm": 0.4486360251903534, "learning_rate": 1.7121010638297872e-06, "loss": 2.1723, "step": 1030 }, { "epoch": 0.3457231306226756, "grad_norm": 0.47423475980758667, "learning_rate": 1.7287234042553193e-06, "loss": 2.1295, "step": 1040 }, { "epoch": 0.3490473914940475, "grad_norm": 0.4199342131614685, "learning_rate": 1.745345744680851e-06, "loss": 2.1387, "step": 1050 }, { "epoch": 0.3523716523654194, "grad_norm": 0.43744415044784546, "learning_rate": 1.761968085106383e-06, "loss": 2.1195, "step": 1060 }, { "epoch": 0.3556959132367913, "grad_norm": 0.3780044913291931, "learning_rate": 1.778590425531915e-06, "loss": 2.1194, "step": 1070 }, { "epoch": 0.3590201741081631, "grad_norm": 0.40349099040031433, "learning_rate": 1.795212765957447e-06, "loss": 2.1005, "step": 1080 }, { "epoch": 0.362344434979535, "grad_norm": 0.378764271736145, "learning_rate": 1.8118351063829788e-06, "loss": 2.0757, "step": 1090 }, { "epoch": 0.3656686958509069, "grad_norm": 0.34115588665008545, "learning_rate": 1.8284574468085107e-06, "loss": 2.0591, "step": 1100 }, { "epoch": 0.3689929567222788, "grad_norm": 0.39553964138031006, "learning_rate": 1.8450797872340428e-06, "loss": 2.0298, "step": 1110 }, { "epoch": 0.37231721759365066, "grad_norm": 0.36110466718673706, "learning_rate": 1.8617021276595745e-06, "loss": 2.0113, "step": 1120 }, { "epoch": 0.37564147846502255, "grad_norm": 0.33477863669395447, "learning_rate": 1.8783244680851066e-06, "loss": 2.0197, "step": 1130 }, { "epoch": 0.37896573933639444, "grad_norm": 0.43919846415519714, "learning_rate": 1.8949468085106385e-06, "loss": 1.9794, "step": 1140 }, { "epoch": 0.3822900002077663, "grad_norm": 0.3243393898010254, "learning_rate": 1.9115691489361704e-06, "loss": 1.9667, "step": 1150 }, { "epoch": 0.3856142610791382, "grad_norm": 0.3350262939929962, "learning_rate": 1.928191489361702e-06, "loss": 1.978, "step": 1160 }, { "epoch": 0.38893852195051004, "grad_norm": 0.3365063965320587, "learning_rate": 1.944813829787234e-06, "loss": 1.9701, "step": 1170 }, { "epoch": 0.39226278282188193, "grad_norm": 0.3240489661693573, "learning_rate": 1.961436170212766e-06, "loss": 1.9465, "step": 1180 }, { "epoch": 0.3955870436932538, "grad_norm": 0.3239437937736511, "learning_rate": 1.978058510638298e-06, "loss": 1.9253, "step": 1190 }, { "epoch": 0.3989113045646257, "grad_norm": 0.3397749364376068, "learning_rate": 1.99468085106383e-06, "loss": 1.9057, "step": 1200 }, { "epoch": 0.4022355654359976, "grad_norm": 0.2915981113910675, "learning_rate": 2.011303191489362e-06, "loss": 1.9047, "step": 1210 }, { "epoch": 0.4055598263073695, "grad_norm": 0.39456045627593994, "learning_rate": 2.0279255319148935e-06, "loss": 1.9144, "step": 1220 }, { "epoch": 0.40888408717874136, "grad_norm": 0.2593387961387634, "learning_rate": 2.0445478723404256e-06, "loss": 1.8969, "step": 1230 }, { "epoch": 0.41220834805011325, "grad_norm": 0.30935177206993103, "learning_rate": 2.0611702127659573e-06, "loss": 1.8931, "step": 1240 }, { "epoch": 0.41553260892148514, "grad_norm": 0.27917250990867615, "learning_rate": 2.0777925531914894e-06, "loss": 1.8899, "step": 1250 }, { "epoch": 0.41885686979285697, "grad_norm": 0.25976502895355225, "learning_rate": 2.0944148936170215e-06, "loss": 1.8503, "step": 1260 }, { "epoch": 0.42218113066422885, "grad_norm": 0.31833794713020325, "learning_rate": 2.111037234042553e-06, "loss": 1.8527, "step": 1270 }, { "epoch": 0.42550539153560074, "grad_norm": 0.2671976685523987, "learning_rate": 2.1276595744680853e-06, "loss": 1.8505, "step": 1280 }, { "epoch": 0.42882965240697263, "grad_norm": 0.3245258629322052, "learning_rate": 2.144281914893617e-06, "loss": 1.864, "step": 1290 }, { "epoch": 0.4321539132783445, "grad_norm": 0.2622531056404114, "learning_rate": 2.160904255319149e-06, "loss": 1.8301, "step": 1300 }, { "epoch": 0.4354781741497164, "grad_norm": 0.3247709274291992, "learning_rate": 2.177526595744681e-06, "loss": 1.812, "step": 1310 }, { "epoch": 0.4388024350210883, "grad_norm": 0.26424384117126465, "learning_rate": 2.194148936170213e-06, "loss": 1.7958, "step": 1320 }, { "epoch": 0.4421266958924602, "grad_norm": 0.2569092810153961, "learning_rate": 2.210771276595745e-06, "loss": 1.8147, "step": 1330 }, { "epoch": 0.44545095676383206, "grad_norm": 0.2393629103899002, "learning_rate": 2.2273936170212767e-06, "loss": 1.7976, "step": 1340 }, { "epoch": 0.44877521763520395, "grad_norm": 0.232402965426445, "learning_rate": 2.244015957446809e-06, "loss": 1.7597, "step": 1350 }, { "epoch": 0.4520994785065758, "grad_norm": 0.26385971903800964, "learning_rate": 2.2606382978723405e-06, "loss": 1.7781, "step": 1360 }, { "epoch": 0.45542373937794767, "grad_norm": 0.2671038806438446, "learning_rate": 2.277260638297872e-06, "loss": 1.7583, "step": 1370 }, { "epoch": 0.45874800024931955, "grad_norm": 0.27096447348594666, "learning_rate": 2.2938829787234043e-06, "loss": 1.7402, "step": 1380 }, { "epoch": 0.46207226112069144, "grad_norm": 0.2245018631219864, "learning_rate": 2.3105053191489364e-06, "loss": 1.7644, "step": 1390 }, { "epoch": 0.46539652199206333, "grad_norm": 0.20663714408874512, "learning_rate": 2.327127659574468e-06, "loss": 1.7519, "step": 1400 }, { "epoch": 0.4687207828634352, "grad_norm": 0.26273128390312195, "learning_rate": 2.3437500000000002e-06, "loss": 1.7312, "step": 1410 }, { "epoch": 0.4720450437348071, "grad_norm": 0.24725256860256195, "learning_rate": 2.3603723404255323e-06, "loss": 1.7217, "step": 1420 }, { "epoch": 0.475369304606179, "grad_norm": 0.25341796875, "learning_rate": 2.376994680851064e-06, "loss": 1.7246, "step": 1430 }, { "epoch": 0.4786935654775509, "grad_norm": 0.21035414934158325, "learning_rate": 2.3936170212765957e-06, "loss": 1.7017, "step": 1440 }, { "epoch": 0.4820178263489227, "grad_norm": 0.21454143524169922, "learning_rate": 2.410239361702128e-06, "loss": 1.7049, "step": 1450 }, { "epoch": 0.4853420872202946, "grad_norm": 0.22413010895252228, "learning_rate": 2.4268617021276595e-06, "loss": 1.6809, "step": 1460 }, { "epoch": 0.4886663480916665, "grad_norm": 0.2039473056793213, "learning_rate": 2.4434840425531916e-06, "loss": 1.6873, "step": 1470 }, { "epoch": 0.49199060896303837, "grad_norm": 0.18895457684993744, "learning_rate": 2.4601063829787237e-06, "loss": 1.69, "step": 1480 }, { "epoch": 0.49531486983441025, "grad_norm": 0.21047964692115784, "learning_rate": 2.4767287234042554e-06, "loss": 1.681, "step": 1490 }, { "epoch": 0.49863913070578214, "grad_norm": 0.2226460874080658, "learning_rate": 2.493351063829787e-06, "loss": 1.6613, "step": 1500 }, { "epoch": 0.501963391577154, "grad_norm": 0.21892835199832916, "learning_rate": 2.5099734042553192e-06, "loss": 1.6376, "step": 1510 }, { "epoch": 0.5052876524485259, "grad_norm": 0.20363831520080566, "learning_rate": 2.526595744680851e-06, "loss": 1.6541, "step": 1520 }, { "epoch": 0.5086119133198977, "grad_norm": 0.1988699585199356, "learning_rate": 2.543218085106383e-06, "loss": 1.6422, "step": 1530 }, { "epoch": 0.5119361741912697, "grad_norm": 0.2050096094608307, "learning_rate": 2.559840425531915e-06, "loss": 1.6377, "step": 1540 }, { "epoch": 0.5152604350626415, "grad_norm": 0.23265878856182098, "learning_rate": 2.5764627659574472e-06, "loss": 1.6251, "step": 1550 }, { "epoch": 0.5185846959340135, "grad_norm": 0.2024969905614853, "learning_rate": 2.593085106382979e-06, "loss": 1.6048, "step": 1560 }, { "epoch": 0.5219089568053853, "grad_norm": 0.21343863010406494, "learning_rate": 2.6097074468085106e-06, "loss": 1.6195, "step": 1570 }, { "epoch": 0.5252332176767572, "grad_norm": 0.1862565129995346, "learning_rate": 2.6263297872340427e-06, "loss": 1.5991, "step": 1580 }, { "epoch": 0.5285574785481291, "grad_norm": 0.22765249013900757, "learning_rate": 2.6429521276595744e-06, "loss": 1.5957, "step": 1590 }, { "epoch": 0.5318817394195009, "grad_norm": 0.19874997437000275, "learning_rate": 2.6595744680851065e-06, "loss": 1.5847, "step": 1600 }, { "epoch": 0.5352060002908728, "grad_norm": 0.25979486107826233, "learning_rate": 2.6761968085106386e-06, "loss": 1.6046, "step": 1610 }, { "epoch": 0.5385302611622447, "grad_norm": 0.1831529289484024, "learning_rate": 2.6928191489361703e-06, "loss": 1.5835, "step": 1620 }, { "epoch": 0.5418545220336166, "grad_norm": 0.2680751085281372, "learning_rate": 2.7094414893617024e-06, "loss": 1.6009, "step": 1630 }, { "epoch": 0.5451787829049884, "grad_norm": 0.18160907924175262, "learning_rate": 2.726063829787234e-06, "loss": 1.5666, "step": 1640 }, { "epoch": 0.5485030437763604, "grad_norm": 0.22875571250915527, "learning_rate": 2.742686170212766e-06, "loss": 1.5614, "step": 1650 }, { "epoch": 0.5518273046477322, "grad_norm": 0.21110033988952637, "learning_rate": 2.759308510638298e-06, "loss": 1.5707, "step": 1660 }, { "epoch": 0.5551515655191042, "grad_norm": 0.1887374073266983, "learning_rate": 2.77593085106383e-06, "loss": 1.5781, "step": 1670 }, { "epoch": 0.558475826390476, "grad_norm": 0.1916954219341278, "learning_rate": 2.7925531914893617e-06, "loss": 1.563, "step": 1680 }, { "epoch": 0.5618000872618478, "grad_norm": 0.21001753211021423, "learning_rate": 2.809175531914894e-06, "loss": 1.5495, "step": 1690 }, { "epoch": 0.5651243481332198, "grad_norm": 0.1702377200126648, "learning_rate": 2.825797872340426e-06, "loss": 1.5427, "step": 1700 }, { "epoch": 0.5684486090045916, "grad_norm": 0.19061295688152313, "learning_rate": 2.8424202127659576e-06, "loss": 1.5387, "step": 1710 }, { "epoch": 0.5717728698759635, "grad_norm": 0.17503058910369873, "learning_rate": 2.8590425531914893e-06, "loss": 1.5154, "step": 1720 }, { "epoch": 0.5750971307473354, "grad_norm": 0.1703094244003296, "learning_rate": 2.8756648936170214e-06, "loss": 1.5209, "step": 1730 }, { "epoch": 0.5784213916187073, "grad_norm": 0.22713126242160797, "learning_rate": 2.892287234042553e-06, "loss": 1.529, "step": 1740 }, { "epoch": 0.5817456524900791, "grad_norm": 0.16218431293964386, "learning_rate": 2.9089095744680852e-06, "loss": 1.505, "step": 1750 }, { "epoch": 0.5850699133614511, "grad_norm": 0.16082778573036194, "learning_rate": 2.9255319148936174e-06, "loss": 1.5312, "step": 1760 }, { "epoch": 0.5883941742328229, "grad_norm": 0.19500340521335602, "learning_rate": 2.942154255319149e-06, "loss": 1.4971, "step": 1770 }, { "epoch": 0.5917184351041948, "grad_norm": 0.16831324994564056, "learning_rate": 2.9587765957446807e-06, "loss": 1.5172, "step": 1780 }, { "epoch": 0.5950426959755667, "grad_norm": 0.17963413894176483, "learning_rate": 2.975398936170213e-06, "loss": 1.5076, "step": 1790 }, { "epoch": 0.5983669568469385, "grad_norm": 0.17123515903949738, "learning_rate": 2.992021276595745e-06, "loss": 1.4941, "step": 1800 }, { "epoch": 0.6016912177183105, "grad_norm": 0.15727902948856354, "learning_rate": 3.0086436170212766e-06, "loss": 1.4609, "step": 1810 }, { "epoch": 0.6050154785896823, "grad_norm": 0.1833077073097229, "learning_rate": 3.0252659574468088e-06, "loss": 1.5042, "step": 1820 }, { "epoch": 0.6083397394610542, "grad_norm": 0.16962528228759766, "learning_rate": 3.041888297872341e-06, "loss": 1.4651, "step": 1830 }, { "epoch": 0.6116640003324261, "grad_norm": 0.17829731106758118, "learning_rate": 3.0585106382978726e-06, "loss": 1.4907, "step": 1840 }, { "epoch": 0.614988261203798, "grad_norm": 0.16981306672096252, "learning_rate": 3.0751329787234042e-06, "loss": 1.4683, "step": 1850 }, { "epoch": 0.6183125220751698, "grad_norm": 0.20783671736717224, "learning_rate": 3.0917553191489363e-06, "loss": 1.463, "step": 1860 }, { "epoch": 0.6216367829465417, "grad_norm": 0.20343361794948578, "learning_rate": 3.108377659574468e-06, "loss": 1.4632, "step": 1870 }, { "epoch": 0.6249610438179136, "grad_norm": 0.18592675030231476, "learning_rate": 3.125e-06, "loss": 1.4887, "step": 1880 }, { "epoch": 0.6282853046892855, "grad_norm": 0.17272701859474182, "learning_rate": 3.141622340425532e-06, "loss": 1.4491, "step": 1890 }, { "epoch": 0.6316095655606574, "grad_norm": 0.2021792083978653, "learning_rate": 3.1582446808510644e-06, "loss": 1.4537, "step": 1900 }, { "epoch": 0.6349338264320292, "grad_norm": 0.16319766640663147, "learning_rate": 3.174867021276596e-06, "loss": 1.456, "step": 1910 }, { "epoch": 0.6382580873034012, "grad_norm": 0.2344328761100769, "learning_rate": 3.1914893617021277e-06, "loss": 1.4801, "step": 1920 }, { "epoch": 0.641582348174773, "grad_norm": 0.17495407164096832, "learning_rate": 3.20811170212766e-06, "loss": 1.4435, "step": 1930 }, { "epoch": 0.6449066090461449, "grad_norm": 0.19222399592399597, "learning_rate": 3.2247340425531915e-06, "loss": 1.4391, "step": 1940 }, { "epoch": 0.6482308699175168, "grad_norm": 0.24526530504226685, "learning_rate": 3.2413563829787232e-06, "loss": 1.4555, "step": 1950 }, { "epoch": 0.6515551307888886, "grad_norm": 0.18150673806667328, "learning_rate": 3.2579787234042558e-06, "loss": 1.4396, "step": 1960 }, { "epoch": 0.6548793916602605, "grad_norm": 0.18334811925888062, "learning_rate": 3.2746010638297875e-06, "loss": 1.4139, "step": 1970 }, { "epoch": 0.6582036525316324, "grad_norm": 0.25186312198638916, "learning_rate": 3.291223404255319e-06, "loss": 1.439, "step": 1980 }, { "epoch": 0.6615279134030043, "grad_norm": 0.16558600962162018, "learning_rate": 3.3078457446808513e-06, "loss": 1.4383, "step": 1990 }, { "epoch": 0.6648521742743762, "grad_norm": 0.2373538315296173, "learning_rate": 3.324468085106383e-06, "loss": 1.4334, "step": 2000 }, { "epoch": 0.6681764351457481, "grad_norm": 0.2821474075317383, "learning_rate": 3.3410904255319146e-06, "loss": 1.4418, "step": 2010 }, { "epoch": 0.6715006960171199, "grad_norm": 0.2443741410970688, "learning_rate": 3.357712765957447e-06, "loss": 1.4071, "step": 2020 }, { "epoch": 0.6748249568884919, "grad_norm": 0.17468735575675964, "learning_rate": 3.374335106382979e-06, "loss": 1.4109, "step": 2030 }, { "epoch": 0.6781492177598637, "grad_norm": 0.1655045598745346, "learning_rate": 3.3909574468085105e-06, "loss": 1.4049, "step": 2040 }, { "epoch": 0.6814734786312356, "grad_norm": 0.17598801851272583, "learning_rate": 3.4075797872340427e-06, "loss": 1.4188, "step": 2050 }, { "epoch": 0.6847977395026075, "grad_norm": 0.28528669476509094, "learning_rate": 3.4242021276595743e-06, "loss": 1.408, "step": 2060 }, { "epoch": 0.6881220003739793, "grad_norm": 0.17654620110988617, "learning_rate": 3.440824468085106e-06, "loss": 1.4117, "step": 2070 }, { "epoch": 0.6914462612453512, "grad_norm": 0.2636467516422272, "learning_rate": 3.4574468085106386e-06, "loss": 1.3947, "step": 2080 }, { "epoch": 0.6947705221167231, "grad_norm": 0.26495933532714844, "learning_rate": 3.4740691489361703e-06, "loss": 1.398, "step": 2090 }, { "epoch": 0.698094782988095, "grad_norm": 0.3873574435710907, "learning_rate": 3.490691489361702e-06, "loss": 1.4204, "step": 2100 }, { "epoch": 0.7014190438594669, "grad_norm": 0.327854186296463, "learning_rate": 3.5073138297872345e-06, "loss": 1.3744, "step": 2110 }, { "epoch": 0.7047433047308388, "grad_norm": 0.308570921421051, "learning_rate": 3.523936170212766e-06, "loss": 1.4293, "step": 2120 }, { "epoch": 0.7080675656022106, "grad_norm": 0.21123336255550385, "learning_rate": 3.5405585106382983e-06, "loss": 1.3878, "step": 2130 }, { "epoch": 0.7113918264735826, "grad_norm": 0.18777534365653992, "learning_rate": 3.55718085106383e-06, "loss": 1.3882, "step": 2140 }, { "epoch": 0.7147160873449544, "grad_norm": 0.2535350024700165, "learning_rate": 3.5738031914893617e-06, "loss": 1.3974, "step": 2150 }, { "epoch": 0.7180403482163262, "grad_norm": 0.15405435860157013, "learning_rate": 3.590425531914894e-06, "loss": 1.3853, "step": 2160 }, { "epoch": 0.7213646090876982, "grad_norm": 0.1863648146390915, "learning_rate": 3.607047872340426e-06, "loss": 1.3835, "step": 2170 }, { "epoch": 0.72468886995907, "grad_norm": 0.18587157130241394, "learning_rate": 3.6236702127659576e-06, "loss": 1.3711, "step": 2180 }, { "epoch": 0.728013130830442, "grad_norm": 0.18254730105400085, "learning_rate": 3.6402925531914897e-06, "loss": 1.3768, "step": 2190 }, { "epoch": 0.7313373917018138, "grad_norm": 0.21665969491004944, "learning_rate": 3.6569148936170214e-06, "loss": 1.3638, "step": 2200 }, { "epoch": 0.7346616525731857, "grad_norm": 0.15701924264431, "learning_rate": 3.673537234042553e-06, "loss": 1.3885, "step": 2210 }, { "epoch": 0.7379859134445576, "grad_norm": 0.19307725131511688, "learning_rate": 3.6901595744680856e-06, "loss": 1.3933, "step": 2220 }, { "epoch": 0.7413101743159295, "grad_norm": 0.16837100684642792, "learning_rate": 3.7067819148936173e-06, "loss": 1.3685, "step": 2230 }, { "epoch": 0.7446344351873013, "grad_norm": 0.2914402484893799, "learning_rate": 3.723404255319149e-06, "loss": 1.3802, "step": 2240 }, { "epoch": 0.7479586960586732, "grad_norm": 0.2770545184612274, "learning_rate": 3.7400265957446815e-06, "loss": 1.3575, "step": 2250 }, { "epoch": 0.7512829569300451, "grad_norm": 0.19819234311580658, "learning_rate": 3.756648936170213e-06, "loss": 1.3695, "step": 2260 }, { "epoch": 0.7546072178014169, "grad_norm": 0.15371359884738922, "learning_rate": 3.7732712765957445e-06, "loss": 1.3514, "step": 2270 }, { "epoch": 0.7579314786727889, "grad_norm": 0.26700448989868164, "learning_rate": 3.789893617021277e-06, "loss": 1.3689, "step": 2280 }, { "epoch": 0.7612557395441607, "grad_norm": 0.2938506007194519, "learning_rate": 3.8065159574468087e-06, "loss": 1.3518, "step": 2290 }, { "epoch": 0.7645800004155326, "grad_norm": 0.2514606714248657, "learning_rate": 3.823138297872341e-06, "loss": 1.3655, "step": 2300 }, { "epoch": 0.7679042612869045, "grad_norm": 0.2503184378147125, "learning_rate": 3.8397606382978725e-06, "loss": 1.3511, "step": 2310 }, { "epoch": 0.7712285221582764, "grad_norm": 0.1815042346715927, "learning_rate": 3.856382978723404e-06, "loss": 1.383, "step": 2320 }, { "epoch": 0.7745527830296483, "grad_norm": 0.25425419211387634, "learning_rate": 3.873005319148936e-06, "loss": 1.3354, "step": 2330 }, { "epoch": 0.7778770439010201, "grad_norm": 0.18466657400131226, "learning_rate": 3.889627659574468e-06, "loss": 1.3514, "step": 2340 }, { "epoch": 0.781201304772392, "grad_norm": 0.1782332807779312, "learning_rate": 3.90625e-06, "loss": 1.32, "step": 2350 }, { "epoch": 0.7845255656437639, "grad_norm": 0.27637991309165955, "learning_rate": 3.922872340425532e-06, "loss": 1.3383, "step": 2360 }, { "epoch": 0.7878498265151358, "grad_norm": 0.17314772307872772, "learning_rate": 3.939494680851064e-06, "loss": 1.3314, "step": 2370 }, { "epoch": 0.7911740873865076, "grad_norm": 0.3641667068004608, "learning_rate": 3.956117021276596e-06, "loss": 1.3543, "step": 2380 }, { "epoch": 0.7944983482578796, "grad_norm": 0.3088253438472748, "learning_rate": 3.972739361702128e-06, "loss": 1.3444, "step": 2390 }, { "epoch": 0.7978226091292514, "grad_norm": 0.25276973843574524, "learning_rate": 3.98936170212766e-06, "loss": 1.3102, "step": 2400 }, { "epoch": 0.8011468700006233, "grad_norm": 0.26414382457733154, "learning_rate": 4.005984042553192e-06, "loss": 1.3119, "step": 2410 }, { "epoch": 0.8044711308719952, "grad_norm": 0.1684638261795044, "learning_rate": 4.022606382978724e-06, "loss": 1.3204, "step": 2420 }, { "epoch": 0.807795391743367, "grad_norm": 0.18500946462154388, "learning_rate": 4.039228723404256e-06, "loss": 1.3251, "step": 2430 }, { "epoch": 0.811119652614739, "grad_norm": 0.2754835784435272, "learning_rate": 4.055851063829787e-06, "loss": 1.3258, "step": 2440 }, { "epoch": 0.8144439134861108, "grad_norm": 0.18949855864048004, "learning_rate": 4.072473404255319e-06, "loss": 1.3145, "step": 2450 }, { "epoch": 0.8177681743574827, "grad_norm": 0.6927218437194824, "learning_rate": 4.089095744680851e-06, "loss": 1.3205, "step": 2460 }, { "epoch": 0.8210924352288546, "grad_norm": 0.36098670959472656, "learning_rate": 4.105718085106383e-06, "loss": 1.3295, "step": 2470 }, { "epoch": 0.8244166961002265, "grad_norm": 0.25839686393737793, "learning_rate": 4.1223404255319146e-06, "loss": 1.321, "step": 2480 }, { "epoch": 0.8277409569715983, "grad_norm": 0.18720127642154694, "learning_rate": 4.138962765957447e-06, "loss": 1.2975, "step": 2490 }, { "epoch": 0.8310652178429703, "grad_norm": 0.17975495755672455, "learning_rate": 4.155585106382979e-06, "loss": 1.318, "step": 2500 } ], "logging_steps": 10, "max_steps": 150400, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.4430265344e+18, "train_batch_size": 5, "trial_name": null, "trial_params": null }