diff --git "a/trainer_state.json" "b/trainer_state.json" deleted file mode 100644--- "a/trainer_state.json" +++ /dev/null @@ -1,7642 +0,0 @@ -{ - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 0.6628735568690273, - "global_step": 12000, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.0, - "learning_rate": 6.226286401385679e-05, - "loss": 1.4854, - "step": 10 - }, - { - "epoch": 0.0, - "learning_rate": 8.100585369797515e-05, - "loss": 1.3991, - "step": 20 - }, - { - "epoch": 0.0, - "learning_rate": 9.196979981458785e-05, - "loss": 1.2649, - "step": 30 - }, - { - "epoch": 0.0, - "learning_rate": 9.97488433820935e-05, - "loss": 1.3398, - "step": 40 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001057827383435952, - "loss": 1.1817, - "step": 50 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001107127894987062, - "loss": 1.3592, - "step": 60 - }, - { - "epoch": 0.0, - "learning_rate": 0.00011488108835764138, - "loss": 1.1946, - "step": 70 - }, - { - "epoch": 0.0, - "learning_rate": 0.00011849183306621185, - "loss": 1.2433, - "step": 80 - }, - { - "epoch": 0.0, - "learning_rate": 0.00012167673561531892, - "loss": 1.2936, - "step": 90 - }, - { - "epoch": 0.01, - "learning_rate": 0.00012452572802771359, - "loss": 1.1955, - "step": 100 - }, - { - "epoch": 0.01, - "learning_rate": 0.00012710295515488854, - "loss": 1.3258, - "step": 110 - }, - { - "epoch": 0.01, - "learning_rate": 0.00012945577918282457, - "loss": 1.2087, - "step": 120 - }, - { - "epoch": 0.01, - "learning_rate": 0.00013162016747767712, - "loss": 1.2059, - "step": 130 - }, - { - "epoch": 0.01, - "learning_rate": 0.00013362407804175968, - "loss": 1.2616, - "step": 140 - }, - { - "epoch": 0.01, - "learning_rate": 0.00013548967414432628, - "loss": 1.1475, - "step": 150 - }, - { - "epoch": 0.01, - "learning_rate": 0.0001372348227503302, - "loss": 1.3272, - "step": 160 - }, - { - "epoch": 0.01, - "learning_rate": 0.000138874137881629, - "loss": 1.2506, - "step": 170 - }, - { - "epoch": 0.01, - "learning_rate": 0.0001404197252994373, - "loss": 1.2071, - "step": 180 - }, - { - "epoch": 0.01, - "learning_rate": 0.00014188172557721247, - "loss": 1.3089, - "step": 190 - }, - { - "epoch": 0.01, - "learning_rate": 0.0001432687177118319, - "loss": 1.1428, - "step": 200 - }, - { - "epoch": 0.01, - "learning_rate": 0.00014458802415837242, - "loss": 1.2244, - "step": 210 - }, - { - "epoch": 0.01, - "learning_rate": 0.0001458459448390069, - "loss": 1.1951, - "step": 220 - }, - { - "epoch": 0.01, - "learning_rate": 0.00014704793909170365, - "loss": 1.215, - "step": 230 - }, - { - "epoch": 0.01, - "learning_rate": 0.00014819876886694292, - "loss": 1.2766, - "step": 240 - }, - { - "epoch": 0.01, - "learning_rate": 0.00014930261267333362, - "loss": 1.1436, - "step": 250 - }, - { - "epoch": 0.01, - "learning_rate": 0.00015036315716179547, - "loss": 1.2817, - "step": 260 - }, - { - "epoch": 0.01, - "learning_rate": 0.00015138367141604998, - "loss": 1.2022, - "step": 270 - }, - { - "epoch": 0.02, - "learning_rate": 0.00015236706772587806, - "loss": 1.2149, - "step": 280 - }, - { - "epoch": 0.02, - "learning_rate": 0.0001533159516911759, - "loss": 1.2048, - "step": 290 - }, - { - "epoch": 0.02, - "learning_rate": 0.00015423266382844463, - "loss": 1.1425, - "step": 300 - }, - { - "epoch": 0.02, - "learning_rate": 0.00015511931435253522, - "loss": 1.2738, - "step": 310 - }, - { - "epoch": 0.02, - "learning_rate": 0.0001559778124344486, - "loss": 1.1905, - "step": 320 - }, - { - "epoch": 0.02, - "learning_rate": 0.00015680989095561958, - "loss": 1.1633, - "step": 330 - }, - { - "epoch": 0.02, - "learning_rate": 0.00015761712756574739, - "loss": 1.2421, - "step": 340 - }, - { - "epoch": 0.02, - "learning_rate": 0.00015840096268737977, - "loss": 1.144, - "step": 350 - }, - { - "epoch": 0.02, - "learning_rate": 0.00015916271498355564, - "loss": 1.2408, - "step": 360 - }, - { - "epoch": 0.02, - "learning_rate": 0.00015990359470573583, - "loss": 1.2195, - "step": 370 - }, - { - "epoch": 0.02, - "learning_rate": 0.00016062471526133082, - "loss": 1.2085, - "step": 380 - }, - { - "epoch": 0.02, - "learning_rate": 0.0001613271032784082, - "loss": 1.243, - "step": 390 - }, - { - "epoch": 0.02, - "learning_rate": 0.0001620117073959503, - "loss": 1.1044, - "step": 400 - }, - { - "epoch": 0.02, - "learning_rate": 0.00016267940596854117, - "loss": 1.2442, - "step": 410 - }, - { - "epoch": 0.02, - "learning_rate": 0.00016333101384249078, - "loss": 1.1882, - "step": 420 - }, - { - "epoch": 0.02, - "learning_rate": 0.00016396728833453322, - "loss": 1.1486, - "step": 430 - }, - { - "epoch": 0.02, - "learning_rate": 0.00016458893452312522, - "loss": 1.2567, - "step": 440 - }, - { - "epoch": 0.02, - "learning_rate": 0.00016519660994505735, - "loss": 1.164, - "step": 450 - }, - { - "epoch": 0.03, - "learning_rate": 0.000165790928775822, - "loss": 1.2633, - "step": 460 - }, - { - "epoch": 0.03, - "learning_rate": 0.00016637246556036958, - "loss": 1.1913, - "step": 470 - }, - { - "epoch": 0.03, - "learning_rate": 0.00016694175855106128, - "loss": 1.1477, - "step": 480 - }, - { - "epoch": 0.03, - "learning_rate": 0.00016749931270142591, - "loss": 1.2731, - "step": 490 - }, - { - "epoch": 0.03, - "learning_rate": 0.000168045602357452, - "loss": 1.1511, - "step": 500 - }, - { - "epoch": 0.03, - "learning_rate": 0.00016858107368236008, - "loss": 1.2301, - "step": 510 - }, - { - "epoch": 0.03, - "learning_rate": 0.00016910614684591385, - "loss": 1.1907, - "step": 520 - }, - { - "epoch": 0.03, - "learning_rate": 0.00016962121800518536, - "loss": 1.1612, - "step": 530 - }, - { - "epoch": 0.03, - "learning_rate": 0.00017012666110016836, - "loss": 1.2319, - "step": 540 - }, - { - "epoch": 0.03, - "learning_rate": 0.00017062282948462692, - "loss": 1.1193, - "step": 550 - }, - { - "epoch": 0.03, - "learning_rate": 0.00017111005740999641, - "loss": 1.2784, - "step": 560 - }, - { - "epoch": 0.03, - "learning_rate": 0.00017158866137794354, - "loss": 1.1673, - "step": 570 - }, - { - "epoch": 0.03, - "learning_rate": 0.00017205894137529426, - "loss": 1.1765, - "step": 580 - }, - { - "epoch": 0.03, - "learning_rate": 0.00017252118200339632, - "loss": 1.2579, - "step": 590 - }, - { - "epoch": 0.03, - "learning_rate": 0.00017297565351256298, - "loss": 1.1393, - "step": 600 - }, - { - "epoch": 0.03, - "learning_rate": 0.00017342261275101354, - "loss": 1.2666, - "step": 610 - }, - { - "epoch": 0.03, - "learning_rate": 0.00017386230403665357, - "loss": 1.2035, - "step": 620 - }, - { - "epoch": 0.03, - "learning_rate": 0.0001742949599591035, - "loss": 1.1859, - "step": 630 - }, - { - "epoch": 0.04, - "learning_rate": 0.00017472080211856694, - "loss": 1.2394, - "step": 640 - }, - { - "epoch": 0.04, - "learning_rate": 0.00017514004180741556, - "loss": 1.1103, - "step": 650 - }, - { - "epoch": 0.04, - "learning_rate": 0.00017555288063973793, - "loss": 1.1979, - "step": 660 - }, - { - "epoch": 0.04, - "learning_rate": 0.00017595951113354868, - "loss": 1.1613, - "step": 670 - }, - { - "epoch": 0.04, - "learning_rate": 0.0001763601172498657, - "loss": 1.1612, - "step": 680 - }, - { - "epoch": 0.04, - "learning_rate": 0.0001767548748924347, - "loss": 1.2676, - "step": 690 - }, - { - "epoch": 0.04, - "learning_rate": 0.00017714395237149812, - "loss": 1.1264, - "step": 700 - }, - { - "epoch": 0.04, - "learning_rate": 0.00017752751083466962, - "loss": 1.2567, - "step": 710 - }, - { - "epoch": 0.04, - "learning_rate": 0.000177905704667674, - "loss": 1.1648, - "step": 720 - }, - { - "epoch": 0.04, - "learning_rate": 0.0001782786818674474, - "loss": 1.192, - "step": 730 - }, - { - "epoch": 0.04, - "learning_rate": 0.00017864658438985419, - "loss": 1.2161, - "step": 740 - }, - { - "epoch": 0.04, - "learning_rate": 0.00017900954847406472, - "loss": 1.1022, - "step": 750 - }, - { - "epoch": 0.04, - "learning_rate": 0.0001793677049454492, - "loss": 1.3062, - "step": 760 - }, - { - "epoch": 0.04, - "learning_rate": 0.0001797211794986731, - "loss": 1.1351, - "step": 770 - }, - { - "epoch": 0.04, - "learning_rate": 0.00018007009296252654, - "loss": 1.209, - "step": 780 - }, - { - "epoch": 0.04, - "learning_rate": 0.00018041456154788414, - "loss": 1.2362, - "step": 790 - }, - { - "epoch": 0.04, - "learning_rate": 0.00018075469708006865, - "loss": 1.1302, - "step": 800 - }, - { - "epoch": 0.04, - "learning_rate": 0.00018109060721678105, - "loss": 1.2922, - "step": 810 - }, - { - "epoch": 0.05, - "learning_rate": 0.0001814223956526595, - "loss": 1.1519, - "step": 820 - }, - { - "epoch": 0.05, - "learning_rate": 0.00018175016231143995, - "loss": 1.1629, - "step": 830 - }, - { - "epoch": 0.05, - "learning_rate": 0.00018207400352660916, - "loss": 1.2504, - "step": 840 - }, - { - "epoch": 0.05, - "learning_rate": 0.00018239401221136744, - "loss": 1.1551, - "step": 850 - }, - { - "epoch": 0.05, - "learning_rate": 0.0001827102780186516, - "loss": 1.2317, - "step": 860 - }, - { - "epoch": 0.05, - "learning_rate": 0.00018302288749190697, - "loss": 1.1559, - "step": 870 - }, - { - "epoch": 0.05, - "learning_rate": 0.0001833319242072436, - "loss": 1.2097, - "step": 880 - }, - { - "epoch": 0.05, - "learning_rate": 0.00018363746890756038, - "loss": 1.228, - "step": 890 - }, - { - "epoch": 0.05, - "learning_rate": 0.0001839395996291757, - "loss": 1.1432, - "step": 900 - }, - { - "epoch": 0.05, - "learning_rate": 0.0001842383918214617, - "loss": 1.1888, - "step": 910 - }, - { - "epoch": 0.05, - "learning_rate": 0.00018453391845994036, - "loss": 1.1958, - "step": 920 - }, - { - "epoch": 0.05, - "learning_rate": 0.00018482625015326626, - "loss": 1.148, - "step": 930 - }, - { - "epoch": 0.05, - "learning_rate": 0.00018511545524448793, - "loss": 1.242, - "step": 940 - }, - { - "epoch": 0.05, - "learning_rate": 0.0001854015999069509, - "loss": 1.1326, - "step": 950 - }, - { - "epoch": 0.05, - "learning_rate": 0.00018568474823517966, - "loss": 1.2941, - "step": 960 - }, - { - "epoch": 0.05, - "learning_rate": 0.0001859649623310504, - "loss": 1.1983, - "step": 970 - }, - { - "epoch": 0.05, - "learning_rate": 0.00018624230238554427, - "loss": 1.156, - "step": 980 - }, - { - "epoch": 0.05, - "learning_rate": 0.00018651682675635065, - "loss": 1.2216, - "step": 990 - }, - { - "epoch": 0.06, - "learning_rate": 0.00018678859204157035, - "loss": 1.0838, - "step": 1000 - }, - { - "epoch": 0.06, - "learning_rate": 0.00018705765314975205, - "loss": 1.2639, - "step": 1010 - }, - { - "epoch": 0.06, - "learning_rate": 0.00018732406336647843, - "loss": 1.1877, - "step": 1020 - }, - { - "epoch": 0.06, - "learning_rate": 0.00018758787441770382, - "loss": 1.1674, - "step": 1030 - }, - { - "epoch": 0.06, - "learning_rate": 0.0001878491365300322, - "loss": 1.1946, - "step": 1040 - }, - { - "epoch": 0.06, - "learning_rate": 0.00018810789848811086, - "loss": 1.121, - "step": 1050 - }, - { - "epoch": 0.06, - "learning_rate": 0.00018836420768930372, - "loss": 1.2559, - "step": 1060 - }, - { - "epoch": 0.06, - "learning_rate": 0.00018861811019579795, - "loss": 1.123, - "step": 1070 - }, - { - "epoch": 0.06, - "learning_rate": 0.0001888696507842867, - "loss": 1.1994, - "step": 1080 - }, - { - "epoch": 0.06, - "learning_rate": 0.00018911887299336228, - "loss": 1.2116, - "step": 1090 - }, - { - "epoch": 0.06, - "learning_rate": 0.0001893658191687453, - "loss": 1.0746, - "step": 1100 - }, - { - "epoch": 0.06, - "learning_rate": 0.0001896105305064669, - "loss": 1.2671, - "step": 1110 - }, - { - "epoch": 0.06, - "learning_rate": 0.00018985304709411477, - "loss": 1.1409, - "step": 1120 - }, - { - "epoch": 0.06, - "learning_rate": 0.00019009340795024567, - "loss": 1.1153, - "step": 1130 - }, - { - "epoch": 0.06, - "learning_rate": 0.0001903316510620619, - "loss": 1.2162, - "step": 1140 - }, - { - "epoch": 0.06, - "learning_rate": 0.00019056781342144207, - "loss": 1.103, - "step": 1150 - }, - { - "epoch": 0.06, - "learning_rate": 0.0001908019310594126, - "loss": 1.2478, - "step": 1160 - }, - { - "epoch": 0.06, - "learning_rate": 0.00019103403907913926, - "loss": 1.1723, - "step": 1170 - }, - { - "epoch": 0.07, - "learning_rate": 0.0001912641716875147, - "loss": 1.1992, - "step": 1180 - }, - { - "epoch": 0.07, - "learning_rate": 0.0001914923622254136, - "loss": 1.2087, - "step": 1190 - }, - { - "epoch": 0.07, - "learning_rate": 0.00019171864319668136, - "loss": 1.1551, - "step": 1200 - }, - { - "epoch": 0.07, - "learning_rate": 0.00019194304629592022, - "loss": 1.2317, - "step": 1210 - }, - { - "epoch": 0.07, - "learning_rate": 0.00019216560243513192, - "loss": 1.1294, - "step": 1220 - }, - { - "epoch": 0.07, - "learning_rate": 0.0001923863417692722, - "loss": 1.1429, - "step": 1230 - }, - { - "epoch": 0.07, - "learning_rate": 0.00019260529372077195, - "loss": 1.2404, - "step": 1240 - }, - { - "epoch": 0.07, - "learning_rate": 0.00019282248700307206, - "loss": 1.1336, - "step": 1250 - }, - { - "epoch": 0.07, - "learning_rate": 0.00019303794964322185, - "loss": 1.2671, - "step": 1260 - }, - { - "epoch": 0.07, - "learning_rate": 0.00019325170900358343, - "loss": 1.1229, - "step": 1270 - }, - { - "epoch": 0.07, - "learning_rate": 0.0001934637918026853, - "loss": 1.1335, - "step": 1280 - }, - { - "epoch": 0.07, - "learning_rate": 0.0001936742241352643, - "loss": 1.2139, - "step": 1290 - }, - { - "epoch": 0.07, - "learning_rate": 0.0001938830314915339, - "loss": 1.0886, - "step": 1300 - }, - { - "epoch": 0.07, - "learning_rate": 0.000194090238775714, - "loss": 1.2325, - "step": 1310 - }, - { - "epoch": 0.07, - "learning_rate": 0.0001942958703238563, - "loss": 1.1449, - "step": 1320 - }, - { - "epoch": 0.07, - "learning_rate": 0.00019449994992099703, - "loss": 1.1834, - "step": 1330 - }, - { - "epoch": 0.07, - "learning_rate": 0.00019470250081766706, - "loss": 1.2746, - "step": 1340 - }, - { - "epoch": 0.07, - "learning_rate": 0.00019490354574578842, - "loss": 1.103, - "step": 1350 - }, - { - "epoch": 0.08, - "learning_rate": 0.0001951031069339841, - "loss": 1.2797, - "step": 1360 - }, - { - "epoch": 0.08, - "learning_rate": 0.0001953012061223271, - "loss": 1.1534, - "step": 1370 - }, - { - "epoch": 0.08, - "learning_rate": 0.00019549786457655305, - "loss": 1.1639, - "step": 1380 - }, - { - "epoch": 0.08, - "learning_rate": 0.00019569310310175997, - "loss": 1.2115, - "step": 1390 - }, - { - "epoch": 0.08, - "learning_rate": 0.00019588694205561647, - "loss": 1.0515, - "step": 1400 - }, - { - "epoch": 0.08, - "learning_rate": 0.00019607940136110065, - "loss": 1.2054, - "step": 1410 - }, - { - "epoch": 0.08, - "learning_rate": 0.000196270500518788, - "loss": 1.1422, - "step": 1420 - }, - { - "epoch": 0.08, - "learning_rate": 0.00019646025861870886, - "loss": 1.1391, - "step": 1430 - }, - { - "epoch": 0.08, - "learning_rate": 0.00019664869435179235, - "loss": 1.2412, - "step": 1440 - }, - { - "epoch": 0.08, - "learning_rate": 0.00019683582602091432, - "loss": 1.1085, - "step": 1450 - }, - { - "epoch": 0.08, - "learning_rate": 0.00019702167155156576, - "loss": 1.2386, - "step": 1460 - }, - { - "epoch": 0.08, - "learning_rate": 0.00019720624850215698, - "loss": 1.1997, - "step": 1470 - }, - { - "epoch": 0.08, - "learning_rate": 0.00019738957407397257, - "loss": 1.1984, - "step": 1480 - }, - { - "epoch": 0.08, - "learning_rate": 0.00019757166512079108, - "loss": 1.2307, - "step": 1490 - }, - { - "epoch": 0.08, - "learning_rate": 0.00019775253815818307, - "loss": 1.1343, - "step": 1500 - }, - { - "epoch": 0.08, - "learning_rate": 0.0001979322093725002, - "loss": 1.2189, - "step": 1510 - }, - { - "epoch": 0.08, - "learning_rate": 0.00019811069462956756, - "loss": 1.1581, - "step": 1520 - }, - { - "epoch": 0.08, - "learning_rate": 0.00019828800948309115, - "loss": 1.1699, - "step": 1530 - }, - { - "epoch": 0.09, - "learning_rate": 0.00019846416918279145, - "loss": 1.2407, - "step": 1540 - }, - { - "epoch": 0.09, - "learning_rate": 0.00019863918868227366, - "loss": 1.0669, - "step": 1550 - }, - { - "epoch": 0.09, - "learning_rate": 0.00019881308264664492, - "loss": 1.2039, - "step": 1560 - }, - { - "epoch": 0.09, - "learning_rate": 0.0001989858654598883, - "loss": 1.1691, - "step": 1570 - }, - { - "epoch": 0.09, - "learning_rate": 0.0001991575512320025, - "loss": 1.1569, - "step": 1580 - }, - { - "epoch": 0.09, - "learning_rate": 0.00019932815380591643, - "loss": 1.2092, - "step": 1590 - }, - { - "epoch": 0.09, - "learning_rate": 0.000199497686764187, - "loss": 1.128, - "step": 1600 - }, - { - "epoch": 0.09, - "learning_rate": 0.00019966616343548824, - "loss": 1.2002, - "step": 1610 - }, - { - "epoch": 0.09, - "learning_rate": 0.0001998335969008994, - "loss": 1.0644, - "step": 1620 - }, - { - "epoch": 0.09, - "learning_rate": 0.0002, - "loss": 1.1275, - "step": 1630 - }, - { - "epoch": 0.09, - "learning_rate": 0.0001999658307864614, - "loss": 1.1783, - "step": 1640 - }, - { - "epoch": 0.09, - "learning_rate": 0.00019992786499364073, - "loss": 1.1262, - "step": 1650 - }, - { - "epoch": 0.09, - "learning_rate": 0.00019988989920082006, - "loss": 1.2625, - "step": 1660 - }, - { - "epoch": 0.09, - "learning_rate": 0.00019985193340799938, - "loss": 1.1807, - "step": 1670 - }, - { - "epoch": 0.09, - "learning_rate": 0.00019981396761517873, - "loss": 1.1256, - "step": 1680 - }, - { - "epoch": 0.09, - "learning_rate": 0.00019977600182235808, - "loss": 1.2082, - "step": 1690 - }, - { - "epoch": 0.09, - "learning_rate": 0.0001997380360295374, - "loss": 1.0813, - "step": 1700 - }, - { - "epoch": 0.09, - "learning_rate": 0.00019970007023671673, - "loss": 1.1981, - "step": 1710 - }, - { - "epoch": 0.1, - "learning_rate": 0.00019966210444389605, - "loss": 1.1224, - "step": 1720 - }, - { - "epoch": 0.1, - "learning_rate": 0.00019962413865107537, - "loss": 1.1375, - "step": 1730 - }, - { - "epoch": 0.1, - "learning_rate": 0.00019958617285825472, - "loss": 1.1768, - "step": 1740 - }, - { - "epoch": 0.1, - "learning_rate": 0.00019954820706543405, - "loss": 1.097, - "step": 1750 - }, - { - "epoch": 0.1, - "learning_rate": 0.00019951024127261337, - "loss": 1.2163, - "step": 1760 - }, - { - "epoch": 0.1, - "learning_rate": 0.00019947227547979272, - "loss": 1.1339, - "step": 1770 - }, - { - "epoch": 0.1, - "learning_rate": 0.00019943430968697205, - "loss": 1.1616, - "step": 1780 - }, - { - "epoch": 0.1, - "learning_rate": 0.0001993963438941514, - "loss": 1.2019, - "step": 1790 - }, - { - "epoch": 0.1, - "learning_rate": 0.00019935837810133072, - "loss": 1.087, - "step": 1800 - }, - { - "epoch": 0.1, - "learning_rate": 0.00019932041230851004, - "loss": 1.2352, - "step": 1810 - }, - { - "epoch": 0.1, - "learning_rate": 0.00019928244651568937, - "loss": 1.1305, - "step": 1820 - }, - { - "epoch": 0.1, - "learning_rate": 0.0001992444807228687, - "loss": 1.126, - "step": 1830 - }, - { - "epoch": 0.1, - "learning_rate": 0.00019920651493004804, - "loss": 1.2041, - "step": 1840 - }, - { - "epoch": 0.1, - "learning_rate": 0.0001991685491372274, - "loss": 1.0549, - "step": 1850 - }, - { - "epoch": 0.1, - "learning_rate": 0.00019913058334440671, - "loss": 1.2373, - "step": 1860 - }, - { - "epoch": 0.1, - "learning_rate": 0.00019909261755158604, - "loss": 1.1491, - "step": 1870 - }, - { - "epoch": 0.1, - "learning_rate": 0.00019905465175876536, - "loss": 1.1715, - "step": 1880 - }, - { - "epoch": 0.1, - "learning_rate": 0.00019901668596594468, - "loss": 1.2384, - "step": 1890 - }, - { - "epoch": 0.1, - "learning_rate": 0.00019897872017312404, - "loss": 1.0732, - "step": 1900 - }, - { - "epoch": 0.11, - "learning_rate": 0.00019894075438030336, - "loss": 1.2436, - "step": 1910 - }, - { - "epoch": 0.11, - "learning_rate": 0.00019890278858748268, - "loss": 1.1645, - "step": 1920 - }, - { - "epoch": 0.11, - "learning_rate": 0.000198864822794662, - "loss": 1.123, - "step": 1930 - }, - { - "epoch": 0.11, - "learning_rate": 0.00019882685700184136, - "loss": 1.2387, - "step": 1940 - }, - { - "epoch": 0.11, - "learning_rate": 0.00019878889120902068, - "loss": 1.0858, - "step": 1950 - }, - { - "epoch": 0.11, - "learning_rate": 0.00019875092541620003, - "loss": 1.2401, - "step": 1960 - }, - { - "epoch": 0.11, - "learning_rate": 0.00019871295962337935, - "loss": 1.1272, - "step": 1970 - }, - { - "epoch": 0.11, - "learning_rate": 0.00019867499383055868, - "loss": 1.1403, - "step": 1980 - }, - { - "epoch": 0.11, - "learning_rate": 0.000198637028037738, - "loss": 1.157, - "step": 1990 - }, - { - "epoch": 0.11, - "learning_rate": 0.00019859906224491732, - "loss": 1.0744, - "step": 2000 - }, - { - "epoch": 0.11, - "eval_loss": 1.1491358280181885, - "eval_runtime": 1.9664, - "eval_samples_per_second": 50.854, - "eval_steps_per_second": 6.611, - "step": 2000 - }, - { - "epoch": 0.11, - "mmlu_eval_accuracy": NaN, - "mmlu_eval_accuracy_abstract_algebra": 0.5454545454545454, - "mmlu_eval_accuracy_anatomy": 0.21428571428571427, - "mmlu_eval_accuracy_astronomy": 0.125, - "mmlu_eval_accuracy_business_ethics": 0.45454545454545453, - "mmlu_eval_accuracy_clinical_knowledge": 0.4827586206896552, - "mmlu_eval_accuracy_college_biology": 0.75, - "mmlu_eval_accuracy_college_chemistry": 1.0, - "mmlu_eval_accuracy_college_computer_science": 0.6363636363636364, - "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, - "mmlu_eval_accuracy_college_medicine": 0.45454545454545453, - "mmlu_eval_accuracy_college_physics": 0.45454545454545453, - "mmlu_eval_accuracy_computer_security": 0.45454545454545453, - "mmlu_eval_accuracy_conceptual_physics": 0.42857142857142855, - "mmlu_eval_accuracy_econometrics": NaN, - "mmlu_eval_accuracy_electrical_engineering": NaN, - "mmlu_eval_accuracy_elementary_mathematics": NaN, - "mmlu_eval_accuracy_formal_logic": NaN, - "mmlu_eval_accuracy_global_facts": NaN, - "mmlu_eval_accuracy_high_school_biology": NaN, - "mmlu_eval_accuracy_high_school_chemistry": NaN, - "mmlu_eval_accuracy_high_school_computer_science": NaN, - "mmlu_eval_accuracy_high_school_european_history": NaN, - "mmlu_eval_accuracy_high_school_geography": NaN, - "mmlu_eval_accuracy_high_school_government_and_politics": NaN, - "mmlu_eval_accuracy_high_school_macroeconomics": NaN, - "mmlu_eval_accuracy_high_school_mathematics": NaN, - "mmlu_eval_accuracy_high_school_microeconomics": NaN, - "mmlu_eval_accuracy_high_school_physics": NaN, - "mmlu_eval_accuracy_high_school_psychology": NaN, - "mmlu_eval_accuracy_high_school_statistics": NaN, - "mmlu_eval_accuracy_high_school_us_history": NaN, - "mmlu_eval_accuracy_high_school_world_history": NaN, - "mmlu_eval_accuracy_human_aging": NaN, - "mmlu_eval_accuracy_human_sexuality": NaN, - "mmlu_eval_accuracy_international_law": NaN, - "mmlu_eval_accuracy_jurisprudence": NaN, - "mmlu_eval_accuracy_logical_fallacies": NaN, - "mmlu_eval_accuracy_machine_learning": NaN, - "mmlu_eval_accuracy_management": NaN, - "mmlu_eval_accuracy_marketing": NaN, - "mmlu_eval_accuracy_medical_genetics": NaN, - "mmlu_eval_accuracy_miscellaneous": NaN, - "mmlu_eval_accuracy_moral_disputes": NaN, - "mmlu_eval_accuracy_moral_scenarios": NaN, - "mmlu_eval_accuracy_nutrition": NaN, - "mmlu_eval_accuracy_philosophy": NaN, - "mmlu_eval_accuracy_prehistory": NaN, - "mmlu_eval_accuracy_professional_accounting": NaN, - "mmlu_eval_accuracy_professional_law": NaN, - "mmlu_eval_accuracy_professional_medicine": NaN, - "mmlu_eval_accuracy_professional_psychology": NaN, - "mmlu_eval_accuracy_public_relations": NaN, - "mmlu_eval_accuracy_security_studies": NaN, - "mmlu_eval_accuracy_sociology": NaN, - "mmlu_eval_accuracy_us_foreign_policy": NaN, - "mmlu_eval_accuracy_virology": NaN, - "mmlu_eval_accuracy_world_religions": NaN, - "mmlu_loss": 1.007582182297483, - "step": 2000 - }, - { - "epoch": 0.11, - "learning_rate": 0.00019856109645209667, - "loss": 1.2053, - "step": 2010 - }, - { - "epoch": 0.11, - "learning_rate": 0.000198523130659276, - "loss": 1.1405, - "step": 2020 - }, - { - "epoch": 0.11, - "learning_rate": 0.00019848516486645535, - "loss": 1.1468, - "step": 2030 - }, - { - "epoch": 0.11, - "learning_rate": 0.00019844719907363467, - "loss": 1.2189, - "step": 2040 - }, - { - "epoch": 0.11, - "learning_rate": 0.000198409233280814, - "loss": 1.0986, - "step": 2050 - }, - { - "epoch": 0.11, - "learning_rate": 0.00019837126748799332, - "loss": 1.1598, - "step": 2060 - }, - { - "epoch": 0.11, - "learning_rate": 0.00019833330169517267, - "loss": 1.1875, - "step": 2070 - }, - { - "epoch": 0.11, - "learning_rate": 0.000198295335902352, - "loss": 1.1328, - "step": 2080 - }, - { - "epoch": 0.12, - "learning_rate": 0.00019825737010953132, - "loss": 1.1765, - "step": 2090 - }, - { - "epoch": 0.12, - "learning_rate": 0.00019821940431671064, - "loss": 1.0558, - "step": 2100 - }, - { - "epoch": 0.12, - "learning_rate": 0.00019818143852389, - "loss": 1.2161, - "step": 2110 - }, - { - "epoch": 0.12, - "learning_rate": 0.0001981434727310693, - "loss": 1.149, - "step": 2120 - }, - { - "epoch": 0.12, - "learning_rate": 0.00019810550693824866, - "loss": 1.1905, - "step": 2130 - }, - { - "epoch": 0.12, - "learning_rate": 0.000198067541145428, - "loss": 1.2479, - "step": 2140 - }, - { - "epoch": 0.12, - "learning_rate": 0.0001980295753526073, - "loss": 1.076, - "step": 2150 - }, - { - "epoch": 0.12, - "learning_rate": 0.00019799160955978663, - "loss": 1.2602, - "step": 2160 - }, - { - "epoch": 0.12, - "learning_rate": 0.00019795364376696596, - "loss": 1.1454, - "step": 2170 - }, - { - "epoch": 0.12, - "learning_rate": 0.0001979156779741453, - "loss": 1.1334, - "step": 2180 - }, - { - "epoch": 0.12, - "learning_rate": 0.00019787771218132463, - "loss": 1.1795, - "step": 2190 - }, - { - "epoch": 0.12, - "learning_rate": 0.00019783974638850398, - "loss": 1.1137, - "step": 2200 - }, - { - "epoch": 0.12, - "learning_rate": 0.0001978017805956833, - "loss": 1.2005, - "step": 2210 - }, - { - "epoch": 0.12, - "learning_rate": 0.00019776381480286263, - "loss": 1.1376, - "step": 2220 - }, - { - "epoch": 0.12, - "learning_rate": 0.00019772584901004195, - "loss": 1.166, - "step": 2230 - }, - { - "epoch": 0.12, - "learning_rate": 0.0001976878832172213, - "loss": 1.192, - "step": 2240 - }, - { - "epoch": 0.12, - "learning_rate": 0.00019764991742440063, - "loss": 1.0876, - "step": 2250 - }, - { - "epoch": 0.12, - "learning_rate": 0.00019761195163157995, - "loss": 1.2518, - "step": 2260 - }, - { - "epoch": 0.13, - "learning_rate": 0.00019757398583875927, - "loss": 1.196, - "step": 2270 - }, - { - "epoch": 0.13, - "learning_rate": 0.0001975360200459386, - "loss": 1.2188, - "step": 2280 - }, - { - "epoch": 0.13, - "learning_rate": 0.00019749805425311795, - "loss": 1.2075, - "step": 2290 - }, - { - "epoch": 0.13, - "learning_rate": 0.0001974600884602973, - "loss": 1.0276, - "step": 2300 - }, - { - "epoch": 0.13, - "learning_rate": 0.00019742212266747662, - "loss": 1.227, - "step": 2310 - }, - { - "epoch": 0.13, - "learning_rate": 0.00019738415687465594, - "loss": 1.1681, - "step": 2320 - }, - { - "epoch": 0.13, - "learning_rate": 0.00019734619108183527, - "loss": 1.1402, - "step": 2330 - }, - { - "epoch": 0.13, - "learning_rate": 0.0001973082252890146, - "loss": 1.1977, - "step": 2340 - }, - { - "epoch": 0.13, - "learning_rate": 0.00019727025949619394, - "loss": 1.0541, - "step": 2350 - }, - { - "epoch": 0.13, - "learning_rate": 0.00019723229370337327, - "loss": 1.2152, - "step": 2360 - }, - { - "epoch": 0.13, - "learning_rate": 0.00019719432791055262, - "loss": 1.1588, - "step": 2370 - }, - { - "epoch": 0.13, - "learning_rate": 0.00019715636211773194, - "loss": 1.0637, - "step": 2380 - }, - { - "epoch": 0.13, - "learning_rate": 0.00019711839632491126, - "loss": 1.2027, - "step": 2390 - }, - { - "epoch": 0.13, - "learning_rate": 0.0001970804305320906, - "loss": 1.1089, - "step": 2400 - }, - { - "epoch": 0.13, - "learning_rate": 0.00019704246473926994, - "loss": 1.2193, - "step": 2410 - }, - { - "epoch": 0.13, - "learning_rate": 0.00019700449894644926, - "loss": 1.1809, - "step": 2420 - }, - { - "epoch": 0.13, - "learning_rate": 0.00019696653315362858, - "loss": 1.1154, - "step": 2430 - }, - { - "epoch": 0.13, - "learning_rate": 0.0001969285673608079, - "loss": 1.1509, - "step": 2440 - }, - { - "epoch": 0.14, - "learning_rate": 0.00019689060156798723, - "loss": 1.1225, - "step": 2450 - }, - { - "epoch": 0.14, - "learning_rate": 0.00019685263577516658, - "loss": 1.246, - "step": 2460 - }, - { - "epoch": 0.14, - "learning_rate": 0.00019681466998234593, - "loss": 1.1565, - "step": 2470 - }, - { - "epoch": 0.14, - "learning_rate": 0.00019677670418952526, - "loss": 1.1319, - "step": 2480 - }, - { - "epoch": 0.14, - "learning_rate": 0.00019673873839670458, - "loss": 1.1855, - "step": 2490 - }, - { - "epoch": 0.14, - "learning_rate": 0.0001967007726038839, - "loss": 1.142, - "step": 2500 - }, - { - "epoch": 0.14, - "learning_rate": 0.00019666280681106323, - "loss": 1.2383, - "step": 2510 - }, - { - "epoch": 0.14, - "learning_rate": 0.00019662484101824258, - "loss": 1.1175, - "step": 2520 - }, - { - "epoch": 0.14, - "learning_rate": 0.0001965868752254219, - "loss": 1.1014, - "step": 2530 - }, - { - "epoch": 0.14, - "learning_rate": 0.00019654890943260122, - "loss": 1.1959, - "step": 2540 - }, - { - "epoch": 0.14, - "learning_rate": 0.00019651094363978057, - "loss": 1.0617, - "step": 2550 - }, - { - "epoch": 0.14, - "learning_rate": 0.0001964729778469599, - "loss": 1.2567, - "step": 2560 - }, - { - "epoch": 0.14, - "learning_rate": 0.00019643501205413922, - "loss": 1.1222, - "step": 2570 - }, - { - "epoch": 0.14, - "learning_rate": 0.00019639704626131857, - "loss": 1.1147, - "step": 2580 - }, - { - "epoch": 0.14, - "learning_rate": 0.0001963590804684979, - "loss": 1.2296, - "step": 2590 - }, - { - "epoch": 0.14, - "learning_rate": 0.00019632111467567722, - "loss": 1.094, - "step": 2600 - }, - { - "epoch": 0.14, - "learning_rate": 0.00019628314888285654, - "loss": 1.1776, - "step": 2610 - }, - { - "epoch": 0.14, - "learning_rate": 0.00019624518309003586, - "loss": 1.1303, - "step": 2620 - }, - { - "epoch": 0.15, - "learning_rate": 0.00019620721729721524, - "loss": 1.137, - "step": 2630 - }, - { - "epoch": 0.15, - "learning_rate": 0.00019616925150439457, - "loss": 1.2335, - "step": 2640 - }, - { - "epoch": 0.15, - "learning_rate": 0.0001961312857115739, - "loss": 1.0721, - "step": 2650 - }, - { - "epoch": 0.15, - "learning_rate": 0.0001960933199187532, - "loss": 1.2737, - "step": 2660 - }, - { - "epoch": 0.15, - "learning_rate": 0.00019605535412593254, - "loss": 1.0844, - "step": 2670 - }, - { - "epoch": 0.15, - "learning_rate": 0.0001960173883331119, - "loss": 1.1289, - "step": 2680 - }, - { - "epoch": 0.15, - "learning_rate": 0.0001959794225402912, - "loss": 1.1722, - "step": 2690 - }, - { - "epoch": 0.15, - "learning_rate": 0.00019594145674747053, - "loss": 1.0891, - "step": 2700 - }, - { - "epoch": 0.15, - "learning_rate": 0.00019590349095464986, - "loss": 1.2333, - "step": 2710 - }, - { - "epoch": 0.15, - "learning_rate": 0.0001958655251618292, - "loss": 1.1545, - "step": 2720 - }, - { - "epoch": 0.15, - "learning_rate": 0.00019582755936900853, - "loss": 1.1221, - "step": 2730 - }, - { - "epoch": 0.15, - "learning_rate": 0.00019578959357618788, - "loss": 1.1968, - "step": 2740 - }, - { - "epoch": 0.15, - "learning_rate": 0.0001957516277833672, - "loss": 1.159, - "step": 2750 - }, - { - "epoch": 0.15, - "learning_rate": 0.00019571366199054653, - "loss": 1.2564, - "step": 2760 - }, - { - "epoch": 0.15, - "learning_rate": 0.00019567569619772585, - "loss": 1.1335, - "step": 2770 - }, - { - "epoch": 0.15, - "learning_rate": 0.00019563773040490518, - "loss": 1.1433, - "step": 2780 - }, - { - "epoch": 0.15, - "learning_rate": 0.00019559976461208453, - "loss": 1.1722, - "step": 2790 - }, - { - "epoch": 0.15, - "learning_rate": 0.00019556179881926385, - "loss": 1.0977, - "step": 2800 - }, - { - "epoch": 0.16, - "learning_rate": 0.0001955238330264432, - "loss": 1.2189, - "step": 2810 - }, - { - "epoch": 0.16, - "learning_rate": 0.00019548586723362252, - "loss": 1.1602, - "step": 2820 - }, - { - "epoch": 0.16, - "learning_rate": 0.00019544790144080185, - "loss": 1.147, - "step": 2830 - }, - { - "epoch": 0.16, - "learning_rate": 0.00019540993564798117, - "loss": 1.2212, - "step": 2840 - }, - { - "epoch": 0.16, - "learning_rate": 0.00019537196985516052, - "loss": 1.1267, - "step": 2850 - }, - { - "epoch": 0.16, - "learning_rate": 0.00019533400406233984, - "loss": 1.2403, - "step": 2860 - }, - { - "epoch": 0.16, - "learning_rate": 0.00019529603826951917, - "loss": 1.13, - "step": 2870 - }, - { - "epoch": 0.16, - "learning_rate": 0.0001952580724766985, - "loss": 1.1179, - "step": 2880 - }, - { - "epoch": 0.16, - "learning_rate": 0.00019522010668387784, - "loss": 1.1641, - "step": 2890 - }, - { - "epoch": 0.16, - "learning_rate": 0.00019518214089105716, - "loss": 1.0859, - "step": 2900 - }, - { - "epoch": 0.16, - "learning_rate": 0.00019514417509823652, - "loss": 1.267, - "step": 2910 - }, - { - "epoch": 0.16, - "learning_rate": 0.00019510620930541584, - "loss": 1.1309, - "step": 2920 - }, - { - "epoch": 0.16, - "learning_rate": 0.00019506824351259516, - "loss": 1.0973, - "step": 2930 - }, - { - "epoch": 0.16, - "learning_rate": 0.00019503027771977449, - "loss": 1.1753, - "step": 2940 - }, - { - "epoch": 0.16, - "learning_rate": 0.0001949923119269538, - "loss": 1.0664, - "step": 2950 - }, - { - "epoch": 0.16, - "learning_rate": 0.00019495434613413316, - "loss": 1.2195, - "step": 2960 - }, - { - "epoch": 0.16, - "learning_rate": 0.00019491638034131248, - "loss": 1.1501, - "step": 2970 - }, - { - "epoch": 0.16, - "learning_rate": 0.00019487841454849183, - "loss": 1.1266, - "step": 2980 - }, - { - "epoch": 0.17, - "learning_rate": 0.00019484044875567116, - "loss": 1.1611, - "step": 2990 - }, - { - "epoch": 0.17, - "learning_rate": 0.00019480248296285048, - "loss": 1.0625, - "step": 3000 - }, - { - "epoch": 0.17, - "learning_rate": 0.0001947645171700298, - "loss": 1.2264, - "step": 3010 - }, - { - "epoch": 0.17, - "learning_rate": 0.00019472655137720915, - "loss": 1.1014, - "step": 3020 - }, - { - "epoch": 0.17, - "learning_rate": 0.00019468858558438848, - "loss": 1.1016, - "step": 3030 - }, - { - "epoch": 0.17, - "learning_rate": 0.0001946506197915678, - "loss": 1.2034, - "step": 3040 - }, - { - "epoch": 0.17, - "learning_rate": 0.00019461265399874712, - "loss": 1.0768, - "step": 3050 - }, - { - "epoch": 0.17, - "learning_rate": 0.00019457468820592645, - "loss": 1.1682, - "step": 3060 - }, - { - "epoch": 0.17, - "learning_rate": 0.0001945367224131058, - "loss": 1.1381, - "step": 3070 - }, - { - "epoch": 0.17, - "learning_rate": 0.00019449875662028515, - "loss": 1.1447, - "step": 3080 - }, - { - "epoch": 0.17, - "learning_rate": 0.00019446079082746447, - "loss": 1.2159, - "step": 3090 - }, - { - "epoch": 0.17, - "learning_rate": 0.0001944228250346438, - "loss": 1.0857, - "step": 3100 - }, - { - "epoch": 0.17, - "learning_rate": 0.00019438485924182312, - "loss": 1.2315, - "step": 3110 - }, - { - "epoch": 0.17, - "learning_rate": 0.00019434689344900244, - "loss": 1.1214, - "step": 3120 - }, - { - "epoch": 0.17, - "learning_rate": 0.0001943089276561818, - "loss": 1.1003, - "step": 3130 - }, - { - "epoch": 0.17, - "learning_rate": 0.00019427096186336112, - "loss": 1.2121, - "step": 3140 - }, - { - "epoch": 0.17, - "learning_rate": 0.00019423299607054047, - "loss": 1.0662, - "step": 3150 - }, - { - "epoch": 0.17, - "learning_rate": 0.0001941950302777198, - "loss": 1.2078, - "step": 3160 - }, - { - "epoch": 0.18, - "learning_rate": 0.00019415706448489911, - "loss": 1.1218, - "step": 3170 - }, - { - "epoch": 0.18, - "learning_rate": 0.00019411909869207844, - "loss": 1.136, - "step": 3180 - }, - { - "epoch": 0.18, - "learning_rate": 0.0001940811328992578, - "loss": 1.1991, - "step": 3190 - }, - { - "epoch": 0.18, - "learning_rate": 0.0001940431671064371, - "loss": 1.0649, - "step": 3200 - }, - { - "epoch": 0.18, - "learning_rate": 0.00019400520131361644, - "loss": 1.2392, - "step": 3210 - }, - { - "epoch": 0.18, - "learning_rate": 0.00019396723552079576, - "loss": 1.118, - "step": 3220 - }, - { - "epoch": 0.18, - "learning_rate": 0.00019392926972797508, - "loss": 1.1365, - "step": 3230 - }, - { - "epoch": 0.18, - "learning_rate": 0.00019389130393515443, - "loss": 1.1871, - "step": 3240 - }, - { - "epoch": 0.18, - "learning_rate": 0.00019385333814233378, - "loss": 1.0481, - "step": 3250 - }, - { - "epoch": 0.18, - "learning_rate": 0.0001938153723495131, - "loss": 1.2319, - "step": 3260 - }, - { - "epoch": 0.18, - "learning_rate": 0.00019377740655669243, - "loss": 1.1525, - "step": 3270 - }, - { - "epoch": 0.18, - "learning_rate": 0.00019373944076387175, - "loss": 1.1213, - "step": 3280 - }, - { - "epoch": 0.18, - "learning_rate": 0.00019370147497105108, - "loss": 1.1762, - "step": 3290 - }, - { - "epoch": 0.18, - "learning_rate": 0.00019366350917823043, - "loss": 1.0419, - "step": 3300 - }, - { - "epoch": 0.18, - "learning_rate": 0.00019362554338540975, - "loss": 1.2288, - "step": 3310 - }, - { - "epoch": 0.18, - "learning_rate": 0.00019358757759258907, - "loss": 1.1777, - "step": 3320 - }, - { - "epoch": 0.18, - "learning_rate": 0.00019354961179976843, - "loss": 1.1398, - "step": 3330 - }, - { - "epoch": 0.18, - "learning_rate": 0.00019351164600694775, - "loss": 1.1758, - "step": 3340 - }, - { - "epoch": 0.19, - "learning_rate": 0.00019347368021412707, - "loss": 1.0846, - "step": 3350 - }, - { - "epoch": 0.19, - "learning_rate": 0.00019343571442130642, - "loss": 1.2201, - "step": 3360 - }, - { - "epoch": 0.19, - "learning_rate": 0.00019339774862848575, - "loss": 1.1009, - "step": 3370 - }, - { - "epoch": 0.19, - "learning_rate": 0.00019335978283566507, - "loss": 1.1371, - "step": 3380 - }, - { - "epoch": 0.19, - "learning_rate": 0.0001933218170428444, - "loss": 1.204, - "step": 3390 - }, - { - "epoch": 0.19, - "learning_rate": 0.00019328385125002372, - "loss": 1.0918, - "step": 3400 - }, - { - "epoch": 0.19, - "learning_rate": 0.00019324588545720307, - "loss": 1.19, - "step": 3410 - }, - { - "epoch": 0.19, - "learning_rate": 0.00019320791966438242, - "loss": 1.1501, - "step": 3420 - }, - { - "epoch": 0.19, - "learning_rate": 0.00019316995387156174, - "loss": 1.1232, - "step": 3430 - }, - { - "epoch": 0.19, - "learning_rate": 0.00019313198807874106, - "loss": 1.2088, - "step": 3440 - }, - { - "epoch": 0.19, - "learning_rate": 0.0001930940222859204, - "loss": 1.0601, - "step": 3450 - }, - { - "epoch": 0.19, - "learning_rate": 0.0001930560564930997, - "loss": 1.2256, - "step": 3460 - }, - { - "epoch": 0.19, - "learning_rate": 0.00019301809070027906, - "loss": 1.0896, - "step": 3470 - }, - { - "epoch": 0.19, - "learning_rate": 0.00019298012490745839, - "loss": 1.1774, - "step": 3480 - }, - { - "epoch": 0.19, - "learning_rate": 0.0001929421591146377, - "loss": 1.1691, - "step": 3490 - }, - { - "epoch": 0.19, - "learning_rate": 0.00019290419332181706, - "loss": 1.052, - "step": 3500 - }, - { - "epoch": 0.19, - "learning_rate": 0.00019286622752899638, - "loss": 1.2578, - "step": 3510 - }, - { - "epoch": 0.19, - "learning_rate": 0.0001928282617361757, - "loss": 1.14, - "step": 3520 - }, - { - "epoch": 0.19, - "learning_rate": 0.00019279029594335506, - "loss": 1.1038, - "step": 3530 - }, - { - "epoch": 0.2, - "learning_rate": 0.00019275233015053438, - "loss": 1.2139, - "step": 3540 - }, - { - "epoch": 0.2, - "learning_rate": 0.0001927143643577137, - "loss": 1.0836, - "step": 3550 - }, - { - "epoch": 0.2, - "learning_rate": 0.00019267639856489303, - "loss": 1.2205, - "step": 3560 - }, - { - "epoch": 0.2, - "learning_rate": 0.00019263843277207235, - "loss": 1.1611, - "step": 3570 - }, - { - "epoch": 0.2, - "learning_rate": 0.0001926004669792517, - "loss": 1.1044, - "step": 3580 - }, - { - "epoch": 0.2, - "learning_rate": 0.00019256250118643105, - "loss": 1.176, - "step": 3590 - }, - { - "epoch": 0.2, - "learning_rate": 0.00019252453539361037, - "loss": 1.088, - "step": 3600 - }, - { - "epoch": 0.2, - "learning_rate": 0.0001924865696007897, - "loss": 1.2273, - "step": 3610 - }, - { - "epoch": 0.2, - "learning_rate": 0.00019244860380796902, - "loss": 1.1066, - "step": 3620 - }, - { - "epoch": 0.2, - "learning_rate": 0.00019241063801514837, - "loss": 1.1157, - "step": 3630 - }, - { - "epoch": 0.2, - "learning_rate": 0.0001923726722223277, - "loss": 1.1903, - "step": 3640 - }, - { - "epoch": 0.2, - "learning_rate": 0.00019233470642950702, - "loss": 1.0617, - "step": 3650 - }, - { - "epoch": 0.2, - "learning_rate": 0.00019229674063668634, - "loss": 1.2024, - "step": 3660 - }, - { - "epoch": 0.2, - "learning_rate": 0.0001922587748438657, - "loss": 1.1173, - "step": 3670 - }, - { - "epoch": 0.2, - "learning_rate": 0.00019222080905104502, - "loss": 1.1281, - "step": 3680 - }, - { - "epoch": 0.2, - "learning_rate": 0.00019218284325822437, - "loss": 1.1616, - "step": 3690 - }, - { - "epoch": 0.2, - "learning_rate": 0.0001921448774654037, - "loss": 1.1023, - "step": 3700 - }, - { - "epoch": 0.2, - "learning_rate": 0.00019210691167258301, - "loss": 1.2164, - "step": 3710 - }, - { - "epoch": 0.21, - "learning_rate": 0.00019206894587976234, - "loss": 1.1053, - "step": 3720 - }, - { - "epoch": 0.21, - "learning_rate": 0.00019203098008694166, - "loss": 1.172, - "step": 3730 - }, - { - "epoch": 0.21, - "learning_rate": 0.000191993014294121, - "loss": 1.1863, - "step": 3740 - }, - { - "epoch": 0.21, - "learning_rate": 0.00019195504850130033, - "loss": 1.0662, - "step": 3750 - }, - { - "epoch": 0.21, - "learning_rate": 0.00019191708270847969, - "loss": 1.2229, - "step": 3760 - }, - { - "epoch": 0.21, - "learning_rate": 0.000191879116915659, - "loss": 1.1553, - "step": 3770 - }, - { - "epoch": 0.21, - "learning_rate": 0.00019184115112283833, - "loss": 1.1146, - "step": 3780 - }, - { - "epoch": 0.21, - "learning_rate": 0.00019180318533001766, - "loss": 1.1671, - "step": 3790 - }, - { - "epoch": 0.21, - "learning_rate": 0.000191765219537197, - "loss": 1.139, - "step": 3800 - }, - { - "epoch": 0.21, - "learning_rate": 0.00019172725374437633, - "loss": 1.213, - "step": 3810 - }, - { - "epoch": 0.21, - "learning_rate": 0.00019168928795155565, - "loss": 1.103, - "step": 3820 - }, - { - "epoch": 0.21, - "learning_rate": 0.00019165132215873498, - "loss": 1.144, - "step": 3830 - }, - { - "epoch": 0.21, - "learning_rate": 0.0001916133563659143, - "loss": 1.1856, - "step": 3840 - }, - { - "epoch": 0.21, - "learning_rate": 0.00019157539057309365, - "loss": 1.0548, - "step": 3850 - }, - { - "epoch": 0.21, - "learning_rate": 0.000191537424780273, - "loss": 1.2376, - "step": 3860 - }, - { - "epoch": 0.21, - "learning_rate": 0.00019149945898745232, - "loss": 1.1467, - "step": 3870 - }, - { - "epoch": 0.21, - "learning_rate": 0.00019146149319463165, - "loss": 1.1517, - "step": 3880 - }, - { - "epoch": 0.21, - "learning_rate": 0.00019142352740181097, - "loss": 1.1989, - "step": 3890 - }, - { - "epoch": 0.22, - "learning_rate": 0.0001913855616089903, - "loss": 1.0422, - "step": 3900 - }, - { - "epoch": 0.22, - "learning_rate": 0.00019134759581616965, - "loss": 1.2195, - "step": 3910 - }, - { - "epoch": 0.22, - "learning_rate": 0.00019130963002334897, - "loss": 1.1608, - "step": 3920 - }, - { - "epoch": 0.22, - "learning_rate": 0.00019127166423052832, - "loss": 1.1162, - "step": 3930 - }, - { - "epoch": 0.22, - "learning_rate": 0.00019123369843770764, - "loss": 1.1381, - "step": 3940 - }, - { - "epoch": 0.22, - "learning_rate": 0.00019119573264488697, - "loss": 1.063, - "step": 3950 - }, - { - "epoch": 0.22, - "learning_rate": 0.0001911577668520663, - "loss": 1.2331, - "step": 3960 - }, - { - "epoch": 0.22, - "learning_rate": 0.00019111980105924564, - "loss": 1.1366, - "step": 3970 - }, - { - "epoch": 0.22, - "learning_rate": 0.00019108183526642496, - "loss": 1.1444, - "step": 3980 - }, - { - "epoch": 0.22, - "learning_rate": 0.0001910438694736043, - "loss": 1.1837, - "step": 3990 - }, - { - "epoch": 0.22, - "learning_rate": 0.0001910059036807836, - "loss": 1.0373, - "step": 4000 - }, - { - "epoch": 0.22, - "eval_loss": 1.110398769378662, - "eval_runtime": 1.9777, - "eval_samples_per_second": 50.563, - "eval_steps_per_second": 6.573, - "step": 4000 - }, - { - "epoch": 0.22, - "mmlu_eval_accuracy": NaN, - "mmlu_eval_accuracy_abstract_algebra": 0.5454545454545454, - "mmlu_eval_accuracy_anatomy": 0.2857142857142857, - "mmlu_eval_accuracy_astronomy": 0.125, - "mmlu_eval_accuracy_business_ethics": 0.5454545454545454, - "mmlu_eval_accuracy_clinical_knowledge": 0.41379310344827586, - "mmlu_eval_accuracy_college_biology": 0.75, - "mmlu_eval_accuracy_college_chemistry": 1.0, - "mmlu_eval_accuracy_college_computer_science": 0.6363636363636364, - "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, - "mmlu_eval_accuracy_college_medicine": 0.45454545454545453, - "mmlu_eval_accuracy_college_physics": 0.5454545454545454, - "mmlu_eval_accuracy_computer_security": 0.5454545454545454, - "mmlu_eval_accuracy_conceptual_physics": 0.42857142857142855, - "mmlu_eval_accuracy_econometrics": NaN, - "mmlu_eval_accuracy_electrical_engineering": NaN, - "mmlu_eval_accuracy_elementary_mathematics": NaN, - "mmlu_eval_accuracy_formal_logic": NaN, - "mmlu_eval_accuracy_global_facts": NaN, - "mmlu_eval_accuracy_high_school_biology": NaN, - "mmlu_eval_accuracy_high_school_chemistry": NaN, - "mmlu_eval_accuracy_high_school_computer_science": NaN, - "mmlu_eval_accuracy_high_school_european_history": NaN, - "mmlu_eval_accuracy_high_school_geography": NaN, - "mmlu_eval_accuracy_high_school_government_and_politics": NaN, - "mmlu_eval_accuracy_high_school_macroeconomics": NaN, - "mmlu_eval_accuracy_high_school_mathematics": NaN, - "mmlu_eval_accuracy_high_school_microeconomics": NaN, - "mmlu_eval_accuracy_high_school_physics": NaN, - "mmlu_eval_accuracy_high_school_psychology": NaN, - "mmlu_eval_accuracy_high_school_statistics": NaN, - "mmlu_eval_accuracy_high_school_us_history": NaN, - "mmlu_eval_accuracy_high_school_world_history": NaN, - "mmlu_eval_accuracy_human_aging": NaN, - "mmlu_eval_accuracy_human_sexuality": NaN, - "mmlu_eval_accuracy_international_law": NaN, - "mmlu_eval_accuracy_jurisprudence": NaN, - "mmlu_eval_accuracy_logical_fallacies": NaN, - "mmlu_eval_accuracy_machine_learning": NaN, - "mmlu_eval_accuracy_management": NaN, - "mmlu_eval_accuracy_marketing": NaN, - "mmlu_eval_accuracy_medical_genetics": NaN, - "mmlu_eval_accuracy_miscellaneous": NaN, - "mmlu_eval_accuracy_moral_disputes": NaN, - "mmlu_eval_accuracy_moral_scenarios": NaN, - "mmlu_eval_accuracy_nutrition": NaN, - "mmlu_eval_accuracy_philosophy": NaN, - "mmlu_eval_accuracy_prehistory": NaN, - "mmlu_eval_accuracy_professional_accounting": NaN, - "mmlu_eval_accuracy_professional_law": NaN, - "mmlu_eval_accuracy_professional_medicine": NaN, - "mmlu_eval_accuracy_professional_psychology": NaN, - "mmlu_eval_accuracy_public_relations": NaN, - "mmlu_eval_accuracy_security_studies": NaN, - "mmlu_eval_accuracy_sociology": NaN, - "mmlu_eval_accuracy_us_foreign_policy": NaN, - "mmlu_eval_accuracy_virology": NaN, - "mmlu_eval_accuracy_world_religions": NaN, - "mmlu_loss": 1.2074859381342928, - "step": 4000 - }, - { - "epoch": 0.22, - "learning_rate": 0.00019096793788796293, - "loss": 1.2292, - "step": 4010 - }, - { - "epoch": 0.22, - "learning_rate": 0.00019092997209514228, - "loss": 1.085, - "step": 4020 - }, - { - "epoch": 0.22, - "learning_rate": 0.00019089200630232164, - "loss": 1.1231, - "step": 4030 - }, - { - "epoch": 0.22, - "learning_rate": 0.00019085404050950096, - "loss": 1.2231, - "step": 4040 - }, - { - "epoch": 0.22, - "learning_rate": 0.00019081607471668028, - "loss": 1.0526, - "step": 4050 - }, - { - "epoch": 0.22, - "learning_rate": 0.0001907781089238596, - "loss": 1.2899, - "step": 4060 - }, - { - "epoch": 0.22, - "learning_rate": 0.00019074014313103893, - "loss": 1.1367, - "step": 4070 - }, - { - "epoch": 0.23, - "learning_rate": 0.00019070217733821828, - "loss": 1.1275, - "step": 4080 - }, - { - "epoch": 0.23, - "learning_rate": 0.0001906642115453976, - "loss": 1.2279, - "step": 4090 - }, - { - "epoch": 0.23, - "learning_rate": 0.00019062624575257693, - "loss": 1.0699, - "step": 4100 - }, - { - "epoch": 0.23, - "learning_rate": 0.00019058827995975628, - "loss": 1.1987, - "step": 4110 - }, - { - "epoch": 0.23, - "learning_rate": 0.0001905503141669356, - "loss": 1.1191, - "step": 4120 - }, - { - "epoch": 0.23, - "learning_rate": 0.00019051234837411492, - "loss": 1.1265, - "step": 4130 - }, - { - "epoch": 0.23, - "learning_rate": 0.00019047438258129427, - "loss": 1.1401, - "step": 4140 - }, - { - "epoch": 0.23, - "learning_rate": 0.0001904364167884736, - "loss": 1.0537, - "step": 4150 - }, - { - "epoch": 0.23, - "learning_rate": 0.00019039845099565292, - "loss": 1.1899, - "step": 4160 - }, - { - "epoch": 0.23, - "learning_rate": 0.00019036048520283224, - "loss": 1.1534, - "step": 4170 - }, - { - "epoch": 0.23, - "learning_rate": 0.00019032251941001157, - "loss": 1.1586, - "step": 4180 - }, - { - "epoch": 0.23, - "learning_rate": 0.00019028455361719092, - "loss": 1.1836, - "step": 4190 - }, - { - "epoch": 0.23, - "learning_rate": 0.00019024658782437027, - "loss": 1.0386, - "step": 4200 - }, - { - "epoch": 0.23, - "learning_rate": 0.0001902086220315496, - "loss": 1.2267, - "step": 4210 - }, - { - "epoch": 0.23, - "learning_rate": 0.00019017065623872892, - "loss": 1.1185, - "step": 4220 - }, - { - "epoch": 0.23, - "learning_rate": 0.00019013269044590824, - "loss": 1.1282, - "step": 4230 - }, - { - "epoch": 0.23, - "learning_rate": 0.00019009472465308756, - "loss": 1.1845, - "step": 4240 - }, - { - "epoch": 0.23, - "learning_rate": 0.0001900567588602669, - "loss": 1.0246, - "step": 4250 - }, - { - "epoch": 0.24, - "learning_rate": 0.00019001879306744624, - "loss": 1.2289, - "step": 4260 - }, - { - "epoch": 0.24, - "learning_rate": 0.00018998082727462556, - "loss": 1.1126, - "step": 4270 - }, - { - "epoch": 0.24, - "learning_rate": 0.0001899428614818049, - "loss": 1.1857, - "step": 4280 - }, - { - "epoch": 0.24, - "learning_rate": 0.00018990489568898423, - "loss": 1.1578, - "step": 4290 - }, - { - "epoch": 0.24, - "learning_rate": 0.00018986692989616356, - "loss": 1.0807, - "step": 4300 - }, - { - "epoch": 0.24, - "learning_rate": 0.0001898289641033429, - "loss": 1.1825, - "step": 4310 - }, - { - "epoch": 0.24, - "learning_rate": 0.00018979099831052223, - "loss": 1.1619, - "step": 4320 - }, - { - "epoch": 0.24, - "learning_rate": 0.00018975303251770156, - "loss": 1.0785, - "step": 4330 - }, - { - "epoch": 0.24, - "learning_rate": 0.00018971506672488088, - "loss": 1.1788, - "step": 4340 - }, - { - "epoch": 0.24, - "learning_rate": 0.0001896771009320602, - "loss": 1.0419, - "step": 4350 - }, - { - "epoch": 0.24, - "learning_rate": 0.00018963913513923955, - "loss": 1.1988, - "step": 4360 - }, - { - "epoch": 0.24, - "learning_rate": 0.0001896011693464189, - "loss": 1.0989, - "step": 4370 - }, - { - "epoch": 0.24, - "learning_rate": 0.00018956320355359823, - "loss": 1.103, - "step": 4380 - }, - { - "epoch": 0.24, - "learning_rate": 0.00018952523776077755, - "loss": 1.2271, - "step": 4390 - }, - { - "epoch": 0.24, - "learning_rate": 0.00018948727196795687, - "loss": 1.0848, - "step": 4400 - }, - { - "epoch": 0.24, - "learning_rate": 0.0001894493061751362, - "loss": 1.244, - "step": 4410 - }, - { - "epoch": 0.24, - "learning_rate": 0.00018941134038231555, - "loss": 1.1194, - "step": 4420 - }, - { - "epoch": 0.24, - "learning_rate": 0.00018937337458949487, - "loss": 1.1216, - "step": 4430 - }, - { - "epoch": 0.25, - "learning_rate": 0.0001893354087966742, - "loss": 1.2267, - "step": 4440 - }, - { - "epoch": 0.25, - "learning_rate": 0.00018929744300385354, - "loss": 1.0305, - "step": 4450 - }, - { - "epoch": 0.25, - "learning_rate": 0.00018925947721103287, - "loss": 1.2269, - "step": 4460 - }, - { - "epoch": 0.25, - "learning_rate": 0.0001892215114182122, - "loss": 1.0973, - "step": 4470 - }, - { - "epoch": 0.25, - "learning_rate": 0.00018918354562539154, - "loss": 1.1478, - "step": 4480 - }, - { - "epoch": 0.25, - "learning_rate": 0.00018914557983257087, - "loss": 1.1812, - "step": 4490 - }, - { - "epoch": 0.25, - "learning_rate": 0.0001891076140397502, - "loss": 1.071, - "step": 4500 - }, - { - "epoch": 0.25, - "learning_rate": 0.0001890696482469295, - "loss": 1.2323, - "step": 4510 - }, - { - "epoch": 0.25, - "learning_rate": 0.00018903168245410884, - "loss": 1.1268, - "step": 4520 - }, - { - "epoch": 0.25, - "learning_rate": 0.0001889937166612882, - "loss": 1.1018, - "step": 4530 - }, - { - "epoch": 0.25, - "learning_rate": 0.00018895575086846754, - "loss": 1.2016, - "step": 4540 - }, - { - "epoch": 0.25, - "learning_rate": 0.00018891778507564686, - "loss": 1.083, - "step": 4550 - }, - { - "epoch": 0.25, - "learning_rate": 0.00018887981928282618, - "loss": 1.2251, - "step": 4560 - }, - { - "epoch": 0.25, - "learning_rate": 0.0001888418534900055, - "loss": 1.1421, - "step": 4570 - }, - { - "epoch": 0.25, - "learning_rate": 0.00018880388769718486, - "loss": 1.1455, - "step": 4580 - }, - { - "epoch": 0.25, - "learning_rate": 0.00018876592190436418, - "loss": 1.1418, - "step": 4590 - }, - { - "epoch": 0.25, - "learning_rate": 0.0001887279561115435, - "loss": 1.0783, - "step": 4600 - }, - { - "epoch": 0.25, - "learning_rate": 0.00018868999031872283, - "loss": 1.187, - "step": 4610 - }, - { - "epoch": 0.26, - "learning_rate": 0.00018865202452590215, - "loss": 1.1095, - "step": 4620 - }, - { - "epoch": 0.26, - "learning_rate": 0.0001886140587330815, - "loss": 1.1307, - "step": 4630 - }, - { - "epoch": 0.26, - "learning_rate": 0.00018857609294026085, - "loss": 1.1626, - "step": 4640 - }, - { - "epoch": 0.26, - "learning_rate": 0.00018853812714744018, - "loss": 1.0711, - "step": 4650 - }, - { - "epoch": 0.26, - "learning_rate": 0.0001885001613546195, - "loss": 1.1894, - "step": 4660 - }, - { - "epoch": 0.26, - "learning_rate": 0.00018846219556179882, - "loss": 1.1372, - "step": 4670 - }, - { - "epoch": 0.26, - "learning_rate": 0.00018842422976897815, - "loss": 1.1343, - "step": 4680 - }, - { - "epoch": 0.26, - "learning_rate": 0.0001883862639761575, - "loss": 1.1851, - "step": 4690 - }, - { - "epoch": 0.26, - "learning_rate": 0.00018834829818333682, - "loss": 1.0585, - "step": 4700 - }, - { - "epoch": 0.26, - "learning_rate": 0.00018831033239051617, - "loss": 1.1874, - "step": 4710 - }, - { - "epoch": 0.26, - "learning_rate": 0.0001882723665976955, - "loss": 1.1464, - "step": 4720 - }, - { - "epoch": 0.26, - "learning_rate": 0.00018823440080487482, - "loss": 1.1317, - "step": 4730 - }, - { - "epoch": 0.26, - "learning_rate": 0.00018819643501205414, - "loss": 1.2306, - "step": 4740 - }, - { - "epoch": 0.26, - "learning_rate": 0.0001881584692192335, - "loss": 1.0564, - "step": 4750 - }, - { - "epoch": 0.26, - "learning_rate": 0.00018812050342641282, - "loss": 1.209, - "step": 4760 - }, - { - "epoch": 0.26, - "learning_rate": 0.00018808253763359214, - "loss": 1.1539, - "step": 4770 - }, - { - "epoch": 0.26, - "learning_rate": 0.00018804457184077146, - "loss": 1.1303, - "step": 4780 - }, - { - "epoch": 0.26, - "learning_rate": 0.00018800660604795079, - "loss": 1.2107, - "step": 4790 - }, - { - "epoch": 0.27, - "learning_rate": 0.00018796864025513014, - "loss": 1.0294, - "step": 4800 - }, - { - "epoch": 0.27, - "learning_rate": 0.0001879306744623095, - "loss": 1.234, - "step": 4810 - }, - { - "epoch": 0.27, - "learning_rate": 0.0001878927086694888, - "loss": 1.1464, - "step": 4820 - }, - { - "epoch": 0.27, - "learning_rate": 0.00018785474287666813, - "loss": 1.107, - "step": 4830 - }, - { - "epoch": 0.27, - "learning_rate": 0.00018781677708384746, - "loss": 1.1833, - "step": 4840 - }, - { - "epoch": 0.27, - "learning_rate": 0.00018777881129102678, - "loss": 1.0265, - "step": 4850 - }, - { - "epoch": 0.27, - "learning_rate": 0.00018774084549820613, - "loss": 1.2075, - "step": 4860 - }, - { - "epoch": 0.27, - "learning_rate": 0.00018770287970538545, - "loss": 1.1351, - "step": 4870 - }, - { - "epoch": 0.27, - "learning_rate": 0.00018766491391256478, - "loss": 1.1321, - "step": 4880 - }, - { - "epoch": 0.27, - "learning_rate": 0.00018762694811974413, - "loss": 1.1653, - "step": 4890 - }, - { - "epoch": 0.27, - "learning_rate": 0.00018758898232692345, - "loss": 1.0572, - "step": 4900 - }, - { - "epoch": 0.27, - "learning_rate": 0.00018755101653410278, - "loss": 1.2171, - "step": 4910 - }, - { - "epoch": 0.27, - "learning_rate": 0.00018751305074128213, - "loss": 1.098, - "step": 4920 - }, - { - "epoch": 0.27, - "learning_rate": 0.00018747508494846145, - "loss": 1.1458, - "step": 4930 - }, - { - "epoch": 0.27, - "learning_rate": 0.00018743711915564077, - "loss": 1.1418, - "step": 4940 - }, - { - "epoch": 0.27, - "learning_rate": 0.0001873991533628201, - "loss": 1.0089, - "step": 4950 - }, - { - "epoch": 0.27, - "learning_rate": 0.00018736118756999942, - "loss": 1.1623, - "step": 4960 - }, - { - "epoch": 0.27, - "learning_rate": 0.00018732322177717877, - "loss": 1.1495, - "step": 4970 - }, - { - "epoch": 0.28, - "learning_rate": 0.00018728525598435812, - "loss": 1.1035, - "step": 4980 - }, - { - "epoch": 0.28, - "learning_rate": 0.00018724729019153744, - "loss": 1.161, - "step": 4990 - }, - { - "epoch": 0.28, - "learning_rate": 0.00018720932439871677, - "loss": 1.0411, - "step": 5000 - }, - { - "epoch": 0.28, - "learning_rate": 0.0001871713586058961, - "loss": 1.2324, - "step": 5010 - }, - { - "epoch": 0.28, - "learning_rate": 0.00018713339281307541, - "loss": 1.1385, - "step": 5020 - }, - { - "epoch": 0.28, - "learning_rate": 0.00018709542702025477, - "loss": 1.1043, - "step": 5030 - }, - { - "epoch": 0.28, - "learning_rate": 0.0001870574612274341, - "loss": 1.181, - "step": 5040 - }, - { - "epoch": 0.28, - "learning_rate": 0.0001870194954346134, - "loss": 1.0469, - "step": 5050 - }, - { - "epoch": 0.28, - "learning_rate": 0.00018698152964179276, - "loss": 1.2059, - "step": 5060 - }, - { - "epoch": 0.28, - "learning_rate": 0.00018694356384897209, - "loss": 1.1162, - "step": 5070 - }, - { - "epoch": 0.28, - "learning_rate": 0.0001869055980561514, - "loss": 1.1082, - "step": 5080 - }, - { - "epoch": 0.28, - "learning_rate": 0.00018686763226333076, - "loss": 1.1686, - "step": 5090 - }, - { - "epoch": 0.28, - "learning_rate": 0.00018682966647051008, - "loss": 1.0448, - "step": 5100 - }, - { - "epoch": 0.28, - "learning_rate": 0.0001867917006776894, - "loss": 1.1741, - "step": 5110 - }, - { - "epoch": 0.28, - "learning_rate": 0.00018675373488486873, - "loss": 1.0716, - "step": 5120 - }, - { - "epoch": 0.28, - "learning_rate": 0.00018671576909204805, - "loss": 1.1176, - "step": 5130 - }, - { - "epoch": 0.28, - "learning_rate": 0.0001866778032992274, - "loss": 1.1733, - "step": 5140 - }, - { - "epoch": 0.28, - "learning_rate": 0.00018663983750640675, - "loss": 1.0743, - "step": 5150 - }, - { - "epoch": 0.29, - "learning_rate": 0.00018660187171358608, - "loss": 1.2193, - "step": 5160 - }, - { - "epoch": 0.29, - "learning_rate": 0.0001865639059207654, - "loss": 1.1102, - "step": 5170 - }, - { - "epoch": 0.29, - "learning_rate": 0.00018652594012794473, - "loss": 1.121, - "step": 5180 - }, - { - "epoch": 0.29, - "learning_rate": 0.00018648797433512405, - "loss": 1.1776, - "step": 5190 - }, - { - "epoch": 0.29, - "learning_rate": 0.0001864500085423034, - "loss": 1.0163, - "step": 5200 - }, - { - "epoch": 0.29, - "learning_rate": 0.00018641204274948272, - "loss": 1.2541, - "step": 5210 - }, - { - "epoch": 0.29, - "learning_rate": 0.00018637407695666205, - "loss": 1.0853, - "step": 5220 - }, - { - "epoch": 0.29, - "learning_rate": 0.0001863361111638414, - "loss": 1.1733, - "step": 5230 - }, - { - "epoch": 0.29, - "learning_rate": 0.00018629814537102072, - "loss": 1.2336, - "step": 5240 - }, - { - "epoch": 0.29, - "learning_rate": 0.00018626017957820004, - "loss": 1.0562, - "step": 5250 - }, - { - "epoch": 0.29, - "learning_rate": 0.0001862222137853794, - "loss": 1.2348, - "step": 5260 - }, - { - "epoch": 0.29, - "learning_rate": 0.00018618424799255872, - "loss": 1.1489, - "step": 5270 - }, - { - "epoch": 0.29, - "learning_rate": 0.00018614628219973804, - "loss": 1.15, - "step": 5280 - }, - { - "epoch": 0.29, - "learning_rate": 0.00018610831640691736, - "loss": 1.1572, - "step": 5290 - }, - { - "epoch": 0.29, - "learning_rate": 0.0001860703506140967, - "loss": 1.0913, - "step": 5300 - }, - { - "epoch": 0.29, - "learning_rate": 0.00018603238482127604, - "loss": 1.1969, - "step": 5310 - }, - { - "epoch": 0.29, - "learning_rate": 0.0001859944190284554, - "loss": 1.0944, - "step": 5320 - }, - { - "epoch": 0.29, - "learning_rate": 0.0001859564532356347, - "loss": 1.1065, - "step": 5330 - }, - { - "epoch": 0.29, - "learning_rate": 0.00018591848744281404, - "loss": 1.1457, - "step": 5340 - }, - { - "epoch": 0.3, - "learning_rate": 0.00018588052164999336, - "loss": 1.0607, - "step": 5350 - }, - { - "epoch": 0.3, - "learning_rate": 0.00018584255585717268, - "loss": 1.2277, - "step": 5360 - }, - { - "epoch": 0.3, - "learning_rate": 0.00018580459006435203, - "loss": 1.1081, - "step": 5370 - }, - { - "epoch": 0.3, - "learning_rate": 0.00018576662427153136, - "loss": 1.1414, - "step": 5380 - }, - { - "epoch": 0.3, - "learning_rate": 0.00018572865847871068, - "loss": 1.2183, - "step": 5390 - }, - { - "epoch": 0.3, - "learning_rate": 0.00018569069268589003, - "loss": 1.0255, - "step": 5400 - }, - { - "epoch": 0.3, - "learning_rate": 0.00018565272689306935, - "loss": 1.1896, - "step": 5410 - }, - { - "epoch": 0.3, - "learning_rate": 0.00018561476110024868, - "loss": 1.127, - "step": 5420 - }, - { - "epoch": 0.3, - "learning_rate": 0.00018557679530742803, - "loss": 1.1278, - "step": 5430 - }, - { - "epoch": 0.3, - "learning_rate": 0.00018553882951460735, - "loss": 1.2204, - "step": 5440 - }, - { - "epoch": 0.3, - "learning_rate": 0.00018550086372178667, - "loss": 1.0458, - "step": 5450 - }, - { - "epoch": 0.3, - "learning_rate": 0.000185462897928966, - "loss": 1.2106, - "step": 5460 - }, - { - "epoch": 0.3, - "learning_rate": 0.00018542493213614532, - "loss": 1.145, - "step": 5470 - }, - { - "epoch": 0.3, - "learning_rate": 0.00018538696634332467, - "loss": 1.1016, - "step": 5480 - }, - { - "epoch": 0.3, - "learning_rate": 0.00018534900055050402, - "loss": 1.1442, - "step": 5490 - }, - { - "epoch": 0.3, - "learning_rate": 0.00018531103475768335, - "loss": 1.0923, - "step": 5500 - }, - { - "epoch": 0.3, - "learning_rate": 0.00018527306896486267, - "loss": 1.1807, - "step": 5510 - }, - { - "epoch": 0.3, - "learning_rate": 0.000185235103172042, - "loss": 1.1165, - "step": 5520 - }, - { - "epoch": 0.31, - "learning_rate": 0.00018519713737922134, - "loss": 1.148, - "step": 5530 - }, - { - "epoch": 0.31, - "learning_rate": 0.00018515917158640067, - "loss": 1.193, - "step": 5540 - }, - { - "epoch": 0.31, - "learning_rate": 0.00018512120579358, - "loss": 1.0173, - "step": 5550 - }, - { - "epoch": 0.31, - "learning_rate": 0.00018508324000075931, - "loss": 1.2175, - "step": 5560 - }, - { - "epoch": 0.31, - "learning_rate": 0.00018504527420793864, - "loss": 1.1131, - "step": 5570 - }, - { - "epoch": 0.31, - "learning_rate": 0.000185007308415118, - "loss": 1.1276, - "step": 5580 - }, - { - "epoch": 0.31, - "learning_rate": 0.00018496934262229734, - "loss": 1.1736, - "step": 5590 - }, - { - "epoch": 0.31, - "learning_rate": 0.00018493137682947666, - "loss": 1.0398, - "step": 5600 - }, - { - "epoch": 0.31, - "learning_rate": 0.00018489341103665599, - "loss": 1.1924, - "step": 5610 - }, - { - "epoch": 0.31, - "learning_rate": 0.0001848554452438353, - "loss": 1.1046, - "step": 5620 - }, - { - "epoch": 0.31, - "learning_rate": 0.00018481747945101463, - "loss": 1.1178, - "step": 5630 - }, - { - "epoch": 0.31, - "learning_rate": 0.00018477951365819398, - "loss": 1.1669, - "step": 5640 - }, - { - "epoch": 0.31, - "learning_rate": 0.0001847415478653733, - "loss": 1.0463, - "step": 5650 - }, - { - "epoch": 0.31, - "learning_rate": 0.00018470358207255266, - "loss": 1.2307, - "step": 5660 - }, - { - "epoch": 0.31, - "learning_rate": 0.00018466561627973198, - "loss": 1.0907, - "step": 5670 - }, - { - "epoch": 0.31, - "learning_rate": 0.0001846276504869113, - "loss": 1.1094, - "step": 5680 - }, - { - "epoch": 0.31, - "learning_rate": 0.00018458968469409063, - "loss": 1.1678, - "step": 5690 - }, - { - "epoch": 0.31, - "learning_rate": 0.00018455171890126998, - "loss": 1.0547, - "step": 5700 - }, - { - "epoch": 0.32, - "learning_rate": 0.0001845137531084493, - "loss": 1.2321, - "step": 5710 - }, - { - "epoch": 0.32, - "learning_rate": 0.00018447578731562862, - "loss": 1.1277, - "step": 5720 - }, - { - "epoch": 0.32, - "learning_rate": 0.00018443782152280795, - "loss": 1.1338, - "step": 5730 - }, - { - "epoch": 0.32, - "learning_rate": 0.00018439985572998727, - "loss": 1.2256, - "step": 5740 - }, - { - "epoch": 0.32, - "learning_rate": 0.00018436188993716662, - "loss": 1.0613, - "step": 5750 - }, - { - "epoch": 0.32, - "learning_rate": 0.00018432392414434597, - "loss": 1.2168, - "step": 5760 - }, - { - "epoch": 0.32, - "learning_rate": 0.0001842859583515253, - "loss": 1.0944, - "step": 5770 - }, - { - "epoch": 0.32, - "learning_rate": 0.00018424799255870462, - "loss": 1.1116, - "step": 5780 - }, - { - "epoch": 0.32, - "learning_rate": 0.00018421002676588394, - "loss": 1.2336, - "step": 5790 - }, - { - "epoch": 0.32, - "learning_rate": 0.00018417206097306327, - "loss": 1.0786, - "step": 5800 - }, - { - "epoch": 0.32, - "learning_rate": 0.00018413409518024262, - "loss": 1.2013, - "step": 5810 - }, - { - "epoch": 0.32, - "learning_rate": 0.00018409612938742194, - "loss": 1.0692, - "step": 5820 - }, - { - "epoch": 0.32, - "learning_rate": 0.00018405816359460126, - "loss": 1.1132, - "step": 5830 - }, - { - "epoch": 0.32, - "learning_rate": 0.00018402019780178061, - "loss": 1.1931, - "step": 5840 - }, - { - "epoch": 0.32, - "learning_rate": 0.00018398223200895994, - "loss": 1.0281, - "step": 5850 - }, - { - "epoch": 0.32, - "learning_rate": 0.00018394426621613926, - "loss": 1.2425, - "step": 5860 - }, - { - "epoch": 0.32, - "learning_rate": 0.0001839063004233186, - "loss": 1.149, - "step": 5870 - }, - { - "epoch": 0.32, - "learning_rate": 0.00018386833463049793, - "loss": 1.1379, - "step": 5880 - }, - { - "epoch": 0.33, - "learning_rate": 0.00018383036883767726, - "loss": 1.1725, - "step": 5890 - }, - { - "epoch": 0.33, - "learning_rate": 0.00018379240304485658, - "loss": 1.0951, - "step": 5900 - }, - { - "epoch": 0.33, - "learning_rate": 0.0001837544372520359, - "loss": 1.195, - "step": 5910 - }, - { - "epoch": 0.33, - "learning_rate": 0.00018371647145921526, - "loss": 1.1485, - "step": 5920 - }, - { - "epoch": 0.33, - "learning_rate": 0.0001836785056663946, - "loss": 1.1079, - "step": 5930 - }, - { - "epoch": 0.33, - "learning_rate": 0.00018364053987357393, - "loss": 1.1965, - "step": 5940 - }, - { - "epoch": 0.33, - "learning_rate": 0.00018360257408075325, - "loss": 1.0699, - "step": 5950 - }, - { - "epoch": 0.33, - "learning_rate": 0.00018356460828793258, - "loss": 1.1892, - "step": 5960 - }, - { - "epoch": 0.33, - "learning_rate": 0.0001835266424951119, - "loss": 1.1574, - "step": 5970 - }, - { - "epoch": 0.33, - "learning_rate": 0.00018348867670229125, - "loss": 1.1089, - "step": 5980 - }, - { - "epoch": 0.33, - "learning_rate": 0.00018345071090947057, - "loss": 1.1851, - "step": 5990 - }, - { - "epoch": 0.33, - "learning_rate": 0.0001834127451166499, - "loss": 1.0495, - "step": 6000 - }, - { - "epoch": 0.33, - "eval_loss": 1.1018731594085693, - "eval_runtime": 1.9641, - "eval_samples_per_second": 50.913, - "eval_steps_per_second": 6.619, - "step": 6000 - }, - { - "epoch": 0.33, - "mmlu_eval_accuracy": NaN, - "mmlu_eval_accuracy_abstract_algebra": 0.5454545454545454, - "mmlu_eval_accuracy_anatomy": 0.2857142857142857, - "mmlu_eval_accuracy_astronomy": 0.0625, - "mmlu_eval_accuracy_business_ethics": 0.5454545454545454, - "mmlu_eval_accuracy_clinical_knowledge": 0.5172413793103449, - "mmlu_eval_accuracy_college_biology": 0.75, - "mmlu_eval_accuracy_college_chemistry": 1.0, - "mmlu_eval_accuracy_college_computer_science": 0.6363636363636364, - "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, - "mmlu_eval_accuracy_college_medicine": 0.45454545454545453, - "mmlu_eval_accuracy_college_physics": 0.5454545454545454, - "mmlu_eval_accuracy_computer_security": 0.5454545454545454, - "mmlu_eval_accuracy_conceptual_physics": 0.3333333333333333, - "mmlu_eval_accuracy_econometrics": NaN, - "mmlu_eval_accuracy_electrical_engineering": NaN, - "mmlu_eval_accuracy_elementary_mathematics": NaN, - "mmlu_eval_accuracy_formal_logic": NaN, - "mmlu_eval_accuracy_global_facts": NaN, - "mmlu_eval_accuracy_high_school_biology": NaN, - "mmlu_eval_accuracy_high_school_chemistry": NaN, - "mmlu_eval_accuracy_high_school_computer_science": NaN, - "mmlu_eval_accuracy_high_school_european_history": NaN, - "mmlu_eval_accuracy_high_school_geography": NaN, - "mmlu_eval_accuracy_high_school_government_and_politics": NaN, - "mmlu_eval_accuracy_high_school_macroeconomics": NaN, - "mmlu_eval_accuracy_high_school_mathematics": NaN, - "mmlu_eval_accuracy_high_school_microeconomics": NaN, - "mmlu_eval_accuracy_high_school_physics": NaN, - "mmlu_eval_accuracy_high_school_psychology": NaN, - "mmlu_eval_accuracy_high_school_statistics": NaN, - "mmlu_eval_accuracy_high_school_us_history": NaN, - "mmlu_eval_accuracy_high_school_world_history": NaN, - "mmlu_eval_accuracy_human_aging": NaN, - "mmlu_eval_accuracy_human_sexuality": NaN, - "mmlu_eval_accuracy_international_law": NaN, - "mmlu_eval_accuracy_jurisprudence": NaN, - "mmlu_eval_accuracy_logical_fallacies": NaN, - "mmlu_eval_accuracy_machine_learning": NaN, - "mmlu_eval_accuracy_management": NaN, - "mmlu_eval_accuracy_marketing": NaN, - "mmlu_eval_accuracy_medical_genetics": NaN, - "mmlu_eval_accuracy_miscellaneous": NaN, - "mmlu_eval_accuracy_moral_disputes": NaN, - "mmlu_eval_accuracy_moral_scenarios": NaN, - "mmlu_eval_accuracy_nutrition": NaN, - "mmlu_eval_accuracy_philosophy": NaN, - "mmlu_eval_accuracy_prehistory": NaN, - "mmlu_eval_accuracy_professional_accounting": NaN, - "mmlu_eval_accuracy_professional_law": NaN, - "mmlu_eval_accuracy_professional_medicine": NaN, - "mmlu_eval_accuracy_professional_psychology": NaN, - "mmlu_eval_accuracy_public_relations": NaN, - "mmlu_eval_accuracy_security_studies": NaN, - "mmlu_eval_accuracy_sociology": NaN, - "mmlu_eval_accuracy_us_foreign_policy": NaN, - "mmlu_eval_accuracy_virology": NaN, - "mmlu_eval_accuracy_world_religions": NaN, - "mmlu_loss": 1.2562693644625444, - "step": 6000 - }, - { - "epoch": 0.33, - "learning_rate": 0.00018337477932382925, - "loss": 1.2248, - "step": 6010 - }, - { - "epoch": 0.33, - "learning_rate": 0.00018333681353100857, - "loss": 1.1279, - "step": 6020 - }, - { - "epoch": 0.33, - "learning_rate": 0.0001832988477381879, - "loss": 1.0738, - "step": 6030 - }, - { - "epoch": 0.33, - "learning_rate": 0.00018326088194536725, - "loss": 1.1895, - "step": 6040 - }, - { - "epoch": 0.33, - "learning_rate": 0.00018322291615254657, - "loss": 1.0129, - "step": 6050 - }, - { - "epoch": 0.33, - "learning_rate": 0.0001831849503597259, - "loss": 1.2221, - "step": 6060 - }, - { - "epoch": 0.34, - "learning_rate": 0.00018314698456690522, - "loss": 1.1028, - "step": 6070 - }, - { - "epoch": 0.34, - "learning_rate": 0.00018310901877408454, - "loss": 1.102, - "step": 6080 - }, - { - "epoch": 0.34, - "learning_rate": 0.0001830710529812639, - "loss": 1.1225, - "step": 6090 - }, - { - "epoch": 0.34, - "learning_rate": 0.00018303308718844324, - "loss": 1.0915, - "step": 6100 - }, - { - "epoch": 0.34, - "learning_rate": 0.00018299512139562256, - "loss": 1.1598, - "step": 6110 - }, - { - "epoch": 0.34, - "learning_rate": 0.0001829571556028019, - "loss": 1.1404, - "step": 6120 - }, - { - "epoch": 0.34, - "learning_rate": 0.0001829191898099812, - "loss": 1.0737, - "step": 6130 - }, - { - "epoch": 0.34, - "learning_rate": 0.00018288122401716053, - "loss": 1.1347, - "step": 6140 - }, - { - "epoch": 0.34, - "learning_rate": 0.00018284325822433988, - "loss": 1.0211, - "step": 6150 - }, - { - "epoch": 0.34, - "learning_rate": 0.0001828052924315192, - "loss": 1.2267, - "step": 6160 - }, - { - "epoch": 0.34, - "learning_rate": 0.00018276732663869853, - "loss": 1.1446, - "step": 6170 - }, - { - "epoch": 0.34, - "learning_rate": 0.00018272936084587788, - "loss": 1.1504, - "step": 6180 - }, - { - "epoch": 0.34, - "learning_rate": 0.0001826913950530572, - "loss": 1.1443, - "step": 6190 - }, - { - "epoch": 0.34, - "learning_rate": 0.00018265342926023653, - "loss": 1.0797, - "step": 6200 - }, - { - "epoch": 0.34, - "learning_rate": 0.00018261546346741588, - "loss": 1.1665, - "step": 6210 - }, - { - "epoch": 0.34, - "learning_rate": 0.0001825774976745952, - "loss": 1.0945, - "step": 6220 - }, - { - "epoch": 0.34, - "learning_rate": 0.00018253953188177453, - "loss": 1.1407, - "step": 6230 - }, - { - "epoch": 0.34, - "learning_rate": 0.00018250156608895385, - "loss": 1.1482, - "step": 6240 - }, - { - "epoch": 0.35, - "learning_rate": 0.00018246360029613317, - "loss": 1.0432, - "step": 6250 - }, - { - "epoch": 0.35, - "learning_rate": 0.00018242563450331252, - "loss": 1.2223, - "step": 6260 - }, - { - "epoch": 0.35, - "learning_rate": 0.00018238766871049187, - "loss": 1.1428, - "step": 6270 - }, - { - "epoch": 0.35, - "learning_rate": 0.0001823497029176712, - "loss": 1.0959, - "step": 6280 - }, - { - "epoch": 0.35, - "learning_rate": 0.00018231173712485052, - "loss": 1.1745, - "step": 6290 - }, - { - "epoch": 0.35, - "learning_rate": 0.00018227377133202984, - "loss": 1.0914, - "step": 6300 - }, - { - "epoch": 0.35, - "learning_rate": 0.00018223580553920917, - "loss": 1.1881, - "step": 6310 - }, - { - "epoch": 0.35, - "learning_rate": 0.00018219783974638852, - "loss": 1.1077, - "step": 6320 - }, - { - "epoch": 0.35, - "learning_rate": 0.00018215987395356784, - "loss": 1.0773, - "step": 6330 - }, - { - "epoch": 0.35, - "learning_rate": 0.00018212190816074717, - "loss": 1.1642, - "step": 6340 - }, - { - "epoch": 0.35, - "learning_rate": 0.0001820839423679265, - "loss": 1.0321, - "step": 6350 - }, - { - "epoch": 0.35, - "learning_rate": 0.00018204597657510584, - "loss": 1.168, - "step": 6360 - }, - { - "epoch": 0.35, - "learning_rate": 0.0001820080107822852, - "loss": 1.1029, - "step": 6370 - }, - { - "epoch": 0.35, - "learning_rate": 0.0001819700449894645, - "loss": 1.1007, - "step": 6380 - }, - { - "epoch": 0.35, - "learning_rate": 0.00018193207919664384, - "loss": 1.2254, - "step": 6390 - }, - { - "epoch": 0.35, - "learning_rate": 0.00018189411340382316, - "loss": 1.0357, - "step": 6400 - }, - { - "epoch": 0.35, - "learning_rate": 0.00018185614761100248, - "loss": 1.2457, - "step": 6410 - }, - { - "epoch": 0.35, - "learning_rate": 0.00018181818181818183, - "loss": 1.1002, - "step": 6420 - }, - { - "epoch": 0.36, - "learning_rate": 0.00018178021602536116, - "loss": 1.1173, - "step": 6430 - }, - { - "epoch": 0.36, - "learning_rate": 0.0001817422502325405, - "loss": 1.1747, - "step": 6440 - }, - { - "epoch": 0.36, - "learning_rate": 0.00018170428443971983, - "loss": 1.0586, - "step": 6450 - }, - { - "epoch": 0.36, - "learning_rate": 0.00018166631864689916, - "loss": 1.218, - "step": 6460 - }, - { - "epoch": 0.36, - "learning_rate": 0.00018162835285407848, - "loss": 1.1095, - "step": 6470 - }, - { - "epoch": 0.36, - "learning_rate": 0.00018159038706125783, - "loss": 1.1188, - "step": 6480 - }, - { - "epoch": 0.36, - "learning_rate": 0.00018155242126843715, - "loss": 1.1898, - "step": 6490 - }, - { - "epoch": 0.36, - "learning_rate": 0.00018151445547561648, - "loss": 1.0182, - "step": 6500 - }, - { - "epoch": 0.36, - "learning_rate": 0.0001814764896827958, - "loss": 1.2594, - "step": 6510 - }, - { - "epoch": 0.36, - "learning_rate": 0.00018143852388997512, - "loss": 1.0942, - "step": 6520 - }, - { - "epoch": 0.36, - "learning_rate": 0.00018140055809715447, - "loss": 1.1292, - "step": 6530 - }, - { - "epoch": 0.36, - "learning_rate": 0.00018136259230433382, - "loss": 1.1962, - "step": 6540 - }, - { - "epoch": 0.36, - "learning_rate": 0.00018132462651151315, - "loss": 1.0887, - "step": 6550 - }, - { - "epoch": 0.36, - "learning_rate": 0.00018128666071869247, - "loss": 1.148, - "step": 6560 - }, - { - "epoch": 0.36, - "learning_rate": 0.0001812486949258718, - "loss": 1.0687, - "step": 6570 - }, - { - "epoch": 0.36, - "learning_rate": 0.00018121072913305112, - "loss": 1.1146, - "step": 6580 - }, - { - "epoch": 0.36, - "learning_rate": 0.00018117276334023047, - "loss": 1.1275, - "step": 6590 - }, - { - "epoch": 0.36, - "learning_rate": 0.0001811347975474098, - "loss": 1.0666, - "step": 6600 - }, - { - "epoch": 0.37, - "learning_rate": 0.00018109683175458912, - "loss": 1.2092, - "step": 6610 - }, - { - "epoch": 0.37, - "learning_rate": 0.00018105886596176847, - "loss": 1.1358, - "step": 6620 - }, - { - "epoch": 0.37, - "learning_rate": 0.0001810209001689478, - "loss": 1.203, - "step": 6630 - }, - { - "epoch": 0.37, - "learning_rate": 0.0001809829343761271, - "loss": 1.1757, - "step": 6640 - }, - { - "epoch": 0.37, - "learning_rate": 0.00018094496858330646, - "loss": 1.0754, - "step": 6650 - }, - { - "epoch": 0.37, - "learning_rate": 0.0001809070027904858, - "loss": 1.2037, - "step": 6660 - }, - { - "epoch": 0.37, - "learning_rate": 0.0001808690369976651, - "loss": 1.1305, - "step": 6670 - }, - { - "epoch": 0.37, - "learning_rate": 0.00018083107120484443, - "loss": 1.0914, - "step": 6680 - }, - { - "epoch": 0.37, - "learning_rate": 0.00018079310541202376, - "loss": 1.1591, - "step": 6690 - }, - { - "epoch": 0.37, - "learning_rate": 0.0001807551396192031, - "loss": 1.0899, - "step": 6700 - }, - { - "epoch": 0.37, - "learning_rate": 0.00018071717382638246, - "loss": 1.1849, - "step": 6710 - }, - { - "epoch": 0.37, - "learning_rate": 0.00018067920803356178, - "loss": 1.0921, - "step": 6720 - }, - { - "epoch": 0.37, - "learning_rate": 0.0001806412422407411, - "loss": 1.1229, - "step": 6730 - }, - { - "epoch": 0.37, - "learning_rate": 0.00018060327644792043, - "loss": 1.1526, - "step": 6740 - }, - { - "epoch": 0.37, - "learning_rate": 0.00018056531065509975, - "loss": 1.0688, - "step": 6750 - }, - { - "epoch": 0.37, - "learning_rate": 0.0001805273448622791, - "loss": 1.208, - "step": 6760 - }, - { - "epoch": 0.37, - "learning_rate": 0.00018048937906945843, - "loss": 1.0821, - "step": 6770 - }, - { - "epoch": 0.37, - "learning_rate": 0.00018045141327663775, - "loss": 1.0888, - "step": 6780 - }, - { - "epoch": 0.38, - "learning_rate": 0.0001804134474838171, - "loss": 1.1763, - "step": 6790 - }, - { - "epoch": 0.38, - "learning_rate": 0.00018037548169099642, - "loss": 1.0669, - "step": 6800 - }, - { - "epoch": 0.38, - "learning_rate": 0.00018033751589817575, - "loss": 1.2801, - "step": 6810 - }, - { - "epoch": 0.38, - "learning_rate": 0.0001802995501053551, - "loss": 1.0969, - "step": 6820 - }, - { - "epoch": 0.38, - "learning_rate": 0.00018026158431253442, - "loss": 1.1022, - "step": 6830 - }, - { - "epoch": 0.38, - "learning_rate": 0.00018022361851971374, - "loss": 1.1699, - "step": 6840 - }, - { - "epoch": 0.38, - "learning_rate": 0.00018018565272689307, - "loss": 1.0569, - "step": 6850 - }, - { - "epoch": 0.38, - "learning_rate": 0.0001801476869340724, - "loss": 1.1959, - "step": 6860 - }, - { - "epoch": 0.38, - "learning_rate": 0.00018010972114125174, - "loss": 1.1223, - "step": 6870 - }, - { - "epoch": 0.38, - "learning_rate": 0.0001800717553484311, - "loss": 1.1347, - "step": 6880 - }, - { - "epoch": 0.38, - "learning_rate": 0.00018003378955561042, - "loss": 1.1832, - "step": 6890 - }, - { - "epoch": 0.38, - "learning_rate": 0.00017999582376278974, - "loss": 1.0275, - "step": 6900 - }, - { - "epoch": 0.38, - "learning_rate": 0.00017995785796996906, - "loss": 1.2386, - "step": 6910 - }, - { - "epoch": 0.38, - "learning_rate": 0.00017991989217714839, - "loss": 1.0886, - "step": 6920 - }, - { - "epoch": 0.38, - "learning_rate": 0.00017988192638432774, - "loss": 1.1499, - "step": 6930 - }, - { - "epoch": 0.38, - "learning_rate": 0.00017984396059150706, - "loss": 1.1467, - "step": 6940 - }, - { - "epoch": 0.38, - "learning_rate": 0.00017980599479868638, - "loss": 1.0457, - "step": 6950 - }, - { - "epoch": 0.38, - "learning_rate": 0.00017976802900586573, - "loss": 1.182, - "step": 6960 - }, - { - "epoch": 0.39, - "learning_rate": 0.00017973006321304506, - "loss": 1.1167, - "step": 6970 - }, - { - "epoch": 0.39, - "learning_rate": 0.00017969209742022438, - "loss": 1.0978, - "step": 6980 - }, - { - "epoch": 0.39, - "learning_rate": 0.00017965413162740373, - "loss": 1.1854, - "step": 6990 - }, - { - "epoch": 0.39, - "learning_rate": 0.00017961616583458305, - "loss": 1.0204, - "step": 7000 - }, - { - "epoch": 0.39, - "learning_rate": 0.00017957820004176238, - "loss": 1.1626, - "step": 7010 - }, - { - "epoch": 0.39, - "learning_rate": 0.0001795402342489417, - "loss": 1.0955, - "step": 7020 - }, - { - "epoch": 0.39, - "learning_rate": 0.00017950226845612102, - "loss": 1.0513, - "step": 7030 - }, - { - "epoch": 0.39, - "learning_rate": 0.00017946430266330038, - "loss": 1.1696, - "step": 7040 - }, - { - "epoch": 0.39, - "learning_rate": 0.00017942633687047973, - "loss": 1.0451, - "step": 7050 - }, - { - "epoch": 0.39, - "learning_rate": 0.00017938837107765905, - "loss": 1.2098, - "step": 7060 - }, - { - "epoch": 0.39, - "learning_rate": 0.00017935040528483837, - "loss": 1.1188, - "step": 7070 - }, - { - "epoch": 0.39, - "learning_rate": 0.0001793124394920177, - "loss": 1.1274, - "step": 7080 - }, - { - "epoch": 0.39, - "learning_rate": 0.00017927447369919702, - "loss": 1.1945, - "step": 7090 - }, - { - "epoch": 0.39, - "learning_rate": 0.00017923650790637637, - "loss": 1.0446, - "step": 7100 - }, - { - "epoch": 0.39, - "learning_rate": 0.0001791985421135557, - "loss": 1.203, - "step": 7110 - }, - { - "epoch": 0.39, - "learning_rate": 0.00017916057632073502, - "loss": 1.0866, - "step": 7120 - }, - { - "epoch": 0.39, - "learning_rate": 0.00017912261052791434, - "loss": 1.0894, - "step": 7130 - }, - { - "epoch": 0.39, - "learning_rate": 0.0001790846447350937, - "loss": 1.172, - "step": 7140 - }, - { - "epoch": 0.39, - "learning_rate": 0.00017904667894227301, - "loss": 1.034, - "step": 7150 - }, - { - "epoch": 0.4, - "learning_rate": 0.00017900871314945237, - "loss": 1.1868, - "step": 7160 - }, - { - "epoch": 0.4, - "learning_rate": 0.0001789707473566317, - "loss": 1.1176, - "step": 7170 - }, - { - "epoch": 0.4, - "learning_rate": 0.000178932781563811, - "loss": 1.0997, - "step": 7180 - }, - { - "epoch": 0.4, - "learning_rate": 0.00017889481577099034, - "loss": 1.1568, - "step": 7190 - }, - { - "epoch": 0.4, - "learning_rate": 0.00017885684997816966, - "loss": 1.0912, - "step": 7200 - }, - { - "epoch": 0.4, - "learning_rate": 0.000178818884185349, - "loss": 1.2338, - "step": 7210 - }, - { - "epoch": 0.4, - "learning_rate": 0.00017878091839252836, - "loss": 1.134, - "step": 7220 - }, - { - "epoch": 0.4, - "learning_rate": 0.00017874295259970768, - "loss": 1.1345, - "step": 7230 - }, - { - "epoch": 0.4, - "learning_rate": 0.000178704986806887, - "loss": 1.1553, - "step": 7240 - }, - { - "epoch": 0.4, - "learning_rate": 0.00017866702101406633, - "loss": 1.0481, - "step": 7250 - }, - { - "epoch": 0.4, - "learning_rate": 0.00017862905522124565, - "loss": 1.1803, - "step": 7260 - }, - { - "epoch": 0.4, - "learning_rate": 0.000178591089428425, - "loss": 1.1155, - "step": 7270 - }, - { - "epoch": 0.4, - "learning_rate": 0.00017855312363560433, - "loss": 1.172, - "step": 7280 - }, - { - "epoch": 0.4, - "learning_rate": 0.00017851515784278365, - "loss": 1.196, - "step": 7290 - }, - { - "epoch": 0.4, - "learning_rate": 0.00017847719204996297, - "loss": 1.0885, - "step": 7300 - }, - { - "epoch": 0.4, - "learning_rate": 0.00017843922625714233, - "loss": 1.2438, - "step": 7310 - }, - { - "epoch": 0.4, - "learning_rate": 0.00017840126046432168, - "loss": 1.1048, - "step": 7320 - }, - { - "epoch": 0.4, - "learning_rate": 0.000178363294671501, - "loss": 1.071, - "step": 7330 - }, - { - "epoch": 0.41, - "learning_rate": 0.00017832532887868032, - "loss": 1.1621, - "step": 7340 - }, - { - "epoch": 0.41, - "learning_rate": 0.00017828736308585965, - "loss": 1.0827, - "step": 7350 - }, - { - "epoch": 0.41, - "learning_rate": 0.00017824939729303897, - "loss": 1.2149, - "step": 7360 - }, - { - "epoch": 0.41, - "learning_rate": 0.00017821143150021832, - "loss": 1.1233, - "step": 7370 - }, - { - "epoch": 0.41, - "learning_rate": 0.00017817346570739764, - "loss": 1.1542, - "step": 7380 - }, - { - "epoch": 0.41, - "learning_rate": 0.00017813549991457697, - "loss": 1.1472, - "step": 7390 - }, - { - "epoch": 0.41, - "learning_rate": 0.00017809753412175632, - "loss": 1.0609, - "step": 7400 - }, - { - "epoch": 0.41, - "learning_rate": 0.00017805956832893564, - "loss": 1.1793, - "step": 7410 - }, - { - "epoch": 0.41, - "learning_rate": 0.00017802160253611496, - "loss": 1.0775, - "step": 7420 - }, - { - "epoch": 0.41, - "learning_rate": 0.00017798363674329431, - "loss": 1.1373, - "step": 7430 - }, - { - "epoch": 0.41, - "learning_rate": 0.00017794567095047364, - "loss": 1.1609, - "step": 7440 - }, - { - "epoch": 0.41, - "learning_rate": 0.00017790770515765296, - "loss": 0.9958, - "step": 7450 - }, - { - "epoch": 0.41, - "learning_rate": 0.00017786973936483229, - "loss": 1.2479, - "step": 7460 - }, - { - "epoch": 0.41, - "learning_rate": 0.0001778317735720116, - "loss": 1.0825, - "step": 7470 - }, - { - "epoch": 0.41, - "learning_rate": 0.00017779380777919096, - "loss": 1.1483, - "step": 7480 - }, - { - "epoch": 0.41, - "learning_rate": 0.0001777558419863703, - "loss": 1.1722, - "step": 7490 - }, - { - "epoch": 0.41, - "learning_rate": 0.00017771787619354963, - "loss": 1.0618, - "step": 7500 - }, - { - "epoch": 0.41, - "learning_rate": 0.00017767991040072896, - "loss": 1.1825, - "step": 7510 - }, - { - "epoch": 0.42, - "learning_rate": 0.00017764194460790828, - "loss": 1.1448, - "step": 7520 - }, - { - "epoch": 0.42, - "learning_rate": 0.0001776039788150876, - "loss": 1.102, - "step": 7530 - }, - { - "epoch": 0.42, - "learning_rate": 0.00017756601302226695, - "loss": 1.1665, - "step": 7540 - }, - { - "epoch": 0.42, - "learning_rate": 0.00017752804722944628, - "loss": 1.0456, - "step": 7550 - }, - { - "epoch": 0.42, - "learning_rate": 0.0001774900814366256, - "loss": 1.2029, - "step": 7560 - }, - { - "epoch": 0.42, - "learning_rate": 0.00017745211564380495, - "loss": 1.1128, - "step": 7570 - }, - { - "epoch": 0.42, - "learning_rate": 0.00017741414985098427, - "loss": 1.1381, - "step": 7580 - }, - { - "epoch": 0.42, - "learning_rate": 0.0001773761840581636, - "loss": 1.1638, - "step": 7590 - }, - { - "epoch": 0.42, - "learning_rate": 0.00017733821826534295, - "loss": 1.0881, - "step": 7600 - }, - { - "epoch": 0.42, - "learning_rate": 0.00017730025247252227, - "loss": 1.2373, - "step": 7610 - }, - { - "epoch": 0.42, - "learning_rate": 0.0001772622866797016, - "loss": 1.1243, - "step": 7620 - }, - { - "epoch": 0.42, - "learning_rate": 0.00017722432088688092, - "loss": 1.0909, - "step": 7630 - }, - { - "epoch": 0.42, - "learning_rate": 0.00017718635509406024, - "loss": 1.1583, - "step": 7640 - }, - { - "epoch": 0.42, - "learning_rate": 0.0001771483893012396, - "loss": 1.0165, - "step": 7650 - }, - { - "epoch": 0.42, - "learning_rate": 0.00017711042350841894, - "loss": 1.1867, - "step": 7660 - }, - { - "epoch": 0.42, - "learning_rate": 0.00017707245771559827, - "loss": 1.0553, - "step": 7670 - }, - { - "epoch": 0.42, - "learning_rate": 0.0001770344919227776, - "loss": 1.1146, - "step": 7680 - }, - { - "epoch": 0.42, - "learning_rate": 0.00017699652612995691, - "loss": 1.1711, - "step": 7690 - }, - { - "epoch": 0.43, - "learning_rate": 0.00017695856033713624, - "loss": 1.06, - "step": 7700 - }, - { - "epoch": 0.43, - "learning_rate": 0.0001769205945443156, - "loss": 1.2259, - "step": 7710 - }, - { - "epoch": 0.43, - "learning_rate": 0.0001768826287514949, - "loss": 1.1026, - "step": 7720 - }, - { - "epoch": 0.43, - "learning_rate": 0.00017684466295867423, - "loss": 1.1233, - "step": 7730 - }, - { - "epoch": 0.43, - "learning_rate": 0.00017680669716585359, - "loss": 1.1615, - "step": 7740 - }, - { - "epoch": 0.43, - "learning_rate": 0.0001767687313730329, - "loss": 1.0338, - "step": 7750 - }, - { - "epoch": 0.43, - "learning_rate": 0.00017673076558021223, - "loss": 1.2506, - "step": 7760 - }, - { - "epoch": 0.43, - "learning_rate": 0.00017669279978739158, - "loss": 1.1077, - "step": 7770 - }, - { - "epoch": 0.43, - "learning_rate": 0.0001766548339945709, - "loss": 1.0701, - "step": 7780 - }, - { - "epoch": 0.43, - "learning_rate": 0.00017661686820175023, - "loss": 1.1583, - "step": 7790 - }, - { - "epoch": 0.43, - "learning_rate": 0.00017657890240892955, - "loss": 1.0209, - "step": 7800 - }, - { - "epoch": 0.43, - "learning_rate": 0.00017654093661610888, - "loss": 1.2504, - "step": 7810 - }, - { - "epoch": 0.43, - "learning_rate": 0.00017650297082328823, - "loss": 1.0819, - "step": 7820 - }, - { - "epoch": 0.43, - "learning_rate": 0.00017646500503046758, - "loss": 1.1083, - "step": 7830 - }, - { - "epoch": 0.43, - "learning_rate": 0.0001764270392376469, - "loss": 1.1461, - "step": 7840 - }, - { - "epoch": 0.43, - "learning_rate": 0.00017638907344482622, - "loss": 1.0021, - "step": 7850 - }, - { - "epoch": 0.43, - "learning_rate": 0.00017635110765200555, - "loss": 1.1797, - "step": 7860 - }, - { - "epoch": 0.43, - "learning_rate": 0.00017631314185918487, - "loss": 1.1139, - "step": 7870 - }, - { - "epoch": 0.44, - "learning_rate": 0.00017627517606636422, - "loss": 1.112, - "step": 7880 - }, - { - "epoch": 0.44, - "learning_rate": 0.00017623721027354355, - "loss": 1.1878, - "step": 7890 - }, - { - "epoch": 0.44, - "learning_rate": 0.00017619924448072287, - "loss": 1.0614, - "step": 7900 - }, - { - "epoch": 0.44, - "learning_rate": 0.0001761612786879022, - "loss": 1.2, - "step": 7910 - }, - { - "epoch": 0.44, - "learning_rate": 0.00017612331289508154, - "loss": 1.0745, - "step": 7920 - }, - { - "epoch": 0.44, - "learning_rate": 0.00017608534710226087, - "loss": 1.091, - "step": 7930 - }, - { - "epoch": 0.44, - "learning_rate": 0.00017604738130944022, - "loss": 1.1541, - "step": 7940 - }, - { - "epoch": 0.44, - "learning_rate": 0.00017600941551661954, - "loss": 1.0546, - "step": 7950 - }, - { - "epoch": 0.44, - "learning_rate": 0.00017597144972379886, - "loss": 1.1869, - "step": 7960 - }, - { - "epoch": 0.44, - "learning_rate": 0.0001759334839309782, - "loss": 1.0857, - "step": 7970 - }, - { - "epoch": 0.44, - "learning_rate": 0.0001758955181381575, - "loss": 1.1167, - "step": 7980 - }, - { - "epoch": 0.44, - "learning_rate": 0.00017585755234533686, - "loss": 1.1943, - "step": 7990 - }, - { - "epoch": 0.44, - "learning_rate": 0.0001758195865525162, - "loss": 1.0637, - "step": 8000 - }, - { - "epoch": 0.44, - "eval_loss": 1.1032294034957886, - "eval_runtime": 2.0046, - "eval_samples_per_second": 49.886, - "eval_steps_per_second": 6.485, - "step": 8000 - }, - { - "epoch": 0.44, - "mmlu_eval_accuracy": NaN, - "mmlu_eval_accuracy_abstract_algebra": 0.5454545454545454, - "mmlu_eval_accuracy_anatomy": 0.35714285714285715, - "mmlu_eval_accuracy_astronomy": 0.0625, - "mmlu_eval_accuracy_business_ethics": 0.5454545454545454, - "mmlu_eval_accuracy_clinical_knowledge": 0.41379310344827586, - "mmlu_eval_accuracy_college_biology": 0.8125, - "mmlu_eval_accuracy_college_chemistry": 1.0, - "mmlu_eval_accuracy_college_computer_science": 0.6363636363636364, - "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, - "mmlu_eval_accuracy_college_medicine": 0.45454545454545453, - "mmlu_eval_accuracy_college_physics": 0.2727272727272727, - "mmlu_eval_accuracy_computer_security": 0.6363636363636364, - "mmlu_eval_accuracy_conceptual_physics": 0.5238095238095238, - "mmlu_eval_accuracy_econometrics": NaN, - "mmlu_eval_accuracy_electrical_engineering": NaN, - "mmlu_eval_accuracy_elementary_mathematics": NaN, - "mmlu_eval_accuracy_formal_logic": NaN, - "mmlu_eval_accuracy_global_facts": NaN, - "mmlu_eval_accuracy_high_school_biology": NaN, - "mmlu_eval_accuracy_high_school_chemistry": NaN, - "mmlu_eval_accuracy_high_school_computer_science": NaN, - "mmlu_eval_accuracy_high_school_european_history": NaN, - "mmlu_eval_accuracy_high_school_geography": NaN, - "mmlu_eval_accuracy_high_school_government_and_politics": NaN, - "mmlu_eval_accuracy_high_school_macroeconomics": NaN, - "mmlu_eval_accuracy_high_school_mathematics": NaN, - "mmlu_eval_accuracy_high_school_microeconomics": NaN, - "mmlu_eval_accuracy_high_school_physics": NaN, - "mmlu_eval_accuracy_high_school_psychology": NaN, - "mmlu_eval_accuracy_high_school_statistics": NaN, - "mmlu_eval_accuracy_high_school_us_history": NaN, - "mmlu_eval_accuracy_high_school_world_history": NaN, - "mmlu_eval_accuracy_human_aging": NaN, - "mmlu_eval_accuracy_human_sexuality": NaN, - "mmlu_eval_accuracy_international_law": NaN, - "mmlu_eval_accuracy_jurisprudence": NaN, - "mmlu_eval_accuracy_logical_fallacies": NaN, - "mmlu_eval_accuracy_machine_learning": NaN, - "mmlu_eval_accuracy_management": NaN, - "mmlu_eval_accuracy_marketing": NaN, - "mmlu_eval_accuracy_medical_genetics": NaN, - "mmlu_eval_accuracy_miscellaneous": NaN, - "mmlu_eval_accuracy_moral_disputes": NaN, - "mmlu_eval_accuracy_moral_scenarios": NaN, - "mmlu_eval_accuracy_nutrition": NaN, - "mmlu_eval_accuracy_philosophy": NaN, - "mmlu_eval_accuracy_prehistory": NaN, - "mmlu_eval_accuracy_professional_accounting": NaN, - "mmlu_eval_accuracy_professional_law": NaN, - "mmlu_eval_accuracy_professional_medicine": NaN, - "mmlu_eval_accuracy_professional_psychology": NaN, - "mmlu_eval_accuracy_public_relations": NaN, - "mmlu_eval_accuracy_security_studies": NaN, - "mmlu_eval_accuracy_sociology": NaN, - "mmlu_eval_accuracy_us_foreign_policy": NaN, - "mmlu_eval_accuracy_virology": NaN, - "mmlu_eval_accuracy_world_religions": NaN, - "mmlu_loss": 1.1005631206401933, - "step": 8000 - }, - { - "epoch": 0.44, - "learning_rate": 0.00017578162075969553, - "loss": 1.1715, - "step": 8010 - }, - { - "epoch": 0.44, - "learning_rate": 0.00017574365496687486, - "loss": 1.0833, - "step": 8020 - }, - { - "epoch": 0.44, - "learning_rate": 0.00017570568917405418, - "loss": 1.1304, - "step": 8030 - }, - { - "epoch": 0.44, - "learning_rate": 0.0001756677233812335, - "loss": 1.1738, - "step": 8040 - }, - { - "epoch": 0.44, - "learning_rate": 0.00017562975758841286, - "loss": 1.0631, - "step": 8050 - }, - { - "epoch": 0.45, - "learning_rate": 0.00017559179179559218, - "loss": 1.2041, - "step": 8060 - }, - { - "epoch": 0.45, - "learning_rate": 0.0001755538260027715, - "loss": 1.1268, - "step": 8070 - }, - { - "epoch": 0.45, - "learning_rate": 0.00017551586020995083, - "loss": 1.1077, - "step": 8080 - }, - { - "epoch": 0.45, - "learning_rate": 0.00017547789441713018, - "loss": 1.154, - "step": 8090 - }, - { - "epoch": 0.45, - "learning_rate": 0.0001754399286243095, - "loss": 1.0583, - "step": 8100 - }, - { - "epoch": 0.45, - "learning_rate": 0.00017540196283148885, - "loss": 1.1546, - "step": 8110 - }, - { - "epoch": 0.45, - "learning_rate": 0.00017536399703866817, - "loss": 1.0995, - "step": 8120 - }, - { - "epoch": 0.45, - "learning_rate": 0.0001753260312458475, - "loss": 1.1243, - "step": 8130 - }, - { - "epoch": 0.45, - "learning_rate": 0.00017528806545302682, - "loss": 1.1597, - "step": 8140 - }, - { - "epoch": 0.45, - "learning_rate": 0.00017525009966020614, - "loss": 1.0308, - "step": 8150 - }, - { - "epoch": 0.45, - "learning_rate": 0.0001752121338673855, - "loss": 1.1751, - "step": 8160 - }, - { - "epoch": 0.45, - "learning_rate": 0.00017517416807456482, - "loss": 1.1108, - "step": 8170 - }, - { - "epoch": 0.45, - "learning_rate": 0.00017513620228174417, - "loss": 1.1264, - "step": 8180 - }, - { - "epoch": 0.45, - "learning_rate": 0.0001750982364889235, - "loss": 1.1662, - "step": 8190 - }, - { - "epoch": 0.45, - "learning_rate": 0.00017506027069610282, - "loss": 1.0727, - "step": 8200 - }, - { - "epoch": 0.45, - "learning_rate": 0.00017502230490328214, - "loss": 1.1721, - "step": 8210 - }, - { - "epoch": 0.45, - "learning_rate": 0.0001749843391104615, - "loss": 1.1738, - "step": 8220 - }, - { - "epoch": 0.45, - "learning_rate": 0.0001749463733176408, - "loss": 1.1167, - "step": 8230 - }, - { - "epoch": 0.46, - "learning_rate": 0.00017490840752482014, - "loss": 1.1913, - "step": 8240 - }, - { - "epoch": 0.46, - "learning_rate": 0.00017487044173199946, - "loss": 1.0407, - "step": 8250 - }, - { - "epoch": 0.46, - "learning_rate": 0.0001748324759391788, - "loss": 1.1789, - "step": 8260 - }, - { - "epoch": 0.46, - "learning_rate": 0.00017479451014635816, - "loss": 1.079, - "step": 8270 - }, - { - "epoch": 0.46, - "learning_rate": 0.00017475654435353748, - "loss": 1.0893, - "step": 8280 - }, - { - "epoch": 0.46, - "learning_rate": 0.0001747185785607168, - "loss": 1.2122, - "step": 8290 - }, - { - "epoch": 0.46, - "learning_rate": 0.00017468061276789613, - "loss": 1.048, - "step": 8300 - }, - { - "epoch": 0.46, - "learning_rate": 0.00017464264697507545, - "loss": 1.2269, - "step": 8310 - }, - { - "epoch": 0.46, - "learning_rate": 0.0001746046811822548, - "loss": 1.1305, - "step": 8320 - }, - { - "epoch": 0.46, - "learning_rate": 0.00017456671538943413, - "loss": 1.1337, - "step": 8330 - }, - { - "epoch": 0.46, - "learning_rate": 0.00017452874959661345, - "loss": 1.1664, - "step": 8340 - }, - { - "epoch": 0.46, - "learning_rate": 0.0001744907838037928, - "loss": 1.0228, - "step": 8350 - }, - { - "epoch": 0.46, - "learning_rate": 0.00017445281801097213, - "loss": 1.1606, - "step": 8360 - }, - { - "epoch": 0.46, - "learning_rate": 0.00017441485221815145, - "loss": 1.09, - "step": 8370 - }, - { - "epoch": 0.46, - "learning_rate": 0.0001743768864253308, - "loss": 1.1113, - "step": 8380 - }, - { - "epoch": 0.46, - "learning_rate": 0.00017433892063251012, - "loss": 1.2006, - "step": 8390 - }, - { - "epoch": 0.46, - "learning_rate": 0.00017430095483968945, - "loss": 1.0509, - "step": 8400 - }, - { - "epoch": 0.46, - "learning_rate": 0.00017426298904686877, - "loss": 1.2254, - "step": 8410 - }, - { - "epoch": 0.47, - "learning_rate": 0.0001742250232540481, - "loss": 1.1138, - "step": 8420 - }, - { - "epoch": 0.47, - "learning_rate": 0.00017418705746122744, - "loss": 1.1118, - "step": 8430 - }, - { - "epoch": 0.47, - "learning_rate": 0.0001741490916684068, - "loss": 1.1802, - "step": 8440 - }, - { - "epoch": 0.47, - "learning_rate": 0.00017411112587558612, - "loss": 1.0444, - "step": 8450 - }, - { - "epoch": 0.47, - "learning_rate": 0.00017407316008276544, - "loss": 1.2194, - "step": 8460 - }, - { - "epoch": 0.47, - "learning_rate": 0.00017403519428994477, - "loss": 1.1218, - "step": 8470 - }, - { - "epoch": 0.47, - "learning_rate": 0.0001739972284971241, - "loss": 1.1704, - "step": 8480 - }, - { - "epoch": 0.47, - "learning_rate": 0.00017395926270430344, - "loss": 1.1731, - "step": 8490 - }, - { - "epoch": 0.47, - "learning_rate": 0.00017392129691148276, - "loss": 1.0216, - "step": 8500 - }, - { - "epoch": 0.47, - "learning_rate": 0.00017388333111866209, - "loss": 1.2453, - "step": 8510 - }, - { - "epoch": 0.47, - "learning_rate": 0.00017384536532584144, - "loss": 1.1244, - "step": 8520 - }, - { - "epoch": 0.47, - "learning_rate": 0.00017380739953302076, - "loss": 1.1442, - "step": 8530 - }, - { - "epoch": 0.47, - "learning_rate": 0.00017376943374020008, - "loss": 1.1991, - "step": 8540 - }, - { - "epoch": 0.47, - "learning_rate": 0.00017373146794737943, - "loss": 1.0731, - "step": 8550 - }, - { - "epoch": 0.47, - "learning_rate": 0.00017369350215455876, - "loss": 1.2022, - "step": 8560 - }, - { - "epoch": 0.47, - "learning_rate": 0.00017365553636173808, - "loss": 1.0881, - "step": 8570 - }, - { - "epoch": 0.47, - "learning_rate": 0.0001736175705689174, - "loss": 1.1009, - "step": 8580 - }, - { - "epoch": 0.47, - "learning_rate": 0.00017357960477609673, - "loss": 1.1683, - "step": 8590 - }, - { - "epoch": 0.48, - "learning_rate": 0.00017354163898327608, - "loss": 1.0271, - "step": 8600 - }, - { - "epoch": 0.48, - "learning_rate": 0.00017350367319045543, - "loss": 1.1771, - "step": 8610 - }, - { - "epoch": 0.48, - "learning_rate": 0.00017346570739763475, - "loss": 1.1151, - "step": 8620 - }, - { - "epoch": 0.48, - "learning_rate": 0.00017342774160481408, - "loss": 1.1111, - "step": 8630 - }, - { - "epoch": 0.48, - "learning_rate": 0.0001733897758119934, - "loss": 1.2079, - "step": 8640 - }, - { - "epoch": 0.48, - "learning_rate": 0.00017335181001917272, - "loss": 1.0144, - "step": 8650 - }, - { - "epoch": 0.48, - "learning_rate": 0.00017331384422635207, - "loss": 1.1869, - "step": 8660 - }, - { - "epoch": 0.48, - "learning_rate": 0.0001732758784335314, - "loss": 1.0914, - "step": 8670 - }, - { - "epoch": 0.48, - "learning_rate": 0.00017323791264071072, - "loss": 1.1002, - "step": 8680 - }, - { - "epoch": 0.48, - "learning_rate": 0.00017319994684789004, - "loss": 1.1338, - "step": 8690 - }, - { - "epoch": 0.48, - "learning_rate": 0.0001731619810550694, - "loss": 1.0108, - "step": 8700 - }, - { - "epoch": 0.48, - "learning_rate": 0.00017312401526224872, - "loss": 1.1879, - "step": 8710 - }, - { - "epoch": 0.48, - "learning_rate": 0.00017308604946942807, - "loss": 1.1358, - "step": 8720 - }, - { - "epoch": 0.48, - "learning_rate": 0.0001730480836766074, - "loss": 1.0972, - "step": 8730 - }, - { - "epoch": 0.48, - "learning_rate": 0.00017301011788378672, - "loss": 1.1843, - "step": 8740 - }, - { - "epoch": 0.48, - "learning_rate": 0.00017297215209096604, - "loss": 1.0276, - "step": 8750 - }, - { - "epoch": 0.48, - "learning_rate": 0.00017293418629814536, - "loss": 1.2217, - "step": 8760 - }, - { - "epoch": 0.48, - "learning_rate": 0.0001728962205053247, - "loss": 1.0545, - "step": 8770 - }, - { - "epoch": 0.49, - "learning_rate": 0.00017285825471250406, - "loss": 1.0885, - "step": 8780 - }, - { - "epoch": 0.49, - "learning_rate": 0.0001728202889196834, - "loss": 1.152, - "step": 8790 - }, - { - "epoch": 0.49, - "learning_rate": 0.0001727823231268627, - "loss": 1.0541, - "step": 8800 - }, - { - "epoch": 0.49, - "learning_rate": 0.00017274435733404203, - "loss": 1.2015, - "step": 8810 - }, - { - "epoch": 0.49, - "learning_rate": 0.00017270639154122136, - "loss": 1.1192, - "step": 8820 - }, - { - "epoch": 0.49, - "learning_rate": 0.0001726684257484007, - "loss": 1.0976, - "step": 8830 - }, - { - "epoch": 0.49, - "learning_rate": 0.00017263045995558003, - "loss": 1.1027, - "step": 8840 - }, - { - "epoch": 0.49, - "learning_rate": 0.00017259249416275935, - "loss": 1.0673, - "step": 8850 - }, - { - "epoch": 0.49, - "learning_rate": 0.00017255452836993868, - "loss": 1.2016, - "step": 8860 - }, - { - "epoch": 0.49, - "learning_rate": 0.00017251656257711803, - "loss": 1.0708, - "step": 8870 - }, - { - "epoch": 0.49, - "learning_rate": 0.00017247859678429735, - "loss": 1.1211, - "step": 8880 - }, - { - "epoch": 0.49, - "learning_rate": 0.0001724406309914767, - "loss": 1.1354, - "step": 8890 - }, - { - "epoch": 0.49, - "learning_rate": 0.00017240266519865603, - "loss": 1.0757, - "step": 8900 - }, - { - "epoch": 0.49, - "learning_rate": 0.00017236469940583535, - "loss": 1.2318, - "step": 8910 - }, - { - "epoch": 0.49, - "learning_rate": 0.00017232673361301467, - "loss": 1.1231, - "step": 8920 - }, - { - "epoch": 0.49, - "learning_rate": 0.000172288767820194, - "loss": 1.1097, - "step": 8930 - }, - { - "epoch": 0.49, - "learning_rate": 0.00017225080202737335, - "loss": 1.1618, - "step": 8940 - }, - { - "epoch": 0.49, - "learning_rate": 0.00017221283623455267, - "loss": 1.0874, - "step": 8950 - }, - { - "epoch": 0.49, - "learning_rate": 0.00017217487044173202, - "loss": 1.2472, - "step": 8960 - }, - { - "epoch": 0.5, - "learning_rate": 0.00017213690464891134, - "loss": 1.131, - "step": 8970 - }, - { - "epoch": 0.5, - "learning_rate": 0.00017209893885609067, - "loss": 1.0906, - "step": 8980 - }, - { - "epoch": 0.5, - "learning_rate": 0.00017206097306327, - "loss": 1.132, - "step": 8990 - }, - { - "epoch": 0.5, - "learning_rate": 0.00017202300727044934, - "loss": 1.0318, - "step": 9000 - }, - { - "epoch": 0.5, - "learning_rate": 0.00017198504147762866, - "loss": 1.2357, - "step": 9010 - }, - { - "epoch": 0.5, - "learning_rate": 0.000171947075684808, - "loss": 1.067, - "step": 9020 - }, - { - "epoch": 0.5, - "learning_rate": 0.0001719091098919873, - "loss": 1.0932, - "step": 9030 - }, - { - "epoch": 0.5, - "learning_rate": 0.00017187114409916666, - "loss": 1.1658, - "step": 9040 - }, - { - "epoch": 0.5, - "learning_rate": 0.00017183317830634599, - "loss": 1.0291, - "step": 9050 - }, - { - "epoch": 0.5, - "learning_rate": 0.00017179521251352534, - "loss": 1.183, - "step": 9060 - }, - { - "epoch": 0.5, - "learning_rate": 0.00017175724672070466, - "loss": 1.1261, - "step": 9070 - }, - { - "epoch": 0.5, - "learning_rate": 0.00017171928092788398, - "loss": 1.1133, - "step": 9080 - }, - { - "epoch": 0.5, - "learning_rate": 0.0001716813151350633, - "loss": 1.2068, - "step": 9090 - }, - { - "epoch": 0.5, - "learning_rate": 0.00017164334934224263, - "loss": 1.0777, - "step": 9100 - }, - { - "epoch": 0.5, - "learning_rate": 0.00017160538354942198, - "loss": 1.2064, - "step": 9110 - }, - { - "epoch": 0.5, - "learning_rate": 0.0001715674177566013, - "loss": 1.1148, - "step": 9120 - }, - { - "epoch": 0.5, - "learning_rate": 0.00017152945196378065, - "loss": 1.1075, - "step": 9130 - }, - { - "epoch": 0.5, - "learning_rate": 0.00017149148617095998, - "loss": 1.2026, - "step": 9140 - }, - { - "epoch": 0.51, - "learning_rate": 0.0001714535203781393, - "loss": 1.0355, - "step": 9150 - }, - { - "epoch": 0.51, - "learning_rate": 0.00017141555458531862, - "loss": 1.1654, - "step": 9160 - }, - { - "epoch": 0.51, - "learning_rate": 0.00017137758879249798, - "loss": 1.1032, - "step": 9170 - }, - { - "epoch": 0.51, - "learning_rate": 0.0001713396229996773, - "loss": 1.1445, - "step": 9180 - }, - { - "epoch": 0.51, - "learning_rate": 0.00017130165720685662, - "loss": 1.205, - "step": 9190 - }, - { - "epoch": 0.51, - "learning_rate": 0.00017126369141403595, - "loss": 1.0083, - "step": 9200 - }, - { - "epoch": 0.51, - "learning_rate": 0.00017122572562121527, - "loss": 1.2079, - "step": 9210 - }, - { - "epoch": 0.51, - "learning_rate": 0.00017118775982839465, - "loss": 1.0913, - "step": 9220 - }, - { - "epoch": 0.51, - "learning_rate": 0.00017114979403557397, - "loss": 1.0892, - "step": 9230 - }, - { - "epoch": 0.51, - "learning_rate": 0.0001711118282427533, - "loss": 1.1518, - "step": 9240 - }, - { - "epoch": 0.51, - "learning_rate": 0.00017107386244993262, - "loss": 1.0212, - "step": 9250 - }, - { - "epoch": 0.51, - "learning_rate": 0.00017103589665711194, - "loss": 1.1316, - "step": 9260 - }, - { - "epoch": 0.51, - "learning_rate": 0.0001709979308642913, - "loss": 1.1017, - "step": 9270 - }, - { - "epoch": 0.51, - "learning_rate": 0.00017095996507147061, - "loss": 1.1198, - "step": 9280 - }, - { - "epoch": 0.51, - "learning_rate": 0.00017092199927864994, - "loss": 1.1801, - "step": 9290 - }, - { - "epoch": 0.51, - "learning_rate": 0.0001708840334858293, - "loss": 1.0817, - "step": 9300 - }, - { - "epoch": 0.51, - "learning_rate": 0.0001708460676930086, - "loss": 1.2189, - "step": 9310 - }, - { - "epoch": 0.51, - "learning_rate": 0.00017080810190018794, - "loss": 1.0838, - "step": 9320 - }, - { - "epoch": 0.52, - "learning_rate": 0.00017077013610736729, - "loss": 1.1358, - "step": 9330 - }, - { - "epoch": 0.52, - "learning_rate": 0.0001707321703145466, - "loss": 1.1816, - "step": 9340 - }, - { - "epoch": 0.52, - "learning_rate": 0.00017069420452172593, - "loss": 1.0447, - "step": 9350 - }, - { - "epoch": 0.52, - "learning_rate": 0.00017065623872890526, - "loss": 1.1844, - "step": 9360 - }, - { - "epoch": 0.52, - "learning_rate": 0.00017061827293608458, - "loss": 1.1397, - "step": 9370 - }, - { - "epoch": 0.52, - "learning_rate": 0.00017058030714326393, - "loss": 1.1515, - "step": 9380 - }, - { - "epoch": 0.52, - "learning_rate": 0.00017054234135044328, - "loss": 1.1859, - "step": 9390 - }, - { - "epoch": 0.52, - "learning_rate": 0.0001705043755576226, - "loss": 1.0596, - "step": 9400 - }, - { - "epoch": 0.52, - "learning_rate": 0.00017046640976480193, - "loss": 1.1949, - "step": 9410 - }, - { - "epoch": 0.52, - "learning_rate": 0.00017042844397198125, - "loss": 1.0483, - "step": 9420 - }, - { - "epoch": 0.52, - "learning_rate": 0.00017039047817916057, - "loss": 1.1605, - "step": 9430 - }, - { - "epoch": 0.52, - "learning_rate": 0.00017035251238633993, - "loss": 1.2048, - "step": 9440 - }, - { - "epoch": 0.52, - "learning_rate": 0.00017031454659351925, - "loss": 1.0345, - "step": 9450 - }, - { - "epoch": 0.52, - "learning_rate": 0.00017027658080069857, - "loss": 1.2459, - "step": 9460 - }, - { - "epoch": 0.52, - "learning_rate": 0.0001702386150078779, - "loss": 1.0672, - "step": 9470 - }, - { - "epoch": 0.52, - "learning_rate": 0.00017020064921505725, - "loss": 1.097, - "step": 9480 - }, - { - "epoch": 0.52, - "learning_rate": 0.00017016268342223657, - "loss": 1.2101, - "step": 9490 - }, - { - "epoch": 0.52, - "learning_rate": 0.00017012471762941592, - "loss": 1.0143, - "step": 9500 - }, - { - "epoch": 0.53, - "learning_rate": 0.00017008675183659524, - "loss": 1.2034, - "step": 9510 - }, - { - "epoch": 0.53, - "learning_rate": 0.00017004878604377457, - "loss": 1.0942, - "step": 9520 - }, - { - "epoch": 0.53, - "learning_rate": 0.0001700108202509539, - "loss": 1.09, - "step": 9530 - }, - { - "epoch": 0.53, - "learning_rate": 0.00016997285445813321, - "loss": 1.1249, - "step": 9540 - }, - { - "epoch": 0.53, - "learning_rate": 0.00016993488866531256, - "loss": 1.0765, - "step": 9550 - }, - { - "epoch": 0.53, - "learning_rate": 0.00016989692287249191, - "loss": 1.206, - "step": 9560 - }, - { - "epoch": 0.53, - "learning_rate": 0.00016985895707967124, - "loss": 1.1043, - "step": 9570 - }, - { - "epoch": 0.53, - "learning_rate": 0.00016982099128685056, - "loss": 1.1091, - "step": 9580 - }, - { - "epoch": 0.53, - "learning_rate": 0.00016978302549402989, - "loss": 1.1638, - "step": 9590 - }, - { - "epoch": 0.53, - "learning_rate": 0.0001697450597012092, - "loss": 1.0236, - "step": 9600 - }, - { - "epoch": 0.53, - "learning_rate": 0.00016970709390838856, - "loss": 1.1924, - "step": 9610 - }, - { - "epoch": 0.53, - "learning_rate": 0.00016966912811556788, - "loss": 1.0758, - "step": 9620 - }, - { - "epoch": 0.53, - "learning_rate": 0.0001696311623227472, - "loss": 1.1459, - "step": 9630 - }, - { - "epoch": 0.53, - "learning_rate": 0.00016959319652992653, - "loss": 1.1309, - "step": 9640 - }, - { - "epoch": 0.53, - "learning_rate": 0.00016955523073710588, - "loss": 1.1036, - "step": 9650 - }, - { - "epoch": 0.53, - "learning_rate": 0.0001695172649442852, - "loss": 1.1954, - "step": 9660 - }, - { - "epoch": 0.53, - "learning_rate": 0.00016947929915146455, - "loss": 1.0518, - "step": 9670 - }, - { - "epoch": 0.53, - "learning_rate": 0.00016944133335864388, - "loss": 1.1108, - "step": 9680 - }, - { - "epoch": 0.54, - "learning_rate": 0.0001694033675658232, - "loss": 1.1727, - "step": 9690 - }, - { - "epoch": 0.54, - "learning_rate": 0.00016936540177300252, - "loss": 1.043, - "step": 9700 - }, - { - "epoch": 0.54, - "learning_rate": 0.00016932743598018185, - "loss": 1.1429, - "step": 9710 - }, - { - "epoch": 0.54, - "learning_rate": 0.0001692894701873612, - "loss": 1.0893, - "step": 9720 - }, - { - "epoch": 0.54, - "learning_rate": 0.00016925150439454052, - "loss": 1.0873, - "step": 9730 - }, - { - "epoch": 0.54, - "learning_rate": 0.00016921353860171987, - "loss": 1.1595, - "step": 9740 - }, - { - "epoch": 0.54, - "learning_rate": 0.0001691755728088992, - "loss": 1.0543, - "step": 9750 - }, - { - "epoch": 0.54, - "learning_rate": 0.00016913760701607852, - "loss": 1.2044, - "step": 9760 - }, - { - "epoch": 0.54, - "learning_rate": 0.00016909964122325784, - "loss": 1.1167, - "step": 9770 - }, - { - "epoch": 0.54, - "learning_rate": 0.0001690616754304372, - "loss": 1.1231, - "step": 9780 - }, - { - "epoch": 0.54, - "learning_rate": 0.00016902370963761652, - "loss": 1.1408, - "step": 9790 - }, - { - "epoch": 0.54, - "learning_rate": 0.00016898574384479584, - "loss": 1.0285, - "step": 9800 - }, - { - "epoch": 0.54, - "learning_rate": 0.00016894777805197516, - "loss": 1.2118, - "step": 9810 - }, - { - "epoch": 0.54, - "learning_rate": 0.00016890981225915451, - "loss": 1.0846, - "step": 9820 - }, - { - "epoch": 0.54, - "learning_rate": 0.00016887184646633384, - "loss": 1.1582, - "step": 9830 - }, - { - "epoch": 0.54, - "learning_rate": 0.0001688338806735132, - "loss": 1.1556, - "step": 9840 - }, - { - "epoch": 0.54, - "learning_rate": 0.0001687959148806925, - "loss": 1.0019, - "step": 9850 - }, - { - "epoch": 0.54, - "learning_rate": 0.00016875794908787183, - "loss": 1.1901, - "step": 9860 - }, - { - "epoch": 0.55, - "learning_rate": 0.00016871998329505116, - "loss": 1.1013, - "step": 9870 - }, - { - "epoch": 0.55, - "learning_rate": 0.00016868201750223048, - "loss": 1.1148, - "step": 9880 - }, - { - "epoch": 0.55, - "learning_rate": 0.00016864405170940983, - "loss": 1.1592, - "step": 9890 - }, - { - "epoch": 0.55, - "learning_rate": 0.00016860608591658916, - "loss": 1.0152, - "step": 9900 - }, - { - "epoch": 0.55, - "learning_rate": 0.0001685681201237685, - "loss": 1.1935, - "step": 9910 - }, - { - "epoch": 0.55, - "learning_rate": 0.00016853015433094783, - "loss": 1.1219, - "step": 9920 - }, - { - "epoch": 0.55, - "learning_rate": 0.00016849218853812715, - "loss": 1.0601, - "step": 9930 - }, - { - "epoch": 0.55, - "learning_rate": 0.00016845422274530648, - "loss": 1.1514, - "step": 9940 - }, - { - "epoch": 0.55, - "learning_rate": 0.00016841625695248583, - "loss": 0.9967, - "step": 9950 - }, - { - "epoch": 0.55, - "learning_rate": 0.00016837829115966515, - "loss": 1.1804, - "step": 9960 - }, - { - "epoch": 0.55, - "learning_rate": 0.00016834032536684447, - "loss": 1.106, - "step": 9970 - }, - { - "epoch": 0.55, - "learning_rate": 0.0001683023595740238, - "loss": 1.1105, - "step": 9980 - }, - { - "epoch": 0.55, - "learning_rate": 0.00016826439378120312, - "loss": 1.1945, - "step": 9990 - }, - { - "epoch": 0.55, - "learning_rate": 0.00016822642798838247, - "loss": 1.0313, - "step": 10000 - }, - { - "epoch": 0.55, - "eval_loss": 1.093135952949524, - "eval_runtime": 1.9627, - "eval_samples_per_second": 50.95, - "eval_steps_per_second": 6.624, - "step": 10000 - }, - { - "epoch": 0.55, - "mmlu_eval_accuracy": NaN, - "mmlu_eval_accuracy_abstract_algebra": 0.45454545454545453, - "mmlu_eval_accuracy_anatomy": 0.35714285714285715, - "mmlu_eval_accuracy_astronomy": 0.125, - "mmlu_eval_accuracy_business_ethics": 0.6363636363636364, - "mmlu_eval_accuracy_clinical_knowledge": 0.41379310344827586, - "mmlu_eval_accuracy_college_biology": 0.75, - "mmlu_eval_accuracy_college_chemistry": 0.875, - "mmlu_eval_accuracy_college_computer_science": 0.6363636363636364, - "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, - "mmlu_eval_accuracy_college_medicine": 0.5454545454545454, - "mmlu_eval_accuracy_college_physics": 0.18181818181818182, - "mmlu_eval_accuracy_computer_security": 0.45454545454545453, - "mmlu_eval_accuracy_conceptual_physics": 0.47619047619047616, - "mmlu_eval_accuracy_econometrics": NaN, - "mmlu_eval_accuracy_electrical_engineering": NaN, - "mmlu_eval_accuracy_elementary_mathematics": NaN, - "mmlu_eval_accuracy_formal_logic": NaN, - "mmlu_eval_accuracy_global_facts": NaN, - "mmlu_eval_accuracy_high_school_biology": NaN, - "mmlu_eval_accuracy_high_school_chemistry": NaN, - "mmlu_eval_accuracy_high_school_computer_science": NaN, - "mmlu_eval_accuracy_high_school_european_history": NaN, - "mmlu_eval_accuracy_high_school_geography": NaN, - "mmlu_eval_accuracy_high_school_government_and_politics": NaN, - "mmlu_eval_accuracy_high_school_macroeconomics": NaN, - "mmlu_eval_accuracy_high_school_mathematics": NaN, - "mmlu_eval_accuracy_high_school_microeconomics": NaN, - "mmlu_eval_accuracy_high_school_physics": NaN, - "mmlu_eval_accuracy_high_school_psychology": NaN, - "mmlu_eval_accuracy_high_school_statistics": NaN, - "mmlu_eval_accuracy_high_school_us_history": NaN, - "mmlu_eval_accuracy_high_school_world_history": NaN, - "mmlu_eval_accuracy_human_aging": NaN, - "mmlu_eval_accuracy_human_sexuality": NaN, - "mmlu_eval_accuracy_international_law": NaN, - "mmlu_eval_accuracy_jurisprudence": NaN, - "mmlu_eval_accuracy_logical_fallacies": NaN, - "mmlu_eval_accuracy_machine_learning": NaN, - "mmlu_eval_accuracy_management": NaN, - "mmlu_eval_accuracy_marketing": NaN, - "mmlu_eval_accuracy_medical_genetics": NaN, - "mmlu_eval_accuracy_miscellaneous": NaN, - "mmlu_eval_accuracy_moral_disputes": NaN, - "mmlu_eval_accuracy_moral_scenarios": NaN, - "mmlu_eval_accuracy_nutrition": NaN, - "mmlu_eval_accuracy_philosophy": NaN, - "mmlu_eval_accuracy_prehistory": NaN, - "mmlu_eval_accuracy_professional_accounting": NaN, - "mmlu_eval_accuracy_professional_law": NaN, - "mmlu_eval_accuracy_professional_medicine": NaN, - "mmlu_eval_accuracy_professional_psychology": NaN, - "mmlu_eval_accuracy_public_relations": NaN, - "mmlu_eval_accuracy_security_studies": NaN, - "mmlu_eval_accuracy_sociology": NaN, - "mmlu_eval_accuracy_us_foreign_policy": NaN, - "mmlu_eval_accuracy_virology": NaN, - "mmlu_eval_accuracy_world_religions": NaN, - "mmlu_loss": 0.9934630032318333, - "step": 10000 - }, - { - "epoch": 0.55, - "learning_rate": 0.00016818846219556182, - "loss": 1.2753, - "step": 10010 - }, - { - "epoch": 0.55, - "learning_rate": 0.00016815049640274115, - "loss": 1.0814, - "step": 10020 - }, - { - "epoch": 0.55, - "learning_rate": 0.00016811253060992047, - "loss": 1.0591, - "step": 10030 - }, - { - "epoch": 0.55, - "learning_rate": 0.0001680745648170998, - "loss": 1.1769, - "step": 10040 - }, - { - "epoch": 0.56, - "learning_rate": 0.00016803659902427912, - "loss": 1.0637, - "step": 10050 - }, - { - "epoch": 0.56, - "learning_rate": 0.00016799863323145847, - "loss": 1.1419, - "step": 10060 - }, - { - "epoch": 0.56, - "learning_rate": 0.0001679606674386378, - "loss": 1.1077, - "step": 10070 - }, - { - "epoch": 0.56, - "learning_rate": 0.00016792270164581714, - "loss": 1.1202, - "step": 10080 - }, - { - "epoch": 0.56, - "learning_rate": 0.00016788473585299646, - "loss": 1.1092, - "step": 10090 - }, - { - "epoch": 0.56, - "learning_rate": 0.0001678467700601758, - "loss": 1.0123, - "step": 10100 - }, - { - "epoch": 0.56, - "learning_rate": 0.00016780880426735514, - "loss": 1.1836, - "step": 10110 - }, - { - "epoch": 0.56, - "learning_rate": 0.00016777083847453446, - "loss": 1.0693, - "step": 10120 - }, - { - "epoch": 0.56, - "learning_rate": 0.00016773287268171378, - "loss": 1.082, - "step": 10130 - }, - { - "epoch": 0.56, - "learning_rate": 0.0001676949068888931, - "loss": 1.1454, - "step": 10140 - }, - { - "epoch": 0.56, - "learning_rate": 0.00016765694109607243, - "loss": 1.0528, - "step": 10150 - }, - { - "epoch": 0.56, - "learning_rate": 0.00016761897530325178, - "loss": 1.1933, - "step": 10160 - }, - { - "epoch": 0.56, - "learning_rate": 0.00016758100951043113, - "loss": 1.1221, - "step": 10170 - }, - { - "epoch": 0.56, - "learning_rate": 0.00016754304371761046, - "loss": 1.117, - "step": 10180 - }, - { - "epoch": 0.56, - "learning_rate": 0.00016750507792478978, - "loss": 1.1451, - "step": 10190 - }, - { - "epoch": 0.56, - "learning_rate": 0.0001674671121319691, - "loss": 1.0643, - "step": 10200 - }, - { - "epoch": 0.56, - "learning_rate": 0.00016742914633914843, - "loss": 1.2249, - "step": 10210 - }, - { - "epoch": 0.56, - "learning_rate": 0.00016739118054632778, - "loss": 1.115, - "step": 10220 - }, - { - "epoch": 0.57, - "learning_rate": 0.0001673532147535071, - "loss": 1.0937, - "step": 10230 - }, - { - "epoch": 0.57, - "learning_rate": 0.00016731524896068642, - "loss": 1.1612, - "step": 10240 - }, - { - "epoch": 0.57, - "learning_rate": 0.00016727728316786575, - "loss": 1.0649, - "step": 10250 - }, - { - "epoch": 0.57, - "learning_rate": 0.0001672393173750451, - "loss": 1.2074, - "step": 10260 - }, - { - "epoch": 0.57, - "learning_rate": 0.00016720135158222442, - "loss": 1.0892, - "step": 10270 - }, - { - "epoch": 0.57, - "learning_rate": 0.00016716338578940377, - "loss": 1.1302, - "step": 10280 - }, - { - "epoch": 0.57, - "learning_rate": 0.0001671254199965831, - "loss": 1.1513, - "step": 10290 - }, - { - "epoch": 0.57, - "learning_rate": 0.00016708745420376242, - "loss": 1.0549, - "step": 10300 - }, - { - "epoch": 0.57, - "learning_rate": 0.00016704948841094174, - "loss": 1.2296, - "step": 10310 - }, - { - "epoch": 0.57, - "learning_rate": 0.00016701152261812107, - "loss": 1.0671, - "step": 10320 - }, - { - "epoch": 0.57, - "learning_rate": 0.00016697355682530042, - "loss": 1.1102, - "step": 10330 - }, - { - "epoch": 0.57, - "learning_rate": 0.00016693559103247977, - "loss": 1.1001, - "step": 10340 - }, - { - "epoch": 0.57, - "learning_rate": 0.0001668976252396591, - "loss": 1.0419, - "step": 10350 - }, - { - "epoch": 0.57, - "learning_rate": 0.0001668596594468384, - "loss": 1.2167, - "step": 10360 - }, - { - "epoch": 0.57, - "learning_rate": 0.00016682169365401774, - "loss": 1.1062, - "step": 10370 - }, - { - "epoch": 0.57, - "learning_rate": 0.00016678372786119706, - "loss": 1.0865, - "step": 10380 - }, - { - "epoch": 0.57, - "learning_rate": 0.0001667457620683764, - "loss": 1.1414, - "step": 10390 - }, - { - "epoch": 0.57, - "learning_rate": 0.00016670779627555573, - "loss": 1.0388, - "step": 10400 - }, - { - "epoch": 0.58, - "learning_rate": 0.00016666983048273506, - "loss": 1.1287, - "step": 10410 - }, - { - "epoch": 0.58, - "learning_rate": 0.00016663186468991438, - "loss": 1.0894, - "step": 10420 - }, - { - "epoch": 0.58, - "learning_rate": 0.00016659389889709373, - "loss": 1.1156, - "step": 10430 - }, - { - "epoch": 0.58, - "learning_rate": 0.00016655593310427306, - "loss": 1.1405, - "step": 10440 - }, - { - "epoch": 0.58, - "learning_rate": 0.0001665179673114524, - "loss": 1.0181, - "step": 10450 - }, - { - "epoch": 0.58, - "learning_rate": 0.00016648000151863173, - "loss": 1.1604, - "step": 10460 - }, - { - "epoch": 0.58, - "learning_rate": 0.00016644203572581105, - "loss": 1.0799, - "step": 10470 - }, - { - "epoch": 0.58, - "learning_rate": 0.00016640406993299038, - "loss": 1.1437, - "step": 10480 - }, - { - "epoch": 0.58, - "learning_rate": 0.0001663661041401697, - "loss": 1.1852, - "step": 10490 - }, - { - "epoch": 0.58, - "learning_rate": 0.00016632813834734905, - "loss": 1.0712, - "step": 10500 - }, - { - "epoch": 0.58, - "learning_rate": 0.00016629017255452837, - "loss": 1.1855, - "step": 10510 - }, - { - "epoch": 0.58, - "learning_rate": 0.00016625220676170772, - "loss": 1.1211, - "step": 10520 - }, - { - "epoch": 0.58, - "learning_rate": 0.00016621424096888705, - "loss": 1.0731, - "step": 10530 - }, - { - "epoch": 0.58, - "learning_rate": 0.00016617627517606637, - "loss": 1.1385, - "step": 10540 - }, - { - "epoch": 0.58, - "learning_rate": 0.0001661383093832457, - "loss": 1.0333, - "step": 10550 - }, - { - "epoch": 0.58, - "learning_rate": 0.00016610034359042504, - "loss": 1.2008, - "step": 10560 - }, - { - "epoch": 0.58, - "learning_rate": 0.00016606237779760437, - "loss": 1.0792, - "step": 10570 - }, - { - "epoch": 0.58, - "learning_rate": 0.0001660244120047837, - "loss": 1.1494, - "step": 10580 - }, - { - "epoch": 0.58, - "learning_rate": 0.00016598644621196302, - "loss": 1.1602, - "step": 10590 - }, - { - "epoch": 0.59, - "learning_rate": 0.00016594848041914237, - "loss": 1.0484, - "step": 10600 - }, - { - "epoch": 0.59, - "learning_rate": 0.0001659105146263217, - "loss": 1.2558, - "step": 10610 - }, - { - "epoch": 0.59, - "learning_rate": 0.00016587254883350104, - "loss": 1.0949, - "step": 10620 - }, - { - "epoch": 0.59, - "learning_rate": 0.00016583458304068036, - "loss": 1.1117, - "step": 10630 - }, - { - "epoch": 0.59, - "learning_rate": 0.00016579661724785969, - "loss": 1.1609, - "step": 10640 - }, - { - "epoch": 0.59, - "learning_rate": 0.000165758651455039, - "loss": 1.0533, - "step": 10650 - }, - { - "epoch": 0.59, - "learning_rate": 0.00016572068566221833, - "loss": 1.2188, - "step": 10660 - }, - { - "epoch": 0.59, - "learning_rate": 0.00016568271986939768, - "loss": 1.0741, - "step": 10670 - }, - { - "epoch": 0.59, - "learning_rate": 0.000165644754076577, - "loss": 1.0587, - "step": 10680 - }, - { - "epoch": 0.59, - "learning_rate": 0.00016560678828375636, - "loss": 1.1442, - "step": 10690 - }, - { - "epoch": 0.59, - "learning_rate": 0.00016556882249093568, - "loss": 1.0176, - "step": 10700 - }, - { - "epoch": 0.59, - "learning_rate": 0.000165530856698115, - "loss": 1.1438, - "step": 10710 - }, - { - "epoch": 0.59, - "learning_rate": 0.00016549289090529433, - "loss": 1.1068, - "step": 10720 - }, - { - "epoch": 0.59, - "learning_rate": 0.00016545492511247368, - "loss": 1.1599, - "step": 10730 - }, - { - "epoch": 0.59, - "learning_rate": 0.000165416959319653, - "loss": 1.1086, - "step": 10740 - }, - { - "epoch": 0.59, - "learning_rate": 0.00016537899352683233, - "loss": 1.017, - "step": 10750 - }, - { - "epoch": 0.59, - "learning_rate": 0.00016534102773401165, - "loss": 1.1783, - "step": 10760 - }, - { - "epoch": 0.59, - "learning_rate": 0.00016530306194119097, - "loss": 1.0951, - "step": 10770 - }, - { - "epoch": 0.6, - "learning_rate": 0.00016526509614837032, - "loss": 1.1006, - "step": 10780 - }, - { - "epoch": 0.6, - "learning_rate": 0.00016522713035554967, - "loss": 1.1423, - "step": 10790 - }, - { - "epoch": 0.6, - "learning_rate": 0.000165189164562729, - "loss": 1.0442, - "step": 10800 - }, - { - "epoch": 0.6, - "learning_rate": 0.00016515119876990832, - "loss": 1.1991, - "step": 10810 - }, - { - "epoch": 0.6, - "learning_rate": 0.00016511323297708764, - "loss": 1.0897, - "step": 10820 - }, - { - "epoch": 0.6, - "learning_rate": 0.00016507526718426697, - "loss": 1.1232, - "step": 10830 - }, - { - "epoch": 0.6, - "learning_rate": 0.00016503730139144632, - "loss": 1.1423, - "step": 10840 - }, - { - "epoch": 0.6, - "learning_rate": 0.00016499933559862564, - "loss": 1.0857, - "step": 10850 - }, - { - "epoch": 0.6, - "learning_rate": 0.000164961369805805, - "loss": 1.1523, - "step": 10860 - }, - { - "epoch": 0.6, - "learning_rate": 0.00016492340401298432, - "loss": 1.0755, - "step": 10870 - }, - { - "epoch": 0.6, - "learning_rate": 0.00016488543822016364, - "loss": 1.1343, - "step": 10880 - }, - { - "epoch": 0.6, - "learning_rate": 0.00016484747242734296, - "loss": 1.1869, - "step": 10890 - }, - { - "epoch": 0.6, - "learning_rate": 0.0001648095066345223, - "loss": 0.9949, - "step": 10900 - }, - { - "epoch": 0.6, - "learning_rate": 0.00016477154084170164, - "loss": 1.1829, - "step": 10910 - }, - { - "epoch": 0.6, - "learning_rate": 0.00016473357504888096, - "loss": 1.066, - "step": 10920 - }, - { - "epoch": 0.6, - "learning_rate": 0.00016469560925606028, - "loss": 1.1363, - "step": 10930 - }, - { - "epoch": 0.6, - "learning_rate": 0.0001646576434632396, - "loss": 1.1399, - "step": 10940 - }, - { - "epoch": 0.6, - "learning_rate": 0.00016461967767041896, - "loss": 1.0054, - "step": 10950 - }, - { - "epoch": 0.61, - "learning_rate": 0.0001645817118775983, - "loss": 1.1933, - "step": 10960 - }, - { - "epoch": 0.61, - "learning_rate": 0.00016454374608477763, - "loss": 1.081, - "step": 10970 - }, - { - "epoch": 0.61, - "learning_rate": 0.00016450578029195695, - "loss": 1.1207, - "step": 10980 - }, - { - "epoch": 0.61, - "learning_rate": 0.00016446781449913628, - "loss": 1.1576, - "step": 10990 - }, - { - "epoch": 0.61, - "learning_rate": 0.0001644298487063156, - "loss": 1.0303, - "step": 11000 - }, - { - "epoch": 0.61, - "learning_rate": 0.00016439188291349495, - "loss": 1.2308, - "step": 11010 - }, - { - "epoch": 0.61, - "learning_rate": 0.00016435391712067428, - "loss": 1.0759, - "step": 11020 - }, - { - "epoch": 0.61, - "learning_rate": 0.0001643159513278536, - "loss": 1.1499, - "step": 11030 - }, - { - "epoch": 0.61, - "learning_rate": 0.00016427798553503295, - "loss": 1.2058, - "step": 11040 - }, - { - "epoch": 0.61, - "learning_rate": 0.00016424001974221227, - "loss": 1.0445, - "step": 11050 - }, - { - "epoch": 0.61, - "learning_rate": 0.00016420205394939162, - "loss": 1.1934, - "step": 11060 - }, - { - "epoch": 0.61, - "learning_rate": 0.00016416408815657095, - "loss": 1.0907, - "step": 11070 - }, - { - "epoch": 0.61, - "learning_rate": 0.00016412612236375027, - "loss": 1.083, - "step": 11080 - }, - { - "epoch": 0.61, - "learning_rate": 0.0001640881565709296, - "loss": 1.1749, - "step": 11090 - }, - { - "epoch": 0.61, - "learning_rate": 0.00016405019077810892, - "loss": 1.0066, - "step": 11100 - }, - { - "epoch": 0.61, - "learning_rate": 0.00016401222498528827, - "loss": 1.1799, - "step": 11110 - }, - { - "epoch": 0.61, - "learning_rate": 0.00016397425919246762, - "loss": 1.0522, - "step": 11120 - }, - { - "epoch": 0.61, - "learning_rate": 0.00016393629339964694, - "loss": 1.0808, - "step": 11130 - }, - { - "epoch": 0.62, - "learning_rate": 0.00016389832760682626, - "loss": 1.2018, - "step": 11140 - }, - { - "epoch": 0.62, - "learning_rate": 0.0001638603618140056, - "loss": 1.0684, - "step": 11150 - }, - { - "epoch": 0.62, - "learning_rate": 0.0001638223960211849, - "loss": 1.1679, - "step": 11160 - }, - { - "epoch": 0.62, - "learning_rate": 0.00016378443022836426, - "loss": 1.0899, - "step": 11170 - }, - { - "epoch": 0.62, - "learning_rate": 0.00016374646443554359, - "loss": 1.0833, - "step": 11180 - }, - { - "epoch": 0.62, - "learning_rate": 0.0001637084986427229, - "loss": 1.182, - "step": 11190 - }, - { - "epoch": 0.62, - "learning_rate": 0.00016367053284990223, - "loss": 1.0758, - "step": 11200 - }, - { - "epoch": 0.62, - "learning_rate": 0.00016363256705708158, - "loss": 1.1437, - "step": 11210 - }, - { - "epoch": 0.62, - "learning_rate": 0.0001635946012642609, - "loss": 1.1106, - "step": 11220 - }, - { - "epoch": 0.62, - "learning_rate": 0.00016355663547144026, - "loss": 1.1053, - "step": 11230 - }, - { - "epoch": 0.62, - "learning_rate": 0.00016351866967861958, - "loss": 1.1546, - "step": 11240 - }, - { - "epoch": 0.62, - "learning_rate": 0.0001634807038857989, - "loss": 1.0076, - "step": 11250 - }, - { - "epoch": 0.62, - "learning_rate": 0.00016344273809297823, - "loss": 1.1533, - "step": 11260 - }, - { - "epoch": 0.62, - "learning_rate": 0.00016340477230015755, - "loss": 1.119, - "step": 11270 - }, - { - "epoch": 0.62, - "learning_rate": 0.0001633668065073369, - "loss": 1.1071, - "step": 11280 - }, - { - "epoch": 0.62, - "learning_rate": 0.00016332884071451622, - "loss": 1.1484, - "step": 11290 - }, - { - "epoch": 0.62, - "learning_rate": 0.00016329087492169558, - "loss": 1.0669, - "step": 11300 - }, - { - "epoch": 0.62, - "learning_rate": 0.0001632529091288749, - "loss": 1.2377, - "step": 11310 - }, - { - "epoch": 0.63, - "learning_rate": 0.00016321494333605422, - "loss": 1.064, - "step": 11320 - }, - { - "epoch": 0.63, - "learning_rate": 0.00016317697754323355, - "loss": 1.0834, - "step": 11330 - }, - { - "epoch": 0.63, - "learning_rate": 0.0001631390117504129, - "loss": 1.1403, - "step": 11340 - }, - { - "epoch": 0.63, - "learning_rate": 0.00016310104595759222, - "loss": 1.0699, - "step": 11350 - }, - { - "epoch": 0.63, - "learning_rate": 0.00016306308016477154, - "loss": 1.1507, - "step": 11360 - }, - { - "epoch": 0.63, - "learning_rate": 0.00016302511437195087, - "loss": 1.0728, - "step": 11370 - }, - { - "epoch": 0.63, - "learning_rate": 0.00016298714857913022, - "loss": 1.0695, - "step": 11380 - }, - { - "epoch": 0.63, - "learning_rate": 0.00016294918278630954, - "loss": 1.1562, - "step": 11390 - }, - { - "epoch": 0.63, - "learning_rate": 0.0001629112169934889, - "loss": 1.0224, - "step": 11400 - }, - { - "epoch": 0.63, - "learning_rate": 0.00016287325120066821, - "loss": 1.1824, - "step": 11410 - }, - { - "epoch": 0.63, - "learning_rate": 0.00016283528540784754, - "loss": 1.1097, - "step": 11420 - }, - { - "epoch": 0.63, - "learning_rate": 0.00016279731961502686, - "loss": 1.0889, - "step": 11430 - }, - { - "epoch": 0.63, - "learning_rate": 0.00016275935382220618, - "loss": 1.1409, - "step": 11440 - }, - { - "epoch": 0.63, - "learning_rate": 0.00016272138802938554, - "loss": 1.0545, - "step": 11450 - }, - { - "epoch": 0.63, - "learning_rate": 0.00016268342223656486, - "loss": 1.2208, - "step": 11460 - }, - { - "epoch": 0.63, - "learning_rate": 0.0001626454564437442, - "loss": 1.104, - "step": 11470 - }, - { - "epoch": 0.63, - "learning_rate": 0.00016260749065092353, - "loss": 1.1207, - "step": 11480 - }, - { - "epoch": 0.63, - "learning_rate": 0.00016256952485810286, - "loss": 1.1701, - "step": 11490 - }, - { - "epoch": 0.64, - "learning_rate": 0.00016253155906528218, - "loss": 1.0237, - "step": 11500 - }, - { - "epoch": 0.64, - "learning_rate": 0.00016249359327246153, - "loss": 1.2081, - "step": 11510 - }, - { - "epoch": 0.64, - "learning_rate": 0.00016245562747964085, - "loss": 1.1031, - "step": 11520 - }, - { - "epoch": 0.64, - "learning_rate": 0.00016241766168682018, - "loss": 1.0649, - "step": 11530 - }, - { - "epoch": 0.64, - "learning_rate": 0.0001623796958939995, - "loss": 1.1853, - "step": 11540 - }, - { - "epoch": 0.64, - "learning_rate": 0.00016234173010117882, - "loss": 1.0277, - "step": 11550 - }, - { - "epoch": 0.64, - "learning_rate": 0.00016230376430835817, - "loss": 1.133, - "step": 11560 - }, - { - "epoch": 0.64, - "learning_rate": 0.00016226579851553753, - "loss": 1.1233, - "step": 11570 - }, - { - "epoch": 0.64, - "learning_rate": 0.00016222783272271685, - "loss": 1.0954, - "step": 11580 - }, - { - "epoch": 0.64, - "learning_rate": 0.00016218986692989617, - "loss": 1.1522, - "step": 11590 - }, - { - "epoch": 0.64, - "learning_rate": 0.0001621519011370755, - "loss": 1.0041, - "step": 11600 - }, - { - "epoch": 0.64, - "learning_rate": 0.00016211393534425482, - "loss": 1.1471, - "step": 11610 - }, - { - "epoch": 0.64, - "learning_rate": 0.00016207596955143417, - "loss": 1.0746, - "step": 11620 - }, - { - "epoch": 0.64, - "learning_rate": 0.0001620380037586135, - "loss": 1.0909, - "step": 11630 - }, - { - "epoch": 0.64, - "learning_rate": 0.00016200003796579284, - "loss": 1.1264, - "step": 11640 - }, - { - "epoch": 0.64, - "learning_rate": 0.00016196207217297217, - "loss": 1.0273, - "step": 11650 - }, - { - "epoch": 0.64, - "learning_rate": 0.0001619241063801515, - "loss": 1.2157, - "step": 11660 - }, - { - "epoch": 0.64, - "learning_rate": 0.00016188614058733081, - "loss": 1.055, - "step": 11670 - }, - { - "epoch": 0.65, - "learning_rate": 0.00016184817479451016, - "loss": 1.0756, - "step": 11680 - }, - { - "epoch": 0.65, - "learning_rate": 0.0001618102090016895, - "loss": 1.1673, - "step": 11690 - }, - { - "epoch": 0.65, - "learning_rate": 0.0001617722432088688, - "loss": 1.0647, - "step": 11700 - }, - { - "epoch": 0.65, - "learning_rate": 0.00016173427741604813, - "loss": 1.1893, - "step": 11710 - }, - { - "epoch": 0.65, - "learning_rate": 0.00016169631162322746, - "loss": 1.1097, - "step": 11720 - }, - { - "epoch": 0.65, - "learning_rate": 0.0001616583458304068, - "loss": 1.1296, - "step": 11730 - }, - { - "epoch": 0.65, - "learning_rate": 0.00016162038003758616, - "loss": 1.1569, - "step": 11740 - }, - { - "epoch": 0.65, - "learning_rate": 0.00016158241424476548, - "loss": 1.0083, - "step": 11750 - }, - { - "epoch": 0.65, - "learning_rate": 0.0001615444484519448, - "loss": 1.124, - "step": 11760 - }, - { - "epoch": 0.65, - "learning_rate": 0.00016150648265912413, - "loss": 1.0624, - "step": 11770 - }, - { - "epoch": 0.65, - "learning_rate": 0.00016146851686630345, - "loss": 1.1091, - "step": 11780 - }, - { - "epoch": 0.65, - "learning_rate": 0.0001614305510734828, - "loss": 1.1548, - "step": 11790 - }, - { - "epoch": 0.65, - "learning_rate": 0.00016139258528066213, - "loss": 1.0428, - "step": 11800 - }, - { - "epoch": 0.65, - "learning_rate": 0.00016135461948784145, - "loss": 1.1567, - "step": 11810 - }, - { - "epoch": 0.65, - "learning_rate": 0.0001613166536950208, - "loss": 1.085, - "step": 11820 - }, - { - "epoch": 0.65, - "learning_rate": 0.00016127868790220012, - "loss": 1.0887, - "step": 11830 - }, - { - "epoch": 0.65, - "learning_rate": 0.00016124072210937945, - "loss": 1.1699, - "step": 11840 - }, - { - "epoch": 0.65, - "learning_rate": 0.0001612027563165588, - "loss": 1.0464, - "step": 11850 - }, - { - "epoch": 0.66, - "learning_rate": 0.00016116479052373812, - "loss": 1.2486, - "step": 11860 - }, - { - "epoch": 0.66, - "learning_rate": 0.00016112682473091745, - "loss": 1.1073, - "step": 11870 - }, - { - "epoch": 0.66, - "learning_rate": 0.00016108885893809677, - "loss": 1.1094, - "step": 11880 - }, - { - "epoch": 0.66, - "learning_rate": 0.0001610508931452761, - "loss": 1.1772, - "step": 11890 - }, - { - "epoch": 0.66, - "learning_rate": 0.00016101292735245544, - "loss": 1.0588, - "step": 11900 - }, - { - "epoch": 0.66, - "learning_rate": 0.0001609749615596348, - "loss": 1.2318, - "step": 11910 - }, - { - "epoch": 0.66, - "learning_rate": 0.00016093699576681412, - "loss": 1.0654, - "step": 11920 - }, - { - "epoch": 0.66, - "learning_rate": 0.00016089902997399344, - "loss": 1.0555, - "step": 11930 - }, - { - "epoch": 0.66, - "learning_rate": 0.00016086106418117276, - "loss": 1.1654, - "step": 11940 - }, - { - "epoch": 0.66, - "learning_rate": 0.0001608230983883521, - "loss": 1.0351, - "step": 11950 - }, - { - "epoch": 0.66, - "learning_rate": 0.00016078513259553144, - "loss": 1.1783, - "step": 11960 - }, - { - "epoch": 0.66, - "learning_rate": 0.00016074716680271076, - "loss": 1.1161, - "step": 11970 - }, - { - "epoch": 0.66, - "learning_rate": 0.00016070920100989008, - "loss": 1.1364, - "step": 11980 - }, - { - "epoch": 0.66, - "learning_rate": 0.00016067123521706943, - "loss": 1.1486, - "step": 11990 - }, - { - "epoch": 0.66, - "learning_rate": 0.00016063326942424876, - "loss": 0.9943, - "step": 12000 - }, - { - "epoch": 0.66, - "eval_loss": 1.0959264039993286, - "eval_runtime": 2.0019, - "eval_samples_per_second": 49.952, - "eval_steps_per_second": 6.494, - "step": 12000 - }, - { - "epoch": 0.66, - "mmlu_eval_accuracy": NaN, - "mmlu_eval_accuracy_abstract_algebra": 0.45454545454545453, - "mmlu_eval_accuracy_anatomy": 0.42857142857142855, - "mmlu_eval_accuracy_astronomy": 0.125, - "mmlu_eval_accuracy_business_ethics": 0.6363636363636364, - "mmlu_eval_accuracy_clinical_knowledge": 0.4827586206896552, - "mmlu_eval_accuracy_college_biology": 0.8125, - "mmlu_eval_accuracy_college_chemistry": 1.0, - "mmlu_eval_accuracy_college_computer_science": 0.5454545454545454, - "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, - "mmlu_eval_accuracy_college_medicine": 0.5, - "mmlu_eval_accuracy_college_physics": 0.36363636363636365, - "mmlu_eval_accuracy_computer_security": 0.6363636363636364, - "mmlu_eval_accuracy_conceptual_physics": 0.42857142857142855, - "mmlu_eval_accuracy_econometrics": NaN, - "mmlu_eval_accuracy_electrical_engineering": NaN, - "mmlu_eval_accuracy_elementary_mathematics": NaN, - "mmlu_eval_accuracy_formal_logic": NaN, - "mmlu_eval_accuracy_global_facts": NaN, - "mmlu_eval_accuracy_high_school_biology": NaN, - "mmlu_eval_accuracy_high_school_chemistry": NaN, - "mmlu_eval_accuracy_high_school_computer_science": NaN, - "mmlu_eval_accuracy_high_school_european_history": NaN, - "mmlu_eval_accuracy_high_school_geography": NaN, - "mmlu_eval_accuracy_high_school_government_and_politics": NaN, - "mmlu_eval_accuracy_high_school_macroeconomics": NaN, - "mmlu_eval_accuracy_high_school_mathematics": NaN, - "mmlu_eval_accuracy_high_school_microeconomics": NaN, - "mmlu_eval_accuracy_high_school_physics": NaN, - "mmlu_eval_accuracy_high_school_psychology": NaN, - "mmlu_eval_accuracy_high_school_statistics": NaN, - "mmlu_eval_accuracy_high_school_us_history": NaN, - "mmlu_eval_accuracy_high_school_world_history": NaN, - "mmlu_eval_accuracy_human_aging": NaN, - "mmlu_eval_accuracy_human_sexuality": NaN, - "mmlu_eval_accuracy_international_law": NaN, - "mmlu_eval_accuracy_jurisprudence": NaN, - "mmlu_eval_accuracy_logical_fallacies": NaN, - "mmlu_eval_accuracy_machine_learning": NaN, - "mmlu_eval_accuracy_management": NaN, - "mmlu_eval_accuracy_marketing": NaN, - "mmlu_eval_accuracy_medical_genetics": NaN, - "mmlu_eval_accuracy_miscellaneous": NaN, - "mmlu_eval_accuracy_moral_disputes": NaN, - "mmlu_eval_accuracy_moral_scenarios": NaN, - "mmlu_eval_accuracy_nutrition": NaN, - "mmlu_eval_accuracy_philosophy": NaN, - "mmlu_eval_accuracy_prehistory": NaN, - "mmlu_eval_accuracy_professional_accounting": NaN, - "mmlu_eval_accuracy_professional_law": NaN, - "mmlu_eval_accuracy_professional_medicine": NaN, - "mmlu_eval_accuracy_professional_psychology": NaN, - "mmlu_eval_accuracy_public_relations": NaN, - "mmlu_eval_accuracy_security_studies": NaN, - "mmlu_eval_accuracy_sociology": NaN, - "mmlu_eval_accuracy_us_foreign_policy": NaN, - "mmlu_eval_accuracy_virology": NaN, - "mmlu_eval_accuracy_world_religions": NaN, - "mmlu_loss": 0.9960953282037129, - "step": 12000 - } - ], - "max_steps": 54309, - "num_train_epochs": 3, - "total_flos": 6.187341352301232e+18, - "trial_name": null, - "trial_params": null -}