{ "best_metric": 0.6366316676139832, "best_model_checkpoint": "./exper3_mesum5/checkpoint-2800", "epoch": 8.0, "global_step": 3440, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "learning_rate": 0.0001994186046511628, "loss": 4.954, "step": 10 }, { "epoch": 0.05, "learning_rate": 0.0001988372093023256, "loss": 4.8641, "step": 20 }, { "epoch": 0.07, "learning_rate": 0.00019825581395348837, "loss": 4.6647, "step": 30 }, { "epoch": 0.09, "learning_rate": 0.00019767441860465116, "loss": 4.5255, "step": 40 }, { "epoch": 0.12, "learning_rate": 0.00019709302325581396, "loss": 4.4509, "step": 50 }, { "epoch": 0.14, "learning_rate": 0.00019651162790697676, "loss": 4.253, "step": 60 }, { "epoch": 0.16, "learning_rate": 0.00019593023255813952, "loss": 4.2104, "step": 70 }, { "epoch": 0.19, "learning_rate": 0.00019534883720930232, "loss": 4.0778, "step": 80 }, { "epoch": 0.21, "learning_rate": 0.00019476744186046511, "loss": 3.9537, "step": 90 }, { "epoch": 0.23, "learning_rate": 0.0001941860465116279, "loss": 3.895, "step": 100 }, { "epoch": 0.23, "eval_accuracy": 0.19349112426035503, "eval_loss": 3.8276302814483643, "eval_runtime": 19.0606, "eval_samples_per_second": 88.665, "eval_steps_per_second": 11.122, "step": 100 }, { "epoch": 0.26, "learning_rate": 0.0001936046511627907, "loss": 3.825, "step": 110 }, { "epoch": 0.28, "learning_rate": 0.0001930232558139535, "loss": 3.6952, "step": 120 }, { "epoch": 0.3, "learning_rate": 0.0001924418604651163, "loss": 3.4767, "step": 130 }, { "epoch": 0.33, "learning_rate": 0.0001918604651162791, "loss": 3.5417, "step": 140 }, { "epoch": 0.35, "learning_rate": 0.0001912790697674419, "loss": 3.4797, "step": 150 }, { "epoch": 0.37, "learning_rate": 0.00019069767441860466, "loss": 3.3749, "step": 160 }, { "epoch": 0.4, "learning_rate": 0.00019011627906976745, "loss": 3.4024, "step": 170 }, { "epoch": 0.42, "learning_rate": 0.00018953488372093025, "loss": 3.2742, "step": 180 }, { "epoch": 0.44, "learning_rate": 0.00018895348837209304, "loss": 3.3841, "step": 190 }, { "epoch": 0.47, "learning_rate": 0.00018837209302325584, "loss": 3.1174, "step": 200 }, { "epoch": 0.47, "eval_accuracy": 0.3106508875739645, "eval_loss": 3.1216797828674316, "eval_runtime": 21.0487, "eval_samples_per_second": 80.29, "eval_steps_per_second": 10.072, "step": 200 }, { "epoch": 0.49, "learning_rate": 0.0001877906976744186, "loss": 3.012, "step": 210 }, { "epoch": 0.51, "learning_rate": 0.0001872093023255814, "loss": 3.0221, "step": 220 }, { "epoch": 0.53, "learning_rate": 0.0001866279069767442, "loss": 2.9971, "step": 230 }, { "epoch": 0.56, "learning_rate": 0.000186046511627907, "loss": 3.0478, "step": 240 }, { "epoch": 0.58, "learning_rate": 0.00018546511627906976, "loss": 2.9971, "step": 250 }, { "epoch": 0.6, "learning_rate": 0.00018488372093023256, "loss": 2.7655, "step": 260 }, { "epoch": 0.63, "learning_rate": 0.00018430232558139535, "loss": 2.703, "step": 270 }, { "epoch": 0.65, "learning_rate": 0.00018372093023255815, "loss": 2.8271, "step": 280 }, { "epoch": 0.67, "learning_rate": 0.00018313953488372094, "loss": 2.6716, "step": 290 }, { "epoch": 0.7, "learning_rate": 0.0001825581395348837, "loss": 2.6, "step": 300 }, { "epoch": 0.7, "eval_accuracy": 0.42071005917159765, "eval_loss": 2.5399255752563477, "eval_runtime": 20.8035, "eval_samples_per_second": 81.236, "eval_steps_per_second": 10.191, "step": 300 }, { "epoch": 0.72, "learning_rate": 0.0001819767441860465, "loss": 2.5875, "step": 310 }, { "epoch": 0.74, "learning_rate": 0.0001813953488372093, "loss": 2.6057, "step": 320 }, { "epoch": 0.77, "learning_rate": 0.00018081395348837212, "loss": 2.5459, "step": 330 }, { "epoch": 0.79, "learning_rate": 0.0001802325581395349, "loss": 2.4955, "step": 340 }, { "epoch": 0.81, "learning_rate": 0.0001796511627906977, "loss": 2.3718, "step": 350 }, { "epoch": 0.84, "learning_rate": 0.00017906976744186048, "loss": 2.3314, "step": 360 }, { "epoch": 0.86, "learning_rate": 0.00017848837209302328, "loss": 2.3855, "step": 370 }, { "epoch": 0.88, "learning_rate": 0.00017790697674418605, "loss": 2.313, "step": 380 }, { "epoch": 0.91, "learning_rate": 0.00017732558139534884, "loss": 2.1767, "step": 390 }, { "epoch": 0.93, "learning_rate": 0.00017674418604651164, "loss": 2.256, "step": 400 }, { "epoch": 0.93, "eval_accuracy": 0.5159763313609468, "eval_loss": 2.176730155944824, "eval_runtime": 21.1447, "eval_samples_per_second": 79.925, "eval_steps_per_second": 10.026, "step": 400 }, { "epoch": 0.95, "learning_rate": 0.00017616279069767443, "loss": 2.2881, "step": 410 }, { "epoch": 0.98, "learning_rate": 0.00017558139534883723, "loss": 2.0996, "step": 420 }, { "epoch": 1.0, "learning_rate": 0.000175, "loss": 2.0865, "step": 430 }, { "epoch": 1.02, "learning_rate": 0.0001744186046511628, "loss": 1.84, "step": 440 }, { "epoch": 1.05, "learning_rate": 0.0001738372093023256, "loss": 1.7042, "step": 450 }, { "epoch": 1.07, "learning_rate": 0.00017325581395348838, "loss": 1.6687, "step": 460 }, { "epoch": 1.09, "learning_rate": 0.00017267441860465118, "loss": 1.8394, "step": 470 }, { "epoch": 1.12, "learning_rate": 0.00017209302325581395, "loss": 1.6962, "step": 480 }, { "epoch": 1.14, "learning_rate": 0.00017151162790697674, "loss": 1.7329, "step": 490 }, { "epoch": 1.16, "learning_rate": 0.00017093023255813954, "loss": 1.5441, "step": 500 }, { "epoch": 1.16, "eval_accuracy": 0.585207100591716, "eval_loss": 1.8085863590240479, "eval_runtime": 21.5882, "eval_samples_per_second": 78.284, "eval_steps_per_second": 9.82, "step": 500 }, { "epoch": 1.19, "learning_rate": 0.00017034883720930233, "loss": 1.6099, "step": 510 }, { "epoch": 1.21, "learning_rate": 0.0001697674418604651, "loss": 1.5833, "step": 520 }, { "epoch": 1.23, "learning_rate": 0.0001691860465116279, "loss": 1.5696, "step": 530 }, { "epoch": 1.26, "learning_rate": 0.00016860465116279072, "loss": 1.3757, "step": 540 }, { "epoch": 1.28, "learning_rate": 0.00016802325581395352, "loss": 1.4537, "step": 550 }, { "epoch": 1.3, "learning_rate": 0.00016744186046511629, "loss": 1.445, "step": 560 }, { "epoch": 1.33, "learning_rate": 0.00016686046511627908, "loss": 1.3975, "step": 570 }, { "epoch": 1.35, "learning_rate": 0.00016627906976744188, "loss": 1.4086, "step": 580 }, { "epoch": 1.37, "learning_rate": 0.00016569767441860467, "loss": 1.3367, "step": 590 }, { "epoch": 1.4, "learning_rate": 0.00016511627906976747, "loss": 1.3834, "step": 600 }, { "epoch": 1.4, "eval_accuracy": 0.6325443786982249, "eval_loss": 1.556496500968933, "eval_runtime": 21.486, "eval_samples_per_second": 78.656, "eval_steps_per_second": 9.867, "step": 600 }, { "epoch": 1.42, "learning_rate": 0.00016453488372093024, "loss": 1.2953, "step": 610 }, { "epoch": 1.44, "learning_rate": 0.00016395348837209303, "loss": 1.2843, "step": 620 }, { "epoch": 1.47, "learning_rate": 0.00016337209302325583, "loss": 1.1906, "step": 630 }, { "epoch": 1.49, "learning_rate": 0.00016279069767441862, "loss": 1.3458, "step": 640 }, { "epoch": 1.51, "learning_rate": 0.0001622093023255814, "loss": 1.1714, "step": 650 }, { "epoch": 1.53, "learning_rate": 0.00016162790697674419, "loss": 1.191, "step": 660 }, { "epoch": 1.56, "learning_rate": 0.00016104651162790698, "loss": 1.159, "step": 670 }, { "epoch": 1.58, "learning_rate": 0.00016046511627906978, "loss": 1.2594, "step": 680 }, { "epoch": 1.6, "learning_rate": 0.00015988372093023257, "loss": 1.1533, "step": 690 }, { "epoch": 1.63, "learning_rate": 0.00015930232558139534, "loss": 1.1995, "step": 700 }, { "epoch": 1.63, "eval_accuracy": 0.6763313609467455, "eval_loss": 1.3339420557022095, "eval_runtime": 19.734, "eval_samples_per_second": 85.639, "eval_steps_per_second": 10.743, "step": 700 }, { "epoch": 1.65, "learning_rate": 0.00015872093023255814, "loss": 1.0989, "step": 710 }, { "epoch": 1.67, "learning_rate": 0.00015813953488372093, "loss": 1.2864, "step": 720 }, { "epoch": 1.7, "learning_rate": 0.00015755813953488373, "loss": 1.2124, "step": 730 }, { "epoch": 1.72, "learning_rate": 0.00015697674418604652, "loss": 1.1752, "step": 740 }, { "epoch": 1.74, "learning_rate": 0.0001563953488372093, "loss": 1.2127, "step": 750 }, { "epoch": 1.77, "learning_rate": 0.0001558139534883721, "loss": 1.1634, "step": 760 }, { "epoch": 1.79, "learning_rate": 0.0001552325581395349, "loss": 1.0915, "step": 770 }, { "epoch": 1.81, "learning_rate": 0.00015465116279069768, "loss": 1.103, "step": 780 }, { "epoch": 1.84, "learning_rate": 0.00015406976744186047, "loss": 1.0951, "step": 790 }, { "epoch": 1.86, "learning_rate": 0.00015348837209302327, "loss": 1.0845, "step": 800 }, { "epoch": 1.86, "eval_accuracy": 0.6532544378698225, "eval_loss": 1.3299002647399902, "eval_runtime": 19.6334, "eval_samples_per_second": 86.078, "eval_steps_per_second": 10.798, "step": 800 }, { "epoch": 1.88, "learning_rate": 0.00015290697674418606, "loss": 1.03, "step": 810 }, { "epoch": 1.91, "learning_rate": 0.00015232558139534886, "loss": 0.9288, "step": 820 }, { "epoch": 1.93, "learning_rate": 0.00015174418604651163, "loss": 0.9963, "step": 830 }, { "epoch": 1.95, "learning_rate": 0.00015116279069767442, "loss": 1.0363, "step": 840 }, { "epoch": 1.98, "learning_rate": 0.00015058139534883722, "loss": 0.9671, "step": 850 }, { "epoch": 2.0, "learning_rate": 0.00015000000000000001, "loss": 0.8207, "step": 860 }, { "epoch": 2.02, "learning_rate": 0.0001494186046511628, "loss": 0.6051, "step": 870 }, { "epoch": 2.05, "learning_rate": 0.00014883720930232558, "loss": 0.6698, "step": 880 }, { "epoch": 2.07, "learning_rate": 0.00014825581395348837, "loss": 0.6969, "step": 890 }, { "epoch": 2.09, "learning_rate": 0.00014767441860465117, "loss": 0.6472, "step": 900 }, { "epoch": 2.09, "eval_accuracy": 0.7218934911242604, "eval_loss": 1.0679467916488647, "eval_runtime": 19.5979, "eval_samples_per_second": 86.234, "eval_steps_per_second": 10.818, "step": 900 }, { "epoch": 2.12, "learning_rate": 0.00014709302325581396, "loss": 0.6366, "step": 910 }, { "epoch": 2.14, "learning_rate": 0.00014651162790697673, "loss": 0.6353, "step": 920 }, { "epoch": 2.16, "learning_rate": 0.00014593023255813953, "loss": 0.4765, "step": 930 }, { "epoch": 2.19, "learning_rate": 0.00014534883720930232, "loss": 0.453, "step": 940 }, { "epoch": 2.21, "learning_rate": 0.00014476744186046512, "loss": 0.5234, "step": 950 }, { "epoch": 2.23, "learning_rate": 0.00014418604651162791, "loss": 0.5019, "step": 960 }, { "epoch": 2.26, "learning_rate": 0.0001436046511627907, "loss": 0.6719, "step": 970 }, { "epoch": 2.28, "learning_rate": 0.0001430232558139535, "loss": 0.5294, "step": 980 }, { "epoch": 2.3, "learning_rate": 0.0001424418604651163, "loss": 0.6135, "step": 990 }, { "epoch": 2.33, "learning_rate": 0.0001418604651162791, "loss": 0.5948, "step": 1000 }, { "epoch": 2.33, "eval_accuracy": 0.7124260355029586, "eval_loss": 1.0286362171173096, "eval_runtime": 19.4904, "eval_samples_per_second": 86.709, "eval_steps_per_second": 10.877, "step": 1000 }, { "epoch": 2.35, "learning_rate": 0.00014127906976744186, "loss": 0.6138, "step": 1010 }, { "epoch": 2.37, "learning_rate": 0.00014069767441860466, "loss": 0.6543, "step": 1020 }, { "epoch": 2.4, "learning_rate": 0.00014011627906976746, "loss": 0.5534, "step": 1030 }, { "epoch": 2.42, "learning_rate": 0.00013953488372093025, "loss": 0.6408, "step": 1040 }, { "epoch": 2.44, "learning_rate": 0.00013895348837209302, "loss": 0.4687, "step": 1050 }, { "epoch": 2.47, "learning_rate": 0.00013837209302325582, "loss": 0.4635, "step": 1060 }, { "epoch": 2.49, "learning_rate": 0.0001377906976744186, "loss": 0.4466, "step": 1070 }, { "epoch": 2.51, "learning_rate": 0.0001372093023255814, "loss": 0.4255, "step": 1080 }, { "epoch": 2.53, "learning_rate": 0.0001366279069767442, "loss": 0.5848, "step": 1090 }, { "epoch": 2.56, "learning_rate": 0.00013604651162790697, "loss": 0.5565, "step": 1100 }, { "epoch": 2.56, "eval_accuracy": 0.7284023668639054, "eval_loss": 0.9595437049865723, "eval_runtime": 19.5222, "eval_samples_per_second": 86.568, "eval_steps_per_second": 10.859, "step": 1100 }, { "epoch": 2.58, "learning_rate": 0.00013546511627906977, "loss": 0.4625, "step": 1110 }, { "epoch": 2.6, "learning_rate": 0.00013488372093023256, "loss": 0.7103, "step": 1120 }, { "epoch": 2.63, "learning_rate": 0.00013430232558139536, "loss": 0.5923, "step": 1130 }, { "epoch": 2.65, "learning_rate": 0.00013372093023255815, "loss": 0.4913, "step": 1140 }, { "epoch": 2.67, "learning_rate": 0.00013313953488372092, "loss": 0.4915, "step": 1150 }, { "epoch": 2.7, "learning_rate": 0.00013255813953488372, "loss": 0.401, "step": 1160 }, { "epoch": 2.72, "learning_rate": 0.0001319767441860465, "loss": 0.4169, "step": 1170 }, { "epoch": 2.74, "learning_rate": 0.0001313953488372093, "loss": 0.52, "step": 1180 }, { "epoch": 2.77, "learning_rate": 0.0001308139534883721, "loss": 0.4018, "step": 1190 }, { "epoch": 2.79, "learning_rate": 0.0001302325581395349, "loss": 0.4879, "step": 1200 }, { "epoch": 2.79, "eval_accuracy": 0.7420118343195267, "eval_loss": 0.8915188312530518, "eval_runtime": 19.4697, "eval_samples_per_second": 86.801, "eval_steps_per_second": 10.889, "step": 1200 }, { "epoch": 2.81, "learning_rate": 0.0001296511627906977, "loss": 0.527, "step": 1210 }, { "epoch": 2.84, "learning_rate": 0.0001290697674418605, "loss": 0.4114, "step": 1220 }, { "epoch": 2.86, "learning_rate": 0.00012848837209302326, "loss": 0.5728, "step": 1230 }, { "epoch": 2.88, "learning_rate": 0.00012790697674418605, "loss": 0.347, "step": 1240 }, { "epoch": 2.91, "learning_rate": 0.00012732558139534885, "loss": 0.3652, "step": 1250 }, { "epoch": 2.93, "learning_rate": 0.00012674418604651164, "loss": 0.5574, "step": 1260 }, { "epoch": 2.95, "learning_rate": 0.00012616279069767444, "loss": 0.4363, "step": 1270 }, { "epoch": 2.98, "learning_rate": 0.0001255813953488372, "loss": 0.4769, "step": 1280 }, { "epoch": 3.0, "learning_rate": 0.000125, "loss": 0.5541, "step": 1290 }, { "epoch": 3.02, "learning_rate": 0.0001244186046511628, "loss": 0.2816, "step": 1300 }, { "epoch": 3.02, "eval_accuracy": 0.7763313609467456, "eval_loss": 0.8158556818962097, "eval_runtime": 19.4244, "eval_samples_per_second": 87.004, "eval_steps_per_second": 10.914, "step": 1300 }, { "epoch": 3.05, "learning_rate": 0.0001238372093023256, "loss": 0.2724, "step": 1310 }, { "epoch": 3.07, "learning_rate": 0.00012325581395348836, "loss": 0.234, "step": 1320 }, { "epoch": 3.09, "learning_rate": 0.00012267441860465116, "loss": 0.2116, "step": 1330 }, { "epoch": 3.12, "learning_rate": 0.00012209302325581395, "loss": 0.2968, "step": 1340 }, { "epoch": 3.14, "learning_rate": 0.00012151162790697675, "loss": 0.2539, "step": 1350 }, { "epoch": 3.16, "learning_rate": 0.00012093023255813953, "loss": 0.2837, "step": 1360 }, { "epoch": 3.19, "learning_rate": 0.00012034883720930233, "loss": 0.2571, "step": 1370 }, { "epoch": 3.21, "learning_rate": 0.00011976744186046511, "loss": 0.2502, "step": 1380 }, { "epoch": 3.23, "learning_rate": 0.0001191860465116279, "loss": 0.2629, "step": 1390 }, { "epoch": 3.26, "learning_rate": 0.00011860465116279071, "loss": 0.2412, "step": 1400 }, { "epoch": 3.26, "eval_accuracy": 0.7911242603550296, "eval_loss": 0.776643693447113, "eval_runtime": 19.7719, "eval_samples_per_second": 85.475, "eval_steps_per_second": 10.722, "step": 1400 }, { "epoch": 3.28, "learning_rate": 0.00011802325581395351, "loss": 0.2721, "step": 1410 }, { "epoch": 3.3, "learning_rate": 0.00011744186046511629, "loss": 0.1876, "step": 1420 }, { "epoch": 3.33, "learning_rate": 0.00011686046511627909, "loss": 0.2417, "step": 1430 }, { "epoch": 3.35, "learning_rate": 0.00011627906976744187, "loss": 0.2029, "step": 1440 }, { "epoch": 3.37, "learning_rate": 0.00011569767441860466, "loss": 0.1991, "step": 1450 }, { "epoch": 3.4, "learning_rate": 0.00011511627906976746, "loss": 0.2763, "step": 1460 }, { "epoch": 3.42, "learning_rate": 0.00011453488372093024, "loss": 0.1539, "step": 1470 }, { "epoch": 3.44, "learning_rate": 0.00011395348837209304, "loss": 0.2287, "step": 1480 }, { "epoch": 3.47, "learning_rate": 0.00011337209302325582, "loss": 0.2572, "step": 1490 }, { "epoch": 3.49, "learning_rate": 0.00011279069767441861, "loss": 0.2015, "step": 1500 }, { "epoch": 3.49, "eval_accuracy": 0.7828402366863906, "eval_loss": 0.784956157207489, "eval_runtime": 19.4284, "eval_samples_per_second": 86.986, "eval_steps_per_second": 10.912, "step": 1500 }, { "epoch": 3.51, "learning_rate": 0.0001122093023255814, "loss": 0.2263, "step": 1510 }, { "epoch": 3.53, "learning_rate": 0.00011162790697674419, "loss": 0.2843, "step": 1520 }, { "epoch": 3.56, "learning_rate": 0.00011104651162790699, "loss": 0.2371, "step": 1530 }, { "epoch": 3.58, "learning_rate": 0.00011046511627906977, "loss": 0.1717, "step": 1540 }, { "epoch": 3.6, "learning_rate": 0.00010988372093023256, "loss": 0.311, "step": 1550 }, { "epoch": 3.63, "learning_rate": 0.00010930232558139534, "loss": 0.2567, "step": 1560 }, { "epoch": 3.65, "learning_rate": 0.00010872093023255814, "loss": 0.1739, "step": 1570 }, { "epoch": 3.67, "learning_rate": 0.00010813953488372092, "loss": 0.2185, "step": 1580 }, { "epoch": 3.7, "learning_rate": 0.00010755813953488372, "loss": 0.2158, "step": 1590 }, { "epoch": 3.72, "learning_rate": 0.00010697674418604651, "loss": 0.274, "step": 1600 }, { "epoch": 3.72, "eval_accuracy": 0.7934911242603551, "eval_loss": 0.7361425757408142, "eval_runtime": 19.2623, "eval_samples_per_second": 87.736, "eval_steps_per_second": 11.006, "step": 1600 }, { "epoch": 3.74, "learning_rate": 0.0001063953488372093, "loss": 0.1164, "step": 1610 }, { "epoch": 3.77, "learning_rate": 0.0001058139534883721, "loss": 0.1515, "step": 1620 }, { "epoch": 3.79, "learning_rate": 0.0001052325581395349, "loss": 0.2399, "step": 1630 }, { "epoch": 3.81, "learning_rate": 0.00010465116279069768, "loss": 0.2516, "step": 1640 }, { "epoch": 3.84, "learning_rate": 0.00010406976744186048, "loss": 0.2054, "step": 1650 }, { "epoch": 3.86, "learning_rate": 0.00010348837209302327, "loss": 0.1758, "step": 1660 }, { "epoch": 3.88, "learning_rate": 0.00010290697674418605, "loss": 0.1903, "step": 1670 }, { "epoch": 3.91, "learning_rate": 0.00010232558139534885, "loss": 0.1922, "step": 1680 }, { "epoch": 3.93, "learning_rate": 0.00010174418604651163, "loss": 0.2019, "step": 1690 }, { "epoch": 3.95, "learning_rate": 0.00010116279069767443, "loss": 0.1244, "step": 1700 }, { "epoch": 3.95, "eval_accuracy": 0.7911242603550296, "eval_loss": 0.7299075126647949, "eval_runtime": 19.238, "eval_samples_per_second": 87.847, "eval_steps_per_second": 11.02, "step": 1700 }, { "epoch": 3.98, "learning_rate": 0.00010058139534883721, "loss": 0.2361, "step": 1710 }, { "epoch": 4.0, "learning_rate": 0.0001, "loss": 0.1389, "step": 1720 }, { "epoch": 4.02, "learning_rate": 9.94186046511628e-05, "loss": 0.0844, "step": 1730 }, { "epoch": 4.05, "learning_rate": 9.883720930232558e-05, "loss": 0.1463, "step": 1740 }, { "epoch": 4.07, "learning_rate": 9.825581395348838e-05, "loss": 0.09, "step": 1750 }, { "epoch": 4.09, "learning_rate": 9.767441860465116e-05, "loss": 0.1106, "step": 1760 }, { "epoch": 4.12, "learning_rate": 9.709302325581396e-05, "loss": 0.1387, "step": 1770 }, { "epoch": 4.14, "learning_rate": 9.651162790697675e-05, "loss": 0.1231, "step": 1780 }, { "epoch": 4.16, "learning_rate": 9.593023255813955e-05, "loss": 0.0734, "step": 1790 }, { "epoch": 4.19, "learning_rate": 9.534883720930233e-05, "loss": 0.0794, "step": 1800 }, { "epoch": 4.19, "eval_accuracy": 0.7846153846153846, "eval_loss": 0.7440704107284546, "eval_runtime": 19.1417, "eval_samples_per_second": 88.289, "eval_steps_per_second": 11.075, "step": 1800 }, { "epoch": 4.21, "learning_rate": 9.476744186046512e-05, "loss": 0.0885, "step": 1810 }, { "epoch": 4.23, "learning_rate": 9.418604651162792e-05, "loss": 0.0781, "step": 1820 }, { "epoch": 4.26, "learning_rate": 9.36046511627907e-05, "loss": 0.0842, "step": 1830 }, { "epoch": 4.28, "learning_rate": 9.30232558139535e-05, "loss": 0.0957, "step": 1840 }, { "epoch": 4.3, "learning_rate": 9.244186046511628e-05, "loss": 0.0561, "step": 1850 }, { "epoch": 4.33, "learning_rate": 9.186046511627907e-05, "loss": 0.0905, "step": 1860 }, { "epoch": 4.35, "learning_rate": 9.127906976744186e-05, "loss": 0.075, "step": 1870 }, { "epoch": 4.37, "learning_rate": 9.069767441860465e-05, "loss": 0.1165, "step": 1880 }, { "epoch": 4.4, "learning_rate": 9.011627906976745e-05, "loss": 0.0809, "step": 1890 }, { "epoch": 4.42, "learning_rate": 8.953488372093024e-05, "loss": 0.0915, "step": 1900 }, { "epoch": 4.42, "eval_accuracy": 0.7940828402366864, "eval_loss": 0.7614301443099976, "eval_runtime": 19.2799, "eval_samples_per_second": 87.656, "eval_steps_per_second": 10.996, "step": 1900 }, { "epoch": 4.44, "learning_rate": 8.895348837209302e-05, "loss": 0.1187, "step": 1910 }, { "epoch": 4.47, "learning_rate": 8.837209302325582e-05, "loss": 0.1082, "step": 1920 }, { "epoch": 4.49, "learning_rate": 8.779069767441861e-05, "loss": 0.0698, "step": 1930 }, { "epoch": 4.51, "learning_rate": 8.72093023255814e-05, "loss": 0.1073, "step": 1940 }, { "epoch": 4.53, "learning_rate": 8.662790697674419e-05, "loss": 0.0638, "step": 1950 }, { "epoch": 4.56, "learning_rate": 8.604651162790697e-05, "loss": 0.157, "step": 1960 }, { "epoch": 4.58, "learning_rate": 8.546511627906977e-05, "loss": 0.0641, "step": 1970 }, { "epoch": 4.6, "learning_rate": 8.488372093023255e-05, "loss": 0.0536, "step": 1980 }, { "epoch": 4.63, "learning_rate": 8.430232558139536e-05, "loss": 0.0921, "step": 1990 }, { "epoch": 4.65, "learning_rate": 8.372093023255814e-05, "loss": 0.0817, "step": 2000 }, { "epoch": 4.65, "eval_accuracy": 0.8011834319526627, "eval_loss": 0.7310301065444946, "eval_runtime": 19.2052, "eval_samples_per_second": 87.997, "eval_steps_per_second": 11.039, "step": 2000 }, { "epoch": 4.67, "learning_rate": 8.313953488372094e-05, "loss": 0.0876, "step": 2010 }, { "epoch": 4.7, "learning_rate": 8.255813953488373e-05, "loss": 0.0959, "step": 2020 }, { "epoch": 4.72, "learning_rate": 8.197674418604652e-05, "loss": 0.0945, "step": 2030 }, { "epoch": 4.74, "learning_rate": 8.139534883720931e-05, "loss": 0.0375, "step": 2040 }, { "epoch": 4.77, "learning_rate": 8.081395348837209e-05, "loss": 0.0877, "step": 2050 }, { "epoch": 4.79, "learning_rate": 8.023255813953489e-05, "loss": 0.053, "step": 2060 }, { "epoch": 4.81, "learning_rate": 7.965116279069767e-05, "loss": 0.1233, "step": 2070 }, { "epoch": 4.84, "learning_rate": 7.906976744186047e-05, "loss": 0.1089, "step": 2080 }, { "epoch": 4.86, "learning_rate": 7.848837209302326e-05, "loss": 0.1326, "step": 2090 }, { "epoch": 4.88, "learning_rate": 7.790697674418606e-05, "loss": 0.0561, "step": 2100 }, { "epoch": 4.88, "eval_accuracy": 0.806508875739645, "eval_loss": 0.722186267375946, "eval_runtime": 19.2931, "eval_samples_per_second": 87.596, "eval_steps_per_second": 10.988, "step": 2100 }, { "epoch": 4.91, "learning_rate": 7.732558139534884e-05, "loss": 0.1245, "step": 2110 }, { "epoch": 4.93, "learning_rate": 7.674418604651163e-05, "loss": 0.0414, "step": 2120 }, { "epoch": 4.95, "learning_rate": 7.616279069767443e-05, "loss": 0.0901, "step": 2130 }, { "epoch": 4.98, "learning_rate": 7.558139534883721e-05, "loss": 0.0751, "step": 2140 }, { "epoch": 5.0, "learning_rate": 7.500000000000001e-05, "loss": 0.1456, "step": 2150 }, { "epoch": 5.02, "learning_rate": 7.441860465116279e-05, "loss": 0.0245, "step": 2160 }, { "epoch": 5.05, "learning_rate": 7.383720930232558e-05, "loss": 0.0377, "step": 2170 }, { "epoch": 5.07, "learning_rate": 7.325581395348837e-05, "loss": 0.0435, "step": 2180 }, { "epoch": 5.09, "learning_rate": 7.267441860465116e-05, "loss": 0.0639, "step": 2190 }, { "epoch": 5.12, "learning_rate": 7.209302325581396e-05, "loss": 0.0165, "step": 2200 }, { "epoch": 5.12, "eval_accuracy": 0.8059171597633136, "eval_loss": 0.7515397667884827, "eval_runtime": 19.4193, "eval_samples_per_second": 87.027, "eval_steps_per_second": 10.917, "step": 2200 }, { "epoch": 5.14, "learning_rate": 7.151162790697675e-05, "loss": 0.0338, "step": 2210 }, { "epoch": 5.16, "learning_rate": 7.093023255813955e-05, "loss": 0.0586, "step": 2220 }, { "epoch": 5.19, "learning_rate": 7.034883720930233e-05, "loss": 0.0787, "step": 2230 }, { "epoch": 5.21, "learning_rate": 6.976744186046513e-05, "loss": 0.023, "step": 2240 }, { "epoch": 5.23, "learning_rate": 6.918604651162791e-05, "loss": 0.0681, "step": 2250 }, { "epoch": 5.26, "learning_rate": 6.86046511627907e-05, "loss": 0.0569, "step": 2260 }, { "epoch": 5.28, "learning_rate": 6.802325581395348e-05, "loss": 0.0206, "step": 2270 }, { "epoch": 5.3, "learning_rate": 6.744186046511628e-05, "loss": 0.0369, "step": 2280 }, { "epoch": 5.33, "learning_rate": 6.686046511627908e-05, "loss": 0.0526, "step": 2290 }, { "epoch": 5.35, "learning_rate": 6.627906976744186e-05, "loss": 0.0168, "step": 2300 }, { "epoch": 5.35, "eval_accuracy": 0.821301775147929, "eval_loss": 0.6687235832214355, "eval_runtime": 19.3189, "eval_samples_per_second": 87.479, "eval_steps_per_second": 10.974, "step": 2300 }, { "epoch": 5.37, "learning_rate": 6.569767441860465e-05, "loss": 0.0185, "step": 2310 }, { "epoch": 5.4, "learning_rate": 6.511627906976745e-05, "loss": 0.018, "step": 2320 }, { "epoch": 5.42, "learning_rate": 6.453488372093024e-05, "loss": 0.0503, "step": 2330 }, { "epoch": 5.44, "learning_rate": 6.395348837209303e-05, "loss": 0.0145, "step": 2340 }, { "epoch": 5.47, "learning_rate": 6.337209302325582e-05, "loss": 0.0139, "step": 2350 }, { "epoch": 5.49, "learning_rate": 6.27906976744186e-05, "loss": 0.0134, "step": 2360 }, { "epoch": 5.51, "learning_rate": 6.22093023255814e-05, "loss": 0.0168, "step": 2370 }, { "epoch": 5.53, "learning_rate": 6.162790697674418e-05, "loss": 0.0234, "step": 2380 }, { "epoch": 5.56, "learning_rate": 6.104651162790698e-05, "loss": 0.046, "step": 2390 }, { "epoch": 5.58, "learning_rate": 6.0465116279069765e-05, "loss": 0.0212, "step": 2400 }, { "epoch": 5.58, "eval_accuracy": 0.8248520710059172, "eval_loss": 0.6671048402786255, "eval_runtime": 19.6081, "eval_samples_per_second": 86.189, "eval_steps_per_second": 10.812, "step": 2400 }, { "epoch": 5.6, "learning_rate": 5.9883720930232554e-05, "loss": 0.0182, "step": 2410 }, { "epoch": 5.63, "learning_rate": 5.9302325581395356e-05, "loss": 0.0101, "step": 2420 }, { "epoch": 5.65, "learning_rate": 5.8720930232558145e-05, "loss": 0.0131, "step": 2430 }, { "epoch": 5.67, "learning_rate": 5.8139534883720933e-05, "loss": 0.0592, "step": 2440 }, { "epoch": 5.7, "learning_rate": 5.755813953488373e-05, "loss": 0.0317, "step": 2450 }, { "epoch": 5.72, "learning_rate": 5.697674418604652e-05, "loss": 0.0136, "step": 2460 }, { "epoch": 5.74, "learning_rate": 5.6395348837209306e-05, "loss": 0.024, "step": 2470 }, { "epoch": 5.77, "learning_rate": 5.5813953488372095e-05, "loss": 0.0233, "step": 2480 }, { "epoch": 5.79, "learning_rate": 5.5232558139534884e-05, "loss": 0.034, "step": 2490 }, { "epoch": 5.81, "learning_rate": 5.465116279069767e-05, "loss": 0.0389, "step": 2500 }, { "epoch": 5.81, "eval_accuracy": 0.827810650887574, "eval_loss": 0.6893125176429749, "eval_runtime": 19.5009, "eval_samples_per_second": 86.663, "eval_steps_per_second": 10.871, "step": 2500 }, { "epoch": 5.84, "learning_rate": 5.406976744186046e-05, "loss": 0.0227, "step": 2510 }, { "epoch": 5.86, "learning_rate": 5.348837209302326e-05, "loss": 0.0186, "step": 2520 }, { "epoch": 5.88, "learning_rate": 5.290697674418605e-05, "loss": 0.022, "step": 2530 }, { "epoch": 5.91, "learning_rate": 5.232558139534884e-05, "loss": 0.0373, "step": 2540 }, { "epoch": 5.93, "learning_rate": 5.1744186046511636e-05, "loss": 0.0144, "step": 2550 }, { "epoch": 5.95, "learning_rate": 5.1162790697674425e-05, "loss": 0.0112, "step": 2560 }, { "epoch": 5.98, "learning_rate": 5.0581395348837214e-05, "loss": 0.0331, "step": 2570 }, { "epoch": 6.0, "learning_rate": 5e-05, "loss": 0.0139, "step": 2580 }, { "epoch": 6.02, "learning_rate": 4.941860465116279e-05, "loss": 0.0142, "step": 2590 }, { "epoch": 6.05, "learning_rate": 4.883720930232558e-05, "loss": 0.0087, "step": 2600 }, { "epoch": 6.05, "eval_accuracy": 0.8260355029585799, "eval_loss": 0.6839348077774048, "eval_runtime": 19.5465, "eval_samples_per_second": 86.461, "eval_steps_per_second": 10.846, "step": 2600 }, { "epoch": 6.07, "learning_rate": 4.8255813953488375e-05, "loss": 0.0089, "step": 2610 }, { "epoch": 6.09, "learning_rate": 4.7674418604651164e-05, "loss": 0.0168, "step": 2620 }, { "epoch": 6.12, "learning_rate": 4.709302325581396e-05, "loss": 0.009, "step": 2630 }, { "epoch": 6.14, "learning_rate": 4.651162790697675e-05, "loss": 0.0079, "step": 2640 }, { "epoch": 6.16, "learning_rate": 4.593023255813954e-05, "loss": 0.0092, "step": 2650 }, { "epoch": 6.19, "learning_rate": 4.5348837209302326e-05, "loss": 0.01, "step": 2660 }, { "epoch": 6.21, "learning_rate": 4.476744186046512e-05, "loss": 0.0134, "step": 2670 }, { "epoch": 6.23, "learning_rate": 4.418604651162791e-05, "loss": 0.0265, "step": 2680 }, { "epoch": 6.26, "learning_rate": 4.36046511627907e-05, "loss": 0.0079, "step": 2690 }, { "epoch": 6.28, "learning_rate": 4.302325581395349e-05, "loss": 0.0087, "step": 2700 }, { "epoch": 6.28, "eval_accuracy": 0.8319526627218935, "eval_loss": 0.6412006616592407, "eval_runtime": 19.4572, "eval_samples_per_second": 86.857, "eval_steps_per_second": 10.896, "step": 2700 }, { "epoch": 6.3, "learning_rate": 4.2441860465116276e-05, "loss": 0.0242, "step": 2710 }, { "epoch": 6.33, "learning_rate": 4.186046511627907e-05, "loss": 0.0087, "step": 2720 }, { "epoch": 6.35, "learning_rate": 4.127906976744187e-05, "loss": 0.0097, "step": 2730 }, { "epoch": 6.37, "learning_rate": 4.0697674418604655e-05, "loss": 0.0073, "step": 2740 }, { "epoch": 6.4, "learning_rate": 4.0116279069767444e-05, "loss": 0.0077, "step": 2750 }, { "epoch": 6.42, "learning_rate": 3.953488372093023e-05, "loss": 0.0115, "step": 2760 }, { "epoch": 6.44, "learning_rate": 3.895348837209303e-05, "loss": 0.0076, "step": 2770 }, { "epoch": 6.47, "learning_rate": 3.837209302325582e-05, "loss": 0.0092, "step": 2780 }, { "epoch": 6.49, "learning_rate": 3.7790697674418606e-05, "loss": 0.0071, "step": 2790 }, { "epoch": 6.51, "learning_rate": 3.7209302325581394e-05, "loss": 0.0077, "step": 2800 }, { "epoch": 6.51, "eval_accuracy": 0.8366863905325443, "eval_loss": 0.6366316676139832, "eval_runtime": 19.6422, "eval_samples_per_second": 86.039, "eval_steps_per_second": 10.793, "step": 2800 }, { "epoch": 6.53, "learning_rate": 3.662790697674418e-05, "loss": 0.0081, "step": 2810 }, { "epoch": 6.56, "learning_rate": 3.604651162790698e-05, "loss": 0.0077, "step": 2820 }, { "epoch": 6.58, "learning_rate": 3.5465116279069774e-05, "loss": 0.0068, "step": 2830 }, { "epoch": 6.6, "learning_rate": 3.488372093023256e-05, "loss": 0.0069, "step": 2840 }, { "epoch": 6.63, "learning_rate": 3.430232558139535e-05, "loss": 0.0078, "step": 2850 }, { "epoch": 6.65, "learning_rate": 3.372093023255814e-05, "loss": 0.0069, "step": 2860 }, { "epoch": 6.67, "learning_rate": 3.313953488372093e-05, "loss": 0.0075, "step": 2870 }, { "epoch": 6.7, "learning_rate": 3.2558139534883724e-05, "loss": 0.0088, "step": 2880 }, { "epoch": 6.72, "learning_rate": 3.197674418604651e-05, "loss": 0.007, "step": 2890 }, { "epoch": 6.74, "learning_rate": 3.13953488372093e-05, "loss": 0.0065, "step": 2900 }, { "epoch": 6.74, "eval_accuracy": 0.8272189349112427, "eval_loss": 0.6696515679359436, "eval_runtime": 19.4791, "eval_samples_per_second": 86.76, "eval_steps_per_second": 10.883, "step": 2900 }, { "epoch": 6.77, "learning_rate": 3.081395348837209e-05, "loss": 0.0276, "step": 2910 }, { "epoch": 6.79, "learning_rate": 3.0232558139534883e-05, "loss": 0.0064, "step": 2920 }, { "epoch": 6.81, "learning_rate": 2.9651162790697678e-05, "loss": 0.0064, "step": 2930 }, { "epoch": 6.84, "learning_rate": 2.9069767441860467e-05, "loss": 0.0271, "step": 2940 }, { "epoch": 6.86, "learning_rate": 2.848837209302326e-05, "loss": 0.0062, "step": 2950 }, { "epoch": 6.88, "learning_rate": 2.7906976744186048e-05, "loss": 0.0073, "step": 2960 }, { "epoch": 6.91, "learning_rate": 2.7325581395348836e-05, "loss": 0.0074, "step": 2970 }, { "epoch": 6.93, "learning_rate": 2.674418604651163e-05, "loss": 0.0068, "step": 2980 }, { "epoch": 6.95, "learning_rate": 2.616279069767442e-05, "loss": 0.007, "step": 2990 }, { "epoch": 6.98, "learning_rate": 2.5581395348837212e-05, "loss": 0.0061, "step": 3000 }, { "epoch": 6.98, "eval_accuracy": 0.8349112426035503, "eval_loss": 0.6509989500045776, "eval_runtime": 19.534, "eval_samples_per_second": 86.516, "eval_steps_per_second": 10.853, "step": 3000 }, { "epoch": 7.0, "learning_rate": 2.5e-05, "loss": 0.0411, "step": 3010 }, { "epoch": 7.02, "learning_rate": 2.441860465116279e-05, "loss": 0.006, "step": 3020 }, { "epoch": 7.05, "learning_rate": 2.3837209302325582e-05, "loss": 0.0066, "step": 3030 }, { "epoch": 7.07, "learning_rate": 2.3255813953488374e-05, "loss": 0.0058, "step": 3040 }, { "epoch": 7.09, "learning_rate": 2.2674418604651163e-05, "loss": 0.0064, "step": 3050 }, { "epoch": 7.12, "learning_rate": 2.2093023255813955e-05, "loss": 0.006, "step": 3060 }, { "epoch": 7.14, "learning_rate": 2.1511627906976744e-05, "loss": 0.0064, "step": 3070 }, { "epoch": 7.16, "learning_rate": 2.0930232558139536e-05, "loss": 0.0063, "step": 3080 }, { "epoch": 7.19, "learning_rate": 2.0348837209302328e-05, "loss": 0.0243, "step": 3090 }, { "epoch": 7.21, "learning_rate": 1.9767441860465116e-05, "loss": 0.0185, "step": 3100 }, { "epoch": 7.21, "eval_accuracy": 0.8366863905325443, "eval_loss": 0.6451619267463684, "eval_runtime": 19.7485, "eval_samples_per_second": 85.576, "eval_steps_per_second": 10.735, "step": 3100 }, { "epoch": 7.23, "learning_rate": 1.918604651162791e-05, "loss": 0.0111, "step": 3110 }, { "epoch": 7.26, "learning_rate": 1.8604651162790697e-05, "loss": 0.0063, "step": 3120 }, { "epoch": 7.28, "learning_rate": 1.802325581395349e-05, "loss": 0.007, "step": 3130 }, { "epoch": 7.3, "learning_rate": 1.744186046511628e-05, "loss": 0.006, "step": 3140 }, { "epoch": 7.33, "learning_rate": 1.686046511627907e-05, "loss": 0.0064, "step": 3150 }, { "epoch": 7.35, "learning_rate": 1.6279069767441862e-05, "loss": 0.006, "step": 3160 }, { "epoch": 7.37, "learning_rate": 1.569767441860465e-05, "loss": 0.0059, "step": 3170 }, { "epoch": 7.4, "learning_rate": 1.5116279069767441e-05, "loss": 0.0053, "step": 3180 }, { "epoch": 7.42, "learning_rate": 1.4534883720930233e-05, "loss": 0.006, "step": 3190 }, { "epoch": 7.44, "learning_rate": 1.3953488372093024e-05, "loss": 0.0059, "step": 3200 }, { "epoch": 7.44, "eval_accuracy": 0.8378698224852071, "eval_loss": 0.6426283717155457, "eval_runtime": 19.368, "eval_samples_per_second": 87.257, "eval_steps_per_second": 10.946, "step": 3200 }, { "epoch": 7.47, "learning_rate": 1.3372093023255814e-05, "loss": 0.0059, "step": 3210 }, { "epoch": 7.49, "learning_rate": 1.2790697674418606e-05, "loss": 0.0057, "step": 3220 }, { "epoch": 7.51, "learning_rate": 1.2209302325581395e-05, "loss": 0.006, "step": 3230 }, { "epoch": 7.53, "learning_rate": 1.1627906976744187e-05, "loss": 0.0053, "step": 3240 }, { "epoch": 7.56, "learning_rate": 1.1046511627906977e-05, "loss": 0.0053, "step": 3250 }, { "epoch": 7.58, "learning_rate": 1.0465116279069768e-05, "loss": 0.0062, "step": 3260 }, { "epoch": 7.6, "learning_rate": 9.883720930232558e-06, "loss": 0.0061, "step": 3270 }, { "epoch": 7.63, "learning_rate": 9.302325581395349e-06, "loss": 0.0061, "step": 3280 }, { "epoch": 7.65, "learning_rate": 8.72093023255814e-06, "loss": 0.0053, "step": 3290 }, { "epoch": 7.67, "learning_rate": 8.139534883720931e-06, "loss": 0.0062, "step": 3300 }, { "epoch": 7.67, "eval_accuracy": 0.8378698224852071, "eval_loss": 0.6398439407348633, "eval_runtime": 20.4188, "eval_samples_per_second": 82.767, "eval_steps_per_second": 10.383, "step": 3300 }, { "epoch": 7.7, "learning_rate": 7.558139534883721e-06, "loss": 0.0056, "step": 3310 }, { "epoch": 7.72, "learning_rate": 6.976744186046512e-06, "loss": 0.0058, "step": 3320 }, { "epoch": 7.74, "learning_rate": 6.395348837209303e-06, "loss": 0.0058, "step": 3330 }, { "epoch": 7.77, "learning_rate": 5.8139534883720935e-06, "loss": 0.0058, "step": 3340 }, { "epoch": 7.79, "learning_rate": 5.232558139534884e-06, "loss": 0.0062, "step": 3350 }, { "epoch": 7.81, "learning_rate": 4.651162790697674e-06, "loss": 0.0055, "step": 3360 }, { "epoch": 7.84, "learning_rate": 4.0697674418604655e-06, "loss": 0.0053, "step": 3370 }, { "epoch": 7.86, "learning_rate": 3.488372093023256e-06, "loss": 0.0057, "step": 3380 }, { "epoch": 7.88, "learning_rate": 2.9069767441860468e-06, "loss": 0.006, "step": 3390 }, { "epoch": 7.91, "learning_rate": 2.325581395348837e-06, "loss": 0.0315, "step": 3400 }, { "epoch": 7.91, "eval_accuracy": 0.8384615384615385, "eval_loss": 0.6396650075912476, "eval_runtime": 19.5469, "eval_samples_per_second": 86.459, "eval_steps_per_second": 10.846, "step": 3400 }, { "epoch": 7.93, "learning_rate": 1.744186046511628e-06, "loss": 0.0074, "step": 3410 }, { "epoch": 7.95, "learning_rate": 1.1627906976744186e-06, "loss": 0.0057, "step": 3420 }, { "epoch": 7.98, "learning_rate": 5.813953488372093e-07, "loss": 0.0058, "step": 3430 }, { "epoch": 8.0, "learning_rate": 0.0, "loss": 0.0057, "step": 3440 }, { "epoch": 8.0, "step": 3440, "total_flos": 4.2707785173722726e+18, "train_loss": 0.6754590564342432, "train_runtime": 2008.5618, "train_samples_per_second": 27.403, "train_steps_per_second": 1.713 } ], "max_steps": 3440, "num_train_epochs": 8, "total_flos": 4.2707785173722726e+18, "trial_name": null, "trial_params": null }