{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.10031347962382445, "eval_steps": 500, "global_step": 128, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 2e-05, "loss": 0.7469, "step": 1 }, { "epoch": 0.0, "learning_rate": 4e-05, "loss": 0.7617, "step": 2 }, { "epoch": 0.0, "learning_rate": 6e-05, "loss": 0.8527, "step": 3 }, { "epoch": 0.0, "learning_rate": 8e-05, "loss": 0.7475, "step": 4 }, { "epoch": 0.0, "learning_rate": 0.0001, "loss": 0.7094, "step": 5 }, { "epoch": 0.0, "learning_rate": 0.00012, "loss": 0.6652, "step": 6 }, { "epoch": 0.01, "learning_rate": 0.00014, "loss": 0.5645, "step": 7 }, { "epoch": 0.01, "learning_rate": 0.00016, "loss": 0.5129, "step": 8 }, { "epoch": 0.01, "learning_rate": 0.00018, "loss": 0.3689, "step": 9 }, { "epoch": 0.01, "learning_rate": 0.0002, "loss": 0.4651, "step": 10 }, { "epoch": 0.01, "learning_rate": 0.00019996456111234527, "loss": 0.3466, "step": 11 }, { "epoch": 0.01, "learning_rate": 0.0001998582695676762, "loss": 0.288, "step": 12 }, { "epoch": 0.01, "learning_rate": 0.000199681200703075, "loss": 0.2641, "step": 13 }, { "epoch": 0.01, "learning_rate": 0.00019943348002101371, "loss": 0.1956, "step": 14 }, { "epoch": 0.01, "learning_rate": 0.00019911528310040074, "loss": 0.2055, "step": 15 }, { "epoch": 0.01, "learning_rate": 0.00019872683547213446, "loss": 0.2808, "step": 16 }, { "epoch": 0.01, "learning_rate": 0.00019826841245925212, "loss": 0.2246, "step": 17 }, { "epoch": 0.01, "learning_rate": 0.00019774033898178667, "loss": 0.2667, "step": 18 }, { "epoch": 0.01, "learning_rate": 0.00019714298932647098, "loss": 0.2157, "step": 19 }, { "epoch": 0.02, "learning_rate": 0.0001964767868814516, "loss": 0.2702, "step": 20 }, { "epoch": 0.02, "learning_rate": 0.00019574220383620055, "loss": 0.3036, "step": 21 }, { "epoch": 0.02, "learning_rate": 0.00019493976084683813, "loss": 0.2306, "step": 22 }, { "epoch": 0.02, "learning_rate": 0.00019407002666710336, "loss": 0.2523, "step": 23 }, { "epoch": 0.02, "learning_rate": 0.00019313361774523385, "loss": 0.1678, "step": 24 }, { "epoch": 0.02, "learning_rate": 0.00019213119778704128, "loss": 0.1514, "step": 25 }, { "epoch": 0.02, "learning_rate": 0.00019106347728549135, "loss": 0.2651, "step": 26 }, { "epoch": 0.02, "learning_rate": 0.00018993121301712193, "loss": 0.2442, "step": 27 }, { "epoch": 0.02, "learning_rate": 0.00018873520750565718, "loss": 0.2395, "step": 28 }, { "epoch": 0.02, "learning_rate": 0.00018747630845319612, "loss": 0.1471, "step": 29 }, { "epoch": 0.02, "learning_rate": 0.0001861554081393806, "loss": 0.2239, "step": 30 }, { "epoch": 0.02, "learning_rate": 0.0001847734427889671, "loss": 0.176, "step": 31 }, { "epoch": 0.03, "learning_rate": 0.0001833313919082515, "loss": 0.2475, "step": 32 }, { "epoch": 0.03, "learning_rate": 0.0001818302775908169, "loss": 0.286, "step": 33 }, { "epoch": 0.03, "learning_rate": 0.00018027116379309638, "loss": 0.0994, "step": 34 }, { "epoch": 0.03, "learning_rate": 0.00017865515558026428, "loss": 0.1781, "step": 35 }, { "epoch": 0.03, "learning_rate": 0.00017698339834299061, "loss": 0.2149, "step": 36 }, { "epoch": 0.03, "learning_rate": 0.00017525707698561385, "loss": 0.1176, "step": 37 }, { "epoch": 0.03, "learning_rate": 0.00017347741508630672, "loss": 0.2308, "step": 38 }, { "epoch": 0.03, "learning_rate": 0.00017164567402983152, "loss": 0.1913, "step": 39 }, { "epoch": 0.03, "learning_rate": 0.0001697631521134985, "loss": 0.1527, "step": 40 }, { "epoch": 0.03, "learning_rate": 0.00016783118362696163, "loss": 0.1628, "step": 41 }, { "epoch": 0.03, "learning_rate": 0.00016585113790650388, "loss": 0.1501, "step": 42 }, { "epoch": 0.03, "learning_rate": 0.00016382441836448202, "loss": 0.1326, "step": 43 }, { "epoch": 0.03, "learning_rate": 0.0001617524614946192, "loss": 0.1057, "step": 44 }, { "epoch": 0.04, "learning_rate": 0.00015963673585385016, "loss": 0.1666, "step": 45 }, { "epoch": 0.04, "learning_rate": 0.0001574787410214407, "loss": 0.1454, "step": 46 }, { "epoch": 0.04, "learning_rate": 0.00015528000653611935, "loss": 0.2029, "step": 47 }, { "epoch": 0.04, "learning_rate": 0.00015304209081197425, "loss": 0.2525, "step": 48 }, { "epoch": 0.04, "learning_rate": 0.000150766580033884, "loss": 0.2038, "step": 49 }, { "epoch": 0.04, "learning_rate": 0.00014845508703326504, "loss": 0.2174, "step": 50 }, { "epoch": 0.04, "learning_rate": 0.0001461092501449326, "loss": 0.204, "step": 51 }, { "epoch": 0.04, "learning_rate": 0.00014373073204588556, "loss": 0.159, "step": 52 }, { "epoch": 0.04, "learning_rate": 0.00014132121857683783, "loss": 0.2731, "step": 53 }, { "epoch": 0.04, "learning_rate": 0.00013888241754733208, "loss": 0.1232, "step": 54 }, { "epoch": 0.04, "learning_rate": 0.00013641605752528224, "loss": 0.1696, "step": 55 }, { "epoch": 0.04, "learning_rate": 0.00013392388661180303, "loss": 0.2264, "step": 56 }, { "epoch": 0.04, "learning_rate": 0.0001314076712021949, "loss": 0.2123, "step": 57 }, { "epoch": 0.05, "learning_rate": 0.0001288691947339621, "loss": 0.1561, "step": 58 }, { "epoch": 0.05, "learning_rate": 0.00012631025642275212, "loss": 0.1864, "step": 59 }, { "epoch": 0.05, "learning_rate": 0.0001237326699871115, "loss": 0.2353, "step": 60 }, { "epoch": 0.05, "learning_rate": 0.00012113826236296244, "loss": 0.2863, "step": 61 }, { "epoch": 0.05, "learning_rate": 0.00011852887240871145, "loss": 0.2241, "step": 62 }, { "epoch": 0.05, "learning_rate": 0.00011590634960190721, "loss": 0.1076, "step": 63 }, { "epoch": 0.05, "learning_rate": 0.00011327255272837221, "loss": 0.0993, "step": 64 }, { "epoch": 0.05, "learning_rate": 0.00011062934856473655, "loss": 0.2469, "step": 65 }, { "epoch": 0.05, "learning_rate": 0.00010797861055530831, "loss": 0.1613, "step": 66 }, { "epoch": 0.05, "learning_rate": 0.00010532221748421787, "loss": 0.2165, "step": 67 }, { "epoch": 0.05, "learning_rate": 0.00010266205214377748, "loss": 0.1614, "step": 68 }, { "epoch": 0.05, "learning_rate": 0.0001, "loss": 0.2062, "step": 69 }, { "epoch": 0.05, "learning_rate": 9.733794785622253e-05, "loss": 0.2491, "step": 70 }, { "epoch": 0.06, "learning_rate": 9.467778251578217e-05, "loss": 0.1994, "step": 71 }, { "epoch": 0.06, "learning_rate": 9.202138944469168e-05, "loss": 0.1555, "step": 72 }, { "epoch": 0.06, "learning_rate": 8.937065143526347e-05, "loss": 0.0907, "step": 73 }, { "epoch": 0.06, "learning_rate": 8.672744727162781e-05, "loss": 0.0849, "step": 74 }, { "epoch": 0.06, "learning_rate": 8.409365039809281e-05, "loss": 0.1289, "step": 75 }, { "epoch": 0.06, "learning_rate": 8.147112759128859e-05, "loss": 0.1881, "step": 76 }, { "epoch": 0.06, "learning_rate": 7.886173763703757e-05, "loss": 0.09, "step": 77 }, { "epoch": 0.06, "learning_rate": 7.626733001288851e-05, "loss": 0.3259, "step": 78 }, { "epoch": 0.06, "learning_rate": 7.368974357724789e-05, "loss": 0.1752, "step": 79 }, { "epoch": 0.06, "learning_rate": 7.113080526603792e-05, "loss": 0.2569, "step": 80 }, { "epoch": 0.06, "learning_rate": 6.859232879780515e-05, "loss": 0.118, "step": 81 }, { "epoch": 0.06, "learning_rate": 6.607611338819697e-05, "loss": 0.1885, "step": 82 }, { "epoch": 0.07, "learning_rate": 6.358394247471778e-05, "loss": 0.1382, "step": 83 }, { "epoch": 0.07, "learning_rate": 6.111758245266794e-05, "loss": 0.0643, "step": 84 }, { "epoch": 0.07, "learning_rate": 5.867878142316221e-05, "loss": 0.2721, "step": 85 }, { "epoch": 0.07, "learning_rate": 5.626926795411447e-05, "loss": 0.1497, "step": 86 }, { "epoch": 0.07, "learning_rate": 5.38907498550674e-05, "loss": 0.1311, "step": 87 }, { "epoch": 0.07, "learning_rate": 5.1544912966734994e-05, "loss": 0.1305, "step": 88 }, { "epoch": 0.07, "learning_rate": 4.9233419966116036e-05, "loss": 0.1846, "step": 89 }, { "epoch": 0.07, "learning_rate": 4.695790918802576e-05, "loss": 0.2248, "step": 90 }, { "epoch": 0.07, "learning_rate": 4.47199934638807e-05, "loss": 0.2505, "step": 91 }, { "epoch": 0.07, "learning_rate": 4.252125897855932e-05, "loss": 0.1422, "step": 92 }, { "epoch": 0.07, "learning_rate": 4.036326414614985e-05, "loss": 0.2374, "step": 93 }, { "epoch": 0.07, "learning_rate": 3.824753850538082e-05, "loss": 0.083, "step": 94 }, { "epoch": 0.07, "learning_rate": 3.617558163551802e-05, "loss": 0.1934, "step": 95 }, { "epoch": 0.08, "learning_rate": 3.414886209349615e-05, "loss": 0.1593, "step": 96 }, { "epoch": 0.08, "learning_rate": 3.216881637303839e-05, "loss": 0.1621, "step": 97 }, { "epoch": 0.08, "learning_rate": 3.0236847886501542e-05, "loss": 0.2919, "step": 98 }, { "epoch": 0.08, "learning_rate": 2.8354325970168484e-05, "loss": 0.1741, "step": 99 }, { "epoch": 0.08, "learning_rate": 2.6522584913693294e-05, "loss": 0.0845, "step": 100 }, { "epoch": 0.08, "learning_rate": 2.4742923014386156e-05, "loss": 0.2681, "step": 101 }, { "epoch": 0.08, "learning_rate": 2.301660165700936e-05, "loss": 0.1332, "step": 102 }, { "epoch": 0.08, "learning_rate": 2.1344844419735755e-05, "loss": 0.1809, "step": 103 }, { "epoch": 0.08, "learning_rate": 1.9728836206903656e-05, "loss": 0.2586, "step": 104 }, { "epoch": 0.08, "learning_rate": 1.8169722409183097e-05, "loss": 0.1661, "step": 105 }, { "epoch": 0.08, "learning_rate": 1.6668608091748495e-05, "loss": 0.2437, "step": 106 }, { "epoch": 0.08, "learning_rate": 1.522655721103291e-05, "loss": 0.1746, "step": 107 }, { "epoch": 0.08, "learning_rate": 1.3844591860619383e-05, "loss": 0.2686, "step": 108 }, { "epoch": 0.09, "learning_rate": 1.2523691546803873e-05, "loss": 0.2711, "step": 109 }, { "epoch": 0.09, "learning_rate": 1.1264792494342857e-05, "loss": 0.2428, "step": 110 }, { "epoch": 0.09, "learning_rate": 1.0068786982878087e-05, "loss": 0.2276, "step": 111 }, { "epoch": 0.09, "learning_rate": 8.936522714508678e-06, "loss": 0.2001, "step": 112 }, { "epoch": 0.09, "learning_rate": 7.868802212958703e-06, "loss": 0.2749, "step": 113 }, { "epoch": 0.09, "learning_rate": 6.866382254766157e-06, "loss": 0.2148, "step": 114 }, { "epoch": 0.09, "learning_rate": 5.929973332896677e-06, "loss": 0.1606, "step": 115 }, { "epoch": 0.09, "learning_rate": 5.060239153161872e-06, "loss": 0.1503, "step": 116 }, { "epoch": 0.09, "learning_rate": 4.257796163799455e-06, "loss": 0.1724, "step": 117 }, { "epoch": 0.09, "learning_rate": 3.5232131185484076e-06, "loss": 0.1867, "step": 118 }, { "epoch": 0.09, "learning_rate": 2.857010673529015e-06, "loss": 0.3419, "step": 119 }, { "epoch": 0.09, "learning_rate": 2.259661018213333e-06, "loss": 0.3071, "step": 120 }, { "epoch": 0.09, "learning_rate": 1.7315875407479032e-06, "loss": 0.117, "step": 121 }, { "epoch": 0.1, "learning_rate": 1.2731645278655445e-06, "loss": 0.2245, "step": 122 }, { "epoch": 0.1, "learning_rate": 8.847168995992916e-07, "loss": 0.0688, "step": 123 }, { "epoch": 0.1, "learning_rate": 5.665199789862907e-07, "loss": 0.1772, "step": 124 }, { "epoch": 0.1, "learning_rate": 3.1879929692498757e-07, "loss": 0.1484, "step": 125 }, { "epoch": 0.1, "learning_rate": 1.4173043232380557e-07, "loss": 0.133, "step": 126 }, { "epoch": 0.1, "learning_rate": 3.5438887654737355e-08, "loss": 0.1682, "step": 127 }, { "epoch": 0.1, "learning_rate": 0.0, "loss": 0.1149, "step": 128 } ], "logging_steps": 1, "max_steps": 128, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "total_flos": 5.921036416707461e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }