{ "best_metric": null, "best_model_checkpoint": null, "epoch": 76.8, "eval_steps": 500, "global_step": 480, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.96, "eval_accuracy": 0.22988505747126436, "eval_loss": 1.7957335710525513, "eval_runtime": 2.3499, "eval_samples_per_second": 37.022, "eval_steps_per_second": 1.277, "step": 6 }, { "epoch": 1.6, "grad_norm": 5.402665615081787, "learning_rate": 1.0416666666666668e-05, "loss": 1.8656, "step": 10 }, { "epoch": 1.92, "eval_accuracy": 0.27586206896551724, "eval_loss": 1.7703895568847656, "eval_runtime": 2.2621, "eval_samples_per_second": 38.46, "eval_steps_per_second": 1.326, "step": 12 }, { "epoch": 2.88, "eval_accuracy": 0.3218390804597701, "eval_loss": 1.738166332244873, "eval_runtime": 2.047, "eval_samples_per_second": 42.502, "eval_steps_per_second": 1.466, "step": 18 }, { "epoch": 3.2, "grad_norm": 9.074636459350586, "learning_rate": 2.0833333333333336e-05, "loss": 1.7835, "step": 20 }, { "epoch": 4.0, "eval_accuracy": 0.3793103448275862, "eval_loss": 1.6673917770385742, "eval_runtime": 2.0721, "eval_samples_per_second": 41.987, "eval_steps_per_second": 1.448, "step": 25 }, { "epoch": 4.8, "grad_norm": 23.296255111694336, "learning_rate": 3.125e-05, "loss": 1.664, "step": 30 }, { "epoch": 4.96, "eval_accuracy": 0.42528735632183906, "eval_loss": 1.5981522798538208, "eval_runtime": 2.0749, "eval_samples_per_second": 41.93, "eval_steps_per_second": 1.446, "step": 31 }, { "epoch": 5.92, "eval_accuracy": 0.4367816091954023, "eval_loss": 1.4861106872558594, "eval_runtime": 2.0842, "eval_samples_per_second": 41.743, "eval_steps_per_second": 1.439, "step": 37 }, { "epoch": 6.4, "grad_norm": 36.56019592285156, "learning_rate": 4.166666666666667e-05, "loss": 1.5072, "step": 40 }, { "epoch": 6.88, "eval_accuracy": 0.47126436781609193, "eval_loss": 1.3644713163375854, "eval_runtime": 2.0344, "eval_samples_per_second": 42.765, "eval_steps_per_second": 1.475, "step": 43 }, { "epoch": 8.0, "grad_norm": 70.99100494384766, "learning_rate": 4.976851851851852e-05, "loss": 1.3304, "step": 50 }, { "epoch": 8.0, "eval_accuracy": 0.45977011494252873, "eval_loss": 1.285918116569519, "eval_runtime": 2.0671, "eval_samples_per_second": 42.088, "eval_steps_per_second": 1.451, "step": 50 }, { "epoch": 8.96, "eval_accuracy": 0.47126436781609193, "eval_loss": 1.2795610427856445, "eval_runtime": 2.0462, "eval_samples_per_second": 42.519, "eval_steps_per_second": 1.466, "step": 56 }, { "epoch": 9.6, "grad_norm": 60.155181884765625, "learning_rate": 4.8611111111111115e-05, "loss": 1.1651, "step": 60 }, { "epoch": 9.92, "eval_accuracy": 0.5172413793103449, "eval_loss": 1.2455964088439941, "eval_runtime": 2.0479, "eval_samples_per_second": 42.483, "eval_steps_per_second": 1.465, "step": 62 }, { "epoch": 10.88, "eval_accuracy": 0.5402298850574713, "eval_loss": 1.1666686534881592, "eval_runtime": 2.0486, "eval_samples_per_second": 42.468, "eval_steps_per_second": 1.464, "step": 68 }, { "epoch": 11.2, "grad_norm": 17.20172119140625, "learning_rate": 4.745370370370371e-05, "loss": 1.0876, "step": 70 }, { "epoch": 12.0, "eval_accuracy": 0.5632183908045977, "eval_loss": 1.1510032415390015, "eval_runtime": 2.0486, "eval_samples_per_second": 42.468, "eval_steps_per_second": 1.464, "step": 75 }, { "epoch": 12.8, "grad_norm": 98.55331420898438, "learning_rate": 4.62962962962963e-05, "loss": 1.0046, "step": 80 }, { "epoch": 12.96, "eval_accuracy": 0.6091954022988506, "eval_loss": 1.0509852170944214, "eval_runtime": 2.2, "eval_samples_per_second": 39.546, "eval_steps_per_second": 1.364, "step": 81 }, { "epoch": 13.92, "eval_accuracy": 0.5862068965517241, "eval_loss": 1.033838152885437, "eval_runtime": 2.0133, "eval_samples_per_second": 43.212, "eval_steps_per_second": 1.49, "step": 87 }, { "epoch": 14.4, "grad_norm": 53.443302154541016, "learning_rate": 4.5138888888888894e-05, "loss": 0.9465, "step": 90 }, { "epoch": 14.88, "eval_accuracy": 0.5862068965517241, "eval_loss": 0.9883113503456116, "eval_runtime": 2.05, "eval_samples_per_second": 42.439, "eval_steps_per_second": 1.463, "step": 93 }, { "epoch": 16.0, "grad_norm": 30.475088119506836, "learning_rate": 4.3981481481481486e-05, "loss": 0.8699, "step": 100 }, { "epoch": 16.0, "eval_accuracy": 0.5632183908045977, "eval_loss": 0.9881502389907837, "eval_runtime": 2.0881, "eval_samples_per_second": 41.664, "eval_steps_per_second": 1.437, "step": 100 }, { "epoch": 16.96, "eval_accuracy": 0.5747126436781609, "eval_loss": 0.9276102781295776, "eval_runtime": 2.0889, "eval_samples_per_second": 41.648, "eval_steps_per_second": 1.436, "step": 106 }, { "epoch": 17.6, "grad_norm": 21.802074432373047, "learning_rate": 4.282407407407408e-05, "loss": 0.7969, "step": 110 }, { "epoch": 17.92, "eval_accuracy": 0.5862068965517241, "eval_loss": 0.9144545197486877, "eval_runtime": 2.0314, "eval_samples_per_second": 42.828, "eval_steps_per_second": 1.477, "step": 112 }, { "epoch": 18.88, "eval_accuracy": 0.6666666666666666, "eval_loss": 0.8143898844718933, "eval_runtime": 2.0134, "eval_samples_per_second": 43.21, "eval_steps_per_second": 1.49, "step": 118 }, { "epoch": 19.2, "grad_norm": 58.785552978515625, "learning_rate": 4.166666666666667e-05, "loss": 0.7254, "step": 120 }, { "epoch": 20.0, "eval_accuracy": 0.6666666666666666, "eval_loss": 0.7586901187896729, "eval_runtime": 2.0526, "eval_samples_per_second": 42.386, "eval_steps_per_second": 1.462, "step": 125 }, { "epoch": 20.8, "grad_norm": 24.079566955566406, "learning_rate": 4.0509259259259265e-05, "loss": 0.6447, "step": 130 }, { "epoch": 20.96, "eval_accuracy": 0.7471264367816092, "eval_loss": 0.6990374326705933, "eval_runtime": 2.0625, "eval_samples_per_second": 42.182, "eval_steps_per_second": 1.455, "step": 131 }, { "epoch": 21.92, "eval_accuracy": 0.7241379310344828, "eval_loss": 0.7041503190994263, "eval_runtime": 2.0267, "eval_samples_per_second": 42.926, "eval_steps_per_second": 1.48, "step": 137 }, { "epoch": 22.4, "grad_norm": 24.671459197998047, "learning_rate": 3.935185185185186e-05, "loss": 0.6021, "step": 140 }, { "epoch": 22.88, "eval_accuracy": 0.7701149425287356, "eval_loss": 0.6526182293891907, "eval_runtime": 2.1122, "eval_samples_per_second": 41.189, "eval_steps_per_second": 1.42, "step": 143 }, { "epoch": 24.0, "grad_norm": 55.466121673583984, "learning_rate": 3.8194444444444444e-05, "loss": 0.516, "step": 150 }, { "epoch": 24.0, "eval_accuracy": 0.8045977011494253, "eval_loss": 0.6485260128974915, "eval_runtime": 2.0692, "eval_samples_per_second": 42.046, "eval_steps_per_second": 1.45, "step": 150 }, { "epoch": 24.96, "eval_accuracy": 0.8160919540229885, "eval_loss": 0.5802629590034485, "eval_runtime": 2.0421, "eval_samples_per_second": 42.603, "eval_steps_per_second": 1.469, "step": 156 }, { "epoch": 25.6, "grad_norm": 17.66895294189453, "learning_rate": 3.7037037037037037e-05, "loss": 0.4497, "step": 160 }, { "epoch": 25.92, "eval_accuracy": 0.8045977011494253, "eval_loss": 0.6084781289100647, "eval_runtime": 2.0191, "eval_samples_per_second": 43.088, "eval_steps_per_second": 1.486, "step": 162 }, { "epoch": 26.88, "eval_accuracy": 0.8045977011494253, "eval_loss": 0.6094852685928345, "eval_runtime": 1.9897, "eval_samples_per_second": 43.724, "eval_steps_per_second": 1.508, "step": 168 }, { "epoch": 27.2, "grad_norm": 31.39649200439453, "learning_rate": 3.587962962962963e-05, "loss": 0.3935, "step": 170 }, { "epoch": 28.0, "eval_accuracy": 0.8275862068965517, "eval_loss": 0.5372287034988403, "eval_runtime": 2.0637, "eval_samples_per_second": 42.158, "eval_steps_per_second": 1.454, "step": 175 }, { "epoch": 28.8, "grad_norm": 31.86827278137207, "learning_rate": 3.472222222222222e-05, "loss": 0.3321, "step": 180 }, { "epoch": 28.96, "eval_accuracy": 0.8160919540229885, "eval_loss": 0.5828755497932434, "eval_runtime": 2.1428, "eval_samples_per_second": 40.6, "eval_steps_per_second": 1.4, "step": 181 }, { "epoch": 29.92, "eval_accuracy": 0.8160919540229885, "eval_loss": 0.6204901337623596, "eval_runtime": 2.0154, "eval_samples_per_second": 43.168, "eval_steps_per_second": 1.489, "step": 187 }, { "epoch": 30.4, "grad_norm": 42.88612747192383, "learning_rate": 3.3564814814814815e-05, "loss": 0.3007, "step": 190 }, { "epoch": 30.88, "eval_accuracy": 0.8275862068965517, "eval_loss": 0.5149825811386108, "eval_runtime": 2.0492, "eval_samples_per_second": 42.456, "eval_steps_per_second": 1.464, "step": 193 }, { "epoch": 32.0, "grad_norm": 30.13237190246582, "learning_rate": 3.240740740740741e-05, "loss": 0.2618, "step": 200 }, { "epoch": 32.0, "eval_accuracy": 0.8390804597701149, "eval_loss": 0.6068965196609497, "eval_runtime": 2.0657, "eval_samples_per_second": 42.117, "eval_steps_per_second": 1.452, "step": 200 }, { "epoch": 32.96, "eval_accuracy": 0.8390804597701149, "eval_loss": 0.5272508859634399, "eval_runtime": 2.0395, "eval_samples_per_second": 42.657, "eval_steps_per_second": 1.471, "step": 206 }, { "epoch": 33.6, "grad_norm": 24.97075080871582, "learning_rate": 3.125e-05, "loss": 0.2411, "step": 210 }, { "epoch": 33.92, "eval_accuracy": 0.8620689655172413, "eval_loss": 0.4726714789867401, "eval_runtime": 2.0758, "eval_samples_per_second": 41.912, "eval_steps_per_second": 1.445, "step": 212 }, { "epoch": 34.88, "eval_accuracy": 0.8735632183908046, "eval_loss": 0.4611084461212158, "eval_runtime": 2.0264, "eval_samples_per_second": 42.934, "eval_steps_per_second": 1.48, "step": 218 }, { "epoch": 35.2, "grad_norm": 60.3193359375, "learning_rate": 3.0092592592592593e-05, "loss": 0.2108, "step": 220 }, { "epoch": 36.0, "eval_accuracy": 0.8505747126436781, "eval_loss": 0.5696073770523071, "eval_runtime": 2.0919, "eval_samples_per_second": 41.589, "eval_steps_per_second": 1.434, "step": 225 }, { "epoch": 36.8, "grad_norm": 16.915546417236328, "learning_rate": 2.8935185185185186e-05, "loss": 0.2143, "step": 230 }, { "epoch": 36.96, "eval_accuracy": 0.8620689655172413, "eval_loss": 0.49439194798469543, "eval_runtime": 2.0923, "eval_samples_per_second": 41.58, "eval_steps_per_second": 1.434, "step": 231 }, { "epoch": 37.92, "eval_accuracy": 0.8160919540229885, "eval_loss": 0.5627816915512085, "eval_runtime": 2.0503, "eval_samples_per_second": 42.432, "eval_steps_per_second": 1.463, "step": 237 }, { "epoch": 38.4, "grad_norm": 14.699493408203125, "learning_rate": 2.777777777777778e-05, "loss": 0.1663, "step": 240 }, { "epoch": 38.88, "eval_accuracy": 0.8045977011494253, "eval_loss": 0.6131365895271301, "eval_runtime": 2.0693, "eval_samples_per_second": 42.044, "eval_steps_per_second": 1.45, "step": 243 }, { "epoch": 40.0, "grad_norm": 25.7874755859375, "learning_rate": 2.6620370370370372e-05, "loss": 0.1714, "step": 250 }, { "epoch": 40.0, "eval_accuracy": 0.8505747126436781, "eval_loss": 0.4961901605129242, "eval_runtime": 2.0252, "eval_samples_per_second": 42.959, "eval_steps_per_second": 1.481, "step": 250 }, { "epoch": 40.96, "eval_accuracy": 0.8390804597701149, "eval_loss": 0.5022612810134888, "eval_runtime": 2.127, "eval_samples_per_second": 40.904, "eval_steps_per_second": 1.41, "step": 256 }, { "epoch": 41.6, "grad_norm": 24.087005615234375, "learning_rate": 2.5462962962962965e-05, "loss": 0.174, "step": 260 }, { "epoch": 41.92, "eval_accuracy": 0.8275862068965517, "eval_loss": 0.48418501019477844, "eval_runtime": 2.0168, "eval_samples_per_second": 43.137, "eval_steps_per_second": 1.487, "step": 262 }, { "epoch": 42.88, "eval_accuracy": 0.8275862068965517, "eval_loss": 0.46790340542793274, "eval_runtime": 2.0909, "eval_samples_per_second": 41.609, "eval_steps_per_second": 1.435, "step": 268 }, { "epoch": 43.2, "grad_norm": 13.284588813781738, "learning_rate": 2.4305555555555558e-05, "loss": 0.138, "step": 270 }, { "epoch": 44.0, "eval_accuracy": 0.8160919540229885, "eval_loss": 0.6270841956138611, "eval_runtime": 2.1069, "eval_samples_per_second": 41.294, "eval_steps_per_second": 1.424, "step": 275 }, { "epoch": 44.8, "grad_norm": 14.41830062866211, "learning_rate": 2.314814814814815e-05, "loss": 0.1437, "step": 280 }, { "epoch": 44.96, "eval_accuracy": 0.8505747126436781, "eval_loss": 0.5325595736503601, "eval_runtime": 2.1982, "eval_samples_per_second": 39.578, "eval_steps_per_second": 1.365, "step": 281 }, { "epoch": 45.92, "eval_accuracy": 0.8160919540229885, "eval_loss": 0.5655315518379211, "eval_runtime": 2.0683, "eval_samples_per_second": 42.063, "eval_steps_per_second": 1.45, "step": 287 }, { "epoch": 46.4, "grad_norm": 17.588279724121094, "learning_rate": 2.1990740740740743e-05, "loss": 0.136, "step": 290 }, { "epoch": 46.88, "eval_accuracy": 0.8390804597701149, "eval_loss": 0.46718767285346985, "eval_runtime": 2.0892, "eval_samples_per_second": 41.643, "eval_steps_per_second": 1.436, "step": 293 }, { "epoch": 48.0, "grad_norm": 22.864524841308594, "learning_rate": 2.0833333333333336e-05, "loss": 0.1401, "step": 300 }, { "epoch": 48.0, "eval_accuracy": 0.8620689655172413, "eval_loss": 0.498960942029953, "eval_runtime": 2.0484, "eval_samples_per_second": 42.471, "eval_steps_per_second": 1.465, "step": 300 }, { "epoch": 48.96, "eval_accuracy": 0.8275862068965517, "eval_loss": 0.5445386171340942, "eval_runtime": 2.0365, "eval_samples_per_second": 42.721, "eval_steps_per_second": 1.473, "step": 306 }, { "epoch": 49.6, "grad_norm": 22.651620864868164, "learning_rate": 1.967592592592593e-05, "loss": 0.1281, "step": 310 }, { "epoch": 49.92, "eval_accuracy": 0.8735632183908046, "eval_loss": 0.47610902786254883, "eval_runtime": 2.1166, "eval_samples_per_second": 41.104, "eval_steps_per_second": 1.417, "step": 312 }, { "epoch": 50.88, "eval_accuracy": 0.8505747126436781, "eval_loss": 0.5665103793144226, "eval_runtime": 2.1168, "eval_samples_per_second": 41.1, "eval_steps_per_second": 1.417, "step": 318 }, { "epoch": 51.2, "grad_norm": 26.539594650268555, "learning_rate": 1.8518518518518518e-05, "loss": 0.1156, "step": 320 }, { "epoch": 52.0, "eval_accuracy": 0.8505747126436781, "eval_loss": 0.5089926719665527, "eval_runtime": 2.0775, "eval_samples_per_second": 41.877, "eval_steps_per_second": 1.444, "step": 325 }, { "epoch": 52.8, "grad_norm": 23.464221954345703, "learning_rate": 1.736111111111111e-05, "loss": 0.0981, "step": 330 }, { "epoch": 52.96, "eval_accuracy": 0.8505747126436781, "eval_loss": 0.5152259469032288, "eval_runtime": 2.0607, "eval_samples_per_second": 42.219, "eval_steps_per_second": 1.456, "step": 331 }, { "epoch": 53.92, "eval_accuracy": 0.8160919540229885, "eval_loss": 0.5466004610061646, "eval_runtime": 2.0591, "eval_samples_per_second": 42.251, "eval_steps_per_second": 1.457, "step": 337 }, { "epoch": 54.4, "grad_norm": 14.581974983215332, "learning_rate": 1.6203703703703704e-05, "loss": 0.1055, "step": 340 }, { "epoch": 54.88, "eval_accuracy": 0.8275862068965517, "eval_loss": 0.5390048623085022, "eval_runtime": 2.0443, "eval_samples_per_second": 42.558, "eval_steps_per_second": 1.468, "step": 343 }, { "epoch": 56.0, "grad_norm": 14.774139404296875, "learning_rate": 1.5046296296296297e-05, "loss": 0.112, "step": 350 }, { "epoch": 56.0, "eval_accuracy": 0.8505747126436781, "eval_loss": 0.5574498176574707, "eval_runtime": 2.0874, "eval_samples_per_second": 41.679, "eval_steps_per_second": 1.437, "step": 350 }, { "epoch": 56.96, "eval_accuracy": 0.8505747126436781, "eval_loss": 0.5448784828186035, "eval_runtime": 2.0514, "eval_samples_per_second": 42.41, "eval_steps_per_second": 1.462, "step": 356 }, { "epoch": 57.6, "grad_norm": 18.17756462097168, "learning_rate": 1.388888888888889e-05, "loss": 0.0855, "step": 360 }, { "epoch": 57.92, "eval_accuracy": 0.8505747126436781, "eval_loss": 0.5390240550041199, "eval_runtime": 2.077, "eval_samples_per_second": 41.888, "eval_steps_per_second": 1.444, "step": 362 }, { "epoch": 58.88, "eval_accuracy": 0.8505747126436781, "eval_loss": 0.5206344723701477, "eval_runtime": 2.0568, "eval_samples_per_second": 42.299, "eval_steps_per_second": 1.459, "step": 368 }, { "epoch": 59.2, "grad_norm": 40.29678726196289, "learning_rate": 1.2731481481481482e-05, "loss": 0.0899, "step": 370 }, { "epoch": 60.0, "eval_accuracy": 0.8620689655172413, "eval_loss": 0.5475941300392151, "eval_runtime": 2.063, "eval_samples_per_second": 42.172, "eval_steps_per_second": 1.454, "step": 375 }, { "epoch": 60.8, "grad_norm": 33.910377502441406, "learning_rate": 1.1574074074074075e-05, "loss": 0.1026, "step": 380 }, { "epoch": 60.96, "eval_accuracy": 0.8505747126436781, "eval_loss": 0.5344437956809998, "eval_runtime": 2.298, "eval_samples_per_second": 37.858, "eval_steps_per_second": 1.305, "step": 381 }, { "epoch": 61.92, "eval_accuracy": 0.8390804597701149, "eval_loss": 0.553070068359375, "eval_runtime": 2.1032, "eval_samples_per_second": 41.366, "eval_steps_per_second": 1.426, "step": 387 }, { "epoch": 62.4, "grad_norm": 13.71580982208252, "learning_rate": 1.0416666666666668e-05, "loss": 0.0799, "step": 390 }, { "epoch": 62.88, "eval_accuracy": 0.8275862068965517, "eval_loss": 0.57228684425354, "eval_runtime": 2.0779, "eval_samples_per_second": 41.868, "eval_steps_per_second": 1.444, "step": 393 }, { "epoch": 64.0, "grad_norm": 28.238468170166016, "learning_rate": 9.259259259259259e-06, "loss": 0.0844, "step": 400 }, { "epoch": 64.0, "eval_accuracy": 0.8160919540229885, "eval_loss": 0.5339850783348083, "eval_runtime": 2.0258, "eval_samples_per_second": 42.946, "eval_steps_per_second": 1.481, "step": 400 }, { "epoch": 64.96, "eval_accuracy": 0.8735632183908046, "eval_loss": 0.52364581823349, "eval_runtime": 2.0251, "eval_samples_per_second": 42.961, "eval_steps_per_second": 1.481, "step": 406 }, { "epoch": 65.6, "grad_norm": 10.24560832977295, "learning_rate": 8.101851851851852e-06, "loss": 0.0724, "step": 410 }, { "epoch": 65.92, "eval_accuracy": 0.8390804597701149, "eval_loss": 0.6136645674705505, "eval_runtime": 2.03, "eval_samples_per_second": 42.858, "eval_steps_per_second": 1.478, "step": 412 }, { "epoch": 66.88, "eval_accuracy": 0.8275862068965517, "eval_loss": 0.5824962854385376, "eval_runtime": 2.0787, "eval_samples_per_second": 41.854, "eval_steps_per_second": 1.443, "step": 418 }, { "epoch": 67.2, "grad_norm": 20.803382873535156, "learning_rate": 6.944444444444445e-06, "loss": 0.0867, "step": 420 }, { "epoch": 68.0, "eval_accuracy": 0.8620689655172413, "eval_loss": 0.510515034198761, "eval_runtime": 2.0565, "eval_samples_per_second": 42.305, "eval_steps_per_second": 1.459, "step": 425 }, { "epoch": 68.8, "grad_norm": 15.880162239074707, "learning_rate": 5.787037037037038e-06, "loss": 0.071, "step": 430 }, { "epoch": 68.96, "eval_accuracy": 0.8505747126436781, "eval_loss": 0.5272470116615295, "eval_runtime": 2.0378, "eval_samples_per_second": 42.693, "eval_steps_per_second": 1.472, "step": 431 }, { "epoch": 69.92, "eval_accuracy": 0.8505747126436781, "eval_loss": 0.5523571372032166, "eval_runtime": 2.0569, "eval_samples_per_second": 42.297, "eval_steps_per_second": 1.459, "step": 437 }, { "epoch": 70.4, "grad_norm": 14.639904975891113, "learning_rate": 4.6296296296296296e-06, "loss": 0.0723, "step": 440 }, { "epoch": 70.88, "eval_accuracy": 0.8390804597701149, "eval_loss": 0.5507646799087524, "eval_runtime": 2.1114, "eval_samples_per_second": 41.205, "eval_steps_per_second": 1.421, "step": 443 }, { "epoch": 72.0, "grad_norm": 6.164122104644775, "learning_rate": 3.4722222222222224e-06, "loss": 0.0748, "step": 450 }, { "epoch": 72.0, "eval_accuracy": 0.8160919540229885, "eval_loss": 0.568942129611969, "eval_runtime": 2.0852, "eval_samples_per_second": 41.723, "eval_steps_per_second": 1.439, "step": 450 }, { "epoch": 72.96, "eval_accuracy": 0.8505747126436781, "eval_loss": 0.555583119392395, "eval_runtime": 2.0316, "eval_samples_per_second": 42.824, "eval_steps_per_second": 1.477, "step": 456 }, { "epoch": 73.6, "grad_norm": 11.653559684753418, "learning_rate": 2.3148148148148148e-06, "loss": 0.0589, "step": 460 }, { "epoch": 73.92, "eval_accuracy": 0.8505747126436781, "eval_loss": 0.5452274084091187, "eval_runtime": 2.0938, "eval_samples_per_second": 41.551, "eval_steps_per_second": 1.433, "step": 462 }, { "epoch": 74.88, "eval_accuracy": 0.8620689655172413, "eval_loss": 0.5475078225135803, "eval_runtime": 2.0547, "eval_samples_per_second": 42.342, "eval_steps_per_second": 1.46, "step": 468 }, { "epoch": 75.2, "grad_norm": 21.146989822387695, "learning_rate": 1.1574074074074074e-06, "loss": 0.0719, "step": 470 }, { "epoch": 76.0, "eval_accuracy": 0.8620689655172413, "eval_loss": 0.5483731031417847, "eval_runtime": 2.1022, "eval_samples_per_second": 41.386, "eval_steps_per_second": 1.427, "step": 475 }, { "epoch": 76.8, "grad_norm": 12.87066650390625, "learning_rate": 0.0, "loss": 0.0801, "step": 480 }, { "epoch": 76.8, "eval_accuracy": 0.8620689655172413, "eval_loss": 0.5496163368225098, "eval_runtime": 2.0924, "eval_samples_per_second": 41.58, "eval_steps_per_second": 1.434, "step": 480 }, { "epoch": 76.8, "step": 480, "total_flos": 1.514063180200919e+18, "train_loss": 0.449434948215882, "train_runtime": 1985.4684, "train_samples_per_second": 31.549, "train_steps_per_second": 0.242 } ], "logging_steps": 10, "max_steps": 480, "num_input_tokens_seen": 0, "num_train_epochs": 80, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.514063180200919e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }