{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 76.8,
  "eval_steps": 500,
  "global_step": 480,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.96,
      "eval_accuracy": 0.22988505747126436,
      "eval_loss": 1.7957335710525513,
      "eval_runtime": 2.3499,
      "eval_samples_per_second": 37.022,
      "eval_steps_per_second": 1.277,
      "step": 6
    },
    {
      "epoch": 1.6,
      "grad_norm": 5.402665615081787,
      "learning_rate": 1.0416666666666668e-05,
      "loss": 1.8656,
      "step": 10
    },
    {
      "epoch": 1.92,
      "eval_accuracy": 0.27586206896551724,
      "eval_loss": 1.7703895568847656,
      "eval_runtime": 2.2621,
      "eval_samples_per_second": 38.46,
      "eval_steps_per_second": 1.326,
      "step": 12
    },
    {
      "epoch": 2.88,
      "eval_accuracy": 0.3218390804597701,
      "eval_loss": 1.738166332244873,
      "eval_runtime": 2.047,
      "eval_samples_per_second": 42.502,
      "eval_steps_per_second": 1.466,
      "step": 18
    },
    {
      "epoch": 3.2,
      "grad_norm": 9.074636459350586,
      "learning_rate": 2.0833333333333336e-05,
      "loss": 1.7835,
      "step": 20
    },
    {
      "epoch": 4.0,
      "eval_accuracy": 0.3793103448275862,
      "eval_loss": 1.6673917770385742,
      "eval_runtime": 2.0721,
      "eval_samples_per_second": 41.987,
      "eval_steps_per_second": 1.448,
      "step": 25
    },
    {
      "epoch": 4.8,
      "grad_norm": 23.296255111694336,
      "learning_rate": 3.125e-05,
      "loss": 1.664,
      "step": 30
    },
    {
      "epoch": 4.96,
      "eval_accuracy": 0.42528735632183906,
      "eval_loss": 1.5981522798538208,
      "eval_runtime": 2.0749,
      "eval_samples_per_second": 41.93,
      "eval_steps_per_second": 1.446,
      "step": 31
    },
    {
      "epoch": 5.92,
      "eval_accuracy": 0.4367816091954023,
      "eval_loss": 1.4861106872558594,
      "eval_runtime": 2.0842,
      "eval_samples_per_second": 41.743,
      "eval_steps_per_second": 1.439,
      "step": 37
    },
    {
      "epoch": 6.4,
      "grad_norm": 36.56019592285156,
      "learning_rate": 4.166666666666667e-05,
      "loss": 1.5072,
      "step": 40
    },
    {
      "epoch": 6.88,
      "eval_accuracy": 0.47126436781609193,
      "eval_loss": 1.3644713163375854,
      "eval_runtime": 2.0344,
      "eval_samples_per_second": 42.765,
      "eval_steps_per_second": 1.475,
      "step": 43
    },
    {
      "epoch": 8.0,
      "grad_norm": 70.99100494384766,
      "learning_rate": 4.976851851851852e-05,
      "loss": 1.3304,
      "step": 50
    },
    {
      "epoch": 8.0,
      "eval_accuracy": 0.45977011494252873,
      "eval_loss": 1.285918116569519,
      "eval_runtime": 2.0671,
      "eval_samples_per_second": 42.088,
      "eval_steps_per_second": 1.451,
      "step": 50
    },
    {
      "epoch": 8.96,
      "eval_accuracy": 0.47126436781609193,
      "eval_loss": 1.2795610427856445,
      "eval_runtime": 2.0462,
      "eval_samples_per_second": 42.519,
      "eval_steps_per_second": 1.466,
      "step": 56
    },
    {
      "epoch": 9.6,
      "grad_norm": 60.155181884765625,
      "learning_rate": 4.8611111111111115e-05,
      "loss": 1.1651,
      "step": 60
    },
    {
      "epoch": 9.92,
      "eval_accuracy": 0.5172413793103449,
      "eval_loss": 1.2455964088439941,
      "eval_runtime": 2.0479,
      "eval_samples_per_second": 42.483,
      "eval_steps_per_second": 1.465,
      "step": 62
    },
    {
      "epoch": 10.88,
      "eval_accuracy": 0.5402298850574713,
      "eval_loss": 1.1666686534881592,
      "eval_runtime": 2.0486,
      "eval_samples_per_second": 42.468,
      "eval_steps_per_second": 1.464,
      "step": 68
    },
    {
      "epoch": 11.2,
      "grad_norm": 17.20172119140625,
      "learning_rate": 4.745370370370371e-05,
      "loss": 1.0876,
      "step": 70
    },
    {
      "epoch": 12.0,
      "eval_accuracy": 0.5632183908045977,
      "eval_loss": 1.1510032415390015,
      "eval_runtime": 2.0486,
      "eval_samples_per_second": 42.468,
      "eval_steps_per_second": 1.464,
      "step": 75
    },
    {
      "epoch": 12.8,
      "grad_norm": 98.55331420898438,
      "learning_rate": 4.62962962962963e-05,
      "loss": 1.0046,
      "step": 80
    },
    {
      "epoch": 12.96,
      "eval_accuracy": 0.6091954022988506,
      "eval_loss": 1.0509852170944214,
      "eval_runtime": 2.2,
      "eval_samples_per_second": 39.546,
      "eval_steps_per_second": 1.364,
      "step": 81
    },
    {
      "epoch": 13.92,
      "eval_accuracy": 0.5862068965517241,
      "eval_loss": 1.033838152885437,
      "eval_runtime": 2.0133,
      "eval_samples_per_second": 43.212,
      "eval_steps_per_second": 1.49,
      "step": 87
    },
    {
      "epoch": 14.4,
      "grad_norm": 53.443302154541016,
      "learning_rate": 4.5138888888888894e-05,
      "loss": 0.9465,
      "step": 90
    },
    {
      "epoch": 14.88,
      "eval_accuracy": 0.5862068965517241,
      "eval_loss": 0.9883113503456116,
      "eval_runtime": 2.05,
      "eval_samples_per_second": 42.439,
      "eval_steps_per_second": 1.463,
      "step": 93
    },
    {
      "epoch": 16.0,
      "grad_norm": 30.475088119506836,
      "learning_rate": 4.3981481481481486e-05,
      "loss": 0.8699,
      "step": 100
    },
    {
      "epoch": 16.0,
      "eval_accuracy": 0.5632183908045977,
      "eval_loss": 0.9881502389907837,
      "eval_runtime": 2.0881,
      "eval_samples_per_second": 41.664,
      "eval_steps_per_second": 1.437,
      "step": 100
    },
    {
      "epoch": 16.96,
      "eval_accuracy": 0.5747126436781609,
      "eval_loss": 0.9276102781295776,
      "eval_runtime": 2.0889,
      "eval_samples_per_second": 41.648,
      "eval_steps_per_second": 1.436,
      "step": 106
    },
    {
      "epoch": 17.6,
      "grad_norm": 21.802074432373047,
      "learning_rate": 4.282407407407408e-05,
      "loss": 0.7969,
      "step": 110
    },
    {
      "epoch": 17.92,
      "eval_accuracy": 0.5862068965517241,
      "eval_loss": 0.9144545197486877,
      "eval_runtime": 2.0314,
      "eval_samples_per_second": 42.828,
      "eval_steps_per_second": 1.477,
      "step": 112
    },
    {
      "epoch": 18.88,
      "eval_accuracy": 0.6666666666666666,
      "eval_loss": 0.8143898844718933,
      "eval_runtime": 2.0134,
      "eval_samples_per_second": 43.21,
      "eval_steps_per_second": 1.49,
      "step": 118
    },
    {
      "epoch": 19.2,
      "grad_norm": 58.785552978515625,
      "learning_rate": 4.166666666666667e-05,
      "loss": 0.7254,
      "step": 120
    },
    {
      "epoch": 20.0,
      "eval_accuracy": 0.6666666666666666,
      "eval_loss": 0.7586901187896729,
      "eval_runtime": 2.0526,
      "eval_samples_per_second": 42.386,
      "eval_steps_per_second": 1.462,
      "step": 125
    },
    {
      "epoch": 20.8,
      "grad_norm": 24.079566955566406,
      "learning_rate": 4.0509259259259265e-05,
      "loss": 0.6447,
      "step": 130
    },
    {
      "epoch": 20.96,
      "eval_accuracy": 0.7471264367816092,
      "eval_loss": 0.6990374326705933,
      "eval_runtime": 2.0625,
      "eval_samples_per_second": 42.182,
      "eval_steps_per_second": 1.455,
      "step": 131
    },
    {
      "epoch": 21.92,
      "eval_accuracy": 0.7241379310344828,
      "eval_loss": 0.7041503190994263,
      "eval_runtime": 2.0267,
      "eval_samples_per_second": 42.926,
      "eval_steps_per_second": 1.48,
      "step": 137
    },
    {
      "epoch": 22.4,
      "grad_norm": 24.671459197998047,
      "learning_rate": 3.935185185185186e-05,
      "loss": 0.6021,
      "step": 140
    },
    {
      "epoch": 22.88,
      "eval_accuracy": 0.7701149425287356,
      "eval_loss": 0.6526182293891907,
      "eval_runtime": 2.1122,
      "eval_samples_per_second": 41.189,
      "eval_steps_per_second": 1.42,
      "step": 143
    },
    {
      "epoch": 24.0,
      "grad_norm": 55.466121673583984,
      "learning_rate": 3.8194444444444444e-05,
      "loss": 0.516,
      "step": 150
    },
    {
      "epoch": 24.0,
      "eval_accuracy": 0.8045977011494253,
      "eval_loss": 0.6485260128974915,
      "eval_runtime": 2.0692,
      "eval_samples_per_second": 42.046,
      "eval_steps_per_second": 1.45,
      "step": 150
    },
    {
      "epoch": 24.96,
      "eval_accuracy": 0.8160919540229885,
      "eval_loss": 0.5802629590034485,
      "eval_runtime": 2.0421,
      "eval_samples_per_second": 42.603,
      "eval_steps_per_second": 1.469,
      "step": 156
    },
    {
      "epoch": 25.6,
      "grad_norm": 17.66895294189453,
      "learning_rate": 3.7037037037037037e-05,
      "loss": 0.4497,
      "step": 160
    },
    {
      "epoch": 25.92,
      "eval_accuracy": 0.8045977011494253,
      "eval_loss": 0.6084781289100647,
      "eval_runtime": 2.0191,
      "eval_samples_per_second": 43.088,
      "eval_steps_per_second": 1.486,
      "step": 162
    },
    {
      "epoch": 26.88,
      "eval_accuracy": 0.8045977011494253,
      "eval_loss": 0.6094852685928345,
      "eval_runtime": 1.9897,
      "eval_samples_per_second": 43.724,
      "eval_steps_per_second": 1.508,
      "step": 168
    },
    {
      "epoch": 27.2,
      "grad_norm": 31.39649200439453,
      "learning_rate": 3.587962962962963e-05,
      "loss": 0.3935,
      "step": 170
    },
    {
      "epoch": 28.0,
      "eval_accuracy": 0.8275862068965517,
      "eval_loss": 0.5372287034988403,
      "eval_runtime": 2.0637,
      "eval_samples_per_second": 42.158,
      "eval_steps_per_second": 1.454,
      "step": 175
    },
    {
      "epoch": 28.8,
      "grad_norm": 31.86827278137207,
      "learning_rate": 3.472222222222222e-05,
      "loss": 0.3321,
      "step": 180
    },
    {
      "epoch": 28.96,
      "eval_accuracy": 0.8160919540229885,
      "eval_loss": 0.5828755497932434,
      "eval_runtime": 2.1428,
      "eval_samples_per_second": 40.6,
      "eval_steps_per_second": 1.4,
      "step": 181
    },
    {
      "epoch": 29.92,
      "eval_accuracy": 0.8160919540229885,
      "eval_loss": 0.6204901337623596,
      "eval_runtime": 2.0154,
      "eval_samples_per_second": 43.168,
      "eval_steps_per_second": 1.489,
      "step": 187
    },
    {
      "epoch": 30.4,
      "grad_norm": 42.88612747192383,
      "learning_rate": 3.3564814814814815e-05,
      "loss": 0.3007,
      "step": 190
    },
    {
      "epoch": 30.88,
      "eval_accuracy": 0.8275862068965517,
      "eval_loss": 0.5149825811386108,
      "eval_runtime": 2.0492,
      "eval_samples_per_second": 42.456,
      "eval_steps_per_second": 1.464,
      "step": 193
    },
    {
      "epoch": 32.0,
      "grad_norm": 30.13237190246582,
      "learning_rate": 3.240740740740741e-05,
      "loss": 0.2618,
      "step": 200
    },
    {
      "epoch": 32.0,
      "eval_accuracy": 0.8390804597701149,
      "eval_loss": 0.6068965196609497,
      "eval_runtime": 2.0657,
      "eval_samples_per_second": 42.117,
      "eval_steps_per_second": 1.452,
      "step": 200
    },
    {
      "epoch": 32.96,
      "eval_accuracy": 0.8390804597701149,
      "eval_loss": 0.5272508859634399,
      "eval_runtime": 2.0395,
      "eval_samples_per_second": 42.657,
      "eval_steps_per_second": 1.471,
      "step": 206
    },
    {
      "epoch": 33.6,
      "grad_norm": 24.97075080871582,
      "learning_rate": 3.125e-05,
      "loss": 0.2411,
      "step": 210
    },
    {
      "epoch": 33.92,
      "eval_accuracy": 0.8620689655172413,
      "eval_loss": 0.4726714789867401,
      "eval_runtime": 2.0758,
      "eval_samples_per_second": 41.912,
      "eval_steps_per_second": 1.445,
      "step": 212
    },
    {
      "epoch": 34.88,
      "eval_accuracy": 0.8735632183908046,
      "eval_loss": 0.4611084461212158,
      "eval_runtime": 2.0264,
      "eval_samples_per_second": 42.934,
      "eval_steps_per_second": 1.48,
      "step": 218
    },
    {
      "epoch": 35.2,
      "grad_norm": 60.3193359375,
      "learning_rate": 3.0092592592592593e-05,
      "loss": 0.2108,
      "step": 220
    },
    {
      "epoch": 36.0,
      "eval_accuracy": 0.8505747126436781,
      "eval_loss": 0.5696073770523071,
      "eval_runtime": 2.0919,
      "eval_samples_per_second": 41.589,
      "eval_steps_per_second": 1.434,
      "step": 225
    },
    {
      "epoch": 36.8,
      "grad_norm": 16.915546417236328,
      "learning_rate": 2.8935185185185186e-05,
      "loss": 0.2143,
      "step": 230
    },
    {
      "epoch": 36.96,
      "eval_accuracy": 0.8620689655172413,
      "eval_loss": 0.49439194798469543,
      "eval_runtime": 2.0923,
      "eval_samples_per_second": 41.58,
      "eval_steps_per_second": 1.434,
      "step": 231
    },
    {
      "epoch": 37.92,
      "eval_accuracy": 0.8160919540229885,
      "eval_loss": 0.5627816915512085,
      "eval_runtime": 2.0503,
      "eval_samples_per_second": 42.432,
      "eval_steps_per_second": 1.463,
      "step": 237
    },
    {
      "epoch": 38.4,
      "grad_norm": 14.699493408203125,
      "learning_rate": 2.777777777777778e-05,
      "loss": 0.1663,
      "step": 240
    },
    {
      "epoch": 38.88,
      "eval_accuracy": 0.8045977011494253,
      "eval_loss": 0.6131365895271301,
      "eval_runtime": 2.0693,
      "eval_samples_per_second": 42.044,
      "eval_steps_per_second": 1.45,
      "step": 243
    },
    {
      "epoch": 40.0,
      "grad_norm": 25.7874755859375,
      "learning_rate": 2.6620370370370372e-05,
      "loss": 0.1714,
      "step": 250
    },
    {
      "epoch": 40.0,
      "eval_accuracy": 0.8505747126436781,
      "eval_loss": 0.4961901605129242,
      "eval_runtime": 2.0252,
      "eval_samples_per_second": 42.959,
      "eval_steps_per_second": 1.481,
      "step": 250
    },
    {
      "epoch": 40.96,
      "eval_accuracy": 0.8390804597701149,
      "eval_loss": 0.5022612810134888,
      "eval_runtime": 2.127,
      "eval_samples_per_second": 40.904,
      "eval_steps_per_second": 1.41,
      "step": 256
    },
    {
      "epoch": 41.6,
      "grad_norm": 24.087005615234375,
      "learning_rate": 2.5462962962962965e-05,
      "loss": 0.174,
      "step": 260
    },
    {
      "epoch": 41.92,
      "eval_accuracy": 0.8275862068965517,
      "eval_loss": 0.48418501019477844,
      "eval_runtime": 2.0168,
      "eval_samples_per_second": 43.137,
      "eval_steps_per_second": 1.487,
      "step": 262
    },
    {
      "epoch": 42.88,
      "eval_accuracy": 0.8275862068965517,
      "eval_loss": 0.46790340542793274,
      "eval_runtime": 2.0909,
      "eval_samples_per_second": 41.609,
      "eval_steps_per_second": 1.435,
      "step": 268
    },
    {
      "epoch": 43.2,
      "grad_norm": 13.284588813781738,
      "learning_rate": 2.4305555555555558e-05,
      "loss": 0.138,
      "step": 270
    },
    {
      "epoch": 44.0,
      "eval_accuracy": 0.8160919540229885,
      "eval_loss": 0.6270841956138611,
      "eval_runtime": 2.1069,
      "eval_samples_per_second": 41.294,
      "eval_steps_per_second": 1.424,
      "step": 275
    },
    {
      "epoch": 44.8,
      "grad_norm": 14.41830062866211,
      "learning_rate": 2.314814814814815e-05,
      "loss": 0.1437,
      "step": 280
    },
    {
      "epoch": 44.96,
      "eval_accuracy": 0.8505747126436781,
      "eval_loss": 0.5325595736503601,
      "eval_runtime": 2.1982,
      "eval_samples_per_second": 39.578,
      "eval_steps_per_second": 1.365,
      "step": 281
    },
    {
      "epoch": 45.92,
      "eval_accuracy": 0.8160919540229885,
      "eval_loss": 0.5655315518379211,
      "eval_runtime": 2.0683,
      "eval_samples_per_second": 42.063,
      "eval_steps_per_second": 1.45,
      "step": 287
    },
    {
      "epoch": 46.4,
      "grad_norm": 17.588279724121094,
      "learning_rate": 2.1990740740740743e-05,
      "loss": 0.136,
      "step": 290
    },
    {
      "epoch": 46.88,
      "eval_accuracy": 0.8390804597701149,
      "eval_loss": 0.46718767285346985,
      "eval_runtime": 2.0892,
      "eval_samples_per_second": 41.643,
      "eval_steps_per_second": 1.436,
      "step": 293
    },
    {
      "epoch": 48.0,
      "grad_norm": 22.864524841308594,
      "learning_rate": 2.0833333333333336e-05,
      "loss": 0.1401,
      "step": 300
    },
    {
      "epoch": 48.0,
      "eval_accuracy": 0.8620689655172413,
      "eval_loss": 0.498960942029953,
      "eval_runtime": 2.0484,
      "eval_samples_per_second": 42.471,
      "eval_steps_per_second": 1.465,
      "step": 300
    },
    {
      "epoch": 48.96,
      "eval_accuracy": 0.8275862068965517,
      "eval_loss": 0.5445386171340942,
      "eval_runtime": 2.0365,
      "eval_samples_per_second": 42.721,
      "eval_steps_per_second": 1.473,
      "step": 306
    },
    {
      "epoch": 49.6,
      "grad_norm": 22.651620864868164,
      "learning_rate": 1.967592592592593e-05,
      "loss": 0.1281,
      "step": 310
    },
    {
      "epoch": 49.92,
      "eval_accuracy": 0.8735632183908046,
      "eval_loss": 0.47610902786254883,
      "eval_runtime": 2.1166,
      "eval_samples_per_second": 41.104,
      "eval_steps_per_second": 1.417,
      "step": 312
    },
    {
      "epoch": 50.88,
      "eval_accuracy": 0.8505747126436781,
      "eval_loss": 0.5665103793144226,
      "eval_runtime": 2.1168,
      "eval_samples_per_second": 41.1,
      "eval_steps_per_second": 1.417,
      "step": 318
    },
    {
      "epoch": 51.2,
      "grad_norm": 26.539594650268555,
      "learning_rate": 1.8518518518518518e-05,
      "loss": 0.1156,
      "step": 320
    },
    {
      "epoch": 52.0,
      "eval_accuracy": 0.8505747126436781,
      "eval_loss": 0.5089926719665527,
      "eval_runtime": 2.0775,
      "eval_samples_per_second": 41.877,
      "eval_steps_per_second": 1.444,
      "step": 325
    },
    {
      "epoch": 52.8,
      "grad_norm": 23.464221954345703,
      "learning_rate": 1.736111111111111e-05,
      "loss": 0.0981,
      "step": 330
    },
    {
      "epoch": 52.96,
      "eval_accuracy": 0.8505747126436781,
      "eval_loss": 0.5152259469032288,
      "eval_runtime": 2.0607,
      "eval_samples_per_second": 42.219,
      "eval_steps_per_second": 1.456,
      "step": 331
    },
    {
      "epoch": 53.92,
      "eval_accuracy": 0.8160919540229885,
      "eval_loss": 0.5466004610061646,
      "eval_runtime": 2.0591,
      "eval_samples_per_second": 42.251,
      "eval_steps_per_second": 1.457,
      "step": 337
    },
    {
      "epoch": 54.4,
      "grad_norm": 14.581974983215332,
      "learning_rate": 1.6203703703703704e-05,
      "loss": 0.1055,
      "step": 340
    },
    {
      "epoch": 54.88,
      "eval_accuracy": 0.8275862068965517,
      "eval_loss": 0.5390048623085022,
      "eval_runtime": 2.0443,
      "eval_samples_per_second": 42.558,
      "eval_steps_per_second": 1.468,
      "step": 343
    },
    {
      "epoch": 56.0,
      "grad_norm": 14.774139404296875,
      "learning_rate": 1.5046296296296297e-05,
      "loss": 0.112,
      "step": 350
    },
    {
      "epoch": 56.0,
      "eval_accuracy": 0.8505747126436781,
      "eval_loss": 0.5574498176574707,
      "eval_runtime": 2.0874,
      "eval_samples_per_second": 41.679,
      "eval_steps_per_second": 1.437,
      "step": 350
    },
    {
      "epoch": 56.96,
      "eval_accuracy": 0.8505747126436781,
      "eval_loss": 0.5448784828186035,
      "eval_runtime": 2.0514,
      "eval_samples_per_second": 42.41,
      "eval_steps_per_second": 1.462,
      "step": 356
    },
    {
      "epoch": 57.6,
      "grad_norm": 18.17756462097168,
      "learning_rate": 1.388888888888889e-05,
      "loss": 0.0855,
      "step": 360
    },
    {
      "epoch": 57.92,
      "eval_accuracy": 0.8505747126436781,
      "eval_loss": 0.5390240550041199,
      "eval_runtime": 2.077,
      "eval_samples_per_second": 41.888,
      "eval_steps_per_second": 1.444,
      "step": 362
    },
    {
      "epoch": 58.88,
      "eval_accuracy": 0.8505747126436781,
      "eval_loss": 0.5206344723701477,
      "eval_runtime": 2.0568,
      "eval_samples_per_second": 42.299,
      "eval_steps_per_second": 1.459,
      "step": 368
    },
    {
      "epoch": 59.2,
      "grad_norm": 40.29678726196289,
      "learning_rate": 1.2731481481481482e-05,
      "loss": 0.0899,
      "step": 370
    },
    {
      "epoch": 60.0,
      "eval_accuracy": 0.8620689655172413,
      "eval_loss": 0.5475941300392151,
      "eval_runtime": 2.063,
      "eval_samples_per_second": 42.172,
      "eval_steps_per_second": 1.454,
      "step": 375
    },
    {
      "epoch": 60.8,
      "grad_norm": 33.910377502441406,
      "learning_rate": 1.1574074074074075e-05,
      "loss": 0.1026,
      "step": 380
    },
    {
      "epoch": 60.96,
      "eval_accuracy": 0.8505747126436781,
      "eval_loss": 0.5344437956809998,
      "eval_runtime": 2.298,
      "eval_samples_per_second": 37.858,
      "eval_steps_per_second": 1.305,
      "step": 381
    },
    {
      "epoch": 61.92,
      "eval_accuracy": 0.8390804597701149,
      "eval_loss": 0.553070068359375,
      "eval_runtime": 2.1032,
      "eval_samples_per_second": 41.366,
      "eval_steps_per_second": 1.426,
      "step": 387
    },
    {
      "epoch": 62.4,
      "grad_norm": 13.71580982208252,
      "learning_rate": 1.0416666666666668e-05,
      "loss": 0.0799,
      "step": 390
    },
    {
      "epoch": 62.88,
      "eval_accuracy": 0.8275862068965517,
      "eval_loss": 0.57228684425354,
      "eval_runtime": 2.0779,
      "eval_samples_per_second": 41.868,
      "eval_steps_per_second": 1.444,
      "step": 393
    },
    {
      "epoch": 64.0,
      "grad_norm": 28.238468170166016,
      "learning_rate": 9.259259259259259e-06,
      "loss": 0.0844,
      "step": 400
    },
    {
      "epoch": 64.0,
      "eval_accuracy": 0.8160919540229885,
      "eval_loss": 0.5339850783348083,
      "eval_runtime": 2.0258,
      "eval_samples_per_second": 42.946,
      "eval_steps_per_second": 1.481,
      "step": 400
    },
    {
      "epoch": 64.96,
      "eval_accuracy": 0.8735632183908046,
      "eval_loss": 0.52364581823349,
      "eval_runtime": 2.0251,
      "eval_samples_per_second": 42.961,
      "eval_steps_per_second": 1.481,
      "step": 406
    },
    {
      "epoch": 65.6,
      "grad_norm": 10.24560832977295,
      "learning_rate": 8.101851851851852e-06,
      "loss": 0.0724,
      "step": 410
    },
    {
      "epoch": 65.92,
      "eval_accuracy": 0.8390804597701149,
      "eval_loss": 0.6136645674705505,
      "eval_runtime": 2.03,
      "eval_samples_per_second": 42.858,
      "eval_steps_per_second": 1.478,
      "step": 412
    },
    {
      "epoch": 66.88,
      "eval_accuracy": 0.8275862068965517,
      "eval_loss": 0.5824962854385376,
      "eval_runtime": 2.0787,
      "eval_samples_per_second": 41.854,
      "eval_steps_per_second": 1.443,
      "step": 418
    },
    {
      "epoch": 67.2,
      "grad_norm": 20.803382873535156,
      "learning_rate": 6.944444444444445e-06,
      "loss": 0.0867,
      "step": 420
    },
    {
      "epoch": 68.0,
      "eval_accuracy": 0.8620689655172413,
      "eval_loss": 0.510515034198761,
      "eval_runtime": 2.0565,
      "eval_samples_per_second": 42.305,
      "eval_steps_per_second": 1.459,
      "step": 425
    },
    {
      "epoch": 68.8,
      "grad_norm": 15.880162239074707,
      "learning_rate": 5.787037037037038e-06,
      "loss": 0.071,
      "step": 430
    },
    {
      "epoch": 68.96,
      "eval_accuracy": 0.8505747126436781,
      "eval_loss": 0.5272470116615295,
      "eval_runtime": 2.0378,
      "eval_samples_per_second": 42.693,
      "eval_steps_per_second": 1.472,
      "step": 431
    },
    {
      "epoch": 69.92,
      "eval_accuracy": 0.8505747126436781,
      "eval_loss": 0.5523571372032166,
      "eval_runtime": 2.0569,
      "eval_samples_per_second": 42.297,
      "eval_steps_per_second": 1.459,
      "step": 437
    },
    {
      "epoch": 70.4,
      "grad_norm": 14.639904975891113,
      "learning_rate": 4.6296296296296296e-06,
      "loss": 0.0723,
      "step": 440
    },
    {
      "epoch": 70.88,
      "eval_accuracy": 0.8390804597701149,
      "eval_loss": 0.5507646799087524,
      "eval_runtime": 2.1114,
      "eval_samples_per_second": 41.205,
      "eval_steps_per_second": 1.421,
      "step": 443
    },
    {
      "epoch": 72.0,
      "grad_norm": 6.164122104644775,
      "learning_rate": 3.4722222222222224e-06,
      "loss": 0.0748,
      "step": 450
    },
    {
      "epoch": 72.0,
      "eval_accuracy": 0.8160919540229885,
      "eval_loss": 0.568942129611969,
      "eval_runtime": 2.0852,
      "eval_samples_per_second": 41.723,
      "eval_steps_per_second": 1.439,
      "step": 450
    },
    {
      "epoch": 72.96,
      "eval_accuracy": 0.8505747126436781,
      "eval_loss": 0.555583119392395,
      "eval_runtime": 2.0316,
      "eval_samples_per_second": 42.824,
      "eval_steps_per_second": 1.477,
      "step": 456
    },
    {
      "epoch": 73.6,
      "grad_norm": 11.653559684753418,
      "learning_rate": 2.3148148148148148e-06,
      "loss": 0.0589,
      "step": 460
    },
    {
      "epoch": 73.92,
      "eval_accuracy": 0.8505747126436781,
      "eval_loss": 0.5452274084091187,
      "eval_runtime": 2.0938,
      "eval_samples_per_second": 41.551,
      "eval_steps_per_second": 1.433,
      "step": 462
    },
    {
      "epoch": 74.88,
      "eval_accuracy": 0.8620689655172413,
      "eval_loss": 0.5475078225135803,
      "eval_runtime": 2.0547,
      "eval_samples_per_second": 42.342,
      "eval_steps_per_second": 1.46,
      "step": 468
    },
    {
      "epoch": 75.2,
      "grad_norm": 21.146989822387695,
      "learning_rate": 1.1574074074074074e-06,
      "loss": 0.0719,
      "step": 470
    },
    {
      "epoch": 76.0,
      "eval_accuracy": 0.8620689655172413,
      "eval_loss": 0.5483731031417847,
      "eval_runtime": 2.1022,
      "eval_samples_per_second": 41.386,
      "eval_steps_per_second": 1.427,
      "step": 475
    },
    {
      "epoch": 76.8,
      "grad_norm": 12.87066650390625,
      "learning_rate": 0.0,
      "loss": 0.0801,
      "step": 480
    },
    {
      "epoch": 76.8,
      "eval_accuracy": 0.8620689655172413,
      "eval_loss": 0.5496163368225098,
      "eval_runtime": 2.0924,
      "eval_samples_per_second": 41.58,
      "eval_steps_per_second": 1.434,
      "step": 480
    },
    {
      "epoch": 76.8,
      "step": 480,
      "total_flos": 1.514063180200919e+18,
      "train_loss": 0.449434948215882,
      "train_runtime": 1985.4684,
      "train_samples_per_second": 31.549,
      "train_steps_per_second": 0.242
    }
  ],
  "logging_steps": 10,
  "max_steps": 480,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 80,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": false,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.514063180200919e+18,
  "train_batch_size": 32,
  "trial_name": null,
  "trial_params": null
}