|
{ |
|
"best_metric": 2.009188652038574, |
|
"best_model_checkpoint": "ckpts/sft_gemma-2b/checkpoint-1680", |
|
"epoch": 8.865435356200528, |
|
"eval_steps": 20, |
|
"global_step": 1680, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.10554089709762533, |
|
"grad_norm": 6.84375, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 2.4965, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.10554089709762533, |
|
"eval_loss": 2.404269218444824, |
|
"eval_runtime": 8.1633, |
|
"eval_samples_per_second": 24.5, |
|
"eval_steps_per_second": 6.125, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.21108179419525067, |
|
"grad_norm": 3.0, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 2.2807, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.21108179419525067, |
|
"eval_loss": 2.1983509063720703, |
|
"eval_runtime": 7.9364, |
|
"eval_samples_per_second": 25.2, |
|
"eval_steps_per_second": 6.3, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.316622691292876, |
|
"grad_norm": 3.0, |
|
"learning_rate": 1e-05, |
|
"loss": 2.1723, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.316622691292876, |
|
"eval_loss": 2.157801628112793, |
|
"eval_runtime": 8.1488, |
|
"eval_samples_per_second": 24.543, |
|
"eval_steps_per_second": 6.136, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.42216358839050133, |
|
"grad_norm": 2.84375, |
|
"learning_rate": 1e-05, |
|
"loss": 2.0888, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.42216358839050133, |
|
"eval_loss": 2.1461212635040283, |
|
"eval_runtime": 8.0327, |
|
"eval_samples_per_second": 24.898, |
|
"eval_steps_per_second": 6.225, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.5277044854881267, |
|
"grad_norm": 2.890625, |
|
"learning_rate": 1e-05, |
|
"loss": 2.1187, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.5277044854881267, |
|
"eval_loss": 2.140232801437378, |
|
"eval_runtime": 8.0366, |
|
"eval_samples_per_second": 24.886, |
|
"eval_steps_per_second": 6.222, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.633245382585752, |
|
"grad_norm": 2.9375, |
|
"learning_rate": 1e-05, |
|
"loss": 2.1293, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.633245382585752, |
|
"eval_loss": 2.135404109954834, |
|
"eval_runtime": 8.1159, |
|
"eval_samples_per_second": 24.643, |
|
"eval_steps_per_second": 6.161, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.7387862796833773, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 1e-05, |
|
"loss": 2.1351, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.7387862796833773, |
|
"eval_loss": 2.1313905715942383, |
|
"eval_runtime": 7.9204, |
|
"eval_samples_per_second": 25.251, |
|
"eval_steps_per_second": 6.313, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.8443271767810027, |
|
"grad_norm": 2.8125, |
|
"learning_rate": 1e-05, |
|
"loss": 2.1204, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.8443271767810027, |
|
"eval_loss": 2.1264946460723877, |
|
"eval_runtime": 8.1968, |
|
"eval_samples_per_second": 24.4, |
|
"eval_steps_per_second": 6.1, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.9498680738786279, |
|
"grad_norm": 2.953125, |
|
"learning_rate": 1e-05, |
|
"loss": 2.0984, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.9498680738786279, |
|
"eval_loss": 2.123286247253418, |
|
"eval_runtime": 8.0328, |
|
"eval_samples_per_second": 24.898, |
|
"eval_steps_per_second": 6.225, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.0554089709762533, |
|
"grad_norm": 3.0, |
|
"learning_rate": 1e-05, |
|
"loss": 2.1008, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.0554089709762533, |
|
"eval_loss": 2.1210150718688965, |
|
"eval_runtime": 7.8913, |
|
"eval_samples_per_second": 25.344, |
|
"eval_steps_per_second": 6.336, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.1609498680738786, |
|
"grad_norm": 2.9375, |
|
"learning_rate": 1e-05, |
|
"loss": 2.0771, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.1609498680738786, |
|
"eval_loss": 2.118210792541504, |
|
"eval_runtime": 8.2706, |
|
"eval_samples_per_second": 24.182, |
|
"eval_steps_per_second": 6.045, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.266490765171504, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 1e-05, |
|
"loss": 2.0659, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.266490765171504, |
|
"eval_loss": 2.1160335540771484, |
|
"eval_runtime": 8.1186, |
|
"eval_samples_per_second": 24.635, |
|
"eval_steps_per_second": 6.159, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.3720316622691293, |
|
"grad_norm": 3.390625, |
|
"learning_rate": 1e-05, |
|
"loss": 2.0616, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.3720316622691293, |
|
"eval_loss": 2.113948106765747, |
|
"eval_runtime": 8.3043, |
|
"eval_samples_per_second": 24.084, |
|
"eval_steps_per_second": 6.021, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.4775725593667546, |
|
"grad_norm": 2.875, |
|
"learning_rate": 1e-05, |
|
"loss": 2.1086, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.4775725593667546, |
|
"eval_loss": 2.1105477809906006, |
|
"eval_runtime": 8.3509, |
|
"eval_samples_per_second": 23.95, |
|
"eval_steps_per_second": 5.987, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.58311345646438, |
|
"grad_norm": 2.625, |
|
"learning_rate": 1e-05, |
|
"loss": 2.0473, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.58311345646438, |
|
"eval_loss": 2.1075851917266846, |
|
"eval_runtime": 8.3203, |
|
"eval_samples_per_second": 24.037, |
|
"eval_steps_per_second": 6.009, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.6886543535620053, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 1e-05, |
|
"loss": 2.0455, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.6886543535620053, |
|
"eval_loss": 2.1052379608154297, |
|
"eval_runtime": 8.4112, |
|
"eval_samples_per_second": 23.778, |
|
"eval_steps_per_second": 5.944, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.7941952506596306, |
|
"grad_norm": 3.125, |
|
"learning_rate": 1e-05, |
|
"loss": 2.0664, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.7941952506596306, |
|
"eval_loss": 2.102696180343628, |
|
"eval_runtime": 8.1772, |
|
"eval_samples_per_second": 24.458, |
|
"eval_steps_per_second": 6.115, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.899736147757256, |
|
"grad_norm": 2.75, |
|
"learning_rate": 1e-05, |
|
"loss": 2.0559, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.899736147757256, |
|
"eval_loss": 2.100424289703369, |
|
"eval_runtime": 8.2651, |
|
"eval_samples_per_second": 24.198, |
|
"eval_steps_per_second": 6.05, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 2.005277044854881, |
|
"grad_norm": 3.125, |
|
"learning_rate": 1e-05, |
|
"loss": 2.0638, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 2.005277044854881, |
|
"eval_loss": 2.0989837646484375, |
|
"eval_runtime": 8.3491, |
|
"eval_samples_per_second": 23.955, |
|
"eval_steps_per_second": 5.989, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 2.1108179419525066, |
|
"grad_norm": 2.890625, |
|
"learning_rate": 1e-05, |
|
"loss": 2.0455, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.1108179419525066, |
|
"eval_loss": 2.097106456756592, |
|
"eval_runtime": 8.261, |
|
"eval_samples_per_second": 24.21, |
|
"eval_steps_per_second": 6.053, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.216358839050132, |
|
"grad_norm": 2.8125, |
|
"learning_rate": 1e-05, |
|
"loss": 2.0114, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 2.216358839050132, |
|
"eval_loss": 2.095724582672119, |
|
"eval_runtime": 8.1933, |
|
"eval_samples_per_second": 24.41, |
|
"eval_steps_per_second": 6.103, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 2.321899736147757, |
|
"grad_norm": 2.828125, |
|
"learning_rate": 1e-05, |
|
"loss": 2.0263, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 2.321899736147757, |
|
"eval_loss": 2.0944039821624756, |
|
"eval_runtime": 8.3678, |
|
"eval_samples_per_second": 23.901, |
|
"eval_steps_per_second": 5.975, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 2.4274406332453826, |
|
"grad_norm": 2.9375, |
|
"learning_rate": 1e-05, |
|
"loss": 2.0127, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 2.4274406332453826, |
|
"eval_loss": 2.0919580459594727, |
|
"eval_runtime": 8.3065, |
|
"eval_samples_per_second": 24.078, |
|
"eval_steps_per_second": 6.019, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 2.532981530343008, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 1e-05, |
|
"loss": 1.9744, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 2.532981530343008, |
|
"eval_loss": 2.087993860244751, |
|
"eval_runtime": 8.3136, |
|
"eval_samples_per_second": 24.057, |
|
"eval_steps_per_second": 6.014, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 2.638522427440633, |
|
"grad_norm": 2.984375, |
|
"learning_rate": 1e-05, |
|
"loss": 2.0236, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 2.638522427440633, |
|
"eval_loss": 2.086052656173706, |
|
"eval_runtime": 8.3652, |
|
"eval_samples_per_second": 23.908, |
|
"eval_steps_per_second": 5.977, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 2.7440633245382586, |
|
"grad_norm": 2.671875, |
|
"learning_rate": 1e-05, |
|
"loss": 2.0146, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 2.7440633245382586, |
|
"eval_loss": 2.08500075340271, |
|
"eval_runtime": 8.387, |
|
"eval_samples_per_second": 23.846, |
|
"eval_steps_per_second": 5.962, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 2.849604221635884, |
|
"grad_norm": 2.734375, |
|
"learning_rate": 1e-05, |
|
"loss": 2.0086, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 2.849604221635884, |
|
"eval_loss": 2.0832457542419434, |
|
"eval_runtime": 8.2202, |
|
"eval_samples_per_second": 24.33, |
|
"eval_steps_per_second": 6.083, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 2.955145118733509, |
|
"grad_norm": 3.21875, |
|
"learning_rate": 1e-05, |
|
"loss": 2.0381, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 2.955145118733509, |
|
"eval_loss": 2.0816333293914795, |
|
"eval_runtime": 8.1766, |
|
"eval_samples_per_second": 24.46, |
|
"eval_steps_per_second": 6.115, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 3.0606860158311346, |
|
"grad_norm": 2.8125, |
|
"learning_rate": 1e-05, |
|
"loss": 1.9999, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 3.0606860158311346, |
|
"eval_loss": 2.0814907550811768, |
|
"eval_runtime": 8.1023, |
|
"eval_samples_per_second": 24.684, |
|
"eval_steps_per_second": 6.171, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 3.16622691292876, |
|
"grad_norm": 2.953125, |
|
"learning_rate": 1e-05, |
|
"loss": 1.9754, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 3.16622691292876, |
|
"eval_loss": 2.0809905529022217, |
|
"eval_runtime": 8.2702, |
|
"eval_samples_per_second": 24.183, |
|
"eval_steps_per_second": 6.046, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 3.271767810026385, |
|
"grad_norm": 3.09375, |
|
"learning_rate": 1e-05, |
|
"loss": 1.9742, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 3.271767810026385, |
|
"eval_loss": 2.0800254344940186, |
|
"eval_runtime": 8.3829, |
|
"eval_samples_per_second": 23.858, |
|
"eval_steps_per_second": 5.965, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 3.3773087071240107, |
|
"grad_norm": 3.375, |
|
"learning_rate": 1e-05, |
|
"loss": 1.9646, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 3.3773087071240107, |
|
"eval_loss": 2.078634738922119, |
|
"eval_runtime": 8.1957, |
|
"eval_samples_per_second": 24.403, |
|
"eval_steps_per_second": 6.101, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 3.4828496042216357, |
|
"grad_norm": 2.765625, |
|
"learning_rate": 1e-05, |
|
"loss": 1.9785, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 3.4828496042216357, |
|
"eval_loss": 2.075782537460327, |
|
"eval_runtime": 8.2605, |
|
"eval_samples_per_second": 24.211, |
|
"eval_steps_per_second": 6.053, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 3.588390501319261, |
|
"grad_norm": 4.21875, |
|
"learning_rate": 1e-05, |
|
"loss": 1.9755, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 3.588390501319261, |
|
"eval_loss": 2.0737786293029785, |
|
"eval_runtime": 8.3746, |
|
"eval_samples_per_second": 23.882, |
|
"eval_steps_per_second": 5.97, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 3.6939313984168867, |
|
"grad_norm": 2.765625, |
|
"learning_rate": 1e-05, |
|
"loss": 1.9667, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 3.6939313984168867, |
|
"eval_loss": 2.0726418495178223, |
|
"eval_runtime": 8.2601, |
|
"eval_samples_per_second": 24.213, |
|
"eval_steps_per_second": 6.053, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 3.7994722955145117, |
|
"grad_norm": 2.828125, |
|
"learning_rate": 1e-05, |
|
"loss": 1.9623, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 3.7994722955145117, |
|
"eval_loss": 2.070995330810547, |
|
"eval_runtime": 8.3051, |
|
"eval_samples_per_second": 24.081, |
|
"eval_steps_per_second": 6.02, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 3.905013192612137, |
|
"grad_norm": 3.03125, |
|
"learning_rate": 1e-05, |
|
"loss": 1.9702, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 3.905013192612137, |
|
"eval_loss": 2.068690776824951, |
|
"eval_runtime": 8.3707, |
|
"eval_samples_per_second": 23.893, |
|
"eval_steps_per_second": 5.973, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 4.010554089709762, |
|
"grad_norm": 2.96875, |
|
"learning_rate": 1e-05, |
|
"loss": 1.9795, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 4.010554089709762, |
|
"eval_loss": 2.0664331912994385, |
|
"eval_runtime": 9.1952, |
|
"eval_samples_per_second": 21.75, |
|
"eval_steps_per_second": 5.438, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 4.116094986807388, |
|
"grad_norm": 3.0, |
|
"learning_rate": 1e-05, |
|
"loss": 1.9469, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 4.116094986807388, |
|
"eval_loss": 2.0662195682525635, |
|
"eval_runtime": 8.2606, |
|
"eval_samples_per_second": 24.211, |
|
"eval_steps_per_second": 6.053, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 4.221635883905013, |
|
"grad_norm": 2.984375, |
|
"learning_rate": 1e-05, |
|
"loss": 1.9415, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 4.221635883905013, |
|
"eval_loss": 2.0639867782592773, |
|
"eval_runtime": 8.227, |
|
"eval_samples_per_second": 24.31, |
|
"eval_steps_per_second": 6.078, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 4.327176781002638, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 1e-05, |
|
"loss": 1.9574, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 4.327176781002638, |
|
"eval_loss": 2.062091588973999, |
|
"eval_runtime": 8.2262, |
|
"eval_samples_per_second": 24.313, |
|
"eval_steps_per_second": 6.078, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 4.432717678100264, |
|
"grad_norm": 3.203125, |
|
"learning_rate": 1e-05, |
|
"loss": 1.9202, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 4.432717678100264, |
|
"eval_loss": 2.0608723163604736, |
|
"eval_runtime": 8.2161, |
|
"eval_samples_per_second": 24.343, |
|
"eval_steps_per_second": 6.086, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 4.538258575197889, |
|
"grad_norm": 2.96875, |
|
"learning_rate": 1e-05, |
|
"loss": 1.9302, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 4.538258575197889, |
|
"eval_loss": 2.058367967605591, |
|
"eval_runtime": 8.2827, |
|
"eval_samples_per_second": 24.147, |
|
"eval_steps_per_second": 6.037, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 4.643799472295514, |
|
"grad_norm": 3.109375, |
|
"learning_rate": 1e-05, |
|
"loss": 1.9112, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 4.643799472295514, |
|
"eval_loss": 2.058866500854492, |
|
"eval_runtime": 8.2412, |
|
"eval_samples_per_second": 24.268, |
|
"eval_steps_per_second": 6.067, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 4.74934036939314, |
|
"grad_norm": 2.890625, |
|
"learning_rate": 1e-05, |
|
"loss": 1.9127, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 4.74934036939314, |
|
"eval_loss": 2.0560219287872314, |
|
"eval_runtime": 8.2213, |
|
"eval_samples_per_second": 24.327, |
|
"eval_steps_per_second": 6.082, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 4.854881266490765, |
|
"grad_norm": 2.953125, |
|
"learning_rate": 1e-05, |
|
"loss": 1.899, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 4.854881266490765, |
|
"eval_loss": 2.054474353790283, |
|
"eval_runtime": 8.0902, |
|
"eval_samples_per_second": 24.721, |
|
"eval_steps_per_second": 6.18, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 4.96042216358839, |
|
"grad_norm": 2.90625, |
|
"learning_rate": 1e-05, |
|
"loss": 1.9248, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 4.96042216358839, |
|
"eval_loss": 2.052360773086548, |
|
"eval_runtime": 8.1899, |
|
"eval_samples_per_second": 24.42, |
|
"eval_steps_per_second": 6.105, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 5.065963060686016, |
|
"grad_norm": 2.90625, |
|
"learning_rate": 1e-05, |
|
"loss": 1.8878, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 5.065963060686016, |
|
"eval_loss": 2.0526058673858643, |
|
"eval_runtime": 8.0873, |
|
"eval_samples_per_second": 24.73, |
|
"eval_steps_per_second": 6.183, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 5.171503957783641, |
|
"grad_norm": 3.234375, |
|
"learning_rate": 1e-05, |
|
"loss": 1.8789, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 5.171503957783641, |
|
"eval_loss": 2.0522069931030273, |
|
"eval_runtime": 8.0535, |
|
"eval_samples_per_second": 24.834, |
|
"eval_steps_per_second": 6.208, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 5.277044854881266, |
|
"grad_norm": 3.015625, |
|
"learning_rate": 1e-05, |
|
"loss": 1.8908, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 5.277044854881266, |
|
"eval_loss": 2.051866054534912, |
|
"eval_runtime": 7.9907, |
|
"eval_samples_per_second": 25.029, |
|
"eval_steps_per_second": 6.257, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 5.382585751978892, |
|
"grad_norm": 3.046875, |
|
"learning_rate": 1e-05, |
|
"loss": 1.8944, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 5.382585751978892, |
|
"eval_loss": 2.0503861904144287, |
|
"eval_runtime": 7.927, |
|
"eval_samples_per_second": 25.23, |
|
"eval_steps_per_second": 6.308, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 5.488126649076517, |
|
"grad_norm": 4.15625, |
|
"learning_rate": 1e-05, |
|
"loss": 1.8867, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 5.488126649076517, |
|
"eval_loss": 2.0466654300689697, |
|
"eval_runtime": 8.3776, |
|
"eval_samples_per_second": 23.873, |
|
"eval_steps_per_second": 5.968, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 5.593667546174142, |
|
"grad_norm": 3.0, |
|
"learning_rate": 1e-05, |
|
"loss": 1.8764, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 5.593667546174142, |
|
"eval_loss": 2.0448718070983887, |
|
"eval_runtime": 8.2526, |
|
"eval_samples_per_second": 24.235, |
|
"eval_steps_per_second": 6.059, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 5.699208443271768, |
|
"grad_norm": 3.140625, |
|
"learning_rate": 1e-05, |
|
"loss": 1.9082, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 5.699208443271768, |
|
"eval_loss": 2.0424439907073975, |
|
"eval_runtime": 8.0973, |
|
"eval_samples_per_second": 24.7, |
|
"eval_steps_per_second": 6.175, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 5.804749340369393, |
|
"grad_norm": 3.296875, |
|
"learning_rate": 1e-05, |
|
"loss": 1.8782, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 5.804749340369393, |
|
"eval_loss": 2.0422487258911133, |
|
"eval_runtime": 8.3197, |
|
"eval_samples_per_second": 24.039, |
|
"eval_steps_per_second": 6.01, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 5.910290237467018, |
|
"grad_norm": 2.984375, |
|
"learning_rate": 1e-05, |
|
"loss": 1.8394, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 5.910290237467018, |
|
"eval_loss": 2.0410642623901367, |
|
"eval_runtime": 8.2994, |
|
"eval_samples_per_second": 24.098, |
|
"eval_steps_per_second": 6.025, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 6.015831134564644, |
|
"grad_norm": 4.1875, |
|
"learning_rate": 1e-05, |
|
"loss": 1.864, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 6.015831134564644, |
|
"eval_loss": 2.039353370666504, |
|
"eval_runtime": 8.2994, |
|
"eval_samples_per_second": 24.098, |
|
"eval_steps_per_second": 6.025, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 6.121372031662269, |
|
"grad_norm": 3.265625, |
|
"learning_rate": 1e-05, |
|
"loss": 1.8246, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 6.121372031662269, |
|
"eval_loss": 2.042710304260254, |
|
"eval_runtime": 8.5324, |
|
"eval_samples_per_second": 23.44, |
|
"eval_steps_per_second": 5.86, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 6.226912928759894, |
|
"grad_norm": 3.3125, |
|
"learning_rate": 1e-05, |
|
"loss": 1.8343, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 6.226912928759894, |
|
"eval_loss": 2.0403542518615723, |
|
"eval_runtime": 8.2652, |
|
"eval_samples_per_second": 24.198, |
|
"eval_steps_per_second": 6.049, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 6.33245382585752, |
|
"grad_norm": 3.984375, |
|
"learning_rate": 1e-05, |
|
"loss": 1.8541, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 6.33245382585752, |
|
"eval_loss": 2.03813099861145, |
|
"eval_runtime": 8.4385, |
|
"eval_samples_per_second": 23.701, |
|
"eval_steps_per_second": 5.925, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 6.437994722955145, |
|
"grad_norm": 3.203125, |
|
"learning_rate": 1e-05, |
|
"loss": 1.8182, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 6.437994722955145, |
|
"eval_loss": 2.038771629333496, |
|
"eval_runtime": 8.3561, |
|
"eval_samples_per_second": 23.934, |
|
"eval_steps_per_second": 5.984, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 6.54353562005277, |
|
"grad_norm": 3.125, |
|
"learning_rate": 1e-05, |
|
"loss": 1.8427, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 6.54353562005277, |
|
"eval_loss": 2.0339856147766113, |
|
"eval_runtime": 8.3288, |
|
"eval_samples_per_second": 24.013, |
|
"eval_steps_per_second": 6.003, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 6.649076517150396, |
|
"grad_norm": 3.15625, |
|
"learning_rate": 1e-05, |
|
"loss": 1.8289, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 6.649076517150396, |
|
"eval_loss": 2.035248041152954, |
|
"eval_runtime": 8.281, |
|
"eval_samples_per_second": 24.152, |
|
"eval_steps_per_second": 6.038, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 6.754617414248021, |
|
"grad_norm": 3.4375, |
|
"learning_rate": 1e-05, |
|
"loss": 1.8415, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 6.754617414248021, |
|
"eval_loss": 2.031825304031372, |
|
"eval_runtime": 8.3052, |
|
"eval_samples_per_second": 24.081, |
|
"eval_steps_per_second": 6.02, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 6.860158311345646, |
|
"grad_norm": 3.453125, |
|
"learning_rate": 1e-05, |
|
"loss": 1.8357, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 6.860158311345646, |
|
"eval_loss": 2.028428316116333, |
|
"eval_runtime": 8.3001, |
|
"eval_samples_per_second": 24.096, |
|
"eval_steps_per_second": 6.024, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 6.965699208443271, |
|
"grad_norm": 3.140625, |
|
"learning_rate": 1e-05, |
|
"loss": 1.8324, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 6.965699208443271, |
|
"eval_loss": 2.0289885997772217, |
|
"eval_runtime": 8.1618, |
|
"eval_samples_per_second": 24.504, |
|
"eval_steps_per_second": 6.126, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 7.071240105540897, |
|
"grad_norm": 3.5625, |
|
"learning_rate": 1e-05, |
|
"loss": 1.8069, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 7.071240105540897, |
|
"eval_loss": 2.0348060131073, |
|
"eval_runtime": 8.1951, |
|
"eval_samples_per_second": 24.405, |
|
"eval_steps_per_second": 6.101, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 7.176781002638522, |
|
"grad_norm": 3.375, |
|
"learning_rate": 1e-05, |
|
"loss": 1.8152, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 7.176781002638522, |
|
"eval_loss": 2.0321884155273438, |
|
"eval_runtime": 8.2785, |
|
"eval_samples_per_second": 24.159, |
|
"eval_steps_per_second": 6.04, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 7.282321899736147, |
|
"grad_norm": 3.234375, |
|
"learning_rate": 1e-05, |
|
"loss": 1.7871, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 7.282321899736147, |
|
"eval_loss": 2.0307512283325195, |
|
"eval_runtime": 8.0505, |
|
"eval_samples_per_second": 24.843, |
|
"eval_steps_per_second": 6.211, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 7.387862796833773, |
|
"grad_norm": 3.59375, |
|
"learning_rate": 1e-05, |
|
"loss": 1.7871, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 7.387862796833773, |
|
"eval_loss": 2.0273208618164062, |
|
"eval_runtime": 8.1896, |
|
"eval_samples_per_second": 24.421, |
|
"eval_steps_per_second": 6.105, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 7.493403693931398, |
|
"grad_norm": 3.328125, |
|
"learning_rate": 1e-05, |
|
"loss": 1.8076, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 7.493403693931398, |
|
"eval_loss": 2.0257158279418945, |
|
"eval_runtime": 7.9266, |
|
"eval_samples_per_second": 25.232, |
|
"eval_steps_per_second": 6.308, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 7.598944591029023, |
|
"grad_norm": 3.28125, |
|
"learning_rate": 1e-05, |
|
"loss": 1.7753, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 7.598944591029023, |
|
"eval_loss": 2.026719570159912, |
|
"eval_runtime": 7.8566, |
|
"eval_samples_per_second": 25.456, |
|
"eval_steps_per_second": 6.364, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 7.704485488126649, |
|
"grad_norm": 3.453125, |
|
"learning_rate": 1e-05, |
|
"loss": 1.761, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 7.704485488126649, |
|
"eval_loss": 2.022343397140503, |
|
"eval_runtime": 8.1505, |
|
"eval_samples_per_second": 24.538, |
|
"eval_steps_per_second": 6.135, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 7.810026385224274, |
|
"grad_norm": 3.234375, |
|
"learning_rate": 1e-05, |
|
"loss": 1.7837, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 7.810026385224274, |
|
"eval_loss": 2.0227696895599365, |
|
"eval_runtime": 7.9021, |
|
"eval_samples_per_second": 25.31, |
|
"eval_steps_per_second": 6.327, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 7.915567282321899, |
|
"grad_norm": 3.5625, |
|
"learning_rate": 1e-05, |
|
"loss": 1.7809, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 7.915567282321899, |
|
"eval_loss": 2.0224175453186035, |
|
"eval_runtime": 8.146, |
|
"eval_samples_per_second": 24.552, |
|
"eval_steps_per_second": 6.138, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 8.021108179419524, |
|
"grad_norm": 3.59375, |
|
"learning_rate": 1e-05, |
|
"loss": 1.779, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 8.021108179419524, |
|
"eval_loss": 2.0209100246429443, |
|
"eval_runtime": 8.392, |
|
"eval_samples_per_second": 23.832, |
|
"eval_steps_per_second": 5.958, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 8.12664907651715, |
|
"grad_norm": 3.109375, |
|
"learning_rate": 1e-05, |
|
"loss": 1.7353, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 8.12664907651715, |
|
"eval_loss": 2.0220282077789307, |
|
"eval_runtime": 8.6161, |
|
"eval_samples_per_second": 23.212, |
|
"eval_steps_per_second": 5.803, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 8.232189973614776, |
|
"grad_norm": 3.5625, |
|
"learning_rate": 1e-05, |
|
"loss": 1.7363, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 8.232189973614776, |
|
"eval_loss": 2.0166220664978027, |
|
"eval_runtime": 8.2719, |
|
"eval_samples_per_second": 24.178, |
|
"eval_steps_per_second": 6.045, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 8.3377308707124, |
|
"grad_norm": 3.71875, |
|
"learning_rate": 1e-05, |
|
"loss": 1.7511, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 8.3377308707124, |
|
"eval_loss": 2.01631236076355, |
|
"eval_runtime": 8.2537, |
|
"eval_samples_per_second": 24.232, |
|
"eval_steps_per_second": 6.058, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 8.443271767810026, |
|
"grad_norm": 3.828125, |
|
"learning_rate": 1e-05, |
|
"loss": 1.767, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 8.443271767810026, |
|
"eval_loss": 2.016242265701294, |
|
"eval_runtime": 8.1762, |
|
"eval_samples_per_second": 24.461, |
|
"eval_steps_per_second": 6.115, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 8.548812664907652, |
|
"grad_norm": 4.4375, |
|
"learning_rate": 1e-05, |
|
"loss": 1.6945, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 8.548812664907652, |
|
"eval_loss": 2.019789218902588, |
|
"eval_runtime": 8.4823, |
|
"eval_samples_per_second": 23.579, |
|
"eval_steps_per_second": 5.895, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 8.654353562005277, |
|
"grad_norm": 4.5625, |
|
"learning_rate": 1e-05, |
|
"loss": 1.7087, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 8.654353562005277, |
|
"eval_loss": 2.0212345123291016, |
|
"eval_runtime": 8.1335, |
|
"eval_samples_per_second": 24.59, |
|
"eval_steps_per_second": 6.147, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 8.759894459102902, |
|
"grad_norm": 3.921875, |
|
"learning_rate": 1e-05, |
|
"loss": 1.7702, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 8.759894459102902, |
|
"eval_loss": 2.0104410648345947, |
|
"eval_runtime": 8.0047, |
|
"eval_samples_per_second": 24.985, |
|
"eval_steps_per_second": 6.246, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 8.865435356200528, |
|
"grad_norm": 4.0625, |
|
"learning_rate": 1e-05, |
|
"loss": 1.7563, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 8.865435356200528, |
|
"eval_loss": 2.009188652038574, |
|
"eval_runtime": 8.1504, |
|
"eval_samples_per_second": 24.539, |
|
"eval_steps_per_second": 6.135, |
|
"step": 1680 |
|
} |
|
], |
|
"logging_steps": 20, |
|
"max_steps": 9450, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 50, |
|
"save_steps": 20, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 5, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.6272812961435648e+17, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|