{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 50,
  "global_step": 1908,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.07861635220125786,
      "grad_norm": 0.30994901061058044,
      "learning_rate": 0.00013089005235602096,
      "loss": 4.0802,
      "step": 50
    },
    {
      "epoch": 0.07861635220125786,
      "eval_loss": 3.3922524452209473,
      "eval_runtime": 27.5047,
      "eval_samples_per_second": 3.636,
      "eval_steps_per_second": 0.473,
      "step": 50
    },
    {
      "epoch": 0.15723270440251572,
      "grad_norm": 0.1917889267206192,
      "learning_rate": 0.0002617801047120419,
      "loss": 2.6833,
      "step": 100
    },
    {
      "epoch": 0.15723270440251572,
      "eval_loss": 2.327249050140381,
      "eval_runtime": 27.3498,
      "eval_samples_per_second": 3.656,
      "eval_steps_per_second": 0.475,
      "step": 100
    },
    {
      "epoch": 0.2358490566037736,
      "grad_norm": 0.1460731029510498,
      "learning_rate": 0.00039267015706806284,
      "loss": 2.1534,
      "step": 150
    },
    {
      "epoch": 0.2358490566037736,
      "eval_loss": 2.1624627113342285,
      "eval_runtime": 27.2554,
      "eval_samples_per_second": 3.669,
      "eval_steps_per_second": 0.477,
      "step": 150
    },
    {
      "epoch": 0.31446540880503143,
      "grad_norm": 0.15231014788150787,
      "learning_rate": 0.0004973791496796739,
      "loss": 2.0476,
      "step": 200
    },
    {
      "epoch": 0.31446540880503143,
      "eval_loss": 2.100531816482544,
      "eval_runtime": 27.3815,
      "eval_samples_per_second": 3.652,
      "eval_steps_per_second": 0.475,
      "step": 200
    },
    {
      "epoch": 0.39308176100628933,
      "grad_norm": 0.1586989462375641,
      "learning_rate": 0.00048281887012230633,
      "loss": 1.9929,
      "step": 250
    },
    {
      "epoch": 0.39308176100628933,
      "eval_loss": 2.0717108249664307,
      "eval_runtime": 27.458,
      "eval_samples_per_second": 3.642,
      "eval_steps_per_second": 0.473,
      "step": 250
    },
    {
      "epoch": 0.4716981132075472,
      "grad_norm": 0.14461246132850647,
      "learning_rate": 0.00046825859056493884,
      "loss": 1.974,
      "step": 300
    },
    {
      "epoch": 0.4716981132075472,
      "eval_loss": 2.0508835315704346,
      "eval_runtime": 27.4544,
      "eval_samples_per_second": 3.642,
      "eval_steps_per_second": 0.474,
      "step": 300
    },
    {
      "epoch": 0.550314465408805,
      "grad_norm": 0.22759944200515747,
      "learning_rate": 0.00045369831100757136,
      "loss": 1.9854,
      "step": 350
    },
    {
      "epoch": 0.550314465408805,
      "eval_loss": 2.043459892272949,
      "eval_runtime": 27.4852,
      "eval_samples_per_second": 3.638,
      "eval_steps_per_second": 0.473,
      "step": 350
    },
    {
      "epoch": 0.6289308176100629,
      "grad_norm": 0.1476861983537674,
      "learning_rate": 0.0004391380314502039,
      "loss": 2.0109,
      "step": 400
    },
    {
      "epoch": 0.6289308176100629,
      "eval_loss": 2.035849094390869,
      "eval_runtime": 27.3716,
      "eval_samples_per_second": 3.653,
      "eval_steps_per_second": 0.475,
      "step": 400
    },
    {
      "epoch": 0.7075471698113207,
      "grad_norm": 0.16703520715236664,
      "learning_rate": 0.0004245777518928364,
      "loss": 1.9939,
      "step": 450
    },
    {
      "epoch": 0.7075471698113207,
      "eval_loss": 2.0285253524780273,
      "eval_runtime": 27.4183,
      "eval_samples_per_second": 3.647,
      "eval_steps_per_second": 0.474,
      "step": 450
    },
    {
      "epoch": 0.7861635220125787,
      "grad_norm": 0.14345824718475342,
      "learning_rate": 0.00041001747233546885,
      "loss": 1.9672,
      "step": 500
    },
    {
      "epoch": 0.7861635220125787,
      "eval_loss": 2.0150387287139893,
      "eval_runtime": 27.4063,
      "eval_samples_per_second": 3.649,
      "eval_steps_per_second": 0.474,
      "step": 500
    },
    {
      "epoch": 0.8647798742138365,
      "grad_norm": 0.15760189294815063,
      "learning_rate": 0.0003954571927781013,
      "loss": 1.9565,
      "step": 550
    },
    {
      "epoch": 0.8647798742138365,
      "eval_loss": 2.011507034301758,
      "eval_runtime": 27.3979,
      "eval_samples_per_second": 3.65,
      "eval_steps_per_second": 0.474,
      "step": 550
    },
    {
      "epoch": 0.9433962264150944,
      "grad_norm": 0.19006682932376862,
      "learning_rate": 0.0003808969132207338,
      "loss": 1.9706,
      "step": 600
    },
    {
      "epoch": 0.9433962264150944,
      "eval_loss": 2.006178379058838,
      "eval_runtime": 27.3191,
      "eval_samples_per_second": 3.66,
      "eval_steps_per_second": 0.476,
      "step": 600
    },
    {
      "epoch": 1.0220125786163523,
      "grad_norm": 0.13335242867469788,
      "learning_rate": 0.00036633663366336634,
      "loss": 1.9352,
      "step": 650
    },
    {
      "epoch": 1.0220125786163523,
      "eval_loss": 1.9949337244033813,
      "eval_runtime": 27.384,
      "eval_samples_per_second": 3.652,
      "eval_steps_per_second": 0.475,
      "step": 650
    },
    {
      "epoch": 1.10062893081761,
      "grad_norm": 0.1745985597372055,
      "learning_rate": 0.00035177635410599885,
      "loss": 1.8647,
      "step": 700
    },
    {
      "epoch": 1.10062893081761,
      "eval_loss": 1.9930100440979004,
      "eval_runtime": 27.4302,
      "eval_samples_per_second": 3.646,
      "eval_steps_per_second": 0.474,
      "step": 700
    },
    {
      "epoch": 1.179245283018868,
      "grad_norm": 0.17578236758708954,
      "learning_rate": 0.00033721607454863137,
      "loss": 1.9109,
      "step": 750
    },
    {
      "epoch": 1.179245283018868,
      "eval_loss": 1.9840075969696045,
      "eval_runtime": 27.4312,
      "eval_samples_per_second": 3.645,
      "eval_steps_per_second": 0.474,
      "step": 750
    },
    {
      "epoch": 1.2578616352201257,
      "grad_norm": 0.16206932067871094,
      "learning_rate": 0.0003226557949912638,
      "loss": 1.885,
      "step": 800
    },
    {
      "epoch": 1.2578616352201257,
      "eval_loss": 1.9805808067321777,
      "eval_runtime": 27.4061,
      "eval_samples_per_second": 3.649,
      "eval_steps_per_second": 0.474,
      "step": 800
    },
    {
      "epoch": 1.3364779874213837,
      "grad_norm": 0.13977627456188202,
      "learning_rate": 0.00030809551543389634,
      "loss": 1.8864,
      "step": 850
    },
    {
      "epoch": 1.3364779874213837,
      "eval_loss": 1.9878467321395874,
      "eval_runtime": 27.3272,
      "eval_samples_per_second": 3.659,
      "eval_steps_per_second": 0.476,
      "step": 850
    },
    {
      "epoch": 1.4150943396226414,
      "grad_norm": 0.19199030101299286,
      "learning_rate": 0.00029353523587652885,
      "loss": 1.8931,
      "step": 900
    },
    {
      "epoch": 1.4150943396226414,
      "eval_loss": 1.9824912548065186,
      "eval_runtime": 27.2809,
      "eval_samples_per_second": 3.666,
      "eval_steps_per_second": 0.477,
      "step": 900
    },
    {
      "epoch": 1.4937106918238994,
      "grad_norm": 0.17020899057388306,
      "learning_rate": 0.0002789749563191613,
      "loss": 1.8599,
      "step": 950
    },
    {
      "epoch": 1.4937106918238994,
      "eval_loss": 1.975517988204956,
      "eval_runtime": 27.4147,
      "eval_samples_per_second": 3.648,
      "eval_steps_per_second": 0.474,
      "step": 950
    },
    {
      "epoch": 1.5723270440251573,
      "grad_norm": 0.2071741223335266,
      "learning_rate": 0.00026441467676179383,
      "loss": 1.9019,
      "step": 1000
    },
    {
      "epoch": 1.5723270440251573,
      "eval_loss": 1.9695261716842651,
      "eval_runtime": 27.3628,
      "eval_samples_per_second": 3.655,
      "eval_steps_per_second": 0.475,
      "step": 1000
    },
    {
      "epoch": 1.650943396226415,
      "grad_norm": 0.18793335556983948,
      "learning_rate": 0.00024985439720442634,
      "loss": 1.9081,
      "step": 1050
    },
    {
      "epoch": 1.650943396226415,
      "eval_loss": 1.959928274154663,
      "eval_runtime": 27.4577,
      "eval_samples_per_second": 3.642,
      "eval_steps_per_second": 0.473,
      "step": 1050
    },
    {
      "epoch": 1.7295597484276728,
      "grad_norm": 0.2016116827726364,
      "learning_rate": 0.00023529411764705883,
      "loss": 1.8736,
      "step": 1100
    },
    {
      "epoch": 1.7295597484276728,
      "eval_loss": 1.9582524299621582,
      "eval_runtime": 27.3035,
      "eval_samples_per_second": 3.663,
      "eval_steps_per_second": 0.476,
      "step": 1100
    },
    {
      "epoch": 1.808176100628931,
      "grad_norm": 0.1896527260541916,
      "learning_rate": 0.00022073383808969132,
      "loss": 1.8939,
      "step": 1150
    },
    {
      "epoch": 1.808176100628931,
      "eval_loss": 1.9566823244094849,
      "eval_runtime": 27.4057,
      "eval_samples_per_second": 3.649,
      "eval_steps_per_second": 0.474,
      "step": 1150
    },
    {
      "epoch": 1.8867924528301887,
      "grad_norm": 0.17931561172008514,
      "learning_rate": 0.00020617355853232383,
      "loss": 1.8867,
      "step": 1200
    },
    {
      "epoch": 1.8867924528301887,
      "eval_loss": 1.9534107446670532,
      "eval_runtime": 27.4354,
      "eval_samples_per_second": 3.645,
      "eval_steps_per_second": 0.474,
      "step": 1200
    },
    {
      "epoch": 1.9654088050314464,
      "grad_norm": 0.21005648374557495,
      "learning_rate": 0.00019161327897495632,
      "loss": 1.8875,
      "step": 1250
    },
    {
      "epoch": 1.9654088050314464,
      "eval_loss": 1.947943925857544,
      "eval_runtime": 27.4109,
      "eval_samples_per_second": 3.648,
      "eval_steps_per_second": 0.474,
      "step": 1250
    },
    {
      "epoch": 2.0440251572327046,
      "grad_norm": 0.175890251994133,
      "learning_rate": 0.0001770529994175888,
      "loss": 1.8677,
      "step": 1300
    },
    {
      "epoch": 2.0440251572327046,
      "eval_loss": 1.949751853942871,
      "eval_runtime": 27.3631,
      "eval_samples_per_second": 3.655,
      "eval_steps_per_second": 0.475,
      "step": 1300
    },
    {
      "epoch": 2.1226415094339623,
      "grad_norm": 0.23929914832115173,
      "learning_rate": 0.00016249271986022132,
      "loss": 1.8152,
      "step": 1350
    },
    {
      "epoch": 2.1226415094339623,
      "eval_loss": 1.9526574611663818,
      "eval_runtime": 27.3536,
      "eval_samples_per_second": 3.656,
      "eval_steps_per_second": 0.475,
      "step": 1350
    },
    {
      "epoch": 2.20125786163522,
      "grad_norm": 0.25344499945640564,
      "learning_rate": 0.00014793244030285383,
      "loss": 1.8573,
      "step": 1400
    },
    {
      "epoch": 2.20125786163522,
      "eval_loss": 1.9523253440856934,
      "eval_runtime": 27.4028,
      "eval_samples_per_second": 3.649,
      "eval_steps_per_second": 0.474,
      "step": 1400
    },
    {
      "epoch": 2.279874213836478,
      "grad_norm": 0.23211850225925446,
      "learning_rate": 0.0001333721607454863,
      "loss": 1.8433,
      "step": 1450
    },
    {
      "epoch": 2.279874213836478,
      "eval_loss": 1.9440948963165283,
      "eval_runtime": 27.3179,
      "eval_samples_per_second": 3.661,
      "eval_steps_per_second": 0.476,
      "step": 1450
    },
    {
      "epoch": 2.358490566037736,
      "grad_norm": 0.23775285482406616,
      "learning_rate": 0.00011881188118811881,
      "loss": 1.828,
      "step": 1500
    },
    {
      "epoch": 2.358490566037736,
      "eval_loss": 1.945542335510254,
      "eval_runtime": 27.3283,
      "eval_samples_per_second": 3.659,
      "eval_steps_per_second": 0.476,
      "step": 1500
    },
    {
      "epoch": 2.4371069182389937,
      "grad_norm": 0.20624059438705444,
      "learning_rate": 0.00010425160163075131,
      "loss": 1.8298,
      "step": 1550
    },
    {
      "epoch": 2.4371069182389937,
      "eval_loss": 1.9422754049301147,
      "eval_runtime": 27.398,
      "eval_samples_per_second": 3.65,
      "eval_steps_per_second": 0.474,
      "step": 1550
    },
    {
      "epoch": 2.5157232704402515,
      "grad_norm": 0.20326970517635345,
      "learning_rate": 8.969132207338381e-05,
      "loss": 1.8258,
      "step": 1600
    },
    {
      "epoch": 2.5157232704402515,
      "eval_loss": 1.9376040697097778,
      "eval_runtime": 27.3017,
      "eval_samples_per_second": 3.663,
      "eval_steps_per_second": 0.476,
      "step": 1600
    },
    {
      "epoch": 2.5943396226415096,
      "grad_norm": 0.19999054074287415,
      "learning_rate": 7.513104251601631e-05,
      "loss": 1.8314,
      "step": 1650
    },
    {
      "epoch": 2.5943396226415096,
      "eval_loss": 1.937279462814331,
      "eval_runtime": 27.3919,
      "eval_samples_per_second": 3.651,
      "eval_steps_per_second": 0.475,
      "step": 1650
    },
    {
      "epoch": 2.6729559748427674,
      "grad_norm": 0.17829832434654236,
      "learning_rate": 6.0570762958648805e-05,
      "loss": 1.8436,
      "step": 1700
    },
    {
      "epoch": 2.6729559748427674,
      "eval_loss": 1.93943452835083,
      "eval_runtime": 27.4007,
      "eval_samples_per_second": 3.65,
      "eval_steps_per_second": 0.474,
      "step": 1700
    },
    {
      "epoch": 2.751572327044025,
      "grad_norm": 0.18850775063037872,
      "learning_rate": 4.6010483401281306e-05,
      "loss": 1.8253,
      "step": 1750
    },
    {
      "epoch": 2.751572327044025,
      "eval_loss": 1.937272310256958,
      "eval_runtime": 27.2797,
      "eval_samples_per_second": 3.666,
      "eval_steps_per_second": 0.477,
      "step": 1750
    },
    {
      "epoch": 2.830188679245283,
      "grad_norm": 0.19603122770786285,
      "learning_rate": 3.145020384391381e-05,
      "loss": 1.8055,
      "step": 1800
    },
    {
      "epoch": 2.830188679245283,
      "eval_loss": 1.9364655017852783,
      "eval_runtime": 27.2904,
      "eval_samples_per_second": 3.664,
      "eval_steps_per_second": 0.476,
      "step": 1800
    },
    {
      "epoch": 2.908805031446541,
      "grad_norm": 0.22059138119220734,
      "learning_rate": 1.68899242865463e-05,
      "loss": 1.8207,
      "step": 1850
    },
    {
      "epoch": 2.908805031446541,
      "eval_loss": 1.9341883659362793,
      "eval_runtime": 27.2939,
      "eval_samples_per_second": 3.664,
      "eval_steps_per_second": 0.476,
      "step": 1850
    },
    {
      "epoch": 2.9874213836477987,
      "grad_norm": 0.20625567436218262,
      "learning_rate": 2.3296447291788002e-06,
      "loss": 1.8514,
      "step": 1900
    },
    {
      "epoch": 2.9874213836477987,
      "eval_loss": 1.9331157207489014,
      "eval_runtime": 27.3592,
      "eval_samples_per_second": 3.655,
      "eval_steps_per_second": 0.475,
      "step": 1900
    }
  ],
  "logging_steps": 50,
  "max_steps": 1908,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 50,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 2.377867800895488e+16,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}