|
{ |
|
"best_metric": 2.989222764968872, |
|
"best_model_checkpoint": "models/opt-babylm2-rewritten-clean-spacy-32k-earlystop-40epochs_seed-42_3e-4/checkpoint-40566", |
|
"epoch": 24.0, |
|
"eval_steps": 500, |
|
"global_step": 46362, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.5176653293645658, |
|
"grad_norm": 0.485008180141449, |
|
"learning_rate": 9.375e-06, |
|
"loss": 6.8783, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.9996117510029766, |
|
"eval_accuracy": 0.289060183567583, |
|
"eval_loss": 4.46536922454834, |
|
"eval_runtime": 111.7462, |
|
"eval_samples_per_second": 467.04, |
|
"eval_steps_per_second": 7.302, |
|
"step": 1931 |
|
}, |
|
{ |
|
"epoch": 1.0353306587291315, |
|
"grad_norm": 0.6161150932312012, |
|
"learning_rate": 1.875e-05, |
|
"loss": 4.6542, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.5529959880936974, |
|
"grad_norm": 0.6660575270652771, |
|
"learning_rate": 2.8125e-05, |
|
"loss": 4.2263, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 1.9997411673353178, |
|
"eval_accuracy": 0.3337067802786951, |
|
"eval_loss": 3.917879343032837, |
|
"eval_runtime": 112.2732, |
|
"eval_samples_per_second": 464.848, |
|
"eval_steps_per_second": 7.268, |
|
"step": 3863 |
|
}, |
|
{ |
|
"epoch": 2.070661317458263, |
|
"grad_norm": 0.6615888476371765, |
|
"learning_rate": 3.75e-05, |
|
"loss": 3.9582, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 2.588326646822829, |
|
"grad_norm": 0.7149969339370728, |
|
"learning_rate": 4.6874999999999994e-05, |
|
"loss": 3.7724, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 2.9998705836676587, |
|
"eval_accuracy": 0.3562454681048507, |
|
"eval_loss": 3.6396915912628174, |
|
"eval_runtime": 112.1753, |
|
"eval_samples_per_second": 465.254, |
|
"eval_steps_per_second": 7.274, |
|
"step": 5795 |
|
}, |
|
{ |
|
"epoch": 3.105991976187395, |
|
"grad_norm": 0.6370624899864197, |
|
"learning_rate": 5.625e-05, |
|
"loss": 3.6294, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 3.6236573055519608, |
|
"grad_norm": 0.6175299286842346, |
|
"learning_rate": 6.5625e-05, |
|
"loss": 3.5091, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.3723592153857136, |
|
"eval_loss": 3.456868886947632, |
|
"eval_runtime": 111.9653, |
|
"eval_samples_per_second": 466.126, |
|
"eval_steps_per_second": 7.288, |
|
"step": 7727 |
|
}, |
|
{ |
|
"epoch": 4.141322634916526, |
|
"grad_norm": 0.5864672064781189, |
|
"learning_rate": 7.5e-05, |
|
"loss": 3.4133, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 4.658987964281092, |
|
"grad_norm": 0.5497373342514038, |
|
"learning_rate": 8.437499999999999e-05, |
|
"loss": 3.3306, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 4.999611751002977, |
|
"eval_accuracy": 0.3838290710037608, |
|
"eval_loss": 3.3310000896453857, |
|
"eval_runtime": 112.0797, |
|
"eval_samples_per_second": 465.651, |
|
"eval_steps_per_second": 7.281, |
|
"step": 9658 |
|
}, |
|
{ |
|
"epoch": 5.176653293645658, |
|
"grad_norm": 0.5599066019058228, |
|
"learning_rate": 9.374999999999999e-05, |
|
"loss": 3.2608, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 5.694318623010224, |
|
"grad_norm": 0.5453025698661804, |
|
"learning_rate": 0.00010312499999999999, |
|
"loss": 3.2012, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 5.999741167335317, |
|
"eval_accuracy": 0.3918066341309469, |
|
"eval_loss": 3.2469069957733154, |
|
"eval_runtime": 112.2873, |
|
"eval_samples_per_second": 464.79, |
|
"eval_steps_per_second": 7.267, |
|
"step": 11590 |
|
}, |
|
{ |
|
"epoch": 6.21198395237479, |
|
"grad_norm": 0.5039867162704468, |
|
"learning_rate": 0.0001125, |
|
"loss": 3.152, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 6.729649281739356, |
|
"grad_norm": 0.4990510642528534, |
|
"learning_rate": 0.000121875, |
|
"loss": 3.1088, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 6.999870583667659, |
|
"eval_accuracy": 0.39817732342985096, |
|
"eval_loss": 3.1827573776245117, |
|
"eval_runtime": 112.5977, |
|
"eval_samples_per_second": 463.509, |
|
"eval_steps_per_second": 7.247, |
|
"step": 13522 |
|
}, |
|
{ |
|
"epoch": 7.2473146111039215, |
|
"grad_norm": 0.46827396750450134, |
|
"learning_rate": 0.00013125, |
|
"loss": 3.0691, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 7.764979940468487, |
|
"grad_norm": 0.4815771281719208, |
|
"learning_rate": 0.000140625, |
|
"loss": 3.0364, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_accuracy": 0.40226096953439355, |
|
"eval_loss": 3.1404154300689697, |
|
"eval_runtime": 112.1878, |
|
"eval_samples_per_second": 465.202, |
|
"eval_steps_per_second": 7.274, |
|
"step": 15454 |
|
}, |
|
{ |
|
"epoch": 8.282645269833052, |
|
"grad_norm": 0.4806288778781891, |
|
"learning_rate": 0.000149990625, |
|
"loss": 3.0022, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 8.80031059919762, |
|
"grad_norm": 0.4368716776371002, |
|
"learning_rate": 0.00015936562499999999, |
|
"loss": 2.9837, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 8.999611751002977, |
|
"eval_accuracy": 0.4056956294684956, |
|
"eval_loss": 3.1080353260040283, |
|
"eval_runtime": 112.3344, |
|
"eval_samples_per_second": 464.595, |
|
"eval_steps_per_second": 7.264, |
|
"step": 17385 |
|
}, |
|
{ |
|
"epoch": 9.317975928562184, |
|
"grad_norm": 0.44254282116889954, |
|
"learning_rate": 0.00016873124999999998, |
|
"loss": 2.9495, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 9.835641257926751, |
|
"grad_norm": 0.4331772029399872, |
|
"learning_rate": 0.00017809687499999998, |
|
"loss": 2.9377, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 9.999741167335317, |
|
"eval_accuracy": 0.40769541156182726, |
|
"eval_loss": 3.083951711654663, |
|
"eval_runtime": 112.4544, |
|
"eval_samples_per_second": 464.099, |
|
"eval_steps_per_second": 7.256, |
|
"step": 19317 |
|
}, |
|
{ |
|
"epoch": 10.353306587291316, |
|
"grad_norm": 0.425889790058136, |
|
"learning_rate": 0.00018747187499999998, |
|
"loss": 2.9051, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 10.870971916655883, |
|
"grad_norm": 0.40670710802078247, |
|
"learning_rate": 0.00019684687499999998, |
|
"loss": 2.9019, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 10.99987058366766, |
|
"eval_accuracy": 0.41011139539164965, |
|
"eval_loss": 3.063291311264038, |
|
"eval_runtime": 112.0395, |
|
"eval_samples_per_second": 465.818, |
|
"eval_steps_per_second": 7.283, |
|
"step": 21249 |
|
}, |
|
{ |
|
"epoch": 11.388637246020448, |
|
"grad_norm": 0.4176114499568939, |
|
"learning_rate": 0.00020622187499999998, |
|
"loss": 2.8676, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 11.906302575385013, |
|
"grad_norm": 0.400343120098114, |
|
"learning_rate": 0.00021559687499999997, |
|
"loss": 2.8713, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_accuracy": 0.4117278120291995, |
|
"eval_loss": 3.0504517555236816, |
|
"eval_runtime": 111.9439, |
|
"eval_samples_per_second": 466.216, |
|
"eval_steps_per_second": 7.289, |
|
"step": 23181 |
|
}, |
|
{ |
|
"epoch": 12.42396790474958, |
|
"grad_norm": 0.3873593509197235, |
|
"learning_rate": 0.00022496249999999997, |
|
"loss": 2.8353, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 12.941633234114144, |
|
"grad_norm": 0.3830123543739319, |
|
"learning_rate": 0.00023433749999999997, |
|
"loss": 2.8449, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 12.999611751002977, |
|
"eval_accuracy": 0.41297589125705847, |
|
"eval_loss": 3.037565231323242, |
|
"eval_runtime": 112.2612, |
|
"eval_samples_per_second": 464.898, |
|
"eval_steps_per_second": 7.269, |
|
"step": 25112 |
|
}, |
|
{ |
|
"epoch": 13.459298563478711, |
|
"grad_norm": 0.390222430229187, |
|
"learning_rate": 0.00024369374999999997, |
|
"loss": 2.8068, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 13.976963892843276, |
|
"grad_norm": 0.3650892376899719, |
|
"learning_rate": 0.00025306875, |
|
"loss": 2.8231, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 13.999741167335317, |
|
"eval_accuracy": 0.41429497800269754, |
|
"eval_loss": 3.026965856552124, |
|
"eval_runtime": 112.249, |
|
"eval_samples_per_second": 464.948, |
|
"eval_steps_per_second": 7.27, |
|
"step": 27044 |
|
}, |
|
{ |
|
"epoch": 14.494629222207843, |
|
"grad_norm": 0.3750116527080536, |
|
"learning_rate": 0.00026244375, |
|
"loss": 2.7828, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 14.99987058366766, |
|
"eval_accuracy": 0.4150191044035932, |
|
"eval_loss": 3.0222034454345703, |
|
"eval_runtime": 112.1123, |
|
"eval_samples_per_second": 465.516, |
|
"eval_steps_per_second": 7.278, |
|
"step": 28976 |
|
}, |
|
{ |
|
"epoch": 15.012294551572408, |
|
"grad_norm": 0.39160436391830444, |
|
"learning_rate": 0.00027181875, |
|
"loss": 2.804, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 15.529959880936975, |
|
"grad_norm": 0.36566755175590515, |
|
"learning_rate": 0.00028118437499999993, |
|
"loss": 2.7644, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_accuracy": 0.4156012157689288, |
|
"eval_loss": 3.0159823894500732, |
|
"eval_runtime": 112.075, |
|
"eval_samples_per_second": 465.671, |
|
"eval_steps_per_second": 7.281, |
|
"step": 30908 |
|
}, |
|
{ |
|
"epoch": 16.04762521030154, |
|
"grad_norm": 0.37446266412734985, |
|
"learning_rate": 0.000290559375, |
|
"loss": 2.7817, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 16.565290539666105, |
|
"grad_norm": 0.3600151836872101, |
|
"learning_rate": 0.000299925, |
|
"loss": 2.7508, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 16.999611751002977, |
|
"eval_accuracy": 0.4166737674184447, |
|
"eval_loss": 3.010031223297119, |
|
"eval_runtime": 112.0828, |
|
"eval_samples_per_second": 465.638, |
|
"eval_steps_per_second": 7.28, |
|
"step": 32839 |
|
}, |
|
{ |
|
"epoch": 17.082955869030673, |
|
"grad_norm": 0.35380175709724426, |
|
"learning_rate": 0.00029342175066312994, |
|
"loss": 2.7621, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 17.60062119839524, |
|
"grad_norm": 0.3466060161590576, |
|
"learning_rate": 0.0002867970822281167, |
|
"loss": 2.7296, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 17.99974116733532, |
|
"eval_accuracy": 0.41775473477377156, |
|
"eval_loss": 3.002298355102539, |
|
"eval_runtime": 112.139, |
|
"eval_samples_per_second": 465.405, |
|
"eval_steps_per_second": 7.277, |
|
"step": 34771 |
|
}, |
|
{ |
|
"epoch": 18.118286527759803, |
|
"grad_norm": 0.35987117886543274, |
|
"learning_rate": 0.0002801657824933687, |
|
"loss": 2.7304, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 18.63595185712437, |
|
"grad_norm": 0.345238596200943, |
|
"learning_rate": 0.0002735411140583554, |
|
"loss": 2.7053, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 18.999870583667658, |
|
"eval_accuracy": 0.41883562698886795, |
|
"eval_loss": 2.9966797828674316, |
|
"eval_runtime": 112.4594, |
|
"eval_samples_per_second": 464.079, |
|
"eval_steps_per_second": 7.256, |
|
"step": 36703 |
|
}, |
|
{ |
|
"epoch": 19.153617186488933, |
|
"grad_norm": 0.35703110694885254, |
|
"learning_rate": 0.0002669098143236074, |
|
"loss": 2.701, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 19.671282515853502, |
|
"grad_norm": 0.3403065800666809, |
|
"learning_rate": 0.00026028514588859414, |
|
"loss": 2.6821, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"eval_accuracy": 0.419519853927392, |
|
"eval_loss": 2.9925894737243652, |
|
"eval_runtime": 112.0678, |
|
"eval_samples_per_second": 465.7, |
|
"eval_steps_per_second": 7.281, |
|
"step": 38635 |
|
}, |
|
{ |
|
"epoch": 20.188947845218067, |
|
"grad_norm": 0.36988621950149536, |
|
"learning_rate": 0.0002536538461538461, |
|
"loss": 2.6747, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 20.706613174582632, |
|
"grad_norm": 0.3578760623931885, |
|
"learning_rate": 0.0002470291777188329, |
|
"loss": 2.6601, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 20.999611751002977, |
|
"eval_accuracy": 0.4202126468521879, |
|
"eval_loss": 2.989222764968872, |
|
"eval_runtime": 112.0379, |
|
"eval_samples_per_second": 465.824, |
|
"eval_steps_per_second": 7.283, |
|
"step": 40566 |
|
}, |
|
{ |
|
"epoch": 21.224278503947197, |
|
"grad_norm": 0.36684709787368774, |
|
"learning_rate": 0.00024039787798408487, |
|
"loss": 2.6476, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 21.741943833311765, |
|
"grad_norm": 0.3607647120952606, |
|
"learning_rate": 0.0002337732095490716, |
|
"loss": 2.6406, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 21.99974116733532, |
|
"eval_accuracy": 0.4203659329223163, |
|
"eval_loss": 2.9898085594177246, |
|
"eval_runtime": 112.2513, |
|
"eval_samples_per_second": 464.939, |
|
"eval_steps_per_second": 7.269, |
|
"step": 42498 |
|
}, |
|
{ |
|
"epoch": 22.25960916267633, |
|
"grad_norm": 0.3788565695285797, |
|
"learning_rate": 0.0002271419098143236, |
|
"loss": 2.6251, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 22.777274492040895, |
|
"grad_norm": 0.3606908619403839, |
|
"learning_rate": 0.00022051061007957558, |
|
"loss": 2.621, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 22.999870583667658, |
|
"eval_accuracy": 0.42054829826163076, |
|
"eval_loss": 2.992137908935547, |
|
"eval_runtime": 112.0224, |
|
"eval_samples_per_second": 465.889, |
|
"eval_steps_per_second": 7.284, |
|
"step": 44430 |
|
}, |
|
{ |
|
"epoch": 23.29493982140546, |
|
"grad_norm": 0.3780686855316162, |
|
"learning_rate": 0.00021389257294429705, |
|
"loss": 2.5996, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 23.812605150770025, |
|
"grad_norm": 0.3720705807209015, |
|
"learning_rate": 0.00020726127320954907, |
|
"loss": 2.6048, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 24.0, |
|
"eval_accuracy": 0.4207038385386728, |
|
"eval_loss": 2.992816925048828, |
|
"eval_runtime": 111.6981, |
|
"eval_samples_per_second": 467.242, |
|
"eval_steps_per_second": 7.305, |
|
"step": 46362 |
|
}, |
|
{ |
|
"epoch": 24.0, |
|
"step": 46362, |
|
"total_flos": 1.550560221462528e+18, |
|
"train_loss": 3.0875192876465034, |
|
"train_runtime": 49059.1729, |
|
"train_samples_per_second": 403.2, |
|
"train_steps_per_second": 1.574 |
|
} |
|
], |
|
"logging_steps": 1000, |
|
"max_steps": 77240, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 40, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 3, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 3 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.550560221462528e+18, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|