{ "best_metric": 2.989222764968872, "best_model_checkpoint": "models/opt-babylm2-rewritten-clean-spacy-32k-earlystop-40epochs_seed-42_3e-4/checkpoint-40566", "epoch": 24.0, "eval_steps": 500, "global_step": 46362, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.5176653293645658, "grad_norm": 0.485008180141449, "learning_rate": 9.375e-06, "loss": 6.8783, "step": 1000 }, { "epoch": 0.9996117510029766, "eval_accuracy": 0.289060183567583, "eval_loss": 4.46536922454834, "eval_runtime": 111.7462, "eval_samples_per_second": 467.04, "eval_steps_per_second": 7.302, "step": 1931 }, { "epoch": 1.0353306587291315, "grad_norm": 0.6161150932312012, "learning_rate": 1.875e-05, "loss": 4.6542, "step": 2000 }, { "epoch": 1.5529959880936974, "grad_norm": 0.6660575270652771, "learning_rate": 2.8125e-05, "loss": 4.2263, "step": 3000 }, { "epoch": 1.9997411673353178, "eval_accuracy": 0.3337067802786951, "eval_loss": 3.917879343032837, "eval_runtime": 112.2732, "eval_samples_per_second": 464.848, "eval_steps_per_second": 7.268, "step": 3863 }, { "epoch": 2.070661317458263, "grad_norm": 0.6615888476371765, "learning_rate": 3.75e-05, "loss": 3.9582, "step": 4000 }, { "epoch": 2.588326646822829, "grad_norm": 0.7149969339370728, "learning_rate": 4.6874999999999994e-05, "loss": 3.7724, "step": 5000 }, { "epoch": 2.9998705836676587, "eval_accuracy": 0.3562454681048507, "eval_loss": 3.6396915912628174, "eval_runtime": 112.1753, "eval_samples_per_second": 465.254, "eval_steps_per_second": 7.274, "step": 5795 }, { "epoch": 3.105991976187395, "grad_norm": 0.6370624899864197, "learning_rate": 5.625e-05, "loss": 3.6294, "step": 6000 }, { "epoch": 3.6236573055519608, "grad_norm": 0.6175299286842346, "learning_rate": 6.5625e-05, "loss": 3.5091, "step": 7000 }, { "epoch": 4.0, "eval_accuracy": 0.3723592153857136, "eval_loss": 3.456868886947632, "eval_runtime": 111.9653, "eval_samples_per_second": 466.126, "eval_steps_per_second": 7.288, "step": 7727 }, { "epoch": 4.141322634916526, "grad_norm": 0.5864672064781189, "learning_rate": 7.5e-05, "loss": 3.4133, "step": 8000 }, { "epoch": 4.658987964281092, "grad_norm": 0.5497373342514038, "learning_rate": 8.437499999999999e-05, "loss": 3.3306, "step": 9000 }, { "epoch": 4.999611751002977, "eval_accuracy": 0.3838290710037608, "eval_loss": 3.3310000896453857, "eval_runtime": 112.0797, "eval_samples_per_second": 465.651, "eval_steps_per_second": 7.281, "step": 9658 }, { "epoch": 5.176653293645658, "grad_norm": 0.5599066019058228, "learning_rate": 9.374999999999999e-05, "loss": 3.2608, "step": 10000 }, { "epoch": 5.694318623010224, "grad_norm": 0.5453025698661804, "learning_rate": 0.00010312499999999999, "loss": 3.2012, "step": 11000 }, { "epoch": 5.999741167335317, "eval_accuracy": 0.3918066341309469, "eval_loss": 3.2469069957733154, "eval_runtime": 112.2873, "eval_samples_per_second": 464.79, "eval_steps_per_second": 7.267, "step": 11590 }, { "epoch": 6.21198395237479, "grad_norm": 0.5039867162704468, "learning_rate": 0.0001125, "loss": 3.152, "step": 12000 }, { "epoch": 6.729649281739356, "grad_norm": 0.4990510642528534, "learning_rate": 0.000121875, "loss": 3.1088, "step": 13000 }, { "epoch": 6.999870583667659, "eval_accuracy": 0.39817732342985096, "eval_loss": 3.1827573776245117, "eval_runtime": 112.5977, "eval_samples_per_second": 463.509, "eval_steps_per_second": 7.247, "step": 13522 }, { "epoch": 7.2473146111039215, "grad_norm": 0.46827396750450134, "learning_rate": 0.00013125, "loss": 3.0691, "step": 14000 }, { "epoch": 7.764979940468487, "grad_norm": 0.4815771281719208, "learning_rate": 0.000140625, "loss": 3.0364, "step": 15000 }, { "epoch": 8.0, "eval_accuracy": 0.40226096953439355, "eval_loss": 3.1404154300689697, "eval_runtime": 112.1878, "eval_samples_per_second": 465.202, "eval_steps_per_second": 7.274, "step": 15454 }, { "epoch": 8.282645269833052, "grad_norm": 0.4806288778781891, "learning_rate": 0.000149990625, "loss": 3.0022, "step": 16000 }, { "epoch": 8.80031059919762, "grad_norm": 0.4368716776371002, "learning_rate": 0.00015936562499999999, "loss": 2.9837, "step": 17000 }, { "epoch": 8.999611751002977, "eval_accuracy": 0.4056956294684956, "eval_loss": 3.1080353260040283, "eval_runtime": 112.3344, "eval_samples_per_second": 464.595, "eval_steps_per_second": 7.264, "step": 17385 }, { "epoch": 9.317975928562184, "grad_norm": 0.44254282116889954, "learning_rate": 0.00016873124999999998, "loss": 2.9495, "step": 18000 }, { "epoch": 9.835641257926751, "grad_norm": 0.4331772029399872, "learning_rate": 0.00017809687499999998, "loss": 2.9377, "step": 19000 }, { "epoch": 9.999741167335317, "eval_accuracy": 0.40769541156182726, "eval_loss": 3.083951711654663, "eval_runtime": 112.4544, "eval_samples_per_second": 464.099, "eval_steps_per_second": 7.256, "step": 19317 }, { "epoch": 10.353306587291316, "grad_norm": 0.425889790058136, "learning_rate": 0.00018747187499999998, "loss": 2.9051, "step": 20000 }, { "epoch": 10.870971916655883, "grad_norm": 0.40670710802078247, "learning_rate": 0.00019684687499999998, "loss": 2.9019, "step": 21000 }, { "epoch": 10.99987058366766, "eval_accuracy": 0.41011139539164965, "eval_loss": 3.063291311264038, "eval_runtime": 112.0395, "eval_samples_per_second": 465.818, "eval_steps_per_second": 7.283, "step": 21249 }, { "epoch": 11.388637246020448, "grad_norm": 0.4176114499568939, "learning_rate": 0.00020622187499999998, "loss": 2.8676, "step": 22000 }, { "epoch": 11.906302575385013, "grad_norm": 0.400343120098114, "learning_rate": 0.00021559687499999997, "loss": 2.8713, "step": 23000 }, { "epoch": 12.0, "eval_accuracy": 0.4117278120291995, "eval_loss": 3.0504517555236816, "eval_runtime": 111.9439, "eval_samples_per_second": 466.216, "eval_steps_per_second": 7.289, "step": 23181 }, { "epoch": 12.42396790474958, "grad_norm": 0.3873593509197235, "learning_rate": 0.00022496249999999997, "loss": 2.8353, "step": 24000 }, { "epoch": 12.941633234114144, "grad_norm": 0.3830123543739319, "learning_rate": 0.00023433749999999997, "loss": 2.8449, "step": 25000 }, { "epoch": 12.999611751002977, "eval_accuracy": 0.41297589125705847, "eval_loss": 3.037565231323242, "eval_runtime": 112.2612, "eval_samples_per_second": 464.898, "eval_steps_per_second": 7.269, "step": 25112 }, { "epoch": 13.459298563478711, "grad_norm": 0.390222430229187, "learning_rate": 0.00024369374999999997, "loss": 2.8068, "step": 26000 }, { "epoch": 13.976963892843276, "grad_norm": 0.3650892376899719, "learning_rate": 0.00025306875, "loss": 2.8231, "step": 27000 }, { "epoch": 13.999741167335317, "eval_accuracy": 0.41429497800269754, "eval_loss": 3.026965856552124, "eval_runtime": 112.249, "eval_samples_per_second": 464.948, "eval_steps_per_second": 7.27, "step": 27044 }, { "epoch": 14.494629222207843, "grad_norm": 0.3750116527080536, "learning_rate": 0.00026244375, "loss": 2.7828, "step": 28000 }, { "epoch": 14.99987058366766, "eval_accuracy": 0.4150191044035932, "eval_loss": 3.0222034454345703, "eval_runtime": 112.1123, "eval_samples_per_second": 465.516, "eval_steps_per_second": 7.278, "step": 28976 }, { "epoch": 15.012294551572408, "grad_norm": 0.39160436391830444, "learning_rate": 0.00027181875, "loss": 2.804, "step": 29000 }, { "epoch": 15.529959880936975, "grad_norm": 0.36566755175590515, "learning_rate": 0.00028118437499999993, "loss": 2.7644, "step": 30000 }, { "epoch": 16.0, "eval_accuracy": 0.4156012157689288, "eval_loss": 3.0159823894500732, "eval_runtime": 112.075, "eval_samples_per_second": 465.671, "eval_steps_per_second": 7.281, "step": 30908 }, { "epoch": 16.04762521030154, "grad_norm": 0.37446266412734985, "learning_rate": 0.000290559375, "loss": 2.7817, "step": 31000 }, { "epoch": 16.565290539666105, "grad_norm": 0.3600151836872101, "learning_rate": 0.000299925, "loss": 2.7508, "step": 32000 }, { "epoch": 16.999611751002977, "eval_accuracy": 0.4166737674184447, "eval_loss": 3.010031223297119, "eval_runtime": 112.0828, "eval_samples_per_second": 465.638, "eval_steps_per_second": 7.28, "step": 32839 }, { "epoch": 17.082955869030673, "grad_norm": 0.35380175709724426, "learning_rate": 0.00029342175066312994, "loss": 2.7621, "step": 33000 }, { "epoch": 17.60062119839524, "grad_norm": 0.3466060161590576, "learning_rate": 0.0002867970822281167, "loss": 2.7296, "step": 34000 }, { "epoch": 17.99974116733532, "eval_accuracy": 0.41775473477377156, "eval_loss": 3.002298355102539, "eval_runtime": 112.139, "eval_samples_per_second": 465.405, "eval_steps_per_second": 7.277, "step": 34771 }, { "epoch": 18.118286527759803, "grad_norm": 0.35987117886543274, "learning_rate": 0.0002801657824933687, "loss": 2.7304, "step": 35000 }, { "epoch": 18.63595185712437, "grad_norm": 0.345238596200943, "learning_rate": 0.0002735411140583554, "loss": 2.7053, "step": 36000 }, { "epoch": 18.999870583667658, "eval_accuracy": 0.41883562698886795, "eval_loss": 2.9966797828674316, "eval_runtime": 112.4594, "eval_samples_per_second": 464.079, "eval_steps_per_second": 7.256, "step": 36703 }, { "epoch": 19.153617186488933, "grad_norm": 0.35703110694885254, "learning_rate": 0.0002669098143236074, "loss": 2.701, "step": 37000 }, { "epoch": 19.671282515853502, "grad_norm": 0.3403065800666809, "learning_rate": 0.00026028514588859414, "loss": 2.6821, "step": 38000 }, { "epoch": 20.0, "eval_accuracy": 0.419519853927392, "eval_loss": 2.9925894737243652, "eval_runtime": 112.0678, "eval_samples_per_second": 465.7, "eval_steps_per_second": 7.281, "step": 38635 }, { "epoch": 20.188947845218067, "grad_norm": 0.36988621950149536, "learning_rate": 0.0002536538461538461, "loss": 2.6747, "step": 39000 }, { "epoch": 20.706613174582632, "grad_norm": 0.3578760623931885, "learning_rate": 0.0002470291777188329, "loss": 2.6601, "step": 40000 }, { "epoch": 20.999611751002977, "eval_accuracy": 0.4202126468521879, "eval_loss": 2.989222764968872, "eval_runtime": 112.0379, "eval_samples_per_second": 465.824, "eval_steps_per_second": 7.283, "step": 40566 }, { "epoch": 21.224278503947197, "grad_norm": 0.36684709787368774, "learning_rate": 0.00024039787798408487, "loss": 2.6476, "step": 41000 }, { "epoch": 21.741943833311765, "grad_norm": 0.3607647120952606, "learning_rate": 0.0002337732095490716, "loss": 2.6406, "step": 42000 }, { "epoch": 21.99974116733532, "eval_accuracy": 0.4203659329223163, "eval_loss": 2.9898085594177246, "eval_runtime": 112.2513, "eval_samples_per_second": 464.939, "eval_steps_per_second": 7.269, "step": 42498 }, { "epoch": 22.25960916267633, "grad_norm": 0.3788565695285797, "learning_rate": 0.0002271419098143236, "loss": 2.6251, "step": 43000 }, { "epoch": 22.777274492040895, "grad_norm": 0.3606908619403839, "learning_rate": 0.00022051061007957558, "loss": 2.621, "step": 44000 }, { "epoch": 22.999870583667658, "eval_accuracy": 0.42054829826163076, "eval_loss": 2.992137908935547, "eval_runtime": 112.0224, "eval_samples_per_second": 465.889, "eval_steps_per_second": 7.284, "step": 44430 }, { "epoch": 23.29493982140546, "grad_norm": 0.3780686855316162, "learning_rate": 0.00021389257294429705, "loss": 2.5996, "step": 45000 }, { "epoch": 23.812605150770025, "grad_norm": 0.3720705807209015, "learning_rate": 0.00020726127320954907, "loss": 2.6048, "step": 46000 }, { "epoch": 24.0, "eval_accuracy": 0.4207038385386728, "eval_loss": 2.992816925048828, "eval_runtime": 111.6981, "eval_samples_per_second": 467.242, "eval_steps_per_second": 7.305, "step": 46362 }, { "epoch": 24.0, "step": 46362, "total_flos": 1.550560221462528e+18, "train_loss": 3.0875192876465034, "train_runtime": 49059.1729, "train_samples_per_second": 403.2, "train_steps_per_second": 1.574 } ], "logging_steps": 1000, "max_steps": 77240, "num_input_tokens_seen": 0, "num_train_epochs": 40, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 3 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.550560221462528e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }