{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.992874109263658, "eval_steps": 500, "global_step": 945, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03167062549485352, "grad_norm": 1.8530284877370395, "learning_rate": 5e-06, "loss": 0.8166, "step": 10 }, { "epoch": 0.06334125098970704, "grad_norm": 1.5337420189302975, "learning_rate": 5e-06, "loss": 0.7464, "step": 20 }, { "epoch": 0.09501187648456057, "grad_norm": 1.4029937472724519, "learning_rate": 5e-06, "loss": 0.7243, "step": 30 }, { "epoch": 0.12668250197941408, "grad_norm": 0.9360700334222073, "learning_rate": 5e-06, "loss": 0.7146, "step": 40 }, { "epoch": 0.1583531274742676, "grad_norm": 0.994137693760135, "learning_rate": 5e-06, "loss": 0.6895, "step": 50 }, { "epoch": 0.19002375296912113, "grad_norm": 0.9347536063077458, "learning_rate": 5e-06, "loss": 0.6778, "step": 60 }, { "epoch": 0.22169437846397466, "grad_norm": 1.4913361116122625, "learning_rate": 5e-06, "loss": 0.6594, "step": 70 }, { "epoch": 0.25336500395882816, "grad_norm": 0.7126258978453617, "learning_rate": 5e-06, "loss": 0.6637, "step": 80 }, { "epoch": 0.2850356294536817, "grad_norm": 0.9899069063293856, "learning_rate": 5e-06, "loss": 0.6456, "step": 90 }, { "epoch": 0.3167062549485352, "grad_norm": 0.8045731477224973, "learning_rate": 5e-06, "loss": 0.6537, "step": 100 }, { "epoch": 0.34837688044338877, "grad_norm": 0.7578866653732541, "learning_rate": 5e-06, "loss": 0.6481, "step": 110 }, { "epoch": 0.38004750593824227, "grad_norm": 0.5676407147308244, "learning_rate": 5e-06, "loss": 0.6419, "step": 120 }, { "epoch": 0.4117181314330958, "grad_norm": 0.8078622210774858, "learning_rate": 5e-06, "loss": 0.6479, "step": 130 }, { "epoch": 0.4433887569279493, "grad_norm": 0.7647299971015771, "learning_rate": 5e-06, "loss": 0.6439, "step": 140 }, { "epoch": 0.4750593824228028, "grad_norm": 0.5575084199732572, "learning_rate": 5e-06, "loss": 0.6388, "step": 150 }, { "epoch": 0.5067300079176563, "grad_norm": 0.5128045187828547, "learning_rate": 5e-06, "loss": 0.6347, "step": 160 }, { "epoch": 0.5384006334125099, "grad_norm": 0.8861637952775342, "learning_rate": 5e-06, "loss": 0.6354, "step": 170 }, { "epoch": 0.5700712589073634, "grad_norm": 0.7708516251715861, "learning_rate": 5e-06, "loss": 0.6365, "step": 180 }, { "epoch": 0.601741884402217, "grad_norm": 0.6980158820500171, "learning_rate": 5e-06, "loss": 0.6339, "step": 190 }, { "epoch": 0.6334125098970704, "grad_norm": 0.5994527923990384, "learning_rate": 5e-06, "loss": 0.6384, "step": 200 }, { "epoch": 0.665083135391924, "grad_norm": 1.2967423762938355, "learning_rate": 5e-06, "loss": 0.6369, "step": 210 }, { "epoch": 0.6967537608867775, "grad_norm": 0.47535091611506025, "learning_rate": 5e-06, "loss": 0.6321, "step": 220 }, { "epoch": 0.728424386381631, "grad_norm": 0.6353037224955166, "learning_rate": 5e-06, "loss": 0.6241, "step": 230 }, { "epoch": 0.7600950118764845, "grad_norm": 0.6601833755256151, "learning_rate": 5e-06, "loss": 0.623, "step": 240 }, { "epoch": 0.7917656373713381, "grad_norm": 0.6122026802056224, "learning_rate": 5e-06, "loss": 0.6263, "step": 250 }, { "epoch": 0.8234362628661916, "grad_norm": 0.5075094381024849, "learning_rate": 5e-06, "loss": 0.6301, "step": 260 }, { "epoch": 0.8551068883610451, "grad_norm": 0.5422031773802576, "learning_rate": 5e-06, "loss": 0.6292, "step": 270 }, { "epoch": 0.8867775138558986, "grad_norm": 0.42516469360864606, "learning_rate": 5e-06, "loss": 0.6177, "step": 280 }, { "epoch": 0.9184481393507522, "grad_norm": 0.7465232508313683, "learning_rate": 5e-06, "loss": 0.6197, "step": 290 }, { "epoch": 0.9501187648456056, "grad_norm": 0.5164060465761252, "learning_rate": 5e-06, "loss": 0.6286, "step": 300 }, { "epoch": 0.9817893903404592, "grad_norm": 0.6125601748565044, "learning_rate": 5e-06, "loss": 0.6301, "step": 310 }, { "epoch": 0.997624703087886, "eval_loss": 0.6208310723304749, "eval_runtime": 170.2387, "eval_samples_per_second": 49.965, "eval_steps_per_second": 0.394, "step": 315 }, { "epoch": 1.0134600158353126, "grad_norm": 0.8577950377026092, "learning_rate": 5e-06, "loss": 0.6042, "step": 320 }, { "epoch": 1.0451306413301662, "grad_norm": 0.563239613051996, "learning_rate": 5e-06, "loss": 0.5769, "step": 330 }, { "epoch": 1.0768012668250198, "grad_norm": 0.5011250455738395, "learning_rate": 5e-06, "loss": 0.5753, "step": 340 }, { "epoch": 1.1084718923198733, "grad_norm": 0.507779510609032, "learning_rate": 5e-06, "loss": 0.5755, "step": 350 }, { "epoch": 1.1401425178147269, "grad_norm": 0.5864390033721468, "learning_rate": 5e-06, "loss": 0.5731, "step": 360 }, { "epoch": 1.1718131433095804, "grad_norm": 0.5651111499269338, "learning_rate": 5e-06, "loss": 0.5687, "step": 370 }, { "epoch": 1.203483768804434, "grad_norm": 0.6150558670403593, "learning_rate": 5e-06, "loss": 0.5729, "step": 380 }, { "epoch": 1.2351543942992875, "grad_norm": 0.5808136283436993, "learning_rate": 5e-06, "loss": 0.5718, "step": 390 }, { "epoch": 1.2668250197941409, "grad_norm": 0.5830305205374973, "learning_rate": 5e-06, "loss": 0.5761, "step": 400 }, { "epoch": 1.2984956452889944, "grad_norm": 0.5467014072656838, "learning_rate": 5e-06, "loss": 0.5799, "step": 410 }, { "epoch": 1.330166270783848, "grad_norm": 0.5214522227982674, "learning_rate": 5e-06, "loss": 0.5734, "step": 420 }, { "epoch": 1.3618368962787015, "grad_norm": 0.7855825418949094, "learning_rate": 5e-06, "loss": 0.5712, "step": 430 }, { "epoch": 1.393507521773555, "grad_norm": 0.5656871920545978, "learning_rate": 5e-06, "loss": 0.576, "step": 440 }, { "epoch": 1.4251781472684084, "grad_norm": 0.5944245365102998, "learning_rate": 5e-06, "loss": 0.5708, "step": 450 }, { "epoch": 1.4568487727632622, "grad_norm": 0.5333261966632318, "learning_rate": 5e-06, "loss": 0.5671, "step": 460 }, { "epoch": 1.4885193982581155, "grad_norm": 0.491104016024968, "learning_rate": 5e-06, "loss": 0.5743, "step": 470 }, { "epoch": 1.520190023752969, "grad_norm": 0.5014163060947014, "learning_rate": 5e-06, "loss": 0.5728, "step": 480 }, { "epoch": 1.5518606492478226, "grad_norm": 0.4892383437610088, "learning_rate": 5e-06, "loss": 0.5802, "step": 490 }, { "epoch": 1.5835312747426762, "grad_norm": 0.4989580296588211, "learning_rate": 5e-06, "loss": 0.5751, "step": 500 }, { "epoch": 1.6152019002375297, "grad_norm": 0.616735446474154, "learning_rate": 5e-06, "loss": 0.5716, "step": 510 }, { "epoch": 1.646872525732383, "grad_norm": 0.5387044899734632, "learning_rate": 5e-06, "loss": 0.5751, "step": 520 }, { "epoch": 1.6785431512272369, "grad_norm": 0.5526914425760312, "learning_rate": 5e-06, "loss": 0.5684, "step": 530 }, { "epoch": 1.7102137767220902, "grad_norm": 0.6545410316398991, "learning_rate": 5e-06, "loss": 0.5702, "step": 540 }, { "epoch": 1.7418844022169437, "grad_norm": 0.5254133456615973, "learning_rate": 5e-06, "loss": 0.5674, "step": 550 }, { "epoch": 1.7735550277117973, "grad_norm": 0.521939360967283, "learning_rate": 5e-06, "loss": 0.5724, "step": 560 }, { "epoch": 1.8052256532066508, "grad_norm": 0.463070184825861, "learning_rate": 5e-06, "loss": 0.5818, "step": 570 }, { "epoch": 1.8368962787015044, "grad_norm": 0.6381561992890791, "learning_rate": 5e-06, "loss": 0.5688, "step": 580 }, { "epoch": 1.8685669041963577, "grad_norm": 0.6496360141809046, "learning_rate": 5e-06, "loss": 0.5646, "step": 590 }, { "epoch": 1.9002375296912115, "grad_norm": 0.5324989809239741, "learning_rate": 5e-06, "loss": 0.566, "step": 600 }, { "epoch": 1.9319081551860648, "grad_norm": 0.5111765382529705, "learning_rate": 5e-06, "loss": 0.5708, "step": 610 }, { "epoch": 1.9635787806809184, "grad_norm": 0.5610135178229989, "learning_rate": 5e-06, "loss": 0.574, "step": 620 }, { "epoch": 1.995249406175772, "grad_norm": 0.6915334000048331, "learning_rate": 5e-06, "loss": 0.58, "step": 630 }, { "epoch": 1.9984164687252575, "eval_loss": 0.6117470860481262, "eval_runtime": 171.4849, "eval_samples_per_second": 49.602, "eval_steps_per_second": 0.391, "step": 631 }, { "epoch": 2.0269200316706253, "grad_norm": 0.616676539774963, "learning_rate": 5e-06, "loss": 0.528, "step": 640 }, { "epoch": 2.058590657165479, "grad_norm": 0.6073136045120692, "learning_rate": 5e-06, "loss": 0.5195, "step": 650 }, { "epoch": 2.0902612826603324, "grad_norm": 0.5470687590939235, "learning_rate": 5e-06, "loss": 0.5258, "step": 660 }, { "epoch": 2.121931908155186, "grad_norm": 0.5476384369380188, "learning_rate": 5e-06, "loss": 0.5207, "step": 670 }, { "epoch": 2.1536025336500395, "grad_norm": 0.5807654736471419, "learning_rate": 5e-06, "loss": 0.5229, "step": 680 }, { "epoch": 2.1852731591448933, "grad_norm": 0.5658725818639244, "learning_rate": 5e-06, "loss": 0.5155, "step": 690 }, { "epoch": 2.2169437846397466, "grad_norm": 0.5270348810540799, "learning_rate": 5e-06, "loss": 0.5257, "step": 700 }, { "epoch": 2.2486144101346, "grad_norm": 0.6710965861689625, "learning_rate": 5e-06, "loss": 0.5148, "step": 710 }, { "epoch": 2.2802850356294537, "grad_norm": 0.5096235564504402, "learning_rate": 5e-06, "loss": 0.5181, "step": 720 }, { "epoch": 2.311955661124307, "grad_norm": 0.5550769313636378, "learning_rate": 5e-06, "loss": 0.5171, "step": 730 }, { "epoch": 2.343626286619161, "grad_norm": 0.7122071616039661, "learning_rate": 5e-06, "loss": 0.5181, "step": 740 }, { "epoch": 2.375296912114014, "grad_norm": 0.5158288008674774, "learning_rate": 5e-06, "loss": 0.5255, "step": 750 }, { "epoch": 2.406967537608868, "grad_norm": 0.5744842064968558, "learning_rate": 5e-06, "loss": 0.5232, "step": 760 }, { "epoch": 2.4386381631037213, "grad_norm": 0.48569927712881583, "learning_rate": 5e-06, "loss": 0.5231, "step": 770 }, { "epoch": 2.470308788598575, "grad_norm": 0.6323853806061133, "learning_rate": 5e-06, "loss": 0.5199, "step": 780 }, { "epoch": 2.5019794140934284, "grad_norm": 0.5506449580962853, "learning_rate": 5e-06, "loss": 0.5192, "step": 790 }, { "epoch": 2.5336500395882817, "grad_norm": 0.4790039532718334, "learning_rate": 5e-06, "loss": 0.5179, "step": 800 }, { "epoch": 2.5653206650831355, "grad_norm": 0.5390096335172841, "learning_rate": 5e-06, "loss": 0.5269, "step": 810 }, { "epoch": 2.596991290577989, "grad_norm": 0.5217848609501268, "learning_rate": 5e-06, "loss": 0.5229, "step": 820 }, { "epoch": 2.6286619160728426, "grad_norm": 0.5196866879306132, "learning_rate": 5e-06, "loss": 0.5217, "step": 830 }, { "epoch": 2.660332541567696, "grad_norm": 0.5333118708551611, "learning_rate": 5e-06, "loss": 0.525, "step": 840 }, { "epoch": 2.6920031670625493, "grad_norm": 0.5483398687319767, "learning_rate": 5e-06, "loss": 0.5224, "step": 850 }, { "epoch": 2.723673792557403, "grad_norm": 0.5117263088559509, "learning_rate": 5e-06, "loss": 0.5283, "step": 860 }, { "epoch": 2.7553444180522564, "grad_norm": 0.5377436931762913, "learning_rate": 5e-06, "loss": 0.5232, "step": 870 }, { "epoch": 2.78701504354711, "grad_norm": 0.5662585202942261, "learning_rate": 5e-06, "loss": 0.5274, "step": 880 }, { "epoch": 2.8186856690419635, "grad_norm": 0.48954404185612554, "learning_rate": 5e-06, "loss": 0.5273, "step": 890 }, { "epoch": 2.850356294536817, "grad_norm": 0.5387435059221906, "learning_rate": 5e-06, "loss": 0.5181, "step": 900 }, { "epoch": 2.8820269200316706, "grad_norm": 0.5630237900783817, "learning_rate": 5e-06, "loss": 0.5228, "step": 910 }, { "epoch": 2.9136975455265244, "grad_norm": 0.4995116097709295, "learning_rate": 5e-06, "loss": 0.5247, "step": 920 }, { "epoch": 2.9453681710213777, "grad_norm": 0.49890480333211984, "learning_rate": 5e-06, "loss": 0.521, "step": 930 }, { "epoch": 2.977038796516231, "grad_norm": 0.5217828385490265, "learning_rate": 5e-06, "loss": 0.5261, "step": 940 }, { "epoch": 2.992874109263658, "eval_loss": 0.6157404184341431, "eval_runtime": 171.3558, "eval_samples_per_second": 49.639, "eval_steps_per_second": 0.391, "step": 945 }, { "epoch": 2.992874109263658, "step": 945, "total_flos": 1582491437629440.0, "train_loss": 0.5832562999119835, "train_runtime": 28511.6322, "train_samples_per_second": 17.004, "train_steps_per_second": 0.033 } ], "logging_steps": 10, "max_steps": 945, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1582491437629440.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }