{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.150456695724614, "global_step": 77336, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "learning_rate": 1.9950413059216727e-05, "loss": 14.0627, "step": 500 }, { "epoch": 0.01, "learning_rate": 1.9900826118433453e-05, "loss": 6.3799, "step": 1000 }, { "epoch": 0.02, "learning_rate": 1.9851239177650176e-05, "loss": 5.31, "step": 1500 }, { "epoch": 0.03, "learning_rate": 1.9801652236866898e-05, "loss": 4.9781, "step": 2000 }, { "epoch": 0.04, "learning_rate": 1.9752065296083624e-05, "loss": 4.7392, "step": 2500 }, { "epoch": 0.04, "learning_rate": 1.970247835530035e-05, "loss": 4.5779, "step": 3000 }, { "epoch": 0.05, "learning_rate": 1.9652891414517075e-05, "loss": 4.4691, "step": 3500 }, { "epoch": 0.06, "learning_rate": 1.96033044737338e-05, "loss": 4.3745, "step": 4000 }, { "epoch": 0.07, "learning_rate": 1.9553717532950524e-05, "loss": 4.2883, "step": 4500 }, { "epoch": 0.07, "learning_rate": 1.9504130592167246e-05, "loss": 4.2342, "step": 5000 }, { "epoch": 0.08, "learning_rate": 1.9454543651383972e-05, "loss": 4.1614, "step": 5500 }, { "epoch": 0.09, "learning_rate": 1.9404956710600698e-05, "loss": 4.1279, "step": 6000 }, { "epoch": 0.1, "learning_rate": 1.9355369769817423e-05, "loss": 4.0802, "step": 6500 }, { "epoch": 0.1, "learning_rate": 1.930578282903415e-05, "loss": 4.0298, "step": 7000 }, { "epoch": 0.11, "learning_rate": 1.925619588825087e-05, "loss": 3.9697, "step": 7500 }, { "epoch": 0.12, "learning_rate": 1.9206608947467594e-05, "loss": 3.9584, "step": 8000 }, { "epoch": 0.13, "learning_rate": 1.915702200668432e-05, "loss": 3.9196, "step": 8500 }, { "epoch": 0.13, "learning_rate": 1.9107435065901046e-05, "loss": 3.9081, "step": 9000 }, { "epoch": 0.14, "learning_rate": 1.905784812511777e-05, "loss": 3.8419, "step": 9500 }, { "epoch": 0.15, "learning_rate": 1.9008261184334497e-05, "loss": 3.8363, "step": 10000 }, { "epoch": 0.16, "learning_rate": 1.895867424355122e-05, "loss": 3.8047, "step": 10500 }, { "epoch": 0.16, "learning_rate": 1.8909087302767945e-05, "loss": 3.7728, "step": 11000 }, { "epoch": 0.17, "learning_rate": 1.8859500361984668e-05, "loss": 3.7731, "step": 11500 }, { "epoch": 0.18, "learning_rate": 1.8809913421201393e-05, "loss": 3.7408, "step": 12000 }, { "epoch": 0.19, "learning_rate": 1.876032648041812e-05, "loss": 3.7027, "step": 12500 }, { "epoch": 0.19, "learning_rate": 1.8710739539634845e-05, "loss": 3.6865, "step": 13000 }, { "epoch": 0.2, "learning_rate": 1.8661152598851567e-05, "loss": 3.6456, "step": 13500 }, { "epoch": 0.21, "learning_rate": 1.8611565658068293e-05, "loss": 3.6539, "step": 14000 }, { "epoch": 0.22, "learning_rate": 1.8561978717285016e-05, "loss": 3.6222, "step": 14500 }, { "epoch": 0.22, "learning_rate": 1.851239177650174e-05, "loss": 3.6127, "step": 15000 }, { "epoch": 0.23, "learning_rate": 1.8462804835718467e-05, "loss": 3.6133, "step": 15500 }, { "epoch": 0.24, "learning_rate": 1.8413217894935193e-05, "loss": 3.5863, "step": 16000 }, { "epoch": 0.25, "learning_rate": 1.8363630954151915e-05, "loss": 3.5669, "step": 16500 }, { "epoch": 0.25, "learning_rate": 1.831404401336864e-05, "loss": 3.5518, "step": 17000 }, { "epoch": 0.26, "learning_rate": 1.8264457072585367e-05, "loss": 3.5368, "step": 17500 }, { "epoch": 0.27, "learning_rate": 1.821487013180209e-05, "loss": 3.5294, "step": 18000 }, { "epoch": 0.28, "learning_rate": 1.8165283191018815e-05, "loss": 3.5097, "step": 18500 }, { "epoch": 0.28, "learning_rate": 1.811569625023554e-05, "loss": 3.5198, "step": 19000 }, { "epoch": 0.29, "learning_rate": 1.8066109309452263e-05, "loss": 3.4702, "step": 19500 }, { "epoch": 0.3, "learning_rate": 1.801652236866899e-05, "loss": 3.485, "step": 20000 }, { "epoch": 0.3, "learning_rate": 1.7966935427885715e-05, "loss": 3.4853, "step": 20500 }, { "epoch": 0.31, "learning_rate": 1.7917348487102437e-05, "loss": 3.4395, "step": 21000 }, { "epoch": 0.32, "learning_rate": 1.7867761546319163e-05, "loss": 3.4515, "step": 21500 }, { "epoch": 0.33, "learning_rate": 1.781817460553589e-05, "loss": 3.4307, "step": 22000 }, { "epoch": 0.33, "learning_rate": 1.776858766475261e-05, "loss": 3.4343, "step": 22500 }, { "epoch": 0.34, "learning_rate": 1.7719000723969337e-05, "loss": 3.4053, "step": 23000 }, { "epoch": 0.35, "learning_rate": 1.7669413783186063e-05, "loss": 3.4008, "step": 23500 }, { "epoch": 0.36, "learning_rate": 1.7619826842402785e-05, "loss": 3.3951, "step": 24000 }, { "epoch": 0.36, "learning_rate": 1.757023990161951e-05, "loss": 3.3871, "step": 24500 }, { "epoch": 0.37, "learning_rate": 1.7520652960836234e-05, "loss": 3.3822, "step": 25000 }, { "epoch": 0.38, "learning_rate": 1.747106602005296e-05, "loss": 3.3816, "step": 25500 }, { "epoch": 0.39, "learning_rate": 1.7421479079269685e-05, "loss": 3.3759, "step": 26000 }, { "epoch": 0.39, "learning_rate": 1.737189213848641e-05, "loss": 3.3624, "step": 26500 }, { "epoch": 0.4, "learning_rate": 1.7322305197703137e-05, "loss": 3.3535, "step": 27000 }, { "epoch": 0.41, "learning_rate": 1.727271825691986e-05, "loss": 3.3366, "step": 27500 }, { "epoch": 0.42, "learning_rate": 1.722313131613658e-05, "loss": 3.3245, "step": 28000 }, { "epoch": 0.42, "learning_rate": 1.7173544375353307e-05, "loss": 3.3575, "step": 28500 }, { "epoch": 0.43, "learning_rate": 1.7123957434570033e-05, "loss": 3.3133, "step": 29000 }, { "epoch": 0.44, "learning_rate": 1.707437049378676e-05, "loss": 3.3124, "step": 29500 }, { "epoch": 0.45, "learning_rate": 1.7024783553003485e-05, "loss": 3.3295, "step": 30000 }, { "epoch": 0.45, "learning_rate": 1.6975196612220207e-05, "loss": 3.3192, "step": 30500 }, { "epoch": 0.46, "learning_rate": 1.692560967143693e-05, "loss": 3.3241, "step": 31000 }, { "epoch": 0.47, "learning_rate": 1.6876022730653655e-05, "loss": 3.2989, "step": 31500 }, { "epoch": 0.48, "learning_rate": 1.682643578987038e-05, "loss": 3.2956, "step": 32000 }, { "epoch": 0.48, "learning_rate": 1.6776848849087107e-05, "loss": 3.2889, "step": 32500 }, { "epoch": 0.49, "learning_rate": 1.6727261908303833e-05, "loss": 3.2934, "step": 33000 }, { "epoch": 0.5, "learning_rate": 1.6677674967520555e-05, "loss": 3.2642, "step": 33500 }, { "epoch": 0.51, "learning_rate": 1.6628088026737277e-05, "loss": 3.2513, "step": 34000 }, { "epoch": 0.51, "learning_rate": 1.6578501085954003e-05, "loss": 3.2584, "step": 34500 }, { "epoch": 0.52, "learning_rate": 1.652891414517073e-05, "loss": 3.2576, "step": 35000 }, { "epoch": 0.53, "learning_rate": 1.6479327204387455e-05, "loss": 3.2532, "step": 35500 }, { "epoch": 0.54, "learning_rate": 1.642974026360418e-05, "loss": 3.2349, "step": 36000 }, { "epoch": 0.54, "learning_rate": 1.6380153322820903e-05, "loss": 3.2349, "step": 36500 }, { "epoch": 0.55, "learning_rate": 1.6330566382037625e-05, "loss": 3.2158, "step": 37000 }, { "epoch": 0.56, "learning_rate": 1.628097944125435e-05, "loss": 3.2309, "step": 37500 }, { "epoch": 0.57, "learning_rate": 1.6231392500471077e-05, "loss": 3.2227, "step": 38000 }, { "epoch": 0.57, "learning_rate": 1.6181805559687803e-05, "loss": 3.2134, "step": 38500 }, { "epoch": 0.58, "learning_rate": 1.613221861890453e-05, "loss": 3.2206, "step": 39000 }, { "epoch": 0.59, "learning_rate": 1.608263167812125e-05, "loss": 3.2002, "step": 39500 }, { "epoch": 0.6, "learning_rate": 1.6033044737337973e-05, "loss": 3.1988, "step": 40000 }, { "epoch": 0.6, "learning_rate": 1.59834577965547e-05, "loss": 3.2081, "step": 40500 }, { "epoch": 0.61, "learning_rate": 1.5933870855771425e-05, "loss": 3.1891, "step": 41000 }, { "epoch": 0.62, "learning_rate": 1.588428391498815e-05, "loss": 3.2007, "step": 41500 }, { "epoch": 0.62, "learning_rate": 1.5834696974204877e-05, "loss": 3.1948, "step": 42000 }, { "epoch": 0.63, "learning_rate": 1.57851100334216e-05, "loss": 3.1673, "step": 42500 }, { "epoch": 0.64, "learning_rate": 1.5735523092638325e-05, "loss": 3.158, "step": 43000 }, { "epoch": 0.65, "learning_rate": 1.5685936151855047e-05, "loss": 3.1561, "step": 43500 }, { "epoch": 0.65, "learning_rate": 1.5636349211071773e-05, "loss": 3.1734, "step": 44000 }, { "epoch": 0.66, "learning_rate": 1.55867622702885e-05, "loss": 3.1401, "step": 44500 }, { "epoch": 0.67, "learning_rate": 1.5537175329505225e-05, "loss": 3.1463, "step": 45000 }, { "epoch": 0.68, "learning_rate": 1.5487588388721947e-05, "loss": 3.1431, "step": 45500 }, { "epoch": 0.68, "learning_rate": 1.5438001447938673e-05, "loss": 3.1316, "step": 46000 }, { "epoch": 0.69, "learning_rate": 1.5388414507155395e-05, "loss": 3.1606, "step": 46500 }, { "epoch": 0.7, "learning_rate": 1.533882756637212e-05, "loss": 3.1362, "step": 47000 }, { "epoch": 0.71, "learning_rate": 1.5289240625588847e-05, "loss": 3.1335, "step": 47500 }, { "epoch": 0.71, "learning_rate": 1.523965368480557e-05, "loss": 3.149, "step": 48000 }, { "epoch": 0.72, "learning_rate": 1.5190066744022297e-05, "loss": 3.1293, "step": 48500 }, { "epoch": 0.73, "learning_rate": 1.514047980323902e-05, "loss": 3.1286, "step": 49000 }, { "epoch": 0.74, "learning_rate": 1.5090892862455743e-05, "loss": 3.1196, "step": 49500 }, { "epoch": 0.74, "learning_rate": 1.5041305921672469e-05, "loss": 3.1238, "step": 50000 }, { "epoch": 0.75, "learning_rate": 1.4991718980889195e-05, "loss": 3.1033, "step": 50500 }, { "epoch": 0.76, "learning_rate": 1.4942132040105919e-05, "loss": 3.1112, "step": 51000 }, { "epoch": 0.77, "learning_rate": 1.4892545099322645e-05, "loss": 3.0936, "step": 51500 }, { "epoch": 0.77, "learning_rate": 1.4842958158539369e-05, "loss": 3.107, "step": 52000 }, { "epoch": 0.78, "learning_rate": 1.4793371217756094e-05, "loss": 3.1063, "step": 52500 }, { "epoch": 0.79, "learning_rate": 1.4743784276972817e-05, "loss": 3.0639, "step": 53000 }, { "epoch": 0.8, "learning_rate": 1.4694197336189543e-05, "loss": 3.1028, "step": 53500 }, { "epoch": 0.8, "learning_rate": 1.4644610395406267e-05, "loss": 3.0821, "step": 54000 }, { "epoch": 0.81, "learning_rate": 1.4595023454622992e-05, "loss": 3.0596, "step": 54500 }, { "epoch": 0.82, "learning_rate": 1.4545436513839717e-05, "loss": 3.0787, "step": 55000 }, { "epoch": 0.83, "learning_rate": 1.4495849573056442e-05, "loss": 3.0755, "step": 55500 }, { "epoch": 0.83, "learning_rate": 1.4446262632273165e-05, "loss": 3.066, "step": 56000 }, { "epoch": 0.84, "learning_rate": 1.439667569148989e-05, "loss": 3.0695, "step": 56500 }, { "epoch": 0.85, "learning_rate": 1.4347088750706615e-05, "loss": 3.059, "step": 57000 }, { "epoch": 0.86, "learning_rate": 1.429750180992334e-05, "loss": 3.0628, "step": 57500 }, { "epoch": 0.86, "learning_rate": 1.4247914869140065e-05, "loss": 3.0733, "step": 58000 }, { "epoch": 0.87, "learning_rate": 1.419832792835679e-05, "loss": 3.0591, "step": 58500 }, { "epoch": 0.88, "learning_rate": 1.4148740987573514e-05, "loss": 3.0468, "step": 59000 }, { "epoch": 0.89, "learning_rate": 1.4099154046790237e-05, "loss": 3.0265, "step": 59500 }, { "epoch": 0.89, "learning_rate": 1.4049567106006963e-05, "loss": 3.0282, "step": 60000 }, { "epoch": 0.9, "learning_rate": 1.3999980165223688e-05, "loss": 3.0222, "step": 60500 }, { "epoch": 0.91, "learning_rate": 1.3950393224440413e-05, "loss": 3.0275, "step": 61000 }, { "epoch": 0.91, "learning_rate": 1.3900806283657138e-05, "loss": 3.0277, "step": 61500 }, { "epoch": 0.92, "learning_rate": 1.3851219342873862e-05, "loss": 3.0551, "step": 62000 }, { "epoch": 0.93, "learning_rate": 1.3801632402090585e-05, "loss": 3.0205, "step": 62500 }, { "epoch": 0.94, "learning_rate": 1.375204546130731e-05, "loss": 3.023, "step": 63000 }, { "epoch": 0.94, "learning_rate": 1.3702458520524036e-05, "loss": 3.0244, "step": 63500 }, { "epoch": 0.95, "learning_rate": 1.365287157974076e-05, "loss": 3.0116, "step": 64000 }, { "epoch": 0.96, "learning_rate": 1.3603284638957486e-05, "loss": 3.0141, "step": 64500 }, { "epoch": 0.97, "learning_rate": 1.355369769817421e-05, "loss": 3.0284, "step": 65000 }, { "epoch": 0.97, "learning_rate": 1.3504110757390933e-05, "loss": 3.0236, "step": 65500 }, { "epoch": 0.98, "learning_rate": 1.3454523816607659e-05, "loss": 3.013, "step": 66000 }, { "epoch": 0.99, "learning_rate": 1.3404936875824384e-05, "loss": 3.0027, "step": 66500 }, { "epoch": 1.0, "learning_rate": 1.3355349935041108e-05, "loss": 3.0155, "step": 67000 }, { "epoch": 1.0, "eval_bleu": 11.298551127218651, "eval_loss": 2.3749005794525146, "eval_runtime": 4929.9601, "eval_samples_per_second": 8.201, "eval_steps_per_second": 0.513, "step": 67222 }, { "epoch": 1.0, "learning_rate": 1.3305762994257834e-05, "loss": 3.0195, "step": 67500 }, { "epoch": 1.01, "learning_rate": 1.3256176053474558e-05, "loss": 2.9924, "step": 68000 }, { "epoch": 1.02, "learning_rate": 1.3206589112691284e-05, "loss": 2.997, "step": 68500 }, { "epoch": 1.03, "learning_rate": 1.3157002171908007e-05, "loss": 2.9694, "step": 69000 }, { "epoch": 1.03, "learning_rate": 1.3107415231124732e-05, "loss": 2.9804, "step": 69500 }, { "epoch": 1.04, "learning_rate": 1.3057828290341456e-05, "loss": 2.9879, "step": 70000 }, { "epoch": 1.05, "learning_rate": 1.3008241349558182e-05, "loss": 2.9919, "step": 70500 }, { "epoch": 1.06, "learning_rate": 1.2958654408774906e-05, "loss": 2.9875, "step": 71000 }, { "epoch": 1.06, "learning_rate": 1.2909067467991632e-05, "loss": 2.9912, "step": 71500 }, { "epoch": 1.07, "learning_rate": 1.2859480527208354e-05, "loss": 2.974, "step": 72000 }, { "epoch": 1.08, "learning_rate": 1.280989358642508e-05, "loss": 2.9581, "step": 72500 }, { "epoch": 1.09, "learning_rate": 1.2760306645641804e-05, "loss": 2.975, "step": 73000 }, { "epoch": 1.09, "learning_rate": 1.271071970485853e-05, "loss": 2.9737, "step": 73500 }, { "epoch": 1.1, "learning_rate": 1.2661132764075254e-05, "loss": 2.9722, "step": 74000 }, { "epoch": 1.11, "learning_rate": 1.261154582329198e-05, "loss": 2.9727, "step": 74500 }, { "epoch": 1.12, "learning_rate": 1.2561958882508702e-05, "loss": 2.9618, "step": 75000 }, { "epoch": 1.12, "learning_rate": 1.2512371941725428e-05, "loss": 2.9554, "step": 75500 }, { "epoch": 1.13, "learning_rate": 1.2462785000942152e-05, "loss": 2.961, "step": 76000 }, { "epoch": 1.14, "learning_rate": 1.2413198060158878e-05, "loss": 2.9627, "step": 76500 }, { "epoch": 1.15, "learning_rate": 1.2363611119375602e-05, "loss": 2.9896, "step": 77000 } ], "max_steps": 201666, "num_train_epochs": 3, "total_flos": 9.093214173619814e+16, "trial_name": null, "trial_params": null }