{ "best_metric": 0.40765267610549927, "best_model_checkpoint": "m2m100_418M_finetuned_fr_to_sw/checkpoint-32000", "epoch": 5.977956286194657, "eval_steps": 1000, "global_step": 32000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.09, "learning_rate": 1.992527554642257e-05, "loss": 2.3549, "step": 500 }, { "epoch": 0.19, "learning_rate": 1.9850551092845137e-05, "loss": 0.8843, "step": 1000 }, { "epoch": 0.19, "eval_bleu": 10.6171, "eval_gen_len": 60.1469, "eval_loss": 0.76387619972229, "eval_runtime": 3073.3727, "eval_samples_per_second": 3.483, "eval_steps_per_second": 0.436, "step": 1000 }, { "epoch": 0.28, "learning_rate": 1.97758266392677e-05, "loss": 0.7804, "step": 1500 }, { "epoch": 0.37, "learning_rate": 1.9701102185690268e-05, "loss": 0.7269, "step": 2000 }, { "epoch": 0.37, "eval_bleu": 14.3741, "eval_gen_len": 61.2398, "eval_loss": 0.6497731804847717, "eval_runtime": 3071.4883, "eval_samples_per_second": 3.486, "eval_steps_per_second": 0.436, "step": 2000 }, { "epoch": 0.47, "learning_rate": 1.9626377732112836e-05, "loss": 0.6685, "step": 2500 }, { "epoch": 0.56, "learning_rate": 1.9551653278535403e-05, "loss": 0.6504, "step": 3000 }, { "epoch": 0.56, "eval_bleu": 19.1778, "eval_gen_len": 54.184, "eval_loss": 0.5995421409606934, "eval_runtime": 2745.8255, "eval_samples_per_second": 3.899, "eval_steps_per_second": 0.488, "step": 3000 }, { "epoch": 0.65, "learning_rate": 1.9476928824957967e-05, "loss": 0.6243, "step": 3500 }, { "epoch": 0.75, "learning_rate": 1.9402204371380535e-05, "loss": 0.6093, "step": 4000 }, { "epoch": 0.75, "eval_bleu": 20.3586, "eval_gen_len": 56.1132, "eval_loss": 0.5621405243873596, "eval_runtime": 2804.8241, "eval_samples_per_second": 3.817, "eval_steps_per_second": 0.477, "step": 4000 }, { "epoch": 0.84, "learning_rate": 1.9327479917803102e-05, "loss": 0.5779, "step": 4500 }, { "epoch": 0.93, "learning_rate": 1.925275546422567e-05, "loss": 0.58, "step": 5000 }, { "epoch": 0.93, "eval_bleu": 22.497, "eval_gen_len": 53.262, "eval_loss": 0.5317678451538086, "eval_runtime": 2510.0734, "eval_samples_per_second": 4.265, "eval_steps_per_second": 0.533, "step": 5000 }, { "epoch": 1.03, "learning_rate": 1.9178031010648237e-05, "loss": 0.5499, "step": 5500 }, { "epoch": 1.12, "learning_rate": 1.91033065570708e-05, "loss": 0.5067, "step": 6000 }, { "epoch": 1.12, "eval_bleu": 24.1584, "eval_gen_len": 56.1712, "eval_loss": 0.5155890583992004, "eval_runtime": 2540.4899, "eval_samples_per_second": 4.214, "eval_steps_per_second": 0.527, "step": 6000 }, { "epoch": 1.21, "learning_rate": 1.902858210349337e-05, "loss": 0.5104, "step": 6500 }, { "epoch": 1.31, "learning_rate": 1.8953857649915936e-05, "loss": 0.4985, "step": 7000 }, { "epoch": 1.31, "eval_bleu": 24.902, "eval_gen_len": 55.1034, "eval_loss": 0.5012524127960205, "eval_runtime": 2433.6643, "eval_samples_per_second": 4.399, "eval_steps_per_second": 0.55, "step": 7000 }, { "epoch": 1.4, "learning_rate": 1.8879133196338504e-05, "loss": 0.4949, "step": 7500 }, { "epoch": 1.49, "learning_rate": 1.880440874276107e-05, "loss": 0.4861, "step": 8000 }, { "epoch": 1.49, "eval_bleu": 25.8945, "eval_gen_len": 55.7148, "eval_loss": 0.48973962664604187, "eval_runtime": 2476.4919, "eval_samples_per_second": 4.323, "eval_steps_per_second": 0.541, "step": 8000 }, { "epoch": 1.59, "learning_rate": 1.8729684289183636e-05, "loss": 0.4827, "step": 8500 }, { "epoch": 1.68, "learning_rate": 1.8654959835606203e-05, "loss": 0.4789, "step": 9000 }, { "epoch": 1.68, "eval_bleu": 26.2593, "eval_gen_len": 54.9688, "eval_loss": 0.4776358902454376, "eval_runtime": 2500.3532, "eval_samples_per_second": 4.282, "eval_steps_per_second": 0.536, "step": 9000 }, { "epoch": 1.77, "learning_rate": 1.858023538202877e-05, "loss": 0.4757, "step": 9500 }, { "epoch": 1.87, "learning_rate": 1.8505510928451338e-05, "loss": 0.4748, "step": 10000 }, { "epoch": 1.87, "eval_bleu": 26.8308, "eval_gen_len": 53.234, "eval_loss": 0.4675232470035553, "eval_runtime": 2354.2254, "eval_samples_per_second": 4.548, "eval_steps_per_second": 0.569, "step": 10000 }, { "epoch": 1.96, "learning_rate": 1.8430786474873902e-05, "loss": 0.4721, "step": 10500 }, { "epoch": 2.05, "learning_rate": 1.835606202129647e-05, "loss": 0.4365, "step": 11000 }, { "epoch": 2.05, "eval_bleu": 27.8127, "eval_gen_len": 53.7894, "eval_loss": 0.46269142627716064, "eval_runtime": 2320.7276, "eval_samples_per_second": 4.613, "eval_steps_per_second": 0.577, "step": 11000 }, { "epoch": 2.15, "learning_rate": 1.8281337567719037e-05, "loss": 0.4124, "step": 11500 }, { "epoch": 2.24, "learning_rate": 1.8206613114141605e-05, "loss": 0.4065, "step": 12000 }, { "epoch": 2.24, "eval_bleu": 28.1334, "eval_gen_len": 52.4625, "eval_loss": 0.4552680253982544, "eval_runtime": 2260.3502, "eval_samples_per_second": 4.736, "eval_steps_per_second": 0.592, "step": 12000 }, { "epoch": 2.34, "learning_rate": 1.8131888660564172e-05, "loss": 0.4236, "step": 12500 }, { "epoch": 2.43, "learning_rate": 1.8057164206986736e-05, "loss": 0.4159, "step": 13000 }, { "epoch": 2.43, "eval_bleu": 28.5473, "eval_gen_len": 53.1084, "eval_loss": 0.4502773582935333, "eval_runtime": 2305.3584, "eval_samples_per_second": 4.644, "eval_steps_per_second": 0.581, "step": 13000 }, { "epoch": 2.52, "learning_rate": 1.7982439753409304e-05, "loss": 0.4037, "step": 13500 }, { "epoch": 2.62, "learning_rate": 1.790771529983187e-05, "loss": 0.4078, "step": 14000 }, { "epoch": 2.62, "eval_bleu": 28.522, "eval_gen_len": 53.9566, "eval_loss": 0.44360852241516113, "eval_runtime": 2357.175, "eval_samples_per_second": 4.542, "eval_steps_per_second": 0.568, "step": 14000 }, { "epoch": 2.71, "learning_rate": 1.783299084625444e-05, "loss": 0.4016, "step": 14500 }, { "epoch": 2.8, "learning_rate": 1.7758266392677006e-05, "loss": 0.4088, "step": 15000 }, { "epoch": 2.8, "eval_bleu": 29.6642, "eval_gen_len": 54.4689, "eval_loss": 0.439211368560791, "eval_runtime": 2379.609, "eval_samples_per_second": 4.499, "eval_steps_per_second": 0.563, "step": 15000 }, { "epoch": 2.9, "learning_rate": 1.768354193909957e-05, "loss": 0.4046, "step": 15500 }, { "epoch": 2.99, "learning_rate": 1.7608817485522138e-05, "loss": 0.4039, "step": 16000 }, { "epoch": 2.99, "eval_bleu": 29.8929, "eval_gen_len": 55.4612, "eval_loss": 0.4344358444213867, "eval_runtime": 2401.3922, "eval_samples_per_second": 4.458, "eval_steps_per_second": 0.558, "step": 16000 }, { "epoch": 3.08, "learning_rate": 1.7534093031944705e-05, "loss": 0.3635, "step": 16500 }, { "epoch": 3.18, "learning_rate": 1.7459368578367273e-05, "loss": 0.3537, "step": 17000 }, { "epoch": 3.18, "eval_bleu": 30.2302, "eval_gen_len": 54.3727, "eval_loss": 0.43423154950141907, "eval_runtime": 2351.6946, "eval_samples_per_second": 4.552, "eval_steps_per_second": 0.569, "step": 17000 }, { "epoch": 3.27, "learning_rate": 1.7384644124789837e-05, "loss": 0.3575, "step": 17500 }, { "epoch": 3.36, "learning_rate": 1.7309919671212404e-05, "loss": 0.3569, "step": 18000 }, { "epoch": 3.36, "eval_bleu": 30.1139, "eval_gen_len": 54.6381, "eval_loss": 0.4319211542606354, "eval_runtime": 2402.5651, "eval_samples_per_second": 4.456, "eval_steps_per_second": 0.557, "step": 18000 }, { "epoch": 3.46, "learning_rate": 1.7235195217634972e-05, "loss": 0.3564, "step": 18500 }, { "epoch": 3.55, "learning_rate": 1.716047076405754e-05, "loss": 0.3564, "step": 19000 }, { "epoch": 3.55, "eval_bleu": 30.8007, "eval_gen_len": 53.8819, "eval_loss": 0.42764702439308167, "eval_runtime": 2333.0447, "eval_samples_per_second": 4.589, "eval_steps_per_second": 0.574, "step": 19000 }, { "epoch": 3.64, "learning_rate": 1.7085746310480107e-05, "loss": 0.3576, "step": 19500 }, { "epoch": 3.74, "learning_rate": 1.701102185690267e-05, "loss": 0.3637, "step": 20000 }, { "epoch": 3.74, "eval_bleu": 30.8698, "eval_gen_len": 53.7231, "eval_loss": 0.422607421875, "eval_runtime": 2331.5045, "eval_samples_per_second": 4.592, "eval_steps_per_second": 0.574, "step": 20000 }, { "epoch": 3.83, "learning_rate": 1.693629740332524e-05, "loss": 0.3601, "step": 20500 }, { "epoch": 3.92, "learning_rate": 1.6861572949747806e-05, "loss": 0.3571, "step": 21000 }, { "epoch": 3.92, "eval_bleu": 31.1343, "eval_gen_len": 53.5349, "eval_loss": 0.41751930117607117, "eval_runtime": 2304.4971, "eval_samples_per_second": 4.646, "eval_steps_per_second": 0.581, "step": 21000 }, { "epoch": 4.02, "learning_rate": 1.6786848496170374e-05, "loss": 0.3441, "step": 21500 }, { "epoch": 4.11, "learning_rate": 1.671212404259294e-05, "loss": 0.3099, "step": 22000 }, { "epoch": 4.11, "eval_bleu": 31.3026, "eval_gen_len": 53.4483, "eval_loss": 0.421342134475708, "eval_runtime": 2298.8454, "eval_samples_per_second": 4.657, "eval_steps_per_second": 0.582, "step": 22000 }, { "epoch": 4.2, "learning_rate": 1.6637399589015505e-05, "loss": 0.3175, "step": 22500 }, { "epoch": 4.3, "learning_rate": 1.6562675135438073e-05, "loss": 0.3104, "step": 23000 }, { "epoch": 4.3, "eval_bleu": 31.1261, "eval_gen_len": 51.5196, "eval_loss": 0.4227532744407654, "eval_runtime": 2198.9363, "eval_samples_per_second": 4.869, "eval_steps_per_second": 0.609, "step": 23000 }, { "epoch": 4.39, "learning_rate": 1.648795068186064e-05, "loss": 0.3169, "step": 23500 }, { "epoch": 4.48, "learning_rate": 1.6413226228283208e-05, "loss": 0.3162, "step": 24000 }, { "epoch": 4.48, "eval_bleu": 31.9091, "eval_gen_len": 53.0626, "eval_loss": 0.4195193946361542, "eval_runtime": 2270.3312, "eval_samples_per_second": 4.716, "eval_steps_per_second": 0.59, "step": 24000 }, { "epoch": 4.58, "learning_rate": 1.6338501774705772e-05, "loss": 0.3128, "step": 24500 }, { "epoch": 4.67, "learning_rate": 1.626377732112834e-05, "loss": 0.3177, "step": 25000 }, { "epoch": 4.67, "eval_bleu": 31.5561, "eval_gen_len": 52.3463, "eval_loss": 0.4158227741718292, "eval_runtime": 2237.9742, "eval_samples_per_second": 4.784, "eval_steps_per_second": 0.598, "step": 25000 }, { "epoch": 4.76, "learning_rate": 1.6189052867550907e-05, "loss": 0.3216, "step": 25500 }, { "epoch": 4.86, "learning_rate": 1.6114328413973474e-05, "loss": 0.3181, "step": 26000 }, { "epoch": 4.86, "eval_bleu": 32.1029, "eval_gen_len": 53.8831, "eval_loss": 0.4130856692790985, "eval_runtime": 2288.7247, "eval_samples_per_second": 4.678, "eval_steps_per_second": 0.585, "step": 26000 }, { "epoch": 4.95, "learning_rate": 1.6039603960396042e-05, "loss": 0.3176, "step": 26500 }, { "epoch": 5.04, "learning_rate": 1.5964879506818606e-05, "loss": 0.2941, "step": 27000 }, { "epoch": 5.04, "eval_bleu": 32.1061, "eval_gen_len": 52.7448, "eval_loss": 0.4150530993938446, "eval_runtime": 2247.1776, "eval_samples_per_second": 4.764, "eval_steps_per_second": 0.596, "step": 27000 }, { "epoch": 5.14, "learning_rate": 1.5890155053241173e-05, "loss": 0.2752, "step": 27500 }, { "epoch": 5.23, "learning_rate": 1.581543059966374e-05, "loss": 0.274, "step": 28000 }, { "epoch": 5.23, "eval_bleu": 31.9128, "eval_gen_len": 52.9394, "eval_loss": 0.4146653711795807, "eval_runtime": 2267.9887, "eval_samples_per_second": 4.72, "eval_steps_per_second": 0.59, "step": 28000 }, { "epoch": 5.32, "learning_rate": 1.574070614608631e-05, "loss": 0.2852, "step": 28500 }, { "epoch": 5.42, "learning_rate": 1.5665981692508876e-05, "loss": 0.2713, "step": 29000 }, { "epoch": 5.42, "eval_bleu": 32.452, "eval_gen_len": 52.881, "eval_loss": 0.41379648447036743, "eval_runtime": 2262.9812, "eval_samples_per_second": 4.731, "eval_steps_per_second": 0.592, "step": 29000 }, { "epoch": 5.51, "learning_rate": 1.559125723893144e-05, "loss": 0.2791, "step": 29500 }, { "epoch": 5.6, "learning_rate": 1.5516532785354008e-05, "loss": 0.283, "step": 30000 }, { "epoch": 5.6, "eval_bleu": 32.6103, "eval_gen_len": 53.2173, "eval_loss": 0.4103504717350006, "eval_runtime": 2279.2685, "eval_samples_per_second": 4.697, "eval_steps_per_second": 0.587, "step": 30000 }, { "epoch": 5.7, "learning_rate": 1.5441808331776575e-05, "loss": 0.2835, "step": 30500 }, { "epoch": 5.79, "learning_rate": 1.5367083878199142e-05, "loss": 0.2866, "step": 31000 }, { "epoch": 5.79, "eval_bleu": 32.5888, "eval_gen_len": 52.9638, "eval_loss": 0.41094180941581726, "eval_runtime": 2257.2287, "eval_samples_per_second": 4.743, "eval_steps_per_second": 0.593, "step": 31000 }, { "epoch": 5.88, "learning_rate": 1.5292359424621707e-05, "loss": 0.2851, "step": 31500 }, { "epoch": 5.98, "learning_rate": 1.5217634971044276e-05, "loss": 0.2865, "step": 32000 }, { "epoch": 5.98, "eval_bleu": 32.6545, "eval_gen_len": 52.4693, "eval_loss": 0.40765267610549927, "eval_runtime": 2233.0208, "eval_samples_per_second": 4.794, "eval_steps_per_second": 0.6, "step": 32000 } ], "logging_steps": 500, "max_steps": 133825, "num_input_tokens_seen": 0, "num_train_epochs": 25, "save_steps": 1000, "total_flos": 1.0234506177547469e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }