{ "best_metric": 20.7584, "best_model_checkpoint": "/local1/hfs/gs_stuff/ft-wmt14-5/checkpoint-90000", "epoch": 2.7777777777777777, "eval_steps": 10000, "global_step": 100000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.027777777777777776, "grad_norm": 1.9314790964126587, "learning_rate": 0.0005, "loss": 3.3589, "step": 1000 }, { "epoch": 0.05555555555555555, "grad_norm": 1.7348469495773315, "learning_rate": 0.0005, "loss": 2.5263, "step": 2000 }, { "epoch": 0.08333333333333333, "grad_norm": 1.9181748628616333, "learning_rate": 0.0005, "loss": 2.3365, "step": 3000 }, { "epoch": 0.1111111111111111, "grad_norm": 1.6642646789550781, "learning_rate": 0.0005, "loss": 2.2207, "step": 4000 }, { "epoch": 0.1388888888888889, "grad_norm": 1.1876742839813232, "learning_rate": 0.0005, "loss": 2.1363, "step": 5000 }, { "epoch": 0.16666666666666666, "grad_norm": 1.567658543586731, "learning_rate": 0.0005, "loss": 2.0733, "step": 6000 }, { "epoch": 0.19444444444444445, "grad_norm": 1.2552471160888672, "learning_rate": 0.0005, "loss": 2.0262, "step": 7000 }, { "epoch": 0.2222222222222222, "grad_norm": 1.049357533454895, "learning_rate": 0.0005, "loss": 1.9775, "step": 8000 }, { "epoch": 0.25, "grad_norm": 1.303145170211792, "learning_rate": 0.0005, "loss": 1.9412, "step": 9000 }, { "epoch": 0.2777777777777778, "grad_norm": 1.0213723182678223, "learning_rate": 0.0005, "loss": 1.9166, "step": 10000 }, { "epoch": 0.2777777777777778, "eval_bleu": 15.8119, "eval_gen_len": 32.097, "eval_loss": 2.31050968170166, "eval_runtime": 410.6001, "eval_samples_per_second": 7.306, "eval_steps_per_second": 0.913, "step": 10000 }, { "epoch": 0.3055555555555556, "grad_norm": 1.2851905822753906, "learning_rate": 0.0005, "loss": 1.8878, "step": 11000 }, { "epoch": 0.3333333333333333, "grad_norm": 0.8447160720825195, "learning_rate": 0.0005, "loss": 1.8492, "step": 12000 }, { "epoch": 0.3611111111111111, "grad_norm": 1.1516064405441284, "learning_rate": 0.0005, "loss": 1.8309, "step": 13000 }, { "epoch": 0.3888888888888889, "grad_norm": 1.0370670557022095, "learning_rate": 0.0005, "loss": 1.8057, "step": 14000 }, { "epoch": 0.4166666666666667, "grad_norm": 1.1649495363235474, "learning_rate": 0.0005, "loss": 1.7867, "step": 15000 }, { "epoch": 0.4444444444444444, "grad_norm": 1.2666045427322388, "learning_rate": 0.0005, "loss": 1.7679, "step": 16000 }, { "epoch": 0.4722222222222222, "grad_norm": 1.0923264026641846, "learning_rate": 0.0005, "loss": 1.7563, "step": 17000 }, { "epoch": 0.5, "grad_norm": 1.560994029045105, "learning_rate": 0.0005, "loss": 1.7342, "step": 18000 }, { "epoch": 0.5277777777777778, "grad_norm": 0.9684827327728271, "learning_rate": 0.0005, "loss": 1.7228, "step": 19000 }, { "epoch": 0.5555555555555556, "grad_norm": 0.9182453751564026, "learning_rate": 0.0005, "loss": 1.7184, "step": 20000 }, { "epoch": 0.5555555555555556, "eval_bleu": 17.5903, "eval_gen_len": 31.1153, "eval_loss": 2.19934344291687, "eval_runtime": 393.3017, "eval_samples_per_second": 7.628, "eval_steps_per_second": 0.953, "step": 20000 }, { "epoch": 0.5833333333333334, "grad_norm": 0.8953577280044556, "learning_rate": 0.0005, "loss": 1.7042, "step": 21000 }, { "epoch": 0.6111111111111112, "grad_norm": 0.9418250918388367, "learning_rate": 0.0005, "loss": 1.683, "step": 22000 }, { "epoch": 0.6388888888888888, "grad_norm": 0.8577601909637451, "learning_rate": 0.0005, "loss": 1.6799, "step": 23000 }, { "epoch": 0.6666666666666666, "grad_norm": 0.9786076545715332, "learning_rate": 0.0005, "loss": 1.6675, "step": 24000 }, { "epoch": 0.6944444444444444, "grad_norm": 0.9262654781341553, "learning_rate": 0.0005, "loss": 1.6499, "step": 25000 }, { "epoch": 0.7222222222222222, "grad_norm": 0.8759564757347107, "learning_rate": 0.0005, "loss": 1.6468, "step": 26000 }, { "epoch": 0.75, "grad_norm": 1.0495752096176147, "learning_rate": 0.0005, "loss": 1.6285, "step": 27000 }, { "epoch": 0.7777777777777778, "grad_norm": 1.092642068862915, "learning_rate": 0.0005, "loss": 1.6276, "step": 28000 }, { "epoch": 0.8055555555555556, "grad_norm": 0.8775661587715149, "learning_rate": 0.0005, "loss": 1.6172, "step": 29000 }, { "epoch": 0.8333333333333334, "grad_norm": 0.8970679044723511, "learning_rate": 0.0005, "loss": 1.6061, "step": 30000 }, { "epoch": 0.8333333333333334, "eval_bleu": 18.9604, "eval_gen_len": 30.327, "eval_loss": 2.1379551887512207, "eval_runtime": 380.095, "eval_samples_per_second": 7.893, "eval_steps_per_second": 0.987, "step": 30000 }, { "epoch": 0.8611111111111112, "grad_norm": 0.9657310247421265, "learning_rate": 0.0005, "loss": 1.5959, "step": 31000 }, { "epoch": 0.8888888888888888, "grad_norm": 0.8748376369476318, "learning_rate": 0.0005, "loss": 1.5908, "step": 32000 }, { "epoch": 0.9166666666666666, "grad_norm": 0.8462302088737488, "learning_rate": 0.0005, "loss": 1.5845, "step": 33000 }, { "epoch": 0.9444444444444444, "grad_norm": 0.9005241394042969, "learning_rate": 0.0005, "loss": 1.5699, "step": 34000 }, { "epoch": 0.9722222222222222, "grad_norm": 0.9596630930900574, "learning_rate": 0.0005, "loss": 1.5752, "step": 35000 }, { "epoch": 1.0, "grad_norm": 0.8307533860206604, "learning_rate": 0.0005, "loss": 1.5634, "step": 36000 }, { "epoch": 1.0277777777777777, "grad_norm": 0.9918788075447083, "learning_rate": 0.0005, "loss": 1.5117, "step": 37000 }, { "epoch": 1.0555555555555556, "grad_norm": 0.9118058085441589, "learning_rate": 0.0005, "loss": 1.5023, "step": 38000 }, { "epoch": 1.0833333333333333, "grad_norm": 0.7213552594184875, "learning_rate": 0.0005, "loss": 1.5087, "step": 39000 }, { "epoch": 1.1111111111111112, "grad_norm": 1.0255305767059326, "learning_rate": 0.0005, "loss": 1.516, "step": 40000 }, { "epoch": 1.1111111111111112, "eval_bleu": 19.1444, "eval_gen_len": 30.2727, "eval_loss": 2.1365692615509033, "eval_runtime": 377.1737, "eval_samples_per_second": 7.954, "eval_steps_per_second": 0.994, "step": 40000 }, { "epoch": 1.1388888888888888, "grad_norm": 0.8766499161720276, "learning_rate": 0.0005, "loss": 1.5096, "step": 41000 }, { "epoch": 1.1666666666666667, "grad_norm": 1.1786612272262573, "learning_rate": 0.0005, "loss": 1.4982, "step": 42000 }, { "epoch": 1.1944444444444444, "grad_norm": 1.011268973350525, "learning_rate": 0.0005, "loss": 1.5013, "step": 43000 }, { "epoch": 1.2222222222222223, "grad_norm": 1.0863969326019287, "learning_rate": 0.0005, "loss": 1.4878, "step": 44000 }, { "epoch": 1.25, "grad_norm": 0.9729832410812378, "learning_rate": 0.0005, "loss": 1.4922, "step": 45000 }, { "epoch": 1.2777777777777777, "grad_norm": 1.3476896286010742, "learning_rate": 0.0005, "loss": 1.4876, "step": 46000 }, { "epoch": 1.3055555555555556, "grad_norm": 0.8493963479995728, "learning_rate": 0.0005, "loss": 1.4823, "step": 47000 }, { "epoch": 1.3333333333333333, "grad_norm": 1.0311123132705688, "learning_rate": 0.0005, "loss": 1.4739, "step": 48000 }, { "epoch": 1.3611111111111112, "grad_norm": 1.259581446647644, "learning_rate": 0.0005, "loss": 1.4747, "step": 49000 }, { "epoch": 1.3888888888888888, "grad_norm": 1.1934195756912231, "learning_rate": 0.0005, "loss": 1.4675, "step": 50000 }, { "epoch": 1.3888888888888888, "eval_bleu": 19.7588, "eval_gen_len": 30.1127, "eval_loss": 2.120835781097412, "eval_runtime": 372.4281, "eval_samples_per_second": 8.055, "eval_steps_per_second": 1.007, "step": 50000 }, { "epoch": 1.4166666666666667, "grad_norm": 1.1824595928192139, "learning_rate": 0.0005, "loss": 1.4659, "step": 51000 }, { "epoch": 1.4444444444444444, "grad_norm": 1.1661032438278198, "learning_rate": 0.0005, "loss": 1.4737, "step": 52000 }, { "epoch": 1.4722222222222223, "grad_norm": 0.7856634259223938, "learning_rate": 0.0005, "loss": 1.4595, "step": 53000 }, { "epoch": 1.5, "grad_norm": 0.9908609986305237, "learning_rate": 0.0005, "loss": 1.4656, "step": 54000 }, { "epoch": 1.5277777777777777, "grad_norm": 0.9270644187927246, "learning_rate": 0.0005, "loss": 1.4524, "step": 55000 }, { "epoch": 1.5555555555555556, "grad_norm": 0.9910904169082642, "learning_rate": 0.0005, "loss": 1.4453, "step": 56000 }, { "epoch": 1.5833333333333335, "grad_norm": 1.0300639867782593, "learning_rate": 0.0005, "loss": 1.451, "step": 57000 }, { "epoch": 1.6111111111111112, "grad_norm": 0.809105396270752, "learning_rate": 0.0005, "loss": 1.444, "step": 58000 }, { "epoch": 1.6388888888888888, "grad_norm": 0.7915866374969482, "learning_rate": 0.0005, "loss": 1.4421, "step": 59000 }, { "epoch": 1.6666666666666665, "grad_norm": 0.9778928756713867, "learning_rate": 0.0005, "loss": 1.4416, "step": 60000 }, { "epoch": 1.6666666666666665, "eval_bleu": 19.9263, "eval_gen_len": 30.4463, "eval_loss": 2.088862657546997, "eval_runtime": 383.2772, "eval_samples_per_second": 7.827, "eval_steps_per_second": 0.978, "step": 60000 }, { "epoch": 1.6944444444444444, "grad_norm": 0.8484209775924683, "learning_rate": 0.0005, "loss": 1.4313, "step": 61000 }, { "epoch": 1.7222222222222223, "grad_norm": 0.8703031539916992, "learning_rate": 0.0005, "loss": 1.4405, "step": 62000 }, { "epoch": 1.75, "grad_norm": 1.4096006155014038, "learning_rate": 0.0005, "loss": 1.4375, "step": 63000 }, { "epoch": 1.7777777777777777, "grad_norm": 0.9177774786949158, "learning_rate": 0.0005, "loss": 1.4262, "step": 64000 }, { "epoch": 1.8055555555555556, "grad_norm": 1.2332441806793213, "learning_rate": 0.0005, "loss": 1.4233, "step": 65000 }, { "epoch": 1.8333333333333335, "grad_norm": 0.8750177621841431, "learning_rate": 0.0005, "loss": 1.4287, "step": 66000 }, { "epoch": 1.8611111111111112, "grad_norm": 0.6736052632331848, "learning_rate": 0.0005, "loss": 1.4231, "step": 67000 }, { "epoch": 1.8888888888888888, "grad_norm": 0.7802408933639526, "learning_rate": 0.0005, "loss": 1.4106, "step": 68000 }, { "epoch": 1.9166666666666665, "grad_norm": 1.1860034465789795, "learning_rate": 0.0005, "loss": 1.4121, "step": 69000 }, { "epoch": 1.9444444444444444, "grad_norm": 0.926054835319519, "learning_rate": 0.0005, "loss": 1.4111, "step": 70000 }, { "epoch": 1.9444444444444444, "eval_bleu": 20.3323, "eval_gen_len": 30.1207, "eval_loss": 2.079472541809082, "eval_runtime": 371.9755, "eval_samples_per_second": 8.065, "eval_steps_per_second": 1.008, "step": 70000 }, { "epoch": 1.9722222222222223, "grad_norm": 1.1691533327102661, "learning_rate": 0.0005, "loss": 1.407, "step": 71000 }, { "epoch": 2.0, "grad_norm": 0.9077666997909546, "learning_rate": 0.0005, "loss": 1.4051, "step": 72000 }, { "epoch": 2.0277777777777777, "grad_norm": 0.9149623513221741, "learning_rate": 0.0005, "loss": 1.3517, "step": 73000 }, { "epoch": 2.0555555555555554, "grad_norm": 1.0772947072982788, "learning_rate": 0.0005, "loss": 1.3624, "step": 74000 }, { "epoch": 2.0833333333333335, "grad_norm": 0.7283540964126587, "learning_rate": 0.0005, "loss": 1.355, "step": 75000 }, { "epoch": 2.111111111111111, "grad_norm": 0.7279065847396851, "learning_rate": 0.0005, "loss": 1.3526, "step": 76000 }, { "epoch": 2.138888888888889, "grad_norm": 1.2707905769348145, "learning_rate": 0.0005, "loss": 1.3535, "step": 77000 }, { "epoch": 2.1666666666666665, "grad_norm": 0.9000493288040161, "learning_rate": 0.0005, "loss": 1.3519, "step": 78000 }, { "epoch": 2.1944444444444446, "grad_norm": 1.043967843055725, "learning_rate": 0.0005, "loss": 1.3567, "step": 79000 }, { "epoch": 2.2222222222222223, "grad_norm": 1.1248853206634521, "learning_rate": 0.0005, "loss": 1.3603, "step": 80000 }, { "epoch": 2.2222222222222223, "eval_bleu": 20.5373, "eval_gen_len": 30.5943, "eval_loss": 2.085047960281372, "eval_runtime": 373.0705, "eval_samples_per_second": 8.041, "eval_steps_per_second": 1.005, "step": 80000 }, { "epoch": 2.25, "grad_norm": 1.056221842765808, "learning_rate": 0.0005, "loss": 1.3657, "step": 81000 }, { "epoch": 2.2777777777777777, "grad_norm": 0.9176587462425232, "learning_rate": 0.0005, "loss": 1.3572, "step": 82000 }, { "epoch": 2.3055555555555554, "grad_norm": 1.0105085372924805, "learning_rate": 0.0005, "loss": 1.3498, "step": 83000 }, { "epoch": 2.3333333333333335, "grad_norm": 1.1589380502700806, "learning_rate": 0.0005, "loss": 1.3567, "step": 84000 }, { "epoch": 2.361111111111111, "grad_norm": 0.7733587622642517, "learning_rate": 0.0005, "loss": 1.3533, "step": 85000 }, { "epoch": 2.388888888888889, "grad_norm": 1.036777138710022, "learning_rate": 0.0005, "loss": 1.3469, "step": 86000 }, { "epoch": 2.4166666666666665, "grad_norm": 1.4935026168823242, "learning_rate": 0.0005, "loss": 1.3469, "step": 87000 }, { "epoch": 2.4444444444444446, "grad_norm": 0.864630937576294, "learning_rate": 0.0005, "loss": 1.3506, "step": 88000 }, { "epoch": 2.4722222222222223, "grad_norm": 0.8495751619338989, "learning_rate": 0.0005, "loss": 1.3408, "step": 89000 }, { "epoch": 2.5, "grad_norm": 1.0840762853622437, "learning_rate": 0.0005, "loss": 1.3378, "step": 90000 }, { "epoch": 2.5, "eval_bleu": 20.7584, "eval_gen_len": 30.499, "eval_loss": 2.0603742599487305, "eval_runtime": 368.0992, "eval_samples_per_second": 8.15, "eval_steps_per_second": 1.019, "step": 90000 }, { "epoch": 2.5277777777777777, "grad_norm": 0.7769622802734375, "learning_rate": 0.0005, "loss": 1.3409, "step": 91000 }, { "epoch": 2.5555555555555554, "grad_norm": 1.049972414970398, "learning_rate": 0.0005, "loss": 1.3443, "step": 92000 }, { "epoch": 2.5833333333333335, "grad_norm": 0.965621292591095, "learning_rate": 0.0005, "loss": 1.342, "step": 93000 }, { "epoch": 2.611111111111111, "grad_norm": 0.8234182000160217, "learning_rate": 0.0005, "loss": 1.3297, "step": 94000 }, { "epoch": 2.638888888888889, "grad_norm": 0.9464855790138245, "learning_rate": 0.0005, "loss": 1.3345, "step": 95000 }, { "epoch": 2.6666666666666665, "grad_norm": 0.987382709980011, "learning_rate": 0.0005, "loss": 1.3284, "step": 96000 }, { "epoch": 2.6944444444444446, "grad_norm": 0.6439863443374634, "learning_rate": 0.0005, "loss": 1.3285, "step": 97000 }, { "epoch": 2.7222222222222223, "grad_norm": 0.8853390216827393, "learning_rate": 0.0005, "loss": 1.3339, "step": 98000 }, { "epoch": 2.75, "grad_norm": 0.7582658529281616, "learning_rate": 0.0005, "loss": 1.3281, "step": 99000 }, { "epoch": 2.7777777777777777, "grad_norm": 0.9061763882637024, "learning_rate": 0.0005, "loss": 1.3381, "step": 100000 }, { "epoch": 2.7777777777777777, "eval_bleu": 20.6113, "eval_gen_len": 30.701, "eval_loss": 2.059664726257324, "eval_runtime": 371.2241, "eval_samples_per_second": 8.081, "eval_steps_per_second": 1.01, "step": 100000 }, { "epoch": 2.7777777777777777, "step": 100000, "total_flos": 1.4240580791795712e+17, "train_loss": 0.5475473999023438, "train_runtime": 14821.2356, "train_samples_per_second": 107.953, "train_steps_per_second": 6.747 } ], "logging_steps": 1000, "max_steps": 100000, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 10000, "total_flos": 1.4240580791795712e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }