{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9927797833935017, "global_step": 11040, "is_hyper_param_search": true, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "ampere_temperature": 0.0, "ce_loss": 4.599947213172912, "distil_loss": 0.0, "epoch": 0.05, "learning_rate": 0.001, "loss": 4.5999, "nnz_perc": 1.0, "progress": 0.0, "regu_lambda": 0.0, "regu_loss": 0.0, "step": 250, "threshold": 1.0 }, { "ampere_temperature": 0.0, "ce_loss": 1.8339186582565308, "distil_loss": 0.0, "epoch": 0.09, "learning_rate": 0.002, "loss": 1.8339, "nnz_perc": 1.0, "progress": 0.0, "regu_lambda": 0.0, "regu_loss": 0.0, "step": 500, "threshold": 1.0 }, { "ampere_temperature": 0.0, "ce_loss": 1.3567713406085968, "distil_loss": 0.0, "epoch": 0.14, "learning_rate": 0.003, "loss": 1.3568, "nnz_perc": 1.0, "progress": 0.0, "regu_lambda": 0.0, "regu_loss": 0.0, "step": 750, "threshold": 1.0 }, { "ampere_temperature": 0.0, "ce_loss": 1.2095605379343033, "distil_loss": 0.0, "epoch": 0.18, "learning_rate": 0.004, "loss": 1.2096, "nnz_perc": 1.0, "progress": 0.0, "regu_lambda": 0.0, "regu_loss": 0.0, "step": 1000, "threshold": 1.0 }, { "ampere_temperature": 0.0, "ce_loss": 1.1451576855182648, "distil_loss": 0.0, "epoch": 0.23, "learning_rate": 0.005, "loss": 1.1452, "nnz_perc": 1.0, "progress": 0.0, "regu_lambda": 0.0, "regu_loss": 0.0, "step": 1250, "threshold": 1.0 }, { "ce_loss": 1.1159179366551912, "distil_loss": 0.0, "epoch": 0.25, "eval_ampere_temperature": 0.0, "eval_exact_match": 77.360454115421, "eval_f1": 86.34721419771964, "eval_progress": 0.0, "eval_regu_lambda": 0.0, "eval_threshold": 1.0, "nnz_perc": 1.0, "regu_loss": 0.0, "step": 1380 }, { "ampere_temperature": 0.0, "ce_loss": 1.0835382461547851, "distil_loss": 0.0, "epoch": 0.27, "learning_rate": 0.006, "loss": 1.1004, "nnz_perc": 1.0, "progress": 0.0, "regu_lambda": 0.0, "regu_loss": 0.0, "step": 1500, "threshold": 1.0 }, { "ampere_temperature": 0.0, "ce_loss": 1.1237352261543274, "distil_loss": 0.0, "epoch": 0.32, "learning_rate": 0.006999999999999999, "loss": 1.1237, "nnz_perc": 1.0, "progress": 0.0, "regu_lambda": 0.0, "regu_loss": 0.0, "step": 1750, "threshold": 1.0 }, { "ampere_temperature": 0.0, "ce_loss": 1.116382148861885, "distil_loss": 0.0, "epoch": 0.36, "learning_rate": 0.008, "loss": 1.1164, "nnz_perc": 1.0, "progress": 0.0, "regu_lambda": 0.0, "regu_loss": 0.0, "step": 2000, "threshold": 1.0 }, { "ampere_temperature": 0.0, "ce_loss": 1.0670130407810212, "distil_loss": 0.0, "epoch": 0.41, "learning_rate": 0.009000000000000001, "loss": 1.067, "nnz_perc": 1.0, "progress": 0.0, "regu_lambda": 0.0, "regu_loss": 0.0, "step": 2250, "threshold": 1.0 }, { "ampere_temperature": 0.0, "ce_loss": 1.0639437032938004, "distil_loss": 0.0, "epoch": 0.45, "learning_rate": 0.01, "loss": 1.0639, "nnz_perc": 1.0, "progress": 0.0, "regu_lambda": 0.0, "regu_loss": 0.0, "step": 2500, "threshold": 1.0 }, { "ampere_temperature": 1.6912145472259645, "ce_loss": 1.0629408322572709, "distil_loss": 0.0, "epoch": 0.5, "learning_rate": 0.00970862470862471, "loss": 1.0629, "nnz_perc": 1.0, "progress": 0.029020979020979, "regu_lambda": 0.0, "regu_loss": 0.0, "step": 2750, "threshold": 1.0 }, { "ce_loss": 1.0985989689826965, "distil_loss": 0.0, "epoch": 0.5, "eval_ampere_temperature": 1.7570655286803998, "eval_exact_match": 75.37369914853359, "eval_f1": 85.4846023509551, "eval_progress": 0.03018648018648018, "eval_regu_lambda": 0.0, "eval_threshold": 1.0, "nnz_perc": 1.0, "regu_loss": 0.0, "step": 2760 }, { "ampere_temperature": 3.2905000860378912, "ce_loss": 1.0230497049788634, "distil_loss": 0.0, "epoch": 0.54, "learning_rate": 0.009417249417249416, "loss": 1.0261, "nnz_perc": 1.0, "progress": 0.058158508158508204, "regu_lambda": 0.0, "regu_loss": 0.0, "step": 3000, "threshold": 1.0 }, { "ampere_temperature": 4.793831310474058, "ce_loss": 0.9981409941911698, "distil_loss": 0.0, "epoch": 0.59, "learning_rate": 0.009125874125874126, "loss": 0.9981, "nnz_perc": 1.0, "progress": 0.0872960372960373, "regu_lambda": 0.0, "regu_loss": 0.0, "step": 3250, "threshold": 1.0 }, { "ampere_temperature": 6.204176736633212, "ce_loss": 1.0074045011997224, "distil_loss": 0.0, "epoch": 0.63, "learning_rate": 0.008834498834498834, "loss": 1.0074, "nnz_perc": 1.0, "progress": 0.1164335664335664, "regu_lambda": 0.0, "regu_loss": 0.0, "step": 3500, "threshold": 1.0 }, { "ampere_temperature": 7.524504880614105, "ce_loss": 0.9891920503377915, "distil_loss": 0.0, "epoch": 0.68, "learning_rate": 0.008543123543123544, "loss": 0.9892, "nnz_perc": 1.0, "progress": 0.1455710955710956, "regu_lambda": 0.0, "regu_loss": 0.0, "step": 3750, "threshold": 1.0 }, { "ampere_temperature": 8.757784258515466, "ce_loss": 1.0083434996008873, "distil_loss": 0.0, "epoch": 0.72, "learning_rate": 0.008251748251748252, "loss": 1.0083, "nnz_perc": 1.0, "progress": 0.1747086247086247, "regu_lambda": 0.0, "regu_loss": 0.0, "step": 4000, "threshold": 1.0 }, { "ce_loss": 0.9699458577803203, "distil_loss": 0.0, "epoch": 0.75, "eval_ampere_temperature": 9.411504281933276, "eval_exact_match": 79.94323557237465, "eval_f1": 88.17033886272301, "eval_progress": 0.191025641025641, "eval_regu_lambda": 0.0, "eval_threshold": 1.0, "nnz_perc": 1.0, "regu_loss": 0.0, "step": 4140 }, { "ampere_temperature": 9.906983386436048, "ce_loss": 0.9698418254202062, "distil_loss": 0.0, "epoch": 0.77, "learning_rate": 0.00796037296037296, "loss": 0.9699, "nnz_perc": 1.0, "progress": 0.2038461538461538, "regu_lambda": 0.0, "regu_loss": 0.0, "step": 4250, "threshold": 1.0 }, { "ampere_temperature": 10.97507078047459, "ce_loss": 0.9425091907978058, "distil_loss": 0.0, "epoch": 0.81, "learning_rate": 0.007668997668997669, "loss": 0.9425, "nnz_perc": 1.0, "progress": 0.232983682983683, "regu_lambda": 0.0, "regu_loss": 0.0, "step": 4500, "threshold": 1.0 }, { "ampere_temperature": 11.965014956729835, "ce_loss": 0.9731772248744964, "distil_loss": 0.0, "epoch": 0.86, "learning_rate": 0.007377622377622378, "loss": 0.9732, "nnz_perc": 1.0, "progress": 0.2621212121212122, "regu_lambda": 0.0, "regu_loss": 0.0, "step": 4750, "threshold": 1.0 }, { "ampere_temperature": 12.879784431300521, "ce_loss": 0.9197172073125839, "distil_loss": 0.0, "epoch": 0.9, "learning_rate": 0.007086247086247086, "loss": 0.9197, "nnz_perc": 1.0, "progress": 0.2912587412587413, "regu_lambda": 0.0, "regu_loss": 0.0, "step": 5000, "threshold": 1.0 }, { "ampere_temperature": 13.722347720285395, "ce_loss": 0.9390108388662338, "distil_loss": 0.0, "epoch": 0.95, "learning_rate": 0.006794871794871795, "loss": 0.939, "nnz_perc": 1.0, "progress": 0.3203962703962704, "regu_lambda": 0.0, "regu_loss": 0.0, "step": 5250, "threshold": 1.0 }, { "ampere_temperature": 14.495673339783197, "ce_loss": 0.9188237161636352, "distil_loss": 0.0, "epoch": 0.99, "learning_rate": 0.006503496503496503, "loss": 0.9188, "nnz_perc": 1.0, "progress": 0.3495337995337995, "regu_lambda": 0.0, "regu_loss": 0.0, "step": 5500, "threshold": 1.0 }, { "ce_loss": 0.9402093678712845, "distil_loss": 0.0, "epoch": 1.0, "eval_ampere_temperature": 14.55463723501537, "eval_exact_match": 81.63670766319773, "eval_f1": 89.21446798933258, "eval_progress": 0.35186480186480185, "eval_regu_lambda": 0.0, "eval_threshold": 1.0, "nnz_perc": 1.0, "regu_loss": 0.0, "step": 5520 }, { "ampere_temperature": 15.202729805892675, "ce_loss": 0.7292252867118172, "distil_loss": 0.0, "epoch": 1.04, "learning_rate": 0.006212121212121212, "loss": 0.7461, "nnz_perc": 1.0, "progress": 0.3786713286713287, "regu_lambda": 0.0, "regu_loss": 0.0, "step": 5750, "threshold": 1.0 }, { "ampere_temperature": 15.846485634712565, "ce_loss": 0.7380791381597519, "distil_loss": 0.0, "epoch": 1.08, "learning_rate": 0.005920745920745921, "loss": 0.7381, "nnz_perc": 1.0, "progress": 0.4078088578088578, "regu_lambda": 0.0, "regu_loss": 0.0, "step": 6000, "threshold": 1.0 }, { "ampere_temperature": 16.429909342341613, "ce_loss": 0.7548821606636047, "distil_loss": 0.0, "epoch": 1.13, "learning_rate": 0.005629370629370629, "loss": 0.7549, "nnz_perc": 1.0, "progress": 0.436946386946387, "regu_lambda": 0.0, "regu_loss": 0.0, "step": 6250, "threshold": 1.0 }, { "ampere_temperature": 16.955969444878562, "ce_loss": 0.7157313173413277, "distil_loss": 0.0, "epoch": 1.17, "learning_rate": 0.005337995337995338, "loss": 0.7157, "nnz_perc": 1.0, "progress": 0.4660839160839161, "regu_lambda": 0.0, "regu_loss": 0.0, "step": 6500, "threshold": 1.0 }, { "ampere_temperature": 17.427634458422148, "ce_loss": 0.7611533465385437, "distil_loss": 0.0, "epoch": 1.22, "learning_rate": 0.005046620046620046, "loss": 0.7612, "nnz_perc": 1.0, "progress": 0.4952214452214452, "regu_lambda": 0.0, "regu_loss": 0.0, "step": 6750, "threshold": 1.0 }, { "ce_loss": 0.7508984424670537, "distil_loss": 0.0, "epoch": 1.25, "eval_ampere_temperature": 17.68575872652857, "eval_exact_match": 81.51371807000946, "eval_f1": 88.80037767793473, "eval_progress": 0.5127039627039627, "eval_regu_lambda": 0.0, "eval_threshold": 1.0, "nnz_perc": 1.0, "regu_loss": 0.0, "step": 6900 }, { "ampere_temperature": 17.847872899071124, "ce_loss": 0.6947148644924164, "distil_loss": 0.0, "epoch": 1.26, "learning_rate": 0.004755244755244755, "loss": 0.7284, "nnz_perc": 1.0, "progress": 0.5243589743589744, "regu_lambda": 0.0, "regu_loss": 0.0, "step": 7000, "threshold": 1.0 }, { "ampere_temperature": 18.219653282924224, "ce_loss": 0.7663285417556762, "distil_loss": 0.0, "epoch": 1.31, "learning_rate": 0.004463869463869464, "loss": 0.7663, "nnz_perc": 1.0, "progress": 0.5534965034965035, "regu_lambda": 0.0, "regu_loss": 0.0, "step": 7250, "threshold": 1.0 }, { "ampere_temperature": 18.545944126080197, "ce_loss": 0.691897637873888, "distil_loss": 0.0, "epoch": 1.35, "learning_rate": 0.004172494172494173, "loss": 0.6919, "nnz_perc": 1.0, "progress": 0.5826340326340327, "regu_lambda": 0.0, "regu_loss": 0.0, "step": 7500, "threshold": 1.0 }, { "ampere_temperature": 18.82971394463778, "ce_loss": 0.7088325002193451, "distil_loss": 0.0, "epoch": 1.4, "learning_rate": 0.0038811188811188812, "loss": 0.7088, "nnz_perc": 1.0, "progress": 0.6117715617715618, "regu_lambda": 0.0, "regu_loss": 0.0, "step": 7750, "threshold": 1.0 }, { "ampere_temperature": 19.07393125469572, "ce_loss": 0.7107383124232293, "distil_loss": 0.0, "epoch": 1.44, "learning_rate": 0.0035897435897435897, "loss": 0.7107, "nnz_perc": 1.0, "progress": 0.6409090909090909, "regu_lambda": 0.0, "regu_loss": 0.0, "step": 8000, "threshold": 1.0 }, { "ampere_temperature": 19.281564572352753, "ce_loss": 0.7073436776399612, "distil_loss": 0.0, "epoch": 1.49, "learning_rate": 0.0032983682983682983, "loss": 0.7073, "nnz_perc": 1.0, "progress": 0.6700466200466201, "regu_lambda": 0.0, "regu_loss": 0.0, "step": 8250, "threshold": 1.0 }, { "ce_loss": 0.7176821072896321, "distil_loss": 0.0, "epoch": 1.49, "eval_ampere_temperature": 19.304163095074752, "eval_exact_match": 82.71523178807946, "eval_f1": 89.82467226075393, "eval_progress": 0.6735431235431235, "eval_regu_lambda": 0.0, "eval_threshold": 1.0, "nnz_perc": 1.0, "regu_loss": 0.0, "step": 8280 }, { "ampere_temperature": 19.455582413707628, "ce_loss": 0.7027889224615964, "distil_loss": 0.0, "epoch": 1.53, "learning_rate": 0.0030069930069930068, "loss": 0.7046, "nnz_perc": 1.0, "progress": 0.6991841491841492, "regu_lambda": 0.0, "regu_loss": 0.0, "step": 8500, "threshold": 1.0 }, { "ampere_temperature": 19.598953294859086, "ce_loss": 0.6954642720222474, "distil_loss": 0.0, "epoch": 1.58, "learning_rate": 0.0027156177156177157, "loss": 0.6955, "nnz_perc": 1.0, "progress": 0.7283216783216783, "regu_lambda": 0.0, "regu_loss": 0.0, "step": 8750, "threshold": 1.0 }, { "ampere_temperature": 19.71464573190587, "ce_loss": 0.7050508892536164, "distil_loss": 0.0, "epoch": 1.62, "learning_rate": 0.0024242424242424242, "loss": 0.7051, "nnz_perc": 1.0, "progress": 0.7574592074592075, "regu_lambda": 0.0, "regu_loss": 0.0, "step": 9000, "threshold": 1.0 }, { "ampere_temperature": 19.805628240946717, "ce_loss": 0.6534205512404442, "distil_loss": 0.0, "epoch": 1.67, "learning_rate": 0.0021328671328671328, "loss": 0.6534, "nnz_perc": 1.0, "progress": 0.7865967365967366, "regu_lambda": 0.0, "regu_loss": 0.0, "step": 9250, "threshold": 1.0 }, { "ampere_temperature": 19.874869338080376, "ce_loss": 0.6931327093839645, "distil_loss": 0.0, "epoch": 1.71, "learning_rate": 0.0018414918414918417, "loss": 0.6931, "nnz_perc": 1.0, "progress": 0.8157342657342658, "regu_lambda": 0.0, "regu_loss": 0.0, "step": 9500, "threshold": 1.0 }, { "ce_loss": 0.6803905916400254, "distil_loss": 0.0, "epoch": 1.74, "eval_ampere_temperature": 19.90914467925581, "eval_exact_match": 83.3112582781457, "eval_f1": 90.48253679391624, "eval_progress": 0.8343822843822843, "eval_regu_lambda": 0.0, "eval_threshold": 1.0, "nnz_perc": 1.0, "regu_loss": 0.0, "step": 9660 }, { "ampere_temperature": 19.925337539405586, "ce_loss": 0.6604658047358195, "distil_loss": 0.0, "epoch": 1.76, "learning_rate": 0.0015501165501165502, "loss": 0.6732, "nnz_perc": 1.0, "progress": 0.8448717948717949, "regu_lambda": 0.0, "regu_loss": 0.0, "step": 9750, "threshold": 1.0 }, { "ampere_temperature": 19.960001361021092, "ce_loss": 0.6589477426409721, "distil_loss": 0.0, "epoch": 1.81, "learning_rate": 0.001258741258741259, "loss": 0.6589, "nnz_perc": 1.0, "progress": 0.874009324009324, "regu_lambda": 0.0, "regu_loss": 0.0, "step": 10000, "threshold": 1.0 }, { "ampere_temperature": 19.981829319025636, "ce_loss": 0.6645486508607864, "distil_loss": 0.0, "epoch": 1.85, "learning_rate": 0.0009673659673659674, "loss": 0.6645, "nnz_perc": 1.0, "progress": 0.9031468531468532, "regu_lambda": 0.0, "regu_loss": 0.0, "step": 10250, "threshold": 1.0 }, { "ampere_temperature": 19.99378992951796, "ce_loss": 0.6627120378017426, "distil_loss": 0.0, "epoch": 1.9, "learning_rate": 0.000675990675990676, "loss": 0.6627, "nnz_perc": 1.0, "progress": 0.9322843822843823, "regu_lambda": 0.0, "regu_loss": 0.0, "step": 10500, "threshold": 1.0 }, { "ampere_temperature": 19.998851708596806, "ce_loss": 0.6525639802217483, "distil_loss": 0.0, "epoch": 1.94, "learning_rate": 0.00038461538461538467, "loss": 0.6526, "nnz_perc": 1.0, "progress": 0.9614219114219115, "regu_lambda": 0.0, "regu_loss": 0.0, "step": 10750, "threshold": 1.0 }, { "ampere_temperature": 19.999983172360917, "ce_loss": 0.630506355702877, "distil_loss": 0.0, "epoch": 1.99, "learning_rate": 9.324009324009324e-05, "loss": 0.6305, "nnz_perc": 1.0, "progress": 0.9905594405594406, "regu_lambda": 0.0, "regu_loss": 0.0, "step": 11000, "threshold": 1.0 }, { "ce_loss": 0.6976410485804081, "distil_loss": 0.0, "epoch": 1.99, "eval_ampere_temperature": 19.99999781767362, "eval_exact_match": 83.74645222327341, "eval_f1": 90.78776054621733, "eval_progress": 0.9952214452214452, "eval_regu_lambda": 0.0, "eval_threshold": 1.0, "nnz_perc": 1.0, "regu_loss": 0.0, "step": 11040 } ], "max_steps": 11080, "num_train_epochs": 2, "total_flos": 0, "trial_name": "hp_mnop-albert-base-v2_tn-albert-base-v2_od-__data_2to__devel_data__nn_pruning__output_sequence__squad_test_teacher___es-steps_pdebs128_nte2_ws2500_ls250_ss1380_stl50_est1380_rn-__da--3c944a736efd9cf3", "trial_params": {} }