albert-base-v2-squad / trainer_state.json
madlag's picture
Inital commit.
50915c2
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9927797833935017,
"global_step": 11040,
"is_hyper_param_search": true,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"ampere_temperature": 0.0,
"ce_loss": 4.599947213172912,
"distil_loss": 0.0,
"epoch": 0.05,
"learning_rate": 0.001,
"loss": 4.5999,
"nnz_perc": 1.0,
"progress": 0.0,
"regu_lambda": 0.0,
"regu_loss": 0.0,
"step": 250,
"threshold": 1.0
},
{
"ampere_temperature": 0.0,
"ce_loss": 1.8339186582565308,
"distil_loss": 0.0,
"epoch": 0.09,
"learning_rate": 0.002,
"loss": 1.8339,
"nnz_perc": 1.0,
"progress": 0.0,
"regu_lambda": 0.0,
"regu_loss": 0.0,
"step": 500,
"threshold": 1.0
},
{
"ampere_temperature": 0.0,
"ce_loss": 1.3567713406085968,
"distil_loss": 0.0,
"epoch": 0.14,
"learning_rate": 0.003,
"loss": 1.3568,
"nnz_perc": 1.0,
"progress": 0.0,
"regu_lambda": 0.0,
"regu_loss": 0.0,
"step": 750,
"threshold": 1.0
},
{
"ampere_temperature": 0.0,
"ce_loss": 1.2095605379343033,
"distil_loss": 0.0,
"epoch": 0.18,
"learning_rate": 0.004,
"loss": 1.2096,
"nnz_perc": 1.0,
"progress": 0.0,
"regu_lambda": 0.0,
"regu_loss": 0.0,
"step": 1000,
"threshold": 1.0
},
{
"ampere_temperature": 0.0,
"ce_loss": 1.1451576855182648,
"distil_loss": 0.0,
"epoch": 0.23,
"learning_rate": 0.005,
"loss": 1.1452,
"nnz_perc": 1.0,
"progress": 0.0,
"regu_lambda": 0.0,
"regu_loss": 0.0,
"step": 1250,
"threshold": 1.0
},
{
"ce_loss": 1.1159179366551912,
"distil_loss": 0.0,
"epoch": 0.25,
"eval_ampere_temperature": 0.0,
"eval_exact_match": 77.360454115421,
"eval_f1": 86.34721419771964,
"eval_progress": 0.0,
"eval_regu_lambda": 0.0,
"eval_threshold": 1.0,
"nnz_perc": 1.0,
"regu_loss": 0.0,
"step": 1380
},
{
"ampere_temperature": 0.0,
"ce_loss": 1.0835382461547851,
"distil_loss": 0.0,
"epoch": 0.27,
"learning_rate": 0.006,
"loss": 1.1004,
"nnz_perc": 1.0,
"progress": 0.0,
"regu_lambda": 0.0,
"regu_loss": 0.0,
"step": 1500,
"threshold": 1.0
},
{
"ampere_temperature": 0.0,
"ce_loss": 1.1237352261543274,
"distil_loss": 0.0,
"epoch": 0.32,
"learning_rate": 0.006999999999999999,
"loss": 1.1237,
"nnz_perc": 1.0,
"progress": 0.0,
"regu_lambda": 0.0,
"regu_loss": 0.0,
"step": 1750,
"threshold": 1.0
},
{
"ampere_temperature": 0.0,
"ce_loss": 1.116382148861885,
"distil_loss": 0.0,
"epoch": 0.36,
"learning_rate": 0.008,
"loss": 1.1164,
"nnz_perc": 1.0,
"progress": 0.0,
"regu_lambda": 0.0,
"regu_loss": 0.0,
"step": 2000,
"threshold": 1.0
},
{
"ampere_temperature": 0.0,
"ce_loss": 1.0670130407810212,
"distil_loss": 0.0,
"epoch": 0.41,
"learning_rate": 0.009000000000000001,
"loss": 1.067,
"nnz_perc": 1.0,
"progress": 0.0,
"regu_lambda": 0.0,
"regu_loss": 0.0,
"step": 2250,
"threshold": 1.0
},
{
"ampere_temperature": 0.0,
"ce_loss": 1.0639437032938004,
"distil_loss": 0.0,
"epoch": 0.45,
"learning_rate": 0.01,
"loss": 1.0639,
"nnz_perc": 1.0,
"progress": 0.0,
"regu_lambda": 0.0,
"regu_loss": 0.0,
"step": 2500,
"threshold": 1.0
},
{
"ampere_temperature": 1.6912145472259645,
"ce_loss": 1.0629408322572709,
"distil_loss": 0.0,
"epoch": 0.5,
"learning_rate": 0.00970862470862471,
"loss": 1.0629,
"nnz_perc": 1.0,
"progress": 0.029020979020979,
"regu_lambda": 0.0,
"regu_loss": 0.0,
"step": 2750,
"threshold": 1.0
},
{
"ce_loss": 1.0985989689826965,
"distil_loss": 0.0,
"epoch": 0.5,
"eval_ampere_temperature": 1.7570655286803998,
"eval_exact_match": 75.37369914853359,
"eval_f1": 85.4846023509551,
"eval_progress": 0.03018648018648018,
"eval_regu_lambda": 0.0,
"eval_threshold": 1.0,
"nnz_perc": 1.0,
"regu_loss": 0.0,
"step": 2760
},
{
"ampere_temperature": 3.2905000860378912,
"ce_loss": 1.0230497049788634,
"distil_loss": 0.0,
"epoch": 0.54,
"learning_rate": 0.009417249417249416,
"loss": 1.0261,
"nnz_perc": 1.0,
"progress": 0.058158508158508204,
"regu_lambda": 0.0,
"regu_loss": 0.0,
"step": 3000,
"threshold": 1.0
},
{
"ampere_temperature": 4.793831310474058,
"ce_loss": 0.9981409941911698,
"distil_loss": 0.0,
"epoch": 0.59,
"learning_rate": 0.009125874125874126,
"loss": 0.9981,
"nnz_perc": 1.0,
"progress": 0.0872960372960373,
"regu_lambda": 0.0,
"regu_loss": 0.0,
"step": 3250,
"threshold": 1.0
},
{
"ampere_temperature": 6.204176736633212,
"ce_loss": 1.0074045011997224,
"distil_loss": 0.0,
"epoch": 0.63,
"learning_rate": 0.008834498834498834,
"loss": 1.0074,
"nnz_perc": 1.0,
"progress": 0.1164335664335664,
"regu_lambda": 0.0,
"regu_loss": 0.0,
"step": 3500,
"threshold": 1.0
},
{
"ampere_temperature": 7.524504880614105,
"ce_loss": 0.9891920503377915,
"distil_loss": 0.0,
"epoch": 0.68,
"learning_rate": 0.008543123543123544,
"loss": 0.9892,
"nnz_perc": 1.0,
"progress": 0.1455710955710956,
"regu_lambda": 0.0,
"regu_loss": 0.0,
"step": 3750,
"threshold": 1.0
},
{
"ampere_temperature": 8.757784258515466,
"ce_loss": 1.0083434996008873,
"distil_loss": 0.0,
"epoch": 0.72,
"learning_rate": 0.008251748251748252,
"loss": 1.0083,
"nnz_perc": 1.0,
"progress": 0.1747086247086247,
"regu_lambda": 0.0,
"regu_loss": 0.0,
"step": 4000,
"threshold": 1.0
},
{
"ce_loss": 0.9699458577803203,
"distil_loss": 0.0,
"epoch": 0.75,
"eval_ampere_temperature": 9.411504281933276,
"eval_exact_match": 79.94323557237465,
"eval_f1": 88.17033886272301,
"eval_progress": 0.191025641025641,
"eval_regu_lambda": 0.0,
"eval_threshold": 1.0,
"nnz_perc": 1.0,
"regu_loss": 0.0,
"step": 4140
},
{
"ampere_temperature": 9.906983386436048,
"ce_loss": 0.9698418254202062,
"distil_loss": 0.0,
"epoch": 0.77,
"learning_rate": 0.00796037296037296,
"loss": 0.9699,
"nnz_perc": 1.0,
"progress": 0.2038461538461538,
"regu_lambda": 0.0,
"regu_loss": 0.0,
"step": 4250,
"threshold": 1.0
},
{
"ampere_temperature": 10.97507078047459,
"ce_loss": 0.9425091907978058,
"distil_loss": 0.0,
"epoch": 0.81,
"learning_rate": 0.007668997668997669,
"loss": 0.9425,
"nnz_perc": 1.0,
"progress": 0.232983682983683,
"regu_lambda": 0.0,
"regu_loss": 0.0,
"step": 4500,
"threshold": 1.0
},
{
"ampere_temperature": 11.965014956729835,
"ce_loss": 0.9731772248744964,
"distil_loss": 0.0,
"epoch": 0.86,
"learning_rate": 0.007377622377622378,
"loss": 0.9732,
"nnz_perc": 1.0,
"progress": 0.2621212121212122,
"regu_lambda": 0.0,
"regu_loss": 0.0,
"step": 4750,
"threshold": 1.0
},
{
"ampere_temperature": 12.879784431300521,
"ce_loss": 0.9197172073125839,
"distil_loss": 0.0,
"epoch": 0.9,
"learning_rate": 0.007086247086247086,
"loss": 0.9197,
"nnz_perc": 1.0,
"progress": 0.2912587412587413,
"regu_lambda": 0.0,
"regu_loss": 0.0,
"step": 5000,
"threshold": 1.0
},
{
"ampere_temperature": 13.722347720285395,
"ce_loss": 0.9390108388662338,
"distil_loss": 0.0,
"epoch": 0.95,
"learning_rate": 0.006794871794871795,
"loss": 0.939,
"nnz_perc": 1.0,
"progress": 0.3203962703962704,
"regu_lambda": 0.0,
"regu_loss": 0.0,
"step": 5250,
"threshold": 1.0
},
{
"ampere_temperature": 14.495673339783197,
"ce_loss": 0.9188237161636352,
"distil_loss": 0.0,
"epoch": 0.99,
"learning_rate": 0.006503496503496503,
"loss": 0.9188,
"nnz_perc": 1.0,
"progress": 0.3495337995337995,
"regu_lambda": 0.0,
"regu_loss": 0.0,
"step": 5500,
"threshold": 1.0
},
{
"ce_loss": 0.9402093678712845,
"distil_loss": 0.0,
"epoch": 1.0,
"eval_ampere_temperature": 14.55463723501537,
"eval_exact_match": 81.63670766319773,
"eval_f1": 89.21446798933258,
"eval_progress": 0.35186480186480185,
"eval_regu_lambda": 0.0,
"eval_threshold": 1.0,
"nnz_perc": 1.0,
"regu_loss": 0.0,
"step": 5520
},
{
"ampere_temperature": 15.202729805892675,
"ce_loss": 0.7292252867118172,
"distil_loss": 0.0,
"epoch": 1.04,
"learning_rate": 0.006212121212121212,
"loss": 0.7461,
"nnz_perc": 1.0,
"progress": 0.3786713286713287,
"regu_lambda": 0.0,
"regu_loss": 0.0,
"step": 5750,
"threshold": 1.0
},
{
"ampere_temperature": 15.846485634712565,
"ce_loss": 0.7380791381597519,
"distil_loss": 0.0,
"epoch": 1.08,
"learning_rate": 0.005920745920745921,
"loss": 0.7381,
"nnz_perc": 1.0,
"progress": 0.4078088578088578,
"regu_lambda": 0.0,
"regu_loss": 0.0,
"step": 6000,
"threshold": 1.0
},
{
"ampere_temperature": 16.429909342341613,
"ce_loss": 0.7548821606636047,
"distil_loss": 0.0,
"epoch": 1.13,
"learning_rate": 0.005629370629370629,
"loss": 0.7549,
"nnz_perc": 1.0,
"progress": 0.436946386946387,
"regu_lambda": 0.0,
"regu_loss": 0.0,
"step": 6250,
"threshold": 1.0
},
{
"ampere_temperature": 16.955969444878562,
"ce_loss": 0.7157313173413277,
"distil_loss": 0.0,
"epoch": 1.17,
"learning_rate": 0.005337995337995338,
"loss": 0.7157,
"nnz_perc": 1.0,
"progress": 0.4660839160839161,
"regu_lambda": 0.0,
"regu_loss": 0.0,
"step": 6500,
"threshold": 1.0
},
{
"ampere_temperature": 17.427634458422148,
"ce_loss": 0.7611533465385437,
"distil_loss": 0.0,
"epoch": 1.22,
"learning_rate": 0.005046620046620046,
"loss": 0.7612,
"nnz_perc": 1.0,
"progress": 0.4952214452214452,
"regu_lambda": 0.0,
"regu_loss": 0.0,
"step": 6750,
"threshold": 1.0
},
{
"ce_loss": 0.7508984424670537,
"distil_loss": 0.0,
"epoch": 1.25,
"eval_ampere_temperature": 17.68575872652857,
"eval_exact_match": 81.51371807000946,
"eval_f1": 88.80037767793473,
"eval_progress": 0.5127039627039627,
"eval_regu_lambda": 0.0,
"eval_threshold": 1.0,
"nnz_perc": 1.0,
"regu_loss": 0.0,
"step": 6900
},
{
"ampere_temperature": 17.847872899071124,
"ce_loss": 0.6947148644924164,
"distil_loss": 0.0,
"epoch": 1.26,
"learning_rate": 0.004755244755244755,
"loss": 0.7284,
"nnz_perc": 1.0,
"progress": 0.5243589743589744,
"regu_lambda": 0.0,
"regu_loss": 0.0,
"step": 7000,
"threshold": 1.0
},
{
"ampere_temperature": 18.219653282924224,
"ce_loss": 0.7663285417556762,
"distil_loss": 0.0,
"epoch": 1.31,
"learning_rate": 0.004463869463869464,
"loss": 0.7663,
"nnz_perc": 1.0,
"progress": 0.5534965034965035,
"regu_lambda": 0.0,
"regu_loss": 0.0,
"step": 7250,
"threshold": 1.0
},
{
"ampere_temperature": 18.545944126080197,
"ce_loss": 0.691897637873888,
"distil_loss": 0.0,
"epoch": 1.35,
"learning_rate": 0.004172494172494173,
"loss": 0.6919,
"nnz_perc": 1.0,
"progress": 0.5826340326340327,
"regu_lambda": 0.0,
"regu_loss": 0.0,
"step": 7500,
"threshold": 1.0
},
{
"ampere_temperature": 18.82971394463778,
"ce_loss": 0.7088325002193451,
"distil_loss": 0.0,
"epoch": 1.4,
"learning_rate": 0.0038811188811188812,
"loss": 0.7088,
"nnz_perc": 1.0,
"progress": 0.6117715617715618,
"regu_lambda": 0.0,
"regu_loss": 0.0,
"step": 7750,
"threshold": 1.0
},
{
"ampere_temperature": 19.07393125469572,
"ce_loss": 0.7107383124232293,
"distil_loss": 0.0,
"epoch": 1.44,
"learning_rate": 0.0035897435897435897,
"loss": 0.7107,
"nnz_perc": 1.0,
"progress": 0.6409090909090909,
"regu_lambda": 0.0,
"regu_loss": 0.0,
"step": 8000,
"threshold": 1.0
},
{
"ampere_temperature": 19.281564572352753,
"ce_loss": 0.7073436776399612,
"distil_loss": 0.0,
"epoch": 1.49,
"learning_rate": 0.0032983682983682983,
"loss": 0.7073,
"nnz_perc": 1.0,
"progress": 0.6700466200466201,
"regu_lambda": 0.0,
"regu_loss": 0.0,
"step": 8250,
"threshold": 1.0
},
{
"ce_loss": 0.7176821072896321,
"distil_loss": 0.0,
"epoch": 1.49,
"eval_ampere_temperature": 19.304163095074752,
"eval_exact_match": 82.71523178807946,
"eval_f1": 89.82467226075393,
"eval_progress": 0.6735431235431235,
"eval_regu_lambda": 0.0,
"eval_threshold": 1.0,
"nnz_perc": 1.0,
"regu_loss": 0.0,
"step": 8280
},
{
"ampere_temperature": 19.455582413707628,
"ce_loss": 0.7027889224615964,
"distil_loss": 0.0,
"epoch": 1.53,
"learning_rate": 0.0030069930069930068,
"loss": 0.7046,
"nnz_perc": 1.0,
"progress": 0.6991841491841492,
"regu_lambda": 0.0,
"regu_loss": 0.0,
"step": 8500,
"threshold": 1.0
},
{
"ampere_temperature": 19.598953294859086,
"ce_loss": 0.6954642720222474,
"distil_loss": 0.0,
"epoch": 1.58,
"learning_rate": 0.0027156177156177157,
"loss": 0.6955,
"nnz_perc": 1.0,
"progress": 0.7283216783216783,
"regu_lambda": 0.0,
"regu_loss": 0.0,
"step": 8750,
"threshold": 1.0
},
{
"ampere_temperature": 19.71464573190587,
"ce_loss": 0.7050508892536164,
"distil_loss": 0.0,
"epoch": 1.62,
"learning_rate": 0.0024242424242424242,
"loss": 0.7051,
"nnz_perc": 1.0,
"progress": 0.7574592074592075,
"regu_lambda": 0.0,
"regu_loss": 0.0,
"step": 9000,
"threshold": 1.0
},
{
"ampere_temperature": 19.805628240946717,
"ce_loss": 0.6534205512404442,
"distil_loss": 0.0,
"epoch": 1.67,
"learning_rate": 0.0021328671328671328,
"loss": 0.6534,
"nnz_perc": 1.0,
"progress": 0.7865967365967366,
"regu_lambda": 0.0,
"regu_loss": 0.0,
"step": 9250,
"threshold": 1.0
},
{
"ampere_temperature": 19.874869338080376,
"ce_loss": 0.6931327093839645,
"distil_loss": 0.0,
"epoch": 1.71,
"learning_rate": 0.0018414918414918417,
"loss": 0.6931,
"nnz_perc": 1.0,
"progress": 0.8157342657342658,
"regu_lambda": 0.0,
"regu_loss": 0.0,
"step": 9500,
"threshold": 1.0
},
{
"ce_loss": 0.6803905916400254,
"distil_loss": 0.0,
"epoch": 1.74,
"eval_ampere_temperature": 19.90914467925581,
"eval_exact_match": 83.3112582781457,
"eval_f1": 90.48253679391624,
"eval_progress": 0.8343822843822843,
"eval_regu_lambda": 0.0,
"eval_threshold": 1.0,
"nnz_perc": 1.0,
"regu_loss": 0.0,
"step": 9660
},
{
"ampere_temperature": 19.925337539405586,
"ce_loss": 0.6604658047358195,
"distil_loss": 0.0,
"epoch": 1.76,
"learning_rate": 0.0015501165501165502,
"loss": 0.6732,
"nnz_perc": 1.0,
"progress": 0.8448717948717949,
"regu_lambda": 0.0,
"regu_loss": 0.0,
"step": 9750,
"threshold": 1.0
},
{
"ampere_temperature": 19.960001361021092,
"ce_loss": 0.6589477426409721,
"distil_loss": 0.0,
"epoch": 1.81,
"learning_rate": 0.001258741258741259,
"loss": 0.6589,
"nnz_perc": 1.0,
"progress": 0.874009324009324,
"regu_lambda": 0.0,
"regu_loss": 0.0,
"step": 10000,
"threshold": 1.0
},
{
"ampere_temperature": 19.981829319025636,
"ce_loss": 0.6645486508607864,
"distil_loss": 0.0,
"epoch": 1.85,
"learning_rate": 0.0009673659673659674,
"loss": 0.6645,
"nnz_perc": 1.0,
"progress": 0.9031468531468532,
"regu_lambda": 0.0,
"regu_loss": 0.0,
"step": 10250,
"threshold": 1.0
},
{
"ampere_temperature": 19.99378992951796,
"ce_loss": 0.6627120378017426,
"distil_loss": 0.0,
"epoch": 1.9,
"learning_rate": 0.000675990675990676,
"loss": 0.6627,
"nnz_perc": 1.0,
"progress": 0.9322843822843823,
"regu_lambda": 0.0,
"regu_loss": 0.0,
"step": 10500,
"threshold": 1.0
},
{
"ampere_temperature": 19.998851708596806,
"ce_loss": 0.6525639802217483,
"distil_loss": 0.0,
"epoch": 1.94,
"learning_rate": 0.00038461538461538467,
"loss": 0.6526,
"nnz_perc": 1.0,
"progress": 0.9614219114219115,
"regu_lambda": 0.0,
"regu_loss": 0.0,
"step": 10750,
"threshold": 1.0
},
{
"ampere_temperature": 19.999983172360917,
"ce_loss": 0.630506355702877,
"distil_loss": 0.0,
"epoch": 1.99,
"learning_rate": 9.324009324009324e-05,
"loss": 0.6305,
"nnz_perc": 1.0,
"progress": 0.9905594405594406,
"regu_lambda": 0.0,
"regu_loss": 0.0,
"step": 11000,
"threshold": 1.0
},
{
"ce_loss": 0.6976410485804081,
"distil_loss": 0.0,
"epoch": 1.99,
"eval_ampere_temperature": 19.99999781767362,
"eval_exact_match": 83.74645222327341,
"eval_f1": 90.78776054621733,
"eval_progress": 0.9952214452214452,
"eval_regu_lambda": 0.0,
"eval_threshold": 1.0,
"nnz_perc": 1.0,
"regu_loss": 0.0,
"step": 11040
}
],
"max_steps": 11080,
"num_train_epochs": 2,
"total_flos": 0,
"trial_name": "hp_mnop-albert-base-v2_tn-albert-base-v2_od-__data_2to__devel_data__nn_pruning__output_sequence__squad_test_teacher___es-steps_pdebs128_nte2_ws2500_ls250_ss1380_stl50_est1380_rn-__da--3c944a736efd9cf3",
"trial_params": {}
}