LingConv / ckpt /ling_disc /trainer_state.json
mohdelgaar's picture
Update layout and samples
674b430
raw
history blame
15.9 kB
{
"best_metric": 0.05535305291414261,
"best_model_checkpoint": "/data/mohamed/checkpoints/ling_disc/deberta-v3-small_flan-t5-base_40/checkpoint-41000",
"epoch": 30.0,
"eval_steps": 1000,
"global_step": 41970,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.71,
"grad_norm": 0.855617344379425,
"learning_rate": 1.1913271384322135e-05,
"loss": 0.9117,
"step": 1000
},
{
"epoch": 0.71,
"eval_loss": 0.6742472052574158,
"eval_runtime": 27.0595,
"eval_samples_per_second": 1111.549,
"eval_steps_per_second": 5.58,
"step": 1000
},
{
"epoch": 1.43,
"grad_norm": 4.203719139099121,
"learning_rate": 2.382654276864427e-05,
"loss": 0.4114,
"step": 2000
},
{
"epoch": 1.43,
"eval_loss": 0.3266257345676422,
"eval_runtime": 26.9318,
"eval_samples_per_second": 1116.822,
"eval_steps_per_second": 5.607,
"step": 2000
},
{
"epoch": 2.14,
"grad_norm": 3.1638591289520264,
"learning_rate": 3.57398141529664e-05,
"loss": 0.2624,
"step": 3000
},
{
"epoch": 2.14,
"eval_loss": 0.24602766335010529,
"eval_runtime": 27.0604,
"eval_samples_per_second": 1111.512,
"eval_steps_per_second": 5.58,
"step": 3000
},
{
"epoch": 2.86,
"grad_norm": 1.7417826652526855,
"learning_rate": 4.765308553728854e-05,
"loss": 0.2002,
"step": 4000
},
{
"epoch": 2.86,
"eval_loss": 0.1770436018705368,
"eval_runtime": 26.8812,
"eval_samples_per_second": 1118.922,
"eval_steps_per_second": 5.617,
"step": 4000
},
{
"epoch": 3.57,
"grad_norm": 1.1299816370010376,
"learning_rate": 4.893707145315437e-05,
"loss": 0.1635,
"step": 5000
},
{
"epoch": 3.57,
"eval_loss": 0.14757415652275085,
"eval_runtime": 26.7857,
"eval_samples_per_second": 1122.914,
"eval_steps_per_second": 5.637,
"step": 5000
},
{
"epoch": 4.29,
"grad_norm": 1.210856556892395,
"learning_rate": 4.761337463267413e-05,
"loss": 0.1404,
"step": 6000
},
{
"epoch": 4.29,
"eval_loss": 0.12851941585540771,
"eval_runtime": 26.9893,
"eval_samples_per_second": 1114.44,
"eval_steps_per_second": 5.595,
"step": 6000
},
{
"epoch": 5.0,
"grad_norm": 2.0565412044525146,
"learning_rate": 4.62896778121939e-05,
"loss": 0.1263,
"step": 7000
},
{
"epoch": 5.0,
"eval_loss": 0.12228666245937347,
"eval_runtime": 26.7363,
"eval_samples_per_second": 1124.987,
"eval_steps_per_second": 5.648,
"step": 7000
},
{
"epoch": 5.72,
"grad_norm": 1.8667607307434082,
"learning_rate": 4.496598099171366e-05,
"loss": 0.1127,
"step": 8000
},
{
"epoch": 5.72,
"eval_loss": 0.11036147177219391,
"eval_runtime": 26.7509,
"eval_samples_per_second": 1124.375,
"eval_steps_per_second": 5.645,
"step": 8000
},
{
"epoch": 6.43,
"grad_norm": 0.7492337226867676,
"learning_rate": 4.364228417123342e-05,
"loss": 0.1059,
"step": 9000
},
{
"epoch": 6.43,
"eval_loss": 0.10317497700452805,
"eval_runtime": 27.0158,
"eval_samples_per_second": 1113.349,
"eval_steps_per_second": 5.589,
"step": 9000
},
{
"epoch": 7.15,
"grad_norm": 0.7611485123634338,
"learning_rate": 4.231858735075319e-05,
"loss": 0.0993,
"step": 10000
},
{
"epoch": 7.15,
"eval_loss": 0.10284282267093658,
"eval_runtime": 26.795,
"eval_samples_per_second": 1122.524,
"eval_steps_per_second": 5.635,
"step": 10000
},
{
"epoch": 7.86,
"grad_norm": 0.5870215892791748,
"learning_rate": 4.099489053027295e-05,
"loss": 0.0887,
"step": 11000
},
{
"epoch": 7.86,
"eval_loss": 0.09789762645959854,
"eval_runtime": 26.8453,
"eval_samples_per_second": 1120.419,
"eval_steps_per_second": 5.625,
"step": 11000
},
{
"epoch": 8.58,
"grad_norm": 0.48922085762023926,
"learning_rate": 3.9671193709792706e-05,
"loss": 0.0842,
"step": 12000
},
{
"epoch": 8.58,
"eval_loss": 0.09349656105041504,
"eval_runtime": 26.8273,
"eval_samples_per_second": 1121.172,
"eval_steps_per_second": 5.629,
"step": 12000
},
{
"epoch": 9.29,
"grad_norm": 0.4252859354019165,
"learning_rate": 3.8347496889312476e-05,
"loss": 0.0793,
"step": 13000
},
{
"epoch": 9.29,
"eval_loss": 0.09415590018033981,
"eval_runtime": 25.9362,
"eval_samples_per_second": 1159.693,
"eval_steps_per_second": 5.822,
"step": 13000
},
{
"epoch": 10.01,
"grad_norm": 0.44548505544662476,
"learning_rate": 3.702380006883224e-05,
"loss": 0.076,
"step": 14000
},
{
"epoch": 10.01,
"eval_loss": 0.08913980424404144,
"eval_runtime": 26.7379,
"eval_samples_per_second": 1124.919,
"eval_steps_per_second": 5.647,
"step": 14000
},
{
"epoch": 10.72,
"grad_norm": 0.2965373694896698,
"learning_rate": 3.5700103248352e-05,
"loss": 0.0714,
"step": 15000
},
{
"epoch": 10.72,
"eval_loss": 0.08456840366125107,
"eval_runtime": 26.787,
"eval_samples_per_second": 1122.857,
"eval_steps_per_second": 5.637,
"step": 15000
},
{
"epoch": 11.44,
"grad_norm": 0.3205694854259491,
"learning_rate": 3.437640642787176e-05,
"loss": 0.0677,
"step": 16000
},
{
"epoch": 11.44,
"eval_loss": 0.07863688468933105,
"eval_runtime": 26.8242,
"eval_samples_per_second": 1121.299,
"eval_steps_per_second": 5.629,
"step": 16000
},
{
"epoch": 12.15,
"grad_norm": 0.2736203670501709,
"learning_rate": 3.3052709607391525e-05,
"loss": 0.0636,
"step": 17000
},
{
"epoch": 12.15,
"eval_loss": 0.07664181292057037,
"eval_runtime": 26.7818,
"eval_samples_per_second": 1123.077,
"eval_steps_per_second": 5.638,
"step": 17000
},
{
"epoch": 12.87,
"grad_norm": 0.25644680857658386,
"learning_rate": 3.172901278691129e-05,
"loss": 0.0618,
"step": 18000
},
{
"epoch": 12.87,
"eval_loss": 0.07351888716220856,
"eval_runtime": 26.8445,
"eval_samples_per_second": 1120.453,
"eval_steps_per_second": 5.625,
"step": 18000
},
{
"epoch": 13.58,
"grad_norm": 0.2748676538467407,
"learning_rate": 3.0405315966431053e-05,
"loss": 0.0584,
"step": 19000
},
{
"epoch": 13.58,
"eval_loss": 0.07314006239175797,
"eval_runtime": 26.8333,
"eval_samples_per_second": 1120.921,
"eval_steps_per_second": 5.627,
"step": 19000
},
{
"epoch": 14.3,
"grad_norm": 0.30235132575035095,
"learning_rate": 2.9081619145950812e-05,
"loss": 0.057,
"step": 20000
},
{
"epoch": 14.3,
"eval_loss": 0.07568340748548508,
"eval_runtime": 27.0109,
"eval_samples_per_second": 1113.55,
"eval_steps_per_second": 5.59,
"step": 20000
},
{
"epoch": 15.01,
"grad_norm": 0.2508692145347595,
"learning_rate": 2.7757922325470574e-05,
"loss": 0.0558,
"step": 21000
},
{
"epoch": 15.01,
"eval_loss": 0.07675843685865402,
"eval_runtime": 26.9026,
"eval_samples_per_second": 1118.032,
"eval_steps_per_second": 5.613,
"step": 21000
},
{
"epoch": 15.73,
"grad_norm": 0.3341030478477478,
"learning_rate": 2.643422550499034e-05,
"loss": 0.0533,
"step": 22000
},
{
"epoch": 15.73,
"eval_loss": 0.07339715212583542,
"eval_runtime": 26.8727,
"eval_samples_per_second": 1119.278,
"eval_steps_per_second": 5.619,
"step": 22000
},
{
"epoch": 16.44,
"grad_norm": 0.30433303117752075,
"learning_rate": 2.51105286845101e-05,
"loss": 0.0516,
"step": 23000
},
{
"epoch": 16.44,
"eval_loss": 0.0694783553481102,
"eval_runtime": 26.8551,
"eval_samples_per_second": 1120.012,
"eval_steps_per_second": 5.623,
"step": 23000
},
{
"epoch": 17.16,
"grad_norm": 0.39424875378608704,
"learning_rate": 2.378683186402986e-05,
"loss": 0.049,
"step": 24000
},
{
"epoch": 17.16,
"eval_loss": 0.06750107556581497,
"eval_runtime": 26.9045,
"eval_samples_per_second": 1117.954,
"eval_steps_per_second": 5.612,
"step": 24000
},
{
"epoch": 17.87,
"grad_norm": 0.29526183009147644,
"learning_rate": 2.2463135043549627e-05,
"loss": 0.0478,
"step": 25000
},
{
"epoch": 17.87,
"eval_loss": 0.06841529905796051,
"eval_runtime": 26.9131,
"eval_samples_per_second": 1117.597,
"eval_steps_per_second": 5.611,
"step": 25000
},
{
"epoch": 18.58,
"grad_norm": 0.2802821099758148,
"learning_rate": 2.113943822306939e-05,
"loss": 0.0472,
"step": 26000
},
{
"epoch": 18.58,
"eval_loss": 0.0680340975522995,
"eval_runtime": 26.8442,
"eval_samples_per_second": 1120.467,
"eval_steps_per_second": 5.625,
"step": 26000
},
{
"epoch": 19.3,
"grad_norm": 0.198490172624588,
"learning_rate": 1.9815741402589152e-05,
"loss": 0.0445,
"step": 27000
},
{
"epoch": 19.3,
"eval_loss": 0.059882719069719315,
"eval_runtime": 26.9691,
"eval_samples_per_second": 1115.275,
"eval_steps_per_second": 5.599,
"step": 27000
},
{
"epoch": 20.01,
"grad_norm": 0.3383251130580902,
"learning_rate": 1.8492044582108914e-05,
"loss": 0.0435,
"step": 28000
},
{
"epoch": 20.01,
"eval_loss": 0.06356318295001984,
"eval_runtime": 26.8538,
"eval_samples_per_second": 1120.066,
"eval_steps_per_second": 5.623,
"step": 28000
},
{
"epoch": 20.73,
"grad_norm": 0.16571784019470215,
"learning_rate": 1.7168347761628677e-05,
"loss": 0.0419,
"step": 29000
},
{
"epoch": 20.73,
"eval_loss": 0.06056862324476242,
"eval_runtime": 27.0748,
"eval_samples_per_second": 1110.924,
"eval_steps_per_second": 5.577,
"step": 29000
},
{
"epoch": 21.44,
"grad_norm": 0.19518467783927917,
"learning_rate": 1.584465094114844e-05,
"loss": 0.0409,
"step": 30000
},
{
"epoch": 21.44,
"eval_loss": 0.06490638852119446,
"eval_runtime": 26.8481,
"eval_samples_per_second": 1120.301,
"eval_steps_per_second": 5.624,
"step": 30000
},
{
"epoch": 22.16,
"grad_norm": 0.15420591831207275,
"learning_rate": 1.4520954120668203e-05,
"loss": 0.0397,
"step": 31000
},
{
"epoch": 22.16,
"eval_loss": 0.05918469280004501,
"eval_runtime": 26.8143,
"eval_samples_per_second": 1121.713,
"eval_steps_per_second": 5.631,
"step": 31000
},
{
"epoch": 22.87,
"grad_norm": 0.26854997873306274,
"learning_rate": 1.3197257300187965e-05,
"loss": 0.0387,
"step": 32000
},
{
"epoch": 22.87,
"eval_loss": 0.06144551932811737,
"eval_runtime": 26.8852,
"eval_samples_per_second": 1118.757,
"eval_steps_per_second": 5.616,
"step": 32000
},
{
"epoch": 23.59,
"grad_norm": 0.17430314421653748,
"learning_rate": 1.1873560479707728e-05,
"loss": 0.0373,
"step": 33000
},
{
"epoch": 23.59,
"eval_loss": 0.06159648299217224,
"eval_runtime": 26.7887,
"eval_samples_per_second": 1122.785,
"eval_steps_per_second": 5.637,
"step": 33000
},
{
"epoch": 24.3,
"grad_norm": 0.14911049604415894,
"learning_rate": 1.054986365922749e-05,
"loss": 0.0369,
"step": 34000
},
{
"epoch": 24.3,
"eval_loss": 0.05931873992085457,
"eval_runtime": 26.8571,
"eval_samples_per_second": 1119.926,
"eval_steps_per_second": 5.622,
"step": 34000
},
{
"epoch": 25.02,
"grad_norm": 0.13620807230472565,
"learning_rate": 9.226166838747254e-06,
"loss": 0.0361,
"step": 35000
},
{
"epoch": 25.02,
"eval_loss": 0.05695568770170212,
"eval_runtime": 26.8966,
"eval_samples_per_second": 1118.283,
"eval_steps_per_second": 5.614,
"step": 35000
},
{
"epoch": 25.73,
"grad_norm": 0.13764438033103943,
"learning_rate": 7.902470018267017e-06,
"loss": 0.0349,
"step": 36000
},
{
"epoch": 25.73,
"eval_loss": 0.05707501247525215,
"eval_runtime": 26.986,
"eval_samples_per_second": 1114.578,
"eval_steps_per_second": 5.595,
"step": 36000
},
{
"epoch": 26.45,
"grad_norm": 0.2389635145664215,
"learning_rate": 6.578773197786779e-06,
"loss": 0.0343,
"step": 37000
},
{
"epoch": 26.45,
"eval_loss": 0.0577365942299366,
"eval_runtime": 26.9903,
"eval_samples_per_second": 1114.401,
"eval_steps_per_second": 5.595,
"step": 37000
},
{
"epoch": 27.16,
"grad_norm": 0.15828461945056915,
"learning_rate": 5.255076377306542e-06,
"loss": 0.034,
"step": 38000
},
{
"epoch": 27.16,
"eval_loss": 0.05767366662621498,
"eval_runtime": 27.1454,
"eval_samples_per_second": 1108.035,
"eval_steps_per_second": 5.563,
"step": 38000
},
{
"epoch": 27.88,
"grad_norm": 0.1059570387005806,
"learning_rate": 3.9313795568263045e-06,
"loss": 0.0332,
"step": 39000
},
{
"epoch": 27.88,
"eval_loss": 0.056225307285785675,
"eval_runtime": 26.9534,
"eval_samples_per_second": 1115.928,
"eval_steps_per_second": 5.602,
"step": 39000
},
{
"epoch": 28.59,
"grad_norm": 0.1975150853395462,
"learning_rate": 2.6076827363460673e-06,
"loss": 0.0329,
"step": 40000
},
{
"epoch": 28.59,
"eval_loss": 0.05555161088705063,
"eval_runtime": 27.1187,
"eval_samples_per_second": 1109.122,
"eval_steps_per_second": 5.568,
"step": 40000
},
{
"epoch": 29.31,
"grad_norm": 0.1037423312664032,
"learning_rate": 1.28398591586583e-06,
"loss": 0.0319,
"step": 41000
},
{
"epoch": 29.31,
"eval_loss": 0.05535305291414261,
"eval_runtime": 26.8353,
"eval_samples_per_second": 1120.838,
"eval_steps_per_second": 5.627,
"step": 41000
},
{
"epoch": 30.0,
"step": 41970,
"total_flos": 3.347206753110317e+16,
"train_loss": 0.09860551060169176,
"train_runtime": 13103.021,
"train_samples_per_second": 640.368,
"train_steps_per_second": 3.203
}
],
"logging_steps": 1000,
"max_steps": 41970,
"num_input_tokens_seen": 0,
"num_train_epochs": 30,
"save_steps": 1000,
"total_flos": 3.347206753110317e+16,
"train_batch_size": 200,
"trial_name": null,
"trial_params": null
}