{
  "best_metric": 0.05535305291414261,
  "best_model_checkpoint": "/data/mohamed/checkpoints/ling_disc/deberta-v3-small_flan-t5-base_40/checkpoint-41000",
  "epoch": 30.0,
  "eval_steps": 1000,
  "global_step": 41970,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.71,
      "grad_norm": 0.855617344379425,
      "learning_rate": 1.1913271384322135e-05,
      "loss": 0.9117,
      "step": 1000
    },
    {
      "epoch": 0.71,
      "eval_loss": 0.6742472052574158,
      "eval_runtime": 27.0595,
      "eval_samples_per_second": 1111.549,
      "eval_steps_per_second": 5.58,
      "step": 1000
    },
    {
      "epoch": 1.43,
      "grad_norm": 4.203719139099121,
      "learning_rate": 2.382654276864427e-05,
      "loss": 0.4114,
      "step": 2000
    },
    {
      "epoch": 1.43,
      "eval_loss": 0.3266257345676422,
      "eval_runtime": 26.9318,
      "eval_samples_per_second": 1116.822,
      "eval_steps_per_second": 5.607,
      "step": 2000
    },
    {
      "epoch": 2.14,
      "grad_norm": 3.1638591289520264,
      "learning_rate": 3.57398141529664e-05,
      "loss": 0.2624,
      "step": 3000
    },
    {
      "epoch": 2.14,
      "eval_loss": 0.24602766335010529,
      "eval_runtime": 27.0604,
      "eval_samples_per_second": 1111.512,
      "eval_steps_per_second": 5.58,
      "step": 3000
    },
    {
      "epoch": 2.86,
      "grad_norm": 1.7417826652526855,
      "learning_rate": 4.765308553728854e-05,
      "loss": 0.2002,
      "step": 4000
    },
    {
      "epoch": 2.86,
      "eval_loss": 0.1770436018705368,
      "eval_runtime": 26.8812,
      "eval_samples_per_second": 1118.922,
      "eval_steps_per_second": 5.617,
      "step": 4000
    },
    {
      "epoch": 3.57,
      "grad_norm": 1.1299816370010376,
      "learning_rate": 4.893707145315437e-05,
      "loss": 0.1635,
      "step": 5000
    },
    {
      "epoch": 3.57,
      "eval_loss": 0.14757415652275085,
      "eval_runtime": 26.7857,
      "eval_samples_per_second": 1122.914,
      "eval_steps_per_second": 5.637,
      "step": 5000
    },
    {
      "epoch": 4.29,
      "grad_norm": 1.210856556892395,
      "learning_rate": 4.761337463267413e-05,
      "loss": 0.1404,
      "step": 6000
    },
    {
      "epoch": 4.29,
      "eval_loss": 0.12851941585540771,
      "eval_runtime": 26.9893,
      "eval_samples_per_second": 1114.44,
      "eval_steps_per_second": 5.595,
      "step": 6000
    },
    {
      "epoch": 5.0,
      "grad_norm": 2.0565412044525146,
      "learning_rate": 4.62896778121939e-05,
      "loss": 0.1263,
      "step": 7000
    },
    {
      "epoch": 5.0,
      "eval_loss": 0.12228666245937347,
      "eval_runtime": 26.7363,
      "eval_samples_per_second": 1124.987,
      "eval_steps_per_second": 5.648,
      "step": 7000
    },
    {
      "epoch": 5.72,
      "grad_norm": 1.8667607307434082,
      "learning_rate": 4.496598099171366e-05,
      "loss": 0.1127,
      "step": 8000
    },
    {
      "epoch": 5.72,
      "eval_loss": 0.11036147177219391,
      "eval_runtime": 26.7509,
      "eval_samples_per_second": 1124.375,
      "eval_steps_per_second": 5.645,
      "step": 8000
    },
    {
      "epoch": 6.43,
      "grad_norm": 0.7492337226867676,
      "learning_rate": 4.364228417123342e-05,
      "loss": 0.1059,
      "step": 9000
    },
    {
      "epoch": 6.43,
      "eval_loss": 0.10317497700452805,
      "eval_runtime": 27.0158,
      "eval_samples_per_second": 1113.349,
      "eval_steps_per_second": 5.589,
      "step": 9000
    },
    {
      "epoch": 7.15,
      "grad_norm": 0.7611485123634338,
      "learning_rate": 4.231858735075319e-05,
      "loss": 0.0993,
      "step": 10000
    },
    {
      "epoch": 7.15,
      "eval_loss": 0.10284282267093658,
      "eval_runtime": 26.795,
      "eval_samples_per_second": 1122.524,
      "eval_steps_per_second": 5.635,
      "step": 10000
    },
    {
      "epoch": 7.86,
      "grad_norm": 0.5870215892791748,
      "learning_rate": 4.099489053027295e-05,
      "loss": 0.0887,
      "step": 11000
    },
    {
      "epoch": 7.86,
      "eval_loss": 0.09789762645959854,
      "eval_runtime": 26.8453,
      "eval_samples_per_second": 1120.419,
      "eval_steps_per_second": 5.625,
      "step": 11000
    },
    {
      "epoch": 8.58,
      "grad_norm": 0.48922085762023926,
      "learning_rate": 3.9671193709792706e-05,
      "loss": 0.0842,
      "step": 12000
    },
    {
      "epoch": 8.58,
      "eval_loss": 0.09349656105041504,
      "eval_runtime": 26.8273,
      "eval_samples_per_second": 1121.172,
      "eval_steps_per_second": 5.629,
      "step": 12000
    },
    {
      "epoch": 9.29,
      "grad_norm": 0.4252859354019165,
      "learning_rate": 3.8347496889312476e-05,
      "loss": 0.0793,
      "step": 13000
    },
    {
      "epoch": 9.29,
      "eval_loss": 0.09415590018033981,
      "eval_runtime": 25.9362,
      "eval_samples_per_second": 1159.693,
      "eval_steps_per_second": 5.822,
      "step": 13000
    },
    {
      "epoch": 10.01,
      "grad_norm": 0.44548505544662476,
      "learning_rate": 3.702380006883224e-05,
      "loss": 0.076,
      "step": 14000
    },
    {
      "epoch": 10.01,
      "eval_loss": 0.08913980424404144,
      "eval_runtime": 26.7379,
      "eval_samples_per_second": 1124.919,
      "eval_steps_per_second": 5.647,
      "step": 14000
    },
    {
      "epoch": 10.72,
      "grad_norm": 0.2965373694896698,
      "learning_rate": 3.5700103248352e-05,
      "loss": 0.0714,
      "step": 15000
    },
    {
      "epoch": 10.72,
      "eval_loss": 0.08456840366125107,
      "eval_runtime": 26.787,
      "eval_samples_per_second": 1122.857,
      "eval_steps_per_second": 5.637,
      "step": 15000
    },
    {
      "epoch": 11.44,
      "grad_norm": 0.3205694854259491,
      "learning_rate": 3.437640642787176e-05,
      "loss": 0.0677,
      "step": 16000
    },
    {
      "epoch": 11.44,
      "eval_loss": 0.07863688468933105,
      "eval_runtime": 26.8242,
      "eval_samples_per_second": 1121.299,
      "eval_steps_per_second": 5.629,
      "step": 16000
    },
    {
      "epoch": 12.15,
      "grad_norm": 0.2736203670501709,
      "learning_rate": 3.3052709607391525e-05,
      "loss": 0.0636,
      "step": 17000
    },
    {
      "epoch": 12.15,
      "eval_loss": 0.07664181292057037,
      "eval_runtime": 26.7818,
      "eval_samples_per_second": 1123.077,
      "eval_steps_per_second": 5.638,
      "step": 17000
    },
    {
      "epoch": 12.87,
      "grad_norm": 0.25644680857658386,
      "learning_rate": 3.172901278691129e-05,
      "loss": 0.0618,
      "step": 18000
    },
    {
      "epoch": 12.87,
      "eval_loss": 0.07351888716220856,
      "eval_runtime": 26.8445,
      "eval_samples_per_second": 1120.453,
      "eval_steps_per_second": 5.625,
      "step": 18000
    },
    {
      "epoch": 13.58,
      "grad_norm": 0.2748676538467407,
      "learning_rate": 3.0405315966431053e-05,
      "loss": 0.0584,
      "step": 19000
    },
    {
      "epoch": 13.58,
      "eval_loss": 0.07314006239175797,
      "eval_runtime": 26.8333,
      "eval_samples_per_second": 1120.921,
      "eval_steps_per_second": 5.627,
      "step": 19000
    },
    {
      "epoch": 14.3,
      "grad_norm": 0.30235132575035095,
      "learning_rate": 2.9081619145950812e-05,
      "loss": 0.057,
      "step": 20000
    },
    {
      "epoch": 14.3,
      "eval_loss": 0.07568340748548508,
      "eval_runtime": 27.0109,
      "eval_samples_per_second": 1113.55,
      "eval_steps_per_second": 5.59,
      "step": 20000
    },
    {
      "epoch": 15.01,
      "grad_norm": 0.2508692145347595,
      "learning_rate": 2.7757922325470574e-05,
      "loss": 0.0558,
      "step": 21000
    },
    {
      "epoch": 15.01,
      "eval_loss": 0.07675843685865402,
      "eval_runtime": 26.9026,
      "eval_samples_per_second": 1118.032,
      "eval_steps_per_second": 5.613,
      "step": 21000
    },
    {
      "epoch": 15.73,
      "grad_norm": 0.3341030478477478,
      "learning_rate": 2.643422550499034e-05,
      "loss": 0.0533,
      "step": 22000
    },
    {
      "epoch": 15.73,
      "eval_loss": 0.07339715212583542,
      "eval_runtime": 26.8727,
      "eval_samples_per_second": 1119.278,
      "eval_steps_per_second": 5.619,
      "step": 22000
    },
    {
      "epoch": 16.44,
      "grad_norm": 0.30433303117752075,
      "learning_rate": 2.51105286845101e-05,
      "loss": 0.0516,
      "step": 23000
    },
    {
      "epoch": 16.44,
      "eval_loss": 0.0694783553481102,
      "eval_runtime": 26.8551,
      "eval_samples_per_second": 1120.012,
      "eval_steps_per_second": 5.623,
      "step": 23000
    },
    {
      "epoch": 17.16,
      "grad_norm": 0.39424875378608704,
      "learning_rate": 2.378683186402986e-05,
      "loss": 0.049,
      "step": 24000
    },
    {
      "epoch": 17.16,
      "eval_loss": 0.06750107556581497,
      "eval_runtime": 26.9045,
      "eval_samples_per_second": 1117.954,
      "eval_steps_per_second": 5.612,
      "step": 24000
    },
    {
      "epoch": 17.87,
      "grad_norm": 0.29526183009147644,
      "learning_rate": 2.2463135043549627e-05,
      "loss": 0.0478,
      "step": 25000
    },
    {
      "epoch": 17.87,
      "eval_loss": 0.06841529905796051,
      "eval_runtime": 26.9131,
      "eval_samples_per_second": 1117.597,
      "eval_steps_per_second": 5.611,
      "step": 25000
    },
    {
      "epoch": 18.58,
      "grad_norm": 0.2802821099758148,
      "learning_rate": 2.113943822306939e-05,
      "loss": 0.0472,
      "step": 26000
    },
    {
      "epoch": 18.58,
      "eval_loss": 0.0680340975522995,
      "eval_runtime": 26.8442,
      "eval_samples_per_second": 1120.467,
      "eval_steps_per_second": 5.625,
      "step": 26000
    },
    {
      "epoch": 19.3,
      "grad_norm": 0.198490172624588,
      "learning_rate": 1.9815741402589152e-05,
      "loss": 0.0445,
      "step": 27000
    },
    {
      "epoch": 19.3,
      "eval_loss": 0.059882719069719315,
      "eval_runtime": 26.9691,
      "eval_samples_per_second": 1115.275,
      "eval_steps_per_second": 5.599,
      "step": 27000
    },
    {
      "epoch": 20.01,
      "grad_norm": 0.3383251130580902,
      "learning_rate": 1.8492044582108914e-05,
      "loss": 0.0435,
      "step": 28000
    },
    {
      "epoch": 20.01,
      "eval_loss": 0.06356318295001984,
      "eval_runtime": 26.8538,
      "eval_samples_per_second": 1120.066,
      "eval_steps_per_second": 5.623,
      "step": 28000
    },
    {
      "epoch": 20.73,
      "grad_norm": 0.16571784019470215,
      "learning_rate": 1.7168347761628677e-05,
      "loss": 0.0419,
      "step": 29000
    },
    {
      "epoch": 20.73,
      "eval_loss": 0.06056862324476242,
      "eval_runtime": 27.0748,
      "eval_samples_per_second": 1110.924,
      "eval_steps_per_second": 5.577,
      "step": 29000
    },
    {
      "epoch": 21.44,
      "grad_norm": 0.19518467783927917,
      "learning_rate": 1.584465094114844e-05,
      "loss": 0.0409,
      "step": 30000
    },
    {
      "epoch": 21.44,
      "eval_loss": 0.06490638852119446,
      "eval_runtime": 26.8481,
      "eval_samples_per_second": 1120.301,
      "eval_steps_per_second": 5.624,
      "step": 30000
    },
    {
      "epoch": 22.16,
      "grad_norm": 0.15420591831207275,
      "learning_rate": 1.4520954120668203e-05,
      "loss": 0.0397,
      "step": 31000
    },
    {
      "epoch": 22.16,
      "eval_loss": 0.05918469280004501,
      "eval_runtime": 26.8143,
      "eval_samples_per_second": 1121.713,
      "eval_steps_per_second": 5.631,
      "step": 31000
    },
    {
      "epoch": 22.87,
      "grad_norm": 0.26854997873306274,
      "learning_rate": 1.3197257300187965e-05,
      "loss": 0.0387,
      "step": 32000
    },
    {
      "epoch": 22.87,
      "eval_loss": 0.06144551932811737,
      "eval_runtime": 26.8852,
      "eval_samples_per_second": 1118.757,
      "eval_steps_per_second": 5.616,
      "step": 32000
    },
    {
      "epoch": 23.59,
      "grad_norm": 0.17430314421653748,
      "learning_rate": 1.1873560479707728e-05,
      "loss": 0.0373,
      "step": 33000
    },
    {
      "epoch": 23.59,
      "eval_loss": 0.06159648299217224,
      "eval_runtime": 26.7887,
      "eval_samples_per_second": 1122.785,
      "eval_steps_per_second": 5.637,
      "step": 33000
    },
    {
      "epoch": 24.3,
      "grad_norm": 0.14911049604415894,
      "learning_rate": 1.054986365922749e-05,
      "loss": 0.0369,
      "step": 34000
    },
    {
      "epoch": 24.3,
      "eval_loss": 0.05931873992085457,
      "eval_runtime": 26.8571,
      "eval_samples_per_second": 1119.926,
      "eval_steps_per_second": 5.622,
      "step": 34000
    },
    {
      "epoch": 25.02,
      "grad_norm": 0.13620807230472565,
      "learning_rate": 9.226166838747254e-06,
      "loss": 0.0361,
      "step": 35000
    },
    {
      "epoch": 25.02,
      "eval_loss": 0.05695568770170212,
      "eval_runtime": 26.8966,
      "eval_samples_per_second": 1118.283,
      "eval_steps_per_second": 5.614,
      "step": 35000
    },
    {
      "epoch": 25.73,
      "grad_norm": 0.13764438033103943,
      "learning_rate": 7.902470018267017e-06,
      "loss": 0.0349,
      "step": 36000
    },
    {
      "epoch": 25.73,
      "eval_loss": 0.05707501247525215,
      "eval_runtime": 26.986,
      "eval_samples_per_second": 1114.578,
      "eval_steps_per_second": 5.595,
      "step": 36000
    },
    {
      "epoch": 26.45,
      "grad_norm": 0.2389635145664215,
      "learning_rate": 6.578773197786779e-06,
      "loss": 0.0343,
      "step": 37000
    },
    {
      "epoch": 26.45,
      "eval_loss": 0.0577365942299366,
      "eval_runtime": 26.9903,
      "eval_samples_per_second": 1114.401,
      "eval_steps_per_second": 5.595,
      "step": 37000
    },
    {
      "epoch": 27.16,
      "grad_norm": 0.15828461945056915,
      "learning_rate": 5.255076377306542e-06,
      "loss": 0.034,
      "step": 38000
    },
    {
      "epoch": 27.16,
      "eval_loss": 0.05767366662621498,
      "eval_runtime": 27.1454,
      "eval_samples_per_second": 1108.035,
      "eval_steps_per_second": 5.563,
      "step": 38000
    },
    {
      "epoch": 27.88,
      "grad_norm": 0.1059570387005806,
      "learning_rate": 3.9313795568263045e-06,
      "loss": 0.0332,
      "step": 39000
    },
    {
      "epoch": 27.88,
      "eval_loss": 0.056225307285785675,
      "eval_runtime": 26.9534,
      "eval_samples_per_second": 1115.928,
      "eval_steps_per_second": 5.602,
      "step": 39000
    },
    {
      "epoch": 28.59,
      "grad_norm": 0.1975150853395462,
      "learning_rate": 2.6076827363460673e-06,
      "loss": 0.0329,
      "step": 40000
    },
    {
      "epoch": 28.59,
      "eval_loss": 0.05555161088705063,
      "eval_runtime": 27.1187,
      "eval_samples_per_second": 1109.122,
      "eval_steps_per_second": 5.568,
      "step": 40000
    },
    {
      "epoch": 29.31,
      "grad_norm": 0.1037423312664032,
      "learning_rate": 1.28398591586583e-06,
      "loss": 0.0319,
      "step": 41000
    },
    {
      "epoch": 29.31,
      "eval_loss": 0.05535305291414261,
      "eval_runtime": 26.8353,
      "eval_samples_per_second": 1120.838,
      "eval_steps_per_second": 5.627,
      "step": 41000
    },
    {
      "epoch": 30.0,
      "step": 41970,
      "total_flos": 3.347206753110317e+16,
      "train_loss": 0.09860551060169176,
      "train_runtime": 13103.021,
      "train_samples_per_second": 640.368,
      "train_steps_per_second": 3.203
    }
  ],
  "logging_steps": 1000,
  "max_steps": 41970,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 30,
  "save_steps": 1000,
  "total_flos": 3.347206753110317e+16,
  "train_batch_size": 200,
  "trial_name": null,
  "trial_params": null
}