{ "best_metric": 0.05535305291414261, "best_model_checkpoint": "/data/mohamed/checkpoints/ling_disc/deberta-v3-small_flan-t5-base_40/checkpoint-41000", "epoch": 30.0, "eval_steps": 1000, "global_step": 41970, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.71, "grad_norm": 0.855617344379425, "learning_rate": 1.1913271384322135e-05, "loss": 0.9117, "step": 1000 }, { "epoch": 0.71, "eval_loss": 0.6742472052574158, "eval_runtime": 27.0595, "eval_samples_per_second": 1111.549, "eval_steps_per_second": 5.58, "step": 1000 }, { "epoch": 1.43, "grad_norm": 4.203719139099121, "learning_rate": 2.382654276864427e-05, "loss": 0.4114, "step": 2000 }, { "epoch": 1.43, "eval_loss": 0.3266257345676422, "eval_runtime": 26.9318, "eval_samples_per_second": 1116.822, "eval_steps_per_second": 5.607, "step": 2000 }, { "epoch": 2.14, "grad_norm": 3.1638591289520264, "learning_rate": 3.57398141529664e-05, "loss": 0.2624, "step": 3000 }, { "epoch": 2.14, "eval_loss": 0.24602766335010529, "eval_runtime": 27.0604, "eval_samples_per_second": 1111.512, "eval_steps_per_second": 5.58, "step": 3000 }, { "epoch": 2.86, "grad_norm": 1.7417826652526855, "learning_rate": 4.765308553728854e-05, "loss": 0.2002, "step": 4000 }, { "epoch": 2.86, "eval_loss": 0.1770436018705368, "eval_runtime": 26.8812, "eval_samples_per_second": 1118.922, "eval_steps_per_second": 5.617, "step": 4000 }, { "epoch": 3.57, "grad_norm": 1.1299816370010376, "learning_rate": 4.893707145315437e-05, "loss": 0.1635, "step": 5000 }, { "epoch": 3.57, "eval_loss": 0.14757415652275085, "eval_runtime": 26.7857, "eval_samples_per_second": 1122.914, "eval_steps_per_second": 5.637, "step": 5000 }, { "epoch": 4.29, "grad_norm": 1.210856556892395, "learning_rate": 4.761337463267413e-05, "loss": 0.1404, "step": 6000 }, { "epoch": 4.29, "eval_loss": 0.12851941585540771, "eval_runtime": 26.9893, "eval_samples_per_second": 1114.44, "eval_steps_per_second": 5.595, "step": 6000 }, { "epoch": 5.0, "grad_norm": 2.0565412044525146, "learning_rate": 4.62896778121939e-05, "loss": 0.1263, "step": 7000 }, { "epoch": 5.0, "eval_loss": 0.12228666245937347, "eval_runtime": 26.7363, "eval_samples_per_second": 1124.987, "eval_steps_per_second": 5.648, "step": 7000 }, { "epoch": 5.72, "grad_norm": 1.8667607307434082, "learning_rate": 4.496598099171366e-05, "loss": 0.1127, "step": 8000 }, { "epoch": 5.72, "eval_loss": 0.11036147177219391, "eval_runtime": 26.7509, "eval_samples_per_second": 1124.375, "eval_steps_per_second": 5.645, "step": 8000 }, { "epoch": 6.43, "grad_norm": 0.7492337226867676, "learning_rate": 4.364228417123342e-05, "loss": 0.1059, "step": 9000 }, { "epoch": 6.43, "eval_loss": 0.10317497700452805, "eval_runtime": 27.0158, "eval_samples_per_second": 1113.349, "eval_steps_per_second": 5.589, "step": 9000 }, { "epoch": 7.15, "grad_norm": 0.7611485123634338, "learning_rate": 4.231858735075319e-05, "loss": 0.0993, "step": 10000 }, { "epoch": 7.15, "eval_loss": 0.10284282267093658, "eval_runtime": 26.795, "eval_samples_per_second": 1122.524, "eval_steps_per_second": 5.635, "step": 10000 }, { "epoch": 7.86, "grad_norm": 0.5870215892791748, "learning_rate": 4.099489053027295e-05, "loss": 0.0887, "step": 11000 }, { "epoch": 7.86, "eval_loss": 0.09789762645959854, "eval_runtime": 26.8453, "eval_samples_per_second": 1120.419, "eval_steps_per_second": 5.625, "step": 11000 }, { "epoch": 8.58, "grad_norm": 0.48922085762023926, "learning_rate": 3.9671193709792706e-05, "loss": 0.0842, "step": 12000 }, { "epoch": 8.58, "eval_loss": 0.09349656105041504, "eval_runtime": 26.8273, "eval_samples_per_second": 1121.172, "eval_steps_per_second": 5.629, "step": 12000 }, { "epoch": 9.29, "grad_norm": 0.4252859354019165, "learning_rate": 3.8347496889312476e-05, "loss": 0.0793, "step": 13000 }, { "epoch": 9.29, "eval_loss": 0.09415590018033981, "eval_runtime": 25.9362, "eval_samples_per_second": 1159.693, "eval_steps_per_second": 5.822, "step": 13000 }, { "epoch": 10.01, "grad_norm": 0.44548505544662476, "learning_rate": 3.702380006883224e-05, "loss": 0.076, "step": 14000 }, { "epoch": 10.01, "eval_loss": 0.08913980424404144, "eval_runtime": 26.7379, "eval_samples_per_second": 1124.919, "eval_steps_per_second": 5.647, "step": 14000 }, { "epoch": 10.72, "grad_norm": 0.2965373694896698, "learning_rate": 3.5700103248352e-05, "loss": 0.0714, "step": 15000 }, { "epoch": 10.72, "eval_loss": 0.08456840366125107, "eval_runtime": 26.787, "eval_samples_per_second": 1122.857, "eval_steps_per_second": 5.637, "step": 15000 }, { "epoch": 11.44, "grad_norm": 0.3205694854259491, "learning_rate": 3.437640642787176e-05, "loss": 0.0677, "step": 16000 }, { "epoch": 11.44, "eval_loss": 0.07863688468933105, "eval_runtime": 26.8242, "eval_samples_per_second": 1121.299, "eval_steps_per_second": 5.629, "step": 16000 }, { "epoch": 12.15, "grad_norm": 0.2736203670501709, "learning_rate": 3.3052709607391525e-05, "loss": 0.0636, "step": 17000 }, { "epoch": 12.15, "eval_loss": 0.07664181292057037, "eval_runtime": 26.7818, "eval_samples_per_second": 1123.077, "eval_steps_per_second": 5.638, "step": 17000 }, { "epoch": 12.87, "grad_norm": 0.25644680857658386, "learning_rate": 3.172901278691129e-05, "loss": 0.0618, "step": 18000 }, { "epoch": 12.87, "eval_loss": 0.07351888716220856, "eval_runtime": 26.8445, "eval_samples_per_second": 1120.453, "eval_steps_per_second": 5.625, "step": 18000 }, { "epoch": 13.58, "grad_norm": 0.2748676538467407, "learning_rate": 3.0405315966431053e-05, "loss": 0.0584, "step": 19000 }, { "epoch": 13.58, "eval_loss": 0.07314006239175797, "eval_runtime": 26.8333, "eval_samples_per_second": 1120.921, "eval_steps_per_second": 5.627, "step": 19000 }, { "epoch": 14.3, "grad_norm": 0.30235132575035095, "learning_rate": 2.9081619145950812e-05, "loss": 0.057, "step": 20000 }, { "epoch": 14.3, "eval_loss": 0.07568340748548508, "eval_runtime": 27.0109, "eval_samples_per_second": 1113.55, "eval_steps_per_second": 5.59, "step": 20000 }, { "epoch": 15.01, "grad_norm": 0.2508692145347595, "learning_rate": 2.7757922325470574e-05, "loss": 0.0558, "step": 21000 }, { "epoch": 15.01, "eval_loss": 0.07675843685865402, "eval_runtime": 26.9026, "eval_samples_per_second": 1118.032, "eval_steps_per_second": 5.613, "step": 21000 }, { "epoch": 15.73, "grad_norm": 0.3341030478477478, "learning_rate": 2.643422550499034e-05, "loss": 0.0533, "step": 22000 }, { "epoch": 15.73, "eval_loss": 0.07339715212583542, "eval_runtime": 26.8727, "eval_samples_per_second": 1119.278, "eval_steps_per_second": 5.619, "step": 22000 }, { "epoch": 16.44, "grad_norm": 0.30433303117752075, "learning_rate": 2.51105286845101e-05, "loss": 0.0516, "step": 23000 }, { "epoch": 16.44, "eval_loss": 0.0694783553481102, "eval_runtime": 26.8551, "eval_samples_per_second": 1120.012, "eval_steps_per_second": 5.623, "step": 23000 }, { "epoch": 17.16, "grad_norm": 0.39424875378608704, "learning_rate": 2.378683186402986e-05, "loss": 0.049, "step": 24000 }, { "epoch": 17.16, "eval_loss": 0.06750107556581497, "eval_runtime": 26.9045, "eval_samples_per_second": 1117.954, "eval_steps_per_second": 5.612, "step": 24000 }, { "epoch": 17.87, "grad_norm": 0.29526183009147644, "learning_rate": 2.2463135043549627e-05, "loss": 0.0478, "step": 25000 }, { "epoch": 17.87, "eval_loss": 0.06841529905796051, "eval_runtime": 26.9131, "eval_samples_per_second": 1117.597, "eval_steps_per_second": 5.611, "step": 25000 }, { "epoch": 18.58, "grad_norm": 0.2802821099758148, "learning_rate": 2.113943822306939e-05, "loss": 0.0472, "step": 26000 }, { "epoch": 18.58, "eval_loss": 0.0680340975522995, "eval_runtime": 26.8442, "eval_samples_per_second": 1120.467, "eval_steps_per_second": 5.625, "step": 26000 }, { "epoch": 19.3, "grad_norm": 0.198490172624588, "learning_rate": 1.9815741402589152e-05, "loss": 0.0445, "step": 27000 }, { "epoch": 19.3, "eval_loss": 0.059882719069719315, "eval_runtime": 26.9691, "eval_samples_per_second": 1115.275, "eval_steps_per_second": 5.599, "step": 27000 }, { "epoch": 20.01, "grad_norm": 0.3383251130580902, "learning_rate": 1.8492044582108914e-05, "loss": 0.0435, "step": 28000 }, { "epoch": 20.01, "eval_loss": 0.06356318295001984, "eval_runtime": 26.8538, "eval_samples_per_second": 1120.066, "eval_steps_per_second": 5.623, "step": 28000 }, { "epoch": 20.73, "grad_norm": 0.16571784019470215, "learning_rate": 1.7168347761628677e-05, "loss": 0.0419, "step": 29000 }, { "epoch": 20.73, "eval_loss": 0.06056862324476242, "eval_runtime": 27.0748, "eval_samples_per_second": 1110.924, "eval_steps_per_second": 5.577, "step": 29000 }, { "epoch": 21.44, "grad_norm": 0.19518467783927917, "learning_rate": 1.584465094114844e-05, "loss": 0.0409, "step": 30000 }, { "epoch": 21.44, "eval_loss": 0.06490638852119446, "eval_runtime": 26.8481, "eval_samples_per_second": 1120.301, "eval_steps_per_second": 5.624, "step": 30000 }, { "epoch": 22.16, "grad_norm": 0.15420591831207275, "learning_rate": 1.4520954120668203e-05, "loss": 0.0397, "step": 31000 }, { "epoch": 22.16, "eval_loss": 0.05918469280004501, "eval_runtime": 26.8143, "eval_samples_per_second": 1121.713, "eval_steps_per_second": 5.631, "step": 31000 }, { "epoch": 22.87, "grad_norm": 0.26854997873306274, "learning_rate": 1.3197257300187965e-05, "loss": 0.0387, "step": 32000 }, { "epoch": 22.87, "eval_loss": 0.06144551932811737, "eval_runtime": 26.8852, "eval_samples_per_second": 1118.757, "eval_steps_per_second": 5.616, "step": 32000 }, { "epoch": 23.59, "grad_norm": 0.17430314421653748, "learning_rate": 1.1873560479707728e-05, "loss": 0.0373, "step": 33000 }, { "epoch": 23.59, "eval_loss": 0.06159648299217224, "eval_runtime": 26.7887, "eval_samples_per_second": 1122.785, "eval_steps_per_second": 5.637, "step": 33000 }, { "epoch": 24.3, "grad_norm": 0.14911049604415894, "learning_rate": 1.054986365922749e-05, "loss": 0.0369, "step": 34000 }, { "epoch": 24.3, "eval_loss": 0.05931873992085457, "eval_runtime": 26.8571, "eval_samples_per_second": 1119.926, "eval_steps_per_second": 5.622, "step": 34000 }, { "epoch": 25.02, "grad_norm": 0.13620807230472565, "learning_rate": 9.226166838747254e-06, "loss": 0.0361, "step": 35000 }, { "epoch": 25.02, "eval_loss": 0.05695568770170212, "eval_runtime": 26.8966, "eval_samples_per_second": 1118.283, "eval_steps_per_second": 5.614, "step": 35000 }, { "epoch": 25.73, "grad_norm": 0.13764438033103943, "learning_rate": 7.902470018267017e-06, "loss": 0.0349, "step": 36000 }, { "epoch": 25.73, "eval_loss": 0.05707501247525215, "eval_runtime": 26.986, "eval_samples_per_second": 1114.578, "eval_steps_per_second": 5.595, "step": 36000 }, { "epoch": 26.45, "grad_norm": 0.2389635145664215, "learning_rate": 6.578773197786779e-06, "loss": 0.0343, "step": 37000 }, { "epoch": 26.45, "eval_loss": 0.0577365942299366, "eval_runtime": 26.9903, "eval_samples_per_second": 1114.401, "eval_steps_per_second": 5.595, "step": 37000 }, { "epoch": 27.16, "grad_norm": 0.15828461945056915, "learning_rate": 5.255076377306542e-06, "loss": 0.034, "step": 38000 }, { "epoch": 27.16, "eval_loss": 0.05767366662621498, "eval_runtime": 27.1454, "eval_samples_per_second": 1108.035, "eval_steps_per_second": 5.563, "step": 38000 }, { "epoch": 27.88, "grad_norm": 0.1059570387005806, "learning_rate": 3.9313795568263045e-06, "loss": 0.0332, "step": 39000 }, { "epoch": 27.88, "eval_loss": 0.056225307285785675, "eval_runtime": 26.9534, "eval_samples_per_second": 1115.928, "eval_steps_per_second": 5.602, "step": 39000 }, { "epoch": 28.59, "grad_norm": 0.1975150853395462, "learning_rate": 2.6076827363460673e-06, "loss": 0.0329, "step": 40000 }, { "epoch": 28.59, "eval_loss": 0.05555161088705063, "eval_runtime": 27.1187, "eval_samples_per_second": 1109.122, "eval_steps_per_second": 5.568, "step": 40000 }, { "epoch": 29.31, "grad_norm": 0.1037423312664032, "learning_rate": 1.28398591586583e-06, "loss": 0.0319, "step": 41000 }, { "epoch": 29.31, "eval_loss": 0.05535305291414261, "eval_runtime": 26.8353, "eval_samples_per_second": 1120.838, "eval_steps_per_second": 5.627, "step": 41000 }, { "epoch": 30.0, "step": 41970, "total_flos": 3.347206753110317e+16, "train_loss": 0.09860551060169176, "train_runtime": 13103.021, "train_samples_per_second": 640.368, "train_steps_per_second": 3.203 } ], "logging_steps": 1000, "max_steps": 41970, "num_input_tokens_seen": 0, "num_train_epochs": 30, "save_steps": 1000, "total_flos": 3.347206753110317e+16, "train_batch_size": 200, "trial_name": null, "trial_params": null }