|
{ |
|
"best_metric": 0.6518497467041016, |
|
"best_model_checkpoint": "../../experiments_checkpoints/MAdAiLab/google/flan_t5_small_amazon/checkpoint-1100", |
|
"epoch": 3.0, |
|
"eval_steps": 50, |
|
"global_step": 1140, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 3.3804776668548584, |
|
"learning_rate": 0.0004956140350877193, |
|
"loss": 3.1062, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 2.343827962875366, |
|
"learning_rate": 0.0004912280701754386, |
|
"loss": 2.9891, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 1.987942099571228, |
|
"learning_rate": 0.0004868421052631579, |
|
"loss": 2.8938, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 2.4765126705169678, |
|
"learning_rate": 0.0004824561403508772, |
|
"loss": 2.4308, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 3.325758934020996, |
|
"learning_rate": 0.00047807017543859647, |
|
"loss": 1.9282, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"eval_accuracy": 0.5737812911725956, |
|
"eval_f1_macro": 0.44362854909805843, |
|
"eval_f1_micro": 0.5737812911725956, |
|
"eval_loss": 1.590394139289856, |
|
"eval_runtime": 1.5287, |
|
"eval_samples_per_second": 992.985, |
|
"eval_steps_per_second": 31.399, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 3.0366220474243164, |
|
"learning_rate": 0.00047368421052631577, |
|
"loss": 1.6028, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 3.9418632984161377, |
|
"learning_rate": 0.0004692982456140351, |
|
"loss": 1.3737, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 3.228276014328003, |
|
"learning_rate": 0.00046491228070175437, |
|
"loss": 1.2147, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 3.007902145385742, |
|
"learning_rate": 0.0004605263157894737, |
|
"loss": 1.2373, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 3.132214307785034, |
|
"learning_rate": 0.000456140350877193, |
|
"loss": 1.1323, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"eval_accuracy": 0.663372859025033, |
|
"eval_f1_macro": 0.5640942564973862, |
|
"eval_f1_micro": 0.663372859025033, |
|
"eval_loss": 1.1076021194458008, |
|
"eval_runtime": 1.5766, |
|
"eval_samples_per_second": 962.833, |
|
"eval_steps_per_second": 30.445, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 4.991384506225586, |
|
"learning_rate": 0.00045175438596491233, |
|
"loss": 1.21, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 4.180238723754883, |
|
"learning_rate": 0.0004473684210526316, |
|
"loss": 1.0477, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 3.497032642364502, |
|
"learning_rate": 0.0004429824561403509, |
|
"loss": 1.1554, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 3.396751642227173, |
|
"learning_rate": 0.0004385964912280702, |
|
"loss": 0.9906, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 2.457590341567993, |
|
"learning_rate": 0.0004342105263157895, |
|
"loss": 0.9976, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"eval_accuracy": 0.7206851119894598, |
|
"eval_f1_macro": 0.646835154821118, |
|
"eval_f1_micro": 0.7206851119894598, |
|
"eval_loss": 0.9464844465255737, |
|
"eval_runtime": 1.5277, |
|
"eval_samples_per_second": 993.644, |
|
"eval_steps_per_second": 31.42, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 4.1680402755737305, |
|
"learning_rate": 0.0004298245614035088, |
|
"loss": 0.9004, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 3.452507734298706, |
|
"learning_rate": 0.0004254385964912281, |
|
"loss": 1.0064, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 2.8887767791748047, |
|
"learning_rate": 0.00042105263157894734, |
|
"loss": 0.9662, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 5.135780334472656, |
|
"learning_rate": 0.0004166666666666667, |
|
"loss": 1.0267, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 3.5169897079467773, |
|
"learning_rate": 0.000412280701754386, |
|
"loss": 0.928, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"eval_accuracy": 0.733201581027668, |
|
"eval_f1_macro": 0.6859595929599217, |
|
"eval_f1_micro": 0.733201581027668, |
|
"eval_loss": 0.8840087652206421, |
|
"eval_runtime": 1.5811, |
|
"eval_samples_per_second": 960.099, |
|
"eval_steps_per_second": 30.359, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 3.8375213146209717, |
|
"learning_rate": 0.00040789473684210524, |
|
"loss": 0.8223, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 3.949693441390991, |
|
"learning_rate": 0.00040350877192982455, |
|
"loss": 0.9726, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 3.1758928298950195, |
|
"learning_rate": 0.0003991228070175439, |
|
"loss": 0.8881, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 4.932584285736084, |
|
"learning_rate": 0.00039473684210526315, |
|
"loss": 0.9233, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 3.125002861022949, |
|
"learning_rate": 0.00039035087719298245, |
|
"loss": 0.974, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"eval_accuracy": 0.7523056653491436, |
|
"eval_f1_macro": 0.7025921892520358, |
|
"eval_f1_micro": 0.7523056653491436, |
|
"eval_loss": 0.8179047107696533, |
|
"eval_runtime": 1.5845, |
|
"eval_samples_per_second": 958.002, |
|
"eval_steps_per_second": 30.293, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 3.811002016067505, |
|
"learning_rate": 0.00038596491228070175, |
|
"loss": 0.8885, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 2.872647762298584, |
|
"learning_rate": 0.00038157894736842105, |
|
"loss": 0.9161, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 3.938746452331543, |
|
"learning_rate": 0.00037719298245614036, |
|
"loss": 0.7768, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 3.9342243671417236, |
|
"learning_rate": 0.00037280701754385966, |
|
"loss": 0.8114, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 3.424884557723999, |
|
"learning_rate": 0.00036842105263157896, |
|
"loss": 0.8206, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"eval_accuracy": 0.7674571805006588, |
|
"eval_f1_macro": 0.7181951416155129, |
|
"eval_f1_micro": 0.7674571805006588, |
|
"eval_loss": 0.7982419729232788, |
|
"eval_runtime": 1.5868, |
|
"eval_samples_per_second": 956.662, |
|
"eval_steps_per_second": 30.25, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 4.263403415679932, |
|
"learning_rate": 0.00036403508771929826, |
|
"loss": 0.7327, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 3.17680025100708, |
|
"learning_rate": 0.00035964912280701756, |
|
"loss": 0.8797, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 3.7559242248535156, |
|
"learning_rate": 0.00035526315789473687, |
|
"loss": 0.9148, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 3.6917154788970947, |
|
"learning_rate": 0.0003508771929824561, |
|
"loss": 0.7763, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 3.3386855125427246, |
|
"learning_rate": 0.00034649122807017547, |
|
"loss": 0.8863, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"eval_accuracy": 0.7773386034255599, |
|
"eval_f1_macro": 0.7301140464284491, |
|
"eval_f1_micro": 0.7773386034255599, |
|
"eval_loss": 0.7444999814033508, |
|
"eval_runtime": 1.593, |
|
"eval_samples_per_second": 952.941, |
|
"eval_steps_per_second": 30.133, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 2.625443935394287, |
|
"learning_rate": 0.00034210526315789477, |
|
"loss": 0.7353, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 4.095985412597656, |
|
"learning_rate": 0.000337719298245614, |
|
"loss": 0.9274, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 2.8155393600463867, |
|
"learning_rate": 0.0003333333333333333, |
|
"loss": 0.7887, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 3.3496756553649902, |
|
"learning_rate": 0.0003289473684210527, |
|
"loss": 0.682, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 3.9287924766540527, |
|
"learning_rate": 0.0003245614035087719, |
|
"loss": 0.713, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"eval_accuracy": 0.7740447957839263, |
|
"eval_f1_macro": 0.7390551518744245, |
|
"eval_f1_micro": 0.7740447957839263, |
|
"eval_loss": 0.7427995204925537, |
|
"eval_runtime": 1.5483, |
|
"eval_samples_per_second": 980.422, |
|
"eval_steps_per_second": 31.001, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 3.985713243484497, |
|
"learning_rate": 0.00032017543859649123, |
|
"loss": 0.597, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"grad_norm": 3.647797107696533, |
|
"learning_rate": 0.00031578947368421053, |
|
"loss": 0.6752, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"grad_norm": 3.2907509803771973, |
|
"learning_rate": 0.00031140350877192983, |
|
"loss": 0.6903, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 4.062851905822754, |
|
"learning_rate": 0.00030701754385964913, |
|
"loss": 0.5708, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 3.1373462677001953, |
|
"learning_rate": 0.00030263157894736844, |
|
"loss": 0.6544, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"eval_accuracy": 0.7852437417654808, |
|
"eval_f1_macro": 0.7379347673603733, |
|
"eval_f1_micro": 0.7852437417654808, |
|
"eval_loss": 0.7234027981758118, |
|
"eval_runtime": 1.5405, |
|
"eval_samples_per_second": 985.365, |
|
"eval_steps_per_second": 31.158, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.21, |
|
"grad_norm": 3.7265396118164062, |
|
"learning_rate": 0.0002982456140350877, |
|
"loss": 0.7901, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 2.8776745796203613, |
|
"learning_rate": 0.00029385964912280704, |
|
"loss": 0.5103, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"grad_norm": 4.379913806915283, |
|
"learning_rate": 0.00028947368421052634, |
|
"loss": 0.7062, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"grad_norm": 3.8924524784088135, |
|
"learning_rate": 0.00028508771929824564, |
|
"loss": 0.657, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 4.120116233825684, |
|
"learning_rate": 0.0002807017543859649, |
|
"loss": 0.6034, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"eval_accuracy": 0.7924901185770751, |
|
"eval_f1_macro": 0.7648022166570957, |
|
"eval_f1_micro": 0.7924901185770751, |
|
"eval_loss": 0.7140358090400696, |
|
"eval_runtime": 1.5497, |
|
"eval_samples_per_second": 979.546, |
|
"eval_steps_per_second": 30.974, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 3.7763350009918213, |
|
"learning_rate": 0.00027631578947368425, |
|
"loss": 0.5917, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.37, |
|
"grad_norm": 3.2136292457580566, |
|
"learning_rate": 0.00027192982456140355, |
|
"loss": 0.7325, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"grad_norm": 5.103039264678955, |
|
"learning_rate": 0.0002675438596491228, |
|
"loss": 0.7051, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"grad_norm": 2.6361660957336426, |
|
"learning_rate": 0.0002631578947368421, |
|
"loss": 0.5079, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"grad_norm": 3.617443561553955, |
|
"learning_rate": 0.00025877192982456146, |
|
"loss": 0.588, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"eval_accuracy": 0.7931488801054019, |
|
"eval_f1_macro": 0.7585136991271696, |
|
"eval_f1_micro": 0.7931488801054019, |
|
"eval_loss": 0.7061514258384705, |
|
"eval_runtime": 1.5944, |
|
"eval_samples_per_second": 952.063, |
|
"eval_steps_per_second": 30.105, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"grad_norm": 3.122095823287964, |
|
"learning_rate": 0.0002543859649122807, |
|
"loss": 0.6284, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 2.7279186248779297, |
|
"learning_rate": 0.00025, |
|
"loss": 0.6255, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"grad_norm": 3.1004347801208496, |
|
"learning_rate": 0.0002456140350877193, |
|
"loss": 0.5774, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 3.556870937347412, |
|
"learning_rate": 0.0002412280701754386, |
|
"loss": 0.5078, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"grad_norm": 4.2002854347229, |
|
"learning_rate": 0.00023684210526315788, |
|
"loss": 0.6035, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"eval_accuracy": 0.7924901185770751, |
|
"eval_f1_macro": 0.7479706258386674, |
|
"eval_f1_micro": 0.7924901185770751, |
|
"eval_loss": 0.711154043674469, |
|
"eval_runtime": 1.6011, |
|
"eval_samples_per_second": 948.117, |
|
"eval_steps_per_second": 29.98, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"grad_norm": 3.03747296333313, |
|
"learning_rate": 0.00023245614035087719, |
|
"loss": 0.6233, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.63, |
|
"grad_norm": 2.1721749305725098, |
|
"learning_rate": 0.0002280701754385965, |
|
"loss": 0.6446, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"grad_norm": 2.81701922416687, |
|
"learning_rate": 0.0002236842105263158, |
|
"loss": 0.6268, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"grad_norm": 4.8511128425598145, |
|
"learning_rate": 0.0002192982456140351, |
|
"loss": 0.6635, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"grad_norm": 3.052924871444702, |
|
"learning_rate": 0.0002149122807017544, |
|
"loss": 0.6616, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"eval_accuracy": 0.7938076416337286, |
|
"eval_f1_macro": 0.7577530795114652, |
|
"eval_f1_micro": 0.7938076416337286, |
|
"eval_loss": 0.6783204674720764, |
|
"eval_runtime": 1.6022, |
|
"eval_samples_per_second": 947.452, |
|
"eval_steps_per_second": 29.959, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"grad_norm": 3.520653247833252, |
|
"learning_rate": 0.00021052631578947367, |
|
"loss": 0.5442, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 2.6852402687072754, |
|
"learning_rate": 0.000206140350877193, |
|
"loss": 0.5909, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"grad_norm": 2.394245147705078, |
|
"learning_rate": 0.00020175438596491227, |
|
"loss": 0.6419, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 3.5546963214874268, |
|
"learning_rate": 0.00019736842105263157, |
|
"loss": 0.6128, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"grad_norm": 2.6713662147521973, |
|
"learning_rate": 0.00019298245614035088, |
|
"loss": 0.6334, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"eval_accuracy": 0.8003952569169961, |
|
"eval_f1_macro": 0.7850643712314976, |
|
"eval_f1_micro": 0.8003952569169961, |
|
"eval_loss": 0.6815611720085144, |
|
"eval_runtime": 1.601, |
|
"eval_samples_per_second": 948.157, |
|
"eval_steps_per_second": 29.981, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"grad_norm": 2.6882362365722656, |
|
"learning_rate": 0.00018859649122807018, |
|
"loss": 0.5586, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"grad_norm": 3.613619565963745, |
|
"learning_rate": 0.00018421052631578948, |
|
"loss": 0.5914, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 2.6678318977355957, |
|
"learning_rate": 0.00017982456140350878, |
|
"loss": 0.6132, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"grad_norm": 2.920417070388794, |
|
"learning_rate": 0.00017543859649122806, |
|
"loss": 0.6667, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.97, |
|
"grad_norm": 4.305054664611816, |
|
"learning_rate": 0.00017105263157894739, |
|
"loss": 0.5872, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.97, |
|
"eval_accuracy": 0.8036890645586298, |
|
"eval_f1_macro": 0.7792442711200399, |
|
"eval_f1_micro": 0.8036890645586298, |
|
"eval_loss": 0.6531891822814941, |
|
"eval_runtime": 1.5978, |
|
"eval_samples_per_second": 950.034, |
|
"eval_steps_per_second": 30.041, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 2.5539097785949707, |
|
"learning_rate": 0.00016666666666666666, |
|
"loss": 0.5254, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 2.03, |
|
"grad_norm": 3.1510868072509766, |
|
"learning_rate": 0.00016228070175438596, |
|
"loss": 0.4091, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 2.05, |
|
"grad_norm": 3.702329635620117, |
|
"learning_rate": 0.00015789473684210527, |
|
"loss": 0.4975, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 2.08, |
|
"grad_norm": 5.3907060623168945, |
|
"learning_rate": 0.00015350877192982457, |
|
"loss": 0.437, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 2.11, |
|
"grad_norm": 3.821787118911743, |
|
"learning_rate": 0.00014912280701754384, |
|
"loss": 0.4134, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 2.11, |
|
"eval_accuracy": 0.8069828722002635, |
|
"eval_f1_macro": 0.7857989316065254, |
|
"eval_f1_micro": 0.8069828722002635, |
|
"eval_loss": 0.6601086258888245, |
|
"eval_runtime": 1.5931, |
|
"eval_samples_per_second": 952.843, |
|
"eval_steps_per_second": 30.129, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 2.13, |
|
"grad_norm": 2.2001376152038574, |
|
"learning_rate": 0.00014473684210526317, |
|
"loss": 0.3875, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 2.16, |
|
"grad_norm": 2.7944624423980713, |
|
"learning_rate": 0.00014035087719298245, |
|
"loss": 0.4485, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 2.18, |
|
"grad_norm": 1.9566526412963867, |
|
"learning_rate": 0.00013596491228070177, |
|
"loss": 0.396, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 2.21, |
|
"grad_norm": 2.5819900035858154, |
|
"learning_rate": 0.00013157894736842105, |
|
"loss": 0.4155, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 2.24, |
|
"grad_norm": 3.8049263954162598, |
|
"learning_rate": 0.00012719298245614035, |
|
"loss": 0.518, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 2.24, |
|
"eval_accuracy": 0.8069828722002635, |
|
"eval_f1_macro": 0.7857680865770484, |
|
"eval_f1_micro": 0.8069828722002635, |
|
"eval_loss": 0.6772085428237915, |
|
"eval_runtime": 1.5894, |
|
"eval_samples_per_second": 955.055, |
|
"eval_steps_per_second": 30.199, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 2.26, |
|
"grad_norm": 4.169273853302002, |
|
"learning_rate": 0.00012280701754385965, |
|
"loss": 0.3547, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 2.29, |
|
"grad_norm": 3.642315626144409, |
|
"learning_rate": 0.00011842105263157894, |
|
"loss": 0.4423, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 2.32, |
|
"grad_norm": 3.0137033462524414, |
|
"learning_rate": 0.00011403508771929824, |
|
"loss": 0.4462, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 2.34, |
|
"grad_norm": 1.681169867515564, |
|
"learning_rate": 0.00010964912280701755, |
|
"loss": 0.3536, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 2.37, |
|
"grad_norm": 3.0964701175689697, |
|
"learning_rate": 0.00010526315789473683, |
|
"loss": 0.3891, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 2.37, |
|
"eval_accuracy": 0.8089591567852438, |
|
"eval_f1_macro": 0.7866273295700273, |
|
"eval_f1_micro": 0.8089591567852438, |
|
"eval_loss": 0.6751896142959595, |
|
"eval_runtime": 1.5446, |
|
"eval_samples_per_second": 982.806, |
|
"eval_steps_per_second": 31.077, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 2.39, |
|
"grad_norm": 2.752586841583252, |
|
"learning_rate": 0.00010087719298245614, |
|
"loss": 0.4731, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 2.42, |
|
"grad_norm": 2.796949863433838, |
|
"learning_rate": 9.649122807017544e-05, |
|
"loss": 0.5025, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 2.45, |
|
"grad_norm": 3.236764430999756, |
|
"learning_rate": 9.210526315789474e-05, |
|
"loss": 0.4629, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 2.47, |
|
"grad_norm": 3.6876580715179443, |
|
"learning_rate": 8.771929824561403e-05, |
|
"loss": 0.4943, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 4.9102911949157715, |
|
"learning_rate": 8.333333333333333e-05, |
|
"loss": 0.3389, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"eval_accuracy": 0.8122529644268774, |
|
"eval_f1_macro": 0.7914121636262914, |
|
"eval_f1_micro": 0.8122529644268774, |
|
"eval_loss": 0.6639086008071899, |
|
"eval_runtime": 1.5986, |
|
"eval_samples_per_second": 949.6, |
|
"eval_steps_per_second": 30.027, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 2.53, |
|
"grad_norm": 1.5189881324768066, |
|
"learning_rate": 7.894736842105263e-05, |
|
"loss": 0.4331, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 2.55, |
|
"grad_norm": 3.3919639587402344, |
|
"learning_rate": 7.456140350877192e-05, |
|
"loss": 0.5003, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 2.58, |
|
"grad_norm": 4.042901992797852, |
|
"learning_rate": 7.017543859649122e-05, |
|
"loss": 0.4475, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 2.61, |
|
"grad_norm": 3.0205342769622803, |
|
"learning_rate": 6.578947368421052e-05, |
|
"loss": 0.4484, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 2.63, |
|
"grad_norm": 3.377220630645752, |
|
"learning_rate": 6.140350877192983e-05, |
|
"loss": 0.4166, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 2.63, |
|
"eval_accuracy": 0.8168642951251647, |
|
"eval_f1_macro": 0.8010465581589675, |
|
"eval_f1_micro": 0.8168642951251647, |
|
"eval_loss": 0.6590448021888733, |
|
"eval_runtime": 1.5979, |
|
"eval_samples_per_second": 950.026, |
|
"eval_steps_per_second": 30.04, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 2.66, |
|
"grad_norm": 3.584700584411621, |
|
"learning_rate": 5.701754385964912e-05, |
|
"loss": 0.3845, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 2.68, |
|
"grad_norm": 2.4608445167541504, |
|
"learning_rate": 5.263157894736842e-05, |
|
"loss": 0.4711, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 2.71, |
|
"grad_norm": 3.6688997745513916, |
|
"learning_rate": 4.824561403508772e-05, |
|
"loss": 0.4564, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 2.74, |
|
"grad_norm": 2.6104586124420166, |
|
"learning_rate": 4.3859649122807014e-05, |
|
"loss": 0.4098, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 2.76, |
|
"grad_norm": 4.012241363525391, |
|
"learning_rate": 3.9473684210526316e-05, |
|
"loss": 0.483, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 2.76, |
|
"eval_accuracy": 0.8148880105401844, |
|
"eval_f1_macro": 0.7936908234810874, |
|
"eval_f1_micro": 0.8148880105401844, |
|
"eval_loss": 0.6629713177680969, |
|
"eval_runtime": 1.5955, |
|
"eval_samples_per_second": 951.432, |
|
"eval_steps_per_second": 30.085, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 2.79, |
|
"grad_norm": 3.5108678340911865, |
|
"learning_rate": 3.508771929824561e-05, |
|
"loss": 0.4483, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 2.82, |
|
"grad_norm": 3.1787357330322266, |
|
"learning_rate": 3.0701754385964913e-05, |
|
"loss": 0.5206, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 2.84, |
|
"grad_norm": 3.5252914428710938, |
|
"learning_rate": 2.631578947368421e-05, |
|
"loss": 0.5051, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 2.87, |
|
"grad_norm": 2.644866704940796, |
|
"learning_rate": 2.1929824561403507e-05, |
|
"loss": 0.4477, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 2.89, |
|
"grad_norm": 3.4592483043670654, |
|
"learning_rate": 1.7543859649122806e-05, |
|
"loss": 0.4582, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 2.89, |
|
"eval_accuracy": 0.8096179183135704, |
|
"eval_f1_macro": 0.788232456216749, |
|
"eval_f1_micro": 0.8096179183135704, |
|
"eval_loss": 0.6518497467041016, |
|
"eval_runtime": 1.537, |
|
"eval_samples_per_second": 987.659, |
|
"eval_steps_per_second": 31.23, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 2.92, |
|
"grad_norm": 1.930580973625183, |
|
"learning_rate": 1.3157894736842104e-05, |
|
"loss": 0.4766, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 2.95, |
|
"grad_norm": 3.2953085899353027, |
|
"learning_rate": 8.771929824561403e-06, |
|
"loss": 0.4975, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 2.97, |
|
"grad_norm": 3.153869390487671, |
|
"learning_rate": 4.3859649122807014e-06, |
|
"loss": 0.3219, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 2.2529447078704834, |
|
"learning_rate": 0.0, |
|
"loss": 0.3759, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"step": 1140, |
|
"total_flos": 1242149934858240.0, |
|
"train_loss": 0.7540260691391795, |
|
"train_runtime": 163.74, |
|
"train_samples_per_second": 222.499, |
|
"train_steps_per_second": 6.962 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 1140, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 50, |
|
"total_flos": 1242149934858240.0, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|