|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 4.0, |
|
"eval_steps": 500, |
|
"global_step": 8668, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.046146746654360866, |
|
"grad_norm": 0.21824534237384796, |
|
"learning_rate": 2.23760092272203e-06, |
|
"loss": 3.136, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.09229349330872173, |
|
"grad_norm": 0.2850794494152069, |
|
"learning_rate": 4.544405997693195e-06, |
|
"loss": 3.1073, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.1384402399630826, |
|
"grad_norm": 0.36149144172668457, |
|
"learning_rate": 6.828143021914648e-06, |
|
"loss": 3.1035, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.18458698661744347, |
|
"grad_norm": 0.5060057640075684, |
|
"learning_rate": 9.134948096885815e-06, |
|
"loss": 3.106, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.23073373327180433, |
|
"grad_norm": 0.3851727247238159, |
|
"learning_rate": 1.144175317185698e-05, |
|
"loss": 3.0204, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.2768804799261652, |
|
"grad_norm": 0.30784064531326294, |
|
"learning_rate": 1.3748558246828143e-05, |
|
"loss": 2.9708, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.3230272265805261, |
|
"grad_norm": 0.6206746697425842, |
|
"learning_rate": 1.605536332179931e-05, |
|
"loss": 2.8294, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.36917397323488693, |
|
"grad_norm": 0.47670331597328186, |
|
"learning_rate": 1.8362168396770474e-05, |
|
"loss": 2.6824, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.4153207198892478, |
|
"grad_norm": 0.4056473672389984, |
|
"learning_rate": 1.9925650557620818e-05, |
|
"loss": 2.4481, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.46146746654360865, |
|
"grad_norm": 0.8275523781776428, |
|
"learning_rate": 1.96692731701064e-05, |
|
"loss": 2.0463, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.5076142131979695, |
|
"grad_norm": 0.5123036503791809, |
|
"learning_rate": 1.9412895782591976e-05, |
|
"loss": 1.9237, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.5537609598523304, |
|
"grad_norm": 0.7184427976608276, |
|
"learning_rate": 1.9156518395077554e-05, |
|
"loss": 1.881, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.5999077065066912, |
|
"grad_norm": 1.0534940958023071, |
|
"learning_rate": 1.8900141007563134e-05, |
|
"loss": 1.8219, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.6460544531610521, |
|
"grad_norm": 0.9063017964363098, |
|
"learning_rate": 1.8646327393923858e-05, |
|
"loss": 1.7868, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.6922011998154131, |
|
"grad_norm": 0.7229586839675903, |
|
"learning_rate": 1.8389950006409436e-05, |
|
"loss": 1.7581, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.7383479464697739, |
|
"grad_norm": 0.6040017604827881, |
|
"learning_rate": 1.8133572618895013e-05, |
|
"loss": 1.7096, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.7844946931241348, |
|
"grad_norm": 0.7014355659484863, |
|
"learning_rate": 1.7877195231380594e-05, |
|
"loss": 1.6372, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.8306414397784956, |
|
"grad_norm": 0.45642441511154175, |
|
"learning_rate": 1.762081784386617e-05, |
|
"loss": 1.5638, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.8767881864328565, |
|
"grad_norm": 0.7556698322296143, |
|
"learning_rate": 1.7364440456351752e-05, |
|
"loss": 1.5481, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.9229349330872173, |
|
"grad_norm": 0.48637843132019043, |
|
"learning_rate": 1.710806306883733e-05, |
|
"loss": 1.4976, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.9690816797415782, |
|
"grad_norm": 0.6813339591026306, |
|
"learning_rate": 1.685168568132291e-05, |
|
"loss": 1.4417, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 1.015228426395939, |
|
"grad_norm": 1.094603180885315, |
|
"learning_rate": 1.6595308293808488e-05, |
|
"loss": 1.3938, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 1.0613751730503, |
|
"grad_norm": 0.37147483229637146, |
|
"learning_rate": 1.6338930906294065e-05, |
|
"loss": 1.3497, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 1.1075219197046609, |
|
"grad_norm": 0.2780097424983978, |
|
"learning_rate": 1.6082553518779646e-05, |
|
"loss": 1.3099, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 1.1536686663590217, |
|
"grad_norm": 0.271342933177948, |
|
"learning_rate": 1.5826176131265223e-05, |
|
"loss": 1.285, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 1.1998154130133827, |
|
"grad_norm": 0.27299413084983826, |
|
"learning_rate": 1.55697987437508e-05, |
|
"loss": 1.2614, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 1.2459621596677435, |
|
"grad_norm": 0.29231297969818115, |
|
"learning_rate": 1.531342135623638e-05, |
|
"loss": 1.2308, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 1.2921089063221043, |
|
"grad_norm": 0.22232797741889954, |
|
"learning_rate": 1.505704396872196e-05, |
|
"loss": 1.1944, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 1.338255652976465, |
|
"grad_norm": 0.9203324913978577, |
|
"learning_rate": 1.480066658120754e-05, |
|
"loss": 1.1544, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 1.384402399630826, |
|
"grad_norm": 0.19580288231372833, |
|
"learning_rate": 1.4544289193693117e-05, |
|
"loss": 1.0996, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 1.430549146285187, |
|
"grad_norm": 0.42558759450912476, |
|
"learning_rate": 1.4287911806178696e-05, |
|
"loss": 1.0759, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 1.4766958929395477, |
|
"grad_norm": 0.24304209649562836, |
|
"learning_rate": 1.4031534418664275e-05, |
|
"loss": 1.0637, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 1.5228426395939088, |
|
"grad_norm": 0.25986990332603455, |
|
"learning_rate": 1.3775157031149852e-05, |
|
"loss": 1.0446, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 1.5689893862482696, |
|
"grad_norm": 0.20725102722644806, |
|
"learning_rate": 1.3518779643635433e-05, |
|
"loss": 1.0384, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 1.6151361329026304, |
|
"grad_norm": 0.18936870992183685, |
|
"learning_rate": 1.3262402256121012e-05, |
|
"loss": 1.0139, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 1.6612828795569912, |
|
"grad_norm": 0.19966499507427216, |
|
"learning_rate": 1.300602486860659e-05, |
|
"loss": 1.0058, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 1.707429626211352, |
|
"grad_norm": 0.30528759956359863, |
|
"learning_rate": 1.2749647481092169e-05, |
|
"loss": 0.9838, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 1.753576372865713, |
|
"grad_norm": 0.2316664308309555, |
|
"learning_rate": 1.2493270093577748e-05, |
|
"loss": 0.9904, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 1.7997231195200738, |
|
"grad_norm": 0.2217002511024475, |
|
"learning_rate": 1.2236892706063325e-05, |
|
"loss": 0.9735, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 1.8458698661744348, |
|
"grad_norm": 0.2654038369655609, |
|
"learning_rate": 1.1980515318548904e-05, |
|
"loss": 0.9804, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 1.8920166128287956, |
|
"grad_norm": 0.20543397963047028, |
|
"learning_rate": 1.1724137931034483e-05, |
|
"loss": 0.9424, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 1.9381633594831564, |
|
"grad_norm": 0.24414564669132233, |
|
"learning_rate": 1.1467760543520064e-05, |
|
"loss": 0.9353, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 1.9843101061375172, |
|
"grad_norm": 0.19333013892173767, |
|
"learning_rate": 1.1211383156005641e-05, |
|
"loss": 0.9374, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 2.030456852791878, |
|
"grad_norm": 0.20063996315002441, |
|
"learning_rate": 1.095500576849122e-05, |
|
"loss": 0.9409, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 2.076603599446239, |
|
"grad_norm": 0.4319429397583008, |
|
"learning_rate": 1.0698628380976798e-05, |
|
"loss": 0.93, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 2.1227503461006, |
|
"grad_norm": 0.21358811855316162, |
|
"learning_rate": 1.0442250993462377e-05, |
|
"loss": 0.9231, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 2.168897092754961, |
|
"grad_norm": 0.2252470701932907, |
|
"learning_rate": 1.0185873605947956e-05, |
|
"loss": 0.9267, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 2.2150438394093217, |
|
"grad_norm": 0.6058911681175232, |
|
"learning_rate": 9.929496218433535e-06, |
|
"loss": 0.902, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 2.2611905860636825, |
|
"grad_norm": 0.27027812600135803, |
|
"learning_rate": 9.673118830919114e-06, |
|
"loss": 0.8908, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 2.3073373327180433, |
|
"grad_norm": 0.3116415739059448, |
|
"learning_rate": 9.416741443404692e-06, |
|
"loss": 0.8971, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 2.353484079372404, |
|
"grad_norm": 0.2324889898300171, |
|
"learning_rate": 9.160364055890272e-06, |
|
"loss": 0.8927, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 2.3996308260267654, |
|
"grad_norm": 0.18322697281837463, |
|
"learning_rate": 8.90398666837585e-06, |
|
"loss": 0.8844, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 2.445777572681126, |
|
"grad_norm": 1.4241108894348145, |
|
"learning_rate": 8.650173054736572e-06, |
|
"loss": 0.883, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 2.491924319335487, |
|
"grad_norm": 0.22246557474136353, |
|
"learning_rate": 8.393795667222153e-06, |
|
"loss": 0.8841, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 2.5380710659898478, |
|
"grad_norm": 0.19322210550308228, |
|
"learning_rate": 8.13741827970773e-06, |
|
"loss": 0.89, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 2.5842178126442086, |
|
"grad_norm": 0.20664915442466736, |
|
"learning_rate": 7.881040892193309e-06, |
|
"loss": 0.8762, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 2.6303645592985694, |
|
"grad_norm": 0.19776581227779388, |
|
"learning_rate": 7.624663504678887e-06, |
|
"loss": 0.8649, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 2.67651130595293, |
|
"grad_norm": 0.7888526916503906, |
|
"learning_rate": 7.368286117164467e-06, |
|
"loss": 0.8678, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 2.722658052607291, |
|
"grad_norm": 0.7994652390480042, |
|
"learning_rate": 7.1119087296500455e-06, |
|
"loss": 0.7896, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 2.768804799261652, |
|
"grad_norm": 0.2278624325990677, |
|
"learning_rate": 6.855531342135624e-06, |
|
"loss": 0.7324, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 2.814951545916013, |
|
"grad_norm": 0.24736915528774261, |
|
"learning_rate": 6.599153954621203e-06, |
|
"loss": 0.7139, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 2.861098292570374, |
|
"grad_norm": 0.22141049802303314, |
|
"learning_rate": 6.342776567106782e-06, |
|
"loss": 0.7116, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 2.9072450392247347, |
|
"grad_norm": 0.21581608057022095, |
|
"learning_rate": 6.08639917959236e-06, |
|
"loss": 0.7185, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 2.9533917858790955, |
|
"grad_norm": 0.27274981141090393, |
|
"learning_rate": 5.830021792077939e-06, |
|
"loss": 0.7114, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 2.9995385325334563, |
|
"grad_norm": 0.4279099106788635, |
|
"learning_rate": 5.5736444045635175e-06, |
|
"loss": 0.6974, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 3.045685279187817, |
|
"grad_norm": 0.4010777473449707, |
|
"learning_rate": 5.3172670170490966e-06, |
|
"loss": 0.7089, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 3.0918320258421783, |
|
"grad_norm": 0.4470697343349457, |
|
"learning_rate": 5.060889629534676e-06, |
|
"loss": 0.6895, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 3.137978772496539, |
|
"grad_norm": 0.534737229347229, |
|
"learning_rate": 4.804512242020255e-06, |
|
"loss": 0.709, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 3.1841255191509, |
|
"grad_norm": 0.3858148157596588, |
|
"learning_rate": 4.548134854505833e-06, |
|
"loss": 0.7008, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 3.2302722658052607, |
|
"grad_norm": 4.884620189666748, |
|
"learning_rate": 4.291757466991412e-06, |
|
"loss": 0.6926, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 3.2764190124596215, |
|
"grad_norm": 0.44127726554870605, |
|
"learning_rate": 4.03538007947699e-06, |
|
"loss": 0.6986, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 3.3225657591139823, |
|
"grad_norm": 1.2448310852050781, |
|
"learning_rate": 3.7790026919625694e-06, |
|
"loss": 0.6884, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 3.368712505768343, |
|
"grad_norm": 0.8101204633712769, |
|
"learning_rate": 3.522625304448148e-06, |
|
"loss": 0.6851, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 3.4148592524227044, |
|
"grad_norm": 0.5153388381004333, |
|
"learning_rate": 3.2662479169337267e-06, |
|
"loss": 0.6938, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 3.461005999077065, |
|
"grad_norm": 0.33079493045806885, |
|
"learning_rate": 3.0098705294193053e-06, |
|
"loss": 0.6948, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 3.507152745731426, |
|
"grad_norm": 1.0203328132629395, |
|
"learning_rate": 2.7534931419048844e-06, |
|
"loss": 0.6855, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 3.553299492385787, |
|
"grad_norm": 0.3520820140838623, |
|
"learning_rate": 2.4971157543904627e-06, |
|
"loss": 0.6849, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 3.5994462390401476, |
|
"grad_norm": 0.28180956840515137, |
|
"learning_rate": 2.2407383668760417e-06, |
|
"loss": 0.699, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 3.6455929856945084, |
|
"grad_norm": 0.33973556756973267, |
|
"learning_rate": 1.9843609793616204e-06, |
|
"loss": 0.6756, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 3.6917397323488697, |
|
"grad_norm": 0.3416615128517151, |
|
"learning_rate": 1.7279835918471993e-06, |
|
"loss": 0.685, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 3.7378864790032305, |
|
"grad_norm": 0.7213825583457947, |
|
"learning_rate": 1.4716062043327781e-06, |
|
"loss": 0.6764, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 3.7840332256575913, |
|
"grad_norm": 0.5637441873550415, |
|
"learning_rate": 1.2152288168183566e-06, |
|
"loss": 0.6896, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 3.830179972311952, |
|
"grad_norm": 0.4190536141395569, |
|
"learning_rate": 9.588514293039355e-07, |
|
"loss": 0.675, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 3.876326718966313, |
|
"grad_norm": 0.37957823276519775, |
|
"learning_rate": 7.024740417895142e-07, |
|
"loss": 0.6767, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 3.9224734656206737, |
|
"grad_norm": 0.5404504537582397, |
|
"learning_rate": 4.4609665427509294e-07, |
|
"loss": 0.695, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 3.9686202122750345, |
|
"grad_norm": 14.911314964294434, |
|
"learning_rate": 1.8971926676067174e-07, |
|
"loss": 0.6794, |
|
"step": 8600 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 8668, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.884583671986995e+16, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|