|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500, |
|
"global_step": 5463, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.018304960644334616, |
|
"grad_norm": 0.11226867139339447, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 2.4178, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.03660992128866923, |
|
"grad_norm": 0.19307924807071686, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 2.4311, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.054914881933003847, |
|
"grad_norm": 0.1868831366300583, |
|
"learning_rate": 6e-06, |
|
"loss": 2.4251, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.07321984257733846, |
|
"grad_norm": 0.2497679591178894, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 2.4004, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.09152480322167307, |
|
"grad_norm": 0.3507976830005646, |
|
"learning_rate": 1e-05, |
|
"loss": 2.3282, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.10982976386600769, |
|
"grad_norm": 0.40492674708366394, |
|
"learning_rate": 1.2e-05, |
|
"loss": 2.2985, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.1281347245103423, |
|
"grad_norm": 0.449167400598526, |
|
"learning_rate": 1.4e-05, |
|
"loss": 2.2815, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.14643968515467692, |
|
"grad_norm": 0.510024905204773, |
|
"learning_rate": 1.6000000000000003e-05, |
|
"loss": 2.2252, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.16474464579901152, |
|
"grad_norm": 0.5523568987846375, |
|
"learning_rate": 1.8e-05, |
|
"loss": 2.222, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.18304960644334614, |
|
"grad_norm": 0.6460111141204834, |
|
"learning_rate": 2e-05, |
|
"loss": 2.2266, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.20135456708768076, |
|
"grad_norm": 0.6135373711585999, |
|
"learning_rate": 1.9975235096487343e-05, |
|
"loss": 2.1631, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.21965952773201539, |
|
"grad_norm": 0.750421941280365, |
|
"learning_rate": 1.990106304603857e-05, |
|
"loss": 2.1781, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.23796448837634998, |
|
"grad_norm": 0.6863061189651489, |
|
"learning_rate": 1.9777851221388214e-05, |
|
"loss": 2.1576, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.2562694490206846, |
|
"grad_norm": 0.592878520488739, |
|
"learning_rate": 1.9606209888326098e-05, |
|
"loss": 2.142, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.2745744096650192, |
|
"grad_norm": 0.9680142998695374, |
|
"learning_rate": 1.9386989183062637e-05, |
|
"loss": 2.1088, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.29287937030935385, |
|
"grad_norm": 0.8196300864219666, |
|
"learning_rate": 1.9121274901520593e-05, |
|
"loss": 2.0937, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.31118433095368847, |
|
"grad_norm": 0.935396671295166, |
|
"learning_rate": 1.881038312140883e-05, |
|
"loss": 2.0938, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.32948929159802304, |
|
"grad_norm": 0.7880417704582214, |
|
"learning_rate": 1.8455853683714823e-05, |
|
"loss": 2.0672, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.34779425224235766, |
|
"grad_norm": 0.8693708777427673, |
|
"learning_rate": 1.805944256590194e-05, |
|
"loss": 2.0529, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.3660992128866923, |
|
"grad_norm": 0.7159385681152344, |
|
"learning_rate": 1.7623113184586985e-05, |
|
"loss": 2.0298, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.3844041735310269, |
|
"grad_norm": 0.7743539214134216, |
|
"learning_rate": 1.7149026670775558e-05, |
|
"loss": 2.0681, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.4027091341753615, |
|
"grad_norm": 0.892536997795105, |
|
"learning_rate": 1.6639531165821896e-05, |
|
"loss": 2.0808, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.42101409481969615, |
|
"grad_norm": 1.1176153421401978, |
|
"learning_rate": 1.6097150191130056e-05, |
|
"loss": 2.0381, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.43931905546403077, |
|
"grad_norm": 1.0423996448516846, |
|
"learning_rate": 1.5524570149201115e-05, |
|
"loss": 2.0246, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.4576240161083654, |
|
"grad_norm": 0.8842440843582153, |
|
"learning_rate": 1.4924627017933402e-05, |
|
"loss": 2.0534, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.47592897675269996, |
|
"grad_norm": 0.8251581192016602, |
|
"learning_rate": 1.4300292304078696e-05, |
|
"loss": 2.0748, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.4942339373970346, |
|
"grad_norm": 0.9325842261314392, |
|
"learning_rate": 1.3654658325426641e-05, |
|
"loss": 2.0505, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.5125388980413692, |
|
"grad_norm": 0.882586658000946, |
|
"learning_rate": 1.2990922894614404e-05, |
|
"loss": 2.0338, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.5308438586857038, |
|
"grad_norm": 1.3278307914733887, |
|
"learning_rate": 1.2312373480422384e-05, |
|
"loss": 2.0244, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.5491488193300385, |
|
"grad_norm": 1.3495004177093506, |
|
"learning_rate": 1.1622370925004782e-05, |
|
"loss": 2.0226, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.5674537799743731, |
|
"grad_norm": 0.730983316898346, |
|
"learning_rate": 1.0924332797703284e-05, |
|
"loss": 1.9642, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.5857587406187077, |
|
"grad_norm": 1.0524176359176636, |
|
"learning_rate": 1.0221716467892045e-05, |
|
"loss": 2.0393, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.6040637012630423, |
|
"grad_norm": 0.8124818205833435, |
|
"learning_rate": 9.518001980693905e-06, |
|
"loss": 2.0066, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.6223686619073769, |
|
"grad_norm": 0.9999445676803589, |
|
"learning_rate": 8.816674820384044e-06, |
|
"loss": 2.0355, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.6406736225517116, |
|
"grad_norm": 1.3435726165771484, |
|
"learning_rate": 8.121208646853637e-06, |
|
"loss": 2.0393, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.6589785831960461, |
|
"grad_norm": 1.0357476472854614, |
|
"learning_rate": 7.435048090639456e-06, |
|
"loss": 2.0351, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.6772835438403807, |
|
"grad_norm": 1.3906038999557495, |
|
"learning_rate": 6.7615916917352545e-06, |
|
"loss": 2.0292, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.6955885044847153, |
|
"grad_norm": 1.0058213472366333, |
|
"learning_rate": 6.104175066688805e-06, |
|
"loss": 2.0046, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.7138934651290499, |
|
"grad_norm": 0.9746403694152832, |
|
"learning_rate": 5.466054387357491e-06, |
|
"loss": 2.0278, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.7321984257733846, |
|
"grad_norm": 0.7524260878562927, |
|
"learning_rate": 4.8503902531519224e-06, |
|
"loss": 2.026, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.7505033864177192, |
|
"grad_norm": 0.848231852054596, |
|
"learning_rate": 4.260232036648057e-06, |
|
"loss": 1.994, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.7688083470620538, |
|
"grad_norm": 0.9133473634719849, |
|
"learning_rate": 3.6985027801036876e-06, |
|
"loss": 2.0077, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.7871133077063884, |
|
"grad_norm": 1.1169400215148926, |
|
"learning_rate": 3.167984717686521e-06, |
|
"loss": 2.0371, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.805418268350723, |
|
"grad_norm": 1.1980133056640625, |
|
"learning_rate": 2.6713054951220497e-06, |
|
"loss": 2.0253, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.8237232289950577, |
|
"grad_norm": 0.8313106894493103, |
|
"learning_rate": 2.2109251550149922e-06, |
|
"loss": 1.9908, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.8420281896393923, |
|
"grad_norm": 0.9666856527328491, |
|
"learning_rate": 1.7891239523057202e-06, |
|
"loss": 1.9913, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.8603331502837269, |
|
"grad_norm": 0.9010581970214844, |
|
"learning_rate": 1.4079910602115544e-06, |
|
"loss": 1.9869, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.8786381109280615, |
|
"grad_norm": 1.0931150913238525, |
|
"learning_rate": 1.0694142225921444e-06, |
|
"loss": 1.9533, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.8969430715723962, |
|
"grad_norm": 0.984924852848053, |
|
"learning_rate": 7.750704039905343e-07, |
|
"loss": 2.0316, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.9152480322167308, |
|
"grad_norm": 0.9124763011932373, |
|
"learning_rate": 5.264174836601732e-07, |
|
"loss": 2.0208, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.9335529928610653, |
|
"grad_norm": 1.179923176765442, |
|
"learning_rate": 3.24687034717085e-07, |
|
"loss": 2.0175, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.9518579535053999, |
|
"grad_norm": 1.0648012161254883, |
|
"learning_rate": 1.7087822418199506e-07, |
|
"loss": 2.0134, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.9701629141497345, |
|
"grad_norm": 0.8695976734161377, |
|
"learning_rate": 6.575286412536686e-08, |
|
"loss": 2.0041, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 0.9884678747940692, |
|
"grad_norm": 1.045597791671753, |
|
"learning_rate": 9.831638426904821e-09, |
|
"loss": 2.0033, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 5463, |
|
"total_flos": 9.9498061824e+16, |
|
"train_loss": 2.0932261693746033, |
|
"train_runtime": 883.1108, |
|
"train_samples_per_second": 12.371, |
|
"train_steps_per_second": 6.186 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 5463, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 9.9498061824e+16, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|