{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9953632148377123,
  "eval_steps": 500,
  "global_step": 969,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.030911901081916538,
      "grad_norm": 2.0187587823402606,
      "learning_rate": 5e-06,
      "loss": 0.9224,
      "step": 10
    },
    {
      "epoch": 0.061823802163833076,
      "grad_norm": 5.23374455774046,
      "learning_rate": 5e-06,
      "loss": 0.8169,
      "step": 20
    },
    {
      "epoch": 0.09273570324574962,
      "grad_norm": 1.1567925380600657,
      "learning_rate": 5e-06,
      "loss": 0.7799,
      "step": 30
    },
    {
      "epoch": 0.12364760432766615,
      "grad_norm": 1.3555247489499298,
      "learning_rate": 5e-06,
      "loss": 0.7536,
      "step": 40
    },
    {
      "epoch": 0.1545595054095827,
      "grad_norm": 0.8540206480635643,
      "learning_rate": 5e-06,
      "loss": 0.7412,
      "step": 50
    },
    {
      "epoch": 0.18547140649149924,
      "grad_norm": 1.0411208272577643,
      "learning_rate": 5e-06,
      "loss": 0.7188,
      "step": 60
    },
    {
      "epoch": 0.21638330757341576,
      "grad_norm": 0.772135725000269,
      "learning_rate": 5e-06,
      "loss": 0.7099,
      "step": 70
    },
    {
      "epoch": 0.2472952086553323,
      "grad_norm": 0.718962153128519,
      "learning_rate": 5e-06,
      "loss": 0.7044,
      "step": 80
    },
    {
      "epoch": 0.2782071097372488,
      "grad_norm": 0.9018851336703633,
      "learning_rate": 5e-06,
      "loss": 0.7042,
      "step": 90
    },
    {
      "epoch": 0.3091190108191654,
      "grad_norm": 0.5501975291936315,
      "learning_rate": 5e-06,
      "loss": 0.6898,
      "step": 100
    },
    {
      "epoch": 0.3400309119010819,
      "grad_norm": 0.8644622620986302,
      "learning_rate": 5e-06,
      "loss": 0.6901,
      "step": 110
    },
    {
      "epoch": 0.37094281298299847,
      "grad_norm": 0.5948602458220357,
      "learning_rate": 5e-06,
      "loss": 0.6828,
      "step": 120
    },
    {
      "epoch": 0.401854714064915,
      "grad_norm": 0.720162590208508,
      "learning_rate": 5e-06,
      "loss": 0.6858,
      "step": 130
    },
    {
      "epoch": 0.4327666151468315,
      "grad_norm": 0.5833952154939747,
      "learning_rate": 5e-06,
      "loss": 0.6745,
      "step": 140
    },
    {
      "epoch": 0.46367851622874806,
      "grad_norm": 0.6768653844503261,
      "learning_rate": 5e-06,
      "loss": 0.6764,
      "step": 150
    },
    {
      "epoch": 0.4945904173106646,
      "grad_norm": 0.5707017440465438,
      "learning_rate": 5e-06,
      "loss": 0.6734,
      "step": 160
    },
    {
      "epoch": 0.5255023183925811,
      "grad_norm": 0.52632417149709,
      "learning_rate": 5e-06,
      "loss": 0.6746,
      "step": 170
    },
    {
      "epoch": 0.5564142194744977,
      "grad_norm": 0.5660566537871714,
      "learning_rate": 5e-06,
      "loss": 0.6654,
      "step": 180
    },
    {
      "epoch": 0.5873261205564142,
      "grad_norm": 0.6150804490960232,
      "learning_rate": 5e-06,
      "loss": 0.6669,
      "step": 190
    },
    {
      "epoch": 0.6182380216383307,
      "grad_norm": 0.5523983219898584,
      "learning_rate": 5e-06,
      "loss": 0.6722,
      "step": 200
    },
    {
      "epoch": 0.6491499227202473,
      "grad_norm": 0.7211222999715671,
      "learning_rate": 5e-06,
      "loss": 0.6689,
      "step": 210
    },
    {
      "epoch": 0.6800618238021638,
      "grad_norm": 0.7196378524123959,
      "learning_rate": 5e-06,
      "loss": 0.6685,
      "step": 220
    },
    {
      "epoch": 0.7109737248840804,
      "grad_norm": 0.7400759740386526,
      "learning_rate": 5e-06,
      "loss": 0.6637,
      "step": 230
    },
    {
      "epoch": 0.7418856259659969,
      "grad_norm": 0.5515081055008348,
      "learning_rate": 5e-06,
      "loss": 0.6602,
      "step": 240
    },
    {
      "epoch": 0.7727975270479135,
      "grad_norm": 0.5306297630685508,
      "learning_rate": 5e-06,
      "loss": 0.6641,
      "step": 250
    },
    {
      "epoch": 0.80370942812983,
      "grad_norm": 0.5220389228174283,
      "learning_rate": 5e-06,
      "loss": 0.657,
      "step": 260
    },
    {
      "epoch": 0.8346213292117465,
      "grad_norm": 0.5736057539218636,
      "learning_rate": 5e-06,
      "loss": 0.6606,
      "step": 270
    },
    {
      "epoch": 0.865533230293663,
      "grad_norm": 0.6831384912164588,
      "learning_rate": 5e-06,
      "loss": 0.6604,
      "step": 280
    },
    {
      "epoch": 0.8964451313755796,
      "grad_norm": 0.6555441815061606,
      "learning_rate": 5e-06,
      "loss": 0.6617,
      "step": 290
    },
    {
      "epoch": 0.9273570324574961,
      "grad_norm": 0.5107130356234296,
      "learning_rate": 5e-06,
      "loss": 0.6577,
      "step": 300
    },
    {
      "epoch": 0.9582689335394127,
      "grad_norm": 0.5294921581221994,
      "learning_rate": 5e-06,
      "loss": 0.6554,
      "step": 310
    },
    {
      "epoch": 0.9891808346213292,
      "grad_norm": 0.5933706977854671,
      "learning_rate": 5e-06,
      "loss": 0.6662,
      "step": 320
    },
    {
      "epoch": 0.9984544049459042,
      "eval_loss": 0.6590365171432495,
      "eval_runtime": 113.7467,
      "eval_samples_per_second": 76.618,
      "eval_steps_per_second": 0.607,
      "step": 323
    },
    {
      "epoch": 1.0200927357032457,
      "grad_norm": 0.6466092267278452,
      "learning_rate": 5e-06,
      "loss": 0.6283,
      "step": 330
    },
    {
      "epoch": 1.0510046367851622,
      "grad_norm": 0.5455963224340917,
      "learning_rate": 5e-06,
      "loss": 0.6142,
      "step": 340
    },
    {
      "epoch": 1.0819165378670788,
      "grad_norm": 0.5457563493460516,
      "learning_rate": 5e-06,
      "loss": 0.6136,
      "step": 350
    },
    {
      "epoch": 1.1128284389489953,
      "grad_norm": 0.5123524043102256,
      "learning_rate": 5e-06,
      "loss": 0.6186,
      "step": 360
    },
    {
      "epoch": 1.1437403400309119,
      "grad_norm": 0.6133162659271869,
      "learning_rate": 5e-06,
      "loss": 0.612,
      "step": 370
    },
    {
      "epoch": 1.1746522411128284,
      "grad_norm": 0.531392662924999,
      "learning_rate": 5e-06,
      "loss": 0.6093,
      "step": 380
    },
    {
      "epoch": 1.205564142194745,
      "grad_norm": 0.5434898390306484,
      "learning_rate": 5e-06,
      "loss": 0.6031,
      "step": 390
    },
    {
      "epoch": 1.2364760432766615,
      "grad_norm": 0.665502589919703,
      "learning_rate": 5e-06,
      "loss": 0.6153,
      "step": 400
    },
    {
      "epoch": 1.267387944358578,
      "grad_norm": 0.5263248988074614,
      "learning_rate": 5e-06,
      "loss": 0.6154,
      "step": 410
    },
    {
      "epoch": 1.2982998454404946,
      "grad_norm": 0.6348966883540149,
      "learning_rate": 5e-06,
      "loss": 0.6112,
      "step": 420
    },
    {
      "epoch": 1.3292117465224111,
      "grad_norm": 0.5230511187445398,
      "learning_rate": 5e-06,
      "loss": 0.6121,
      "step": 430
    },
    {
      "epoch": 1.3601236476043277,
      "grad_norm": 0.5500625021812695,
      "learning_rate": 5e-06,
      "loss": 0.6109,
      "step": 440
    },
    {
      "epoch": 1.3910355486862442,
      "grad_norm": 0.7153093571149675,
      "learning_rate": 5e-06,
      "loss": 0.6123,
      "step": 450
    },
    {
      "epoch": 1.4219474497681608,
      "grad_norm": 0.6382419366334218,
      "learning_rate": 5e-06,
      "loss": 0.6188,
      "step": 460
    },
    {
      "epoch": 1.4528593508500773,
      "grad_norm": 0.5471481346135804,
      "learning_rate": 5e-06,
      "loss": 0.6179,
      "step": 470
    },
    {
      "epoch": 1.4837712519319939,
      "grad_norm": 0.5409301526875265,
      "learning_rate": 5e-06,
      "loss": 0.6121,
      "step": 480
    },
    {
      "epoch": 1.5146831530139102,
      "grad_norm": 0.5070509983871482,
      "learning_rate": 5e-06,
      "loss": 0.6081,
      "step": 490
    },
    {
      "epoch": 1.545595054095827,
      "grad_norm": 0.6049383227511866,
      "learning_rate": 5e-06,
      "loss": 0.6111,
      "step": 500
    },
    {
      "epoch": 1.5765069551777433,
      "grad_norm": 0.5173228367317094,
      "learning_rate": 5e-06,
      "loss": 0.6088,
      "step": 510
    },
    {
      "epoch": 1.60741885625966,
      "grad_norm": 0.5087487433614813,
      "learning_rate": 5e-06,
      "loss": 0.6077,
      "step": 520
    },
    {
      "epoch": 1.6383307573415764,
      "grad_norm": 0.6681086881393504,
      "learning_rate": 5e-06,
      "loss": 0.6073,
      "step": 530
    },
    {
      "epoch": 1.6692426584234932,
      "grad_norm": 0.9262470601774611,
      "learning_rate": 5e-06,
      "loss": 0.6088,
      "step": 540
    },
    {
      "epoch": 1.7001545595054095,
      "grad_norm": 0.5528042557953681,
      "learning_rate": 5e-06,
      "loss": 0.6163,
      "step": 550
    },
    {
      "epoch": 1.7310664605873263,
      "grad_norm": 0.5064527021504606,
      "learning_rate": 5e-06,
      "loss": 0.6064,
      "step": 560
    },
    {
      "epoch": 1.7619783616692426,
      "grad_norm": 0.5307893550951357,
      "learning_rate": 5e-06,
      "loss": 0.6132,
      "step": 570
    },
    {
      "epoch": 1.7928902627511591,
      "grad_norm": 0.5363288571581243,
      "learning_rate": 5e-06,
      "loss": 0.6142,
      "step": 580
    },
    {
      "epoch": 1.8238021638330757,
      "grad_norm": 0.4985720023183297,
      "learning_rate": 5e-06,
      "loss": 0.6176,
      "step": 590
    },
    {
      "epoch": 1.8547140649149922,
      "grad_norm": 0.5256325470444526,
      "learning_rate": 5e-06,
      "loss": 0.6126,
      "step": 600
    },
    {
      "epoch": 1.8856259659969088,
      "grad_norm": 0.6274813396972104,
      "learning_rate": 5e-06,
      "loss": 0.612,
      "step": 610
    },
    {
      "epoch": 1.9165378670788253,
      "grad_norm": 0.5789315780747083,
      "learning_rate": 5e-06,
      "loss": 0.6125,
      "step": 620
    },
    {
      "epoch": 1.947449768160742,
      "grad_norm": 0.5907958566056928,
      "learning_rate": 5e-06,
      "loss": 0.6062,
      "step": 630
    },
    {
      "epoch": 1.9783616692426584,
      "grad_norm": 0.5292904118235623,
      "learning_rate": 5e-06,
      "loss": 0.6121,
      "step": 640
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.6483988761901855,
      "eval_runtime": 114.5876,
      "eval_samples_per_second": 76.055,
      "eval_steps_per_second": 0.602,
      "step": 647
    },
    {
      "epoch": 2.009273570324575,
      "grad_norm": 1.0140873239994894,
      "learning_rate": 5e-06,
      "loss": 0.597,
      "step": 650
    },
    {
      "epoch": 2.0401854714064913,
      "grad_norm": 0.630334813080006,
      "learning_rate": 5e-06,
      "loss": 0.5614,
      "step": 660
    },
    {
      "epoch": 2.071097372488408,
      "grad_norm": 0.7038598022498512,
      "learning_rate": 5e-06,
      "loss": 0.5613,
      "step": 670
    },
    {
      "epoch": 2.1020092735703244,
      "grad_norm": 0.625422416199728,
      "learning_rate": 5e-06,
      "loss": 0.5652,
      "step": 680
    },
    {
      "epoch": 2.132921174652241,
      "grad_norm": 0.6504517407003202,
      "learning_rate": 5e-06,
      "loss": 0.5646,
      "step": 690
    },
    {
      "epoch": 2.1638330757341575,
      "grad_norm": 0.6005779084691315,
      "learning_rate": 5e-06,
      "loss": 0.5635,
      "step": 700
    },
    {
      "epoch": 2.1947449768160743,
      "grad_norm": 0.5153825857145766,
      "learning_rate": 5e-06,
      "loss": 0.5642,
      "step": 710
    },
    {
      "epoch": 2.2256568778979906,
      "grad_norm": 0.6609026821979475,
      "learning_rate": 5e-06,
      "loss": 0.5636,
      "step": 720
    },
    {
      "epoch": 2.2565687789799074,
      "grad_norm": 0.7763886247938674,
      "learning_rate": 5e-06,
      "loss": 0.5646,
      "step": 730
    },
    {
      "epoch": 2.2874806800618237,
      "grad_norm": 0.5940744564359401,
      "learning_rate": 5e-06,
      "loss": 0.566,
      "step": 740
    },
    {
      "epoch": 2.3183925811437405,
      "grad_norm": 0.8987915044882564,
      "learning_rate": 5e-06,
      "loss": 0.5639,
      "step": 750
    },
    {
      "epoch": 2.349304482225657,
      "grad_norm": 0.7853307481543791,
      "learning_rate": 5e-06,
      "loss": 0.5652,
      "step": 760
    },
    {
      "epoch": 2.3802163833075736,
      "grad_norm": 0.5555357862981527,
      "learning_rate": 5e-06,
      "loss": 0.5695,
      "step": 770
    },
    {
      "epoch": 2.41112828438949,
      "grad_norm": 0.5858397000016529,
      "learning_rate": 5e-06,
      "loss": 0.5674,
      "step": 780
    },
    {
      "epoch": 2.4420401854714067,
      "grad_norm": 0.5809649704126684,
      "learning_rate": 5e-06,
      "loss": 0.5666,
      "step": 790
    },
    {
      "epoch": 2.472952086553323,
      "grad_norm": 0.6079171026499909,
      "learning_rate": 5e-06,
      "loss": 0.5662,
      "step": 800
    },
    {
      "epoch": 2.5038639876352393,
      "grad_norm": 0.6460104668614653,
      "learning_rate": 5e-06,
      "loss": 0.5724,
      "step": 810
    },
    {
      "epoch": 2.534775888717156,
      "grad_norm": 0.5036700308469926,
      "learning_rate": 5e-06,
      "loss": 0.5689,
      "step": 820
    },
    {
      "epoch": 2.565687789799073,
      "grad_norm": 0.5720133645014428,
      "learning_rate": 5e-06,
      "loss": 0.5694,
      "step": 830
    },
    {
      "epoch": 2.596599690880989,
      "grad_norm": 0.4978320121327588,
      "learning_rate": 5e-06,
      "loss": 0.5688,
      "step": 840
    },
    {
      "epoch": 2.6275115919629055,
      "grad_norm": 0.5742589206659079,
      "learning_rate": 5e-06,
      "loss": 0.5677,
      "step": 850
    },
    {
      "epoch": 2.6584234930448223,
      "grad_norm": 0.8122081568789994,
      "learning_rate": 5e-06,
      "loss": 0.5685,
      "step": 860
    },
    {
      "epoch": 2.689335394126739,
      "grad_norm": 0.5964179415080593,
      "learning_rate": 5e-06,
      "loss": 0.5695,
      "step": 870
    },
    {
      "epoch": 2.7202472952086554,
      "grad_norm": 0.5560894480978509,
      "learning_rate": 5e-06,
      "loss": 0.5718,
      "step": 880
    },
    {
      "epoch": 2.7511591962905717,
      "grad_norm": 0.5757117113268359,
      "learning_rate": 5e-06,
      "loss": 0.5696,
      "step": 890
    },
    {
      "epoch": 2.7820710973724885,
      "grad_norm": 0.5435787690358811,
      "learning_rate": 5e-06,
      "loss": 0.5676,
      "step": 900
    },
    {
      "epoch": 2.812982998454405,
      "grad_norm": 0.6300976369973321,
      "learning_rate": 5e-06,
      "loss": 0.5725,
      "step": 910
    },
    {
      "epoch": 2.8438948995363216,
      "grad_norm": 0.6341460273574823,
      "learning_rate": 5e-06,
      "loss": 0.5699,
      "step": 920
    },
    {
      "epoch": 2.874806800618238,
      "grad_norm": 0.6961502333247155,
      "learning_rate": 5e-06,
      "loss": 0.5694,
      "step": 930
    },
    {
      "epoch": 2.9057187017001547,
      "grad_norm": 0.5370380886104864,
      "learning_rate": 5e-06,
      "loss": 0.5712,
      "step": 940
    },
    {
      "epoch": 2.936630602782071,
      "grad_norm": 0.595358378564684,
      "learning_rate": 5e-06,
      "loss": 0.5691,
      "step": 950
    },
    {
      "epoch": 2.9675425038639878,
      "grad_norm": 0.5413573519766721,
      "learning_rate": 5e-06,
      "loss": 0.5679,
      "step": 960
    },
    {
      "epoch": 2.9953632148377123,
      "eval_loss": 0.6511133909225464,
      "eval_runtime": 110.2332,
      "eval_samples_per_second": 79.06,
      "eval_steps_per_second": 0.626,
      "step": 969
    },
    {
      "epoch": 2.9953632148377123,
      "step": 969,
      "total_flos": 1622692331520000.0,
      "train_loss": 0.6248709833166793,
      "train_runtime": 16382.8374,
      "train_samples_per_second": 30.32,
      "train_steps_per_second": 0.059
    }
  ],
  "logging_steps": 10,
  "max_steps": 969,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1622692331520000.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}