{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9953632148377123, "eval_steps": 500, "global_step": 969, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.030911901081916538, "grad_norm": 2.0187587823402606, "learning_rate": 5e-06, "loss": 0.9224, "step": 10 }, { "epoch": 0.061823802163833076, "grad_norm": 5.23374455774046, "learning_rate": 5e-06, "loss": 0.8169, "step": 20 }, { "epoch": 0.09273570324574962, "grad_norm": 1.1567925380600657, "learning_rate": 5e-06, "loss": 0.7799, "step": 30 }, { "epoch": 0.12364760432766615, "grad_norm": 1.3555247489499298, "learning_rate": 5e-06, "loss": 0.7536, "step": 40 }, { "epoch": 0.1545595054095827, "grad_norm": 0.8540206480635643, "learning_rate": 5e-06, "loss": 0.7412, "step": 50 }, { "epoch": 0.18547140649149924, "grad_norm": 1.0411208272577643, "learning_rate": 5e-06, "loss": 0.7188, "step": 60 }, { "epoch": 0.21638330757341576, "grad_norm": 0.772135725000269, "learning_rate": 5e-06, "loss": 0.7099, "step": 70 }, { "epoch": 0.2472952086553323, "grad_norm": 0.718962153128519, "learning_rate": 5e-06, "loss": 0.7044, "step": 80 }, { "epoch": 0.2782071097372488, "grad_norm": 0.9018851336703633, "learning_rate": 5e-06, "loss": 0.7042, "step": 90 }, { "epoch": 0.3091190108191654, "grad_norm": 0.5501975291936315, "learning_rate": 5e-06, "loss": 0.6898, "step": 100 }, { "epoch": 0.3400309119010819, "grad_norm": 0.8644622620986302, "learning_rate": 5e-06, "loss": 0.6901, "step": 110 }, { "epoch": 0.37094281298299847, "grad_norm": 0.5948602458220357, "learning_rate": 5e-06, "loss": 0.6828, "step": 120 }, { "epoch": 0.401854714064915, "grad_norm": 0.720162590208508, "learning_rate": 5e-06, "loss": 0.6858, "step": 130 }, { "epoch": 0.4327666151468315, "grad_norm": 0.5833952154939747, "learning_rate": 5e-06, "loss": 0.6745, "step": 140 }, { "epoch": 0.46367851622874806, "grad_norm": 0.6768653844503261, "learning_rate": 5e-06, "loss": 0.6764, "step": 150 }, { "epoch": 0.4945904173106646, "grad_norm": 0.5707017440465438, "learning_rate": 5e-06, "loss": 0.6734, "step": 160 }, { "epoch": 0.5255023183925811, "grad_norm": 0.52632417149709, "learning_rate": 5e-06, "loss": 0.6746, "step": 170 }, { "epoch": 0.5564142194744977, "grad_norm": 0.5660566537871714, "learning_rate": 5e-06, "loss": 0.6654, "step": 180 }, { "epoch": 0.5873261205564142, "grad_norm": 0.6150804490960232, "learning_rate": 5e-06, "loss": 0.6669, "step": 190 }, { "epoch": 0.6182380216383307, "grad_norm": 0.5523983219898584, "learning_rate": 5e-06, "loss": 0.6722, "step": 200 }, { "epoch": 0.6491499227202473, "grad_norm": 0.7211222999715671, "learning_rate": 5e-06, "loss": 0.6689, "step": 210 }, { "epoch": 0.6800618238021638, "grad_norm": 0.7196378524123959, "learning_rate": 5e-06, "loss": 0.6685, "step": 220 }, { "epoch": 0.7109737248840804, "grad_norm": 0.7400759740386526, "learning_rate": 5e-06, "loss": 0.6637, "step": 230 }, { "epoch": 0.7418856259659969, "grad_norm": 0.5515081055008348, "learning_rate": 5e-06, "loss": 0.6602, "step": 240 }, { "epoch": 0.7727975270479135, "grad_norm": 0.5306297630685508, "learning_rate": 5e-06, "loss": 0.6641, "step": 250 }, { "epoch": 0.80370942812983, "grad_norm": 0.5220389228174283, "learning_rate": 5e-06, "loss": 0.657, "step": 260 }, { "epoch": 0.8346213292117465, "grad_norm": 0.5736057539218636, "learning_rate": 5e-06, "loss": 0.6606, "step": 270 }, { "epoch": 0.865533230293663, "grad_norm": 0.6831384912164588, "learning_rate": 5e-06, "loss": 0.6604, "step": 280 }, { "epoch": 0.8964451313755796, "grad_norm": 0.6555441815061606, "learning_rate": 5e-06, "loss": 0.6617, "step": 290 }, { "epoch": 0.9273570324574961, "grad_norm": 0.5107130356234296, "learning_rate": 5e-06, "loss": 0.6577, "step": 300 }, { "epoch": 0.9582689335394127, "grad_norm": 0.5294921581221994, "learning_rate": 5e-06, "loss": 0.6554, "step": 310 }, { "epoch": 0.9891808346213292, "grad_norm": 0.5933706977854671, "learning_rate": 5e-06, "loss": 0.6662, "step": 320 }, { "epoch": 0.9984544049459042, "eval_loss": 0.6590365171432495, "eval_runtime": 113.7467, "eval_samples_per_second": 76.618, "eval_steps_per_second": 0.607, "step": 323 }, { "epoch": 1.0200927357032457, "grad_norm": 0.6466092267278452, "learning_rate": 5e-06, "loss": 0.6283, "step": 330 }, { "epoch": 1.0510046367851622, "grad_norm": 0.5455963224340917, "learning_rate": 5e-06, "loss": 0.6142, "step": 340 }, { "epoch": 1.0819165378670788, "grad_norm": 0.5457563493460516, "learning_rate": 5e-06, "loss": 0.6136, "step": 350 }, { "epoch": 1.1128284389489953, "grad_norm": 0.5123524043102256, "learning_rate": 5e-06, "loss": 0.6186, "step": 360 }, { "epoch": 1.1437403400309119, "grad_norm": 0.6133162659271869, "learning_rate": 5e-06, "loss": 0.612, "step": 370 }, { "epoch": 1.1746522411128284, "grad_norm": 0.531392662924999, "learning_rate": 5e-06, "loss": 0.6093, "step": 380 }, { "epoch": 1.205564142194745, "grad_norm": 0.5434898390306484, "learning_rate": 5e-06, "loss": 0.6031, "step": 390 }, { "epoch": 1.2364760432766615, "grad_norm": 0.665502589919703, "learning_rate": 5e-06, "loss": 0.6153, "step": 400 }, { "epoch": 1.267387944358578, "grad_norm": 0.5263248988074614, "learning_rate": 5e-06, "loss": 0.6154, "step": 410 }, { "epoch": 1.2982998454404946, "grad_norm": 0.6348966883540149, "learning_rate": 5e-06, "loss": 0.6112, "step": 420 }, { "epoch": 1.3292117465224111, "grad_norm": 0.5230511187445398, "learning_rate": 5e-06, "loss": 0.6121, "step": 430 }, { "epoch": 1.3601236476043277, "grad_norm": 0.5500625021812695, "learning_rate": 5e-06, "loss": 0.6109, "step": 440 }, { "epoch": 1.3910355486862442, "grad_norm": 0.7153093571149675, "learning_rate": 5e-06, "loss": 0.6123, "step": 450 }, { "epoch": 1.4219474497681608, "grad_norm": 0.6382419366334218, "learning_rate": 5e-06, "loss": 0.6188, "step": 460 }, { "epoch": 1.4528593508500773, "grad_norm": 0.5471481346135804, "learning_rate": 5e-06, "loss": 0.6179, "step": 470 }, { "epoch": 1.4837712519319939, "grad_norm": 0.5409301526875265, "learning_rate": 5e-06, "loss": 0.6121, "step": 480 }, { "epoch": 1.5146831530139102, "grad_norm": 0.5070509983871482, "learning_rate": 5e-06, "loss": 0.6081, "step": 490 }, { "epoch": 1.545595054095827, "grad_norm": 0.6049383227511866, "learning_rate": 5e-06, "loss": 0.6111, "step": 500 }, { "epoch": 1.5765069551777433, "grad_norm": 0.5173228367317094, "learning_rate": 5e-06, "loss": 0.6088, "step": 510 }, { "epoch": 1.60741885625966, "grad_norm": 0.5087487433614813, "learning_rate": 5e-06, "loss": 0.6077, "step": 520 }, { "epoch": 1.6383307573415764, "grad_norm": 0.6681086881393504, "learning_rate": 5e-06, "loss": 0.6073, "step": 530 }, { "epoch": 1.6692426584234932, "grad_norm": 0.9262470601774611, "learning_rate": 5e-06, "loss": 0.6088, "step": 540 }, { "epoch": 1.7001545595054095, "grad_norm": 0.5528042557953681, "learning_rate": 5e-06, "loss": 0.6163, "step": 550 }, { "epoch": 1.7310664605873263, "grad_norm": 0.5064527021504606, "learning_rate": 5e-06, "loss": 0.6064, "step": 560 }, { "epoch": 1.7619783616692426, "grad_norm": 0.5307893550951357, "learning_rate": 5e-06, "loss": 0.6132, "step": 570 }, { "epoch": 1.7928902627511591, "grad_norm": 0.5363288571581243, "learning_rate": 5e-06, "loss": 0.6142, "step": 580 }, { "epoch": 1.8238021638330757, "grad_norm": 0.4985720023183297, "learning_rate": 5e-06, "loss": 0.6176, "step": 590 }, { "epoch": 1.8547140649149922, "grad_norm": 0.5256325470444526, "learning_rate": 5e-06, "loss": 0.6126, "step": 600 }, { "epoch": 1.8856259659969088, "grad_norm": 0.6274813396972104, "learning_rate": 5e-06, "loss": 0.612, "step": 610 }, { "epoch": 1.9165378670788253, "grad_norm": 0.5789315780747083, "learning_rate": 5e-06, "loss": 0.6125, "step": 620 }, { "epoch": 1.947449768160742, "grad_norm": 0.5907958566056928, "learning_rate": 5e-06, "loss": 0.6062, "step": 630 }, { "epoch": 1.9783616692426584, "grad_norm": 0.5292904118235623, "learning_rate": 5e-06, "loss": 0.6121, "step": 640 }, { "epoch": 2.0, "eval_loss": 0.6483988761901855, "eval_runtime": 114.5876, "eval_samples_per_second": 76.055, "eval_steps_per_second": 0.602, "step": 647 }, { "epoch": 2.009273570324575, "grad_norm": 1.0140873239994894, "learning_rate": 5e-06, "loss": 0.597, "step": 650 }, { "epoch": 2.0401854714064913, "grad_norm": 0.630334813080006, "learning_rate": 5e-06, "loss": 0.5614, "step": 660 }, { "epoch": 2.071097372488408, "grad_norm": 0.7038598022498512, "learning_rate": 5e-06, "loss": 0.5613, "step": 670 }, { "epoch": 2.1020092735703244, "grad_norm": 0.625422416199728, "learning_rate": 5e-06, "loss": 0.5652, "step": 680 }, { "epoch": 2.132921174652241, "grad_norm": 0.6504517407003202, "learning_rate": 5e-06, "loss": 0.5646, "step": 690 }, { "epoch": 2.1638330757341575, "grad_norm": 0.6005779084691315, "learning_rate": 5e-06, "loss": 0.5635, "step": 700 }, { "epoch": 2.1947449768160743, "grad_norm": 0.5153825857145766, "learning_rate": 5e-06, "loss": 0.5642, "step": 710 }, { "epoch": 2.2256568778979906, "grad_norm": 0.6609026821979475, "learning_rate": 5e-06, "loss": 0.5636, "step": 720 }, { "epoch": 2.2565687789799074, "grad_norm": 0.7763886247938674, "learning_rate": 5e-06, "loss": 0.5646, "step": 730 }, { "epoch": 2.2874806800618237, "grad_norm": 0.5940744564359401, "learning_rate": 5e-06, "loss": 0.566, "step": 740 }, { "epoch": 2.3183925811437405, "grad_norm": 0.8987915044882564, "learning_rate": 5e-06, "loss": 0.5639, "step": 750 }, { "epoch": 2.349304482225657, "grad_norm": 0.7853307481543791, "learning_rate": 5e-06, "loss": 0.5652, "step": 760 }, { "epoch": 2.3802163833075736, "grad_norm": 0.5555357862981527, "learning_rate": 5e-06, "loss": 0.5695, "step": 770 }, { "epoch": 2.41112828438949, "grad_norm": 0.5858397000016529, "learning_rate": 5e-06, "loss": 0.5674, "step": 780 }, { "epoch": 2.4420401854714067, "grad_norm": 0.5809649704126684, "learning_rate": 5e-06, "loss": 0.5666, "step": 790 }, { "epoch": 2.472952086553323, "grad_norm": 0.6079171026499909, "learning_rate": 5e-06, "loss": 0.5662, "step": 800 }, { "epoch": 2.5038639876352393, "grad_norm": 0.6460104668614653, "learning_rate": 5e-06, "loss": 0.5724, "step": 810 }, { "epoch": 2.534775888717156, "grad_norm": 0.5036700308469926, "learning_rate": 5e-06, "loss": 0.5689, "step": 820 }, { "epoch": 2.565687789799073, "grad_norm": 0.5720133645014428, "learning_rate": 5e-06, "loss": 0.5694, "step": 830 }, { "epoch": 2.596599690880989, "grad_norm": 0.4978320121327588, "learning_rate": 5e-06, "loss": 0.5688, "step": 840 }, { "epoch": 2.6275115919629055, "grad_norm": 0.5742589206659079, "learning_rate": 5e-06, "loss": 0.5677, "step": 850 }, { "epoch": 2.6584234930448223, "grad_norm": 0.8122081568789994, "learning_rate": 5e-06, "loss": 0.5685, "step": 860 }, { "epoch": 2.689335394126739, "grad_norm": 0.5964179415080593, "learning_rate": 5e-06, "loss": 0.5695, "step": 870 }, { "epoch": 2.7202472952086554, "grad_norm": 0.5560894480978509, "learning_rate": 5e-06, "loss": 0.5718, "step": 880 }, { "epoch": 2.7511591962905717, "grad_norm": 0.5757117113268359, "learning_rate": 5e-06, "loss": 0.5696, "step": 890 }, { "epoch": 2.7820710973724885, "grad_norm": 0.5435787690358811, "learning_rate": 5e-06, "loss": 0.5676, "step": 900 }, { "epoch": 2.812982998454405, "grad_norm": 0.6300976369973321, "learning_rate": 5e-06, "loss": 0.5725, "step": 910 }, { "epoch": 2.8438948995363216, "grad_norm": 0.6341460273574823, "learning_rate": 5e-06, "loss": 0.5699, "step": 920 }, { "epoch": 2.874806800618238, "grad_norm": 0.6961502333247155, "learning_rate": 5e-06, "loss": 0.5694, "step": 930 }, { "epoch": 2.9057187017001547, "grad_norm": 0.5370380886104864, "learning_rate": 5e-06, "loss": 0.5712, "step": 940 }, { "epoch": 2.936630602782071, "grad_norm": 0.595358378564684, "learning_rate": 5e-06, "loss": 0.5691, "step": 950 }, { "epoch": 2.9675425038639878, "grad_norm": 0.5413573519766721, "learning_rate": 5e-06, "loss": 0.5679, "step": 960 }, { "epoch": 2.9953632148377123, "eval_loss": 0.6511133909225464, "eval_runtime": 110.2332, "eval_samples_per_second": 79.06, "eval_steps_per_second": 0.626, "step": 969 }, { "epoch": 2.9953632148377123, "step": 969, "total_flos": 1622692331520000.0, "train_loss": 0.6248709833166793, "train_runtime": 16382.8374, "train_samples_per_second": 30.32, "train_steps_per_second": 0.059 } ], "logging_steps": 10, "max_steps": 969, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1622692331520000.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }