{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.01301959014330229, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0002603918028660458, "grad_norm": 0.4500846266746521, "learning_rate": 5.194805194805195e-06, "loss": 1.0381, "step": 10 }, { "epoch": 0.0005207836057320916, "grad_norm": 0.35188010334968567, "learning_rate": 1.038961038961039e-05, "loss": 1.0108, "step": 20 }, { "epoch": 0.0007811754085981374, "grad_norm": 0.2300374060869217, "learning_rate": 1.5584415584415583e-05, "loss": 0.9668, "step": 30 }, { "epoch": 0.0010415672114641832, "grad_norm": 0.16189467906951904, "learning_rate": 2.077922077922078e-05, "loss": 0.918, "step": 40 }, { "epoch": 0.001301959014330229, "grad_norm": 0.18843211233615875, "learning_rate": 2.5974025974025972e-05, "loss": 0.9265, "step": 50 }, { "epoch": 0.0015623508171962747, "grad_norm": 0.20334510505199432, "learning_rate": 3.1168831168831166e-05, "loss": 0.9234, "step": 60 }, { "epoch": 0.0018227426200623205, "grad_norm": 0.1745327115058899, "learning_rate": 3.6363636363636364e-05, "loss": 0.881, "step": 70 }, { "epoch": 0.0020831344229283663, "grad_norm": 0.18667331337928772, "learning_rate": 4.155844155844156e-05, "loss": 0.8592, "step": 80 }, { "epoch": 0.002343526225794412, "grad_norm": 0.1848158985376358, "learning_rate": 4.675324675324675e-05, "loss": 0.8537, "step": 90 }, { "epoch": 0.002603918028660458, "grad_norm": 0.17589879035949707, "learning_rate": 5.1948051948051944e-05, "loss": 0.8518, "step": 100 }, { "epoch": 0.0028643098315265037, "grad_norm": 0.2132624089717865, "learning_rate": 5.714285714285714e-05, "loss": 0.8511, "step": 110 }, { "epoch": 0.0031247016343925495, "grad_norm": 0.23070092499256134, "learning_rate": 6.233766233766233e-05, "loss": 0.7975, "step": 120 }, { "epoch": 0.0033850934372585953, "grad_norm": 0.25368157029151917, "learning_rate": 6.753246753246754e-05, "loss": 0.8134, "step": 130 }, { "epoch": 0.003645485240124641, "grad_norm": 0.22897231578826904, "learning_rate": 7.272727272727273e-05, "loss": 0.8322, "step": 140 }, { "epoch": 0.003905877042990687, "grad_norm": 0.19932536780834198, "learning_rate": 7.792207792207793e-05, "loss": 0.7959, "step": 150 }, { "epoch": 0.004166268845856733, "grad_norm": 0.21011792123317719, "learning_rate": 8.311688311688312e-05, "loss": 0.8102, "step": 160 }, { "epoch": 0.004426660648722778, "grad_norm": 0.20594824850559235, "learning_rate": 8.831168831168831e-05, "loss": 0.8128, "step": 170 }, { "epoch": 0.004687052451588824, "grad_norm": 0.20465536415576935, "learning_rate": 9.35064935064935e-05, "loss": 0.7989, "step": 180 }, { "epoch": 0.00494744425445487, "grad_norm": 0.4109392762184143, "learning_rate": 9.870129870129871e-05, "loss": 0.8108, "step": 190 }, { "epoch": 0.005207836057320916, "grad_norm": 0.4293076694011688, "learning_rate": 0.00010389610389610389, "loss": 0.8101, "step": 200 }, { "epoch": 0.005468227860186962, "grad_norm": 0.31628963351249695, "learning_rate": 0.00010909090909090909, "loss": 0.7989, "step": 210 }, { "epoch": 0.005728619663053007, "grad_norm": 0.24642810225486755, "learning_rate": 0.00011428571428571428, "loss": 0.7751, "step": 220 }, { "epoch": 0.005989011465919053, "grad_norm": 0.3599106967449188, "learning_rate": 0.00011948051948051949, "loss": 0.8063, "step": 230 }, { "epoch": 0.006249403268785099, "grad_norm": 0.17053447663784027, "learning_rate": 0.00012467532467532467, "loss": 0.7751, "step": 240 }, { "epoch": 0.006509795071651145, "grad_norm": 0.17303769290447235, "learning_rate": 0.00012987012987012987, "loss": 0.7883, "step": 250 }, { "epoch": 0.0067701868745171905, "grad_norm": 0.1815861016511917, "learning_rate": 0.00013506493506493507, "loss": 0.788, "step": 260 }, { "epoch": 0.007030578677383236, "grad_norm": 0.24125365912914276, "learning_rate": 0.00014025974025974028, "loss": 0.8018, "step": 270 }, { "epoch": 0.007290970480249282, "grad_norm": 0.19443446397781372, "learning_rate": 0.00014545454545454546, "loss": 0.7908, "step": 280 }, { "epoch": 0.007551362283115328, "grad_norm": 0.17829768359661102, "learning_rate": 0.00015064935064935066, "loss": 0.8033, "step": 290 }, { "epoch": 0.007811754085981374, "grad_norm": 0.19535653293132782, "learning_rate": 0.00015584415584415587, "loss": 0.7997, "step": 300 }, { "epoch": 0.008072145888847419, "grad_norm": 0.19930541515350342, "learning_rate": 0.00016103896103896104, "loss": 0.7945, "step": 310 }, { "epoch": 0.008332537691713465, "grad_norm": 0.2156297266483307, "learning_rate": 0.00016623376623376625, "loss": 0.8018, "step": 320 }, { "epoch": 0.00859292949457951, "grad_norm": 0.1924206018447876, "learning_rate": 0.00017142857142857143, "loss": 0.7746, "step": 330 }, { "epoch": 0.008853321297445557, "grad_norm": 0.2294880747795105, "learning_rate": 0.00017662337662337663, "loss": 0.8152, "step": 340 }, { "epoch": 0.009113713100311602, "grad_norm": 0.16817067563533783, "learning_rate": 0.00018181818181818183, "loss": 0.7972, "step": 350 }, { "epoch": 0.009374104903177648, "grad_norm": 0.18544812500476837, "learning_rate": 0.000187012987012987, "loss": 0.7801, "step": 360 }, { "epoch": 0.009634496706043693, "grad_norm": 0.19597066938877106, "learning_rate": 0.00019220779220779222, "loss": 0.7706, "step": 370 }, { "epoch": 0.00989488850890974, "grad_norm": 0.40291881561279297, "learning_rate": 0.00019740259740259742, "loss": 0.7911, "step": 380 }, { "epoch": 0.010155280311775785, "grad_norm": 0.23841074109077454, "learning_rate": 0.00019999996515752773, "loss": 0.7861, "step": 390 }, { "epoch": 0.010415672114641832, "grad_norm": 0.1675388514995575, "learning_rate": 0.00019999968641789507, "loss": 0.788, "step": 400 }, { "epoch": 0.010676063917507876, "grad_norm": 1.8860758543014526, "learning_rate": 0.0001999991289394067, "loss": 0.7632, "step": 410 }, { "epoch": 0.010936455720373923, "grad_norm": 0.17022117972373962, "learning_rate": 0.00019999829272361654, "loss": 0.784, "step": 420 }, { "epoch": 0.011196847523239968, "grad_norm": 0.21460269391536713, "learning_rate": 0.00019999717777285545, "loss": 0.761, "step": 430 }, { "epoch": 0.011457239326106015, "grad_norm": 0.19413785636425018, "learning_rate": 0.00019999578409023126, "loss": 0.7772, "step": 440 }, { "epoch": 0.01171763112897206, "grad_norm": 0.20223405957221985, "learning_rate": 0.00019999411167962868, "loss": 0.7811, "step": 450 }, { "epoch": 0.011978022931838106, "grad_norm": 0.15166303515434265, "learning_rate": 0.00019999216054570942, "loss": 0.7709, "step": 460 }, { "epoch": 0.012238414734704151, "grad_norm": 0.16307081282138824, "learning_rate": 0.00019998993069391205, "loss": 0.7811, "step": 470 }, { "epoch": 0.012498806537570198, "grad_norm": 0.15996049344539642, "learning_rate": 0.00019998742213045206, "loss": 0.7599, "step": 480 }, { "epoch": 0.012759198340436243, "grad_norm": 0.17560279369354248, "learning_rate": 0.00019998463486232179, "loss": 0.7572, "step": 490 }, { "epoch": 0.01301959014330229, "grad_norm": 0.17571642994880676, "learning_rate": 0.0001999815688972905, "loss": 0.7643, "step": 500 } ], "logging_steps": 10, "max_steps": 19202, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.3458366881792e+17, "train_batch_size": 5, "trial_name": null, "trial_params": null }