{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.3356380479291132, "eval_steps": 500, "global_step": 5000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006712760958582265, "grad_norm": 1.0528477430343628, "learning_rate": 0.00014985229447149842, "loss": 7.6572, "step": 100 }, { "epoch": 0.01342552191716453, "grad_norm": 1.1515228748321533, "learning_rate": 0.00014940916946874937, "loss": 6.1239, "step": 200 }, { "epoch": 0.020138282875746795, "grad_norm": 1.2438573837280273, "learning_rate": 0.00014867237372557577, "loss": 5.4067, "step": 300 }, { "epoch": 0.02685104383432906, "grad_norm": 1.3082321882247925, "learning_rate": 0.00014764481515444297, "loss": 5.0934, "step": 400 }, { "epoch": 0.03356380479291132, "grad_norm": 1.4851253032684326, "learning_rate": 0.00014633054922174807, "loss": 4.8669, "step": 500 }, { "epoch": 0.04027656575149359, "grad_norm": 1.3247835636138916, "learning_rate": 0.00014473476294210664, "loss": 4.7151, "step": 600 }, { "epoch": 0.04698932671007586, "grad_norm": 1.5466852188110352, "learning_rate": 0.0001428637544067573, "loss": 4.5684, "step": 700 }, { "epoch": 0.05370208766865812, "grad_norm": 1.3418868780136108, "learning_rate": 0.0001407249079268789, "loss": 4.4861, "step": 800 }, { "epoch": 0.060414848627240385, "grad_norm": 1.4495049715042114, "learning_rate": 0.0001383266648899225, "loss": 4.3896, "step": 900 }, { "epoch": 0.06712760958582265, "grad_norm": 1.2629677057266235, "learning_rate": 0.0001356784904439796, "loss": 4.3076, "step": 1000 }, { "epoch": 0.07384037054440491, "grad_norm": 1.382216215133667, "learning_rate": 0.00013279083614167278, "loss": 4.2179, "step": 1100 }, { "epoch": 0.08055313150298718, "grad_norm": 1.2883789539337158, "learning_rate": 0.00012967509869100336, "loss": 4.1599, "step": 1200 }, { "epoch": 0.08726589246156945, "grad_norm": 1.3527660369873047, "learning_rate": 0.00012634357497595263, "loss": 4.0976, "step": 1300 }, { "epoch": 0.09397865342015171, "grad_norm": 1.3394412994384766, "learning_rate": 0.00012280941352435837, "loss": 4.0805, "step": 1400 }, { "epoch": 0.10069141437873397, "grad_norm": 1.4646199941635132, "learning_rate": 0.00011908656261460721, "loss": 4.0032, "step": 1500 }, { "epoch": 0.10740417533731624, "grad_norm": 1.2548878192901611, "learning_rate": 0.00011518971522595105, "loss": 3.9702, "step": 1600 }, { "epoch": 0.1141169362958985, "grad_norm": 1.363207221031189, "learning_rate": 0.00011113425104971176, "loss": 3.9321, "step": 1700 }, { "epoch": 0.12082969725448077, "grad_norm": 1.3911628723144531, "learning_rate": 0.00010693617579023885, "loss": 3.8974, "step": 1800 }, { "epoch": 0.12754245821306304, "grad_norm": 1.3630716800689697, "learning_rate": 0.00010261205799518043, "loss": 3.8514, "step": 1900 }, { "epoch": 0.1342552191716453, "grad_norm": 1.2687169313430786, "learning_rate": 9.817896366438074e-05, "loss": 3.818, "step": 2000 }, { "epoch": 0.14096798013022757, "grad_norm": 1.3437057733535767, "learning_rate": 9.36543888954819e-05, "loss": 3.8071, "step": 2100 }, { "epoch": 0.14768074108880982, "grad_norm": 1.3673392534255981, "learning_rate": 8.905619083205881e-05, "loss": 3.7842, "step": 2200 }, { "epoch": 0.1543935020473921, "grad_norm": 1.2775851488113403, "learning_rate": 8.440251718681331e-05, "loss": 3.7666, "step": 2300 }, { "epoch": 0.16110626300597436, "grad_norm": 1.382295846939087, "learning_rate": 7.971173461797922e-05, "loss": 3.679, "step": 2400 }, { "epoch": 0.1678190239645566, 
"grad_norm": 1.269216775894165, "learning_rate": 7.500235624161463e-05, "loss": 3.7059, "step": 2500 }, { "epoch": 0.1745317849231389, "grad_norm": 1.343229055404663, "learning_rate": 7.029296856586897e-05, "loss": 3.6681, "step": 2600 }, { "epoch": 0.18124454588172115, "grad_norm": 1.3436596393585205, "learning_rate": 6.560215813559365e-05, "loss": 3.6667, "step": 2700 }, { "epoch": 0.18795730684030343, "grad_norm": 1.3680791854858398, "learning_rate": 6.094843817680749e-05, "loss": 3.6359, "step": 2800 }, { "epoch": 0.19467006779888568, "grad_norm": 1.3404663801193237, "learning_rate": 5.635017553053002e-05, "loss": 3.6386, "step": 2900 }, { "epoch": 0.20138282875746794, "grad_norm": 1.2906004190444946, "learning_rate": 5.1825518164352286e-05, "loss": 3.6036, "step": 3000 }, { "epoch": 0.20809558971605022, "grad_norm": 1.585774540901184, "learning_rate": 4.7392323547835926e-05, "loss": 3.6075, "step": 3100 }, { "epoch": 0.21480835067463247, "grad_norm": 1.3589500188827515, "learning_rate": 4.306808817442085e-05, "loss": 3.5855, "step": 3200 }, { "epoch": 0.22152111163321475, "grad_norm": 1.4228436946868896, "learning_rate": 3.886987850799785e-05, "loss": 3.5662, "step": 3300 }, { "epoch": 0.228233872591797, "grad_norm": 1.4741946458816528, "learning_rate": 3.481426362667913e-05, "loss": 3.5487, "step": 3400 }, { "epoch": 0.23494663355037926, "grad_norm": 1.4674755334854126, "learning_rate": 3.0917249829602016e-05, "loss": 3.5521, "step": 3500 }, { "epoch": 0.24165939450896154, "grad_norm": 1.4647969007492065, "learning_rate": 2.7194217464852685e-05, "loss": 3.5201, "step": 3600 }, { "epoch": 0.2483721554675438, "grad_norm": 1.427472710609436, "learning_rate": 2.3659860227831684e-05, "loss": 3.5241, "step": 3700 }, { "epoch": 0.2550849164261261, "grad_norm": 1.4548050165176392, "learning_rate": 2.0328127169632178e-05, "loss": 3.5244, "step": 3800 }, { "epoch": 0.2617976773847083, "grad_norm": 1.442591667175293, "learning_rate": 1.721216764430666e-05, "loss": 3.4858, "step": 3900 }, { "epoch": 0.2685104383432906, "grad_norm": 1.4537155628204346, "learning_rate": 1.4324279412298778e-05, "loss": 3.5342, "step": 4000 }, { "epoch": 0.27522319930187283, "grad_norm": 1.4559599161148071, "learning_rate": 1.1675860104860355e-05, "loss": 3.4991, "step": 4100 }, { "epoch": 0.28193596026045514, "grad_norm": 1.4067513942718506, "learning_rate": 9.277362241009129e-06, "loss": 3.4908, "step": 4200 }, { "epoch": 0.2886487212190374, "grad_norm": 1.4080324172973633, "learning_rate": 7.138251974562107e-06, "loss": 3.475, "step": 4300 }, { "epoch": 0.29536148217761965, "grad_norm": 1.4084968566894531, "learning_rate": 5.266971734057087e-06, "loss": 3.4902, "step": 4400 }, { "epoch": 0.3020742431362019, "grad_norm": 1.7112301588058472, "learning_rate": 3.6709069030119e-06, "loss": 3.4763, "step": 4500 }, { "epoch": 0.3087870040947842, "grad_norm": 1.4443076848983765, "learning_rate": 2.3563566720242454e-06, "loss": 3.4628, "step": 4600 }, { "epoch": 0.31549976505336647, "grad_norm": 1.5065573453903198, "learning_rate": 1.3285091777500451e-06, "loss": 3.4836, "step": 4700 }, { "epoch": 0.3222125260119487, "grad_norm": 1.3545228242874146, "learning_rate": 5.914210268796349e-07, "loss": 3.4876, "step": 4800 }, { "epoch": 0.328925286970531, "grad_norm": 1.3587461709976196, "learning_rate": 1.4800128592414585e-07, "loss": 3.4803, "step": 4900 }, { "epoch": 0.3356380479291132, "grad_norm": 1.4858622550964355, "learning_rate": 0.0, "loss": 3.4479, "step": 5000 } ], "logging_steps": 100, "max_steps": 5000, 
"num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.299915472896e+16, "train_batch_size": 16, "trial_name": null, "trial_params": null }