{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9977827050997783, "eval_steps": 500, "global_step": 225, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004434589800443459, "grad_norm": 1.91265869140625, "learning_rate": 4.347826086956522e-05, "loss": 2.8127, "step": 1 }, { "epoch": 0.022172949002217297, "grad_norm": 1.5314122438430786, "learning_rate": 0.0002173913043478261, "loss": 2.7241, "step": 5 }, { "epoch": 0.04434589800443459, "grad_norm": 0.6431057453155518, "learning_rate": 0.0004347826086956522, "loss": 2.2423, "step": 10 }, { "epoch": 0.06651884700665188, "grad_norm": 0.5257381200790405, "learning_rate": 0.0006521739130434783, "loss": 1.9505, "step": 15 }, { "epoch": 0.08869179600886919, "grad_norm": 0.37703999876976013, "learning_rate": 0.0008695652173913044, "loss": 1.7881, "step": 20 }, { "epoch": 0.11086474501108648, "grad_norm": 0.30256885290145874, "learning_rate": 0.0009900990099009901, "loss": 1.7031, "step": 25 }, { "epoch": 0.13303769401330376, "grad_norm": 0.3443244993686676, "learning_rate": 0.0009653465346534653, "loss": 1.6352, "step": 30 }, { "epoch": 0.15521064301552107, "grad_norm": 0.369827538728714, "learning_rate": 0.0009405940594059406, "loss": 1.5746, "step": 35 }, { "epoch": 0.17738359201773837, "grad_norm": 0.231527641415596, "learning_rate": 0.0009158415841584159, "loss": 1.5409, "step": 40 }, { "epoch": 0.19955654101995565, "grad_norm": 0.22827404737472534, "learning_rate": 0.0008910891089108911, "loss": 1.5187, "step": 45 }, { "epoch": 0.22172949002217296, "grad_norm": 0.2396710067987442, "learning_rate": 0.0008663366336633663, "loss": 1.5128, "step": 50 }, { "epoch": 0.24390243902439024, "grad_norm": 0.20095600187778473, "learning_rate": 0.0008415841584158416, "loss": 1.4848, "step": 55 }, { "epoch": 0.2660753880266075, "grad_norm": 0.28900983929634094, "learning_rate": 0.0008168316831683168, "loss": 1.4962, "step": 60 }, { "epoch": 0.28824833702882485, "grad_norm": 0.25716254115104675, "learning_rate": 0.0007920792079207921, "loss": 1.4789, "step": 65 }, { "epoch": 0.31042128603104213, "grad_norm": 0.252340167760849, "learning_rate": 0.0007673267326732674, "loss": 1.458, "step": 70 }, { "epoch": 0.3325942350332594, "grad_norm": 0.20464155077934265, "learning_rate": 0.0007425742574257426, "loss": 1.4558, "step": 75 }, { "epoch": 0.35476718403547675, "grad_norm": 0.23394732177257538, "learning_rate": 0.0007178217821782178, "loss": 1.4562, "step": 80 }, { "epoch": 0.376940133037694, "grad_norm": 0.2164139449596405, "learning_rate": 0.000693069306930693, "loss": 1.4338, "step": 85 }, { "epoch": 0.3991130820399113, "grad_norm": 0.215862438082695, "learning_rate": 0.0006683168316831684, "loss": 1.4287, "step": 90 }, { "epoch": 0.4212860310421286, "grad_norm": 0.20270515978336334, "learning_rate": 0.0006435643564356436, "loss": 1.4226, "step": 95 }, { "epoch": 0.4434589800443459, "grad_norm": 0.20255711674690247, "learning_rate": 0.0006188118811881188, "loss": 1.4314, "step": 100 }, { "epoch": 0.4656319290465632, "grad_norm": 0.20747065544128418, "learning_rate": 0.000594059405940594, "loss": 1.4194, "step": 105 }, { "epoch": 0.4878048780487805, "grad_norm": 0.2104884535074234, "learning_rate": 0.0005693069306930693, "loss": 1.4106, "step": 110 }, { "epoch": 0.5099778270509978, "grad_norm": 0.21514882147312164, "learning_rate": 0.0005445544554455446, "loss": 1.42, "step": 115 }, { "epoch": 0.532150776053215, "grad_norm": 0.20466424524784088, "learning_rate": 0.0005198019801980198, "loss": 1.3937, "step": 120 }, { "epoch": 0.5543237250554324, "grad_norm": 0.2181282341480255, "learning_rate": 0.0004950495049504951, "loss": 1.3972, "step": 125 }, { "epoch": 0.5764966740576497, "grad_norm": 0.22615699470043182, "learning_rate": 0.0004702970297029703, "loss": 1.3882, "step": 130 }, { "epoch": 0.5986696230598669, "grad_norm": 0.1967965066432953, "learning_rate": 0.00044554455445544556, "loss": 1.388, "step": 135 }, { "epoch": 0.6208425720620843, "grad_norm": 0.2030034065246582, "learning_rate": 0.0004207920792079208, "loss": 1.4048, "step": 140 }, { "epoch": 0.6430155210643016, "grad_norm": 0.2136310189962387, "learning_rate": 0.00039603960396039607, "loss": 1.3918, "step": 145 }, { "epoch": 0.6651884700665188, "grad_norm": 0.22149060666561127, "learning_rate": 0.0003712871287128713, "loss": 1.4023, "step": 150 }, { "epoch": 0.6873614190687362, "grad_norm": 0.2130667269229889, "learning_rate": 0.0003465346534653465, "loss": 1.3933, "step": 155 }, { "epoch": 0.7095343680709535, "grad_norm": 0.19920696318149567, "learning_rate": 0.0003217821782178218, "loss": 1.3815, "step": 160 }, { "epoch": 0.7317073170731707, "grad_norm": 0.20453611016273499, "learning_rate": 0.000297029702970297, "loss": 1.3648, "step": 165 }, { "epoch": 0.753880266075388, "grad_norm": 0.21325863897800446, "learning_rate": 0.0002722772277227723, "loss": 1.3773, "step": 170 }, { "epoch": 0.7760532150776053, "grad_norm": 0.2014823704957962, "learning_rate": 0.00024752475247524753, "loss": 1.3881, "step": 175 }, { "epoch": 0.7982261640798226, "grad_norm": 0.20359407365322113, "learning_rate": 0.00022277227722772278, "loss": 1.3826, "step": 180 }, { "epoch": 0.8203991130820399, "grad_norm": 0.21738748252391815, "learning_rate": 0.00019801980198019803, "loss": 1.3705, "step": 185 }, { "epoch": 0.8425720620842572, "grad_norm": 0.1990172564983368, "learning_rate": 0.00017326732673267326, "loss": 1.3693, "step": 190 }, { "epoch": 0.8647450110864745, "grad_norm": 0.2007543295621872, "learning_rate": 0.0001485148514851485, "loss": 1.3575, "step": 195 }, { "epoch": 0.8869179600886918, "grad_norm": 0.5149243474006653, "learning_rate": 0.00012376237623762376, "loss": 1.374, "step": 200 }, { "epoch": 0.9090909090909091, "grad_norm": 0.2131042778491974, "learning_rate": 9.900990099009902e-05, "loss": 1.3636, "step": 205 }, { "epoch": 0.9312638580931264, "grad_norm": 0.19097404181957245, "learning_rate": 7.425742574257426e-05, "loss": 1.3489, "step": 210 }, { "epoch": 0.9534368070953437, "grad_norm": 0.19905418157577515, "learning_rate": 4.950495049504951e-05, "loss": 1.3442, "step": 215 }, { "epoch": 0.975609756097561, "grad_norm": 0.19617854058742523, "learning_rate": 2.4752475247524754e-05, "loss": 1.3721, "step": 220 }, { "epoch": 0.9977827050997783, "grad_norm": 0.20064575970172882, "learning_rate": 0.0, "loss": 1.3767, "step": 225 }, { "epoch": 0.9977827050997783, "eval_loss": 1.7732421159744263, "eval_runtime": 0.5415, "eval_samples_per_second": 1.847, "eval_steps_per_second": 1.847, "step": 225 }, { "epoch": 0.9977827050997783, "step": 225, "total_flos": 3.3259687694984806e+17, "train_loss": 1.4963340536753336, "train_runtime": 725.2803, "train_samples_per_second": 9.934, "train_steps_per_second": 0.31 } ], "logging_steps": 5, "max_steps": 225, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.3259687694984806e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }