{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 11895, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008406893652795292, "grad_norm": 0.14043055474758148, "learning_rate": 2.0000000000000003e-06, "loss": 2.4104, "step": 100 }, { "epoch": 0.016813787305590584, "grad_norm": 0.2322196364402771, "learning_rate": 4.000000000000001e-06, "loss": 2.4058, "step": 200 }, { "epoch": 0.025220680958385876, "grad_norm": 0.330422580242157, "learning_rate": 6e-06, "loss": 2.3956, "step": 300 }, { "epoch": 0.03362757461118117, "grad_norm": 0.37281373143196106, "learning_rate": 8.000000000000001e-06, "loss": 2.3817, "step": 400 }, { "epoch": 0.042034468263976464, "grad_norm": 0.4371340274810791, "learning_rate": 1e-05, "loss": 2.2819, "step": 500 }, { "epoch": 0.05044136191677175, "grad_norm": 0.5313137173652649, "learning_rate": 1.2e-05, "loss": 2.2971, "step": 600 }, { "epoch": 0.05884825556956705, "grad_norm": 0.5993666648864746, "learning_rate": 1.4e-05, "loss": 2.2486, "step": 700 }, { "epoch": 0.06725514922236234, "grad_norm": 0.885300874710083, "learning_rate": 1.6000000000000003e-05, "loss": 2.2134, "step": 800 }, { "epoch": 0.07566204287515763, "grad_norm": 0.7315155267715454, "learning_rate": 1.8e-05, "loss": 2.1677, "step": 900 }, { "epoch": 0.08406893652795293, "grad_norm": 0.6705694794654846, "learning_rate": 2e-05, "loss": 2.1677, "step": 1000 }, { "epoch": 0.09247583018074822, "grad_norm": 0.6811093688011169, "learning_rate": 1.999584295057038e-05, "loss": 2.1539, "step": 1100 }, { "epoch": 0.1008827238335435, "grad_norm": 0.8088228702545166, "learning_rate": 1.9983375258493504e-05, "loss": 2.1099, "step": 1200 }, { "epoch": 0.1092896174863388, "grad_norm": 0.6950477361679077, "learning_rate": 1.996260728953182e-05, "loss": 2.0899, "step": 1300 }, { "epoch": 0.1176965111391341, "grad_norm": 0.746992290019989, "learning_rate": 1.9933556310380036e-05, "loss": 2.1047, "step": 1400 }, { "epoch": 0.12610340479192939, "grad_norm": 0.9399320483207703, "learning_rate": 1.9896246474309414e-05, "loss": 2.0344, "step": 1500 }, { "epoch": 0.13451029844472467, "grad_norm": 0.8537988066673279, "learning_rate": 1.9850708801086507e-05, "loss": 2.0576, "step": 1600 }, { "epoch": 0.14291719209751996, "grad_norm": 0.8270643949508667, "learning_rate": 1.9796981151183013e-05, "loss": 2.019, "step": 1700 }, { "epoch": 0.15132408575031525, "grad_norm": 1.1590845584869385, "learning_rate": 1.9735108194298206e-05, "loss": 2.0348, "step": 1800 }, { "epoch": 0.15973097940311054, "grad_norm": 0.8914215564727783, "learning_rate": 1.9665141372220112e-05, "loss": 2.0314, "step": 1900 }, { "epoch": 0.16813787305590586, "grad_norm": 0.7579370141029358, "learning_rate": 1.9587138856056303e-05, "loss": 2.0636, "step": 2000 }, { "epoch": 0.17654476670870115, "grad_norm": 0.9895981550216675, "learning_rate": 1.9501165497869832e-05, "loss": 1.9961, "step": 2100 }, { "epoch": 0.18495166036149643, "grad_norm": 0.9374507069587708, "learning_rate": 1.940729277676063e-05, "loss": 1.9923, "step": 2200 }, { "epoch": 0.19335855401429172, "grad_norm": 0.9945927858352661, "learning_rate": 1.930559873943704e-05, "loss": 1.9944, "step": 2300 }, { "epoch": 0.201765447667087, "grad_norm": 0.9538114070892334, "learning_rate": 1.9196167935327036e-05, "loss": 2.036, "step": 2400 }, { "epoch": 0.2101723413198823, "grad_norm": 0.780885636806488, "learning_rate": 1.9079091346282977e-05, "loss": 1.9894, "step": 2500 }, { "epoch": 0.2185792349726776, "grad_norm": 1.2008519172668457, "learning_rate": 1.8954466310938405e-05, "loss": 1.9837, "step": 2600 }, { "epoch": 0.22698612862547288, "grad_norm": 1.089125394821167, "learning_rate": 1.8822396443779745e-05, "loss": 2.016, "step": 2700 }, { "epoch": 0.2353930222782682, "grad_norm": 1.087217092514038, "learning_rate": 1.868299154900018e-05, "loss": 2.0407, "step": 2800 }, { "epoch": 0.24379991593106348, "grad_norm": 0.9000464677810669, "learning_rate": 1.8536367529207375e-05, "loss": 1.9954, "step": 2900 }, { "epoch": 0.25220680958385877, "grad_norm": 0.7694302201271057, "learning_rate": 1.83826462890609e-05, "loss": 1.9973, "step": 3000 }, { "epoch": 0.26061370323665406, "grad_norm": 0.8629293441772461, "learning_rate": 1.8221955633919495e-05, "loss": 1.9994, "step": 3100 }, { "epoch": 0.26902059688944935, "grad_norm": 0.839648962020874, "learning_rate": 1.8054429163582415e-05, "loss": 2.003, "step": 3200 }, { "epoch": 0.27742749054224464, "grad_norm": 0.8285431861877441, "learning_rate": 1.7880206161213255e-05, "loss": 1.9925, "step": 3300 }, { "epoch": 0.2858343841950399, "grad_norm": 0.837740421295166, "learning_rate": 1.7699431477538534e-05, "loss": 1.9801, "step": 3400 }, { "epoch": 0.2942412778478352, "grad_norm": 0.8980201482772827, "learning_rate": 1.7512255410417392e-05, "loss": 1.9764, "step": 3500 }, { "epoch": 0.3026481715006305, "grad_norm": 1.3097134828567505, "learning_rate": 1.731883357988244e-05, "loss": 1.9771, "step": 3600 }, { "epoch": 0.3110550651534258, "grad_norm": 0.7765768766403198, "learning_rate": 1.7119326798755734e-05, "loss": 1.9589, "step": 3700 }, { "epoch": 0.3194619588062211, "grad_norm": 1.0500297546386719, "learning_rate": 1.6913900938947417e-05, "loss": 1.9478, "step": 3800 }, { "epoch": 0.32786885245901637, "grad_norm": 0.9133660793304443, "learning_rate": 1.6702726793548153e-05, "loss": 1.9782, "step": 3900 }, { "epoch": 0.3362757461118117, "grad_norm": 0.9790305495262146, "learning_rate": 1.6485979934830084e-05, "loss": 1.9712, "step": 4000 }, { "epoch": 0.344682639764607, "grad_norm": 1.1819267272949219, "learning_rate": 1.626384056827429e-05, "loss": 1.9597, "step": 4100 }, { "epoch": 0.3530895334174023, "grad_norm": 1.196151852607727, "learning_rate": 1.6036493382746178e-05, "loss": 1.9571, "step": 4200 }, { "epoch": 0.3614964270701976, "grad_norm": 0.8918384909629822, "learning_rate": 1.580412739694333e-05, "loss": 1.9769, "step": 4300 }, { "epoch": 0.36990332072299287, "grad_norm": 2.0516951084136963, "learning_rate": 1.5566935802243496e-05, "loss": 1.9757, "step": 4400 }, { "epoch": 0.37831021437578816, "grad_norm": 0.9879031777381897, "learning_rate": 1.5325115802083373e-05, "loss": 1.9893, "step": 4500 }, { "epoch": 0.38671710802858345, "grad_norm": 0.6235149502754211, "learning_rate": 1.5078868448001704e-05, "loss": 2.008, "step": 4600 }, { "epoch": 0.39512400168137873, "grad_norm": 0.9613019824028015, "learning_rate": 1.4828398472483057e-05, "loss": 1.9669, "step": 4700 }, { "epoch": 0.403530895334174, "grad_norm": 1.0296316146850586, "learning_rate": 1.4573914118741201e-05, "loss": 1.8964, "step": 4800 }, { "epoch": 0.4119377889869693, "grad_norm": 0.8810610771179199, "learning_rate": 1.4315626967583657e-05, "loss": 1.9226, "step": 4900 }, { "epoch": 0.4203446826397646, "grad_norm": 0.9355632066726685, "learning_rate": 1.40537517615013e-05, "loss": 1.9678, "step": 5000 }, { "epoch": 0.4287515762925599, "grad_norm": 0.7143288850784302, "learning_rate": 1.3788506226129348e-05, "loss": 1.9328, "step": 5100 }, { "epoch": 0.4371584699453552, "grad_norm": 0.987920880317688, "learning_rate": 1.3520110889228104e-05, "loss": 1.8968, "step": 5200 }, { "epoch": 0.44556536359815047, "grad_norm": 0.8306659460067749, "learning_rate": 1.3248788897334005e-05, "loss": 1.9294, "step": 5300 }, { "epoch": 0.45397225725094575, "grad_norm": 1.0377694368362427, "learning_rate": 1.2974765830233383e-05, "loss": 1.9835, "step": 5400 }, { "epoch": 0.46237915090374104, "grad_norm": 0.9360339641571045, "learning_rate": 1.269826951341319e-05, "loss": 1.9121, "step": 5500 }, { "epoch": 0.4707860445565364, "grad_norm": 1.0214323997497559, "learning_rate": 1.2419529828644661e-05, "loss": 1.9133, "step": 5600 }, { "epoch": 0.4791929382093317, "grad_norm": 1.1800683736801147, "learning_rate": 1.2138778522857307e-05, "loss": 1.934, "step": 5700 }, { "epoch": 0.48759983186212696, "grad_norm": 1.0064023733139038, "learning_rate": 1.1856249015462242e-05, "loss": 1.9665, "step": 5800 }, { "epoch": 0.49600672551492225, "grad_norm": 1.3952449560165405, "learning_rate": 1.1572176204284986e-05, "loss": 1.9426, "step": 5900 }, { "epoch": 0.5044136191677175, "grad_norm": 1.0473861694335938, "learning_rate": 1.1286796270269076e-05, "loss": 1.9559, "step": 6000 }, { "epoch": 0.5128205128205128, "grad_norm": 1.101931095123291, "learning_rate": 1.1000346481112887e-05, "loss": 1.9103, "step": 6100 }, { "epoch": 0.5212274064733081, "grad_norm": 1.0466363430023193, "learning_rate": 1.0713064994002956e-05, "loss": 1.9395, "step": 6200 }, { "epoch": 0.5296343001261034, "grad_norm": 0.9257377982139587, "learning_rate": 1.0425190657607702e-05, "loss": 1.9141, "step": 6300 }, { "epoch": 0.5380411937788987, "grad_norm": 0.779354453086853, "learning_rate": 1.0136962813496306e-05, "loss": 1.9647, "step": 6400 }, { "epoch": 0.546448087431694, "grad_norm": 0.8354730606079102, "learning_rate": 9.848621097147772e-06, "loss": 2.0283, "step": 6500 }, { "epoch": 0.5548549810844893, "grad_norm": 1.0222142934799194, "learning_rate": 9.560405238715588e-06, "loss": 1.9444, "step": 6600 }, { "epoch": 0.5632618747372846, "grad_norm": 1.3046057224273682, "learning_rate": 9.272554863713739e-06, "loss": 1.9555, "step": 6700 }, { "epoch": 0.5716687683900799, "grad_norm": 0.8095905184745789, "learning_rate": 8.985309293789662e-06, "loss": 1.9761, "step": 6800 }, { "epoch": 0.5800756620428752, "grad_norm": 1.0805299282073975, "learning_rate": 8.698907347749885e-06, "loss": 1.9043, "step": 6900 }, { "epoch": 0.5884825556956704, "grad_norm": 1.3454480171203613, "learning_rate": 8.413587143003691e-06, "loss": 1.902, "step": 7000 }, { "epoch": 0.5968894493484658, "grad_norm": 1.1316404342651367, "learning_rate": 8.129585897589964e-06, "loss": 1.942, "step": 7100 }, { "epoch": 0.605296343001261, "grad_norm": 0.9173849821090698, "learning_rate": 7.847139732951751e-06, "loss": 1.9336, "step": 7200 }, { "epoch": 0.6137032366540563, "grad_norm": 0.8359085321426392, "learning_rate": 7.566483477622577e-06, "loss": 1.9584, "step": 7300 }, { "epoch": 0.6221101303068516, "grad_norm": 1.6915035247802734, "learning_rate": 7.28785047198767e-06, "loss": 1.9113, "step": 7400 }, { "epoch": 0.6305170239596469, "grad_norm": 0.8478715419769287, "learning_rate": 7.011472374282457e-06, "loss": 1.9362, "step": 7500 }, { "epoch": 0.6389239176124422, "grad_norm": 1.5024291276931763, "learning_rate": 6.737578967989626e-06, "loss": 1.9636, "step": 7600 }, { "epoch": 0.6473308112652375, "grad_norm": 0.7754480242729187, "learning_rate": 6.466397970794855e-06, "loss": 1.9107, "step": 7700 }, { "epoch": 0.6557377049180327, "grad_norm": 0.7962978482246399, "learning_rate": 6.198154845260089e-06, "loss": 1.8819, "step": 7800 }, { "epoch": 0.6641445985708281, "grad_norm": 0.9278832077980042, "learning_rate": 5.933072611371725e-06, "loss": 1.9741, "step": 7900 }, { "epoch": 0.6725514922236234, "grad_norm": 1.244887351989746, "learning_rate": 5.671371661119609e-06, "loss": 2.0055, "step": 8000 }, { "epoch": 0.6809583858764187, "grad_norm": 1.1086173057556152, "learning_rate": 5.415832074569403e-06, "loss": 1.8962, "step": 8100 }, { "epoch": 0.689365279529214, "grad_norm": 1.0748802423477173, "learning_rate": 5.161504255249876e-06, "loss": 1.9434, "step": 8200 }, { "epoch": 0.6977721731820092, "grad_norm": 0.7687519788742065, "learning_rate": 4.911199209125536e-06, "loss": 1.95, "step": 8300 }, { "epoch": 0.7061790668348046, "grad_norm": 0.9328936338424683, "learning_rate": 4.665125042286226e-06, "loss": 1.9271, "step": 8400 }, { "epoch": 0.7145859604875998, "grad_norm": 1.0116279125213623, "learning_rate": 4.423486343226934e-06, "loss": 1.8916, "step": 8500 }, { "epoch": 0.7229928541403952, "grad_norm": 0.9574350714683533, "learning_rate": 4.186484012750876e-06, "loss": 2.0135, "step": 8600 }, { "epoch": 0.7313997477931904, "grad_norm": 1.5315290689468384, "learning_rate": 3.954315096938598e-06, "loss": 1.9159, "step": 8700 }, { "epoch": 0.7398066414459857, "grad_norm": 1.2323825359344482, "learning_rate": 3.7271726233219098e-06, "loss": 1.9175, "step": 8800 }, { "epoch": 0.748213535098781, "grad_norm": 0.7751179933547974, "learning_rate": 3.50524544039889e-06, "loss": 1.9278, "step": 8900 }, { "epoch": 0.7566204287515763, "grad_norm": 1.096082329750061, "learning_rate": 3.288718060623376e-06, "loss": 1.9253, "step": 9000 }, { "epoch": 0.7650273224043715, "grad_norm": 0.9929354786872864, "learning_rate": 3.077770506999487e-06, "loss": 1.9252, "step": 9100 }, { "epoch": 0.7734342160571669, "grad_norm": 0.8709311485290527, "learning_rate": 2.872578163408717e-06, "loss": 1.9219, "step": 9200 }, { "epoch": 0.7818411097099621, "grad_norm": 1.3237998485565186, "learning_rate": 2.673311628794043e-06, "loss": 1.9318, "step": 9300 }, { "epoch": 0.7902480033627575, "grad_norm": 1.3490018844604492, "learning_rate": 2.4801365753222795e-06, "loss": 1.9017, "step": 9400 }, { "epoch": 0.7986548970155527, "grad_norm": 1.5231995582580566, "learning_rate": 2.293213610642594e-06, "loss": 1.8539, "step": 9500 }, { "epoch": 0.807061790668348, "grad_norm": 0.9112081527709961, "learning_rate": 2.1126981443557303e-06, "loss": 1.9095, "step": 9600 }, { "epoch": 0.8154686843211434, "grad_norm": 1.3989226818084717, "learning_rate": 1.9387402588049167e-06, "loss": 1.9411, "step": 9700 }, { "epoch": 0.8238755779739386, "grad_norm": 0.9119231104850769, "learning_rate": 1.7714845842959415e-06, "loss": 1.8862, "step": 9800 }, { "epoch": 0.832282471626734, "grad_norm": 1.1409211158752441, "learning_rate": 1.6110701788500682e-06, "loss": 1.897, "step": 9900 }, { "epoch": 0.8406893652795292, "grad_norm": 1.3277921676635742, "learning_rate": 1.4576304125898234e-06, "loss": 1.9049, "step": 10000 }, { "epoch": 0.8490962589323245, "grad_norm": 1.3236161470413208, "learning_rate": 1.311292856853772e-06, "loss": 1.8771, "step": 10100 }, { "epoch": 0.8575031525851198, "grad_norm": 1.0454002618789673, "learning_rate": 1.1721791781324343e-06, "loss": 1.9074, "step": 10200 }, { "epoch": 0.8659100462379151, "grad_norm": 1.196260929107666, "learning_rate": 1.0404050369135698e-06, "loss": 1.9241, "step": 10300 }, { "epoch": 0.8743169398907104, "grad_norm": 1.2088180780410767, "learning_rate": 9.160799915208962e-07, "loss": 1.9161, "step": 10400 }, { "epoch": 0.8827238335435057, "grad_norm": 1.2079304456710815, "learning_rate": 7.993074070262185e-07, "loss": 1.9722, "step": 10500 }, { "epoch": 0.8911307271963009, "grad_norm": 0.9400559067726135, "learning_rate": 6.901843693106935e-07, "loss": 1.972, "step": 10600 }, { "epoch": 0.8995376208490963, "grad_norm": 1.1368870735168457, "learning_rate": 5.888016043466583e-07, "loss": 1.9051, "step": 10700 }, { "epoch": 0.9079445145018915, "grad_norm": 1.0495091676712036, "learning_rate": 4.952434027671659e-07, "loss": 1.8549, "step": 10800 }, { "epoch": 0.9163514081546869, "grad_norm": 0.832502007484436, "learning_rate": 4.095875497859192e-07, "loss": 1.9349, "step": 10900 }, { "epoch": 0.9247583018074821, "grad_norm": 1.295791745185852, "learning_rate": 3.3190526052587545e-07, "loss": 1.9073, "step": 11000 }, { "epoch": 0.9331651954602774, "grad_norm": 0.9404931664466858, "learning_rate": 2.6226112081028544e-07, "loss": 1.9349, "step": 11100 }, { "epoch": 0.9415720891130728, "grad_norm": 0.8282467722892761, "learning_rate": 2.0071303346540505e-07, "loss": 1.9868, "step": 11200 }, { "epoch": 0.949978982765868, "grad_norm": 1.1069015264511108, "learning_rate": 1.473121701795144e-07, "loss": 1.8612, "step": 11300 }, { "epoch": 0.9583858764186634, "grad_norm": 1.072883129119873, "learning_rate": 1.0210292895826423e-07, "loss": 1.8956, "step": 11400 }, { "epoch": 0.9667927700714586, "grad_norm": 1.38330078125, "learning_rate": 6.512289721174126e-08, "loss": 1.9686, "step": 11500 }, { "epoch": 0.9751996637242539, "grad_norm": 1.1821762323379517, "learning_rate": 3.640282050392241e-08, "loss": 1.9082, "step": 11600 }, { "epoch": 0.9836065573770492, "grad_norm": 1.2466627359390259, "learning_rate": 1.5966576990507078e-08, "loss": 1.896, "step": 11700 }, { "epoch": 0.9920134510298445, "grad_norm": 1.2905254364013672, "learning_rate": 3.83115756638297e-09, "loss": 1.9837, "step": 11800 }, { "epoch": 1.0, "step": 11895, "total_flos": 2.17890078425088e+17, "train_loss": 1.9851969016204616, "train_runtime": 3196.2319, "train_samples_per_second": 7.443, "train_steps_per_second": 3.722 } ], "logging_steps": 100, "max_steps": 11895, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.17890078425088e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }