{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9997049277072882, "eval_steps": 500, "global_step": 847, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.011802891708468575, "grad_norm": 1.0318430662155151, "learning_rate": 5.882352941176471e-06, "loss": 2.0002, "step": 10 }, { "epoch": 0.02360578341693715, "grad_norm": 0.7277640104293823, "learning_rate": 1.1764705882352942e-05, "loss": 1.5384, "step": 20 }, { "epoch": 0.03540867512540572, "grad_norm": 0.5723221898078918, "learning_rate": 1.7647058823529414e-05, "loss": 1.2868, "step": 30 }, { "epoch": 0.0472115668338743, "grad_norm": 0.5436434149742126, "learning_rate": 2.3529411764705884e-05, "loss": 1.195, "step": 40 }, { "epoch": 0.05901445854234287, "grad_norm": 0.5486343502998352, "learning_rate": 2.9411764705882354e-05, "loss": 1.099, "step": 50 }, { "epoch": 0.07081735025081144, "grad_norm": 0.540969967842102, "learning_rate": 3.529411764705883e-05, "loss": 1.0529, "step": 60 }, { "epoch": 0.08262024195928003, "grad_norm": 0.5404631495475769, "learning_rate": 4.11764705882353e-05, "loss": 0.9962, "step": 70 }, { "epoch": 0.0944231336677486, "grad_norm": 0.7088269591331482, "learning_rate": 4.705882352941177e-05, "loss": 1.0458, "step": 80 }, { "epoch": 0.10622602537621717, "grad_norm": 0.5803569555282593, "learning_rate": 4.9994688411216076e-05, "loss": 1.0237, "step": 90 }, { "epoch": 0.11802891708468574, "grad_norm": 0.5251590013504028, "learning_rate": 4.99522092422138e-05, "loss": 1.005, "step": 100 }, { "epoch": 0.1298318087931543, "grad_norm": 0.5402054786682129, "learning_rate": 4.986732309873992e-05, "loss": 0.9771, "step": 110 }, { "epoch": 0.14163470050162288, "grad_norm": 0.5150293111801147, "learning_rate": 4.9740174247159156e-05, "loss": 0.9695, "step": 120 }, { "epoch": 0.15343759221009148, "grad_norm": 0.6688190698623657, "learning_rate": 4.95709787804856e-05, "loss": 0.9605, "step": 130 }, { "epoch": 0.16524048391856005, "grad_norm": 0.4473928213119507, "learning_rate": 4.936002425112657e-05, "loss": 0.8765, "step": 140 }, { "epoch": 0.17704337562702863, "grad_norm": 0.5843707919120789, "learning_rate": 4.910766918217935e-05, "loss": 0.9371, "step": 150 }, { "epoch": 0.1888462673354972, "grad_norm": 0.5653939247131348, "learning_rate": 4.881434245811115e-05, "loss": 0.9091, "step": 160 }, { "epoch": 0.20064915904396577, "grad_norm": 0.725297212600708, "learning_rate": 4.8480542595858025e-05, "loss": 0.9217, "step": 170 }, { "epoch": 0.21245205075243434, "grad_norm": 0.4762970805168152, "learning_rate": 4.810683689758147e-05, "loss": 0.9, "step": 180 }, { "epoch": 0.2242549424609029, "grad_norm": 0.6104872226715088, "learning_rate": 4.7693860486522604e-05, "loss": 0.8662, "step": 190 }, { "epoch": 0.23605783416937148, "grad_norm": 0.5987859964370728, "learning_rate": 4.7242315227592496e-05, "loss": 0.8754, "step": 200 }, { "epoch": 0.24786072587784008, "grad_norm": 0.5462383031845093, "learning_rate": 4.675296853453326e-05, "loss": 0.8838, "step": 210 }, { "epoch": 0.2596636175863086, "grad_norm": 0.7473201155662537, "learning_rate": 4.6226652065676974e-05, "loss": 0.8798, "step": 220 }, { "epoch": 0.2714665092947772, "grad_norm": 0.6000027656555176, "learning_rate": 4.566426031051922e-05, "loss": 0.9065, "step": 230 }, { "epoch": 0.28326940100324577, "grad_norm": 0.6105000972747803, "learning_rate": 4.506674906950929e-05, "loss": 0.9111, "step": 240 }, { "epoch": 0.29507229271171437, "grad_norm": 0.5564777851104736, "learning_rate": 4.4435133829640645e-05, "loss": 0.8646, "step": 250 }, { "epoch": 0.30687518442018297, "grad_norm": 0.6954275369644165, "learning_rate": 4.3770488038602555e-05, "loss": 0.8485, "step": 260 }, { "epoch": 0.3186780761286515, "grad_norm": 0.8191194534301758, "learning_rate": 4.30739412804258e-05, "loss": 0.826, "step": 270 }, { "epoch": 0.3304809678371201, "grad_norm": 0.6449839472770691, "learning_rate": 4.234667735572323e-05, "loss": 0.8685, "step": 280 }, { "epoch": 0.34228385954558865, "grad_norm": 0.7887718677520752, "learning_rate": 4.158993226978757e-05, "loss": 0.8229, "step": 290 }, { "epoch": 0.35408675125405725, "grad_norm": 0.764539361000061, "learning_rate": 4.080499213196607e-05, "loss": 0.8303, "step": 300 }, { "epoch": 0.3658896429625258, "grad_norm": 0.6305603384971619, "learning_rate": 3.999319096988183e-05, "loss": 0.829, "step": 310 }, { "epoch": 0.3776925346709944, "grad_norm": 0.5482339859008789, "learning_rate": 3.915590846221669e-05, "loss": 0.8356, "step": 320 }, { "epoch": 0.389495426379463, "grad_norm": 0.6555970311164856, "learning_rate": 3.8294567593908915e-05, "loss": 0.8281, "step": 330 }, { "epoch": 0.40129831808793154, "grad_norm": 0.8127148151397705, "learning_rate": 3.741063223775066e-05, "loss": 0.8543, "step": 340 }, { "epoch": 0.41310120979640014, "grad_norm": 0.8948593735694885, "learning_rate": 3.650560466649538e-05, "loss": 0.8639, "step": 350 }, { "epoch": 0.4249041015048687, "grad_norm": 0.6402966976165771, "learning_rate": 3.5581022999703464e-05, "loss": 0.8324, "step": 360 }, { "epoch": 0.4367069932133373, "grad_norm": 0.6675844192504883, "learning_rate": 3.4638458589665194e-05, "loss": 0.8265, "step": 370 }, { "epoch": 0.4485098849218058, "grad_norm": 0.6756200194358826, "learning_rate": 3.367951335084379e-05, "loss": 0.7834, "step": 380 }, { "epoch": 0.4603127766302744, "grad_norm": 0.7358006834983826, "learning_rate": 3.270581703737716e-05, "loss": 0.8107, "step": 390 }, { "epoch": 0.47211566833874297, "grad_norm": 0.6496703028678894, "learning_rate": 3.171902447326536e-05, "loss": 0.8055, "step": 400 }, { "epoch": 0.48391856004721157, "grad_norm": 0.6885930895805359, "learning_rate": 3.07208127399511e-05, "loss": 0.8249, "step": 410 }, { "epoch": 0.49572145175568016, "grad_norm": 0.7303836941719055, "learning_rate": 2.9712878326073168e-05, "loss": 0.8054, "step": 420 }, { "epoch": 0.5075243434641488, "grad_norm": 0.6711559295654297, "learning_rate": 2.869693424423673e-05, "loss": 0.7779, "step": 430 }, { "epoch": 0.5193272351726173, "grad_norm": 0.6829948425292969, "learning_rate": 2.767470711970067e-05, "loss": 0.7729, "step": 440 }, { "epoch": 0.5311301268810859, "grad_norm": 0.6073248386383057, "learning_rate": 2.6647934255929933e-05, "loss": 0.7867, "step": 450 }, { "epoch": 0.5429330185895545, "grad_norm": 0.7291135787963867, "learning_rate": 2.5618360681999876e-05, "loss": 0.7751, "step": 460 }, { "epoch": 0.554735910298023, "grad_norm": 0.6531949043273926, "learning_rate": 2.4587736186870766e-05, "loss": 0.7979, "step": 470 }, { "epoch": 0.5665388020064915, "grad_norm": 0.5947457551956177, "learning_rate": 2.3557812345572718e-05, "loss": 0.7807, "step": 480 }, { "epoch": 0.5783416937149601, "grad_norm": 0.7103855609893799, "learning_rate": 2.2530339542355145e-05, "loss": 0.8293, "step": 490 }, { "epoch": 0.5901445854234287, "grad_norm": 1.0487534999847412, "learning_rate": 2.150706399585999e-05, "loss": 0.798, "step": 500 }, { "epoch": 0.6019474771318973, "grad_norm": 0.8106992244720459, "learning_rate": 2.048972479137449e-05, "loss": 0.7426, "step": 510 }, { "epoch": 0.6137503688403659, "grad_norm": 0.6543154120445251, "learning_rate": 1.948005092520735e-05, "loss": 0.7813, "step": 520 }, { "epoch": 0.6255532605488344, "grad_norm": 0.6375657916069031, "learning_rate": 1.8479758366211334e-05, "loss": 0.7701, "step": 530 }, { "epoch": 0.637356152257303, "grad_norm": 0.6001560091972351, "learning_rate": 1.7490547139446407e-05, "loss": 0.7777, "step": 540 }, { "epoch": 0.6491590439657716, "grad_norm": 0.7287290096282959, "learning_rate": 1.6514098436939835e-05, "loss": 0.7693, "step": 550 }, { "epoch": 0.6609619356742402, "grad_norm": 0.6269923448562622, "learning_rate": 1.555207176045349e-05, "loss": 0.7672, "step": 560 }, { "epoch": 0.6727648273827088, "grad_norm": 0.622016966342926, "learning_rate": 1.4606102101114391e-05, "loss": 0.7504, "step": 570 }, { "epoch": 0.6845677190911773, "grad_norm": 0.5838598012924194, "learning_rate": 1.367779716070179e-05, "loss": 0.7865, "step": 580 }, { "epoch": 0.6963706107996459, "grad_norm": 0.656366765499115, "learning_rate": 1.2768734619313147e-05, "loss": 0.7696, "step": 590 }, { "epoch": 0.7081735025081145, "grad_norm": 0.6976104378700256, "learning_rate": 1.188045945405299e-05, "loss": 0.7652, "step": 600 }, { "epoch": 0.7199763942165831, "grad_norm": 0.7407099604606628, "learning_rate": 1.1014481313301172e-05, "loss": 0.7533, "step": 610 }, { "epoch": 0.7317792859250516, "grad_norm": 0.5191411375999451, "learning_rate": 1.017227195102352e-05, "loss": 0.7578, "step": 620 }, { "epoch": 0.7435821776335202, "grad_norm": 0.6771509051322937, "learning_rate": 9.355262725484901e-06, "loss": 0.7768, "step": 630 }, { "epoch": 0.7553850693419888, "grad_norm": 0.6330916881561279, "learning_rate": 8.564842166616047e-06, "loss": 0.7071, "step": 640 }, { "epoch": 0.7671879610504574, "grad_norm": 0.693899929523468, "learning_rate": 7.802353616168229e-06, "loss": 0.7544, "step": 650 }, { "epoch": 0.778990852758926, "grad_norm": 0.6973963379859924, "learning_rate": 7.069092944666586e-06, "loss": 0.7418, "step": 660 }, { "epoch": 0.7907937444673945, "grad_norm": 0.758264422416687, "learning_rate": 6.3663063490420336e-06, "loss": 0.7564, "step": 670 }, { "epoch": 0.8025966361758631, "grad_norm": 0.6236333847045898, "learning_rate": 5.695188234684898e-06, "loss": 0.7431, "step": 680 }, { "epoch": 0.8143995278843317, "grad_norm": 0.6301143169403076, "learning_rate": 5.056879185519714e-06, "loss": 0.7307, "step": 690 }, { "epoch": 0.8262024195928003, "grad_norm": 0.5712493062019348, "learning_rate": 4.452464025551037e-06, "loss": 0.7157, "step": 700 }, { "epoch": 0.8380053113012688, "grad_norm": 0.6849854588508606, "learning_rate": 3.8829699751748885e-06, "loss": 0.7367, "step": 710 }, { "epoch": 0.8498082030097374, "grad_norm": 0.6399794816970825, "learning_rate": 3.3493649053890326e-06, "loss": 0.7288, "step": 720 }, { "epoch": 0.861611094718206, "grad_norm": 0.8012081384658813, "learning_rate": 2.8525556928693186e-06, "loss": 0.7237, "step": 730 }, { "epoch": 0.8734139864266746, "grad_norm": 0.7375155687332153, "learning_rate": 2.3933866787074627e-06, "loss": 0.7543, "step": 740 }, { "epoch": 0.8852168781351432, "grad_norm": 0.6023644208908081, "learning_rate": 1.9726382334298883e-06, "loss": 0.74, "step": 750 }, { "epoch": 0.8970197698436116, "grad_norm": 0.6464205980300903, "learning_rate": 1.5910254307362705e-06, "loss": 0.7578, "step": 760 }, { "epoch": 0.9088226615520802, "grad_norm": 0.6287794709205627, "learning_rate": 1.2491968322118685e-06, "loss": 0.7513, "step": 770 }, { "epoch": 0.9206255532605488, "grad_norm": 0.6092919707298279, "learning_rate": 9.477333850790554e-07, "loss": 0.7187, "step": 780 }, { "epoch": 0.9324284449690174, "grad_norm": 0.6522098183631897, "learning_rate": 6.871474348613266e-07, "loss": 0.7519, "step": 790 }, { "epoch": 0.9442313366774859, "grad_norm": 0.5721604228019714, "learning_rate": 4.678818546378333e-07, "loss": 0.7502, "step": 800 }, { "epoch": 0.9560342283859545, "grad_norm": 0.8141267895698547, "learning_rate": 2.903092923682266e-07, "loss": 0.7512, "step": 810 }, { "epoch": 0.9678371200944231, "grad_norm": 0.5925819277763367, "learning_rate": 1.5473153756709046e-07, "loss": 0.795, "step": 820 }, { "epoch": 0.9796400118028917, "grad_norm": 0.599314272403717, "learning_rate": 6.137900840425815e-08, "loss": 0.7319, "step": 830 }, { "epoch": 0.9914429035113603, "grad_norm": 0.7140465974807739, "learning_rate": 1.0410360102702799e-08, "loss": 0.7747, "step": 840 }, { "epoch": 0.9997049277072882, "step": 847, "total_flos": 6.067900108221972e+17, "train_loss": 0.8547630963105941, "train_runtime": 5769.0005, "train_samples_per_second": 4.699, "train_steps_per_second": 0.147 } ], "logging_steps": 10, "max_steps": 847, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.067900108221972e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }