{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.982222222222222, "eval_steps": 500, "global_step": 504, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03950617283950617, "grad_norm": 2.288396849528949, "learning_rate": 8.18672068791075e-06, "loss": 1.3346, "step": 5 }, { "epoch": 0.07901234567901234, "grad_norm": 1.8929681256595516, "learning_rate": 1.1712549375688393e-05, "loss": 1.1223, "step": 10 }, { "epoch": 0.11851851851851852, "grad_norm": 1.4562819635437423, "learning_rate": 1.3775026942005194e-05, "loss": 1.1066, "step": 15 }, { "epoch": 0.1580246913580247, "grad_norm": 1.4050742285496918, "learning_rate": 1.5238378063466034e-05, "loss": 1.0899, "step": 20 }, { "epoch": 0.19753086419753085, "grad_norm": 1.3153167730903892, "learning_rate": 1.63734413758215e-05, "loss": 1.068, "step": 25 }, { "epoch": 0.23703703703703705, "grad_norm": 1.4369967846795906, "learning_rate": 1.7300855629782836e-05, "loss": 1.0441, "step": 30 }, { "epoch": 0.2765432098765432, "grad_norm": 1.329888120477227, "learning_rate": 1.8084973208875214e-05, "loss": 1.0379, "step": 35 }, { "epoch": 0.3160493827160494, "grad_norm": 1.3378535534247475, "learning_rate": 1.8764206751243677e-05, "loss": 1.05, "step": 40 }, { "epoch": 0.35555555555555557, "grad_norm": 1.4123806746044558, "learning_rate": 1.9363333196099635e-05, "loss": 1.025, "step": 45 }, { "epoch": 0.3950617283950617, "grad_norm": 1.1859601137161715, "learning_rate": 1.9899270063599143e-05, "loss": 1.0303, "step": 50 }, { "epoch": 0.4345679012345679, "grad_norm": 1.304632071875575, "learning_rate": 2e-05, "loss": 1.0484, "step": 55 }, { "epoch": 0.4740740740740741, "grad_norm": 1.183106489033389, "learning_rate": 2e-05, "loss": 1.0259, "step": 60 }, { "epoch": 0.5135802469135803, "grad_norm": 1.2699836294272915, "learning_rate": 2e-05, "loss": 1.028, "step": 65 }, { "epoch": 0.5530864197530864, "grad_norm": 1.223033575337603, "learning_rate": 2e-05, "loss": 1.0459, "step": 70 }, { "epoch": 0.5925925925925926, "grad_norm": 1.2008866058582461, "learning_rate": 2e-05, "loss": 1.0634, "step": 75 }, { "epoch": 0.6320987654320988, "grad_norm": 1.2833530733821379, "learning_rate": 2e-05, "loss": 1.0271, "step": 80 }, { "epoch": 0.671604938271605, "grad_norm": 1.163728901488675, "learning_rate": 2e-05, "loss": 1.03, "step": 85 }, { "epoch": 0.7111111111111111, "grad_norm": 1.1416632177568837, "learning_rate": 2e-05, "loss": 1.0419, "step": 90 }, { "epoch": 0.7506172839506173, "grad_norm": 1.1923113133851808, "learning_rate": 2e-05, "loss": 1.0131, "step": 95 }, { "epoch": 0.7901234567901234, "grad_norm": 1.099667508502151, "learning_rate": 2e-05, "loss": 1.0177, "step": 100 }, { "epoch": 0.8296296296296296, "grad_norm": 1.1653220897948604, "learning_rate": 2e-05, "loss": 1.0244, "step": 105 }, { "epoch": 0.8691358024691358, "grad_norm": 1.1508720796926766, "learning_rate": 2e-05, "loss": 0.9878, "step": 110 }, { "epoch": 0.908641975308642, "grad_norm": 1.1402724355963554, "learning_rate": 2e-05, "loss": 1.0391, "step": 115 }, { "epoch": 0.9481481481481482, "grad_norm": 1.141348796259256, "learning_rate": 2e-05, "loss": 1.0153, "step": 120 }, { "epoch": 0.9876543209876543, "grad_norm": 1.1502126933733767, "learning_rate": 2e-05, "loss": 0.9995, "step": 125 }, { "epoch": 1.0271604938271606, "grad_norm": 1.0266288490243014, "learning_rate": 2e-05, "loss": 0.7874, "step": 130 }, { "epoch": 1.0666666666666667, "grad_norm": 1.2240976755676138, "learning_rate": 2e-05, 
"loss": 0.6494, "step": 135 }, { "epoch": 1.106172839506173, "grad_norm": 1.15929122657082, "learning_rate": 2e-05, "loss": 0.6644, "step": 140 }, { "epoch": 1.145679012345679, "grad_norm": 1.226821515640194, "learning_rate": 2e-05, "loss": 0.6478, "step": 145 }, { "epoch": 1.1851851851851851, "grad_norm": 1.0784057055869019, "learning_rate": 2e-05, "loss": 0.6141, "step": 150 }, { "epoch": 1.2246913580246914, "grad_norm": 1.2189273784729524, "learning_rate": 2e-05, "loss": 0.6171, "step": 155 }, { "epoch": 1.2641975308641975, "grad_norm": 1.1463832706796795, "learning_rate": 2e-05, "loss": 0.6348, "step": 160 }, { "epoch": 1.3037037037037038, "grad_norm": 1.277105384989837, "learning_rate": 2e-05, "loss": 0.6537, "step": 165 }, { "epoch": 1.34320987654321, "grad_norm": 1.2493194408291017, "learning_rate": 2e-05, "loss": 0.6348, "step": 170 }, { "epoch": 1.382716049382716, "grad_norm": 1.275379674934221, "learning_rate": 2e-05, "loss": 0.6359, "step": 175 }, { "epoch": 1.4222222222222223, "grad_norm": 1.2351810219998518, "learning_rate": 2e-05, "loss": 0.634, "step": 180 }, { "epoch": 1.4617283950617284, "grad_norm": 1.2400415938496727, "learning_rate": 2e-05, "loss": 0.6575, "step": 185 }, { "epoch": 1.5012345679012347, "grad_norm": 1.20319815037753, "learning_rate": 2e-05, "loss": 0.6302, "step": 190 }, { "epoch": 1.5407407407407407, "grad_norm": 1.2202272853056775, "learning_rate": 2e-05, "loss": 0.6433, "step": 195 }, { "epoch": 1.5802469135802468, "grad_norm": 1.2375828410223908, "learning_rate": 2e-05, "loss": 0.6527, "step": 200 }, { "epoch": 1.6197530864197531, "grad_norm": 1.2178746501653863, "learning_rate": 2e-05, "loss": 0.6631, "step": 205 }, { "epoch": 1.6592592592592592, "grad_norm": 1.2015876208269247, "learning_rate": 2e-05, "loss": 0.6324, "step": 210 }, { "epoch": 1.6987654320987655, "grad_norm": 1.2831290348498436, "learning_rate": 2e-05, "loss": 0.6325, "step": 215 }, { "epoch": 1.7382716049382716, "grad_norm": 1.1989479874493834, "learning_rate": 2e-05, "loss": 0.6335, "step": 220 }, { "epoch": 1.7777777777777777, "grad_norm": 1.2494160770138447, "learning_rate": 2e-05, "loss": 0.6548, "step": 225 }, { "epoch": 1.817283950617284, "grad_norm": 1.199854634744343, "learning_rate": 2e-05, "loss": 0.6527, "step": 230 }, { "epoch": 1.8567901234567903, "grad_norm": 1.2753911656579426, "learning_rate": 2e-05, "loss": 0.6532, "step": 235 }, { "epoch": 1.8962962962962964, "grad_norm": 1.259005764478814, "learning_rate": 2e-05, "loss": 0.6321, "step": 240 }, { "epoch": 1.9358024691358025, "grad_norm": 1.2073632789042554, "learning_rate": 2e-05, "loss": 0.6502, "step": 245 }, { "epoch": 1.9753086419753085, "grad_norm": 1.3138749527875218, "learning_rate": 2e-05, "loss": 0.6762, "step": 250 }, { "epoch": 2.0148148148148146, "grad_norm": 1.3591666117815475, "learning_rate": 2e-05, "loss": 0.542, "step": 255 }, { "epoch": 2.054320987654321, "grad_norm": 2.063047801337415, "learning_rate": 2e-05, "loss": 0.2887, "step": 260 }, { "epoch": 2.093827160493827, "grad_norm": 1.2684017214430752, "learning_rate": 2e-05, "loss": 0.2644, "step": 265 }, { "epoch": 2.1333333333333333, "grad_norm": 1.2966722941774393, "learning_rate": 2e-05, "loss": 0.2571, "step": 270 }, { "epoch": 2.1728395061728394, "grad_norm": 1.340692853831283, "learning_rate": 2e-05, "loss": 0.2528, "step": 275 }, { "epoch": 2.212345679012346, "grad_norm": 1.14949845398096, "learning_rate": 2e-05, "loss": 0.2537, "step": 280 }, { "epoch": 2.251851851851852, "grad_norm": 1.2372995647380092, "learning_rate": 2e-05, "loss": 
0.2499, "step": 285 }, { "epoch": 2.291358024691358, "grad_norm": 1.1599361078462038, "learning_rate": 2e-05, "loss": 0.2571, "step": 290 }, { "epoch": 2.330864197530864, "grad_norm": 1.2300573894453493, "learning_rate": 2e-05, "loss": 0.2493, "step": 295 }, { "epoch": 2.3703703703703702, "grad_norm": 1.3265214490034312, "learning_rate": 2e-05, "loss": 0.253, "step": 300 }, { "epoch": 2.4098765432098768, "grad_norm": 1.2853819683882652, "learning_rate": 2e-05, "loss": 0.2517, "step": 305 }, { "epoch": 2.449382716049383, "grad_norm": 1.3525697343190135, "learning_rate": 2e-05, "loss": 0.2494, "step": 310 }, { "epoch": 2.488888888888889, "grad_norm": 1.2003581951396316, "learning_rate": 2e-05, "loss": 0.2552, "step": 315 }, { "epoch": 2.528395061728395, "grad_norm": 1.3354927903528535, "learning_rate": 2e-05, "loss": 0.2653, "step": 320 }, { "epoch": 2.567901234567901, "grad_norm": 1.4439934100900786, "learning_rate": 2e-05, "loss": 0.2802, "step": 325 }, { "epoch": 2.6074074074074076, "grad_norm": 1.245376378199098, "learning_rate": 2e-05, "loss": 0.2641, "step": 330 }, { "epoch": 2.6469135802469137, "grad_norm": 1.2818866706200012, "learning_rate": 2e-05, "loss": 0.2676, "step": 335 }, { "epoch": 2.68641975308642, "grad_norm": 1.276975908014479, "learning_rate": 2e-05, "loss": 0.2749, "step": 340 }, { "epoch": 2.725925925925926, "grad_norm": 1.2980698214464974, "learning_rate": 2e-05, "loss": 0.2732, "step": 345 }, { "epoch": 2.765432098765432, "grad_norm": 1.3359535241429625, "learning_rate": 2e-05, "loss": 0.2739, "step": 350 }, { "epoch": 2.8049382716049385, "grad_norm": 1.2472173979334094, "learning_rate": 2e-05, "loss": 0.2698, "step": 355 }, { "epoch": 2.8444444444444446, "grad_norm": 1.2863387095995107, "learning_rate": 2e-05, "loss": 0.2647, "step": 360 }, { "epoch": 2.8839506172839506, "grad_norm": 1.4156210734758483, "learning_rate": 2e-05, "loss": 0.2711, "step": 365 }, { "epoch": 2.9234567901234567, "grad_norm": 1.299941175380543, "learning_rate": 2e-05, "loss": 0.2818, "step": 370 }, { "epoch": 2.962962962962963, "grad_norm": 1.266519548711242, "learning_rate": 2e-05, "loss": 0.276, "step": 375 }, { "epoch": 3.0024691358024693, "grad_norm": 1.1318259958419454, "learning_rate": 2e-05, "loss": 0.2592, "step": 380 }, { "epoch": 3.0419753086419754, "grad_norm": 0.933334877688298, "learning_rate": 2e-05, "loss": 0.0838, "step": 385 }, { "epoch": 3.0814814814814815, "grad_norm": 1.0809786957325411, "learning_rate": 2e-05, "loss": 0.0859, "step": 390 }, { "epoch": 3.1209876543209876, "grad_norm": 0.9787186358692034, "learning_rate": 2e-05, "loss": 0.0784, "step": 395 }, { "epoch": 3.1604938271604937, "grad_norm": 0.9546009939819529, "learning_rate": 2e-05, "loss": 0.0802, "step": 400 }, { "epoch": 3.2, "grad_norm": 1.0327679510654035, "learning_rate": 2e-05, "loss": 0.0785, "step": 405 }, { "epoch": 3.2395061728395063, "grad_norm": 0.9851858106843173, "learning_rate": 2e-05, "loss": 0.0804, "step": 410 }, { "epoch": 3.2790123456790123, "grad_norm": 0.8657522447354971, "learning_rate": 2e-05, "loss": 0.0779, "step": 415 }, { "epoch": 3.3185185185185184, "grad_norm": 1.0753000614988253, "learning_rate": 2e-05, "loss": 0.0799, "step": 420 }, { "epoch": 3.3580246913580245, "grad_norm": 0.9715983171240334, "learning_rate": 2e-05, "loss": 0.0787, "step": 425 }, { "epoch": 3.397530864197531, "grad_norm": 1.0205981518321303, "learning_rate": 2e-05, "loss": 0.0845, "step": 430 }, { "epoch": 3.437037037037037, "grad_norm": 0.9519562378749633, "learning_rate": 2e-05, "loss": 0.0831, "step": 435 
}, { "epoch": 3.476543209876543, "grad_norm": 1.0856696967629995, "learning_rate": 2e-05, "loss": 0.0835, "step": 440 }, { "epoch": 3.5160493827160493, "grad_norm": 1.0619796419728877, "learning_rate": 2e-05, "loss": 0.0873, "step": 445 }, { "epoch": 3.5555555555555554, "grad_norm": 1.0366626282771845, "learning_rate": 2e-05, "loss": 0.0837, "step": 450 }, { "epoch": 3.595061728395062, "grad_norm": 1.0659804060064433, "learning_rate": 2e-05, "loss": 0.0811, "step": 455 }, { "epoch": 3.634567901234568, "grad_norm": 1.0334508292983433, "learning_rate": 2e-05, "loss": 0.0809, "step": 460 }, { "epoch": 3.674074074074074, "grad_norm": 0.954017121382599, "learning_rate": 2e-05, "loss": 0.0883, "step": 465 }, { "epoch": 3.71358024691358, "grad_norm": 1.0166440249144018, "learning_rate": 2e-05, "loss": 0.0879, "step": 470 }, { "epoch": 3.753086419753086, "grad_norm": 1.0979200122546204, "learning_rate": 2e-05, "loss": 0.0878, "step": 475 }, { "epoch": 3.7925925925925927, "grad_norm": 1.0013459456925258, "learning_rate": 2e-05, "loss": 0.0839, "step": 480 }, { "epoch": 3.832098765432099, "grad_norm": 1.0160863439352807, "learning_rate": 2e-05, "loss": 0.0915, "step": 485 }, { "epoch": 3.871604938271605, "grad_norm": 0.9858324147193233, "learning_rate": 2e-05, "loss": 0.0908, "step": 490 }, { "epoch": 3.911111111111111, "grad_norm": 0.9282172156060597, "learning_rate": 2e-05, "loss": 0.0884, "step": 495 }, { "epoch": 3.950617283950617, "grad_norm": 1.0696690745745738, "learning_rate": 2e-05, "loss": 0.0864, "step": 500 } ], "logging_steps": 5, "max_steps": 504, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 269178256277504.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }