|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.982222222222222, |
|
"eval_steps": 500, |
|
"global_step": 504, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.03950617283950617, |
|
"grad_norm": 2.288396849528949, |
|
"learning_rate": 8.18672068791075e-06, |
|
"loss": 1.3346, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.07901234567901234, |
|
"grad_norm": 1.8929681256595516, |
|
"learning_rate": 1.1712549375688393e-05, |
|
"loss": 1.1223, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.11851851851851852, |
|
"grad_norm": 1.4562819635437423, |
|
"learning_rate": 1.3775026942005194e-05, |
|
"loss": 1.1066, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.1580246913580247, |
|
"grad_norm": 1.4050742285496918, |
|
"learning_rate": 1.5238378063466034e-05, |
|
"loss": 1.0899, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.19753086419753085, |
|
"grad_norm": 1.3153167730903892, |
|
"learning_rate": 1.63734413758215e-05, |
|
"loss": 1.068, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.23703703703703705, |
|
"grad_norm": 1.4369967846795906, |
|
"learning_rate": 1.7300855629782836e-05, |
|
"loss": 1.0441, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.2765432098765432, |
|
"grad_norm": 1.329888120477227, |
|
"learning_rate": 1.8084973208875214e-05, |
|
"loss": 1.0379, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.3160493827160494, |
|
"grad_norm": 1.3378535534247475, |
|
"learning_rate": 1.8764206751243677e-05, |
|
"loss": 1.05, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.35555555555555557, |
|
"grad_norm": 1.4123806746044558, |
|
"learning_rate": 1.9363333196099635e-05, |
|
"loss": 1.025, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.3950617283950617, |
|
"grad_norm": 1.1859601137161715, |
|
"learning_rate": 1.9899270063599143e-05, |
|
"loss": 1.0303, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.4345679012345679, |
|
"grad_norm": 1.304632071875575, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0484, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.4740740740740741, |
|
"grad_norm": 1.183106489033389, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0259, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.5135802469135803, |
|
"grad_norm": 1.2699836294272915, |
|
"learning_rate": 2e-05, |
|
"loss": 1.028, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.5530864197530864, |
|
"grad_norm": 1.223033575337603, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0459, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.5925925925925926, |
|
"grad_norm": 1.2008866058582461, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0634, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.6320987654320988, |
|
"grad_norm": 1.2833530733821379, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0271, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.671604938271605, |
|
"grad_norm": 1.163728901488675, |
|
"learning_rate": 2e-05, |
|
"loss": 1.03, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.7111111111111111, |
|
"grad_norm": 1.1416632177568837, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0419, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.7506172839506173, |
|
"grad_norm": 1.1923113133851808, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0131, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.7901234567901234, |
|
"grad_norm": 1.099667508502151, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0177, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.8296296296296296, |
|
"grad_norm": 1.1653220897948604, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0244, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.8691358024691358, |
|
"grad_norm": 1.1508720796926766, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9878, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.908641975308642, |
|
"grad_norm": 1.1402724355963554, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0391, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.9481481481481482, |
|
"grad_norm": 1.141348796259256, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0153, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.9876543209876543, |
|
"grad_norm": 1.1502126933733767, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9995, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 1.0271604938271606, |
|
"grad_norm": 1.0266288490243014, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7874, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.0666666666666667, |
|
"grad_norm": 1.2240976755676138, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6494, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 1.106172839506173, |
|
"grad_norm": 1.15929122657082, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6644, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.145679012345679, |
|
"grad_norm": 1.226821515640194, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6478, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 1.1851851851851851, |
|
"grad_norm": 1.0784057055869019, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6141, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.2246913580246914, |
|
"grad_norm": 1.2189273784729524, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6171, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 1.2641975308641975, |
|
"grad_norm": 1.1463832706796795, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6348, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.3037037037037038, |
|
"grad_norm": 1.277105384989837, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6537, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 1.34320987654321, |
|
"grad_norm": 1.2493194408291017, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6348, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.382716049382716, |
|
"grad_norm": 1.275379674934221, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6359, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 1.4222222222222223, |
|
"grad_norm": 1.2351810219998518, |
|
"learning_rate": 2e-05, |
|
"loss": 0.634, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.4617283950617284, |
|
"grad_norm": 1.2400415938496727, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6575, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 1.5012345679012347, |
|
"grad_norm": 1.20319815037753, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6302, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.5407407407407407, |
|
"grad_norm": 1.2202272853056775, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6433, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 1.5802469135802468, |
|
"grad_norm": 1.2375828410223908, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6527, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.6197530864197531, |
|
"grad_norm": 1.2178746501653863, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6631, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 1.6592592592592592, |
|
"grad_norm": 1.2015876208269247, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6324, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.6987654320987655, |
|
"grad_norm": 1.2831290348498436, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6325, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 1.7382716049382716, |
|
"grad_norm": 1.1989479874493834, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6335, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.7777777777777777, |
|
"grad_norm": 1.2494160770138447, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6548, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 1.817283950617284, |
|
"grad_norm": 1.199854634744343, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6527, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.8567901234567903, |
|
"grad_norm": 1.2753911656579426, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6532, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 1.8962962962962964, |
|
"grad_norm": 1.259005764478814, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6321, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.9358024691358025, |
|
"grad_norm": 1.2073632789042554, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6502, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 1.9753086419753085, |
|
"grad_norm": 1.3138749527875218, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6762, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 2.0148148148148146, |
|
"grad_norm": 1.3591666117815475, |
|
"learning_rate": 2e-05, |
|
"loss": 0.542, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 2.054320987654321, |
|
"grad_norm": 2.063047801337415, |
|
"learning_rate": 2e-05, |
|
"loss": 0.2887, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 2.093827160493827, |
|
"grad_norm": 1.2684017214430752, |
|
"learning_rate": 2e-05, |
|
"loss": 0.2644, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 2.1333333333333333, |
|
"grad_norm": 1.2966722941774393, |
|
"learning_rate": 2e-05, |
|
"loss": 0.2571, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 2.1728395061728394, |
|
"grad_norm": 1.340692853831283, |
|
"learning_rate": 2e-05, |
|
"loss": 0.2528, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 2.212345679012346, |
|
"grad_norm": 1.14949845398096, |
|
"learning_rate": 2e-05, |
|
"loss": 0.2537, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 2.251851851851852, |
|
"grad_norm": 1.2372995647380092, |
|
"learning_rate": 2e-05, |
|
"loss": 0.2499, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 2.291358024691358, |
|
"grad_norm": 1.1599361078462038, |
|
"learning_rate": 2e-05, |
|
"loss": 0.2571, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 2.330864197530864, |
|
"grad_norm": 1.2300573894453493, |
|
"learning_rate": 2e-05, |
|
"loss": 0.2493, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 2.3703703703703702, |
|
"grad_norm": 1.3265214490034312, |
|
"learning_rate": 2e-05, |
|
"loss": 0.253, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 2.4098765432098768, |
|
"grad_norm": 1.2853819683882652, |
|
"learning_rate": 2e-05, |
|
"loss": 0.2517, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 2.449382716049383, |
|
"grad_norm": 1.3525697343190135, |
|
"learning_rate": 2e-05, |
|
"loss": 0.2494, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 2.488888888888889, |
|
"grad_norm": 1.2003581951396316, |
|
"learning_rate": 2e-05, |
|
"loss": 0.2552, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 2.528395061728395, |
|
"grad_norm": 1.3354927903528535, |
|
"learning_rate": 2e-05, |
|
"loss": 0.2653, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 2.567901234567901, |
|
"grad_norm": 1.4439934100900786, |
|
"learning_rate": 2e-05, |
|
"loss": 0.2802, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 2.6074074074074076, |
|
"grad_norm": 1.245376378199098, |
|
"learning_rate": 2e-05, |
|
"loss": 0.2641, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 2.6469135802469137, |
|
"grad_norm": 1.2818866706200012, |
|
"learning_rate": 2e-05, |
|
"loss": 0.2676, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 2.68641975308642, |
|
"grad_norm": 1.276975908014479, |
|
"learning_rate": 2e-05, |
|
"loss": 0.2749, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 2.725925925925926, |
|
"grad_norm": 1.2980698214464974, |
|
"learning_rate": 2e-05, |
|
"loss": 0.2732, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 2.765432098765432, |
|
"grad_norm": 1.3359535241429625, |
|
"learning_rate": 2e-05, |
|
"loss": 0.2739, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 2.8049382716049385, |
|
"grad_norm": 1.2472173979334094, |
|
"learning_rate": 2e-05, |
|
"loss": 0.2698, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 2.8444444444444446, |
|
"grad_norm": 1.2863387095995107, |
|
"learning_rate": 2e-05, |
|
"loss": 0.2647, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 2.8839506172839506, |
|
"grad_norm": 1.4156210734758483, |
|
"learning_rate": 2e-05, |
|
"loss": 0.2711, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 2.9234567901234567, |
|
"grad_norm": 1.299941175380543, |
|
"learning_rate": 2e-05, |
|
"loss": 0.2818, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 2.962962962962963, |
|
"grad_norm": 1.266519548711242, |
|
"learning_rate": 2e-05, |
|
"loss": 0.276, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 3.0024691358024693, |
|
"grad_norm": 1.1318259958419454, |
|
"learning_rate": 2e-05, |
|
"loss": 0.2592, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 3.0419753086419754, |
|
"grad_norm": 0.933334877688298, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0838, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 3.0814814814814815, |
|
"grad_norm": 1.0809786957325411, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0859, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 3.1209876543209876, |
|
"grad_norm": 0.9787186358692034, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0784, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 3.1604938271604937, |
|
"grad_norm": 0.9546009939819529, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0802, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 3.2, |
|
"grad_norm": 1.0327679510654035, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0785, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 3.2395061728395063, |
|
"grad_norm": 0.9851858106843173, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0804, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 3.2790123456790123, |
|
"grad_norm": 0.8657522447354971, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0779, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 3.3185185185185184, |
|
"grad_norm": 1.0753000614988253, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0799, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 3.3580246913580245, |
|
"grad_norm": 0.9715983171240334, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0787, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 3.397530864197531, |
|
"grad_norm": 1.0205981518321303, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0845, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 3.437037037037037, |
|
"grad_norm": 0.9519562378749633, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0831, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 3.476543209876543, |
|
"grad_norm": 1.0856696967629995, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0835, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 3.5160493827160493, |
|
"grad_norm": 1.0619796419728877, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0873, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 3.5555555555555554, |
|
"grad_norm": 1.0366626282771845, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0837, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 3.595061728395062, |
|
"grad_norm": 1.0659804060064433, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0811, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 3.634567901234568, |
|
"grad_norm": 1.0334508292983433, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0809, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 3.674074074074074, |
|
"grad_norm": 0.954017121382599, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0883, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 3.71358024691358, |
|
"grad_norm": 1.0166440249144018, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0879, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 3.753086419753086, |
|
"grad_norm": 1.0979200122546204, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0878, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 3.7925925925925927, |
|
"grad_norm": 1.0013459456925258, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0839, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 3.832098765432099, |
|
"grad_norm": 1.0160863439352807, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0915, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 3.871604938271605, |
|
"grad_norm": 0.9858324147193233, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0908, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 3.911111111111111, |
|
"grad_norm": 0.9282172156060597, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0884, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 3.950617283950617, |
|
"grad_norm": 1.0696690745745738, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0864, |
|
"step": 500 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 504, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 269178256277504.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|