{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.982222222222222,
"eval_steps": 500,
"global_step": 504,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.03950617283950617,
"grad_norm": 2.288396849528949,
"learning_rate": 8.18672068791075e-06,
"loss": 1.3346,
"step": 5
},
{
"epoch": 0.07901234567901234,
"grad_norm": 1.8929681256595516,
"learning_rate": 1.1712549375688393e-05,
"loss": 1.1223,
"step": 10
},
{
"epoch": 0.11851851851851852,
"grad_norm": 1.4562819635437423,
"learning_rate": 1.3775026942005194e-05,
"loss": 1.1066,
"step": 15
},
{
"epoch": 0.1580246913580247,
"grad_norm": 1.4050742285496918,
"learning_rate": 1.5238378063466034e-05,
"loss": 1.0899,
"step": 20
},
{
"epoch": 0.19753086419753085,
"grad_norm": 1.3153167730903892,
"learning_rate": 1.63734413758215e-05,
"loss": 1.068,
"step": 25
},
{
"epoch": 0.23703703703703705,
"grad_norm": 1.4369967846795906,
"learning_rate": 1.7300855629782836e-05,
"loss": 1.0441,
"step": 30
},
{
"epoch": 0.2765432098765432,
"grad_norm": 1.329888120477227,
"learning_rate": 1.8084973208875214e-05,
"loss": 1.0379,
"step": 35
},
{
"epoch": 0.3160493827160494,
"grad_norm": 1.3378535534247475,
"learning_rate": 1.8764206751243677e-05,
"loss": 1.05,
"step": 40
},
{
"epoch": 0.35555555555555557,
"grad_norm": 1.4123806746044558,
"learning_rate": 1.9363333196099635e-05,
"loss": 1.025,
"step": 45
},
{
"epoch": 0.3950617283950617,
"grad_norm": 1.1859601137161715,
"learning_rate": 1.9899270063599143e-05,
"loss": 1.0303,
"step": 50
},
{
"epoch": 0.4345679012345679,
"grad_norm": 1.304632071875575,
"learning_rate": 2e-05,
"loss": 1.0484,
"step": 55
},
{
"epoch": 0.4740740740740741,
"grad_norm": 1.183106489033389,
"learning_rate": 2e-05,
"loss": 1.0259,
"step": 60
},
{
"epoch": 0.5135802469135803,
"grad_norm": 1.2699836294272915,
"learning_rate": 2e-05,
"loss": 1.028,
"step": 65
},
{
"epoch": 0.5530864197530864,
"grad_norm": 1.223033575337603,
"learning_rate": 2e-05,
"loss": 1.0459,
"step": 70
},
{
"epoch": 0.5925925925925926,
"grad_norm": 1.2008866058582461,
"learning_rate": 2e-05,
"loss": 1.0634,
"step": 75
},
{
"epoch": 0.6320987654320988,
"grad_norm": 1.2833530733821379,
"learning_rate": 2e-05,
"loss": 1.0271,
"step": 80
},
{
"epoch": 0.671604938271605,
"grad_norm": 1.163728901488675,
"learning_rate": 2e-05,
"loss": 1.03,
"step": 85
},
{
"epoch": 0.7111111111111111,
"grad_norm": 1.1416632177568837,
"learning_rate": 2e-05,
"loss": 1.0419,
"step": 90
},
{
"epoch": 0.7506172839506173,
"grad_norm": 1.1923113133851808,
"learning_rate": 2e-05,
"loss": 1.0131,
"step": 95
},
{
"epoch": 0.7901234567901234,
"grad_norm": 1.099667508502151,
"learning_rate": 2e-05,
"loss": 1.0177,
"step": 100
},
{
"epoch": 0.8296296296296296,
"grad_norm": 1.1653220897948604,
"learning_rate": 2e-05,
"loss": 1.0244,
"step": 105
},
{
"epoch": 0.8691358024691358,
"grad_norm": 1.1508720796926766,
"learning_rate": 2e-05,
"loss": 0.9878,
"step": 110
},
{
"epoch": 0.908641975308642,
"grad_norm": 1.1402724355963554,
"learning_rate": 2e-05,
"loss": 1.0391,
"step": 115
},
{
"epoch": 0.9481481481481482,
"grad_norm": 1.141348796259256,
"learning_rate": 2e-05,
"loss": 1.0153,
"step": 120
},
{
"epoch": 0.9876543209876543,
"grad_norm": 1.1502126933733767,
"learning_rate": 2e-05,
"loss": 0.9995,
"step": 125
},
{
"epoch": 1.0271604938271606,
"grad_norm": 1.0266288490243014,
"learning_rate": 2e-05,
"loss": 0.7874,
"step": 130
},
{
"epoch": 1.0666666666666667,
"grad_norm": 1.2240976755676138,
"learning_rate": 2e-05,
"loss": 0.6494,
"step": 135
},
{
"epoch": 1.106172839506173,
"grad_norm": 1.15929122657082,
"learning_rate": 2e-05,
"loss": 0.6644,
"step": 140
},
{
"epoch": 1.145679012345679,
"grad_norm": 1.226821515640194,
"learning_rate": 2e-05,
"loss": 0.6478,
"step": 145
},
{
"epoch": 1.1851851851851851,
"grad_norm": 1.0784057055869019,
"learning_rate": 2e-05,
"loss": 0.6141,
"step": 150
},
{
"epoch": 1.2246913580246914,
"grad_norm": 1.2189273784729524,
"learning_rate": 2e-05,
"loss": 0.6171,
"step": 155
},
{
"epoch": 1.2641975308641975,
"grad_norm": 1.1463832706796795,
"learning_rate": 2e-05,
"loss": 0.6348,
"step": 160
},
{
"epoch": 1.3037037037037038,
"grad_norm": 1.277105384989837,
"learning_rate": 2e-05,
"loss": 0.6537,
"step": 165
},
{
"epoch": 1.34320987654321,
"grad_norm": 1.2493194408291017,
"learning_rate": 2e-05,
"loss": 0.6348,
"step": 170
},
{
"epoch": 1.382716049382716,
"grad_norm": 1.275379674934221,
"learning_rate": 2e-05,
"loss": 0.6359,
"step": 175
},
{
"epoch": 1.4222222222222223,
"grad_norm": 1.2351810219998518,
"learning_rate": 2e-05,
"loss": 0.634,
"step": 180
},
{
"epoch": 1.4617283950617284,
"grad_norm": 1.2400415938496727,
"learning_rate": 2e-05,
"loss": 0.6575,
"step": 185
},
{
"epoch": 1.5012345679012347,
"grad_norm": 1.20319815037753,
"learning_rate": 2e-05,
"loss": 0.6302,
"step": 190
},
{
"epoch": 1.5407407407407407,
"grad_norm": 1.2202272853056775,
"learning_rate": 2e-05,
"loss": 0.6433,
"step": 195
},
{
"epoch": 1.5802469135802468,
"grad_norm": 1.2375828410223908,
"learning_rate": 2e-05,
"loss": 0.6527,
"step": 200
},
{
"epoch": 1.6197530864197531,
"grad_norm": 1.2178746501653863,
"learning_rate": 2e-05,
"loss": 0.6631,
"step": 205
},
{
"epoch": 1.6592592592592592,
"grad_norm": 1.2015876208269247,
"learning_rate": 2e-05,
"loss": 0.6324,
"step": 210
},
{
"epoch": 1.6987654320987655,
"grad_norm": 1.2831290348498436,
"learning_rate": 2e-05,
"loss": 0.6325,
"step": 215
},
{
"epoch": 1.7382716049382716,
"grad_norm": 1.1989479874493834,
"learning_rate": 2e-05,
"loss": 0.6335,
"step": 220
},
{
"epoch": 1.7777777777777777,
"grad_norm": 1.2494160770138447,
"learning_rate": 2e-05,
"loss": 0.6548,
"step": 225
},
{
"epoch": 1.817283950617284,
"grad_norm": 1.199854634744343,
"learning_rate": 2e-05,
"loss": 0.6527,
"step": 230
},
{
"epoch": 1.8567901234567903,
"grad_norm": 1.2753911656579426,
"learning_rate": 2e-05,
"loss": 0.6532,
"step": 235
},
{
"epoch": 1.8962962962962964,
"grad_norm": 1.259005764478814,
"learning_rate": 2e-05,
"loss": 0.6321,
"step": 240
},
{
"epoch": 1.9358024691358025,
"grad_norm": 1.2073632789042554,
"learning_rate": 2e-05,
"loss": 0.6502,
"step": 245
},
{
"epoch": 1.9753086419753085,
"grad_norm": 1.3138749527875218,
"learning_rate": 2e-05,
"loss": 0.6762,
"step": 250
},
{
"epoch": 2.0148148148148146,
"grad_norm": 1.3591666117815475,
"learning_rate": 2e-05,
"loss": 0.542,
"step": 255
},
{
"epoch": 2.054320987654321,
"grad_norm": 2.063047801337415,
"learning_rate": 2e-05,
"loss": 0.2887,
"step": 260
},
{
"epoch": 2.093827160493827,
"grad_norm": 1.2684017214430752,
"learning_rate": 2e-05,
"loss": 0.2644,
"step": 265
},
{
"epoch": 2.1333333333333333,
"grad_norm": 1.2966722941774393,
"learning_rate": 2e-05,
"loss": 0.2571,
"step": 270
},
{
"epoch": 2.1728395061728394,
"grad_norm": 1.340692853831283,
"learning_rate": 2e-05,
"loss": 0.2528,
"step": 275
},
{
"epoch": 2.212345679012346,
"grad_norm": 1.14949845398096,
"learning_rate": 2e-05,
"loss": 0.2537,
"step": 280
},
{
"epoch": 2.251851851851852,
"grad_norm": 1.2372995647380092,
"learning_rate": 2e-05,
"loss": 0.2499,
"step": 285
},
{
"epoch": 2.291358024691358,
"grad_norm": 1.1599361078462038,
"learning_rate": 2e-05,
"loss": 0.2571,
"step": 290
},
{
"epoch": 2.330864197530864,
"grad_norm": 1.2300573894453493,
"learning_rate": 2e-05,
"loss": 0.2493,
"step": 295
},
{
"epoch": 2.3703703703703702,
"grad_norm": 1.3265214490034312,
"learning_rate": 2e-05,
"loss": 0.253,
"step": 300
},
{
"epoch": 2.4098765432098768,
"grad_norm": 1.2853819683882652,
"learning_rate": 2e-05,
"loss": 0.2517,
"step": 305
},
{
"epoch": 2.449382716049383,
"grad_norm": 1.3525697343190135,
"learning_rate": 2e-05,
"loss": 0.2494,
"step": 310
},
{
"epoch": 2.488888888888889,
"grad_norm": 1.2003581951396316,
"learning_rate": 2e-05,
"loss": 0.2552,
"step": 315
},
{
"epoch": 2.528395061728395,
"grad_norm": 1.3354927903528535,
"learning_rate": 2e-05,
"loss": 0.2653,
"step": 320
},
{
"epoch": 2.567901234567901,
"grad_norm": 1.4439934100900786,
"learning_rate": 2e-05,
"loss": 0.2802,
"step": 325
},
{
"epoch": 2.6074074074074076,
"grad_norm": 1.245376378199098,
"learning_rate": 2e-05,
"loss": 0.2641,
"step": 330
},
{
"epoch": 2.6469135802469137,
"grad_norm": 1.2818866706200012,
"learning_rate": 2e-05,
"loss": 0.2676,
"step": 335
},
{
"epoch": 2.68641975308642,
"grad_norm": 1.276975908014479,
"learning_rate": 2e-05,
"loss": 0.2749,
"step": 340
},
{
"epoch": 2.725925925925926,
"grad_norm": 1.2980698214464974,
"learning_rate": 2e-05,
"loss": 0.2732,
"step": 345
},
{
"epoch": 2.765432098765432,
"grad_norm": 1.3359535241429625,
"learning_rate": 2e-05,
"loss": 0.2739,
"step": 350
},
{
"epoch": 2.8049382716049385,
"grad_norm": 1.2472173979334094,
"learning_rate": 2e-05,
"loss": 0.2698,
"step": 355
},
{
"epoch": 2.8444444444444446,
"grad_norm": 1.2863387095995107,
"learning_rate": 2e-05,
"loss": 0.2647,
"step": 360
},
{
"epoch": 2.8839506172839506,
"grad_norm": 1.4156210734758483,
"learning_rate": 2e-05,
"loss": 0.2711,
"step": 365
},
{
"epoch": 2.9234567901234567,
"grad_norm": 1.299941175380543,
"learning_rate": 2e-05,
"loss": 0.2818,
"step": 370
},
{
"epoch": 2.962962962962963,
"grad_norm": 1.266519548711242,
"learning_rate": 2e-05,
"loss": 0.276,
"step": 375
},
{
"epoch": 3.0024691358024693,
"grad_norm": 1.1318259958419454,
"learning_rate": 2e-05,
"loss": 0.2592,
"step": 380
},
{
"epoch": 3.0419753086419754,
"grad_norm": 0.933334877688298,
"learning_rate": 2e-05,
"loss": 0.0838,
"step": 385
},
{
"epoch": 3.0814814814814815,
"grad_norm": 1.0809786957325411,
"learning_rate": 2e-05,
"loss": 0.0859,
"step": 390
},
{
"epoch": 3.1209876543209876,
"grad_norm": 0.9787186358692034,
"learning_rate": 2e-05,
"loss": 0.0784,
"step": 395
},
{
"epoch": 3.1604938271604937,
"grad_norm": 0.9546009939819529,
"learning_rate": 2e-05,
"loss": 0.0802,
"step": 400
},
{
"epoch": 3.2,
"grad_norm": 1.0327679510654035,
"learning_rate": 2e-05,
"loss": 0.0785,
"step": 405
},
{
"epoch": 3.2395061728395063,
"grad_norm": 0.9851858106843173,
"learning_rate": 2e-05,
"loss": 0.0804,
"step": 410
},
{
"epoch": 3.2790123456790123,
"grad_norm": 0.8657522447354971,
"learning_rate": 2e-05,
"loss": 0.0779,
"step": 415
},
{
"epoch": 3.3185185185185184,
"grad_norm": 1.0753000614988253,
"learning_rate": 2e-05,
"loss": 0.0799,
"step": 420
},
{
"epoch": 3.3580246913580245,
"grad_norm": 0.9715983171240334,
"learning_rate": 2e-05,
"loss": 0.0787,
"step": 425
},
{
"epoch": 3.397530864197531,
"grad_norm": 1.0205981518321303,
"learning_rate": 2e-05,
"loss": 0.0845,
"step": 430
},
{
"epoch": 3.437037037037037,
"grad_norm": 0.9519562378749633,
"learning_rate": 2e-05,
"loss": 0.0831,
"step": 435
},
{
"epoch": 3.476543209876543,
"grad_norm": 1.0856696967629995,
"learning_rate": 2e-05,
"loss": 0.0835,
"step": 440
},
{
"epoch": 3.5160493827160493,
"grad_norm": 1.0619796419728877,
"learning_rate": 2e-05,
"loss": 0.0873,
"step": 445
},
{
"epoch": 3.5555555555555554,
"grad_norm": 1.0366626282771845,
"learning_rate": 2e-05,
"loss": 0.0837,
"step": 450
},
{
"epoch": 3.595061728395062,
"grad_norm": 1.0659804060064433,
"learning_rate": 2e-05,
"loss": 0.0811,
"step": 455
},
{
"epoch": 3.634567901234568,
"grad_norm": 1.0334508292983433,
"learning_rate": 2e-05,
"loss": 0.0809,
"step": 460
},
{
"epoch": 3.674074074074074,
"grad_norm": 0.954017121382599,
"learning_rate": 2e-05,
"loss": 0.0883,
"step": 465
},
{
"epoch": 3.71358024691358,
"grad_norm": 1.0166440249144018,
"learning_rate": 2e-05,
"loss": 0.0879,
"step": 470
},
{
"epoch": 3.753086419753086,
"grad_norm": 1.0979200122546204,
"learning_rate": 2e-05,
"loss": 0.0878,
"step": 475
},
{
"epoch": 3.7925925925925927,
"grad_norm": 1.0013459456925258,
"learning_rate": 2e-05,
"loss": 0.0839,
"step": 480
},
{
"epoch": 3.832098765432099,
"grad_norm": 1.0160863439352807,
"learning_rate": 2e-05,
"loss": 0.0915,
"step": 485
},
{
"epoch": 3.871604938271605,
"grad_norm": 0.9858324147193233,
"learning_rate": 2e-05,
"loss": 0.0908,
"step": 490
},
{
"epoch": 3.911111111111111,
"grad_norm": 0.9282172156060597,
"learning_rate": 2e-05,
"loss": 0.0884,
"step": 495
},
{
"epoch": 3.950617283950617,
"grad_norm": 1.0696690745745738,
"learning_rate": 2e-05,
"loss": 0.0864,
"step": 500
}
],
"logging_steps": 5,
"max_steps": 504,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 269178256277504.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}