chansung's picture
Model save
fc53030 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9989550679205852,
"eval_steps": 500,
"global_step": 478,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0020898641588296763,
"grad_norm": 216.79754638671875,
"learning_rate": 6.2499999999999995e-06,
"loss": 57.9838,
"step": 1
},
{
"epoch": 0.01044932079414838,
"grad_norm": 184.4412841796875,
"learning_rate": 3.125e-05,
"loss": 60.093,
"step": 5
},
{
"epoch": 0.02089864158829676,
"grad_norm": 107.91060638427734,
"learning_rate": 6.25e-05,
"loss": 48.3094,
"step": 10
},
{
"epoch": 0.03134796238244514,
"grad_norm": 17.1436710357666,
"learning_rate": 9.374999999999999e-05,
"loss": 33.2668,
"step": 15
},
{
"epoch": 0.04179728317659352,
"grad_norm": 12.335116386413574,
"learning_rate": 0.000125,
"loss": 27.698,
"step": 20
},
{
"epoch": 0.0522466039707419,
"grad_norm": 6.2943196296691895,
"learning_rate": 0.00015625,
"loss": 25.9692,
"step": 25
},
{
"epoch": 0.06269592476489028,
"grad_norm": 5.466517448425293,
"learning_rate": 0.00018749999999999998,
"loss": 25.2691,
"step": 30
},
{
"epoch": 0.07314524555903866,
"grad_norm": 9.744288444519043,
"learning_rate": 0.00021874999999999998,
"loss": 23.7082,
"step": 35
},
{
"epoch": 0.08359456635318704,
"grad_norm": 19.27219581604004,
"learning_rate": 0.00025,
"loss": 21.3655,
"step": 40
},
{
"epoch": 0.09404388714733543,
"grad_norm": 41.77222442626953,
"learning_rate": 0.00028125,
"loss": 16.1707,
"step": 45
},
{
"epoch": 0.1044932079414838,
"grad_norm": 18.60293960571289,
"learning_rate": 0.0002999839868651235,
"loss": 8.0969,
"step": 50
},
{
"epoch": 0.11494252873563218,
"grad_norm": 11.452897071838379,
"learning_rate": 0.00029980387835984494,
"loss": 4.1367,
"step": 55
},
{
"epoch": 0.12539184952978055,
"grad_norm": 8.422245979309082,
"learning_rate": 0.000299423886051382,
"loss": 3.1254,
"step": 60
},
{
"epoch": 0.13584117032392895,
"grad_norm": 2.444629669189453,
"learning_rate": 0.0002988445169647103,
"loss": 2.4463,
"step": 65
},
{
"epoch": 0.14629049111807732,
"grad_norm": 1.307098627090454,
"learning_rate": 0.0002980665441538907,
"loss": 2.1685,
"step": 70
},
{
"epoch": 0.15673981191222572,
"grad_norm": 2.10964298248291,
"learning_rate": 0.0002970910056705806,
"loss": 2.0392,
"step": 75
},
{
"epoch": 0.1671891327063741,
"grad_norm": 1.1905853748321533,
"learning_rate": 0.0002959192031789579,
"loss": 1.9225,
"step": 80
},
{
"epoch": 0.17763845350052246,
"grad_norm": 0.8916841745376587,
"learning_rate": 0.0002945527002189068,
"loss": 1.8422,
"step": 85
},
{
"epoch": 0.18808777429467086,
"grad_norm": 3.186051845550537,
"learning_rate": 0.00029299332011978107,
"loss": 1.748,
"step": 90
},
{
"epoch": 0.19853709508881923,
"grad_norm": 3.865817070007324,
"learning_rate": 0.00029124314356752967,
"loss": 1.7184,
"step": 95
},
{
"epoch": 0.2089864158829676,
"grad_norm": 2.8790738582611084,
"learning_rate": 0.0002893045058284311,
"loss": 1.6432,
"step": 100
},
{
"epoch": 0.219435736677116,
"grad_norm": 1.6771491765975952,
"learning_rate": 0.00028717999363313967,
"loss": 1.6567,
"step": 105
},
{
"epoch": 0.22988505747126436,
"grad_norm": 2.725285530090332,
"learning_rate": 0.00028487244172520246,
"loss": 1.6157,
"step": 110
},
{
"epoch": 0.24033437826541273,
"grad_norm": 2.289280652999878,
"learning_rate": 0.0002823849290786517,
"loss": 1.6148,
"step": 115
},
{
"epoch": 0.2507836990595611,
"grad_norm": 2.0211188793182373,
"learning_rate": 0.0002797207747897198,
"loss": 1.5858,
"step": 120
},
{
"epoch": 0.2612330198537095,
"grad_norm": 2.0264103412628174,
"learning_rate": 0.00027688353364815834,
"loss": 1.5708,
"step": 125
},
{
"epoch": 0.2716823406478579,
"grad_norm": 0.9253348112106323,
"learning_rate": 0.0002738769913940706,
"loss": 1.5481,
"step": 130
},
{
"epoch": 0.28213166144200624,
"grad_norm": 3.3143184185028076,
"learning_rate": 0.00027070515966658604,
"loss": 1.5535,
"step": 135
},
{
"epoch": 0.29258098223615464,
"grad_norm": 4.024845600128174,
"learning_rate": 0.0002673722706511174,
"loss": 1.5542,
"step": 140
},
{
"epoch": 0.30303030303030304,
"grad_norm": 3.718261241912842,
"learning_rate": 0.00026388277143234146,
"loss": 1.5507,
"step": 145
},
{
"epoch": 0.31347962382445144,
"grad_norm": 1.9526076316833496,
"learning_rate": 0.0002602413180604401,
"loss": 1.5251,
"step": 150
},
{
"epoch": 0.3239289446185998,
"grad_norm": 1.5725075006484985,
"learning_rate": 0.00025645276933851667,
"loss": 1.4937,
"step": 155
},
{
"epoch": 0.3343782654127482,
"grad_norm": 4.266882419586182,
"learning_rate": 0.00025252218033947993,
"loss": 1.4944,
"step": 160
},
{
"epoch": 0.3448275862068966,
"grad_norm": 2.6647915840148926,
"learning_rate": 0.0002484547956610429,
"loss": 1.4798,
"step": 165
},
{
"epoch": 0.3552769070010449,
"grad_norm": 2.0770153999328613,
"learning_rate": 0.0002442560424278399,
"loss": 1.4708,
"step": 170
},
{
"epoch": 0.3657262277951933,
"grad_norm": 1.8132774829864502,
"learning_rate": 0.00023993152304999582,
"loss": 1.4554,
"step": 175
},
{
"epoch": 0.3761755485893417,
"grad_norm": 1.9493850469589233,
"learning_rate": 0.00023548700774781242,
"loss": 1.485,
"step": 180
},
{
"epoch": 0.38662486938349006,
"grad_norm": 3.6726951599121094,
"learning_rate": 0.00023092842685254442,
"loss": 1.4584,
"step": 185
},
{
"epoch": 0.39707419017763845,
"grad_norm": 2.253319501876831,
"learning_rate": 0.00022626186289353913,
"loss": 1.4569,
"step": 190
},
{
"epoch": 0.40752351097178685,
"grad_norm": 3.336820125579834,
"learning_rate": 0.00022149354248229784,
"loss": 1.4334,
"step": 195
},
{
"epoch": 0.4179728317659352,
"grad_norm": 3.0895018577575684,
"learning_rate": 0.0002166298280042877,
"loss": 1.4203,
"step": 200
},
{
"epoch": 0.4284221525600836,
"grad_norm": 1.8486225605010986,
"learning_rate": 0.00021167720912959004,
"loss": 1.414,
"step": 205
},
{
"epoch": 0.438871473354232,
"grad_norm": 0.7216203808784485,
"learning_rate": 0.00020664229415371266,
"loss": 1.3897,
"step": 210
},
{
"epoch": 0.44932079414838033,
"grad_norm": 2.909454107284546,
"learning_rate": 0.0002015318011801192,
"loss": 1.3713,
"step": 215
},
{
"epoch": 0.45977011494252873,
"grad_norm": 1.5531753301620483,
"learning_rate": 0.0001963525491562421,
"loss": 1.4055,
"step": 220
},
{
"epoch": 0.4702194357366771,
"grad_norm": 4.848015308380127,
"learning_rate": 0.00019111144877493873,
"loss": 1.435,
"step": 225
},
{
"epoch": 0.48066875653082547,
"grad_norm": 4.833097457885742,
"learning_rate": 0.00018581549325353126,
"loss": 1.417,
"step": 230
},
{
"epoch": 0.49111807732497387,
"grad_norm": 1.415703296661377,
"learning_rate": 0.00018047174900273435,
"loss": 1.4449,
"step": 235
},
{
"epoch": 0.5015673981191222,
"grad_norm": 0.9621894359588623,
"learning_rate": 0.00017508734619791966,
"loss": 1.3907,
"step": 240
},
{
"epoch": 0.5120167189132706,
"grad_norm": 2.091428279876709,
"learning_rate": 0.0001696694692653004,
"loss": 1.3581,
"step": 245
},
{
"epoch": 0.522466039707419,
"grad_norm": 1.3531287908554077,
"learning_rate": 0.00016422534729572738,
"loss": 1.3717,
"step": 250
},
{
"epoch": 0.5329153605015674,
"grad_norm": 1.8569897413253784,
"learning_rate": 0.0001587622443988899,
"loss": 1.3811,
"step": 255
},
{
"epoch": 0.5433646812957158,
"grad_norm": 4.248292446136475,
"learning_rate": 0.0001532874500107902,
"loss": 1.3797,
"step": 260
},
{
"epoch": 0.5538140020898642,
"grad_norm": 2.5460174083709717,
"learning_rate": 0.0001478082691674256,
"loss": 1.3576,
"step": 265
},
{
"epoch": 0.5642633228840125,
"grad_norm": 1.3485275506973267,
"learning_rate": 0.00014233201275765494,
"loss": 1.383,
"step": 270
},
{
"epoch": 0.5747126436781609,
"grad_norm": 1.1686965227127075,
"learning_rate": 0.00013686598776825563,
"loss": 1.3715,
"step": 275
},
{
"epoch": 0.5851619644723093,
"grad_norm": 1.8593087196350098,
"learning_rate": 0.0001314174875341878,
"loss": 1.3671,
"step": 280
},
{
"epoch": 0.5956112852664577,
"grad_norm": 1.5989689826965332,
"learning_rate": 0.0001259937820070732,
"loss": 1.3379,
"step": 285
},
{
"epoch": 0.6060606060606061,
"grad_norm": 3.129467248916626,
"learning_rate": 0.00012060210805487529,
"loss": 1.3436,
"step": 290
},
{
"epoch": 0.6165099268547545,
"grad_norm": 1.071311593055725,
"learning_rate": 0.00011524965980572284,
"loss": 1.3711,
"step": 295
},
{
"epoch": 0.6269592476489029,
"grad_norm": 2.8161048889160156,
"learning_rate": 0.00010994357904876106,
"loss": 1.3242,
"step": 300
},
{
"epoch": 0.6374085684430512,
"grad_norm": 0.9445050954818726,
"learning_rate": 0.00010469094570483928,
"loss": 1.3217,
"step": 305
},
{
"epoch": 0.6478578892371996,
"grad_norm": 1.53034508228302,
"learning_rate": 9.949876837974944e-05,
"loss": 1.314,
"step": 310
},
{
"epoch": 0.658307210031348,
"grad_norm": 1.8168761730194092,
"learning_rate": 9.437397501262026e-05,
"loss": 1.3365,
"step": 315
},
{
"epoch": 0.6687565308254964,
"grad_norm": 1.4955302476882935,
"learning_rate": 8.932340363194595e-05,
"loss": 1.3154,
"step": 320
},
{
"epoch": 0.6792058516196448,
"grad_norm": 1.2552021741867065,
"learning_rate": 8.435379323158218e-05,
"loss": 1.3366,
"step": 325
},
{
"epoch": 0.6896551724137931,
"grad_norm": 2.914289712905884,
"learning_rate": 7.947177477888472e-05,
"loss": 1.3233,
"step": 330
},
{
"epoch": 0.7001044932079414,
"grad_norm": 1.3406000137329102,
"learning_rate": 7.46838623669881e-05,
"loss": 1.3264,
"step": 335
},
{
"epoch": 0.7105538140020898,
"grad_norm": 0.9025297164916992,
"learning_rate": 6.999644452302975e-05,
"loss": 1.3197,
"step": 340
},
{
"epoch": 0.7210031347962382,
"grad_norm": 1.2824598550796509,
"learning_rate": 6.541577568391758e-05,
"loss": 1.3201,
"step": 345
},
{
"epoch": 0.7314524555903866,
"grad_norm": 0.9296241998672485,
"learning_rate": 6.0947967851014405e-05,
"loss": 1.3097,
"step": 350
},
{
"epoch": 0.741901776384535,
"grad_norm": 0.8738858699798584,
"learning_rate": 5.659898243487463e-05,
"loss": 1.3044,
"step": 355
},
{
"epoch": 0.7523510971786834,
"grad_norm": 1.8482000827789307,
"learning_rate": 5.237462230091467e-05,
"loss": 1.3108,
"step": 360
},
{
"epoch": 0.7628004179728317,
"grad_norm": 2.537909746170044,
"learning_rate": 4.8280524026630565e-05,
"loss": 1.3164,
"step": 365
},
{
"epoch": 0.7732497387669801,
"grad_norm": 1.3068586587905884,
"learning_rate": 4.432215038069449e-05,
"loss": 1.2782,
"step": 370
},
{
"epoch": 0.7836990595611285,
"grad_norm": 1.3742858171463013,
"learning_rate": 4.0504783033964645e-05,
"loss": 1.3179,
"step": 375
},
{
"epoch": 0.7941483803552769,
"grad_norm": 1.2923156023025513,
"learning_rate": 3.6833515512134606e-05,
"loss": 1.2904,
"step": 380
},
{
"epoch": 0.8045977011494253,
"grad_norm": 0.7867398262023926,
"learning_rate": 3.331324639942526e-05,
"loss": 1.3029,
"step": 385
},
{
"epoch": 0.8150470219435737,
"grad_norm": 1.1442195177078247,
"learning_rate": 2.9948672802388135e-05,
"loss": 1.3069,
"step": 390
},
{
"epoch": 0.8254963427377221,
"grad_norm": 1.4821033477783203,
"learning_rate": 2.67442840825406e-05,
"loss": 1.3177,
"step": 395
},
{
"epoch": 0.8359456635318704,
"grad_norm": 0.9633380770683289,
"learning_rate": 2.3704355866196373e-05,
"loss": 1.3249,
"step": 400
},
{
"epoch": 0.8463949843260188,
"grad_norm": 1.2908155918121338,
"learning_rate": 2.083294433948324e-05,
"loss": 1.3449,
"step": 405
},
{
"epoch": 0.8568443051201672,
"grad_norm": 1.1834619045257568,
"learning_rate": 1.813388083616068e-05,
"loss": 1.3086,
"step": 410
},
{
"epoch": 0.8672936259143156,
"grad_norm": 1.1399352550506592,
"learning_rate": 1.5610766725458834e-05,
"loss": 1.315,
"step": 415
},
{
"epoch": 0.877742946708464,
"grad_norm": 1.2300066947937012,
"learning_rate": 1.326696860675981e-05,
"loss": 1.2894,
"step": 420
},
{
"epoch": 0.8881922675026124,
"grad_norm": 0.9975532293319702,
"learning_rate": 1.1105613817532976e-05,
"loss": 1.2953,
"step": 425
},
{
"epoch": 0.8986415882967607,
"grad_norm": 0.9357336163520813,
"learning_rate": 9.129586260518634e-06,
"loss": 1.3159,
"step": 430
},
{
"epoch": 0.9090909090909091,
"grad_norm": 0.7603440880775452,
"learning_rate": 7.34152255572697e-06,
"loss": 1.2897,
"step": 435
},
{
"epoch": 0.9195402298850575,
"grad_norm": 0.8711851835250854,
"learning_rate": 5.743808522387544e-06,
"loss": 1.275,
"step": 440
},
{
"epoch": 0.9299895506792059,
"grad_norm": 0.9144044518470764,
"learning_rate": 4.33857599554282e-06,
"loss": 1.328,
"step": 445
},
{
"epoch": 0.9404388714733543,
"grad_norm": 0.862479567527771,
"learning_rate": 3.1276999815337544e-06,
"loss": 1.2879,
"step": 450
},
{
"epoch": 0.9508881922675027,
"grad_norm": 0.7352892756462097,
"learning_rate": 2.1127961561727193e-06,
"loss": 1.2873,
"step": 455
},
{
"epoch": 0.9613375130616509,
"grad_norm": 2.582821846008301,
"learning_rate": 1.2952187089419642e-06,
"loss": 1.3191,
"step": 460
},
{
"epoch": 0.9717868338557993,
"grad_norm": 0.7060139179229736,
"learning_rate": 6.760585360942872e-07,
"loss": 1.3047,
"step": 465
},
{
"epoch": 0.9822361546499477,
"grad_norm": 0.8089200258255005,
"learning_rate": 2.5614178506644934e-07,
"loss": 1.2743,
"step": 470
},
{
"epoch": 0.9926854754440961,
"grad_norm": 1.2739328145980835,
"learning_rate": 3.6028752148081766e-08,
"loss": 1.3004,
"step": 475
},
{
"epoch": 0.9989550679205852,
"eval_loss": 1.9203195571899414,
"eval_runtime": 0.8302,
"eval_samples_per_second": 2.409,
"eval_steps_per_second": 1.205,
"step": 478
},
{
"epoch": 0.9989550679205852,
"step": 478,
"total_flos": 3.643767570437243e+17,
"train_loss": 4.360991338805674,
"train_runtime": 2613.4355,
"train_samples_per_second": 2.928,
"train_steps_per_second": 0.183
}
],
"logging_steps": 5,
"max_steps": 478,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.643767570437243e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}