{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9989550679205852,
  "eval_steps": 500,
  "global_step": 478,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0020898641588296763,
      "grad_norm": 216.79754638671875,
      "learning_rate": 6.2499999999999995e-06,
      "loss": 57.9838,
      "step": 1
    },
    {
      "epoch": 0.01044932079414838,
      "grad_norm": 184.4412841796875,
      "learning_rate": 3.125e-05,
      "loss": 60.093,
      "step": 5
    },
    {
      "epoch": 0.02089864158829676,
      "grad_norm": 107.91060638427734,
      "learning_rate": 6.25e-05,
      "loss": 48.3094,
      "step": 10
    },
    {
      "epoch": 0.03134796238244514,
      "grad_norm": 17.1436710357666,
      "learning_rate": 9.374999999999999e-05,
      "loss": 33.2668,
      "step": 15
    },
    {
      "epoch": 0.04179728317659352,
      "grad_norm": 12.335116386413574,
      "learning_rate": 0.000125,
      "loss": 27.698,
      "step": 20
    },
    {
      "epoch": 0.0522466039707419,
      "grad_norm": 6.2943196296691895,
      "learning_rate": 0.00015625,
      "loss": 25.9692,
      "step": 25
    },
    {
      "epoch": 0.06269592476489028,
      "grad_norm": 5.466517448425293,
      "learning_rate": 0.00018749999999999998,
      "loss": 25.2691,
      "step": 30
    },
    {
      "epoch": 0.07314524555903866,
      "grad_norm": 9.744288444519043,
      "learning_rate": 0.00021874999999999998,
      "loss": 23.7082,
      "step": 35
    },
    {
      "epoch": 0.08359456635318704,
      "grad_norm": 19.27219581604004,
      "learning_rate": 0.00025,
      "loss": 21.3655,
      "step": 40
    },
    {
      "epoch": 0.09404388714733543,
      "grad_norm": 41.77222442626953,
      "learning_rate": 0.00028125,
      "loss": 16.1707,
      "step": 45
    },
    {
      "epoch": 0.1044932079414838,
      "grad_norm": 18.60293960571289,
      "learning_rate": 0.0002999839868651235,
      "loss": 8.0969,
      "step": 50
    },
    {
      "epoch": 0.11494252873563218,
      "grad_norm": 11.452897071838379,
      "learning_rate": 0.00029980387835984494,
      "loss": 4.1367,
      "step": 55
    },
    {
      "epoch": 0.12539184952978055,
      "grad_norm": 8.422245979309082,
      "learning_rate": 0.000299423886051382,
      "loss": 3.1254,
      "step": 60
    },
    {
      "epoch": 0.13584117032392895,
      "grad_norm": 2.444629669189453,
      "learning_rate": 0.0002988445169647103,
      "loss": 2.4463,
      "step": 65
    },
    {
      "epoch": 0.14629049111807732,
      "grad_norm": 1.307098627090454,
      "learning_rate": 0.0002980665441538907,
      "loss": 2.1685,
      "step": 70
    },
    {
      "epoch": 0.15673981191222572,
      "grad_norm": 2.10964298248291,
      "learning_rate": 0.0002970910056705806,
      "loss": 2.0392,
      "step": 75
    },
    {
      "epoch": 0.1671891327063741,
      "grad_norm": 1.1905853748321533,
      "learning_rate": 0.0002959192031789579,
      "loss": 1.9225,
      "step": 80
    },
    {
      "epoch": 0.17763845350052246,
      "grad_norm": 0.8916841745376587,
      "learning_rate": 0.0002945527002189068,
      "loss": 1.8422,
      "step": 85
    },
    {
      "epoch": 0.18808777429467086,
      "grad_norm": 3.186051845550537,
      "learning_rate": 0.00029299332011978107,
      "loss": 1.748,
      "step": 90
    },
    {
      "epoch": 0.19853709508881923,
      "grad_norm": 3.865817070007324,
      "learning_rate": 0.00029124314356752967,
      "loss": 1.7184,
      "step": 95
    },
    {
      "epoch": 0.2089864158829676,
      "grad_norm": 2.8790738582611084,
      "learning_rate": 0.0002893045058284311,
      "loss": 1.6432,
      "step": 100
    },
    {
      "epoch": 0.219435736677116,
      "grad_norm": 1.6771491765975952,
      "learning_rate": 0.00028717999363313967,
      "loss": 1.6567,
      "step": 105
    },
    {
      "epoch": 0.22988505747126436,
      "grad_norm": 2.725285530090332,
      "learning_rate": 0.00028487244172520246,
      "loss": 1.6157,
      "step": 110
    },
    {
      "epoch": 0.24033437826541273,
      "grad_norm": 2.289280652999878,
      "learning_rate": 0.0002823849290786517,
      "loss": 1.6148,
      "step": 115
    },
    {
      "epoch": 0.2507836990595611,
      "grad_norm": 2.0211188793182373,
      "learning_rate": 0.0002797207747897198,
      "loss": 1.5858,
      "step": 120
    },
    {
      "epoch": 0.2612330198537095,
      "grad_norm": 2.0264103412628174,
      "learning_rate": 0.00027688353364815834,
      "loss": 1.5708,
      "step": 125
    },
    {
      "epoch": 0.2716823406478579,
      "grad_norm": 0.9253348112106323,
      "learning_rate": 0.0002738769913940706,
      "loss": 1.5481,
      "step": 130
    },
    {
      "epoch": 0.28213166144200624,
      "grad_norm": 3.3143184185028076,
      "learning_rate": 0.00027070515966658604,
      "loss": 1.5535,
      "step": 135
    },
    {
      "epoch": 0.29258098223615464,
      "grad_norm": 4.024845600128174,
      "learning_rate": 0.0002673722706511174,
      "loss": 1.5542,
      "step": 140
    },
    {
      "epoch": 0.30303030303030304,
      "grad_norm": 3.718261241912842,
      "learning_rate": 0.00026388277143234146,
      "loss": 1.5507,
      "step": 145
    },
    {
      "epoch": 0.31347962382445144,
      "grad_norm": 1.9526076316833496,
      "learning_rate": 0.0002602413180604401,
      "loss": 1.5251,
      "step": 150
    },
    {
      "epoch": 0.3239289446185998,
      "grad_norm": 1.5725075006484985,
      "learning_rate": 0.00025645276933851667,
      "loss": 1.4937,
      "step": 155
    },
    {
      "epoch": 0.3343782654127482,
      "grad_norm": 4.266882419586182,
      "learning_rate": 0.00025252218033947993,
      "loss": 1.4944,
      "step": 160
    },
    {
      "epoch": 0.3448275862068966,
      "grad_norm": 2.6647915840148926,
      "learning_rate": 0.0002484547956610429,
      "loss": 1.4798,
      "step": 165
    },
    {
      "epoch": 0.3552769070010449,
      "grad_norm": 2.0770153999328613,
      "learning_rate": 0.0002442560424278399,
      "loss": 1.4708,
      "step": 170
    },
    {
      "epoch": 0.3657262277951933,
      "grad_norm": 1.8132774829864502,
      "learning_rate": 0.00023993152304999582,
      "loss": 1.4554,
      "step": 175
    },
    {
      "epoch": 0.3761755485893417,
      "grad_norm": 1.9493850469589233,
      "learning_rate": 0.00023548700774781242,
      "loss": 1.485,
      "step": 180
    },
    {
      "epoch": 0.38662486938349006,
      "grad_norm": 3.6726951599121094,
      "learning_rate": 0.00023092842685254442,
      "loss": 1.4584,
      "step": 185
    },
    {
      "epoch": 0.39707419017763845,
      "grad_norm": 2.253319501876831,
      "learning_rate": 0.00022626186289353913,
      "loss": 1.4569,
      "step": 190
    },
    {
      "epoch": 0.40752351097178685,
      "grad_norm": 3.336820125579834,
      "learning_rate": 0.00022149354248229784,
      "loss": 1.4334,
      "step": 195
    },
    {
      "epoch": 0.4179728317659352,
      "grad_norm": 3.0895018577575684,
      "learning_rate": 0.0002166298280042877,
      "loss": 1.4203,
      "step": 200
    },
    {
      "epoch": 0.4284221525600836,
      "grad_norm": 1.8486225605010986,
      "learning_rate": 0.00021167720912959004,
      "loss": 1.414,
      "step": 205
    },
    {
      "epoch": 0.438871473354232,
      "grad_norm": 0.7216203808784485,
      "learning_rate": 0.00020664229415371266,
      "loss": 1.3897,
      "step": 210
    },
    {
      "epoch": 0.44932079414838033,
      "grad_norm": 2.909454107284546,
      "learning_rate": 0.0002015318011801192,
      "loss": 1.3713,
      "step": 215
    },
    {
      "epoch": 0.45977011494252873,
      "grad_norm": 1.5531753301620483,
      "learning_rate": 0.0001963525491562421,
      "loss": 1.4055,
      "step": 220
    },
    {
      "epoch": 0.4702194357366771,
      "grad_norm": 4.848015308380127,
      "learning_rate": 0.00019111144877493873,
      "loss": 1.435,
      "step": 225
    },
    {
      "epoch": 0.48066875653082547,
      "grad_norm": 4.833097457885742,
      "learning_rate": 0.00018581549325353126,
      "loss": 1.417,
      "step": 230
    },
    {
      "epoch": 0.49111807732497387,
      "grad_norm": 1.415703296661377,
      "learning_rate": 0.00018047174900273435,
      "loss": 1.4449,
      "step": 235
    },
    {
      "epoch": 0.5015673981191222,
      "grad_norm": 0.9621894359588623,
      "learning_rate": 0.00017508734619791966,
      "loss": 1.3907,
      "step": 240
    },
    {
      "epoch": 0.5120167189132706,
      "grad_norm": 2.091428279876709,
      "learning_rate": 0.0001696694692653004,
      "loss": 1.3581,
      "step": 245
    },
    {
      "epoch": 0.522466039707419,
      "grad_norm": 1.3531287908554077,
      "learning_rate": 0.00016422534729572738,
      "loss": 1.3717,
      "step": 250
    },
    {
      "epoch": 0.5329153605015674,
      "grad_norm": 1.8569897413253784,
      "learning_rate": 0.0001587622443988899,
      "loss": 1.3811,
      "step": 255
    },
    {
      "epoch": 0.5433646812957158,
      "grad_norm": 4.248292446136475,
      "learning_rate": 0.0001532874500107902,
      "loss": 1.3797,
      "step": 260
    },
    {
      "epoch": 0.5538140020898642,
      "grad_norm": 2.5460174083709717,
      "learning_rate": 0.0001478082691674256,
      "loss": 1.3576,
      "step": 265
    },
    {
      "epoch": 0.5642633228840125,
      "grad_norm": 1.3485275506973267,
      "learning_rate": 0.00014233201275765494,
      "loss": 1.383,
      "step": 270
    },
    {
      "epoch": 0.5747126436781609,
      "grad_norm": 1.1686965227127075,
      "learning_rate": 0.00013686598776825563,
      "loss": 1.3715,
      "step": 275
    },
    {
      "epoch": 0.5851619644723093,
      "grad_norm": 1.8593087196350098,
      "learning_rate": 0.0001314174875341878,
      "loss": 1.3671,
      "step": 280
    },
    {
      "epoch": 0.5956112852664577,
      "grad_norm": 1.5989689826965332,
      "learning_rate": 0.0001259937820070732,
      "loss": 1.3379,
      "step": 285
    },
    {
      "epoch": 0.6060606060606061,
      "grad_norm": 3.129467248916626,
      "learning_rate": 0.00012060210805487529,
      "loss": 1.3436,
      "step": 290
    },
    {
      "epoch": 0.6165099268547545,
      "grad_norm": 1.071311593055725,
      "learning_rate": 0.00011524965980572284,
      "loss": 1.3711,
      "step": 295
    },
    {
      "epoch": 0.6269592476489029,
      "grad_norm": 2.8161048889160156,
      "learning_rate": 0.00010994357904876106,
      "loss": 1.3242,
      "step": 300
    },
    {
      "epoch": 0.6374085684430512,
      "grad_norm": 0.9445050954818726,
      "learning_rate": 0.00010469094570483928,
      "loss": 1.3217,
      "step": 305
    },
    {
      "epoch": 0.6478578892371996,
      "grad_norm": 1.53034508228302,
      "learning_rate": 9.949876837974944e-05,
      "loss": 1.314,
      "step": 310
    },
    {
      "epoch": 0.658307210031348,
      "grad_norm": 1.8168761730194092,
      "learning_rate": 9.437397501262026e-05,
      "loss": 1.3365,
      "step": 315
    },
    {
      "epoch": 0.6687565308254964,
      "grad_norm": 1.4955302476882935,
      "learning_rate": 8.932340363194595e-05,
      "loss": 1.3154,
      "step": 320
    },
    {
      "epoch": 0.6792058516196448,
      "grad_norm": 1.2552021741867065,
      "learning_rate": 8.435379323158218e-05,
      "loss": 1.3366,
      "step": 325
    },
    {
      "epoch": 0.6896551724137931,
      "grad_norm": 2.914289712905884,
      "learning_rate": 7.947177477888472e-05,
      "loss": 1.3233,
      "step": 330
    },
    {
      "epoch": 0.7001044932079414,
      "grad_norm": 1.3406000137329102,
      "learning_rate": 7.46838623669881e-05,
      "loss": 1.3264,
      "step": 335
    },
    {
      "epoch": 0.7105538140020898,
      "grad_norm": 0.9025297164916992,
      "learning_rate": 6.999644452302975e-05,
      "loss": 1.3197,
      "step": 340
    },
    {
      "epoch": 0.7210031347962382,
      "grad_norm": 1.2824598550796509,
      "learning_rate": 6.541577568391758e-05,
      "loss": 1.3201,
      "step": 345
    },
    {
      "epoch": 0.7314524555903866,
      "grad_norm": 0.9296241998672485,
      "learning_rate": 6.0947967851014405e-05,
      "loss": 1.3097,
      "step": 350
    },
    {
      "epoch": 0.741901776384535,
      "grad_norm": 0.8738858699798584,
      "learning_rate": 5.659898243487463e-05,
      "loss": 1.3044,
      "step": 355
    },
    {
      "epoch": 0.7523510971786834,
      "grad_norm": 1.8482000827789307,
      "learning_rate": 5.237462230091467e-05,
      "loss": 1.3108,
      "step": 360
    },
    {
      "epoch": 0.7628004179728317,
      "grad_norm": 2.537909746170044,
      "learning_rate": 4.8280524026630565e-05,
      "loss": 1.3164,
      "step": 365
    },
    {
      "epoch": 0.7732497387669801,
      "grad_norm": 1.3068586587905884,
      "learning_rate": 4.432215038069449e-05,
      "loss": 1.2782,
      "step": 370
    },
    {
      "epoch": 0.7836990595611285,
      "grad_norm": 1.3742858171463013,
      "learning_rate": 4.0504783033964645e-05,
      "loss": 1.3179,
      "step": 375
    },
    {
      "epoch": 0.7941483803552769,
      "grad_norm": 1.2923156023025513,
      "learning_rate": 3.6833515512134606e-05,
      "loss": 1.2904,
      "step": 380
    },
    {
      "epoch": 0.8045977011494253,
      "grad_norm": 0.7867398262023926,
      "learning_rate": 3.331324639942526e-05,
      "loss": 1.3029,
      "step": 385
    },
    {
      "epoch": 0.8150470219435737,
      "grad_norm": 1.1442195177078247,
      "learning_rate": 2.9948672802388135e-05,
      "loss": 1.3069,
      "step": 390
    },
    {
      "epoch": 0.8254963427377221,
      "grad_norm": 1.4821033477783203,
      "learning_rate": 2.67442840825406e-05,
      "loss": 1.3177,
      "step": 395
    },
    {
      "epoch": 0.8359456635318704,
      "grad_norm": 0.9633380770683289,
      "learning_rate": 2.3704355866196373e-05,
      "loss": 1.3249,
      "step": 400
    },
    {
      "epoch": 0.8463949843260188,
      "grad_norm": 1.2908155918121338,
      "learning_rate": 2.083294433948324e-05,
      "loss": 1.3449,
      "step": 405
    },
    {
      "epoch": 0.8568443051201672,
      "grad_norm": 1.1834619045257568,
      "learning_rate": 1.813388083616068e-05,
      "loss": 1.3086,
      "step": 410
    },
    {
      "epoch": 0.8672936259143156,
      "grad_norm": 1.1399352550506592,
      "learning_rate": 1.5610766725458834e-05,
      "loss": 1.315,
      "step": 415
    },
    {
      "epoch": 0.877742946708464,
      "grad_norm": 1.2300066947937012,
      "learning_rate": 1.326696860675981e-05,
      "loss": 1.2894,
      "step": 420
    },
    {
      "epoch": 0.8881922675026124,
      "grad_norm": 0.9975532293319702,
      "learning_rate": 1.1105613817532976e-05,
      "loss": 1.2953,
      "step": 425
    },
    {
      "epoch": 0.8986415882967607,
      "grad_norm": 0.9357336163520813,
      "learning_rate": 9.129586260518634e-06,
      "loss": 1.3159,
      "step": 430
    },
    {
      "epoch": 0.9090909090909091,
      "grad_norm": 0.7603440880775452,
      "learning_rate": 7.34152255572697e-06,
      "loss": 1.2897,
      "step": 435
    },
    {
      "epoch": 0.9195402298850575,
      "grad_norm": 0.8711851835250854,
      "learning_rate": 5.743808522387544e-06,
      "loss": 1.275,
      "step": 440
    },
    {
      "epoch": 0.9299895506792059,
      "grad_norm": 0.9144044518470764,
      "learning_rate": 4.33857599554282e-06,
      "loss": 1.328,
      "step": 445
    },
    {
      "epoch": 0.9404388714733543,
      "grad_norm": 0.862479567527771,
      "learning_rate": 3.1276999815337544e-06,
      "loss": 1.2879,
      "step": 450
    },
    {
      "epoch": 0.9508881922675027,
      "grad_norm": 0.7352892756462097,
      "learning_rate": 2.1127961561727193e-06,
      "loss": 1.2873,
      "step": 455
    },
    {
      "epoch": 0.9613375130616509,
      "grad_norm": 2.582821846008301,
      "learning_rate": 1.2952187089419642e-06,
      "loss": 1.3191,
      "step": 460
    },
    {
      "epoch": 0.9717868338557993,
      "grad_norm": 0.7060139179229736,
      "learning_rate": 6.760585360942872e-07,
      "loss": 1.3047,
      "step": 465
    },
    {
      "epoch": 0.9822361546499477,
      "grad_norm": 0.8089200258255005,
      "learning_rate": 2.5614178506644934e-07,
      "loss": 1.2743,
      "step": 470
    },
    {
      "epoch": 0.9926854754440961,
      "grad_norm": 1.2739328145980835,
      "learning_rate": 3.6028752148081766e-08,
      "loss": 1.3004,
      "step": 475
    },
    {
      "epoch": 0.9989550679205852,
      "eval_loss": 1.9203195571899414,
      "eval_runtime": 0.8302,
      "eval_samples_per_second": 2.409,
      "eval_steps_per_second": 1.205,
      "step": 478
    },
    {
      "epoch": 0.9989550679205852,
      "step": 478,
      "total_flos": 3.643767570437243e+17,
      "train_loss": 4.360991338805674,
      "train_runtime": 2613.4355,
      "train_samples_per_second": 2.928,
      "train_steps_per_second": 0.183
    }
  ],
  "logging_steps": 5,
  "max_steps": 478,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 3.643767570437243e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}