{
  "best_metric": 1.3305245637893677,
  "best_model_checkpoint": "./results/models/checkpoint-182628",
  "epoch": 19.0,
  "eval_steps": 500,
  "global_step": 182628,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    { "epoch": 0.05201831044527674, "grad_norm": 1.3984375, "learning_rate": 0.001997919267582189, "loss": 2.3433, "step": 500 },
    { "epoch": 0.10403662089055347, "grad_norm": 0.92578125, "learning_rate": 0.001995838535164378, "loss": 1.9524, "step": 1000 },
    { "epoch": 0.1560549313358302, "grad_norm": 0.6171875, "learning_rate": 0.001993757802746567, "loss": 1.8833, "step": 1500 },
    { "epoch": 0.20807324178110695, "grad_norm": 0.80859375, "learning_rate": 0.0019916770703287557, "loss": 1.8518, "step": 2000 },
    { "epoch": 0.2600915522263837, "grad_norm": 0.494140625, "learning_rate": 0.0019895963379109446, "loss": 1.8064, "step": 2500 },
    { "epoch": 0.3121098626716604, "grad_norm": 0.61328125, "learning_rate": 0.0019875156054931335, "loss": 1.7881, "step": 3000 },
    { "epoch": 0.3641281731169372, "grad_norm": 0.71484375, "learning_rate": 0.0019854348730753224, "loss": 1.7552, "step": 3500 },
    { "epoch": 0.4161464835622139, "grad_norm": 0.34375, "learning_rate": 0.0019833541406575114, "loss": 1.7362, "step": 4000 },
    { "epoch": 0.4681647940074906, "grad_norm": 0.310546875, "learning_rate": 0.0019812734082397003, "loss": 1.7302, "step": 4500 },
    { "epoch": 0.5201831044527674, "grad_norm": 0.33203125, "learning_rate": 0.0019791926758218896, "loss": 1.6997, "step": 5000 },
    { "epoch": 0.5722014148980441, "grad_norm": 0.41796875, "learning_rate": 0.001977111943404078, "loss": 1.6694, "step": 5500 },
    { "epoch": 0.6242197253433208, "grad_norm": 0.71484375, "learning_rate": 0.001975031210986267, "loss": 1.6882, "step": 6000 },
    { "epoch": 0.6762380357885975, "grad_norm": 0.40625, "learning_rate": 0.0019729504785684564, "loss": 1.6712, "step": 6500 },
    { "epoch": 0.7282563462338744, "grad_norm": 1.21875, "learning_rate": 0.0019708697461506453, "loss": 1.6519, "step": 7000 },
    { "epoch": 0.7802746566791511, "grad_norm": 0.27734375, "learning_rate": 0.0019687890137328337, "loss": 1.6525, "step": 7500 },
    { "epoch": 0.8322929671244278, "grad_norm": 0.408203125, "learning_rate": 0.001966708281315023, "loss": 1.6239, "step": 8000 },
    { "epoch": 0.8843112775697045, "grad_norm": 0.546875, "learning_rate": 0.001964627548897212, "loss": 1.6152, "step": 8500 },
    { "epoch": 0.9363295880149812, "grad_norm": 0.609375, "learning_rate": 0.0019625468164794005, "loss": 1.6053, "step": 9000 },
    { "epoch": 0.9883478984602581, "grad_norm": 0.86328125, "learning_rate": 0.00196046608406159, "loss": 1.6078, "step": 9500 },
    { "epoch": 1.0, "eval_loss": 1.5721051692962646, "eval_runtime": 1.4853, "eval_samples_per_second": 673.283, "eval_steps_per_second": 0.673, "step": 9612 },
    { "epoch": 1.0403662089055348, "grad_norm": 0.31640625, "learning_rate": 0.0019583853516437788, "loss": 1.5953, "step": 10000 },
    { "epoch": 1.0923845193508115, "grad_norm": 0.484375, "learning_rate": 0.0019563046192259677, "loss": 1.587, "step": 10500 },
    { "epoch": 1.1444028297960882, "grad_norm": 0.5625, "learning_rate": 0.0019542238868081566, "loss": 1.5759, "step": 11000 },
    { "epoch": 1.196421140241365, "grad_norm": 0.6484375, "learning_rate": 0.0019521431543903455, "loss": 1.5785, "step": 11500 },
    { "epoch": 1.2484394506866416, "grad_norm": 0.3984375, "learning_rate": 0.0019500624219725344, "loss": 1.5734, "step": 12000 },
    { "epoch": 1.3004577611319184, "grad_norm": 0.3359375, "learning_rate": 0.0019479816895547233, "loss": 1.5593, "step": 12500 },
    { "epoch": 1.352476071577195, "grad_norm": 0.408203125, "learning_rate": 0.0019459009571369122, "loss": 1.5589, "step": 13000 },
    { "epoch": 1.404494382022472, "grad_norm": 0.4296875, "learning_rate": 0.0019438202247191011, "loss": 1.555, "step": 13500 },
    { "epoch": 1.4565126924677487, "grad_norm": 0.310546875, "learning_rate": 0.00194173949230129, "loss": 1.548, "step": 14000 },
    { "epoch": 1.5085310029130254, "grad_norm": 0.5234375, "learning_rate": 0.001939658759883479, "loss": 1.5519, "step": 14500 },
    { "epoch": 1.5605493133583022, "grad_norm": 0.81640625, "learning_rate": 0.001937578027465668, "loss": 1.5534, "step": 15000 },
    { "epoch": 1.6125676238035789, "grad_norm": 0.275390625, "learning_rate": 0.0019354972950478568, "loss": 1.5408, "step": 15500 },
    { "epoch": 1.6645859342488556, "grad_norm": 0.421875, "learning_rate": 0.0019334165626300457, "loss": 1.5339, "step": 16000 },
    { "epoch": 1.7166042446941323, "grad_norm": 0.337890625, "learning_rate": 0.0019313358302122348, "loss": 1.5238, "step": 16500 },
    { "epoch": 1.768622555139409, "grad_norm": 0.474609375, "learning_rate": 0.0019292550977944235, "loss": 1.527, "step": 17000 },
    { "epoch": 1.8206408655846857, "grad_norm": 0.345703125, "learning_rate": 0.0019271743653766125, "loss": 1.5253, "step": 17500 },
    { "epoch": 1.8726591760299627, "grad_norm": 0.53125, "learning_rate": 0.0019250936329588016, "loss": 1.5217, "step": 18000 },
    { "epoch": 1.9246774864752392, "grad_norm": 0.416015625, "learning_rate": 0.0019230129005409905, "loss": 1.5201, "step": 18500 },
    { "epoch": 1.9766957969205161, "grad_norm": 0.212890625, "learning_rate": 0.0019209321681231794, "loss": 1.5157, "step": 19000 },
    { "epoch": 2.0, "eval_loss": 1.4967154264450073, "eval_runtime": 1.4155, "eval_samples_per_second": 706.48, "eval_steps_per_second": 0.706, "step": 19224 },
    { "epoch": 2.0287141073657926, "grad_norm": 0.640625, "learning_rate": 0.0019188514357053683, "loss": 1.5093, "step": 19500 },
    { "epoch": 2.0807324178110695, "grad_norm": 0.349609375, "learning_rate": 0.0019167707032875572, "loss": 1.5081, "step": 20000 },
    { "epoch": 2.132750728256346, "grad_norm": 0.447265625, "learning_rate": 0.0019146899708697464, "loss": 1.5137, "step": 20500 },
    { "epoch": 2.184769038701623, "grad_norm": 0.30078125, "learning_rate": 0.001912609238451935, "loss": 1.5052, "step": 21000 },
    { "epoch": 2.2367873491468995, "grad_norm": 0.423828125, "learning_rate": 0.001910528506034124, "loss": 1.4989, "step": 21500 },
    { "epoch": 2.2888056595921764, "grad_norm": 0.322265625, "learning_rate": 0.0019084477736163131, "loss": 1.4933, "step": 22000 },
    { "epoch": 2.3408239700374533, "grad_norm": 0.287109375, "learning_rate": 0.0019063670411985018, "loss": 1.4908, "step": 22500 },
    { "epoch": 2.39284228048273, "grad_norm": 0.60546875, "learning_rate": 0.0019042863087806907, "loss": 1.483, "step": 23000 },
    { "epoch": 2.444860590928007, "grad_norm": 0.37890625, "learning_rate": 0.0019022055763628799, "loss": 1.4808, "step": 23500 },
    { "epoch": 2.4968789013732833, "grad_norm": 0.28125, "learning_rate": 0.0019001248439450688, "loss": 1.4751, "step": 24000 },
    { "epoch": 2.54889721181856, "grad_norm": 0.26953125, "learning_rate": 0.0018980441115272575, "loss": 1.4713, "step": 24500 },
    { "epoch": 2.6009155222638367, "grad_norm": 0.2392578125, "learning_rate": 0.0018959633791094466, "loss": 1.4743, "step": 25000 },
    { "epoch": 2.6529338327091136, "grad_norm": 0.255859375, "learning_rate": 0.0018938826466916355, "loss": 1.4703, "step": 25500 },
    { "epoch": 2.70495214315439, "grad_norm": 0.333984375, "learning_rate": 0.0018918019142738244, "loss": 1.4722, "step": 26000 },
    { "epoch": 2.756970453599667, "grad_norm": 0.87890625, "learning_rate": 0.0018897211818560133, "loss": 1.4728, "step": 26500 },
    { "epoch": 2.808988764044944, "grad_norm": 0.435546875, "learning_rate": 0.0018876404494382023, "loss": 1.4738, "step": 27000 },
    { "epoch": 2.8610070744902205, "grad_norm": 1.7265625, "learning_rate": 0.0018855597170203914, "loss": 1.4717, "step": 27500 },
    { "epoch": 2.9130253849354975, "grad_norm": 0.361328125, "learning_rate": 0.00188347898460258, "loss": 1.473, "step": 28000 },
    { "epoch": 2.965043695380774, "grad_norm": 0.3984375, "learning_rate": 0.001881398252184769, "loss": 1.472, "step": 28500 },
    { "epoch": 3.0, "eval_loss": 1.4684182405471802, "eval_runtime": 1.4391, "eval_samples_per_second": 694.9, "eval_steps_per_second": 0.695, "step": 28836 },
    { "epoch": 3.017062005826051, "grad_norm": 0.23046875, "learning_rate": 0.0018793175197669581, "loss": 1.4727, "step": 29000 },
    { "epoch": 3.0690803162713274, "grad_norm": 0.73046875, "learning_rate": 0.001877236787349147, "loss": 1.4677, "step": 29500 },
    { "epoch": 3.1210986267166043, "grad_norm": 0.470703125, "learning_rate": 0.0018751560549313357, "loss": 1.4668, "step": 30000 },
    { "epoch": 3.173116937161881, "grad_norm": 1.2578125, "learning_rate": 0.0018730753225135249, "loss": 1.4653, "step": 30500 },
    { "epoch": 3.2251352476071578, "grad_norm": 0.2890625, "learning_rate": 0.0018709945900957138, "loss": 1.4667, "step": 31000 },
    { "epoch": 3.2771535580524347, "grad_norm": 0.361328125, "learning_rate": 0.0018689138576779025, "loss": 1.4613, "step": 31500 },
    { "epoch": 3.329171868497711, "grad_norm": 0.462890625, "learning_rate": 0.0018668331252600916, "loss": 1.4604, "step": 32000 },
    { "epoch": 3.381190178942988, "grad_norm": 0.5546875, "learning_rate": 0.0018647523928422805, "loss": 1.4672, "step": 32500 },
    { "epoch": 3.4332084893882646, "grad_norm": 2.0625, "learning_rate": 0.0018626716604244697, "loss": 1.465, "step": 33000 },
    { "epoch": 3.4852267998335416, "grad_norm": 0.39453125, "learning_rate": 0.0018605909280066584, "loss": 1.4604, "step": 33500 },
    { "epoch": 3.537245110278818, "grad_norm": 0.255859375, "learning_rate": 0.0018585101955888473, "loss": 1.4539, "step": 34000 },
    { "epoch": 3.589263420724095, "grad_norm": 0.59375, "learning_rate": 0.0018564294631710364, "loss": 1.4536, "step": 34500 },
    { "epoch": 3.6412817311693715, "grad_norm": 0.30078125, "learning_rate": 0.001854348730753225, "loss": 1.4549, "step": 35000 },
    { "epoch": 3.6933000416146484, "grad_norm": 0.345703125, "learning_rate": 0.001852267998335414, "loss": 1.4571, "step": 35500 },
    { "epoch": 3.7453183520599254, "grad_norm": 0.376953125, "learning_rate": 0.0018501872659176031, "loss": 1.4584, "step": 36000 },
    { "epoch": 3.797336662505202, "grad_norm": 0.53515625, "learning_rate": 0.001848106533499792, "loss": 1.4542, "step": 36500 },
    { "epoch": 3.8493549729504783, "grad_norm": 0.3515625, "learning_rate": 0.0018460258010819808, "loss": 1.4588, "step": 37000 },
    { "epoch": 3.9013732833957553, "grad_norm": 0.279296875, "learning_rate": 0.0018439450686641699, "loss": 1.456, "step": 37500 },
    { "epoch": 3.9533915938410322, "grad_norm": 0.31640625, "learning_rate": 0.0018418643362463588, "loss": 1.4596, "step": 38000 },
    { "epoch": 4.0, "eval_loss": 1.4403541088104248, "eval_runtime": 1.4124, "eval_samples_per_second": 708.019, "eval_steps_per_second": 0.708, "step": 38448 },
    { "epoch": 4.005409904286309, "grad_norm": 0.259765625, "learning_rate": 0.0018397836038285475, "loss": 1.449, "step": 38500 },
    { "epoch": 4.057428214731585, "grad_norm": 0.349609375, "learning_rate": 0.0018377028714107366, "loss": 1.445, "step": 39000 },
    { "epoch": 4.109446525176862, "grad_norm": 0.330078125, "learning_rate": 0.0018356221389929255, "loss": 1.4441, "step": 39500 },
    { "epoch": 4.161464835622139, "grad_norm": 0.283203125, "learning_rate": 0.0018335414065751145, "loss": 1.4423, "step": 40000 },
    { "epoch": 4.213483146067416, "grad_norm": 0.54296875, "learning_rate": 0.0018314606741573034, "loss": 1.442, "step": 40500 },
    { "epoch": 4.265501456512692, "grad_norm": 0.23046875, "learning_rate": 0.0018293799417394923, "loss": 1.4348, "step": 41000 },
    { "epoch": 4.317519766957969, "grad_norm": 0.50390625, "learning_rate": 0.0018272992093216814, "loss": 1.4352, "step": 41500 },
    { "epoch": 4.369538077403246, "grad_norm": 0.4765625, "learning_rate": 0.0018252184769038703, "loss": 1.4344, "step": 42000 },
    { "epoch": 4.421556387848523, "grad_norm": 0.400390625, "learning_rate": 0.001823137744486059, "loss": 1.438, "step": 42500 },
    { "epoch": 4.473574698293799, "grad_norm": 0.42578125, "learning_rate": 0.0018210570120682482, "loss": 1.4417, "step": 43000 },
    { "epoch": 4.525593008739076, "grad_norm": 1.2265625, "learning_rate": 0.001818976279650437, "loss": 1.436, "step": 43500 },
    { "epoch": 4.577611319184353, "grad_norm": 0.48828125, "learning_rate": 0.0018168955472326258, "loss": 1.4367, "step": 44000 },
    { "epoch": 4.62962962962963, "grad_norm": 0.37109375, "learning_rate": 0.001814814814814815, "loss": 1.4299, "step": 44500 },
    { "epoch": 4.681647940074907, "grad_norm": 0.31640625, "learning_rate": 0.0018127340823970038, "loss": 1.4332, "step": 45000 },
    { "epoch": 4.733666250520183, "grad_norm": 0.455078125, "learning_rate": 0.0018106533499791927, "loss": 1.4269, "step": 45500 },
    { "epoch": 4.78568456096546, "grad_norm": 0.671875, "learning_rate": 0.0018085726175613816, "loss": 1.4276, "step": 46000 },
    { "epoch": 4.837702871410737, "grad_norm": 1.0078125, "learning_rate": 0.0018064918851435705, "loss": 1.4288, "step": 46500 },
    { "epoch": 4.889721181856014, "grad_norm": 1.34375, "learning_rate": 0.0018044111527257595, "loss": 1.4351, "step": 47000 },
    { "epoch": 4.94173949230129, "grad_norm": 0.470703125, "learning_rate": 0.0018023304203079484, "loss": 1.4307, "step": 47500 },
    { "epoch": 4.9937578027465666, "grad_norm": 0.2470703125, "learning_rate": 0.0018002496878901373, "loss": 1.4257, "step": 48000 },
    { "epoch": 5.0, "eval_loss": 1.4106667041778564, "eval_runtime": 1.4218, "eval_samples_per_second": 703.325, "eval_steps_per_second": 0.703, "step": 48060 },
    { "epoch": 5.0457761131918435, "grad_norm": 0.345703125, "learning_rate": 0.0017981689554723264, "loss": 1.4193, "step": 48500 },
    { "epoch": 5.09779442363712, "grad_norm": 0.322265625, "learning_rate": 0.0017960882230545153, "loss": 1.4176, "step": 49000 },
    { "epoch": 5.149812734082397, "grad_norm": 0.27734375, "learning_rate": 0.001794007490636704, "loss": 1.4157, "step": 49500 },
    { "epoch": 5.201831044527673, "grad_norm": 0.333984375, "learning_rate": 0.0017919267582188932, "loss": 1.4186, "step": 50000 },
    { "epoch": 5.25384935497295, "grad_norm": 0.2734375, "learning_rate": 0.001789846025801082, "loss": 1.4155, "step": 50500 },
    { "epoch": 5.305867665418227, "grad_norm": 2.640625, "learning_rate": 0.0017877652933832708, "loss": 1.4142, "step": 51000 },
    { "epoch": 5.357885975863504, "grad_norm": 0.2431640625, "learning_rate": 0.00178568456096546, "loss": 1.4155, "step": 51500 },
    { "epoch": 5.40990428630878, "grad_norm": 0.244140625, "learning_rate": 0.0017836038285476488, "loss": 1.4113, "step": 52000 },
    { "epoch": 5.461922596754057, "grad_norm": 0.21484375, "learning_rate": 0.0017815230961298377, "loss": 1.4145, "step": 52500 },
    { "epoch": 5.513940907199334, "grad_norm": 0.42578125, "learning_rate": 0.0017794423637120266, "loss": 1.4132, "step": 53000 },
    { "epoch": 5.565959217644611, "grad_norm": 0.2060546875, "learning_rate": 0.0017773616312942156, "loss": 1.4141, "step": 53500 },
    { "epoch": 5.617977528089888, "grad_norm": 0.51171875, "learning_rate": 0.0017752808988764045, "loss": 1.4118, "step": 54000 },
    { "epoch": 5.669995838535164, "grad_norm": 0.73828125, "learning_rate": 0.0017732001664585936, "loss": 1.4094, "step": 54500 },
    { "epoch": 5.722014148980441, "grad_norm": 0.390625, "learning_rate": 0.0017711194340407823, "loss": 1.4088, "step": 55000 },
    { "epoch": 5.774032459425718, "grad_norm": 2.34375, "learning_rate": 0.0017690387016229714, "loss": 1.4068, "step": 55500 },
    { "epoch": 5.826050769870995, "grad_norm": 0.265625, "learning_rate": 0.0017669579692051603, "loss": 1.4059, "step": 56000 },
    { "epoch": 5.878069080316271, "grad_norm": 0.283203125, "learning_rate": 0.001764877236787349, "loss": 1.4041, "step": 56500 },
    { "epoch": 5.930087390761548, "grad_norm": 0.77734375, "learning_rate": 0.0017627965043695382, "loss": 1.4024, "step": 57000 },
    { "epoch": 5.982105701206825, "grad_norm": 0.30859375, "learning_rate": 0.001760715771951727, "loss": 1.4006, "step": 57500 },
    { "epoch": 6.0, "eval_loss": 1.3921489715576172, "eval_runtime": 1.4219, "eval_samples_per_second": 703.273, "eval_steps_per_second": 0.703, "step": 57672 },
    { "epoch": 6.034124011652102, "grad_norm": 0.255859375, "learning_rate": 0.001758635039533916, "loss": 1.4001, "step": 58000 },
    { "epoch": 6.086142322097379, "grad_norm": 0.2431640625, "learning_rate": 0.001756554307116105, "loss": 1.3981, "step": 58500 },
    { "epoch": 6.138160632542655, "grad_norm": 0.4296875, "learning_rate": 0.0017544735746982938, "loss": 1.4018, "step": 59000 },
    { "epoch": 6.190178942987932, "grad_norm": 0.85546875, "learning_rate": 0.0017523928422804827, "loss": 1.397, "step": 59500 },
    { "epoch": 6.242197253433209, "grad_norm": 0.361328125, "learning_rate": 0.0017503121098626717, "loss": 1.3974, "step": 60000 },
    { "epoch": 6.294215563878486, "grad_norm": 3.578125, "learning_rate": 0.0017482313774448606, "loss": 1.397, "step": 60500 },
    { "epoch": 6.346233874323762, "grad_norm": 0.2451171875, "learning_rate": 0.0017461506450270495, "loss": 1.3971, "step": 61000 },
    { "epoch": 6.398252184769039, "grad_norm": 0.53125, "learning_rate": 0.0017440699126092386, "loss": 1.3955, "step": 61500 },
    { "epoch": 6.4502704952143155, "grad_norm": 0.24609375, "learning_rate": 0.0017419891801914273, "loss": 1.3959, "step": 62000 },
    { "epoch": 6.502288805659592, "grad_norm": 0.390625, "learning_rate": 0.0017399084477736164, "loss": 1.3951, "step": 62500 },
    { "epoch": 6.554307116104869, "grad_norm": 0.3125, "learning_rate": 0.0017378277153558054, "loss": 1.3977, "step": 63000 },
    { "epoch": 6.606325426550145, "grad_norm": 0.279296875, "learning_rate": 0.001735746982937994, "loss": 1.3976, "step": 63500 },
    { "epoch": 6.658343736995422, "grad_norm": 0.83984375, "learning_rate": 0.0017336662505201832, "loss": 1.3964, "step": 64000 },
    { "epoch": 6.710362047440699, "grad_norm": 0.357421875, "learning_rate": 0.001731585518102372, "loss": 1.396, "step": 64500 },
    { "epoch": 6.762380357885976, "grad_norm": 0.27734375, "learning_rate": 0.001729504785684561, "loss": 1.3931, "step": 65000 },
    { "epoch": 6.814398668331252, "grad_norm": 0.330078125, "learning_rate": 0.00172742405326675, "loss": 1.3931, "step": 65500 },
    { "epoch": 6.866416978776529, "grad_norm": 0.40625, "learning_rate": 0.0017253433208489388, "loss": 1.388, "step": 66000 },
    { "epoch": 6.918435289221806, "grad_norm": 0.2578125, "learning_rate": 0.0017232625884311278, "loss": 1.3862, "step": 66500 },
    { "epoch": 6.970453599667083, "grad_norm": 0.3984375, "learning_rate": 0.0017211818560133169, "loss": 1.3865, "step": 67000 },
    { "epoch": 7.0, "eval_loss": 1.3738893270492554, "eval_runtime": 1.4206, "eval_samples_per_second": 703.936, "eval_steps_per_second": 0.704, "step": 67284 },
    { "epoch": 7.022471910112359, "grad_norm": 0.298828125, "learning_rate": 0.0017191011235955056, "loss": 1.385, "step": 67500 },
    { "epoch": 7.074490220557636, "grad_norm": 0.39453125, "learning_rate": 0.0017170203911776945, "loss": 1.3827, "step": 68000 },
    { "epoch": 7.126508531002913, "grad_norm": 0.609375, "learning_rate": 0.0017149396587598836, "loss": 1.3864, "step": 68500 },
    { "epoch": 7.17852684144819, "grad_norm": 0.51953125, "learning_rate": 0.0017128589263420723, "loss": 1.3874, "step": 69000 },
    { "epoch": 7.230545151893467, "grad_norm": 0.326171875, "learning_rate": 0.0017107781939242615, "loss": 1.3913, "step": 69500 },
    { "epoch": 7.282563462338743, "grad_norm": 0.69921875, "learning_rate": 0.0017086974615064504, "loss": 1.3892, "step": 70000 },
    { "epoch": 7.33458177278402, "grad_norm": 0.38671875, "learning_rate": 0.0017066167290886393, "loss": 1.3888, "step": 70500 },
    { "epoch": 7.386600083229297, "grad_norm": 0.259765625, "learning_rate": 0.0017045359966708282, "loss": 1.3908, "step": 71000 },
    { "epoch": 7.438618393674574, "grad_norm": 0.419921875, "learning_rate": 0.001702455264253017, "loss": 1.3879, "step": 71500 },
    { "epoch": 7.49063670411985, "grad_norm": 0.2197265625, "learning_rate": 0.001700374531835206, "loss": 1.3873, "step": 72000 },
    { "epoch": 7.542655014565127, "grad_norm": 0.2119140625, "learning_rate": 0.001698293799417395, "loss": 1.3845, "step": 72500 },
    { "epoch": 7.594673325010404, "grad_norm": 0.34375, "learning_rate": 0.0016962130669995838, "loss": 1.3832, "step": 73000 },
    { "epoch": 7.646691635455681, "grad_norm": 0.55078125, "learning_rate": 0.0016941323345817728, "loss": 1.38, "step": 73500 },
    { "epoch": 7.698709945900957, "grad_norm": 0.240234375, "learning_rate": 0.001692051602163962, "loss": 1.3798, "step": 74000 },
    { "epoch": 7.750728256346234, "grad_norm": 0.2490234375, "learning_rate": 0.0016899708697461506, "loss": 1.3775, "step": 74500 },
    { "epoch": 7.802746566791511, "grad_norm": 1.015625, "learning_rate": 0.0016878901373283395, "loss": 1.3797, "step": 75000 },
    { "epoch": 7.8547648772367875, "grad_norm": 0.263671875, "learning_rate": 0.0016858094049105286, "loss": 1.3813, "step": 75500 },
    { "epoch": 7.9067831876820645, "grad_norm": 0.271484375, "learning_rate": 0.0016837286724927173, "loss": 1.3779, "step": 76000 },
    { "epoch": 7.9588014981273405, "grad_norm": 0.6875, "learning_rate": 0.0016816479400749065, "loss": 1.3771, "step": 76500 },
    { "epoch": 8.0, "eval_loss": 1.3700777292251587, "eval_runtime": 1.4197, "eval_samples_per_second": 704.38, "eval_steps_per_second": 0.704, "step": 76896 },
    { "epoch": 8.010819808572618, "grad_norm": 0.384765625, "learning_rate": 0.0016795672076570954, "loss": 1.3753, "step": 77000 },
    { "epoch": 8.062838119017893, "grad_norm": 0.376953125, "learning_rate": 0.0016774864752392843, "loss": 1.3763, "step": 77500 },
    { "epoch": 8.11485642946317, "grad_norm": 0.263671875, "learning_rate": 0.0016754057428214732, "loss": 1.3761, "step": 78000 },
    { "epoch": 8.166874739908447, "grad_norm": 0.392578125, "learning_rate": 0.0016733250104036621, "loss": 1.3786, "step": 78500 },
    { "epoch": 8.218893050353724, "grad_norm": 0.287109375, "learning_rate": 0.001671244277985851, "loss": 1.3786, "step": 79000 },
    { "epoch": 8.270911360799001, "grad_norm": 0.298828125, "learning_rate": 0.0016691635455680402, "loss": 1.3797, "step": 79500 },
    { "epoch": 8.322929671244278, "grad_norm": 0.341796875, "learning_rate": 0.0016670828131502289, "loss": 1.3791, "step": 80000 },
    { "epoch": 8.374947981689555, "grad_norm": 0.302734375, "learning_rate": 0.0016650020807324178, "loss": 1.3783, "step": 80500 },
    { "epoch": 8.426966292134832, "grad_norm": 0.318359375, "learning_rate": 0.001662921348314607, "loss": 1.376, "step": 81000 },
    { "epoch": 8.478984602580109, "grad_norm": 1.1796875, "learning_rate": 0.0016608406158967956, "loss": 1.3763, "step": 81500 },
    { "epoch": 8.531002913025384, "grad_norm": 0.408203125, "learning_rate": 0.0016587598834789845, "loss": 1.3757, "step": 82000 },
    { "epoch": 8.583021223470661, "grad_norm": 0.224609375, "learning_rate": 0.0016566791510611736, "loss": 1.377, "step": 82500 },
    { "epoch": 8.635039533915938, "grad_norm": 0.55859375, "learning_rate": 0.0016545984186433626, "loss": 1.3755, "step": 83000 },
    { "epoch": 8.687057844361215, "grad_norm": 0.23828125, "learning_rate": 0.0016525176862255513, "loss": 1.3739, "step": 83500 },
    { "epoch": 8.739076154806492, "grad_norm": 0.33984375, "learning_rate": 0.0016504369538077404, "loss": 1.3744, "step": 84000 },
    { "epoch": 8.791094465251769, "grad_norm": 0.36328125, "learning_rate": 0.0016483562213899293, "loss": 1.3733, "step": 84500 },
    { "epoch": 8.843112775697046, "grad_norm": 0.263671875, "learning_rate": 0.0016462754889721182, "loss": 1.3735, "step": 85000 },
    { "epoch": 8.895131086142323, "grad_norm": 0.478515625, "learning_rate": 0.0016441947565543071, "loss": 1.3754, "step": 85500 },
    { "epoch": 8.947149396587598, "grad_norm": 0.515625, "learning_rate": 0.001642114024136496, "loss": 1.3749, "step": 86000 },
    { "epoch": 8.999167707032875, "grad_norm": 0.5390625, "learning_rate": 0.0016400332917186852, "loss": 1.3739, "step": 86500 },
    { "epoch": 9.0, "eval_loss": 1.3678644895553589, "eval_runtime": 1.4359, "eval_samples_per_second": 696.406, "eval_steps_per_second": 0.696, "step": 86508 },
    { "epoch": 9.051186017478152, "grad_norm": 0.26171875, "learning_rate": 0.0016379525593008739, "loss": 1.3717, "step": 87000 },
    { "epoch": 9.103204327923429, "grad_norm": 0.2373046875, "learning_rate": 0.0016358718268830628, "loss": 1.3705, "step": 87500 },
    { "epoch": 9.155222638368706, "grad_norm": 0.2578125, "learning_rate": 0.001633791094465252, "loss": 1.3719, "step": 88000 },
    { "epoch": 9.207240948813983, "grad_norm": 0.32421875, "learning_rate": 0.0016317103620474408, "loss": 1.3727, "step": 88500 },
    { "epoch": 9.25925925925926, "grad_norm": 5.125, "learning_rate": 0.0016296296296296295, "loss": 1.3737, "step": 89000 },
    { "epoch": 9.311277569704536, "grad_norm": 0.263671875, "learning_rate": 0.0016275488972118187, "loss": 1.3707, "step": 89500 },
    { "epoch": 9.363295880149813, "grad_norm": 0.2021484375, "learning_rate": 0.0016254681647940076, "loss": 1.3694, "step": 90000 },
    { "epoch": 9.41531419059509, "grad_norm": 0.353515625, "learning_rate": 0.0016233874323761963, "loss": 1.3687, "step": 90500 },
    { "epoch": 9.467332501040365, "grad_norm": 0.7578125, "learning_rate": 0.0016213066999583854, "loss": 1.368, "step": 91000 },
    { "epoch": 9.519350811485642, "grad_norm": 0.30078125, "learning_rate": 0.0016192259675405743, "loss": 1.3716, "step": 91500 },
    { "epoch": 9.57136912193092, "grad_norm": 0.23828125, "learning_rate": 0.0016171452351227634, "loss": 1.3697, "step": 92000 },
    { "epoch": 9.623387432376196, "grad_norm": 0.271484375, "learning_rate": 0.0016150645027049521, "loss": 1.3692, "step": 92500 },
    { "epoch": 9.675405742821473, "grad_norm": 0.470703125, "learning_rate": 0.001612983770287141, "loss": 1.3682, "step": 93000 },
    { "epoch": 9.72742405326675, "grad_norm": 0.41015625, "learning_rate": 0.0016109030378693302, "loss": 1.3673, "step": 93500 },
    { "epoch": 9.779442363712027, "grad_norm": 0.25, "learning_rate": 0.0016088223054515189, "loss": 1.3663, "step": 94000 },
    { "epoch": 9.831460674157304, "grad_norm": 0.2578125, "learning_rate": 0.0016067415730337078, "loss": 1.3662, "step": 94500 },
    { "epoch": 9.88347898460258, "grad_norm": 0.361328125, "learning_rate": 0.001604660840615897, "loss": 1.37, "step": 95000 },
    { "epoch": 9.935497295047856, "grad_norm": 0.2578125, "learning_rate": 0.0016025801081980858, "loss": 1.372, "step": 95500 },
    { "epoch": 9.987515605493133, "grad_norm": 0.30859375, "learning_rate": 0.0016004993757802745, "loss": 1.3699, "step": 96000 },
    { "epoch": 10.0, "eval_loss": 1.3671537637710571, "eval_runtime": 2.0428, "eval_samples_per_second": 489.53, "eval_steps_per_second": 0.49, "step": 96120 },
    { "epoch": 10.03953391593841, "grad_norm": 0.2578125, "learning_rate": 0.0015984186433624637, "loss": 1.3686, "step": 96500 },
    { "epoch": 10.091552226383687, "grad_norm": 1.484375, "learning_rate": 0.0015963379109446526, "loss": 1.3682, "step": 97000 },
    { "epoch": 10.143570536828964, "grad_norm": 0.369140625, "learning_rate": 0.0015942571785268413, "loss": 1.3659, "step": 97500 },
    { "epoch": 10.19558884727424, "grad_norm": 0.263671875, "learning_rate": 0.0015921764461090304, "loss": 1.3653, "step": 98000 },
    { "epoch": 10.247607157719518, "grad_norm": 0.26171875, "learning_rate": 0.0015900957136912193, "loss": 1.3668, "step": 98500 },
    { "epoch": 10.299625468164795, "grad_norm": 0.2373046875, "learning_rate": 0.0015880149812734085, "loss": 1.3695, "step": 99000 },
    { "epoch": 10.35164377861007, "grad_norm": 0.2392578125, "learning_rate": 0.0015859342488555972, "loss": 1.3673, "step": 99500 },
    { "epoch": 10.403662089055347, "grad_norm": 0.248046875, "learning_rate": 0.001583853516437786, "loss": 1.3669, "step": 100000 },
    { "epoch": 10.455680399500624, "grad_norm": 0.3359375, "learning_rate": 0.0015817727840199752, "loss": 1.367, "step": 100500 },
    { "epoch": 10.5076987099459, "grad_norm": 0.330078125, "learning_rate": 0.0015796920516021641, "loss": 1.3668, "step": 101000 },
    { "epoch": 10.559717020391178, "grad_norm": 0.3125, "learning_rate": 0.0015776113191843528, "loss": 1.3654, "step": 101500 },
    { "epoch": 10.611735330836455, "grad_norm": 0.45703125, "learning_rate": 0.001575530586766542, "loss": 1.3655, "step": 102000 },
    { "epoch": 10.663753641281732, "grad_norm": 0.361328125, "learning_rate": 0.0015734498543487309, "loss": 1.3673, "step": 102500 },
    { "epoch": 10.715771951727008, "grad_norm": 0.2734375, "learning_rate": 0.0015713691219309195, "loss": 1.3651, "step": 103000 },
    { "epoch": 10.767790262172285, "grad_norm": 0.28125, "learning_rate": 0.0015692883895131087, "loss": 1.3652, "step": 103500 },
    { "epoch": 10.81980857261756, "grad_norm": 0.1982421875, "learning_rate": 0.0015672076570952976, "loss": 1.366, "step": 104000 },
    { "epoch": 10.871826883062838, "grad_norm": 0.416015625, "learning_rate": 0.0015651269246774865, "loss": 1.3624, "step": 104500 },
    { "epoch": 10.923845193508114, "grad_norm": 0.94921875, "learning_rate": 0.0015630461922596754, "loss": 1.3605, "step": 105000 },
    { "epoch": 10.975863503953391, "grad_norm": 0.74609375, "learning_rate": 0.0015609654598418643, "loss": 1.36, "step": 105500 },
    { "epoch": 11.0, "eval_loss": 1.3506468534469604, "eval_runtime": 1.4359, "eval_samples_per_second": 696.43, "eval_steps_per_second": 0.696, "step": 105732 },
    { "epoch": 11.027881814398668, "grad_norm": 0.30078125, "learning_rate": 0.0015588847274240535, "loss": 1.3598, "step": 106000 },
    { "epoch": 11.079900124843945, "grad_norm": 0.2177734375, "learning_rate": 0.0015568039950062422, "loss": 1.3614, "step": 106500 },
    { "epoch": 11.131918435289222, "grad_norm": 0.35546875, "learning_rate": 0.001554723262588431, "loss": 1.3587, "step": 107000 },
    { "epoch": 11.1839367457345, "grad_norm": 0.2109375, "learning_rate": 0.0015526425301706202, "loss": 1.3585, "step": 107500 },
    { "epoch": 11.235955056179776, "grad_norm": 1.2890625, "learning_rate": 0.0015505617977528091, "loss": 1.3577, "step": 108000 },
    { "epoch": 11.287973366625051, "grad_norm": 0.2109375, "learning_rate": 0.0015484810653349978, "loss": 1.3581, "step": 108500 },
    { "epoch": 11.339991677070328, "grad_norm": 0.298828125, "learning_rate": 0.001546400332917187, "loss": 1.3574, "step": 109000 },
    { "epoch": 11.392009987515605, "grad_norm": 0.330078125, "learning_rate": 0.0015443196004993759, "loss": 1.3586, "step": 109500 },
    { "epoch": 11.444028297960882, "grad_norm": 0.828125, "learning_rate": 0.0015422388680815646, "loss": 1.3582, "step": 110000 },
    { "epoch": 11.496046608406159, "grad_norm": 0.255859375, "learning_rate": 0.0015401581356637537, "loss": 1.3573, "step": 110500 },
    { "epoch": 11.548064918851436, "grad_norm": 0.23828125, "learning_rate": 0.0015380774032459426, "loss": 1.3592, "step": 111000 },
    { "epoch": 11.600083229296713, "grad_norm": 0.244140625, "learning_rate": 0.0015359966708281315, "loss": 1.3558, "step": 111500 },
    { "epoch": 11.65210153974199, "grad_norm": 0.2314453125, "learning_rate": 0.0015339159384103204, "loss": 1.3578, "step": 112000 },
    { "epoch": 11.704119850187267, "grad_norm": 0.220703125, "learning_rate": 0.0015318352059925093, "loss": 1.3562, "step": 112500 },
    { "epoch": 11.756138160632542, "grad_norm": 0.3828125, "learning_rate": 0.0015297544735746985, "loss": 1.3575, "step": 113000 },
    { "epoch": 11.808156471077819, "grad_norm": 0.30078125, "learning_rate": 0.0015276737411568874, "loss": 1.3568, "step": 113500 },
    { "epoch": 11.860174781523096, "grad_norm": 0.310546875, "learning_rate": 0.001525593008739076, "loss": 1.3611, "step": 114000 },
    { "epoch": 11.912193091968373, "grad_norm": 0.2236328125, "learning_rate": 0.0015235122763212652, "loss": 1.3598, "step": 114500 },
    { "epoch": 11.96421140241365, "grad_norm": 4.28125, "learning_rate": 0.0015214315439034541, "loss": 1.3598, "step": 115000 },
    { "epoch": 12.0, "eval_loss": 1.3485276699066162, "eval_runtime": 1.4101, "eval_samples_per_second": 709.157, "eval_steps_per_second": 0.709, "step": 115344 },
    { "epoch": 12.016229712858927, "grad_norm": 0.26171875, "learning_rate": 0.0015193508114856428, "loss": 1.3563, "step": 115500 },
    { "epoch": 12.068248023304204, "grad_norm": 0.255859375, "learning_rate": 0.001517270079067832, "loss": 1.3552, "step": 116000 },
    { "epoch": 12.12026633374948, "grad_norm": 0.32421875, "learning_rate": 0.0015151893466500209, "loss": 1.3552, "step": 116500 },
    { "epoch": 12.172284644194757, "grad_norm": 0.353515625, "learning_rate": 0.0015131086142322098, "loss": 1.3543, "step": 117000 },
    { "epoch": 12.224302954640033, "grad_norm": 0.248046875, "learning_rate": 0.0015110278818143987, "loss": 1.3534, "step": 117500 },
    { "epoch": 12.27632126508531, "grad_norm": 0.337890625, "learning_rate": 0.0015089471493965876, "loss": 1.3541, "step": 118000 },
    { "epoch": 12.328339575530586, "grad_norm": 0.228515625, "learning_rate": 0.0015068664169787765, "loss": 1.3542, "step": 118500 },
    { "epoch": 12.380357885975863, "grad_norm": 0.208984375, "learning_rate": 0.0015047856845609654, "loss": 1.3561, "step": 119000 },
    { "epoch": 12.43237619642114, "grad_norm": 0.251953125, "learning_rate": 0.0015027049521431544, "loss": 1.3544, "step": 119500 },
    { "epoch": 12.484394506866417, "grad_norm": 0.2353515625, "learning_rate": 0.0015006242197253433, "loss": 1.3561, "step": 120000 },
    { "epoch": 12.536412817311694, "grad_norm": 0.25390625, "learning_rate": 0.0014985434873075324, "loss": 1.3533, "step": 120500 },
    { "epoch": 12.588431127756971, "grad_norm": 0.34765625, "learning_rate": 0.001496462754889721, "loss": 1.3546, "step": 121000 },
    { "epoch": 12.640449438202246, "grad_norm": 0.341796875, "learning_rate": 0.0014943820224719102, "loss": 1.3548, "step": 121500 },
    { "epoch": 12.692467748647523, "grad_norm": 0.234375, "learning_rate": 0.0014923012900540991, "loss": 1.3531, "step": 122000 },
    { "epoch": 12.7444860590928, "grad_norm": 0.298828125, "learning_rate": 0.0014902205576362878, "loss": 1.3518, "step": 122500 },
    { "epoch": 12.796504369538077, "grad_norm": 0.326171875, "learning_rate": 0.001488139825218477, "loss": 1.3512, "step": 123000 },
    { "epoch": 12.848522679983354, "grad_norm": 0.546875, "learning_rate": 0.0014860590928006659, "loss": 1.3506, "step": 123500 },
    { "epoch": 12.900540990428631, "grad_norm": 0.28515625, "learning_rate": 0.0014839783603828548, "loss": 1.3539, "step": 124000 },
    { "epoch": 12.952559300873908, "grad_norm": 0.240234375, "learning_rate": 0.0014818976279650437, "loss": 1.3581, "step": 124500 },
    { "epoch": 13.0, "eval_loss": 1.3496302366256714, "eval_runtime": 1.4282, "eval_samples_per_second": 700.187, "eval_steps_per_second": 0.7, "step": 124956 },
    { "epoch": 13.004577611319185, "grad_norm": 0.2578125, "learning_rate": 0.0014798168955472326, "loss": 1.3558, "step": 125000 },
    { "epoch": 13.056595921764462, "grad_norm": 0.5390625, "learning_rate": 0.0014777361631294215, "loss": 1.3526, "step": 125500 },
    { "epoch": 13.108614232209737, "grad_norm": 0.2255859375, "learning_rate": 0.0014756554307116107, "loss": 1.3508, "step": 126000 },
    { "epoch": 13.160632542655014, "grad_norm": 0.361328125, "learning_rate": 0.0014735746982937994, "loss": 1.3507, "step": 126500 },
    { "epoch": 13.21265085310029, "grad_norm": 0.478515625, "learning_rate": 0.0014714939658759883, "loss": 1.3525, "step": 127000 },
    { "epoch": 13.264669163545568, "grad_norm": 0.2490234375, "learning_rate": 0.0014694132334581774, "loss": 1.3525, "step": 127500 },
    { "epoch": 13.316687473990845, "grad_norm": 0.25, "learning_rate": 0.0014673325010403661, "loss": 1.3529, "step": 128000 },
    { "epoch": 13.368705784436122, "grad_norm": 0.318359375, "learning_rate": 0.0014652517686225552, "loss": 1.3518, "step": 128500 },
    { "epoch": 13.420724094881399, "grad_norm": 0.228515625, "learning_rate": 0.0014631710362047442, "loss": 1.3534, "step": 129000 },
    { "epoch": 13.472742405326676, "grad_norm": 0.23828125, "learning_rate": 0.001461090303786933, "loss": 1.3513, "step": 129500 },
    { "epoch": 13.524760715771952, "grad_norm": 0.2421875, "learning_rate": 0.001459009571369122, "loss": 1.3516, "step": 130000 },
    { "epoch": 13.576779026217228, "grad_norm": 0.21484375, "learning_rate": 0.001456928838951311, "loss": 1.3521, "step": 130500 },
    { "epoch": 13.628797336662505, "grad_norm": 0.255859375, "learning_rate": 0.0014548481065334998, "loss": 1.353, "step": 131000 },
    { "epoch": 13.680815647107782, "grad_norm": 0.341796875, "learning_rate": 0.0014527673741156887, "loss": 1.3525, "step": 131500 },
    { "epoch": 13.732833957553058, "grad_norm": 0.287109375, "learning_rate": 0.0014506866416978776, "loss": 1.3501, "step": 132000 },
    { "epoch": 13.784852267998335, "grad_norm": 0.30859375, "learning_rate": 0.0014486059092800666, "loss": 1.3495, "step": 132500 },
    { "epoch": 13.836870578443612, "grad_norm": 0.28125, "learning_rate": 0.0014465251768622557, "loss": 1.3513, "step": 133000 },
    { "epoch": 13.88888888888889, "grad_norm": 0.353515625, "learning_rate": 0.0014444444444444444, "loss": 1.3516, "step": 133500 },
    { "epoch": 13.940907199334166, "grad_norm": 0.4921875, "learning_rate": 0.0014423637120266333, "loss": 1.3512, "step": 134000 },
    { "epoch": 13.992925509779443, "grad_norm": 0.287109375, "learning_rate": 0.0014402829796088224, "loss": 1.3493, "step": 134500 },
    { "epoch": 14.0, "eval_loss": 1.3465324640274048, "eval_runtime": 1.419, "eval_samples_per_second": 704.714, "eval_steps_per_second": 0.705, "step": 134568 },
    { "epoch": 14.044943820224718, "grad_norm": 0.400390625, "learning_rate": 0.0014382022471910111, "loss": 1.3478, "step": 135000 },
    { "epoch": 14.096962130669995, "grad_norm": 0.203125, "learning_rate": 0.0014361215147732003, "loss": 1.3485, "step": 135500 },
    { "epoch": 14.148980441115272, "grad_norm": 0.322265625, "learning_rate": 0.0014340407823553892, "loss": 1.3473, "step": 136000 },
    { "epoch": 14.20099875156055, "grad_norm": 0.578125, "learning_rate": 0.001431960049937578, "loss": 1.3469, "step": 136500 },
    { "epoch": 14.253017062005826, "grad_norm": 0.205078125, "learning_rate": 0.001429879317519767, "loss": 1.3488, "step": 137000 },
    { "epoch": 14.305035372451103, "grad_norm": 0.28125, "learning_rate": 0.001427798585101956, "loss": 1.3482, "step": 137500 },
    { "epoch": 14.35705368289638, "grad_norm": 0.85546875, "learning_rate": 0.0014257178526841448, "loss": 1.3485, "step": 138000 },
    { "epoch": 14.409071993341657, "grad_norm": 0.48828125, "learning_rate": 0.001423637120266334, "loss": 1.3479, "step": 138500 },
    { "epoch": 14.461090303786934, "grad_norm": 0.208984375, "learning_rate": 0.0014215563878485226, "loss": 1.3483, "step": 139000 },
    { "epoch": 14.513108614232209, "grad_norm": 0.4140625, "learning_rate": 0.0014194756554307116, "loss": 1.3471, "step": 139500 },
    { "epoch": 14.565126924677486, "grad_norm": 0.30078125, "learning_rate": 0.0014173949230129007, "loss": 1.3475, "step": 140000 },
    { "epoch": 14.617145235122763, "grad_norm": 0.4453125, "learning_rate": 0.0014153141905950894, "loss": 1.3467, "step": 140500 },
    { "epoch": 14.66916354556804, "grad_norm": 0.2255859375, "learning_rate": 0.0014132334581772783, "loss": 1.3462, "step": 141000 },
    { "epoch": 14.721181856013317, "grad_norm": 0.2451171875, "learning_rate": 0.0014111527257594674, "loss": 1.3456, "step": 141500 },
    { "epoch": 14.773200166458594, "grad_norm": 0.259765625, "learning_rate": 0.0014090719933416563, "loss": 1.3455, "step": 142000 },
    { "epoch": 14.82521847690387, "grad_norm": 0.2578125, "learning_rate": 0.0014069912609238453, "loss": 1.3447, "step": 142500 },
    { "epoch": 14.877236787349148, "grad_norm": 2.359375, "learning_rate": 0.0014049105285060342, "loss": 1.3454, "step": 143000 },
    { "epoch": 14.929255097794425, "grad_norm": 0.4296875, "learning_rate": 0.001402829796088223, "loss": 1.3461, "step": 143500 },
    { "epoch": 14.9812734082397, "grad_norm": 0.33203125, "learning_rate": 0.001400749063670412, "loss": 1.3454, "step": 144000 },
    { "epoch": 15.0, "eval_loss": 1.3407135009765625, "eval_runtime": 1.4239, "eval_samples_per_second": 702.319, "eval_steps_per_second": 0.702, "step": 144180 },
    { "epoch": 15.033291718684977, "grad_norm": 0.63671875, "learning_rate": 0.001398668331252601, "loss": 1.3444, "step": 144500 },
    { "epoch": 15.085310029130254, "grad_norm": 0.275390625, "learning_rate": 0.0013965875988347898, "loss": 1.3444, "step": 145000 },
    { "epoch": 15.13732833957553, "grad_norm": 0.56640625, "learning_rate": 0.001394506866416979, "loss": 1.3437, "step": 145500 },
    { "epoch": 15.189346650020807, "grad_norm": 0.56640625, "learning_rate": 0.0013924261339991677, "loss": 1.3473, "step": 146000 },
    { "epoch": 15.241364960466084, "grad_norm": 0.201171875, "learning_rate": 0.0013903454015813566, "loss": 1.3474, "step": 146500 },
    { "epoch": 15.293383270911361, "grad_norm": 0.369140625, "learning_rate": 0.0013882646691635457, "loss": 1.3472, "step": 147000 },
    { "epoch": 15.345401581356638, "grad_norm": 0.279296875, "learning_rate": 0.0013861839367457346, "loss": 1.3457, "step": 147500 },
    { "epoch": 15.397419891801913, "grad_norm": 0.314453125, "learning_rate": 0.0013841032043279233, "loss": 1.3438, "step": 148000 },
    { "epoch": 15.44943820224719, "grad_norm": 0.283203125, "learning_rate": 0.0013820224719101124, "loss": 1.3452, "step": 148500 },
    { "epoch": 15.501456512692467, "grad_norm": 0.236328125, "learning_rate": 0.0013799417394923014, "loss": 1.3461, "step": 149000 },
    { "epoch": 15.553474823137744, "grad_norm": 0.6640625, "learning_rate": 0.0013778610070744903, "loss": 1.344, "step": 149500 },
    { "epoch": 15.605493133583021, "grad_norm": 0.287109375, "learning_rate": 0.0013757802746566792, "loss": 1.3437, "step": 150000 },
    { "epoch": 15.657511444028298, "grad_norm": 0.255859375, "learning_rate": 0.001373699542238868, "loss": 1.3468, "step": 150500 },
    { "epoch": 15.709529754473575, "grad_norm": 0.337890625, "learning_rate": 0.0013716188098210572, "loss": 1.3439, "step": 151000 },
    { "epoch": 15.761548064918852, "grad_norm": 0.81640625, "learning_rate": 0.001369538077403246, "loss": 1.3435, "step": 151500 },
    { "epoch": 15.813566375364129, "grad_norm": 1.5078125, "learning_rate": 0.0013674573449854348, "loss": 1.3463, "step": 152000 },
    { "epoch": 15.865584685809406, "grad_norm": 0.2392578125, "learning_rate": 0.001365376612567624, "loss": 1.3467, "step": 152500 },
    { "epoch": 15.917602996254681, "grad_norm": 1.1015625, "learning_rate": 0.0013632958801498127, "loss": 1.3462, "step": 153000 },
    { "epoch": 15.969621306699958, "grad_norm": 0.2158203125, "learning_rate": 0.0013612151477320016, "loss": 1.3461, "step": 153500 },
    { "epoch": 16.0, "eval_loss": 1.3369859457015991, "eval_runtime": 1.4272, "eval_samples_per_second": 700.674, "eval_steps_per_second": 0.701, "step": 153792 },
    { "epoch": 16.021639617145237, "grad_norm": 0.2294921875, "learning_rate": 0.0013591344153141907, "loss": 1.3446, "step": 154000 },
    { "epoch": 16.073657927590514, "grad_norm": 0.2890625, "learning_rate": 0.0013570536828963796, "loss": 1.3422, "step": 154500 },
    { "epoch": 16.125676238035787, "grad_norm": 0.27734375, "learning_rate": 0.0013549729504785683, "loss": 1.3409, "step": 155000 },
    { "epoch": 16.177694548481064, "grad_norm": 0.6796875, "learning_rate": 0.0013528922180607575, "loss": 1.3428, "step": 155500 },
    { "epoch": 16.22971285892634, "grad_norm": 0.66796875, "learning_rate": 0.0013508114856429464, "loss": 1.3443, "step": 156000 },
    { "epoch": 16.281731169371618, "grad_norm": 0.376953125, "learning_rate": 0.001348730753225135, "loss": 1.3423, "step": 156500 },
    { "epoch": 16.333749479816895, "grad_norm": 0.486328125, "learning_rate": 0.0013466500208073242, "loss": 1.3408, "step": 157000 },
    { "epoch": 16.38576779026217, "grad_norm": 0.55859375, "learning_rate": 0.0013445692883895131, "loss": 1.3421, "step": 157500 },
    { "epoch": 16.43778610070745, "grad_norm": 0.443359375, "learning_rate": 0.0013424885559717022, "loss": 1.3424, "step": 158000 },
    { "epoch": 16.489804411152726, "grad_norm": 0.2734375, "learning_rate": 0.001340407823553891, "loss": 1.3412, "step": 158500 },
    { "epoch": 16.541822721598002, "grad_norm": 0.46875, "learning_rate": 0.0013383270911360799, "loss": 1.3419, "step": 159000 },
    { "epoch": 16.59384103204328, "grad_norm": 0.267578125, "learning_rate": 0.001336246358718269, "loss": 1.3415, "step": 159500 },
    { "epoch": 16.645859342488556, "grad_norm": 5.65625, "learning_rate": 0.001334165626300458, "loss": 1.3417, "step": 160000 },
    { "epoch": 16.697877652933833, "grad_norm": 0.181640625, "learning_rate": 0.0013320848938826466, "loss": 1.3405, "step": 160500 },
    { "epoch": 16.74989596337911, "grad_norm": 0.298828125, "learning_rate": 0.0013300041614648357, "loss": 1.339, "step": 161000 },
    { "epoch": 16.801914273824387, "grad_norm": 0.609375, "learning_rate": 0.0013279234290470246, "loss": 1.3415, "step": 161500 },
    { "epoch": 16.853932584269664, "grad_norm": 0.2392578125, "learning_rate": 0.0013258426966292133, "loss": 1.341, "step": 162000 },
    { "epoch": 16.90595089471494, "grad_norm": 0.330078125, "learning_rate": 0.0013237619642114025, "loss": 1.3424, "step": 162500 },
    { "epoch": 16.957969205160218, "grad_norm": 0.25390625, "learning_rate": 0.0013216812317935914, "loss": 1.3479, "step": 163000 },
    { "epoch": 17.0, "eval_loss": 1.339566707611084, "eval_runtime": 1.4145, "eval_samples_per_second": 706.982, "eval_steps_per_second": 0.707, "step": 163404 },
    { "epoch": 17.00998751560549, "grad_norm": 0.43359375, "learning_rate": 0.0013196004993757803, "loss": 1.3445, "step": 163500 },
    { "epoch": 17.06200582605077, "grad_norm": 0.314453125, "learning_rate": 0.0013175197669579692, "loss": 1.3435, "step": 164000 },
    { "epoch": 17.114024136496045, "grad_norm": 0.2314453125, "learning_rate": 0.0013154390345401581, "loss": 1.3448, "step": 164500 },
    { "epoch": 17.166042446941322, "grad_norm": 0.294921875, "learning_rate": 0.0013133583021223473, "loss": 1.3436, "step": 165000 },
    { "epoch": 17.2180607573866, "grad_norm": 0.275390625, "learning_rate": 0.001311277569704536, "loss": 1.3445, "step": 165500 },
    { "epoch": 17.270079067831876, "grad_norm": 1.7890625, "learning_rate": 0.0013091968372867249, "loss": 1.343, "step": 166000 },
    { "epoch": 17.322097378277153, "grad_norm": 0.37109375, "learning_rate": 0.001307116104868914, "loss": 1.3429, "step": 166500 },
    { "epoch": 17.37411568872243, "grad_norm": 0.240234375, "learning_rate": 0.001305035372451103, "loss": 1.342, "step": 167000 },
    { "epoch": 17.426133999167707, "grad_norm": 2.875, "learning_rate": 0.0013029546400332916, "loss": 1.3429, "step": 167500 },
    { "epoch": 17.478152309612984, "grad_norm": 0.310546875, "learning_rate": 0.0013008739076154807, "loss": 1.3424, "step": 168000 },
    { "epoch": 17.53017062005826, "grad_norm": 0.4453125, "learning_rate": 0.0012987931751976696, "loss": 1.3422, "step": 168500 },
    { "epoch": 17.582188930503538, "grad_norm": 0.33203125, "learning_rate": 0.0012967124427798583, "loss": 1.3419, "step": 169000 },
    { "epoch": 17.634207240948815, "grad_norm": 0.3515625, "learning_rate": 0.0012946317103620475, "loss": 1.3419, "step": 169500 },
    { "epoch": 17.68622555139409, "grad_norm": 0.21875, "learning_rate": 0.0012925509779442364, "loss": 1.3411, "step": 170000 },
    { "epoch": 17.73824386183937, "grad_norm": 0.240234375, "learning_rate": 0.0012904702455264253, "loss": 1.3409, "step": 170500 },
    { "epoch": 17.790262172284645, "grad_norm": 0.26953125, "learning_rate": 0.0012883895131086142, "loss": 1.3429, "step": 171000 },
    { "epoch": 17.842280482729922, "grad_norm": 0.28515625, "learning_rate": 0.0012863087806908031, "loss": 1.3426, "step": 171500 },
    { "epoch": 17.8942987931752, "grad_norm": 0.248046875, "learning_rate": 0.0012842280482729923, "loss": 1.342, "step": 172000 },
    { "epoch": 17.946317103620473, "grad_norm": 0.392578125, "learning_rate": 0.0012821473158551812, "loss": 1.3418, "step": 172500 },
    { "epoch": 17.99833541406575, "grad_norm": 0.30078125, "learning_rate": 0.0012800665834373699, "loss": 1.3429, "step": 173000 },
    { "epoch": 18.0, "eval_loss": 1.3371888399124146, "eval_runtime": 1.4206, "eval_samples_per_second": 703.942, "eval_steps_per_second": 0.704, "step": 173016 },
    { "epoch": 18.050353724511027, "grad_norm": 0.51953125, "learning_rate": 0.001277985851019559, "loss": 1.3399, "step": 173500 },
    { "epoch": 18.102372034956304, "grad_norm": 0.26953125, "learning_rate": 0.001275905118601748, "loss": 1.3414, "step": 174000 },
    { "epoch": 18.15439034540158, "grad_norm": 0.2431640625, "learning_rate": 0.0012738243861839366, "loss": 1.3425, "step": 174500 },
    { "epoch": 18.206408655846857, "grad_norm": 0.255859375, "learning_rate": 0.0012717436537661257, "loss": 1.339, "step": 175000 },
    { "epoch": 18.258426966292134, "grad_norm": 0.19921875, "learning_rate": 0.0012696629213483147, "loss": 1.3403, "step": 175500 },
    { "epoch": 18.31044527673741, "grad_norm": 0.328125, "learning_rate": 0.0012675821889305036, "loss": 1.3388, "step": 176000 },
    { "epoch": 18.36246358718269, "grad_norm": 0.220703125, "learning_rate": 0.0012655014565126925, "loss": 1.3385, "step": 176500 },
    { "epoch": 18.414481897627965, "grad_norm": 0.251953125, "learning_rate": 0.0012634207240948814, "loss": 1.3372, "step": 177000 },
    { "epoch": 18.466500208073242, "grad_norm": 0.1962890625, "learning_rate": 0.0012613399916770703, "loss": 1.3384, "step": 177500 },
    { "epoch": 18.51851851851852, "grad_norm": 0.2490234375, "learning_rate": 0.0012592592592592592, "loss": 1.3377, "step": 178000 },
    { "epoch": 18.570536828963796, "grad_norm": 0.2109375, "learning_rate": 0.0012571785268414481, "loss": 1.3372, "step": 178500 },
    { "epoch": 18.622555139409073, "grad_norm": 0.2490234375, "learning_rate": 0.0012550977944236373, "loss": 1.3368, "step": 179000 },
    { "epoch": 18.67457344985435, "grad_norm": 0.201171875, "learning_rate": 0.0012530170620058262, "loss": 1.3384, "step": 179500 },
    { "epoch": 18.726591760299627, "grad_norm": 0.2451171875, "learning_rate": 0.0012509363295880149, "loss": 1.3374, "step": 180000 },
    { "epoch": 18.778610070744904, "grad_norm": 0.431640625, "learning_rate": 0.001248855597170204, "loss": 1.3378, "step": 180500 },
    { "epoch": 18.83062838119018, "grad_norm": 0.4765625, "learning_rate": 0.001246774864752393, "loss": 1.3397, "step": 181000 },
    { "epoch": 18.882646691635454, "grad_norm": 0.2216796875, "learning_rate": 0.0012446941323345816, "loss": 1.3379, "step": 181500 },
    { "epoch": 18.93466500208073, "grad_norm": 0.55078125, "learning_rate": 0.0012426133999167708, "loss": 1.3377, "step": 182000 },
    { "epoch": 18.986683312526008, "grad_norm": 0.337890625, "learning_rate": 0.0012405326674989597, "loss": 1.3379, "step": 182500 },
    { "epoch": 19.0, "eval_loss": 1.3305245637893677, "eval_runtime": 1.4316, "eval_samples_per_second": 698.511, "eval_steps_per_second": 0.699, "step": 182628 }
  ],
  "logging_steps": 500,
  "max_steps": 480600,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 50,
  "save_steps": 500,
  "stateful_callbacks": {
    "EarlyStoppingCallback": {
      "args": {
        "early_stopping_patience": 3,
        "early_stopping_threshold": 0.0
      },
      "attributes": {
        "early_stopping_patience_counter": 0
      }
    },
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.3420668838410615e+19,
  "train_batch_size": 1024,
  "trial_name": null,
  "trial_params": null
}