{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 100,
  "global_step": 4656,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0006443298969072165,
      "grad_norm": 37.304439544677734,
      "learning_rate": 1.0000000000000002e-06,
      "loss": 3.2892,
      "step": 1
    },
    {
      "epoch": 0.01610824742268041,
      "grad_norm": 21.749683380126953,
      "learning_rate": 2.5e-05,
      "loss": 3.1951,
      "step": 25
    },
    {
      "epoch": 0.03221649484536082,
      "grad_norm": 17.803585052490234,
      "learning_rate": 5e-05,
      "loss": 3.3824,
      "step": 50
    },
    {
      "epoch": 0.04832474226804124,
      "grad_norm": 13.760115623474121,
      "learning_rate": 4.97286148501954e-05,
      "loss": 3.4591,
      "step": 75
    },
    {
      "epoch": 0.06443298969072164,
      "grad_norm": 17.308778762817383,
      "learning_rate": 4.945722970039079e-05,
      "loss": 3.5677,
      "step": 100
    },
    {
      "epoch": 0.08054123711340207,
      "grad_norm": 11.235258102416992,
      "learning_rate": 4.9185844550586194e-05,
      "loss": 3.5688,
      "step": 125
    },
    {
      "epoch": 0.09664948453608248,
      "grad_norm": 11.886427879333496,
      "learning_rate": 4.891445940078159e-05,
      "loss": 3.5572,
      "step": 150
    },
    {
      "epoch": 0.11275773195876289,
      "grad_norm": 10.693597793579102,
      "learning_rate": 4.864307425097699e-05,
      "loss": 3.5816,
      "step": 175
    },
    {
      "epoch": 0.12886597938144329,
      "grad_norm": 10.654956817626953,
      "learning_rate": 4.8371689101172386e-05,
      "loss": 3.4257,
      "step": 200
    },
    {
      "epoch": 0.14497422680412372,
      "grad_norm": 78.01908111572266,
      "learning_rate": 4.810030395136778e-05,
      "loss": 3.4916,
      "step": 225
    },
    {
      "epoch": 0.16108247422680413,
      "grad_norm": 14.090380668640137,
      "learning_rate": 4.782891880156318e-05,
      "loss": 3.4205,
      "step": 250
    },
    {
      "epoch": 0.17719072164948454,
      "grad_norm": 9.427169799804688,
      "learning_rate": 4.755753365175858e-05,
      "loss": 3.4519,
      "step": 275
    },
    {
      "epoch": 0.19329896907216496,
      "grad_norm": 57.52346420288086,
      "learning_rate": 4.728614850195397e-05,
      "loss": 3.5561,
      "step": 300
    },
    {
      "epoch": 0.20940721649484537,
      "grad_norm": 16.087291717529297,
      "learning_rate": 4.701476335214937e-05,
      "loss": 3.4669,
      "step": 325
    },
    {
      "epoch": 0.22551546391752578,
      "grad_norm": 11.201750755310059,
      "learning_rate": 4.674337820234477e-05,
      "loss": 3.4119,
      "step": 350
    },
    {
      "epoch": 0.2416237113402062,
      "grad_norm": 12.096583366394043,
      "learning_rate": 4.647199305254017e-05,
      "loss": 3.3902,
      "step": 375
    },
    {
      "epoch": 0.25773195876288657,
      "grad_norm": 12.698060989379883,
      "learning_rate": 4.620060790273557e-05,
      "loss": 3.4245,
      "step": 400
    },
    {
      "epoch": 0.27384020618556704,
      "grad_norm": 6.767716884613037,
      "learning_rate": 4.592922275293096e-05,
      "loss": 3.3878,
      "step": 425
    },
    {
      "epoch": 0.28994845360824745,
      "grad_norm": 8.80782413482666,
      "learning_rate": 4.565783760312636e-05,
      "loss": 3.36,
      "step": 450
    },
    {
      "epoch": 0.30605670103092786,
      "grad_norm": 8.567368507385254,
      "learning_rate": 4.538645245332175e-05,
      "loss": 3.46,
      "step": 475
    },
    {
      "epoch": 0.32216494845360827,
      "grad_norm": 7.334051132202148,
      "learning_rate": 4.5115067303517154e-05,
      "loss": 3.4295,
      "step": 500
    },
    {
      "epoch": 0.3382731958762887,
      "grad_norm": 35.50625991821289,
      "learning_rate": 4.484368215371255e-05,
      "loss": 3.458,
      "step": 525
    },
    {
      "epoch": 0.3543814432989691,
      "grad_norm": 8.980048179626465,
      "learning_rate": 4.457229700390795e-05,
      "loss": 3.3348,
      "step": 550
    },
    {
      "epoch": 0.3704896907216495,
      "grad_norm": 11.022858619689941,
      "learning_rate": 4.4300911854103346e-05,
      "loss": 3.4155,
      "step": 575
    },
    {
      "epoch": 0.3865979381443299,
      "grad_norm": 7.455577373504639,
      "learning_rate": 4.402952670429874e-05,
      "loss": 3.3107,
      "step": 600
    },
    {
      "epoch": 0.4027061855670103,
      "grad_norm": 6.974651336669922,
      "learning_rate": 4.375814155449414e-05,
      "loss": 3.371,
      "step": 625
    },
    {
      "epoch": 0.41881443298969073,
      "grad_norm": 6.6951680183410645,
      "learning_rate": 4.348675640468954e-05,
      "loss": 3.3197,
      "step": 650
    },
    {
      "epoch": 0.43492268041237114,
      "grad_norm": 7.696976661682129,
      "learning_rate": 4.321537125488493e-05,
      "loss": 3.3954,
      "step": 675
    },
    {
      "epoch": 0.45103092783505155,
      "grad_norm": 7.18176794052124,
      "learning_rate": 4.294398610508033e-05,
      "loss": 3.3443,
      "step": 700
    },
    {
      "epoch": 0.46713917525773196,
      "grad_norm": 6.54254150390625,
      "learning_rate": 4.267260095527572e-05,
      "loss": 3.235,
      "step": 725
    },
    {
      "epoch": 0.4832474226804124,
      "grad_norm": 6.953215599060059,
      "learning_rate": 4.2401215805471125e-05,
      "loss": 3.3103,
      "step": 750
    },
    {
      "epoch": 0.4993556701030928,
      "grad_norm": 5.925103187561035,
      "learning_rate": 4.212983065566653e-05,
      "loss": 3.3243,
      "step": 775
    },
    {
      "epoch": 0.5154639175257731,
      "grad_norm": 7.676642417907715,
      "learning_rate": 4.185844550586192e-05,
      "loss": 3.2962,
      "step": 800
    },
    {
      "epoch": 0.5315721649484536,
      "grad_norm": 5.8870038986206055,
      "learning_rate": 4.158706035605732e-05,
      "loss": 3.2808,
      "step": 825
    },
    {
      "epoch": 0.5476804123711341,
      "grad_norm": 6.311049938201904,
      "learning_rate": 4.131567520625272e-05,
      "loss": 3.2797,
      "step": 850
    },
    {
      "epoch": 0.5637886597938144,
      "grad_norm": 12.73991584777832,
      "learning_rate": 4.1044290056448114e-05,
      "loss": 3.2348,
      "step": 875
    },
    {
      "epoch": 0.5798969072164949,
      "grad_norm": 8.892550468444824,
      "learning_rate": 4.077290490664351e-05,
      "loss": 3.2308,
      "step": 900
    },
    {
      "epoch": 0.5960051546391752,
      "grad_norm": 5.8089704513549805,
      "learning_rate": 4.0501519756838904e-05,
      "loss": 3.2951,
      "step": 925
    },
    {
      "epoch": 0.6121134020618557,
      "grad_norm": 6.098554611206055,
      "learning_rate": 4.0230134607034306e-05,
      "loss": 3.1594,
      "step": 950
    },
    {
      "epoch": 0.6282216494845361,
      "grad_norm": 26.497699737548828,
      "learning_rate": 3.995874945722971e-05,
      "loss": 3.2403,
      "step": 975
    },
    {
      "epoch": 0.6443298969072165,
      "grad_norm": 7.092996597290039,
      "learning_rate": 3.96873643074251e-05,
      "loss": 3.1879,
      "step": 1000
    },
    {
      "epoch": 0.6604381443298969,
      "grad_norm": 5.452559947967529,
      "learning_rate": 3.94159791576205e-05,
      "loss": 3.2312,
      "step": 1025
    },
    {
      "epoch": 0.6765463917525774,
      "grad_norm": 11.570932388305664,
      "learning_rate": 3.914459400781589e-05,
      "loss": 3.1979,
      "step": 1050
    },
    {
      "epoch": 0.6926546391752577,
      "grad_norm": 10.654516220092773,
      "learning_rate": 3.887320885801129e-05,
      "loss": 3.1124,
      "step": 1075
    },
    {
      "epoch": 0.7087628865979382,
      "grad_norm": 5.750201225280762,
      "learning_rate": 3.860182370820669e-05,
      "loss": 3.1902,
      "step": 1100
    },
    {
      "epoch": 0.7248711340206185,
      "grad_norm": 6.332087993621826,
      "learning_rate": 3.8330438558402085e-05,
      "loss": 3.1835,
      "step": 1125
    },
    {
      "epoch": 0.740979381443299,
      "grad_norm": 6.187074661254883,
      "learning_rate": 3.805905340859749e-05,
      "loss": 3.2402,
      "step": 1150
    },
    {
      "epoch": 0.7570876288659794,
      "grad_norm": 5.326789379119873,
      "learning_rate": 3.778766825879288e-05,
      "loss": 3.1311,
      "step": 1175
    },
    {
      "epoch": 0.7731958762886598,
      "grad_norm": 5.872878551483154,
      "learning_rate": 3.751628310898828e-05,
      "loss": 3.168,
      "step": 1200
    },
    {
      "epoch": 0.7893041237113402,
      "grad_norm": 5.528806209564209,
      "learning_rate": 3.724489795918368e-05,
      "loss": 3.2025,
      "step": 1225
    },
    {
      "epoch": 0.8054123711340206,
      "grad_norm": 4.855608940124512,
      "learning_rate": 3.6973512809379074e-05,
      "loss": 3.1711,
      "step": 1250
    },
    {
      "epoch": 0.821520618556701,
      "grad_norm": 5.053402423858643,
      "learning_rate": 3.670212765957447e-05,
      "loss": 3.0436,
      "step": 1275
    },
    {
      "epoch": 0.8376288659793815,
      "grad_norm": 6.834145545959473,
      "learning_rate": 3.6430742509769864e-05,
      "loss": 3.0668,
      "step": 1300
    },
    {
      "epoch": 0.8537371134020618,
      "grad_norm": 5.844705104827881,
      "learning_rate": 3.615935735996526e-05,
      "loss": 3.129,
      "step": 1325
    },
    {
      "epoch": 0.8698453608247423,
      "grad_norm": 5.622738361358643,
      "learning_rate": 3.588797221016066e-05,
      "loss": 3.123,
      "step": 1350
    },
    {
      "epoch": 0.8859536082474226,
      "grad_norm": 5.435595512390137,
      "learning_rate": 3.561658706035606e-05,
      "loss": 3.1695,
      "step": 1375
    },
    {
      "epoch": 0.9020618556701031,
      "grad_norm": 5.923786640167236,
      "learning_rate": 3.534520191055146e-05,
      "loss": 3.1486,
      "step": 1400
    },
    {
      "epoch": 0.9181701030927835,
      "grad_norm": 5.717883586883545,
      "learning_rate": 3.507381676074685e-05,
      "loss": 3.119,
      "step": 1425
    },
    {
      "epoch": 0.9342783505154639,
      "grad_norm": 5.194445610046387,
      "learning_rate": 3.480243161094225e-05,
      "loss": 3.0391,
      "step": 1450
    },
    {
      "epoch": 0.9503865979381443,
      "grad_norm": 4.672726154327393,
      "learning_rate": 3.453104646113765e-05,
      "loss": 3.0887,
      "step": 1475
    },
    {
      "epoch": 0.9664948453608248,
      "grad_norm": 5.593866348266602,
      "learning_rate": 3.4259661311333045e-05,
      "loss": 3.1433,
      "step": 1500
    },
    {
      "epoch": 0.9826030927835051,
      "grad_norm": 6.05122709274292,
      "learning_rate": 3.398827616152844e-05,
      "loss": 3.094,
      "step": 1525
    },
    {
      "epoch": 0.9987113402061856,
      "grad_norm": 5.456536769866943,
      "learning_rate": 3.371689101172384e-05,
      "loss": 3.0894,
      "step": 1550
    },
    {
      "epoch": 1.014819587628866,
      "grad_norm": 6.8430867195129395,
      "learning_rate": 3.344550586191924e-05,
      "loss": 2.5031,
      "step": 1575
    },
    {
      "epoch": 1.0309278350515463,
      "grad_norm": 6.864569664001465,
      "learning_rate": 3.317412071211464e-05,
      "loss": 2.3879,
      "step": 1600
    },
    {
      "epoch": 1.0470360824742269,
      "grad_norm": 7.638180732727051,
      "learning_rate": 3.2902735562310034e-05,
      "loss": 2.3948,
      "step": 1625
    },
    {
      "epoch": 1.0631443298969072,
      "grad_norm": 5.917698860168457,
      "learning_rate": 3.263135041250543e-05,
      "loss": 2.3638,
      "step": 1650
    },
    {
      "epoch": 1.0792525773195876,
      "grad_norm": 6.708238124847412,
      "learning_rate": 3.2359965262700824e-05,
      "loss": 2.3536,
      "step": 1675
    },
    {
      "epoch": 1.0953608247422681,
      "grad_norm": 9.36337947845459,
      "learning_rate": 3.208858011289622e-05,
      "loss": 2.4048,
      "step": 1700
    },
    {
      "epoch": 1.1114690721649485,
      "grad_norm": 7.072855472564697,
      "learning_rate": 3.181719496309162e-05,
      "loss": 2.3709,
      "step": 1725
    },
    {
      "epoch": 1.1275773195876289,
      "grad_norm": 6.986050128936768,
      "learning_rate": 3.154580981328702e-05,
      "loss": 2.46,
      "step": 1750
    },
    {
      "epoch": 1.1436855670103092,
      "grad_norm": 6.583354949951172,
      "learning_rate": 3.127442466348242e-05,
      "loss": 2.3884,
      "step": 1775
    },
    {
      "epoch": 1.1597938144329896,
      "grad_norm": 6.607515811920166,
      "learning_rate": 3.100303951367781e-05,
      "loss": 2.3733,
      "step": 1800
    },
    {
      "epoch": 1.1759020618556701,
      "grad_norm": 7.239434719085693,
      "learning_rate": 3.073165436387321e-05,
      "loss": 2.4139,
      "step": 1825
    },
    {
      "epoch": 1.1920103092783505,
      "grad_norm": 7.7802042961120605,
      "learning_rate": 3.046026921406861e-05,
      "loss": 2.3074,
      "step": 1850
    },
    {
      "epoch": 1.2081185567010309,
      "grad_norm": 5.834593772888184,
      "learning_rate": 3.0188884064264005e-05,
      "loss": 2.3383,
      "step": 1875
    },
    {
      "epoch": 1.2242268041237114,
      "grad_norm": 6.189608097076416,
      "learning_rate": 2.9917498914459403e-05,
      "loss": 2.2833,
      "step": 1900
    },
    {
      "epoch": 1.2403350515463918,
      "grad_norm": 6.848288536071777,
      "learning_rate": 2.9646113764654798e-05,
      "loss": 2.3957,
      "step": 1925
    },
    {
      "epoch": 1.2564432989690721,
      "grad_norm": 6.78605842590332,
      "learning_rate": 2.9374728614850193e-05,
      "loss": 2.4055,
      "step": 1950
    },
    {
      "epoch": 1.2725515463917525,
      "grad_norm": 7.676894664764404,
      "learning_rate": 2.9103343465045595e-05,
      "loss": 2.3365,
      "step": 1975
    },
    {
      "epoch": 1.2886597938144329,
      "grad_norm": 6.011926651000977,
      "learning_rate": 2.8831958315240993e-05,
      "loss": 2.3317,
      "step": 2000
    },
    {
      "epoch": 1.3047680412371134,
      "grad_norm": 6.217193126678467,
      "learning_rate": 2.856057316543639e-05,
      "loss": 2.3736,
      "step": 2025
    },
    {
      "epoch": 1.3208762886597938,
      "grad_norm": 7.027468681335449,
      "learning_rate": 2.8289188015631784e-05,
      "loss": 2.3509,
      "step": 2050
    },
    {
      "epoch": 1.3369845360824741,
      "grad_norm": 7.210168838500977,
      "learning_rate": 2.8017802865827182e-05,
      "loss": 2.449,
      "step": 2075
    },
    {
      "epoch": 1.3530927835051547,
      "grad_norm": 7.149182319641113,
      "learning_rate": 2.7746417716022584e-05,
      "loss": 2.3631,
      "step": 2100
    },
    {
      "epoch": 1.369201030927835,
      "grad_norm": 6.41991662979126,
      "learning_rate": 2.747503256621798e-05,
      "loss": 2.4368,
      "step": 2125
    },
    {
      "epoch": 1.3853092783505154,
      "grad_norm": 6.897440433502197,
      "learning_rate": 2.7203647416413374e-05,
      "loss": 2.3772,
      "step": 2150
    },
    {
      "epoch": 1.401417525773196,
      "grad_norm": 6.562511444091797,
      "learning_rate": 2.6932262266608772e-05,
      "loss": 2.346,
      "step": 2175
    },
    {
      "epoch": 1.4175257731958764,
      "grad_norm": 6.86238431930542,
      "learning_rate": 2.6660877116804168e-05,
      "loss": 2.3861,
      "step": 2200
    },
    {
      "epoch": 1.4336340206185567,
      "grad_norm": 7.627070426940918,
      "learning_rate": 2.638949196699957e-05,
      "loss": 2.3415,
      "step": 2225
    },
    {
      "epoch": 1.449742268041237,
      "grad_norm": 6.463057518005371,
      "learning_rate": 2.6118106817194964e-05,
      "loss": 2.3558,
      "step": 2250
    },
    {
      "epoch": 1.4658505154639174,
      "grad_norm": 6.722979545593262,
      "learning_rate": 2.584672166739036e-05,
      "loss": 2.3812,
      "step": 2275
    },
    {
      "epoch": 1.481958762886598,
      "grad_norm": 7.5143585205078125,
      "learning_rate": 2.5575336517585758e-05,
      "loss": 2.3577,
      "step": 2300
    },
    {
      "epoch": 1.4980670103092784,
      "grad_norm": 6.2719197273254395,
      "learning_rate": 2.5303951367781153e-05,
      "loss": 2.3249,
      "step": 2325
    },
    {
      "epoch": 1.5141752577319587,
      "grad_norm": 6.567588806152344,
      "learning_rate": 2.5032566217976555e-05,
      "loss": 2.3663,
      "step": 2350
    },
    {
      "epoch": 1.5302835051546393,
      "grad_norm": 6.04072380065918,
      "learning_rate": 2.476118106817195e-05,
      "loss": 2.3098,
      "step": 2375
    },
    {
      "epoch": 1.5463917525773194,
      "grad_norm": 6.608715057373047,
      "learning_rate": 2.448979591836735e-05,
      "loss": 2.3335,
      "step": 2400
    },
    {
      "epoch": 1.5625,
      "grad_norm": 6.724149227142334,
      "learning_rate": 2.4218410768562747e-05,
      "loss": 2.3538,
      "step": 2425
    },
    {
      "epoch": 1.5786082474226806,
      "grad_norm": 7.360804080963135,
      "learning_rate": 2.3947025618758142e-05,
      "loss": 2.3797,
      "step": 2450
    },
    {
      "epoch": 1.5947164948453607,
      "grad_norm": 7.265044689178467,
      "learning_rate": 2.367564046895354e-05,
      "loss": 2.3377,
      "step": 2475
    },
    {
      "epoch": 1.6108247422680413,
      "grad_norm": 7.212481498718262,
      "learning_rate": 2.340425531914894e-05,
      "loss": 2.3111,
      "step": 2500
    },
    {
      "epoch": 1.6269329896907216,
      "grad_norm": 6.6800456047058105,
      "learning_rate": 2.3132870169344334e-05,
      "loss": 2.3555,
      "step": 2525
    },
    {
      "epoch": 1.643041237113402,
      "grad_norm": 6.473804950714111,
      "learning_rate": 2.2861485019539732e-05,
      "loss": 2.2877,
      "step": 2550
    },
    {
      "epoch": 1.6591494845360826,
      "grad_norm": 13.455022811889648,
      "learning_rate": 2.2590099869735127e-05,
      "loss": 2.2963,
      "step": 2575
    },
    {
      "epoch": 1.675257731958763,
      "grad_norm": 6.606278419494629,
      "learning_rate": 2.2318714719930526e-05,
      "loss": 2.3671,
      "step": 2600
    },
    {
      "epoch": 1.6913659793814433,
      "grad_norm": 6.745218276977539,
      "learning_rate": 2.2047329570125924e-05,
      "loss": 2.3202,
      "step": 2625
    },
    {
      "epoch": 1.7074742268041239,
      "grad_norm": 7.282406330108643,
      "learning_rate": 2.177594442032132e-05,
      "loss": 2.3242,
      "step": 2650
    },
    {
      "epoch": 1.723582474226804,
      "grad_norm": 7.313311576843262,
      "learning_rate": 2.1504559270516718e-05,
      "loss": 2.3228,
      "step": 2675
    },
    {
      "epoch": 1.7396907216494846,
      "grad_norm": 7.339620590209961,
      "learning_rate": 2.1233174120712116e-05,
      "loss": 2.3336,
      "step": 2700
    },
    {
      "epoch": 1.755798969072165,
      "grad_norm": 6.999018669128418,
      "learning_rate": 2.096178897090751e-05,
      "loss": 2.2578,
      "step": 2725
    },
    {
      "epoch": 1.7719072164948453,
      "grad_norm": 6.459262371063232,
      "learning_rate": 2.069040382110291e-05,
      "loss": 2.2741,
      "step": 2750
    },
    {
      "epoch": 1.7880154639175259,
      "grad_norm": 7.308042049407959,
      "learning_rate": 2.0419018671298308e-05,
      "loss": 2.2925,
      "step": 2775
    },
    {
      "epoch": 1.8041237113402062,
      "grad_norm": 6.555530071258545,
      "learning_rate": 2.0147633521493707e-05,
      "loss": 2.3082,
      "step": 2800
    },
    {
      "epoch": 1.8202319587628866,
      "grad_norm": 6.764036655426025,
      "learning_rate": 1.9876248371689102e-05,
      "loss": 2.2095,
      "step": 2825
    },
    {
      "epoch": 1.8363402061855671,
      "grad_norm": 7.759133815765381,
      "learning_rate": 1.96048632218845e-05,
      "loss": 2.3166,
      "step": 2850
    },
    {
      "epoch": 1.8524484536082473,
      "grad_norm": 6.442126274108887,
      "learning_rate": 1.9333478072079895e-05,
      "loss": 2.3075,
      "step": 2875
    },
    {
      "epoch": 1.8685567010309279,
      "grad_norm": 6.804947376251221,
      "learning_rate": 1.9062092922275294e-05,
      "loss": 2.2889,
      "step": 2900
    },
    {
      "epoch": 1.8846649484536082,
      "grad_norm": 6.473705291748047,
      "learning_rate": 1.8790707772470692e-05,
      "loss": 2.2423,
      "step": 2925
    },
    {
      "epoch": 1.9007731958762886,
      "grad_norm": 6.420748710632324,
      "learning_rate": 1.8519322622666087e-05,
      "loss": 2.2308,
      "step": 2950
    },
    {
      "epoch": 1.9168814432989691,
      "grad_norm": 7.469099044799805,
      "learning_rate": 1.8247937472861486e-05,
      "loss": 2.3467,
      "step": 2975
    },
    {
      "epoch": 1.9329896907216495,
      "grad_norm": 7.019501686096191,
      "learning_rate": 1.7976552323056884e-05,
      "loss": 2.3117,
      "step": 3000
    },
    {
      "epoch": 1.9490979381443299,
      "grad_norm": 7.53558874130249,
      "learning_rate": 1.770516717325228e-05,
      "loss": 2.2763,
      "step": 3025
    },
    {
      "epoch": 1.9652061855670104,
      "grad_norm": 6.622589588165283,
      "learning_rate": 1.7433782023447678e-05,
      "loss": 2.2725,
      "step": 3050
    },
    {
      "epoch": 1.9813144329896906,
      "grad_norm": 6.681495189666748,
      "learning_rate": 1.7162396873643076e-05,
      "loss": 2.1871,
      "step": 3075
    },
    {
      "epoch": 1.9974226804123711,
      "grad_norm": 5.900623321533203,
      "learning_rate": 1.6891011723838475e-05,
      "loss": 2.2892,
      "step": 3100
    },
    {
      "epoch": 2.0135309278350517,
      "grad_norm": 9.32268238067627,
      "learning_rate": 1.661962657403387e-05,
      "loss": 1.3008,
      "step": 3125
    },
    {
      "epoch": 2.029639175257732,
      "grad_norm": 7.467723369598389,
      "learning_rate": 1.6348241424229265e-05,
      "loss": 1.0731,
      "step": 3150
    },
    {
      "epoch": 2.0457474226804124,
      "grad_norm": 8.434012413024902,
      "learning_rate": 1.6076856274424663e-05,
      "loss": 1.0061,
      "step": 3175
    },
    {
      "epoch": 2.0618556701030926,
      "grad_norm": 9.433366775512695,
      "learning_rate": 1.580547112462006e-05,
      "loss": 1.0132,
      "step": 3200
    },
    {
      "epoch": 2.077963917525773,
      "grad_norm": 7.6198039054870605,
      "learning_rate": 1.553408597481546e-05,
      "loss": 0.9944,
      "step": 3225
    },
    {
      "epoch": 2.0940721649484537,
      "grad_norm": 8.139434814453125,
      "learning_rate": 1.5262700825010855e-05,
      "loss": 0.9757,
      "step": 3250
    },
    {
      "epoch": 2.110180412371134,
      "grad_norm": 8.175223350524902,
      "learning_rate": 1.4991315675206252e-05,
      "loss": 0.978,
      "step": 3275
    },
    {
      "epoch": 2.1262886597938144,
      "grad_norm": 8.026739120483398,
      "learning_rate": 1.4719930525401652e-05,
      "loss": 0.9758,
      "step": 3300
    },
    {
      "epoch": 2.142396907216495,
      "grad_norm": 8.502424240112305,
      "learning_rate": 1.4448545375597047e-05,
      "loss": 0.9047,
      "step": 3325
    },
    {
      "epoch": 2.158505154639175,
      "grad_norm": 9.062753677368164,
      "learning_rate": 1.4177160225792445e-05,
      "loss": 0.9462,
      "step": 3350
    },
    {
      "epoch": 2.1746134020618557,
      "grad_norm": 9.223316192626953,
      "learning_rate": 1.3905775075987842e-05,
      "loss": 0.9564,
      "step": 3375
    },
    {
      "epoch": 2.1907216494845363,
      "grad_norm": 8.59533977508545,
      "learning_rate": 1.3634389926183239e-05,
      "loss": 0.9476,
      "step": 3400
    },
    {
      "epoch": 2.2068298969072164,
      "grad_norm": 8.367724418640137,
      "learning_rate": 1.3363004776378637e-05,
      "loss": 0.9684,
      "step": 3425
    },
    {
      "epoch": 2.222938144329897,
      "grad_norm": 9.15878963470459,
      "learning_rate": 1.3091619626574034e-05,
      "loss": 0.9847,
      "step": 3450
    },
    {
      "epoch": 2.239046391752577,
      "grad_norm": 10.106039047241211,
      "learning_rate": 1.2820234476769433e-05,
      "loss": 0.9148,
      "step": 3475
    },
    {
      "epoch": 2.2551546391752577,
      "grad_norm": 8.91595458984375,
      "learning_rate": 1.254884932696483e-05,
      "loss": 0.9293,
      "step": 3500
    },
    {
      "epoch": 2.2712628865979383,
      "grad_norm": 9.854774475097656,
      "learning_rate": 1.2277464177160226e-05,
      "loss": 0.9469,
      "step": 3525
    },
    {
      "epoch": 2.2873711340206184,
      "grad_norm": 8.479780197143555,
      "learning_rate": 1.2006079027355625e-05,
      "loss": 0.925,
      "step": 3550
    },
    {
      "epoch": 2.303479381443299,
      "grad_norm": 8.944768905639648,
      "learning_rate": 1.1734693877551021e-05,
      "loss": 0.9593,
      "step": 3575
    },
    {
      "epoch": 2.319587628865979,
      "grad_norm": 8.820865631103516,
      "learning_rate": 1.1463308727746418e-05,
      "loss": 0.9583,
      "step": 3600
    },
    {
      "epoch": 2.3356958762886597,
      "grad_norm": 9.563779830932617,
      "learning_rate": 1.1191923577941815e-05,
      "loss": 0.9298,
      "step": 3625
    },
    {
      "epoch": 2.3518041237113403,
      "grad_norm": 8.982272148132324,
      "learning_rate": 1.0920538428137213e-05,
      "loss": 0.9376,
      "step": 3650
    },
    {
      "epoch": 2.367912371134021,
      "grad_norm": 9.715324401855469,
      "learning_rate": 1.064915327833261e-05,
      "loss": 0.9428,
      "step": 3675
    },
    {
      "epoch": 2.384020618556701,
      "grad_norm": 8.481626510620117,
      "learning_rate": 1.0377768128528009e-05,
      "loss": 0.9254,
      "step": 3700
    },
    {
      "epoch": 2.4001288659793816,
      "grad_norm": 9.785958290100098,
      "learning_rate": 1.0106382978723404e-05,
      "loss": 0.9266,
      "step": 3725
    },
    {
      "epoch": 2.4162371134020617,
      "grad_norm": 10.050392150878906,
      "learning_rate": 9.834997828918802e-06,
      "loss": 0.9335,
      "step": 3750
    },
    {
      "epoch": 2.4323453608247423,
      "grad_norm": 8.707124710083008,
      "learning_rate": 9.563612679114199e-06,
      "loss": 0.872,
      "step": 3775
    },
    {
      "epoch": 2.448453608247423,
      "grad_norm": 9.095705032348633,
      "learning_rate": 9.292227529309597e-06,
      "loss": 0.8928,
      "step": 3800
    },
    {
      "epoch": 2.464561855670103,
      "grad_norm": 9.691436767578125,
      "learning_rate": 9.020842379504994e-06,
      "loss": 0.8993,
      "step": 3825
    },
    {
      "epoch": 2.4806701030927836,
      "grad_norm": 17.811647415161133,
      "learning_rate": 8.749457229700392e-06,
      "loss": 0.8943,
      "step": 3850
    },
    {
      "epoch": 2.4967783505154637,
      "grad_norm": 9.972207069396973,
      "learning_rate": 8.478072079895788e-06,
      "loss": 0.9248,
      "step": 3875
    },
    {
      "epoch": 2.5128865979381443,
      "grad_norm": 9.202563285827637,
      "learning_rate": 8.206686930091186e-06,
      "loss": 0.9077,
      "step": 3900
    },
    {
      "epoch": 2.528994845360825,
      "grad_norm": 9.509817123413086,
      "learning_rate": 7.935301780286583e-06,
      "loss": 0.9037,
      "step": 3925
    },
    {
      "epoch": 2.545103092783505,
      "grad_norm": 8.833476066589355,
      "learning_rate": 7.663916630481981e-06,
      "loss": 0.8766,
      "step": 3950
    },
    {
      "epoch": 2.5612113402061856,
      "grad_norm": 10.363802909851074,
      "learning_rate": 7.392531480677378e-06,
      "loss": 0.895,
      "step": 3975
    },
    {
      "epoch": 2.5773195876288657,
      "grad_norm": 9.111068725585938,
      "learning_rate": 7.121146330872775e-06,
      "loss": 0.9224,
      "step": 4000
    },
    {
      "epoch": 2.5934278350515463,
      "grad_norm": 10.667325019836426,
      "learning_rate": 6.849761181068172e-06,
      "loss": 0.8776,
      "step": 4025
    },
    {
      "epoch": 2.609536082474227,
      "grad_norm": 11.279472351074219,
      "learning_rate": 6.578376031263569e-06,
      "loss": 0.8723,
      "step": 4050
    },
    {
      "epoch": 2.6256443298969074,
      "grad_norm": 15.722869873046875,
      "learning_rate": 6.306990881458967e-06,
      "loss": 0.8858,
      "step": 4075
    },
    {
      "epoch": 2.6417525773195876,
      "grad_norm": 10.252237319946289,
      "learning_rate": 6.035605731654364e-06,
      "loss": 0.8639,
      "step": 4100
    },
    {
      "epoch": 2.657860824742268,
      "grad_norm": 9.055089950561523,
      "learning_rate": 5.764220581849761e-06,
      "loss": 0.8794,
      "step": 4125
    },
    {
      "epoch": 2.6739690721649483,
      "grad_norm": 9.109421730041504,
      "learning_rate": 5.492835432045159e-06,
      "loss": 0.8667,
      "step": 4150
    },
    {
      "epoch": 2.690077319587629,
      "grad_norm": 9.12623119354248,
      "learning_rate": 5.221450282240556e-06,
      "loss": 0.8626,
      "step": 4175
    },
    {
      "epoch": 2.7061855670103094,
      "grad_norm": 9.60417366027832,
      "learning_rate": 4.950065132435953e-06,
      "loss": 0.9106,
      "step": 4200
    },
    {
      "epoch": 2.7222938144329896,
      "grad_norm": 9.32435417175293,
      "learning_rate": 4.678679982631351e-06,
      "loss": 0.8714,
      "step": 4225
    },
    {
      "epoch": 2.73840206185567,
      "grad_norm": 9.819196701049805,
      "learning_rate": 4.407294832826748e-06,
      "loss": 0.8621,
      "step": 4250
    },
    {
      "epoch": 2.7545103092783503,
      "grad_norm": 8.934945106506348,
      "learning_rate": 4.135909683022145e-06,
      "loss": 0.8644,
      "step": 4275
    },
    {
      "epoch": 2.770618556701031,
      "grad_norm": 10.425902366638184,
      "learning_rate": 3.864524533217543e-06,
      "loss": 0.8937,
      "step": 4300
    },
    {
      "epoch": 2.7867268041237114,
      "grad_norm": 9.629773139953613,
      "learning_rate": 3.5931393834129398e-06,
      "loss": 0.8774,
      "step": 4325
    },
    {
      "epoch": 2.802835051546392,
      "grad_norm": 9.796236038208008,
      "learning_rate": 3.3217542336083374e-06,
      "loss": 0.8589,
      "step": 4350
    },
    {
      "epoch": 2.818943298969072,
      "grad_norm": 9.853483200073242,
      "learning_rate": 3.050369083803734e-06,
      "loss": 0.8394,
      "step": 4375
    },
    {
      "epoch": 2.8350515463917527,
      "grad_norm": 9.696287155151367,
      "learning_rate": 2.7789839339991317e-06,
      "loss": 0.8523,
      "step": 4400
    },
    {
      "epoch": 2.851159793814433,
      "grad_norm": 9.468950271606445,
      "learning_rate": 2.507598784194529e-06,
      "loss": 0.8444,
      "step": 4425
    },
    {
      "epoch": 2.8672680412371134,
      "grad_norm": 9.996761322021484,
      "learning_rate": 2.236213634389926e-06,
      "loss": 0.8517,
      "step": 4450
    },
    {
      "epoch": 2.883376288659794,
      "grad_norm": 12.354564666748047,
      "learning_rate": 1.9648284845853233e-06,
      "loss": 0.8523,
      "step": 4475
    },
    {
      "epoch": 2.899484536082474,
      "grad_norm": 11.12836742401123,
      "learning_rate": 1.6934433347807209e-06,
      "loss": 0.8458,
      "step": 4500
    },
    {
      "epoch": 2.9155927835051547,
      "grad_norm": 9.318047523498535,
      "learning_rate": 1.4220581849761183e-06,
      "loss": 0.8548,
      "step": 4525
    },
    {
      "epoch": 2.931701030927835,
      "grad_norm": 9.987869262695312,
      "learning_rate": 1.1506730351715155e-06,
      "loss": 0.8567,
      "step": 4550
    },
    {
      "epoch": 2.9478092783505154,
      "grad_norm": 10.364538192749023,
      "learning_rate": 8.792878853669127e-07,
      "loss": 0.8547,
      "step": 4575
    },
    {
      "epoch": 2.963917525773196,
      "grad_norm": 10.010146141052246,
      "learning_rate": 6.0790273556231e-07,
      "loss": 0.8464,
      "step": 4600
    },
    {
      "epoch": 2.980025773195876,
      "grad_norm": 8.987505912780762,
      "learning_rate": 3.3651758575770737e-07,
      "loss": 0.8361,
      "step": 4625
    },
    {
      "epoch": 2.9961340206185567,
      "grad_norm": 9.563461303710938,
      "learning_rate": 6.513243595310464e-08,
      "loss": 0.8486,
      "step": 4650
    }
  ],
  "logging_steps": 25,
  "max_steps": 4656,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 1000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 3.3657646372356096e+16,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}