{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 1914,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.001567398119122257,
      "grad_norm": 0.5552594661712646,
      "learning_rate": 3.448275862068966e-06,
      "loss": 3.3921,
      "step": 1
    },
    {
      "epoch": 0.01567398119122257,
      "grad_norm": 0.660531222820282,
      "learning_rate": 3.4482758620689657e-05,
      "loss": 3.5922,
      "step": 10
    },
    {
      "epoch": 0.03134796238244514,
      "grad_norm": 0.5598097443580627,
      "learning_rate": 6.896551724137931e-05,
      "loss": 3.619,
      "step": 20
    },
    {
      "epoch": 0.047021943573667714,
      "grad_norm": 0.5996779799461365,
      "learning_rate": 0.00010344827586206898,
      "loss": 3.4824,
      "step": 30
    },
    {
      "epoch": 0.06269592476489028,
      "grad_norm": 0.8172075152397156,
      "learning_rate": 0.00013793103448275863,
      "loss": 3.3997,
      "step": 40
    },
    {
      "epoch": 0.07836990595611286,
      "grad_norm": 1.6439019441604614,
      "learning_rate": 0.00017241379310344826,
      "loss": 3.2485,
      "step": 50
    },
    {
      "epoch": 0.09404388714733543,
      "grad_norm": 0.28180772066116333,
      "learning_rate": 0.00019999942697524717,
      "loss": 2.9975,
      "step": 60
    },
    {
      "epoch": 0.109717868338558,
      "grad_norm": 0.3870689868927002,
      "learning_rate": 0.00019997937179843937,
      "loss": 3.0446,
      "step": 70
    },
    {
      "epoch": 0.12539184952978055,
      "grad_norm": 0.575548529624939,
      "learning_rate": 0.00019993067195079803,
      "loss": 3.0178,
      "step": 80
    },
    {
      "epoch": 0.14106583072100312,
      "grad_norm": 0.781446635723114,
      "learning_rate": 0.00019985334138511237,
      "loss": 3.0394,
      "step": 90
    },
    {
      "epoch": 0.15673981191222572,
      "grad_norm": 1.725696325302124,
      "learning_rate": 0.00019974740225703878,
      "loss": 3.0751,
      "step": 100
    },
    {
      "epoch": 0.1724137931034483,
      "grad_norm": 0.2845414876937866,
      "learning_rate": 0.00019961288491875278,
      "loss": 2.9291,
      "step": 110
    },
    {
      "epoch": 0.18808777429467086,
      "grad_norm": 0.36810651421546936,
      "learning_rate": 0.00019944982791025333,
      "loss": 2.9491,
      "step": 120
    },
    {
      "epoch": 0.20376175548589343,
      "grad_norm": 0.5454439520835876,
      "learning_rate": 0.00019925827794832056,
      "loss": 3.0337,
      "step": 130
    },
    {
      "epoch": 0.219435736677116,
      "grad_norm": 0.6503669619560242,
      "learning_rate": 0.00019903828991313138,
      "loss": 3.0246,
      "step": 140
    },
    {
      "epoch": 0.23510971786833856,
      "grad_norm": 1.4392451047897339,
      "learning_rate": 0.00019878992683253582,
      "loss": 3.0232,
      "step": 150
    },
    {
      "epoch": 0.2507836990595611,
      "grad_norm": 0.2859440743923187,
      "learning_rate": 0.00019851325986399934,
      "loss": 2.8955,
      "step": 160
    },
    {
      "epoch": 0.2664576802507837,
      "grad_norm": 0.44268104434013367,
      "learning_rate": 0.0001982083682742156,
      "loss": 2.9338,
      "step": 170
    },
    {
      "epoch": 0.28213166144200624,
      "grad_norm": 0.5128395557403564,
      "learning_rate": 0.00019787533941639638,
      "loss": 3.0089,
      "step": 180
    },
    {
      "epoch": 0.29780564263322884,
      "grad_norm": 0.7328920364379883,
      "learning_rate": 0.00019751426870524407,
      "loss": 3.0157,
      "step": 190
    },
    {
      "epoch": 0.31347962382445144,
      "grad_norm": 1.5265012979507446,
      "learning_rate": 0.000197125259589615,
      "loss": 2.9007,
      "step": 200
    },
    {
      "epoch": 0.329153605015674,
      "grad_norm": 0.2766813635826111,
      "learning_rate": 0.0001967084235228807,
      "loss": 2.8275,
      "step": 210
    },
    {
      "epoch": 0.3448275862068966,
      "grad_norm": 0.36695396900177,
      "learning_rate": 0.00019626387993099579,
      "loss": 2.9158,
      "step": 220
    },
    {
      "epoch": 0.3605015673981191,
      "grad_norm": 0.5359162092208862,
      "learning_rate": 0.00019579175617828187,
      "loss": 2.9465,
      "step": 230
    },
    {
      "epoch": 0.3761755485893417,
      "grad_norm": 0.6529833674430847,
      "learning_rate": 0.0001952921875309368,
      "loss": 2.981,
      "step": 240
    },
    {
      "epoch": 0.39184952978056425,
      "grad_norm": 1.5314627885818481,
      "learning_rate": 0.00019476531711828027,
      "loss": 2.9737,
      "step": 250
    },
    {
      "epoch": 0.40752351097178685,
      "grad_norm": 0.2949506342411041,
      "learning_rate": 0.00019421129589174618,
      "loss": 2.8208,
      "step": 260
    },
    {
      "epoch": 0.4231974921630094,
      "grad_norm": 0.39567869901657104,
      "learning_rate": 0.00019363028258163447,
      "loss": 2.8557,
      "step": 270
    },
    {
      "epoch": 0.438871473354232,
      "grad_norm": 0.5587254166603088,
      "learning_rate": 0.00019302244365163376,
      "loss": 2.9494,
      "step": 280
    },
    {
      "epoch": 0.45454545454545453,
      "grad_norm": 0.7218978404998779,
      "learning_rate": 0.0001923879532511287,
      "loss": 2.9742,
      "step": 290
    },
    {
      "epoch": 0.4702194357366771,
      "grad_norm": 1.4482598304748535,
      "learning_rate": 0.0001917269931653049,
      "loss": 2.8646,
      "step": 300
    },
    {
      "epoch": 0.48589341692789967,
      "grad_norm": 0.2901701033115387,
      "learning_rate": 0.00019103975276306678,
      "loss": 2.7788,
      "step": 310
    },
    {
      "epoch": 0.5015673981191222,
      "grad_norm": 0.4310539960861206,
      "learning_rate": 0.00019032642894278192,
      "loss": 2.8655,
      "step": 320
    },
    {
      "epoch": 0.5172413793103449,
      "grad_norm": 0.5589954853057861,
      "learning_rate": 0.0001895872260758688,
      "loss": 2.914,
      "step": 330
    },
    {
      "epoch": 0.5329153605015674,
      "grad_norm": 0.7243526577949524,
      "learning_rate": 0.00018882235594824308,
      "loss": 2.9191,
      "step": 340
    },
    {
      "epoch": 0.54858934169279,
      "grad_norm": 1.4200222492218018,
      "learning_rate": 0.00018803203769963967,
      "loss": 2.8128,
      "step": 350
    },
    {
      "epoch": 0.5642633228840125,
      "grad_norm": 0.26594147086143494,
      "learning_rate": 0.000187216497760828,
      "loss": 2.762,
      "step": 360
    },
    {
      "epoch": 0.5799373040752351,
      "grad_norm": 0.3894258439540863,
      "learning_rate": 0.00018637596978873835,
      "loss": 2.9077,
      "step": 370
    },
    {
      "epoch": 0.5956112852664577,
      "grad_norm": 0.5348561406135559,
      "learning_rate": 0.00018551069459951758,
      "loss": 2.9292,
      "step": 380
    },
    {
      "epoch": 0.6112852664576802,
      "grad_norm": 0.746507465839386,
      "learning_rate": 0.00018462092009953408,
      "loss": 2.8795,
      "step": 390
    },
    {
      "epoch": 0.6269592476489029,
      "grad_norm": 1.5225753784179688,
      "learning_rate": 0.0001837069012143511,
      "loss": 2.8263,
      "step": 400
    },
    {
      "epoch": 0.6426332288401254,
      "grad_norm": 0.26905450224876404,
      "learning_rate": 0.00018276889981568906,
      "loss": 2.7218,
      "step": 410
    },
    {
      "epoch": 0.658307210031348,
      "grad_norm": 0.3912515342235565,
      "learning_rate": 0.00018180718464639787,
      "loss": 2.819,
      "step": 420
    },
    {
      "epoch": 0.6739811912225705,
      "grad_norm": 0.5661373138427734,
      "learning_rate": 0.00018082203124346045,
      "loss": 2.8772,
      "step": 430
    },
    {
      "epoch": 0.6896551724137931,
      "grad_norm": 0.805776059627533,
      "learning_rate": 0.0001798137218590498,
      "loss": 2.9562,
      "step": 440
    },
    {
      "epoch": 0.7053291536050157,
      "grad_norm": 1.4879732131958008,
      "learning_rate": 0.00017878254537966216,
      "loss": 2.7925,
      "step": 450
    },
    {
      "epoch": 0.7210031347962382,
      "grad_norm": 0.28762391209602356,
      "learning_rate": 0.00017772879724334937,
      "loss": 2.8006,
      "step": 460
    },
    {
      "epoch": 0.7366771159874608,
      "grad_norm": 0.41769474744796753,
      "learning_rate": 0.00017665277935507398,
      "loss": 2.8148,
      "step": 470
    },
    {
      "epoch": 0.7523510971786834,
      "grad_norm": 0.633671760559082,
      "learning_rate": 0.00017555480000021198,
      "loss": 2.8461,
      "step": 480
    },
    {
      "epoch": 0.768025078369906,
      "grad_norm": 0.816681444644928,
      "learning_rate": 0.00017443517375622704,
      "loss": 2.8826,
      "step": 490
    },
    {
      "epoch": 0.7836990595611285,
      "grad_norm": 1.3913438320159912,
      "learning_rate": 0.00017329422140254235,
      "loss": 2.7449,
      "step": 500
    },
    {
      "epoch": 0.799373040752351,
      "grad_norm": 0.30519339442253113,
      "learning_rate": 0.0001721322698286354,
      "loss": 2.7933,
      "step": 510
    },
    {
      "epoch": 0.8150470219435737,
      "grad_norm": 0.3911365270614624,
      "learning_rate": 0.0001709496519403823,
      "loss": 2.8433,
      "step": 520
    },
    {
      "epoch": 0.8307210031347962,
      "grad_norm": 0.5197113752365112,
      "learning_rate": 0.00016974670656467824,
      "loss": 2.828,
      "step": 530
    },
    {
      "epoch": 0.8463949843260188,
      "grad_norm": 0.7463776469230652,
      "learning_rate": 0.00016852377835236166,
      "loss": 2.9549,
      "step": 540
    },
    {
      "epoch": 0.8620689655172413,
      "grad_norm": 1.6616039276123047,
      "learning_rate": 0.00016728121767946977,
      "loss": 2.8843,
      "step": 550
    },
    {
      "epoch": 0.877742946708464,
      "grad_norm": 0.2525625228881836,
      "learning_rate": 0.00016601938054685385,
      "loss": 2.7715,
      "step": 560
    },
    {
      "epoch": 0.8934169278996865,
      "grad_norm": 0.4268622100353241,
      "learning_rate": 0.00016473862847818277,
      "loss": 2.8196,
      "step": 570
    },
    {
      "epoch": 0.9090909090909091,
      "grad_norm": 0.5538944005966187,
      "learning_rate": 0.00016343932841636456,
      "loss": 2.8619,
      "step": 580
    },
    {
      "epoch": 0.9247648902821317,
      "grad_norm": 0.7213597297668457,
      "learning_rate": 0.00016212185261841499,
      "loss": 2.9276,
      "step": 590
    },
    {
      "epoch": 0.9404388714733543,
      "grad_norm": 1.6927486658096313,
      "learning_rate": 0.00016078657854880376,
      "loss": 2.7446,
      "step": 600
    },
    {
      "epoch": 0.9561128526645768,
      "grad_norm": 0.3090650141239166,
      "learning_rate": 0.000159433888771309,
      "loss": 2.747,
      "step": 610
    },
    {
      "epoch": 0.9717868338557993,
      "grad_norm": 0.4454885423183441,
      "learning_rate": 0.00015806417083941002,
      "loss": 2.8622,
      "step": 620
    },
    {
      "epoch": 0.987460815047022,
      "grad_norm": 0.8089830279350281,
      "learning_rate": 0.00015667781718525157,
      "loss": 2.8568,
      "step": 630
    },
    {
      "epoch": 1.0031347962382444,
      "grad_norm": 0.21911662817001343,
      "learning_rate": 0.00015527522500720934,
      "loss": 2.7012,
      "step": 640
    },
    {
      "epoch": 1.0188087774294672,
      "grad_norm": 0.3196061849594116,
      "learning_rate": 0.00015385679615609042,
      "loss": 2.7127,
      "step": 650
    },
    {
      "epoch": 1.0344827586206897,
      "grad_norm": 0.4679659605026245,
      "learning_rate": 0.00015242293702000086,
      "loss": 2.7103,
      "step": 660
    },
    {
      "epoch": 1.0501567398119123,
      "grad_norm": 0.6506277322769165,
      "learning_rate": 0.00015097405840791276,
      "loss": 2.6762,
      "step": 670
    },
    {
      "epoch": 1.0658307210031348,
      "grad_norm": 0.9151834845542908,
      "learning_rate": 0.00014951057543196566,
      "loss": 2.6503,
      "step": 680
    },
    {
      "epoch": 1.0815047021943573,
      "grad_norm": 0.2834145724773407,
      "learning_rate": 0.00014803290738853395,
      "loss": 2.5103,
      "step": 690
    },
    {
      "epoch": 1.09717868338558,
      "grad_norm": 0.3199128210544586,
      "learning_rate": 0.00014654147763809637,
      "loss": 2.7147,
      "step": 700
    },
    {
      "epoch": 1.1128526645768024,
      "grad_norm": 0.5001178979873657,
      "learning_rate": 0.00014503671348394057,
      "loss": 2.7124,
      "step": 710
    },
    {
      "epoch": 1.1285266457680252,
      "grad_norm": 0.6373796463012695,
      "learning_rate": 0.0001435190460497384,
      "loss": 2.7012,
      "step": 720
    },
    {
      "epoch": 1.1442006269592477,
      "grad_norm": 0.9262835383415222,
      "learning_rate": 0.00014198891015602646,
      "loss": 2.6493,
      "step": 730
    },
    {
      "epoch": 1.1598746081504703,
      "grad_norm": 0.2955109775066376,
      "learning_rate": 0.00014044674419562734,
      "loss": 2.5023,
      "step": 740
    },
    {
      "epoch": 1.1755485893416928,
      "grad_norm": 0.34623944759368896,
      "learning_rate": 0.0001388929900080476,
      "loss": 2.6849,
      "step": 750
    },
    {
      "epoch": 1.1912225705329154,
      "grad_norm": 0.43281346559524536,
      "learning_rate": 0.00013732809275288828,
      "loss": 2.6655,
      "step": 760
    },
    {
      "epoch": 1.206896551724138,
      "grad_norm": 0.6403496265411377,
      "learning_rate": 0.000135752500782304,
      "loss": 2.6557,
      "step": 770
    },
    {
      "epoch": 1.2225705329153604,
      "grad_norm": 0.8644494414329529,
      "learning_rate": 0.00013416666551254748,
      "loss": 2.6162,
      "step": 780
    },
    {
      "epoch": 1.238244514106583,
      "grad_norm": 0.2673655152320862,
      "learning_rate": 0.00013257104129463614,
      "loss": 2.4991,
      "step": 790
    },
    {
      "epoch": 1.2539184952978055,
      "grad_norm": 0.3414609432220459,
      "learning_rate": 0.00013096608528417788,
      "loss": 2.6638,
      "step": 800
    },
    {
      "epoch": 1.2695924764890283,
      "grad_norm": 0.47588247060775757,
      "learning_rate": 0.00012935225731039348,
      "loss": 2.6465,
      "step": 810
    },
    {
      "epoch": 1.2852664576802508,
      "grad_norm": 0.6843693852424622,
      "learning_rate": 0.00012773001974437267,
      "loss": 2.6697,
      "step": 820
    },
    {
      "epoch": 1.3009404388714734,
      "grad_norm": 0.9213519096374512,
      "learning_rate": 0.0001260998373666022,
      "loss": 2.6691,
      "step": 830
    },
    {
      "epoch": 1.316614420062696,
      "grad_norm": 0.26475438475608826,
      "learning_rate": 0.0001244621772338036,
      "loss": 2.5109,
      "step": 840
    },
    {
      "epoch": 1.3322884012539185,
      "grad_norm": 0.3754674196243286,
      "learning_rate": 0.0001228175085451186,
      "loss": 2.6699,
      "step": 850
    },
    {
      "epoch": 1.347962382445141,
      "grad_norm": 0.49352237582206726,
      "learning_rate": 0.00012116630250768097,
      "loss": 2.6889,
      "step": 860
    },
    {
      "epoch": 1.3636363636363638,
      "grad_norm": 0.7134298086166382,
      "learning_rate": 0.00011950903220161285,
      "loss": 2.6502,
      "step": 870
    },
    {
      "epoch": 1.3793103448275863,
      "grad_norm": 0.9475263953208923,
      "learning_rate": 0.00011784617244448451,
      "loss": 2.6062,
      "step": 880
    },
    {
      "epoch": 1.3949843260188088,
      "grad_norm": 0.2767506539821625,
      "learning_rate": 0.0001161781996552765,
      "loss": 2.4584,
      "step": 890
    },
    {
      "epoch": 1.4106583072100314,
      "grad_norm": 0.38563597202301025,
      "learning_rate": 0.00011450559171788269,
      "loss": 2.6751,
      "step": 900
    },
    {
      "epoch": 1.426332288401254,
      "grad_norm": 0.466782808303833,
      "learning_rate": 0.00011282882784419398,
      "loss": 2.6181,
      "step": 910
    },
    {
      "epoch": 1.4420062695924765,
      "grad_norm": 0.7600008845329285,
      "learning_rate": 0.00011114838843680095,
      "loss": 2.6729,
      "step": 920
    },
    {
      "epoch": 1.457680250783699,
      "grad_norm": 0.956167459487915,
      "learning_rate": 0.0001094647549513561,
      "loss": 2.5486,
      "step": 930
    },
    {
      "epoch": 1.4733542319749215,
      "grad_norm": 0.257744699716568,
      "learning_rate": 0.00010777840975863383,
      "loss": 2.4545,
      "step": 940
    },
    {
      "epoch": 1.489028213166144,
      "grad_norm": 0.347033828496933,
      "learning_rate": 0.00010608983600632831,
      "loss": 2.6313,
      "step": 950
    },
    {
      "epoch": 1.5047021943573666,
      "grad_norm": 0.5105902552604675,
      "learning_rate": 0.00010439951748062912,
      "loss": 2.623,
      "step": 960
    },
    {
      "epoch": 1.5203761755485894,
      "grad_norm": 0.7116357684135437,
      "learning_rate": 0.00010270793846761347,
      "loss": 2.6493,
      "step": 970
    },
    {
      "epoch": 1.536050156739812,
      "grad_norm": 0.9367708563804626,
      "learning_rate": 0.00010101558361449552,
      "loss": 2.6081,
      "step": 980
    },
    {
      "epoch": 1.5517241379310345,
      "grad_norm": 0.25939515233039856,
      "learning_rate": 9.932293779077216e-05,
      "loss": 2.4289,
      "step": 990
    },
    {
      "epoch": 1.567398119122257,
      "grad_norm": 0.3675819933414459,
      "learning_rate": 9.763048594930502e-05,
      "loss": 2.6617,
      "step": 1000
    },
    {
      "epoch": 1.5830721003134798,
      "grad_norm": 0.47927770018577576,
      "learning_rate": 9.59387129873787e-05,
      "loss": 2.661,
      "step": 1010
    },
    {
      "epoch": 1.5987460815047023,
      "grad_norm": 0.6929473876953125,
      "learning_rate": 9.42481036077749e-05,
      "loss": 2.6103,
      "step": 1020
    },
    {
      "epoch": 1.6144200626959249,
      "grad_norm": 0.9767149090766907,
      "learning_rate": 9.255914217990211e-05,
      "loss": 2.5943,
      "step": 1030
    },
    {
      "epoch": 1.6300940438871474,
      "grad_norm": 0.24275819957256317,
      "learning_rate": 9.08723126010212e-05,
      "loss": 2.4676,
      "step": 1040
    },
    {
      "epoch": 1.64576802507837,
      "grad_norm": 0.3554080128669739,
      "learning_rate": 8.918809815760585e-05,
      "loss": 2.6107,
      "step": 1050
    },
    {
      "epoch": 1.6614420062695925,
      "grad_norm": 0.5181464552879333,
      "learning_rate": 8.750698138687827e-05,
      "loss": 2.617,
      "step": 1060
    },
    {
      "epoch": 1.677115987460815,
      "grad_norm": 0.7479755282402039,
      "learning_rate": 8.582944393855941e-05,
      "loss": 2.6401,
      "step": 1070
    },
    {
      "epoch": 1.6927899686520376,
      "grad_norm": 0.9864292740821838,
      "learning_rate": 8.415596643687363e-05,
      "loss": 2.6327,
      "step": 1080
    },
    {
      "epoch": 1.70846394984326,
      "grad_norm": 0.24126508831977844,
      "learning_rate": 8.248702834284693e-05,
      "loss": 2.4569,
      "step": 1090
    },
    {
      "epoch": 1.7241379310344827,
      "grad_norm": 0.3491465449333191,
      "learning_rate": 8.082310781693865e-05,
      "loss": 2.6473,
      "step": 1100
    },
    {
      "epoch": 1.7398119122257052,
      "grad_norm": 0.5092320442199707,
      "learning_rate": 7.916468158204576e-05,
      "loss": 2.6587,
      "step": 1110
    },
    {
      "epoch": 1.7554858934169277,
      "grad_norm": 0.6617609262466431,
      "learning_rate": 7.751222478691884e-05,
      "loss": 2.6862,
      "step": 1120
    },
    {
      "epoch": 1.7711598746081505,
      "grad_norm": 1.0016579627990723,
      "learning_rate": 7.586621087002945e-05,
      "loss": 2.6226,
      "step": 1130
    },
    {
      "epoch": 1.786833855799373,
      "grad_norm": 0.2429930418729782,
      "learning_rate": 7.422711142392695e-05,
      "loss": 2.5479,
      "step": 1140
    },
    {
      "epoch": 1.8025078369905956,
      "grad_norm": 0.396308034658432,
      "learning_rate": 7.259539606012478e-05,
      "loss": 2.6005,
      "step": 1150
    },
    {
      "epoch": 1.8181818181818183,
      "grad_norm": 0.5580389499664307,
      "learning_rate": 7.097153227455379e-05,
      "loss": 2.6714,
      "step": 1160
    },
    {
      "epoch": 1.8338557993730409,
      "grad_norm": 0.7043899893760681,
      "learning_rate": 6.93559853136221e-05,
      "loss": 2.6417,
      "step": 1170
    },
    {
      "epoch": 1.8495297805642634,
      "grad_norm": 0.9616154432296753,
      "learning_rate": 6.774921804091934e-05,
      "loss": 2.5572,
      "step": 1180
    },
    {
      "epoch": 1.865203761755486,
      "grad_norm": 0.2324633002281189,
      "learning_rate": 6.615169080460331e-05,
      "loss": 2.4474,
      "step": 1190
    },
    {
      "epoch": 1.8808777429467085,
      "grad_norm": 0.3579946756362915,
      "learning_rate": 6.456386130550782e-05,
      "loss": 2.6281,
      "step": 1200
    },
    {
      "epoch": 1.896551724137931,
      "grad_norm": 0.5167599320411682,
      "learning_rate": 6.298618446600856e-05,
      "loss": 2.6397,
      "step": 1210
    },
    {
      "epoch": 1.9122257053291536,
      "grad_norm": 0.7201157808303833,
      "learning_rate": 6.141911229968533e-05,
      "loss": 2.6389,
      "step": 1220
    },
    {
      "epoch": 1.9278996865203761,
      "grad_norm": 1.1355105638504028,
      "learning_rate": 5.9863093781817394e-05,
      "loss": 2.6135,
      "step": 1230
    },
    {
      "epoch": 1.9435736677115987,
      "grad_norm": 0.2417139708995819,
      "learning_rate": 5.831857472074956e-05,
      "loss": 2.456,
      "step": 1240
    },
    {
      "epoch": 1.9592476489028212,
      "grad_norm": 0.4348711669445038,
      "learning_rate": 5.6785997630165435e-05,
      "loss": 2.6756,
      "step": 1250
    },
    {
      "epoch": 1.9749216300940438,
      "grad_norm": 0.6851107478141785,
      "learning_rate": 5.526580160230476e-05,
      "loss": 2.6338,
      "step": 1260
    },
    {
      "epoch": 1.9905956112852663,
      "grad_norm": 1.0602185726165771,
      "learning_rate": 5.375842218216076e-05,
      "loss": 2.5925,
      "step": 1270
    },
    {
      "epoch": 2.006269592476489,
      "grad_norm": 0.23286418616771698,
      "learning_rate": 5.226429124269423e-05,
      "loss": 2.4199,
      "step": 1280
    },
    {
      "epoch": 2.0219435736677114,
      "grad_norm": 0.41238126158714294,
      "learning_rate": 5.078383686109926e-05,
      "loss": 2.5441,
      "step": 1290
    },
    {
      "epoch": 2.0376175548589344,
      "grad_norm": 0.5980674624443054,
      "learning_rate": 4.931748319615656e-05,
      "loss": 2.5002,
      "step": 1300
    },
    {
      "epoch": 2.053291536050157,
      "grad_norm": 0.8170694708824158,
      "learning_rate": 4.786565036670972e-05,
      "loss": 2.4228,
      "step": 1310
    },
    {
      "epoch": 2.0689655172413794,
      "grad_norm": 1.1204980611801147,
      "learning_rate": 4.642875433129854e-05,
      "loss": 2.3089,
      "step": 1320
    },
    {
      "epoch": 2.084639498432602,
      "grad_norm": 0.28670328855514526,
      "learning_rate": 4.500720676898452e-05,
      "loss": 2.2163,
      "step": 1330
    },
    {
      "epoch": 2.1003134796238245,
      "grad_norm": 0.45365315675735474,
      "learning_rate": 4.36014149614026e-05,
      "loss": 2.5476,
      "step": 1340
    },
    {
      "epoch": 2.115987460815047,
      "grad_norm": 0.589374840259552,
      "learning_rate": 4.221178167607226e-05,
      "loss": 2.4646,
      "step": 1350
    },
    {
      "epoch": 2.1316614420062696,
      "grad_norm": 0.9127137064933777,
      "learning_rate": 4.083870505100263e-05,
      "loss": 2.3819,
      "step": 1360
    },
    {
      "epoch": 2.147335423197492,
      "grad_norm": 1.167529582977295,
      "learning_rate": 3.948257848062351e-05,
      "loss": 2.3445,
      "step": 1370
    },
    {
      "epoch": 2.1630094043887147,
      "grad_norm": 0.26559045910835266,
      "learning_rate": 3.8143790503075403e-05,
      "loss": 2.228,
      "step": 1380
    },
    {
      "epoch": 2.1786833855799372,
      "grad_norm": 0.5015429258346558,
      "learning_rate": 3.6822724688891416e-05,
      "loss": 2.5579,
      "step": 1390
    },
    {
      "epoch": 2.19435736677116,
      "grad_norm": 0.7187398672103882,
      "learning_rate": 3.551975953110177e-05,
      "loss": 2.4841,
      "step": 1400
    },
    {
      "epoch": 2.2100313479623823,
      "grad_norm": 0.872721791267395,
      "learning_rate": 3.423526833679355e-05,
      "loss": 2.418,
      "step": 1410
    },
    {
      "epoch": 2.225705329153605,
      "grad_norm": 1.3087424039840698,
      "learning_rate": 3.296961912015598e-05,
      "loss": 2.2849,
      "step": 1420
    },
    {
      "epoch": 2.2413793103448274,
      "grad_norm": 0.31001710891723633,
      "learning_rate": 3.172317449704216e-05,
      "loss": 2.2705,
      "step": 1430
    },
    {
      "epoch": 2.2570532915360504,
      "grad_norm": 0.47308361530303955,
      "learning_rate": 3.0496291581077673e-05,
      "loss": 2.5207,
      "step": 1440
    },
    {
      "epoch": 2.2727272727272725,
      "grad_norm": 0.6600085496902466,
      "learning_rate": 2.9289321881345254e-05,
      "loss": 2.4747,
      "step": 1450
    },
    {
      "epoch": 2.2884012539184955,
      "grad_norm": 0.8648523092269897,
      "learning_rate": 2.8102611201675598e-05,
      "loss": 2.4407,
      "step": 1460
    },
    {
      "epoch": 2.304075235109718,
      "grad_norm": 1.2338064908981323,
      "learning_rate": 2.6936499541572445e-05,
      "loss": 2.3066,
      "step": 1470
    },
    {
      "epoch": 2.3197492163009406,
      "grad_norm": 0.2991880476474762,
      "learning_rate": 2.5791320998800695e-05,
      "loss": 2.2581,
      "step": 1480
    },
    {
      "epoch": 2.335423197492163,
      "grad_norm": 0.48960086703300476,
      "learning_rate": 2.4667403673665623e-05,
      "loss": 2.5032,
      "step": 1490
    },
    {
      "epoch": 2.3510971786833856,
      "grad_norm": 0.6952352523803711,
      "learning_rate": 2.3565069575010035e-05,
      "loss": 2.4815,
      "step": 1500
    },
    {
      "epoch": 2.366771159874608,
      "grad_norm": 0.9776033163070679,
      "learning_rate": 2.248463452795705e-05,
      "loss": 2.4033,
      "step": 1510
    },
    {
      "epoch": 2.3824451410658307,
      "grad_norm": 1.2295323610305786,
      "learning_rate": 2.142640808342429e-05,
      "loss": 2.2546,
      "step": 1520
    },
    {
      "epoch": 2.3981191222570533,
      "grad_norm": 0.2598329782485962,
      "learning_rate": 2.0390693429435627e-05,
      "loss": 2.2122,
      "step": 1530
    },
    {
      "epoch": 2.413793103448276,
      "grad_norm": 0.4593777358531952,
      "learning_rate": 1.9377787304256302e-05,
      "loss": 2.518,
      "step": 1540
    },
    {
      "epoch": 2.4294670846394983,
      "grad_norm": 0.6363881826400757,
      "learning_rate": 1.838797991137543e-05,
      "loss": 2.4564,
      "step": 1550
    },
    {
      "epoch": 2.445141065830721,
      "grad_norm": 0.9114318490028381,
      "learning_rate": 1.742155483636123e-05,
      "loss": 2.4088,
      "step": 1560
    },
    {
      "epoch": 2.4608150470219434,
      "grad_norm": 1.3736695051193237,
      "learning_rate": 1.6478788965611993e-05,
      "loss": 2.3423,
      "step": 1570
    },
    {
      "epoch": 2.476489028213166,
      "grad_norm": 0.2641979157924652,
      "learning_rate": 1.555995240702648e-05,
      "loss": 2.1546,
      "step": 1580
    },
    {
      "epoch": 2.492163009404389,
      "grad_norm": 0.5061512589454651,
      "learning_rate": 1.4665308412616596e-05,
      "loss": 2.5098,
      "step": 1590
    },
    {
      "epoch": 2.507836990595611,
      "grad_norm": 0.6692546606063843,
      "learning_rate": 1.3795113303084006e-05,
      "loss": 2.4528,
      "step": 1600
    },
    {
      "epoch": 2.523510971786834,
      "grad_norm": 0.9549630880355835,
      "learning_rate": 1.2949616394382802e-05,
      "loss": 2.4189,
      "step": 1610
    },
    {
      "epoch": 2.5391849529780566,
      "grad_norm": 1.450711727142334,
      "learning_rate": 1.2129059926289166e-05,
      "loss": 2.2557,
      "step": 1620
    },
    {
      "epoch": 2.554858934169279,
      "grad_norm": 0.2581474781036377,
      "learning_rate": 1.1333678992998043e-05,
      "loss": 2.2043,
      "step": 1630
    },
    {
      "epoch": 2.5705329153605017,
      "grad_norm": 0.4815433621406555,
      "learning_rate": 1.0563701475767462e-05,
      "loss": 2.5434,
      "step": 1640
    },
    {
      "epoch": 2.586206896551724,
      "grad_norm": 0.6861922144889832,
      "learning_rate": 9.819347977629202e-06,
      "loss": 2.4619,
      "step": 1650
    },
    {
      "epoch": 2.6018808777429467,
      "grad_norm": 0.9293055534362793,
      "learning_rate": 9.100831760184625e-06,
      "loss": 2.4539,
      "step": 1660
    },
    {
      "epoch": 2.6175548589341693,
      "grad_norm": 1.2631028890609741,
      "learning_rate": 8.408358682504147e-06,
      "loss": 2.2262,
      "step": 1670
    },
    {
      "epoch": 2.633228840125392,
      "grad_norm": 0.3051944673061371,
      "learning_rate": 7.742127142147337e-06,
      "loss": 2.2413,
      "step": 1680
    },
    {
      "epoch": 2.6489028213166144,
      "grad_norm": 0.5327479243278503,
      "learning_rate": 7.102328018320858e-06,
      "loss": 2.5487,
      "step": 1690
    },
    {
      "epoch": 2.664576802507837,
      "grad_norm": 0.6830099821090698,
      "learning_rate": 6.489144617190501e-06,
      "loss": 2.492,
      "step": 1700
    },
    {
      "epoch": 2.6802507836990594,
      "grad_norm": 1.006087303161621,
      "learning_rate": 5.902752619362861e-06,
      "loss": 2.4549,
      "step": 1710
    },
    {
      "epoch": 2.695924764890282,
      "grad_norm": 1.3168909549713135,
      "learning_rate": 5.343320029551868e-06,
      "loss": 2.2644,
      "step": 1720
    },
    {
      "epoch": 2.7115987460815045,
      "grad_norm": 0.29884764552116394,
      "learning_rate": 4.811007128444445e-06,
      "loss": 2.3007,
      "step": 1730
    },
    {
      "epoch": 2.7272727272727275,
      "grad_norm": 0.4936012029647827,
      "learning_rate": 4.305966426779118e-06,
      "loss": 2.502,
      "step": 1740
    },
    {
      "epoch": 2.7429467084639496,
      "grad_norm": 0.689198911190033,
      "learning_rate": 3.828342621650882e-06,
      "loss": 2.4674,
      "step": 1750
    },
    {
      "epoch": 2.7586206896551726,
      "grad_norm": 0.9643833041191101,
      "learning_rate": 3.3782725550545625e-06,
      "loss": 2.4213,
      "step": 1760
    },
    {
      "epoch": 2.774294670846395,
      "grad_norm": 1.3325724601745605,
      "learning_rate": 2.9558851746788517e-06,
      "loss": 2.2601,
      "step": 1770
    },
    {
      "epoch": 2.7899686520376177,
      "grad_norm": 0.2799229621887207,
      "learning_rate": 2.561301496962043e-06,
      "loss": 2.2256,
      "step": 1780
    },
    {
      "epoch": 2.80564263322884,
      "grad_norm": 0.5209997296333313,
      "learning_rate": 2.194634572420029e-06,
      "loss": 2.5132,
      "step": 1790
    },
    {
      "epoch": 2.8213166144200628,
      "grad_norm": 0.6882754564285278,
      "learning_rate": 1.8559894532568122e-06,
      "loss": 2.428,
      "step": 1800
    },
    {
      "epoch": 2.8369905956112853,
      "grad_norm": 0.9445529580116272,
      "learning_rate": 1.545463163266303e-06,
      "loss": 2.4341,
      "step": 1810
    },
    {
      "epoch": 2.852664576802508,
      "grad_norm": 1.3681193590164185,
      "learning_rate": 1.263144670034555e-06,
      "loss": 2.2468,
      "step": 1820
    },
    {
      "epoch": 2.8683385579937304,
      "grad_norm": 0.2578428089618683,
      "learning_rate": 1.0091148594499666e-06,
      "loss": 2.2084,
      "step": 1830
    },
    {
      "epoch": 2.884012539184953,
      "grad_norm": 0.5128017663955688,
      "learning_rate": 7.83446512529007e-07,
      "loss": 2.5086,
      "step": 1840
    },
    {
      "epoch": 2.8996865203761755,
      "grad_norm": 0.7125623226165771,
      "learning_rate": 5.862042845640403e-07,
      "loss": 2.4374,
      "step": 1850
    },
    {
      "epoch": 2.915360501567398,
      "grad_norm": 1.0510890483856201,
      "learning_rate": 4.174446865992332e-07,
      "loss": 2.4102,
      "step": 1860
    },
    {
      "epoch": 2.9310344827586206,
      "grad_norm": 1.4845973253250122,
      "learning_rate": 2.7721606923978293e-07,
      "loss": 2.2825,
      "step": 1870
    },
    {
      "epoch": 2.946708463949843,
      "grad_norm": 0.3054816722869873,
      "learning_rate": 1.6555860879919892e-07,
      "loss": 2.1791,
      "step": 1880
    },
    {
      "epoch": 2.962382445141066,
      "grad_norm": 0.6139517426490784,
      "learning_rate": 8.250429578855467e-08,
      "loss": 2.5106,
      "step": 1890
    },
    {
      "epoch": 2.978056426332288,
      "grad_norm": 0.9643613696098328,
      "learning_rate": 2.8076925751008286e-08,
      "loss": 2.4253,
      "step": 1900
    },
    {
      "epoch": 2.993730407523511,
      "grad_norm": 1.5458600521087646,
      "learning_rate": 2.292092444255989e-09,
      "loss": 2.2764,
      "step": 1910
    }
  ],
  "logging_steps": 10,
  "max_steps": 1914,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 3.5314350752283034e+17,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}