|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 17.77459749552773, |
|
"eval_steps": 500, |
|
"global_step": 1242, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.14311270125223613, |
|
"grad_norm": 7.146514892578125, |
|
"learning_rate": 0.00039677938808373593, |
|
"loss": 3.5751, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.28622540250447226, |
|
"grad_norm": 2.1325223445892334, |
|
"learning_rate": 0.00039355877616747184, |
|
"loss": 0.939, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.4293381037567084, |
|
"grad_norm": 0.8520782589912415, |
|
"learning_rate": 0.00039033816425120774, |
|
"loss": 0.2653, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.5724508050089445, |
|
"grad_norm": 0.7653748393058777, |
|
"learning_rate": 0.00038711755233494365, |
|
"loss": 0.1603, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.7155635062611807, |
|
"grad_norm": 0.661469578742981, |
|
"learning_rate": 0.00038389694041867956, |
|
"loss": 0.1886, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.8586762075134168, |
|
"grad_norm": 0.39610955119132996, |
|
"learning_rate": 0.00038067632850241547, |
|
"loss": 0.1859, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.0017889087656529, |
|
"grad_norm": 0.4488755464553833, |
|
"learning_rate": 0.0003774557165861514, |
|
"loss": 0.1538, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 1.144901610017889, |
|
"grad_norm": 0.2944377362728119, |
|
"learning_rate": 0.00037423510466988734, |
|
"loss": 0.1195, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.2880143112701252, |
|
"grad_norm": 0.29124024510383606, |
|
"learning_rate": 0.0003710144927536232, |
|
"loss": 0.1271, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.4311270125223614, |
|
"grad_norm": 0.42328736186027527, |
|
"learning_rate": 0.0003677938808373591, |
|
"loss": 0.1018, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.5742397137745976, |
|
"grad_norm": 0.3259565234184265, |
|
"learning_rate": 0.00036457326892109506, |
|
"loss": 0.0848, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.7173524150268338, |
|
"grad_norm": 0.479124516248703, |
|
"learning_rate": 0.0003613526570048309, |
|
"loss": 0.106, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.8604651162790697, |
|
"grad_norm": 0.40788090229034424, |
|
"learning_rate": 0.0003581320450885668, |
|
"loss": 0.0969, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 2.0035778175313057, |
|
"grad_norm": 0.3574964106082916, |
|
"learning_rate": 0.0003549114331723028, |
|
"loss": 0.124, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 2.146690518783542, |
|
"grad_norm": 0.37805065512657166, |
|
"learning_rate": 0.0003516908212560387, |
|
"loss": 0.0491, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 2.289803220035778, |
|
"grad_norm": 0.25937220454216003, |
|
"learning_rate": 0.00034847020933977455, |
|
"loss": 0.0669, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 2.4329159212880143, |
|
"grad_norm": 0.34056201577186584, |
|
"learning_rate": 0.00034524959742351046, |
|
"loss": 0.0595, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 2.5760286225402504, |
|
"grad_norm": 0.30211707949638367, |
|
"learning_rate": 0.0003420289855072464, |
|
"loss": 0.0648, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 2.7191413237924866, |
|
"grad_norm": 0.18458786606788635, |
|
"learning_rate": 0.0003388083735909823, |
|
"loss": 0.0545, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 2.862254025044723, |
|
"grad_norm": 0.27384912967681885, |
|
"learning_rate": 0.0003355877616747182, |
|
"loss": 0.0684, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 3.005366726296959, |
|
"grad_norm": 0.16877304017543793, |
|
"learning_rate": 0.00033236714975845414, |
|
"loss": 0.0695, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 3.148479427549195, |
|
"grad_norm": 0.07739146798849106, |
|
"learning_rate": 0.00032914653784219005, |
|
"loss": 0.0256, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 3.2915921288014314, |
|
"grad_norm": 0.2832132577896118, |
|
"learning_rate": 0.0003259259259259259, |
|
"loss": 0.0263, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 3.434704830053667, |
|
"grad_norm": 0.21412289142608643, |
|
"learning_rate": 0.00032270531400966187, |
|
"loss": 0.0287, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 3.5778175313059033, |
|
"grad_norm": 0.1840696483850479, |
|
"learning_rate": 0.0003194847020933978, |
|
"loss": 0.0469, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 3.7209302325581395, |
|
"grad_norm": 0.34246236085891724, |
|
"learning_rate": 0.00031626409017713363, |
|
"loss": 0.0243, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 3.8640429338103757, |
|
"grad_norm": 0.056173525750637054, |
|
"learning_rate": 0.0003130434782608696, |
|
"loss": 0.0252, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 4.007155635062611, |
|
"grad_norm": 0.09256428480148315, |
|
"learning_rate": 0.0003098228663446055, |
|
"loss": 0.0216, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 4.150268336314848, |
|
"grad_norm": 0.20085078477859497, |
|
"learning_rate": 0.0003066022544283414, |
|
"loss": 0.0102, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 4.293381037567084, |
|
"grad_norm": 0.021982286125421524, |
|
"learning_rate": 0.0003033816425120773, |
|
"loss": 0.0131, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 4.43649373881932, |
|
"grad_norm": 0.054368916898965836, |
|
"learning_rate": 0.0003001610305958132, |
|
"loss": 0.0145, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 4.579606440071556, |
|
"grad_norm": 0.0868581086397171, |
|
"learning_rate": 0.00029694041867954913, |
|
"loss": 0.0181, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 4.722719141323792, |
|
"grad_norm": 0.24308475852012634, |
|
"learning_rate": 0.00029371980676328504, |
|
"loss": 0.0125, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 4.8658318425760285, |
|
"grad_norm": 0.14394602179527283, |
|
"learning_rate": 0.00029049919484702095, |
|
"loss": 0.0149, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 5.008944543828265, |
|
"grad_norm": 0.05040862783789635, |
|
"learning_rate": 0.00028727858293075686, |
|
"loss": 0.0096, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 5.152057245080501, |
|
"grad_norm": 0.28047820925712585, |
|
"learning_rate": 0.00028405797101449276, |
|
"loss": 0.0032, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 5.295169946332737, |
|
"grad_norm": 0.07502233237028122, |
|
"learning_rate": 0.0002808373590982287, |
|
"loss": 0.0038, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 5.438282647584973, |
|
"grad_norm": 0.8537871837615967, |
|
"learning_rate": 0.0002776167471819646, |
|
"loss": 0.0073, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 5.5813953488372094, |
|
"grad_norm": 0.005727715790271759, |
|
"learning_rate": 0.0002743961352657005, |
|
"loss": 0.0106, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 5.724508050089446, |
|
"grad_norm": 0.04042937234044075, |
|
"learning_rate": 0.0002711755233494364, |
|
"loss": 0.0041, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 5.867620751341682, |
|
"grad_norm": 0.11248348653316498, |
|
"learning_rate": 0.0002679549114331723, |
|
"loss": 0.003, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 6.010733452593918, |
|
"grad_norm": 0.014976495876908302, |
|
"learning_rate": 0.00026473429951690827, |
|
"loss": 0.0067, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 6.153846153846154, |
|
"grad_norm": 0.15070898830890656, |
|
"learning_rate": 0.0002615136876006441, |
|
"loss": 0.0035, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 6.29695885509839, |
|
"grad_norm": 0.0066925715655088425, |
|
"learning_rate": 0.00025829307568438003, |
|
"loss": 0.0027, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 6.440071556350626, |
|
"grad_norm": 0.015314973890781403, |
|
"learning_rate": 0.00025507246376811594, |
|
"loss": 0.0045, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 6.583184257602863, |
|
"grad_norm": 0.030470581725239754, |
|
"learning_rate": 0.00025185185185185185, |
|
"loss": 0.0029, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 6.726296958855098, |
|
"grad_norm": 0.0092542115598917, |
|
"learning_rate": 0.00024863123993558775, |
|
"loss": 0.0043, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 6.869409660107334, |
|
"grad_norm": 0.016118695959448814, |
|
"learning_rate": 0.00024541062801932366, |
|
"loss": 0.0011, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 7.01252236135957, |
|
"grad_norm": 0.01760493591427803, |
|
"learning_rate": 0.0002421900161030596, |
|
"loss": 0.004, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 7.155635062611807, |
|
"grad_norm": 0.0077838534489274025, |
|
"learning_rate": 0.0002389694041867955, |
|
"loss": 0.0004, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 7.298747763864043, |
|
"grad_norm": 0.006766254547983408, |
|
"learning_rate": 0.00023574879227053139, |
|
"loss": 0.0015, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 7.441860465116279, |
|
"grad_norm": 0.009164445102214813, |
|
"learning_rate": 0.00023252818035426732, |
|
"loss": 0.0006, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 7.584973166368515, |
|
"grad_norm": 0.002783432835713029, |
|
"learning_rate": 0.00022930756843800323, |
|
"loss": 0.0005, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 7.728085867620751, |
|
"grad_norm": 0.15486985445022583, |
|
"learning_rate": 0.0002260869565217391, |
|
"loss": 0.0036, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 7.8711985688729875, |
|
"grad_norm": 0.042650897055864334, |
|
"learning_rate": 0.00022286634460547507, |
|
"loss": 0.0006, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 8.014311270125223, |
|
"grad_norm": 0.0018309111474081874, |
|
"learning_rate": 0.00021964573268921095, |
|
"loss": 0.0011, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 8.15742397137746, |
|
"grad_norm": 0.005560223013162613, |
|
"learning_rate": 0.00021642512077294686, |
|
"loss": 0.001, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 8.300536672629695, |
|
"grad_norm": 0.03727242350578308, |
|
"learning_rate": 0.0002132045088566828, |
|
"loss": 0.0015, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 8.443649373881932, |
|
"grad_norm": 0.34321093559265137, |
|
"learning_rate": 0.00020998389694041868, |
|
"loss": 0.0031, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 8.586762075134168, |
|
"grad_norm": 0.020942572504281998, |
|
"learning_rate": 0.00020676328502415459, |
|
"loss": 0.0022, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 8.729874776386405, |
|
"grad_norm": 0.010578208602964878, |
|
"learning_rate": 0.00020354267310789052, |
|
"loss": 0.0006, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 8.87298747763864, |
|
"grad_norm": 0.004323468543589115, |
|
"learning_rate": 0.00020032206119162643, |
|
"loss": 0.0005, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 9.016100178890877, |
|
"grad_norm": 0.1614646017551422, |
|
"learning_rate": 0.00019710144927536234, |
|
"loss": 0.0011, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 9.159212880143112, |
|
"grad_norm": 0.0012326347641646862, |
|
"learning_rate": 0.00019388083735909825, |
|
"loss": 0.0005, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 9.30232558139535, |
|
"grad_norm": 0.006543063558638096, |
|
"learning_rate": 0.00019066022544283415, |
|
"loss": 0.0002, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 9.445438282647585, |
|
"grad_norm": 0.21280421316623688, |
|
"learning_rate": 0.00018743961352657006, |
|
"loss": 0.0003, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 9.588550983899822, |
|
"grad_norm": 0.006327577400952578, |
|
"learning_rate": 0.00018421900161030597, |
|
"loss": 0.0002, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 9.731663685152057, |
|
"grad_norm": 0.0025285291485488415, |
|
"learning_rate": 0.00018099838969404188, |
|
"loss": 0.0001, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 9.874776386404294, |
|
"grad_norm": 0.0014309959951788187, |
|
"learning_rate": 0.00017777777777777779, |
|
"loss": 0.0001, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 10.01788908765653, |
|
"grad_norm": 0.0024150668177753687, |
|
"learning_rate": 0.0001745571658615137, |
|
"loss": 0.0001, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 10.161001788908766, |
|
"grad_norm": 0.0025761763099581003, |
|
"learning_rate": 0.0001713365539452496, |
|
"loss": 0.0001, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 10.304114490161002, |
|
"grad_norm": 0.0017020882805809379, |
|
"learning_rate": 0.0001681159420289855, |
|
"loss": 0.0001, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 10.447227191413237, |
|
"grad_norm": 0.0028596080373972654, |
|
"learning_rate": 0.00016489533011272142, |
|
"loss": 0.0001, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 10.590339892665474, |
|
"grad_norm": 0.0019378801807761192, |
|
"learning_rate": 0.00016167471819645735, |
|
"loss": 0.0001, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 10.73345259391771, |
|
"grad_norm": 0.001211544731631875, |
|
"learning_rate": 0.00015845410628019323, |
|
"loss": 0.0001, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 10.876565295169947, |
|
"grad_norm": 0.0033484594896435738, |
|
"learning_rate": 0.00015523349436392914, |
|
"loss": 0.0001, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 11.019677996422182, |
|
"grad_norm": 0.001493943389505148, |
|
"learning_rate": 0.00015201288244766508, |
|
"loss": 0.0001, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 11.162790697674419, |
|
"grad_norm": 0.0019909776747226715, |
|
"learning_rate": 0.00014879227053140096, |
|
"loss": 0.0001, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 11.305903398926654, |
|
"grad_norm": 0.0011982638388872147, |
|
"learning_rate": 0.0001455716586151369, |
|
"loss": 0.0001, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 11.449016100178891, |
|
"grad_norm": 0.0015958467265591025, |
|
"learning_rate": 0.0001423510466988728, |
|
"loss": 0.0001, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 11.592128801431127, |
|
"grad_norm": 0.0008461058023385704, |
|
"learning_rate": 0.0001391304347826087, |
|
"loss": 0.0001, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 11.735241502683364, |
|
"grad_norm": 0.0005576548865064979, |
|
"learning_rate": 0.00013590982286634462, |
|
"loss": 0.0001, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 11.878354203935599, |
|
"grad_norm": 0.0017713948618620634, |
|
"learning_rate": 0.0001326892109500805, |
|
"loss": 0.0001, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 12.021466905187836, |
|
"grad_norm": 0.001206880551762879, |
|
"learning_rate": 0.00012946859903381643, |
|
"loss": 0.0001, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 12.164579606440071, |
|
"grad_norm": 0.0013083606027066708, |
|
"learning_rate": 0.00012624798711755234, |
|
"loss": 0.0001, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 12.307692307692308, |
|
"grad_norm": 0.0008201482123695314, |
|
"learning_rate": 0.00012302737520128825, |
|
"loss": 0.0001, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 12.450805008944544, |
|
"grad_norm": 0.0006802495336160064, |
|
"learning_rate": 0.00011980676328502416, |
|
"loss": 0.0001, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 12.59391771019678, |
|
"grad_norm": 0.0017911783652380109, |
|
"learning_rate": 0.00011658615136876008, |
|
"loss": 0.0001, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 12.737030411449016, |
|
"grad_norm": 0.0007388959056697786, |
|
"learning_rate": 0.00011336553945249598, |
|
"loss": 0.0, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 12.880143112701251, |
|
"grad_norm": 0.0007727427291683853, |
|
"learning_rate": 0.00011014492753623188, |
|
"loss": 0.0001, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 13.023255813953488, |
|
"grad_norm": 0.0008818788919597864, |
|
"learning_rate": 0.0001069243156199678, |
|
"loss": 0.0001, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 13.166368515205724, |
|
"grad_norm": 0.0005572364898398519, |
|
"learning_rate": 0.0001037037037037037, |
|
"loss": 0.0, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 13.30948121645796, |
|
"grad_norm": 0.0009758470696397126, |
|
"learning_rate": 0.00010048309178743962, |
|
"loss": 0.0001, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 13.452593917710196, |
|
"grad_norm": 0.0003166435344610363, |
|
"learning_rate": 9.726247987117553e-05, |
|
"loss": 0.0001, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 13.595706618962433, |
|
"grad_norm": 0.0005005749990232289, |
|
"learning_rate": 9.404186795491144e-05, |
|
"loss": 0.0, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 13.738819320214668, |
|
"grad_norm": 0.0003304154670331627, |
|
"learning_rate": 9.082125603864735e-05, |
|
"loss": 0.0001, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 13.881932021466906, |
|
"grad_norm": 0.0005377003108151257, |
|
"learning_rate": 8.760064412238325e-05, |
|
"loss": 0.0001, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 14.02504472271914, |
|
"grad_norm": 0.0015913191018626094, |
|
"learning_rate": 8.438003220611916e-05, |
|
"loss": 0.0001, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 14.168157423971378, |
|
"grad_norm": 0.000676720985211432, |
|
"learning_rate": 8.115942028985508e-05, |
|
"loss": 0.0001, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 14.311270125223613, |
|
"grad_norm": 0.0007494900492019951, |
|
"learning_rate": 7.793880837359099e-05, |
|
"loss": 0.0001, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 14.45438282647585, |
|
"grad_norm": 0.0015422647120431066, |
|
"learning_rate": 7.47181964573269e-05, |
|
"loss": 0.0, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 14.597495527728086, |
|
"grad_norm": 0.0005012313486076891, |
|
"learning_rate": 7.14975845410628e-05, |
|
"loss": 0.0, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 14.740608228980323, |
|
"grad_norm": 0.0008338551269844174, |
|
"learning_rate": 6.827697262479872e-05, |
|
"loss": 0.0, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 14.883720930232558, |
|
"grad_norm": 0.0006810138584114611, |
|
"learning_rate": 6.505636070853462e-05, |
|
"loss": 0.0001, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 15.026833631484795, |
|
"grad_norm": 0.00043299293611198664, |
|
"learning_rate": 6.183574879227053e-05, |
|
"loss": 0.0, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 15.16994633273703, |
|
"grad_norm": 0.0005277034360915422, |
|
"learning_rate": 5.861513687600645e-05, |
|
"loss": 0.0, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 15.313059033989267, |
|
"grad_norm": 0.0006858156993985176, |
|
"learning_rate": 5.5394524959742355e-05, |
|
"loss": 0.0001, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 15.456171735241503, |
|
"grad_norm": 0.0008438636432401836, |
|
"learning_rate": 5.217391304347826e-05, |
|
"loss": 0.0, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 15.59928443649374, |
|
"grad_norm": 0.0012173138093203306, |
|
"learning_rate": 4.895330112721417e-05, |
|
"loss": 0.0001, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 15.742397137745975, |
|
"grad_norm": 0.002290137577801943, |
|
"learning_rate": 4.573268921095008e-05, |
|
"loss": 0.0, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 15.88550983899821, |
|
"grad_norm": 0.0005496228695847094, |
|
"learning_rate": 4.2512077294685994e-05, |
|
"loss": 0.0, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 16.028622540250446, |
|
"grad_norm": 0.0018827420426532626, |
|
"learning_rate": 3.92914653784219e-05, |
|
"loss": 0.0, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 16.171735241502684, |
|
"grad_norm": 0.00045006562140770257, |
|
"learning_rate": 3.607085346215781e-05, |
|
"loss": 0.0001, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 16.31484794275492, |
|
"grad_norm": 0.0005126325413584709, |
|
"learning_rate": 3.2850241545893725e-05, |
|
"loss": 0.0, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 16.457960644007155, |
|
"grad_norm": 0.00035093360929749906, |
|
"learning_rate": 2.962962962962963e-05, |
|
"loss": 0.0, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 16.60107334525939, |
|
"grad_norm": 0.0010109692811965942, |
|
"learning_rate": 2.640901771336554e-05, |
|
"loss": 0.0001, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 16.74418604651163, |
|
"grad_norm": 0.0006910230731591582, |
|
"learning_rate": 2.318840579710145e-05, |
|
"loss": 0.0, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 16.887298747763865, |
|
"grad_norm": 0.0004351095121819526, |
|
"learning_rate": 1.996779388083736e-05, |
|
"loss": 0.0, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 17.0304114490161, |
|
"grad_norm": 0.0006468660430982709, |
|
"learning_rate": 1.674718196457327e-05, |
|
"loss": 0.0, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 17.173524150268335, |
|
"grad_norm": 0.0002576902334112674, |
|
"learning_rate": 1.3526570048309179e-05, |
|
"loss": 0.0, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 17.316636851520574, |
|
"grad_norm": 0.0010522498050704598, |
|
"learning_rate": 1.0305958132045089e-05, |
|
"loss": 0.0, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 17.45974955277281, |
|
"grad_norm": 0.0007789513911120594, |
|
"learning_rate": 7.0853462157809985e-06, |
|
"loss": 0.0001, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 17.602862254025045, |
|
"grad_norm": 0.0009570368565618992, |
|
"learning_rate": 3.864734299516908e-06, |
|
"loss": 0.0, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 17.74597495527728, |
|
"grad_norm": 0.0009920781012624502, |
|
"learning_rate": 6.44122383252818e-07, |
|
"loss": 0.0, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 17.77459749552773, |
|
"step": 1242, |
|
"total_flos": 6.807580263736934e+16, |
|
"train_loss": 0.056659956144777014, |
|
"train_runtime": 9770.217, |
|
"train_samples_per_second": 6.177, |
|
"train_steps_per_second": 0.127 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 1242, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 18, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 6.807580263736934e+16, |
|
"train_batch_size": 6, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|