|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 2000, |
|
"global_step": 9033, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0011070519207350825, |
|
"grad_norm": 55.698127642695866, |
|
"learning_rate": 1.1061946902654869e-07, |
|
"loss": 4.4914, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.002214103841470165, |
|
"grad_norm": 41.723334569659066, |
|
"learning_rate": 2.2123893805309737e-07, |
|
"loss": 4.4954, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0033211557622052474, |
|
"grad_norm": 43.79884898454788, |
|
"learning_rate": 3.318584070796461e-07, |
|
"loss": 4.296, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.00442820768294033, |
|
"grad_norm": 31.30694571868551, |
|
"learning_rate": 4.4247787610619474e-07, |
|
"loss": 3.9204, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.0055352596036754124, |
|
"grad_norm": 30.761018090708117, |
|
"learning_rate": 5.530973451327435e-07, |
|
"loss": 3.4951, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.006642311524410495, |
|
"grad_norm": 32.180378162249326, |
|
"learning_rate": 6.637168141592922e-07, |
|
"loss": 3.4177, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.007749363445145577, |
|
"grad_norm": 24.157005825357814, |
|
"learning_rate": 7.743362831858408e-07, |
|
"loss": 3.3864, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.00885641536588066, |
|
"grad_norm": 29.036937778457148, |
|
"learning_rate": 8.849557522123895e-07, |
|
"loss": 3.1996, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.009963467286615742, |
|
"grad_norm": 23.44273601537366, |
|
"learning_rate": 9.95575221238938e-07, |
|
"loss": 3.2652, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.011070519207350825, |
|
"grad_norm": 31.552666658205744, |
|
"learning_rate": 1.106194690265487e-06, |
|
"loss": 3.2654, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.012177571128085908, |
|
"grad_norm": 23.158016097255164, |
|
"learning_rate": 1.2168141592920355e-06, |
|
"loss": 3.1954, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.01328462304882099, |
|
"grad_norm": 31.510121445270936, |
|
"learning_rate": 1.3274336283185843e-06, |
|
"loss": 3.2016, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.014391674969556073, |
|
"grad_norm": 29.02944479343648, |
|
"learning_rate": 1.438053097345133e-06, |
|
"loss": 3.202, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.015498726890291154, |
|
"grad_norm": 24.737793045013742, |
|
"learning_rate": 1.5486725663716816e-06, |
|
"loss": 3.054, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.016605778811026237, |
|
"grad_norm": 25.982953823095542, |
|
"learning_rate": 1.6592920353982304e-06, |
|
"loss": 3.1637, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.01771283073176132, |
|
"grad_norm": 24.62246443187751, |
|
"learning_rate": 1.769911504424779e-06, |
|
"loss": 3.0422, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.018819882652496404, |
|
"grad_norm": 24.996096258559348, |
|
"learning_rate": 1.8805309734513274e-06, |
|
"loss": 2.9983, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.019926934573231483, |
|
"grad_norm": 30.197182446028002, |
|
"learning_rate": 1.991150442477876e-06, |
|
"loss": 3.0625, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.021033986493966567, |
|
"grad_norm": 25.648689604176077, |
|
"learning_rate": 2.101769911504425e-06, |
|
"loss": 3.2172, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.02214103841470165, |
|
"grad_norm": 30.003866974191762, |
|
"learning_rate": 2.212389380530974e-06, |
|
"loss": 3.0895, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.023248090335436733, |
|
"grad_norm": 27.11239518865144, |
|
"learning_rate": 2.3230088495575224e-06, |
|
"loss": 2.9847, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.024355142256171816, |
|
"grad_norm": 24.532976628512625, |
|
"learning_rate": 2.433628318584071e-06, |
|
"loss": 3.0439, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.025462194176906896, |
|
"grad_norm": 25.595931432489675, |
|
"learning_rate": 2.5442477876106196e-06, |
|
"loss": 2.9722, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.02656924609764198, |
|
"grad_norm": 24.927511131743852, |
|
"learning_rate": 2.6548672566371687e-06, |
|
"loss": 3.0965, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.027676298018377062, |
|
"grad_norm": 21.477421706673375, |
|
"learning_rate": 2.765486725663717e-06, |
|
"loss": 2.9589, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.028783349939112145, |
|
"grad_norm": 26.975518213759347, |
|
"learning_rate": 2.876106194690266e-06, |
|
"loss": 2.9878, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.029890401859847225, |
|
"grad_norm": 25.199714957692397, |
|
"learning_rate": 2.9867256637168145e-06, |
|
"loss": 3.0718, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.03099745378058231, |
|
"grad_norm": 25.911079775392583, |
|
"learning_rate": 3.097345132743363e-06, |
|
"loss": 3.1289, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.03210450570131739, |
|
"grad_norm": 22.333848218066173, |
|
"learning_rate": 3.2079646017699117e-06, |
|
"loss": 3.019, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.033211557622052475, |
|
"grad_norm": 27.77592678094656, |
|
"learning_rate": 3.3185840707964607e-06, |
|
"loss": 3.0679, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.03431860954278756, |
|
"grad_norm": 25.520987098358336, |
|
"learning_rate": 3.429203539823009e-06, |
|
"loss": 3.0455, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.03542566146352264, |
|
"grad_norm": 30.694095405699013, |
|
"learning_rate": 3.539823008849558e-06, |
|
"loss": 2.9686, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.036532713384257724, |
|
"grad_norm": 34.07278189685705, |
|
"learning_rate": 3.6504424778761066e-06, |
|
"loss": 3.0074, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.03763976530499281, |
|
"grad_norm": 23.233595027296616, |
|
"learning_rate": 3.7610619469026547e-06, |
|
"loss": 2.9906, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.038746817225727884, |
|
"grad_norm": 21.65008179710679, |
|
"learning_rate": 3.871681415929203e-06, |
|
"loss": 2.9965, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.03985386914646297, |
|
"grad_norm": 25.432398948327197, |
|
"learning_rate": 3.982300884955752e-06, |
|
"loss": 2.9583, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.04096092106719805, |
|
"grad_norm": 24.118552348993813, |
|
"learning_rate": 4.092920353982301e-06, |
|
"loss": 2.9629, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.04206797298793313, |
|
"grad_norm": 28.535820173184682, |
|
"learning_rate": 4.20353982300885e-06, |
|
"loss": 3.0437, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.043175024908668216, |
|
"grad_norm": 27.574173741552002, |
|
"learning_rate": 4.314159292035399e-06, |
|
"loss": 2.9642, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.0442820768294033, |
|
"grad_norm": 27.270408053929884, |
|
"learning_rate": 4.424778761061948e-06, |
|
"loss": 3.0859, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.04538912875013838, |
|
"grad_norm": 27.57691676783791, |
|
"learning_rate": 4.535398230088496e-06, |
|
"loss": 3.009, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.046496180670873466, |
|
"grad_norm": 23.96155441996071, |
|
"learning_rate": 4.646017699115045e-06, |
|
"loss": 2.9363, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.04760323259160855, |
|
"grad_norm": 24.279797643812547, |
|
"learning_rate": 4.756637168141594e-06, |
|
"loss": 3.061, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.04871028451234363, |
|
"grad_norm": 25.19191068246207, |
|
"learning_rate": 4.867256637168142e-06, |
|
"loss": 2.9153, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.04981733643307871, |
|
"grad_norm": 28.82056510425203, |
|
"learning_rate": 4.97787610619469e-06, |
|
"loss": 3.1449, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.05092438835381379, |
|
"grad_norm": 23.600325136397633, |
|
"learning_rate": 5.088495575221239e-06, |
|
"loss": 3.0081, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.052031440274548875, |
|
"grad_norm": 23.484025108009725, |
|
"learning_rate": 5.1991150442477875e-06, |
|
"loss": 3.0463, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.05313849219528396, |
|
"grad_norm": 19.983575881103242, |
|
"learning_rate": 5.309734513274337e-06, |
|
"loss": 3.0811, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.05424554411601904, |
|
"grad_norm": 21.728377820671874, |
|
"learning_rate": 5.4203539823008855e-06, |
|
"loss": 3.1061, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.055352596036754124, |
|
"grad_norm": 34.21680389185442, |
|
"learning_rate": 5.530973451327434e-06, |
|
"loss": 3.0082, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.05645964795748921, |
|
"grad_norm": 28.98744881318429, |
|
"learning_rate": 5.641592920353984e-06, |
|
"loss": 2.9659, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.05756669987822429, |
|
"grad_norm": 21.719020231412607, |
|
"learning_rate": 5.752212389380532e-06, |
|
"loss": 2.9689, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.058673751798959374, |
|
"grad_norm": 26.343484772343533, |
|
"learning_rate": 5.86283185840708e-06, |
|
"loss": 3.0181, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.05978080371969445, |
|
"grad_norm": 26.674266585106718, |
|
"learning_rate": 5.973451327433629e-06, |
|
"loss": 2.9782, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.06088785564042953, |
|
"grad_norm": 24.29263386559663, |
|
"learning_rate": 6.084070796460177e-06, |
|
"loss": 3.0291, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.06199490756116462, |
|
"grad_norm": 27.260031480591426, |
|
"learning_rate": 6.194690265486726e-06, |
|
"loss": 3.0252, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.0631019594818997, |
|
"grad_norm": 20.957832212139657, |
|
"learning_rate": 6.305309734513275e-06, |
|
"loss": 3.0388, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.06420901140263478, |
|
"grad_norm": 26.9583565130981, |
|
"learning_rate": 6.415929203539823e-06, |
|
"loss": 2.9987, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.06531606332336987, |
|
"grad_norm": 23.667021249704298, |
|
"learning_rate": 6.526548672566372e-06, |
|
"loss": 2.9786, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.06642311524410495, |
|
"grad_norm": 24.584436820766868, |
|
"learning_rate": 6.6371681415929215e-06, |
|
"loss": 3.0082, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.06753016716484003, |
|
"grad_norm": 28.424068265725914, |
|
"learning_rate": 6.74778761061947e-06, |
|
"loss": 3.1212, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.06863721908557512, |
|
"grad_norm": 21.704948850763316, |
|
"learning_rate": 6.858407079646018e-06, |
|
"loss": 3.0008, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.0697442710063102, |
|
"grad_norm": 25.82364197800952, |
|
"learning_rate": 6.969026548672567e-06, |
|
"loss": 2.9993, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.07085132292704528, |
|
"grad_norm": 23.887813042264725, |
|
"learning_rate": 7.079646017699116e-06, |
|
"loss": 2.9319, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.07195837484778037, |
|
"grad_norm": 26.62784975319365, |
|
"learning_rate": 7.190265486725664e-06, |
|
"loss": 2.9158, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.07306542676851545, |
|
"grad_norm": 25.598475891481986, |
|
"learning_rate": 7.300884955752213e-06, |
|
"loss": 3.0746, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.07417247868925053, |
|
"grad_norm": 19.384106471975148, |
|
"learning_rate": 7.411504424778761e-06, |
|
"loss": 2.9683, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.07527953060998561, |
|
"grad_norm": 22.58174336593009, |
|
"learning_rate": 7.5221238938053095e-06, |
|
"loss": 2.9548, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.07638658253072068, |
|
"grad_norm": 23.253222270880613, |
|
"learning_rate": 7.632743362831859e-06, |
|
"loss": 2.9424, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.07749363445145577, |
|
"grad_norm": 24.53761264564241, |
|
"learning_rate": 7.743362831858407e-06, |
|
"loss": 2.9999, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.07860068637219085, |
|
"grad_norm": 22.215828742213887, |
|
"learning_rate": 7.853982300884957e-06, |
|
"loss": 2.9638, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.07970773829292593, |
|
"grad_norm": 23.926002163576186, |
|
"learning_rate": 7.964601769911505e-06, |
|
"loss": 2.9937, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.08081479021366102, |
|
"grad_norm": 24.414616790878906, |
|
"learning_rate": 8.075221238938053e-06, |
|
"loss": 2.9732, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.0819218421343961, |
|
"grad_norm": 23.388402347902353, |
|
"learning_rate": 8.185840707964603e-06, |
|
"loss": 2.9107, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.08302889405513118, |
|
"grad_norm": 24.124270687360198, |
|
"learning_rate": 8.296460176991151e-06, |
|
"loss": 2.9869, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.08413594597586627, |
|
"grad_norm": 21.86924086571945, |
|
"learning_rate": 8.4070796460177e-06, |
|
"loss": 3.0616, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.08524299789660135, |
|
"grad_norm": 29.125772286493696, |
|
"learning_rate": 8.517699115044249e-06, |
|
"loss": 2.9174, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.08635004981733643, |
|
"grad_norm": 25.471433455609642, |
|
"learning_rate": 8.628318584070797e-06, |
|
"loss": 3.0338, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.08745710173807152, |
|
"grad_norm": 24.06665529849035, |
|
"learning_rate": 8.738938053097345e-06, |
|
"loss": 3.0321, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.0885641536588066, |
|
"grad_norm": 18.292126722435007, |
|
"learning_rate": 8.849557522123895e-06, |
|
"loss": 2.9429, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.08967120557954168, |
|
"grad_norm": 22.110943430558972, |
|
"learning_rate": 8.960176991150443e-06, |
|
"loss": 2.8389, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.09077825750027677, |
|
"grad_norm": 24.83606400487908, |
|
"learning_rate": 9.070796460176992e-06, |
|
"loss": 2.9753, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.09188530942101185, |
|
"grad_norm": 25.95232011272635, |
|
"learning_rate": 9.181415929203542e-06, |
|
"loss": 3.043, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.09299236134174693, |
|
"grad_norm": 20.659162690961626, |
|
"learning_rate": 9.29203539823009e-06, |
|
"loss": 2.9343, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.09409941326248202, |
|
"grad_norm": 25.45459790239467, |
|
"learning_rate": 9.402654867256638e-06, |
|
"loss": 3.0285, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.0952064651832171, |
|
"grad_norm": 24.920778384975627, |
|
"learning_rate": 9.513274336283188e-06, |
|
"loss": 3.008, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.09631351710395218, |
|
"grad_norm": 23.893946109752218, |
|
"learning_rate": 9.623893805309736e-06, |
|
"loss": 3.0592, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.09742056902468726, |
|
"grad_norm": 23.476211841831407, |
|
"learning_rate": 9.734513274336284e-06, |
|
"loss": 3.0277, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.09852762094542233, |
|
"grad_norm": 32.349500707592966, |
|
"learning_rate": 9.845132743362832e-06, |
|
"loss": 3.0481, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.09963467286615742, |
|
"grad_norm": 23.23199056988143, |
|
"learning_rate": 9.95575221238938e-06, |
|
"loss": 2.9412, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.1007417247868925, |
|
"grad_norm": 23.773772040090407, |
|
"learning_rate": 9.999986557878607e-06, |
|
"loss": 2.911, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.10184877670762758, |
|
"grad_norm": 21.93860193010473, |
|
"learning_rate": 9.999904411842942e-06, |
|
"loss": 3.0976, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.10295582862836267, |
|
"grad_norm": 26.162102122715467, |
|
"learning_rate": 9.999747588842252e-06, |
|
"loss": 2.8653, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.10406288054909775, |
|
"grad_norm": 24.79079368386082, |
|
"learning_rate": 9.999516091218793e-06, |
|
"loss": 3.0475, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.10516993246983283, |
|
"grad_norm": 22.659978907939497, |
|
"learning_rate": 9.999209922430137e-06, |
|
"loss": 2.9725, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.10627698439056792, |
|
"grad_norm": 24.881751031281876, |
|
"learning_rate": 9.99882908704913e-06, |
|
"loss": 2.9832, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.107384036311303, |
|
"grad_norm": 19.289141804905892, |
|
"learning_rate": 9.998373590763798e-06, |
|
"loss": 2.9333, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.10849108823203808, |
|
"grad_norm": 25.233514814542282, |
|
"learning_rate": 9.997843440377293e-06, |
|
"loss": 3.1247, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.10959814015277317, |
|
"grad_norm": 22.529791978010802, |
|
"learning_rate": 9.997238643807768e-06, |
|
"loss": 3.0009, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.11070519207350825, |
|
"grad_norm": 26.994079486651124, |
|
"learning_rate": 9.996559210088272e-06, |
|
"loss": 3.0359, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.11181224399424333, |
|
"grad_norm": 23.04614956134999, |
|
"learning_rate": 9.995805149366607e-06, |
|
"loss": 2.9097, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.11291929591497842, |
|
"grad_norm": 26.498780600053372, |
|
"learning_rate": 9.994976472905184e-06, |
|
"loss": 3.045, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.1140263478357135, |
|
"grad_norm": 20.370834631825762, |
|
"learning_rate": 9.994073193080844e-06, |
|
"loss": 2.9198, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.11513339975644858, |
|
"grad_norm": 18.931594900754984, |
|
"learning_rate": 9.993095323384688e-06, |
|
"loss": 2.9937, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.11624045167718366, |
|
"grad_norm": 25.23074417182063, |
|
"learning_rate": 9.992042878421862e-06, |
|
"loss": 2.9846, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.11734750359791875, |
|
"grad_norm": 22.748633865558133, |
|
"learning_rate": 9.990915873911346e-06, |
|
"loss": 3.0222, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.11845455551865383, |
|
"grad_norm": 20.500684992552053, |
|
"learning_rate": 9.989714326685715e-06, |
|
"loss": 3.0954, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.1195616074393889, |
|
"grad_norm": 18.438452324633406, |
|
"learning_rate": 9.988438254690896e-06, |
|
"loss": 2.9079, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.12066865936012398, |
|
"grad_norm": 20.9380161875694, |
|
"learning_rate": 9.987087676985886e-06, |
|
"loss": 3.042, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.12177571128085907, |
|
"grad_norm": 21.96145242279915, |
|
"learning_rate": 9.985662613742483e-06, |
|
"loss": 3.0928, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.12288276320159415, |
|
"grad_norm": 22.04573397808545, |
|
"learning_rate": 9.984163086244971e-06, |
|
"loss": 3.1986, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.12398981512232923, |
|
"grad_norm": 22.85309097193917, |
|
"learning_rate": 9.982589116889811e-06, |
|
"loss": 3.0349, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.12509686704306433, |
|
"grad_norm": 22.5326581537924, |
|
"learning_rate": 9.980940729185305e-06, |
|
"loss": 3.0092, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.1262039189637994, |
|
"grad_norm": 23.18944415182163, |
|
"learning_rate": 9.97921794775124e-06, |
|
"loss": 2.952, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.1273109708845345, |
|
"grad_norm": 21.06330535416755, |
|
"learning_rate": 9.977420798318527e-06, |
|
"loss": 2.9854, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.12841802280526957, |
|
"grad_norm": 20.966301469920268, |
|
"learning_rate": 9.975549307728812e-06, |
|
"loss": 2.9179, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.12952507472600466, |
|
"grad_norm": 20.657024404262373, |
|
"learning_rate": 9.973603503934077e-06, |
|
"loss": 2.9828, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.13063212664673973, |
|
"grad_norm": 23.18808360085381, |
|
"learning_rate": 9.97158341599622e-06, |
|
"loss": 2.8795, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.1317391785674748, |
|
"grad_norm": 19.586003898036246, |
|
"learning_rate": 9.969489074086626e-06, |
|
"loss": 2.9715, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.1328462304882099, |
|
"grad_norm": 23.666001778535268, |
|
"learning_rate": 9.967320509485715e-06, |
|
"loss": 3.0556, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.13395328240894497, |
|
"grad_norm": 20.020096724757796, |
|
"learning_rate": 9.965077754582468e-06, |
|
"loss": 2.925, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.13506033432968007, |
|
"grad_norm": 24.015238653225634, |
|
"learning_rate": 9.962760842873952e-06, |
|
"loss": 2.9019, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.13616738625041513, |
|
"grad_norm": 30.05960379166683, |
|
"learning_rate": 9.960369808964816e-06, |
|
"loss": 2.984, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.13727443817115023, |
|
"grad_norm": 19.296451414183455, |
|
"learning_rate": 9.957904688566774e-06, |
|
"loss": 2.9919, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.1383814900918853, |
|
"grad_norm": 20.11171003157378, |
|
"learning_rate": 9.95536551849807e-06, |
|
"loss": 2.939, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.1394885420126204, |
|
"grad_norm": 24.97381642097054, |
|
"learning_rate": 9.952752336682933e-06, |
|
"loss": 3.0819, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.14059559393335547, |
|
"grad_norm": 19.11189833758423, |
|
"learning_rate": 9.950065182151007e-06, |
|
"loss": 2.9558, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.14170264585409056, |
|
"grad_norm": 22.339401966128147, |
|
"learning_rate": 9.947304095036768e-06, |
|
"loss": 2.971, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.14280969777482563, |
|
"grad_norm": 20.896158530865005, |
|
"learning_rate": 9.944469116578925e-06, |
|
"loss": 2.9734, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.14391674969556073, |
|
"grad_norm": 21.80035128152707, |
|
"learning_rate": 9.941560289119808e-06, |
|
"loss": 3.0756, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.1450238016162958, |
|
"grad_norm": 22.803461112332005, |
|
"learning_rate": 9.938577656104725e-06, |
|
"loss": 2.8886, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.1461308535370309, |
|
"grad_norm": 19.045841307524757, |
|
"learning_rate": 9.935521262081324e-06, |
|
"loss": 2.9949, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.14723790545776597, |
|
"grad_norm": 21.269082405436986, |
|
"learning_rate": 9.932391152698926e-06, |
|
"loss": 3.1047, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.14834495737850106, |
|
"grad_norm": 24.520690144049905, |
|
"learning_rate": 9.929187374707836e-06, |
|
"loss": 2.9404, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.14945200929923613, |
|
"grad_norm": 22.56252212345693, |
|
"learning_rate": 9.925909975958655e-06, |
|
"loss": 2.9609, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.15055906121997123, |
|
"grad_norm": 18.509241308235815, |
|
"learning_rate": 9.922559005401555e-06, |
|
"loss": 2.9581, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.1516661131407063, |
|
"grad_norm": 21.078754308555286, |
|
"learning_rate": 9.919134513085557e-06, |
|
"loss": 3.0338, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.15277316506144137, |
|
"grad_norm": 19.364617917203557, |
|
"learning_rate": 9.915636550157776e-06, |
|
"loss": 3.0394, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.15388021698217647, |
|
"grad_norm": 12.87341952454837, |
|
"learning_rate": 9.912065168862661e-06, |
|
"loss": 2.8927, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.15498726890291153, |
|
"grad_norm": 21.353822481564322, |
|
"learning_rate": 9.908420422541216e-06, |
|
"loss": 2.9264, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.15609432082364663, |
|
"grad_norm": 25.61409358483238, |
|
"learning_rate": 9.9047023656302e-06, |
|
"loss": 3.0722, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.1572013727443817, |
|
"grad_norm": 18.98168487984158, |
|
"learning_rate": 9.90091105366132e-06, |
|
"loss": 3.0422, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.1583084246651168, |
|
"grad_norm": 18.90201248838335, |
|
"learning_rate": 9.897046543260384e-06, |
|
"loss": 2.9686, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.15941547658585187, |
|
"grad_norm": 19.145516912456003, |
|
"learning_rate": 9.893108892146487e-06, |
|
"loss": 2.9299, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.16052252850658696, |
|
"grad_norm": 21.131608116342832, |
|
"learning_rate": 9.889098159131112e-06, |
|
"loss": 2.9767, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.16162958042732203, |
|
"grad_norm": 23.100589010259966, |
|
"learning_rate": 9.88501440411728e-06, |
|
"loss": 2.9711, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.16273663234805713, |
|
"grad_norm": 23.844195002755608, |
|
"learning_rate": 9.88085768809865e-06, |
|
"loss": 3.0006, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.1638436842687922, |
|
"grad_norm": 21.595484978633603, |
|
"learning_rate": 9.876628073158586e-06, |
|
"loss": 2.8897, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.1649507361895273, |
|
"grad_norm": 19.91645782320423, |
|
"learning_rate": 9.872325622469263e-06, |
|
"loss": 2.9626, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.16605778811026237, |
|
"grad_norm": 22.954047655684626, |
|
"learning_rate": 9.8679504002907e-06, |
|
"loss": 2.9654, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.16716484003099746, |
|
"grad_norm": 19.01781845067502, |
|
"learning_rate": 9.863502471969811e-06, |
|
"loss": 2.9689, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.16827189195173253, |
|
"grad_norm": 23.51295361703636, |
|
"learning_rate": 9.858981903939419e-06, |
|
"loss": 2.9714, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.16937894387246763, |
|
"grad_norm": 22.715802630980665, |
|
"learning_rate": 9.85438876371728e-06, |
|
"loss": 2.9433, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.1704859957932027, |
|
"grad_norm": 19.235667821528295, |
|
"learning_rate": 9.849723119905055e-06, |
|
"loss": 2.8702, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.1715930477139378, |
|
"grad_norm": 20.997083855056253, |
|
"learning_rate": 9.844985042187305e-06, |
|
"loss": 2.9613, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.17270009963467287, |
|
"grad_norm": 19.327650896289015, |
|
"learning_rate": 9.840174601330434e-06, |
|
"loss": 2.9561, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.17380715155540793, |
|
"grad_norm": 23.743647417758826, |
|
"learning_rate": 9.835291869181638e-06, |
|
"loss": 2.9465, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.17491420347614303, |
|
"grad_norm": 21.24076642916138, |
|
"learning_rate": 9.830336918667838e-06, |
|
"loss": 2.9089, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.1760212553968781, |
|
"grad_norm": 18.18531438353361, |
|
"learning_rate": 9.82530982379458e-06, |
|
"loss": 2.925, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.1771283073176132, |
|
"grad_norm": 18.941367135114337, |
|
"learning_rate": 9.820210659644938e-06, |
|
"loss": 2.8847, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.17823535923834827, |
|
"grad_norm": 21.6741338404853, |
|
"learning_rate": 9.815039502378387e-06, |
|
"loss": 2.8948, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.17934241115908336, |
|
"grad_norm": 20.193862408863023, |
|
"learning_rate": 9.80979642922967e-06, |
|
"loss": 3.0728, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.18044946307981843, |
|
"grad_norm": 18.820011578655564, |
|
"learning_rate": 9.804481518507645e-06, |
|
"loss": 2.9551, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.18155651500055353, |
|
"grad_norm": 21.501952619775196, |
|
"learning_rate": 9.799094849594107e-06, |
|
"loss": 2.9621, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.1826635669212886, |
|
"grad_norm": 25.610574065149102, |
|
"learning_rate": 9.793636502942611e-06, |
|
"loss": 2.8723, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.1837706188420237, |
|
"grad_norm": 20.593228441794714, |
|
"learning_rate": 9.78810656007727e-06, |
|
"loss": 2.8278, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.18487767076275877, |
|
"grad_norm": 19.172777347075332, |
|
"learning_rate": 9.782505103591533e-06, |
|
"loss": 2.9767, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.18598472268349386, |
|
"grad_norm": 21.02145151466687, |
|
"learning_rate": 9.776832217146952e-06, |
|
"loss": 2.8362, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.18709177460422893, |
|
"grad_norm": 20.621602691872784, |
|
"learning_rate": 9.771087985471936e-06, |
|
"loss": 3.0292, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.18819882652496403, |
|
"grad_norm": 17.865789195071134, |
|
"learning_rate": 9.765272494360483e-06, |
|
"loss": 2.8839, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.1893058784456991, |
|
"grad_norm": 18.637077859127157, |
|
"learning_rate": 9.759385830670897e-06, |
|
"loss": 2.8975, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.1904129303664342, |
|
"grad_norm": 20.242466511532335, |
|
"learning_rate": 9.753428082324496e-06, |
|
"loss": 2.8949, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.19151998228716927, |
|
"grad_norm": 19.93215544071779, |
|
"learning_rate": 9.747399338304295e-06, |
|
"loss": 3.0225, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.19262703420790436, |
|
"grad_norm": 24.762070259664206, |
|
"learning_rate": 9.741299688653676e-06, |
|
"loss": 2.9459, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.19373408612863943, |
|
"grad_norm": 19.63500693742026, |
|
"learning_rate": 9.735129224475044e-06, |
|
"loss": 2.8765, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.19484113804937453, |
|
"grad_norm": 21.82483127686805, |
|
"learning_rate": 9.72888803792847e-06, |
|
"loss": 2.8684, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.1959481899701096, |
|
"grad_norm": 19.049243439574713, |
|
"learning_rate": 9.72257622223031e-06, |
|
"loss": 2.9594, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.19705524189084467, |
|
"grad_norm": 21.414348061773953, |
|
"learning_rate": 9.716193871651814e-06, |
|
"loss": 2.9053, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.19816229381157976, |
|
"grad_norm": 17.876253403312774, |
|
"learning_rate": 9.709741081517717e-06, |
|
"loss": 2.8154, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.19926934573231483, |
|
"grad_norm": 20.116361008310705, |
|
"learning_rate": 9.703217948204821e-06, |
|
"loss": 2.9732, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.20037639765304993, |
|
"grad_norm": 18.744377270113645, |
|
"learning_rate": 9.696624569140547e-06, |
|
"loss": 2.8966, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.201483449573785, |
|
"grad_norm": 19.280130238929477, |
|
"learning_rate": 9.689961042801483e-06, |
|
"loss": 2.8611, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.2025905014945201, |
|
"grad_norm": 19.224045203920024, |
|
"learning_rate": 9.68322746871192e-06, |
|
"loss": 2.8985, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.20369755341525517, |
|
"grad_norm": 23.351196637887085, |
|
"learning_rate": 9.676423947442353e-06, |
|
"loss": 2.9592, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.20480460533599026, |
|
"grad_norm": 17.4707572244572, |
|
"learning_rate": 9.66955058060799e-06, |
|
"loss": 2.9347, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.20591165725672533, |
|
"grad_norm": 20.54442740488183, |
|
"learning_rate": 9.662607470867229e-06, |
|
"loss": 2.8642, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.20701870917746043, |
|
"grad_norm": 19.78074079042836, |
|
"learning_rate": 9.655594721920124e-06, |
|
"loss": 2.8779, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.2081257610981955, |
|
"grad_norm": 19.640164221789888, |
|
"learning_rate": 9.648512438506841e-06, |
|
"loss": 3.0375, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.2092328130189306, |
|
"grad_norm": 17.525300444795423, |
|
"learning_rate": 9.641360726406087e-06, |
|
"loss": 2.9689, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.21033986493966567, |
|
"grad_norm": 16.921755221767196, |
|
"learning_rate": 9.634139692433534e-06, |
|
"loss": 2.9311, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.21144691686040076, |
|
"grad_norm": 23.573635502822672, |
|
"learning_rate": 9.626849444440223e-06, |
|
"loss": 3.1791, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.21255396878113583, |
|
"grad_norm": 21.608288648771143, |
|
"learning_rate": 9.619490091310959e-06, |
|
"loss": 2.9152, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.21366102070187093, |
|
"grad_norm": 21.984519688812558, |
|
"learning_rate": 9.612061742962672e-06, |
|
"loss": 2.8558, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.214768072622606, |
|
"grad_norm": 20.401130440641623, |
|
"learning_rate": 9.604564510342785e-06, |
|
"loss": 2.8631, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.2158751245433411, |
|
"grad_norm": 20.05203124505054, |
|
"learning_rate": 9.596998505427556e-06, |
|
"loss": 2.987, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.21698217646407617, |
|
"grad_norm": 20.868561748558378, |
|
"learning_rate": 9.589363841220398e-06, |
|
"loss": 2.7379, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.21808922838481123, |
|
"grad_norm": 22.537403308642126, |
|
"learning_rate": 9.581660631750205e-06, |
|
"loss": 2.9491, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.21919628030554633, |
|
"grad_norm": 18.786633581936144, |
|
"learning_rate": 9.573888992069635e-06, |
|
"loss": 3.0325, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.2203033322262814, |
|
"grad_norm": 20.183050798106528, |
|
"learning_rate": 9.566049038253404e-06, |
|
"loss": 2.8613, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.2214103841470165, |
|
"grad_norm": 19.889860560476563, |
|
"learning_rate": 9.558140887396539e-06, |
|
"loss": 3.0076, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.2214103841470165, |
|
"eval_loss": 2.899467945098877, |
|
"eval_runtime": 2402.2319, |
|
"eval_samples_per_second": 4.178, |
|
"eval_steps_per_second": 0.418, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.22251743606775157, |
|
"grad_norm": 20.918414604698, |
|
"learning_rate": 9.55016465761264e-06, |
|
"loss": 2.8974, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.22362448798848666, |
|
"grad_norm": 18.12895807311221, |
|
"learning_rate": 9.542120468032108e-06, |
|
"loss": 2.8925, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.22473153990922173, |
|
"grad_norm": 20.68008214687689, |
|
"learning_rate": 9.534008438800378e-06, |
|
"loss": 2.8954, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 0.22583859182995683, |
|
"grad_norm": 19.62578662683229, |
|
"learning_rate": 9.525828691076107e-06, |
|
"loss": 2.9672, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.2269456437506919, |
|
"grad_norm": 18.137624721398762, |
|
"learning_rate": 9.517581347029378e-06, |
|
"loss": 2.7592, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.228052695671427, |
|
"grad_norm": 18.753830138125636, |
|
"learning_rate": 9.509266529839872e-06, |
|
"loss": 2.7837, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.22915974759216207, |
|
"grad_norm": 17.672344029095868, |
|
"learning_rate": 9.500884363695025e-06, |
|
"loss": 2.8959, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.23026679951289716, |
|
"grad_norm": 17.952725451957562, |
|
"learning_rate": 9.492434973788176e-06, |
|
"loss": 2.9146, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.23137385143363223, |
|
"grad_norm": 21.49636205616348, |
|
"learning_rate": 9.483918486316694e-06, |
|
"loss": 2.9972, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 0.23248090335436733, |
|
"grad_norm": 17.872259773823583, |
|
"learning_rate": 9.475335028480104e-06, |
|
"loss": 2.9048, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.2335879552751024, |
|
"grad_norm": 18.304493955091758, |
|
"learning_rate": 9.466684728478167e-06, |
|
"loss": 2.8832, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 0.2346950071958375, |
|
"grad_norm": 20.521104550808733, |
|
"learning_rate": 9.457967715508986e-06, |
|
"loss": 2.9132, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.23580205911657257, |
|
"grad_norm": 21.959898523340325, |
|
"learning_rate": 9.449184119767066e-06, |
|
"loss": 2.8827, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 0.23690911103730766, |
|
"grad_norm": 17.838849413370237, |
|
"learning_rate": 9.440334072441364e-06, |
|
"loss": 2.9918, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 0.23801616295804273, |
|
"grad_norm": 19.92878332099444, |
|
"learning_rate": 9.431417705713348e-06, |
|
"loss": 2.9768, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.2391232148787778, |
|
"grad_norm": 22.052024784352827, |
|
"learning_rate": 9.422435152755003e-06, |
|
"loss": 2.7936, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.2402302667995129, |
|
"grad_norm": 18.832979591486268, |
|
"learning_rate": 9.41338654772685e-06, |
|
"loss": 2.8846, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 0.24133731872024797, |
|
"grad_norm": 20.56672086138257, |
|
"learning_rate": 9.40427202577595e-06, |
|
"loss": 2.9381, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 0.24244437064098306, |
|
"grad_norm": 19.022342343144167, |
|
"learning_rate": 9.39509172303387e-06, |
|
"loss": 2.7231, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 0.24355142256171813, |
|
"grad_norm": 18.365037787301343, |
|
"learning_rate": 9.385845776614659e-06, |
|
"loss": 2.8299, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.24465847448245323, |
|
"grad_norm": 16.086771712151563, |
|
"learning_rate": 9.3765343246128e-06, |
|
"loss": 2.8833, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 0.2457655264031883, |
|
"grad_norm": 17.286906565742285, |
|
"learning_rate": 9.367157506101152e-06, |
|
"loss": 2.8471, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 0.2468725783239234, |
|
"grad_norm": 16.860767812467355, |
|
"learning_rate": 9.35771546112886e-06, |
|
"loss": 2.7524, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 0.24797963024465847, |
|
"grad_norm": 22.69662113190212, |
|
"learning_rate": 9.348208330719269e-06, |
|
"loss": 2.9083, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 0.24908668216539356, |
|
"grad_norm": 17.900886651161414, |
|
"learning_rate": 9.338636256867826e-06, |
|
"loss": 2.8428, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.25019373408612866, |
|
"grad_norm": 16.680552099827924, |
|
"learning_rate": 9.328999382539948e-06, |
|
"loss": 2.8914, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 0.25130078600686373, |
|
"grad_norm": 18.313897702064246, |
|
"learning_rate": 9.319297851668893e-06, |
|
"loss": 2.9034, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 0.2524078379275988, |
|
"grad_norm": 16.947671537858998, |
|
"learning_rate": 9.309531809153606e-06, |
|
"loss": 2.8502, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 0.25351488984833387, |
|
"grad_norm": 18.710427396365873, |
|
"learning_rate": 9.29970140085656e-06, |
|
"loss": 2.8524, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 0.254621941769069, |
|
"grad_norm": 19.2567190717822, |
|
"learning_rate": 9.28980677360157e-06, |
|
"loss": 2.9991, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.25572899368980406, |
|
"grad_norm": 18.050406635894987, |
|
"learning_rate": 9.279848075171613e-06, |
|
"loss": 2.8717, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 0.25683604561053913, |
|
"grad_norm": 22.127493631791086, |
|
"learning_rate": 9.269825454306605e-06, |
|
"loss": 2.8977, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 0.2579430975312742, |
|
"grad_norm": 18.821085236072186, |
|
"learning_rate": 9.259739060701189e-06, |
|
"loss": 2.9116, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 0.2590501494520093, |
|
"grad_norm": 19.277291605575755, |
|
"learning_rate": 9.249589045002497e-06, |
|
"loss": 2.9024, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 0.2601572013727444, |
|
"grad_norm": 18.176543407022002, |
|
"learning_rate": 9.239375558807901e-06, |
|
"loss": 2.9065, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.26126425329347946, |
|
"grad_norm": 17.55658292047273, |
|
"learning_rate": 9.229098754662748e-06, |
|
"loss": 2.7598, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 0.26237130521421453, |
|
"grad_norm": 19.0666485097006, |
|
"learning_rate": 9.218758786058084e-06, |
|
"loss": 2.8376, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 0.2634783571349496, |
|
"grad_norm": 19.066879018665727, |
|
"learning_rate": 9.208355807428351e-06, |
|
"loss": 2.8766, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 0.26458540905568473, |
|
"grad_norm": 22.160566724834183, |
|
"learning_rate": 9.197889974149096e-06, |
|
"loss": 2.9115, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 0.2656924609764198, |
|
"grad_norm": 18.716069674957527, |
|
"learning_rate": 9.187361442534641e-06, |
|
"loss": 2.913, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.26679951289715487, |
|
"grad_norm": 21.86386868859532, |
|
"learning_rate": 9.176770369835748e-06, |
|
"loss": 3.0737, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 0.26790656481788994, |
|
"grad_norm": 19.87740412211485, |
|
"learning_rate": 9.166116914237277e-06, |
|
"loss": 2.827, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 0.26901361673862506, |
|
"grad_norm": 20.48966032173197, |
|
"learning_rate": 9.155401234855814e-06, |
|
"loss": 2.8279, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 0.27012066865936013, |
|
"grad_norm": 18.939462945596684, |
|
"learning_rate": 9.144623491737303e-06, |
|
"loss": 2.8827, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 0.2712277205800952, |
|
"grad_norm": 16.511411706489035, |
|
"learning_rate": 9.133783845854649e-06, |
|
"loss": 2.8858, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.27233477250083027, |
|
"grad_norm": 17.12242102699232, |
|
"learning_rate": 9.12288245910532e-06, |
|
"loss": 3.0051, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 0.2734418244215654, |
|
"grad_norm": 21.739631249295055, |
|
"learning_rate": 9.111919494308921e-06, |
|
"loss": 2.8119, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 0.27454887634230046, |
|
"grad_norm": 19.136040590653046, |
|
"learning_rate": 9.100895115204776e-06, |
|
"loss": 2.9821, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 0.27565592826303553, |
|
"grad_norm": 18.511511436982243, |
|
"learning_rate": 9.08980948644946e-06, |
|
"loss": 2.8592, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 0.2767629801837706, |
|
"grad_norm": 20.212663617382482, |
|
"learning_rate": 9.078662773614367e-06, |
|
"loss": 2.9192, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.2778700321045057, |
|
"grad_norm": 20.727838354887886, |
|
"learning_rate": 9.067455143183213e-06, |
|
"loss": 2.8882, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 0.2789770840252408, |
|
"grad_norm": 20.387190015826864, |
|
"learning_rate": 9.056186762549564e-06, |
|
"loss": 2.8964, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 0.28008413594597587, |
|
"grad_norm": 21.001687858734584, |
|
"learning_rate": 9.04485780001433e-06, |
|
"loss": 3.0001, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 0.28119118786671093, |
|
"grad_norm": 15.842781499171902, |
|
"learning_rate": 9.033468424783255e-06, |
|
"loss": 2.8406, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 0.282298239787446, |
|
"grad_norm": 21.453283495940212, |
|
"learning_rate": 9.022018806964388e-06, |
|
"loss": 2.7475, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.28340529170818113, |
|
"grad_norm": 16.60678323210403, |
|
"learning_rate": 9.010509117565538e-06, |
|
"loss": 2.789, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 0.2845123436289162, |
|
"grad_norm": 21.22156270449788, |
|
"learning_rate": 8.998939528491724e-06, |
|
"loss": 2.8132, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 0.28561939554965127, |
|
"grad_norm": 20.029298510004143, |
|
"learning_rate": 8.987310212542613e-06, |
|
"loss": 2.8848, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 0.28672644747038634, |
|
"grad_norm": 17.416215394194477, |
|
"learning_rate": 8.975621343409927e-06, |
|
"loss": 2.8099, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 0.28783349939112146, |
|
"grad_norm": 17.8983008619953, |
|
"learning_rate": 8.963873095674858e-06, |
|
"loss": 2.8862, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.28894055131185653, |
|
"grad_norm": 17.34578619148897, |
|
"learning_rate": 8.95206564480546e-06, |
|
"loss": 2.7672, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 0.2900476032325916, |
|
"grad_norm": 20.307382487515195, |
|
"learning_rate": 8.94019916715402e-06, |
|
"loss": 2.9254, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 0.29115465515332667, |
|
"grad_norm": 15.542065735556422, |
|
"learning_rate": 8.928273839954437e-06, |
|
"loss": 2.7188, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 0.2922617070740618, |
|
"grad_norm": 15.573521475112441, |
|
"learning_rate": 8.916289841319564e-06, |
|
"loss": 2.8667, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 0.29336875899479686, |
|
"grad_norm": 19.410117591693684, |
|
"learning_rate": 8.904247350238551e-06, |
|
"loss": 2.8341, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.29447581091553193, |
|
"grad_norm": 19.84765614341061, |
|
"learning_rate": 8.892146546574172e-06, |
|
"loss": 2.7139, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 0.295582862836267, |
|
"grad_norm": 17.983856647490622, |
|
"learning_rate": 8.879987611060143e-06, |
|
"loss": 2.6931, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 0.2966899147570021, |
|
"grad_norm": 16.62072011844082, |
|
"learning_rate": 8.867770725298417e-06, |
|
"loss": 2.8986, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 0.2977969666777372, |
|
"grad_norm": 22.537941135987385, |
|
"learning_rate": 8.855496071756472e-06, |
|
"loss": 2.9275, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 0.29890401859847227, |
|
"grad_norm": 19.624324641621538, |
|
"learning_rate": 8.843163833764585e-06, |
|
"loss": 2.8609, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.30001107051920733, |
|
"grad_norm": 14.826694832266885, |
|
"learning_rate": 8.8307741955131e-06, |
|
"loss": 2.832, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 0.30111812243994246, |
|
"grad_norm": 21.084123058139465, |
|
"learning_rate": 8.818327342049672e-06, |
|
"loss": 2.9927, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 0.30222517436067753, |
|
"grad_norm": 17.156557696514646, |
|
"learning_rate": 8.805823459276501e-06, |
|
"loss": 2.7874, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 0.3033322262814126, |
|
"grad_norm": 21.616600083840076, |
|
"learning_rate": 8.793262733947564e-06, |
|
"loss": 2.9143, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 0.30443927820214767, |
|
"grad_norm": 17.849582075052787, |
|
"learning_rate": 8.780645353665814e-06, |
|
"loss": 2.9265, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.30554633012288274, |
|
"grad_norm": 16.907525943766586, |
|
"learning_rate": 8.767971506880388e-06, |
|
"loss": 2.8079, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 0.30665338204361786, |
|
"grad_norm": 21.80594816789924, |
|
"learning_rate": 8.755241382883786e-06, |
|
"loss": 2.8586, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 0.30776043396435293, |
|
"grad_norm": 17.786988703153124, |
|
"learning_rate": 8.74245517180905e-06, |
|
"loss": 2.7957, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 0.308867485885088, |
|
"grad_norm": 18.535816164863746, |
|
"learning_rate": 8.729613064626916e-06, |
|
"loss": 2.9017, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 0.30997453780582307, |
|
"grad_norm": 16.811716242078795, |
|
"learning_rate": 8.71671525314297e-06, |
|
"loss": 2.8474, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.3110815897265582, |
|
"grad_norm": 18.305914523882734, |
|
"learning_rate": 8.703761929994779e-06, |
|
"loss": 2.9573, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 0.31218864164729326, |
|
"grad_norm": 18.579915296564323, |
|
"learning_rate": 8.690753288649013e-06, |
|
"loss": 2.8964, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 0.31329569356802833, |
|
"grad_norm": 18.539697958237422, |
|
"learning_rate": 8.677689523398556e-06, |
|
"loss": 2.7703, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 0.3144027454887634, |
|
"grad_norm": 17.915697068912802, |
|
"learning_rate": 8.664570829359608e-06, |
|
"loss": 2.8693, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 0.3155097974094985, |
|
"grad_norm": 18.898905292436613, |
|
"learning_rate": 8.651397402468765e-06, |
|
"loss": 2.8371, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.3166168493302336, |
|
"grad_norm": 22.702920044801495, |
|
"learning_rate": 8.638169439480097e-06, |
|
"loss": 2.8705, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 0.31772390125096867, |
|
"grad_norm": 14.669145969089513, |
|
"learning_rate": 8.624887137962206e-06, |
|
"loss": 2.7689, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 0.31883095317170373, |
|
"grad_norm": 20.31679832956785, |
|
"learning_rate": 8.61155069629528e-06, |
|
"loss": 2.8442, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 0.31993800509243886, |
|
"grad_norm": 17.50251569058274, |
|
"learning_rate": 8.59816031366812e-06, |
|
"loss": 2.8204, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 0.32104505701317393, |
|
"grad_norm": 14.301977043806207, |
|
"learning_rate": 8.584716190075182e-06, |
|
"loss": 2.7507, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.322152108933909, |
|
"grad_norm": 16.501447600831984, |
|
"learning_rate": 8.571218526313572e-06, |
|
"loss": 2.847, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 0.32325916085464407, |
|
"grad_norm": 15.819764582641644, |
|
"learning_rate": 8.557667523980054e-06, |
|
"loss": 2.7269, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 0.3243662127753792, |
|
"grad_norm": 19.79726490914286, |
|
"learning_rate": 8.544063385468047e-06, |
|
"loss": 2.8579, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 0.32547326469611426, |
|
"grad_norm": 13.946259262777874, |
|
"learning_rate": 8.530406313964588e-06, |
|
"loss": 2.8433, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 0.32658031661684933, |
|
"grad_norm": 18.300981068446877, |
|
"learning_rate": 8.516696513447308e-06, |
|
"loss": 2.8518, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 0.3276873685375844, |
|
"grad_norm": 18.862858354575344, |
|
"learning_rate": 8.502934188681382e-06, |
|
"loss": 2.7097, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 0.32879442045831947, |
|
"grad_norm": 17.293876429758797, |
|
"learning_rate": 8.489119545216465e-06, |
|
"loss": 2.8865, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 0.3299014723790546, |
|
"grad_norm": 16.410769414507325, |
|
"learning_rate": 8.475252789383634e-06, |
|
"loss": 2.7419, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 0.33100852429978966, |
|
"grad_norm": 16.157207346564473, |
|
"learning_rate": 8.461334128292296e-06, |
|
"loss": 2.8566, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 0.33211557622052473, |
|
"grad_norm": 17.97405966664622, |
|
"learning_rate": 8.447363769827097e-06, |
|
"loss": 2.8409, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.3332226281412598, |
|
"grad_norm": 18.040888448056503, |
|
"learning_rate": 8.43334192264482e-06, |
|
"loss": 2.7078, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 0.3343296800619949, |
|
"grad_norm": 17.401311897099646, |
|
"learning_rate": 8.41926879617127e-06, |
|
"loss": 2.8375, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 0.33543673198273, |
|
"grad_norm": 18.971972878515558, |
|
"learning_rate": 8.405144600598136e-06, |
|
"loss": 2.7534, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 0.33654378390346507, |
|
"grad_norm": 17.56044316128444, |
|
"learning_rate": 8.390969546879868e-06, |
|
"loss": 2.8017, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 0.33765083582420014, |
|
"grad_norm": 18.9191689174584, |
|
"learning_rate": 8.376743846730506e-06, |
|
"loss": 2.8735, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 0.33875788774493526, |
|
"grad_norm": 16.159522966531355, |
|
"learning_rate": 8.36246771262054e-06, |
|
"loss": 2.7277, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 0.33986493966567033, |
|
"grad_norm": 17.732911671191786, |
|
"learning_rate": 8.348141357773714e-06, |
|
"loss": 2.7975, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 0.3409719915864054, |
|
"grad_norm": 17.580686476759546, |
|
"learning_rate": 8.333764996163863e-06, |
|
"loss": 2.7285, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 0.34207904350714047, |
|
"grad_norm": 20.220871787654826, |
|
"learning_rate": 8.319338842511701e-06, |
|
"loss": 2.7638, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 0.3431860954278756, |
|
"grad_norm": 15.421883005921854, |
|
"learning_rate": 8.30486311228162e-06, |
|
"loss": 2.7664, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.34429314734861066, |
|
"grad_norm": 22.52292422020666, |
|
"learning_rate": 8.290338021678478e-06, |
|
"loss": 2.7415, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 0.34540019926934573, |
|
"grad_norm": 17.773426663788022, |
|
"learning_rate": 8.275763787644354e-06, |
|
"loss": 2.7612, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 0.3465072511900808, |
|
"grad_norm": 17.313609438292495, |
|
"learning_rate": 8.261140627855326e-06, |
|
"loss": 2.6789, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 0.34761430311081587, |
|
"grad_norm": 19.92121017478009, |
|
"learning_rate": 8.246468760718205e-06, |
|
"loss": 2.9528, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 0.348721355031551, |
|
"grad_norm": 20.3829374368461, |
|
"learning_rate": 8.231748405367284e-06, |
|
"loss": 2.7307, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 0.34982840695228606, |
|
"grad_norm": 17.20183231133198, |
|
"learning_rate": 8.216979781661059e-06, |
|
"loss": 2.7799, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 0.35093545887302113, |
|
"grad_norm": 17.179059431154894, |
|
"learning_rate": 8.202163110178945e-06, |
|
"loss": 2.7417, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 0.3520425107937562, |
|
"grad_norm": 17.829683364789567, |
|
"learning_rate": 8.187298612217984e-06, |
|
"loss": 2.7268, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 0.3531495627144913, |
|
"grad_norm": 20.35885213396436, |
|
"learning_rate": 8.172386509789539e-06, |
|
"loss": 2.8759, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 0.3542566146352264, |
|
"grad_norm": 18.210319395606284, |
|
"learning_rate": 8.157427025615979e-06, |
|
"loss": 2.7603, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.35536366655596147, |
|
"grad_norm": 20.180991639281267, |
|
"learning_rate": 8.14242038312735e-06, |
|
"loss": 2.6385, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 0.35647071847669654, |
|
"grad_norm": 13.997589668763045, |
|
"learning_rate": 8.127366806458043e-06, |
|
"loss": 2.6638, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 0.35757777039743166, |
|
"grad_norm": 16.552842345785916, |
|
"learning_rate": 8.112266520443437e-06, |
|
"loss": 2.8545, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 0.35868482231816673, |
|
"grad_norm": 22.63458529594302, |
|
"learning_rate": 8.097119750616552e-06, |
|
"loss": 2.9072, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 0.3597918742389018, |
|
"grad_norm": 20.351123072545064, |
|
"learning_rate": 8.08192672320467e-06, |
|
"loss": 2.8104, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 0.36089892615963687, |
|
"grad_norm": 18.012402171983243, |
|
"learning_rate": 8.066687665125965e-06, |
|
"loss": 2.8857, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 0.362005978080372, |
|
"grad_norm": 14.813109416518861, |
|
"learning_rate": 8.051402803986112e-06, |
|
"loss": 2.7149, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 0.36311303000110706, |
|
"grad_norm": 19.48150839228793, |
|
"learning_rate": 8.036072368074883e-06, |
|
"loss": 2.7073, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 0.36422008192184213, |
|
"grad_norm": 19.11749404734295, |
|
"learning_rate": 8.020696586362739e-06, |
|
"loss": 2.6653, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 0.3653271338425772, |
|
"grad_norm": 22.934472507487648, |
|
"learning_rate": 8.005275688497415e-06, |
|
"loss": 2.813, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.3664341857633123, |
|
"grad_norm": 14.997032892515483, |
|
"learning_rate": 7.989809904800483e-06, |
|
"loss": 2.7371, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 0.3675412376840474, |
|
"grad_norm": 15.5742880306809, |
|
"learning_rate": 7.974299466263919e-06, |
|
"loss": 2.8341, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 0.36864828960478246, |
|
"grad_norm": 20.142912914493085, |
|
"learning_rate": 7.958744604546641e-06, |
|
"loss": 2.8141, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 0.36975534152551753, |
|
"grad_norm": 18.86513832413105, |
|
"learning_rate": 7.94314555197107e-06, |
|
"loss": 2.7812, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 0.3708623934462526, |
|
"grad_norm": 22.49228437600144, |
|
"learning_rate": 7.927502541519637e-06, |
|
"loss": 2.825, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 0.3719694453669877, |
|
"grad_norm": 22.419596048754094, |
|
"learning_rate": 7.91181580683132e-06, |
|
"loss": 2.8135, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 0.3730764972877228, |
|
"grad_norm": 16.9758949814327, |
|
"learning_rate": 7.896085582198143e-06, |
|
"loss": 2.7589, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 0.37418354920845787, |
|
"grad_norm": 17.427893990910892, |
|
"learning_rate": 7.880312102561688e-06, |
|
"loss": 2.8191, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 0.37529060112919294, |
|
"grad_norm": 16.881634487817756, |
|
"learning_rate": 7.864495603509571e-06, |
|
"loss": 2.7757, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 0.37639765304992806, |
|
"grad_norm": 17.644413976791455, |
|
"learning_rate": 7.848636321271943e-06, |
|
"loss": 2.8439, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.37750470497066313, |
|
"grad_norm": 17.371658704562304, |
|
"learning_rate": 7.83273449271794e-06, |
|
"loss": 2.8163, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 0.3786117568913982, |
|
"grad_norm": 17.681733503092357, |
|
"learning_rate": 7.816790355352167e-06, |
|
"loss": 2.7568, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 0.37971880881213327, |
|
"grad_norm": 18.455389219089255, |
|
"learning_rate": 7.80080414731113e-06, |
|
"loss": 2.6985, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 0.3808258607328684, |
|
"grad_norm": 16.157025548622848, |
|
"learning_rate": 7.784776107359696e-06, |
|
"loss": 2.7969, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 0.38193291265360346, |
|
"grad_norm": 14.768944382636816, |
|
"learning_rate": 7.768706474887516e-06, |
|
"loss": 2.7339, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 0.38303996457433853, |
|
"grad_norm": 18.48084069219429, |
|
"learning_rate": 7.752595489905456e-06, |
|
"loss": 2.7754, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 0.3841470164950736, |
|
"grad_norm": 19.156514520004468, |
|
"learning_rate": 7.736443393042007e-06, |
|
"loss": 2.847, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 0.3852540684158087, |
|
"grad_norm": 16.446763048779168, |
|
"learning_rate": 7.720250425539698e-06, |
|
"loss": 2.6395, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 0.3863611203365438, |
|
"grad_norm": 14.192958419140753, |
|
"learning_rate": 7.704016829251484e-06, |
|
"loss": 2.7273, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 0.38746817225727886, |
|
"grad_norm": 14.358834052259523, |
|
"learning_rate": 7.687742846637141e-06, |
|
"loss": 2.705, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.38857522417801393, |
|
"grad_norm": 17.950732691617667, |
|
"learning_rate": 7.671428720759641e-06, |
|
"loss": 2.7615, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 0.38968227609874906, |
|
"grad_norm": 18.082782880469356, |
|
"learning_rate": 7.655074695281526e-06, |
|
"loss": 2.7389, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 0.39078932801948413, |
|
"grad_norm": 17.001645765491634, |
|
"learning_rate": 7.638681014461263e-06, |
|
"loss": 2.7623, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 0.3918963799402192, |
|
"grad_norm": 16.148791106439415, |
|
"learning_rate": 7.622247923149597e-06, |
|
"loss": 2.771, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 0.39300343186095427, |
|
"grad_norm": 16.319755028507952, |
|
"learning_rate": 7.6057756667859e-06, |
|
"loss": 2.745, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 0.39411048378168934, |
|
"grad_norm": 18.249081210470003, |
|
"learning_rate": 7.589264491394497e-06, |
|
"loss": 2.7631, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 0.39521753570242446, |
|
"grad_norm": 17.114757273903603, |
|
"learning_rate": 7.572714643580993e-06, |
|
"loss": 2.5916, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 0.39632458762315953, |
|
"grad_norm": 15.74515478345217, |
|
"learning_rate": 7.556126370528598e-06, |
|
"loss": 2.7441, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 0.3974316395438946, |
|
"grad_norm": 17.521251320931118, |
|
"learning_rate": 7.539499919994425e-06, |
|
"loss": 2.7365, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 0.39853869146462967, |
|
"grad_norm": 19.23187701802523, |
|
"learning_rate": 7.522835540305795e-06, |
|
"loss": 2.7919, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.3996457433853648, |
|
"grad_norm": 14.994960528554826, |
|
"learning_rate": 7.506133480356523e-06, |
|
"loss": 2.8063, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 0.40075279530609986, |
|
"grad_norm": 19.43636713958746, |
|
"learning_rate": 7.489393989603213e-06, |
|
"loss": 2.8291, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 0.40185984722683493, |
|
"grad_norm": 19.96902221880387, |
|
"learning_rate": 7.472617318061515e-06, |
|
"loss": 2.6574, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 0.40296689914757, |
|
"grad_norm": 15.764432388205172, |
|
"learning_rate": 7.4558037163023986e-06, |
|
"loss": 2.8279, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 0.4040739510683051, |
|
"grad_norm": 17.00988346435618, |
|
"learning_rate": 7.438953435448422e-06, |
|
"loss": 2.8606, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 0.4051810029890402, |
|
"grad_norm": 20.528609879722282, |
|
"learning_rate": 7.422066727169956e-06, |
|
"loss": 2.803, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 0.40628805490977526, |
|
"grad_norm": 24.117540486267707, |
|
"learning_rate": 7.405143843681453e-06, |
|
"loss": 2.8901, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 0.40739510683051033, |
|
"grad_norm": 15.932815366392553, |
|
"learning_rate": 7.388185037737656e-06, |
|
"loss": 2.6042, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 0.40850215875124546, |
|
"grad_norm": 16.494705800421944, |
|
"learning_rate": 7.371190562629842e-06, |
|
"loss": 2.7918, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 0.40960921067198053, |
|
"grad_norm": 21.567108547663295, |
|
"learning_rate": 7.354160672182027e-06, |
|
"loss": 2.7606, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.4107162625927156, |
|
"grad_norm": 21.48414979932869, |
|
"learning_rate": 7.337095620747181e-06, |
|
"loss": 2.6994, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 0.41182331451345067, |
|
"grad_norm": 13.807319945171502, |
|
"learning_rate": 7.319995663203425e-06, |
|
"loss": 2.7346, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 0.41293036643418574, |
|
"grad_norm": 18.456828860891658, |
|
"learning_rate": 7.302861054950231e-06, |
|
"loss": 2.6429, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 0.41403741835492086, |
|
"grad_norm": 18.493884527191277, |
|
"learning_rate": 7.285692051904596e-06, |
|
"loss": 2.7264, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 0.41514447027565593, |
|
"grad_norm": 15.443965108568486, |
|
"learning_rate": 7.2684889104972335e-06, |
|
"loss": 2.7915, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 0.416251522196391, |
|
"grad_norm": 15.970560252697705, |
|
"learning_rate": 7.2512518876687325e-06, |
|
"loss": 2.7585, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 0.41735857411712607, |
|
"grad_norm": 16.483755053972125, |
|
"learning_rate": 7.233981240865723e-06, |
|
"loss": 2.7225, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 0.4184656260378612, |
|
"grad_norm": 15.927243910629507, |
|
"learning_rate": 7.2166772280370355e-06, |
|
"loss": 2.7053, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 0.41957267795859626, |
|
"grad_norm": 16.30824749754582, |
|
"learning_rate": 7.199340107629843e-06, |
|
"loss": 2.7531, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 0.42067972987933133, |
|
"grad_norm": 17.94048283670358, |
|
"learning_rate": 7.1819701385858045e-06, |
|
"loss": 2.643, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.4217867818000664, |
|
"grad_norm": 18.8081266409834, |
|
"learning_rate": 7.164567580337191e-06, |
|
"loss": 2.759, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 0.4228938337208015, |
|
"grad_norm": 19.93408221633125, |
|
"learning_rate": 7.147132692803018e-06, |
|
"loss": 2.8159, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 0.4240008856415366, |
|
"grad_norm": 14.119638307817269, |
|
"learning_rate": 7.1296657363851644e-06, |
|
"loss": 2.5886, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 0.42510793756227166, |
|
"grad_norm": 14.700749001625018, |
|
"learning_rate": 7.112166971964472e-06, |
|
"loss": 2.7577, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 0.42621498948300673, |
|
"grad_norm": 16.876997824156497, |
|
"learning_rate": 7.094636660896865e-06, |
|
"loss": 2.7068, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 0.42732204140374186, |
|
"grad_norm": 17.677042560229854, |
|
"learning_rate": 7.0770750650094335e-06, |
|
"loss": 2.7139, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 0.42842909332447693, |
|
"grad_norm": 22.903911635500307, |
|
"learning_rate": 7.059482446596525e-06, |
|
"loss": 2.6586, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 0.429536145245212, |
|
"grad_norm": 17.15359853143299, |
|
"learning_rate": 7.041859068415836e-06, |
|
"loss": 2.7196, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 0.43064319716594707, |
|
"grad_norm": 18.265015720893867, |
|
"learning_rate": 7.024205193684479e-06, |
|
"loss": 2.795, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 0.4317502490866822, |
|
"grad_norm": 17.416460348542884, |
|
"learning_rate": 7.006521086075049e-06, |
|
"loss": 2.8018, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.43285730100741726, |
|
"grad_norm": 15.06159976676458, |
|
"learning_rate": 6.9888070097116926e-06, |
|
"loss": 2.6702, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 0.43396435292815233, |
|
"grad_norm": 14.916257340220586, |
|
"learning_rate": 6.971063229166162e-06, |
|
"loss": 2.667, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 0.4350714048488874, |
|
"grad_norm": 16.946369105743727, |
|
"learning_rate": 6.953290009453857e-06, |
|
"loss": 2.6547, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 0.43617845676962247, |
|
"grad_norm": 17.606162667161975, |
|
"learning_rate": 6.9354876160298764e-06, |
|
"loss": 2.7565, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 0.4372855086903576, |
|
"grad_norm": 15.792356606039535, |
|
"learning_rate": 6.917656314785044e-06, |
|
"loss": 2.7603, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 0.43839256061109266, |
|
"grad_norm": 17.519385710278783, |
|
"learning_rate": 6.899796372041943e-06, |
|
"loss": 2.5908, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 0.43949961253182773, |
|
"grad_norm": 18.175539572977502, |
|
"learning_rate": 6.881908054550939e-06, |
|
"loss": 2.7189, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 0.4406066644525628, |
|
"grad_norm": 16.78341459760071, |
|
"learning_rate": 6.863991629486191e-06, |
|
"loss": 2.7457, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 0.4417137163732979, |
|
"grad_norm": 16.307893535865905, |
|
"learning_rate": 6.846047364441661e-06, |
|
"loss": 2.7664, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 0.442820768294033, |
|
"grad_norm": 17.795046718057446, |
|
"learning_rate": 6.828075527427127e-06, |
|
"loss": 2.7682, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.442820768294033, |
|
"eval_loss": 2.715528726577759, |
|
"eval_runtime": 2400.8491, |
|
"eval_samples_per_second": 4.181, |
|
"eval_steps_per_second": 0.418, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.44392782021476807, |
|
"grad_norm": 17.177561938405823, |
|
"learning_rate": 6.810076386864168e-06, |
|
"loss": 2.7353, |
|
"step": 4010 |
|
}, |
|
{ |
|
"epoch": 0.44503487213550313, |
|
"grad_norm": 18.717792449825087, |
|
"learning_rate": 6.792050211582164e-06, |
|
"loss": 2.6284, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 0.44614192405623826, |
|
"grad_norm": 20.629160666920065, |
|
"learning_rate": 6.77399727081427e-06, |
|
"loss": 2.7808, |
|
"step": 4030 |
|
}, |
|
{ |
|
"epoch": 0.44724897597697333, |
|
"grad_norm": 16.300381610488234, |
|
"learning_rate": 6.755917834193408e-06, |
|
"loss": 2.6976, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 0.4483560278977084, |
|
"grad_norm": 18.995902150808703, |
|
"learning_rate": 6.737812171748234e-06, |
|
"loss": 2.7441, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 0.44946307981844347, |
|
"grad_norm": 18.261637709522596, |
|
"learning_rate": 6.719680553899097e-06, |
|
"loss": 2.6822, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 0.4505701317391786, |
|
"grad_norm": 20.659710982739558, |
|
"learning_rate": 6.701523251454017e-06, |
|
"loss": 2.6978, |
|
"step": 4070 |
|
}, |
|
{ |
|
"epoch": 0.45167718365991366, |
|
"grad_norm": 19.963369393203255, |
|
"learning_rate": 6.683340535604624e-06, |
|
"loss": 2.7391, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 0.45278423558064873, |
|
"grad_norm": 17.272615462239525, |
|
"learning_rate": 6.665132677922118e-06, |
|
"loss": 2.6982, |
|
"step": 4090 |
|
}, |
|
{ |
|
"epoch": 0.4538912875013838, |
|
"grad_norm": 17.102697486895753, |
|
"learning_rate": 6.646899950353208e-06, |
|
"loss": 2.7443, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.4549983394221189, |
|
"grad_norm": 16.731640547098063, |
|
"learning_rate": 6.628642625216053e-06, |
|
"loss": 2.7825, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 0.456105391342854, |
|
"grad_norm": 16.86948389308186, |
|
"learning_rate": 6.61036097519619e-06, |
|
"loss": 2.6986, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 0.45721244326358906, |
|
"grad_norm": 20.677217100728953, |
|
"learning_rate": 6.592055273342467e-06, |
|
"loss": 2.8304, |
|
"step": 4130 |
|
}, |
|
{ |
|
"epoch": 0.45831949518432413, |
|
"grad_norm": 16.821661815243136, |
|
"learning_rate": 6.573725793062965e-06, |
|
"loss": 2.6678, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 0.4594265471050592, |
|
"grad_norm": 18.45134731193715, |
|
"learning_rate": 6.555372808120907e-06, |
|
"loss": 2.823, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 0.4605335990257943, |
|
"grad_norm": 17.57852954660428, |
|
"learning_rate": 6.536996592630578e-06, |
|
"loss": 2.7795, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 0.4616406509465294, |
|
"grad_norm": 17.253221141789883, |
|
"learning_rate": 6.518597421053223e-06, |
|
"loss": 2.7, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 0.46274770286726447, |
|
"grad_norm": 16.206089784799936, |
|
"learning_rate": 6.5001755681929545e-06, |
|
"loss": 2.7196, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 0.46385475478799953, |
|
"grad_norm": 18.947069414032423, |
|
"learning_rate": 6.481731309192647e-06, |
|
"loss": 2.7542, |
|
"step": 4190 |
|
}, |
|
{ |
|
"epoch": 0.46496180670873466, |
|
"grad_norm": 16.548697201774296, |
|
"learning_rate": 6.463264919529823e-06, |
|
"loss": 2.7531, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.46606885862946973, |
|
"grad_norm": 17.605153791162124, |
|
"learning_rate": 6.444776675012542e-06, |
|
"loss": 2.7248, |
|
"step": 4210 |
|
}, |
|
{ |
|
"epoch": 0.4671759105502048, |
|
"grad_norm": 18.42367136884591, |
|
"learning_rate": 6.42626685177528e-06, |
|
"loss": 2.6742, |
|
"step": 4220 |
|
}, |
|
{ |
|
"epoch": 0.46828296247093987, |
|
"grad_norm": 21.057012768405876, |
|
"learning_rate": 6.407735726274809e-06, |
|
"loss": 2.7067, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 0.469390014391675, |
|
"grad_norm": 17.878193605338524, |
|
"learning_rate": 6.38918357528606e-06, |
|
"loss": 2.8213, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 0.47049706631241006, |
|
"grad_norm": 15.251101561882258, |
|
"learning_rate": 6.370610675897997e-06, |
|
"loss": 2.767, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 0.47160411823314513, |
|
"grad_norm": 16.35077680470725, |
|
"learning_rate": 6.352017305509475e-06, |
|
"loss": 2.5496, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 0.4727111701538802, |
|
"grad_norm": 20.78692237253247, |
|
"learning_rate": 6.3334037418250975e-06, |
|
"loss": 2.5517, |
|
"step": 4270 |
|
}, |
|
{ |
|
"epoch": 0.4738182220746153, |
|
"grad_norm": 16.49688836558597, |
|
"learning_rate": 6.314770262851069e-06, |
|
"loss": 2.7365, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 0.4749252739953504, |
|
"grad_norm": 17.75918198378233, |
|
"learning_rate": 6.296117146891039e-06, |
|
"loss": 2.651, |
|
"step": 4290 |
|
}, |
|
{ |
|
"epoch": 0.47603232591608546, |
|
"grad_norm": 15.289950571080979, |
|
"learning_rate": 6.277444672541953e-06, |
|
"loss": 2.7015, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.47713937783682053, |
|
"grad_norm": 15.010585688125417, |
|
"learning_rate": 6.258753118689887e-06, |
|
"loss": 2.6344, |
|
"step": 4310 |
|
}, |
|
{ |
|
"epoch": 0.4782464297575556, |
|
"grad_norm": 16.384237830668948, |
|
"learning_rate": 6.240042764505877e-06, |
|
"loss": 2.7013, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 0.4793534816782907, |
|
"grad_norm": 15.761472924874809, |
|
"learning_rate": 6.2213138894417615e-06, |
|
"loss": 2.7414, |
|
"step": 4330 |
|
}, |
|
{ |
|
"epoch": 0.4804605335990258, |
|
"grad_norm": 17.457264405530225, |
|
"learning_rate": 6.202566773225995e-06, |
|
"loss": 2.7923, |
|
"step": 4340 |
|
}, |
|
{ |
|
"epoch": 0.48156758551976087, |
|
"grad_norm": 20.03913075692092, |
|
"learning_rate": 6.1838016958594825e-06, |
|
"loss": 2.7145, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 0.48267463744049593, |
|
"grad_norm": 14.687794264132354, |
|
"learning_rate": 6.165018937611385e-06, |
|
"loss": 2.6172, |
|
"step": 4360 |
|
}, |
|
{ |
|
"epoch": 0.48378168936123106, |
|
"grad_norm": 15.026413038595793, |
|
"learning_rate": 6.146218779014942e-06, |
|
"loss": 2.6804, |
|
"step": 4370 |
|
}, |
|
{ |
|
"epoch": 0.48488874128196613, |
|
"grad_norm": 17.378458618834472, |
|
"learning_rate": 6.127401500863281e-06, |
|
"loss": 2.5838, |
|
"step": 4380 |
|
}, |
|
{ |
|
"epoch": 0.4859957932027012, |
|
"grad_norm": 16.495531002493667, |
|
"learning_rate": 6.108567384205214e-06, |
|
"loss": 2.5008, |
|
"step": 4390 |
|
}, |
|
{ |
|
"epoch": 0.48710284512343627, |
|
"grad_norm": 15.612526961187054, |
|
"learning_rate": 6.089716710341058e-06, |
|
"loss": 2.5134, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.4882098970441714, |
|
"grad_norm": 17.829542612600722, |
|
"learning_rate": 6.070849760818417e-06, |
|
"loss": 2.6932, |
|
"step": 4410 |
|
}, |
|
{ |
|
"epoch": 0.48931694896490646, |
|
"grad_norm": 18.397184297289453, |
|
"learning_rate": 6.051966817427983e-06, |
|
"loss": 2.664, |
|
"step": 4420 |
|
}, |
|
{ |
|
"epoch": 0.49042400088564153, |
|
"grad_norm": 15.139678235200124, |
|
"learning_rate": 6.03306816219933e-06, |
|
"loss": 2.6431, |
|
"step": 4430 |
|
}, |
|
{ |
|
"epoch": 0.4915310528063766, |
|
"grad_norm": 19.13733604850318, |
|
"learning_rate": 6.014154077396695e-06, |
|
"loss": 2.7429, |
|
"step": 4440 |
|
}, |
|
{ |
|
"epoch": 0.4926381047271117, |
|
"grad_norm": 19.88327633299528, |
|
"learning_rate": 5.995224845514771e-06, |
|
"loss": 2.6894, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 0.4937451566478468, |
|
"grad_norm": 16.78819908723115, |
|
"learning_rate": 5.97628074927448e-06, |
|
"loss": 2.712, |
|
"step": 4460 |
|
}, |
|
{ |
|
"epoch": 0.49485220856858186, |
|
"grad_norm": 15.34943286541028, |
|
"learning_rate": 5.957322071618753e-06, |
|
"loss": 2.652, |
|
"step": 4470 |
|
}, |
|
{ |
|
"epoch": 0.49595926048931693, |
|
"grad_norm": 14.718777663127804, |
|
"learning_rate": 5.9383490957083045e-06, |
|
"loss": 2.6708, |
|
"step": 4480 |
|
}, |
|
{ |
|
"epoch": 0.49706631241005206, |
|
"grad_norm": 14.06128807028094, |
|
"learning_rate": 5.919362104917403e-06, |
|
"loss": 2.6022, |
|
"step": 4490 |
|
}, |
|
{ |
|
"epoch": 0.4981733643307871, |
|
"grad_norm": 16.565786742803958, |
|
"learning_rate": 5.90036138282964e-06, |
|
"loss": 2.6252, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.4992804162515222, |
|
"grad_norm": 15.757898844662668, |
|
"learning_rate": 5.8813472132336955e-06, |
|
"loss": 2.6229, |
|
"step": 4510 |
|
}, |
|
{ |
|
"epoch": 0.5003874681722573, |
|
"grad_norm": 21.10749621990984, |
|
"learning_rate": 5.862319880119092e-06, |
|
"loss": 2.709, |
|
"step": 4520 |
|
}, |
|
{ |
|
"epoch": 0.5014945200929923, |
|
"grad_norm": 18.080937909773763, |
|
"learning_rate": 5.8432796676719585e-06, |
|
"loss": 2.5919, |
|
"step": 4530 |
|
}, |
|
{ |
|
"epoch": 0.5026015720137275, |
|
"grad_norm": 15.309930072347084, |
|
"learning_rate": 5.824226860270791e-06, |
|
"loss": 2.7639, |
|
"step": 4540 |
|
}, |
|
{ |
|
"epoch": 0.5037086239344625, |
|
"grad_norm": 17.326512802033673, |
|
"learning_rate": 5.805161742482194e-06, |
|
"loss": 2.6954, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 0.5048156758551976, |
|
"grad_norm": 20.016766712775652, |
|
"learning_rate": 5.786084599056637e-06, |
|
"loss": 2.6651, |
|
"step": 4560 |
|
}, |
|
{ |
|
"epoch": 0.5059227277759327, |
|
"grad_norm": 15.39976054839859, |
|
"learning_rate": 5.766995714924204e-06, |
|
"loss": 2.7208, |
|
"step": 4570 |
|
}, |
|
{ |
|
"epoch": 0.5070297796966677, |
|
"grad_norm": 15.56824968714477, |
|
"learning_rate": 5.747895375190331e-06, |
|
"loss": 2.6959, |
|
"step": 4580 |
|
}, |
|
{ |
|
"epoch": 0.5081368316174029, |
|
"grad_norm": 19.043556423880098, |
|
"learning_rate": 5.728783865131554e-06, |
|
"loss": 2.7182, |
|
"step": 4590 |
|
}, |
|
{ |
|
"epoch": 0.509243883538138, |
|
"grad_norm": 18.533491761930883, |
|
"learning_rate": 5.709661470191241e-06, |
|
"loss": 2.6474, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.510350935458873, |
|
"grad_norm": 17.576811873751446, |
|
"learning_rate": 5.6905284759753365e-06, |
|
"loss": 2.6864, |
|
"step": 4610 |
|
}, |
|
{ |
|
"epoch": 0.5114579873796081, |
|
"grad_norm": 18.79796869282816, |
|
"learning_rate": 5.6713851682480926e-06, |
|
"loss": 2.5302, |
|
"step": 4620 |
|
}, |
|
{ |
|
"epoch": 0.5125650393003431, |
|
"grad_norm": 17.510899102111733, |
|
"learning_rate": 5.6522318329278e-06, |
|
"loss": 2.6672, |
|
"step": 4630 |
|
}, |
|
{ |
|
"epoch": 0.5136720912210783, |
|
"grad_norm": 15.707692417808088, |
|
"learning_rate": 5.633068756082517e-06, |
|
"loss": 2.6229, |
|
"step": 4640 |
|
}, |
|
{ |
|
"epoch": 0.5147791431418134, |
|
"grad_norm": 14.427966106685423, |
|
"learning_rate": 5.613896223925799e-06, |
|
"loss": 2.6565, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 0.5158861950625484, |
|
"grad_norm": 17.13890386270487, |
|
"learning_rate": 5.594714522812422e-06, |
|
"loss": 2.738, |
|
"step": 4660 |
|
}, |
|
{ |
|
"epoch": 0.5169932469832835, |
|
"grad_norm": 15.344124561854793, |
|
"learning_rate": 5.575523939234111e-06, |
|
"loss": 2.7876, |
|
"step": 4670 |
|
}, |
|
{ |
|
"epoch": 0.5181002989040187, |
|
"grad_norm": 16.79964161196015, |
|
"learning_rate": 5.556324759815252e-06, |
|
"loss": 2.6692, |
|
"step": 4680 |
|
}, |
|
{ |
|
"epoch": 0.5192073508247537, |
|
"grad_norm": 19.56356390380519, |
|
"learning_rate": 5.537117271308615e-06, |
|
"loss": 2.7151, |
|
"step": 4690 |
|
}, |
|
{ |
|
"epoch": 0.5203144027454888, |
|
"grad_norm": 18.641775052939003, |
|
"learning_rate": 5.5179017605910754e-06, |
|
"loss": 2.8004, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.5214214546662238, |
|
"grad_norm": 15.272957986365086, |
|
"learning_rate": 5.4986785146593255e-06, |
|
"loss": 2.7083, |
|
"step": 4710 |
|
}, |
|
{ |
|
"epoch": 0.5225285065869589, |
|
"grad_norm": 15.949027616558995, |
|
"learning_rate": 5.479447820625585e-06, |
|
"loss": 2.6865, |
|
"step": 4720 |
|
}, |
|
{ |
|
"epoch": 0.523635558507694, |
|
"grad_norm": 15.67762021450724, |
|
"learning_rate": 5.46020996571332e-06, |
|
"loss": 2.7183, |
|
"step": 4730 |
|
}, |
|
{ |
|
"epoch": 0.5247426104284291, |
|
"grad_norm": 19.95294125446329, |
|
"learning_rate": 5.4409652372529444e-06, |
|
"loss": 2.7927, |
|
"step": 4740 |
|
}, |
|
{ |
|
"epoch": 0.5258496623491642, |
|
"grad_norm": 13.488762906306286, |
|
"learning_rate": 5.421713922677539e-06, |
|
"loss": 2.5992, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 0.5269567142698992, |
|
"grad_norm": 16.599798214798543, |
|
"learning_rate": 5.402456309518547e-06, |
|
"loss": 2.5732, |
|
"step": 4760 |
|
}, |
|
{ |
|
"epoch": 0.5280637661906343, |
|
"grad_norm": 14.764833460888406, |
|
"learning_rate": 5.383192685401492e-06, |
|
"loss": 2.5634, |
|
"step": 4770 |
|
}, |
|
{ |
|
"epoch": 0.5291708181113695, |
|
"grad_norm": 17.816571873254308, |
|
"learning_rate": 5.363923338041667e-06, |
|
"loss": 2.64, |
|
"step": 4780 |
|
}, |
|
{ |
|
"epoch": 0.5302778700321045, |
|
"grad_norm": 14.543241263642692, |
|
"learning_rate": 5.344648555239854e-06, |
|
"loss": 2.6637, |
|
"step": 4790 |
|
}, |
|
{ |
|
"epoch": 0.5313849219528396, |
|
"grad_norm": 16.519933702897138, |
|
"learning_rate": 5.325368624878009e-06, |
|
"loss": 2.747, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.5324919738735747, |
|
"grad_norm": 17.67293620152496, |
|
"learning_rate": 5.306083834914977e-06, |
|
"loss": 2.6096, |
|
"step": 4810 |
|
}, |
|
{ |
|
"epoch": 0.5335990257943097, |
|
"grad_norm": 17.919095046156233, |
|
"learning_rate": 5.286794473382178e-06, |
|
"loss": 2.6526, |
|
"step": 4820 |
|
}, |
|
{ |
|
"epoch": 0.5347060777150449, |
|
"grad_norm": 14.567289996672956, |
|
"learning_rate": 5.267500828379319e-06, |
|
"loss": 2.7698, |
|
"step": 4830 |
|
}, |
|
{ |
|
"epoch": 0.5358131296357799, |
|
"grad_norm": 17.34975497496579, |
|
"learning_rate": 5.248203188070078e-06, |
|
"loss": 2.6932, |
|
"step": 4840 |
|
}, |
|
{ |
|
"epoch": 0.536920181556515, |
|
"grad_norm": 14.383043710837034, |
|
"learning_rate": 5.228901840677808e-06, |
|
"loss": 2.533, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 0.5380272334772501, |
|
"grad_norm": 19.4814620431374, |
|
"learning_rate": 5.209597074481228e-06, |
|
"loss": 2.7526, |
|
"step": 4860 |
|
}, |
|
{ |
|
"epoch": 0.5391342853979851, |
|
"grad_norm": 17.294271003058864, |
|
"learning_rate": 5.19028917781012e-06, |
|
"loss": 2.7006, |
|
"step": 4870 |
|
}, |
|
{ |
|
"epoch": 0.5402413373187203, |
|
"grad_norm": 13.454761500494456, |
|
"learning_rate": 5.170978439041023e-06, |
|
"loss": 2.5453, |
|
"step": 4880 |
|
}, |
|
{ |
|
"epoch": 0.5413483892394554, |
|
"grad_norm": 17.855933800763392, |
|
"learning_rate": 5.151665146592924e-06, |
|
"loss": 2.6315, |
|
"step": 4890 |
|
}, |
|
{ |
|
"epoch": 0.5424554411601904, |
|
"grad_norm": 17.427924222975562, |
|
"learning_rate": 5.132349588922949e-06, |
|
"loss": 2.6539, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.5435624930809255, |
|
"grad_norm": 20.073145834110875, |
|
"learning_rate": 5.113032054522058e-06, |
|
"loss": 2.5488, |
|
"step": 4910 |
|
}, |
|
{ |
|
"epoch": 0.5446695450016605, |
|
"grad_norm": 12.357803208105327, |
|
"learning_rate": 5.093712831910736e-06, |
|
"loss": 2.5557, |
|
"step": 4920 |
|
}, |
|
{ |
|
"epoch": 0.5457765969223957, |
|
"grad_norm": 15.692479347879283, |
|
"learning_rate": 5.0743922096346836e-06, |
|
"loss": 2.7068, |
|
"step": 4930 |
|
}, |
|
{ |
|
"epoch": 0.5468836488431308, |
|
"grad_norm": 14.866689448660685, |
|
"learning_rate": 5.055070476260501e-06, |
|
"loss": 2.576, |
|
"step": 4940 |
|
}, |
|
{ |
|
"epoch": 0.5479907007638658, |
|
"grad_norm": 15.129308088501134, |
|
"learning_rate": 5.0357479203713885e-06, |
|
"loss": 2.3914, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 0.5490977526846009, |
|
"grad_norm": 14.162687417076338, |
|
"learning_rate": 5.0164248305628284e-06, |
|
"loss": 2.6796, |
|
"step": 4960 |
|
}, |
|
{ |
|
"epoch": 0.5502048046053359, |
|
"grad_norm": 19.323858139882816, |
|
"learning_rate": 4.997101495438277e-06, |
|
"loss": 2.4771, |
|
"step": 4970 |
|
}, |
|
{ |
|
"epoch": 0.5513118565260711, |
|
"grad_norm": 17.540498070177875, |
|
"learning_rate": 4.97777820360486e-06, |
|
"loss": 2.572, |
|
"step": 4980 |
|
}, |
|
{ |
|
"epoch": 0.5524189084468062, |
|
"grad_norm": 19.393507393902457, |
|
"learning_rate": 4.958455243669051e-06, |
|
"loss": 2.6577, |
|
"step": 4990 |
|
}, |
|
{ |
|
"epoch": 0.5535259603675412, |
|
"grad_norm": 17.365811060415265, |
|
"learning_rate": 4.939132904232366e-06, |
|
"loss": 2.6571, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.5546330122882763, |
|
"grad_norm": 14.882734972778014, |
|
"learning_rate": 4.91981147388706e-06, |
|
"loss": 2.5927, |
|
"step": 5010 |
|
}, |
|
{ |
|
"epoch": 0.5557400642090115, |
|
"grad_norm": 18.498227060413406, |
|
"learning_rate": 4.900491241211799e-06, |
|
"loss": 2.6215, |
|
"step": 5020 |
|
}, |
|
{ |
|
"epoch": 0.5568471161297465, |
|
"grad_norm": 16.424230672284246, |
|
"learning_rate": 4.881172494767372e-06, |
|
"loss": 2.738, |
|
"step": 5030 |
|
}, |
|
{ |
|
"epoch": 0.5579541680504816, |
|
"grad_norm": 14.449267161706716, |
|
"learning_rate": 4.861855523092366e-06, |
|
"loss": 2.6883, |
|
"step": 5040 |
|
}, |
|
{ |
|
"epoch": 0.5590612199712166, |
|
"grad_norm": 15.748250902231145, |
|
"learning_rate": 4.84254061469886e-06, |
|
"loss": 2.6369, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 0.5601682718919517, |
|
"grad_norm": 21.423066740561787, |
|
"learning_rate": 4.823228058068113e-06, |
|
"loss": 2.7159, |
|
"step": 5060 |
|
}, |
|
{ |
|
"epoch": 0.5612753238126869, |
|
"grad_norm": 14.22388926392383, |
|
"learning_rate": 4.803918141646268e-06, |
|
"loss": 2.5795, |
|
"step": 5070 |
|
}, |
|
{ |
|
"epoch": 0.5623823757334219, |
|
"grad_norm": 14.83696241654988, |
|
"learning_rate": 4.784611153840027e-06, |
|
"loss": 2.5612, |
|
"step": 5080 |
|
}, |
|
{ |
|
"epoch": 0.563489427654157, |
|
"grad_norm": 14.263900210157331, |
|
"learning_rate": 4.765307383012352e-06, |
|
"loss": 2.5602, |
|
"step": 5090 |
|
}, |
|
{ |
|
"epoch": 0.564596479574892, |
|
"grad_norm": 17.257310107919768, |
|
"learning_rate": 4.746007117478162e-06, |
|
"loss": 2.611, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.5657035314956271, |
|
"grad_norm": 16.708351070999512, |
|
"learning_rate": 4.726710645500014e-06, |
|
"loss": 2.6106, |
|
"step": 5110 |
|
}, |
|
{ |
|
"epoch": 0.5668105834163623, |
|
"grad_norm": 16.979878309390095, |
|
"learning_rate": 4.707418255283817e-06, |
|
"loss": 2.7961, |
|
"step": 5120 |
|
}, |
|
{ |
|
"epoch": 0.5679176353370973, |
|
"grad_norm": 16.81810768550359, |
|
"learning_rate": 4.6881302349745015e-06, |
|
"loss": 2.5536, |
|
"step": 5130 |
|
}, |
|
{ |
|
"epoch": 0.5690246872578324, |
|
"grad_norm": 16.369198159186666, |
|
"learning_rate": 4.668846872651745e-06, |
|
"loss": 2.7049, |
|
"step": 5140 |
|
}, |
|
{ |
|
"epoch": 0.5701317391785675, |
|
"grad_norm": 14.5307901204883, |
|
"learning_rate": 4.649568456325645e-06, |
|
"loss": 2.6538, |
|
"step": 5150 |
|
}, |
|
{ |
|
"epoch": 0.5712387910993025, |
|
"grad_norm": 13.505347462632475, |
|
"learning_rate": 4.630295273932435e-06, |
|
"loss": 2.5944, |
|
"step": 5160 |
|
}, |
|
{ |
|
"epoch": 0.5723458430200377, |
|
"grad_norm": 14.683292804609174, |
|
"learning_rate": 4.611027613330166e-06, |
|
"loss": 2.6914, |
|
"step": 5170 |
|
}, |
|
{ |
|
"epoch": 0.5734528949407727, |
|
"grad_norm": 17.13643381283879, |
|
"learning_rate": 4.5917657622944235e-06, |
|
"loss": 2.6462, |
|
"step": 5180 |
|
}, |
|
{ |
|
"epoch": 0.5745599468615078, |
|
"grad_norm": 16.94159128538117, |
|
"learning_rate": 4.572510008514027e-06, |
|
"loss": 2.6447, |
|
"step": 5190 |
|
}, |
|
{ |
|
"epoch": 0.5756669987822429, |
|
"grad_norm": 18.068429687848685, |
|
"learning_rate": 4.55326063958672e-06, |
|
"loss": 2.7705, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.5767740507029779, |
|
"grad_norm": 14.55412168781434, |
|
"learning_rate": 4.534017943014895e-06, |
|
"loss": 2.6824, |
|
"step": 5210 |
|
}, |
|
{ |
|
"epoch": 0.5778811026237131, |
|
"grad_norm": 14.837147206944774, |
|
"learning_rate": 4.514782206201274e-06, |
|
"loss": 2.5857, |
|
"step": 5220 |
|
}, |
|
{ |
|
"epoch": 0.5789881545444482, |
|
"grad_norm": 15.433613293909772, |
|
"learning_rate": 4.495553716444647e-06, |
|
"loss": 2.6309, |
|
"step": 5230 |
|
}, |
|
{ |
|
"epoch": 0.5800952064651832, |
|
"grad_norm": 15.838049703049755, |
|
"learning_rate": 4.4763327609355505e-06, |
|
"loss": 2.5826, |
|
"step": 5240 |
|
}, |
|
{ |
|
"epoch": 0.5812022583859183, |
|
"grad_norm": 17.013462581069046, |
|
"learning_rate": 4.457119626751998e-06, |
|
"loss": 2.6681, |
|
"step": 5250 |
|
}, |
|
{ |
|
"epoch": 0.5823093103066533, |
|
"grad_norm": 18.074417040094673, |
|
"learning_rate": 4.437914600855187e-06, |
|
"loss": 2.6364, |
|
"step": 5260 |
|
}, |
|
{ |
|
"epoch": 0.5834163622273885, |
|
"grad_norm": 17.194945416385185, |
|
"learning_rate": 4.4187179700852084e-06, |
|
"loss": 2.6663, |
|
"step": 5270 |
|
}, |
|
{ |
|
"epoch": 0.5845234141481236, |
|
"grad_norm": 17.09566869966539, |
|
"learning_rate": 4.399530021156771e-06, |
|
"loss": 2.5621, |
|
"step": 5280 |
|
}, |
|
{ |
|
"epoch": 0.5856304660688586, |
|
"grad_norm": 18.10182865444287, |
|
"learning_rate": 4.38035104065491e-06, |
|
"loss": 2.6451, |
|
"step": 5290 |
|
}, |
|
{ |
|
"epoch": 0.5867375179895937, |
|
"grad_norm": 13.726610338766326, |
|
"learning_rate": 4.361181315030714e-06, |
|
"loss": 2.6154, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 0.5878445699103287, |
|
"grad_norm": 13.396115971130266, |
|
"learning_rate": 4.342021130597041e-06, |
|
"loss": 2.6552, |
|
"step": 5310 |
|
}, |
|
{ |
|
"epoch": 0.5889516218310639, |
|
"grad_norm": 18.072129861449454, |
|
"learning_rate": 4.3228707735242485e-06, |
|
"loss": 2.6323, |
|
"step": 5320 |
|
}, |
|
{ |
|
"epoch": 0.590058673751799, |
|
"grad_norm": 16.101311253082667, |
|
"learning_rate": 4.303730529835913e-06, |
|
"loss": 2.5936, |
|
"step": 5330 |
|
}, |
|
{ |
|
"epoch": 0.591165725672534, |
|
"grad_norm": 17.959725344836784, |
|
"learning_rate": 4.28460068540456e-06, |
|
"loss": 2.6568, |
|
"step": 5340 |
|
}, |
|
{ |
|
"epoch": 0.5922727775932691, |
|
"grad_norm": 14.558411141104697, |
|
"learning_rate": 4.2654815259473994e-06, |
|
"loss": 2.599, |
|
"step": 5350 |
|
}, |
|
{ |
|
"epoch": 0.5933798295140043, |
|
"grad_norm": 15.020557260142786, |
|
"learning_rate": 4.2463733370220464e-06, |
|
"loss": 2.6193, |
|
"step": 5360 |
|
}, |
|
{ |
|
"epoch": 0.5944868814347393, |
|
"grad_norm": 16.367462970526052, |
|
"learning_rate": 4.2272764040222724e-06, |
|
"loss": 2.5572, |
|
"step": 5370 |
|
}, |
|
{ |
|
"epoch": 0.5955939333554744, |
|
"grad_norm": 17.24930565347666, |
|
"learning_rate": 4.208191012173728e-06, |
|
"loss": 2.7591, |
|
"step": 5380 |
|
}, |
|
{ |
|
"epoch": 0.5967009852762094, |
|
"grad_norm": 16.29148295415015, |
|
"learning_rate": 4.189117446529692e-06, |
|
"loss": 2.6654, |
|
"step": 5390 |
|
}, |
|
{ |
|
"epoch": 0.5978080371969445, |
|
"grad_norm": 14.636816803347672, |
|
"learning_rate": 4.170055991966808e-06, |
|
"loss": 2.6481, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.5989150891176797, |
|
"grad_norm": 15.770080307849732, |
|
"learning_rate": 4.1510069331808324e-06, |
|
"loss": 2.637, |
|
"step": 5410 |
|
}, |
|
{ |
|
"epoch": 0.6000221410384147, |
|
"grad_norm": 15.398178191768253, |
|
"learning_rate": 4.131970554682387e-06, |
|
"loss": 2.6958, |
|
"step": 5420 |
|
}, |
|
{ |
|
"epoch": 0.6011291929591498, |
|
"grad_norm": 15.861210008610465, |
|
"learning_rate": 4.1129471407926995e-06, |
|
"loss": 2.5836, |
|
"step": 5430 |
|
}, |
|
{ |
|
"epoch": 0.6022362448798849, |
|
"grad_norm": 14.510344904474643, |
|
"learning_rate": 4.093936975639367e-06, |
|
"loss": 2.6514, |
|
"step": 5440 |
|
}, |
|
{ |
|
"epoch": 0.6033432968006199, |
|
"grad_norm": 19.34752243925819, |
|
"learning_rate": 4.0749403431521e-06, |
|
"loss": 2.6221, |
|
"step": 5450 |
|
}, |
|
{ |
|
"epoch": 0.6044503487213551, |
|
"grad_norm": 14.169326871610396, |
|
"learning_rate": 4.055957527058501e-06, |
|
"loss": 2.5109, |
|
"step": 5460 |
|
}, |
|
{ |
|
"epoch": 0.6055574006420901, |
|
"grad_norm": 15.469257875046958, |
|
"learning_rate": 4.036988810879804e-06, |
|
"loss": 2.6436, |
|
"step": 5470 |
|
}, |
|
{ |
|
"epoch": 0.6066644525628252, |
|
"grad_norm": 15.484848198531239, |
|
"learning_rate": 4.018034477926661e-06, |
|
"loss": 2.4906, |
|
"step": 5480 |
|
}, |
|
{ |
|
"epoch": 0.6077715044835603, |
|
"grad_norm": 15.378784092462407, |
|
"learning_rate": 3.9990948112948914e-06, |
|
"loss": 2.6171, |
|
"step": 5490 |
|
}, |
|
{ |
|
"epoch": 0.6088785564042953, |
|
"grad_norm": 14.686645639856618, |
|
"learning_rate": 3.9801700938612685e-06, |
|
"loss": 2.6579, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.6099856083250305, |
|
"grad_norm": 13.215751426102292, |
|
"learning_rate": 3.96126060827929e-06, |
|
"loss": 2.5402, |
|
"step": 5510 |
|
}, |
|
{ |
|
"epoch": 0.6110926602457655, |
|
"grad_norm": 14.135003798272539, |
|
"learning_rate": 3.942366636974954e-06, |
|
"loss": 2.622, |
|
"step": 5520 |
|
}, |
|
{ |
|
"epoch": 0.6121997121665006, |
|
"grad_norm": 17.459175088951138, |
|
"learning_rate": 3.923488462142541e-06, |
|
"loss": 2.5552, |
|
"step": 5530 |
|
}, |
|
{ |
|
"epoch": 0.6133067640872357, |
|
"grad_norm": 15.87291748509675, |
|
"learning_rate": 3.9046263657404005e-06, |
|
"loss": 2.6628, |
|
"step": 5540 |
|
}, |
|
{ |
|
"epoch": 0.6144138160079707, |
|
"grad_norm": 17.77834550937652, |
|
"learning_rate": 3.885780629486744e-06, |
|
"loss": 2.5962, |
|
"step": 5550 |
|
}, |
|
{ |
|
"epoch": 0.6155208679287059, |
|
"grad_norm": 14.623260869268544, |
|
"learning_rate": 3.866951534855429e-06, |
|
"loss": 2.5216, |
|
"step": 5560 |
|
}, |
|
{ |
|
"epoch": 0.616627919849441, |
|
"grad_norm": 18.782526592973454, |
|
"learning_rate": 3.848139363071759e-06, |
|
"loss": 2.5408, |
|
"step": 5570 |
|
}, |
|
{ |
|
"epoch": 0.617734971770176, |
|
"grad_norm": 15.484929469465394, |
|
"learning_rate": 3.8293443951082865e-06, |
|
"loss": 2.5616, |
|
"step": 5580 |
|
}, |
|
{ |
|
"epoch": 0.6188420236909111, |
|
"grad_norm": 17.313043224092755, |
|
"learning_rate": 3.810566911680607e-06, |
|
"loss": 2.6196, |
|
"step": 5590 |
|
}, |
|
{ |
|
"epoch": 0.6199490756116461, |
|
"grad_norm": 14.974425571993558, |
|
"learning_rate": 3.7918071932431823e-06, |
|
"loss": 2.5633, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.6210561275323813, |
|
"grad_norm": 14.593381904858223, |
|
"learning_rate": 3.773065519985132e-06, |
|
"loss": 2.6227, |
|
"step": 5610 |
|
}, |
|
{ |
|
"epoch": 0.6221631794531164, |
|
"grad_norm": 19.67519437375815, |
|
"learning_rate": 3.7543421718260663e-06, |
|
"loss": 2.666, |
|
"step": 5620 |
|
}, |
|
{ |
|
"epoch": 0.6232702313738514, |
|
"grad_norm": 13.058989186832509, |
|
"learning_rate": 3.7356374284118906e-06, |
|
"loss": 2.5616, |
|
"step": 5630 |
|
}, |
|
{ |
|
"epoch": 0.6243772832945865, |
|
"grad_norm": 19.30534098144351, |
|
"learning_rate": 3.716951569110645e-06, |
|
"loss": 2.551, |
|
"step": 5640 |
|
}, |
|
{ |
|
"epoch": 0.6254843352153217, |
|
"grad_norm": 15.614374371487665, |
|
"learning_rate": 3.6982848730083144e-06, |
|
"loss": 2.495, |
|
"step": 5650 |
|
}, |
|
{ |
|
"epoch": 0.6265913871360567, |
|
"grad_norm": 21.218331844105535, |
|
"learning_rate": 3.67963761890467e-06, |
|
"loss": 2.7439, |
|
"step": 5660 |
|
}, |
|
{ |
|
"epoch": 0.6276984390567918, |
|
"grad_norm": 17.01930866391004, |
|
"learning_rate": 3.6610100853091067e-06, |
|
"loss": 2.5619, |
|
"step": 5670 |
|
}, |
|
{ |
|
"epoch": 0.6288054909775268, |
|
"grad_norm": 16.548611978624205, |
|
"learning_rate": 3.642402550436476e-06, |
|
"loss": 2.5517, |
|
"step": 5680 |
|
}, |
|
{ |
|
"epoch": 0.6299125428982619, |
|
"grad_norm": 16.350659146252166, |
|
"learning_rate": 3.6238152922029414e-06, |
|
"loss": 2.6533, |
|
"step": 5690 |
|
}, |
|
{ |
|
"epoch": 0.631019594818997, |
|
"grad_norm": 16.295428081442413, |
|
"learning_rate": 3.6052485882218124e-06, |
|
"loss": 2.5341, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 0.6321266467397321, |
|
"grad_norm": 16.161944221815478, |
|
"learning_rate": 3.5867027157994137e-06, |
|
"loss": 2.4661, |
|
"step": 5710 |
|
}, |
|
{ |
|
"epoch": 0.6332336986604672, |
|
"grad_norm": 18.192390922499364, |
|
"learning_rate": 3.568177951930932e-06, |
|
"loss": 2.5499, |
|
"step": 5720 |
|
}, |
|
{ |
|
"epoch": 0.6343407505812022, |
|
"grad_norm": 18.154938030310817, |
|
"learning_rate": 3.54967457329629e-06, |
|
"loss": 2.671, |
|
"step": 5730 |
|
}, |
|
{ |
|
"epoch": 0.6354478025019373, |
|
"grad_norm": 17.50231046259661, |
|
"learning_rate": 3.5311928562559984e-06, |
|
"loss": 2.5161, |
|
"step": 5740 |
|
}, |
|
{ |
|
"epoch": 0.6365548544226725, |
|
"grad_norm": 15.071570236507409, |
|
"learning_rate": 3.5127330768470414e-06, |
|
"loss": 2.638, |
|
"step": 5750 |
|
}, |
|
{ |
|
"epoch": 0.6376619063434075, |
|
"grad_norm": 17.638180874471615, |
|
"learning_rate": 3.4942955107787534e-06, |
|
"loss": 2.5672, |
|
"step": 5760 |
|
}, |
|
{ |
|
"epoch": 0.6387689582641426, |
|
"grad_norm": 17.092873285184194, |
|
"learning_rate": 3.4758804334286924e-06, |
|
"loss": 2.6012, |
|
"step": 5770 |
|
}, |
|
{ |
|
"epoch": 0.6398760101848777, |
|
"grad_norm": 14.564343624825167, |
|
"learning_rate": 3.457488119838535e-06, |
|
"loss": 2.5989, |
|
"step": 5780 |
|
}, |
|
{ |
|
"epoch": 0.6409830621056127, |
|
"grad_norm": 16.413821117561785, |
|
"learning_rate": 3.4391188447099614e-06, |
|
"loss": 2.506, |
|
"step": 5790 |
|
}, |
|
{ |
|
"epoch": 0.6420901140263479, |
|
"grad_norm": 18.393396650855887, |
|
"learning_rate": 3.4207728824005653e-06, |
|
"loss": 2.5685, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 0.6431971659470829, |
|
"grad_norm": 16.91734370623325, |
|
"learning_rate": 3.4024505069197387e-06, |
|
"loss": 2.4561, |
|
"step": 5810 |
|
}, |
|
{ |
|
"epoch": 0.644304217867818, |
|
"grad_norm": 15.98240569506593, |
|
"learning_rate": 3.3841519919245925e-06, |
|
"loss": 2.6473, |
|
"step": 5820 |
|
}, |
|
{ |
|
"epoch": 0.6454112697885531, |
|
"grad_norm": 16.326289119567278, |
|
"learning_rate": 3.3658776107158654e-06, |
|
"loss": 2.4694, |
|
"step": 5830 |
|
}, |
|
{ |
|
"epoch": 0.6465183217092881, |
|
"grad_norm": 18.501828998717585, |
|
"learning_rate": 3.347627636233837e-06, |
|
"loss": 2.6163, |
|
"step": 5840 |
|
}, |
|
{ |
|
"epoch": 0.6476253736300233, |
|
"grad_norm": 17.230377910119174, |
|
"learning_rate": 3.329402341054265e-06, |
|
"loss": 2.5839, |
|
"step": 5850 |
|
}, |
|
{ |
|
"epoch": 0.6487324255507584, |
|
"grad_norm": 15.353383433670851, |
|
"learning_rate": 3.311201997384295e-06, |
|
"loss": 2.6337, |
|
"step": 5860 |
|
}, |
|
{ |
|
"epoch": 0.6498394774714934, |
|
"grad_norm": 16.881261849081998, |
|
"learning_rate": 3.2930268770584127e-06, |
|
"loss": 2.5865, |
|
"step": 5870 |
|
}, |
|
{ |
|
"epoch": 0.6509465293922285, |
|
"grad_norm": 18.123650428151265, |
|
"learning_rate": 3.2748772515343697e-06, |
|
"loss": 2.6292, |
|
"step": 5880 |
|
}, |
|
{ |
|
"epoch": 0.6520535813129635, |
|
"grad_norm": 21.517681336714, |
|
"learning_rate": 3.2567533918891414e-06, |
|
"loss": 2.641, |
|
"step": 5890 |
|
}, |
|
{ |
|
"epoch": 0.6531606332336987, |
|
"grad_norm": 19.398238179320135, |
|
"learning_rate": 3.238655568814868e-06, |
|
"loss": 2.6626, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 0.6542676851544338, |
|
"grad_norm": 16.094985895672867, |
|
"learning_rate": 3.2205840526148158e-06, |
|
"loss": 2.5219, |
|
"step": 5910 |
|
}, |
|
{ |
|
"epoch": 0.6553747370751688, |
|
"grad_norm": 15.058326544356623, |
|
"learning_rate": 3.2025391131993443e-06, |
|
"loss": 2.5849, |
|
"step": 5920 |
|
}, |
|
{ |
|
"epoch": 0.6564817889959039, |
|
"grad_norm": 15.860339323392015, |
|
"learning_rate": 3.184521020081864e-06, |
|
"loss": 2.3947, |
|
"step": 5930 |
|
}, |
|
{ |
|
"epoch": 0.6575888409166389, |
|
"grad_norm": 17.03657583580592, |
|
"learning_rate": 3.1665300423748256e-06, |
|
"loss": 2.6228, |
|
"step": 5940 |
|
}, |
|
{ |
|
"epoch": 0.6586958928373741, |
|
"grad_norm": 16.449779619145687, |
|
"learning_rate": 3.148566448785687e-06, |
|
"loss": 2.6434, |
|
"step": 5950 |
|
}, |
|
{ |
|
"epoch": 0.6598029447581092, |
|
"grad_norm": 18.51817609745207, |
|
"learning_rate": 3.1306305076129083e-06, |
|
"loss": 2.5301, |
|
"step": 5960 |
|
}, |
|
{ |
|
"epoch": 0.6609099966788442, |
|
"grad_norm": 17.17970665475141, |
|
"learning_rate": 3.112722486741941e-06, |
|
"loss": 2.5608, |
|
"step": 5970 |
|
}, |
|
{ |
|
"epoch": 0.6620170485995793, |
|
"grad_norm": 15.220359891812148, |
|
"learning_rate": 3.094842653641225e-06, |
|
"loss": 2.5432, |
|
"step": 5980 |
|
}, |
|
{ |
|
"epoch": 0.6631241005203145, |
|
"grad_norm": 15.940169495180179, |
|
"learning_rate": 3.076991275358205e-06, |
|
"loss": 2.5147, |
|
"step": 5990 |
|
}, |
|
{ |
|
"epoch": 0.6642311524410495, |
|
"grad_norm": 13.94891949646219, |
|
"learning_rate": 3.059168618515325e-06, |
|
"loss": 2.5043, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.6642311524410495, |
|
"eval_loss": 2.562150716781616, |
|
"eval_runtime": 2394.5594, |
|
"eval_samples_per_second": 4.192, |
|
"eval_steps_per_second": 0.419, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.6653382043617846, |
|
"grad_norm": 17.7531887306566, |
|
"learning_rate": 3.0413749493060596e-06, |
|
"loss": 2.6127, |
|
"step": 6010 |
|
}, |
|
{ |
|
"epoch": 0.6664452562825196, |
|
"grad_norm": 12.808942551796036, |
|
"learning_rate": 3.0236105334909303e-06, |
|
"loss": 2.5683, |
|
"step": 6020 |
|
}, |
|
{ |
|
"epoch": 0.6675523082032547, |
|
"grad_norm": 16.672861233647524, |
|
"learning_rate": 3.0058756363935447e-06, |
|
"loss": 2.5315, |
|
"step": 6030 |
|
}, |
|
{ |
|
"epoch": 0.6686593601239899, |
|
"grad_norm": 15.135037228190633, |
|
"learning_rate": 2.9881705228966217e-06, |
|
"loss": 2.4304, |
|
"step": 6040 |
|
}, |
|
{ |
|
"epoch": 0.6697664120447249, |
|
"grad_norm": 19.201710928838462, |
|
"learning_rate": 2.9704954574380474e-06, |
|
"loss": 2.6006, |
|
"step": 6050 |
|
}, |
|
{ |
|
"epoch": 0.67087346396546, |
|
"grad_norm": 16.780831760906963, |
|
"learning_rate": 2.9528507040069165e-06, |
|
"loss": 2.5291, |
|
"step": 6060 |
|
}, |
|
{ |
|
"epoch": 0.6719805158861951, |
|
"grad_norm": 15.110403344711688, |
|
"learning_rate": 2.935236526139592e-06, |
|
"loss": 2.6148, |
|
"step": 6070 |
|
}, |
|
{ |
|
"epoch": 0.6730875678069301, |
|
"grad_norm": 14.691795830412493, |
|
"learning_rate": 2.9176531869157776e-06, |
|
"loss": 2.623, |
|
"step": 6080 |
|
}, |
|
{ |
|
"epoch": 0.6741946197276653, |
|
"grad_norm": 20.694910027119413, |
|
"learning_rate": 2.900100948954568e-06, |
|
"loss": 2.4261, |
|
"step": 6090 |
|
}, |
|
{ |
|
"epoch": 0.6753016716484003, |
|
"grad_norm": 20.153947600154126, |
|
"learning_rate": 2.8825800744105553e-06, |
|
"loss": 2.5051, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 0.6764087235691354, |
|
"grad_norm": 16.844446676245752, |
|
"learning_rate": 2.8650908249698837e-06, |
|
"loss": 2.4725, |
|
"step": 6110 |
|
}, |
|
{ |
|
"epoch": 0.6775157754898705, |
|
"grad_norm": 15.629536784931664, |
|
"learning_rate": 2.847633461846363e-06, |
|
"loss": 2.4676, |
|
"step": 6120 |
|
}, |
|
{ |
|
"epoch": 0.6786228274106055, |
|
"grad_norm": 15.244942371558702, |
|
"learning_rate": 2.830208245777556e-06, |
|
"loss": 2.4867, |
|
"step": 6130 |
|
}, |
|
{ |
|
"epoch": 0.6797298793313407, |
|
"grad_norm": 18.15276563682713, |
|
"learning_rate": 2.8128154370208895e-06, |
|
"loss": 2.6125, |
|
"step": 6140 |
|
}, |
|
{ |
|
"epoch": 0.6808369312520757, |
|
"grad_norm": 14.866692854122116, |
|
"learning_rate": 2.7954552953497648e-06, |
|
"loss": 2.4709, |
|
"step": 6150 |
|
}, |
|
{ |
|
"epoch": 0.6819439831728108, |
|
"grad_norm": 15.710254262687716, |
|
"learning_rate": 2.778128080049674e-06, |
|
"loss": 2.5593, |
|
"step": 6160 |
|
}, |
|
{ |
|
"epoch": 0.6830510350935459, |
|
"grad_norm": 16.32088369390469, |
|
"learning_rate": 2.760834049914337e-06, |
|
"loss": 2.5904, |
|
"step": 6170 |
|
}, |
|
{ |
|
"epoch": 0.6841580870142809, |
|
"grad_norm": 17.297718496475216, |
|
"learning_rate": 2.7435734632418286e-06, |
|
"loss": 2.6322, |
|
"step": 6180 |
|
}, |
|
{ |
|
"epoch": 0.6852651389350161, |
|
"grad_norm": 16.18993238219759, |
|
"learning_rate": 2.726346577830722e-06, |
|
"loss": 2.4723, |
|
"step": 6190 |
|
}, |
|
{ |
|
"epoch": 0.6863721908557512, |
|
"grad_norm": 13.340569639729669, |
|
"learning_rate": 2.7091536509762407e-06, |
|
"loss": 2.5087, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 0.6874792427764862, |
|
"grad_norm": 17.20103511645342, |
|
"learning_rate": 2.691994939466415e-06, |
|
"loss": 2.575, |
|
"step": 6210 |
|
}, |
|
{ |
|
"epoch": 0.6885862946972213, |
|
"grad_norm": 15.066807611711438, |
|
"learning_rate": 2.6748706995782407e-06, |
|
"loss": 2.5264, |
|
"step": 6220 |
|
}, |
|
{ |
|
"epoch": 0.6896933466179563, |
|
"grad_norm": 21.941135059717368, |
|
"learning_rate": 2.657781187073861e-06, |
|
"loss": 2.5012, |
|
"step": 6230 |
|
}, |
|
{ |
|
"epoch": 0.6908003985386915, |
|
"grad_norm": 16.278833357503192, |
|
"learning_rate": 2.640726657196743e-06, |
|
"loss": 2.5817, |
|
"step": 6240 |
|
}, |
|
{ |
|
"epoch": 0.6919074504594266, |
|
"grad_norm": 13.836955054815277, |
|
"learning_rate": 2.6237073646678596e-06, |
|
"loss": 2.5257, |
|
"step": 6250 |
|
}, |
|
{ |
|
"epoch": 0.6930145023801616, |
|
"grad_norm": 17.42891079955518, |
|
"learning_rate": 2.6067235636818975e-06, |
|
"loss": 2.4827, |
|
"step": 6260 |
|
}, |
|
{ |
|
"epoch": 0.6941215543008967, |
|
"grad_norm": 16.66766719981607, |
|
"learning_rate": 2.5897755079034415e-06, |
|
"loss": 2.734, |
|
"step": 6270 |
|
}, |
|
{ |
|
"epoch": 0.6952286062216317, |
|
"grad_norm": 18.01524504020241, |
|
"learning_rate": 2.5728634504632132e-06, |
|
"loss": 2.4481, |
|
"step": 6280 |
|
}, |
|
{ |
|
"epoch": 0.6963356581423669, |
|
"grad_norm": 15.361507532173055, |
|
"learning_rate": 2.555987643954259e-06, |
|
"loss": 2.5952, |
|
"step": 6290 |
|
}, |
|
{ |
|
"epoch": 0.697442710063102, |
|
"grad_norm": 12.548971546748055, |
|
"learning_rate": 2.539148340428203e-06, |
|
"loss": 2.4955, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 0.698549761983837, |
|
"grad_norm": 16.013770363195505, |
|
"learning_rate": 2.5223457913914713e-06, |
|
"loss": 2.5667, |
|
"step": 6310 |
|
}, |
|
{ |
|
"epoch": 0.6996568139045721, |
|
"grad_norm": 18.08109296107942, |
|
"learning_rate": 2.505580247801529e-06, |
|
"loss": 2.6721, |
|
"step": 6320 |
|
}, |
|
{ |
|
"epoch": 0.7007638658253073, |
|
"grad_norm": 18.233567782447306, |
|
"learning_rate": 2.488851960063153e-06, |
|
"loss": 2.5413, |
|
"step": 6330 |
|
}, |
|
{ |
|
"epoch": 0.7018709177460423, |
|
"grad_norm": 20.185450776651432, |
|
"learning_rate": 2.4721611780246662e-06, |
|
"loss": 2.5205, |
|
"step": 6340 |
|
}, |
|
{ |
|
"epoch": 0.7029779696667774, |
|
"grad_norm": 17.322044563032186, |
|
"learning_rate": 2.4555081509742257e-06, |
|
"loss": 2.6061, |
|
"step": 6350 |
|
}, |
|
{ |
|
"epoch": 0.7040850215875124, |
|
"grad_norm": 16.69861708188076, |
|
"learning_rate": 2.4388931276360898e-06, |
|
"loss": 2.5733, |
|
"step": 6360 |
|
}, |
|
{ |
|
"epoch": 0.7051920735082475, |
|
"grad_norm": 14.9415194058973, |
|
"learning_rate": 2.4223163561669084e-06, |
|
"loss": 2.4084, |
|
"step": 6370 |
|
}, |
|
{ |
|
"epoch": 0.7062991254289827, |
|
"grad_norm": 15.070279374628573, |
|
"learning_rate": 2.4057780841520073e-06, |
|
"loss": 2.4201, |
|
"step": 6380 |
|
}, |
|
{ |
|
"epoch": 0.7074061773497177, |
|
"grad_norm": 16.92425944088654, |
|
"learning_rate": 2.389278558601703e-06, |
|
"loss": 2.674, |
|
"step": 6390 |
|
}, |
|
{ |
|
"epoch": 0.7085132292704528, |
|
"grad_norm": 15.873359974625208, |
|
"learning_rate": 2.3728180259476054e-06, |
|
"loss": 2.5413, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 0.7096202811911879, |
|
"grad_norm": 17.077658381322358, |
|
"learning_rate": 2.356396732038938e-06, |
|
"loss": 2.5189, |
|
"step": 6410 |
|
}, |
|
{ |
|
"epoch": 0.7107273331119229, |
|
"grad_norm": 15.86795681834881, |
|
"learning_rate": 2.34001492213887e-06, |
|
"loss": 2.6101, |
|
"step": 6420 |
|
}, |
|
{ |
|
"epoch": 0.7118343850326581, |
|
"grad_norm": 13.564052898106056, |
|
"learning_rate": 2.323672840920843e-06, |
|
"loss": 2.5059, |
|
"step": 6430 |
|
}, |
|
{ |
|
"epoch": 0.7129414369533931, |
|
"grad_norm": 16.387911586785865, |
|
"learning_rate": 2.307370732464936e-06, |
|
"loss": 2.4656, |
|
"step": 6440 |
|
}, |
|
{ |
|
"epoch": 0.7140484888741282, |
|
"grad_norm": 15.397100789766657, |
|
"learning_rate": 2.291108840254194e-06, |
|
"loss": 2.5474, |
|
"step": 6450 |
|
}, |
|
{ |
|
"epoch": 0.7151555407948633, |
|
"grad_norm": 20.180668201574875, |
|
"learning_rate": 2.274887407171015e-06, |
|
"loss": 2.6061, |
|
"step": 6460 |
|
}, |
|
{ |
|
"epoch": 0.7162625927155983, |
|
"grad_norm": 16.932276461623562, |
|
"learning_rate": 2.2587066754935088e-06, |
|
"loss": 2.6172, |
|
"step": 6470 |
|
}, |
|
{ |
|
"epoch": 0.7173696446363335, |
|
"grad_norm": 15.85444224400965, |
|
"learning_rate": 2.242566886891878e-06, |
|
"loss": 2.4546, |
|
"step": 6480 |
|
}, |
|
{ |
|
"epoch": 0.7184766965570685, |
|
"grad_norm": 16.024831283317745, |
|
"learning_rate": 2.2264682824248244e-06, |
|
"loss": 2.5442, |
|
"step": 6490 |
|
}, |
|
{ |
|
"epoch": 0.7195837484778036, |
|
"grad_norm": 15.983284722901772, |
|
"learning_rate": 2.210411102535923e-06, |
|
"loss": 2.5027, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.7206908003985387, |
|
"grad_norm": 18.522789630055893, |
|
"learning_rate": 2.194395587050053e-06, |
|
"loss": 2.5553, |
|
"step": 6510 |
|
}, |
|
{ |
|
"epoch": 0.7217978523192737, |
|
"grad_norm": 14.14639815951338, |
|
"learning_rate": 2.178421975169806e-06, |
|
"loss": 2.5721, |
|
"step": 6520 |
|
}, |
|
{ |
|
"epoch": 0.7229049042400089, |
|
"grad_norm": 14.492302660298277, |
|
"learning_rate": 2.1624905054719136e-06, |
|
"loss": 2.4938, |
|
"step": 6530 |
|
}, |
|
{ |
|
"epoch": 0.724011956160744, |
|
"grad_norm": 19.363838132408695, |
|
"learning_rate": 2.146601415903685e-06, |
|
"loss": 2.4218, |
|
"step": 6540 |
|
}, |
|
{ |
|
"epoch": 0.725119008081479, |
|
"grad_norm": 15.90076642116056, |
|
"learning_rate": 2.1307549437794576e-06, |
|
"loss": 2.448, |
|
"step": 6550 |
|
}, |
|
{ |
|
"epoch": 0.7262260600022141, |
|
"grad_norm": 17.3475722033809, |
|
"learning_rate": 2.114951325777041e-06, |
|
"loss": 2.5259, |
|
"step": 6560 |
|
}, |
|
{ |
|
"epoch": 0.7273331119229491, |
|
"grad_norm": 17.081131808882112, |
|
"learning_rate": 2.0991907979341945e-06, |
|
"loss": 2.6131, |
|
"step": 6570 |
|
}, |
|
{ |
|
"epoch": 0.7284401638436843, |
|
"grad_norm": 19.24726121813359, |
|
"learning_rate": 2.083473595645096e-06, |
|
"loss": 2.5176, |
|
"step": 6580 |
|
}, |
|
{ |
|
"epoch": 0.7295472157644194, |
|
"grad_norm": 18.22671174512495, |
|
"learning_rate": 2.067799953656827e-06, |
|
"loss": 2.6385, |
|
"step": 6590 |
|
}, |
|
{ |
|
"epoch": 0.7306542676851544, |
|
"grad_norm": 19.51577253516203, |
|
"learning_rate": 2.052170106065867e-06, |
|
"loss": 2.5878, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 0.7317613196058895, |
|
"grad_norm": 14.740255840350805, |
|
"learning_rate": 2.0365842863145902e-06, |
|
"loss": 2.6232, |
|
"step": 6610 |
|
}, |
|
{ |
|
"epoch": 0.7328683715266247, |
|
"grad_norm": 17.153524931988514, |
|
"learning_rate": 2.021042727187797e-06, |
|
"loss": 2.4545, |
|
"step": 6620 |
|
}, |
|
{ |
|
"epoch": 0.7339754234473597, |
|
"grad_norm": 16.978859837487686, |
|
"learning_rate": 2.0055456608092135e-06, |
|
"loss": 2.4822, |
|
"step": 6630 |
|
}, |
|
{ |
|
"epoch": 0.7350824753680948, |
|
"grad_norm": 15.507136512277452, |
|
"learning_rate": 1.9900933186380427e-06, |
|
"loss": 2.4757, |
|
"step": 6640 |
|
}, |
|
{ |
|
"epoch": 0.7361895272888298, |
|
"grad_norm": 15.113892086099645, |
|
"learning_rate": 1.9746859314655024e-06, |
|
"loss": 2.4577, |
|
"step": 6650 |
|
}, |
|
{ |
|
"epoch": 0.7372965792095649, |
|
"grad_norm": 19.298868896417396, |
|
"learning_rate": 1.9593237294113688e-06, |
|
"loss": 2.5047, |
|
"step": 6660 |
|
}, |
|
{ |
|
"epoch": 0.7384036311303, |
|
"grad_norm": 13.267678704003732, |
|
"learning_rate": 1.944006941920561e-06, |
|
"loss": 2.5715, |
|
"step": 6670 |
|
}, |
|
{ |
|
"epoch": 0.7395106830510351, |
      "grad_norm": 14.87293193958646,
      "learning_rate": 1.928735797759687e-06,
      "loss": 2.5132,
      "step": 6680
    },
    {
      "epoch": 0.7406177349717702,
      "grad_norm": 16.569655196515217,
      "learning_rate": 1.91351052501365e-06,
      "loss": 2.5578,
      "step": 6690
    },
    {
      "epoch": 0.7417247868925052,
      "grad_norm": 18.641862537777396,
      "learning_rate": 1.8983313510822283e-06,
      "loss": 2.5117,
      "step": 6700
    },
    {
      "epoch": 0.7428318388132403,
      "grad_norm": 16.649411387878974,
      "learning_rate": 1.8831985026766848e-06,
      "loss": 2.555,
      "step": 6710
    },
    {
      "epoch": 0.7439388907339755,
      "grad_norm": 17.113555470969906,
      "learning_rate": 1.8681122058163797e-06,
      "loss": 2.4762,
      "step": 6720
    },
    {
      "epoch": 0.7450459426547105,
      "grad_norm": 13.60243042756901,
      "learning_rate": 1.853072685825391e-06,
      "loss": 2.4798,
      "step": 6730
    },
    {
      "epoch": 0.7461529945754456,
      "grad_norm": 14.062228805408685,
      "learning_rate": 1.8380801673291555e-06,
      "loss": 2.5991,
      "step": 6740
    },
    {
      "epoch": 0.7472600464961807,
      "grad_norm": 12.81974531182581,
      "learning_rate": 1.8231348742511102e-06,
      "loss": 2.3543,
      "step": 6750
    },
    {
      "epoch": 0.7483670984169157,
      "grad_norm": 16.835322913216885,
      "learning_rate": 1.8082370298093483e-06,
      "loss": 2.4387,
      "step": 6760
    },
    {
      "epoch": 0.7494741503376509,
      "grad_norm": 14.330012440741553,
      "learning_rate": 1.7933868565132857e-06,
      "loss": 2.6009,
      "step": 6770
    },
    {
      "epoch": 0.7505812022583859,
      "grad_norm": 15.204347320060766,
      "learning_rate": 1.7785845761603376e-06,
      "loss": 2.5466,
      "step": 6780
    },
    {
      "epoch": 0.751688254179121,
      "grad_norm": 17.028609074434605,
      "learning_rate": 1.7638304098326025e-06,
      "loss": 2.4657,
      "step": 6790
    },
    {
      "epoch": 0.7527953060998561,
      "grad_norm": 13.259346842026316,
      "learning_rate": 1.7491245778935673e-06,
      "loss": 2.6145,
      "step": 6800
    },
    {
      "epoch": 0.7539023580205911,
      "grad_norm": 21.625831350357682,
      "learning_rate": 1.7344672999848106e-06,
      "loss": 2.5143,
      "step": 6810
    },
    {
      "epoch": 0.7550094099413263,
      "grad_norm": 19.536045749121886,
      "learning_rate": 1.7198587950227235e-06,
      "loss": 2.4776,
      "step": 6820
    },
    {
      "epoch": 0.7561164618620614,
      "grad_norm": 17.421699829582213,
      "learning_rate": 1.7052992811952411e-06,
      "loss": 2.4593,
      "step": 6830
    },
    {
      "epoch": 0.7572235137827964,
      "grad_norm": 16.49786576509242,
      "learning_rate": 1.6907889759585778e-06,
      "loss": 2.6817,
      "step": 6840
    },
    {
      "epoch": 0.7583305657035315,
      "grad_norm": 14.275882435397286,
      "learning_rate": 1.676328096033994e-06,
      "loss": 2.4542,
      "step": 6850
    },
    {
      "epoch": 0.7594376176242665,
      "grad_norm": 17.493762248570647,
      "learning_rate": 1.6619168574045385e-06,
      "loss": 2.4719,
      "step": 6860
    },
    {
      "epoch": 0.7605446695450017,
      "grad_norm": 16.007658419129143,
      "learning_rate": 1.6475554753118412e-06,
      "loss": 2.4291,
      "step": 6870
    },
    {
      "epoch": 0.7616517214657368,
      "grad_norm": 14.774826021297706,
      "learning_rate": 1.6332441642528895e-06,
      "loss": 2.6003,
      "step": 6880
    },
    {
      "epoch": 0.7627587733864718,
      "grad_norm": 15.975567591762553,
      "learning_rate": 1.6189831379768206e-06,
      "loss": 2.5704,
      "step": 6890
    },
    {
      "epoch": 0.7638658253072069,
      "grad_norm": 17.406951035088184,
      "learning_rate": 1.604772609481744e-06,
      "loss": 2.5381,
      "step": 6900
    },
    {
      "epoch": 0.7649728772279419,
      "grad_norm": 15.245412833911804,
      "learning_rate": 1.5906127910115414e-06,
      "loss": 2.5041,
      "step": 6910
    },
    {
      "epoch": 0.7660799291486771,
      "grad_norm": 18.14500430607472,
      "learning_rate": 1.576503894052711e-06,
      "loss": 2.4126,
      "step": 6920
    },
    {
      "epoch": 0.7671869810694122,
      "grad_norm": 15.112940123243304,
      "learning_rate": 1.5624461293312022e-06,
      "loss": 2.4729,
      "step": 6930
    },
    {
      "epoch": 0.7682940329901472,
      "grad_norm": 14.628425372895773,
      "learning_rate": 1.548439706809271e-06,
      "loss": 2.4399,
      "step": 6940
    },
    {
      "epoch": 0.7694010849108823,
      "grad_norm": 14.955427356230805,
      "learning_rate": 1.5344848356823395e-06,
      "loss": 2.4849,
      "step": 6950
    },
    {
      "epoch": 0.7705081368316175,
      "grad_norm": 15.352858996367999,
      "learning_rate": 1.5205817243758775e-06,
      "loss": 2.5061,
      "step": 6960
    },
    {
      "epoch": 0.7716151887523525,
      "grad_norm": 15.531771804427523,
      "learning_rate": 1.506730580542287e-06,
      "loss": 2.5352,
      "step": 6970
    },
    {
      "epoch": 0.7727222406730876,
      "grad_norm": 14.802901269445874,
      "learning_rate": 1.4929316110577991e-06,
      "loss": 2.4606,
      "step": 6980
    },
    {
      "epoch": 0.7738292925938226,
      "grad_norm": 13.834503126554017,
      "learning_rate": 1.4791850220193882e-06,
      "loss": 2.4114,
      "step": 6990
    },
    {
      "epoch": 0.7749363445145577,
      "grad_norm": 17.626871971044736,
      "learning_rate": 1.4654910187416843e-06,
      "loss": 2.4443,
      "step": 7000
    },
    {
      "epoch": 0.7760433964352929,
      "grad_norm": 15.72586832532517,
      "learning_rate": 1.451849805753925e-06,
      "loss": 2.5959,
      "step": 7010
    },
    {
      "epoch": 0.7771504483560279,
      "grad_norm": 19.63625622564935,
      "learning_rate": 1.4382615867968768e-06,
      "loss": 2.577,
      "step": 7020
    },
    {
      "epoch": 0.778257500276763,
      "grad_norm": 16.259423437860036,
      "learning_rate": 1.4247265648198122e-06,
      "loss": 2.4003,
      "step": 7030
    },
    {
      "epoch": 0.7793645521974981,
      "grad_norm": 14.868240052692464,
      "learning_rate": 1.4112449419774699e-06,
      "loss": 2.4374,
      "step": 7040
    },
    {
      "epoch": 0.7804716041182331,
      "grad_norm": 17.680915858091048,
      "learning_rate": 1.3978169196270297e-06,
      "loss": 2.4477,
      "step": 7050
    },
    {
      "epoch": 0.7815786560389683,
      "grad_norm": 18.788763019346266,
      "learning_rate": 1.3844426983251242e-06,
      "loss": 2.6663,
      "step": 7060
    },
    {
      "epoch": 0.7826857079597033,
      "grad_norm": 17.443967486074488,
      "learning_rate": 1.3711224778248178e-06,
      "loss": 2.4001,
      "step": 7070
    },
    {
      "epoch": 0.7837927598804384,
      "grad_norm": 14.104765296687267,
      "learning_rate": 1.3578564570726437e-06,
      "loss": 2.5499,
      "step": 7080
    },
    {
      "epoch": 0.7848998118011735,
      "grad_norm": 14.938982184936348,
      "learning_rate": 1.344644834205624e-06,
      "loss": 2.6234,
      "step": 7090
    },
    {
      "epoch": 0.7860068637219085,
      "grad_norm": 16.601186409737505,
      "learning_rate": 1.3314878065483106e-06,
      "loss": 2.4678,
      "step": 7100
    },
    {
      "epoch": 0.7871139156426437,
      "grad_norm": 16.126461328991052,
      "learning_rate": 1.318385570609838e-06,
      "loss": 2.5181,
      "step": 7110
    },
    {
      "epoch": 0.7882209675633787,
      "grad_norm": 14.264212101115474,
      "learning_rate": 1.3053383220809934e-06,
      "loss": 2.5319,
      "step": 7120
    },
    {
      "epoch": 0.7893280194841138,
      "grad_norm": 16.674084788709003,
      "learning_rate": 1.2923462558312827e-06,
      "loss": 2.5588,
      "step": 7130
    },
    {
      "epoch": 0.7904350714048489,
      "grad_norm": 14.125047804457926,
      "learning_rate": 1.2794095659060335e-06,
      "loss": 2.495,
      "step": 7140
    },
    {
      "epoch": 0.7915421233255839,
      "grad_norm": 13.689321540078824,
      "learning_rate": 1.2665284455234867e-06,
      "loss": 2.6346,
      "step": 7150
    },
    {
      "epoch": 0.7926491752463191,
      "grad_norm": 17.491763233443507,
      "learning_rate": 1.2537030870719159e-06,
      "loss": 2.3638,
      "step": 7160
    },
    {
      "epoch": 0.7937562271670542,
      "grad_norm": 14.712500473982459,
      "learning_rate": 1.2409336821067535e-06,
      "loss": 2.4199,
      "step": 7170
    },
    {
      "epoch": 0.7948632790877892,
      "grad_norm": 13.97965354212977,
      "learning_rate": 1.2282204213477233e-06,
      "loss": 2.4273,
      "step": 7180
    },
    {
      "epoch": 0.7959703310085243,
      "grad_norm": 15.125599625889896,
      "learning_rate": 1.215563494676007e-06,
      "loss": 2.5639,
      "step": 7190
    },
    {
      "epoch": 0.7970773829292593,
      "grad_norm": 15.308235089960142,
      "learning_rate": 1.2029630911313877e-06,
      "loss": 2.4943,
      "step": 7200
    },
    {
      "epoch": 0.7981844348499945,
      "grad_norm": 14.243073168806442,
      "learning_rate": 1.1904193989094442e-06,
      "loss": 2.6061,
      "step": 7210
    },
    {
      "epoch": 0.7992914867707296,
      "grad_norm": 14.898872151747849,
      "learning_rate": 1.1779326053587326e-06,
      "loss": 2.6109,
      "step": 7220
    },
    {
      "epoch": 0.8003985386914646,
      "grad_norm": 15.213968169737058,
      "learning_rate": 1.165502896977983e-06,
      "loss": 2.5029,
      "step": 7230
    },
    {
      "epoch": 0.8015055906121997,
      "grad_norm": 17.57190386080436,
      "learning_rate": 1.1531304594133297e-06,
      "loss": 2.5218,
      "step": 7240
    },
    {
      "epoch": 0.8026126425329347,
      "grad_norm": 14.718334901930403,
      "learning_rate": 1.1408154774555185e-06,
      "loss": 2.5644,
      "step": 7250
    },
    {
      "epoch": 0.8037196944536699,
      "grad_norm": 14.70466300668309,
      "learning_rate": 1.1285581350371633e-06,
      "loss": 2.5673,
      "step": 7260
    },
    {
      "epoch": 0.804826746374405,
      "grad_norm": 16.523083604536307,
      "learning_rate": 1.11635861522999e-06,
      "loss": 2.6119,
      "step": 7270
    },
    {
      "epoch": 0.80593379829514,
      "grad_norm": 16.087233648796555,
      "learning_rate": 1.1042171002421038e-06,
      "loss": 2.3668,
      "step": 7280
    },
    {
      "epoch": 0.8070408502158751,
      "grad_norm": 18.219483436423715,
      "learning_rate": 1.092133771415272e-06,
      "loss": 2.5108,
      "step": 7290
    },
    {
      "epoch": 0.8081479021366103,
      "grad_norm": 14.23626021764468,
      "learning_rate": 1.0801088092222067e-06,
      "loss": 2.5161,
      "step": 7300
    },
    {
      "epoch": 0.8092549540573453,
      "grad_norm": 17.579234694984372,
      "learning_rate": 1.0681423932638784e-06,
      "loss": 2.472,
      "step": 7310
    },
    {
      "epoch": 0.8103620059780804,
      "grad_norm": 17.509613972476572,
      "learning_rate": 1.05623470226683e-06,
      "loss": 2.5078,
      "step": 7320
    },
    {
      "epoch": 0.8114690578988154,
      "grad_norm": 16.567966169697417,
      "learning_rate": 1.0443859140805063e-06,
      "loss": 2.5549,
      "step": 7330
    },
    {
      "epoch": 0.8125761098195505,
      "grad_norm": 13.228102448828993,
      "learning_rate": 1.032596205674598e-06,
      "loss": 2.5958,
      "step": 7340
    },
    {
      "epoch": 0.8136831617402857,
      "grad_norm": 14.33253011909644,
      "learning_rate": 1.020865753136402e-06,
      "loss": 2.4304,
      "step": 7350
    },
    {
      "epoch": 0.8147902136610207,
      "grad_norm": 16.763970324305024,
      "learning_rate": 1.0091947316681833e-06,
      "loss": 2.5536,
      "step": 7360
    },
    {
      "epoch": 0.8158972655817558,
      "grad_norm": 16.082943781448364,
      "learning_rate": 9.975833155845687e-07,
      "loss": 2.4768,
      "step": 7370
    },
    {
      "epoch": 0.8170043175024909,
      "grad_norm": 15.909337215300724,
      "learning_rate": 9.860316783099356e-07,
      "loss": 2.4912,
      "step": 7380
    },
    {
      "epoch": 0.8181113694232259,
      "grad_norm": 17.194058825674805,
      "learning_rate": 9.74539992375826e-07,
      "loss": 2.4761,
      "step": 7390
    },
    {
      "epoch": 0.8192184213439611,
      "grad_norm": 15.251099269067993,
      "learning_rate": 9.631084294183668e-07,
      "loss": 2.538,
      "step": 7400
    },
    {
      "epoch": 0.8203254732646961,
      "grad_norm": 14.28790996742064,
      "learning_rate": 9.517371601757042e-07,
      "loss": 2.536,
      "step": 7410
    },
    {
      "epoch": 0.8214325251854312,
      "grad_norm": 17.000395820091192,
      "learning_rate": 9.404263544854658e-07,
      "loss": 2.4934,
      "step": 7420
    },
    {
      "epoch": 0.8225395771061663,
      "grad_norm": 14.025873757437632,
      "learning_rate": 9.291761812822054e-07,
      "loss": 2.4447,
      "step": 7430
    },
    {
      "epoch": 0.8236466290269013,
      "grad_norm": 20.369511420071024,
      "learning_rate": 9.179868085948946e-07,
      "loss": 2.5157,
      "step": 7440
    },
    {
      "epoch": 0.8247536809476365,
      "grad_norm": 16.887509510072285,
      "learning_rate": 9.068584035444083e-07,
      "loss": 2.4785,
      "step": 7450
    },
    {
      "epoch": 0.8258607328683715,
      "grad_norm": 15.952259196345977,
      "learning_rate": 8.957911323410229e-07,
      "loss": 2.4653,
      "step": 7460
    },
    {
      "epoch": 0.8269677847891066,
      "grad_norm": 16.24199510067374,
      "learning_rate": 8.847851602819485e-07,
      "loss": 2.5294,
      "step": 7470
    },
    {
      "epoch": 0.8280748367098417,
      "grad_norm": 16.976947365156782,
      "learning_rate": 8.738406517488423e-07,
      "loss": 2.5297,
      "step": 7480
    },
    {
      "epoch": 0.8291818886305767,
      "grad_norm": 17.934378030024483,
      "learning_rate": 8.629577702053671e-07,
      "loss": 2.6052,
      "step": 7490
    },
    {
      "epoch": 0.8302889405513119,
      "grad_norm": 15.407244538769637,
      "learning_rate": 8.521366781947426e-07,
      "loss": 2.4532,
      "step": 7500
    },
    {
      "epoch": 0.831395992472047,
      "grad_norm": 15.400477059234891,
      "learning_rate": 8.413775373373206e-07,
      "loss": 2.4579,
      "step": 7510
    },
    {
      "epoch": 0.832503044392782,
      "grad_norm": 17.39392388174797,
      "learning_rate": 8.306805083281705e-07,
      "loss": 2.6138,
      "step": 7520
    },
    {
      "epoch": 0.8336100963135171,
      "grad_norm": 14.342293383217136,
      "learning_rate": 8.200457509346798e-07,
      "loss": 2.3725,
      "step": 7530
    },
    {
      "epoch": 0.8347171482342521,
      "grad_norm": 15.847161214149653,
      "learning_rate": 8.094734239941642e-07,
      "loss": 2.3768,
      "step": 7540
    },
    {
      "epoch": 0.8358242001549873,
      "grad_norm": 17.63332070962175,
      "learning_rate": 7.989636854115018e-07,
      "loss": 2.4585,
      "step": 7550
    },
    {
      "epoch": 0.8369312520757224,
      "grad_norm": 16.531198506312407,
      "learning_rate": 7.885166921567705e-07,
      "loss": 2.4787,
      "step": 7560
    },
    {
      "epoch": 0.8380383039964574,
      "grad_norm": 14.28759893561945,
      "learning_rate": 7.781326002628991e-07,
      "loss": 2.4685,
      "step": 7570
    },
    {
      "epoch": 0.8391453559171925,
      "grad_norm": 14.826430399325979,
      "learning_rate": 7.678115648233514e-07,
      "loss": 2.4173,
      "step": 7580
    },
    {
      "epoch": 0.8402524078379277,
      "grad_norm": 14.87587504335515,
      "learning_rate": 7.57553739989792e-07,
      "loss": 2.51,
      "step": 7590
    },
    {
      "epoch": 0.8413594597586627,
      "grad_norm": 17.574559912620376,
      "learning_rate": 7.473592789697947e-07,
      "loss": 2.4794,
      "step": 7600
    },
    {
      "epoch": 0.8424665116793978,
      "grad_norm": 17.140986686992314,
      "learning_rate": 7.37228334024555e-07,
      "loss": 2.416,
      "step": 7610
    },
    {
      "epoch": 0.8435735636001328,
      "grad_norm": 15.506861252303242,
      "learning_rate": 7.271610564666054e-07,
      "loss": 2.3907,
      "step": 7620
    },
    {
      "epoch": 0.8446806155208679,
      "grad_norm": 15.538508359449784,
      "learning_rate": 7.171575966575722e-07,
      "loss": 2.5462,
      "step": 7630
    },
    {
      "epoch": 0.845787667441603,
      "grad_norm": 16.810003606583724,
      "learning_rate": 7.072181040059123e-07,
      "loss": 2.486,
      "step": 7640
    },
    {
      "epoch": 0.8468947193623381,
      "grad_norm": 17.523279420449594,
      "learning_rate": 6.973427269646932e-07,
      "loss": 2.4714,
      "step": 7650
    },
    {
      "epoch": 0.8480017712830732,
      "grad_norm": 14.739045055561698,
      "learning_rate": 6.875316130293724e-07,
      "loss": 2.5424,
      "step": 7660
    },
    {
      "epoch": 0.8491088232038082,
      "grad_norm": 15.925664585980916,
      "learning_rate": 6.777849087355932e-07,
      "loss": 2.4951,
      "step": 7670
    },
    {
      "epoch": 0.8502158751245433,
      "grad_norm": 14.15278086352724,
      "learning_rate": 6.681027596569988e-07,
      "loss": 2.4984,
      "step": 7680
    },
    {
      "epoch": 0.8513229270452785,
      "grad_norm": 14.613485875082265,
      "learning_rate": 6.584853104030553e-07,
      "loss": 2.415,
      "step": 7690
    },
    {
      "epoch": 0.8524299789660135,
      "grad_norm": 13.79991123891203,
      "learning_rate": 6.48932704616892e-07,
      "loss": 2.4957,
      "step": 7700
    },
    {
      "epoch": 0.8535370308867486,
      "grad_norm": 16.538555088229636,
      "learning_rate": 6.394450849731587e-07,
      "loss": 2.5322,
      "step": 7710
    },
    {
      "epoch": 0.8546440828074837,
      "grad_norm": 17.641076622043553,
      "learning_rate": 6.300225931758924e-07,
      "loss": 2.4296,
      "step": 7720
    },
    {
      "epoch": 0.8557511347282187,
      "grad_norm": 17.606467927789563,
      "learning_rate": 6.206653699564014e-07,
      "loss": 2.5163,
      "step": 7730
    },
    {
      "epoch": 0.8568581866489539,
      "grad_norm": 17.809260161423225,
      "learning_rate": 6.113735550711658e-07,
      "loss": 2.4642,
      "step": 7740
    },
    {
      "epoch": 0.8579652385696889,
      "grad_norm": 13.623839785347023,
      "learning_rate": 6.021472872997419e-07,
      "loss": 2.512,
      "step": 7750
    },
    {
      "epoch": 0.859072290490424,
      "grad_norm": 18.78017884173273,
      "learning_rate": 5.929867044427035e-07,
      "loss": 2.4144,
      "step": 7760
    },
    {
      "epoch": 0.8601793424111591,
      "grad_norm": 16.837093504212152,
      "learning_rate": 5.838919433195678e-07,
      "loss": 2.5047,
      "step": 7770
    },
    {
      "epoch": 0.8612863943318941,
      "grad_norm": 16.87004336022709,
      "learning_rate": 5.748631397667654e-07,
      "loss": 2.5213,
      "step": 7780
    },
    {
      "epoch": 0.8623934462526293,
      "grad_norm": 15.69091627736047,
      "learning_rate": 5.659004286356045e-07,
      "loss": 2.5533,
      "step": 7790
    },
    {
      "epoch": 0.8635004981733644,
      "grad_norm": 14.187307779530673,
      "learning_rate": 5.570039437902536e-07,
      "loss": 2.441,
      "step": 7800
    },
    {
      "epoch": 0.8646075500940994,
      "grad_norm": 17.93288869083588,
      "learning_rate": 5.481738181057556e-07,
      "loss": 2.5006,
      "step": 7810
    },
    {
      "epoch": 0.8657146020148345,
      "grad_norm": 15.826634381411255,
      "learning_rate": 5.394101834660253e-07,
      "loss": 2.4135,
      "step": 7820
    },
    {
      "epoch": 0.8668216539355695,
      "grad_norm": 16.596251661361375,
      "learning_rate": 5.307131707618934e-07,
      "loss": 2.4909,
      "step": 7830
    },
    {
      "epoch": 0.8679287058563047,
      "grad_norm": 15.129013018674039,
      "learning_rate": 5.220829098891472e-07,
      "loss": 2.4429,
      "step": 7840
    },
    {
      "epoch": 0.8690357577770398,
      "grad_norm": 14.305450352981211,
      "learning_rate": 5.135195297465878e-07,
      "loss": 2.4862,
      "step": 7850
    },
    {
      "epoch": 0.8701428096977748,
      "grad_norm": 12.863234686905033,
      "learning_rate": 5.050231582341092e-07,
      "loss": 2.4616,
      "step": 7860
    },
    {
      "epoch": 0.8712498616185099,
      "grad_norm": 13.900921327498637,
      "learning_rate": 4.965939222507832e-07,
      "loss": 2.5505,
      "step": 7870
    },
    {
      "epoch": 0.8723569135392449,
      "grad_norm": 15.774427990260946,
      "learning_rate": 4.882319476929698e-07,
      "loss": 2.4643,
      "step": 7880
    },
    {
      "epoch": 0.8734639654599801,
      "grad_norm": 18.695971386290847,
      "learning_rate": 4.799373594524332e-07,
      "loss": 2.4695,
      "step": 7890
    },
    {
      "epoch": 0.8745710173807152,
      "grad_norm": 15.398391843940924,
      "learning_rate": 4.7171028141447693e-07,
      "loss": 2.5612,
      "step": 7900
    },
    {
      "epoch": 0.8756780693014502,
      "grad_norm": 13.563010738327588,
      "learning_rate": 4.635508364560937e-07,
      "loss": 2.4357,
      "step": 7910
    },
    {
      "epoch": 0.8767851212221853,
      "grad_norm": 14.212010150425057,
      "learning_rate": 4.5545914644413103e-07,
      "loss": 2.4529,
      "step": 7920
    },
    {
      "epoch": 0.8778921731429205,
      "grad_norm": 13.857542112005609,
      "learning_rate": 4.474353322334679e-07,
      "loss": 2.4963,
      "step": 7930
    },
    {
      "epoch": 0.8789992250636555,
      "grad_norm": 14.666024515134973,
      "learning_rate": 4.394795136652169e-07,
      "loss": 2.4512,
      "step": 7940
    },
    {
      "epoch": 0.8801062769843906,
      "grad_norm": 16.841948685566276,
      "learning_rate": 4.315918095649246e-07,
      "loss": 2.5056,
      "step": 7950
    },
    {
      "epoch": 0.8812133289051256,
      "grad_norm": 15.413187142241657,
      "learning_rate": 4.2377233774080427e-07,
      "loss": 2.5528,
      "step": 7960
    },
    {
      "epoch": 0.8823203808258607,
      "grad_norm": 13.784700431727842,
      "learning_rate": 4.1602121498197477e-07,
      "loss": 2.4622,
      "step": 7970
    },
    {
      "epoch": 0.8834274327465959,
      "grad_norm": 14.844903872123188,
      "learning_rate": 4.0833855705671057e-07,
      "loss": 2.4508,
      "step": 7980
    },
    {
      "epoch": 0.8845344846673309,
      "grad_norm": 16.047205147717147,
      "learning_rate": 4.0072447871072507e-07,
      "loss": 2.4968,
      "step": 7990
    },
    {
      "epoch": 0.885641536588066,
      "grad_norm": 12.9721170754397,
      "learning_rate": 3.931790936654417e-07,
      "loss": 2.3906,
      "step": 8000
    },
    {
      "epoch": 0.885641536588066,
      "eval_loss": 2.48763370513916,
      "eval_runtime": 2402.0825,
      "eval_samples_per_second": 4.178,
      "eval_steps_per_second": 0.418,
      "step": 8000
    },
    {
      "epoch": 0.8867485885088011,
      "grad_norm": 15.854557624198474,
      "learning_rate": 3.8570251461630735e-07,
      "loss": 2.4579,
      "step": 8010
    },
    {
      "epoch": 0.8878556404295361,
      "grad_norm": 16.026725672049096,
      "learning_rate": 3.7829485323110316e-07,
      "loss": 2.3463,
      "step": 8020
    },
    {
      "epoch": 0.8889626923502713,
      "grad_norm": 16.073422441115532,
      "learning_rate": 3.709562201482769e-07,
      "loss": 2.4243,
      "step": 8030
    },
    {
      "epoch": 0.8900697442710063,
      "grad_norm": 15.38779771086279,
      "learning_rate": 3.636867249752962e-07,
      "loss": 2.3858,
      "step": 8040
    },
    {
      "epoch": 0.8911767961917414,
      "grad_norm": 16.258826268938925,
      "learning_rate": 3.564864762870013e-07,
      "loss": 2.5358,
      "step": 8050
    },
    {
      "epoch": 0.8922838481124765,
      "grad_norm": 15.02798068624606,
      "learning_rate": 3.49355581623993e-07,
      "loss": 2.4421,
      "step": 8060
    },
    {
      "epoch": 0.8933909000332115,
      "grad_norm": 16.654143045304426,
      "learning_rate": 3.4229414749102186e-07,
      "loss": 2.5125,
      "step": 8070
    },
    {
      "epoch": 0.8944979519539467,
      "grad_norm": 13.762735453146883,
      "learning_rate": 3.353022793553978e-07,
      "loss": 2.6232,
      "step": 8080
    },
    {
      "epoch": 0.8956050038746817,
      "grad_norm": 11.721658803005548,
      "learning_rate": 3.2838008164541577e-07,
      "loss": 2.4208,
      "step": 8090
    },
    {
      "epoch": 0.8967120557954168,
      "grad_norm": 15.661791327346446,
      "learning_rate": 3.215276577487969e-07,
      "loss": 2.5037,
      "step": 8100
    },
    {
      "epoch": 0.8978191077161519,
      "grad_norm": 14.437374220759548,
      "learning_rate": 3.1474511001113926e-07,
      "loss": 2.453,
      "step": 8110
    },
    {
      "epoch": 0.8989261596368869,
      "grad_norm": 23.96541891259206,
      "learning_rate": 3.080325397343969e-07,
      "loss": 2.4866,
      "step": 8120
    },
    {
      "epoch": 0.9000332115576221,
      "grad_norm": 14.703254905186904,
      "learning_rate": 3.013900471753628e-07,
      "loss": 2.5269,
      "step": 8130
    },
    {
      "epoch": 0.9011402634783572,
      "grad_norm": 17.763519535077947,
      "learning_rate": 2.948177315441669e-07,
      "loss": 2.5009,
      "step": 8140
    },
    {
      "epoch": 0.9022473153990922,
      "grad_norm": 18.50559050540985,
      "learning_rate": 2.883156910028073e-07,
      "loss": 2.4501,
      "step": 8150
    },
    {
      "epoch": 0.9033543673198273,
      "grad_norm": 13.811835537975867,
      "learning_rate": 2.818840226636671e-07,
      "loss": 2.3126,
      "step": 8160
    },
    {
      "epoch": 0.9044614192405623,
      "grad_norm": 18.988992451266952,
      "learning_rate": 2.7552282258808125e-07,
      "loss": 2.4317,
      "step": 8170
    },
    {
      "epoch": 0.9055684711612975,
      "grad_norm": 16.031786166509363,
      "learning_rate": 2.6923218578488674e-07,
      "loss": 2.4247,
      "step": 8180
    },
    {
      "epoch": 0.9066755230820326,
      "grad_norm": 18.728666251016826,
      "learning_rate": 2.630122062090118e-07,
      "loss": 2.3527,
      "step": 8190
    },
    {
      "epoch": 0.9077825750027676,
      "grad_norm": 19.825199377152217,
      "learning_rate": 2.568629767600744e-07,
      "loss": 2.6088,
      "step": 8200
    },
    {
      "epoch": 0.9088896269235027,
      "grad_norm": 16.87396488382408,
      "learning_rate": 2.507845892809868e-07,
      "loss": 2.3591,
      "step": 8210
    },
    {
      "epoch": 0.9099966788442378,
      "grad_norm": 14.693971646543563,
      "learning_rate": 2.4477713455659136e-07,
      "loss": 2.4239,
      "step": 8220
    },
    {
      "epoch": 0.9111037307649729,
      "grad_norm": 15.947418670710583,
      "learning_rate": 2.388407023123007e-07,
      "loss": 2.4616,
      "step": 8230
    },
    {
      "epoch": 0.912210782685708,
      "grad_norm": 16.454200712334917,
      "learning_rate": 2.329753812127583e-07,
      "loss": 2.4244,
      "step": 8240
    },
    {
      "epoch": 0.913317834606443,
      "grad_norm": 16.497222276931957,
      "learning_rate": 2.2718125886051433e-07,
      "loss": 2.5867,
      "step": 8250
    },
    {
      "epoch": 0.9144248865271781,
      "grad_norm": 16.43228833811835,
      "learning_rate": 2.214584217947191e-07,
      "loss": 2.4391,
      "step": 8260
    },
    {
      "epoch": 0.9155319384479133,
      "grad_norm": 16.78472579386922,
      "learning_rate": 2.1580695548982567e-07,
      "loss": 2.4242,
      "step": 8270
    },
    {
      "epoch": 0.9166389903686483,
      "grad_norm": 16.546048611992425,
      "learning_rate": 2.1022694435431868e-07,
      "loss": 2.4872,
      "step": 8280
    },
    {
      "epoch": 0.9177460422893834,
      "grad_norm": 16.770801344250373,
      "learning_rate": 2.0471847172945036e-07,
      "loss": 2.4296,
      "step": 8290
    },
    {
      "epoch": 0.9188530942101184,
      "grad_norm": 16.27109240174247,
      "learning_rate": 1.9928161988799765e-07,
      "loss": 2.5068,
      "step": 8300
    },
    {
      "epoch": 0.9199601461308535,
      "grad_norm": 12.512458168250634,
      "learning_rate": 1.939164700330326e-07,
      "loss": 2.4175,
      "step": 8310
    },
    {
      "epoch": 0.9210671980515887,
      "grad_norm": 14.798188108228695,
      "learning_rate": 1.8862310229670612e-07,
      "loss": 2.5059,
      "step": 8320
    },
    {
      "epoch": 0.9221742499723237,
      "grad_norm": 12.936659113537381,
      "learning_rate": 1.8340159573906058e-07,
      "loss": 2.447,
      "step": 8330
    },
    {
      "epoch": 0.9232813018930588,
      "grad_norm": 15.624562309086738,
      "learning_rate": 1.782520283468364e-07,
      "loss": 2.4359,
      "step": 8340
    },
    {
      "epoch": 0.9243883538137939,
      "grad_norm": 17.36536742613116,
      "learning_rate": 1.7317447703231849e-07,
      "loss": 2.5658,
      "step": 8350
    },
    {
      "epoch": 0.9254954057345289,
      "grad_norm": 15.391131130821295,
      "learning_rate": 1.6816901763218152e-07,
      "loss": 2.5091,
      "step": 8360
    },
    {
      "epoch": 0.9266024576552641,
      "grad_norm": 15.684736857963308,
      "learning_rate": 1.6323572490635543e-07,
      "loss": 2.4168,
      "step": 8370
    },
    {
      "epoch": 0.9277095095759991,
      "grad_norm": 18.001498021183778,
      "learning_rate": 1.5837467253691784e-07,
      "loss": 2.5202,
      "step": 8380
    },
    {
      "epoch": 0.9288165614967342,
      "grad_norm": 15.913434285236699,
      "learning_rate": 1.5358593312698178e-07,
      "loss": 2.6434,
      "step": 8390
    },
    {
      "epoch": 0.9299236134174693,
      "grad_norm": 15.844539221853895,
      "learning_rate": 1.4886957819962077e-07,
      "loss": 2.4848,
      "step": 8400
    },
    {
      "epoch": 0.9310306653382043,
      "grad_norm": 14.883294572064472,
      "learning_rate": 1.4422567819679546e-07,
      "loss": 2.4281,
      "step": 8410
    },
    {
      "epoch": 0.9321377172589395,
      "grad_norm": 14.583778182281327,
      "learning_rate": 1.3965430247830426e-07,
      "loss": 2.4246,
      "step": 8420
    },
    {
      "epoch": 0.9332447691796745,
      "grad_norm": 16.084883433598268,
      "learning_rate": 1.3515551932074488e-07,
      "loss": 2.506,
      "step": 8430
    },
    {
      "epoch": 0.9343518211004096,
      "grad_norm": 13.726377273149337,
      "learning_rate": 1.307293959164957e-07,
      "loss": 2.5495,
      "step": 8440
    },
    {
      "epoch": 0.9354588730211447,
      "grad_norm": 17.060608694253016,
      "learning_rate": 1.263759983727142e-07,
      "loss": 2.337,
      "step": 8450
    },
    {
      "epoch": 0.9365659249418797,
      "grad_norm": 14.801435939071636,
      "learning_rate": 1.2209539171034623e-07,
      "loss": 2.5042,
      "step": 8460
    },
    {
      "epoch": 0.9376729768626149,
      "grad_norm": 15.589161221895887,
      "learning_rate": 1.1788763986315621e-07,
      "loss": 2.5061,
      "step": 8470
    },
    {
      "epoch": 0.93878002878335,
      "grad_norm": 16.33836597070153,
      "learning_rate": 1.1375280567677393e-07,
      "loss": 2.3671,
      "step": 8480
    },
    {
      "epoch": 0.939887080704085,
      "grad_norm": 18.905885083448613,
      "learning_rate": 1.0969095090775428e-07,
      "loss": 2.6181,
      "step": 8490
    },
    {
      "epoch": 0.9409941326248201,
      "grad_norm": 16.762390629046585,
      "learning_rate": 1.0570213622265236e-07,
      "loss": 2.4327,
      "step": 8500
    },
    {
      "epoch": 0.9421011845455551,
      "grad_norm": 16.525181960248243,
      "learning_rate": 1.0178642119712368e-07,
      "loss": 2.4993,
      "step": 8510
    },
    {
      "epoch": 0.9432082364662903,
      "grad_norm": 16.7132011851729,
      "learning_rate": 9.794386431502822e-08,
      "loss": 2.5366,
      "step": 8520
    },
    {
      "epoch": 0.9443152883870254,
      "grad_norm": 13.853046661215803,
      "learning_rate": 9.417452296756114e-08,
      "loss": 2.4832,
      "step": 8530
    },
    {
      "epoch": 0.9454223403077604,
      "grad_norm": 15.293104772530375,
      "learning_rate": 9.04784534523928e-08,
      "loss": 2.3633,
      "step": 8540
    },
    {
      "epoch": 0.9465293922284955,
      "grad_norm": 14.980540551389215,
      "learning_rate": 8.685571097282852e-08,
      "loss": 2.4849,
      "step": 8550
    },
    {
      "epoch": 0.9476364441492307,
      "grad_norm": 18.693304270023244,
      "learning_rate": 8.33063496369868e-08,
      "loss": 2.5602,
      "step": 8560
    },
    {
      "epoch": 0.9487434960699657,
      "grad_norm": 15.253297927027766,
      "learning_rate": 7.98304224569868e-08,
      "loss": 2.4879,
      "step": 8570
    },
    {
      "epoch": 0.9498505479907008,
      "grad_norm": 20.092545101378285,
      "learning_rate": 7.642798134815943e-08,
      "loss": 2.5095,
      "step": 8580
    },
    {
      "epoch": 0.9509575999114358,
      "grad_norm": 16.041421606524025,
      "learning_rate": 7.309907712827192e-08,
      "loss": 2.4647,
      "step": 8590
    },
    {
      "epoch": 0.9520646518321709,
      "grad_norm": 15.859909299358135,
      "learning_rate": 6.984375951676614e-08,
      "loss": 2.5593,
      "step": 8600
    },
    {
      "epoch": 0.953171703752906,
      "grad_norm": 19.216229700494758,
      "learning_rate": 6.66620771340215e-08,
      "loss": 2.3626,
      "step": 8610
    },
    {
      "epoch": 0.9542787556736411,
      "grad_norm": 17.889324656581575,
      "learning_rate": 6.355407750062215e-08,
      "loss": 2.6562,
      "step": 8620
    },
    {
      "epoch": 0.9553858075943762,
      "grad_norm": 13.458822428770242,
      "learning_rate": 6.051980703665138e-08,
      "loss": 2.3909,
      "step": 8630
    },
    {
      "epoch": 0.9564928595151112,
      "grad_norm": 17.008353277644698,
      "learning_rate": 5.755931106099788e-08,
      "loss": 2.4223,
      "step": 8640
    },
    {
      "epoch": 0.9575999114358463,
      "grad_norm": 16.78426968156743,
      "learning_rate": 5.4672633790677775e-08,
      "loss": 2.6265,
      "step": 8650
    },
    {
      "epoch": 0.9587069633565815,
      "grad_norm": 17.958386496220644,
      "learning_rate": 5.185981834017473e-08,
      "loss": 2.5093,
      "step": 8660
    },
    {
      "epoch": 0.9598140152773165,
      "grad_norm": 17.46930815569884,
      "learning_rate": 4.91209067207965e-08,
      "loss": 2.4249,
      "step": 8670
    },
    {
      "epoch": 0.9609210671980516,
      "grad_norm": 17.891927563958056,
      "learning_rate": 4.645593984004604e-08,
      "loss": 2.533,
      "step": 8680
    },
    {
      "epoch": 0.9620281191187867,
      "grad_norm": 13.675101972798346,
      "learning_rate": 4.386495750101194e-08,
      "loss": 2.4507,
      "step": 8690
    },
    {
      "epoch": 0.9631351710395217,
      "grad_norm": 16.01872970692231,
      "learning_rate": 4.1347998401773945e-08,
      "loss": 2.4702,
      "step": 8700
    },
    {
      "epoch": 0.9642422229602569,
      "grad_norm": 17.620120107441487,
      "learning_rate": 3.890510013482396e-08,
      "loss": 2.3592,
      "step": 8710
    },
    {
      "epoch": 0.9653492748809919,
      "grad_norm": 13.329706465049831,
      "learning_rate": 3.653629918650536e-08,
      "loss": 2.4662,
      "step": 8720
    },
    {
      "epoch": 0.966456326801727,
      "grad_norm": 14.570283074571352,
      "learning_rate": 3.424163093646682e-08,
      "loss": 2.3495,
      "step": 8730
    },
    {
      "epoch": 0.9675633787224621,
      "grad_norm": 13.873984864746625,
      "learning_rate": 3.202112965713655e-08,
      "loss": 2.367,
      "step": 8740
    },
    {
      "epoch": 0.9686704306431971,
      "grad_norm": 13.467781119638207,
      "learning_rate": 2.987482851320778e-08,
      "loss": 2.3987,
      "step": 8750
    },
    {
      "epoch": 0.9697774825639323,
      "grad_norm": 15.489672705466763,
      "learning_rate": 2.7802759561144088e-08,
      "loss": 2.425,
      "step": 8760
    },
    {
      "epoch": 0.9708845344846674,
      "grad_norm": 20.126982289743573,
      "learning_rate": 2.580495374870151e-08,
      "loss": 2.5085,
      "step": 8770
    },
    {
      "epoch": 0.9719915864054024,
      "grad_norm": 16.778268839880404,
      "learning_rate": 2.388144091446498e-08,
      "loss": 2.463,
      "step": 8780
    },
    {
      "epoch": 0.9730986383261375,
      "grad_norm": 20.49911635255473,
      "learning_rate": 2.2032249787404258e-08,
      "loss": 2.5278,
      "step": 8790
    },
    {
      "epoch": 0.9742056902468725,
      "grad_norm": 16.182685372782867,
      "learning_rate": 2.0257407986443713e-08,
      "loss": 2.4702,
      "step": 8800
    },
    {
      "epoch": 0.9753127421676077,
      "grad_norm": 14.885149821948326,
      "learning_rate": 1.8556942020049872e-08,
      "loss": 2.5026,
      "step": 8810
    },
    {
      "epoch": 0.9764197940883428,
      "grad_norm": 18.03209004223668,
      "learning_rate": 1.6930877285835644e-08,
      "loss": 2.5576,
      "step": 8820
    },
    {
      "epoch": 0.9775268460090778,
      "grad_norm": 15.861290907259685,
      "learning_rate": 1.5379238070181158e-08,
      "loss": 2.5681,
      "step": 8830
    },
    {
      "epoch": 0.9786338979298129,
      "grad_norm": 16.532161800217157,
      "learning_rate": 1.3902047547871278e-08,
      "loss": 2.4926,
      "step": 8840
    },
    {
      "epoch": 0.9797409498505479,
      "grad_norm": 14.626301967154978,
      "learning_rate": 1.2499327781748116e-08,
      "loss": 2.4547,
      "step": 8850
    },
    {
      "epoch": 0.9808480017712831,
      "grad_norm": 18.74363118033889,
      "learning_rate": 1.1171099722383506e-08,
      "loss": 2.5054,
      "step": 8860
    },
    {
      "epoch": 0.9819550536920182,
      "grad_norm": 17.476707949807594,
      "learning_rate": 9.917383207765363e-09,
      "loss": 2.4136,
      "step": 8870
    },
    {
      "epoch": 0.9830621056127532,
      "grad_norm": 15.362525353056075,
      "learning_rate": 8.738196962999601e-09,
      "loss": 2.5267,
      "step": 8880
    },
    {
      "epoch": 0.9841691575334883,
      "grad_norm": 15.763132841414992,
      "learning_rate": 7.633558600033675e-09,
      "loss": 2.4059,
      "step": 8890
    },
    {
      "epoch": 0.9852762094542235,
      "grad_norm": 16.971970282791062,
      "learning_rate": 6.603484617390688e-09,
      "loss": 2.5169,
      "step": 8900
    },
    {
      "epoch": 0.9863832613749585,
      "grad_norm": 16.309961470302227,
      "learning_rate": 5.647990399924031e-09,
      "loss": 2.4272,
      "step": 8910
    },
    {
      "epoch": 0.9874903132956936,
      "grad_norm": 16.05736958282543,
      "learning_rate": 4.767090218589232e-09,
      "loss": 2.5884,
      "step": 8920
    },
    {
      "epoch": 0.9885973652164286,
      "grad_norm": 15.10152496394929,
      "learning_rate": 3.960797230227465e-09,
      "loss": 2.5573,
      "step": 8930
    },
    {
      "epoch": 0.9897044171371637,
      "grad_norm": 14.741682018743976,
      "learning_rate": 3.2291234773718093e-09,
      "loss": 2.3819,
      "step": 8940
    },
    {
      "epoch": 0.9908114690578989,
      "grad_norm": 15.440675776317423,
      "learning_rate": 2.5720798880662922e-09,
      "loss": 2.4611,
      "step": 8950
    },
    {
      "epoch": 0.9919185209786339,
      "grad_norm": 13.979973708890682,
      "learning_rate": 1.989676275702679e-09,
      "loss": 2.4037,
      "step": 8960
    },
    {
      "epoch": 0.993025572899369,
      "grad_norm": 19.373337099900795,
      "learning_rate": 1.4819213388744814e-09,
      "loss": 2.4966,
      "step": 8970
    },
    {
      "epoch": 0.9941326248201041,
      "grad_norm": 17.103724133893802,
      "learning_rate": 1.0488226612459517e-09,
      "loss": 2.505,
      "step": 8980
    },
    {
      "epoch": 0.9952396767408391,
      "grad_norm": 16.90633557993371,
      "learning_rate": 6.903867114393947e-10,
      "loss": 2.5781,
      "step": 8990
    },
    {
      "epoch": 0.9963467286615743,
      "grad_norm": 16.59692250103923,
      "learning_rate": 4.0661884293913266e-10,
      "loss": 2.5521,
      "step": 9000
    },
    {
      "epoch": 0.9974537805823093,
      "grad_norm": 15.318767567204494,
      "learning_rate": 1.97523294011015e-10,
      "loss": 2.488,
      "step": 9010
    },
    {
      "epoch": 0.9985608325030444,
      "grad_norm": 14.806481544474932,
      "learning_rate": 6.310318763858014e-11,
      "loss": 2.4538,
      "step": 9020
    },
    {
      "epoch": 0.9996678844237795,
      "grad_norm": 16.39588765945003,
      "learning_rate": 3.360531477536455e-12,
      "loss": 2.4834,
      "step": 9030
    },
    {
      "epoch": 1.0,
      "step": 9033,
      "total_flos": 227316538671104.0,
      "train_loss": 2.714383109890978,
      "train_runtime": 83244.0657,
      "train_samples_per_second": 1.085,
      "train_steps_per_second": 0.109
    }
  ],
  "logging_steps": 10,
  "max_steps": 9033,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 2000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 227316538671104.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}