diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,6395 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 2000, + "global_step": 9033, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0011070519207350825, + "grad_norm": 55.698127642695866, + "learning_rate": 1.1061946902654869e-07, + "loss": 4.4914, + "step": 10 + }, + { + "epoch": 0.002214103841470165, + "grad_norm": 41.723334569659066, + "learning_rate": 2.2123893805309737e-07, + "loss": 4.4954, + "step": 20 + }, + { + "epoch": 0.0033211557622052474, + "grad_norm": 43.79884898454788, + "learning_rate": 3.318584070796461e-07, + "loss": 4.296, + "step": 30 + }, + { + "epoch": 0.00442820768294033, + "grad_norm": 31.30694571868551, + "learning_rate": 4.4247787610619474e-07, + "loss": 3.9204, + "step": 40 + }, + { + "epoch": 0.0055352596036754124, + "grad_norm": 30.761018090708117, + "learning_rate": 5.530973451327435e-07, + "loss": 3.4951, + "step": 50 + }, + { + "epoch": 0.006642311524410495, + "grad_norm": 32.180378162249326, + "learning_rate": 6.637168141592922e-07, + "loss": 3.4177, + "step": 60 + }, + { + "epoch": 0.007749363445145577, + "grad_norm": 24.157005825357814, + "learning_rate": 7.743362831858408e-07, + "loss": 3.3864, + "step": 70 + }, + { + "epoch": 0.00885641536588066, + "grad_norm": 29.036937778457148, + "learning_rate": 8.849557522123895e-07, + "loss": 3.1996, + "step": 80 + }, + { + "epoch": 0.009963467286615742, + "grad_norm": 23.44273601537366, + "learning_rate": 9.95575221238938e-07, + "loss": 3.2652, + "step": 90 + }, + { + "epoch": 0.011070519207350825, + "grad_norm": 31.552666658205744, + "learning_rate": 1.106194690265487e-06, + "loss": 3.2654, + "step": 100 + }, + { + "epoch": 0.012177571128085908, + "grad_norm": 23.158016097255164, + "learning_rate": 1.2168141592920355e-06, + "loss": 3.1954, + "step": 110 + }, + { + "epoch": 0.01328462304882099, + "grad_norm": 31.510121445270936, + "learning_rate": 1.3274336283185843e-06, + "loss": 3.2016, + "step": 120 + }, + { + "epoch": 0.014391674969556073, + "grad_norm": 29.02944479343648, + "learning_rate": 1.438053097345133e-06, + "loss": 3.202, + "step": 130 + }, + { + "epoch": 0.015498726890291154, + "grad_norm": 24.737793045013742, + "learning_rate": 1.5486725663716816e-06, + "loss": 3.054, + "step": 140 + }, + { + "epoch": 0.016605778811026237, + "grad_norm": 25.982953823095542, + "learning_rate": 1.6592920353982304e-06, + "loss": 3.1637, + "step": 150 + }, + { + "epoch": 0.01771283073176132, + "grad_norm": 24.62246443187751, + "learning_rate": 1.769911504424779e-06, + "loss": 3.0422, + "step": 160 + }, + { + "epoch": 0.018819882652496404, + "grad_norm": 24.996096258559348, + "learning_rate": 1.8805309734513274e-06, + "loss": 2.9983, + "step": 170 + }, + { + "epoch": 0.019926934573231483, + "grad_norm": 30.197182446028002, + "learning_rate": 1.991150442477876e-06, + "loss": 3.0625, + "step": 180 + }, + { + "epoch": 0.021033986493966567, + "grad_norm": 25.648689604176077, + "learning_rate": 2.101769911504425e-06, + "loss": 3.2172, + "step": 190 + }, + { + "epoch": 0.02214103841470165, + "grad_norm": 30.003866974191762, + "learning_rate": 2.212389380530974e-06, + "loss": 3.0895, + "step": 200 + }, + { + "epoch": 0.023248090335436733, + "grad_norm": 27.11239518865144, + "learning_rate": 2.3230088495575224e-06, + "loss": 2.9847, + "step": 210 + }, + { + "epoch": 
0.024355142256171816, + "grad_norm": 24.532976628512625, + "learning_rate": 2.433628318584071e-06, + "loss": 3.0439, + "step": 220 + }, + { + "epoch": 0.025462194176906896, + "grad_norm": 25.595931432489675, + "learning_rate": 2.5442477876106196e-06, + "loss": 2.9722, + "step": 230 + }, + { + "epoch": 0.02656924609764198, + "grad_norm": 24.927511131743852, + "learning_rate": 2.6548672566371687e-06, + "loss": 3.0965, + "step": 240 + }, + { + "epoch": 0.027676298018377062, + "grad_norm": 21.477421706673375, + "learning_rate": 2.765486725663717e-06, + "loss": 2.9589, + "step": 250 + }, + { + "epoch": 0.028783349939112145, + "grad_norm": 26.975518213759347, + "learning_rate": 2.876106194690266e-06, + "loss": 2.9878, + "step": 260 + }, + { + "epoch": 0.029890401859847225, + "grad_norm": 25.199714957692397, + "learning_rate": 2.9867256637168145e-06, + "loss": 3.0718, + "step": 270 + }, + { + "epoch": 0.03099745378058231, + "grad_norm": 25.911079775392583, + "learning_rate": 3.097345132743363e-06, + "loss": 3.1289, + "step": 280 + }, + { + "epoch": 0.03210450570131739, + "grad_norm": 22.333848218066173, + "learning_rate": 3.2079646017699117e-06, + "loss": 3.019, + "step": 290 + }, + { + "epoch": 0.033211557622052475, + "grad_norm": 27.77592678094656, + "learning_rate": 3.3185840707964607e-06, + "loss": 3.0679, + "step": 300 + }, + { + "epoch": 0.03431860954278756, + "grad_norm": 25.520987098358336, + "learning_rate": 3.429203539823009e-06, + "loss": 3.0455, + "step": 310 + }, + { + "epoch": 0.03542566146352264, + "grad_norm": 30.694095405699013, + "learning_rate": 3.539823008849558e-06, + "loss": 2.9686, + "step": 320 + }, + { + "epoch": 0.036532713384257724, + "grad_norm": 34.07278189685705, + "learning_rate": 3.6504424778761066e-06, + "loss": 3.0074, + "step": 330 + }, + { + "epoch": 0.03763976530499281, + "grad_norm": 23.233595027296616, + "learning_rate": 3.7610619469026547e-06, + "loss": 2.9906, + "step": 340 + }, + { + "epoch": 0.038746817225727884, + "grad_norm": 21.65008179710679, + "learning_rate": 3.871681415929203e-06, + "loss": 2.9965, + "step": 350 + }, + { + "epoch": 0.03985386914646297, + "grad_norm": 25.432398948327197, + "learning_rate": 3.982300884955752e-06, + "loss": 2.9583, + "step": 360 + }, + { + "epoch": 0.04096092106719805, + "grad_norm": 24.118552348993813, + "learning_rate": 4.092920353982301e-06, + "loss": 2.9629, + "step": 370 + }, + { + "epoch": 0.04206797298793313, + "grad_norm": 28.535820173184682, + "learning_rate": 4.20353982300885e-06, + "loss": 3.0437, + "step": 380 + }, + { + "epoch": 0.043175024908668216, + "grad_norm": 27.574173741552002, + "learning_rate": 4.314159292035399e-06, + "loss": 2.9642, + "step": 390 + }, + { + "epoch": 0.0442820768294033, + "grad_norm": 27.270408053929884, + "learning_rate": 4.424778761061948e-06, + "loss": 3.0859, + "step": 400 + }, + { + "epoch": 0.04538912875013838, + "grad_norm": 27.57691676783791, + "learning_rate": 4.535398230088496e-06, + "loss": 3.009, + "step": 410 + }, + { + "epoch": 0.046496180670873466, + "grad_norm": 23.96155441996071, + "learning_rate": 4.646017699115045e-06, + "loss": 2.9363, + "step": 420 + }, + { + "epoch": 0.04760323259160855, + "grad_norm": 24.279797643812547, + "learning_rate": 4.756637168141594e-06, + "loss": 3.061, + "step": 430 + }, + { + "epoch": 0.04871028451234363, + "grad_norm": 25.19191068246207, + "learning_rate": 4.867256637168142e-06, + "loss": 2.9153, + "step": 440 + }, + { + "epoch": 0.04981733643307871, + "grad_norm": 28.82056510425203, + "learning_rate": 4.97787610619469e-06, + 
"loss": 3.1449, + "step": 450 + }, + { + "epoch": 0.05092438835381379, + "grad_norm": 23.600325136397633, + "learning_rate": 5.088495575221239e-06, + "loss": 3.0081, + "step": 460 + }, + { + "epoch": 0.052031440274548875, + "grad_norm": 23.484025108009725, + "learning_rate": 5.1991150442477875e-06, + "loss": 3.0463, + "step": 470 + }, + { + "epoch": 0.05313849219528396, + "grad_norm": 19.983575881103242, + "learning_rate": 5.309734513274337e-06, + "loss": 3.0811, + "step": 480 + }, + { + "epoch": 0.05424554411601904, + "grad_norm": 21.728377820671874, + "learning_rate": 5.4203539823008855e-06, + "loss": 3.1061, + "step": 490 + }, + { + "epoch": 0.055352596036754124, + "grad_norm": 34.21680389185442, + "learning_rate": 5.530973451327434e-06, + "loss": 3.0082, + "step": 500 + }, + { + "epoch": 0.05645964795748921, + "grad_norm": 28.98744881318429, + "learning_rate": 5.641592920353984e-06, + "loss": 2.9659, + "step": 510 + }, + { + "epoch": 0.05756669987822429, + "grad_norm": 21.719020231412607, + "learning_rate": 5.752212389380532e-06, + "loss": 2.9689, + "step": 520 + }, + { + "epoch": 0.058673751798959374, + "grad_norm": 26.343484772343533, + "learning_rate": 5.86283185840708e-06, + "loss": 3.0181, + "step": 530 + }, + { + "epoch": 0.05978080371969445, + "grad_norm": 26.674266585106718, + "learning_rate": 5.973451327433629e-06, + "loss": 2.9782, + "step": 540 + }, + { + "epoch": 0.06088785564042953, + "grad_norm": 24.29263386559663, + "learning_rate": 6.084070796460177e-06, + "loss": 3.0291, + "step": 550 + }, + { + "epoch": 0.06199490756116462, + "grad_norm": 27.260031480591426, + "learning_rate": 6.194690265486726e-06, + "loss": 3.0252, + "step": 560 + }, + { + "epoch": 0.0631019594818997, + "grad_norm": 20.957832212139657, + "learning_rate": 6.305309734513275e-06, + "loss": 3.0388, + "step": 570 + }, + { + "epoch": 0.06420901140263478, + "grad_norm": 26.9583565130981, + "learning_rate": 6.415929203539823e-06, + "loss": 2.9987, + "step": 580 + }, + { + "epoch": 0.06531606332336987, + "grad_norm": 23.667021249704298, + "learning_rate": 6.526548672566372e-06, + "loss": 2.9786, + "step": 590 + }, + { + "epoch": 0.06642311524410495, + "grad_norm": 24.584436820766868, + "learning_rate": 6.6371681415929215e-06, + "loss": 3.0082, + "step": 600 + }, + { + "epoch": 0.06753016716484003, + "grad_norm": 28.424068265725914, + "learning_rate": 6.74778761061947e-06, + "loss": 3.1212, + "step": 610 + }, + { + "epoch": 0.06863721908557512, + "grad_norm": 21.704948850763316, + "learning_rate": 6.858407079646018e-06, + "loss": 3.0008, + "step": 620 + }, + { + "epoch": 0.0697442710063102, + "grad_norm": 25.82364197800952, + "learning_rate": 6.969026548672567e-06, + "loss": 2.9993, + "step": 630 + }, + { + "epoch": 0.07085132292704528, + "grad_norm": 23.887813042264725, + "learning_rate": 7.079646017699116e-06, + "loss": 2.9319, + "step": 640 + }, + { + "epoch": 0.07195837484778037, + "grad_norm": 26.62784975319365, + "learning_rate": 7.190265486725664e-06, + "loss": 2.9158, + "step": 650 + }, + { + "epoch": 0.07306542676851545, + "grad_norm": 25.598475891481986, + "learning_rate": 7.300884955752213e-06, + "loss": 3.0746, + "step": 660 + }, + { + "epoch": 0.07417247868925053, + "grad_norm": 19.384106471975148, + "learning_rate": 7.411504424778761e-06, + "loss": 2.9683, + "step": 670 + }, + { + "epoch": 0.07527953060998561, + "grad_norm": 22.58174336593009, + "learning_rate": 7.5221238938053095e-06, + "loss": 2.9548, + "step": 680 + }, + { + "epoch": 0.07638658253072068, + "grad_norm": 23.253222270880613, + 
"learning_rate": 7.632743362831859e-06, + "loss": 2.9424, + "step": 690 + }, + { + "epoch": 0.07749363445145577, + "grad_norm": 24.53761264564241, + "learning_rate": 7.743362831858407e-06, + "loss": 2.9999, + "step": 700 + }, + { + "epoch": 0.07860068637219085, + "grad_norm": 22.215828742213887, + "learning_rate": 7.853982300884957e-06, + "loss": 2.9638, + "step": 710 + }, + { + "epoch": 0.07970773829292593, + "grad_norm": 23.926002163576186, + "learning_rate": 7.964601769911505e-06, + "loss": 2.9937, + "step": 720 + }, + { + "epoch": 0.08081479021366102, + "grad_norm": 24.414616790878906, + "learning_rate": 8.075221238938053e-06, + "loss": 2.9732, + "step": 730 + }, + { + "epoch": 0.0819218421343961, + "grad_norm": 23.388402347902353, + "learning_rate": 8.185840707964603e-06, + "loss": 2.9107, + "step": 740 + }, + { + "epoch": 0.08302889405513118, + "grad_norm": 24.124270687360198, + "learning_rate": 8.296460176991151e-06, + "loss": 2.9869, + "step": 750 + }, + { + "epoch": 0.08413594597586627, + "grad_norm": 21.86924086571945, + "learning_rate": 8.4070796460177e-06, + "loss": 3.0616, + "step": 760 + }, + { + "epoch": 0.08524299789660135, + "grad_norm": 29.125772286493696, + "learning_rate": 8.517699115044249e-06, + "loss": 2.9174, + "step": 770 + }, + { + "epoch": 0.08635004981733643, + "grad_norm": 25.471433455609642, + "learning_rate": 8.628318584070797e-06, + "loss": 3.0338, + "step": 780 + }, + { + "epoch": 0.08745710173807152, + "grad_norm": 24.06665529849035, + "learning_rate": 8.738938053097345e-06, + "loss": 3.0321, + "step": 790 + }, + { + "epoch": 0.0885641536588066, + "grad_norm": 18.292126722435007, + "learning_rate": 8.849557522123895e-06, + "loss": 2.9429, + "step": 800 + }, + { + "epoch": 0.08967120557954168, + "grad_norm": 22.110943430558972, + "learning_rate": 8.960176991150443e-06, + "loss": 2.8389, + "step": 810 + }, + { + "epoch": 0.09077825750027677, + "grad_norm": 24.83606400487908, + "learning_rate": 9.070796460176992e-06, + "loss": 2.9753, + "step": 820 + }, + { + "epoch": 0.09188530942101185, + "grad_norm": 25.95232011272635, + "learning_rate": 9.181415929203542e-06, + "loss": 3.043, + "step": 830 + }, + { + "epoch": 0.09299236134174693, + "grad_norm": 20.659162690961626, + "learning_rate": 9.29203539823009e-06, + "loss": 2.9343, + "step": 840 + }, + { + "epoch": 0.09409941326248202, + "grad_norm": 25.45459790239467, + "learning_rate": 9.402654867256638e-06, + "loss": 3.0285, + "step": 850 + }, + { + "epoch": 0.0952064651832171, + "grad_norm": 24.920778384975627, + "learning_rate": 9.513274336283188e-06, + "loss": 3.008, + "step": 860 + }, + { + "epoch": 0.09631351710395218, + "grad_norm": 23.893946109752218, + "learning_rate": 9.623893805309736e-06, + "loss": 3.0592, + "step": 870 + }, + { + "epoch": 0.09742056902468726, + "grad_norm": 23.476211841831407, + "learning_rate": 9.734513274336284e-06, + "loss": 3.0277, + "step": 880 + }, + { + "epoch": 0.09852762094542233, + "grad_norm": 32.349500707592966, + "learning_rate": 9.845132743362832e-06, + "loss": 3.0481, + "step": 890 + }, + { + "epoch": 0.09963467286615742, + "grad_norm": 23.23199056988143, + "learning_rate": 9.95575221238938e-06, + "loss": 2.9412, + "step": 900 + }, + { + "epoch": 0.1007417247868925, + "grad_norm": 23.773772040090407, + "learning_rate": 9.999986557878607e-06, + "loss": 2.911, + "step": 910 + }, + { + "epoch": 0.10184877670762758, + "grad_norm": 21.93860193010473, + "learning_rate": 9.999904411842942e-06, + "loss": 3.0976, + "step": 920 + }, + { + "epoch": 0.10295582862836267, + 
"grad_norm": 26.162102122715467, + "learning_rate": 9.999747588842252e-06, + "loss": 2.8653, + "step": 930 + }, + { + "epoch": 0.10406288054909775, + "grad_norm": 24.79079368386082, + "learning_rate": 9.999516091218793e-06, + "loss": 3.0475, + "step": 940 + }, + { + "epoch": 0.10516993246983283, + "grad_norm": 22.659978907939497, + "learning_rate": 9.999209922430137e-06, + "loss": 2.9725, + "step": 950 + }, + { + "epoch": 0.10627698439056792, + "grad_norm": 24.881751031281876, + "learning_rate": 9.99882908704913e-06, + "loss": 2.9832, + "step": 960 + }, + { + "epoch": 0.107384036311303, + "grad_norm": 19.289141804905892, + "learning_rate": 9.998373590763798e-06, + "loss": 2.9333, + "step": 970 + }, + { + "epoch": 0.10849108823203808, + "grad_norm": 25.233514814542282, + "learning_rate": 9.997843440377293e-06, + "loss": 3.1247, + "step": 980 + }, + { + "epoch": 0.10959814015277317, + "grad_norm": 22.529791978010802, + "learning_rate": 9.997238643807768e-06, + "loss": 3.0009, + "step": 990 + }, + { + "epoch": 0.11070519207350825, + "grad_norm": 26.994079486651124, + "learning_rate": 9.996559210088272e-06, + "loss": 3.0359, + "step": 1000 + }, + { + "epoch": 0.11181224399424333, + "grad_norm": 23.04614956134999, + "learning_rate": 9.995805149366607e-06, + "loss": 2.9097, + "step": 1010 + }, + { + "epoch": 0.11291929591497842, + "grad_norm": 26.498780600053372, + "learning_rate": 9.994976472905184e-06, + "loss": 3.045, + "step": 1020 + }, + { + "epoch": 0.1140263478357135, + "grad_norm": 20.370834631825762, + "learning_rate": 9.994073193080844e-06, + "loss": 2.9198, + "step": 1030 + }, + { + "epoch": 0.11513339975644858, + "grad_norm": 18.931594900754984, + "learning_rate": 9.993095323384688e-06, + "loss": 2.9937, + "step": 1040 + }, + { + "epoch": 0.11624045167718366, + "grad_norm": 25.23074417182063, + "learning_rate": 9.992042878421862e-06, + "loss": 2.9846, + "step": 1050 + }, + { + "epoch": 0.11734750359791875, + "grad_norm": 22.748633865558133, + "learning_rate": 9.990915873911346e-06, + "loss": 3.0222, + "step": 1060 + }, + { + "epoch": 0.11845455551865383, + "grad_norm": 20.500684992552053, + "learning_rate": 9.989714326685715e-06, + "loss": 3.0954, + "step": 1070 + }, + { + "epoch": 0.1195616074393889, + "grad_norm": 18.438452324633406, + "learning_rate": 9.988438254690896e-06, + "loss": 2.9079, + "step": 1080 + }, + { + "epoch": 0.12066865936012398, + "grad_norm": 20.9380161875694, + "learning_rate": 9.987087676985886e-06, + "loss": 3.042, + "step": 1090 + }, + { + "epoch": 0.12177571128085907, + "grad_norm": 21.96145242279915, + "learning_rate": 9.985662613742483e-06, + "loss": 3.0928, + "step": 1100 + }, + { + "epoch": 0.12288276320159415, + "grad_norm": 22.04573397808545, + "learning_rate": 9.984163086244971e-06, + "loss": 3.1986, + "step": 1110 + }, + { + "epoch": 0.12398981512232923, + "grad_norm": 22.85309097193917, + "learning_rate": 9.982589116889811e-06, + "loss": 3.0349, + "step": 1120 + }, + { + "epoch": 0.12509686704306433, + "grad_norm": 22.5326581537924, + "learning_rate": 9.980940729185305e-06, + "loss": 3.0092, + "step": 1130 + }, + { + "epoch": 0.1262039189637994, + "grad_norm": 23.18944415182163, + "learning_rate": 9.97921794775124e-06, + "loss": 2.952, + "step": 1140 + }, + { + "epoch": 0.1273109708845345, + "grad_norm": 21.06330535416755, + "learning_rate": 9.977420798318527e-06, + "loss": 2.9854, + "step": 1150 + }, + { + "epoch": 0.12841802280526957, + "grad_norm": 20.966301469920268, + "learning_rate": 9.975549307728812e-06, + "loss": 2.9179, + "step": 1160 + 
}, + { + "epoch": 0.12952507472600466, + "grad_norm": 20.657024404262373, + "learning_rate": 9.973603503934077e-06, + "loss": 2.9828, + "step": 1170 + }, + { + "epoch": 0.13063212664673973, + "grad_norm": 23.18808360085381, + "learning_rate": 9.97158341599622e-06, + "loss": 2.8795, + "step": 1180 + }, + { + "epoch": 0.1317391785674748, + "grad_norm": 19.586003898036246, + "learning_rate": 9.969489074086626e-06, + "loss": 2.9715, + "step": 1190 + }, + { + "epoch": 0.1328462304882099, + "grad_norm": 23.666001778535268, + "learning_rate": 9.967320509485715e-06, + "loss": 3.0556, + "step": 1200 + }, + { + "epoch": 0.13395328240894497, + "grad_norm": 20.020096724757796, + "learning_rate": 9.965077754582468e-06, + "loss": 2.925, + "step": 1210 + }, + { + "epoch": 0.13506033432968007, + "grad_norm": 24.015238653225634, + "learning_rate": 9.962760842873952e-06, + "loss": 2.9019, + "step": 1220 + }, + { + "epoch": 0.13616738625041513, + "grad_norm": 30.05960379166683, + "learning_rate": 9.960369808964816e-06, + "loss": 2.984, + "step": 1230 + }, + { + "epoch": 0.13727443817115023, + "grad_norm": 19.296451414183455, + "learning_rate": 9.957904688566774e-06, + "loss": 2.9919, + "step": 1240 + }, + { + "epoch": 0.1383814900918853, + "grad_norm": 20.11171003157378, + "learning_rate": 9.95536551849807e-06, + "loss": 2.939, + "step": 1250 + }, + { + "epoch": 0.1394885420126204, + "grad_norm": 24.97381642097054, + "learning_rate": 9.952752336682933e-06, + "loss": 3.0819, + "step": 1260 + }, + { + "epoch": 0.14059559393335547, + "grad_norm": 19.11189833758423, + "learning_rate": 9.950065182151007e-06, + "loss": 2.9558, + "step": 1270 + }, + { + "epoch": 0.14170264585409056, + "grad_norm": 22.339401966128147, + "learning_rate": 9.947304095036768e-06, + "loss": 2.971, + "step": 1280 + }, + { + "epoch": 0.14280969777482563, + "grad_norm": 20.896158530865005, + "learning_rate": 9.944469116578925e-06, + "loss": 2.9734, + "step": 1290 + }, + { + "epoch": 0.14391674969556073, + "grad_norm": 21.80035128152707, + "learning_rate": 9.941560289119808e-06, + "loss": 3.0756, + "step": 1300 + }, + { + "epoch": 0.1450238016162958, + "grad_norm": 22.803461112332005, + "learning_rate": 9.938577656104725e-06, + "loss": 2.8886, + "step": 1310 + }, + { + "epoch": 0.1461308535370309, + "grad_norm": 19.045841307524757, + "learning_rate": 9.935521262081324e-06, + "loss": 2.9949, + "step": 1320 + }, + { + "epoch": 0.14723790545776597, + "grad_norm": 21.269082405436986, + "learning_rate": 9.932391152698926e-06, + "loss": 3.1047, + "step": 1330 + }, + { + "epoch": 0.14834495737850106, + "grad_norm": 24.520690144049905, + "learning_rate": 9.929187374707836e-06, + "loss": 2.9404, + "step": 1340 + }, + { + "epoch": 0.14945200929923613, + "grad_norm": 22.56252212345693, + "learning_rate": 9.925909975958655e-06, + "loss": 2.9609, + "step": 1350 + }, + { + "epoch": 0.15055906121997123, + "grad_norm": 18.509241308235815, + "learning_rate": 9.922559005401555e-06, + "loss": 2.9581, + "step": 1360 + }, + { + "epoch": 0.1516661131407063, + "grad_norm": 21.078754308555286, + "learning_rate": 9.919134513085557e-06, + "loss": 3.0338, + "step": 1370 + }, + { + "epoch": 0.15277316506144137, + "grad_norm": 19.364617917203557, + "learning_rate": 9.915636550157776e-06, + "loss": 3.0394, + "step": 1380 + }, + { + "epoch": 0.15388021698217647, + "grad_norm": 12.87341952454837, + "learning_rate": 9.912065168862661e-06, + "loss": 2.8927, + "step": 1390 + }, + { + "epoch": 0.15498726890291153, + "grad_norm": 21.353822481564322, + "learning_rate": 
9.908420422541216e-06, + "loss": 2.9264, + "step": 1400 + }, + { + "epoch": 0.15609432082364663, + "grad_norm": 25.61409358483238, + "learning_rate": 9.9047023656302e-06, + "loss": 3.0722, + "step": 1410 + }, + { + "epoch": 0.1572013727443817, + "grad_norm": 18.98168487984158, + "learning_rate": 9.90091105366132e-06, + "loss": 3.0422, + "step": 1420 + }, + { + "epoch": 0.1583084246651168, + "grad_norm": 18.90201248838335, + "learning_rate": 9.897046543260384e-06, + "loss": 2.9686, + "step": 1430 + }, + { + "epoch": 0.15941547658585187, + "grad_norm": 19.145516912456003, + "learning_rate": 9.893108892146487e-06, + "loss": 2.9299, + "step": 1440 + }, + { + "epoch": 0.16052252850658696, + "grad_norm": 21.131608116342832, + "learning_rate": 9.889098159131112e-06, + "loss": 2.9767, + "step": 1450 + }, + { + "epoch": 0.16162958042732203, + "grad_norm": 23.100589010259966, + "learning_rate": 9.88501440411728e-06, + "loss": 2.9711, + "step": 1460 + }, + { + "epoch": 0.16273663234805713, + "grad_norm": 23.844195002755608, + "learning_rate": 9.88085768809865e-06, + "loss": 3.0006, + "step": 1470 + }, + { + "epoch": 0.1638436842687922, + "grad_norm": 21.595484978633603, + "learning_rate": 9.876628073158586e-06, + "loss": 2.8897, + "step": 1480 + }, + { + "epoch": 0.1649507361895273, + "grad_norm": 19.91645782320423, + "learning_rate": 9.872325622469263e-06, + "loss": 2.9626, + "step": 1490 + }, + { + "epoch": 0.16605778811026237, + "grad_norm": 22.954047655684626, + "learning_rate": 9.8679504002907e-06, + "loss": 2.9654, + "step": 1500 + }, + { + "epoch": 0.16716484003099746, + "grad_norm": 19.01781845067502, + "learning_rate": 9.863502471969811e-06, + "loss": 2.9689, + "step": 1510 + }, + { + "epoch": 0.16827189195173253, + "grad_norm": 23.51295361703636, + "learning_rate": 9.858981903939419e-06, + "loss": 2.9714, + "step": 1520 + }, + { + "epoch": 0.16937894387246763, + "grad_norm": 22.715802630980665, + "learning_rate": 9.85438876371728e-06, + "loss": 2.9433, + "step": 1530 + }, + { + "epoch": 0.1704859957932027, + "grad_norm": 19.235667821528295, + "learning_rate": 9.849723119905055e-06, + "loss": 2.8702, + "step": 1540 + }, + { + "epoch": 0.1715930477139378, + "grad_norm": 20.997083855056253, + "learning_rate": 9.844985042187305e-06, + "loss": 2.9613, + "step": 1550 + }, + { + "epoch": 0.17270009963467287, + "grad_norm": 19.327650896289015, + "learning_rate": 9.840174601330434e-06, + "loss": 2.9561, + "step": 1560 + }, + { + "epoch": 0.17380715155540793, + "grad_norm": 23.743647417758826, + "learning_rate": 9.835291869181638e-06, + "loss": 2.9465, + "step": 1570 + }, + { + "epoch": 0.17491420347614303, + "grad_norm": 21.24076642916138, + "learning_rate": 9.830336918667838e-06, + "loss": 2.9089, + "step": 1580 + }, + { + "epoch": 0.1760212553968781, + "grad_norm": 18.18531438353361, + "learning_rate": 9.82530982379458e-06, + "loss": 2.925, + "step": 1590 + }, + { + "epoch": 0.1771283073176132, + "grad_norm": 18.941367135114337, + "learning_rate": 9.820210659644938e-06, + "loss": 2.8847, + "step": 1600 + }, + { + "epoch": 0.17823535923834827, + "grad_norm": 21.6741338404853, + "learning_rate": 9.815039502378387e-06, + "loss": 2.8948, + "step": 1610 + }, + { + "epoch": 0.17934241115908336, + "grad_norm": 20.193862408863023, + "learning_rate": 9.80979642922967e-06, + "loss": 3.0728, + "step": 1620 + }, + { + "epoch": 0.18044946307981843, + "grad_norm": 18.820011578655564, + "learning_rate": 9.804481518507645e-06, + "loss": 2.9551, + "step": 1630 + }, + { + "epoch": 0.18155651500055353, + 
"grad_norm": 21.501952619775196, + "learning_rate": 9.799094849594107e-06, + "loss": 2.9621, + "step": 1640 + }, + { + "epoch": 0.1826635669212886, + "grad_norm": 25.610574065149102, + "learning_rate": 9.793636502942611e-06, + "loss": 2.8723, + "step": 1650 + }, + { + "epoch": 0.1837706188420237, + "grad_norm": 20.593228441794714, + "learning_rate": 9.78810656007727e-06, + "loss": 2.8278, + "step": 1660 + }, + { + "epoch": 0.18487767076275877, + "grad_norm": 19.172777347075332, + "learning_rate": 9.782505103591533e-06, + "loss": 2.9767, + "step": 1670 + }, + { + "epoch": 0.18598472268349386, + "grad_norm": 21.02145151466687, + "learning_rate": 9.776832217146952e-06, + "loss": 2.8362, + "step": 1680 + }, + { + "epoch": 0.18709177460422893, + "grad_norm": 20.621602691872784, + "learning_rate": 9.771087985471936e-06, + "loss": 3.0292, + "step": 1690 + }, + { + "epoch": 0.18819882652496403, + "grad_norm": 17.865789195071134, + "learning_rate": 9.765272494360483e-06, + "loss": 2.8839, + "step": 1700 + }, + { + "epoch": 0.1893058784456991, + "grad_norm": 18.637077859127157, + "learning_rate": 9.759385830670897e-06, + "loss": 2.8975, + "step": 1710 + }, + { + "epoch": 0.1904129303664342, + "grad_norm": 20.242466511532335, + "learning_rate": 9.753428082324496e-06, + "loss": 2.8949, + "step": 1720 + }, + { + "epoch": 0.19151998228716927, + "grad_norm": 19.93215544071779, + "learning_rate": 9.747399338304295e-06, + "loss": 3.0225, + "step": 1730 + }, + { + "epoch": 0.19262703420790436, + "grad_norm": 24.762070259664206, + "learning_rate": 9.741299688653676e-06, + "loss": 2.9459, + "step": 1740 + }, + { + "epoch": 0.19373408612863943, + "grad_norm": 19.63500693742026, + "learning_rate": 9.735129224475044e-06, + "loss": 2.8765, + "step": 1750 + }, + { + "epoch": 0.19484113804937453, + "grad_norm": 21.82483127686805, + "learning_rate": 9.72888803792847e-06, + "loss": 2.8684, + "step": 1760 + }, + { + "epoch": 0.1959481899701096, + "grad_norm": 19.049243439574713, + "learning_rate": 9.72257622223031e-06, + "loss": 2.9594, + "step": 1770 + }, + { + "epoch": 0.19705524189084467, + "grad_norm": 21.414348061773953, + "learning_rate": 9.716193871651814e-06, + "loss": 2.9053, + "step": 1780 + }, + { + "epoch": 0.19816229381157976, + "grad_norm": 17.876253403312774, + "learning_rate": 9.709741081517717e-06, + "loss": 2.8154, + "step": 1790 + }, + { + "epoch": 0.19926934573231483, + "grad_norm": 20.116361008310705, + "learning_rate": 9.703217948204821e-06, + "loss": 2.9732, + "step": 1800 + }, + { + "epoch": 0.20037639765304993, + "grad_norm": 18.744377270113645, + "learning_rate": 9.696624569140547e-06, + "loss": 2.8966, + "step": 1810 + }, + { + "epoch": 0.201483449573785, + "grad_norm": 19.280130238929477, + "learning_rate": 9.689961042801483e-06, + "loss": 2.8611, + "step": 1820 + }, + { + "epoch": 0.2025905014945201, + "grad_norm": 19.224045203920024, + "learning_rate": 9.68322746871192e-06, + "loss": 2.8985, + "step": 1830 + }, + { + "epoch": 0.20369755341525517, + "grad_norm": 23.351196637887085, + "learning_rate": 9.676423947442353e-06, + "loss": 2.9592, + "step": 1840 + }, + { + "epoch": 0.20480460533599026, + "grad_norm": 17.4707572244572, + "learning_rate": 9.66955058060799e-06, + "loss": 2.9347, + "step": 1850 + }, + { + "epoch": 0.20591165725672533, + "grad_norm": 20.54442740488183, + "learning_rate": 9.662607470867229e-06, + "loss": 2.8642, + "step": 1860 + }, + { + "epoch": 0.20701870917746043, + "grad_norm": 19.78074079042836, + "learning_rate": 9.655594721920124e-06, + "loss": 2.8779, + "step": 
1870 + }, + { + "epoch": 0.2081257610981955, + "grad_norm": 19.640164221789888, + "learning_rate": 9.648512438506841e-06, + "loss": 3.0375, + "step": 1880 + }, + { + "epoch": 0.2092328130189306, + "grad_norm": 17.525300444795423, + "learning_rate": 9.641360726406087e-06, + "loss": 2.9689, + "step": 1890 + }, + { + "epoch": 0.21033986493966567, + "grad_norm": 16.921755221767196, + "learning_rate": 9.634139692433534e-06, + "loss": 2.9311, + "step": 1900 + }, + { + "epoch": 0.21144691686040076, + "grad_norm": 23.573635502822672, + "learning_rate": 9.626849444440223e-06, + "loss": 3.1791, + "step": 1910 + }, + { + "epoch": 0.21255396878113583, + "grad_norm": 21.608288648771143, + "learning_rate": 9.619490091310959e-06, + "loss": 2.9152, + "step": 1920 + }, + { + "epoch": 0.21366102070187093, + "grad_norm": 21.984519688812558, + "learning_rate": 9.612061742962672e-06, + "loss": 2.8558, + "step": 1930 + }, + { + "epoch": 0.214768072622606, + "grad_norm": 20.401130440641623, + "learning_rate": 9.604564510342785e-06, + "loss": 2.8631, + "step": 1940 + }, + { + "epoch": 0.2158751245433411, + "grad_norm": 20.05203124505054, + "learning_rate": 9.596998505427556e-06, + "loss": 2.987, + "step": 1950 + }, + { + "epoch": 0.21698217646407617, + "grad_norm": 20.868561748558378, + "learning_rate": 9.589363841220398e-06, + "loss": 2.7379, + "step": 1960 + }, + { + "epoch": 0.21808922838481123, + "grad_norm": 22.537403308642126, + "learning_rate": 9.581660631750205e-06, + "loss": 2.9491, + "step": 1970 + }, + { + "epoch": 0.21919628030554633, + "grad_norm": 18.786633581936144, + "learning_rate": 9.573888992069635e-06, + "loss": 3.0325, + "step": 1980 + }, + { + "epoch": 0.2203033322262814, + "grad_norm": 20.183050798106528, + "learning_rate": 9.566049038253404e-06, + "loss": 2.8613, + "step": 1990 + }, + { + "epoch": 0.2214103841470165, + "grad_norm": 19.889860560476563, + "learning_rate": 9.558140887396539e-06, + "loss": 3.0076, + "step": 2000 + }, + { + "epoch": 0.2214103841470165, + "eval_loss": 2.899467945098877, + "eval_runtime": 2402.2319, + "eval_samples_per_second": 4.178, + "eval_steps_per_second": 0.418, + "step": 2000 + }, + { + "epoch": 0.22251743606775157, + "grad_norm": 20.918414604698, + "learning_rate": 9.55016465761264e-06, + "loss": 2.8974, + "step": 2010 + }, + { + "epoch": 0.22362448798848666, + "grad_norm": 18.12895807311221, + "learning_rate": 9.542120468032108e-06, + "loss": 2.8925, + "step": 2020 + }, + { + "epoch": 0.22473153990922173, + "grad_norm": 20.68008214687689, + "learning_rate": 9.534008438800378e-06, + "loss": 2.8954, + "step": 2030 + }, + { + "epoch": 0.22583859182995683, + "grad_norm": 19.62578662683229, + "learning_rate": 9.525828691076107e-06, + "loss": 2.9672, + "step": 2040 + }, + { + "epoch": 0.2269456437506919, + "grad_norm": 18.137624721398762, + "learning_rate": 9.517581347029378e-06, + "loss": 2.7592, + "step": 2050 + }, + { + "epoch": 0.228052695671427, + "grad_norm": 18.753830138125636, + "learning_rate": 9.509266529839872e-06, + "loss": 2.7837, + "step": 2060 + }, + { + "epoch": 0.22915974759216207, + "grad_norm": 17.672344029095868, + "learning_rate": 9.500884363695025e-06, + "loss": 2.8959, + "step": 2070 + }, + { + "epoch": 0.23026679951289716, + "grad_norm": 17.952725451957562, + "learning_rate": 9.492434973788176e-06, + "loss": 2.9146, + "step": 2080 + }, + { + "epoch": 0.23137385143363223, + "grad_norm": 21.49636205616348, + "learning_rate": 9.483918486316694e-06, + "loss": 2.9972, + "step": 2090 + }, + { + "epoch": 0.23248090335436733, + "grad_norm": 
17.872259773823583, + "learning_rate": 9.475335028480104e-06, + "loss": 2.9048, + "step": 2100 + }, + { + "epoch": 0.2335879552751024, + "grad_norm": 18.304493955091758, + "learning_rate": 9.466684728478167e-06, + "loss": 2.8832, + "step": 2110 + }, + { + "epoch": 0.2346950071958375, + "grad_norm": 20.521104550808733, + "learning_rate": 9.457967715508986e-06, + "loss": 2.9132, + "step": 2120 + }, + { + "epoch": 0.23580205911657257, + "grad_norm": 21.959898523340325, + "learning_rate": 9.449184119767066e-06, + "loss": 2.8827, + "step": 2130 + }, + { + "epoch": 0.23690911103730766, + "grad_norm": 17.838849413370237, + "learning_rate": 9.440334072441364e-06, + "loss": 2.9918, + "step": 2140 + }, + { + "epoch": 0.23801616295804273, + "grad_norm": 19.92878332099444, + "learning_rate": 9.431417705713348e-06, + "loss": 2.9768, + "step": 2150 + }, + { + "epoch": 0.2391232148787778, + "grad_norm": 22.052024784352827, + "learning_rate": 9.422435152755003e-06, + "loss": 2.7936, + "step": 2160 + }, + { + "epoch": 0.2402302667995129, + "grad_norm": 18.832979591486268, + "learning_rate": 9.41338654772685e-06, + "loss": 2.8846, + "step": 2170 + }, + { + "epoch": 0.24133731872024797, + "grad_norm": 20.56672086138257, + "learning_rate": 9.40427202577595e-06, + "loss": 2.9381, + "step": 2180 + }, + { + "epoch": 0.24244437064098306, + "grad_norm": 19.022342343144167, + "learning_rate": 9.39509172303387e-06, + "loss": 2.7231, + "step": 2190 + }, + { + "epoch": 0.24355142256171813, + "grad_norm": 18.365037787301343, + "learning_rate": 9.385845776614659e-06, + "loss": 2.8299, + "step": 2200 + }, + { + "epoch": 0.24465847448245323, + "grad_norm": 16.086771712151563, + "learning_rate": 9.3765343246128e-06, + "loss": 2.8833, + "step": 2210 + }, + { + "epoch": 0.2457655264031883, + "grad_norm": 17.286906565742285, + "learning_rate": 9.367157506101152e-06, + "loss": 2.8471, + "step": 2220 + }, + { + "epoch": 0.2468725783239234, + "grad_norm": 16.860767812467355, + "learning_rate": 9.35771546112886e-06, + "loss": 2.7524, + "step": 2230 + }, + { + "epoch": 0.24797963024465847, + "grad_norm": 22.69662113190212, + "learning_rate": 9.348208330719269e-06, + "loss": 2.9083, + "step": 2240 + }, + { + "epoch": 0.24908668216539356, + "grad_norm": 17.900886651161414, + "learning_rate": 9.338636256867826e-06, + "loss": 2.8428, + "step": 2250 + }, + { + "epoch": 0.25019373408612866, + "grad_norm": 16.680552099827924, + "learning_rate": 9.328999382539948e-06, + "loss": 2.8914, + "step": 2260 + }, + { + "epoch": 0.25130078600686373, + "grad_norm": 18.313897702064246, + "learning_rate": 9.319297851668893e-06, + "loss": 2.9034, + "step": 2270 + }, + { + "epoch": 0.2524078379275988, + "grad_norm": 16.947671537858998, + "learning_rate": 9.309531809153606e-06, + "loss": 2.8502, + "step": 2280 + }, + { + "epoch": 0.25351488984833387, + "grad_norm": 18.710427396365873, + "learning_rate": 9.29970140085656e-06, + "loss": 2.8524, + "step": 2290 + }, + { + "epoch": 0.254621941769069, + "grad_norm": 19.2567190717822, + "learning_rate": 9.28980677360157e-06, + "loss": 2.9991, + "step": 2300 + }, + { + "epoch": 0.25572899368980406, + "grad_norm": 18.050406635894987, + "learning_rate": 9.279848075171613e-06, + "loss": 2.8717, + "step": 2310 + }, + { + "epoch": 0.25683604561053913, + "grad_norm": 22.127493631791086, + "learning_rate": 9.269825454306605e-06, + "loss": 2.8977, + "step": 2320 + }, + { + "epoch": 0.2579430975312742, + "grad_norm": 18.821085236072186, + "learning_rate": 9.259739060701189e-06, + "loss": 2.9116, + "step": 2330 + }, + { 
+ "epoch": 0.2590501494520093, + "grad_norm": 19.277291605575755, + "learning_rate": 9.249589045002497e-06, + "loss": 2.9024, + "step": 2340 + }, + { + "epoch": 0.2601572013727444, + "grad_norm": 18.176543407022002, + "learning_rate": 9.239375558807901e-06, + "loss": 2.9065, + "step": 2350 + }, + { + "epoch": 0.26126425329347946, + "grad_norm": 17.55658292047273, + "learning_rate": 9.229098754662748e-06, + "loss": 2.7598, + "step": 2360 + }, + { + "epoch": 0.26237130521421453, + "grad_norm": 19.0666485097006, + "learning_rate": 9.218758786058084e-06, + "loss": 2.8376, + "step": 2370 + }, + { + "epoch": 0.2634783571349496, + "grad_norm": 19.066879018665727, + "learning_rate": 9.208355807428351e-06, + "loss": 2.8766, + "step": 2380 + }, + { + "epoch": 0.26458540905568473, + "grad_norm": 22.160566724834183, + "learning_rate": 9.197889974149096e-06, + "loss": 2.9115, + "step": 2390 + }, + { + "epoch": 0.2656924609764198, + "grad_norm": 18.716069674957527, + "learning_rate": 9.187361442534641e-06, + "loss": 2.913, + "step": 2400 + }, + { + "epoch": 0.26679951289715487, + "grad_norm": 21.86386868859532, + "learning_rate": 9.176770369835748e-06, + "loss": 3.0737, + "step": 2410 + }, + { + "epoch": 0.26790656481788994, + "grad_norm": 19.87740412211485, + "learning_rate": 9.166116914237277e-06, + "loss": 2.827, + "step": 2420 + }, + { + "epoch": 0.26901361673862506, + "grad_norm": 20.48966032173197, + "learning_rate": 9.155401234855814e-06, + "loss": 2.8279, + "step": 2430 + }, + { + "epoch": 0.27012066865936013, + "grad_norm": 18.939462945596684, + "learning_rate": 9.144623491737303e-06, + "loss": 2.8827, + "step": 2440 + }, + { + "epoch": 0.2712277205800952, + "grad_norm": 16.511411706489035, + "learning_rate": 9.133783845854649e-06, + "loss": 2.8858, + "step": 2450 + }, + { + "epoch": 0.27233477250083027, + "grad_norm": 17.12242102699232, + "learning_rate": 9.12288245910532e-06, + "loss": 3.0051, + "step": 2460 + }, + { + "epoch": 0.2734418244215654, + "grad_norm": 21.739631249295055, + "learning_rate": 9.111919494308921e-06, + "loss": 2.8119, + "step": 2470 + }, + { + "epoch": 0.27454887634230046, + "grad_norm": 19.136040590653046, + "learning_rate": 9.100895115204776e-06, + "loss": 2.9821, + "step": 2480 + }, + { + "epoch": 0.27565592826303553, + "grad_norm": 18.511511436982243, + "learning_rate": 9.08980948644946e-06, + "loss": 2.8592, + "step": 2490 + }, + { + "epoch": 0.2767629801837706, + "grad_norm": 20.212663617382482, + "learning_rate": 9.078662773614367e-06, + "loss": 2.9192, + "step": 2500 + }, + { + "epoch": 0.2778700321045057, + "grad_norm": 20.727838354887886, + "learning_rate": 9.067455143183213e-06, + "loss": 2.8882, + "step": 2510 + }, + { + "epoch": 0.2789770840252408, + "grad_norm": 20.387190015826864, + "learning_rate": 9.056186762549564e-06, + "loss": 2.8964, + "step": 2520 + }, + { + "epoch": 0.28008413594597587, + "grad_norm": 21.001687858734584, + "learning_rate": 9.04485780001433e-06, + "loss": 3.0001, + "step": 2530 + }, + { + "epoch": 0.28119118786671093, + "grad_norm": 15.842781499171902, + "learning_rate": 9.033468424783255e-06, + "loss": 2.8406, + "step": 2540 + }, + { + "epoch": 0.282298239787446, + "grad_norm": 21.453283495940212, + "learning_rate": 9.022018806964388e-06, + "loss": 2.7475, + "step": 2550 + }, + { + "epoch": 0.28340529170818113, + "grad_norm": 16.60678323210403, + "learning_rate": 9.010509117565538e-06, + "loss": 2.789, + "step": 2560 + }, + { + "epoch": 0.2845123436289162, + "grad_norm": 21.22156270449788, + "learning_rate": 8.998939528491724e-06, 
+ "loss": 2.8132, + "step": 2570 + }, + { + "epoch": 0.28561939554965127, + "grad_norm": 20.029298510004143, + "learning_rate": 8.987310212542613e-06, + "loss": 2.8848, + "step": 2580 + }, + { + "epoch": 0.28672644747038634, + "grad_norm": 17.416215394194477, + "learning_rate": 8.975621343409927e-06, + "loss": 2.8099, + "step": 2590 + }, + { + "epoch": 0.28783349939112146, + "grad_norm": 17.8983008619953, + "learning_rate": 8.963873095674858e-06, + "loss": 2.8862, + "step": 2600 + }, + { + "epoch": 0.28894055131185653, + "grad_norm": 17.34578619148897, + "learning_rate": 8.95206564480546e-06, + "loss": 2.7672, + "step": 2610 + }, + { + "epoch": 0.2900476032325916, + "grad_norm": 20.307382487515195, + "learning_rate": 8.94019916715402e-06, + "loss": 2.9254, + "step": 2620 + }, + { + "epoch": 0.29115465515332667, + "grad_norm": 15.542065735556422, + "learning_rate": 8.928273839954437e-06, + "loss": 2.7188, + "step": 2630 + }, + { + "epoch": 0.2922617070740618, + "grad_norm": 15.573521475112441, + "learning_rate": 8.916289841319564e-06, + "loss": 2.8667, + "step": 2640 + }, + { + "epoch": 0.29336875899479686, + "grad_norm": 19.410117591693684, + "learning_rate": 8.904247350238551e-06, + "loss": 2.8341, + "step": 2650 + }, + { + "epoch": 0.29447581091553193, + "grad_norm": 19.84765614341061, + "learning_rate": 8.892146546574172e-06, + "loss": 2.7139, + "step": 2660 + }, + { + "epoch": 0.295582862836267, + "grad_norm": 17.983856647490622, + "learning_rate": 8.879987611060143e-06, + "loss": 2.6931, + "step": 2670 + }, + { + "epoch": 0.2966899147570021, + "grad_norm": 16.62072011844082, + "learning_rate": 8.867770725298417e-06, + "loss": 2.8986, + "step": 2680 + }, + { + "epoch": 0.2977969666777372, + "grad_norm": 22.537941135987385, + "learning_rate": 8.855496071756472e-06, + "loss": 2.9275, + "step": 2690 + }, + { + "epoch": 0.29890401859847227, + "grad_norm": 19.624324641621538, + "learning_rate": 8.843163833764585e-06, + "loss": 2.8609, + "step": 2700 + }, + { + "epoch": 0.30001107051920733, + "grad_norm": 14.826694832266885, + "learning_rate": 8.8307741955131e-06, + "loss": 2.832, + "step": 2710 + }, + { + "epoch": 0.30111812243994246, + "grad_norm": 21.084123058139465, + "learning_rate": 8.818327342049672e-06, + "loss": 2.9927, + "step": 2720 + }, + { + "epoch": 0.30222517436067753, + "grad_norm": 17.156557696514646, + "learning_rate": 8.805823459276501e-06, + "loss": 2.7874, + "step": 2730 + }, + { + "epoch": 0.3033322262814126, + "grad_norm": 21.616600083840076, + "learning_rate": 8.793262733947564e-06, + "loss": 2.9143, + "step": 2740 + }, + { + "epoch": 0.30443927820214767, + "grad_norm": 17.849582075052787, + "learning_rate": 8.780645353665814e-06, + "loss": 2.9265, + "step": 2750 + }, + { + "epoch": 0.30554633012288274, + "grad_norm": 16.907525943766586, + "learning_rate": 8.767971506880388e-06, + "loss": 2.8079, + "step": 2760 + }, + { + "epoch": 0.30665338204361786, + "grad_norm": 21.80594816789924, + "learning_rate": 8.755241382883786e-06, + "loss": 2.8586, + "step": 2770 + }, + { + "epoch": 0.30776043396435293, + "grad_norm": 17.786988703153124, + "learning_rate": 8.74245517180905e-06, + "loss": 2.7957, + "step": 2780 + }, + { + "epoch": 0.308867485885088, + "grad_norm": 18.535816164863746, + "learning_rate": 8.729613064626916e-06, + "loss": 2.9017, + "step": 2790 + }, + { + "epoch": 0.30997453780582307, + "grad_norm": 16.811716242078795, + "learning_rate": 8.71671525314297e-06, + "loss": 2.8474, + "step": 2800 + }, + { + "epoch": 0.3110815897265582, + "grad_norm": 
18.305914523882734, + "learning_rate": 8.703761929994779e-06, + "loss": 2.9573, + "step": 2810 + }, + { + "epoch": 0.31218864164729326, + "grad_norm": 18.579915296564323, + "learning_rate": 8.690753288649013e-06, + "loss": 2.8964, + "step": 2820 + }, + { + "epoch": 0.31329569356802833, + "grad_norm": 18.539697958237422, + "learning_rate": 8.677689523398556e-06, + "loss": 2.7703, + "step": 2830 + }, + { + "epoch": 0.3144027454887634, + "grad_norm": 17.915697068912802, + "learning_rate": 8.664570829359608e-06, + "loss": 2.8693, + "step": 2840 + }, + { + "epoch": 0.3155097974094985, + "grad_norm": 18.898905292436613, + "learning_rate": 8.651397402468765e-06, + "loss": 2.8371, + "step": 2850 + }, + { + "epoch": 0.3166168493302336, + "grad_norm": 22.702920044801495, + "learning_rate": 8.638169439480097e-06, + "loss": 2.8705, + "step": 2860 + }, + { + "epoch": 0.31772390125096867, + "grad_norm": 14.669145969089513, + "learning_rate": 8.624887137962206e-06, + "loss": 2.7689, + "step": 2870 + }, + { + "epoch": 0.31883095317170373, + "grad_norm": 20.31679832956785, + "learning_rate": 8.61155069629528e-06, + "loss": 2.8442, + "step": 2880 + }, + { + "epoch": 0.31993800509243886, + "grad_norm": 17.50251569058274, + "learning_rate": 8.59816031366812e-06, + "loss": 2.8204, + "step": 2890 + }, + { + "epoch": 0.32104505701317393, + "grad_norm": 14.301977043806207, + "learning_rate": 8.584716190075182e-06, + "loss": 2.7507, + "step": 2900 + }, + { + "epoch": 0.322152108933909, + "grad_norm": 16.501447600831984, + "learning_rate": 8.571218526313572e-06, + "loss": 2.847, + "step": 2910 + }, + { + "epoch": 0.32325916085464407, + "grad_norm": 15.819764582641644, + "learning_rate": 8.557667523980054e-06, + "loss": 2.7269, + "step": 2920 + }, + { + "epoch": 0.3243662127753792, + "grad_norm": 19.79726490914286, + "learning_rate": 8.544063385468047e-06, + "loss": 2.8579, + "step": 2930 + }, + { + "epoch": 0.32547326469611426, + "grad_norm": 13.946259262777874, + "learning_rate": 8.530406313964588e-06, + "loss": 2.8433, + "step": 2940 + }, + { + "epoch": 0.32658031661684933, + "grad_norm": 18.300981068446877, + "learning_rate": 8.516696513447308e-06, + "loss": 2.8518, + "step": 2950 + }, + { + "epoch": 0.3276873685375844, + "grad_norm": 18.862858354575344, + "learning_rate": 8.502934188681382e-06, + "loss": 2.7097, + "step": 2960 + }, + { + "epoch": 0.32879442045831947, + "grad_norm": 17.293876429758797, + "learning_rate": 8.489119545216465e-06, + "loss": 2.8865, + "step": 2970 + }, + { + "epoch": 0.3299014723790546, + "grad_norm": 16.410769414507325, + "learning_rate": 8.475252789383634e-06, + "loss": 2.7419, + "step": 2980 + }, + { + "epoch": 0.33100852429978966, + "grad_norm": 16.157207346564473, + "learning_rate": 8.461334128292296e-06, + "loss": 2.8566, + "step": 2990 + }, + { + "epoch": 0.33211557622052473, + "grad_norm": 17.97405966664622, + "learning_rate": 8.447363769827097e-06, + "loss": 2.8409, + "step": 3000 + }, + { + "epoch": 0.3332226281412598, + "grad_norm": 18.040888448056503, + "learning_rate": 8.43334192264482e-06, + "loss": 2.7078, + "step": 3010 + }, + { + "epoch": 0.3343296800619949, + "grad_norm": 17.401311897099646, + "learning_rate": 8.41926879617127e-06, + "loss": 2.8375, + "step": 3020 + }, + { + "epoch": 0.33543673198273, + "grad_norm": 18.971972878515558, + "learning_rate": 8.405144600598136e-06, + "loss": 2.7534, + "step": 3030 + }, + { + "epoch": 0.33654378390346507, + "grad_norm": 17.56044316128444, + "learning_rate": 8.390969546879868e-06, + "loss": 2.8017, + "step": 3040 + }, + { 
+ "epoch": 0.33765083582420014, + "grad_norm": 18.9191689174584, + "learning_rate": 8.376743846730506e-06, + "loss": 2.8735, + "step": 3050 + }, + { + "epoch": 0.33875788774493526, + "grad_norm": 16.159522966531355, + "learning_rate": 8.36246771262054e-06, + "loss": 2.7277, + "step": 3060 + }, + { + "epoch": 0.33986493966567033, + "grad_norm": 17.732911671191786, + "learning_rate": 8.348141357773714e-06, + "loss": 2.7975, + "step": 3070 + }, + { + "epoch": 0.3409719915864054, + "grad_norm": 17.580686476759546, + "learning_rate": 8.333764996163863e-06, + "loss": 2.7285, + "step": 3080 + }, + { + "epoch": 0.34207904350714047, + "grad_norm": 20.220871787654826, + "learning_rate": 8.319338842511701e-06, + "loss": 2.7638, + "step": 3090 + }, + { + "epoch": 0.3431860954278756, + "grad_norm": 15.421883005921854, + "learning_rate": 8.30486311228162e-06, + "loss": 2.7664, + "step": 3100 + }, + { + "epoch": 0.34429314734861066, + "grad_norm": 22.52292422020666, + "learning_rate": 8.290338021678478e-06, + "loss": 2.7415, + "step": 3110 + }, + { + "epoch": 0.34540019926934573, + "grad_norm": 17.773426663788022, + "learning_rate": 8.275763787644354e-06, + "loss": 2.7612, + "step": 3120 + }, + { + "epoch": 0.3465072511900808, + "grad_norm": 17.313609438292495, + "learning_rate": 8.261140627855326e-06, + "loss": 2.6789, + "step": 3130 + }, + { + "epoch": 0.34761430311081587, + "grad_norm": 19.92121017478009, + "learning_rate": 8.246468760718205e-06, + "loss": 2.9528, + "step": 3140 + }, + { + "epoch": 0.348721355031551, + "grad_norm": 20.3829374368461, + "learning_rate": 8.231748405367284e-06, + "loss": 2.7307, + "step": 3150 + }, + { + "epoch": 0.34982840695228606, + "grad_norm": 17.20183231133198, + "learning_rate": 8.216979781661059e-06, + "loss": 2.7799, + "step": 3160 + }, + { + "epoch": 0.35093545887302113, + "grad_norm": 17.179059431154894, + "learning_rate": 8.202163110178945e-06, + "loss": 2.7417, + "step": 3170 + }, + { + "epoch": 0.3520425107937562, + "grad_norm": 17.829683364789567, + "learning_rate": 8.187298612217984e-06, + "loss": 2.7268, + "step": 3180 + }, + { + "epoch": 0.3531495627144913, + "grad_norm": 20.35885213396436, + "learning_rate": 8.172386509789539e-06, + "loss": 2.8759, + "step": 3190 + }, + { + "epoch": 0.3542566146352264, + "grad_norm": 18.210319395606284, + "learning_rate": 8.157427025615979e-06, + "loss": 2.7603, + "step": 3200 + }, + { + "epoch": 0.35536366655596147, + "grad_norm": 20.180991639281267, + "learning_rate": 8.14242038312735e-06, + "loss": 2.6385, + "step": 3210 + }, + { + "epoch": 0.35647071847669654, + "grad_norm": 13.997589668763045, + "learning_rate": 8.127366806458043e-06, + "loss": 2.6638, + "step": 3220 + }, + { + "epoch": 0.35757777039743166, + "grad_norm": 16.552842345785916, + "learning_rate": 8.112266520443437e-06, + "loss": 2.8545, + "step": 3230 + }, + { + "epoch": 0.35868482231816673, + "grad_norm": 22.63458529594302, + "learning_rate": 8.097119750616552e-06, + "loss": 2.9072, + "step": 3240 + }, + { + "epoch": 0.3597918742389018, + "grad_norm": 20.351123072545064, + "learning_rate": 8.08192672320467e-06, + "loss": 2.8104, + "step": 3250 + }, + { + "epoch": 0.36089892615963687, + "grad_norm": 18.012402171983243, + "learning_rate": 8.066687665125965e-06, + "loss": 2.8857, + "step": 3260 + }, + { + "epoch": 0.362005978080372, + "grad_norm": 14.813109416518861, + "learning_rate": 8.051402803986112e-06, + "loss": 2.7149, + "step": 3270 + }, + { + "epoch": 0.36311303000110706, + "grad_norm": 19.48150839228793, + "learning_rate": 
8.036072368074883e-06, + "loss": 2.7073, + "step": 3280 + }, + { + "epoch": 0.36422008192184213, + "grad_norm": 19.11749404734295, + "learning_rate": 8.020696586362739e-06, + "loss": 2.6653, + "step": 3290 + }, + { + "epoch": 0.3653271338425772, + "grad_norm": 22.934472507487648, + "learning_rate": 8.005275688497415e-06, + "loss": 2.813, + "step": 3300 + }, + { + "epoch": 0.3664341857633123, + "grad_norm": 14.997032892515483, + "learning_rate": 7.989809904800483e-06, + "loss": 2.7371, + "step": 3310 + }, + { + "epoch": 0.3675412376840474, + "grad_norm": 15.5742880306809, + "learning_rate": 7.974299466263919e-06, + "loss": 2.8341, + "step": 3320 + }, + { + "epoch": 0.36864828960478246, + "grad_norm": 20.142912914493085, + "learning_rate": 7.958744604546641e-06, + "loss": 2.8141, + "step": 3330 + }, + { + "epoch": 0.36975534152551753, + "grad_norm": 18.86513832413105, + "learning_rate": 7.94314555197107e-06, + "loss": 2.7812, + "step": 3340 + }, + { + "epoch": 0.3708623934462526, + "grad_norm": 22.49228437600144, + "learning_rate": 7.927502541519637e-06, + "loss": 2.825, + "step": 3350 + }, + { + "epoch": 0.3719694453669877, + "grad_norm": 22.419596048754094, + "learning_rate": 7.91181580683132e-06, + "loss": 2.8135, + "step": 3360 + }, + { + "epoch": 0.3730764972877228, + "grad_norm": 16.9758949814327, + "learning_rate": 7.896085582198143e-06, + "loss": 2.7589, + "step": 3370 + }, + { + "epoch": 0.37418354920845787, + "grad_norm": 17.427893990910892, + "learning_rate": 7.880312102561688e-06, + "loss": 2.8191, + "step": 3380 + }, + { + "epoch": 0.37529060112919294, + "grad_norm": 16.881634487817756, + "learning_rate": 7.864495603509571e-06, + "loss": 2.7757, + "step": 3390 + }, + { + "epoch": 0.37639765304992806, + "grad_norm": 17.644413976791455, + "learning_rate": 7.848636321271943e-06, + "loss": 2.8439, + "step": 3400 + }, + { + "epoch": 0.37750470497066313, + "grad_norm": 17.371658704562304, + "learning_rate": 7.83273449271794e-06, + "loss": 2.8163, + "step": 3410 + }, + { + "epoch": 0.3786117568913982, + "grad_norm": 17.681733503092357, + "learning_rate": 7.816790355352167e-06, + "loss": 2.7568, + "step": 3420 + }, + { + "epoch": 0.37971880881213327, + "grad_norm": 18.455389219089255, + "learning_rate": 7.80080414731113e-06, + "loss": 2.6985, + "step": 3430 + }, + { + "epoch": 0.3808258607328684, + "grad_norm": 16.157025548622848, + "learning_rate": 7.784776107359696e-06, + "loss": 2.7969, + "step": 3440 + }, + { + "epoch": 0.38193291265360346, + "grad_norm": 14.768944382636816, + "learning_rate": 7.768706474887516e-06, + "loss": 2.7339, + "step": 3450 + }, + { + "epoch": 0.38303996457433853, + "grad_norm": 18.48084069219429, + "learning_rate": 7.752595489905456e-06, + "loss": 2.7754, + "step": 3460 + }, + { + "epoch": 0.3841470164950736, + "grad_norm": 19.156514520004468, + "learning_rate": 7.736443393042007e-06, + "loss": 2.847, + "step": 3470 + }, + { + "epoch": 0.3852540684158087, + "grad_norm": 16.446763048779168, + "learning_rate": 7.720250425539698e-06, + "loss": 2.6395, + "step": 3480 + }, + { + "epoch": 0.3863611203365438, + "grad_norm": 14.192958419140753, + "learning_rate": 7.704016829251484e-06, + "loss": 2.7273, + "step": 3490 + }, + { + "epoch": 0.38746817225727886, + "grad_norm": 14.358834052259523, + "learning_rate": 7.687742846637141e-06, + "loss": 2.705, + "step": 3500 + }, + { + "epoch": 0.38857522417801393, + "grad_norm": 17.950732691617667, + "learning_rate": 7.671428720759641e-06, + "loss": 2.7615, + "step": 3510 + }, + { + "epoch": 0.38968227609874906, + 
"grad_norm": 18.082782880469356, + "learning_rate": 7.655074695281526e-06, + "loss": 2.7389, + "step": 3520 + }, + { + "epoch": 0.39078932801948413, + "grad_norm": 17.001645765491634, + "learning_rate": 7.638681014461263e-06, + "loss": 2.7623, + "step": 3530 + }, + { + "epoch": 0.3918963799402192, + "grad_norm": 16.148791106439415, + "learning_rate": 7.622247923149597e-06, + "loss": 2.771, + "step": 3540 + }, + { + "epoch": 0.39300343186095427, + "grad_norm": 16.319755028507952, + "learning_rate": 7.6057756667859e-06, + "loss": 2.745, + "step": 3550 + }, + { + "epoch": 0.39411048378168934, + "grad_norm": 18.249081210470003, + "learning_rate": 7.589264491394497e-06, + "loss": 2.7631, + "step": 3560 + }, + { + "epoch": 0.39521753570242446, + "grad_norm": 17.114757273903603, + "learning_rate": 7.572714643580993e-06, + "loss": 2.5916, + "step": 3570 + }, + { + "epoch": 0.39632458762315953, + "grad_norm": 15.74515478345217, + "learning_rate": 7.556126370528598e-06, + "loss": 2.7441, + "step": 3580 + }, + { + "epoch": 0.3974316395438946, + "grad_norm": 17.521251320931118, + "learning_rate": 7.539499919994425e-06, + "loss": 2.7365, + "step": 3590 + }, + { + "epoch": 0.39853869146462967, + "grad_norm": 19.23187701802523, + "learning_rate": 7.522835540305795e-06, + "loss": 2.7919, + "step": 3600 + }, + { + "epoch": 0.3996457433853648, + "grad_norm": 14.994960528554826, + "learning_rate": 7.506133480356523e-06, + "loss": 2.8063, + "step": 3610 + }, + { + "epoch": 0.40075279530609986, + "grad_norm": 19.43636713958746, + "learning_rate": 7.489393989603213e-06, + "loss": 2.8291, + "step": 3620 + }, + { + "epoch": 0.40185984722683493, + "grad_norm": 19.96902221880387, + "learning_rate": 7.472617318061515e-06, + "loss": 2.6574, + "step": 3630 + }, + { + "epoch": 0.40296689914757, + "grad_norm": 15.764432388205172, + "learning_rate": 7.4558037163023986e-06, + "loss": 2.8279, + "step": 3640 + }, + { + "epoch": 0.4040739510683051, + "grad_norm": 17.00988346435618, + "learning_rate": 7.438953435448422e-06, + "loss": 2.8606, + "step": 3650 + }, + { + "epoch": 0.4051810029890402, + "grad_norm": 20.528609879722282, + "learning_rate": 7.422066727169956e-06, + "loss": 2.803, + "step": 3660 + }, + { + "epoch": 0.40628805490977526, + "grad_norm": 24.117540486267707, + "learning_rate": 7.405143843681453e-06, + "loss": 2.8901, + "step": 3670 + }, + { + "epoch": 0.40739510683051033, + "grad_norm": 15.932815366392553, + "learning_rate": 7.388185037737656e-06, + "loss": 2.6042, + "step": 3680 + }, + { + "epoch": 0.40850215875124546, + "grad_norm": 16.494705800421944, + "learning_rate": 7.371190562629842e-06, + "loss": 2.7918, + "step": 3690 + }, + { + "epoch": 0.40960921067198053, + "grad_norm": 21.567108547663295, + "learning_rate": 7.354160672182027e-06, + "loss": 2.7606, + "step": 3700 + }, + { + "epoch": 0.4107162625927156, + "grad_norm": 21.48414979932869, + "learning_rate": 7.337095620747181e-06, + "loss": 2.6994, + "step": 3710 + }, + { + "epoch": 0.41182331451345067, + "grad_norm": 13.807319945171502, + "learning_rate": 7.319995663203425e-06, + "loss": 2.7346, + "step": 3720 + }, + { + "epoch": 0.41293036643418574, + "grad_norm": 18.456828860891658, + "learning_rate": 7.302861054950231e-06, + "loss": 2.6429, + "step": 3730 + }, + { + "epoch": 0.41403741835492086, + "grad_norm": 18.493884527191277, + "learning_rate": 7.285692051904596e-06, + "loss": 2.7264, + "step": 3740 + }, + { + "epoch": 0.41514447027565593, + "grad_norm": 15.443965108568486, + "learning_rate": 7.2684889104972335e-06, + "loss": 2.7915, + 
"step": 3750 + }, + { + "epoch": 0.416251522196391, + "grad_norm": 15.970560252697705, + "learning_rate": 7.2512518876687325e-06, + "loss": 2.7585, + "step": 3760 + }, + { + "epoch": 0.41735857411712607, + "grad_norm": 16.483755053972125, + "learning_rate": 7.233981240865723e-06, + "loss": 2.7225, + "step": 3770 + }, + { + "epoch": 0.4184656260378612, + "grad_norm": 15.927243910629507, + "learning_rate": 7.2166772280370355e-06, + "loss": 2.7053, + "step": 3780 + }, + { + "epoch": 0.41957267795859626, + "grad_norm": 16.30824749754582, + "learning_rate": 7.199340107629843e-06, + "loss": 2.7531, + "step": 3790 + }, + { + "epoch": 0.42067972987933133, + "grad_norm": 17.94048283670358, + "learning_rate": 7.1819701385858045e-06, + "loss": 2.643, + "step": 3800 + }, + { + "epoch": 0.4217867818000664, + "grad_norm": 18.8081266409834, + "learning_rate": 7.164567580337191e-06, + "loss": 2.759, + "step": 3810 + }, + { + "epoch": 0.4228938337208015, + "grad_norm": 19.93408221633125, + "learning_rate": 7.147132692803018e-06, + "loss": 2.8159, + "step": 3820 + }, + { + "epoch": 0.4240008856415366, + "grad_norm": 14.119638307817269, + "learning_rate": 7.1296657363851644e-06, + "loss": 2.5886, + "step": 3830 + }, + { + "epoch": 0.42510793756227166, + "grad_norm": 14.700749001625018, + "learning_rate": 7.112166971964472e-06, + "loss": 2.7577, + "step": 3840 + }, + { + "epoch": 0.42621498948300673, + "grad_norm": 16.876997824156497, + "learning_rate": 7.094636660896865e-06, + "loss": 2.7068, + "step": 3850 + }, + { + "epoch": 0.42732204140374186, + "grad_norm": 17.677042560229854, + "learning_rate": 7.0770750650094335e-06, + "loss": 2.7139, + "step": 3860 + }, + { + "epoch": 0.42842909332447693, + "grad_norm": 22.903911635500307, + "learning_rate": 7.059482446596525e-06, + "loss": 2.6586, + "step": 3870 + }, + { + "epoch": 0.429536145245212, + "grad_norm": 17.15359853143299, + "learning_rate": 7.041859068415836e-06, + "loss": 2.7196, + "step": 3880 + }, + { + "epoch": 0.43064319716594707, + "grad_norm": 18.265015720893867, + "learning_rate": 7.024205193684479e-06, + "loss": 2.795, + "step": 3890 + }, + { + "epoch": 0.4317502490866822, + "grad_norm": 17.416460348542884, + "learning_rate": 7.006521086075049e-06, + "loss": 2.8018, + "step": 3900 + }, + { + "epoch": 0.43285730100741726, + "grad_norm": 15.06159976676458, + "learning_rate": 6.9888070097116926e-06, + "loss": 2.6702, + "step": 3910 + }, + { + "epoch": 0.43396435292815233, + "grad_norm": 14.916257340220586, + "learning_rate": 6.971063229166162e-06, + "loss": 2.667, + "step": 3920 + }, + { + "epoch": 0.4350714048488874, + "grad_norm": 16.946369105743727, + "learning_rate": 6.953290009453857e-06, + "loss": 2.6547, + "step": 3930 + }, + { + "epoch": 0.43617845676962247, + "grad_norm": 17.606162667161975, + "learning_rate": 6.9354876160298764e-06, + "loss": 2.7565, + "step": 3940 + }, + { + "epoch": 0.4372855086903576, + "grad_norm": 15.792356606039535, + "learning_rate": 6.917656314785044e-06, + "loss": 2.7603, + "step": 3950 + }, + { + "epoch": 0.43839256061109266, + "grad_norm": 17.519385710278783, + "learning_rate": 6.899796372041943e-06, + "loss": 2.5908, + "step": 3960 + }, + { + "epoch": 0.43949961253182773, + "grad_norm": 18.175539572977502, + "learning_rate": 6.881908054550939e-06, + "loss": 2.7189, + "step": 3970 + }, + { + "epoch": 0.4406066644525628, + "grad_norm": 16.78341459760071, + "learning_rate": 6.863991629486191e-06, + "loss": 2.7457, + "step": 3980 + }, + { + "epoch": 0.4417137163732979, + "grad_norm": 16.307893535865905, + 
"learning_rate": 6.846047364441661e-06, + "loss": 2.7664, + "step": 3990 + }, + { + "epoch": 0.442820768294033, + "grad_norm": 17.795046718057446, + "learning_rate": 6.828075527427127e-06, + "loss": 2.7682, + "step": 4000 + }, + { + "epoch": 0.442820768294033, + "eval_loss": 2.715528726577759, + "eval_runtime": 2400.8491, + "eval_samples_per_second": 4.181, + "eval_steps_per_second": 0.418, + "step": 4000 + }, + { + "epoch": 0.44392782021476807, + "grad_norm": 17.177561938405823, + "learning_rate": 6.810076386864168e-06, + "loss": 2.7353, + "step": 4010 + }, + { + "epoch": 0.44503487213550313, + "grad_norm": 18.717792449825087, + "learning_rate": 6.792050211582164e-06, + "loss": 2.6284, + "step": 4020 + }, + { + "epoch": 0.44614192405623826, + "grad_norm": 20.629160666920065, + "learning_rate": 6.77399727081427e-06, + "loss": 2.7808, + "step": 4030 + }, + { + "epoch": 0.44724897597697333, + "grad_norm": 16.300381610488234, + "learning_rate": 6.755917834193408e-06, + "loss": 2.6976, + "step": 4040 + }, + { + "epoch": 0.4483560278977084, + "grad_norm": 18.995902150808703, + "learning_rate": 6.737812171748234e-06, + "loss": 2.7441, + "step": 4050 + }, + { + "epoch": 0.44946307981844347, + "grad_norm": 18.261637709522596, + "learning_rate": 6.719680553899097e-06, + "loss": 2.6822, + "step": 4060 + }, + { + "epoch": 0.4505701317391786, + "grad_norm": 20.659710982739558, + "learning_rate": 6.701523251454017e-06, + "loss": 2.6978, + "step": 4070 + }, + { + "epoch": 0.45167718365991366, + "grad_norm": 19.963369393203255, + "learning_rate": 6.683340535604624e-06, + "loss": 2.7391, + "step": 4080 + }, + { + "epoch": 0.45278423558064873, + "grad_norm": 17.272615462239525, + "learning_rate": 6.665132677922118e-06, + "loss": 2.6982, + "step": 4090 + }, + { + "epoch": 0.4538912875013838, + "grad_norm": 17.102697486895753, + "learning_rate": 6.646899950353208e-06, + "loss": 2.7443, + "step": 4100 + }, + { + "epoch": 0.4549983394221189, + "grad_norm": 16.731640547098063, + "learning_rate": 6.628642625216053e-06, + "loss": 2.7825, + "step": 4110 + }, + { + "epoch": 0.456105391342854, + "grad_norm": 16.86948389308186, + "learning_rate": 6.61036097519619e-06, + "loss": 2.6986, + "step": 4120 + }, + { + "epoch": 0.45721244326358906, + "grad_norm": 20.677217100728953, + "learning_rate": 6.592055273342467e-06, + "loss": 2.8304, + "step": 4130 + }, + { + "epoch": 0.45831949518432413, + "grad_norm": 16.821661815243136, + "learning_rate": 6.573725793062965e-06, + "loss": 2.6678, + "step": 4140 + }, + { + "epoch": 0.4594265471050592, + "grad_norm": 18.45134731193715, + "learning_rate": 6.555372808120907e-06, + "loss": 2.823, + "step": 4150 + }, + { + "epoch": 0.4605335990257943, + "grad_norm": 17.57852954660428, + "learning_rate": 6.536996592630578e-06, + "loss": 2.7795, + "step": 4160 + }, + { + "epoch": 0.4616406509465294, + "grad_norm": 17.253221141789883, + "learning_rate": 6.518597421053223e-06, + "loss": 2.7, + "step": 4170 + }, + { + "epoch": 0.46274770286726447, + "grad_norm": 16.206089784799936, + "learning_rate": 6.5001755681929545e-06, + "loss": 2.7196, + "step": 4180 + }, + { + "epoch": 0.46385475478799953, + "grad_norm": 18.947069414032423, + "learning_rate": 6.481731309192647e-06, + "loss": 2.7542, + "step": 4190 + }, + { + "epoch": 0.46496180670873466, + "grad_norm": 16.548697201774296, + "learning_rate": 6.463264919529823e-06, + "loss": 2.7531, + "step": 4200 + }, + { + "epoch": 0.46606885862946973, + "grad_norm": 17.605153791162124, + "learning_rate": 6.444776675012542e-06, + "loss": 2.7248, + 
"step": 4210 + }, + { + "epoch": 0.4671759105502048, + "grad_norm": 18.42367136884591, + "learning_rate": 6.42626685177528e-06, + "loss": 2.6742, + "step": 4220 + }, + { + "epoch": 0.46828296247093987, + "grad_norm": 21.057012768405876, + "learning_rate": 6.407735726274809e-06, + "loss": 2.7067, + "step": 4230 + }, + { + "epoch": 0.469390014391675, + "grad_norm": 17.878193605338524, + "learning_rate": 6.38918357528606e-06, + "loss": 2.8213, + "step": 4240 + }, + { + "epoch": 0.47049706631241006, + "grad_norm": 15.251101561882258, + "learning_rate": 6.370610675897997e-06, + "loss": 2.767, + "step": 4250 + }, + { + "epoch": 0.47160411823314513, + "grad_norm": 16.35077680470725, + "learning_rate": 6.352017305509475e-06, + "loss": 2.5496, + "step": 4260 + }, + { + "epoch": 0.4727111701538802, + "grad_norm": 20.78692237253247, + "learning_rate": 6.3334037418250975e-06, + "loss": 2.5517, + "step": 4270 + }, + { + "epoch": 0.4738182220746153, + "grad_norm": 16.49688836558597, + "learning_rate": 6.314770262851069e-06, + "loss": 2.7365, + "step": 4280 + }, + { + "epoch": 0.4749252739953504, + "grad_norm": 17.75918198378233, + "learning_rate": 6.296117146891039e-06, + "loss": 2.651, + "step": 4290 + }, + { + "epoch": 0.47603232591608546, + "grad_norm": 15.289950571080979, + "learning_rate": 6.277444672541953e-06, + "loss": 2.7015, + "step": 4300 + }, + { + "epoch": 0.47713937783682053, + "grad_norm": 15.010585688125417, + "learning_rate": 6.258753118689887e-06, + "loss": 2.6344, + "step": 4310 + }, + { + "epoch": 0.4782464297575556, + "grad_norm": 16.384237830668948, + "learning_rate": 6.240042764505877e-06, + "loss": 2.7013, + "step": 4320 + }, + { + "epoch": 0.4793534816782907, + "grad_norm": 15.761472924874809, + "learning_rate": 6.2213138894417615e-06, + "loss": 2.7414, + "step": 4330 + }, + { + "epoch": 0.4804605335990258, + "grad_norm": 17.457264405530225, + "learning_rate": 6.202566773225995e-06, + "loss": 2.7923, + "step": 4340 + }, + { + "epoch": 0.48156758551976087, + "grad_norm": 20.03913075692092, + "learning_rate": 6.1838016958594825e-06, + "loss": 2.7145, + "step": 4350 + }, + { + "epoch": 0.48267463744049593, + "grad_norm": 14.687794264132354, + "learning_rate": 6.165018937611385e-06, + "loss": 2.6172, + "step": 4360 + }, + { + "epoch": 0.48378168936123106, + "grad_norm": 15.026413038595793, + "learning_rate": 6.146218779014942e-06, + "loss": 2.6804, + "step": 4370 + }, + { + "epoch": 0.48488874128196613, + "grad_norm": 17.378458618834472, + "learning_rate": 6.127401500863281e-06, + "loss": 2.5838, + "step": 4380 + }, + { + "epoch": 0.4859957932027012, + "grad_norm": 16.495531002493667, + "learning_rate": 6.108567384205214e-06, + "loss": 2.5008, + "step": 4390 + }, + { + "epoch": 0.48710284512343627, + "grad_norm": 15.612526961187054, + "learning_rate": 6.089716710341058e-06, + "loss": 2.5134, + "step": 4400 + }, + { + "epoch": 0.4882098970441714, + "grad_norm": 17.829542612600722, + "learning_rate": 6.070849760818417e-06, + "loss": 2.6932, + "step": 4410 + }, + { + "epoch": 0.48931694896490646, + "grad_norm": 18.397184297289453, + "learning_rate": 6.051966817427983e-06, + "loss": 2.664, + "step": 4420 + }, + { + "epoch": 0.49042400088564153, + "grad_norm": 15.139678235200124, + "learning_rate": 6.03306816219933e-06, + "loss": 2.6431, + "step": 4430 + }, + { + "epoch": 0.4915310528063766, + "grad_norm": 19.13733604850318, + "learning_rate": 6.014154077396695e-06, + "loss": 2.7429, + "step": 4440 + }, + { + "epoch": 0.4926381047271117, + "grad_norm": 19.88327633299528, + 
"learning_rate": 5.995224845514771e-06, + "loss": 2.6894, + "step": 4450 + }, + { + "epoch": 0.4937451566478468, + "grad_norm": 16.78819908723115, + "learning_rate": 5.97628074927448e-06, + "loss": 2.712, + "step": 4460 + }, + { + "epoch": 0.49485220856858186, + "grad_norm": 15.34943286541028, + "learning_rate": 5.957322071618753e-06, + "loss": 2.652, + "step": 4470 + }, + { + "epoch": 0.49595926048931693, + "grad_norm": 14.718777663127804, + "learning_rate": 5.9383490957083045e-06, + "loss": 2.6708, + "step": 4480 + }, + { + "epoch": 0.49706631241005206, + "grad_norm": 14.06128807028094, + "learning_rate": 5.919362104917403e-06, + "loss": 2.6022, + "step": 4490 + }, + { + "epoch": 0.4981733643307871, + "grad_norm": 16.565786742803958, + "learning_rate": 5.90036138282964e-06, + "loss": 2.6252, + "step": 4500 + }, + { + "epoch": 0.4992804162515222, + "grad_norm": 15.757898844662668, + "learning_rate": 5.8813472132336955e-06, + "loss": 2.6229, + "step": 4510 + }, + { + "epoch": 0.5003874681722573, + "grad_norm": 21.10749621990984, + "learning_rate": 5.862319880119092e-06, + "loss": 2.709, + "step": 4520 + }, + { + "epoch": 0.5014945200929923, + "grad_norm": 18.080937909773763, + "learning_rate": 5.8432796676719585e-06, + "loss": 2.5919, + "step": 4530 + }, + { + "epoch": 0.5026015720137275, + "grad_norm": 15.309930072347084, + "learning_rate": 5.824226860270791e-06, + "loss": 2.7639, + "step": 4540 + }, + { + "epoch": 0.5037086239344625, + "grad_norm": 17.326512802033673, + "learning_rate": 5.805161742482194e-06, + "loss": 2.6954, + "step": 4550 + }, + { + "epoch": 0.5048156758551976, + "grad_norm": 20.016766712775652, + "learning_rate": 5.786084599056637e-06, + "loss": 2.6651, + "step": 4560 + }, + { + "epoch": 0.5059227277759327, + "grad_norm": 15.39976054839859, + "learning_rate": 5.766995714924204e-06, + "loss": 2.7208, + "step": 4570 + }, + { + "epoch": 0.5070297796966677, + "grad_norm": 15.56824968714477, + "learning_rate": 5.747895375190331e-06, + "loss": 2.6959, + "step": 4580 + }, + { + "epoch": 0.5081368316174029, + "grad_norm": 19.043556423880098, + "learning_rate": 5.728783865131554e-06, + "loss": 2.7182, + "step": 4590 + }, + { + "epoch": 0.509243883538138, + "grad_norm": 18.533491761930883, + "learning_rate": 5.709661470191241e-06, + "loss": 2.6474, + "step": 4600 + }, + { + "epoch": 0.510350935458873, + "grad_norm": 17.576811873751446, + "learning_rate": 5.6905284759753365e-06, + "loss": 2.6864, + "step": 4610 + }, + { + "epoch": 0.5114579873796081, + "grad_norm": 18.79796869282816, + "learning_rate": 5.6713851682480926e-06, + "loss": 2.5302, + "step": 4620 + }, + { + "epoch": 0.5125650393003431, + "grad_norm": 17.510899102111733, + "learning_rate": 5.6522318329278e-06, + "loss": 2.6672, + "step": 4630 + }, + { + "epoch": 0.5136720912210783, + "grad_norm": 15.707692417808088, + "learning_rate": 5.633068756082517e-06, + "loss": 2.6229, + "step": 4640 + }, + { + "epoch": 0.5147791431418134, + "grad_norm": 14.427966106685423, + "learning_rate": 5.613896223925799e-06, + "loss": 2.6565, + "step": 4650 + }, + { + "epoch": 0.5158861950625484, + "grad_norm": 17.13890386270487, + "learning_rate": 5.594714522812422e-06, + "loss": 2.738, + "step": 4660 + }, + { + "epoch": 0.5169932469832835, + "grad_norm": 15.344124561854793, + "learning_rate": 5.575523939234111e-06, + "loss": 2.7876, + "step": 4670 + }, + { + "epoch": 0.5181002989040187, + "grad_norm": 16.79964161196015, + "learning_rate": 5.556324759815252e-06, + "loss": 2.6692, + "step": 4680 + }, + { + "epoch": 0.5192073508247537, + 
"grad_norm": 19.56356390380519, + "learning_rate": 5.537117271308615e-06, + "loss": 2.7151, + "step": 4690 + }, + { + "epoch": 0.5203144027454888, + "grad_norm": 18.641775052939003, + "learning_rate": 5.5179017605910754e-06, + "loss": 2.8004, + "step": 4700 + }, + { + "epoch": 0.5214214546662238, + "grad_norm": 15.272957986365086, + "learning_rate": 5.4986785146593255e-06, + "loss": 2.7083, + "step": 4710 + }, + { + "epoch": 0.5225285065869589, + "grad_norm": 15.949027616558995, + "learning_rate": 5.479447820625585e-06, + "loss": 2.6865, + "step": 4720 + }, + { + "epoch": 0.523635558507694, + "grad_norm": 15.67762021450724, + "learning_rate": 5.46020996571332e-06, + "loss": 2.7183, + "step": 4730 + }, + { + "epoch": 0.5247426104284291, + "grad_norm": 19.95294125446329, + "learning_rate": 5.4409652372529444e-06, + "loss": 2.7927, + "step": 4740 + }, + { + "epoch": 0.5258496623491642, + "grad_norm": 13.488762906306286, + "learning_rate": 5.421713922677539e-06, + "loss": 2.5992, + "step": 4750 + }, + { + "epoch": 0.5269567142698992, + "grad_norm": 16.599798214798543, + "learning_rate": 5.402456309518547e-06, + "loss": 2.5732, + "step": 4760 + }, + { + "epoch": 0.5280637661906343, + "grad_norm": 14.764833460888406, + "learning_rate": 5.383192685401492e-06, + "loss": 2.5634, + "step": 4770 + }, + { + "epoch": 0.5291708181113695, + "grad_norm": 17.816571873254308, + "learning_rate": 5.363923338041667e-06, + "loss": 2.64, + "step": 4780 + }, + { + "epoch": 0.5302778700321045, + "grad_norm": 14.543241263642692, + "learning_rate": 5.344648555239854e-06, + "loss": 2.6637, + "step": 4790 + }, + { + "epoch": 0.5313849219528396, + "grad_norm": 16.519933702897138, + "learning_rate": 5.325368624878009e-06, + "loss": 2.747, + "step": 4800 + }, + { + "epoch": 0.5324919738735747, + "grad_norm": 17.67293620152496, + "learning_rate": 5.306083834914977e-06, + "loss": 2.6096, + "step": 4810 + }, + { + "epoch": 0.5335990257943097, + "grad_norm": 17.919095046156233, + "learning_rate": 5.286794473382178e-06, + "loss": 2.6526, + "step": 4820 + }, + { + "epoch": 0.5347060777150449, + "grad_norm": 14.567289996672956, + "learning_rate": 5.267500828379319e-06, + "loss": 2.7698, + "step": 4830 + }, + { + "epoch": 0.5358131296357799, + "grad_norm": 17.34975497496579, + "learning_rate": 5.248203188070078e-06, + "loss": 2.6932, + "step": 4840 + }, + { + "epoch": 0.536920181556515, + "grad_norm": 14.383043710837034, + "learning_rate": 5.228901840677808e-06, + "loss": 2.533, + "step": 4850 + }, + { + "epoch": 0.5380272334772501, + "grad_norm": 19.4814620431374, + "learning_rate": 5.209597074481228e-06, + "loss": 2.7526, + "step": 4860 + }, + { + "epoch": 0.5391342853979851, + "grad_norm": 17.294271003058864, + "learning_rate": 5.19028917781012e-06, + "loss": 2.7006, + "step": 4870 + }, + { + "epoch": 0.5402413373187203, + "grad_norm": 13.454761500494456, + "learning_rate": 5.170978439041023e-06, + "loss": 2.5453, + "step": 4880 + }, + { + "epoch": 0.5413483892394554, + "grad_norm": 17.855933800763392, + "learning_rate": 5.151665146592924e-06, + "loss": 2.6315, + "step": 4890 + }, + { + "epoch": 0.5424554411601904, + "grad_norm": 17.427924222975562, + "learning_rate": 5.132349588922949e-06, + "loss": 2.6539, + "step": 4900 + }, + { + "epoch": 0.5435624930809255, + "grad_norm": 20.073145834110875, + "learning_rate": 5.113032054522058e-06, + "loss": 2.5488, + "step": 4910 + }, + { + "epoch": 0.5446695450016605, + "grad_norm": 12.357803208105327, + "learning_rate": 5.093712831910736e-06, + "loss": 2.5557, + "step": 4920 + }, + { 
+ "epoch": 0.5457765969223957, + "grad_norm": 15.692479347879283, + "learning_rate": 5.0743922096346836e-06, + "loss": 2.7068, + "step": 4930 + }, + { + "epoch": 0.5468836488431308, + "grad_norm": 14.866689448660685, + "learning_rate": 5.055070476260501e-06, + "loss": 2.576, + "step": 4940 + }, + { + "epoch": 0.5479907007638658, + "grad_norm": 15.129308088501134, + "learning_rate": 5.0357479203713885e-06, + "loss": 2.3914, + "step": 4950 + }, + { + "epoch": 0.5490977526846009, + "grad_norm": 14.162687417076338, + "learning_rate": 5.0164248305628284e-06, + "loss": 2.6796, + "step": 4960 + }, + { + "epoch": 0.5502048046053359, + "grad_norm": 19.323858139882816, + "learning_rate": 4.997101495438277e-06, + "loss": 2.4771, + "step": 4970 + }, + { + "epoch": 0.5513118565260711, + "grad_norm": 17.540498070177875, + "learning_rate": 4.97777820360486e-06, + "loss": 2.572, + "step": 4980 + }, + { + "epoch": 0.5524189084468062, + "grad_norm": 19.393507393902457, + "learning_rate": 4.958455243669051e-06, + "loss": 2.6577, + "step": 4990 + }, + { + "epoch": 0.5535259603675412, + "grad_norm": 17.365811060415265, + "learning_rate": 4.939132904232366e-06, + "loss": 2.6571, + "step": 5000 + }, + { + "epoch": 0.5546330122882763, + "grad_norm": 14.882734972778014, + "learning_rate": 4.91981147388706e-06, + "loss": 2.5927, + "step": 5010 + }, + { + "epoch": 0.5557400642090115, + "grad_norm": 18.498227060413406, + "learning_rate": 4.900491241211799e-06, + "loss": 2.6215, + "step": 5020 + }, + { + "epoch": 0.5568471161297465, + "grad_norm": 16.424230672284246, + "learning_rate": 4.881172494767372e-06, + "loss": 2.738, + "step": 5030 + }, + { + "epoch": 0.5579541680504816, + "grad_norm": 14.449267161706716, + "learning_rate": 4.861855523092366e-06, + "loss": 2.6883, + "step": 5040 + }, + { + "epoch": 0.5590612199712166, + "grad_norm": 15.748250902231145, + "learning_rate": 4.84254061469886e-06, + "loss": 2.6369, + "step": 5050 + }, + { + "epoch": 0.5601682718919517, + "grad_norm": 21.423066740561787, + "learning_rate": 4.823228058068113e-06, + "loss": 2.7159, + "step": 5060 + }, + { + "epoch": 0.5612753238126869, + "grad_norm": 14.22388926392383, + "learning_rate": 4.803918141646268e-06, + "loss": 2.5795, + "step": 5070 + }, + { + "epoch": 0.5623823757334219, + "grad_norm": 14.83696241654988, + "learning_rate": 4.784611153840027e-06, + "loss": 2.5612, + "step": 5080 + }, + { + "epoch": 0.563489427654157, + "grad_norm": 14.263900210157331, + "learning_rate": 4.765307383012352e-06, + "loss": 2.5602, + "step": 5090 + }, + { + "epoch": 0.564596479574892, + "grad_norm": 17.257310107919768, + "learning_rate": 4.746007117478162e-06, + "loss": 2.611, + "step": 5100 + }, + { + "epoch": 0.5657035314956271, + "grad_norm": 16.708351070999512, + "learning_rate": 4.726710645500014e-06, + "loss": 2.6106, + "step": 5110 + }, + { + "epoch": 0.5668105834163623, + "grad_norm": 16.979878309390095, + "learning_rate": 4.707418255283817e-06, + "loss": 2.7961, + "step": 5120 + }, + { + "epoch": 0.5679176353370973, + "grad_norm": 16.81810768550359, + "learning_rate": 4.6881302349745015e-06, + "loss": 2.5536, + "step": 5130 + }, + { + "epoch": 0.5690246872578324, + "grad_norm": 16.369198159186666, + "learning_rate": 4.668846872651745e-06, + "loss": 2.7049, + "step": 5140 + }, + { + "epoch": 0.5701317391785675, + "grad_norm": 14.5307901204883, + "learning_rate": 4.649568456325645e-06, + "loss": 2.6538, + "step": 5150 + }, + { + "epoch": 0.5712387910993025, + "grad_norm": 13.505347462632475, + "learning_rate": 4.630295273932435e-06, + 
"loss": 2.5944, + "step": 5160 + }, + { + "epoch": 0.5723458430200377, + "grad_norm": 14.683292804609174, + "learning_rate": 4.611027613330166e-06, + "loss": 2.6914, + "step": 5170 + }, + { + "epoch": 0.5734528949407727, + "grad_norm": 17.13643381283879, + "learning_rate": 4.5917657622944235e-06, + "loss": 2.6462, + "step": 5180 + }, + { + "epoch": 0.5745599468615078, + "grad_norm": 16.94159128538117, + "learning_rate": 4.572510008514027e-06, + "loss": 2.6447, + "step": 5190 + }, + { + "epoch": 0.5756669987822429, + "grad_norm": 18.068429687848685, + "learning_rate": 4.55326063958672e-06, + "loss": 2.7705, + "step": 5200 + }, + { + "epoch": 0.5767740507029779, + "grad_norm": 14.55412168781434, + "learning_rate": 4.534017943014895e-06, + "loss": 2.6824, + "step": 5210 + }, + { + "epoch": 0.5778811026237131, + "grad_norm": 14.837147206944774, + "learning_rate": 4.514782206201274e-06, + "loss": 2.5857, + "step": 5220 + }, + { + "epoch": 0.5789881545444482, + "grad_norm": 15.433613293909772, + "learning_rate": 4.495553716444647e-06, + "loss": 2.6309, + "step": 5230 + }, + { + "epoch": 0.5800952064651832, + "grad_norm": 15.838049703049755, + "learning_rate": 4.4763327609355505e-06, + "loss": 2.5826, + "step": 5240 + }, + { + "epoch": 0.5812022583859183, + "grad_norm": 17.013462581069046, + "learning_rate": 4.457119626751998e-06, + "loss": 2.6681, + "step": 5250 + }, + { + "epoch": 0.5823093103066533, + "grad_norm": 18.074417040094673, + "learning_rate": 4.437914600855187e-06, + "loss": 2.6364, + "step": 5260 + }, + { + "epoch": 0.5834163622273885, + "grad_norm": 17.194945416385185, + "learning_rate": 4.4187179700852084e-06, + "loss": 2.6663, + "step": 5270 + }, + { + "epoch": 0.5845234141481236, + "grad_norm": 17.09566869966539, + "learning_rate": 4.399530021156771e-06, + "loss": 2.5621, + "step": 5280 + }, + { + "epoch": 0.5856304660688586, + "grad_norm": 18.10182865444287, + "learning_rate": 4.38035104065491e-06, + "loss": 2.6451, + "step": 5290 + }, + { + "epoch": 0.5867375179895937, + "grad_norm": 13.726610338766326, + "learning_rate": 4.361181315030714e-06, + "loss": 2.6154, + "step": 5300 + }, + { + "epoch": 0.5878445699103287, + "grad_norm": 13.396115971130266, + "learning_rate": 4.342021130597041e-06, + "loss": 2.6552, + "step": 5310 + }, + { + "epoch": 0.5889516218310639, + "grad_norm": 18.072129861449454, + "learning_rate": 4.3228707735242485e-06, + "loss": 2.6323, + "step": 5320 + }, + { + "epoch": 0.590058673751799, + "grad_norm": 16.101311253082667, + "learning_rate": 4.303730529835913e-06, + "loss": 2.5936, + "step": 5330 + }, + { + "epoch": 0.591165725672534, + "grad_norm": 17.959725344836784, + "learning_rate": 4.28460068540456e-06, + "loss": 2.6568, + "step": 5340 + }, + { + "epoch": 0.5922727775932691, + "grad_norm": 14.558411141104697, + "learning_rate": 4.2654815259473994e-06, + "loss": 2.599, + "step": 5350 + }, + { + "epoch": 0.5933798295140043, + "grad_norm": 15.020557260142786, + "learning_rate": 4.2463733370220464e-06, + "loss": 2.6193, + "step": 5360 + }, + { + "epoch": 0.5944868814347393, + "grad_norm": 16.367462970526052, + "learning_rate": 4.2272764040222724e-06, + "loss": 2.5572, + "step": 5370 + }, + { + "epoch": 0.5955939333554744, + "grad_norm": 17.24930565347666, + "learning_rate": 4.208191012173728e-06, + "loss": 2.7591, + "step": 5380 + }, + { + "epoch": 0.5967009852762094, + "grad_norm": 16.29148295415015, + "learning_rate": 4.189117446529692e-06, + "loss": 2.6654, + "step": 5390 + }, + { + "epoch": 0.5978080371969445, + "grad_norm": 14.636816803347672, + 
"learning_rate": 4.170055991966808e-06, + "loss": 2.6481, + "step": 5400 + }, + { + "epoch": 0.5989150891176797, + "grad_norm": 15.770080307849732, + "learning_rate": 4.1510069331808324e-06, + "loss": 2.637, + "step": 5410 + }, + { + "epoch": 0.6000221410384147, + "grad_norm": 15.398178191768253, + "learning_rate": 4.131970554682387e-06, + "loss": 2.6958, + "step": 5420 + }, + { + "epoch": 0.6011291929591498, + "grad_norm": 15.861210008610465, + "learning_rate": 4.1129471407926995e-06, + "loss": 2.5836, + "step": 5430 + }, + { + "epoch": 0.6022362448798849, + "grad_norm": 14.510344904474643, + "learning_rate": 4.093936975639367e-06, + "loss": 2.6514, + "step": 5440 + }, + { + "epoch": 0.6033432968006199, + "grad_norm": 19.34752243925819, + "learning_rate": 4.0749403431521e-06, + "loss": 2.6221, + "step": 5450 + }, + { + "epoch": 0.6044503487213551, + "grad_norm": 14.169326871610396, + "learning_rate": 4.055957527058501e-06, + "loss": 2.5109, + "step": 5460 + }, + { + "epoch": 0.6055574006420901, + "grad_norm": 15.469257875046958, + "learning_rate": 4.036988810879804e-06, + "loss": 2.6436, + "step": 5470 + }, + { + "epoch": 0.6066644525628252, + "grad_norm": 15.484848198531239, + "learning_rate": 4.018034477926661e-06, + "loss": 2.4906, + "step": 5480 + }, + { + "epoch": 0.6077715044835603, + "grad_norm": 15.378784092462407, + "learning_rate": 3.9990948112948914e-06, + "loss": 2.6171, + "step": 5490 + }, + { + "epoch": 0.6088785564042953, + "grad_norm": 14.686645639856618, + "learning_rate": 3.9801700938612685e-06, + "loss": 2.6579, + "step": 5500 + }, + { + "epoch": 0.6099856083250305, + "grad_norm": 13.215751426102292, + "learning_rate": 3.96126060827929e-06, + "loss": 2.5402, + "step": 5510 + }, + { + "epoch": 0.6110926602457655, + "grad_norm": 14.135003798272539, + "learning_rate": 3.942366636974954e-06, + "loss": 2.622, + "step": 5520 + }, + { + "epoch": 0.6121997121665006, + "grad_norm": 17.459175088951138, + "learning_rate": 3.923488462142541e-06, + "loss": 2.5552, + "step": 5530 + }, + { + "epoch": 0.6133067640872357, + "grad_norm": 15.87291748509675, + "learning_rate": 3.9046263657404005e-06, + "loss": 2.6628, + "step": 5540 + }, + { + "epoch": 0.6144138160079707, + "grad_norm": 17.77834550937652, + "learning_rate": 3.885780629486744e-06, + "loss": 2.5962, + "step": 5550 + }, + { + "epoch": 0.6155208679287059, + "grad_norm": 14.623260869268544, + "learning_rate": 3.866951534855429e-06, + "loss": 2.5216, + "step": 5560 + }, + { + "epoch": 0.616627919849441, + "grad_norm": 18.782526592973454, + "learning_rate": 3.848139363071759e-06, + "loss": 2.5408, + "step": 5570 + }, + { + "epoch": 0.617734971770176, + "grad_norm": 15.484929469465394, + "learning_rate": 3.8293443951082865e-06, + "loss": 2.5616, + "step": 5580 + }, + { + "epoch": 0.6188420236909111, + "grad_norm": 17.313043224092755, + "learning_rate": 3.810566911680607e-06, + "loss": 2.6196, + "step": 5590 + }, + { + "epoch": 0.6199490756116461, + "grad_norm": 14.974425571993558, + "learning_rate": 3.7918071932431823e-06, + "loss": 2.5633, + "step": 5600 + }, + { + "epoch": 0.6210561275323813, + "grad_norm": 14.593381904858223, + "learning_rate": 3.773065519985132e-06, + "loss": 2.6227, + "step": 5610 + }, + { + "epoch": 0.6221631794531164, + "grad_norm": 19.67519437375815, + "learning_rate": 3.7543421718260663e-06, + "loss": 2.666, + "step": 5620 + }, + { + "epoch": 0.6232702313738514, + "grad_norm": 13.058989186832509, + "learning_rate": 3.7356374284118906e-06, + "loss": 2.5616, + "step": 5630 + }, + { + "epoch": 
0.6243772832945865, + "grad_norm": 19.30534098144351, + "learning_rate": 3.716951569110645e-06, + "loss": 2.551, + "step": 5640 + }, + { + "epoch": 0.6254843352153217, + "grad_norm": 15.614374371487665, + "learning_rate": 3.6982848730083144e-06, + "loss": 2.495, + "step": 5650 + }, + { + "epoch": 0.6265913871360567, + "grad_norm": 21.218331844105535, + "learning_rate": 3.67963761890467e-06, + "loss": 2.7439, + "step": 5660 + }, + { + "epoch": 0.6276984390567918, + "grad_norm": 17.01930866391004, + "learning_rate": 3.6610100853091067e-06, + "loss": 2.5619, + "step": 5670 + }, + { + "epoch": 0.6288054909775268, + "grad_norm": 16.548611978624205, + "learning_rate": 3.642402550436476e-06, + "loss": 2.5517, + "step": 5680 + }, + { + "epoch": 0.6299125428982619, + "grad_norm": 16.350659146252166, + "learning_rate": 3.6238152922029414e-06, + "loss": 2.6533, + "step": 5690 + }, + { + "epoch": 0.631019594818997, + "grad_norm": 16.295428081442413, + "learning_rate": 3.6052485882218124e-06, + "loss": 2.5341, + "step": 5700 + }, + { + "epoch": 0.6321266467397321, + "grad_norm": 16.161944221815478, + "learning_rate": 3.5867027157994137e-06, + "loss": 2.4661, + "step": 5710 + }, + { + "epoch": 0.6332336986604672, + "grad_norm": 18.192390922499364, + "learning_rate": 3.568177951930932e-06, + "loss": 2.5499, + "step": 5720 + }, + { + "epoch": 0.6343407505812022, + "grad_norm": 18.154938030310817, + "learning_rate": 3.54967457329629e-06, + "loss": 2.671, + "step": 5730 + }, + { + "epoch": 0.6354478025019373, + "grad_norm": 17.50231046259661, + "learning_rate": 3.5311928562559984e-06, + "loss": 2.5161, + "step": 5740 + }, + { + "epoch": 0.6365548544226725, + "grad_norm": 15.071570236507409, + "learning_rate": 3.5127330768470414e-06, + "loss": 2.638, + "step": 5750 + }, + { + "epoch": 0.6376619063434075, + "grad_norm": 17.638180874471615, + "learning_rate": 3.4942955107787534e-06, + "loss": 2.5672, + "step": 5760 + }, + { + "epoch": 0.6387689582641426, + "grad_norm": 17.092873285184194, + "learning_rate": 3.4758804334286924e-06, + "loss": 2.6012, + "step": 5770 + }, + { + "epoch": 0.6398760101848777, + "grad_norm": 14.564343624825167, + "learning_rate": 3.457488119838535e-06, + "loss": 2.5989, + "step": 5780 + }, + { + "epoch": 0.6409830621056127, + "grad_norm": 16.413821117561785, + "learning_rate": 3.4391188447099614e-06, + "loss": 2.506, + "step": 5790 + }, + { + "epoch": 0.6420901140263479, + "grad_norm": 18.393396650855887, + "learning_rate": 3.4207728824005653e-06, + "loss": 2.5685, + "step": 5800 + }, + { + "epoch": 0.6431971659470829, + "grad_norm": 16.91734370623325, + "learning_rate": 3.4024505069197387e-06, + "loss": 2.4561, + "step": 5810 + }, + { + "epoch": 0.644304217867818, + "grad_norm": 15.98240569506593, + "learning_rate": 3.3841519919245925e-06, + "loss": 2.6473, + "step": 5820 + }, + { + "epoch": 0.6454112697885531, + "grad_norm": 16.326289119567278, + "learning_rate": 3.3658776107158654e-06, + "loss": 2.4694, + "step": 5830 + }, + { + "epoch": 0.6465183217092881, + "grad_norm": 18.501828998717585, + "learning_rate": 3.347627636233837e-06, + "loss": 2.6163, + "step": 5840 + }, + { + "epoch": 0.6476253736300233, + "grad_norm": 17.230377910119174, + "learning_rate": 3.329402341054265e-06, + "loss": 2.5839, + "step": 5850 + }, + { + "epoch": 0.6487324255507584, + "grad_norm": 15.353383433670851, + "learning_rate": 3.311201997384295e-06, + "loss": 2.6337, + "step": 5860 + }, + { + "epoch": 0.6498394774714934, + "grad_norm": 16.881261849081998, + "learning_rate": 3.2930268770584127e-06, + 
"loss": 2.5865, + "step": 5870 + }, + { + "epoch": 0.6509465293922285, + "grad_norm": 18.123650428151265, + "learning_rate": 3.2748772515343697e-06, + "loss": 2.6292, + "step": 5880 + }, + { + "epoch": 0.6520535813129635, + "grad_norm": 21.517681336714, + "learning_rate": 3.2567533918891414e-06, + "loss": 2.641, + "step": 5890 + }, + { + "epoch": 0.6531606332336987, + "grad_norm": 19.398238179320135, + "learning_rate": 3.238655568814868e-06, + "loss": 2.6626, + "step": 5900 + }, + { + "epoch": 0.6542676851544338, + "grad_norm": 16.094985895672867, + "learning_rate": 3.2205840526148158e-06, + "loss": 2.5219, + "step": 5910 + }, + { + "epoch": 0.6553747370751688, + "grad_norm": 15.058326544356623, + "learning_rate": 3.2025391131993443e-06, + "loss": 2.5849, + "step": 5920 + }, + { + "epoch": 0.6564817889959039, + "grad_norm": 15.860339323392015, + "learning_rate": 3.184521020081864e-06, + "loss": 2.3947, + "step": 5930 + }, + { + "epoch": 0.6575888409166389, + "grad_norm": 17.03657583580592, + "learning_rate": 3.1665300423748256e-06, + "loss": 2.6228, + "step": 5940 + }, + { + "epoch": 0.6586958928373741, + "grad_norm": 16.449779619145687, + "learning_rate": 3.148566448785687e-06, + "loss": 2.6434, + "step": 5950 + }, + { + "epoch": 0.6598029447581092, + "grad_norm": 18.51817609745207, + "learning_rate": 3.1306305076129083e-06, + "loss": 2.5301, + "step": 5960 + }, + { + "epoch": 0.6609099966788442, + "grad_norm": 17.17970665475141, + "learning_rate": 3.112722486741941e-06, + "loss": 2.5608, + "step": 5970 + }, + { + "epoch": 0.6620170485995793, + "grad_norm": 15.220359891812148, + "learning_rate": 3.094842653641225e-06, + "loss": 2.5432, + "step": 5980 + }, + { + "epoch": 0.6631241005203145, + "grad_norm": 15.940169495180179, + "learning_rate": 3.076991275358205e-06, + "loss": 2.5147, + "step": 5990 + }, + { + "epoch": 0.6642311524410495, + "grad_norm": 13.94891949646219, + "learning_rate": 3.059168618515325e-06, + "loss": 2.5043, + "step": 6000 + }, + { + "epoch": 0.6642311524410495, + "eval_loss": 2.562150716781616, + "eval_runtime": 2394.5594, + "eval_samples_per_second": 4.192, + "eval_steps_per_second": 0.419, + "step": 6000 + }, + { + "epoch": 0.6653382043617846, + "grad_norm": 17.7531887306566, + "learning_rate": 3.0413749493060596e-06, + "loss": 2.6127, + "step": 6010 + }, + { + "epoch": 0.6664452562825196, + "grad_norm": 12.808942551796036, + "learning_rate": 3.0236105334909303e-06, + "loss": 2.5683, + "step": 6020 + }, + { + "epoch": 0.6675523082032547, + "grad_norm": 16.672861233647524, + "learning_rate": 3.0058756363935447e-06, + "loss": 2.5315, + "step": 6030 + }, + { + "epoch": 0.6686593601239899, + "grad_norm": 15.135037228190633, + "learning_rate": 2.9881705228966217e-06, + "loss": 2.4304, + "step": 6040 + }, + { + "epoch": 0.6697664120447249, + "grad_norm": 19.201710928838462, + "learning_rate": 2.9704954574380474e-06, + "loss": 2.6006, + "step": 6050 + }, + { + "epoch": 0.67087346396546, + "grad_norm": 16.780831760906963, + "learning_rate": 2.9528507040069165e-06, + "loss": 2.5291, + "step": 6060 + }, + { + "epoch": 0.6719805158861951, + "grad_norm": 15.110403344711688, + "learning_rate": 2.935236526139592e-06, + "loss": 2.6148, + "step": 6070 + }, + { + "epoch": 0.6730875678069301, + "grad_norm": 14.691795830412493, + "learning_rate": 2.9176531869157776e-06, + "loss": 2.623, + "step": 6080 + }, + { + "epoch": 0.6741946197276653, + "grad_norm": 20.694910027119413, + "learning_rate": 2.900100948954568e-06, + "loss": 2.4261, + "step": 6090 + }, + { + "epoch": 
0.6753016716484003, + "grad_norm": 20.153947600154126, + "learning_rate": 2.8825800744105553e-06, + "loss": 2.5051, + "step": 6100 + }, + { + "epoch": 0.6764087235691354, + "grad_norm": 16.844446676245752, + "learning_rate": 2.8650908249698837e-06, + "loss": 2.4725, + "step": 6110 + }, + { + "epoch": 0.6775157754898705, + "grad_norm": 15.629536784931664, + "learning_rate": 2.847633461846363e-06, + "loss": 2.4676, + "step": 6120 + }, + { + "epoch": 0.6786228274106055, + "grad_norm": 15.244942371558702, + "learning_rate": 2.830208245777556e-06, + "loss": 2.4867, + "step": 6130 + }, + { + "epoch": 0.6797298793313407, + "grad_norm": 18.15276563682713, + "learning_rate": 2.8128154370208895e-06, + "loss": 2.6125, + "step": 6140 + }, + { + "epoch": 0.6808369312520757, + "grad_norm": 14.866692854122116, + "learning_rate": 2.7954552953497648e-06, + "loss": 2.4709, + "step": 6150 + }, + { + "epoch": 0.6819439831728108, + "grad_norm": 15.710254262687716, + "learning_rate": 2.778128080049674e-06, + "loss": 2.5593, + "step": 6160 + }, + { + "epoch": 0.6830510350935459, + "grad_norm": 16.32088369390469, + "learning_rate": 2.760834049914337e-06, + "loss": 2.5904, + "step": 6170 + }, + { + "epoch": 0.6841580870142809, + "grad_norm": 17.297718496475216, + "learning_rate": 2.7435734632418286e-06, + "loss": 2.6322, + "step": 6180 + }, + { + "epoch": 0.6852651389350161, + "grad_norm": 16.18993238219759, + "learning_rate": 2.726346577830722e-06, + "loss": 2.4723, + "step": 6190 + }, + { + "epoch": 0.6863721908557512, + "grad_norm": 13.340569639729669, + "learning_rate": 2.7091536509762407e-06, + "loss": 2.5087, + "step": 6200 + }, + { + "epoch": 0.6874792427764862, + "grad_norm": 17.20103511645342, + "learning_rate": 2.691994939466415e-06, + "loss": 2.575, + "step": 6210 + }, + { + "epoch": 0.6885862946972213, + "grad_norm": 15.066807611711438, + "learning_rate": 2.6748706995782407e-06, + "loss": 2.5264, + "step": 6220 + }, + { + "epoch": 0.6896933466179563, + "grad_norm": 21.941135059717368, + "learning_rate": 2.657781187073861e-06, + "loss": 2.5012, + "step": 6230 + }, + { + "epoch": 0.6908003985386915, + "grad_norm": 16.278833357503192, + "learning_rate": 2.640726657196743e-06, + "loss": 2.5817, + "step": 6240 + }, + { + "epoch": 0.6919074504594266, + "grad_norm": 13.836955054815277, + "learning_rate": 2.6237073646678596e-06, + "loss": 2.5257, + "step": 6250 + }, + { + "epoch": 0.6930145023801616, + "grad_norm": 17.42891079955518, + "learning_rate": 2.6067235636818975e-06, + "loss": 2.4827, + "step": 6260 + }, + { + "epoch": 0.6941215543008967, + "grad_norm": 16.66766719981607, + "learning_rate": 2.5897755079034415e-06, + "loss": 2.734, + "step": 6270 + }, + { + "epoch": 0.6952286062216317, + "grad_norm": 18.01524504020241, + "learning_rate": 2.5728634504632132e-06, + "loss": 2.4481, + "step": 6280 + }, + { + "epoch": 0.6963356581423669, + "grad_norm": 15.361507532173055, + "learning_rate": 2.555987643954259e-06, + "loss": 2.5952, + "step": 6290 + }, + { + "epoch": 0.697442710063102, + "grad_norm": 12.548971546748055, + "learning_rate": 2.539148340428203e-06, + "loss": 2.4955, + "step": 6300 + }, + { + "epoch": 0.698549761983837, + "grad_norm": 16.013770363195505, + "learning_rate": 2.5223457913914713e-06, + "loss": 2.5667, + "step": 6310 + }, + { + "epoch": 0.6996568139045721, + "grad_norm": 18.08109296107942, + "learning_rate": 2.505580247801529e-06, + "loss": 2.6721, + "step": 6320 + }, + { + "epoch": 0.7007638658253073, + "grad_norm": 18.233567782447306, + "learning_rate": 2.488851960063153e-06, + 
"loss": 2.5413, + "step": 6330 + }, + { + "epoch": 0.7018709177460423, + "grad_norm": 20.185450776651432, + "learning_rate": 2.4721611780246662e-06, + "loss": 2.5205, + "step": 6340 + }, + { + "epoch": 0.7029779696667774, + "grad_norm": 17.322044563032186, + "learning_rate": 2.4555081509742257e-06, + "loss": 2.6061, + "step": 6350 + }, + { + "epoch": 0.7040850215875124, + "grad_norm": 16.69861708188076, + "learning_rate": 2.4388931276360898e-06, + "loss": 2.5733, + "step": 6360 + }, + { + "epoch": 0.7051920735082475, + "grad_norm": 14.9415194058973, + "learning_rate": 2.4223163561669084e-06, + "loss": 2.4084, + "step": 6370 + }, + { + "epoch": 0.7062991254289827, + "grad_norm": 15.070279374628573, + "learning_rate": 2.4057780841520073e-06, + "loss": 2.4201, + "step": 6380 + }, + { + "epoch": 0.7074061773497177, + "grad_norm": 16.92425944088654, + "learning_rate": 2.389278558601703e-06, + "loss": 2.674, + "step": 6390 + }, + { + "epoch": 0.7085132292704528, + "grad_norm": 15.873359974625208, + "learning_rate": 2.3728180259476054e-06, + "loss": 2.5413, + "step": 6400 + }, + { + "epoch": 0.7096202811911879, + "grad_norm": 17.077658381322358, + "learning_rate": 2.356396732038938e-06, + "loss": 2.5189, + "step": 6410 + }, + { + "epoch": 0.7107273331119229, + "grad_norm": 15.86795681834881, + "learning_rate": 2.34001492213887e-06, + "loss": 2.6101, + "step": 6420 + }, + { + "epoch": 0.7118343850326581, + "grad_norm": 13.564052898106056, + "learning_rate": 2.323672840920843e-06, + "loss": 2.5059, + "step": 6430 + }, + { + "epoch": 0.7129414369533931, + "grad_norm": 16.387911586785865, + "learning_rate": 2.307370732464936e-06, + "loss": 2.4656, + "step": 6440 + }, + { + "epoch": 0.7140484888741282, + "grad_norm": 15.397100789766657, + "learning_rate": 2.291108840254194e-06, + "loss": 2.5474, + "step": 6450 + }, + { + "epoch": 0.7151555407948633, + "grad_norm": 20.180668201574875, + "learning_rate": 2.274887407171015e-06, + "loss": 2.6061, + "step": 6460 + }, + { + "epoch": 0.7162625927155983, + "grad_norm": 16.932276461623562, + "learning_rate": 2.2587066754935088e-06, + "loss": 2.6172, + "step": 6470 + }, + { + "epoch": 0.7173696446363335, + "grad_norm": 15.85444224400965, + "learning_rate": 2.242566886891878e-06, + "loss": 2.4546, + "step": 6480 + }, + { + "epoch": 0.7184766965570685, + "grad_norm": 16.024831283317745, + "learning_rate": 2.2264682824248244e-06, + "loss": 2.5442, + "step": 6490 + }, + { + "epoch": 0.7195837484778036, + "grad_norm": 15.983284722901772, + "learning_rate": 2.210411102535923e-06, + "loss": 2.5027, + "step": 6500 + }, + { + "epoch": 0.7206908003985387, + "grad_norm": 18.522789630055893, + "learning_rate": 2.194395587050053e-06, + "loss": 2.5553, + "step": 6510 + }, + { + "epoch": 0.7217978523192737, + "grad_norm": 14.14639815951338, + "learning_rate": 2.178421975169806e-06, + "loss": 2.5721, + "step": 6520 + }, + { + "epoch": 0.7229049042400089, + "grad_norm": 14.492302660298277, + "learning_rate": 2.1624905054719136e-06, + "loss": 2.4938, + "step": 6530 + }, + { + "epoch": 0.724011956160744, + "grad_norm": 19.363838132408695, + "learning_rate": 2.146601415903685e-06, + "loss": 2.4218, + "step": 6540 + }, + { + "epoch": 0.725119008081479, + "grad_norm": 15.90076642116056, + "learning_rate": 2.1307549437794576e-06, + "loss": 2.448, + "step": 6550 + }, + { + "epoch": 0.7262260600022141, + "grad_norm": 17.3475722033809, + "learning_rate": 2.114951325777041e-06, + "loss": 2.5259, + "step": 6560 + }, + { + "epoch": 0.7273331119229491, + "grad_norm": 17.081131808882112, + 
"learning_rate": 2.0991907979341945e-06, + "loss": 2.6131, + "step": 6570 + }, + { + "epoch": 0.7284401638436843, + "grad_norm": 19.24726121813359, + "learning_rate": 2.083473595645096e-06, + "loss": 2.5176, + "step": 6580 + }, + { + "epoch": 0.7295472157644194, + "grad_norm": 18.22671174512495, + "learning_rate": 2.067799953656827e-06, + "loss": 2.6385, + "step": 6590 + }, + { + "epoch": 0.7306542676851544, + "grad_norm": 19.51577253516203, + "learning_rate": 2.052170106065867e-06, + "loss": 2.5878, + "step": 6600 + }, + { + "epoch": 0.7317613196058895, + "grad_norm": 14.740255840350805, + "learning_rate": 2.0365842863145902e-06, + "loss": 2.6232, + "step": 6610 + }, + { + "epoch": 0.7328683715266247, + "grad_norm": 17.153524931988514, + "learning_rate": 2.021042727187797e-06, + "loss": 2.4545, + "step": 6620 + }, + { + "epoch": 0.7339754234473597, + "grad_norm": 16.978859837487686, + "learning_rate": 2.0055456608092135e-06, + "loss": 2.4822, + "step": 6630 + }, + { + "epoch": 0.7350824753680948, + "grad_norm": 15.507136512277452, + "learning_rate": 1.9900933186380427e-06, + "loss": 2.4757, + "step": 6640 + }, + { + "epoch": 0.7361895272888298, + "grad_norm": 15.113892086099645, + "learning_rate": 1.9746859314655024e-06, + "loss": 2.4577, + "step": 6650 + }, + { + "epoch": 0.7372965792095649, + "grad_norm": 19.298868896417396, + "learning_rate": 1.9593237294113688e-06, + "loss": 2.5047, + "step": 6660 + }, + { + "epoch": 0.7384036311303, + "grad_norm": 13.267678704003732, + "learning_rate": 1.944006941920561e-06, + "loss": 2.5715, + "step": 6670 + }, + { + "epoch": 0.7395106830510351, + "grad_norm": 14.87293193958646, + "learning_rate": 1.928735797759687e-06, + "loss": 2.5132, + "step": 6680 + }, + { + "epoch": 0.7406177349717702, + "grad_norm": 16.569655196515217, + "learning_rate": 1.91351052501365e-06, + "loss": 2.5578, + "step": 6690 + }, + { + "epoch": 0.7417247868925052, + "grad_norm": 18.641862537777396, + "learning_rate": 1.8983313510822283e-06, + "loss": 2.5117, + "step": 6700 + }, + { + "epoch": 0.7428318388132403, + "grad_norm": 16.649411387878974, + "learning_rate": 1.8831985026766848e-06, + "loss": 2.555, + "step": 6710 + }, + { + "epoch": 0.7439388907339755, + "grad_norm": 17.113555470969906, + "learning_rate": 1.8681122058163797e-06, + "loss": 2.4762, + "step": 6720 + }, + { + "epoch": 0.7450459426547105, + "grad_norm": 13.60243042756901, + "learning_rate": 1.853072685825391e-06, + "loss": 2.4798, + "step": 6730 + }, + { + "epoch": 0.7461529945754456, + "grad_norm": 14.062228805408685, + "learning_rate": 1.8380801673291555e-06, + "loss": 2.5991, + "step": 6740 + }, + { + "epoch": 0.7472600464961807, + "grad_norm": 12.81974531182581, + "learning_rate": 1.8231348742511102e-06, + "loss": 2.3543, + "step": 6750 + }, + { + "epoch": 0.7483670984169157, + "grad_norm": 16.835322913216885, + "learning_rate": 1.8082370298093483e-06, + "loss": 2.4387, + "step": 6760 + }, + { + "epoch": 0.7494741503376509, + "grad_norm": 14.330012440741553, + "learning_rate": 1.7933868565132857e-06, + "loss": 2.6009, + "step": 6770 + }, + { + "epoch": 0.7505812022583859, + "grad_norm": 15.204347320060766, + "learning_rate": 1.7785845761603376e-06, + "loss": 2.5466, + "step": 6780 + }, + { + "epoch": 0.751688254179121, + "grad_norm": 17.028609074434605, + "learning_rate": 1.7638304098326025e-06, + "loss": 2.4657, + "step": 6790 + }, + { + "epoch": 0.7527953060998561, + "grad_norm": 13.259346842026316, + "learning_rate": 1.7491245778935673e-06, + "loss": 2.6145, + "step": 6800 + }, + { + "epoch": 
0.7539023580205911, + "grad_norm": 21.625831350357682, + "learning_rate": 1.7344672999848106e-06, + "loss": 2.5143, + "step": 6810 + }, + { + "epoch": 0.7550094099413263, + "grad_norm": 19.536045749121886, + "learning_rate": 1.7198587950227235e-06, + "loss": 2.4776, + "step": 6820 + }, + { + "epoch": 0.7561164618620614, + "grad_norm": 17.421699829582213, + "learning_rate": 1.7052992811952411e-06, + "loss": 2.4593, + "step": 6830 + }, + { + "epoch": 0.7572235137827964, + "grad_norm": 16.49786576509242, + "learning_rate": 1.6907889759585778e-06, + "loss": 2.6817, + "step": 6840 + }, + { + "epoch": 0.7583305657035315, + "grad_norm": 14.275882435397286, + "learning_rate": 1.676328096033994e-06, + "loss": 2.4542, + "step": 6850 + }, + { + "epoch": 0.7594376176242665, + "grad_norm": 17.493762248570647, + "learning_rate": 1.6619168574045385e-06, + "loss": 2.4719, + "step": 6860 + }, + { + "epoch": 0.7605446695450017, + "grad_norm": 16.007658419129143, + "learning_rate": 1.6475554753118412e-06, + "loss": 2.4291, + "step": 6870 + }, + { + "epoch": 0.7616517214657368, + "grad_norm": 14.774826021297706, + "learning_rate": 1.6332441642528895e-06, + "loss": 2.6003, + "step": 6880 + }, + { + "epoch": 0.7627587733864718, + "grad_norm": 15.975567591762553, + "learning_rate": 1.6189831379768206e-06, + "loss": 2.5704, + "step": 6890 + }, + { + "epoch": 0.7638658253072069, + "grad_norm": 17.406951035088184, + "learning_rate": 1.604772609481744e-06, + "loss": 2.5381, + "step": 6900 + }, + { + "epoch": 0.7649728772279419, + "grad_norm": 15.245412833911804, + "learning_rate": 1.5906127910115414e-06, + "loss": 2.5041, + "step": 6910 + }, + { + "epoch": 0.7660799291486771, + "grad_norm": 18.14500430607472, + "learning_rate": 1.576503894052711e-06, + "loss": 2.4126, + "step": 6920 + }, + { + "epoch": 0.7671869810694122, + "grad_norm": 15.112940123243304, + "learning_rate": 1.5624461293312022e-06, + "loss": 2.4729, + "step": 6930 + }, + { + "epoch": 0.7682940329901472, + "grad_norm": 14.628425372895773, + "learning_rate": 1.548439706809271e-06, + "loss": 2.4399, + "step": 6940 + }, + { + "epoch": 0.7694010849108823, + "grad_norm": 14.955427356230805, + "learning_rate": 1.5344848356823395e-06, + "loss": 2.4849, + "step": 6950 + }, + { + "epoch": 0.7705081368316175, + "grad_norm": 15.352858996367999, + "learning_rate": 1.5205817243758775e-06, + "loss": 2.5061, + "step": 6960 + }, + { + "epoch": 0.7716151887523525, + "grad_norm": 15.531771804427523, + "learning_rate": 1.506730580542287e-06, + "loss": 2.5352, + "step": 6970 + }, + { + "epoch": 0.7727222406730876, + "grad_norm": 14.802901269445874, + "learning_rate": 1.4929316110577991e-06, + "loss": 2.4606, + "step": 6980 + }, + { + "epoch": 0.7738292925938226, + "grad_norm": 13.834503126554017, + "learning_rate": 1.4791850220193882e-06, + "loss": 2.4114, + "step": 6990 + }, + { + "epoch": 0.7749363445145577, + "grad_norm": 17.626871971044736, + "learning_rate": 1.4654910187416843e-06, + "loss": 2.4443, + "step": 7000 + }, + { + "epoch": 0.7760433964352929, + "grad_norm": 15.72586832532517, + "learning_rate": 1.451849805753925e-06, + "loss": 2.5959, + "step": 7010 + }, + { + "epoch": 0.7771504483560279, + "grad_norm": 19.63625622564935, + "learning_rate": 1.4382615867968768e-06, + "loss": 2.577, + "step": 7020 + }, + { + "epoch": 0.778257500276763, + "grad_norm": 16.259423437860036, + "learning_rate": 1.4247265648198122e-06, + "loss": 2.4003, + "step": 7030 + }, + { + "epoch": 0.7793645521974981, + "grad_norm": 14.868240052692464, + "learning_rate": 
1.4112449419774699e-06, + "loss": 2.4374, + "step": 7040 + }, + { + "epoch": 0.7804716041182331, + "grad_norm": 17.680915858091048, + "learning_rate": 1.3978169196270297e-06, + "loss": 2.4477, + "step": 7050 + }, + { + "epoch": 0.7815786560389683, + "grad_norm": 18.788763019346266, + "learning_rate": 1.3844426983251242e-06, + "loss": 2.6663, + "step": 7060 + }, + { + "epoch": 0.7826857079597033, + "grad_norm": 17.443967486074488, + "learning_rate": 1.3711224778248178e-06, + "loss": 2.4001, + "step": 7070 + }, + { + "epoch": 0.7837927598804384, + "grad_norm": 14.104765296687267, + "learning_rate": 1.3578564570726437e-06, + "loss": 2.5499, + "step": 7080 + }, + { + "epoch": 0.7848998118011735, + "grad_norm": 14.938982184936348, + "learning_rate": 1.344644834205624e-06, + "loss": 2.6234, + "step": 7090 + }, + { + "epoch": 0.7860068637219085, + "grad_norm": 16.601186409737505, + "learning_rate": 1.3314878065483106e-06, + "loss": 2.4678, + "step": 7100 + }, + { + "epoch": 0.7871139156426437, + "grad_norm": 16.126461328991052, + "learning_rate": 1.318385570609838e-06, + "loss": 2.5181, + "step": 7110 + }, + { + "epoch": 0.7882209675633787, + "grad_norm": 14.264212101115474, + "learning_rate": 1.3053383220809934e-06, + "loss": 2.5319, + "step": 7120 + }, + { + "epoch": 0.7893280194841138, + "grad_norm": 16.674084788709003, + "learning_rate": 1.2923462558312827e-06, + "loss": 2.5588, + "step": 7130 + }, + { + "epoch": 0.7904350714048489, + "grad_norm": 14.125047804457926, + "learning_rate": 1.2794095659060335e-06, + "loss": 2.495, + "step": 7140 + }, + { + "epoch": 0.7915421233255839, + "grad_norm": 13.689321540078824, + "learning_rate": 1.2665284455234867e-06, + "loss": 2.6346, + "step": 7150 + }, + { + "epoch": 0.7926491752463191, + "grad_norm": 17.491763233443507, + "learning_rate": 1.2537030870719159e-06, + "loss": 2.3638, + "step": 7160 + }, + { + "epoch": 0.7937562271670542, + "grad_norm": 14.712500473982459, + "learning_rate": 1.2409336821067535e-06, + "loss": 2.4199, + "step": 7170 + }, + { + "epoch": 0.7948632790877892, + "grad_norm": 13.97965354212977, + "learning_rate": 1.2282204213477233e-06, + "loss": 2.4273, + "step": 7180 + }, + { + "epoch": 0.7959703310085243, + "grad_norm": 15.125599625889896, + "learning_rate": 1.215563494676007e-06, + "loss": 2.5639, + "step": 7190 + }, + { + "epoch": 0.7970773829292593, + "grad_norm": 15.308235089960142, + "learning_rate": 1.2029630911313877e-06, + "loss": 2.4943, + "step": 7200 + }, + { + "epoch": 0.7981844348499945, + "grad_norm": 14.243073168806442, + "learning_rate": 1.1904193989094442e-06, + "loss": 2.6061, + "step": 7210 + }, + { + "epoch": 0.7992914867707296, + "grad_norm": 14.898872151747849, + "learning_rate": 1.1779326053587326e-06, + "loss": 2.6109, + "step": 7220 + }, + { + "epoch": 0.8003985386914646, + "grad_norm": 15.213968169737058, + "learning_rate": 1.165502896977983e-06, + "loss": 2.5029, + "step": 7230 + }, + { + "epoch": 0.8015055906121997, + "grad_norm": 17.57190386080436, + "learning_rate": 1.1531304594133297e-06, + "loss": 2.5218, + "step": 7240 + }, + { + "epoch": 0.8026126425329347, + "grad_norm": 14.718334901930403, + "learning_rate": 1.1408154774555185e-06, + "loss": 2.5644, + "step": 7250 + }, + { + "epoch": 0.8037196944536699, + "grad_norm": 14.70466300668309, + "learning_rate": 1.1285581350371633e-06, + "loss": 2.5673, + "step": 7260 + }, + { + "epoch": 0.804826746374405, + "grad_norm": 16.523083604536307, + "learning_rate": 1.11635861522999e-06, + "loss": 2.6119, + "step": 7270 + }, + { + "epoch": 
0.80593379829514, + "grad_norm": 16.087233648796555, + "learning_rate": 1.1042171002421038e-06, + "loss": 2.3668, + "step": 7280 + }, + { + "epoch": 0.8070408502158751, + "grad_norm": 18.219483436423715, + "learning_rate": 1.092133771415272e-06, + "loss": 2.5108, + "step": 7290 + }, + { + "epoch": 0.8081479021366103, + "grad_norm": 14.23626021764468, + "learning_rate": 1.0801088092222067e-06, + "loss": 2.5161, + "step": 7300 + }, + { + "epoch": 0.8092549540573453, + "grad_norm": 17.579234694984372, + "learning_rate": 1.0681423932638784e-06, + "loss": 2.472, + "step": 7310 + }, + { + "epoch": 0.8103620059780804, + "grad_norm": 17.509613972476572, + "learning_rate": 1.05623470226683e-06, + "loss": 2.5078, + "step": 7320 + }, + { + "epoch": 0.8114690578988154, + "grad_norm": 16.567966169697417, + "learning_rate": 1.0443859140805063e-06, + "loss": 2.5549, + "step": 7330 + }, + { + "epoch": 0.8125761098195505, + "grad_norm": 13.228102448828993, + "learning_rate": 1.032596205674598e-06, + "loss": 2.5958, + "step": 7340 + }, + { + "epoch": 0.8136831617402857, + "grad_norm": 14.33253011909644, + "learning_rate": 1.020865753136402e-06, + "loss": 2.4304, + "step": 7350 + }, + { + "epoch": 0.8147902136610207, + "grad_norm": 16.763970324305024, + "learning_rate": 1.0091947316681833e-06, + "loss": 2.5536, + "step": 7360 + }, + { + "epoch": 0.8158972655817558, + "grad_norm": 16.082943781448364, + "learning_rate": 9.975833155845687e-07, + "loss": 2.4768, + "step": 7370 + }, + { + "epoch": 0.8170043175024909, + "grad_norm": 15.909337215300724, + "learning_rate": 9.860316783099356e-07, + "loss": 2.4912, + "step": 7380 + }, + { + "epoch": 0.8181113694232259, + "grad_norm": 17.194058825674805, + "learning_rate": 9.74539992375826e-07, + "loss": 2.4761, + "step": 7390 + }, + { + "epoch": 0.8192184213439611, + "grad_norm": 15.251099269067993, + "learning_rate": 9.631084294183668e-07, + "loss": 2.538, + "step": 7400 + }, + { + "epoch": 0.8203254732646961, + "grad_norm": 14.28790996742064, + "learning_rate": 9.517371601757042e-07, + "loss": 2.536, + "step": 7410 + }, + { + "epoch": 0.8214325251854312, + "grad_norm": 17.000395820091192, + "learning_rate": 9.404263544854658e-07, + "loss": 2.4934, + "step": 7420 + }, + { + "epoch": 0.8225395771061663, + "grad_norm": 14.025873757437632, + "learning_rate": 9.291761812822054e-07, + "loss": 2.4447, + "step": 7430 + }, + { + "epoch": 0.8236466290269013, + "grad_norm": 20.369511420071024, + "learning_rate": 9.179868085948946e-07, + "loss": 2.5157, + "step": 7440 + }, + { + "epoch": 0.8247536809476365, + "grad_norm": 16.887509510072285, + "learning_rate": 9.068584035444083e-07, + "loss": 2.4785, + "step": 7450 + }, + { + "epoch": 0.8258607328683715, + "grad_norm": 15.952259196345977, + "learning_rate": 8.957911323410229e-07, + "loss": 2.4653, + "step": 7460 + }, + { + "epoch": 0.8269677847891066, + "grad_norm": 16.24199510067374, + "learning_rate": 8.847851602819485e-07, + "loss": 2.5294, + "step": 7470 + }, + { + "epoch": 0.8280748367098417, + "grad_norm": 16.976947365156782, + "learning_rate": 8.738406517488423e-07, + "loss": 2.5297, + "step": 7480 + }, + { + "epoch": 0.8291818886305767, + "grad_norm": 17.934378030024483, + "learning_rate": 8.629577702053671e-07, + "loss": 2.6052, + "step": 7490 + }, + { + "epoch": 0.8302889405513119, + "grad_norm": 15.407244538769637, + "learning_rate": 8.521366781947426e-07, + "loss": 2.4532, + "step": 7500 + }, + { + "epoch": 0.831395992472047, + "grad_norm": 15.400477059234891, + "learning_rate": 8.413775373373206e-07, + "loss": 
2.4579, + "step": 7510 + }, + { + "epoch": 0.832503044392782, + "grad_norm": 17.39392388174797, + "learning_rate": 8.306805083281705e-07, + "loss": 2.6138, + "step": 7520 + }, + { + "epoch": 0.8336100963135171, + "grad_norm": 14.342293383217136, + "learning_rate": 8.200457509346798e-07, + "loss": 2.3725, + "step": 7530 + }, + { + "epoch": 0.8347171482342521, + "grad_norm": 15.847161214149653, + "learning_rate": 8.094734239941642e-07, + "loss": 2.3768, + "step": 7540 + }, + { + "epoch": 0.8358242001549873, + "grad_norm": 17.63332070962175, + "learning_rate": 7.989636854115018e-07, + "loss": 2.4585, + "step": 7550 + }, + { + "epoch": 0.8369312520757224, + "grad_norm": 16.531198506312407, + "learning_rate": 7.885166921567705e-07, + "loss": 2.4787, + "step": 7560 + }, + { + "epoch": 0.8380383039964574, + "grad_norm": 14.28759893561945, + "learning_rate": 7.781326002628991e-07, + "loss": 2.4685, + "step": 7570 + }, + { + "epoch": 0.8391453559171925, + "grad_norm": 14.826430399325979, + "learning_rate": 7.678115648233514e-07, + "loss": 2.4173, + "step": 7580 + }, + { + "epoch": 0.8402524078379277, + "grad_norm": 14.87587504335515, + "learning_rate": 7.57553739989792e-07, + "loss": 2.51, + "step": 7590 + }, + { + "epoch": 0.8413594597586627, + "grad_norm": 17.574559912620376, + "learning_rate": 7.473592789697947e-07, + "loss": 2.4794, + "step": 7600 + }, + { + "epoch": 0.8424665116793978, + "grad_norm": 17.140986686992314, + "learning_rate": 7.37228334024555e-07, + "loss": 2.416, + "step": 7610 + }, + { + "epoch": 0.8435735636001328, + "grad_norm": 15.506861252303242, + "learning_rate": 7.271610564666054e-07, + "loss": 2.3907, + "step": 7620 + }, + { + "epoch": 0.8446806155208679, + "grad_norm": 15.538508359449784, + "learning_rate": 7.171575966575722e-07, + "loss": 2.5462, + "step": 7630 + }, + { + "epoch": 0.845787667441603, + "grad_norm": 16.810003606583724, + "learning_rate": 7.072181040059123e-07, + "loss": 2.486, + "step": 7640 + }, + { + "epoch": 0.8468947193623381, + "grad_norm": 17.523279420449594, + "learning_rate": 6.973427269646932e-07, + "loss": 2.4714, + "step": 7650 + }, + { + "epoch": 0.8480017712830732, + "grad_norm": 14.739045055561698, + "learning_rate": 6.875316130293724e-07, + "loss": 2.5424, + "step": 7660 + }, + { + "epoch": 0.8491088232038082, + "grad_norm": 15.925664585980916, + "learning_rate": 6.777849087355932e-07, + "loss": 2.4951, + "step": 7670 + }, + { + "epoch": 0.8502158751245433, + "grad_norm": 14.15278086352724, + "learning_rate": 6.681027596569988e-07, + "loss": 2.4984, + "step": 7680 + }, + { + "epoch": 0.8513229270452785, + "grad_norm": 14.613485875082265, + "learning_rate": 6.584853104030553e-07, + "loss": 2.415, + "step": 7690 + }, + { + "epoch": 0.8524299789660135, + "grad_norm": 13.79991123891203, + "learning_rate": 6.48932704616892e-07, + "loss": 2.4957, + "step": 7700 + }, + { + "epoch": 0.8535370308867486, + "grad_norm": 16.538555088229636, + "learning_rate": 6.394450849731587e-07, + "loss": 2.5322, + "step": 7710 + }, + { + "epoch": 0.8546440828074837, + "grad_norm": 17.641076622043553, + "learning_rate": 6.300225931758924e-07, + "loss": 2.4296, + "step": 7720 + }, + { + "epoch": 0.8557511347282187, + "grad_norm": 17.606467927789563, + "learning_rate": 6.206653699564014e-07, + "loss": 2.5163, + "step": 7730 + }, + { + "epoch": 0.8568581866489539, + "grad_norm": 17.809260161423225, + "learning_rate": 6.113735550711658e-07, + "loss": 2.4642, + "step": 7740 + }, + { + "epoch": 0.8579652385696889, + "grad_norm": 13.623839785347023, + "learning_rate": 
6.021472872997419e-07, + "loss": 2.512, + "step": 7750 + }, + { + "epoch": 0.859072290490424, + "grad_norm": 18.78017884173273, + "learning_rate": 5.929867044427035e-07, + "loss": 2.4144, + "step": 7760 + }, + { + "epoch": 0.8601793424111591, + "grad_norm": 16.837093504212152, + "learning_rate": 5.838919433195678e-07, + "loss": 2.5047, + "step": 7770 + }, + { + "epoch": 0.8612863943318941, + "grad_norm": 16.87004336022709, + "learning_rate": 5.748631397667654e-07, + "loss": 2.5213, + "step": 7780 + }, + { + "epoch": 0.8623934462526293, + "grad_norm": 15.69091627736047, + "learning_rate": 5.659004286356045e-07, + "loss": 2.5533, + "step": 7790 + }, + { + "epoch": 0.8635004981733644, + "grad_norm": 14.187307779530673, + "learning_rate": 5.570039437902536e-07, + "loss": 2.441, + "step": 7800 + }, + { + "epoch": 0.8646075500940994, + "grad_norm": 17.93288869083588, + "learning_rate": 5.481738181057556e-07, + "loss": 2.5006, + "step": 7810 + }, + { + "epoch": 0.8657146020148345, + "grad_norm": 15.826634381411255, + "learning_rate": 5.394101834660253e-07, + "loss": 2.4135, + "step": 7820 + }, + { + "epoch": 0.8668216539355695, + "grad_norm": 16.596251661361375, + "learning_rate": 5.307131707618934e-07, + "loss": 2.4909, + "step": 7830 + }, + { + "epoch": 0.8679287058563047, + "grad_norm": 15.129013018674039, + "learning_rate": 5.220829098891472e-07, + "loss": 2.4429, + "step": 7840 + }, + { + "epoch": 0.8690357577770398, + "grad_norm": 14.305450352981211, + "learning_rate": 5.135195297465878e-07, + "loss": 2.4862, + "step": 7850 + }, + { + "epoch": 0.8701428096977748, + "grad_norm": 12.863234686905033, + "learning_rate": 5.050231582341092e-07, + "loss": 2.4616, + "step": 7860 + }, + { + "epoch": 0.8712498616185099, + "grad_norm": 13.900921327498637, + "learning_rate": 4.965939222507832e-07, + "loss": 2.5505, + "step": 7870 + }, + { + "epoch": 0.8723569135392449, + "grad_norm": 15.774427990260946, + "learning_rate": 4.882319476929698e-07, + "loss": 2.4643, + "step": 7880 + }, + { + "epoch": 0.8734639654599801, + "grad_norm": 18.695971386290847, + "learning_rate": 4.799373594524332e-07, + "loss": 2.4695, + "step": 7890 + }, + { + "epoch": 0.8745710173807152, + "grad_norm": 15.398391843940924, + "learning_rate": 4.7171028141447693e-07, + "loss": 2.5612, + "step": 7900 + }, + { + "epoch": 0.8756780693014502, + "grad_norm": 13.563010738327588, + "learning_rate": 4.635508364560937e-07, + "loss": 2.4357, + "step": 7910 + }, + { + "epoch": 0.8767851212221853, + "grad_norm": 14.212010150425057, + "learning_rate": 4.5545914644413103e-07, + "loss": 2.4529, + "step": 7920 + }, + { + "epoch": 0.8778921731429205, + "grad_norm": 13.857542112005609, + "learning_rate": 4.474353322334679e-07, + "loss": 2.4963, + "step": 7930 + }, + { + "epoch": 0.8789992250636555, + "grad_norm": 14.666024515134973, + "learning_rate": 4.394795136652169e-07, + "loss": 2.4512, + "step": 7940 + }, + { + "epoch": 0.8801062769843906, + "grad_norm": 16.841948685566276, + "learning_rate": 4.315918095649246e-07, + "loss": 2.5056, + "step": 7950 + }, + { + "epoch": 0.8812133289051256, + "grad_norm": 15.413187142241657, + "learning_rate": 4.2377233774080427e-07, + "loss": 2.5528, + "step": 7960 + }, + { + "epoch": 0.8823203808258607, + "grad_norm": 13.784700431727842, + "learning_rate": 4.1602121498197477e-07, + "loss": 2.4622, + "step": 7970 + }, + { + "epoch": 0.8834274327465959, + "grad_norm": 14.844903872123188, + "learning_rate": 4.0833855705671057e-07, + "loss": 2.4508, + "step": 7980 + }, + { + "epoch": 0.8845344846673309, + 
"grad_norm": 16.047205147717147, + "learning_rate": 4.0072447871072507e-07, + "loss": 2.4968, + "step": 7990 + }, + { + "epoch": 0.885641536588066, + "grad_norm": 12.9721170754397, + "learning_rate": 3.931790936654417e-07, + "loss": 2.3906, + "step": 8000 + }, + { + "epoch": 0.885641536588066, + "eval_loss": 2.48763370513916, + "eval_runtime": 2402.0825, + "eval_samples_per_second": 4.178, + "eval_steps_per_second": 0.418, + "step": 8000 + }, + { + "epoch": 0.8867485885088011, + "grad_norm": 15.854557624198474, + "learning_rate": 3.8570251461630735e-07, + "loss": 2.4579, + "step": 8010 + }, + { + "epoch": 0.8878556404295361, + "grad_norm": 16.026725672049096, + "learning_rate": 3.7829485323110316e-07, + "loss": 2.3463, + "step": 8020 + }, + { + "epoch": 0.8889626923502713, + "grad_norm": 16.073422441115532, + "learning_rate": 3.709562201482769e-07, + "loss": 2.4243, + "step": 8030 + }, + { + "epoch": 0.8900697442710063, + "grad_norm": 15.38779771086279, + "learning_rate": 3.636867249752962e-07, + "loss": 2.3858, + "step": 8040 + }, + { + "epoch": 0.8911767961917414, + "grad_norm": 16.258826268938925, + "learning_rate": 3.564864762870013e-07, + "loss": 2.5358, + "step": 8050 + }, + { + "epoch": 0.8922838481124765, + "grad_norm": 15.02798068624606, + "learning_rate": 3.49355581623993e-07, + "loss": 2.4421, + "step": 8060 + }, + { + "epoch": 0.8933909000332115, + "grad_norm": 16.654143045304426, + "learning_rate": 3.4229414749102186e-07, + "loss": 2.5125, + "step": 8070 + }, + { + "epoch": 0.8944979519539467, + "grad_norm": 13.762735453146883, + "learning_rate": 3.353022793553978e-07, + "loss": 2.6232, + "step": 8080 + }, + { + "epoch": 0.8956050038746817, + "grad_norm": 11.721658803005548, + "learning_rate": 3.2838008164541577e-07, + "loss": 2.4208, + "step": 8090 + }, + { + "epoch": 0.8967120557954168, + "grad_norm": 15.661791327346446, + "learning_rate": 3.215276577487969e-07, + "loss": 2.5037, + "step": 8100 + }, + { + "epoch": 0.8978191077161519, + "grad_norm": 14.437374220759548, + "learning_rate": 3.1474511001113926e-07, + "loss": 2.453, + "step": 8110 + }, + { + "epoch": 0.8989261596368869, + "grad_norm": 23.96541891259206, + "learning_rate": 3.080325397343969e-07, + "loss": 2.4866, + "step": 8120 + }, + { + "epoch": 0.9000332115576221, + "grad_norm": 14.703254905186904, + "learning_rate": 3.013900471753628e-07, + "loss": 2.5269, + "step": 8130 + }, + { + "epoch": 0.9011402634783572, + "grad_norm": 17.763519535077947, + "learning_rate": 2.948177315441669e-07, + "loss": 2.5009, + "step": 8140 + }, + { + "epoch": 0.9022473153990922, + "grad_norm": 18.50559050540985, + "learning_rate": 2.883156910028073e-07, + "loss": 2.4501, + "step": 8150 + }, + { + "epoch": 0.9033543673198273, + "grad_norm": 13.811835537975867, + "learning_rate": 2.818840226636671e-07, + "loss": 2.3126, + "step": 8160 + }, + { + "epoch": 0.9044614192405623, + "grad_norm": 18.988992451266952, + "learning_rate": 2.7552282258808125e-07, + "loss": 2.4317, + "step": 8170 + }, + { + "epoch": 0.9055684711612975, + "grad_norm": 16.031786166509363, + "learning_rate": 2.6923218578488674e-07, + "loss": 2.4247, + "step": 8180 + }, + { + "epoch": 0.9066755230820326, + "grad_norm": 18.728666251016826, + "learning_rate": 2.630122062090118e-07, + "loss": 2.3527, + "step": 8190 + }, + { + "epoch": 0.9077825750027676, + "grad_norm": 19.825199377152217, + "learning_rate": 2.568629767600744e-07, + "loss": 2.6088, + "step": 8200 + }, + { + "epoch": 0.9088896269235027, + "grad_norm": 16.87396488382408, + "learning_rate": 
2.507845892809868e-07, + "loss": 2.3591, + "step": 8210 + }, + { + "epoch": 0.9099966788442378, + "grad_norm": 14.693971646543563, + "learning_rate": 2.4477713455659136e-07, + "loss": 2.4239, + "step": 8220 + }, + { + "epoch": 0.9111037307649729, + "grad_norm": 15.947418670710583, + "learning_rate": 2.388407023123007e-07, + "loss": 2.4616, + "step": 8230 + }, + { + "epoch": 0.912210782685708, + "grad_norm": 16.454200712334917, + "learning_rate": 2.329753812127583e-07, + "loss": 2.4244, + "step": 8240 + }, + { + "epoch": 0.913317834606443, + "grad_norm": 16.497222276931957, + "learning_rate": 2.2718125886051433e-07, + "loss": 2.5867, + "step": 8250 + }, + { + "epoch": 0.9144248865271781, + "grad_norm": 16.43228833811835, + "learning_rate": 2.214584217947191e-07, + "loss": 2.4391, + "step": 8260 + }, + { + "epoch": 0.9155319384479133, + "grad_norm": 16.78472579386922, + "learning_rate": 2.1580695548982567e-07, + "loss": 2.4242, + "step": 8270 + }, + { + "epoch": 0.9166389903686483, + "grad_norm": 16.546048611992425, + "learning_rate": 2.1022694435431868e-07, + "loss": 2.4872, + "step": 8280 + }, + { + "epoch": 0.9177460422893834, + "grad_norm": 16.770801344250373, + "learning_rate": 2.0471847172945036e-07, + "loss": 2.4296, + "step": 8290 + }, + { + "epoch": 0.9188530942101184, + "grad_norm": 16.27109240174247, + "learning_rate": 1.9928161988799765e-07, + "loss": 2.5068, + "step": 8300 + }, + { + "epoch": 0.9199601461308535, + "grad_norm": 12.512458168250634, + "learning_rate": 1.939164700330326e-07, + "loss": 2.4175, + "step": 8310 + }, + { + "epoch": 0.9210671980515887, + "grad_norm": 14.798188108228695, + "learning_rate": 1.8862310229670612e-07, + "loss": 2.5059, + "step": 8320 + }, + { + "epoch": 0.9221742499723237, + "grad_norm": 12.936659113537381, + "learning_rate": 1.8340159573906058e-07, + "loss": 2.447, + "step": 8330 + }, + { + "epoch": 0.9232813018930588, + "grad_norm": 15.624562309086738, + "learning_rate": 1.782520283468364e-07, + "loss": 2.4359, + "step": 8340 + }, + { + "epoch": 0.9243883538137939, + "grad_norm": 17.36536742613116, + "learning_rate": 1.7317447703231849e-07, + "loss": 2.5658, + "step": 8350 + }, + { + "epoch": 0.9254954057345289, + "grad_norm": 15.391131130821295, + "learning_rate": 1.6816901763218152e-07, + "loss": 2.5091, + "step": 8360 + }, + { + "epoch": 0.9266024576552641, + "grad_norm": 15.684736857963308, + "learning_rate": 1.6323572490635543e-07, + "loss": 2.4168, + "step": 8370 + }, + { + "epoch": 0.9277095095759991, + "grad_norm": 18.001498021183778, + "learning_rate": 1.5837467253691784e-07, + "loss": 2.5202, + "step": 8380 + }, + { + "epoch": 0.9288165614967342, + "grad_norm": 15.913434285236699, + "learning_rate": 1.5358593312698178e-07, + "loss": 2.6434, + "step": 8390 + }, + { + "epoch": 0.9299236134174693, + "grad_norm": 15.844539221853895, + "learning_rate": 1.4886957819962077e-07, + "loss": 2.4848, + "step": 8400 + }, + { + "epoch": 0.9310306653382043, + "grad_norm": 14.883294572064472, + "learning_rate": 1.4422567819679546e-07, + "loss": 2.4281, + "step": 8410 + }, + { + "epoch": 0.9321377172589395, + "grad_norm": 14.583778182281327, + "learning_rate": 1.3965430247830426e-07, + "loss": 2.4246, + "step": 8420 + }, + { + "epoch": 0.9332447691796745, + "grad_norm": 16.084883433598268, + "learning_rate": 1.3515551932074488e-07, + "loss": 2.506, + "step": 8430 + }, + { + "epoch": 0.9343518211004096, + "grad_norm": 13.726377273149337, + "learning_rate": 1.307293959164957e-07, + "loss": 2.5495, + "step": 8440 + }, + { + "epoch": 0.9354588730211447, 
+ "grad_norm": 17.060608694253016, + "learning_rate": 1.263759983727142e-07, + "loss": 2.337, + "step": 8450 + }, + { + "epoch": 0.9365659249418797, + "grad_norm": 14.801435939071636, + "learning_rate": 1.2209539171034623e-07, + "loss": 2.5042, + "step": 8460 + }, + { + "epoch": 0.9376729768626149, + "grad_norm": 15.589161221895887, + "learning_rate": 1.1788763986315621e-07, + "loss": 2.5061, + "step": 8470 + }, + { + "epoch": 0.93878002878335, + "grad_norm": 16.33836597070153, + "learning_rate": 1.1375280567677393e-07, + "loss": 2.3671, + "step": 8480 + }, + { + "epoch": 0.939887080704085, + "grad_norm": 18.905885083448613, + "learning_rate": 1.0969095090775428e-07, + "loss": 2.6181, + "step": 8490 + }, + { + "epoch": 0.9409941326248201, + "grad_norm": 16.762390629046585, + "learning_rate": 1.0570213622265236e-07, + "loss": 2.4327, + "step": 8500 + }, + { + "epoch": 0.9421011845455551, + "grad_norm": 16.525181960248243, + "learning_rate": 1.0178642119712368e-07, + "loss": 2.4993, + "step": 8510 + }, + { + "epoch": 0.9432082364662903, + "grad_norm": 16.7132011851729, + "learning_rate": 9.794386431502822e-08, + "loss": 2.5366, + "step": 8520 + }, + { + "epoch": 0.9443152883870254, + "grad_norm": 13.853046661215803, + "learning_rate": 9.417452296756114e-08, + "loss": 2.4832, + "step": 8530 + }, + { + "epoch": 0.9454223403077604, + "grad_norm": 15.293104772530375, + "learning_rate": 9.04784534523928e-08, + "loss": 2.3633, + "step": 8540 + }, + { + "epoch": 0.9465293922284955, + "grad_norm": 14.980540551389215, + "learning_rate": 8.685571097282852e-08, + "loss": 2.4849, + "step": 8550 + }, + { + "epoch": 0.9476364441492307, + "grad_norm": 18.693304270023244, + "learning_rate": 8.33063496369868e-08, + "loss": 2.5602, + "step": 8560 + }, + { + "epoch": 0.9487434960699657, + "grad_norm": 15.253297927027766, + "learning_rate": 7.98304224569868e-08, + "loss": 2.4879, + "step": 8570 + }, + { + "epoch": 0.9498505479907008, + "grad_norm": 20.092545101378285, + "learning_rate": 7.642798134815943e-08, + "loss": 2.5095, + "step": 8580 + }, + { + "epoch": 0.9509575999114358, + "grad_norm": 16.041421606524025, + "learning_rate": 7.309907712827192e-08, + "loss": 2.4647, + "step": 8590 + }, + { + "epoch": 0.9520646518321709, + "grad_norm": 15.859909299358135, + "learning_rate": 6.984375951676614e-08, + "loss": 2.5593, + "step": 8600 + }, + { + "epoch": 0.953171703752906, + "grad_norm": 19.216229700494758, + "learning_rate": 6.66620771340215e-08, + "loss": 2.3626, + "step": 8610 + }, + { + "epoch": 0.9542787556736411, + "grad_norm": 17.889324656581575, + "learning_rate": 6.355407750062215e-08, + "loss": 2.6562, + "step": 8620 + }, + { + "epoch": 0.9553858075943762, + "grad_norm": 13.458822428770242, + "learning_rate": 6.051980703665138e-08, + "loss": 2.3909, + "step": 8630 + }, + { + "epoch": 0.9564928595151112, + "grad_norm": 17.008353277644698, + "learning_rate": 5.755931106099788e-08, + "loss": 2.4223, + "step": 8640 + }, + { + "epoch": 0.9575999114358463, + "grad_norm": 16.78426968156743, + "learning_rate": 5.4672633790677775e-08, + "loss": 2.6265, + "step": 8650 + }, + { + "epoch": 0.9587069633565815, + "grad_norm": 17.958386496220644, + "learning_rate": 5.185981834017473e-08, + "loss": 2.5093, + "step": 8660 + }, + { + "epoch": 0.9598140152773165, + "grad_norm": 17.46930815569884, + "learning_rate": 4.91209067207965e-08, + "loss": 2.4249, + "step": 8670 + }, + { + "epoch": 0.9609210671980516, + "grad_norm": 17.891927563958056, + "learning_rate": 4.645593984004604e-08, + "loss": 2.533, + "step": 8680 + 
}, + { + "epoch": 0.9620281191187867, + "grad_norm": 13.675101972798346, + "learning_rate": 4.386495750101194e-08, + "loss": 2.4507, + "step": 8690 + }, + { + "epoch": 0.9631351710395217, + "grad_norm": 16.01872970692231, + "learning_rate": 4.1347998401773945e-08, + "loss": 2.4702, + "step": 8700 + }, + { + "epoch": 0.9642422229602569, + "grad_norm": 17.620120107441487, + "learning_rate": 3.890510013482396e-08, + "loss": 2.3592, + "step": 8710 + }, + { + "epoch": 0.9653492748809919, + "grad_norm": 13.329706465049831, + "learning_rate": 3.653629918650536e-08, + "loss": 2.4662, + "step": 8720 + }, + { + "epoch": 0.966456326801727, + "grad_norm": 14.570283074571352, + "learning_rate": 3.424163093646682e-08, + "loss": 2.3495, + "step": 8730 + }, + { + "epoch": 0.9675633787224621, + "grad_norm": 13.873984864746625, + "learning_rate": 3.202112965713655e-08, + "loss": 2.367, + "step": 8740 + }, + { + "epoch": 0.9686704306431971, + "grad_norm": 13.467781119638207, + "learning_rate": 2.987482851320778e-08, + "loss": 2.3987, + "step": 8750 + }, + { + "epoch": 0.9697774825639323, + "grad_norm": 15.489672705466763, + "learning_rate": 2.7802759561144088e-08, + "loss": 2.425, + "step": 8760 + }, + { + "epoch": 0.9708845344846674, + "grad_norm": 20.126982289743573, + "learning_rate": 2.580495374870151e-08, + "loss": 2.5085, + "step": 8770 + }, + { + "epoch": 0.9719915864054024, + "grad_norm": 16.778268839880404, + "learning_rate": 2.388144091446498e-08, + "loss": 2.463, + "step": 8780 + }, + { + "epoch": 0.9730986383261375, + "grad_norm": 20.49911635255473, + "learning_rate": 2.2032249787404258e-08, + "loss": 2.5278, + "step": 8790 + }, + { + "epoch": 0.9742056902468725, + "grad_norm": 16.182685372782867, + "learning_rate": 2.0257407986443713e-08, + "loss": 2.4702, + "step": 8800 + }, + { + "epoch": 0.9753127421676077, + "grad_norm": 14.885149821948326, + "learning_rate": 1.8556942020049872e-08, + "loss": 2.5026, + "step": 8810 + }, + { + "epoch": 0.9764197940883428, + "grad_norm": 18.03209004223668, + "learning_rate": 1.6930877285835644e-08, + "loss": 2.5576, + "step": 8820 + }, + { + "epoch": 0.9775268460090778, + "grad_norm": 15.861290907259685, + "learning_rate": 1.5379238070181158e-08, + "loss": 2.5681, + "step": 8830 + }, + { + "epoch": 0.9786338979298129, + "grad_norm": 16.532161800217157, + "learning_rate": 1.3902047547871278e-08, + "loss": 2.4926, + "step": 8840 + }, + { + "epoch": 0.9797409498505479, + "grad_norm": 14.626301967154978, + "learning_rate": 1.2499327781748116e-08, + "loss": 2.4547, + "step": 8850 + }, + { + "epoch": 0.9808480017712831, + "grad_norm": 18.74363118033889, + "learning_rate": 1.1171099722383506e-08, + "loss": 2.5054, + "step": 8860 + }, + { + "epoch": 0.9819550536920182, + "grad_norm": 17.476707949807594, + "learning_rate": 9.917383207765363e-09, + "loss": 2.4136, + "step": 8870 + }, + { + "epoch": 0.9830621056127532, + "grad_norm": 15.362525353056075, + "learning_rate": 8.738196962999601e-09, + "loss": 2.5267, + "step": 8880 + }, + { + "epoch": 0.9841691575334883, + "grad_norm": 15.763132841414992, + "learning_rate": 7.633558600033675e-09, + "loss": 2.4059, + "step": 8890 + }, + { + "epoch": 0.9852762094542235, + "grad_norm": 16.971970282791062, + "learning_rate": 6.603484617390688e-09, + "loss": 2.5169, + "step": 8900 + }, + { + "epoch": 0.9863832613749585, + "grad_norm": 16.309961470302227, + "learning_rate": 5.647990399924031e-09, + "loss": 2.4272, + "step": 8910 + }, + { + "epoch": 0.9874903132956936, + "grad_norm": 16.05736958282543, + "learning_rate": 
4.767090218589232e-09, + "loss": 2.5884, + "step": 8920 + }, + { + "epoch": 0.9885973652164286, + "grad_norm": 15.10152496394929, + "learning_rate": 3.960797230227465e-09, + "loss": 2.5573, + "step": 8930 + }, + { + "epoch": 0.9897044171371637, + "grad_norm": 14.741682018743976, + "learning_rate": 3.2291234773718093e-09, + "loss": 2.3819, + "step": 8940 + }, + { + "epoch": 0.9908114690578989, + "grad_norm": 15.440675776317423, + "learning_rate": 2.5720798880662922e-09, + "loss": 2.4611, + "step": 8950 + }, + { + "epoch": 0.9919185209786339, + "grad_norm": 13.979973708890682, + "learning_rate": 1.989676275702679e-09, + "loss": 2.4037, + "step": 8960 + }, + { + "epoch": 0.993025572899369, + "grad_norm": 19.373337099900795, + "learning_rate": 1.4819213388744814e-09, + "loss": 2.4966, + "step": 8970 + }, + { + "epoch": 0.9941326248201041, + "grad_norm": 17.103724133893802, + "learning_rate": 1.0488226612459517e-09, + "loss": 2.505, + "step": 8980 + }, + { + "epoch": 0.9952396767408391, + "grad_norm": 16.90633557993371, + "learning_rate": 6.903867114393947e-10, + "loss": 2.5781, + "step": 8990 + }, + { + "epoch": 0.9963467286615743, + "grad_norm": 16.59692250103923, + "learning_rate": 4.0661884293913266e-10, + "loss": 2.5521, + "step": 9000 + }, + { + "epoch": 0.9974537805823093, + "grad_norm": 15.318767567204494, + "learning_rate": 1.97523294011015e-10, + "loss": 2.488, + "step": 9010 + }, + { + "epoch": 0.9985608325030444, + "grad_norm": 14.806481544474932, + "learning_rate": 6.310318763858014e-11, + "loss": 2.4538, + "step": 9020 + }, + { + "epoch": 0.9996678844237795, + "grad_norm": 16.39588765945003, + "learning_rate": 3.360531477536455e-12, + "loss": 2.4834, + "step": 9030 + }, + { + "epoch": 1.0, + "step": 9033, + "total_flos": 227316538671104.0, + "train_loss": 2.714383109890978, + "train_runtime": 83244.0657, + "train_samples_per_second": 1.085, + "train_steps_per_second": 0.109 + } + ], + "logging_steps": 10, + "max_steps": 9033, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 2000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 227316538671104.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}
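
The state above closes the log_history array and records the run's final aggregates (train_loss ≈ 2.714 over 9,033 steps in 83,244 s, with logging every 10 steps and eval/save every 2,000). As a minimal sketch, assuming the JSON above is saved locally as trainer_state.json (the name the HuggingFace Trainer writes into each checkpoint directory), the snippet below reads it with only the standard library and summarizes the logged losses; the key-based filtering is an assumption drawn from the entry shapes visible in this file (train entries carry "loss", the step-8000 entry carries "eval_loss", and the closing epoch-1.0 entry carries aggregate "train_loss").

    import json

    # Assumed local copy of the trainer_state.json shown above.
    with open("trainer_state.json") as f:
        state = json.load(f)

    # Split log_history by entry shape: per-step training logs, eval logs,
    # and the final aggregate entry appended at the end of training.
    train_logs = [e for e in state["log_history"] if "loss" in e]
    eval_logs = [e for e in state["log_history"] if "eval_loss" in e]
    summary = state["log_history"][-1]

    print(f"logged train points : {len(train_logs)}")
    print(f"final logged loss   : {train_logs[-1]['loss']} at step {train_logs[-1]['step']}")
    print(f"eval points         : {[(e['step'], e['eval_loss']) for e in eval_logs]}")
    print(f"aggregate train_loss: {summary.get('train_loss')}")
    print(f"train_runtime (s)   : {summary.get('train_runtime')}")

The same train_logs list can be fed to any plotting tool to visualize the loss curve against "step"; nothing beyond the JSON structure shown in this file is required.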