{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.093095281472035, "eval_steps": 500, "global_step": 3000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0036436509382401167, "grad_norm": 0.6875, "learning_rate": 9.987852283770651e-05, "loss": 3.4902, "step": 10 }, { "epoch": 0.007287301876480233, "grad_norm": 0.66796875, "learning_rate": 9.975704567541302e-05, "loss": 3.3432, "step": 20 }, { "epoch": 0.01093095281472035, "grad_norm": 0.5546875, "learning_rate": 9.963556851311953e-05, "loss": 3.2381, "step": 30 }, { "epoch": 0.014574603752960467, "grad_norm": 0.65234375, "learning_rate": 9.951409135082604e-05, "loss": 3.2931, "step": 40 }, { "epoch": 0.018218254691200583, "grad_norm": 0.6328125, "learning_rate": 9.939261418853257e-05, "loss": 3.3235, "step": 50 }, { "epoch": 0.0218619056294407, "grad_norm": 0.64453125, "learning_rate": 9.927113702623908e-05, "loss": 3.2988, "step": 60 }, { "epoch": 0.025505556567680818, "grad_norm": 0.59765625, "learning_rate": 9.914965986394558e-05, "loss": 3.2927, "step": 70 }, { "epoch": 0.029149207505920934, "grad_norm": 0.57421875, "learning_rate": 9.90281827016521e-05, "loss": 3.275, "step": 80 }, { "epoch": 0.03279285844416105, "grad_norm": 0.640625, "learning_rate": 9.89067055393586e-05, "loss": 3.316, "step": 90 }, { "epoch": 0.036436509382401165, "grad_norm": 0.57421875, "learning_rate": 9.878522837706513e-05, "loss": 3.2611, "step": 100 }, { "epoch": 0.04008016032064128, "grad_norm": 0.51171875, "learning_rate": 9.866375121477162e-05, "loss": 3.268, "step": 110 }, { "epoch": 0.0437238112588814, "grad_norm": 0.703125, "learning_rate": 9.854227405247813e-05, "loss": 3.3032, "step": 120 }, { "epoch": 0.04736746219712151, "grad_norm": 0.5546875, "learning_rate": 9.842079689018465e-05, "loss": 3.3334, "step": 130 }, { "epoch": 0.051011113135361635, "grad_norm": 0.671875, "learning_rate": 9.829931972789116e-05, "loss": 3.1943, "step": 140 }, { "epoch": 0.05465476407360175, "grad_norm": 0.6171875, "learning_rate": 9.817784256559767e-05, "loss": 3.2574, "step": 150 }, { "epoch": 0.05829841501184187, "grad_norm": 0.66015625, "learning_rate": 9.805636540330418e-05, "loss": 3.3747, "step": 160 }, { "epoch": 0.06194206595008198, "grad_norm": 0.52734375, "learning_rate": 9.793488824101069e-05, "loss": 3.2992, "step": 170 }, { "epoch": 0.0655857168883221, "grad_norm": 0.50390625, "learning_rate": 9.781341107871722e-05, "loss": 3.2342, "step": 180 }, { "epoch": 0.06922936782656222, "grad_norm": 0.65234375, "learning_rate": 9.769193391642371e-05, "loss": 3.356, "step": 190 }, { "epoch": 0.07287301876480233, "grad_norm": 0.57421875, "learning_rate": 9.757045675413022e-05, "loss": 3.3618, "step": 200 }, { "epoch": 0.07651666970304245, "grad_norm": 0.58984375, "learning_rate": 9.744897959183674e-05, "loss": 3.2931, "step": 210 }, { "epoch": 0.08016032064128256, "grad_norm": 0.77734375, "learning_rate": 9.732750242954325e-05, "loss": 3.3246, "step": 220 }, { "epoch": 0.08380397157952268, "grad_norm": 0.5859375, "learning_rate": 9.720602526724975e-05, "loss": 3.3181, "step": 230 }, { "epoch": 0.0874476225177628, "grad_norm": 0.640625, "learning_rate": 9.708454810495627e-05, "loss": 3.2757, "step": 240 }, { "epoch": 0.09109127345600292, "grad_norm": 0.55859375, "learning_rate": 9.696307094266278e-05, "loss": 3.2753, "step": 250 }, { "epoch": 0.09473492439424303, "grad_norm": 0.58203125, "learning_rate": 9.68415937803693e-05, "loss": 3.3207, "step": 260 }, { "epoch": 0.09837857533248315, "grad_norm": 0.63671875, "learning_rate": 9.67201166180758e-05, "loss": 3.3035, "step": 270 }, { "epoch": 0.10202222627072327, "grad_norm": 0.578125, "learning_rate": 9.659863945578231e-05, "loss": 3.3025, "step": 280 }, { "epoch": 0.10566587720896338, "grad_norm": 0.5859375, "learning_rate": 9.647716229348883e-05, "loss": 3.2066, "step": 290 }, { "epoch": 0.1093095281472035, "grad_norm": 0.7109375, "learning_rate": 9.635568513119534e-05, "loss": 3.2757, "step": 300 }, { "epoch": 0.11295317908544361, "grad_norm": 0.609375, "learning_rate": 9.623420796890185e-05, "loss": 3.1904, "step": 310 }, { "epoch": 0.11659683002368373, "grad_norm": 0.60546875, "learning_rate": 9.611273080660836e-05, "loss": 3.1947, "step": 320 }, { "epoch": 0.12024048096192384, "grad_norm": 0.6171875, "learning_rate": 9.599125364431487e-05, "loss": 3.2016, "step": 330 }, { "epoch": 0.12388413190016397, "grad_norm": 0.640625, "learning_rate": 9.58697764820214e-05, "loss": 3.329, "step": 340 }, { "epoch": 0.12752778283840407, "grad_norm": 0.66796875, "learning_rate": 9.574829931972789e-05, "loss": 3.2483, "step": 350 }, { "epoch": 0.1311714337766442, "grad_norm": 0.57421875, "learning_rate": 9.56268221574344e-05, "loss": 3.2388, "step": 360 }, { "epoch": 0.13481508471488432, "grad_norm": 0.58984375, "learning_rate": 9.550534499514092e-05, "loss": 3.2722, "step": 370 }, { "epoch": 0.13845873565312444, "grad_norm": 0.58203125, "learning_rate": 9.538386783284743e-05, "loss": 3.2672, "step": 380 }, { "epoch": 0.14210238659136454, "grad_norm": 0.5234375, "learning_rate": 9.526239067055394e-05, "loss": 3.3378, "step": 390 }, { "epoch": 0.14574603752960466, "grad_norm": 0.55859375, "learning_rate": 9.514091350826045e-05, "loss": 3.2637, "step": 400 }, { "epoch": 0.14938968846784478, "grad_norm": 0.70703125, "learning_rate": 9.501943634596696e-05, "loss": 3.2879, "step": 410 }, { "epoch": 0.1530333394060849, "grad_norm": 0.6640625, "learning_rate": 9.489795918367348e-05, "loss": 3.2614, "step": 420 }, { "epoch": 0.156676990344325, "grad_norm": 0.625, "learning_rate": 9.477648202137999e-05, "loss": 3.2469, "step": 430 }, { "epoch": 0.16032064128256512, "grad_norm": 0.5703125, "learning_rate": 9.465500485908649e-05, "loss": 3.1614, "step": 440 }, { "epoch": 0.16396429222080525, "grad_norm": 0.59765625, "learning_rate": 9.453352769679301e-05, "loss": 3.2658, "step": 450 }, { "epoch": 0.16760794315904537, "grad_norm": 0.6953125, "learning_rate": 9.441205053449952e-05, "loss": 3.3253, "step": 460 }, { "epoch": 0.1712515940972855, "grad_norm": 0.67578125, "learning_rate": 9.429057337220603e-05, "loss": 3.2311, "step": 470 }, { "epoch": 0.1748952450355256, "grad_norm": 0.625, "learning_rate": 9.416909620991254e-05, "loss": 3.3117, "step": 480 }, { "epoch": 0.1785388959737657, "grad_norm": 0.6640625, "learning_rate": 9.404761904761905e-05, "loss": 3.3513, "step": 490 }, { "epoch": 0.18218254691200583, "grad_norm": 0.5703125, "learning_rate": 9.392614188532556e-05, "loss": 3.3071, "step": 500 }, { "epoch": 0.18582619785024596, "grad_norm": 0.5703125, "learning_rate": 9.380466472303208e-05, "loss": 3.3047, "step": 510 }, { "epoch": 0.18946984878848605, "grad_norm": 0.58984375, "learning_rate": 9.368318756073858e-05, "loss": 3.1964, "step": 520 }, { "epoch": 0.19311349972672617, "grad_norm": 0.57421875, "learning_rate": 9.35617103984451e-05, "loss": 3.2459, "step": 530 }, { "epoch": 0.1967571506649663, "grad_norm": 0.62109375, "learning_rate": 9.344023323615161e-05, "loss": 3.205, "step": 540 }, { "epoch": 0.20040080160320642, "grad_norm": 0.66015625, "learning_rate": 9.331875607385812e-05, "loss": 3.2856, "step": 550 }, { "epoch": 0.20404445254144654, "grad_norm": 0.52734375, "learning_rate": 9.319727891156463e-05, "loss": 3.185, "step": 560 }, { "epoch": 0.20768810347968664, "grad_norm": 0.5546875, "learning_rate": 9.307580174927114e-05, "loss": 3.3071, "step": 570 }, { "epoch": 0.21133175441792676, "grad_norm": 0.63671875, "learning_rate": 9.295432458697765e-05, "loss": 3.2363, "step": 580 }, { "epoch": 0.21497540535616688, "grad_norm": 0.5625, "learning_rate": 9.283284742468417e-05, "loss": 3.2697, "step": 590 }, { "epoch": 0.218619056294407, "grad_norm": 0.56640625, "learning_rate": 9.271137026239067e-05, "loss": 3.3037, "step": 600 }, { "epoch": 0.2222627072326471, "grad_norm": 0.53125, "learning_rate": 9.258989310009719e-05, "loss": 3.2371, "step": 610 }, { "epoch": 0.22590635817088722, "grad_norm": 0.61328125, "learning_rate": 9.24684159378037e-05, "loss": 3.3367, "step": 620 }, { "epoch": 0.22955000910912735, "grad_norm": 0.5703125, "learning_rate": 9.234693877551021e-05, "loss": 3.2109, "step": 630 }, { "epoch": 0.23319366004736747, "grad_norm": 0.59375, "learning_rate": 9.222546161321672e-05, "loss": 3.2374, "step": 640 }, { "epoch": 0.2368373109856076, "grad_norm": 0.6875, "learning_rate": 9.210398445092323e-05, "loss": 3.3066, "step": 650 }, { "epoch": 0.24048096192384769, "grad_norm": 0.6484375, "learning_rate": 9.198250728862974e-05, "loss": 3.2635, "step": 660 }, { "epoch": 0.2441246128620878, "grad_norm": 0.60546875, "learning_rate": 9.186103012633626e-05, "loss": 3.26, "step": 670 }, { "epoch": 0.24776826380032793, "grad_norm": 0.65234375, "learning_rate": 9.173955296404276e-05, "loss": 3.2641, "step": 680 }, { "epoch": 0.25141191473856805, "grad_norm": 0.6015625, "learning_rate": 9.161807580174927e-05, "loss": 3.2907, "step": 690 }, { "epoch": 0.25505556567680815, "grad_norm": 0.54296875, "learning_rate": 9.149659863945579e-05, "loss": 3.2567, "step": 700 }, { "epoch": 0.2586992166150483, "grad_norm": 0.62890625, "learning_rate": 9.13751214771623e-05, "loss": 3.2838, "step": 710 }, { "epoch": 0.2623428675532884, "grad_norm": 0.546875, "learning_rate": 9.125364431486881e-05, "loss": 3.2969, "step": 720 }, { "epoch": 0.2659865184915285, "grad_norm": 0.6328125, "learning_rate": 9.113216715257532e-05, "loss": 3.2212, "step": 730 }, { "epoch": 0.26963016942976864, "grad_norm": 0.6328125, "learning_rate": 9.101068999028183e-05, "loss": 3.212, "step": 740 }, { "epoch": 0.27327382036800874, "grad_norm": 0.5859375, "learning_rate": 9.088921282798835e-05, "loss": 3.3488, "step": 750 }, { "epoch": 0.2769174713062489, "grad_norm": 0.546875, "learning_rate": 9.076773566569486e-05, "loss": 3.2143, "step": 760 }, { "epoch": 0.280561122244489, "grad_norm": 0.56640625, "learning_rate": 9.064625850340136e-05, "loss": 3.2518, "step": 770 }, { "epoch": 0.2842047731827291, "grad_norm": 0.578125, "learning_rate": 9.052478134110788e-05, "loss": 3.2638, "step": 780 }, { "epoch": 0.2878484241209692, "grad_norm": 0.58203125, "learning_rate": 9.040330417881439e-05, "loss": 3.2584, "step": 790 }, { "epoch": 0.2914920750592093, "grad_norm": 0.62890625, "learning_rate": 9.02818270165209e-05, "loss": 3.2841, "step": 800 }, { "epoch": 0.29513572599744947, "grad_norm": 0.55078125, "learning_rate": 9.01603498542274e-05, "loss": 3.261, "step": 810 }, { "epoch": 0.29877937693568957, "grad_norm": 0.6171875, "learning_rate": 9.003887269193392e-05, "loss": 3.2954, "step": 820 }, { "epoch": 0.30242302787392966, "grad_norm": 0.54296875, "learning_rate": 8.991739552964044e-05, "loss": 3.2337, "step": 830 }, { "epoch": 0.3060666788121698, "grad_norm": 0.6171875, "learning_rate": 8.979591836734695e-05, "loss": 3.2881, "step": 840 }, { "epoch": 0.3097103297504099, "grad_norm": 0.5546875, "learning_rate": 8.967444120505344e-05, "loss": 3.3519, "step": 850 }, { "epoch": 0.31335398068865, "grad_norm": 0.5859375, "learning_rate": 8.955296404275997e-05, "loss": 3.3147, "step": 860 }, { "epoch": 0.31699763162689015, "grad_norm": 0.62890625, "learning_rate": 8.943148688046648e-05, "loss": 3.2304, "step": 870 }, { "epoch": 0.32064128256513025, "grad_norm": 0.60546875, "learning_rate": 8.931000971817299e-05, "loss": 3.2526, "step": 880 }, { "epoch": 0.3242849335033704, "grad_norm": 0.6640625, "learning_rate": 8.91885325558795e-05, "loss": 3.309, "step": 890 }, { "epoch": 0.3279285844416105, "grad_norm": 0.6484375, "learning_rate": 8.9067055393586e-05, "loss": 3.2513, "step": 900 }, { "epoch": 0.3315722353798506, "grad_norm": 0.5703125, "learning_rate": 8.894557823129253e-05, "loss": 3.2135, "step": 910 }, { "epoch": 0.33521588631809074, "grad_norm": 0.64453125, "learning_rate": 8.882410106899904e-05, "loss": 3.3048, "step": 920 }, { "epoch": 0.33885953725633083, "grad_norm": 0.6015625, "learning_rate": 8.870262390670553e-05, "loss": 3.3047, "step": 930 }, { "epoch": 0.342503188194571, "grad_norm": 0.6015625, "learning_rate": 8.858114674441206e-05, "loss": 3.2616, "step": 940 }, { "epoch": 0.3461468391328111, "grad_norm": 0.5859375, "learning_rate": 8.845966958211857e-05, "loss": 3.2697, "step": 950 }, { "epoch": 0.3497904900710512, "grad_norm": 0.72265625, "learning_rate": 8.833819241982508e-05, "loss": 3.2395, "step": 960 }, { "epoch": 0.3534341410092913, "grad_norm": 0.61328125, "learning_rate": 8.821671525753159e-05, "loss": 3.2137, "step": 970 }, { "epoch": 0.3570777919475314, "grad_norm": 0.625, "learning_rate": 8.80952380952381e-05, "loss": 3.2872, "step": 980 }, { "epoch": 0.36072144288577157, "grad_norm": 0.5859375, "learning_rate": 8.797376093294462e-05, "loss": 3.2682, "step": 990 }, { "epoch": 0.36436509382401167, "grad_norm": 0.5390625, "learning_rate": 8.785228377065113e-05, "loss": 3.204, "step": 1000 }, { "epoch": 0.36800874476225176, "grad_norm": 0.71875, "learning_rate": 8.773080660835762e-05, "loss": 3.2472, "step": 1010 }, { "epoch": 0.3716523957004919, "grad_norm": 0.609375, "learning_rate": 8.760932944606415e-05, "loss": 3.2638, "step": 1020 }, { "epoch": 0.375296046638732, "grad_norm": 0.60546875, "learning_rate": 8.748785228377066e-05, "loss": 3.2803, "step": 1030 }, { "epoch": 0.3789396975769721, "grad_norm": 0.66796875, "learning_rate": 8.736637512147716e-05, "loss": 3.273, "step": 1040 }, { "epoch": 0.38258334851521225, "grad_norm": 0.65625, "learning_rate": 8.724489795918367e-05, "loss": 3.2854, "step": 1050 }, { "epoch": 0.38622699945345235, "grad_norm": 0.640625, "learning_rate": 8.712342079689018e-05, "loss": 3.2373, "step": 1060 }, { "epoch": 0.3898706503916925, "grad_norm": 0.55859375, "learning_rate": 8.700194363459671e-05, "loss": 3.2259, "step": 1070 }, { "epoch": 0.3935143013299326, "grad_norm": 0.5078125, "learning_rate": 8.688046647230322e-05, "loss": 3.2402, "step": 1080 }, { "epoch": 0.3971579522681727, "grad_norm": 0.61328125, "learning_rate": 8.675898931000973e-05, "loss": 3.2379, "step": 1090 }, { "epoch": 0.40080160320641284, "grad_norm": 0.59375, "learning_rate": 8.663751214771624e-05, "loss": 3.2564, "step": 1100 }, { "epoch": 0.40444525414465293, "grad_norm": 0.69921875, "learning_rate": 8.651603498542274e-05, "loss": 3.2342, "step": 1110 }, { "epoch": 0.4080889050828931, "grad_norm": 0.53125, "learning_rate": 8.639455782312925e-05, "loss": 3.3336, "step": 1120 }, { "epoch": 0.4117325560211332, "grad_norm": 0.63671875, "learning_rate": 8.627308066083576e-05, "loss": 3.2684, "step": 1130 }, { "epoch": 0.4153762069593733, "grad_norm": 0.61328125, "learning_rate": 8.615160349854227e-05, "loss": 3.2581, "step": 1140 }, { "epoch": 0.4190198578976134, "grad_norm": 0.50390625, "learning_rate": 8.603012633624878e-05, "loss": 3.3428, "step": 1150 }, { "epoch": 0.4226635088358535, "grad_norm": 0.58203125, "learning_rate": 8.59086491739553e-05, "loss": 3.2331, "step": 1160 }, { "epoch": 0.42630715977409367, "grad_norm": 0.63671875, "learning_rate": 8.578717201166182e-05, "loss": 3.2203, "step": 1170 }, { "epoch": 0.42995081071233376, "grad_norm": 0.57421875, "learning_rate": 8.566569484936832e-05, "loss": 3.248, "step": 1180 }, { "epoch": 0.43359446165057386, "grad_norm": 0.6015625, "learning_rate": 8.554421768707483e-05, "loss": 3.3052, "step": 1190 }, { "epoch": 0.437238112588814, "grad_norm": 0.5546875, "learning_rate": 8.542274052478134e-05, "loss": 3.2036, "step": 1200 }, { "epoch": 0.4408817635270541, "grad_norm": 0.64453125, "learning_rate": 8.530126336248787e-05, "loss": 3.2199, "step": 1210 }, { "epoch": 0.4445254144652942, "grad_norm": 0.68359375, "learning_rate": 8.517978620019436e-05, "loss": 3.2594, "step": 1220 }, { "epoch": 0.44816906540353435, "grad_norm": 0.6953125, "learning_rate": 8.505830903790087e-05, "loss": 3.26, "step": 1230 }, { "epoch": 0.45181271634177445, "grad_norm": 0.66015625, "learning_rate": 8.49368318756074e-05, "loss": 3.3623, "step": 1240 }, { "epoch": 0.4554563672800146, "grad_norm": 0.7421875, "learning_rate": 8.48153547133139e-05, "loss": 3.2625, "step": 1250 }, { "epoch": 0.4591000182182547, "grad_norm": 0.6875, "learning_rate": 8.469387755102041e-05, "loss": 3.2738, "step": 1260 }, { "epoch": 0.4627436691564948, "grad_norm": 0.61328125, "learning_rate": 8.457240038872692e-05, "loss": 3.2688, "step": 1270 }, { "epoch": 0.46638732009473494, "grad_norm": 0.609375, "learning_rate": 8.445092322643343e-05, "loss": 3.2392, "step": 1280 }, { "epoch": 0.47003097103297503, "grad_norm": 0.56640625, "learning_rate": 8.432944606413996e-05, "loss": 3.2414, "step": 1290 }, { "epoch": 0.4736746219712152, "grad_norm": 0.640625, "learning_rate": 8.420796890184645e-05, "loss": 3.2461, "step": 1300 }, { "epoch": 0.4773182729094553, "grad_norm": 0.578125, "learning_rate": 8.408649173955296e-05, "loss": 3.3459, "step": 1310 }, { "epoch": 0.48096192384769537, "grad_norm": 0.6953125, "learning_rate": 8.396501457725948e-05, "loss": 3.2631, "step": 1320 }, { "epoch": 0.4846055747859355, "grad_norm": 0.59765625, "learning_rate": 8.3843537414966e-05, "loss": 3.2883, "step": 1330 }, { "epoch": 0.4882492257241756, "grad_norm": 0.625, "learning_rate": 8.372206025267249e-05, "loss": 3.2085, "step": 1340 }, { "epoch": 0.49189287666241577, "grad_norm": 0.6640625, "learning_rate": 8.360058309037901e-05, "loss": 3.3132, "step": 1350 }, { "epoch": 0.49553652760065586, "grad_norm": 0.61328125, "learning_rate": 8.347910592808552e-05, "loss": 3.3076, "step": 1360 }, { "epoch": 0.49918017853889596, "grad_norm": 0.7265625, "learning_rate": 8.335762876579204e-05, "loss": 3.3183, "step": 1370 }, { "epoch": 0.5028238294771361, "grad_norm": 0.55859375, "learning_rate": 8.323615160349854e-05, "loss": 3.1761, "step": 1380 }, { "epoch": 0.5064674804153763, "grad_norm": 0.60546875, "learning_rate": 8.311467444120505e-05, "loss": 3.2079, "step": 1390 }, { "epoch": 0.5101111313536163, "grad_norm": 0.703125, "learning_rate": 8.299319727891157e-05, "loss": 3.2844, "step": 1400 }, { "epoch": 0.5137547822918564, "grad_norm": 0.578125, "learning_rate": 8.287172011661808e-05, "loss": 3.2492, "step": 1410 }, { "epoch": 0.5173984332300966, "grad_norm": 0.6328125, "learning_rate": 8.275024295432459e-05, "loss": 3.2525, "step": 1420 }, { "epoch": 0.5210420841683366, "grad_norm": 0.5703125, "learning_rate": 8.26287657920311e-05, "loss": 3.2449, "step": 1430 }, { "epoch": 0.5246857351065768, "grad_norm": 0.54296875, "learning_rate": 8.250728862973761e-05, "loss": 3.2279, "step": 1440 }, { "epoch": 0.5283293860448169, "grad_norm": 0.5859375, "learning_rate": 8.238581146744413e-05, "loss": 3.2751, "step": 1450 }, { "epoch": 0.531973036983057, "grad_norm": 0.57421875, "learning_rate": 8.226433430515063e-05, "loss": 3.2404, "step": 1460 }, { "epoch": 0.5356166879212971, "grad_norm": 0.67578125, "learning_rate": 8.214285714285714e-05, "loss": 3.2911, "step": 1470 }, { "epoch": 0.5392603388595373, "grad_norm": 0.6796875, "learning_rate": 8.202137998056366e-05, "loss": 3.2637, "step": 1480 }, { "epoch": 0.5429039897977773, "grad_norm": 0.61328125, "learning_rate": 8.189990281827017e-05, "loss": 3.2004, "step": 1490 }, { "epoch": 0.5465476407360175, "grad_norm": 0.6875, "learning_rate": 8.177842565597668e-05, "loss": 3.2958, "step": 1500 }, { "epoch": 0.5501912916742576, "grad_norm": 0.609375, "learning_rate": 8.165694849368319e-05, "loss": 3.2371, "step": 1510 }, { "epoch": 0.5538349426124978, "grad_norm": 0.6171875, "learning_rate": 8.15354713313897e-05, "loss": 3.2798, "step": 1520 }, { "epoch": 0.5574785935507378, "grad_norm": 0.6953125, "learning_rate": 8.141399416909622e-05, "loss": 3.2608, "step": 1530 }, { "epoch": 0.561122244488978, "grad_norm": 0.62109375, "learning_rate": 8.129251700680273e-05, "loss": 3.2374, "step": 1540 }, { "epoch": 0.5647658954272181, "grad_norm": 0.625, "learning_rate": 8.117103984450923e-05, "loss": 3.189, "step": 1550 }, { "epoch": 0.5684095463654582, "grad_norm": 0.57421875, "learning_rate": 8.104956268221575e-05, "loss": 3.2008, "step": 1560 }, { "epoch": 0.5720531973036983, "grad_norm": 0.58984375, "learning_rate": 8.092808551992226e-05, "loss": 3.219, "step": 1570 }, { "epoch": 0.5756968482419385, "grad_norm": 0.58203125, "learning_rate": 8.080660835762877e-05, "loss": 3.2417, "step": 1580 }, { "epoch": 0.5793404991801785, "grad_norm": 0.63671875, "learning_rate": 8.068513119533528e-05, "loss": 3.236, "step": 1590 }, { "epoch": 0.5829841501184186, "grad_norm": 0.703125, "learning_rate": 8.056365403304179e-05, "loss": 3.3037, "step": 1600 }, { "epoch": 0.5866278010566588, "grad_norm": 0.703125, "learning_rate": 8.04421768707483e-05, "loss": 3.2412, "step": 1610 }, { "epoch": 0.5902714519948989, "grad_norm": 0.66796875, "learning_rate": 8.032069970845482e-05, "loss": 3.2293, "step": 1620 }, { "epoch": 0.593915102933139, "grad_norm": 0.6640625, "learning_rate": 8.019922254616132e-05, "loss": 3.2208, "step": 1630 }, { "epoch": 0.5975587538713791, "grad_norm": 0.671875, "learning_rate": 8.007774538386784e-05, "loss": 3.2251, "step": 1640 }, { "epoch": 0.6012024048096193, "grad_norm": 0.63671875, "learning_rate": 7.995626822157435e-05, "loss": 3.284, "step": 1650 }, { "epoch": 0.6048460557478593, "grad_norm": 0.6484375, "learning_rate": 7.983479105928086e-05, "loss": 3.2404, "step": 1660 }, { "epoch": 0.6084897066860995, "grad_norm": 0.69140625, "learning_rate": 7.971331389698737e-05, "loss": 3.3335, "step": 1670 }, { "epoch": 0.6121333576243396, "grad_norm": 0.59765625, "learning_rate": 7.959183673469388e-05, "loss": 3.276, "step": 1680 }, { "epoch": 0.6157770085625797, "grad_norm": 0.63671875, "learning_rate": 7.947035957240039e-05, "loss": 3.2263, "step": 1690 }, { "epoch": 0.6194206595008198, "grad_norm": 0.546875, "learning_rate": 7.934888241010691e-05, "loss": 3.1878, "step": 1700 }, { "epoch": 0.62306431043906, "grad_norm": 0.625, "learning_rate": 7.922740524781341e-05, "loss": 3.294, "step": 1710 }, { "epoch": 0.6267079613773, "grad_norm": 0.578125, "learning_rate": 7.910592808551993e-05, "loss": 3.2183, "step": 1720 }, { "epoch": 0.6303516123155402, "grad_norm": 0.69140625, "learning_rate": 7.898445092322644e-05, "loss": 3.1985, "step": 1730 }, { "epoch": 0.6339952632537803, "grad_norm": 0.74609375, "learning_rate": 7.886297376093295e-05, "loss": 3.1563, "step": 1740 }, { "epoch": 0.6376389141920205, "grad_norm": 0.6484375, "learning_rate": 7.874149659863946e-05, "loss": 3.2806, "step": 1750 }, { "epoch": 0.6412825651302605, "grad_norm": 0.6328125, "learning_rate": 7.862001943634597e-05, "loss": 3.2288, "step": 1760 }, { "epoch": 0.6449262160685006, "grad_norm": 0.5859375, "learning_rate": 7.849854227405248e-05, "loss": 3.2785, "step": 1770 }, { "epoch": 0.6485698670067408, "grad_norm": 0.6875, "learning_rate": 7.8377065111759e-05, "loss": 3.2952, "step": 1780 }, { "epoch": 0.6522135179449808, "grad_norm": 0.6796875, "learning_rate": 7.82555879494655e-05, "loss": 3.1665, "step": 1790 }, { "epoch": 0.655857168883221, "grad_norm": 0.6796875, "learning_rate": 7.8134110787172e-05, "loss": 3.1984, "step": 1800 }, { "epoch": 0.6595008198214611, "grad_norm": 0.625, "learning_rate": 7.801263362487853e-05, "loss": 3.2051, "step": 1810 }, { "epoch": 0.6631444707597012, "grad_norm": 0.6640625, "learning_rate": 7.789115646258504e-05, "loss": 3.2141, "step": 1820 }, { "epoch": 0.6667881216979413, "grad_norm": 0.59375, "learning_rate": 7.776967930029155e-05, "loss": 3.312, "step": 1830 }, { "epoch": 0.6704317726361815, "grad_norm": 0.65234375, "learning_rate": 7.764820213799806e-05, "loss": 3.2473, "step": 1840 }, { "epoch": 0.6740754235744215, "grad_norm": 0.61328125, "learning_rate": 7.752672497570457e-05, "loss": 3.2924, "step": 1850 }, { "epoch": 0.6777190745126617, "grad_norm": 0.71484375, "learning_rate": 7.740524781341109e-05, "loss": 3.2799, "step": 1860 }, { "epoch": 0.6813627254509018, "grad_norm": 0.55078125, "learning_rate": 7.72837706511176e-05, "loss": 3.2251, "step": 1870 }, { "epoch": 0.685006376389142, "grad_norm": 0.70703125, "learning_rate": 7.71622934888241e-05, "loss": 3.209, "step": 1880 }, { "epoch": 0.688650027327382, "grad_norm": 0.63671875, "learning_rate": 7.704081632653062e-05, "loss": 3.2312, "step": 1890 }, { "epoch": 0.6922936782656222, "grad_norm": 0.6328125, "learning_rate": 7.691933916423713e-05, "loss": 3.2487, "step": 1900 }, { "epoch": 0.6959373292038623, "grad_norm": 0.5703125, "learning_rate": 7.679786200194364e-05, "loss": 3.3157, "step": 1910 }, { "epoch": 0.6995809801421023, "grad_norm": 0.63671875, "learning_rate": 7.667638483965015e-05, "loss": 3.299, "step": 1920 }, { "epoch": 0.7032246310803425, "grad_norm": 0.69140625, "learning_rate": 7.655490767735666e-05, "loss": 3.2755, "step": 1930 }, { "epoch": 0.7068682820185826, "grad_norm": 0.625, "learning_rate": 7.643343051506318e-05, "loss": 3.317, "step": 1940 }, { "epoch": 0.7105119329568227, "grad_norm": 0.55078125, "learning_rate": 7.631195335276969e-05, "loss": 3.1871, "step": 1950 }, { "epoch": 0.7141555838950628, "grad_norm": 0.74609375, "learning_rate": 7.619047619047618e-05, "loss": 3.2405, "step": 1960 }, { "epoch": 0.717799234833303, "grad_norm": 0.69921875, "learning_rate": 7.606899902818271e-05, "loss": 3.3068, "step": 1970 }, { "epoch": 0.7214428857715431, "grad_norm": 0.578125, "learning_rate": 7.594752186588922e-05, "loss": 3.335, "step": 1980 }, { "epoch": 0.7250865367097832, "grad_norm": 0.6484375, "learning_rate": 7.582604470359573e-05, "loss": 3.2617, "step": 1990 }, { "epoch": 0.7287301876480233, "grad_norm": 0.5234375, "learning_rate": 7.570456754130224e-05, "loss": 3.2335, "step": 2000 }, { "epoch": 0.7323738385862635, "grad_norm": 0.640625, "learning_rate": 7.558309037900875e-05, "loss": 3.2604, "step": 2010 }, { "epoch": 0.7360174895245035, "grad_norm": 0.57421875, "learning_rate": 7.546161321671527e-05, "loss": 3.2632, "step": 2020 }, { "epoch": 0.7396611404627437, "grad_norm": 0.61328125, "learning_rate": 7.534013605442178e-05, "loss": 3.2184, "step": 2030 }, { "epoch": 0.7433047914009838, "grad_norm": 0.6171875, "learning_rate": 7.521865889212827e-05, "loss": 3.2848, "step": 2040 }, { "epoch": 0.7469484423392239, "grad_norm": 0.6484375, "learning_rate": 7.50971817298348e-05, "loss": 3.2473, "step": 2050 }, { "epoch": 0.750592093277464, "grad_norm": 0.6953125, "learning_rate": 7.49757045675413e-05, "loss": 3.195, "step": 2060 }, { "epoch": 0.7542357442157042, "grad_norm": 0.73046875, "learning_rate": 7.485422740524782e-05, "loss": 3.2248, "step": 2070 }, { "epoch": 0.7578793951539442, "grad_norm": 0.5390625, "learning_rate": 7.473275024295433e-05, "loss": 3.1511, "step": 2080 }, { "epoch": 0.7615230460921844, "grad_norm": 0.66796875, "learning_rate": 7.461127308066083e-05, "loss": 3.2719, "step": 2090 }, { "epoch": 0.7651666970304245, "grad_norm": 0.57421875, "learning_rate": 7.448979591836736e-05, "loss": 3.2339, "step": 2100 }, { "epoch": 0.7688103479686647, "grad_norm": 0.61328125, "learning_rate": 7.436831875607387e-05, "loss": 3.2863, "step": 2110 }, { "epoch": 0.7724539989069047, "grad_norm": 0.55859375, "learning_rate": 7.424684159378036e-05, "loss": 3.2057, "step": 2120 }, { "epoch": 0.7760976498451448, "grad_norm": 0.73046875, "learning_rate": 7.412536443148689e-05, "loss": 3.2397, "step": 2130 }, { "epoch": 0.779741300783385, "grad_norm": 0.59375, "learning_rate": 7.40038872691934e-05, "loss": 3.2323, "step": 2140 }, { "epoch": 0.783384951721625, "grad_norm": 0.63671875, "learning_rate": 7.38824101068999e-05, "loss": 3.2764, "step": 2150 }, { "epoch": 0.7870286026598652, "grad_norm": 0.60546875, "learning_rate": 7.376093294460641e-05, "loss": 3.2668, "step": 2160 }, { "epoch": 0.7906722535981053, "grad_norm": 0.63671875, "learning_rate": 7.363945578231292e-05, "loss": 3.2953, "step": 2170 }, { "epoch": 0.7943159045363454, "grad_norm": 0.5625, "learning_rate": 7.351797862001945e-05, "loss": 3.1915, "step": 2180 }, { "epoch": 0.7979595554745855, "grad_norm": 0.66015625, "learning_rate": 7.339650145772596e-05, "loss": 3.2622, "step": 2190 }, { "epoch": 0.8016032064128257, "grad_norm": 0.6171875, "learning_rate": 7.327502429543247e-05, "loss": 3.2522, "step": 2200 }, { "epoch": 0.8052468573510657, "grad_norm": 0.64453125, "learning_rate": 7.315354713313898e-05, "loss": 3.1673, "step": 2210 }, { "epoch": 0.8088905082893059, "grad_norm": 0.625, "learning_rate": 7.303206997084548e-05, "loss": 3.2722, "step": 2220 }, { "epoch": 0.812534159227546, "grad_norm": 0.6640625, "learning_rate": 7.2910592808552e-05, "loss": 3.2377, "step": 2230 }, { "epoch": 0.8161778101657862, "grad_norm": 0.6171875, "learning_rate": 7.27891156462585e-05, "loss": 3.179, "step": 2240 }, { "epoch": 0.8198214611040262, "grad_norm": 0.57421875, "learning_rate": 7.266763848396501e-05, "loss": 3.2588, "step": 2250 }, { "epoch": 0.8234651120422664, "grad_norm": 0.578125, "learning_rate": 7.254616132167152e-05, "loss": 3.2664, "step": 2260 }, { "epoch": 0.8271087629805065, "grad_norm": 0.73046875, "learning_rate": 7.242468415937805e-05, "loss": 3.2515, "step": 2270 }, { "epoch": 0.8307524139187465, "grad_norm": 0.6328125, "learning_rate": 7.230320699708455e-05, "loss": 3.2102, "step": 2280 }, { "epoch": 0.8343960648569867, "grad_norm": 0.6484375, "learning_rate": 7.218172983479106e-05, "loss": 3.246, "step": 2290 }, { "epoch": 0.8380397157952268, "grad_norm": 0.58203125, "learning_rate": 7.206025267249757e-05, "loss": 3.3321, "step": 2300 }, { "epoch": 0.8416833667334669, "grad_norm": 0.59765625, "learning_rate": 7.193877551020408e-05, "loss": 3.0889, "step": 2310 }, { "epoch": 0.845327017671707, "grad_norm": 0.66015625, "learning_rate": 7.18172983479106e-05, "loss": 3.2811, "step": 2320 }, { "epoch": 0.8489706686099472, "grad_norm": 0.65234375, "learning_rate": 7.16958211856171e-05, "loss": 3.1688, "step": 2330 }, { "epoch": 0.8526143195481873, "grad_norm": 0.76171875, "learning_rate": 7.157434402332361e-05, "loss": 3.2495, "step": 2340 }, { "epoch": 0.8562579704864274, "grad_norm": 0.6484375, "learning_rate": 7.145286686103013e-05, "loss": 3.1742, "step": 2350 }, { "epoch": 0.8599016214246675, "grad_norm": 0.5859375, "learning_rate": 7.133138969873664e-05, "loss": 3.2293, "step": 2360 }, { "epoch": 0.8635452723629077, "grad_norm": 0.640625, "learning_rate": 7.120991253644315e-05, "loss": 3.2574, "step": 2370 }, { "epoch": 0.8671889233011477, "grad_norm": 0.55078125, "learning_rate": 7.108843537414966e-05, "loss": 3.2496, "step": 2380 }, { "epoch": 0.8708325742393879, "grad_norm": 0.7109375, "learning_rate": 7.096695821185617e-05, "loss": 3.2527, "step": 2390 }, { "epoch": 0.874476225177628, "grad_norm": 0.6640625, "learning_rate": 7.08454810495627e-05, "loss": 3.1984, "step": 2400 }, { "epoch": 0.8781198761158681, "grad_norm": 0.58984375, "learning_rate": 7.072400388726919e-05, "loss": 3.2517, "step": 2410 }, { "epoch": 0.8817635270541082, "grad_norm": 0.6171875, "learning_rate": 7.06025267249757e-05, "loss": 3.2105, "step": 2420 }, { "epoch": 0.8854071779923484, "grad_norm": 0.62890625, "learning_rate": 7.048104956268222e-05, "loss": 3.2125, "step": 2430 }, { "epoch": 0.8890508289305884, "grad_norm": 0.72265625, "learning_rate": 7.035957240038873e-05, "loss": 3.255, "step": 2440 }, { "epoch": 0.8926944798688285, "grad_norm": 0.671875, "learning_rate": 7.023809523809524e-05, "loss": 3.3331, "step": 2450 }, { "epoch": 0.8963381308070687, "grad_norm": 0.65234375, "learning_rate": 7.011661807580175e-05, "loss": 3.3545, "step": 2460 }, { "epoch": 0.8999817817453089, "grad_norm": 0.62890625, "learning_rate": 6.999514091350826e-05, "loss": 3.2776, "step": 2470 }, { "epoch": 0.9036254326835489, "grad_norm": 0.76953125, "learning_rate": 6.987366375121478e-05, "loss": 3.2331, "step": 2480 }, { "epoch": 0.907269083621789, "grad_norm": 0.78515625, "learning_rate": 6.975218658892128e-05, "loss": 3.2803, "step": 2490 }, { "epoch": 0.9109127345600292, "grad_norm": 0.671875, "learning_rate": 6.963070942662779e-05, "loss": 3.256, "step": 2500 }, { "epoch": 0.9145563854982692, "grad_norm": 0.59765625, "learning_rate": 6.950923226433431e-05, "loss": 3.2896, "step": 2510 }, { "epoch": 0.9182000364365094, "grad_norm": 0.62890625, "learning_rate": 6.938775510204082e-05, "loss": 3.2555, "step": 2520 }, { "epoch": 0.9218436873747495, "grad_norm": 0.7421875, "learning_rate": 6.926627793974733e-05, "loss": 3.2682, "step": 2530 }, { "epoch": 0.9254873383129896, "grad_norm": 0.671875, "learning_rate": 6.914480077745384e-05, "loss": 3.1564, "step": 2540 }, { "epoch": 0.9291309892512297, "grad_norm": 0.6484375, "learning_rate": 6.902332361516035e-05, "loss": 3.1445, "step": 2550 }, { "epoch": 0.9327746401894699, "grad_norm": 0.51953125, "learning_rate": 6.890184645286687e-05, "loss": 3.2515, "step": 2560 }, { "epoch": 0.9364182911277099, "grad_norm": 0.65625, "learning_rate": 6.878036929057337e-05, "loss": 3.1962, "step": 2570 }, { "epoch": 0.9400619420659501, "grad_norm": 0.59375, "learning_rate": 6.865889212827988e-05, "loss": 3.3199, "step": 2580 }, { "epoch": 0.9437055930041902, "grad_norm": 0.65234375, "learning_rate": 6.85374149659864e-05, "loss": 3.264, "step": 2590 }, { "epoch": 0.9473492439424304, "grad_norm": 0.63671875, "learning_rate": 6.841593780369291e-05, "loss": 3.1853, "step": 2600 }, { "epoch": 0.9509928948806704, "grad_norm": 0.72265625, "learning_rate": 6.829446064139942e-05, "loss": 3.3017, "step": 2610 }, { "epoch": 0.9546365458189106, "grad_norm": 0.6953125, "learning_rate": 6.817298347910593e-05, "loss": 3.2358, "step": 2620 }, { "epoch": 0.9582801967571507, "grad_norm": 0.6328125, "learning_rate": 6.805150631681244e-05, "loss": 3.2854, "step": 2630 }, { "epoch": 0.9619238476953907, "grad_norm": 0.5859375, "learning_rate": 6.793002915451895e-05, "loss": 3.1873, "step": 2640 }, { "epoch": 0.9655674986336309, "grad_norm": 0.59375, "learning_rate": 6.780855199222547e-05, "loss": 3.2274, "step": 2650 }, { "epoch": 0.969211149571871, "grad_norm": 0.63671875, "learning_rate": 6.768707482993197e-05, "loss": 3.2037, "step": 2660 }, { "epoch": 0.9728548005101111, "grad_norm": 0.5703125, "learning_rate": 6.756559766763849e-05, "loss": 3.3132, "step": 2670 }, { "epoch": 0.9764984514483512, "grad_norm": 0.72265625, "learning_rate": 6.7444120505345e-05, "loss": 3.2734, "step": 2680 }, { "epoch": 0.9801421023865914, "grad_norm": 0.70703125, "learning_rate": 6.732264334305151e-05, "loss": 3.1784, "step": 2690 }, { "epoch": 0.9837857533248315, "grad_norm": 0.57421875, "learning_rate": 6.720116618075802e-05, "loss": 3.2181, "step": 2700 }, { "epoch": 0.9874294042630716, "grad_norm": 0.6953125, "learning_rate": 6.707968901846453e-05, "loss": 3.2676, "step": 2710 }, { "epoch": 0.9910730552013117, "grad_norm": 0.6875, "learning_rate": 6.695821185617104e-05, "loss": 3.1952, "step": 2720 }, { "epoch": 0.9947167061395519, "grad_norm": 0.609375, "learning_rate": 6.683673469387756e-05, "loss": 3.3135, "step": 2730 }, { "epoch": 0.9983603570777919, "grad_norm": 0.6484375, "learning_rate": 6.671525753158406e-05, "loss": 3.2643, "step": 2740 }, { "epoch": 1.002004008016032, "grad_norm": 0.6015625, "learning_rate": 6.659378036929058e-05, "loss": 3.1996, "step": 2750 }, { "epoch": 1.0056476589542722, "grad_norm": 0.75, "learning_rate": 6.647230320699709e-05, "loss": 3.0862, "step": 2760 }, { "epoch": 1.0092913098925123, "grad_norm": 0.671875, "learning_rate": 6.63508260447036e-05, "loss": 3.1886, "step": 2770 }, { "epoch": 1.0129349608307525, "grad_norm": 0.65625, "learning_rate": 6.622934888241011e-05, "loss": 3.1478, "step": 2780 }, { "epoch": 1.0165786117689926, "grad_norm": 0.69921875, "learning_rate": 6.610787172011662e-05, "loss": 3.1577, "step": 2790 }, { "epoch": 1.0202222627072326, "grad_norm": 0.77734375, "learning_rate": 6.598639455782313e-05, "loss": 3.148, "step": 2800 }, { "epoch": 1.0238659136454729, "grad_norm": 0.640625, "learning_rate": 6.586491739552965e-05, "loss": 3.1971, "step": 2810 }, { "epoch": 1.027509564583713, "grad_norm": 0.58984375, "learning_rate": 6.574344023323615e-05, "loss": 3.1351, "step": 2820 }, { "epoch": 1.031153215521953, "grad_norm": 0.734375, "learning_rate": 6.562196307094267e-05, "loss": 3.2304, "step": 2830 }, { "epoch": 1.0347968664601932, "grad_norm": 0.71484375, "learning_rate": 6.550048590864918e-05, "loss": 3.1582, "step": 2840 }, { "epoch": 1.0384405173984332, "grad_norm": 0.71875, "learning_rate": 6.537900874635569e-05, "loss": 3.1183, "step": 2850 }, { "epoch": 1.0420841683366733, "grad_norm": 0.8046875, "learning_rate": 6.52575315840622e-05, "loss": 3.2056, "step": 2860 }, { "epoch": 1.0457278192749135, "grad_norm": 0.765625, "learning_rate": 6.513605442176871e-05, "loss": 3.1694, "step": 2870 }, { "epoch": 1.0493714702131536, "grad_norm": 0.890625, "learning_rate": 6.501457725947522e-05, "loss": 3.1428, "step": 2880 }, { "epoch": 1.0530151211513936, "grad_norm": 0.65625, "learning_rate": 6.489310009718174e-05, "loss": 3.1052, "step": 2890 }, { "epoch": 1.0566587720896339, "grad_norm": 0.83203125, "learning_rate": 6.477162293488824e-05, "loss": 3.1195, "step": 2900 }, { "epoch": 1.060302423027874, "grad_norm": 0.7421875, "learning_rate": 6.465014577259475e-05, "loss": 3.2278, "step": 2910 }, { "epoch": 1.063946073966114, "grad_norm": 0.71875, "learning_rate": 6.452866861030127e-05, "loss": 3.1563, "step": 2920 }, { "epoch": 1.0675897249043542, "grad_norm": 0.69140625, "learning_rate": 6.440719144800778e-05, "loss": 3.1505, "step": 2930 }, { "epoch": 1.0712333758425943, "grad_norm": 0.8515625, "learning_rate": 6.428571428571429e-05, "loss": 3.1681, "step": 2940 }, { "epoch": 1.0748770267808343, "grad_norm": 0.71484375, "learning_rate": 6.41642371234208e-05, "loss": 3.17, "step": 2950 }, { "epoch": 1.0785206777190746, "grad_norm": 0.90625, "learning_rate": 6.40427599611273e-05, "loss": 3.1775, "step": 2960 }, { "epoch": 1.0821643286573146, "grad_norm": 0.73828125, "learning_rate": 6.392128279883383e-05, "loss": 3.0921, "step": 2970 }, { "epoch": 1.0858079795955549, "grad_norm": 0.75390625, "learning_rate": 6.379980563654034e-05, "loss": 3.1666, "step": 2980 }, { "epoch": 1.089451630533795, "grad_norm": 0.80859375, "learning_rate": 6.367832847424684e-05, "loss": 3.1935, "step": 2990 }, { "epoch": 1.093095281472035, "grad_norm": 0.67578125, "learning_rate": 6.355685131195336e-05, "loss": 3.0588, "step": 3000 } ], "logging_steps": 10, "max_steps": 8232, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.570661967366676e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }