|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.093095281472035, |
|
"eval_steps": 500, |
|
"global_step": 3000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0036436509382401167, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 9.987852283770651e-05, |
|
"loss": 3.4902, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.007287301876480233, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 9.975704567541302e-05, |
|
"loss": 3.3432, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.01093095281472035, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 9.963556851311953e-05, |
|
"loss": 3.2381, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.014574603752960467, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 9.951409135082604e-05, |
|
"loss": 3.2931, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.018218254691200583, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 9.939261418853257e-05, |
|
"loss": 3.3235, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.0218619056294407, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 9.927113702623908e-05, |
|
"loss": 3.2988, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.025505556567680818, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 9.914965986394558e-05, |
|
"loss": 3.2927, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.029149207505920934, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 9.90281827016521e-05, |
|
"loss": 3.275, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.03279285844416105, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 9.89067055393586e-05, |
|
"loss": 3.316, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.036436509382401165, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 9.878522837706513e-05, |
|
"loss": 3.2611, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.04008016032064128, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 9.866375121477162e-05, |
|
"loss": 3.268, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.0437238112588814, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 9.854227405247813e-05, |
|
"loss": 3.3032, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.04736746219712151, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 9.842079689018465e-05, |
|
"loss": 3.3334, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.051011113135361635, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 9.829931972789116e-05, |
|
"loss": 3.1943, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.05465476407360175, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 9.817784256559767e-05, |
|
"loss": 3.2574, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.05829841501184187, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 9.805636540330418e-05, |
|
"loss": 3.3747, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.06194206595008198, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 9.793488824101069e-05, |
|
"loss": 3.2992, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.0655857168883221, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 9.781341107871722e-05, |
|
"loss": 3.2342, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.06922936782656222, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 9.769193391642371e-05, |
|
"loss": 3.356, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.07287301876480233, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 9.757045675413022e-05, |
|
"loss": 3.3618, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.07651666970304245, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 9.744897959183674e-05, |
|
"loss": 3.2931, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.08016032064128256, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 9.732750242954325e-05, |
|
"loss": 3.3246, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.08380397157952268, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 9.720602526724975e-05, |
|
"loss": 3.3181, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.0874476225177628, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 9.708454810495627e-05, |
|
"loss": 3.2757, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.09109127345600292, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 9.696307094266278e-05, |
|
"loss": 3.2753, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.09473492439424303, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 9.68415937803693e-05, |
|
"loss": 3.3207, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.09837857533248315, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 9.67201166180758e-05, |
|
"loss": 3.3035, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.10202222627072327, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 9.659863945578231e-05, |
|
"loss": 3.3025, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.10566587720896338, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 9.647716229348883e-05, |
|
"loss": 3.2066, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.1093095281472035, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 9.635568513119534e-05, |
|
"loss": 3.2757, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.11295317908544361, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 9.623420796890185e-05, |
|
"loss": 3.1904, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.11659683002368373, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 9.611273080660836e-05, |
|
"loss": 3.1947, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.12024048096192384, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 9.599125364431487e-05, |
|
"loss": 3.2016, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.12388413190016397, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 9.58697764820214e-05, |
|
"loss": 3.329, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.12752778283840407, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 9.574829931972789e-05, |
|
"loss": 3.2483, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.1311714337766442, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 9.56268221574344e-05, |
|
"loss": 3.2388, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.13481508471488432, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 9.550534499514092e-05, |
|
"loss": 3.2722, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.13845873565312444, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 9.538386783284743e-05, |
|
"loss": 3.2672, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.14210238659136454, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 9.526239067055394e-05, |
|
"loss": 3.3378, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.14574603752960466, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 9.514091350826045e-05, |
|
"loss": 3.2637, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.14938968846784478, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 9.501943634596696e-05, |
|
"loss": 3.2879, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.1530333394060849, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 9.489795918367348e-05, |
|
"loss": 3.2614, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.156676990344325, |
|
"grad_norm": 0.625, |
|
"learning_rate": 9.477648202137999e-05, |
|
"loss": 3.2469, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.16032064128256512, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 9.465500485908649e-05, |
|
"loss": 3.1614, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.16396429222080525, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 9.453352769679301e-05, |
|
"loss": 3.2658, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.16760794315904537, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 9.441205053449952e-05, |
|
"loss": 3.3253, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.1712515940972855, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 9.429057337220603e-05, |
|
"loss": 3.2311, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.1748952450355256, |
|
"grad_norm": 0.625, |
|
"learning_rate": 9.416909620991254e-05, |
|
"loss": 3.3117, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.1785388959737657, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 9.404761904761905e-05, |
|
"loss": 3.3513, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.18218254691200583, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 9.392614188532556e-05, |
|
"loss": 3.3071, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.18582619785024596, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 9.380466472303208e-05, |
|
"loss": 3.3047, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.18946984878848605, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 9.368318756073858e-05, |
|
"loss": 3.1964, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.19311349972672617, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 9.35617103984451e-05, |
|
"loss": 3.2459, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.1967571506649663, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 9.344023323615161e-05, |
|
"loss": 3.205, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.20040080160320642, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 9.331875607385812e-05, |
|
"loss": 3.2856, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.20404445254144654, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 9.319727891156463e-05, |
|
"loss": 3.185, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.20768810347968664, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 9.307580174927114e-05, |
|
"loss": 3.3071, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.21133175441792676, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 9.295432458697765e-05, |
|
"loss": 3.2363, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.21497540535616688, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 9.283284742468417e-05, |
|
"loss": 3.2697, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.218619056294407, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 9.271137026239067e-05, |
|
"loss": 3.3037, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.2222627072326471, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 9.258989310009719e-05, |
|
"loss": 3.2371, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.22590635817088722, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 9.24684159378037e-05, |
|
"loss": 3.3367, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.22955000910912735, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 9.234693877551021e-05, |
|
"loss": 3.2109, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.23319366004736747, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 9.222546161321672e-05, |
|
"loss": 3.2374, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.2368373109856076, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 9.210398445092323e-05, |
|
"loss": 3.3066, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.24048096192384769, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 9.198250728862974e-05, |
|
"loss": 3.2635, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.2441246128620878, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 9.186103012633626e-05, |
|
"loss": 3.26, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.24776826380032793, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 9.173955296404276e-05, |
|
"loss": 3.2641, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.25141191473856805, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 9.161807580174927e-05, |
|
"loss": 3.2907, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.25505556567680815, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 9.149659863945579e-05, |
|
"loss": 3.2567, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.2586992166150483, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 9.13751214771623e-05, |
|
"loss": 3.2838, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.2623428675532884, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 9.125364431486881e-05, |
|
"loss": 3.2969, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.2659865184915285, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 9.113216715257532e-05, |
|
"loss": 3.2212, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.26963016942976864, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 9.101068999028183e-05, |
|
"loss": 3.212, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.27327382036800874, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 9.088921282798835e-05, |
|
"loss": 3.3488, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.2769174713062489, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 9.076773566569486e-05, |
|
"loss": 3.2143, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.280561122244489, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 9.064625850340136e-05, |
|
"loss": 3.2518, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.2842047731827291, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 9.052478134110788e-05, |
|
"loss": 3.2638, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.2878484241209692, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 9.040330417881439e-05, |
|
"loss": 3.2584, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.2914920750592093, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 9.02818270165209e-05, |
|
"loss": 3.2841, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.29513572599744947, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 9.01603498542274e-05, |
|
"loss": 3.261, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.29877937693568957, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 9.003887269193392e-05, |
|
"loss": 3.2954, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.30242302787392966, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 8.991739552964044e-05, |
|
"loss": 3.2337, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.3060666788121698, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 8.979591836734695e-05, |
|
"loss": 3.2881, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.3097103297504099, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 8.967444120505344e-05, |
|
"loss": 3.3519, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.31335398068865, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 8.955296404275997e-05, |
|
"loss": 3.3147, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.31699763162689015, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 8.943148688046648e-05, |
|
"loss": 3.2304, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.32064128256513025, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 8.931000971817299e-05, |
|
"loss": 3.2526, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.3242849335033704, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 8.91885325558795e-05, |
|
"loss": 3.309, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.3279285844416105, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 8.9067055393586e-05, |
|
"loss": 3.2513, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.3315722353798506, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 8.894557823129253e-05, |
|
"loss": 3.2135, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.33521588631809074, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 8.882410106899904e-05, |
|
"loss": 3.3048, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.33885953725633083, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 8.870262390670553e-05, |
|
"loss": 3.3047, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.342503188194571, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 8.858114674441206e-05, |
|
"loss": 3.2616, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.3461468391328111, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 8.845966958211857e-05, |
|
"loss": 3.2697, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.3497904900710512, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 8.833819241982508e-05, |
|
"loss": 3.2395, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.3534341410092913, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 8.821671525753159e-05, |
|
"loss": 3.2137, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.3570777919475314, |
|
"grad_norm": 0.625, |
|
"learning_rate": 8.80952380952381e-05, |
|
"loss": 3.2872, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.36072144288577157, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 8.797376093294462e-05, |
|
"loss": 3.2682, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.36436509382401167, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 8.785228377065113e-05, |
|
"loss": 3.204, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.36800874476225176, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 8.773080660835762e-05, |
|
"loss": 3.2472, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.3716523957004919, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 8.760932944606415e-05, |
|
"loss": 3.2638, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.375296046638732, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 8.748785228377066e-05, |
|
"loss": 3.2803, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.3789396975769721, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 8.736637512147716e-05, |
|
"loss": 3.273, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.38258334851521225, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 8.724489795918367e-05, |
|
"loss": 3.2854, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.38622699945345235, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 8.712342079689018e-05, |
|
"loss": 3.2373, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.3898706503916925, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 8.700194363459671e-05, |
|
"loss": 3.2259, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.3935143013299326, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 8.688046647230322e-05, |
|
"loss": 3.2402, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.3971579522681727, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 8.675898931000973e-05, |
|
"loss": 3.2379, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.40080160320641284, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 8.663751214771624e-05, |
|
"loss": 3.2564, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.40444525414465293, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 8.651603498542274e-05, |
|
"loss": 3.2342, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.4080889050828931, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 8.639455782312925e-05, |
|
"loss": 3.3336, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.4117325560211332, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 8.627308066083576e-05, |
|
"loss": 3.2684, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.4153762069593733, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 8.615160349854227e-05, |
|
"loss": 3.2581, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.4190198578976134, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 8.603012633624878e-05, |
|
"loss": 3.3428, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.4226635088358535, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 8.59086491739553e-05, |
|
"loss": 3.2331, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.42630715977409367, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 8.578717201166182e-05, |
|
"loss": 3.2203, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.42995081071233376, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 8.566569484936832e-05, |
|
"loss": 3.248, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.43359446165057386, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 8.554421768707483e-05, |
|
"loss": 3.3052, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.437238112588814, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 8.542274052478134e-05, |
|
"loss": 3.2036, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.4408817635270541, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 8.530126336248787e-05, |
|
"loss": 3.2199, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.4445254144652942, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 8.517978620019436e-05, |
|
"loss": 3.2594, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.44816906540353435, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 8.505830903790087e-05, |
|
"loss": 3.26, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.45181271634177445, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 8.49368318756074e-05, |
|
"loss": 3.3623, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.4554563672800146, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 8.48153547133139e-05, |
|
"loss": 3.2625, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.4591000182182547, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 8.469387755102041e-05, |
|
"loss": 3.2738, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.4627436691564948, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 8.457240038872692e-05, |
|
"loss": 3.2688, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.46638732009473494, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 8.445092322643343e-05, |
|
"loss": 3.2392, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.47003097103297503, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 8.432944606413996e-05, |
|
"loss": 3.2414, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.4736746219712152, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 8.420796890184645e-05, |
|
"loss": 3.2461, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.4773182729094553, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 8.408649173955296e-05, |
|
"loss": 3.3459, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.48096192384769537, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 8.396501457725948e-05, |
|
"loss": 3.2631, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.4846055747859355, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 8.3843537414966e-05, |
|
"loss": 3.2883, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.4882492257241756, |
|
"grad_norm": 0.625, |
|
"learning_rate": 8.372206025267249e-05, |
|
"loss": 3.2085, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.49189287666241577, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 8.360058309037901e-05, |
|
"loss": 3.3132, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.49553652760065586, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 8.347910592808552e-05, |
|
"loss": 3.3076, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.49918017853889596, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 8.335762876579204e-05, |
|
"loss": 3.3183, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.5028238294771361, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 8.323615160349854e-05, |
|
"loss": 3.1761, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.5064674804153763, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 8.311467444120505e-05, |
|
"loss": 3.2079, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.5101111313536163, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 8.299319727891157e-05, |
|
"loss": 3.2844, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.5137547822918564, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 8.287172011661808e-05, |
|
"loss": 3.2492, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.5173984332300966, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 8.275024295432459e-05, |
|
"loss": 3.2525, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.5210420841683366, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 8.26287657920311e-05, |
|
"loss": 3.2449, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.5246857351065768, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 8.250728862973761e-05, |
|
"loss": 3.2279, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.5283293860448169, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 8.238581146744413e-05, |
|
"loss": 3.2751, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.531973036983057, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 8.226433430515063e-05, |
|
"loss": 3.2404, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.5356166879212971, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 8.214285714285714e-05, |
|
"loss": 3.2911, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.5392603388595373, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 8.202137998056366e-05, |
|
"loss": 3.2637, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.5429039897977773, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 8.189990281827017e-05, |
|
"loss": 3.2004, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.5465476407360175, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 8.177842565597668e-05, |
|
"loss": 3.2958, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.5501912916742576, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 8.165694849368319e-05, |
|
"loss": 3.2371, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.5538349426124978, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 8.15354713313897e-05, |
|
"loss": 3.2798, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.5574785935507378, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 8.141399416909622e-05, |
|
"loss": 3.2608, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.561122244488978, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 8.129251700680273e-05, |
|
"loss": 3.2374, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.5647658954272181, |
|
"grad_norm": 0.625, |
|
"learning_rate": 8.117103984450923e-05, |
|
"loss": 3.189, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.5684095463654582, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 8.104956268221575e-05, |
|
"loss": 3.2008, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.5720531973036983, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 8.092808551992226e-05, |
|
"loss": 3.219, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.5756968482419385, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 8.080660835762877e-05, |
|
"loss": 3.2417, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.5793404991801785, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 8.068513119533528e-05, |
|
"loss": 3.236, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.5829841501184186, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 8.056365403304179e-05, |
|
"loss": 3.3037, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.5866278010566588, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 8.04421768707483e-05, |
|
"loss": 3.2412, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.5902714519948989, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 8.032069970845482e-05, |
|
"loss": 3.2293, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.593915102933139, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 8.019922254616132e-05, |
|
"loss": 3.2208, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.5975587538713791, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 8.007774538386784e-05, |
|
"loss": 3.2251, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.6012024048096193, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 7.995626822157435e-05, |
|
"loss": 3.284, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.6048460557478593, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 7.983479105928086e-05, |
|
"loss": 3.2404, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.6084897066860995, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 7.971331389698737e-05, |
|
"loss": 3.3335, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.6121333576243396, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 7.959183673469388e-05, |
|
"loss": 3.276, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.6157770085625797, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 7.947035957240039e-05, |
|
"loss": 3.2263, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.6194206595008198, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 7.934888241010691e-05, |
|
"loss": 3.1878, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.62306431043906, |
|
"grad_norm": 0.625, |
|
"learning_rate": 7.922740524781341e-05, |
|
"loss": 3.294, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.6267079613773, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 7.910592808551993e-05, |
|
"loss": 3.2183, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.6303516123155402, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 7.898445092322644e-05, |
|
"loss": 3.1985, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.6339952632537803, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 7.886297376093295e-05, |
|
"loss": 3.1563, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.6376389141920205, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 7.874149659863946e-05, |
|
"loss": 3.2806, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.6412825651302605, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 7.862001943634597e-05, |
|
"loss": 3.2288, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.6449262160685006, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 7.849854227405248e-05, |
|
"loss": 3.2785, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.6485698670067408, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 7.8377065111759e-05, |
|
"loss": 3.2952, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.6522135179449808, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 7.82555879494655e-05, |
|
"loss": 3.1665, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.655857168883221, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 7.8134110787172e-05, |
|
"loss": 3.1984, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.6595008198214611, |
|
"grad_norm": 0.625, |
|
"learning_rate": 7.801263362487853e-05, |
|
"loss": 3.2051, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.6631444707597012, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 7.789115646258504e-05, |
|
"loss": 3.2141, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.6667881216979413, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 7.776967930029155e-05, |
|
"loss": 3.312, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.6704317726361815, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 7.764820213799806e-05, |
|
"loss": 3.2473, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.6740754235744215, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 7.752672497570457e-05, |
|
"loss": 3.2924, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.6777190745126617, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 7.740524781341109e-05, |
|
"loss": 3.2799, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.6813627254509018, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 7.72837706511176e-05, |
|
"loss": 3.2251, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.685006376389142, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 7.71622934888241e-05, |
|
"loss": 3.209, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.688650027327382, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 7.704081632653062e-05, |
|
"loss": 3.2312, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.6922936782656222, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 7.691933916423713e-05, |
|
"loss": 3.2487, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.6959373292038623, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 7.679786200194364e-05, |
|
"loss": 3.3157, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.6995809801421023, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 7.667638483965015e-05, |
|
"loss": 3.299, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.7032246310803425, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 7.655490767735666e-05, |
|
"loss": 3.2755, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.7068682820185826, |
|
"grad_norm": 0.625, |
|
"learning_rate": 7.643343051506318e-05, |
|
"loss": 3.317, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.7105119329568227, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 7.631195335276969e-05, |
|
"loss": 3.1871, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.7141555838950628, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 7.619047619047618e-05, |
|
"loss": 3.2405, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.717799234833303, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 7.606899902818271e-05, |
|
"loss": 3.3068, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.7214428857715431, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 7.594752186588922e-05, |
|
"loss": 3.335, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.7250865367097832, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 7.582604470359573e-05, |
|
"loss": 3.2617, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.7287301876480233, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 7.570456754130224e-05, |
|
"loss": 3.2335, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.7323738385862635, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 7.558309037900875e-05, |
|
"loss": 3.2604, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.7360174895245035, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 7.546161321671527e-05, |
|
"loss": 3.2632, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.7396611404627437, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 7.534013605442178e-05, |
|
"loss": 3.2184, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 0.7433047914009838, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 7.521865889212827e-05, |
|
"loss": 3.2848, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.7469484423392239, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 7.50971817298348e-05, |
|
"loss": 3.2473, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.750592093277464, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 7.49757045675413e-05, |
|
"loss": 3.195, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.7542357442157042, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 7.485422740524782e-05, |
|
"loss": 3.2248, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.7578793951539442, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 7.473275024295433e-05, |
|
"loss": 3.1511, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.7615230460921844, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 7.461127308066083e-05, |
|
"loss": 3.2719, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 0.7651666970304245, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 7.448979591836736e-05, |
|
"loss": 3.2339, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.7688103479686647, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 7.436831875607387e-05, |
|
"loss": 3.2863, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 0.7724539989069047, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 7.424684159378036e-05, |
|
"loss": 3.2057, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.7760976498451448, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 7.412536443148689e-05, |
|
"loss": 3.2397, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 0.779741300783385, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 7.40038872691934e-05, |
|
"loss": 3.2323, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 0.783384951721625, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 7.38824101068999e-05, |
|
"loss": 3.2764, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.7870286026598652, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 7.376093294460641e-05, |
|
"loss": 3.2668, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.7906722535981053, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 7.363945578231292e-05, |
|
"loss": 3.2953, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 0.7943159045363454, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 7.351797862001945e-05, |
|
"loss": 3.1915, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 0.7979595554745855, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 7.339650145772596e-05, |
|
"loss": 3.2622, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 0.8016032064128257, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 7.327502429543247e-05, |
|
"loss": 3.2522, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.8052468573510657, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 7.315354713313898e-05, |
|
"loss": 3.1673, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 0.8088905082893059, |
|
"grad_norm": 0.625, |
|
"learning_rate": 7.303206997084548e-05, |
|
"loss": 3.2722, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 0.812534159227546, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 7.2910592808552e-05, |
|
"loss": 3.2377, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 0.8161778101657862, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 7.27891156462585e-05, |
|
"loss": 3.179, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 0.8198214611040262, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 7.266763848396501e-05, |
|
"loss": 3.2588, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.8234651120422664, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 7.254616132167152e-05, |
|
"loss": 3.2664, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 0.8271087629805065, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 7.242468415937805e-05, |
|
"loss": 3.2515, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 0.8307524139187465, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 7.230320699708455e-05, |
|
"loss": 3.2102, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 0.8343960648569867, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 7.218172983479106e-05, |
|
"loss": 3.246, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 0.8380397157952268, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 7.206025267249757e-05, |
|
"loss": 3.3321, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.8416833667334669, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 7.193877551020408e-05, |
|
"loss": 3.0889, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 0.845327017671707, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 7.18172983479106e-05, |
|
"loss": 3.2811, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 0.8489706686099472, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 7.16958211856171e-05, |
|
"loss": 3.1688, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 0.8526143195481873, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 7.157434402332361e-05, |
|
"loss": 3.2495, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 0.8562579704864274, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 7.145286686103013e-05, |
|
"loss": 3.1742, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.8599016214246675, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 7.133138969873664e-05, |
|
"loss": 3.2293, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 0.8635452723629077, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 7.120991253644315e-05, |
|
"loss": 3.2574, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 0.8671889233011477, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 7.108843537414966e-05, |
|
"loss": 3.2496, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 0.8708325742393879, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 7.096695821185617e-05, |
|
"loss": 3.2527, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 0.874476225177628, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 7.08454810495627e-05, |
|
"loss": 3.1984, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.8781198761158681, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 7.072400388726919e-05, |
|
"loss": 3.2517, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 0.8817635270541082, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 7.06025267249757e-05, |
|
"loss": 3.2105, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 0.8854071779923484, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 7.048104956268222e-05, |
|
"loss": 3.2125, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 0.8890508289305884, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 7.035957240038873e-05, |
|
"loss": 3.255, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 0.8926944798688285, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 7.023809523809524e-05, |
|
"loss": 3.3331, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.8963381308070687, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 7.011661807580175e-05, |
|
"loss": 3.3545, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 0.8999817817453089, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 6.999514091350826e-05, |
|
"loss": 3.2776, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 0.9036254326835489, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 6.987366375121478e-05, |
|
"loss": 3.2331, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 0.907269083621789, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 6.975218658892128e-05, |
|
"loss": 3.2803, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 0.9109127345600292, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 6.963070942662779e-05, |
|
"loss": 3.256, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.9145563854982692, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 6.950923226433431e-05, |
|
"loss": 3.2896, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 0.9182000364365094, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 6.938775510204082e-05, |
|
"loss": 3.2555, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 0.9218436873747495, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 6.926627793974733e-05, |
|
"loss": 3.2682, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 0.9254873383129896, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 6.914480077745384e-05, |
|
"loss": 3.1564, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 0.9291309892512297, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 6.902332361516035e-05, |
|
"loss": 3.1445, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.9327746401894699, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 6.890184645286687e-05, |
|
"loss": 3.2515, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 0.9364182911277099, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 6.878036929057337e-05, |
|
"loss": 3.1962, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 0.9400619420659501, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 6.865889212827988e-05, |
|
"loss": 3.3199, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 0.9437055930041902, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 6.85374149659864e-05, |
|
"loss": 3.264, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 0.9473492439424304, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 6.841593780369291e-05, |
|
"loss": 3.1853, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.9509928948806704, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 6.829446064139942e-05, |
|
"loss": 3.3017, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 0.9546365458189106, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 6.817298347910593e-05, |
|
"loss": 3.2358, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 0.9582801967571507, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 6.805150631681244e-05, |
|
"loss": 3.2854, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 0.9619238476953907, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 6.793002915451895e-05, |
|
"loss": 3.1873, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 0.9655674986336309, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 6.780855199222547e-05, |
|
"loss": 3.2274, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.969211149571871, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 6.768707482993197e-05, |
|
"loss": 3.2037, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 0.9728548005101111, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 6.756559766763849e-05, |
|
"loss": 3.3132, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 0.9764984514483512, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 6.7444120505345e-05, |
|
"loss": 3.2734, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 0.9801421023865914, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 6.732264334305151e-05, |
|
"loss": 3.1784, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 0.9837857533248315, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 6.720116618075802e-05, |
|
"loss": 3.2181, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.9874294042630716, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 6.707968901846453e-05, |
|
"loss": 3.2676, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 0.9910730552013117, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 6.695821185617104e-05, |
|
"loss": 3.1952, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 0.9947167061395519, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 6.683673469387756e-05, |
|
"loss": 3.3135, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 0.9983603570777919, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 6.671525753158406e-05, |
|
"loss": 3.2643, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 1.002004008016032, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 6.659378036929058e-05, |
|
"loss": 3.1996, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 1.0056476589542722, |
|
"grad_norm": 0.75, |
|
"learning_rate": 6.647230320699709e-05, |
|
"loss": 3.0862, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 1.0092913098925123, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 6.63508260447036e-05, |
|
"loss": 3.1886, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 1.0129349608307525, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 6.622934888241011e-05, |
|
"loss": 3.1478, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 1.0165786117689926, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 6.610787172011662e-05, |
|
"loss": 3.1577, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 1.0202222627072326, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 6.598639455782313e-05, |
|
"loss": 3.148, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 1.0238659136454729, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 6.586491739552965e-05, |
|
"loss": 3.1971, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 1.027509564583713, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 6.574344023323615e-05, |
|
"loss": 3.1351, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 1.031153215521953, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 6.562196307094267e-05, |
|
"loss": 3.2304, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 1.0347968664601932, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 6.550048590864918e-05, |
|
"loss": 3.1582, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 1.0384405173984332, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 6.537900874635569e-05, |
|
"loss": 3.1183, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 1.0420841683366733, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 6.52575315840622e-05, |
|
"loss": 3.2056, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 1.0457278192749135, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 6.513605442176871e-05, |
|
"loss": 3.1694, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 1.0493714702131536, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 6.501457725947522e-05, |
|
"loss": 3.1428, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 1.0530151211513936, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 6.489310009718174e-05, |
|
"loss": 3.1052, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 1.0566587720896339, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 6.477162293488824e-05, |
|
"loss": 3.1195, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 1.060302423027874, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 6.465014577259475e-05, |
|
"loss": 3.2278, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 1.063946073966114, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 6.452866861030127e-05, |
|
"loss": 3.1563, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 1.0675897249043542, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 6.440719144800778e-05, |
|
"loss": 3.1505, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 1.0712333758425943, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 6.428571428571429e-05, |
|
"loss": 3.1681, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 1.0748770267808343, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 6.41642371234208e-05, |
|
"loss": 3.17, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 1.0785206777190746, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 6.40427599611273e-05, |
|
"loss": 3.1775, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 1.0821643286573146, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 6.392128279883383e-05, |
|
"loss": 3.0921, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 1.0858079795955549, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 6.379980563654034e-05, |
|
"loss": 3.1666, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 1.089451630533795, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 6.367832847424684e-05, |
|
"loss": 3.1935, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 1.093095281472035, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 6.355685131195336e-05, |
|
"loss": 3.0588, |
|
"step": 3000 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 8232, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 6.570661967366676e+17, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|