|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.0, |
|
"eval_steps": 500, |
|
"global_step": 400, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.005, |
|
"grad_norm": 0.07553869201667132, |
|
"learning_rate": 2.5e-06, |
|
"loss": 0.277, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.07142321749040205, |
|
"learning_rate": 5e-06, |
|
"loss": 0.1731, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.015, |
|
"grad_norm": 0.04004541380672789, |
|
"learning_rate": 7.5e-06, |
|
"loss": 0.2066, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.03135466258081705, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1541, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.025, |
|
"grad_norm": 0.08107204814222205, |
|
"learning_rate": 1.25e-05, |
|
"loss": 0.3396, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.02826228780285631, |
|
"learning_rate": 1.5e-05, |
|
"loss": 0.0857, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.035, |
|
"grad_norm": 0.04534872682356579, |
|
"learning_rate": 1.75e-05, |
|
"loss": 0.1668, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.08062512421235096, |
|
"learning_rate": 2e-05, |
|
"loss": 0.307, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.045, |
|
"grad_norm": 0.09855642598201231, |
|
"learning_rate": 2.25e-05, |
|
"loss": 0.3094, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.08223399648677109, |
|
"learning_rate": 2.5e-05, |
|
"loss": 0.3169, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.055, |
|
"grad_norm": 0.036306345795043035, |
|
"learning_rate": 2.7500000000000004e-05, |
|
"loss": 0.156, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.0446200343132829, |
|
"learning_rate": 3e-05, |
|
"loss": 0.1902, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.065, |
|
"grad_norm": 0.0394950521156506, |
|
"learning_rate": 3.2500000000000004e-05, |
|
"loss": 0.1665, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.0779690335299316, |
|
"learning_rate": 3.5e-05, |
|
"loss": 0.3144, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.075, |
|
"grad_norm": 0.0646426759636167, |
|
"learning_rate": 3.7500000000000003e-05, |
|
"loss": 0.2018, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.08984226365886303, |
|
"learning_rate": 4e-05, |
|
"loss": 0.2255, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.085, |
|
"grad_norm": 0.036674267927634084, |
|
"learning_rate": 4.25e-05, |
|
"loss": 0.1264, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.09371135142216022, |
|
"learning_rate": 4.5e-05, |
|
"loss": 0.2776, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.095, |
|
"grad_norm": 0.04744368538635616, |
|
"learning_rate": 4.75e-05, |
|
"loss": 0.149, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.11052474076313488, |
|
"learning_rate": 5e-05, |
|
"loss": 0.3257, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.105, |
|
"grad_norm": 0.12235330629107824, |
|
"learning_rate": 5.25e-05, |
|
"loss": 0.2811, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.09475772803286764, |
|
"learning_rate": 5.500000000000001e-05, |
|
"loss": 0.2566, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.115, |
|
"grad_norm": 0.07682060292069236, |
|
"learning_rate": 5.7499999999999995e-05, |
|
"loss": 0.194, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.10253318753424252, |
|
"learning_rate": 6e-05, |
|
"loss": 0.1742, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.125, |
|
"grad_norm": 0.15452516312260753, |
|
"learning_rate": 6.25e-05, |
|
"loss": 0.2426, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.06980713124278756, |
|
"learning_rate": 6.500000000000001e-05, |
|
"loss": 0.1524, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.135, |
|
"grad_norm": 0.17169291075340687, |
|
"learning_rate": 6.750000000000001e-05, |
|
"loss": 0.2302, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.12097272092886306, |
|
"learning_rate": 7e-05, |
|
"loss": 0.1751, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.145, |
|
"grad_norm": 0.04894250463264245, |
|
"learning_rate": 7.25e-05, |
|
"loss": 0.0896, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.16842709573150588, |
|
"learning_rate": 7.500000000000001e-05, |
|
"loss": 0.3866, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.155, |
|
"grad_norm": 0.10293501286832617, |
|
"learning_rate": 7.75e-05, |
|
"loss": 0.166, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.06255095679441007, |
|
"learning_rate": 8e-05, |
|
"loss": 0.1388, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.165, |
|
"grad_norm": 0.09759267630202532, |
|
"learning_rate": 8.25e-05, |
|
"loss": 0.1998, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.03725031482525615, |
|
"learning_rate": 8.5e-05, |
|
"loss": 0.0808, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.175, |
|
"grad_norm": 0.06548370439282793, |
|
"learning_rate": 8.75e-05, |
|
"loss": 0.1329, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.06363300533898003, |
|
"learning_rate": 9e-05, |
|
"loss": 0.0765, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.185, |
|
"grad_norm": 0.1319969217444984, |
|
"learning_rate": 9.250000000000001e-05, |
|
"loss": 0.1583, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.1570940458256374, |
|
"learning_rate": 9.5e-05, |
|
"loss": 0.1703, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.195, |
|
"grad_norm": 0.17816543762213244, |
|
"learning_rate": 9.75e-05, |
|
"loss": 0.1677, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.06814489472510708, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0813, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.205, |
|
"grad_norm": 0.12234791580019863, |
|
"learning_rate": 9.999809615320856e-05, |
|
"loss": 0.067, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.11898491410474155, |
|
"learning_rate": 9.999238475781957e-05, |
|
"loss": 0.0832, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.215, |
|
"grad_norm": 0.11599178809474275, |
|
"learning_rate": 9.998286624877786e-05, |
|
"loss": 0.1261, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.18501882395233082, |
|
"learning_rate": 9.99695413509548e-05, |
|
"loss": 0.1908, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.225, |
|
"grad_norm": 0.06856454706988713, |
|
"learning_rate": 9.99524110790929e-05, |
|
"loss": 0.0672, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.05947222817167692, |
|
"learning_rate": 9.99314767377287e-05, |
|
"loss": 0.0745, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.235, |
|
"grad_norm": 0.07765190202488637, |
|
"learning_rate": 9.990673992109335e-05, |
|
"loss": 0.0756, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.1359994526177061, |
|
"learning_rate": 9.987820251299122e-05, |
|
"loss": 0.0331, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.245, |
|
"grad_norm": 0.1373406192095304, |
|
"learning_rate": 9.98458666866564e-05, |
|
"loss": 0.0806, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.05473571476326683, |
|
"learning_rate": 9.980973490458728e-05, |
|
"loss": 0.0522, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.255, |
|
"grad_norm": 0.1169729050564611, |
|
"learning_rate": 9.976980991835894e-05, |
|
"loss": 0.0762, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.05579810258223367, |
|
"learning_rate": 9.972609476841367e-05, |
|
"loss": 0.0345, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.265, |
|
"grad_norm": 0.09381823707624487, |
|
"learning_rate": 9.967859278382938e-05, |
|
"loss": 0.0897, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.07938166641979207, |
|
"learning_rate": 9.962730758206611e-05, |
|
"loss": 0.0654, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.275, |
|
"grad_norm": 0.13087865330833595, |
|
"learning_rate": 9.957224306869053e-05, |
|
"loss": 0.1015, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.263321695571175, |
|
"learning_rate": 9.951340343707852e-05, |
|
"loss": 0.0519, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.285, |
|
"grad_norm": 0.14202447905316087, |
|
"learning_rate": 9.945079316809585e-05, |
|
"loss": 0.0368, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.21296861819635188, |
|
"learning_rate": 9.938441702975689e-05, |
|
"loss": 0.0699, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.295, |
|
"grad_norm": 0.17479932116014785, |
|
"learning_rate": 9.931428007686158e-05, |
|
"loss": 0.0383, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.19384996245201125, |
|
"learning_rate": 9.924038765061042e-05, |
|
"loss": 0.0977, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.305, |
|
"grad_norm": 0.3132458011578011, |
|
"learning_rate": 9.916274537819775e-05, |
|
"loss": 0.0783, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.24343342969724435, |
|
"learning_rate": 9.908135917238321e-05, |
|
"loss": 0.1309, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.315, |
|
"grad_norm": 0.06358039936658727, |
|
"learning_rate": 9.899623523104149e-05, |
|
"loss": 0.0341, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.2220768799042642, |
|
"learning_rate": 9.890738003669029e-05, |
|
"loss": 0.1244, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.325, |
|
"grad_norm": 0.09877556684729816, |
|
"learning_rate": 9.881480035599667e-05, |
|
"loss": 0.0584, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.07125822760195885, |
|
"learning_rate": 9.871850323926177e-05, |
|
"loss": 0.0099, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.335, |
|
"grad_norm": 0.19991568339646582, |
|
"learning_rate": 9.861849601988383e-05, |
|
"loss": 0.0532, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.18115871524580987, |
|
"learning_rate": 9.851478631379982e-05, |
|
"loss": 0.1011, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.345, |
|
"grad_norm": 0.07567960238386034, |
|
"learning_rate": 9.84073820189054e-05, |
|
"loss": 0.034, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.15451087209839287, |
|
"learning_rate": 9.829629131445342e-05, |
|
"loss": 0.0348, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.355, |
|
"grad_norm": 0.08313091042199042, |
|
"learning_rate": 9.818152266043114e-05, |
|
"loss": 0.0422, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.07302944049536657, |
|
"learning_rate": 9.806308479691595e-05, |
|
"loss": 0.0417, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.365, |
|
"grad_norm": 0.041309166149277605, |
|
"learning_rate": 9.794098674340965e-05, |
|
"loss": 0.0231, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.05571619251756244, |
|
"learning_rate": 9.781523779815179e-05, |
|
"loss": 0.0517, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.375, |
|
"grad_norm": 0.07780175573372976, |
|
"learning_rate": 9.768584753741134e-05, |
|
"loss": 0.0312, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.046226872712960713, |
|
"learning_rate": 9.755282581475769e-05, |
|
"loss": 0.0418, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.385, |
|
"grad_norm": 0.08057703631440487, |
|
"learning_rate": 9.741618276030997e-05, |
|
"loss": 0.0419, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.05050489987091237, |
|
"learning_rate": 9.727592877996585e-05, |
|
"loss": 0.0163, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.395, |
|
"grad_norm": 0.0959178937528156, |
|
"learning_rate": 9.713207455460894e-05, |
|
"loss": 0.0428, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.08603826631317996, |
|
"learning_rate": 9.698463103929542e-05, |
|
"loss": 0.0674, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.405, |
|
"grad_norm": 0.044173037654917646, |
|
"learning_rate": 9.683360946241989e-05, |
|
"loss": 0.0425, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.026290493271921827, |
|
"learning_rate": 9.667902132486009e-05, |
|
"loss": 0.0053, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.415, |
|
"grad_norm": 0.09325040701230163, |
|
"learning_rate": 9.652087839910124e-05, |
|
"loss": 0.0505, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.11764293119485195, |
|
"learning_rate": 9.635919272833938e-05, |
|
"loss": 0.0305, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.425, |
|
"grad_norm": 0.18654991334295415, |
|
"learning_rate": 9.619397662556435e-05, |
|
"loss": 0.011, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.08098006638815154, |
|
"learning_rate": 9.602524267262203e-05, |
|
"loss": 0.0183, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.435, |
|
"grad_norm": 0.07659797438155175, |
|
"learning_rate": 9.58530037192562e-05, |
|
"loss": 0.0274, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.16347900939631616, |
|
"learning_rate": 9.567727288213005e-05, |
|
"loss": 0.0859, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.445, |
|
"grad_norm": 0.15150817750296133, |
|
"learning_rate": 9.549806354382717e-05, |
|
"loss": 0.0333, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.07123424851384792, |
|
"learning_rate": 9.53153893518325e-05, |
|
"loss": 0.0405, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.455, |
|
"grad_norm": 0.11026163678942051, |
|
"learning_rate": 9.512926421749304e-05, |
|
"loss": 0.0462, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.07331190265762698, |
|
"learning_rate": 9.493970231495835e-05, |
|
"loss": 0.0069, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.465, |
|
"grad_norm": 0.06418964403024571, |
|
"learning_rate": 9.474671808010126e-05, |
|
"loss": 0.0441, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.04366811050044381, |
|
"learning_rate": 9.45503262094184e-05, |
|
"loss": 0.0321, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.475, |
|
"grad_norm": 0.06891688314378318, |
|
"learning_rate": 9.435054165891109e-05, |
|
"loss": 0.0197, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.0791719137172165, |
|
"learning_rate": 9.414737964294636e-05, |
|
"loss": 0.0211, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.485, |
|
"grad_norm": 0.09055479977019616, |
|
"learning_rate": 9.394085563309827e-05, |
|
"loss": 0.0368, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.1078342902713965, |
|
"learning_rate": 9.373098535696979e-05, |
|
"loss": 0.0402, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.495, |
|
"grad_norm": 0.04234438785284557, |
|
"learning_rate": 9.351778479699499e-05, |
|
"loss": 0.0155, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.32259634488597166, |
|
"learning_rate": 9.330127018922194e-05, |
|
"loss": 0.0234, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.505, |
|
"grad_norm": 0.057346774722805624, |
|
"learning_rate": 9.308145802207629e-05, |
|
"loss": 0.0057, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.06845584491597821, |
|
"learning_rate": 9.285836503510562e-05, |
|
"loss": 0.0239, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.515, |
|
"grad_norm": 0.027458411814212238, |
|
"learning_rate": 9.263200821770461e-05, |
|
"loss": 0.0046, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.04119439564225103, |
|
"learning_rate": 9.24024048078213e-05, |
|
"loss": 0.0067, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.525, |
|
"grad_norm": 0.25272039680541203, |
|
"learning_rate": 9.21695722906443e-05, |
|
"loss": 0.1068, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.15642301404795694, |
|
"learning_rate": 9.193352839727121e-05, |
|
"loss": 0.0487, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.535, |
|
"grad_norm": 0.020234092439841273, |
|
"learning_rate": 9.169429110335841e-05, |
|
"loss": 0.0044, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.021594134995858247, |
|
"learning_rate": 9.145187862775209e-05, |
|
"loss": 0.0023, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.545, |
|
"grad_norm": 0.11956523872488599, |
|
"learning_rate": 9.120630943110077e-05, |
|
"loss": 0.0647, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.06453797114816638, |
|
"learning_rate": 9.09576022144496e-05, |
|
"loss": 0.0378, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.555, |
|
"grad_norm": 0.040619091199357006, |
|
"learning_rate": 9.070577591781597e-05, |
|
"loss": 0.0101, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.012631286216702108, |
|
"learning_rate": 9.045084971874738e-05, |
|
"loss": 0.0025, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.565, |
|
"grad_norm": 0.11250625624010586, |
|
"learning_rate": 9.019284303086087e-05, |
|
"loss": 0.0326, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.008498513085188694, |
|
"learning_rate": 8.993177550236464e-05, |
|
"loss": 0.0019, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.575, |
|
"grad_norm": 0.06549524971538588, |
|
"learning_rate": 8.966766701456177e-05, |
|
"loss": 0.0144, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.1243637807598006, |
|
"learning_rate": 8.940053768033609e-05, |
|
"loss": 0.0427, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.585, |
|
"grad_norm": 0.4551297436540559, |
|
"learning_rate": 8.91304078426207e-05, |
|
"loss": 0.1364, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.14895611964997935, |
|
"learning_rate": 8.885729807284856e-05, |
|
"loss": 0.0516, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.595, |
|
"grad_norm": 0.03168114639692374, |
|
"learning_rate": 8.858122916938601e-05, |
|
"loss": 0.0038, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.08370376943417981, |
|
"learning_rate": 8.83022221559489e-05, |
|
"loss": 0.0068, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.605, |
|
"grad_norm": 0.07476474940113496, |
|
"learning_rate": 8.802029828000156e-05, |
|
"loss": 0.0182, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.13159976173852048, |
|
"learning_rate": 8.773547901113862e-05, |
|
"loss": 0.0352, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.615, |
|
"grad_norm": 0.022707973634618142, |
|
"learning_rate": 8.744778603945011e-05, |
|
"loss": 0.0033, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.12482944752372985, |
|
"learning_rate": 8.715724127386972e-05, |
|
"loss": 0.0202, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.625, |
|
"grad_norm": 0.39302174930812633, |
|
"learning_rate": 8.68638668405062e-05, |
|
"loss": 0.0758, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.19383130479510832, |
|
"learning_rate": 8.656768508095853e-05, |
|
"loss": 0.0325, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.635, |
|
"grad_norm": 0.03039287191356843, |
|
"learning_rate": 8.626871855061438e-05, |
|
"loss": 0.0041, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.048215778020076296, |
|
"learning_rate": 8.596699001693255e-05, |
|
"loss": 0.0273, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.645, |
|
"grad_norm": 0.1392522953938403, |
|
"learning_rate": 8.566252245770909e-05, |
|
"loss": 0.0622, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.07411597583466574, |
|
"learning_rate": 8.535533905932738e-05, |
|
"loss": 0.0116, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.655, |
|
"grad_norm": 0.08394509913329051, |
|
"learning_rate": 8.504546321499255e-05, |
|
"loss": 0.0454, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.04530702990326196, |
|
"learning_rate": 8.473291852294987e-05, |
|
"loss": 0.0185, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.665, |
|
"grad_norm": 0.03765639754984064, |
|
"learning_rate": 8.44177287846877e-05, |
|
"loss": 0.0065, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.031586807052073985, |
|
"learning_rate": 8.409991800312493e-05, |
|
"loss": 0.0064, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.675, |
|
"grad_norm": 0.08297489591002068, |
|
"learning_rate": 8.377951038078302e-05, |
|
"loss": 0.0086, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.03562288518144821, |
|
"learning_rate": 8.345653031794292e-05, |
|
"loss": 0.0117, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.685, |
|
"grad_norm": 0.031042156897553914, |
|
"learning_rate": 8.313100241078689e-05, |
|
"loss": 0.0032, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.02483745350649097, |
|
"learning_rate": 8.280295144952536e-05, |
|
"loss": 0.0116, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.695, |
|
"grad_norm": 0.009338647544721107, |
|
"learning_rate": 8.247240241650918e-05, |
|
"loss": 0.0015, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.34270075427008806, |
|
"learning_rate": 8.213938048432697e-05, |
|
"loss": 0.0789, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.705, |
|
"grad_norm": 0.024842574848788327, |
|
"learning_rate": 8.18039110138882e-05, |
|
"loss": 0.0039, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.12421960639233866, |
|
"learning_rate": 8.146601955249188e-05, |
|
"loss": 0.0398, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.715, |
|
"grad_norm": 0.056187242189785436, |
|
"learning_rate": 8.112573183188099e-05, |
|
"loss": 0.0103, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.023766584635126216, |
|
"learning_rate": 8.07830737662829e-05, |
|
"loss": 0.0049, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.725, |
|
"grad_norm": 0.04512771394673685, |
|
"learning_rate": 8.043807145043604e-05, |
|
"loss": 0.0219, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.0274526387971708, |
|
"learning_rate": 8.009075115760243e-05, |
|
"loss": 0.0033, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.735, |
|
"grad_norm": 0.19532934746359332, |
|
"learning_rate": 7.974113933756707e-05, |
|
"loss": 0.0734, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.032852874131568864, |
|
"learning_rate": 7.938926261462366e-05, |
|
"loss": 0.0042, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.745, |
|
"grad_norm": 0.0640117280012966, |
|
"learning_rate": 7.903514778554699e-05, |
|
"loss": 0.0156, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.047410567888246616, |
|
"learning_rate": 7.86788218175523e-05, |
|
"loss": 0.0074, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.755, |
|
"grad_norm": 0.16842524790743002, |
|
"learning_rate": 7.832031184624164e-05, |
|
"loss": 0.0398, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.006147996063681932, |
|
"learning_rate": 7.795964517353735e-05, |
|
"loss": 0.001, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.765, |
|
"grad_norm": 0.04008165019001057, |
|
"learning_rate": 7.75968492656029e-05, |
|
"loss": 0.0059, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.007292282430214955, |
|
"learning_rate": 7.723195175075136e-05, |
|
"loss": 0.0012, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.775, |
|
"grad_norm": 0.14138265996109384, |
|
"learning_rate": 7.68649804173412e-05, |
|
"loss": 0.0259, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.07468592471462034, |
|
"learning_rate": 7.649596321166024e-05, |
|
"loss": 0.0192, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.785, |
|
"grad_norm": 0.014395933526058188, |
|
"learning_rate": 7.612492823579745e-05, |
|
"loss": 0.0025, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.0038502942774920196, |
|
"learning_rate": 7.575190374550272e-05, |
|
"loss": 0.0005, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.795, |
|
"grad_norm": 0.10226426219442211, |
|
"learning_rate": 7.537691814803521e-05, |
|
"loss": 0.0123, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.012762312166514576, |
|
"learning_rate": 7.500000000000001e-05, |
|
"loss": 0.0019, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.805, |
|
"grad_norm": 0.01630848631501936, |
|
"learning_rate": 7.462117800517336e-05, |
|
"loss": 0.0019, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.09121920855984286, |
|
"learning_rate": 7.424048101231686e-05, |
|
"loss": 0.0252, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.815, |
|
"grad_norm": 0.044779381951263734, |
|
"learning_rate": 7.385793801298042e-05, |
|
"loss": 0.0049, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.0069701432925383874, |
|
"learning_rate": 7.347357813929454e-05, |
|
"loss": 0.0005, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.825, |
|
"grad_norm": 0.04833507572943274, |
|
"learning_rate": 7.308743066175172e-05, |
|
"loss": 0.0127, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.001782479532615867, |
|
"learning_rate": 7.269952498697734e-05, |
|
"loss": 0.0003, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.835, |
|
"grad_norm": 0.1033970098216112, |
|
"learning_rate": 7.230989065549044e-05, |
|
"loss": 0.0331, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.050963861639285846, |
|
"learning_rate": 7.191855733945387e-05, |
|
"loss": 0.0078, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.845, |
|
"grad_norm": 0.025336676884289144, |
|
"learning_rate": 7.152555484041476e-05, |
|
"loss": 0.0037, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.04631516809520592, |
|
"learning_rate": 7.113091308703498e-05, |
|
"loss": 0.0026, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.855, |
|
"grad_norm": 0.024526402505777593, |
|
"learning_rate": 7.073466213281196e-05, |
|
"loss": 0.0032, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.12794480290400578, |
|
"learning_rate": 7.033683215379002e-05, |
|
"loss": 0.0286, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.865, |
|
"grad_norm": 0.05388064509201144, |
|
"learning_rate": 6.993745344626231e-05, |
|
"loss": 0.019, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.029754195193650093, |
|
"learning_rate": 6.953655642446368e-05, |
|
"loss": 0.0127, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.875, |
|
"grad_norm": 0.11351797783250586, |
|
"learning_rate": 6.91341716182545e-05, |
|
"loss": 0.0276, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.04238335598804278, |
|
"learning_rate": 6.873032967079561e-05, |
|
"loss": 0.0044, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.885, |
|
"grad_norm": 0.07192092013374167, |
|
"learning_rate": 6.832506133621487e-05, |
|
"loss": 0.0033, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.086298683194853, |
|
"learning_rate": 6.7918397477265e-05, |
|
"loss": 0.0255, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.895, |
|
"grad_norm": 0.010115510174389889, |
|
"learning_rate": 6.751036906297337e-05, |
|
"loss": 0.0005, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.11331127094042127, |
|
"learning_rate": 6.710100716628344e-05, |
|
"loss": 0.0334, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.905, |
|
"grad_norm": 0.00359905539491798, |
|
"learning_rate": 6.669034296168855e-05, |
|
"loss": 0.0005, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.003786666725911578, |
|
"learning_rate": 6.627840772285784e-05, |
|
"loss": 0.0004, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.915, |
|
"grad_norm": 0.1178858284146152, |
|
"learning_rate": 6.586523282025462e-05, |
|
"loss": 0.0059, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.0028456285153728687, |
|
"learning_rate": 6.545084971874738e-05, |
|
"loss": 0.0004, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.925, |
|
"grad_norm": 0.005713089625735049, |
|
"learning_rate": 6.503528997521366e-05, |
|
"loss": 0.0012, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.0030406845588623957, |
|
"learning_rate": 6.461858523613684e-05, |
|
"loss": 0.0003, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.935, |
|
"grad_norm": 0.0028405909976545836, |
|
"learning_rate": 6.420076723519614e-05, |
|
"loss": 0.0003, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.2531768340842614, |
|
"learning_rate": 6.378186779084995e-05, |
|
"loss": 0.0043, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.945, |
|
"grad_norm": 0.05452635559394549, |
|
"learning_rate": 6.336191880391284e-05, |
|
"loss": 0.0288, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.009714923881054332, |
|
"learning_rate": 6.294095225512603e-05, |
|
"loss": 0.0012, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.955, |
|
"grad_norm": 0.004170292898958103, |
|
"learning_rate": 6.251900020272208e-05, |
|
"loss": 0.0004, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.06545271449024238, |
|
"learning_rate": 6.209609477998338e-05, |
|
"loss": 0.0316, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.965, |
|
"grad_norm": 0.002583766103443765, |
|
"learning_rate": 6.167226819279528e-05, |
|
"loss": 0.0005, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.05335515958281829, |
|
"learning_rate": 6.124755271719325e-05, |
|
"loss": 0.0248, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.975, |
|
"grad_norm": 0.11111023851134917, |
|
"learning_rate": 6.0821980696905146e-05, |
|
"loss": 0.015, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.0237783109632992, |
|
"learning_rate": 6.0395584540887963e-05, |
|
"loss": 0.0053, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.985, |
|
"grad_norm": 0.04143947097844851, |
|
"learning_rate": 5.9968396720859864e-05, |
|
"loss": 0.0051, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.02380124179400986, |
|
"learning_rate": 5.9540449768827246e-05, |
|
"loss": 0.0035, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.995, |
|
"grad_norm": 0.01783821356262124, |
|
"learning_rate": 5.911177627460739e-05, |
|
"loss": 0.0019, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.012578119490010059, |
|
"learning_rate": 5.868240888334653e-05, |
|
"loss": 0.0009, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 0.07603452354669571, |
|
"eval_runtime": 0.4047, |
|
"eval_samples_per_second": 2.471, |
|
"eval_steps_per_second": 2.471, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.005, |
|
"grad_norm": 0.014839351675487417, |
|
"learning_rate": 5.8252380293033884e-05, |
|
"loss": 0.0006, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 0.012314267589470526, |
|
"learning_rate": 5.782172325201155e-05, |
|
"loss": 0.0029, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 1.015, |
|
"grad_norm": 0.003125090751647675, |
|
"learning_rate": 5.7390470556480545e-05, |
|
"loss": 0.0004, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 0.007073162659384819, |
|
"learning_rate": 5.695865504800327e-05, |
|
"loss": 0.001, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 1.025, |
|
"grad_norm": 0.31379793587721094, |
|
"learning_rate": 5.6526309611002594e-05, |
|
"loss": 0.0057, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 0.0011295952381376085, |
|
"learning_rate": 5.6093467170257374e-05, |
|
"loss": 0.0001, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 1.035, |
|
"grad_norm": 0.0046728430974052725, |
|
"learning_rate": 5.566016068839535e-05, |
|
"loss": 0.0003, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 0.0055708348232121965, |
|
"learning_rate": 5.522642316338268e-05, |
|
"loss": 0.0006, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 1.045, |
|
"grad_norm": 0.0465350248080129, |
|
"learning_rate": 5.4792287626011204e-05, |
|
"loss": 0.0012, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 0.0008151862390823683, |
|
"learning_rate": 5.435778713738292e-05, |
|
"loss": 0.0001, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.055, |
|
"grad_norm": 0.0017764023093753461, |
|
"learning_rate": 5.392295478639225e-05, |
|
"loss": 0.0003, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"grad_norm": 0.028033413085950722, |
|
"learning_rate": 5.348782368720626e-05, |
|
"loss": 0.0065, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 1.065, |
|
"grad_norm": 0.00419815372280561, |
|
"learning_rate": 5.3052426976742855e-05, |
|
"loss": 0.0004, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"grad_norm": 0.0102127421097092, |
|
"learning_rate": 5.26167978121472e-05, |
|
"loss": 0.0021, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 1.075, |
|
"grad_norm": 0.0015729817142162348, |
|
"learning_rate": 5.218096936826681e-05, |
|
"loss": 0.0003, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 0.004004133543726314, |
|
"learning_rate": 5.174497483512506e-05, |
|
"loss": 0.0002, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 1.085, |
|
"grad_norm": 0.0012454184210680926, |
|
"learning_rate": 5.1308847415393666e-05, |
|
"loss": 0.0002, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"grad_norm": 0.0005409115041498422, |
|
"learning_rate": 5.0872620321864185e-05, |
|
"loss": 0.0001, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 1.095, |
|
"grad_norm": 0.001726174471896587, |
|
"learning_rate": 5.04363267749187e-05, |
|
"loss": 0.0002, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 0.0010814524798174311, |
|
"learning_rate": 5e-05, |
|
"loss": 0.0001, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.105, |
|
"grad_norm": 0.009560946952238247, |
|
"learning_rate": 4.9563673225081314e-05, |
|
"loss": 0.0008, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"grad_norm": 0.01441628431952276, |
|
"learning_rate": 4.912737967813583e-05, |
|
"loss": 0.0009, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 1.115, |
|
"grad_norm": 0.0023532660086251268, |
|
"learning_rate": 4.869115258460635e-05, |
|
"loss": 0.0003, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 0.057060873899210225, |
|
"learning_rate": 4.825502516487497e-05, |
|
"loss": 0.0041, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 1.125, |
|
"grad_norm": 0.020282977300679454, |
|
"learning_rate": 4.781903063173321e-05, |
|
"loss": 0.002, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"grad_norm": 0.02046603304331513, |
|
"learning_rate": 4.738320218785281e-05, |
|
"loss": 0.0075, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 1.135, |
|
"grad_norm": 0.013990089439389753, |
|
"learning_rate": 4.694757302325715e-05, |
|
"loss": 0.0014, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 1.1400000000000001, |
|
"grad_norm": 0.211826851392036, |
|
"learning_rate": 4.6512176312793736e-05, |
|
"loss": 0.0054, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 1.145, |
|
"grad_norm": 0.0064942039937938645, |
|
"learning_rate": 4.607704521360776e-05, |
|
"loss": 0.0011, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"grad_norm": 0.028422826475287776, |
|
"learning_rate": 4.564221286261709e-05, |
|
"loss": 0.0109, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.155, |
|
"grad_norm": 0.0002586519968099824, |
|
"learning_rate": 4.52077123739888e-05, |
|
"loss": 0.0, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 0.13429703504492657, |
|
"learning_rate": 4.477357683661734e-05, |
|
"loss": 0.0067, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 1.165, |
|
"grad_norm": 0.0009260106161090144, |
|
"learning_rate": 4.433983931160467e-05, |
|
"loss": 0.0001, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 1.17, |
|
"grad_norm": 0.2565611405752989, |
|
"learning_rate": 4.390653282974264e-05, |
|
"loss": 0.0069, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 1.175, |
|
"grad_norm": 0.026210448606146238, |
|
"learning_rate": 4.347369038899744e-05, |
|
"loss": 0.002, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 0.0012302259036583447, |
|
"learning_rate": 4.3041344951996746e-05, |
|
"loss": 0.0002, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 1.185, |
|
"grad_norm": 0.02625494458907875, |
|
"learning_rate": 4.260952944351947e-05, |
|
"loss": 0.0011, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"grad_norm": 0.000887755762843119, |
|
"learning_rate": 4.2178276747988446e-05, |
|
"loss": 0.0001, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 1.195, |
|
"grad_norm": 0.003911862624404058, |
|
"learning_rate": 4.174761970696612e-05, |
|
"loss": 0.0002, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 0.03517802115885231, |
|
"learning_rate": 4.131759111665349e-05, |
|
"loss": 0.0019, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.205, |
|
"grad_norm": 0.020725215351612047, |
|
"learning_rate": 4.088822372539263e-05, |
|
"loss": 0.001, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 1.21, |
|
"grad_norm": 0.0002202527290049613, |
|
"learning_rate": 4.045955023117276e-05, |
|
"loss": 0.0, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 1.215, |
|
"grad_norm": 0.004526177203391082, |
|
"learning_rate": 4.003160327914015e-05, |
|
"loss": 0.0005, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"grad_norm": 0.018305572530131792, |
|
"learning_rate": 3.960441545911204e-05, |
|
"loss": 0.0005, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 1.225, |
|
"grad_norm": 0.0007986471410436935, |
|
"learning_rate": 3.917801930309486e-05, |
|
"loss": 0.0001, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"grad_norm": 0.0259210834529961, |
|
"learning_rate": 3.875244728280676e-05, |
|
"loss": 0.0015, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 1.2349999999999999, |
|
"grad_norm": 0.007431515309959642, |
|
"learning_rate": 3.832773180720475e-05, |
|
"loss": 0.0008, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 0.21370639527155416, |
|
"learning_rate": 3.790390522001662e-05, |
|
"loss": 0.0296, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 1.245, |
|
"grad_norm": 0.0017749674756192052, |
|
"learning_rate": 3.748099979727792e-05, |
|
"loss": 0.0002, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 0.06429161818678288, |
|
"learning_rate": 3.705904774487396e-05, |
|
"loss": 0.004, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.255, |
|
"grad_norm": 0.4999355458993204, |
|
"learning_rate": 3.663808119608716e-05, |
|
"loss": 0.0118, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"grad_norm": 0.004634517265489021, |
|
"learning_rate": 3.6218132209150045e-05, |
|
"loss": 0.0002, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 1.2650000000000001, |
|
"grad_norm": 0.10035773018671189, |
|
"learning_rate": 3.579923276480387e-05, |
|
"loss": 0.0132, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"grad_norm": 0.014262309364614909, |
|
"learning_rate": 3.5381414763863166e-05, |
|
"loss": 0.0006, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 1.275, |
|
"grad_norm": 0.0006263407491042736, |
|
"learning_rate": 3.4964710024786354e-05, |
|
"loss": 0.0001, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 0.0018123347337400675, |
|
"learning_rate": 3.4549150281252636e-05, |
|
"loss": 0.0003, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 1.285, |
|
"grad_norm": 0.0180314931604898, |
|
"learning_rate": 3.4134767179745406e-05, |
|
"loss": 0.0056, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"grad_norm": 0.005834199925826024, |
|
"learning_rate": 3.372159227714218e-05, |
|
"loss": 0.0007, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 1.295, |
|
"grad_norm": 0.008235805677850457, |
|
"learning_rate": 3.330965703831146e-05, |
|
"loss": 0.0014, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"grad_norm": 0.01238354074430894, |
|
"learning_rate": 3.289899283371657e-05, |
|
"loss": 0.0027, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.305, |
|
"grad_norm": 0.01737694828749958, |
|
"learning_rate": 3.248963093702663e-05, |
|
"loss": 0.0022, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"grad_norm": 0.0019556398997595315, |
|
"learning_rate": 3.2081602522734986e-05, |
|
"loss": 0.0002, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 1.315, |
|
"grad_norm": 0.011691972118475544, |
|
"learning_rate": 3.167493866378514e-05, |
|
"loss": 0.0009, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 0.004834944193413171, |
|
"learning_rate": 3.12696703292044e-05, |
|
"loss": 0.0004, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 1.325, |
|
"grad_norm": 0.00037022329873905106, |
|
"learning_rate": 3.086582838174551e-05, |
|
"loss": 0.0, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 1.33, |
|
"grad_norm": 0.017585669474956853, |
|
"learning_rate": 3.046344357553632e-05, |
|
"loss": 0.0034, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 1.335, |
|
"grad_norm": 0.0015713497036280754, |
|
"learning_rate": 3.006254655373769e-05, |
|
"loss": 0.0002, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 0.006894231638829978, |
|
"learning_rate": 2.9663167846209998e-05, |
|
"loss": 0.0012, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 1.345, |
|
"grad_norm": 0.0024889803245449374, |
|
"learning_rate": 2.926533786718806e-05, |
|
"loss": 0.0003, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"grad_norm": 0.196326209710552, |
|
"learning_rate": 2.886908691296504e-05, |
|
"loss": 0.0108, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.355, |
|
"grad_norm": 0.0006519053745859627, |
|
"learning_rate": 2.8474445159585235e-05, |
|
"loss": 0.0001, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 1.3599999999999999, |
|
"grad_norm": 0.017091849830244706, |
|
"learning_rate": 2.8081442660546125e-05, |
|
"loss": 0.0017, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 1.365, |
|
"grad_norm": 0.000303651475823324, |
|
"learning_rate": 2.7690109344509563e-05, |
|
"loss": 0.0001, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 1.37, |
|
"grad_norm": 0.026097473151970325, |
|
"learning_rate": 2.7300475013022663e-05, |
|
"loss": 0.0078, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 1.375, |
|
"grad_norm": 0.00044755689357070186, |
|
"learning_rate": 2.6912569338248315e-05, |
|
"loss": 0.0001, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"grad_norm": 0.0005042582378594239, |
|
"learning_rate": 2.6526421860705473e-05, |
|
"loss": 0.0, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 1.385, |
|
"grad_norm": 0.003238731213585818, |
|
"learning_rate": 2.6142061987019577e-05, |
|
"loss": 0.0001, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 1.3900000000000001, |
|
"grad_norm": 0.014755686528500189, |
|
"learning_rate": 2.575951898768315e-05, |
|
"loss": 0.0013, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 1.395, |
|
"grad_norm": 0.00553162806351944, |
|
"learning_rate": 2.537882199482665e-05, |
|
"loss": 0.0005, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 0.0005184577628493705, |
|
"learning_rate": 2.500000000000001e-05, |
|
"loss": 0.0001, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.405, |
|
"grad_norm": 0.0037574278569336966, |
|
"learning_rate": 2.4623081851964806e-05, |
|
"loss": 0.0002, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 1.41, |
|
"grad_norm": 0.0013208423085230652, |
|
"learning_rate": 2.4248096254497288e-05, |
|
"loss": 0.0001, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 1.415, |
|
"grad_norm": 0.008749035967488314, |
|
"learning_rate": 2.3875071764202563e-05, |
|
"loss": 0.0006, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"grad_norm": 0.1377433041169159, |
|
"learning_rate": 2.350403678833976e-05, |
|
"loss": 0.0129, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 1.425, |
|
"grad_norm": 0.0010673749455521134, |
|
"learning_rate": 2.3135019582658802e-05, |
|
"loss": 0.0001, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"grad_norm": 0.0014952971304103484, |
|
"learning_rate": 2.2768048249248648e-05, |
|
"loss": 0.0001, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 1.435, |
|
"grad_norm": 0.0037047451799289005, |
|
"learning_rate": 2.2403150734397094e-05, |
|
"loss": 0.0003, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 0.0016251406693602976, |
|
"learning_rate": 2.2040354826462668e-05, |
|
"loss": 0.0002, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 1.445, |
|
"grad_norm": 0.00028939999776533094, |
|
"learning_rate": 2.167968815375837e-05, |
|
"loss": 0.0, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"grad_norm": 0.001349118937608808, |
|
"learning_rate": 2.132117818244771e-05, |
|
"loss": 0.0001, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.455, |
|
"grad_norm": 0.0067421886978353855, |
|
"learning_rate": 2.0964852214453013e-05, |
|
"loss": 0.0008, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 0.0011089997483127705, |
|
"learning_rate": 2.061073738537635e-05, |
|
"loss": 0.0002, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 1.465, |
|
"grad_norm": 0.0028423391897432755, |
|
"learning_rate": 2.0258860662432942e-05, |
|
"loss": 0.0002, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"grad_norm": 0.01152426090888302, |
|
"learning_rate": 1.9909248842397584e-05, |
|
"loss": 0.0005, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 1.475, |
|
"grad_norm": 0.0674872940289676, |
|
"learning_rate": 1.9561928549563968e-05, |
|
"loss": 0.0089, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 0.0014300801418590321, |
|
"learning_rate": 1.9216926233717085e-05, |
|
"loss": 0.0001, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 1.4849999999999999, |
|
"grad_norm": 0.0009000382689770731, |
|
"learning_rate": 1.887426816811903e-05, |
|
"loss": 0.0001, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"grad_norm": 0.0037992575880660263, |
|
"learning_rate": 1.8533980447508137e-05, |
|
"loss": 0.0005, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 1.495, |
|
"grad_norm": 0.22933461676514044, |
|
"learning_rate": 1.8196088986111797e-05, |
|
"loss": 0.0094, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 0.042524052580419906, |
|
"learning_rate": 1.7860619515673033e-05, |
|
"loss": 0.0092, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.505, |
|
"grad_norm": 0.0023691615908781444, |
|
"learning_rate": 1.7527597583490822e-05, |
|
"loss": 0.0001, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"grad_norm": 0.019487602890200652, |
|
"learning_rate": 1.7197048550474643e-05, |
|
"loss": 0.0033, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 1.5150000000000001, |
|
"grad_norm": 0.0016513291656156698, |
|
"learning_rate": 1.6868997589213136e-05, |
|
"loss": 0.0003, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 0.0014481350536131741, |
|
"learning_rate": 1.6543469682057106e-05, |
|
"loss": 0.0002, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 1.525, |
|
"grad_norm": 0.030859003830790453, |
|
"learning_rate": 1.622048961921699e-05, |
|
"loss": 0.0047, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"grad_norm": 0.0013227578422165872, |
|
"learning_rate": 1.5900081996875083e-05, |
|
"loss": 0.0002, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 1.5350000000000001, |
|
"grad_norm": 0.040650034143242364, |
|
"learning_rate": 1.5582271215312294e-05, |
|
"loss": 0.0058, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"grad_norm": 0.010752068089793936, |
|
"learning_rate": 1.526708147705013e-05, |
|
"loss": 0.0004, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 1.545, |
|
"grad_norm": 0.0007336372616353323, |
|
"learning_rate": 1.4954536785007456e-05, |
|
"loss": 0.0001, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 0.0006185076286575476, |
|
"learning_rate": 1.4644660940672627e-05, |
|
"loss": 0.0001, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.5550000000000002, |
|
"grad_norm": 0.0012108807503260438, |
|
"learning_rate": 1.4337477542290928e-05, |
|
"loss": 0.0001, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 0.014653087670058953, |
|
"learning_rate": 1.4033009983067452e-05, |
|
"loss": 0.0038, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 1.565, |
|
"grad_norm": 0.00070106922262732, |
|
"learning_rate": 1.373128144938563e-05, |
|
"loss": 0.0001, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 1.5699999999999998, |
|
"grad_norm": 0.022894525200356754, |
|
"learning_rate": 1.3432314919041478e-05, |
|
"loss": 0.0034, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 1.575, |
|
"grad_norm": 0.15319546037835274, |
|
"learning_rate": 1.3136133159493802e-05, |
|
"loss": 0.005, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"grad_norm": 0.02298142323498265, |
|
"learning_rate": 1.2842758726130283e-05, |
|
"loss": 0.0036, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 1.585, |
|
"grad_norm": 0.000645097633470432, |
|
"learning_rate": 1.2552213960549891e-05, |
|
"loss": 0.0001, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 1.5899999999999999, |
|
"grad_norm": 0.08053780392470371, |
|
"learning_rate": 1.22645209888614e-05, |
|
"loss": 0.0249, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 1.595, |
|
"grad_norm": 0.00023214176871761666, |
|
"learning_rate": 1.1979701719998453e-05, |
|
"loss": 0.0, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 0.0008201475435205818, |
|
"learning_rate": 1.1697777844051105e-05, |
|
"loss": 0.0001, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.605, |
|
"grad_norm": 0.00476872158921415, |
|
"learning_rate": 1.1418770830614013e-05, |
|
"loss": 0.0002, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 1.6099999999999999, |
|
"grad_norm": 0.001393468029081844, |
|
"learning_rate": 1.1142701927151456e-05, |
|
"loss": 0.0001, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 1.615, |
|
"grad_norm": 0.003859464277860662, |
|
"learning_rate": 1.0869592157379304e-05, |
|
"loss": 0.0001, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"grad_norm": 0.004353663579813155, |
|
"learning_rate": 1.0599462319663905e-05, |
|
"loss": 0.0003, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 1.625, |
|
"grad_norm": 0.013200524828428624, |
|
"learning_rate": 1.0332332985438248e-05, |
|
"loss": 0.0018, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 1.63, |
|
"grad_norm": 0.0024888356543417843, |
|
"learning_rate": 1.006822449763537e-05, |
|
"loss": 0.0001, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 1.635, |
|
"grad_norm": 0.02372982952309258, |
|
"learning_rate": 9.807156969139136e-06, |
|
"loss": 0.0031, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 1.6400000000000001, |
|
"grad_norm": 0.0227337977382863, |
|
"learning_rate": 9.549150281252633e-06, |
|
"loss": 0.0032, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 1.645, |
|
"grad_norm": 0.0006468911312178352, |
|
"learning_rate": 9.294224082184045e-06, |
|
"loss": 0.0001, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"grad_norm": 0.00031790071448860156, |
|
"learning_rate": 9.042397785550405e-06, |
|
"loss": 0.0, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.655, |
|
"grad_norm": 0.014069307621453687, |
|
"learning_rate": 8.793690568899216e-06, |
|
"loss": 0.001, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 1.6600000000000001, |
|
"grad_norm": 0.002913520292376533, |
|
"learning_rate": 8.548121372247918e-06, |
|
"loss": 0.0001, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 1.665, |
|
"grad_norm": 0.010269178065459858, |
|
"learning_rate": 8.305708896641594e-06, |
|
"loss": 0.0008, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"grad_norm": 0.004202310016346467, |
|
"learning_rate": 8.066471602728803e-06, |
|
"loss": 0.0002, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 1.675, |
|
"grad_norm": 0.06791650915734657, |
|
"learning_rate": 7.830427709355725e-06, |
|
"loss": 0.0089, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 1.6800000000000002, |
|
"grad_norm": 0.00023193543187296342, |
|
"learning_rate": 7.597595192178702e-06, |
|
"loss": 0.0, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 1.685, |
|
"grad_norm": 0.009522222092425871, |
|
"learning_rate": 7.367991782295391e-06, |
|
"loss": 0.0005, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 1.69, |
|
"grad_norm": 0.0445881491075994, |
|
"learning_rate": 7.1416349648943894e-06, |
|
"loss": 0.0016, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 1.6949999999999998, |
|
"grad_norm": 0.000599146744339273, |
|
"learning_rate": 6.918541977923709e-06, |
|
"loss": 0.0001, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"grad_norm": 0.027249503153038576, |
|
"learning_rate": 6.698729810778065e-06, |
|
"loss": 0.0021, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.705, |
|
"grad_norm": 0.002016792070660535, |
|
"learning_rate": 6.482215203005015e-06, |
|
"loss": 0.0002, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"grad_norm": 0.007484120928358062, |
|
"learning_rate": 6.269014643030213e-06, |
|
"loss": 0.0003, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 1.7149999999999999, |
|
"grad_norm": 0.0005063050902788136, |
|
"learning_rate": 6.059144366901736e-06, |
|
"loss": 0.0, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 0.0009197549253660609, |
|
"learning_rate": 5.852620357053651e-06, |
|
"loss": 0.0001, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 1.725, |
|
"grad_norm": 0.003191114724532278, |
|
"learning_rate": 5.649458341088915e-06, |
|
"loss": 0.0005, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 0.0014447492382083482, |
|
"learning_rate": 5.449673790581611e-06, |
|
"loss": 0.0002, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 1.7349999999999999, |
|
"grad_norm": 0.004059252437141361, |
|
"learning_rate": 5.2532819198987506e-06, |
|
"loss": 0.0006, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"grad_norm": 0.02120955388832761, |
|
"learning_rate": 5.060297685041659e-06, |
|
"loss": 0.0007, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 1.745, |
|
"grad_norm": 0.0011651360195809543, |
|
"learning_rate": 4.87073578250698e-06, |
|
"loss": 0.0001, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 0.0010159633731216518, |
|
"learning_rate": 4.684610648167503e-06, |
|
"loss": 0.0001, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.755, |
|
"grad_norm": 0.04624949210450388, |
|
"learning_rate": 4.501936456172845e-06, |
|
"loss": 0.0023, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 0.013865761867621462, |
|
"learning_rate": 4.322727117869951e-06, |
|
"loss": 0.0008, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 1.7650000000000001, |
|
"grad_norm": 0.0006593659250274612, |
|
"learning_rate": 4.146996280743798e-06, |
|
"loss": 0.0001, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 0.0006699609878223177, |
|
"learning_rate": 3.974757327377981e-06, |
|
"loss": 0.0001, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 1.775, |
|
"grad_norm": 0.010073962298842973, |
|
"learning_rate": 3.8060233744356633e-06, |
|
"loss": 0.0014, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 1.78, |
|
"grad_norm": 0.00046017499160588973, |
|
"learning_rate": 3.6408072716606346e-06, |
|
"loss": 0.0001, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 1.7850000000000001, |
|
"grad_norm": 0.001412185181637751, |
|
"learning_rate": 3.479121600898777e-06, |
|
"loss": 0.0001, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"grad_norm": 0.0010948577791496526, |
|
"learning_rate": 3.3209786751399187e-06, |
|
"loss": 0.0001, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 1.795, |
|
"grad_norm": 0.0027495650503305998, |
|
"learning_rate": 3.1663905375801216e-06, |
|
"loss": 0.0003, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 0.00497037863878421, |
|
"learning_rate": 3.0153689607045845e-06, |
|
"loss": 0.0004, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.8050000000000002, |
|
"grad_norm": 0.04826775060358748, |
|
"learning_rate": 2.8679254453910785e-06, |
|
"loss": 0.0052, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"grad_norm": 0.0002190733399899643, |
|
"learning_rate": 2.724071220034158e-06, |
|
"loss": 0.0, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 1.815, |
|
"grad_norm": 0.03983428885724064, |
|
"learning_rate": 2.583817239690034e-06, |
|
"loss": 0.0108, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 1.8199999999999998, |
|
"grad_norm": 0.005353636861069711, |
|
"learning_rate": 2.4471741852423237e-06, |
|
"loss": 0.0004, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 1.825, |
|
"grad_norm": 0.007365720628728885, |
|
"learning_rate": 2.314152462588659e-06, |
|
"loss": 0.0009, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"grad_norm": 0.0020943736140563596, |
|
"learning_rate": 2.1847622018482283e-06, |
|
"loss": 0.0002, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 1.835, |
|
"grad_norm": 0.049181169958885725, |
|
"learning_rate": 2.0590132565903476e-06, |
|
"loss": 0.0066, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 1.8399999999999999, |
|
"grad_norm": 0.008505272014854959, |
|
"learning_rate": 1.9369152030840556e-06, |
|
"loss": 0.001, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 1.845, |
|
"grad_norm": 0.015026960520249318, |
|
"learning_rate": 1.8184773395688526e-06, |
|
"loss": 0.0027, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"grad_norm": 0.0012138520031968582, |
|
"learning_rate": 1.70370868554659e-06, |
|
"loss": 0.0002, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.855, |
|
"grad_norm": 0.00024125978466275796, |
|
"learning_rate": 1.5926179810946184e-06, |
|
"loss": 0.0001, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 1.8599999999999999, |
|
"grad_norm": 0.07024226815561596, |
|
"learning_rate": 1.4852136862001764e-06, |
|
"loss": 0.004, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 1.865, |
|
"grad_norm": 0.0021247803242464055, |
|
"learning_rate": 1.3815039801161721e-06, |
|
"loss": 0.0002, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"grad_norm": 0.04502769124604199, |
|
"learning_rate": 1.2814967607382432e-06, |
|
"loss": 0.0122, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 1.875, |
|
"grad_norm": 0.007452172380573964, |
|
"learning_rate": 1.1851996440033319e-06, |
|
"loss": 0.001, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 0.0015481238834281481, |
|
"learning_rate": 1.0926199633097157e-06, |
|
"loss": 0.0002, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 1.885, |
|
"grad_norm": 0.005530884948723974, |
|
"learning_rate": 1.0037647689585206e-06, |
|
"loss": 0.0009, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 1.8900000000000001, |
|
"grad_norm": 0.0067523371656823305, |
|
"learning_rate": 9.186408276168013e-07, |
|
"loss": 0.0005, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 1.895, |
|
"grad_norm": 0.003506175781494403, |
|
"learning_rate": 8.372546218022747e-07, |
|
"loss": 0.0002, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"grad_norm": 0.0019742849650606926, |
|
"learning_rate": 7.596123493895991e-07, |
|
"loss": 0.0003, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.905, |
|
"grad_norm": 0.0009738360342464184, |
|
"learning_rate": 6.857199231384282e-07, |
|
"loss": 0.0001, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 1.9100000000000001, |
|
"grad_norm": 0.0004408482841272174, |
|
"learning_rate": 6.15582970243117e-07, |
|
"loss": 0.0, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 1.915, |
|
"grad_norm": 0.010244955433424012, |
|
"learning_rate": 5.492068319041588e-07, |
|
"loss": 0.0013, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 0.0521719111078745, |
|
"learning_rate": 4.865965629214819e-07, |
|
"loss": 0.0071, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 1.925, |
|
"grad_norm": 0.0012864130857160657, |
|
"learning_rate": 4.277569313094809e-07, |
|
"loss": 0.0002, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 1.9300000000000002, |
|
"grad_norm": 0.2423177287343396, |
|
"learning_rate": 3.7269241793390085e-07, |
|
"loss": 0.028, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 1.935, |
|
"grad_norm": 0.010456316918486782, |
|
"learning_rate": 3.214072161706272e-07, |
|
"loss": 0.0007, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 0.0005706094269043214, |
|
"learning_rate": 2.7390523158633554e-07, |
|
"loss": 0.0001, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 1.9449999999999998, |
|
"grad_norm": 0.013608987991299646, |
|
"learning_rate": 2.3019008164105738e-07, |
|
"loss": 0.0026, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"grad_norm": 0.01259450581057638, |
|
"learning_rate": 1.9026509541272275e-07, |
|
"loss": 0.0006, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.955, |
|
"grad_norm": 0.006309175090935663, |
|
"learning_rate": 1.5413331334360182e-07, |
|
"loss": 0.0006, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 0.00027718336636984805, |
|
"learning_rate": 1.2179748700879012e-07, |
|
"loss": 0.0001, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 1.9649999999999999, |
|
"grad_norm": 0.01973445145502157, |
|
"learning_rate": 9.3260078906654e-08, |
|
"loss": 0.0058, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 1.97, |
|
"grad_norm": 0.0050223114988680664, |
|
"learning_rate": 6.852326227130834e-08, |
|
"loss": 0.0007, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 1.975, |
|
"grad_norm": 0.041307491240042314, |
|
"learning_rate": 4.7588920907110094e-08, |
|
"loss": 0.004, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 1.98, |
|
"grad_norm": 0.0019487309746442478, |
|
"learning_rate": 3.04586490452119e-08, |
|
"loss": 0.0001, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 1.9849999999999999, |
|
"grad_norm": 0.00051503727945184, |
|
"learning_rate": 1.7133751222137007e-08, |
|
"loss": 0.0001, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 1.99, |
|
"grad_norm": 0.02902500982586163, |
|
"learning_rate": 7.615242180436522e-09, |
|
"loss": 0.003, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 1.995, |
|
"grad_norm": 0.02095204026685112, |
|
"learning_rate": 1.903846791434516e-09, |
|
"loss": 0.0024, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.006401152408197018, |
|
"learning_rate": 0.0, |
|
"loss": 0.0004, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 0.0046582599170506, |
|
"eval_runtime": 1.5245, |
|
"eval_samples_per_second": 0.656, |
|
"eval_steps_per_second": 0.656, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"step": 400, |
|
"total_flos": 6612300644352.0, |
|
"train_loss": 0.0, |
|
"train_runtime": 1.3861, |
|
"train_samples_per_second": 577.159, |
|
"train_steps_per_second": 288.579 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 400, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 6612300644352.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|