{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 10.0,
  "eval_steps": 500,
  "global_step": 2560,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {"epoch": 0.00390625, "grad_norm": 1.813705325126648, "learning_rate": 7.8125e-07, "loss": 1.9071, "step": 1},
    {"epoch": 0.01953125, "grad_norm": 1.431990385055542, "learning_rate": 3.90625e-06, "loss": 1.8608, "step": 5},
    {"epoch": 0.0390625, "grad_norm": 1.281330943107605, "learning_rate": 7.8125e-06, "loss": 1.8263, "step": 10},
    {"epoch": 0.05859375, "grad_norm": 1.310953140258789, "learning_rate": 1.171875e-05, "loss": 1.8193, "step": 15},
    {"epoch": 0.078125, "grad_norm": 1.296993374824524, "learning_rate": 1.5625e-05, "loss": 1.7463, "step": 20},
    {"epoch": 0.09765625, "grad_norm": 1.1856365203857422, "learning_rate": 1.953125e-05, "loss": 1.6844, "step": 25},
    {"epoch": 0.1171875, "grad_norm": 3.376720905303955, "learning_rate": 2.34375e-05, "loss": 1.5861, "step": 30},
    {"epoch": 0.13671875, "grad_norm": 3.182882785797119, "learning_rate": 2.734375e-05, "loss": 1.4328, "step": 35},
    {"epoch": 0.15625, "grad_norm": 0.682467520236969, "learning_rate": 3.125e-05, "loss": 1.2702, "step": 40},
    {"epoch": 0.17578125, "grad_norm": 0.9865962266921997, "learning_rate": 3.5156250000000004e-05, "loss": 1.1671, "step": 45},
    {"epoch": 0.1953125, "grad_norm": 0.42747607827186584, "learning_rate": 3.90625e-05, "loss": 1.1303, "step": 50},
    {"epoch": 0.21484375, "grad_norm": 0.42581626772880554, "learning_rate": 4.2968750000000004e-05, "loss": 1.101, "step": 55},
    {"epoch": 0.234375, "grad_norm": 0.4914548099040985, "learning_rate": 4.6875e-05, "loss": 1.0586, "step": 60},
    {"epoch": 0.25390625, "grad_norm": 0.39272716641426086, "learning_rate": 5.0781250000000004e-05, "loss": 1.0308, "step": 65},
    {"epoch": 0.2734375, "grad_norm": 0.34394437074661255, "learning_rate": 5.46875e-05, "loss": 0.9998, "step": 70},
    {"epoch": 0.29296875, "grad_norm": 0.3009032607078552, "learning_rate": 5.8593750000000005e-05, "loss": 0.9784, "step": 75},
    {"epoch": 0.3125, "grad_norm": 0.27089548110961914, "learning_rate": 6.25e-05, "loss": 0.9653, "step": 80},
    {"epoch": 0.33203125, "grad_norm": 0.25717490911483765, "learning_rate": 6.640625e-05, "loss": 0.9434, "step": 85},
    {"epoch": 0.3515625, "grad_norm": 0.3018302917480469, "learning_rate": 7.031250000000001e-05, "loss": 0.9372, "step": 90},
    {"epoch": 0.37109375, "grad_norm": 0.2254215031862259, "learning_rate": 7.421875e-05, "loss": 0.9236, "step": 95},
    {"epoch": 0.390625, "grad_norm": 0.2384410947561264, "learning_rate": 7.8125e-05, "loss": 0.9145, "step": 100},
    {"epoch": 0.41015625, "grad_norm": 0.2905459403991699, "learning_rate": 8.203125e-05, "loss": 0.9177, "step": 105},
    {"epoch": 0.4296875, "grad_norm": 0.27646884322166443, "learning_rate": 8.593750000000001e-05, "loss": 0.9103, "step": 110},
    {"epoch": 0.44921875, "grad_norm": 0.23843346536159515, "learning_rate": 8.984375e-05, "loss": 0.8911, "step": 115},
    {"epoch": 0.46875, "grad_norm": 0.3110702931880951, "learning_rate": 9.375e-05, "loss": 0.8961, "step": 120},
    {"epoch": 0.48828125, "grad_norm": 0.2591000199317932, "learning_rate": 9.765625e-05, "loss": 0.8911, "step": 125},
    {"epoch": 0.5078125, "grad_norm": 0.2314710170030594, "learning_rate": 0.00010156250000000001, "loss": 0.8765, "step": 130},
    {"epoch": 0.52734375, "grad_norm": 0.268370658159256, "learning_rate": 0.00010546875, "loss": 0.8759, "step": 135},
    {"epoch": 0.546875, "grad_norm": 0.24689124524593353, "learning_rate": 0.000109375, "loss": 0.8714, "step": 140},
    {"epoch": 0.56640625, "grad_norm": 0.28693222999572754, "learning_rate": 0.00011328125, "loss": 0.882, "step": 145},
    {"epoch": 0.5859375, "grad_norm": 0.26165568828582764, "learning_rate": 0.00011718750000000001, "loss": 0.8638, "step": 150},
    {"epoch": 0.60546875, "grad_norm": 0.2968839406967163, "learning_rate": 0.00012109375, "loss": 0.8562, "step": 155},
    {"epoch": 0.625, "grad_norm": 0.2954418957233429, "learning_rate": 0.000125, "loss": 0.8569, "step": 160},
    {"epoch": 0.64453125, "grad_norm": 0.30811259150505066, "learning_rate": 0.00012890625, "loss": 0.8455, "step": 165},
    {"epoch": 0.6640625, "grad_norm": 0.2631295323371887, "learning_rate": 0.0001328125, "loss": 0.8574, "step": 170},
    {"epoch": 0.68359375, "grad_norm": 0.25627005100250244, "learning_rate": 0.00013671875, "loss": 0.851, "step": 175},
    {"epoch": 0.703125, "grad_norm": 0.28598853945732117, "learning_rate": 0.00014062500000000002, "loss": 0.8385, "step": 180},
    {"epoch": 0.72265625, "grad_norm": 0.2502932548522949, "learning_rate": 0.00014453125000000002, "loss": 0.8457, "step": 185},
    {"epoch": 0.7421875, "grad_norm": 0.3177507817745209, "learning_rate": 0.0001484375, "loss": 0.8319, "step": 190},
    {"epoch": 0.76171875, "grad_norm": 0.27309176325798035, "learning_rate": 0.00015234375, "loss": 0.8511, "step": 195},
    {"epoch": 0.78125, "grad_norm": 0.29295653104782104, "learning_rate": 0.00015625, "loss": 0.8373, "step": 200},
    {"epoch": 0.80078125, "grad_norm": 0.27028167247772217, "learning_rate": 0.00016015625, "loss": 0.8319, "step": 205},
    {"epoch": 0.8203125, "grad_norm": 0.40336114168167114, "learning_rate": 0.0001640625, "loss": 0.8245, "step": 210},
    {"epoch": 0.83984375, "grad_norm": 0.3044915795326233, "learning_rate": 0.00016796875000000001, "loss": 0.8283, "step": 215},
    {"epoch": 0.859375, "grad_norm": 0.29535970091819763, "learning_rate": 0.00017187500000000002, "loss": 0.8119, "step": 220},
    {"epoch": 0.87890625, "grad_norm": 0.28554800152778625, "learning_rate": 0.00017578125000000002, "loss": 0.8091, "step": 225},
    {"epoch": 0.8984375, "grad_norm": 0.26689431071281433, "learning_rate": 0.0001796875, "loss": 0.8189, "step": 230},
    {"epoch": 0.91796875, "grad_norm": 0.29758790135383606, "learning_rate": 0.00018359375, "loss": 0.8122, "step": 235},
    {"epoch": 0.9375, "grad_norm": 0.40431731939315796, "learning_rate": 0.0001875, "loss": 0.8155, "step": 240},
    {"epoch": 0.95703125, "grad_norm": 0.27242639660835266, "learning_rate": 0.00019140625, "loss": 0.8119, "step": 245},
    {"epoch": 0.9765625, "grad_norm": 0.3094847500324249, "learning_rate": 0.0001953125, "loss": 0.8058, "step": 250},
    {"epoch": 0.99609375, "grad_norm": 0.32299983501434326, "learning_rate": 0.00019921875000000001, "loss": 0.8026, "step": 255},
    {"epoch": 1.0, "eval_loss": 2.045611619949341, "eval_runtime": 0.5394, "eval_samples_per_second": 11.124, "eval_steps_per_second": 1.854, "step": 256},
    {"epoch": 1.015625, "grad_norm": 0.305078387260437, "learning_rate": 0.00019999851261394218, "loss": 0.7941, "step": 260},
    {"epoch": 1.03515625, "grad_norm": 0.2842113673686981, "learning_rate": 0.00019999247018391447, "loss": 0.798, "step": 265},
    {"epoch": 1.0546875, "grad_norm": 0.27524590492248535, "learning_rate": 0.0001999817800289289, "loss": 0.7911, "step": 270},
    {"epoch": 1.07421875, "grad_norm": 0.2549247145652771, "learning_rate": 0.00019996644264587193, "loss": 0.7963, "step": 275},
    {"epoch": 1.09375, "grad_norm": 0.253353089094162, "learning_rate": 0.00019994645874763658, "loss": 0.7904, "step": 280},
    {"epoch": 1.11328125, "grad_norm": 0.23945719003677368, "learning_rate": 0.00019992182926308942, "loss": 0.7921, "step": 285},
    {"epoch": 1.1328125, "grad_norm": 0.29668208956718445, "learning_rate": 0.00019989255533702736, "loss": 0.7943, "step": 290},
    {"epoch": 1.15234375, "grad_norm": 0.26419156789779663, "learning_rate": 0.0001998586383301244, "loss": 0.7819, "step": 295},
    {"epoch": 1.171875, "grad_norm": 0.3054077625274658, "learning_rate": 0.00019982007981886847, "loss": 0.7917, "step": 300},
    {"epoch": 1.19140625, "grad_norm": 0.27965638041496277, "learning_rate": 0.00019977688159548808, "loss": 0.7854, "step": 305},
    {"epoch": 1.2109375, "grad_norm": 0.23229017853736877, "learning_rate": 0.00019972904566786903, "loss": 0.7865, "step": 310},
    {"epoch": 1.23046875, "grad_norm": 0.2789019048213959, "learning_rate": 0.00019967657425946106, "loss": 0.7821, "step": 315},
    {"epoch": 1.25, "grad_norm": 0.24402114748954773, "learning_rate": 0.00019961946980917456, "loss": 0.7899, "step": 320},
    {"epoch": 1.26953125, "grad_norm": 0.2749808132648468, "learning_rate": 0.0001995577349712672, "loss": 0.7783, "step": 325},
    {"epoch": 1.2890625, "grad_norm": 0.2676057815551758, "learning_rate": 0.00019949137261522052, "loss": 0.7788, "step": 330},
    {"epoch": 1.30859375, "grad_norm": 0.24829885363578796, "learning_rate": 0.0001994203858256065, "loss": 0.7714, "step": 335},
    {"epoch": 1.328125, "grad_norm": 0.24872945249080658, "learning_rate": 0.00019934477790194445, "loss": 0.7832, "step": 340},
    {"epoch": 1.34765625, "grad_norm": 0.2914537489414215, "learning_rate": 0.00019926455235854724, "loss": 0.7791, "step": 345},
    {"epoch": 1.3671875, "grad_norm": 0.2692899703979492, "learning_rate": 0.00019917971292435826, "loss": 0.7739, "step": 350},
    {"epoch": 1.38671875, "grad_norm": 0.2605401873588562, "learning_rate": 0.000199090263542778, "loss": 0.7717, "step": 355},
    {"epoch": 1.40625, "grad_norm": 0.24468782544136047, "learning_rate": 0.00019899620837148077, "loss": 0.7694, "step": 360},
    {"epoch": 1.42578125, "grad_norm": 0.2542877197265625, "learning_rate": 0.00019889755178222147, "loss": 0.7653, "step": 365},
    {"epoch": 1.4453125, "grad_norm": 0.21375133097171783, "learning_rate": 0.00019879429836063226, "loss": 0.7854, "step": 370},
    {"epoch": 1.46484375, "grad_norm": 0.24711847305297852, "learning_rate": 0.00019868645290600955, "loss": 0.773, "step": 375},
    {"epoch": 1.484375, "grad_norm": 0.2352401316165924, "learning_rate": 0.0001985740204310909, "loss": 0.7641, "step": 380},
    {"epoch": 1.50390625, "grad_norm": 0.2681073844432831, "learning_rate": 0.00019845700616182206, "loss": 0.7755, "step": 385},
    {"epoch": 1.5234375, "grad_norm": 0.2394329458475113, "learning_rate": 0.00019833541553711395, "loss": 0.7635, "step": 390},
    {"epoch": 1.54296875, "grad_norm": 0.27736565470695496, "learning_rate": 0.00019820925420858991, "loss": 0.7744, "step": 395},
    {"epoch": 1.5625, "grad_norm": 0.2736864984035492, "learning_rate": 0.00019807852804032305, "loss": 0.7564, "step": 400},
    {"epoch": 1.58203125, "grad_norm": 0.22882600128650665, "learning_rate": 0.00019794324310856367, "loss": 0.7703, "step": 405},
    {"epoch": 1.6015625, "grad_norm": 0.2372276782989502, "learning_rate": 0.0001978034057014568, "loss": 0.7642, "step": 410},
    {"epoch": 1.62109375, "grad_norm": 0.23550736904144287, "learning_rate": 0.00019765902231874992, "loss": 0.7513, "step": 415},
    {"epoch": 1.640625, "grad_norm": 0.23483717441558838, "learning_rate": 0.00019751009967149087, "loss": 0.7485, "step": 420},
    {"epoch": 1.66015625, "grad_norm": 0.23124265670776367, "learning_rate": 0.00019735664468171587, "loss": 0.7712, "step": 425},
    {"epoch": 1.6796875, "grad_norm": 0.25672388076782227, "learning_rate": 0.00019719866448212795, "loss": 0.7635, "step": 430},
    {"epoch": 1.69921875, "grad_norm": 0.2655965983867645, "learning_rate": 0.00019703616641576514, "loss": 0.7614, "step": 435},
    {"epoch": 1.71875, "grad_norm": 0.22875700891017914, "learning_rate": 0.00019686915803565934, "loss": 0.7597, "step": 440},
    {"epoch": 1.73828125, "grad_norm": 0.24324467778205872, "learning_rate": 0.00019669764710448522, "loss": 0.7592, "step": 445},
    {"epoch": 1.7578125, "grad_norm": 0.23085905611515045, "learning_rate": 0.00019652164159419946, "loss": 0.7582, "step": 450},
    {"epoch": 1.77734375, "grad_norm": 0.24821893870830536, "learning_rate": 0.00019634114968567005, "loss": 0.7565, "step": 455},
    {"epoch": 1.796875, "grad_norm": 0.24690982699394226, "learning_rate": 0.0001961561797682962, "loss": 0.75, "step": 460},
    {"epoch": 1.81640625, "grad_norm": 0.21277934312820435, "learning_rate": 0.00019596674043961828, "loss": 0.7499, "step": 465},
    {"epoch": 1.8359375, "grad_norm": 0.2045515477657318, "learning_rate": 0.0001957728405049183, "loss": 0.7476, "step": 470},
    {"epoch": 1.85546875, "grad_norm": 0.22809946537017822, "learning_rate": 0.00019557448897681057, "loss": 0.7554, "step": 475},
    {"epoch": 1.875, "grad_norm": 0.2747824788093567, "learning_rate": 0.0001953716950748227, "loss": 0.7481, "step": 480},
    {"epoch": 1.89453125, "grad_norm": 0.23395125567913055, "learning_rate": 0.00019516446822496732, "loss": 0.7579, "step": 485},
    {"epoch": 1.9140625, "grad_norm": 0.2263769805431366, "learning_rate": 0.00019495281805930367, "loss": 0.7493, "step": 490},
    {"epoch": 1.93359375, "grad_norm": 0.23396165668964386, "learning_rate": 0.00019473675441549013, "loss": 0.7523, "step": 495},
    {"epoch": 1.953125, "grad_norm": 0.23420800268650055, "learning_rate": 0.0001945162873363268, "loss": 0.7469, "step": 500},
    {"epoch": 1.97265625, "grad_norm": 0.19923944771289825, "learning_rate": 0.00019429142706928868, "loss": 0.7535, "step": 505},
    {"epoch": 1.9921875, "grad_norm": 0.2181696891784668, "learning_rate": 0.00019406218406604965, "loss": 0.7532, "step": 510},
    {"epoch": 2.0, "eval_loss": 2.031317949295044, "eval_runtime": 0.5375, "eval_samples_per_second": 11.164, "eval_steps_per_second": 1.861, "step": 512},
    {"epoch": 2.01171875, "grad_norm": 0.2611521780490875, "learning_rate": 0.0001938285689819962, "loss": 0.7349, "step": 515},
    {"epoch": 2.03125, "grad_norm": 0.22077465057373047, "learning_rate": 0.0001935905926757326, "loss": 0.7309, "step": 520},
    {"epoch": 2.05078125, "grad_norm": 0.2502357065677643, "learning_rate": 0.00019334826620857583, "loss": 0.7402, "step": 525},
    {"epoch": 2.0703125, "grad_norm": 0.21151328086853027, "learning_rate": 0.00019310160084404186, "loss": 0.7263, "step": 530},
    {"epoch": 2.08984375, "grad_norm": 0.22730891406536102, "learning_rate": 0.00019285060804732158, "loss": 0.7393, "step": 535},
    {"epoch": 2.109375, "grad_norm": 0.29608404636383057, "learning_rate": 0.00019259529948474833, "loss": 0.7359, "step": 540},
    {"epoch": 2.12890625, "grad_norm": 0.2048954963684082, "learning_rate": 0.00019233568702325547, "loss": 0.7327, "step": 545},
    {"epoch": 2.1484375, "grad_norm": 0.24332541227340698, "learning_rate": 0.0001920717827298248, "loss": 0.723, "step": 550},
    {"epoch": 2.16796875, "grad_norm": 0.27370956540107727, "learning_rate": 0.0001918035988709256, "loss": 0.7346, "step": 555},
    {"epoch": 2.1875, "grad_norm": 0.27345338463783264, "learning_rate": 0.00019153114791194473, "loss": 0.7216, "step": 560},
    {"epoch": 2.20703125, "grad_norm": 0.21915854513645172, "learning_rate": 0.0001912544425166069, "loss": 0.7297, "step": 565},
    {"epoch": 2.2265625, "grad_norm": 0.23517705500125885, "learning_rate": 0.0001909734955463863, "loss": 0.7277, "step": 570},
    {"epoch": 2.24609375, "grad_norm": 0.2082410454750061, "learning_rate": 0.00019068832005990867, "loss": 0.7274, "step": 575},
    {"epoch": 2.265625, "grad_norm": 0.25212010741233826, "learning_rate": 0.00019039892931234435, "loss": 0.7388, "step": 580},
    {"epoch": 2.28515625, "grad_norm": 0.22077186405658722, "learning_rate": 0.0001901053367547922, "loss": 0.7356, "step": 585},
    {"epoch": 2.3046875, "grad_norm": 0.24918216466903687, "learning_rate": 0.0001898075560336543, "loss": 0.7283, "step": 590},
    {"epoch": 2.32421875, "grad_norm": 0.2168445587158203, "learning_rate": 0.00018950560099000182, "loss": 0.7276, "step": 595},
    {"epoch": 2.34375, "grad_norm": 0.3361542522907257, "learning_rate": 0.00018919948565893142, "loss": 0.7394, "step": 600},
    {"epoch": 2.36328125, "grad_norm": 0.30473312735557556, "learning_rate": 0.0001888892242689132, "loss": 0.7214, "step": 605},
    {"epoch": 2.3828125, "grad_norm": 0.22810065746307373, "learning_rate": 0.00018857483124112907, "loss": 0.7389, "step": 610},
    {"epoch": 2.40234375, "grad_norm": 0.22486305236816406, "learning_rate": 0.00018825632118880259, "loss": 0.7382, "step": 615},
    {"epoch": 2.421875, "grad_norm": 0.23797857761383057, "learning_rate": 0.00018793370891651972, "loss": 0.7352, "step": 620},
    {"epoch": 2.44140625, "grad_norm": 0.22012600302696228, "learning_rate": 0.00018760700941954065, "loss": 0.7323, "step": 625},
    {"epoch": 2.4609375, "grad_norm": 0.2505754232406616, "learning_rate": 0.00018727623788310292, "loss": 0.7319, "step": 630},
    {"epoch": 2.48046875, "grad_norm": 0.23932820558547974, "learning_rate": 0.0001869414096817154, "loss": 0.7166, "step": 635},
    {"epoch": 2.5, "grad_norm": 0.22623002529144287, "learning_rate": 0.00018660254037844388, "loss": 0.7254, "step": 640},
    {"epoch": 2.51953125, "grad_norm": 0.24143099784851074, "learning_rate": 0.0001862596457241875, "loss": 0.7374, "step": 645},
    {"epoch": 2.5390625, "grad_norm": 0.25545206665992737, "learning_rate": 0.00018591274165694687, "loss": 0.7268, "step": 650},
    {"epoch": 2.55859375, "grad_norm": 0.27690452337265015, "learning_rate": 0.00018556184430108293, "loss": 0.7318, "step": 655},
    {"epoch": 2.578125, "grad_norm": 0.21064211428165436, "learning_rate": 0.00018520696996656788, "loss": 0.7365, "step": 660},
    {"epoch": 2.59765625, "grad_norm": 0.2418980747461319, "learning_rate": 0.0001848481351482267, "loss": 0.7252, "step": 665},
    {"epoch": 2.6171875, "grad_norm": 0.21725673973560333, "learning_rate": 0.00018448535652497073, "loss": 0.7438, "step": 670},
    {"epoch": 2.63671875, "grad_norm": 0.2051118165254593, "learning_rate": 0.00018411865095902224, "loss": 0.7272, "step": 675},
    {"epoch": 2.65625, "grad_norm": 0.20715655386447906, "learning_rate": 0.0001837480354951308, "loss": 0.7189, "step": 680},
    {"epoch": 2.67578125, "grad_norm": 0.224945530295372, "learning_rate": 0.00018337352735978095, "loss": 0.7283, "step": 685},
    {"epoch": 2.6953125, "grad_norm": 0.2353772222995758, "learning_rate": 0.0001829951439603915, "loss": 0.7172, "step": 690},
    {"epoch": 2.71484375, "grad_norm": 0.21377775073051453, "learning_rate": 0.00018261290288450646, "loss": 0.7245, "step": 695},
    {"epoch": 2.734375, "grad_norm": 0.20290276408195496, "learning_rate": 0.00018222682189897752, "loss": 0.732, "step": 700},
    {"epoch": 2.75390625, "grad_norm": 0.21785806119441986, "learning_rate": 0.00018183691894913825, "loss": 0.7142, "step": 705},
    {"epoch": 2.7734375, "grad_norm": 0.21216203272342682, "learning_rate": 0.00018144321215797, "loss": 0.7163, "step": 710},
    {"epoch": 2.79296875, "grad_norm": 0.20187579095363617, "learning_rate": 0.0001810457198252595, "loss": 0.7196, "step": 715},
    {"epoch": 2.8125, "grad_norm": 0.21112394332885742, "learning_rate": 0.00018064446042674828, "loss": 0.7255, "step": 720},
    {"epoch": 2.83203125, "grad_norm": 0.21814604103565216, "learning_rate": 0.00018023945261327393, "loss": 0.7244, "step": 725},
    {"epoch": 2.8515625, "grad_norm": 0.2388346940279007, "learning_rate": 0.00017983071520990315, "loss": 0.719, "step": 730},
    {"epoch": 2.87109375, "grad_norm": 0.2274855226278305, "learning_rate": 0.00017941826721505684, "loss": 0.7092, "step": 735},
    {"epoch": 2.890625, "grad_norm": 0.2171526700258255, "learning_rate": 0.0001790021277996269, "loss": 0.7177, "step": 740},
    {"epoch": 2.91015625, "grad_norm": 0.2128465622663498, "learning_rate": 0.00017858231630608527, "loss": 0.7245, "step": 745},
    {"epoch": 2.9296875, "grad_norm": 0.2257278561592102, "learning_rate": 0.0001781588522475848, "loss": 0.7172, "step": 750},
    {"epoch": 2.94921875, "grad_norm": 0.21227267384529114, "learning_rate": 0.00017773175530705232, "loss": 0.7208, "step": 755},
    {"epoch": 2.96875, "grad_norm": 0.23267419636249542, "learning_rate": 0.0001773010453362737, "loss": 0.7188, "step": 760},
    {"epoch": 2.98828125, "grad_norm": 0.21279846131801605, "learning_rate": 0.00017686674235497125, "loss": 0.7198, "step": 765},
    {"epoch": 3.0, "eval_loss": 2.0403969287872314, "eval_runtime": 0.5399, "eval_samples_per_second": 11.113, "eval_steps_per_second": 1.852, "step": 768},
    {"epoch": 3.0078125, "grad_norm": 0.20591868460178375, "learning_rate": 0.000176428866549873, "loss": 0.7092, "step": 770},
    {"epoch": 3.02734375, "grad_norm": 0.21006809175014496, "learning_rate": 0.0001759874382737746, "loss": 0.6982, "step": 775},
    {"epoch": 3.046875, "grad_norm": 0.20914091169834137, "learning_rate": 0.00017554247804459316, "loss": 0.6986, "step": 780},
    {"epoch": 3.06640625, "grad_norm": 0.21207676827907562, "learning_rate": 0.0001750940065444136, "loss": 0.7024, "step": 785},
    {"epoch": 3.0859375, "grad_norm": 0.24130572378635406, "learning_rate": 0.00017464204461852738, "loss": 0.7011, "step": 790},
    {"epoch": 3.10546875, "grad_norm": 0.22464986145496368, "learning_rate": 0.0001741866132744636, "loss": 0.6998, "step": 795},
    {"epoch": 3.125, "grad_norm": 0.20956657826900482, "learning_rate": 0.0001737277336810124, "loss": 0.7068, "step": 800},
    {"epoch": 3.14453125, "grad_norm": 0.21382799744606018, "learning_rate": 0.00017326542716724128, "loss": 0.6997, "step": 805},
    {"epoch": 3.1640625, "grad_norm": 0.2018394023180008, "learning_rate": 0.00017279971522150348, "loss": 0.7057, "step": 810},
    {"epoch": 3.18359375, "grad_norm": 0.20716731250286102, "learning_rate": 0.00017233061949043928, "loss": 0.6957, "step": 815},
    {"epoch": 3.203125, "grad_norm": 0.21063964068889618, "learning_rate": 0.0001718581617779698, "loss": 0.6989, "step": 820},
    {"epoch": 3.22265625, "grad_norm": 0.21001911163330078, "learning_rate": 0.0001713823640442837, "loss": 0.7065, "step": 825},
    {"epoch": 3.2421875, "grad_norm": 0.21537743508815765, "learning_rate": 0.0001709032484048162, "loss": 0.7001, "step": 830},
    {"epoch": 3.26171875, "grad_norm": 0.21781504154205322, "learning_rate": 0.00017042083712922131, "loss": 0.7076, "step": 835},
    {"epoch": 3.28125, "grad_norm": 0.21302708983421326, "learning_rate": 0.00016993515264033672, "loss": 0.6965, "step": 840},
    {"epoch": 3.30078125, "grad_norm": 0.2185572385787964, "learning_rate": 0.00016944621751314144, "loss": 0.7046, "step": 845},
    {"epoch": 3.3203125, "grad_norm": 0.21651025116443634, "learning_rate": 0.0001689540544737067, "loss": 0.7042, "step": 850},
    {"epoch": 3.33984375, "grad_norm": 0.22459545731544495, "learning_rate": 0.0001684586863981394, "loss": 0.7133, "step": 855},
    {"epoch": 3.359375, "grad_norm": 0.21320843696594238, "learning_rate": 0.00016796013631151897, "loss": 0.7106, "step": 860},
    {"epoch": 3.37890625, "grad_norm": 0.22854122519493103, "learning_rate": 0.00016745842738682712, "loss": 0.6987, "step": 865},
    {"epoch": 3.3984375, "grad_norm": 0.22366014122962952, "learning_rate": 0.00016695358294387065, "loss": 0.7078, "step": 870},
    {"epoch": 3.41796875, "grad_norm": 0.21049249172210693, "learning_rate": 0.00016644562644819771, "loss": 0.6926, "step": 875},
    {"epoch": 3.4375, "grad_norm": 0.216139018535614, "learning_rate": 0.00016593458151000688, "loss": 0.7073, "step": 880},
    {"epoch": 3.45703125, "grad_norm": 0.22321297228336334, "learning_rate": 0.00016542047188304997, "loss": 0.7063, "step": 885},
    {"epoch": 3.4765625, "grad_norm": 0.21834047138690948, "learning_rate": 0.0001649033214635277, "loss": 0.7007, "step": 890},
    {"epoch": 3.49609375, "grad_norm": 0.2148895114660263, "learning_rate": 0.00016438315428897915, "loss": 0.709, "step": 895},
    {"epoch": 3.515625, "grad_norm": 0.2145809829235077, "learning_rate": 0.00016385999453716454, "loss": 0.7073, "step": 900},
    {"epoch": 3.53515625, "grad_norm": 0.21147432923316956, "learning_rate": 0.00016333386652494117, "loss": 0.6915, "step": 905},
    {"epoch": 3.5546875, "grad_norm": 0.21884699165821075, "learning_rate": 0.00016280479470713344, "loss": 0.7026, "step": 910},
    {"epoch": 3.57421875, "grad_norm": 0.20934432744979858, "learning_rate": 0.0001622728036753959, "loss": 0.6908, "step": 915},
    {"epoch": 3.59375, "grad_norm": 0.20113444328308105, "learning_rate": 0.00016173791815707051, "loss": 0.7101, "step": 920},
    {"epoch": 3.61328125, "grad_norm": 0.2057623565196991, "learning_rate": 0.000161200163014037, "loss": 0.7179, "step": 925},
    {"epoch": 3.6328125, "grad_norm": 0.21178101003170013, "learning_rate": 0.00016065956324155746, "loss": 0.7015, "step": 930},
    {"epoch": 3.65234375, "grad_norm": 0.21164196729660034, "learning_rate": 0.0001601161439671145, "loss": 0.6955, "step": 935},
    {"epoch": 3.671875, "grad_norm": 0.21989427506923676, "learning_rate": 0.00015956993044924334, "loss": 0.6972, "step": 940},
    {"epoch": 3.69140625, "grad_norm": 0.20968452095985413, "learning_rate": 0.0001590209480763576, "loss": 0.6986, "step": 945},
    {"epoch": 3.7109375, "grad_norm": 0.20064401626586914, "learning_rate": 0.00015846922236556946, "loss": 0.7073, "step": 950},
    {"epoch": 3.73046875, "grad_norm": 0.2390391230583191, "learning_rate": 0.00015791477896150347, "loss": 0.6958, "step": 955},
    {"epoch": 3.75, "grad_norm": 0.21184207499027252, "learning_rate": 0.0001573576436351046, "loss": 0.7008, "step": 960},
    {"epoch": 3.76953125, "grad_norm": 0.21932272613048553, "learning_rate": 0.00015679784228244043, "loss": 0.6904, "step": 965},
    {"epoch": 3.7890625, "grad_norm": 0.19908711314201355, "learning_rate": 0.00015623540092349732, "loss": 0.6991, "step": 970},
    {"epoch": 3.80859375, "grad_norm": 0.22039274871349335, "learning_rate": 0.00015567034570097125, "loss": 0.6959, "step": 975},
    {"epoch": 3.828125, "grad_norm": 0.21224038302898407, "learning_rate": 0.0001551027028790524, "loss": 0.6976, "step": 980},
    {"epoch": 3.84765625, "grad_norm": 0.21021129190921783, "learning_rate": 0.00015453249884220464, "loss": 0.6976, "step": 985},
    {"epoch": 3.8671875, "grad_norm": 0.2202974110841751, "learning_rate": 0.00015395976009393894, "loss": 0.6995, "step": 990},
    {"epoch": 3.88671875, "grad_norm": 0.21578259766101837, "learning_rate": 0.0001533845132555816, "loss": 0.6882, "step": 995},
    {"epoch": 3.90625, "grad_norm": 0.1979641318321228, "learning_rate": 0.0001528067850650368, "loss": 0.6961, "step": 1000},
    {"epoch": 3.92578125, "grad_norm": 0.20889665186405182, "learning_rate": 0.00015222660237554383, "loss": 0.7, "step": 1005},
    {"epoch": 3.9453125, "grad_norm": 0.20623871684074402, "learning_rate": 0.00015164399215442898, "loss": 0.6985, "step": 1010},
    {"epoch": 3.96484375, "grad_norm": 0.2109537273645401, "learning_rate": 0.00015105898148185193, "loss": 0.7026, "step": 1015},
    {"epoch": 3.984375, "grad_norm": 0.20740477740764618, "learning_rate": 0.0001504715975495472, "loss": 0.7053, "step": 1020},
    {"epoch": 4.0, "eval_loss": 2.0418636798858643, "eval_runtime": 0.5376, "eval_samples_per_second": 11.162, "eval_steps_per_second": 1.86, "step": 1024},
    {"epoch": 4.00390625, "grad_norm": 0.2116871029138565, "learning_rate": 0.00014988186765956029, "loss": 0.6923, "step": 1025},
    {"epoch": 4.0234375, "grad_norm": 0.20054052770137787, "learning_rate": 0.00014928981922297842, "loss": 0.6717, "step": 1030},
    {"epoch": 4.04296875, "grad_norm": 0.2238766998052597, "learning_rate": 0.00014869547975865664, "loss": 0.6719, "step": 1035},
    {"epoch": 4.0625, "grad_norm": 0.2156434804201126, "learning_rate": 0.00014809887689193877, "loss": 0.6718, "step": 1040},
    {"epoch": 4.08203125, "grad_norm": 0.2189694195985794, "learning_rate": 0.00014750003835337316, "loss": 0.677, "step": 1045},
    {"epoch": 4.1015625, "grad_norm": 0.2283412218093872, "learning_rate": 0.0001468989919774239, "loss": 0.6724, "step": 1050},
    {"epoch": 4.12109375, "grad_norm": 0.2534675598144531, "learning_rate": 0.00014629576570117709, "loss": 0.6842, "step": 1055},
    {"epoch": 4.140625, "grad_norm": 0.24277372658252716, "learning_rate": 0.00014569038756304207, "loss": 0.676, "step": 1060},
    {"epoch": 4.16015625, "grad_norm": 0.2335975170135498, "learning_rate": 0.0001450828857014485, "loss": 0.6861, "step": 1065},
    {"epoch": 4.1796875, "grad_norm": 0.22338411211967468, "learning_rate": 0.0001444732883535382, "loss": 0.6784, "step": 1070},
    {"epoch": 4.19921875, "grad_norm": 0.22138862311840057, "learning_rate": 0.00014386162385385278, "loss": 0.6765, "step": 1075},
    {"epoch": 4.21875, "grad_norm": 0.20274129509925842, "learning_rate": 0.00014324792063301662, "loss": 0.6762, "step": 1080},
    {"epoch": 4.23828125, "grad_norm": 0.20809794962406158, "learning_rate": 0.00014263220721641543, "loss": 0.6954, "step": 1085},
    {"epoch": 4.2578125, "grad_norm": 0.21727928519248962, "learning_rate": 0.00014201451222287025, "loss": 0.682, "step": 1090},
    {"epoch": 4.27734375, "grad_norm": 0.21408621966838837, "learning_rate": 0.00014139486436330736, "loss": 0.6817, "step": 1095},
    {"epoch": 4.296875, "grad_norm": 0.2173791378736496, "learning_rate": 0.00014077329243942369, "loss": 0.6775, "step": 1100},
    {"epoch": 4.31640625, "grad_norm": 0.21154190599918365, "learning_rate": 0.0001401498253423481, "loss": 0.6793, "step": 1105},
    {"epoch": 4.3359375, "grad_norm": 0.2106465995311737, "learning_rate": 0.00013952449205129855, "loss": 0.6736, "step": 1110},
    {"epoch": 4.35546875, "grad_norm": 0.20029598474502563, "learning_rate": 0.00013889732163223516, "loss": 0.6759, "step": 1115},
    {"epoch": 4.375, "grad_norm": 0.21185144782066345, "learning_rate": 0.000138268343236509, "loss": 0.6777, "step": 1120},
    {"epoch": 4.39453125, "grad_norm": 0.2037803679704666, "learning_rate": 0.0001376375860995073, "loss": 0.6818, "step": 1125},
    {"epoch": 4.4140625, "grad_norm": 0.21110603213310242, "learning_rate": 0.00013700507953929463, "loss": 0.675, "step": 1130},
    {"epoch": 4.43359375, "grad_norm": 0.2060796022415161, "learning_rate": 0.00013637085295524988, "loss": 0.679, "step": 1135},
    {"epoch": 4.453125, "grad_norm": 0.2184733897447586, "learning_rate": 0.00013573493582670003, "loss": 0.6859, "step": 1140},
    {"epoch": 4.47265625, "grad_norm": 0.21656639873981476, "learning_rate": 0.00013509735771154987, "loss": 0.685, "step": 1145},
    {"epoch": 4.4921875, "grad_norm": 0.219607412815094, "learning_rate": 0.00013445814824490805, "loss": 0.6814, "step": 1150},
    {"epoch": 4.51171875, "grad_norm": 0.2204212099313736, "learning_rate": 0.00013381733713770967, "loss": 0.6845, "step": 1155},
    {"epoch": 4.53125, "grad_norm": 0.2118123322725296, "learning_rate": 0.00013317495417533524, "loss": 0.6751, "step": 1160},
    {"epoch": 4.55078125, "grad_norm": 0.2175564020872116, "learning_rate": 0.0001325310292162263, "loss": 0.6813, "step": 1165},
    {"epoch": 4.5703125, "grad_norm": 0.2186279296875, "learning_rate": 0.0001318855921904976, "loss": 0.6869, "step": 1170},
    {"epoch": 4.58984375, "grad_norm": 0.21257956326007843, "learning_rate": 0.0001312386730985459, "loss": 0.6834, "step": 1175},
    {"epoch": 4.609375, "grad_norm": 0.20661357045173645, "learning_rate": 0.00013059030200965536, "loss": 0.7001, "step": 1180},
    {"epoch": 4.62890625, "grad_norm": 0.22517681121826172, "learning_rate": 0.00012994050906060017, "loss": 0.6717, "step": 1185},
    {"epoch": 4.6484375, "grad_norm": 0.22090637683868408, "learning_rate": 0.00012928932445424365, "loss": 0.678, "step": 1190},
    {"epoch": 4.66796875, "grad_norm": 0.21545428037643433, "learning_rate": 0.00012863677845813433, "loss": 0.6819, "step": 1195},
    {"epoch": 4.6875, "grad_norm": 0.209136962890625, "learning_rate": 0.00012798290140309923, "loss": 0.6862, "step": 1200},
    {"epoch": 4.70703125, "grad_norm": 0.20853549242019653, "learning_rate": 0.00012732772368183388, "loss": 0.6719, "step": 1205},
    {"epoch": 4.7265625, "grad_norm": 0.2124202698469162, "learning_rate": 0.00012667127574748986, "loss": 0.6819, "step": 1210},
    {"epoch": 4.74609375, "grad_norm": 0.2243855744600296, "learning_rate": 0.00012601358811225913, "loss": 0.6743, "step": 1215},
    {"epoch": 4.765625, "grad_norm": 0.21978437900543213, "learning_rate": 0.00012535469134595595, "loss": 0.6924, "step": 1220},
    {"epoch": 4.78515625, "grad_norm": 0.20108923316001892, "learning_rate": 0.00012469461607459583, "loss": 0.6836, "step": 1225},
    {"epoch": 4.8046875, "grad_norm": 0.21921634674072266, "learning_rate": 0.0001240333929789721, "loss": 0.6764, "step": 1230},
    {"epoch": 4.82421875, "grad_norm": 0.21365371346473694, "learning_rate": 0.00012337105279322988, "loss": 0.6843, "step": 1235},
    {"epoch": 4.84375, "grad_norm": 0.20987005531787872, "learning_rate": 0.00012270762630343734, "loss": 0.6746, "step": 1240},
    {"epoch": 4.86328125, "grad_norm": 0.20794980227947235, "learning_rate": 0.00012204314434615501, "loss": 0.6815, "step": 1245},
    {"epoch": 4.8828125, "grad_norm": 0.21553441882133484, "learning_rate": 0.00012137763780700227, "loss": 0.6795, "step": 1250},
    {"epoch": 4.90234375, "grad_norm": 0.2035866528749466, "learning_rate": 0.00012071113761922186, "loss": 0.6828, "step": 1255},
    {"epoch": 4.921875, "grad_norm": 0.2061247080564499, "learning_rate": 0.00012004367476224206, "loss": 0.6838, "step": 1260},
    {"epoch": 4.94140625, "grad_norm": 0.21384355425834656, "learning_rate": 0.0001193752802602367, "loss": 0.6902, "step": 1265},
    {"epoch": 4.9609375, "grad_norm": 0.21918757259845734, "learning_rate": 0.0001187059851806832, "loss": 0.6853, "step": 1270},
    {"epoch": 4.98046875, "grad_norm": 0.20853689312934875, "learning_rate": 0.00011803582063291849, "loss": 0.6693, "step": 1275},
    {"epoch": 5.0, "grad_norm": 0.2089415341615677, "learning_rate": 0.00011736481776669306, "loss": 0.6831, "step": 1280},
    {"epoch": 5.0, "eval_loss": 2.05405592918396, "eval_runtime": 0.5395, "eval_samples_per_second": 11.122, "eval_steps_per_second": 1.854, "step": 1280},
    {"epoch": 5.01953125, "grad_norm": 0.21040305495262146, "learning_rate": 0.00011669300777072298, "loss": 0.6597, "step": 1285},
    {"epoch": 5.0390625, "grad_norm": 0.2179408222436905, "learning_rate": 0.00011602042187124045, "loss": 0.6675, "step": 1290},
    {"epoch": 5.05859375, "grad_norm": 0.20846475660800934, "learning_rate": 0.0001153470913305421, "loss": 0.6643, "step": 1295},
    {"epoch": 5.078125, "grad_norm": 0.2074786126613617, "learning_rate": 0.00011467304744553618, "loss": 0.6656, "step": 1300},
    {"epoch": 5.09765625, "grad_norm": 0.2094477117061615, "learning_rate": 0.00011399832154628767, "loss": 0.6544, "step": 1305},
    {"epoch": 5.1171875, "grad_norm": 0.21982310712337494, "learning_rate": 0.000113322944994562, "loss": 0.6549, "step": 1310},
    {"epoch": 5.13671875, "grad_norm": 0.23372633755207062, "learning_rate": 0.00011264694918236753, "loss": 0.6567, "step": 1315},
    {"epoch": 5.15625, "grad_norm": 0.21253670752048492, "learning_rate": 0.00011197036553049625, "loss": 0.657, "step": 1320},
    {"epoch": 5.17578125, "grad_norm": 0.21819843351840973, "learning_rate": 0.00011129322548706342, "loss": 0.6624, "step": 1325},
    {"epoch": 5.1953125, "grad_norm": 0.22048228979110718, "learning_rate": 0.00011061556052604578, "loss": 0.6617, "step": 1330},
    {"epoch": 5.21484375, "grad_norm": 0.21444514393806458, "learning_rate": 0.00010993740214581856, "loss": 0.6714, "step": 1335},
    {"epoch": 5.234375, "grad_norm": 0.20963872969150543, "learning_rate": 0.00010925878186769158, "loss": 0.6554, "step": 1340},
    {"epoch": 5.25390625, "grad_norm": 0.21605953574180603, "learning_rate": 0.000108579731234444, "loss": 0.6625, "step": 1345},
    {"epoch": 5.2734375, "grad_norm": 0.2186332494020462, "learning_rate": 0.00010790028180885821, "loss": 0.659, "step": 1350},
    {"epoch": 5.29296875, "grad_norm": 0.20879332721233368, "learning_rate": 0.00010722046517225271, "loss": 0.6574, "step": 1355},
    {"epoch": 5.3125, "grad_norm": 0.20964272320270538, "learning_rate": 0.00010654031292301432, "loss": 0.6495, "step": 1360},
    {"epoch": 5.33203125, "grad_norm": 0.22066867351531982, "learning_rate": 0.00010585985667512934, "loss": 0.6657, "step": 1365},
    {"epoch": 5.3515625, "grad_norm": 0.21919472515583038, "learning_rate": 0.00010517912805671419, "loss": 0.6663, "step": 1370},
    {"epoch": 5.37109375, "grad_norm": 0.20911991596221924, "learning_rate": 0.00010449815870854525, "loss": 0.6655, "step": 1375},
    {"epoch": 5.390625, "grad_norm": 0.21343956887722015, "learning_rate": 0.00010381698028258817, "loss": 0.6538, "step": 1380},
    {"epoch": 5.41015625, "grad_norm": 0.23448581993579865, "learning_rate": 0.00010313562444052677, "loss": 0.6745, "step": 1385},
    {"epoch": 5.4296875, "grad_norm": 0.2224402278661728, "learning_rate": 0.00010245412285229124, "loss": 0.6659, "step": 1390},
    {"epoch": 5.44921875, "grad_norm": 0.21760495007038116, "learning_rate": 0.0001017725071945862, "loss": 0.6574, "step": 1395},
    {"epoch": 5.46875, "grad_norm": 0.21981921792030334, "learning_rate": 0.00010109080914941824, "loss": 0.6639, "step": 1400},
    {"epoch": 5.48828125, "grad_norm": 0.22708064317703247, "learning_rate": 0.00010040906040262348, "loss": 0.6601, "step": 1405},
    {"epoch": 5.5078125, "grad_norm": 0.21901877224445343, "learning_rate": 9.972729264239461e-05, "loss": 0.6708, "step": 1410},
    {"epoch": 5.52734375, "grad_norm": 0.21920931339263916, "learning_rate": 9.904553755780815e-05, "loss": 0.6588, "step": 1415},
    {"epoch": 5.546875, "grad_norm": 0.2086167186498642, "learning_rate": 9.836382683735132e-05, "loss": 0.6689, "step": 1420},
    {"epoch": 5.56640625, "grad_norm": 0.2135404795408249, "learning_rate": 9.768219216744942e-05, "loss": 0.6709, "step": 1425},
    {"epoch": 5.5859375, "grad_norm": 0.2296486496925354, "learning_rate": 9.700066523099273e-05, "loss": 0.6768, "step": 1430},
    {"epoch": 5.60546875, "grad_norm": 0.22231514751911163, "learning_rate": 9.631927770586412e-05, "loss": 0.6662, "step": 1435},
    {"epoch": 5.625, "grad_norm": 0.21092720329761505, "learning_rate": 9.563806126346642e-05, "loss": 0.6563, "step": 1440},
    {"epoch": 5.64453125, "grad_norm": 0.2081764191389084, "learning_rate": 9.495704756725041e-05, "loss": 0.6599, "step": 1445},
    {"epoch": 5.6640625, "grad_norm": 0.21930693089962006, "learning_rate": 9.427626827124317e-05, "loss": 0.6645, "step": 1450},
    {"epoch": 5.68359375, "grad_norm": 0.22238822281360626, "learning_rate": 9.359575501857651e-05, "loss": 0.6653, "step": 1455},
    {"epoch": 5.703125, "grad_norm": 0.21201257407665253, "learning_rate": 9.29155394400166e-05, "loss": 0.675, "step": 1460},
    {"epoch": 5.72265625, "grad_norm": 0.21970124542713165, "learning_rate": 9.223565315249325e-05, "loss": 0.6719, "step": 1465},
    {"epoch": 5.7421875, "grad_norm": 0.20852448046207428, "learning_rate": 9.155612775763069e-05, "loss": 0.6701, "step": 1470},
    {"epoch": 5.76171875, "grad_norm": 0.2180168330669403, "learning_rate": 9.087699484027857e-05, "loss": 0.658, "step": 1475},
    {"epoch": 5.78125, "grad_norm": 0.211044043302536, "learning_rate": 9.019828596704394e-05, "loss": 0.6526, "step": 1480},
    {"epoch": 5.80078125, "grad_norm": 0.20980176329612732, "learning_rate": 8.95200326848239e-05, "loss": 0.6548, "step": 1485},
    {"epoch": 5.8203125, "grad_norm": 0.20603534579277039, "learning_rate": 8.884226651933927e-05, "loss": 0.6644, "step": 1490},
    {"epoch": 5.83984375, "grad_norm": 0.20811837911605835, "learning_rate": 8.816501897366953e-05, "loss": 0.6703, "step": 1495},
    {"epoch": 5.859375, "grad_norm": 0.2105432003736496, "learning_rate": 8.74883215267881e-05, "loss": 0.6649, "step": 1500},
    {"epoch": 5.87890625, "grad_norm": 0.22339750826358795, "learning_rate": 8.681220563209955e-05, "loss": 0.6687, "step": 1505},
    {"epoch": 5.8984375, "grad_norm": 0.20943927764892578, "learning_rate": 8.613670271597733e-05, "loss": 0.663, "step": 1510},
    {"epoch": 5.91796875, "grad_norm": 0.20441389083862305, "learning_rate": 8.546184417630338e-05, "loss": 0.6663, "step": 1515},
    {"epoch": 5.9375, "grad_norm": 0.21287420392036438, "learning_rate": 8.478766138100834e-05, "loss": 0.6727, "step": 1520},
    {"epoch": 5.95703125, "grad_norm": 0.21163299679756165, "learning_rate": 8.411418566661388e-05, "loss": 0.6643, "step": 1525},
    {"epoch": 5.9765625, "grad_norm": 0.20541082322597504, "learning_rate": 8.344144833677594e-05, "loss": 0.6605, "step": 1530},
    {"epoch": 5.99609375, "grad_norm": 0.21405570209026337, "learning_rate": 8.27694806608298e-05, "loss": 0.6633, "step": 1535},
    {"epoch": 6.0, "eval_loss": 2.0744192600250244, "eval_runtime": 0.5398, "eval_samples_per_second": 11.115, "eval_steps_per_second": 1.853, "step": 1536},
    {"epoch": 6.015625, "grad_norm": 0.21526320278644562, "learning_rate": 8.209831387233676e-05, "loss": 0.6479, "step": 1540},
    {"epoch": 6.03515625, "grad_norm": 0.217779740691185, "learning_rate": 8.142797916763209e-05, "loss": 0.6536, "step": 1545},
    {"epoch": 6.0546875, "grad_norm": 0.22583958506584167, "learning_rate": 8.075850770437534e-05, "loss": 0.6532, "step": 1550},
    {"epoch": 6.07421875, "grad_norm": 0.24157458543777466, "learning_rate": 8.008993060010183e-05, "loss": 0.6426, "step": 1555},
    {"epoch": 6.09375, "grad_norm": 0.2280224710702896, "learning_rate": 7.942227893077652e-05, "loss": 0.6482, "step": 1560},
    {"epoch": 6.11328125, "grad_norm": 0.21372312307357788, "learning_rate": 7.875558372934936e-05, "loss": 0.6448, "step": 1565},
    {"epoch": 6.1328125, "grad_norm": 0.22514766454696655, "learning_rate": 7.808987598431303e-05, "loss": 0.6506, "step": 1570},
    {"epoch": 6.15234375, "grad_norm": 0.22178982198238373, "learning_rate": 7.742518663826246e-05, "loss": 0.6404, "step": 1575},
    {"epoch": 6.171875, "grad_norm": 0.21459142863750458, "learning_rate": 7.676154658645656e-05, "loss": 0.6557, "step": 1580},
    {"epoch": 6.19140625, "grad_norm": 0.22397801280021667, "learning_rate": 7.609898667538243e-05, "loss": 0.6445, "step": 1585},
    {"epoch": 6.2109375, "grad_norm": 0.22123484313488007, "learning_rate": 7.543753770132127e-05, "loss": 0.6375, "step": 1590},
    {"epoch": 6.23046875, "grad_norm": 0.2259218543767929, "learning_rate": 7.477723040891717e-05, "loss": 0.6486, "step": 1595},
    {"epoch": 6.25, "grad_norm": 0.21872185170650482, "learning_rate": 7.411809548974792e-05, "loss": 0.6546, "step": 1600},
    {"epoch": 6.26953125, "grad_norm": 0.2340991348028183, "learning_rate": 7.346016358089867e-05, "loss": 0.6573, "step": 1605},
    {"epoch": 6.2890625, "grad_norm": 0.2258559614419937, "learning_rate": 7.280346526353759e-05, "loss": 0.6485, "step": 1610},
    {"epoch": 6.30859375, "grad_norm": 0.21842586994171143, "learning_rate": 7.21480310614947e-05, "loss": 0.6452, "step": 1615},
    {"epoch": 6.328125, "grad_norm": 0.22392797470092773, "learning_rate": 7.149389143984295e-05, "loss": 0.6467, "step": 1620},
    {"epoch": 6.34765625, "grad_norm": 0.21205224096775055, "learning_rate": 7.084107680348218e-05, "loss": 0.6502, "step": 1625},
    {"epoch": 6.3671875, "grad_norm": 0.22041639685630798, "learning_rate": 7.018961749572604e-05, "loss": 0.6502, "step": 1630},
    {"epoch": 6.38671875, "grad_norm": 0.21791093051433563, "learning_rate": 6.953954379689136e-05, "loss": 0.6553, "step": 1635},
    {"epoch": 6.40625, "grad_norm": 0.22223076224327087, "learning_rate": 6.889088592289093e-05, "loss": 0.639, "step": 1640},
    {"epoch": 6.42578125, "grad_norm": 0.2151210606098175, "learning_rate": 6.824367402382885e-05, "loss": 0.655, "step": 1645},
    {"epoch": 6.4453125, "grad_norm": 0.2196204513311386, "learning_rate": 6.759793818259933e-05, "loss": 0.6549, "step": 1650},
    {"epoch": 6.46484375, "grad_norm": 0.21881859004497528, "learning_rate": 6.69537084134882e-05, "loss": 0.6516, "step": 1655},
    {"epoch": 6.484375, "grad_norm": 0.21970680356025696, "learning_rate": 6.6311014660778e-05, "loss": 0.6531, "step": 1660},
    {"epoch": 6.50390625, "grad_norm": 0.21640105545520782, "learning_rate": 6.566988679735606e-05, "loss": 0.6474, "step": 1665},
    {"epoch": 6.5234375, "grad_norm": 0.225670725107193, "learning_rate": 6.503035462332592e-05, "loss": 0.6437, "step": 1670},
    {"epoch": 6.54296875, "grad_norm": 0.20938833057880402, "learning_rate": 6.439244786462245e-05, "loss": 0.6526, "step": 1675},
    {"epoch": 6.5625, "grad_norm": 0.21592438220977783, "learning_rate": 6.375619617162985e-05, "loss": 0.6528, "step": 1680},
    {"epoch": 6.58203125, "grad_norm": 0.22665540874004364, "learning_rate": 6.312162911780368e-05, "loss": 0.6502, "step": 1685},
    {"epoch": 6.6015625, "grad_norm": 0.2195620834827423, "learning_rate": 6.248877619829619e-05, "loss": 0.6469, "step": 1690},
    {"epoch": 6.62109375, "grad_norm": 0.22165308892726898, "learning_rate": 6.185766682858546e-05, "loss": 0.6518, "step": 1695},
    {"epoch": 6.640625, "grad_norm": 0.22840096056461334, "learning_rate": 6.122833034310793e-05, "loss": 0.6506, "step": 1700},
    {"epoch": 6.66015625, "grad_norm": 0.22422266006469727, "learning_rate": 6.060079599389521e-05, "loss": 0.6559, "step": 1705},
    {"epoch": 6.6796875, "grad_norm": 0.22363343834877014, "learning_rate": 5.9975092949214116e-05, "loss": 0.6449, "step": 1710},
    {"epoch": 6.69921875, "grad_norm": 0.2213827222585678, "learning_rate": 5.935125029221111e-05, "loss": 0.65, "step": 1715},
    {"epoch": 6.71875, "grad_norm": 0.2290297895669937, "learning_rate": 5.872929701956054e-05, "loss": 0.6476, "step": 1720},
    {"epoch": 6.73828125, "grad_norm": 0.23118211328983307, "learning_rate": 5.810926204011658e-05, "loss": 0.6511, "step": 1725},
    {"epoch": 6.7578125, "grad_norm": 0.22112269699573517, "learning_rate": 5.749117417356988e-05, "loss": 0.6481, "step": 1730},
    {"epoch": 6.77734375, "grad_norm": 0.21454501152038574, "learning_rate": 5.687506214910765e-05, "loss": 0.6492, "step": 1735},
    {"epoch": 6.796875, "grad_norm": 0.22518618404865265, "learning_rate": 5.6260954604078585e-05, "loss": 0.6515, "step": 1740},
    {"epoch": 6.81640625, "grad_norm": 0.23013541102409363, "learning_rate": 5.564888008266165e-05, "loss": 0.6563, "step": 1745},
    {"epoch": 6.8359375, "grad_norm": 0.21959349513053894, "learning_rate": 5.503886703453933e-05, "loss": 0.6504, "step": 1750},
    {"epoch": 6.85546875, "grad_norm": 0.23238404095172882, "learning_rate": 5.4430943813575375e-05, "loss": 0.6575, "step": 1755},
    {"epoch": 6.875, "grad_norm": 0.21891681849956512, "learning_rate": 5.382513867649663e-05, "loss": 0.6415, "step": 1760},
    {"epoch": 6.89453125, "grad_norm": 0.2155328243970871, "learning_rate": 5.3221479781579955e-05, "loss": 0.6498, "step": 1765},
    {"epoch": 6.9140625, "grad_norm": 0.21803325414657593, "learning_rate": 5.261999518734322e-05, "loss": 0.6439, "step": 1770},
    {"epoch": 6.93359375, "grad_norm": 0.21531429886817932, "learning_rate": 5.202071285124119e-05, "loss": 0.6486, "step": 1775},
    {"epoch": 6.953125, "grad_norm": 0.22126588225364685, "learning_rate": 5.142366062836599e-05, "loss": 0.6453, "step": 1780},
    {"epoch": 6.97265625, "grad_norm": 0.21690168976783752, "learning_rate": 5.082886627015246e-05, "loss": 0.6564, "step": 1785},
    {"epoch": 6.9921875, "grad_norm": 0.22704558074474335, "learning_rate": 5.023635742308807e-05, "loss": 0.6595, "step": 1790},
    {"epoch": 7.0, "eval_loss": 2.0813868045806885, "eval_runtime": 0.5387, "eval_samples_per_second": 11.138, "eval_steps_per_second": 1.856, "step": 1792},
    {"epoch": 7.01171875, "grad_norm": 0.21671408414840698, "learning_rate": 4.964616162742826e-05, "loss": 0.6478, "step": 1795},
    {"epoch": 7.03125, "grad_norm": 0.2322429120540619, "learning_rate": 4.9058306315915826e-05, "loss": 0.6355, "step": 1800},
    {"epoch": 7.05078125, "grad_norm": 0.22516188025474548, "learning_rate": 4.84728188125063e-05, "loss": 0.6343, "step": 1805},
    {"epoch": 7.0703125, "grad_norm": 0.22370575368404388, "learning_rate": 4.7889726331097686e-05, "loss": 0.6388, "step": 1810},
    {"epoch": 7.08984375, "grad_norm": 0.22702112793922424, "learning_rate": 4.7309055974265435e-05, "loss": 0.6405, "step": 1815},
    {"epoch": 7.109375, "grad_norm": 0.2213263362646103, "learning_rate": 4.6730834732003104e-05, "loss": 0.6369, "step": 1820},
    {"epoch": 7.12890625, "grad_norm": 0.2283063679933548, "learning_rate": 4.615508948046726e-05, "loss": 0.6406, "step": 1825},
    {"epoch": 7.1484375, "grad_norm": 0.22583836317062378, "learning_rate": 4.5581846980728794e-05, "loss": 0.6396, "step": 1830},
    {"epoch": 7.16796875, "grad_norm": 0.223560631275177, "learning_rate": 4.50111338775287e-05, "loss": 0.6487, "step": 1835},
    {"epoch": 7.1875, "grad_norm": 0.2752554714679718, "learning_rate": 4.444297669803981e-05, "loss": 0.6399, "step": 1840},
    {"epoch": 7.20703125, "grad_norm": 0.22124579548835754, "learning_rate": 4.387740185063358e-05, "loss": 0.6413, "step": 1845},
    {"epoch": 7.2265625, "grad_norm": 0.22053855657577515, "learning_rate": 4.331443562365285e-05, "loss": 0.6377, "step": 1850},
    {"epoch": 7.24609375, "grad_norm": 0.22650252282619476, "learning_rate": 4.275410418418979e-05, "loss": 0.6441, "step": 1855},
    {"epoch": 7.265625, "grad_norm": 0.2277732640504837, "learning_rate": 4.219643357686967e-05, "loss": 0.6472, "step": 1860},
    {"epoch": 7.28515625, "grad_norm": 0.21958424150943756, "learning_rate": 4.1641449722640336e-05, "loss": 0.6434, "step": 1865},
    {"epoch": 7.3046875, "grad_norm": 0.22781191766262054, "learning_rate": 4.1089178417567164e-05, "loss": 0.6436, "step": 1870},
    {"epoch": 7.32421875, "grad_norm": 0.22724145650863647, "learning_rate": 4.0539645331634504e-05, "loss": 0.6365, "step": 1875},
    {"epoch": 7.34375, "grad_norm": 0.22402629256248474, "learning_rate": 3.999287600755192e-05, "loss": 0.6404, "step": 1880},
    {"epoch": 7.36328125, "grad_norm": 0.22256724536418915, "learning_rate": 3.944889585956746e-05, "loss": 0.6385, "step": 1885},
    {"epoch": 7.3828125, "grad_norm": 0.2245977371931076, "learning_rate": 3.8907730172286124e-05, "loss": 0.6402, "step": 1890},
    {"epoch": 7.40234375, "grad_norm": 0.2223842293024063, "learning_rate": 3.8369404099494574e-05, "loss": 0.6401, "step": 1895},
    {"epoch": 7.421875, "grad_norm": 0.228043794631958, "learning_rate": 3.783394266299228e-05, "loss": 0.6456, "step": 1900},
    {"epoch": 7.44140625, "grad_norm": 0.22321034967899323,
|
"learning_rate": 3.730137075142802e-05, |
|
"loss": 0.6461, |
|
"step": 1905 |
|
}, |
|
{ |
|
"epoch": 7.4609375, |
|
"grad_norm": 0.2202451378107071, |
|
"learning_rate": 3.677171311914346e-05, |
|
"loss": 0.6404, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 7.48046875, |
|
"grad_norm": 0.23069259524345398, |
|
"learning_rate": 3.624499438502229e-05, |
|
"loss": 0.6399, |
|
"step": 1915 |
|
}, |
|
{ |
|
"epoch": 7.5, |
|
"grad_norm": 0.22767633199691772, |
|
"learning_rate": 3.5721239031346066e-05, |
|
"loss": 0.6365, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 7.51953125, |
|
"grad_norm": 0.223536416888237, |
|
"learning_rate": 3.520047140265618e-05, |
|
"loss": 0.6398, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 7.5390625, |
|
"grad_norm": 0.2236379086971283, |
|
"learning_rate": 3.468271570462235e-05, |
|
"loss": 0.6374, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 7.55859375, |
|
"grad_norm": 0.22322149574756622, |
|
"learning_rate": 3.41679960029174e-05, |
|
"loss": 0.6411, |
|
"step": 1935 |
|
}, |
|
{ |
|
"epoch": 7.578125, |
|
"grad_norm": 0.22714544832706451, |
|
"learning_rate": 3.365633622209891e-05, |
|
"loss": 0.6281, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 7.59765625, |
|
"grad_norm": 0.23407664895057678, |
|
"learning_rate": 3.314776014449694e-05, |
|
"loss": 0.6342, |
|
"step": 1945 |
|
}, |
|
{ |
|
"epoch": 7.6171875, |
|
"grad_norm": 0.2269096076488495, |
|
"learning_rate": 3.2642291409108775e-05, |
|
"loss": 0.6462, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 7.63671875, |
|
"grad_norm": 0.21775776147842407, |
|
"learning_rate": 3.213995351050011e-05, |
|
"loss": 0.6442, |
|
"step": 1955 |
|
}, |
|
{ |
|
"epoch": 7.65625, |
|
"grad_norm": 0.21870321035385132, |
|
"learning_rate": 3.164076979771287e-05, |
|
"loss": 0.6391, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 7.67578125, |
|
"grad_norm": 0.24278177320957184, |
|
"learning_rate": 3.1144763473180285e-05, |
|
"loss": 0.6351, |
|
"step": 1965 |
|
}, |
|
{ |
|
"epoch": 7.6953125, |
|
"grad_norm": 0.222146674990654, |
|
"learning_rate": 3.065195759164797e-05, |
|
"loss": 0.6442, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 7.71484375, |
|
"grad_norm": 0.23037941753864288, |
|
"learning_rate": 3.016237505910272e-05, |
|
"loss": 0.6391, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 7.734375, |
|
"grad_norm": 0.22653505206108093, |
|
"learning_rate": 2.9676038631707593e-05, |
|
"loss": 0.6364, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 7.75390625, |
|
"grad_norm": 0.22071927785873413, |
|
"learning_rate": 2.9192970914744132e-05, |
|
"loss": 0.6436, |
|
"step": 1985 |
|
}, |
|
{ |
|
"epoch": 7.7734375, |
|
"grad_norm": 0.2352590709924698, |
|
"learning_rate": 2.8713194361562036e-05, |
|
"loss": 0.6389, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 7.79296875, |
|
"grad_norm": 0.23165152966976166, |
|
"learning_rate": 2.8236731272534967e-05, |
|
"loss": 0.6359, |
|
"step": 1995 |
|
}, |
|
{ |
|
"epoch": 7.8125, |
|
"grad_norm": 0.22592546045780182, |
|
"learning_rate": 2.776360379402445e-05, |
|
"loss": 0.6452, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 7.83203125, |
|
"grad_norm": 0.22005808353424072, |
|
"learning_rate": 2.72938339173503e-05, |
|
"loss": 0.6362, |
|
"step": 2005 |
|
}, |
|
{ |
|
"epoch": 7.8515625, |
|
"grad_norm": 0.22496894001960754, |
|
"learning_rate": 2.6827443477768454e-05, |
|
"loss": 0.6363, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 7.87109375, |
|
"grad_norm": 0.23299238085746765, |
|
"learning_rate": 2.6364454153456108e-05, |
|
"loss": 0.6376, |
|
"step": 2015 |
|
}, |
|
{ |
|
"epoch": 7.890625, |
|
"grad_norm": 0.21800798177719116, |
|
"learning_rate": 2.5904887464504114e-05, |
|
"loss": 0.6316, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 7.91015625, |
|
"grad_norm": 0.22942836582660675, |
|
"learning_rate": 2.544876477191652e-05, |
|
"loss": 0.6408, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 7.9296875, |
|
"grad_norm": 0.22502020001411438, |
|
"learning_rate": 2.4996107276618008e-05, |
|
"loss": 0.6281, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 7.94921875, |
|
"grad_norm": 0.22493688762187958, |
|
"learning_rate": 2.454693601846819e-05, |
|
"loss": 0.6374, |
|
"step": 2035 |
|
}, |
|
{ |
|
"epoch": 7.96875, |
|
"grad_norm": 0.22121860086917877, |
|
"learning_rate": 2.4101271875283817e-05, |
|
"loss": 0.6301, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 7.98828125, |
|
"grad_norm": 0.22293226420879364, |
|
"learning_rate": 2.3659135561868305e-05, |
|
"loss": 0.6374, |
|
"step": 2045 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_loss": 2.093949556350708, |
|
"eval_runtime": 0.5398, |
|
"eval_samples_per_second": 11.115, |
|
"eval_steps_per_second": 1.852, |
|
"step": 2048 |
|
}, |
|
{ |
|
"epoch": 8.0078125, |
|
"grad_norm": 0.22147591412067413, |
|
"learning_rate": 2.3220547629048796e-05, |
|
"loss": 0.6318, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 8.02734375, |
|
"grad_norm": 0.22781990468502045, |
|
"learning_rate": 2.2785528462721238e-05, |
|
"loss": 0.6301, |
|
"step": 2055 |
|
}, |
|
{ |
|
"epoch": 8.046875, |
|
"grad_norm": 0.22302427887916565, |
|
"learning_rate": 2.2354098282902446e-05, |
|
"loss": 0.6194, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 8.06640625, |
|
"grad_norm": 0.2345212697982788, |
|
"learning_rate": 2.1926277142790552e-05, |
|
"loss": 0.6284, |
|
"step": 2065 |
|
}, |
|
{ |
|
"epoch": 8.0859375, |
|
"grad_norm": 0.22880584001541138, |
|
"learning_rate": 2.1502084927832845e-05, |
|
"loss": 0.6394, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 8.10546875, |
|
"grad_norm": 0.23197947442531586, |
|
"learning_rate": 2.1081541354801292e-05, |
|
"loss": 0.6414, |
|
"step": 2075 |
|
}, |
|
{ |
|
"epoch": 8.125, |
|
"grad_norm": 0.2195805162191391, |
|
"learning_rate": 2.0664665970876496e-05, |
|
"loss": 0.6274, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 8.14453125, |
|
"grad_norm": 0.2231413722038269, |
|
"learning_rate": 2.025147815273867e-05, |
|
"loss": 0.6325, |
|
"step": 2085 |
|
}, |
|
{ |
|
"epoch": 8.1640625, |
|
"grad_norm": 0.22956664860248566, |
|
"learning_rate": 1.9841997105667275e-05, |
|
"loss": 0.6345, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 8.18359375, |
|
"grad_norm": 0.22590646147727966, |
|
"learning_rate": 1.943624186264832e-05, |
|
"loss": 0.6276, |
|
"step": 2095 |
|
}, |
|
{ |
|
"epoch": 8.203125, |
|
"grad_norm": 0.2267957627773285, |
|
"learning_rate": 1.903423128348959e-05, |
|
"loss": 0.6243, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 8.22265625, |
|
"grad_norm": 0.22633960843086243, |
|
"learning_rate": 1.8635984053944122e-05, |
|
"loss": 0.6279, |
|
"step": 2105 |
|
}, |
|
{ |
|
"epoch": 8.2421875, |
|
"grad_norm": 0.22983397543430328, |
|
"learning_rate": 1.824151868484164e-05, |
|
"loss": 0.6347, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 8.26171875, |
|
"grad_norm": 0.21901904046535492, |
|
"learning_rate": 1.7850853511228115e-05, |
|
"loss": 0.6364, |
|
"step": 2115 |
|
}, |
|
{ |
|
"epoch": 8.28125, |
|
"grad_norm": 0.2256007343530655, |
|
"learning_rate": 1.7464006691513623e-05, |
|
"loss": 0.628, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 8.30078125, |
|
"grad_norm": 0.2304702252149582, |
|
"learning_rate": 1.7080996206628307e-05, |
|
"loss": 0.6202, |
|
"step": 2125 |
|
}, |
|
{ |
|
"epoch": 8.3203125, |
|
"grad_norm": 0.22724899649620056, |
|
"learning_rate": 1.6701839859186542e-05, |
|
"loss": 0.6401, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 8.33984375, |
|
"grad_norm": 0.22017619013786316, |
|
"learning_rate": 1.632655527265958e-05, |
|
"loss": 0.6348, |
|
"step": 2135 |
|
}, |
|
{ |
|
"epoch": 8.359375, |
|
"grad_norm": 0.221891850233078, |
|
"learning_rate": 1.595515989055618e-05, |
|
"loss": 0.6306, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 8.37890625, |
|
"grad_norm": 0.2255999892950058, |
|
"learning_rate": 1.558767097561219e-05, |
|
"loss": 0.6436, |
|
"step": 2145 |
|
}, |
|
{ |
|
"epoch": 8.3984375, |
|
"grad_norm": 0.2337878942489624, |
|
"learning_rate": 1.5224105608987704e-05, |
|
"loss": 0.6256, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 8.41796875, |
|
"grad_norm": 0.2235851138830185, |
|
"learning_rate": 1.486448068947348e-05, |
|
"loss": 0.6328, |
|
"step": 2155 |
|
}, |
|
{ |
|
"epoch": 8.4375, |
|
"grad_norm": 0.2308977097272873, |
|
"learning_rate": 1.4508812932705363e-05, |
|
"loss": 0.6353, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 8.45703125, |
|
"grad_norm": 0.22785401344299316, |
|
"learning_rate": 1.4157118870387155e-05, |
|
"loss": 0.6375, |
|
"step": 2165 |
|
}, |
|
{ |
|
"epoch": 8.4765625, |
|
"grad_norm": 0.24056580662727356, |
|
"learning_rate": 1.3809414849522584e-05, |
|
"loss": 0.6343, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 8.49609375, |
|
"grad_norm": 0.22777673602104187, |
|
"learning_rate": 1.3465717031655056e-05, |
|
"loss": 0.6336, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 8.515625, |
|
"grad_norm": 0.23098915815353394, |
|
"learning_rate": 1.3126041392116772e-05, |
|
"loss": 0.6296, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 8.53515625, |
|
"grad_norm": 0.2298251986503601, |
|
"learning_rate": 1.2790403719286049e-05, |
|
"loss": 0.6305, |
|
"step": 2185 |
|
}, |
|
{ |
|
"epoch": 8.5546875, |
|
"grad_norm": 0.22145819664001465, |
|
"learning_rate": 1.2458819613853468e-05, |
|
"loss": 0.6262, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 8.57421875, |
|
"grad_norm": 0.2244306206703186, |
|
"learning_rate": 1.2131304488096772e-05, |
|
"loss": 0.6225, |
|
"step": 2195 |
|
}, |
|
{ |
|
"epoch": 8.59375, |
|
"grad_norm": 0.22416800260543823, |
|
"learning_rate": 1.1807873565164506e-05, |
|
"loss": 0.6309, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 8.61328125, |
|
"grad_norm": 0.22584258019924164, |
|
"learning_rate": 1.148854187836833e-05, |
|
"loss": 0.6318, |
|
"step": 2205 |
|
}, |
|
{ |
|
"epoch": 8.6328125, |
|
"grad_norm": 0.2320922613143921, |
|
"learning_rate": 1.1173324270484397e-05, |
|
"loss": 0.6352, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 8.65234375, |
|
"grad_norm": 0.2240631878376007, |
|
"learning_rate": 1.0862235393063413e-05, |
|
"loss": 0.6279, |
|
"step": 2215 |
|
}, |
|
{ |
|
"epoch": 8.671875, |
|
"grad_norm": 0.2261231392621994, |
|
"learning_rate": 1.0555289705749483e-05, |
|
"loss": 0.6299, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 8.69140625, |
|
"grad_norm": 0.22478684782981873, |
|
"learning_rate": 1.025250147560829e-05, |
|
"loss": 0.639, |
|
"step": 2225 |
|
}, |
|
{ |
|
"epoch": 8.7109375, |
|
"grad_norm": 0.22566542029380798, |
|
"learning_rate": 9.953884776463652e-06, |
|
"loss": 0.63, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 8.73046875, |
|
"grad_norm": 0.23023688793182373, |
|
"learning_rate": 9.659453488243575e-06, |
|
"loss": 0.6439, |
|
"step": 2235 |
|
}, |
|
{ |
|
"epoch": 8.75, |
|
"grad_norm": 0.22487542033195496, |
|
"learning_rate": 9.369221296335006e-06, |
|
"loss": 0.6421, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 8.76953125, |
|
"grad_norm": 0.22670140862464905, |
|
"learning_rate": 9.083201690947763e-06, |
|
"loss": 0.6331, |
|
"step": 2245 |
|
}, |
|
{ |
|
"epoch": 8.7890625, |
|
"grad_norm": 0.2248082160949707, |
|
"learning_rate": 8.801407966487486e-06, |
|
"loss": 0.6216, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 8.80859375, |
|
"grad_norm": 0.23012250661849976, |
|
"learning_rate": 8.52385322093765e-06, |
|
"loss": 0.6452, |
|
"step": 2255 |
|
}, |
|
{ |
|
"epoch": 8.828125, |
|
"grad_norm": 0.22810766100883484, |
|
"learning_rate": 8.250550355250875e-06, |
|
"loss": 0.6395, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 8.84765625, |
|
"grad_norm": 0.22482182085514069, |
|
"learning_rate": 7.981512072749198e-06, |
|
"loss": 0.6316, |
|
"step": 2265 |
|
}, |
|
{ |
|
"epoch": 8.8671875, |
|
"grad_norm": 0.22704395651817322, |
|
"learning_rate": 7.71675087853364e-06, |
|
"loss": 0.6389, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 8.88671875, |
|
"grad_norm": 0.2339123636484146, |
|
"learning_rate": 7.456279078902928e-06, |
|
"loss": 0.639, |
|
"step": 2275 |
|
}, |
|
{ |
|
"epoch": 8.90625, |
|
"grad_norm": 0.2283734679222107, |
|
"learning_rate": 7.200108780781556e-06, |
|
"loss": 0.6312, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 8.92578125, |
|
"grad_norm": 0.23632891476154327, |
|
"learning_rate": 6.948251891156932e-06, |
|
"loss": 0.6336, |
|
"step": 2285 |
|
}, |
|
{ |
|
"epoch": 8.9453125, |
|
"grad_norm": 0.22593176364898682, |
|
"learning_rate": 6.700720116526116e-06, |
|
"loss": 0.6382, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 8.96484375, |
|
"grad_norm": 0.2195340245962143, |
|
"learning_rate": 6.457524962351469e-06, |
|
"loss": 0.627, |
|
"step": 2295 |
|
}, |
|
{ |
|
"epoch": 8.984375, |
|
"grad_norm": 0.2304958701133728, |
|
"learning_rate": 6.218677732526035e-06, |
|
"loss": 0.6277, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_loss": 2.0994203090667725, |
|
"eval_runtime": 0.5356, |
|
"eval_samples_per_second": 11.202, |
|
"eval_steps_per_second": 1.867, |
|
"step": 2304 |
|
}, |
|
{ |
|
"epoch": 9.00390625, |
|
"grad_norm": 0.2239326387643814, |
|
"learning_rate": 5.984189528848095e-06, |
|
"loss": 0.6333, |
|
"step": 2305 |
|
}, |
|
{ |
|
"epoch": 9.0234375, |
|
"grad_norm": 0.21830931305885315, |
|
"learning_rate": 5.7540712505050444e-06, |
|
"loss": 0.6303, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 9.04296875, |
|
"grad_norm": 0.2230663150548935, |
|
"learning_rate": 5.528333593567014e-06, |
|
"loss": 0.6266, |
|
"step": 2315 |
|
}, |
|
{ |
|
"epoch": 9.0625, |
|
"grad_norm": 0.22621068358421326, |
|
"learning_rate": 5.306987050489442e-06, |
|
"loss": 0.6273, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 9.08203125, |
|
"grad_norm": 0.2257871776819229, |
|
"learning_rate": 5.090041909625542e-06, |
|
"loss": 0.6171, |
|
"step": 2325 |
|
}, |
|
{ |
|
"epoch": 9.1015625, |
|
"grad_norm": 0.22467824816703796, |
|
"learning_rate": 4.877508254748076e-06, |
|
"loss": 0.6256, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 9.12109375, |
|
"grad_norm": 0.22441822290420532, |
|
"learning_rate": 4.669395964580614e-06, |
|
"loss": 0.6247, |
|
"step": 2335 |
|
}, |
|
{ |
|
"epoch": 9.140625, |
|
"grad_norm": 0.22599612176418304, |
|
"learning_rate": 4.465714712338398e-06, |
|
"loss": 0.6204, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 9.16015625, |
|
"grad_norm": 0.22301939129829407, |
|
"learning_rate": 4.26647396527865e-06, |
|
"loss": 0.634, |
|
"step": 2345 |
|
}, |
|
{ |
|
"epoch": 9.1796875, |
|
"grad_norm": 0.23274029791355133, |
|
"learning_rate": 4.071682984260638e-06, |
|
"loss": 0.6256, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 9.19921875, |
|
"grad_norm": 0.23097610473632812, |
|
"learning_rate": 3.881350823315177e-06, |
|
"loss": 0.6293, |
|
"step": 2355 |
|
}, |
|
{ |
|
"epoch": 9.21875, |
|
"grad_norm": 0.23166796565055847, |
|
"learning_rate": 3.6954863292237297e-06, |
|
"loss": 0.6294, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 9.23828125, |
|
"grad_norm": 0.22876545786857605, |
|
"learning_rate": 3.514098141107314e-06, |
|
"loss": 0.6298, |
|
"step": 2365 |
|
}, |
|
{ |
|
"epoch": 9.2578125, |
|
"grad_norm": 0.22338230907917023, |
|
"learning_rate": 3.3371946900248473e-06, |
|
"loss": 0.6264, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 9.27734375, |
|
"grad_norm": 0.2302178293466568, |
|
"learning_rate": 3.1647841985813164e-06, |
|
"loss": 0.627, |
|
"step": 2375 |
|
}, |
|
{ |
|
"epoch": 9.296875, |
|
"grad_norm": 0.2242288738489151, |
|
"learning_rate": 2.996874680545603e-06, |
|
"loss": 0.6336, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 9.31640625, |
|
"grad_norm": 0.22500120103359222, |
|
"learning_rate": 2.8334739404779375e-06, |
|
"loss": 0.6264, |
|
"step": 2385 |
|
}, |
|
{ |
|
"epoch": 9.3359375, |
|
"grad_norm": 0.23554645478725433, |
|
"learning_rate": 2.674589573367192e-06, |
|
"loss": 0.6213, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 9.35546875, |
|
"grad_norm": 0.2254471480846405, |
|
"learning_rate": 2.5202289642778375e-06, |
|
"loss": 0.6348, |
|
"step": 2395 |
|
}, |
|
{ |
|
"epoch": 9.375, |
|
"grad_norm": 0.22407911717891693, |
|
"learning_rate": 2.3703992880066638e-06, |
|
"loss": 0.6294, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 9.39453125, |
|
"grad_norm": 0.22965936362743378, |
|
"learning_rate": 2.2251075087493355e-06, |
|
"loss": 0.64, |
|
"step": 2405 |
|
}, |
|
{ |
|
"epoch": 9.4140625, |
|
"grad_norm": 0.22874490916728973, |
|
"learning_rate": 2.0843603797766287e-06, |
|
"loss": 0.6313, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 9.43359375, |
|
"grad_norm": 0.22413046658039093, |
|
"learning_rate": 1.9481644431206036e-06, |
|
"loss": 0.6229, |
|
"step": 2415 |
|
}, |
|
{ |
|
"epoch": 9.453125, |
|
"grad_norm": 0.2280588150024414, |
|
"learning_rate": 1.8165260292704711e-06, |
|
"loss": 0.6265, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 9.47265625, |
|
"grad_norm": 0.22689659893512726, |
|
"learning_rate": 1.6894512568783716e-06, |
|
"loss": 0.6272, |
|
"step": 2425 |
|
}, |
|
{ |
|
"epoch": 9.4921875, |
|
"grad_norm": 0.23052698373794556, |
|
"learning_rate": 1.5669460324749586e-06, |
|
"loss": 0.6408, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 9.51171875, |
|
"grad_norm": 0.22765642404556274, |
|
"learning_rate": 1.4490160501948735e-06, |
|
"loss": 0.644, |
|
"step": 2435 |
|
}, |
|
{ |
|
"epoch": 9.53125, |
|
"grad_norm": 0.22766034305095673, |
|
"learning_rate": 1.3356667915121025e-06, |
|
"loss": 0.6249, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 9.55078125, |
|
"grad_norm": 0.22794398665428162, |
|
"learning_rate": 1.2269035249851236e-06, |
|
"loss": 0.6318, |
|
"step": 2445 |
|
}, |
|
{ |
|
"epoch": 9.5703125, |
|
"grad_norm": 0.22712871432304382, |
|
"learning_rate": 1.1227313060120926e-06, |
|
"loss": 0.6359, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 9.58984375, |
|
"grad_norm": 0.22914738953113556, |
|
"learning_rate": 1.0231549765958192e-06, |
|
"loss": 0.6389, |
|
"step": 2455 |
|
}, |
|
{ |
|
"epoch": 9.609375, |
|
"grad_norm": 0.22300153970718384, |
|
"learning_rate": 9.281791651187366e-07, |
|
"loss": 0.6356, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 9.62890625, |
|
"grad_norm": 0.232873797416687, |
|
"learning_rate": 8.378082861277281e-07, |
|
"loss": 0.6272, |
|
"step": 2465 |
|
}, |
|
{ |
|
"epoch": 9.6484375, |
|
"grad_norm": 0.227997824549675, |
|
"learning_rate": 7.520465401290033e-07, |
|
"loss": 0.633, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 9.66796875, |
|
"grad_norm": 0.21839286386966705, |
|
"learning_rate": 6.708979133927762e-07, |
|
"loss": 0.6215, |
|
"step": 2475 |
|
}, |
|
{ |
|
"epoch": 9.6875, |
|
"grad_norm": 0.22753040492534637, |
|
"learning_rate": 5.943661777680354e-07, |
|
"loss": 0.6272, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 9.70703125, |
|
"grad_norm": 0.22866863012313843, |
|
"learning_rate": 5.224548905072402e-07, |
|
"loss": 0.6357, |
|
"step": 2485 |
|
}, |
|
{ |
|
"epoch": 9.7265625, |
|
"grad_norm": 0.2306712120771408, |
|
"learning_rate": 4.5516739410087494e-07, |
|
"loss": 0.6244, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 9.74609375, |
|
"grad_norm": 0.22779209911823273, |
|
"learning_rate": 3.9250681612225116e-07, |
|
"loss": 0.6309, |
|
"step": 2495 |
|
}, |
|
{ |
|
"epoch": 9.765625, |
|
"grad_norm": 0.22719816863536835, |
|
"learning_rate": 3.3447606908196817e-07, |
|
"loss": 0.628, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 9.78515625, |
|
"grad_norm": 0.23172929883003235, |
|
"learning_rate": 2.8107785029265476e-07, |
|
"loss": 0.6293, |
|
"step": 2505 |
|
}, |
|
{ |
|
"epoch": 9.8046875, |
|
"grad_norm": 0.22468186914920807, |
|
"learning_rate": 2.3231464174352512e-07, |
|
"loss": 0.6368, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 9.82421875, |
|
"grad_norm": 0.22247561812400818, |
|
"learning_rate": 1.8818870998508208e-07, |
|
"loss": 0.6222, |
|
"step": 2515 |
|
}, |
|
{ |
|
"epoch": 9.84375, |
|
"grad_norm": 0.22515320777893066, |
|
"learning_rate": 1.487021060236904e-07, |
|
"loss": 0.6266, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 9.86328125, |
|
"grad_norm": 0.23118971288204193, |
|
"learning_rate": 1.1385666522630845e-07, |
|
"loss": 0.6308, |
|
"step": 2525 |
|
}, |
|
{ |
|
"epoch": 9.8828125, |
|
"grad_norm": 0.22416307032108307, |
|
"learning_rate": 8.365400723512328e-08, |
|
"loss": 0.6239, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 9.90234375, |
|
"grad_norm": 0.22984710335731506, |
|
"learning_rate": 5.8095535892332964e-08, |
|
"loss": 0.6362, |
|
"step": 2535 |
|
}, |
|
{ |
|
"epoch": 9.921875, |
|
"grad_norm": 0.23102597892284393, |
|
"learning_rate": 3.7182439174832106e-08, |
|
"loss": 0.6365, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 9.94140625, |
|
"grad_norm": 0.2295123189687729, |
|
"learning_rate": 2.091568913904496e-08, |
|
"loss": 0.6397, |
|
"step": 2545 |
|
}, |
|
{ |
|
"epoch": 9.9609375, |
|
"grad_norm": 0.22766011953353882, |
|
"learning_rate": 9.296041875683781e-09, |
|
"loss": 0.6274, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 9.98046875, |
|
"grad_norm": 0.2338954210281372, |
|
"learning_rate": 2.3240374746658077e-09, |
|
"loss": 0.6212, |
|
"step": 2555 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 0.22291633486747742, |
|
"learning_rate": 0.0, |
|
"loss": 0.616, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_loss": 2.1007895469665527, |
|
"eval_runtime": 0.5705, |
|
"eval_samples_per_second": 10.518, |
|
"eval_steps_per_second": 1.753, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"step": 2560, |
|
"total_flos": 7.568434414263206e+18, |
|
"train_loss": 0.7105431989766657, |
|
"train_runtime": 14792.6859, |
|
"train_samples_per_second": 11.056, |
|
"train_steps_per_second": 0.173 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 2560, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 7.568434414263206e+18, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|