|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.96, |
|
"eval_steps": 500, |
|
"global_step": 2400, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 14.472631454467773, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 5.6365, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
            "grad_norm": null,
|
"learning_rate": 6e-06, |
|
"loss": 7.9556, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 9.6599702835083, |
|
"learning_rate": 1.1000000000000001e-05, |
|
"loss": 6.5246, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 40.07120895385742, |
|
"learning_rate": 1.6000000000000003e-05, |
|
"loss": 7.7818, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 5.662575721740723, |
|
"learning_rate": 2.1e-05, |
|
"loss": 5.5899, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
            "grad_norm": null,
|
"learning_rate": 2.5e-05, |
|
"loss": 7.1589, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 7.6021246910095215, |
|
"learning_rate": 3e-05, |
|
"loss": 4.9643, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 7.397390365600586, |
|
"learning_rate": 3.5e-05, |
|
"loss": 5.944, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 10.122846603393555, |
|
"learning_rate": 4e-05, |
|
"loss": 4.2179, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 7.520490646362305, |
|
"learning_rate": 4.5e-05, |
|
"loss": 3.9576, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 12.957700729370117, |
|
"learning_rate": 5e-05, |
|
"loss": 4.6752, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 11.252644538879395, |
|
"learning_rate": 4.999948617395915e-05, |
|
"loss": 4.586, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 14.29404354095459, |
|
"learning_rate": 4.9997944716957985e-05, |
|
"loss": 4.2009, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 12.03260612487793, |
|
"learning_rate": 4.9995375692359755e-05, |
|
"loss": 4.7383, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 7.339493751525879, |
|
"learning_rate": 4.9991779205767e-05, |
|
"loss": 4.0294, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 5.51794958114624, |
|
"learning_rate": 4.99871554050172e-05, |
|
"loss": 3.8225, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 10.195987701416016, |
|
"learning_rate": 4.9981504480176696e-05, |
|
"loss": 4.1299, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 14.785408973693848, |
|
"learning_rate": 4.997482666353287e-05, |
|
"loss": 4.1754, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 9.586732864379883, |
|
"learning_rate": 4.996712222958461e-05, |
|
"loss": 3.2669, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 25.427532196044922, |
|
"learning_rate": 4.9958391495031026e-05, |
|
"loss": 4.1074, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 7.9541730880737305, |
|
"learning_rate": 4.994863481875841e-05, |
|
"loss": 4.2565, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 8.371864318847656, |
|
"learning_rate": 4.993785260182552e-05, |
|
"loss": 3.3137, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 9.015960693359375, |
|
"learning_rate": 4.992604528744705e-05, |
|
"loss": 3.4324, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 6.063784599304199, |
|
"learning_rate": 4.991321336097546e-05, |
|
"loss": 3.3621, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 7.260740280151367, |
|
"learning_rate": 4.989935734988098e-05, |
|
"loss": 4.7443, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 9.744063377380371, |
|
"learning_rate": 4.9884477823729956e-05, |
|
"loss": 4.085, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 9.156418800354004, |
|
"learning_rate": 4.986857539416144e-05, |
|
"loss": 3.5387, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 7.296212673187256, |
|
"learning_rate": 4.9851650714862006e-05, |
|
"loss": 3.348, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 8.042045593261719, |
|
"learning_rate": 4.983370448153896e-05, |
|
"loss": 3.5621, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 12.138426780700684, |
|
"learning_rate": 4.981473743189163e-05, |
|
"loss": 3.4145, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 12.3366060256958, |
|
"learning_rate": 4.979475034558115e-05, |
|
"loss": 3.9396, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 6.696476936340332, |
|
"learning_rate": 4.977374404419837e-05, |
|
"loss": 3.5372, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 6.023372173309326, |
|
"learning_rate": 4.975171939123005e-05, |
|
"loss": 3.2419, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 7.861164093017578, |
|
"learning_rate": 4.9728677292023405e-05, |
|
"loss": 3.7565, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 6.757652282714844, |
|
"learning_rate": 4.970461869374889e-05, |
|
"loss": 3.2182, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 9.434078216552734, |
|
"learning_rate": 4.967954458536126e-05, |
|
"loss": 3.229, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 27.395694732666016, |
|
"learning_rate": 4.965345599755887e-05, |
|
"loss": 3.903, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 6.09558629989624, |
|
"learning_rate": 4.962635400274142e-05, |
|
"loss": 3.1898, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 4.992712497711182, |
|
"learning_rate": 4.959823971496574e-05, |
|
"loss": 3.2185, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 7.1479411125183105, |
|
"learning_rate": 4.95691142899001e-05, |
|
"loss": 3.2792, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 8.431621551513672, |
|
"learning_rate": 4.9538978924776634e-05, |
|
"loss": 3.1418, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 23.00255584716797, |
|
"learning_rate": 4.9507834858342186e-05, |
|
"loss": 3.9976, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 10.47463321685791, |
|
"learning_rate": 4.9475683370807326e-05, |
|
"loss": 3.4157, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 18.375465393066406, |
|
"learning_rate": 4.9442525783793794e-05, |
|
"loss": 3.5929, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 8.78403091430664, |
|
"learning_rate": 4.940836346028011e-05, |
|
"loss": 3.1698, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 7.213769912719727, |
|
"learning_rate": 4.937319780454559e-05, |
|
"loss": 3.9643, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 7.919612884521484, |
|
"learning_rate": 4.933703026211262e-05, |
|
"loss": 3.6494, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 4.598107814788818, |
|
"learning_rate": 4.9299862319687204e-05, |
|
"loss": 3.5917, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 37.90977096557617, |
|
"learning_rate": 4.926169550509787e-05, |
|
"loss": 3.7991, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 12.698101997375488, |
|
"learning_rate": 4.9222531387232885e-05, |
|
"loss": 3.5116, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 9.609795570373535, |
|
"learning_rate": 4.9182371575975736e-05, |
|
"loss": 3.4126, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 8.108003616333008, |
|
"learning_rate": 4.914121772213898e-05, |
|
"loss": 3.2962, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 11.94726848602295, |
|
"learning_rate": 4.909907151739633e-05, |
|
"loss": 3.6953, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 8.730710983276367, |
|
"learning_rate": 4.905593469421323e-05, |
|
"loss": 3.4841, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 6.423217296600342, |
|
"learning_rate": 4.9011809025775486e-05, |
|
"loss": 3.1157, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 10.736145973205566, |
|
"learning_rate": 4.8966696325916515e-05, |
|
"loss": 3.244, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 21.597434997558594, |
|
"learning_rate": 4.892059844904272e-05, |
|
"loss": 3.172, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 23.540430068969727, |
|
"learning_rate": 4.887351729005726e-05, |
|
"loss": 3.1889, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 11.286609649658203, |
|
"learning_rate": 4.882545478428218e-05, |
|
"loss": 3.5211, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 7.204074382781982, |
|
"learning_rate": 4.877641290737884e-05, |
|
"loss": 3.255, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 25.35500717163086, |
|
"learning_rate": 4.8726393675266716e-05, |
|
"loss": 3.4832, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 6.76464319229126, |
|
"learning_rate": 4.8675399144040537e-05, |
|
"loss": 2.9075, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 10.19654655456543, |
|
"learning_rate": 4.862343140988573e-05, |
|
"loss": 3.0107, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 7.545323848724365, |
|
"learning_rate": 4.8570492608992325e-05, |
|
"loss": 2.7479, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 15.334161758422852, |
|
"learning_rate": 4.851658491746707e-05, |
|
"loss": 2.8654, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 14.28708553314209, |
|
"learning_rate": 4.846171055124401e-05, |
|
"loss": 3.1851, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 22.403162002563477, |
|
"learning_rate": 4.8405871765993433e-05, |
|
"loss": 2.8862, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 5.840334415435791, |
|
"learning_rate": 4.834907085702908e-05, |
|
"loss": 2.968, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 12.307807922363281, |
|
"learning_rate": 4.829131015921385e-05, |
|
"loss": 2.9458, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 12.92584228515625, |
|
"learning_rate": 4.82325920468638e-05, |
|
"loss": 3.4169, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 13.389299392700195, |
|
"learning_rate": 4.817291893365055e-05, |
|
"loss": 3.1601, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 16.73059844970703, |
|
"learning_rate": 4.8112293272502043e-05, |
|
"loss": 3.2846, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 8.608989715576172, |
|
"learning_rate": 4.805071755550177e-05, |
|
"loss": 3.0091, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 5.731265544891357, |
|
"learning_rate": 4.7988194313786275e-05, |
|
"loss": 3.3688, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 11.152880668640137, |
|
"learning_rate": 4.7924726117441135e-05, |
|
"loss": 3.3177, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 8.759363174438477, |
|
"learning_rate": 4.7860315575395316e-05, |
|
"loss": 2.9184, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 37.01634216308594, |
|
"learning_rate": 4.7794965335313926e-05, |
|
"loss": 3.1494, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 5.318065643310547, |
|
"learning_rate": 4.772867808348938e-05, |
|
"loss": 3.7067, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 14.883944511413574, |
|
"learning_rate": 4.766145654473095e-05, |
|
"loss": 2.9832, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 35.67652130126953, |
|
"learning_rate": 4.759330348225284e-05, |
|
"loss": 3.6382, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 6.075866222381592, |
|
"learning_rate": 4.752422169756048e-05, |
|
"loss": 3.6166, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 10.807673454284668, |
|
"learning_rate": 4.745421403033548e-05, |
|
"loss": 2.896, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 17.647350311279297, |
|
"learning_rate": 4.738328335831883e-05, |
|
"loss": 2.9218, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 35.758544921875, |
|
"learning_rate": 4.731143259719265e-05, |
|
"loss": 3.0896, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 19.15152931213379, |
|
"learning_rate": 4.72386647004603e-05, |
|
"loss": 3.0875, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 10.721521377563477, |
|
"learning_rate": 4.716498265932501e-05, |
|
"loss": 2.833, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 36.97370147705078, |
|
"learning_rate": 4.709038950256688e-05, |
|
"loss": 3.3553, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 18.96316146850586, |
|
"learning_rate": 4.701488829641845e-05, |
|
"loss": 2.8189, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 8.662888526916504, |
|
"learning_rate": 4.693848214443858e-05, |
|
"loss": 3.4443, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 18.23220443725586, |
|
"learning_rate": 4.686117418738489e-05, |
|
"loss": 3.5717, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 39.92061996459961, |
|
"learning_rate": 4.678296760308474e-05, |
|
"loss": 3.2364, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 9.42786693572998, |
|
"learning_rate": 4.6703865606304465e-05, |
|
"loss": 2.9595, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 7.904202938079834, |
|
"learning_rate": 4.662387144861734e-05, |
|
"loss": 3.1582, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 26.168556213378906, |
|
"learning_rate": 4.6542988418269876e-05, |
|
"loss": 3.3465, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 10.387030601501465, |
|
"learning_rate": 4.6461219840046654e-05, |
|
"loss": 3.0016, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 9.973118782043457, |
|
"learning_rate": 4.637856907513366e-05, |
|
"loss": 3.2299, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 31.876556396484375, |
|
"learning_rate": 4.629503952098011e-05, |
|
"loss": 3.3682, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 23.903682708740234, |
|
"learning_rate": 4.6210634611158816e-05, |
|
"loss": 2.7361, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 7.048882007598877, |
|
"learning_rate": 4.612535781522504e-05, |
|
"loss": 2.8006, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 16.777807235717773, |
|
"learning_rate": 4.6039212638573833e-05, |
|
"loss": 2.7768, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 35.86275100708008, |
|
"learning_rate": 4.595220262229601e-05, |
|
"loss": 3.2371, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 18.5112361907959, |
|
"learning_rate": 4.586433134303257e-05, |
|
"loss": 3.9903, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 15.84546947479248, |
|
"learning_rate": 4.5775602412827604e-05, |
|
"loss": 3.2624, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 9.097062110900879, |
|
"learning_rate": 4.5686019478979915e-05, |
|
"loss": 3.036, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 12.135428428649902, |
|
"learning_rate": 4.559558622389304e-05, |
|
"loss": 3.0802, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 7.016368865966797, |
|
"learning_rate": 4.55043063649239e-05, |
|
"loss": 2.718, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 7.902190685272217, |
|
"learning_rate": 4.5412183654229965e-05, |
|
"loss": 2.9923, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 9.023144721984863, |
|
"learning_rate": 4.531922187861507e-05, |
|
"loss": 3.2273, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 12.821276664733887, |
|
"learning_rate": 4.522542485937369e-05, |
|
"loss": 3.0269, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 11.519277572631836, |
|
"learning_rate": 4.51307964521339e-05, |
|
"loss": 2.9414, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 6.272705554962158, |
|
"learning_rate": 4.503534054669892e-05, |
|
"loss": 3.1369, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 15.541712760925293, |
|
"learning_rate": 4.493906106688712e-05, |
|
"loss": 3.3594, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 29.176177978515625, |
|
"learning_rate": 4.484196197037082e-05, |
|
"loss": 3.7387, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 5.822291374206543, |
|
"learning_rate": 4.474404724851356e-05, |
|
"loss": 3.0015, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 13.846043586730957, |
|
"learning_rate": 4.4645320926206064e-05, |
|
"loss": 2.8736, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 6.161740303039551, |
|
"learning_rate": 4.454578706170075e-05, |
|
"loss": 3.1031, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 12.11698055267334, |
|
"learning_rate": 4.444544974644493e-05, |
|
"loss": 3.1832, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 5.144786834716797, |
|
"learning_rate": 4.434431310491267e-05, |
|
"loss": 2.7414, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 14.706486701965332, |
|
"learning_rate": 4.4242381294435154e-05, |
|
"loss": 2.7289, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 11.251163482666016, |
|
"learning_rate": 4.413965850502987e-05, |
|
"loss": 3.4787, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 11.005367279052734, |
|
"learning_rate": 4.4036148959228365e-05, |
|
"loss": 3.1987, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 13.336194038391113, |
|
"learning_rate": 4.393185691190264e-05, |
|
"loss": 2.9382, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 9.259475708007812, |
|
"learning_rate": 4.382678665009028e-05, |
|
"loss": 3.1349, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 20.902462005615234, |
|
"learning_rate": 4.372094249281821e-05, |
|
"loss": 3.1365, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 13.907161712646484, |
|
"learning_rate": 4.3614328790925177e-05, |
|
"loss": 3.3361, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 6.845438480377197, |
|
"learning_rate": 4.350694992688289e-05, |
|
"loss": 3.1903, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 14.879186630249023, |
|
"learning_rate": 4.3398810314615876e-05, |
|
"loss": 3.5133, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 10.454853057861328, |
|
"learning_rate": 4.3289914399320034e-05, |
|
"loss": 3.0928, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 10.692648887634277, |
|
"learning_rate": 4.318026665727993e-05, |
|
"loss": 3.1065, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 5.677508354187012, |
|
"learning_rate": 4.306987159568479e-05, |
|
"loss": 2.6484, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 7.058112621307373, |
|
"learning_rate": 4.2958733752443195e-05, |
|
"loss": 2.6531, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 7.029059886932373, |
|
"learning_rate": 4.284685769599658e-05, |
|
"loss": 3.3165, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 9.306894302368164, |
|
"learning_rate": 4.273424802513145e-05, |
|
"loss": 3.2707, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 6.9906392097473145, |
|
"learning_rate": 4.262090936879029e-05, |
|
"loss": 3.0822, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 10.101268768310547, |
|
"learning_rate": 4.250684638588138e-05, |
|
"loss": 3.3994, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 13.360941886901855, |
|
"learning_rate": 4.239206376508717e-05, |
|
"loss": 3.2358, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 25.73331642150879, |
|
"learning_rate": 4.227656622467162e-05, |
|
"loss": 3.1359, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 6.391608715057373, |
|
"learning_rate": 4.216035851228626e-05, |
|
"loss": 3.1343, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 21.201011657714844, |
|
"learning_rate": 4.204344540477499e-05, |
|
"loss": 3.0807, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 12.530668258666992, |
|
"learning_rate": 4.192583170797774e-05, |
|
"loss": 2.9729, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 17.209762573242188, |
|
"learning_rate": 4.180752225653292e-05, |
|
"loss": 2.9911, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 16.309785842895508, |
|
"learning_rate": 4.16885219136787e-05, |
|
"loss": 3.0701, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 7.244511127471924, |
|
"learning_rate": 4.1568835571053075e-05, |
|
"loss": 2.8169, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 5.627272605895996, |
|
"learning_rate": 4.144846814849282e-05, |
|
"loss": 3.0518, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 9.72758674621582, |
|
"learning_rate": 4.132742459383122e-05, |
|
"loss": 3.1135, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 14.121967315673828, |
|
"learning_rate": 4.120570988269472e-05, |
|
"loss": 2.9481, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 71.68965148925781, |
|
"learning_rate": 4.108332901829836e-05, |
|
"loss": 3.7509, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 22.303184509277344, |
|
"learning_rate": 4.096028703124014e-05, |
|
"loss": 2.9864, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 11.789464950561523, |
|
"learning_rate": 4.083658897929426e-05, |
|
"loss": 2.7573, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 21.576683044433594, |
|
"learning_rate": 4.071223994720309e-05, |
|
"loss": 3.4365, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 7.004244804382324, |
|
"learning_rate": 4.058724504646834e-05, |
|
"loss": 3.1543, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 8.743468284606934, |
|
"learning_rate": 4.046160941514079e-05, |
|
"loss": 2.638, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 9.981857299804688, |
|
"learning_rate": 4.033533821760917e-05, |
|
"loss": 2.4701, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 11.454618453979492, |
|
"learning_rate": 4.0208436644387834e-05, |
|
"loss": 3.4152, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 22.861881256103516, |
|
"learning_rate": 4.008090991190341e-05, |
|
"loss": 3.0526, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 34.58321762084961, |
|
"learning_rate": 3.9952763262280405e-05, |
|
"loss": 3.1283, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 12.163683891296387, |
|
"learning_rate": 3.982400196312564e-05, |
|
"loss": 2.8686, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 34.78618240356445, |
|
"learning_rate": 3.969463130731183e-05, |
|
"loss": 2.782, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 11.429190635681152, |
|
"learning_rate": 3.95646566127599e-05, |
|
"loss": 3.1518, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 7.942728519439697, |
|
"learning_rate": 3.943408322222049e-05, |
|
"loss": 3.1183, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 6.556036949157715, |
|
"learning_rate": 3.9302916503054246e-05, |
|
"loss": 2.9241, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 7.161313533782959, |
|
"learning_rate": 3.917116184701125e-05, |
|
"loss": 2.8394, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 17.556371688842773, |
|
"learning_rate": 3.903882467000937e-05, |
|
"loss": 2.667, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 6.0064167976379395, |
|
"learning_rate": 3.8905910411911625e-05, |
|
"loss": 2.8605, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 13.329277038574219, |
|
"learning_rate": 3.8772424536302564e-05, |
|
"loss": 2.8765, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 7.7756876945495605, |
|
"learning_rate": 3.8638372530263715e-05, |
|
"loss": 2.6558, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 6.505970478057861, |
|
"learning_rate": 3.850375990414801e-05, |
|
"loss": 2.6178, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 29.54509925842285, |
|
"learning_rate": 3.836859219135324e-05, |
|
"loss": 2.9655, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 12.92197322845459, |
|
"learning_rate": 3.823287494809469e-05, |
|
"loss": 2.7825, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 10.073705673217773, |
|
"learning_rate": 3.8096613753176634e-05, |
|
"loss": 3.3055, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 23.254878997802734, |
|
"learning_rate": 3.7959814207763135e-05, |
|
"loss": 2.906, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 16.555452346801758, |
|
"learning_rate": 3.782248193514766e-05, |
|
"loss": 3.2928, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 8.429189682006836, |
|
"learning_rate": 3.7684622580522055e-05, |
|
"loss": 2.6259, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 9.762368202209473, |
|
"learning_rate": 3.7546241810744445e-05, |
|
"loss": 2.7576, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 11.004511833190918, |
|
"learning_rate": 3.740734531410626e-05, |
|
"loss": 2.6714, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 56.73052978515625, |
|
"learning_rate": 3.726793880009845e-05, |
|
"loss": 3.542, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 10.330306053161621, |
|
"learning_rate": 3.7128027999176803e-05, |
|
"loss": 3.0011, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 12.130693435668945, |
|
"learning_rate": 3.698761866252635e-05, |
|
"loss": 3.0935, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 12.528786659240723, |
|
"learning_rate": 3.6846716561824965e-05, |
|
"loss": 3.3687, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 14.122909545898438, |
|
"learning_rate": 3.670532748900615e-05, |
|
"loss": 2.5591, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 8.046327590942383, |
|
"learning_rate": 3.656345725602089e-05, |
|
"loss": 2.3845, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 22.46470832824707, |
|
"learning_rate": 3.642111169459879e-05, |
|
"loss": 2.8321, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 10.554203987121582, |
|
"learning_rate": 3.6278296656008366e-05, |
|
"loss": 2.4067, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 8.757329940795898, |
|
"learning_rate": 3.6135018010816477e-05, |
|
"loss": 2.7003, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 8.956037521362305, |
|
"learning_rate": 3.599128164864706e-05, |
|
"loss": 2.7975, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 15.920394897460938, |
|
"learning_rate": 3.5847093477938956e-05, |
|
"loss": 2.7154, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 15.14090633392334, |
|
"learning_rate": 3.570245942570315e-05, |
|
"loss": 3.3547, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 17.72629165649414, |
|
"learning_rate": 3.5557385437279e-05, |
|
"loss": 2.7852, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 5.857475757598877, |
|
"learning_rate": 3.5411877476089975e-05, |
|
"loss": 2.73, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 22.337295532226562, |
|
"learning_rate": 3.526594152339845e-05, |
|
"loss": 3.1076, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 10.523070335388184, |
|
"learning_rate": 3.5119583578059846e-05, |
|
"loss": 3.1857, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 11.635588645935059, |
|
"learning_rate": 3.497280965627605e-05, |
|
"loss": 2.8584, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 27.29017448425293, |
|
"learning_rate": 3.4825625791348096e-05, |
|
"loss": 3.3567, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 25.24538230895996, |
|
"learning_rate": 3.467803803342821e-05, |
|
"loss": 3.1713, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 11.2062406539917, |
|
"learning_rate": 3.4530052449271044e-05, |
|
"loss": 2.8941, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 20.25462532043457, |
|
"learning_rate": 3.438167512198436e-05, |
|
"loss": 3.1291, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 13.0928373336792, |
|
"learning_rate": 3.4232912150778914e-05, |
|
"loss": 2.4672, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 21.326560974121094, |
|
"learning_rate": 3.408376965071779e-05, |
|
"loss": 3.0489, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 54.281578063964844, |
|
"learning_rate": 3.393425375246503e-05, |
|
"loss": 3.0251, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 18.1798038482666, |
|
"learning_rate": 3.378437060203357e-05, |
|
"loss": 3.0152, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 8.757742881774902, |
|
"learning_rate": 3.363412636053269e-05, |
|
"loss": 2.5223, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 7.601003170013428, |
|
"learning_rate": 3.348352720391469e-05, |
|
"loss": 2.8158, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 16.87445068359375, |
|
"learning_rate": 3.3332579322721046e-05, |
|
"loss": 3.3828, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 11.726461410522461, |
|
"learning_rate": 3.318128892182792e-05, |
|
"loss": 2.5411, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 11.669478416442871, |
|
"learning_rate": 3.3029662220191144e-05, |
|
"loss": 2.6686, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 58.972007751464844, |
|
"learning_rate": 3.2877705450590526e-05, |
|
"loss": 3.3026, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 7.434502124786377, |
|
"learning_rate": 3.272542485937369e-05, |
|
"loss": 2.6652, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 22.020599365234375, |
|
"learning_rate": 3.2572826706199305e-05, |
|
"loss": 3.1553, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 17.60317611694336, |
|
"learning_rate": 3.2419917263779766e-05, |
|
"loss": 3.2016, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 15.09745979309082, |
|
"learning_rate": 3.2266702817623346e-05, |
|
"loss": 2.6242, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 11.325228691101074, |
|
"learning_rate": 3.211318966577581e-05, |
|
"loss": 2.9649, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 16.898090362548828, |
|
"learning_rate": 3.195938411856159e-05, |
|
"loss": 2.4623, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 8.761795043945312, |
|
"learning_rate": 3.180529249832428e-05, |
|
"loss": 2.4152, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 11.047560691833496, |
|
"learning_rate": 3.165092113916688e-05, |
|
"loss": 3.0119, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 11.61601734161377, |
|
"learning_rate": 3.149627638669132e-05, |
|
"loss": 2.4781, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 21.56399154663086, |
|
"learning_rate": 3.1341364597737686e-05, |
|
"loss": 3.4005, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 15.666382789611816, |
|
"learning_rate": 3.118619214012286e-05, |
|
"loss": 2.9385, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 11.866064071655273, |
|
"learning_rate": 3.1030765392378816e-05, |
|
"loss": 3.0096, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 11.25349235534668, |
|
"learning_rate": 3.0875090743490384e-05, |
|
"loss": 2.0702, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 10.070486068725586, |
|
"learning_rate": 3.071917459263264e-05, |
|
"loss": 3.0652, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 7.047021389007568, |
|
"learning_rate": 3.056302334890786e-05, |
|
"loss": 2.8617, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 11.59208869934082, |
|
"learning_rate": 3.040664343108209e-05, |
|
"loss": 2.8697, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 9.361946105957031, |
|
"learning_rate": 3.0250041267321232e-05, |
|
"loss": 2.7646, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 11.60400390625, |
|
"learning_rate": 3.0093223294926892e-05, |
|
"loss": 2.8238, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 19.50318145751953, |
|
"learning_rate": 2.993619596007168e-05, |
|
"loss": 2.8857, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 24.245025634765625, |
|
"learning_rate": 2.9778965717534313e-05, |
|
"loss": 2.949, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 14.323025703430176, |
|
"learning_rate": 2.962153903043422e-05, |
|
"loss": 2.6237, |
|
"step": 1135 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 13.869138717651367, |
|
"learning_rate": 2.9463922369965917e-05, |
|
"loss": 2.5021, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 9.667675018310547, |
|
"learning_rate": 2.9306122215132976e-05, |
|
"loss": 2.9346, |
|
"step": 1145 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 22.312841415405273, |
|
"learning_rate": 2.91481450524817e-05, |
|
"loss": 2.7172, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 10.218385696411133, |
|
"learning_rate": 2.8989997375834482e-05, |
|
"loss": 2.4864, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 7.388011455535889, |
|
"learning_rate": 2.8831685686022897e-05, |
|
"loss": 2.7519, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 8.172524452209473, |
|
"learning_rate": 2.8673216490620452e-05, |
|
"loss": 2.701, |
|
"step": 1165 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 11.60141372680664, |
|
"learning_rate": 2.8514596303675073e-05, |
|
"loss": 3.1608, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 21.537139892578125, |
|
"learning_rate": 2.8355831645441388e-05, |
|
"loss": 2.9307, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 12.06334400177002, |
|
"learning_rate": 2.8196929042112652e-05, |
|
"loss": 2.9414, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 8.39703369140625, |
|
"learning_rate": 2.8037895025552512e-05, |
|
"loss": 2.938, |
|
"step": 1185 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 8.78774642944336, |
|
"learning_rate": 2.787873613302649e-05, |
|
"loss": 2.5751, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 24.190916061401367, |
|
"learning_rate": 2.7719458906933277e-05, |
|
"loss": 2.328, |
|
"step": 1195 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 17.80084228515625, |
|
"learning_rate": 2.7560069894535784e-05, |
|
"loss": 2.7258, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 17.03345489501953, |
|
"learning_rate": 2.7400575647692046e-05, |
|
"loss": 2.8104, |
|
"step": 1205 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 36.040897369384766, |
|
"learning_rate": 2.724098272258584e-05, |
|
"loss": 2.9138, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 9.431939125061035, |
|
"learning_rate": 2.7081297679457236e-05, |
|
"loss": 2.3052, |
|
"step": 1215 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 10.720209121704102, |
|
"learning_rate": 2.692152708233292e-05, |
|
"loss": 2.3984, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 13.637066841125488, |
|
"learning_rate": 2.676167749875635e-05, |
|
"loss": 2.8592, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 8.968180656433105, |
|
"learning_rate": 2.6601755499517826e-05, |
|
"loss": 2.4064, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 9.793657302856445, |
|
"learning_rate": 2.6441767658384366e-05, |
|
"loss": 3.0291, |
|
"step": 1235 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 8.77736759185791, |
|
"learning_rate": 2.628172055182948e-05, |
|
"loss": 2.3978, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 9.85341739654541, |
|
"learning_rate": 2.6121620758762877e-05, |
|
"loss": 2.5431, |
|
"step": 1245 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 11.554092407226562, |
|
"learning_rate": 2.596147486025996e-05, |
|
"loss": 2.8736, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 8.345808029174805, |
|
"learning_rate": 2.5801289439291388e-05, |
|
"loss": 2.6837, |
|
"step": 1255 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 9.449942588806152, |
|
"learning_rate": 2.564107108045239e-05, |
|
"loss": 3.1686, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 32.239585876464844, |
|
"learning_rate": 2.5480826369692178e-05, |
|
"loss": 3.0723, |
|
"step": 1265 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 36.09556579589844, |
|
"learning_rate": 2.5320561894043184e-05, |
|
"loss": 2.6983, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 42.91587829589844, |
|
"learning_rate": 2.5160284241350278e-05, |
|
"loss": 2.6336, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 20.82798194885254, |
|
"learning_rate": 2.5e-05, |
|
"loss": 2.8635, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 7.868647575378418, |
|
"learning_rate": 2.4839715758649724e-05, |
|
"loss": 2.8232, |
|
"step": 1285 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 12.662830352783203, |
|
"learning_rate": 2.467943810595682e-05, |
|
"loss": 2.8228, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 15.0082368850708, |
|
"learning_rate": 2.4519173630307825e-05, |
|
"loss": 2.6371, |
|
"step": 1295 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 15.185354232788086, |
|
"learning_rate": 2.4358928919547616e-05, |
|
"loss": 2.6336, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 7.559114456176758, |
|
"learning_rate": 2.419871056070862e-05, |
|
"loss": 2.4209, |
|
"step": 1305 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 13.428425788879395, |
|
"learning_rate": 2.403852513974004e-05, |
|
"loss": 2.6897, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 38.91737747192383, |
|
"learning_rate": 2.3878379241237136e-05, |
|
"loss": 2.569, |
|
"step": 1315 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 15.211456298828125, |
|
"learning_rate": 2.3718279448170525e-05, |
|
"loss": 2.4856, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 11.391417503356934, |
|
"learning_rate": 2.3558232341615643e-05, |
|
"loss": 2.5527, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 21.21587371826172, |
|
"learning_rate": 2.339824450048218e-05, |
|
"loss": 3.5899, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 17.66887664794922, |
|
"learning_rate": 2.323832250124365e-05, |
|
"loss": 2.4669, |
|
"step": 1335 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 10.46285629272461, |
|
"learning_rate": 2.3078472917667092e-05, |
|
"loss": 2.4984, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 14.3226318359375, |
|
"learning_rate": 2.291870232054277e-05, |
|
"loss": 2.8999, |
|
"step": 1345 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 47.95698547363281, |
|
"learning_rate": 2.2759017277414166e-05, |
|
"loss": 2.7827, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 10.691919326782227, |
|
"learning_rate": 2.2599424352307957e-05, |
|
"loss": 2.9201, |
|
"step": 1355 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 15.748466491699219, |
|
"learning_rate": 2.243993010546422e-05, |
|
"loss": 2.3433, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 18.585126876831055, |
|
"learning_rate": 2.2280541093066732e-05, |
|
"loss": 2.7896, |
|
"step": 1365 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 9.304043769836426, |
|
"learning_rate": 2.212126386697352e-05, |
|
"loss": 2.781, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 14.388626098632812, |
|
"learning_rate": 2.196210497444749e-05, |
|
"loss": 2.9678, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 9.608166694641113, |
|
"learning_rate": 2.1803070957887347e-05, |
|
"loss": 3.0607, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 6.090882301330566, |
|
"learning_rate": 2.164416835455862e-05, |
|
"loss": 3.6207, |
|
"step": 1385 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 13.572842597961426, |
|
"learning_rate": 2.1485403696324936e-05, |
|
"loss": 2.5546, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 7.186910629272461, |
|
"learning_rate": 2.1326783509379554e-05, |
|
"loss": 2.9678, |
|
"step": 1395 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 50.014766693115234, |
|
"learning_rate": 2.11683143139771e-05, |
|
"loss": 3.1836, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 6.37592077255249, |
|
"learning_rate": 2.1010002624165527e-05, |
|
"loss": 2.659, |
|
"step": 1405 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 22.413625717163086, |
|
"learning_rate": 2.0851854947518313e-05, |
|
"loss": 2.6929, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 11.021233558654785, |
|
"learning_rate": 2.069387778486703e-05, |
|
"loss": 2.7117, |
|
"step": 1415 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 56.379844665527344, |
|
"learning_rate": 2.0536077630034086e-05, |
|
"loss": 3.4033, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 10.377793312072754, |
|
"learning_rate": 2.0378460969565782e-05, |
|
"loss": 2.4798, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 6.888087749481201, |
|
"learning_rate": 2.02210342824657e-05, |
|
"loss": 2.6521, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 6.764181613922119, |
|
"learning_rate": 2.0063804039928324e-05, |
|
"loss": 2.6824, |
|
"step": 1435 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 55.408729553222656, |
|
"learning_rate": 1.9906776705073114e-05, |
|
"loss": 3.5755, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 8.327556610107422, |
|
"learning_rate": 1.9749958732678767e-05, |
|
"loss": 2.3398, |
|
"step": 1445 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 9.360859870910645, |
|
"learning_rate": 1.9593356568917913e-05, |
|
"loss": 2.5018, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 32.50775146484375, |
|
"learning_rate": 1.9436976651092144e-05, |
|
"loss": 2.8295, |
|
"step": 1455 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 23.467443466186523, |
|
"learning_rate": 1.928082540736737e-05, |
|
"loss": 2.6034, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 10.357468605041504, |
|
"learning_rate": 1.9124909256509622e-05, |
|
"loss": 3.0481, |
|
"step": 1465 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 16.312400817871094, |
|
"learning_rate": 1.8969234607621186e-05, |
|
"loss": 2.9409, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 15.405855178833008, |
|
"learning_rate": 1.8813807859877147e-05, |
|
"loss": 2.6141, |
|
"step": 1475 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 10.774744987487793, |
|
"learning_rate": 1.865863540226232e-05, |
|
"loss": 2.5919, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 7.001652717590332, |
|
"learning_rate": 1.8503723613308683e-05, |
|
"loss": 3.043, |
|
"step": 1485 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 32.264766693115234, |
|
"learning_rate": 1.8349078860833123e-05, |
|
"loss": 2.711, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 15.870736122131348, |
|
"learning_rate": 1.8194707501675724e-05, |
|
"loss": 2.4717, |
|
"step": 1495 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 15.47707748413086, |
|
"learning_rate": 1.8040615881438425e-05, |
|
"loss": 2.9706, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 12.465653419494629, |
|
"learning_rate": 1.7886810334224192e-05, |
|
"loss": 3.0024, |
|
"step": 1505 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 20.99846839904785, |
|
"learning_rate": 1.7733297182376663e-05, |
|
"loss": 3.3546, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 37.41334533691406, |
|
"learning_rate": 1.7580082736220237e-05, |
|
"loss": 2.924, |
|
"step": 1515 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 6.230709552764893, |
|
"learning_rate": 1.74271732938007e-05, |
|
"loss": 2.7253, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 15.803852081298828, |
|
"learning_rate": 1.7274575140626318e-05, |
|
"loss": 2.8601, |
|
"step": 1525 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 6.981109142303467, |
|
"learning_rate": 1.7122294549409484e-05, |
|
"loss": 2.6754, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 21.247787475585938, |
|
"learning_rate": 1.6970337779808862e-05, |
|
"loss": 2.5148, |
|
"step": 1535 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 13.401627540588379, |
|
"learning_rate": 1.6818711078172077e-05, |
|
"loss": 2.5783, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 8.62679386138916, |
|
"learning_rate": 1.666742067727896e-05, |
|
"loss": 2.5611, |
|
"step": 1545 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 6.567328929901123, |
|
"learning_rate": 1.6516472796085315e-05, |
|
"loss": 2.3179, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 9.707379341125488, |
|
"learning_rate": 1.6365873639467315e-05, |
|
"loss": 2.3441, |
|
"step": 1555 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 38.05426788330078, |
|
"learning_rate": 1.621562939796643e-05, |
|
"loss": 2.562, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 18.448406219482422, |
|
"learning_rate": 1.6065746247534984e-05, |
|
"loss": 2.6616, |
|
"step": 1565 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 12.998920440673828, |
|
"learning_rate": 1.5916230349282215e-05, |
|
"loss": 2.9305, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 7.341577053070068, |
|
"learning_rate": 1.5767087849221096e-05, |
|
"loss": 2.6997, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 11.726909637451172, |
|
"learning_rate": 1.561832487801565e-05, |
|
"loss": 2.9281, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 10.877985000610352, |
|
"learning_rate": 1.5469947550728958e-05, |
|
"loss": 3.7181, |
|
"step": 1585 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 11.845093727111816, |
|
"learning_rate": 1.53219619665718e-05, |
|
"loss": 3.1277, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 11.064024925231934, |
|
"learning_rate": 1.5174374208651912e-05, |
|
"loss": 2.8159, |
|
"step": 1595 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 12.611387252807617, |
|
"learning_rate": 1.502719034372396e-05, |
|
"loss": 2.9906, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 8.44251537322998, |
|
"learning_rate": 1.4880416421940155e-05, |
|
"loss": 2.7055, |
|
"step": 1605 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 11.020173072814941, |
|
"learning_rate": 1.4734058476601553e-05, |
|
"loss": 2.7852, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 18.893342971801758, |
|
"learning_rate": 1.458812252391003e-05, |
|
"loss": 2.7677, |
|
"step": 1615 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 9.38496208190918, |
|
"learning_rate": 1.444261456272101e-05, |
|
"loss": 2.7732, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 6.147873401641846, |
|
"learning_rate": 1.4297540574296869e-05, |
|
"loss": 2.2796, |
|
"step": 1625 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 18.142108917236328, |
|
"learning_rate": 1.4152906522061048e-05, |
|
"loss": 3.1047, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 16.73578643798828, |
|
"learning_rate": 1.400871835135295e-05, |
|
"loss": 2.9558, |
|
"step": 1635 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 8.667417526245117, |
|
"learning_rate": 1.386498198918352e-05, |
|
"loss": 2.5587, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 29.769210815429688, |
|
"learning_rate": 1.3721703343991633e-05, |
|
"loss": 3.2146, |
|
"step": 1645 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 27.924272537231445, |
|
"learning_rate": 1.3578888305401207e-05, |
|
"loss": 2.8355, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 8.13482666015625, |
|
"learning_rate": 1.3436542743979125e-05, |
|
"loss": 2.8218, |
|
"step": 1655 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 16.816307067871094, |
|
"learning_rate": 1.329467251099386e-05, |
|
"loss": 2.5636, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 10.0423583984375, |
|
"learning_rate": 1.3153283438175034e-05, |
|
"loss": 2.3591, |
|
"step": 1665 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 10.223281860351562, |
|
"learning_rate": 1.3012381337473656e-05, |
|
"loss": 3.0024, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 16.49390983581543, |
|
"learning_rate": 1.2871972000823196e-05, |
|
"loss": 2.9023, |
|
"step": 1675 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 7.545689582824707, |
|
"learning_rate": 1.2732061199901562e-05, |
|
"loss": 2.8308, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 16.10993003845215, |
|
"learning_rate": 1.2592654685893757e-05, |
|
"loss": 2.6319, |
|
"step": 1685 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 17.161388397216797, |
|
"learning_rate": 1.2453758189255568e-05, |
|
"loss": 2.6535, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 9.898162841796875, |
|
"learning_rate": 1.231537741947795e-05, |
|
"loss": 3.3956, |
|
"step": 1695 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 8.259682655334473, |
|
"learning_rate": 1.217751806485235e-05, |
|
"loss": 2.5027, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 13.10878849029541, |
|
"learning_rate": 1.2040185792236874e-05, |
|
"loss": 2.4352, |
|
"step": 1705 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 5.2070722579956055, |
|
"learning_rate": 1.1903386246823361e-05, |
|
"loss": 2.8799, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 12.991480827331543, |
|
"learning_rate": 1.1767125051905315e-05, |
|
"loss": 3.0858, |
|
"step": 1715 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 10.183439254760742, |
|
"learning_rate": 1.1631407808646758e-05, |
|
"loss": 2.767, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 44.539459228515625, |
|
"learning_rate": 1.1496240095852001e-05, |
|
"loss": 2.9168, |
|
"step": 1725 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 9.825093269348145, |
|
"learning_rate": 1.1361627469736285e-05, |
|
"loss": 2.6493, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 13.887325286865234, |
|
"learning_rate": 1.122757546369744e-05, |
|
"loss": 2.8159, |
|
"step": 1735 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 6.512905120849609, |
|
"learning_rate": 1.1094089588088383e-05, |
|
"loss": 3.4574, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 8.677343368530273, |
|
"learning_rate": 1.096117532999063e-05, |
|
"loss": 2.7936, |
|
"step": 1745 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 29.443038940429688, |
|
"learning_rate": 1.082883815298876e-05, |
|
"loss": 2.7603, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 6.62708044052124, |
|
"learning_rate": 1.0697083496945765e-05, |
|
"loss": 2.6984, |
|
"step": 1755 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 11.575033187866211, |
|
"learning_rate": 1.0565916777779519e-05, |
|
"loss": 2.7648, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 19.933650970458984, |
|
"learning_rate": 1.0435343387240098e-05, |
|
"loss": 2.9776, |
|
"step": 1765 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 12.603243827819824, |
|
"learning_rate": 1.0305368692688174e-05, |
|
"loss": 2.4419, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 8.084098815917969, |
|
"learning_rate": 1.0175998036874356e-05, |
|
"loss": 2.7489, |
|
"step": 1775 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 20.572294235229492, |
|
"learning_rate": 1.0047236737719601e-05, |
|
"loss": 3.0075, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 8.42055606842041, |
|
"learning_rate": 9.919090088096589e-06, |
|
"loss": 2.7706, |
|
"step": 1785 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 33.90031051635742, |
|
"learning_rate": 9.791563355612172e-06, |
|
"loss": 3.6592, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 8.972926139831543, |
|
"learning_rate": 9.664661782390841e-06, |
|
"loss": 2.4741, |
|
"step": 1795 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 34.49540328979492, |
|
"learning_rate": 9.538390584859214e-06, |
|
"loss": 2.8379, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 17.665390014648438, |
|
"learning_rate": 9.412754953531663e-06, |
|
"loss": 3.0774, |
|
"step": 1805 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 24.203062057495117, |
|
"learning_rate": 9.287760052796909e-06, |
|
"loss": 2.6639, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 15.584200859069824, |
|
"learning_rate": 9.163411020705762e-06, |
|
"loss": 2.75, |
|
"step": 1815 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 9.29335880279541, |
|
"learning_rate": 9.039712968759864e-06, |
|
"loss": 2.9486, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 10.088394165039062, |
|
"learning_rate": 8.916670981701655e-06, |
|
"loss": 2.251, |
|
"step": 1825 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 12.336640357971191, |
|
"learning_rate": 8.794290117305296e-06, |
|
"loss": 2.9527, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 8.988898277282715, |
|
"learning_rate": 8.672575406168782e-06, |
|
"loss": 2.6327, |
|
"step": 1835 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 11.5975980758667, |
|
"learning_rate": 8.551531851507186e-06, |
|
"loss": 2.6512, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 23.88221549987793, |
|
"learning_rate": 8.431164428946927e-06, |
|
"loss": 2.6542, |
|
"step": 1845 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 8.243393898010254, |
|
"learning_rate": 8.3114780863213e-06, |
|
"loss": 3.0325, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 13.184893608093262, |
|
"learning_rate": 8.192477743467078e-06, |
|
"loss": 2.7061, |
|
"step": 1855 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 7.766141414642334, |
|
"learning_rate": 8.07416829202227e-06, |
|
"loss": 2.2925, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 30.14096450805664, |
|
"learning_rate": 7.956554595225016e-06, |
|
"loss": 3.4068, |
|
"step": 1865 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 16.042314529418945, |
|
"learning_rate": 7.839641487713745e-06, |
|
"loss": 2.388, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 16.486698150634766, |
|
"learning_rate": 7.723433775328384e-06, |
|
"loss": 3.549, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 10.384564399719238, |
|
"learning_rate": 7.607936234912841e-06, |
|
"loss": 3.186, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 13.724809646606445, |
|
"learning_rate": 7.493153614118634e-06, |
|
"loss": 2.6417, |
|
"step": 1885 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 8.79148006439209, |
|
"learning_rate": 7.379090631209712e-06, |
|
"loss": 2.6296, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 14.533072471618652, |
|
"learning_rate": 7.265751974868554e-06, |
|
"loss": 2.5174, |
|
"step": 1895 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 11.014676094055176, |
|
"learning_rate": 7.153142304003418e-06, |
|
"loss": 2.5473, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 14.28827953338623, |
|
"learning_rate": 7.041266247556813e-06, |
|
"loss": 2.8404, |
|
"step": 1905 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 13.483999252319336, |
|
"learning_rate": 6.930128404315214e-06, |
|
"loss": 2.6145, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 22.0334529876709, |
|
"learning_rate": 6.819733342720066e-06, |
|
"loss": 2.7059, |
|
"step": 1915 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 42.69429397583008, |
|
"learning_rate": 6.7100856006799665e-06, |
|
"loss": 3.0099, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 9.883504867553711, |
|
"learning_rate": 6.601189685384126e-06, |
|
"loss": 2.9242, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 17.86440658569336, |
|
"learning_rate": 6.493050073117116e-06, |
|
"loss": 3.022, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 27.365087509155273, |
|
"learning_rate": 6.385671209074828e-06, |
|
"loss": 2.9742, |
|
"step": 1935 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 13.641215324401855, |
|
"learning_rate": 6.279057507181796e-06, |
|
"loss": 2.4529, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 8.5147123336792, |
|
"learning_rate": 6.173213349909729e-06, |
|
"loss": 2.5926, |
|
"step": 1945 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 13.009632110595703, |
|
"learning_rate": 6.068143088097372e-06, |
|
"loss": 2.4284, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 11.745450973510742, |
|
"learning_rate": 5.9638510407716394e-06, |
|
"loss": 2.7295, |
|
"step": 1955 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 7.102616310119629, |
|
"learning_rate": 5.860341494970131e-06, |
|
"loss": 2.3844, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 19.79552459716797, |
|
"learning_rate": 5.757618705564849e-06, |
|
"loss": 2.6637, |
|
"step": 1965 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 10.288678169250488, |
|
"learning_rate": 5.655686895087329e-06, |
|
"loss": 2.6376, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 5.515965938568115, |
|
"learning_rate": 5.554550253555066e-06, |
|
"loss": 2.7672, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 22.45775604248047, |
|
"learning_rate": 5.454212938299255e-06, |
|
"loss": 2.9637, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 8.804770469665527, |
|
"learning_rate": 5.354679073793942e-06, |
|
"loss": 2.4708, |
|
"step": 1985 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 14.247252464294434, |
|
"learning_rate": 5.255952751486443e-06, |
|
"loss": 2.6551, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 7.674140930175781, |
|
"learning_rate": 5.158038029629195e-06, |
|
"loss": 2.7633, |
|
"step": 1995 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 9.93137264251709, |
|
"learning_rate": 5.060938933112891e-06, |
|
"loss": 2.7034, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 11.792275428771973, |
|
"learning_rate": 4.9646594533010875e-06, |
|
"loss": 2.8564, |
|
"step": 2005 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 13.885851860046387, |
|
"learning_rate": 4.869203547866097e-06, |
|
"loss": 2.6809, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 21.813217163085938, |
|
"learning_rate": 4.7745751406263165e-06, |
|
"loss": 3.0319, |
|
"step": 2015 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 12.183605194091797, |
|
"learning_rate": 4.680778121384935e-06, |
|
"loss": 2.5685, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 26.5369930267334, |
|
"learning_rate": 4.587816345770032e-06, |
|
"loss": 2.9539, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 5.17782735824585, |
|
"learning_rate": 4.495693635076101e-06, |
|
"loss": 2.3644, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 21.22880744934082, |
|
"learning_rate": 4.404413776106958e-06, |
|
"loss": 3.0965, |
|
"step": 2035 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 5.561256408691406, |
|
"learning_rate": 4.313980521020092e-06, |
|
"loss": 3.032, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 7.701003074645996, |
|
"learning_rate": 4.224397587172402e-06, |
|
"loss": 2.7757, |
|
"step": 2045 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 17.345849990844727, |
|
"learning_rate": 4.135668656967434e-06, |
|
"loss": 2.3689, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 8.013896942138672, |
|
"learning_rate": 4.047797377703985e-06, |
|
"loss": 2.512, |
|
"step": 2055 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 6.709656715393066, |
|
"learning_rate": 3.9607873614261715e-06, |
|
"loss": 3.0421, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 13.18295669555664, |
|
"learning_rate": 3.8746421847749765e-06, |
|
"loss": 2.4467, |
|
"step": 2065 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 9.89919662475586, |
|
"learning_rate": 3.789365388841193e-06, |
|
"loss": 2.322, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 20.111215591430664, |
|
"learning_rate": 3.7049604790198976e-06, |
|
"loss": 3.2308, |
|
"step": 2075 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 9.88414192199707, |
|
"learning_rate": 3.621430924866348e-06, |
|
"loss": 2.5939, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 8.166717529296875, |
|
"learning_rate": 3.5387801599533475e-06, |
|
"loss": 2.9439, |
|
"step": 2085 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 31.1898193359375, |
|
"learning_rate": 3.4570115817301243e-06, |
|
"loss": 3.2292, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 26.66686248779297, |
|
"learning_rate": 3.3761285513826625e-06, |
|
"loss": 2.9332, |
|
"step": 2095 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 4.934303283691406, |
|
"learning_rate": 3.296134393695538e-06, |
|
"loss": 2.7759, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 11.340413093566895, |
|
"learning_rate": 3.217032396915265e-06, |
|
"loss": 2.3886, |
|
"step": 2105 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 10.123784065246582, |
|
"learning_rate": 3.1388258126151093e-06, |
|
"loss": 2.6871, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 9.508092880249023, |
|
"learning_rate": 3.06151785556143e-06, |
|
"loss": 3.0332, |
|
"step": 2115 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 6.981659412384033, |
|
"learning_rate": 2.98511170358155e-06, |
|
"loss": 2.6556, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 7.004157066345215, |
|
"learning_rate": 2.9096104974331184e-06, |
|
"loss": 2.7863, |
|
"step": 2125 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 15.677950859069824, |
|
"learning_rate": 2.8350173406749973e-06, |
|
"loss": 2.7697, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 7.333318710327148, |
|
"learning_rate": 2.7613352995397078e-06, |
|
"loss": 2.6726, |
|
"step": 2135 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 5.832099437713623, |
|
"learning_rate": 2.688567402807357e-06, |
|
"loss": 2.8863, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 10.78770637512207, |
|
"learning_rate": 2.6167166416811746e-06, |
|
"loss": 2.8584, |
|
"step": 2145 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 16.921499252319336, |
|
"learning_rate": 2.545785969664524e-06, |
|
"loss": 2.9597, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 8.275824546813965, |
|
"learning_rate": 2.475778302439524e-06, |
|
"loss": 2.1545, |
|
"step": 2155 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 20.255998611450195, |
|
"learning_rate": 2.4066965177471645e-06, |
|
"loss": 2.5138, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 6.863336086273193, |
|
"learning_rate": 2.338543455269046e-06, |
|
"loss": 2.5736, |
|
"step": 2165 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 10.660894393920898, |
|
"learning_rate": 2.271321916510627e-06, |
|
"loss": 2.5524, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 13.18586254119873, |
|
"learning_rate": 2.205034664686076e-06, |
|
"loss": 2.5716, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 11.016261100769043, |
|
"learning_rate": 2.1396844246046903e-06, |
|
"loss": 3.3971, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 23.928808212280273, |
|
"learning_rate": 2.075273882558873e-06, |
|
"loss": 2.7627, |
|
"step": 2185 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 8.036683082580566, |
|
"learning_rate": 2.0118056862137357e-06, |
|
"loss": 2.7452, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 15.620336532592773, |
|
"learning_rate": 1.949282444498238e-06, |
|
"loss": 2.7892, |
|
"step": 2195 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 5.829482078552246, |
|
"learning_rate": 1.8877067274979648e-06, |
|
"loss": 2.7427, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 24.735841751098633, |
|
"learning_rate": 1.827081066349459e-06, |
|
"loss": 2.3911, |
|
"step": 2205 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 15.522488594055176, |
|
"learning_rate": 1.767407953136202e-06, |
|
"loss": 2.7896, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 10.789400100708008, |
|
"learning_rate": 1.7086898407861485e-06, |
|
"loss": 2.6058, |
|
"step": 2215 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 12.961084365844727, |
|
"learning_rate": 1.6509291429709223e-06, |
|
"loss": 2.1545, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 42.19524002075195, |
|
"learning_rate": 1.59412823400657e-06, |
|
"loss": 2.7638, |
|
"step": 2225 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 11.88717269897461, |
|
"learning_rate": 1.538289448755989e-06, |
|
"loss": 3.258, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 9.746241569519043, |
|
"learning_rate": 1.483415082532938e-06, |
|
"loss": 2.4654, |
|
"step": 2235 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 8.284168243408203, |
|
"learning_rate": 1.4295073910076757e-06, |
|
"loss": 2.8934, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 6.290571212768555, |
|
"learning_rate": 1.3765685901142716e-06, |
|
"loss": 2.8615, |
|
"step": 2245 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 9.758255004882812, |
|
"learning_rate": 1.3246008559594709e-06, |
|
"loss": 2.4915, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 64.08142852783203, |
|
"learning_rate": 1.273606324733284e-06, |
|
"loss": 2.6644, |
|
"step": 2255 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 6.796569347381592, |
|
"learning_rate": 1.2235870926211619e-06, |
|
"loss": 3.0118, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 48.14067459106445, |
|
"learning_rate": 1.1745452157178206e-06, |
|
"loss": 2.5428, |
|
"step": 2265 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 78.56604766845703, |
|
"learning_rate": 1.1264827099427417e-06, |
|
"loss": 3.2121, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 8.145052909851074, |
|
"learning_rate": 1.0794015509572818e-06, |
|
"loss": 2.7715, |
|
"step": 2275 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 9.410415649414062, |
|
"learning_rate": 1.0333036740834856e-06, |
|
"loss": 2.9311, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 26.370338439941406, |
|
"learning_rate": 9.881909742245177e-07, |
|
"loss": 2.8867, |
|
"step": 2285 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 19.1849365234375, |
|
"learning_rate": 9.440653057867815e-07, |
|
"loss": 2.559, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 25.568355560302734, |
|
"learning_rate": 9.009284826036691e-07, |
|
"loss": 2.5787, |
|
"step": 2295 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 14.307110786437988, |
|
"learning_rate": 8.587822778610283e-07, |
|
"loss": 2.8058, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 25.297014236450195, |
|
"learning_rate": 8.176284240242638e-07, |
|
"loss": 2.8921, |
|
"step": 2305 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 19.239059448242188, |
|
"learning_rate": 7.774686127671183e-07, |
|
"loss": 2.6122, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 11.51448917388916, |
|
"learning_rate": 7.383044949021339e-07, |
|
"loss": 2.2361, |
|
"step": 2315 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 6.253133773803711, |
|
"learning_rate": 7.00137680312804e-07, |
|
"loss": 2.5475, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 17.918170928955078, |
|
"learning_rate": 6.62969737887384e-07, |
|
"loss": 2.8589, |
|
"step": 2325 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 15.065885543823242, |
|
"learning_rate": 6.268021954544096e-07, |
|
"loss": 2.9153, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 13.842984199523926, |
|
"learning_rate": 5.916365397198975e-07, |
|
"loss": 2.894, |
|
"step": 2335 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 10.621698379516602, |
|
"learning_rate": 5.574742162062163e-07, |
|
"loss": 3.1294, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 13.219593048095703, |
|
"learning_rate": 5.243166291926782e-07, |
|
"loss": 2.6543, |
|
"step": 2345 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 11.599324226379395, |
|
"learning_rate": 4.921651416578188e-07, |
|
"loss": 2.3677, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 8.731278419494629, |
|
"learning_rate": 4.6102107522336403e-07, |
|
"loss": 2.7634, |
|
"step": 2355 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 43.908077239990234, |
|
"learning_rate": 4.308857100999042e-07, |
|
"loss": 2.8106, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 9.233963966369629, |
|
"learning_rate": 4.0176028503425835e-07, |
|
"loss": 2.5252, |
|
"step": 2365 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 13.496291160583496, |
|
"learning_rate": 3.7364599725858153e-07, |
|
"loss": 2.2908, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 11.278969764709473, |
|
"learning_rate": 3.465440024411265e-07, |
|
"loss": 2.5045, |
|
"step": 2375 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 13.121251106262207, |
|
"learning_rate": 3.204554146387456e-07, |
|
"loss": 2.4661, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 17.475078582763672, |
|
"learning_rate": 2.9538130625110796e-07, |
|
"loss": 2.9907, |
|
"step": 2385 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 6.753970146179199, |
|
"learning_rate": 2.7132270797659563e-07, |
|
"loss": 2.4326, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 18.052387237548828, |
|
"learning_rate": 2.482806087699546e-07, |
|
"loss": 3.0056, |
|
"step": 2395 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 11.094266891479492, |
|
"learning_rate": 2.262559558016325e-07, |
|
"loss": 2.7058, |
|
"step": 2400 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 2500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 200, |
|
"total_flos": 1.3202552777146368e+16, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|