|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500, |
|
"global_step": 19095, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.005236973029588898, |
|
"grad_norm": 0.07354702800512314, |
|
"learning_rate": 1.3093289689034372e-07, |
|
"loss": 2.2441, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.010473946059177796, |
|
"grad_norm": 0.07018959522247314, |
|
"learning_rate": 2.6186579378068744e-07, |
|
"loss": 2.2612, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.015710919088766692, |
|
"grad_norm": 0.07288151979446411, |
|
"learning_rate": 3.9279869067103113e-07, |
|
"loss": 2.241, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.020947892118355592, |
|
"grad_norm": 0.0727863535284996, |
|
"learning_rate": 5.237315875613749e-07, |
|
"loss": 2.2466, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.026184865147944488, |
|
"grad_norm": 0.10916672646999359, |
|
"learning_rate": 6.546644844517186e-07, |
|
"loss": 2.2483, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.031421838177533384, |
|
"grad_norm": 0.08643593639135361, |
|
"learning_rate": 7.855973813420623e-07, |
|
"loss": 2.2339, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.036658811207122284, |
|
"grad_norm": 0.09361663460731506, |
|
"learning_rate": 9.165302782324059e-07, |
|
"loss": 2.2352, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.041895784236711184, |
|
"grad_norm": 0.11914397031068802, |
|
"learning_rate": 1.0474631751227498e-06, |
|
"loss": 2.2503, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.04713275726630008, |
|
"grad_norm": 0.11531686782836914, |
|
"learning_rate": 1.1783960720130934e-06, |
|
"loss": 2.2327, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.052369730295888976, |
|
"grad_norm": 0.12413031607866287, |
|
"learning_rate": 1.3093289689034372e-06, |
|
"loss": 2.2412, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.057606703325477876, |
|
"grad_norm": 0.1295381635427475, |
|
"learning_rate": 1.4402618657937809e-06, |
|
"loss": 2.2431, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.06284367635506677, |
|
"grad_norm": 0.1375189870595932, |
|
"learning_rate": 1.5711947626841245e-06, |
|
"loss": 2.2273, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.06808064938465568, |
|
"grad_norm": 0.16082307696342468, |
|
"learning_rate": 1.7021276595744682e-06, |
|
"loss": 2.2117, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.07331762241424457, |
|
"grad_norm": 0.16158346831798553, |
|
"learning_rate": 1.8330605564648118e-06, |
|
"loss": 2.2206, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.07855459544383346, |
|
"grad_norm": 0.174397274851799, |
|
"learning_rate": 1.9639934533551554e-06, |
|
"loss": 2.2255, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.08379156847342237, |
|
"grad_norm": 0.186273992061615, |
|
"learning_rate": 2.0949263502454995e-06, |
|
"loss": 2.1958, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.08902854150301126, |
|
"grad_norm": 0.1794576197862625, |
|
"learning_rate": 2.225859247135843e-06, |
|
"loss": 2.2321, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.09426551453260015, |
|
"grad_norm": 0.19168038666248322, |
|
"learning_rate": 2.3567921440261868e-06, |
|
"loss": 2.2192, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.09950248756218906, |
|
"grad_norm": 0.1959036886692047, |
|
"learning_rate": 2.486415711947627e-06, |
|
"loss": 2.1996, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.10473946059177795, |
|
"grad_norm": 0.22251689434051514, |
|
"learning_rate": 2.6173486088379706e-06, |
|
"loss": 2.2137, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.10997643362136685, |
|
"grad_norm": 0.20624759793281555, |
|
"learning_rate": 2.7482815057283147e-06, |
|
"loss": 2.2049, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.11521340665095575, |
|
"grad_norm": 0.25143158435821533, |
|
"learning_rate": 2.879214402618658e-06, |
|
"loss": 2.2046, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.12045037968054464, |
|
"grad_norm": 0.24057720601558685, |
|
"learning_rate": 3.010147299509002e-06, |
|
"loss": 2.1802, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.12568735271013354, |
|
"grad_norm": 0.23995009064674377, |
|
"learning_rate": 3.141080196399345e-06, |
|
"loss": 2.1901, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.13092432573972243, |
|
"grad_norm": 0.2643965184688568, |
|
"learning_rate": 3.2720130932896892e-06, |
|
"loss": 2.1948, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.13616129876931135, |
|
"grad_norm": 0.2582302689552307, |
|
"learning_rate": 3.4029459901800333e-06, |
|
"loss": 2.1953, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.14139827179890024, |
|
"grad_norm": 0.2721526622772217, |
|
"learning_rate": 3.5338788870703765e-06, |
|
"loss": 2.2013, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.14663524482848914, |
|
"grad_norm": 0.26533135771751404, |
|
"learning_rate": 3.6648117839607206e-06, |
|
"loss": 2.1874, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.15187221785807803, |
|
"grad_norm": 0.2823657989501953, |
|
"learning_rate": 3.7957446808510638e-06, |
|
"loss": 2.1956, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.15710919088766692, |
|
"grad_norm": 0.36414533853530884, |
|
"learning_rate": 3.926677577741408e-06, |
|
"loss": 2.1853, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.16234616391725581, |
|
"grad_norm": 0.2987813353538513, |
|
"learning_rate": 4.0576104746317515e-06, |
|
"loss": 2.1845, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.16758313694684474, |
|
"grad_norm": 0.2912348806858063, |
|
"learning_rate": 4.1885433715220955e-06, |
|
"loss": 2.1754, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.17282010997643363, |
|
"grad_norm": 0.3097289204597473, |
|
"learning_rate": 4.319476268412439e-06, |
|
"loss": 2.1552, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.17805708300602252, |
|
"grad_norm": 0.31930428743362427, |
|
"learning_rate": 4.450409165302783e-06, |
|
"loss": 2.1862, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.1832940560356114, |
|
"grad_norm": 0.33817386627197266, |
|
"learning_rate": 4.581342062193127e-06, |
|
"loss": 2.1808, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.1885310290652003, |
|
"grad_norm": 0.3205846846103668, |
|
"learning_rate": 4.71227495908347e-06, |
|
"loss": 2.1898, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.1937680020947892, |
|
"grad_norm": 0.3309566080570221, |
|
"learning_rate": 4.843207855973814e-06, |
|
"loss": 2.174, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.19900497512437812, |
|
"grad_norm": 0.34491220116615295, |
|
"learning_rate": 4.974140752864157e-06, |
|
"loss": 2.1764, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.204241948153967, |
|
"grad_norm": 0.35818833112716675, |
|
"learning_rate": 5.1050736497545014e-06, |
|
"loss": 2.1502, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.2094789211835559, |
|
"grad_norm": 0.3484792709350586, |
|
"learning_rate": 5.2360065466448455e-06, |
|
"loss": 2.1749, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.2147158942131448, |
|
"grad_norm": 0.3905714452266693, |
|
"learning_rate": 5.366939443535189e-06, |
|
"loss": 2.1468, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.2199528672427337, |
|
"grad_norm": 0.3773205280303955, |
|
"learning_rate": 5.497872340425532e-06, |
|
"loss": 2.1716, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.2251898402723226, |
|
"grad_norm": 0.38546085357666016, |
|
"learning_rate": 5.628805237315876e-06, |
|
"loss": 2.1519, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.2304268133019115, |
|
"grad_norm": 0.39430660009384155, |
|
"learning_rate": 5.75973813420622e-06, |
|
"loss": 2.1472, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.2356637863315004, |
|
"grad_norm": 0.38882067799568176, |
|
"learning_rate": 5.890671031096563e-06, |
|
"loss": 2.1417, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.2409007593610893, |
|
"grad_norm": 0.40174001455307007, |
|
"learning_rate": 6.021603927986907e-06, |
|
"loss": 2.1528, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.24613773239067818, |
|
"grad_norm": 0.4062660038471222, |
|
"learning_rate": 6.152536824877251e-06, |
|
"loss": 2.1352, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.2513747054202671, |
|
"grad_norm": 0.4776448905467987, |
|
"learning_rate": 6.283469721767595e-06, |
|
"loss": 2.1528, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.256611678449856, |
|
"grad_norm": 0.3891739845275879, |
|
"learning_rate": 6.414402618657938e-06, |
|
"loss": 2.1508, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.26184865147944486, |
|
"grad_norm": 0.42986443638801575, |
|
"learning_rate": 6.545335515548282e-06, |
|
"loss": 2.149, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.2670856245090338, |
|
"grad_norm": 0.39317014813423157, |
|
"learning_rate": 6.676268412438626e-06, |
|
"loss": 2.1472, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.2723225975386227, |
|
"grad_norm": 0.45696353912353516, |
|
"learning_rate": 6.807201309328969e-06, |
|
"loss": 2.1401, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.27755957056821157, |
|
"grad_norm": 0.4466469883918762, |
|
"learning_rate": 6.938134206219313e-06, |
|
"loss": 2.1492, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 0.2827965435978005, |
|
"grad_norm": 0.4214916229248047, |
|
"learning_rate": 7.069067103109657e-06, |
|
"loss": 2.1438, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.28803351662738935, |
|
"grad_norm": 0.44096261262893677, |
|
"learning_rate": 7.198690671031097e-06, |
|
"loss": 2.1445, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.2932704896569783, |
|
"grad_norm": 0.4745313823223114, |
|
"learning_rate": 7.329623567921441e-06, |
|
"loss": 2.1303, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.29850746268656714, |
|
"grad_norm": 0.5099794864654541, |
|
"learning_rate": 7.460556464811784e-06, |
|
"loss": 2.14, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 0.30374443571615606, |
|
"grad_norm": 0.4933392405509949, |
|
"learning_rate": 7.5914893617021276e-06, |
|
"loss": 2.1181, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 0.308981408745745, |
|
"grad_norm": 0.4734782576560974, |
|
"learning_rate": 7.722422258592472e-06, |
|
"loss": 2.1259, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 0.31421838177533384, |
|
"grad_norm": 0.4762997627258301, |
|
"learning_rate": 7.853355155482817e-06, |
|
"loss": 2.1185, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.31945535480492276, |
|
"grad_norm": 0.5242263674736023, |
|
"learning_rate": 7.98428805237316e-06, |
|
"loss": 2.1406, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 0.32469232783451163, |
|
"grad_norm": 0.4882369637489319, |
|
"learning_rate": 8.115220949263503e-06, |
|
"loss": 2.1221, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 0.32992930086410055, |
|
"grad_norm": 0.48831576108932495, |
|
"learning_rate": 8.246153846153848e-06, |
|
"loss": 2.1203, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 0.33516627389368947, |
|
"grad_norm": 0.4771474301815033, |
|
"learning_rate": 8.377086743044191e-06, |
|
"loss": 2.1247, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 0.34040324692327834, |
|
"grad_norm": 0.48237186670303345, |
|
"learning_rate": 8.508019639934534e-06, |
|
"loss": 2.1083, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.34564021995286726, |
|
"grad_norm": 0.5286875367164612, |
|
"learning_rate": 8.638952536824878e-06, |
|
"loss": 2.1258, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 0.3508771929824561, |
|
"grad_norm": 0.5419202446937561, |
|
"learning_rate": 8.769885433715222e-06, |
|
"loss": 2.1248, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 0.35611416601204504, |
|
"grad_norm": 0.5243601202964783, |
|
"learning_rate": 8.900818330605566e-06, |
|
"loss": 2.1252, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 0.36135113904163396, |
|
"grad_norm": 0.5450451970100403, |
|
"learning_rate": 9.031751227495909e-06, |
|
"loss": 2.1117, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 0.3665881120712228, |
|
"grad_norm": 0.5390617251396179, |
|
"learning_rate": 9.162684124386254e-06, |
|
"loss": 2.1312, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.37182508510081175, |
|
"grad_norm": 0.5742843747138977, |
|
"learning_rate": 9.293617021276597e-06, |
|
"loss": 2.1207, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 0.3770620581304006, |
|
"grad_norm": 0.5794598460197449, |
|
"learning_rate": 9.42454991816694e-06, |
|
"loss": 2.1088, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 0.38229903115998953, |
|
"grad_norm": 0.5871763229370117, |
|
"learning_rate": 9.555482815057283e-06, |
|
"loss": 2.1138, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 0.3875360041895784, |
|
"grad_norm": 0.574471116065979, |
|
"learning_rate": 9.686415711947628e-06, |
|
"loss": 2.1234, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 0.3927729772191673, |
|
"grad_norm": 0.5694011449813843, |
|
"learning_rate": 9.817348608837972e-06, |
|
"loss": 2.1028, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.39800995024875624, |
|
"grad_norm": 0.5721834301948547, |
|
"learning_rate": 9.948281505728315e-06, |
|
"loss": 2.0942, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 0.4032469232783451, |
|
"grad_norm": 0.5568354725837708, |
|
"learning_rate": 1.0079214402618658e-05, |
|
"loss": 2.0937, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 0.408483896307934, |
|
"grad_norm": 0.575330913066864, |
|
"learning_rate": 1.0210147299509003e-05, |
|
"loss": 2.0989, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 0.4137208693375229, |
|
"grad_norm": 0.5605918169021606, |
|
"learning_rate": 1.0341080196399346e-05, |
|
"loss": 2.0992, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 0.4189578423671118, |
|
"grad_norm": 0.5807542204856873, |
|
"learning_rate": 1.0472013093289691e-05, |
|
"loss": 2.101, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.42419481539670073, |
|
"grad_norm": 0.5749071836471558, |
|
"learning_rate": 1.0602945990180034e-05, |
|
"loss": 2.1092, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 0.4294317884262896, |
|
"grad_norm": 0.6206376552581787, |
|
"learning_rate": 1.0733878887070377e-05, |
|
"loss": 2.1026, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 0.4346687614558785, |
|
"grad_norm": 0.586361825466156, |
|
"learning_rate": 1.086481178396072e-05, |
|
"loss": 2.0985, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 0.4399057344854674, |
|
"grad_norm": 0.6338817477226257, |
|
"learning_rate": 1.0995744680851064e-05, |
|
"loss": 2.0887, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 0.4451427075150563, |
|
"grad_norm": 0.6082013845443726, |
|
"learning_rate": 1.1126677577741409e-05, |
|
"loss": 2.1088, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.4503796805446452, |
|
"grad_norm": 0.6418773531913757, |
|
"learning_rate": 1.1257610474631752e-05, |
|
"loss": 2.0641, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 0.4556166535742341, |
|
"grad_norm": 0.6760055422782898, |
|
"learning_rate": 1.1388543371522097e-05, |
|
"loss": 2.079, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 0.460853626603823, |
|
"grad_norm": 0.611735999584198, |
|
"learning_rate": 1.151947626841244e-05, |
|
"loss": 2.0853, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 0.4660905996334119, |
|
"grad_norm": 0.6323230266571045, |
|
"learning_rate": 1.1650409165302783e-05, |
|
"loss": 2.0919, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 0.4713275726630008, |
|
"grad_norm": 0.7350252270698547, |
|
"learning_rate": 1.1781342062193127e-05, |
|
"loss": 2.0942, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.47656454569258966, |
|
"grad_norm": 0.5890368223190308, |
|
"learning_rate": 1.191227495908347e-05, |
|
"loss": 2.1016, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 0.4818015187221786, |
|
"grad_norm": 0.6341009736061096, |
|
"learning_rate": 1.2043207855973815e-05, |
|
"loss": 2.0804, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 0.4870384917517675, |
|
"grad_norm": 0.6020395755767822, |
|
"learning_rate": 1.2174140752864158e-05, |
|
"loss": 2.0686, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 0.49227546478135636, |
|
"grad_norm": 0.6680401563644409, |
|
"learning_rate": 1.2305073649754503e-05, |
|
"loss": 2.0854, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 0.4975124378109453, |
|
"grad_norm": 0.7290039658546448, |
|
"learning_rate": 1.2436006546644846e-05, |
|
"loss": 2.0779, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.5027494108405341, |
|
"grad_norm": 0.6373685598373413, |
|
"learning_rate": 1.256693944353519e-05, |
|
"loss": 2.1097, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 0.5079863838701231, |
|
"grad_norm": 0.5846343040466309, |
|
"learning_rate": 1.2697872340425532e-05, |
|
"loss": 2.0751, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 0.513223356899712, |
|
"grad_norm": 0.5871058702468872, |
|
"learning_rate": 1.2828805237315876e-05, |
|
"loss": 2.0771, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 0.5184603299293009, |
|
"grad_norm": 0.6121764779090881, |
|
"learning_rate": 1.295973813420622e-05, |
|
"loss": 2.0813, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 0.5236973029588897, |
|
"grad_norm": 0.5855483412742615, |
|
"learning_rate": 1.3090671031096564e-05, |
|
"loss": 2.0796, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.5289342759884786, |
|
"grad_norm": 0.6471145153045654, |
|
"learning_rate": 1.3221603927986909e-05, |
|
"loss": 2.0898, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 0.5341712490180676, |
|
"grad_norm": 0.6933115124702454, |
|
"learning_rate": 1.3352536824877252e-05, |
|
"loss": 2.0949, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 0.5394082220476565, |
|
"grad_norm": 0.6297255158424377, |
|
"learning_rate": 1.3483469721767595e-05, |
|
"loss": 2.0888, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 0.5446451950772454, |
|
"grad_norm": 0.6992611885070801, |
|
"learning_rate": 1.3614402618657938e-05, |
|
"loss": 2.0908, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 0.5498821681068342, |
|
"grad_norm": 0.6574690341949463, |
|
"learning_rate": 1.3745335515548283e-05, |
|
"loss": 2.0826, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 0.5551191411364231, |
|
"grad_norm": 0.5975152850151062, |
|
"learning_rate": 1.3876268412438626e-05, |
|
"loss": 2.0747, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 0.560356114166012, |
|
"grad_norm": 0.6534228920936584, |
|
"learning_rate": 1.400720130932897e-05, |
|
"loss": 2.0882, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 0.565593087195601, |
|
"grad_norm": 0.6680553555488586, |
|
"learning_rate": 1.4138134206219315e-05, |
|
"loss": 2.0788, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 0.5708300602251899, |
|
"grad_norm": 0.6993077993392944, |
|
"learning_rate": 1.4269067103109658e-05, |
|
"loss": 2.0505, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 0.5760670332547787, |
|
"grad_norm": 0.6117376089096069, |
|
"learning_rate": 1.4400000000000001e-05, |
|
"loss": 2.0805, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.5813040062843676, |
|
"grad_norm": 0.670172929763794, |
|
"learning_rate": 1.4530932896890344e-05, |
|
"loss": 2.0809, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 0.5865409793139565, |
|
"grad_norm": 0.61323481798172, |
|
"learning_rate": 1.466186579378069e-05, |
|
"loss": 2.061, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 0.5917779523435455, |
|
"grad_norm": 0.6071058511734009, |
|
"learning_rate": 1.4792798690671032e-05, |
|
"loss": 2.0544, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 0.5970149253731343, |
|
"grad_norm": 0.6362223029136658, |
|
"learning_rate": 1.4923731587561376e-05, |
|
"loss": 2.0656, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 0.6022518984027232, |
|
"grad_norm": 0.6346144080162048, |
|
"learning_rate": 1.505466448445172e-05, |
|
"loss": 2.0611, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 0.6074888714323121, |
|
"grad_norm": 0.6532538533210754, |
|
"learning_rate": 1.5185597381342064e-05, |
|
"loss": 2.0686, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 0.612725844461901, |
|
"grad_norm": 0.6856857538223267, |
|
"learning_rate": 1.5316530278232407e-05, |
|
"loss": 2.0616, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 0.61796281749149, |
|
"grad_norm": 0.9743651747703552, |
|
"learning_rate": 1.544746317512275e-05, |
|
"loss": 2.0641, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 0.6231997905210788, |
|
"grad_norm": 0.628181517124176, |
|
"learning_rate": 1.5578396072013097e-05, |
|
"loss": 2.0725, |
|
"step": 11900 |
|
}, |
|
{ |
|
"epoch": 0.6284367635506677, |
|
"grad_norm": 0.6573601961135864, |
|
"learning_rate": 1.570932896890344e-05, |
|
"loss": 2.0819, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.6336737365802566, |
|
"grad_norm": 0.6741845011711121, |
|
"learning_rate": 1.5840261865793783e-05, |
|
"loss": 2.0636, |
|
"step": 12100 |
|
}, |
|
{ |
|
"epoch": 0.6389107096098455, |
|
"grad_norm": 0.730778694152832, |
|
"learning_rate": 1.5971194762684126e-05, |
|
"loss": 2.0621, |
|
"step": 12200 |
|
}, |
|
{ |
|
"epoch": 0.6441476826394344, |
|
"grad_norm": 0.6156385540962219, |
|
"learning_rate": 1.6100818330605564e-05, |
|
"loss": 2.0571, |
|
"step": 12300 |
|
}, |
|
{ |
|
"epoch": 0.6493846556690233, |
|
"grad_norm": 0.6113892197608948, |
|
"learning_rate": 1.6231751227495908e-05, |
|
"loss": 2.0669, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 0.6546216286986122, |
|
"grad_norm": 0.6205545663833618, |
|
"learning_rate": 1.6362684124386254e-05, |
|
"loss": 2.0669, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 0.6598586017282011, |
|
"grad_norm": 0.6788818836212158, |
|
"learning_rate": 1.6493617021276598e-05, |
|
"loss": 2.0406, |
|
"step": 12600 |
|
}, |
|
{ |
|
"epoch": 0.66509557475779, |
|
"grad_norm": 0.693049430847168, |
|
"learning_rate": 1.662454991816694e-05, |
|
"loss": 2.0551, |
|
"step": 12700 |
|
}, |
|
{ |
|
"epoch": 0.6703325477873789, |
|
"grad_norm": 0.7428627610206604, |
|
"learning_rate": 1.6755482815057284e-05, |
|
"loss": 2.0301, |
|
"step": 12800 |
|
}, |
|
{ |
|
"epoch": 0.6755695208169678, |
|
"grad_norm": 0.6874978542327881, |
|
"learning_rate": 1.6886415711947627e-05, |
|
"loss": 2.0546, |
|
"step": 12900 |
|
}, |
|
{ |
|
"epoch": 0.6808064938465567, |
|
"grad_norm": 0.7278417348861694, |
|
"learning_rate": 1.701734860883797e-05, |
|
"loss": 2.0538, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.6860434668761456, |
|
"grad_norm": 0.641114354133606, |
|
"learning_rate": 1.7148281505728314e-05, |
|
"loss": 2.0585, |
|
"step": 13100 |
|
}, |
|
{ |
|
"epoch": 0.6912804399057345, |
|
"grad_norm": 0.6964296698570251, |
|
"learning_rate": 1.727921440261866e-05, |
|
"loss": 2.0384, |
|
"step": 13200 |
|
}, |
|
{ |
|
"epoch": 0.6965174129353234, |
|
"grad_norm": 0.6126134395599365, |
|
"learning_rate": 1.7410147299509003e-05, |
|
"loss": 2.0449, |
|
"step": 13300 |
|
}, |
|
{ |
|
"epoch": 0.7017543859649122, |
|
"grad_norm": 0.6734199523925781, |
|
"learning_rate": 1.7541080196399347e-05, |
|
"loss": 2.0458, |
|
"step": 13400 |
|
}, |
|
{ |
|
"epoch": 0.7069913589945012, |
|
"grad_norm": 0.6749238967895508, |
|
"learning_rate": 1.767201309328969e-05, |
|
"loss": 2.0639, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 0.7122283320240901, |
|
"grad_norm": 0.6168593764305115, |
|
"learning_rate": 1.7802945990180033e-05, |
|
"loss": 2.0435, |
|
"step": 13600 |
|
}, |
|
{ |
|
"epoch": 0.717465305053679, |
|
"grad_norm": 0.7050462365150452, |
|
"learning_rate": 1.7933878887070376e-05, |
|
"loss": 2.049, |
|
"step": 13700 |
|
}, |
|
{ |
|
"epoch": 0.7227022780832679, |
|
"grad_norm": 0.6948175430297852, |
|
"learning_rate": 1.806481178396072e-05, |
|
"loss": 2.0516, |
|
"step": 13800 |
|
}, |
|
{ |
|
"epoch": 0.7279392511128567, |
|
"grad_norm": 0.6051421761512756, |
|
"learning_rate": 1.8195744680851066e-05, |
|
"loss": 2.0478, |
|
"step": 13900 |
|
}, |
|
{ |
|
"epoch": 0.7331762241424457, |
|
"grad_norm": 0.7436869144439697, |
|
"learning_rate": 1.832667757774141e-05, |
|
"loss": 2.0466, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.7384131971720346, |
|
"grad_norm": 0.6047870516777039, |
|
"learning_rate": 1.8457610474631753e-05, |
|
"loss": 2.0382, |
|
"step": 14100 |
|
}, |
|
{ |
|
"epoch": 0.7436501702016235, |
|
"grad_norm": 0.7828758358955383, |
|
"learning_rate": 1.8588543371522096e-05, |
|
"loss": 2.0303, |
|
"step": 14200 |
|
}, |
|
{ |
|
"epoch": 0.7488871432312123, |
|
"grad_norm": 0.653523325920105, |
|
"learning_rate": 1.871947626841244e-05, |
|
"loss": 2.048, |
|
"step": 14300 |
|
}, |
|
{ |
|
"epoch": 0.7541241162608012, |
|
"grad_norm": 0.6173336505889893, |
|
"learning_rate": 1.8850409165302782e-05, |
|
"loss": 2.0489, |
|
"step": 14400 |
|
}, |
|
{ |
|
"epoch": 0.7593610892903901, |
|
"grad_norm": 0.7114732265472412, |
|
"learning_rate": 1.8981342062193125e-05, |
|
"loss": 2.0356, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 0.7645980623199791, |
|
"grad_norm": 0.6434004902839661, |
|
"learning_rate": 1.9112274959083472e-05, |
|
"loss": 2.035, |
|
"step": 14600 |
|
}, |
|
{ |
|
"epoch": 0.769835035349568, |
|
"grad_norm": 0.6391409039497375, |
|
"learning_rate": 1.9243207855973815e-05, |
|
"loss": 2.0638, |
|
"step": 14700 |
|
}, |
|
{ |
|
"epoch": 0.7750720083791568, |
|
"grad_norm": 0.8258867263793945, |
|
"learning_rate": 1.937414075286416e-05, |
|
"loss": 2.0248, |
|
"step": 14800 |
|
}, |
|
{ |
|
"epoch": 0.7803089814087457, |
|
"grad_norm": 0.6063815951347351, |
|
"learning_rate": 1.95050736497545e-05, |
|
"loss": 2.0486, |
|
"step": 14900 |
|
}, |
|
{ |
|
"epoch": 0.7855459544383346, |
|
"grad_norm": 0.6866258978843689, |
|
"learning_rate": 1.9636006546644845e-05, |
|
"loss": 2.0292, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.7907829274679236, |
|
"grad_norm": 0.5765138268470764, |
|
"learning_rate": 1.9766939443535188e-05, |
|
"loss": 2.0186, |
|
"step": 15100 |
|
}, |
|
{ |
|
"epoch": 0.7960199004975125, |
|
"grad_norm": 0.6583371162414551, |
|
"learning_rate": 1.989787234042553e-05, |
|
"loss": 2.0391, |
|
"step": 15200 |
|
}, |
|
{ |
|
"epoch": 0.8012568735271013, |
|
"grad_norm": 0.7544857263565063, |
|
"learning_rate": 1.9998363271901744e-05, |
|
"loss": 2.0349, |
|
"step": 15300 |
|
}, |
|
{ |
|
"epoch": 0.8064938465566902, |
|
"grad_norm": 0.6168326735496521, |
|
"learning_rate": 1.9949708067498546e-05, |
|
"loss": 2.0375, |
|
"step": 15400 |
|
}, |
|
{ |
|
"epoch": 0.8117308195862791, |
|
"grad_norm": 0.7661889791488647, |
|
"learning_rate": 1.9833795697023395e-05, |
|
"loss": 2.0328, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 0.816967792615868, |
|
"grad_norm": 0.6521978974342346, |
|
"learning_rate": 1.9651409694776794e-05, |
|
"loss": 2.0574, |
|
"step": 15600 |
|
}, |
|
{ |
|
"epoch": 0.822204765645457, |
|
"grad_norm": 0.6655182838439941, |
|
"learning_rate": 1.9403782937699357e-05, |
|
"loss": 2.0313, |
|
"step": 15700 |
|
}, |
|
{ |
|
"epoch": 0.8274417386750458, |
|
"grad_norm": 0.6480154991149902, |
|
"learning_rate": 1.9092589311478146e-05, |
|
"loss": 2.0384, |
|
"step": 15800 |
|
}, |
|
{ |
|
"epoch": 0.8326787117046347, |
|
"grad_norm": 0.6570712327957153, |
|
"learning_rate": 1.8719932395560647e-05, |
|
"loss": 2.0313, |
|
"step": 15900 |
|
}, |
|
{ |
|
"epoch": 0.8379156847342236, |
|
"grad_norm": 0.6235129237174988, |
|
"learning_rate": 1.8288331243562475e-05, |
|
"loss": 2.0322, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.8431526577638125, |
|
"grad_norm": 0.6542329788208008, |
|
"learning_rate": 1.7800703355189137e-05, |
|
"loss": 2.0384, |
|
"step": 16100 |
|
}, |
|
{ |
|
"epoch": 0.8483896307934015, |
|
"grad_norm": 0.6734735369682312, |
|
"learning_rate": 1.726034495477677e-05, |
|
"loss": 2.0381, |
|
"step": 16200 |
|
}, |
|
{ |
|
"epoch": 0.8536266038229903, |
|
"grad_norm": 0.6568425297737122, |
|
"learning_rate": 1.66709087097633e-05, |
|
"loss": 2.0372, |
|
"step": 16300 |
|
}, |
|
{ |
|
"epoch": 0.8588635768525792, |
|
"grad_norm": 0.6389915943145752, |
|
"learning_rate": 1.603637903970664e-05, |
|
"loss": 2.0302, |
|
"step": 16400 |
|
}, |
|
{ |
|
"epoch": 0.8641005498821681, |
|
"grad_norm": 0.5985362529754639, |
|
"learning_rate": 1.5361045182753986e-05, |
|
"loss": 2.025, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 0.869337522911757, |
|
"grad_norm": 0.6669567823410034, |
|
"learning_rate": 1.4649472201625057e-05, |
|
"loss": 2.0329, |
|
"step": 16600 |
|
}, |
|
{ |
|
"epoch": 0.874574495941346, |
|
"grad_norm": 0.5840954780578613, |
|
"learning_rate": 1.3914039388098432e-05, |
|
"loss": 2.0207, |
|
"step": 16700 |
|
}, |
|
{ |
|
"epoch": 0.8798114689709348, |
|
"grad_norm": 0.6801176071166992, |
|
"learning_rate": 1.3144869286586354e-05, |
|
"loss": 2.0087, |
|
"step": 16800 |
|
}, |
|
{ |
|
"epoch": 0.8850484420005237, |
|
"grad_norm": 0.6012386679649353, |
|
"learning_rate": 1.2354440772822623e-05, |
|
"loss": 2.0202, |
|
"step": 16900 |
|
}, |
|
{ |
|
"epoch": 0.8902854150301126, |
|
"grad_norm": 0.655200719833374, |
|
"learning_rate": 1.1548096916318175e-05, |
|
"loss": 2.0297, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 0.8955223880597015, |
|
"grad_norm": 0.6136151552200317, |
|
"learning_rate": 1.0739490166119155e-05, |
|
"loss": 2.0128, |
|
"step": 17100 |
|
}, |
|
{ |
|
"epoch": 0.9007593610892904, |
|
"grad_norm": 0.7110956311225891, |
|
"learning_rate": 9.917760281675867e-06, |
|
"loss": 2.0239, |
|
"step": 17200 |
|
}, |
|
{ |
|
"epoch": 0.9059963341188793, |
|
"grad_norm": 0.6501589417457581, |
|
"learning_rate": 9.096586314085162e-06, |
|
"loss": 2.0274, |
|
"step": 17300 |
|
}, |
|
{ |
|
"epoch": 0.9112333071484682, |
|
"grad_norm": 0.6471460461616516, |
|
"learning_rate": 8.281519163286772e-06, |
|
"loss": 2.0398, |
|
"step": 17400 |
|
}, |
|
{ |
|
"epoch": 0.9164702801780571, |
|
"grad_norm": 0.7580538392066956, |
|
"learning_rate": 7.478068448894577e-06, |
|
"loss": 2.0231, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 0.921707253207646, |
|
"grad_norm": 0.657486617565155, |
|
"learning_rate": 6.6916652667519855e-06, |
|
"loss": 2.0192, |
|
"step": 17600 |
|
}, |
|
{ |
|
"epoch": 0.9269442262372348, |
|
"grad_norm": 0.7056224346160889, |
|
"learning_rate": 5.927625476285426e-06, |
|
"loss": 2.0233, |
|
"step": 17700 |
|
}, |
|
{ |
|
"epoch": 0.9321811992668237, |
|
"grad_norm": 0.6053991913795471, |
|
"learning_rate": 5.191113766822905e-06, |
|
"loss": 2.0165, |
|
"step": 17800 |
|
}, |
|
{ |
|
"epoch": 0.9374181722964127, |
|
"grad_norm": 0.6543104648590088, |
|
"learning_rate": 4.487108745778958e-06, |
|
"loss": 2.0096, |
|
"step": 17900 |
|
}, |
|
{ |
|
"epoch": 0.9426551453260016, |
|
"grad_norm": 0.6767512559890747, |
|
"learning_rate": 3.820369284699823e-06, |
|
"loss": 2.0236, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 0.9478921183555905, |
|
"grad_norm": 0.5917364358901978, |
|
"learning_rate": 3.195402350659945e-06, |
|
"loss": 2.0315, |
|
"step": 18100 |
|
}, |
|
{ |
|
"epoch": 0.9531290913851793, |
|
"grad_norm": 0.5886921286582947, |
|
"learning_rate": 2.616432540460255e-06, |
|
"loss": 2.0335, |
|
"step": 18200 |
|
}, |
|
{ |
|
"epoch": 0.9583660644147682, |
|
"grad_norm": 0.6519348621368408, |
|
"learning_rate": 2.0873735235683535e-06, |
|
"loss": 2.0138, |
|
"step": 18300 |
|
}, |
|
{ |
|
"epoch": 0.9636030374443572, |
|
"grad_norm": 0.6619024872779846, |
|
"learning_rate": 1.6118015868380387e-06, |
|
"loss": 2.0223, |
|
"step": 18400 |
|
}, |
|
{ |
|
"epoch": 0.9688400104739461, |
|
"grad_norm": 0.6300016045570374, |
|
"learning_rate": 1.1929314598383423e-06, |
|
"loss": 2.0184, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 0.974076983503535, |
|
"grad_norm": 0.6754645109176636, |
|
"learning_rate": 8.335945842058524e-07, |
|
"loss": 2.0215, |
|
"step": 18600 |
|
}, |
|
{ |
|
"epoch": 0.9793139565331238, |
|
"grad_norm": 0.7369129657745361, |
|
"learning_rate": 5.362199739132656e-07, |
|
"loss": 2.0138, |
|
"step": 18700 |
|
}, |
|
{ |
|
"epoch": 0.9845509295627127, |
|
"grad_norm": 0.5822499990463257, |
|
"learning_rate": 3.028177958332512e-07, |
|
"loss": 2.0249, |
|
"step": 18800 |
|
}, |
|
{ |
|
"epoch": 0.9897879025923016, |
|
"grad_norm": 0.6991373896598816, |
|
"learning_rate": 1.349657815883032e-07, |
|
"loss": 2.0329, |
|
"step": 18900 |
|
}, |
|
{ |
|
"epoch": 0.9950248756218906, |
|
"grad_norm": 0.6656786203384399, |
|
"learning_rate": 3.379856253855951e-08, |
|
"loss": 2.0112, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 19095, |
|
"total_flos": 2.7919996141761987e+18, |
|
"train_loss": 2.1007044342432573, |
|
"train_runtime": 7350.2103, |
|
"train_samples_per_second": 41.566, |
|
"train_steps_per_second": 2.598 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 19095, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.7919996141761987e+18, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|