{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 19095, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005236973029588898, "grad_norm": 0.07354702800512314, "learning_rate": 1.3093289689034372e-07, "loss": 2.2441, "step": 100 }, { "epoch": 0.010473946059177796, "grad_norm": 0.07018959522247314, "learning_rate": 2.6186579378068744e-07, "loss": 2.2612, "step": 200 }, { "epoch": 0.015710919088766692, "grad_norm": 0.07288151979446411, "learning_rate": 3.9279869067103113e-07, "loss": 2.241, "step": 300 }, { "epoch": 0.020947892118355592, "grad_norm": 0.0727863535284996, "learning_rate": 5.237315875613749e-07, "loss": 2.2466, "step": 400 }, { "epoch": 0.026184865147944488, "grad_norm": 0.10916672646999359, "learning_rate": 6.546644844517186e-07, "loss": 2.2483, "step": 500 }, { "epoch": 0.031421838177533384, "grad_norm": 0.08643593639135361, "learning_rate": 7.855973813420623e-07, "loss": 2.2339, "step": 600 }, { "epoch": 0.036658811207122284, "grad_norm": 0.09361663460731506, "learning_rate": 9.165302782324059e-07, "loss": 2.2352, "step": 700 }, { "epoch": 0.041895784236711184, "grad_norm": 0.11914397031068802, "learning_rate": 1.0474631751227498e-06, "loss": 2.2503, "step": 800 }, { "epoch": 0.04713275726630008, "grad_norm": 0.11531686782836914, "learning_rate": 1.1783960720130934e-06, "loss": 2.2327, "step": 900 }, { "epoch": 0.052369730295888976, "grad_norm": 0.12413031607866287, "learning_rate": 1.3093289689034372e-06, "loss": 2.2412, "step": 1000 }, { "epoch": 0.057606703325477876, "grad_norm": 0.1295381635427475, "learning_rate": 1.4402618657937809e-06, "loss": 2.2431, "step": 1100 }, { "epoch": 0.06284367635506677, "grad_norm": 0.1375189870595932, "learning_rate": 1.5711947626841245e-06, "loss": 2.2273, "step": 1200 }, { "epoch": 0.06808064938465568, "grad_norm": 0.16082307696342468, "learning_rate": 1.7021276595744682e-06, "loss": 2.2117, "step": 1300 }, { "epoch": 0.07331762241424457, "grad_norm": 0.16158346831798553, "learning_rate": 1.8330605564648118e-06, "loss": 2.2206, "step": 1400 }, { "epoch": 0.07855459544383346, "grad_norm": 0.174397274851799, "learning_rate": 1.9639934533551554e-06, "loss": 2.2255, "step": 1500 }, { "epoch": 0.08379156847342237, "grad_norm": 0.186273992061615, "learning_rate": 2.0949263502454995e-06, "loss": 2.1958, "step": 1600 }, { "epoch": 0.08902854150301126, "grad_norm": 0.1794576197862625, "learning_rate": 2.225859247135843e-06, "loss": 2.2321, "step": 1700 }, { "epoch": 0.09426551453260015, "grad_norm": 0.19168038666248322, "learning_rate": 2.3567921440261868e-06, "loss": 2.2192, "step": 1800 }, { "epoch": 0.09950248756218906, "grad_norm": 0.1959036886692047, "learning_rate": 2.486415711947627e-06, "loss": 2.1996, "step": 1900 }, { "epoch": 0.10473946059177795, "grad_norm": 0.22251689434051514, "learning_rate": 2.6173486088379706e-06, "loss": 2.2137, "step": 2000 }, { "epoch": 0.10997643362136685, "grad_norm": 0.20624759793281555, "learning_rate": 2.7482815057283147e-06, "loss": 2.2049, "step": 2100 }, { "epoch": 0.11521340665095575, "grad_norm": 0.25143158435821533, "learning_rate": 2.879214402618658e-06, "loss": 2.2046, "step": 2200 }, { "epoch": 0.12045037968054464, "grad_norm": 0.24057720601558685, "learning_rate": 3.010147299509002e-06, "loss": 2.1802, "step": 2300 }, { "epoch": 0.12568735271013354, "grad_norm": 0.23995009064674377, "learning_rate": 3.141080196399345e-06, "loss": 2.1901, "step": 2400 }, { "epoch": 0.13092432573972243, "grad_norm": 0.2643965184688568, "learning_rate": 3.2720130932896892e-06, "loss": 2.1948, "step": 2500 }, { "epoch": 0.13616129876931135, "grad_norm": 0.2582302689552307, "learning_rate": 3.4029459901800333e-06, "loss": 2.1953, "step": 2600 }, { "epoch": 0.14139827179890024, "grad_norm": 0.2721526622772217, "learning_rate": 3.5338788870703765e-06, "loss": 2.2013, "step": 2700 }, { "epoch": 0.14663524482848914, "grad_norm": 0.26533135771751404, "learning_rate": 3.6648117839607206e-06, "loss": 2.1874, "step": 2800 }, { "epoch": 0.15187221785807803, "grad_norm": 0.2823657989501953, "learning_rate": 3.7957446808510638e-06, "loss": 2.1956, "step": 2900 }, { "epoch": 0.15710919088766692, "grad_norm": 0.36414533853530884, "learning_rate": 3.926677577741408e-06, "loss": 2.1853, "step": 3000 }, { "epoch": 0.16234616391725581, "grad_norm": 0.2987813353538513, "learning_rate": 4.0576104746317515e-06, "loss": 2.1845, "step": 3100 }, { "epoch": 0.16758313694684474, "grad_norm": 0.2912348806858063, "learning_rate": 4.1885433715220955e-06, "loss": 2.1754, "step": 3200 }, { "epoch": 0.17282010997643363, "grad_norm": 0.3097289204597473, "learning_rate": 4.319476268412439e-06, "loss": 2.1552, "step": 3300 }, { "epoch": 0.17805708300602252, "grad_norm": 0.31930428743362427, "learning_rate": 4.450409165302783e-06, "loss": 2.1862, "step": 3400 }, { "epoch": 0.1832940560356114, "grad_norm": 0.33817386627197266, "learning_rate": 4.581342062193127e-06, "loss": 2.1808, "step": 3500 }, { "epoch": 0.1885310290652003, "grad_norm": 0.3205846846103668, "learning_rate": 4.71227495908347e-06, "loss": 2.1898, "step": 3600 }, { "epoch": 0.1937680020947892, "grad_norm": 0.3309566080570221, "learning_rate": 4.843207855973814e-06, "loss": 2.174, "step": 3700 }, { "epoch": 0.19900497512437812, "grad_norm": 0.34491220116615295, "learning_rate": 4.974140752864157e-06, "loss": 2.1764, "step": 3800 }, { "epoch": 0.204241948153967, "grad_norm": 0.35818833112716675, "learning_rate": 5.1050736497545014e-06, "loss": 2.1502, "step": 3900 }, { "epoch": 0.2094789211835559, "grad_norm": 0.3484792709350586, "learning_rate": 5.2360065466448455e-06, "loss": 2.1749, "step": 4000 }, { "epoch": 0.2147158942131448, "grad_norm": 0.3905714452266693, "learning_rate": 5.366939443535189e-06, "loss": 2.1468, "step": 4100 }, { "epoch": 0.2199528672427337, "grad_norm": 0.3773205280303955, "learning_rate": 5.497872340425532e-06, "loss": 2.1716, "step": 4200 }, { "epoch": 0.2251898402723226, "grad_norm": 0.38546085357666016, "learning_rate": 5.628805237315876e-06, "loss": 2.1519, "step": 4300 }, { "epoch": 0.2304268133019115, "grad_norm": 0.39430660009384155, "learning_rate": 5.75973813420622e-06, "loss": 2.1472, "step": 4400 }, { "epoch": 0.2356637863315004, "grad_norm": 0.38882067799568176, "learning_rate": 5.890671031096563e-06, "loss": 2.1417, "step": 4500 }, { "epoch": 0.2409007593610893, "grad_norm": 0.40174001455307007, "learning_rate": 6.021603927986907e-06, "loss": 2.1528, "step": 4600 }, { "epoch": 0.24613773239067818, "grad_norm": 0.4062660038471222, "learning_rate": 6.152536824877251e-06, "loss": 2.1352, "step": 4700 }, { "epoch": 0.2513747054202671, "grad_norm": 0.4776448905467987, "learning_rate": 6.283469721767595e-06, "loss": 2.1528, "step": 4800 }, { "epoch": 0.256611678449856, "grad_norm": 0.3891739845275879, "learning_rate": 6.414402618657938e-06, "loss": 2.1508, "step": 4900 }, { "epoch": 0.26184865147944486, "grad_norm": 0.42986443638801575, "learning_rate": 6.545335515548282e-06, "loss": 2.149, "step": 5000 }, { "epoch": 0.2670856245090338, "grad_norm": 0.39317014813423157, "learning_rate": 6.676268412438626e-06, "loss": 2.1472, "step": 5100 }, { "epoch": 0.2723225975386227, "grad_norm": 0.45696353912353516, "learning_rate": 6.807201309328969e-06, "loss": 2.1401, "step": 5200 }, { "epoch": 0.27755957056821157, "grad_norm": 0.4466469883918762, "learning_rate": 6.938134206219313e-06, "loss": 2.1492, "step": 5300 }, { "epoch": 0.2827965435978005, "grad_norm": 0.4214916229248047, "learning_rate": 7.069067103109657e-06, "loss": 2.1438, "step": 5400 }, { "epoch": 0.28803351662738935, "grad_norm": 0.44096261262893677, "learning_rate": 7.198690671031097e-06, "loss": 2.1445, "step": 5500 }, { "epoch": 0.2932704896569783, "grad_norm": 0.4745313823223114, "learning_rate": 7.329623567921441e-06, "loss": 2.1303, "step": 5600 }, { "epoch": 0.29850746268656714, "grad_norm": 0.5099794864654541, "learning_rate": 7.460556464811784e-06, "loss": 2.14, "step": 5700 }, { "epoch": 0.30374443571615606, "grad_norm": 0.4933392405509949, "learning_rate": 7.5914893617021276e-06, "loss": 2.1181, "step": 5800 }, { "epoch": 0.308981408745745, "grad_norm": 0.4734782576560974, "learning_rate": 7.722422258592472e-06, "loss": 2.1259, "step": 5900 }, { "epoch": 0.31421838177533384, "grad_norm": 0.4762997627258301, "learning_rate": 7.853355155482817e-06, "loss": 2.1185, "step": 6000 }, { "epoch": 0.31945535480492276, "grad_norm": 0.5242263674736023, "learning_rate": 7.98428805237316e-06, "loss": 2.1406, "step": 6100 }, { "epoch": 0.32469232783451163, "grad_norm": 0.4882369637489319, "learning_rate": 8.115220949263503e-06, "loss": 2.1221, "step": 6200 }, { "epoch": 0.32992930086410055, "grad_norm": 0.48831576108932495, "learning_rate": 8.246153846153848e-06, "loss": 2.1203, "step": 6300 }, { "epoch": 0.33516627389368947, "grad_norm": 0.4771474301815033, "learning_rate": 8.377086743044191e-06, "loss": 2.1247, "step": 6400 }, { "epoch": 0.34040324692327834, "grad_norm": 0.48237186670303345, "learning_rate": 8.508019639934534e-06, "loss": 2.1083, "step": 6500 }, { "epoch": 0.34564021995286726, "grad_norm": 0.5286875367164612, "learning_rate": 8.638952536824878e-06, "loss": 2.1258, "step": 6600 }, { "epoch": 0.3508771929824561, "grad_norm": 0.5419202446937561, "learning_rate": 8.769885433715222e-06, "loss": 2.1248, "step": 6700 }, { "epoch": 0.35611416601204504, "grad_norm": 0.5243601202964783, "learning_rate": 8.900818330605566e-06, "loss": 2.1252, "step": 6800 }, { "epoch": 0.36135113904163396, "grad_norm": 0.5450451970100403, "learning_rate": 9.031751227495909e-06, "loss": 2.1117, "step": 6900 }, { "epoch": 0.3665881120712228, "grad_norm": 0.5390617251396179, "learning_rate": 9.162684124386254e-06, "loss": 2.1312, "step": 7000 }, { "epoch": 0.37182508510081175, "grad_norm": 0.5742843747138977, "learning_rate": 9.293617021276597e-06, "loss": 2.1207, "step": 7100 }, { "epoch": 0.3770620581304006, "grad_norm": 0.5794598460197449, "learning_rate": 9.42454991816694e-06, "loss": 2.1088, "step": 7200 }, { "epoch": 0.38229903115998953, "grad_norm": 0.5871763229370117, "learning_rate": 9.555482815057283e-06, "loss": 2.1138, "step": 7300 }, { "epoch": 0.3875360041895784, "grad_norm": 0.574471116065979, "learning_rate": 9.686415711947628e-06, "loss": 2.1234, "step": 7400 }, { "epoch": 0.3927729772191673, "grad_norm": 0.5694011449813843, "learning_rate": 9.817348608837972e-06, "loss": 2.1028, "step": 7500 }, { "epoch": 0.39800995024875624, "grad_norm": 0.5721834301948547, "learning_rate": 9.948281505728315e-06, "loss": 2.0942, "step": 7600 }, { "epoch": 0.4032469232783451, "grad_norm": 0.5568354725837708, "learning_rate": 1.0079214402618658e-05, "loss": 2.0937, "step": 7700 }, { "epoch": 0.408483896307934, "grad_norm": 0.575330913066864, "learning_rate": 1.0210147299509003e-05, "loss": 2.0989, "step": 7800 }, { "epoch": 0.4137208693375229, "grad_norm": 0.5605918169021606, "learning_rate": 1.0341080196399346e-05, "loss": 2.0992, "step": 7900 }, { "epoch": 0.4189578423671118, "grad_norm": 0.5807542204856873, "learning_rate": 1.0472013093289691e-05, "loss": 2.101, "step": 8000 }, { "epoch": 0.42419481539670073, "grad_norm": 0.5749071836471558, "learning_rate": 1.0602945990180034e-05, "loss": 2.1092, "step": 8100 }, { "epoch": 0.4294317884262896, "grad_norm": 0.6206376552581787, "learning_rate": 1.0733878887070377e-05, "loss": 2.1026, "step": 8200 }, { "epoch": 0.4346687614558785, "grad_norm": 0.586361825466156, "learning_rate": 1.086481178396072e-05, "loss": 2.0985, "step": 8300 }, { "epoch": 0.4399057344854674, "grad_norm": 0.6338817477226257, "learning_rate": 1.0995744680851064e-05, "loss": 2.0887, "step": 8400 }, { "epoch": 0.4451427075150563, "grad_norm": 0.6082013845443726, "learning_rate": 1.1126677577741409e-05, "loss": 2.1088, "step": 8500 }, { "epoch": 0.4503796805446452, "grad_norm": 0.6418773531913757, "learning_rate": 1.1257610474631752e-05, "loss": 2.0641, "step": 8600 }, { "epoch": 0.4556166535742341, "grad_norm": 0.6760055422782898, "learning_rate": 1.1388543371522097e-05, "loss": 2.079, "step": 8700 }, { "epoch": 0.460853626603823, "grad_norm": 0.611735999584198, "learning_rate": 1.151947626841244e-05, "loss": 2.0853, "step": 8800 }, { "epoch": 0.4660905996334119, "grad_norm": 0.6323230266571045, "learning_rate": 1.1650409165302783e-05, "loss": 2.0919, "step": 8900 }, { "epoch": 0.4713275726630008, "grad_norm": 0.7350252270698547, "learning_rate": 1.1781342062193127e-05, "loss": 2.0942, "step": 9000 }, { "epoch": 0.47656454569258966, "grad_norm": 0.5890368223190308, "learning_rate": 1.191227495908347e-05, "loss": 2.1016, "step": 9100 }, { "epoch": 0.4818015187221786, "grad_norm": 0.6341009736061096, "learning_rate": 1.2043207855973815e-05, "loss": 2.0804, "step": 9200 }, { "epoch": 0.4870384917517675, "grad_norm": 0.6020395755767822, "learning_rate": 1.2174140752864158e-05, "loss": 2.0686, "step": 9300 }, { "epoch": 0.49227546478135636, "grad_norm": 0.6680401563644409, "learning_rate": 1.2305073649754503e-05, "loss": 2.0854, "step": 9400 }, { "epoch": 0.4975124378109453, "grad_norm": 0.7290039658546448, "learning_rate": 1.2436006546644846e-05, "loss": 2.0779, "step": 9500 }, { "epoch": 0.5027494108405341, "grad_norm": 0.6373685598373413, "learning_rate": 1.256693944353519e-05, "loss": 2.1097, "step": 9600 }, { "epoch": 0.5079863838701231, "grad_norm": 0.5846343040466309, "learning_rate": 1.2697872340425532e-05, "loss": 2.0751, "step": 9700 }, { "epoch": 0.513223356899712, "grad_norm": 0.5871058702468872, "learning_rate": 1.2828805237315876e-05, "loss": 2.0771, "step": 9800 }, { "epoch": 0.5184603299293009, "grad_norm": 0.6121764779090881, "learning_rate": 1.295973813420622e-05, "loss": 2.0813, "step": 9900 }, { "epoch": 0.5236973029588897, "grad_norm": 0.5855483412742615, "learning_rate": 1.3090671031096564e-05, "loss": 2.0796, "step": 10000 }, { "epoch": 0.5289342759884786, "grad_norm": 0.6471145153045654, "learning_rate": 1.3221603927986909e-05, "loss": 2.0898, "step": 10100 }, { "epoch": 0.5341712490180676, "grad_norm": 0.6933115124702454, "learning_rate": 1.3352536824877252e-05, "loss": 2.0949, "step": 10200 }, { "epoch": 0.5394082220476565, "grad_norm": 0.6297255158424377, "learning_rate": 1.3483469721767595e-05, "loss": 2.0888, "step": 10300 }, { "epoch": 0.5446451950772454, "grad_norm": 0.6992611885070801, "learning_rate": 1.3614402618657938e-05, "loss": 2.0908, "step": 10400 }, { "epoch": 0.5498821681068342, "grad_norm": 0.6574690341949463, "learning_rate": 1.3745335515548283e-05, "loss": 2.0826, "step": 10500 }, { "epoch": 0.5551191411364231, "grad_norm": 0.5975152850151062, "learning_rate": 1.3876268412438626e-05, "loss": 2.0747, "step": 10600 }, { "epoch": 0.560356114166012, "grad_norm": 0.6534228920936584, "learning_rate": 1.400720130932897e-05, "loss": 2.0882, "step": 10700 }, { "epoch": 0.565593087195601, "grad_norm": 0.6680553555488586, "learning_rate": 1.4138134206219315e-05, "loss": 2.0788, "step": 10800 }, { "epoch": 0.5708300602251899, "grad_norm": 0.6993077993392944, "learning_rate": 1.4269067103109658e-05, "loss": 2.0505, "step": 10900 }, { "epoch": 0.5760670332547787, "grad_norm": 0.6117376089096069, "learning_rate": 1.4400000000000001e-05, "loss": 2.0805, "step": 11000 }, { "epoch": 0.5813040062843676, "grad_norm": 0.670172929763794, "learning_rate": 1.4530932896890344e-05, "loss": 2.0809, "step": 11100 }, { "epoch": 0.5865409793139565, "grad_norm": 0.61323481798172, "learning_rate": 1.466186579378069e-05, "loss": 2.061, "step": 11200 }, { "epoch": 0.5917779523435455, "grad_norm": 0.6071058511734009, "learning_rate": 1.4792798690671032e-05, "loss": 2.0544, "step": 11300 }, { "epoch": 0.5970149253731343, "grad_norm": 0.6362223029136658, "learning_rate": 1.4923731587561376e-05, "loss": 2.0656, "step": 11400 }, { "epoch": 0.6022518984027232, "grad_norm": 0.6346144080162048, "learning_rate": 1.505466448445172e-05, "loss": 2.0611, "step": 11500 }, { "epoch": 0.6074888714323121, "grad_norm": 0.6532538533210754, "learning_rate": 1.5185597381342064e-05, "loss": 2.0686, "step": 11600 }, { "epoch": 0.612725844461901, "grad_norm": 0.6856857538223267, "learning_rate": 1.5316530278232407e-05, "loss": 2.0616, "step": 11700 }, { "epoch": 0.61796281749149, "grad_norm": 0.9743651747703552, "learning_rate": 1.544746317512275e-05, "loss": 2.0641, "step": 11800 }, { "epoch": 0.6231997905210788, "grad_norm": 0.628181517124176, "learning_rate": 1.5578396072013097e-05, "loss": 2.0725, "step": 11900 }, { "epoch": 0.6284367635506677, "grad_norm": 0.6573601961135864, "learning_rate": 1.570932896890344e-05, "loss": 2.0819, "step": 12000 }, { "epoch": 0.6336737365802566, "grad_norm": 0.6741845011711121, "learning_rate": 1.5840261865793783e-05, "loss": 2.0636, "step": 12100 }, { "epoch": 0.6389107096098455, "grad_norm": 0.730778694152832, "learning_rate": 1.5971194762684126e-05, "loss": 2.0621, "step": 12200 }, { "epoch": 0.6441476826394344, "grad_norm": 0.6156385540962219, "learning_rate": 1.6100818330605564e-05, "loss": 2.0571, "step": 12300 }, { "epoch": 0.6493846556690233, "grad_norm": 0.6113892197608948, "learning_rate": 1.6231751227495908e-05, "loss": 2.0669, "step": 12400 }, { "epoch": 0.6546216286986122, "grad_norm": 0.6205545663833618, "learning_rate": 1.6362684124386254e-05, "loss": 2.0669, "step": 12500 }, { "epoch": 0.6598586017282011, "grad_norm": 0.6788818836212158, "learning_rate": 1.6493617021276598e-05, "loss": 2.0406, "step": 12600 }, { "epoch": 0.66509557475779, "grad_norm": 0.693049430847168, "learning_rate": 1.662454991816694e-05, "loss": 2.0551, "step": 12700 }, { "epoch": 0.6703325477873789, "grad_norm": 0.7428627610206604, "learning_rate": 1.6755482815057284e-05, "loss": 2.0301, "step": 12800 }, { "epoch": 0.6755695208169678, "grad_norm": 0.6874978542327881, "learning_rate": 1.6886415711947627e-05, "loss": 2.0546, "step": 12900 }, { "epoch": 0.6808064938465567, "grad_norm": 0.7278417348861694, "learning_rate": 1.701734860883797e-05, "loss": 2.0538, "step": 13000 }, { "epoch": 0.6860434668761456, "grad_norm": 0.641114354133606, "learning_rate": 1.7148281505728314e-05, "loss": 2.0585, "step": 13100 }, { "epoch": 0.6912804399057345, "grad_norm": 0.6964296698570251, "learning_rate": 1.727921440261866e-05, "loss": 2.0384, "step": 13200 }, { "epoch": 0.6965174129353234, "grad_norm": 0.6126134395599365, "learning_rate": 1.7410147299509003e-05, "loss": 2.0449, "step": 13300 }, { "epoch": 0.7017543859649122, "grad_norm": 0.6734199523925781, "learning_rate": 1.7541080196399347e-05, "loss": 2.0458, "step": 13400 }, { "epoch": 0.7069913589945012, "grad_norm": 0.6749238967895508, "learning_rate": 1.767201309328969e-05, "loss": 2.0639, "step": 13500 }, { "epoch": 0.7122283320240901, "grad_norm": 0.6168593764305115, "learning_rate": 1.7802945990180033e-05, "loss": 2.0435, "step": 13600 }, { "epoch": 0.717465305053679, "grad_norm": 0.7050462365150452, "learning_rate": 1.7933878887070376e-05, "loss": 2.049, "step": 13700 }, { "epoch": 0.7227022780832679, "grad_norm": 0.6948175430297852, "learning_rate": 1.806481178396072e-05, "loss": 2.0516, "step": 13800 }, { "epoch": 0.7279392511128567, "grad_norm": 0.6051421761512756, "learning_rate": 1.8195744680851066e-05, "loss": 2.0478, "step": 13900 }, { "epoch": 0.7331762241424457, "grad_norm": 0.7436869144439697, "learning_rate": 1.832667757774141e-05, "loss": 2.0466, "step": 14000 }, { "epoch": 0.7384131971720346, "grad_norm": 0.6047870516777039, "learning_rate": 1.8457610474631753e-05, "loss": 2.0382, "step": 14100 }, { "epoch": 0.7436501702016235, "grad_norm": 0.7828758358955383, "learning_rate": 1.8588543371522096e-05, "loss": 2.0303, "step": 14200 }, { "epoch": 0.7488871432312123, "grad_norm": 0.653523325920105, "learning_rate": 1.871947626841244e-05, "loss": 2.048, "step": 14300 }, { "epoch": 0.7541241162608012, "grad_norm": 0.6173336505889893, "learning_rate": 1.8850409165302782e-05, "loss": 2.0489, "step": 14400 }, { "epoch": 0.7593610892903901, "grad_norm": 0.7114732265472412, "learning_rate": 1.8981342062193125e-05, "loss": 2.0356, "step": 14500 }, { "epoch": 0.7645980623199791, "grad_norm": 0.6434004902839661, "learning_rate": 1.9112274959083472e-05, "loss": 2.035, "step": 14600 }, { "epoch": 0.769835035349568, "grad_norm": 0.6391409039497375, "learning_rate": 1.9243207855973815e-05, "loss": 2.0638, "step": 14700 }, { "epoch": 0.7750720083791568, "grad_norm": 0.8258867263793945, "learning_rate": 1.937414075286416e-05, "loss": 2.0248, "step": 14800 }, { "epoch": 0.7803089814087457, "grad_norm": 0.6063815951347351, "learning_rate": 1.95050736497545e-05, "loss": 2.0486, "step": 14900 }, { "epoch": 0.7855459544383346, "grad_norm": 0.6866258978843689, "learning_rate": 1.9636006546644845e-05, "loss": 2.0292, "step": 15000 }, { "epoch": 0.7907829274679236, "grad_norm": 0.5765138268470764, "learning_rate": 1.9766939443535188e-05, "loss": 2.0186, "step": 15100 }, { "epoch": 0.7960199004975125, "grad_norm": 0.6583371162414551, "learning_rate": 1.989787234042553e-05, "loss": 2.0391, "step": 15200 }, { "epoch": 0.8012568735271013, "grad_norm": 0.7544857263565063, "learning_rate": 1.9998363271901744e-05, "loss": 2.0349, "step": 15300 }, { "epoch": 0.8064938465566902, "grad_norm": 0.6168326735496521, "learning_rate": 1.9949708067498546e-05, "loss": 2.0375, "step": 15400 }, { "epoch": 0.8117308195862791, "grad_norm": 0.7661889791488647, "learning_rate": 1.9833795697023395e-05, "loss": 2.0328, "step": 15500 }, { "epoch": 0.816967792615868, "grad_norm": 0.6521978974342346, "learning_rate": 1.9651409694776794e-05, "loss": 2.0574, "step": 15600 }, { "epoch": 0.822204765645457, "grad_norm": 0.6655182838439941, "learning_rate": 1.9403782937699357e-05, "loss": 2.0313, "step": 15700 }, { "epoch": 0.8274417386750458, "grad_norm": 0.6480154991149902, "learning_rate": 1.9092589311478146e-05, "loss": 2.0384, "step": 15800 }, { "epoch": 0.8326787117046347, "grad_norm": 0.6570712327957153, "learning_rate": 1.8719932395560647e-05, "loss": 2.0313, "step": 15900 }, { "epoch": 0.8379156847342236, "grad_norm": 0.6235129237174988, "learning_rate": 1.8288331243562475e-05, "loss": 2.0322, "step": 16000 }, { "epoch": 0.8431526577638125, "grad_norm": 0.6542329788208008, "learning_rate": 1.7800703355189137e-05, "loss": 2.0384, "step": 16100 }, { "epoch": 0.8483896307934015, "grad_norm": 0.6734735369682312, "learning_rate": 1.726034495477677e-05, "loss": 2.0381, "step": 16200 }, { "epoch": 0.8536266038229903, "grad_norm": 0.6568425297737122, "learning_rate": 1.66709087097633e-05, "loss": 2.0372, "step": 16300 }, { "epoch": 0.8588635768525792, "grad_norm": 0.6389915943145752, "learning_rate": 1.603637903970664e-05, "loss": 2.0302, "step": 16400 }, { "epoch": 0.8641005498821681, "grad_norm": 0.5985362529754639, "learning_rate": 1.5361045182753986e-05, "loss": 2.025, "step": 16500 }, { "epoch": 0.869337522911757, "grad_norm": 0.6669567823410034, "learning_rate": 1.4649472201625057e-05, "loss": 2.0329, "step": 16600 }, { "epoch": 0.874574495941346, "grad_norm": 0.5840954780578613, "learning_rate": 1.3914039388098432e-05, "loss": 2.0207, "step": 16700 }, { "epoch": 0.8798114689709348, "grad_norm": 0.6801176071166992, "learning_rate": 1.3144869286586354e-05, "loss": 2.0087, "step": 16800 }, { "epoch": 0.8850484420005237, "grad_norm": 0.6012386679649353, "learning_rate": 1.2354440772822623e-05, "loss": 2.0202, "step": 16900 }, { "epoch": 0.8902854150301126, "grad_norm": 0.655200719833374, "learning_rate": 1.1548096916318175e-05, "loss": 2.0297, "step": 17000 }, { "epoch": 0.8955223880597015, "grad_norm": 0.6136151552200317, "learning_rate": 1.0739490166119155e-05, "loss": 2.0128, "step": 17100 }, { "epoch": 0.9007593610892904, "grad_norm": 0.7110956311225891, "learning_rate": 9.917760281675867e-06, "loss": 2.0239, "step": 17200 }, { "epoch": 0.9059963341188793, "grad_norm": 0.6501589417457581, "learning_rate": 9.096586314085162e-06, "loss": 2.0274, "step": 17300 }, { "epoch": 0.9112333071484682, "grad_norm": 0.6471460461616516, "learning_rate": 8.281519163286772e-06, "loss": 2.0398, "step": 17400 }, { "epoch": 0.9164702801780571, "grad_norm": 0.7580538392066956, "learning_rate": 7.478068448894577e-06, "loss": 2.0231, "step": 17500 }, { "epoch": 0.921707253207646, "grad_norm": 0.657486617565155, "learning_rate": 6.6916652667519855e-06, "loss": 2.0192, "step": 17600 }, { "epoch": 0.9269442262372348, "grad_norm": 0.7056224346160889, "learning_rate": 5.927625476285426e-06, "loss": 2.0233, "step": 17700 }, { "epoch": 0.9321811992668237, "grad_norm": 0.6053991913795471, "learning_rate": 5.191113766822905e-06, "loss": 2.0165, "step": 17800 }, { "epoch": 0.9374181722964127, "grad_norm": 0.6543104648590088, "learning_rate": 4.487108745778958e-06, "loss": 2.0096, "step": 17900 }, { "epoch": 0.9426551453260016, "grad_norm": 0.6767512559890747, "learning_rate": 3.820369284699823e-06, "loss": 2.0236, "step": 18000 }, { "epoch": 0.9478921183555905, "grad_norm": 0.5917364358901978, "learning_rate": 3.195402350659945e-06, "loss": 2.0315, "step": 18100 }, { "epoch": 0.9531290913851793, "grad_norm": 0.5886921286582947, "learning_rate": 2.616432540460255e-06, "loss": 2.0335, "step": 18200 }, { "epoch": 0.9583660644147682, "grad_norm": 0.6519348621368408, "learning_rate": 2.0873735235683535e-06, "loss": 2.0138, "step": 18300 }, { "epoch": 0.9636030374443572, "grad_norm": 0.6619024872779846, "learning_rate": 1.6118015868380387e-06, "loss": 2.0223, "step": 18400 }, { "epoch": 0.9688400104739461, "grad_norm": 0.6300016045570374, "learning_rate": 1.1929314598383423e-06, "loss": 2.0184, "step": 18500 }, { "epoch": 0.974076983503535, "grad_norm": 0.6754645109176636, "learning_rate": 8.335945842058524e-07, "loss": 2.0215, "step": 18600 }, { "epoch": 0.9793139565331238, "grad_norm": 0.7369129657745361, "learning_rate": 5.362199739132656e-07, "loss": 2.0138, "step": 18700 }, { "epoch": 0.9845509295627127, "grad_norm": 0.5822499990463257, "learning_rate": 3.028177958332512e-07, "loss": 2.0249, "step": 18800 }, { "epoch": 0.9897879025923016, "grad_norm": 0.6991373896598816, "learning_rate": 1.349657815883032e-07, "loss": 2.0329, "step": 18900 }, { "epoch": 0.9950248756218906, "grad_norm": 0.6656786203384399, "learning_rate": 3.379856253855951e-08, "loss": 2.0112, "step": 19000 }, { "epoch": 1.0, "step": 19095, "total_flos": 2.7919996141761987e+18, "train_loss": 2.1007044342432573, "train_runtime": 7350.2103, "train_samples_per_second": 41.566, "train_steps_per_second": 2.598 } ], "logging_steps": 100, "max_steps": 19095, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.7919996141761987e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }