gpt2-xl-lora-multi-512-k5-14-im-1 / trainer_state.json
MHGanainy/gpt2-xl-lora-multi-512-k5-14-im-1
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 19095,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005236973029588898,
"grad_norm": 0.07354702800512314,
"learning_rate": 1.3093289689034372e-07,
"loss": 2.2441,
"step": 100
},
{
"epoch": 0.010473946059177796,
"grad_norm": 0.07018959522247314,
"learning_rate": 2.6186579378068744e-07,
"loss": 2.2612,
"step": 200
},
{
"epoch": 0.015710919088766692,
"grad_norm": 0.07288151979446411,
"learning_rate": 3.9279869067103113e-07,
"loss": 2.241,
"step": 300
},
{
"epoch": 0.020947892118355592,
"grad_norm": 0.0727863535284996,
"learning_rate": 5.237315875613749e-07,
"loss": 2.2466,
"step": 400
},
{
"epoch": 0.026184865147944488,
"grad_norm": 0.10916672646999359,
"learning_rate": 6.546644844517186e-07,
"loss": 2.2483,
"step": 500
},
{
"epoch": 0.031421838177533384,
"grad_norm": 0.08643593639135361,
"learning_rate": 7.855973813420623e-07,
"loss": 2.2339,
"step": 600
},
{
"epoch": 0.036658811207122284,
"grad_norm": 0.09361663460731506,
"learning_rate": 9.165302782324059e-07,
"loss": 2.2352,
"step": 700
},
{
"epoch": 0.041895784236711184,
"grad_norm": 0.11914397031068802,
"learning_rate": 1.0474631751227498e-06,
"loss": 2.2503,
"step": 800
},
{
"epoch": 0.04713275726630008,
"grad_norm": 0.11531686782836914,
"learning_rate": 1.1783960720130934e-06,
"loss": 2.2327,
"step": 900
},
{
"epoch": 0.052369730295888976,
"grad_norm": 0.12413031607866287,
"learning_rate": 1.3093289689034372e-06,
"loss": 2.2412,
"step": 1000
},
{
"epoch": 0.057606703325477876,
"grad_norm": 0.1295381635427475,
"learning_rate": 1.4402618657937809e-06,
"loss": 2.2431,
"step": 1100
},
{
"epoch": 0.06284367635506677,
"grad_norm": 0.1375189870595932,
"learning_rate": 1.5711947626841245e-06,
"loss": 2.2273,
"step": 1200
},
{
"epoch": 0.06808064938465568,
"grad_norm": 0.16082307696342468,
"learning_rate": 1.7021276595744682e-06,
"loss": 2.2117,
"step": 1300
},
{
"epoch": 0.07331762241424457,
"grad_norm": 0.16158346831798553,
"learning_rate": 1.8330605564648118e-06,
"loss": 2.2206,
"step": 1400
},
{
"epoch": 0.07855459544383346,
"grad_norm": 0.174397274851799,
"learning_rate": 1.9639934533551554e-06,
"loss": 2.2255,
"step": 1500
},
{
"epoch": 0.08379156847342237,
"grad_norm": 0.186273992061615,
"learning_rate": 2.0949263502454995e-06,
"loss": 2.1958,
"step": 1600
},
{
"epoch": 0.08902854150301126,
"grad_norm": 0.1794576197862625,
"learning_rate": 2.225859247135843e-06,
"loss": 2.2321,
"step": 1700
},
{
"epoch": 0.09426551453260015,
"grad_norm": 0.19168038666248322,
"learning_rate": 2.3567921440261868e-06,
"loss": 2.2192,
"step": 1800
},
{
"epoch": 0.09950248756218906,
"grad_norm": 0.1959036886692047,
"learning_rate": 2.486415711947627e-06,
"loss": 2.1996,
"step": 1900
},
{
"epoch": 0.10473946059177795,
"grad_norm": 0.22251689434051514,
"learning_rate": 2.6173486088379706e-06,
"loss": 2.2137,
"step": 2000
},
{
"epoch": 0.10997643362136685,
"grad_norm": 0.20624759793281555,
"learning_rate": 2.7482815057283147e-06,
"loss": 2.2049,
"step": 2100
},
{
"epoch": 0.11521340665095575,
"grad_norm": 0.25143158435821533,
"learning_rate": 2.879214402618658e-06,
"loss": 2.2046,
"step": 2200
},
{
"epoch": 0.12045037968054464,
"grad_norm": 0.24057720601558685,
"learning_rate": 3.010147299509002e-06,
"loss": 2.1802,
"step": 2300
},
{
"epoch": 0.12568735271013354,
"grad_norm": 0.23995009064674377,
"learning_rate": 3.141080196399345e-06,
"loss": 2.1901,
"step": 2400
},
{
"epoch": 0.13092432573972243,
"grad_norm": 0.2643965184688568,
"learning_rate": 3.2720130932896892e-06,
"loss": 2.1948,
"step": 2500
},
{
"epoch": 0.13616129876931135,
"grad_norm": 0.2582302689552307,
"learning_rate": 3.4029459901800333e-06,
"loss": 2.1953,
"step": 2600
},
{
"epoch": 0.14139827179890024,
"grad_norm": 0.2721526622772217,
"learning_rate": 3.5338788870703765e-06,
"loss": 2.2013,
"step": 2700
},
{
"epoch": 0.14663524482848914,
"grad_norm": 0.26533135771751404,
"learning_rate": 3.6648117839607206e-06,
"loss": 2.1874,
"step": 2800
},
{
"epoch": 0.15187221785807803,
"grad_norm": 0.2823657989501953,
"learning_rate": 3.7957446808510638e-06,
"loss": 2.1956,
"step": 2900
},
{
"epoch": 0.15710919088766692,
"grad_norm": 0.36414533853530884,
"learning_rate": 3.926677577741408e-06,
"loss": 2.1853,
"step": 3000
},
{
"epoch": 0.16234616391725581,
"grad_norm": 0.2987813353538513,
"learning_rate": 4.0576104746317515e-06,
"loss": 2.1845,
"step": 3100
},
{
"epoch": 0.16758313694684474,
"grad_norm": 0.2912348806858063,
"learning_rate": 4.1885433715220955e-06,
"loss": 2.1754,
"step": 3200
},
{
"epoch": 0.17282010997643363,
"grad_norm": 0.3097289204597473,
"learning_rate": 4.319476268412439e-06,
"loss": 2.1552,
"step": 3300
},
{
"epoch": 0.17805708300602252,
"grad_norm": 0.31930428743362427,
"learning_rate": 4.450409165302783e-06,
"loss": 2.1862,
"step": 3400
},
{
"epoch": 0.1832940560356114,
"grad_norm": 0.33817386627197266,
"learning_rate": 4.581342062193127e-06,
"loss": 2.1808,
"step": 3500
},
{
"epoch": 0.1885310290652003,
"grad_norm": 0.3205846846103668,
"learning_rate": 4.71227495908347e-06,
"loss": 2.1898,
"step": 3600
},
{
"epoch": 0.1937680020947892,
"grad_norm": 0.3309566080570221,
"learning_rate": 4.843207855973814e-06,
"loss": 2.174,
"step": 3700
},
{
"epoch": 0.19900497512437812,
"grad_norm": 0.34491220116615295,
"learning_rate": 4.974140752864157e-06,
"loss": 2.1764,
"step": 3800
},
{
"epoch": 0.204241948153967,
"grad_norm": 0.35818833112716675,
"learning_rate": 5.1050736497545014e-06,
"loss": 2.1502,
"step": 3900
},
{
"epoch": 0.2094789211835559,
"grad_norm": 0.3484792709350586,
"learning_rate": 5.2360065466448455e-06,
"loss": 2.1749,
"step": 4000
},
{
"epoch": 0.2147158942131448,
"grad_norm": 0.3905714452266693,
"learning_rate": 5.366939443535189e-06,
"loss": 2.1468,
"step": 4100
},
{
"epoch": 0.2199528672427337,
"grad_norm": 0.3773205280303955,
"learning_rate": 5.497872340425532e-06,
"loss": 2.1716,
"step": 4200
},
{
"epoch": 0.2251898402723226,
"grad_norm": 0.38546085357666016,
"learning_rate": 5.628805237315876e-06,
"loss": 2.1519,
"step": 4300
},
{
"epoch": 0.2304268133019115,
"grad_norm": 0.39430660009384155,
"learning_rate": 5.75973813420622e-06,
"loss": 2.1472,
"step": 4400
},
{
"epoch": 0.2356637863315004,
"grad_norm": 0.38882067799568176,
"learning_rate": 5.890671031096563e-06,
"loss": 2.1417,
"step": 4500
},
{
"epoch": 0.2409007593610893,
"grad_norm": 0.40174001455307007,
"learning_rate": 6.021603927986907e-06,
"loss": 2.1528,
"step": 4600
},
{
"epoch": 0.24613773239067818,
"grad_norm": 0.4062660038471222,
"learning_rate": 6.152536824877251e-06,
"loss": 2.1352,
"step": 4700
},
{
"epoch": 0.2513747054202671,
"grad_norm": 0.4776448905467987,
"learning_rate": 6.283469721767595e-06,
"loss": 2.1528,
"step": 4800
},
{
"epoch": 0.256611678449856,
"grad_norm": 0.3891739845275879,
"learning_rate": 6.414402618657938e-06,
"loss": 2.1508,
"step": 4900
},
{
"epoch": 0.26184865147944486,
"grad_norm": 0.42986443638801575,
"learning_rate": 6.545335515548282e-06,
"loss": 2.149,
"step": 5000
},
{
"epoch": 0.2670856245090338,
"grad_norm": 0.39317014813423157,
"learning_rate": 6.676268412438626e-06,
"loss": 2.1472,
"step": 5100
},
{
"epoch": 0.2723225975386227,
"grad_norm": 0.45696353912353516,
"learning_rate": 6.807201309328969e-06,
"loss": 2.1401,
"step": 5200
},
{
"epoch": 0.27755957056821157,
"grad_norm": 0.4466469883918762,
"learning_rate": 6.938134206219313e-06,
"loss": 2.1492,
"step": 5300
},
{
"epoch": 0.2827965435978005,
"grad_norm": 0.4214916229248047,
"learning_rate": 7.069067103109657e-06,
"loss": 2.1438,
"step": 5400
},
{
"epoch": 0.28803351662738935,
"grad_norm": 0.44096261262893677,
"learning_rate": 7.198690671031097e-06,
"loss": 2.1445,
"step": 5500
},
{
"epoch": 0.2932704896569783,
"grad_norm": 0.4745313823223114,
"learning_rate": 7.329623567921441e-06,
"loss": 2.1303,
"step": 5600
},
{
"epoch": 0.29850746268656714,
"grad_norm": 0.5099794864654541,
"learning_rate": 7.460556464811784e-06,
"loss": 2.14,
"step": 5700
},
{
"epoch": 0.30374443571615606,
"grad_norm": 0.4933392405509949,
"learning_rate": 7.5914893617021276e-06,
"loss": 2.1181,
"step": 5800
},
{
"epoch": 0.308981408745745,
"grad_norm": 0.4734782576560974,
"learning_rate": 7.722422258592472e-06,
"loss": 2.1259,
"step": 5900
},
{
"epoch": 0.31421838177533384,
"grad_norm": 0.4762997627258301,
"learning_rate": 7.853355155482817e-06,
"loss": 2.1185,
"step": 6000
},
{
"epoch": 0.31945535480492276,
"grad_norm": 0.5242263674736023,
"learning_rate": 7.98428805237316e-06,
"loss": 2.1406,
"step": 6100
},
{
"epoch": 0.32469232783451163,
"grad_norm": 0.4882369637489319,
"learning_rate": 8.115220949263503e-06,
"loss": 2.1221,
"step": 6200
},
{
"epoch": 0.32992930086410055,
"grad_norm": 0.48831576108932495,
"learning_rate": 8.246153846153848e-06,
"loss": 2.1203,
"step": 6300
},
{
"epoch": 0.33516627389368947,
"grad_norm": 0.4771474301815033,
"learning_rate": 8.377086743044191e-06,
"loss": 2.1247,
"step": 6400
},
{
"epoch": 0.34040324692327834,
"grad_norm": 0.48237186670303345,
"learning_rate": 8.508019639934534e-06,
"loss": 2.1083,
"step": 6500
},
{
"epoch": 0.34564021995286726,
"grad_norm": 0.5286875367164612,
"learning_rate": 8.638952536824878e-06,
"loss": 2.1258,
"step": 6600
},
{
"epoch": 0.3508771929824561,
"grad_norm": 0.5419202446937561,
"learning_rate": 8.769885433715222e-06,
"loss": 2.1248,
"step": 6700
},
{
"epoch": 0.35611416601204504,
"grad_norm": 0.5243601202964783,
"learning_rate": 8.900818330605566e-06,
"loss": 2.1252,
"step": 6800
},
{
"epoch": 0.36135113904163396,
"grad_norm": 0.5450451970100403,
"learning_rate": 9.031751227495909e-06,
"loss": 2.1117,
"step": 6900
},
{
"epoch": 0.3665881120712228,
"grad_norm": 0.5390617251396179,
"learning_rate": 9.162684124386254e-06,
"loss": 2.1312,
"step": 7000
},
{
"epoch": 0.37182508510081175,
"grad_norm": 0.5742843747138977,
"learning_rate": 9.293617021276597e-06,
"loss": 2.1207,
"step": 7100
},
{
"epoch": 0.3770620581304006,
"grad_norm": 0.5794598460197449,
"learning_rate": 9.42454991816694e-06,
"loss": 2.1088,
"step": 7200
},
{
"epoch": 0.38229903115998953,
"grad_norm": 0.5871763229370117,
"learning_rate": 9.555482815057283e-06,
"loss": 2.1138,
"step": 7300
},
{
"epoch": 0.3875360041895784,
"grad_norm": 0.574471116065979,
"learning_rate": 9.686415711947628e-06,
"loss": 2.1234,
"step": 7400
},
{
"epoch": 0.3927729772191673,
"grad_norm": 0.5694011449813843,
"learning_rate": 9.817348608837972e-06,
"loss": 2.1028,
"step": 7500
},
{
"epoch": 0.39800995024875624,
"grad_norm": 0.5721834301948547,
"learning_rate": 9.948281505728315e-06,
"loss": 2.0942,
"step": 7600
},
{
"epoch": 0.4032469232783451,
"grad_norm": 0.5568354725837708,
"learning_rate": 1.0079214402618658e-05,
"loss": 2.0937,
"step": 7700
},
{
"epoch": 0.408483896307934,
"grad_norm": 0.575330913066864,
"learning_rate": 1.0210147299509003e-05,
"loss": 2.0989,
"step": 7800
},
{
"epoch": 0.4137208693375229,
"grad_norm": 0.5605918169021606,
"learning_rate": 1.0341080196399346e-05,
"loss": 2.0992,
"step": 7900
},
{
"epoch": 0.4189578423671118,
"grad_norm": 0.5807542204856873,
"learning_rate": 1.0472013093289691e-05,
"loss": 2.101,
"step": 8000
},
{
"epoch": 0.42419481539670073,
"grad_norm": 0.5749071836471558,
"learning_rate": 1.0602945990180034e-05,
"loss": 2.1092,
"step": 8100
},
{
"epoch": 0.4294317884262896,
"grad_norm": 0.6206376552581787,
"learning_rate": 1.0733878887070377e-05,
"loss": 2.1026,
"step": 8200
},
{
"epoch": 0.4346687614558785,
"grad_norm": 0.586361825466156,
"learning_rate": 1.086481178396072e-05,
"loss": 2.0985,
"step": 8300
},
{
"epoch": 0.4399057344854674,
"grad_norm": 0.6338817477226257,
"learning_rate": 1.0995744680851064e-05,
"loss": 2.0887,
"step": 8400
},
{
"epoch": 0.4451427075150563,
"grad_norm": 0.6082013845443726,
"learning_rate": 1.1126677577741409e-05,
"loss": 2.1088,
"step": 8500
},
{
"epoch": 0.4503796805446452,
"grad_norm": 0.6418773531913757,
"learning_rate": 1.1257610474631752e-05,
"loss": 2.0641,
"step": 8600
},
{
"epoch": 0.4556166535742341,
"grad_norm": 0.6760055422782898,
"learning_rate": 1.1388543371522097e-05,
"loss": 2.079,
"step": 8700
},
{
"epoch": 0.460853626603823,
"grad_norm": 0.611735999584198,
"learning_rate": 1.151947626841244e-05,
"loss": 2.0853,
"step": 8800
},
{
"epoch": 0.4660905996334119,
"grad_norm": 0.6323230266571045,
"learning_rate": 1.1650409165302783e-05,
"loss": 2.0919,
"step": 8900
},
{
"epoch": 0.4713275726630008,
"grad_norm": 0.7350252270698547,
"learning_rate": 1.1781342062193127e-05,
"loss": 2.0942,
"step": 9000
},
{
"epoch": 0.47656454569258966,
"grad_norm": 0.5890368223190308,
"learning_rate": 1.191227495908347e-05,
"loss": 2.1016,
"step": 9100
},
{
"epoch": 0.4818015187221786,
"grad_norm": 0.6341009736061096,
"learning_rate": 1.2043207855973815e-05,
"loss": 2.0804,
"step": 9200
},
{
"epoch": 0.4870384917517675,
"grad_norm": 0.6020395755767822,
"learning_rate": 1.2174140752864158e-05,
"loss": 2.0686,
"step": 9300
},
{
"epoch": 0.49227546478135636,
"grad_norm": 0.6680401563644409,
"learning_rate": 1.2305073649754503e-05,
"loss": 2.0854,
"step": 9400
},
{
"epoch": 0.4975124378109453,
"grad_norm": 0.7290039658546448,
"learning_rate": 1.2436006546644846e-05,
"loss": 2.0779,
"step": 9500
},
{
"epoch": 0.5027494108405341,
"grad_norm": 0.6373685598373413,
"learning_rate": 1.256693944353519e-05,
"loss": 2.1097,
"step": 9600
},
{
"epoch": 0.5079863838701231,
"grad_norm": 0.5846343040466309,
"learning_rate": 1.2697872340425532e-05,
"loss": 2.0751,
"step": 9700
},
{
"epoch": 0.513223356899712,
"grad_norm": 0.5871058702468872,
"learning_rate": 1.2828805237315876e-05,
"loss": 2.0771,
"step": 9800
},
{
"epoch": 0.5184603299293009,
"grad_norm": 0.6121764779090881,
"learning_rate": 1.295973813420622e-05,
"loss": 2.0813,
"step": 9900
},
{
"epoch": 0.5236973029588897,
"grad_norm": 0.5855483412742615,
"learning_rate": 1.3090671031096564e-05,
"loss": 2.0796,
"step": 10000
},
{
"epoch": 0.5289342759884786,
"grad_norm": 0.6471145153045654,
"learning_rate": 1.3221603927986909e-05,
"loss": 2.0898,
"step": 10100
},
{
"epoch": 0.5341712490180676,
"grad_norm": 0.6933115124702454,
"learning_rate": 1.3352536824877252e-05,
"loss": 2.0949,
"step": 10200
},
{
"epoch": 0.5394082220476565,
"grad_norm": 0.6297255158424377,
"learning_rate": 1.3483469721767595e-05,
"loss": 2.0888,
"step": 10300
},
{
"epoch": 0.5446451950772454,
"grad_norm": 0.6992611885070801,
"learning_rate": 1.3614402618657938e-05,
"loss": 2.0908,
"step": 10400
},
{
"epoch": 0.5498821681068342,
"grad_norm": 0.6574690341949463,
"learning_rate": 1.3745335515548283e-05,
"loss": 2.0826,
"step": 10500
},
{
"epoch": 0.5551191411364231,
"grad_norm": 0.5975152850151062,
"learning_rate": 1.3876268412438626e-05,
"loss": 2.0747,
"step": 10600
},
{
"epoch": 0.560356114166012,
"grad_norm": 0.6534228920936584,
"learning_rate": 1.400720130932897e-05,
"loss": 2.0882,
"step": 10700
},
{
"epoch": 0.565593087195601,
"grad_norm": 0.6680553555488586,
"learning_rate": 1.4138134206219315e-05,
"loss": 2.0788,
"step": 10800
},
{
"epoch": 0.5708300602251899,
"grad_norm": 0.6993077993392944,
"learning_rate": 1.4269067103109658e-05,
"loss": 2.0505,
"step": 10900
},
{
"epoch": 0.5760670332547787,
"grad_norm": 0.6117376089096069,
"learning_rate": 1.4400000000000001e-05,
"loss": 2.0805,
"step": 11000
},
{
"epoch": 0.5813040062843676,
"grad_norm": 0.670172929763794,
"learning_rate": 1.4530932896890344e-05,
"loss": 2.0809,
"step": 11100
},
{
"epoch": 0.5865409793139565,
"grad_norm": 0.61323481798172,
"learning_rate": 1.466186579378069e-05,
"loss": 2.061,
"step": 11200
},
{
"epoch": 0.5917779523435455,
"grad_norm": 0.6071058511734009,
"learning_rate": 1.4792798690671032e-05,
"loss": 2.0544,
"step": 11300
},
{
"epoch": 0.5970149253731343,
"grad_norm": 0.6362223029136658,
"learning_rate": 1.4923731587561376e-05,
"loss": 2.0656,
"step": 11400
},
{
"epoch": 0.6022518984027232,
"grad_norm": 0.6346144080162048,
"learning_rate": 1.505466448445172e-05,
"loss": 2.0611,
"step": 11500
},
{
"epoch": 0.6074888714323121,
"grad_norm": 0.6532538533210754,
"learning_rate": 1.5185597381342064e-05,
"loss": 2.0686,
"step": 11600
},
{
"epoch": 0.612725844461901,
"grad_norm": 0.6856857538223267,
"learning_rate": 1.5316530278232407e-05,
"loss": 2.0616,
"step": 11700
},
{
"epoch": 0.61796281749149,
"grad_norm": 0.9743651747703552,
"learning_rate": 1.544746317512275e-05,
"loss": 2.0641,
"step": 11800
},
{
"epoch": 0.6231997905210788,
"grad_norm": 0.628181517124176,
"learning_rate": 1.5578396072013097e-05,
"loss": 2.0725,
"step": 11900
},
{
"epoch": 0.6284367635506677,
"grad_norm": 0.6573601961135864,
"learning_rate": 1.570932896890344e-05,
"loss": 2.0819,
"step": 12000
},
{
"epoch": 0.6336737365802566,
"grad_norm": 0.6741845011711121,
"learning_rate": 1.5840261865793783e-05,
"loss": 2.0636,
"step": 12100
},
{
"epoch": 0.6389107096098455,
"grad_norm": 0.730778694152832,
"learning_rate": 1.5971194762684126e-05,
"loss": 2.0621,
"step": 12200
},
{
"epoch": 0.6441476826394344,
"grad_norm": 0.6156385540962219,
"learning_rate": 1.6100818330605564e-05,
"loss": 2.0571,
"step": 12300
},
{
"epoch": 0.6493846556690233,
"grad_norm": 0.6113892197608948,
"learning_rate": 1.6231751227495908e-05,
"loss": 2.0669,
"step": 12400
},
{
"epoch": 0.6546216286986122,
"grad_norm": 0.6205545663833618,
"learning_rate": 1.6362684124386254e-05,
"loss": 2.0669,
"step": 12500
},
{
"epoch": 0.6598586017282011,
"grad_norm": 0.6788818836212158,
"learning_rate": 1.6493617021276598e-05,
"loss": 2.0406,
"step": 12600
},
{
"epoch": 0.66509557475779,
"grad_norm": 0.693049430847168,
"learning_rate": 1.662454991816694e-05,
"loss": 2.0551,
"step": 12700
},
{
"epoch": 0.6703325477873789,
"grad_norm": 0.7428627610206604,
"learning_rate": 1.6755482815057284e-05,
"loss": 2.0301,
"step": 12800
},
{
"epoch": 0.6755695208169678,
"grad_norm": 0.6874978542327881,
"learning_rate": 1.6886415711947627e-05,
"loss": 2.0546,
"step": 12900
},
{
"epoch": 0.6808064938465567,
"grad_norm": 0.7278417348861694,
"learning_rate": 1.701734860883797e-05,
"loss": 2.0538,
"step": 13000
},
{
"epoch": 0.6860434668761456,
"grad_norm": 0.641114354133606,
"learning_rate": 1.7148281505728314e-05,
"loss": 2.0585,
"step": 13100
},
{
"epoch": 0.6912804399057345,
"grad_norm": 0.6964296698570251,
"learning_rate": 1.727921440261866e-05,
"loss": 2.0384,
"step": 13200
},
{
"epoch": 0.6965174129353234,
"grad_norm": 0.6126134395599365,
"learning_rate": 1.7410147299509003e-05,
"loss": 2.0449,
"step": 13300
},
{
"epoch": 0.7017543859649122,
"grad_norm": 0.6734199523925781,
"learning_rate": 1.7541080196399347e-05,
"loss": 2.0458,
"step": 13400
},
{
"epoch": 0.7069913589945012,
"grad_norm": 0.6749238967895508,
"learning_rate": 1.767201309328969e-05,
"loss": 2.0639,
"step": 13500
},
{
"epoch": 0.7122283320240901,
"grad_norm": 0.6168593764305115,
"learning_rate": 1.7802945990180033e-05,
"loss": 2.0435,
"step": 13600
},
{
"epoch": 0.717465305053679,
"grad_norm": 0.7050462365150452,
"learning_rate": 1.7933878887070376e-05,
"loss": 2.049,
"step": 13700
},
{
"epoch": 0.7227022780832679,
"grad_norm": 0.6948175430297852,
"learning_rate": 1.806481178396072e-05,
"loss": 2.0516,
"step": 13800
},
{
"epoch": 0.7279392511128567,
"grad_norm": 0.6051421761512756,
"learning_rate": 1.8195744680851066e-05,
"loss": 2.0478,
"step": 13900
},
{
"epoch": 0.7331762241424457,
"grad_norm": 0.7436869144439697,
"learning_rate": 1.832667757774141e-05,
"loss": 2.0466,
"step": 14000
},
{
"epoch": 0.7384131971720346,
"grad_norm": 0.6047870516777039,
"learning_rate": 1.8457610474631753e-05,
"loss": 2.0382,
"step": 14100
},
{
"epoch": 0.7436501702016235,
"grad_norm": 0.7828758358955383,
"learning_rate": 1.8588543371522096e-05,
"loss": 2.0303,
"step": 14200
},
{
"epoch": 0.7488871432312123,
"grad_norm": 0.653523325920105,
"learning_rate": 1.871947626841244e-05,
"loss": 2.048,
"step": 14300
},
{
"epoch": 0.7541241162608012,
"grad_norm": 0.6173336505889893,
"learning_rate": 1.8850409165302782e-05,
"loss": 2.0489,
"step": 14400
},
{
"epoch": 0.7593610892903901,
"grad_norm": 0.7114732265472412,
"learning_rate": 1.8981342062193125e-05,
"loss": 2.0356,
"step": 14500
},
{
"epoch": 0.7645980623199791,
"grad_norm": 0.6434004902839661,
"learning_rate": 1.9112274959083472e-05,
"loss": 2.035,
"step": 14600
},
{
"epoch": 0.769835035349568,
"grad_norm": 0.6391409039497375,
"learning_rate": 1.9243207855973815e-05,
"loss": 2.0638,
"step": 14700
},
{
"epoch": 0.7750720083791568,
"grad_norm": 0.8258867263793945,
"learning_rate": 1.937414075286416e-05,
"loss": 2.0248,
"step": 14800
},
{
"epoch": 0.7803089814087457,
"grad_norm": 0.6063815951347351,
"learning_rate": 1.95050736497545e-05,
"loss": 2.0486,
"step": 14900
},
{
"epoch": 0.7855459544383346,
"grad_norm": 0.6866258978843689,
"learning_rate": 1.9636006546644845e-05,
"loss": 2.0292,
"step": 15000
},
{
"epoch": 0.7907829274679236,
"grad_norm": 0.5765138268470764,
"learning_rate": 1.9766939443535188e-05,
"loss": 2.0186,
"step": 15100
},
{
"epoch": 0.7960199004975125,
"grad_norm": 0.6583371162414551,
"learning_rate": 1.989787234042553e-05,
"loss": 2.0391,
"step": 15200
},
{
"epoch": 0.8012568735271013,
"grad_norm": 0.7544857263565063,
"learning_rate": 1.9998363271901744e-05,
"loss": 2.0349,
"step": 15300
},
{
"epoch": 0.8064938465566902,
"grad_norm": 0.6168326735496521,
"learning_rate": 1.9949708067498546e-05,
"loss": 2.0375,
"step": 15400
},
{
"epoch": 0.8117308195862791,
"grad_norm": 0.7661889791488647,
"learning_rate": 1.9833795697023395e-05,
"loss": 2.0328,
"step": 15500
},
{
"epoch": 0.816967792615868,
"grad_norm": 0.6521978974342346,
"learning_rate": 1.9651409694776794e-05,
"loss": 2.0574,
"step": 15600
},
{
"epoch": 0.822204765645457,
"grad_norm": 0.6655182838439941,
"learning_rate": 1.9403782937699357e-05,
"loss": 2.0313,
"step": 15700
},
{
"epoch": 0.8274417386750458,
"grad_norm": 0.6480154991149902,
"learning_rate": 1.9092589311478146e-05,
"loss": 2.0384,
"step": 15800
},
{
"epoch": 0.8326787117046347,
"grad_norm": 0.6570712327957153,
"learning_rate": 1.8719932395560647e-05,
"loss": 2.0313,
"step": 15900
},
{
"epoch": 0.8379156847342236,
"grad_norm": 0.6235129237174988,
"learning_rate": 1.8288331243562475e-05,
"loss": 2.0322,
"step": 16000
},
{
"epoch": 0.8431526577638125,
"grad_norm": 0.6542329788208008,
"learning_rate": 1.7800703355189137e-05,
"loss": 2.0384,
"step": 16100
},
{
"epoch": 0.8483896307934015,
"grad_norm": 0.6734735369682312,
"learning_rate": 1.726034495477677e-05,
"loss": 2.0381,
"step": 16200
},
{
"epoch": 0.8536266038229903,
"grad_norm": 0.6568425297737122,
"learning_rate": 1.66709087097633e-05,
"loss": 2.0372,
"step": 16300
},
{
"epoch": 0.8588635768525792,
"grad_norm": 0.6389915943145752,
"learning_rate": 1.603637903970664e-05,
"loss": 2.0302,
"step": 16400
},
{
"epoch": 0.8641005498821681,
"grad_norm": 0.5985362529754639,
"learning_rate": 1.5361045182753986e-05,
"loss": 2.025,
"step": 16500
},
{
"epoch": 0.869337522911757,
"grad_norm": 0.6669567823410034,
"learning_rate": 1.4649472201625057e-05,
"loss": 2.0329,
"step": 16600
},
{
"epoch": 0.874574495941346,
"grad_norm": 0.5840954780578613,
"learning_rate": 1.3914039388098432e-05,
"loss": 2.0207,
"step": 16700
},
{
"epoch": 0.8798114689709348,
"grad_norm": 0.6801176071166992,
"learning_rate": 1.3144869286586354e-05,
"loss": 2.0087,
"step": 16800
},
{
"epoch": 0.8850484420005237,
"grad_norm": 0.6012386679649353,
"learning_rate": 1.2354440772822623e-05,
"loss": 2.0202,
"step": 16900
},
{
"epoch": 0.8902854150301126,
"grad_norm": 0.655200719833374,
"learning_rate": 1.1548096916318175e-05,
"loss": 2.0297,
"step": 17000
},
{
"epoch": 0.8955223880597015,
"grad_norm": 0.6136151552200317,
"learning_rate": 1.0739490166119155e-05,
"loss": 2.0128,
"step": 17100
},
{
"epoch": 0.9007593610892904,
"grad_norm": 0.7110956311225891,
"learning_rate": 9.917760281675867e-06,
"loss": 2.0239,
"step": 17200
},
{
"epoch": 0.9059963341188793,
"grad_norm": 0.6501589417457581,
"learning_rate": 9.096586314085162e-06,
"loss": 2.0274,
"step": 17300
},
{
"epoch": 0.9112333071484682,
"grad_norm": 0.6471460461616516,
"learning_rate": 8.281519163286772e-06,
"loss": 2.0398,
"step": 17400
},
{
"epoch": 0.9164702801780571,
"grad_norm": 0.7580538392066956,
"learning_rate": 7.478068448894577e-06,
"loss": 2.0231,
"step": 17500
},
{
"epoch": 0.921707253207646,
"grad_norm": 0.657486617565155,
"learning_rate": 6.6916652667519855e-06,
"loss": 2.0192,
"step": 17600
},
{
"epoch": 0.9269442262372348,
"grad_norm": 0.7056224346160889,
"learning_rate": 5.927625476285426e-06,
"loss": 2.0233,
"step": 17700
},
{
"epoch": 0.9321811992668237,
"grad_norm": 0.6053991913795471,
"learning_rate": 5.191113766822905e-06,
"loss": 2.0165,
"step": 17800
},
{
"epoch": 0.9374181722964127,
"grad_norm": 0.6543104648590088,
"learning_rate": 4.487108745778958e-06,
"loss": 2.0096,
"step": 17900
},
{
"epoch": 0.9426551453260016,
"grad_norm": 0.6767512559890747,
"learning_rate": 3.820369284699823e-06,
"loss": 2.0236,
"step": 18000
},
{
"epoch": 0.9478921183555905,
"grad_norm": 0.5917364358901978,
"learning_rate": 3.195402350659945e-06,
"loss": 2.0315,
"step": 18100
},
{
"epoch": 0.9531290913851793,
"grad_norm": 0.5886921286582947,
"learning_rate": 2.616432540460255e-06,
"loss": 2.0335,
"step": 18200
},
{
"epoch": 0.9583660644147682,
"grad_norm": 0.6519348621368408,
"learning_rate": 2.0873735235683535e-06,
"loss": 2.0138,
"step": 18300
},
{
"epoch": 0.9636030374443572,
"grad_norm": 0.6619024872779846,
"learning_rate": 1.6118015868380387e-06,
"loss": 2.0223,
"step": 18400
},
{
"epoch": 0.9688400104739461,
"grad_norm": 0.6300016045570374,
"learning_rate": 1.1929314598383423e-06,
"loss": 2.0184,
"step": 18500
},
{
"epoch": 0.974076983503535,
"grad_norm": 0.6754645109176636,
"learning_rate": 8.335945842058524e-07,
"loss": 2.0215,
"step": 18600
},
{
"epoch": 0.9793139565331238,
"grad_norm": 0.7369129657745361,
"learning_rate": 5.362199739132656e-07,
"loss": 2.0138,
"step": 18700
},
{
"epoch": 0.9845509295627127,
"grad_norm": 0.5822499990463257,
"learning_rate": 3.028177958332512e-07,
"loss": 2.0249,
"step": 18800
},
{
"epoch": 0.9897879025923016,
"grad_norm": 0.6991373896598816,
"learning_rate": 1.349657815883032e-07,
"loss": 2.0329,
"step": 18900
},
{
"epoch": 0.9950248756218906,
"grad_norm": 0.6656786203384399,
"learning_rate": 3.379856253855951e-08,
"loss": 2.0112,
"step": 19000
},
{
"epoch": 1.0,
"step": 19095,
"total_flos": 2.7919996141761987e+18,
"train_loss": 2.1007044342432573,
"train_runtime": 7350.2103,
"train_samples_per_second": 41.566,
"train_steps_per_second": 2.598
}
],
"logging_steps": 100,
"max_steps": 19095,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.7919996141761987e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}
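
The `log_history` list above holds one entry per `logging_steps` (100) optimizer steps, each carrying the running `loss`, `grad_norm`, and scheduler `learning_rate`, while the final entry instead carries run-level summary fields (`train_loss`, `train_runtime`, `total_flos`). A minimal sketch of how such a file could be inspected offline, assuming it has been downloaded locally as `trainer_state.json` and that matplotlib is available:

```python
# Sketch only: assumes the file above is saved locally as "trainer_state.json"
# and that matplotlib is installed. Reads log_history and plots training loss
# and learning rate against the global step.
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

# Per-step entries have a "loss" key; the final summary entry has
# "train_loss" instead, so filter on the key that is present.
logs = [entry for entry in state["log_history"] if "loss" in entry]

steps = [entry["step"] for entry in logs]
losses = [entry["loss"] for entry in logs]
lrs = [entry["learning_rate"] for entry in logs]

fig, (ax_loss, ax_lr) = plt.subplots(2, 1, sharex=True, figsize=(8, 6))
ax_loss.plot(steps, losses)
ax_loss.set_ylabel("training loss")
ax_lr.plot(steps, lrs)
ax_lr.set_ylabel("learning rate")
ax_lr.set_xlabel("global step")
fig.tight_layout()
fig.savefig("trainer_state_curves.png")
```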