End of training

Browse files

Files changed (6) hide show

README.md +14 -2
all_results.json +16 -0
eval_results.json +10 -0
runs/Jul24_22-53-38_phyl-ling-p01.la.utexas.edu/events.out.tfevents.1721911056.phyl-ling-p01.la.utexas.edu.132712.1 +3 -0
train_results.json +9 -0
trainer_state.json +1119 -0

README.md CHANGED Viewed

@@ -1,11 +1,23 @@
 ---
 tags:
 - generated_from_trainer
 metrics:
 - accuracy
 model-index:
 - name: opt-babylm2-subset-default-3e-4
-  results: []
 ---
 <!-- This model card has been generated automatically according to the information the Trainer had access to. You
@@ -13,7 +25,7 @@ should probably proofread and complete it, then remove this comment. -->
 # opt-babylm2-subset-default-3e-4
-This model was trained from scratch on an unknown dataset.
 It achieves the following results on the evaluation set:
 - Loss: 2.3776
 - Accuracy: 0.5327

 ---
 tags:
 - generated_from_trainer
+datasets:
+- kanishka/babylm2-subset
 metrics:
 - accuracy
 model-index:
 - name: opt-babylm2-subset-default-3e-4
+  results:
+  - task:
+      name: Causal Language Modeling
+      type: text-generation
+    dataset:
+      name: kanishka/babylm2-subset
+      type: kanishka/babylm2-subset
+    metrics:
+    - name: Accuracy
+      type: accuracy
+      value: 0.5327396962492645
 ---
 <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 # opt-babylm2-subset-default-3e-4
+This model was trained from scratch on the kanishka/babylm2-subset dataset.
 It achieves the following results on the evaluation set:
 - Loss: 2.3776
 - Accuracy: 0.5327

all_results.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+    "epoch": 10.0,
+    "eval_accuracy": 0.5327396962492645,
+    "eval_loss": 2.377573251724243,
+    "eval_runtime": 124.4661,
+    "eval_samples": 46951,
+    "eval_samples_per_second": 377.219,
+    "eval_steps_per_second": 5.897,
+    "perplexity": 10.77871387443153,
+    "total_flos": 5.9695438005504e+17,
+    "train_loss": 2.237272139724321,
+    "train_runtime": 31036.7572,
+    "train_samples": 453383,
+    "train_samples_per_second": 146.079,
+    "train_steps_per_second": 4.565
+}

eval_results.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+    "epoch": 10.0,
+    "eval_accuracy": 0.5327396962492645,
+    "eval_loss": 2.377573251724243,
+    "eval_runtime": 124.4661,
+    "eval_samples": 46951,
+    "eval_samples_per_second": 377.219,
+    "eval_steps_per_second": 5.897,
+    "perplexity": 10.77871387443153
+}

runs/Jul24_22-53-38_phyl-ling-p01.la.utexas.edu/events.out.tfevents.1721911056.phyl-ling-p01.la.utexas.edu.132712.1 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f902d4566853072bec641ec0a18171a081d044cfdc6327673cbc1ce69376a2a9
+size 417

train_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 10.0,
+    "total_flos": 5.9695438005504e+17,
+    "train_loss": 2.237272139724321,
+    "train_runtime": 31036.7572,
+    "train_samples": 453383,
+    "train_samples_per_second": 146.079,
+    "train_steps_per_second": 4.565
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,1119 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 10.0,
+  "eval_steps": 500,
+  "global_step": 141690,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.07057661091114405,
+      "grad_norm": 0.8544681072235107,
+      "learning_rate": 9.375e-06,
+      "loss": 5.8075,
+      "step": 1000
+    },
+    {
+      "epoch": 0.1411532218222881,
+      "grad_norm": 0.9348427057266235,
+      "learning_rate": 1.875e-05,
+      "loss": 3.9507,
+      "step": 2000
+    },
+    {
+      "epoch": 0.21172983273343213,
+      "grad_norm": 0.9987612366676331,
+      "learning_rate": 2.8125e-05,
+      "loss": 3.6154,
+      "step": 3000
+    },
+    {
+      "epoch": 0.2823064436445762,
+      "grad_norm": 0.9734600782394409,
+      "learning_rate": 3.75e-05,
+      "loss": 3.444,
+      "step": 4000
+    },
+    {
+      "epoch": 0.3528830545557202,
+      "grad_norm": 0.9963154196739197,
+      "learning_rate": 4.6874999999999994e-05,
+      "loss": 3.2983,
+      "step": 5000
+    },
+    {
+      "epoch": 0.42345966546686425,
+      "grad_norm": 0.8833392858505249,
+      "learning_rate": 5.625e-05,
+      "loss": 3.1806,
+      "step": 6000
+    },
+    {
+      "epoch": 0.49403627637800834,
+      "grad_norm": 0.8395134806632996,
+      "learning_rate": 6.5625e-05,
+      "loss": 3.0762,
+      "step": 7000
+    },
+    {
+      "epoch": 0.5646128872891524,
+      "grad_norm": 0.9056613445281982,
+      "learning_rate": 7.5e-05,
+      "loss": 3.0103,
+      "step": 8000
+    },
+    {
+      "epoch": 0.6351894982002965,
+      "grad_norm": 0.8697331547737122,
+      "learning_rate": 8.437499999999999e-05,
+      "loss": 2.9146,
+      "step": 9000
+    },
+    {
+      "epoch": 0.7057661091114404,
+      "grad_norm": 0.7743774056434631,
+      "learning_rate": 9.374999999999999e-05,
+      "loss": 2.8472,
+      "step": 10000
+    },
+    {
+      "epoch": 0.7763427200225845,
+      "grad_norm": 0.769959568977356,
+      "learning_rate": 0.00010312499999999999,
+      "loss": 2.8093,
+      "step": 11000
+    },
+    {
+      "epoch": 0.8469193309337285,
+      "grad_norm": 0.7926977872848511,
+      "learning_rate": 0.0001125,
+      "loss": 2.7498,
+      "step": 12000
+    },
+    {
+      "epoch": 0.9174959418448726,
+      "grad_norm": 0.7256486415863037,
+      "learning_rate": 0.000121865625,
+      "loss": 2.714,
+      "step": 13000
+    },
+    {
+      "epoch": 0.9880725527560167,
+      "grad_norm": 0.7287374138832092,
+      "learning_rate": 0.000131240625,
+      "loss": 2.6575,
+      "step": 14000
+    },
+    {
+      "epoch": 1.0,
+      "eval_accuracy": 0.4749594473317534,
+      "eval_loss": 2.8570854663848877,
+      "eval_runtime": 123.5266,
+      "eval_samples_per_second": 380.088,
+      "eval_steps_per_second": 5.942,
+      "step": 14169
+    },
+    {
+      "epoch": 1.0586491636671607,
+      "grad_norm": 0.7397704124450684,
+      "learning_rate": 0.00014060625,
+      "loss": 2.6238,
+      "step": 15000
+    },
+    {
+      "epoch": 1.1292257745783048,
+      "grad_norm": 0.7099004983901978,
+      "learning_rate": 0.000149971875,
+      "loss": 2.5914,
+      "step": 16000
+    },
+    {
+      "epoch": 1.1998023854894488,
+      "grad_norm": 0.6785891056060791,
+      "learning_rate": 0.000159346875,
+      "loss": 2.5765,
+      "step": 17000
+    },
+    {
+      "epoch": 1.2703789964005927,
+      "grad_norm": 0.6459276080131531,
+      "learning_rate": 0.000168703125,
+      "loss": 2.5484,
+      "step": 18000
+    },
+    {
+      "epoch": 1.340955607311737,
+      "grad_norm": 0.6296180486679077,
+      "learning_rate": 0.000178078125,
+      "loss": 2.5329,
+      "step": 19000
+    },
+    {
+      "epoch": 1.4115322182228809,
+      "grad_norm": 0.648757815361023,
+      "learning_rate": 0.00018745312499999998,
+      "loss": 2.5078,
+      "step": 20000
+    },
+    {
+      "epoch": 1.482108829134025,
+      "grad_norm": 0.6126940250396729,
+      "learning_rate": 0.00019681874999999998,
+      "loss": 2.5066,
+      "step": 21000
+    },
+    {
+      "epoch": 1.552685440045169,
+      "grad_norm": 0.5499350428581238,
+      "learning_rate": 0.00020618437499999995,
+      "loss": 2.4882,
+      "step": 22000
+    },
+    {
+      "epoch": 1.623262050956313,
+      "grad_norm": 0.7012745141983032,
+      "learning_rate": 0.00021555937499999998,
+      "loss": 2.4746,
+      "step": 23000
+    },
+    {
+      "epoch": 1.6938386618674572,
+      "grad_norm": 0.563831627368927,
+      "learning_rate": 0.00022493437499999998,
+      "loss": 2.4607,
+      "step": 24000
+    },
+    {
+      "epoch": 1.764415272778601,
+      "grad_norm": 0.4928041696548462,
+      "learning_rate": 0.00023430937499999997,
+      "loss": 2.4454,
+      "step": 25000
+    },
+    {
+      "epoch": 1.8349918836897452,
+      "grad_norm": 0.5389479398727417,
+      "learning_rate": 0.00024367499999999997,
+      "loss": 2.4429,
+      "step": 26000
+    },
+    {
+      "epoch": 1.9055684946008893,
+      "grad_norm": 0.549089252948761,
+      "learning_rate": 0.000253040625,
+      "loss": 2.4239,
+      "step": 27000
+    },
+    {
+      "epoch": 1.9761451055120332,
+      "grad_norm": 0.5027530193328857,
+      "learning_rate": 0.000262415625,
+      "loss": 2.4179,
+      "step": 28000
+    },
+    {
+      "epoch": 2.0,
+      "eval_accuracy": 0.4989820084802377,
+      "eval_loss": 2.6257801055908203,
+      "eval_runtime": 124.6794,
+      "eval_samples_per_second": 376.574,
+      "eval_steps_per_second": 5.887,
+      "step": 28338
+    },
+    {
+      "epoch": 2.0467217164231775,
+      "grad_norm": 0.4511743485927582,
+      "learning_rate": 0.000271790625,
+      "loss": 2.3885,
+      "step": 29000
+    },
+    {
+      "epoch": 2.1172983273343213,
+      "grad_norm": 0.4871021807193756,
+      "learning_rate": 0.00028115624999999994,
+      "loss": 2.378,
+      "step": 30000
+    },
+    {
+      "epoch": 2.1878749382454656,
+      "grad_norm": 0.4743562638759613,
+      "learning_rate": 0.00029053124999999994,
+      "loss": 2.3721,
+      "step": 31000
+    },
+    {
+      "epoch": 2.2584515491566095,
+      "grad_norm": 0.4668987989425659,
+      "learning_rate": 0.00029990624999999993,
+      "loss": 2.3584,
+      "step": 32000
+    },
+    {
+      "epoch": 2.3290281600677534,
+      "grad_norm": 0.4822200834751129,
+      "learning_rate": 0.0002972951043850852,
+      "loss": 2.3592,
+      "step": 33000
+    },
+    {
+      "epoch": 2.3996047709788977,
+      "grad_norm": 0.43820998072624207,
+      "learning_rate": 0.0002945601239857781,
+      "loss": 2.3459,
+      "step": 34000
+    },
+    {
+      "epoch": 2.4701813818900415,
+      "grad_norm": 0.4586232602596283,
+      "learning_rate": 0.0002918251435864709,
+      "loss": 2.3499,
+      "step": 35000
+    },
+    {
+      "epoch": 2.5407579928011854,
+      "grad_norm": 0.4139242172241211,
+      "learning_rate": 0.0002890928981675631,
+      "loss": 2.3302,
+      "step": 36000
+    },
+    {
+      "epoch": 2.6113346037123297,
+      "grad_norm": 0.41075077652931213,
+      "learning_rate": 0.000286357917768256,
+      "loss": 2.3236,
+      "step": 37000
+    },
+    {
+      "epoch": 2.681911214623474,
+      "grad_norm": 0.4315313994884491,
+      "learning_rate": 0.00028362567234934815,
+      "loss": 2.3239,
+      "step": 38000
+    },
+    {
+      "epoch": 2.752487825534618,
+      "grad_norm": 0.39671897888183594,
+      "learning_rate": 0.000280890691950041,
+      "loss": 2.3087,
+      "step": 39000
+    },
+    {
+      "epoch": 2.8230644364457618,
+      "grad_norm": 0.4033380150794983,
+      "learning_rate": 0.00027815844653113317,
+      "loss": 2.2952,
+      "step": 40000
+    },
+    {
+      "epoch": 2.893641047356906,
+      "grad_norm": 0.36479175090789795,
+      "learning_rate": 0.000275423466131826,
+      "loss": 2.3049,
+      "step": 41000
+    },
+    {
+      "epoch": 2.96421765826805,
+      "grad_norm": 0.3533291220664978,
+      "learning_rate": 0.00027268848573251887,
+      "loss": 2.286,
+      "step": 42000
+    },
+    {
+      "epoch": 3.0,
+      "eval_accuracy": 0.5122970740171453,
+      "eval_loss": 2.5088469982147217,
+      "eval_runtime": 124.5088,
+      "eval_samples_per_second": 377.09,
+      "eval_steps_per_second": 5.895,
+      "step": 42507
+    },
+    {
+      "epoch": 3.034794269179194,
+      "grad_norm": 0.37031927704811096,
+      "learning_rate": 0.00026995350533321177,
+      "loss": 2.2529,
+      "step": 43000
+    },
+    {
+      "epoch": 3.105370880090338,
+      "grad_norm": 0.39132222533226013,
+      "learning_rate": 0.0002672185249339046,
+      "loss": 2.2252,
+      "step": 44000
+    },
+    {
+      "epoch": 3.175947491001482,
+      "grad_norm": 0.36771416664123535,
+      "learning_rate": 0.0002644835445345975,
+      "loss": 2.2389,
+      "step": 45000
+    },
+    {
+      "epoch": 3.2465241019126263,
+      "grad_norm": 0.40541785955429077,
+      "learning_rate": 0.00026175129911568964,
+      "loss": 2.2224,
+      "step": 46000
+    },
+    {
+      "epoch": 3.31710071282377,
+      "grad_norm": 0.4325103461742401,
+      "learning_rate": 0.0002590190536967818,
+      "loss": 2.2286,
+      "step": 47000
+    },
+    {
+      "epoch": 3.3876773237349145,
+      "grad_norm": 0.3990887403488159,
+      "learning_rate": 0.00025628407329747466,
+      "loss": 2.216,
+      "step": 48000
+    },
+    {
+      "epoch": 3.4582539346460583,
+      "grad_norm": 0.36805373430252075,
+      "learning_rate": 0.00025354909289816756,
+      "loss": 2.2209,
+      "step": 49000
+    },
+    {
+      "epoch": 3.528830545557202,
+      "grad_norm": 0.357721209526062,
+      "learning_rate": 0.0002508141124988604,
+      "loss": 2.2218,
+      "step": 50000
+    },
+    {
+      "epoch": 3.5994071564683465,
+      "grad_norm": 0.3424566984176636,
+      "learning_rate": 0.00024808460206035186,
+      "loss": 2.2159,
+      "step": 51000
+    },
+    {
+      "epoch": 3.6699837673794904,
+      "grad_norm": 0.35358792543411255,
+      "learning_rate": 0.00024534962166104476,
+      "loss": 2.2085,
+      "step": 52000
+    },
+    {
+      "epoch": 3.7405603782906347,
+      "grad_norm": 0.369150310754776,
+      "learning_rate": 0.0002426146412617376,
+      "loss": 2.2033,
+      "step": 53000
+    },
+    {
+      "epoch": 3.8111369892017786,
+      "grad_norm": 0.3341532051563263,
+      "learning_rate": 0.00023987966086243048,
+      "loss": 2.2017,
+      "step": 54000
+    },
+    {
+      "epoch": 3.8817136001129224,
+      "grad_norm": 0.40089789032936096,
+      "learning_rate": 0.00023714741544352263,
+      "loss": 2.2015,
+      "step": 55000
+    },
+    {
+      "epoch": 3.9522902110240667,
+      "grad_norm": 0.35854268074035645,
+      "learning_rate": 0.0002344124350442155,
+      "loss": 2.2124,
+      "step": 56000
+    },
+    {
+      "epoch": 4.0,
+      "eval_accuracy": 0.5203132962446899,
+      "eval_loss": 2.444835662841797,
+      "eval_runtime": 124.4717,
+      "eval_samples_per_second": 377.202,
+      "eval_steps_per_second": 5.897,
+      "step": 56676
+    },
+    {
+      "epoch": 4.022866821935211,
+      "grad_norm": 0.3867768347263336,
+      "learning_rate": 0.00023167745464490835,
+      "loss": 2.1839,
+      "step": 57000
+    },
+    {
+      "epoch": 4.093443432846355,
+      "grad_norm": 0.3627295196056366,
+      "learning_rate": 0.00022894520922600055,
+      "loss": 2.1384,
+      "step": 58000
+    },
+    {
+      "epoch": 4.164020043757499,
+      "grad_norm": 0.35046979784965515,
+      "learning_rate": 0.00022621022882669337,
+      "loss": 2.1439,
+      "step": 59000
+    },
+    {
+      "epoch": 4.234596654668643,
+      "grad_norm": 0.3317088186740875,
+      "learning_rate": 0.00022347524842738624,
+      "loss": 2.1446,
+      "step": 60000
+    },
+    {
+      "epoch": 4.3051732655797865,
+      "grad_norm": 0.37563446164131165,
+      "learning_rate": 0.00022074300300847845,
+      "loss": 2.1408,
+      "step": 61000
+    },
+    {
+      "epoch": 4.375749876490931,
+      "grad_norm": 0.36360374093055725,
+      "learning_rate": 0.00021800802260917127,
+      "loss": 2.1447,
+      "step": 62000
+    },
+    {
+      "epoch": 4.446326487402075,
+      "grad_norm": 0.35474300384521484,
+      "learning_rate": 0.00021527577719026347,
+      "loss": 2.1478,
+      "step": 63000
+    },
+    {
+      "epoch": 4.516903098313219,
+      "grad_norm": 0.38771218061447144,
+      "learning_rate": 0.00021254079679095632,
+      "loss": 2.136,
+      "step": 64000
+    },
+    {
+      "epoch": 4.587479709224363,
+      "grad_norm": 0.3860458433628082,
+      "learning_rate": 0.0002098085513720485,
+      "loss": 2.1379,
+      "step": 65000
+    },
+    {
+      "epoch": 4.658056320135507,
+      "grad_norm": 0.3763484060764313,
+      "learning_rate": 0.00020707357097274134,
+      "loss": 2.1344,
+      "step": 66000
+    },
+    {
+      "epoch": 4.7286329310466515,
+      "grad_norm": 0.37907466292381287,
+      "learning_rate": 0.0002043413255538335,
+      "loss": 2.1426,
+      "step": 67000
+    },
+    {
+      "epoch": 4.799209541957795,
+      "grad_norm": 0.34690865874290466,
+      "learning_rate": 0.00020160634515452636,
+      "loss": 2.1208,
+      "step": 68000
+    },
+    {
+      "epoch": 4.869786152868939,
+      "grad_norm": 0.36183568835258484,
+      "learning_rate": 0.00019887409973561856,
+      "loss": 2.1242,
+      "step": 69000
+    },
+    {
+      "epoch": 4.940362763780083,
+      "grad_norm": 0.35947293043136597,
+      "learning_rate": 0.00019613911933631138,
+      "loss": 2.1307,
+      "step": 70000
+    },
+    {
+      "epoch": 5.0,
+      "eval_accuracy": 0.5251367702083976,
+      "eval_loss": 2.4099230766296387,
+      "eval_runtime": 124.5048,
+      "eval_samples_per_second": 377.102,
+      "eval_steps_per_second": 5.895,
+      "step": 70845
+    },
+    {
+      "epoch": 5.010939374691227,
+      "grad_norm": 0.3575204312801361,
+      "learning_rate": 0.00019340687391740358,
+      "loss": 2.1265,
+      "step": 71000
+    },
+    {
+      "epoch": 5.081515985602372,
+      "grad_norm": 0.38784071803092957,
+      "learning_rate": 0.00019067189351809646,
+      "loss": 2.064,
+      "step": 72000
+    },
+    {
+      "epoch": 5.152092596513516,
+      "grad_norm": 0.34589263796806335,
+      "learning_rate": 0.0001879396480991886,
+      "loss": 2.0754,
+      "step": 73000
+    },
+    {
+      "epoch": 5.2226692074246595,
+      "grad_norm": 0.3403594195842743,
+      "learning_rate": 0.00018520466769988148,
+      "loss": 2.073,
+      "step": 74000
+    },
+    {
+      "epoch": 5.293245818335803,
+      "grad_norm": 0.38706710934638977,
+      "learning_rate": 0.00018246968730057433,
+      "loss": 2.0812,
+      "step": 75000
+    },
+    {
+      "epoch": 5.363822429246947,
+      "grad_norm": 0.39309191703796387,
+      "learning_rate": 0.0001797347069012672,
+      "loss": 2.0915,
+      "step": 76000
+    },
+    {
+      "epoch": 5.434399040158092,
+      "grad_norm": 0.37432822585105896,
+      "learning_rate": 0.00017700246148235935,
+      "loss": 2.0755,
+      "step": 77000
+    },
+    {
+      "epoch": 5.504975651069236,
+      "grad_norm": 0.3538018465042114,
+      "learning_rate": 0.00017426748108305222,
+      "loss": 2.0772,
+      "step": 78000
+    },
+    {
+      "epoch": 5.57555226198038,
+      "grad_norm": 0.3601301610469818,
+      "learning_rate": 0.00017153523566414437,
+      "loss": 2.085,
+      "step": 79000
+    },
+    {
+      "epoch": 5.6461288728915235,
+      "grad_norm": 0.3469400703907013,
+      "learning_rate": 0.00016880299024523657,
+      "loss": 2.078,
+      "step": 80000
+    },
+    {
+      "epoch": 5.716705483802667,
+      "grad_norm": 0.3623177111148834,
+      "learning_rate": 0.00016607074482632872,
+      "loss": 2.077,
+      "step": 81000
+    },
+    {
+      "epoch": 5.787282094713812,
+      "grad_norm": 0.3382331132888794,
+      "learning_rate": 0.0001633357644270216,
+      "loss": 2.0745,
+      "step": 82000
+    },
+    {
+      "epoch": 5.857858705624956,
+      "grad_norm": 0.3787217140197754,
+      "learning_rate": 0.00016060078402771447,
+      "loss": 2.0768,
+      "step": 83000
+    },
+    {
+      "epoch": 5.9284353165361,
+      "grad_norm": 0.36773914098739624,
+      "learning_rate": 0.00015786580362840732,
+      "loss": 2.0756,
+      "step": 84000
+    },
+    {
+      "epoch": 5.999011927447244,
+      "grad_norm": 0.39036858081817627,
+      "learning_rate": 0.00015513082322910016,
+      "loss": 2.0706,
+      "step": 85000
+    },
+    {
+      "epoch": 6.0,
+      "eval_accuracy": 0.5280841264512295,
+      "eval_loss": 2.388700246810913,
+      "eval_runtime": 125.3022,
+      "eval_samples_per_second": 374.702,
+      "eval_steps_per_second": 5.858,
+      "step": 85014
+    },
+    {
+      "epoch": 6.069588538358388,
+      "grad_norm": 0.3616304099559784,
+      "learning_rate": 0.00015239584282979304,
+      "loss": 2.0123,
+      "step": 86000
+    },
+    {
+      "epoch": 6.140165149269532,
+      "grad_norm": 0.37002402544021606,
+      "learning_rate": 0.0001496635974108852,
+      "loss": 2.0153,
+      "step": 87000
+    },
+    {
+      "epoch": 6.210741760180676,
+      "grad_norm": 0.3603184223175049,
+      "learning_rate": 0.0001469286170115781,
+      "loss": 2.0199,
+      "step": 88000
+    },
+    {
+      "epoch": 6.28131837109182,
+      "grad_norm": 0.37550196051597595,
+      "learning_rate": 0.00014419637159267024,
+      "loss": 2.0138,
+      "step": 89000
+    },
+    {
+      "epoch": 6.351894982002964,
+      "grad_norm": 0.3768686056137085,
+      "learning_rate": 0.0001414641261737624,
+      "loss": 2.0314,
+      "step": 90000
+    },
+    {
+      "epoch": 6.422471592914108,
+      "grad_norm": 0.3591078221797943,
+      "learning_rate": 0.00013872914577445526,
+      "loss": 2.0226,
+      "step": 91000
+    },
+    {
+      "epoch": 6.493048203825253,
+      "grad_norm": 0.3905663788318634,
+      "learning_rate": 0.00013599416537514813,
+      "loss": 2.0257,
+      "step": 92000
+    },
+    {
+      "epoch": 6.5636248147363965,
+      "grad_norm": 0.39147230982780457,
+      "learning_rate": 0.00013325918497584098,
+      "loss": 2.0311,
+      "step": 93000
+    },
+    {
+      "epoch": 6.63420142564754,
+      "grad_norm": 0.40250155329704285,
+      "learning_rate": 0.00013052420457653385,
+      "loss": 2.0301,
+      "step": 94000
+    },
+    {
+      "epoch": 6.704778036558684,
+      "grad_norm": 0.3860897123813629,
+      "learning_rate": 0.00012779195915762603,
+      "loss": 2.0312,
+      "step": 95000
+    },
+    {
+      "epoch": 6.775354647469829,
+      "grad_norm": 0.3707718253135681,
+      "learning_rate": 0.0001250597137387182,
+      "loss": 2.0282,
+      "step": 96000
+    },
+    {
+      "epoch": 6.845931258380973,
+      "grad_norm": 0.3690090775489807,
+      "learning_rate": 0.00012232473333941105,
+      "loss": 2.0255,
+      "step": 97000
+    },
+    {
+      "epoch": 6.916507869292117,
+      "grad_norm": 0.4174107313156128,
+      "learning_rate": 0.00011958975294010391,
+      "loss": 2.0276,
+      "step": 98000
+    },
+    {
+      "epoch": 6.987084480203261,
+      "grad_norm": 0.3740842938423157,
+      "learning_rate": 0.00011685750752119609,
+      "loss": 2.0233,
+      "step": 99000
+    },
+    {
+      "epoch": 7.0,
+      "eval_accuracy": 0.5303788443403243,
+      "eval_loss": 2.3779311180114746,
+      "eval_runtime": 124.6993,
+      "eval_samples_per_second": 376.514,
+      "eval_steps_per_second": 5.886,
+      "step": 99183
+    },
+    {
+      "epoch": 7.057661091114404,
+      "grad_norm": 0.37689074873924255,
+      "learning_rate": 0.00011412252712188895,
+      "loss": 1.9805,
+      "step": 100000
+    },
+    {
+      "epoch": 7.128237702025549,
+      "grad_norm": 0.4047756493091583,
+      "learning_rate": 0.00011139028170298112,
+      "loss": 1.9746,
+      "step": 101000
+    },
+    {
+      "epoch": 7.198814312936693,
+      "grad_norm": 0.3889460563659668,
+      "learning_rate": 0.00010865530130367397,
+      "loss": 1.9659,
+      "step": 102000
+    },
+    {
+      "epoch": 7.269390923847837,
+      "grad_norm": 0.4061487019062042,
+      "learning_rate": 0.00010592032090436684,
+      "loss": 1.9708,
+      "step": 103000
+    },
+    {
+      "epoch": 7.339967534758981,
+      "grad_norm": 0.4057160019874573,
+      "learning_rate": 0.0001031853405050597,
+      "loss": 1.9772,
+      "step": 104000
+    },
+    {
+      "epoch": 7.410544145670125,
+      "grad_norm": 0.40489134192466736,
+      "learning_rate": 0.00010045309508615188,
+      "loss": 1.9736,
+      "step": 105000
+    },
+    {
+      "epoch": 7.481120756581269,
+      "grad_norm": 0.4148654043674469,
+      "learning_rate": 9.771811468684473e-05,
+      "loss": 1.9796,
+      "step": 106000
+    },
+    {
+      "epoch": 7.551697367492413,
+      "grad_norm": 0.41483381390571594,
+      "learning_rate": 9.49831342875376e-05,
+      "loss": 1.9804,
+      "step": 107000
+    },
+    {
+      "epoch": 7.622273978403557,
+      "grad_norm": 0.43718773126602173,
+      "learning_rate": 9.225088886862978e-05,
+      "loss": 1.9712,
+      "step": 108000
+    },
+    {
+      "epoch": 7.692850589314701,
+      "grad_norm": 0.40646329522132874,
+      "learning_rate": 8.951590846932262e-05,
+      "loss": 1.9822,
+      "step": 109000
+    },
+    {
+      "epoch": 7.763427200225845,
+      "grad_norm": 0.44571158289909363,
+      "learning_rate": 8.67836630504148e-05,
+      "loss": 1.9832,
+      "step": 110000
+    },
+    {
+      "epoch": 7.83400381113699,
+      "grad_norm": 0.41726765036582947,
+      "learning_rate": 8.404868265110766e-05,
+      "loss": 1.9747,
+      "step": 111000
+    },
+    {
+      "epoch": 7.9045804220481335,
+      "grad_norm": 0.39210569858551025,
+      "learning_rate": 8.131370225180053e-05,
+      "loss": 1.9929,
+      "step": 112000
+    },
+    {
+      "epoch": 7.975157032959277,
+      "grad_norm": 0.37121346592903137,
+      "learning_rate": 7.858145683289268e-05,
+      "loss": 1.9727,
+      "step": 113000
+    },
+    {
+      "epoch": 8.0,
+      "eval_accuracy": 0.5315104156523147,
+      "eval_loss": 2.3731467723846436,
+      "eval_runtime": 124.474,
+      "eval_samples_per_second": 377.195,
+      "eval_steps_per_second": 5.897,
+      "step": 113352
+    },
+    {
+      "epoch": 8.045733643870422,
+      "grad_norm": 0.3864983916282654,
+      "learning_rate": 7.584647643358555e-05,
+      "loss": 1.9365,
+      "step": 114000
+    },
+    {
+      "epoch": 8.116310254781565,
+      "grad_norm": 0.4133272171020508,
+      "learning_rate": 7.311423101467773e-05,
+      "loss": 1.9174,
+      "step": 115000
+    },
+    {
+      "epoch": 8.18688686569271,
+      "grad_norm": 0.4471355676651001,
+      "learning_rate": 7.037925061537059e-05,
+      "loss": 1.9327,
+      "step": 116000
+    },
+    {
+      "epoch": 8.257463476603853,
+      "grad_norm": 0.42897623777389526,
+      "learning_rate": 6.764700519646275e-05,
+      "loss": 1.9288,
+      "step": 117000
+    },
+    {
+      "epoch": 8.328040087514998,
+      "grad_norm": 0.43864506483078003,
+      "learning_rate": 6.491202479715561e-05,
+      "loss": 1.9359,
+      "step": 118000
+    },
+    {
+      "epoch": 8.398616698426142,
+      "grad_norm": 0.46767184138298035,
+      "learning_rate": 6.217977937824779e-05,
+      "loss": 1.9399,
+      "step": 119000
+    },
+    {
+      "epoch": 8.469193309337285,
+      "grad_norm": 0.4159405827522278,
+      "learning_rate": 5.944479897894064e-05,
+      "loss": 1.9318,
+      "step": 120000
+    },
+    {
+      "epoch": 8.53976992024843,
+      "grad_norm": 0.4233142137527466,
+      "learning_rate": 5.671255356003281e-05,
+      "loss": 1.9316,
+      "step": 121000
+    },
+    {
+      "epoch": 8.610346531159573,
+      "grad_norm": 0.4398587942123413,
+      "learning_rate": 5.3980308141124983e-05,
+      "loss": 1.9349,
+      "step": 122000
+    },
+    {
+      "epoch": 8.680923142070718,
+      "grad_norm": 0.424790620803833,
+      "learning_rate": 5.124532774181785e-05,
+      "loss": 1.9382,
+      "step": 123000
+    },
+    {
+      "epoch": 8.751499752981863,
+      "grad_norm": 0.4141928553581238,
+      "learning_rate": 4.851034734251071e-05,
+      "loss": 1.9314,
+      "step": 124000
+    },
+    {
+      "epoch": 8.822076363893006,
+      "grad_norm": 0.45448678731918335,
+      "learning_rate": 4.577536694320357e-05,
+      "loss": 1.9374,
+      "step": 125000
+    },
+    {
+      "epoch": 8.89265297480415,
+      "grad_norm": 0.4196777939796448,
+      "learning_rate": 4.304312152429574e-05,
+      "loss": 1.9413,
+      "step": 126000
+    },
+    {
+      "epoch": 8.963229585715293,
+      "grad_norm": 0.3975803256034851,
+      "learning_rate": 4.03081411249886e-05,
+      "loss": 1.9311,
+      "step": 127000
+    },
+    {
+      "epoch": 9.0,
+      "eval_accuracy": 0.532392202583586,
+      "eval_loss": 2.3728187084198,
+      "eval_runtime": 124.6433,
+      "eval_samples_per_second": 376.683,
+      "eval_steps_per_second": 5.889,
+      "step": 127521
+    },
+    {
+      "epoch": 9.033806196626438,
+      "grad_norm": 0.42701438069343567,
+      "learning_rate": 3.757589570608077e-05,
+      "loss": 1.9097,
+      "step": 128000
+    },
+    {
+      "epoch": 9.104382807537583,
+      "grad_norm": 0.44095513224601746,
+      "learning_rate": 3.484091530677363e-05,
+      "loss": 1.8841,
+      "step": 129000
+    },
+    {
+      "epoch": 9.174959418448726,
+      "grad_norm": 0.4554646909236908,
+      "learning_rate": 3.2108669887865805e-05,
+      "loss": 1.898,
+      "step": 130000
+    },
+    {
+      "epoch": 9.24553602935987,
+      "grad_norm": 0.4306824803352356,
+      "learning_rate": 2.937642446895797e-05,
+      "loss": 1.8921,
+      "step": 131000
+    },
+    {
+      "epoch": 9.316112640271013,
+      "grad_norm": 0.4496276080608368,
+      "learning_rate": 2.664144406965083e-05,
+      "loss": 1.8883,
+      "step": 132000
+    },
+    {
+      "epoch": 9.386689251182158,
+      "grad_norm": 0.4314197301864624,
+      "learning_rate": 2.3906463670343695e-05,
+      "loss": 1.8972,
+      "step": 133000
+    },
+    {
+      "epoch": 9.457265862093303,
+      "grad_norm": 0.4485911428928375,
+      "learning_rate": 2.1171483271036556e-05,
+      "loss": 1.8908,
+      "step": 134000
+    },
+    {
+      "epoch": 9.527842473004446,
+      "grad_norm": 0.4166420102119446,
+      "learning_rate": 1.8439237852128724e-05,
+      "loss": 1.8924,
+      "step": 135000
+    },
+    {
+      "epoch": 9.59841908391559,
+      "grad_norm": 0.4447433650493622,
+      "learning_rate": 1.5704257452821588e-05,
+      "loss": 1.892,
+      "step": 136000
+    },
+    {
+      "epoch": 9.668995694826734,
+      "grad_norm": 0.42986035346984863,
+      "learning_rate": 1.2972012033913756e-05,
+      "loss": 1.8992,
+      "step": 137000
+    },
+    {
+      "epoch": 9.739572305737878,
+      "grad_norm": 0.45574498176574707,
+      "learning_rate": 1.0237031634606618e-05,
+      "loss": 1.8972,
+      "step": 138000
+    },
+    {
+      "epoch": 9.810148916649023,
+      "grad_norm": 0.4515238404273987,
+      "learning_rate": 7.5020512352994795e-06,
+      "loss": 1.8952,
+      "step": 139000
+    },
+    {
+      "epoch": 9.880725527560166,
+      "grad_norm": 0.43244990706443787,
+      "learning_rate": 4.769805816391649e-06,
+      "loss": 1.8946,
+      "step": 140000
+    },
+    {
+      "epoch": 9.951302138471311,
+      "grad_norm": 0.44382914900779724,
+      "learning_rate": 2.0348254170845108e-06,
+      "loss": 1.8943,
+      "step": 141000
+    },
+    {
+      "epoch": 10.0,
+      "eval_accuracy": 0.5327396962492645,
+      "eval_loss": 2.377573251724243,
+      "eval_runtime": 124.448,
+      "eval_samples_per_second": 377.274,
+      "eval_steps_per_second": 5.898,
+      "step": 141690
+    },
+    {
+      "epoch": 10.0,
+      "step": 141690,
+      "total_flos": 5.9695438005504e+17,
+      "train_loss": 2.237272139724321,
+      "train_runtime": 31036.7572,
+      "train_samples_per_second": 146.079,
+      "train_steps_per_second": 4.565
+    }
+  ],
+  "logging_steps": 1000,
+  "max_steps": 141690,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 10,
+  "save_steps": 5000,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 5.9695438005504e+17,
+  "train_batch_size": 32,
+  "trial_name": null,
+  "trial_params": null
+}