Model save

Browse files

Files changed (11) hide show

README.md +28 -12
all_results.json +5 -5
config.json +1 -1
model-00001-of-00002.safetensors +1 -1
model-00002-of-00002.safetensors +1 -1
runs/Sep27_17-24-19_action-graph-trainer/events.out.tfevents.1727460461.action-graph-trainer.379760.0 +3 -0
runs/Sep27_20-11-03_action-graph-trainer/events.out.tfevents.1727468262.action-graph-trainer.430650.0 +3 -0
tokenizer.json +1 -1
train_results.json +5 -5
trainer_state.json +853 -685
training_args.bin +1 -1

README.md CHANGED Viewed

@@ -3,15 +3,10 @@ library_name: transformers
 license: llama3.2
 base_model: tanliboy/llama-3.2-3b
 tags:
-- alignment-handbook
-- trl
-- sft
-- generated_from_trainer
 - trl
 - sft
 - generated_from_trainer
-datasets:
-- tanliboy/OpenHermes-2.5-reformat
 model-index:
 - name: llama-3.2-3b-sft
   results: []
@@ -22,9 +17,9 @@ should probably proofread and complete it, then remove this comment. -->
 # llama-3.2-3b-sft
-This model is a fine-tuned version of [tanliboy/llama-3.2-3b](https://huggingface.co/tanliboy/llama-3.2-3b) on the tanliboy/OpenHermes-2.5-reformat dataset.
 It achieves the following results on the evaluation set:
-- Loss: 0.7042
 ## Model description
@@ -43,7 +38,7 @@ More information needed
 ### Training hyperparameters
 The following hyperparameters were used during training:
-- learning_rate: 1e-05
 - train_batch_size: 8
 - eval_batch_size: 8
 - seed: 42
@@ -59,9 +54,30 @@ The following hyperparameters were used during training:
 ### Training results
-| Training Loss | Epoch | Step | Validation Loss |
-|:-------------:|:-----:|:----:|:---------------:|
-| 0.6946        | 1.0   | 2230 | 0.7042          |
 ### Framework versions

 license: llama3.2
 base_model: tanliboy/llama-3.2-3b
 tags:
 - trl
 - sft
+- alignment-handbook
 - generated_from_trainer
 model-index:
 - name: llama-3.2-3b-sft
   results: []
 # llama-3.2-3b-sft
+This model is a fine-tuned version of [tanliboy/llama-3.2-3b](https://huggingface.co/tanliboy/llama-3.2-3b) on an unknown dataset.
 It achieves the following results on the evaluation set:
+- Loss: 0.7216
 ## Model description
 ### Training hyperparameters
 The following hyperparameters were used during training:
+- learning_rate: 3e-06
 - train_batch_size: 8
 - eval_batch_size: 8
 - seed: 42
 ### Training results
+| Training Loss | Epoch  | Step | Validation Loss |
+|:-------------:|:------:|:----:|:---------------:|
+| 0.8741        | 0.0448 | 100  | 0.8600          |
+| 0.8038        | 0.0897 | 200  | 0.8095          |
+| 0.7937        | 0.1345 | 300  | 0.7789          |
+| 0.7712        | 0.1794 | 400  | 0.7644          |
+| 0.7393        | 0.2242 | 500  | 0.7565          |
+| 0.7458        | 0.2691 | 600  | 0.7506          |
+| 0.7694        | 0.3139 | 700  | 0.7458          |
+| 0.713         | 0.3587 | 800  | 0.7422          |
+| 0.7347        | 0.4036 | 900  | 0.7387          |
+| 0.7243        | 0.4484 | 1000 | 0.7356          |
+| 0.7161        | 0.4933 | 1100 | 0.7331          |
+| 0.7247        | 0.5381 | 1200 | 0.7308          |
+| 0.7477        | 0.5830 | 1300 | 0.7288          |
+| 0.7429        | 0.6278 | 1400 | 0.7273          |
+| 0.7317        | 0.6726 | 1500 | 0.7256          |
+| 0.7226        | 0.7175 | 1600 | 0.7243          |
+| 0.695         | 0.7623 | 1700 | 0.7234          |
+| 0.7167        | 0.8072 | 1800 | 0.7226          |
+| 0.686         | 0.8520 | 1900 | 0.7221          |
+| 0.7214        | 0.8969 | 2000 | 0.7218          |
+| 0.7358        | 0.9417 | 2100 | 0.7216          |
+| 0.7259        | 0.9865 | 2200 | 0.7216          |
 ### Framework versions

all_results.json CHANGED Viewed

@@ -5,10 +5,10 @@
     "eval_samples": 50077,
     "eval_samples_per_second": 132.107,
     "eval_steps_per_second": 2.066,
-    "total_flos": 244955314192384.0,
-    "train_loss": 0.7376568270371099,
-    "train_runtime": 8720.6202,
     "train_samples": 285435,
-    "train_samples_per_second": 32.731,
-    "train_steps_per_second": 0.256
 }

     "eval_samples": 50077,
     "eval_samples_per_second": 132.107,
     "eval_steps_per_second": 2.066,
+    "total_flos": 250303561007104.0,
+    "train_loss": 0.7492096503219262,
+    "train_runtime": 18007.2993,
     "train_samples": 285435,
+    "train_samples_per_second": 15.851,
+    "train_steps_per_second": 0.124
 }

config.json CHANGED Viewed

@@ -35,6 +35,6 @@
   "tie_word_embeddings": true,
   "torch_dtype": "bfloat16",
   "transformers_version": "4.44.2",
-  "use_cache": true,
   "vocab_size": 128256
 }

   "tie_word_embeddings": true,
   "torch_dtype": "bfloat16",
   "transformers_version": "4.44.2",
+  "use_cache": false,
   "vocab_size": 128256
 }

model-00001-of-00002.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5bafbda1cf46dcdfa68fddb235d102da9d1885ae56e80b4e41840a35aba9df3e
 size 4965799096

 version https://git-lfs.github.com/spec/v1
+oid sha256:09efa89e7b43c24e8c1ab73f7e09196edd014cf14482fe5101aaf902a586d7f2
 size 4965799096

model-00002-of-00002.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:674efdeb16be82251aae01846f486cb86b0be050c9eba1232d0602a00bd5679b
 size 1459729952

 version https://git-lfs.github.com/spec/v1
+oid sha256:c08bcfff732c5238b56bd7df4c0e2c63a0a02d0592c022ee16a201dab2ea9820
 size 1459729952

runs/Sep27_17-24-19_action-graph-trainer/events.out.tfevents.1727460461.action-graph-trainer.379760.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d8664a4d576b88a2be33c4a2390982c36adcdf3a6711e183eecfb35752a6285e
+size 29491

runs/Sep27_20-11-03_action-graph-trainer/events.out.tfevents.1727468262.action-graph-trainer.430650.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f6c18947c9ba2697ba326b828d36458b2b45f1b8572f03289d81b56f7df7627a
+size 59321

tokenizer.json CHANGED Viewed

@@ -2,7 +2,7 @@
   "version": "1.0",
   "truncation": {
     "direction": "Right",
-    "max_length": 2048,
     "strategy": "LongestFirst",
     "stride": 0
   },

   "version": "1.0",
   "truncation": {
     "direction": "Right",
+    "max_length": 4096,
     "strategy": "LongestFirst",
     "stride": 0
   },

train_results.json CHANGED Viewed

@@ -1,9 +1,9 @@
 {
     "epoch": 1.0,
-    "total_flos": 244955314192384.0,
-    "train_loss": 0.7376568270371099,
-    "train_runtime": 8720.6202,
     "train_samples": 285435,
-    "train_samples_per_second": 32.731,
-    "train_steps_per_second": 0.256
 }

 {
     "epoch": 1.0,
+    "total_flos": 250303561007104.0,
+    "train_loss": 0.7492096503219262,
+    "train_runtime": 18007.2993,
     "train_samples": 285435,
+    "train_samples_per_second": 15.851,
+    "train_steps_per_second": 0.124
 }

trainer_state.json CHANGED Viewed

@@ -2,7 +2,7 @@
   "best_metric": null,
   "best_model_checkpoint": null,
   "epoch": 1.0,
-  "eval_steps": 500,
   "global_step": 2230,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
@@ -10,1588 +10,1756 @@
   "log_history": [
     {
       "epoch": 0.0004484304932735426,
-      "grad_norm": 4.697020980518476,
-      "learning_rate": 4.4843049327354265e-08,
       "loss": 0.9912,
       "step": 1
     },
     {
       "epoch": 0.004484304932735426,
-      "grad_norm": 5.023039914107906,
-      "learning_rate": 4.484304932735426e-07,
-      "loss": 1.0335,
       "step": 10
     },
     {
       "epoch": 0.008968609865470852,
-      "grad_norm": 4.618075407201601,
-      "learning_rate": 8.968609865470852e-07,
-      "loss": 1.0355,
       "step": 20
     },
     {
       "epoch": 0.013452914798206279,
-      "grad_norm": 2.181160774619855,
-      "learning_rate": 1.345291479820628e-06,
-      "loss": 0.9824,
       "step": 30
     },
     {
       "epoch": 0.017937219730941704,
-      "grad_norm": 2.008569343268214,
-      "learning_rate": 1.7937219730941704e-06,
-      "loss": 0.9383,
       "step": 40
     },
     {
       "epoch": 0.02242152466367713,
-      "grad_norm": 1.7133805307500916,
-      "learning_rate": 2.242152466367713e-06,
-      "loss": 0.87,
       "step": 50
     },
     {
       "epoch": 0.026905829596412557,
-      "grad_norm": 1.6402285214131669,
-      "learning_rate": 2.690582959641256e-06,
-      "loss": 0.8608,
       "step": 60
     },
     {
       "epoch": 0.03139013452914798,
-      "grad_norm": 1.548076900788033,
-      "learning_rate": 3.1390134529147986e-06,
-      "loss": 0.8675,
       "step": 70
     },
     {
       "epoch": 0.03587443946188341,
-      "grad_norm": 1.5686573872779637,
-      "learning_rate": 3.587443946188341e-06,
-      "loss": 0.8581,
       "step": 80
     },
     {
       "epoch": 0.04035874439461883,
-      "grad_norm": 1.8398749741247653,
-      "learning_rate": 4.0358744394618836e-06,
-      "loss": 0.8272,
       "step": 90
     },
     {
       "epoch": 0.04484304932735426,
-      "grad_norm": 1.431522929426351,
-      "learning_rate": 4.484304932735426e-06,
-      "loss": 0.837,
       "step": 100
     },
     {
       "epoch": 0.04932735426008968,
-      "grad_norm": 1.5183209831163855,
-      "learning_rate": 4.932735426008969e-06,
-      "loss": 0.7993,
       "step": 110
     },
     {
       "epoch": 0.053811659192825115,
-      "grad_norm": 1.440741479973038,
-      "learning_rate": 5.381165919282512e-06,
-      "loss": 0.7768,
       "step": 120
     },
     {
       "epoch": 0.05829596412556054,
-      "grad_norm": 1.6957957755628803,
-      "learning_rate": 5.8295964125560544e-06,
-      "loss": 0.793,
       "step": 130
     },
     {
       "epoch": 0.06278026905829596,
-      "grad_norm": 1.5847929433417824,
-      "learning_rate": 6.278026905829597e-06,
-      "loss": 0.8145,
       "step": 140
     },
     {
       "epoch": 0.06726457399103139,
-      "grad_norm": 1.721638040499256,
-      "learning_rate": 6.72645739910314e-06,
-      "loss": 0.7677,
       "step": 150
     },
     {
       "epoch": 0.07174887892376682,
-      "grad_norm": 1.8900581111883776,
-      "learning_rate": 7.174887892376682e-06,
-      "loss": 0.784,
       "step": 160
     },
     {
       "epoch": 0.07623318385650224,
-      "grad_norm": 1.554784427401762,
-      "learning_rate": 7.6233183856502244e-06,
-      "loss": 0.7595,
       "step": 170
     },
     {
       "epoch": 0.08071748878923767,
-      "grad_norm": 1.8326262271816938,
-      "learning_rate": 8.071748878923767e-06,
-      "loss": 0.7722,
       "step": 180
     },
     {
       "epoch": 0.08520179372197309,
-      "grad_norm": 1.4707798772479566,
-      "learning_rate": 8.52017937219731e-06,
-      "loss": 0.764,
       "step": 190
     },
     {
       "epoch": 0.08968609865470852,
-      "grad_norm": 1.449476026227771,
-      "learning_rate": 8.968609865470853e-06,
-      "loss": 0.773,
       "step": 200
     },
     {
       "epoch": 0.09417040358744394,
-      "grad_norm": 1.5945882538155545,
-      "learning_rate": 9.417040358744395e-06,
-      "loss": 0.7549,
       "step": 210
     },
     {
       "epoch": 0.09865470852017937,
-      "grad_norm": 1.4765341188830319,
-      "learning_rate": 9.865470852017938e-06,
-      "loss": 0.784,
       "step": 220
     },
     {
       "epoch": 0.1031390134529148,
-      "grad_norm": 1.452955640004767,
-      "learning_rate": 9.999699851108367e-06,
-      "loss": 0.7909,
       "step": 230
     },
     {
       "epoch": 0.10762331838565023,
-      "grad_norm": 1.5997501860541272,
-      "learning_rate": 9.9982298208374e-06,
-      "loss": 0.7651,
       "step": 240
     },
     {
       "epoch": 0.11210762331838565,
-      "grad_norm": 1.554511883278329,
-      "learning_rate": 9.995535139530904e-06,
-      "loss": 0.7621,
       "step": 250
     },
     {
       "epoch": 0.11659192825112108,
-      "grad_norm": 1.6319742757477633,
-      "learning_rate": 9.991616467431486e-06,
-      "loss": 0.7906,
       "step": 260
     },
     {
       "epoch": 0.1210762331838565,
-      "grad_norm": 1.6863475759473823,
-      "learning_rate": 9.986474764680236e-06,
-      "loss": 0.7684,
       "step": 270
     },
     {
       "epoch": 0.12556053811659193,
-      "grad_norm": 1.5563949560805244,
-      "learning_rate": 9.98011129108149e-06,
-      "loss": 0.793,
       "step": 280
     },
     {
       "epoch": 0.13004484304932734,
-      "grad_norm": 1.6203440473254576,
-      "learning_rate": 9.972527605794151e-06,
-      "loss": 0.771,
       "step": 290
     },
     {
       "epoch": 0.13452914798206278,
-      "grad_norm": 1.5211967329248808,
-      "learning_rate": 9.963725566949674e-06,
-      "loss": 0.7865,
       "step": 300
     },
     {
       "epoch": 0.13901345291479822,
-      "grad_norm": 1.3996609263865165,
-      "learning_rate": 9.953707331196787e-06,
-      "loss": 0.7397,
       "step": 310
     },
     {
       "epoch": 0.14349775784753363,
-      "grad_norm": 1.7467903395183983,
-      "learning_rate": 9.94247535317308e-06,
-      "loss": 0.802,
       "step": 320
     },
     {
       "epoch": 0.14798206278026907,
-      "grad_norm": 1.4561862746225176,
-      "learning_rate": 9.930032384903566e-06,
-      "loss": 0.78,
       "step": 330
     },
     {
       "epoch": 0.15246636771300448,
-      "grad_norm": 1.261592243705757,
-      "learning_rate": 9.916381475126406e-06,
-      "loss": 0.7678,
       "step": 340
     },
     {
       "epoch": 0.15695067264573992,
-      "grad_norm": 1.3845441861346746,
-      "learning_rate": 9.901525968545907e-06,
-      "loss": 0.7462,
       "step": 350
     },
     {
       "epoch": 0.16143497757847533,
-      "grad_norm": 1.330783975393604,
-      "learning_rate": 9.885469505013006e-06,
-      "loss": 0.7516,
       "step": 360
     },
     {
       "epoch": 0.16591928251121077,
-      "grad_norm": 1.5411411223309597,
-      "learning_rate": 9.868216018633456e-06,
-      "loss": 0.7617,
       "step": 370
     },
     {
       "epoch": 0.17040358744394618,
-      "grad_norm": 1.4997661320107978,
-      "learning_rate": 9.8497697368039e-06,
-      "loss": 0.7433,
       "step": 380
     },
     {
       "epoch": 0.17488789237668162,
-      "grad_norm": 1.4613487182945122,
-      "learning_rate": 9.830135179176086e-06,
-      "loss": 0.7977,
       "step": 390
     },
     {
       "epoch": 0.17937219730941703,
-      "grad_norm": 1.3823002584421413,
-      "learning_rate": 9.809317156549476e-06,
-      "loss": 0.7668,
       "step": 400
     },
     {
       "epoch": 0.18385650224215247,
-      "grad_norm": 1.3405610913825137,
-      "learning_rate": 9.787320769692517e-06,
-      "loss": 0.755,
       "step": 410
     },
     {
       "epoch": 0.18834080717488788,
-      "grad_norm": 1.321174566440371,
-      "learning_rate": 9.76415140809287e-06,
-      "loss": 0.7712,
       "step": 420
     },
     {
       "epoch": 0.19282511210762332,
-      "grad_norm": 1.5855307049280556,
-      "learning_rate": 9.739814748636892e-06,
-      "loss": 0.7876,
       "step": 430
     },
     {
       "epoch": 0.19730941704035873,
-      "grad_norm": 1.3277684374580685,
-      "learning_rate": 9.7143167542187e-06,
-      "loss": 0.7497,
       "step": 440
     },
     {
       "epoch": 0.20179372197309417,
-      "grad_norm": 1.3316052843966186,
-      "learning_rate": 9.687663672279167e-06,
-      "loss": 0.7742,
       "step": 450
     },
     {
       "epoch": 0.2062780269058296,
-      "grad_norm": 1.3422686862026139,
-      "learning_rate": 9.659862033275188e-06,
-      "loss": 0.7443,
       "step": 460
     },
     {
       "epoch": 0.21076233183856502,
-      "grad_norm": 1.3794382112115433,
-      "learning_rate": 9.630918649079606e-06,
-      "loss": 0.7423,
       "step": 470
     },
     {
       "epoch": 0.21524663677130046,
-      "grad_norm": 1.3473781761704757,
-      "learning_rate": 9.600840611312198e-06,
-      "loss": 0.756,
       "step": 480
     },
     {
       "epoch": 0.21973094170403587,
-      "grad_norm": 1.477159074283593,
-      "learning_rate": 9.569635289602098e-06,
-      "loss": 0.758,
       "step": 490
     },
     {
       "epoch": 0.2242152466367713,
-      "grad_norm": 1.4406496372056654,
-      "learning_rate": 9.537310329782109e-06,
-      "loss": 0.7373,
       "step": 500
     },
     {
       "epoch": 0.22869955156950672,
-      "grad_norm": 1.3236307975388621,
-      "learning_rate": 9.503873652015358e-06,
-      "loss": 0.7485,
       "step": 510
     },
     {
       "epoch": 0.23318385650224216,
-      "grad_norm": 1.2168213830447414,
-      "learning_rate": 9.469333448854713e-06,
-      "loss": 0.7518,
       "step": 520
     },
     {
       "epoch": 0.23766816143497757,
-      "grad_norm": 1.4695219464522695,
-      "learning_rate": 9.433698183235468e-06,
-      "loss": 0.7389,
       "step": 530
     },
     {
       "epoch": 0.242152466367713,
-      "grad_norm": 1.5460371795366352,
-      "learning_rate": 9.39697658640179e-06,
-      "loss": 0.7606,
       "step": 540
     },
     {
       "epoch": 0.24663677130044842,
-      "grad_norm": 1.36384817307445,
-      "learning_rate": 9.359177655767398e-06,
-      "loss": 0.7573,
       "step": 550
     },
     {
       "epoch": 0.25112107623318386,
-      "grad_norm": 1.2974873879986306,
-      "learning_rate": 9.320310652711062e-06,
-      "loss": 0.7447,
       "step": 560
     },
     {
       "epoch": 0.2556053811659193,
-      "grad_norm": 1.3924395179060305,
-      "learning_rate": 9.2803851003074e-06,
-      "loss": 0.7346,
       "step": 570
     },
     {
       "epoch": 0.2600896860986547,
-      "grad_norm": 1.3646463661968233,
-      "learning_rate": 9.239410780993565e-06,
-      "loss": 0.7637,
       "step": 580
     },
     {
       "epoch": 0.2645739910313901,
-      "grad_norm": 1.5500883315093192,
-      "learning_rate": 9.197397734172381e-06,
-      "loss": 0.7352,
       "step": 590
     },
     {
       "epoch": 0.26905829596412556,
-      "grad_norm": 1.2384578247121611,
-      "learning_rate": 9.154356253752519e-06,
-      "loss": 0.7467,
       "step": 600
     },
     {
       "epoch": 0.273542600896861,
-      "grad_norm": 1.4632407956897133,
-      "learning_rate": 9.110296885626315e-06,
-      "loss": 0.7592,
       "step": 610
     },
     {
       "epoch": 0.27802690582959644,
-      "grad_norm": 1.4743516068749583,
-      "learning_rate": 9.065230425085849e-06,
-      "loss": 0.7471,
       "step": 620
     },
     {
       "epoch": 0.2825112107623318,
-      "grad_norm": 1.609459180353317,
-      "learning_rate": 9.01916791417792e-06,
-      "loss": 0.7411,
       "step": 630
     },
     {
       "epoch": 0.28699551569506726,
-      "grad_norm": 1.4163468213726333,
-      "learning_rate": 8.97212063899854e-06,
-      "loss": 0.7583,
       "step": 640
     },
     {
       "epoch": 0.2914798206278027,
-      "grad_norm": 1.2789206960042645,
-      "learning_rate": 8.924100126927672e-06,
-      "loss": 0.7637,
       "step": 650
     },
     {
       "epoch": 0.29596412556053814,
-      "grad_norm": 1.4348891847742615,
-      "learning_rate": 8.87511814380481e-06,
-      "loss": 0.7376,
       "step": 660
     },
     {
       "epoch": 0.3004484304932735,
-      "grad_norm": 1.4055551714674843,
-      "learning_rate": 8.825186691046156e-06,
-      "loss": 0.7544,
       "step": 670
     },
     {
       "epoch": 0.30493273542600896,
-      "grad_norm": 1.3076750983715024,
-      "learning_rate": 8.774318002704072e-06,
-      "loss": 0.7388,
       "step": 680
     },
     {
       "epoch": 0.3094170403587444,
-      "grad_norm": 1.2832634236951583,
-      "learning_rate": 8.722524542469517e-06,
-      "loss": 0.7386,
       "step": 690
     },
     {
       "epoch": 0.31390134529147984,
-      "grad_norm": 1.3612825976793976,
-      "learning_rate": 8.669819000618248e-06,
-      "loss": 0.768,
       "step": 700
     },
     {
       "epoch": 0.3183856502242152,
-      "grad_norm": 1.3343477240590562,
-      "learning_rate": 8.616214290901474e-06,
-      "loss": 0.7244,
       "step": 710
     },
     {
       "epoch": 0.32286995515695066,
-      "grad_norm": 1.3061898085537815,
-      "learning_rate": 8.56172354738178e-06,
-      "loss": 0.7368,
       "step": 720
     },
     {
       "epoch": 0.3273542600896861,
-      "grad_norm": 1.423008056010291,
-      "learning_rate": 8.506360121215046e-06,
-      "loss": 0.7297,
       "step": 730
     },
     {
       "epoch": 0.33183856502242154,
-      "grad_norm": 1.258418518673196,
-      "learning_rate": 8.4501375773792e-06,
-      "loss": 0.7322,
       "step": 740
     },
     {
       "epoch": 0.336322869955157,
-      "grad_norm": 1.4805131801943718,
-      "learning_rate": 8.39306969135056e-06,
-      "loss": 0.7284,
       "step": 750
     },
     {
       "epoch": 0.34080717488789236,
-      "grad_norm": 1.3260411374117855,
-      "learning_rate": 8.335170445728609e-06,
-      "loss": 0.7618,
       "step": 760
     },
     {
       "epoch": 0.3452914798206278,
-      "grad_norm": 1.3351013471897553,
-      "learning_rate": 8.276454026810026e-06,
-      "loss": 0.7454,
       "step": 770
     },
     {
       "epoch": 0.34977578475336324,
-      "grad_norm": 1.267536863507402,
-      "learning_rate": 8.216934821112803e-06,
-      "loss": 0.742,
       "step": 780
     },
     {
       "epoch": 0.3542600896860987,
-      "grad_norm": 1.4791659570865663,
-      "learning_rate": 8.156627411851295e-06,
-      "loss": 0.7483,
       "step": 790
     },
     {
       "epoch": 0.35874439461883406,
-      "grad_norm": 1.4042375363677306,
-      "learning_rate": 8.095546575363098e-06,
-      "loss": 0.7134,
       "step": 800
     },
     {
       "epoch": 0.3632286995515695,
-      "grad_norm": 1.2552675050466975,
-      "learning_rate": 8.033707277488585e-06,
-      "loss": 0.7186,
       "step": 810
     },
     {
       "epoch": 0.36771300448430494,
-      "grad_norm": 1.281776208560821,
-      "learning_rate": 7.97112466990403e-06,
-      "loss": 0.7367,
       "step": 820
     },
     {
       "epoch": 0.3721973094170404,
-      "grad_norm": 1.417675486477273,
-      "learning_rate": 7.907814086409183e-06,
-      "loss": 0.7399,
       "step": 830
     },
     {
       "epoch": 0.37668161434977576,
-      "grad_norm": 1.4911782746859528,
-      "learning_rate": 7.843791039170232e-06,
-      "loss": 0.738,
       "step": 840
     },
     {
       "epoch": 0.3811659192825112,
-      "grad_norm": 1.3628167592658191,
-      "learning_rate": 7.779071214919068e-06,
-      "loss": 0.7404,
       "step": 850
     },
     {
       "epoch": 0.38565022421524664,
-      "grad_norm": 1.256534318600846,
-      "learning_rate": 7.713670471109749e-06,
-      "loss": 0.7364,
       "step": 860
     },
     {
       "epoch": 0.3901345291479821,
-      "grad_norm": 1.3061985247991272,
-      "learning_rate": 7.647604832033178e-06,
-      "loss": 0.7535,
       "step": 870
     },
     {
       "epoch": 0.39461883408071746,
-      "grad_norm": 1.3626743194809932,
-      "learning_rate": 7.580890484890864e-06,
-      "loss": 0.7212,
       "step": 880
     },
     {
       "epoch": 0.3991031390134529,
-      "grad_norm": 1.283405821881289,
-      "learning_rate": 7.513543775828791e-06,
-      "loss": 0.7336,
       "step": 890
     },
     {
       "epoch": 0.40358744394618834,
-      "grad_norm": 1.4217858183255085,
-      "learning_rate": 7.445581205932335e-06,
-      "loss": 0.7349,
       "step": 900
     },
     {
       "epoch": 0.4080717488789238,
-      "grad_norm": 1.3304359323377464,
-      "learning_rate": 7.377019427183213e-06,
-      "loss": 0.7265,
       "step": 910
     },
     {
       "epoch": 0.4125560538116592,
-      "grad_norm": 1.3324230635898773,
-      "learning_rate": 7.30787523837947e-06,
-      "loss": 0.7451,
       "step": 920
     },
     {
       "epoch": 0.4170403587443946,
-      "grad_norm": 1.2976336030674929,
-      "learning_rate": 7.238165581019488e-06,
-      "loss": 0.7415,
       "step": 930
     },
     {
       "epoch": 0.42152466367713004,
-      "grad_norm": 1.3082251443805497,
-      "learning_rate": 7.167907535151027e-06,
-      "loss": 0.7405,
       "step": 940
     },
     {
       "epoch": 0.4260089686098655,
-      "grad_norm": 1.368310990039969,
-      "learning_rate": 7.097118315186335e-06,
-      "loss": 0.7141,
       "step": 950
     },
     {
       "epoch": 0.4304932735426009,
-      "grad_norm": 1.3507885090012453,
-      "learning_rate": 7.025815265684315e-06,
-      "loss": 0.744,
       "step": 960
     },
     {
       "epoch": 0.4349775784753363,
-      "grad_norm": 1.3870418574466943,
-      "learning_rate": 6.9540158571008105e-06,
-      "loss": 0.7344,
       "step": 970
     },
     {
       "epoch": 0.43946188340807174,
-      "grad_norm": 1.2702369270052376,
-      "learning_rate": 6.881737681508065e-06,
-      "loss": 0.7131,
       "step": 980
     },
     {
       "epoch": 0.4439461883408072,
-      "grad_norm": 1.3694518474478212,
-      "learning_rate": 6.808998448284347e-06,
-      "loss": 0.7516,
       "step": 990
     },
     {
       "epoch": 0.4484304932735426,
-      "grad_norm": 1.3967080566658139,
-      "learning_rate": 6.735815979774865e-06,
-      "loss": 0.7208,
       "step": 1000
     },
     {
       "epoch": 0.452914798206278,
-      "grad_norm": 1.4517646369323314,
-      "learning_rate": 6.662208206924986e-06,
-      "loss": 0.7455,
       "step": 1010
     },
     {
       "epoch": 0.45739910313901344,
-      "grad_norm": 1.2752229454885209,
-      "learning_rate": 6.588193164886847e-06,
-      "loss": 0.7555,
       "step": 1020
     },
     {
       "epoch": 0.4618834080717489,
-      "grad_norm": 1.3528261238688069,
-      "learning_rate": 6.513788988600441e-06,
-      "loss": 0.7428,
       "step": 1030
     },
     {
       "epoch": 0.4663677130044843,
-      "grad_norm": 1.340831615562883,
-      "learning_rate": 6.439013908350249e-06,
-      "loss": 0.7446,
       "step": 1040
     },
     {
       "epoch": 0.47085201793721976,
-      "grad_norm": 1.3233981064583105,
-      "learning_rate": 6.363886245298514e-06,
-      "loss": 0.6945,
       "step": 1050
     },
     {
       "epoch": 0.47533632286995514,
-      "grad_norm": 1.29683857481815,
-      "learning_rate": 6.288424406996237e-06,
-      "loss": 0.7085,
       "step": 1060
     },
     {
       "epoch": 0.4798206278026906,
-      "grad_norm": 1.3009424922765027,
-      "learning_rate": 6.2126468828730225e-06,
-      "loss": 0.7294,
       "step": 1070
     },
     {
       "epoch": 0.484304932735426,
-      "grad_norm": 1.244204104767217,
-      "learning_rate": 6.136572239706854e-06,
-      "loss": 0.7091,
       "step": 1080
     },
     {
       "epoch": 0.48878923766816146,
-      "grad_norm": 1.4080424781447183,
-      "learning_rate": 6.060219117074914e-06,
-      "loss": 0.724,
       "step": 1090
     },
     {
       "epoch": 0.49327354260089684,
-      "grad_norm": 1.4328593518662633,
-      "learning_rate": 5.983606222786577e-06,
-      "loss": 0.7106,
       "step": 1100
     },
     {
       "epoch": 0.4977578475336323,
-      "grad_norm": 1.3515007322722326,
-      "learning_rate": 5.9067523282996775e-06,
-      "loss": 0.7111,
       "step": 1110
     },
     {
       "epoch": 0.5022421524663677,
-      "grad_norm": 1.3343622572750706,
-      "learning_rate": 5.829676264121184e-06,
-      "loss": 0.7323,
       "step": 1120
     },
     {
       "epoch": 0.5067264573991032,
-      "grad_norm": 1.3346769678858248,
-      "learning_rate": 5.752396915193403e-06,
-      "loss": 0.744,
       "step": 1130
     },
     {
       "epoch": 0.5112107623318386,
-      "grad_norm": 1.4071259742434898,
-      "learning_rate": 5.6749332162668525e-06,
-      "loss": 0.7181,
       "step": 1140
     },
     {
       "epoch": 0.515695067264574,
-      "grad_norm": 1.314030857289351,
-      "learning_rate": 5.5973041472609265e-06,
-      "loss": 0.7278,
       "step": 1150
     },
     {
       "epoch": 0.5201793721973094,
-      "grad_norm": 1.2893634754111014,
-      "learning_rate": 5.519528728613491e-06,
-      "loss": 0.722,
       "step": 1160
     },
     {
       "epoch": 0.5246636771300448,
-      "grad_norm": 1.2894378069172643,
-      "learning_rate": 5.4416260166205525e-06,
-      "loss": 0.7282,
       "step": 1170
     },
     {
       "epoch": 0.5291479820627802,
-      "grad_norm": 1.2691657277439665,
-      "learning_rate": 5.363615098767149e-06,
-      "loss": 0.7439,
       "step": 1180
     },
     {
       "epoch": 0.5336322869955157,
-      "grad_norm": 1.3478124675299437,
-      "learning_rate": 5.285515089050587e-06,
-      "loss": 0.7164,
       "step": 1190
     },
     {
       "epoch": 0.5381165919282511,
-      "grad_norm": 1.3052899854504807,
-      "learning_rate": 5.207345123297187e-06,
-      "loss": 0.7171,
       "step": 1200
     },
     {
       "epoch": 0.5426008968609866,
-      "grad_norm": 1.27131464716386,
-      "learning_rate": 5.129124354473688e-06,
-      "loss": 0.7235,
       "step": 1210
     },
     {
       "epoch": 0.547085201793722,
-      "grad_norm": 1.257329353514232,
-      "learning_rate": 5.050871947994443e-06,
-      "loss": 0.6999,
       "step": 1220
     },
     {
       "epoch": 0.5515695067264574,
-      "grad_norm": 1.3863466652299603,
-      "learning_rate": 4.972607077025563e-06,
-      "loss": 0.7251,
       "step": 1230
     },
     {
       "epoch": 0.5560538116591929,
-      "grad_norm": 1.394706696869187,
-      "learning_rate": 4.894348917787174e-06,
-      "loss": 0.6963,
       "step": 1240
     },
     {
       "epoch": 0.5605381165919282,
-      "grad_norm": 1.288516832053435,
-      "learning_rate": 4.816116644854912e-06,
-      "loss": 0.7207,
       "step": 1250
     },
     {
       "epoch": 0.5650224215246636,
-      "grad_norm": 1.291568839795731,
-      "learning_rate": 4.73792942646183e-06,
-      "loss": 0.7168,
       "step": 1260
     },
     {
       "epoch": 0.5695067264573991,
-      "grad_norm": 1.3692501176044114,
-      "learning_rate": 4.659806419801855e-06,
-      "loss": 0.7311,
       "step": 1270
     },
     {
       "epoch": 0.5739910313901345,
-      "grad_norm": 1.2214111714932894,
-      "learning_rate": 4.581766766335953e-06,
-      "loss": 0.7175,
       "step": 1280
     },
     {
       "epoch": 0.57847533632287,
-      "grad_norm": 1.2261021201524966,
-      "learning_rate": 4.503829587102138e-06,
-      "loss": 0.722,
       "step": 1290
     },
     {
       "epoch": 0.5829596412556054,
-      "grad_norm": 1.279552435182188,
-      "learning_rate": 4.426013978030508e-06,
-      "loss": 0.7407,
       "step": 1300
     },
     {
       "epoch": 0.5874439461883408,
-      "grad_norm": 1.2639914975822624,
-      "learning_rate": 4.348339005264406e-06,
-      "loss": 0.7174,
       "step": 1310
     },
     {
       "epoch": 0.5919282511210763,
-      "grad_norm": 1.2931987303236723,
-      "learning_rate": 4.270823700488896e-06,
-      "loss": 0.7236,
       "step": 1320
     },
     {
       "epoch": 0.5964125560538116,
-      "grad_norm": 1.3083460389811294,
-      "learning_rate": 4.19348705626768e-06,
-      "loss": 0.7247,
       "step": 1330
     },
     {
       "epoch": 0.600896860986547,
-      "grad_norm": 1.359946232054244,
-      "learning_rate": 4.116348021389595e-06,
-      "loss": 0.7289,
       "step": 1340
     },
     {
       "epoch": 0.6053811659192825,
-      "grad_norm": 1.3038409489179155,
-      "learning_rate": 4.039425496225834e-06,
-      "loss": 0.723,
       "step": 1350
     },
     {
       "epoch": 0.6098654708520179,
-      "grad_norm": 1.337031671252276,
-      "learning_rate": 3.962738328099047e-06,
-      "loss": 0.718,
       "step": 1360
     },
     {
       "epoch": 0.6143497757847534,
-      "grad_norm": 1.3006506595251124,
-      "learning_rate": 3.88630530666542e-06,
-      "loss": 0.7372,
       "step": 1370
     },
     {
       "epoch": 0.6188340807174888,
-      "grad_norm": 1.3038120667886732,
-      "learning_rate": 3.8101451593108816e-06,
-      "loss": 0.732,
       "step": 1380
     },
     {
       "epoch": 0.6233183856502242,
-      "grad_norm": 1.2544712968929104,
-      "learning_rate": 3.7342765465625953e-06,
-      "loss": 0.7347,
       "step": 1390
     },
     {
       "epoch": 0.6278026905829597,
-      "grad_norm": 1.3352755014667614,
-      "learning_rate": 3.658718057516803e-06,
-      "loss": 0.7332,
       "step": 1400
     },
     {
       "epoch": 0.6322869955156951,
-      "grad_norm": 1.3389617347187606,
-      "learning_rate": 3.5834882052841744e-06,
-      "loss": 0.7154,
       "step": 1410
     },
     {
       "epoch": 0.6367713004484304,
-      "grad_norm": 1.2654799213890686,
-      "learning_rate": 3.508605422453799e-06,
-      "loss": 0.7002,
       "step": 1420
     },
     {
       "epoch": 0.6412556053811659,
-      "grad_norm": 1.311398422880929,
-      "learning_rate": 3.4340880565768707e-06,
-      "loss": 0.7098,
       "step": 1430
     },
     {
       "epoch": 0.6457399103139013,
-      "grad_norm": 1.3577211660369808,
-      "learning_rate": 3.359954365671241e-06,
-      "loss": 0.7024,
       "step": 1440
     },
     {
       "epoch": 0.6502242152466368,
-      "grad_norm": 1.145375472595952,
-      "learning_rate": 3.2862225137478897e-06,
-      "loss": 0.7097,
       "step": 1450
     },
     {
       "epoch": 0.6547085201793722,
-      "grad_norm": 1.3358950336855993,
-      "learning_rate": 3.2129105663604275e-06,
-      "loss": 0.7148,
       "step": 1460
     },
     {
       "epoch": 0.6591928251121076,
-      "grad_norm": 1.2495395838860759,
-      "learning_rate": 3.1400364861787434e-06,
-      "loss": 0.7483,
       "step": 1470
     },
     {
       "epoch": 0.6636771300448431,
-      "grad_norm": 1.2742823576583961,
-      "learning_rate": 3.0676181285878343e-06,
-      "loss": 0.7063,
       "step": 1480
     },
     {
       "epoch": 0.6681614349775785,
-      "grad_norm": 1.2596513250823083,
-      "learning_rate": 2.9956732373129378e-06,
-      "loss": 0.7201,
       "step": 1490
     },
     {
       "epoch": 0.672645739910314,
-      "grad_norm": 1.4183600229677984,
-      "learning_rate": 2.9242194400720157e-06,
-      "loss": 0.7202,
       "step": 1500
     },
     {
       "epoch": 0.6771300448430493,
-      "grad_norm": 1.224500332009707,
-      "learning_rate": 2.8532742442566735e-06,
-      "loss": 0.7228,
       "step": 1510
     },
     {
       "epoch": 0.6816143497757847,
-      "grad_norm": 1.20210393613667,
-      "learning_rate": 2.782855032642535e-06,
-      "loss": 0.7386,
       "step": 1520
     },
     {
       "epoch": 0.6860986547085202,
-      "grad_norm": 1.2835056973370584,
-      "learning_rate": 2.712979059130187e-06,
-      "loss": 0.7207,
       "step": 1530
     },
     {
       "epoch": 0.6905829596412556,
-      "grad_norm": 1.180714987729606,
-      "learning_rate": 2.643663444517671e-06,
-      "loss": 0.6981,
       "step": 1540
     },
     {
       "epoch": 0.695067264573991,
-      "grad_norm": 1.2871858226590431,
-      "learning_rate": 2.5749251723055933e-06,
-      "loss": 0.6853,
       "step": 1550
     },
     {
       "epoch": 0.6995515695067265,
-      "grad_norm": 1.3219720717807693,
-      "learning_rate": 2.5067810845358926e-06,
-      "loss": 0.7192,
       "step": 1560
     },
     {
       "epoch": 0.7040358744394619,
-      "grad_norm": 1.391893981214182,
-      "learning_rate": 2.439247877665244e-06,
-      "loss": 0.7103,
       "step": 1570
     },
     {
       "epoch": 0.7085201793721974,
-      "grad_norm": 1.2636799158865641,
-      "learning_rate": 2.3723420984741417e-06,
-      "loss": 0.684,
       "step": 1580
     },
     {
       "epoch": 0.7130044843049327,
-      "grad_norm": 1.3557464635552046,
-      "learning_rate": 2.3060801400126693e-06,
-      "loss": 0.7207,
       "step": 1590
     },
     {
       "epoch": 0.7174887892376681,
-      "grad_norm": 1.3569256088684083,
-      "learning_rate": 2.240478237583915e-06,
-      "loss": 0.7077,
       "step": 1600
     },
     {
       "epoch": 0.7219730941704036,
-      "grad_norm": 1.37192661939199,
-      "learning_rate": 2.1755524647660514e-06,
-      "loss": 0.693,
       "step": 1610
     },
     {
       "epoch": 0.726457399103139,
-      "grad_norm": 1.248391921620642,
-      "learning_rate": 2.1113187294740294e-06,
-      "loss": 0.6911,
       "step": 1620
     },
     {
       "epoch": 0.7309417040358744,
-      "grad_norm": 1.357856212054366,
-      "learning_rate": 2.047792770061881e-06,
-      "loss": 0.6838,
       "step": 1630
     },
     {
       "epoch": 0.7354260089686099,
-      "grad_norm": 1.4147463764446673,
-      "learning_rate": 1.9849901514665458e-06,
-      "loss": 0.7122,
       "step": 1640
     },
     {
       "epoch": 0.7399103139013453,
-      "grad_norm": 1.3587967144630926,
-      "learning_rate": 1.922926261394206e-06,
-      "loss": 0.6927,
       "step": 1650
     },
     {
       "epoch": 0.7443946188340808,
-      "grad_norm": 1.326112035320906,
-      "learning_rate": 1.8616163065500231e-06,
-      "loss": 0.6931,
       "step": 1660
     },
     {
       "epoch": 0.7488789237668162,
-      "grad_norm": 1.2337199420172695,
-      "learning_rate": 1.8010753089122572e-06,
-      "loss": 0.6934,
       "step": 1670
     },
     {
       "epoch": 0.7533632286995515,
-      "grad_norm": 1.2176765529306792,
-      "learning_rate": 1.7413181020516146e-06,
-      "loss": 0.7164,
       "step": 1680
     },
     {
       "epoch": 0.757847533632287,
-      "grad_norm": 1.2724919849788396,
-      "learning_rate": 1.6823593274967703e-06,
-      "loss": 0.7267,
       "step": 1690
     },
     {
       "epoch": 0.7623318385650224,
-      "grad_norm": 1.3274428154634692,
-      "learning_rate": 1.6242134311469538e-06,
-      "loss": 0.6824,
       "step": 1700
     },
     {
       "epoch": 0.7668161434977578,
-      "grad_norm": 1.3661211722387332,
-      "learning_rate": 1.5668946597324558e-06,
-      "loss": 0.7182,
       "step": 1710
     },
     {
       "epoch": 0.7713004484304933,
-      "grad_norm": 1.2959174525555186,
-      "learning_rate": 1.51041705732393e-06,
-      "loss": 0.7118,
       "step": 1720
     },
     {
       "epoch": 0.7757847533632287,
-      "grad_norm": 1.2773849053623296,
-      "learning_rate": 1.4547944618913706e-06,
-      "loss": 0.6929,
       "step": 1730
     },
     {
       "epoch": 0.7802690582959642,
-      "grad_norm": 1.297150269984566,
-      "learning_rate": 1.4000405019135676e-06,
-      "loss": 0.6883,
       "step": 1740
     },
     {
       "epoch": 0.7847533632286996,
-      "grad_norm": 1.400539444832659,
-      "learning_rate": 1.3461685930388958e-06,
-      "loss": 0.6911,
       "step": 1750
     },
     {
       "epoch": 0.7892376681614349,
-      "grad_norm": 1.2337657683341194,
-      "learning_rate": 1.2931919347982607e-06,
-      "loss": 0.6921,
       "step": 1760
     },
     {
       "epoch": 0.7937219730941704,
-      "grad_norm": 1.2072415924732212,
-      "learning_rate": 1.2411235073709883e-06,
-      "loss": 0.7102,
       "step": 1770
     },
     {
       "epoch": 0.7982062780269058,
-      "grad_norm": 1.381367292756694,
-      "learning_rate": 1.1899760684044515e-06,
-      "loss": 0.6838,
       "step": 1780
     },
     {
       "epoch": 0.8026905829596412,
-      "grad_norm": 1.3102643665842388,
-      "learning_rate": 1.1397621498882471e-06,
-      "loss": 0.6945,
       "step": 1790
     },
     {
       "epoch": 0.8071748878923767,
-      "grad_norm": 1.3214717866253802,
-      "learning_rate": 1.0904940550836285e-06,
-      "loss": 0.7016,
       "step": 1800
     },
     {
       "epoch": 0.8116591928251121,
-      "grad_norm": 1.1240219719921938,
-      "learning_rate": 1.0421838555090119e-06,
-      "loss": 0.7018,
       "step": 1810
     },
     {
       "epoch": 0.8161434977578476,
-      "grad_norm": 1.2844428659053804,
-      "learning_rate": 9.948433879822428e-07,
-      "loss": 0.7361,
       "step": 1820
     },
     {
       "epoch": 0.820627802690583,
-      "grad_norm": 1.249492840397432,
-      "learning_rate": 9.484842517203735e-07,
-      "loss": 0.707,
       "step": 1830
     },
     {
       "epoch": 0.8251121076233184,
-      "grad_norm": 1.2793612843154911,
-      "learning_rate": 9.031178054976636e-07,
-      "loss": 0.7226,
       "step": 1840
     },
     {
       "epoch": 0.8295964125560538,
-      "grad_norm": 1.3447642426455195,
-      "learning_rate": 8.587551648624859e-07,
-      "loss": 0.6906,
       "step": 1850
     },
     {
       "epoch": 0.8340807174887892,
-      "grad_norm": 1.282168137173779,
-      "learning_rate": 8.154071994138241e-07,
-      "loss": 0.698,
       "step": 1860
     },
     {
       "epoch": 0.8385650224215246,
-      "grad_norm": 1.4629920435128856,
-      "learning_rate": 7.730845301380441e-07,
-      "loss": 0.7212,
       "step": 1870
     },
     {
       "epoch": 0.8430493273542601,
-      "grad_norm": 1.3143459341111923,
-      "learning_rate": 7.317975268065685e-07,
-      "loss": 0.6942,
       "step": 1880
     },
     {
       "epoch": 0.8475336322869955,
-      "grad_norm": 1.3274198546425116,
-      "learning_rate": 6.915563054351037e-07,
-      "loss": 0.6944,
       "step": 1890
     },
     {
       "epoch": 0.852017937219731,
-      "grad_norm": 1.2958298618538142,
-      "learning_rate": 6.523707258050516e-07,
-      "loss": 0.6692,
       "step": 1900
     },
     {
       "epoch": 0.8565022421524664,
-      "grad_norm": 1.3403229426415875,
-      "learning_rate": 6.14250389047692e-07,
-      "loss": 0.7034,
       "step": 1910
     },
     {
       "epoch": 0.8609865470852018,
-      "grad_norm": 1.258601487476543,
-      "learning_rate": 5.772046352917399e-07,
-      "loss": 0.7144,
       "step": 1920
     },
     {
       "epoch": 0.8654708520179372,
-      "grad_norm": 1.1222145256140716,
-      "learning_rate": 5.412425413748623e-07,
-      "loss": 0.6988,
       "step": 1930
     },
     {
       "epoch": 0.8699551569506726,
-      "grad_norm": 1.2674996138499859,
-      "learning_rate": 5.063729186196948e-07,
-      "loss": 0.7089,
       "step": 1940
     },
     {
       "epoch": 0.874439461883408,
-      "grad_norm": 1.3633828734617575,
-      "learning_rate": 4.7260431067491617e-07,
-      "loss": 0.733,
       "step": 1950
     },
     {
       "epoch": 0.8789237668161435,
-      "grad_norm": 1.2652212276863493,
-      "learning_rate": 4.399449914219167e-07,
-      "loss": 0.7209,
       "step": 1960
     },
     {
       "epoch": 0.8834080717488789,
-      "grad_norm": 1.2975811004623845,
-      "learning_rate": 4.084029629475478e-07,
-      "loss": 0.7252,
       "step": 1970
     },
     {
       "epoch": 0.8878923766816144,
-      "grad_norm": 1.2744740809644324,
-      "learning_rate": 3.7798595358348457e-07,
-      "loss": 0.7083,
       "step": 1980
     },
     {
       "epoch": 0.8923766816143498,
-      "grad_norm": 1.2924446335095698,
-      "learning_rate": 3.487014160126467e-07,
-      "loss": 0.7077,
       "step": 1990
     },
     {
       "epoch": 0.8968609865470852,
-      "grad_norm": 1.283938673421291,
-      "learning_rate": 3.2055652544316695e-07,
-      "loss": 0.7038,
       "step": 2000
     },
     {
       "epoch": 0.9013452914798207,
-      "grad_norm": 1.3428158271865258,
-      "learning_rate": 2.9355817785034325e-07,
-      "loss": 0.7177,
       "step": 2010
     },
     {
       "epoch": 0.905829596412556,
-      "grad_norm": 1.3237588390932393,
-      "learning_rate": 2.6771298828700885e-07,
-      "loss": 0.7079,
       "step": 2020
     },
     {
       "epoch": 0.9103139013452914,
-      "grad_norm": 1.379188659318735,
-      "learning_rate": 2.4302728926273224e-07,
-      "loss": 0.7159,
       "step": 2030
     },
     {
       "epoch": 0.9147982062780269,
-      "grad_norm": 1.2853687562313414,
-      "learning_rate": 2.195071291922435e-07,
-      "loss": 0.6842,
       "step": 2040
     },
     {
       "epoch": 0.9192825112107623,
-      "grad_norm": 1.3212853096546313,
-      "learning_rate": 1.9715827091347005e-07,
-      "loss": 0.6994,
       "step": 2050
     },
     {
       "epoch": 0.9237668161434978,
-      "grad_norm": 1.4231610496002076,
-      "learning_rate": 1.7598619027554553e-07,
-      "loss": 0.7032,
       "step": 2060
     },
     {
       "epoch": 0.9282511210762332,
-      "grad_norm": 1.288660209861224,
-      "learning_rate": 1.5599607479713396e-07,
-      "loss": 0.6856,
       "step": 2070
     },
     {
       "epoch": 0.9327354260089686,
-      "grad_norm": 1.291080758227027,
-      "learning_rate": 1.3719282239539722e-07,
-      "loss": 0.7183,
       "step": 2080
     },
     {
       "epoch": 0.9372197309417041,
-      "grad_norm": 1.3527938026636528,
-      "learning_rate": 1.1958104018592376e-07,
-      "loss": 0.7022,
       "step": 2090
     },
     {
       "epoch": 0.9417040358744395,
-      "grad_norm": 1.306930563286188,
-      "learning_rate": 1.0316504335390775e-07,
-      "loss": 0.7202,
       "step": 2100
     },
     {
       "epoch": 0.9461883408071748,
-      "grad_norm": 1.34728936572921,
-      "learning_rate": 8.79488540968565e-08,
-      "loss": 0.7128,
       "step": 2110
     },
     {
       "epoch": 0.9506726457399103,
-      "grad_norm": 1.5138997567977701,
-      "learning_rate": 7.39362006390798e-08,
-      "loss": 0.6841,
       "step": 2120
     },
     {
       "epoch": 0.9551569506726457,
-      "grad_norm": 1.2518587224976194,
-      "learning_rate": 6.113051631821631e-08,
-      "loss": 0.71,
       "step": 2130
     },
     {
       "epoch": 0.9596412556053812,
-      "grad_norm": 1.4419766059722212,
-      "learning_rate": 4.9534938744004723e-08,
-      "loss": 0.6944,
       "step": 2140
     },
     {
       "epoch": 0.9641255605381166,
-      "grad_norm": 1.3591089992663685,
-      "learning_rate": 3.915230902951761e-08,
-      "loss": 0.717,
       "step": 2150
     },
     {
       "epoch": 0.968609865470852,
-      "grad_norm": 1.281084457332566,
-      "learning_rate": 2.9985171095041066e-08,
-      "loss": 0.703,
       "step": 2160
     },
     {
       "epoch": 0.9730941704035875,
-      "grad_norm": 1.1922556783094496,
-      "learning_rate": 2.203577104476773e-08,
-      "loss": 0.7085,
       "step": 2170
     },
     {
       "epoch": 0.9775784753363229,
-      "grad_norm": 1.3917918614257676,
-      "learning_rate": 1.5306056616468666e-08,
-      "loss": 0.709,
       "step": 2180
     },
     {
       "epoch": 0.9820627802690582,
-      "grad_norm": 1.178733370314234,
-      "learning_rate": 9.797676704259574e-09,
-      "loss": 0.7009,
       "step": 2190
     },
     {
       "epoch": 0.9865470852017937,
-      "grad_norm": 1.2352452900540438,
-      "learning_rate": 5.511980954596152e-09,
-      "loss": 0.7085,
       "step": 2200
     },
     {
       "epoch": 0.9910313901345291,
-      "grad_norm": 1.519249312059187,
-      "learning_rate": 2.4500194355880913e-09,
-      "loss": 0.6865,
       "step": 2210
     },
     {
       "epoch": 0.9955156950672646,
-      "grad_norm": 1.2975150315542752,
-      "learning_rate": 6.125423797137541e-10,
-      "loss": 0.6935,
       "step": 2220
     },
     {
       "epoch": 1.0,
-      "grad_norm": 1.2570272992562246,
       "learning_rate": 0.0,
-      "loss": 0.6946,
-      "step": 2230
-    },
-    {
-      "epoch": 1.0,
-      "eval_loss": 0.7042345404624939,
-      "eval_runtime": 381.2635,
-      "eval_samples_per_second": 131.345,
-      "eval_steps_per_second": 2.054,
       "step": 2230
     },
     {
       "epoch": 1.0,
       "step": 2230,
-      "total_flos": 244955314192384.0,
-      "train_loss": 0.7376568270371099,
-      "train_runtime": 8720.6202,
-      "train_samples_per_second": 32.731,
-      "train_steps_per_second": 0.256
     }
   ],
   "logging_steps": 10,
@@ -1611,7 +1779,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 244955314192384.0,
   "train_batch_size": 8,
   "trial_name": null,
   "trial_params": null

   "best_metric": null,
   "best_model_checkpoint": null,
   "epoch": 1.0,
+  "eval_steps": 100,
   "global_step": 2230,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "log_history": [
     {
       "epoch": 0.0004484304932735426,
+      "grad_norm": 4.696539476451585,
+      "learning_rate": 1.3452914798206278e-08,
       "loss": 0.9912,
       "step": 1
     },
     {
       "epoch": 0.004484304932735426,
+      "grad_norm": 5.089904667658368,
+      "learning_rate": 1.345291479820628e-07,
+      "loss": 1.0341,
       "step": 10
     },
     {
       "epoch": 0.008968609865470852,
+      "grad_norm": 5.546828630388097,
+      "learning_rate": 2.690582959641256e-07,
+      "loss": 1.0502,
       "step": 20
     },
     {
       "epoch": 0.013452914798206279,
+      "grad_norm": 4.113849381101499,
+      "learning_rate": 4.0358744394618834e-07,
+      "loss": 1.0386,
       "step": 30
     },
     {
       "epoch": 0.017937219730941704,
+      "grad_norm": 3.6548963622814887,
+      "learning_rate": 5.381165919282512e-07,
+      "loss": 1.0282,
       "step": 40
     },
     {
       "epoch": 0.02242152466367713,
+      "grad_norm": 2.157564670206396,
+      "learning_rate": 6.72645739910314e-07,
+      "loss": 0.9574,
       "step": 50
     },
     {
       "epoch": 0.026905829596412557,
+      "grad_norm": 2.0184475272019555,
+      "learning_rate": 8.071748878923767e-07,
+      "loss": 0.9263,
       "step": 60
     },
     {
       "epoch": 0.03139013452914798,
+      "grad_norm": 1.7894937443172652,
+      "learning_rate": 9.417040358744395e-07,
+      "loss": 0.9253,
       "step": 70
     },
     {
       "epoch": 0.03587443946188341,
+      "grad_norm": 1.6533764414432808,
+      "learning_rate": 1.0762331838565023e-06,
+      "loss": 0.9106,
       "step": 80
     },
     {
       "epoch": 0.04035874439461883,
+      "grad_norm": 1.9561381307359194,
+      "learning_rate": 1.2107623318385651e-06,
+      "loss": 0.8713,
       "step": 90
     },
     {
       "epoch": 0.04484304932735426,
+      "grad_norm": 1.5478472557018526,
+      "learning_rate": 1.345291479820628e-06,
+      "loss": 0.8741,
+      "step": 100
+    },
+    {
+      "epoch": 0.04484304932735426,
+      "eval_loss": 0.8599640727043152,
+      "eval_runtime": 430.7233,
+      "eval_samples_per_second": 116.263,
+      "eval_steps_per_second": 1.818,
       "step": 100
     },
     {
       "epoch": 0.04932735426008968,
+      "grad_norm": 1.5759592930264636,
+      "learning_rate": 1.4798206278026905e-06,
+      "loss": 0.8381,
       "step": 110
     },
     {
       "epoch": 0.053811659192825115,
+      "grad_norm": 1.5446577353242628,
+      "learning_rate": 1.6143497757847533e-06,
+      "loss": 0.8151,
       "step": 120
     },
     {
       "epoch": 0.05829596412556054,
+      "grad_norm": 1.6899841974229757,
+      "learning_rate": 1.7488789237668162e-06,
+      "loss": 0.8309,
       "step": 130
     },
     {
       "epoch": 0.06278026905829596,
+      "grad_norm": 1.6274283098945213,
+      "learning_rate": 1.883408071748879e-06,
+      "loss": 0.8509,
       "step": 140
     },
     {
       "epoch": 0.06726457399103139,
+      "grad_norm": 1.7690619100525546,
+      "learning_rate": 2.0179372197309418e-06,
+      "loss": 0.8057,
       "step": 150
     },
     {
       "epoch": 0.07174887892376682,
+      "grad_norm": 1.866473004768342,
+      "learning_rate": 2.1524663677130046e-06,
+      "loss": 0.8236,
       "step": 160
     },
     {
       "epoch": 0.07623318385650224,
+      "grad_norm": 1.5528009019380091,
+      "learning_rate": 2.2869955156950674e-06,
+      "loss": 0.7936,
       "step": 170
     },
     {
       "epoch": 0.08071748878923767,
+      "grad_norm": 1.8924349879943885,
+      "learning_rate": 2.4215246636771302e-06,
+      "loss": 0.8054,
       "step": 180
     },
     {
       "epoch": 0.08520179372197309,
+      "grad_norm": 1.5998254884542162,
+      "learning_rate": 2.556053811659193e-06,
+      "loss": 0.7971,
       "step": 190
     },
     {
       "epoch": 0.08968609865470852,
+      "grad_norm": 1.553085624058612,
+      "learning_rate": 2.690582959641256e-06,
+      "loss": 0.8038,
+      "step": 200
+    },
+    {
+      "epoch": 0.08968609865470852,
+      "eval_loss": 0.8094644546508789,
+      "eval_runtime": 412.1717,
+      "eval_samples_per_second": 121.495,
+      "eval_steps_per_second": 1.9,
       "step": 200
     },
     {
       "epoch": 0.09417040358744394,
+      "grad_norm": 1.6621378080442881,
+      "learning_rate": 2.8251121076233187e-06,
+      "loss": 0.7815,
       "step": 210
     },
     {
       "epoch": 0.09865470852017937,
+      "grad_norm": 1.5875832641605891,
+      "learning_rate": 2.959641255605381e-06,
+      "loss": 0.8088,
       "step": 220
     },
     {
       "epoch": 0.1031390134529148,
+      "grad_norm": 1.6006597094640902,
+      "learning_rate": 2.99990995533251e-06,
+      "loss": 0.8141,
       "step": 230
     },
     {
       "epoch": 0.10762331838565023,
+      "grad_norm": 1.7932554350094232,
+      "learning_rate": 2.9994689462512194e-06,
+      "loss": 0.7834,
       "step": 240
     },
     {
       "epoch": 0.11210762331838565,
+      "grad_norm": 1.6444723214299724,
+      "learning_rate": 2.998660541859271e-06,
+      "loss": 0.7797,
       "step": 250
     },
     {
       "epoch": 0.11659192825112108,
+      "grad_norm": 1.790145213655978,
+      "learning_rate": 2.9974849402294452e-06,
+      "loss": 0.8046,
       "step": 260
     },
     {
       "epoch": 0.1210762331838565,
+      "grad_norm": 1.8694283184605,
+      "learning_rate": 2.9959424294040703e-06,
+      "loss": 0.7802,
       "step": 270
     },
     {
       "epoch": 0.12556053811659193,
+      "grad_norm": 1.6030839509233756,
+      "learning_rate": 2.9940333873244464e-06,
+      "loss": 0.8032,
       "step": 280
     },
     {
       "epoch": 0.13004484304932734,
+      "grad_norm": 1.664910362160235,
+      "learning_rate": 2.991758281738245e-06,
+      "loss": 0.7802,
       "step": 290
     },
     {
       "epoch": 0.13452914798206278,
+      "grad_norm": 1.6726792291262853,
+      "learning_rate": 2.989117670084902e-06,
+      "loss": 0.7937,
+      "step": 300
+    },
+    {
+      "epoch": 0.13452914798206278,
+      "eval_loss": 0.7789004445075989,
+      "eval_runtime": 410.6605,
+      "eval_samples_per_second": 121.943,
+      "eval_steps_per_second": 1.907,
       "step": 300
     },
     {
       "epoch": 0.13901345291479822,
+      "grad_norm": 1.4685211047526556,
+      "learning_rate": 2.986112199359036e-06,
+      "loss": 0.7486,
       "step": 310
     },
     {
       "epoch": 0.14349775784753363,
+      "grad_norm": 2.0076694355781575,
+      "learning_rate": 2.9827426059519237e-06,
+      "loss": 0.808,
       "step": 320
     },
     {
       "epoch": 0.14798206278026907,
+      "grad_norm": 1.557780179088859,
+      "learning_rate": 2.9790097154710697e-06,
+      "loss": 0.7849,
       "step": 330
     },
     {
       "epoch": 0.15246636771300448,
+      "grad_norm": 1.3610248283116362,
+      "learning_rate": 2.9749144425379216e-06,
+      "loss": 0.7696,
       "step": 340
     },
     {
       "epoch": 0.15695067264573992,
+      "grad_norm": 1.5050628258310632,
+      "learning_rate": 2.9704577905637718e-06,
+      "loss": 0.7497,
       "step": 350
     },
     {
       "epoch": 0.16143497757847533,
+      "grad_norm": 1.4313536098763806,
+      "learning_rate": 2.9656408515039017e-06,
+      "loss": 0.7544,
       "step": 360
     },
     {
       "epoch": 0.16591928251121077,
+      "grad_norm": 1.6003065628553548,
+      "learning_rate": 2.9604648055900368e-06,
+      "loss": 0.7648,
       "step": 370
     },
     {
       "epoch": 0.17040358744394618,
+      "grad_norm": 1.633334409956319,
+      "learning_rate": 2.9549309210411697e-06,
+      "loss": 0.7471,
       "step": 380
     },
     {
       "epoch": 0.17488789237668162,
+      "grad_norm": 1.5700271693529286,
+      "learning_rate": 2.949040553752826e-06,
+      "loss": 0.8009,
       "step": 390
     },
     {
       "epoch": 0.17937219730941703,
+      "grad_norm": 1.4854276734758955,
+      "learning_rate": 2.9427951469648425e-06,
+      "loss": 0.7712,
+      "step": 400
+    },
+    {
+      "epoch": 0.17937219730941703,
+      "eval_loss": 0.7643527388572693,
+      "eval_runtime": 413.4678,
+      "eval_samples_per_second": 121.115,
+      "eval_steps_per_second": 1.894,
       "step": 400
     },
     {
       "epoch": 0.18385650224215247,
+      "grad_norm": 1.4160940764229815,
+      "learning_rate": 2.936196230907755e-06,
+      "loss": 0.7532,
       "step": 410
     },
     {
       "epoch": 0.18834080717488788,
+      "grad_norm": 1.4265290618310995,
+      "learning_rate": 2.929245422427861e-06,
+      "loss": 0.7703,
       "step": 420
     },
     {
       "epoch": 0.19282511210762332,
+      "grad_norm": 1.6899882763333507,
+      "learning_rate": 2.9219444245910674e-06,
+      "loss": 0.7919,
       "step": 430
     },
     {
       "epoch": 0.19730941704035873,
+      "grad_norm": 1.4186337044303068,
+      "learning_rate": 2.9142950262656098e-06,
+      "loss": 0.7477,
       "step": 440
     },
     {
       "epoch": 0.20179372197309417,
+      "grad_norm": 1.4178331376670448,
+      "learning_rate": 2.9062991016837496e-06,
+      "loss": 0.7734,
       "step": 450
     },
     {
       "epoch": 0.2062780269058296,
+      "grad_norm": 1.4503162574851487,
+      "learning_rate": 2.897958609982556e-06,
+      "loss": 0.7447,
       "step": 460
     },
     {
       "epoch": 0.21076233183856502,
+      "grad_norm": 1.558520612711291,
+      "learning_rate": 2.8892755947238818e-06,
+      "loss": 0.741,
       "step": 470
     },
     {
       "epoch": 0.21524663677130046,
+      "grad_norm": 1.4382572158325275,
+      "learning_rate": 2.8802521833936595e-06,
+      "loss": 0.7563,
       "step": 480
     },
     {
       "epoch": 0.21973094170403587,
+      "grad_norm": 1.5964216489171685,
+      "learning_rate": 2.870890586880629e-06,
+      "loss": 0.7554,
       "step": 490
     },
     {
       "epoch": 0.2242152466367713,
+      "grad_norm": 1.496069010720812,
+      "learning_rate": 2.8611930989346322e-06,
+      "loss": 0.7393,
+      "step": 500
+    },
+    {
+      "epoch": 0.2242152466367713,
+      "eval_loss": 0.7564548254013062,
+      "eval_runtime": 408.8965,
+      "eval_samples_per_second": 122.469,
+      "eval_steps_per_second": 1.915,
       "step": 500
     },
     {
       "epoch": 0.22869955156950672,
+      "grad_norm": 1.4866290735466012,
+      "learning_rate": 2.851162095604607e-06,
+      "loss": 0.7499,
       "step": 510
     },
     {
       "epoch": 0.23318385650224216,
+      "grad_norm": 1.3341919240907245,
+      "learning_rate": 2.8408000346564136e-06,
+      "loss": 0.7524,
       "step": 520
     },
     {
       "epoch": 0.23766816143497757,
+      "grad_norm": 1.6374942242171213,
+      "learning_rate": 2.8301094549706405e-06,
+      "loss": 0.7386,
       "step": 530
     },
     {
       "epoch": 0.242152466367713,
+      "grad_norm": 1.6225803035616944,
+      "learning_rate": 2.8190929759205366e-06,
+      "loss": 0.7616,
       "step": 540
     },
     {
       "epoch": 0.24663677130044842,
+      "grad_norm": 1.4683777464043755,
+      "learning_rate": 2.807753296730219e-06,
+      "loss": 0.7564,
       "step": 550
     },
     {
       "epoch": 0.25112107623318386,
+      "grad_norm": 1.350460716883926,
+      "learning_rate": 2.7960931958133183e-06,
+      "loss": 0.7424,
       "step": 560
     },
     {
       "epoch": 0.2556053811659193,
+      "grad_norm": 1.522474854464212,
+      "learning_rate": 2.7841155300922202e-06,
+      "loss": 0.7331,
       "step": 570
     },
     {
       "epoch": 0.2600896860986547,
+      "grad_norm": 1.448720887976205,
+      "learning_rate": 2.7718232342980693e-06,
+      "loss": 0.7657,
       "step": 580
     },
     {
       "epoch": 0.2645739910313901,
+      "grad_norm": 1.6744619426337854,
+      "learning_rate": 2.759219320251714e-06,
+      "loss": 0.7363,
       "step": 590
     },
     {
       "epoch": 0.26905829596412556,
+      "grad_norm": 1.3585539591402243,
+      "learning_rate": 2.7463068761257554e-06,
+      "loss": 0.7458,
+      "step": 600
+    },
+    {
+      "epoch": 0.26905829596412556,
+      "eval_loss": 0.7505608797073364,
+      "eval_runtime": 408.9234,
+      "eval_samples_per_second": 122.461,
+      "eval_steps_per_second": 1.915,
       "step": 600
     },
     {
       "epoch": 0.273542600896861,
+      "grad_norm": 1.580932873164111,
+      "learning_rate": 2.7330890656878943e-06,
+      "loss": 0.7565,
       "step": 610
     },
     {
       "epoch": 0.27802690582959644,
+      "grad_norm": 1.5329888412189265,
+      "learning_rate": 2.7195691275257547e-06,
+      "loss": 0.7457,
       "step": 620
     },
     {
       "epoch": 0.2825112107623318,
+      "grad_norm": 1.6754413400622026,
+      "learning_rate": 2.7057503742533753e-06,
+      "loss": 0.7392,
       "step": 630
     },
     {
       "epoch": 0.28699551569506726,
+      "grad_norm": 1.6247897070260917,
+      "learning_rate": 2.691636191699562e-06,
+      "loss": 0.758,
       "step": 640
     },
     {
       "epoch": 0.2914798206278027,
+      "grad_norm": 1.42356323236888,
+      "learning_rate": 2.6772300380783013e-06,
+      "loss": 0.7626,
       "step": 650
     },
     {
       "epoch": 0.29596412556053814,
+      "grad_norm": 1.4955853270730488,
+      "learning_rate": 2.662535443141443e-06,
+      "loss": 0.7355,
       "step": 660
     },
     {
       "epoch": 0.3004484304932735,
+      "grad_norm": 1.4879073313151545,
+      "learning_rate": 2.647556007313847e-06,
+      "loss": 0.7545,
       "step": 670
     },
     {
       "epoch": 0.30493273542600896,
+      "grad_norm": 1.4153755477305148,
+      "learning_rate": 2.6322954008112213e-06,
+      "loss": 0.7378,
       "step": 680
     },
     {
       "epoch": 0.3094170403587444,
+      "grad_norm": 1.4019993036978922,
+      "learning_rate": 2.616757362740855e-06,
+      "loss": 0.7387,
       "step": 690
     },
     {
       "epoch": 0.31390134529147984,
+      "grad_norm": 1.5335241758091316,
+      "learning_rate": 2.600945700185474e-06,
+      "loss": 0.7694,
+      "step": 700
+    },
+    {
+      "epoch": 0.31390134529147984,
+      "eval_loss": 0.7457958459854126,
+      "eval_runtime": 408.7761,
+      "eval_samples_per_second": 122.505,
+      "eval_steps_per_second": 1.915,
       "step": 700
     },
     {
       "epoch": 0.3183856502242152,
+      "grad_norm": 1.47263429505246,
+      "learning_rate": 2.5848642872704417e-06,
+      "loss": 0.7246,
       "step": 710
     },
     {
       "epoch": 0.32286995515695066,
+      "grad_norm": 1.5062835613914285,
+      "learning_rate": 2.5685170642145337e-06,
+      "loss": 0.7338,
       "step": 720
     },
     {
       "epoch": 0.3273542600896861,
+      "grad_norm": 1.6182138547104117,
+      "learning_rate": 2.5519080363645134e-06,
+      "loss": 0.73,
       "step": 730
     },
     {
       "epoch": 0.33183856502242154,
+      "grad_norm": 1.3515300425343295,
+      "learning_rate": 2.53504127321376e-06,
+      "loss": 0.7299,
       "step": 740
     },
     {
       "epoch": 0.336322869955157,
+      "grad_norm": 1.5798782493243635,
+      "learning_rate": 2.517920907405168e-06,
+      "loss": 0.7293,
       "step": 750
     },
     {
       "epoch": 0.34080717488789236,
+      "grad_norm": 1.4549259580353344,
+      "learning_rate": 2.5005511337185824e-06,
+      "loss": 0.7621,
       "step": 760
     },
     {
       "epoch": 0.3452914798206278,
+      "grad_norm": 1.456599605633329,
+      "learning_rate": 2.4829362080430077e-06,
+      "loss": 0.7438,
       "step": 770
     },
     {
       "epoch": 0.34977578475336324,
+      "grad_norm": 1.4128813340833153,
+      "learning_rate": 2.4650804463338406e-06,
+      "loss": 0.7413,
       "step": 780
     },
     {
       "epoch": 0.3542600896860987,
+      "grad_norm": 1.5613737124434628,
+      "learning_rate": 2.4469882235553887e-06,
+      "loss": 0.7477,
       "step": 790
     },
     {
       "epoch": 0.35874439461883406,
+      "grad_norm": 1.6383373422678345,
+      "learning_rate": 2.4286639726089293e-06,
+      "loss": 0.713,
+      "step": 800
+    },
+    {
+      "epoch": 0.35874439461883406,
+      "eval_loss": 0.7421520352363586,
+      "eval_runtime": 408.0589,
+      "eval_samples_per_second": 122.72,
+      "eval_steps_per_second": 1.919,
       "step": 800
     },
     {
       "epoch": 0.3632286995515695,
+      "grad_norm": 1.3492102003393152,
+      "learning_rate": 2.4101121832465754e-06,
+      "loss": 0.7185,
       "step": 810
     },
     {
       "epoch": 0.36771300448430494,
+      "grad_norm": 1.4117655797526263,
+      "learning_rate": 2.3913374009712084e-06,
+      "loss": 0.7379,
       "step": 820
     },
     {
       "epoch": 0.3721973094170404,
+      "grad_norm": 1.5281693242796246,
+      "learning_rate": 2.3723442259227547e-06,
+      "loss": 0.7406,
       "step": 830
     },
     {
       "epoch": 0.37668161434977576,
+      "grad_norm": 1.6990323130848894,
+      "learning_rate": 2.3531373117510695e-06,
+      "loss": 0.7388,
       "step": 840
     },
     {
       "epoch": 0.3811659192825112,
+      "grad_norm": 1.476162200960684,
+      "learning_rate": 2.33372136447572e-06,
+      "loss": 0.7434,
       "step": 850
     },
     {
       "epoch": 0.38565022421524664,
+      "grad_norm": 1.3930484173784414,
+      "learning_rate": 2.3141011413329244e-06,
+      "loss": 0.7372,
       "step": 860
     },
     {
       "epoch": 0.3901345291479821,
+      "grad_norm": 1.4071716332679987,
+      "learning_rate": 2.2942814496099532e-06,
+      "loss": 0.7531,
       "step": 870
     },
     {
       "epoch": 0.39461883408071746,
+      "grad_norm": 1.5479232446038012,
+      "learning_rate": 2.274267145467259e-06,
+      "loss": 0.7216,
       "step": 880
     },
     {
       "epoch": 0.3991031390134529,
+      "grad_norm": 1.4255077423798548,
+      "learning_rate": 2.254063132748637e-06,
+      "loss": 0.7343,
       "step": 890
     },
     {
       "epoch": 0.40358744394618834,
+      "grad_norm": 1.57276996130409,
+      "learning_rate": 2.2336743617797006e-06,
+      "loss": 0.7347,
+      "step": 900
+    },
+    {
+      "epoch": 0.40358744394618834,
+      "eval_loss": 0.7386789321899414,
+      "eval_runtime": 408.1839,
+      "eval_samples_per_second": 122.682,
+      "eval_steps_per_second": 1.918,
       "step": 900
     },
     {
       "epoch": 0.4080717488789238,
+      "grad_norm": 1.4568107529063017,
+      "learning_rate": 2.213105828154964e-06,
+      "loss": 0.7266,
       "step": 910
     },
     {
       "epoch": 0.4125560538116592,
+      "grad_norm": 1.374198091231606,
+      "learning_rate": 2.192362571513841e-06,
+      "loss": 0.7465,
       "step": 920
     },
     {
       "epoch": 0.4170403587443946,
+      "grad_norm": 1.3925457206301284,
+      "learning_rate": 2.171449674305846e-06,
+      "loss": 0.7427,
       "step": 930
     },
     {
       "epoch": 0.42152466367713004,
+      "grad_norm": 1.4443502855856463,
+      "learning_rate": 2.1503722605453083e-06,
+      "loss": 0.7428,
       "step": 940
     },
     {
       "epoch": 0.4260089686098655,
+      "grad_norm": 1.5268146365443709,
+      "learning_rate": 2.1291354945559004e-06,
+      "loss": 0.7163,
       "step": 950
     },
     {
       "epoch": 0.4304932735426009,
+      "grad_norm": 1.5000325455240473,
+      "learning_rate": 2.1077445797052945e-06,
+      "loss": 0.7472,
       "step": 960
     },
     {
       "epoch": 0.4349775784753363,
+      "grad_norm": 1.4869091852092478,
+      "learning_rate": 2.086204757130243e-06,
+      "loss": 0.7427,
       "step": 970
     },
     {
       "epoch": 0.43946188340807174,
+      "grad_norm": 1.4430282256544564,
+      "learning_rate": 2.0645213044524194e-06,
+      "loss": 0.7174,
       "step": 980
     },
     {
       "epoch": 0.4439461883408072,
+      "grad_norm": 1.4822025498870304,
+      "learning_rate": 2.0426995344853043e-06,
+      "loss": 0.7538,
       "step": 990
     },
     {
       "epoch": 0.4484304932735426,
+      "grad_norm": 1.5186234240452396,
+      "learning_rate": 2.0207447939324598e-06,
+      "loss": 0.7243,
+      "step": 1000
+    },
+    {
+      "epoch": 0.4484304932735426,
+      "eval_loss": 0.7356163859367371,
+      "eval_runtime": 407.0139,
+      "eval_samples_per_second": 123.035,
+      "eval_steps_per_second": 1.924,
       "step": 1000
     },
     {
       "epoch": 0.452914798206278,
+      "grad_norm": 1.5742685454152958,
+      "learning_rate": 1.998662462077496e-06,
+      "loss": 0.7475,
       "step": 1010
     },
     {
       "epoch": 0.45739910313901344,
+      "grad_norm": 1.3834168469611057,
+      "learning_rate": 1.976457949466054e-06,
+      "loss": 0.7568,
       "step": 1020
     },
     {
       "epoch": 0.4618834080717489,
+      "grad_norm": 1.4947961999330186,
+      "learning_rate": 1.954136696580132e-06,
+      "loss": 0.7464,
       "step": 1030
     },
     {
       "epoch": 0.4663677130044843,
+      "grad_norm": 1.4284253764088304,
+      "learning_rate": 1.9317041725050747e-06,
+      "loss": 0.7456,
       "step": 1040
     },
     {
       "epoch": 0.47085201793721976,
+      "grad_norm": 1.4247354157320633,
+      "learning_rate": 1.909165873589554e-06,
+      "loss": 0.7008,
       "step": 1050
     },
     {
       "epoch": 0.47533632286995514,
+      "grad_norm": 1.4525308368306575,
+      "learning_rate": 1.886527322098871e-06,
+      "loss": 0.7121,
       "step": 1060
     },
     {
       "epoch": 0.4798206278026906,
+      "grad_norm": 1.43738036112722,
+      "learning_rate": 1.8637940648619065e-06,
+      "loss": 0.7308,
       "step": 1070
     },
     {
       "epoch": 0.484304932735426,
+      "grad_norm": 1.402086349899742,
+      "learning_rate": 1.8409716719120561e-06,
+      "loss": 0.7164,
       "step": 1080
     },
     {
       "epoch": 0.48878923766816146,
+      "grad_norm": 1.5227358428935063,
+      "learning_rate": 1.8180657351224739e-06,
+      "loss": 0.732,
       "step": 1090
     },
     {
       "epoch": 0.49327354260089684,
+      "grad_norm": 1.5813743714389112,
+      "learning_rate": 1.7950818668359733e-06,
+      "loss": 0.7161,
+      "step": 1100
+    },
+    {
+      "epoch": 0.49327354260089684,
+      "eval_loss": 0.7330535054206848,
+      "eval_runtime": 408.4081,
+      "eval_samples_per_second": 122.615,
+      "eval_steps_per_second": 1.917,
       "step": 1100
     },
     {
       "epoch": 0.4977578475336323,
+      "grad_norm": 1.4881819590713468,
+      "learning_rate": 1.772025698489903e-06,
+      "loss": 0.7144,
       "step": 1110
     },
     {
       "epoch": 0.5022421524663677,
+      "grad_norm": 1.4750319990458514,
+      "learning_rate": 1.7489028792363549e-06,
+      "loss": 0.7365,
       "step": 1120
     },
     {
       "epoch": 0.5067264573991032,
+      "grad_norm": 1.4443590686278198,
+      "learning_rate": 1.7257190745580209e-06,
+      "loss": 0.7487,
       "step": 1130
     },
     {
       "epoch": 0.5112107623318386,
+      "grad_norm": 1.4695293763109774,
+      "learning_rate": 1.7024799648800555e-06,
+      "loss": 0.7233,
       "step": 1140
     },
     {
       "epoch": 0.515695067264574,
+      "grad_norm": 1.4328944860273993,
+      "learning_rate": 1.679191244178278e-06,
+      "loss": 0.7322,
       "step": 1150
     },
     {
       "epoch": 0.5201793721973094,
+      "grad_norm": 1.4157130638413895,
+      "learning_rate": 1.6558586185840473e-06,
+      "loss": 0.728,
       "step": 1160
     },
     {
       "epoch": 0.5246636771300448,
+      "grad_norm": 1.4117533616122613,
+      "learning_rate": 1.6324878049861656e-06,
+      "loss": 0.7331,
       "step": 1170
     },
     {
       "epoch": 0.5291479820627802,
+      "grad_norm": 1.4255877674393056,
+      "learning_rate": 1.609084529630145e-06,
+      "loss": 0.7491,
       "step": 1180
     },
     {
       "epoch": 0.5336322869955157,
+      "grad_norm": 1.4486300200418207,
+      "learning_rate": 1.5856545267151759e-06,
+      "loss": 0.7261,
       "step": 1190
     },
     {
       "epoch": 0.5381165919282511,
+      "grad_norm": 1.4628618883782867,
+      "learning_rate": 1.5622035369891561e-06,
+      "loss": 0.7247,
+      "step": 1200
+    },
+    {
+      "epoch": 0.5381165919282511,
+      "eval_loss": 0.7308038473129272,
+      "eval_runtime": 406.6873,
+      "eval_samples_per_second": 123.134,
+      "eval_steps_per_second": 1.925,
       "step": 1200
     },
     {
       "epoch": 0.5426008968609866,
+      "grad_norm": 1.4112256357672157,
+      "learning_rate": 1.5387373063421062e-06,
+      "loss": 0.7307,
       "step": 1210
     },
     {
       "epoch": 0.547085201793722,
+      "grad_norm": 1.3994109954542429,
+      "learning_rate": 1.515261584398333e-06,
+      "loss": 0.7062,
       "step": 1220
     },
     {
       "epoch": 0.5515695067264574,
+      "grad_norm": 1.5279436893984248,
+      "learning_rate": 1.491782123107669e-06,
+      "loss": 0.7314,
       "step": 1230
     },
     {
       "epoch": 0.5560538116591929,
+      "grad_norm": 1.4092281762272858,
+      "learning_rate": 1.4683046753361521e-06,
+      "loss": 0.7044,
       "step": 1240
     },
     {
       "epoch": 0.5605381165919282,
+      "grad_norm": 1.4363381867810665,
+      "learning_rate": 1.4448349934564736e-06,
+      "loss": 0.7287,
       "step": 1250
     },
     {
       "epoch": 0.5650224215246636,
+      "grad_norm": 1.4913351223697051,
+      "learning_rate": 1.421378827938549e-06,
+      "loss": 0.7254,
       "step": 1260
     },
     {
       "epoch": 0.5695067264573991,
+      "grad_norm": 1.5096384680619075,
+      "learning_rate": 1.3979419259405563e-06,
+      "loss": 0.7389,
       "step": 1270
     },
     {
       "epoch": 0.5739910313901345,
+      "grad_norm": 1.3495144573299676,
+      "learning_rate": 1.3745300299007856e-06,
+      "loss": 0.7247,
       "step": 1280
     },
     {
       "epoch": 0.57847533632287,
+      "grad_norm": 1.3641879848291365,
+      "learning_rate": 1.3511488761306412e-06,
+      "loss": 0.7312,
       "step": 1290
     },
     {
       "epoch": 0.5829596412556054,
+      "grad_norm": 1.3879105033157129,
+      "learning_rate": 1.3278041934091524e-06,
+      "loss": 0.7477,
+      "step": 1300
+    },
+    {
+      "epoch": 0.5829596412556054,
+      "eval_loss": 0.7287724018096924,
+      "eval_runtime": 406.882,
+      "eval_samples_per_second": 123.075,
+      "eval_steps_per_second": 1.924,
       "step": 1300
     },
     {
       "epoch": 0.5874439461883408,
+      "grad_norm": 1.3916697284582622,
+      "learning_rate": 1.3045017015793217e-06,
+      "loss": 0.7246,
       "step": 1310
     },
     {
       "epoch": 0.5919282511210763,
+      "grad_norm": 1.4328511876779917,
+      "learning_rate": 1.2812471101466687e-06,
+      "loss": 0.7303,
       "step": 1320
     },
     {
       "epoch": 0.5964125560538116,
+      "grad_norm": 1.4411092846252307,
+      "learning_rate": 1.2580461168803038e-06,
+      "loss": 0.7318,
       "step": 1330
     },
     {
       "epoch": 0.600896860986547,
+      "grad_norm": 1.4703965551927338,
+      "learning_rate": 1.2349044064168782e-06,
+      "loss": 0.7375,
       "step": 1340
     },
     {
       "epoch": 0.6053811659192825,
+      "grad_norm": 1.4319057117061509,
+      "learning_rate": 1.21182764886775e-06,
+      "loss": 0.7302,
       "step": 1350
     },
     {
       "epoch": 0.6098654708520179,
+      "grad_norm": 1.5017976848926429,
+      "learning_rate": 1.188821498429714e-06,
+      "loss": 0.7262,
       "step": 1360
     },
     {
       "epoch": 0.6143497757847534,
+      "grad_norm": 1.4553869576056546,
+      "learning_rate": 1.165891591999626e-06,
+      "loss": 0.7447,
       "step": 1370
     },
     {
       "epoch": 0.6188340807174888,
+      "grad_norm": 1.4128744043127173,
+      "learning_rate": 1.1430435477932646e-06,
+      "loss": 0.7423,
       "step": 1380
     },
     {
       "epoch": 0.6233183856502242,
+      "grad_norm": 1.3797159286061107,
+      "learning_rate": 1.1202829639687785e-06,
+      "loss": 0.744,
       "step": 1390
     },
     {
       "epoch": 0.6278026905829597,
+      "grad_norm": 1.487304571595245,
+      "learning_rate": 1.0976154172550408e-06,
+      "loss": 0.7429,
+      "step": 1400
+    },
+    {
+      "epoch": 0.6278026905829597,
+      "eval_loss": 0.7272571921348572,
+      "eval_runtime": 406.7541,
+      "eval_samples_per_second": 123.114,
+      "eval_steps_per_second": 1.925,
       "step": 1400
     },
     {
       "epoch": 0.6322869955156951,
+      "grad_norm": 1.544512062570189,
+      "learning_rate": 1.0750464615852523e-06,
+      "loss": 0.7251,
       "step": 1410
     },
     {
       "epoch": 0.6367713004484304,
+      "grad_norm": 1.422563130817404,
+      "learning_rate": 1.0525816267361398e-06,
+      "loss": 0.712,
       "step": 1420
     },
     {
       "epoch": 0.6412556053811659,
+      "grad_norm": 1.4937681764382644,
+      "learning_rate": 1.0302264169730613e-06,
+      "loss": 0.7203,
       "step": 1430
     },
     {
       "epoch": 0.6457399103139013,
+      "grad_norm": 1.50738757049434,
+      "learning_rate": 1.0079863097013722e-06,
+      "loss": 0.7121,
       "step": 1440
     },
     {
       "epoch": 0.6502242152466368,
+      "grad_norm": 1.286396172710849,
+      "learning_rate": 9.85866754124367e-07,
+      "loss": 0.7193,
       "step": 1450
     },
     {
       "epoch": 0.6547085201793722,
+      "grad_norm": 1.4997539342741677,
+      "learning_rate": 9.638731699081281e-07,
+      "loss": 0.7288,
       "step": 1460
     },
     {
       "epoch": 0.6591928251121076,
+      "grad_norm": 1.37434247409356,
+      "learning_rate": 9.42010945853623e-07,
+      "loss": 0.7597,
       "step": 1470
     },
     {
       "epoch": 0.6636771300448431,
+      "grad_norm": 1.3869436283100607,
+      "learning_rate": 9.202854385763502e-07,
+      "loss": 0.7184,
       "step": 1480
     },
     {
       "epoch": 0.6681614349775785,
+      "grad_norm": 1.3970067087387381,
+      "learning_rate": 8.987019711938812e-07,
+      "loss": 0.7326,
       "step": 1490
     },
     {
       "epoch": 0.672645739910314,
+      "grad_norm": 1.553183464191494,
+      "learning_rate": 8.772658320216047e-07,
+      "loss": 0.7317,
+      "step": 1500
+    },
+    {
+      "epoch": 0.672645739910314,
+      "eval_loss": 0.7256098389625549,
+      "eval_runtime": 406.6132,
+      "eval_samples_per_second": 123.156,
+      "eval_steps_per_second": 1.926,
       "step": 1500
     },
     {
       "epoch": 0.6771300448430493,
+      "grad_norm": 1.3357768297094936,
+      "learning_rate": 8.55982273277002e-07,
+      "loss": 0.7347,
       "step": 1510
     },
     {
       "epoch": 0.6816143497757847,
+      "grad_norm": 1.3249788097985131,
+      "learning_rate": 8.348565097927605e-07,
+      "loss": 0.7496,
       "step": 1520
     },
     {
       "epoch": 0.6860986547085202,
+      "grad_norm": 1.4578138220875878,
+      "learning_rate": 8.13893717739056e-07,
+      "loss": 0.7308,
       "step": 1530
     },
     {
       "epoch": 0.6905829596412556,
+      "grad_norm": 1.3268077719441809,
+      "learning_rate": 7.930990333553013e-07,
+      "loss": 0.7094,
       "step": 1540
     },
     {
       "epoch": 0.695067264573991,
+      "grad_norm": 1.47562182506043,
+      "learning_rate": 7.72477551691678e-07,
+      "loss": 0.697,
       "step": 1550
     },
     {
       "epoch": 0.6995515695067265,
+      "grad_norm": 1.4850843190566259,
+      "learning_rate": 7.520343253607677e-07,
+      "loss": 0.7301,
       "step": 1560
     },
     {
       "epoch": 0.7040358744394619,
+      "grad_norm": 1.5097763618083517,
+      "learning_rate": 7.317743632995731e-07,
+      "loss": 0.7217,
       "step": 1570
     },
     {
       "epoch": 0.7085201793721974,
+      "grad_norm": 1.3914348509226637,
+      "learning_rate": 7.117026295422425e-07,
+      "loss": 0.6957,
       "step": 1580
     },
     {
       "epoch": 0.7130044843049327,
+      "grad_norm": 1.5175208261545492,
+      "learning_rate": 6.918240420038007e-07,
+      "loss": 0.7317,
       "step": 1590
     },
     {
       "epoch": 0.7174887892376681,
+      "grad_norm": 1.4947559578839034,
+      "learning_rate": 6.721434712751745e-07,
+      "loss": 0.7226,
+      "step": 1600
+    },
+    {
+      "epoch": 0.7174887892376681,
+      "eval_loss": 0.7243176102638245,
+      "eval_runtime": 406.7899,
+      "eval_samples_per_second": 123.103,
+      "eval_steps_per_second": 1.925,
       "step": 1600
     },
     {
       "epoch": 0.7219730941704036,
+      "grad_norm": 1.5192098207309965,
+      "learning_rate": 6.526657394298154e-07,
+      "loss": 0.705,
       "step": 1610
     },
     {
       "epoch": 0.726457399103139,
+      "grad_norm": 1.3665027387136646,
+      "learning_rate": 6.333956188422088e-07,
+      "loss": 0.706,
       "step": 1620
     },
     {
       "epoch": 0.7309417040358744,
+      "grad_norm": 1.4974912840899435,
+      "learning_rate": 6.143378310185643e-07,
+      "loss": 0.6983,
       "step": 1630
     },
     {
       "epoch": 0.7354260089686099,
+      "grad_norm": 1.5477574584643699,
+      "learning_rate": 5.954970454399638e-07,
+      "loss": 0.7252,
       "step": 1640
     },
     {
       "epoch": 0.7399103139013453,
+      "grad_norm": 1.525090065151942,
+      "learning_rate": 5.768778784182616e-07,
+      "loss": 0.7087,
       "step": 1650
     },
     {
       "epoch": 0.7443946188340808,
+      "grad_norm": 1.4837554579437873,
+      "learning_rate": 5.584848919650069e-07,
+      "loss": 0.7075,
       "step": 1660
     },
     {
       "epoch": 0.7488789237668162,
+      "grad_norm": 1.3538329119260115,
+      "learning_rate": 5.403225926736772e-07,
+      "loss": 0.7057,
       "step": 1670
     },
     {
       "epoch": 0.7533632286995515,
+      "grad_norm": 1.359895087573495,
+      "learning_rate": 5.223954306154843e-07,
+      "loss": 0.7306,
       "step": 1680
     },
     {
       "epoch": 0.757847533632287,
+      "grad_norm": 1.4168148218595764,
+      "learning_rate": 5.047077982490311e-07,
+      "loss": 0.7424,
       "step": 1690
     },
     {
       "epoch": 0.7623318385650224,
+      "grad_norm": 1.4815842671642683,
+      "learning_rate": 4.872640293440861e-07,
+      "loss": 0.695,
+      "step": 1700
+    },
+    {
+      "epoch": 0.7623318385650224,
+      "eval_loss": 0.7233718633651733,
+      "eval_runtime": 406.8015,
+      "eval_samples_per_second": 123.099,
+      "eval_steps_per_second": 1.925,
       "step": 1700
     },
     {
       "epoch": 0.7668161434977578,
+      "grad_norm": 1.5501655544071418,
+      "learning_rate": 4.7006839791973673e-07,
+      "loss": 0.7327,
       "step": 1710
     },
     {
       "epoch": 0.7713004484304933,
+      "grad_norm": 1.3834984705411,
+      "learning_rate": 4.53125117197179e-07,
+      "loss": 0.7245,
       "step": 1720
     },
     {
       "epoch": 0.7757847533632287,
+      "grad_norm": 1.4041748328697374,
+      "learning_rate": 4.364383385674112e-07,
+      "loss": 0.7054,
       "step": 1730
     },
     {
       "epoch": 0.7802690582959642,
+      "grad_norm": 1.443104622604103,
+      "learning_rate": 4.2001215057407026e-07,
+      "loss": 0.7037,
       "step": 1740
     },
     {
       "epoch": 0.7847533632286996,
+      "grad_norm": 1.5632699202433824,
+      "learning_rate": 4.038505779116687e-07,
+      "loss": 0.705,
       "step": 1750
     },
     {
       "epoch": 0.7892376681614349,
+      "grad_norm": 1.349615732583278,
+      "learning_rate": 3.879575804394782e-07,
+      "loss": 0.7071,
       "step": 1760
     },
     {
       "epoch": 0.7937219730941704,
+      "grad_norm": 1.3657530768128234,
+      "learning_rate": 3.7233705221129646e-07,
+      "loss": 0.7273,
       "step": 1770
     },
     {
       "epoch": 0.7982062780269058,
+      "grad_norm": 1.5107387856649341,
+      "learning_rate": 3.569928205213354e-07,
+      "loss": 0.6975,
       "step": 1780
     },
     {
       "epoch": 0.8026905829596412,
+      "grad_norm": 1.4525568524987686,
+      "learning_rate": 3.419286449664741e-07,
+      "loss": 0.7095,
       "step": 1790
     },
     {
       "epoch": 0.8071748878923767,
+      "grad_norm": 1.4847854049722584,
+      "learning_rate": 3.2714821652508854e-07,
+      "loss": 0.7167,
+      "step": 1800
+    },
+    {
+      "epoch": 0.8071748878923767,
+      "eval_loss": 0.7225807309150696,
+      "eval_runtime": 406.5326,
+      "eval_samples_per_second": 123.181,
+      "eval_steps_per_second": 1.926,
       "step": 1800
     },
     {
       "epoch": 0.8116591928251121,
+      "grad_norm": 1.2447161837361285,
+      "learning_rate": 3.126551566527036e-07,
+      "loss": 0.7156,
       "step": 1810
     },
     {
       "epoch": 0.8161434977578476,
+      "grad_norm": 1.4139333132454484,
+      "learning_rate": 2.9845301639467284e-07,
+      "loss": 0.7537,
       "step": 1820
     },
     {
       "epoch": 0.820627802690583,
+      "grad_norm": 1.3663031642715642,
+      "learning_rate": 2.8454527551611205e-07,
+      "loss": 0.7238,
       "step": 1830
     },
     {
       "epoch": 0.8251121076233184,
+      "grad_norm": 1.389263976301968,
+      "learning_rate": 2.7093534164929904e-07,
+      "loss": 0.738,
       "step": 1840
     },
     {
       "epoch": 0.8295964125560538,
+      "grad_norm": 1.5068808968575202,
+      "learning_rate": 2.576265494587458e-07,
+      "loss": 0.7067,
       "step": 1850
     },
     {
       "epoch": 0.8340807174887892,
+      "grad_norm": 1.4226178531466935,
+      "learning_rate": 2.446221598241472e-07,
+      "loss": 0.7143,
       "step": 1860
     },
     {
       "epoch": 0.8385650224215246,
+      "grad_norm": 1.6881847148932905,
+      "learning_rate": 2.319253590414132e-07,
+      "loss": 0.7376,
       "step": 1870
     },
     {
       "epoch": 0.8430493273542601,
+      "grad_norm": 1.4353283330892004,
+      "learning_rate": 2.1953925804197056e-07,
+      "loss": 0.7095,
       "step": 1880
     },
     {
       "epoch": 0.8475336322869955,
+      "grad_norm": 1.4639605071750654,
+      "learning_rate": 2.0746689163053113e-07,
+      "loss": 0.7102,
       "step": 1890
     },
     {
       "epoch": 0.852017937219731,
+      "grad_norm": 1.458703799588621,
+      "learning_rate": 1.9571121774151545e-07,
+      "loss": 0.686,
+      "step": 1900
+    },
+    {
+      "epoch": 0.852017937219731,
+      "eval_loss": 0.7220604419708252,
+      "eval_runtime": 406.5609,
+      "eval_samples_per_second": 123.172,
+      "eval_steps_per_second": 1.926,
       "step": 1900
     },
     {
       "epoch": 0.8565022421524664,
+      "grad_norm": 1.470148783910905,
+      "learning_rate": 1.8427511671430757e-07,
+      "loss": 0.72,
       "step": 1910
     },
     {
       "epoch": 0.8609865470852018,
+      "grad_norm": 1.3891242748262451,
+      "learning_rate": 1.7316139058752194e-07,
+      "loss": 0.7318,
       "step": 1920
     },
     {
       "epoch": 0.8654708520179372,
+      "grad_norm": 1.2245069775705093,
+      "learning_rate": 1.6237276241245867e-07,
+      "loss": 0.7155,
       "step": 1930
     },
     {
       "epoch": 0.8699551569506726,
+      "grad_norm": 1.360510189488915,
+      "learning_rate": 1.519118755859084e-07,
+      "loss": 0.7255,
       "step": 1940
     },
     {
       "epoch": 0.874439461883408,
+      "grad_norm": 1.495119615923585,
+      "learning_rate": 1.4178129320247486e-07,
+      "loss": 0.7484,
       "step": 1950
     },
     {
       "epoch": 0.8789237668161435,
+      "grad_norm": 1.3674856635367474,
+      "learning_rate": 1.31983497426575e-07,
+      "loss": 0.7366,
       "step": 1960
     },
     {
       "epoch": 0.8834080717488789,
+      "grad_norm": 1.4494730150421093,
+      "learning_rate": 1.2252088888426431e-07,
+      "loss": 0.742,
       "step": 1970
     },
     {
       "epoch": 0.8878923766816144,
+      "grad_norm": 1.4368197978682802,
+      "learning_rate": 1.1339578607504536e-07,
+      "loss": 0.7269,
       "step": 1980
     },
     {
       "epoch": 0.8923766816143498,
+      "grad_norm": 1.4017197990051706,
+      "learning_rate": 1.0461042480379402e-07,
+      "loss": 0.7234,
       "step": 1990
     },
     {
       "epoch": 0.8968609865470852,
+      "grad_norm": 1.426560347266084,
+      "learning_rate": 9.616695763295007e-08,
+      "loss": 0.7214,
+      "step": 2000
+    },
+    {
+      "epoch": 0.8968609865470852,
+      "eval_loss": 0.721759557723999,
+      "eval_runtime": 406.5838,
+      "eval_samples_per_second": 123.165,
+      "eval_steps_per_second": 1.926,
       "step": 2000
     },
     {
       "epoch": 0.9013452914798207,
+      "grad_norm": 1.489947255967281,
+      "learning_rate": 8.806745335510297e-08,
+      "loss": 0.7341,
       "step": 2010
     },
     {
       "epoch": 0.905829596412556,
+      "grad_norm": 1.4312716003053576,
+      "learning_rate": 8.031389648610266e-08,
+      "loss": 0.7264,
       "step": 2020
     },
     {
       "epoch": 0.9103139013452914,
+      "grad_norm": 1.4764400641380824,
+      "learning_rate": 7.290818677881966e-08,
+      "loss": 0.7301,
       "step": 2030
     },
     {
       "epoch": 0.9147982062780269,
+      "grad_norm": 1.4381108917682341,
+      "learning_rate": 6.585213875767305e-08,
+      "loss": 0.6997,
       "step": 2040
     },
     {
       "epoch": 0.9192825112107623,
+      "grad_norm": 1.459723127188453,
+      "learning_rate": 5.914748127404102e-08,
+      "loss": 0.7168,
       "step": 2050
     },
     {
       "epoch": 0.9237668161434978,
+      "grad_norm": 1.5776619173541433,
+      "learning_rate": 5.2795857082663655e-08,
+      "loss": 0.72,
       "step": 2060
     },
     {
       "epoch": 0.9282511210762332,
+      "grad_norm": 1.438610611700907,
+      "learning_rate": 4.6798822439140185e-08,
+      "loss": 0.7035,
       "step": 2070
     },
     {
       "epoch": 0.9327354260089686,
+      "grad_norm": 1.4350411032390504,
+      "learning_rate": 4.115784671861916e-08,
+      "loss": 0.735,
       "step": 2080
     },
     {
       "epoch": 0.9372197309417041,
+      "grad_norm": 1.4822578142933729,
+      "learning_rate": 3.587431205577713e-08,
+      "loss": 0.7178,
       "step": 2090
     },
     {
       "epoch": 0.9417040358744395,
+      "grad_norm": 1.5001233187138816,
+      "learning_rate": 3.0949513006172325e-08,
+      "loss": 0.7358,
+      "step": 2100
+    },
+    {
+      "epoch": 0.9417040358744395,
+      "eval_loss": 0.7216091752052307,
+      "eval_runtime": 406.6258,
+      "eval_samples_per_second": 123.153,
+      "eval_steps_per_second": 1.926,
       "step": 2100
     },
     {
       "epoch": 0.9461883408071748,
+      "grad_norm": 1.4457564058059627,
+      "learning_rate": 2.6384656229056946e-08,
+      "loss": 0.7285,
       "step": 2110
     },
     {
       "epoch": 0.9506726457399103,
+      "grad_norm": 1.6789172768348999,
+      "learning_rate": 2.218086019172394e-08,
+      "loss": 0.7027,
       "step": 2120
     },
     {
       "epoch": 0.9551569506726457,
+      "grad_norm": 1.4039832008414181,
+      "learning_rate": 1.8339154895464894e-08,
+      "loss": 0.7285,
       "step": 2130
     },
     {
       "epoch": 0.9596412556053812,
+      "grad_norm": 1.7674026844330886,
+      "learning_rate": 1.4860481623201417e-08,
+      "loss": 0.713,
       "step": 2140
     },
     {
       "epoch": 0.9641255605381166,
+      "grad_norm": 1.531580121339593,
+      "learning_rate": 1.1745692708855282e-08,
+      "loss": 0.7328,
       "step": 2150
     },
     {
       "epoch": 0.968609865470852,
+      "grad_norm": 1.455884868550825,
+      "learning_rate": 8.99555132851232e-09,
+      "loss": 0.7196,
       "step": 2160
     },
     {
       "epoch": 0.9730941704035875,
+      "grad_norm": 1.3157536936429735,
+      "learning_rate": 6.610731313430318e-09,
+      "loss": 0.7277,
       "step": 2170
     },
     {
       "epoch": 0.9775784753363229,
+      "grad_norm": 1.5586404477319191,
+      "learning_rate": 4.5918169849406e-09,
+      "loss": 0.7265,
       "step": 2180
     },
     {
       "epoch": 0.9820627802690582,
+      "grad_norm": 1.3596393082767964,
+      "learning_rate": 2.939303011277872e-09,
+      "loss": 0.719,
       "step": 2190
     },
     {
       "epoch": 0.9865470852017937,
+      "grad_norm": 1.3866642718972106,
+      "learning_rate": 1.6535942863788456e-09,
+      "loss": 0.7259,
+      "step": 2200
+    },
+    {
+      "epoch": 0.9865470852017937,
+      "eval_loss": 0.7215752005577087,
+      "eval_runtime": 408.9437,
+      "eval_samples_per_second": 122.455,
+      "eval_steps_per_second": 1.915,
       "step": 2200
     },
     {
       "epoch": 0.9910313901345291,
+      "grad_norm": 1.6643780128489514,
+      "learning_rate": 7.350058306764273e-10,
+      "loss": 0.7044,
       "step": 2210
     },
     {
       "epoch": 0.9955156950672646,
+      "grad_norm": 1.428221428067804,
+      "learning_rate": 1.8376271391412624e-10,
+      "loss": 0.7109,
       "step": 2220
     },
     {
       "epoch": 1.0,
+      "grad_norm": 1.3882910125414851,
       "learning_rate": 0.0,
+      "loss": 0.7123,
       "step": 2230
     },
     {
       "epoch": 1.0,
       "step": 2230,
+      "total_flos": 250303561007104.0,
+      "train_loss": 0.7492096503219262,
+      "train_runtime": 18007.2993,
+      "train_samples_per_second": 15.851,
+      "train_steps_per_second": 0.124
     }
   ],
   "logging_steps": 10,
       "attributes": {}
     }
   },
+  "total_flos": 250303561007104.0,
   "train_batch_size": 8,
   "trial_name": null,
   "trial_params": null

training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:fb25fa7c42df808b4748bec88dee1a8e16e0082e24daff6298a752d2dca4e443
 size 6968

 version https://git-lfs.github.com/spec/v1
+oid sha256:c10bdb747e6aa533e359cc0a3925f648df006fc2f6bd836e9cca6e77438744b9
 size 6968