OpenLeecher committed on
Commit 3962092
1 Parent(s): 1e78fdd

End of training

Files changed (5)
  1. README.md +2 -1
  2. all_results.json +12 -0
  3. eval_results.json +7 -0
  4. train_results.json +8 -0
  5. trainer_state.json +1602 -0
README.md CHANGED
@@ -4,6 +4,7 @@ license: llama3.1
  base_model: meta-llama/Llama-3.1-8B
  tags:
  - llama-factory
+ - full
  - generated_from_trainer
  model-index:
  - name: llama_8b_lima_40
@@ -15,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
 
  # llama_8b_lima_40
 
- This model is a fine-tuned version of [meta-llama/Llama-3.1-8B](https://huggingface.co/meta-llama/Llama-3.1-8B) on the None dataset.
+ This model is a fine-tuned version of [meta-llama/Llama-3.1-8B](https://huggingface.co/meta-llama/Llama-3.1-8B) on the open_webui_dataset dataset.
  It achieves the following results on the evaluation set:
  - Loss: 0.9288
 
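For context (not part of the commit): a minimal inference sketch for the model described in the README above. It assumes the standard transformers API and that the checkpoint is published under a repo id like OpenLeecher/llama_8b_lima_40; the repo id and the prompt string are assumptions for illustration only.

```python
# Minimal sketch, not part of this commit. Assumes the checkpoint is available
# under the repo id "OpenLeecher/llama_8b_lima_40" (an assumption) and that
# transformers and a torch backend are installed.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "OpenLeecher/llama_8b_lima_40"  # assumed repo id
tokenizer = AutoTokenizer.from_pretrained(model_id)
# device_map="auto" additionally requires the accelerate package.
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto", device_map="auto")

prompt = "Write a short note on what this model was fine-tuned for."
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
output_ids = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```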
all_results.json ADDED
@@ -0,0 +1,12 @@
+ {
+ "epoch": 1.0,
+ "eval_loss": 0.9287646412849426,
+ "eval_runtime": 19.2416,
+ "eval_samples_per_second": 10.394,
+ "eval_steps_per_second": 2.599,
+ "total_flos": 8.200255844856627e+16,
+ "train_loss": 0.8882445046534905,
+ "train_runtime": 9157.2611,
+ "train_samples_per_second": 3.18,
+ "train_steps_per_second": 0.114
+ }
eval_results.json ADDED
@@ -0,0 +1,7 @@
+ {
+ "epoch": 1.0,
+ "eval_loss": 0.9287646412849426,
+ "eval_runtime": 19.2416,
+ "eval_samples_per_second": 10.394,
+ "eval_steps_per_second": 2.599
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "epoch": 1.0,
+ "total_flos": 8.200255844856627e+16,
+ "train_loss": 0.8882445046534905,
+ "train_runtime": 9157.2611,
+ "train_samples_per_second": 3.18,
+ "train_steps_per_second": 0.114
+ }
trainer_state.json ADDED
@@ -0,0 +1,1602 @@
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 1.0,
5
+ "eval_steps": 80,
6
+ "global_step": 1040,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.004807692307692308,
13
+ "grad_norm": 172.91949232049635,
14
+ "learning_rate": 7.142857142857143e-07,
15
+ "loss": 1.275,
16
+ "step": 5
17
+ },
18
+ {
19
+ "epoch": 0.009615384615384616,
20
+ "grad_norm": 15.162819415928812,
21
+ "learning_rate": 1.4285714285714286e-06,
22
+ "loss": 1.0842,
23
+ "step": 10
24
+ },
25
+ {
26
+ "epoch": 0.014423076923076924,
27
+ "grad_norm": 43.156613200294984,
28
+ "learning_rate": 2.142857142857143e-06,
29
+ "loss": 1.0731,
30
+ "step": 15
31
+ },
32
+ {
33
+ "epoch": 0.019230769230769232,
34
+ "grad_norm": 12.05730410265307,
35
+ "learning_rate": 2.8571428571428573e-06,
36
+ "loss": 0.9966,
37
+ "step": 20
38
+ },
39
+ {
40
+ "epoch": 0.02403846153846154,
41
+ "grad_norm": 29.645022197375592,
42
+ "learning_rate": 3.5714285714285714e-06,
43
+ "loss": 0.9506,
44
+ "step": 25
45
+ },
46
+ {
47
+ "epoch": 0.028846153846153848,
48
+ "grad_norm": 27.496331665845375,
49
+ "learning_rate": 4.285714285714286e-06,
50
+ "loss": 0.9081,
51
+ "step": 30
52
+ },
53
+ {
54
+ "epoch": 0.03365384615384615,
55
+ "grad_norm": 33.04607017399404,
56
+ "learning_rate": 5e-06,
57
+ "loss": 0.9839,
58
+ "step": 35
59
+ },
60
+ {
61
+ "epoch": 0.038461538461538464,
62
+ "grad_norm": 4.67215100059801,
63
+ "learning_rate": 5.7142857142857145e-06,
64
+ "loss": 0.9143,
65
+ "step": 40
66
+ },
67
+ {
68
+ "epoch": 0.04326923076923077,
69
+ "grad_norm": 3.5686279553698648,
70
+ "learning_rate": 5.958760472832704e-06,
71
+ "loss": 1.0449,
72
+ "step": 45
73
+ },
74
+ {
75
+ "epoch": 0.04807692307692308,
76
+ "grad_norm": 3.5137053750826093,
77
+ "learning_rate": 5.890441320869003e-06,
78
+ "loss": 1.1233,
79
+ "step": 50
80
+ },
81
+ {
82
+ "epoch": 0.052884615384615384,
83
+ "grad_norm": 39.50156599931848,
84
+ "learning_rate": 5.822637783235761e-06,
85
+ "loss": 0.7838,
86
+ "step": 55
87
+ },
88
+ {
89
+ "epoch": 0.057692307692307696,
90
+ "grad_norm": 3.9954682494826987,
91
+ "learning_rate": 5.755348556225628e-06,
92
+ "loss": 0.937,
93
+ "step": 60
94
+ },
95
+ {
96
+ "epoch": 0.0625,
97
+ "grad_norm": 3.606702118329892,
98
+ "learning_rate": 5.688572332818116e-06,
99
+ "loss": 0.9261,
100
+ "step": 65
101
+ },
102
+ {
103
+ "epoch": 0.0673076923076923,
104
+ "grad_norm": 5.018248874418577,
105
+ "learning_rate": 5.622307802654199e-06,
106
+ "loss": 0.929,
107
+ "step": 70
108
+ },
109
+ {
110
+ "epoch": 0.07211538461538461,
111
+ "grad_norm": 14.46426266685474,
112
+ "learning_rate": 5.556553652010609e-06,
113
+ "loss": 1.0281,
114
+ "step": 75
115
+ },
116
+ {
117
+ "epoch": 0.07692307692307693,
118
+ "grad_norm": 6.717639424187089,
119
+ "learning_rate": 5.4913085637737825e-06,
120
+ "loss": 1.0252,
121
+ "step": 80
122
+ },
123
+ {
124
+ "epoch": 0.07692307692307693,
125
+ "eval_loss": 1.0117295980453491,
126
+ "eval_runtime": 22.175,
127
+ "eval_samples_per_second": 9.019,
128
+ "eval_steps_per_second": 2.255,
129
+ "step": 80
130
+ },
131
+ {
132
+ "epoch": 0.08173076923076923,
133
+ "grad_norm": 5.9628841757039055,
134
+ "learning_rate": 5.42657121741348e-06,
135
+ "loss": 0.9798,
136
+ "step": 85
137
+ },
138
+ {
139
+ "epoch": 0.08653846153846154,
140
+ "grad_norm": 2.8480154385386633,
141
+ "learning_rate": 5.362340288956054e-06,
142
+ "loss": 0.9422,
143
+ "step": 90
144
+ },
145
+ {
146
+ "epoch": 0.09134615384615384,
147
+ "grad_norm": 3.041511518203806,
148
+ "learning_rate": 5.298614450957377e-06,
149
+ "loss": 0.7751,
150
+ "step": 95
151
+ },
152
+ {
153
+ "epoch": 0.09615384615384616,
154
+ "grad_norm": 3.245117237641717,
155
+ "learning_rate": 5.235392372475402e-06,
156
+ "loss": 1.0559,
157
+ "step": 100
158
+ },
159
+ {
160
+ "epoch": 0.10096153846153846,
161
+ "grad_norm": 4.186131160050305,
162
+ "learning_rate": 5.1726727190423596e-06,
163
+ "loss": 0.8535,
164
+ "step": 105
165
+ },
166
+ {
167
+ "epoch": 0.10576923076923077,
168
+ "grad_norm": 3.2112656498482743,
169
+ "learning_rate": 5.110454152636601e-06,
170
+ "loss": 1.0847,
171
+ "step": 110
172
+ },
173
+ {
174
+ "epoch": 0.11057692307692307,
175
+ "grad_norm": 3.5829116342694443,
176
+ "learning_rate": 5.04873533165404e-06,
177
+ "loss": 0.989,
178
+ "step": 115
179
+ },
180
+ {
181
+ "epoch": 0.11538461538461539,
182
+ "grad_norm": 2.8192163511739308,
183
+ "learning_rate": 4.987514910879233e-06,
184
+ "loss": 0.7562,
185
+ "step": 120
186
+ },
187
+ {
188
+ "epoch": 0.1201923076923077,
189
+ "grad_norm": 3.552581067366997,
190
+ "learning_rate": 4.9267915414560465e-06,
191
+ "loss": 0.882,
192
+ "step": 125
193
+ },
194
+ {
195
+ "epoch": 0.125,
196
+ "grad_norm": 3.166131159213283,
197
+ "learning_rate": 4.866563870857949e-06,
198
+ "loss": 0.8461,
199
+ "step": 130
200
+ },
201
+ {
202
+ "epoch": 0.12980769230769232,
203
+ "grad_norm": 3.4902158184612873,
204
+ "learning_rate": 4.806830542857871e-06,
205
+ "loss": 1.0949,
206
+ "step": 135
207
+ },
208
+ {
209
+ "epoch": 0.1346153846153846,
210
+ "grad_norm": 2.763230275625746,
211
+ "learning_rate": 4.7475901974976784e-06,
212
+ "loss": 0.9741,
213
+ "step": 140
214
+ },
215
+ {
216
+ "epoch": 0.13942307692307693,
217
+ "grad_norm": 3.7680960024565047,
218
+ "learning_rate": 4.688841471057191e-06,
219
+ "loss": 0.8267,
220
+ "step": 145
221
+ },
222
+ {
223
+ "epoch": 0.14423076923076922,
224
+ "grad_norm": 3.7223152035406177,
225
+ "learning_rate": 4.630582996022805e-06,
226
+ "loss": 0.9237,
227
+ "step": 150
228
+ },
229
+ {
230
+ "epoch": 0.14903846153846154,
231
+ "grad_norm": 163.68789967501425,
232
+ "learning_rate": 4.572813401055646e-06,
233
+ "loss": 0.9735,
234
+ "step": 155
235
+ },
236
+ {
237
+ "epoch": 0.15384615384615385,
238
+ "grad_norm": 4.052744089990857,
239
+ "learning_rate": 4.515531310959294e-06,
240
+ "loss": 0.8185,
241
+ "step": 160
242
+ },
243
+ {
244
+ "epoch": 0.15384615384615385,
245
+ "eval_loss": 0.9820164442062378,
246
+ "eval_runtime": 20.5987,
247
+ "eval_samples_per_second": 9.709,
248
+ "eval_steps_per_second": 2.427,
249
+ "step": 160
250
+ },
251
+ {
252
+ "epoch": 0.15865384615384615,
253
+ "grad_norm": 3.5962134321693213,
254
+ "learning_rate": 4.458735346647049e-06,
255
+ "loss": 0.9701,
256
+ "step": 165
257
+ },
258
+ {
259
+ "epoch": 0.16346153846153846,
260
+ "grad_norm": 3.405720690826482,
261
+ "learning_rate": 4.402424125108714e-06,
262
+ "loss": 0.7428,
263
+ "step": 170
264
+ },
265
+ {
266
+ "epoch": 0.16826923076923078,
267
+ "grad_norm": 3.5656581164655297,
268
+ "learning_rate": 4.346596259376934e-06,
269
+ "loss": 1.0573,
270
+ "step": 175
271
+ },
272
+ {
273
+ "epoch": 0.17307692307692307,
274
+ "grad_norm": 3.1116839574479944,
275
+ "learning_rate": 4.291250358493015e-06,
276
+ "loss": 0.99,
277
+ "step": 180
278
+ },
279
+ {
280
+ "epoch": 0.1778846153846154,
281
+ "grad_norm": 3.1856579669538037,
282
+ "learning_rate": 4.236385027472282e-06,
283
+ "loss": 0.9208,
284
+ "step": 185
285
+ },
286
+ {
287
+ "epoch": 0.18269230769230768,
288
+ "grad_norm": 2.713262969000155,
289
+ "learning_rate": 4.181998867268901e-06,
290
+ "loss": 0.9552,
291
+ "step": 190
292
+ },
293
+ {
294
+ "epoch": 0.1875,
295
+ "grad_norm": 3.4690878970474364,
296
+ "learning_rate": 4.1280904747402165e-06,
297
+ "loss": 0.9004,
298
+ "step": 195
299
+ },
300
+ {
301
+ "epoch": 0.19230769230769232,
302
+ "grad_norm": 2.6094836512830755,
303
+ "learning_rate": 4.07465844261054e-06,
304
+ "loss": 1.0189,
305
+ "step": 200
306
+ },
307
+ {
308
+ "epoch": 0.1971153846153846,
309
+ "grad_norm": 2.7258662188339917,
310
+ "learning_rate": 4.021701359434411e-06,
311
+ "loss": 0.8663,
312
+ "step": 205
313
+ },
314
+ {
315
+ "epoch": 0.20192307692307693,
316
+ "grad_norm": 2.130745708170683,
317
+ "learning_rate": 3.9692178095593185e-06,
318
+ "loss": 0.9191,
319
+ "step": 210
320
+ },
321
+ {
322
+ "epoch": 0.20673076923076922,
323
+ "grad_norm": 3.632025896546127,
324
+ "learning_rate": 3.917206373087843e-06,
325
+ "loss": 0.8463,
326
+ "step": 215
327
+ },
328
+ {
329
+ "epoch": 0.21153846153846154,
330
+ "grad_norm": 2.8163127172248754,
331
+ "learning_rate": 3.86566562583925e-06,
332
+ "loss": 0.9113,
333
+ "step": 220
334
+ },
335
+ {
336
+ "epoch": 0.21634615384615385,
337
+ "grad_norm": 2.925143301211318,
338
+ "learning_rate": 3.814594139310489e-06,
339
+ "loss": 0.8026,
340
+ "step": 225
341
+ },
342
+ {
343
+ "epoch": 0.22115384615384615,
344
+ "grad_norm": 3.491601498263278,
345
+ "learning_rate": 3.7639904806365957e-06,
346
+ "loss": 1.0014,
347
+ "step": 230
348
+ },
349
+ {
350
+ "epoch": 0.22596153846153846,
351
+ "grad_norm": 3.60018394829918,
352
+ "learning_rate": 3.7138532125504874e-06,
353
+ "loss": 0.8704,
354
+ "step": 235
355
+ },
356
+ {
357
+ "epoch": 0.23076923076923078,
358
+ "grad_norm": 2.8380175093955193,
359
+ "learning_rate": 3.664180893342146e-06,
360
+ "loss": 0.9686,
361
+ "step": 240
362
+ },
363
+ {
364
+ "epoch": 0.23076923076923078,
365
+ "eval_loss": 0.9701676964759827,
366
+ "eval_runtime": 21.0027,
367
+ "eval_samples_per_second": 9.523,
368
+ "eval_steps_per_second": 2.381,
369
+ "step": 240
370
+ },
371
+ {
372
+ "epoch": 0.23557692307692307,
373
+ "grad_norm": 2.88654950044071,
374
+ "learning_rate": 3.6149720768171497e-06,
375
+ "loss": 0.9927,
376
+ "step": 245
377
+ },
378
+ {
379
+ "epoch": 0.2403846153846154,
380
+ "grad_norm": 4.075449595613525,
381
+ "learning_rate": 3.5662253122545742e-06,
382
+ "loss": 0.8335,
383
+ "step": 250
384
+ },
385
+ {
386
+ "epoch": 0.24519230769230768,
387
+ "grad_norm": 3.1216694362939137,
388
+ "learning_rate": 3.517939144364211e-06,
389
+ "loss": 0.9225,
390
+ "step": 255
391
+ },
392
+ {
393
+ "epoch": 0.25,
394
+ "grad_norm": 3.4006563474977787,
395
+ "learning_rate": 3.4701121132431283e-06,
396
+ "loss": 0.9645,
397
+ "step": 260
398
+ },
399
+ {
400
+ "epoch": 0.2548076923076923,
401
+ "grad_norm": 5.006159977047571,
402
+ "learning_rate": 3.422742754331519e-06,
403
+ "loss": 1.0596,
404
+ "step": 265
405
+ },
406
+ {
407
+ "epoch": 0.25961538461538464,
408
+ "grad_norm": 6.352134442675443,
409
+ "learning_rate": 3.3758295983678575e-06,
410
+ "loss": 0.8279,
411
+ "step": 270
412
+ },
413
+ {
414
+ "epoch": 0.2644230769230769,
415
+ "grad_norm": 4.599350051977448,
416
+ "learning_rate": 3.329371171343321e-06,
417
+ "loss": 0.7653,
418
+ "step": 275
419
+ },
420
+ {
421
+ "epoch": 0.2692307692307692,
422
+ "grad_norm": 3.775428351149461,
423
+ "learning_rate": 3.2833659944554757e-06,
424
+ "loss": 0.8703,
425
+ "step": 280
426
+ },
427
+ {
428
+ "epoch": 0.27403846153846156,
429
+ "grad_norm": 2.5830965772659287,
430
+ "learning_rate": 3.2378125840611978e-06,
431
+ "loss": 0.826,
432
+ "step": 285
433
+ },
434
+ {
435
+ "epoch": 0.27884615384615385,
436
+ "grad_norm": 3.6279739322810642,
437
+ "learning_rate": 3.192709451628821e-06,
438
+ "loss": 0.8617,
439
+ "step": 290
440
+ },
441
+ {
442
+ "epoch": 0.28365384615384615,
443
+ "grad_norm": 2.574184072314173,
444
+ "learning_rate": 3.1480551036895063e-06,
445
+ "loss": 0.9925,
446
+ "step": 295
447
+ },
448
+ {
449
+ "epoch": 0.28846153846153844,
450
+ "grad_norm": 3.245622107244932,
451
+ "learning_rate": 3.1038480417877728e-06,
452
+ "loss": 0.8276,
453
+ "step": 300
454
+ },
455
+ {
456
+ "epoch": 0.2932692307692308,
457
+ "grad_norm": 2.7094531818622385,
458
+ "learning_rate": 3.0600867624312124e-06,
459
+ "loss": 0.93,
460
+ "step": 305
461
+ },
462
+ {
463
+ "epoch": 0.2980769230769231,
464
+ "grad_norm": 3.4108002405937996,
465
+ "learning_rate": 3.0167697570393586e-06,
466
+ "loss": 0.9093,
467
+ "step": 310
468
+ },
469
+ {
470
+ "epoch": 0.30288461538461536,
471
+ "grad_norm": 3.2261512213908468,
472
+ "learning_rate": 2.973895511891673e-06,
473
+ "loss": 0.8436,
474
+ "step": 315
475
+ },
476
+ {
477
+ "epoch": 0.3076923076923077,
478
+ "grad_norm": 2.9111217804814733,
479
+ "learning_rate": 2.9314625080746407e-06,
480
+ "loss": 0.7962,
481
+ "step": 320
482
+ },
483
+ {
484
+ "epoch": 0.3076923076923077,
485
+ "eval_loss": 0.9604336619377136,
486
+ "eval_runtime": 20.7503,
487
+ "eval_samples_per_second": 9.638,
488
+ "eval_steps_per_second": 2.41,
489
+ "step": 320
490
+ },
491
+ {
492
+ "epoch": 0.3125,
493
+ "grad_norm": 3.0069826903052568,
494
+ "learning_rate": 2.8894692214279614e-06,
495
+ "loss": 0.9501,
496
+ "step": 325
497
+ },
498
+ {
499
+ "epoch": 0.3173076923076923,
500
+ "grad_norm": 2.7402700321309497,
501
+ "learning_rate": 2.8479141224897947e-06,
502
+ "loss": 0.8932,
503
+ "step": 330
504
+ },
505
+ {
506
+ "epoch": 0.32211538461538464,
507
+ "grad_norm": 2.850461668225791,
508
+ "learning_rate": 2.806795676441052e-06,
509
+ "loss": 0.8509,
510
+ "step": 335
511
+ },
512
+ {
513
+ "epoch": 0.3269230769230769,
514
+ "grad_norm": 2.8055976999039833,
515
+ "learning_rate": 2.7661123430487023e-06,
516
+ "loss": 0.8531,
517
+ "step": 340
518
+ },
519
+ {
520
+ "epoch": 0.3317307692307692,
521
+ "grad_norm": 3.950790855598453,
522
+ "learning_rate": 2.725862576608072e-06,
523
+ "loss": 0.8428,
524
+ "step": 345
525
+ },
526
+ {
527
+ "epoch": 0.33653846153846156,
528
+ "grad_norm": 2.608925093832874,
529
+ "learning_rate": 2.6860448258841182e-06,
530
+ "loss": 0.9324,
531
+ "step": 350
532
+ },
533
+ {
534
+ "epoch": 0.34134615384615385,
535
+ "grad_norm": 4.161582883109561,
536
+ "learning_rate": 2.6466575340516312e-06,
537
+ "loss": 0.8302,
538
+ "step": 355
539
+ },
540
+ {
541
+ "epoch": 0.34615384615384615,
542
+ "grad_norm": 3.223665474192437,
543
+ "learning_rate": 2.607699138634365e-06,
544
+ "loss": 1.0338,
545
+ "step": 360
546
+ },
547
+ {
548
+ "epoch": 0.35096153846153844,
549
+ "grad_norm": 4.360630028683017,
550
+ "learning_rate": 2.5691680714430463e-06,
551
+ "loss": 0.781,
552
+ "step": 365
553
+ },
554
+ {
555
+ "epoch": 0.3557692307692308,
556
+ "grad_norm": 3.2326801834772256,
557
+ "learning_rate": 2.531062758512248e-06,
558
+ "loss": 0.9277,
559
+ "step": 370
560
+ },
561
+ {
562
+ "epoch": 0.3605769230769231,
563
+ "grad_norm": 3.518325507567999,
564
+ "learning_rate": 2.493381620036082e-06,
565
+ "loss": 0.7648,
566
+ "step": 375
567
+ },
568
+ {
569
+ "epoch": 0.36538461538461536,
570
+ "grad_norm": 3.905842925893686,
571
+ "learning_rate": 2.4561230703027005e-06,
572
+ "loss": 0.7278,
573
+ "step": 380
574
+ },
575
+ {
576
+ "epoch": 0.3701923076923077,
577
+ "grad_norm": 5.371293959548764,
578
+ "learning_rate": 2.4192855176275597e-06,
579
+ "loss": 0.7564,
580
+ "step": 385
581
+ },
582
+ {
583
+ "epoch": 0.375,
584
+ "grad_norm": 2.850075623051217,
585
+ "learning_rate": 2.382867364285416e-06,
586
+ "loss": 0.7983,
587
+ "step": 390
588
+ },
589
+ {
590
+ "epoch": 0.3798076923076923,
591
+ "grad_norm": 6.661652196819241,
592
+ "learning_rate": 2.3468670064410194e-06,
593
+ "loss": 0.9005,
594
+ "step": 395
595
+ },
596
+ {
597
+ "epoch": 0.38461538461538464,
598
+ "grad_norm": 4.700394864120094,
599
+ "learning_rate": 2.3112828340784763e-06,
600
+ "loss": 0.8669,
601
+ "step": 400
602
+ },
603
+ {
604
+ "epoch": 0.38461538461538464,
605
+ "eval_loss": 0.9519588351249695,
606
+ "eval_runtime": 20.79,
607
+ "eval_samples_per_second": 9.62,
608
+ "eval_steps_per_second": 2.405,
609
+ "step": 400
610
+ },
611
+ {
612
+ "epoch": 0.3894230769230769,
613
+ "grad_norm": 3.3197778882289297,
614
+ "learning_rate": 2.2761132309292435e-06,
615
+ "loss": 0.8864,
616
+ "step": 405
617
+ },
618
+ {
619
+ "epoch": 0.3942307692307692,
620
+ "grad_norm": 4.198490325675027,
621
+ "learning_rate": 2.241356574398701e-06,
622
+ "loss": 0.9219,
623
+ "step": 410
624
+ },
625
+ {
626
+ "epoch": 0.39903846153846156,
627
+ "grad_norm": 8.447734132502742,
628
+ "learning_rate": 2.2070112354912867e-06,
629
+ "loss": 0.9542,
630
+ "step": 415
631
+ },
632
+ {
633
+ "epoch": 0.40384615384615385,
634
+ "grad_norm": 3.6043476480492873,
635
+ "learning_rate": 2.1730755787341422e-06,
636
+ "loss": 0.7828,
637
+ "step": 420
638
+ },
639
+ {
640
+ "epoch": 0.40865384615384615,
641
+ "grad_norm": 3.550876988072227,
642
+ "learning_rate": 2.1395479620992237e-06,
643
+ "loss": 0.9213,
644
+ "step": 425
645
+ },
646
+ {
647
+ "epoch": 0.41346153846153844,
648
+ "grad_norm": 4.346265355776214,
649
+ "learning_rate": 2.1064267369238405e-06,
650
+ "loss": 0.8832,
651
+ "step": 430
652
+ },
653
+ {
654
+ "epoch": 0.4182692307692308,
655
+ "grad_norm": 8.956356184457416,
656
+ "learning_rate": 2.0737102478295753e-06,
657
+ "loss": 1.0524,
658
+ "step": 435
659
+ },
660
+ {
661
+ "epoch": 0.4230769230769231,
662
+ "grad_norm": 4.0026073252992225,
663
+ "learning_rate": 2.0413968326395454e-06,
664
+ "loss": 0.8951,
665
+ "step": 440
666
+ },
667
+ {
668
+ "epoch": 0.42788461538461536,
669
+ "grad_norm": 3.769313811024604,
670
+ "learning_rate": 2.009484822293941e-06,
671
+ "loss": 0.8803,
672
+ "step": 445
673
+ },
674
+ {
675
+ "epoch": 0.4326923076923077,
676
+ "grad_norm": 3.4579810908904927,
677
+ "learning_rate": 1.9779725407638038e-06,
678
+ "loss": 0.8575,
679
+ "step": 450
680
+ },
681
+ {
682
+ "epoch": 0.4375,
683
+ "grad_norm": 3.6235925820400112,
684
+ "learning_rate": 1.946858304962993e-06,
685
+ "loss": 0.874,
686
+ "step": 455
687
+ },
688
+ {
689
+ "epoch": 0.4423076923076923,
690
+ "grad_norm": 3.2454132821623607,
691
+ "learning_rate": 1.9161404246582834e-06,
692
+ "loss": 1.0103,
693
+ "step": 460
694
+ },
695
+ {
696
+ "epoch": 0.44711538461538464,
697
+ "grad_norm": 3.438741636237806,
698
+ "learning_rate": 1.8858172023775289e-06,
699
+ "loss": 0.8943,
700
+ "step": 465
701
+ },
702
+ {
703
+ "epoch": 0.4519230769230769,
704
+ "grad_norm": 3.1798809256755205,
705
+ "learning_rate": 1.8558869333158512e-06,
706
+ "loss": 0.9638,
707
+ "step": 470
708
+ },
709
+ {
710
+ "epoch": 0.4567307692307692,
711
+ "grad_norm": 3.6082058444107177,
712
+ "learning_rate": 1.8263479052397838e-06,
713
+ "loss": 0.8781,
714
+ "step": 475
715
+ },
716
+ {
717
+ "epoch": 0.46153846153846156,
718
+ "grad_norm": 2.83102154533938,
719
+ "learning_rate": 1.7971983983893046e-06,
720
+ "loss": 0.8883,
721
+ "step": 480
722
+ },
723
+ {
724
+ "epoch": 0.46153846153846156,
725
+ "eval_loss": 0.9505824446678162,
726
+ "eval_runtime": 20.9063,
727
+ "eval_samples_per_second": 9.566,
728
+ "eval_steps_per_second": 2.392,
729
+ "step": 480
730
+ },
731
+ {
732
+ "epoch": 0.46634615384615385,
733
+ "grad_norm": 2.9075319767858425,
734
+ "learning_rate": 1.768436685377699e-06,
735
+ "loss": 0.7087,
736
+ "step": 485
737
+ },
738
+ {
739
+ "epoch": 0.47115384615384615,
740
+ "grad_norm": 3.7507183698931117,
741
+ "learning_rate": 1.7400610310891816e-06,
742
+ "loss": 0.928,
743
+ "step": 490
744
+ },
745
+ {
746
+ "epoch": 0.47596153846153844,
747
+ "grad_norm": 3.0576523378992326,
748
+ "learning_rate": 1.7120696925742107e-06,
749
+ "loss": 0.8047,
750
+ "step": 495
751
+ },
752
+ {
753
+ "epoch": 0.4807692307692308,
754
+ "grad_norm": 2.6687945237895287,
755
+ "learning_rate": 1.6844609189424112e-06,
756
+ "loss": 1.0923,
757
+ "step": 500
758
+ },
759
+ {
760
+ "epoch": 0.4855769230769231,
761
+ "grad_norm": 3.7056881913494277,
762
+ "learning_rate": 1.6572329512530394e-06,
763
+ "loss": 0.7718,
764
+ "step": 505
765
+ },
766
+ {
767
+ "epoch": 0.49038461538461536,
768
+ "grad_norm": 4.261130783269975,
769
+ "learning_rate": 1.630384022402907e-06,
770
+ "loss": 0.7462,
771
+ "step": 510
772
+ },
773
+ {
774
+ "epoch": 0.4951923076923077,
775
+ "grad_norm": 2.8143821099136024,
776
+ "learning_rate": 1.6039123570116796e-06,
777
+ "loss": 0.965,
778
+ "step": 515
779
+ },
780
+ {
781
+ "epoch": 0.5,
782
+ "grad_norm": 3.0264813559392616,
783
+ "learning_rate": 1.5778161713044614e-06,
784
+ "loss": 0.8943,
785
+ "step": 520
786
+ },
787
+ {
788
+ "epoch": 0.5048076923076923,
789
+ "grad_norm": 18.246495136897703,
790
+ "learning_rate": 1.5520936729915777e-06,
791
+ "loss": 0.9694,
792
+ "step": 525
793
+ },
794
+ {
795
+ "epoch": 0.5096153846153846,
796
+ "grad_norm": 4.039649841411536,
797
+ "learning_rate": 1.5267430611454654e-06,
798
+ "loss": 0.8589,
799
+ "step": 530
800
+ },
801
+ {
802
+ "epoch": 0.5144230769230769,
803
+ "grad_norm": 3.028129518354503,
804
+ "learning_rate": 1.5017625260745615e-06,
805
+ "loss": 0.8761,
806
+ "step": 535
807
+ },
808
+ {
809
+ "epoch": 0.5192307692307693,
810
+ "grad_norm": 3.0504275368028115,
811
+ "learning_rate": 1.4771502491940911e-06,
812
+ "loss": 0.9293,
813
+ "step": 540
814
+ },
815
+ {
816
+ "epoch": 0.5240384615384616,
817
+ "grad_norm": 2.520216608258428,
818
+ "learning_rate": 1.4529044028936606e-06,
819
+ "loss": 0.7738,
820
+ "step": 545
821
+ },
822
+ {
823
+ "epoch": 0.5288461538461539,
824
+ "grad_norm": 3.4732840458118197,
825
+ "learning_rate": 1.4290231504015187e-06,
826
+ "loss": 0.8173,
827
+ "step": 550
828
+ },
829
+ {
830
+ "epoch": 0.5336538461538461,
831
+ "grad_norm": 2.992673074333473,
832
+ "learning_rate": 1.4055046456453867e-06,
833
+ "loss": 1.0166,
834
+ "step": 555
835
+ },
836
+ {
837
+ "epoch": 0.5384615384615384,
838
+ "grad_norm": 3.676863247659791,
839
+ "learning_rate": 1.3823470331097324e-06,
840
+ "loss": 0.7636,
841
+ "step": 560
842
+ },
843
+ {
844
+ "epoch": 0.5384615384615384,
845
+ "eval_loss": 0.9441266059875488,
846
+ "eval_runtime": 20.933,
847
+ "eval_samples_per_second": 9.554,
848
+ "eval_steps_per_second": 2.389,
849
+ "step": 560
850
+ },
851
+ {
852
+ "epoch": 0.5432692307692307,
853
+ "grad_norm": 2.562908465662044,
854
+ "learning_rate": 1.3595484476893454e-06,
855
+ "loss": 0.9229,
856
+ "step": 565
857
+ },
858
+ {
859
+ "epoch": 0.5480769230769231,
860
+ "grad_norm": 2.2982897576935724,
861
+ "learning_rate": 1.3371070145391023e-06,
862
+ "loss": 0.8806,
863
+ "step": 570
864
+ },
865
+ {
866
+ "epoch": 0.5528846153846154,
867
+ "grad_norm": 4.029788762639043,
868
+ "learning_rate": 1.3150208489197545e-06,
869
+ "loss": 0.7314,
870
+ "step": 575
871
+ },
872
+ {
873
+ "epoch": 0.5576923076923077,
874
+ "grad_norm": 3.4816155172912575,
875
+ "learning_rate": 1.2932880560396128e-06,
876
+ "loss": 0.819,
877
+ "step": 580
878
+ },
879
+ {
880
+ "epoch": 0.5625,
881
+ "grad_norm": 3.8108295243391868,
882
+ "learning_rate": 1.2719067308919584e-06,
883
+ "loss": 0.7222,
884
+ "step": 585
885
+ },
886
+ {
887
+ "epoch": 0.5673076923076923,
888
+ "grad_norm": 2.7857292629014183,
889
+ "learning_rate": 1.2508749580880287e-06,
890
+ "loss": 0.8022,
891
+ "step": 590
892
+ },
893
+ {
894
+ "epoch": 0.5721153846153846,
895
+ "grad_norm": 3.6021354748640677,
896
+ "learning_rate": 1.2301908116853925e-06,
897
+ "loss": 0.884,
898
+ "step": 595
899
+ },
900
+ {
901
+ "epoch": 0.5769230769230769,
902
+ "grad_norm": 3.135380180508478,
903
+ "learning_rate": 1.2098523550115558e-06,
904
+ "loss": 1.0023,
905
+ "step": 600
906
+ },
907
+ {
908
+ "epoch": 0.5817307692307693,
909
+ "grad_norm": 3.3653027564726035,
910
+ "learning_rate": 1.189857640482588e-06,
911
+ "loss": 0.9518,
912
+ "step": 605
913
+ },
914
+ {
915
+ "epoch": 0.5865384615384616,
916
+ "grad_norm": 2.459430693726985,
917
+ "learning_rate": 1.170204709416585e-06,
918
+ "loss": 0.8211,
919
+ "step": 610
920
+ },
921
+ {
922
+ "epoch": 0.5913461538461539,
923
+ "grad_norm": 5.022938552667774,
924
+ "learning_rate": 1.1508915918417567e-06,
925
+ "loss": 0.7398,
926
+ "step": 615
927
+ },
928
+ {
929
+ "epoch": 0.5961538461538461,
930
+ "grad_norm": 3.8724856541183357,
931
+ "learning_rate": 1.1319163062989139e-06,
932
+ "loss": 0.941,
933
+ "step": 620
934
+ },
935
+ {
936
+ "epoch": 0.6009615384615384,
937
+ "grad_norm": 3.1280693366860963,
938
+ "learning_rate": 1.1132768596381337e-06,
939
+ "loss": 0.815,
940
+ "step": 625
941
+ },
942
+ {
943
+ "epoch": 0.6057692307692307,
944
+ "grad_norm": 2.8201015243807284,
945
+ "learning_rate": 1.0949712468093497e-06,
946
+ "loss": 0.8991,
947
+ "step": 630
948
+ },
949
+ {
950
+ "epoch": 0.6105769230769231,
951
+ "grad_norm": 3.32788176588362,
952
+ "learning_rate": 1.076997450646619e-06,
953
+ "loss": 0.9282,
954
+ "step": 635
955
+ },
956
+ {
957
+ "epoch": 0.6153846153846154,
958
+ "grad_norm": 3.9582374514755134,
959
+ "learning_rate": 1.0593534416457847e-06,
960
+ "loss": 0.8221,
961
+ "step": 640
962
+ },
963
+ {
964
+ "epoch": 0.6153846153846154,
965
+ "eval_loss": 0.9404194355010986,
966
+ "eval_runtime": 21.0496,
967
+ "eval_samples_per_second": 9.501,
968
+ "eval_steps_per_second": 2.375,
969
+ "step": 640
970
+ },
971
+ {
972
+ "epoch": 0.6201923076923077,
973
+ "grad_norm": 2.5869189332376004,
974
+ "learning_rate": 1.0420371777352623e-06,
975
+ "loss": 0.8804,
976
+ "step": 645
977
+ },
978
+ {
979
+ "epoch": 0.625,
980
+ "grad_norm": 2.53500848922609,
981
+ "learning_rate": 1.0250466040396306e-06,
982
+ "loss": 0.7947,
983
+ "step": 650
984
+ },
985
+ {
986
+ "epoch": 0.6298076923076923,
987
+ "grad_norm": 3.07037325829785,
988
+ "learning_rate": 1.0083796526357243e-06,
989
+ "loss": 0.8485,
990
+ "step": 655
991
+ },
992
+ {
993
+ "epoch": 0.6346153846153846,
994
+ "grad_norm": 2.5949762709128814,
995
+ "learning_rate": 9.920342423008766e-07,
996
+ "loss": 0.7737,
997
+ "step": 660
998
+ },
999
+ {
1000
+ "epoch": 0.6394230769230769,
1001
+ "grad_norm": 3.723350500191604,
1002
+ "learning_rate": 9.760082782529624e-07,
1003
+ "loss": 0.8044,
1004
+ "step": 665
1005
+ },
1006
+ {
1007
+ "epoch": 0.6442307692307693,
1008
+ "grad_norm": 2.91223481306706,
1009
+ "learning_rate": 9.602996518818617e-07,
1010
+ "loss": 0.8059,
1011
+ "step": 670
1012
+ },
1013
+ {
1014
+ "epoch": 0.6490384615384616,
1015
+ "grad_norm": 3.228159750161236,
1016
+ "learning_rate": 9.449062404719376e-07,
1017
+ "loss": 0.9736,
1018
+ "step": 675
1019
+ },
1020
+ {
1021
+ "epoch": 0.6538461538461539,
1022
+ "grad_norm": 4.2304614726707594,
1023
+ "learning_rate": 9.298259069151074e-07,
1024
+ "loss": 0.8253,
1025
+ "step": 680
1026
+ },
1027
+ {
1028
+ "epoch": 0.6586538461538461,
1029
+ "grad_norm": 3.253581255940029,
1030
+ "learning_rate": 9.15056499414049e-07,
1031
+ "loss": 1.0807,
1032
+ "step": 685
1033
+ },
1034
+ {
1035
+ "epoch": 0.6634615384615384,
1036
+ "grad_norm": 4.2515171628124975,
1037
+ "learning_rate": 9.005958511750684e-07,
1038
+ "loss": 0.8206,
1039
+ "step": 690
1040
+ },
1041
+ {
1042
+ "epoch": 0.6682692307692307,
1043
+ "grad_norm": 2.7617275421854526,
1044
+ "learning_rate": 8.864417800901062e-07,
1045
+ "loss": 0.9496,
1046
+ "step": 695
1047
+ },
1048
+ {
1049
+ "epoch": 0.6730769230769231,
1050
+ "grad_norm": 3.233107996911771,
1051
+ "learning_rate": 8.72592088407351e-07,
1052
+ "loss": 0.9023,
1053
+ "step": 700
1054
+ },
1055
+ {
1056
+ "epoch": 0.6778846153846154,
1057
+ "grad_norm": 3.1204863795886184,
1058
+ "learning_rate": 8.590445623898662e-07,
1059
+ "loss": 0.869,
1060
+ "step": 705
1061
+ },
1062
+ {
1063
+ "epoch": 0.6826923076923077,
1064
+ "grad_norm": 2.5285063680240234,
1065
+ "learning_rate": 8.457969719616223e-07,
1066
+ "loss": 0.9186,
1067
+ "step": 710
1068
+ },
1069
+ {
1070
+ "epoch": 0.6875,
1071
+ "grad_norm": 3.0506459039436336,
1072
+ "learning_rate": 8.32847070340265e-07,
1073
+ "loss": 0.9203,
1074
+ "step": 715
1075
+ },
1076
+ {
1077
+ "epoch": 0.6923076923076923,
1078
+ "grad_norm": 3.7957636063897318,
1079
+ "learning_rate": 8.201925936559198e-07,
1080
+ "loss": 0.9417,
1081
+ "step": 720
1082
+ },
1083
+ {
1084
+ "epoch": 0.6923076923076923,
1085
+ "eval_loss": 0.9345305562019348,
1086
+ "eval_runtime": 21.1147,
1087
+ "eval_samples_per_second": 9.472,
1088
+ "eval_steps_per_second": 2.368,
1089
+ "step": 720
1090
+ },
1091
+ {
1092
+ "epoch": 0.6971153846153846,
1093
+ "grad_norm": 3.3254122602539624,
1094
+ "learning_rate": 8.078312605552745e-07,
1095
+ "loss": 0.9107,
1096
+ "step": 725
1097
+ },
1098
+ {
1099
+ "epoch": 0.7019230769230769,
1100
+ "grad_norm": 2.8068324192286487,
1101
+ "learning_rate": 7.957607717901299e-07,
1102
+ "loss": 0.9438,
1103
+ "step": 730
1104
+ },
1105
+ {
1106
+ "epoch": 0.7067307692307693,
1107
+ "grad_norm": 3.498836942130792,
1108
+ "learning_rate": 7.839788097895564e-07,
1109
+ "loss": 0.8693,
1110
+ "step": 735
1111
+ },
1112
+ {
1113
+ "epoch": 0.7115384615384616,
1114
+ "grad_norm": 2.5787803338017885,
1115
+ "learning_rate": 7.72483038214722e-07,
1116
+ "loss": 0.896,
1117
+ "step": 740
1118
+ },
1119
+ {
1120
+ "epoch": 0.7163461538461539,
1121
+ "grad_norm": 3.67630240687256,
1122
+ "learning_rate": 7.612711014953991e-07,
1123
+ "loss": 0.8243,
1124
+ "step": 745
1125
+ },
1126
+ {
1127
+ "epoch": 0.7211538461538461,
1128
+ "grad_norm": 2.4521374343388125,
1129
+ "learning_rate": 7.503406243470673e-07,
1130
+ "loss": 1.0063,
1131
+ "step": 750
1132
+ },
1133
+ {
1134
+ "epoch": 0.7259615384615384,
1135
+ "grad_norm": 2.6536830050201536,
1136
+ "learning_rate": 7.396892112674676e-07,
1137
+ "loss": 0.8133,
1138
+ "step": 755
1139
+ },
1140
+ {
1141
+ "epoch": 0.7307692307692307,
1142
+ "grad_norm": 3.057951252038446,
1143
+ "learning_rate": 7.293144460113513e-07,
1144
+ "loss": 0.8753,
1145
+ "step": 760
1146
+ },
1147
+ {
1148
+ "epoch": 0.7355769230769231,
1149
+ "grad_norm": 2.3939129798326815,
1150
+ "learning_rate": 7.192138910420856e-07,
1151
+ "loss": 0.8277,
1152
+ "step": 765
1153
+ },
1154
+ {
1155
+ "epoch": 0.7403846153846154,
1156
+ "grad_norm": 2.8809002810189233,
1157
+ "learning_rate": 7.093850869586572e-07,
1158
+ "loss": 0.8746,
1159
+ "step": 770
1160
+ },
1161
+ {
1162
+ "epoch": 0.7451923076923077,
1163
+ "grad_norm": 3.272891692948664,
1164
+ "learning_rate": 6.998255518965055e-07,
1165
+ "loss": 0.8711,
1166
+ "step": 775
1167
+ },
1168
+ {
1169
+ "epoch": 0.75,
1170
+ "grad_norm": 3.1649449172099073,
1171
+ "learning_rate": 6.905327809004765e-07,
1172
+ "loss": 0.8073,
1173
+ "step": 780
1174
+ },
1175
+ {
1176
+ "epoch": 0.7548076923076923,
1177
+ "grad_norm": 2.862835029692555,
1178
+ "learning_rate": 6.815042452680482e-07,
1179
+ "loss": 0.852,
1180
+ "step": 785
1181
+ },
1182
+ {
1183
+ "epoch": 0.7596153846153846,
1184
+ "grad_norm": 4.777839902626332,
1185
+ "learning_rate": 6.727373918608166e-07,
1186
+ "loss": 0.7941,
1187
+ "step": 790
1188
+ },
1189
+ {
1190
+ "epoch": 0.7644230769230769,
1191
+ "grad_norm": 3.4663518671110403,
1192
+ "learning_rate": 6.642296423820508e-07,
1193
+ "loss": 0.8553,
1194
+ "step": 795
1195
+ },
1196
+ {
1197
+ "epoch": 0.7692307692307693,
1198
+ "grad_norm": 3.062550953679388,
1199
+ "learning_rate": 6.559783926179307e-07,
1200
+ "loss": 0.9623,
1201
+ "step": 800
1202
+ },
1203
+ {
1204
+ "epoch": 0.7692307692307693,
1205
+ "eval_loss": 0.9317355155944824,
1206
+ "eval_runtime": 21.1215,
1207
+ "eval_samples_per_second": 9.469,
1208
+ "eval_steps_per_second": 2.367,
1209
+ "step": 800
1210
+ },
1211
+ {
1212
+ "epoch": 0.7740384615384616,
1213
+ "grad_norm": 2.9850983787230145,
1214
+ "learning_rate": 6.479810116398562e-07,
1215
+ "loss": 0.9048,
1216
+ "step": 805
1217
+ },
1218
+ {
1219
+ "epoch": 0.7788461538461539,
1220
+ "grad_norm": 2.5686622431209387,
1221
+ "learning_rate": 6.40234840964976e-07,
1222
+ "loss": 0.7535,
1223
+ "step": 810
1224
+ },
1225
+ {
1226
+ "epoch": 0.7836538461538461,
1227
+ "grad_norm": 2.8469066270016894,
1228
+ "learning_rate": 6.327371936718024e-07,
1229
+ "loss": 0.8606,
1230
+ "step": 815
1231
+ },
1232
+ {
1233
+ "epoch": 0.7884615384615384,
1234
+ "grad_norm": 3.567677645668133,
1235
+ "learning_rate": 6.254853534674779e-07,
1236
+ "loss": 0.8133,
1237
+ "step": 820
1238
+ },
1239
+ {
1240
+ "epoch": 0.7932692307692307,
1241
+ "grad_norm": 2.331177876625003,
1242
+ "learning_rate": 6.184765737029068e-07,
1243
+ "loss": 0.921,
1244
+ "step": 825
1245
+ },
1246
+ {
1247
+ "epoch": 0.7980769230769231,
1248
+ "grad_norm": 2.684486602009453,
1249
+ "learning_rate": 6.117080763315794e-07,
1250
+ "loss": 0.8378,
1251
+ "step": 830
1252
+ },
1253
+ {
1254
+ "epoch": 0.8028846153846154,
1255
+ "grad_norm": 2.7951045757499546,
1256
+ "learning_rate": 6.051770508074766e-07,
1257
+ "loss": 0.7412,
1258
+ "step": 835
1259
+ },
1260
+ {
1261
+ "epoch": 0.8076923076923077,
1262
+ "grad_norm": 4.34395271902391,
1263
+ "learning_rate": 5.98880652916942e-07,
1264
+ "loss": 0.8488,
1265
+ "step": 840
1266
+ },
1267
+ {
1268
+ "epoch": 0.8125,
1269
+ "grad_norm": 2.4901987068339175,
1270
+ "learning_rate": 5.928160035388477e-07,
1271
+ "loss": 0.7888,
1272
+ "step": 845
1273
+ },
1274
+ {
1275
+ "epoch": 0.8173076923076923,
1276
+ "grad_norm": 3.410681331565254,
1277
+ "learning_rate": 5.869801873267336e-07,
1278
+ "loss": 0.9896,
1279
+ "step": 850
1280
+ },
1281
+ {
1282
+ "epoch": 0.8221153846153846,
1283
+ "grad_norm": 3.0373771991309715,
1284
+ "learning_rate": 5.813702513058679e-07,
1285
+ "loss": 0.7731,
1286
+ "step": 855
1287
+ },
1288
+ {
1289
+ "epoch": 0.8269230769230769,
1290
+ "grad_norm": 2.6095155256301656,
1291
+ "learning_rate": 5.759832033773325e-07,
1292
+ "loss": 0.9015,
1293
+ "step": 860
1294
+ },
1295
+ {
1296
+ "epoch": 0.8317307692307693,
1297
+ "grad_norm": 3.499761379842187,
1298
+ "learning_rate": 5.708160107202719e-07,
1299
+ "loss": 0.8423,
1300
+ "step": 865
1301
+ },
1302
+ {
1303
+ "epoch": 0.8365384615384616,
1304
+ "grad_norm": 2.63663041754238,
1305
+ "learning_rate": 5.658655980823239e-07,
1306
+ "loss": 0.8807,
1307
+ "step": 870
1308
+ },
1309
+ {
1310
+ "epoch": 0.8413461538461539,
1311
+ "grad_norm": 3.943874822020016,
1312
+ "learning_rate": 5.611288459469594e-07,
1313
+ "loss": 0.8609,
1314
+ "step": 875
1315
+ },
1316
+ {
1317
+ "epoch": 0.8461538461538461,
1318
+ "grad_norm": 2.9004043511306525,
1319
+ "learning_rate": 5.566025885649524e-07,
1320
+ "loss": 0.9654,
1321
+ "step": 880
1322
+ },
1323
+ {
1324
+ "epoch": 0.8461538461538461,
1325
+ "eval_loss": 0.9302033185958862,
1326
+ "eval_runtime": 21.0263,
1327
+ "eval_samples_per_second": 9.512,
1328
+ "eval_steps_per_second": 2.378,
1329
+ "step": 880
1330
+ },
1331
+ {
1332
+ "epoch": 0.8509615384615384,
1333
+ "grad_norm": 3.182299494802371,
1334
+ "learning_rate": 5.522836118354419e-07,
1335
+ "loss": 0.7406,
1336
+ "step": 885
1337
+ },
1338
+ {
1339
+ "epoch": 0.8557692307692307,
1340
+ "grad_norm": 3.1170335107274214,
1341
+ "learning_rate": 5.481686510199858e-07,
1342
+ "loss": 0.9893,
1343
+ "step": 890
1344
+ },
1345
+ {
1346
+ "epoch": 0.8605769230769231,
1347
+ "grad_norm": 2.437332494806209,
1348
+ "learning_rate": 5.442543882705713e-07,
1349
+ "loss": 0.9432,
1350
+ "step": 895
1351
+ },
1352
+ {
1353
+ "epoch": 0.8653846153846154,
1354
+ "grad_norm": 3.248411155382253,
1355
+ "learning_rate": 5.405374499496658e-07,
1356
+ "loss": 0.8199,
1357
+ "step": 900
1358
+ },
1359
+ {
1360
+ "epoch": 0.8701923076923077,
1361
+ "grad_norm": 3.699605668699813,
1362
+ "learning_rate": 5.370144037169503e-07,
1363
+ "loss": 0.8742,
1364
+ "step": 905
1365
+ },
1366
+ {
1367
+ "epoch": 0.875,
1368
+ "grad_norm": 4.418113021858762,
1369
+ "learning_rate": 5.336817553532644e-07,
1370
+ "loss": 0.8431,
1371
+ "step": 910
1372
+ },
1373
+ {
1374
+ "epoch": 0.8798076923076923,
1375
+ "grad_norm": 2.3988015404279874,
1376
+ "learning_rate": 5.305359452873153e-07,
1377
+ "loss": 0.8947,
1378
+ "step": 915
1379
+ },
1380
+ {
1381
+ "epoch": 0.8846153846153846,
1382
+ "grad_norm": 3.0267726009783766,
1383
+ "learning_rate": 5.275733447846792e-07,
1384
+ "loss": 0.7263,
1385
+ "step": 920
1386
+ },
1387
+ {
1388
+ "epoch": 0.8894230769230769,
1389
+ "grad_norm": 3.722228079235539,
1390
+ "learning_rate": 5.247902517512378e-07,
1391
+ "loss": 0.8365,
1392
+ "step": 925
1393
+ },
1394
+ {
1395
+ "epoch": 0.8942307692307693,
1396
+ "grad_norm": 2.603232021464912,
1397
+ "learning_rate": 5.221828860941111e-07,
1398
+ "loss": 1.0223,
1399
+ "step": 930
1400
+ },
1401
+ {
1402
+ "epoch": 0.8990384615384616,
1403
+ "grad_norm": 2.784717139792509,
1404
+ "learning_rate": 5.197473845718411e-07,
1405
+ "loss": 0.8666,
1406
+ "step": 935
1407
+ },
1408
+ {
1409
+ "epoch": 0.9038461538461539,
1410
+ "grad_norm": 2.864173244146164,
1411
+ "learning_rate": 5.174797950514308e-07,
1412
+ "loss": 0.7097,
1413
+ "step": 940
1414
+ },
1415
+ {
1416
+ "epoch": 0.9086538461538461,
1417
+ "grad_norm": 3.1016453769012395,
1418
+ "learning_rate": 5.153760700719024e-07,
1419
+ "loss": 0.9475,
1420
+ "step": 945
1421
+ },
1422
+ {
1423
+ "epoch": 0.9134615384615384,
1424
+ "grad_norm": 3.5038468947729973,
1425
+ "learning_rate": 5.13432059591097e-07,
1426
+ "loss": 0.8123,
1427
+ "step": 950
1428
+ },
1429
+ {
1430
+ "epoch": 0.9182692307692307,
1431
+ "grad_norm": 3.2927805818210407,
1432
+ "learning_rate": 5.116435027627297e-07,
1433
+ "loss": 0.8134,
1434
+ "step": 955
1435
+ },
1436
+ {
1437
+ "epoch": 0.9230769230769231,
1438
+ "grad_norm": 2.3328148005143747,
1439
+ "learning_rate": 5.100060185517474e-07,
1440
+ "loss": 0.9169,
1441
+ "step": 960
1442
+ },
1443
+ {
1444
+ "epoch": 0.9230769230769231,
1445
+ "eval_loss": 0.928638756275177,
1446
+ "eval_runtime": 21.0064,
1447
+ "eval_samples_per_second": 9.521,
1448
+ "eval_steps_per_second": 2.38,
1449
+ "step": 960
1450
+ },
1451
+ {
1452
+ "epoch": 0.9278846153846154,
1453
+ "grad_norm": 3.644838748812858,
1454
+ "learning_rate": 5.085150949442101e-07,
1455
+ "loss": 0.7718,
1456
+ "step": 965
1457
+ },
1458
+ {
1459
+ "epoch": 0.9326923076923077,
1460
+ "grad_norm": 2.7559502909140505,
1461
+ "learning_rate": 5.071660764378547e-07,
1462
+ "loss": 0.9096,
1463
+ "step": 970
1464
+ },
1465
+ {
1466
+ "epoch": 0.9375,
1467
+ "grad_norm": 2.5970949524935363,
1468
+ "learning_rate": 5.059541494031398e-07,
1469
+ "loss": 0.8835,
1470
+ "step": 975
1471
+ },
1472
+ {
1473
+ "epoch": 0.9423076923076923,
1474
+ "grad_norm": 2.1523312550723066,
1475
+ "learning_rate": 5.048743247693103e-07,
1476
+ "loss": 0.8909,
1477
+ "step": 980
1478
+ },
1479
+ {
1480
+ "epoch": 0.9471153846153846,
1481
+ "grad_norm": 5.2539613787039885,
1482
+ "learning_rate": 5.039214172958587e-07,
1483
+ "loss": 0.8688,
1484
+ "step": 985
1485
+ },
1486
+ {
1487
+ "epoch": 0.9519230769230769,
1488
+ "grad_norm": 2.9606045980250837,
1489
+ "learning_rate": 5.030900204036544e-07,
1490
+ "loss": 0.8714,
1491
+ "step": 990
1492
+ },
1493
+ {
1494
+ "epoch": 0.9567307692307693,
1495
+ "grad_norm": 2.939313716550038,
1496
+ "learning_rate": 5.023744751055416e-07,
1497
+ "loss": 0.9248,
1498
+ "step": 995
1499
+ },
1500
+ {
1501
+ "epoch": 0.9615384615384616,
1502
+ "grad_norm": 2.7776091933130473,
1503
+ "learning_rate": 5.017688308926548e-07,
1504
+ "loss": 0.8965,
1505
+ "step": 1000
1506
+ },
1507
+ {
1508
+ "epoch": 0.9663461538461539,
1509
+ "grad_norm": 3.3105407766408685,
1510
+ "learning_rate": 5.012667953109271e-07,
1511
+ "loss": 0.8606,
1512
+ "step": 1005
1513
+ },
1514
+ {
1515
+ "epoch": 0.9711538461538461,
1516
+ "grad_norm": 7.289088245652649,
1517
+ "learning_rate": 5.008616670245212e-07,
1518
+ "loss": 0.8847,
1519
+ "step": 1010
1520
+ },
1521
+ {
1522
+ "epoch": 0.9759615384615384,
1523
+ "grad_norm": 4.342531181739036,
1524
+ "learning_rate": 5.005462435953572e-07,
1525
+ "loss": 0.7237,
1526
+ "step": 1015
1527
+ },
1528
+ {
1529
+ "epoch": 0.9807692307692307,
1530
+ "grad_norm": 3.3798170801004304,
1531
+ "learning_rate": 5.003126880797421e-07,
1532
+ "loss": 0.9875,
1533
+ "step": 1020
1534
+ },
1535
+ {
1536
+ "epoch": 0.9855769230769231,
1537
+ "grad_norm": 2.413281341822416,
1538
+ "learning_rate": 5.00152322649041e-07,
1539
+ "loss": 0.8558,
1540
+ "step": 1025
1541
+ },
1542
+ {
1543
+ "epoch": 0.9903846153846154,
1544
+ "grad_norm": 3.479845931889368,
1545
+ "learning_rate": 5.000552759653955e-07,
1546
+ "loss": 0.6462,
1547
+ "step": 1030
1548
+ },
1549
+ {
1550
+ "epoch": 0.9951923076923077,
1551
+ "grad_norm": 3.6411495658522273,
1552
+ "learning_rate": 5.000097715024919e-07,
1553
+ "loss": 0.7703,
1554
+ "step": 1035
1555
+ },
1556
+ {
1557
+ "epoch": 1.0,
1558
+ "grad_norm": 2.04941647733406,
1559
+ "learning_rate": 5e-07,
1560
+ "loss": 0.9005,
1561
+ "step": 1040
1562
+ },
1563
+ {
1564
+ "epoch": 1.0,
1565
+ "eval_loss": 0.9287646412849426,
1566
+ "eval_runtime": 21.1549,
1567
+ "eval_samples_per_second": 9.454,
1568
+ "eval_steps_per_second": 2.364,
1569
+ "step": 1040
1570
+ },
1571
+ {
1572
+ "epoch": 1.0,
1573
+ "step": 1040,
1574
+ "total_flos": 8.200255844856627e+16,
1575
+ "train_loss": 0.8882445046534905,
1576
+ "train_runtime": 9157.2611,
1577
+ "train_samples_per_second": 3.18,
1578
+ "train_steps_per_second": 0.114
1579
+ }
1580
+ ],
1581
+ "logging_steps": 5,
1582
+ "max_steps": 1040,
1583
+ "num_input_tokens_seen": 0,
1584
+ "num_train_epochs": 1,
1585
+ "save_steps": 1040,
1586
+ "stateful_callbacks": {
1587
+ "TrainerControl": {
1588
+ "args": {
1589
+ "should_epoch_stop": false,
1590
+ "should_evaluate": false,
1591
+ "should_log": false,
1592
+ "should_save": true,
1593
+ "should_training_stop": true
1594
+ },
1595
+ "attributes": {}
1596
+ }
1597
+ },
1598
+ "total_flos": 8.200255844856627e+16,
1599
+ "train_batch_size": 2,
1600
+ "trial_name": null,
1601
+ "trial_params": null
1602
+ }
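For readers who want the evaluation curve rather than the raw diff, a short sketch that parses the trainer_state.json added above and prints the eval_loss logged every 80 steps. Field names are taken from the log_history entries in this commit; the local file path is an assumption.

```python
# Sketch: extract the evaluation-loss curve from the trainer_state.json added
# in this commit. Assumes the file has been downloaded to the working directory.
import json

with open("trainer_state.json") as f:
    state = json.load(f)

# log_history mixes training steps ("loss") and evaluation steps ("eval_loss");
# keep only the evaluation entries, which appear every eval_steps (80) steps.
eval_points = [(entry["step"], entry["eval_loss"])
               for entry in state["log_history"] if "eval_loss" in entry]

for step, loss in eval_points:
    print(f"step {step:>4}: eval_loss {loss:.4f}")
```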