End of training

Browse files

Files changed (7) hide show

README.md +2 -1
all_results.json +12 -0
eval_results.json +7 -0
train_results.json +8 -0
trainer_state.json +892 -0
training_eval_loss.png +0 -0
training_loss.png +0 -0

README.md CHANGED Viewed

@@ -4,6 +4,7 @@ license: llama3.1
 base_model: meta-llama/Llama-3.1-8B
 tags:
 - llama-factory
 - generated_from_trainer
 model-index:
 - name: oh-dcft-v1.1-no-curation
@@ -15,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
 # oh-dcft-v1.1-no-curation
-This model is a fine-tuned version of [meta-llama/Llama-3.1-8B](https://huggingface.co/meta-llama/Llama-3.1-8B) on an unknown dataset.
 It achieves the following results on the evaluation set:
 - Loss: 0.4714

 base_model: meta-llama/Llama-3.1-8B
 tags:
 - llama-factory
+- full
 - generated_from_trainer
 model-index:
 - name: oh-dcft-v1.1-no-curation
 # oh-dcft-v1.1-no-curation
+This model is a fine-tuned version of [meta-llama/Llama-3.1-8B](https://huggingface.co/meta-llama/Llama-3.1-8B) on the mlfoundations-dev/oh-dcft-v1.1-no-curation dataset.
 It achieves the following results on the evaluation set:
 - Loss: 0.4714

all_results.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+    "epoch": 2.9980976537729864,
+    "eval_loss": 0.4713599979877472,
+    "eval_runtime": 132.103,
+    "eval_samples_per_second": 80.384,
+    "eval_steps_per_second": 0.628,
+    "total_flos": 1979475264798720.0,
+    "train_loss": 0.4632013658985067,
+    "train_runtime": 20039.4082,
+    "train_samples_per_second": 30.202,
+    "train_steps_per_second": 0.059
+}

eval_results.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+    "epoch": 2.9980976537729864,
+    "eval_loss": 0.4713599979877472,
+    "eval_runtime": 132.103,
+    "eval_samples_per_second": 80.384,
+    "eval_steps_per_second": 0.628
+}

train_results.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "epoch": 2.9980976537729864,
+    "total_flos": 1979475264798720.0,
+    "train_loss": 0.4632013658985067,
+    "train_runtime": 20039.4082,
+    "train_samples_per_second": 30.202,
+    "train_steps_per_second": 0.059
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,892 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.9980976537729864,
+  "eval_steps": 500,
+  "global_step": 1182,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.025364616360177554,
+      "grad_norm": 2.4958535272810582,
+      "learning_rate": 5e-06,
+      "loss": 0.7847,
+      "step": 10
+    },
+    {
+      "epoch": 0.05072923272035511,
+      "grad_norm": 1.332301089693953,
+      "learning_rate": 5e-06,
+      "loss": 0.6565,
+      "step": 20
+    },
+    {
+      "epoch": 0.07609384908053266,
+      "grad_norm": 0.8570758334622178,
+      "learning_rate": 5e-06,
+      "loss": 0.6087,
+      "step": 30
+    },
+    {
+      "epoch": 0.10145846544071022,
+      "grad_norm": 0.7961220667217911,
+      "learning_rate": 5e-06,
+      "loss": 0.5923,
+      "step": 40
+    },
+    {
+      "epoch": 0.12682308180088775,
+      "grad_norm": 1.0397565268140934,
+      "learning_rate": 5e-06,
+      "loss": 0.5693,
+      "step": 50
+    },
+    {
+      "epoch": 0.1521876981610653,
+      "grad_norm": 0.8562621904197287,
+      "learning_rate": 5e-06,
+      "loss": 0.5503,
+      "step": 60
+    },
+    {
+      "epoch": 0.17755231452124287,
+      "grad_norm": 0.8886037080258321,
+      "learning_rate": 5e-06,
+      "loss": 0.5498,
+      "step": 70
+    },
+    {
+      "epoch": 0.20291693088142043,
+      "grad_norm": 0.6838633568199429,
+      "learning_rate": 5e-06,
+      "loss": 0.534,
+      "step": 80
+    },
+    {
+      "epoch": 0.22828154724159797,
+      "grad_norm": 0.6556349806544307,
+      "learning_rate": 5e-06,
+      "loss": 0.535,
+      "step": 90
+    },
+    {
+      "epoch": 0.2536461636017755,
+      "grad_norm": 0.7469146203238889,
+      "learning_rate": 5e-06,
+      "loss": 0.5333,
+      "step": 100
+    },
+    {
+      "epoch": 0.27901077996195306,
+      "grad_norm": 0.5122739271343442,
+      "learning_rate": 5e-06,
+      "loss": 0.5172,
+      "step": 110
+    },
+    {
+      "epoch": 0.3043753963221306,
+      "grad_norm": 0.9622908612260581,
+      "learning_rate": 5e-06,
+      "loss": 0.5257,
+      "step": 120
+    },
+    {
+      "epoch": 0.3297400126823082,
+      "grad_norm": 0.6046903397133303,
+      "learning_rate": 5e-06,
+      "loss": 0.5151,
+      "step": 130
+    },
+    {
+      "epoch": 0.35510462904248574,
+      "grad_norm": 0.47575500456135494,
+      "learning_rate": 5e-06,
+      "loss": 0.5143,
+      "step": 140
+    },
+    {
+      "epoch": 0.3804692454026633,
+      "grad_norm": 0.7013205926571314,
+      "learning_rate": 5e-06,
+      "loss": 0.5144,
+      "step": 150
+    },
+    {
+      "epoch": 0.40583386176284086,
+      "grad_norm": 0.6351422540630048,
+      "learning_rate": 5e-06,
+      "loss": 0.5058,
+      "step": 160
+    },
+    {
+      "epoch": 0.43119847812301837,
+      "grad_norm": 0.6261693017885483,
+      "learning_rate": 5e-06,
+      "loss": 0.5102,
+      "step": 170
+    },
+    {
+      "epoch": 0.45656309448319593,
+      "grad_norm": 0.7605740230985341,
+      "learning_rate": 5e-06,
+      "loss": 0.5078,
+      "step": 180
+    },
+    {
+      "epoch": 0.4819277108433735,
+      "grad_norm": 0.5845861272533613,
+      "learning_rate": 5e-06,
+      "loss": 0.5043,
+      "step": 190
+    },
+    {
+      "epoch": 0.507292327203551,
+      "grad_norm": 0.7171099558889358,
+      "learning_rate": 5e-06,
+      "loss": 0.5053,
+      "step": 200
+    },
+    {
+      "epoch": 0.5326569435637286,
+      "grad_norm": 0.5158539718235372,
+      "learning_rate": 5e-06,
+      "loss": 0.5066,
+      "step": 210
+    },
+    {
+      "epoch": 0.5580215599239061,
+      "grad_norm": 0.7716179700630799,
+      "learning_rate": 5e-06,
+      "loss": 0.5037,
+      "step": 220
+    },
+    {
+      "epoch": 0.5833861762840837,
+      "grad_norm": 0.5356639716385265,
+      "learning_rate": 5e-06,
+      "loss": 0.5077,
+      "step": 230
+    },
+    {
+      "epoch": 0.6087507926442612,
+      "grad_norm": 0.5909560901543055,
+      "learning_rate": 5e-06,
+      "loss": 0.4978,
+      "step": 240
+    },
+    {
+      "epoch": 0.6341154090044389,
+      "grad_norm": 0.5612449176342577,
+      "learning_rate": 5e-06,
+      "loss": 0.4955,
+      "step": 250
+    },
+    {
+      "epoch": 0.6594800253646164,
+      "grad_norm": 0.8220158158926282,
+      "learning_rate": 5e-06,
+      "loss": 0.4932,
+      "step": 260
+    },
+    {
+      "epoch": 0.6848446417247939,
+      "grad_norm": 0.9803427711154427,
+      "learning_rate": 5e-06,
+      "loss": 0.4935,
+      "step": 270
+    },
+    {
+      "epoch": 0.7102092580849715,
+      "grad_norm": 0.7003489682973207,
+      "learning_rate": 5e-06,
+      "loss": 0.4921,
+      "step": 280
+    },
+    {
+      "epoch": 0.735573874445149,
+      "grad_norm": 0.7155818668831541,
+      "learning_rate": 5e-06,
+      "loss": 0.4941,
+      "step": 290
+    },
+    {
+      "epoch": 0.7609384908053266,
+      "grad_norm": 0.5641884255018443,
+      "learning_rate": 5e-06,
+      "loss": 0.4905,
+      "step": 300
+    },
+    {
+      "epoch": 0.7863031071655041,
+      "grad_norm": 0.5667685684791592,
+      "learning_rate": 5e-06,
+      "loss": 0.4972,
+      "step": 310
+    },
+    {
+      "epoch": 0.8116677235256817,
+      "grad_norm": 0.5424782856163526,
+      "learning_rate": 5e-06,
+      "loss": 0.4908,
+      "step": 320
+    },
+    {
+      "epoch": 0.8370323398858592,
+      "grad_norm": 0.555119069867457,
+      "learning_rate": 5e-06,
+      "loss": 0.49,
+      "step": 330
+    },
+    {
+      "epoch": 0.8623969562460367,
+      "grad_norm": 0.5540403091132209,
+      "learning_rate": 5e-06,
+      "loss": 0.4892,
+      "step": 340
+    },
+    {
+      "epoch": 0.8877615726062144,
+      "grad_norm": 0.6718528259146384,
+      "learning_rate": 5e-06,
+      "loss": 0.4879,
+      "step": 350
+    },
+    {
+      "epoch": 0.9131261889663919,
+      "grad_norm": 0.48504592421103015,
+      "learning_rate": 5e-06,
+      "loss": 0.4866,
+      "step": 360
+    },
+    {
+      "epoch": 0.9384908053265695,
+      "grad_norm": 0.5794400662308987,
+      "learning_rate": 5e-06,
+      "loss": 0.489,
+      "step": 370
+    },
+    {
+      "epoch": 0.963855421686747,
+      "grad_norm": 0.49175786205010735,
+      "learning_rate": 5e-06,
+      "loss": 0.4792,
+      "step": 380
+    },
+    {
+      "epoch": 0.9892200380469245,
+      "grad_norm": 0.48088824717550854,
+      "learning_rate": 5e-06,
+      "loss": 0.4793,
+      "step": 390
+    },
+    {
+      "epoch": 0.9993658845909955,
+      "eval_loss": 0.48458319902420044,
+      "eval_runtime": 140.5522,
+      "eval_samples_per_second": 75.552,
+      "eval_steps_per_second": 0.591,
+      "step": 394
+    },
+    {
+      "epoch": 1.014584654407102,
+      "grad_norm": 0.6241434976553506,
+      "learning_rate": 5e-06,
+      "loss": 0.4668,
+      "step": 400
+    },
+    {
+      "epoch": 1.0399492707672797,
+      "grad_norm": 0.5387091155966651,
+      "learning_rate": 5e-06,
+      "loss": 0.4467,
+      "step": 410
+    },
+    {
+      "epoch": 1.0653138871274572,
+      "grad_norm": 0.6088667420403366,
+      "learning_rate": 5e-06,
+      "loss": 0.4552,
+      "step": 420
+    },
+    {
+      "epoch": 1.0906785034876347,
+      "grad_norm": 0.7635188991702534,
+      "learning_rate": 5e-06,
+      "loss": 0.4569,
+      "step": 430
+    },
+    {
+      "epoch": 1.1160431198478122,
+      "grad_norm": 0.5202613636726365,
+      "learning_rate": 5e-06,
+      "loss": 0.4532,
+      "step": 440
+    },
+    {
+      "epoch": 1.1414077362079897,
+      "grad_norm": 0.5431289298627378,
+      "learning_rate": 5e-06,
+      "loss": 0.4552,
+      "step": 450
+    },
+    {
+      "epoch": 1.1667723525681675,
+      "grad_norm": 0.5447516747773636,
+      "learning_rate": 5e-06,
+      "loss": 0.4517,
+      "step": 460
+    },
+    {
+      "epoch": 1.192136968928345,
+      "grad_norm": 0.5811733767557097,
+      "learning_rate": 5e-06,
+      "loss": 0.4596,
+      "step": 470
+    },
+    {
+      "epoch": 1.2175015852885225,
+      "grad_norm": 0.5291374404256166,
+      "learning_rate": 5e-06,
+      "loss": 0.4523,
+      "step": 480
+    },
+    {
+      "epoch": 1.2428662016487,
+      "grad_norm": 0.920406850160634,
+      "learning_rate": 5e-06,
+      "loss": 0.4512,
+      "step": 490
+    },
+    {
+      "epoch": 1.2682308180088775,
+      "grad_norm": 0.5379277068224477,
+      "learning_rate": 5e-06,
+      "loss": 0.4589,
+      "step": 500
+    },
+    {
+      "epoch": 1.2935954343690552,
+      "grad_norm": 0.6084288782824112,
+      "learning_rate": 5e-06,
+      "loss": 0.4476,
+      "step": 510
+    },
+    {
+      "epoch": 1.3189600507292327,
+      "grad_norm": 0.6373203390142074,
+      "learning_rate": 5e-06,
+      "loss": 0.4508,
+      "step": 520
+    },
+    {
+      "epoch": 1.3443246670894102,
+      "grad_norm": 0.5297816500484004,
+      "learning_rate": 5e-06,
+      "loss": 0.4519,
+      "step": 530
+    },
+    {
+      "epoch": 1.369689283449588,
+      "grad_norm": 0.5214550304276996,
+      "learning_rate": 5e-06,
+      "loss": 0.4507,
+      "step": 540
+    },
+    {
+      "epoch": 1.3950538998097652,
+      "grad_norm": 0.5932937282969459,
+      "learning_rate": 5e-06,
+      "loss": 0.4508,
+      "step": 550
+    },
+    {
+      "epoch": 1.420418516169943,
+      "grad_norm": 0.5015573262400715,
+      "learning_rate": 5e-06,
+      "loss": 0.4529,
+      "step": 560
+    },
+    {
+      "epoch": 1.4457831325301205,
+      "grad_norm": 0.6541003393290922,
+      "learning_rate": 5e-06,
+      "loss": 0.4487,
+      "step": 570
+    },
+    {
+      "epoch": 1.471147748890298,
+      "grad_norm": 0.4738510019221813,
+      "learning_rate": 5e-06,
+      "loss": 0.4437,
+      "step": 580
+    },
+    {
+      "epoch": 1.4965123652504757,
+      "grad_norm": 0.5284328908203406,
+      "learning_rate": 5e-06,
+      "loss": 0.449,
+      "step": 590
+    },
+    {
+      "epoch": 1.521876981610653,
+      "grad_norm": 0.5814801147707117,
+      "learning_rate": 5e-06,
+      "loss": 0.4498,
+      "step": 600
+    },
+    {
+      "epoch": 1.5472415979708307,
+      "grad_norm": 0.7380939259733779,
+      "learning_rate": 5e-06,
+      "loss": 0.4574,
+      "step": 610
+    },
+    {
+      "epoch": 1.5726062143310082,
+      "grad_norm": 0.5158189079851289,
+      "learning_rate": 5e-06,
+      "loss": 0.4553,
+      "step": 620
+    },
+    {
+      "epoch": 1.5979708306911857,
+      "grad_norm": 0.7517859976181999,
+      "learning_rate": 5e-06,
+      "loss": 0.4479,
+      "step": 630
+    },
+    {
+      "epoch": 1.6233354470513635,
+      "grad_norm": 0.4624484508717309,
+      "learning_rate": 5e-06,
+      "loss": 0.4484,
+      "step": 640
+    },
+    {
+      "epoch": 1.6487000634115407,
+      "grad_norm": 0.6517886187802472,
+      "learning_rate": 5e-06,
+      "loss": 0.4479,
+      "step": 650
+    },
+    {
+      "epoch": 1.6740646797717185,
+      "grad_norm": 0.5168694302612785,
+      "learning_rate": 5e-06,
+      "loss": 0.4498,
+      "step": 660
+    },
+    {
+      "epoch": 1.699429296131896,
+      "grad_norm": 0.5442235822761647,
+      "learning_rate": 5e-06,
+      "loss": 0.4546,
+      "step": 670
+    },
+    {
+      "epoch": 1.7247939124920735,
+      "grad_norm": 0.5866332538354704,
+      "learning_rate": 5e-06,
+      "loss": 0.4502,
+      "step": 680
+    },
+    {
+      "epoch": 1.7501585288522512,
+      "grad_norm": 0.5771993285709256,
+      "learning_rate": 5e-06,
+      "loss": 0.4489,
+      "step": 690
+    },
+    {
+      "epoch": 1.7755231452124287,
+      "grad_norm": 0.5856601574541924,
+      "learning_rate": 5e-06,
+      "loss": 0.4516,
+      "step": 700
+    },
+    {
+      "epoch": 1.8008877615726062,
+      "grad_norm": 0.5219735572020098,
+      "learning_rate": 5e-06,
+      "loss": 0.4504,
+      "step": 710
+    },
+    {
+      "epoch": 1.8262523779327837,
+      "grad_norm": 0.5294326989128105,
+      "learning_rate": 5e-06,
+      "loss": 0.4512,
+      "step": 720
+    },
+    {
+      "epoch": 1.8516169942929612,
+      "grad_norm": 0.5043747110843602,
+      "learning_rate": 5e-06,
+      "loss": 0.4476,
+      "step": 730
+    },
+    {
+      "epoch": 1.876981610653139,
+      "grad_norm": 0.5243372113736201,
+      "learning_rate": 5e-06,
+      "loss": 0.4487,
+      "step": 740
+    },
+    {
+      "epoch": 1.9023462270133165,
+      "grad_norm": 0.46489075414726855,
+      "learning_rate": 5e-06,
+      "loss": 0.4477,
+      "step": 750
+    },
+    {
+      "epoch": 1.927710843373494,
+      "grad_norm": 0.47070137502563003,
+      "learning_rate": 5e-06,
+      "loss": 0.4491,
+      "step": 760
+    },
+    {
+      "epoch": 1.9530754597336717,
+      "grad_norm": 0.5114250833346574,
+      "learning_rate": 5e-06,
+      "loss": 0.4497,
+      "step": 770
+    },
+    {
+      "epoch": 1.978440076093849,
+      "grad_norm": 0.44673587993328173,
+      "learning_rate": 5e-06,
+      "loss": 0.4461,
+      "step": 780
+    },
+    {
+      "epoch": 1.9987317691819912,
+      "eval_loss": 0.4722590744495392,
+      "eval_runtime": 136.3111,
+      "eval_samples_per_second": 77.903,
+      "eval_steps_per_second": 0.609,
+      "step": 788
+    },
+    {
+      "epoch": 2.0038046924540267,
+      "grad_norm": 0.5253387493826779,
+      "learning_rate": 5e-06,
+      "loss": 0.4406,
+      "step": 790
+    },
+    {
+      "epoch": 2.029169308814204,
+      "grad_norm": 0.5649979072148124,
+      "learning_rate": 5e-06,
+      "loss": 0.4174,
+      "step": 800
+    },
+    {
+      "epoch": 2.0545339251743817,
+      "grad_norm": 0.554555886277626,
+      "learning_rate": 5e-06,
+      "loss": 0.416,
+      "step": 810
+    },
+    {
+      "epoch": 2.0798985415345594,
+      "grad_norm": 0.5380213608538502,
+      "learning_rate": 5e-06,
+      "loss": 0.4075,
+      "step": 820
+    },
+    {
+      "epoch": 2.1052631578947367,
+      "grad_norm": 0.6590004861365489,
+      "learning_rate": 5e-06,
+      "loss": 0.415,
+      "step": 830
+    },
+    {
+      "epoch": 2.1306277742549145,
+      "grad_norm": 0.5145129946467305,
+      "learning_rate": 5e-06,
+      "loss": 0.4096,
+      "step": 840
+    },
+    {
+      "epoch": 2.1559923906150917,
+      "grad_norm": 0.572199886696882,
+      "learning_rate": 5e-06,
+      "loss": 0.4189,
+      "step": 850
+    },
+    {
+      "epoch": 2.1813570069752695,
+      "grad_norm": 0.5756593969633285,
+      "learning_rate": 5e-06,
+      "loss": 0.4201,
+      "step": 860
+    },
+    {
+      "epoch": 2.206721623335447,
+      "grad_norm": 0.5265898189979799,
+      "learning_rate": 5e-06,
+      "loss": 0.4116,
+      "step": 870
+    },
+    {
+      "epoch": 2.2320862396956245,
+      "grad_norm": 0.5424672160350248,
+      "learning_rate": 5e-06,
+      "loss": 0.4099,
+      "step": 880
+    },
+    {
+      "epoch": 2.257450856055802,
+      "grad_norm": 0.5674446384978195,
+      "learning_rate": 5e-06,
+      "loss": 0.416,
+      "step": 890
+    },
+    {
+      "epoch": 2.2828154724159795,
+      "grad_norm": 0.5128282183689237,
+      "learning_rate": 5e-06,
+      "loss": 0.4157,
+      "step": 900
+    },
+    {
+      "epoch": 2.308180088776157,
+      "grad_norm": 0.5135015935006935,
+      "learning_rate": 5e-06,
+      "loss": 0.4172,
+      "step": 910
+    },
+    {
+      "epoch": 2.333544705136335,
+      "grad_norm": 0.596189153928778,
+      "learning_rate": 5e-06,
+      "loss": 0.4152,
+      "step": 920
+    },
+    {
+      "epoch": 2.3589093214965122,
+      "grad_norm": 0.5352826549369347,
+      "learning_rate": 5e-06,
+      "loss": 0.4149,
+      "step": 930
+    },
+    {
+      "epoch": 2.38427393785669,
+      "grad_norm": 0.5014349895803593,
+      "learning_rate": 5e-06,
+      "loss": 0.4141,
+      "step": 940
+    },
+    {
+      "epoch": 2.4096385542168672,
+      "grad_norm": 0.4805462505254729,
+      "learning_rate": 5e-06,
+      "loss": 0.4176,
+      "step": 950
+    },
+    {
+      "epoch": 2.435003170577045,
+      "grad_norm": 0.6137290218711765,
+      "learning_rate": 5e-06,
+      "loss": 0.419,
+      "step": 960
+    },
+    {
+      "epoch": 2.4603677869372227,
+      "grad_norm": 0.5732682054062723,
+      "learning_rate": 5e-06,
+      "loss": 0.4163,
+      "step": 970
+    },
+    {
+      "epoch": 2.4857324032974,
+      "grad_norm": 0.4771910554061346,
+      "learning_rate": 5e-06,
+      "loss": 0.4134,
+      "step": 980
+    },
+    {
+      "epoch": 2.5110970196575777,
+      "grad_norm": 0.476330897847943,
+      "learning_rate": 5e-06,
+      "loss": 0.4225,
+      "step": 990
+    },
+    {
+      "epoch": 2.536461636017755,
+      "grad_norm": 0.47973764991876255,
+      "learning_rate": 5e-06,
+      "loss": 0.4145,
+      "step": 1000
+    },
+    {
+      "epoch": 2.5618262523779327,
+      "grad_norm": 0.5939904213084772,
+      "learning_rate": 5e-06,
+      "loss": 0.4153,
+      "step": 1010
+    },
+    {
+      "epoch": 2.5871908687381104,
+      "grad_norm": 0.5936679428712734,
+      "learning_rate": 5e-06,
+      "loss": 0.4204,
+      "step": 1020
+    },
+    {
+      "epoch": 2.6125554850982877,
+      "grad_norm": 0.5188426106745951,
+      "learning_rate": 5e-06,
+      "loss": 0.4183,
+      "step": 1030
+    },
+    {
+      "epoch": 2.6379201014584654,
+      "grad_norm": 0.5644339619977095,
+      "learning_rate": 5e-06,
+      "loss": 0.4126,
+      "step": 1040
+    },
+    {
+      "epoch": 2.6632847178186427,
+      "grad_norm": 0.6020266606747191,
+      "learning_rate": 5e-06,
+      "loss": 0.4186,
+      "step": 1050
+    },
+    {
+      "epoch": 2.6886493341788205,
+      "grad_norm": 0.4752185053914476,
+      "learning_rate": 5e-06,
+      "loss": 0.4138,
+      "step": 1060
+    },
+    {
+      "epoch": 2.714013950538998,
+      "grad_norm": 0.7626568079783347,
+      "learning_rate": 5e-06,
+      "loss": 0.4135,
+      "step": 1070
+    },
+    {
+      "epoch": 2.739378566899176,
+      "grad_norm": 0.5108017704950135,
+      "learning_rate": 5e-06,
+      "loss": 0.4154,
+      "step": 1080
+    },
+    {
+      "epoch": 2.764743183259353,
+      "grad_norm": 0.5746749293115092,
+      "learning_rate": 5e-06,
+      "loss": 0.4173,
+      "step": 1090
+    },
+    {
+      "epoch": 2.7901077996195305,
+      "grad_norm": 0.5467822052037948,
+      "learning_rate": 5e-06,
+      "loss": 0.4166,
+      "step": 1100
+    },
+    {
+      "epoch": 2.815472415979708,
+      "grad_norm": 0.6357622704499519,
+      "learning_rate": 5e-06,
+      "loss": 0.4198,
+      "step": 1110
+    },
+    {
+      "epoch": 2.840837032339886,
+      "grad_norm": 0.7346508445377833,
+      "learning_rate": 5e-06,
+      "loss": 0.4161,
+      "step": 1120
+    },
+    {
+      "epoch": 2.8662016487000637,
+      "grad_norm": 0.4767595766550471,
+      "learning_rate": 5e-06,
+      "loss": 0.4136,
+      "step": 1130
+    },
+    {
+      "epoch": 2.891566265060241,
+      "grad_norm": 0.5450967603642648,
+      "learning_rate": 5e-06,
+      "loss": 0.416,
+      "step": 1140
+    },
+    {
+      "epoch": 2.9169308814204187,
+      "grad_norm": 0.6310631600995659,
+      "learning_rate": 5e-06,
+      "loss": 0.4156,
+      "step": 1150
+    },
+    {
+      "epoch": 2.942295497780596,
+      "grad_norm": 0.4875236135766479,
+      "learning_rate": 5e-06,
+      "loss": 0.4135,
+      "step": 1160
+    },
+    {
+      "epoch": 2.9676601141407737,
+      "grad_norm": 0.5024341899279373,
+      "learning_rate": 5e-06,
+      "loss": 0.4185,
+      "step": 1170
+    },
+    {
+      "epoch": 2.9930247305009514,
+      "grad_norm": 0.4812185425989623,
+      "learning_rate": 5e-06,
+      "loss": 0.422,
+      "step": 1180
+    },
+    {
+      "epoch": 2.9980976537729864,
+      "eval_loss": 0.4713599979877472,
+      "eval_runtime": 132.9609,
+      "eval_samples_per_second": 79.866,
+      "eval_steps_per_second": 0.624,
+      "step": 1182
+    },
+    {
+      "epoch": 2.9980976537729864,
+      "step": 1182,
+      "total_flos": 1979475264798720.0,
+      "train_loss": 0.4632013658985067,
+      "train_runtime": 20039.4082,
+      "train_samples_per_second": 30.202,
+      "train_steps_per_second": 0.059
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 1182,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1979475264798720.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}

training_eval_loss.png ADDED Viewed

training_loss.png ADDED Viewed