Model save

Browse files

Files changed (4) hide show

README.md +69 -0
all_results.json +9 -0
train_results.json +9 -0
trainer_state.json +1394 -0

README.md ADDED Viewed

	@@ -0,0 +1,69 @@

+---
+license: llama3
+library_name: peft
+tags:
+- trl
+- sft
+- generated_from_trainer
+base_model: meta-llama/Meta-Llama-3-8B
+datasets:
+- generator
+model-index:
+- name: downstream_0.02p_seed42_level2_rare
+  results: []
+---
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+# downstream_0.02p_seed42_level2_rare
+This model is a fine-tuned version of [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) on the generator dataset.
+It achieves the following results on the evaluation set:
+- Loss: 1.1101
+## Model description
+More information needed
+## Intended uses & limitations
+More information needed
+## Training and evaluation data
+More information needed
+## Training procedure
+### Training hyperparameters
+The following hyperparameters were used during training:
+- learning_rate: 0.0002
+- train_batch_size: 4
+- eval_batch_size: 1
+- seed: 42
+- distributed_type: multi-GPU
+- num_devices: 4
+- gradient_accumulation_steps: 4
+- total_train_batch_size: 64
+- total_eval_batch_size: 4
+- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+- lr_scheduler_type: cosine
+- lr_scheduler_warmup_ratio: 0.1
+- num_epochs: 1
+### Training results
+| Training Loss | Epoch | Step | Validation Loss |
+|:-------------:|:-----:|:----:|:---------------:|
+| 1.071         | 1.0   | 959  | 1.1101          |
+### Framework versions
+- PEFT 0.11.1
+- Transformers 4.43.4
+- Pytorch 2.3.1+cu121
+- Datasets 2.19.1
+- Tokenizers 0.19.1

all_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 1.0,
+    "total_flos": 1.2060020938113024e+16,
+    "train_loss": 1.1071907986689657,
+    "train_runtime": 19793.4474,
+    "train_samples": 104971,
+    "train_samples_per_second": 3.101,
+    "train_steps_per_second": 0.048
+}

train_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 1.0,
+    "total_flos": 1.2060020938113024e+16,
+    "train_loss": 1.1071907986689657,
+    "train_runtime": 19793.4474,
+    "train_samples": 104971,
+    "train_samples_per_second": 3.101,
+    "train_steps_per_second": 0.048
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,1394 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 959,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0010427528675703858,
+      "grad_norm": 0.46784728546447485,
+      "learning_rate": 2.0833333333333334e-06,
+      "loss": 1.3538,
+      "step": 1
+    },
+    {
+      "epoch": 0.005213764337851929,
+      "grad_norm": 0.4299186002140457,
+      "learning_rate": 1.0416666666666668e-05,
+      "loss": 1.3601,
+      "step": 5
+    },
+    {
+      "epoch": 0.010427528675703858,
+      "grad_norm": 0.5025473100643405,
+      "learning_rate": 2.0833333333333336e-05,
+      "loss": 1.3602,
+      "step": 10
+    },
+    {
+      "epoch": 0.01564129301355579,
+      "grad_norm": 0.22754157012043386,
+      "learning_rate": 3.125e-05,
+      "loss": 1.3526,
+      "step": 15
+    },
+    {
+      "epoch": 0.020855057351407715,
+      "grad_norm": 0.14930678513355825,
+      "learning_rate": 4.166666666666667e-05,
+      "loss": 1.3157,
+      "step": 20
+    },
+    {
+      "epoch": 0.026068821689259645,
+      "grad_norm": 0.1493862762381592,
+      "learning_rate": 5.208333333333334e-05,
+      "loss": 1.2959,
+      "step": 25
+    },
+    {
+      "epoch": 0.03128258602711158,
+      "grad_norm": 0.14284419715578545,
+      "learning_rate": 6.25e-05,
+      "loss": 1.2774,
+      "step": 30
+    },
+    {
+      "epoch": 0.0364963503649635,
+      "grad_norm": 0.13626355620208974,
+      "learning_rate": 7.291666666666667e-05,
+      "loss": 1.2487,
+      "step": 35
+    },
+    {
+      "epoch": 0.04171011470281543,
+      "grad_norm": 0.10695646364370354,
+      "learning_rate": 8.333333333333334e-05,
+      "loss": 1.2206,
+      "step": 40
+    },
+    {
+      "epoch": 0.04692387904066736,
+      "grad_norm": 0.09407825046309691,
+      "learning_rate": 9.375e-05,
+      "loss": 1.2028,
+      "step": 45
+    },
+    {
+      "epoch": 0.05213764337851929,
+      "grad_norm": 0.08120415093523357,
+      "learning_rate": 0.00010416666666666667,
+      "loss": 1.1819,
+      "step": 50
+    },
+    {
+      "epoch": 0.05735140771637122,
+      "grad_norm": 0.07854484960093237,
+      "learning_rate": 0.00011458333333333333,
+      "loss": 1.1687,
+      "step": 55
+    },
+    {
+      "epoch": 0.06256517205422316,
+      "grad_norm": 0.06991743542761439,
+      "learning_rate": 0.000125,
+      "loss": 1.1476,
+      "step": 60
+    },
+    {
+      "epoch": 0.06777893639207508,
+      "grad_norm": 0.07010747792123796,
+      "learning_rate": 0.0001354166666666667,
+      "loss": 1.1566,
+      "step": 65
+    },
+    {
+      "epoch": 0.072992700729927,
+      "grad_norm": 0.06735485294531884,
+      "learning_rate": 0.00014583333333333335,
+      "loss": 1.1758,
+      "step": 70
+    },
+    {
+      "epoch": 0.07820646506777894,
+      "grad_norm": 0.07054279022592949,
+      "learning_rate": 0.00015625,
+      "loss": 1.1725,
+      "step": 75
+    },
+    {
+      "epoch": 0.08342022940563086,
+      "grad_norm": 0.07848476612174879,
+      "learning_rate": 0.0001666666666666667,
+      "loss": 1.1588,
+      "step": 80
+    },
+    {
+      "epoch": 0.0886339937434828,
+      "grad_norm": 0.07428124015010305,
+      "learning_rate": 0.00017708333333333335,
+      "loss": 1.1409,
+      "step": 85
+    },
+    {
+      "epoch": 0.09384775808133472,
+      "grad_norm": 0.07511025896318015,
+      "learning_rate": 0.0001875,
+      "loss": 1.1501,
+      "step": 90
+    },
+    {
+      "epoch": 0.09906152241918666,
+      "grad_norm": 0.08140724060317898,
+      "learning_rate": 0.0001979166666666667,
+      "loss": 1.1525,
+      "step": 95
+    },
+    {
+      "epoch": 0.10427528675703858,
+      "grad_norm": 0.09875089257447961,
+      "learning_rate": 0.00019998939866722242,
+      "loss": 1.1152,
+      "step": 100
+    },
+    {
+      "epoch": 0.10948905109489052,
+      "grad_norm": 0.10683544050076133,
+      "learning_rate": 0.00019994633460515538,
+      "loss": 1.1245,
+      "step": 105
+    },
+    {
+      "epoch": 0.11470281543274244,
+      "grad_norm": 0.10489956661825363,
+      "learning_rate": 0.00019987015948614691,
+      "loss": 1.1228,
+      "step": 110
+    },
+    {
+      "epoch": 0.11991657977059438,
+      "grad_norm": 0.07725971069252748,
+      "learning_rate": 0.00019976089854612702,
+      "loss": 1.1215,
+      "step": 115
+    },
+    {
+      "epoch": 0.1251303441084463,
+      "grad_norm": 0.07944990580241551,
+      "learning_rate": 0.000199618587981973,
+      "loss": 1.116,
+      "step": 120
+    },
+    {
+      "epoch": 0.13034410844629823,
+      "grad_norm": 0.0834401702395657,
+      "learning_rate": 0.00019944327493951773,
+      "loss": 1.1144,
+      "step": 125
+    },
+    {
+      "epoch": 0.13555787278415016,
+      "grad_norm": 0.08900854650193943,
+      "learning_rate": 0.00019923501749793097,
+      "loss": 1.1048,
+      "step": 130
+    },
+    {
+      "epoch": 0.14077163712200208,
+      "grad_norm": 0.08078088093283514,
+      "learning_rate": 0.0001989938846504783,
+      "loss": 1.1022,
+      "step": 135
+    },
+    {
+      "epoch": 0.145985401459854,
+      "grad_norm": 0.08120650571343319,
+      "learning_rate": 0.00019871995628166443,
+      "loss": 1.1269,
+      "step": 140
+    },
+    {
+      "epoch": 0.15119916579770595,
+      "grad_norm": 0.08251173811384645,
+      "learning_rate": 0.00019841332314076855,
+      "loss": 1.1049,
+      "step": 145
+    },
+    {
+      "epoch": 0.15641293013555788,
+      "grad_norm": 0.0811645274318103,
+      "learning_rate": 0.00019807408681177997,
+      "loss": 1.1132,
+      "step": 150
+    },
+    {
+      "epoch": 0.1616266944734098,
+      "grad_norm": 0.07646936117341427,
+      "learning_rate": 0.00019770235967974463,
+      "loss": 1.1058,
+      "step": 155
+    },
+    {
+      "epoch": 0.16684045881126172,
+      "grad_norm": 0.07385173118278343,
+      "learning_rate": 0.00019729826489353324,
+      "loss": 1.1209,
+      "step": 160
+    },
+    {
+      "epoch": 0.17205422314911367,
+      "grad_norm": 0.08021452641715054,
+      "learning_rate": 0.00019686193632504338,
+      "loss": 1.1049,
+      "step": 165
+    },
+    {
+      "epoch": 0.1772679874869656,
+      "grad_norm": 0.07668807421928069,
+      "learning_rate": 0.00019639351852484947,
+      "loss": 1.1106,
+      "step": 170
+    },
+    {
+      "epoch": 0.18248175182481752,
+      "grad_norm": 0.07892223313284867,
+      "learning_rate": 0.0001958931666743148,
+      "loss": 1.1161,
+      "step": 175
+    },
+    {
+      "epoch": 0.18769551616266944,
+      "grad_norm": 0.08190647698912198,
+      "learning_rate": 0.0001953610465341816,
+      "loss": 1.0996,
+      "step": 180
+    },
+    {
+      "epoch": 0.19290928050052136,
+      "grad_norm": 0.07800476181033798,
+      "learning_rate": 0.00019479733438965667,
+      "loss": 1.1247,
+      "step": 185
+    },
+    {
+      "epoch": 0.1981230448383733,
+      "grad_norm": 0.08349049942704777,
+      "learning_rate": 0.00019420221699200995,
+      "loss": 1.1154,
+      "step": 190
+    },
+    {
+      "epoch": 0.20333680917622524,
+      "grad_norm": 0.07929575871932729,
+      "learning_rate": 0.00019357589149670608,
+      "loss": 1.1184,
+      "step": 195
+    },
+    {
+      "epoch": 0.20855057351407716,
+      "grad_norm": 0.0735399192652975,
+      "learning_rate": 0.00019291856539808917,
+      "loss": 1.0968,
+      "step": 200
+    },
+    {
+      "epoch": 0.21376433785192908,
+      "grad_norm": 0.07775467151795544,
+      "learning_rate": 0.00019223045646064212,
+      "loss": 1.1122,
+      "step": 205
+    },
+    {
+      "epoch": 0.21897810218978103,
+      "grad_norm": 0.07608187355699407,
+      "learning_rate": 0.00019151179264684402,
+      "loss": 1.1129,
+      "step": 210
+    },
+    {
+      "epoch": 0.22419186652763295,
+      "grad_norm": 0.07668601971351766,
+      "learning_rate": 0.00019076281204164874,
+      "loss": 1.0979,
+      "step": 215
+    },
+    {
+      "epoch": 0.22940563086548488,
+      "grad_norm": 0.07230005483681756,
+      "learning_rate": 0.00018998376277361008,
+      "loss": 1.1215,
+      "step": 220
+    },
+    {
+      "epoch": 0.2346193952033368,
+      "grad_norm": 0.0733672154981931,
+      "learning_rate": 0.00018917490293267973,
+      "loss": 1.1021,
+      "step": 225
+    },
+    {
+      "epoch": 0.23983315954118875,
+      "grad_norm": 0.08070384712102877,
+      "learning_rate": 0.00018833650048470524,
+      "loss": 1.1131,
+      "step": 230
+    },
+    {
+      "epoch": 0.24504692387904067,
+      "grad_norm": 0.0789014320960882,
+      "learning_rate": 0.0001874688331826557,
+      "loss": 1.1226,
+      "step": 235
+    },
+    {
+      "epoch": 0.2502606882168926,
+      "grad_norm": 0.0713759955633646,
+      "learning_rate": 0.00018657218847460572,
+      "loss": 1.124,
+      "step": 240
+    },
+    {
+      "epoch": 0.25547445255474455,
+      "grad_norm": 0.07275793106627063,
+      "learning_rate": 0.00018564686340850708,
+      "loss": 1.1081,
+      "step": 245
+    },
+    {
+      "epoch": 0.26068821689259647,
+      "grad_norm": 0.07708474011231153,
+      "learning_rate": 0.0001846931645337803,
+      "loss": 1.094,
+      "step": 250
+    },
+    {
+      "epoch": 0.2659019812304484,
+      "grad_norm": 0.08547989244439409,
+      "learning_rate": 0.00018371140779975824,
+      "loss": 1.1068,
+      "step": 255
+    },
+    {
+      "epoch": 0.2711157455683003,
+      "grad_norm": 0.07980805306815425,
+      "learning_rate": 0.00018270191845101602,
+      "loss": 1.1097,
+      "step": 260
+    },
+    {
+      "epoch": 0.27632950990615224,
+      "grad_norm": 0.0771809814717238,
+      "learning_rate": 0.0001816650309196209,
+      "loss": 1.0993,
+      "step": 265
+    },
+    {
+      "epoch": 0.28154327424400416,
+      "grad_norm": 0.07825602560329317,
+      "learning_rate": 0.00018060108871433922,
+      "loss": 1.0965,
+      "step": 270
+    },
+    {
+      "epoch": 0.2867570385818561,
+      "grad_norm": 0.07544233821374238,
+      "learning_rate": 0.00017951044430683565,
+      "loss": 1.1077,
+      "step": 275
+    },
+    {
+      "epoch": 0.291970802919708,
+      "grad_norm": 0.08595174428600973,
+      "learning_rate": 0.00017839345901490367,
+      "loss": 1.1444,
+      "step": 280
+    },
+    {
+      "epoch": 0.29718456725756,
+      "grad_norm": 0.08036998457602522,
+      "learning_rate": 0.000177250502882765,
+      "loss": 1.1018,
+      "step": 285
+    },
+    {
+      "epoch": 0.3023983315954119,
+      "grad_norm": 0.0758408964978662,
+      "learning_rate": 0.0001760819545584783,
+      "loss": 1.1144,
+      "step": 290
+    },
+    {
+      "epoch": 0.30761209593326383,
+      "grad_norm": 0.08039719317248988,
+      "learning_rate": 0.00017488820116849757,
+      "loss": 1.0848,
+      "step": 295
+    },
+    {
+      "epoch": 0.31282586027111575,
+      "grad_norm": 0.08195730354878057,
+      "learning_rate": 0.0001736696381894216,
+      "loss": 1.1091,
+      "step": 300
+    },
+    {
+      "epoch": 0.3180396246089677,
+      "grad_norm": 0.0857915617590216,
+      "learning_rate": 0.0001724266693169772,
+      "loss": 1.1179,
+      "step": 305
+    },
+    {
+      "epoch": 0.3232533889468196,
+      "grad_norm": 0.0727205241081358,
+      "learning_rate": 0.00017115970633227936,
+      "loss": 1.1104,
+      "step": 310
+    },
+    {
+      "epoch": 0.3284671532846715,
+      "grad_norm": 0.08177574294024298,
+      "learning_rate": 0.00016986916896541323,
+      "loss": 1.1,
+      "step": 315
+    },
+    {
+      "epoch": 0.33368091762252344,
+      "grad_norm": 0.07784461413056201,
+      "learning_rate": 0.00016855548475638225,
+      "loss": 1.0934,
+      "step": 320
+    },
+    {
+      "epoch": 0.33889468196037537,
+      "grad_norm": 0.0708482833267521,
+      "learning_rate": 0.0001672190889134691,
+      "loss": 1.0958,
+      "step": 325
+    },
+    {
+      "epoch": 0.34410844629822734,
+      "grad_norm": 0.076390205474366,
+      "learning_rate": 0.0001658604241690564,
+      "loss": 1.105,
+      "step": 330
+    },
+    {
+      "epoch": 0.34932221063607927,
+      "grad_norm": 0.07309708286911666,
+      "learning_rate": 0.00016447994063295457,
+      "loss": 1.095,
+      "step": 335
+    },
+    {
+      "epoch": 0.3545359749739312,
+      "grad_norm": 0.07017926439528561,
+      "learning_rate": 0.00016307809564328551,
+      "loss": 1.1157,
+      "step": 340
+    },
+    {
+      "epoch": 0.3597497393117831,
+      "grad_norm": 0.07365098763599127,
+      "learning_rate": 0.00016165535361497218,
+      "loss": 1.1186,
+      "step": 345
+    },
+    {
+      "epoch": 0.36496350364963503,
+      "grad_norm": 0.07663605771628923,
+      "learning_rate": 0.00016021218588588298,
+      "loss": 1.0749,
+      "step": 350
+    },
+    {
+      "epoch": 0.37017726798748696,
+      "grad_norm": 0.07615854076662545,
+      "learning_rate": 0.0001587490705606832,
+      "loss": 1.0827,
+      "step": 355
+    },
+    {
+      "epoch": 0.3753910323253389,
+      "grad_norm": 0.0753345353309753,
+      "learning_rate": 0.00015726649235244455,
+      "loss": 1.1082,
+      "step": 360
+    },
+    {
+      "epoch": 0.3806047966631908,
+      "grad_norm": 0.07913485493599222,
+      "learning_rate": 0.00015576494242206508,
+      "loss": 1.1258,
+      "step": 365
+    },
+    {
+      "epoch": 0.3858185610010427,
+      "grad_norm": 0.07705620467095986,
+      "learning_rate": 0.0001542449182155537,
+      "loss": 1.0772,
+      "step": 370
+    },
+    {
+      "epoch": 0.3910323253388947,
+      "grad_norm": 0.07160314888274456,
+      "learning_rate": 0.00015270692329923176,
+      "loss": 1.0996,
+      "step": 375
+    },
+    {
+      "epoch": 0.3962460896767466,
+      "grad_norm": 0.08325118544541214,
+      "learning_rate": 0.0001511514671929076,
+      "loss": 1.0904,
+      "step": 380
+    },
+    {
+      "epoch": 0.40145985401459855,
+      "grad_norm": 0.07199438515468881,
+      "learning_rate": 0.00014957906520107845,
+      "loss": 1.0851,
+      "step": 385
+    },
+    {
+      "epoch": 0.40667361835245047,
+      "grad_norm": 0.07710028013752196,
+      "learning_rate": 0.00014799023824221613,
+      "loss": 1.1123,
+      "step": 390
+    },
+    {
+      "epoch": 0.4118873826903024,
+      "grad_norm": 0.07407414298584401,
+      "learning_rate": 0.0001463855126761928,
+      "loss": 1.0878,
+      "step": 395
+    },
+    {
+      "epoch": 0.4171011470281543,
+      "grad_norm": 0.07467701529997763,
+      "learning_rate": 0.0001447654201299041,
+      "loss": 1.0975,
+      "step": 400
+    },
+    {
+      "epoch": 0.42231491136600624,
+      "grad_norm": 0.07107152720346374,
+      "learning_rate": 0.00014313049732114715,
+      "loss": 1.0657,
+      "step": 405
+    },
+    {
+      "epoch": 0.42752867570385816,
+      "grad_norm": 0.07823458012876569,
+      "learning_rate": 0.00014148128588081256,
+      "loss": 1.1153,
+      "step": 410
+    },
+    {
+      "epoch": 0.43274244004171014,
+      "grad_norm": 0.08009223556489412,
+      "learning_rate": 0.0001398183321734481,
+      "loss": 1.0974,
+      "step": 415
+    },
+    {
+      "epoch": 0.43795620437956206,
+      "grad_norm": 0.07669197698500169,
+      "learning_rate": 0.00013814218711625457,
+      "loss": 1.0904,
+      "step": 420
+    },
+    {
+      "epoch": 0.443169968717414,
+      "grad_norm": 0.08468765745640104,
+      "learning_rate": 0.0001364534059965735,
+      "loss": 1.0808,
+      "step": 425
+    },
+    {
+      "epoch": 0.4483837330552659,
+      "grad_norm": 0.07933899715254406,
+      "learning_rate": 0.00013475254828792662,
+      "loss": 1.1232,
+      "step": 430
+    },
+    {
+      "epoch": 0.45359749739311783,
+      "grad_norm": 0.07992483417151192,
+      "learning_rate": 0.0001330401774646691,
+      "loss": 1.0752,
+      "step": 435
+    },
+    {
+      "epoch": 0.45881126173096975,
+      "grad_norm": 0.07194633805909506,
+      "learning_rate": 0.00013131686081531698,
+      "loss": 1.1021,
+      "step": 440
+    },
+    {
+      "epoch": 0.4640250260688217,
+      "grad_norm": 0.07408216210208647,
+      "learning_rate": 0.00012958316925461085,
+      "loss": 1.103,
+      "step": 445
+    },
+    {
+      "epoch": 0.4692387904066736,
+      "grad_norm": 0.07365408906170111,
+      "learning_rate": 0.00012783967713437882,
+      "loss": 1.1039,
+      "step": 450
+    },
+    {
+      "epoch": 0.4744525547445255,
+      "grad_norm": 0.07264336315306943,
+      "learning_rate": 0.0001260869620532601,
+      "loss": 1.0703,
+      "step": 455
+    },
+    {
+      "epoch": 0.4796663190823775,
+      "grad_norm": 0.0716348036748029,
+      "learning_rate": 0.00012432560466535362,
+      "loss": 1.078,
+      "step": 460
+    },
+    {
+      "epoch": 0.4848800834202294,
+      "grad_norm": 0.07275630175563283,
+      "learning_rate": 0.00012255618848785378,
+      "loss": 1.0931,
+      "step": 465
+    },
+    {
+      "epoch": 0.49009384775808135,
+      "grad_norm": 0.08738232111757031,
+      "learning_rate": 0.00012077929970773823,
+      "loss": 1.1046,
+      "step": 470
+    },
+    {
+      "epoch": 0.49530761209593327,
+      "grad_norm": 0.07302314407960024,
+      "learning_rate": 0.00011899552698757078,
+      "loss": 1.0797,
+      "step": 475
+    },
+    {
+      "epoch": 0.5005213764337852,
+      "grad_norm": 0.07630391551541794,
+      "learning_rate": 0.00011720546127048452,
+      "loss": 1.0853,
+      "step": 480
+    },
+    {
+      "epoch": 0.5057351407716372,
+      "grad_norm": 0.0786638943841889,
+      "learning_rate": 0.0001154096955844091,
+      "loss": 1.0936,
+      "step": 485
+    },
+    {
+      "epoch": 0.5109489051094891,
+      "grad_norm": 0.0738033508924629,
+      "learning_rate": 0.00011360882484560755,
+      "loss": 1.0947,
+      "step": 490
+    },
+    {
+      "epoch": 0.516162669447341,
+      "grad_norm": 0.07194256539748715,
+      "learning_rate": 0.00011180344566158739,
+      "loss": 1.0567,
+      "step": 495
+    },
+    {
+      "epoch": 0.5213764337851929,
+      "grad_norm": 0.07372207865811028,
+      "learning_rate": 0.0001099941561334515,
+      "loss": 1.096,
+      "step": 500
+    },
+    {
+      "epoch": 0.5265901981230449,
+      "grad_norm": 0.08419340992248064,
+      "learning_rate": 0.00010818155565775443,
+      "loss": 1.0906,
+      "step": 505
+    },
+    {
+      "epoch": 0.5318039624608968,
+      "grad_norm": 0.07572982107794511,
+      "learning_rate": 0.00010636624472792889,
+      "loss": 1.0672,
+      "step": 510
+    },
+    {
+      "epoch": 0.5370177267987487,
+      "grad_norm": 0.07601878254011342,
+      "learning_rate": 0.00010454882473534961,
+      "loss": 1.0828,
+      "step": 515
+    },
+    {
+      "epoch": 0.5422314911366006,
+      "grad_norm": 0.07348182851139312,
+      "learning_rate": 0.0001027298977700992,
+      "loss": 1.0725,
+      "step": 520
+    },
+    {
+      "epoch": 0.5474452554744526,
+      "grad_norm": 0.0744277902361585,
+      "learning_rate": 0.0001009100664215028,
+      "loss": 1.0895,
+      "step": 525
+    },
+    {
+      "epoch": 0.5526590198123045,
+      "grad_norm": 0.07570350615821111,
+      "learning_rate": 9.908993357849721e-05,
+      "loss": 1.0873,
+      "step": 530
+    },
+    {
+      "epoch": 0.5578727841501564,
+      "grad_norm": 0.07422866916118566,
+      "learning_rate": 9.727010222990082e-05,
+      "loss": 1.0812,
+      "step": 535
+    },
+    {
+      "epoch": 0.5630865484880083,
+      "grad_norm": 0.07362006062815561,
+      "learning_rate": 9.545117526465041e-05,
+      "loss": 1.0905,
+      "step": 540
+    },
+    {
+      "epoch": 0.5683003128258602,
+      "grad_norm": 0.0700033281234262,
+      "learning_rate": 9.363375527207111e-05,
+      "loss": 1.0682,
+      "step": 545
+    },
+    {
+      "epoch": 0.5735140771637122,
+      "grad_norm": 0.07230470722040953,
+      "learning_rate": 9.181844434224558e-05,
+      "loss": 1.0873,
+      "step": 550
+    },
+    {
+      "epoch": 0.5787278415015641,
+      "grad_norm": 0.08088691708607157,
+      "learning_rate": 9.000584386654853e-05,
+      "loss": 1.0785,
+      "step": 555
+    },
+    {
+      "epoch": 0.583941605839416,
+      "grad_norm": 0.0744369753893106,
+      "learning_rate": 8.819655433841262e-05,
+      "loss": 1.107,
+      "step": 560
+    },
+    {
+      "epoch": 0.5891553701772679,
+      "grad_norm": 0.07574963261021694,
+      "learning_rate": 8.639117515439248e-05,
+      "loss": 1.0854,
+      "step": 565
+    },
+    {
+      "epoch": 0.59436913451512,
+      "grad_norm": 0.07470955964117636,
+      "learning_rate": 8.459030441559091e-05,
+      "loss": 1.1039,
+      "step": 570
+    },
+    {
+      "epoch": 0.5995828988529719,
+      "grad_norm": 0.0753809437283073,
+      "learning_rate": 8.27945387295155e-05,
+      "loss": 1.097,
+      "step": 575
+    },
+    {
+      "epoch": 0.6047966631908238,
+      "grad_norm": 0.07151693665161514,
+      "learning_rate": 8.100447301242923e-05,
+      "loss": 1.1062,
+      "step": 580
+    },
+    {
+      "epoch": 0.6100104275286757,
+      "grad_norm": 0.07299808736310547,
+      "learning_rate": 7.92207002922618e-05,
+      "loss": 1.1024,
+      "step": 585
+    },
+    {
+      "epoch": 0.6152241918665277,
+      "grad_norm": 0.07463126953374469,
+      "learning_rate": 7.744381151214627e-05,
+      "loss": 1.0809,
+      "step": 590
+    },
+    {
+      "epoch": 0.6204379562043796,
+      "grad_norm": 0.07475965582424028,
+      "learning_rate": 7.567439533464639e-05,
+      "loss": 1.0708,
+      "step": 595
+    },
+    {
+      "epoch": 0.6256517205422315,
+      "grad_norm": 0.0733039750634976,
+      "learning_rate": 7.391303794673992e-05,
+      "loss": 1.1059,
+      "step": 600
+    },
+    {
+      "epoch": 0.6308654848800834,
+      "grad_norm": 0.08113303731253642,
+      "learning_rate": 7.216032286562122e-05,
+      "loss": 1.0891,
+      "step": 605
+    },
+    {
+      "epoch": 0.6360792492179353,
+      "grad_norm": 0.07598750215168182,
+      "learning_rate": 7.041683074538916e-05,
+      "loss": 1.0951,
+      "step": 610
+    },
+    {
+      "epoch": 0.6412930135557873,
+      "grad_norm": 0.07271525090431738,
+      "learning_rate": 6.868313918468305e-05,
+      "loss": 1.0708,
+      "step": 615
+    },
+    {
+      "epoch": 0.6465067778936392,
+      "grad_norm": 0.07349014640316023,
+      "learning_rate": 6.69598225353309e-05,
+      "loss": 1.0856,
+      "step": 620
+    },
+    {
+      "epoch": 0.6517205422314911,
+      "grad_norm": 0.07174197708446833,
+      "learning_rate": 6.524745171207339e-05,
+      "loss": 1.085,
+      "step": 625
+    },
+    {
+      "epoch": 0.656934306569343,
+      "grad_norm": 0.07579799503336161,
+      "learning_rate": 6.354659400342653e-05,
+      "loss": 1.0805,
+      "step": 630
+    },
+    {
+      "epoch": 0.662148070907195,
+      "grad_norm": 0.07866794639764096,
+      "learning_rate": 6.185781288374545e-05,
+      "loss": 1.0852,
+      "step": 635
+    },
+    {
+      "epoch": 0.6673618352450469,
+      "grad_norm": 0.07476193110508485,
+      "learning_rate": 6.0181667826551924e-05,
+      "loss": 1.0896,
+      "step": 640
+    },
+    {
+      "epoch": 0.6725755995828988,
+      "grad_norm": 0.0772747671129761,
+      "learning_rate": 5.851871411918743e-05,
+      "loss": 1.0792,
+      "step": 645
+    },
+    {
+      "epoch": 0.6777893639207507,
+      "grad_norm": 0.07566640995898331,
+      "learning_rate": 5.6869502678852835e-05,
+      "loss": 1.0471,
+      "step": 650
+    },
+    {
+      "epoch": 0.6830031282586028,
+      "grad_norm": 0.07164138898307278,
+      "learning_rate": 5.523457987009595e-05,
+      "loss": 1.1035,
+      "step": 655
+    },
+    {
+      "epoch": 0.6882168925964547,
+      "grad_norm": 0.07099725484474564,
+      "learning_rate": 5.3614487323807195e-05,
+      "loss": 1.0992,
+      "step": 660
+    },
+    {
+      "epoch": 0.6934306569343066,
+      "grad_norm": 0.07751613627358377,
+      "learning_rate": 5.20097617577839e-05,
+      "loss": 1.0837,
+      "step": 665
+    },
+    {
+      "epoch": 0.6986444212721585,
+      "grad_norm": 0.07882659762962813,
+      "learning_rate": 5.042093479892158e-05,
+      "loss": 1.081,
+      "step": 670
+    },
+    {
+      "epoch": 0.7038581856100105,
+      "grad_norm": 0.07504468684412657,
+      "learning_rate": 4.8848532807092416e-05,
+      "loss": 1.0711,
+      "step": 675
+    },
+    {
+      "epoch": 0.7090719499478624,
+      "grad_norm": 0.0747790732557037,
+      "learning_rate": 4.729307670076826e-05,
+      "loss": 1.077,
+      "step": 680
+    },
+    {
+      "epoch": 0.7142857142857143,
+      "grad_norm": 0.07214223012913477,
+      "learning_rate": 4.5755081784446306e-05,
+      "loss": 1.0894,
+      "step": 685
+    },
+    {
+      "epoch": 0.7194994786235662,
+      "grad_norm": 0.07363920926565273,
+      "learning_rate": 4.4235057577934905e-05,
+      "loss": 1.0853,
+      "step": 690
+    },
+    {
+      "epoch": 0.7247132429614181,
+      "grad_norm": 0.0735821627304811,
+      "learning_rate": 4.273350764755548e-05,
+      "loss": 1.082,
+      "step": 695
+    },
+    {
+      "epoch": 0.7299270072992701,
+      "grad_norm": 0.0738898873124254,
+      "learning_rate": 4.1250929439316824e-05,
+      "loss": 1.0773,
+      "step": 700
+    },
+    {
+      "epoch": 0.735140771637122,
+      "grad_norm": 0.07402633045377013,
+      "learning_rate": 3.978781411411705e-05,
+      "loss": 1.1247,
+      "step": 705
+    },
+    {
+      "epoch": 0.7403545359749739,
+      "grad_norm": 0.07284695841320597,
+      "learning_rate": 3.8344646385027837e-05,
+      "loss": 1.0754,
+      "step": 710
+    },
+    {
+      "epoch": 0.7455683003128258,
+      "grad_norm": 0.07929374122905068,
+      "learning_rate": 3.692190435671452e-05,
+      "loss": 1.0795,
+      "step": 715
+    },
+    {
+      "epoch": 0.7507820646506778,
+      "grad_norm": 0.0734239751966692,
+      "learning_rate": 3.5520059367045465e-05,
+      "loss": 1.0887,
+      "step": 720
+    },
+    {
+      "epoch": 0.7559958289885297,
+      "grad_norm": 0.07435223656654096,
+      "learning_rate": 3.413957583094358e-05,
+      "loss": 1.0999,
+      "step": 725
+    },
+    {
+      "epoch": 0.7612095933263816,
+      "grad_norm": 0.07261210019867713,
+      "learning_rate": 3.278091108653091e-05,
+      "loss": 1.0832,
+      "step": 730
+    },
+    {
+      "epoch": 0.7664233576642335,
+      "grad_norm": 0.07607256887516554,
+      "learning_rate": 3.144451524361779e-05,
+      "loss": 1.1057,
+      "step": 735
+    },
+    {
+      "epoch": 0.7716371220020855,
+      "grad_norm": 0.07249789517904658,
+      "learning_rate": 3.013083103458676e-05,
+      "loss": 1.0786,
+      "step": 740
+    },
+    {
+      "epoch": 0.7768508863399375,
+      "grad_norm": 0.07092399285995109,
+      "learning_rate": 2.8840293667720653e-05,
+      "loss": 1.0361,
+      "step": 745
+    },
+    {
+      "epoch": 0.7820646506777894,
+      "grad_norm": 0.07310960304099395,
+      "learning_rate": 2.7573330683022847e-05,
+      "loss": 1.0902,
+      "step": 750
+    },
+    {
+      "epoch": 0.7872784150156413,
+      "grad_norm": 0.07182455138220446,
+      "learning_rate": 2.6330361810578395e-05,
+      "loss": 1.0864,
+      "step": 755
+    },
+    {
+      "epoch": 0.7924921793534933,
+      "grad_norm": 0.07508985054363387,
+      "learning_rate": 2.511179883150242e-05,
+      "loss": 1.1085,
+      "step": 760
+    },
+    {
+      "epoch": 0.7977059436913452,
+      "grad_norm": 0.07685850755584918,
+      "learning_rate": 2.3918045441521718e-05,
+      "loss": 1.1116,
+      "step": 765
+    },
+    {
+      "epoch": 0.8029197080291971,
+      "grad_norm": 0.07574318675677963,
+      "learning_rate": 2.274949711723502e-05,
+      "loss": 1.0735,
+      "step": 770
+    },
+    {
+      "epoch": 0.808133472367049,
+      "grad_norm": 0.07349993942522902,
+      "learning_rate": 2.1606540985096312e-05,
+      "loss": 1.0892,
+      "step": 775
+    },
+    {
+      "epoch": 0.8133472367049009,
+      "grad_norm": 0.07510810135075115,
+      "learning_rate": 2.0489555693164342e-05,
+      "loss": 1.0929,
+      "step": 780
+    },
+    {
+      "epoch": 0.8185610010427529,
+      "grad_norm": 0.07465217855129215,
+      "learning_rate": 1.9398911285660816e-05,
+      "loss": 1.0974,
+      "step": 785
+    },
+    {
+      "epoch": 0.8237747653806048,
+      "grad_norm": 0.07641853556955017,
+      "learning_rate": 1.8334969080379115e-05,
+      "loss": 1.0981,
+      "step": 790
+    },
+    {
+      "epoch": 0.8289885297184567,
+      "grad_norm": 0.07374429336005603,
+      "learning_rate": 1.7298081548984002e-05,
+      "loss": 1.1147,
+      "step": 795
+    },
+    {
+      "epoch": 0.8342022940563086,
+      "grad_norm": 0.07198920669236027,
+      "learning_rate": 1.628859220024177e-05,
+      "loss": 1.0805,
+      "step": 800
+    },
+    {
+      "epoch": 0.8394160583941606,
+      "grad_norm": 0.07357888739399879,
+      "learning_rate": 1.5306835466219738e-05,
+      "loss": 1.0708,
+      "step": 805
+    },
+    {
+      "epoch": 0.8446298227320125,
+      "grad_norm": 0.07321249998664951,
+      "learning_rate": 1.4353136591492933e-05,
+      "loss": 1.1037,
+      "step": 810
+    },
+    {
+      "epoch": 0.8498435870698644,
+      "grad_norm": 0.07782709844758377,
+      "learning_rate": 1.3427811525394319e-05,
+      "loss": 1.0904,
+      "step": 815
+    },
+    {
+      "epoch": 0.8550573514077163,
+      "grad_norm": 0.07455259445795498,
+      "learning_rate": 1.253116681734432e-05,
+      "loss": 1.0768,
+      "step": 820
+    },
+    {
+      "epoch": 0.8602711157455682,
+      "grad_norm": 0.07401992962356452,
+      "learning_rate": 1.1663499515294762e-05,
+      "loss": 1.0653,
+      "step": 825
+    },
+    {
+      "epoch": 0.8654848800834203,
+      "grad_norm": 0.07220555107508732,
+      "learning_rate": 1.0825097067320267e-05,
+      "loss": 1.0729,
+      "step": 830
+    },
+    {
+      "epoch": 0.8706986444212722,
+      "grad_norm": 0.07544891513968513,
+      "learning_rate": 1.0016237226389947e-05,
+      "loss": 1.0855,
+      "step": 835
+    },
+    {
+      "epoch": 0.8759124087591241,
+      "grad_norm": 0.07485172694158737,
+      "learning_rate": 9.237187958351279e-06,
+      "loss": 1.09,
+      "step": 840
+    },
+    {
+      "epoch": 0.881126173096976,
+      "grad_norm": 0.07275386970426083,
+      "learning_rate": 8.488207353155986e-06,
+      "loss": 1.0896,
+      "step": 845
+    },
+    {
+      "epoch": 0.886339937434828,
+      "grad_norm": 0.07454861219445595,
+      "learning_rate": 7.769543539357904e-06,
+      "loss": 1.0953,
+      "step": 850
+    },
+    {
+      "epoch": 0.8915537017726799,
+      "grad_norm": 0.07164206957934503,
+      "learning_rate": 7.081434601910864e-06,
+      "loss": 1.0944,
+      "step": 855
+    },
+    {
+      "epoch": 0.8967674661105318,
+      "grad_norm": 0.07286862134246115,
+      "learning_rate": 6.424108503293924e-06,
+      "loss": 1.1096,
+      "step": 860
+    },
+    {
+      "epoch": 0.9019812304483837,
+      "grad_norm": 0.07369229576160694,
+      "learning_rate": 5.797783007990076e-06,
+      "loss": 1.0951,
+      "step": 865
+    },
+    {
+      "epoch": 0.9071949947862357,
+      "grad_norm": 0.07460818380676967,
+      "learning_rate": 5.202665610343338e-06,
+      "loss": 1.0883,
+      "step": 870
+    },
+    {
+      "epoch": 0.9124087591240876,
+      "grad_norm": 0.07357667175850421,
+      "learning_rate": 4.6389534658184075e-06,
+      "loss": 1.0856,
+      "step": 875
+    },
+    {
+      "epoch": 0.9176225234619395,
+      "grad_norm": 0.07355488764311813,
+      "learning_rate": 4.106833325685222e-06,
+      "loss": 1.0992,
+      "step": 880
+    },
+    {
+      "epoch": 0.9228362877997914,
+      "grad_norm": 0.07560170907908333,
+      "learning_rate": 3.606481475150536e-06,
+      "loss": 1.0753,
+      "step": 885
+    },
+    {
+      "epoch": 0.9280500521376434,
+      "grad_norm": 0.07352332452876986,
+      "learning_rate": 3.138063674956648e-06,
+      "loss": 1.0606,
+      "step": 890
+    },
+    {
+      "epoch": 0.9332638164754953,
+      "grad_norm": 0.07595937879635985,
+      "learning_rate": 2.701735106466796e-06,
+      "loss": 1.0931,
+      "step": 895
+    },
+    {
+      "epoch": 0.9384775808133472,
+      "grad_norm": 0.07046192596564868,
+      "learning_rate": 2.2976403202553787e-06,
+      "loss": 1.0931,
+      "step": 900
+    },
+    {
+      "epoch": 0.9436913451511991,
+      "grad_norm": 0.07358829064156538,
+      "learning_rate": 1.925913188220052e-06,
+      "loss": 1.0806,
+      "step": 905
+    },
+    {
+      "epoch": 0.948905109489051,
+      "grad_norm": 0.07595473779694568,
+      "learning_rate": 1.586676859231473e-06,
+      "loss": 1.0816,
+      "step": 910
+    },
+    {
+      "epoch": 0.954118873826903,
+      "grad_norm": 0.07568884330826342,
+      "learning_rate": 1.2800437183355885e-06,
+      "loss": 1.0809,
+      "step": 915
+    },
+    {
+      "epoch": 0.959332638164755,
+      "grad_norm": 0.07584100660095472,
+      "learning_rate": 1.0061153495217412e-06,
+      "loss": 1.0634,
+      "step": 920
+    },
+    {
+      "epoch": 0.9645464025026069,
+      "grad_norm": 0.07304112910406586,
+      "learning_rate": 7.64982502069056e-07,
+      "loss": 1.0592,
+      "step": 925
+    },
+    {
+      "epoch": 0.9697601668404588,
+      "grad_norm": 0.07324941671378657,
+      "learning_rate": 5.567250604822882e-07,
+      "loss": 1.0764,
+      "step": 930
+    },
+    {
+      "epoch": 0.9749739311783108,
+      "grad_norm": 0.07540315428759628,
+      "learning_rate": 3.814120180270164e-07,
+      "loss": 1.0862,
+      "step": 935
+    },
+    {
+      "epoch": 0.9801876955161627,
+      "grad_norm": 0.07167856975306086,
+      "learning_rate": 2.3910145387299185e-07,
+      "loss": 1.0798,
+      "step": 940
+    },
+    {
+      "epoch": 0.9854014598540146,
+      "grad_norm": 0.07399712911952666,
+      "learning_rate": 1.298405138531078e-07,
+      "loss": 1.0819,
+      "step": 945
+    },
+    {
+      "epoch": 0.9906152241918665,
+      "grad_norm": 0.07550044600887201,
+      "learning_rate": 5.366539484464861e-08,
+      "loss": 1.088,
+      "step": 950
+    },
+    {
+      "epoch": 0.9958289885297185,
+      "grad_norm": 0.07533429394464258,
+      "learning_rate": 1.0601332777604444e-08,
+      "loss": 1.071,
+      "step": 955
+    },
+    {
+      "epoch": 1.0,
+      "eval_loss": 1.1101280450820923,
+      "eval_runtime": 2.0768,
+      "eval_samples_per_second": 3.371,
+      "eval_steps_per_second": 0.963,
+      "step": 959
+    },
+    {
+      "epoch": 1.0,
+      "step": 959,
+      "total_flos": 1.2060020938113024e+16,
+      "train_loss": 1.1071907986689657,
+      "train_runtime": 19793.4474,
+      "train_samples_per_second": 3.101,
+      "train_steps_per_second": 0.048
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 959,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.2060020938113024e+16,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}