Upload 8 files

Browse files

Files changed (8) hide show

README.md +202 -3
adapter_config.json +29 -0
adapter_model.bin +3 -0
optimizer.pt +3 -0
rng_state.pth +3 -0
scheduler.pt +3 -0
trainer_state.json +1533 -0
training_args.bin +3 -0

README.md CHANGED Viewed

@@ -1,3 +1,202 @@
----
-license: apache-2.0
----

+---
+library_name: peft
+base_model: meta-llama/Llama-2-7b-hf
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** LoRA adapter (PEFT) for causal language modeling
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf)
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
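+LoRA adapter for `meta-llama/Llama-2-7b-hf` with task type `CAUSAL_LM`: rank `r=8`, `lora_alpha=16`, `lora_dropout=0.05`, `bias="none"`, applied to the `q_proj` and `v_proj` modules (see `adapter_config.json`).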
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.10.0

adapter_config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "q_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

adapter_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d7dc07c8711811876a2574ceae0d54648a12092e78e64ed0f9226f93cb636c48
+size 16823434

optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0e1cec9fdd5f532a9143affb50feb85d62759daa8df3de39a86cd9179e202085
+size 33662074

rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:34f745fa6dfaaddd7baec3093d9b799d21ce1fc1d5ec7905895d613f9f8836d5
+size 14244

scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:802aea6a94596a1508efd2609357c7d03b7afc6793c019662dc8c4716ec3f4be
+size 1064

trainer_state.json ADDED Viewed

	@@ -0,0 +1,1533 @@

+{
+  "best_metric": 0.2660675048828125,
+  "best_model_checkpoint": "/scratch/czm5kz/llama2-7b_8_50_0.0003_sg_finetuned_combined/checkpoint-840",
+  "epoch": 49.411764705882355,
+  "eval_steps": 20,
+  "global_step": 840,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.29,
+      "grad_norm": NaN,
+      "learning_rate": 0.0002989411764705882,
+      "loss": 5.4103,
+      "step": 5
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 1.4938805103302002,
+      "learning_rate": 0.0002971764705882353,
+      "loss": 5.1285,
+      "step": 10
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 2.592315673828125,
+      "learning_rate": 0.0002954117647058823,
+      "loss": 4.2667,
+      "step": 15
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 2.206411123275757,
+      "learning_rate": 0.0002936470588235294,
+      "loss": 3.4402,
+      "step": 20
+    },
+    {
+      "epoch": 1.18,
+      "eval_loss": 3.1646199226379395,
+      "eval_runtime": 1.9198,
+      "eval_samples_per_second": 69.797,
+      "eval_steps_per_second": 8.855,
+      "step": 20
+    },
+    {
+      "epoch": 1.47,
+      "grad_norm": 1.824563980102539,
+      "learning_rate": 0.00029188235294117643,
+      "loss": 3.1312,
+      "step": 25
+    },
+    {
+      "epoch": 1.76,
+      "grad_norm": 2.1594173908233643,
+      "learning_rate": 0.0002901176470588235,
+      "loss": 2.675,
+      "step": 30
+    },
+    {
+      "epoch": 2.06,
+      "grad_norm": 2.2764227390289307,
+      "learning_rate": 0.00028835294117647055,
+      "loss": 2.491,
+      "step": 35
+    },
+    {
+      "epoch": 2.35,
+      "grad_norm": 2.14278507232666,
+      "learning_rate": 0.00028658823529411763,
+      "loss": 2.2694,
+      "step": 40
+    },
+    {
+      "epoch": 2.35,
+      "eval_loss": 2.0995066165924072,
+      "eval_runtime": 1.9224,
+      "eval_samples_per_second": 69.704,
+      "eval_steps_per_second": 8.843,
+      "step": 40
+    },
+    {
+      "epoch": 2.65,
+      "grad_norm": 2.457611560821533,
+      "learning_rate": 0.0002848235294117647,
+      "loss": 2.115,
+      "step": 45
+    },
+    {
+      "epoch": 2.94,
+      "grad_norm": 3.5367114543914795,
+      "learning_rate": 0.00028305882352941175,
+      "loss": 1.9109,
+      "step": 50
+    },
+    {
+      "epoch": 3.24,
+      "grad_norm": 5.9490227699279785,
+      "learning_rate": 0.0002812941176470588,
+      "loss": 1.6125,
+      "step": 55
+    },
+    {
+      "epoch": 3.53,
+      "grad_norm": 4.218448638916016,
+      "learning_rate": 0.00027952941176470587,
+      "loss": 1.6965,
+      "step": 60
+    },
+    {
+      "epoch": 3.53,
+      "eval_loss": 1.309256672859192,
+      "eval_runtime": 1.9221,
+      "eval_samples_per_second": 69.714,
+      "eval_steps_per_second": 8.844,
+      "step": 60
+    },
+    {
+      "epoch": 3.82,
+      "grad_norm": 4.950811862945557,
+      "learning_rate": 0.0002777647058823529,
+      "loss": 1.2561,
+      "step": 65
+    },
+    {
+      "epoch": 4.12,
+      "grad_norm": 3.5115580558776855,
+      "learning_rate": 0.000276,
+      "loss": 1.2923,
+      "step": 70
+    },
+    {
+      "epoch": 4.41,
+      "grad_norm": 4.053131103515625,
+      "learning_rate": 0.000274235294117647,
+      "loss": 0.8754,
+      "step": 75
+    },
+    {
+      "epoch": 4.71,
+      "grad_norm": 8.593412399291992,
+      "learning_rate": 0.0002724705882352941,
+      "loss": 0.9971,
+      "step": 80
+    },
+    {
+      "epoch": 4.71,
+      "eval_loss": 0.7918062210083008,
+      "eval_runtime": 1.923,
+      "eval_samples_per_second": 69.684,
+      "eval_steps_per_second": 8.841,
+      "step": 80
+    },
+    {
+      "epoch": 5.0,
+      "grad_norm": 5.546477794647217,
+      "learning_rate": 0.00027070588235294114,
+      "loss": 1.0066,
+      "step": 85
+    },
+    {
+      "epoch": 5.29,
+      "grad_norm": 4.750329971313477,
+      "learning_rate": 0.0002689411764705882,
+      "loss": 0.6666,
+      "step": 90
+    },
+    {
+      "epoch": 5.59,
+      "grad_norm": 6.40559720993042,
+      "learning_rate": 0.00026717647058823525,
+      "loss": 0.7559,
+      "step": 95
+    },
+    {
+      "epoch": 5.88,
+      "grad_norm": 4.358940124511719,
+      "learning_rate": 0.00026541176470588234,
+      "loss": 0.6462,
+      "step": 100
+    },
+    {
+      "epoch": 5.88,
+      "eval_loss": 0.5163397789001465,
+      "eval_runtime": 1.9282,
+      "eval_samples_per_second": 69.496,
+      "eval_steps_per_second": 8.817,
+      "step": 100
+    },
+    {
+      "epoch": 6.18,
+      "grad_norm": 2.795166015625,
+      "learning_rate": 0.00026364705882352937,
+      "loss": 0.5542,
+      "step": 105
+    },
+    {
+      "epoch": 6.47,
+      "grad_norm": 5.22014856338501,
+      "learning_rate": 0.00026188235294117646,
+      "loss": 0.4833,
+      "step": 110
+    },
+    {
+      "epoch": 6.76,
+      "grad_norm": 3.8278372287750244,
+      "learning_rate": 0.0002601176470588235,
+      "loss": 0.5228,
+      "step": 115
+    },
+    {
+      "epoch": 7.06,
+      "grad_norm": 3.607290029525757,
+      "learning_rate": 0.0002583529411764706,
+      "loss": 0.5413,
+      "step": 120
+    },
+    {
+      "epoch": 7.06,
+      "eval_loss": 0.43107467889785767,
+      "eval_runtime": 1.9336,
+      "eval_samples_per_second": 69.302,
+      "eval_steps_per_second": 8.792,
+      "step": 120
+    },
+    {
+      "epoch": 7.35,
+      "grad_norm": 2.634229898452759,
+      "learning_rate": 0.00025658823529411766,
+      "loss": 0.4545,
+      "step": 125
+    },
+    {
+      "epoch": 7.65,
+      "grad_norm": 5.728726863861084,
+      "learning_rate": 0.0002548235294117647,
+      "loss": 0.4578,
+      "step": 130
+    },
+    {
+      "epoch": 7.94,
+      "grad_norm": 3.3267197608947754,
+      "learning_rate": 0.0002530588235294117,
+      "loss": 0.4934,
+      "step": 135
+    },
+    {
+      "epoch": 8.24,
+      "grad_norm": 2.8072242736816406,
+      "learning_rate": 0.0002512941176470588,
+      "loss": 0.4146,
+      "step": 140
+    },
+    {
+      "epoch": 8.24,
+      "eval_loss": 0.37879472970962524,
+      "eval_runtime": 1.9304,
+      "eval_samples_per_second": 69.415,
+      "eval_steps_per_second": 8.806,
+      "step": 140
+    },
+    {
+      "epoch": 8.53,
+      "grad_norm": 4.419933319091797,
+      "learning_rate": 0.00024952941176470584,
+      "loss": 0.436,
+      "step": 145
+    },
+    {
+      "epoch": 8.82,
+      "grad_norm": 2.759542465209961,
+      "learning_rate": 0.00024776470588235293,
+      "loss": 0.4245,
+      "step": 150
+    },
+    {
+      "epoch": 9.12,
+      "grad_norm": 2.82211971282959,
+      "learning_rate": 0.00024599999999999996,
+      "loss": 0.4608,
+      "step": 155
+    },
+    {
+      "epoch": 9.41,
+      "grad_norm": 3.7776830196380615,
+      "learning_rate": 0.00024423529411764705,
+      "loss": 0.3992,
+      "step": 160
+    },
+    {
+      "epoch": 9.41,
+      "eval_loss": 0.35705941915512085,
+      "eval_runtime": 1.9328,
+      "eval_samples_per_second": 69.328,
+      "eval_steps_per_second": 8.795,
+      "step": 160
+    },
+    {
+      "epoch": 9.71,
+      "grad_norm": 3.110931873321533,
+      "learning_rate": 0.0002424705882352941,
+      "loss": 0.4201,
+      "step": 165
+    },
+    {
+      "epoch": 10.0,
+      "grad_norm": 2.9114022254943848,
+      "learning_rate": 0.00024070588235294117,
+      "loss": 0.4171,
+      "step": 170
+    },
+    {
+      "epoch": 10.29,
+      "grad_norm": 2.747077703475952,
+      "learning_rate": 0.00023894117647058823,
+      "loss": 0.3664,
+      "step": 175
+    },
+    {
+      "epoch": 10.59,
+      "grad_norm": 3.499821424484253,
+      "learning_rate": 0.00023717647058823526,
+      "loss": 0.3934,
+      "step": 180
+    },
+    {
+      "epoch": 10.59,
+      "eval_loss": 0.3550761640071869,
+      "eval_runtime": 1.9331,
+      "eval_samples_per_second": 69.318,
+      "eval_steps_per_second": 8.794,
+      "step": 180
+    },
+    {
+      "epoch": 10.88,
+      "grad_norm": 2.199946165084839,
+      "learning_rate": 0.00023541176470588232,
+      "loss": 0.4389,
+      "step": 185
+    },
+    {
+      "epoch": 11.18,
+      "grad_norm": 1.6292589902877808,
+      "learning_rate": 0.00023364705882352938,
+      "loss": 0.3886,
+      "step": 190
+    },
+    {
+      "epoch": 11.47,
+      "grad_norm": 2.2364566326141357,
+      "learning_rate": 0.00023188235294117646,
+      "loss": 0.3536,
+      "step": 195
+    },
+    {
+      "epoch": 11.76,
+      "grad_norm": 2.165865898132324,
+      "learning_rate": 0.00023011764705882352,
+      "loss": 0.3489,
+      "step": 200
+    },
+    {
+      "epoch": 11.76,
+      "eval_loss": 0.3096841275691986,
+      "eval_runtime": 1.9357,
+      "eval_samples_per_second": 69.225,
+      "eval_steps_per_second": 8.782,
+      "step": 200
+    },
+    {
+      "epoch": 12.06,
+      "grad_norm": 1.1357084512710571,
+      "learning_rate": 0.00022835294117647058,
+      "loss": 0.3538,
+      "step": 205
+    },
+    {
+      "epoch": 12.35,
+      "grad_norm": 1.3025437593460083,
+      "learning_rate": 0.0002265882352941176,
+      "loss": 0.3033,
+      "step": 210
+    },
+    {
+      "epoch": 12.65,
+      "grad_norm": 1.7671643495559692,
+      "learning_rate": 0.00022482352941176467,
+      "loss": 0.3734,
+      "step": 215
+    },
+    {
+      "epoch": 12.94,
+      "grad_norm": 2.1980714797973633,
+      "learning_rate": 0.00022305882352941176,
+      "loss": 0.3942,
+      "step": 220
+    },
+    {
+      "epoch": 12.94,
+      "eval_loss": 0.3041135370731354,
+      "eval_runtime": 1.9467,
+      "eval_samples_per_second": 68.835,
+      "eval_steps_per_second": 8.733,
+      "step": 220
+    },
+    {
+      "epoch": 13.24,
+      "grad_norm": 1.5232170820236206,
+      "learning_rate": 0.00022129411764705881,
+      "loss": 0.3196,
+      "step": 225
+    },
+    {
+      "epoch": 13.53,
+      "grad_norm": 1.3355060815811157,
+      "learning_rate": 0.00021952941176470587,
+      "loss": 0.3504,
+      "step": 230
+    },
+    {
+      "epoch": 13.82,
+      "grad_norm": 1.7420495748519897,
+      "learning_rate": 0.0002177647058823529,
+      "loss": 0.3633,
+      "step": 235
+    },
+    {
+      "epoch": 14.12,
+      "grad_norm": 1.2982964515686035,
+      "learning_rate": 0.00021599999999999996,
+      "loss": 0.3953,
+      "step": 240
+    },
+    {
+      "epoch": 14.12,
+      "eval_loss": 0.30998367071151733,
+      "eval_runtime": 1.9386,
+      "eval_samples_per_second": 69.123,
+      "eval_steps_per_second": 8.769,
+      "step": 240
+    },
+    {
+      "epoch": 14.41,
+      "grad_norm": 1.4662508964538574,
+      "learning_rate": 0.00021423529411764705,
+      "loss": 0.3218,
+      "step": 245
+    },
+    {
+      "epoch": 14.71,
+      "grad_norm": 5.769560813903809,
+      "learning_rate": 0.0002124705882352941,
+      "loss": 0.3529,
+      "step": 250
+    },
+    {
+      "epoch": 15.0,
+      "grad_norm": 1.21450936794281,
+      "learning_rate": 0.00021070588235294117,
+      "loss": 0.3709,
+      "step": 255
+    },
+    {
+      "epoch": 15.29,
+      "grad_norm": 1.1447687149047852,
+      "learning_rate": 0.0002089411764705882,
+      "loss": 0.3181,
+      "step": 260
+    },
+    {
+      "epoch": 15.29,
+      "eval_loss": 0.3001173138618469,
+      "eval_runtime": 1.9398,
+      "eval_samples_per_second": 69.078,
+      "eval_steps_per_second": 8.764,
+      "step": 260
+    },
+    {
+      "epoch": 15.59,
+      "grad_norm": 1.2628802061080933,
+      "learning_rate": 0.00020717647058823526,
+      "loss": 0.3276,
+      "step": 265
+    },
+    {
+      "epoch": 15.88,
+      "grad_norm": 1.5145180225372314,
+      "learning_rate": 0.00020541176470588232,
+      "loss": 0.3494,
+      "step": 270
+    },
+    {
+      "epoch": 16.18,
+      "grad_norm": 0.9314962029457092,
+      "learning_rate": 0.0002036470588235294,
+      "loss": 0.3023,
+      "step": 275
+    },
+    {
+      "epoch": 16.47,
+      "grad_norm": 1.3280673027038574,
+      "learning_rate": 0.00020188235294117646,
+      "loss": 0.3252,
+      "step": 280
+    },
+    {
+      "epoch": 16.47,
+      "eval_loss": 0.3047679364681244,
+      "eval_runtime": 1.9398,
+      "eval_samples_per_second": 69.08,
+      "eval_steps_per_second": 8.764,
+      "step": 280
+    },
+    {
+      "epoch": 16.76,
+      "grad_norm": 12.369646072387695,
+      "learning_rate": 0.00020011764705882352,
+      "loss": 0.3628,
+      "step": 285
+    },
+    {
+      "epoch": 17.06,
+      "grad_norm": 0.9423893690109253,
+      "learning_rate": 0.00019835294117647055,
+      "loss": 0.3365,
+      "step": 290
+    },
+    {
+      "epoch": 17.35,
+      "grad_norm": 1.1777836084365845,
+      "learning_rate": 0.00019658823529411761,
+      "loss": 0.3145,
+      "step": 295
+    },
+    {
+      "epoch": 17.65,
+      "grad_norm": 1.1752514839172363,
+      "learning_rate": 0.0001948235294117647,
+      "loss": 0.3433,
+      "step": 300
+    },
+    {
+      "epoch": 17.65,
+      "eval_loss": 0.2916710674762726,
+      "eval_runtime": 1.949,
+      "eval_samples_per_second": 68.752,
+      "eval_steps_per_second": 8.722,
+      "step": 300
+    },
+    {
+      "epoch": 17.94,
+      "grad_norm": 0.9878242611885071,
+      "learning_rate": 0.00019305882352941176,
+      "loss": 0.3349,
+      "step": 305
+    },
+    {
+      "epoch": 18.24,
+      "grad_norm": 0.836021900177002,
+      "learning_rate": 0.00019129411764705882,
+      "loss": 0.3016,
+      "step": 310
+    },
+    {
+      "epoch": 18.53,
+      "grad_norm": 0.9070651531219482,
+      "learning_rate": 0.00018952941176470585,
+      "loss": 0.3042,
+      "step": 315
+    },
+    {
+      "epoch": 18.82,
+      "grad_norm": 0.9631121754646301,
+      "learning_rate": 0.0001877647058823529,
+      "loss": 0.3359,
+      "step": 320
+    },
+    {
+      "epoch": 18.82,
+      "eval_loss": 0.2895776033401489,
+      "eval_runtime": 1.9409,
+      "eval_samples_per_second": 69.042,
+      "eval_steps_per_second": 8.759,
+      "step": 320
+    },
+    {
+      "epoch": 19.12,
+      "grad_norm": 0.9133499264717102,
+      "learning_rate": 0.000186,
+      "loss": 0.3066,
+      "step": 325
+    },
+    {
+      "epoch": 19.41,
+      "grad_norm": 0.9856986403465271,
+      "learning_rate": 0.00018423529411764705,
+      "loss": 0.2968,
+      "step": 330
+    },
+    {
+      "epoch": 19.71,
+      "grad_norm": 0.8035317659378052,
+      "learning_rate": 0.0001824705882352941,
+      "loss": 0.293,
+      "step": 335
+    },
+    {
+      "epoch": 20.0,
+      "grad_norm": 1.5814510583877563,
+      "learning_rate": 0.00018070588235294114,
+      "loss": 0.3671,
+      "step": 340
+    },
+    {
+      "epoch": 20.0,
+      "eval_loss": 0.2835056781768799,
+      "eval_runtime": 1.9399,
+      "eval_samples_per_second": 69.076,
+      "eval_steps_per_second": 8.763,
+      "step": 340
+    },
+    {
+      "epoch": 20.29,
+      "grad_norm": 0.9612240791320801,
+      "learning_rate": 0.0001789411764705882,
+      "loss": 0.2899,
+      "step": 345
+    },
+    {
+      "epoch": 20.59,
+      "grad_norm": 0.8361384272575378,
+      "learning_rate": 0.00017717647058823526,
+      "loss": 0.3003,
+      "step": 350
+    },
+    {
+      "epoch": 20.88,
+      "grad_norm": 1.1012054681777954,
+      "learning_rate": 0.00017541176470588235,
+      "loss": 0.3166,
+      "step": 355
+    },
+    {
+      "epoch": 21.18,
+      "grad_norm": 0.9163299202919006,
+      "learning_rate": 0.0001736470588235294,
+      "loss": 0.2974,
+      "step": 360
+    },
+    {
+      "epoch": 21.18,
+      "eval_loss": 0.2791600525379181,
+      "eval_runtime": 1.941,
+      "eval_samples_per_second": 69.038,
+      "eval_steps_per_second": 8.759,
+      "step": 360
+    },
+    {
+      "epoch": 21.47,
+      "grad_norm": 0.8624510765075684,
+      "learning_rate": 0.00017188235294117647,
+      "loss": 0.2861,
+      "step": 365
+    },
+    {
+      "epoch": 21.76,
+      "grad_norm": 1.293444275856018,
+      "learning_rate": 0.0001701176470588235,
+      "loss": 0.3208,
+      "step": 370
+    },
+    {
+      "epoch": 22.06,
+      "grad_norm": 0.6756078004837036,
+      "learning_rate": 0.00016835294117647056,
+      "loss": 0.3121,
+      "step": 375
+    },
+    {
+      "epoch": 22.35,
+      "grad_norm": 0.9174789190292358,
+      "learning_rate": 0.00016658823529411764,
+      "loss": 0.2957,
+      "step": 380
+    },
+    {
+      "epoch": 22.35,
+      "eval_loss": 0.28055718541145325,
+      "eval_runtime": 1.9416,
+      "eval_samples_per_second": 69.014,
+      "eval_steps_per_second": 8.755,
+      "step": 380
+    },
+    {
+      "epoch": 22.65,
+      "grad_norm": 0.8151916861534119,
+      "learning_rate": 0.0001648235294117647,
+      "loss": 0.2977,
+      "step": 385
+    },
+    {
+      "epoch": 22.94,
+      "grad_norm": 0.8723404407501221,
+      "learning_rate": 0.00016305882352941176,
+      "loss": 0.3042,
+      "step": 390
+    },
+    {
+      "epoch": 23.24,
+      "grad_norm": 0.9448694586753845,
+      "learning_rate": 0.0001612941176470588,
+      "loss": 0.2725,
+      "step": 395
+    },
+    {
+      "epoch": 23.53,
+      "grad_norm": 1.0331761837005615,
+      "learning_rate": 0.00015952941176470585,
+      "loss": 0.307,
+      "step": 400
+    },
+    {
+      "epoch": 23.53,
+      "eval_loss": 0.2780434191226959,
+      "eval_runtime": 1.9423,
+      "eval_samples_per_second": 68.99,
+      "eval_steps_per_second": 8.752,
+      "step": 400
+    },
+    {
+      "epoch": 23.82,
+      "grad_norm": 1.0633246898651123,
+      "learning_rate": 0.00015776470588235294,
+      "loss": 0.3015,
+      "step": 405
+    },
+    {
+      "epoch": 24.12,
+      "grad_norm": 0.7136632800102234,
+      "learning_rate": 0.000156,
+      "loss": 0.2915,
+      "step": 410
+    },
+    {
+      "epoch": 24.41,
+      "grad_norm": 0.7327393293380737,
+      "learning_rate": 0.00015423529411764706,
+      "loss": 0.2882,
+      "step": 415
+    },
+    {
+      "epoch": 24.71,
+      "grad_norm": 0.7720869779586792,
+      "learning_rate": 0.0001524705882352941,
+      "loss": 0.2905,
+      "step": 420
+    },
+    {
+      "epoch": 24.71,
+      "eval_loss": 0.27704769372940063,
+      "eval_runtime": 1.9407,
+      "eval_samples_per_second": 69.046,
+      "eval_steps_per_second": 8.76,
+      "step": 420
+    },
+    {
+      "epoch": 25.0,
+      "grad_norm": 0.8705993890762329,
+      "learning_rate": 0.00015070588235294115,
+      "loss": 0.3195,
+      "step": 425
+    },
+    {
+      "epoch": 25.29,
+      "grad_norm": 0.8639763593673706,
+      "learning_rate": 0.0001489411764705882,
+      "loss": 0.2838,
+      "step": 430
+    },
+    {
+      "epoch": 25.59,
+      "grad_norm": 0.9361162185668945,
+      "learning_rate": 0.0001471764705882353,
+      "loss": 0.2748,
+      "step": 435
+    },
+    {
+      "epoch": 25.88,
+      "grad_norm": 1.0659866333007812,
+      "learning_rate": 0.00014541176470588232,
+      "loss": 0.3256,
+      "step": 440
+    },
+    {
+      "epoch": 25.88,
+      "eval_loss": 0.27406755089759827,
+      "eval_runtime": 1.9407,
+      "eval_samples_per_second": 69.046,
+      "eval_steps_per_second": 8.76,
+      "step": 440
+    },
+    {
+      "epoch": 26.18,
+      "grad_norm": 0.6408669352531433,
+      "learning_rate": 0.0001436470588235294,
+      "loss": 0.2793,
+      "step": 445
+    },
+    {
+      "epoch": 26.47,
+      "grad_norm": 0.7933536767959595,
+      "learning_rate": 0.00014188235294117647,
+      "loss": 0.2871,
+      "step": 450
+    },
+    {
+      "epoch": 26.76,
+      "grad_norm": 0.8687612414360046,
+      "learning_rate": 0.0001401176470588235,
+      "loss": 0.3059,
+      "step": 455
+    },
+    {
+      "epoch": 27.06,
+      "grad_norm": 0.9702476859092712,
+      "learning_rate": 0.0001383529411764706,
+      "loss": 0.2966,
+      "step": 460
+    },
+    {
+      "epoch": 27.06,
+      "eval_loss": 0.2723328769207001,
+      "eval_runtime": 1.9411,
+      "eval_samples_per_second": 69.035,
+      "eval_steps_per_second": 8.758,
+      "step": 460
+    },
+    {
+      "epoch": 27.35,
+      "grad_norm": 0.6730605959892273,
+      "learning_rate": 0.00013658823529411765,
+      "loss": 0.2638,
+      "step": 465
+    },
+    {
+      "epoch": 27.65,
+      "grad_norm": 0.6563217639923096,
+      "learning_rate": 0.0001348235294117647,
+      "loss": 0.3059,
+      "step": 470
+    },
+    {
+      "epoch": 27.94,
+      "grad_norm": 0.686863362789154,
+      "learning_rate": 0.00013305882352941176,
+      "loss": 0.2938,
+      "step": 475
+    },
+    {
+      "epoch": 28.24,
+      "grad_norm": 1.0649175643920898,
+      "learning_rate": 0.0001312941176470588,
+      "loss": 0.312,
+      "step": 480
+    },
+    {
+      "epoch": 28.24,
+      "eval_loss": 0.27256956696510315,
+      "eval_runtime": 1.9432,
+      "eval_samples_per_second": 68.957,
+      "eval_steps_per_second": 8.748,
+      "step": 480
+    },
+    {
+      "epoch": 28.53,
+      "grad_norm": 0.746249258518219,
+      "learning_rate": 0.00012952941176470588,
+      "loss": 0.2788,
+      "step": 485
+    },
+    {
+      "epoch": 28.82,
+      "grad_norm": 0.7490785717964172,
+      "learning_rate": 0.00012776470588235294,
+      "loss": 0.2766,
+      "step": 490
+    },
+    {
+      "epoch": 29.12,
+      "grad_norm": 0.967271625995636,
+      "learning_rate": 0.00012599999999999997,
+      "loss": 0.2848,
+      "step": 495
+    },
+    {
+      "epoch": 29.41,
+      "grad_norm": 0.7683994770050049,
+      "learning_rate": 0.00012423529411764706,
+      "loss": 0.2886,
+      "step": 500
+    },
+    {
+      "epoch": 29.41,
+      "eval_loss": 0.2728249728679657,
+      "eval_runtime": 1.9475,
+      "eval_samples_per_second": 68.808,
+      "eval_steps_per_second": 8.729,
+      "step": 500
+    },
+    {
+      "epoch": 29.71,
+      "grad_norm": 0.8643724918365479,
+      "learning_rate": 0.00012247058823529412,
+      "loss": 0.2887,
+      "step": 505
+    },
+    {
+      "epoch": 30.0,
+      "grad_norm": 1.343279242515564,
+      "learning_rate": 0.00012070588235294116,
+      "loss": 0.2992,
+      "step": 510
+    },
+    {
+      "epoch": 30.29,
+      "grad_norm": 0.905185341835022,
+      "learning_rate": 0.00011894117647058822,
+      "loss": 0.2755,
+      "step": 515
+    },
+    {
+      "epoch": 30.59,
+      "grad_norm": 0.708074152469635,
+      "learning_rate": 0.00011717647058823528,
+      "loss": 0.3067,
+      "step": 520
+    },
+    {
+      "epoch": 30.59,
+      "eval_loss": 0.27262699604034424,
+      "eval_runtime": 1.9441,
+      "eval_samples_per_second": 68.925,
+      "eval_steps_per_second": 8.744,
+      "step": 520
+    },
+    {
+      "epoch": 30.88,
+      "grad_norm": 0.9293233156204224,
+      "learning_rate": 0.00011541176470588234,
+      "loss": 0.2789,
+      "step": 525
+    },
+    {
+      "epoch": 31.18,
+      "grad_norm": 0.5454906225204468,
+      "learning_rate": 0.00011364705882352941,
+      "loss": 0.279,
+      "step": 530
+    },
+    {
+      "epoch": 31.47,
+      "grad_norm": 0.7660827040672302,
+      "learning_rate": 0.00011188235294117646,
+      "loss": 0.29,
+      "step": 535
+    },
+    {
+      "epoch": 31.76,
+      "grad_norm": 0.8223424553871155,
+      "learning_rate": 0.00011011764705882352,
+      "loss": 0.2854,
+      "step": 540
+    },
+    {
+      "epoch": 31.76,
+      "eval_loss": 0.2715389132499695,
+      "eval_runtime": 1.9416,
+      "eval_samples_per_second": 69.013,
+      "eval_steps_per_second": 8.755,
+      "step": 540
+    },
+    {
+      "epoch": 32.06,
+      "grad_norm": 0.6093394160270691,
+      "learning_rate": 0.00010835294117647059,
+      "loss": 0.2945,
+      "step": 545
+    },
+    {
+      "epoch": 32.35,
+      "grad_norm": 0.7865052223205566,
+      "learning_rate": 0.00010658823529411764,
+      "loss": 0.2748,
+      "step": 550
+    },
+    {
+      "epoch": 32.65,
+      "grad_norm": 0.6309812664985657,
+      "learning_rate": 0.0001048235294117647,
+      "loss": 0.282,
+      "step": 555
+    },
+    {
+      "epoch": 32.94,
+      "grad_norm": 0.7672943472862244,
+      "learning_rate": 0.00010305882352941175,
+      "loss": 0.2944,
+      "step": 560
+    },
+    {
+      "epoch": 32.94,
+      "eval_loss": 0.2701457440853119,
+      "eval_runtime": 1.9433,
+      "eval_samples_per_second": 68.956,
+      "eval_steps_per_second": 8.748,
+      "step": 560
+    },
+    {
+      "epoch": 33.24,
+      "grad_norm": 0.7916382551193237,
+      "learning_rate": 0.00010129411764705881,
+      "loss": 0.2797,
+      "step": 565
+    },
+    {
+      "epoch": 33.53,
+      "grad_norm": 0.8927915692329407,
+      "learning_rate": 9.952941176470588e-05,
+      "loss": 0.2912,
+      "step": 570
+    },
+    {
+      "epoch": 33.82,
+      "grad_norm": 0.7179085612297058,
+      "learning_rate": 9.776470588235293e-05,
+      "loss": 0.2905,
+      "step": 575
+    },
+    {
+      "epoch": 34.12,
+      "grad_norm": 1.0066139698028564,
+      "learning_rate": 9.599999999999999e-05,
+      "loss": 0.293,
+      "step": 580
+    },
+    {
+      "epoch": 34.12,
+      "eval_loss": 0.26910731196403503,
+      "eval_runtime": 1.9434,
+      "eval_samples_per_second": 68.95,
+      "eval_steps_per_second": 8.747,
+      "step": 580
+    },
+    {
+      "epoch": 34.41,
+      "grad_norm": 0.5777180790901184,
+      "learning_rate": 9.423529411764706e-05,
+      "loss": 0.2577,
+      "step": 585
+    },
+    {
+      "epoch": 34.71,
+      "grad_norm": 0.7130693793296814,
+      "learning_rate": 9.247058823529411e-05,
+      "loss": 0.2807,
+      "step": 590
+    },
+    {
+      "epoch": 35.0,
+      "grad_norm": 1.32589852809906,
+      "learning_rate": 9.070588235294118e-05,
+      "loss": 0.3171,
+      "step": 595
+    },
+    {
+      "epoch": 35.29,
+      "grad_norm": 0.7349640727043152,
+      "learning_rate": 8.894117647058822e-05,
+      "loss": 0.2739,
+      "step": 600
+    },
+    {
+      "epoch": 35.29,
+      "eval_loss": 0.2687808871269226,
+      "eval_runtime": 1.943,
+      "eval_samples_per_second": 68.967,
+      "eval_steps_per_second": 8.75,
+      "step": 600
+    },
+    {
+      "epoch": 35.59,
+      "grad_norm": 0.8435064554214478,
+      "learning_rate": 8.717647058823528e-05,
+      "loss": 0.297,
+      "step": 605
+    },
+    {
+      "epoch": 35.88,
+      "grad_norm": 0.9483019113540649,
+      "learning_rate": 8.541176470588236e-05,
+      "loss": 0.278,
+      "step": 610
+    },
+    {
+      "epoch": 36.18,
+      "grad_norm": 0.7475037574768066,
+      "learning_rate": 8.36470588235294e-05,
+      "loss": 0.279,
+      "step": 615
+    },
+    {
+      "epoch": 36.47,
+      "grad_norm": 0.6387231349945068,
+      "learning_rate": 8.188235294117646e-05,
+      "loss": 0.2816,
+      "step": 620
+    },
+    {
+      "epoch": 36.47,
+      "eval_loss": 0.26973241567611694,
+      "eval_runtime": 1.9418,
+      "eval_samples_per_second": 69.008,
+      "eval_steps_per_second": 8.755,
+      "step": 620
+    },
+    {
+      "epoch": 36.76,
+      "grad_norm": 0.7511037588119507,
+      "learning_rate": 8.011764705882352e-05,
+      "loss": 0.2884,
+      "step": 625
+    },
+    {
+      "epoch": 37.06,
+      "grad_norm": 0.7197074890136719,
+      "learning_rate": 7.835294117647058e-05,
+      "loss": 0.2907,
+      "step": 630
+    },
+    {
+      "epoch": 37.35,
+      "grad_norm": 0.8565782308578491,
+      "learning_rate": 7.658823529411765e-05,
+      "loss": 0.277,
+      "step": 635
+    },
+    {
+      "epoch": 37.65,
+      "grad_norm": 0.8459848761558533,
+      "learning_rate": 7.48235294117647e-05,
+      "loss": 0.2756,
+      "step": 640
+    },
+    {
+      "epoch": 37.65,
+      "eval_loss": 0.2685478627681732,
+      "eval_runtime": 1.9444,
+      "eval_samples_per_second": 68.917,
+      "eval_steps_per_second": 8.743,
+      "step": 640
+    },
+    {
+      "epoch": 37.94,
+      "grad_norm": 0.8671796917915344,
+      "learning_rate": 7.305882352941176e-05,
+      "loss": 0.2778,
+      "step": 645
+    },
+    {
+      "epoch": 38.24,
+      "grad_norm": 0.8741558790206909,
+      "learning_rate": 7.129411764705881e-05,
+      "loss": 0.283,
+      "step": 650
+    },
+    {
+      "epoch": 38.53,
+      "grad_norm": 0.9461960792541504,
+      "learning_rate": 6.952941176470587e-05,
+      "loss": 0.2773,
+      "step": 655
+    },
+    {
+      "epoch": 38.82,
+      "grad_norm": 0.9360217452049255,
+      "learning_rate": 6.776470588235293e-05,
+      "loss": 0.2871,
+      "step": 660
+    },
+    {
+      "epoch": 38.82,
+      "eval_loss": 0.26918286085128784,
+      "eval_runtime": 1.9403,
+      "eval_samples_per_second": 69.063,
+      "eval_steps_per_second": 8.762,
+      "step": 660
+    },
+    {
+      "epoch": 39.12,
+      "grad_norm": 0.953507125377655,
+      "learning_rate": 6.599999999999999e-05,
+      "loss": 0.2872,
+      "step": 665
+    },
+    {
+      "epoch": 39.41,
+      "grad_norm": 0.8516682386398315,
+      "learning_rate": 6.423529411764705e-05,
+      "loss": 0.2754,
+      "step": 670
+    },
+    {
+      "epoch": 39.71,
+      "grad_norm": 0.8544163107872009,
+      "learning_rate": 6.247058823529411e-05,
+      "loss": 0.2841,
+      "step": 675
+    },
+    {
+      "epoch": 40.0,
+      "grad_norm": 1.3582571744918823,
+      "learning_rate": 6.0705882352941175e-05,
+      "loss": 0.282,
+      "step": 680
+    },
+    {
+      "epoch": 40.0,
+      "eval_loss": 0.26776203513145447,
+      "eval_runtime": 1.9501,
+      "eval_samples_per_second": 68.716,
+      "eval_steps_per_second": 8.718,
+      "step": 680
+    },
+    {
+      "epoch": 40.29,
+      "grad_norm": 0.8032355904579163,
+      "learning_rate": 5.894117647058823e-05,
+      "loss": 0.2634,
+      "step": 685
+    },
+    {
+      "epoch": 40.59,
+      "grad_norm": 0.7542455792427063,
+      "learning_rate": 5.7176470588235286e-05,
+      "loss": 0.279,
+      "step": 690
+    },
+    {
+      "epoch": 40.88,
+      "grad_norm": 0.835426390171051,
+      "learning_rate": 5.5411764705882345e-05,
+      "loss": 0.2864,
+      "step": 695
+    },
+    {
+      "epoch": 41.18,
+      "grad_norm": 0.8475760817527771,
+      "learning_rate": 5.364705882352941e-05,
+      "loss": 0.2979,
+      "step": 700
+    },
+    {
+      "epoch": 41.18,
+      "eval_loss": 0.26696136593818665,
+      "eval_runtime": 1.9422,
+      "eval_samples_per_second": 68.995,
+      "eval_steps_per_second": 8.753,
+      "step": 700
+    },
+    {
+      "epoch": 41.47,
+      "grad_norm": 0.8202780485153198,
+      "learning_rate": 5.188235294117646e-05,
+      "loss": 0.2752,
+      "step": 705
+    },
+    {
+      "epoch": 41.76,
+      "grad_norm": 0.8360605835914612,
+      "learning_rate": 5.011764705882352e-05,
+      "loss": 0.2813,
+      "step": 710
+    },
+    {
+      "epoch": 42.06,
+      "grad_norm": 0.767460823059082,
+      "learning_rate": 4.835294117647058e-05,
+      "loss": 0.2656,
+      "step": 715
+    },
+    {
+      "epoch": 42.35,
+      "grad_norm": 0.6413666605949402,
+      "learning_rate": 4.658823529411765e-05,
+      "loss": 0.2629,
+      "step": 720
+    },
+    {
+      "epoch": 42.35,
+      "eval_loss": 0.2672065198421478,
+      "eval_runtime": 1.9441,
+      "eval_samples_per_second": 68.926,
+      "eval_steps_per_second": 8.744,
+      "step": 720
+    },
+    {
+      "epoch": 42.65,
+      "grad_norm": 0.9078086018562317,
+      "learning_rate": 4.48235294117647e-05,
+      "loss": 0.2959,
+      "step": 725
+    },
+    {
+      "epoch": 42.94,
+      "grad_norm": 0.7576885223388672,
+      "learning_rate": 4.305882352941176e-05,
+      "loss": 0.2664,
+      "step": 730
+    },
+    {
+      "epoch": 43.24,
+      "grad_norm": 0.7104106545448303,
+      "learning_rate": 4.129411764705882e-05,
+      "loss": 0.2562,
+      "step": 735
+    },
+    {
+      "epoch": 43.53,
+      "grad_norm": 0.8250320553779602,
+      "learning_rate": 3.952941176470588e-05,
+      "loss": 0.2596,
+      "step": 740
+    },
+    {
+      "epoch": 43.53,
+      "eval_loss": 0.2670022249221802,
+      "eval_runtime": 1.9446,
+      "eval_samples_per_second": 68.907,
+      "eval_steps_per_second": 8.742,
+      "step": 740
+    },
+    {
+      "epoch": 43.82,
+      "grad_norm": 0.8341289162635803,
+      "learning_rate": 3.776470588235294e-05,
+      "loss": 0.2809,
+      "step": 745
+    },
+    {
+      "epoch": 44.12,
+      "grad_norm": 0.6533918380737305,
+      "learning_rate": 3.5999999999999994e-05,
+      "loss": 0.2823,
+      "step": 750
+    },
+    {
+      "epoch": 44.41,
+      "grad_norm": 0.773414671421051,
+      "learning_rate": 3.423529411764706e-05,
+      "loss": 0.2944,
+      "step": 755
+    },
+    {
+      "epoch": 44.71,
+      "grad_norm": 0.9755772948265076,
+      "learning_rate": 3.247058823529411e-05,
+      "loss": 0.2833,
+      "step": 760
+    },
+    {
+      "epoch": 44.71,
+      "eval_loss": 0.26673921942710876,
+      "eval_runtime": 1.9452,
+      "eval_samples_per_second": 68.887,
+      "eval_steps_per_second": 8.739,
+      "step": 760
+    },
+    {
+      "epoch": 45.0,
+      "grad_norm": 0.8540541529655457,
+      "learning_rate": 3.070588235294118e-05,
+      "loss": 0.2717,
+      "step": 765
+    },
+    {
+      "epoch": 45.29,
+      "grad_norm": 0.7504986524581909,
+      "learning_rate": 2.894117647058823e-05,
+      "loss": 0.2685,
+      "step": 770
+    },
+    {
+      "epoch": 45.59,
+      "grad_norm": 0.7057090997695923,
+      "learning_rate": 2.7176470588235292e-05,
+      "loss": 0.2669,
+      "step": 775
+    },
+    {
+      "epoch": 45.88,
+      "grad_norm": 0.9183229207992554,
+      "learning_rate": 2.5411764705882348e-05,
+      "loss": 0.2747,
+      "step": 780
+    },
+    {
+      "epoch": 45.88,
+      "eval_loss": 0.26657456159591675,
+      "eval_runtime": 1.9434,
+      "eval_samples_per_second": 68.95,
+      "eval_steps_per_second": 8.747,
+      "step": 780
+    },
+    {
+      "epoch": 46.18,
+      "grad_norm": 0.9822715520858765,
+      "learning_rate": 2.364705882352941e-05,
+      "loss": 0.2844,
+      "step": 785
+    },
+    {
+      "epoch": 46.47,
+      "grad_norm": 0.6618647575378418,
+      "learning_rate": 2.1882352941176466e-05,
+      "loss": 0.2534,
+      "step": 790
+    },
+    {
+      "epoch": 46.76,
+      "grad_norm": 0.8773749470710754,
+      "learning_rate": 2.011764705882353e-05,
+      "loss": 0.2634,
+      "step": 795
+    },
+    {
+      "epoch": 47.06,
+      "grad_norm": 0.7959761619567871,
+      "learning_rate": 1.8352941176470587e-05,
+      "loss": 0.3073,
+      "step": 800
+    },
+    {
+      "epoch": 47.06,
+      "eval_loss": 0.2664162814617157,
+      "eval_runtime": 1.9451,
+      "eval_samples_per_second": 68.891,
+      "eval_steps_per_second": 8.74,
+      "step": 800
+    },
+    {
+      "epoch": 47.35,
+      "grad_norm": 0.8709791302680969,
+      "learning_rate": 1.6588235294117646e-05,
+      "loss": 0.2852,
+      "step": 805
+    },
+    {
+      "epoch": 47.65,
+      "grad_norm": 0.7862871289253235,
+      "learning_rate": 1.4823529411764704e-05,
+      "loss": 0.2687,
+      "step": 810
+    },
+    {
+      "epoch": 47.94,
+      "grad_norm": 0.8266788721084595,
+      "learning_rate": 1.3058823529411763e-05,
+      "loss": 0.2736,
+      "step": 815
+    },
+    {
+      "epoch": 48.24,
+      "grad_norm": 0.926347553730011,
+      "learning_rate": 1.1294117647058822e-05,
+      "loss": 0.2535,
+      "step": 820
+    },
+    {
+      "epoch": 48.24,
+      "eval_loss": 0.2662106454372406,
+      "eval_runtime": 1.9448,
+      "eval_samples_per_second": 68.903,
+      "eval_steps_per_second": 8.741,
+      "step": 820
+    },
+    {
+      "epoch": 48.53,
+      "grad_norm": 1.0417717695236206,
+      "learning_rate": 9.52941176470588e-06,
+      "loss": 0.2793,
+      "step": 825
+    },
+    {
+      "epoch": 48.82,
+      "grad_norm": 0.71202152967453,
+      "learning_rate": 7.764705882352941e-06,
+      "loss": 0.2736,
+      "step": 830
+    },
+    {
+      "epoch": 49.12,
+      "grad_norm": 0.8569315671920776,
+      "learning_rate": 5.999999999999999e-06,
+      "loss": 0.2717,
+      "step": 835
+    },
+    {
+      "epoch": 49.41,
+      "grad_norm": 0.8434327840805054,
+      "learning_rate": 4.235294117647058e-06,
+      "loss": 0.2713,
+      "step": 840
+    },
+    {
+      "epoch": 49.41,
+      "eval_loss": 0.2660675048828125,
+      "eval_runtime": 1.9443,
+      "eval_samples_per_second": 68.92,
+      "eval_steps_per_second": 8.744,
+      "step": 840
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 850,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 50,
+  "save_steps": 20,
+  "total_flos": 7554290335088640.0,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:43433281d9ff055bfecbaed56ac74c2fc38ba6e0c553077df8c74c1c1010d11a
+size 5048