yaswanth-iitkgp committed on
Commit
332b32a
1 Parent(s): e8fa6cc

End of training

Browse files
README.md ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ base_model: HuggingFaceM4/idefics2-8b
4
+ tags:
5
+ - generated_from_trainer
6
+ model-index:
7
+ - name: idefics2_ft_augmented_dataset
8
+ results: []
9
+ ---
10
+
11
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
12
+ should probably proofread and complete it, then remove this comment. -->
13
+
14
+ # idefics2_ft_augmented_dataset
15
+
16
+ This model is a fine-tuned version of [HuggingFaceM4/idefics2-8b](https://huggingface.co/HuggingFaceM4/idefics2-8b) on an unknown dataset.
17
+ It achieves the following results on the evaluation set:
18
+ - Loss: 0.1871
19
+
20
+ ## Model description
21
+
22
+ More information needed
23
+
24
+ ## Intended uses & limitations
25
+
26
+ More information needed
27
+
28
+ ## Training and evaluation data
29
+
30
+ More information needed
31
+
32
+ ## Training procedure
33
+
34
+ ### Training hyperparameters
35
+
36
+ The following hyperparameters were used during training:
37
+ - learning_rate: 2.5e-05
38
+ - train_batch_size: 4
39
+ - eval_batch_size: 8
40
+ - seed: 42
41
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
42
+ - lr_scheduler_type: linear
43
+ - lr_scheduler_warmup_steps: 25
44
+ - training_steps: 5000
45
+
46
+ ### Training results
47
+
48
+ | Training Loss | Epoch | Step | Validation Loss |
49
+ |:-------------:|:------:|:----:|:---------------:|
50
+ | 0.6338 | 0.0234 | 100 | 0.6474 |
51
+ | 0.5748 | 0.0468 | 200 | 0.5899 |
52
+ | 0.4678 | 0.0702 | 300 | 0.5603 |
53
+ | 0.4280 | 0.0936 | 400 | 0.5249 |
54
+ | 0.3798 | 0.1170 | 500 | 0.4984 |
55
+ | 0.3665 | 0.1404 | 600 | 0.4733 |
56
+ | 0.4406 | 0.1637 | 700 | 0.4510 |
57
+ | 0.4723 | 0.1871 | 800 | 0.4245 |
58
+ | 0.4807 | 0.2105 | 900 | 0.4158 |
59
+ | 0.4196 | 0.2339 | 1000 | 0.3971 |
60
+ | 0.3443 | 0.2573 | 1100 | 0.3738 |
61
+ | 0.4133 | 0.2807 | 1200 | 0.3631 |
62
+ | 0.2838 | 0.3041 | 1300 | 0.3334 |
63
+ | 0.4134 | 0.3275 | 1400 | 0.3264 |
64
+ | 0.2838 | 0.3509 | 1500 | 0.3125 |
65
+ | 0.2750 | 0.3743 | 1600 | 0.2944 |
66
+ | 0.4141 | 0.3977 | 1700 | 0.2839 |
67
+ | 0.2498 | 0.4211 | 1800 | 0.2749 |
68
+ | 0.2817 | 0.4444 | 1900 | 0.2606 |
69
+ | 0.2899 | 0.4678 | 2000 | 0.2526 |
70
+ | 0.2695 | 0.4912 | 2100 | 0.2521 |
71
+ | 0.2619 | 0.5146 | 2200 | 0.2424 |
72
+ | 0.2238 | 0.5380 | 2300 | 0.2373 |
73
+ | 0.3049 | 0.5614 | 2400 | 0.2301 |
74
+ | 0.1308 | 0.5848 | 2500 | 0.2292 |
75
+ | 0.1936 | 0.6082 | 2600 | 0.2190 |
76
+ | 0.2479 | 0.6316 | 2700 | 0.2191 |
77
+ | 0.1575 | 0.6550 | 2800 | 0.2165 |
78
+ | 0.1930 | 0.6784 | 2900 | 0.2107 |
79
+ | 0.2526 | 0.7018 | 3000 | 0.2114 |
80
+ | 0.1574 | 0.7251 | 3100 | 0.2087 |
81
+ | 0.1989 | 0.7485 | 3200 | 0.2051 |
82
+ | 0.1761 | 0.7719 | 3300 | 0.2013 |
83
+ | 0.2223 | 0.7953 | 3400 | 0.1996 |
84
+ | 0.2127 | 0.8187 | 3500 | 0.1966 |
85
+ | 0.2477 | 0.8421 | 3600 | 0.1923 |
86
+ | 0.1931 | 0.8655 | 3700 | 0.1908 |
87
+ | 0.1820 | 0.8889 | 3800 | 0.1888 |
88
+ | 0.1693 | 0.9123 | 3900 | 0.1878 |
89
+ | 0.1346 | 0.9357 | 4000 | 0.1853 |
90
+ | 0.1484 | 0.9591 | 4100 | 0.1849 |
91
+ | 0.1217 | 0.9825 | 4200 | 0.1838 |
92
+ | 0.0669 | 1.0058 | 4300 | 0.1844 |
93
+ | 0.1292 | 1.0292 | 4400 | 0.1877 |
94
+ | 0.1106 | 1.0526 | 4500 | 0.1876 |
95
+ | 0.0828 | 1.0760 | 4600 | 0.1875 |
96
+ | 0.0485 | 1.0994 | 4700 | 0.1871 |
97
+ | 0.0624 | 1.1228 | 4800 | 0.1874 |
98
+ | 0.0895 | 1.1462 | 4900 | 0.1871 |
99
+ | 0.1000 | 1.1696 | 5000 | 0.1871 |
100
+
101
+
102
+ ### Framework versions
103
+
104
+ - Transformers 4.41.2
105
+ - Pytorch 2.3.1+cu121
106
+ - Datasets 2.19.2
107
+ - Tokenizers 0.19.1
adapter_config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "HuggingFaceM4/idefics2-8b",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": false,
8
+ "init_lora_weights": "gaussian",
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 64,
14
+ "lora_dropout": 0.1,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 32,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": ".*(text_model|modality_projection|perceiver_resampler).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$",
23
+ "task_type": null,
24
+ "use_dora": true,
25
+ "use_rslora": false
26
+ }
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f4e4e2c8426cd5e7abd1a135f4a29ca6ae37bb43e2a799f9c7721c4c6a13b43
3
+ size 189801888
generation_config.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bad_words_ids": [
4
+ [
5
+ 32000
6
+ ],
7
+ [
8
+ 32001
9
+ ]
10
+ ],
11
+ "bos_token_id": 1,
12
+ "eos_token_id": [
13
+ 2,
14
+ 32002
15
+ ],
16
+ "pad_token_id": 0,
17
+ "transformers_version": "4.41.2"
18
+ }
trainer_state.json ADDED
@@ -0,0 +1,1842 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 1.1695906432748537,
5
+ "eval_steps": 100,
6
+ "global_step": 5000,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.005847953216374269,
13
+ "grad_norm": 19.25,
14
+ "learning_rate": 2.5e-05,
15
+ "loss": 1.3094,
16
+ "step": 25
17
+ },
18
+ {
19
+ "epoch": 0.011695906432748537,
20
+ "grad_norm": 20.515625,
21
+ "learning_rate": 2.4874371859296484e-05,
22
+ "loss": 0.6781,
23
+ "step": 50
24
+ },
25
+ {
26
+ "epoch": 0.017543859649122806,
27
+ "grad_norm": 9.015625,
28
+ "learning_rate": 2.4748743718592964e-05,
29
+ "loss": 0.6127,
30
+ "step": 75
31
+ },
32
+ {
33
+ "epoch": 0.023391812865497075,
34
+ "grad_norm": 9.4765625,
35
+ "learning_rate": 2.462311557788945e-05,
36
+ "loss": 0.6338,
37
+ "step": 100
38
+ },
39
+ {
40
+ "epoch": 0.023391812865497075,
41
+ "eval_loss": 0.6474161744117737,
42
+ "eval_runtime": 125.9492,
43
+ "eval_samples_per_second": 3.573,
44
+ "eval_steps_per_second": 0.453,
45
+ "step": 100
46
+ },
47
+ {
48
+ "epoch": 0.029239766081871343,
49
+ "grad_norm": 16.71875,
50
+ "learning_rate": 2.449748743718593e-05,
51
+ "loss": 0.506,
52
+ "step": 125
53
+ },
54
+ {
55
+ "epoch": 0.03508771929824561,
56
+ "grad_norm": 21.125,
57
+ "learning_rate": 2.4371859296482413e-05,
58
+ "loss": 0.501,
59
+ "step": 150
60
+ },
61
+ {
62
+ "epoch": 0.04093567251461988,
63
+ "grad_norm": 17.953125,
64
+ "learning_rate": 2.4246231155778896e-05,
65
+ "loss": 0.6208,
66
+ "step": 175
67
+ },
68
+ {
69
+ "epoch": 0.04678362573099415,
70
+ "grad_norm": 7.9609375,
71
+ "learning_rate": 2.4120603015075376e-05,
72
+ "loss": 0.5748,
73
+ "step": 200
74
+ },
75
+ {
76
+ "epoch": 0.04678362573099415,
77
+ "eval_loss": 0.5898649096488953,
78
+ "eval_runtime": 129.2682,
79
+ "eval_samples_per_second": 3.481,
80
+ "eval_steps_per_second": 0.441,
81
+ "step": 200
82
+ },
83
+ {
84
+ "epoch": 0.05263157894736842,
85
+ "grad_norm": 7.4921875,
86
+ "learning_rate": 2.3994974874371863e-05,
87
+ "loss": 0.5821,
88
+ "step": 225
89
+ },
90
+ {
91
+ "epoch": 0.05847953216374269,
92
+ "grad_norm": 8.703125,
93
+ "learning_rate": 2.3869346733668342e-05,
94
+ "loss": 0.5764,
95
+ "step": 250
96
+ },
97
+ {
98
+ "epoch": 0.06432748538011696,
99
+ "grad_norm": 10.3828125,
100
+ "learning_rate": 2.3743718592964825e-05,
101
+ "loss": 0.5921,
102
+ "step": 275
103
+ },
104
+ {
105
+ "epoch": 0.07017543859649122,
106
+ "grad_norm": 14.90625,
107
+ "learning_rate": 2.361809045226131e-05,
108
+ "loss": 0.4678,
109
+ "step": 300
110
+ },
111
+ {
112
+ "epoch": 0.07017543859649122,
113
+ "eval_loss": 0.5602560639381409,
114
+ "eval_runtime": 129.0871,
115
+ "eval_samples_per_second": 3.486,
116
+ "eval_steps_per_second": 0.442,
117
+ "step": 300
118
+ },
119
+ {
120
+ "epoch": 0.07602339181286549,
121
+ "grad_norm": 16.6875,
122
+ "learning_rate": 2.3492462311557788e-05,
123
+ "loss": 0.5118,
124
+ "step": 325
125
+ },
126
+ {
127
+ "epoch": 0.08187134502923976,
128
+ "grad_norm": 17.921875,
129
+ "learning_rate": 2.3366834170854275e-05,
130
+ "loss": 0.5199,
131
+ "step": 350
132
+ },
133
+ {
134
+ "epoch": 0.08771929824561403,
135
+ "grad_norm": 16.5625,
136
+ "learning_rate": 2.3241206030150754e-05,
137
+ "loss": 0.4611,
138
+ "step": 375
139
+ },
140
+ {
141
+ "epoch": 0.0935672514619883,
142
+ "grad_norm": 10.5,
143
+ "learning_rate": 2.3115577889447238e-05,
144
+ "loss": 0.428,
145
+ "step": 400
146
+ },
147
+ {
148
+ "epoch": 0.0935672514619883,
149
+ "eval_loss": 0.524933397769928,
150
+ "eval_runtime": 127.1828,
151
+ "eval_samples_per_second": 3.538,
152
+ "eval_steps_per_second": 0.448,
153
+ "step": 400
154
+ },
155
+ {
156
+ "epoch": 0.09941520467836257,
157
+ "grad_norm": 11.6328125,
158
+ "learning_rate": 2.298994974874372e-05,
159
+ "loss": 0.4953,
160
+ "step": 425
161
+ },
162
+ {
163
+ "epoch": 0.10526315789473684,
164
+ "grad_norm": 5.4296875,
165
+ "learning_rate": 2.28643216080402e-05,
166
+ "loss": 0.4173,
167
+ "step": 450
168
+ },
169
+ {
170
+ "epoch": 0.1111111111111111,
171
+ "grad_norm": 8.8125,
172
+ "learning_rate": 2.2738693467336687e-05,
173
+ "loss": 0.4225,
174
+ "step": 475
175
+ },
176
+ {
177
+ "epoch": 0.11695906432748537,
178
+ "grad_norm": 8.84375,
179
+ "learning_rate": 2.2613065326633167e-05,
180
+ "loss": 0.3798,
181
+ "step": 500
182
+ },
183
+ {
184
+ "epoch": 0.11695906432748537,
185
+ "eval_loss": 0.49835124611854553,
186
+ "eval_runtime": 128.3365,
187
+ "eval_samples_per_second": 3.506,
188
+ "eval_steps_per_second": 0.444,
189
+ "step": 500
190
+ },
191
+ {
192
+ "epoch": 0.12280701754385964,
193
+ "grad_norm": 11.4609375,
194
+ "learning_rate": 2.248743718592965e-05,
195
+ "loss": 0.4386,
196
+ "step": 525
197
+ },
198
+ {
199
+ "epoch": 0.1286549707602339,
200
+ "grad_norm": 16.4375,
201
+ "learning_rate": 2.2361809045226133e-05,
202
+ "loss": 0.4537,
203
+ "step": 550
204
+ },
205
+ {
206
+ "epoch": 0.13450292397660818,
207
+ "grad_norm": 10.140625,
208
+ "learning_rate": 2.2236180904522613e-05,
209
+ "loss": 0.3514,
210
+ "step": 575
211
+ },
212
+ {
213
+ "epoch": 0.14035087719298245,
214
+ "grad_norm": 3.265625,
215
+ "learning_rate": 2.21105527638191e-05,
216
+ "loss": 0.3665,
217
+ "step": 600
218
+ },
219
+ {
220
+ "epoch": 0.14035087719298245,
221
+ "eval_loss": 0.47331658005714417,
222
+ "eval_runtime": 128.8126,
223
+ "eval_samples_per_second": 3.493,
224
+ "eval_steps_per_second": 0.443,
225
+ "step": 600
226
+ },
227
+ {
228
+ "epoch": 0.14619883040935672,
229
+ "grad_norm": 12.3046875,
230
+ "learning_rate": 2.198492462311558e-05,
231
+ "loss": 0.2409,
232
+ "step": 625
233
+ },
234
+ {
235
+ "epoch": 0.15204678362573099,
236
+ "grad_norm": 9.703125,
237
+ "learning_rate": 2.1859296482412062e-05,
238
+ "loss": 0.4553,
239
+ "step": 650
240
+ },
241
+ {
242
+ "epoch": 0.15789473684210525,
243
+ "grad_norm": 10.65625,
244
+ "learning_rate": 2.1733668341708545e-05,
245
+ "loss": 0.4242,
246
+ "step": 675
247
+ },
248
+ {
249
+ "epoch": 0.16374269005847952,
250
+ "grad_norm": 4.96484375,
251
+ "learning_rate": 2.1608040201005025e-05,
252
+ "loss": 0.4406,
253
+ "step": 700
254
+ },
255
+ {
256
+ "epoch": 0.16374269005847952,
257
+ "eval_loss": 0.45101454854011536,
258
+ "eval_runtime": 129.1251,
259
+ "eval_samples_per_second": 3.485,
260
+ "eval_steps_per_second": 0.441,
261
+ "step": 700
262
+ },
263
+ {
264
+ "epoch": 0.1695906432748538,
265
+ "grad_norm": 6.20703125,
266
+ "learning_rate": 2.1482412060301508e-05,
267
+ "loss": 0.408,
268
+ "step": 725
269
+ },
270
+ {
271
+ "epoch": 0.17543859649122806,
272
+ "grad_norm": 9.875,
273
+ "learning_rate": 2.135678391959799e-05,
274
+ "loss": 0.5067,
275
+ "step": 750
276
+ },
277
+ {
278
+ "epoch": 0.18128654970760233,
279
+ "grad_norm": 7.1875,
280
+ "learning_rate": 2.1231155778894474e-05,
281
+ "loss": 0.4338,
282
+ "step": 775
283
+ },
284
+ {
285
+ "epoch": 0.1871345029239766,
286
+ "grad_norm": 12.765625,
287
+ "learning_rate": 2.1105527638190957e-05,
288
+ "loss": 0.4723,
289
+ "step": 800
290
+ },
291
+ {
292
+ "epoch": 0.1871345029239766,
293
+ "eval_loss": 0.4244983494281769,
294
+ "eval_runtime": 127.3581,
295
+ "eval_samples_per_second": 3.533,
296
+ "eval_steps_per_second": 0.448,
297
+ "step": 800
298
+ },
299
+ {
300
+ "epoch": 0.19298245614035087,
301
+ "grad_norm": 26.03125,
302
+ "learning_rate": 2.0979899497487437e-05,
303
+ "loss": 0.4424,
304
+ "step": 825
305
+ },
306
+ {
307
+ "epoch": 0.19883040935672514,
308
+ "grad_norm": 5.234375,
309
+ "learning_rate": 2.085427135678392e-05,
310
+ "loss": 0.3599,
311
+ "step": 850
312
+ },
313
+ {
314
+ "epoch": 0.2046783625730994,
315
+ "grad_norm": 4.1875,
316
+ "learning_rate": 2.0728643216080403e-05,
317
+ "loss": 0.3399,
318
+ "step": 875
319
+ },
320
+ {
321
+ "epoch": 0.21052631578947367,
322
+ "grad_norm": 9.6640625,
323
+ "learning_rate": 2.0603015075376886e-05,
324
+ "loss": 0.4807,
325
+ "step": 900
326
+ },
327
+ {
328
+ "epoch": 0.21052631578947367,
329
+ "eval_loss": 0.41580215096473694,
330
+ "eval_runtime": 128.5483,
331
+ "eval_samples_per_second": 3.501,
332
+ "eval_steps_per_second": 0.443,
333
+ "step": 900
334
+ },
335
+ {
336
+ "epoch": 0.21637426900584794,
337
+ "grad_norm": 9.03125,
338
+ "learning_rate": 2.047738693467337e-05,
339
+ "loss": 0.3341,
340
+ "step": 925
341
+ },
342
+ {
343
+ "epoch": 0.2222222222222222,
344
+ "grad_norm": 10.3515625,
345
+ "learning_rate": 2.035175879396985e-05,
346
+ "loss": 0.4407,
347
+ "step": 950
348
+ },
349
+ {
350
+ "epoch": 0.22807017543859648,
351
+ "grad_norm": 13.046875,
352
+ "learning_rate": 2.0226130653266332e-05,
353
+ "loss": 0.391,
354
+ "step": 975
355
+ },
356
+ {
357
+ "epoch": 0.23391812865497075,
358
+ "grad_norm": 13.0859375,
359
+ "learning_rate": 2.0100502512562815e-05,
360
+ "loss": 0.4196,
361
+ "step": 1000
362
+ },
363
+ {
364
+ "epoch": 0.23391812865497075,
365
+ "eval_loss": 0.39709773659706116,
366
+ "eval_runtime": 129.6284,
367
+ "eval_samples_per_second": 3.471,
368
+ "eval_steps_per_second": 0.44,
369
+ "step": 1000
370
+ },
371
+ {
372
+ "epoch": 0.23976608187134502,
373
+ "grad_norm": 2.962890625,
374
+ "learning_rate": 1.9974874371859298e-05,
375
+ "loss": 0.3042,
376
+ "step": 1025
377
+ },
378
+ {
379
+ "epoch": 0.24561403508771928,
380
+ "grad_norm": 11.9765625,
381
+ "learning_rate": 1.984924623115578e-05,
382
+ "loss": 0.3731,
383
+ "step": 1050
384
+ },
385
+ {
386
+ "epoch": 0.25146198830409355,
387
+ "grad_norm": 5.42578125,
388
+ "learning_rate": 1.972361809045226e-05,
389
+ "loss": 0.3032,
390
+ "step": 1075
391
+ },
392
+ {
393
+ "epoch": 0.2573099415204678,
394
+ "grad_norm": 8.9140625,
395
+ "learning_rate": 1.9597989949748744e-05,
396
+ "loss": 0.3443,
397
+ "step": 1100
398
+ },
399
+ {
400
+ "epoch": 0.2573099415204678,
401
+ "eval_loss": 0.3737768530845642,
402
+ "eval_runtime": 126.3694,
403
+ "eval_samples_per_second": 3.561,
404
+ "eval_steps_per_second": 0.451,
405
+ "step": 1100
406
+ },
407
+ {
408
+ "epoch": 0.2631578947368421,
409
+ "grad_norm": 14.0625,
410
+ "learning_rate": 1.9472361809045227e-05,
411
+ "loss": 0.2804,
412
+ "step": 1125
413
+ },
414
+ {
415
+ "epoch": 0.26900584795321636,
416
+ "grad_norm": 5.19921875,
417
+ "learning_rate": 1.934673366834171e-05,
418
+ "loss": 0.3257,
419
+ "step": 1150
420
+ },
421
+ {
422
+ "epoch": 0.27485380116959063,
423
+ "grad_norm": 9.78125,
424
+ "learning_rate": 1.9221105527638193e-05,
425
+ "loss": 0.3625,
426
+ "step": 1175
427
+ },
428
+ {
429
+ "epoch": 0.2807017543859649,
430
+ "grad_norm": 12.890625,
431
+ "learning_rate": 1.9095477386934673e-05,
432
+ "loss": 0.4133,
433
+ "step": 1200
434
+ },
435
+ {
436
+ "epoch": 0.2807017543859649,
437
+ "eval_loss": 0.3631095290184021,
438
+ "eval_runtime": 125.638,
439
+ "eval_samples_per_second": 3.582,
440
+ "eval_steps_per_second": 0.454,
441
+ "step": 1200
442
+ },
443
+ {
444
+ "epoch": 0.28654970760233917,
445
+ "grad_norm": 7.71875,
446
+ "learning_rate": 1.8969849246231156e-05,
447
+ "loss": 0.5163,
448
+ "step": 1225
449
+ },
450
+ {
451
+ "epoch": 0.29239766081871343,
452
+ "grad_norm": 7.25390625,
453
+ "learning_rate": 1.884422110552764e-05,
454
+ "loss": 0.2991,
455
+ "step": 1250
456
+ },
457
+ {
458
+ "epoch": 0.2982456140350877,
459
+ "grad_norm": 6.5234375,
460
+ "learning_rate": 1.8718592964824123e-05,
461
+ "loss": 0.431,
462
+ "step": 1275
463
+ },
464
+ {
465
+ "epoch": 0.30409356725146197,
466
+ "grad_norm": 3.8515625,
467
+ "learning_rate": 1.8592964824120602e-05,
468
+ "loss": 0.2838,
469
+ "step": 1300
470
+ },
471
+ {
472
+ "epoch": 0.30409356725146197,
473
+ "eval_loss": 0.33338162302970886,
474
+ "eval_runtime": 127.3115,
475
+ "eval_samples_per_second": 3.535,
476
+ "eval_steps_per_second": 0.448,
477
+ "step": 1300
478
+ },
479
+ {
480
+ "epoch": 0.30994152046783624,
481
+ "grad_norm": 9.828125,
482
+ "learning_rate": 1.8467336683417085e-05,
483
+ "loss": 0.2653,
484
+ "step": 1325
485
+ },
486
+ {
487
+ "epoch": 0.3157894736842105,
488
+ "grad_norm": 5.34765625,
489
+ "learning_rate": 1.834170854271357e-05,
490
+ "loss": 0.2515,
491
+ "step": 1350
492
+ },
493
+ {
494
+ "epoch": 0.3216374269005848,
495
+ "grad_norm": 12.15625,
496
+ "learning_rate": 1.821608040201005e-05,
497
+ "loss": 0.2983,
498
+ "step": 1375
499
+ },
500
+ {
501
+ "epoch": 0.32748538011695905,
502
+ "grad_norm": 9.171875,
503
+ "learning_rate": 1.8090452261306535e-05,
504
+ "loss": 0.4134,
505
+ "step": 1400
506
+ },
507
+ {
508
+ "epoch": 0.32748538011695905,
509
+ "eval_loss": 0.32642024755477905,
510
+ "eval_runtime": 127.196,
511
+ "eval_samples_per_second": 3.538,
512
+ "eval_steps_per_second": 0.448,
513
+ "step": 1400
514
+ },
515
+ {
516
+ "epoch": 0.3333333333333333,
517
+ "grad_norm": 7.16015625,
518
+ "learning_rate": 1.7964824120603014e-05,
519
+ "loss": 0.3015,
520
+ "step": 1425
521
+ },
522
+ {
523
+ "epoch": 0.3391812865497076,
524
+ "grad_norm": 15.6640625,
525
+ "learning_rate": 1.7839195979899497e-05,
526
+ "loss": 0.3098,
527
+ "step": 1450
528
+ },
529
+ {
530
+ "epoch": 0.34502923976608185,
531
+ "grad_norm": 13.8984375,
532
+ "learning_rate": 1.771356783919598e-05,
533
+ "loss": 0.4022,
534
+ "step": 1475
535
+ },
536
+ {
537
+ "epoch": 0.3508771929824561,
538
+ "grad_norm": 9.90625,
539
+ "learning_rate": 1.7587939698492464e-05,
540
+ "loss": 0.2838,
541
+ "step": 1500
542
+ },
543
+ {
544
+ "epoch": 0.3508771929824561,
545
+ "eval_loss": 0.3124904930591583,
546
+ "eval_runtime": 128.6196,
547
+ "eval_samples_per_second": 3.499,
548
+ "eval_steps_per_second": 0.443,
549
+ "step": 1500
550
+ },
551
+ {
552
+ "epoch": 0.3567251461988304,
553
+ "grad_norm": 18.234375,
554
+ "learning_rate": 1.7462311557788947e-05,
555
+ "loss": 0.3792,
556
+ "step": 1525
557
+ },
558
+ {
559
+ "epoch": 0.36257309941520466,
560
+ "grad_norm": 1.482421875,
561
+ "learning_rate": 1.7336683417085427e-05,
562
+ "loss": 0.4708,
563
+ "step": 1550
564
+ },
565
+ {
566
+ "epoch": 0.3684210526315789,
567
+ "grad_norm": 10.8046875,
568
+ "learning_rate": 1.721105527638191e-05,
569
+ "loss": 0.3695,
570
+ "step": 1575
571
+ },
572
+ {
573
+ "epoch": 0.3742690058479532,
574
+ "grad_norm": 13.6328125,
575
+ "learning_rate": 1.7085427135678393e-05,
576
+ "loss": 0.275,
577
+ "step": 1600
578
+ },
579
+ {
580
+ "epoch": 0.3742690058479532,
581
+ "eval_loss": 0.294376403093338,
582
+ "eval_runtime": 125.5965,
583
+ "eval_samples_per_second": 3.583,
584
+ "eval_steps_per_second": 0.454,
585
+ "step": 1600
586
+ },
587
+ {
588
+ "epoch": 0.38011695906432746,
589
+ "grad_norm": 10.3359375,
590
+ "learning_rate": 1.6959798994974876e-05,
591
+ "loss": 0.3134,
592
+ "step": 1625
593
+ },
594
+ {
595
+ "epoch": 0.38596491228070173,
596
+ "grad_norm": 4.671875,
597
+ "learning_rate": 1.683417085427136e-05,
598
+ "loss": 0.1961,
599
+ "step": 1650
600
+ },
601
+ {
602
+ "epoch": 0.391812865497076,
603
+ "grad_norm": 3.734375,
604
+ "learning_rate": 1.670854271356784e-05,
605
+ "loss": 0.2686,
606
+ "step": 1675
607
+ },
608
+ {
609
+ "epoch": 0.39766081871345027,
610
+ "grad_norm": 11.5859375,
611
+ "learning_rate": 1.6582914572864322e-05,
612
+ "loss": 0.4141,
613
+ "step": 1700
614
+ },
615
+ {
616
+ "epoch": 0.39766081871345027,
617
+ "eval_loss": 0.2839096784591675,
618
+ "eval_runtime": 126.4222,
619
+ "eval_samples_per_second": 3.56,
620
+ "eval_steps_per_second": 0.451,
621
+ "step": 1700
622
+ },
623
+ {
624
+ "epoch": 0.40350877192982454,
625
+ "grad_norm": 12.0234375,
626
+ "learning_rate": 1.6457286432160805e-05,
627
+ "loss": 0.2948,
628
+ "step": 1725
629
+ },
630
+ {
631
+ "epoch": 0.4093567251461988,
632
+ "grad_norm": 6.44140625,
633
+ "learning_rate": 1.6331658291457288e-05,
634
+ "loss": 0.3546,
635
+ "step": 1750
636
+ },
637
+ {
638
+ "epoch": 0.4152046783625731,
639
+ "grad_norm": 7.5234375,
640
+ "learning_rate": 1.620603015075377e-05,
641
+ "loss": 0.3239,
642
+ "step": 1775
643
+ },
644
+ {
645
+ "epoch": 0.42105263157894735,
646
+ "grad_norm": 4.30078125,
647
+ "learning_rate": 1.608040201005025e-05,
648
+ "loss": 0.2498,
649
+ "step": 1800
650
+ },
651
+ {
652
+ "epoch": 0.42105263157894735,
653
+ "eval_loss": 0.2749183773994446,
654
+ "eval_runtime": 126.776,
655
+ "eval_samples_per_second": 3.55,
656
+ "eval_steps_per_second": 0.45,
657
+ "step": 1800
658
+ },
659
+ {
660
+ "epoch": 0.4269005847953216,
661
+ "grad_norm": 8.4375,
662
+ "learning_rate": 1.5954773869346734e-05,
663
+ "loss": 0.3026,
664
+ "step": 1825
665
+ },
666
+ {
667
+ "epoch": 0.4327485380116959,
668
+ "grad_norm": 8.140625,
669
+ "learning_rate": 1.5829145728643217e-05,
670
+ "loss": 0.3848,
671
+ "step": 1850
672
+ },
673
+ {
674
+ "epoch": 0.43859649122807015,
675
+ "grad_norm": 10.046875,
676
+ "learning_rate": 1.57035175879397e-05,
677
+ "loss": 0.2574,
678
+ "step": 1875
679
+ },
680
+ {
681
+ "epoch": 0.4444444444444444,
682
+ "grad_norm": 9.1640625,
683
+ "learning_rate": 1.5577889447236183e-05,
684
+ "loss": 0.2817,
685
+ "step": 1900
686
+ },
687
+ {
688
+ "epoch": 0.4444444444444444,
689
+ "eval_loss": 0.2605937719345093,
690
+ "eval_runtime": 129.5889,
691
+ "eval_samples_per_second": 3.473,
692
+ "eval_steps_per_second": 0.44,
693
+ "step": 1900
694
+ },
695
+ {
696
+ "epoch": 0.4502923976608187,
697
+ "grad_norm": 11.3828125,
698
+ "learning_rate": 1.5452261306532663e-05,
699
+ "loss": 0.376,
700
+ "step": 1925
701
+ },
702
+ {
703
+ "epoch": 0.45614035087719296,
704
+ "grad_norm": 9.6953125,
705
+ "learning_rate": 1.5326633165829146e-05,
706
+ "loss": 0.2708,
707
+ "step": 1950
708
+ },
709
+ {
710
+ "epoch": 0.4619883040935672,
711
+ "grad_norm": 10.0859375,
712
+ "learning_rate": 1.5201005025125627e-05,
713
+ "loss": 0.2509,
714
+ "step": 1975
715
+ },
716
+ {
717
+ "epoch": 0.4678362573099415,
718
+ "grad_norm": 11.4921875,
719
+ "learning_rate": 1.507537688442211e-05,
720
+ "loss": 0.2899,
721
+ "step": 2000
722
+ },
723
+ {
724
+ "epoch": 0.4678362573099415,
725
+ "eval_loss": 0.2526080906391144,
726
+ "eval_runtime": 127.4275,
727
+ "eval_samples_per_second": 3.531,
728
+ "eval_steps_per_second": 0.447,
729
+ "step": 2000
730
+ },
731
+ {
732
+ "epoch": 0.47368421052631576,
733
+ "grad_norm": 10.1796875,
734
+ "learning_rate": 1.4949748743718595e-05,
735
+ "loss": 0.3491,
736
+ "step": 2025
737
+ },
738
+ {
739
+ "epoch": 0.47953216374269003,
740
+ "grad_norm": 4.72265625,
741
+ "learning_rate": 1.4824120603015077e-05,
742
+ "loss": 0.1925,
743
+ "step": 2050
744
+ },
745
+ {
746
+ "epoch": 0.4853801169590643,
747
+ "grad_norm": 6.58203125,
748
+ "learning_rate": 1.4698492462311558e-05,
749
+ "loss": 0.2187,
750
+ "step": 2075
751
+ },
752
+ {
753
+ "epoch": 0.49122807017543857,
754
+ "grad_norm": 11.4609375,
755
+ "learning_rate": 1.457286432160804e-05,
756
+ "loss": 0.2695,
757
+ "step": 2100
758
+ },
759
+ {
760
+ "epoch": 0.49122807017543857,
761
+ "eval_loss": 0.2520677149295807,
762
+ "eval_runtime": 129.417,
763
+ "eval_samples_per_second": 3.477,
764
+ "eval_steps_per_second": 0.44,
765
+ "step": 2100
766
+ },
767
+ {
768
+ "epoch": 0.49707602339181284,
769
+ "grad_norm": 4.5,
770
+ "learning_rate": 1.4447236180904523e-05,
771
+ "loss": 0.1898,
772
+ "step": 2125
773
+ },
774
+ {
775
+ "epoch": 0.5029239766081871,
776
+ "grad_norm": 7.640625,
777
+ "learning_rate": 1.4321608040201007e-05,
778
+ "loss": 0.3,
779
+ "step": 2150
780
+ },
781
+ {
782
+ "epoch": 0.5087719298245614,
783
+ "grad_norm": 3.82421875,
784
+ "learning_rate": 1.4195979899497489e-05,
785
+ "loss": 0.2006,
786
+ "step": 2175
787
+ },
788
+ {
789
+ "epoch": 0.5146198830409356,
790
+ "grad_norm": 3.509765625,
791
+ "learning_rate": 1.407035175879397e-05,
792
+ "loss": 0.2619,
793
+ "step": 2200
794
+ },
795
+ {
796
+ "epoch": 0.5146198830409356,
797
+ "eval_loss": 0.24239127337932587,
798
+ "eval_runtime": 125.4976,
799
+ "eval_samples_per_second": 3.586,
800
+ "eval_steps_per_second": 0.454,
801
+ "step": 2200
802
+ },
803
+ {
804
+ "epoch": 0.52046783625731,
805
+ "grad_norm": 7.35546875,
806
+ "learning_rate": 1.3944723618090452e-05,
807
+ "loss": 0.3202,
808
+ "step": 2225
809
+ },
810
+ {
811
+ "epoch": 0.5263157894736842,
812
+ "grad_norm": 5.37109375,
813
+ "learning_rate": 1.3819095477386935e-05,
814
+ "loss": 0.143,
815
+ "step": 2250
816
+ },
817
+ {
818
+ "epoch": 0.5321637426900585,
819
+ "grad_norm": 13.203125,
820
+ "learning_rate": 1.369346733668342e-05,
821
+ "loss": 0.3726,
822
+ "step": 2275
823
+ },
824
+ {
825
+ "epoch": 0.5380116959064327,
826
+ "grad_norm": 7.08984375,
827
+ "learning_rate": 1.3567839195979901e-05,
828
+ "loss": 0.2238,
829
+ "step": 2300
830
+ },
831
+ {
832
+ "epoch": 0.5380116959064327,
833
+ "eval_loss": 0.2372661828994751,
834
+ "eval_runtime": 125.8271,
835
+ "eval_samples_per_second": 3.576,
836
+ "eval_steps_per_second": 0.453,
837
+ "step": 2300
838
+ },
839
+ {
840
+ "epoch": 0.543859649122807,
841
+ "grad_norm": 7.73046875,
842
+ "learning_rate": 1.3442211055276382e-05,
843
+ "loss": 0.2301,
844
+ "step": 2325
845
+ },
846
+ {
847
+ "epoch": 0.5497076023391813,
848
+ "grad_norm": 1.9326171875,
849
+ "learning_rate": 1.3316582914572864e-05,
850
+ "loss": 0.2891,
851
+ "step": 2350
852
+ },
853
+ {
854
+ "epoch": 0.5555555555555556,
855
+ "grad_norm": 5.25,
856
+ "learning_rate": 1.3190954773869347e-05,
857
+ "loss": 0.1861,
858
+ "step": 2375
859
+ },
860
+ {
861
+ "epoch": 0.5614035087719298,
862
+ "grad_norm": 15.7265625,
863
+ "learning_rate": 1.306532663316583e-05,
864
+ "loss": 0.3049,
865
+ "step": 2400
866
+ },
867
+ {
868
+ "epoch": 0.5614035087719298,
869
+ "eval_loss": 0.23007704317569733,
870
+ "eval_runtime": 125.6345,
871
+ "eval_samples_per_second": 3.582,
872
+ "eval_steps_per_second": 0.454,
873
+ "step": 2400
874
+ },
875
+ {
876
+ "epoch": 0.5672514619883041,
877
+ "grad_norm": 12.0078125,
878
+ "learning_rate": 1.2939698492462313e-05,
879
+ "loss": 0.2584,
880
+ "step": 2425
881
+ },
882
+ {
883
+ "epoch": 0.5730994152046783,
884
+ "grad_norm": 11.8125,
885
+ "learning_rate": 1.2814070351758795e-05,
886
+ "loss": 0.29,
887
+ "step": 2450
888
+ },
889
+ {
890
+ "epoch": 0.5789473684210527,
891
+ "grad_norm": 4.7578125,
892
+ "learning_rate": 1.2688442211055276e-05,
893
+ "loss": 0.2648,
894
+ "step": 2475
895
+ },
896
+ {
897
+ "epoch": 0.5847953216374269,
898
+ "grad_norm": 10.53125,
899
+ "learning_rate": 1.2562814070351759e-05,
900
+ "loss": 0.1308,
901
+ "step": 2500
902
+ },
903
+ {
904
+ "epoch": 0.5847953216374269,
905
+ "eval_loss": 0.2292058765888214,
906
+ "eval_runtime": 127.785,
907
+ "eval_samples_per_second": 3.522,
908
+ "eval_steps_per_second": 0.446,
909
+ "step": 2500
910
+ },
911
+ {
912
+ "epoch": 0.5906432748538012,
913
+ "grad_norm": 8.421875,
914
+ "learning_rate": 1.2437185929648242e-05,
915
+ "loss": 0.3249,
916
+ "step": 2525
917
+ },
918
+ {
919
+ "epoch": 0.5964912280701754,
920
+ "grad_norm": 10.8828125,
921
+ "learning_rate": 1.2311557788944725e-05,
922
+ "loss": 0.2164,
923
+ "step": 2550
924
+ },
925
+ {
926
+ "epoch": 0.6023391812865497,
927
+ "grad_norm": 7.421875,
928
+ "learning_rate": 1.2185929648241207e-05,
929
+ "loss": 0.2512,
930
+ "step": 2575
931
+ },
932
+ {
933
+ "epoch": 0.6081871345029239,
934
+ "grad_norm": 9.6171875,
935
+ "learning_rate": 1.2060301507537688e-05,
936
+ "loss": 0.1936,
937
+ "step": 2600
938
+ },
939
+ {
940
+ "epoch": 0.6081871345029239,
941
+ "eval_loss": 0.21902820467948914,
942
+ "eval_runtime": 125.3191,
943
+ "eval_samples_per_second": 3.591,
944
+ "eval_steps_per_second": 0.455,
945
+ "step": 2600
946
+ },
947
+ {
948
+ "epoch": 0.6140350877192983,
949
+ "grad_norm": 1.091796875,
950
+ "learning_rate": 1.1934673366834171e-05,
951
+ "loss": 0.189,
952
+ "step": 2625
953
+ },
954
+ {
955
+ "epoch": 0.6198830409356725,
956
+ "grad_norm": 7.83203125,
957
+ "learning_rate": 1.1809045226130654e-05,
958
+ "loss": 0.2179,
959
+ "step": 2650
960
+ },
961
+ {
962
+ "epoch": 0.6257309941520468,
963
+ "grad_norm": 8.2109375,
964
+ "learning_rate": 1.1683417085427137e-05,
965
+ "loss": 0.224,
966
+ "step": 2675
967
+ },
968
+ {
969
+ "epoch": 0.631578947368421,
970
+ "grad_norm": 5.1640625,
971
+ "learning_rate": 1.1557788944723619e-05,
972
+ "loss": 0.2479,
973
+ "step": 2700
974
+ },
975
+ {
976
+ "epoch": 0.631578947368421,
977
+ "eval_loss": 0.21907079219818115,
978
+ "eval_runtime": 126.3038,
979
+ "eval_samples_per_second": 3.563,
980
+ "eval_steps_per_second": 0.451,
981
+ "step": 2700
982
+ },
983
+ {
984
+ "epoch": 0.6374269005847953,
985
+ "grad_norm": 4.47265625,
986
+ "learning_rate": 1.14321608040201e-05,
987
+ "loss": 0.219,
988
+ "step": 2725
989
+ },
990
+ {
991
+ "epoch": 0.6432748538011696,
992
+ "grad_norm": 3.30078125,
993
+ "learning_rate": 1.1306532663316583e-05,
994
+ "loss": 0.2741,
995
+ "step": 2750
996
+ },
997
+ {
998
+ "epoch": 0.6491228070175439,
999
+ "grad_norm": 4.21484375,
1000
+ "learning_rate": 1.1180904522613066e-05,
1001
+ "loss": 0.1153,
1002
+ "step": 2775
1003
+ },
1004
+ {
1005
+ "epoch": 0.6549707602339181,
1006
+ "grad_norm": 5.125,
1007
+ "learning_rate": 1.105527638190955e-05,
1008
+ "loss": 0.1575,
1009
+ "step": 2800
1010
+ },
1011
+ {
1012
+ "epoch": 0.6549707602339181,
1013
+ "eval_loss": 0.2165260761976242,
1014
+ "eval_runtime": 125.8096,
1015
+ "eval_samples_per_second": 3.577,
1016
+ "eval_steps_per_second": 0.453,
1017
+ "step": 2800
1018
+ },
1019
+ {
1020
+ "epoch": 0.6608187134502924,
1021
+ "grad_norm": 0.83837890625,
1022
+ "learning_rate": 1.0929648241206031e-05,
1023
+ "loss": 0.2238,
1024
+ "step": 2825
1025
+ },
1026
+ {
1027
+ "epoch": 0.6666666666666666,
1028
+ "grad_norm": 6.61328125,
1029
+ "learning_rate": 1.0804020100502512e-05,
1030
+ "loss": 0.3554,
1031
+ "step": 2850
1032
+ },
1033
+ {
1034
+ "epoch": 0.672514619883041,
1035
+ "grad_norm": 12.875,
1036
+ "learning_rate": 1.0678391959798995e-05,
1037
+ "loss": 0.096,
1038
+ "step": 2875
1039
+ },
1040
+ {
1041
+ "epoch": 0.6783625730994152,
1042
+ "grad_norm": 3.28125,
1043
+ "learning_rate": 1.0552763819095479e-05,
1044
+ "loss": 0.193,
1045
+ "step": 2900
1046
+ },
1047
+ {
1048
+ "epoch": 0.6783625730994152,
1049
+ "eval_loss": 0.21065188944339752,
1050
+ "eval_runtime": 125.6087,
1051
+ "eval_samples_per_second": 3.583,
1052
+ "eval_steps_per_second": 0.454,
1053
+ "step": 2900
1054
+ },
1055
+ {
1056
+ "epoch": 0.6842105263157895,
1057
+ "grad_norm": 8.7265625,
1058
+ "learning_rate": 1.042713567839196e-05,
1059
+ "loss": 0.1936,
1060
+ "step": 2925
1061
+ },
1062
+ {
1063
+ "epoch": 0.6900584795321637,
1064
+ "grad_norm": 4.0859375,
1065
+ "learning_rate": 1.0301507537688443e-05,
1066
+ "loss": 0.2103,
1067
+ "step": 2950
1068
+ },
1069
+ {
1070
+ "epoch": 0.695906432748538,
1071
+ "grad_norm": 15.5703125,
1072
+ "learning_rate": 1.0175879396984924e-05,
1073
+ "loss": 0.2701,
1074
+ "step": 2975
1075
+ },
1076
+ {
1077
+ "epoch": 0.7017543859649122,
1078
+ "grad_norm": 3.73046875,
1079
+ "learning_rate": 1.0050251256281408e-05,
1080
+ "loss": 0.2526,
1081
+ "step": 3000
1082
+ },
1083
+ {
1084
+ "epoch": 0.7017543859649122,
1085
+ "eval_loss": 0.21144379675388336,
1086
+ "eval_runtime": 128.1153,
1087
+ "eval_samples_per_second": 3.512,
1088
+ "eval_steps_per_second": 0.445,
1089
+ "step": 3000
1090
+ },
1091
+ {
1092
+ "epoch": 0.7076023391812866,
1093
+ "grad_norm": 1.537109375,
1094
+ "learning_rate": 9.92462311557789e-06,
1095
+ "loss": 0.2087,
1096
+ "step": 3025
1097
+ },
1098
+ {
1099
+ "epoch": 0.7134502923976608,
1100
+ "grad_norm": 0.55908203125,
1101
+ "learning_rate": 9.798994974874372e-06,
1102
+ "loss": 0.1532,
1103
+ "step": 3050
1104
+ },
1105
+ {
1106
+ "epoch": 0.7192982456140351,
1107
+ "grad_norm": 9.7578125,
1108
+ "learning_rate": 9.673366834170855e-06,
1109
+ "loss": 0.1985,
1110
+ "step": 3075
1111
+ },
1112
+ {
1113
+ "epoch": 0.7251461988304093,
1114
+ "grad_norm": 9.390625,
1115
+ "learning_rate": 9.547738693467337e-06,
1116
+ "loss": 0.1574,
1117
+ "step": 3100
1118
+ },
1119
+ {
1120
+ "epoch": 0.7251461988304093,
1121
+ "eval_loss": 0.20868618786334991,
1122
+ "eval_runtime": 129.4237,
1123
+ "eval_samples_per_second": 3.477,
1124
+ "eval_steps_per_second": 0.44,
1125
+ "step": 3100
1126
+ },
1127
+ {
1128
+ "epoch": 0.7309941520467836,
1129
+ "grad_norm": 4.6171875,
1130
+ "learning_rate": 9.42211055276382e-06,
1131
+ "loss": 0.2921,
1132
+ "step": 3125
1133
+ },
1134
+ {
1135
+ "epoch": 0.7368421052631579,
1136
+ "grad_norm": 10.859375,
1137
+ "learning_rate": 9.296482412060301e-06,
1138
+ "loss": 0.1932,
1139
+ "step": 3150
1140
+ },
1141
+ {
1142
+ "epoch": 0.7426900584795322,
1143
+ "grad_norm": 5.97265625,
1144
+ "learning_rate": 9.170854271356784e-06,
1145
+ "loss": 0.2999,
1146
+ "step": 3175
1147
+ },
1148
+ {
1149
+ "epoch": 0.7485380116959064,
1150
+ "grad_norm": 4.85546875,
1151
+ "learning_rate": 9.045226130653267e-06,
1152
+ "loss": 0.1989,
1153
+ "step": 3200
1154
+ },
1155
+ {
1156
+ "epoch": 0.7485380116959064,
1157
+ "eval_loss": 0.20511174201965332,
1158
+ "eval_runtime": 127.5209,
1159
+ "eval_samples_per_second": 3.529,
1160
+ "eval_steps_per_second": 0.447,
1161
+ "step": 3200
1162
+ },
1163
+ {
1164
+ "epoch": 0.7543859649122807,
1165
+ "grad_norm": 8.7265625,
1166
+ "learning_rate": 8.919597989949749e-06,
1167
+ "loss": 0.1795,
1168
+ "step": 3225
1169
+ },
1170
+ {
1171
+ "epoch": 0.7602339181286549,
1172
+ "grad_norm": 3.490234375,
1173
+ "learning_rate": 8.793969849246232e-06,
1174
+ "loss": 0.2444,
1175
+ "step": 3250
1176
+ },
1177
+ {
1178
+ "epoch": 0.7660818713450293,
1179
+ "grad_norm": 6.671875,
1180
+ "learning_rate": 8.668341708542713e-06,
1181
+ "loss": 0.2775,
1182
+ "step": 3275
1183
+ },
1184
+ {
1185
+ "epoch": 0.7719298245614035,
1186
+ "grad_norm": 5.1796875,
1187
+ "learning_rate": 8.542713567839196e-06,
1188
+ "loss": 0.1761,
1189
+ "step": 3300
1190
+ },
1191
+ {
1192
+ "epoch": 0.7719298245614035,
1193
+ "eval_loss": 0.20133711397647858,
1194
+ "eval_runtime": 129.2927,
1195
+ "eval_samples_per_second": 3.48,
1196
+ "eval_steps_per_second": 0.441,
1197
+ "step": 3300
1198
+ },
1199
+ {
1200
+ "epoch": 0.7777777777777778,
1201
+ "grad_norm": 0.83935546875,
1202
+ "learning_rate": 8.41708542713568e-06,
1203
+ "loss": 0.216,
1204
+ "step": 3325
1205
+ },
1206
+ {
1207
+ "epoch": 0.783625730994152,
1208
+ "grad_norm": 2.580078125,
1209
+ "learning_rate": 8.291457286432161e-06,
1210
+ "loss": 0.0873,
1211
+ "step": 3350
1212
+ },
1213
+ {
1214
+ "epoch": 0.7894736842105263,
1215
+ "grad_norm": 5.87109375,
1216
+ "learning_rate": 8.165829145728644e-06,
1217
+ "loss": 0.1145,
1218
+ "step": 3375
1219
+ },
1220
+ {
1221
+ "epoch": 0.7953216374269005,
1222
+ "grad_norm": 7.125,
1223
+ "learning_rate": 8.040201005025125e-06,
1224
+ "loss": 0.2223,
1225
+ "step": 3400
1226
+ },
1227
+ {
1228
+ "epoch": 0.7953216374269005,
1229
+ "eval_loss": 0.19959186017513275,
1230
+ "eval_runtime": 125.4382,
1231
+ "eval_samples_per_second": 3.587,
1232
+ "eval_steps_per_second": 0.454,
1233
+ "step": 3400
1234
+ },
1235
+ {
1236
+ "epoch": 0.8011695906432749,
1237
+ "grad_norm": 3.94921875,
1238
+ "learning_rate": 7.914572864321608e-06,
1239
+ "loss": 0.1845,
1240
+ "step": 3425
1241
+ },
1242
+ {
1243
+ "epoch": 0.8070175438596491,
1244
+ "grad_norm": 5.99609375,
1245
+ "learning_rate": 7.788944723618092e-06,
1246
+ "loss": 0.104,
1247
+ "step": 3450
1248
+ },
1249
+ {
1250
+ "epoch": 0.8128654970760234,
1251
+ "grad_norm": 7.953125,
1252
+ "learning_rate": 7.663316582914573e-06,
1253
+ "loss": 0.1119,
1254
+ "step": 3475
1255
+ },
1256
+ {
1257
+ "epoch": 0.8187134502923976,
1258
+ "grad_norm": 7.5859375,
1259
+ "learning_rate": 7.537688442211055e-06,
1260
+ "loss": 0.2127,
1261
+ "step": 3500
1262
+ },
1263
+ {
1264
+ "epoch": 0.8187134502923976,
1265
+ "eval_loss": 0.1966124027967453,
1266
+ "eval_runtime": 126.749,
1267
+ "eval_samples_per_second": 3.55,
1268
+ "eval_steps_per_second": 0.45,
1269
+ "step": 3500
1270
+ },
1271
+ {
1272
+ "epoch": 0.8245614035087719,
1273
+ "grad_norm": 25.390625,
1274
+ "learning_rate": 7.412060301507538e-06,
1275
+ "loss": 0.1597,
1276
+ "step": 3525
1277
+ },
1278
+ {
1279
+ "epoch": 0.8304093567251462,
1280
+ "grad_norm": 7.55078125,
1281
+ "learning_rate": 7.28643216080402e-06,
1282
+ "loss": 0.0942,
1283
+ "step": 3550
1284
+ },
1285
+ {
1286
+ "epoch": 0.8362573099415205,
1287
+ "grad_norm": 6.5390625,
1288
+ "learning_rate": 7.160804020100504e-06,
1289
+ "loss": 0.236,
1290
+ "step": 3575
1291
+ },
1292
+ {
1293
+ "epoch": 0.8421052631578947,
1294
+ "grad_norm": 9.390625,
1295
+ "learning_rate": 7.035175879396985e-06,
1296
+ "loss": 0.2477,
1297
+ "step": 3600
1298
+ },
1299
+ {
1300
+ "epoch": 0.8421052631578947,
1301
+ "eval_loss": 0.1922728419303894,
1302
+ "eval_runtime": 125.5954,
1303
+ "eval_samples_per_second": 3.583,
1304
+ "eval_steps_per_second": 0.454,
1305
+ "step": 3600
1306
+ },
1307
+ {
1308
+ "epoch": 0.847953216374269,
1309
+ "grad_norm": 0.7783203125,
1310
+ "learning_rate": 6.909547738693467e-06,
1311
+ "loss": 0.3097,
1312
+ "step": 3625
1313
+ },
1314
+ {
1315
+ "epoch": 0.8538011695906432,
1316
+ "grad_norm": 5.91796875,
1317
+ "learning_rate": 6.7839195979899505e-06,
1318
+ "loss": 0.2097,
1319
+ "step": 3650
1320
+ },
1321
+ {
1322
+ "epoch": 0.8596491228070176,
1323
+ "grad_norm": 2.55078125,
1324
+ "learning_rate": 6.658291457286432e-06,
1325
+ "loss": 0.1837,
1326
+ "step": 3675
1327
+ },
1328
+ {
1329
+ "epoch": 0.8654970760233918,
1330
+ "grad_norm": 5.1015625,
1331
+ "learning_rate": 6.532663316582915e-06,
1332
+ "loss": 0.1931,
1333
+ "step": 3700
1334
+ },
1335
+ {
1336
+ "epoch": 0.8654970760233918,
1337
+ "eval_loss": 0.1908126324415207,
1338
+ "eval_runtime": 125.264,
1339
+ "eval_samples_per_second": 3.592,
1340
+ "eval_steps_per_second": 0.455,
1341
+ "step": 3700
1342
+ },
1343
+ {
1344
+ "epoch": 0.8713450292397661,
1345
+ "grad_norm": 1.0830078125,
1346
+ "learning_rate": 6.407035175879397e-06,
1347
+ "loss": 0.1688,
1348
+ "step": 3725
1349
+ },
1350
+ {
1351
+ "epoch": 0.8771929824561403,
1352
+ "grad_norm": 2.021484375,
1353
+ "learning_rate": 6.2814070351758795e-06,
1354
+ "loss": 0.1635,
1355
+ "step": 3750
1356
+ },
1357
+ {
1358
+ "epoch": 0.8830409356725146,
1359
+ "grad_norm": 24.90625,
1360
+ "learning_rate": 6.155778894472363e-06,
1361
+ "loss": 0.2588,
1362
+ "step": 3775
1363
+ },
1364
+ {
1365
+ "epoch": 0.8888888888888888,
1366
+ "grad_norm": 4.9453125,
1367
+ "learning_rate": 6.030150753768844e-06,
1368
+ "loss": 0.182,
1369
+ "step": 3800
1370
+ },
1371
+ {
1372
+ "epoch": 0.8888888888888888,
1373
+ "eval_loss": 0.18878790736198425,
1374
+ "eval_runtime": 126.7176,
1375
+ "eval_samples_per_second": 3.551,
1376
+ "eval_steps_per_second": 0.45,
1377
+ "step": 3800
1378
+ },
1379
+ {
1380
+ "epoch": 0.8947368421052632,
1381
+ "grad_norm": 7.25,
1382
+ "learning_rate": 5.904522613065327e-06,
1383
+ "loss": 0.2793,
1384
+ "step": 3825
1385
+ },
1386
+ {
1387
+ "epoch": 0.9005847953216374,
1388
+ "grad_norm": 9.59375,
1389
+ "learning_rate": 5.778894472361809e-06,
1390
+ "loss": 0.1608,
1391
+ "step": 3850
1392
+ },
1393
+ {
1394
+ "epoch": 0.9064327485380117,
1395
+ "grad_norm": 2.943359375,
1396
+ "learning_rate": 5.653266331658292e-06,
1397
+ "loss": 0.2136,
1398
+ "step": 3875
1399
+ },
1400
+ {
1401
+ "epoch": 0.9122807017543859,
1402
+ "grad_norm": 0.48681640625,
1403
+ "learning_rate": 5.527638190954775e-06,
1404
+ "loss": 0.1693,
1405
+ "step": 3900
1406
+ },
1407
+ {
1408
+ "epoch": 0.9122807017543859,
1409
+ "eval_loss": 0.18779730796813965,
1410
+ "eval_runtime": 126.6799,
1411
+ "eval_samples_per_second": 3.552,
1412
+ "eval_steps_per_second": 0.45,
1413
+ "step": 3900
1414
+ },
1415
+ {
1416
+ "epoch": 0.9181286549707602,
1417
+ "grad_norm": 3.2109375,
1418
+ "learning_rate": 5.402010050251256e-06,
1419
+ "loss": 0.0918,
1420
+ "step": 3925
1421
+ },
1422
+ {
1423
+ "epoch": 0.9239766081871345,
1424
+ "grad_norm": 1.541015625,
1425
+ "learning_rate": 5.276381909547739e-06,
1426
+ "loss": 0.2076,
1427
+ "step": 3950
1428
+ },
1429
+ {
1430
+ "epoch": 0.9298245614035088,
1431
+ "grad_norm": 4.7421875,
1432
+ "learning_rate": 5.1507537688442215e-06,
1433
+ "loss": 0.2429,
1434
+ "step": 3975
1435
+ },
1436
+ {
1437
+ "epoch": 0.935672514619883,
1438
+ "grad_norm": 4.375,
1439
+ "learning_rate": 5.025125628140704e-06,
1440
+ "loss": 0.1346,
1441
+ "step": 4000
1442
+ },
1443
+ {
1444
+ "epoch": 0.935672514619883,
1445
+ "eval_loss": 0.18527910113334656,
1446
+ "eval_runtime": 129.3266,
1447
+ "eval_samples_per_second": 3.48,
1448
+ "eval_steps_per_second": 0.441,
1449
+ "step": 4000
1450
+ },
1451
+ {
1452
+ "epoch": 0.9415204678362573,
1453
+ "grad_norm": 4.875,
1454
+ "learning_rate": 4.899497487437186e-06,
1455
+ "loss": 0.2457,
1456
+ "step": 4025
1457
+ },
1458
+ {
1459
+ "epoch": 0.9473684210526315,
1460
+ "grad_norm": 3.013671875,
1461
+ "learning_rate": 4.773869346733668e-06,
1462
+ "loss": 0.187,
1463
+ "step": 4050
1464
+ },
1465
+ {
1466
+ "epoch": 0.9532163742690059,
1467
+ "grad_norm": 4.3125,
1468
+ "learning_rate": 4.6482412060301506e-06,
1469
+ "loss": 0.1546,
1470
+ "step": 4075
1471
+ },
1472
+ {
1473
+ "epoch": 0.9590643274853801,
1474
+ "grad_norm": 2.5,
1475
+ "learning_rate": 4.522613065326634e-06,
1476
+ "loss": 0.1484,
1477
+ "step": 4100
1478
+ },
1479
+ {
1480
+ "epoch": 0.9590643274853801,
1481
+ "eval_loss": 0.18491099774837494,
1482
+ "eval_runtime": 126.5822,
1483
+ "eval_samples_per_second": 3.555,
1484
+ "eval_steps_per_second": 0.45,
1485
+ "step": 4100
1486
+ },
1487
+ {
1488
+ "epoch": 0.9649122807017544,
1489
+ "grad_norm": 5.03515625,
1490
+ "learning_rate": 4.396984924623116e-06,
1491
+ "loss": 0.1739,
1492
+ "step": 4125
1493
+ },
1494
+ {
1495
+ "epoch": 0.9707602339181286,
1496
+ "grad_norm": 7.1015625,
1497
+ "learning_rate": 4.271356783919598e-06,
1498
+ "loss": 0.3268,
1499
+ "step": 4150
1500
+ },
1501
+ {
1502
+ "epoch": 0.9766081871345029,
1503
+ "grad_norm": 0.397216796875,
1504
+ "learning_rate": 4.1457286432160804e-06,
1505
+ "loss": 0.2178,
1506
+ "step": 4175
1507
+ },
1508
+ {
1509
+ "epoch": 0.9824561403508771,
1510
+ "grad_norm": 5.00390625,
1511
+ "learning_rate": 4.020100502512563e-06,
1512
+ "loss": 0.1217,
1513
+ "step": 4200
1514
+ },
1515
+ {
1516
+ "epoch": 0.9824561403508771,
1517
+ "eval_loss": 0.1838068664073944,
1518
+ "eval_runtime": 127.461,
1519
+ "eval_samples_per_second": 3.53,
1520
+ "eval_steps_per_second": 0.447,
1521
+ "step": 4200
1522
+ },
1523
+ {
1524
+ "epoch": 0.9883040935672515,
1525
+ "grad_norm": 0.1112060546875,
1526
+ "learning_rate": 3.894472361809046e-06,
1527
+ "loss": 0.0894,
1528
+ "step": 4225
1529
+ },
1530
+ {
1531
+ "epoch": 0.9941520467836257,
1532
+ "grad_norm": 1.748046875,
1533
+ "learning_rate": 3.7688442211055276e-06,
1534
+ "loss": 0.1733,
1535
+ "step": 4250
1536
+ },
1537
+ {
1538
+ "epoch": 1.0,
1539
+ "grad_norm": 6.53125,
1540
+ "learning_rate": 3.64321608040201e-06,
1541
+ "loss": 0.1281,
1542
+ "step": 4275
1543
+ },
1544
+ {
1545
+ "epoch": 1.0058479532163742,
1546
+ "grad_norm": 0.9111328125,
1547
+ "learning_rate": 3.5175879396984926e-06,
1548
+ "loss": 0.0669,
1549
+ "step": 4300
1550
+ },
1551
+ {
1552
+ "epoch": 1.0058479532163742,
1553
+ "eval_loss": 0.18437370657920837,
1554
+ "eval_runtime": 128.6228,
1555
+ "eval_samples_per_second": 3.499,
1556
+ "eval_steps_per_second": 0.443,
1557
+ "step": 4300
1558
+ },
1559
+ {
1560
+ "epoch": 1.0116959064327484,
1561
+ "grad_norm": 0.486328125,
1562
+ "learning_rate": 3.3919597989949752e-06,
1563
+ "loss": 0.0748,
1564
+ "step": 4325
1565
+ },
1566
+ {
1567
+ "epoch": 1.0175438596491229,
1568
+ "grad_norm": 7.703125,
1569
+ "learning_rate": 3.2663316582914575e-06,
1570
+ "loss": 0.071,
1571
+ "step": 4350
1572
+ },
1573
+ {
1574
+ "epoch": 1.023391812865497,
1575
+ "grad_norm": 8.9765625,
1576
+ "learning_rate": 3.1407035175879398e-06,
1577
+ "loss": 0.1588,
1578
+ "step": 4375
1579
+ },
1580
+ {
1581
+ "epoch": 1.0292397660818713,
1582
+ "grad_norm": 0.38427734375,
1583
+ "learning_rate": 3.015075376884422e-06,
1584
+ "loss": 0.1292,
1585
+ "step": 4400
1586
+ },
1587
+ {
1588
+ "epoch": 1.0292397660818713,
1589
+ "eval_loss": 0.18767422437667847,
1590
+ "eval_runtime": 126.9252,
1591
+ "eval_samples_per_second": 3.545,
1592
+ "eval_steps_per_second": 0.449,
1593
+ "step": 4400
1594
+ },
1595
+ {
1596
+ "epoch": 1.0350877192982457,
1597
+ "grad_norm": 6.4375,
1598
+ "learning_rate": 2.8894472361809047e-06,
1599
+ "loss": 0.1419,
1600
+ "step": 4425
1601
+ },
1602
+ {
1603
+ "epoch": 1.04093567251462,
1604
+ "grad_norm": 13.3515625,
1605
+ "learning_rate": 2.7638190954773874e-06,
1606
+ "loss": 0.084,
1607
+ "step": 4450
1608
+ },
1609
+ {
1610
+ "epoch": 1.0467836257309941,
1611
+ "grad_norm": 9.203125,
1612
+ "learning_rate": 2.6381909547738696e-06,
1613
+ "loss": 0.072,
1614
+ "step": 4475
1615
+ },
1616
+ {
1617
+ "epoch": 1.0526315789473684,
1618
+ "grad_norm": 2.439453125,
1619
+ "learning_rate": 2.512562814070352e-06,
1620
+ "loss": 0.1106,
1621
+ "step": 4500
1622
+ },
1623
+ {
1624
+ "epoch": 1.0526315789473684,
1625
+ "eval_loss": 0.18756501376628876,
1626
+ "eval_runtime": 128.4204,
1627
+ "eval_samples_per_second": 3.504,
1628
+ "eval_steps_per_second": 0.444,
1629
+ "step": 4500
1630
+ },
1631
+ {
1632
+ "epoch": 1.0584795321637426,
1633
+ "grad_norm": 4.671875,
1634
+ "learning_rate": 2.386934673366834e-06,
1635
+ "loss": 0.1226,
1636
+ "step": 4525
1637
+ },
1638
+ {
1639
+ "epoch": 1.064327485380117,
1640
+ "grad_norm": 2.62890625,
1641
+ "learning_rate": 2.261306532663317e-06,
1642
+ "loss": 0.123,
1643
+ "step": 4550
1644
+ },
1645
+ {
1646
+ "epoch": 1.0701754385964912,
1647
+ "grad_norm": 16.640625,
1648
+ "learning_rate": 2.135678391959799e-06,
1649
+ "loss": 0.0593,
1650
+ "step": 4575
1651
+ },
1652
+ {
1653
+ "epoch": 1.0760233918128654,
1654
+ "grad_norm": 6.8828125,
1655
+ "learning_rate": 2.0100502512562813e-06,
1656
+ "loss": 0.0828,
1657
+ "step": 4600
1658
+ },
1659
+ {
1660
+ "epoch": 1.0760233918128654,
1661
+ "eval_loss": 0.1875353455543518,
1662
+ "eval_runtime": 126.5483,
1663
+ "eval_samples_per_second": 3.556,
1664
+ "eval_steps_per_second": 0.45,
1665
+ "step": 4600
1666
+ },
1667
+ {
1668
+ "epoch": 1.0818713450292399,
1669
+ "grad_norm": 4.55078125,
1670
+ "learning_rate": 1.8844221105527638e-06,
1671
+ "loss": 0.1481,
1672
+ "step": 4625
1673
+ },
1674
+ {
1675
+ "epoch": 1.087719298245614,
1676
+ "grad_norm": 3.6796875,
1677
+ "learning_rate": 1.7587939698492463e-06,
1678
+ "loss": 0.1275,
1679
+ "step": 4650
1680
+ },
1681
+ {
1682
+ "epoch": 1.0935672514619883,
1683
+ "grad_norm": 6.02734375,
1684
+ "learning_rate": 1.6331658291457288e-06,
1685
+ "loss": 0.1274,
1686
+ "step": 4675
1687
+ },
1688
+ {
1689
+ "epoch": 1.0994152046783625,
1690
+ "grad_norm": 1.4111328125,
1691
+ "learning_rate": 1.507537688442211e-06,
1692
+ "loss": 0.0485,
1693
+ "step": 4700
1694
+ },
1695
+ {
1696
+ "epoch": 1.0994152046783625,
1697
+ "eval_loss": 0.18709704279899597,
1698
+ "eval_runtime": 129.1721,
1699
+ "eval_samples_per_second": 3.484,
1700
+ "eval_steps_per_second": 0.441,
1701
+ "step": 4700
1702
+ },
1703
+ {
1704
+ "epoch": 1.1052631578947367,
1705
+ "grad_norm": 0.74853515625,
1706
+ "learning_rate": 1.3819095477386937e-06,
1707
+ "loss": 0.0483,
1708
+ "step": 4725
1709
+ },
1710
+ {
1711
+ "epoch": 1.1111111111111112,
1712
+ "grad_norm": 1.044921875,
1713
+ "learning_rate": 1.256281407035176e-06,
1714
+ "loss": 0.0799,
1715
+ "step": 4750
1716
+ },
1717
+ {
1718
+ "epoch": 1.1169590643274854,
1719
+ "grad_norm": 0.171142578125,
1720
+ "learning_rate": 1.1306532663316584e-06,
1721
+ "loss": 0.1273,
1722
+ "step": 4775
1723
+ },
1724
+ {
1725
+ "epoch": 1.1228070175438596,
1726
+ "grad_norm": 0.346923828125,
1727
+ "learning_rate": 1.0050251256281407e-06,
1728
+ "loss": 0.0624,
1729
+ "step": 4800
1730
+ },
1731
+ {
1732
+ "epoch": 1.1228070175438596,
1733
+ "eval_loss": 0.1874168962240219,
1734
+ "eval_runtime": 128.2151,
1735
+ "eval_samples_per_second": 3.51,
1736
+ "eval_steps_per_second": 0.445,
1737
+ "step": 4800
1738
+ },
1739
+ {
1740
+ "epoch": 1.128654970760234,
1741
+ "grad_norm": 0.1552734375,
1742
+ "learning_rate": 8.793969849246231e-07,
1743
+ "loss": 0.1017,
1744
+ "step": 4825
1745
+ },
1746
+ {
1747
+ "epoch": 1.1345029239766082,
1748
+ "grad_norm": 1.619140625,
1749
+ "learning_rate": 7.537688442211055e-07,
1750
+ "loss": 0.1556,
1751
+ "step": 4850
1752
+ },
1753
+ {
1754
+ "epoch": 1.1403508771929824,
1755
+ "grad_norm": 1.958984375,
1756
+ "learning_rate": 6.28140703517588e-07,
1757
+ "loss": 0.1113,
1758
+ "step": 4875
1759
+ },
1760
+ {
1761
+ "epoch": 1.1461988304093567,
1762
+ "grad_norm": 5.375,
1763
+ "learning_rate": 5.025125628140703e-07,
1764
+ "loss": 0.0895,
1765
+ "step": 4900
1766
+ },
1767
+ {
1768
+ "epoch": 1.1461988304093567,
1769
+ "eval_loss": 0.1870710253715515,
1770
+ "eval_runtime": 126.862,
1771
+ "eval_samples_per_second": 3.547,
1772
+ "eval_steps_per_second": 0.449,
1773
+ "step": 4900
1774
+ },
1775
+ {
1776
+ "epoch": 1.1520467836257309,
1777
+ "grad_norm": 5.984375,
1778
+ "learning_rate": 3.7688442211055275e-07,
1779
+ "loss": 0.1076,
1780
+ "step": 4925
1781
+ },
1782
+ {
1783
+ "epoch": 1.1578947368421053,
1784
+ "grad_norm": 1.9619140625,
1785
+ "learning_rate": 2.5125628140703517e-07,
1786
+ "loss": 0.1024,
1787
+ "step": 4950
1788
+ },
1789
+ {
1790
+ "epoch": 1.1637426900584795,
1791
+ "grad_norm": 6.17578125,
1792
+ "learning_rate": 1.2562814070351758e-07,
1793
+ "loss": 0.0698,
1794
+ "step": 4975
1795
+ },
1796
+ {
1797
+ "epoch": 1.1695906432748537,
1798
+ "grad_norm": 7.06640625,
1799
+ "learning_rate": 0.0,
1800
+ "loss": 0.1,
1801
+ "step": 5000
1802
+ },
1803
+ {
1804
+ "epoch": 1.1695906432748537,
1805
+ "eval_loss": 0.1871223896741867,
1806
+ "eval_runtime": 128.9773,
1807
+ "eval_samples_per_second": 3.489,
1808
+ "eval_steps_per_second": 0.442,
1809
+ "step": 5000
1810
+ },
1811
+ {
1812
+ "epoch": 1.1695906432748537,
1813
+ "step": 5000,
1814
+ "total_flos": 2.536648947447456e+17,
1815
+ "train_loss": 0.27304353165626527,
1816
+ "train_runtime": 21409.3382,
1817
+ "train_samples_per_second": 0.934,
1818
+ "train_steps_per_second": 0.234
1819
+ }
1820
+ ],
1821
+ "logging_steps": 25,
1822
+ "max_steps": 5000,
1823
+ "num_input_tokens_seen": 0,
1824
+ "num_train_epochs": 2,
1825
+ "save_steps": 100,
1826
+ "stateful_callbacks": {
1827
+ "TrainerControl": {
1828
+ "args": {
1829
+ "should_epoch_stop": false,
1830
+ "should_evaluate": false,
1831
+ "should_log": false,
1832
+ "should_save": true,
1833
+ "should_training_stop": true
1834
+ },
1835
+ "attributes": {}
1836
+ }
1837
+ },
1838
+ "total_flos": 2.536648947447456e+17,
1839
+ "train_batch_size": 4,
1840
+ "trial_name": null,
1841
+ "trial_params": null
1842
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a3660282e9bbcdcfad19c2bf547dad5d55220b8a1e969aab83a17e938d21e214
3
+ size 5176