Raihan004 commited on
Commit
b349a97
1 Parent(s): afe7fca

Model save

Browse files
README.md CHANGED
@@ -22,7 +22,7 @@ model-index:
22
  metrics:
23
  - name: Accuracy
24
  type: accuracy
25
- value: 0.7466666666666667
26
  ---
27
 
28
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
@@ -32,8 +32,8 @@ should probably proofread and complete it, then remove this comment. -->
32
 
33
  This model is a fine-tuned version of [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) on the imagefolder dataset.
34
  It achieves the following results on the evaluation set:
35
- - Loss: 1.1523
36
- - Accuracy: 0.7467
37
 
38
  ## Model description
39
 
@@ -53,7 +53,7 @@ More information needed
53
 
54
  The following hyperparameters were used during training:
55
  - learning_rate: 0.0001
56
- - train_batch_size: 16
57
  - eval_batch_size: 8
58
  - seed: 42
59
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
@@ -65,68 +65,37 @@ The following hyperparameters were used during training:
65
 
66
  | Training Loss | Epoch | Step | Validation Loss | Accuracy |
67
  |:-------------:|:-----:|:----:|:---------------:|:--------:|
68
- | 1.3291 | 0.16 | 100 | 1.1433 | 0.7457 |
69
- | 0.9409 | 0.32 | 200 | 0.8621 | 0.7686 |
70
- | 0.7815 | 0.48 | 300 | 0.7599 | 0.7676 |
71
- | 0.6803 | 0.64 | 400 | 0.8861 | 0.7210 |
72
- | 0.6277 | 0.8 | 500 | 0.8190 | 0.7305 |
73
- | 0.5569 | 0.96 | 600 | 0.6443 | 0.7990 |
74
- | 0.5029 | 1.11 | 700 | 0.7456 | 0.7676 |
75
- | 0.3852 | 1.27 | 800 | 0.7609 | 0.7695 |
76
- | 0.4065 | 1.43 | 900 | 0.7442 | 0.7705 |
77
- | 0.4319 | 1.59 | 1000 | 0.8240 | 0.7543 |
78
- | 0.4167 | 1.75 | 1100 | 0.8335 | 0.7571 |
79
- | 0.3778 | 1.91 | 1200 | 0.8210 | 0.7667 |
80
- | 0.3818 | 2.07 | 1300 | 0.8431 | 0.7486 |
81
- | 0.3249 | 2.23 | 1400 | 0.9156 | 0.7276 |
82
- | 0.2931 | 2.39 | 1500 | 0.8948 | 0.7371 |
83
- | 0.2808 | 2.55 | 1600 | 0.9114 | 0.7467 |
84
- | 0.2767 | 2.71 | 1700 | 0.8772 | 0.7590 |
85
- | 0.3422 | 2.87 | 1800 | 0.8072 | 0.7676 |
86
- | 0.2441 | 3.03 | 1900 | 0.9659 | 0.7381 |
87
- | 0.2438 | 3.18 | 2000 | 0.7905 | 0.7924 |
88
- | 0.3925 | 3.34 | 2100 | 0.9977 | 0.7305 |
89
- | 0.263 | 3.5 | 2200 | 0.8677 | 0.7619 |
90
- | 0.2585 | 3.66 | 2300 | 1.0279 | 0.7390 |
91
- | 0.2523 | 3.82 | 2400 | 0.8742 | 0.7829 |
92
- | 0.2322 | 3.98 | 2500 | 0.8817 | 0.7790 |
93
- | 0.1948 | 4.14 | 2600 | 0.8387 | 0.7848 |
94
- | 0.2318 | 4.3 | 2700 | 1.1542 | 0.7124 |
95
- | 0.2184 | 4.46 | 2800 | 1.1387 | 0.7152 |
96
- | 0.2484 | 4.62 | 2900 | 1.0976 | 0.7248 |
97
- | 0.1575 | 4.78 | 3000 | 1.0478 | 0.7457 |
98
- | 0.2028 | 4.94 | 3100 | 1.0374 | 0.7448 |
99
- | 0.1173 | 5.1 | 3200 | 1.0067 | 0.7610 |
100
- | 0.1313 | 5.25 | 3300 | 1.1971 | 0.7267 |
101
- | 0.2142 | 5.41 | 3400 | 1.1455 | 0.74 |
102
- | 0.1302 | 5.57 | 3500 | 1.0319 | 0.7629 |
103
- | 0.2193 | 5.73 | 3600 | 0.9746 | 0.7733 |
104
- | 0.1778 | 5.89 | 3700 | 1.0207 | 0.7552 |
105
- | 0.1003 | 6.05 | 3800 | 1.0538 | 0.7638 |
106
- | 0.1644 | 6.21 | 3900 | 1.1832 | 0.7305 |
107
- | 0.1843 | 6.37 | 4000 | 1.0814 | 0.7495 |
108
- | 0.129 | 6.53 | 4100 | 1.2479 | 0.72 |
109
- | 0.17 | 6.69 | 4200 | 1.1575 | 0.7419 |
110
- | 0.2184 | 6.85 | 4300 | 1.0946 | 0.7562 |
111
- | 0.1506 | 7.01 | 4400 | 1.0580 | 0.7714 |
112
- | 0.1099 | 7.17 | 4500 | 1.0479 | 0.7638 |
113
- | 0.1226 | 7.32 | 4600 | 1.1307 | 0.7495 |
114
- | 0.2122 | 7.48 | 4700 | 1.2838 | 0.7286 |
115
- | 0.1565 | 7.64 | 4800 | 1.2040 | 0.7390 |
116
- | 0.151 | 7.8 | 4900 | 1.2361 | 0.7429 |
117
- | 0.0934 | 7.96 | 5000 | 1.1985 | 0.7457 |
118
- | 0.1374 | 8.12 | 5100 | 1.1365 | 0.7495 |
119
- | 0.1799 | 8.28 | 5200 | 1.1371 | 0.7581 |
120
- | 0.1496 | 8.44 | 5300 | 1.1775 | 0.7429 |
121
- | 0.0804 | 8.6 | 5400 | 1.1278 | 0.7562 |
122
- | 0.12 | 8.76 | 5500 | 1.1210 | 0.7533 |
123
- | 0.099 | 8.92 | 5600 | 1.1295 | 0.7486 |
124
- | 0.1429 | 9.08 | 5700 | 1.2079 | 0.7390 |
125
- | 0.0959 | 9.24 | 5800 | 1.1451 | 0.7476 |
126
- | 0.0263 | 9.39 | 5900 | 1.1176 | 0.7514 |
127
- | 0.0936 | 9.55 | 6000 | 1.1179 | 0.7533 |
128
- | 0.1332 | 9.71 | 6100 | 1.1387 | 0.7486 |
129
- | 0.071 | 9.87 | 6200 | 1.1523 | 0.7467 |
130
 
131
 
132
  ### Framework versions
 
22
  metrics:
23
  - name: Accuracy
24
  type: accuracy
25
+ value: 0.7666666666666667
26
  ---
27
 
28
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 
32
 
33
  This model is a fine-tuned version of [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) on the imagefolder dataset.
34
  It achieves the following results on the evaluation set:
35
+ - Loss: 1.0234
36
+ - Accuracy: 0.7667
37
 
38
  ## Model description
39
 
 
53
 
54
  The following hyperparameters were used during training:
55
  - learning_rate: 0.0001
56
+ - train_batch_size: 32
57
  - eval_batch_size: 8
58
  - seed: 42
59
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
 
65
 
66
  | Training Loss | Epoch | Step | Validation Loss | Accuracy |
67
  |:-------------:|:-----:|:----:|:---------------:|:--------:|
68
+ | 0.4299 | 0.32 | 100 | 0.7981 | 0.7457 |
69
+ | 0.3903 | 0.64 | 200 | 0.7173 | 0.7771 |
70
+ | 0.4296 | 0.96 | 300 | 0.6869 | 0.7876 |
71
+ | 0.3589 | 1.27 | 400 | 0.9108 | 0.7314 |
72
+ | 0.3007 | 1.59 | 500 | 0.9720 | 0.7133 |
73
+ | 0.2817 | 1.91 | 600 | 0.8504 | 0.7486 |
74
+ | 0.2754 | 2.23 | 700 | 0.9009 | 0.7410 |
75
+ | 0.2226 | 2.55 | 800 | 0.9020 | 0.7495 |
76
+ | 0.285 | 2.87 | 900 | 1.0012 | 0.7295 |
77
+ | 0.2307 | 3.18 | 1000 | 0.8204 | 0.7810 |
78
+ | 0.2398 | 3.5 | 1100 | 0.8857 | 0.7695 |
79
+ | 0.1948 | 3.82 | 1200 | 0.9110 | 0.7571 |
80
+ | 0.1962 | 4.14 | 1300 | 0.9775 | 0.7533 |
81
+ | 0.2159 | 4.46 | 1400 | 0.9719 | 0.7457 |
82
+ | 0.1361 | 4.78 | 1500 | 0.9262 | 0.7571 |
83
+ | 0.1898 | 5.1 | 1600 | 0.9130 | 0.7705 |
84
+ | 0.1153 | 5.41 | 1700 | 1.0409 | 0.7438 |
85
+ | 0.1489 | 5.73 | 1800 | 1.0176 | 0.7495 |
86
+ | 0.1515 | 6.05 | 1900 | 1.0507 | 0.7486 |
87
+ | 0.1126 | 6.37 | 2000 | 1.1423 | 0.7210 |
88
+ | 0.1319 | 6.69 | 2100 | 1.1008 | 0.7467 |
89
+ | 0.1424 | 7.01 | 2200 | 1.0798 | 0.7419 |
90
+ | 0.0955 | 7.32 | 2300 | 1.0767 | 0.7505 |
91
+ | 0.1077 | 7.64 | 2400 | 1.0920 | 0.7457 |
92
+ | 0.1048 | 7.96 | 2500 | 1.0040 | 0.7733 |
93
+ | 0.0965 | 8.28 | 2600 | 1.0384 | 0.7610 |
94
+ | 0.0995 | 8.6 | 2700 | 1.0423 | 0.7648 |
95
+ | 0.1213 | 8.92 | 2800 | 1.0544 | 0.7619 |
96
+ | 0.0863 | 9.24 | 2900 | 1.0454 | 0.7629 |
97
+ | 0.0926 | 9.55 | 3000 | 1.0380 | 0.7676 |
98
+ | 0.0536 | 9.87 | 3100 | 1.0234 | 0.7667 |
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
 
100
 
101
  ### Framework versions
all_results.json CHANGED
@@ -1,13 +1,8 @@
1
  {
2
- "epoch": 2.0,
3
- "eval_accuracy": 0.7847619047619048,
4
- "eval_loss": 0.7120087742805481,
5
- "eval_runtime": 16.7402,
6
- "eval_samples_per_second": 62.723,
7
- "eval_steps_per_second": 7.885,
8
- "total_flos": 1.555375746295849e+18,
9
- "train_loss": 0.6562361546382782,
10
- "train_runtime": 775.5335,
11
- "train_samples_per_second": 25.879,
12
- "train_steps_per_second": 1.62
13
  }
 
1
  {
2
+ "epoch": 10.0,
3
+ "total_flos": 7.783078535151575e+18,
4
+ "train_loss": 0.28272074579623097,
5
+ "train_runtime": 3631.8012,
6
+ "train_samples_per_second": 27.631,
7
+ "train_steps_per_second": 1.729
 
 
 
 
 
8
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:62faa2e5b8a3a31e455fcc990a986f000a21efc3e9603f74d88f7558ded49f80
3
  size 343248584
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e9b654bdb5f5622a0b159d7dd36070d3ac5fe4a3b9b297c9406f7e37b8a92893
3
  size 343248584
runs/Apr17_15-48-28_43f826248acb/events.out.tfevents.1713373470.43f826248acb.34.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2b75b9b6d46f625139c33fdb18bad86f4df7331f711209e17a73c220ea5845fa
3
+ size 5696
runs/Apr17_17-04-50_43f826248acb/events.out.tfevents.1713373508.43f826248acb.34.2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cd2b48f5f0a5fe9bc3e06c3d074235d960f8566b1bc0aa2d6817b9f54a30ec69
3
+ size 49160
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 2.0,
3
- "total_flos": 1.555375746295849e+18,
4
- "train_loss": 0.6562361546382782,
5
- "train_runtime": 775.5335,
6
- "train_samples_per_second": 25.879,
7
- "train_steps_per_second": 1.62
8
  }
 
1
  {
2
+ "epoch": 10.0,
3
+ "total_flos": 7.783078535151575e+18,
4
+ "train_loss": 0.28272074579623097,
5
+ "train_runtime": 3631.8012,
6
+ "train_samples_per_second": 27.631,
7
+ "train_steps_per_second": 1.729
8
  }
trainer_state.json CHANGED
@@ -1,571 +1,2785 @@
1
  {
2
- "best_metric": 0.7120087742805481,
3
  "best_model_checkpoint": "Action_model/checkpoint-600",
4
- "epoch": 2.0,
5
  "eval_steps": 100,
6
- "global_step": 1256,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.03,
13
- "grad_norm": 2.222489356994629,
14
- "learning_rate": 9.840764331210192e-05,
15
- "loss": 2.2257,
16
  "step": 20
17
  },
18
  {
19
  "epoch": 0.06,
20
- "grad_norm": 2.293585777282715,
21
- "learning_rate": 9.681528662420382e-05,
22
- "loss": 2.0112,
23
  "step": 40
24
  },
25
  {
26
  "epoch": 0.1,
27
- "grad_norm": 2.167264223098755,
28
- "learning_rate": 9.522292993630574e-05,
29
- "loss": 1.8158,
30
  "step": 60
31
  },
32
  {
33
  "epoch": 0.13,
34
- "grad_norm": 3.259340286254883,
35
- "learning_rate": 9.363057324840766e-05,
36
- "loss": 1.5504,
37
  "step": 80
38
  },
39
  {
40
  "epoch": 0.16,
41
- "grad_norm": 2.540658473968506,
42
- "learning_rate": 9.203821656050956e-05,
43
- "loss": 1.3489,
44
  "step": 100
45
  },
46
  {
47
  "epoch": 0.16,
48
- "eval_accuracy": 0.7,
49
- "eval_loss": 1.2611500024795532,
50
- "eval_runtime": 22.4862,
51
- "eval_samples_per_second": 46.695,
52
- "eval_steps_per_second": 5.87,
53
  "step": 100
54
  },
55
  {
56
  "epoch": 0.19,
57
- "grad_norm": 2.064728021621704,
58
- "learning_rate": 9.044585987261147e-05,
59
- "loss": 1.2181,
60
  "step": 120
61
  },
62
  {
63
  "epoch": 0.22,
64
- "grad_norm": 4.9456000328063965,
65
- "learning_rate": 8.885350318471338e-05,
66
- "loss": 1.1517,
67
  "step": 140
68
  },
69
  {
70
  "epoch": 0.25,
71
- "grad_norm": 3.7164435386657715,
72
- "learning_rate": 8.73407643312102e-05,
73
- "loss": 1.0429,
74
  "step": 160
75
  },
76
  {
77
  "epoch": 0.29,
78
- "grad_norm": 3.3535468578338623,
79
- "learning_rate": 8.57484076433121e-05,
80
- "loss": 0.9935,
81
  "step": 180
82
  },
83
  {
84
  "epoch": 0.32,
85
- "grad_norm": 5.574573040008545,
86
- "learning_rate": 8.415605095541401e-05,
87
- "loss": 1.0112,
88
  "step": 200
89
  },
90
  {
91
  "epoch": 0.32,
92
- "eval_accuracy": 0.7590476190476191,
93
- "eval_loss": 0.9050103425979614,
94
- "eval_runtime": 14.9955,
95
- "eval_samples_per_second": 70.021,
96
- "eval_steps_per_second": 8.803,
97
  "step": 200
98
  },
99
  {
100
  "epoch": 0.35,
101
- "grad_norm": 2.9566049575805664,
102
- "learning_rate": 8.256369426751593e-05,
103
- "loss": 0.9295,
104
  "step": 220
105
  },
106
  {
107
  "epoch": 0.38,
108
- "grad_norm": 2.9411683082580566,
109
- "learning_rate": 8.097133757961783e-05,
110
- "loss": 0.895,
111
  "step": 240
112
  },
113
  {
114
  "epoch": 0.41,
115
- "grad_norm": 4.049664497375488,
116
- "learning_rate": 7.937898089171975e-05,
117
- "loss": 0.8278,
118
  "step": 260
119
  },
120
  {
121
  "epoch": 0.45,
122
- "grad_norm": 4.934290409088135,
123
- "learning_rate": 7.778662420382165e-05,
124
- "loss": 0.7667,
125
  "step": 280
126
  },
127
  {
128
  "epoch": 0.48,
129
- "grad_norm": 5.120416164398193,
130
- "learning_rate": 7.619426751592357e-05,
131
- "loss": 0.7962,
132
  "step": 300
133
  },
134
  {
135
  "epoch": 0.48,
136
- "eval_accuracy": 0.7504761904761905,
137
- "eval_loss": 0.852245032787323,
138
- "eval_runtime": 14.85,
139
- "eval_samples_per_second": 70.707,
140
- "eval_steps_per_second": 8.889,
141
  "step": 300
142
  },
143
  {
144
  "epoch": 0.51,
145
- "grad_norm": 2.324152708053589,
146
- "learning_rate": 7.460191082802548e-05,
147
- "loss": 0.8446,
148
  "step": 320
149
  },
150
  {
151
  "epoch": 0.54,
152
- "grad_norm": 6.521075248718262,
153
- "learning_rate": 7.300955414012739e-05,
154
- "loss": 0.6878,
155
  "step": 340
156
  },
157
  {
158
  "epoch": 0.57,
159
- "grad_norm": 4.726436614990234,
160
- "learning_rate": 7.14171974522293e-05,
161
- "loss": 0.7465,
162
  "step": 360
163
  },
164
  {
165
  "epoch": 0.61,
166
- "grad_norm": 3.800800085067749,
167
- "learning_rate": 6.982484076433122e-05,
168
- "loss": 0.706,
169
  "step": 380
170
  },
171
  {
172
  "epoch": 0.64,
173
- "grad_norm": 5.507264614105225,
174
- "learning_rate": 6.823248407643312e-05,
175
- "loss": 0.6383,
176
  "step": 400
177
  },
178
  {
179
  "epoch": 0.64,
180
- "eval_accuracy": 0.7219047619047619,
181
- "eval_loss": 0.8676416277885437,
182
- "eval_runtime": 14.6076,
183
- "eval_samples_per_second": 71.881,
184
- "eval_steps_per_second": 9.036,
185
  "step": 400
186
  },
187
  {
188
  "epoch": 0.67,
189
- "grad_norm": 6.359522342681885,
190
- "learning_rate": 6.664012738853504e-05,
191
- "loss": 0.6658,
192
  "step": 420
193
  },
194
  {
195
  "epoch": 0.7,
196
- "grad_norm": 3.9945261478424072,
197
- "learning_rate": 6.504777070063695e-05,
198
- "loss": 0.6106,
199
  "step": 440
200
  },
201
  {
202
  "epoch": 0.73,
203
- "grad_norm": 2.555899143218994,
204
- "learning_rate": 6.345541401273885e-05,
205
- "loss": 0.7034,
206
  "step": 460
207
  },
208
  {
209
  "epoch": 0.76,
210
- "grad_norm": 2.68978214263916,
211
- "learning_rate": 6.186305732484077e-05,
212
- "loss": 0.4986,
213
  "step": 480
214
  },
215
  {
216
  "epoch": 0.8,
217
- "grad_norm": 4.49608039855957,
218
- "learning_rate": 6.027070063694268e-05,
219
- "loss": 0.6485,
220
  "step": 500
221
  },
222
  {
223
  "epoch": 0.8,
224
- "eval_accuracy": 0.7323809523809524,
225
- "eval_loss": 0.8052415251731873,
226
- "eval_runtime": 14.5568,
227
- "eval_samples_per_second": 72.131,
228
- "eval_steps_per_second": 9.068,
229
  "step": 500
230
  },
231
  {
232
  "epoch": 0.83,
233
- "grad_norm": 5.239855766296387,
234
- "learning_rate": 5.867834394904459e-05,
235
- "loss": 0.6176,
236
  "step": 520
237
  },
238
  {
239
  "epoch": 0.86,
240
- "grad_norm": 2.8663668632507324,
241
- "learning_rate": 5.70859872611465e-05,
242
- "loss": 0.5519,
243
  "step": 540
244
  },
245
  {
246
  "epoch": 0.89,
247
- "grad_norm": 2.615525245666504,
248
- "learning_rate": 5.5493630573248414e-05,
249
- "loss": 0.6374,
250
  "step": 560
251
  },
252
  {
253
  "epoch": 0.92,
254
- "grad_norm": 3.312385082244873,
255
- "learning_rate": 5.3901273885350324e-05,
256
- "loss": 0.5816,
257
  "step": 580
258
  },
259
  {
260
  "epoch": 0.96,
261
- "grad_norm": 4.399689197540283,
262
- "learning_rate": 5.230891719745223e-05,
263
- "loss": 0.5452,
264
  "step": 600
265
  },
266
  {
267
  "epoch": 0.96,
268
- "eval_accuracy": 0.7847619047619048,
269
- "eval_loss": 0.7120087742805481,
270
- "eval_runtime": 14.577,
271
- "eval_samples_per_second": 72.031,
272
  "eval_steps_per_second": 9.055,
273
  "step": 600
274
  },
275
  {
276
  "epoch": 0.99,
277
- "grad_norm": 3.4874184131622314,
278
- "learning_rate": 5.071656050955414e-05,
279
- "loss": 0.5328,
280
  "step": 620
281
  },
282
  {
283
  "epoch": 1.02,
284
- "grad_norm": 5.2181396484375,
285
- "learning_rate": 4.912420382165605e-05,
286
- "loss": 0.5078,
287
  "step": 640
288
  },
289
  {
290
  "epoch": 1.05,
291
- "grad_norm": 2.219102621078491,
292
- "learning_rate": 4.753184713375796e-05,
293
- "loss": 0.4969,
294
  "step": 660
295
  },
296
  {
297
  "epoch": 1.08,
298
- "grad_norm": 4.785001754760742,
299
- "learning_rate": 4.593949044585987e-05,
300
- "loss": 0.5407,
301
  "step": 680
302
  },
303
  {
304
  "epoch": 1.11,
305
- "grad_norm": 7.441385269165039,
306
- "learning_rate": 4.4347133757961786e-05,
307
- "loss": 0.4882,
308
  "step": 700
309
  },
310
  {
311
  "epoch": 1.11,
312
- "eval_accuracy": 0.7714285714285715,
313
- "eval_loss": 0.7478358745574951,
314
- "eval_runtime": 14.7271,
315
- "eval_samples_per_second": 71.297,
316
- "eval_steps_per_second": 8.963,
317
  "step": 700
318
  },
319
  {
320
  "epoch": 1.15,
321
- "grad_norm": 3.0530927181243896,
322
- "learning_rate": 4.2754777070063695e-05,
323
- "loss": 0.423,
324
  "step": 720
325
  },
326
  {
327
  "epoch": 1.18,
328
- "grad_norm": 3.1082653999328613,
329
- "learning_rate": 4.1162420382165605e-05,
330
- "loss": 0.505,
331
  "step": 740
332
  },
333
  {
334
  "epoch": 1.21,
335
- "grad_norm": 5.5019683837890625,
336
- "learning_rate": 3.957006369426752e-05,
337
- "loss": 0.4445,
338
  "step": 760
339
  },
340
  {
341
  "epoch": 1.24,
342
- "grad_norm": 3.35685658454895,
343
- "learning_rate": 3.797770700636943e-05,
344
- "loss": 0.4795,
345
  "step": 780
346
  },
347
  {
348
  "epoch": 1.27,
349
- "grad_norm": 0.8577423691749573,
350
- "learning_rate": 3.638535031847134e-05,
351
- "loss": 0.3409,
352
  "step": 800
353
  },
354
  {
355
  "epoch": 1.27,
356
- "eval_accuracy": 0.7742857142857142,
357
- "eval_loss": 0.7310556769371033,
358
- "eval_runtime": 14.6273,
359
- "eval_samples_per_second": 71.784,
360
- "eval_steps_per_second": 9.024,
361
  "step": 800
362
  },
363
  {
364
  "epoch": 1.31,
365
- "grad_norm": 2.747500419616699,
366
- "learning_rate": 3.479299363057325e-05,
367
- "loss": 0.3633,
368
  "step": 820
369
  },
370
  {
371
  "epoch": 1.34,
372
- "grad_norm": 2.4795773029327393,
373
- "learning_rate": 3.3200636942675165e-05,
374
- "loss": 0.4641,
375
  "step": 840
376
  },
377
  {
378
  "epoch": 1.37,
379
- "grad_norm": 5.826427936553955,
380
- "learning_rate": 3.1608280254777074e-05,
381
- "loss": 0.4289,
382
  "step": 860
383
  },
384
  {
385
  "epoch": 1.4,
386
- "grad_norm": 4.507148742675781,
387
- "learning_rate": 3.0015923566878983e-05,
388
- "loss": 0.4525,
389
  "step": 880
390
  },
391
  {
392
  "epoch": 1.43,
393
- "grad_norm": 2.810245990753174,
394
- "learning_rate": 2.8423566878980896e-05,
395
- "loss": 0.4105,
396
  "step": 900
397
  },
398
  {
399
  "epoch": 1.43,
400
- "eval_accuracy": 0.780952380952381,
401
- "eval_loss": 0.735313892364502,
402
- "eval_runtime": 14.7897,
403
- "eval_samples_per_second": 70.995,
404
- "eval_steps_per_second": 8.925,
405
  "step": 900
406
  },
407
  {
408
  "epoch": 1.46,
409
- "grad_norm": 3.5758163928985596,
410
- "learning_rate": 2.6831210191082805e-05,
411
- "loss": 0.3657,
412
  "step": 920
413
  },
414
  {
415
  "epoch": 1.5,
416
- "grad_norm": 2.174391031265259,
417
- "learning_rate": 2.5238853503184718e-05,
418
- "loss": 0.3409,
419
  "step": 940
420
  },
421
  {
422
  "epoch": 1.53,
423
- "grad_norm": 3.542391300201416,
424
- "learning_rate": 2.372611464968153e-05,
425
- "loss": 0.3414,
426
  "step": 960
427
  },
428
  {
429
  "epoch": 1.56,
430
- "grad_norm": 4.226655006408691,
431
- "learning_rate": 2.2133757961783442e-05,
432
- "loss": 0.383,
433
  "step": 980
434
  },
435
  {
436
  "epoch": 1.59,
437
- "grad_norm": 5.462564945220947,
438
- "learning_rate": 2.054140127388535e-05,
439
- "loss": 0.4011,
440
  "step": 1000
441
  },
442
  {
443
  "epoch": 1.59,
444
- "eval_accuracy": 0.7457142857142857,
445
- "eval_loss": 0.8153719305992126,
446
- "eval_runtime": 14.4617,
447
- "eval_samples_per_second": 72.605,
448
- "eval_steps_per_second": 9.128,
449
  "step": 1000
450
  },
451
  {
452
  "epoch": 1.62,
453
- "grad_norm": 6.1501569747924805,
454
- "learning_rate": 1.8949044585987264e-05,
455
- "loss": 0.3402,
456
  "step": 1020
457
  },
458
  {
459
  "epoch": 1.66,
460
- "grad_norm": 2.9438650608062744,
461
- "learning_rate": 1.7356687898089173e-05,
462
- "loss": 0.2997,
463
  "step": 1040
464
  },
465
  {
466
  "epoch": 1.69,
467
- "grad_norm": 1.3817728757858276,
468
- "learning_rate": 1.5764331210191083e-05,
469
- "loss": 0.3485,
470
  "step": 1060
471
  },
472
  {
473
  "epoch": 1.72,
474
- "grad_norm": 0.5100256204605103,
475
- "learning_rate": 1.4171974522292993e-05,
476
- "loss": 0.3804,
477
  "step": 1080
478
  },
479
  {
480
  "epoch": 1.75,
481
- "grad_norm": 7.605688095092773,
482
- "learning_rate": 1.2579617834394904e-05,
483
- "loss": 0.3493,
484
  "step": 1100
485
  },
486
  {
487
  "epoch": 1.75,
488
- "eval_accuracy": 0.7752380952380953,
489
- "eval_loss": 0.7397615313529968,
490
- "eval_runtime": 14.8106,
491
- "eval_samples_per_second": 70.895,
492
- "eval_steps_per_second": 8.913,
493
  "step": 1100
494
  },
495
  {
496
  "epoch": 1.78,
497
- "grad_norm": 10.322568893432617,
498
- "learning_rate": 1.0987261146496815e-05,
499
- "loss": 0.4022,
500
  "step": 1120
501
  },
502
  {
503
  "epoch": 1.82,
504
- "grad_norm": 5.649250030517578,
505
- "learning_rate": 9.394904458598726e-06,
506
- "loss": 0.2426,
507
  "step": 1140
508
  },
509
  {
510
  "epoch": 1.85,
511
- "grad_norm": 7.395249366760254,
512
- "learning_rate": 7.802547770700637e-06,
513
- "loss": 0.2628,
514
  "step": 1160
515
  },
516
  {
517
  "epoch": 1.88,
518
- "grad_norm": 1.7934772968292236,
519
- "learning_rate": 6.210191082802548e-06,
520
- "loss": 0.3818,
521
  "step": 1180
522
  },
523
  {
524
  "epoch": 1.91,
525
- "grad_norm": 6.324862480163574,
526
- "learning_rate": 4.6178343949044585e-06,
527
- "loss": 0.3389,
528
  "step": 1200
529
  },
530
  {
531
  "epoch": 1.91,
532
- "eval_accuracy": 0.7676190476190476,
533
- "eval_loss": 0.7365464568138123,
534
- "eval_runtime": 14.7187,
535
- "eval_samples_per_second": 71.338,
536
- "eval_steps_per_second": 8.968,
537
  "step": 1200
538
  },
539
  {
540
  "epoch": 1.94,
541
- "grad_norm": 4.285161018371582,
542
- "learning_rate": 3.0254777070063695e-06,
543
- "loss": 0.3351,
544
  "step": 1220
545
  },
546
  {
547
  "epoch": 1.97,
548
- "grad_norm": 4.406313896179199,
549
- "learning_rate": 1.4331210191082802e-06,
550
- "loss": 0.2856,
551
  "step": 1240
552
  },
553
  {
554
- "epoch": 2.0,
555
- "step": 1256,
556
- "total_flos": 1.555375746295849e+18,
557
- "train_loss": 0.6562361546382782,
558
- "train_runtime": 775.5335,
559
- "train_samples_per_second": 25.879,
560
- "train_steps_per_second": 1.62
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
561
  }
562
  ],
563
  "logging_steps": 20,
564
- "max_steps": 1256,
565
  "num_input_tokens_seen": 0,
566
- "num_train_epochs": 2,
567
  "save_steps": 100,
568
- "total_flos": 1.555375746295849e+18,
569
  "train_batch_size": 16,
570
  "trial_name": null,
571
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.6442861557006836,
3
  "best_model_checkpoint": "Action_model/checkpoint-600",
4
+ "epoch": 10.0,
5
  "eval_steps": 100,
6
+ "global_step": 6280,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.03,
13
+ "grad_norm": 1.9006062746047974,
14
+ "learning_rate": 9.968152866242038e-05,
15
+ "loss": 2.1282,
16
  "step": 20
17
  },
18
  {
19
  "epoch": 0.06,
20
+ "grad_norm": 2.270326614379883,
21
+ "learning_rate": 9.936305732484077e-05,
22
+ "loss": 1.9389,
23
  "step": 40
24
  },
25
  {
26
  "epoch": 0.1,
27
+ "grad_norm": 2.4455759525299072,
28
+ "learning_rate": 9.904458598726115e-05,
29
+ "loss": 1.7096,
30
  "step": 60
31
  },
32
  {
33
  "epoch": 0.13,
34
+ "grad_norm": 2.7374861240386963,
35
+ "learning_rate": 9.872611464968153e-05,
36
+ "loss": 1.4553,
37
  "step": 80
38
  },
39
  {
40
  "epoch": 0.16,
41
+ "grad_norm": 2.2748217582702637,
42
+ "learning_rate": 9.840764331210192e-05,
43
+ "loss": 1.3291,
44
  "step": 100
45
  },
46
  {
47
  "epoch": 0.16,
48
+ "eval_accuracy": 0.7457142857142857,
49
+ "eval_loss": 1.1432547569274902,
50
+ "eval_runtime": 24.2055,
51
+ "eval_samples_per_second": 43.379,
52
+ "eval_steps_per_second": 5.453,
53
  "step": 100
54
  },
55
  {
56
  "epoch": 0.19,
57
+ "grad_norm": 6.182733535766602,
58
+ "learning_rate": 9.80891719745223e-05,
59
+ "loss": 1.2402,
60
  "step": 120
61
  },
62
  {
63
  "epoch": 0.22,
64
+ "grad_norm": 3.0395169258117676,
65
+ "learning_rate": 9.777070063694268e-05,
66
+ "loss": 1.1275,
67
  "step": 140
68
  },
69
  {
70
  "epoch": 0.25,
71
+ "grad_norm": 3.663072347640991,
72
+ "learning_rate": 9.745222929936307e-05,
73
+ "loss": 1.0061,
74
  "step": 160
75
  },
76
  {
77
  "epoch": 0.29,
78
+ "grad_norm": 2.044201135635376,
79
+ "learning_rate": 9.713375796178345e-05,
80
+ "loss": 1.0052,
81
  "step": 180
82
  },
83
  {
84
  "epoch": 0.32,
85
+ "grad_norm": 3.233858585357666,
86
+ "learning_rate": 9.681528662420382e-05,
87
+ "loss": 0.9409,
88
  "step": 200
89
  },
90
  {
91
  "epoch": 0.32,
92
+ "eval_accuracy": 0.7685714285714286,
93
+ "eval_loss": 0.8621137738227844,
94
+ "eval_runtime": 14.6062,
95
+ "eval_samples_per_second": 71.887,
96
+ "eval_steps_per_second": 9.037,
97
  "step": 200
98
  },
99
  {
100
  "epoch": 0.35,
101
+ "grad_norm": 3.4736602306365967,
102
+ "learning_rate": 9.649681528662421e-05,
103
+ "loss": 0.8728,
104
  "step": 220
105
  },
106
  {
107
  "epoch": 0.38,
108
+ "grad_norm": 4.214045524597168,
109
+ "learning_rate": 9.617834394904459e-05,
110
+ "loss": 0.865,
111
  "step": 240
112
  },
113
  {
114
  "epoch": 0.41,
115
+ "grad_norm": 4.457535743713379,
116
+ "learning_rate": 9.585987261146497e-05,
117
+ "loss": 0.8243,
118
  "step": 260
119
  },
120
  {
121
  "epoch": 0.45,
122
+ "grad_norm": 5.426912307739258,
123
+ "learning_rate": 9.554140127388536e-05,
124
+ "loss": 0.8089,
125
  "step": 280
126
  },
127
  {
128
  "epoch": 0.48,
129
+ "grad_norm": 3.6690785884857178,
130
+ "learning_rate": 9.522292993630574e-05,
131
+ "loss": 0.7815,
132
  "step": 300
133
  },
134
  {
135
  "epoch": 0.48,
136
+ "eval_accuracy": 0.7676190476190476,
137
+ "eval_loss": 0.7599405646324158,
138
+ "eval_runtime": 14.7234,
139
+ "eval_samples_per_second": 71.315,
140
+ "eval_steps_per_second": 8.965,
141
  "step": 300
142
  },
143
  {
144
  "epoch": 0.51,
145
+ "grad_norm": 3.9266645908355713,
146
+ "learning_rate": 9.490445859872612e-05,
147
+ "loss": 0.856,
148
  "step": 320
149
  },
150
  {
151
  "epoch": 0.54,
152
+ "grad_norm": 4.882758617401123,
153
+ "learning_rate": 9.458598726114651e-05,
154
+ "loss": 0.7119,
155
  "step": 340
156
  },
157
  {
158
  "epoch": 0.57,
159
+ "grad_norm": 4.982381343841553,
160
+ "learning_rate": 9.426751592356689e-05,
161
+ "loss": 0.7165,
162
  "step": 360
163
  },
164
  {
165
  "epoch": 0.61,
166
+ "grad_norm": 9.61684799194336,
167
+ "learning_rate": 9.394904458598726e-05,
168
+ "loss": 0.7334,
169
  "step": 380
170
  },
171
  {
172
  "epoch": 0.64,
173
+ "grad_norm": 4.234207630157471,
174
+ "learning_rate": 9.364649681528663e-05,
175
+ "loss": 0.6803,
176
  "step": 400
177
  },
178
  {
179
  "epoch": 0.64,
180
+ "eval_accuracy": 0.7209523809523809,
181
+ "eval_loss": 0.88614422082901,
182
+ "eval_runtime": 14.4486,
183
+ "eval_samples_per_second": 72.671,
184
+ "eval_steps_per_second": 9.136,
185
  "step": 400
186
  },
187
  {
188
  "epoch": 0.67,
189
+ "grad_norm": 4.2712836265563965,
190
+ "learning_rate": 9.332802547770702e-05,
191
+ "loss": 0.7381,
192
  "step": 420
193
  },
194
  {
195
  "epoch": 0.7,
196
+ "grad_norm": 4.896230220794678,
197
+ "learning_rate": 9.30095541401274e-05,
198
+ "loss": 0.6985,
199
  "step": 440
200
  },
201
  {
202
  "epoch": 0.73,
203
+ "grad_norm": 6.688229560852051,
204
+ "learning_rate": 9.269108280254777e-05,
205
+ "loss": 0.6238,
206
  "step": 460
207
  },
208
  {
209
  "epoch": 0.76,
210
+ "grad_norm": 0.8742395043373108,
211
+ "learning_rate": 9.237261146496817e-05,
212
+ "loss": 0.5231,
213
  "step": 480
214
  },
215
  {
216
  "epoch": 0.8,
217
+ "grad_norm": 7.481076240539551,
218
+ "learning_rate": 9.205414012738854e-05,
219
+ "loss": 0.6277,
220
  "step": 500
221
  },
222
  {
223
  "epoch": 0.8,
224
+ "eval_accuracy": 0.7304761904761905,
225
+ "eval_loss": 0.8189947009086609,
226
+ "eval_runtime": 14.388,
227
+ "eval_samples_per_second": 72.977,
228
+ "eval_steps_per_second": 9.174,
229
  "step": 500
230
  },
231
  {
232
  "epoch": 0.83,
233
+ "grad_norm": 4.198249340057373,
234
+ "learning_rate": 9.173566878980892e-05,
235
+ "loss": 0.5832,
236
  "step": 520
237
  },
238
  {
239
  "epoch": 0.86,
240
+ "grad_norm": 4.3164286613464355,
241
+ "learning_rate": 9.141719745222931e-05,
242
+ "loss": 0.6607,
243
  "step": 540
244
  },
245
  {
246
  "epoch": 0.89,
247
+ "grad_norm": 4.7557759284973145,
248
+ "learning_rate": 9.109872611464969e-05,
249
+ "loss": 0.5325,
250
  "step": 560
251
  },
252
  {
253
  "epoch": 0.92,
254
+ "grad_norm": 3.6213605403900146,
255
+ "learning_rate": 9.078025477707007e-05,
256
+ "loss": 0.6377,
257
  "step": 580
258
  },
259
  {
260
  "epoch": 0.96,
261
+ "grad_norm": 3.4348907470703125,
262
+ "learning_rate": 9.046178343949046e-05,
263
+ "loss": 0.5569,
264
  "step": 600
265
  },
266
  {
267
  "epoch": 0.96,
268
+ "eval_accuracy": 0.799047619047619,
269
+ "eval_loss": 0.6442861557006836,
270
+ "eval_runtime": 14.5781,
271
+ "eval_samples_per_second": 72.026,
272
  "eval_steps_per_second": 9.055,
273
  "step": 600
274
  },
275
  {
276
  "epoch": 0.99,
277
+ "grad_norm": 3.056196451187134,
278
+ "learning_rate": 9.014331210191084e-05,
279
+ "loss": 0.5262,
280
  "step": 620
281
  },
282
  {
283
  "epoch": 1.02,
284
+ "grad_norm": 5.837003707885742,
285
+ "learning_rate": 8.982484076433122e-05,
286
+ "loss": 0.5519,
287
  "step": 640
288
  },
289
  {
290
  "epoch": 1.05,
291
+ "grad_norm": 3.373262643814087,
292
+ "learning_rate": 8.950636942675161e-05,
293
+ "loss": 0.5346,
294
  "step": 660
295
  },
296
  {
297
  "epoch": 1.08,
298
+ "grad_norm": 23.955032348632812,
299
+ "learning_rate": 8.920382165605096e-05,
300
+ "loss": 0.5024,
301
  "step": 680
302
  },
303
  {
304
  "epoch": 1.11,
305
+ "grad_norm": 8.919329643249512,
306
+ "learning_rate": 8.888535031847133e-05,
307
+ "loss": 0.5029,
308
  "step": 700
309
  },
310
  {
311
  "epoch": 1.11,
312
+ "eval_accuracy": 0.7676190476190476,
313
+ "eval_loss": 0.74560546875,
314
+ "eval_runtime": 14.3938,
315
+ "eval_samples_per_second": 72.948,
316
+ "eval_steps_per_second": 9.171,
317
  "step": 700
318
  },
319
  {
320
  "epoch": 1.15,
321
+ "grad_norm": 3.3955509662628174,
322
+ "learning_rate": 8.856687898089173e-05,
323
+ "loss": 0.5526,
324
  "step": 720
325
  },
326
  {
327
  "epoch": 1.18,
328
+ "grad_norm": 4.959121227264404,
329
+ "learning_rate": 8.82484076433121e-05,
330
+ "loss": 0.6102,
331
  "step": 740
332
  },
333
  {
334
  "epoch": 1.21,
335
+ "grad_norm": 4.158123970031738,
336
+ "learning_rate": 8.792993630573248e-05,
337
+ "loss": 0.5073,
338
  "step": 760
339
  },
340
  {
341
  "epoch": 1.24,
342
+ "grad_norm": 5.6492815017700195,
343
+ "learning_rate": 8.761146496815287e-05,
344
+ "loss": 0.5264,
345
  "step": 780
346
  },
347
  {
348
  "epoch": 1.27,
349
+ "grad_norm": 6.516519069671631,
350
+ "learning_rate": 8.729299363057325e-05,
351
+ "loss": 0.3852,
352
  "step": 800
353
  },
354
  {
355
  "epoch": 1.27,
356
+ "eval_accuracy": 0.7695238095238095,
357
+ "eval_loss": 0.7608746886253357,
358
+ "eval_runtime": 14.1377,
359
+ "eval_samples_per_second": 74.27,
360
+ "eval_steps_per_second": 9.337,
361
  "step": 800
362
  },
363
  {
364
  "epoch": 1.31,
365
+ "grad_norm": 2.4917163848876953,
366
+ "learning_rate": 8.697452229299363e-05,
367
+ "loss": 0.3567,
368
  "step": 820
369
  },
370
  {
371
  "epoch": 1.34,
372
+ "grad_norm": 6.754037380218506,
373
+ "learning_rate": 8.665605095541402e-05,
374
+ "loss": 0.3531,
375
  "step": 840
376
  },
377
  {
378
  "epoch": 1.37,
379
+ "grad_norm": 2.805572032928467,
380
+ "learning_rate": 8.63375796178344e-05,
381
+ "loss": 0.4395,
382
  "step": 860
383
  },
384
  {
385
  "epoch": 1.4,
386
+ "grad_norm": 6.810430526733398,
387
+ "learning_rate": 8.601910828025478e-05,
388
+ "loss": 0.4404,
389
  "step": 880
390
  },
391
  {
392
  "epoch": 1.43,
393
+ "grad_norm": 3.569650888442993,
394
+ "learning_rate": 8.570063694267517e-05,
395
+ "loss": 0.4065,
396
  "step": 900
397
  },
398
  {
399
  "epoch": 1.43,
400
+ "eval_accuracy": 0.7704761904761904,
401
+ "eval_loss": 0.7441606521606445,
402
+ "eval_runtime": 14.6102,
403
+ "eval_samples_per_second": 71.867,
404
+ "eval_steps_per_second": 9.035,
405
  "step": 900
406
  },
407
  {
408
  "epoch": 1.46,
409
+ "grad_norm": 5.281124114990234,
410
+ "learning_rate": 8.538216560509554e-05,
411
+ "loss": 0.3558,
412
  "step": 920
413
  },
414
  {
415
  "epoch": 1.5,
416
+ "grad_norm": 2.859487295150757,
417
+ "learning_rate": 8.506369426751592e-05,
418
+ "loss": 0.5225,
419
  "step": 940
420
  },
421
  {
422
  "epoch": 1.53,
423
+ "grad_norm": 1.9912675619125366,
424
+ "learning_rate": 8.474522292993631e-05,
425
+ "loss": 0.4057,
426
  "step": 960
427
  },
428
  {
429
  "epoch": 1.56,
430
+ "grad_norm": 2.6385486125946045,
431
+ "learning_rate": 8.442675159235669e-05,
432
+ "loss": 0.327,
433
  "step": 980
434
  },
435
  {
436
  "epoch": 1.59,
437
+ "grad_norm": 1.4441651105880737,
438
+ "learning_rate": 8.410828025477707e-05,
439
+ "loss": 0.4319,
440
  "step": 1000
441
  },
442
  {
443
  "epoch": 1.59,
444
+ "eval_accuracy": 0.7542857142857143,
445
+ "eval_loss": 0.8239891529083252,
446
+ "eval_runtime": 14.7208,
447
+ "eval_samples_per_second": 71.328,
448
+ "eval_steps_per_second": 8.967,
449
  "step": 1000
450
  },
451
  {
452
  "epoch": 1.62,
453
+ "grad_norm": 6.856269359588623,
454
+ "learning_rate": 8.378980891719746e-05,
455
+ "loss": 0.3086,
456
  "step": 1020
457
  },
458
  {
459
  "epoch": 1.66,
460
+ "grad_norm": 3.9138472080230713,
461
+ "learning_rate": 8.347133757961784e-05,
462
+ "loss": 0.434,
463
  "step": 1040
464
  },
465
  {
466
  "epoch": 1.69,
467
+ "grad_norm": 1.921924114227295,
468
+ "learning_rate": 8.315286624203822e-05,
469
+ "loss": 0.474,
470
  "step": 1060
471
  },
472
  {
473
  "epoch": 1.72,
474
+ "grad_norm": 2.406094789505005,
475
+ "learning_rate": 8.283439490445861e-05,
476
+ "loss": 0.4016,
477
  "step": 1080
478
  },
479
  {
480
  "epoch": 1.75,
481
+ "grad_norm": 3.4770796298980713,
482
+ "learning_rate": 8.251592356687899e-05,
483
+ "loss": 0.4167,
484
  "step": 1100
485
  },
486
  {
487
  "epoch": 1.75,
488
+ "eval_accuracy": 0.7571428571428571,
489
+ "eval_loss": 0.8335052728652954,
490
+ "eval_runtime": 14.4944,
491
+ "eval_samples_per_second": 72.442,
492
+ "eval_steps_per_second": 9.107,
493
  "step": 1100
494
  },
495
  {
496
  "epoch": 1.78,
497
+ "grad_norm": 3.806575298309326,
498
+ "learning_rate": 8.219745222929936e-05,
499
+ "loss": 0.3995,
500
  "step": 1120
501
  },
502
  {
503
  "epoch": 1.82,
504
+ "grad_norm": 4.899787902832031,
505
+ "learning_rate": 8.187898089171974e-05,
506
+ "loss": 0.4137,
507
  "step": 1140
508
  },
509
  {
510
  "epoch": 1.85,
511
+ "grad_norm": 3.540315866470337,
512
+ "learning_rate": 8.156050955414013e-05,
513
+ "loss": 0.3625,
514
  "step": 1160
515
  },
516
  {
517
  "epoch": 1.88,
518
+ "grad_norm": 5.864512920379639,
519
+ "learning_rate": 8.124203821656051e-05,
520
+ "loss": 0.3156,
521
  "step": 1180
522
  },
523
  {
524
  "epoch": 1.91,
525
+ "grad_norm": 5.941263675689697,
526
+ "learning_rate": 8.092356687898089e-05,
527
+ "loss": 0.3778,
528
  "step": 1200
529
  },
530
  {
531
  "epoch": 1.91,
532
+ "eval_accuracy": 0.7666666666666667,
533
+ "eval_loss": 0.8210071325302124,
534
+ "eval_runtime": 14.6523,
535
+ "eval_samples_per_second": 71.661,
536
+ "eval_steps_per_second": 9.009,
537
  "step": 1200
538
  },
539
  {
540
  "epoch": 1.94,
541
+ "grad_norm": 10.370551109313965,
542
+ "learning_rate": 8.060509554140128e-05,
543
+ "loss": 0.3759,
544
  "step": 1220
545
  },
546
  {
547
  "epoch": 1.97,
548
+ "grad_norm": 0.30926162004470825,
549
+ "learning_rate": 8.028662420382166e-05,
550
+ "loss": 0.3933,
551
  "step": 1240
552
  },
553
  {
554
+ "epoch": 2.01,
555
+ "grad_norm": 3.72684907913208,
556
+ "learning_rate": 7.998407643312102e-05,
557
+ "loss": 0.4111,
558
+ "step": 1260
559
+ },
560
+ {
561
+ "epoch": 2.04,
562
+ "grad_norm": 0.4711369276046753,
563
+ "learning_rate": 7.966560509554141e-05,
564
+ "loss": 0.3268,
565
+ "step": 1280
566
+ },
567
+ {
568
+ "epoch": 2.07,
569
+ "grad_norm": 2.9964778423309326,
570
+ "learning_rate": 7.934713375796179e-05,
571
+ "loss": 0.3818,
572
+ "step": 1300
573
+ },
574
+ {
575
+ "epoch": 2.07,
576
+ "eval_accuracy": 0.7485714285714286,
577
+ "eval_loss": 0.8431442975997925,
578
+ "eval_runtime": 14.7089,
579
+ "eval_samples_per_second": 71.385,
580
+ "eval_steps_per_second": 8.974,
581
+ "step": 1300
582
+ },
583
+ {
584
+ "epoch": 2.1,
585
+ "grad_norm": 4.478536128997803,
586
+ "learning_rate": 7.902866242038217e-05,
587
+ "loss": 0.4138,
588
+ "step": 1320
589
+ },
590
+ {
591
+ "epoch": 2.13,
592
+ "grad_norm": 10.571870803833008,
593
+ "learning_rate": 7.871019108280256e-05,
594
+ "loss": 0.3231,
595
+ "step": 1340
596
+ },
597
+ {
598
+ "epoch": 2.17,
599
+ "grad_norm": 0.3552369773387909,
600
+ "learning_rate": 7.839171974522294e-05,
601
+ "loss": 0.2726,
602
+ "step": 1360
603
+ },
604
+ {
605
+ "epoch": 2.2,
606
+ "grad_norm": 7.564175605773926,
607
+ "learning_rate": 7.807324840764331e-05,
608
+ "loss": 0.3614,
609
+ "step": 1380
610
+ },
611
+ {
612
+ "epoch": 2.23,
613
+ "grad_norm": 3.9198570251464844,
614
+ "learning_rate": 7.77547770700637e-05,
615
+ "loss": 0.3249,
616
+ "step": 1400
617
+ },
618
+ {
619
+ "epoch": 2.23,
620
+ "eval_accuracy": 0.7276190476190476,
621
+ "eval_loss": 0.915591299533844,
622
+ "eval_runtime": 14.3099,
623
+ "eval_samples_per_second": 73.376,
624
+ "eval_steps_per_second": 9.224,
625
+ "step": 1400
626
+ },
627
+ {
628
+ "epoch": 2.26,
629
+ "grad_norm": 3.698049306869507,
630
+ "learning_rate": 7.743630573248408e-05,
631
+ "loss": 0.3335,
632
+ "step": 1420
633
+ },
634
+ {
635
+ "epoch": 2.29,
636
+ "grad_norm": 4.099252223968506,
637
+ "learning_rate": 7.711783439490446e-05,
638
+ "loss": 0.3994,
639
+ "step": 1440
640
+ },
641
+ {
642
+ "epoch": 2.32,
643
+ "grad_norm": 3.5469987392425537,
644
+ "learning_rate": 7.679936305732485e-05,
645
+ "loss": 0.3912,
646
+ "step": 1460
647
+ },
648
+ {
649
+ "epoch": 2.36,
650
+ "grad_norm": 0.5396450161933899,
651
+ "learning_rate": 7.648089171974523e-05,
652
+ "loss": 0.3337,
653
+ "step": 1480
654
+ },
655
+ {
656
+ "epoch": 2.39,
657
+ "grad_norm": 2.9178402423858643,
658
+ "learning_rate": 7.616242038216561e-05,
659
+ "loss": 0.2931,
660
+ "step": 1500
661
+ },
662
+ {
663
+ "epoch": 2.39,
664
+ "eval_accuracy": 0.7371428571428571,
665
+ "eval_loss": 0.8948376178741455,
666
+ "eval_runtime": 14.4741,
667
+ "eval_samples_per_second": 72.543,
668
+ "eval_steps_per_second": 9.12,
669
+ "step": 1500
670
+ },
671
+ {
672
+ "epoch": 2.42,
673
+ "grad_norm": 3.9901485443115234,
674
+ "learning_rate": 7.5843949044586e-05,
675
+ "loss": 0.3061,
676
+ "step": 1520
677
+ },
678
+ {
679
+ "epoch": 2.45,
680
+ "grad_norm": 5.539924621582031,
681
+ "learning_rate": 7.552547770700638e-05,
682
+ "loss": 0.354,
683
+ "step": 1540
684
+ },
685
+ {
686
+ "epoch": 2.48,
687
+ "grad_norm": 0.7245145440101624,
688
+ "learning_rate": 7.520700636942676e-05,
689
+ "loss": 0.3427,
690
+ "step": 1560
691
+ },
692
+ {
693
+ "epoch": 2.52,
694
+ "grad_norm": 5.047821044921875,
695
+ "learning_rate": 7.488853503184715e-05,
696
+ "loss": 0.3581,
697
+ "step": 1580
698
+ },
699
+ {
700
+ "epoch": 2.55,
701
+ "grad_norm": 9.612607955932617,
702
+ "learning_rate": 7.457006369426752e-05,
703
+ "loss": 0.2808,
704
+ "step": 1600
705
+ },
706
+ {
707
+ "epoch": 2.55,
708
+ "eval_accuracy": 0.7466666666666667,
709
+ "eval_loss": 0.9114470481872559,
710
+ "eval_runtime": 14.3031,
711
+ "eval_samples_per_second": 73.411,
712
+ "eval_steps_per_second": 9.229,
713
+ "step": 1600
714
+ },
715
+ {
716
+ "epoch": 2.58,
717
+ "grad_norm": 2.8393077850341797,
718
+ "learning_rate": 7.42515923566879e-05,
719
+ "loss": 0.3319,
720
+ "step": 1620
721
+ },
722
+ {
723
+ "epoch": 2.61,
724
+ "grad_norm": 2.83937668800354,
725
+ "learning_rate": 7.393312101910828e-05,
726
+ "loss": 0.3182,
727
+ "step": 1640
728
+ },
729
+ {
730
+ "epoch": 2.64,
731
+ "grad_norm": 3.702991008758545,
732
+ "learning_rate": 7.361464968152867e-05,
733
+ "loss": 0.2819,
734
+ "step": 1660
735
+ },
736
+ {
737
+ "epoch": 2.68,
738
+ "grad_norm": 3.6075496673583984,
739
+ "learning_rate": 7.329617834394905e-05,
740
+ "loss": 0.2595,
741
+ "step": 1680
742
+ },
743
+ {
744
+ "epoch": 2.71,
745
+ "grad_norm": 5.349125385284424,
746
+ "learning_rate": 7.297770700636943e-05,
747
+ "loss": 0.2767,
748
+ "step": 1700
749
+ },
750
+ {
751
+ "epoch": 2.71,
752
+ "eval_accuracy": 0.7590476190476191,
753
+ "eval_loss": 0.8772222399711609,
754
+ "eval_runtime": 14.5323,
755
+ "eval_samples_per_second": 72.253,
756
+ "eval_steps_per_second": 9.083,
757
+ "step": 1700
758
+ },
759
+ {
760
+ "epoch": 2.74,
761
+ "grad_norm": 5.251324653625488,
762
+ "learning_rate": 7.265923566878982e-05,
763
+ "loss": 0.281,
764
+ "step": 1720
765
+ },
766
+ {
767
+ "epoch": 2.77,
768
+ "grad_norm": 4.524991989135742,
769
+ "learning_rate": 7.23407643312102e-05,
770
+ "loss": 0.3164,
771
+ "step": 1740
772
+ },
773
+ {
774
+ "epoch": 2.8,
775
+ "grad_norm": 0.386074036359787,
776
+ "learning_rate": 7.202229299363057e-05,
777
+ "loss": 0.3269,
778
+ "step": 1760
779
+ },
780
+ {
781
+ "epoch": 2.83,
782
+ "grad_norm": 6.747868061065674,
783
+ "learning_rate": 7.170382165605097e-05,
784
+ "loss": 0.2968,
785
+ "step": 1780
786
+ },
787
+ {
788
+ "epoch": 2.87,
789
+ "grad_norm": 1.3398292064666748,
790
+ "learning_rate": 7.138535031847134e-05,
791
+ "loss": 0.3422,
792
+ "step": 1800
793
+ },
794
+ {
795
+ "epoch": 2.87,
796
+ "eval_accuracy": 0.7676190476190476,
797
+ "eval_loss": 0.8072260022163391,
798
+ "eval_runtime": 14.6204,
799
+ "eval_samples_per_second": 71.817,
800
+ "eval_steps_per_second": 9.028,
801
+ "step": 1800
802
+ },
803
+ {
804
+ "epoch": 2.9,
805
+ "grad_norm": 6.668325901031494,
806
+ "learning_rate": 7.106687898089172e-05,
807
+ "loss": 0.2825,
808
+ "step": 1820
809
+ },
810
+ {
811
+ "epoch": 2.93,
812
+ "grad_norm": 0.14574064314365387,
813
+ "learning_rate": 7.074840764331211e-05,
814
+ "loss": 0.2408,
815
+ "step": 1840
816
+ },
817
+ {
818
+ "epoch": 2.96,
819
+ "grad_norm": 7.607251167297363,
820
+ "learning_rate": 7.042993630573249e-05,
821
+ "loss": 0.3295,
822
+ "step": 1860
823
+ },
824
+ {
825
+ "epoch": 2.99,
826
+ "grad_norm": 0.10143504291772842,
827
+ "learning_rate": 7.011146496815287e-05,
828
+ "loss": 0.2425,
829
+ "step": 1880
830
+ },
831
+ {
832
+ "epoch": 3.03,
833
+ "grad_norm": 9.659887313842773,
834
+ "learning_rate": 6.979299363057326e-05,
835
+ "loss": 0.2441,
836
+ "step": 1900
837
+ },
838
+ {
839
+ "epoch": 3.03,
840
+ "eval_accuracy": 0.7380952380952381,
841
+ "eval_loss": 0.9659485816955566,
842
+ "eval_runtime": 14.6043,
843
+ "eval_samples_per_second": 71.897,
844
+ "eval_steps_per_second": 9.038,
845
+ "step": 1900
846
+ },
847
+ {
848
+ "epoch": 3.06,
849
+ "grad_norm": 0.4748639762401581,
850
+ "learning_rate": 6.947452229299364e-05,
851
+ "loss": 0.2151,
852
+ "step": 1920
853
+ },
854
+ {
855
+ "epoch": 3.09,
856
+ "grad_norm": 2.329375743865967,
857
+ "learning_rate": 6.915605095541401e-05,
858
+ "loss": 0.2014,
859
+ "step": 1940
860
+ },
861
+ {
862
+ "epoch": 3.12,
863
+ "grad_norm": 0.08076146245002747,
864
+ "learning_rate": 6.88375796178344e-05,
865
+ "loss": 0.3039,
866
+ "step": 1960
867
+ },
868
+ {
869
+ "epoch": 3.15,
870
+ "grad_norm": 2.8407723903656006,
871
+ "learning_rate": 6.851910828025478e-05,
872
+ "loss": 0.2391,
873
+ "step": 1980
874
+ },
875
+ {
876
+ "epoch": 3.18,
877
+ "grad_norm": 4.808467388153076,
878
+ "learning_rate": 6.820063694267516e-05,
879
+ "loss": 0.2438,
880
+ "step": 2000
881
+ },
882
+ {
883
+ "epoch": 3.18,
884
+ "eval_accuracy": 0.7923809523809524,
885
+ "eval_loss": 0.7905208468437195,
886
+ "eval_runtime": 14.5131,
887
+ "eval_samples_per_second": 72.349,
888
+ "eval_steps_per_second": 9.095,
889
+ "step": 2000
890
+ },
891
+ {
892
+ "epoch": 3.22,
893
+ "grad_norm": 4.137459754943848,
894
+ "learning_rate": 6.788216560509555e-05,
895
+ "loss": 0.2438,
896
+ "step": 2020
897
+ },
898
+ {
899
+ "epoch": 3.25,
900
+ "grad_norm": 1.3424416780471802,
901
+ "learning_rate": 6.756369426751593e-05,
902
+ "loss": 0.3438,
903
+ "step": 2040
904
+ },
905
+ {
906
+ "epoch": 3.28,
907
+ "grad_norm": 9.731406211853027,
908
+ "learning_rate": 6.724522292993631e-05,
909
+ "loss": 0.2893,
910
+ "step": 2060
911
+ },
912
+ {
913
+ "epoch": 3.31,
914
+ "grad_norm": 3.4848482608795166,
915
+ "learning_rate": 6.69267515923567e-05,
916
+ "loss": 0.2532,
917
+ "step": 2080
918
+ },
919
+ {
920
+ "epoch": 3.34,
921
+ "grad_norm": 0.2547754943370819,
922
+ "learning_rate": 6.660828025477708e-05,
923
+ "loss": 0.3925,
924
+ "step": 2100
925
+ },
926
+ {
927
+ "epoch": 3.34,
928
+ "eval_accuracy": 0.7304761904761905,
929
+ "eval_loss": 0.9977201819419861,
930
+ "eval_runtime": 14.2446,
931
+ "eval_samples_per_second": 73.712,
932
+ "eval_steps_per_second": 9.267,
933
+ "step": 2100
934
+ },
935
+ {
936
+ "epoch": 3.38,
937
+ "grad_norm": 0.40093258023262024,
938
+ "learning_rate": 6.628980891719746e-05,
939
+ "loss": 0.2749,
940
+ "step": 2120
941
+ },
942
+ {
943
+ "epoch": 3.41,
944
+ "grad_norm": 5.478763103485107,
945
+ "learning_rate": 6.597133757961785e-05,
946
+ "loss": 0.2362,
947
+ "step": 2140
948
+ },
949
+ {
950
+ "epoch": 3.44,
951
+ "grad_norm": 6.652340888977051,
952
+ "learning_rate": 6.565286624203822e-05,
953
+ "loss": 0.1756,
954
+ "step": 2160
955
+ },
956
+ {
957
+ "epoch": 3.47,
958
+ "grad_norm": 0.4871610701084137,
959
+ "learning_rate": 6.53343949044586e-05,
960
+ "loss": 0.2548,
961
+ "step": 2180
962
+ },
963
+ {
964
+ "epoch": 3.5,
965
+ "grad_norm": 0.6913163661956787,
966
+ "learning_rate": 6.5015923566879e-05,
967
+ "loss": 0.263,
968
+ "step": 2200
969
+ },
970
+ {
971
+ "epoch": 3.5,
972
+ "eval_accuracy": 0.7619047619047619,
973
+ "eval_loss": 0.8677366971969604,
974
+ "eval_runtime": 14.5691,
975
+ "eval_samples_per_second": 72.07,
976
+ "eval_steps_per_second": 9.06,
977
+ "step": 2200
978
+ },
979
+ {
980
+ "epoch": 3.54,
981
+ "grad_norm": 4.722912788391113,
982
+ "learning_rate": 6.469745222929937e-05,
983
+ "loss": 0.2466,
984
+ "step": 2220
985
+ },
986
+ {
987
+ "epoch": 3.57,
988
+ "grad_norm": 2.7744407653808594,
989
+ "learning_rate": 6.437898089171975e-05,
990
+ "loss": 0.2491,
991
+ "step": 2240
992
+ },
993
+ {
994
+ "epoch": 3.6,
995
+ "grad_norm": 2.703162431716919,
996
+ "learning_rate": 6.406050955414014e-05,
997
+ "loss": 0.1714,
998
+ "step": 2260
999
+ },
1000
+ {
1001
+ "epoch": 3.63,
1002
+ "grad_norm": 6.555820941925049,
1003
+ "learning_rate": 6.374203821656052e-05,
1004
+ "loss": 0.2594,
1005
+ "step": 2280
1006
+ },
1007
+ {
1008
+ "epoch": 3.66,
1009
+ "grad_norm": 1.9190115928649902,
1010
+ "learning_rate": 6.34235668789809e-05,
1011
+ "loss": 0.2585,
1012
+ "step": 2300
1013
+ },
1014
+ {
1015
+ "epoch": 3.66,
1016
+ "eval_accuracy": 0.7390476190476191,
1017
+ "eval_loss": 1.0279096364974976,
1018
+ "eval_runtime": 14.5845,
1019
+ "eval_samples_per_second": 71.994,
1020
+ "eval_steps_per_second": 9.051,
1021
+ "step": 2300
1022
+ },
1023
+ {
1024
+ "epoch": 3.69,
1025
+ "grad_norm": 3.693655252456665,
1026
+ "learning_rate": 6.310509554140129e-05,
1027
+ "loss": 0.2109,
1028
+ "step": 2320
1029
+ },
1030
+ {
1031
+ "epoch": 3.73,
1032
+ "grad_norm": 5.748462200164795,
1033
+ "learning_rate": 6.278662420382167e-05,
1034
+ "loss": 0.2292,
1035
+ "step": 2340
1036
+ },
1037
+ {
1038
+ "epoch": 3.76,
1039
+ "grad_norm": 5.914999485015869,
1040
+ "learning_rate": 6.246815286624203e-05,
1041
+ "loss": 0.3045,
1042
+ "step": 2360
1043
+ },
1044
+ {
1045
+ "epoch": 3.79,
1046
+ "grad_norm": 4.841549873352051,
1047
+ "learning_rate": 6.214968152866242e-05,
1048
+ "loss": 0.2512,
1049
+ "step": 2380
1050
+ },
1051
+ {
1052
+ "epoch": 3.82,
1053
+ "grad_norm": 1.5212379693984985,
1054
+ "learning_rate": 6.18312101910828e-05,
1055
+ "loss": 0.2523,
1056
+ "step": 2400
1057
+ },
1058
+ {
1059
+ "epoch": 3.82,
1060
+ "eval_accuracy": 0.7828571428571428,
1061
+ "eval_loss": 0.8742047548294067,
1062
+ "eval_runtime": 14.3756,
1063
+ "eval_samples_per_second": 73.041,
1064
+ "eval_steps_per_second": 9.182,
1065
+ "step": 2400
1066
+ },
1067
+ {
1068
+ "epoch": 3.85,
1069
+ "grad_norm": 5.37538480758667,
1070
+ "learning_rate": 6.151273885350318e-05,
1071
+ "loss": 0.2907,
1072
+ "step": 2420
1073
+ },
1074
+ {
1075
+ "epoch": 3.89,
1076
+ "grad_norm": 0.1197693943977356,
1077
+ "learning_rate": 6.119426751592357e-05,
1078
+ "loss": 0.166,
1079
+ "step": 2440
1080
+ },
1081
+ {
1082
+ "epoch": 3.92,
1083
+ "grad_norm": 0.6890983581542969,
1084
+ "learning_rate": 6.0875796178343946e-05,
1085
+ "loss": 0.3482,
1086
+ "step": 2460
1087
+ },
1088
+ {
1089
+ "epoch": 3.95,
1090
+ "grad_norm": 6.912744045257568,
1091
+ "learning_rate": 6.055732484076433e-05,
1092
+ "loss": 0.262,
1093
+ "step": 2480
1094
+ },
1095
+ {
1096
+ "epoch": 3.98,
1097
+ "grad_norm": 5.4778733253479,
1098
+ "learning_rate": 6.023885350318471e-05,
1099
+ "loss": 0.2322,
1100
+ "step": 2500
1101
+ },
1102
+ {
1103
+ "epoch": 3.98,
1104
+ "eval_accuracy": 0.7790476190476191,
1105
+ "eval_loss": 0.8817368745803833,
1106
+ "eval_runtime": 14.4036,
1107
+ "eval_samples_per_second": 72.899,
1108
+ "eval_steps_per_second": 9.164,
1109
+ "step": 2500
1110
+ },
1111
+ {
1112
+ "epoch": 4.01,
1113
+ "grad_norm": 1.3964594602584839,
1114
+ "learning_rate": 5.992038216560509e-05,
1115
+ "loss": 0.1819,
1116
+ "step": 2520
1117
+ },
1118
+ {
1119
+ "epoch": 4.04,
1120
+ "grad_norm": 0.07835082709789276,
1121
+ "learning_rate": 5.960191082802548e-05,
1122
+ "loss": 0.1875,
1123
+ "step": 2540
1124
+ },
1125
+ {
1126
+ "epoch": 4.08,
1127
+ "grad_norm": 0.1384442001581192,
1128
+ "learning_rate": 5.9283439490445855e-05,
1129
+ "loss": 0.2671,
1130
+ "step": 2560
1131
+ },
1132
+ {
1133
+ "epoch": 4.11,
1134
+ "grad_norm": 1.913549780845642,
1135
+ "learning_rate": 5.896496815286624e-05,
1136
+ "loss": 0.21,
1137
+ "step": 2580
1138
+ },
1139
+ {
1140
+ "epoch": 4.14,
1141
+ "grad_norm": 1.3238245248794556,
1142
+ "learning_rate": 5.8646496815286624e-05,
1143
+ "loss": 0.1948,
1144
+ "step": 2600
1145
+ },
1146
+ {
1147
+ "epoch": 4.14,
1148
+ "eval_accuracy": 0.7847619047619048,
1149
+ "eval_loss": 0.8387181162834167,
1150
+ "eval_runtime": 14.398,
1151
+ "eval_samples_per_second": 72.927,
1152
+ "eval_steps_per_second": 9.168,
1153
+ "step": 2600
1154
+ },
1155
+ {
1156
+ "epoch": 4.17,
1157
+ "grad_norm": 4.234366416931152,
1158
+ "learning_rate": 5.8328025477707e-05,
1159
+ "loss": 0.1491,
1160
+ "step": 2620
1161
+ },
1162
+ {
1163
+ "epoch": 4.2,
1164
+ "grad_norm": 2.7212390899658203,
1165
+ "learning_rate": 5.8009554140127386e-05,
1166
+ "loss": 0.1666,
1167
+ "step": 2640
1168
+ },
1169
+ {
1170
+ "epoch": 4.24,
1171
+ "grad_norm": 0.8286333680152893,
1172
+ "learning_rate": 5.769108280254777e-05,
1173
+ "loss": 0.1682,
1174
+ "step": 2660
1175
+ },
1176
+ {
1177
+ "epoch": 4.27,
1178
+ "grad_norm": 11.05745792388916,
1179
+ "learning_rate": 5.737261146496815e-05,
1180
+ "loss": 0.2116,
1181
+ "step": 2680
1182
+ },
1183
+ {
1184
+ "epoch": 4.3,
1185
+ "grad_norm": 3.838848352432251,
1186
+ "learning_rate": 5.705414012738853e-05,
1187
+ "loss": 0.2318,
1188
+ "step": 2700
1189
+ },
1190
+ {
1191
+ "epoch": 4.3,
1192
+ "eval_accuracy": 0.7123809523809523,
1193
+ "eval_loss": 1.1542153358459473,
1194
+ "eval_runtime": 14.4441,
1195
+ "eval_samples_per_second": 72.694,
1196
+ "eval_steps_per_second": 9.139,
1197
+ "step": 2700
1198
+ },
1199
+ {
1200
+ "epoch": 4.33,
1201
+ "grad_norm": 7.583115100860596,
1202
+ "learning_rate": 5.673566878980892e-05,
1203
+ "loss": 0.1891,
1204
+ "step": 2720
1205
+ },
1206
+ {
1207
+ "epoch": 4.36,
1208
+ "grad_norm": 0.050061825662851334,
1209
+ "learning_rate": 5.6417197452229296e-05,
1210
+ "loss": 0.2521,
1211
+ "step": 2740
1212
+ },
1213
+ {
1214
+ "epoch": 4.39,
1215
+ "grad_norm": 3.531385898590088,
1216
+ "learning_rate": 5.609872611464968e-05,
1217
+ "loss": 0.203,
1218
+ "step": 2760
1219
+ },
1220
+ {
1221
+ "epoch": 4.43,
1222
+ "grad_norm": 0.8870866894721985,
1223
+ "learning_rate": 5.5780254777070065e-05,
1224
+ "loss": 0.2052,
1225
+ "step": 2780
1226
+ },
1227
+ {
1228
+ "epoch": 4.46,
1229
+ "grad_norm": 5.210803031921387,
1230
+ "learning_rate": 5.546178343949044e-05,
1231
+ "loss": 0.2184,
1232
+ "step": 2800
1233
+ },
1234
+ {
1235
+ "epoch": 4.46,
1236
+ "eval_accuracy": 0.7152380952380952,
1237
+ "eval_loss": 1.1386553049087524,
1238
+ "eval_runtime": 14.4274,
1239
+ "eval_samples_per_second": 72.778,
1240
+ "eval_steps_per_second": 9.149,
1241
+ "step": 2800
1242
+ },
1243
+ {
1244
+ "epoch": 4.49,
1245
+ "grad_norm": 2.6480255126953125,
1246
+ "learning_rate": 5.514331210191083e-05,
1247
+ "loss": 0.2484,
1248
+ "step": 2820
1249
+ },
1250
+ {
1251
+ "epoch": 4.52,
1252
+ "grad_norm": 0.11620941758155823,
1253
+ "learning_rate": 5.482484076433121e-05,
1254
+ "loss": 0.1871,
1255
+ "step": 2840
1256
+ },
1257
+ {
1258
+ "epoch": 4.55,
1259
+ "grad_norm": 3.393413782119751,
1260
+ "learning_rate": 5.450636942675159e-05,
1261
+ "loss": 0.1796,
1262
+ "step": 2860
1263
+ },
1264
+ {
1265
+ "epoch": 4.59,
1266
+ "grad_norm": 2.8002829551696777,
1267
+ "learning_rate": 5.4187898089171974e-05,
1268
+ "loss": 0.2563,
1269
+ "step": 2880
1270
+ },
1271
+ {
1272
+ "epoch": 4.62,
1273
+ "grad_norm": 6.208673000335693,
1274
+ "learning_rate": 5.386942675159236e-05,
1275
+ "loss": 0.2484,
1276
+ "step": 2900
1277
+ },
1278
+ {
1279
+ "epoch": 4.62,
1280
+ "eval_accuracy": 0.7247619047619047,
1281
+ "eval_loss": 1.0976409912109375,
1282
+ "eval_runtime": 14.3178,
1283
+ "eval_samples_per_second": 73.335,
1284
+ "eval_steps_per_second": 9.219,
1285
+ "step": 2900
1286
+ },
1287
+ {
1288
+ "epoch": 4.65,
1289
+ "grad_norm": 2.2990214824676514,
1290
+ "learning_rate": 5.3550955414012736e-05,
1291
+ "loss": 0.1589,
1292
+ "step": 2920
1293
+ },
1294
+ {
1295
+ "epoch": 4.68,
1296
+ "grad_norm": 3.4065587520599365,
1297
+ "learning_rate": 5.323248407643312e-05,
1298
+ "loss": 0.1823,
1299
+ "step": 2940
1300
+ },
1301
+ {
1302
+ "epoch": 4.71,
1303
+ "grad_norm": 8.64983081817627,
1304
+ "learning_rate": 5.2914012738853506e-05,
1305
+ "loss": 0.214,
1306
+ "step": 2960
1307
+ },
1308
+ {
1309
+ "epoch": 4.75,
1310
+ "grad_norm": 0.693291425704956,
1311
+ "learning_rate": 5.2595541401273883e-05,
1312
+ "loss": 0.2407,
1313
+ "step": 2980
1314
+ },
1315
+ {
1316
+ "epoch": 4.78,
1317
+ "grad_norm": 7.287389278411865,
1318
+ "learning_rate": 5.227707006369427e-05,
1319
+ "loss": 0.1575,
1320
+ "step": 3000
1321
+ },
1322
+ {
1323
+ "epoch": 4.78,
1324
+ "eval_accuracy": 0.7457142857142857,
1325
+ "eval_loss": 1.0477701425552368,
1326
+ "eval_runtime": 14.492,
1327
+ "eval_samples_per_second": 72.454,
1328
+ "eval_steps_per_second": 9.108,
1329
+ "step": 3000
1330
+ },
1331
+ {
1332
+ "epoch": 4.81,
1333
+ "grad_norm": 5.838592529296875,
1334
+ "learning_rate": 5.1958598726114646e-05,
1335
+ "loss": 0.2628,
1336
+ "step": 3020
1337
+ },
1338
+ {
1339
+ "epoch": 4.84,
1340
+ "grad_norm": 5.59370756149292,
1341
+ "learning_rate": 5.164012738853503e-05,
1342
+ "loss": 0.1391,
1343
+ "step": 3040
1344
+ },
1345
+ {
1346
+ "epoch": 4.87,
1347
+ "grad_norm": 1.1490521430969238,
1348
+ "learning_rate": 5.1321656050955415e-05,
1349
+ "loss": 0.2392,
1350
+ "step": 3060
1351
+ },
1352
+ {
1353
+ "epoch": 4.9,
1354
+ "grad_norm": 4.223572731018066,
1355
+ "learning_rate": 5.100318471337579e-05,
1356
+ "loss": 0.2346,
1357
+ "step": 3080
1358
+ },
1359
+ {
1360
+ "epoch": 4.94,
1361
+ "grad_norm": 3.0692086219787598,
1362
+ "learning_rate": 5.068471337579618e-05,
1363
+ "loss": 0.2028,
1364
+ "step": 3100
1365
+ },
1366
+ {
1367
+ "epoch": 4.94,
1368
+ "eval_accuracy": 0.7447619047619047,
1369
+ "eval_loss": 1.0374425649642944,
1370
+ "eval_runtime": 14.213,
1371
+ "eval_samples_per_second": 73.876,
1372
+ "eval_steps_per_second": 9.287,
1373
+ "step": 3100
1374
+ },
1375
+ {
1376
+ "epoch": 4.97,
1377
+ "grad_norm": 1.3413909673690796,
1378
+ "learning_rate": 5.036624203821656e-05,
1379
+ "loss": 0.1113,
1380
+ "step": 3120
1381
+ },
1382
+ {
1383
+ "epoch": 5.0,
1384
+ "grad_norm": 0.1451389044523239,
1385
+ "learning_rate": 5.004777070063694e-05,
1386
+ "loss": 0.1797,
1387
+ "step": 3140
1388
+ },
1389
+ {
1390
+ "epoch": 5.03,
1391
+ "grad_norm": 8.001447677612305,
1392
+ "learning_rate": 4.9729299363057324e-05,
1393
+ "loss": 0.1934,
1394
+ "step": 3160
1395
+ },
1396
+ {
1397
+ "epoch": 5.06,
1398
+ "grad_norm": 3.657029151916504,
1399
+ "learning_rate": 4.941082802547771e-05,
1400
+ "loss": 0.2077,
1401
+ "step": 3180
1402
+ },
1403
+ {
1404
+ "epoch": 5.1,
1405
+ "grad_norm": 0.24746979773044586,
1406
+ "learning_rate": 4.9092356687898087e-05,
1407
+ "loss": 0.1173,
1408
+ "step": 3200
1409
+ },
1410
+ {
1411
+ "epoch": 5.1,
1412
+ "eval_accuracy": 0.7609523809523809,
1413
+ "eval_loss": 1.0067189931869507,
1414
+ "eval_runtime": 14.4189,
1415
+ "eval_samples_per_second": 72.821,
1416
+ "eval_steps_per_second": 9.155,
1417
+ "step": 3200
1418
+ },
1419
+ {
1420
+ "epoch": 5.13,
1421
+ "grad_norm": 1.1961082220077515,
1422
+ "learning_rate": 4.877388535031847e-05,
1423
+ "loss": 0.2267,
1424
+ "step": 3220
1425
+ },
1426
+ {
1427
+ "epoch": 5.16,
1428
+ "grad_norm": 6.003686904907227,
1429
+ "learning_rate": 4.8455414012738856e-05,
1430
+ "loss": 0.2543,
1431
+ "step": 3240
1432
+ },
1433
+ {
1434
+ "epoch": 5.19,
1435
+ "grad_norm": 16.887609481811523,
1436
+ "learning_rate": 4.8136942675159233e-05,
1437
+ "loss": 0.1849,
1438
+ "step": 3260
1439
+ },
1440
+ {
1441
+ "epoch": 5.22,
1442
+ "grad_norm": 1.4152742624282837,
1443
+ "learning_rate": 4.781847133757962e-05,
1444
+ "loss": 0.2119,
1445
+ "step": 3280
1446
+ },
1447
+ {
1448
+ "epoch": 5.25,
1449
+ "grad_norm": 1.6081457138061523,
1450
+ "learning_rate": 4.75e-05,
1451
+ "loss": 0.1313,
1452
+ "step": 3300
1453
+ },
1454
+ {
1455
+ "epoch": 5.25,
1456
+ "eval_accuracy": 0.7266666666666667,
1457
+ "eval_loss": 1.1970884799957275,
1458
+ "eval_runtime": 14.5245,
1459
+ "eval_samples_per_second": 72.292,
1460
+ "eval_steps_per_second": 9.088,
1461
+ "step": 3300
1462
+ },
1463
+ {
1464
+ "epoch": 5.29,
1465
+ "grad_norm": 19.153751373291016,
1466
+ "learning_rate": 4.718152866242038e-05,
1467
+ "loss": 0.2472,
1468
+ "step": 3320
1469
+ },
1470
+ {
1471
+ "epoch": 5.32,
1472
+ "grad_norm": 4.610495567321777,
1473
+ "learning_rate": 4.6863057324840765e-05,
1474
+ "loss": 0.1872,
1475
+ "step": 3340
1476
+ },
1477
+ {
1478
+ "epoch": 5.35,
1479
+ "grad_norm": 1.8803948163986206,
1480
+ "learning_rate": 4.654458598726115e-05,
1481
+ "loss": 0.153,
1482
+ "step": 3360
1483
+ },
1484
+ {
1485
+ "epoch": 5.38,
1486
+ "grad_norm": 2.9183852672576904,
1487
+ "learning_rate": 4.622611464968153e-05,
1488
+ "loss": 0.1024,
1489
+ "step": 3380
1490
+ },
1491
+ {
1492
+ "epoch": 5.41,
1493
+ "grad_norm": 11.132299423217773,
1494
+ "learning_rate": 4.590764331210191e-05,
1495
+ "loss": 0.2142,
1496
+ "step": 3400
1497
+ },
1498
+ {
1499
+ "epoch": 5.41,
1500
+ "eval_accuracy": 0.74,
1501
+ "eval_loss": 1.1455038785934448,
1502
+ "eval_runtime": 13.999,
1503
+ "eval_samples_per_second": 75.006,
1504
+ "eval_steps_per_second": 9.429,
1505
+ "step": 3400
1506
+ },
1507
+ {
1508
+ "epoch": 5.45,
1509
+ "grad_norm": 0.15127496421337128,
1510
+ "learning_rate": 4.5589171974522296e-05,
1511
+ "loss": 0.1674,
1512
+ "step": 3420
1513
+ },
1514
+ {
1515
+ "epoch": 5.48,
1516
+ "grad_norm": 4.990033149719238,
1517
+ "learning_rate": 4.5270700636942674e-05,
1518
+ "loss": 0.1906,
1519
+ "step": 3440
1520
+ },
1521
+ {
1522
+ "epoch": 5.51,
1523
+ "grad_norm": 4.008519649505615,
1524
+ "learning_rate": 4.495222929936306e-05,
1525
+ "loss": 0.1497,
1526
+ "step": 3460
1527
+ },
1528
+ {
1529
+ "epoch": 5.54,
1530
+ "grad_norm": 1.5550432205200195,
1531
+ "learning_rate": 4.4633757961783443e-05,
1532
+ "loss": 0.1006,
1533
+ "step": 3480
1534
+ },
1535
+ {
1536
+ "epoch": 5.57,
1537
+ "grad_norm": 0.13081075251102448,
1538
+ "learning_rate": 4.431528662420382e-05,
1539
+ "loss": 0.1302,
1540
+ "step": 3500
1541
+ },
1542
+ {
1543
+ "epoch": 5.57,
1544
+ "eval_accuracy": 0.7628571428571429,
1545
+ "eval_loss": 1.0319159030914307,
1546
+ "eval_runtime": 14.3106,
1547
+ "eval_samples_per_second": 73.372,
1548
+ "eval_steps_per_second": 9.224,
1549
+ "step": 3500
1550
+ },
1551
+ {
1552
+ "epoch": 5.61,
1553
+ "grad_norm": 4.138894081115723,
1554
+ "learning_rate": 4.3996815286624206e-05,
1555
+ "loss": 0.1988,
1556
+ "step": 3520
1557
+ },
1558
+ {
1559
+ "epoch": 5.64,
1560
+ "grad_norm": 0.9299852848052979,
1561
+ "learning_rate": 4.3678343949044584e-05,
1562
+ "loss": 0.1264,
1563
+ "step": 3540
1564
+ },
1565
+ {
1566
+ "epoch": 5.67,
1567
+ "grad_norm": 0.05426739901304245,
1568
+ "learning_rate": 4.335987261146497e-05,
1569
+ "loss": 0.184,
1570
+ "step": 3560
1571
+ },
1572
+ {
1573
+ "epoch": 5.7,
1574
+ "grad_norm": 0.8005551695823669,
1575
+ "learning_rate": 4.304140127388535e-05,
1576
+ "loss": 0.1535,
1577
+ "step": 3580
1578
+ },
1579
+ {
1580
+ "epoch": 5.73,
1581
+ "grad_norm": 6.299161911010742,
1582
+ "learning_rate": 4.272292993630573e-05,
1583
+ "loss": 0.2193,
1584
+ "step": 3600
1585
+ },
1586
+ {
1587
+ "epoch": 5.73,
1588
+ "eval_accuracy": 0.7733333333333333,
1589
+ "eval_loss": 0.9745522141456604,
1590
+ "eval_runtime": 14.4492,
1591
+ "eval_samples_per_second": 72.669,
1592
+ "eval_steps_per_second": 9.135,
1593
+ "step": 3600
1594
+ },
1595
+ {
1596
+ "epoch": 5.76,
1597
+ "grad_norm": 5.8440937995910645,
1598
+ "learning_rate": 4.2404458598726115e-05,
1599
+ "loss": 0.2346,
1600
+ "step": 3620
1601
+ },
1602
+ {
1603
+ "epoch": 5.8,
1604
+ "grad_norm": 0.3318318724632263,
1605
+ "learning_rate": 4.20859872611465e-05,
1606
+ "loss": 0.2378,
1607
+ "step": 3640
1608
+ },
1609
+ {
1610
+ "epoch": 5.83,
1611
+ "grad_norm": 6.423093318939209,
1612
+ "learning_rate": 4.176751592356688e-05,
1613
+ "loss": 0.2194,
1614
+ "step": 3660
1615
+ },
1616
+ {
1617
+ "epoch": 5.86,
1618
+ "grad_norm": 1.1782708168029785,
1619
+ "learning_rate": 4.144904458598726e-05,
1620
+ "loss": 0.2442,
1621
+ "step": 3680
1622
+ },
1623
+ {
1624
+ "epoch": 5.89,
1625
+ "grad_norm": 4.644681930541992,
1626
+ "learning_rate": 4.1130573248407647e-05,
1627
+ "loss": 0.1778,
1628
+ "step": 3700
1629
+ },
1630
+ {
1631
+ "epoch": 5.89,
1632
+ "eval_accuracy": 0.7552380952380953,
1633
+ "eval_loss": 1.0206609964370728,
1634
+ "eval_runtime": 14.406,
1635
+ "eval_samples_per_second": 72.886,
1636
+ "eval_steps_per_second": 9.163,
1637
+ "step": 3700
1638
+ },
1639
+ {
1640
+ "epoch": 5.92,
1641
+ "grad_norm": 2.935955762863159,
1642
+ "learning_rate": 4.0812101910828024e-05,
1643
+ "loss": 0.2641,
1644
+ "step": 3720
1645
+ },
1646
+ {
1647
+ "epoch": 5.96,
1648
+ "grad_norm": 12.24996566772461,
1649
+ "learning_rate": 4.049363057324841e-05,
1650
+ "loss": 0.2185,
1651
+ "step": 3740
1652
+ },
1653
+ {
1654
+ "epoch": 5.99,
1655
+ "grad_norm": 4.923351287841797,
1656
+ "learning_rate": 4.0175159235668793e-05,
1657
+ "loss": 0.1169,
1658
+ "step": 3760
1659
+ },
1660
+ {
1661
+ "epoch": 6.02,
1662
+ "grad_norm": 3.076030731201172,
1663
+ "learning_rate": 3.985668789808917e-05,
1664
+ "loss": 0.1629,
1665
+ "step": 3780
1666
+ },
1667
+ {
1668
+ "epoch": 6.05,
1669
+ "grad_norm": 1.7281866073608398,
1670
+ "learning_rate": 3.9538216560509556e-05,
1671
+ "loss": 0.1003,
1672
+ "step": 3800
1673
+ },
1674
+ {
1675
+ "epoch": 6.05,
1676
+ "eval_accuracy": 0.7638095238095238,
1677
+ "eval_loss": 1.0537958145141602,
1678
+ "eval_runtime": 14.4611,
1679
+ "eval_samples_per_second": 72.609,
1680
+ "eval_steps_per_second": 9.128,
1681
+ "step": 3800
1682
+ },
1683
+ {
1684
+ "epoch": 6.08,
1685
+ "grad_norm": 4.985991477966309,
1686
+ "learning_rate": 3.921974522292994e-05,
1687
+ "loss": 0.1808,
1688
+ "step": 3820
1689
+ },
1690
+ {
1691
+ "epoch": 6.11,
1692
+ "grad_norm": 0.1005251556634903,
1693
+ "learning_rate": 3.890127388535032e-05,
1694
+ "loss": 0.1514,
1695
+ "step": 3840
1696
+ },
1697
+ {
1698
+ "epoch": 6.15,
1699
+ "grad_norm": 4.247957706451416,
1700
+ "learning_rate": 3.85828025477707e-05,
1701
+ "loss": 0.1823,
1702
+ "step": 3860
1703
+ },
1704
+ {
1705
+ "epoch": 6.18,
1706
+ "grad_norm": 4.05401611328125,
1707
+ "learning_rate": 3.8280254777070066e-05,
1708
+ "loss": 0.1412,
1709
+ "step": 3880
1710
+ },
1711
+ {
1712
+ "epoch": 6.21,
1713
+ "grad_norm": 0.09717115759849548,
1714
+ "learning_rate": 3.796178343949045e-05,
1715
+ "loss": 0.1644,
1716
+ "step": 3900
1717
+ },
1718
+ {
1719
+ "epoch": 6.21,
1720
+ "eval_accuracy": 0.7304761904761905,
1721
+ "eval_loss": 1.1832480430603027,
1722
+ "eval_runtime": 14.3128,
1723
+ "eval_samples_per_second": 73.361,
1724
+ "eval_steps_per_second": 9.222,
1725
+ "step": 3900
1726
+ },
1727
+ {
1728
+ "epoch": 6.24,
1729
+ "grad_norm": 6.105861663818359,
1730
+ "learning_rate": 3.7643312101910836e-05,
1731
+ "loss": 0.2253,
1732
+ "step": 3920
1733
+ },
1734
+ {
1735
+ "epoch": 6.27,
1736
+ "grad_norm": 0.2512984573841095,
1737
+ "learning_rate": 3.7324840764331207e-05,
1738
+ "loss": 0.1698,
1739
+ "step": 3940
1740
+ },
1741
+ {
1742
+ "epoch": 6.31,
1743
+ "grad_norm": 0.045377813279628754,
1744
+ "learning_rate": 3.700636942675159e-05,
1745
+ "loss": 0.1442,
1746
+ "step": 3960
1747
+ },
1748
+ {
1749
+ "epoch": 6.34,
1750
+ "grad_norm": 8.247663497924805,
1751
+ "learning_rate": 3.6687898089171976e-05,
1752
+ "loss": 0.1449,
1753
+ "step": 3980
1754
+ },
1755
+ {
1756
+ "epoch": 6.37,
1757
+ "grad_norm": 10.677542686462402,
1758
+ "learning_rate": 3.6369426751592353e-05,
1759
+ "loss": 0.1843,
1760
+ "step": 4000
1761
+ },
1762
+ {
1763
+ "epoch": 6.37,
1764
+ "eval_accuracy": 0.7495238095238095,
1765
+ "eval_loss": 1.081445336341858,
1766
+ "eval_runtime": 14.074,
1767
+ "eval_samples_per_second": 74.605,
1768
+ "eval_steps_per_second": 9.379,
1769
+ "step": 4000
1770
+ },
1771
+ {
1772
+ "epoch": 6.4,
1773
+ "grad_norm": 1.8205310106277466,
1774
+ "learning_rate": 3.605095541401274e-05,
1775
+ "loss": 0.1621,
1776
+ "step": 4020
1777
+ },
1778
+ {
1779
+ "epoch": 6.43,
1780
+ "grad_norm": 4.124607086181641,
1781
+ "learning_rate": 3.573248407643312e-05,
1782
+ "loss": 0.1634,
1783
+ "step": 4040
1784
+ },
1785
+ {
1786
+ "epoch": 6.46,
1787
+ "grad_norm": 0.3355584442615509,
1788
+ "learning_rate": 3.54140127388535e-05,
1789
+ "loss": 0.2083,
1790
+ "step": 4060
1791
+ },
1792
+ {
1793
+ "epoch": 6.5,
1794
+ "grad_norm": 0.12367378920316696,
1795
+ "learning_rate": 3.5095541401273885e-05,
1796
+ "loss": 0.148,
1797
+ "step": 4080
1798
+ },
1799
+ {
1800
+ "epoch": 6.53,
1801
+ "grad_norm": 0.021049553528428078,
1802
+ "learning_rate": 3.477707006369427e-05,
1803
+ "loss": 0.129,
1804
+ "step": 4100
1805
+ },
1806
+ {
1807
+ "epoch": 6.53,
1808
+ "eval_accuracy": 0.72,
1809
+ "eval_loss": 1.247863531112671,
1810
+ "eval_runtime": 14.2349,
1811
+ "eval_samples_per_second": 73.762,
1812
+ "eval_steps_per_second": 9.273,
1813
+ "step": 4100
1814
+ },
1815
+ {
1816
+ "epoch": 6.56,
1817
+ "grad_norm": 1.4821038246154785,
1818
+ "learning_rate": 3.445859872611465e-05,
1819
+ "loss": 0.1379,
1820
+ "step": 4120
1821
+ },
1822
+ {
1823
+ "epoch": 6.59,
1824
+ "grad_norm": 2.6189982891082764,
1825
+ "learning_rate": 3.414012738853503e-05,
1826
+ "loss": 0.1667,
1827
+ "step": 4140
1828
+ },
1829
+ {
1830
+ "epoch": 6.62,
1831
+ "grad_norm": 1.175626516342163,
1832
+ "learning_rate": 3.3821656050955416e-05,
1833
+ "loss": 0.2009,
1834
+ "step": 4160
1835
+ },
1836
+ {
1837
+ "epoch": 6.66,
1838
+ "grad_norm": 5.35676908493042,
1839
+ "learning_rate": 3.3503184713375794e-05,
1840
+ "loss": 0.1821,
1841
+ "step": 4180
1842
+ },
1843
+ {
1844
+ "epoch": 6.69,
1845
+ "grad_norm": 2.7101659774780273,
1846
+ "learning_rate": 3.318471337579618e-05,
1847
+ "loss": 0.17,
1848
+ "step": 4200
1849
+ },
1850
+ {
1851
+ "epoch": 6.69,
1852
+ "eval_accuracy": 0.741904761904762,
1853
+ "eval_loss": 1.1575206518173218,
1854
+ "eval_runtime": 14.4135,
1855
+ "eval_samples_per_second": 72.848,
1856
+ "eval_steps_per_second": 9.158,
1857
+ "step": 4200
1858
+ },
1859
+ {
1860
+ "epoch": 6.72,
1861
+ "grad_norm": 4.070112705230713,
1862
+ "learning_rate": 3.286624203821656e-05,
1863
+ "loss": 0.2064,
1864
+ "step": 4220
1865
+ },
1866
+ {
1867
+ "epoch": 6.75,
1868
+ "grad_norm": 0.4738242030143738,
1869
+ "learning_rate": 3.254777070063694e-05,
1870
+ "loss": 0.1836,
1871
+ "step": 4240
1872
+ },
1873
+ {
1874
+ "epoch": 6.78,
1875
+ "grad_norm": 6.459200859069824,
1876
+ "learning_rate": 3.2229299363057326e-05,
1877
+ "loss": 0.1277,
1878
+ "step": 4260
1879
+ },
1880
+ {
1881
+ "epoch": 6.82,
1882
+ "grad_norm": 4.038717746734619,
1883
+ "learning_rate": 3.191082802547771e-05,
1884
+ "loss": 0.1903,
1885
+ "step": 4280
1886
+ },
1887
+ {
1888
+ "epoch": 6.85,
1889
+ "grad_norm": 1.6985381841659546,
1890
+ "learning_rate": 3.159235668789809e-05,
1891
+ "loss": 0.2184,
1892
+ "step": 4300
1893
+ },
1894
+ {
1895
+ "epoch": 6.85,
1896
+ "eval_accuracy": 0.7561904761904762,
1897
+ "eval_loss": 1.0945791006088257,
1898
+ "eval_runtime": 14.3853,
1899
+ "eval_samples_per_second": 72.991,
1900
+ "eval_steps_per_second": 9.176,
1901
+ "step": 4300
1902
+ },
1903
+ {
1904
+ "epoch": 6.88,
1905
+ "grad_norm": 1.8890398740768433,
1906
+ "learning_rate": 3.127388535031847e-05,
1907
+ "loss": 0.1224,
1908
+ "step": 4320
1909
+ },
1910
+ {
1911
+ "epoch": 6.91,
1912
+ "grad_norm": 5.5975799560546875,
1913
+ "learning_rate": 3.095541401273885e-05,
1914
+ "loss": 0.1936,
1915
+ "step": 4340
1916
+ },
1917
+ {
1918
+ "epoch": 6.94,
1919
+ "grad_norm": 0.805150032043457,
1920
+ "learning_rate": 3.0636942675159235e-05,
1921
+ "loss": 0.1103,
1922
+ "step": 4360
1923
+ },
1924
+ {
1925
+ "epoch": 6.97,
1926
+ "grad_norm": 6.093379020690918,
1927
+ "learning_rate": 3.031847133757962e-05,
1928
+ "loss": 0.1666,
1929
+ "step": 4380
1930
+ },
1931
+ {
1932
+ "epoch": 7.01,
1933
+ "grad_norm": 0.4640253484249115,
1934
+ "learning_rate": 3e-05,
1935
+ "loss": 0.1506,
1936
+ "step": 4400
1937
+ },
1938
+ {
1939
+ "epoch": 7.01,
1940
+ "eval_accuracy": 0.7714285714285715,
1941
+ "eval_loss": 1.0580005645751953,
1942
+ "eval_runtime": 14.4045,
1943
+ "eval_samples_per_second": 72.894,
1944
+ "eval_steps_per_second": 9.164,
1945
+ "step": 4400
1946
+ },
1947
+ {
1948
+ "epoch": 7.04,
1949
+ "grad_norm": 0.06945107132196426,
1950
+ "learning_rate": 2.9681528662420382e-05,
1951
+ "loss": 0.1537,
1952
+ "step": 4420
1953
+ },
1954
+ {
1955
+ "epoch": 7.07,
1956
+ "grad_norm": 1.686673879623413,
1957
+ "learning_rate": 2.9363057324840763e-05,
1958
+ "loss": 0.1661,
1959
+ "step": 4440
1960
+ },
1961
+ {
1962
+ "epoch": 7.1,
1963
+ "grad_norm": 3.4670166969299316,
1964
+ "learning_rate": 2.9044585987261148e-05,
1965
+ "loss": 0.1128,
1966
+ "step": 4460
1967
+ },
1968
+ {
1969
+ "epoch": 7.13,
1970
+ "grad_norm": 3.8089122772216797,
1971
+ "learning_rate": 2.872611464968153e-05,
1972
+ "loss": 0.1505,
1973
+ "step": 4480
1974
+ },
1975
+ {
1976
+ "epoch": 7.17,
1977
+ "grad_norm": 0.22587546706199646,
1978
+ "learning_rate": 2.840764331210191e-05,
1979
+ "loss": 0.1099,
1980
+ "step": 4500
1981
+ },
1982
+ {
1983
+ "epoch": 7.17,
1984
+ "eval_accuracy": 0.7638095238095238,
1985
+ "eval_loss": 1.0479028224945068,
1986
+ "eval_runtime": 14.3476,
1987
+ "eval_samples_per_second": 73.183,
1988
+ "eval_steps_per_second": 9.2,
1989
+ "step": 4500
1990
+ },
1991
+ {
1992
+ "epoch": 7.2,
1993
+ "grad_norm": 0.4526523947715759,
1994
+ "learning_rate": 2.8089171974522295e-05,
1995
+ "loss": 0.0994,
1996
+ "step": 4520
1997
+ },
1998
+ {
1999
+ "epoch": 7.23,
2000
+ "grad_norm": 2.4940195083618164,
2001
+ "learning_rate": 2.7770700636942676e-05,
2002
+ "loss": 0.1098,
2003
+ "step": 4540
2004
+ },
2005
+ {
2006
+ "epoch": 7.26,
2007
+ "grad_norm": 1.7795358896255493,
2008
+ "learning_rate": 2.7452229299363057e-05,
2009
+ "loss": 0.1648,
2010
+ "step": 4560
2011
+ },
2012
+ {
2013
+ "epoch": 7.29,
2014
+ "grad_norm": 8.93464469909668,
2015
+ "learning_rate": 2.713375796178344e-05,
2016
+ "loss": 0.151,
2017
+ "step": 4580
2018
+ },
2019
+ {
2020
+ "epoch": 7.32,
2021
+ "grad_norm": 0.04857097566127777,
2022
+ "learning_rate": 2.6815286624203823e-05,
2023
+ "loss": 0.1226,
2024
+ "step": 4600
2025
+ },
2026
+ {
2027
+ "epoch": 7.32,
2028
+ "eval_accuracy": 0.7495238095238095,
2029
+ "eval_loss": 1.1306982040405273,
2030
+ "eval_runtime": 14.3492,
2031
+ "eval_samples_per_second": 73.175,
2032
+ "eval_steps_per_second": 9.199,
2033
+ "step": 4600
2034
+ },
2035
+ {
2036
+ "epoch": 7.36,
2037
+ "grad_norm": 1.8932238817214966,
2038
+ "learning_rate": 2.6496815286624204e-05,
2039
+ "loss": 0.1225,
2040
+ "step": 4620
2041
+ },
2042
+ {
2043
+ "epoch": 7.39,
2044
+ "grad_norm": 10.580707550048828,
2045
+ "learning_rate": 2.617834394904459e-05,
2046
+ "loss": 0.1567,
2047
+ "step": 4640
2048
+ },
2049
+ {
2050
+ "epoch": 7.42,
2051
+ "grad_norm": 0.38718360662460327,
2052
+ "learning_rate": 2.585987261146497e-05,
2053
+ "loss": 0.1251,
2054
+ "step": 4660
2055
+ },
2056
+ {
2057
+ "epoch": 7.45,
2058
+ "grad_norm": 9.657524108886719,
2059
+ "learning_rate": 2.554140127388535e-05,
2060
+ "loss": 0.1076,
2061
+ "step": 4680
2062
+ },
2063
+ {
2064
+ "epoch": 7.48,
2065
+ "grad_norm": 0.019253961741924286,
2066
+ "learning_rate": 2.5222929936305732e-05,
2067
+ "loss": 0.2122,
2068
+ "step": 4700
2069
+ },
2070
+ {
2071
+ "epoch": 7.48,
2072
+ "eval_accuracy": 0.7285714285714285,
2073
+ "eval_loss": 1.2837932109832764,
2074
+ "eval_runtime": 14.1311,
2075
+ "eval_samples_per_second": 74.304,
2076
+ "eval_steps_per_second": 9.341,
2077
+ "step": 4700
2078
+ },
2079
+ {
2080
+ "epoch": 7.52,
2081
+ "grad_norm": 0.8583599925041199,
2082
+ "learning_rate": 2.4904458598726117e-05,
2083
+ "loss": 0.1148,
2084
+ "step": 4720
2085
+ },
2086
+ {
2087
+ "epoch": 7.55,
2088
+ "grad_norm": 0.6162470579147339,
2089
+ "learning_rate": 2.4585987261146498e-05,
2090
+ "loss": 0.1698,
2091
+ "step": 4740
2092
+ },
2093
+ {
2094
+ "epoch": 7.58,
2095
+ "grad_norm": 0.9160044193267822,
2096
+ "learning_rate": 2.426751592356688e-05,
2097
+ "loss": 0.0871,
2098
+ "step": 4760
2099
+ },
2100
+ {
2101
+ "epoch": 7.61,
2102
+ "grad_norm": 0.59488445520401,
2103
+ "learning_rate": 2.3949044585987263e-05,
2104
+ "loss": 0.1455,
2105
+ "step": 4780
2106
+ },
2107
+ {
2108
+ "epoch": 7.64,
2109
+ "grad_norm": 18.75020980834961,
2110
+ "learning_rate": 2.3630573248407645e-05,
2111
+ "loss": 0.1565,
2112
+ "step": 4800
2113
+ },
2114
+ {
2115
+ "epoch": 7.64,
2116
+ "eval_accuracy": 0.7390476190476191,
2117
+ "eval_loss": 1.2039564847946167,
2118
+ "eval_runtime": 14.3722,
2119
+ "eval_samples_per_second": 73.058,
2120
+ "eval_steps_per_second": 9.184,
2121
+ "step": 4800
2122
+ },
2123
+ {
2124
+ "epoch": 7.68,
2125
+ "grad_norm": 0.011890099383890629,
2126
+ "learning_rate": 2.3312101910828026e-05,
2127
+ "loss": 0.0881,
2128
+ "step": 4820
2129
+ },
2130
+ {
2131
+ "epoch": 7.71,
2132
+ "grad_norm": 2.5114028453826904,
2133
+ "learning_rate": 2.299363057324841e-05,
2134
+ "loss": 0.1385,
2135
+ "step": 4840
2136
+ },
2137
+ {
2138
+ "epoch": 7.74,
2139
+ "grad_norm": 0.410961389541626,
2140
+ "learning_rate": 2.267515923566879e-05,
2141
+ "loss": 0.1422,
2142
+ "step": 4860
2143
+ },
2144
+ {
2145
+ "epoch": 7.77,
2146
+ "grad_norm": 0.022722242400050163,
2147
+ "learning_rate": 2.2356687898089173e-05,
2148
+ "loss": 0.1176,
2149
+ "step": 4880
2150
+ },
2151
+ {
2152
+ "epoch": 7.8,
2153
+ "grad_norm": 0.026792345568537712,
2154
+ "learning_rate": 2.2038216560509557e-05,
2155
+ "loss": 0.151,
2156
+ "step": 4900
2157
+ },
2158
+ {
2159
+ "epoch": 7.8,
2160
+ "eval_accuracy": 0.7428571428571429,
2161
+ "eval_loss": 1.2361027002334595,
2162
+ "eval_runtime": 14.3298,
2163
+ "eval_samples_per_second": 73.274,
2164
+ "eval_steps_per_second": 9.212,
2165
+ "step": 4900
2166
+ },
2167
+ {
2168
+ "epoch": 7.83,
2169
+ "grad_norm": 5.774230480194092,
2170
+ "learning_rate": 2.171974522292994e-05,
2171
+ "loss": 0.1663,
2172
+ "step": 4920
2173
+ },
2174
+ {
2175
+ "epoch": 7.87,
2176
+ "grad_norm": 0.011655393987894058,
2177
+ "learning_rate": 2.140127388535032e-05,
2178
+ "loss": 0.1085,
2179
+ "step": 4940
2180
+ },
2181
+ {
2182
+ "epoch": 7.9,
2183
+ "grad_norm": 2.6619997024536133,
2184
+ "learning_rate": 2.10828025477707e-05,
2185
+ "loss": 0.115,
2186
+ "step": 4960
2187
+ },
2188
+ {
2189
+ "epoch": 7.93,
2190
+ "grad_norm": 6.465090274810791,
2191
+ "learning_rate": 2.0764331210191085e-05,
2192
+ "loss": 0.1723,
2193
+ "step": 4980
2194
+ },
2195
+ {
2196
+ "epoch": 7.96,
2197
+ "grad_norm": 8.653176307678223,
2198
+ "learning_rate": 2.0445859872611467e-05,
2199
+ "loss": 0.0934,
2200
+ "step": 5000
2201
+ },
2202
+ {
2203
+ "epoch": 7.96,
2204
+ "eval_accuracy": 0.7457142857142857,
2205
+ "eval_loss": 1.198464274406433,
2206
+ "eval_runtime": 14.2409,
2207
+ "eval_samples_per_second": 73.731,
2208
+ "eval_steps_per_second": 9.269,
2209
+ "step": 5000
2210
+ },
2211
+ {
2212
+ "epoch": 7.99,
2213
+ "grad_norm": 4.535229206085205,
2214
+ "learning_rate": 2.0127388535031848e-05,
2215
+ "loss": 0.1456,
2216
+ "step": 5020
2217
+ },
2218
+ {
2219
+ "epoch": 8.03,
2220
+ "grad_norm": 11.831160545349121,
2221
+ "learning_rate": 1.9808917197452232e-05,
2222
+ "loss": 0.1115,
2223
+ "step": 5040
2224
+ },
2225
+ {
2226
+ "epoch": 8.06,
2227
+ "grad_norm": 8.955262184143066,
2228
+ "learning_rate": 1.9490445859872614e-05,
2229
+ "loss": 0.1298,
2230
+ "step": 5060
2231
+ },
2232
+ {
2233
+ "epoch": 8.09,
2234
+ "grad_norm": 5.660279273986816,
2235
+ "learning_rate": 1.9171974522292995e-05,
2236
+ "loss": 0.1849,
2237
+ "step": 5080
2238
+ },
2239
+ {
2240
+ "epoch": 8.12,
2241
+ "grad_norm": 6.7122087478637695,
2242
+ "learning_rate": 1.885350318471338e-05,
2243
+ "loss": 0.1374,
2244
+ "step": 5100
2245
+ },
2246
+ {
2247
+ "epoch": 8.12,
2248
+ "eval_accuracy": 0.7495238095238095,
2249
+ "eval_loss": 1.136474370956421,
2250
+ "eval_runtime": 14.3605,
2251
+ "eval_samples_per_second": 73.117,
2252
+ "eval_steps_per_second": 9.192,
2253
+ "step": 5100
2254
+ },
2255
+ {
2256
+ "epoch": 8.15,
2257
+ "grad_norm": 0.83880215883255,
2258
+ "learning_rate": 1.8535031847133757e-05,
2259
+ "loss": 0.1406,
2260
+ "step": 5120
2261
+ },
2262
+ {
2263
+ "epoch": 8.18,
2264
+ "grad_norm": 7.110737323760986,
2265
+ "learning_rate": 1.8216560509554138e-05,
2266
+ "loss": 0.1445,
2267
+ "step": 5140
2268
+ },
2269
+ {
2270
+ "epoch": 8.22,
2271
+ "grad_norm": 3.224792242050171,
2272
+ "learning_rate": 1.7898089171974523e-05,
2273
+ "loss": 0.1951,
2274
+ "step": 5160
2275
+ },
2276
+ {
2277
+ "epoch": 8.25,
2278
+ "grad_norm": 1.4524933099746704,
2279
+ "learning_rate": 1.7579617834394904e-05,
2280
+ "loss": 0.1013,
2281
+ "step": 5180
2282
+ },
2283
+ {
2284
+ "epoch": 8.28,
2285
+ "grad_norm": 5.544615268707275,
2286
+ "learning_rate": 1.7261146496815285e-05,
2287
+ "loss": 0.1799,
2288
+ "step": 5200
2289
+ },
2290
+ {
2291
+ "epoch": 8.28,
2292
+ "eval_accuracy": 0.758095238095238,
2293
+ "eval_loss": 1.1371468305587769,
2294
+ "eval_runtime": 14.4095,
2295
+ "eval_samples_per_second": 72.869,
2296
+ "eval_steps_per_second": 9.161,
2297
+ "step": 5200
2298
+ },
2299
+ {
2300
+ "epoch": 8.31,
2301
+ "grad_norm": 0.4378151297569275,
2302
+ "learning_rate": 1.694267515923567e-05,
2303
+ "loss": 0.156,
2304
+ "step": 5220
2305
+ },
2306
+ {
2307
+ "epoch": 8.34,
2308
+ "grad_norm": 4.541534423828125,
2309
+ "learning_rate": 1.662420382165605e-05,
2310
+ "loss": 0.1158,
2311
+ "step": 5240
2312
+ },
2313
+ {
2314
+ "epoch": 8.38,
2315
+ "grad_norm": 0.01598621904850006,
2316
+ "learning_rate": 1.6305732484076432e-05,
2317
+ "loss": 0.1636,
2318
+ "step": 5260
2319
+ },
2320
+ {
2321
+ "epoch": 8.41,
2322
+ "grad_norm": 0.15167254209518433,
2323
+ "learning_rate": 1.5987261146496817e-05,
2324
+ "loss": 0.0819,
2325
+ "step": 5280
2326
+ },
2327
+ {
2328
+ "epoch": 8.44,
2329
+ "grad_norm": 2.8723442554473877,
2330
+ "learning_rate": 1.5668789808917198e-05,
2331
+ "loss": 0.1496,
2332
+ "step": 5300
2333
+ },
2334
+ {
2335
+ "epoch": 8.44,
2336
+ "eval_accuracy": 0.7428571428571429,
2337
+ "eval_loss": 1.1775193214416504,
2338
+ "eval_runtime": 14.1642,
2339
+ "eval_samples_per_second": 74.13,
2340
+ "eval_steps_per_second": 9.319,
2341
+ "step": 5300
2342
+ },
2343
+ {
2344
+ "epoch": 8.47,
2345
+ "grad_norm": 5.163817405700684,
2346
+ "learning_rate": 1.535031847133758e-05,
2347
+ "loss": 0.126,
2348
+ "step": 5320
2349
+ },
2350
+ {
2351
+ "epoch": 8.5,
2352
+ "grad_norm": 7.263587474822998,
2353
+ "learning_rate": 1.5031847133757962e-05,
2354
+ "loss": 0.1124,
2355
+ "step": 5340
2356
+ },
2357
+ {
2358
+ "epoch": 8.54,
2359
+ "grad_norm": 0.11626210063695908,
2360
+ "learning_rate": 1.4713375796178345e-05,
2361
+ "loss": 0.1274,
2362
+ "step": 5360
2363
+ },
2364
+ {
2365
+ "epoch": 8.57,
2366
+ "grad_norm": 0.1953892856836319,
2367
+ "learning_rate": 1.4394904458598726e-05,
2368
+ "loss": 0.0721,
2369
+ "step": 5380
2370
+ },
2371
+ {
2372
+ "epoch": 8.6,
2373
+ "grad_norm": 4.252742290496826,
2374
+ "learning_rate": 1.4076433121019109e-05,
2375
+ "loss": 0.0804,
2376
+ "step": 5400
2377
+ },
2378
+ {
2379
+ "epoch": 8.6,
2380
+ "eval_accuracy": 0.7561904761904762,
2381
+ "eval_loss": 1.1278163194656372,
2382
+ "eval_runtime": 14.271,
2383
+ "eval_samples_per_second": 73.576,
2384
+ "eval_steps_per_second": 9.25,
2385
+ "step": 5400
2386
+ },
2387
+ {
2388
+ "epoch": 8.63,
2389
+ "grad_norm": 0.10082034021615982,
2390
+ "learning_rate": 1.375796178343949e-05,
2391
+ "loss": 0.1376,
2392
+ "step": 5420
2393
+ },
2394
+ {
2395
+ "epoch": 8.66,
2396
+ "grad_norm": 0.05313138663768768,
2397
+ "learning_rate": 1.3439490445859873e-05,
2398
+ "loss": 0.0992,
2399
+ "step": 5440
2400
+ },
2401
+ {
2402
+ "epoch": 8.69,
2403
+ "grad_norm": 3.8297231197357178,
2404
+ "learning_rate": 1.3121019108280256e-05,
2405
+ "loss": 0.1511,
2406
+ "step": 5460
2407
+ },
2408
+ {
2409
+ "epoch": 8.73,
2410
+ "grad_norm": 3.4488308429718018,
2411
+ "learning_rate": 1.2802547770700637e-05,
2412
+ "loss": 0.1603,
2413
+ "step": 5480
2414
+ },
2415
+ {
2416
+ "epoch": 8.76,
2417
+ "grad_norm": 1.6856378316879272,
2418
+ "learning_rate": 1.248407643312102e-05,
2419
+ "loss": 0.12,
2420
+ "step": 5500
2421
+ },
2422
+ {
2423
+ "epoch": 8.76,
2424
+ "eval_accuracy": 0.7533333333333333,
2425
+ "eval_loss": 1.1209964752197266,
2426
+ "eval_runtime": 14.1604,
2427
+ "eval_samples_per_second": 74.15,
2428
+ "eval_steps_per_second": 9.322,
2429
+ "step": 5500
2430
+ },
2431
+ {
2432
+ "epoch": 8.79,
2433
+ "grad_norm": 3.3723340034484863,
2434
+ "learning_rate": 1.2165605095541403e-05,
2435
+ "loss": 0.0811,
2436
+ "step": 5520
2437
+ },
2438
+ {
2439
+ "epoch": 8.82,
2440
+ "grad_norm": 4.700990676879883,
2441
+ "learning_rate": 1.1847133757961784e-05,
2442
+ "loss": 0.0977,
2443
+ "step": 5540
2444
+ },
2445
+ {
2446
+ "epoch": 8.85,
2447
+ "grad_norm": 2.095137596130371,
2448
+ "learning_rate": 1.1528662420382167e-05,
2449
+ "loss": 0.0514,
2450
+ "step": 5560
2451
+ },
2452
+ {
2453
+ "epoch": 8.89,
2454
+ "grad_norm": 0.014027014374732971,
2455
+ "learning_rate": 1.1210191082802548e-05,
2456
+ "loss": 0.1034,
2457
+ "step": 5580
2458
+ },
2459
+ {
2460
+ "epoch": 8.92,
2461
+ "grad_norm": 1.6214849948883057,
2462
+ "learning_rate": 1.089171974522293e-05,
2463
+ "loss": 0.099,
2464
+ "step": 5600
2465
+ },
2466
+ {
2467
+ "epoch": 8.92,
2468
+ "eval_accuracy": 0.7485714285714286,
2469
+ "eval_loss": 1.1295264959335327,
2470
+ "eval_runtime": 14.482,
2471
+ "eval_samples_per_second": 72.504,
2472
+ "eval_steps_per_second": 9.115,
2473
+ "step": 5600
2474
+ },
2475
+ {
2476
+ "epoch": 8.95,
2477
+ "grad_norm": 1.0302766561508179,
2478
+ "learning_rate": 1.0573248407643314e-05,
2479
+ "loss": 0.0755,
2480
+ "step": 5620
2481
+ },
2482
+ {
2483
+ "epoch": 8.98,
2484
+ "grad_norm": 3.192128896713257,
2485
+ "learning_rate": 1.0254777070063695e-05,
2486
+ "loss": 0.0791,
2487
+ "step": 5640
2488
+ },
2489
+ {
2490
+ "epoch": 9.01,
2491
+ "grad_norm": 4.739717960357666,
2492
+ "learning_rate": 9.936305732484078e-06,
2493
+ "loss": 0.1346,
2494
+ "step": 5660
2495
+ },
2496
+ {
2497
+ "epoch": 9.04,
2498
+ "grad_norm": 0.05003926530480385,
2499
+ "learning_rate": 9.617834394904459e-06,
2500
+ "loss": 0.0694,
2501
+ "step": 5680
2502
+ },
2503
+ {
2504
+ "epoch": 9.08,
2505
+ "grad_norm": 1.395236611366272,
2506
+ "learning_rate": 9.29936305732484e-06,
2507
+ "loss": 0.1429,
2508
+ "step": 5700
2509
+ },
2510
+ {
2511
+ "epoch": 9.08,
2512
+ "eval_accuracy": 0.7390476190476191,
2513
+ "eval_loss": 1.207868218421936,
2514
+ "eval_runtime": 14.2527,
2515
+ "eval_samples_per_second": 73.67,
2516
+ "eval_steps_per_second": 9.261,
2517
+ "step": 5700
2518
+ },
2519
+ {
2520
+ "epoch": 9.11,
2521
+ "grad_norm": 4.653144836425781,
2522
+ "learning_rate": 8.980891719745223e-06,
2523
+ "loss": 0.1111,
2524
+ "step": 5720
2525
+ },
2526
+ {
2527
+ "epoch": 9.14,
2528
+ "grad_norm": 12.179301261901855,
2529
+ "learning_rate": 8.662420382165604e-06,
2530
+ "loss": 0.0931,
2531
+ "step": 5740
2532
+ },
2533
+ {
2534
+ "epoch": 9.17,
2535
+ "grad_norm": 5.497566223144531,
2536
+ "learning_rate": 8.343949044585987e-06,
2537
+ "loss": 0.1509,
2538
+ "step": 5760
2539
+ },
2540
+ {
2541
+ "epoch": 9.2,
2542
+ "grad_norm": 4.653054237365723,
2543
+ "learning_rate": 8.02547770700637e-06,
2544
+ "loss": 0.1349,
2545
+ "step": 5780
2546
+ },
2547
+ {
2548
+ "epoch": 9.24,
2549
+ "grad_norm": 4.78197717666626,
2550
+ "learning_rate": 7.707006369426751e-06,
2551
+ "loss": 0.0959,
2552
+ "step": 5800
2553
+ },
2554
+ {
2555
+ "epoch": 9.24,
2556
+ "eval_accuracy": 0.7476190476190476,
2557
+ "eval_loss": 1.1450541019439697,
2558
+ "eval_runtime": 14.2174,
2559
+ "eval_samples_per_second": 73.853,
2560
+ "eval_steps_per_second": 9.284,
2561
+ "step": 5800
2562
+ },
2563
+ {
2564
+ "epoch": 9.27,
2565
+ "grad_norm": 0.03227326646447182,
2566
+ "learning_rate": 7.388535031847134e-06,
2567
+ "loss": 0.135,
2568
+ "step": 5820
2569
+ },
2570
+ {
2571
+ "epoch": 9.3,
2572
+ "grad_norm": 0.03949011489748955,
2573
+ "learning_rate": 7.070063694267516e-06,
2574
+ "loss": 0.1352,
2575
+ "step": 5840
2576
+ },
2577
+ {
2578
+ "epoch": 9.33,
2579
+ "grad_norm": 0.12604393064975739,
2580
+ "learning_rate": 6.751592356687898e-06,
2581
+ "loss": 0.0613,
2582
+ "step": 5860
2583
+ },
2584
+ {
2585
+ "epoch": 9.36,
2586
+ "grad_norm": 3.415341377258301,
2587
+ "learning_rate": 6.433121019108281e-06,
2588
+ "loss": 0.1164,
2589
+ "step": 5880
2590
+ },
2591
+ {
2592
+ "epoch": 9.39,
2593
+ "grad_norm": 0.011540273204445839,
2594
+ "learning_rate": 6.114649681528663e-06,
2595
+ "loss": 0.0263,
2596
+ "step": 5900
2597
+ },
2598
+ {
2599
+ "epoch": 9.39,
2600
+ "eval_accuracy": 0.7514285714285714,
2601
+ "eval_loss": 1.117615818977356,
2602
+ "eval_runtime": 14.3543,
2603
+ "eval_samples_per_second": 73.149,
2604
+ "eval_steps_per_second": 9.196,
2605
+ "step": 5900
2606
+ },
2607
+ {
2608
+ "epoch": 9.43,
2609
+ "grad_norm": 1.7297450304031372,
2610
+ "learning_rate": 5.796178343949045e-06,
2611
+ "loss": 0.1058,
2612
+ "step": 5920
2613
+ },
2614
+ {
2615
+ "epoch": 9.46,
2616
+ "grad_norm": 0.2807926833629608,
2617
+ "learning_rate": 5.477707006369427e-06,
2618
+ "loss": 0.1072,
2619
+ "step": 5940
2620
+ },
2621
+ {
2622
+ "epoch": 9.49,
2623
+ "grad_norm": 4.0545244216918945,
2624
+ "learning_rate": 5.15923566878981e-06,
2625
+ "loss": 0.1031,
2626
+ "step": 5960
2627
+ },
2628
+ {
2629
+ "epoch": 9.52,
2630
+ "grad_norm": 7.815937042236328,
2631
+ "learning_rate": 4.840764331210192e-06,
2632
+ "loss": 0.0962,
2633
+ "step": 5980
2634
+ },
2635
+ {
2636
+ "epoch": 9.55,
2637
+ "grad_norm": 12.278979301452637,
2638
+ "learning_rate": 4.522292993630573e-06,
2639
+ "loss": 0.0936,
2640
+ "step": 6000
2641
+ },
2642
+ {
2643
+ "epoch": 9.55,
2644
+ "eval_accuracy": 0.7533333333333333,
2645
+ "eval_loss": 1.1178562641143799,
2646
+ "eval_runtime": 14.5366,
2647
+ "eval_samples_per_second": 72.231,
2648
+ "eval_steps_per_second": 9.081,
2649
+ "step": 6000
2650
+ },
2651
+ {
2652
+ "epoch": 9.59,
2653
+ "grad_norm": 0.06204557418823242,
2654
+ "learning_rate": 4.203821656050955e-06,
2655
+ "loss": 0.1101,
2656
+ "step": 6020
2657
+ },
2658
+ {
2659
+ "epoch": 9.62,
2660
+ "grad_norm": 0.023558788001537323,
2661
+ "learning_rate": 3.885350318471338e-06,
2662
+ "loss": 0.0556,
2663
+ "step": 6040
2664
+ },
2665
+ {
2666
+ "epoch": 9.65,
2667
+ "grad_norm": 0.07364491373300552,
2668
+ "learning_rate": 3.56687898089172e-06,
2669
+ "loss": 0.1003,
2670
+ "step": 6060
2671
+ },
2672
+ {
2673
+ "epoch": 9.68,
2674
+ "grad_norm": 2.476426839828491,
2675
+ "learning_rate": 3.248407643312102e-06,
2676
+ "loss": 0.1278,
2677
+ "step": 6080
2678
+ },
2679
+ {
2680
+ "epoch": 9.71,
2681
+ "grad_norm": 0.04320213571190834,
2682
+ "learning_rate": 2.9299363057324844e-06,
2683
+ "loss": 0.1332,
2684
+ "step": 6100
2685
+ },
2686
+ {
2687
+ "epoch": 9.71,
2688
+ "eval_accuracy": 0.7485714285714286,
2689
+ "eval_loss": 1.1386895179748535,
2690
+ "eval_runtime": 14.088,
2691
+ "eval_samples_per_second": 74.532,
2692
+ "eval_steps_per_second": 9.37,
2693
+ "step": 6100
2694
+ },
2695
+ {
2696
+ "epoch": 9.75,
2697
+ "grad_norm": 3.164142608642578,
2698
+ "learning_rate": 2.6114649681528665e-06,
2699
+ "loss": 0.1016,
2700
+ "step": 6120
2701
+ },
2702
+ {
2703
+ "epoch": 9.78,
2704
+ "grad_norm": 1.9194905757904053,
2705
+ "learning_rate": 2.2929936305732485e-06,
2706
+ "loss": 0.0907,
2707
+ "step": 6140
2708
+ },
2709
+ {
2710
+ "epoch": 9.81,
2711
+ "grad_norm": 6.227757930755615,
2712
+ "learning_rate": 1.9745222929936305e-06,
2713
+ "loss": 0.1274,
2714
+ "step": 6160
2715
+ },
2716
+ {
2717
+ "epoch": 9.84,
2718
+ "grad_norm": 0.11668018996715546,
2719
+ "learning_rate": 1.6560509554140127e-06,
2720
+ "loss": 0.0701,
2721
+ "step": 6180
2722
+ },
2723
+ {
2724
+ "epoch": 9.87,
2725
+ "grad_norm": 1.6369106769561768,
2726
+ "learning_rate": 1.337579617834395e-06,
2727
+ "loss": 0.071,
2728
+ "step": 6200
2729
+ },
2730
+ {
2731
+ "epoch": 9.87,
2732
+ "eval_accuracy": 0.7466666666666667,
2733
+ "eval_loss": 1.1522855758666992,
2734
+ "eval_runtime": 14.2788,
2735
+ "eval_samples_per_second": 73.536,
2736
+ "eval_steps_per_second": 9.244,
2737
+ "step": 6200
2738
+ },
2739
+ {
2740
+ "epoch": 9.9,
2741
+ "grad_norm": 0.5098825097084045,
2742
+ "learning_rate": 1.019108280254777e-06,
2743
+ "loss": 0.0679,
2744
+ "step": 6220
2745
+ },
2746
+ {
2747
+ "epoch": 9.94,
2748
+ "grad_norm": 0.0343327559530735,
2749
+ "learning_rate": 7.006369426751592e-07,
2750
+ "loss": 0.1315,
2751
+ "step": 6240
2752
+ },
2753
+ {
2754
+ "epoch": 9.97,
2755
+ "grad_norm": 3.2321577072143555,
2756
+ "learning_rate": 3.8216560509554143e-07,
2757
+ "loss": 0.0521,
2758
+ "step": 6260
2759
+ },
2760
+ {
2761
+ "epoch": 10.0,
2762
+ "grad_norm": 0.04384122043848038,
2763
+ "learning_rate": 6.369426751592356e-08,
2764
+ "loss": 0.0769,
2765
+ "step": 6280
2766
+ },
2767
+ {
2768
+ "epoch": 10.0,
2769
+ "step": 6280,
2770
+ "total_flos": 7.783078535151575e+18,
2771
+ "train_loss": 0.28272074579623097,
2772
+ "train_runtime": 3631.8012,
2773
+ "train_samples_per_second": 27.631,
2774
+ "train_steps_per_second": 1.729
2775
  }
2776
  ],
2777
  "logging_steps": 20,
2778
+ "max_steps": 6280,
2779
  "num_input_tokens_seen": 0,
2780
+ "num_train_epochs": 10,
2781
  "save_steps": 100,
2782
+ "total_flos": 7.783078535151575e+18,
2783
  "train_batch_size": 16,
2784
  "trial_name": null,
2785
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d34fdbe88bbadf111a47562e45e3b8a5dc649c10b26b418f408b2be7a922460c
3
  size 4920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ec81ac56c07abf2a8d2de12bf4cde576c49fbc4cc06fb62394226e481178d423
3
  size 4920