tanliboy commited on
Commit
ba96ecf
1 Parent(s): 09ea778

Model save

Browse files
README.md CHANGED
@@ -3,15 +3,10 @@ library_name: transformers
3
  license: llama3.2
4
  base_model: tanliboy/llama-3.2-3b
5
  tags:
6
- - alignment-handbook
7
- - trl
8
- - sft
9
- - generated_from_trainer
10
  - trl
11
  - sft
 
12
  - generated_from_trainer
13
- datasets:
14
- - tanliboy/OpenHermes-2.5-reformat
15
  model-index:
16
  - name: llama-3.2-3b-sft
17
  results: []
@@ -22,9 +17,9 @@ should probably proofread and complete it, then remove this comment. -->
22
 
23
  # llama-3.2-3b-sft
24
 
25
- This model is a fine-tuned version of [tanliboy/llama-3.2-3b](https://huggingface.co/tanliboy/llama-3.2-3b) on the tanliboy/OpenHermes-2.5-reformat dataset.
26
  It achieves the following results on the evaluation set:
27
- - Loss: 0.7042
28
 
29
  ## Model description
30
 
@@ -43,7 +38,7 @@ More information needed
43
  ### Training hyperparameters
44
 
45
  The following hyperparameters were used during training:
46
- - learning_rate: 1e-05
47
  - train_batch_size: 8
48
  - eval_batch_size: 8
49
  - seed: 42
@@ -59,9 +54,30 @@ The following hyperparameters were used during training:
59
 
60
  ### Training results
61
 
62
- | Training Loss | Epoch | Step | Validation Loss |
63
- |:-------------:|:-----:|:----:|:---------------:|
64
- | 0.6946 | 1.0 | 2230 | 0.7042 |
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
 
66
 
67
  ### Framework versions
 
3
  license: llama3.2
4
  base_model: tanliboy/llama-3.2-3b
5
  tags:
 
 
 
 
6
  - trl
7
  - sft
8
+ - alignment-handbook
9
  - generated_from_trainer
 
 
10
  model-index:
11
  - name: llama-3.2-3b-sft
12
  results: []
 
17
 
18
  # llama-3.2-3b-sft
19
 
20
+ This model is a fine-tuned version of [tanliboy/llama-3.2-3b](https://huggingface.co/tanliboy/llama-3.2-3b) on an unknown dataset.
21
  It achieves the following results on the evaluation set:
22
+ - Loss: 0.7216
23
 
24
  ## Model description
25
 
 
38
  ### Training hyperparameters
39
 
40
  The following hyperparameters were used during training:
41
+ - learning_rate: 3e-06
42
  - train_batch_size: 8
43
  - eval_batch_size: 8
44
  - seed: 42
 
54
 
55
  ### Training results
56
 
57
+ | Training Loss | Epoch | Step | Validation Loss |
58
+ |:-------------:|:------:|:----:|:---------------:|
59
+ | 0.8741 | 0.0448 | 100 | 0.8600 |
60
+ | 0.8038 | 0.0897 | 200 | 0.8095 |
61
+ | 0.7937 | 0.1345 | 300 | 0.7789 |
62
+ | 0.7712 | 0.1794 | 400 | 0.7644 |
63
+ | 0.7393 | 0.2242 | 500 | 0.7565 |
64
+ | 0.7458 | 0.2691 | 600 | 0.7506 |
65
+ | 0.7694 | 0.3139 | 700 | 0.7458 |
66
+ | 0.713 | 0.3587 | 800 | 0.7422 |
67
+ | 0.7347 | 0.4036 | 900 | 0.7387 |
68
+ | 0.7243 | 0.4484 | 1000 | 0.7356 |
69
+ | 0.7161 | 0.4933 | 1100 | 0.7331 |
70
+ | 0.7247 | 0.5381 | 1200 | 0.7308 |
71
+ | 0.7477 | 0.5830 | 1300 | 0.7288 |
72
+ | 0.7429 | 0.6278 | 1400 | 0.7273 |
73
+ | 0.7317 | 0.6726 | 1500 | 0.7256 |
74
+ | 0.7226 | 0.7175 | 1600 | 0.7243 |
75
+ | 0.695 | 0.7623 | 1700 | 0.7234 |
76
+ | 0.7167 | 0.8072 | 1800 | 0.7226 |
77
+ | 0.686 | 0.8520 | 1900 | 0.7221 |
78
+ | 0.7214 | 0.8969 | 2000 | 0.7218 |
79
+ | 0.7358 | 0.9417 | 2100 | 0.7216 |
80
+ | 0.7259 | 0.9865 | 2200 | 0.7216 |
81
 
82
 
83
  ### Framework versions
all_results.json CHANGED
@@ -5,10 +5,10 @@
5
  "eval_samples": 50077,
6
  "eval_samples_per_second": 132.107,
7
  "eval_steps_per_second": 2.066,
8
- "total_flos": 244955314192384.0,
9
- "train_loss": 0.7376568270371099,
10
- "train_runtime": 8720.6202,
11
  "train_samples": 285435,
12
- "train_samples_per_second": 32.731,
13
- "train_steps_per_second": 0.256
14
  }
 
5
  "eval_samples": 50077,
6
  "eval_samples_per_second": 132.107,
7
  "eval_steps_per_second": 2.066,
8
+ "total_flos": 250303561007104.0,
9
+ "train_loss": 0.7492096503219262,
10
+ "train_runtime": 18007.2993,
11
  "train_samples": 285435,
12
+ "train_samples_per_second": 15.851,
13
+ "train_steps_per_second": 0.124
14
  }
config.json CHANGED
@@ -35,6 +35,6 @@
35
  "tie_word_embeddings": true,
36
  "torch_dtype": "bfloat16",
37
  "transformers_version": "4.44.2",
38
- "use_cache": true,
39
  "vocab_size": 128256
40
  }
 
35
  "tie_word_embeddings": true,
36
  "torch_dtype": "bfloat16",
37
  "transformers_version": "4.44.2",
38
+ "use_cache": false,
39
  "vocab_size": 128256
40
  }
model-00001-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5bafbda1cf46dcdfa68fddb235d102da9d1885ae56e80b4e41840a35aba9df3e
3
  size 4965799096
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09efa89e7b43c24e8c1ab73f7e09196edd014cf14482fe5101aaf902a586d7f2
3
  size 4965799096
model-00002-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:674efdeb16be82251aae01846f486cb86b0be050c9eba1232d0602a00bd5679b
3
  size 1459729952
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c08bcfff732c5238b56bd7df4c0e2c63a0a02d0592c022ee16a201dab2ea9820
3
  size 1459729952
runs/Sep27_17-24-19_action-graph-trainer/events.out.tfevents.1727460461.action-graph-trainer.379760.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d8664a4d576b88a2be33c4a2390982c36adcdf3a6711e183eecfb35752a6285e
3
+ size 29491
runs/Sep27_20-11-03_action-graph-trainer/events.out.tfevents.1727468262.action-graph-trainer.430650.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f6c18947c9ba2697ba326b828d36458b2b45f1b8572f03289d81b56f7df7627a
3
+ size 59321
tokenizer.json CHANGED
@@ -2,7 +2,7 @@
2
  "version": "1.0",
3
  "truncation": {
4
  "direction": "Right",
5
- "max_length": 2048,
6
  "strategy": "LongestFirst",
7
  "stride": 0
8
  },
 
2
  "version": "1.0",
3
  "truncation": {
4
  "direction": "Right",
5
+ "max_length": 4096,
6
  "strategy": "LongestFirst",
7
  "stride": 0
8
  },
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 1.0,
3
- "total_flos": 244955314192384.0,
4
- "train_loss": 0.7376568270371099,
5
- "train_runtime": 8720.6202,
6
  "train_samples": 285435,
7
- "train_samples_per_second": 32.731,
8
- "train_steps_per_second": 0.256
9
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "total_flos": 250303561007104.0,
4
+ "train_loss": 0.7492096503219262,
5
+ "train_runtime": 18007.2993,
6
  "train_samples": 285435,
7
+ "train_samples_per_second": 15.851,
8
+ "train_steps_per_second": 0.124
9
  }
trainer_state.json CHANGED
@@ -2,7 +2,7 @@
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
  "epoch": 1.0,
5
- "eval_steps": 500,
6
  "global_step": 2230,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
@@ -10,1588 +10,1756 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.0004484304932735426,
13
- "grad_norm": 4.697020980518476,
14
- "learning_rate": 4.4843049327354265e-08,
15
  "loss": 0.9912,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.004484304932735426,
20
- "grad_norm": 5.023039914107906,
21
- "learning_rate": 4.484304932735426e-07,
22
- "loss": 1.0335,
23
  "step": 10
24
  },
25
  {
26
  "epoch": 0.008968609865470852,
27
- "grad_norm": 4.618075407201601,
28
- "learning_rate": 8.968609865470852e-07,
29
- "loss": 1.0355,
30
  "step": 20
31
  },
32
  {
33
  "epoch": 0.013452914798206279,
34
- "grad_norm": 2.181160774619855,
35
- "learning_rate": 1.345291479820628e-06,
36
- "loss": 0.9824,
37
  "step": 30
38
  },
39
  {
40
  "epoch": 0.017937219730941704,
41
- "grad_norm": 2.008569343268214,
42
- "learning_rate": 1.7937219730941704e-06,
43
- "loss": 0.9383,
44
  "step": 40
45
  },
46
  {
47
  "epoch": 0.02242152466367713,
48
- "grad_norm": 1.7133805307500916,
49
- "learning_rate": 2.242152466367713e-06,
50
- "loss": 0.87,
51
  "step": 50
52
  },
53
  {
54
  "epoch": 0.026905829596412557,
55
- "grad_norm": 1.6402285214131669,
56
- "learning_rate": 2.690582959641256e-06,
57
- "loss": 0.8608,
58
  "step": 60
59
  },
60
  {
61
  "epoch": 0.03139013452914798,
62
- "grad_norm": 1.548076900788033,
63
- "learning_rate": 3.1390134529147986e-06,
64
- "loss": 0.8675,
65
  "step": 70
66
  },
67
  {
68
  "epoch": 0.03587443946188341,
69
- "grad_norm": 1.5686573872779637,
70
- "learning_rate": 3.587443946188341e-06,
71
- "loss": 0.8581,
72
  "step": 80
73
  },
74
  {
75
  "epoch": 0.04035874439461883,
76
- "grad_norm": 1.8398749741247653,
77
- "learning_rate": 4.0358744394618836e-06,
78
- "loss": 0.8272,
79
  "step": 90
80
  },
81
  {
82
  "epoch": 0.04484304932735426,
83
- "grad_norm": 1.431522929426351,
84
- "learning_rate": 4.484304932735426e-06,
85
- "loss": 0.837,
 
 
 
 
 
 
 
 
86
  "step": 100
87
  },
88
  {
89
  "epoch": 0.04932735426008968,
90
- "grad_norm": 1.5183209831163855,
91
- "learning_rate": 4.932735426008969e-06,
92
- "loss": 0.7993,
93
  "step": 110
94
  },
95
  {
96
  "epoch": 0.053811659192825115,
97
- "grad_norm": 1.440741479973038,
98
- "learning_rate": 5.381165919282512e-06,
99
- "loss": 0.7768,
100
  "step": 120
101
  },
102
  {
103
  "epoch": 0.05829596412556054,
104
- "grad_norm": 1.6957957755628803,
105
- "learning_rate": 5.8295964125560544e-06,
106
- "loss": 0.793,
107
  "step": 130
108
  },
109
  {
110
  "epoch": 0.06278026905829596,
111
- "grad_norm": 1.5847929433417824,
112
- "learning_rate": 6.278026905829597e-06,
113
- "loss": 0.8145,
114
  "step": 140
115
  },
116
  {
117
  "epoch": 0.06726457399103139,
118
- "grad_norm": 1.721638040499256,
119
- "learning_rate": 6.72645739910314e-06,
120
- "loss": 0.7677,
121
  "step": 150
122
  },
123
  {
124
  "epoch": 0.07174887892376682,
125
- "grad_norm": 1.8900581111883776,
126
- "learning_rate": 7.174887892376682e-06,
127
- "loss": 0.784,
128
  "step": 160
129
  },
130
  {
131
  "epoch": 0.07623318385650224,
132
- "grad_norm": 1.554784427401762,
133
- "learning_rate": 7.6233183856502244e-06,
134
- "loss": 0.7595,
135
  "step": 170
136
  },
137
  {
138
  "epoch": 0.08071748878923767,
139
- "grad_norm": 1.8326262271816938,
140
- "learning_rate": 8.071748878923767e-06,
141
- "loss": 0.7722,
142
  "step": 180
143
  },
144
  {
145
  "epoch": 0.08520179372197309,
146
- "grad_norm": 1.4707798772479566,
147
- "learning_rate": 8.52017937219731e-06,
148
- "loss": 0.764,
149
  "step": 190
150
  },
151
  {
152
  "epoch": 0.08968609865470852,
153
- "grad_norm": 1.449476026227771,
154
- "learning_rate": 8.968609865470853e-06,
155
- "loss": 0.773,
 
 
 
 
 
 
 
 
156
  "step": 200
157
  },
158
  {
159
  "epoch": 0.09417040358744394,
160
- "grad_norm": 1.5945882538155545,
161
- "learning_rate": 9.417040358744395e-06,
162
- "loss": 0.7549,
163
  "step": 210
164
  },
165
  {
166
  "epoch": 0.09865470852017937,
167
- "grad_norm": 1.4765341188830319,
168
- "learning_rate": 9.865470852017938e-06,
169
- "loss": 0.784,
170
  "step": 220
171
  },
172
  {
173
  "epoch": 0.1031390134529148,
174
- "grad_norm": 1.452955640004767,
175
- "learning_rate": 9.999699851108367e-06,
176
- "loss": 0.7909,
177
  "step": 230
178
  },
179
  {
180
  "epoch": 0.10762331838565023,
181
- "grad_norm": 1.5997501860541272,
182
- "learning_rate": 9.9982298208374e-06,
183
- "loss": 0.7651,
184
  "step": 240
185
  },
186
  {
187
  "epoch": 0.11210762331838565,
188
- "grad_norm": 1.554511883278329,
189
- "learning_rate": 9.995535139530904e-06,
190
- "loss": 0.7621,
191
  "step": 250
192
  },
193
  {
194
  "epoch": 0.11659192825112108,
195
- "grad_norm": 1.6319742757477633,
196
- "learning_rate": 9.991616467431486e-06,
197
- "loss": 0.7906,
198
  "step": 260
199
  },
200
  {
201
  "epoch": 0.1210762331838565,
202
- "grad_norm": 1.6863475759473823,
203
- "learning_rate": 9.986474764680236e-06,
204
- "loss": 0.7684,
205
  "step": 270
206
  },
207
  {
208
  "epoch": 0.12556053811659193,
209
- "grad_norm": 1.5563949560805244,
210
- "learning_rate": 9.98011129108149e-06,
211
- "loss": 0.793,
212
  "step": 280
213
  },
214
  {
215
  "epoch": 0.13004484304932734,
216
- "grad_norm": 1.6203440473254576,
217
- "learning_rate": 9.972527605794151e-06,
218
- "loss": 0.771,
219
  "step": 290
220
  },
221
  {
222
  "epoch": 0.13452914798206278,
223
- "grad_norm": 1.5211967329248808,
224
- "learning_rate": 9.963725566949674e-06,
225
- "loss": 0.7865,
 
 
 
 
 
 
 
 
226
  "step": 300
227
  },
228
  {
229
  "epoch": 0.13901345291479822,
230
- "grad_norm": 1.3996609263865165,
231
- "learning_rate": 9.953707331196787e-06,
232
- "loss": 0.7397,
233
  "step": 310
234
  },
235
  {
236
  "epoch": 0.14349775784753363,
237
- "grad_norm": 1.7467903395183983,
238
- "learning_rate": 9.94247535317308e-06,
239
- "loss": 0.802,
240
  "step": 320
241
  },
242
  {
243
  "epoch": 0.14798206278026907,
244
- "grad_norm": 1.4561862746225176,
245
- "learning_rate": 9.930032384903566e-06,
246
- "loss": 0.78,
247
  "step": 330
248
  },
249
  {
250
  "epoch": 0.15246636771300448,
251
- "grad_norm": 1.261592243705757,
252
- "learning_rate": 9.916381475126406e-06,
253
- "loss": 0.7678,
254
  "step": 340
255
  },
256
  {
257
  "epoch": 0.15695067264573992,
258
- "grad_norm": 1.3845441861346746,
259
- "learning_rate": 9.901525968545907e-06,
260
- "loss": 0.7462,
261
  "step": 350
262
  },
263
  {
264
  "epoch": 0.16143497757847533,
265
- "grad_norm": 1.330783975393604,
266
- "learning_rate": 9.885469505013006e-06,
267
- "loss": 0.7516,
268
  "step": 360
269
  },
270
  {
271
  "epoch": 0.16591928251121077,
272
- "grad_norm": 1.5411411223309597,
273
- "learning_rate": 9.868216018633456e-06,
274
- "loss": 0.7617,
275
  "step": 370
276
  },
277
  {
278
  "epoch": 0.17040358744394618,
279
- "grad_norm": 1.4997661320107978,
280
- "learning_rate": 9.8497697368039e-06,
281
- "loss": 0.7433,
282
  "step": 380
283
  },
284
  {
285
  "epoch": 0.17488789237668162,
286
- "grad_norm": 1.4613487182945122,
287
- "learning_rate": 9.830135179176086e-06,
288
- "loss": 0.7977,
289
  "step": 390
290
  },
291
  {
292
  "epoch": 0.17937219730941703,
293
- "grad_norm": 1.3823002584421413,
294
- "learning_rate": 9.809317156549476e-06,
295
- "loss": 0.7668,
 
 
 
 
 
 
 
 
296
  "step": 400
297
  },
298
  {
299
  "epoch": 0.18385650224215247,
300
- "grad_norm": 1.3405610913825137,
301
- "learning_rate": 9.787320769692517e-06,
302
- "loss": 0.755,
303
  "step": 410
304
  },
305
  {
306
  "epoch": 0.18834080717488788,
307
- "grad_norm": 1.321174566440371,
308
- "learning_rate": 9.76415140809287e-06,
309
- "loss": 0.7712,
310
  "step": 420
311
  },
312
  {
313
  "epoch": 0.19282511210762332,
314
- "grad_norm": 1.5855307049280556,
315
- "learning_rate": 9.739814748636892e-06,
316
- "loss": 0.7876,
317
  "step": 430
318
  },
319
  {
320
  "epoch": 0.19730941704035873,
321
- "grad_norm": 1.3277684374580685,
322
- "learning_rate": 9.7143167542187e-06,
323
- "loss": 0.7497,
324
  "step": 440
325
  },
326
  {
327
  "epoch": 0.20179372197309417,
328
- "grad_norm": 1.3316052843966186,
329
- "learning_rate": 9.687663672279167e-06,
330
- "loss": 0.7742,
331
  "step": 450
332
  },
333
  {
334
  "epoch": 0.2062780269058296,
335
- "grad_norm": 1.3422686862026139,
336
- "learning_rate": 9.659862033275188e-06,
337
- "loss": 0.7443,
338
  "step": 460
339
  },
340
  {
341
  "epoch": 0.21076233183856502,
342
- "grad_norm": 1.3794382112115433,
343
- "learning_rate": 9.630918649079606e-06,
344
- "loss": 0.7423,
345
  "step": 470
346
  },
347
  {
348
  "epoch": 0.21524663677130046,
349
- "grad_norm": 1.3473781761704757,
350
- "learning_rate": 9.600840611312198e-06,
351
- "loss": 0.756,
352
  "step": 480
353
  },
354
  {
355
  "epoch": 0.21973094170403587,
356
- "grad_norm": 1.477159074283593,
357
- "learning_rate": 9.569635289602098e-06,
358
- "loss": 0.758,
359
  "step": 490
360
  },
361
  {
362
  "epoch": 0.2242152466367713,
363
- "grad_norm": 1.4406496372056654,
364
- "learning_rate": 9.537310329782109e-06,
365
- "loss": 0.7373,
 
 
 
 
 
 
 
 
366
  "step": 500
367
  },
368
  {
369
  "epoch": 0.22869955156950672,
370
- "grad_norm": 1.3236307975388621,
371
- "learning_rate": 9.503873652015358e-06,
372
- "loss": 0.7485,
373
  "step": 510
374
  },
375
  {
376
  "epoch": 0.23318385650224216,
377
- "grad_norm": 1.2168213830447414,
378
- "learning_rate": 9.469333448854713e-06,
379
- "loss": 0.7518,
380
  "step": 520
381
  },
382
  {
383
  "epoch": 0.23766816143497757,
384
- "grad_norm": 1.4695219464522695,
385
- "learning_rate": 9.433698183235468e-06,
386
- "loss": 0.7389,
387
  "step": 530
388
  },
389
  {
390
  "epoch": 0.242152466367713,
391
- "grad_norm": 1.5460371795366352,
392
- "learning_rate": 9.39697658640179e-06,
393
- "loss": 0.7606,
394
  "step": 540
395
  },
396
  {
397
  "epoch": 0.24663677130044842,
398
- "grad_norm": 1.36384817307445,
399
- "learning_rate": 9.359177655767398e-06,
400
- "loss": 0.7573,
401
  "step": 550
402
  },
403
  {
404
  "epoch": 0.25112107623318386,
405
- "grad_norm": 1.2974873879986306,
406
- "learning_rate": 9.320310652711062e-06,
407
- "loss": 0.7447,
408
  "step": 560
409
  },
410
  {
411
  "epoch": 0.2556053811659193,
412
- "grad_norm": 1.3924395179060305,
413
- "learning_rate": 9.2803851003074e-06,
414
- "loss": 0.7346,
415
  "step": 570
416
  },
417
  {
418
  "epoch": 0.2600896860986547,
419
- "grad_norm": 1.3646463661968233,
420
- "learning_rate": 9.239410780993565e-06,
421
- "loss": 0.7637,
422
  "step": 580
423
  },
424
  {
425
  "epoch": 0.2645739910313901,
426
- "grad_norm": 1.5500883315093192,
427
- "learning_rate": 9.197397734172381e-06,
428
- "loss": 0.7352,
429
  "step": 590
430
  },
431
  {
432
  "epoch": 0.26905829596412556,
433
- "grad_norm": 1.2384578247121611,
434
- "learning_rate": 9.154356253752519e-06,
435
- "loss": 0.7467,
 
 
 
 
 
 
 
 
436
  "step": 600
437
  },
438
  {
439
  "epoch": 0.273542600896861,
440
- "grad_norm": 1.4632407956897133,
441
- "learning_rate": 9.110296885626315e-06,
442
- "loss": 0.7592,
443
  "step": 610
444
  },
445
  {
446
  "epoch": 0.27802690582959644,
447
- "grad_norm": 1.4743516068749583,
448
- "learning_rate": 9.065230425085849e-06,
449
- "loss": 0.7471,
450
  "step": 620
451
  },
452
  {
453
  "epoch": 0.2825112107623318,
454
- "grad_norm": 1.609459180353317,
455
- "learning_rate": 9.01916791417792e-06,
456
- "loss": 0.7411,
457
  "step": 630
458
  },
459
  {
460
  "epoch": 0.28699551569506726,
461
- "grad_norm": 1.4163468213726333,
462
- "learning_rate": 8.97212063899854e-06,
463
- "loss": 0.7583,
464
  "step": 640
465
  },
466
  {
467
  "epoch": 0.2914798206278027,
468
- "grad_norm": 1.2789206960042645,
469
- "learning_rate": 8.924100126927672e-06,
470
- "loss": 0.7637,
471
  "step": 650
472
  },
473
  {
474
  "epoch": 0.29596412556053814,
475
- "grad_norm": 1.4348891847742615,
476
- "learning_rate": 8.87511814380481e-06,
477
- "loss": 0.7376,
478
  "step": 660
479
  },
480
  {
481
  "epoch": 0.3004484304932735,
482
- "grad_norm": 1.4055551714674843,
483
- "learning_rate": 8.825186691046156e-06,
484
- "loss": 0.7544,
485
  "step": 670
486
  },
487
  {
488
  "epoch": 0.30493273542600896,
489
- "grad_norm": 1.3076750983715024,
490
- "learning_rate": 8.774318002704072e-06,
491
- "loss": 0.7388,
492
  "step": 680
493
  },
494
  {
495
  "epoch": 0.3094170403587444,
496
- "grad_norm": 1.2832634236951583,
497
- "learning_rate": 8.722524542469517e-06,
498
- "loss": 0.7386,
499
  "step": 690
500
  },
501
  {
502
  "epoch": 0.31390134529147984,
503
- "grad_norm": 1.3612825976793976,
504
- "learning_rate": 8.669819000618248e-06,
505
- "loss": 0.768,
 
 
 
 
 
 
 
 
506
  "step": 700
507
  },
508
  {
509
  "epoch": 0.3183856502242152,
510
- "grad_norm": 1.3343477240590562,
511
- "learning_rate": 8.616214290901474e-06,
512
- "loss": 0.7244,
513
  "step": 710
514
  },
515
  {
516
  "epoch": 0.32286995515695066,
517
- "grad_norm": 1.3061898085537815,
518
- "learning_rate": 8.56172354738178e-06,
519
- "loss": 0.7368,
520
  "step": 720
521
  },
522
  {
523
  "epoch": 0.3273542600896861,
524
- "grad_norm": 1.423008056010291,
525
- "learning_rate": 8.506360121215046e-06,
526
- "loss": 0.7297,
527
  "step": 730
528
  },
529
  {
530
  "epoch": 0.33183856502242154,
531
- "grad_norm": 1.258418518673196,
532
- "learning_rate": 8.4501375773792e-06,
533
- "loss": 0.7322,
534
  "step": 740
535
  },
536
  {
537
  "epoch": 0.336322869955157,
538
- "grad_norm": 1.4805131801943718,
539
- "learning_rate": 8.39306969135056e-06,
540
- "loss": 0.7284,
541
  "step": 750
542
  },
543
  {
544
  "epoch": 0.34080717488789236,
545
- "grad_norm": 1.3260411374117855,
546
- "learning_rate": 8.335170445728609e-06,
547
- "loss": 0.7618,
548
  "step": 760
549
  },
550
  {
551
  "epoch": 0.3452914798206278,
552
- "grad_norm": 1.3351013471897553,
553
- "learning_rate": 8.276454026810026e-06,
554
- "loss": 0.7454,
555
  "step": 770
556
  },
557
  {
558
  "epoch": 0.34977578475336324,
559
- "grad_norm": 1.267536863507402,
560
- "learning_rate": 8.216934821112803e-06,
561
- "loss": 0.742,
562
  "step": 780
563
  },
564
  {
565
  "epoch": 0.3542600896860987,
566
- "grad_norm": 1.4791659570865663,
567
- "learning_rate": 8.156627411851295e-06,
568
- "loss": 0.7483,
569
  "step": 790
570
  },
571
  {
572
  "epoch": 0.35874439461883406,
573
- "grad_norm": 1.4042375363677306,
574
- "learning_rate": 8.095546575363098e-06,
575
- "loss": 0.7134,
 
 
 
 
 
 
 
 
576
  "step": 800
577
  },
578
  {
579
  "epoch": 0.3632286995515695,
580
- "grad_norm": 1.2552675050466975,
581
- "learning_rate": 8.033707277488585e-06,
582
- "loss": 0.7186,
583
  "step": 810
584
  },
585
  {
586
  "epoch": 0.36771300448430494,
587
- "grad_norm": 1.281776208560821,
588
- "learning_rate": 7.97112466990403e-06,
589
- "loss": 0.7367,
590
  "step": 820
591
  },
592
  {
593
  "epoch": 0.3721973094170404,
594
- "grad_norm": 1.417675486477273,
595
- "learning_rate": 7.907814086409183e-06,
596
- "loss": 0.7399,
597
  "step": 830
598
  },
599
  {
600
  "epoch": 0.37668161434977576,
601
- "grad_norm": 1.4911782746859528,
602
- "learning_rate": 7.843791039170232e-06,
603
- "loss": 0.738,
604
  "step": 840
605
  },
606
  {
607
  "epoch": 0.3811659192825112,
608
- "grad_norm": 1.3628167592658191,
609
- "learning_rate": 7.779071214919068e-06,
610
- "loss": 0.7404,
611
  "step": 850
612
  },
613
  {
614
  "epoch": 0.38565022421524664,
615
- "grad_norm": 1.256534318600846,
616
- "learning_rate": 7.713670471109749e-06,
617
- "loss": 0.7364,
618
  "step": 860
619
  },
620
  {
621
  "epoch": 0.3901345291479821,
622
- "grad_norm": 1.3061985247991272,
623
- "learning_rate": 7.647604832033178e-06,
624
- "loss": 0.7535,
625
  "step": 870
626
  },
627
  {
628
  "epoch": 0.39461883408071746,
629
- "grad_norm": 1.3626743194809932,
630
- "learning_rate": 7.580890484890864e-06,
631
- "loss": 0.7212,
632
  "step": 880
633
  },
634
  {
635
  "epoch": 0.3991031390134529,
636
- "grad_norm": 1.283405821881289,
637
- "learning_rate": 7.513543775828791e-06,
638
- "loss": 0.7336,
639
  "step": 890
640
  },
641
  {
642
  "epoch": 0.40358744394618834,
643
- "grad_norm": 1.4217858183255085,
644
- "learning_rate": 7.445581205932335e-06,
645
- "loss": 0.7349,
 
 
 
 
 
 
 
 
646
  "step": 900
647
  },
648
  {
649
  "epoch": 0.4080717488789238,
650
- "grad_norm": 1.3304359323377464,
651
- "learning_rate": 7.377019427183213e-06,
652
- "loss": 0.7265,
653
  "step": 910
654
  },
655
  {
656
  "epoch": 0.4125560538116592,
657
- "grad_norm": 1.3324230635898773,
658
- "learning_rate": 7.30787523837947e-06,
659
- "loss": 0.7451,
660
  "step": 920
661
  },
662
  {
663
  "epoch": 0.4170403587443946,
664
- "grad_norm": 1.2976336030674929,
665
- "learning_rate": 7.238165581019488e-06,
666
- "loss": 0.7415,
667
  "step": 930
668
  },
669
  {
670
  "epoch": 0.42152466367713004,
671
- "grad_norm": 1.3082251443805497,
672
- "learning_rate": 7.167907535151027e-06,
673
- "loss": 0.7405,
674
  "step": 940
675
  },
676
  {
677
  "epoch": 0.4260089686098655,
678
- "grad_norm": 1.368310990039969,
679
- "learning_rate": 7.097118315186335e-06,
680
- "loss": 0.7141,
681
  "step": 950
682
  },
683
  {
684
  "epoch": 0.4304932735426009,
685
- "grad_norm": 1.3507885090012453,
686
- "learning_rate": 7.025815265684315e-06,
687
- "loss": 0.744,
688
  "step": 960
689
  },
690
  {
691
  "epoch": 0.4349775784753363,
692
- "grad_norm": 1.3870418574466943,
693
- "learning_rate": 6.9540158571008105e-06,
694
- "loss": 0.7344,
695
  "step": 970
696
  },
697
  {
698
  "epoch": 0.43946188340807174,
699
- "grad_norm": 1.2702369270052376,
700
- "learning_rate": 6.881737681508065e-06,
701
- "loss": 0.7131,
702
  "step": 980
703
  },
704
  {
705
  "epoch": 0.4439461883408072,
706
- "grad_norm": 1.3694518474478212,
707
- "learning_rate": 6.808998448284347e-06,
708
- "loss": 0.7516,
709
  "step": 990
710
  },
711
  {
712
  "epoch": 0.4484304932735426,
713
- "grad_norm": 1.3967080566658139,
714
- "learning_rate": 6.735815979774865e-06,
715
- "loss": 0.7208,
 
 
 
 
 
 
 
 
716
  "step": 1000
717
  },
718
  {
719
  "epoch": 0.452914798206278,
720
- "grad_norm": 1.4517646369323314,
721
- "learning_rate": 6.662208206924986e-06,
722
- "loss": 0.7455,
723
  "step": 1010
724
  },
725
  {
726
  "epoch": 0.45739910313901344,
727
- "grad_norm": 1.2752229454885209,
728
- "learning_rate": 6.588193164886847e-06,
729
- "loss": 0.7555,
730
  "step": 1020
731
  },
732
  {
733
  "epoch": 0.4618834080717489,
734
- "grad_norm": 1.3528261238688069,
735
- "learning_rate": 6.513788988600441e-06,
736
- "loss": 0.7428,
737
  "step": 1030
738
  },
739
  {
740
  "epoch": 0.4663677130044843,
741
- "grad_norm": 1.340831615562883,
742
- "learning_rate": 6.439013908350249e-06,
743
- "loss": 0.7446,
744
  "step": 1040
745
  },
746
  {
747
  "epoch": 0.47085201793721976,
748
- "grad_norm": 1.3233981064583105,
749
- "learning_rate": 6.363886245298514e-06,
750
- "loss": 0.6945,
751
  "step": 1050
752
  },
753
  {
754
  "epoch": 0.47533632286995514,
755
- "grad_norm": 1.29683857481815,
756
- "learning_rate": 6.288424406996237e-06,
757
- "loss": 0.7085,
758
  "step": 1060
759
  },
760
  {
761
  "epoch": 0.4798206278026906,
762
- "grad_norm": 1.3009424922765027,
763
- "learning_rate": 6.2126468828730225e-06,
764
- "loss": 0.7294,
765
  "step": 1070
766
  },
767
  {
768
  "epoch": 0.484304932735426,
769
- "grad_norm": 1.244204104767217,
770
- "learning_rate": 6.136572239706854e-06,
771
- "loss": 0.7091,
772
  "step": 1080
773
  },
774
  {
775
  "epoch": 0.48878923766816146,
776
- "grad_norm": 1.4080424781447183,
777
- "learning_rate": 6.060219117074914e-06,
778
- "loss": 0.724,
779
  "step": 1090
780
  },
781
  {
782
  "epoch": 0.49327354260089684,
783
- "grad_norm": 1.4328593518662633,
784
- "learning_rate": 5.983606222786577e-06,
785
- "loss": 0.7106,
 
 
 
 
 
 
 
 
786
  "step": 1100
787
  },
788
  {
789
  "epoch": 0.4977578475336323,
790
- "grad_norm": 1.3515007322722326,
791
- "learning_rate": 5.9067523282996775e-06,
792
- "loss": 0.7111,
793
  "step": 1110
794
  },
795
  {
796
  "epoch": 0.5022421524663677,
797
- "grad_norm": 1.3343622572750706,
798
- "learning_rate": 5.829676264121184e-06,
799
- "loss": 0.7323,
800
  "step": 1120
801
  },
802
  {
803
  "epoch": 0.5067264573991032,
804
- "grad_norm": 1.3346769678858248,
805
- "learning_rate": 5.752396915193403e-06,
806
- "loss": 0.744,
807
  "step": 1130
808
  },
809
  {
810
  "epoch": 0.5112107623318386,
811
- "grad_norm": 1.4071259742434898,
812
- "learning_rate": 5.6749332162668525e-06,
813
- "loss": 0.7181,
814
  "step": 1140
815
  },
816
  {
817
  "epoch": 0.515695067264574,
818
- "grad_norm": 1.314030857289351,
819
- "learning_rate": 5.5973041472609265e-06,
820
- "loss": 0.7278,
821
  "step": 1150
822
  },
823
  {
824
  "epoch": 0.5201793721973094,
825
- "grad_norm": 1.2893634754111014,
826
- "learning_rate": 5.519528728613491e-06,
827
- "loss": 0.722,
828
  "step": 1160
829
  },
830
  {
831
  "epoch": 0.5246636771300448,
832
- "grad_norm": 1.2894378069172643,
833
- "learning_rate": 5.4416260166205525e-06,
834
- "loss": 0.7282,
835
  "step": 1170
836
  },
837
  {
838
  "epoch": 0.5291479820627802,
839
- "grad_norm": 1.2691657277439665,
840
- "learning_rate": 5.363615098767149e-06,
841
- "loss": 0.7439,
842
  "step": 1180
843
  },
844
  {
845
  "epoch": 0.5336322869955157,
846
- "grad_norm": 1.3478124675299437,
847
- "learning_rate": 5.285515089050587e-06,
848
- "loss": 0.7164,
849
  "step": 1190
850
  },
851
  {
852
  "epoch": 0.5381165919282511,
853
- "grad_norm": 1.3052899854504807,
854
- "learning_rate": 5.207345123297187e-06,
855
- "loss": 0.7171,
 
 
 
 
 
 
 
 
856
  "step": 1200
857
  },
858
  {
859
  "epoch": 0.5426008968609866,
860
- "grad_norm": 1.27131464716386,
861
- "learning_rate": 5.129124354473688e-06,
862
- "loss": 0.7235,
863
  "step": 1210
864
  },
865
  {
866
  "epoch": 0.547085201793722,
867
- "grad_norm": 1.257329353514232,
868
- "learning_rate": 5.050871947994443e-06,
869
- "loss": 0.6999,
870
  "step": 1220
871
  },
872
  {
873
  "epoch": 0.5515695067264574,
874
- "grad_norm": 1.3863466652299603,
875
- "learning_rate": 4.972607077025563e-06,
876
- "loss": 0.7251,
877
  "step": 1230
878
  },
879
  {
880
  "epoch": 0.5560538116591929,
881
- "grad_norm": 1.394706696869187,
882
- "learning_rate": 4.894348917787174e-06,
883
- "loss": 0.6963,
884
  "step": 1240
885
  },
886
  {
887
  "epoch": 0.5605381165919282,
888
- "grad_norm": 1.288516832053435,
889
- "learning_rate": 4.816116644854912e-06,
890
- "loss": 0.7207,
891
  "step": 1250
892
  },
893
  {
894
  "epoch": 0.5650224215246636,
895
- "grad_norm": 1.291568839795731,
896
- "learning_rate": 4.73792942646183e-06,
897
- "loss": 0.7168,
898
  "step": 1260
899
  },
900
  {
901
  "epoch": 0.5695067264573991,
902
- "grad_norm": 1.3692501176044114,
903
- "learning_rate": 4.659806419801855e-06,
904
- "loss": 0.7311,
905
  "step": 1270
906
  },
907
  {
908
  "epoch": 0.5739910313901345,
909
- "grad_norm": 1.2214111714932894,
910
- "learning_rate": 4.581766766335953e-06,
911
- "loss": 0.7175,
912
  "step": 1280
913
  },
914
  {
915
  "epoch": 0.57847533632287,
916
- "grad_norm": 1.2261021201524966,
917
- "learning_rate": 4.503829587102138e-06,
918
- "loss": 0.722,
919
  "step": 1290
920
  },
921
  {
922
  "epoch": 0.5829596412556054,
923
- "grad_norm": 1.279552435182188,
924
- "learning_rate": 4.426013978030508e-06,
925
- "loss": 0.7407,
 
 
 
 
 
 
 
 
926
  "step": 1300
927
  },
928
  {
929
  "epoch": 0.5874439461883408,
930
- "grad_norm": 1.2639914975822624,
931
- "learning_rate": 4.348339005264406e-06,
932
- "loss": 0.7174,
933
  "step": 1310
934
  },
935
  {
936
  "epoch": 0.5919282511210763,
937
- "grad_norm": 1.2931987303236723,
938
- "learning_rate": 4.270823700488896e-06,
939
- "loss": 0.7236,
940
  "step": 1320
941
  },
942
  {
943
  "epoch": 0.5964125560538116,
944
- "grad_norm": 1.3083460389811294,
945
- "learning_rate": 4.19348705626768e-06,
946
- "loss": 0.7247,
947
  "step": 1330
948
  },
949
  {
950
  "epoch": 0.600896860986547,
951
- "grad_norm": 1.359946232054244,
952
- "learning_rate": 4.116348021389595e-06,
953
- "loss": 0.7289,
954
  "step": 1340
955
  },
956
  {
957
  "epoch": 0.6053811659192825,
958
- "grad_norm": 1.3038409489179155,
959
- "learning_rate": 4.039425496225834e-06,
960
- "loss": 0.723,
961
  "step": 1350
962
  },
963
  {
964
  "epoch": 0.6098654708520179,
965
- "grad_norm": 1.337031671252276,
966
- "learning_rate": 3.962738328099047e-06,
967
- "loss": 0.718,
968
  "step": 1360
969
  },
970
  {
971
  "epoch": 0.6143497757847534,
972
- "grad_norm": 1.3006506595251124,
973
- "learning_rate": 3.88630530666542e-06,
974
- "loss": 0.7372,
975
  "step": 1370
976
  },
977
  {
978
  "epoch": 0.6188340807174888,
979
- "grad_norm": 1.3038120667886732,
980
- "learning_rate": 3.8101451593108816e-06,
981
- "loss": 0.732,
982
  "step": 1380
983
  },
984
  {
985
  "epoch": 0.6233183856502242,
986
- "grad_norm": 1.2544712968929104,
987
- "learning_rate": 3.7342765465625953e-06,
988
- "loss": 0.7347,
989
  "step": 1390
990
  },
991
  {
992
  "epoch": 0.6278026905829597,
993
- "grad_norm": 1.3352755014667614,
994
- "learning_rate": 3.658718057516803e-06,
995
- "loss": 0.7332,
 
 
 
 
 
 
 
 
996
  "step": 1400
997
  },
998
  {
999
  "epoch": 0.6322869955156951,
1000
- "grad_norm": 1.3389617347187606,
1001
- "learning_rate": 3.5834882052841744e-06,
1002
- "loss": 0.7154,
1003
  "step": 1410
1004
  },
1005
  {
1006
  "epoch": 0.6367713004484304,
1007
- "grad_norm": 1.2654799213890686,
1008
- "learning_rate": 3.508605422453799e-06,
1009
- "loss": 0.7002,
1010
  "step": 1420
1011
  },
1012
  {
1013
  "epoch": 0.6412556053811659,
1014
- "grad_norm": 1.311398422880929,
1015
- "learning_rate": 3.4340880565768707e-06,
1016
- "loss": 0.7098,
1017
  "step": 1430
1018
  },
1019
  {
1020
  "epoch": 0.6457399103139013,
1021
- "grad_norm": 1.3577211660369808,
1022
- "learning_rate": 3.359954365671241e-06,
1023
- "loss": 0.7024,
1024
  "step": 1440
1025
  },
1026
  {
1027
  "epoch": 0.6502242152466368,
1028
- "grad_norm": 1.145375472595952,
1029
- "learning_rate": 3.2862225137478897e-06,
1030
- "loss": 0.7097,
1031
  "step": 1450
1032
  },
1033
  {
1034
  "epoch": 0.6547085201793722,
1035
- "grad_norm": 1.3358950336855993,
1036
- "learning_rate": 3.2129105663604275e-06,
1037
- "loss": 0.7148,
1038
  "step": 1460
1039
  },
1040
  {
1041
  "epoch": 0.6591928251121076,
1042
- "grad_norm": 1.2495395838860759,
1043
- "learning_rate": 3.1400364861787434e-06,
1044
- "loss": 0.7483,
1045
  "step": 1470
1046
  },
1047
  {
1048
  "epoch": 0.6636771300448431,
1049
- "grad_norm": 1.2742823576583961,
1050
- "learning_rate": 3.0676181285878343e-06,
1051
- "loss": 0.7063,
1052
  "step": 1480
1053
  },
1054
  {
1055
  "epoch": 0.6681614349775785,
1056
- "grad_norm": 1.2596513250823083,
1057
- "learning_rate": 2.9956732373129378e-06,
1058
- "loss": 0.7201,
1059
  "step": 1490
1060
  },
1061
  {
1062
  "epoch": 0.672645739910314,
1063
- "grad_norm": 1.4183600229677984,
1064
- "learning_rate": 2.9242194400720157e-06,
1065
- "loss": 0.7202,
 
 
 
 
 
 
 
 
1066
  "step": 1500
1067
  },
1068
  {
1069
  "epoch": 0.6771300448430493,
1070
- "grad_norm": 1.224500332009707,
1071
- "learning_rate": 2.8532742442566735e-06,
1072
- "loss": 0.7228,
1073
  "step": 1510
1074
  },
1075
  {
1076
  "epoch": 0.6816143497757847,
1077
- "grad_norm": 1.20210393613667,
1078
- "learning_rate": 2.782855032642535e-06,
1079
- "loss": 0.7386,
1080
  "step": 1520
1081
  },
1082
  {
1083
  "epoch": 0.6860986547085202,
1084
- "grad_norm": 1.2835056973370584,
1085
- "learning_rate": 2.712979059130187e-06,
1086
- "loss": 0.7207,
1087
  "step": 1530
1088
  },
1089
  {
1090
  "epoch": 0.6905829596412556,
1091
- "grad_norm": 1.180714987729606,
1092
- "learning_rate": 2.643663444517671e-06,
1093
- "loss": 0.6981,
1094
  "step": 1540
1095
  },
1096
  {
1097
  "epoch": 0.695067264573991,
1098
- "grad_norm": 1.2871858226590431,
1099
- "learning_rate": 2.5749251723055933e-06,
1100
- "loss": 0.6853,
1101
  "step": 1550
1102
  },
1103
  {
1104
  "epoch": 0.6995515695067265,
1105
- "grad_norm": 1.3219720717807693,
1106
- "learning_rate": 2.5067810845358926e-06,
1107
- "loss": 0.7192,
1108
  "step": 1560
1109
  },
1110
  {
1111
  "epoch": 0.7040358744394619,
1112
- "grad_norm": 1.391893981214182,
1113
- "learning_rate": 2.439247877665244e-06,
1114
- "loss": 0.7103,
1115
  "step": 1570
1116
  },
1117
  {
1118
  "epoch": 0.7085201793721974,
1119
- "grad_norm": 1.2636799158865641,
1120
- "learning_rate": 2.3723420984741417e-06,
1121
- "loss": 0.684,
1122
  "step": 1580
1123
  },
1124
  {
1125
  "epoch": 0.7130044843049327,
1126
- "grad_norm": 1.3557464635552046,
1127
- "learning_rate": 2.3060801400126693e-06,
1128
- "loss": 0.7207,
1129
  "step": 1590
1130
  },
1131
  {
1132
  "epoch": 0.7174887892376681,
1133
- "grad_norm": 1.3569256088684083,
1134
- "learning_rate": 2.240478237583915e-06,
1135
- "loss": 0.7077,
 
 
 
 
 
 
 
 
1136
  "step": 1600
1137
  },
1138
  {
1139
  "epoch": 0.7219730941704036,
1140
- "grad_norm": 1.37192661939199,
1141
- "learning_rate": 2.1755524647660514e-06,
1142
- "loss": 0.693,
1143
  "step": 1610
1144
  },
1145
  {
1146
  "epoch": 0.726457399103139,
1147
- "grad_norm": 1.248391921620642,
1148
- "learning_rate": 2.1113187294740294e-06,
1149
- "loss": 0.6911,
1150
  "step": 1620
1151
  },
1152
  {
1153
  "epoch": 0.7309417040358744,
1154
- "grad_norm": 1.357856212054366,
1155
- "learning_rate": 2.047792770061881e-06,
1156
- "loss": 0.6838,
1157
  "step": 1630
1158
  },
1159
  {
1160
  "epoch": 0.7354260089686099,
1161
- "grad_norm": 1.4147463764446673,
1162
- "learning_rate": 1.9849901514665458e-06,
1163
- "loss": 0.7122,
1164
  "step": 1640
1165
  },
1166
  {
1167
  "epoch": 0.7399103139013453,
1168
- "grad_norm": 1.3587967144630926,
1169
- "learning_rate": 1.922926261394206e-06,
1170
- "loss": 0.6927,
1171
  "step": 1650
1172
  },
1173
  {
1174
  "epoch": 0.7443946188340808,
1175
- "grad_norm": 1.326112035320906,
1176
- "learning_rate": 1.8616163065500231e-06,
1177
- "loss": 0.6931,
1178
  "step": 1660
1179
  },
1180
  {
1181
  "epoch": 0.7488789237668162,
1182
- "grad_norm": 1.2337199420172695,
1183
- "learning_rate": 1.8010753089122572e-06,
1184
- "loss": 0.6934,
1185
  "step": 1670
1186
  },
1187
  {
1188
  "epoch": 0.7533632286995515,
1189
- "grad_norm": 1.2176765529306792,
1190
- "learning_rate": 1.7413181020516146e-06,
1191
- "loss": 0.7164,
1192
  "step": 1680
1193
  },
1194
  {
1195
  "epoch": 0.757847533632287,
1196
- "grad_norm": 1.2724919849788396,
1197
- "learning_rate": 1.6823593274967703e-06,
1198
- "loss": 0.7267,
1199
  "step": 1690
1200
  },
1201
  {
1202
  "epoch": 0.7623318385650224,
1203
- "grad_norm": 1.3274428154634692,
1204
- "learning_rate": 1.6242134311469538e-06,
1205
- "loss": 0.6824,
 
 
 
 
 
 
 
 
1206
  "step": 1700
1207
  },
1208
  {
1209
  "epoch": 0.7668161434977578,
1210
- "grad_norm": 1.3661211722387332,
1211
- "learning_rate": 1.5668946597324558e-06,
1212
- "loss": 0.7182,
1213
  "step": 1710
1214
  },
1215
  {
1216
  "epoch": 0.7713004484304933,
1217
- "grad_norm": 1.2959174525555186,
1218
- "learning_rate": 1.51041705732393e-06,
1219
- "loss": 0.7118,
1220
  "step": 1720
1221
  },
1222
  {
1223
  "epoch": 0.7757847533632287,
1224
- "grad_norm": 1.2773849053623296,
1225
- "learning_rate": 1.4547944618913706e-06,
1226
- "loss": 0.6929,
1227
  "step": 1730
1228
  },
1229
  {
1230
  "epoch": 0.7802690582959642,
1231
- "grad_norm": 1.297150269984566,
1232
- "learning_rate": 1.4000405019135676e-06,
1233
- "loss": 0.6883,
1234
  "step": 1740
1235
  },
1236
  {
1237
  "epoch": 0.7847533632286996,
1238
- "grad_norm": 1.400539444832659,
1239
- "learning_rate": 1.3461685930388958e-06,
1240
- "loss": 0.6911,
1241
  "step": 1750
1242
  },
1243
  {
1244
  "epoch": 0.7892376681614349,
1245
- "grad_norm": 1.2337657683341194,
1246
- "learning_rate": 1.2931919347982607e-06,
1247
- "loss": 0.6921,
1248
  "step": 1760
1249
  },
1250
  {
1251
  "epoch": 0.7937219730941704,
1252
- "grad_norm": 1.2072415924732212,
1253
- "learning_rate": 1.2411235073709883e-06,
1254
- "loss": 0.7102,
1255
  "step": 1770
1256
  },
1257
  {
1258
  "epoch": 0.7982062780269058,
1259
- "grad_norm": 1.381367292756694,
1260
- "learning_rate": 1.1899760684044515e-06,
1261
- "loss": 0.6838,
1262
  "step": 1780
1263
  },
1264
  {
1265
  "epoch": 0.8026905829596412,
1266
- "grad_norm": 1.3102643665842388,
1267
- "learning_rate": 1.1397621498882471e-06,
1268
- "loss": 0.6945,
1269
  "step": 1790
1270
  },
1271
  {
1272
  "epoch": 0.8071748878923767,
1273
- "grad_norm": 1.3214717866253802,
1274
- "learning_rate": 1.0904940550836285e-06,
1275
- "loss": 0.7016,
 
 
 
 
 
 
 
 
1276
  "step": 1800
1277
  },
1278
  {
1279
  "epoch": 0.8116591928251121,
1280
- "grad_norm": 1.1240219719921938,
1281
- "learning_rate": 1.0421838555090119e-06,
1282
- "loss": 0.7018,
1283
  "step": 1810
1284
  },
1285
  {
1286
  "epoch": 0.8161434977578476,
1287
- "grad_norm": 1.2844428659053804,
1288
- "learning_rate": 9.948433879822428e-07,
1289
- "loss": 0.7361,
1290
  "step": 1820
1291
  },
1292
  {
1293
  "epoch": 0.820627802690583,
1294
- "grad_norm": 1.249492840397432,
1295
- "learning_rate": 9.484842517203735e-07,
1296
- "loss": 0.707,
1297
  "step": 1830
1298
  },
1299
  {
1300
  "epoch": 0.8251121076233184,
1301
- "grad_norm": 1.2793612843154911,
1302
- "learning_rate": 9.031178054976636e-07,
1303
- "loss": 0.7226,
1304
  "step": 1840
1305
  },
1306
  {
1307
  "epoch": 0.8295964125560538,
1308
- "grad_norm": 1.3447642426455195,
1309
- "learning_rate": 8.587551648624859e-07,
1310
- "loss": 0.6906,
1311
  "step": 1850
1312
  },
1313
  {
1314
  "epoch": 0.8340807174887892,
1315
- "grad_norm": 1.282168137173779,
1316
- "learning_rate": 8.154071994138241e-07,
1317
- "loss": 0.698,
1318
  "step": 1860
1319
  },
1320
  {
1321
  "epoch": 0.8385650224215246,
1322
- "grad_norm": 1.4629920435128856,
1323
- "learning_rate": 7.730845301380441e-07,
1324
- "loss": 0.7212,
1325
  "step": 1870
1326
  },
1327
  {
1328
  "epoch": 0.8430493273542601,
1329
- "grad_norm": 1.3143459341111923,
1330
- "learning_rate": 7.317975268065685e-07,
1331
- "loss": 0.6942,
1332
  "step": 1880
1333
  },
1334
  {
1335
  "epoch": 0.8475336322869955,
1336
- "grad_norm": 1.3274198546425116,
1337
- "learning_rate": 6.915563054351037e-07,
1338
- "loss": 0.6944,
1339
  "step": 1890
1340
  },
1341
  {
1342
  "epoch": 0.852017937219731,
1343
- "grad_norm": 1.2958298618538142,
1344
- "learning_rate": 6.523707258050516e-07,
1345
- "loss": 0.6692,
 
 
 
 
 
 
 
 
1346
  "step": 1900
1347
  },
1348
  {
1349
  "epoch": 0.8565022421524664,
1350
- "grad_norm": 1.3403229426415875,
1351
- "learning_rate": 6.14250389047692e-07,
1352
- "loss": 0.7034,
1353
  "step": 1910
1354
  },
1355
  {
1356
  "epoch": 0.8609865470852018,
1357
- "grad_norm": 1.258601487476543,
1358
- "learning_rate": 5.772046352917399e-07,
1359
- "loss": 0.7144,
1360
  "step": 1920
1361
  },
1362
  {
1363
  "epoch": 0.8654708520179372,
1364
- "grad_norm": 1.1222145256140716,
1365
- "learning_rate": 5.412425413748623e-07,
1366
- "loss": 0.6988,
1367
  "step": 1930
1368
  },
1369
  {
1370
  "epoch": 0.8699551569506726,
1371
- "grad_norm": 1.2674996138499859,
1372
- "learning_rate": 5.063729186196948e-07,
1373
- "loss": 0.7089,
1374
  "step": 1940
1375
  },
1376
  {
1377
  "epoch": 0.874439461883408,
1378
- "grad_norm": 1.3633828734617575,
1379
- "learning_rate": 4.7260431067491617e-07,
1380
- "loss": 0.733,
1381
  "step": 1950
1382
  },
1383
  {
1384
  "epoch": 0.8789237668161435,
1385
- "grad_norm": 1.2652212276863493,
1386
- "learning_rate": 4.399449914219167e-07,
1387
- "loss": 0.7209,
1388
  "step": 1960
1389
  },
1390
  {
1391
  "epoch": 0.8834080717488789,
1392
- "grad_norm": 1.2975811004623845,
1393
- "learning_rate": 4.084029629475478e-07,
1394
- "loss": 0.7252,
1395
  "step": 1970
1396
  },
1397
  {
1398
  "epoch": 0.8878923766816144,
1399
- "grad_norm": 1.2744740809644324,
1400
- "learning_rate": 3.7798595358348457e-07,
1401
- "loss": 0.7083,
1402
  "step": 1980
1403
  },
1404
  {
1405
  "epoch": 0.8923766816143498,
1406
- "grad_norm": 1.2924446335095698,
1407
- "learning_rate": 3.487014160126467e-07,
1408
- "loss": 0.7077,
1409
  "step": 1990
1410
  },
1411
  {
1412
  "epoch": 0.8968609865470852,
1413
- "grad_norm": 1.283938673421291,
1414
- "learning_rate": 3.2055652544316695e-07,
1415
- "loss": 0.7038,
 
 
 
 
 
 
 
 
1416
  "step": 2000
1417
  },
1418
  {
1419
  "epoch": 0.9013452914798207,
1420
- "grad_norm": 1.3428158271865258,
1421
- "learning_rate": 2.9355817785034325e-07,
1422
- "loss": 0.7177,
1423
  "step": 2010
1424
  },
1425
  {
1426
  "epoch": 0.905829596412556,
1427
- "grad_norm": 1.3237588390932393,
1428
- "learning_rate": 2.6771298828700885e-07,
1429
- "loss": 0.7079,
1430
  "step": 2020
1431
  },
1432
  {
1433
  "epoch": 0.9103139013452914,
1434
- "grad_norm": 1.379188659318735,
1435
- "learning_rate": 2.4302728926273224e-07,
1436
- "loss": 0.7159,
1437
  "step": 2030
1438
  },
1439
  {
1440
  "epoch": 0.9147982062780269,
1441
- "grad_norm": 1.2853687562313414,
1442
- "learning_rate": 2.195071291922435e-07,
1443
- "loss": 0.6842,
1444
  "step": 2040
1445
  },
1446
  {
1447
  "epoch": 0.9192825112107623,
1448
- "grad_norm": 1.3212853096546313,
1449
- "learning_rate": 1.9715827091347005e-07,
1450
- "loss": 0.6994,
1451
  "step": 2050
1452
  },
1453
  {
1454
  "epoch": 0.9237668161434978,
1455
- "grad_norm": 1.4231610496002076,
1456
- "learning_rate": 1.7598619027554553e-07,
1457
- "loss": 0.7032,
1458
  "step": 2060
1459
  },
1460
  {
1461
  "epoch": 0.9282511210762332,
1462
- "grad_norm": 1.288660209861224,
1463
- "learning_rate": 1.5599607479713396e-07,
1464
- "loss": 0.6856,
1465
  "step": 2070
1466
  },
1467
  {
1468
  "epoch": 0.9327354260089686,
1469
- "grad_norm": 1.291080758227027,
1470
- "learning_rate": 1.3719282239539722e-07,
1471
- "loss": 0.7183,
1472
  "step": 2080
1473
  },
1474
  {
1475
  "epoch": 0.9372197309417041,
1476
- "grad_norm": 1.3527938026636528,
1477
- "learning_rate": 1.1958104018592376e-07,
1478
- "loss": 0.7022,
1479
  "step": 2090
1480
  },
1481
  {
1482
  "epoch": 0.9417040358744395,
1483
- "grad_norm": 1.306930563286188,
1484
- "learning_rate": 1.0316504335390775e-07,
1485
- "loss": 0.7202,
 
 
 
 
 
 
 
 
1486
  "step": 2100
1487
  },
1488
  {
1489
  "epoch": 0.9461883408071748,
1490
- "grad_norm": 1.34728936572921,
1491
- "learning_rate": 8.79488540968565e-08,
1492
- "loss": 0.7128,
1493
  "step": 2110
1494
  },
1495
  {
1496
  "epoch": 0.9506726457399103,
1497
- "grad_norm": 1.5138997567977701,
1498
- "learning_rate": 7.39362006390798e-08,
1499
- "loss": 0.6841,
1500
  "step": 2120
1501
  },
1502
  {
1503
  "epoch": 0.9551569506726457,
1504
- "grad_norm": 1.2518587224976194,
1505
- "learning_rate": 6.113051631821631e-08,
1506
- "loss": 0.71,
1507
  "step": 2130
1508
  },
1509
  {
1510
  "epoch": 0.9596412556053812,
1511
- "grad_norm": 1.4419766059722212,
1512
- "learning_rate": 4.9534938744004723e-08,
1513
- "loss": 0.6944,
1514
  "step": 2140
1515
  },
1516
  {
1517
  "epoch": 0.9641255605381166,
1518
- "grad_norm": 1.3591089992663685,
1519
- "learning_rate": 3.915230902951761e-08,
1520
- "loss": 0.717,
1521
  "step": 2150
1522
  },
1523
  {
1524
  "epoch": 0.968609865470852,
1525
- "grad_norm": 1.281084457332566,
1526
- "learning_rate": 2.9985171095041066e-08,
1527
- "loss": 0.703,
1528
  "step": 2160
1529
  },
1530
  {
1531
  "epoch": 0.9730941704035875,
1532
- "grad_norm": 1.1922556783094496,
1533
- "learning_rate": 2.203577104476773e-08,
1534
- "loss": 0.7085,
1535
  "step": 2170
1536
  },
1537
  {
1538
  "epoch": 0.9775784753363229,
1539
- "grad_norm": 1.3917918614257676,
1540
- "learning_rate": 1.5306056616468666e-08,
1541
- "loss": 0.709,
1542
  "step": 2180
1543
  },
1544
  {
1545
  "epoch": 0.9820627802690582,
1546
- "grad_norm": 1.178733370314234,
1547
- "learning_rate": 9.797676704259574e-09,
1548
- "loss": 0.7009,
1549
  "step": 2190
1550
  },
1551
  {
1552
  "epoch": 0.9865470852017937,
1553
- "grad_norm": 1.2352452900540438,
1554
- "learning_rate": 5.511980954596152e-09,
1555
- "loss": 0.7085,
 
 
 
 
 
 
 
 
1556
  "step": 2200
1557
  },
1558
  {
1559
  "epoch": 0.9910313901345291,
1560
- "grad_norm": 1.519249312059187,
1561
- "learning_rate": 2.4500194355880913e-09,
1562
- "loss": 0.6865,
1563
  "step": 2210
1564
  },
1565
  {
1566
  "epoch": 0.9955156950672646,
1567
- "grad_norm": 1.2975150315542752,
1568
- "learning_rate": 6.125423797137541e-10,
1569
- "loss": 0.6935,
1570
  "step": 2220
1571
  },
1572
  {
1573
  "epoch": 1.0,
1574
- "grad_norm": 1.2570272992562246,
1575
  "learning_rate": 0.0,
1576
- "loss": 0.6946,
1577
- "step": 2230
1578
- },
1579
- {
1580
- "epoch": 1.0,
1581
- "eval_loss": 0.7042345404624939,
1582
- "eval_runtime": 381.2635,
1583
- "eval_samples_per_second": 131.345,
1584
- "eval_steps_per_second": 2.054,
1585
  "step": 2230
1586
  },
1587
  {
1588
  "epoch": 1.0,
1589
  "step": 2230,
1590
- "total_flos": 244955314192384.0,
1591
- "train_loss": 0.7376568270371099,
1592
- "train_runtime": 8720.6202,
1593
- "train_samples_per_second": 32.731,
1594
- "train_steps_per_second": 0.256
1595
  }
1596
  ],
1597
  "logging_steps": 10,
@@ -1611,7 +1779,7 @@
1611
  "attributes": {}
1612
  }
1613
  },
1614
- "total_flos": 244955314192384.0,
1615
  "train_batch_size": 8,
1616
  "trial_name": null,
1617
  "trial_params": null
 
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
  "epoch": 1.0,
5
+ "eval_steps": 100,
6
  "global_step": 2230,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.0004484304932735426,
13
+ "grad_norm": 4.696539476451585,
14
+ "learning_rate": 1.3452914798206278e-08,
15
  "loss": 0.9912,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.004484304932735426,
20
+ "grad_norm": 5.089904667658368,
21
+ "learning_rate": 1.345291479820628e-07,
22
+ "loss": 1.0341,
23
  "step": 10
24
  },
25
  {
26
  "epoch": 0.008968609865470852,
27
+ "grad_norm": 5.546828630388097,
28
+ "learning_rate": 2.690582959641256e-07,
29
+ "loss": 1.0502,
30
  "step": 20
31
  },
32
  {
33
  "epoch": 0.013452914798206279,
34
+ "grad_norm": 4.113849381101499,
35
+ "learning_rate": 4.0358744394618834e-07,
36
+ "loss": 1.0386,
37
  "step": 30
38
  },
39
  {
40
  "epoch": 0.017937219730941704,
41
+ "grad_norm": 3.6548963622814887,
42
+ "learning_rate": 5.381165919282512e-07,
43
+ "loss": 1.0282,
44
  "step": 40
45
  },
46
  {
47
  "epoch": 0.02242152466367713,
48
+ "grad_norm": 2.157564670206396,
49
+ "learning_rate": 6.72645739910314e-07,
50
+ "loss": 0.9574,
51
  "step": 50
52
  },
53
  {
54
  "epoch": 0.026905829596412557,
55
+ "grad_norm": 2.0184475272019555,
56
+ "learning_rate": 8.071748878923767e-07,
57
+ "loss": 0.9263,
58
  "step": 60
59
  },
60
  {
61
  "epoch": 0.03139013452914798,
62
+ "grad_norm": 1.7894937443172652,
63
+ "learning_rate": 9.417040358744395e-07,
64
+ "loss": 0.9253,
65
  "step": 70
66
  },
67
  {
68
  "epoch": 0.03587443946188341,
69
+ "grad_norm": 1.6533764414432808,
70
+ "learning_rate": 1.0762331838565023e-06,
71
+ "loss": 0.9106,
72
  "step": 80
73
  },
74
  {
75
  "epoch": 0.04035874439461883,
76
+ "grad_norm": 1.9561381307359194,
77
+ "learning_rate": 1.2107623318385651e-06,
78
+ "loss": 0.8713,
79
  "step": 90
80
  },
81
  {
82
  "epoch": 0.04484304932735426,
83
+ "grad_norm": 1.5478472557018526,
84
+ "learning_rate": 1.345291479820628e-06,
85
+ "loss": 0.8741,
86
+ "step": 100
87
+ },
88
+ {
89
+ "epoch": 0.04484304932735426,
90
+ "eval_loss": 0.8599640727043152,
91
+ "eval_runtime": 430.7233,
92
+ "eval_samples_per_second": 116.263,
93
+ "eval_steps_per_second": 1.818,
94
  "step": 100
95
  },
96
  {
97
  "epoch": 0.04932735426008968,
98
+ "grad_norm": 1.5759592930264636,
99
+ "learning_rate": 1.4798206278026905e-06,
100
+ "loss": 0.8381,
101
  "step": 110
102
  },
103
  {
104
  "epoch": 0.053811659192825115,
105
+ "grad_norm": 1.5446577353242628,
106
+ "learning_rate": 1.6143497757847533e-06,
107
+ "loss": 0.8151,
108
  "step": 120
109
  },
110
  {
111
  "epoch": 0.05829596412556054,
112
+ "grad_norm": 1.6899841974229757,
113
+ "learning_rate": 1.7488789237668162e-06,
114
+ "loss": 0.8309,
115
  "step": 130
116
  },
117
  {
118
  "epoch": 0.06278026905829596,
119
+ "grad_norm": 1.6274283098945213,
120
+ "learning_rate": 1.883408071748879e-06,
121
+ "loss": 0.8509,
122
  "step": 140
123
  },
124
  {
125
  "epoch": 0.06726457399103139,
126
+ "grad_norm": 1.7690619100525546,
127
+ "learning_rate": 2.0179372197309418e-06,
128
+ "loss": 0.8057,
129
  "step": 150
130
  },
131
  {
132
  "epoch": 0.07174887892376682,
133
+ "grad_norm": 1.866473004768342,
134
+ "learning_rate": 2.1524663677130046e-06,
135
+ "loss": 0.8236,
136
  "step": 160
137
  },
138
  {
139
  "epoch": 0.07623318385650224,
140
+ "grad_norm": 1.5528009019380091,
141
+ "learning_rate": 2.2869955156950674e-06,
142
+ "loss": 0.7936,
143
  "step": 170
144
  },
145
  {
146
  "epoch": 0.08071748878923767,
147
+ "grad_norm": 1.8924349879943885,
148
+ "learning_rate": 2.4215246636771302e-06,
149
+ "loss": 0.8054,
150
  "step": 180
151
  },
152
  {
153
  "epoch": 0.08520179372197309,
154
+ "grad_norm": 1.5998254884542162,
155
+ "learning_rate": 2.556053811659193e-06,
156
+ "loss": 0.7971,
157
  "step": 190
158
  },
159
  {
160
  "epoch": 0.08968609865470852,
161
+ "grad_norm": 1.553085624058612,
162
+ "learning_rate": 2.690582959641256e-06,
163
+ "loss": 0.8038,
164
+ "step": 200
165
+ },
166
+ {
167
+ "epoch": 0.08968609865470852,
168
+ "eval_loss": 0.8094644546508789,
169
+ "eval_runtime": 412.1717,
170
+ "eval_samples_per_second": 121.495,
171
+ "eval_steps_per_second": 1.9,
172
  "step": 200
173
  },
174
  {
175
  "epoch": 0.09417040358744394,
176
+ "grad_norm": 1.6621378080442881,
177
+ "learning_rate": 2.8251121076233187e-06,
178
+ "loss": 0.7815,
179
  "step": 210
180
  },
181
  {
182
  "epoch": 0.09865470852017937,
183
+ "grad_norm": 1.5875832641605891,
184
+ "learning_rate": 2.959641255605381e-06,
185
+ "loss": 0.8088,
186
  "step": 220
187
  },
188
  {
189
  "epoch": 0.1031390134529148,
190
+ "grad_norm": 1.6006597094640902,
191
+ "learning_rate": 2.99990995533251e-06,
192
+ "loss": 0.8141,
193
  "step": 230
194
  },
195
  {
196
  "epoch": 0.10762331838565023,
197
+ "grad_norm": 1.7932554350094232,
198
+ "learning_rate": 2.9994689462512194e-06,
199
+ "loss": 0.7834,
200
  "step": 240
201
  },
202
  {
203
  "epoch": 0.11210762331838565,
204
+ "grad_norm": 1.6444723214299724,
205
+ "learning_rate": 2.998660541859271e-06,
206
+ "loss": 0.7797,
207
  "step": 250
208
  },
209
  {
210
  "epoch": 0.11659192825112108,
211
+ "grad_norm": 1.790145213655978,
212
+ "learning_rate": 2.9974849402294452e-06,
213
+ "loss": 0.8046,
214
  "step": 260
215
  },
216
  {
217
  "epoch": 0.1210762331838565,
218
+ "grad_norm": 1.8694283184605,
219
+ "learning_rate": 2.9959424294040703e-06,
220
+ "loss": 0.7802,
221
  "step": 270
222
  },
223
  {
224
  "epoch": 0.12556053811659193,
225
+ "grad_norm": 1.6030839509233756,
226
+ "learning_rate": 2.9940333873244464e-06,
227
+ "loss": 0.8032,
228
  "step": 280
229
  },
230
  {
231
  "epoch": 0.13004484304932734,
232
+ "grad_norm": 1.664910362160235,
233
+ "learning_rate": 2.991758281738245e-06,
234
+ "loss": 0.7802,
235
  "step": 290
236
  },
237
  {
238
  "epoch": 0.13452914798206278,
239
+ "grad_norm": 1.6726792291262853,
240
+ "learning_rate": 2.989117670084902e-06,
241
+ "loss": 0.7937,
242
+ "step": 300
243
+ },
244
+ {
245
+ "epoch": 0.13452914798206278,
246
+ "eval_loss": 0.7789004445075989,
247
+ "eval_runtime": 410.6605,
248
+ "eval_samples_per_second": 121.943,
249
+ "eval_steps_per_second": 1.907,
250
  "step": 300
251
  },
252
  {
253
  "epoch": 0.13901345291479822,
254
+ "grad_norm": 1.4685211047526556,
255
+ "learning_rate": 2.986112199359036e-06,
256
+ "loss": 0.7486,
257
  "step": 310
258
  },
259
  {
260
  "epoch": 0.14349775784753363,
261
+ "grad_norm": 2.0076694355781575,
262
+ "learning_rate": 2.9827426059519237e-06,
263
+ "loss": 0.808,
264
  "step": 320
265
  },
266
  {
267
  "epoch": 0.14798206278026907,
268
+ "grad_norm": 1.557780179088859,
269
+ "learning_rate": 2.9790097154710697e-06,
270
+ "loss": 0.7849,
271
  "step": 330
272
  },
273
  {
274
  "epoch": 0.15246636771300448,
275
+ "grad_norm": 1.3610248283116362,
276
+ "learning_rate": 2.9749144425379216e-06,
277
+ "loss": 0.7696,
278
  "step": 340
279
  },
280
  {
281
  "epoch": 0.15695067264573992,
282
+ "grad_norm": 1.5050628258310632,
283
+ "learning_rate": 2.9704577905637718e-06,
284
+ "loss": 0.7497,
285
  "step": 350
286
  },
287
  {
288
  "epoch": 0.16143497757847533,
289
+ "grad_norm": 1.4313536098763806,
290
+ "learning_rate": 2.9656408515039017e-06,
291
+ "loss": 0.7544,
292
  "step": 360
293
  },
294
  {
295
  "epoch": 0.16591928251121077,
296
+ "grad_norm": 1.6003065628553548,
297
+ "learning_rate": 2.9604648055900368e-06,
298
+ "loss": 0.7648,
299
  "step": 370
300
  },
301
  {
302
  "epoch": 0.17040358744394618,
303
+ "grad_norm": 1.633334409956319,
304
+ "learning_rate": 2.9549309210411697e-06,
305
+ "loss": 0.7471,
306
  "step": 380
307
  },
308
  {
309
  "epoch": 0.17488789237668162,
310
+ "grad_norm": 1.5700271693529286,
311
+ "learning_rate": 2.949040553752826e-06,
312
+ "loss": 0.8009,
313
  "step": 390
314
  },
315
  {
316
  "epoch": 0.17937219730941703,
317
+ "grad_norm": 1.4854276734758955,
318
+ "learning_rate": 2.9427951469648425e-06,
319
+ "loss": 0.7712,
320
+ "step": 400
321
+ },
322
+ {
323
+ "epoch": 0.17937219730941703,
324
+ "eval_loss": 0.7643527388572693,
325
+ "eval_runtime": 413.4678,
326
+ "eval_samples_per_second": 121.115,
327
+ "eval_steps_per_second": 1.894,
328
  "step": 400
329
  },
330
  {
331
  "epoch": 0.18385650224215247,
332
+ "grad_norm": 1.4160940764229815,
333
+ "learning_rate": 2.936196230907755e-06,
334
+ "loss": 0.7532,
335
  "step": 410
336
  },
337
  {
338
  "epoch": 0.18834080717488788,
339
+ "grad_norm": 1.4265290618310995,
340
+ "learning_rate": 2.929245422427861e-06,
341
+ "loss": 0.7703,
342
  "step": 420
343
  },
344
  {
345
  "epoch": 0.19282511210762332,
346
+ "grad_norm": 1.6899882763333507,
347
+ "learning_rate": 2.9219444245910674e-06,
348
+ "loss": 0.7919,
349
  "step": 430
350
  },
351
  {
352
  "epoch": 0.19730941704035873,
353
+ "grad_norm": 1.4186337044303068,
354
+ "learning_rate": 2.9142950262656098e-06,
355
+ "loss": 0.7477,
356
  "step": 440
357
  },
358
  {
359
  "epoch": 0.20179372197309417,
360
+ "grad_norm": 1.4178331376670448,
361
+ "learning_rate": 2.9062991016837496e-06,
362
+ "loss": 0.7734,
363
  "step": 450
364
  },
365
  {
366
  "epoch": 0.2062780269058296,
367
+ "grad_norm": 1.4503162574851487,
368
+ "learning_rate": 2.897958609982556e-06,
369
+ "loss": 0.7447,
370
  "step": 460
371
  },
372
  {
373
  "epoch": 0.21076233183856502,
374
+ "grad_norm": 1.558520612711291,
375
+ "learning_rate": 2.8892755947238818e-06,
376
+ "loss": 0.741,
377
  "step": 470
378
  },
379
  {
380
  "epoch": 0.21524663677130046,
381
+ "grad_norm": 1.4382572158325275,
382
+ "learning_rate": 2.8802521833936595e-06,
383
+ "loss": 0.7563,
384
  "step": 480
385
  },
386
  {
387
  "epoch": 0.21973094170403587,
388
+ "grad_norm": 1.5964216489171685,
389
+ "learning_rate": 2.870890586880629e-06,
390
+ "loss": 0.7554,
391
  "step": 490
392
  },
393
  {
394
  "epoch": 0.2242152466367713,
395
+ "grad_norm": 1.496069010720812,
396
+ "learning_rate": 2.8611930989346322e-06,
397
+ "loss": 0.7393,
398
+ "step": 500
399
+ },
400
+ {
401
+ "epoch": 0.2242152466367713,
402
+ "eval_loss": 0.7564548254013062,
403
+ "eval_runtime": 408.8965,
404
+ "eval_samples_per_second": 122.469,
405
+ "eval_steps_per_second": 1.915,
406
  "step": 500
407
  },
408
  {
409
  "epoch": 0.22869955156950672,
410
+ "grad_norm": 1.4866290735466012,
411
+ "learning_rate": 2.851162095604607e-06,
412
+ "loss": 0.7499,
413
  "step": 510
414
  },
415
  {
416
  "epoch": 0.23318385650224216,
417
+ "grad_norm": 1.3341919240907245,
418
+ "learning_rate": 2.8408000346564136e-06,
419
+ "loss": 0.7524,
420
  "step": 520
421
  },
422
  {
423
  "epoch": 0.23766816143497757,
424
+ "grad_norm": 1.6374942242171213,
425
+ "learning_rate": 2.8301094549706405e-06,
426
+ "loss": 0.7386,
427
  "step": 530
428
  },
429
  {
430
  "epoch": 0.242152466367713,
431
+ "grad_norm": 1.6225803035616944,
432
+ "learning_rate": 2.8190929759205366e-06,
433
+ "loss": 0.7616,
434
  "step": 540
435
  },
436
  {
437
  "epoch": 0.24663677130044842,
438
+ "grad_norm": 1.4683777464043755,
439
+ "learning_rate": 2.807753296730219e-06,
440
+ "loss": 0.7564,
441
  "step": 550
442
  },
443
  {
444
  "epoch": 0.25112107623318386,
445
+ "grad_norm": 1.350460716883926,
446
+ "learning_rate": 2.7960931958133183e-06,
447
+ "loss": 0.7424,
448
  "step": 560
449
  },
450
  {
451
  "epoch": 0.2556053811659193,
452
+ "grad_norm": 1.522474854464212,
453
+ "learning_rate": 2.7841155300922202e-06,
454
+ "loss": 0.7331,
455
  "step": 570
456
  },
457
  {
458
  "epoch": 0.2600896860986547,
459
+ "grad_norm": 1.448720887976205,
460
+ "learning_rate": 2.7718232342980693e-06,
461
+ "loss": 0.7657,
462
  "step": 580
463
  },
464
  {
465
  "epoch": 0.2645739910313901,
466
+ "grad_norm": 1.6744619426337854,
467
+ "learning_rate": 2.759219320251714e-06,
468
+ "loss": 0.7363,
469
  "step": 590
470
  },
471
  {
472
  "epoch": 0.26905829596412556,
473
+ "grad_norm": 1.3585539591402243,
474
+ "learning_rate": 2.7463068761257554e-06,
475
+ "loss": 0.7458,
476
+ "step": 600
477
+ },
478
+ {
479
+ "epoch": 0.26905829596412556,
480
+ "eval_loss": 0.7505608797073364,
481
+ "eval_runtime": 408.9234,
482
+ "eval_samples_per_second": 122.461,
483
+ "eval_steps_per_second": 1.915,
484
  "step": 600
485
  },
486
  {
487
  "epoch": 0.273542600896861,
488
+ "grad_norm": 1.580932873164111,
489
+ "learning_rate": 2.7330890656878943e-06,
490
+ "loss": 0.7565,
491
  "step": 610
492
  },
493
  {
494
  "epoch": 0.27802690582959644,
495
+ "grad_norm": 1.5329888412189265,
496
+ "learning_rate": 2.7195691275257547e-06,
497
+ "loss": 0.7457,
498
  "step": 620
499
  },
500
  {
501
  "epoch": 0.2825112107623318,
502
+ "grad_norm": 1.6754413400622026,
503
+ "learning_rate": 2.7057503742533753e-06,
504
+ "loss": 0.7392,
505
  "step": 630
506
  },
507
  {
508
  "epoch": 0.28699551569506726,
509
+ "grad_norm": 1.6247897070260917,
510
+ "learning_rate": 2.691636191699562e-06,
511
+ "loss": 0.758,
512
  "step": 640
513
  },
514
  {
515
  "epoch": 0.2914798206278027,
516
+ "grad_norm": 1.42356323236888,
517
+ "learning_rate": 2.6772300380783013e-06,
518
+ "loss": 0.7626,
519
  "step": 650
520
  },
521
  {
522
  "epoch": 0.29596412556053814,
523
+ "grad_norm": 1.4955853270730488,
524
+ "learning_rate": 2.662535443141443e-06,
525
+ "loss": 0.7355,
526
  "step": 660
527
  },
528
  {
529
  "epoch": 0.3004484304932735,
530
+ "grad_norm": 1.4879073313151545,
531
+ "learning_rate": 2.647556007313847e-06,
532
+ "loss": 0.7545,
533
  "step": 670
534
  },
535
  {
536
  "epoch": 0.30493273542600896,
537
+ "grad_norm": 1.4153755477305148,
538
+ "learning_rate": 2.6322954008112213e-06,
539
+ "loss": 0.7378,
540
  "step": 680
541
  },
542
  {
543
  "epoch": 0.3094170403587444,
544
+ "grad_norm": 1.4019993036978922,
545
+ "learning_rate": 2.616757362740855e-06,
546
+ "loss": 0.7387,
547
  "step": 690
548
  },
549
  {
550
  "epoch": 0.31390134529147984,
551
+ "grad_norm": 1.5335241758091316,
552
+ "learning_rate": 2.600945700185474e-06,
553
+ "loss": 0.7694,
554
+ "step": 700
555
+ },
556
+ {
557
+ "epoch": 0.31390134529147984,
558
+ "eval_loss": 0.7457958459854126,
559
+ "eval_runtime": 408.7761,
560
+ "eval_samples_per_second": 122.505,
561
+ "eval_steps_per_second": 1.915,
562
  "step": 700
563
  },
564
  {
565
  "epoch": 0.3183856502242152,
566
+ "grad_norm": 1.47263429505246,
567
+ "learning_rate": 2.5848642872704417e-06,
568
+ "loss": 0.7246,
569
  "step": 710
570
  },
571
  {
572
  "epoch": 0.32286995515695066,
573
+ "grad_norm": 1.5062835613914285,
574
+ "learning_rate": 2.5685170642145337e-06,
575
+ "loss": 0.7338,
576
  "step": 720
577
  },
578
  {
579
  "epoch": 0.3273542600896861,
580
+ "grad_norm": 1.6182138547104117,
581
+ "learning_rate": 2.5519080363645134e-06,
582
+ "loss": 0.73,
583
  "step": 730
584
  },
585
  {
586
  "epoch": 0.33183856502242154,
587
+ "grad_norm": 1.3515300425343295,
588
+ "learning_rate": 2.53504127321376e-06,
589
+ "loss": 0.7299,
590
  "step": 740
591
  },
592
  {
593
  "epoch": 0.336322869955157,
594
+ "grad_norm": 1.5798782493243635,
595
+ "learning_rate": 2.517920907405168e-06,
596
+ "loss": 0.7293,
597
  "step": 750
598
  },
599
  {
600
  "epoch": 0.34080717488789236,
601
+ "grad_norm": 1.4549259580353344,
602
+ "learning_rate": 2.5005511337185824e-06,
603
+ "loss": 0.7621,
604
  "step": 760
605
  },
606
  {
607
  "epoch": 0.3452914798206278,
608
+ "grad_norm": 1.456599605633329,
609
+ "learning_rate": 2.4829362080430077e-06,
610
+ "loss": 0.7438,
611
  "step": 770
612
  },
613
  {
614
  "epoch": 0.34977578475336324,
615
+ "grad_norm": 1.4128813340833153,
616
+ "learning_rate": 2.4650804463338406e-06,
617
+ "loss": 0.7413,
618
  "step": 780
619
  },
620
  {
621
  "epoch": 0.3542600896860987,
622
+ "grad_norm": 1.5613737124434628,
623
+ "learning_rate": 2.4469882235553887e-06,
624
+ "loss": 0.7477,
625
  "step": 790
626
  },
627
  {
628
  "epoch": 0.35874439461883406,
629
+ "grad_norm": 1.6383373422678345,
630
+ "learning_rate": 2.4286639726089293e-06,
631
+ "loss": 0.713,
632
+ "step": 800
633
+ },
634
+ {
635
+ "epoch": 0.35874439461883406,
636
+ "eval_loss": 0.7421520352363586,
637
+ "eval_runtime": 408.0589,
638
+ "eval_samples_per_second": 122.72,
639
+ "eval_steps_per_second": 1.919,
640
  "step": 800
641
  },
642
  {
643
  "epoch": 0.3632286995515695,
644
+ "grad_norm": 1.3492102003393152,
645
+ "learning_rate": 2.4101121832465754e-06,
646
+ "loss": 0.7185,
647
  "step": 810
648
  },
649
  {
650
  "epoch": 0.36771300448430494,
651
+ "grad_norm": 1.4117655797526263,
652
+ "learning_rate": 2.3913374009712084e-06,
653
+ "loss": 0.7379,
654
  "step": 820
655
  },
656
  {
657
  "epoch": 0.3721973094170404,
658
+ "grad_norm": 1.5281693242796246,
659
+ "learning_rate": 2.3723442259227547e-06,
660
+ "loss": 0.7406,
661
  "step": 830
662
  },
663
  {
664
  "epoch": 0.37668161434977576,
665
+ "grad_norm": 1.6990323130848894,
666
+ "learning_rate": 2.3531373117510695e-06,
667
+ "loss": 0.7388,
668
  "step": 840
669
  },
670
  {
671
  "epoch": 0.3811659192825112,
672
+ "grad_norm": 1.476162200960684,
673
+ "learning_rate": 2.33372136447572e-06,
674
+ "loss": 0.7434,
675
  "step": 850
676
  },
677
  {
678
  "epoch": 0.38565022421524664,
679
+ "grad_norm": 1.3930484173784414,
680
+ "learning_rate": 2.3141011413329244e-06,
681
+ "loss": 0.7372,
682
  "step": 860
683
  },
684
  {
685
  "epoch": 0.3901345291479821,
686
+ "grad_norm": 1.4071716332679987,
687
+ "learning_rate": 2.2942814496099532e-06,
688
+ "loss": 0.7531,
689
  "step": 870
690
  },
691
  {
692
  "epoch": 0.39461883408071746,
693
+ "grad_norm": 1.5479232446038012,
694
+ "learning_rate": 2.274267145467259e-06,
695
+ "loss": 0.7216,
696
  "step": 880
697
  },
698
  {
699
  "epoch": 0.3991031390134529,
700
+ "grad_norm": 1.4255077423798548,
701
+ "learning_rate": 2.254063132748637e-06,
702
+ "loss": 0.7343,
703
  "step": 890
704
  },
705
  {
706
  "epoch": 0.40358744394618834,
707
+ "grad_norm": 1.57276996130409,
708
+ "learning_rate": 2.2336743617797006e-06,
709
+ "loss": 0.7347,
710
+ "step": 900
711
+ },
712
+ {
713
+ "epoch": 0.40358744394618834,
714
+ "eval_loss": 0.7386789321899414,
715
+ "eval_runtime": 408.1839,
716
+ "eval_samples_per_second": 122.682,
717
+ "eval_steps_per_second": 1.918,
718
  "step": 900
719
  },
720
  {
721
  "epoch": 0.4080717488789238,
722
+ "grad_norm": 1.4568107529063017,
723
+ "learning_rate": 2.213105828154964e-06,
724
+ "loss": 0.7266,
725
  "step": 910
726
  },
727
  {
728
  "epoch": 0.4125560538116592,
729
+ "grad_norm": 1.374198091231606,
730
+ "learning_rate": 2.192362571513841e-06,
731
+ "loss": 0.7465,
732
  "step": 920
733
  },
734
  {
735
  "epoch": 0.4170403587443946,
736
+ "grad_norm": 1.3925457206301284,
737
+ "learning_rate": 2.171449674305846e-06,
738
+ "loss": 0.7427,
739
  "step": 930
740
  },
741
  {
742
  "epoch": 0.42152466367713004,
743
+ "grad_norm": 1.4443502855856463,
744
+ "learning_rate": 2.1503722605453083e-06,
745
+ "loss": 0.7428,
746
  "step": 940
747
  },
748
  {
749
  "epoch": 0.4260089686098655,
750
+ "grad_norm": 1.5268146365443709,
751
+ "learning_rate": 2.1291354945559004e-06,
752
+ "loss": 0.7163,
753
  "step": 950
754
  },
755
  {
756
  "epoch": 0.4304932735426009,
757
+ "grad_norm": 1.5000325455240473,
758
+ "learning_rate": 2.1077445797052945e-06,
759
+ "loss": 0.7472,
760
  "step": 960
761
  },
762
  {
763
  "epoch": 0.4349775784753363,
764
+ "grad_norm": 1.4869091852092478,
765
+ "learning_rate": 2.086204757130243e-06,
766
+ "loss": 0.7427,
767
  "step": 970
768
  },
769
  {
770
  "epoch": 0.43946188340807174,
771
+ "grad_norm": 1.4430282256544564,
772
+ "learning_rate": 2.0645213044524194e-06,
773
+ "loss": 0.7174,
774
  "step": 980
775
  },
776
  {
777
  "epoch": 0.4439461883408072,
778
+ "grad_norm": 1.4822025498870304,
779
+ "learning_rate": 2.0426995344853043e-06,
780
+ "loss": 0.7538,
781
  "step": 990
782
  },
783
  {
784
  "epoch": 0.4484304932735426,
785
+ "grad_norm": 1.5186234240452396,
786
+ "learning_rate": 2.0207447939324598e-06,
787
+ "loss": 0.7243,
788
+ "step": 1000
789
+ },
790
+ {
791
+ "epoch": 0.4484304932735426,
792
+ "eval_loss": 0.7356163859367371,
793
+ "eval_runtime": 407.0139,
794
+ "eval_samples_per_second": 123.035,
795
+ "eval_steps_per_second": 1.924,
796
  "step": 1000
797
  },
798
  {
799
  "epoch": 0.452914798206278,
800
+ "grad_norm": 1.5742685454152958,
801
+ "learning_rate": 1.998662462077496e-06,
802
+ "loss": 0.7475,
803
  "step": 1010
804
  },
805
  {
806
  "epoch": 0.45739910313901344,
807
+ "grad_norm": 1.3834168469611057,
808
+ "learning_rate": 1.976457949466054e-06,
809
+ "loss": 0.7568,
810
  "step": 1020
811
  },
812
  {
813
  "epoch": 0.4618834080717489,
814
+ "grad_norm": 1.4947961999330186,
815
+ "learning_rate": 1.954136696580132e-06,
816
+ "loss": 0.7464,
817
  "step": 1030
818
  },
819
  {
820
  "epoch": 0.4663677130044843,
821
+ "grad_norm": 1.4284253764088304,
822
+ "learning_rate": 1.9317041725050747e-06,
823
+ "loss": 0.7456,
824
  "step": 1040
825
  },
826
  {
827
  "epoch": 0.47085201793721976,
828
+ "grad_norm": 1.4247354157320633,
829
+ "learning_rate": 1.909165873589554e-06,
830
+ "loss": 0.7008,
831
  "step": 1050
832
  },
833
  {
834
  "epoch": 0.47533632286995514,
835
+ "grad_norm": 1.4525308368306575,
836
+ "learning_rate": 1.886527322098871e-06,
837
+ "loss": 0.7121,
838
  "step": 1060
839
  },
840
  {
841
  "epoch": 0.4798206278026906,
842
+ "grad_norm": 1.43738036112722,
843
+ "learning_rate": 1.8637940648619065e-06,
844
+ "loss": 0.7308,
845
  "step": 1070
846
  },
847
  {
848
  "epoch": 0.484304932735426,
849
+ "grad_norm": 1.402086349899742,
850
+ "learning_rate": 1.8409716719120561e-06,
851
+ "loss": 0.7164,
852
  "step": 1080
853
  },
854
  {
855
  "epoch": 0.48878923766816146,
856
+ "grad_norm": 1.5227358428935063,
857
+ "learning_rate": 1.8180657351224739e-06,
858
+ "loss": 0.732,
859
  "step": 1090
860
  },
861
  {
862
  "epoch": 0.49327354260089684,
863
+ "grad_norm": 1.5813743714389112,
864
+ "learning_rate": 1.7950818668359733e-06,
865
+ "loss": 0.7161,
866
+ "step": 1100
867
+ },
868
+ {
869
+ "epoch": 0.49327354260089684,
870
+ "eval_loss": 0.7330535054206848,
871
+ "eval_runtime": 408.4081,
872
+ "eval_samples_per_second": 122.615,
873
+ "eval_steps_per_second": 1.917,
874
  "step": 1100
875
  },
876
  {
877
  "epoch": 0.4977578475336323,
878
+ "grad_norm": 1.4881819590713468,
879
+ "learning_rate": 1.772025698489903e-06,
880
+ "loss": 0.7144,
881
  "step": 1110
882
  },
883
  {
884
  "epoch": 0.5022421524663677,
885
+ "grad_norm": 1.4750319990458514,
886
+ "learning_rate": 1.7489028792363549e-06,
887
+ "loss": 0.7365,
888
  "step": 1120
889
  },
890
  {
891
  "epoch": 0.5067264573991032,
892
+ "grad_norm": 1.4443590686278198,
893
+ "learning_rate": 1.7257190745580209e-06,
894
+ "loss": 0.7487,
895
  "step": 1130
896
  },
897
  {
898
  "epoch": 0.5112107623318386,
899
+ "grad_norm": 1.4695293763109774,
900
+ "learning_rate": 1.7024799648800555e-06,
901
+ "loss": 0.7233,
902
  "step": 1140
903
  },
904
  {
905
  "epoch": 0.515695067264574,
906
+ "grad_norm": 1.4328944860273993,
907
+ "learning_rate": 1.679191244178278e-06,
908
+ "loss": 0.7322,
909
  "step": 1150
910
  },
911
  {
912
  "epoch": 0.5201793721973094,
913
+ "grad_norm": 1.4157130638413895,
914
+ "learning_rate": 1.6558586185840473e-06,
915
+ "loss": 0.728,
916
  "step": 1160
917
  },
918
  {
919
  "epoch": 0.5246636771300448,
920
+ "grad_norm": 1.4117533616122613,
921
+ "learning_rate": 1.6324878049861656e-06,
922
+ "loss": 0.7331,
923
  "step": 1170
924
  },
925
  {
926
  "epoch": 0.5291479820627802,
927
+ "grad_norm": 1.4255877674393056,
928
+ "learning_rate": 1.609084529630145e-06,
929
+ "loss": 0.7491,
930
  "step": 1180
931
  },
932
  {
933
  "epoch": 0.5336322869955157,
934
+ "grad_norm": 1.4486300200418207,
935
+ "learning_rate": 1.5856545267151759e-06,
936
+ "loss": 0.7261,
937
  "step": 1190
938
  },
939
  {
940
  "epoch": 0.5381165919282511,
941
+ "grad_norm": 1.4628618883782867,
942
+ "learning_rate": 1.5622035369891561e-06,
943
+ "loss": 0.7247,
944
+ "step": 1200
945
+ },
946
+ {
947
+ "epoch": 0.5381165919282511,
948
+ "eval_loss": 0.7308038473129272,
949
+ "eval_runtime": 406.6873,
950
+ "eval_samples_per_second": 123.134,
951
+ "eval_steps_per_second": 1.925,
952
  "step": 1200
953
  },
954
  {
955
  "epoch": 0.5426008968609866,
956
+ "grad_norm": 1.4112256357672157,
957
+ "learning_rate": 1.5387373063421062e-06,
958
+ "loss": 0.7307,
959
  "step": 1210
960
  },
961
  {
962
  "epoch": 0.547085201793722,
963
+ "grad_norm": 1.3994109954542429,
964
+ "learning_rate": 1.515261584398333e-06,
965
+ "loss": 0.7062,
966
  "step": 1220
967
  },
968
  {
969
  "epoch": 0.5515695067264574,
970
+ "grad_norm": 1.5279436893984248,
971
+ "learning_rate": 1.491782123107669e-06,
972
+ "loss": 0.7314,
973
  "step": 1230
974
  },
975
  {
976
  "epoch": 0.5560538116591929,
977
+ "grad_norm": 1.4092281762272858,
978
+ "learning_rate": 1.4683046753361521e-06,
979
+ "loss": 0.7044,
980
  "step": 1240
981
  },
982
  {
983
  "epoch": 0.5605381165919282,
984
+ "grad_norm": 1.4363381867810665,
985
+ "learning_rate": 1.4448349934564736e-06,
986
+ "loss": 0.7287,
987
  "step": 1250
988
  },
989
  {
990
  "epoch": 0.5650224215246636,
991
+ "grad_norm": 1.4913351223697051,
992
+ "learning_rate": 1.421378827938549e-06,
993
+ "loss": 0.7254,
994
  "step": 1260
995
  },
996
  {
997
  "epoch": 0.5695067264573991,
998
+ "grad_norm": 1.5096384680619075,
999
+ "learning_rate": 1.3979419259405563e-06,
1000
+ "loss": 0.7389,
1001
  "step": 1270
1002
  },
1003
  {
1004
  "epoch": 0.5739910313901345,
1005
+ "grad_norm": 1.3495144573299676,
1006
+ "learning_rate": 1.3745300299007856e-06,
1007
+ "loss": 0.7247,
1008
  "step": 1280
1009
  },
1010
  {
1011
  "epoch": 0.57847533632287,
1012
+ "grad_norm": 1.3641879848291365,
1013
+ "learning_rate": 1.3511488761306412e-06,
1014
+ "loss": 0.7312,
1015
  "step": 1290
1016
  },
1017
  {
1018
  "epoch": 0.5829596412556054,
1019
+ "grad_norm": 1.3879105033157129,
1020
+ "learning_rate": 1.3278041934091524e-06,
1021
+ "loss": 0.7477,
1022
+ "step": 1300
1023
+ },
1024
+ {
1025
+ "epoch": 0.5829596412556054,
1026
+ "eval_loss": 0.7287724018096924,
1027
+ "eval_runtime": 406.882,
1028
+ "eval_samples_per_second": 123.075,
1029
+ "eval_steps_per_second": 1.924,
1030
  "step": 1300
1031
  },
1032
  {
1033
  "epoch": 0.5874439461883408,
1034
+ "grad_norm": 1.3916697284582622,
1035
+ "learning_rate": 1.3045017015793217e-06,
1036
+ "loss": 0.7246,
1037
  "step": 1310
1038
  },
1039
  {
1040
  "epoch": 0.5919282511210763,
1041
+ "grad_norm": 1.4328511876779917,
1042
+ "learning_rate": 1.2812471101466687e-06,
1043
+ "loss": 0.7303,
1044
  "step": 1320
1045
  },
1046
  {
1047
  "epoch": 0.5964125560538116,
1048
+ "grad_norm": 1.4411092846252307,
1049
+ "learning_rate": 1.2580461168803038e-06,
1050
+ "loss": 0.7318,
1051
  "step": 1330
1052
  },
1053
  {
1054
  "epoch": 0.600896860986547,
1055
+ "grad_norm": 1.4703965551927338,
1056
+ "learning_rate": 1.2349044064168782e-06,
1057
+ "loss": 0.7375,
1058
  "step": 1340
1059
  },
1060
  {
1061
  "epoch": 0.6053811659192825,
1062
+ "grad_norm": 1.4319057117061509,
1063
+ "learning_rate": 1.21182764886775e-06,
1064
+ "loss": 0.7302,
1065
  "step": 1350
1066
  },
1067
  {
1068
  "epoch": 0.6098654708520179,
1069
+ "grad_norm": 1.5017976848926429,
1070
+ "learning_rate": 1.188821498429714e-06,
1071
+ "loss": 0.7262,
1072
  "step": 1360
1073
  },
1074
  {
1075
  "epoch": 0.6143497757847534,
1076
+ "grad_norm": 1.4553869576056546,
1077
+ "learning_rate": 1.165891591999626e-06,
1078
+ "loss": 0.7447,
1079
  "step": 1370
1080
  },
1081
  {
1082
  "epoch": 0.6188340807174888,
1083
+ "grad_norm": 1.4128744043127173,
1084
+ "learning_rate": 1.1430435477932646e-06,
1085
+ "loss": 0.7423,
1086
  "step": 1380
1087
  },
1088
  {
1089
  "epoch": 0.6233183856502242,
1090
+ "grad_norm": 1.3797159286061107,
1091
+ "learning_rate": 1.1202829639687785e-06,
1092
+ "loss": 0.744,
1093
  "step": 1390
1094
  },
1095
  {
1096
  "epoch": 0.6278026905829597,
1097
+ "grad_norm": 1.487304571595245,
1098
+ "learning_rate": 1.0976154172550408e-06,
1099
+ "loss": 0.7429,
1100
+ "step": 1400
1101
+ },
1102
+ {
1103
+ "epoch": 0.6278026905829597,
1104
+ "eval_loss": 0.7272571921348572,
1105
+ "eval_runtime": 406.7541,
1106
+ "eval_samples_per_second": 123.114,
1107
+ "eval_steps_per_second": 1.925,
1108
  "step": 1400
1109
  },
1110
  {
1111
  "epoch": 0.6322869955156951,
1112
+ "grad_norm": 1.544512062570189,
1113
+ "learning_rate": 1.0750464615852523e-06,
1114
+ "loss": 0.7251,
1115
  "step": 1410
1116
  },
1117
  {
1118
  "epoch": 0.6367713004484304,
1119
+ "grad_norm": 1.422563130817404,
1120
+ "learning_rate": 1.0525816267361398e-06,
1121
+ "loss": 0.712,
1122
  "step": 1420
1123
  },
1124
  {
1125
  "epoch": 0.6412556053811659,
1126
+ "grad_norm": 1.4937681764382644,
1127
+ "learning_rate": 1.0302264169730613e-06,
1128
+ "loss": 0.7203,
1129
  "step": 1430
1130
  },
1131
  {
1132
  "epoch": 0.6457399103139013,
1133
+ "grad_norm": 1.50738757049434,
1134
+ "learning_rate": 1.0079863097013722e-06,
1135
+ "loss": 0.7121,
1136
  "step": 1440
1137
  },
1138
  {
1139
  "epoch": 0.6502242152466368,
1140
+ "grad_norm": 1.286396172710849,
1141
+ "learning_rate": 9.85866754124367e-07,
1142
+ "loss": 0.7193,
1143
  "step": 1450
1144
  },
1145
  {
1146
  "epoch": 0.6547085201793722,
1147
+ "grad_norm": 1.4997539342741677,
1148
+ "learning_rate": 9.638731699081281e-07,
1149
+ "loss": 0.7288,
1150
  "step": 1460
1151
  },
1152
  {
1153
  "epoch": 0.6591928251121076,
1154
+ "grad_norm": 1.37434247409356,
1155
+ "learning_rate": 9.42010945853623e-07,
1156
+ "loss": 0.7597,
1157
  "step": 1470
1158
  },
1159
  {
1160
  "epoch": 0.6636771300448431,
1161
+ "grad_norm": 1.3869436283100607,
1162
+ "learning_rate": 9.202854385763502e-07,
1163
+ "loss": 0.7184,
1164
  "step": 1480
1165
  },
1166
  {
1167
  "epoch": 0.6681614349775785,
1168
+ "grad_norm": 1.3970067087387381,
1169
+ "learning_rate": 8.987019711938812e-07,
1170
+ "loss": 0.7326,
1171
  "step": 1490
1172
  },
1173
  {
1174
  "epoch": 0.672645739910314,
1175
+ "grad_norm": 1.553183464191494,
1176
+ "learning_rate": 8.772658320216047e-07,
1177
+ "loss": 0.7317,
1178
+ "step": 1500
1179
+ },
1180
+ {
1181
+ "epoch": 0.672645739910314,
1182
+ "eval_loss": 0.7256098389625549,
1183
+ "eval_runtime": 406.6132,
1184
+ "eval_samples_per_second": 123.156,
1185
+ "eval_steps_per_second": 1.926,
1186
  "step": 1500
1187
  },
1188
  {
1189
  "epoch": 0.6771300448430493,
1190
+ "grad_norm": 1.3357768297094936,
1191
+ "learning_rate": 8.55982273277002e-07,
1192
+ "loss": 0.7347,
1193
  "step": 1510
1194
  },
1195
  {
1196
  "epoch": 0.6816143497757847,
1197
+ "grad_norm": 1.3249788097985131,
1198
+ "learning_rate": 8.348565097927605e-07,
1199
+ "loss": 0.7496,
1200
  "step": 1520
1201
  },
1202
  {
1203
  "epoch": 0.6860986547085202,
1204
+ "grad_norm": 1.4578138220875878,
1205
+ "learning_rate": 8.13893717739056e-07,
1206
+ "loss": 0.7308,
1207
  "step": 1530
1208
  },
1209
  {
1210
  "epoch": 0.6905829596412556,
1211
+ "grad_norm": 1.3268077719441809,
1212
+ "learning_rate": 7.930990333553013e-07,
1213
+ "loss": 0.7094,
1214
  "step": 1540
1215
  },
1216
  {
1217
  "epoch": 0.695067264573991,
1218
+ "grad_norm": 1.47562182506043,
1219
+ "learning_rate": 7.72477551691678e-07,
1220
+ "loss": 0.697,
1221
  "step": 1550
1222
  },
1223
  {
1224
  "epoch": 0.6995515695067265,
1225
+ "grad_norm": 1.4850843190566259,
1226
+ "learning_rate": 7.520343253607677e-07,
1227
+ "loss": 0.7301,
1228
  "step": 1560
1229
  },
1230
  {
1231
  "epoch": 0.7040358744394619,
1232
+ "grad_norm": 1.5097763618083517,
1233
+ "learning_rate": 7.317743632995731e-07,
1234
+ "loss": 0.7217,
1235
  "step": 1570
1236
  },
1237
  {
1238
  "epoch": 0.7085201793721974,
1239
+ "grad_norm": 1.3914348509226637,
1240
+ "learning_rate": 7.117026295422425e-07,
1241
+ "loss": 0.6957,
1242
  "step": 1580
1243
  },
1244
  {
1245
  "epoch": 0.7130044843049327,
1246
+ "grad_norm": 1.5175208261545492,
1247
+ "learning_rate": 6.918240420038007e-07,
1248
+ "loss": 0.7317,
1249
  "step": 1590
1250
  },
1251
  {
1252
  "epoch": 0.7174887892376681,
1253
+ "grad_norm": 1.4947559578839034,
1254
+ "learning_rate": 6.721434712751745e-07,
1255
+ "loss": 0.7226,
1256
+ "step": 1600
1257
+ },
1258
+ {
1259
+ "epoch": 0.7174887892376681,
1260
+ "eval_loss": 0.7243176102638245,
1261
+ "eval_runtime": 406.7899,
1262
+ "eval_samples_per_second": 123.103,
1263
+ "eval_steps_per_second": 1.925,
1264
  "step": 1600
1265
  },
1266
  {
1267
  "epoch": 0.7219730941704036,
1268
+ "grad_norm": 1.5192098207309965,
1269
+ "learning_rate": 6.526657394298154e-07,
1270
+ "loss": 0.705,
1271
  "step": 1610
1272
  },
1273
  {
1274
  "epoch": 0.726457399103139,
1275
+ "grad_norm": 1.3665027387136646,
1276
+ "learning_rate": 6.333956188422088e-07,
1277
+ "loss": 0.706,
1278
  "step": 1620
1279
  },
1280
  {
1281
  "epoch": 0.7309417040358744,
1282
+ "grad_norm": 1.4974912840899435,
1283
+ "learning_rate": 6.143378310185643e-07,
1284
+ "loss": 0.6983,
1285
  "step": 1630
1286
  },
1287
  {
1288
  "epoch": 0.7354260089686099,
1289
+ "grad_norm": 1.5477574584643699,
1290
+ "learning_rate": 5.954970454399638e-07,
1291
+ "loss": 0.7252,
1292
  "step": 1640
1293
  },
1294
  {
1295
  "epoch": 0.7399103139013453,
1296
+ "grad_norm": 1.525090065151942,
1297
+ "learning_rate": 5.768778784182616e-07,
1298
+ "loss": 0.7087,
1299
  "step": 1650
1300
  },
1301
  {
1302
  "epoch": 0.7443946188340808,
1303
+ "grad_norm": 1.4837554579437873,
1304
+ "learning_rate": 5.584848919650069e-07,
1305
+ "loss": 0.7075,
1306
  "step": 1660
1307
  },
1308
  {
1309
  "epoch": 0.7488789237668162,
1310
+ "grad_norm": 1.3538329119260115,
1311
+ "learning_rate": 5.403225926736772e-07,
1312
+ "loss": 0.7057,
1313
  "step": 1670
1314
  },
1315
  {
1316
  "epoch": 0.7533632286995515,
1317
+ "grad_norm": 1.359895087573495,
1318
+ "learning_rate": 5.223954306154843e-07,
1319
+ "loss": 0.7306,
1320
  "step": 1680
1321
  },
1322
  {
1323
  "epoch": 0.757847533632287,
1324
+ "grad_norm": 1.4168148218595764,
1325
+ "learning_rate": 5.047077982490311e-07,
1326
+ "loss": 0.7424,
1327
  "step": 1690
1328
  },
1329
  {
1330
  "epoch": 0.7623318385650224,
1331
+ "grad_norm": 1.4815842671642683,
1332
+ "learning_rate": 4.872640293440861e-07,
1333
+ "loss": 0.695,
1334
+ "step": 1700
1335
+ },
1336
+ {
1337
+ "epoch": 0.7623318385650224,
1338
+ "eval_loss": 0.7233718633651733,
1339
+ "eval_runtime": 406.8015,
1340
+ "eval_samples_per_second": 123.099,
1341
+ "eval_steps_per_second": 1.925,
1342
  "step": 1700
1343
  },
1344
  {
1345
  "epoch": 0.7668161434977578,
1346
+ "grad_norm": 1.5501655544071418,
1347
+ "learning_rate": 4.7006839791973673e-07,
1348
+ "loss": 0.7327,
1349
  "step": 1710
1350
  },
1351
  {
1352
  "epoch": 0.7713004484304933,
1353
+ "grad_norm": 1.3834984705411,
1354
+ "learning_rate": 4.53125117197179e-07,
1355
+ "loss": 0.7245,
1356
  "step": 1720
1357
  },
1358
  {
1359
  "epoch": 0.7757847533632287,
1360
+ "grad_norm": 1.4041748328697374,
1361
+ "learning_rate": 4.364383385674112e-07,
1362
+ "loss": 0.7054,
1363
  "step": 1730
1364
  },
1365
  {
1366
  "epoch": 0.7802690582959642,
1367
+ "grad_norm": 1.443104622604103,
1368
+ "learning_rate": 4.2001215057407026e-07,
1369
+ "loss": 0.7037,
1370
  "step": 1740
1371
  },
1372
  {
1373
  "epoch": 0.7847533632286996,
1374
+ "grad_norm": 1.5632699202433824,
1375
+ "learning_rate": 4.038505779116687e-07,
1376
+ "loss": 0.705,
1377
  "step": 1750
1378
  },
1379
  {
1380
  "epoch": 0.7892376681614349,
1381
+ "grad_norm": 1.349615732583278,
1382
+ "learning_rate": 3.879575804394782e-07,
1383
+ "loss": 0.7071,
1384
  "step": 1760
1385
  },
1386
  {
1387
  "epoch": 0.7937219730941704,
1388
+ "grad_norm": 1.3657530768128234,
1389
+ "learning_rate": 3.7233705221129646e-07,
1390
+ "loss": 0.7273,
1391
  "step": 1770
1392
  },
1393
  {
1394
  "epoch": 0.7982062780269058,
1395
+ "grad_norm": 1.5107387856649341,
1396
+ "learning_rate": 3.569928205213354e-07,
1397
+ "loss": 0.6975,
1398
  "step": 1780
1399
  },
1400
  {
1401
  "epoch": 0.8026905829596412,
1402
+ "grad_norm": 1.4525568524987686,
1403
+ "learning_rate": 3.419286449664741e-07,
1404
+ "loss": 0.7095,
1405
  "step": 1790
1406
  },
1407
  {
1408
  "epoch": 0.8071748878923767,
1409
+ "grad_norm": 1.4847854049722584,
1410
+ "learning_rate": 3.2714821652508854e-07,
1411
+ "loss": 0.7167,
1412
+ "step": 1800
1413
+ },
1414
+ {
1415
+ "epoch": 0.8071748878923767,
1416
+ "eval_loss": 0.7225807309150696,
1417
+ "eval_runtime": 406.5326,
1418
+ "eval_samples_per_second": 123.181,
1419
+ "eval_steps_per_second": 1.926,
1420
  "step": 1800
1421
  },
1422
  {
1423
  "epoch": 0.8116591928251121,
1424
+ "grad_norm": 1.2447161837361285,
1425
+ "learning_rate": 3.126551566527036e-07,
1426
+ "loss": 0.7156,
1427
  "step": 1810
1428
  },
1429
  {
1430
  "epoch": 0.8161434977578476,
1431
+ "grad_norm": 1.4139333132454484,
1432
+ "learning_rate": 2.9845301639467284e-07,
1433
+ "loss": 0.7537,
1434
  "step": 1820
1435
  },
1436
  {
1437
  "epoch": 0.820627802690583,
1438
+ "grad_norm": 1.3663031642715642,
1439
+ "learning_rate": 2.8454527551611205e-07,
1440
+ "loss": 0.7238,
1441
  "step": 1830
1442
  },
1443
  {
1444
  "epoch": 0.8251121076233184,
1445
+ "grad_norm": 1.389263976301968,
1446
+ "learning_rate": 2.7093534164929904e-07,
1447
+ "loss": 0.738,
1448
  "step": 1840
1449
  },
1450
  {
1451
  "epoch": 0.8295964125560538,
1452
+ "grad_norm": 1.5068808968575202,
1453
+ "learning_rate": 2.576265494587458e-07,
1454
+ "loss": 0.7067,
1455
  "step": 1850
1456
  },
1457
  {
1458
  "epoch": 0.8340807174887892,
1459
+ "grad_norm": 1.4226178531466935,
1460
+ "learning_rate": 2.446221598241472e-07,
1461
+ "loss": 0.7143,
1462
  "step": 1860
1463
  },
1464
  {
1465
  "epoch": 0.8385650224215246,
1466
+ "grad_norm": 1.6881847148932905,
1467
+ "learning_rate": 2.319253590414132e-07,
1468
+ "loss": 0.7376,
1469
  "step": 1870
1470
  },
1471
  {
1472
  "epoch": 0.8430493273542601,
1473
+ "grad_norm": 1.4353283330892004,
1474
+ "learning_rate": 2.1953925804197056e-07,
1475
+ "loss": 0.7095,
1476
  "step": 1880
1477
  },
1478
  {
1479
  "epoch": 0.8475336322869955,
1480
+ "grad_norm": 1.4639605071750654,
1481
+ "learning_rate": 2.0746689163053113e-07,
1482
+ "loss": 0.7102,
1483
  "step": 1890
1484
  },
1485
  {
1486
  "epoch": 0.852017937219731,
1487
+ "grad_norm": 1.458703799588621,
1488
+ "learning_rate": 1.9571121774151545e-07,
1489
+ "loss": 0.686,
1490
+ "step": 1900
1491
+ },
1492
+ {
1493
+ "epoch": 0.852017937219731,
1494
+ "eval_loss": 0.7220604419708252,
1495
+ "eval_runtime": 406.5609,
1496
+ "eval_samples_per_second": 123.172,
1497
+ "eval_steps_per_second": 1.926,
1498
  "step": 1900
1499
  },
1500
  {
1501
  "epoch": 0.8565022421524664,
1502
+ "grad_norm": 1.470148783910905,
1503
+ "learning_rate": 1.8427511671430757e-07,
1504
+ "loss": 0.72,
1505
  "step": 1910
1506
  },
1507
  {
1508
  "epoch": 0.8609865470852018,
1509
+ "grad_norm": 1.3891242748262451,
1510
+ "learning_rate": 1.7316139058752194e-07,
1511
+ "loss": 0.7318,
1512
  "step": 1920
1513
  },
1514
  {
1515
  "epoch": 0.8654708520179372,
1516
+ "grad_norm": 1.2245069775705093,
1517
+ "learning_rate": 1.6237276241245867e-07,
1518
+ "loss": 0.7155,
1519
  "step": 1930
1520
  },
1521
  {
1522
  "epoch": 0.8699551569506726,
1523
+ "grad_norm": 1.360510189488915,
1524
+ "learning_rate": 1.519118755859084e-07,
1525
+ "loss": 0.7255,
1526
  "step": 1940
1527
  },
1528
  {
1529
  "epoch": 0.874439461883408,
1530
+ "grad_norm": 1.495119615923585,
1531
+ "learning_rate": 1.4178129320247486e-07,
1532
+ "loss": 0.7484,
1533
  "step": 1950
1534
  },
1535
  {
1536
  "epoch": 0.8789237668161435,
1537
+ "grad_norm": 1.3674856635367474,
1538
+ "learning_rate": 1.31983497426575e-07,
1539
+ "loss": 0.7366,
1540
  "step": 1960
1541
  },
1542
  {
1543
  "epoch": 0.8834080717488789,
1544
+ "grad_norm": 1.4494730150421093,
1545
+ "learning_rate": 1.2252088888426431e-07,
1546
+ "loss": 0.742,
1547
  "step": 1970
1548
  },
1549
  {
1550
  "epoch": 0.8878923766816144,
1551
+ "grad_norm": 1.4368197978682802,
1552
+ "learning_rate": 1.1339578607504536e-07,
1553
+ "loss": 0.7269,
1554
  "step": 1980
1555
  },
1556
  {
1557
  "epoch": 0.8923766816143498,
1558
+ "grad_norm": 1.4017197990051706,
1559
+ "learning_rate": 1.0461042480379402e-07,
1560
+ "loss": 0.7234,
1561
  "step": 1990
1562
  },
1563
  {
1564
  "epoch": 0.8968609865470852,
1565
+ "grad_norm": 1.426560347266084,
1566
+ "learning_rate": 9.616695763295007e-08,
1567
+ "loss": 0.7214,
1568
+ "step": 2000
1569
+ },
1570
+ {
1571
+ "epoch": 0.8968609865470852,
1572
+ "eval_loss": 0.721759557723999,
1573
+ "eval_runtime": 406.5838,
1574
+ "eval_samples_per_second": 123.165,
1575
+ "eval_steps_per_second": 1.926,
1576
  "step": 2000
1577
  },
1578
  {
1579
  "epoch": 0.9013452914798207,
1580
+ "grad_norm": 1.489947255967281,
1581
+ "learning_rate": 8.806745335510297e-08,
1582
+ "loss": 0.7341,
1583
  "step": 2010
1584
  },
1585
  {
1586
  "epoch": 0.905829596412556,
1587
+ "grad_norm": 1.4312716003053576,
1588
+ "learning_rate": 8.031389648610266e-08,
1589
+ "loss": 0.7264,
1590
  "step": 2020
1591
  },
1592
  {
1593
  "epoch": 0.9103139013452914,
1594
+ "grad_norm": 1.4764400641380824,
1595
+ "learning_rate": 7.290818677881966e-08,
1596
+ "loss": 0.7301,
1597
  "step": 2030
1598
  },
1599
  {
1600
  "epoch": 0.9147982062780269,
1601
+ "grad_norm": 1.4381108917682341,
1602
+ "learning_rate": 6.585213875767305e-08,
1603
+ "loss": 0.6997,
1604
  "step": 2040
1605
  },
1606
  {
1607
  "epoch": 0.9192825112107623,
1608
+ "grad_norm": 1.459723127188453,
1609
+ "learning_rate": 5.914748127404102e-08,
1610
+ "loss": 0.7168,
1611
  "step": 2050
1612
  },
1613
  {
1614
  "epoch": 0.9237668161434978,
1615
+ "grad_norm": 1.5776619173541433,
1616
+ "learning_rate": 5.2795857082663655e-08,
1617
+ "loss": 0.72,
1618
  "step": 2060
1619
  },
1620
  {
1621
  "epoch": 0.9282511210762332,
1622
+ "grad_norm": 1.438610611700907,
1623
+ "learning_rate": 4.6798822439140185e-08,
1624
+ "loss": 0.7035,
1625
  "step": 2070
1626
  },
1627
  {
1628
  "epoch": 0.9327354260089686,
1629
+ "grad_norm": 1.4350411032390504,
1630
+ "learning_rate": 4.115784671861916e-08,
1631
+ "loss": 0.735,
1632
  "step": 2080
1633
  },
1634
  {
1635
  "epoch": 0.9372197309417041,
1636
+ "grad_norm": 1.4822578142933729,
1637
+ "learning_rate": 3.587431205577713e-08,
1638
+ "loss": 0.7178,
1639
  "step": 2090
1640
  },
1641
  {
1642
  "epoch": 0.9417040358744395,
1643
+ "grad_norm": 1.5001233187138816,
1644
+ "learning_rate": 3.0949513006172325e-08,
1645
+ "loss": 0.7358,
1646
+ "step": 2100
1647
+ },
1648
+ {
1649
+ "epoch": 0.9417040358744395,
1650
+ "eval_loss": 0.7216091752052307,
1651
+ "eval_runtime": 406.6258,
1652
+ "eval_samples_per_second": 123.153,
1653
+ "eval_steps_per_second": 1.926,
1654
  "step": 2100
1655
  },
1656
  {
1657
  "epoch": 0.9461883408071748,
1658
+ "grad_norm": 1.4457564058059627,
1659
+ "learning_rate": 2.6384656229056946e-08,
1660
+ "loss": 0.7285,
1661
  "step": 2110
1662
  },
1663
  {
1664
  "epoch": 0.9506726457399103,
1665
+ "grad_norm": 1.6789172768348999,
1666
+ "learning_rate": 2.218086019172394e-08,
1667
+ "loss": 0.7027,
1668
  "step": 2120
1669
  },
1670
  {
1671
  "epoch": 0.9551569506726457,
1672
+ "grad_norm": 1.4039832008414181,
1673
+ "learning_rate": 1.8339154895464894e-08,
1674
+ "loss": 0.7285,
1675
  "step": 2130
1676
  },
1677
  {
1678
  "epoch": 0.9596412556053812,
1679
+ "grad_norm": 1.7674026844330886,
1680
+ "learning_rate": 1.4860481623201417e-08,
1681
+ "loss": 0.713,
1682
  "step": 2140
1683
  },
1684
  {
1685
  "epoch": 0.9641255605381166,
1686
+ "grad_norm": 1.531580121339593,
1687
+ "learning_rate": 1.1745692708855282e-08,
1688
+ "loss": 0.7328,
1689
  "step": 2150
1690
  },
1691
  {
1692
  "epoch": 0.968609865470852,
1693
+ "grad_norm": 1.455884868550825,
1694
+ "learning_rate": 8.99555132851232e-09,
1695
+ "loss": 0.7196,
1696
  "step": 2160
1697
  },
1698
  {
1699
  "epoch": 0.9730941704035875,
1700
+ "grad_norm": 1.3157536936429735,
1701
+ "learning_rate": 6.610731313430318e-09,
1702
+ "loss": 0.7277,
1703
  "step": 2170
1704
  },
1705
  {
1706
  "epoch": 0.9775784753363229,
1707
+ "grad_norm": 1.5586404477319191,
1708
+ "learning_rate": 4.5918169849406e-09,
1709
+ "loss": 0.7265,
1710
  "step": 2180
1711
  },
1712
  {
1713
  "epoch": 0.9820627802690582,
1714
+ "grad_norm": 1.3596393082767964,
1715
+ "learning_rate": 2.939303011277872e-09,
1716
+ "loss": 0.719,
1717
  "step": 2190
1718
  },
1719
  {
1720
  "epoch": 0.9865470852017937,
1721
+ "grad_norm": 1.3866642718972106,
1722
+ "learning_rate": 1.6535942863788456e-09,
1723
+ "loss": 0.7259,
1724
+ "step": 2200
1725
+ },
1726
+ {
1727
+ "epoch": 0.9865470852017937,
1728
+ "eval_loss": 0.7215752005577087,
1729
+ "eval_runtime": 408.9437,
1730
+ "eval_samples_per_second": 122.455,
1731
+ "eval_steps_per_second": 1.915,
1732
  "step": 2200
1733
  },
1734
  {
1735
  "epoch": 0.9910313901345291,
1736
+ "grad_norm": 1.6643780128489514,
1737
+ "learning_rate": 7.350058306764273e-10,
1738
+ "loss": 0.7044,
1739
  "step": 2210
1740
  },
1741
  {
1742
  "epoch": 0.9955156950672646,
1743
+ "grad_norm": 1.428221428067804,
1744
+ "learning_rate": 1.8376271391412624e-10,
1745
+ "loss": 0.7109,
1746
  "step": 2220
1747
  },
1748
  {
1749
  "epoch": 1.0,
1750
+ "grad_norm": 1.3882910125414851,
1751
  "learning_rate": 0.0,
1752
+ "loss": 0.7123,
 
 
 
 
 
 
 
 
1753
  "step": 2230
1754
  },
1755
  {
1756
  "epoch": 1.0,
1757
  "step": 2230,
1758
+ "total_flos": 250303561007104.0,
1759
+ "train_loss": 0.7492096503219262,
1760
+ "train_runtime": 18007.2993,
1761
+ "train_samples_per_second": 15.851,
1762
+ "train_steps_per_second": 0.124
1763
  }
1764
  ],
1765
  "logging_steps": 10,
 
1779
  "attributes": {}
1780
  }
1781
  },
1782
+ "total_flos": 250303561007104.0,
1783
  "train_batch_size": 8,
1784
  "trial_name": null,
1785
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fb25fa7c42df808b4748bec88dee1a8e16e0082e24daff6298a752d2dca4e443
3
  size 6968
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c10bdb747e6aa533e359cc0a3925f648df006fc2f6bd836e9cca6e77438744b9
3
  size 6968