BilelDJ committed on
Commit
1719c1d
1 Parent(s): 395a56a

Training in progress, step 110

Browse files
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e469f4e5e0aebde708d69422bfec5a756da896a8d5d32ccbed422c7319a0700f
3
  size 605156676
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:352172f076d6a58ebf2fc85e11b1cbc041ccf3a96a3ea3e2f8b1e6a9c0fcf049
3
  size 605156676
run-wgja68lo/checkpoint-110/config.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "openai/clip-vit-base-patch32",
3
+ "architectures": [
4
+ "CLIPModel"
5
+ ],
6
+ "initializer_factor": 1.0,
7
+ "logit_scale_init_value": 2.6592,
8
+ "model_type": "clip",
9
+ "projection_dim": 512,
10
+ "text_config": {
11
+ "bos_token_id": 0,
12
+ "dropout": 0.0,
13
+ "eos_token_id": 2,
14
+ "model_type": "clip_text_model"
15
+ },
16
+ "torch_dtype": "float32",
17
+ "transformers_version": "4.42.0.dev0",
18
+ "vision_config": {
19
+ "dropout": 0.0,
20
+ "model_type": "clip_vision_model"
21
+ }
22
+ }
run-wgja68lo/checkpoint-110/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:352172f076d6a58ebf2fc85e11b1cbc041ccf3a96a3ea3e2f8b1e6a9c0fcf049
3
+ size 605156676
run-wgja68lo/checkpoint-110/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:74bf7e861a9d3432e4f142b660c2df80ad4249de87e94e287f3c471e652d336a
3
+ size 1210551612
run-wgja68lo/checkpoint-110/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:16629df2819a8d44b43da4938948e4713f44200e43c923a476715285313928f0
3
+ size 14244
run-wgja68lo/checkpoint-110/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d663d07e1f9020b5d7720335f45763a001928feb06edd02fcff9e48232361a86
3
+ size 1064
run-wgja68lo/checkpoint-110/trainer_state.json ADDED
@@ -0,0 +1,865 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 2.0,
5
+ "eval_steps": 2,
6
+ "global_step": 110,
7
+ "is_hyper_param_search": true,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.03636363636363636,
13
+ "grad_norm": 156.3038787841797,
14
+ "learning_rate": 0.0002480103771818548,
15
+ "loss": 3.8254,
16
+ "step": 2
17
+ },
18
+ {
19
+ "epoch": 0.03636363636363636,
20
+ "eval_loss": 4.441679000854492,
21
+ "eval_runtime": 138.5108,
22
+ "eval_samples_per_second": 2.527,
23
+ "eval_steps_per_second": 0.043,
24
+ "step": 2
25
+ },
26
+ {
27
+ "epoch": 0.07272727272727272,
28
+ "grad_norm": 121.79210662841797,
29
+ "learning_rate": 0.00024341759241922788,
30
+ "loss": 5.6376,
31
+ "step": 4
32
+ },
33
+ {
34
+ "epoch": 0.07272727272727272,
35
+ "eval_loss": 6.213077545166016,
36
+ "eval_runtime": 169.0449,
37
+ "eval_samples_per_second": 2.07,
38
+ "eval_steps_per_second": 0.035,
39
+ "step": 4
40
+ },
41
+ {
42
+ "epoch": 0.10909090909090909,
43
+ "grad_norm": 3.2695116996765137,
44
+ "learning_rate": 0.00023882480765660094,
45
+ "loss": 5.484,
46
+ "step": 6
47
+ },
48
+ {
49
+ "epoch": 0.10909090909090909,
50
+ "eval_loss": 4.189974308013916,
51
+ "eval_runtime": 150.2603,
52
+ "eval_samples_per_second": 2.329,
53
+ "eval_steps_per_second": 0.04,
54
+ "step": 6
55
+ },
56
+ {
57
+ "epoch": 0.14545454545454545,
58
+ "grad_norm": 0.20998883247375488,
59
+ "learning_rate": 0.00023423202289397398,
60
+ "loss": 4.212,
61
+ "step": 8
62
+ },
63
+ {
64
+ "epoch": 0.14545454545454545,
65
+ "eval_loss": 4.095343112945557,
66
+ "eval_runtime": 108.8466,
67
+ "eval_samples_per_second": 3.216,
68
+ "eval_steps_per_second": 0.055,
69
+ "step": 8
70
+ },
71
+ {
72
+ "epoch": 0.18181818181818182,
73
+ "grad_norm": 0.05422870069742203,
74
+ "learning_rate": 0.00022963923813134704,
75
+ "loss": 4.16,
76
+ "step": 10
77
+ },
78
+ {
79
+ "epoch": 0.18181818181818182,
80
+ "eval_loss": 4.094566822052002,
81
+ "eval_runtime": 117.133,
82
+ "eval_samples_per_second": 2.988,
83
+ "eval_steps_per_second": 0.051,
84
+ "step": 10
85
+ },
86
+ {
87
+ "epoch": 0.21818181818181817,
88
+ "grad_norm": 0.043678443878889084,
89
+ "learning_rate": 0.0002250464533687201,
90
+ "loss": 4.1595,
91
+ "step": 12
92
+ },
93
+ {
94
+ "epoch": 0.21818181818181817,
95
+ "eval_loss": 4.094362258911133,
96
+ "eval_runtime": 115.9567,
97
+ "eval_samples_per_second": 3.018,
98
+ "eval_steps_per_second": 0.052,
99
+ "step": 12
100
+ },
101
+ {
102
+ "epoch": 0.2545454545454545,
103
+ "grad_norm": 0.024278217926621437,
104
+ "learning_rate": 0.00022045366860609316,
105
+ "loss": 4.1593,
106
+ "step": 14
107
+ },
108
+ {
109
+ "epoch": 0.2545454545454545,
110
+ "eval_loss": 4.09407377243042,
111
+ "eval_runtime": 108.3601,
112
+ "eval_samples_per_second": 3.23,
113
+ "eval_steps_per_second": 0.055,
114
+ "step": 14
115
+ },
116
+ {
117
+ "epoch": 0.2909090909090909,
118
+ "grad_norm": 0.00787164457142353,
119
+ "learning_rate": 0.00021586088384346623,
120
+ "loss": 4.159,
121
+ "step": 16
122
+ },
123
+ {
124
+ "epoch": 0.2909090909090909,
125
+ "eval_loss": 4.094001770019531,
126
+ "eval_runtime": 108.9106,
127
+ "eval_samples_per_second": 3.214,
128
+ "eval_steps_per_second": 0.055,
129
+ "step": 16
130
+ },
131
+ {
132
+ "epoch": 0.32727272727272727,
133
+ "grad_norm": 0.008814062923192978,
134
+ "learning_rate": 0.0002112680990808393,
135
+ "loss": 4.159,
136
+ "step": 18
137
+ },
138
+ {
139
+ "epoch": 0.32727272727272727,
140
+ "eval_loss": 4.094032287597656,
141
+ "eval_runtime": 108.5561,
142
+ "eval_samples_per_second": 3.224,
143
+ "eval_steps_per_second": 0.055,
144
+ "step": 18
145
+ },
146
+ {
147
+ "epoch": 0.36363636363636365,
148
+ "grad_norm": 0.011641773395240307,
149
+ "learning_rate": 0.00020667531431821235,
150
+ "loss": 4.159,
151
+ "step": 20
152
+ },
153
+ {
154
+ "epoch": 0.36363636363636365,
155
+ "eval_loss": 4.094035625457764,
156
+ "eval_runtime": 109.9627,
157
+ "eval_samples_per_second": 3.183,
158
+ "eval_steps_per_second": 0.055,
159
+ "step": 20
160
+ },
161
+ {
162
+ "epoch": 0.4,
163
+ "grad_norm": 0.010468115098774433,
164
+ "learning_rate": 0.0002020825295555854,
165
+ "loss": 4.159,
166
+ "step": 22
167
+ },
168
+ {
169
+ "epoch": 0.4,
170
+ "eval_loss": 4.093992233276367,
171
+ "eval_runtime": 109.2553,
172
+ "eval_samples_per_second": 3.204,
173
+ "eval_steps_per_second": 0.055,
174
+ "step": 22
175
+ },
176
+ {
177
+ "epoch": 0.43636363636363634,
178
+ "grad_norm": 0.004853196442127228,
179
+ "learning_rate": 0.00019748974479295847,
180
+ "loss": 4.1589,
181
+ "step": 24
182
+ },
183
+ {
184
+ "epoch": 0.43636363636363634,
185
+ "eval_loss": 4.093957901000977,
186
+ "eval_runtime": 110.1754,
187
+ "eval_samples_per_second": 3.177,
188
+ "eval_steps_per_second": 0.054,
189
+ "step": 24
190
+ },
191
+ {
192
+ "epoch": 0.4727272727272727,
193
+ "grad_norm": 0.003641755087301135,
194
+ "learning_rate": 0.00019289696003033154,
195
+ "loss": 4.1589,
196
+ "step": 26
197
+ },
198
+ {
199
+ "epoch": 0.4727272727272727,
200
+ "eval_loss": 4.093954563140869,
201
+ "eval_runtime": 109.1144,
202
+ "eval_samples_per_second": 3.208,
203
+ "eval_steps_per_second": 0.055,
204
+ "step": 26
205
+ },
206
+ {
207
+ "epoch": 0.509090909090909,
208
+ "grad_norm": 0.004993860609829426,
209
+ "learning_rate": 0.0001883041752677046,
210
+ "loss": 4.1589,
211
+ "step": 28
212
+ },
213
+ {
214
+ "epoch": 0.509090909090909,
215
+ "eval_loss": 4.093963146209717,
216
+ "eval_runtime": 201.6876,
217
+ "eval_samples_per_second": 1.735,
218
+ "eval_steps_per_second": 0.03,
219
+ "step": 28
220
+ },
221
+ {
222
+ "epoch": 0.5454545454545454,
223
+ "grad_norm": 0.005196776241064072,
224
+ "learning_rate": 0.00018371139050507766,
225
+ "loss": 4.1589,
226
+ "step": 30
227
+ },
228
+ {
229
+ "epoch": 0.5454545454545454,
230
+ "eval_loss": 4.093966007232666,
231
+ "eval_runtime": 118.6265,
232
+ "eval_samples_per_second": 2.95,
233
+ "eval_steps_per_second": 0.051,
234
+ "step": 30
235
+ },
236
+ {
237
+ "epoch": 0.5818181818181818,
238
+ "grad_norm": 0.0044463989324867725,
239
+ "learning_rate": 0.0001791186057424507,
240
+ "loss": 4.1589,
241
+ "step": 32
242
+ },
243
+ {
244
+ "epoch": 0.5818181818181818,
245
+ "eval_loss": 4.093958377838135,
246
+ "eval_runtime": 115.5925,
247
+ "eval_samples_per_second": 3.028,
248
+ "eval_steps_per_second": 0.052,
249
+ "step": 32
250
+ },
251
+ {
252
+ "epoch": 0.6181818181818182,
253
+ "grad_norm": 0.0034679854288697243,
254
+ "learning_rate": 0.00017452582097982376,
255
+ "loss": 4.1589,
256
+ "step": 34
257
+ },
258
+ {
259
+ "epoch": 0.6181818181818182,
260
+ "eval_loss": 4.093946933746338,
261
+ "eval_runtime": 146.8875,
262
+ "eval_samples_per_second": 2.383,
263
+ "eval_steps_per_second": 0.041,
264
+ "step": 34
265
+ },
266
+ {
267
+ "epoch": 0.6545454545454545,
268
+ "grad_norm": 0.002400787780061364,
269
+ "learning_rate": 0.00016993303621719682,
270
+ "loss": 4.1589,
271
+ "step": 36
272
+ },
273
+ {
274
+ "epoch": 0.6545454545454545,
275
+ "eval_loss": 4.0939412117004395,
276
+ "eval_runtime": 149.1826,
277
+ "eval_samples_per_second": 2.346,
278
+ "eval_steps_per_second": 0.04,
279
+ "step": 36
280
+ },
281
+ {
282
+ "epoch": 0.6909090909090909,
283
+ "grad_norm": 0.001346707926131785,
284
+ "learning_rate": 0.00016534025145456988,
285
+ "loss": 4.1589,
286
+ "step": 38
287
+ },
288
+ {
289
+ "epoch": 0.6909090909090909,
290
+ "eval_loss": 4.093942642211914,
291
+ "eval_runtime": 107.8083,
292
+ "eval_samples_per_second": 3.247,
293
+ "eval_steps_per_second": 0.056,
294
+ "step": 38
295
+ },
296
+ {
297
+ "epoch": 0.7272727272727273,
298
+ "grad_norm": 0.002782245399430394,
299
+ "learning_rate": 0.00016074746669194294,
300
+ "loss": 4.1589,
301
+ "step": 40
302
+ },
303
+ {
304
+ "epoch": 0.7272727272727273,
305
+ "eval_loss": 4.093945503234863,
306
+ "eval_runtime": 154.0308,
307
+ "eval_samples_per_second": 2.272,
308
+ "eval_steps_per_second": 0.039,
309
+ "step": 40
310
+ },
311
+ {
312
+ "epoch": 0.7636363636363637,
313
+ "grad_norm": 0.00260904454626143,
314
+ "learning_rate": 0.000156154681929316,
315
+ "loss": 4.1589,
316
+ "step": 42
317
+ },
318
+ {
319
+ "epoch": 0.7636363636363637,
320
+ "eval_loss": 4.093945503234863,
321
+ "eval_runtime": 112.4159,
322
+ "eval_samples_per_second": 3.113,
323
+ "eval_steps_per_second": 0.053,
324
+ "step": 42
325
+ },
326
+ {
327
+ "epoch": 0.8,
328
+ "grad_norm": 0.002523690229281783,
329
+ "learning_rate": 0.00015156189716668904,
330
+ "loss": 4.1589,
331
+ "step": 44
332
+ },
333
+ {
334
+ "epoch": 0.8,
335
+ "eval_loss": 4.093942642211914,
336
+ "eval_runtime": 124.943,
337
+ "eval_samples_per_second": 2.801,
338
+ "eval_steps_per_second": 0.048,
339
+ "step": 44
340
+ },
341
+ {
342
+ "epoch": 0.8363636363636363,
343
+ "grad_norm": 0.002061552833765745,
344
+ "learning_rate": 0.0001469691124040621,
345
+ "loss": 4.1589,
346
+ "step": 46
347
+ },
348
+ {
349
+ "epoch": 0.8363636363636363,
350
+ "eval_loss": 4.093940258026123,
351
+ "eval_runtime": 108.0115,
352
+ "eval_samples_per_second": 3.24,
353
+ "eval_steps_per_second": 0.056,
354
+ "step": 46
355
+ },
356
+ {
357
+ "epoch": 0.8727272727272727,
358
+ "grad_norm": 0.0015735819470137358,
359
+ "learning_rate": 0.00014237632764143516,
360
+ "loss": 4.1589,
361
+ "step": 48
362
+ },
363
+ {
364
+ "epoch": 0.8727272727272727,
365
+ "eval_loss": 4.093937873840332,
366
+ "eval_runtime": 115.9852,
367
+ "eval_samples_per_second": 3.018,
368
+ "eval_steps_per_second": 0.052,
369
+ "step": 48
370
+ },
371
+ {
372
+ "epoch": 0.9090909090909091,
373
+ "grad_norm": 0.001207723980769515,
374
+ "learning_rate": 0.00013778354287880822,
375
+ "loss": 4.1589,
376
+ "step": 50
377
+ },
378
+ {
379
+ "epoch": 0.9090909090909091,
380
+ "eval_loss": 4.093936443328857,
381
+ "eval_runtime": 155.9244,
382
+ "eval_samples_per_second": 2.245,
383
+ "eval_steps_per_second": 0.038,
384
+ "step": 50
385
+ },
386
+ {
387
+ "epoch": 0.9454545454545454,
388
+ "grad_norm": 0.001054088119417429,
389
+ "learning_rate": 0.00013319075811618129,
390
+ "loss": 4.1589,
391
+ "step": 52
392
+ },
393
+ {
394
+ "epoch": 0.9454545454545454,
395
+ "eval_loss": 4.093936443328857,
396
+ "eval_runtime": 136.0542,
397
+ "eval_samples_per_second": 2.573,
398
+ "eval_steps_per_second": 0.044,
399
+ "step": 52
400
+ },
401
+ {
402
+ "epoch": 0.9818181818181818,
403
+ "grad_norm": 0.0010858627501875162,
404
+ "learning_rate": 0.00012859797335355435,
405
+ "loss": 4.1589,
406
+ "step": 54
407
+ },
408
+ {
409
+ "epoch": 0.9818181818181818,
410
+ "eval_loss": 4.093936443328857,
411
+ "eval_runtime": 110.1319,
412
+ "eval_samples_per_second": 3.178,
413
+ "eval_steps_per_second": 0.054,
414
+ "step": 54
415
+ },
416
+ {
417
+ "epoch": 1.018181818181818,
418
+ "grad_norm": 0.001234299037605524,
419
+ "learning_rate": 0.0001240051885909274,
420
+ "loss": 3.96,
421
+ "step": 56
422
+ },
423
+ {
424
+ "epoch": 1.018181818181818,
425
+ "eval_loss": 4.093935966491699,
426
+ "eval_runtime": 118.108,
427
+ "eval_samples_per_second": 2.963,
428
+ "eval_steps_per_second": 0.051,
429
+ "step": 56
430
+ },
431
+ {
432
+ "epoch": 1.0545454545454545,
433
+ "grad_norm": 0.0017606632318347692,
434
+ "learning_rate": 0.00011941240382830047,
435
+ "loss": 4.1588,
436
+ "step": 58
437
+ },
438
+ {
439
+ "epoch": 1.0545454545454545,
440
+ "eval_loss": 4.093935012817383,
441
+ "eval_runtime": 111.0724,
442
+ "eval_samples_per_second": 3.151,
443
+ "eval_steps_per_second": 0.054,
444
+ "step": 58
445
+ },
446
+ {
447
+ "epoch": 1.0909090909090908,
448
+ "grad_norm": 0.0012147607048973441,
449
+ "learning_rate": 0.00011481961906567352,
450
+ "loss": 4.1589,
451
+ "step": 60
452
+ },
453
+ {
454
+ "epoch": 1.0909090909090908,
455
+ "eval_loss": 4.093934535980225,
456
+ "eval_runtime": 123.0725,
457
+ "eval_samples_per_second": 2.844,
458
+ "eval_steps_per_second": 0.049,
459
+ "step": 60
460
+ },
461
+ {
462
+ "epoch": 1.1272727272727272,
463
+ "grad_norm": 0.0010563035029917955,
464
+ "learning_rate": 0.00011022683430304658,
465
+ "loss": 4.1589,
466
+ "step": 62
467
+ },
468
+ {
469
+ "epoch": 1.1272727272727272,
470
+ "eval_loss": 4.093933582305908,
471
+ "eval_runtime": 130.8184,
472
+ "eval_samples_per_second": 2.675,
473
+ "eval_steps_per_second": 0.046,
474
+ "step": 62
475
+ },
476
+ {
477
+ "epoch": 1.1636363636363636,
478
+ "grad_norm": 0.000990673084743321,
479
+ "learning_rate": 0.00010563404954041964,
480
+ "loss": 4.1589,
481
+ "step": 64
482
+ },
483
+ {
484
+ "epoch": 1.1636363636363636,
485
+ "eval_loss": 4.093933582305908,
486
+ "eval_runtime": 130.543,
487
+ "eval_samples_per_second": 2.681,
488
+ "eval_steps_per_second": 0.046,
489
+ "step": 64
490
+ },
491
+ {
492
+ "epoch": 1.2,
493
+ "grad_norm": 0.0012123591732233763,
494
+ "learning_rate": 0.0001010412647777927,
495
+ "loss": 4.1589,
496
+ "step": 66
497
+ },
498
+ {
499
+ "epoch": 1.2,
500
+ "eval_loss": 4.09393310546875,
501
+ "eval_runtime": 153.4481,
502
+ "eval_samples_per_second": 2.281,
503
+ "eval_steps_per_second": 0.039,
504
+ "step": 66
505
+ },
506
+ {
507
+ "epoch": 1.2363636363636363,
508
+ "grad_norm": 0.0011688098311424255,
509
+ "learning_rate": 9.644848001516577e-05,
510
+ "loss": 4.1589,
511
+ "step": 68
512
+ },
513
+ {
514
+ "epoch": 1.2363636363636363,
515
+ "eval_loss": 4.093932151794434,
516
+ "eval_runtime": 154.5442,
517
+ "eval_samples_per_second": 2.265,
518
+ "eval_steps_per_second": 0.039,
519
+ "step": 68
520
+ },
521
+ {
522
+ "epoch": 1.2727272727272727,
523
+ "grad_norm": 0.0015955213457345963,
524
+ "learning_rate": 9.185569525253883e-05,
525
+ "loss": 4.1589,
526
+ "step": 70
527
+ },
528
+ {
529
+ "epoch": 1.2727272727272727,
530
+ "eval_loss": 4.093931674957275,
531
+ "eval_runtime": 122.1283,
532
+ "eval_samples_per_second": 2.866,
533
+ "eval_steps_per_second": 0.049,
534
+ "step": 70
535
+ },
536
+ {
537
+ "epoch": 1.309090909090909,
538
+ "grad_norm": 0.001243259641341865,
539
+ "learning_rate": 8.726291048991188e-05,
540
+ "loss": 4.1589,
541
+ "step": 72
542
+ },
543
+ {
544
+ "epoch": 1.309090909090909,
545
+ "eval_loss": 4.093931198120117,
546
+ "eval_runtime": 123.0656,
547
+ "eval_samples_per_second": 2.844,
548
+ "eval_steps_per_second": 0.049,
549
+ "step": 72
550
+ },
551
+ {
552
+ "epoch": 1.3454545454545455,
553
+ "grad_norm": 0.001263299840502441,
554
+ "learning_rate": 8.267012572728494e-05,
555
+ "loss": 4.1589,
556
+ "step": 74
557
+ },
558
+ {
559
+ "epoch": 1.3454545454545455,
560
+ "eval_loss": 4.093930721282959,
561
+ "eval_runtime": 115.4851,
562
+ "eval_samples_per_second": 3.031,
563
+ "eval_steps_per_second": 0.052,
564
+ "step": 74
565
+ },
566
+ {
567
+ "epoch": 1.3818181818181818,
568
+ "grad_norm": 0.0013924150262027979,
569
+ "learning_rate": 7.8077340964658e-05,
570
+ "loss": 4.1588,
571
+ "step": 76
572
+ },
573
+ {
574
+ "epoch": 1.3818181818181818,
575
+ "eval_loss": 4.093929767608643,
576
+ "eval_runtime": 139.7759,
577
+ "eval_samples_per_second": 2.504,
578
+ "eval_steps_per_second": 0.043,
579
+ "step": 76
580
+ },
581
+ {
582
+ "epoch": 1.4181818181818182,
583
+ "grad_norm": 0.001308809733018279,
584
+ "learning_rate": 7.348455620203105e-05,
585
+ "loss": 4.1588,
586
+ "step": 78
587
+ },
588
+ {
589
+ "epoch": 1.4181818181818182,
590
+ "eval_loss": 4.093928337097168,
591
+ "eval_runtime": 110.3532,
592
+ "eval_samples_per_second": 3.172,
593
+ "eval_steps_per_second": 0.054,
594
+ "step": 78
595
+ },
596
+ {
597
+ "epoch": 1.4545454545454546,
598
+ "grad_norm": 0.00145295774564147,
599
+ "learning_rate": 6.889177143940411e-05,
600
+ "loss": 4.1588,
601
+ "step": 80
602
+ },
603
+ {
604
+ "epoch": 1.4545454545454546,
605
+ "eval_loss": 4.093928337097168,
606
+ "eval_runtime": 110.6229,
607
+ "eval_samples_per_second": 3.164,
608
+ "eval_steps_per_second": 0.054,
609
+ "step": 80
610
+ },
611
+ {
612
+ "epoch": 1.490909090909091,
613
+ "grad_norm": 0.0014558923430740833,
614
+ "learning_rate": 6.429898667677717e-05,
615
+ "loss": 4.1588,
616
+ "step": 82
617
+ },
618
+ {
619
+ "epoch": 1.490909090909091,
620
+ "eval_loss": 4.09392786026001,
621
+ "eval_runtime": 151.2194,
622
+ "eval_samples_per_second": 2.315,
623
+ "eval_steps_per_second": 0.04,
624
+ "step": 82
625
+ },
626
+ {
627
+ "epoch": 1.5272727272727273,
628
+ "grad_norm": 0.0015545696951448917,
629
+ "learning_rate": 5.9706201914150236e-05,
630
+ "loss": 4.1588,
631
+ "step": 84
632
+ },
633
+ {
634
+ "epoch": 1.5272727272727273,
635
+ "eval_loss": 4.093926906585693,
636
+ "eval_runtime": 131.734,
637
+ "eval_samples_per_second": 2.657,
638
+ "eval_steps_per_second": 0.046,
639
+ "step": 84
640
+ },
641
+ {
642
+ "epoch": 1.5636363636363635,
643
+ "grad_norm": 0.0016734582604840398,
644
+ "learning_rate": 5.511341715152329e-05,
645
+ "loss": 4.1588,
646
+ "step": 86
647
+ },
648
+ {
649
+ "epoch": 1.5636363636363635,
650
+ "eval_loss": 4.093926429748535,
651
+ "eval_runtime": 109.3531,
652
+ "eval_samples_per_second": 3.201,
653
+ "eval_steps_per_second": 0.055,
654
+ "step": 86
655
+ },
656
+ {
657
+ "epoch": 1.6,
658
+ "grad_norm": 0.0014604291645810008,
659
+ "learning_rate": 5.052063238889635e-05,
660
+ "loss": 4.1589,
661
+ "step": 88
662
+ },
663
+ {
664
+ "epoch": 1.6,
665
+ "eval_loss": 4.093925952911377,
666
+ "eval_runtime": 109.4141,
667
+ "eval_samples_per_second": 3.199,
668
+ "eval_steps_per_second": 0.055,
669
+ "step": 88
670
+ },
671
+ {
672
+ "epoch": 1.6363636363636362,
673
+ "grad_norm": 0.0015530511736869812,
674
+ "learning_rate": 4.5927847626269415e-05,
675
+ "loss": 4.1588,
676
+ "step": 90
677
+ },
678
+ {
679
+ "epoch": 1.6363636363636362,
680
+ "eval_loss": 4.093925476074219,
681
+ "eval_runtime": 109.4483,
682
+ "eval_samples_per_second": 3.198,
683
+ "eval_steps_per_second": 0.055,
684
+ "step": 90
685
+ },
686
+ {
687
+ "epoch": 1.6727272727272728,
688
+ "grad_norm": 0.0015709660947322845,
689
+ "learning_rate": 4.133506286364247e-05,
690
+ "loss": 4.1589,
691
+ "step": 92
692
+ },
693
+ {
694
+ "epoch": 1.6727272727272728,
695
+ "eval_loss": 4.0939249992370605,
696
+ "eval_runtime": 122.4435,
697
+ "eval_samples_per_second": 2.858,
698
+ "eval_steps_per_second": 0.049,
699
+ "step": 92
700
+ },
701
+ {
702
+ "epoch": 1.709090909090909,
703
+ "grad_norm": 0.0014102915301918983,
704
+ "learning_rate": 3.6742278101015525e-05,
705
+ "loss": 4.1588,
706
+ "step": 94
707
+ },
708
+ {
709
+ "epoch": 1.709090909090909,
710
+ "eval_loss": 4.093924522399902,
711
+ "eval_runtime": 150.6921,
712
+ "eval_samples_per_second": 2.323,
713
+ "eval_steps_per_second": 0.04,
714
+ "step": 94
715
+ },
716
+ {
717
+ "epoch": 1.7454545454545456,
718
+ "grad_norm": 0.0015738351503387094,
719
+ "learning_rate": 3.214949333838859e-05,
720
+ "loss": 4.1588,
721
+ "step": 96
722
+ },
723
+ {
724
+ "epoch": 1.7454545454545456,
725
+ "eval_loss": 4.093923568725586,
726
+ "eval_runtime": 128.0266,
727
+ "eval_samples_per_second": 2.734,
728
+ "eval_steps_per_second": 0.047,
729
+ "step": 96
730
+ },
731
+ {
732
+ "epoch": 1.7818181818181817,
733
+ "grad_norm": 0.001314620254561305,
734
+ "learning_rate": 2.7556708575761645e-05,
735
+ "loss": 4.1588,
736
+ "step": 98
737
+ },
738
+ {
739
+ "epoch": 1.7818181818181817,
740
+ "eval_loss": 4.093923091888428,
741
+ "eval_runtime": 110.6007,
742
+ "eval_samples_per_second": 3.165,
743
+ "eval_steps_per_second": 0.054,
744
+ "step": 98
745
+ },
746
+ {
747
+ "epoch": 1.8181818181818183,
748
+ "grad_norm": 0.0014478659722954035,
749
+ "learning_rate": 2.2963923813134707e-05,
750
+ "loss": 4.1589,
751
+ "step": 100
752
+ },
753
+ {
754
+ "epoch": 1.8181818181818183,
755
+ "eval_loss": 4.0939226150512695,
756
+ "eval_runtime": 124.9794,
757
+ "eval_samples_per_second": 2.8,
758
+ "eval_steps_per_second": 0.048,
759
+ "step": 100
760
+ },
761
+ {
762
+ "epoch": 1.8545454545454545,
763
+ "grad_norm": 0.0018716281047090888,
764
+ "learning_rate": 1.8371139050507763e-05,
765
+ "loss": 4.1588,
766
+ "step": 102
767
+ },
768
+ {
769
+ "epoch": 1.8545454545454545,
770
+ "eval_loss": 4.093922138214111,
771
+ "eval_runtime": 108.7929,
772
+ "eval_samples_per_second": 3.217,
773
+ "eval_steps_per_second": 0.055,
774
+ "step": 102
775
+ },
776
+ {
777
+ "epoch": 1.8909090909090909,
778
+ "grad_norm": 0.0020189164206385612,
779
+ "learning_rate": 1.3778354287880823e-05,
780
+ "loss": 4.1588,
781
+ "step": 104
782
+ },
783
+ {
784
+ "epoch": 1.8909090909090909,
785
+ "eval_loss": 4.093922138214111,
786
+ "eval_runtime": 157.5843,
787
+ "eval_samples_per_second": 2.221,
788
+ "eval_steps_per_second": 0.038,
789
+ "step": 104
790
+ },
791
+ {
792
+ "epoch": 1.9272727272727272,
793
+ "grad_norm": 0.0016775216208770871,
794
+ "learning_rate": 9.185569525253881e-06,
795
+ "loss": 4.1588,
796
+ "step": 106
797
+ },
798
+ {
799
+ "epoch": 1.9272727272727272,
800
+ "eval_loss": 4.093922138214111,
801
+ "eval_runtime": 130.985,
802
+ "eval_samples_per_second": 2.672,
803
+ "eval_steps_per_second": 0.046,
804
+ "step": 106
805
+ },
806
+ {
807
+ "epoch": 1.9636363636363636,
808
+ "grad_norm": 0.0019341203151270747,
809
+ "learning_rate": 4.592784762626941e-06,
810
+ "loss": 4.1588,
811
+ "step": 108
812
+ },
813
+ {
814
+ "epoch": 1.9636363636363636,
815
+ "eval_loss": 4.093922138214111,
816
+ "eval_runtime": 183.7417,
817
+ "eval_samples_per_second": 1.905,
818
+ "eval_steps_per_second": 0.033,
819
+ "step": 108
820
+ },
821
+ {
822
+ "epoch": 2.0,
823
+ "grad_norm": 0.0019428413361310959,
824
+ "learning_rate": 0.0,
825
+ "loss": 3.96,
826
+ "step": 110
827
+ },
828
+ {
829
+ "epoch": 2.0,
830
+ "eval_loss": 4.093922138214111,
831
+ "eval_runtime": 121.5213,
832
+ "eval_samples_per_second": 2.88,
833
+ "eval_steps_per_second": 0.049,
834
+ "step": 110
835
+ }
836
+ ],
837
+ "logging_steps": 2,
838
+ "max_steps": 110,
839
+ "num_input_tokens_seen": 0,
840
+ "num_train_epochs": 2,
841
+ "save_steps": 500,
842
+ "stateful_callbacks": {
843
+ "TrainerControl": {
844
+ "args": {
845
+ "should_epoch_stop": false,
846
+ "should_evaluate": false,
847
+ "should_log": false,
848
+ "should_save": true,
849
+ "should_training_stop": true
850
+ },
851
+ "attributes": {}
852
+ }
853
+ },
854
+ "total_flos": 407052651766068.0,
855
+ "train_batch_size": 64,
856
+ "trial_name": null,
857
+ "trial_params": {
858
+ "_wandb": {},
859
+ "assignments": {},
860
+ "decay": 0.1,
861
+ "learning_rate": 0.00025260316194448176,
862
+ "metric": "eval/loss",
863
+ "per_device_train_batch_size": 64
864
+ }
865
+ }
run-wgja68lo/checkpoint-110/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f10dc52d742b958935413d6f7dc245cb806eaebf68214cd06171241af4d4a13
3
+ size 5112
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ec583036ee75123b3ad43a5af2bca340e80c9be115a3e221d0646ea35625512c
3
  size 5112
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f10dc52d742b958935413d6f7dc245cb806eaebf68214cd06171241af4d4a13
3
  size 5112