OpenLeecher committed on
Commit 48d80b5
1 Parent(s): 3e3886a

End of training

Files changed (5)
  1. README.md +2 -1
  2. all_results.json +12 -0
  3. eval_results.json +7 -0
  4. train_results.json +8 -0
  5. trainer_state.json +612 -0
README.md CHANGED
@@ -4,6 +4,7 @@ license: llama3
 base_model: meta-llama/Meta-Llama-3-8B
 tags:
 - llama-factory
+- full
 - generated_from_trainer
 model-index:
 - name: llama_8b_lima_8
@@ -15,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
 
 # llama_8b_lima_8
 
-This model is a fine-tuned version of [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) on the None dataset.
+This model is a fine-tuned version of [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) on the open_webui_dataset dataset.
 It achieves the following results on the evaluation set:
 - Loss: 0.7044
 
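The updated model card describes a full fine-tune, so the checkpoint loads like any other causal LM with transformers. A minimal usage sketch, assuming the repository id `OpenLeecher/llama_8b_lima_8` (inferred from the commit author and the model-index name; it is not stated in this diff):

```python
# Usage sketch, not taken from this repository's docs.
# The repo id is assumed; the dtype/device settings are only illustrative defaults for an 8B model.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "OpenLeecher/llama_8b_lima_8"  # hypothetical id, adjust if the model lives elsewhere

tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    torch_dtype=torch.bfloat16,  # pick a dtype your hardware supports
    device_map="auto",
)

prompt = "Explain the difference between pretraining and supervised fine-tuning in two sentences."
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```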
all_results.json ADDED
@@ -0,0 +1,12 @@
+ {
+   "epoch": 1.0,
+   "eval_loss": 0.7043666839599609,
+   "eval_runtime": 8.0832,
+   "eval_samples_per_second": 27.217,
+   "eval_steps_per_second": 4.577,
+   "total_flos": 2.093861581933773e+16,
+   "train_loss": 0.6267224325452532,
+   "train_runtime": 1749.357,
+   "train_samples_per_second": 3.601,
+   "train_steps_per_second": 0.2
+ }
eval_results.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "epoch": 1.0,
+   "eval_loss": 0.7043666839599609,
+   "eval_runtime": 8.0832,
+   "eval_samples_per_second": 27.217,
+   "eval_steps_per_second": 4.577
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+   "epoch": 1.0,
+   "total_flos": 2.093861581933773e+16,
+   "train_loss": 0.6267224325452532,
+   "train_runtime": 1749.357,
+   "train_samples_per_second": 3.601,
+   "train_steps_per_second": 0.2
+ }
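eval_results.json and train_results.json split the same metrics that all_results.json aggregates. A small sketch, assuming the files sit in the working directory, that reads them back and sanity-checks the throughput figures:

```python
# Sketch: load the result files added in this commit and cross-check throughput.
import json

with open("train_results.json") as f:
    train = json.load(f)
with open("eval_results.json") as f:
    eval_metrics = json.load(f)

print(f"final eval_loss: {eval_metrics['eval_loss']:.4f}")   # 0.7044
print(f"train_runtime:   {train['train_runtime']:.1f} s")    # 1749.4 s

# samples/s * runtime ~ number of training samples processed
approx_samples = train["train_samples_per_second"] * train["train_runtime"]
print(f"~{approx_samples:.0f} samples seen over {train['epoch']:.0f} epoch(s)")
```

With the values above this works out to roughly 3.601 × 1749.357 ≈ 6.3k training samples over the single epoch.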
trainer_state.json ADDED
@@ -0,0 +1,612 @@
+ {
+   "best_metric": null,
+   "best_model_checkpoint": null,
+   "epoch": 1.0,
+   "eval_steps": 35,
+   "global_step": 350,
+   "is_hyper_param_search": false,
+   "is_local_process_zero": true,
+   "is_world_process_zero": true,
+   "log_history": [
+     {"epoch": 0.014285714285714285, "grad_norm": 118.65539307904277, "learning_rate": 1.6e-06, "loss": 0.8722, "step": 5},
+     {"epoch": 0.02857142857142857, "grad_norm": 38.484858029114484, "learning_rate": 3.2e-06, "loss": 0.7825, "step": 10},
+     {"epoch": 0.04285714285714286, "grad_norm": 88.4372189754248, "learning_rate": 4.8e-06, "loss": 0.7646, "step": 15},
+     {"epoch": 0.05714285714285714, "grad_norm": 41.455396892104744, "learning_rate": 6.4e-06, "loss": 0.7226, "step": 20},
+     {"epoch": 0.07142857142857142, "grad_norm": 4.738758549704674, "learning_rate": 8e-06, "loss": 0.6666, "step": 25},
+     {"epoch": 0.08571428571428572, "grad_norm": 10.5176134179179, "learning_rate": 7.771005917159763e-06, "loss": 0.7263, "step": 30},
+     {"epoch": 0.1, "grad_norm": 11.488275336430554, "learning_rate": 7.545562130177514e-06, "loss": 0.6211, "step": 35},
+     {"epoch": 0.1, "eval_loss": 0.9106850624084473, "eval_runtime": 8.373, "eval_samples_per_second": 26.275, "eval_steps_per_second": 4.419, "step": 35},
+     {"epoch": 0.11428571428571428, "grad_norm": 18.92714653332412, "learning_rate": 7.323668639053254e-06, "loss": 0.6989, "step": 40},
+     {"epoch": 0.12857142857142856, "grad_norm": 22.493765084792706, "learning_rate": 7.105325443786982e-06, "loss": 0.704, "step": 45},
+     {"epoch": 0.14285714285714285, "grad_norm": 211.59677798517507, "learning_rate": 6.890532544378699e-06, "loss": 0.6654, "step": 50},
+     {"epoch": 0.15714285714285714, "grad_norm": 160.57396424563427, "learning_rate": 6.679289940828402e-06, "loss": 0.7127, "step": 55},
+     {"epoch": 0.17142857142857143, "grad_norm": 8.199790596273626, "learning_rate": 6.471597633136094e-06, "loss": 0.6253, "step": 60},
+     {"epoch": 0.18571428571428572, "grad_norm": 50.8343087092876, "learning_rate": 6.2674556213017745e-06, "loss": 0.6973, "step": 65},
+     {"epoch": 0.2, "grad_norm": 99.89969831226269, "learning_rate": 6.066863905325444e-06, "loss": 0.6976, "step": 70},
+     {"epoch": 0.2, "eval_loss": 0.852588951587677, "eval_runtime": 8.1201, "eval_samples_per_second": 27.093, "eval_steps_per_second": 4.557, "step": 70},
+     {"epoch": 0.21428571428571427, "grad_norm": 68.18379393534644, "learning_rate": 5.869822485207101e-06, "loss": 0.6194, "step": 75},
+     {"epoch": 0.22857142857142856, "grad_norm": 10.735154673670019, "learning_rate": 5.676331360946745e-06, "loss": 0.6906, "step": 80},
+     {"epoch": 0.24285714285714285, "grad_norm": 9.131803561236161, "learning_rate": 5.486390532544378e-06, "loss": 0.6401, "step": 85},
+     {"epoch": 0.2571428571428571, "grad_norm": 3.7707612659287766, "learning_rate": 5.300000000000002e-06, "loss": 0.6326, "step": 90},
+     {"epoch": 0.2714285714285714, "grad_norm": 8.70600675802851, "learning_rate": 5.1171597633136094e-06, "loss": 0.6019, "step": 95},
+     {"epoch": 0.2857142857142857, "grad_norm": 7.499128841456364, "learning_rate": 4.9378698224852065e-06, "loss": 0.6404, "step": 100},
+     {"epoch": 0.3, "grad_norm": 5.203399746039487, "learning_rate": 4.762130177514793e-06, "loss": 0.5762, "step": 105},
+     {"epoch": 0.3, "eval_loss": 0.7999083399772644, "eval_runtime": 8.0634, "eval_samples_per_second": 27.284, "eval_steps_per_second": 4.589, "step": 105},
+     {"epoch": 0.3142857142857143, "grad_norm": 4.900592118020277, "learning_rate": 4.589940828402367e-06, "loss": 0.7544, "step": 110},
+     {"epoch": 0.32857142857142857, "grad_norm": 6.583803116646508, "learning_rate": 4.421301775147928e-06, "loss": 0.6494, "step": 115},
+     {"epoch": 0.34285714285714286, "grad_norm": 2.8434894878977968, "learning_rate": 4.2562130177514784e-06, "loss": 0.6542, "step": 120},
+     {"epoch": 0.35714285714285715, "grad_norm": 3.3996307039750926, "learning_rate": 4.094674556213017e-06, "loss": 0.6631, "step": 125},
+     {"epoch": 0.37142857142857144, "grad_norm": 5.094034483082871, "learning_rate": 3.936686390532545e-06, "loss": 0.6086, "step": 130},
+     {"epoch": 0.38571428571428573, "grad_norm": 3.8103990086460757, "learning_rate": 3.7822485207100586e-06, "loss": 0.5935, "step": 135},
+     {"epoch": 0.4, "grad_norm": 4.098923421731261, "learning_rate": 3.631360946745561e-06, "loss": 0.5226, "step": 140},
+     {"epoch": 0.4, "eval_loss": 0.7717307209968567, "eval_runtime": 8.04, "eval_samples_per_second": 27.363, "eval_steps_per_second": 4.602, "step": 140},
+     {"epoch": 0.4142857142857143, "grad_norm": 3.3521626999923875, "learning_rate": 3.484023668639053e-06, "loss": 0.5967, "step": 145},
+     {"epoch": 0.42857142857142855, "grad_norm": 4.761409736989179, "learning_rate": 3.3402366863905327e-06, "loss": 0.6379, "step": 150},
+     {"epoch": 0.44285714285714284, "grad_norm": 5.189823443764368, "learning_rate": 3.1999999999999994e-06, "loss": 0.69, "step": 155},
+     {"epoch": 0.45714285714285713, "grad_norm": 3.4259629095344963, "learning_rate": 3.0633136094674547e-06, "loss": 0.6177, "step": 160},
+     {"epoch": 0.4714285714285714, "grad_norm": 6.564952365844063, "learning_rate": 2.930177514792899e-06, "loss": 0.6451, "step": 165},
+     {"epoch": 0.4857142857142857, "grad_norm": 3.8605502771298346, "learning_rate": 2.8005917159763313e-06, "loss": 0.5706, "step": 170},
+     {"epoch": 0.5, "grad_norm": 4.052127550985865, "learning_rate": 2.674556213017751e-06, "loss": 0.5866, "step": 175},
+     {"epoch": 0.5, "eval_loss": 0.7432886958122253, "eval_runtime": 8.0548, "eval_samples_per_second": 27.313, "eval_steps_per_second": 4.594, "step": 175},
+     {"epoch": 0.5142857142857142, "grad_norm": 3.3881437035921675, "learning_rate": 2.5520710059171586e-06, "loss": 0.6236, "step": 180},
+     {"epoch": 0.5285714285714286, "grad_norm": 3.6324500058620783, "learning_rate": 2.4331360946745558e-06, "loss": 0.614, "step": 185},
+     {"epoch": 0.5428571428571428, "grad_norm": 3.6721733069689377, "learning_rate": 2.317751479289941e-06, "loss": 0.5696, "step": 190},
+     {"epoch": 0.5571428571428572, "grad_norm": 3.2065483170410944, "learning_rate": 2.2059171597633135e-06, "loss": 0.6017, "step": 195},
+     {"epoch": 0.5714285714285714, "grad_norm": 3.694107283298511, "learning_rate": 2.0976331360946745e-06, "loss": 0.5215, "step": 200},
+     {"epoch": 0.5857142857142857, "grad_norm": 3.3237350491778606, "learning_rate": 1.9928994082840233e-06, "loss": 0.6335, "step": 205},
+     {"epoch": 0.6, "grad_norm": 5.7877669531548595, "learning_rate": 1.891715976331361e-06, "loss": 0.5999, "step": 210},
+     {"epoch": 0.6, "eval_loss": 0.7398412227630615, "eval_runtime": 8.0412, "eval_samples_per_second": 27.359, "eval_steps_per_second": 4.601, "step": 210},
+     {"epoch": 0.6142857142857143, "grad_norm": 4.758098459635445, "learning_rate": 1.7940828402366863e-06, "loss": 0.6837, "step": 215},
+     {"epoch": 0.6285714285714286, "grad_norm": 2.672565209825915, "learning_rate": 1.7000000000000002e-06, "loss": 0.6247, "step": 220},
+     {"epoch": 0.6428571428571429, "grad_norm": 3.9151505842097594, "learning_rate": 1.6094674556213014e-06, "loss": 0.5437, "step": 225},
+     {"epoch": 0.6571428571428571, "grad_norm": 4.93388663600263, "learning_rate": 1.5224852071005916e-06, "loss": 0.6034, "step": 230},
+     {"epoch": 0.6714285714285714, "grad_norm": 2.9614002133596316, "learning_rate": 1.4390532544378696e-06, "loss": 0.5687, "step": 235},
+     {"epoch": 0.6857142857142857, "grad_norm": 3.738349914157848, "learning_rate": 1.3591715976331362e-06, "loss": 0.6636, "step": 240},
+     {"epoch": 0.7, "grad_norm": 3.8772829249948852, "learning_rate": 1.2828402366863903e-06, "loss": 0.6464, "step": 245},
+     {"epoch": 0.7, "eval_loss": 0.7203609943389893, "eval_runtime": 8.0382, "eval_samples_per_second": 27.369, "eval_steps_per_second": 4.603, "step": 245},
+     {"epoch": 0.7142857142857143, "grad_norm": 2.972054074443536, "learning_rate": 1.2100591715976333e-06, "loss": 0.6081, "step": 250},
+     {"epoch": 0.7285714285714285, "grad_norm": 2.8758679138381202, "learning_rate": 1.1408284023668636e-06, "loss": 0.5808, "step": 255},
+     {"epoch": 0.7428571428571429, "grad_norm": 3.959990245370596, "learning_rate": 1.0751479289940828e-06, "loss": 0.6201, "step": 260},
+     {"epoch": 0.7571428571428571, "grad_norm": 3.5665367608826632, "learning_rate": 1.0130177514792898e-06, "loss": 0.5034, "step": 265},
+     {"epoch": 0.7714285714285715, "grad_norm": 4.028636856853669, "learning_rate": 9.544378698224853e-07, "loss": 0.5932, "step": 270},
+     {"epoch": 0.7857142857142857, "grad_norm": 4.258016842155138, "learning_rate": 8.994082840236684e-07, "loss": 0.5742, "step": 275},
+     {"epoch": 0.8, "grad_norm": 3.5334205284517024, "learning_rate": 8.479289940828401e-07, "loss": 0.552, "step": 280},
+     {"epoch": 0.8, "eval_loss": 0.7106707692146301, "eval_runtime": 8.068, "eval_samples_per_second": 27.268, "eval_steps_per_second": 4.586, "step": 280},
+     {"epoch": 0.8142857142857143, "grad_norm": 6.70792314615095, "learning_rate": 7.999999999999998e-07, "loss": 0.6199, "step": 285},
+     {"epoch": 0.8285714285714286, "grad_norm": 3.3423971423196983, "learning_rate": 7.556213017751479e-07, "loss": 0.5638, "step": 290},
+     {"epoch": 0.8428571428571429, "grad_norm": 3.0750783422793497, "learning_rate": 7.147928994082838e-07, "loss": 0.5326, "step": 295},
+     {"epoch": 0.8571428571428571, "grad_norm": 4.7296945533927355, "learning_rate": 6.775147928994083e-07, "loss": 0.6127, "step": 300},
+     {"epoch": 0.8714285714285714, "grad_norm": 3.833725884188028, "learning_rate": 6.437869822485206e-07, "loss": 0.5634, "step": 305},
+     {"epoch": 0.8857142857142857, "grad_norm": 3.359034703004336, "learning_rate": 6.136094674556213e-07, "loss": 0.5592, "step": 310},
+     {"epoch": 0.9, "grad_norm": 3.1893425724586857, "learning_rate": 5.869822485207099e-07, "loss": 0.5657, "step": 315},
+     {"epoch": 0.9, "eval_loss": 0.7130246758460999, "eval_runtime": 8.0419, "eval_samples_per_second": 27.357, "eval_steps_per_second": 4.601, "step": 315},
+     {"epoch": 0.9142857142857143, "grad_norm": 3.3324510082581087, "learning_rate": 5.63905325443787e-07, "loss": 0.6271, "step": 320},
+     {"epoch": 0.9285714285714286, "grad_norm": 3.4362159022422736, "learning_rate": 5.443786982248519e-07, "loss": 0.5349, "step": 325},
+     {"epoch": 0.9428571428571428, "grad_norm": 4.804926939525648, "learning_rate": 5.284023668639053e-07, "loss": 0.7169, "step": 330},
+     {"epoch": 0.9571428571428572, "grad_norm": 4.408287391472689, "learning_rate": 5.159763313609466e-07, "loss": 0.606, "step": 335},
+     {"epoch": 0.9714285714285714, "grad_norm": 5.066493752587581, "learning_rate": 5.071005917159763e-07, "loss": 0.5669, "step": 340},
+     {"epoch": 0.9857142857142858, "grad_norm": 8.178486160006328, "learning_rate": 5.01775147928994e-07, "loss": 0.5541, "step": 345},
+     {"epoch": 1.0, "grad_norm": 3.7353925732066693, "learning_rate": 5e-07, "loss": 0.5687, "step": 350},
+     {"epoch": 1.0, "eval_loss": 0.7043666839599609, "eval_runtime": 8.0404, "eval_samples_per_second": 27.362, "eval_steps_per_second": 4.602, "step": 350},
+     {"epoch": 1.0, "step": 350, "total_flos": 2.093861581933773e+16, "train_loss": 0.6267224325452532, "train_runtime": 1749.357, "train_samples_per_second": 3.601, "train_steps_per_second": 0.2}
+   ],
+   "logging_steps": 5,
+   "max_steps": 350,
+   "num_input_tokens_seen": 0,
+   "num_train_epochs": 1,
+   "save_steps": 350,
+   "stateful_callbacks": {
+     "TrainerControl": {
+       "args": {
+         "should_epoch_stop": false,
+         "should_evaluate": false,
+         "should_log": false,
+         "should_save": true,
+         "should_training_stop": true
+       },
+       "attributes": {}
+     }
+   },
+   "total_flos": 2.093861581933773e+16,
+   "train_batch_size": 3,
+   "trial_name": null,
+   "trial_params": null
+ }
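trainer_state.json interleaves training entries (loss, learning_rate, grad_norm every 5 steps) and evaluation entries (eval_loss every 35 steps) in the single log_history list. A short sketch, assuming the file is in the working directory, for pulling the two series apart, e.g. to plot the curves:

```python
# Sketch: separate the training and evaluation series from trainer_state.json.
import json

with open("trainer_state.json") as f:
    state = json.load(f)

# Training entries carry "loss"; evaluation entries carry "eval_loss";
# the final summary entry carries "train_loss" and is skipped by both filters.
train_curve = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
eval_curve = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]

print("train points:", len(train_curve))   # 70 (every 5 steps up to 350)
print("eval points:", len(eval_curve))     # 10 (every 35 steps)
print("last eval_loss:", eval_curve[-1])   # (350, 0.7043666839599609)
```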