tiennguyenbnbk commited on
Commit
d4e7d77
1 Parent(s): 9248399

End of training

Browse files
Files changed (5) hide show
  1. README.md +5 -5
  2. all_results.json +14 -14
  3. test_results.json +9 -9
  4. train_results.json +6 -6
  5. trainer_state.json +502 -312
README.md CHANGED
@@ -18,11 +18,11 @@ should probably proofread and complete it, then remove this comment. -->
18
 
19
  This model is a fine-tuned version of [vinai/phobert-base-v2](https://huggingface.co/vinai/phobert-base-v2) on an unknown dataset.
20
  It achieves the following results on the evaluation set:
21
- - Loss: 0.4654
22
- - Accuracy: 0.9402
23
- - F1 Score: 0.9310
24
- - Recall: 0.9300
25
- - Precision: 0.9325
26
 
27
  ## Model description
28
 
 
18
 
19
  This model is a fine-tuned version of [vinai/phobert-base-v2](https://huggingface.co/vinai/phobert-base-v2) on an unknown dataset.
20
  It achieves the following results on the evaluation set:
21
+ - Loss: 0.4657
22
+ - Accuracy: 0.9407
23
+ - F1 Score: 0.9319
24
+ - Recall: 0.9326
25
+ - Precision: 0.9319
26
 
27
  ## Model description
28
 
all_results.json CHANGED
@@ -1,16 +1,16 @@
1
  {
2
- "epoch": 26.08695652173913,
3
- "eval_accuracy": 0.9323185648273987,
4
- "eval_f1_score": 0.9261695691084951,
5
- "eval_loss": 0.2873239815235138,
6
- "eval_precision": 0.9319834922740233,
7
- "eval_recall": 0.9216726996777184,
8
- "eval_runtime": 6.6463,
9
- "eval_samples_per_second": 553.538,
10
- "eval_steps_per_second": 8.727,
11
- "total_flos": 6579999363349350.0,
12
- "train_loss": 0.31640464369455973,
13
- "train_runtime": 3238.6183,
14
- "train_samples_per_second": 158.092,
15
- "train_steps_per_second": 1.235
16
  }
 
1
  {
2
+ "epoch": 34.78260869565217,
3
+ "eval_accuracy": 0.9407286568787384,
4
+ "eval_f1_score": 0.9319305111443131,
5
+ "eval_loss": 0.46566537022590637,
6
+ "eval_precision": 0.9319435555053062,
7
+ "eval_recall": 0.9325682397024007,
8
+ "eval_runtime": 6.6584,
9
+ "eval_samples_per_second": 552.385,
10
+ "eval_steps_per_second": 8.711,
11
+ "total_flos": 8662131210539100.0,
12
+ "train_loss": 0.4411096167564392,
13
+ "train_runtime": 3432.8242,
14
+ "train_samples_per_second": 149.148,
15
+ "train_steps_per_second": 1.165
16
  }
test_results.json CHANGED
@@ -1,11 +1,11 @@
1
  {
2
- "epoch": 26.08695652173913,
3
- "eval_accuracy": 0.9323185648273987,
4
- "eval_f1_score": 0.9261695691084951,
5
- "eval_loss": 0.2873239815235138,
6
- "eval_precision": 0.9319834922740233,
7
- "eval_recall": 0.9216726996777184,
8
- "eval_runtime": 6.6463,
9
- "eval_samples_per_second": 553.538,
10
- "eval_steps_per_second": 8.727
11
  }
 
1
  {
2
+ "epoch": 34.78260869565217,
3
+ "eval_accuracy": 0.9407286568787384,
4
+ "eval_f1_score": 0.9319305111443131,
5
+ "eval_loss": 0.46566537022590637,
6
+ "eval_precision": 0.9319435555053062,
7
+ "eval_recall": 0.9325682397024007,
8
+ "eval_runtime": 6.6584,
9
+ "eval_samples_per_second": 552.385,
10
+ "eval_steps_per_second": 8.711
11
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 26.08695652173913,
3
- "total_flos": 6579999363349350.0,
4
- "train_loss": 0.31640464369455973,
5
- "train_runtime": 3238.6183,
6
- "train_samples_per_second": 158.092,
7
- "train_steps_per_second": 1.235
8
  }
 
1
  {
2
+ "epoch": 34.78260869565217,
3
+ "total_flos": 8662131210539100.0,
4
+ "train_loss": 0.4411096167564392,
5
+ "train_runtime": 3432.8242,
6
+ "train_samples_per_second": 149.148,
7
+ "train_steps_per_second": 1.165
8
  }
trainer_state.json CHANGED
@@ -1,591 +1,781 @@
1
  {
2
- "best_metric": 0.9261695691084951,
3
- "best_model_checkpoint": "cls_comment-phobert-base-v2-v3.2.1/checkpoint-2000",
4
- "epoch": 26.08695652173913,
5
  "eval_steps": 100,
6
- "global_step": 3000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.8695652173913043,
13
- "grad_norm": 2.4986989498138428,
14
  "learning_rate": 2.5e-06,
15
- "loss": 1.8947,
16
  "step": 100
17
  },
18
  {
19
  "epoch": 0.8695652173913043,
20
- "eval_accuracy": 0.4001087251970644,
21
- "eval_f1_score": 0.08320411950694513,
22
- "eval_loss": 1.68748140335083,
23
- "eval_precision": 0.14637409036074248,
24
- "eval_recall": 0.14367816091954025,
25
- "eval_runtime": 6.6876,
26
- "eval_samples_per_second": 550.125,
27
- "eval_steps_per_second": 8.673,
28
  "step": 100
29
  },
30
  {
31
  "epoch": 1.7391304347826086,
32
- "grad_norm": 1.596021294593811,
33
  "learning_rate": 5e-06,
34
- "loss": 1.5395,
35
  "step": 200
36
  },
37
  {
38
  "epoch": 1.7391304347826086,
39
- "eval_accuracy": 0.5849415602065778,
40
- "eval_f1_score": 0.2355807809182458,
41
- "eval_loss": 1.2897096872329712,
42
- "eval_precision": 0.27516139357553443,
43
- "eval_recall": 0.26320585050663253,
44
- "eval_runtime": 6.7112,
45
- "eval_samples_per_second": 548.189,
46
- "eval_steps_per_second": 8.642,
47
  "step": 200
48
  },
49
  {
50
  "epoch": 2.608695652173913,
51
- "grad_norm": 5.161496162414551,
52
  "learning_rate": 7.500000000000001e-06,
53
- "loss": 1.1205,
54
  "step": 300
55
  },
56
  {
57
  "epoch": 2.608695652173913,
58
- "eval_accuracy": 0.7999456374014677,
59
- "eval_f1_score": 0.5833178086765388,
60
- "eval_loss": 0.8468331098556519,
61
- "eval_precision": 0.5889764394952819,
62
- "eval_recall": 0.5810488238671468,
63
- "eval_runtime": 6.6686,
64
- "eval_samples_per_second": 551.694,
65
- "eval_steps_per_second": 8.698,
66
  "step": 300
67
  },
68
  {
69
  "epoch": 3.4782608695652173,
70
- "grad_norm": 5.227330207824707,
71
  "learning_rate": 1e-05,
72
- "loss": 0.82,
73
  "step": 400
74
  },
75
  {
76
  "epoch": 3.4782608695652173,
77
- "eval_accuracy": 0.8369122044033704,
78
- "eval_f1_score": 0.6179371343772609,
79
- "eval_loss": 0.6537477374076843,
80
- "eval_precision": 0.6062100200393906,
81
- "eval_recall": 0.6355302315827523,
82
- "eval_runtime": 6.728,
83
- "eval_samples_per_second": 546.823,
84
- "eval_steps_per_second": 8.621,
85
  "step": 400
86
  },
87
  {
88
  "epoch": 4.3478260869565215,
89
- "grad_norm": 5.6816534996032715,
90
  "learning_rate": 9.722222222222223e-06,
91
- "loss": 0.6232,
92
  "step": 500
93
  },
94
  {
95
  "epoch": 4.3478260869565215,
96
- "eval_accuracy": 0.8537646099483556,
97
- "eval_f1_score": 0.633743239294036,
98
- "eval_loss": 0.537100613117218,
99
- "eval_precision": 0.7525070200257705,
100
- "eval_recall": 0.6518017678843925,
101
- "eval_runtime": 6.7932,
102
- "eval_samples_per_second": 541.573,
103
- "eval_steps_per_second": 8.538,
104
  "step": 500
105
  },
106
  {
107
  "epoch": 5.217391304347826,
108
- "grad_norm": 5.096814155578613,
109
  "learning_rate": 9.444444444444445e-06,
110
- "loss": 0.5148,
111
  "step": 600
112
  },
113
  {
114
  "epoch": 5.217391304347826,
115
- "eval_accuracy": 0.872791519434629,
116
- "eval_f1_score": 0.7299293979398146,
117
- "eval_loss": 0.46505650877952576,
118
- "eval_precision": 0.7548552896750885,
119
- "eval_recall": 0.7210618976649555,
120
- "eval_runtime": 6.7028,
121
- "eval_samples_per_second": 548.875,
122
- "eval_steps_per_second": 8.653,
123
  "step": 600
124
  },
125
  {
126
  "epoch": 6.086956521739131,
127
- "grad_norm": 5.458530902862549,
128
  "learning_rate": 9.166666666666666e-06,
129
- "loss": 0.4204,
130
  "step": 700
131
  },
132
  {
133
  "epoch": 6.086956521739131,
134
- "eval_accuracy": 0.8869257950530035,
135
- "eval_f1_score": 0.7654329783869755,
136
- "eval_loss": 0.40097591280937195,
137
- "eval_precision": 0.8914471413846636,
138
- "eval_recall": 0.7712133932759179,
139
- "eval_runtime": 6.7443,
140
- "eval_samples_per_second": 545.497,
141
- "eval_steps_per_second": 8.6,
142
  "step": 700
143
  },
144
  {
145
  "epoch": 6.956521739130435,
146
- "grad_norm": 6.144416809082031,
147
  "learning_rate": 8.888888888888888e-06,
148
- "loss": 0.3421,
149
  "step": 800
150
  },
151
  {
152
  "epoch": 6.956521739130435,
153
- "eval_accuracy": 0.9051372655612938,
154
- "eval_f1_score": 0.8713582894968701,
155
- "eval_loss": 0.3648029565811157,
156
- "eval_precision": 0.8940734807154502,
157
- "eval_recall": 0.8588405388993653,
158
- "eval_runtime": 6.7352,
159
- "eval_samples_per_second": 546.232,
160
- "eval_steps_per_second": 8.611,
161
  "step": 800
162
  },
163
  {
164
  "epoch": 7.826086956521739,
165
- "grad_norm": 9.907292366027832,
166
  "learning_rate": 8.611111111111112e-06,
167
- "loss": 0.2841,
168
  "step": 900
169
  },
170
  {
171
  "epoch": 7.826086956521739,
172
- "eval_accuracy": 0.9181842892090242,
173
- "eval_f1_score": 0.9006880118200489,
174
- "eval_loss": 0.3239505887031555,
175
- "eval_precision": 0.8978148514278343,
176
- "eval_recall": 0.9038252102525616,
177
- "eval_runtime": 6.7315,
178
- "eval_samples_per_second": 546.532,
179
- "eval_steps_per_second": 8.616,
180
  "step": 900
181
  },
182
  {
183
  "epoch": 8.695652173913043,
184
- "grad_norm": 6.941843032836914,
185
  "learning_rate": 8.333333333333334e-06,
186
- "loss": 0.2319,
187
  "step": 1000
188
  },
189
  {
190
  "epoch": 8.695652173913043,
191
- "eval_accuracy": 0.9203587931503125,
192
- "eval_f1_score": 0.9060754755748909,
193
- "eval_loss": 0.3025033473968506,
194
- "eval_precision": 0.9175362378163865,
195
- "eval_recall": 0.8975903509513042,
196
- "eval_runtime": 6.6908,
197
- "eval_samples_per_second": 549.861,
198
- "eval_steps_per_second": 8.669,
199
  "step": 1000
200
  },
201
  {
202
  "epoch": 9.565217391304348,
203
- "grad_norm": 4.255012035369873,
204
  "learning_rate": 8.055555555555557e-06,
205
- "loss": 0.205,
206
  "step": 1100
207
  },
208
  {
209
  "epoch": 9.565217391304348,
210
- "eval_accuracy": 0.9209024191356346,
211
- "eval_f1_score": 0.9098640550303895,
212
- "eval_loss": 0.29862046241760254,
213
- "eval_precision": 0.9123097696068861,
214
- "eval_recall": 0.9086287269577242,
215
- "eval_runtime": 6.7134,
216
- "eval_samples_per_second": 548.01,
217
- "eval_steps_per_second": 8.639,
218
  "step": 1100
219
  },
220
  {
221
  "epoch": 10.434782608695652,
222
- "grad_norm": 5.848569393157959,
223
  "learning_rate": 7.77777777777778e-06,
224
- "loss": 0.1783,
225
  "step": 1200
226
  },
227
  {
228
  "epoch": 10.434782608695652,
229
- "eval_accuracy": 0.9206306061429737,
230
- "eval_f1_score": 0.9104384776037051,
231
- "eval_loss": 0.3047122657299042,
232
- "eval_precision": 0.9024848857165658,
233
- "eval_recall": 0.9207396220750284,
234
- "eval_runtime": 6.6561,
235
- "eval_samples_per_second": 552.726,
236
- "eval_steps_per_second": 8.714,
237
  "step": 1200
238
  },
239
  {
240
  "epoch": 11.304347826086957,
241
- "grad_norm": 7.340043544769287,
242
  "learning_rate": 7.500000000000001e-06,
243
- "loss": 0.1587,
244
  "step": 1300
245
  },
246
  {
247
  "epoch": 11.304347826086957,
248
- "eval_accuracy": 0.9296004349007883,
249
- "eval_f1_score": 0.9202832724978299,
250
- "eval_loss": 0.2757803201675415,
251
- "eval_precision": 0.9233347498988893,
252
- "eval_recall": 0.917658614989255,
253
- "eval_runtime": 6.6787,
254
- "eval_samples_per_second": 550.859,
255
- "eval_steps_per_second": 8.684,
256
  "step": 1300
257
  },
258
  {
259
  "epoch": 12.173913043478262,
260
- "grad_norm": 5.315700054168701,
261
  "learning_rate": 7.222222222222223e-06,
262
- "loss": 0.1286,
263
  "step": 1400
264
  },
265
  {
266
  "epoch": 12.173913043478262,
267
- "eval_accuracy": 0.9266104919815167,
268
- "eval_f1_score": 0.9144278995332229,
269
- "eval_loss": 0.29267847537994385,
270
- "eval_precision": 0.9100638576136009,
271
- "eval_recall": 0.9198715425139269,
272
- "eval_runtime": 6.7676,
273
- "eval_samples_per_second": 543.619,
274
- "eval_steps_per_second": 8.57,
275
  "step": 1400
276
  },
277
  {
278
  "epoch": 13.043478260869565,
279
- "grad_norm": 5.173799514770508,
280
  "learning_rate": 6.944444444444445e-06,
281
- "loss": 0.1221,
282
  "step": 1500
283
  },
284
  {
285
  "epoch": 13.043478260869565,
286
- "eval_accuracy": 0.9317749388420766,
287
- "eval_f1_score": 0.9245023460604546,
288
- "eval_loss": 0.28211963176727295,
289
- "eval_precision": 0.9309417300478454,
290
- "eval_recall": 0.9193579289135766,
291
- "eval_runtime": 6.7091,
292
- "eval_samples_per_second": 548.359,
293
- "eval_steps_per_second": 8.645,
294
  "step": 1500
295
  },
296
  {
297
  "epoch": 13.91304347826087,
298
- "grad_norm": 8.639619827270508,
299
  "learning_rate": 6.666666666666667e-06,
300
- "loss": 0.1087,
301
  "step": 1600
302
  },
303
  {
304
  "epoch": 13.91304347826087,
305
- "eval_accuracy": 0.9293286219081273,
306
- "eval_f1_score": 0.9159607873769989,
307
- "eval_loss": 0.27890825271606445,
308
- "eval_precision": 0.9090390134661626,
309
- "eval_recall": 0.9236924050215896,
310
- "eval_runtime": 6.7017,
311
- "eval_samples_per_second": 548.966,
312
- "eval_steps_per_second": 8.655,
313
  "step": 1600
314
  },
315
  {
316
  "epoch": 14.782608695652174,
317
- "grad_norm": 6.424872398376465,
318
  "learning_rate": 6.3888888888888885e-06,
319
- "loss": 0.0982,
320
  "step": 1700
321
  },
322
  {
323
  "epoch": 14.782608695652174,
324
- "eval_accuracy": 0.9290568089154662,
325
- "eval_f1_score": 0.9196461825352048,
326
- "eval_loss": 0.2833573520183563,
327
- "eval_precision": 0.9187836187318232,
328
- "eval_recall": 0.9213402050339831,
329
- "eval_runtime": 6.7096,
330
- "eval_samples_per_second": 548.32,
331
- "eval_steps_per_second": 8.644,
332
  "step": 1700
333
  },
334
  {
335
  "epoch": 15.652173913043478,
336
- "grad_norm": 4.618613243103027,
337
  "learning_rate": 6.111111111111112e-06,
338
- "loss": 0.089,
339
  "step": 1800
340
  },
341
  {
342
  "epoch": 15.652173913043478,
343
- "eval_accuracy": 0.9298722478934494,
344
- "eval_f1_score": 0.9202166850732406,
345
- "eval_loss": 0.28280356526374817,
346
- "eval_precision": 0.9151663252588741,
347
- "eval_recall": 0.9260674008256092,
348
- "eval_runtime": 6.7345,
349
- "eval_samples_per_second": 546.292,
350
- "eval_steps_per_second": 8.612,
351
  "step": 1800
352
  },
353
  {
354
  "epoch": 16.52173913043478,
355
- "grad_norm": 1.9568698406219482,
356
  "learning_rate": 5.833333333333334e-06,
357
- "loss": 0.0795,
358
  "step": 1900
359
  },
360
  {
361
  "epoch": 16.52173913043478,
362
- "eval_accuracy": 0.9331340038053819,
363
- "eval_f1_score": 0.9244095368032713,
364
- "eval_loss": 0.273701936006546,
365
- "eval_precision": 0.925343846727414,
366
- "eval_recall": 0.9238732382441093,
367
- "eval_runtime": 6.7425,
368
- "eval_samples_per_second": 545.641,
369
- "eval_steps_per_second": 8.602,
370
  "step": 1900
371
  },
372
  {
373
  "epoch": 17.391304347826086,
374
- "grad_norm": 2.161759614944458,
375
  "learning_rate": 5.555555555555557e-06,
376
- "loss": 0.0684,
377
  "step": 2000
378
  },
379
  {
380
  "epoch": 17.391304347826086,
381
- "eval_accuracy": 0.9323185648273987,
382
- "eval_f1_score": 0.9261695691084951,
383
- "eval_loss": 0.2873239815235138,
384
- "eval_precision": 0.9319834922740233,
385
- "eval_recall": 0.9216726996777184,
386
- "eval_runtime": 6.7577,
387
- "eval_samples_per_second": 544.415,
388
- "eval_steps_per_second": 8.583,
389
  "step": 2000
390
  },
391
  {
392
  "epoch": 18.26086956521739,
393
- "grad_norm": 4.607916355133057,
394
  "learning_rate": 5.2777777777777785e-06,
395
- "loss": 0.0673,
396
  "step": 2100
397
  },
398
  {
399
  "epoch": 18.26086956521739,
400
- "eval_accuracy": 0.9320467518347377,
401
- "eval_f1_score": 0.925184613434992,
402
- "eval_loss": 0.2904324531555176,
403
- "eval_precision": 0.9332741752610002,
404
- "eval_recall": 0.9184445089519725,
405
- "eval_runtime": 6.7294,
406
- "eval_samples_per_second": 546.702,
407
- "eval_steps_per_second": 8.619,
408
  "step": 2100
409
  },
410
  {
411
  "epoch": 19.130434782608695,
412
- "grad_norm": 6.327251434326172,
413
  "learning_rate": 5e-06,
414
- "loss": 0.0571,
415
  "step": 2200
416
  },
417
  {
418
  "epoch": 19.130434782608695,
419
- "eval_accuracy": 0.9293286219081273,
420
- "eval_f1_score": 0.9221668516434853,
421
- "eval_loss": 0.3166205883026123,
422
- "eval_precision": 0.925137476734381,
423
- "eval_recall": 0.920952609526737,
424
- "eval_runtime": 6.7037,
425
- "eval_samples_per_second": 548.799,
426
- "eval_steps_per_second": 8.652,
427
  "step": 2200
428
  },
429
  {
430
  "epoch": 20.0,
431
- "grad_norm": 9.082805633544922,
432
  "learning_rate": 4.722222222222222e-06,
433
- "loss": 0.0561,
434
  "step": 2300
435
  },
436
  {
437
  "epoch": 20.0,
438
- "eval_accuracy": 0.9317749388420766,
439
- "eval_f1_score": 0.9221280725480369,
440
- "eval_loss": 0.2922111749649048,
441
- "eval_precision": 0.9150274852978553,
442
- "eval_recall": 0.9297539237688469,
443
- "eval_runtime": 6.7503,
444
- "eval_samples_per_second": 545.017,
445
- "eval_steps_per_second": 8.592,
446
  "step": 2300
447
  },
448
  {
449
  "epoch": 20.869565217391305,
450
- "grad_norm": 5.283856391906738,
451
  "learning_rate": 4.444444444444444e-06,
452
- "loss": 0.0511,
453
  "step": 2400
454
  },
455
  {
456
  "epoch": 20.869565217391305,
457
- "eval_accuracy": 0.9315031258494156,
458
- "eval_f1_score": 0.9190655007648246,
459
- "eval_loss": 0.29927295446395874,
460
- "eval_precision": 0.9088064828335735,
461
- "eval_recall": 0.9303236730969998,
462
- "eval_runtime": 6.7281,
463
- "eval_samples_per_second": 546.809,
464
- "eval_steps_per_second": 8.621,
465
  "step": 2400
466
  },
467
  {
468
  "epoch": 21.73913043478261,
469
- "grad_norm": 6.0074896812438965,
470
  "learning_rate": 4.166666666666667e-06,
471
- "loss": 0.0442,
472
  "step": 2500
473
  },
474
  {
475
  "epoch": 21.73913043478261,
476
- "eval_accuracy": 0.9266104919815167,
477
- "eval_f1_score": 0.9161795338292905,
478
- "eval_loss": 0.32011494040489197,
479
- "eval_precision": 0.9060451440252857,
480
- "eval_recall": 0.9280493422296427,
481
- "eval_runtime": 6.7127,
482
- "eval_samples_per_second": 548.067,
483
- "eval_steps_per_second": 8.64,
484
  "step": 2500
485
  },
486
  {
487
  "epoch": 22.608695652173914,
488
- "grad_norm": 3.1078407764434814,
489
  "learning_rate": 3.88888888888889e-06,
490
- "loss": 0.0447,
491
  "step": 2600
492
  },
493
  {
494
  "epoch": 22.608695652173914,
495
- "eval_accuracy": 0.928241369937483,
496
- "eval_f1_score": 0.9137497551284842,
497
- "eval_loss": 0.3155056834220886,
498
- "eval_precision": 0.9009580466238951,
499
- "eval_recall": 0.9281730038314259,
500
- "eval_runtime": 6.7337,
501
- "eval_samples_per_second": 546.354,
502
- "eval_steps_per_second": 8.613,
503
  "step": 2600
504
  },
505
  {
506
  "epoch": 23.47826086956522,
507
- "grad_norm": 2.9584195613861084,
508
  "learning_rate": 3.6111111111111115e-06,
509
- "loss": 0.0415,
510
  "step": 2700
511
  },
512
  {
513
  "epoch": 23.47826086956522,
514
- "eval_accuracy": 0.9334058167980429,
515
- "eval_f1_score": 0.9226018260362496,
516
- "eval_loss": 0.30177852511405945,
517
- "eval_precision": 0.9185179495480513,
518
- "eval_recall": 0.9269833265460256,
519
- "eval_runtime": 6.7411,
520
- "eval_samples_per_second": 545.757,
521
- "eval_steps_per_second": 8.604,
522
  "step": 2700
523
  },
524
  {
525
  "epoch": 24.347826086956523,
526
- "grad_norm": 12.190321922302246,
527
  "learning_rate": 3.3333333333333333e-06,
528
- "loss": 0.0359,
529
  "step": 2800
530
  },
531
  {
532
  "epoch": 24.347826086956523,
533
- "eval_accuracy": 0.9298722478934494,
534
- "eval_f1_score": 0.9177278989948806,
535
- "eval_loss": 0.31918126344680786,
536
- "eval_precision": 0.9062664068560837,
537
- "eval_recall": 0.9308234663752396,
538
- "eval_runtime": 6.7802,
539
- "eval_samples_per_second": 542.606,
540
- "eval_steps_per_second": 8.554,
541
  "step": 2800
542
  },
543
  {
544
  "epoch": 25.217391304347824,
545
- "grad_norm": 0.2598835527896881,
546
  "learning_rate": 3.055555555555556e-06,
547
- "loss": 0.0369,
548
  "step": 2900
549
  },
550
  {
551
  "epoch": 25.217391304347824,
552
- "eval_accuracy": 0.933677629790704,
553
- "eval_f1_score": 0.9210521238209074,
554
- "eval_loss": 0.3063570559024811,
555
- "eval_precision": 0.9140578271273506,
556
- "eval_recall": 0.9285610502121662,
557
- "eval_runtime": 6.7729,
558
- "eval_samples_per_second": 543.197,
559
- "eval_steps_per_second": 8.564,
560
  "step": 2900
561
  },
562
  {
563
  "epoch": 26.08695652173913,
564
- "grad_norm": 0.24433408677577972,
565
  "learning_rate": 2.7777777777777783e-06,
566
- "loss": 0.0296,
567
  "step": 3000
568
  },
569
  {
570
  "epoch": 26.08695652173913,
571
- "eval_accuracy": 0.9328621908127208,
572
- "eval_f1_score": 0.9237047805131925,
573
- "eval_loss": 0.311038613319397,
574
- "eval_precision": 0.9198460229141495,
575
- "eval_recall": 0.9279424126946928,
576
- "eval_runtime": 6.8161,
577
- "eval_samples_per_second": 539.754,
578
- "eval_steps_per_second": 8.509,
579
  "step": 3000
580
  },
581
  {
582
- "epoch": 26.08695652173913,
583
- "step": 3000,
584
- "total_flos": 6579999363349350.0,
585
- "train_loss": 0.31640464369455973,
586
- "train_runtime": 3238.6183,
587
- "train_samples_per_second": 158.092,
588
- "train_steps_per_second": 1.235
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
589
  }
590
  ],
591
  "logging_steps": 100,
@@ -593,7 +783,7 @@
593
  "num_input_tokens_seen": 0,
594
  "num_train_epochs": 35,
595
  "save_steps": 100,
596
- "total_flos": 6579999363349350.0,
597
  "train_batch_size": 64,
598
  "trial_name": null,
599
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.9319305111443131,
3
+ "best_model_checkpoint": "cls_comment-phobert-base-v2-v3.2.1/checkpoint-3700",
4
+ "epoch": 34.78260869565217,
5
  "eval_steps": 100,
6
+ "global_step": 4000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.8695652173913043,
13
+ "grad_norm": 1.2353936433792114,
14
  "learning_rate": 2.5e-06,
15
+ "loss": 1.8571,
16
  "step": 100
17
  },
18
  {
19
  "epoch": 0.8695652173913043,
20
+ "eval_accuracy": 0.3985861881457314,
21
+ "eval_f1_score": 0.08142634970006665,
22
+ "eval_loss": 1.6996082067489624,
23
+ "eval_precision": 0.05694088402081877,
24
+ "eval_recall": 0.14285714285714285,
25
+ "eval_runtime": 6.7198,
26
+ "eval_samples_per_second": 547.337,
27
+ "eval_steps_per_second": 8.631,
28
  "step": 100
29
  },
30
  {
31
  "epoch": 1.7391304347826086,
32
+ "grad_norm": 2.164222240447998,
33
  "learning_rate": 5e-06,
34
+ "loss": 1.552,
35
  "step": 200
36
  },
37
  {
38
  "epoch": 1.7391304347826086,
39
+ "eval_accuracy": 0.6150081566068516,
40
+ "eval_f1_score": 0.25519317041574235,
41
+ "eval_loss": 1.287759780883789,
42
+ "eval_precision": 0.27379414130499563,
43
+ "eval_recall": 0.28595668999910956,
44
+ "eval_runtime": 6.6372,
45
+ "eval_samples_per_second": 554.145,
46
+ "eval_steps_per_second": 8.739,
47
  "step": 200
48
  },
49
  {
50
  "epoch": 2.608695652173913,
51
+ "grad_norm": 5.309168815612793,
52
  "learning_rate": 7.500000000000001e-06,
53
+ "loss": 1.1701,
54
  "step": 300
55
  },
56
  {
57
  "epoch": 2.608695652173913,
58
+ "eval_accuracy": 0.7746057640021751,
59
+ "eval_f1_score": 0.5380183437718388,
60
+ "eval_loss": 0.9308958649635315,
61
+ "eval_precision": 0.5797370934127978,
62
+ "eval_recall": 0.5249138773909011,
63
+ "eval_runtime": 6.5583,
64
+ "eval_samples_per_second": 560.819,
65
+ "eval_steps_per_second": 8.844,
66
  "step": 300
67
  },
68
  {
69
  "epoch": 3.4782608695652173,
70
+ "grad_norm": 6.065152645111084,
71
  "learning_rate": 1e-05,
72
+ "loss": 0.8958,
73
  "step": 400
74
  },
75
  {
76
  "epoch": 3.4782608695652173,
77
+ "eval_accuracy": 0.8371397498640566,
78
+ "eval_f1_score": 0.6099241108684058,
79
+ "eval_loss": 0.7467954754829407,
80
+ "eval_precision": 0.6113332602554015,
81
+ "eval_recall": 0.6120619930302185,
82
+ "eval_runtime": 6.7113,
83
+ "eval_samples_per_second": 548.029,
84
+ "eval_steps_per_second": 8.642,
85
  "step": 400
86
  },
87
  {
88
  "epoch": 4.3478260869565215,
89
+ "grad_norm": 6.0916032791137695,
90
  "learning_rate": 9.722222222222223e-06,
91
+ "loss": 0.7463,
92
  "step": 500
93
  },
94
  {
95
  "epoch": 4.3478260869565215,
96
+ "eval_accuracy": 0.8640565524741708,
97
+ "eval_f1_score": 0.6758460592010713,
98
+ "eval_loss": 0.6539974808692932,
99
+ "eval_precision": 0.7556366523817865,
100
+ "eval_recall": 0.6741270736536388,
101
+ "eval_runtime": 6.6507,
102
+ "eval_samples_per_second": 553.021,
103
+ "eval_steps_per_second": 8.721,
104
  "step": 500
105
  },
106
  {
107
  "epoch": 5.217391304347826,
108
+ "grad_norm": 5.243598461151123,
109
  "learning_rate": 9.444444444444445e-06,
110
+ "loss": 0.6489,
111
  "step": 600
112
  },
113
  {
114
  "epoch": 5.217391304347826,
115
+ "eval_accuracy": 0.8866231647634584,
116
+ "eval_f1_score": 0.7501544920720539,
117
+ "eval_loss": 0.5884435772895813,
118
+ "eval_precision": 0.7611116878989773,
119
+ "eval_recall": 0.7443224665881154,
120
+ "eval_runtime": 6.5994,
121
+ "eval_samples_per_second": 557.321,
122
+ "eval_steps_per_second": 8.789,
123
  "step": 600
124
  },
125
  {
126
  "epoch": 6.086956521739131,
127
+ "grad_norm": 4.099198341369629,
128
  "learning_rate": 9.166666666666666e-06,
129
+ "loss": 0.5604,
130
  "step": 700
131
  },
132
  {
133
  "epoch": 6.086956521739131,
134
+ "eval_accuracy": 0.9010331702011963,
135
+ "eval_f1_score": 0.8349576474350455,
136
+ "eval_loss": 0.5296739339828491,
137
+ "eval_precision": 0.906039613501241,
138
+ "eval_recall": 0.8195735271786126,
139
+ "eval_runtime": 6.5841,
140
+ "eval_samples_per_second": 558.615,
141
+ "eval_steps_per_second": 8.809,
142
  "step": 700
143
  },
144
  {
145
  "epoch": 6.956521739130435,
146
+ "grad_norm": 7.3427557945251465,
147
  "learning_rate": 8.888888888888888e-06,
148
+ "loss": 0.4907,
149
  "step": 800
150
  },
151
  {
152
  "epoch": 6.956521739130435,
153
+ "eval_accuracy": 0.9170744970092441,
154
+ "eval_f1_score": 0.8961939258721013,
155
+ "eval_loss": 0.4928275942802429,
156
+ "eval_precision": 0.9189847475576503,
157
+ "eval_recall": 0.8769461858418355,
158
+ "eval_runtime": 6.5949,
159
+ "eval_samples_per_second": 557.707,
160
+ "eval_steps_per_second": 8.795,
161
  "step": 800
162
  },
163
  {
164
  "epoch": 7.826086956521739,
165
+ "grad_norm": 5.116893768310547,
166
  "learning_rate": 8.611111111111112e-06,
167
+ "loss": 0.4428,
168
  "step": 900
169
  },
170
  {
171
  "epoch": 7.826086956521739,
172
+ "eval_accuracy": 0.921968461120174,
173
+ "eval_f1_score": 0.9048377528141078,
174
+ "eval_loss": 0.46924909949302673,
175
+ "eval_precision": 0.9169978450568272,
176
+ "eval_recall": 0.8957923531168384,
177
+ "eval_runtime": 6.6744,
178
+ "eval_samples_per_second": 551.061,
179
+ "eval_steps_per_second": 8.69,
180
  "step": 900
181
  },
182
  {
183
  "epoch": 8.695652173913043,
184
+ "grad_norm": 3.862825393676758,
185
  "learning_rate": 8.333333333333334e-06,
186
+ "loss": 0.4086,
187
  "step": 1000
188
  },
189
  {
190
  "epoch": 8.695652173913043,
191
+ "eval_accuracy": 0.9235997824904839,
192
+ "eval_f1_score": 0.9073197371323319,
193
+ "eval_loss": 0.4600367546081543,
194
+ "eval_precision": 0.8977722877003532,
195
+ "eval_recall": 0.9183169396463787,
196
+ "eval_runtime": 6.5594,
197
+ "eval_samples_per_second": 560.722,
198
+ "eval_steps_per_second": 8.842,
199
  "step": 1000
200
  },
201
  {
202
  "epoch": 9.565217391304348,
203
+ "grad_norm": 3.6134090423583984,
204
  "learning_rate": 8.055555555555557e-06,
205
+ "loss": 0.3892,
206
  "step": 1100
207
  },
208
  {
209
  "epoch": 9.565217391304348,
210
+ "eval_accuracy": 0.9293094072865687,
211
+ "eval_f1_score": 0.9155883832712851,
212
+ "eval_loss": 0.45303475856781006,
213
+ "eval_precision": 0.9155788758247504,
214
+ "eval_recall": 0.9158764039642001,
215
+ "eval_runtime": 6.5742,
216
+ "eval_samples_per_second": 559.46,
217
+ "eval_steps_per_second": 8.822,
218
  "step": 1100
219
  },
220
  {
221
  "epoch": 10.434782608695652,
222
+ "grad_norm": 6.361451148986816,
223
  "learning_rate": 7.77777777777778e-06,
224
+ "loss": 0.3659,
225
  "step": 1200
226
  },
227
  {
228
  "epoch": 10.434782608695652,
229
+ "eval_accuracy": 0.9257748776508973,
230
+ "eval_f1_score": 0.915368183730898,
231
+ "eval_loss": 0.4573982357978821,
232
+ "eval_precision": 0.9071440635058929,
233
+ "eval_recall": 0.9257672878056209,
234
+ "eval_runtime": 6.6383,
235
+ "eval_samples_per_second": 554.054,
236
+ "eval_steps_per_second": 8.737,
237
  "step": 1200
238
  },
239
  {
240
  "epoch": 11.304347826086957,
241
+ "grad_norm": 6.6150312423706055,
242
  "learning_rate": 7.500000000000001e-06,
243
+ "loss": 0.3577,
244
  "step": 1300
245
  },
246
  {
247
  "epoch": 11.304347826086957,
248
+ "eval_accuracy": 0.9287656334964655,
249
+ "eval_f1_score": 0.9158955580622885,
250
+ "eval_loss": 0.45325955748558044,
251
+ "eval_precision": 0.9151528325914676,
252
+ "eval_recall": 0.9176855536409609,
253
+ "eval_runtime": 6.6572,
254
+ "eval_samples_per_second": 552.485,
255
+ "eval_steps_per_second": 8.712,
256
  "step": 1300
257
  },
258
  {
259
  "epoch": 12.173913043478262,
260
+ "grad_norm": 5.580691814422607,
261
  "learning_rate": 7.222222222222223e-06,
262
+ "loss": 0.338,
263
  "step": 1400
264
  },
265
  {
266
  "epoch": 12.173913043478262,
267
+ "eval_accuracy": 0.933931484502447,
268
+ "eval_f1_score": 0.9203137031893097,
269
+ "eval_loss": 0.44535157084465027,
270
+ "eval_precision": 0.9127875535751154,
271
+ "eval_recall": 0.9284587781330601,
272
+ "eval_runtime": 6.607,
273
+ "eval_samples_per_second": 556.686,
274
+ "eval_steps_per_second": 8.779,
275
  "step": 1400
276
  },
277
  {
278
  "epoch": 13.043478260869565,
279
+ "grad_norm": 3.8554763793945312,
280
  "learning_rate": 6.944444444444445e-06,
281
+ "loss": 0.3302,
282
  "step": 1500
283
  },
284
  {
285
  "epoch": 13.043478260869565,
286
+ "eval_accuracy": 0.9312126155519304,
287
+ "eval_f1_score": 0.9179345627885324,
288
+ "eval_loss": 0.4539467692375183,
289
+ "eval_precision": 0.919620307390688,
290
+ "eval_recall": 0.9172409654840804,
291
+ "eval_runtime": 6.6136,
292
+ "eval_samples_per_second": 556.129,
293
+ "eval_steps_per_second": 8.77,
294
  "step": 1500
295
  },
296
  {
297
  "epoch": 13.91304347826087,
298
+ "grad_norm": 11.763055801391602,
299
  "learning_rate": 6.666666666666667e-06,
300
+ "loss": 0.3186,
301
  "step": 1600
302
  },
303
  {
304
  "epoch": 13.91304347826087,
305
+ "eval_accuracy": 0.9320282762370854,
306
+ "eval_f1_score": 0.9219566242363927,
307
+ "eval_loss": 0.4532802700996399,
308
+ "eval_precision": 0.9298453666516142,
309
+ "eval_recall": 0.914622870958479,
310
+ "eval_runtime": 6.6786,
311
+ "eval_samples_per_second": 550.714,
312
+ "eval_steps_per_second": 8.684,
313
  "step": 1600
314
  },
315
  {
316
  "epoch": 14.782608695652174,
317
+ "grad_norm": 4.790759563446045,
318
  "learning_rate": 6.3888888888888885e-06,
319
+ "loss": 0.3146,
320
  "step": 1700
321
  },
322
  {
323
  "epoch": 14.782608695652174,
324
+ "eval_accuracy": 0.935562805872757,
325
+ "eval_f1_score": 0.9245866574233509,
326
+ "eval_loss": 0.4484989047050476,
327
+ "eval_precision": 0.9280507721712984,
328
+ "eval_recall": 0.9224481039362212,
329
+ "eval_runtime": 6.5869,
330
+ "eval_samples_per_second": 558.384,
331
+ "eval_steps_per_second": 8.805,
332
  "step": 1700
333
  },
334
  {
335
  "epoch": 15.652173913043478,
336
+ "grad_norm": 5.715475559234619,
337
  "learning_rate": 6.111111111111112e-06,
338
+ "loss": 0.3093,
339
  "step": 1800
340
  },
341
  {
342
  "epoch": 15.652173913043478,
343
+ "eval_accuracy": 0.9325720500271887,
344
+ "eval_f1_score": 0.9193696012739166,
345
+ "eval_loss": 0.45573264360427856,
346
+ "eval_precision": 0.9290964969084188,
347
+ "eval_recall": 0.9124598145426113,
348
+ "eval_runtime": 6.6184,
349
+ "eval_samples_per_second": 555.721,
350
+ "eval_steps_per_second": 8.763,
351
  "step": 1800
352
  },
353
  {
354
  "epoch": 16.52173913043478,
355
+ "grad_norm": 7.8289079666137695,
356
  "learning_rate": 5.833333333333334e-06,
357
+ "loss": 0.3019,
358
  "step": 1900
359
  },
360
  {
361
  "epoch": 16.52173913043478,
362
+ "eval_accuracy": 0.9290375203915171,
363
+ "eval_f1_score": 0.9169347716468026,
364
+ "eval_loss": 0.46843111515045166,
365
+ "eval_precision": 0.9128246753731022,
366
+ "eval_recall": 0.923370474591805,
367
+ "eval_runtime": 6.6032,
368
+ "eval_samples_per_second": 557.0,
369
+ "eval_steps_per_second": 8.784,
370
  "step": 1900
371
  },
372
  {
373
  "epoch": 17.391304347826086,
374
+ "grad_norm": 5.883206844329834,
375
  "learning_rate": 5.555555555555557e-06,
376
+ "loss": 0.2985,
377
  "step": 2000
378
  },
379
  {
380
  "epoch": 17.391304347826086,
381
+ "eval_accuracy": 0.9347471451876019,
382
+ "eval_f1_score": 0.92475047525614,
383
+ "eval_loss": 0.4544869661331177,
384
+ "eval_precision": 0.9259386916015938,
385
+ "eval_recall": 0.9237552045152712,
386
+ "eval_runtime": 6.6237,
387
+ "eval_samples_per_second": 555.275,
388
+ "eval_steps_per_second": 8.756,
389
  "step": 2000
390
  },
391
  {
392
  "epoch": 18.26086956521739,
393
+ "grad_norm": 8.993196487426758,
394
  "learning_rate": 5.2777777777777785e-06,
395
+ "loss": 0.2959,
396
  "step": 2100
397
  },
398
  {
399
  "epoch": 18.26086956521739,
400
+ "eval_accuracy": 0.9333877107123436,
401
+ "eval_f1_score": 0.9219602449474701,
402
+ "eval_loss": 0.46893206238746643,
403
+ "eval_precision": 0.92490748541024,
404
+ "eval_recall": 0.9208243882860574,
405
+ "eval_runtime": 6.7132,
406
+ "eval_samples_per_second": 547.875,
407
+ "eval_steps_per_second": 8.64,
408
  "step": 2100
409
  },
410
  {
411
  "epoch": 19.130434782608695,
412
+ "grad_norm": 6.87612771987915,
413
  "learning_rate": 5e-06,
414
+ "loss": 0.2891,
415
  "step": 2200
416
  },
417
  {
418
  "epoch": 19.130434782608695,
419
+ "eval_accuracy": 0.9385535617183252,
420
+ "eval_f1_score": 0.9262085591552154,
421
+ "eval_loss": 0.4558440148830414,
422
+ "eval_precision": 0.9360271609767613,
423
+ "eval_recall": 0.9180112460726695,
424
+ "eval_runtime": 6.5974,
425
+ "eval_samples_per_second": 557.491,
426
+ "eval_steps_per_second": 8.791,
427
  "step": 2200
428
  },
429
  {
430
  "epoch": 20.0,
431
+ "grad_norm": 8.078782081604004,
432
  "learning_rate": 4.722222222222222e-06,
433
+ "loss": 0.2905,
434
  "step": 2300
435
  },
436
  {
437
  "epoch": 20.0,
438
+ "eval_accuracy": 0.9358346927678086,
439
+ "eval_f1_score": 0.9227437963525783,
440
+ "eval_loss": 0.45897340774536133,
441
+ "eval_precision": 0.9307760201675803,
442
+ "eval_recall": 0.9162662496043394,
443
+ "eval_runtime": 6.6433,
444
+ "eval_samples_per_second": 553.643,
445
+ "eval_steps_per_second": 8.731,
446
  "step": 2300
447
  },
448
  {
449
  "epoch": 20.869565217391305,
450
+ "grad_norm": 6.95852518081665,
451
  "learning_rate": 4.444444444444444e-06,
452
+ "loss": 0.2875,
453
  "step": 2400
454
  },
455
  {
456
  "epoch": 20.869565217391305,
457
+ "eval_accuracy": 0.9306688417618271,
458
+ "eval_f1_score": 0.9192946959387055,
459
+ "eval_loss": 0.4796580672264099,
460
+ "eval_precision": 0.9267958791034487,
461
+ "eval_recall": 0.9145913486548557,
462
+ "eval_runtime": 6.5982,
463
+ "eval_samples_per_second": 557.426,
464
+ "eval_steps_per_second": 8.79,
465
  "step": 2400
466
  },
467
  {
468
  "epoch": 21.73913043478261,
469
+ "grad_norm": 5.104602336883545,
470
  "learning_rate": 4.166666666666667e-06,
471
+ "loss": 0.2812,
472
  "step": 2500
473
  },
474
  {
475
  "epoch": 21.73913043478261,
476
+ "eval_accuracy": 0.935562805872757,
477
+ "eval_f1_score": 0.9246927415584684,
478
+ "eval_loss": 0.46965479850769043,
479
+ "eval_precision": 0.9241768322453005,
480
+ "eval_recall": 0.9257346798390873,
481
+ "eval_runtime": 6.6685,
482
+ "eval_samples_per_second": 551.547,
483
+ "eval_steps_per_second": 8.698,
484
  "step": 2500
485
  },
486
  {
487
  "epoch": 22.608695652173914,
488
+ "grad_norm": 1.9872806072235107,
489
  "learning_rate": 3.88888888888889e-06,
490
+ "loss": 0.2789,
491
  "step": 2600
492
  },
493
  {
494
  "epoch": 22.608695652173914,
495
+ "eval_accuracy": 0.9380097879282219,
496
+ "eval_f1_score": 0.9255376836601789,
497
+ "eval_loss": 0.46675482392311096,
498
+ "eval_precision": 0.9271185300366118,
499
+ "eval_recall": 0.9250032647695392,
500
+ "eval_runtime": 6.6231,
501
+ "eval_samples_per_second": 555.328,
502
+ "eval_steps_per_second": 8.757,
503
  "step": 2600
504
  },
505
  {
506
  "epoch": 23.47826086956522,
507
+ "grad_norm": 4.836044788360596,
508
  "learning_rate": 3.6111111111111115e-06,
509
+ "loss": 0.2785,
510
  "step": 2700
511
  },
512
  {
513
  "epoch": 23.47826086956522,
514
+ "eval_accuracy": 0.9382816748232735,
515
+ "eval_f1_score": 0.9293325929996209,
516
+ "eval_loss": 0.4671032130718231,
517
+ "eval_precision": 0.9288859327813286,
518
+ "eval_recall": 0.9301070328166979,
519
+ "eval_runtime": 6.6888,
520
+ "eval_samples_per_second": 549.877,
521
+ "eval_steps_per_second": 8.671,
522
  "step": 2700
523
  },
524
  {
525
  "epoch": 24.347826086956523,
526
+ "grad_norm": 2.3744585514068604,
527
  "learning_rate": 3.3333333333333333e-06,
528
+ "loss": 0.2773,
529
  "step": 2800
530
  },
531
  {
532
  "epoch": 24.347826086956523,
533
+ "eval_accuracy": 0.9390973355084284,
534
+ "eval_f1_score": 0.9293323313487988,
535
+ "eval_loss": 0.46571776270866394,
536
+ "eval_precision": 0.9327721009515447,
537
+ "eval_recall": 0.927360363082818,
538
+ "eval_runtime": 6.6343,
539
+ "eval_samples_per_second": 554.389,
540
+ "eval_steps_per_second": 8.742,
541
  "step": 2800
542
  },
543
  {
544
  "epoch": 25.217391304347824,
545
+ "grad_norm": 4.463809490203857,
546
  "learning_rate": 3.055555555555556e-06,
547
+ "loss": 0.2814,
548
  "step": 2900
549
  },
550
  {
551
  "epoch": 25.217391304347824,
552
+ "eval_accuracy": 0.9361065796628603,
553
+ "eval_f1_score": 0.9259199302104238,
554
+ "eval_loss": 0.47015631198883057,
555
+ "eval_precision": 0.924434133110186,
556
+ "eval_recall": 0.9285494955527653,
557
+ "eval_runtime": 6.6115,
558
+ "eval_samples_per_second": 556.303,
559
+ "eval_steps_per_second": 8.773,
560
  "step": 2900
561
  },
562
  {
563
  "epoch": 26.08695652173913,
564
+ "grad_norm": 11.97252082824707,
565
  "learning_rate": 2.7777777777777783e-06,
566
+ "loss": 0.2744,
567
  "step": 3000
568
  },
569
  {
570
  "epoch": 26.08695652173913,
571
+ "eval_accuracy": 0.9352909189777052,
572
+ "eval_f1_score": 0.9273773946315609,
573
+ "eval_loss": 0.4731716811656952,
574
+ "eval_precision": 0.9272954212892072,
575
+ "eval_recall": 0.929014459015713,
576
+ "eval_runtime": 6.6523,
577
+ "eval_samples_per_second": 552.888,
578
+ "eval_steps_per_second": 8.719,
579
  "step": 3000
580
  },
581
  {
582
+ "epoch": 26.956521739130434,
583
+ "grad_norm": 8.698735237121582,
584
+ "learning_rate": 2.5e-06,
585
+ "loss": 0.2772,
586
+ "step": 3100
587
+ },
588
+ {
589
+ "epoch": 26.956521739130434,
590
+ "eval_accuracy": 0.9388254486133768,
591
+ "eval_f1_score": 0.9280828388852939,
592
+ "eval_loss": 0.46764281392097473,
593
+ "eval_precision": 0.9264283970809257,
594
+ "eval_recall": 0.9301128430609281,
595
+ "eval_runtime": 6.6577,
596
+ "eval_samples_per_second": 552.441,
597
+ "eval_steps_per_second": 8.712,
598
+ "step": 3100
599
+ },
600
+ {
601
+ "epoch": 27.82608695652174,
602
+ "grad_norm": 9.868401527404785,
603
+ "learning_rate": 2.222222222222222e-06,
604
+ "loss": 0.2736,
605
+ "step": 3200
606
+ },
607
+ {
608
+ "epoch": 27.82608695652174,
609
+ "eval_accuracy": 0.9393692224034802,
610
+ "eval_f1_score": 0.9280880031444269,
611
+ "eval_loss": 0.46609246730804443,
612
+ "eval_precision": 0.9325449452470522,
613
+ "eval_recall": 0.9241991833476634,
614
+ "eval_runtime": 6.6629,
615
+ "eval_samples_per_second": 552.016,
616
+ "eval_steps_per_second": 8.705,
617
+ "step": 3200
618
+ },
619
+ {
620
+ "epoch": 28.695652173913043,
621
+ "grad_norm": 13.52723217010498,
622
+ "learning_rate": 1.944444444444445e-06,
623
+ "loss": 0.2754,
624
+ "step": 3300
625
+ },
626
+ {
627
+ "epoch": 28.695652173913043,
628
+ "eval_accuracy": 0.9366503534529635,
629
+ "eval_f1_score": 0.925681100132558,
630
+ "eval_loss": 0.4745844602584839,
631
+ "eval_precision": 0.9287802003680891,
632
+ "eval_recall": 0.9233293953593443,
633
+ "eval_runtime": 6.6962,
634
+ "eval_samples_per_second": 549.264,
635
+ "eval_steps_per_second": 8.662,
636
+ "step": 3300
637
+ },
638
+ {
639
+ "epoch": 29.565217391304348,
640
+ "grad_norm": 0.09545432776212692,
641
+ "learning_rate": 1.6666666666666667e-06,
642
+ "loss": 0.2717,
643
+ "step": 3400
644
+ },
645
+ {
646
+ "epoch": 29.565217391304348,
647
+ "eval_accuracy": 0.9380097879282219,
648
+ "eval_f1_score": 0.9283345025534331,
649
+ "eval_loss": 0.46884092688560486,
650
+ "eval_precision": 0.9315263692536144,
651
+ "eval_recall": 0.925530596287616,
652
+ "eval_runtime": 6.6275,
653
+ "eval_samples_per_second": 554.958,
654
+ "eval_steps_per_second": 8.751,
655
+ "step": 3400
656
+ },
657
+ {
658
+ "epoch": 30.434782608695652,
659
+ "grad_norm": 1.8131216764450073,
660
+ "learning_rate": 1.3888888888888892e-06,
661
+ "loss": 0.27,
662
+ "step": 3500
663
+ },
664
+ {
665
+ "epoch": 30.434782608695652,
666
+ "eval_accuracy": 0.9388254486133768,
667
+ "eval_f1_score": 0.9303895468552515,
668
+ "eval_loss": 0.4696621894836426,
669
+ "eval_precision": 0.9308188528131611,
670
+ "eval_recall": 0.9307349860958422,
671
+ "eval_runtime": 6.6391,
672
+ "eval_samples_per_second": 553.987,
673
+ "eval_steps_per_second": 8.736,
674
+ "step": 3500
675
+ },
676
+ {
677
+ "epoch": 31.304347826086957,
678
+ "grad_norm": 1.2362151145935059,
679
+ "learning_rate": 1.111111111111111e-06,
680
+ "loss": 0.2674,
681
+ "step": 3600
682
+ },
683
+ {
684
+ "epoch": 31.304347826086957,
685
+ "eval_accuracy": 0.9390973355084284,
686
+ "eval_f1_score": 0.9273897248556044,
687
+ "eval_loss": 0.466818243265152,
688
+ "eval_precision": 0.9240182135116143,
689
+ "eval_recall": 0.9311490745870729,
690
+ "eval_runtime": 6.6874,
691
+ "eval_samples_per_second": 549.992,
692
+ "eval_steps_per_second": 8.673,
693
+ "step": 3600
694
+ },
695
+ {
696
+ "epoch": 32.17391304347826,
697
+ "grad_norm": 6.612317085266113,
698
+ "learning_rate": 8.333333333333333e-07,
699
+ "loss": 0.2693,
700
+ "step": 3700
701
+ },
702
+ {
703
+ "epoch": 32.17391304347826,
704
+ "eval_accuracy": 0.9407286568787384,
705
+ "eval_f1_score": 0.9319305111443131,
706
+ "eval_loss": 0.46566537022590637,
707
+ "eval_precision": 0.9319435555053062,
708
+ "eval_recall": 0.9325682397024007,
709
+ "eval_runtime": 6.6594,
710
+ "eval_samples_per_second": 552.298,
711
+ "eval_steps_per_second": 8.709,
712
+ "step": 3700
713
+ },
714
+ {
715
+ "epoch": 33.04347826086956,
716
+ "grad_norm": 3.6944832801818848,
717
+ "learning_rate": 5.555555555555555e-07,
718
+ "loss": 0.2685,
719
+ "step": 3800
720
+ },
721
+ {
722
+ "epoch": 33.04347826086956,
723
+ "eval_accuracy": 0.9401848830886351,
724
+ "eval_f1_score": 0.9298408543588726,
725
+ "eval_loss": 0.46719253063201904,
726
+ "eval_precision": 0.9297325884730158,
727
+ "eval_recall": 0.9303720415200172,
728
+ "eval_runtime": 6.6751,
729
+ "eval_samples_per_second": 551.006,
730
+ "eval_steps_per_second": 8.689,
731
+ "step": 3800
732
+ },
733
+ {
734
+ "epoch": 33.91304347826087,
735
+ "grad_norm": 0.5045357346534729,
736
+ "learning_rate": 2.7777777777777776e-07,
737
+ "loss": 0.268,
738
+ "step": 3900
739
+ },
740
+ {
741
+ "epoch": 33.91304347826087,
742
+ "eval_accuracy": 0.94100054377379,
743
+ "eval_f1_score": 0.9316787163771599,
744
+ "eval_loss": 0.46683618426322937,
745
+ "eval_precision": 0.9328330606111085,
746
+ "eval_recall": 0.9311209947261558,
747
+ "eval_runtime": 6.6584,
748
+ "eval_samples_per_second": 552.388,
749
+ "eval_steps_per_second": 8.711,
750
+ "step": 3900
751
+ },
752
+ {
753
+ "epoch": 34.78260869565217,
754
+ "grad_norm": 0.15777729451656342,
755
+ "learning_rate": 0.0,
756
+ "loss": 0.272,
757
+ "step": 4000
758
+ },
759
+ {
760
+ "epoch": 34.78260869565217,
761
+ "eval_accuracy": 0.9401848830886351,
762
+ "eval_f1_score": 0.9309979115269789,
763
+ "eval_loss": 0.46536290645599365,
764
+ "eval_precision": 0.9324573806162856,
765
+ "eval_recall": 0.9300452091014509,
766
+ "eval_runtime": 6.6565,
767
+ "eval_samples_per_second": 552.546,
768
+ "eval_steps_per_second": 8.713,
769
+ "step": 4000
770
+ },
771
+ {
772
+ "epoch": 34.78260869565217,
773
+ "step": 4000,
774
+ "total_flos": 8662131210539100.0,
775
+ "train_loss": 0.4411096167564392,
776
+ "train_runtime": 3432.8242,
777
+ "train_samples_per_second": 149.148,
778
+ "train_steps_per_second": 1.165
779
  }
780
  ],
781
  "logging_steps": 100,
 
783
  "num_input_tokens_seen": 0,
784
  "num_train_epochs": 35,
785
  "save_steps": 100,
786
+ "total_flos": 8662131210539100.0,
787
  "train_batch_size": 64,
788
  "trial_name": null,
789
  "trial_params": null