shenkha commited on
Commit
5de2517
1 Parent(s): f364619

Upload 13 files

Browse files
added_tokens.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "<MASK>": 50258,
3
+ "<PAD>": 50257
4
+ }
all_results.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 50.0,
3
+ "eval_accuracy": 0.791762140185717,
4
+ "eval_loss": 1.2498760223388672,
5
+ "eval_runtime": 27.7085,
6
+ "eval_samples": 1987,
7
+ "eval_samples_per_second": 71.711,
8
+ "eval_steps_per_second": 3.609,
9
+ "perplexity": 3.4899102597284046,
10
+ "train_loss": 0.9664699619099245,
11
+ "train_runtime": 20866.4149,
12
+ "train_samples": 9478,
13
+ "train_samples_per_second": 22.711,
14
+ "train_steps_per_second": 0.283
15
+ }
config.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "microsoft/DialoGPT-small",
3
+ "activation_function": "gelu_new",
4
+ "architectures": [
5
+ "GPT2LMHeadModel"
6
+ ],
7
+ "attn_pdrop": 0.1,
8
+ "bos_token_id": 50256,
9
+ "embd_pdrop": 0.1,
10
+ "eos_token_id": 50256,
11
+ "initializer_range": 0.02,
12
+ "layer_norm_epsilon": 1e-05,
13
+ "model_type": "gpt2",
14
+ "n_ctx": 1024,
15
+ "n_embd": 768,
16
+ "n_head": 12,
17
+ "n_inner": null,
18
+ "n_layer": 12,
19
+ "n_positions": 1024,
20
+ "reorder_and_upcast_attn": false,
21
+ "resid_pdrop": 0.1,
22
+ "scale_attn_by_inverse_layer_idx": false,
23
+ "scale_attn_weights": true,
24
+ "summary_activation": null,
25
+ "summary_first_dropout": 0.1,
26
+ "summary_proj_to_labels": true,
27
+ "summary_type": "cls_index",
28
+ "summary_use_proj": true,
29
+ "task_specific_params": {
30
+ "conversational": {
31
+ "max_length": 1000
32
+ }
33
+ },
34
+ "torch_dtype": "float32",
35
+ "transformers_version": "4.24.0",
36
+ "use_cache": true,
37
+ "vocab_size": 50259
38
+ }
eval_results.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 50.0,
3
+ "eval_accuracy": 0.791762140185717,
4
+ "eval_loss": 1.2498760223388672,
5
+ "eval_runtime": 27.7085,
6
+ "eval_samples": 1987,
7
+ "eval_samples_per_second": 71.711,
8
+ "eval_steps_per_second": 3.609,
9
+ "perplexity": 3.4899102597284046
10
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1c05c0f7dc4208ccc41f4c50e6814f476091d85c84c9ce031c1e8ce5ec5d77c6
3
+ size 510404157
special_tokens_map.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<|endoftext|>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|endoftext|>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "mask_token": "<MASK>",
17
+ "pad_token": "<PAD>",
18
+ "unk_token": {
19
+ "content": "<|endoftext|>",
20
+ "lstrip": false,
21
+ "normalized": true,
22
+ "rstrip": false,
23
+ "single_word": false
24
+ }
25
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "bos_token": {
5
+ "__type": "AddedToken",
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": true,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ },
12
+ "eos_token": {
13
+ "__type": "AddedToken",
14
+ "content": "<|endoftext|>",
15
+ "lstrip": false,
16
+ "normalized": true,
17
+ "rstrip": false,
18
+ "single_word": false
19
+ },
20
+ "errors": "replace",
21
+ "model_max_length": 1024,
22
+ "name_or_path": "microsoft/DialoGPT-small",
23
+ "pad_token": null,
24
+ "special_tokens_map_file": null,
25
+ "tokenizer_class": "GPT2Tokenizer",
26
+ "unk_token": {
27
+ "__type": "AddedToken",
28
+ "content": "<|endoftext|>",
29
+ "lstrip": false,
30
+ "normalized": true,
31
+ "rstrip": false,
32
+ "single_word": false
33
+ }
34
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 50.0,
3
+ "train_loss": 0.9664699619099245,
4
+ "train_runtime": 20866.4149,
5
+ "train_samples": 9478,
6
+ "train_samples_per_second": 22.711,
7
+ "train_steps_per_second": 0.283
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,541 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 49.9957805907173,
5
+ "global_step": 5900,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 1.0,
12
+ "eval_accuracy": 0.7731608395748838,
13
+ "eval_loss": 1.273674726486206,
14
+ "eval_runtime": 27.8473,
15
+ "eval_samples_per_second": 71.353,
16
+ "eval_steps_per_second": 3.591,
17
+ "step": 118
18
+ },
19
+ {
20
+ "epoch": 2.0,
21
+ "eval_accuracy": 0.7844064852916506,
22
+ "eval_loss": 1.149436354637146,
23
+ "eval_runtime": 27.7157,
24
+ "eval_samples_per_second": 71.692,
25
+ "eval_steps_per_second": 3.608,
26
+ "step": 236
27
+ },
28
+ {
29
+ "epoch": 3.0,
30
+ "eval_accuracy": 0.7886655417073725,
31
+ "eval_loss": 1.115924596786499,
32
+ "eval_runtime": 27.7526,
33
+ "eval_samples_per_second": 71.597,
34
+ "eval_steps_per_second": 3.603,
35
+ "step": 354
36
+ },
37
+ {
38
+ "epoch": 4.0,
39
+ "eval_accuracy": 0.7905562627668078,
40
+ "eval_loss": 1.0954861640930176,
41
+ "eval_runtime": 27.7193,
42
+ "eval_samples_per_second": 71.683,
43
+ "eval_steps_per_second": 3.608,
44
+ "step": 472
45
+ },
46
+ {
47
+ "epoch": 4.24,
48
+ "learning_rate": 4.5762711864406784e-05,
49
+ "loss": 3.3542,
50
+ "step": 500
51
+ },
52
+ {
53
+ "epoch": 5.0,
54
+ "eval_accuracy": 0.7918075332800458,
55
+ "eval_loss": 1.0870832204818726,
56
+ "eval_runtime": 27.7224,
57
+ "eval_samples_per_second": 71.675,
58
+ "eval_steps_per_second": 3.607,
59
+ "step": 590
60
+ },
61
+ {
62
+ "epoch": 6.0,
63
+ "eval_accuracy": 0.7928476272240149,
64
+ "eval_loss": 1.0806066989898682,
65
+ "eval_runtime": 27.72,
66
+ "eval_samples_per_second": 71.681,
67
+ "eval_steps_per_second": 3.607,
68
+ "step": 708
69
+ },
70
+ {
71
+ "epoch": 7.0,
72
+ "eval_accuracy": 0.7931219594027847,
73
+ "eval_loss": 1.0786826610565186,
74
+ "eval_runtime": 27.7079,
75
+ "eval_samples_per_second": 71.712,
76
+ "eval_steps_per_second": 3.609,
77
+ "step": 826
78
+ },
79
+ {
80
+ "epoch": 8.0,
81
+ "eval_accuracy": 0.7938285127840768,
82
+ "eval_loss": 1.0748920440673828,
83
+ "eval_runtime": 27.7168,
84
+ "eval_samples_per_second": 71.689,
85
+ "eval_steps_per_second": 3.608,
86
+ "step": 944
87
+ },
88
+ {
89
+ "epoch": 8.47,
90
+ "learning_rate": 4.152542372881356e-05,
91
+ "loss": 1.0048,
92
+ "step": 1000
93
+ },
94
+ {
95
+ "epoch": 9.0,
96
+ "eval_accuracy": 0.7937554891105914,
97
+ "eval_loss": 1.0768133401870728,
98
+ "eval_runtime": 27.8358,
99
+ "eval_samples_per_second": 71.383,
100
+ "eval_steps_per_second": 3.592,
101
+ "step": 1062
102
+ },
103
+ {
104
+ "epoch": 10.0,
105
+ "eval_accuracy": 0.793783119689748,
106
+ "eval_loss": 1.076497197151184,
107
+ "eval_runtime": 27.7645,
108
+ "eval_samples_per_second": 71.566,
109
+ "eval_steps_per_second": 3.602,
110
+ "step": 1180
111
+ },
112
+ {
113
+ "epoch": 11.0,
114
+ "eval_accuracy": 0.7941719214107384,
115
+ "eval_loss": 1.0760843753814697,
116
+ "eval_runtime": 28.0837,
117
+ "eval_samples_per_second": 70.753,
118
+ "eval_steps_per_second": 3.561,
119
+ "step": 1298
120
+ },
121
+ {
122
+ "epoch": 12.0,
123
+ "eval_accuracy": 0.7943712563032259,
124
+ "eval_loss": 1.078471064567566,
125
+ "eval_runtime": 27.7304,
126
+ "eval_samples_per_second": 71.654,
127
+ "eval_steps_per_second": 3.606,
128
+ "step": 1416
129
+ },
130
+ {
131
+ "epoch": 12.71,
132
+ "learning_rate": 3.728813559322034e-05,
133
+ "loss": 0.9221,
134
+ "step": 1500
135
+ },
136
+ {
137
+ "epoch": 13.0,
138
+ "eval_accuracy": 0.7942034992154889,
139
+ "eval_loss": 1.0854936838150024,
140
+ "eval_runtime": 27.702,
141
+ "eval_samples_per_second": 71.728,
142
+ "eval_steps_per_second": 3.61,
143
+ "step": 1534
144
+ },
145
+ {
146
+ "epoch": 14.0,
147
+ "eval_accuracy": 0.7943969132695856,
148
+ "eval_loss": 1.0861722230911255,
149
+ "eval_runtime": 27.7005,
150
+ "eval_samples_per_second": 71.731,
151
+ "eval_steps_per_second": 3.61,
152
+ "step": 1652
153
+ },
154
+ {
155
+ "epoch": 15.0,
156
+ "eval_accuracy": 0.7947047968659029,
157
+ "eval_loss": 1.0891741514205933,
158
+ "eval_runtime": 27.7017,
159
+ "eval_samples_per_second": 71.729,
160
+ "eval_steps_per_second": 3.61,
161
+ "step": 1770
162
+ },
163
+ {
164
+ "epoch": 16.0,
165
+ "eval_accuracy": 0.7944600688790866,
166
+ "eval_loss": 1.0921040773391724,
167
+ "eval_runtime": 27.6967,
168
+ "eval_samples_per_second": 71.741,
169
+ "eval_steps_per_second": 3.611,
170
+ "step": 1888
171
+ },
172
+ {
173
+ "epoch": 16.95,
174
+ "learning_rate": 3.305084745762712e-05,
175
+ "loss": 0.8534,
176
+ "step": 2000
177
+ },
178
+ {
179
+ "epoch": 17.0,
180
+ "eval_accuracy": 0.7944245438487423,
181
+ "eval_loss": 1.0993601083755493,
182
+ "eval_runtime": 27.7053,
183
+ "eval_samples_per_second": 71.719,
184
+ "eval_steps_per_second": 3.609,
185
+ "step": 2006
186
+ },
187
+ {
188
+ "epoch": 18.0,
189
+ "eval_accuracy": 0.7944719105558681,
190
+ "eval_loss": 1.100709319114685,
191
+ "eval_runtime": 27.7072,
192
+ "eval_samples_per_second": 71.714,
193
+ "eval_steps_per_second": 3.609,
194
+ "step": 2124
195
+ },
196
+ {
197
+ "epoch": 19.0,
198
+ "eval_accuracy": 0.7942567867610053,
199
+ "eval_loss": 1.1095192432403564,
200
+ "eval_runtime": 27.6924,
201
+ "eval_samples_per_second": 71.753,
202
+ "eval_steps_per_second": 3.611,
203
+ "step": 2242
204
+ },
205
+ {
206
+ "epoch": 20.0,
207
+ "eval_accuracy": 0.7939962698718138,
208
+ "eval_loss": 1.1117533445358276,
209
+ "eval_runtime": 27.706,
210
+ "eval_samples_per_second": 71.717,
211
+ "eval_steps_per_second": 3.609,
212
+ "step": 2360
213
+ },
214
+ {
215
+ "epoch": 21.0,
216
+ "eval_accuracy": 0.7940456101917365,
217
+ "eval_loss": 1.1208868026733398,
218
+ "eval_runtime": 27.7121,
219
+ "eval_samples_per_second": 71.702,
220
+ "eval_steps_per_second": 3.609,
221
+ "step": 2478
222
+ },
223
+ {
224
+ "epoch": 21.19,
225
+ "learning_rate": 2.88135593220339e-05,
226
+ "loss": 0.7959,
227
+ "step": 2500
228
+ },
229
+ {
230
+ "epoch": 22.0,
231
+ "eval_accuracy": 0.7939271934239222,
232
+ "eval_loss": 1.1249679327011108,
233
+ "eval_runtime": 27.7085,
234
+ "eval_samples_per_second": 71.711,
235
+ "eval_steps_per_second": 3.609,
236
+ "step": 2596
237
+ },
238
+ {
239
+ "epoch": 23.0,
240
+ "eval_accuracy": 0.7937811460769512,
241
+ "eval_loss": 1.1323717832565308,
242
+ "eval_runtime": 27.7079,
243
+ "eval_samples_per_second": 71.712,
244
+ "eval_steps_per_second": 3.609,
245
+ "step": 2714
246
+ },
247
+ {
248
+ "epoch": 24.0,
249
+ "eval_accuracy": 0.7939074572959531,
250
+ "eval_loss": 1.136080026626587,
251
+ "eval_runtime": 27.704,
252
+ "eval_samples_per_second": 71.723,
253
+ "eval_steps_per_second": 3.61,
254
+ "step": 2832
255
+ },
256
+ {
257
+ "epoch": 25.0,
258
+ "eval_accuracy": 0.7935601014436977,
259
+ "eval_loss": 1.1441864967346191,
260
+ "eval_runtime": 27.6972,
261
+ "eval_samples_per_second": 71.74,
262
+ "eval_steps_per_second": 3.61,
263
+ "step": 2950
264
+ },
265
+ {
266
+ "epoch": 25.42,
267
+ "learning_rate": 2.457627118644068e-05,
268
+ "loss": 0.7458,
269
+ "step": 3000
270
+ },
271
+ {
272
+ "epoch": 26.0,
273
+ "eval_accuracy": 0.7934377374502897,
274
+ "eval_loss": 1.1540361642837524,
275
+ "eval_runtime": 27.7164,
276
+ "eval_samples_per_second": 71.69,
277
+ "eval_steps_per_second": 3.608,
278
+ "step": 3068
279
+ },
280
+ {
281
+ "epoch": 27.0,
282
+ "eval_accuracy": 0.7932680067497557,
283
+ "eval_loss": 1.1579915285110474,
284
+ "eval_runtime": 27.7102,
285
+ "eval_samples_per_second": 71.706,
286
+ "eval_steps_per_second": 3.609,
287
+ "step": 3186
288
+ },
289
+ {
290
+ "epoch": 28.0,
291
+ "eval_accuracy": 0.793429842999102,
292
+ "eval_loss": 1.1653059720993042,
293
+ "eval_runtime": 27.7095,
294
+ "eval_samples_per_second": 71.708,
295
+ "eval_steps_per_second": 3.609,
296
+ "step": 3304
297
+ },
298
+ {
299
+ "epoch": 29.0,
300
+ "eval_accuracy": 0.7931002496620189,
301
+ "eval_loss": 1.1742562055587769,
302
+ "eval_runtime": 27.7109,
303
+ "eval_samples_per_second": 71.705,
304
+ "eval_steps_per_second": 3.609,
305
+ "step": 3422
306
+ },
307
+ {
308
+ "epoch": 29.66,
309
+ "learning_rate": 2.033898305084746e-05,
310
+ "loss": 0.7034,
311
+ "step": 3500
312
+ },
313
+ {
314
+ "epoch": 30.0,
315
+ "eval_accuracy": 0.7932344553322084,
316
+ "eval_loss": 1.1726936101913452,
317
+ "eval_runtime": 27.7127,
318
+ "eval_samples_per_second": 71.7,
319
+ "eval_steps_per_second": 3.608,
320
+ "step": 3540
321
+ },
322
+ {
323
+ "epoch": 31.0,
324
+ "eval_accuracy": 0.7930153843117519,
325
+ "eval_loss": 1.1811208724975586,
326
+ "eval_runtime": 27.7003,
327
+ "eval_samples_per_second": 71.732,
328
+ "eval_steps_per_second": 3.61,
329
+ "step": 3658
330
+ },
331
+ {
332
+ "epoch": 32.0,
333
+ "eval_accuracy": 0.7930114370861581,
334
+ "eval_loss": 1.1879879236221313,
335
+ "eval_runtime": 27.7125,
336
+ "eval_samples_per_second": 71.701,
337
+ "eval_steps_per_second": 3.608,
338
+ "step": 3776
339
+ },
340
+ {
341
+ "epoch": 33.0,
342
+ "eval_accuracy": 0.7927528938097634,
343
+ "eval_loss": 1.1952810287475586,
344
+ "eval_runtime": 27.7028,
345
+ "eval_samples_per_second": 71.726,
346
+ "eval_steps_per_second": 3.61,
347
+ "step": 3894
348
+ },
349
+ {
350
+ "epoch": 33.89,
351
+ "learning_rate": 1.6101694915254237e-05,
352
+ "loss": 0.6688,
353
+ "step": 4000
354
+ },
355
+ {
356
+ "epoch": 34.0,
357
+ "eval_accuracy": 0.7928298647088428,
358
+ "eval_loss": 1.200770378112793,
359
+ "eval_runtime": 27.7087,
360
+ "eval_samples_per_second": 71.71,
361
+ "eval_steps_per_second": 3.609,
362
+ "step": 4012
363
+ },
364
+ {
365
+ "epoch": 35.0,
366
+ "eval_accuracy": 0.7926384242675429,
367
+ "eval_loss": 1.205856442451477,
368
+ "eval_runtime": 27.7159,
369
+ "eval_samples_per_second": 71.692,
370
+ "eval_steps_per_second": 3.608,
371
+ "step": 4130
372
+ },
373
+ {
374
+ "epoch": 36.0,
375
+ "eval_accuracy": 0.7925101394357441,
376
+ "eval_loss": 1.2133612632751465,
377
+ "eval_runtime": 27.7077,
378
+ "eval_samples_per_second": 71.713,
379
+ "eval_steps_per_second": 3.609,
380
+ "step": 4248
381
+ },
382
+ {
383
+ "epoch": 37.0,
384
+ "eval_accuracy": 0.7922279128057866,
385
+ "eval_loss": 1.2220430374145508,
386
+ "eval_runtime": 27.7023,
387
+ "eval_samples_per_second": 71.727,
388
+ "eval_steps_per_second": 3.61,
389
+ "step": 4366
390
+ },
391
+ {
392
+ "epoch": 38.0,
393
+ "eval_accuracy": 0.7922910684152876,
394
+ "eval_loss": 1.2201595306396484,
395
+ "eval_runtime": 27.6967,
396
+ "eval_samples_per_second": 71.741,
397
+ "eval_steps_per_second": 3.611,
398
+ "step": 4484
399
+ },
400
+ {
401
+ "epoch": 38.14,
402
+ "learning_rate": 1.1864406779661018e-05,
403
+ "loss": 0.6427,
404
+ "step": 4500
405
+ },
406
+ {
407
+ "epoch": 39.0,
408
+ "eval_accuracy": 0.7922890948024907,
409
+ "eval_loss": 1.2266799211502075,
410
+ "eval_runtime": 27.7206,
411
+ "eval_samples_per_second": 71.68,
412
+ "eval_steps_per_second": 3.607,
413
+ "step": 4602
414
+ },
415
+ {
416
+ "epoch": 40.0,
417
+ "eval_accuracy": 0.7919200292094694,
418
+ "eval_loss": 1.2350112199783325,
419
+ "eval_runtime": 27.709,
420
+ "eval_samples_per_second": 71.709,
421
+ "eval_steps_per_second": 3.609,
422
+ "step": 4720
423
+ },
424
+ {
425
+ "epoch": 41.0,
426
+ "eval_accuracy": 0.792095680748394,
427
+ "eval_loss": 1.23201584815979,
428
+ "eval_runtime": 27.6985,
429
+ "eval_samples_per_second": 71.737,
430
+ "eval_steps_per_second": 3.61,
431
+ "step": 4838
432
+ },
433
+ {
434
+ "epoch": 42.0,
435
+ "eval_accuracy": 0.791866741663953,
436
+ "eval_loss": 1.2356910705566406,
437
+ "eval_runtime": 27.7184,
438
+ "eval_samples_per_second": 71.685,
439
+ "eval_steps_per_second": 3.608,
440
+ "step": 4956
441
+ },
442
+ {
443
+ "epoch": 42.37,
444
+ "learning_rate": 7.627118644067798e-06,
445
+ "loss": 0.6219,
446
+ "step": 5000
447
+ },
448
+ {
449
+ "epoch": 43.0,
450
+ "eval_accuracy": 0.791991079270158,
451
+ "eval_loss": 1.23865807056427,
452
+ "eval_runtime": 27.7193,
453
+ "eval_samples_per_second": 71.683,
454
+ "eval_steps_per_second": 3.608,
455
+ "step": 5074
456
+ },
457
+ {
458
+ "epoch": 44.0,
459
+ "eval_accuracy": 0.7919575278526106,
460
+ "eval_loss": 1.2408578395843506,
461
+ "eval_runtime": 27.719,
462
+ "eval_samples_per_second": 71.684,
463
+ "eval_steps_per_second": 3.608,
464
+ "step": 5192
465
+ },
466
+ {
467
+ "epoch": 45.0,
468
+ "eval_accuracy": 0.791886477791922,
469
+ "eval_loss": 1.244328260421753,
470
+ "eval_runtime": 27.7249,
471
+ "eval_samples_per_second": 71.668,
472
+ "eval_steps_per_second": 3.607,
473
+ "step": 5310
474
+ },
475
+ {
476
+ "epoch": 46.0,
477
+ "eval_accuracy": 0.7917897707648737,
478
+ "eval_loss": 1.2477593421936035,
479
+ "eval_runtime": 27.6986,
480
+ "eval_samples_per_second": 71.736,
481
+ "eval_steps_per_second": 3.61,
482
+ "step": 5428
483
+ },
484
+ {
485
+ "epoch": 46.61,
486
+ "learning_rate": 3.3898305084745763e-06,
487
+ "loss": 0.6097,
488
+ "step": 5500
489
+ },
490
+ {
491
+ "epoch": 47.0,
492
+ "eval_accuracy": 0.7917739818624984,
493
+ "eval_loss": 1.2487553358078003,
494
+ "eval_runtime": 27.7426,
495
+ "eval_samples_per_second": 71.623,
496
+ "eval_steps_per_second": 3.605,
497
+ "step": 5546
498
+ },
499
+ {
500
+ "epoch": 48.0,
501
+ "eval_accuracy": 0.7917522721217325,
502
+ "eval_loss": 1.2487850189208984,
503
+ "eval_runtime": 27.7184,
504
+ "eval_samples_per_second": 71.685,
505
+ "eval_steps_per_second": 3.608,
506
+ "step": 5664
507
+ },
508
+ {
509
+ "epoch": 49.0,
510
+ "eval_accuracy": 0.7917424040577479,
511
+ "eval_loss": 1.2495468854904175,
512
+ "eval_runtime": 27.7228,
513
+ "eval_samples_per_second": 71.674,
514
+ "eval_steps_per_second": 3.607,
515
+ "step": 5782
516
+ },
517
+ {
518
+ "epoch": 50.0,
519
+ "eval_accuracy": 0.791762140185717,
520
+ "eval_loss": 1.2498760223388672,
521
+ "eval_runtime": 27.7268,
522
+ "eval_samples_per_second": 71.664,
523
+ "eval_steps_per_second": 3.607,
524
+ "step": 5900
525
+ },
526
+ {
527
+ "epoch": 50.0,
528
+ "step": 5900,
529
+ "total_flos": 6.1908182433792e+16,
530
+ "train_loss": 0.9664699619099245,
531
+ "train_runtime": 20866.4149,
532
+ "train_samples_per_second": 22.711,
533
+ "train_steps_per_second": 0.283
534
+ }
535
+ ],
536
+ "max_steps": 5900,
537
+ "num_train_epochs": 50,
538
+ "total_flos": 6.1908182433792e+16,
539
+ "trial_name": null,
540
+ "trial_params": null
541
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:24e2dd9b341b4a3957a649463d27424b2198bfac1a4b65904a90b5daa586e451
3
+ size 3451
vocab.json ADDED
The diff for this file is too large to render. See raw diff