MichaelOrme committed
Commit 47f6cfc
1 Parent(s): 23c1dda

Upload 13 files

config.json ADDED
@@ -0,0 +1,38 @@
+ {
+   "_name_or_path": "microsoft/DialoGPT-large",
+   "activation_function": "gelu_new",
+   "architectures": [
+     "GPT2LMHeadModel"
+   ],
+   "attn_pdrop": 0.1,
+   "bos_token_id": 50256,
+   "embd_pdrop": 0.1,
+   "eos_token_id": 50256,
+   "initializer_range": 0.02,
+   "layer_norm_epsilon": 1e-05,
+   "model_type": "gpt2",
+   "n_ctx": 1024,
+   "n_embd": 1280,
+   "n_head": 20,
+   "n_inner": null,
+   "n_layer": 36,
+   "n_positions": 1024,
+   "reorder_and_upcast_attn": false,
+   "resid_pdrop": 0.1,
+   "scale_attn_by_inverse_layer_idx": false,
+   "scale_attn_weights": true,
+   "summary_activation": null,
+   "summary_first_dropout": 0.1,
+   "summary_proj_to_labels": true,
+   "summary_type": "cls_index",
+   "summary_use_proj": true,
+   "task_specific_params": {
+     "conversational": {
+       "max_length": 1000
+     }
+   },
+   "torch_dtype": "float32",
+   "transformers_version": "4.33.3",
+   "use_cache": true,
+   "vocab_size": 50257
+ }
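
The config carries the standard GPT-2 hyperparameters inherited from microsoft/DialoGPT-large: 36 layers, 1280-dimensional embeddings, 20 attention heads, and a 1024-token context. A minimal loading sketch, assuming the files from this commit sit in a local directory (MODEL_DIR below is a placeholder, not a real path):

```python
# Minimal loading sketch. Assumes transformers (>= 4.33) is installed and that
# MODEL_DIR points at a directory containing config.json, pytorch_model.bin,
# vocab.json, merges.txt, and the tokenizer files from this commit.
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

MODEL_DIR = "path/to/this/checkpoint"  # placeholder

config = AutoConfig.from_pretrained(MODEL_DIR)
print(config.n_layer, config.n_embd, config.n_head)  # 36 1280 20

tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
model = AutoModelForCausalLM.from_pretrained(MODEL_DIR)  # GPT2LMHeadModel
model.eval()
```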
generation_config.json ADDED
@@ -0,0 +1,6 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 50256,
+   "eos_token_id": 50256,
+   "transformers_version": "4.33.3"
+ }
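
The generation defaults only pin the BOS/EOS ids to 50256 (<|endoftext|>); everything else falls back to the model config, including the conversational max_length of 1000 in task_specific_params. A hedged chat example in the DialoGPT style, reusing the objects from the loading sketch; the sampling settings are illustrative and not taken from this repository:

```python
import torch

# Hedged chat sketch, reusing `tokenizer` and `model` from the loading example.
# DialoGPT-style prompting: end each turn with the EOS token (id 50256).
prompt = "Hello, how are you?" + tokenizer.eos_token
inputs = tokenizer(prompt, return_tensors="pt")

with torch.no_grad():
    output_ids = model.generate(
        **inputs,
        max_length=200,                       # well under the task-specific 1000
        pad_token_id=tokenizer.eos_token_id,  # GPT-2 ships without a pad token
        do_sample=True,                       # illustrative sampling settings,
        top_p=0.95,                           # not taken from this repository
        temperature=0.8,
    )

reply = output_ids[0, inputs["input_ids"].shape[-1]:]
print(tokenizer.decode(reply, skip_special_tokens=True))
```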
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6a512064c53a7902c7e5732596ad87db152d47478a421115fb372bcbe8c10b13
+ size 6192613541
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7c907589f131258a5c958d499703a9c84d8774a64150b9ec7a679d450d0fccde
+ size 3096262685
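
optimizer.pt and pytorch_model.bin (like the other binary files in this commit) are stored as Git LFS pointers, so the three lines above describe the blob rather than containing it. A sketch of resolving the ~3.1 GB weights file through huggingface_hub, where REPO_ID is a placeholder for wherever this repository lives on the Hub:

```python
# Sketch: download the LFS-backed weights with huggingface_hub.
# REPO_ID is a placeholder; pinning revision to this commit may require
# the full 40-character SHA rather than the short one shown here.
from huggingface_hub import hf_hub_download

REPO_ID = "user/repo"  # placeholder

weights_path = hf_hub_download(
    repo_id=REPO_ID,
    filename="pytorch_model.bin",
    revision="47f6cfc",
)
print(weights_path)  # local cache path to the ~3.1 GB file
```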
rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:df86bc1036ccd2ea8489e328a1827c1047e623465c54c482b9e33f3168bf2594
+ size 14575
scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1644f4272a5614e5abec86ab375f8e3e64085c9aa2cdafd4c3ad61d11ff69bb2
+ size 627
special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
+ {
+   "bos_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": "<|endoftext|>",
+   "unk_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,33 @@
+ {
+   "add_bos_token": false,
+   "add_prefix_space": false,
+   "bos_token": {
+     "__type": "AddedToken",
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "clean_up_tokenization_spaces": true,
+   "eos_token": {
+     "__type": "AddedToken",
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "errors": "replace",
+   "model_max_length": 1024,
+   "pad_token": null,
+   "tokenizer_class": "GPT2Tokenizer",
+   "unk_token": {
+     "__type": "AddedToken",
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
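
Note a detail typical of GPT-2 checkpoints: special_tokens_map.json maps pad_token to <|endoftext|>, while tokenizer_config.json leaves pad_token null, so batched tokenization may still need an explicit pad token. A small sketch of the usual workaround, reusing the tokenizer loaded earlier:

```python
# Sketch: give the tokenizer an explicit pad token before batched encoding.
# Reusing EOS as PAD is the common convention for GPT-2 tokenizers.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

batch = tokenizer(
    ["first utterance", "a somewhat longer second utterance"],
    padding=True,
    truncation=True,
    max_length=tokenizer.model_max_length,  # 1024 per tokenizer_config.json
    return_tensors="pt",
)
print(batch["input_ids"].shape, batch["attention_mask"].shape)
```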
trainer_state.json ADDED
@@ -0,0 +1,731 @@
+ {
+   "best_metric": null,
+   "best_model_checkpoint": null,
+   "epoch": 5.0,
+   "eval_steps": 500,
+   "global_step": 56415,
+   "is_hyper_param_search": false,
+   "is_local_process_zero": true,
+   "is_world_process_zero": true,
+   "log_history": [
+     {
+       "epoch": 0.04,
+       "learning_rate": 9.911371089249314e-05,
+       "loss": 0.156,
+       "step": 500
+     },
+     {
+       "epoch": 0.09,
+       "learning_rate": 9.822742178498626e-05,
+       "loss": 0.1245,
+       "step": 1000
+     },
+     {
+       "epoch": 0.13,
+       "learning_rate": 9.73411326774794e-05,
+       "loss": 0.1381,
+       "step": 1500
+     },
+     {
+       "epoch": 0.18,
+       "learning_rate": 9.645484356997254e-05,
+       "loss": 0.1251,
+       "step": 2000
+     },
+     {
+       "epoch": 0.22,
+       "learning_rate": 9.556855446246566e-05,
+       "loss": 0.123,
+       "step": 2500
+     },
+     {
+       "epoch": 0.27,
+       "learning_rate": 9.468226535495879e-05,
+       "loss": 0.1149,
+       "step": 3000
+     },
+     {
+       "epoch": 0.31,
+       "learning_rate": 9.379597624745192e-05,
+       "loss": 0.1299,
+       "step": 3500
+     },
+     {
+       "epoch": 0.35,
+       "learning_rate": 9.290968713994506e-05,
+       "loss": 0.1237,
+       "step": 4000
+     },
+     {
+       "epoch": 0.4,
+       "learning_rate": 9.20233980324382e-05,
+       "loss": 0.1249,
+       "step": 4500
+     },
+     {
+       "epoch": 0.44,
+       "learning_rate": 9.113710892493131e-05,
+       "loss": 0.1191,
+       "step": 5000
+     },
+     {
+       "epoch": 0.49,
+       "learning_rate": 9.025081981742445e-05,
+       "loss": 0.1135,
+       "step": 5500
+     },
+     {
+       "epoch": 0.53,
+       "learning_rate": 8.936453070991759e-05,
+       "loss": 0.114,
+       "step": 6000
+     },
+     {
+       "epoch": 0.58,
+       "learning_rate": 8.84782416024107e-05,
+       "loss": 0.1165,
+       "step": 6500
+     },
+     {
+       "epoch": 0.62,
+       "learning_rate": 8.759195249490384e-05,
+       "loss": 0.1191,
+       "step": 7000
+     },
+     {
+       "epoch": 0.66,
+       "learning_rate": 8.670566338739697e-05,
+       "loss": 0.1164,
+       "step": 7500
+     },
+     {
+       "epoch": 0.71,
+       "learning_rate": 8.581937427989011e-05,
+       "loss": 0.1211,
+       "step": 8000
+     },
+     {
+       "epoch": 0.75,
+       "learning_rate": 8.493308517238323e-05,
+       "loss": 0.1091,
+       "step": 8500
+     },
+     {
+       "epoch": 0.8,
+       "learning_rate": 8.404679606487636e-05,
+       "loss": 0.1096,
+       "step": 9000
+     },
+     {
+       "epoch": 0.84,
+       "learning_rate": 8.31605069573695e-05,
+       "loss": 0.1117,
+       "step": 9500
+     },
+     {
+       "epoch": 0.89,
+       "learning_rate": 8.227421784986264e-05,
+       "loss": 0.1125,
+       "step": 10000
+     },
+     {
+       "epoch": 0.93,
+       "learning_rate": 8.138792874235575e-05,
+       "loss": 0.1193,
+       "step": 10500
+     },
+     {
+       "epoch": 0.97,
+       "learning_rate": 8.050163963484889e-05,
+       "loss": 0.1094,
+       "step": 11000
+     },
+     {
+       "epoch": 1.0,
+       "eval_loss": 0.10602948069572449,
+       "eval_runtime": 418.0898,
+       "eval_samples_per_second": 6.747,
+       "eval_steps_per_second": 3.375,
+       "step": 11283
+     },
+     {
+       "epoch": 1.02,
+       "learning_rate": 7.961535052734203e-05,
+       "loss": 0.0908,
+       "step": 11500
+     },
+     {
+       "epoch": 1.06,
+       "learning_rate": 7.872906141983516e-05,
+       "loss": 0.0683,
+       "step": 12000
+     },
+     {
+       "epoch": 1.11,
+       "learning_rate": 7.784277231232828e-05,
+       "loss": 0.0725,
+       "step": 12500
+     },
+     {
+       "epoch": 1.15,
+       "learning_rate": 7.695648320482141e-05,
+       "loss": 0.0723,
+       "step": 13000
+     },
+     {
+       "epoch": 1.2,
+       "learning_rate": 7.607019409731455e-05,
+       "loss": 0.0707,
+       "step": 13500
+     },
+     {
+       "epoch": 1.24,
+       "learning_rate": 7.518390498980768e-05,
+       "loss": 0.0731,
+       "step": 14000
+     },
+     {
+       "epoch": 1.29,
+       "learning_rate": 7.42976158823008e-05,
+       "loss": 0.0687,
+       "step": 14500
+     },
+     {
+       "epoch": 1.33,
+       "learning_rate": 7.341132677479394e-05,
+       "loss": 0.0686,
+       "step": 15000
+     },
+     {
+       "epoch": 1.37,
+       "learning_rate": 7.252503766728708e-05,
+       "loss": 0.0717,
+       "step": 15500
+     },
+     {
+       "epoch": 1.42,
+       "learning_rate": 7.16387485597802e-05,
+       "loss": 0.0703,
+       "step": 16000
+     },
+     {
+       "epoch": 1.46,
+       "learning_rate": 7.075245945227334e-05,
+       "loss": 0.0702,
+       "step": 16500
+     },
+     {
+       "epoch": 1.51,
+       "learning_rate": 6.986617034476646e-05,
+       "loss": 0.0687,
+       "step": 17000
+     },
+     {
+       "epoch": 1.55,
+       "learning_rate": 6.89798812372596e-05,
+       "loss": 0.0695,
+       "step": 17500
+     },
+     {
+       "epoch": 1.6,
+       "learning_rate": 6.809359212975273e-05,
+       "loss": 0.0667,
+       "step": 18000
+     },
+     {
+       "epoch": 1.64,
+       "learning_rate": 6.720730302224585e-05,
+       "loss": 0.0706,
+       "step": 18500
+     },
+     {
+       "epoch": 1.68,
+       "learning_rate": 6.6321013914739e-05,
+       "loss": 0.061,
+       "step": 19000
+     },
+     {
+       "epoch": 1.73,
+       "learning_rate": 6.543472480723212e-05,
+       "loss": 0.063,
+       "step": 19500
+     },
+     {
+       "epoch": 1.77,
+       "learning_rate": 6.454843569972525e-05,
+       "loss": 0.0692,
+       "step": 20000
+     },
+     {
+       "epoch": 1.82,
+       "learning_rate": 6.366214659221839e-05,
+       "loss": 0.0691,
+       "step": 20500
+     },
+     {
+       "epoch": 1.86,
+       "learning_rate": 6.277585748471153e-05,
+       "loss": 0.0649,
+       "step": 21000
+     },
+     {
+       "epoch": 1.91,
+       "learning_rate": 6.188956837720464e-05,
+       "loss": 0.0629,
+       "step": 21500
+     },
+     {
+       "epoch": 1.95,
+       "learning_rate": 6.100327926969778e-05,
+       "loss": 0.0678,
+       "step": 22000
+     },
+     {
+       "epoch": 1.99,
+       "learning_rate": 6.011699016219091e-05,
+       "loss": 0.0659,
+       "step": 22500
+     },
+     {
+       "epoch": 2.0,
+       "eval_loss": 0.09332570433616638,
+       "eval_runtime": 419.0801,
+       "eval_samples_per_second": 6.731,
+       "eval_steps_per_second": 3.367,
+       "step": 22566
+     },
+     {
+       "epoch": 2.04,
+       "learning_rate": 5.9230701054684045e-05,
+       "loss": 0.0358,
+       "step": 23000
+     },
+     {
+       "epoch": 2.08,
+       "learning_rate": 5.834441194717717e-05,
+       "loss": 0.0333,
+       "step": 23500
+     },
+     {
+       "epoch": 2.13,
+       "learning_rate": 5.7458122839670305e-05,
+       "loss": 0.034,
+       "step": 24000
+     },
+     {
+       "epoch": 2.17,
+       "learning_rate": 5.657183373216344e-05,
+       "loss": 0.0331,
+       "step": 24500
+     },
+     {
+       "epoch": 2.22,
+       "learning_rate": 5.568554462465656e-05,
+       "loss": 0.0353,
+       "step": 25000
+     },
+     {
+       "epoch": 2.26,
+       "learning_rate": 5.47992555171497e-05,
+       "loss": 0.0362,
+       "step": 25500
+     },
+     {
+       "epoch": 2.3,
+       "learning_rate": 5.391296640964283e-05,
+       "loss": 0.0339,
+       "step": 26000
+     },
+     {
+       "epoch": 2.35,
+       "learning_rate": 5.302667730213596e-05,
+       "loss": 0.0332,
+       "step": 26500
+     },
+     {
+       "epoch": 2.39,
+       "learning_rate": 5.214038819462909e-05,
+       "loss": 0.0343,
+       "step": 27000
+     },
+     {
+       "epoch": 2.44,
+       "learning_rate": 5.125409908712222e-05,
+       "loss": 0.0337,
+       "step": 27500
+     },
+     {
+       "epoch": 2.48,
+       "learning_rate": 5.0367809979615356e-05,
+       "loss": 0.0324,
+       "step": 28000
+     },
+     {
+       "epoch": 2.53,
+       "learning_rate": 4.948152087210848e-05,
+       "loss": 0.0341,
+       "step": 28500
+     },
+     {
+       "epoch": 2.57,
+       "learning_rate": 4.8595231764601615e-05,
+       "loss": 0.0332,
+       "step": 29000
+     },
+     {
+       "epoch": 2.61,
+       "learning_rate": 4.770894265709475e-05,
+       "loss": 0.0346,
+       "step": 29500
+     },
+     {
+       "epoch": 2.66,
+       "learning_rate": 4.682265354958788e-05,
+       "loss": 0.0346,
+       "step": 30000
+     },
+     {
+       "epoch": 2.7,
+       "learning_rate": 4.593636444208101e-05,
+       "loss": 0.034,
+       "step": 30500
+     },
+     {
+       "epoch": 2.75,
+       "learning_rate": 4.505007533457414e-05,
+       "loss": 0.0371,
+       "step": 31000
+     },
+     {
+       "epoch": 2.79,
+       "learning_rate": 4.4163786227067274e-05,
+       "loss": 0.0335,
+       "step": 31500
+     },
+     {
+       "epoch": 2.84,
+       "learning_rate": 4.32774971195604e-05,
+       "loss": 0.0337,
+       "step": 32000
+     },
+     {
+       "epoch": 2.88,
+       "learning_rate": 4.239120801205353e-05,
+       "loss": 0.0332,
+       "step": 32500
+     },
+     {
+       "epoch": 2.92,
+       "learning_rate": 4.1504918904546666e-05,
+       "loss": 0.0325,
+       "step": 33000
+     },
+     {
+       "epoch": 2.97,
+       "learning_rate": 4.06186297970398e-05,
+       "loss": 0.0335,
+       "step": 33500
+     },
+     {
+       "epoch": 3.0,
+       "eval_loss": 0.09837915003299713,
+       "eval_runtime": 419.1038,
+       "eval_samples_per_second": 6.731,
+       "eval_steps_per_second": 3.367,
+       "step": 33849
+     },
+     {
+       "epoch": 3.01,
+       "learning_rate": 3.9732340689532926e-05,
+       "loss": 0.028,
+       "step": 34000
+     },
+     {
+       "epoch": 3.06,
+       "learning_rate": 3.884605158202606e-05,
+       "loss": 0.0163,
+       "step": 34500
+     },
+     {
+       "epoch": 3.1,
+       "learning_rate": 3.795976247451919e-05,
+       "loss": 0.0153,
+       "step": 35000
+     },
+     {
+       "epoch": 3.15,
+       "learning_rate": 3.7073473367012325e-05,
+       "loss": 0.0165,
+       "step": 35500
+     },
+     {
+       "epoch": 3.19,
+       "learning_rate": 3.618718425950545e-05,
+       "loss": 0.0167,
+       "step": 36000
+     },
+     {
+       "epoch": 3.23,
+       "learning_rate": 3.5300895151998584e-05,
+       "loss": 0.0164,
+       "step": 36500
+     },
+     {
+       "epoch": 3.28,
+       "learning_rate": 3.441460604449172e-05,
+       "loss": 0.0166,
+       "step": 37000
+     },
+     {
+       "epoch": 3.32,
+       "learning_rate": 3.3528316936984844e-05,
+       "loss": 0.0185,
+       "step": 37500
+     },
+     {
+       "epoch": 3.37,
+       "learning_rate": 3.264202782947798e-05,
+       "loss": 0.0174,
+       "step": 38000
+     },
+     {
+       "epoch": 3.41,
+       "learning_rate": 3.175573872197111e-05,
+       "loss": 0.0176,
+       "step": 38500
+     },
+     {
+       "epoch": 3.46,
+       "learning_rate": 3.086944961446424e-05,
+       "loss": 0.0171,
+       "step": 39000
+     },
+     {
+       "epoch": 3.5,
+       "learning_rate": 2.998316050695737e-05,
+       "loss": 0.0167,
+       "step": 39500
+     },
+     {
+       "epoch": 3.55,
+       "learning_rate": 2.9096871399450502e-05,
+       "loss": 0.0177,
+       "step": 40000
+     },
+     {
+       "epoch": 3.59,
+       "learning_rate": 2.8210582291943632e-05,
+       "loss": 0.0173,
+       "step": 40500
+     },
+     {
+       "epoch": 3.63,
+       "learning_rate": 2.7324293184436765e-05,
+       "loss": 0.0178,
+       "step": 41000
+     },
+     {
+       "epoch": 3.68,
+       "learning_rate": 2.6438004076929895e-05,
+       "loss": 0.0168,
+       "step": 41500
+     },
+     {
+       "epoch": 3.72,
+       "learning_rate": 2.555171496942303e-05,
+       "loss": 0.0172,
+       "step": 42000
+     },
+     {
+       "epoch": 3.77,
+       "learning_rate": 2.4665425861916157e-05,
+       "loss": 0.0171,
+       "step": 42500
+     },
+     {
+       "epoch": 3.81,
+       "learning_rate": 2.377913675440929e-05,
+       "loss": 0.0173,
+       "step": 43000
+     },
+     {
+       "epoch": 3.86,
+       "learning_rate": 2.289284764690242e-05,
+       "loss": 0.0165,
+       "step": 43500
+     },
+     {
+       "epoch": 3.9,
+       "learning_rate": 2.2006558539395553e-05,
+       "loss": 0.0167,
+       "step": 44000
+     },
+     {
+       "epoch": 3.94,
+       "learning_rate": 2.1120269431888683e-05,
+       "loss": 0.0169,
+       "step": 44500
+     },
+     {
+       "epoch": 3.99,
+       "learning_rate": 2.0233980324381816e-05,
+       "loss": 0.0161,
+       "step": 45000
+     },
+     {
+       "epoch": 4.0,
+       "eval_loss": 0.10828191787004471,
+       "eval_runtime": 419.188,
+       "eval_samples_per_second": 6.73,
+       "eval_steps_per_second": 3.366,
+       "step": 45132
+     },
+     {
+       "epoch": 4.03,
+       "learning_rate": 1.9347691216874946e-05,
+       "loss": 0.0116,
+       "step": 45500
+     },
+     {
+       "epoch": 4.08,
+       "learning_rate": 1.8461402109368076e-05,
+       "loss": 0.0098,
+       "step": 46000
+     },
+     {
+       "epoch": 4.12,
+       "learning_rate": 1.757511300186121e-05,
+       "loss": 0.0094,
+       "step": 46500
+     },
+     {
+       "epoch": 4.17,
+       "learning_rate": 1.6688823894354338e-05,
+       "loss": 0.01,
+       "step": 47000
+     },
+     {
+       "epoch": 4.21,
+       "learning_rate": 1.580253478684747e-05,
+       "loss": 0.0099,
+       "step": 47500
+     },
+     {
+       "epoch": 4.25,
+       "learning_rate": 1.4916245679340601e-05,
+       "loss": 0.0099,
+       "step": 48000
+     },
+     {
+       "epoch": 4.3,
+       "learning_rate": 1.4029956571833732e-05,
+       "loss": 0.0098,
+       "step": 48500
+     },
+     {
+       "epoch": 4.34,
+       "learning_rate": 1.3143667464326864e-05,
+       "loss": 0.0097,
+       "step": 49000
+     },
+     {
+       "epoch": 4.39,
+       "learning_rate": 1.2257378356819995e-05,
+       "loss": 0.0097,
+       "step": 49500
+     },
+     {
+       "epoch": 4.43,
+       "learning_rate": 1.1371089249313127e-05,
+       "loss": 0.01,
+       "step": 50000
+     },
+     {
+       "epoch": 4.48,
+       "learning_rate": 1.0484800141806258e-05,
+       "loss": 0.0101,
+       "step": 50500
+     },
+     {
+       "epoch": 4.52,
+       "learning_rate": 9.59851103429939e-06,
+       "loss": 0.01,
+       "step": 51000
+     },
+     {
+       "epoch": 4.56,
+       "learning_rate": 8.71222192679252e-06,
+       "loss": 0.0141,
+       "step": 51500
+     },
+     {
+       "epoch": 4.61,
+       "learning_rate": 7.825932819285652e-06,
+       "loss": 0.0097,
+       "step": 52000
+     },
+     {
+       "epoch": 4.65,
+       "learning_rate": 6.939643711778783e-06,
+       "loss": 0.0097,
+       "step": 52500
+     },
+     {
+       "epoch": 4.7,
+       "learning_rate": 6.053354604271914e-06,
+       "loss": 0.0111,
+       "step": 53000
+     },
+     {
+       "epoch": 4.74,
+       "learning_rate": 5.1670654967650455e-06,
+       "loss": 0.0097,
+       "step": 53500
+     },
+     {
+       "epoch": 4.79,
+       "learning_rate": 4.280776389258176e-06,
+       "loss": 0.0102,
+       "step": 54000
+     },
+     {
+       "epoch": 4.83,
+       "learning_rate": 3.3944872817513074e-06,
+       "loss": 0.0097,
+       "step": 54500
+     },
+     {
+       "epoch": 4.87,
+       "learning_rate": 2.508198174244439e-06,
+       "loss": 0.0097,
+       "step": 55000
+     },
+     {
+       "epoch": 4.92,
+       "learning_rate": 1.62190906673757e-06,
+       "loss": 0.0098,
+       "step": 55500
+     },
+     {
+       "epoch": 4.96,
+       "learning_rate": 7.35619959230701e-07,
+       "loss": 0.0095,
+       "step": 56000
+     },
+     {
+       "epoch": 5.0,
+       "eval_loss": 0.11257254332304001,
+       "eval_runtime": 419.1321,
+       "eval_samples_per_second": 6.731,
+       "eval_steps_per_second": 3.366,
+       "step": 56415
+     }
+   ],
+   "logging_steps": 500,
+   "max_steps": 56415,
+   "num_train_epochs": 5,
+   "save_steps": 500,
+   "total_flos": 4.9105395843072e+17,
+   "trial_name": null,
+   "trial_params": null
+ }
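
trainer_state.json holds the full Trainer log: training loss every 500 steps over 5 epochs (56,415 steps total) plus one evaluation per epoch, with eval_loss moving roughly 0.106 → 0.093 → 0.098 → 0.108 → 0.113. A short sketch for reading those curves back out of the file:

```python
import json

# Sketch: recover the loss curves from trainer_state.json
# (path is relative to the checkpoint directory).
with open("trainer_state.json") as f:
    state = json.load(f)

train_log = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
eval_log = [(e["epoch"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]

print(f"{len(train_log)} training points up to step {state['global_step']}")
for epoch, eval_loss in eval_log:
    print(f"epoch {epoch:.0f}: eval_loss {eval_loss:.4f}")
```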
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:30215808f9c22ea7377cab1d5d0d5eec3bcc709165517c2cfe59b1102a410087
+ size 4091
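
training_args.bin is the pickled TrainingArguments object saved by the Trainer; it can be inspected with torch.load, assuming a transformers version compatible with 4.33.3 is importable:

```python
import torch

# Sketch: inspect the serialized TrainingArguments.
# weights_only=False is required on torch >= 2.6 (the default changed)
# and is accepted by earlier releases that have the argument.
args = torch.load("training_args.bin", weights_only=False)
print(args.num_train_epochs, args.learning_rate, args.per_device_train_batch_size)
```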
vocab.json ADDED
The diff for this file is too large to render. See raw diff