madlag commited on
Commit
50915c2
1 Parent(s): 100f5d7

Initial commit.

Browse files
README.md ADDED
File without changes
config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "albert-base-v2",
3
+ "architectures": [
4
+ "AlbertForQuestionAnswering"
5
+ ],
6
+ "attention_probs_dropout_prob": 0,
7
+ "bos_token_id": 2,
8
+ "classifier_dropout_prob": 0.1,
9
+ "down_scale_factor": 1,
10
+ "embedding_size": 128,
11
+ "eos_token_id": 3,
12
+ "gap_size": 0,
13
+ "hidden_act": "gelu_new",
14
+ "hidden_dropout_prob": 0,
15
+ "hidden_size": 768,
16
+ "initializer_range": 0.02,
17
+ "inner_group_num": 1,
18
+ "intermediate_size": 3072,
19
+ "layer_norm_eps": 1e-12,
20
+ "max_position_embeddings": 512,
21
+ "model_type": "albert",
22
+ "net_structure_type": 0,
23
+ "num_attention_heads": 12,
24
+ "num_hidden_groups": 1,
25
+ "num_hidden_layers": 12,
26
+ "num_memory_blocks": 0,
27
+ "pad_token_id": 0,
28
+ "position_embedding_type": "absolute",
29
+ "transformers_version": "4.5.0.dev0",
30
+ "type_vocab_size": 2,
31
+ "vocab_size": 30000
32
+ }
data_args.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset_cache_dir": "dataset_cache",
3
+ "dataset_config_name": null,
4
+ "dataset_name": "squad",
5
+ "doc_stride": 128,
6
+ "max_answer_length": 30,
7
+ "max_seq_length": 384,
8
+ "n_best_size": 20,
9
+ "null_score_diff_threshold": 0.0,
10
+ "overwrite_cache": 0,
11
+ "pad_to_max_length": true,
12
+ "preprocessing_num_workers": null,
13
+ "train_file": null,
14
+ "validation_file": null,
15
+ "version_2_with_negative": false
16
+ }
eval_metrics.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "exact_match": 83.74645222327341,
3
+ "f1": 90.78776054621733
4
+ }
evaluate_timing.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"eval_elapsed_time": 54.7689094375819, "cuda_eval_elapsed_time": 50.769064575195316}
model_args.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cache_dir": null,
3
+ "config_name": null,
4
+ "model_name_or_path": "albert-base-v2",
5
+ "tokenizer_name": "albert-base-v2",
6
+ "use_fast_tokenizer": true
7
+ }
predictions.json ADDED
The diff for this file is too large to render. See raw diff
 
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cb659ad3854c5d988f0c30549bff33f3ece09c372a37fca2af93f2ab47b5fd81
3
+ size 44393497
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c3efb47b48f2529e5a647ddc807b45fa670bdecf81c6556e90e2318707a57c88
3
+ size 623
sparse_args.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "ampere_pruning_method": "disabled",
3
+ "attention_block_cols": 32,
4
+ "attention_block_rows": 32,
5
+ "attention_lambda": 1.0,
6
+ "attention_output_with_dense": 0,
7
+ "attention_pruning_method": "sigmoied_threshold",
8
+ "bias_mask": true,
9
+ "dense_block_cols": 1,
10
+ "dense_block_rows": 1,
11
+ "dense_lambda": 1.0,
12
+ "dense_pruning_method": "sigmoied_threshold:1d_alt",
13
+ "distil_alpha_ce": 0.1,
14
+ "distil_alpha_teacher": 0.9,
15
+ "distil_teacher_name_or_path": null,
16
+ "distil_temperature": 2.0,
17
+ "eval_with_current_patch_params": 1,
18
+ "final_ampere_temperature": 20.0,
19
+ "final_finetune": false,
20
+ "final_threshold": 1.0,
21
+ "final_warmup": 0.0,
22
+ "gelu_patch": 0,
23
+ "gelu_patch_steps": 50000,
24
+ "initial_ampere_temperature": 0.0,
25
+ "initial_threshold": 1.0,
26
+ "initial_warmup": 1,
27
+ "layer_norm_patch": 0,
28
+ "layer_norm_patch_start_delta": 0.99,
29
+ "layer_norm_patch_steps": 50000,
30
+ "linear_min_parameters": 0,
31
+ "mask_init": "constant",
32
+ "mask_scale": 0.0,
33
+ "mask_scores_learning_rate": 0.01,
34
+ "regularization": "l1",
35
+ "regularization_final_lambda": 0,
36
+ "rewind_model_name_or_path": null
37
+ }
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"bos_token": "[CLS]", "eos_token": "[SEP]", "unk_token": "<unk>", "sep_token": "[SEP]", "pad_token": "<pad>", "cls_token": "[CLS]", "mask_token": {"content": "[MASK]", "single_word": false, "lstrip": true, "rstrip": false, "normalized": false}}
spiece.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fefb02b667a6c5c2fe27602d28e5fb3428f66ab89c7d6f388e7c8d44a02d0336
3
+ size 760289
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"do_lower_case": true, "remove_space": true, "keep_accents": false, "bos_token": "[CLS]", "eos_token": "[SEP]", "unk_token": "<unk>", "sep_token": "[SEP]", "pad_token": "<pad>", "cls_token": "[CLS]", "mask_token": {"content": "[MASK]", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "albert-base-v2"}
trainer_state.json ADDED
@@ -0,0 +1,744 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 1.9927797833935017,
5
+ "global_step": 11040,
6
+ "is_hyper_param_search": true,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "ampere_temperature": 0.0,
12
+ "ce_loss": 4.599947213172912,
13
+ "distil_loss": 0.0,
14
+ "epoch": 0.05,
15
+ "learning_rate": 0.001,
16
+ "loss": 4.5999,
17
+ "nnz_perc": 1.0,
18
+ "progress": 0.0,
19
+ "regu_lambda": 0.0,
20
+ "regu_loss": 0.0,
21
+ "step": 250,
22
+ "threshold": 1.0
23
+ },
24
+ {
25
+ "ampere_temperature": 0.0,
26
+ "ce_loss": 1.8339186582565308,
27
+ "distil_loss": 0.0,
28
+ "epoch": 0.09,
29
+ "learning_rate": 0.002,
30
+ "loss": 1.8339,
31
+ "nnz_perc": 1.0,
32
+ "progress": 0.0,
33
+ "regu_lambda": 0.0,
34
+ "regu_loss": 0.0,
35
+ "step": 500,
36
+ "threshold": 1.0
37
+ },
38
+ {
39
+ "ampere_temperature": 0.0,
40
+ "ce_loss": 1.3567713406085968,
41
+ "distil_loss": 0.0,
42
+ "epoch": 0.14,
43
+ "learning_rate": 0.003,
44
+ "loss": 1.3568,
45
+ "nnz_perc": 1.0,
46
+ "progress": 0.0,
47
+ "regu_lambda": 0.0,
48
+ "regu_loss": 0.0,
49
+ "step": 750,
50
+ "threshold": 1.0
51
+ },
52
+ {
53
+ "ampere_temperature": 0.0,
54
+ "ce_loss": 1.2095605379343033,
55
+ "distil_loss": 0.0,
56
+ "epoch": 0.18,
57
+ "learning_rate": 0.004,
58
+ "loss": 1.2096,
59
+ "nnz_perc": 1.0,
60
+ "progress": 0.0,
61
+ "regu_lambda": 0.0,
62
+ "regu_loss": 0.0,
63
+ "step": 1000,
64
+ "threshold": 1.0
65
+ },
66
+ {
67
+ "ampere_temperature": 0.0,
68
+ "ce_loss": 1.1451576855182648,
69
+ "distil_loss": 0.0,
70
+ "epoch": 0.23,
71
+ "learning_rate": 0.005,
72
+ "loss": 1.1452,
73
+ "nnz_perc": 1.0,
74
+ "progress": 0.0,
75
+ "regu_lambda": 0.0,
76
+ "regu_loss": 0.0,
77
+ "step": 1250,
78
+ "threshold": 1.0
79
+ },
80
+ {
81
+ "ce_loss": 1.1159179366551912,
82
+ "distil_loss": 0.0,
83
+ "epoch": 0.25,
84
+ "eval_ampere_temperature": 0.0,
85
+ "eval_exact_match": 77.360454115421,
86
+ "eval_f1": 86.34721419771964,
87
+ "eval_progress": 0.0,
88
+ "eval_regu_lambda": 0.0,
89
+ "eval_threshold": 1.0,
90
+ "nnz_perc": 1.0,
91
+ "regu_loss": 0.0,
92
+ "step": 1380
93
+ },
94
+ {
95
+ "ampere_temperature": 0.0,
96
+ "ce_loss": 1.0835382461547851,
97
+ "distil_loss": 0.0,
98
+ "epoch": 0.27,
99
+ "learning_rate": 0.006,
100
+ "loss": 1.1004,
101
+ "nnz_perc": 1.0,
102
+ "progress": 0.0,
103
+ "regu_lambda": 0.0,
104
+ "regu_loss": 0.0,
105
+ "step": 1500,
106
+ "threshold": 1.0
107
+ },
108
+ {
109
+ "ampere_temperature": 0.0,
110
+ "ce_loss": 1.1237352261543274,
111
+ "distil_loss": 0.0,
112
+ "epoch": 0.32,
113
+ "learning_rate": 0.006999999999999999,
114
+ "loss": 1.1237,
115
+ "nnz_perc": 1.0,
116
+ "progress": 0.0,
117
+ "regu_lambda": 0.0,
118
+ "regu_loss": 0.0,
119
+ "step": 1750,
120
+ "threshold": 1.0
121
+ },
122
+ {
123
+ "ampere_temperature": 0.0,
124
+ "ce_loss": 1.116382148861885,
125
+ "distil_loss": 0.0,
126
+ "epoch": 0.36,
127
+ "learning_rate": 0.008,
128
+ "loss": 1.1164,
129
+ "nnz_perc": 1.0,
130
+ "progress": 0.0,
131
+ "regu_lambda": 0.0,
132
+ "regu_loss": 0.0,
133
+ "step": 2000,
134
+ "threshold": 1.0
135
+ },
136
+ {
137
+ "ampere_temperature": 0.0,
138
+ "ce_loss": 1.0670130407810212,
139
+ "distil_loss": 0.0,
140
+ "epoch": 0.41,
141
+ "learning_rate": 0.009000000000000001,
142
+ "loss": 1.067,
143
+ "nnz_perc": 1.0,
144
+ "progress": 0.0,
145
+ "regu_lambda": 0.0,
146
+ "regu_loss": 0.0,
147
+ "step": 2250,
148
+ "threshold": 1.0
149
+ },
150
+ {
151
+ "ampere_temperature": 0.0,
152
+ "ce_loss": 1.0639437032938004,
153
+ "distil_loss": 0.0,
154
+ "epoch": 0.45,
155
+ "learning_rate": 0.01,
156
+ "loss": 1.0639,
157
+ "nnz_perc": 1.0,
158
+ "progress": 0.0,
159
+ "regu_lambda": 0.0,
160
+ "regu_loss": 0.0,
161
+ "step": 2500,
162
+ "threshold": 1.0
163
+ },
164
+ {
165
+ "ampere_temperature": 1.6912145472259645,
166
+ "ce_loss": 1.0629408322572709,
167
+ "distil_loss": 0.0,
168
+ "epoch": 0.5,
169
+ "learning_rate": 0.00970862470862471,
170
+ "loss": 1.0629,
171
+ "nnz_perc": 1.0,
172
+ "progress": 0.029020979020979,
173
+ "regu_lambda": 0.0,
174
+ "regu_loss": 0.0,
175
+ "step": 2750,
176
+ "threshold": 1.0
177
+ },
178
+ {
179
+ "ce_loss": 1.0985989689826965,
180
+ "distil_loss": 0.0,
181
+ "epoch": 0.5,
182
+ "eval_ampere_temperature": 1.7570655286803998,
183
+ "eval_exact_match": 75.37369914853359,
184
+ "eval_f1": 85.4846023509551,
185
+ "eval_progress": 0.03018648018648018,
186
+ "eval_regu_lambda": 0.0,
187
+ "eval_threshold": 1.0,
188
+ "nnz_perc": 1.0,
189
+ "regu_loss": 0.0,
190
+ "step": 2760
191
+ },
192
+ {
193
+ "ampere_temperature": 3.2905000860378912,
194
+ "ce_loss": 1.0230497049788634,
195
+ "distil_loss": 0.0,
196
+ "epoch": 0.54,
197
+ "learning_rate": 0.009417249417249416,
198
+ "loss": 1.0261,
199
+ "nnz_perc": 1.0,
200
+ "progress": 0.058158508158508204,
201
+ "regu_lambda": 0.0,
202
+ "regu_loss": 0.0,
203
+ "step": 3000,
204
+ "threshold": 1.0
205
+ },
206
+ {
207
+ "ampere_temperature": 4.793831310474058,
208
+ "ce_loss": 0.9981409941911698,
209
+ "distil_loss": 0.0,
210
+ "epoch": 0.59,
211
+ "learning_rate": 0.009125874125874126,
212
+ "loss": 0.9981,
213
+ "nnz_perc": 1.0,
214
+ "progress": 0.0872960372960373,
215
+ "regu_lambda": 0.0,
216
+ "regu_loss": 0.0,
217
+ "step": 3250,
218
+ "threshold": 1.0
219
+ },
220
+ {
221
+ "ampere_temperature": 6.204176736633212,
222
+ "ce_loss": 1.0074045011997224,
223
+ "distil_loss": 0.0,
224
+ "epoch": 0.63,
225
+ "learning_rate": 0.008834498834498834,
226
+ "loss": 1.0074,
227
+ "nnz_perc": 1.0,
228
+ "progress": 0.1164335664335664,
229
+ "regu_lambda": 0.0,
230
+ "regu_loss": 0.0,
231
+ "step": 3500,
232
+ "threshold": 1.0
233
+ },
234
+ {
235
+ "ampere_temperature": 7.524504880614105,
236
+ "ce_loss": 0.9891920503377915,
237
+ "distil_loss": 0.0,
238
+ "epoch": 0.68,
239
+ "learning_rate": 0.008543123543123544,
240
+ "loss": 0.9892,
241
+ "nnz_perc": 1.0,
242
+ "progress": 0.1455710955710956,
243
+ "regu_lambda": 0.0,
244
+ "regu_loss": 0.0,
245
+ "step": 3750,
246
+ "threshold": 1.0
247
+ },
248
+ {
249
+ "ampere_temperature": 8.757784258515466,
250
+ "ce_loss": 1.0083434996008873,
251
+ "distil_loss": 0.0,
252
+ "epoch": 0.72,
253
+ "learning_rate": 0.008251748251748252,
254
+ "loss": 1.0083,
255
+ "nnz_perc": 1.0,
256
+ "progress": 0.1747086247086247,
257
+ "regu_lambda": 0.0,
258
+ "regu_loss": 0.0,
259
+ "step": 4000,
260
+ "threshold": 1.0
261
+ },
262
+ {
263
+ "ce_loss": 0.9699458577803203,
264
+ "distil_loss": 0.0,
265
+ "epoch": 0.75,
266
+ "eval_ampere_temperature": 9.411504281933276,
267
+ "eval_exact_match": 79.94323557237465,
268
+ "eval_f1": 88.17033886272301,
269
+ "eval_progress": 0.191025641025641,
270
+ "eval_regu_lambda": 0.0,
271
+ "eval_threshold": 1.0,
272
+ "nnz_perc": 1.0,
273
+ "regu_loss": 0.0,
274
+ "step": 4140
275
+ },
276
+ {
277
+ "ampere_temperature": 9.906983386436048,
278
+ "ce_loss": 0.9698418254202062,
279
+ "distil_loss": 0.0,
280
+ "epoch": 0.77,
281
+ "learning_rate": 0.00796037296037296,
282
+ "loss": 0.9699,
283
+ "nnz_perc": 1.0,
284
+ "progress": 0.2038461538461538,
285
+ "regu_lambda": 0.0,
286
+ "regu_loss": 0.0,
287
+ "step": 4250,
288
+ "threshold": 1.0
289
+ },
290
+ {
291
+ "ampere_temperature": 10.97507078047459,
292
+ "ce_loss": 0.9425091907978058,
293
+ "distil_loss": 0.0,
294
+ "epoch": 0.81,
295
+ "learning_rate": 0.007668997668997669,
296
+ "loss": 0.9425,
297
+ "nnz_perc": 1.0,
298
+ "progress": 0.232983682983683,
299
+ "regu_lambda": 0.0,
300
+ "regu_loss": 0.0,
301
+ "step": 4500,
302
+ "threshold": 1.0
303
+ },
304
+ {
305
+ "ampere_temperature": 11.965014956729835,
306
+ "ce_loss": 0.9731772248744964,
307
+ "distil_loss": 0.0,
308
+ "epoch": 0.86,
309
+ "learning_rate": 0.007377622377622378,
310
+ "loss": 0.9732,
311
+ "nnz_perc": 1.0,
312
+ "progress": 0.2621212121212122,
313
+ "regu_lambda": 0.0,
314
+ "regu_loss": 0.0,
315
+ "step": 4750,
316
+ "threshold": 1.0
317
+ },
318
+ {
319
+ "ampere_temperature": 12.879784431300521,
320
+ "ce_loss": 0.9197172073125839,
321
+ "distil_loss": 0.0,
322
+ "epoch": 0.9,
323
+ "learning_rate": 0.007086247086247086,
324
+ "loss": 0.9197,
325
+ "nnz_perc": 1.0,
326
+ "progress": 0.2912587412587413,
327
+ "regu_lambda": 0.0,
328
+ "regu_loss": 0.0,
329
+ "step": 5000,
330
+ "threshold": 1.0
331
+ },
332
+ {
333
+ "ampere_temperature": 13.722347720285395,
334
+ "ce_loss": 0.9390108388662338,
335
+ "distil_loss": 0.0,
336
+ "epoch": 0.95,
337
+ "learning_rate": 0.006794871794871795,
338
+ "loss": 0.939,
339
+ "nnz_perc": 1.0,
340
+ "progress": 0.3203962703962704,
341
+ "regu_lambda": 0.0,
342
+ "regu_loss": 0.0,
343
+ "step": 5250,
344
+ "threshold": 1.0
345
+ },
346
+ {
347
+ "ampere_temperature": 14.495673339783197,
348
+ "ce_loss": 0.9188237161636352,
349
+ "distil_loss": 0.0,
350
+ "epoch": 0.99,
351
+ "learning_rate": 0.006503496503496503,
352
+ "loss": 0.9188,
353
+ "nnz_perc": 1.0,
354
+ "progress": 0.3495337995337995,
355
+ "regu_lambda": 0.0,
356
+ "regu_loss": 0.0,
357
+ "step": 5500,
358
+ "threshold": 1.0
359
+ },
360
+ {
361
+ "ce_loss": 0.9402093678712845,
362
+ "distil_loss": 0.0,
363
+ "epoch": 1.0,
364
+ "eval_ampere_temperature": 14.55463723501537,
365
+ "eval_exact_match": 81.63670766319773,
366
+ "eval_f1": 89.21446798933258,
367
+ "eval_progress": 0.35186480186480185,
368
+ "eval_regu_lambda": 0.0,
369
+ "eval_threshold": 1.0,
370
+ "nnz_perc": 1.0,
371
+ "regu_loss": 0.0,
372
+ "step": 5520
373
+ },
374
+ {
375
+ "ampere_temperature": 15.202729805892675,
376
+ "ce_loss": 0.7292252867118172,
377
+ "distil_loss": 0.0,
378
+ "epoch": 1.04,
379
+ "learning_rate": 0.006212121212121212,
380
+ "loss": 0.7461,
381
+ "nnz_perc": 1.0,
382
+ "progress": 0.3786713286713287,
383
+ "regu_lambda": 0.0,
384
+ "regu_loss": 0.0,
385
+ "step": 5750,
386
+ "threshold": 1.0
387
+ },
388
+ {
389
+ "ampere_temperature": 15.846485634712565,
390
+ "ce_loss": 0.7380791381597519,
391
+ "distil_loss": 0.0,
392
+ "epoch": 1.08,
393
+ "learning_rate": 0.005920745920745921,
394
+ "loss": 0.7381,
395
+ "nnz_perc": 1.0,
396
+ "progress": 0.4078088578088578,
397
+ "regu_lambda": 0.0,
398
+ "regu_loss": 0.0,
399
+ "step": 6000,
400
+ "threshold": 1.0
401
+ },
402
+ {
403
+ "ampere_temperature": 16.429909342341613,
404
+ "ce_loss": 0.7548821606636047,
405
+ "distil_loss": 0.0,
406
+ "epoch": 1.13,
407
+ "learning_rate": 0.005629370629370629,
408
+ "loss": 0.7549,
409
+ "nnz_perc": 1.0,
410
+ "progress": 0.436946386946387,
411
+ "regu_lambda": 0.0,
412
+ "regu_loss": 0.0,
413
+ "step": 6250,
414
+ "threshold": 1.0
415
+ },
416
+ {
417
+ "ampere_temperature": 16.955969444878562,
418
+ "ce_loss": 0.7157313173413277,
419
+ "distil_loss": 0.0,
420
+ "epoch": 1.17,
421
+ "learning_rate": 0.005337995337995338,
422
+ "loss": 0.7157,
423
+ "nnz_perc": 1.0,
424
+ "progress": 0.4660839160839161,
425
+ "regu_lambda": 0.0,
426
+ "regu_loss": 0.0,
427
+ "step": 6500,
428
+ "threshold": 1.0
429
+ },
430
+ {
431
+ "ampere_temperature": 17.427634458422148,
432
+ "ce_loss": 0.7611533465385437,
433
+ "distil_loss": 0.0,
434
+ "epoch": 1.22,
435
+ "learning_rate": 0.005046620046620046,
436
+ "loss": 0.7612,
437
+ "nnz_perc": 1.0,
438
+ "progress": 0.4952214452214452,
439
+ "regu_lambda": 0.0,
440
+ "regu_loss": 0.0,
441
+ "step": 6750,
442
+ "threshold": 1.0
443
+ },
444
+ {
445
+ "ce_loss": 0.7508984424670537,
446
+ "distil_loss": 0.0,
447
+ "epoch": 1.25,
448
+ "eval_ampere_temperature": 17.68575872652857,
449
+ "eval_exact_match": 81.51371807000946,
450
+ "eval_f1": 88.80037767793473,
451
+ "eval_progress": 0.5127039627039627,
452
+ "eval_regu_lambda": 0.0,
453
+ "eval_threshold": 1.0,
454
+ "nnz_perc": 1.0,
455
+ "regu_loss": 0.0,
456
+ "step": 6900
457
+ },
458
+ {
459
+ "ampere_temperature": 17.847872899071124,
460
+ "ce_loss": 0.6947148644924164,
461
+ "distil_loss": 0.0,
462
+ "epoch": 1.26,
463
+ "learning_rate": 0.004755244755244755,
464
+ "loss": 0.7284,
465
+ "nnz_perc": 1.0,
466
+ "progress": 0.5243589743589744,
467
+ "regu_lambda": 0.0,
468
+ "regu_loss": 0.0,
469
+ "step": 7000,
470
+ "threshold": 1.0
471
+ },
472
+ {
473
+ "ampere_temperature": 18.219653282924224,
474
+ "ce_loss": 0.7663285417556762,
475
+ "distil_loss": 0.0,
476
+ "epoch": 1.31,
477
+ "learning_rate": 0.004463869463869464,
478
+ "loss": 0.7663,
479
+ "nnz_perc": 1.0,
480
+ "progress": 0.5534965034965035,
481
+ "regu_lambda": 0.0,
482
+ "regu_loss": 0.0,
483
+ "step": 7250,
484
+ "threshold": 1.0
485
+ },
486
+ {
487
+ "ampere_temperature": 18.545944126080197,
488
+ "ce_loss": 0.691897637873888,
489
+ "distil_loss": 0.0,
490
+ "epoch": 1.35,
491
+ "learning_rate": 0.004172494172494173,
492
+ "loss": 0.6919,
493
+ "nnz_perc": 1.0,
494
+ "progress": 0.5826340326340327,
495
+ "regu_lambda": 0.0,
496
+ "regu_loss": 0.0,
497
+ "step": 7500,
498
+ "threshold": 1.0
499
+ },
500
+ {
501
+ "ampere_temperature": 18.82971394463778,
502
+ "ce_loss": 0.7088325002193451,
503
+ "distil_loss": 0.0,
504
+ "epoch": 1.4,
505
+ "learning_rate": 0.0038811188811188812,
506
+ "loss": 0.7088,
507
+ "nnz_perc": 1.0,
508
+ "progress": 0.6117715617715618,
509
+ "regu_lambda": 0.0,
510
+ "regu_loss": 0.0,
511
+ "step": 7750,
512
+ "threshold": 1.0
513
+ },
514
+ {
515
+ "ampere_temperature": 19.07393125469572,
516
+ "ce_loss": 0.7107383124232293,
517
+ "distil_loss": 0.0,
518
+ "epoch": 1.44,
519
+ "learning_rate": 0.0035897435897435897,
520
+ "loss": 0.7107,
521
+ "nnz_perc": 1.0,
522
+ "progress": 0.6409090909090909,
523
+ "regu_lambda": 0.0,
524
+ "regu_loss": 0.0,
525
+ "step": 8000,
526
+ "threshold": 1.0
527
+ },
528
+ {
529
+ "ampere_temperature": 19.281564572352753,
530
+ "ce_loss": 0.7073436776399612,
531
+ "distil_loss": 0.0,
532
+ "epoch": 1.49,
533
+ "learning_rate": 0.0032983682983682983,
534
+ "loss": 0.7073,
535
+ "nnz_perc": 1.0,
536
+ "progress": 0.6700466200466201,
537
+ "regu_lambda": 0.0,
538
+ "regu_loss": 0.0,
539
+ "step": 8250,
540
+ "threshold": 1.0
541
+ },
542
+ {
543
+ "ce_loss": 0.7176821072896321,
544
+ "distil_loss": 0.0,
545
+ "epoch": 1.49,
546
+ "eval_ampere_temperature": 19.304163095074752,
547
+ "eval_exact_match": 82.71523178807946,
548
+ "eval_f1": 89.82467226075393,
549
+ "eval_progress": 0.6735431235431235,
550
+ "eval_regu_lambda": 0.0,
551
+ "eval_threshold": 1.0,
552
+ "nnz_perc": 1.0,
553
+ "regu_loss": 0.0,
554
+ "step": 8280
555
+ },
556
+ {
557
+ "ampere_temperature": 19.455582413707628,
558
+ "ce_loss": 0.7027889224615964,
559
+ "distil_loss": 0.0,
560
+ "epoch": 1.53,
561
+ "learning_rate": 0.0030069930069930068,
562
+ "loss": 0.7046,
563
+ "nnz_perc": 1.0,
564
+ "progress": 0.6991841491841492,
565
+ "regu_lambda": 0.0,
566
+ "regu_loss": 0.0,
567
+ "step": 8500,
568
+ "threshold": 1.0
569
+ },
570
+ {
571
+ "ampere_temperature": 19.598953294859086,
572
+ "ce_loss": 0.6954642720222474,
573
+ "distil_loss": 0.0,
574
+ "epoch": 1.58,
575
+ "learning_rate": 0.0027156177156177157,
576
+ "loss": 0.6955,
577
+ "nnz_perc": 1.0,
578
+ "progress": 0.7283216783216783,
579
+ "regu_lambda": 0.0,
580
+ "regu_loss": 0.0,
581
+ "step": 8750,
582
+ "threshold": 1.0
583
+ },
584
+ {
585
+ "ampere_temperature": 19.71464573190587,
586
+ "ce_loss": 0.7050508892536164,
587
+ "distil_loss": 0.0,
588
+ "epoch": 1.62,
589
+ "learning_rate": 0.0024242424242424242,
590
+ "loss": 0.7051,
591
+ "nnz_perc": 1.0,
592
+ "progress": 0.7574592074592075,
593
+ "regu_lambda": 0.0,
594
+ "regu_loss": 0.0,
595
+ "step": 9000,
596
+ "threshold": 1.0
597
+ },
598
+ {
599
+ "ampere_temperature": 19.805628240946717,
600
+ "ce_loss": 0.6534205512404442,
601
+ "distil_loss": 0.0,
602
+ "epoch": 1.67,
603
+ "learning_rate": 0.0021328671328671328,
604
+ "loss": 0.6534,
605
+ "nnz_perc": 1.0,
606
+ "progress": 0.7865967365967366,
607
+ "regu_lambda": 0.0,
608
+ "regu_loss": 0.0,
609
+ "step": 9250,
610
+ "threshold": 1.0
611
+ },
612
+ {
613
+ "ampere_temperature": 19.874869338080376,
614
+ "ce_loss": 0.6931327093839645,
615
+ "distil_loss": 0.0,
616
+ "epoch": 1.71,
617
+ "learning_rate": 0.0018414918414918417,
618
+ "loss": 0.6931,
619
+ "nnz_perc": 1.0,
620
+ "progress": 0.8157342657342658,
621
+ "regu_lambda": 0.0,
622
+ "regu_loss": 0.0,
623
+ "step": 9500,
624
+ "threshold": 1.0
625
+ },
626
+ {
627
+ "ce_loss": 0.6803905916400254,
628
+ "distil_loss": 0.0,
629
+ "epoch": 1.74,
630
+ "eval_ampere_temperature": 19.90914467925581,
631
+ "eval_exact_match": 83.3112582781457,
632
+ "eval_f1": 90.48253679391624,
633
+ "eval_progress": 0.8343822843822843,
634
+ "eval_regu_lambda": 0.0,
635
+ "eval_threshold": 1.0,
636
+ "nnz_perc": 1.0,
637
+ "regu_loss": 0.0,
638
+ "step": 9660
639
+ },
640
+ {
641
+ "ampere_temperature": 19.925337539405586,
642
+ "ce_loss": 0.6604658047358195,
643
+ "distil_loss": 0.0,
644
+ "epoch": 1.76,
645
+ "learning_rate": 0.0015501165501165502,
646
+ "loss": 0.6732,
647
+ "nnz_perc": 1.0,
648
+ "progress": 0.8448717948717949,
649
+ "regu_lambda": 0.0,
650
+ "regu_loss": 0.0,
651
+ "step": 9750,
652
+ "threshold": 1.0
653
+ },
654
+ {
655
+ "ampere_temperature": 19.960001361021092,
656
+ "ce_loss": 0.6589477426409721,
657
+ "distil_loss": 0.0,
658
+ "epoch": 1.81,
659
+ "learning_rate": 0.001258741258741259,
660
+ "loss": 0.6589,
661
+ "nnz_perc": 1.0,
662
+ "progress": 0.874009324009324,
663
+ "regu_lambda": 0.0,
664
+ "regu_loss": 0.0,
665
+ "step": 10000,
666
+ "threshold": 1.0
667
+ },
668
+ {
669
+ "ampere_temperature": 19.981829319025636,
670
+ "ce_loss": 0.6645486508607864,
671
+ "distil_loss": 0.0,
672
+ "epoch": 1.85,
673
+ "learning_rate": 0.0009673659673659674,
674
+ "loss": 0.6645,
675
+ "nnz_perc": 1.0,
676
+ "progress": 0.9031468531468532,
677
+ "regu_lambda": 0.0,
678
+ "regu_loss": 0.0,
679
+ "step": 10250,
680
+ "threshold": 1.0
681
+ },
682
+ {
683
+ "ampere_temperature": 19.99378992951796,
684
+ "ce_loss": 0.6627120378017426,
685
+ "distil_loss": 0.0,
686
+ "epoch": 1.9,
687
+ "learning_rate": 0.000675990675990676,
688
+ "loss": 0.6627,
689
+ "nnz_perc": 1.0,
690
+ "progress": 0.9322843822843823,
691
+ "regu_lambda": 0.0,
692
+ "regu_loss": 0.0,
693
+ "step": 10500,
694
+ "threshold": 1.0
695
+ },
696
+ {
697
+ "ampere_temperature": 19.998851708596806,
698
+ "ce_loss": 0.6525639802217483,
699
+ "distil_loss": 0.0,
700
+ "epoch": 1.94,
701
+ "learning_rate": 0.00038461538461538467,
702
+ "loss": 0.6526,
703
+ "nnz_perc": 1.0,
704
+ "progress": 0.9614219114219115,
705
+ "regu_lambda": 0.0,
706
+ "regu_loss": 0.0,
707
+ "step": 10750,
708
+ "threshold": 1.0
709
+ },
710
+ {
711
+ "ampere_temperature": 19.999983172360917,
712
+ "ce_loss": 0.630506355702877,
713
+ "distil_loss": 0.0,
714
+ "epoch": 1.99,
715
+ "learning_rate": 9.324009324009324e-05,
716
+ "loss": 0.6305,
717
+ "nnz_perc": 1.0,
718
+ "progress": 0.9905594405594406,
719
+ "regu_lambda": 0.0,
720
+ "regu_loss": 0.0,
721
+ "step": 11000,
722
+ "threshold": 1.0
723
+ },
724
+ {
725
+ "ce_loss": 0.6976410485804081,
726
+ "distil_loss": 0.0,
727
+ "epoch": 1.99,
728
+ "eval_ampere_temperature": 19.99999781767362,
729
+ "eval_exact_match": 83.74645222327341,
730
+ "eval_f1": 90.78776054621733,
731
+ "eval_progress": 0.9952214452214452,
732
+ "eval_regu_lambda": 0.0,
733
+ "eval_threshold": 1.0,
734
+ "nnz_perc": 1.0,
735
+ "regu_loss": 0.0,
736
+ "step": 11040
737
+ }
738
+ ],
739
+ "max_steps": 11080,
740
+ "num_train_epochs": 2,
741
+ "total_flos": 0,
742
+ "trial_name": "hp_mnop-albert-base-v2_tn-albert-base-v2_od-__data_2to__devel_data__nn_pruning__output_sequence__squad_test_teacher___es-steps_pdebs128_nte2_ws2500_ls250_ss1380_stl50_est1380_rn-__da--3c944a736efd9cf3",
743
+ "trial_params": {}
744
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:47449325d252d34ff18fda0c7a3bf3dd57546e99a7f8befb130692cf2688c4fb
3
+ size 2479