yuyang commited on
Commit
bdb75dc
1 Parent(s): 791a54d

upload ckpt files

Browse files
config.json ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "sshleifer/distilbart-cnn-12-3",
3
+ "_num_labels": 3,
4
+ "activation_dropout": 0.0,
5
+ "activation_function": "gelu",
6
+ "add_bias_logits": false,
7
+ "add_final_layer_norm": false,
8
+ "architectures": [
9
+ "NFBart"
10
+ ],
11
+ "attention_dropout": 0.0,
12
+ "bos_token_id": 0,
13
+ "classif_dropout": 0.0,
14
+ "classifier_dropout": 0.0,
15
+ "d_model": 1024,
16
+ "decoder_attention_heads": 16,
17
+ "decoder_ffn_dim": 4096,
18
+ "decoder_layerdrop": 0.0,
19
+ "decoder_layers": 3,
20
+ "decoder_start_token_id": 2,
21
+ "dropout": 0.1,
22
+ "early_stopping": true,
23
+ "encoder_attention_heads": 16,
24
+ "encoder_ffn_dim": 4096,
25
+ "encoder_layerdrop": 0.0,
26
+ "encoder_layers": 12,
27
+ "eos_token_id": 2,
28
+ "extra_pos_embeddings": 2,
29
+ "force_bos_token_to_be_generated": true,
30
+ "forced_bos_token_id": 0,
31
+ "forced_eos_token_id": 2,
32
+ "gradient_checkpointing": false,
33
+ "id2label": {
34
+ "0": "LABEL_0",
35
+ "1": "LABEL_1",
36
+ "2": "LABEL_2"
37
+ },
38
+ "init_std": 0.02,
39
+ "is_encoder_decoder": true,
40
+ "label2id": {
41
+ "LABEL_0": 0,
42
+ "LABEL_1": 1,
43
+ "LABEL_2": 2
44
+ },
45
+ "length_penalty": 2.0,
46
+ "max_length": 142,
47
+ "max_position_embeddings": 1024,
48
+ "min_length": 56,
49
+ "model_type": "bart",
50
+ "no_repeat_ngram_size": 3,
51
+ "normalize_before": false,
52
+ "normalize_embedding": true,
53
+ "num_beams": 4,
54
+ "num_hidden_layers": 12,
55
+ "output_past": true,
56
+ "pad_token_id": 1,
57
+ "prefix": " ",
58
+ "save_step": 14,
59
+ "scale_embedding": false,
60
+ "static_position_embeddings": false,
61
+ "task_specific_params": {
62
+ "summarization": {
63
+ "early_stopping": true,
64
+ "length_penalty": 2.0,
65
+ "max_length": 142,
66
+ "min_length": 56,
67
+ "no_repeat_ngram_size": 3,
68
+ "num_beams": 4
69
+ }
70
+ },
71
+ "torch_dtype": "float32",
72
+ "transformers_version": "4.25.1",
73
+ "use_cache": true,
74
+ "vocab_size": 50265
75
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6e72e0975dfbe0455970b9b0a0b404fa14de3dfedba3ae44792e9461cbae2dd9
3
+ size 1051630133
special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "cls_token": "<s>",
4
+ "eos_token": "</s>",
5
+ "mask_token": {
6
+ "content": "<mask>",
7
+ "lstrip": true,
8
+ "normalized": true,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ },
12
+ "pad_token": "<pad>",
13
+ "sep_token": "</s>",
14
+ "unk_token": "<unk>"
15
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "bos_token": {
4
+ "__type": "AddedToken",
5
+ "content": "<s>",
6
+ "lstrip": false,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false
10
+ },
11
+ "cls_token": {
12
+ "__type": "AddedToken",
13
+ "content": "<s>",
14
+ "lstrip": false,
15
+ "normalized": true,
16
+ "rstrip": false,
17
+ "single_word": false
18
+ },
19
+ "eos_token": {
20
+ "__type": "AddedToken",
21
+ "content": "</s>",
22
+ "lstrip": false,
23
+ "normalized": true,
24
+ "rstrip": false,
25
+ "single_word": false
26
+ },
27
+ "errors": "replace",
28
+ "mask_token": {
29
+ "__type": "AddedToken",
30
+ "content": "<mask>",
31
+ "lstrip": true,
32
+ "normalized": true,
33
+ "rstrip": false,
34
+ "single_word": false
35
+ },
36
+ "model_max_length": 1024,
37
+ "name_or_path": "sshleifer/distilbart-cnn-12-3",
38
+ "pad_token": {
39
+ "__type": "AddedToken",
40
+ "content": "<pad>",
41
+ "lstrip": false,
42
+ "normalized": true,
43
+ "rstrip": false,
44
+ "single_word": false
45
+ },
46
+ "sep_token": {
47
+ "__type": "AddedToken",
48
+ "content": "</s>",
49
+ "lstrip": false,
50
+ "normalized": true,
51
+ "rstrip": false,
52
+ "single_word": false
53
+ },
54
+ "special_tokens_map_file": "/home/yang6367/.cache/huggingface/hub/models--sshleifer--distilbart-cnn-12-3/snapshots/e3a8f0dafad8b99df3209ff7cf33e1f8402619e1/special_tokens_map.json",
55
+ "tokenizer_class": "BartTokenizer",
56
+ "trim_offsets": true,
57
+ "unk_token": {
58
+ "__type": "AddedToken",
59
+ "content": "<unk>",
60
+ "lstrip": false,
61
+ "normalized": true,
62
+ "rstrip": false,
63
+ "single_word": false
64
+ }
65
+ }
trainer_state.json ADDED
@@ -0,0 +1,1500 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 5.177720546722412,
3
+ "best_model_checkpoint": "../checkpoints/nf-distilbart_12_3-rqnsf-lagtrain-augmented/checkpoint-212000",
4
+ "epoch": 2.953510079549729,
5
+ "global_step": 212000,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.03,
12
+ "gate_score": 0.4928,
13
+ "learning_rate": 0.0,
14
+ "loss": 573.0775,
15
+ "nf_loss": 569.8872,
16
+ "ppl": 32.0588,
17
+ "step": 2000
18
+ },
19
+ {
20
+ "epoch": 0.06,
21
+ "gate_score": 0.4835,
22
+ "learning_rate": 0.0,
23
+ "loss": 341.9544,
24
+ "nf_loss": 339.8841,
25
+ "ppl": 9.249,
26
+ "step": 4000
27
+ },
28
+ {
29
+ "epoch": 0.06,
30
+ "eval_loss": 256.18603515625,
31
+ "eval_nf_loss": 253.6398468017578,
32
+ "eval_perplexity": 13.794677734375,
33
+ "eval_runtime": 646.8656,
34
+ "eval_samples_per_second": 20.666,
35
+ "eval_steps_per_second": 2.583,
36
+ "step": 4000
37
+ },
38
+ {
39
+ "epoch": 0.08,
40
+ "gate_score": 0.468,
41
+ "learning_rate": 0.0,
42
+ "loss": 216.1715,
43
+ "nf_loss": 214.5228,
44
+ "ppl": 5.9033,
45
+ "step": 6000
46
+ },
47
+ {
48
+ "epoch": 0.11,
49
+ "gate_score": 0.4488,
50
+ "learning_rate": 0.0,
51
+ "loss": 143.1113,
52
+ "nf_loss": 141.682,
53
+ "ppl": 4.666,
54
+ "step": 8000
55
+ },
56
+ {
57
+ "epoch": 0.11,
58
+ "eval_loss": 111.98719024658203,
59
+ "eval_nf_loss": 109.95372009277344,
60
+ "eval_perplexity": 8.187104225158691,
61
+ "eval_runtime": 836.6322,
62
+ "eval_samples_per_second": 15.978,
63
+ "eval_steps_per_second": 1.997,
64
+ "step": 8000
65
+ },
66
+ {
67
+ "epoch": 0.14,
68
+ "gate_score": 0.4331,
69
+ "learning_rate": 0.0,
70
+ "loss": 97.2206,
71
+ "nf_loss": 95.8896,
72
+ "ppl": 4.2098,
73
+ "step": 10000
74
+ },
75
+ {
76
+ "epoch": 0.17,
77
+ "gate_score": 0.4146,
78
+ "learning_rate": 0.0,
79
+ "loss": 72.4608,
80
+ "nf_loss": 71.1644,
81
+ "ppl": 4.0666,
82
+ "step": 12000
83
+ },
84
+ {
85
+ "epoch": 0.17,
86
+ "eval_loss": 60.48736572265625,
87
+ "eval_nf_loss": 58.55850601196289,
88
+ "eval_perplexity": 7.357144355773926,
89
+ "eval_runtime": 847.0188,
90
+ "eval_samples_per_second": 15.782,
91
+ "eval_steps_per_second": 1.973,
92
+ "step": 12000
93
+ },
94
+ {
95
+ "epoch": 0.2,
96
+ "gate_score": 0.3943,
97
+ "learning_rate": 0.0,
98
+ "loss": 58.0258,
99
+ "nf_loss": 56.7432,
100
+ "ppl": 4.0134,
101
+ "step": 14000
102
+ },
103
+ {
104
+ "epoch": 0.22,
105
+ "gate_score": 0.373,
106
+ "learning_rate": 0.0,
107
+ "loss": 45.4137,
108
+ "nf_loss": 44.1607,
109
+ "ppl": 3.8803,
110
+ "step": 16000
111
+ },
112
+ {
113
+ "epoch": 0.22,
114
+ "eval_loss": 35.780086517333984,
115
+ "eval_nf_loss": 33.906742095947266,
116
+ "eval_perplexity": 6.943728923797607,
117
+ "eval_runtime": 829.3361,
118
+ "eval_samples_per_second": 16.119,
119
+ "eval_steps_per_second": 2.015,
120
+ "step": 16000
121
+ },
122
+ {
123
+ "epoch": 0.25,
124
+ "gate_score": 0.3552,
125
+ "learning_rate": 0.0,
126
+ "loss": 33.838,
127
+ "nf_loss": 32.5964,
128
+ "ppl": 3.8548,
129
+ "step": 18000
130
+ },
131
+ {
132
+ "epoch": 0.28,
133
+ "gate_score": 0.3346,
134
+ "learning_rate": 0.0,
135
+ "loss": 28.827,
136
+ "nf_loss": 27.5837,
137
+ "ppl": 3.8361,
138
+ "step": 20000
139
+ },
140
+ {
141
+ "epoch": 0.28,
142
+ "eval_loss": 24.475879669189453,
143
+ "eval_nf_loss": 22.578872680664062,
144
+ "eval_perplexity": 7.1263651847839355,
145
+ "eval_runtime": 878.8848,
146
+ "eval_samples_per_second": 15.21,
147
+ "eval_steps_per_second": 1.901,
148
+ "step": 20000
149
+ },
150
+ {
151
+ "epoch": 0.31,
152
+ "gate_score": 0.3166,
153
+ "learning_rate": 0.0,
154
+ "loss": 25.3406,
155
+ "nf_loss": 24.0903,
156
+ "ppl": 3.8804,
157
+ "step": 22000
158
+ },
159
+ {
160
+ "epoch": 0.33,
161
+ "gate_score": 0.2976,
162
+ "learning_rate": 0.0,
163
+ "loss": 22.6997,
164
+ "nf_loss": 21.451,
165
+ "ppl": 3.8523,
166
+ "step": 24000
167
+ },
168
+ {
169
+ "epoch": 0.33,
170
+ "eval_loss": 20.067764282226562,
171
+ "eval_nf_loss": 18.16155242919922,
172
+ "eval_perplexity": 7.192458152770996,
173
+ "eval_runtime": 909.546,
174
+ "eval_samples_per_second": 14.697,
175
+ "eval_steps_per_second": 1.837,
176
+ "step": 24000
177
+ },
178
+ {
179
+ "epoch": 0.36,
180
+ "gate_score": 0.2793,
181
+ "learning_rate": 0.0,
182
+ "loss": 20.7264,
183
+ "nf_loss": 19.4539,
184
+ "ppl": 3.9698,
185
+ "step": 26000
186
+ },
187
+ {
188
+ "epoch": 0.39,
189
+ "gate_score": 0.2666,
190
+ "learning_rate": 0.0,
191
+ "loss": 19.1146,
192
+ "nf_loss": 17.8629,
193
+ "ppl": 3.8907,
194
+ "step": 28000
195
+ },
196
+ {
197
+ "epoch": 0.39,
198
+ "eval_loss": 16.49034309387207,
199
+ "eval_nf_loss": 14.588922500610352,
200
+ "eval_perplexity": 7.155718803405762,
201
+ "eval_runtime": 909.9257,
202
+ "eval_samples_per_second": 14.691,
203
+ "eval_steps_per_second": 1.836,
204
+ "step": 28000
205
+ },
206
+ {
207
+ "epoch": 0.42,
208
+ "gate_score": 0.2556,
209
+ "learning_rate": 0.0,
210
+ "loss": 17.6585,
211
+ "nf_loss": 16.365,
212
+ "ppl": 4.0564,
213
+ "step": 30000
214
+ },
215
+ {
216
+ "epoch": 0.45,
217
+ "gate_score": 0.2421,
218
+ "learning_rate": 0.0,
219
+ "loss": 16.8685,
220
+ "nf_loss": 15.5951,
221
+ "ppl": 3.9944,
222
+ "step": 32000
223
+ },
224
+ {
225
+ "epoch": 0.45,
226
+ "eval_loss": 15.092597961425781,
227
+ "eval_nf_loss": 13.178274154663086,
228
+ "eval_perplexity": 7.254034996032715,
229
+ "eval_runtime": 911.9245,
230
+ "eval_samples_per_second": 14.659,
231
+ "eval_steps_per_second": 1.832,
232
+ "step": 32000
233
+ },
234
+ {
235
+ "epoch": 0.47,
236
+ "gate_score": 0.2294,
237
+ "learning_rate": 0.0,
238
+ "loss": 15.8886,
239
+ "nf_loss": 14.6156,
240
+ "ppl": 3.9955,
241
+ "step": 34000
242
+ },
243
+ {
244
+ "epoch": 0.5,
245
+ "gate_score": 0.218,
246
+ "learning_rate": 0.0,
247
+ "loss": 14.9949,
248
+ "nf_loss": 13.7324,
249
+ "ppl": 3.9536,
250
+ "step": 36000
251
+ },
252
+ {
253
+ "epoch": 0.5,
254
+ "eval_loss": 13.548192977905273,
255
+ "eval_nf_loss": 11.632102012634277,
256
+ "eval_perplexity": 7.279270172119141,
257
+ "eval_runtime": 927.089,
258
+ "eval_samples_per_second": 14.419,
259
+ "eval_steps_per_second": 1.802,
260
+ "step": 36000
261
+ },
262
+ {
263
+ "epoch": 0.53,
264
+ "gate_score": 0.2095,
265
+ "learning_rate": 0.0,
266
+ "loss": 14.5153,
267
+ "nf_loss": 13.2641,
268
+ "ppl": 3.8886,
269
+ "step": 38000
270
+ },
271
+ {
272
+ "epoch": 0.56,
273
+ "gate_score": 0.2065,
274
+ "learning_rate": 0.0,
275
+ "loss": 13.8227,
276
+ "nf_loss": 12.5901,
277
+ "ppl": 3.8375,
278
+ "step": 40000
279
+ },
280
+ {
281
+ "epoch": 0.56,
282
+ "eval_loss": 12.412004470825195,
283
+ "eval_nf_loss": 10.545845985412598,
284
+ "eval_perplexity": 6.89225435256958,
285
+ "eval_runtime": 905.0462,
286
+ "eval_samples_per_second": 14.771,
287
+ "eval_steps_per_second": 1.846,
288
+ "step": 40000
289
+ },
290
+ {
291
+ "epoch": 0.59,
292
+ "gate_score": 0.2009,
293
+ "learning_rate": 0.0,
294
+ "loss": 13.3939,
295
+ "nf_loss": 12.1663,
296
+ "ppl": 3.7926,
297
+ "step": 42000
298
+ },
299
+ {
300
+ "epoch": 0.61,
301
+ "gate_score": 0.1935,
302
+ "learning_rate": 0.0,
303
+ "loss": 13.0096,
304
+ "nf_loss": 11.7892,
305
+ "ppl": 3.758,
306
+ "step": 44000
307
+ },
308
+ {
309
+ "epoch": 0.61,
310
+ "eval_loss": 11.815178871154785,
311
+ "eval_nf_loss": 9.942044258117676,
312
+ "eval_perplexity": 6.960797309875488,
313
+ "eval_runtime": 909.5617,
314
+ "eval_samples_per_second": 14.697,
315
+ "eval_steps_per_second": 1.837,
316
+ "step": 44000
317
+ },
318
+ {
319
+ "epoch": 0.64,
320
+ "gate_score": 0.1903,
321
+ "learning_rate": 0.0,
322
+ "loss": 12.4269,
323
+ "nf_loss": 11.2151,
324
+ "ppl": 3.7435,
325
+ "step": 46000
326
+ },
327
+ {
328
+ "epoch": 0.67,
329
+ "gate_score": 0.1854,
330
+ "learning_rate": 0.0,
331
+ "loss": 11.9829,
332
+ "nf_loss": 10.7658,
333
+ "ppl": 3.7626,
334
+ "step": 48000
335
+ },
336
+ {
337
+ "epoch": 0.67,
338
+ "eval_loss": 10.730130195617676,
339
+ "eval_nf_loss": 8.879775047302246,
340
+ "eval_perplexity": 6.794580459594727,
341
+ "eval_runtime": 900.5692,
342
+ "eval_samples_per_second": 14.844,
343
+ "eval_steps_per_second": 1.855,
344
+ "step": 48000
345
+ },
346
+ {
347
+ "epoch": 0.7,
348
+ "gate_score": 0.1842,
349
+ "learning_rate": 0.0,
350
+ "loss": 11.6604,
351
+ "nf_loss": 10.459,
352
+ "ppl": 3.6987,
353
+ "step": 50000
354
+ },
355
+ {
356
+ "epoch": 0.72,
357
+ "gate_score": 0.1778,
358
+ "learning_rate": 0.0,
359
+ "loss": 11.3837,
360
+ "nf_loss": 10.1987,
361
+ "ppl": 3.6203,
362
+ "step": 52000
363
+ },
364
+ {
365
+ "epoch": 0.72,
366
+ "eval_loss": 10.283514976501465,
367
+ "eval_nf_loss": 8.462200164794922,
368
+ "eval_perplexity": 6.589704513549805,
369
+ "eval_runtime": 897.7218,
370
+ "eval_samples_per_second": 14.891,
371
+ "eval_steps_per_second": 1.861,
372
+ "step": 52000
373
+ },
374
+ {
375
+ "epoch": 0.75,
376
+ "gate_score": 0.1743,
377
+ "learning_rate": 0.0,
378
+ "loss": 11.1345,
379
+ "nf_loss": 9.9505,
380
+ "ppl": 3.6226,
381
+ "step": 54000
382
+ },
383
+ {
384
+ "epoch": 0.78,
385
+ "gate_score": 0.1681,
386
+ "learning_rate": 0.0,
387
+ "loss": 10.913,
388
+ "nf_loss": 9.7401,
389
+ "ppl": 3.5839,
390
+ "step": 56000
391
+ },
392
+ {
393
+ "epoch": 0.78,
394
+ "eval_loss": 9.798712730407715,
395
+ "eval_nf_loss": 7.988397598266602,
396
+ "eval_perplexity": 6.516669750213623,
397
+ "eval_runtime": 883.3578,
398
+ "eval_samples_per_second": 15.133,
399
+ "eval_steps_per_second": 1.892,
400
+ "step": 56000
401
+ },
402
+ {
403
+ "epoch": 0.81,
404
+ "gate_score": 0.1714,
405
+ "learning_rate": 0.0,
406
+ "loss": 10.7764,
407
+ "nf_loss": 9.605,
408
+ "ppl": 3.5672,
409
+ "step": 58000
410
+ },
411
+ {
412
+ "epoch": 0.84,
413
+ "gate_score": 0.1655,
414
+ "learning_rate": 0.0,
415
+ "loss": 10.5627,
416
+ "nf_loss": 9.4126,
417
+ "ppl": 3.5056,
418
+ "step": 60000
419
+ },
420
+ {
421
+ "epoch": 0.84,
422
+ "eval_loss": 9.538662910461426,
423
+ "eval_nf_loss": 7.738859176635742,
424
+ "eval_perplexity": 6.450651168823242,
425
+ "eval_runtime": 876.1125,
426
+ "eval_samples_per_second": 15.258,
427
+ "eval_steps_per_second": 1.907,
428
+ "step": 60000
429
+ },
430
+ {
431
+ "epoch": 0.86,
432
+ "gate_score": 0.1661,
433
+ "learning_rate": 0.0,
434
+ "loss": 10.4422,
435
+ "nf_loss": 9.2887,
436
+ "ppl": 3.523,
437
+ "step": 62000
438
+ },
439
+ {
440
+ "epoch": 0.89,
441
+ "gate_score": 0.1643,
442
+ "learning_rate": 0.0,
443
+ "loss": 10.238,
444
+ "nf_loss": 9.092,
445
+ "ppl": 3.4769,
446
+ "step": 64000
447
+ },
448
+ {
449
+ "epoch": 0.89,
450
+ "eval_loss": 9.2942533493042,
451
+ "eval_nf_loss": 7.510049819946289,
452
+ "eval_perplexity": 6.345398426055908,
453
+ "eval_runtime": 873.6279,
454
+ "eval_samples_per_second": 15.302,
455
+ "eval_steps_per_second": 1.913,
456
+ "step": 64000
457
+ },
458
+ {
459
+ "epoch": 0.92,
460
+ "gate_score": 0.1625,
461
+ "learning_rate": 0.0,
462
+ "loss": 10.1516,
463
+ "nf_loss": 9.0024,
464
+ "ppl": 3.4898,
465
+ "step": 66000
466
+ },
467
+ {
468
+ "epoch": 0.95,
469
+ "gate_score": 0.1609,
470
+ "learning_rate": 0.0,
471
+ "loss": 9.9819,
472
+ "nf_loss": 8.8616,
473
+ "ppl": 3.3834,
474
+ "step": 68000
475
+ },
476
+ {
477
+ "epoch": 0.95,
478
+ "eval_loss": 9.13103199005127,
479
+ "eval_nf_loss": 7.360151290893555,
480
+ "eval_perplexity": 6.2570929527282715,
481
+ "eval_runtime": 892.7167,
482
+ "eval_samples_per_second": 14.975,
483
+ "eval_steps_per_second": 1.872,
484
+ "step": 68000
485
+ },
486
+ {
487
+ "epoch": 0.98,
488
+ "gate_score": 0.1605,
489
+ "learning_rate": 0.0,
490
+ "loss": 9.9759,
491
+ "nf_loss": 8.8506,
492
+ "ppl": 3.3913,
493
+ "step": 70000
494
+ },
495
+ {
496
+ "epoch": 1.0,
497
+ "gate_score": 0.161,
498
+ "learning_rate": 5.5250000000000005e-06,
499
+ "loss": 9.9267,
500
+ "nf_loss": 8.805,
501
+ "ppl": 3.3955,
502
+ "step": 72000
503
+ },
504
+ {
505
+ "epoch": 1.0,
506
+ "eval_loss": 9.130549430847168,
507
+ "eval_nf_loss": 7.364365577697754,
508
+ "eval_perplexity": 6.226567268371582,
509
+ "eval_runtime": 865.1706,
510
+ "eval_samples_per_second": 15.451,
511
+ "eval_steps_per_second": 1.931,
512
+ "step": 72000
513
+ },
514
+ {
515
+ "epoch": 1.03,
516
+ "gate_score": 0.1525,
517
+ "learning_rate": 4.9922293335593894e-05,
518
+ "loss": 10.6431,
519
+ "nf_loss": 9.4961,
520
+ "ppl": 3.4863,
521
+ "step": 74000
522
+ },
523
+ {
524
+ "epoch": 1.06,
525
+ "gate_score": 0.1281,
526
+ "learning_rate": 4.921622232583111e-05,
527
+ "loss": 11.018,
528
+ "nf_loss": 9.7948,
529
+ "ppl": 3.8142,
530
+ "step": 76000
531
+ },
532
+ {
533
+ "epoch": 1.06,
534
+ "eval_loss": 10.00072193145752,
535
+ "eval_nf_loss": 8.153450965881348,
536
+ "eval_perplexity": 6.768190383911133,
537
+ "eval_runtime": 618.9198,
538
+ "eval_samples_per_second": 21.599,
539
+ "eval_steps_per_second": 2.7,
540
+ "step": 76000
541
+ },
542
+ {
543
+ "epoch": 1.09,
544
+ "gate_score": 0.1074,
545
+ "learning_rate": 4.850979810395739e-05,
546
+ "loss": 10.5531,
547
+ "nf_loss": 9.3486,
548
+ "ppl": 3.7186,
549
+ "step": 78000
550
+ },
551
+ {
552
+ "epoch": 1.11,
553
+ "gate_score": 0.0924,
554
+ "learning_rate": 4.780408030630554e-05,
555
+ "loss": 10.0575,
556
+ "nf_loss": 8.8613,
557
+ "ppl": 3.7072,
558
+ "step": 80000
559
+ },
560
+ {
561
+ "epoch": 1.11,
562
+ "eval_loss": 9.039822578430176,
563
+ "eval_nf_loss": 7.206754684448242,
564
+ "eval_perplexity": 6.669988632202148,
565
+ "eval_runtime": 639.9491,
566
+ "eval_samples_per_second": 20.889,
567
+ "eval_steps_per_second": 2.611,
568
+ "step": 80000
569
+ },
570
+ {
571
+ "epoch": 1.14,
572
+ "gate_score": 0.0874,
573
+ "learning_rate": 4.7097656084431825e-05,
574
+ "loss": 9.7084,
575
+ "nf_loss": 8.5025,
576
+ "ppl": 3.7371,
577
+ "step": 82000
578
+ },
579
+ {
580
+ "epoch": 1.17,
581
+ "gate_score": 0.0759,
582
+ "learning_rate": 4.639158507466904e-05,
583
+ "loss": 9.3613,
584
+ "nf_loss": 8.1756,
585
+ "ppl": 3.6444,
586
+ "step": 84000
587
+ },
588
+ {
589
+ "epoch": 1.17,
590
+ "eval_loss": 8.67480182647705,
591
+ "eval_nf_loss": 6.853387832641602,
592
+ "eval_perplexity": 6.5951128005981445,
593
+ "eval_runtime": 655.8198,
594
+ "eval_samples_per_second": 20.384,
595
+ "eval_steps_per_second": 2.548,
596
+ "step": 84000
597
+ },
598
+ {
599
+ "epoch": 1.2,
600
+ "gate_score": 0.0725,
601
+ "learning_rate": 4.5685160852795324e-05,
602
+ "loss": 8.9712,
603
+ "nf_loss": 7.7979,
604
+ "ppl": 3.5998,
605
+ "step": 86000
606
+ },
607
+ {
608
+ "epoch": 1.23,
609
+ "gate_score": 0.0666,
610
+ "learning_rate": 4.497908984303254e-05,
611
+ "loss": 8.7453,
612
+ "nf_loss": 7.5777,
613
+ "ppl": 3.5764,
614
+ "step": 88000
615
+ },
616
+ {
617
+ "epoch": 1.23,
618
+ "eval_loss": 7.947096347808838,
619
+ "eval_nf_loss": 6.137327671051025,
620
+ "eval_perplexity": 6.509429454803467,
621
+ "eval_runtime": 654.1896,
622
+ "eval_samples_per_second": 20.434,
623
+ "eval_steps_per_second": 2.554,
624
+ "step": 88000
625
+ },
626
+ {
627
+ "epoch": 1.25,
628
+ "gate_score": 0.0657,
629
+ "learning_rate": 4.4273018833269757e-05,
630
+ "loss": 8.4051,
631
+ "nf_loss": 7.2351,
632
+ "ppl": 3.5928,
633
+ "step": 90000
634
+ },
635
+ {
636
+ "epoch": 1.28,
637
+ "gate_score": 0.0632,
638
+ "learning_rate": 4.356694782350697e-05,
639
+ "loss": 8.2083,
640
+ "nf_loss": 7.0471,
641
+ "ppl": 3.5522,
642
+ "step": 92000
643
+ },
644
+ {
645
+ "epoch": 1.28,
646
+ "eval_loss": 7.776934623718262,
647
+ "eval_nf_loss": 5.991891384124756,
648
+ "eval_perplexity": 6.346434593200684,
649
+ "eval_runtime": 664.9292,
650
+ "eval_samples_per_second": 20.104,
651
+ "eval_steps_per_second": 2.513,
652
+ "step": 92000
653
+ },
654
+ {
655
+ "epoch": 1.31,
656
+ "gate_score": 0.0614,
657
+ "learning_rate": 4.2860523601633256e-05,
658
+ "loss": 7.8299,
659
+ "nf_loss": 6.6969,
660
+ "ppl": 3.4547,
661
+ "step": 94000
662
+ },
663
+ {
664
+ "epoch": 1.34,
665
+ "gate_score": 0.0596,
666
+ "learning_rate": 4.215445259187047e-05,
667
+ "loss": 7.6361,
668
+ "nf_loss": 6.4891,
669
+ "ppl": 3.5083,
670
+ "step": 96000
671
+ },
672
+ {
673
+ "epoch": 1.34,
674
+ "eval_loss": 7.583176612854004,
675
+ "eval_nf_loss": 5.804019927978516,
676
+ "eval_perplexity": 6.316826343536377,
677
+ "eval_runtime": 649.9826,
678
+ "eval_samples_per_second": 20.567,
679
+ "eval_steps_per_second": 2.571,
680
+ "step": 96000
681
+ },
682
+ {
683
+ "epoch": 1.37,
684
+ "gate_score": 0.0567,
685
+ "learning_rate": 4.144838158210769e-05,
686
+ "loss": 7.4853,
687
+ "nf_loss": 6.36,
688
+ "ppl": 3.4233,
689
+ "step": 98000
690
+ },
691
+ {
692
+ "epoch": 1.39,
693
+ "gate_score": 0.0557,
694
+ "learning_rate": 4.074195736023397e-05,
695
+ "loss": 7.2881,
696
+ "nf_loss": 6.1511,
697
+ "ppl": 3.4723,
698
+ "step": 100000
699
+ },
700
+ {
701
+ "epoch": 1.39,
702
+ "eval_loss": 7.400701999664307,
703
+ "eval_nf_loss": 5.625098705291748,
704
+ "eval_perplexity": 6.289206504821777,
705
+ "eval_runtime": 667.0031,
706
+ "eval_samples_per_second": 20.042,
707
+ "eval_steps_per_second": 2.505,
708
+ "step": 100000
709
+ },
710
+ {
711
+ "epoch": 1.42,
712
+ "gate_score": 0.054,
713
+ "learning_rate": 4.003588635047119e-05,
714
+ "loss": 7.165,
715
+ "nf_loss": 6.0472,
716
+ "ppl": 3.3942,
717
+ "step": 102000
718
+ },
719
+ {
720
+ "epoch": 1.45,
721
+ "gate_score": 0.0532,
722
+ "learning_rate": 3.9329815340708403e-05,
723
+ "loss": 6.9929,
724
+ "nf_loss": 5.864,
725
+ "ppl": 3.45,
726
+ "step": 104000
727
+ },
728
+ {
729
+ "epoch": 1.45,
730
+ "eval_loss": 6.6446003913879395,
731
+ "eval_nf_loss": 4.897329330444336,
732
+ "eval_perplexity": 6.099964141845703,
733
+ "eval_runtime": 665.1582,
734
+ "eval_samples_per_second": 20.097,
735
+ "eval_steps_per_second": 2.512,
736
+ "step": 104000
737
+ },
738
+ {
739
+ "epoch": 1.48,
740
+ "gate_score": 0.0508,
741
+ "learning_rate": 3.8623391118834686e-05,
742
+ "loss": 6.8638,
743
+ "nf_loss": 5.7412,
744
+ "ppl": 3.4016,
745
+ "step": 106000
746
+ },
747
+ {
748
+ "epoch": 1.5,
749
+ "gate_score": 0.048,
750
+ "learning_rate": 3.79173201090719e-05,
751
+ "loss": 6.7439,
752
+ "nf_loss": 5.6285,
753
+ "ppl": 3.3851,
754
+ "step": 108000
755
+ },
756
+ {
757
+ "epoch": 1.5,
758
+ "eval_loss": 6.440381050109863,
759
+ "eval_nf_loss": 4.687047004699707,
760
+ "eval_perplexity": 6.147556304931641,
761
+ "eval_runtime": 671.2658,
762
+ "eval_samples_per_second": 19.915,
763
+ "eval_steps_per_second": 2.489,
764
+ "step": 108000
765
+ },
766
+ {
767
+ "epoch": 1.53,
768
+ "gate_score": 0.0489,
769
+ "learning_rate": 3.721124909930912e-05,
770
+ "loss": 6.6258,
771
+ "nf_loss": 5.5227,
772
+ "ppl": 3.34,
773
+ "step": 110000
774
+ },
775
+ {
776
+ "epoch": 1.56,
777
+ "gate_score": 0.0476,
778
+ "learning_rate": 3.65048248774354e-05,
779
+ "loss": 6.5119,
780
+ "nf_loss": 5.4155,
781
+ "ppl": 3.3128,
782
+ "step": 112000
783
+ },
784
+ {
785
+ "epoch": 1.56,
786
+ "eval_loss": 6.296238422393799,
787
+ "eval_nf_loss": 4.570157527923584,
788
+ "eval_perplexity": 5.967574119567871,
789
+ "eval_runtime": 658.6675,
790
+ "eval_samples_per_second": 20.296,
791
+ "eval_steps_per_second": 2.537,
792
+ "step": 112000
793
+ },
794
+ {
795
+ "epoch": 1.59,
796
+ "gate_score": 0.0482,
797
+ "learning_rate": 3.579910707978356e-05,
798
+ "loss": 6.3285,
799
+ "nf_loss": 5.2244,
800
+ "ppl": 3.3628,
801
+ "step": 114000
802
+ },
803
+ {
804
+ "epoch": 1.62,
805
+ "gate_score": 0.0482,
806
+ "learning_rate": 3.5092682857909834e-05,
807
+ "loss": 6.2206,
808
+ "nf_loss": 5.1234,
809
+ "ppl": 3.3327,
810
+ "step": 116000
811
+ },
812
+ {
813
+ "epoch": 1.62,
814
+ "eval_loss": 6.046633720397949,
815
+ "eval_nf_loss": 4.321578025817871,
816
+ "eval_perplexity": 5.9654130935668945,
817
+ "eval_runtime": 653.7138,
818
+ "eval_samples_per_second": 20.449,
819
+ "eval_steps_per_second": 2.556,
820
+ "step": 116000
821
+ },
822
+ {
823
+ "epoch": 1.64,
824
+ "gate_score": 0.0489,
825
+ "learning_rate": 3.438625863603611e-05,
826
+ "loss": 6.0978,
827
+ "nf_loss": 5.0037,
828
+ "ppl": 3.3053,
829
+ "step": 118000
830
+ },
831
+ {
832
+ "epoch": 1.67,
833
+ "gate_score": 0.0468,
834
+ "learning_rate": 3.3680540838384266e-05,
835
+ "loss": 6.0198,
836
+ "nf_loss": 4.9354,
837
+ "ppl": 3.2737,
838
+ "step": 120000
839
+ },
840
+ {
841
+ "epoch": 1.67,
842
+ "eval_loss": 5.779796123504639,
843
+ "eval_nf_loss": 4.067526817321777,
844
+ "eval_perplexity": 5.880522727966309,
845
+ "eval_runtime": 641.0411,
846
+ "eval_samples_per_second": 20.854,
847
+ "eval_steps_per_second": 2.607,
848
+ "step": 120000
849
+ },
850
+ {
851
+ "epoch": 1.7,
852
+ "gate_score": 0.0425,
853
+ "learning_rate": 3.297446982862148e-05,
854
+ "loss": 5.907,
855
+ "nf_loss": 4.8253,
856
+ "ppl": 3.2734,
857
+ "step": 122000
858
+ },
859
+ {
860
+ "epoch": 1.73,
861
+ "gate_score": 0.0432,
862
+ "learning_rate": 3.2268045606747765e-05,
863
+ "loss": 5.7937,
864
+ "nf_loss": 4.7167,
865
+ "ppl": 3.2463,
866
+ "step": 124000
867
+ },
868
+ {
869
+ "epoch": 1.73,
870
+ "eval_loss": 5.693485260009766,
871
+ "eval_nf_loss": 3.997667074203491,
872
+ "eval_perplexity": 5.778055667877197,
873
+ "eval_runtime": 629.6888,
874
+ "eval_samples_per_second": 21.23,
875
+ "eval_steps_per_second": 2.654,
876
+ "step": 124000
877
+ },
878
+ {
879
+ "epoch": 1.76,
880
+ "gate_score": 0.0413,
881
+ "learning_rate": 3.156197459698498e-05,
882
+ "loss": 5.6708,
883
+ "nf_loss": 4.5839,
884
+ "ppl": 3.2695,
885
+ "step": 126000
886
+ },
887
+ {
888
+ "epoch": 1.78,
889
+ "gate_score": 0.0423,
890
+ "learning_rate": 3.08559035872222e-05,
891
+ "loss": 5.5759,
892
+ "nf_loss": 4.5071,
893
+ "ppl": 3.2127,
894
+ "step": 128000
895
+ },
896
+ {
897
+ "epoch": 1.78,
898
+ "eval_loss": 5.489774227142334,
899
+ "eval_nf_loss": 3.8035385608673096,
900
+ "eval_perplexity": 5.725496292114258,
901
+ "eval_runtime": 629.3114,
902
+ "eval_samples_per_second": 21.242,
903
+ "eval_steps_per_second": 2.655,
904
+ "step": 128000
905
+ },
906
+ {
907
+ "epoch": 1.81,
908
+ "gate_score": 0.0415,
909
+ "learning_rate": 3.0149832577459418e-05,
910
+ "loss": 5.4762,
911
+ "nf_loss": 4.4182,
912
+ "ppl": 3.189,
913
+ "step": 130000
914
+ },
915
+ {
916
+ "epoch": 1.84,
917
+ "gate_score": 0.043,
918
+ "learning_rate": 2.9443761567696637e-05,
919
+ "loss": 5.4304,
920
+ "nf_loss": 4.3627,
921
+ "ppl": 3.2132,
922
+ "step": 132000
923
+ },
924
+ {
925
+ "epoch": 1.84,
926
+ "eval_loss": 5.458424091339111,
927
+ "eval_nf_loss": 3.7697036266326904,
928
+ "eval_perplexity": 5.744439125061035,
929
+ "eval_runtime": 641.9111,
930
+ "eval_samples_per_second": 20.825,
931
+ "eval_steps_per_second": 2.603,
932
+ "step": 132000
933
+ },
934
+ {
935
+ "epoch": 1.87,
936
+ "gate_score": 0.0438,
937
+ "learning_rate": 2.8737337345822913e-05,
938
+ "loss": 5.3504,
939
+ "nf_loss": 4.2998,
940
+ "ppl": 3.1531,
941
+ "step": 134000
942
+ },
943
+ {
944
+ "epoch": 1.89,
945
+ "gate_score": 0.0429,
946
+ "learning_rate": 2.803126633606013e-05,
947
+ "loss": 5.2695,
948
+ "nf_loss": 4.2383,
949
+ "ppl": 3.0881,
950
+ "step": 136000
951
+ },
952
+ {
953
+ "epoch": 1.89,
954
+ "eval_loss": 5.256904125213623,
955
+ "eval_nf_loss": 3.5766844749450684,
956
+ "eval_perplexity": 5.6950812339782715,
957
+ "eval_runtime": 660.429,
958
+ "eval_samples_per_second": 20.241,
959
+ "eval_steps_per_second": 2.53,
960
+ "step": 136000
961
+ },
962
+ {
963
+ "epoch": 1.92,
964
+ "gate_score": 0.0421,
965
+ "learning_rate": 2.7324842114186412e-05,
966
+ "loss": 5.2219,
967
+ "nf_loss": 4.1712,
968
+ "ppl": 3.1497,
969
+ "step": 138000
970
+ },
971
+ {
972
+ "epoch": 1.95,
973
+ "gate_score": 0.0417,
974
+ "learning_rate": 2.661877110442363e-05,
975
+ "loss": 5.1817,
976
+ "nf_loss": 4.1256,
977
+ "ppl": 3.1777,
978
+ "step": 140000
979
+ },
980
+ {
981
+ "epoch": 1.95,
982
+ "eval_loss": 5.158201217651367,
983
+ "eval_nf_loss": 3.487734794616699,
984
+ "eval_perplexity": 5.636016845703125,
985
+ "eval_runtime": 644.0563,
986
+ "eval_samples_per_second": 20.756,
987
+ "eval_steps_per_second": 2.594,
988
+ "step": 140000
989
+ },
990
+ {
991
+ "epoch": 1.98,
992
+ "gate_score": 0.0422,
993
+ "learning_rate": 2.5913053306771785e-05,
994
+ "loss": 5.1108,
995
+ "nf_loss": 4.0728,
996
+ "ppl": 3.1163,
997
+ "step": 142000
998
+ },
999
+ {
1000
+ "epoch": 2.01,
1001
+ "gate_score": 0.0402,
1002
+ "learning_rate": 2.5206982297009e-05,
1003
+ "loss": 5.0219,
1004
+ "nf_loss": 4.0038,
1005
+ "ppl": 3.0436,
1006
+ "step": 144000
1007
+ },
1008
+ {
1009
+ "epoch": 2.01,
1010
+ "eval_loss": 5.062300205230713,
1011
+ "eval_nf_loss": 3.3892769813537598,
1012
+ "eval_perplexity": 5.655203819274902,
1013
+ "eval_runtime": 654.7044,
1014
+ "eval_samples_per_second": 20.418,
1015
+ "eval_steps_per_second": 2.552,
1016
+ "step": 144000
1017
+ },
1018
+ {
1019
+ "epoch": 2.03,
1020
+ "gate_score": 0.0407,
1021
+ "learning_rate": 2.4500558075135284e-05,
1022
+ "loss": 4.8922,
1023
+ "nf_loss": 3.9548,
1024
+ "ppl": 2.775,
1025
+ "step": 146000
1026
+ },
1027
+ {
1028
+ "epoch": 2.06,
1029
+ "gate_score": 0.0409,
1030
+ "learning_rate": 2.379413385326156e-05,
1031
+ "loss": 4.8179,
1032
+ "nf_loss": 3.8903,
1033
+ "ppl": 2.7374,
1034
+ "step": 148000
1035
+ },
1036
+ {
1037
+ "epoch": 2.06,
1038
+ "eval_loss": 4.891648769378662,
1039
+ "eval_nf_loss": 3.21927547454834,
1040
+ "eval_perplexity": 5.654395580291748,
1041
+ "eval_runtime": 649.5705,
1042
+ "eval_samples_per_second": 20.58,
1043
+ "eval_steps_per_second": 2.572,
1044
+ "step": 148000
1045
+ },
1046
+ {
1047
+ "epoch": 2.09,
1048
+ "gate_score": 0.0431,
1049
+ "learning_rate": 2.308806284349878e-05,
1050
+ "loss": 4.7494,
1051
+ "nf_loss": 3.8238,
1052
+ "ppl": 2.7455,
1053
+ "step": 150000
1054
+ },
1055
+ {
1056
+ "epoch": 2.12,
1057
+ "gate_score": 0.0424,
1058
+ "learning_rate": 2.2381991833735996e-05,
1059
+ "loss": 4.7155,
1060
+ "nf_loss": 3.7946,
1061
+ "ppl": 2.7302,
1062
+ "step": 152000
1063
+ },
1064
+ {
1065
+ "epoch": 2.12,
1066
+ "eval_loss": 4.806872844696045,
1067
+ "eval_nf_loss": 3.148994207382202,
1068
+ "eval_perplexity": 5.567600250244141,
1069
+ "eval_runtime": 630.6434,
1070
+ "eval_samples_per_second": 21.197,
1071
+ "eval_steps_per_second": 2.65,
1072
+ "step": 152000
1073
+ },
1074
+ {
1075
+ "epoch": 2.15,
1076
+ "gate_score": 0.0408,
1077
+ "learning_rate": 2.1675567611862275e-05,
1078
+ "loss": 4.6944,
1079
+ "nf_loss": 3.7643,
1080
+ "ppl": 2.7512,
1081
+ "step": 154000
1082
+ },
1083
+ {
1084
+ "epoch": 2.17,
1085
+ "gate_score": 0.0426,
1086
+ "learning_rate": 2.0969496602099495e-05,
1087
+ "loss": 4.6805,
1088
+ "nf_loss": 3.7557,
1089
+ "ppl": 2.7415,
1090
+ "step": 156000
1091
+ },
1092
+ {
1093
+ "epoch": 2.17,
1094
+ "eval_loss": 4.722836494445801,
1095
+ "eval_nf_loss": 3.066758632659912,
1096
+ "eval_perplexity": 5.557853698730469,
1097
+ "eval_runtime": 604.3797,
1098
+ "eval_samples_per_second": 22.119,
1099
+ "eval_steps_per_second": 2.765,
1100
+ "step": 156000
1101
+ },
1102
+ {
1103
+ "epoch": 2.2,
1104
+ "gate_score": 0.0438,
1105
+ "learning_rate": 2.026342559233671e-05,
1106
+ "loss": 4.5943,
1107
+ "nf_loss": 3.6721,
1108
+ "ppl": 2.722,
1109
+ "step": 158000
1110
+ },
1111
+ {
1112
+ "epoch": 2.23,
1113
+ "gate_score": 0.0447,
1114
+ "learning_rate": 1.9557354582573927e-05,
1115
+ "loss": 4.5781,
1116
+ "nf_loss": 3.6564,
1117
+ "ppl": 2.728,
1118
+ "step": 160000
1119
+ },
1120
+ {
1121
+ "epoch": 2.23,
1122
+ "eval_loss": 4.671295166015625,
1123
+ "eval_nf_loss": 3.0203938484191895,
1124
+ "eval_perplexity": 5.526300430297852,
1125
+ "eval_runtime": 604.9582,
1126
+ "eval_samples_per_second": 22.097,
1127
+ "eval_steps_per_second": 2.762,
1128
+ "step": 160000
1129
+ },
1130
+ {
1131
+ "epoch": 2.26,
1132
+ "gate_score": 0.0436,
1133
+ "learning_rate": 1.8851283572811147e-05,
1134
+ "loss": 4.5642,
1135
+ "nf_loss": 3.6404,
1136
+ "ppl": 2.7413,
1137
+ "step": 162000
1138
+ },
1139
+ {
1140
+ "epoch": 2.28,
1141
+ "gate_score": 0.0453,
1142
+ "learning_rate": 1.8144859350937426e-05,
1143
+ "loss": 4.4834,
1144
+ "nf_loss": 3.5645,
1145
+ "ppl": 2.7145,
1146
+ "step": 164000
1147
+ },
1148
+ {
1149
+ "epoch": 2.28,
1150
+ "eval_loss": 4.710845947265625,
1151
+ "eval_nf_loss": 3.063795566558838,
1152
+ "eval_perplexity": 5.505098342895508,
1153
+ "eval_runtime": 601.1865,
1154
+ "eval_samples_per_second": 22.236,
1155
+ "eval_steps_per_second": 2.78,
1156
+ "step": 164000
1157
+ },
1158
+ {
1159
+ "epoch": 2.31,
1160
+ "gate_score": 0.0442,
1161
+ "learning_rate": 1.7438788341174643e-05,
1162
+ "loss": 4.4998,
1163
+ "nf_loss": 3.5735,
1164
+ "ppl": 2.7361,
1165
+ "step": 166000
1166
+ },
1167
+ {
1168
+ "epoch": 2.34,
1169
+ "gate_score": 0.0442,
1170
+ "learning_rate": 1.6733070543522796e-05,
1171
+ "loss": 4.4172,
1172
+ "nf_loss": 3.4989,
1173
+ "ppl": 2.7118,
1174
+ "step": 168000
1175
+ },
1176
+ {
1177
+ "epoch": 2.34,
1178
+ "eval_loss": 4.5222930908203125,
1179
+ "eval_nf_loss": 2.8794257640838623,
1180
+ "eval_perplexity": 5.477485179901123,
1181
+ "eval_runtime": 609.6448,
1182
+ "eval_samples_per_second": 21.928,
1183
+ "eval_steps_per_second": 2.741,
1184
+ "step": 168000
1185
+ },
1186
+ {
1187
+ "epoch": 2.37,
1188
+ "gate_score": 0.0464,
1189
+ "learning_rate": 1.602664632164908e-05,
1190
+ "loss": 4.3817,
1191
+ "nf_loss": 3.4671,
1192
+ "ppl": 2.7031,
1193
+ "step": 170000
1194
+ },
1195
+ {
1196
+ "epoch": 2.4,
1197
+ "gate_score": 0.0447,
1198
+ "learning_rate": 1.5320575311886295e-05,
1199
+ "loss": 4.3776,
1200
+ "nf_loss": 3.4701,
1201
+ "ppl": 2.6834,
1202
+ "step": 172000
1203
+ },
1204
+ {
1205
+ "epoch": 2.4,
1206
+ "eval_loss": 4.446962356567383,
1207
+ "eval_nf_loss": 2.8145503997802734,
1208
+ "eval_perplexity": 5.419012546539307,
1209
+ "eval_runtime": 591.7923,
1210
+ "eval_samples_per_second": 22.589,
1211
+ "eval_steps_per_second": 2.824,
1212
+ "step": 172000
1213
+ },
1214
+ {
1215
+ "epoch": 2.42,
1216
+ "gate_score": 0.0438,
1217
+ "learning_rate": 1.4614151090012576e-05,
1218
+ "loss": 4.34,
1219
+ "nf_loss": 3.4303,
1220
+ "ppl": 2.6917,
1221
+ "step": 174000
1222
+ },
1223
+ {
1224
+ "epoch": 2.45,
1225
+ "gate_score": 0.0433,
1226
+ "learning_rate": 1.390808008024979e-05,
1227
+ "loss": 4.3096,
1228
+ "nf_loss": 3.4,
1229
+ "ppl": 2.6915,
1230
+ "step": 176000
1231
+ },
1232
+ {
1233
+ "epoch": 2.45,
1234
+ "eval_loss": 4.412790298461914,
1235
+ "eval_nf_loss": 2.7833149433135986,
1236
+ "eval_perplexity": 5.404773712158203,
1237
+ "eval_runtime": 593.2765,
1238
+ "eval_samples_per_second": 22.532,
1239
+ "eval_steps_per_second": 2.817,
1240
+ "step": 176000
1241
+ },
1242
+ {
1243
+ "epoch": 2.48,
1244
+ "gate_score": 0.0449,
1245
+ "learning_rate": 1.3201655858376072e-05,
1246
+ "loss": 4.289,
1247
+ "nf_loss": 3.3752,
1248
+ "ppl": 2.7063,
1249
+ "step": 178000
1250
+ },
1251
+ {
1252
+ "epoch": 2.51,
1253
+ "gate_score": 0.0431,
1254
+ "learning_rate": 1.2495938060724226e-05,
1255
+ "loss": 4.2453,
1256
+ "nf_loss": 3.3433,
1257
+ "ppl": 2.6679,
1258
+ "step": 180000
1259
+ },
1260
+ {
1261
+ "epoch": 2.51,
1262
+ "eval_loss": 4.363792896270752,
1263
+ "eval_nf_loss": 2.7436203956604004,
1264
+ "eval_perplexity": 5.350350856781006,
1265
+ "eval_runtime": 592.6812,
1266
+ "eval_samples_per_second": 22.555,
1267
+ "eval_steps_per_second": 2.819,
1268
+ "step": 180000
1269
+ },
1270
+ {
1271
+ "epoch": 2.54,
1272
+ "gate_score": 0.0444,
1273
+ "learning_rate": 1.1789867050961444e-05,
1274
+ "loss": 4.1812,
1275
+ "nf_loss": 3.2773,
1276
+ "ppl": 2.6769,
1277
+ "step": 182000
1278
+ },
1279
+ {
1280
+ "epoch": 2.56,
1281
+ "gate_score": 0.0439,
1282
+ "learning_rate": 1.1083442829087724e-05,
1283
+ "loss": 4.2063,
1284
+ "nf_loss": 3.3076,
1285
+ "ppl": 2.6592,
1286
+ "step": 184000
1287
+ },
1288
+ {
1289
+ "epoch": 2.56,
1290
+ "eval_loss": 4.401766777038574,
1291
+ "eval_nf_loss": 2.7845916748046875,
1292
+ "eval_perplexity": 5.334909915924072,
1293
+ "eval_runtime": 593.569,
1294
+ "eval_samples_per_second": 22.521,
1295
+ "eval_steps_per_second": 2.815,
1296
+ "step": 184000
1297
+ },
1298
+ {
1299
+ "epoch": 2.59,
1300
+ "gate_score": 0.0435,
1301
+ "learning_rate": 1.0377371819324942e-05,
1302
+ "loss": 4.1697,
1303
+ "nf_loss": 3.2627,
1304
+ "ppl": 2.6781,
1305
+ "step": 186000
1306
+ },
1307
+ {
1308
+ "epoch": 2.62,
1309
+ "gate_score": 0.0455,
1310
+ "learning_rate": 9.67130080956216e-06,
1311
+ "loss": 4.1413,
1312
+ "nf_loss": 3.2508,
1313
+ "ppl": 2.6383,
1314
+ "step": 188000
1315
+ },
1316
+ {
1317
+ "epoch": 2.62,
1318
+ "eval_loss": 4.287137031555176,
1319
+ "eval_nf_loss": 2.6753435134887695,
1320
+ "eval_perplexity": 5.3036627769470215,
1321
+ "eval_runtime": 592.6474,
1322
+ "eval_samples_per_second": 22.556,
1323
+ "eval_steps_per_second": 2.82,
1324
+ "step": 188000
1325
+ },
1326
+ {
1327
+ "epoch": 2.65,
1328
+ "gate_score": 0.0438,
1329
+ "learning_rate": 8.964876587688439e-06,
1330
+ "loss": 4.1244,
1331
+ "nf_loss": 3.2269,
1332
+ "ppl": 2.6537,
1333
+ "step": 190000
1334
+ },
1335
+ {
1336
+ "epoch": 2.67,
1337
+ "gate_score": 0.0446,
1338
+ "learning_rate": 8.258805577925655e-06,
1339
+ "loss": 4.0845,
1340
+ "nf_loss": 3.1966,
1341
+ "ppl": 2.6201,
1342
+ "step": 192000
1343
+ },
1344
+ {
1345
+ "epoch": 2.67,
1346
+ "eval_loss": 4.247885704040527,
1347
+ "eval_nf_loss": 2.640995502471924,
1348
+ "eval_perplexity": 5.275094985961914,
1349
+ "eval_runtime": 591.9491,
1350
+ "eval_samples_per_second": 22.583,
1351
+ "eval_steps_per_second": 2.823,
1352
+ "step": 192000
1353
+ },
1354
+ {
1355
+ "epoch": 2.7,
1356
+ "gate_score": 0.0445,
1357
+ "learning_rate": 7.552734568162874e-06,
1358
+ "loss": 4.0604,
1359
+ "nf_loss": 3.1769,
1360
+ "ppl": 2.6091,
1361
+ "step": 194000
1362
+ },
1363
+ {
1364
+ "epoch": 2.73,
1365
+ "gate_score": 0.0433,
1366
+ "learning_rate": 6.846310346289154e-06,
1367
+ "loss": 4.06,
1368
+ "nf_loss": 3.1758,
1369
+ "ppl": 2.6114,
1370
+ "step": 196000
1371
+ },
1372
+ {
1373
+ "epoch": 2.73,
1374
+ "eval_loss": 4.171822547912598,
1375
+ "eval_nf_loss": 2.567200183868408,
1376
+ "eval_perplexity": 5.264888763427734,
1377
+ "eval_runtime": 589.4312,
1378
+ "eval_samples_per_second": 22.679,
1379
+ "eval_steps_per_second": 2.835,
1380
+ "step": 196000
1381
+ },
1382
+ {
1383
+ "epoch": 2.76,
1384
+ "gate_score": 0.0436,
1385
+ "learning_rate": 6.140239336526371e-06,
1386
+ "loss": 4.0426,
1387
+ "nf_loss": 3.1603,
1388
+ "ppl": 2.6136,
1389
+ "step": 198000
1390
+ },
1391
+ {
1392
+ "epoch": 2.79,
1393
+ "gate_score": 0.0444,
1394
+ "learning_rate": 5.4338151146526515e-06,
1395
+ "loss": 4.0352,
1396
+ "nf_loss": 3.1654,
1397
+ "ppl": 2.5709,
1398
+ "step": 200000
1399
+ },
1400
+ {
1401
+ "epoch": 2.79,
1402
+ "eval_loss": 4.141200542449951,
1403
+ "eval_nf_loss": 2.539963483810425,
1404
+ "eval_perplexity": 5.245651721954346,
1405
+ "eval_runtime": 622.9626,
1406
+ "eval_samples_per_second": 21.459,
1407
+ "eval_steps_per_second": 2.682,
1408
+ "step": 200000
1409
+ },
1410
+ {
1411
+ "epoch": 2.81,
1412
+ "gate_score": 0.0442,
1413
+ "learning_rate": 4.727744104889869e-06,
1414
+ "loss": 3.9924,
1415
+ "nf_loss": 3.1183,
1416
+ "ppl": 2.5982,
1417
+ "step": 202000
1418
+ },
1419
+ {
1420
+ "epoch": 2.84,
1421
+ "gate_score": 0.0437,
1422
+ "learning_rate": 4.021673095127087e-06,
1423
+ "loss": 3.982,
1424
+ "nf_loss": 3.0969,
1425
+ "ppl": 2.6167,
1426
+ "step": 204000
1427
+ },
1428
+ {
1429
+ "epoch": 2.84,
1430
+ "eval_loss": 4.099726676940918,
1431
+ "eval_nf_loss": 2.5044403076171875,
1432
+ "eval_perplexity": 5.212295055389404,
1433
+ "eval_runtime": 628.0641,
1434
+ "eval_samples_per_second": 21.284,
1435
+ "eval_steps_per_second": 2.661,
1436
+ "step": 204000
1437
+ },
1438
+ {
1439
+ "epoch": 2.87,
1440
+ "gate_score": 0.044,
1441
+ "learning_rate": 3.315248873253366e-06,
1442
+ "loss": 3.9519,
1443
+ "nf_loss": 3.0778,
1444
+ "ppl": 2.5849,
1445
+ "step": 206000
1446
+ },
1447
+ {
1448
+ "epoch": 2.9,
1449
+ "gate_score": 0.0427,
1450
+ "learning_rate": 2.6091778634905835e-06,
1451
+ "loss": 3.9487,
1452
+ "nf_loss": 3.082,
1453
+ "ppl": 2.5664,
1454
+ "step": 208000
1455
+ },
1456
+ {
1457
+ "epoch": 2.9,
1458
+ "eval_loss": 4.066439151763916,
1459
+ "eval_nf_loss": 2.474716901779175,
1460
+ "eval_perplexity": 5.1935272216796875,
1461
+ "eval_runtime": 625.0657,
1462
+ "eval_samples_per_second": 21.387,
1463
+ "eval_steps_per_second": 2.673,
1464
+ "step": 208000
1465
+ },
1466
+ {
1467
+ "epoch": 2.93,
1468
+ "gate_score": 0.0425,
1469
+ "learning_rate": 1.9031068537278006e-06,
1470
+ "loss": 3.9389,
1471
+ "nf_loss": 3.0634,
1472
+ "ppl": 2.5914,
1473
+ "step": 210000
1474
+ },
1475
+ {
1476
+ "epoch": 2.95,
1477
+ "gate_score": 0.0424,
1478
+ "learning_rate": 1.196682631854081e-06,
1479
+ "loss": 3.9603,
1480
+ "nf_loss": 3.0926,
1481
+ "ppl": 2.5703,
1482
+ "step": 212000
1483
+ },
1484
+ {
1485
+ "epoch": 2.95,
1486
+ "eval_loss": 4.057208061218262,
1487
+ "eval_nf_loss": 2.4683539867401123,
1488
+ "eval_perplexity": 5.177720546722412,
1489
+ "eval_runtime": 618.2401,
1490
+ "eval_samples_per_second": 21.623,
1491
+ "eval_steps_per_second": 2.703,
1492
+ "step": 212000
1493
+ }
1494
+ ],
1495
+ "max_steps": 215337,
1496
+ "num_train_epochs": 3,
1497
+ "total_flos": 2.1693560999113897e+18,
1498
+ "trial_name": null,
1499
+ "trial_params": null
1500
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff