pszemraj commited on
Commit
fe5a0f5
1 Parent(s): e212533

Model save

Browse files
README.md ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ base_model: roberta-large-mnli
4
+ tags:
5
+ - generated_from_trainer
6
+ metrics:
7
+ - f1
8
+ model-index:
9
+ - name: roberta-large-mnli-goodreads-bookgenres-Book_cls-5e
10
+ results: []
11
+ ---
12
+
13
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
14
+ should probably proofread and complete it, then remove this comment. -->
15
+
16
+ # roberta-large-mnli-goodreads-bookgenres-Book_cls-5e
17
+
18
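+ This model is a fine-tuned version of [roberta-large-mnli](https://huggingface.co/roberta-large-mnli) on a dataset the Trainer did not record; the model name suggests a Goodreads book-genres dataset (to be confirmed by the author).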
19
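+ It achieves the following results on the evaluation set after the 5-epoch run described in this card (note: the `all_results.json` and `trainer_state.json` files saved alongside it record a separate 20-epoch run with eval loss 0.3662 and F1 0.5832):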
20
+ - Loss: 0.2758
21
+ - F1: 0.5464
22
+
23
+ ## Model description
24
+
25
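+ A `RobertaForSequenceClassification` model (RoBERTa-large: 24 hidden layers, hidden size 1024) configured for multi-label classification over 18 book-genre labels such as "History & Politics", "Romance", and "Science Fiction & Fantasy"; the full label list is in `id2label` in `config.json`. More information needed.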
26
+
27
+ ## Intended uses & limitations
28
+
29
+ More information needed
30
+
31
+ ## Training and evaluation data
32
+
33
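+ According to `all_results.json`, the training split contains 7,914 samples and the evaluation split contains 989 samples. The dataset source and preprocessing are not recorded; more information needed.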
34
+
35
+ ## Training procedure
36
+
37
+ ### Training hyperparameters
38
+
39
+ The following hyperparameters were used during training:
40
+ - learning_rate: 6e-05
41
+ - train_batch_size: 32
42
+ - eval_batch_size: 32
43
+ - seed: 42
44
+ - gradient_accumulation_steps: 4
45
+ - total_train_batch_size: 128
46
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-10
47
+ - lr_scheduler_type: linear
48
+ - num_epochs: 5.0
49
+
50
+ ### Training results
51
+
52
+ | Training Loss | Epoch | Step | Validation Loss | F1 |
53
+ |:-------------:|:-----:|:----:|:---------------:|:------:|
54
+ | 0.3096 | 1.0 | 62 | 0.2862 | 0.3707 |
55
+ | 0.2863 | 2.0 | 124 | 0.2804 | 0.4422 |
56
+ | 0.2618 | 3.0 | 186 | 0.2773 | 0.4989 |
57
+ | 0.2432 | 4.0 | 248 | 0.2764 | 0.5223 |
58
+ | 0.2241 | 5.0 | 310 | 0.2758 | 0.5464 |
59
+
60
+
61
+ ### Framework versions
62
+
63
+ - Transformers 4.33.3
64
+ - Pytorch 2.2.0.dev20231001+cu121
65
+ - Datasets 2.14.5
66
+ - Tokenizers 0.13.3
all_results.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 20.0,
3
+ "eval_f1": 0.5832229580573952,
4
+ "eval_loss": 0.3661610186100006,
5
+ "eval_runtime": 2.8821,
6
+ "eval_samples": 989,
7
+ "eval_samples_per_second": 343.154,
8
+ "eval_steps_per_second": 10.756,
9
+ "train_loss": 0.14994276651451666,
10
+ "train_runtime": 1498.2493,
11
+ "train_samples": 7914,
12
+ "train_samples_per_second": 105.643,
13
+ "train_steps_per_second": 0.828
14
+ }
config.json ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "roberta-large-mnli",
3
+ "_num_labels": 3,
4
+ "architectures": [
5
+ "RobertaForSequenceClassification"
6
+ ],
7
+ "attention_probs_dropout_prob": 0.1,
8
+ "bos_token_id": 0,
9
+ "classifier_dropout": null,
10
+ "eos_token_id": 2,
11
+ "finetuning_task": "text-classification",
12
+ "hidden_act": "gelu",
13
+ "hidden_dropout_prob": 0.1,
14
+ "hidden_size": 1024,
15
+ "id2label": {
16
+ "0": "History & Politics",
17
+ "1": "Health & Medicine",
18
+ "2": "Mystery & Thriller",
19
+ "3": "Arts & Design",
20
+ "4": "Self-Help & Wellness",
21
+ "5": "Sports & Recreation",
22
+ "6": "Non-Fiction",
23
+ "7": "Science Fiction & Fantasy",
24
+ "8": "Countries & Geography",
25
+ "9": "Other",
26
+ "10": "Nature & Environment",
27
+ "11": "Business & Finance",
28
+ "12": "Romance",
29
+ "13": "Philosophy & Religion",
30
+ "14": "Literature & Fiction",
31
+ "15": "Science & Technology",
32
+ "16": "Children & Young Adult",
33
+ "17": "Food & Cooking"
34
+ },
35
+ "initializer_range": 0.02,
36
+ "intermediate_size": 4096,
37
+ "label2id": {
38
+ "Arts & Design": 3,
39
+ "Business & Finance": 11,
40
+ "Children & Young Adult": 16,
41
+ "Countries & Geography": 8,
42
+ "Food & Cooking": 17,
43
+ "Health & Medicine": 1,
44
+ "History & Politics": 0,
45
+ "Literature & Fiction": 14,
46
+ "Mystery & Thriller": 2,
47
+ "Nature & Environment": 10,
48
+ "Non-Fiction": 6,
49
+ "Other": 9,
50
+ "Philosophy & Religion": 13,
51
+ "Romance": 12,
52
+ "Science & Technology": 15,
53
+ "Science Fiction & Fantasy": 7,
54
+ "Self-Help & Wellness": 4,
55
+ "Sports & Recreation": 5
56
+ },
57
+ "layer_norm_eps": 1e-05,
58
+ "max_position_embeddings": 514,
59
+ "model_type": "roberta",
60
+ "num_attention_heads": 16,
61
+ "num_hidden_layers": 24,
62
+ "pad_token_id": 1,
63
+ "position_embedding_type": "absolute",
64
+ "problem_type": "multi_label_classification",
65
+ "torch_dtype": "float32",
66
+ "transformers_version": "4.33.3",
67
+ "type_vocab_size": 1,
68
+ "use_cache": true,
69
+ "vocab_size": 50265
70
+ }
eval_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 20.0,
3
+ "eval_f1": 0.5832229580573952,
4
+ "eval_loss": 0.3661610186100006,
5
+ "eval_runtime": 2.8821,
6
+ "eval_samples": 989,
7
+ "eval_samples_per_second": 343.154,
8
+ "eval_steps_per_second": 10.756
9
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8601937cac48a5151f1d4c48445cb8b152e8e9bc875f9ee17304ea18ff02913e
3
+ size 1421561016
special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "cls_token": "<s>",
4
+ "eos_token": "</s>",
5
+ "mask_token": {
6
+ "content": "<mask>",
7
+ "lstrip": true,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ },
12
+ "pad_token": "<pad>",
13
+ "sep_token": "</s>",
14
+ "unk_token": "<unk>"
15
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "bos_token": "<s>",
4
+ "clean_up_tokenization_spaces": true,
5
+ "cls_token": "<s>",
6
+ "eos_token": "</s>",
7
+ "errors": "replace",
8
+ "mask_token": "<mask>",
9
+ "model_max_length": 512,
10
+ "pad_token": "<pad>",
11
+ "sep_token": "</s>",
12
+ "tokenizer_class": "RobertaTokenizer",
13
+ "trim_offsets": true,
14
+ "unk_token": "<unk>"
15
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 20.0,
3
+ "train_loss": 0.14994276651451666,
4
+ "train_runtime": 1498.2493,
5
+ "train_samples": 7914,
6
+ "train_samples_per_second": 105.643,
7
+ "train_steps_per_second": 0.828
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,952 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 20.0,
5
+ "eval_steps": 500,
6
+ "global_step": 1240,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.16,
13
+ "learning_rate": 5.951612903225807e-05,
14
+ "loss": 0.4848,
15
+ "step": 10
16
+ },
17
+ {
18
+ "epoch": 0.32,
19
+ "learning_rate": 5.9032258064516134e-05,
20
+ "loss": 0.3412,
21
+ "step": 20
22
+ },
23
+ {
24
+ "epoch": 0.48,
25
+ "learning_rate": 5.854838709677419e-05,
26
+ "loss": 0.3195,
27
+ "step": 30
28
+ },
29
+ {
30
+ "epoch": 0.65,
31
+ "learning_rate": 5.806451612903226e-05,
32
+ "loss": 0.3107,
33
+ "step": 40
34
+ },
35
+ {
36
+ "epoch": 0.81,
37
+ "learning_rate": 5.7580645161290325e-05,
38
+ "loss": 0.3062,
39
+ "step": 50
40
+ },
41
+ {
42
+ "epoch": 0.97,
43
+ "learning_rate": 5.709677419354839e-05,
44
+ "loss": 0.3047,
45
+ "step": 60
46
+ },
47
+ {
48
+ "epoch": 1.0,
49
+ "eval_f1": 0.3600605143721634,
50
+ "eval_loss": 0.28778037428855896,
51
+ "eval_runtime": 2.9163,
52
+ "eval_samples_per_second": 339.124,
53
+ "eval_steps_per_second": 10.63,
54
+ "step": 62
55
+ },
56
+ {
57
+ "epoch": 1.13,
58
+ "learning_rate": 5.661290322580646e-05,
59
+ "loss": 0.2971,
60
+ "step": 70
61
+ },
62
+ {
63
+ "epoch": 1.29,
64
+ "learning_rate": 5.612903225806452e-05,
65
+ "loss": 0.2923,
66
+ "step": 80
67
+ },
68
+ {
69
+ "epoch": 1.45,
70
+ "learning_rate": 5.5645161290322576e-05,
71
+ "loss": 0.2913,
72
+ "step": 90
73
+ },
74
+ {
75
+ "epoch": 1.61,
76
+ "learning_rate": 5.516129032258064e-05,
77
+ "loss": 0.2868,
78
+ "step": 100
79
+ },
80
+ {
81
+ "epoch": 1.77,
82
+ "learning_rate": 5.467741935483871e-05,
83
+ "loss": 0.2876,
84
+ "step": 110
85
+ },
86
+ {
87
+ "epoch": 1.94,
88
+ "learning_rate": 5.4193548387096774e-05,
89
+ "loss": 0.2809,
90
+ "step": 120
91
+ },
92
+ {
93
+ "epoch": 2.0,
94
+ "eval_f1": 0.4606093878671425,
95
+ "eval_loss": 0.2768673002719879,
96
+ "eval_runtime": 2.9476,
97
+ "eval_samples_per_second": 335.524,
98
+ "eval_steps_per_second": 10.517,
99
+ "step": 124
100
+ },
101
+ {
102
+ "epoch": 2.1,
103
+ "learning_rate": 5.370967741935484e-05,
104
+ "loss": 0.2685,
105
+ "step": 130
106
+ },
107
+ {
108
+ "epoch": 2.26,
109
+ "learning_rate": 5.3225806451612906e-05,
110
+ "loss": 0.2658,
111
+ "step": 140
112
+ },
113
+ {
114
+ "epoch": 2.42,
115
+ "learning_rate": 5.2741935483870966e-05,
116
+ "loss": 0.2706,
117
+ "step": 150
118
+ },
119
+ {
120
+ "epoch": 2.58,
121
+ "learning_rate": 5.225806451612903e-05,
122
+ "loss": 0.2613,
123
+ "step": 160
124
+ },
125
+ {
126
+ "epoch": 2.74,
127
+ "learning_rate": 5.17741935483871e-05,
128
+ "loss": 0.2634,
129
+ "step": 170
130
+ },
131
+ {
132
+ "epoch": 2.9,
133
+ "learning_rate": 5.1290322580645164e-05,
134
+ "loss": 0.2584,
135
+ "step": 180
136
+ },
137
+ {
138
+ "epoch": 3.0,
139
+ "eval_f1": 0.48365306673871933,
140
+ "eval_loss": 0.2766420543193817,
141
+ "eval_runtime": 2.9111,
142
+ "eval_samples_per_second": 339.73,
143
+ "eval_steps_per_second": 10.649,
144
+ "step": 186
145
+ },
146
+ {
147
+ "epoch": 3.06,
148
+ "learning_rate": 5.080645161290323e-05,
149
+ "loss": 0.259,
150
+ "step": 190
151
+ },
152
+ {
153
+ "epoch": 3.23,
154
+ "learning_rate": 5.0322580645161296e-05,
155
+ "loss": 0.2427,
156
+ "step": 200
157
+ },
158
+ {
159
+ "epoch": 3.39,
160
+ "learning_rate": 4.9838709677419356e-05,
161
+ "loss": 0.2389,
162
+ "step": 210
163
+ },
164
+ {
165
+ "epoch": 3.55,
166
+ "learning_rate": 4.935483870967742e-05,
167
+ "loss": 0.2441,
168
+ "step": 220
169
+ },
170
+ {
171
+ "epoch": 3.71,
172
+ "learning_rate": 4.887096774193549e-05,
173
+ "loss": 0.2416,
174
+ "step": 230
175
+ },
176
+ {
177
+ "epoch": 3.87,
178
+ "learning_rate": 4.838709677419355e-05,
179
+ "loss": 0.2382,
180
+ "step": 240
181
+ },
182
+ {
183
+ "epoch": 4.0,
184
+ "eval_f1": 0.49434656849855374,
185
+ "eval_loss": 0.28233975172042847,
186
+ "eval_runtime": 2.9046,
187
+ "eval_samples_per_second": 340.492,
188
+ "eval_steps_per_second": 10.673,
189
+ "step": 248
190
+ },
191
+ {
192
+ "epoch": 4.03,
193
+ "learning_rate": 4.790322580645161e-05,
194
+ "loss": 0.2327,
195
+ "step": 250
196
+ },
197
+ {
198
+ "epoch": 4.19,
199
+ "learning_rate": 4.741935483870968e-05,
200
+ "loss": 0.2168,
201
+ "step": 260
202
+ },
203
+ {
204
+ "epoch": 4.35,
205
+ "learning_rate": 4.693548387096774e-05,
206
+ "loss": 0.2198,
207
+ "step": 270
208
+ },
209
+ {
210
+ "epoch": 4.52,
211
+ "learning_rate": 4.6451612903225805e-05,
212
+ "loss": 0.221,
213
+ "step": 280
214
+ },
215
+ {
216
+ "epoch": 4.68,
217
+ "learning_rate": 4.596774193548387e-05,
218
+ "loss": 0.2191,
219
+ "step": 290
220
+ },
221
+ {
222
+ "epoch": 4.84,
223
+ "learning_rate": 4.548387096774194e-05,
224
+ "loss": 0.2126,
225
+ "step": 300
226
+ },
227
+ {
228
+ "epoch": 5.0,
229
+ "learning_rate": 4.5e-05,
230
+ "loss": 0.2161,
231
+ "step": 310
232
+ },
233
+ {
234
+ "epoch": 5.0,
235
+ "eval_f1": 0.5236882322975038,
236
+ "eval_loss": 0.28145599365234375,
237
+ "eval_runtime": 2.9136,
238
+ "eval_samples_per_second": 339.438,
239
+ "eval_steps_per_second": 10.64,
240
+ "step": 310
241
+ },
242
+ {
243
+ "epoch": 5.16,
244
+ "learning_rate": 4.451612903225807e-05,
245
+ "loss": 0.1952,
246
+ "step": 320
247
+ },
248
+ {
249
+ "epoch": 5.32,
250
+ "learning_rate": 4.403225806451613e-05,
251
+ "loss": 0.1996,
252
+ "step": 330
253
+ },
254
+ {
255
+ "epoch": 5.48,
256
+ "learning_rate": 4.3548387096774194e-05,
257
+ "loss": 0.1921,
258
+ "step": 340
259
+ },
260
+ {
261
+ "epoch": 5.65,
262
+ "learning_rate": 4.306451612903226e-05,
263
+ "loss": 0.1966,
264
+ "step": 350
265
+ },
266
+ {
267
+ "epoch": 5.81,
268
+ "learning_rate": 4.2580645161290327e-05,
269
+ "loss": 0.2023,
270
+ "step": 360
271
+ },
272
+ {
273
+ "epoch": 5.97,
274
+ "learning_rate": 4.209677419354839e-05,
275
+ "loss": 0.1903,
276
+ "step": 370
277
+ },
278
+ {
279
+ "epoch": 6.0,
280
+ "eval_f1": 0.5535545023696682,
281
+ "eval_loss": 0.29538020491600037,
282
+ "eval_runtime": 2.9433,
283
+ "eval_samples_per_second": 336.013,
284
+ "eval_steps_per_second": 10.532,
285
+ "step": 372
286
+ },
287
+ {
288
+ "epoch": 6.13,
289
+ "learning_rate": 4.161290322580646e-05,
290
+ "loss": 0.1854,
291
+ "step": 380
292
+ },
293
+ {
294
+ "epoch": 6.29,
295
+ "learning_rate": 4.112903225806451e-05,
296
+ "loss": 0.1784,
297
+ "step": 390
298
+ },
299
+ {
300
+ "epoch": 6.45,
301
+ "learning_rate": 4.064516129032258e-05,
302
+ "loss": 0.1776,
303
+ "step": 400
304
+ },
305
+ {
306
+ "epoch": 6.61,
307
+ "learning_rate": 4.0161290322580643e-05,
308
+ "loss": 0.1746,
309
+ "step": 410
310
+ },
311
+ {
312
+ "epoch": 6.77,
313
+ "learning_rate": 3.967741935483871e-05,
314
+ "loss": 0.1762,
315
+ "step": 420
316
+ },
317
+ {
318
+ "epoch": 6.94,
319
+ "learning_rate": 3.9193548387096776e-05,
320
+ "loss": 0.172,
321
+ "step": 430
322
+ },
323
+ {
324
+ "epoch": 7.0,
325
+ "eval_f1": 0.5482866043613707,
326
+ "eval_loss": 0.29978421330451965,
327
+ "eval_runtime": 2.9231,
328
+ "eval_samples_per_second": 338.34,
329
+ "eval_steps_per_second": 10.605,
330
+ "step": 434
331
+ },
332
+ {
333
+ "epoch": 7.1,
334
+ "learning_rate": 3.870967741935484e-05,
335
+ "loss": 0.1638,
336
+ "step": 440
337
+ },
338
+ {
339
+ "epoch": 7.26,
340
+ "learning_rate": 3.82258064516129e-05,
341
+ "loss": 0.1628,
342
+ "step": 450
343
+ },
344
+ {
345
+ "epoch": 7.42,
346
+ "learning_rate": 3.774193548387097e-05,
347
+ "loss": 0.1559,
348
+ "step": 460
349
+ },
350
+ {
351
+ "epoch": 7.58,
352
+ "learning_rate": 3.725806451612903e-05,
353
+ "loss": 0.1574,
354
+ "step": 470
355
+ },
356
+ {
357
+ "epoch": 7.74,
358
+ "learning_rate": 3.67741935483871e-05,
359
+ "loss": 0.1614,
360
+ "step": 480
361
+ },
362
+ {
363
+ "epoch": 7.9,
364
+ "learning_rate": 3.6290322580645165e-05,
365
+ "loss": 0.1551,
366
+ "step": 490
367
+ },
368
+ {
369
+ "epoch": 8.0,
370
+ "eval_f1": 0.5663555656726611,
371
+ "eval_loss": 0.3146507441997528,
372
+ "eval_runtime": 2.9227,
373
+ "eval_samples_per_second": 338.382,
374
+ "eval_steps_per_second": 10.607,
375
+ "step": 496
376
+ },
377
+ {
378
+ "epoch": 8.06,
379
+ "learning_rate": 3.580645161290323e-05,
380
+ "loss": 0.1497,
381
+ "step": 500
382
+ },
383
+ {
384
+ "epoch": 8.23,
385
+ "learning_rate": 3.532258064516129e-05,
386
+ "loss": 0.146,
387
+ "step": 510
388
+ },
389
+ {
390
+ "epoch": 8.39,
391
+ "learning_rate": 3.483870967741936e-05,
392
+ "loss": 0.1426,
393
+ "step": 520
394
+ },
395
+ {
396
+ "epoch": 8.55,
397
+ "learning_rate": 3.435483870967742e-05,
398
+ "loss": 0.1494,
399
+ "step": 530
400
+ },
401
+ {
402
+ "epoch": 8.71,
403
+ "learning_rate": 3.387096774193549e-05,
404
+ "loss": 0.1405,
405
+ "step": 540
406
+ },
407
+ {
408
+ "epoch": 8.87,
409
+ "learning_rate": 3.338709677419355e-05,
410
+ "loss": 0.1419,
411
+ "step": 550
412
+ },
413
+ {
414
+ "epoch": 9.0,
415
+ "eval_f1": 0.5569386814200094,
416
+ "eval_loss": 0.3179270029067993,
417
+ "eval_runtime": 2.8924,
418
+ "eval_samples_per_second": 341.925,
419
+ "eval_steps_per_second": 10.718,
420
+ "step": 558
421
+ },
422
+ {
423
+ "epoch": 9.03,
424
+ "learning_rate": 3.2903225806451614e-05,
425
+ "loss": 0.1391,
426
+ "step": 560
427
+ },
428
+ {
429
+ "epoch": 9.19,
430
+ "learning_rate": 3.2419354838709674e-05,
431
+ "loss": 0.1295,
432
+ "step": 570
433
+ },
434
+ {
435
+ "epoch": 9.35,
436
+ "learning_rate": 3.193548387096774e-05,
437
+ "loss": 0.1269,
438
+ "step": 580
439
+ },
440
+ {
441
+ "epoch": 9.52,
442
+ "learning_rate": 3.1451612903225806e-05,
443
+ "loss": 0.1279,
444
+ "step": 590
445
+ },
446
+ {
447
+ "epoch": 9.68,
448
+ "learning_rate": 3.096774193548387e-05,
449
+ "loss": 0.1283,
450
+ "step": 600
451
+ },
452
+ {
453
+ "epoch": 9.84,
454
+ "learning_rate": 3.0483870967741935e-05,
455
+ "loss": 0.1266,
456
+ "step": 610
457
+ },
458
+ {
459
+ "epoch": 10.0,
460
+ "learning_rate": 3e-05,
461
+ "loss": 0.1291,
462
+ "step": 620
463
+ },
464
+ {
465
+ "epoch": 10.0,
466
+ "eval_f1": 0.5659516202647192,
467
+ "eval_loss": 0.32372668385505676,
468
+ "eval_runtime": 2.9143,
469
+ "eval_samples_per_second": 339.356,
470
+ "eval_steps_per_second": 10.637,
471
+ "step": 620
472
+ },
473
+ {
474
+ "epoch": 10.16,
475
+ "learning_rate": 2.9516129032258067e-05,
476
+ "loss": 0.1164,
477
+ "step": 630
478
+ },
479
+ {
480
+ "epoch": 10.32,
481
+ "learning_rate": 2.903225806451613e-05,
482
+ "loss": 0.1206,
483
+ "step": 640
484
+ },
485
+ {
486
+ "epoch": 10.48,
487
+ "learning_rate": 2.8548387096774196e-05,
488
+ "loss": 0.1121,
489
+ "step": 650
490
+ },
491
+ {
492
+ "epoch": 10.65,
493
+ "learning_rate": 2.806451612903226e-05,
494
+ "loss": 0.123,
495
+ "step": 660
496
+ },
497
+ {
498
+ "epoch": 10.81,
499
+ "learning_rate": 2.758064516129032e-05,
500
+ "loss": 0.1148,
501
+ "step": 670
502
+ },
503
+ {
504
+ "epoch": 10.97,
505
+ "learning_rate": 2.7096774193548387e-05,
506
+ "loss": 0.1166,
507
+ "step": 680
508
+ },
509
+ {
510
+ "epoch": 11.0,
511
+ "eval_f1": 0.5683084466235793,
512
+ "eval_loss": 0.3352525234222412,
513
+ "eval_runtime": 2.8947,
514
+ "eval_samples_per_second": 341.665,
515
+ "eval_steps_per_second": 10.709,
516
+ "step": 682
517
+ },
518
+ {
519
+ "epoch": 11.13,
520
+ "learning_rate": 2.6612903225806453e-05,
521
+ "loss": 0.1068,
522
+ "step": 690
523
+ },
524
+ {
525
+ "epoch": 11.29,
526
+ "learning_rate": 2.6129032258064516e-05,
527
+ "loss": 0.1109,
528
+ "step": 700
529
+ },
530
+ {
531
+ "epoch": 11.45,
532
+ "learning_rate": 2.5645161290322582e-05,
533
+ "loss": 0.1049,
534
+ "step": 710
535
+ },
536
+ {
537
+ "epoch": 11.61,
538
+ "learning_rate": 2.5161290322580648e-05,
539
+ "loss": 0.1042,
540
+ "step": 720
541
+ },
542
+ {
543
+ "epoch": 11.77,
544
+ "learning_rate": 2.467741935483871e-05,
545
+ "loss": 0.1111,
546
+ "step": 730
547
+ },
548
+ {
549
+ "epoch": 11.94,
550
+ "learning_rate": 2.4193548387096773e-05,
551
+ "loss": 0.1079,
552
+ "step": 740
553
+ },
554
+ {
555
+ "epoch": 12.0,
556
+ "eval_f1": 0.5689615996364462,
557
+ "eval_loss": 0.33389773964881897,
558
+ "eval_runtime": 2.9192,
559
+ "eval_samples_per_second": 338.786,
560
+ "eval_steps_per_second": 10.619,
561
+ "step": 744
562
+ },
563
+ {
564
+ "epoch": 12.1,
565
+ "learning_rate": 2.370967741935484e-05,
566
+ "loss": 0.0981,
567
+ "step": 750
568
+ },
569
+ {
570
+ "epoch": 12.26,
571
+ "learning_rate": 2.3225806451612902e-05,
572
+ "loss": 0.1,
573
+ "step": 760
574
+ },
575
+ {
576
+ "epoch": 12.42,
577
+ "learning_rate": 2.274193548387097e-05,
578
+ "loss": 0.0947,
579
+ "step": 770
580
+ },
581
+ {
582
+ "epoch": 12.58,
583
+ "learning_rate": 2.2258064516129034e-05,
584
+ "loss": 0.0937,
585
+ "step": 780
586
+ },
587
+ {
588
+ "epoch": 12.74,
589
+ "learning_rate": 2.1774193548387097e-05,
590
+ "loss": 0.0978,
591
+ "step": 790
592
+ },
593
+ {
594
+ "epoch": 12.9,
595
+ "learning_rate": 2.1290322580645163e-05,
596
+ "loss": 0.0988,
597
+ "step": 800
598
+ },
599
+ {
600
+ "epoch": 13.0,
601
+ "eval_f1": 0.573460769059791,
602
+ "eval_loss": 0.34931689500808716,
603
+ "eval_runtime": 2.9238,
604
+ "eval_samples_per_second": 338.256,
605
+ "eval_steps_per_second": 10.603,
606
+ "step": 806
607
+ },
608
+ {
609
+ "epoch": 13.06,
610
+ "learning_rate": 2.080645161290323e-05,
611
+ "loss": 0.0948,
612
+ "step": 810
613
+ },
614
+ {
615
+ "epoch": 13.23,
616
+ "learning_rate": 2.032258064516129e-05,
617
+ "loss": 0.0874,
618
+ "step": 820
619
+ },
620
+ {
621
+ "epoch": 13.39,
622
+ "learning_rate": 1.9838709677419355e-05,
623
+ "loss": 0.0894,
624
+ "step": 830
625
+ },
626
+ {
627
+ "epoch": 13.55,
628
+ "learning_rate": 1.935483870967742e-05,
629
+ "loss": 0.0919,
630
+ "step": 840
631
+ },
632
+ {
633
+ "epoch": 13.71,
634
+ "learning_rate": 1.8870967741935484e-05,
635
+ "loss": 0.0899,
636
+ "step": 850
637
+ },
638
+ {
639
+ "epoch": 13.87,
640
+ "learning_rate": 1.838709677419355e-05,
641
+ "loss": 0.0926,
642
+ "step": 860
643
+ },
644
+ {
645
+ "epoch": 14.0,
646
+ "eval_f1": 0.5807734806629834,
647
+ "eval_loss": 0.35069382190704346,
648
+ "eval_runtime": 2.9589,
649
+ "eval_samples_per_second": 334.248,
650
+ "eval_steps_per_second": 10.477,
651
+ "step": 868
652
+ },
653
+ {
654
+ "epoch": 14.03,
655
+ "learning_rate": 1.7903225806451616e-05,
656
+ "loss": 0.0817,
657
+ "step": 870
658
+ },
659
+ {
660
+ "epoch": 14.19,
661
+ "learning_rate": 1.741935483870968e-05,
662
+ "loss": 0.0826,
663
+ "step": 880
664
+ },
665
+ {
666
+ "epoch": 14.35,
667
+ "learning_rate": 1.6935483870967744e-05,
668
+ "loss": 0.0842,
669
+ "step": 890
670
+ },
671
+ {
672
+ "epoch": 14.52,
673
+ "learning_rate": 1.6451612903225807e-05,
674
+ "loss": 0.0824,
675
+ "step": 900
676
+ },
677
+ {
678
+ "epoch": 14.68,
679
+ "learning_rate": 1.596774193548387e-05,
680
+ "loss": 0.0822,
681
+ "step": 910
682
+ },
683
+ {
684
+ "epoch": 14.84,
685
+ "learning_rate": 1.5483870967741936e-05,
686
+ "loss": 0.0826,
687
+ "step": 920
688
+ },
689
+ {
690
+ "epoch": 15.0,
691
+ "learning_rate": 1.5e-05,
692
+ "loss": 0.0837,
693
+ "step": 930
694
+ },
695
+ {
696
+ "epoch": 15.0,
697
+ "eval_f1": 0.5837362637362637,
698
+ "eval_loss": 0.3567672073841095,
699
+ "eval_runtime": 2.915,
700
+ "eval_samples_per_second": 339.279,
701
+ "eval_steps_per_second": 10.635,
702
+ "step": 930
703
+ },
704
+ {
705
+ "epoch": 15.16,
706
+ "learning_rate": 1.4516129032258065e-05,
707
+ "loss": 0.0781,
708
+ "step": 940
709
+ },
710
+ {
711
+ "epoch": 15.32,
712
+ "learning_rate": 1.403225806451613e-05,
713
+ "loss": 0.0757,
714
+ "step": 950
715
+ },
716
+ {
717
+ "epoch": 15.48,
718
+ "learning_rate": 1.3548387096774194e-05,
719
+ "loss": 0.0766,
720
+ "step": 960
721
+ },
722
+ {
723
+ "epoch": 15.65,
724
+ "learning_rate": 1.3064516129032258e-05,
725
+ "loss": 0.0788,
726
+ "step": 970
727
+ },
728
+ {
729
+ "epoch": 15.81,
730
+ "learning_rate": 1.2580645161290324e-05,
731
+ "loss": 0.0764,
732
+ "step": 980
733
+ },
734
+ {
735
+ "epoch": 15.97,
736
+ "learning_rate": 1.2096774193548387e-05,
737
+ "loss": 0.076,
738
+ "step": 990
739
+ },
740
+ {
741
+ "epoch": 16.0,
742
+ "eval_f1": 0.5796391178436178,
743
+ "eval_loss": 0.3577311336994171,
744
+ "eval_runtime": 2.8859,
745
+ "eval_samples_per_second": 342.697,
746
+ "eval_steps_per_second": 10.742,
747
+ "step": 992
748
+ },
749
+ {
750
+ "epoch": 16.13,
751
+ "learning_rate": 1.1612903225806451e-05,
752
+ "loss": 0.0743,
753
+ "step": 1000
754
+ },
755
+ {
756
+ "epoch": 16.29,
757
+ "learning_rate": 1.1129032258064517e-05,
758
+ "loss": 0.0713,
759
+ "step": 1010
760
+ },
761
+ {
762
+ "epoch": 16.45,
763
+ "learning_rate": 1.0645161290322582e-05,
764
+ "loss": 0.0744,
765
+ "step": 1020
766
+ },
767
+ {
768
+ "epoch": 16.61,
769
+ "learning_rate": 1.0161290322580644e-05,
770
+ "loss": 0.069,
771
+ "step": 1030
772
+ },
773
+ {
774
+ "epoch": 16.77,
775
+ "learning_rate": 9.67741935483871e-06,
776
+ "loss": 0.0742,
777
+ "step": 1040
778
+ },
779
+ {
780
+ "epoch": 16.94,
781
+ "learning_rate": 9.193548387096775e-06,
782
+ "loss": 0.0718,
783
+ "step": 1050
784
+ },
785
+ {
786
+ "epoch": 17.0,
787
+ "eval_f1": 0.5815983881799865,
788
+ "eval_loss": 0.3607926368713379,
789
+ "eval_runtime": 2.9273,
790
+ "eval_samples_per_second": 337.857,
791
+ "eval_steps_per_second": 10.59,
792
+ "step": 1054
793
+ },
794
+ {
795
+ "epoch": 17.1,
796
+ "learning_rate": 8.70967741935484e-06,
797
+ "loss": 0.0704,
798
+ "step": 1060
799
+ },
800
+ {
801
+ "epoch": 17.26,
802
+ "learning_rate": 8.225806451612904e-06,
803
+ "loss": 0.067,
804
+ "step": 1070
805
+ },
806
+ {
807
+ "epoch": 17.42,
808
+ "learning_rate": 7.741935483870968e-06,
809
+ "loss": 0.0709,
810
+ "step": 1080
811
+ },
812
+ {
813
+ "epoch": 17.58,
814
+ "learning_rate": 7.258064516129032e-06,
815
+ "loss": 0.0679,
816
+ "step": 1090
817
+ },
818
+ {
819
+ "epoch": 17.74,
820
+ "learning_rate": 6.774193548387097e-06,
821
+ "loss": 0.071,
822
+ "step": 1100
823
+ },
824
+ {
825
+ "epoch": 17.9,
826
+ "learning_rate": 6.290322580645162e-06,
827
+ "loss": 0.0685,
828
+ "step": 1110
829
+ },
830
+ {
831
+ "epoch": 18.0,
832
+ "eval_f1": 0.5815289438798055,
833
+ "eval_loss": 0.36424893140792847,
834
+ "eval_runtime": 2.9446,
835
+ "eval_samples_per_second": 335.873,
836
+ "eval_steps_per_second": 10.528,
837
+ "step": 1116
838
+ },
839
+ {
840
+ "epoch": 18.06,
841
+ "learning_rate": 5.8064516129032256e-06,
842
+ "loss": 0.0673,
843
+ "step": 1120
844
+ },
845
+ {
846
+ "epoch": 18.23,
847
+ "learning_rate": 5.322580645161291e-06,
848
+ "loss": 0.0666,
849
+ "step": 1130
850
+ },
851
+ {
852
+ "epoch": 18.39,
853
+ "learning_rate": 4.838709677419355e-06,
854
+ "loss": 0.0663,
855
+ "step": 1140
856
+ },
857
+ {
858
+ "epoch": 18.55,
859
+ "learning_rate": 4.35483870967742e-06,
860
+ "loss": 0.0669,
861
+ "step": 1150
862
+ },
863
+ {
864
+ "epoch": 18.71,
865
+ "learning_rate": 3.870967741935484e-06,
866
+ "loss": 0.0677,
867
+ "step": 1160
868
+ },
869
+ {
870
+ "epoch": 18.87,
871
+ "learning_rate": 3.3870967741935484e-06,
872
+ "loss": 0.0665,
873
+ "step": 1170
874
+ },
875
+ {
876
+ "epoch": 19.0,
877
+ "eval_f1": 0.5837742504409171,
878
+ "eval_loss": 0.36555004119873047,
879
+ "eval_runtime": 2.9275,
880
+ "eval_samples_per_second": 337.836,
881
+ "eval_steps_per_second": 10.589,
882
+ "step": 1178
883
+ },
884
+ {
885
+ "epoch": 19.03,
886
+ "learning_rate": 2.9032258064516128e-06,
887
+ "loss": 0.0656,
888
+ "step": 1180
889
+ },
890
+ {
891
+ "epoch": 19.19,
892
+ "learning_rate": 2.4193548387096776e-06,
893
+ "loss": 0.063,
894
+ "step": 1190
895
+ },
896
+ {
897
+ "epoch": 19.35,
898
+ "learning_rate": 1.935483870967742e-06,
899
+ "loss": 0.0628,
900
+ "step": 1200
901
+ },
902
+ {
903
+ "epoch": 19.52,
904
+ "learning_rate": 1.4516129032258064e-06,
905
+ "loss": 0.0651,
906
+ "step": 1210
907
+ },
908
+ {
909
+ "epoch": 19.68,
910
+ "learning_rate": 9.67741935483871e-07,
911
+ "loss": 0.0662,
912
+ "step": 1220
913
+ },
914
+ {
915
+ "epoch": 19.84,
916
+ "learning_rate": 4.838709677419355e-07,
917
+ "loss": 0.0636,
918
+ "step": 1230
919
+ },
920
+ {
921
+ "epoch": 20.0,
922
+ "learning_rate": 0.0,
923
+ "loss": 0.0676,
924
+ "step": 1240
925
+ },
926
+ {
927
+ "epoch": 20.0,
928
+ "eval_f1": 0.5832229580573952,
929
+ "eval_loss": 0.3661610186100006,
930
+ "eval_runtime": 2.9338,
931
+ "eval_samples_per_second": 337.105,
932
+ "eval_steps_per_second": 10.566,
933
+ "step": 1240
934
+ },
935
+ {
936
+ "epoch": 20.0,
937
+ "step": 1240,
938
+ "total_flos": 2.082560024604672e+16,
939
+ "train_loss": 0.14994276651451666,
940
+ "train_runtime": 1498.2493,
941
+ "train_samples_per_second": 105.643,
942
+ "train_steps_per_second": 0.828
943
+ }
944
+ ],
945
+ "logging_steps": 10,
946
+ "max_steps": 1240,
947
+ "num_train_epochs": 20,
948
+ "save_steps": 500,
949
+ "total_flos": 2.082560024604672e+16,
950
+ "trial_name": null,
951
+ "trial_params": null
952
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:49d4aa7ecccbe4b27163ea940d8f5d4aed0acdeee63d06ce326f947364079c53
3
+ size 4600
vocab.json ADDED
The diff for this file is too large to render. See raw diff