agentlans committed
Commit a5ffd1d
1 Parent(s): 05c901f

Trained with more data

all_results.json CHANGED
@@ -1,14 +1,9 @@
  {
  "epoch": 10.0,
- "eval_loss": 1.7977033853530884,
- "eval_runtime": 2.6931,
- "eval_samples": 1000,
- "eval_samples_per_second": 371.32,
- "eval_steps_per_second": 46.415,
- "total_flos": 1.1470348787122176e+16,
- "train_loss": 1.9159377371910775,
- "train_runtime": 858.1835,
- "train_samples": 9693,
- "train_samples_per_second": 112.948,
- "train_steps_per_second": 14.123
  }

  {
  "epoch": 10.0,
+ "total_flos": 2.615057185279181e+16,
+ "train_loss": 1.8617943959923335,
+ "train_runtime": 1938.923,
+ "train_samples": 21811,
+ "train_samples_per_second": 112.49,
+ "train_steps_per_second": 14.065
  }
config.json CHANGED
@@ -55,7 +55,7 @@
  }
  },
  "torch_dtype": "float32",
- "transformers_version": "4.44.2",
  "use_cache": true,
  "vocab_size": 32128
  }

  }
  },
  "torch_dtype": "float32",
+ "transformers_version": "4.45.1",
  "use_cache": true,
  "vocab_size": 32128
  }
generation_config.json CHANGED
@@ -2,5 +2,5 @@
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
- "transformers_version": "4.44.2"
  }

  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
+ "transformers_version": "4.45.1"
  }
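
The only change in config.json and generation_config.json is the transformers_version stamp (4.44.2 to 4.45.1), i.e. the library version that wrote the updated checkpoint. A minimal sketch to compare it against a local install (assumes the transformers package is installed; an exact match is not required to load the model):

from importlib.metadata import version

# config.json / generation_config.json now record transformers 4.45.1 as the
# version that saved this checkpoint; print the local install for comparison.
print(version("transformers"))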
model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:f5da896e2bc44956e39c87b20421d99991d08ece1e764af8cbfa04dc12ce3295
  size 242041896

  version https://git-lfs.github.com/spec/v1
+ oid sha256:c45451090f738291073f43a87918d8f3c94c97a23ce24543100114626e906a8f
  size 242041896
tokenizer.json CHANGED
@@ -2,7 +2,7 @@
  "version": "1.0",
  "truncation": {
  "direction": "Right",
- "max_length": 128,
  "strategy": "LongestFirst",
  "stride": 0
  },

  "version": "1.0",
  "truncation": {
  "direction": "Right",
+ "max_length": 256,
  "strategy": "LongestFirst",
  "stride": 0
  },
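
The functional tokenizer change is the doubled truncation window (max_length 128 to 256). A minimal sketch to confirm what the serialized tokenizer will do after this commit, assuming tokenizer.json from this revision sits in the working directory:

import json

# Read the truncation block straight from the serialized tokenizer file.
with open("tokenizer.json", encoding="utf-8") as f:
    truncation = json.load(f)["truncation"]

print(truncation["max_length"])  # 256 after this commit (previously 128)
print(truncation["strategy"])    # "LongestFirst"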
train_results.json CHANGED
@@ -1,9 +1,9 @@
  {
  "epoch": 10.0,
- "total_flos": 1.1470348787122176e+16,
- "train_loss": 1.9159377371910775,
- "train_runtime": 858.1835,
- "train_samples": 9693,
- "train_samples_per_second": 112.948,
- "train_steps_per_second": 14.123
  }

  {
  "epoch": 10.0,
+ "total_flos": 2.615057185279181e+16,
+ "train_loss": 1.8617943959923335,
+ "train_runtime": 1938.923,
+ "train_samples": 21811,
+ "train_samples_per_second": 112.49,
+ "train_steps_per_second": 14.065
  }
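
The new throughput figures are internally consistent: 21811 samples over 10 epochs in 1938.923 s gives the reported 112.49 samples/s, and with the per-device batch size of 8 recorded in trainer_state.json below (assuming a single process and no gradient accumulation, which the logs do not state explicitly), ceil(21811 / 8) * 10 = 27270 matches the new global_step. A small check with values copied from this commit's JSON files:

import math

# Values copied from train_results.json / trainer_state.json in this commit.
train_samples = 21811
num_epochs = 10
train_runtime = 1938.923   # seconds
train_batch_size = 8       # per-device batch size from trainer_state.json

steps = math.ceil(train_samples / train_batch_size) * num_epochs
print(steps)                                                 # 27270
print(round(train_samples * num_epochs / train_runtime, 2))  # 112.49 samples/s
print(round(steps / train_runtime, 3))                       # ~14.065 steps/s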
trainer_state.json CHANGED
@@ -3,191 +3,401 @@
  "best_model_checkpoint": null,
  "epoch": 10.0,
  "eval_steps": 500,
- "global_step": 12120,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
  {
- "epoch": 0.41254125412541254,
- "grad_norm": 2.3108015060424805,
- "learning_rate": 4.793729372937294e-05,
- "loss": 2.2146,
  "step": 500
  },
  {
- "epoch": 0.8250825082508251,
- "grad_norm": 2.0797488689422607,
- "learning_rate": 4.5874587458745876e-05,
- "loss": 2.1084,
  "step": 1000
  },
  {
- "epoch": 1.2376237623762376,
- "grad_norm": 3.0814099311828613,
- "learning_rate": 4.3811881188118816e-05,
- "loss": 2.0322,
  "step": 1500
  },
  {
- "epoch": 1.6501650165016502,
- "grad_norm": 2.495213508605957,
- "learning_rate": 4.174917491749175e-05,
- "loss": 2.0345,
  "step": 2000
  },
  {
- "epoch": 2.062706270627063,
- "grad_norm": 2.1899046897888184,
- "learning_rate": 3.968646864686469e-05,
- "loss": 2.0033,
  "step": 2500
  },
  {
- "epoch": 2.4752475247524752,
- "grad_norm": 1.9743722677230835,
- "learning_rate": 3.762376237623763e-05,
- "loss": 1.9696,
  "step": 3000
  },
  {
- "epoch": 2.887788778877888,
- "grad_norm": 2.0741982460021973,
- "learning_rate": 3.556105610561056e-05,
- "loss": 1.9576,
  "step": 3500
  },
  {
- "epoch": 3.3003300330033003,
- "grad_norm": 2.478909492492676,
- "learning_rate": 3.34983498349835e-05,
- "loss": 1.9328,
  "step": 4000
  },
  {
- "epoch": 3.7128712871287126,
- "grad_norm": 1.9436851739883423,
- "learning_rate": 3.1435643564356435e-05,
- "loss": 1.9219,
  "step": 4500
  },
  {
- "epoch": 4.125412541254126,
- "grad_norm": 2.5008606910705566,
- "learning_rate": 2.9372937293729375e-05,
- "loss": 1.9088,
  "step": 5000
  },
  {
- "epoch": 4.537953795379538,
- "grad_norm": 2.509181499481201,
- "learning_rate": 2.731023102310231e-05,
- "loss": 1.9007,
  "step": 5500
  },
  {
- "epoch": 4.9504950495049505,
- "grad_norm": 2.128865957260132,
- "learning_rate": 2.5247524752475248e-05,
- "loss": 1.8946,
  "step": 6000
  },
  {
- "epoch": 5.363036303630363,
- "grad_norm": 2.622591972351074,
- "learning_rate": 2.3184818481848185e-05,
- "loss": 1.8833,
  "step": 6500
  },
  {
- "epoch": 5.775577557755776,
- "grad_norm": 2.249598264694214,
- "learning_rate": 2.1122112211221125e-05,
- "loss": 1.8689,
  "step": 7000
  },
  {
- "epoch": 6.188118811881188,
- "grad_norm": 2.382103681564331,
- "learning_rate": 1.905940594059406e-05,
- "loss": 1.8549,
  "step": 7500
  },
  {
- "epoch": 6.600660066006601,
- "grad_norm": 2.8902101516723633,
- "learning_rate": 1.6996699669966998e-05,
- "loss": 1.8497,
  "step": 8000
  },
  {
- "epoch": 7.013201320132013,
- "grad_norm": 3.372351884841919,
- "learning_rate": 1.4933993399339935e-05,
- "loss": 1.8578,
  "step": 8500
  },
  {
- "epoch": 7.425742574257426,
- "grad_norm": 2.1681647300720215,
- "learning_rate": 1.2871287128712873e-05,
- "loss": 1.8376,
  "step": 9000
  },
  {
- "epoch": 7.838283828382838,
- "grad_norm": 2.148872137069702,
- "learning_rate": 1.080858085808581e-05,
- "loss": 1.846,
  "step": 9500
  },
  {
- "epoch": 8.250825082508252,
- "grad_norm": 2.4388763904571533,
- "learning_rate": 8.745874587458746e-06,
- "loss": 1.825,
  "step": 10000
  },
  {
- "epoch": 8.663366336633663,
- "grad_norm": 3.16158127784729,
- "learning_rate": 6.6831683168316835e-06,
- "loss": 1.8288,
  "step": 10500
  },
  {
- "epoch": 9.075907590759076,
- "grad_norm": 2.0919010639190674,
- "learning_rate": 4.62046204620462e-06,
- "loss": 1.8287,
  "step": 11000
  },
  {
- "epoch": 9.488448844884488,
- "grad_norm": 2.2496068477630615,
- "learning_rate": 2.557755775577558e-06,
- "loss": 1.8293,
  "step": 11500
  },
  {
  "epoch": 9.900990099009901,
- "grad_norm": 3.0332181453704834,
  "learning_rate": 4.950495049504951e-07,
- "loss": 1.8216,
- "step": 12000
  },
  {
  "epoch": 10.0,
- "step": 12120,
- "total_flos": 1.1470348787122176e+16,
- "train_loss": 1.9159377371910775,
- "train_runtime": 858.1835,
- "train_samples_per_second": 112.948,
- "train_steps_per_second": 14.123
  }
  ],
  "logging_steps": 500,
- "max_steps": 12120,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 500,
@@ -203,7 +413,7 @@
  "attributes": {}
  }
  },
- "total_flos": 1.1470348787122176e+16,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
 
  "best_model_checkpoint": null,
  "epoch": 10.0,
  "eval_steps": 500,
+ "global_step": 27270,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
  {
+ "epoch": 0.18335166850018336,
+ "grad_norm": 2.6387195587158203,
+ "learning_rate": 4.908324165749908e-05,
+ "loss": 2.1955,
  "step": 500
  },
  {
+ "epoch": 0.3667033370003667,
+ "grad_norm": 2.266592025756836,
+ "learning_rate": 4.816648331499817e-05,
+ "loss": 2.0932,
  "step": 1000
  },
  {
+ "epoch": 0.5500550055005501,
+ "grad_norm": 3.0838563442230225,
+ "learning_rate": 4.724972497249725e-05,
+ "loss": 2.048,
  "step": 1500
  },
  {
+ "epoch": 0.7334066740007334,
+ "grad_norm": 2.5882773399353027,
+ "learning_rate": 4.633296662999633e-05,
+ "loss": 2.0419,
  "step": 2000
  },
  {
+ "epoch": 0.9167583425009168,
+ "grad_norm": 2.441417694091797,
+ "learning_rate": 4.541620828749542e-05,
+ "loss": 2.0299,
  "step": 2500
  },
  {
+ "epoch": 1.1001100110011002,
+ "grad_norm": 2.2205169200897217,
+ "learning_rate": 4.449944994499451e-05,
+ "loss": 1.9854,
  "step": 3000
  },
  {
+ "epoch": 1.2834616795012834,
+ "grad_norm": 2.802492380142212,
+ "learning_rate": 4.358269160249359e-05,
+ "loss": 1.9621,
  "step": 3500
  },
  {
+ "epoch": 1.466813348001467,
+ "grad_norm": 6.791469573974609,
+ "learning_rate": 4.266593325999267e-05,
+ "loss": 1.957,
  "step": 4000
  },
  {
+ "epoch": 1.6501650165016502,
+ "grad_norm": 2.5560526847839355,
+ "learning_rate": 4.174917491749175e-05,
+ "loss": 1.9447,
  "step": 4500
  },
  {
+ "epoch": 1.8335166850018334,
+ "grad_norm": 2.623948812484741,
+ "learning_rate": 4.0832416574990836e-05,
+ "loss": 1.9517,
  "step": 5000
  },
  {
+ "epoch": 2.0168683535020167,
+ "grad_norm": 2.395932674407959,
+ "learning_rate": 3.991565823248992e-05,
+ "loss": 1.9559,
  "step": 5500
  },
  {
+ "epoch": 2.2002200220022003,
+ "grad_norm": 2.153954267501831,
+ "learning_rate": 3.8998899889989e-05,
+ "loss": 1.9099,
  "step": 6000
  },
  {
+ "epoch": 2.3835716905023836,
+ "grad_norm": 2.0535569190979004,
+ "learning_rate": 3.808214154748808e-05,
+ "loss": 1.8994,
  "step": 6500
  },
  {
+ "epoch": 2.566923359002567,
+ "grad_norm": 2.264138698577881,
+ "learning_rate": 3.716538320498717e-05,
+ "loss": 1.9069,
  "step": 7000
  },
  {
+ "epoch": 2.7502750275027505,
+ "grad_norm": 1.9639852046966553,
+ "learning_rate": 3.624862486248625e-05,
+ "loss": 1.9109,
  "step": 7500
  },
  {
+ "epoch": 2.933626696002934,
+ "grad_norm": 1.975576639175415,
+ "learning_rate": 3.5331866519985334e-05,
+ "loss": 1.9207,
  "step": 8000
  },
  {
+ "epoch": 3.116978364503117,
+ "grad_norm": 1.8623844385147095,
+ "learning_rate": 3.4415108177484414e-05,
+ "loss": 1.8771,
  "step": 8500
  },
  {
+ "epoch": 3.3003300330033003,
+ "grad_norm": 2.512981653213501,
+ "learning_rate": 3.34983498349835e-05,
+ "loss": 1.8801,
  "step": 9000
  },
  {
+ "epoch": 3.4836817015034836,
+ "grad_norm": 2.324370861053467,
+ "learning_rate": 3.258159149248258e-05,
+ "loss": 1.8746,
  "step": 9500
  },
  {
+ "epoch": 3.667033370003667,
+ "grad_norm": 2.3853843212127686,
+ "learning_rate": 3.166483314998166e-05,
+ "loss": 1.8549,
  "step": 10000
  },
  {
+ "epoch": 3.8503850385038505,
+ "grad_norm": 1.7555441856384277,
+ "learning_rate": 3.074807480748075e-05,
+ "loss": 1.8724,
  "step": 10500
  },
  {
+ "epoch": 4.033736707004033,
+ "grad_norm": 2.166177749633789,
+ "learning_rate": 2.983131646497983e-05,
+ "loss": 1.8574,
  "step": 11000
  },
  {
+ "epoch": 4.2170883755042174,
+ "grad_norm": 2.927156925201416,
+ "learning_rate": 2.891455812247892e-05,
+ "loss": 1.8507,
  "step": 11500
  },
+ {
+ "epoch": 4.400440044004401,
+ "grad_norm": 2.2842493057250977,
+ "learning_rate": 2.7997799779978003e-05,
+ "loss": 1.8447,
+ "step": 12000
+ },
+ {
+ "epoch": 4.583791712504584,
+ "grad_norm": 2.1565608978271484,
+ "learning_rate": 2.7081041437477084e-05,
+ "loss": 1.8359,
+ "step": 12500
+ },
+ {
+ "epoch": 4.767143381004767,
+ "grad_norm": 2.3850159645080566,
+ "learning_rate": 2.6164283094976168e-05,
+ "loss": 1.8413,
+ "step": 13000
+ },
+ {
+ "epoch": 4.9504950495049505,
+ "grad_norm": 2.2318992614746094,
+ "learning_rate": 2.5247524752475248e-05,
+ "loss": 1.8397,
+ "step": 13500
+ },
+ {
+ "epoch": 5.133846718005134,
+ "grad_norm": 2.347505569458008,
+ "learning_rate": 2.4330766409974332e-05,
+ "loss": 1.8196,
+ "step": 14000
+ },
+ {
+ "epoch": 5.317198386505317,
+ "grad_norm": 2.1487085819244385,
+ "learning_rate": 2.3414008067473413e-05,
+ "loss": 1.8309,
+ "step": 14500
+ },
+ {
+ "epoch": 5.5005500550055,
+ "grad_norm": 2.21701717376709,
+ "learning_rate": 2.24972497249725e-05,
+ "loss": 1.8212,
+ "step": 15000
+ },
+ {
+ "epoch": 5.683901723505684,
+ "grad_norm": 2.2371625900268555,
+ "learning_rate": 2.158049138247158e-05,
+ "loss": 1.8148,
+ "step": 15500
+ },
+ {
+ "epoch": 5.867253392005868,
+ "grad_norm": 2.157254457473755,
+ "learning_rate": 2.0663733039970665e-05,
+ "loss": 1.83,
+ "step": 16000
+ },
+ {
+ "epoch": 6.050605060506051,
+ "grad_norm": 2.467193603515625,
+ "learning_rate": 1.9746974697469746e-05,
+ "loss": 1.8115,
+ "step": 16500
+ },
+ {
+ "epoch": 6.233956729006234,
+ "grad_norm": 2.3348140716552734,
+ "learning_rate": 1.883021635496883e-05,
+ "loss": 1.8037,
+ "step": 17000
+ },
+ {
+ "epoch": 6.417308397506417,
+ "grad_norm": 2.6734750270843506,
+ "learning_rate": 1.7913458012467914e-05,
+ "loss": 1.805,
+ "step": 17500
+ },
+ {
+ "epoch": 6.600660066006601,
+ "grad_norm": 2.0465357303619385,
+ "learning_rate": 1.6996699669966998e-05,
+ "loss": 1.8014,
+ "step": 18000
+ },
+ {
+ "epoch": 6.784011734506784,
+ "grad_norm": 2.7078983783721924,
+ "learning_rate": 1.6079941327466082e-05,
+ "loss": 1.7988,
+ "step": 18500
+ },
+ {
+ "epoch": 6.967363403006967,
+ "grad_norm": 2.771340847015381,
+ "learning_rate": 1.5163182984965163e-05,
+ "loss": 1.796,
+ "step": 19000
+ },
+ {
+ "epoch": 7.15071507150715,
+ "grad_norm": 2.8874900341033936,
+ "learning_rate": 1.4246424642464248e-05,
+ "loss": 1.7986,
+ "step": 19500
+ },
+ {
+ "epoch": 7.334066740007334,
+ "grad_norm": 2.211998701095581,
+ "learning_rate": 1.3329666299963331e-05,
+ "loss": 1.7963,
+ "step": 20000
+ },
+ {
+ "epoch": 7.517418408507518,
+ "grad_norm": 2.0423145294189453,
+ "learning_rate": 1.2412907957462413e-05,
+ "loss": 1.7784,
+ "step": 20500
+ },
+ {
+ "epoch": 7.700770077007701,
+ "grad_norm": 2.0297274589538574,
+ "learning_rate": 1.1496149614961496e-05,
+ "loss": 1.7987,
+ "step": 21000
+ },
+ {
+ "epoch": 7.884121745507884,
+ "grad_norm": 2.6104135513305664,
+ "learning_rate": 1.057939127246058e-05,
+ "loss": 1.7847,
+ "step": 21500
+ },
+ {
+ "epoch": 8.067473414008067,
+ "grad_norm": 2.4602534770965576,
+ "learning_rate": 9.662632929959662e-06,
+ "loss": 1.7903,
+ "step": 22000
+ },
+ {
+ "epoch": 8.250825082508252,
+ "grad_norm": 2.1262950897216797,
+ "learning_rate": 8.745874587458746e-06,
+ "loss": 1.795,
+ "step": 22500
+ },
+ {
+ "epoch": 8.434176751008435,
+ "grad_norm": 2.538703680038452,
+ "learning_rate": 7.829116244957828e-06,
+ "loss": 1.7708,
+ "step": 23000
+ },
+ {
+ "epoch": 8.617528419508618,
+ "grad_norm": 2.607891321182251,
+ "learning_rate": 6.912357902456913e-06,
+ "loss": 1.7824,
+ "step": 23500
+ },
+ {
+ "epoch": 8.800880088008801,
+ "grad_norm": 2.0636353492736816,
+ "learning_rate": 5.995599559955996e-06,
+ "loss": 1.7651,
+ "step": 24000
+ },
+ {
+ "epoch": 8.984231756508985,
+ "grad_norm": 2.1311886310577393,
+ "learning_rate": 5.078841217455079e-06,
+ "loss": 1.7825,
+ "step": 24500
+ },
+ {
+ "epoch": 9.167583425009168,
+ "grad_norm": 2.185648202896118,
+ "learning_rate": 4.162082874954162e-06,
+ "loss": 1.7744,
+ "step": 25000
+ },
+ {
+ "epoch": 9.350935093509351,
+ "grad_norm": 2.149644374847412,
+ "learning_rate": 3.2453245324532458e-06,
+ "loss": 1.781,
+ "step": 25500
+ },
+ {
+ "epoch": 9.534286762009534,
+ "grad_norm": 2.1185684204101562,
+ "learning_rate": 2.3285661899523286e-06,
+ "loss": 1.7712,
+ "step": 26000
+ },
+ {
+ "epoch": 9.717638430509718,
+ "grad_norm": 2.186683416366577,
+ "learning_rate": 1.411807847451412e-06,
+ "loss": 1.7739,
+ "step": 26500
+ },
  {
  "epoch": 9.900990099009901,
+ "grad_norm": 2.319247007369995,
  "learning_rate": 4.950495049504951e-07,
+ "loss": 1.7713,
+ "step": 27000
  },
  {
  "epoch": 10.0,
+ "step": 27270,
+ "total_flos": 2.615057185279181e+16,
+ "train_loss": 1.8617943959923335,
+ "train_runtime": 1938.923,
+ "train_samples_per_second": 112.49,
+ "train_steps_per_second": 14.065
  }
  ],
  "logging_steps": 500,
+ "max_steps": 27270,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 500,
 
  "attributes": {}
  }
  },
+ "total_flos": 2.615057185279181e+16,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null