BilelDJ commited on
Commit
9442d20
1 Parent(s): 3ca0668

Training in progress, step 282

Browse files
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f13d2d2ee4a89f12eb24c2531e16499da0ada8ea331ec110dbd806519b8eb9b4
3
  size 605156676
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0263202fc195d3f84616e80a536b52506dab04b8e4e367bb1b9bcfbd400f3101
3
  size 605156676
run-bq1dw9ny/checkpoint-282/config.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "openai/clip-vit-base-patch32",
3
+ "architectures": [
4
+ "CLIPModel"
5
+ ],
6
+ "initializer_factor": 1.0,
7
+ "logit_scale_init_value": 2.6592,
8
+ "model_type": "clip",
9
+ "projection_dim": 512,
10
+ "text_config": {
11
+ "bos_token_id": 0,
12
+ "dropout": 0.0,
13
+ "eos_token_id": 2,
14
+ "model_type": "clip_text_model"
15
+ },
16
+ "torch_dtype": "float32",
17
+ "transformers_version": "4.42.0.dev0",
18
+ "vision_config": {
19
+ "dropout": 0.0,
20
+ "model_type": "clip_vision_model"
21
+ }
22
+ }
run-bq1dw9ny/checkpoint-282/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0263202fc195d3f84616e80a536b52506dab04b8e4e367bb1b9bcfbd400f3101
3
+ size 605156676
run-bq1dw9ny/checkpoint-282/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7b01104df926457f0eeb1642e7e785485bc4fc5993053aead19db11f58ee503a
3
+ size 1210551612
run-bq1dw9ny/checkpoint-282/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d5c33740806ccf1955d4345523f1b05c54a20fdcaf53bdbe6b546e984ef823f1
3
+ size 14244
run-bq1dw9ny/checkpoint-282/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f867e47eeac9ea634f86ff11b0978933721dd3b3057b39790e98c44aab4c5a70
3
+ size 1064
run-bq1dw9ny/checkpoint-282/trainer_state.json ADDED
@@ -0,0 +1,2155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 3.0,
5
+ "eval_steps": 2,
6
+ "global_step": 282,
7
+ "is_hyper_param_search": true,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.02127659574468085,
13
+ "grad_norm": 147.76431274414062,
14
+ "learning_rate": 0.000495960190363068,
15
+ "loss": 3.2762,
16
+ "step": 2
17
+ },
18
+ {
19
+ "epoch": 0.02127659574468085,
20
+ "eval_loss": 5.192628383636475,
21
+ "eval_runtime": 254.7582,
22
+ "eval_samples_per_second": 1.178,
23
+ "eval_steps_per_second": 0.02,
24
+ "step": 2
25
+ },
26
+ {
27
+ "epoch": 0.0425531914893617,
28
+ "grad_norm": 34.33057403564453,
29
+ "learning_rate": 0.0004924176175747603,
30
+ "loss": 4.1983,
31
+ "step": 4
32
+ },
33
+ {
34
+ "epoch": 0.0425531914893617,
35
+ "eval_loss": 4.506951332092285,
36
+ "eval_runtime": 116.2019,
37
+ "eval_samples_per_second": 2.582,
38
+ "eval_steps_per_second": 0.043,
39
+ "step": 4
40
+ },
41
+ {
42
+ "epoch": 0.06382978723404255,
43
+ "grad_norm": 2.5619542598724365,
44
+ "learning_rate": 0.0004888750447864527,
45
+ "loss": 3.6907,
46
+ "step": 6
47
+ },
48
+ {
49
+ "epoch": 0.06382978723404255,
50
+ "eval_loss": 4.125861644744873,
51
+ "eval_runtime": 95.9708,
52
+ "eval_samples_per_second": 3.126,
53
+ "eval_steps_per_second": 0.052,
54
+ "step": 6
55
+ },
56
+ {
57
+ "epoch": 0.0851063829787234,
58
+ "grad_norm": 1.6348135471343994,
59
+ "learning_rate": 0.0004853324719981451,
60
+ "loss": 3.5109,
61
+ "step": 8
62
+ },
63
+ {
64
+ "epoch": 0.0851063829787234,
65
+ "eval_loss": 4.108506679534912,
66
+ "eval_runtime": 95.4994,
67
+ "eval_samples_per_second": 3.141,
68
+ "eval_steps_per_second": 0.052,
69
+ "step": 8
70
+ },
71
+ {
72
+ "epoch": 0.10638297872340426,
73
+ "grad_norm": 0.02139226719737053,
74
+ "learning_rate": 0.00048178989920983744,
75
+ "loss": 3.4686,
76
+ "step": 10
77
+ },
78
+ {
79
+ "epoch": 0.10638297872340426,
80
+ "eval_loss": 4.104280471801758,
81
+ "eval_runtime": 114.204,
82
+ "eval_samples_per_second": 2.627,
83
+ "eval_steps_per_second": 0.044,
84
+ "step": 10
85
+ },
86
+ {
87
+ "epoch": 0.1276595744680851,
88
+ "grad_norm": 0.005519088823348284,
89
+ "learning_rate": 0.0004782473264215299,
90
+ "loss": 3.466,
91
+ "step": 12
92
+ },
93
+ {
94
+ "epoch": 0.1276595744680851,
95
+ "eval_loss": 4.104023456573486,
96
+ "eval_runtime": 96.7284,
97
+ "eval_samples_per_second": 3.101,
98
+ "eval_steps_per_second": 0.052,
99
+ "step": 12
100
+ },
101
+ {
102
+ "epoch": 0.14893617021276595,
103
+ "grad_norm": 0.005563751328736544,
104
+ "learning_rate": 0.00047470475363322223,
105
+ "loss": 3.4658,
106
+ "step": 14
107
+ },
108
+ {
109
+ "epoch": 0.14893617021276595,
110
+ "eval_loss": 4.104072093963623,
111
+ "eval_runtime": 97.0883,
112
+ "eval_samples_per_second": 3.09,
113
+ "eval_steps_per_second": 0.051,
114
+ "step": 14
115
+ },
116
+ {
117
+ "epoch": 0.1702127659574468,
118
+ "grad_norm": 0.0055496166460216045,
119
+ "learning_rate": 0.00047116218084491463,
120
+ "loss": 3.4659,
121
+ "step": 16
122
+ },
123
+ {
124
+ "epoch": 0.1702127659574468,
125
+ "eval_loss": 4.104039669036865,
126
+ "eval_runtime": 93.8461,
127
+ "eval_samples_per_second": 3.197,
128
+ "eval_steps_per_second": 0.053,
129
+ "step": 16
130
+ },
131
+ {
132
+ "epoch": 0.19148936170212766,
133
+ "grad_norm": 0.035093989223241806,
134
+ "learning_rate": 0.000467619608056607,
135
+ "loss": 3.4659,
136
+ "step": 18
137
+ },
138
+ {
139
+ "epoch": 0.19148936170212766,
140
+ "eval_loss": 4.104025363922119,
141
+ "eval_runtime": 93.5115,
142
+ "eval_samples_per_second": 3.208,
143
+ "eval_steps_per_second": 0.053,
144
+ "step": 18
145
+ },
146
+ {
147
+ "epoch": 0.2127659574468085,
148
+ "grad_norm": 0.013235281221568584,
149
+ "learning_rate": 0.0004640770352682993,
150
+ "loss": 3.4658,
151
+ "step": 20
152
+ },
153
+ {
154
+ "epoch": 0.2127659574468085,
155
+ "eval_loss": 4.1039652824401855,
156
+ "eval_runtime": 93.8067,
157
+ "eval_samples_per_second": 3.198,
158
+ "eval_steps_per_second": 0.053,
159
+ "step": 20
160
+ },
161
+ {
162
+ "epoch": 0.23404255319148937,
163
+ "grad_norm": 0.0016526976833119988,
164
+ "learning_rate": 0.0004605344624799917,
165
+ "loss": 3.4658,
166
+ "step": 22
167
+ },
168
+ {
169
+ "epoch": 0.23404255319148937,
170
+ "eval_loss": 4.103963375091553,
171
+ "eval_runtime": 93.4791,
172
+ "eval_samples_per_second": 3.209,
173
+ "eval_steps_per_second": 0.053,
174
+ "step": 22
175
+ },
176
+ {
177
+ "epoch": 0.2553191489361702,
178
+ "grad_norm": 0.0017120653064921498,
179
+ "learning_rate": 0.00045699188969168406,
180
+ "loss": 3.4658,
181
+ "step": 24
182
+ },
183
+ {
184
+ "epoch": 0.2553191489361702,
185
+ "eval_loss": 4.103957176208496,
186
+ "eval_runtime": 116.2864,
187
+ "eval_samples_per_second": 2.58,
188
+ "eval_steps_per_second": 0.043,
189
+ "step": 24
190
+ },
191
+ {
192
+ "epoch": 0.2765957446808511,
193
+ "grad_norm": 0.0011341843055561185,
194
+ "learning_rate": 0.00045344931690337645,
195
+ "loss": 3.4658,
196
+ "step": 26
197
+ },
198
+ {
199
+ "epoch": 0.2765957446808511,
200
+ "eval_loss": 4.103954315185547,
201
+ "eval_runtime": 96.3574,
202
+ "eval_samples_per_second": 3.113,
203
+ "eval_steps_per_second": 0.052,
204
+ "step": 26
205
+ },
206
+ {
207
+ "epoch": 0.2978723404255319,
208
+ "grad_norm": 0.0025584695395082235,
209
+ "learning_rate": 0.0004499067441150688,
210
+ "loss": 3.4658,
211
+ "step": 28
212
+ },
213
+ {
214
+ "epoch": 0.2978723404255319,
215
+ "eval_loss": 4.103954792022705,
216
+ "eval_runtime": 109.8402,
217
+ "eval_samples_per_second": 2.731,
218
+ "eval_steps_per_second": 0.046,
219
+ "step": 28
220
+ },
221
+ {
222
+ "epoch": 0.3191489361702128,
223
+ "grad_norm": 0.0012942380271852016,
224
+ "learning_rate": 0.0004463641713267612,
225
+ "loss": 3.4658,
226
+ "step": 30
227
+ },
228
+ {
229
+ "epoch": 0.3191489361702128,
230
+ "eval_loss": 4.1039557456970215,
231
+ "eval_runtime": 114.0952,
232
+ "eval_samples_per_second": 2.629,
233
+ "eval_steps_per_second": 0.044,
234
+ "step": 30
235
+ },
236
+ {
237
+ "epoch": 0.3404255319148936,
238
+ "grad_norm": 0.001233804621733725,
239
+ "learning_rate": 0.00044282159853845354,
240
+ "loss": 3.4658,
241
+ "step": 32
242
+ },
243
+ {
244
+ "epoch": 0.3404255319148936,
245
+ "eval_loss": 4.103955268859863,
246
+ "eval_runtime": 96.1239,
247
+ "eval_samples_per_second": 3.121,
248
+ "eval_steps_per_second": 0.052,
249
+ "step": 32
250
+ },
251
+ {
252
+ "epoch": 0.3617021276595745,
253
+ "grad_norm": 0.0013235628139227629,
254
+ "learning_rate": 0.00043927902575014593,
255
+ "loss": 3.4658,
256
+ "step": 34
257
+ },
258
+ {
259
+ "epoch": 0.3617021276595745,
260
+ "eval_loss": 4.103952407836914,
261
+ "eval_runtime": 135.012,
262
+ "eval_samples_per_second": 2.222,
263
+ "eval_steps_per_second": 0.037,
264
+ "step": 34
265
+ },
266
+ {
267
+ "epoch": 0.3829787234042553,
268
+ "grad_norm": 0.002040724502876401,
269
+ "learning_rate": 0.00043573645296183833,
270
+ "loss": 3.4658,
271
+ "step": 36
272
+ },
273
+ {
274
+ "epoch": 0.3829787234042553,
275
+ "eval_loss": 4.103944301605225,
276
+ "eval_runtime": 96.6912,
277
+ "eval_samples_per_second": 3.103,
278
+ "eval_steps_per_second": 0.052,
279
+ "step": 36
280
+ },
281
+ {
282
+ "epoch": 0.40425531914893614,
283
+ "grad_norm": 0.0005005718558095396,
284
+ "learning_rate": 0.00043219388017353067,
285
+ "loss": 3.4658,
286
+ "step": 38
287
+ },
288
+ {
289
+ "epoch": 0.40425531914893614,
290
+ "eval_loss": 4.103936195373535,
291
+ "eval_runtime": 120.942,
292
+ "eval_samples_per_second": 2.481,
293
+ "eval_steps_per_second": 0.041,
294
+ "step": 38
295
+ },
296
+ {
297
+ "epoch": 0.425531914893617,
298
+ "grad_norm": 0.0004194887587800622,
299
+ "learning_rate": 0.00042865130738522307,
300
+ "loss": 3.4657,
301
+ "step": 40
302
+ },
303
+ {
304
+ "epoch": 0.425531914893617,
305
+ "eval_loss": 4.103933334350586,
306
+ "eval_runtime": 165.7391,
307
+ "eval_samples_per_second": 1.81,
308
+ "eval_steps_per_second": 0.03,
309
+ "step": 40
310
+ },
311
+ {
312
+ "epoch": 0.44680851063829785,
313
+ "grad_norm": 0.0007025453960523009,
314
+ "learning_rate": 0.0004251087345969154,
315
+ "loss": 3.4657,
316
+ "step": 42
317
+ },
318
+ {
319
+ "epoch": 0.44680851063829785,
320
+ "eval_loss": 4.103935241699219,
321
+ "eval_runtime": 95.9616,
322
+ "eval_samples_per_second": 3.126,
323
+ "eval_steps_per_second": 0.052,
324
+ "step": 42
325
+ },
326
+ {
327
+ "epoch": 0.46808510638297873,
328
+ "grad_norm": 0.0005715902079828084,
329
+ "learning_rate": 0.0004215661618086078,
330
+ "loss": 3.4657,
331
+ "step": 44
332
+ },
333
+ {
334
+ "epoch": 0.46808510638297873,
335
+ "eval_loss": 4.103936195373535,
336
+ "eval_runtime": 95.6382,
337
+ "eval_samples_per_second": 3.137,
338
+ "eval_steps_per_second": 0.052,
339
+ "step": 44
340
+ },
341
+ {
342
+ "epoch": 0.48936170212765956,
343
+ "grad_norm": 0.0009604791412129998,
344
+ "learning_rate": 0.00041802358902030015,
345
+ "loss": 3.4657,
346
+ "step": 46
347
+ },
348
+ {
349
+ "epoch": 0.48936170212765956,
350
+ "eval_loss": 4.103936195373535,
351
+ "eval_runtime": 95.422,
352
+ "eval_samples_per_second": 3.144,
353
+ "eval_steps_per_second": 0.052,
354
+ "step": 46
355
+ },
356
+ {
357
+ "epoch": 0.5106382978723404,
358
+ "grad_norm": 0.0015922917518764734,
359
+ "learning_rate": 0.00041448101623199255,
360
+ "loss": 3.4657,
361
+ "step": 48
362
+ },
363
+ {
364
+ "epoch": 0.5106382978723404,
365
+ "eval_loss": 4.103933334350586,
366
+ "eval_runtime": 95.2253,
367
+ "eval_samples_per_second": 3.15,
368
+ "eval_steps_per_second": 0.053,
369
+ "step": 48
370
+ },
371
+ {
372
+ "epoch": 0.5319148936170213,
373
+ "grad_norm": 0.00046437734272331,
374
+ "learning_rate": 0.0004109384434436849,
375
+ "loss": 3.4657,
376
+ "step": 50
377
+ },
378
+ {
379
+ "epoch": 0.5319148936170213,
380
+ "eval_loss": 4.103931427001953,
381
+ "eval_runtime": 93.7871,
382
+ "eval_samples_per_second": 3.199,
383
+ "eval_steps_per_second": 0.053,
384
+ "step": 50
385
+ },
386
+ {
387
+ "epoch": 0.5531914893617021,
388
+ "grad_norm": 0.0003185276291333139,
389
+ "learning_rate": 0.00040739587065537723,
390
+ "loss": 3.4657,
391
+ "step": 52
392
+ },
393
+ {
394
+ "epoch": 0.5531914893617021,
395
+ "eval_loss": 4.103931903839111,
396
+ "eval_runtime": 94.4747,
397
+ "eval_samples_per_second": 3.175,
398
+ "eval_steps_per_second": 0.053,
399
+ "step": 52
400
+ },
401
+ {
402
+ "epoch": 0.574468085106383,
403
+ "grad_norm": 0.0005938044050708413,
404
+ "learning_rate": 0.00040385329786706963,
405
+ "loss": 3.4657,
406
+ "step": 54
407
+ },
408
+ {
409
+ "epoch": 0.574468085106383,
410
+ "eval_loss": 4.103933334350586,
411
+ "eval_runtime": 94.4693,
412
+ "eval_samples_per_second": 3.176,
413
+ "eval_steps_per_second": 0.053,
414
+ "step": 54
415
+ },
416
+ {
417
+ "epoch": 0.5957446808510638,
418
+ "grad_norm": 0.0008093913784250617,
419
+ "learning_rate": 0.000400310725078762,
420
+ "loss": 3.4657,
421
+ "step": 56
422
+ },
423
+ {
424
+ "epoch": 0.5957446808510638,
425
+ "eval_loss": 4.103933334350586,
426
+ "eval_runtime": 93.6969,
427
+ "eval_samples_per_second": 3.202,
428
+ "eval_steps_per_second": 0.053,
429
+ "step": 56
430
+ },
431
+ {
432
+ "epoch": 0.6170212765957447,
433
+ "grad_norm": 0.000580158899538219,
434
+ "learning_rate": 0.0003967681522904544,
435
+ "loss": 3.4657,
436
+ "step": 58
437
+ },
438
+ {
439
+ "epoch": 0.6170212765957447,
440
+ "eval_loss": 4.103932857513428,
441
+ "eval_runtime": 94.0277,
442
+ "eval_samples_per_second": 3.191,
443
+ "eval_steps_per_second": 0.053,
444
+ "step": 58
445
+ },
446
+ {
447
+ "epoch": 0.6382978723404256,
448
+ "grad_norm": 0.0005565917235799134,
449
+ "learning_rate": 0.00039322557950214677,
450
+ "loss": 3.4657,
451
+ "step": 60
452
+ },
453
+ {
454
+ "epoch": 0.6382978723404256,
455
+ "eval_loss": 4.103931903839111,
456
+ "eval_runtime": 107.4438,
457
+ "eval_samples_per_second": 2.792,
458
+ "eval_steps_per_second": 0.047,
459
+ "step": 60
460
+ },
461
+ {
462
+ "epoch": 0.6595744680851063,
463
+ "grad_norm": 0.0006027113413438201,
464
+ "learning_rate": 0.00038968300671383916,
465
+ "loss": 3.4657,
466
+ "step": 62
467
+ },
468
+ {
469
+ "epoch": 0.6595744680851063,
470
+ "eval_loss": 4.103930473327637,
471
+ "eval_runtime": 107.2976,
472
+ "eval_samples_per_second": 2.796,
473
+ "eval_steps_per_second": 0.047,
474
+ "step": 62
475
+ },
476
+ {
477
+ "epoch": 0.6808510638297872,
478
+ "grad_norm": 0.0002680857141967863,
479
+ "learning_rate": 0.0003861404339255315,
480
+ "loss": 3.4657,
481
+ "step": 64
482
+ },
483
+ {
484
+ "epoch": 0.6808510638297872,
485
+ "eval_loss": 4.103930473327637,
486
+ "eval_runtime": 94.4753,
487
+ "eval_samples_per_second": 3.175,
488
+ "eval_steps_per_second": 0.053,
489
+ "step": 64
490
+ },
491
+ {
492
+ "epoch": 0.7021276595744681,
493
+ "grad_norm": 0.0003917052235919982,
494
+ "learning_rate": 0.0003825978611372239,
495
+ "loss": 3.4657,
496
+ "step": 66
497
+ },
498
+ {
499
+ "epoch": 0.7021276595744681,
500
+ "eval_loss": 4.103930473327637,
501
+ "eval_runtime": 94.4775,
502
+ "eval_samples_per_second": 3.175,
503
+ "eval_steps_per_second": 0.053,
504
+ "step": 66
505
+ },
506
+ {
507
+ "epoch": 0.723404255319149,
508
+ "grad_norm": 0.00032164924778044224,
509
+ "learning_rate": 0.00037905528834891625,
510
+ "loss": 3.4657,
511
+ "step": 68
512
+ },
513
+ {
514
+ "epoch": 0.723404255319149,
515
+ "eval_loss": 4.103930473327637,
516
+ "eval_runtime": 94.9268,
517
+ "eval_samples_per_second": 3.16,
518
+ "eval_steps_per_second": 0.053,
519
+ "step": 68
520
+ },
521
+ {
522
+ "epoch": 0.7446808510638298,
523
+ "grad_norm": 0.0004127752035856247,
524
+ "learning_rate": 0.0003755127155606086,
525
+ "loss": 3.4657,
526
+ "step": 70
527
+ },
528
+ {
529
+ "epoch": 0.7446808510638298,
530
+ "eval_loss": 4.103930473327637,
531
+ "eval_runtime": 95.0978,
532
+ "eval_samples_per_second": 3.155,
533
+ "eval_steps_per_second": 0.053,
534
+ "step": 70
535
+ },
536
+ {
537
+ "epoch": 0.7659574468085106,
538
+ "grad_norm": 0.00039929302874952555,
539
+ "learning_rate": 0.000371970142772301,
540
+ "loss": 3.4657,
541
+ "step": 72
542
+ },
543
+ {
544
+ "epoch": 0.7659574468085106,
545
+ "eval_loss": 4.1039299964904785,
546
+ "eval_runtime": 100.3564,
547
+ "eval_samples_per_second": 2.989,
548
+ "eval_steps_per_second": 0.05,
549
+ "step": 72
550
+ },
551
+ {
552
+ "epoch": 0.7872340425531915,
553
+ "grad_norm": 0.00019805252668447793,
554
+ "learning_rate": 0.00036842756998399333,
555
+ "loss": 3.4657,
556
+ "step": 74
557
+ },
558
+ {
559
+ "epoch": 0.7872340425531915,
560
+ "eval_loss": 4.1039299964904785,
561
+ "eval_runtime": 93.9211,
562
+ "eval_samples_per_second": 3.194,
563
+ "eval_steps_per_second": 0.053,
564
+ "step": 74
565
+ },
566
+ {
567
+ "epoch": 0.8085106382978723,
568
+ "grad_norm": 0.00028355230460874736,
569
+ "learning_rate": 0.0003648849971956858,
570
+ "loss": 3.4657,
571
+ "step": 76
572
+ },
573
+ {
574
+ "epoch": 0.8085106382978723,
575
+ "eval_loss": 4.1039299964904785,
576
+ "eval_runtime": 122.794,
577
+ "eval_samples_per_second": 2.443,
578
+ "eval_steps_per_second": 0.041,
579
+ "step": 76
580
+ },
581
+ {
582
+ "epoch": 0.8297872340425532,
583
+ "grad_norm": 0.0003330525360070169,
584
+ "learning_rate": 0.0003613424244073781,
585
+ "loss": 3.4657,
586
+ "step": 78
587
+ },
588
+ {
589
+ "epoch": 0.8297872340425532,
590
+ "eval_loss": 4.1039299964904785,
591
+ "eval_runtime": 108.2578,
592
+ "eval_samples_per_second": 2.771,
593
+ "eval_steps_per_second": 0.046,
594
+ "step": 78
595
+ },
596
+ {
597
+ "epoch": 0.851063829787234,
598
+ "grad_norm": 0.00028144015232101083,
599
+ "learning_rate": 0.0003577998516190705,
600
+ "loss": 3.4657,
601
+ "step": 80
602
+ },
603
+ {
604
+ "epoch": 0.851063829787234,
605
+ "eval_loss": 4.10392951965332,
606
+ "eval_runtime": 93.7754,
607
+ "eval_samples_per_second": 3.199,
608
+ "eval_steps_per_second": 0.053,
609
+ "step": 80
610
+ },
611
+ {
612
+ "epoch": 0.8723404255319149,
613
+ "grad_norm": 0.0003809813060797751,
614
+ "learning_rate": 0.00035425727883076286,
615
+ "loss": 3.4657,
616
+ "step": 82
617
+ },
618
+ {
619
+ "epoch": 0.8723404255319149,
620
+ "eval_loss": 4.1039299964904785,
621
+ "eval_runtime": 104.9248,
622
+ "eval_samples_per_second": 2.859,
623
+ "eval_steps_per_second": 0.048,
624
+ "step": 82
625
+ },
626
+ {
627
+ "epoch": 0.8936170212765957,
628
+ "grad_norm": 0.0005144781316630542,
629
+ "learning_rate": 0.0003507147060424552,
630
+ "loss": 3.4657,
631
+ "step": 84
632
+ },
633
+ {
634
+ "epoch": 0.8936170212765957,
635
+ "eval_loss": 4.1039299964904785,
636
+ "eval_runtime": 94.2337,
637
+ "eval_samples_per_second": 3.184,
638
+ "eval_steps_per_second": 0.053,
639
+ "step": 84
640
+ },
641
+ {
642
+ "epoch": 0.9148936170212766,
643
+ "grad_norm": 0.0004984396509826183,
644
+ "learning_rate": 0.0003471721332541476,
645
+ "loss": 3.4657,
646
+ "step": 86
647
+ },
648
+ {
649
+ "epoch": 0.9148936170212766,
650
+ "eval_loss": 4.103929042816162,
651
+ "eval_runtime": 93.8643,
652
+ "eval_samples_per_second": 3.196,
653
+ "eval_steps_per_second": 0.053,
654
+ "step": 86
655
+ },
656
+ {
657
+ "epoch": 0.9361702127659575,
658
+ "grad_norm": 0.00022737662948202342,
659
+ "learning_rate": 0.00034362956046583994,
660
+ "loss": 3.4657,
661
+ "step": 88
662
+ },
663
+ {
664
+ "epoch": 0.9361702127659575,
665
+ "eval_loss": 4.1039299964904785,
666
+ "eval_runtime": 135.1229,
667
+ "eval_samples_per_second": 2.22,
668
+ "eval_steps_per_second": 0.037,
669
+ "step": 88
670
+ },
671
+ {
672
+ "epoch": 0.9574468085106383,
673
+ "grad_norm": 0.0003345514414831996,
674
+ "learning_rate": 0.00034008698767753234,
675
+ "loss": 3.4657,
676
+ "step": 90
677
+ },
678
+ {
679
+ "epoch": 0.9574468085106383,
680
+ "eval_loss": 4.1039299964904785,
681
+ "eval_runtime": 141.4817,
682
+ "eval_samples_per_second": 2.12,
683
+ "eval_steps_per_second": 0.035,
684
+ "step": 90
685
+ },
686
+ {
687
+ "epoch": 0.9787234042553191,
688
+ "grad_norm": 0.00027788232546299696,
689
+ "learning_rate": 0.0003365444148892247,
690
+ "loss": 3.4657,
691
+ "step": 92
692
+ },
693
+ {
694
+ "epoch": 0.9787234042553191,
695
+ "eval_loss": 4.10392951965332,
696
+ "eval_runtime": 116.6901,
697
+ "eval_samples_per_second": 2.571,
698
+ "eval_steps_per_second": 0.043,
699
+ "step": 92
700
+ },
701
+ {
702
+ "epoch": 1.0,
703
+ "grad_norm": 0.00033105359761975706,
704
+ "learning_rate": 0.0003330018421009171,
705
+ "loss": 3.3006,
706
+ "step": 94
707
+ },
708
+ {
709
+ "epoch": 1.0,
710
+ "eval_loss": 4.103929042816162,
711
+ "eval_runtime": 149.9164,
712
+ "eval_samples_per_second": 2.001,
713
+ "eval_steps_per_second": 0.033,
714
+ "step": 94
715
+ },
716
+ {
717
+ "epoch": 1.0212765957446808,
718
+ "grad_norm": 0.0003272149770054966,
719
+ "learning_rate": 0.0003294592693126094,
720
+ "loss": 3.4657,
721
+ "step": 96
722
+ },
723
+ {
724
+ "epoch": 1.0212765957446808,
725
+ "eval_loss": 4.103928565979004,
726
+ "eval_runtime": 117.8922,
727
+ "eval_samples_per_second": 2.545,
728
+ "eval_steps_per_second": 0.042,
729
+ "step": 96
730
+ },
731
+ {
732
+ "epoch": 1.0425531914893618,
733
+ "grad_norm": 0.0002339623897569254,
734
+ "learning_rate": 0.0003259166965243019,
735
+ "loss": 3.4657,
736
+ "step": 98
737
+ },
738
+ {
739
+ "epoch": 1.0425531914893618,
740
+ "eval_loss": 4.10392951965332,
741
+ "eval_runtime": 94.5849,
742
+ "eval_samples_per_second": 3.172,
743
+ "eval_steps_per_second": 0.053,
744
+ "step": 98
745
+ },
746
+ {
747
+ "epoch": 1.0638297872340425,
748
+ "grad_norm": 0.0003783780266530812,
749
+ "learning_rate": 0.0003223741237359942,
750
+ "loss": 3.4657,
751
+ "step": 100
752
+ },
753
+ {
754
+ "epoch": 1.0638297872340425,
755
+ "eval_loss": 4.103929042816162,
756
+ "eval_runtime": 117.3215,
757
+ "eval_samples_per_second": 2.557,
758
+ "eval_steps_per_second": 0.043,
759
+ "step": 100
760
+ },
761
+ {
762
+ "epoch": 1.0851063829787233,
763
+ "grad_norm": 0.00021044367167633027,
764
+ "learning_rate": 0.00031883155094768656,
765
+ "loss": 3.4657,
766
+ "step": 102
767
+ },
768
+ {
769
+ "epoch": 1.0851063829787233,
770
+ "eval_loss": 4.103929042816162,
771
+ "eval_runtime": 118.2354,
772
+ "eval_samples_per_second": 2.537,
773
+ "eval_steps_per_second": 0.042,
774
+ "step": 102
775
+ },
776
+ {
777
+ "epoch": 1.1063829787234043,
778
+ "grad_norm": 0.00032288962393067777,
779
+ "learning_rate": 0.00031528897815937896,
780
+ "loss": 3.4657,
781
+ "step": 104
782
+ },
783
+ {
784
+ "epoch": 1.1063829787234043,
785
+ "eval_loss": 4.103928089141846,
786
+ "eval_runtime": 111.5984,
787
+ "eval_samples_per_second": 2.688,
788
+ "eval_steps_per_second": 0.045,
789
+ "step": 104
790
+ },
791
+ {
792
+ "epoch": 1.127659574468085,
793
+ "grad_norm": 0.0002670014800969511,
794
+ "learning_rate": 0.0003117464053710713,
795
+ "loss": 3.4657,
796
+ "step": 106
797
+ },
798
+ {
799
+ "epoch": 1.127659574468085,
800
+ "eval_loss": 4.103928565979004,
801
+ "eval_runtime": 121.2363,
802
+ "eval_samples_per_second": 2.475,
803
+ "eval_steps_per_second": 0.041,
804
+ "step": 106
805
+ },
806
+ {
807
+ "epoch": 1.148936170212766,
808
+ "grad_norm": 0.00021224924421403557,
809
+ "learning_rate": 0.0003082038325827637,
810
+ "loss": 3.4657,
811
+ "step": 108
812
+ },
813
+ {
814
+ "epoch": 1.148936170212766,
815
+ "eval_loss": 4.103928089141846,
816
+ "eval_runtime": 102.1198,
817
+ "eval_samples_per_second": 2.938,
818
+ "eval_steps_per_second": 0.049,
819
+ "step": 108
820
+ },
821
+ {
822
+ "epoch": 1.1702127659574468,
823
+ "grad_norm": 0.0002756421163212508,
824
+ "learning_rate": 0.00030466125979445604,
825
+ "loss": 3.4657,
826
+ "step": 110
827
+ },
828
+ {
829
+ "epoch": 1.1702127659574468,
830
+ "eval_loss": 4.103928565979004,
831
+ "eval_runtime": 121.4773,
832
+ "eval_samples_per_second": 2.47,
833
+ "eval_steps_per_second": 0.041,
834
+ "step": 110
835
+ },
836
+ {
837
+ "epoch": 1.1914893617021276,
838
+ "grad_norm": 0.0002671127731446177,
839
+ "learning_rate": 0.00030111868700614843,
840
+ "loss": 3.4657,
841
+ "step": 112
842
+ },
843
+ {
844
+ "epoch": 1.1914893617021276,
845
+ "eval_loss": 4.103928565979004,
846
+ "eval_runtime": 138.1466,
847
+ "eval_samples_per_second": 2.172,
848
+ "eval_steps_per_second": 0.036,
849
+ "step": 112
850
+ },
851
+ {
852
+ "epoch": 1.2127659574468086,
853
+ "grad_norm": 0.0004021762579213828,
854
+ "learning_rate": 0.0002975761142178408,
855
+ "loss": 3.4657,
856
+ "step": 114
857
+ },
858
+ {
859
+ "epoch": 1.2127659574468086,
860
+ "eval_loss": 4.103928089141846,
861
+ "eval_runtime": 101.5548,
862
+ "eval_samples_per_second": 2.954,
863
+ "eval_steps_per_second": 0.049,
864
+ "step": 114
865
+ },
866
+ {
867
+ "epoch": 1.2340425531914894,
868
+ "grad_norm": 0.0002796368789859116,
869
+ "learning_rate": 0.0002940335414295331,
870
+ "loss": 3.4657,
871
+ "step": 116
872
+ },
873
+ {
874
+ "epoch": 1.2340425531914894,
875
+ "eval_loss": 4.103928089141846,
876
+ "eval_runtime": 94.7269,
877
+ "eval_samples_per_second": 3.167,
878
+ "eval_steps_per_second": 0.053,
879
+ "step": 116
880
+ },
881
+ {
882
+ "epoch": 1.2553191489361701,
883
+ "grad_norm": 0.0003006465267390013,
884
+ "learning_rate": 0.0002904909686412255,
885
+ "loss": 3.4657,
886
+ "step": 118
887
+ },
888
+ {
889
+ "epoch": 1.2553191489361701,
890
+ "eval_loss": 4.103928089141846,
891
+ "eval_runtime": 104.4724,
892
+ "eval_samples_per_second": 2.872,
893
+ "eval_steps_per_second": 0.048,
894
+ "step": 118
895
+ },
896
+ {
897
+ "epoch": 1.2765957446808511,
898
+ "grad_norm": 0.00029228252242319286,
899
+ "learning_rate": 0.0002869483958529179,
900
+ "loss": 3.4657,
901
+ "step": 120
902
+ },
903
+ {
904
+ "epoch": 1.2765957446808511,
905
+ "eval_loss": 4.103928565979004,
906
+ "eval_runtime": 108.4499,
907
+ "eval_samples_per_second": 2.766,
908
+ "eval_steps_per_second": 0.046,
909
+ "step": 120
910
+ },
911
+ {
912
+ "epoch": 1.297872340425532,
913
+ "grad_norm": 0.00032636514515616,
914
+ "learning_rate": 0.0002834058230646103,
915
+ "loss": 3.4657,
916
+ "step": 122
917
+ },
918
+ {
919
+ "epoch": 1.297872340425532,
920
+ "eval_loss": 4.1039276123046875,
921
+ "eval_runtime": 94.7255,
922
+ "eval_samples_per_second": 3.167,
923
+ "eval_steps_per_second": 0.053,
924
+ "step": 122
925
+ },
926
+ {
927
+ "epoch": 1.3191489361702127,
928
+ "grad_norm": 0.00040858419379219413,
929
+ "learning_rate": 0.00027986325027630265,
930
+ "loss": 3.4657,
931
+ "step": 124
932
+ },
933
+ {
934
+ "epoch": 1.3191489361702127,
935
+ "eval_loss": 4.103928089141846,
936
+ "eval_runtime": 123.8424,
937
+ "eval_samples_per_second": 2.422,
938
+ "eval_steps_per_second": 0.04,
939
+ "step": 124
940
+ },
941
+ {
942
+ "epoch": 1.3404255319148937,
943
+ "grad_norm": 0.00040357012767344713,
944
+ "learning_rate": 0.00027632067748799505,
945
+ "loss": 3.4657,
946
+ "step": 126
947
+ },
948
+ {
949
+ "epoch": 1.3404255319148937,
950
+ "eval_loss": 4.103928089141846,
951
+ "eval_runtime": 118.9281,
952
+ "eval_samples_per_second": 2.523,
953
+ "eval_steps_per_second": 0.042,
954
+ "step": 126
955
+ },
956
+ {
957
+ "epoch": 1.3617021276595744,
958
+ "grad_norm": 0.00039930595085024834,
959
+ "learning_rate": 0.0002727781046996874,
960
+ "loss": 3.4657,
961
+ "step": 128
962
+ },
963
+ {
964
+ "epoch": 1.3617021276595744,
965
+ "eval_loss": 4.1039276123046875,
966
+ "eval_runtime": 112.1697,
967
+ "eval_samples_per_second": 2.675,
968
+ "eval_steps_per_second": 0.045,
969
+ "step": 128
970
+ },
971
+ {
972
+ "epoch": 1.3829787234042552,
973
+ "grad_norm": 0.00037575900205411017,
974
+ "learning_rate": 0.0002692355319113798,
975
+ "loss": 3.4657,
976
+ "step": 130
977
+ },
978
+ {
979
+ "epoch": 1.3829787234042552,
980
+ "eval_loss": 4.1039276123046875,
981
+ "eval_runtime": 125.21,
982
+ "eval_samples_per_second": 2.396,
983
+ "eval_steps_per_second": 0.04,
984
+ "step": 130
985
+ },
986
+ {
987
+ "epoch": 1.4042553191489362,
988
+ "grad_norm": 0.00047126636491157115,
989
+ "learning_rate": 0.00026569295912307213,
990
+ "loss": 3.4657,
991
+ "step": 132
992
+ },
993
+ {
994
+ "epoch": 1.4042553191489362,
995
+ "eval_loss": 4.1039276123046875,
996
+ "eval_runtime": 135.9147,
997
+ "eval_samples_per_second": 2.207,
998
+ "eval_steps_per_second": 0.037,
999
+ "step": 132
1000
+ },
1001
+ {
1002
+ "epoch": 1.425531914893617,
1003
+ "grad_norm": 0.0005009864689782262,
1004
+ "learning_rate": 0.0002621503863347645,
1005
+ "loss": 3.4657,
1006
+ "step": 134
1007
+ },
1008
+ {
1009
+ "epoch": 1.425531914893617,
1010
+ "eval_loss": 4.1039276123046875,
1011
+ "eval_runtime": 107.4624,
1012
+ "eval_samples_per_second": 2.792,
1013
+ "eval_steps_per_second": 0.047,
1014
+ "step": 134
1015
+ },
1016
+ {
1017
+ "epoch": 1.4468085106382977,
1018
+ "grad_norm": 0.0004678282712120563,
1019
+ "learning_rate": 0.00025860781354645687,
1020
+ "loss": 3.4657,
1021
+ "step": 136
1022
+ },
1023
+ {
1024
+ "epoch": 1.4468085106382977,
1025
+ "eval_loss": 4.1039276123046875,
1026
+ "eval_runtime": 138.7278,
1027
+ "eval_samples_per_second": 2.163,
1028
+ "eval_steps_per_second": 0.036,
1029
+ "step": 136
1030
+ },
1031
+ {
1032
+ "epoch": 1.4680851063829787,
1033
+ "grad_norm": 0.0003415594110265374,
1034
+ "learning_rate": 0.0002550652407581492,
1035
+ "loss": 3.4657,
1036
+ "step": 138
1037
+ },
1038
+ {
1039
+ "epoch": 1.4680851063829787,
1040
+ "eval_loss": 4.1039276123046875,
1041
+ "eval_runtime": 130.873,
1042
+ "eval_samples_per_second": 2.292,
1043
+ "eval_steps_per_second": 0.038,
1044
+ "step": 138
1045
+ },
1046
+ {
1047
+ "epoch": 1.4893617021276595,
1048
+ "grad_norm": 0.00039781673694960773,
1049
+ "learning_rate": 0.0002515226679698416,
1050
+ "loss": 3.4657,
1051
+ "step": 140
1052
+ },
1053
+ {
1054
+ "epoch": 1.4893617021276595,
1055
+ "eval_loss": 4.1039276123046875,
1056
+ "eval_runtime": 127.3509,
1057
+ "eval_samples_per_second": 2.356,
1058
+ "eval_steps_per_second": 0.039,
1059
+ "step": 140
1060
+ },
1061
+ {
1062
+ "epoch": 1.5106382978723403,
1063
+ "grad_norm": 0.0007758048013783991,
1064
+ "learning_rate": 0.000247980095181534,
1065
+ "loss": 3.4657,
1066
+ "step": 142
1067
+ },
1068
+ {
1069
+ "epoch": 1.5106382978723403,
1070
+ "eval_loss": 4.103927135467529,
1071
+ "eval_runtime": 93.1203,
1072
+ "eval_samples_per_second": 3.222,
1073
+ "eval_steps_per_second": 0.054,
1074
+ "step": 142
1075
+ },
1076
+ {
1077
+ "epoch": 1.5319148936170213,
1078
+ "grad_norm": 0.0003962689661420882,
1079
+ "learning_rate": 0.00024443752239322635,
1080
+ "loss": 3.4657,
1081
+ "step": 144
1082
+ },
1083
+ {
1084
+ "epoch": 1.5319148936170213,
1085
+ "eval_loss": 4.103927135467529,
1086
+ "eval_runtime": 92.9702,
1087
+ "eval_samples_per_second": 3.227,
1088
+ "eval_steps_per_second": 0.054,
1089
+ "step": 144
1090
+ },
1091
+ {
1092
+ "epoch": 1.5531914893617023,
1093
+ "grad_norm": 0.00046265136916190386,
1094
+ "learning_rate": 0.00024089494960491872,
1095
+ "loss": 3.4657,
1096
+ "step": 146
1097
+ },
1098
+ {
1099
+ "epoch": 1.5531914893617023,
1100
+ "eval_loss": 4.103926658630371,
1101
+ "eval_runtime": 95.1623,
1102
+ "eval_samples_per_second": 3.153,
1103
+ "eval_steps_per_second": 0.053,
1104
+ "step": 146
1105
+ },
1106
+ {
1107
+ "epoch": 1.574468085106383,
1108
+ "grad_norm": 0.0003199471684638411,
1109
+ "learning_rate": 0.00023735237681661112,
1110
+ "loss": 3.4657,
1111
+ "step": 148
1112
+ },
1113
+ {
1114
+ "epoch": 1.574468085106383,
1115
+ "eval_loss": 4.103927135467529,
1116
+ "eval_runtime": 94.8304,
1117
+ "eval_samples_per_second": 3.164,
1118
+ "eval_steps_per_second": 0.053,
1119
+ "step": 148
1120
+ },
1121
+ {
1122
+ "epoch": 1.5957446808510638,
1123
+ "grad_norm": 0.0009507798822596669,
1124
+ "learning_rate": 0.0002338098040283035,
1125
+ "loss": 3.4657,
1126
+ "step": 150
1127
+ },
1128
+ {
1129
+ "epoch": 1.5957446808510638,
1130
+ "eval_loss": 4.103926658630371,
1131
+ "eval_runtime": 106.393,
1132
+ "eval_samples_per_second": 2.82,
1133
+ "eval_steps_per_second": 0.047,
1134
+ "step": 150
1135
+ },
1136
+ {
1137
+ "epoch": 1.6170212765957448,
1138
+ "grad_norm": 0.0003600665950216353,
1139
+ "learning_rate": 0.00023026723123999586,
1140
+ "loss": 3.4657,
1141
+ "step": 152
1142
+ },
1143
+ {
1144
+ "epoch": 1.6170212765957448,
1145
+ "eval_loss": 4.103927135467529,
1146
+ "eval_runtime": 93.6609,
1147
+ "eval_samples_per_second": 3.203,
1148
+ "eval_steps_per_second": 0.053,
1149
+ "step": 152
1150
+ },
1151
+ {
1152
+ "epoch": 1.6382978723404256,
1153
+ "grad_norm": 0.00046105613000690937,
1154
+ "learning_rate": 0.00022672465845168823,
1155
+ "loss": 3.4657,
1156
+ "step": 154
1157
+ },
1158
+ {
1159
+ "epoch": 1.6382978723404256,
1160
+ "eval_loss": 4.103926658630371,
1161
+ "eval_runtime": 93.7233,
1162
+ "eval_samples_per_second": 3.201,
1163
+ "eval_steps_per_second": 0.053,
1164
+ "step": 154
1165
+ },
1166
+ {
1167
+ "epoch": 1.6595744680851063,
1168
+ "grad_norm": 0.0004165441496297717,
1169
+ "learning_rate": 0.0002231820856633806,
1170
+ "loss": 3.4657,
1171
+ "step": 156
1172
+ },
1173
+ {
1174
+ "epoch": 1.6595744680851063,
1175
+ "eval_loss": 4.103926658630371,
1176
+ "eval_runtime": 93.1558,
1177
+ "eval_samples_per_second": 3.22,
1178
+ "eval_steps_per_second": 0.054,
1179
+ "step": 156
1180
+ },
1181
+ {
1182
+ "epoch": 1.6808510638297873,
1183
+ "grad_norm": 0.00046332846977747977,
1184
+ "learning_rate": 0.00021963951287507297,
1185
+ "loss": 3.4657,
1186
+ "step": 158
1187
+ },
1188
+ {
1189
+ "epoch": 1.6808510638297873,
1190
+ "eval_loss": 4.103925704956055,
1191
+ "eval_runtime": 92.8599,
1192
+ "eval_samples_per_second": 3.231,
1193
+ "eval_steps_per_second": 0.054,
1194
+ "step": 158
1195
+ },
1196
+ {
1197
+ "epoch": 1.702127659574468,
1198
+ "grad_norm": 0.00045260830665938556,
1199
+ "learning_rate": 0.00021609694008676534,
1200
+ "loss": 3.4657,
1201
+ "step": 160
1202
+ },
1203
+ {
1204
+ "epoch": 1.702127659574468,
1205
+ "eval_loss": 4.103925704956055,
1206
+ "eval_runtime": 92.4335,
1207
+ "eval_samples_per_second": 3.246,
1208
+ "eval_steps_per_second": 0.054,
1209
+ "step": 160
1210
+ },
1211
+ {
1212
+ "epoch": 1.7234042553191489,
1213
+ "grad_norm": 0.0006771318148821592,
1214
+ "learning_rate": 0.0002125543672984577,
1215
+ "loss": 3.4657,
1216
+ "step": 162
1217
+ },
1218
+ {
1219
+ "epoch": 1.7234042553191489,
1220
+ "eval_loss": 4.1039252281188965,
1221
+ "eval_runtime": 92.8141,
1222
+ "eval_samples_per_second": 3.232,
1223
+ "eval_steps_per_second": 0.054,
1224
+ "step": 162
1225
+ },
1226
+ {
1227
+ "epoch": 1.7446808510638299,
1228
+ "grad_norm": 0.0002604821929708123,
1229
+ "learning_rate": 0.00020901179451015008,
1230
+ "loss": 3.4657,
1231
+ "step": 164
1232
+ },
1233
+ {
1234
+ "epoch": 1.7446808510638299,
1235
+ "eval_loss": 4.1039252281188965,
1236
+ "eval_runtime": 93.7459,
1237
+ "eval_samples_per_second": 3.2,
1238
+ "eval_steps_per_second": 0.053,
1239
+ "step": 164
1240
+ },
1241
+ {
1242
+ "epoch": 1.7659574468085106,
1243
+ "grad_norm": 0.0006150390254333615,
1244
+ "learning_rate": 0.00020546922172184245,
1245
+ "loss": 3.4657,
1246
+ "step": 166
1247
+ },
1248
+ {
1249
+ "epoch": 1.7659574468085106,
1250
+ "eval_loss": 4.103924751281738,
1251
+ "eval_runtime": 94.135,
1252
+ "eval_samples_per_second": 3.187,
1253
+ "eval_steps_per_second": 0.053,
1254
+ "step": 166
1255
+ },
1256
+ {
1257
+ "epoch": 1.7872340425531914,
1258
+ "grad_norm": 0.0006115163560025394,
1259
+ "learning_rate": 0.00020192664893353482,
1260
+ "loss": 3.4657,
1261
+ "step": 168
1262
+ },
1263
+ {
1264
+ "epoch": 1.7872340425531914,
1265
+ "eval_loss": 4.10392427444458,
1266
+ "eval_runtime": 93.8446,
1267
+ "eval_samples_per_second": 3.197,
1268
+ "eval_steps_per_second": 0.053,
1269
+ "step": 168
1270
+ },
1271
+ {
1272
+ "epoch": 1.8085106382978724,
1273
+ "grad_norm": 0.000593107077293098,
1274
+ "learning_rate": 0.0001983840761452272,
1275
+ "loss": 3.4657,
1276
+ "step": 170
1277
+ },
1278
+ {
1279
+ "epoch": 1.8085106382978724,
1280
+ "eval_loss": 4.10392427444458,
1281
+ "eval_runtime": 92.9188,
1282
+ "eval_samples_per_second": 3.229,
1283
+ "eval_steps_per_second": 0.054,
1284
+ "step": 170
1285
+ },
1286
+ {
1287
+ "epoch": 1.8297872340425532,
1288
+ "grad_norm": 0.0008810166036710143,
1289
+ "learning_rate": 0.00019484150335691958,
1290
+ "loss": 3.4657,
1291
+ "step": 172
1292
+ },
1293
+ {
1294
+ "epoch": 1.8297872340425532,
1295
+ "eval_loss": 4.103923320770264,
1296
+ "eval_runtime": 93.7813,
1297
+ "eval_samples_per_second": 3.199,
1298
+ "eval_steps_per_second": 0.053,
1299
+ "step": 172
1300
+ },
1301
+ {
1302
+ "epoch": 1.851063829787234,
1303
+ "grad_norm": 0.0006031219381839037,
1304
+ "learning_rate": 0.00019129893056861195,
1305
+ "loss": 3.4657,
1306
+ "step": 174
1307
+ },
1308
+ {
1309
+ "epoch": 1.851063829787234,
1310
+ "eval_loss": 4.103923320770264,
1311
+ "eval_runtime": 94.6061,
1312
+ "eval_samples_per_second": 3.171,
1313
+ "eval_steps_per_second": 0.053,
1314
+ "step": 174
1315
+ },
1316
+ {
1317
+ "epoch": 1.872340425531915,
1318
+ "grad_norm": 0.000731271633412689,
1319
+ "learning_rate": 0.0001877563577803043,
1320
+ "loss": 3.4657,
1321
+ "step": 176
1322
+ },
1323
+ {
1324
+ "epoch": 1.872340425531915,
1325
+ "eval_loss": 4.103922367095947,
1326
+ "eval_runtime": 93.6123,
1327
+ "eval_samples_per_second": 3.205,
1328
+ "eval_steps_per_second": 0.053,
1329
+ "step": 176
1330
+ },
1331
+ {
1332
+ "epoch": 1.8936170212765957,
1333
+ "grad_norm": 0.0005839240038767457,
1334
+ "learning_rate": 0.00018421378499199666,
1335
+ "loss": 3.4657,
1336
+ "step": 178
1337
+ },
1338
+ {
1339
+ "epoch": 1.8936170212765957,
1340
+ "eval_loss": 4.103921890258789,
1341
+ "eval_runtime": 93.4167,
1342
+ "eval_samples_per_second": 3.211,
1343
+ "eval_steps_per_second": 0.054,
1344
+ "step": 178
1345
+ },
1346
+ {
1347
+ "epoch": 1.9148936170212765,
1348
+ "grad_norm": 0.0006142465863376856,
1349
+ "learning_rate": 0.00018067121220368906,
1350
+ "loss": 3.4657,
1351
+ "step": 180
1352
+ },
1353
+ {
1354
+ "epoch": 1.9148936170212765,
1355
+ "eval_loss": 4.103921890258789,
1356
+ "eval_runtime": 93.1077,
1357
+ "eval_samples_per_second": 3.222,
1358
+ "eval_steps_per_second": 0.054,
1359
+ "step": 180
1360
+ },
1361
+ {
1362
+ "epoch": 1.9361702127659575,
1363
+ "grad_norm": 0.0005157635896466672,
1364
+ "learning_rate": 0.00017712863941538143,
1365
+ "loss": 3.4657,
1366
+ "step": 182
1367
+ },
1368
+ {
1369
+ "epoch": 1.9361702127659575,
1370
+ "eval_loss": 4.103921890258789,
1371
+ "eval_runtime": 104.9418,
1372
+ "eval_samples_per_second": 2.859,
1373
+ "eval_steps_per_second": 0.048,
1374
+ "step": 182
1375
+ },
1376
+ {
1377
+ "epoch": 1.9574468085106385,
1378
+ "grad_norm": 0.00047715185792185366,
1379
+ "learning_rate": 0.0001735860666270738,
1380
+ "loss": 3.4657,
1381
+ "step": 184
1382
+ },
1383
+ {
1384
+ "epoch": 1.9574468085106385,
1385
+ "eval_loss": 4.103921890258789,
1386
+ "eval_runtime": 143.8492,
1387
+ "eval_samples_per_second": 2.086,
1388
+ "eval_steps_per_second": 0.035,
1389
+ "step": 184
1390
+ },
1391
+ {
1392
+ "epoch": 1.978723404255319,
1393
+ "grad_norm": 0.0006548584206029773,
1394
+ "learning_rate": 0.00017004349383876617,
1395
+ "loss": 3.4657,
1396
+ "step": 186
1397
+ },
1398
+ {
1399
+ "epoch": 1.978723404255319,
1400
+ "eval_loss": 4.103920936584473,
1401
+ "eval_runtime": 93.0081,
1402
+ "eval_samples_per_second": 3.226,
1403
+ "eval_steps_per_second": 0.054,
1404
+ "step": 186
1405
+ },
1406
+ {
1407
+ "epoch": 2.0,
1408
+ "grad_norm": 0.0007490716525353491,
1409
+ "learning_rate": 0.00016650092105045854,
1410
+ "loss": 3.3006,
1411
+ "step": 188
1412
+ },
1413
+ {
1414
+ "epoch": 2.0,
1415
+ "eval_loss": 4.103919982910156,
1416
+ "eval_runtime": 93.8488,
1417
+ "eval_samples_per_second": 3.197,
1418
+ "eval_steps_per_second": 0.053,
1419
+ "step": 188
1420
+ },
1421
+ {
1422
+ "epoch": 2.021276595744681,
1423
+ "grad_norm": 0.001015963382087648,
1424
+ "learning_rate": 0.00016295834826215094,
1425
+ "loss": 3.4657,
1426
+ "step": 190
1427
+ },
1428
+ {
1429
+ "epoch": 2.021276595744681,
1430
+ "eval_loss": 4.103919506072998,
1431
+ "eval_runtime": 93.9004,
1432
+ "eval_samples_per_second": 3.195,
1433
+ "eval_steps_per_second": 0.053,
1434
+ "step": 190
1435
+ },
1436
+ {
1437
+ "epoch": 2.0425531914893615,
1438
+ "grad_norm": 0.004860539920628071,
1439
+ "learning_rate": 0.00015941577547384328,
1440
+ "loss": 3.4657,
1441
+ "step": 192
1442
+ },
1443
+ {
1444
+ "epoch": 2.0425531914893615,
1445
+ "eval_loss": 4.10391902923584,
1446
+ "eval_runtime": 105.0138,
1447
+ "eval_samples_per_second": 2.857,
1448
+ "eval_steps_per_second": 0.048,
1449
+ "step": 192
1450
+ },
1451
+ {
1452
+ "epoch": 2.0638297872340425,
1453
+ "grad_norm": 0.0015178662724792957,
1454
+ "learning_rate": 0.00015587320268553565,
1455
+ "loss": 3.4657,
1456
+ "step": 194
1457
+ },
1458
+ {
1459
+ "epoch": 2.0638297872340425,
1460
+ "eval_loss": 4.103918552398682,
1461
+ "eval_runtime": 94.2786,
1462
+ "eval_samples_per_second": 3.182,
1463
+ "eval_steps_per_second": 0.053,
1464
+ "step": 194
1465
+ },
1466
+ {
1467
+ "epoch": 2.0851063829787235,
1468
+ "grad_norm": 0.0009718618239276111,
1469
+ "learning_rate": 0.00015233062989722802,
1470
+ "loss": 3.4657,
1471
+ "step": 196
1472
+ },
1473
+ {
1474
+ "epoch": 2.0851063829787235,
1475
+ "eval_loss": 4.103917598724365,
1476
+ "eval_runtime": 93.9852,
1477
+ "eval_samples_per_second": 3.192,
1478
+ "eval_steps_per_second": 0.053,
1479
+ "step": 196
1480
+ },
1481
+ {
1482
+ "epoch": 2.106382978723404,
1483
+ "grad_norm": 0.0011399408103898168,
1484
+ "learning_rate": 0.0001487880571089204,
1485
+ "loss": 3.4657,
1486
+ "step": 198
1487
+ },
1488
+ {
1489
+ "epoch": 2.106382978723404,
1490
+ "eval_loss": 4.103916645050049,
1491
+ "eval_runtime": 151.4173,
1492
+ "eval_samples_per_second": 1.981,
1493
+ "eval_steps_per_second": 0.033,
1494
+ "step": 198
1495
+ },
1496
+ {
1497
+ "epoch": 2.127659574468085,
1498
+ "grad_norm": 0.0005256883450783789,
1499
+ "learning_rate": 0.00014524548432061276,
1500
+ "loss": 3.4657,
1501
+ "step": 200
1502
+ },
1503
+ {
1504
+ "epoch": 2.127659574468085,
1505
+ "eval_loss": 4.103916168212891,
1506
+ "eval_runtime": 102.8677,
1507
+ "eval_samples_per_second": 2.916,
1508
+ "eval_steps_per_second": 0.049,
1509
+ "step": 200
1510
+ },
1511
+ {
1512
+ "epoch": 2.148936170212766,
1513
+ "grad_norm": 0.0009113452979363501,
1514
+ "learning_rate": 0.00014170291153230516,
1515
+ "loss": 3.4657,
1516
+ "step": 202
1517
+ },
1518
+ {
1519
+ "epoch": 2.148936170212766,
1520
+ "eval_loss": 4.103915214538574,
1521
+ "eval_runtime": 111.9189,
1522
+ "eval_samples_per_second": 2.681,
1523
+ "eval_steps_per_second": 0.045,
1524
+ "step": 202
1525
+ },
1526
+ {
1527
+ "epoch": 2.1702127659574466,
1528
+ "grad_norm": 0.0010462055215612054,
1529
+ "learning_rate": 0.00013816033874399752,
1530
+ "loss": 3.4657,
1531
+ "step": 204
1532
+ },
1533
+ {
1534
+ "epoch": 2.1702127659574466,
1535
+ "eval_loss": 4.103914260864258,
1536
+ "eval_runtime": 126.7351,
1537
+ "eval_samples_per_second": 2.367,
1538
+ "eval_steps_per_second": 0.039,
1539
+ "step": 204
1540
+ },
1541
+ {
1542
+ "epoch": 2.1914893617021276,
1543
+ "grad_norm": 0.0008617418352514505,
1544
+ "learning_rate": 0.0001346177659556899,
1545
+ "loss": 3.4657,
1546
+ "step": 206
1547
+ },
1548
+ {
1549
+ "epoch": 2.1914893617021276,
1550
+ "eval_loss": 4.1039137840271,
1551
+ "eval_runtime": 124.4636,
1552
+ "eval_samples_per_second": 2.41,
1553
+ "eval_steps_per_second": 0.04,
1554
+ "step": 206
1555
+ },
1556
+ {
1557
+ "epoch": 2.2127659574468086,
1558
+ "grad_norm": 0.0011684081982821226,
1559
+ "learning_rate": 0.00013107519316738224,
1560
+ "loss": 3.4657,
1561
+ "step": 208
1562
+ },
1563
+ {
1564
+ "epoch": 2.2127659574468086,
1565
+ "eval_loss": 4.103911876678467,
1566
+ "eval_runtime": 94.3018,
1567
+ "eval_samples_per_second": 3.181,
1568
+ "eval_steps_per_second": 0.053,
1569
+ "step": 208
1570
+ },
1571
+ {
1572
+ "epoch": 2.2340425531914896,
1573
+ "grad_norm": 0.0008167130872607231,
1574
+ "learning_rate": 0.0001275326203790746,
1575
+ "loss": 3.4657,
1576
+ "step": 210
1577
+ },
1578
+ {
1579
+ "epoch": 2.2340425531914896,
1580
+ "eval_loss": 4.10391092300415,
1581
+ "eval_runtime": 94.2002,
1582
+ "eval_samples_per_second": 3.185,
1583
+ "eval_steps_per_second": 0.053,
1584
+ "step": 210
1585
+ },
1586
+ {
1587
+ "epoch": 2.25531914893617,
1588
+ "grad_norm": 0.0542316697537899,
1589
+ "learning_rate": 0.000123990047590767,
1590
+ "loss": 3.4657,
1591
+ "step": 212
1592
+ },
1593
+ {
1594
+ "epoch": 2.25531914893617,
1595
+ "eval_loss": 4.10391092300415,
1596
+ "eval_runtime": 93.5571,
1597
+ "eval_samples_per_second": 3.207,
1598
+ "eval_steps_per_second": 0.053,
1599
+ "step": 212
1600
+ },
1601
+ {
1602
+ "epoch": 2.276595744680851,
1603
+ "grad_norm": 0.002146972343325615,
1604
+ "learning_rate": 0.00012044747480245936,
1605
+ "loss": 3.4656,
1606
+ "step": 214
1607
+ },
1608
+ {
1609
+ "epoch": 2.276595744680851,
1610
+ "eval_loss": 4.10391092300415,
1611
+ "eval_runtime": 93.6853,
1612
+ "eval_samples_per_second": 3.202,
1613
+ "eval_steps_per_second": 0.053,
1614
+ "step": 214
1615
+ },
1616
+ {
1617
+ "epoch": 2.297872340425532,
1618
+ "grad_norm": 0.0018499704310670495,
1619
+ "learning_rate": 0.00011690490201415174,
1620
+ "loss": 3.4656,
1621
+ "step": 216
1622
+ },
1623
+ {
1624
+ "epoch": 2.297872340425532,
1625
+ "eval_loss": 4.103910446166992,
1626
+ "eval_runtime": 93.9528,
1627
+ "eval_samples_per_second": 3.193,
1628
+ "eval_steps_per_second": 0.053,
1629
+ "step": 216
1630
+ },
1631
+ {
1632
+ "epoch": 2.3191489361702127,
1633
+ "grad_norm": 0.0013631999026983976,
1634
+ "learning_rate": 0.00011336232922584411,
1635
+ "loss": 3.4657,
1636
+ "step": 218
1637
+ },
1638
+ {
1639
+ "epoch": 2.3191489361702127,
1640
+ "eval_loss": 4.103910446166992,
1641
+ "eval_runtime": 93.3718,
1642
+ "eval_samples_per_second": 3.213,
1643
+ "eval_steps_per_second": 0.054,
1644
+ "step": 218
1645
+ },
1646
+ {
1647
+ "epoch": 2.3404255319148937,
1648
+ "grad_norm": 0.0021245332900434732,
1649
+ "learning_rate": 0.00010981975643753648,
1650
+ "loss": 3.4656,
1651
+ "step": 220
1652
+ },
1653
+ {
1654
+ "epoch": 2.3404255319148937,
1655
+ "eval_loss": 4.103908538818359,
1656
+ "eval_runtime": 116.0788,
1657
+ "eval_samples_per_second": 2.584,
1658
+ "eval_steps_per_second": 0.043,
1659
+ "step": 220
1660
+ },
1661
+ {
1662
+ "epoch": 2.3617021276595747,
1663
+ "grad_norm": 0.0015864246524870396,
1664
+ "learning_rate": 0.00010627718364922885,
1665
+ "loss": 3.4657,
1666
+ "step": 222
1667
+ },
1668
+ {
1669
+ "epoch": 2.3617021276595747,
1670
+ "eval_loss": 4.103908061981201,
1671
+ "eval_runtime": 93.8014,
1672
+ "eval_samples_per_second": 3.198,
1673
+ "eval_steps_per_second": 0.053,
1674
+ "step": 222
1675
+ },
1676
+ {
1677
+ "epoch": 2.382978723404255,
1678
+ "grad_norm": 0.0017925110878422856,
1679
+ "learning_rate": 0.00010273461086092122,
1680
+ "loss": 3.4657,
1681
+ "step": 224
1682
+ },
1683
+ {
1684
+ "epoch": 2.382978723404255,
1685
+ "eval_loss": 4.103907108306885,
1686
+ "eval_runtime": 94.17,
1687
+ "eval_samples_per_second": 3.186,
1688
+ "eval_steps_per_second": 0.053,
1689
+ "step": 224
1690
+ },
1691
+ {
1692
+ "epoch": 2.404255319148936,
1693
+ "grad_norm": 0.0022581228986382484,
1694
+ "learning_rate": 9.91920380726136e-05,
1695
+ "loss": 3.4656,
1696
+ "step": 226
1697
+ },
1698
+ {
1699
+ "epoch": 2.404255319148936,
1700
+ "eval_loss": 4.10390567779541,
1701
+ "eval_runtime": 117.7055,
1702
+ "eval_samples_per_second": 2.549,
1703
+ "eval_steps_per_second": 0.042,
1704
+ "step": 226
1705
+ },
1706
+ {
1707
+ "epoch": 2.425531914893617,
1708
+ "grad_norm": 0.0026256099808961153,
1709
+ "learning_rate": 9.564946528430598e-05,
1710
+ "loss": 3.4656,
1711
+ "step": 228
1712
+ },
1713
+ {
1714
+ "epoch": 2.425531914893617,
1715
+ "eval_loss": 4.1039042472839355,
1716
+ "eval_runtime": 128.097,
1717
+ "eval_samples_per_second": 2.342,
1718
+ "eval_steps_per_second": 0.039,
1719
+ "step": 228
1720
+ },
1721
+ {
1722
+ "epoch": 2.4468085106382977,
1723
+ "grad_norm": 0.0022717451211065054,
1724
+ "learning_rate": 9.210689249599833e-05,
1725
+ "loss": 3.4656,
1726
+ "step": 230
1727
+ },
1728
+ {
1729
+ "epoch": 2.4468085106382977,
1730
+ "eval_loss": 4.103902339935303,
1731
+ "eval_runtime": 113.9038,
1732
+ "eval_samples_per_second": 2.634,
1733
+ "eval_steps_per_second": 0.044,
1734
+ "step": 230
1735
+ },
1736
+ {
1737
+ "epoch": 2.4680851063829787,
1738
+ "grad_norm": 0.003249780274927616,
1739
+ "learning_rate": 8.856431970769072e-05,
1740
+ "loss": 3.4656,
1741
+ "step": 232
1742
+ },
1743
+ {
1744
+ "epoch": 2.4680851063829787,
1745
+ "eval_loss": 4.103899955749512,
1746
+ "eval_runtime": 92.3315,
1747
+ "eval_samples_per_second": 3.249,
1748
+ "eval_steps_per_second": 0.054,
1749
+ "step": 232
1750
+ },
1751
+ {
1752
+ "epoch": 2.4893617021276597,
1753
+ "grad_norm": 0.0025626318529248238,
1754
+ "learning_rate": 8.502174691938309e-05,
1755
+ "loss": 3.4656,
1756
+ "step": 234
1757
+ },
1758
+ {
1759
+ "epoch": 2.4893617021276597,
1760
+ "eval_loss": 4.103896617889404,
1761
+ "eval_runtime": 93.3268,
1762
+ "eval_samples_per_second": 3.215,
1763
+ "eval_steps_per_second": 0.054,
1764
+ "step": 234
1765
+ },
1766
+ {
1767
+ "epoch": 2.5106382978723403,
1768
+ "grad_norm": 0.004104019142687321,
1769
+ "learning_rate": 8.147917413107547e-05,
1770
+ "loss": 3.4655,
1771
+ "step": 236
1772
+ },
1773
+ {
1774
+ "epoch": 2.5106382978723403,
1775
+ "eval_loss": 4.103891849517822,
1776
+ "eval_runtime": 93.0172,
1777
+ "eval_samples_per_second": 3.225,
1778
+ "eval_steps_per_second": 0.054,
1779
+ "step": 236
1780
+ },
1781
+ {
1782
+ "epoch": 2.5319148936170213,
1783
+ "grad_norm": 0.005048415157943964,
1784
+ "learning_rate": 7.793660134276782e-05,
1785
+ "loss": 3.4656,
1786
+ "step": 238
1787
+ },
1788
+ {
1789
+ "epoch": 2.5319148936170213,
1790
+ "eval_loss": 4.103887557983398,
1791
+ "eval_runtime": 103.5236,
1792
+ "eval_samples_per_second": 2.898,
1793
+ "eval_steps_per_second": 0.048,
1794
+ "step": 238
1795
+ },
1796
+ {
1797
+ "epoch": 2.5531914893617023,
1798
+ "grad_norm": 0.004755980335175991,
1799
+ "learning_rate": 7.43940285544602e-05,
1800
+ "loss": 3.4656,
1801
+ "step": 240
1802
+ },
1803
+ {
1804
+ "epoch": 2.5531914893617023,
1805
+ "eval_loss": 4.103884220123291,
1806
+ "eval_runtime": 92.2557,
1807
+ "eval_samples_per_second": 3.252,
1808
+ "eval_steps_per_second": 0.054,
1809
+ "step": 240
1810
+ },
1811
+ {
1812
+ "epoch": 2.574468085106383,
1813
+ "grad_norm": 0.00427659647539258,
1814
+ "learning_rate": 7.085145576615258e-05,
1815
+ "loss": 3.4654,
1816
+ "step": 242
1817
+ },
1818
+ {
1819
+ "epoch": 2.574468085106383,
1820
+ "eval_loss": 4.103878498077393,
1821
+ "eval_runtime": 94.0279,
1822
+ "eval_samples_per_second": 3.191,
1823
+ "eval_steps_per_second": 0.053,
1824
+ "step": 242
1825
+ },
1826
+ {
1827
+ "epoch": 2.595744680851064,
1828
+ "grad_norm": 0.0044647688046097755,
1829
+ "learning_rate": 6.730888297784495e-05,
1830
+ "loss": 3.4655,
1831
+ "step": 244
1832
+ },
1833
+ {
1834
+ "epoch": 2.595744680851064,
1835
+ "eval_loss": 4.103872776031494,
1836
+ "eval_runtime": 98.9692,
1837
+ "eval_samples_per_second": 3.031,
1838
+ "eval_steps_per_second": 0.051,
1839
+ "step": 244
1840
+ },
1841
+ {
1842
+ "epoch": 2.617021276595745,
1843
+ "grad_norm": 0.00606452114880085,
1844
+ "learning_rate": 6.37663101895373e-05,
1845
+ "loss": 3.4654,
1846
+ "step": 246
1847
+ },
1848
+ {
1849
+ "epoch": 2.617021276595745,
1850
+ "eval_loss": 4.1038665771484375,
1851
+ "eval_runtime": 130.5821,
1852
+ "eval_samples_per_second": 2.297,
1853
+ "eval_steps_per_second": 0.038,
1854
+ "step": 246
1855
+ },
1856
+ {
1857
+ "epoch": 2.6382978723404253,
1858
+ "grad_norm": 0.0025343315210193396,
1859
+ "learning_rate": 6.022373740122968e-05,
1860
+ "loss": 3.4655,
1861
+ "step": 248
1862
+ },
1863
+ {
1864
+ "epoch": 2.6382978723404253,
1865
+ "eval_loss": 4.103858947753906,
1866
+ "eval_runtime": 110.6408,
1867
+ "eval_samples_per_second": 2.711,
1868
+ "eval_steps_per_second": 0.045,
1869
+ "step": 248
1870
+ },
1871
+ {
1872
+ "epoch": 2.6595744680851063,
1873
+ "grad_norm": 0.004990957211703062,
1874
+ "learning_rate": 5.668116461292206e-05,
1875
+ "loss": 3.4654,
1876
+ "step": 250
1877
+ },
1878
+ {
1879
+ "epoch": 2.6595744680851063,
1880
+ "eval_loss": 4.103850841522217,
1881
+ "eval_runtime": 110.9115,
1882
+ "eval_samples_per_second": 2.705,
1883
+ "eval_steps_per_second": 0.045,
1884
+ "step": 250
1885
+ },
1886
+ {
1887
+ "epoch": 2.6808510638297873,
1888
+ "grad_norm": 0.003452206961810589,
1889
+ "learning_rate": 5.3138591824614426e-05,
1890
+ "loss": 3.4655,
1891
+ "step": 252
1892
+ },
1893
+ {
1894
+ "epoch": 2.6808510638297873,
1895
+ "eval_loss": 4.103842735290527,
1896
+ "eval_runtime": 116.4041,
1897
+ "eval_samples_per_second": 2.577,
1898
+ "eval_steps_per_second": 0.043,
1899
+ "step": 252
1900
+ },
1901
+ {
1902
+ "epoch": 2.702127659574468,
1903
+ "grad_norm": 0.01098131388425827,
1904
+ "learning_rate": 4.95960190363068e-05,
1905
+ "loss": 3.4655,
1906
+ "step": 254
1907
+ },
1908
+ {
1909
+ "epoch": 2.702127659574468,
1910
+ "eval_loss": 4.1038360595703125,
1911
+ "eval_runtime": 120.3016,
1912
+ "eval_samples_per_second": 2.494,
1913
+ "eval_steps_per_second": 0.042,
1914
+ "step": 254
1915
+ },
1916
+ {
1917
+ "epoch": 2.723404255319149,
1918
+ "grad_norm": 0.009479314088821411,
1919
+ "learning_rate": 4.6053446247999166e-05,
1920
+ "loss": 3.4652,
1921
+ "step": 256
1922
+ },
1923
+ {
1924
+ "epoch": 2.723404255319149,
1925
+ "eval_loss": 4.1038289070129395,
1926
+ "eval_runtime": 102.7773,
1927
+ "eval_samples_per_second": 2.919,
1928
+ "eval_steps_per_second": 0.049,
1929
+ "step": 256
1930
+ },
1931
+ {
1932
+ "epoch": 2.74468085106383,
1933
+ "grad_norm": 0.018755914643406868,
1934
+ "learning_rate": 4.251087345969154e-05,
1935
+ "loss": 3.4646,
1936
+ "step": 258
1937
+ },
1938
+ {
1939
+ "epoch": 2.74468085106383,
1940
+ "eval_loss": 4.103816032409668,
1941
+ "eval_runtime": 155.1062,
1942
+ "eval_samples_per_second": 1.934,
1943
+ "eval_steps_per_second": 0.032,
1944
+ "step": 258
1945
+ },
1946
+ {
1947
+ "epoch": 2.7659574468085104,
1948
+ "grad_norm": 0.00831508357077837,
1949
+ "learning_rate": 3.896830067138391e-05,
1950
+ "loss": 3.4654,
1951
+ "step": 260
1952
+ },
1953
+ {
1954
+ "epoch": 2.7659574468085104,
1955
+ "eval_loss": 4.10380220413208,
1956
+ "eval_runtime": 104.9599,
1957
+ "eval_samples_per_second": 2.858,
1958
+ "eval_steps_per_second": 0.048,
1959
+ "step": 260
1960
+ },
1961
+ {
1962
+ "epoch": 2.7872340425531914,
1963
+ "grad_norm": 0.00918419286608696,
1964
+ "learning_rate": 3.542572788307629e-05,
1965
+ "loss": 3.4654,
1966
+ "step": 262
1967
+ },
1968
+ {
1969
+ "epoch": 2.7872340425531914,
1970
+ "eval_loss": 4.103787422180176,
1971
+ "eval_runtime": 121.0806,
1972
+ "eval_samples_per_second": 2.478,
1973
+ "eval_steps_per_second": 0.041,
1974
+ "step": 262
1975
+ },
1976
+ {
1977
+ "epoch": 2.8085106382978724,
1978
+ "grad_norm": 0.010791248641908169,
1979
+ "learning_rate": 3.188315509476865e-05,
1980
+ "loss": 3.4653,
1981
+ "step": 264
1982
+ },
1983
+ {
1984
+ "epoch": 2.8085106382978724,
1985
+ "eval_loss": 4.103773593902588,
1986
+ "eval_runtime": 117.3875,
1987
+ "eval_samples_per_second": 2.556,
1988
+ "eval_steps_per_second": 0.043,
1989
+ "step": 264
1990
+ },
1991
+ {
1992
+ "epoch": 2.829787234042553,
1993
+ "grad_norm": 0.018539341166615486,
1994
+ "learning_rate": 2.834058230646103e-05,
1995
+ "loss": 3.4651,
1996
+ "step": 266
1997
+ },
1998
+ {
1999
+ "epoch": 2.829787234042553,
2000
+ "eval_loss": 4.103758811950684,
2001
+ "eval_runtime": 134.9208,
2002
+ "eval_samples_per_second": 2.224,
2003
+ "eval_steps_per_second": 0.037,
2004
+ "step": 266
2005
+ },
2006
+ {
2007
+ "epoch": 2.851063829787234,
2008
+ "grad_norm": 0.013375967741012573,
2009
+ "learning_rate": 2.47980095181534e-05,
2010
+ "loss": 3.4649,
2011
+ "step": 268
2012
+ },
2013
+ {
2014
+ "epoch": 2.851063829787234,
2015
+ "eval_loss": 4.103740692138672,
2016
+ "eval_runtime": 112.6527,
2017
+ "eval_samples_per_second": 2.663,
2018
+ "eval_steps_per_second": 0.044,
2019
+ "step": 268
2020
+ },
2021
+ {
2022
+ "epoch": 2.872340425531915,
2023
+ "grad_norm": 0.01334660779684782,
2024
+ "learning_rate": 2.125543672984577e-05,
2025
+ "loss": 3.465,
2026
+ "step": 270
2027
+ },
2028
+ {
2029
+ "epoch": 2.872340425531915,
2030
+ "eval_loss": 4.103722095489502,
2031
+ "eval_runtime": 94.5738,
2032
+ "eval_samples_per_second": 3.172,
2033
+ "eval_steps_per_second": 0.053,
2034
+ "step": 270
2035
+ },
2036
+ {
2037
+ "epoch": 2.8936170212765955,
2038
+ "grad_norm": 0.017678698524832726,
2039
+ "learning_rate": 1.7712863941538144e-05,
2040
+ "loss": 3.4647,
2041
+ "step": 272
2042
+ },
2043
+ {
2044
+ "epoch": 2.8936170212765955,
2045
+ "eval_loss": 4.103702068328857,
2046
+ "eval_runtime": 108.8046,
2047
+ "eval_samples_per_second": 2.757,
2048
+ "eval_steps_per_second": 0.046,
2049
+ "step": 272
2050
+ },
2051
+ {
2052
+ "epoch": 2.9148936170212765,
2053
+ "grad_norm": 0.023421209305524826,
2054
+ "learning_rate": 1.4170291153230514e-05,
2055
+ "loss": 3.4642,
2056
+ "step": 274
2057
+ },
2058
+ {
2059
+ "epoch": 2.9148936170212765,
2060
+ "eval_loss": 4.103678226470947,
2061
+ "eval_runtime": 95.2978,
2062
+ "eval_samples_per_second": 3.148,
2063
+ "eval_steps_per_second": 0.052,
2064
+ "step": 274
2065
+ },
2066
+ {
2067
+ "epoch": 2.9361702127659575,
2068
+ "grad_norm": 0.02967756614089012,
2069
+ "learning_rate": 1.0627718364922886e-05,
2070
+ "loss": 3.4649,
2071
+ "step": 276
2072
+ },
2073
+ {
2074
+ "epoch": 2.9361702127659575,
2075
+ "eval_loss": 4.103660583496094,
2076
+ "eval_runtime": 164.156,
2077
+ "eval_samples_per_second": 1.828,
2078
+ "eval_steps_per_second": 0.03,
2079
+ "step": 276
2080
+ },
2081
+ {
2082
+ "epoch": 2.9574468085106385,
2083
+ "grad_norm": 0.04368141293525696,
2084
+ "learning_rate": 7.085145576615257e-06,
2085
+ "loss": 3.4632,
2086
+ "step": 278
2087
+ },
2088
+ {
2089
+ "epoch": 2.9574468085106385,
2090
+ "eval_loss": 4.103642463684082,
2091
+ "eval_runtime": 135.1093,
2092
+ "eval_samples_per_second": 2.22,
2093
+ "eval_steps_per_second": 0.037,
2094
+ "step": 278
2095
+ },
2096
+ {
2097
+ "epoch": 2.978723404255319,
2098
+ "grad_norm": 0.024568898603320122,
2099
+ "learning_rate": 3.5425727883076285e-06,
2100
+ "loss": 3.4644,
2101
+ "step": 280
2102
+ },
2103
+ {
2104
+ "epoch": 2.978723404255319,
2105
+ "eval_loss": 4.103628158569336,
2106
+ "eval_runtime": 111.5348,
2107
+ "eval_samples_per_second": 2.69,
2108
+ "eval_steps_per_second": 0.045,
2109
+ "step": 280
2110
+ },
2111
+ {
2112
+ "epoch": 3.0,
2113
+ "grad_norm": 0.04271033778786659,
2114
+ "learning_rate": 0.0,
2115
+ "loss": 3.2985,
2116
+ "step": 282
2117
+ },
2118
+ {
2119
+ "epoch": 3.0,
2120
+ "eval_loss": 4.103620529174805,
2121
+ "eval_runtime": 122.9717,
2122
+ "eval_samples_per_second": 2.44,
2123
+ "eval_steps_per_second": 0.041,
2124
+ "step": 282
2125
+ }
2126
+ ],
2127
+ "logging_steps": 2,
2128
+ "max_steps": 282,
2129
+ "num_input_tokens_seen": 0,
2130
+ "num_train_epochs": 3,
2131
+ "save_steps": 500,
2132
+ "stateful_callbacks": {
2133
+ "TrainerControl": {
2134
+ "args": {
2135
+ "should_epoch_stop": false,
2136
+ "should_evaluate": false,
2137
+ "should_log": false,
2138
+ "should_save": true,
2139
+ "should_training_stop": true
2140
+ },
2141
+ "attributes": {}
2142
+ }
2143
+ },
2144
+ "total_flos": 523328480700102.0,
2145
+ "train_batch_size": 32,
2146
+ "trial_name": null,
2147
+ "trial_params": {
2148
+ "_wandb": {},
2149
+ "assignments": {},
2150
+ "decay": 0.1,
2151
+ "learning_rate": 0.0004995027631513756,
2152
+ "metric": "eval/loss",
2153
+ "per_device_train_batch_size": 32
2154
+ }
2155
+ }
run-bq1dw9ny/checkpoint-282/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d179cbce729053172c2f3e0999512a7fc639544280b821f811c1a6e684dc6c4c
3
+ size 5112
run-yha1u6a1/checkpoint-2/config.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "openai/clip-vit-base-patch32",
3
+ "architectures": [
4
+ "CLIPModel"
5
+ ],
6
+ "initializer_factor": 1.0,
7
+ "logit_scale_init_value": 2.6592,
8
+ "model_type": "clip",
9
+ "projection_dim": 512,
10
+ "text_config": {
11
+ "bos_token_id": 0,
12
+ "dropout": 0.0,
13
+ "eos_token_id": 2,
14
+ "model_type": "clip_text_model"
15
+ },
16
+ "torch_dtype": "float32",
17
+ "transformers_version": "4.42.0.dev0",
18
+ "vision_config": {
19
+ "dropout": 0.0,
20
+ "model_type": "clip_vision_model"
21
+ }
22
+ }
run-yha1u6a1/checkpoint-2/model.safetensors ADDED
File without changes
runs/May23_07-40-12_1eee021f3931/events.out.tfevents.1716472131.1eee021f3931.3634.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7cec984482bd10e65452569acf6b5ea17ac49f20deea438f67de852733e632f0
3
+ size 4184
runs/May26_11-15-36_7c156d14e175/events.out.tfevents.1716742187.7c156d14e175.3825.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:980663982b77393f5ea35bef2b677dfa5b4b68c9b2f48e5a8094545a183efcb1
3
+ size 4184
runs/May27_07-52-26_f4e2ba664464/events.out.tfevents.1716817225.f4e2ba664464.2272.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:91c41b84c166468debb499a314c4686845b6a479a683dc107086b103f9882783
3
+ size 4184
runs/May29_14-16-19_00a42a5d637b/events.out.tfevents.1716992396.00a42a5d637b.22663.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5bc8ad1f4e57d3a25f471d1aaf6c186d01f953992d26a1e3b661d042a1a94c65
3
+ size 4184
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:10e3da5a2c2894297cb56e6aa655f2850fc6a872b1aff8030670c382688d25ff
3
  size 5112
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d179cbce729053172c2f3e0999512a7fc639544280b821f811c1a6e684dc6c4c
3
  size 5112