akahana committed on
Commit
ac34bcd
1 Parent(s): 749423f

tebak-gambar-mobilevit

Browse files
README.md ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: other
3
+ base_model: apple/mobilevit-x-small
4
+ tags:
5
+ - generated_from_trainer
6
+ metrics:
7
+ - accuracy
8
+ model-index:
9
+ - name: tebak-gambar-mobilevit
10
+ results: []
11
+ ---
12
+
13
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
14
+ should probably proofread and complete it, then remove this comment. -->
15
+
16
+ # tebak-gambar-mobilevit
17
+
18
+ This model is a fine-tuned version of [apple/mobilevit-x-small](https://huggingface.co/apple/mobilevit-x-small) on an unknown dataset.
19
+ It achieves the following results on the evaluation set:
20
+ - Loss: 1.0799
21
+ - Accuracy: 0.7289
22
+
23
+ ## Model description
24
+
25
+ More information needed
26
+
27
+ ## Intended uses & limitations
28
+
29
+ More information needed
30
+
31
+ ## Training and evaluation data
32
+
33
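+ More information needed. The dataset is not named in this card; the accompanying config.json defines 345 class labels (ids 0–344) and expects single-channel 28×28 input images.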
34
+
35
+ ## Training procedure
36
+
37
+ ### Training hyperparameters
38
+
39
+ The following hyperparameters were used during training:
40
+ - learning_rate: 0.0008
41
+ - train_batch_size: 256
42
+ - eval_batch_size: 256
43
+ - seed: 42
44
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
45
+ - lr_scheduler_type: linear
46
+ - num_epochs: 3
47
+ - mixed_precision_training: Native AMP
48
+
49
+ ### Training results
50
+
51
+ | Training Loss | Epoch | Step | Validation Loss | Accuracy |
52
+ |:-------------:|:------:|:-----:|:---------------:|:--------:|
53
+ | 1.5084 | 0.2844 | 5000 | 1.4700 | 0.6364 |
54
+ | 1.3684 | 0.5689 | 10000 | 1.3353 | 0.6674 |
55
+ | 1.3568 | 0.8533 | 15000 | 1.2764 | 0.6804 |
56
+ | 1.2260 | 1.1377 | 20000 | 1.2323 | 0.6924 |
57
+ | 1.2125 | 1.4222 | 25000 | 1.1850 | 0.7031 |
58
+ | 1.1912 | 1.7066 | 30000 | 1.1567 | 0.7092 |
59
+ | 1.1902 | 1.9910 | 35000 | 1.1297 | 0.7165 |
60
+ | 1.1310 | 2.2754 | 40000 | 1.1106 | 0.7213 |
61
+ | 1.1240 | 2.5599 | 45000 | 1.0916 | 0.7258 |
62
+ | 1.1245 | 2.8443 | 50000 | 1.0782 | 0.7300 |
63
+
64
+
65
+ ### Framework versions
66
+
67
+ - Transformers 4.42.4
68
+ - Pytorch 2.3.1+cu121
69
+ - Datasets 2.20.0
70
+ - Tokenizers 0.19.1
all_results.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 3.0,
3
+ "eval_accuracy": 0.728948,
4
+ "eval_loss": 1.0799410343170166,
5
+ "eval_runtime": 79.2396,
6
+ "eval_samples_per_second": 3154.988,
7
+ "eval_steps_per_second": 12.33,
8
+ "total_flos": 1.3116020904e+17,
9
+ "train_loss": 1.269093582550002,
10
+ "train_runtime": 7252.1306,
11
+ "train_samples_per_second": 1861.522,
12
+ "train_steps_per_second": 7.272
13
+ }
config.json ADDED
@@ -0,0 +1,741 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "apple/mobilevit-x-small",
3
+ "architectures": [
4
+ "MobileViTForImageClassification"
5
+ ],
6
+ "aspp_dropout_prob": 0.1,
7
+ "aspp_out_channels": 256,
8
+ "atrous_rates": [
9
+ 6,
10
+ 12,
11
+ 18
12
+ ],
13
+ "attention_probs_dropout_prob": 0.0,
14
+ "classifier_dropout_prob": 0.1,
15
+ "conv_kernel_size": 3,
16
+ "expand_ratio": 4.0,
17
+ "hidden_act": "silu",
18
+ "hidden_dropout_prob": 0.1,
19
+ "hidden_sizes": [
20
+ 96,
21
+ 120,
22
+ 144
23
+ ],
24
+ "id2label": {
25
+ "0": "aircraft carrier",
26
+ "1": "airplane",
27
+ "10": "asparagus",
28
+ "100": "dumbbell",
29
+ "101": "ear",
30
+ "102": "elbow",
31
+ "103": "elephant",
32
+ "104": "envelope",
33
+ "105": "eraser",
34
+ "106": "eye",
35
+ "107": "eyeglasses",
36
+ "108": "face",
37
+ "109": "fan",
38
+ "11": "axe",
39
+ "110": "feather",
40
+ "111": "fence",
41
+ "112": "finger",
42
+ "113": "fire hydrant",
43
+ "114": "fireplace",
44
+ "115": "firetruck",
45
+ "116": "fish",
46
+ "117": "flamingo",
47
+ "118": "flashlight",
48
+ "119": "flip flops",
49
+ "12": "backpack",
50
+ "120": "floor lamp",
51
+ "121": "flower",
52
+ "122": "flying saucer",
53
+ "123": "foot",
54
+ "124": "fork",
55
+ "125": "frog",
56
+ "126": "frying pan",
57
+ "127": "garden hose",
58
+ "128": "garden",
59
+ "129": "giraffe",
60
+ "13": "banana",
61
+ "130": "goatee",
62
+ "131": "golf club",
63
+ "132": "grapes",
64
+ "133": "grass",
65
+ "134": "guitar",
66
+ "135": "hamburger",
67
+ "136": "hammer",
68
+ "137": "hand",
69
+ "138": "harp",
70
+ "139": "hat",
71
+ "14": "bandage",
72
+ "140": "headphones",
73
+ "141": "hedgehog",
74
+ "142": "helicopter",
75
+ "143": "helmet",
76
+ "144": "hexagon",
77
+ "145": "hockey puck",
78
+ "146": "hockey stick",
79
+ "147": "horse",
80
+ "148": "hospital",
81
+ "149": "hot air balloon",
82
+ "15": "barn",
83
+ "150": "hot dog",
84
+ "151": "hot tub",
85
+ "152": "hourglass",
86
+ "153": "house plant",
87
+ "154": "house",
88
+ "155": "hurricane",
89
+ "156": "ice cream",
90
+ "157": "jacket",
91
+ "158": "jail",
92
+ "159": "kangaroo",
93
+ "16": "baseball bat",
94
+ "160": "key",
95
+ "161": "keyboard",
96
+ "162": "knee",
97
+ "163": "knife",
98
+ "164": "ladder",
99
+ "165": "lantern",
100
+ "166": "laptop",
101
+ "167": "leaf",
102
+ "168": "leg",
103
+ "169": "light bulb",
104
+ "17": "baseball",
105
+ "170": "lighter",
106
+ "171": "lighthouse",
107
+ "172": "lightning",
108
+ "173": "line",
109
+ "174": "lion",
110
+ "175": "lipstick",
111
+ "176": "lobster",
112
+ "177": "lollipop",
113
+ "178": "mailbox",
114
+ "179": "map",
115
+ "18": "basket",
116
+ "180": "marker",
117
+ "181": "matches",
118
+ "182": "megaphone",
119
+ "183": "mermaid",
120
+ "184": "microphone",
121
+ "185": "microwave",
122
+ "186": "monkey",
123
+ "187": "moon",
124
+ "188": "mosquito",
125
+ "189": "motorbike",
126
+ "19": "basketball",
127
+ "190": "mountain",
128
+ "191": "mouse",
129
+ "192": "moustache",
130
+ "193": "mouth",
131
+ "194": "mug",
132
+ "195": "mushroom",
133
+ "196": "nail",
134
+ "197": "necklace",
135
+ "198": "nose",
136
+ "199": "ocean",
137
+ "2": "alarm clock",
138
+ "20": "bat",
139
+ "200": "octagon",
140
+ "201": "octopus",
141
+ "202": "onion",
142
+ "203": "oven",
143
+ "204": "owl",
144
+ "205": "paint can",
145
+ "206": "paintbrush",
146
+ "207": "palm tree",
147
+ "208": "panda",
148
+ "209": "pants",
149
+ "21": "bathtub",
150
+ "210": "paper clip",
151
+ "211": "parachute",
152
+ "212": "parrot",
153
+ "213": "passport",
154
+ "214": "peanut",
155
+ "215": "pear",
156
+ "216": "peas",
157
+ "217": "pencil",
158
+ "218": "penguin",
159
+ "219": "piano",
160
+ "22": "beach",
161
+ "220": "pickup truck",
162
+ "221": "picture frame",
163
+ "222": "pig",
164
+ "223": "pillow",
165
+ "224": "pineapple",
166
+ "225": "pizza",
167
+ "226": "pliers",
168
+ "227": "police car",
169
+ "228": "pond",
170
+ "229": "pool",
171
+ "23": "bear",
172
+ "230": "popsicle",
173
+ "231": "postcard",
174
+ "232": "potato",
175
+ "233": "power outlet",
176
+ "234": "purse",
177
+ "235": "rabbit",
178
+ "236": "raccoon",
179
+ "237": "radio",
180
+ "238": "rain",
181
+ "239": "rainbow",
182
+ "24": "beard",
183
+ "240": "rake",
184
+ "241": "remote control",
185
+ "242": "rhinoceros",
186
+ "243": "rifle",
187
+ "244": "river",
188
+ "245": "roller coaster",
189
+ "246": "rollerskates",
190
+ "247": "sailboat",
191
+ "248": "sandwich",
192
+ "249": "saw",
193
+ "25": "bed",
194
+ "250": "saxophone",
195
+ "251": "school bus",
196
+ "252": "scissors",
197
+ "253": "scorpion",
198
+ "254": "screwdriver",
199
+ "255": "sea turtle",
200
+ "256": "see saw",
201
+ "257": "shark",
202
+ "258": "sheep",
203
+ "259": "shoe",
204
+ "26": "bee",
205
+ "260": "shorts",
206
+ "261": "shovel",
207
+ "262": "sink",
208
+ "263": "skateboard",
209
+ "264": "skull",
210
+ "265": "skyscraper",
211
+ "266": "sleeping bag",
212
+ "267": "smiley face",
213
+ "268": "snail",
214
+ "269": "snake",
215
+ "27": "belt",
216
+ "270": "snorkel",
217
+ "271": "snowflake",
218
+ "272": "snowman",
219
+ "273": "soccer ball",
220
+ "274": "sock",
221
+ "275": "speedboat",
222
+ "276": "spider",
223
+ "277": "spoon",
224
+ "278": "spreadsheet",
225
+ "279": "square",
226
+ "28": "bench",
227
+ "280": "squiggle",
228
+ "281": "squirrel",
229
+ "282": "stairs",
230
+ "283": "star",
231
+ "284": "steak",
232
+ "285": "stereo",
233
+ "286": "stethoscope",
234
+ "287": "stitches",
235
+ "288": "stop sign",
236
+ "289": "stove",
237
+ "29": "bicycle",
238
+ "290": "strawberry",
239
+ "291": "streetlight",
240
+ "292": "string bean",
241
+ "293": "submarine",
242
+ "294": "suitcase",
243
+ "295": "sun",
244
+ "296": "swan",
245
+ "297": "sweater",
246
+ "298": "swing set",
247
+ "299": "sword",
248
+ "3": "ambulance",
249
+ "30": "binoculars",
250
+ "300": "syringe",
251
+ "301": "t-shirt",
252
+ "302": "table",
253
+ "303": "teapot",
254
+ "304": "teddy-bear",
255
+ "305": "telephone",
256
+ "306": "television",
257
+ "307": "tennis racquet",
258
+ "308": "tent",
259
+ "309": "The Eiffel Tower",
260
+ "31": "bird",
261
+ "310": "The Great Wall of China",
262
+ "311": "The Mona Lisa",
263
+ "312": "tiger",
264
+ "313": "toaster",
265
+ "314": "toe",
266
+ "315": "toilet",
267
+ "316": "tooth",
268
+ "317": "toothbrush",
269
+ "318": "toothpaste",
270
+ "319": "tornado",
271
+ "32": "birthday cake",
272
+ "320": "tractor",
273
+ "321": "traffic light",
274
+ "322": "train",
275
+ "323": "tree",
276
+ "324": "triangle",
277
+ "325": "trombone",
278
+ "326": "truck",
279
+ "327": "trumpet",
280
+ "328": "umbrella",
281
+ "329": "underwear",
282
+ "33": "blackberry",
283
+ "330": "van",
284
+ "331": "vase",
285
+ "332": "violin",
286
+ "333": "washing machine",
287
+ "334": "watermelon",
288
+ "335": "waterslide",
289
+ "336": "whale",
290
+ "337": "wheel",
291
+ "338": "windmill",
292
+ "339": "wine bottle",
293
+ "34": "blueberry",
294
+ "340": "wine glass",
295
+ "341": "wristwatch",
296
+ "342": "yoga",
297
+ "343": "zebra",
298
+ "344": "zigzag",
299
+ "35": "book",
300
+ "36": "boomerang",
301
+ "37": "bottlecap",
302
+ "38": "bowtie",
303
+ "39": "bracelet",
304
+ "4": "angel",
305
+ "40": "brain",
306
+ "41": "bread",
307
+ "42": "bridge",
308
+ "43": "broccoli",
309
+ "44": "broom",
310
+ "45": "bucket",
311
+ "46": "bulldozer",
312
+ "47": "bus",
313
+ "48": "bush",
314
+ "49": "butterfly",
315
+ "5": "animal migration",
316
+ "50": "cactus",
317
+ "51": "cake",
318
+ "52": "calculator",
319
+ "53": "calendar",
320
+ "54": "camel",
321
+ "55": "camera",
322
+ "56": "camouflage",
323
+ "57": "campfire",
324
+ "58": "candle",
325
+ "59": "cannon",
326
+ "6": "ant",
327
+ "60": "canoe",
328
+ "61": "car",
329
+ "62": "carrot",
330
+ "63": "castle",
331
+ "64": "cat",
332
+ "65": "ceiling fan",
333
+ "66": "cell phone",
334
+ "67": "cello",
335
+ "68": "chair",
336
+ "69": "chandelier",
337
+ "7": "anvil",
338
+ "70": "church",
339
+ "71": "circle",
340
+ "72": "clarinet",
341
+ "73": "clock",
342
+ "74": "cloud",
343
+ "75": "coffee cup",
344
+ "76": "compass",
345
+ "77": "computer",
346
+ "78": "cookie",
347
+ "79": "cooler",
348
+ "8": "apple",
349
+ "80": "couch",
350
+ "81": "cow",
351
+ "82": "crab",
352
+ "83": "crayon",
353
+ "84": "crocodile",
354
+ "85": "crown",
355
+ "86": "cruise ship",
356
+ "87": "cup",
357
+ "88": "diamond",
358
+ "89": "dishwasher",
359
+ "9": "arm",
360
+ "90": "diving board",
361
+ "91": "dog",
362
+ "92": "dolphin",
363
+ "93": "donut",
364
+ "94": "door",
365
+ "95": "dragon",
366
+ "96": "dresser",
367
+ "97": "drill",
368
+ "98": "drums",
369
+ "99": "duck"
370
+ },
371
+ "image_size": 28,
372
+ "initializer_range": 0.02,
373
+ "label2id": {
374
+ "The Eiffel Tower": "309",
375
+ "The Great Wall of China": "310",
376
+ "The Mona Lisa": "311",
377
+ "aircraft carrier": "0",
378
+ "airplane": "1",
379
+ "alarm clock": "2",
380
+ "ambulance": "3",
381
+ "angel": "4",
382
+ "animal migration": "5",
383
+ "ant": "6",
384
+ "anvil": "7",
385
+ "apple": "8",
386
+ "arm": "9",
387
+ "asparagus": "10",
388
+ "axe": "11",
389
+ "backpack": "12",
390
+ "banana": "13",
391
+ "bandage": "14",
392
+ "barn": "15",
393
+ "baseball": "17",
394
+ "baseball bat": "16",
395
+ "basket": "18",
396
+ "basketball": "19",
397
+ "bat": "20",
398
+ "bathtub": "21",
399
+ "beach": "22",
400
+ "bear": "23",
401
+ "beard": "24",
402
+ "bed": "25",
403
+ "bee": "26",
404
+ "belt": "27",
405
+ "bench": "28",
406
+ "bicycle": "29",
407
+ "binoculars": "30",
408
+ "bird": "31",
409
+ "birthday cake": "32",
410
+ "blackberry": "33",
411
+ "blueberry": "34",
412
+ "book": "35",
413
+ "boomerang": "36",
414
+ "bottlecap": "37",
415
+ "bowtie": "38",
416
+ "bracelet": "39",
417
+ "brain": "40",
418
+ "bread": "41",
419
+ "bridge": "42",
420
+ "broccoli": "43",
421
+ "broom": "44",
422
+ "bucket": "45",
423
+ "bulldozer": "46",
424
+ "bus": "47",
425
+ "bush": "48",
426
+ "butterfly": "49",
427
+ "cactus": "50",
428
+ "cake": "51",
429
+ "calculator": "52",
430
+ "calendar": "53",
431
+ "camel": "54",
432
+ "camera": "55",
433
+ "camouflage": "56",
434
+ "campfire": "57",
435
+ "candle": "58",
436
+ "cannon": "59",
437
+ "canoe": "60",
438
+ "car": "61",
439
+ "carrot": "62",
440
+ "castle": "63",
441
+ "cat": "64",
442
+ "ceiling fan": "65",
443
+ "cell phone": "66",
444
+ "cello": "67",
445
+ "chair": "68",
446
+ "chandelier": "69",
447
+ "church": "70",
448
+ "circle": "71",
449
+ "clarinet": "72",
450
+ "clock": "73",
451
+ "cloud": "74",
452
+ "coffee cup": "75",
453
+ "compass": "76",
454
+ "computer": "77",
455
+ "cookie": "78",
456
+ "cooler": "79",
457
+ "couch": "80",
458
+ "cow": "81",
459
+ "crab": "82",
460
+ "crayon": "83",
461
+ "crocodile": "84",
462
+ "crown": "85",
463
+ "cruise ship": "86",
464
+ "cup": "87",
465
+ "diamond": "88",
466
+ "dishwasher": "89",
467
+ "diving board": "90",
468
+ "dog": "91",
469
+ "dolphin": "92",
470
+ "donut": "93",
471
+ "door": "94",
472
+ "dragon": "95",
473
+ "dresser": "96",
474
+ "drill": "97",
475
+ "drums": "98",
476
+ "duck": "99",
477
+ "dumbbell": "100",
478
+ "ear": "101",
479
+ "elbow": "102",
480
+ "elephant": "103",
481
+ "envelope": "104",
482
+ "eraser": "105",
483
+ "eye": "106",
484
+ "eyeglasses": "107",
485
+ "face": "108",
486
+ "fan": "109",
487
+ "feather": "110",
488
+ "fence": "111",
489
+ "finger": "112",
490
+ "fire hydrant": "113",
491
+ "fireplace": "114",
492
+ "firetruck": "115",
493
+ "fish": "116",
494
+ "flamingo": "117",
495
+ "flashlight": "118",
496
+ "flip flops": "119",
497
+ "floor lamp": "120",
498
+ "flower": "121",
499
+ "flying saucer": "122",
500
+ "foot": "123",
501
+ "fork": "124",
502
+ "frog": "125",
503
+ "frying pan": "126",
504
+ "garden": "128",
505
+ "garden hose": "127",
506
+ "giraffe": "129",
507
+ "goatee": "130",
508
+ "golf club": "131",
509
+ "grapes": "132",
510
+ "grass": "133",
511
+ "guitar": "134",
512
+ "hamburger": "135",
513
+ "hammer": "136",
514
+ "hand": "137",
515
+ "harp": "138",
516
+ "hat": "139",
517
+ "headphones": "140",
518
+ "hedgehog": "141",
519
+ "helicopter": "142",
520
+ "helmet": "143",
521
+ "hexagon": "144",
522
+ "hockey puck": "145",
523
+ "hockey stick": "146",
524
+ "horse": "147",
525
+ "hospital": "148",
526
+ "hot air balloon": "149",
527
+ "hot dog": "150",
528
+ "hot tub": "151",
529
+ "hourglass": "152",
530
+ "house": "154",
531
+ "house plant": "153",
532
+ "hurricane": "155",
533
+ "ice cream": "156",
534
+ "jacket": "157",
535
+ "jail": "158",
536
+ "kangaroo": "159",
537
+ "key": "160",
538
+ "keyboard": "161",
539
+ "knee": "162",
540
+ "knife": "163",
541
+ "ladder": "164",
542
+ "lantern": "165",
543
+ "laptop": "166",
544
+ "leaf": "167",
545
+ "leg": "168",
546
+ "light bulb": "169",
547
+ "lighter": "170",
548
+ "lighthouse": "171",
549
+ "lightning": "172",
550
+ "line": "173",
551
+ "lion": "174",
552
+ "lipstick": "175",
553
+ "lobster": "176",
554
+ "lollipop": "177",
555
+ "mailbox": "178",
556
+ "map": "179",
557
+ "marker": "180",
558
+ "matches": "181",
559
+ "megaphone": "182",
560
+ "mermaid": "183",
561
+ "microphone": "184",
562
+ "microwave": "185",
563
+ "monkey": "186",
564
+ "moon": "187",
565
+ "mosquito": "188",
566
+ "motorbike": "189",
567
+ "mountain": "190",
568
+ "mouse": "191",
569
+ "moustache": "192",
570
+ "mouth": "193",
571
+ "mug": "194",
572
+ "mushroom": "195",
573
+ "nail": "196",
574
+ "necklace": "197",
575
+ "nose": "198",
576
+ "ocean": "199",
577
+ "octagon": "200",
578
+ "octopus": "201",
579
+ "onion": "202",
580
+ "oven": "203",
581
+ "owl": "204",
582
+ "paint can": "205",
583
+ "paintbrush": "206",
584
+ "palm tree": "207",
585
+ "panda": "208",
586
+ "pants": "209",
587
+ "paper clip": "210",
588
+ "parachute": "211",
589
+ "parrot": "212",
590
+ "passport": "213",
591
+ "peanut": "214",
592
+ "pear": "215",
593
+ "peas": "216",
594
+ "pencil": "217",
595
+ "penguin": "218",
596
+ "piano": "219",
597
+ "pickup truck": "220",
598
+ "picture frame": "221",
599
+ "pig": "222",
600
+ "pillow": "223",
601
+ "pineapple": "224",
602
+ "pizza": "225",
603
+ "pliers": "226",
604
+ "police car": "227",
605
+ "pond": "228",
606
+ "pool": "229",
607
+ "popsicle": "230",
608
+ "postcard": "231",
609
+ "potato": "232",
610
+ "power outlet": "233",
611
+ "purse": "234",
612
+ "rabbit": "235",
613
+ "raccoon": "236",
614
+ "radio": "237",
615
+ "rain": "238",
616
+ "rainbow": "239",
617
+ "rake": "240",
618
+ "remote control": "241",
619
+ "rhinoceros": "242",
620
+ "rifle": "243",
621
+ "river": "244",
622
+ "roller coaster": "245",
623
+ "rollerskates": "246",
624
+ "sailboat": "247",
625
+ "sandwich": "248",
626
+ "saw": "249",
627
+ "saxophone": "250",
628
+ "school bus": "251",
629
+ "scissors": "252",
630
+ "scorpion": "253",
631
+ "screwdriver": "254",
632
+ "sea turtle": "255",
633
+ "see saw": "256",
634
+ "shark": "257",
635
+ "sheep": "258",
636
+ "shoe": "259",
637
+ "shorts": "260",
638
+ "shovel": "261",
639
+ "sink": "262",
640
+ "skateboard": "263",
641
+ "skull": "264",
642
+ "skyscraper": "265",
643
+ "sleeping bag": "266",
644
+ "smiley face": "267",
645
+ "snail": "268",
646
+ "snake": "269",
647
+ "snorkel": "270",
648
+ "snowflake": "271",
649
+ "snowman": "272",
650
+ "soccer ball": "273",
651
+ "sock": "274",
652
+ "speedboat": "275",
653
+ "spider": "276",
654
+ "spoon": "277",
655
+ "spreadsheet": "278",
656
+ "square": "279",
657
+ "squiggle": "280",
658
+ "squirrel": "281",
659
+ "stairs": "282",
660
+ "star": "283",
661
+ "steak": "284",
662
+ "stereo": "285",
663
+ "stethoscope": "286",
664
+ "stitches": "287",
665
+ "stop sign": "288",
666
+ "stove": "289",
667
+ "strawberry": "290",
668
+ "streetlight": "291",
669
+ "string bean": "292",
670
+ "submarine": "293",
671
+ "suitcase": "294",
672
+ "sun": "295",
673
+ "swan": "296",
674
+ "sweater": "297",
675
+ "swing set": "298",
676
+ "sword": "299",
677
+ "syringe": "300",
678
+ "t-shirt": "301",
679
+ "table": "302",
680
+ "teapot": "303",
681
+ "teddy-bear": "304",
682
+ "telephone": "305",
683
+ "television": "306",
684
+ "tennis racquet": "307",
685
+ "tent": "308",
686
+ "tiger": "312",
687
+ "toaster": "313",
688
+ "toe": "314",
689
+ "toilet": "315",
690
+ "tooth": "316",
691
+ "toothbrush": "317",
692
+ "toothpaste": "318",
693
+ "tornado": "319",
694
+ "tractor": "320",
695
+ "traffic light": "321",
696
+ "train": "322",
697
+ "tree": "323",
698
+ "triangle": "324",
699
+ "trombone": "325",
700
+ "truck": "326",
701
+ "trumpet": "327",
702
+ "umbrella": "328",
703
+ "underwear": "329",
704
+ "van": "330",
705
+ "vase": "331",
706
+ "violin": "332",
707
+ "washing machine": "333",
708
+ "watermelon": "334",
709
+ "waterslide": "335",
710
+ "whale": "336",
711
+ "wheel": "337",
712
+ "windmill": "338",
713
+ "wine bottle": "339",
714
+ "wine glass": "340",
715
+ "wristwatch": "341",
716
+ "yoga": "342",
717
+ "zebra": "343",
718
+ "zigzag": "344"
719
+ },
720
+ "layer_norm_eps": 1e-05,
721
+ "mlp_ratio": 2.0,
722
+ "model_type": "mobilevit",
723
+ "neck_hidden_sizes": [
724
+ 16,
725
+ 32,
726
+ 48,
727
+ 64,
728
+ 80,
729
+ 96,
730
+ 384
731
+ ],
732
+ "num_attention_heads": 4,
733
+ "num_channels": 1,
734
+ "output_stride": 32,
735
+ "patch_size": 2,
736
+ "problem_type": "single_label_classification",
737
+ "qkv_bias": true,
738
+ "semantic_loss_ignore_index": 255,
739
+ "torch_dtype": "float32",
740
+ "transformers_version": "4.42.4"
741
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:515a6ad114e024e5fd13dc70624fc54dc109be5014b295778bb5bde059f6e93f
3
+ size 8341964
preprocessor_config.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "crop_size": {
3
+ "height": 28,
4
+ "width": 28
5
+ },
6
+ "do_center_crop": true,
7
+ "do_convert_rgb": false,
8
+ "do_flip_channel_order": false,
9
+ "do_rescale": true,
10
+ "do_resize": true,
11
+ "image_processor_type": "MobileViTImageProcessor",
12
+ "resample": 2,
13
+ "rescale_factor": 0.00392156862745098,
14
+ "size": {
15
+ "shortest_edge": 28
16
+ }
17
+ }
test_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 3.0,
3
+ "eval_accuracy": 0.728948,
4
+ "eval_loss": 1.0799410343170166,
5
+ "eval_runtime": 79.2396,
6
+ "eval_samples_per_second": 3154.988,
7
+ "eval_steps_per_second": 12.33
8
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 3.0,
3
+ "total_flos": 1.3116020904e+17,
4
+ "train_loss": 1.269093582550002,
5
+ "train_runtime": 7252.1306,
6
+ "train_samples_per_second": 1861.522,
7
+ "train_steps_per_second": 7.272
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,3821 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 3.0,
5
+ "eval_steps": 5000,
6
+ "global_step": 52737,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.005688605722737357,
13
+ "grad_norm": 2.727705717086792,
14
+ "learning_rate": 0.0007984982080891974,
15
+ "loss": 2.3682,
16
+ "step": 100
17
+ },
18
+ {
19
+ "epoch": 0.011377211445474714,
20
+ "grad_norm": 2.5978145599365234,
21
+ "learning_rate": 0.0007969812465631342,
22
+ "loss": 2.2098,
23
+ "step": 200
24
+ },
25
+ {
26
+ "epoch": 0.01706581716821207,
27
+ "grad_norm": 2.488832473754883,
28
+ "learning_rate": 0.0007954642850370708,
29
+ "loss": 2.1041,
30
+ "step": 300
31
+ },
32
+ {
33
+ "epoch": 0.02275442289094943,
34
+ "grad_norm": 2.1368465423583984,
35
+ "learning_rate": 0.0007939473235110074,
36
+ "loss": 2.0233,
37
+ "step": 400
38
+ },
39
+ {
40
+ "epoch": 0.028443028613686784,
41
+ "grad_norm": 2.7280213832855225,
42
+ "learning_rate": 0.0007924303619849442,
43
+ "loss": 1.9639,
44
+ "step": 500
45
+ },
46
+ {
47
+ "epoch": 0.03413163433642414,
48
+ "grad_norm": 2.5257463455200195,
49
+ "learning_rate": 0.0007909134004588809,
50
+ "loss": 2.0229,
51
+ "step": 600
52
+ },
53
+ {
54
+ "epoch": 0.0398202400591615,
55
+ "grad_norm": 2.748051404953003,
56
+ "learning_rate": 0.0007893964389328175,
57
+ "loss": 1.9867,
58
+ "step": 700
59
+ },
60
+ {
61
+ "epoch": 0.04550884578189886,
62
+ "grad_norm": 2.212047815322876,
63
+ "learning_rate": 0.0007878794774067543,
64
+ "loss": 1.9354,
65
+ "step": 800
66
+ },
67
+ {
68
+ "epoch": 0.05119745150463621,
69
+ "grad_norm": 2.423400640487671,
70
+ "learning_rate": 0.000786362515880691,
71
+ "loss": 1.9127,
72
+ "step": 900
73
+ },
74
+ {
75
+ "epoch": 0.05688605722737357,
76
+ "grad_norm": 2.379678726196289,
77
+ "learning_rate": 0.0007848455543546277,
78
+ "loss": 1.8927,
79
+ "step": 1000
80
+ },
81
+ {
82
+ "epoch": 0.06257466295011092,
83
+ "grad_norm": 2.5806541442871094,
84
+ "learning_rate": 0.0007833285928285645,
85
+ "loss": 1.8471,
86
+ "step": 1100
87
+ },
88
+ {
89
+ "epoch": 0.06826326867284828,
90
+ "grad_norm": 2.4539499282836914,
91
+ "learning_rate": 0.0007818116313025011,
92
+ "loss": 1.8296,
93
+ "step": 1200
94
+ },
95
+ {
96
+ "epoch": 0.07395187439558565,
97
+ "grad_norm": 2.7818546295166016,
98
+ "learning_rate": 0.0007802946697764378,
99
+ "loss": 1.8136,
100
+ "step": 1300
101
+ },
102
+ {
103
+ "epoch": 0.079640480118323,
104
+ "grad_norm": 2.979959487915039,
105
+ "learning_rate": 0.0007787777082503746,
106
+ "loss": 1.7844,
107
+ "step": 1400
108
+ },
109
+ {
110
+ "epoch": 0.08532908584106036,
111
+ "grad_norm": 2.3885879516601562,
112
+ "learning_rate": 0.0007772607467243113,
113
+ "loss": 1.7627,
114
+ "step": 1500
115
+ },
116
+ {
117
+ "epoch": 0.09101769156379771,
118
+ "grad_norm": 2.2447025775909424,
119
+ "learning_rate": 0.0007757437851982479,
120
+ "loss": 1.7375,
121
+ "step": 1600
122
+ },
123
+ {
124
+ "epoch": 0.09670629728653507,
125
+ "grad_norm": 2.184580087661743,
126
+ "learning_rate": 0.0007742268236721846,
127
+ "loss": 1.7212,
128
+ "step": 1700
129
+ },
130
+ {
131
+ "epoch": 0.10239490300927243,
132
+ "grad_norm": 2.2506866455078125,
133
+ "learning_rate": 0.0007727098621461213,
134
+ "loss": 1.7264,
135
+ "step": 1800
136
+ },
137
+ {
138
+ "epoch": 0.10808350873200978,
139
+ "grad_norm": 2.059812068939209,
140
+ "learning_rate": 0.000771192900620058,
141
+ "loss": 1.7123,
142
+ "step": 1900
143
+ },
144
+ {
145
+ "epoch": 0.11377211445474714,
146
+ "grad_norm": 2.3013007640838623,
147
+ "learning_rate": 0.0007696759390939948,
148
+ "loss": 1.7122,
149
+ "step": 2000
150
+ },
151
+ {
152
+ "epoch": 0.11946072017748449,
153
+ "grad_norm": 2.7073047161102295,
154
+ "learning_rate": 0.0007681741471831921,
155
+ "loss": 1.6847,
156
+ "step": 2100
157
+ },
158
+ {
159
+ "epoch": 0.12514932590022185,
160
+ "grad_norm": 2.023949384689331,
161
+ "learning_rate": 0.0007666571856571288,
162
+ "loss": 1.6941,
163
+ "step": 2200
164
+ },
165
+ {
166
+ "epoch": 0.1308379316229592,
167
+ "grad_norm": 1.9444501399993896,
168
+ "learning_rate": 0.0007651402241310656,
169
+ "loss": 1.6682,
170
+ "step": 2300
171
+ },
172
+ {
173
+ "epoch": 0.13652653734569656,
174
+ "grad_norm": 2.691826105117798,
175
+ "learning_rate": 0.0007636232626050023,
176
+ "loss": 1.6409,
177
+ "step": 2400
178
+ },
179
+ {
180
+ "epoch": 0.1422151430684339,
181
+ "grad_norm": 2.483386993408203,
182
+ "learning_rate": 0.0007621063010789389,
183
+ "loss": 1.662,
184
+ "step": 2500
185
+ },
186
+ {
187
+ "epoch": 0.1479037487911713,
188
+ "grad_norm": 2.1850545406341553,
189
+ "learning_rate": 0.0007605893395528756,
190
+ "loss": 1.6532,
191
+ "step": 2600
192
+ },
193
+ {
194
+ "epoch": 0.15359235451390865,
195
+ "grad_norm": 1.989560842514038,
196
+ "learning_rate": 0.0007590723780268123,
197
+ "loss": 1.6231,
198
+ "step": 2700
199
+ },
200
+ {
201
+ "epoch": 0.159280960236646,
202
+ "grad_norm": 2.1362531185150146,
203
+ "learning_rate": 0.000757555416500749,
204
+ "loss": 1.6019,
205
+ "step": 2800
206
+ },
207
+ {
208
+ "epoch": 0.16496956595938336,
209
+ "grad_norm": 2.3262641429901123,
210
+ "learning_rate": 0.0007560384549746857,
211
+ "loss": 1.6103,
212
+ "step": 2900
213
+ },
214
+ {
215
+ "epoch": 0.17065817168212072,
216
+ "grad_norm": 2.297419309616089,
217
+ "learning_rate": 0.0007545214934486224,
218
+ "loss": 1.6314,
219
+ "step": 3000
220
+ },
221
+ {
222
+ "epoch": 0.17634677740485807,
223
+ "grad_norm": 2.1368629932403564,
224
+ "learning_rate": 0.0007530045319225591,
225
+ "loss": 1.5838,
226
+ "step": 3100
227
+ },
228
+ {
229
+ "epoch": 0.18203538312759543,
230
+ "grad_norm": 2.3383195400238037,
231
+ "learning_rate": 0.0007514875703964959,
232
+ "loss": 1.5857,
233
+ "step": 3200
234
+ },
235
+ {
236
+ "epoch": 0.18772398885033278,
237
+ "grad_norm": 2.149740219116211,
238
+ "learning_rate": 0.0007499706088704326,
239
+ "loss": 1.6016,
240
+ "step": 3300
241
+ },
242
+ {
243
+ "epoch": 0.19341259457307014,
244
+ "grad_norm": 2.096703290939331,
245
+ "learning_rate": 0.0007484536473443692,
246
+ "loss": 1.5904,
247
+ "step": 3400
248
+ },
249
+ {
250
+ "epoch": 0.1991012002958075,
251
+ "grad_norm": 2.2043957710266113,
252
+ "learning_rate": 0.000746936685818306,
253
+ "loss": 1.5787,
254
+ "step": 3500
255
+ },
256
+ {
257
+ "epoch": 0.20478980601854485,
258
+ "grad_norm": 2.6369898319244385,
259
+ "learning_rate": 0.0007454197242922427,
260
+ "loss": 1.5539,
261
+ "step": 3600
262
+ },
263
+ {
264
+ "epoch": 0.2104784117412822,
265
+ "grad_norm": 1.9776628017425537,
266
+ "learning_rate": 0.0007439027627661794,
267
+ "loss": 1.5815,
268
+ "step": 3700
269
+ },
270
+ {
271
+ "epoch": 0.21616701746401956,
272
+ "grad_norm": 2.2001795768737793,
273
+ "learning_rate": 0.0007423858012401161,
274
+ "loss": 1.5583,
275
+ "step": 3800
276
+ },
277
+ {
278
+ "epoch": 0.22185562318675692,
279
+ "grad_norm": 2.2252562046051025,
280
+ "learning_rate": 0.0007408688397140527,
281
+ "loss": 1.5539,
282
+ "step": 3900
283
+ },
284
+ {
285
+ "epoch": 0.22754422890949427,
286
+ "grad_norm": 2.15871262550354,
287
+ "learning_rate": 0.0007393518781879895,
288
+ "loss": 1.5786,
289
+ "step": 4000
290
+ },
291
+ {
292
+ "epoch": 0.23323283463223163,
293
+ "grad_norm": 2.026066303253174,
294
+ "learning_rate": 0.0007378500862771869,
295
+ "loss": 1.5452,
296
+ "step": 4100
297
+ },
298
+ {
299
+ "epoch": 0.23892144035496898,
300
+ "grad_norm": 2.116511583328247,
301
+ "learning_rate": 0.0007363331247511235,
302
+ "loss": 1.5381,
303
+ "step": 4200
304
+ },
305
+ {
306
+ "epoch": 0.24461004607770637,
307
+ "grad_norm": 1.9454152584075928,
308
+ "learning_rate": 0.0007348161632250602,
309
+ "loss": 1.557,
310
+ "step": 4300
311
+ },
312
+ {
313
+ "epoch": 0.2502986518004437,
314
+ "grad_norm": 1.8668495416641235,
315
+ "learning_rate": 0.000733299201698997,
316
+ "loss": 1.5406,
317
+ "step": 4400
318
+ },
319
+ {
320
+ "epoch": 0.2559872575231811,
321
+ "grad_norm": 2.0886125564575195,
322
+ "learning_rate": 0.0007317822401729337,
323
+ "loss": 1.519,
324
+ "step": 4500
325
+ },
326
+ {
327
+ "epoch": 0.2616758632459184,
328
+ "grad_norm": 2.5768446922302246,
329
+ "learning_rate": 0.0007302652786468704,
330
+ "loss": 1.5178,
331
+ "step": 4600
332
+ },
333
+ {
334
+ "epoch": 0.2673644689686558,
335
+ "grad_norm": 2.4169631004333496,
336
+ "learning_rate": 0.000728748317120807,
337
+ "loss": 1.5283,
338
+ "step": 4700
339
+ },
340
+ {
341
+ "epoch": 0.2730530746913931,
342
+ "grad_norm": 2.7676291465759277,
343
+ "learning_rate": 0.0007272313555947437,
344
+ "loss": 1.5265,
345
+ "step": 4800
346
+ },
347
+ {
348
+ "epoch": 0.2787416804141305,
349
+ "grad_norm": 1.9152452945709229,
350
+ "learning_rate": 0.0007257143940686805,
351
+ "loss": 1.5152,
352
+ "step": 4900
353
+ },
354
+ {
355
+ "epoch": 0.2844302861368678,
356
+ "grad_norm": 2.56608510017395,
357
+ "learning_rate": 0.0007241974325426172,
358
+ "loss": 1.5084,
359
+ "step": 5000
360
+ },
361
+ {
362
+ "epoch": 0.2844302861368678,
363
+ "eval_accuracy": 0.636444,
364
+ "eval_loss": 1.469992995262146,
365
+ "eval_runtime": 85.5907,
366
+ "eval_samples_per_second": 2920.879,
367
+ "eval_steps_per_second": 11.415,
368
+ "step": 5000
369
+ },
370
+ {
371
+ "epoch": 0.2901188918596052,
372
+ "grad_norm": 2.2403135299682617,
373
+ "learning_rate": 0.0007226804710165538,
374
+ "loss": 1.5076,
375
+ "step": 5100
376
+ },
377
+ {
378
+ "epoch": 0.2958074975823426,
379
+ "grad_norm": 2.058535099029541,
380
+ "learning_rate": 0.0007211635094904906,
381
+ "loss": 1.4973,
382
+ "step": 5200
383
+ },
384
+ {
385
+ "epoch": 0.3014961033050799,
386
+ "grad_norm": 1.9374159574508667,
387
+ "learning_rate": 0.0007196465479644273,
388
+ "loss": 1.5055,
389
+ "step": 5300
390
+ },
391
+ {
392
+ "epoch": 0.3071847090278173,
393
+ "grad_norm": 1.8894695043563843,
394
+ "learning_rate": 0.000718129586438364,
395
+ "loss": 1.4996,
396
+ "step": 5400
397
+ },
398
+ {
399
+ "epoch": 0.31287331475055463,
400
+ "grad_norm": 2.5466501712799072,
401
+ "learning_rate": 0.0007166126249123008,
402
+ "loss": 1.5006,
403
+ "step": 5500
404
+ },
405
+ {
406
+ "epoch": 0.318561920473292,
407
+ "grad_norm": 1.9721605777740479,
408
+ "learning_rate": 0.0007150956633862374,
409
+ "loss": 1.4932,
410
+ "step": 5600
411
+ },
412
+ {
413
+ "epoch": 0.32425052619602934,
414
+ "grad_norm": 1.8921763896942139,
415
+ "learning_rate": 0.0007135787018601741,
416
+ "loss": 1.4762,
417
+ "step": 5700
418
+ },
419
+ {
420
+ "epoch": 0.3299391319187667,
421
+ "grad_norm": 2.49052357673645,
422
+ "learning_rate": 0.0007120617403341108,
423
+ "loss": 1.4636,
424
+ "step": 5800
425
+ },
426
+ {
427
+ "epoch": 0.33562773764150405,
428
+ "grad_norm": 1.8825891017913818,
429
+ "learning_rate": 0.0007105447788080475,
430
+ "loss": 1.4903,
431
+ "step": 5900
432
+ },
433
+ {
434
+ "epoch": 0.34131634336424144,
435
+ "grad_norm": 1.9227776527404785,
436
+ "learning_rate": 0.0007090278172819842,
437
+ "loss": 1.4602,
438
+ "step": 6000
439
+ },
440
+ {
441
+ "epoch": 0.34700494908697876,
442
+ "grad_norm": 2.173774242401123,
443
+ "learning_rate": 0.0007075108557559209,
444
+ "loss": 1.4581,
445
+ "step": 6100
446
+ },
447
+ {
448
+ "epoch": 0.35269355480971615,
449
+ "grad_norm": 1.9840656518936157,
450
+ "learning_rate": 0.0007059938942298576,
451
+ "loss": 1.4504,
452
+ "step": 6200
453
+ },
454
+ {
455
+ "epoch": 0.3583821605324535,
456
+ "grad_norm": 2.368171453475952,
457
+ "learning_rate": 0.000704492102319055,
458
+ "loss": 1.4659,
459
+ "step": 6300
460
+ },
461
+ {
462
+ "epoch": 0.36407076625519086,
463
+ "grad_norm": 2.005125045776367,
464
+ "learning_rate": 0.0007029751407929917,
465
+ "loss": 1.4698,
466
+ "step": 6400
467
+ },
468
+ {
469
+ "epoch": 0.3697593719779282,
470
+ "grad_norm": 1.8724095821380615,
471
+ "learning_rate": 0.0007014581792669284,
472
+ "loss": 1.4429,
473
+ "step": 6500
474
+ },
475
+ {
476
+ "epoch": 0.37544797770066557,
477
+ "grad_norm": 1.8412431478500366,
478
+ "learning_rate": 0.0006999412177408651,
479
+ "loss": 1.4368,
480
+ "step": 6600
481
+ },
482
+ {
483
+ "epoch": 0.3811365834234029,
484
+ "grad_norm": 1.9016755819320679,
485
+ "learning_rate": 0.0006984242562148018,
486
+ "loss": 1.4351,
487
+ "step": 6700
488
+ },
489
+ {
490
+ "epoch": 0.3868251891461403,
491
+ "grad_norm": 1.9896953105926514,
492
+ "learning_rate": 0.0006969072946887384,
493
+ "loss": 1.4563,
494
+ "step": 6800
495
+ },
496
+ {
497
+ "epoch": 0.39251379486887766,
498
+ "grad_norm": 2.3341548442840576,
499
+ "learning_rate": 0.0006953903331626751,
500
+ "loss": 1.4457,
501
+ "step": 6900
502
+ },
503
+ {
504
+ "epoch": 0.398202400591615,
505
+ "grad_norm": 1.95259690284729,
506
+ "learning_rate": 0.0006938733716366119,
507
+ "loss": 1.4636,
508
+ "step": 7000
509
+ },
510
+ {
511
+ "epoch": 0.4038910063143524,
512
+ "grad_norm": 1.8444461822509766,
513
+ "learning_rate": 0.0006923564101105486,
514
+ "loss": 1.4418,
515
+ "step": 7100
516
+ },
517
+ {
518
+ "epoch": 0.4095796120370897,
519
+ "grad_norm": 1.9170624017715454,
520
+ "learning_rate": 0.0006908394485844853,
521
+ "loss": 1.4267,
522
+ "step": 7200
523
+ },
524
+ {
525
+ "epoch": 0.4152682177598271,
526
+ "grad_norm": 1.6293827295303345,
527
+ "learning_rate": 0.000689322487058422,
528
+ "loss": 1.4474,
529
+ "step": 7300
530
+ },
531
+ {
532
+ "epoch": 0.4209568234825644,
533
+ "grad_norm": 2.2202467918395996,
534
+ "learning_rate": 0.0006878055255323587,
535
+ "loss": 1.4166,
536
+ "step": 7400
537
+ },
538
+ {
539
+ "epoch": 0.4266454292053018,
540
+ "grad_norm": 1.9069397449493408,
541
+ "learning_rate": 0.0006862885640062954,
542
+ "loss": 1.4194,
543
+ "step": 7500
544
+ },
545
+ {
546
+ "epoch": 0.4323340349280391,
547
+ "grad_norm": 2.0205297470092773,
548
+ "learning_rate": 0.0006847716024802322,
549
+ "loss": 1.4328,
550
+ "step": 7600
551
+ },
552
+ {
553
+ "epoch": 0.4380226406507765,
554
+ "grad_norm": 1.6736252307891846,
555
+ "learning_rate": 0.0006832546409541689,
556
+ "loss": 1.4309,
557
+ "step": 7700
558
+ },
559
+ {
560
+ "epoch": 0.44371124637351383,
561
+ "grad_norm": 1.7010937929153442,
562
+ "learning_rate": 0.0006817376794281055,
563
+ "loss": 1.412,
564
+ "step": 7800
565
+ },
566
+ {
567
+ "epoch": 0.4493998520962512,
568
+ "grad_norm": 2.748424768447876,
569
+ "learning_rate": 0.0006802207179020422,
570
+ "loss": 1.421,
571
+ "step": 7900
572
+ },
573
+ {
574
+ "epoch": 0.45508845781898855,
575
+ "grad_norm": 1.908728837966919,
576
+ "learning_rate": 0.0006787037563759789,
577
+ "loss": 1.4172,
578
+ "step": 8000
579
+ },
580
+ {
581
+ "epoch": 0.46077706354172593,
582
+ "grad_norm": 1.8672014474868774,
583
+ "learning_rate": 0.0006771867948499157,
584
+ "loss": 1.4448,
585
+ "step": 8100
586
+ },
587
+ {
588
+ "epoch": 0.46646566926446326,
589
+ "grad_norm": 2.128519058227539,
590
+ "learning_rate": 0.0006756698333238524,
591
+ "loss": 1.4158,
592
+ "step": 8200
593
+ },
594
+ {
595
+ "epoch": 0.47215427498720064,
596
+ "grad_norm": 1.7498713731765747,
597
+ "learning_rate": 0.000674152871797789,
598
+ "loss": 1.412,
599
+ "step": 8300
600
+ },
601
+ {
602
+ "epoch": 0.47784288070993797,
603
+ "grad_norm": 1.7801289558410645,
604
+ "learning_rate": 0.0006726510798869864,
605
+ "loss": 1.4146,
606
+ "step": 8400
607
+ },
608
+ {
609
+ "epoch": 0.48353148643267535,
610
+ "grad_norm": 1.9360538721084595,
611
+ "learning_rate": 0.0006711341183609232,
612
+ "loss": 1.4252,
613
+ "step": 8500
614
+ },
615
+ {
616
+ "epoch": 0.48922009215541273,
617
+ "grad_norm": 2.3669304847717285,
618
+ "learning_rate": 0.0006696171568348598,
619
+ "loss": 1.4057,
620
+ "step": 8600
621
+ },
622
+ {
623
+ "epoch": 0.49490869787815006,
624
+ "grad_norm": 1.7751379013061523,
625
+ "learning_rate": 0.0006681001953087965,
626
+ "loss": 1.4049,
627
+ "step": 8700
628
+ },
629
+ {
630
+ "epoch": 0.5005973036008874,
631
+ "grad_norm": 2.5389885902404785,
632
+ "learning_rate": 0.0006665832337827332,
633
+ "loss": 1.3837,
634
+ "step": 8800
635
+ },
636
+ {
637
+ "epoch": 0.5062859093236248,
638
+ "grad_norm": 2.5082690715789795,
639
+ "learning_rate": 0.0006650662722566699,
640
+ "loss": 1.3924,
641
+ "step": 8900
642
+ },
643
+ {
644
+ "epoch": 0.5119745150463622,
645
+ "grad_norm": 2.011589527130127,
646
+ "learning_rate": 0.0006635493107306066,
647
+ "loss": 1.3956,
648
+ "step": 9000
649
+ },
650
+ {
651
+ "epoch": 0.5176631207690995,
652
+ "grad_norm": 1.819793939590454,
653
+ "learning_rate": 0.0006620323492045433,
654
+ "loss": 1.4063,
655
+ "step": 9100
656
+ },
657
+ {
658
+ "epoch": 0.5233517264918368,
659
+ "grad_norm": 2.081247568130493,
660
+ "learning_rate": 0.00066051538767848,
661
+ "loss": 1.4145,
662
+ "step": 9200
663
+ },
664
+ {
665
+ "epoch": 0.5290403322145742,
666
+ "grad_norm": 2.151563882827759,
667
+ "learning_rate": 0.0006589984261524168,
668
+ "loss": 1.4002,
669
+ "step": 9300
670
+ },
671
+ {
672
+ "epoch": 0.5347289379373116,
673
+ "grad_norm": 1.9170759916305542,
674
+ "learning_rate": 0.0006574814646263535,
675
+ "loss": 1.3863,
676
+ "step": 9400
677
+ },
678
+ {
679
+ "epoch": 0.5404175436600489,
680
+ "grad_norm": 1.6435186862945557,
681
+ "learning_rate": 0.0006559645031002901,
682
+ "loss": 1.3888,
683
+ "step": 9500
684
+ },
685
+ {
686
+ "epoch": 0.5461061493827862,
687
+ "grad_norm": 1.8130972385406494,
688
+ "learning_rate": 0.0006544475415742268,
689
+ "loss": 1.3904,
690
+ "step": 9600
691
+ },
692
+ {
693
+ "epoch": 0.5517947551055237,
694
+ "grad_norm": 1.8200345039367676,
695
+ "learning_rate": 0.0006529305800481636,
696
+ "loss": 1.3647,
697
+ "step": 9700
698
+ },
699
+ {
700
+ "epoch": 0.557483360828261,
701
+ "grad_norm": 1.7286423444747925,
702
+ "learning_rate": 0.0006514136185221003,
703
+ "loss": 1.3815,
704
+ "step": 9800
705
+ },
706
+ {
707
+ "epoch": 0.5631719665509983,
708
+ "grad_norm": 2.345879554748535,
709
+ "learning_rate": 0.000649896656996037,
710
+ "loss": 1.3919,
711
+ "step": 9900
712
+ },
713
+ {
714
+ "epoch": 0.5688605722737357,
715
+ "grad_norm": 1.8189209699630737,
716
+ "learning_rate": 0.0006483796954699736,
717
+ "loss": 1.3684,
718
+ "step": 10000
719
+ },
720
+ {
721
+ "epoch": 0.5688605722737357,
722
+ "eval_accuracy": 0.667392,
723
+ "eval_loss": 1.3353288173675537,
724
+ "eval_runtime": 85.0475,
725
+ "eval_samples_per_second": 2939.533,
726
+ "eval_steps_per_second": 11.488,
727
+ "step": 10000
728
+ },
729
+ {
730
+ "epoch": 0.5745491779964731,
731
+ "grad_norm": 1.7264429330825806,
732
+ "learning_rate": 0.0006468627339439103,
733
+ "loss": 1.4044,
734
+ "step": 10100
735
+ },
736
+ {
737
+ "epoch": 0.5802377837192104,
738
+ "grad_norm": 1.8806540966033936,
739
+ "learning_rate": 0.0006453457724178471,
740
+ "loss": 1.3746,
741
+ "step": 10200
742
+ },
743
+ {
744
+ "epoch": 0.5859263894419477,
745
+ "grad_norm": 1.7714815139770508,
746
+ "learning_rate": 0.0006438288108917838,
747
+ "loss": 1.3837,
748
+ "step": 10300
749
+ },
750
+ {
751
+ "epoch": 0.5916149951646852,
752
+ "grad_norm": 1.713157057762146,
753
+ "learning_rate": 0.0006423118493657205,
754
+ "loss": 1.3939,
755
+ "step": 10400
756
+ },
757
+ {
758
+ "epoch": 0.5973036008874225,
759
+ "grad_norm": 2.169168472290039,
760
+ "learning_rate": 0.0006408100574549179,
761
+ "loss": 1.3658,
762
+ "step": 10500
763
+ },
764
+ {
765
+ "epoch": 0.6029922066101598,
766
+ "grad_norm": 1.727501630783081,
767
+ "learning_rate": 0.0006392930959288546,
768
+ "loss": 1.3907,
769
+ "step": 10600
770
+ },
771
+ {
772
+ "epoch": 0.6086808123328972,
773
+ "grad_norm": 2.0120322704315186,
774
+ "learning_rate": 0.0006377761344027913,
775
+ "loss": 1.3757,
776
+ "step": 10700
777
+ },
778
+ {
779
+ "epoch": 0.6143694180556346,
780
+ "grad_norm": 1.799139142036438,
781
+ "learning_rate": 0.0006362591728767279,
782
+ "loss": 1.3803,
783
+ "step": 10800
784
+ },
785
+ {
786
+ "epoch": 0.6200580237783719,
787
+ "grad_norm": 1.8817808628082275,
788
+ "learning_rate": 0.0006347422113506646,
789
+ "loss": 1.3702,
790
+ "step": 10900
791
+ },
792
+ {
793
+ "epoch": 0.6257466295011093,
794
+ "grad_norm": 2.1144518852233887,
795
+ "learning_rate": 0.0006332252498246013,
796
+ "loss": 1.3832,
797
+ "step": 11000
798
+ },
799
+ {
800
+ "epoch": 0.6314352352238466,
801
+ "grad_norm": 2.1396071910858154,
802
+ "learning_rate": 0.0006317082882985381,
803
+ "loss": 1.3611,
804
+ "step": 11100
805
+ },
806
+ {
807
+ "epoch": 0.637123840946584,
808
+ "grad_norm": 1.6794757843017578,
809
+ "learning_rate": 0.0006301913267724747,
810
+ "loss": 1.368,
811
+ "step": 11200
812
+ },
813
+ {
814
+ "epoch": 0.6428124466693214,
815
+ "grad_norm": 2.268433094024658,
816
+ "learning_rate": 0.0006286743652464114,
817
+ "loss": 1.3498,
818
+ "step": 11300
819
+ },
820
+ {
821
+ "epoch": 0.6485010523920587,
822
+ "grad_norm": 1.8515706062316895,
823
+ "learning_rate": 0.0006271574037203482,
824
+ "loss": 1.3489,
825
+ "step": 11400
826
+ },
827
+ {
828
+ "epoch": 0.654189658114796,
829
+ "grad_norm": 2.482171058654785,
830
+ "learning_rate": 0.0006256404421942849,
831
+ "loss": 1.3501,
832
+ "step": 11500
833
+ },
834
+ {
835
+ "epoch": 0.6598782638375335,
836
+ "grad_norm": 1.9485667943954468,
837
+ "learning_rate": 0.0006241234806682216,
838
+ "loss": 1.3483,
839
+ "step": 11600
840
+ },
841
+ {
842
+ "epoch": 0.6655668695602708,
843
+ "grad_norm": 1.8601367473602295,
844
+ "learning_rate": 0.0006226065191421583,
845
+ "loss": 1.3392,
846
+ "step": 11700
847
+ },
848
+ {
849
+ "epoch": 0.6712554752830081,
850
+ "grad_norm": 1.870851993560791,
851
+ "learning_rate": 0.000621089557616095,
852
+ "loss": 1.352,
853
+ "step": 11800
854
+ },
855
+ {
856
+ "epoch": 0.6769440810057455,
857
+ "grad_norm": 1.9454014301300049,
858
+ "learning_rate": 0.0006195725960900317,
859
+ "loss": 1.3537,
860
+ "step": 11900
861
+ },
862
+ {
863
+ "epoch": 0.6826326867284829,
864
+ "grad_norm": 1.9180669784545898,
865
+ "learning_rate": 0.0006180556345639685,
866
+ "loss": 1.3541,
867
+ "step": 12000
868
+ },
869
+ {
870
+ "epoch": 0.6883212924512202,
871
+ "grad_norm": 1.7796809673309326,
872
+ "learning_rate": 0.0006165386730379051,
873
+ "loss": 1.331,
874
+ "step": 12100
875
+ },
876
+ {
877
+ "epoch": 0.6940098981739575,
878
+ "grad_norm": 2.040998935699463,
879
+ "learning_rate": 0.0006150217115118417,
880
+ "loss": 1.3214,
881
+ "step": 12200
882
+ },
883
+ {
884
+ "epoch": 0.699698503896695,
885
+ "grad_norm": 1.7188791036605835,
886
+ "learning_rate": 0.0006135047499857785,
887
+ "loss": 1.3577,
888
+ "step": 12300
889
+ },
890
+ {
891
+ "epoch": 0.7053871096194323,
892
+ "grad_norm": 1.9152625799179077,
893
+ "learning_rate": 0.0006119877884597152,
894
+ "loss": 1.3682,
895
+ "step": 12400
896
+ },
897
+ {
898
+ "epoch": 0.7110757153421696,
899
+ "grad_norm": 2.150810718536377,
900
+ "learning_rate": 0.0006104708269336519,
901
+ "loss": 1.3388,
902
+ "step": 12500
903
+ },
904
+ {
905
+ "epoch": 0.716764321064907,
906
+ "grad_norm": 1.97470223903656,
907
+ "learning_rate": 0.0006089538654075887,
908
+ "loss": 1.3319,
909
+ "step": 12600
910
+ },
911
+ {
912
+ "epoch": 0.7224529267876444,
913
+ "grad_norm": 1.663122296333313,
914
+ "learning_rate": 0.0006074369038815253,
915
+ "loss": 1.3593,
916
+ "step": 12700
917
+ },
918
+ {
919
+ "epoch": 0.7281415325103817,
920
+ "grad_norm": 1.6453677415847778,
921
+ "learning_rate": 0.0006059351119707227,
922
+ "loss": 1.3592,
923
+ "step": 12800
924
+ },
925
+ {
926
+ "epoch": 0.733830138233119,
927
+ "grad_norm": 1.6896419525146484,
928
+ "learning_rate": 0.0006044181504446595,
929
+ "loss": 1.3183,
930
+ "step": 12900
931
+ },
932
+ {
933
+ "epoch": 0.7395187439558564,
934
+ "grad_norm": 1.7903008460998535,
935
+ "learning_rate": 0.000602901188918596,
936
+ "loss": 1.3373,
937
+ "step": 13000
938
+ },
939
+ {
940
+ "epoch": 0.7452073496785938,
941
+ "grad_norm": 2.2026655673980713,
942
+ "learning_rate": 0.0006013842273925327,
943
+ "loss": 1.3403,
944
+ "step": 13100
945
+ },
946
+ {
947
+ "epoch": 0.7508959554013311,
948
+ "grad_norm": 1.9204201698303223,
949
+ "learning_rate": 0.0005998672658664695,
950
+ "loss": 1.3199,
951
+ "step": 13200
952
+ },
953
+ {
954
+ "epoch": 0.7565845611240685,
955
+ "grad_norm": 1.946899652481079,
956
+ "learning_rate": 0.0005983503043404062,
957
+ "loss": 1.3298,
958
+ "step": 13300
959
+ },
960
+ {
961
+ "epoch": 0.7622731668468058,
962
+ "grad_norm": 2.019131898880005,
963
+ "learning_rate": 0.0005968333428143428,
964
+ "loss": 1.3449,
965
+ "step": 13400
966
+ },
967
+ {
968
+ "epoch": 0.7679617725695432,
969
+ "grad_norm": 1.848008155822754,
970
+ "learning_rate": 0.0005953163812882796,
971
+ "loss": 1.3206,
972
+ "step": 13500
973
+ },
974
+ {
975
+ "epoch": 0.7736503782922806,
976
+ "grad_norm": 2.373288631439209,
977
+ "learning_rate": 0.0005937994197622163,
978
+ "loss": 1.3564,
979
+ "step": 13600
980
+ },
981
+ {
982
+ "epoch": 0.7793389840150179,
983
+ "grad_norm": 2.556985855102539,
984
+ "learning_rate": 0.000592282458236153,
985
+ "loss": 1.3254,
986
+ "step": 13700
987
+ },
988
+ {
989
+ "epoch": 0.7850275897377553,
990
+ "grad_norm": 1.8957433700561523,
991
+ "learning_rate": 0.0005907654967100898,
992
+ "loss": 1.3498,
993
+ "step": 13800
994
+ },
995
+ {
996
+ "epoch": 0.7907161954604927,
997
+ "grad_norm": 1.7315127849578857,
998
+ "learning_rate": 0.0005892485351840264,
999
+ "loss": 1.3249,
1000
+ "step": 13900
1001
+ },
1002
+ {
1003
+ "epoch": 0.79640480118323,
1004
+ "grad_norm": 1.973764419555664,
1005
+ "learning_rate": 0.0005877315736579631,
1006
+ "loss": 1.3305,
1007
+ "step": 14000
1008
+ },
1009
+ {
1010
+ "epoch": 0.8020934069059673,
1011
+ "grad_norm": 1.711145281791687,
1012
+ "learning_rate": 0.0005862146121318999,
1013
+ "loss": 1.3011,
1014
+ "step": 14100
1015
+ },
1016
+ {
1017
+ "epoch": 0.8077820126287047,
1018
+ "grad_norm": 1.8515042066574097,
1019
+ "learning_rate": 0.0005846976506058365,
1020
+ "loss": 1.3195,
1021
+ "step": 14200
1022
+ },
1023
+ {
1024
+ "epoch": 0.8134706183514421,
1025
+ "grad_norm": 1.6278700828552246,
1026
+ "learning_rate": 0.0005831806890797733,
1027
+ "loss": 1.3308,
1028
+ "step": 14300
1029
+ },
1030
+ {
1031
+ "epoch": 0.8191592240741794,
1032
+ "grad_norm": 1.444455623626709,
1033
+ "learning_rate": 0.0005816637275537099,
1034
+ "loss": 1.3119,
1035
+ "step": 14400
1036
+ },
1037
+ {
1038
+ "epoch": 0.8248478297969167,
1039
+ "grad_norm": 1.6277796030044556,
1040
+ "learning_rate": 0.0005801467660276466,
1041
+ "loss": 1.3227,
1042
+ "step": 14500
1043
+ },
1044
+ {
1045
+ "epoch": 0.8305364355196542,
1046
+ "grad_norm": 1.8428665399551392,
1047
+ "learning_rate": 0.0005786298045015834,
1048
+ "loss": 1.3336,
1049
+ "step": 14600
1050
+ },
1051
+ {
1052
+ "epoch": 0.8362250412423915,
1053
+ "grad_norm": 1.6377763748168945,
1054
+ "learning_rate": 0.0005771128429755201,
1055
+ "loss": 1.3141,
1056
+ "step": 14700
1057
+ },
1058
+ {
1059
+ "epoch": 0.8419136469651288,
1060
+ "grad_norm": 1.7305645942687988,
1061
+ "learning_rate": 0.0005755958814494568,
1062
+ "loss": 1.3062,
1063
+ "step": 14800
1064
+ },
1065
+ {
1066
+ "epoch": 0.8476022526878662,
1067
+ "grad_norm": 2.469701051712036,
1068
+ "learning_rate": 0.0005740940895386541,
1069
+ "loss": 1.3074,
1070
+ "step": 14900
1071
+ },
1072
+ {
1073
+ "epoch": 0.8532908584106036,
1074
+ "grad_norm": 1.952755331993103,
1075
+ "learning_rate": 0.0005725771280125909,
1076
+ "loss": 1.3568,
1077
+ "step": 15000
1078
+ },
1079
+ {
1080
+ "epoch": 0.8532908584106036,
1081
+ "eval_accuracy": 0.68038,
1082
+ "eval_loss": 1.2764052152633667,
1083
+ "eval_runtime": 82.5452,
1084
+ "eval_samples_per_second": 3028.643,
1085
+ "eval_steps_per_second": 11.836,
1086
+ "step": 15000
1087
+ },
1088
+ {
1089
+ "epoch": 0.8589794641333409,
1090
+ "grad_norm": 3.221471071243286,
1091
+ "learning_rate": 0.0005710601664865274,
1092
+ "loss": 1.3341,
1093
+ "step": 15100
1094
+ },
1095
+ {
1096
+ "epoch": 0.8646680698560782,
1097
+ "grad_norm": 2.2455317974090576,
1098
+ "learning_rate": 0.0005695432049604642,
1099
+ "loss": 1.3276,
1100
+ "step": 15200
1101
+ },
1102
+ {
1103
+ "epoch": 0.8703566755788157,
1104
+ "grad_norm": 1.8076684474945068,
1105
+ "learning_rate": 0.0005680262434344009,
1106
+ "loss": 1.2922,
1107
+ "step": 15300
1108
+ },
1109
+ {
1110
+ "epoch": 0.876045281301553,
1111
+ "grad_norm": 1.701774001121521,
1112
+ "learning_rate": 0.0005665092819083376,
1113
+ "loss": 1.3003,
1114
+ "step": 15400
1115
+ },
1116
+ {
1117
+ "epoch": 0.8817338870242903,
1118
+ "grad_norm": 1.5403673648834229,
1119
+ "learning_rate": 0.0005649923203822744,
1120
+ "loss": 1.3207,
1121
+ "step": 15500
1122
+ },
1123
+ {
1124
+ "epoch": 0.8874224927470277,
1125
+ "grad_norm": 1.9462639093399048,
1126
+ "learning_rate": 0.000563475358856211,
1127
+ "loss": 1.3098,
1128
+ "step": 15600
1129
+ },
1130
+ {
1131
+ "epoch": 0.8931110984697651,
1132
+ "grad_norm": 1.6688456535339355,
1133
+ "learning_rate": 0.0005619583973301477,
1134
+ "loss": 1.2993,
1135
+ "step": 15700
1136
+ },
1137
+ {
1138
+ "epoch": 0.8987997041925024,
1139
+ "grad_norm": 1.6060837507247925,
1140
+ "learning_rate": 0.0005604414358040845,
1141
+ "loss": 1.3145,
1142
+ "step": 15800
1143
+ },
1144
+ {
1145
+ "epoch": 0.9044883099152398,
1146
+ "grad_norm": 1.8593111038208008,
1147
+ "learning_rate": 0.0005589244742780212,
1148
+ "loss": 1.2836,
1149
+ "step": 15900
1150
+ },
1151
+ {
1152
+ "epoch": 0.9101769156379771,
1153
+ "grad_norm": 2.035261869430542,
1154
+ "learning_rate": 0.0005574075127519579,
1155
+ "loss": 1.3125,
1156
+ "step": 16000
1157
+ },
1158
+ {
1159
+ "epoch": 0.9158655213607145,
1160
+ "grad_norm": 1.6091046333312988,
1161
+ "learning_rate": 0.0005558905512258946,
1162
+ "loss": 1.2868,
1163
+ "step": 16100
1164
+ },
1165
+ {
1166
+ "epoch": 0.9215541270834519,
1167
+ "grad_norm": 1.656204104423523,
1168
+ "learning_rate": 0.0005543735896998313,
1169
+ "loss": 1.3075,
1170
+ "step": 16200
1171
+ },
1172
+ {
1173
+ "epoch": 0.9272427328061892,
1174
+ "grad_norm": 1.5555946826934814,
1175
+ "learning_rate": 0.0005528566281737679,
1176
+ "loss": 1.2963,
1177
+ "step": 16300
1178
+ },
1179
+ {
1180
+ "epoch": 0.9329313385289265,
1181
+ "grad_norm": 1.7379626035690308,
1182
+ "learning_rate": 0.0005513396666477047,
1183
+ "loss": 1.2905,
1184
+ "step": 16400
1185
+ },
1186
+ {
1187
+ "epoch": 0.938619944251664,
1188
+ "grad_norm": 1.5103166103363037,
1189
+ "learning_rate": 0.0005498227051216414,
1190
+ "loss": 1.2848,
1191
+ "step": 16500
1192
+ },
1193
+ {
1194
+ "epoch": 0.9443085499744013,
1195
+ "grad_norm": 1.5895978212356567,
1196
+ "learning_rate": 0.000548305743595578,
1197
+ "loss": 1.293,
1198
+ "step": 16600
1199
+ },
1200
+ {
1201
+ "epoch": 0.9499971556971386,
1202
+ "grad_norm": 1.6526978015899658,
1203
+ "learning_rate": 0.0005467887820695148,
1204
+ "loss": 1.288,
1205
+ "step": 16700
1206
+ },
1207
+ {
1208
+ "epoch": 0.9556857614198759,
1209
+ "grad_norm": 1.7471717596054077,
1210
+ "learning_rate": 0.0005452718205434515,
1211
+ "loss": 1.3099,
1212
+ "step": 16800
1213
+ },
1214
+ {
1215
+ "epoch": 0.9613743671426134,
1216
+ "grad_norm": 1.5995450019836426,
1217
+ "learning_rate": 0.0005437700286326488,
1218
+ "loss": 1.2886,
1219
+ "step": 16900
1220
+ },
1221
+ {
1222
+ "epoch": 0.9670629728653507,
1223
+ "grad_norm": 1.7462047338485718,
1224
+ "learning_rate": 0.0005422530671065856,
1225
+ "loss": 1.317,
1226
+ "step": 17000
1227
+ },
1228
+ {
1229
+ "epoch": 0.972751578588088,
1230
+ "grad_norm": 1.5739308595657349,
1231
+ "learning_rate": 0.0005407361055805223,
1232
+ "loss": 1.2997,
1233
+ "step": 17100
1234
+ },
1235
+ {
1236
+ "epoch": 0.9784401843108255,
1237
+ "grad_norm": 1.6608139276504517,
1238
+ "learning_rate": 0.0005392191440544589,
1239
+ "loss": 1.3037,
1240
+ "step": 17200
1241
+ },
1242
+ {
1243
+ "epoch": 0.9841287900335628,
1244
+ "grad_norm": 1.7515637874603271,
1245
+ "learning_rate": 0.0005377021825283956,
1246
+ "loss": 1.302,
1247
+ "step": 17300
1248
+ },
1249
+ {
1250
+ "epoch": 0.9898173957563001,
1251
+ "grad_norm": 1.572986364364624,
1252
+ "learning_rate": 0.0005361852210023323,
1253
+ "loss": 1.2945,
1254
+ "step": 17400
1255
+ },
1256
+ {
1257
+ "epoch": 0.9955060014790375,
1258
+ "grad_norm": 1.9207016229629517,
1259
+ "learning_rate": 0.000534668259476269,
1260
+ "loss": 1.2747,
1261
+ "step": 17500
1262
+ },
1263
+ {
1264
+ "epoch": 1.0011946072017748,
1265
+ "grad_norm": 1.9010945558547974,
1266
+ "learning_rate": 0.0005331512979502058,
1267
+ "loss": 1.263,
1268
+ "step": 17600
1269
+ },
1270
+ {
1271
+ "epoch": 1.0068832129245122,
1272
+ "grad_norm": 2.4259393215179443,
1273
+ "learning_rate": 0.0005316343364241425,
1274
+ "loss": 1.2741,
1275
+ "step": 17700
1276
+ },
1277
+ {
1278
+ "epoch": 1.0125718186472497,
1279
+ "grad_norm": 2.5002028942108154,
1280
+ "learning_rate": 0.0005301173748980791,
1281
+ "loss": 1.2686,
1282
+ "step": 17800
1283
+ },
1284
+ {
1285
+ "epoch": 1.0182604243699869,
1286
+ "grad_norm": 1.7075704336166382,
1287
+ "learning_rate": 0.0005286004133720159,
1288
+ "loss": 1.2661,
1289
+ "step": 17900
1290
+ },
1291
+ {
1292
+ "epoch": 1.0239490300927243,
1293
+ "grad_norm": 1.7390458583831787,
1294
+ "learning_rate": 0.0005270834518459526,
1295
+ "loss": 1.2698,
1296
+ "step": 18000
1297
+ },
1298
+ {
1299
+ "epoch": 1.0296376358154615,
1300
+ "grad_norm": 1.980185627937317,
1301
+ "learning_rate": 0.0005255664903198893,
1302
+ "loss": 1.2569,
1303
+ "step": 18100
1304
+ },
1305
+ {
1306
+ "epoch": 1.035326241538199,
1307
+ "grad_norm": 1.79970383644104,
1308
+ "learning_rate": 0.0005240495287938261,
1309
+ "loss": 1.2738,
1310
+ "step": 18200
1311
+ },
1312
+ {
1313
+ "epoch": 1.0410148472609364,
1314
+ "grad_norm": 1.6184749603271484,
1315
+ "learning_rate": 0.0005225325672677627,
1316
+ "loss": 1.2637,
1317
+ "step": 18300
1318
+ },
1319
+ {
1320
+ "epoch": 1.0467034529836736,
1321
+ "grad_norm": 2.3463358879089355,
1322
+ "learning_rate": 0.0005210156057416993,
1323
+ "loss": 1.2665,
1324
+ "step": 18400
1325
+ },
1326
+ {
1327
+ "epoch": 1.052392058706411,
1328
+ "grad_norm": 1.8550745248794556,
1329
+ "learning_rate": 0.0005194986442156361,
1330
+ "loss": 1.2664,
1331
+ "step": 18500
1332
+ },
1333
+ {
1334
+ "epoch": 1.0580806644291485,
1335
+ "grad_norm": 1.8582580089569092,
1336
+ "learning_rate": 0.0005179816826895728,
1337
+ "loss": 1.2442,
1338
+ "step": 18600
1339
+ },
1340
+ {
1341
+ "epoch": 1.0637692701518857,
1342
+ "grad_norm": 1.88007390499115,
1343
+ "learning_rate": 0.0005164647211635095,
1344
+ "loss": 1.2536,
1345
+ "step": 18700
1346
+ },
1347
+ {
1348
+ "epoch": 1.0694578758746232,
1349
+ "grad_norm": 1.804671287536621,
1350
+ "learning_rate": 0.0005149477596374462,
1351
+ "loss": 1.2459,
1352
+ "step": 18800
1353
+ },
1354
+ {
1355
+ "epoch": 1.0751464815973604,
1356
+ "grad_norm": 1.7329107522964478,
1357
+ "learning_rate": 0.0005134307981113829,
1358
+ "loss": 1.2499,
1359
+ "step": 18900
1360
+ },
1361
+ {
1362
+ "epoch": 1.0808350873200978,
1363
+ "grad_norm": 1.693323016166687,
1364
+ "learning_rate": 0.0005119290062005802,
1365
+ "loss": 1.25,
1366
+ "step": 19000
1367
+ },
1368
+ {
1369
+ "epoch": 1.0865236930428352,
1370
+ "grad_norm": 1.600060224533081,
1371
+ "learning_rate": 0.000510412044674517,
1372
+ "loss": 1.2515,
1373
+ "step": 19100
1374
+ },
1375
+ {
1376
+ "epoch": 1.0922122987655725,
1377
+ "grad_norm": 1.8084614276885986,
1378
+ "learning_rate": 0.0005088950831484537,
1379
+ "loss": 1.246,
1380
+ "step": 19200
1381
+ },
1382
+ {
1383
+ "epoch": 1.09790090448831,
1384
+ "grad_norm": 1.8022205829620361,
1385
+ "learning_rate": 0.0005073781216223904,
1386
+ "loss": 1.2597,
1387
+ "step": 19300
1388
+ },
1389
+ {
1390
+ "epoch": 1.1035895102110473,
1391
+ "grad_norm": 1.6137562990188599,
1392
+ "learning_rate": 0.0005058611600963271,
1393
+ "loss": 1.2685,
1394
+ "step": 19400
1395
+ },
1396
+ {
1397
+ "epoch": 1.1092781159337846,
1398
+ "grad_norm": 1.7756201028823853,
1399
+ "learning_rate": 0.0005043441985702637,
1400
+ "loss": 1.2606,
1401
+ "step": 19500
1402
+ },
1403
+ {
1404
+ "epoch": 1.114966721656522,
1405
+ "grad_norm": 1.8828805685043335,
1406
+ "learning_rate": 0.0005028272370442004,
1407
+ "loss": 1.2582,
1408
+ "step": 19600
1409
+ },
1410
+ {
1411
+ "epoch": 1.1206553273792594,
1412
+ "grad_norm": 1.6829185485839844,
1413
+ "learning_rate": 0.0005013102755181372,
1414
+ "loss": 1.2563,
1415
+ "step": 19700
1416
+ },
1417
+ {
1418
+ "epoch": 1.1263439331019967,
1419
+ "grad_norm": 1.6716195344924927,
1420
+ "learning_rate": 0.0004997933139920739,
1421
+ "loss": 1.2405,
1422
+ "step": 19800
1423
+ },
1424
+ {
1425
+ "epoch": 1.132032538824734,
1426
+ "grad_norm": 1.7629872560501099,
1427
+ "learning_rate": 0.0004982763524660106,
1428
+ "loss": 1.2649,
1429
+ "step": 19900
1430
+ },
1431
+ {
1432
+ "epoch": 1.1377211445474713,
1433
+ "grad_norm": 1.704967737197876,
1434
+ "learning_rate": 0.0004967593909399473,
1435
+ "loss": 1.226,
1436
+ "step": 20000
1437
+ },
1438
+ {
1439
+ "epoch": 1.1377211445474713,
1440
+ "eval_accuracy": 0.692396,
1441
+ "eval_loss": 1.2322564125061035,
1442
+ "eval_runtime": 82.0826,
1443
+ "eval_samples_per_second": 3045.712,
1444
+ "eval_steps_per_second": 11.903,
1445
+ "step": 20000
1446
+ },
1447
+ {
1448
+ "epoch": 1.1434097502702087,
1449
+ "grad_norm": 1.5182781219482422,
1450
+ "learning_rate": 0.000495242429413884,
1451
+ "loss": 1.2343,
1452
+ "step": 20100
1453
+ },
1454
+ {
1455
+ "epoch": 1.1490983559929462,
1456
+ "grad_norm": 2.637796640396118,
1457
+ "learning_rate": 0.0004937254678878207,
1458
+ "loss": 1.2506,
1459
+ "step": 20200
1460
+ },
1461
+ {
1462
+ "epoch": 1.1547869617156834,
1463
+ "grad_norm": 1.8955748081207275,
1464
+ "learning_rate": 0.0004922085063617575,
1465
+ "loss": 1.2625,
1466
+ "step": 20300
1467
+ },
1468
+ {
1469
+ "epoch": 1.1604755674384208,
1470
+ "grad_norm": 2.0370640754699707,
1471
+ "learning_rate": 0.0004906915448356942,
1472
+ "loss": 1.2551,
1473
+ "step": 20400
1474
+ },
1475
+ {
1476
+ "epoch": 1.1661641731611583,
1477
+ "grad_norm": 1.8047020435333252,
1478
+ "learning_rate": 0.0004891745833096308,
1479
+ "loss": 1.2489,
1480
+ "step": 20500
1481
+ },
1482
+ {
1483
+ "epoch": 1.1718527788838955,
1484
+ "grad_norm": 1.5440089702606201,
1485
+ "learning_rate": 0.0004876576217835675,
1486
+ "loss": 1.2646,
1487
+ "step": 20600
1488
+ },
1489
+ {
1490
+ "epoch": 1.177541384606633,
1491
+ "grad_norm": 1.5029830932617188,
1492
+ "learning_rate": 0.00048614066025750425,
1493
+ "loss": 1.2536,
1494
+ "step": 20700
1495
+ },
1496
+ {
1497
+ "epoch": 1.1832299903293704,
1498
+ "grad_norm": 1.4674205780029297,
1499
+ "learning_rate": 0.0004846236987314409,
1500
+ "loss": 1.2457,
1501
+ "step": 20800
1502
+ },
1503
+ {
1504
+ "epoch": 1.1889185960521076,
1505
+ "grad_norm": 1.5259037017822266,
1506
+ "learning_rate": 0.00048310673720537765,
1507
+ "loss": 1.2449,
1508
+ "step": 20900
1509
+ },
1510
+ {
1511
+ "epoch": 1.194607201774845,
1512
+ "grad_norm": 1.6339012384414673,
1513
+ "learning_rate": 0.0004815897756793144,
1514
+ "loss": 1.2163,
1515
+ "step": 21000
1516
+ },
1517
+ {
1518
+ "epoch": 1.2002958074975822,
1519
+ "grad_norm": 1.5565885305404663,
1520
+ "learning_rate": 0.00048007281415325106,
1521
+ "loss": 1.2461,
1522
+ "step": 21100
1523
+ },
1524
+ {
1525
+ "epoch": 1.2059844132203197,
1526
+ "grad_norm": 1.676540493965149,
1527
+ "learning_rate": 0.0004785558526271878,
1528
+ "loss": 1.2555,
1529
+ "step": 21200
1530
+ },
1531
+ {
1532
+ "epoch": 1.2116730189430571,
1533
+ "grad_norm": 1.6003342866897583,
1534
+ "learning_rate": 0.00047705406071638513,
1535
+ "loss": 1.2507,
1536
+ "step": 21300
1537
+ },
1538
+ {
1539
+ "epoch": 1.2173616246657943,
1540
+ "grad_norm": 2.2655630111694336,
1541
+ "learning_rate": 0.00047553709919032175,
1542
+ "loss": 1.2144,
1543
+ "step": 21400
1544
+ },
1545
+ {
1546
+ "epoch": 1.2230502303885318,
1547
+ "grad_norm": 1.695094108581543,
1548
+ "learning_rate": 0.0004740201376642585,
1549
+ "loss": 1.2415,
1550
+ "step": 21500
1551
+ },
1552
+ {
1553
+ "epoch": 1.2287388361112692,
1554
+ "grad_norm": 1.8387731313705444,
1555
+ "learning_rate": 0.0004725031761381952,
1556
+ "loss": 1.2406,
1557
+ "step": 21600
1558
+ },
1559
+ {
1560
+ "epoch": 1.2344274418340064,
1561
+ "grad_norm": 1.6776598691940308,
1562
+ "learning_rate": 0.0004709862146121319,
1563
+ "loss": 1.2673,
1564
+ "step": 21700
1565
+ },
1566
+ {
1567
+ "epoch": 1.2401160475567439,
1568
+ "grad_norm": 1.6573506593704224,
1569
+ "learning_rate": 0.0004694692530860686,
1570
+ "loss": 1.2587,
1571
+ "step": 21800
1572
+ },
1573
+ {
1574
+ "epoch": 1.2458046532794813,
1575
+ "grad_norm": 1.6786317825317383,
1576
+ "learning_rate": 0.00046795229156000535,
1577
+ "loss": 1.2464,
1578
+ "step": 21900
1579
+ },
1580
+ {
1581
+ "epoch": 1.2514932590022185,
1582
+ "grad_norm": 1.887971043586731,
1583
+ "learning_rate": 0.00046643533003394203,
1584
+ "loss": 1.2501,
1585
+ "step": 22000
1586
+ },
1587
+ {
1588
+ "epoch": 1.257181864724956,
1589
+ "grad_norm": 1.7499343156814575,
1590
+ "learning_rate": 0.00046491836850787876,
1591
+ "loss": 1.2296,
1592
+ "step": 22100
1593
+ },
1594
+ {
1595
+ "epoch": 1.2628704704476932,
1596
+ "grad_norm": 2.057670831680298,
1597
+ "learning_rate": 0.00046340140698181544,
1598
+ "loss": 1.2346,
1599
+ "step": 22200
1600
+ },
1601
+ {
1602
+ "epoch": 1.2685590761704306,
1603
+ "grad_norm": 1.7353135347366333,
1604
+ "learning_rate": 0.00046188444545575217,
1605
+ "loss": 1.2512,
1606
+ "step": 22300
1607
+ },
1608
+ {
1609
+ "epoch": 1.274247681893168,
1610
+ "grad_norm": 2.0662734508514404,
1611
+ "learning_rate": 0.0004603674839296889,
1612
+ "loss": 1.2493,
1613
+ "step": 22400
1614
+ },
1615
+ {
1616
+ "epoch": 1.2799362876159053,
1617
+ "grad_norm": 1.5519914627075195,
1618
+ "learning_rate": 0.0004588505224036255,
1619
+ "loss": 1.2404,
1620
+ "step": 22500
1621
+ },
1622
+ {
1623
+ "epoch": 1.2856248933386427,
1624
+ "grad_norm": 1.8667906522750854,
1625
+ "learning_rate": 0.0004573335608775622,
1626
+ "loss": 1.247,
1627
+ "step": 22600
1628
+ },
1629
+ {
1630
+ "epoch": 1.29131349906138,
1631
+ "grad_norm": 1.8621453046798706,
1632
+ "learning_rate": 0.00045581659935149893,
1633
+ "loss": 1.2428,
1634
+ "step": 22700
1635
+ },
1636
+ {
1637
+ "epoch": 1.2970021047841174,
1638
+ "grad_norm": 1.7203937768936157,
1639
+ "learning_rate": 0.00045429963782543566,
1640
+ "loss": 1.2273,
1641
+ "step": 22800
1642
+ },
1643
+ {
1644
+ "epoch": 1.3026907105068548,
1645
+ "grad_norm": 1.7497667074203491,
1646
+ "learning_rate": 0.00045278267629937234,
1647
+ "loss": 1.2458,
1648
+ "step": 22900
1649
+ },
1650
+ {
1651
+ "epoch": 1.3083793162295922,
1652
+ "grad_norm": 2.057507276535034,
1653
+ "learning_rate": 0.0004512657147733091,
1654
+ "loss": 1.2325,
1655
+ "step": 23000
1656
+ },
1657
+ {
1658
+ "epoch": 1.3140679219523295,
1659
+ "grad_norm": 1.4594337940216064,
1660
+ "learning_rate": 0.0004497487532472458,
1661
+ "loss": 1.2319,
1662
+ "step": 23100
1663
+ },
1664
+ {
1665
+ "epoch": 1.319756527675067,
1666
+ "grad_norm": 2.1696736812591553,
1667
+ "learning_rate": 0.0004482317917211825,
1668
+ "loss": 1.234,
1669
+ "step": 23200
1670
+ },
1671
+ {
1672
+ "epoch": 1.3254451333978041,
1673
+ "grad_norm": 1.8165019750595093,
1674
+ "learning_rate": 0.0004467148301951192,
1675
+ "loss": 1.2256,
1676
+ "step": 23300
1677
+ },
1678
+ {
1679
+ "epoch": 1.3311337391205416,
1680
+ "grad_norm": 1.5531728267669678,
1681
+ "learning_rate": 0.00044519786866905594,
1682
+ "loss": 1.2518,
1683
+ "step": 23400
1684
+ },
1685
+ {
1686
+ "epoch": 1.336822344843279,
1687
+ "grad_norm": 1.4592831134796143,
1688
+ "learning_rate": 0.0004436809071429926,
1689
+ "loss": 1.2192,
1690
+ "step": 23500
1691
+ },
1692
+ {
1693
+ "epoch": 1.3425109505660162,
1694
+ "grad_norm": 1.74478280544281,
1695
+ "learning_rate": 0.00044216394561692935,
1696
+ "loss": 1.2427,
1697
+ "step": 23600
1698
+ },
1699
+ {
1700
+ "epoch": 1.3481995562887537,
1701
+ "grad_norm": 1.8685113191604614,
1702
+ "learning_rate": 0.000440646984090866,
1703
+ "loss": 1.2581,
1704
+ "step": 23700
1705
+ },
1706
+ {
1707
+ "epoch": 1.3538881620114909,
1708
+ "grad_norm": 1.7366535663604736,
1709
+ "learning_rate": 0.0004391300225648027,
1710
+ "loss": 1.2471,
1711
+ "step": 23800
1712
+ },
1713
+ {
1714
+ "epoch": 1.3595767677342283,
1715
+ "grad_norm": 1.6585444211959839,
1716
+ "learning_rate": 0.0004376130610387394,
1717
+ "loss": 1.2198,
1718
+ "step": 23900
1719
+ },
1720
+ {
1721
+ "epoch": 1.3652653734569657,
1722
+ "grad_norm": 1.9299806356430054,
1723
+ "learning_rate": 0.0004360960995126761,
1724
+ "loss": 1.2314,
1725
+ "step": 24000
1726
+ },
1727
+ {
1728
+ "epoch": 1.3709539791797032,
1729
+ "grad_norm": 1.8172481060028076,
1730
+ "learning_rate": 0.00043457913798661285,
1731
+ "loss": 1.2098,
1732
+ "step": 24100
1733
+ },
1734
+ {
1735
+ "epoch": 1.3766425849024404,
1736
+ "grad_norm": 1.5579493045806885,
1737
+ "learning_rate": 0.0004330621764605495,
1738
+ "loss": 1.2043,
1739
+ "step": 24200
1740
+ },
1741
+ {
1742
+ "epoch": 1.3823311906251778,
1743
+ "grad_norm": 1.8178203105926514,
1744
+ "learning_rate": 0.00043154521493448625,
1745
+ "loss": 1.2235,
1746
+ "step": 24300
1747
+ },
1748
+ {
1749
+ "epoch": 1.388019796347915,
1750
+ "grad_norm": 1.676126480102539,
1751
+ "learning_rate": 0.000430028253408423,
1752
+ "loss": 1.2388,
1753
+ "step": 24400
1754
+ },
1755
+ {
1756
+ "epoch": 1.3937084020706525,
1757
+ "grad_norm": 1.863893985748291,
1758
+ "learning_rate": 0.00042851129188235966,
1759
+ "loss": 1.2206,
1760
+ "step": 24500
1761
+ },
1762
+ {
1763
+ "epoch": 1.39939700779339,
1764
+ "grad_norm": 1.5618318319320679,
1765
+ "learning_rate": 0.0004269943303562964,
1766
+ "loss": 1.2359,
1767
+ "step": 24600
1768
+ },
1769
+ {
1770
+ "epoch": 1.4050856135161272,
1771
+ "grad_norm": 1.3972681760787964,
1772
+ "learning_rate": 0.0004254773688302331,
1773
+ "loss": 1.2152,
1774
+ "step": 24700
1775
+ },
1776
+ {
1777
+ "epoch": 1.4107742192388646,
1778
+ "grad_norm": 1.584274411201477,
1779
+ "learning_rate": 0.00042396040730416975,
1780
+ "loss": 1.211,
1781
+ "step": 24800
1782
+ },
1783
+ {
1784
+ "epoch": 1.4164628249616018,
1785
+ "grad_norm": 1.7282276153564453,
1786
+ "learning_rate": 0.0004224434457781064,
1787
+ "loss": 1.2034,
1788
+ "step": 24900
1789
+ },
1790
+ {
1791
+ "epoch": 1.4221514306843392,
1792
+ "grad_norm": 2.2420654296875,
1793
+ "learning_rate": 0.00042092648425204316,
1794
+ "loss": 1.2125,
1795
+ "step": 25000
1796
+ },
1797
+ {
1798
+ "epoch": 1.4221514306843392,
1799
+ "eval_accuracy": 0.703072,
1800
+ "eval_loss": 1.185011863708496,
1801
+ "eval_runtime": 81.7158,
1802
+ "eval_samples_per_second": 3059.385,
1803
+ "eval_steps_per_second": 11.956,
1804
+ "step": 25000
1805
+ },
1806
+ {
1807
+ "epoch": 1.4278400364070767,
1808
+ "grad_norm": 1.5998648405075073,
1809
+ "learning_rate": 0.0004194095227259799,
1810
+ "loss": 1.2145,
1811
+ "step": 25100
1812
+ },
1813
+ {
1814
+ "epoch": 1.433528642129814,
1815
+ "grad_norm": 1.8546173572540283,
1816
+ "learning_rate": 0.00041789256119991656,
1817
+ "loss": 1.2293,
1818
+ "step": 25200
1819
+ },
1820
+ {
1821
+ "epoch": 1.4392172478525513,
1822
+ "grad_norm": 1.6815022230148315,
1823
+ "learning_rate": 0.00041639076928911395,
1824
+ "loss": 1.2165,
1825
+ "step": 25300
1826
+ },
1827
+ {
1828
+ "epoch": 1.4449058535752888,
1829
+ "grad_norm": 1.5567988157272339,
1830
+ "learning_rate": 0.0004148889773783113,
1831
+ "loss": 1.2231,
1832
+ "step": 25400
1833
+ },
1834
+ {
1835
+ "epoch": 1.450594459298026,
1836
+ "grad_norm": 1.9424728155136108,
1837
+ "learning_rate": 0.000413372015852248,
1838
+ "loss": 1.2302,
1839
+ "step": 25500
1840
+ },
1841
+ {
1842
+ "epoch": 1.4562830650207634,
1843
+ "grad_norm": 1.9052510261535645,
1844
+ "learning_rate": 0.00041185505432618464,
1845
+ "loss": 1.2174,
1846
+ "step": 25600
1847
+ },
1848
+ {
1849
+ "epoch": 1.4619716707435009,
1850
+ "grad_norm": 1.470513939857483,
1851
+ "learning_rate": 0.0004103380928001213,
1852
+ "loss": 1.2167,
1853
+ "step": 25700
1854
+ },
1855
+ {
1856
+ "epoch": 1.467660276466238,
1857
+ "grad_norm": 1.5082899332046509,
1858
+ "learning_rate": 0.00040882113127405805,
1859
+ "loss": 1.2098,
1860
+ "step": 25800
1861
+ },
1862
+ {
1863
+ "epoch": 1.4733488821889755,
1864
+ "grad_norm": 1.7309447526931763,
1865
+ "learning_rate": 0.0004073041697479948,
1866
+ "loss": 1.2111,
1867
+ "step": 25900
1868
+ },
1869
+ {
1870
+ "epoch": 1.4790374879117127,
1871
+ "grad_norm": 1.7894470691680908,
1872
+ "learning_rate": 0.00040578720822193146,
1873
+ "loss": 1.2237,
1874
+ "step": 26000
1875
+ },
1876
+ {
1877
+ "epoch": 1.4847260936344502,
1878
+ "grad_norm": 1.712557077407837,
1879
+ "learning_rate": 0.0004042702466958682,
1880
+ "loss": 1.2314,
1881
+ "step": 26100
1882
+ },
1883
+ {
1884
+ "epoch": 1.4904146993571876,
1885
+ "grad_norm": 1.794565200805664,
1886
+ "learning_rate": 0.0004027532851698049,
1887
+ "loss": 1.2072,
1888
+ "step": 26200
1889
+ },
1890
+ {
1891
+ "epoch": 1.4961033050799248,
1892
+ "grad_norm": 1.655179500579834,
1893
+ "learning_rate": 0.0004012363236437416,
1894
+ "loss": 1.218,
1895
+ "step": 26300
1896
+ },
1897
+ {
1898
+ "epoch": 1.5017919108026623,
1899
+ "grad_norm": 1.8749343156814575,
1900
+ "learning_rate": 0.00039971936211767833,
1901
+ "loss": 1.2037,
1902
+ "step": 26400
1903
+ },
1904
+ {
1905
+ "epoch": 1.5074805165253995,
1906
+ "grad_norm": 1.5337320566177368,
1907
+ "learning_rate": 0.000398202400591615,
1908
+ "loss": 1.2179,
1909
+ "step": 26500
1910
+ },
1911
+ {
1912
+ "epoch": 1.513169122248137,
1913
+ "grad_norm": 1.5731686353683472,
1914
+ "learning_rate": 0.0003966854390655517,
1915
+ "loss": 1.2037,
1916
+ "step": 26600
1917
+ },
1918
+ {
1919
+ "epoch": 1.5188577279708744,
1920
+ "grad_norm": 1.5700329542160034,
1921
+ "learning_rate": 0.0003951684775394884,
1922
+ "loss": 1.2189,
1923
+ "step": 26700
1924
+ },
1925
+ {
1926
+ "epoch": 1.5245463336936118,
1927
+ "grad_norm": 1.9315118789672852,
1928
+ "learning_rate": 0.00039365151601342515,
1929
+ "loss": 1.2314,
1930
+ "step": 26800
1931
+ },
1932
+ {
1933
+ "epoch": 1.530234939416349,
1934
+ "grad_norm": 1.6017844676971436,
1935
+ "learning_rate": 0.0003921345544873618,
1936
+ "loss": 1.211,
1937
+ "step": 26900
1938
+ },
1939
+ {
1940
+ "epoch": 1.5359235451390862,
1941
+ "grad_norm": 1.586595058441162,
1942
+ "learning_rate": 0.0003906175929612985,
1943
+ "loss": 1.2079,
1944
+ "step": 27000
1945
+ },
1946
+ {
1947
+ "epoch": 1.5416121508618237,
1948
+ "grad_norm": 1.8215593099594116,
1949
+ "learning_rate": 0.00038910063143523523,
1950
+ "loss": 1.2022,
1951
+ "step": 27100
1952
+ },
1953
+ {
1954
+ "epoch": 1.5473007565845611,
1955
+ "grad_norm": 1.7390124797821045,
1956
+ "learning_rate": 0.00038758366990917196,
1957
+ "loss": 1.2143,
1958
+ "step": 27200
1959
+ },
1960
+ {
1961
+ "epoch": 1.5529893623072986,
1962
+ "grad_norm": 1.792608618736267,
1963
+ "learning_rate": 0.00038606670838310864,
1964
+ "loss": 1.2104,
1965
+ "step": 27300
1966
+ },
1967
+ {
1968
+ "epoch": 1.558677968030036,
1969
+ "grad_norm": 1.802167296409607,
1970
+ "learning_rate": 0.00038454974685704537,
1971
+ "loss": 1.1924,
1972
+ "step": 27400
1973
+ },
1974
+ {
1975
+ "epoch": 1.5643665737527732,
1976
+ "grad_norm": 1.7943332195281982,
1977
+ "learning_rate": 0.0003830327853309821,
1978
+ "loss": 1.2096,
1979
+ "step": 27500
1980
+ },
1981
+ {
1982
+ "epoch": 1.5700551794755104,
1983
+ "grad_norm": 1.745893120765686,
1984
+ "learning_rate": 0.0003815158238049187,
1985
+ "loss": 1.1941,
1986
+ "step": 27600
1987
+ },
1988
+ {
1989
+ "epoch": 1.5757437851982479,
1990
+ "grad_norm": 1.6740118265151978,
1991
+ "learning_rate": 0.00037999886227885546,
1992
+ "loss": 1.2355,
1993
+ "step": 27700
1994
+ },
1995
+ {
1996
+ "epoch": 1.5814323909209853,
1997
+ "grad_norm": 1.681840419769287,
1998
+ "learning_rate": 0.0003784970703680528,
1999
+ "loss": 1.2034,
2000
+ "step": 27800
2001
+ },
2002
+ {
2003
+ "epoch": 1.5871209966437227,
2004
+ "grad_norm": 1.6897751092910767,
2005
+ "learning_rate": 0.0003769801088419895,
2006
+ "loss": 1.2169,
2007
+ "step": 27900
2008
+ },
2009
+ {
2010
+ "epoch": 1.59280960236646,
2011
+ "grad_norm": 1.686784267425537,
2012
+ "learning_rate": 0.0003754631473159262,
2013
+ "loss": 1.2093,
2014
+ "step": 28000
2015
+ },
2016
+ {
2017
+ "epoch": 1.5984982080891972,
2018
+ "grad_norm": 1.6020421981811523,
2019
+ "learning_rate": 0.00037394618578986293,
2020
+ "loss": 1.2131,
2021
+ "step": 28100
2022
+ },
2023
+ {
2024
+ "epoch": 1.6041868138119346,
2025
+ "grad_norm": 1.478246808052063,
2026
+ "learning_rate": 0.0003724292242637996,
2027
+ "loss": 1.2048,
2028
+ "step": 28200
2029
+ },
2030
+ {
2031
+ "epoch": 1.609875419534672,
2032
+ "grad_norm": 1.4912410974502563,
2033
+ "learning_rate": 0.00037091226273773634,
2034
+ "loss": 1.18,
2035
+ "step": 28300
2036
+ },
2037
+ {
2038
+ "epoch": 1.6155640252574095,
2039
+ "grad_norm": 1.6362539529800415,
2040
+ "learning_rate": 0.00036939530121167307,
2041
+ "loss": 1.2022,
2042
+ "step": 28400
2043
+ },
2044
+ {
2045
+ "epoch": 1.6212526309801467,
2046
+ "grad_norm": 1.5238479375839233,
2047
+ "learning_rate": 0.00036787833968560975,
2048
+ "loss": 1.2105,
2049
+ "step": 28500
2050
+ },
2051
+ {
2052
+ "epoch": 1.6269412367028842,
2053
+ "grad_norm": 1.6359635591506958,
2054
+ "learning_rate": 0.0003663613781595464,
2055
+ "loss": 1.1833,
2056
+ "step": 28600
2057
+ },
2058
+ {
2059
+ "epoch": 1.6326298424256214,
2060
+ "grad_norm": 1.6206257343292236,
2061
+ "learning_rate": 0.00036484441663348316,
2062
+ "loss": 1.1945,
2063
+ "step": 28700
2064
+ },
2065
+ {
2066
+ "epoch": 1.6383184481483588,
2067
+ "grad_norm": 1.7032015323638916,
2068
+ "learning_rate": 0.00036332745510741983,
2069
+ "loss": 1.2065,
2070
+ "step": 28800
2071
+ },
2072
+ {
2073
+ "epoch": 1.6440070538710962,
2074
+ "grad_norm": 1.7177228927612305,
2075
+ "learning_rate": 0.00036181049358135657,
2076
+ "loss": 1.2035,
2077
+ "step": 28900
2078
+ },
2079
+ {
2080
+ "epoch": 1.6496956595938337,
2081
+ "grad_norm": 1.5967752933502197,
2082
+ "learning_rate": 0.0003602935320552933,
2083
+ "loss": 1.2036,
2084
+ "step": 29000
2085
+ },
2086
+ {
2087
+ "epoch": 1.655384265316571,
2088
+ "grad_norm": 1.6632803678512573,
2089
+ "learning_rate": 0.00035877657052923,
2090
+ "loss": 1.226,
2091
+ "step": 29100
2092
+ },
2093
+ {
2094
+ "epoch": 1.6610728710393081,
2095
+ "grad_norm": 1.5134357213974,
2096
+ "learning_rate": 0.00035725960900316665,
2097
+ "loss": 1.1947,
2098
+ "step": 29200
2099
+ },
2100
+ {
2101
+ "epoch": 1.6667614767620456,
2102
+ "grad_norm": 1.5506322383880615,
2103
+ "learning_rate": 0.0003557426474771034,
2104
+ "loss": 1.1963,
2105
+ "step": 29300
2106
+ },
2107
+ {
2108
+ "epoch": 1.672450082484783,
2109
+ "grad_norm": 1.4821183681488037,
2110
+ "learning_rate": 0.0003542256859510401,
2111
+ "loss": 1.2039,
2112
+ "step": 29400
2113
+ },
2114
+ {
2115
+ "epoch": 1.6781386882075204,
2116
+ "grad_norm": 2.278379440307617,
2117
+ "learning_rate": 0.0003527087244249768,
2118
+ "loss": 1.205,
2119
+ "step": 29500
2120
+ },
2121
+ {
2122
+ "epoch": 1.6838272939302577,
2123
+ "grad_norm": 1.5077921152114868,
2124
+ "learning_rate": 0.0003511917628989135,
2125
+ "loss": 1.1984,
2126
+ "step": 29600
2127
+ },
2128
+ {
2129
+ "epoch": 1.689515899652995,
2130
+ "grad_norm": 1.629607915878296,
2131
+ "learning_rate": 0.0003496748013728502,
2132
+ "loss": 1.2023,
2133
+ "step": 29700
2134
+ },
2135
+ {
2136
+ "epoch": 1.6952045053757323,
2137
+ "grad_norm": 1.5007668733596802,
2138
+ "learning_rate": 0.0003481578398467869,
2139
+ "loss": 1.1907,
2140
+ "step": 29800
2141
+ },
2142
+ {
2143
+ "epoch": 1.7008931110984697,
2144
+ "grad_norm": 1.7543882131576538,
2145
+ "learning_rate": 0.0003466408783207236,
2146
+ "loss": 1.1949,
2147
+ "step": 29900
2148
+ },
2149
+ {
2150
+ "epoch": 1.7065817168212072,
2151
+ "grad_norm": 1.6254594326019287,
2152
+ "learning_rate": 0.00034512391679466034,
2153
+ "loss": 1.1912,
2154
+ "step": 30000
2155
+ },
2156
+ {
2157
+ "epoch": 1.7065817168212072,
2158
+ "eval_accuracy": 0.709248,
2159
+ "eval_loss": 1.1566522121429443,
2160
+ "eval_runtime": 79.5399,
2161
+ "eval_samples_per_second": 3143.077,
2162
+ "eval_steps_per_second": 12.283,
2163
+ "step": 30000
2164
+ },
2165
+ {
2166
+ "epoch": 1.7122703225439446,
2167
+ "grad_norm": 1.873049020767212,
2168
+ "learning_rate": 0.000343606955268597,
2169
+ "loss": 1.2063,
2170
+ "step": 30100
2171
+ },
2172
+ {
2173
+ "epoch": 1.7179589282666818,
2174
+ "grad_norm": 1.5862141847610474,
2175
+ "learning_rate": 0.00034208999374253375,
2176
+ "loss": 1.1926,
2177
+ "step": 30200
2178
+ },
2179
+ {
2180
+ "epoch": 1.723647533989419,
2181
+ "grad_norm": 1.9915696382522583,
2182
+ "learning_rate": 0.0003405730322164704,
2183
+ "loss": 1.1952,
2184
+ "step": 30300
2185
+ },
2186
+ {
2187
+ "epoch": 1.7293361397121565,
2188
+ "grad_norm": 1.856048822402954,
2189
+ "learning_rate": 0.0003390560706904071,
2190
+ "loss": 1.1953,
2191
+ "step": 30400
2192
+ },
2193
+ {
2194
+ "epoch": 1.735024745434894,
2195
+ "grad_norm": 1.6758267879486084,
2196
+ "learning_rate": 0.00033753910916434383,
2197
+ "loss": 1.1906,
2198
+ "step": 30500
2199
+ },
2200
+ {
2201
+ "epoch": 1.7407133511576314,
2202
+ "grad_norm": 1.8683140277862549,
2203
+ "learning_rate": 0.00033602214763828056,
2204
+ "loss": 1.2025,
2205
+ "step": 30600
2206
+ },
2207
+ {
2208
+ "epoch": 1.7464019568803686,
2209
+ "grad_norm": 1.452721118927002,
2210
+ "learning_rate": 0.00033450518611221724,
2211
+ "loss": 1.1866,
2212
+ "step": 30700
2213
+ },
2214
+ {
2215
+ "epoch": 1.752090562603106,
2216
+ "grad_norm": 1.5711089372634888,
2217
+ "learning_rate": 0.0003329882245861539,
2218
+ "loss": 1.1856,
2219
+ "step": 30800
2220
+ },
2221
+ {
2222
+ "epoch": 1.7577791683258432,
2223
+ "grad_norm": 2.0584185123443604,
2224
+ "learning_rate": 0.00033147126306009065,
2225
+ "loss": 1.1873,
2226
+ "step": 30900
2227
+ },
2228
+ {
2229
+ "epoch": 1.7634677740485807,
2230
+ "grad_norm": 1.5743275880813599,
2231
+ "learning_rate": 0.0003299543015340274,
2232
+ "loss": 1.1988,
2233
+ "step": 31000
2234
+ },
2235
+ {
2236
+ "epoch": 1.7691563797713181,
2237
+ "grad_norm": 1.5788936614990234,
2238
+ "learning_rate": 0.00032843734000796406,
2239
+ "loss": 1.1932,
2240
+ "step": 31100
2241
+ },
2242
+ {
2243
+ "epoch": 1.7748449854940556,
2244
+ "grad_norm": 1.6406651735305786,
2245
+ "learning_rate": 0.0003269203784819008,
2246
+ "loss": 1.1876,
2247
+ "step": 31200
2248
+ },
2249
+ {
2250
+ "epoch": 1.7805335912167928,
2251
+ "grad_norm": 1.6410019397735596,
2252
+ "learning_rate": 0.00032540341695583747,
2253
+ "loss": 1.1859,
2254
+ "step": 31300
2255
+ },
2256
+ {
2257
+ "epoch": 1.78622219693953,
2258
+ "grad_norm": 1.548140287399292,
2259
+ "learning_rate": 0.00032388645542977414,
2260
+ "loss": 1.2032,
2261
+ "step": 31400
2262
+ },
2263
+ {
2264
+ "epoch": 1.7919108026622674,
2265
+ "grad_norm": 1.9242947101593018,
2266
+ "learning_rate": 0.0003223694939037109,
2267
+ "loss": 1.199,
2268
+ "step": 31500
2269
+ },
2270
+ {
2271
+ "epoch": 1.7975994083850049,
2272
+ "grad_norm": 2.0189428329467773,
2273
+ "learning_rate": 0.0003208525323776476,
2274
+ "loss": 1.1832,
2275
+ "step": 31600
2276
+ },
2277
+ {
2278
+ "epoch": 1.8032880141077423,
2279
+ "grad_norm": 1.740432620048523,
2280
+ "learning_rate": 0.0003193355708515843,
2281
+ "loss": 1.183,
2282
+ "step": 31700
2283
+ },
2284
+ {
2285
+ "epoch": 1.8089766198304795,
2286
+ "grad_norm": 1.743503451347351,
2287
+ "learning_rate": 0.0003178337789407816,
2288
+ "loss": 1.1991,
2289
+ "step": 31800
2290
+ },
2291
+ {
2292
+ "epoch": 1.8146652255532167,
2293
+ "grad_norm": 1.7166736125946045,
2294
+ "learning_rate": 0.00031631681741471835,
2295
+ "loss": 1.2031,
2296
+ "step": 31900
2297
+ },
2298
+ {
2299
+ "epoch": 1.8203538312759542,
2300
+ "grad_norm": 1.626386046409607,
2301
+ "learning_rate": 0.000314799855888655,
2302
+ "loss": 1.1987,
2303
+ "step": 32000
2304
+ },
2305
+ {
2306
+ "epoch": 1.8260424369986916,
2307
+ "grad_norm": 1.5402131080627441,
2308
+ "learning_rate": 0.00031328289436259176,
2309
+ "loss": 1.172,
2310
+ "step": 32100
2311
+ },
2312
+ {
2313
+ "epoch": 1.831731042721429,
2314
+ "grad_norm": 1.6522256135940552,
2315
+ "learning_rate": 0.0003117659328365285,
2316
+ "loss": 1.179,
2317
+ "step": 32200
2318
+ },
2319
+ {
2320
+ "epoch": 1.8374196484441665,
2321
+ "grad_norm": 1.482009768486023,
2322
+ "learning_rate": 0.00031024897131046517,
2323
+ "loss": 1.1903,
2324
+ "step": 32300
2325
+ },
2326
+ {
2327
+ "epoch": 1.8431082541669037,
2328
+ "grad_norm": 1.6417380571365356,
2329
+ "learning_rate": 0.00030873200978440184,
2330
+ "loss": 1.2024,
2331
+ "step": 32400
2332
+ },
2333
+ {
2334
+ "epoch": 1.848796859889641,
2335
+ "grad_norm": 1.532333493232727,
2336
+ "learning_rate": 0.0003072150482583386,
2337
+ "loss": 1.1843,
2338
+ "step": 32500
2339
+ },
2340
+ {
2341
+ "epoch": 1.8544854656123784,
2342
+ "grad_norm": 2.004293441772461,
2343
+ "learning_rate": 0.00030569808673227525,
2344
+ "loss": 1.192,
2345
+ "step": 32600
2346
+ },
2347
+ {
2348
+ "epoch": 1.8601740713351158,
2349
+ "grad_norm": 1.7226125001907349,
2350
+ "learning_rate": 0.000304181125206212,
2351
+ "loss": 1.1902,
2352
+ "step": 32700
2353
+ },
2354
+ {
2355
+ "epoch": 1.8658626770578532,
2356
+ "grad_norm": 1.7714165449142456,
2357
+ "learning_rate": 0.0003026641636801487,
2358
+ "loss": 1.1908,
2359
+ "step": 32800
2360
+ },
2361
+ {
2362
+ "epoch": 1.8715512827805905,
2363
+ "grad_norm": 1.5100337266921997,
2364
+ "learning_rate": 0.00030114720215408534,
2365
+ "loss": 1.1735,
2366
+ "step": 32900
2367
+ },
2368
+ {
2369
+ "epoch": 1.8772398885033277,
2370
+ "grad_norm": 1.6792744398117065,
2371
+ "learning_rate": 0.00029963024062802207,
2372
+ "loss": 1.191,
2373
+ "step": 33000
2374
+ },
2375
+ {
2376
+ "epoch": 1.8829284942260651,
2377
+ "grad_norm": 1.705554723739624,
2378
+ "learning_rate": 0.0002981132791019588,
2379
+ "loss": 1.1878,
2380
+ "step": 33100
2381
+ },
2382
+ {
2383
+ "epoch": 1.8886170999488026,
2384
+ "grad_norm": 1.4528917074203491,
2385
+ "learning_rate": 0.0002965963175758955,
2386
+ "loss": 1.1685,
2387
+ "step": 33200
2388
+ },
2389
+ {
2390
+ "epoch": 1.89430570567154,
2391
+ "grad_norm": 1.7752711772918701,
2392
+ "learning_rate": 0.0002950793560498322,
2393
+ "loss": 1.1743,
2394
+ "step": 33300
2395
+ },
2396
+ {
2397
+ "epoch": 1.8999943113942772,
2398
+ "grad_norm": 1.762074589729309,
2399
+ "learning_rate": 0.00029356239452376894,
2400
+ "loss": 1.1775,
2401
+ "step": 33400
2402
+ },
2403
+ {
2404
+ "epoch": 1.9056829171170147,
2405
+ "grad_norm": 1.6388828754425049,
2406
+ "learning_rate": 0.0002920454329977056,
2407
+ "loss": 1.1762,
2408
+ "step": 33500
2409
+ },
2410
+ {
2411
+ "epoch": 1.9113715228397519,
2412
+ "grad_norm": 1.5171791315078735,
2413
+ "learning_rate": 0.0002905284714716423,
2414
+ "loss": 1.1649,
2415
+ "step": 33600
2416
+ },
2417
+ {
2418
+ "epoch": 1.9170601285624893,
2419
+ "grad_norm": 1.6547460556030273,
2420
+ "learning_rate": 0.000289011509945579,
2421
+ "loss": 1.1904,
2422
+ "step": 33700
2423
+ },
2424
+ {
2425
+ "epoch": 1.9227487342852267,
2426
+ "grad_norm": 1.705083966255188,
2427
+ "learning_rate": 0.00028750971803477636,
2428
+ "loss": 1.1667,
2429
+ "step": 33800
2430
+ },
2431
+ {
2432
+ "epoch": 1.9284373400079642,
2433
+ "grad_norm": 1.731803059577942,
2434
+ "learning_rate": 0.00028599275650871304,
2435
+ "loss": 1.1788,
2436
+ "step": 33900
2437
+ },
2438
+ {
2439
+ "epoch": 1.9341259457307014,
2440
+ "grad_norm": 2.056766986846924,
2441
+ "learning_rate": 0.00028447579498264977,
2442
+ "loss": 1.1878,
2443
+ "step": 34000
2444
+ },
2445
+ {
2446
+ "epoch": 1.9398145514534386,
2447
+ "grad_norm": 1.8016914129257202,
2448
+ "learning_rate": 0.00028295883345658644,
2449
+ "loss": 1.1632,
2450
+ "step": 34100
2451
+ },
2452
+ {
2453
+ "epoch": 1.945503157176176,
2454
+ "grad_norm": 1.7706475257873535,
2455
+ "learning_rate": 0.0002814418719305232,
2456
+ "loss": 1.1658,
2457
+ "step": 34200
2458
+ },
2459
+ {
2460
+ "epoch": 1.9511917628989135,
2461
+ "grad_norm": 1.8184970617294312,
2462
+ "learning_rate": 0.0002799249104044599,
2463
+ "loss": 1.1666,
2464
+ "step": 34300
2465
+ },
2466
+ {
2467
+ "epoch": 1.956880368621651,
2468
+ "grad_norm": 1.6529743671417236,
2469
+ "learning_rate": 0.0002784079488783966,
2470
+ "loss": 1.1846,
2471
+ "step": 34400
2472
+ },
2473
+ {
2474
+ "epoch": 1.9625689743443882,
2475
+ "grad_norm": 1.5860931873321533,
2476
+ "learning_rate": 0.00027689098735233326,
2477
+ "loss": 1.1917,
2478
+ "step": 34500
2479
+ },
2480
+ {
2481
+ "epoch": 1.9682575800671256,
2482
+ "grad_norm": 1.672756552696228,
2483
+ "learning_rate": 0.00027537402582627,
2484
+ "loss": 1.1654,
2485
+ "step": 34600
2486
+ },
2487
+ {
2488
+ "epoch": 1.9739461857898628,
2489
+ "grad_norm": 1.7606583833694458,
2490
+ "learning_rate": 0.0002738570643002067,
2491
+ "loss": 1.176,
2492
+ "step": 34700
2493
+ },
2494
+ {
2495
+ "epoch": 1.9796347915126002,
2496
+ "grad_norm": 1.912277340888977,
2497
+ "learning_rate": 0.0002723401027741434,
2498
+ "loss": 1.1695,
2499
+ "step": 34800
2500
+ },
2501
+ {
2502
+ "epoch": 1.9853233972353377,
2503
+ "grad_norm": 1.7096484899520874,
2504
+ "learning_rate": 0.00027082314124808013,
2505
+ "loss": 1.1669,
2506
+ "step": 34900
2507
+ },
2508
+ {
2509
+ "epoch": 1.9910120029580751,
2510
+ "grad_norm": 1.7793241739273071,
2511
+ "learning_rate": 0.0002693061797220168,
2512
+ "loss": 1.1902,
2513
+ "step": 35000
2514
+ },
2515
+ {
2516
+ "epoch": 1.9910120029580751,
2517
+ "eval_accuracy": 0.71648,
2518
+ "eval_loss": 1.1297262907028198,
2519
+ "eval_runtime": 79.5966,
2520
+ "eval_samples_per_second": 3140.839,
2521
+ "eval_steps_per_second": 12.274,
2522
+ "step": 35000
2523
+ },
2524
+ {
2525
+ "epoch": 1.9967006086808123,
2526
+ "grad_norm": 1.4913907051086426,
2527
+ "learning_rate": 0.0002677892181959535,
2528
+ "loss": 1.1607,
2529
+ "step": 35100
2530
+ },
2531
+ {
2532
+ "epoch": 2.0023892144035496,
2533
+ "grad_norm": 1.639985203742981,
2534
+ "learning_rate": 0.0002662874262851509,
2535
+ "loss": 1.1727,
2536
+ "step": 35200
2537
+ },
2538
+ {
2539
+ "epoch": 2.008077820126287,
2540
+ "grad_norm": 1.6419970989227295,
2541
+ "learning_rate": 0.00026477046475908755,
2542
+ "loss": 1.1392,
2543
+ "step": 35300
2544
+ },
2545
+ {
2546
+ "epoch": 2.0137664258490244,
2547
+ "grad_norm": 1.8132672309875488,
2548
+ "learning_rate": 0.00026325350323302423,
2549
+ "loss": 1.1503,
2550
+ "step": 35400
2551
+ },
2552
+ {
2553
+ "epoch": 2.019455031571762,
2554
+ "grad_norm": 1.4656819105148315,
2555
+ "learning_rate": 0.00026173654170696096,
2556
+ "loss": 1.1565,
2557
+ "step": 35500
2558
+ },
2559
+ {
2560
+ "epoch": 2.0251436372944993,
2561
+ "grad_norm": 1.3595716953277588,
2562
+ "learning_rate": 0.0002602195801808977,
2563
+ "loss": 1.1526,
2564
+ "step": 35600
2565
+ },
2566
+ {
2567
+ "epoch": 2.0308322430172363,
2568
+ "grad_norm": 1.6904360055923462,
2569
+ "learning_rate": 0.00025870261865483437,
2570
+ "loss": 1.1448,
2571
+ "step": 35700
2572
+ },
2573
+ {
2574
+ "epoch": 2.0365208487399737,
2575
+ "grad_norm": 1.7240209579467773,
2576
+ "learning_rate": 0.0002571856571287711,
2577
+ "loss": 1.1424,
2578
+ "step": 35800
2579
+ },
2580
+ {
2581
+ "epoch": 2.042209454462711,
2582
+ "grad_norm": 1.5376731157302856,
2583
+ "learning_rate": 0.00025566869560270783,
2584
+ "loss": 1.143,
2585
+ "step": 35900
2586
+ },
2587
+ {
2588
+ "epoch": 2.0478980601854486,
2589
+ "grad_norm": 1.893202781677246,
2590
+ "learning_rate": 0.00025415173407664445,
2591
+ "loss": 1.1519,
2592
+ "step": 36000
2593
+ },
2594
+ {
2595
+ "epoch": 2.053586665908186,
2596
+ "grad_norm": 1.9057211875915527,
2597
+ "learning_rate": 0.0002526347725505812,
2598
+ "loss": 1.1375,
2599
+ "step": 36100
2600
+ },
2601
+ {
2602
+ "epoch": 2.059275271630923,
2603
+ "grad_norm": 1.7818187475204468,
2604
+ "learning_rate": 0.0002511178110245179,
2605
+ "loss": 1.1424,
2606
+ "step": 36200
2607
+ },
2608
+ {
2609
+ "epoch": 2.0649638773536605,
2610
+ "grad_norm": 1.825323462486267,
2611
+ "learning_rate": 0.0002496008494984546,
2612
+ "loss": 1.1196,
2613
+ "step": 36300
2614
+ },
2615
+ {
2616
+ "epoch": 2.070652483076398,
2617
+ "grad_norm": 2.0049736499786377,
2618
+ "learning_rate": 0.0002480838879723913,
2619
+ "loss": 1.1317,
2620
+ "step": 36400
2621
+ },
2622
+ {
2623
+ "epoch": 2.0763410887991354,
2624
+ "grad_norm": 1.599846363067627,
2625
+ "learning_rate": 0.000246566926446328,
2626
+ "loss": 1.1573,
2627
+ "step": 36500
2628
+ },
2629
+ {
2630
+ "epoch": 2.082029694521873,
2631
+ "grad_norm": 1.5434855222702026,
2632
+ "learning_rate": 0.00024504996492026473,
2633
+ "loss": 1.1493,
2634
+ "step": 36600
2635
+ },
2636
+ {
2637
+ "epoch": 2.0877183002446102,
2638
+ "grad_norm": 1.6306787729263306,
2639
+ "learning_rate": 0.0002435330033942014,
2640
+ "loss": 1.1564,
2641
+ "step": 36700
2642
+ },
2643
+ {
2644
+ "epoch": 2.0934069059673472,
2645
+ "grad_norm": 1.6914353370666504,
2646
+ "learning_rate": 0.00024201604186813814,
2647
+ "loss": 1.1395,
2648
+ "step": 36800
2649
+ },
2650
+ {
2651
+ "epoch": 2.0990955116900847,
2652
+ "grad_norm": 1.6444432735443115,
2653
+ "learning_rate": 0.00024049908034207485,
2654
+ "loss": 1.1615,
2655
+ "step": 36900
2656
+ },
2657
+ {
2658
+ "epoch": 2.104784117412822,
2659
+ "grad_norm": 1.821244239807129,
2660
+ "learning_rate": 0.00023898211881601155,
2661
+ "loss": 1.1429,
2662
+ "step": 37000
2663
+ },
2664
+ {
2665
+ "epoch": 2.1104727231355596,
2666
+ "grad_norm": 1.6050491333007812,
2667
+ "learning_rate": 0.00023746515728994823,
2668
+ "loss": 1.1376,
2669
+ "step": 37100
2670
+ },
2671
+ {
2672
+ "epoch": 2.116161328858297,
2673
+ "grad_norm": 1.7375249862670898,
2674
+ "learning_rate": 0.00023594819576388493,
2675
+ "loss": 1.1253,
2676
+ "step": 37200
2677
+ },
2678
+ {
2679
+ "epoch": 2.121849934581034,
2680
+ "grad_norm": 2.0717177391052246,
2681
+ "learning_rate": 0.00023443123423782166,
2682
+ "loss": 1.1527,
2683
+ "step": 37300
2684
+ },
2685
+ {
2686
+ "epoch": 2.1275385403037714,
2687
+ "grad_norm": 1.43324875831604,
2688
+ "learning_rate": 0.00023291427271175837,
2689
+ "loss": 1.1475,
2690
+ "step": 37400
2691
+ },
2692
+ {
2693
+ "epoch": 2.133227146026509,
2694
+ "grad_norm": 1.448669195175171,
2695
+ "learning_rate": 0.00023139731118569507,
2696
+ "loss": 1.1133,
2697
+ "step": 37500
2698
+ },
2699
+ {
2700
+ "epoch": 2.1389157517492463,
2701
+ "grad_norm": 1.521912932395935,
2702
+ "learning_rate": 0.00022988034965963178,
2703
+ "loss": 1.1292,
2704
+ "step": 37600
2705
+ },
2706
+ {
2707
+ "epoch": 2.1446043574719837,
2708
+ "grad_norm": 1.6070728302001953,
2709
+ "learning_rate": 0.00022836338813356845,
2710
+ "loss": 1.1384,
2711
+ "step": 37700
2712
+ },
2713
+ {
2714
+ "epoch": 2.1502929631947207,
2715
+ "grad_norm": 1.3853884935379028,
2716
+ "learning_rate": 0.00022684642660750516,
2717
+ "loss": 1.1344,
2718
+ "step": 37800
2719
+ },
2720
+ {
2721
+ "epoch": 2.155981568917458,
2722
+ "grad_norm": 1.569415807723999,
2723
+ "learning_rate": 0.0002253294650814419,
2724
+ "loss": 1.1572,
2725
+ "step": 37900
2726
+ },
2727
+ {
2728
+ "epoch": 2.1616701746401956,
2729
+ "grad_norm": 1.544966220855713,
2730
+ "learning_rate": 0.0002238125035553786,
2731
+ "loss": 1.1378,
2732
+ "step": 38000
2733
+ },
2734
+ {
2735
+ "epoch": 2.167358780362933,
2736
+ "grad_norm": 1.6090420484542847,
2737
+ "learning_rate": 0.0002222955420293153,
2738
+ "loss": 1.1331,
2739
+ "step": 38100
2740
+ },
2741
+ {
2742
+ "epoch": 2.1730473860856705,
2743
+ "grad_norm": 1.542605996131897,
2744
+ "learning_rate": 0.00022077858050325197,
2745
+ "loss": 1.1302,
2746
+ "step": 38200
2747
+ },
2748
+ {
2749
+ "epoch": 2.178735991808408,
2750
+ "grad_norm": 1.744084119796753,
2751
+ "learning_rate": 0.00021926161897718868,
2752
+ "loss": 1.1305,
2753
+ "step": 38300
2754
+ },
2755
+ {
2756
+ "epoch": 2.184424597531145,
2757
+ "grad_norm": 1.630118489265442,
2758
+ "learning_rate": 0.0002177446574511254,
2759
+ "loss": 1.1294,
2760
+ "step": 38400
2761
+ },
2762
+ {
2763
+ "epoch": 2.1901132032538824,
2764
+ "grad_norm": 1.6920104026794434,
2765
+ "learning_rate": 0.0002162276959250621,
2766
+ "loss": 1.1337,
2767
+ "step": 38500
2768
+ },
2769
+ {
2770
+ "epoch": 2.19580180897662,
2771
+ "grad_norm": 1.654189944267273,
2772
+ "learning_rate": 0.00021471073439899882,
2773
+ "loss": 1.1182,
2774
+ "step": 38600
2775
+ },
2776
+ {
2777
+ "epoch": 2.2014904146993572,
2778
+ "grad_norm": 1.8575996160507202,
2779
+ "learning_rate": 0.00021319377287293555,
2780
+ "loss": 1.1261,
2781
+ "step": 38700
2782
+ },
2783
+ {
2784
+ "epoch": 2.2071790204220947,
2785
+ "grad_norm": 1.5796535015106201,
2786
+ "learning_rate": 0.0002116768113468722,
2787
+ "loss": 1.1389,
2788
+ "step": 38800
2789
+ },
2790
+ {
2791
+ "epoch": 2.212867626144832,
2792
+ "grad_norm": 1.6893657445907593,
2793
+ "learning_rate": 0.00021015984982080893,
2794
+ "loss": 1.122,
2795
+ "step": 38900
2796
+ },
2797
+ {
2798
+ "epoch": 2.218556231867569,
2799
+ "grad_norm": 1.5983092784881592,
2800
+ "learning_rate": 0.00020864288829474563,
2801
+ "loss": 1.1487,
2802
+ "step": 39000
2803
+ },
2804
+ {
2805
+ "epoch": 2.2242448375903066,
2806
+ "grad_norm": 1.632049798965454,
2807
+ "learning_rate": 0.00020712592676868234,
2808
+ "loss": 1.1476,
2809
+ "step": 39100
2810
+ },
2811
+ {
2812
+ "epoch": 2.229933443313044,
2813
+ "grad_norm": 2.039854049682617,
2814
+ "learning_rate": 0.00020562413485787965,
2815
+ "loss": 1.1443,
2816
+ "step": 39200
2817
+ },
2818
+ {
2819
+ "epoch": 2.2356220490357814,
2820
+ "grad_norm": 1.5673627853393555,
2821
+ "learning_rate": 0.00020410717333181638,
2822
+ "loss": 1.1259,
2823
+ "step": 39300
2824
+ },
2825
+ {
2826
+ "epoch": 2.241310654758519,
2827
+ "grad_norm": 1.6900497674942017,
2828
+ "learning_rate": 0.00020259021180575308,
2829
+ "loss": 1.1356,
2830
+ "step": 39400
2831
+ },
2832
+ {
2833
+ "epoch": 2.246999260481256,
2834
+ "grad_norm": 1.8306878805160522,
2835
+ "learning_rate": 0.00020107325027968979,
2836
+ "loss": 1.1349,
2837
+ "step": 39500
2838
+ },
2839
+ {
2840
+ "epoch": 2.2526878662039933,
2841
+ "grad_norm": 1.620490550994873,
2842
+ "learning_rate": 0.0001995562887536265,
2843
+ "loss": 1.1417,
2844
+ "step": 39600
2845
+ },
2846
+ {
2847
+ "epoch": 2.2583764719267307,
2848
+ "grad_norm": 1.828751802444458,
2849
+ "learning_rate": 0.0001980393272275632,
2850
+ "loss": 1.1302,
2851
+ "step": 39700
2852
+ },
2853
+ {
2854
+ "epoch": 2.264065077649468,
2855
+ "grad_norm": 1.4963942766189575,
2856
+ "learning_rate": 0.0001965223657014999,
2857
+ "loss": 1.152,
2858
+ "step": 39800
2859
+ },
2860
+ {
2861
+ "epoch": 2.2697536833722056,
2862
+ "grad_norm": 2.081669807434082,
2863
+ "learning_rate": 0.0001950054041754366,
2864
+ "loss": 1.1385,
2865
+ "step": 39900
2866
+ },
2867
+ {
2868
+ "epoch": 2.2754422890949426,
2869
+ "grad_norm": 1.6873656511306763,
2870
+ "learning_rate": 0.0001934884426493733,
2871
+ "loss": 1.131,
2872
+ "step": 40000
2873
+ },
2874
+ {
2875
+ "epoch": 2.2754422890949426,
2876
+ "eval_accuracy": 0.721316,
2877
+ "eval_loss": 1.1105972528457642,
2878
+ "eval_runtime": 80.6985,
2879
+ "eval_samples_per_second": 3097.949,
2880
+ "eval_steps_per_second": 12.107,
2881
+ "step": 40000
2882
+ },
2883
+ {
2884
+ "epoch": 2.28113089481768,
2885
+ "grad_norm": 1.5599457025527954,
2886
+ "learning_rate": 0.00019197148112331004,
2887
+ "loss": 1.1525,
2888
+ "step": 40100
2889
+ },
2890
+ {
2891
+ "epoch": 2.2868195005404175,
2892
+ "grad_norm": 1.816628098487854,
2893
+ "learning_rate": 0.00019045451959724672,
2894
+ "loss": 1.1295,
2895
+ "step": 40200
2896
+ },
2897
+ {
2898
+ "epoch": 2.292508106263155,
2899
+ "grad_norm": 1.5481749773025513,
2900
+ "learning_rate": 0.00018893755807118342,
2901
+ "loss": 1.124,
2902
+ "step": 40300
2903
+ },
2904
+ {
2905
+ "epoch": 2.2981967119858924,
2906
+ "grad_norm": 1.632873296737671,
2907
+ "learning_rate": 0.00018742059654512015,
2908
+ "loss": 1.1217,
2909
+ "step": 40400
2910
+ },
2911
+ {
2912
+ "epoch": 2.3038853177086294,
2913
+ "grad_norm": 1.4403363466262817,
2914
+ "learning_rate": 0.00018590363501905683,
2915
+ "loss": 1.1315,
2916
+ "step": 40500
2917
+ },
2918
+ {
2919
+ "epoch": 2.309573923431367,
2920
+ "grad_norm": 1.6744205951690674,
2921
+ "learning_rate": 0.00018438667349299353,
2922
+ "loss": 1.1473,
2923
+ "step": 40600
2924
+ },
2925
+ {
2926
+ "epoch": 2.3152625291541042,
2927
+ "grad_norm": 1.5021002292633057,
2928
+ "learning_rate": 0.00018286971196693026,
2929
+ "loss": 1.1127,
2930
+ "step": 40700
2931
+ },
2932
+ {
2933
+ "epoch": 2.3209511348768417,
2934
+ "grad_norm": 1.689931869506836,
2935
+ "learning_rate": 0.00018135275044086694,
2936
+ "loss": 1.1394,
2937
+ "step": 40800
2938
+ },
2939
+ {
2940
+ "epoch": 2.326639740599579,
2941
+ "grad_norm": 2.1370577812194824,
2942
+ "learning_rate": 0.00017983578891480367,
2943
+ "loss": 1.148,
2944
+ "step": 40900
2945
+ },
2946
+ {
2947
+ "epoch": 2.3323283463223166,
2948
+ "grad_norm": 1.9048566818237305,
2949
+ "learning_rate": 0.00017831882738874038,
2950
+ "loss": 1.1181,
2951
+ "step": 41000
2952
+ },
2953
+ {
2954
+ "epoch": 2.338016952045054,
2955
+ "grad_norm": 1.8328748941421509,
2956
+ "learning_rate": 0.00017680186586267705,
2957
+ "loss": 1.1302,
2958
+ "step": 41100
2959
+ },
2960
+ {
2961
+ "epoch": 2.343705557767791,
2962
+ "grad_norm": 1.7709869146347046,
2963
+ "learning_rate": 0.0001753000739518744,
2964
+ "loss": 1.1369,
2965
+ "step": 41200
2966
+ },
2967
+ {
2968
+ "epoch": 2.3493941634905284,
2969
+ "grad_norm": 1.6296570301055908,
2970
+ "learning_rate": 0.00017378311242581112,
2971
+ "loss": 1.1302,
2972
+ "step": 41300
2973
+ },
2974
+ {
2975
+ "epoch": 2.355082769213266,
2976
+ "grad_norm": 1.6044236421585083,
2977
+ "learning_rate": 0.0001722661508997478,
2978
+ "loss": 1.1313,
2979
+ "step": 41400
2980
+ },
2981
+ {
2982
+ "epoch": 2.3607713749360033,
2983
+ "grad_norm": 1.4571659564971924,
2984
+ "learning_rate": 0.00017074918937368453,
2985
+ "loss": 1.1249,
2986
+ "step": 41500
2987
+ },
2988
+ {
2989
+ "epoch": 2.3664599806587407,
2990
+ "grad_norm": 1.7237457036972046,
2991
+ "learning_rate": 0.00016923222784762123,
2992
+ "loss": 1.1312,
2993
+ "step": 41600
2994
+ },
2995
+ {
2996
+ "epoch": 2.3721485863814777,
2997
+ "grad_norm": 1.552881121635437,
2998
+ "learning_rate": 0.0001677152663215579,
2999
+ "loss": 1.1282,
3000
+ "step": 41700
3001
+ },
3002
+ {
3003
+ "epoch": 2.377837192104215,
3004
+ "grad_norm": 1.6091784238815308,
3005
+ "learning_rate": 0.00016619830479549464,
3006
+ "loss": 1.1236,
3007
+ "step": 41800
3008
+ },
3009
+ {
3010
+ "epoch": 2.3835257978269526,
3011
+ "grad_norm": 1.8620885610580444,
3012
+ "learning_rate": 0.00016468134326943134,
3013
+ "loss": 1.1469,
3014
+ "step": 41900
3015
+ },
3016
+ {
3017
+ "epoch": 2.38921440354969,
3018
+ "grad_norm": 1.717551827430725,
3019
+ "learning_rate": 0.00016316438174336802,
3020
+ "loss": 1.121,
3021
+ "step": 42000
3022
+ },
3023
+ {
3024
+ "epoch": 2.3949030092724275,
3025
+ "grad_norm": 1.6212184429168701,
3026
+ "learning_rate": 0.00016164742021730475,
3027
+ "loss": 1.0997,
3028
+ "step": 42100
3029
+ },
3030
+ {
3031
+ "epoch": 2.4005916149951645,
3032
+ "grad_norm": 1.3878498077392578,
3033
+ "learning_rate": 0.00016013045869124146,
3034
+ "loss": 1.1362,
3035
+ "step": 42200
3036
+ },
3037
+ {
3038
+ "epoch": 2.406280220717902,
3039
+ "grad_norm": 1.6336196660995483,
3040
+ "learning_rate": 0.00015861349716517816,
3041
+ "loss": 1.1256,
3042
+ "step": 42300
3043
+ },
3044
+ {
3045
+ "epoch": 2.4119688264406394,
3046
+ "grad_norm": 1.7155201435089111,
3047
+ "learning_rate": 0.00015709653563911486,
3048
+ "loss": 1.1133,
3049
+ "step": 42400
3050
+ },
3051
+ {
3052
+ "epoch": 2.417657432163377,
3053
+ "grad_norm": 1.7675564289093018,
3054
+ "learning_rate": 0.00015557957411305157,
3055
+ "loss": 1.1416,
3056
+ "step": 42500
3057
+ },
3058
+ {
3059
+ "epoch": 2.4233460378861142,
3060
+ "grad_norm": 1.676527976989746,
3061
+ "learning_rate": 0.00015406261258698827,
3062
+ "loss": 1.1378,
3063
+ "step": 42600
3064
+ },
3065
+ {
3066
+ "epoch": 2.4290346436088512,
3067
+ "grad_norm": 1.6293052434921265,
3068
+ "learning_rate": 0.00015254565106092498,
3069
+ "loss": 1.1177,
3070
+ "step": 42700
3071
+ },
3072
+ {
3073
+ "epoch": 2.4347232493315887,
3074
+ "grad_norm": 1.5264780521392822,
3075
+ "learning_rate": 0.00015102868953486168,
3076
+ "loss": 1.1063,
3077
+ "step": 42800
3078
+ },
3079
+ {
3080
+ "epoch": 2.440411855054326,
3081
+ "grad_norm": 1.6453486680984497,
3082
+ "learning_rate": 0.00014951172800879839,
3083
+ "loss": 1.1375,
3084
+ "step": 42900
3085
+ },
3086
+ {
3087
+ "epoch": 2.4461004607770636,
3088
+ "grad_norm": 1.692336082458496,
3089
+ "learning_rate": 0.0001479947664827351,
3090
+ "loss": 1.1004,
3091
+ "step": 43000
3092
+ },
3093
+ {
3094
+ "epoch": 2.451789066499801,
3095
+ "grad_norm": 1.868812084197998,
3096
+ "learning_rate": 0.0001464778049566718,
3097
+ "loss": 1.1288,
3098
+ "step": 43100
3099
+ },
3100
+ {
3101
+ "epoch": 2.4574776722225384,
3102
+ "grad_norm": 1.7713991403579712,
3103
+ "learning_rate": 0.0001449608434306085,
3104
+ "loss": 1.1229,
3105
+ "step": 43200
3106
+ },
3107
+ {
3108
+ "epoch": 2.4631662779452754,
3109
+ "grad_norm": 1.6394290924072266,
3110
+ "learning_rate": 0.00014345905151980583,
3111
+ "loss": 1.0968,
3112
+ "step": 43300
3113
+ },
3114
+ {
3115
+ "epoch": 2.468854883668013,
3116
+ "grad_norm": 1.7240723371505737,
3117
+ "learning_rate": 0.00014194208999374254,
3118
+ "loss": 1.1151,
3119
+ "step": 43400
3120
+ },
3121
+ {
3122
+ "epoch": 2.4745434893907503,
3123
+ "grad_norm": 1.9284464120864868,
3124
+ "learning_rate": 0.00014042512846767924,
3125
+ "loss": 1.1302,
3126
+ "step": 43500
3127
+ },
3128
+ {
3129
+ "epoch": 2.4802320951134877,
3130
+ "grad_norm": 1.6855792999267578,
3131
+ "learning_rate": 0.00013890816694161595,
3132
+ "loss": 1.1163,
3133
+ "step": 43600
3134
+ },
3135
+ {
3136
+ "epoch": 2.485920700836225,
3137
+ "grad_norm": 1.8182587623596191,
3138
+ "learning_rate": 0.00013739120541555265,
3139
+ "loss": 1.1172,
3140
+ "step": 43700
3141
+ },
3142
+ {
3143
+ "epoch": 2.4916093065589626,
3144
+ "grad_norm": 1.5971157550811768,
3145
+ "learning_rate": 0.00013587424388948935,
3146
+ "loss": 1.1071,
3147
+ "step": 43800
3148
+ },
3149
+ {
3150
+ "epoch": 2.4972979122816996,
3151
+ "grad_norm": 1.7139756679534912,
3152
+ "learning_rate": 0.00013435728236342606,
3153
+ "loss": 1.1239,
3154
+ "step": 43900
3155
+ },
3156
+ {
3157
+ "epoch": 2.502986518004437,
3158
+ "grad_norm": 1.7199363708496094,
3159
+ "learning_rate": 0.00013284032083736276,
3160
+ "loss": 1.1444,
3161
+ "step": 44000
3162
+ },
3163
+ {
3164
+ "epoch": 2.5086751237271745,
3165
+ "grad_norm": 1.7295994758605957,
3166
+ "learning_rate": 0.00013132335931129947,
3167
+ "loss": 1.122,
3168
+ "step": 44100
3169
+ },
3170
+ {
3171
+ "epoch": 2.514363729449912,
3172
+ "grad_norm": 1.9433492422103882,
3173
+ "learning_rate": 0.00012980639778523617,
3174
+ "loss": 1.1209,
3175
+ "step": 44200
3176
+ },
3177
+ {
3178
+ "epoch": 2.5200523351726494,
3179
+ "grad_norm": 1.5811411142349243,
3180
+ "learning_rate": 0.0001282894362591729,
3181
+ "loss": 1.1084,
3182
+ "step": 44300
3183
+ },
3184
+ {
3185
+ "epoch": 2.5257409408953864,
3186
+ "grad_norm": 1.5232020616531372,
3187
+ "learning_rate": 0.00012677247473310958,
3188
+ "loss": 1.1372,
3189
+ "step": 44400
3190
+ },
3191
+ {
3192
+ "epoch": 2.531429546618124,
3193
+ "grad_norm": 2.6212551593780518,
3194
+ "learning_rate": 0.00012525551320704628,
3195
+ "loss": 1.1246,
3196
+ "step": 44500
3197
+ },
3198
+ {
3199
+ "epoch": 2.5371181523408612,
3200
+ "grad_norm": 1.4962718486785889,
3201
+ "learning_rate": 0.00012373855168098301,
3202
+ "loss": 1.1386,
3203
+ "step": 44600
3204
+ },
3205
+ {
3206
+ "epoch": 2.5428067580635987,
3207
+ "grad_norm": 1.7713087797164917,
3208
+ "learning_rate": 0.0001222215901549197,
3209
+ "loss": 1.1314,
3210
+ "step": 44700
3211
+ },
3212
+ {
3213
+ "epoch": 2.548495363786336,
3214
+ "grad_norm": 1.5493218898773193,
3215
+ "learning_rate": 0.00012070462862885641,
3216
+ "loss": 1.1204,
3217
+ "step": 44800
3218
+ },
3219
+ {
3220
+ "epoch": 2.554183969509073,
3221
+ "grad_norm": 1.6126313209533691,
3222
+ "learning_rate": 0.00011918766710279313,
3223
+ "loss": 1.1283,
3224
+ "step": 44900
3225
+ },
3226
+ {
3227
+ "epoch": 2.5598725752318106,
3228
+ "grad_norm": 1.5327433347702026,
3229
+ "learning_rate": 0.00011767070557672982,
3230
+ "loss": 1.124,
3231
+ "step": 45000
3232
+ },
3233
+ {
3234
+ "epoch": 2.5598725752318106,
3235
+ "eval_accuracy": 0.725824,
3236
+ "eval_loss": 1.0916061401367188,
3237
+ "eval_runtime": 80.2149,
3238
+ "eval_samples_per_second": 3116.626,
3239
+ "eval_steps_per_second": 12.18,
3240
+ "step": 45000
3241
+ },
3242
+ {
3243
+ "epoch": 2.565561180954548,
3244
+ "grad_norm": 1.5026576519012451,
3245
+ "learning_rate": 0.00011615374405066652,
3246
+ "loss": 1.1197,
3247
+ "step": 45100
3248
+ },
3249
+ {
3250
+ "epoch": 2.5712497866772854,
3251
+ "grad_norm": 1.6989002227783203,
3252
+ "learning_rate": 0.00011463678252460321,
3253
+ "loss": 1.1247,
3254
+ "step": 45200
3255
+ },
3256
+ {
3257
+ "epoch": 2.576938392400023,
3258
+ "grad_norm": 1.5901920795440674,
3259
+ "learning_rate": 0.00011313499061380057,
3260
+ "loss": 1.1352,
3261
+ "step": 45300
3262
+ },
3263
+ {
3264
+ "epoch": 2.58262699812276,
3265
+ "grad_norm": 1.4382330179214478,
3266
+ "learning_rate": 0.00011161802908773727,
3267
+ "loss": 1.1093,
3268
+ "step": 45400
3269
+ },
3270
+ {
3271
+ "epoch": 2.5883156038454973,
3272
+ "grad_norm": 1.8520530462265015,
3273
+ "learning_rate": 0.00011010106756167397,
3274
+ "loss": 1.1081,
3275
+ "step": 45500
3276
+ },
3277
+ {
3278
+ "epoch": 2.5940042095682347,
3279
+ "grad_norm": 1.8772435188293457,
3280
+ "learning_rate": 0.00010858410603561066,
3281
+ "loss": 1.1157,
3282
+ "step": 45600
3283
+ },
3284
+ {
3285
+ "epoch": 2.599692815290972,
3286
+ "grad_norm": 1.6013365983963013,
3287
+ "learning_rate": 0.00010706714450954738,
3288
+ "loss": 1.1463,
3289
+ "step": 45700
3290
+ },
3291
+ {
3292
+ "epoch": 2.6053814210137096,
3293
+ "grad_norm": 1.582515835762024,
3294
+ "learning_rate": 0.0001055501829834841,
3295
+ "loss": 1.1135,
3296
+ "step": 45800
3297
+ },
3298
+ {
3299
+ "epoch": 2.611070026736447,
3300
+ "grad_norm": 1.3782535791397095,
3301
+ "learning_rate": 0.00010403322145742079,
3302
+ "loss": 1.1101,
3303
+ "step": 45900
3304
+ },
3305
+ {
3306
+ "epoch": 2.6167586324591845,
3307
+ "grad_norm": 1.465584397315979,
3308
+ "learning_rate": 0.00010251625993135749,
3309
+ "loss": 1.1354,
3310
+ "step": 46000
3311
+ },
3312
+ {
3313
+ "epoch": 2.6224472381819215,
3314
+ "grad_norm": 1.4038536548614502,
3315
+ "learning_rate": 0.00010099929840529421,
3316
+ "loss": 1.1078,
3317
+ "step": 46100
3318
+ },
3319
+ {
3320
+ "epoch": 2.628135843904659,
3321
+ "grad_norm": 1.9926286935806274,
3322
+ "learning_rate": 9.948233687923091e-05,
3323
+ "loss": 1.1044,
3324
+ "step": 46200
3325
+ },
3326
+ {
3327
+ "epoch": 2.6338244496273964,
3328
+ "grad_norm": 1.6215740442276,
3329
+ "learning_rate": 9.796537535316762e-05,
3330
+ "loss": 1.1188,
3331
+ "step": 46300
3332
+ },
3333
+ {
3334
+ "epoch": 2.639513055350134,
3335
+ "grad_norm": 1.5623165369033813,
3336
+ "learning_rate": 9.644841382710431e-05,
3337
+ "loss": 1.1155,
3338
+ "step": 46400
3339
+ },
3340
+ {
3341
+ "epoch": 2.6452016610728712,
3342
+ "grad_norm": 1.491926670074463,
3343
+ "learning_rate": 9.493145230104102e-05,
3344
+ "loss": 1.1211,
3345
+ "step": 46500
3346
+ },
3347
+ {
3348
+ "epoch": 2.6508902667956082,
3349
+ "grad_norm": 1.7084381580352783,
3350
+ "learning_rate": 9.341449077497773e-05,
3351
+ "loss": 1.1091,
3352
+ "step": 46600
3353
+ },
3354
+ {
3355
+ "epoch": 2.6565788725183457,
3356
+ "grad_norm": 1.5060371160507202,
3357
+ "learning_rate": 9.189752924891443e-05,
3358
+ "loss": 1.1198,
3359
+ "step": 46700
3360
+ },
3361
+ {
3362
+ "epoch": 2.662267478241083,
3363
+ "grad_norm": 1.7321504354476929,
3364
+ "learning_rate": 9.038056772285112e-05,
3365
+ "loss": 1.1157,
3366
+ "step": 46800
3367
+ },
3368
+ {
3369
+ "epoch": 2.6679560839638206,
3370
+ "grad_norm": 1.559877634048462,
3371
+ "learning_rate": 8.886360619678784e-05,
3372
+ "loss": 1.1035,
3373
+ "step": 46900
3374
+ },
3375
+ {
3376
+ "epoch": 2.673644689686558,
3377
+ "grad_norm": 1.8588401079177856,
3378
+ "learning_rate": 8.734664467072455e-05,
3379
+ "loss": 1.1288,
3380
+ "step": 47000
3381
+ },
3382
+ {
3383
+ "epoch": 2.679333295409295,
3384
+ "grad_norm": 1.751246452331543,
3385
+ "learning_rate": 8.582968314466125e-05,
3386
+ "loss": 1.1206,
3387
+ "step": 47100
3388
+ },
3389
+ {
3390
+ "epoch": 2.6850219011320324,
3391
+ "grad_norm": 1.7309458255767822,
3392
+ "learning_rate": 8.431272161859795e-05,
3393
+ "loss": 1.1089,
3394
+ "step": 47200
3395
+ },
3396
+ {
3397
+ "epoch": 2.69071050685477,
3398
+ "grad_norm": 1.8057925701141357,
3399
+ "learning_rate": 8.281092970779529e-05,
3400
+ "loss": 1.1244,
3401
+ "step": 47300
3402
+ },
3403
+ {
3404
+ "epoch": 2.6963991125775073,
3405
+ "grad_norm": 1.7594059705734253,
3406
+ "learning_rate": 8.1293968181732e-05,
3407
+ "loss": 1.1188,
3408
+ "step": 47400
3409
+ },
3410
+ {
3411
+ "epoch": 2.7020877183002447,
3412
+ "grad_norm": 1.686438798904419,
3413
+ "learning_rate": 7.97770066556687e-05,
3414
+ "loss": 1.1068,
3415
+ "step": 47500
3416
+ },
3417
+ {
3418
+ "epoch": 2.7077763240229817,
3419
+ "grad_norm": 1.6962246894836426,
3420
+ "learning_rate": 7.82600451296054e-05,
3421
+ "loss": 1.1043,
3422
+ "step": 47600
3423
+ },
3424
+ {
3425
+ "epoch": 2.713464929745719,
3426
+ "grad_norm": 1.5946807861328125,
3427
+ "learning_rate": 7.67430836035421e-05,
3428
+ "loss": 1.1109,
3429
+ "step": 47700
3430
+ },
3431
+ {
3432
+ "epoch": 2.7191535354684566,
3433
+ "grad_norm": 1.4834094047546387,
3434
+ "learning_rate": 7.522612207747881e-05,
3435
+ "loss": 1.114,
3436
+ "step": 47800
3437
+ },
3438
+ {
3439
+ "epoch": 2.724842141191194,
3440
+ "grad_norm": 1.763058066368103,
3441
+ "learning_rate": 7.370916055141553e-05,
3442
+ "loss": 1.1091,
3443
+ "step": 47900
3444
+ },
3445
+ {
3446
+ "epoch": 2.7305307469139315,
3447
+ "grad_norm": 1.9240601062774658,
3448
+ "learning_rate": 7.219219902535223e-05,
3449
+ "loss": 1.0936,
3450
+ "step": 48000
3451
+ },
3452
+ {
3453
+ "epoch": 2.7362193526366685,
3454
+ "grad_norm": 1.4768198728561401,
3455
+ "learning_rate": 7.067523749928892e-05,
3456
+ "loss": 1.1158,
3457
+ "step": 48100
3458
+ },
3459
+ {
3460
+ "epoch": 2.7419079583594064,
3461
+ "grad_norm": 1.9692409038543701,
3462
+ "learning_rate": 6.915827597322563e-05,
3463
+ "loss": 1.1201,
3464
+ "step": 48200
3465
+ },
3466
+ {
3467
+ "epoch": 2.7475965640821434,
3468
+ "grad_norm": 1.636785864830017,
3469
+ "learning_rate": 6.764131444716234e-05,
3470
+ "loss": 1.1092,
3471
+ "step": 48300
3472
+ },
3473
+ {
3474
+ "epoch": 2.753285169804881,
3475
+ "grad_norm": 1.5599926710128784,
3476
+ "learning_rate": 6.612435292109905e-05,
3477
+ "loss": 1.0932,
3478
+ "step": 48400
3479
+ },
3480
+ {
3481
+ "epoch": 2.7589737755276182,
3482
+ "grad_norm": 1.695862054824829,
3483
+ "learning_rate": 6.460739139503574e-05,
3484
+ "loss": 1.1227,
3485
+ "step": 48500
3486
+ },
3487
+ {
3488
+ "epoch": 2.7646623812503557,
3489
+ "grad_norm": 1.8806819915771484,
3490
+ "learning_rate": 6.309042986897246e-05,
3491
+ "loss": 1.1049,
3492
+ "step": 48600
3493
+ },
3494
+ {
3495
+ "epoch": 2.770350986973093,
3496
+ "grad_norm": 1.814792513847351,
3497
+ "learning_rate": 6.157346834290916e-05,
3498
+ "loss": 1.1149,
3499
+ "step": 48700
3500
+ },
3501
+ {
3502
+ "epoch": 2.77603959269583,
3503
+ "grad_norm": 2.068614959716797,
3504
+ "learning_rate": 6.005650681684586e-05,
3505
+ "loss": 1.1181,
3506
+ "step": 48800
3507
+ },
3508
+ {
3509
+ "epoch": 2.7817281984185676,
3510
+ "grad_norm": 1.5576444864273071,
3511
+ "learning_rate": 5.853954529078256e-05,
3512
+ "loss": 1.1223,
3513
+ "step": 48900
3514
+ },
3515
+ {
3516
+ "epoch": 2.787416804141305,
3517
+ "grad_norm": 1.8175384998321533,
3518
+ "learning_rate": 5.7022583764719273e-05,
3519
+ "loss": 1.1113,
3520
+ "step": 49000
3521
+ },
3522
+ {
3523
+ "epoch": 2.7931054098640424,
3524
+ "grad_norm": 1.570915937423706,
3525
+ "learning_rate": 5.550562223865598e-05,
3526
+ "loss": 1.1123,
3527
+ "step": 49100
3528
+ },
3529
+ {
3530
+ "epoch": 2.79879401558678,
3531
+ "grad_norm": 1.9663364887237549,
3532
+ "learning_rate": 5.3988660712592675e-05,
3533
+ "loss": 1.1065,
3534
+ "step": 49200
3535
+ },
3536
+ {
3537
+ "epoch": 2.804482621309517,
3538
+ "grad_norm": 2.2906079292297363,
3539
+ "learning_rate": 5.248686880179001e-05,
3540
+ "loss": 1.0993,
3541
+ "step": 49300
3542
+ },
3543
+ {
3544
+ "epoch": 2.8101712270322543,
3545
+ "grad_norm": 1.566801905632019,
3546
+ "learning_rate": 5.096990727572673e-05,
3547
+ "loss": 1.0964,
3548
+ "step": 49400
3549
+ },
3550
+ {
3551
+ "epoch": 2.8158598327549917,
3552
+ "grad_norm": 1.7769867181777954,
3553
+ "learning_rate": 4.9452945749663425e-05,
3554
+ "loss": 1.0978,
3555
+ "step": 49500
3556
+ },
3557
+ {
3558
+ "epoch": 2.821548438477729,
3559
+ "grad_norm": 1.9856287240982056,
3560
+ "learning_rate": 4.7935984223600136e-05,
3561
+ "loss": 1.0875,
3562
+ "step": 49600
3563
+ },
3564
+ {
3565
+ "epoch": 2.8272370442004666,
3566
+ "grad_norm": 1.7836079597473145,
3567
+ "learning_rate": 4.6419022697536834e-05,
3568
+ "loss": 1.1056,
3569
+ "step": 49700
3570
+ },
3571
+ {
3572
+ "epoch": 2.8329256499232036,
3573
+ "grad_norm": 1.9246402978897095,
3574
+ "learning_rate": 4.4902061171473545e-05,
3575
+ "loss": 1.1074,
3576
+ "step": 49800
3577
+ },
3578
+ {
3579
+ "epoch": 2.838614255645941,
3580
+ "grad_norm": 1.3988184928894043,
3581
+ "learning_rate": 4.338509964541024e-05,
3582
+ "loss": 1.1206,
3583
+ "step": 49900
3584
+ },
3585
+ {
3586
+ "epoch": 2.8443028613686785,
3587
+ "grad_norm": 1.7193849086761475,
3588
+ "learning_rate": 4.186813811934695e-05,
3589
+ "loss": 1.1245,
3590
+ "step": 50000
3591
+ },
3592
+ {
3593
+ "epoch": 2.8443028613686785,
3594
+ "eval_accuracy": 0.729988,
3595
+ "eval_loss": 1.0782374143600464,
3596
+ "eval_runtime": 82.4205,
3597
+ "eval_samples_per_second": 3033.226,
3598
+ "eval_steps_per_second": 11.854,
3599
+ "step": 50000
3600
+ },
3601
+ {
3602
+ "epoch": 2.849991467091416,
3603
+ "grad_norm": 1.7059062719345093,
3604
+ "learning_rate": 4.035117659328366e-05,
3605
+ "loss": 1.1009,
3606
+ "step": 50100
3607
+ },
3608
+ {
3609
+ "epoch": 2.8556800728141534,
3610
+ "grad_norm": 1.4554681777954102,
3611
+ "learning_rate": 3.883421506722036e-05,
3612
+ "loss": 1.1108,
3613
+ "step": 50200
3614
+ },
3615
+ {
3616
+ "epoch": 2.8613686785368904,
3617
+ "grad_norm": 1.7067590951919556,
3618
+ "learning_rate": 3.7317253541157065e-05,
3619
+ "loss": 1.0956,
3620
+ "step": 50300
3621
+ },
3622
+ {
3623
+ "epoch": 2.867057284259628,
3624
+ "grad_norm": 1.7176940441131592,
3625
+ "learning_rate": 3.580029201509377e-05,
3626
+ "loss": 1.0841,
3627
+ "step": 50400
3628
+ },
3629
+ {
3630
+ "epoch": 2.8727458899823652,
3631
+ "grad_norm": 1.7251313924789429,
3632
+ "learning_rate": 3.4283330489030474e-05,
3633
+ "loss": 1.0908,
3634
+ "step": 50500
3635
+ },
3636
+ {
3637
+ "epoch": 2.8784344957051027,
3638
+ "grad_norm": 1.668372631072998,
3639
+ "learning_rate": 3.276636896296718e-05,
3640
+ "loss": 1.0954,
3641
+ "step": 50600
3642
+ },
3643
+ {
3644
+ "epoch": 2.88412310142784,
3645
+ "grad_norm": 1.889109492301941,
3646
+ "learning_rate": 3.124940743690388e-05,
3647
+ "loss": 1.1096,
3648
+ "step": 50700
3649
+ },
3650
+ {
3651
+ "epoch": 2.8898117071505776,
3652
+ "grad_norm": 1.509391188621521,
3653
+ "learning_rate": 2.973244591084059e-05,
3654
+ "loss": 1.09,
3655
+ "step": 50800
3656
+ },
3657
+ {
3658
+ "epoch": 2.895500312873315,
3659
+ "grad_norm": 2.024489402770996,
3660
+ "learning_rate": 2.821548438477729e-05,
3661
+ "loss": 1.0993,
3662
+ "step": 50900
3663
+ },
3664
+ {
3665
+ "epoch": 2.901188918596052,
3666
+ "grad_norm": 2.007756471633911,
3667
+ "learning_rate": 2.6698522858713998e-05,
3668
+ "loss": 1.1029,
3669
+ "step": 51000
3670
+ },
3671
+ {
3672
+ "epoch": 2.9068775243187894,
3673
+ "grad_norm": 1.5296841859817505,
3674
+ "learning_rate": 2.51815613326507e-05,
3675
+ "loss": 1.095,
3676
+ "step": 51100
3677
+ },
3678
+ {
3679
+ "epoch": 2.912566130041527,
3680
+ "grad_norm": 1.6109613180160522,
3681
+ "learning_rate": 2.3664599806587406e-05,
3682
+ "loss": 1.1044,
3683
+ "step": 51200
3684
+ },
3685
+ {
3686
+ "epoch": 2.9182547357642643,
3687
+ "grad_norm": 1.8067957162857056,
3688
+ "learning_rate": 2.216280789578474e-05,
3689
+ "loss": 1.1074,
3690
+ "step": 51300
3691
+ },
3692
+ {
3693
+ "epoch": 2.9239433414870017,
3694
+ "grad_norm": 1.6997472047805786,
3695
+ "learning_rate": 2.064584636972145e-05,
3696
+ "loss": 1.1036,
3697
+ "step": 51400
3698
+ },
3699
+ {
3700
+ "epoch": 2.9296319472097387,
3701
+ "grad_norm": 1.6443182229995728,
3702
+ "learning_rate": 1.9128884843658153e-05,
3703
+ "loss": 1.1258,
3704
+ "step": 51500
3705
+ },
3706
+ {
3707
+ "epoch": 2.935320552932476,
3708
+ "grad_norm": 1.868161916732788,
3709
+ "learning_rate": 1.7611923317594857e-05,
3710
+ "loss": 1.115,
3711
+ "step": 51600
3712
+ },
3713
+ {
3714
+ "epoch": 2.9410091586552136,
3715
+ "grad_norm": 1.5620206594467163,
3716
+ "learning_rate": 1.609496179153156e-05,
3717
+ "loss": 1.1075,
3718
+ "step": 51700
3719
+ },
3720
+ {
3721
+ "epoch": 2.946697764377951,
3722
+ "grad_norm": 1.6326332092285156,
3723
+ "learning_rate": 1.4578000265468267e-05,
3724
+ "loss": 1.1113,
3725
+ "step": 51800
3726
+ },
3727
+ {
3728
+ "epoch": 2.9523863701006885,
3729
+ "grad_norm": 1.805126428604126,
3730
+ "learning_rate": 1.3061038739404973e-05,
3731
+ "loss": 1.0937,
3732
+ "step": 51900
3733
+ },
3734
+ {
3735
+ "epoch": 2.9580749758234255,
3736
+ "grad_norm": 1.5693707466125488,
3737
+ "learning_rate": 1.1544077213341677e-05,
3738
+ "loss": 1.1053,
3739
+ "step": 52000
3740
+ },
3741
+ {
3742
+ "epoch": 2.963763581546163,
3743
+ "grad_norm": 1.4851309061050415,
3744
+ "learning_rate": 1.0027115687278382e-05,
3745
+ "loss": 1.1101,
3746
+ "step": 52100
3747
+ },
3748
+ {
3749
+ "epoch": 2.9694521872689004,
3750
+ "grad_norm": 1.8946778774261475,
3751
+ "learning_rate": 8.510154161215086e-06,
3752
+ "loss": 1.0934,
3753
+ "step": 52200
3754
+ },
3755
+ {
3756
+ "epoch": 2.975140792991638,
3757
+ "grad_norm": 1.5624499320983887,
3758
+ "learning_rate": 6.993192635151791e-06,
3759
+ "loss": 1.0994,
3760
+ "step": 52300
3761
+ },
3762
+ {
3763
+ "epoch": 2.9808293987143752,
3764
+ "grad_norm": 1.5662641525268555,
3765
+ "learning_rate": 5.476231109088497e-06,
3766
+ "loss": 1.1058,
3767
+ "step": 52400
3768
+ },
3769
+ {
3770
+ "epoch": 2.9865180044371122,
3771
+ "grad_norm": 1.514809250831604,
3772
+ "learning_rate": 3.959269583025201e-06,
3773
+ "loss": 1.0967,
3774
+ "step": 52500
3775
+ },
3776
+ {
3777
+ "epoch": 2.9922066101598497,
3778
+ "grad_norm": 1.8442556858062744,
3779
+ "learning_rate": 2.4423080569619053e-06,
3780
+ "loss": 1.1041,
3781
+ "step": 52600
3782
+ },
3783
+ {
3784
+ "epoch": 2.997895215882587,
3785
+ "grad_norm": 1.7445664405822754,
3786
+ "learning_rate": 9.253465308986101e-07,
3787
+ "loss": 1.0786,
3788
+ "step": 52700
3789
+ },
3790
+ {
3791
+ "epoch": 3.0,
3792
+ "step": 52737,
3793
+ "total_flos": 1.3116020904e+17,
3794
+ "train_loss": 1.269093582550002,
3795
+ "train_runtime": 7252.1306,
3796
+ "train_samples_per_second": 1861.522,
3797
+ "train_steps_per_second": 7.272
3798
+ }
3799
+ ],
3800
+ "logging_steps": 100,
3801
+ "max_steps": 52737,
3802
+ "num_input_tokens_seen": 0,
3803
+ "num_train_epochs": 3,
3804
+ "save_steps": 5000,
3805
+ "stateful_callbacks": {
3806
+ "TrainerControl": {
3807
+ "args": {
3808
+ "should_epoch_stop": false,
3809
+ "should_evaluate": false,
3810
+ "should_log": false,
3811
+ "should_save": true,
3812
+ "should_training_stop": true
3813
+ },
3814
+ "attributes": {}
3815
+ }
3816
+ },
3817
+ "total_flos": 1.3116020904e+17,
3818
+ "train_batch_size": 256,
3819
+ "trial_name": null,
3820
+ "trial_params": null
3821
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4741f9929f86e0a0cc5ef0c6799bcd5cfd09259484bf459d7901ff7158af6501
3
+ size 5112