diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,6 +1,6 @@ { - "best_metric": 0.7567567567567568, - "best_model_checkpoint": "MAE-CT-M1N0-M12_v8_split1_v3/checkpoint-1846", + "best_metric": 0.7837837837837838, + "best_model_checkpoint": "MAE-CT-M1N0-M12_v8_split1_v3/checkpoint-568", "epoch": 147.006, "eval_steps": 500, "global_step": 10500, @@ -10,6622 +10,6622 @@ "log_history": [ { "epoch": 0.0009523809523809524, - "grad_norm": 33.47982406616211, + "grad_norm": 1.495897650718689, "learning_rate": 9.523809523809525e-08, - "loss": 0.728, + "loss": 0.6806, "step": 10 }, { "epoch": 0.0019047619047619048, - "grad_norm": 22.75005340576172, + "grad_norm": 2.5130839347839355, "learning_rate": 1.904761904761905e-07, - "loss": 0.7888, + "loss": 0.6831, "step": 20 }, { "epoch": 0.002857142857142857, - "grad_norm": 7.631560325622559, + "grad_norm": 1.683809518814087, "learning_rate": 2.8571428571428575e-07, - "loss": 0.7747, + "loss": 0.6792, "step": 30 }, { "epoch": 0.0038095238095238095, - "grad_norm": 26.753015518188477, + "grad_norm": 3.095991849899292, "learning_rate": 3.80952380952381e-07, - "loss": 0.7297, + "loss": 0.6764, "step": 40 }, { "epoch": 0.004761904761904762, - "grad_norm": 9.174154281616211, + "grad_norm": 2.464594841003418, "learning_rate": 4.7619047619047623e-07, - "loss": 0.7039, + "loss": 0.6689, "step": 50 }, { "epoch": 0.005714285714285714, - "grad_norm": 36.965885162353516, + "grad_norm": 2.2026219367980957, "learning_rate": 5.714285714285715e-07, - "loss": 0.6859, + "loss": 0.6583, "step": 60 }, { "epoch": 0.006666666666666667, - "grad_norm": 8.628267288208008, + "grad_norm": 1.6293625831604004, "learning_rate": 6.666666666666667e-07, - "loss": 0.6778, + "loss": 0.6862, "step": 70 }, { "epoch": 0.0067619047619047615, "eval_accuracy": 0.6621621621621622, - "eval_loss": 0.6619952917098999, - "eval_runtime": 9.2515, - "eval_samples_per_second": 7.999, - "eval_steps_per_second": 2.054, + "eval_loss": 0.6570932269096375, + "eval_runtime": 17.5734, + "eval_samples_per_second": 4.211, + "eval_steps_per_second": 1.081, "step": 71 }, { "epoch": 1.000857142857143, - "grad_norm": 51.088619232177734, + "grad_norm": 4.512332439422607, "learning_rate": 7.61904761904762e-07, - "loss": 0.6783, + "loss": 0.6868, "step": 80 }, { "epoch": 1.0018095238095237, - "grad_norm": 67.16112518310547, + "grad_norm": 4.540114402770996, "learning_rate": 8.571428571428572e-07, - "loss": 0.6881, + "loss": 0.6768, "step": 90 }, { "epoch": 1.0027619047619047, - "grad_norm": 36.75675964355469, + "grad_norm": 2.093035936355591, "learning_rate": 9.523809523809525e-07, - "loss": 0.6454, + "loss": 0.6615, "step": 100 }, { "epoch": 1.0037142857142858, - "grad_norm": 17.448606491088867, + "grad_norm": 3.182893991470337, "learning_rate": 1.0476190476190478e-06, - "loss": 0.6087, + "loss": 0.6468, "step": 110 }, { "epoch": 1.0046666666666666, - "grad_norm": 70.8754653930664, + "grad_norm": 2.688915491104126, "learning_rate": 1.142857142857143e-06, - "loss": 0.6216, + "loss": 0.6293, "step": 120 }, { "epoch": 1.0056190476190476, - "grad_norm": 18.515897750854492, + "grad_norm": 4.570864200592041, "learning_rate": 1.2380952380952382e-06, - "loss": 0.6139, + "loss": 0.643, "step": 130 }, { "epoch": 1.0065714285714287, - "grad_norm": 52.605350494384766, + "grad_norm": 5.001130104064941, "learning_rate": 1.3333333333333334e-06, - "loss": 0.6974, + "loss": 0.665, "step": 140 }, { "epoch": 1.0067619047619047, "eval_accuracy": 0.6621621621621622, - "eval_loss": 0.6517667770385742, - "eval_runtime": 9.2216, - "eval_samples_per_second": 8.025, - "eval_steps_per_second": 2.06, + "eval_loss": 0.6369683742523193, + "eval_runtime": 17.5533, + "eval_samples_per_second": 4.216, + "eval_steps_per_second": 1.082, "step": 142 }, { "epoch": 2.0007619047619047, - "grad_norm": 18.45825958251953, + "grad_norm": 3.4650774002075195, "learning_rate": 1.4285714285714286e-06, - "loss": 0.6696, + "loss": 0.6296, "step": 150 }, { "epoch": 2.001714285714286, - "grad_norm": 45.19963836669922, + "grad_norm": 6.552379131317139, "learning_rate": 1.523809523809524e-06, - "loss": 0.6659, + "loss": 0.6634, "step": 160 }, { "epoch": 2.002666666666667, - "grad_norm": 58.19581985473633, + "grad_norm": 6.5488715171813965, "learning_rate": 1.6190476190476193e-06, - "loss": 0.6516, + "loss": 0.6709, "step": 170 }, { "epoch": 2.0036190476190474, - "grad_norm": 38.006587982177734, + "grad_norm": 9.810216903686523, "learning_rate": 1.7142857142857145e-06, - "loss": 0.595, + "loss": 0.6067, "step": 180 }, { "epoch": 2.0045714285714284, - "grad_norm": 28.931968688964844, + "grad_norm": 12.946127891540527, "learning_rate": 1.8095238095238097e-06, - "loss": 0.6969, + "loss": 0.6301, "step": 190 }, { "epoch": 2.0055238095238095, - "grad_norm": 59.999698638916016, + "grad_norm": 17.473913192749023, "learning_rate": 1.904761904761905e-06, - "loss": 0.666, + "loss": 0.5936, "step": 200 }, { "epoch": 2.0064761904761905, - "grad_norm": 15.645180702209473, + "grad_norm": 5.558756351470947, "learning_rate": 2.0000000000000003e-06, - "loss": 0.7123, + "loss": 0.7033, "step": 210 }, { "epoch": 2.006761904761905, "eval_accuracy": 0.6621621621621622, - "eval_loss": 0.6537678837776184, - "eval_runtime": 9.1055, - "eval_samples_per_second": 8.127, - "eval_steps_per_second": 2.087, + "eval_loss": 0.6253643035888672, + "eval_runtime": 17.8289, + "eval_samples_per_second": 4.151, + "eval_steps_per_second": 1.066, "step": 213 }, { "epoch": 3.0006666666666666, - "grad_norm": 22.339202880859375, + "grad_norm": 4.486657619476318, "learning_rate": 2.0952380952380955e-06, - "loss": 0.6543, + "loss": 0.6552, "step": 220 }, { "epoch": 3.0016190476190476, - "grad_norm": 74.22434997558594, + "grad_norm": 6.97869348526001, "learning_rate": 2.1904761904761908e-06, - "loss": 0.6289, + "loss": 0.5872, "step": 230 }, { "epoch": 3.0025714285714287, - "grad_norm": 48.04328155517578, + "grad_norm": 7.152403354644775, "learning_rate": 2.285714285714286e-06, - "loss": 0.6881, + "loss": 0.6369, "step": 240 }, { "epoch": 3.0035238095238097, - "grad_norm": 71.21366119384766, + "grad_norm": 7.3148298263549805, "learning_rate": 2.380952380952381e-06, - "loss": 0.5733, + "loss": 0.5187, "step": 250 }, { "epoch": 3.0044761904761903, - "grad_norm": 40.80241775512695, + "grad_norm": 7.124256610870361, "learning_rate": 2.4761904761904764e-06, - "loss": 0.6708, + "loss": 0.6672, "step": 260 }, { "epoch": 3.0054285714285713, - "grad_norm": 10.791142463684082, + "grad_norm": 6.500947952270508, "learning_rate": 2.571428571428571e-06, - "loss": 0.7158, + "loss": 0.6748, "step": 270 }, { "epoch": 3.0063809523809524, - "grad_norm": 7.554626941680908, + "grad_norm": 10.486983299255371, "learning_rate": 2.666666666666667e-06, - "loss": 0.6797, + "loss": 0.6524, "step": 280 }, { "epoch": 3.006761904761905, "eval_accuracy": 0.6756756756756757, - "eval_loss": 0.6662525534629822, - "eval_runtime": 9.044, - "eval_samples_per_second": 8.182, - "eval_steps_per_second": 2.101, + "eval_loss": 0.6090587377548218, + "eval_runtime": 16.4336, + "eval_samples_per_second": 4.503, + "eval_steps_per_second": 1.156, "step": 284 }, { "epoch": 4.000571428571429, - "grad_norm": 44.85297393798828, + "grad_norm": 10.162851333618164, "learning_rate": 2.7619047619047625e-06, - "loss": 0.661, + "loss": 0.6116, "step": 290 }, { "epoch": 4.0015238095238095, - "grad_norm": 14.234319686889648, + "grad_norm": 6.945367336273193, "learning_rate": 2.8571428571428573e-06, - "loss": 0.6599, + "loss": 0.6209, "step": 300 }, { "epoch": 4.00247619047619, - "grad_norm": 24.012731552124023, + "grad_norm": 8.757481575012207, "learning_rate": 2.9523809523809525e-06, - "loss": 0.6375, + "loss": 0.595, "step": 310 }, { "epoch": 4.003428571428572, - "grad_norm": 15.084027290344238, + "grad_norm": 6.791728496551514, "learning_rate": 3.047619047619048e-06, - "loss": 0.6796, + "loss": 0.5939, "step": 320 }, { "epoch": 4.004380952380952, - "grad_norm": 15.230443000793457, + "grad_norm": 11.641304969787598, "learning_rate": 3.142857142857143e-06, - "loss": 0.7003, + "loss": 0.7177, "step": 330 }, { "epoch": 4.005333333333334, - "grad_norm": 40.75351333618164, + "grad_norm": 8.894887924194336, "learning_rate": 3.2380952380952385e-06, - "loss": 0.6948, + "loss": 0.5451, "step": 340 }, { "epoch": 4.006285714285714, - "grad_norm": 10.619876861572266, + "grad_norm": 10.148953437805176, "learning_rate": 3.3333333333333333e-06, - "loss": 0.6391, + "loss": 0.5611, "step": 350 }, { "epoch": 4.0067619047619045, "eval_accuracy": 0.6621621621621622, - "eval_loss": 0.6381068825721741, - "eval_runtime": 9.0359, - "eval_samples_per_second": 8.19, - "eval_steps_per_second": 2.103, + "eval_loss": 0.5565000176429749, + "eval_runtime": 16.1354, + "eval_samples_per_second": 4.586, + "eval_steps_per_second": 1.178, "step": 355 }, { "epoch": 5.00047619047619, - "grad_norm": 34.753414154052734, + "grad_norm": 16.05340576171875, "learning_rate": 3.428571428571429e-06, - "loss": 0.7451, + "loss": 0.5934, "step": 360 }, { "epoch": 5.001428571428572, - "grad_norm": 30.29311752319336, + "grad_norm": 21.09893798828125, "learning_rate": 3.523809523809524e-06, - "loss": 0.6682, + "loss": 0.599, "step": 370 }, { "epoch": 5.002380952380952, - "grad_norm": 12.413793563842773, + "grad_norm": 18.13679313659668, "learning_rate": 3.6190476190476194e-06, - "loss": 0.7035, + "loss": 0.5421, "step": 380 }, { "epoch": 5.003333333333333, - "grad_norm": 5.764993667602539, + "grad_norm": 12.51164722442627, "learning_rate": 3.7142857142857146e-06, - "loss": 0.6815, + "loss": 0.5152, "step": 390 }, { "epoch": 5.0042857142857144, - "grad_norm": 18.768259048461914, + "grad_norm": 23.869234085083008, "learning_rate": 3.80952380952381e-06, - "loss": 0.6004, + "loss": 0.4998, "step": 400 }, { "epoch": 5.005238095238095, - "grad_norm": 19.606473922729492, + "grad_norm": 14.866220474243164, "learning_rate": 3.9047619047619055e-06, - "loss": 0.6766, + "loss": 0.5252, "step": 410 }, { "epoch": 5.0061904761904765, - "grad_norm": 6.300143241882324, + "grad_norm": 9.281255722045898, "learning_rate": 4.000000000000001e-06, - "loss": 0.643, + "loss": 0.4274, "step": 420 }, { "epoch": 5.0067619047619045, - "eval_accuracy": 0.6621621621621622, - "eval_loss": 0.6439703702926636, - "eval_runtime": 8.1202, - "eval_samples_per_second": 9.113, - "eval_steps_per_second": 2.34, + "eval_accuracy": 0.7162162162162162, + "eval_loss": 0.5154068470001221, + "eval_runtime": 16.3563, + "eval_samples_per_second": 4.524, + "eval_steps_per_second": 1.162, "step": 426 }, { "epoch": 6.000380952380953, - "grad_norm": 22.73067855834961, + "grad_norm": 8.937905311584473, "learning_rate": 4.095238095238096e-06, - "loss": 0.7727, + "loss": 0.6578, "step": 430 }, { "epoch": 6.001333333333333, - "grad_norm": 24.300537109375, + "grad_norm": 10.171916007995605, "learning_rate": 4.190476190476191e-06, - "loss": 0.7116, + "loss": 0.5081, "step": 440 }, { "epoch": 6.002285714285715, - "grad_norm": 5.071887493133545, + "grad_norm": 23.720195770263672, "learning_rate": 4.2857142857142855e-06, - "loss": 0.6966, + "loss": 0.5327, "step": 450 }, { "epoch": 6.003238095238095, - "grad_norm": 7.089809417724609, + "grad_norm": 19.935134887695312, "learning_rate": 4.3809523809523815e-06, - "loss": 0.6507, + "loss": 0.4616, "step": 460 }, { "epoch": 6.004190476190476, - "grad_norm": 6.718177795410156, + "grad_norm": 13.191886901855469, "learning_rate": 4.476190476190477e-06, - "loss": 0.7082, + "loss": 0.3997, "step": 470 }, { "epoch": 6.005142857142857, - "grad_norm": 5.669395923614502, + "grad_norm": 34.281044006347656, "learning_rate": 4.571428571428572e-06, - "loss": 0.661, + "loss": 0.5115, "step": 480 }, { "epoch": 6.006095238095238, - "grad_norm": 6.64644718170166, + "grad_norm": 15.229226112365723, "learning_rate": 4.666666666666667e-06, - "loss": 0.6763, + "loss": 0.4797, "step": 490 }, { "epoch": 6.0067619047619045, - "eval_accuracy": 0.6621621621621622, - "eval_loss": 0.6331241726875305, - "eval_runtime": 8.112, - "eval_samples_per_second": 9.122, - "eval_steps_per_second": 2.342, + "eval_accuracy": 0.6756756756756757, + "eval_loss": 0.5644029378890991, + "eval_runtime": 15.9353, + "eval_samples_per_second": 4.644, + "eval_steps_per_second": 1.192, "step": 497 }, { "epoch": 7.000285714285714, - "grad_norm": 6.6726765632629395, + "grad_norm": 12.772289276123047, "learning_rate": 4.761904761904762e-06, - "loss": 0.6528, + "loss": 0.3397, "step": 500 }, { "epoch": 7.0012380952380955, - "grad_norm": 6.899139881134033, + "grad_norm": 30.241724014282227, "learning_rate": 4.857142857142858e-06, - "loss": 0.6098, + "loss": 0.4096, "step": 510 }, { "epoch": 7.002190476190476, - "grad_norm": 8.79203987121582, + "grad_norm": 28.286476135253906, "learning_rate": 4.952380952380953e-06, - "loss": 0.788, + "loss": 0.4814, "step": 520 }, { "epoch": 7.003142857142858, - "grad_norm": 4.763568878173828, + "grad_norm": 43.063167572021484, "learning_rate": 5.047619047619048e-06, - "loss": 0.6466, + "loss": 0.4013, "step": 530 }, { "epoch": 7.004095238095238, - "grad_norm": 7.255838871002197, + "grad_norm": 12.676410675048828, "learning_rate": 5.142857142857142e-06, - "loss": 0.6609, + "loss": 0.5215, "step": 540 }, { "epoch": 7.005047619047619, - "grad_norm": 6.9776434898376465, + "grad_norm": 33.401485443115234, "learning_rate": 5.2380952380952384e-06, - "loss": 0.7079, + "loss": 0.3229, "step": 550 }, { "epoch": 7.006, - "grad_norm": 4.618414878845215, + "grad_norm": 29.031524658203125, "learning_rate": 5.333333333333334e-06, - "loss": 0.6547, + "loss": 0.3758, "step": 560 }, { "epoch": 7.0067619047619045, - "eval_accuracy": 0.6621621621621622, - "eval_loss": 0.6474675536155701, - "eval_runtime": 8.0891, - "eval_samples_per_second": 9.148, - "eval_steps_per_second": 2.349, + "eval_accuracy": 0.7837837837837838, + "eval_loss": 0.49420419335365295, + "eval_runtime": 15.2009, + "eval_samples_per_second": 4.868, + "eval_steps_per_second": 1.25, "step": 568 }, { "epoch": 8.000190476190475, - "grad_norm": 10.959053993225098, + "grad_norm": 60.28892517089844, "learning_rate": 5.428571428571429e-06, - "loss": 0.7124, + "loss": 0.3936, "step": 570 }, { "epoch": 8.001142857142858, - "grad_norm": 7.726167678833008, + "grad_norm": 0.988732635974884, "learning_rate": 5.523809523809525e-06, - "loss": 0.5254, + "loss": 0.3985, "step": 580 }, { "epoch": 8.002095238095238, - "grad_norm": 5.0308427810668945, + "grad_norm": 22.0004940032959, "learning_rate": 5.619047619047619e-06, - "loss": 0.6546, + "loss": 0.8432, "step": 590 }, { "epoch": 8.003047619047619, - "grad_norm": 18.555625915527344, + "grad_norm": 6.368250846862793, "learning_rate": 5.7142857142857145e-06, - "loss": 0.7098, + "loss": 0.3863, "step": 600 }, { "epoch": 8.004, - "grad_norm": 13.083730697631836, + "grad_norm": 44.51639175415039, "learning_rate": 5.8095238095238106e-06, - "loss": 0.6742, + "loss": 0.2897, "step": 610 }, { "epoch": 8.00495238095238, - "grad_norm": 2.0079691410064697, + "grad_norm": 48.01280975341797, "learning_rate": 5.904761904761905e-06, - "loss": 0.682, + "loss": 0.2577, "step": 620 }, { "epoch": 8.005904761904763, - "grad_norm": 9.821672439575195, + "grad_norm": 37.27678680419922, "learning_rate": 6e-06, - "loss": 0.6751, + "loss": 0.4243, "step": 630 }, { "epoch": 8.006761904761905, - "eval_accuracy": 0.6621621621621622, - "eval_loss": 0.6370192170143127, - "eval_runtime": 8.3221, - "eval_samples_per_second": 8.892, - "eval_steps_per_second": 2.283, + "eval_accuracy": 0.7567567567567568, + "eval_loss": 0.525236189365387, + "eval_runtime": 15.1435, + "eval_samples_per_second": 4.887, + "eval_steps_per_second": 1.255, "step": 639 }, { "epoch": 9.000095238095238, - "grad_norm": 11.729015350341797, + "grad_norm": 15.225837707519531, "learning_rate": 6.095238095238096e-06, - "loss": 0.7438, + "loss": 0.3945, "step": 640 }, { "epoch": 9.001047619047618, - "grad_norm": 2.8617539405822754, + "grad_norm": 15.552347183227539, "learning_rate": 6.1904761904761914e-06, - "loss": 0.6705, + "loss": 0.4775, "step": 650 }, { "epoch": 9.002, - "grad_norm": 10.742891311645508, + "grad_norm": 47.491615295410156, "learning_rate": 6.285714285714286e-06, - "loss": 0.663, + "loss": 0.3006, "step": 660 }, { "epoch": 9.002952380952381, - "grad_norm": 4.813956260681152, + "grad_norm": 37.66072463989258, "learning_rate": 6.380952380952381e-06, - "loss": 0.5673, + "loss": 0.5796, "step": 670 }, { "epoch": 9.003904761904762, - "grad_norm": 13.185957908630371, + "grad_norm": 53.462364196777344, "learning_rate": 6.476190476190477e-06, - "loss": 0.6924, + "loss": 0.473, "step": 680 }, { "epoch": 9.004857142857142, - "grad_norm": 5.769442558288574, + "grad_norm": 5.171151638031006, "learning_rate": 6.571428571428572e-06, - "loss": 0.7345, + "loss": 0.2178, "step": 690 }, { "epoch": 9.005809523809523, - "grad_norm": 6.328913688659668, + "grad_norm": 41.881038665771484, "learning_rate": 6.666666666666667e-06, - "loss": 0.6407, + "loss": 0.5717, "step": 700 }, { "epoch": 9.006761904761905, - "grad_norm": 9.461712837219238, + "grad_norm": 319.6321716308594, "learning_rate": 6.761904761904763e-06, - "loss": 0.6847, + "loss": 0.5133, "step": 710 }, { "epoch": 9.006761904761905, - "eval_accuracy": 0.6621621621621622, - "eval_loss": 0.6343960165977478, - "eval_runtime": 8.9405, - "eval_samples_per_second": 8.277, - "eval_steps_per_second": 2.125, + "eval_accuracy": 0.6756756756756757, + "eval_loss": 0.6872884631156921, + "eval_runtime": 15.7818, + "eval_samples_per_second": 4.689, + "eval_steps_per_second": 1.204, "step": 710 }, { "epoch": 10.00095238095238, - "grad_norm": 1.5953887701034546, + "grad_norm": 32.255435943603516, "learning_rate": 6.857142857142858e-06, - "loss": 0.6736, + "loss": 0.3737, "step": 720 }, { "epoch": 10.001904761904761, - "grad_norm": 2.3095293045043945, + "grad_norm": 16.93897819519043, "learning_rate": 6.952380952380952e-06, - "loss": 0.6908, + "loss": 0.211, "step": 730 }, { "epoch": 10.002857142857144, - "grad_norm": 7.062539100646973, + "grad_norm": 59.26026153564453, "learning_rate": 7.047619047619048e-06, - "loss": 0.6194, + "loss": 0.3487, "step": 740 }, { "epoch": 10.003809523809524, - "grad_norm": 3.7363109588623047, + "grad_norm": 1.3887319564819336, "learning_rate": 7.1428571428571436e-06, - "loss": 0.743, + "loss": 0.1528, "step": 750 }, { "epoch": 10.004761904761905, - "grad_norm": 2.502424955368042, + "grad_norm": 4.799934387207031, "learning_rate": 7.238095238095239e-06, - "loss": 0.6028, + "loss": 0.3359, "step": 760 }, { "epoch": 10.005714285714285, - "grad_norm": 6.453869819641113, + "grad_norm": 60.12313461303711, "learning_rate": 7.333333333333333e-06, - "loss": 0.5983, + "loss": 0.4078, "step": 770 }, { "epoch": 10.006666666666666, - "grad_norm": 1.8229882717132568, + "grad_norm": 86.35945892333984, "learning_rate": 7.428571428571429e-06, - "loss": 0.7185, + "loss": 0.3709, "step": 780 }, { "epoch": 10.006761904761905, - "eval_accuracy": 0.6621621621621622, - "eval_loss": 0.6261557340621948, - "eval_runtime": 9.0473, - "eval_samples_per_second": 8.179, - "eval_steps_per_second": 2.1, + "eval_accuracy": 0.7567567567567568, + "eval_loss": 0.6554831862449646, + "eval_runtime": 15.3209, + "eval_samples_per_second": 4.83, + "eval_steps_per_second": 1.24, "step": 781 }, { "epoch": 11.000857142857143, - "grad_norm": 4.57699728012085, + "grad_norm": 6.803892612457275, "learning_rate": 7.523809523809524e-06, - "loss": 0.6964, + "loss": 0.1958, "step": 790 }, { "epoch": 11.001809523809523, - "grad_norm": 6.128643035888672, + "grad_norm": 101.62811279296875, "learning_rate": 7.61904761904762e-06, - "loss": 0.6993, + "loss": 0.205, "step": 800 }, { "epoch": 11.002761904761904, - "grad_norm": 7.555062294006348, + "grad_norm": 41.35865783691406, "learning_rate": 7.714285714285716e-06, - "loss": 0.6986, + "loss": 0.3161, "step": 810 }, { "epoch": 11.003714285714286, - "grad_norm": 9.311781883239746, + "grad_norm": 5.086019992828369, "learning_rate": 7.809523809523811e-06, - "loss": 0.5926, + "loss": 0.2116, "step": 820 }, { "epoch": 11.004666666666667, - "grad_norm": 3.417293071746826, + "grad_norm": 79.2950210571289, "learning_rate": 7.904761904761904e-06, - "loss": 0.7427, + "loss": 0.3454, "step": 830 }, { "epoch": 11.005619047619048, - "grad_norm": 2.279402732849121, + "grad_norm": 6.022214412689209, "learning_rate": 8.000000000000001e-06, - "loss": 0.5867, + "loss": 0.1427, "step": 840 }, { "epoch": 11.006571428571428, - "grad_norm": 4.921882152557373, + "grad_norm": 5.330665588378906, "learning_rate": 8.095238095238097e-06, - "loss": 0.6961, + "loss": 0.2793, "step": 850 }, { "epoch": 11.006761904761905, - "eval_accuracy": 0.6621621621621622, - "eval_loss": 0.6510185599327087, - "eval_runtime": 8.7594, - "eval_samples_per_second": 8.448, - "eval_steps_per_second": 2.169, + "eval_accuracy": 0.7567567567567568, + "eval_loss": 0.7139844298362732, + "eval_runtime": 14.3175, + "eval_samples_per_second": 5.169, + "eval_steps_per_second": 1.327, "step": 852 }, { "epoch": 12.000761904761905, - "grad_norm": 1.0956617593765259, + "grad_norm": 3.918455123901367, "learning_rate": 8.190476190476192e-06, - "loss": 0.5483, + "loss": 0.1305, "step": 860 }, { "epoch": 12.001714285714286, - "grad_norm": 9.6234712600708, + "grad_norm": 208.3162384033203, "learning_rate": 8.285714285714287e-06, - "loss": 0.5666, + "loss": 0.2397, "step": 870 }, { "epoch": 12.002666666666666, - "grad_norm": 9.854573249816895, + "grad_norm": 90.90999603271484, "learning_rate": 8.380952380952382e-06, - "loss": 0.6737, + "loss": 0.1608, "step": 880 }, { "epoch": 12.003619047619047, - "grad_norm": 5.818562984466553, + "grad_norm": 106.05206298828125, "learning_rate": 8.476190476190477e-06, - "loss": 0.7087, + "loss": 0.673, "step": 890 }, { "epoch": 12.00457142857143, - "grad_norm": 3.282334566116333, + "grad_norm": 72.45829772949219, "learning_rate": 8.571428571428571e-06, - "loss": 0.6953, + "loss": 0.5077, "step": 900 }, { "epoch": 12.00552380952381, - "grad_norm": 3.8181209564208984, + "grad_norm": 124.6170425415039, "learning_rate": 8.666666666666668e-06, - "loss": 0.6684, + "loss": 0.2456, "step": 910 }, { "epoch": 12.00647619047619, - "grad_norm": 1.6585626602172852, + "grad_norm": 36.30199432373047, "learning_rate": 8.761904761904763e-06, - "loss": 0.6824, + "loss": 0.6153, "step": 920 }, { "epoch": 12.006761904761905, - "eval_accuracy": 0.7162162162162162, - "eval_loss": 0.6236003041267395, - "eval_runtime": 8.6629, - "eval_samples_per_second": 8.542, - "eval_steps_per_second": 2.193, + "eval_accuracy": 0.6891891891891891, + "eval_loss": 1.3005902767181396, + "eval_runtime": 13.5492, + "eval_samples_per_second": 5.462, + "eval_steps_per_second": 1.402, "step": 923 }, { "epoch": 13.000666666666667, - "grad_norm": 7.976182460784912, + "grad_norm": 0.5888111591339111, "learning_rate": 8.857142857142858e-06, - "loss": 0.6174, + "loss": 0.1448, "step": 930 }, { "epoch": 13.001619047619048, - "grad_norm": 1.9851270914077759, + "grad_norm": 72.55310821533203, "learning_rate": 8.952380952380953e-06, - "loss": 0.6294, + "loss": 0.5699, "step": 940 }, { "epoch": 13.002571428571429, - "grad_norm": 2.4457576274871826, + "grad_norm": 21.676172256469727, "learning_rate": 9.047619047619049e-06, - "loss": 0.8039, + "loss": 0.5474, "step": 950 }, { "epoch": 13.00352380952381, - "grad_norm": 1.8087605237960815, + "grad_norm": 14.727178573608398, "learning_rate": 9.142857142857144e-06, - "loss": 0.7045, + "loss": 0.2944, "step": 960 }, { "epoch": 13.00447619047619, - "grad_norm": 5.169572830200195, + "grad_norm": 62.19205856323242, "learning_rate": 9.238095238095239e-06, - "loss": 0.651, + "loss": 0.5067, "step": 970 }, { "epoch": 13.005428571428572, - "grad_norm": 10.0159912109375, + "grad_norm": 36.74239730834961, "learning_rate": 9.333333333333334e-06, - "loss": 0.6547, + "loss": 0.1512, "step": 980 }, { "epoch": 13.006380952380953, - "grad_norm": 8.085495948791504, + "grad_norm": 162.0737762451172, "learning_rate": 9.42857142857143e-06, - "loss": 0.6169, + "loss": 0.7185, "step": 990 }, { "epoch": 13.006761904761905, - "eval_accuracy": 0.6621621621621622, - "eval_loss": 0.6484639048576355, - "eval_runtime": 9.2062, - "eval_samples_per_second": 8.038, - "eval_steps_per_second": 2.064, + "eval_accuracy": 0.6891891891891891, + "eval_loss": 1.6663051843643188, + "eval_runtime": 13.7203, + "eval_samples_per_second": 5.393, + "eval_steps_per_second": 1.385, "step": 994 }, { "epoch": 14.000571428571428, - "grad_norm": 5.588738441467285, + "grad_norm": 4.640463352203369, "learning_rate": 9.523809523809525e-06, - "loss": 0.498, + "loss": 0.508, "step": 1000 }, { "epoch": 14.00152380952381, - "grad_norm": 3.952481746673584, + "grad_norm": 99.73812103271484, "learning_rate": 9.61904761904762e-06, - "loss": 0.7836, + "loss": 0.4121, "step": 1010 }, { "epoch": 14.002476190476191, - "grad_norm": 3.872713327407837, + "grad_norm": 82.50711822509766, "learning_rate": 9.714285714285715e-06, - "loss": 0.5697, + "loss": 0.5418, "step": 1020 }, { "epoch": 14.003428571428572, - "grad_norm": 6.213507652282715, + "grad_norm": 8.718976974487305, "learning_rate": 9.80952380952381e-06, - "loss": 0.6853, + "loss": 0.3846, "step": 1030 }, { "epoch": 14.004380952380952, - "grad_norm": 6.559708595275879, + "grad_norm": 0.05186394974589348, "learning_rate": 9.904761904761906e-06, - "loss": 0.6426, + "loss": 0.084, "step": 1040 }, { "epoch": 14.005333333333333, - "grad_norm": 19.489044189453125, + "grad_norm": 149.41354370117188, "learning_rate": 1e-05, - "loss": 0.6337, + "loss": 0.5234, "step": 1050 }, { "epoch": 14.006285714285715, - "grad_norm": 8.181139945983887, + "grad_norm": 28.37025260925293, "learning_rate": 9.989417989417989e-06, - "loss": 0.6172, + "loss": 0.4609, "step": 1060 }, { "epoch": 14.006761904761905, - "eval_accuracy": 0.6621621621621622, - "eval_loss": 0.5577839016914368, - "eval_runtime": 9.136, - "eval_samples_per_second": 8.1, - "eval_steps_per_second": 2.08, + "eval_accuracy": 0.7162162162162162, + "eval_loss": 1.3522089719772339, + "eval_runtime": 13.8555, + "eval_samples_per_second": 5.341, + "eval_steps_per_second": 1.371, "step": 1065 }, { "epoch": 15.00047619047619, - "grad_norm": 3.909177541732788, + "grad_norm": 3.3527989387512207, "learning_rate": 9.97883597883598e-06, - "loss": 0.5922, + "loss": 0.0314, "step": 1070 }, { "epoch": 15.001428571428571, - "grad_norm": 11.445720672607422, + "grad_norm": 2.5399253368377686, "learning_rate": 9.968253968253969e-06, - "loss": 0.6071, + "loss": 0.6274, "step": 1080 }, { "epoch": 15.002380952380953, - "grad_norm": 5.92810583114624, + "grad_norm": 73.98735046386719, "learning_rate": 9.957671957671959e-06, - "loss": 0.5807, + "loss": 0.2495, "step": 1090 }, { "epoch": 15.003333333333334, - "grad_norm": 7.069459915161133, + "grad_norm": 0.07959811389446259, "learning_rate": 9.947089947089947e-06, - "loss": 0.8764, + "loss": 0.2372, "step": 1100 }, { "epoch": 15.004285714285714, - "grad_norm": 13.066001892089844, + "grad_norm": 0.023227743804454803, "learning_rate": 9.936507936507937e-06, - "loss": 0.6341, + "loss": 0.0695, "step": 1110 }, { "epoch": 15.005238095238095, - "grad_norm": 2.9064149856567383, + "grad_norm": 0.45850658416748047, "learning_rate": 9.925925925925927e-06, - "loss": 0.5788, + "loss": 0.3362, "step": 1120 }, { "epoch": 15.006190476190476, - "grad_norm": 18.05321502685547, + "grad_norm": 110.30821228027344, "learning_rate": 9.915343915343916e-06, - "loss": 0.6671, + "loss": 0.236, "step": 1130 }, { "epoch": 15.006761904761905, - "eval_accuracy": 0.6486486486486487, - "eval_loss": 0.5988394618034363, - "eval_runtime": 9.3059, - "eval_samples_per_second": 7.952, - "eval_steps_per_second": 2.042, + "eval_accuracy": 0.7297297297297297, + "eval_loss": 1.2227578163146973, + "eval_runtime": 13.4678, + "eval_samples_per_second": 5.495, + "eval_steps_per_second": 1.411, "step": 1136 }, { "epoch": 16.00038095238095, - "grad_norm": 13.60728645324707, + "grad_norm": 0.26170384883880615, "learning_rate": 9.904761904761906e-06, - "loss": 0.7159, + "loss": 0.1766, "step": 1140 }, { "epoch": 16.001333333333335, - "grad_norm": 11.699554443359375, + "grad_norm": 0.05248340964317322, "learning_rate": 9.894179894179896e-06, - "loss": 0.5976, + "loss": 0.1428, "step": 1150 }, { "epoch": 16.002285714285716, - "grad_norm": 6.164352893829346, + "grad_norm": 0.059930965304374695, "learning_rate": 9.883597883597884e-06, - "loss": 0.5959, + "loss": 0.1355, "step": 1160 }, { "epoch": 16.003238095238096, - "grad_norm": 5.747880935668945, + "grad_norm": 0.021113038063049316, "learning_rate": 9.873015873015874e-06, - "loss": 0.5771, + "loss": 0.3233, "step": 1170 }, { "epoch": 16.004190476190477, - "grad_norm": 16.106416702270508, + "grad_norm": 0.026357360184192657, "learning_rate": 9.862433862433864e-06, - "loss": 0.599, + "loss": 0.463, "step": 1180 }, { "epoch": 16.005142857142857, - "grad_norm": 14.334278106689453, + "grad_norm": 0.10321231931447983, "learning_rate": 9.851851851851852e-06, - "loss": 0.6584, + "loss": 0.0118, "step": 1190 }, { "epoch": 16.006095238095238, - "grad_norm": 8.10584831237793, + "grad_norm": 0.0149616077542305, "learning_rate": 9.841269841269842e-06, - "loss": 0.6063, + "loss": 0.0519, "step": 1200 }, { "epoch": 16.006761904761905, - "eval_accuracy": 0.7027027027027027, - "eval_loss": 0.5370726585388184, - "eval_runtime": 9.9546, - "eval_samples_per_second": 7.434, - "eval_steps_per_second": 1.909, + "eval_accuracy": 0.7567567567567568, + "eval_loss": 1.0972933769226074, + "eval_runtime": 13.5211, + "eval_samples_per_second": 5.473, + "eval_steps_per_second": 1.405, "step": 1207 }, { "epoch": 17.000285714285713, - "grad_norm": 7.525519847869873, + "grad_norm": 0.028757641091942787, "learning_rate": 9.830687830687832e-06, - "loss": 0.4826, + "loss": 0.259, "step": 1210 }, { "epoch": 17.001238095238094, - "grad_norm": 22.375347137451172, + "grad_norm": 20.525066375732422, "learning_rate": 9.82010582010582e-06, - "loss": 0.5952, + "loss": 0.037, "step": 1220 }, { "epoch": 17.002190476190478, - "grad_norm": 18.709026336669922, + "grad_norm": 0.12178980559110641, "learning_rate": 9.80952380952381e-06, - "loss": 0.5568, + "loss": 0.2156, "step": 1230 }, { "epoch": 17.00314285714286, - "grad_norm": 14.8303861618042, + "grad_norm": 25.35184097290039, "learning_rate": 9.7989417989418e-06, - "loss": 0.635, + "loss": 0.181, "step": 1240 }, { "epoch": 17.00409523809524, - "grad_norm": 18.453710556030273, + "grad_norm": 27.569835662841797, "learning_rate": 9.788359788359789e-06, - "loss": 0.5748, + "loss": 0.2445, "step": 1250 }, { "epoch": 17.00504761904762, - "grad_norm": 15.447833061218262, + "grad_norm": 0.11105721443891525, "learning_rate": 9.777777777777779e-06, - "loss": 0.517, + "loss": 0.2643, "step": 1260 }, { "epoch": 17.006, - "grad_norm": 17.667949676513672, + "grad_norm": 3.3096115589141846, "learning_rate": 9.767195767195769e-06, - "loss": 0.4294, + "loss": 0.0026, "step": 1270 }, { "epoch": 17.006761904761905, - "eval_accuracy": 0.6621621621621622, - "eval_loss": 0.9391337037086487, - "eval_runtime": 8.9542, - "eval_samples_per_second": 8.264, - "eval_steps_per_second": 2.122, + "eval_accuracy": 0.7432432432432432, + "eval_loss": 1.4475635290145874, + "eval_runtime": 13.5861, + "eval_samples_per_second": 5.447, + "eval_steps_per_second": 1.398, "step": 1278 }, { "epoch": 18.000190476190475, - "grad_norm": 16.773576736450195, + "grad_norm": 206.65469360351562, "learning_rate": 9.756613756613757e-06, - "loss": 0.7563, + "loss": 0.4347, "step": 1280 }, { "epoch": 18.001142857142856, - "grad_norm": 5.612834930419922, + "grad_norm": 0.15358828008174896, "learning_rate": 9.746031746031747e-06, - "loss": 0.7454, + "loss": 0.0009, "step": 1290 }, { "epoch": 18.002095238095237, - "grad_norm": 11.041013717651367, + "grad_norm": 109.42031860351562, "learning_rate": 9.735449735449735e-06, - "loss": 0.6223, + "loss": 0.1551, "step": 1300 }, { "epoch": 18.00304761904762, - "grad_norm": 24.63053321838379, + "grad_norm": 0.013478557579219341, "learning_rate": 9.724867724867725e-06, - "loss": 0.518, + "loss": 0.1445, "step": 1310 }, { "epoch": 18.004, - "grad_norm": 0.9231998324394226, + "grad_norm": 0.012817839160561562, "learning_rate": 9.714285714285715e-06, - "loss": 0.2842, + "loss": 0.0023, "step": 1320 }, { "epoch": 18.004952380952382, - "grad_norm": 7.218321323394775, + "grad_norm": 0.17729417979717255, "learning_rate": 9.703703703703703e-06, - "loss": 0.8021, + "loss": 0.0118, "step": 1330 }, { "epoch": 18.005904761904763, - "grad_norm": 9.65711784362793, + "grad_norm": 0.5205228328704834, "learning_rate": 9.693121693121693e-06, - "loss": 0.5702, + "loss": 0.357, "step": 1340 }, { "epoch": 18.006761904761905, - "eval_accuracy": 0.6756756756756757, - "eval_loss": 0.5392073392868042, - "eval_runtime": 9.3427, - "eval_samples_per_second": 7.921, - "eval_steps_per_second": 2.034, + "eval_accuracy": 0.7432432432432432, + "eval_loss": 1.448710560798645, + "eval_runtime": 13.6656, + "eval_samples_per_second": 5.415, + "eval_steps_per_second": 1.39, "step": 1349 }, { "epoch": 19.000095238095238, - "grad_norm": 21.132457733154297, + "grad_norm": 0.02125757932662964, "learning_rate": 9.682539682539683e-06, - "loss": 0.3912, + "loss": 0.0994, "step": 1350 }, { "epoch": 19.00104761904762, - "grad_norm": 4.6318840980529785, + "grad_norm": 0.006553493440151215, "learning_rate": 9.671957671957672e-06, - "loss": 0.4698, + "loss": 0.3126, "step": 1360 }, { "epoch": 19.002, - "grad_norm": 8.19008731842041, + "grad_norm": 0.07267153263092041, "learning_rate": 9.661375661375663e-06, - "loss": 0.5323, + "loss": 0.4631, "step": 1370 }, { "epoch": 19.00295238095238, - "grad_norm": 13.09170150756836, + "grad_norm": 0.1298568695783615, "learning_rate": 9.650793650793652e-06, - "loss": 0.4009, + "loss": 0.1857, "step": 1380 }, { "epoch": 19.003904761904764, - "grad_norm": 19.89316177368164, + "grad_norm": 163.23121643066406, "learning_rate": 9.64021164021164e-06, - "loss": 0.6895, + "loss": 0.0433, "step": 1390 }, { "epoch": 19.004857142857144, - "grad_norm": 13.221324920654297, + "grad_norm": 187.11117553710938, "learning_rate": 9.62962962962963e-06, - "loss": 0.4602, + "loss": 0.1836, "step": 1400 }, { "epoch": 19.005809523809525, - "grad_norm": 14.639007568359375, + "grad_norm": 193.7169189453125, "learning_rate": 9.61904761904762e-06, - "loss": 0.5751, + "loss": 0.104, "step": 1410 }, { "epoch": 19.006761904761905, - "grad_norm": 8.934476852416992, + "grad_norm": 0.1640291064977646, "learning_rate": 9.60846560846561e-06, - "loss": 0.5217, + "loss": 0.4262, "step": 1420 }, { "epoch": 19.006761904761905, - "eval_accuracy": 0.6891891891891891, - "eval_loss": 0.5672855973243713, - "eval_runtime": 9.6627, - "eval_samples_per_second": 7.658, - "eval_steps_per_second": 1.966, + "eval_accuracy": 0.7837837837837838, + "eval_loss": 1.1604030132293701, + "eval_runtime": 14.6483, + "eval_samples_per_second": 5.052, + "eval_steps_per_second": 1.297, "step": 1420 }, { "epoch": 20.00095238095238, - "grad_norm": 20.171602249145508, + "grad_norm": 0.09950070828199387, "learning_rate": 9.597883597883598e-06, - "loss": 0.4833, + "loss": 0.282, "step": 1430 }, { "epoch": 20.00190476190476, - "grad_norm": 38.0274543762207, + "grad_norm": 0.9464425444602966, "learning_rate": 9.587301587301588e-06, - "loss": 0.4942, + "loss": 0.283, "step": 1440 }, { "epoch": 20.002857142857142, - "grad_norm": 29.71581268310547, + "grad_norm": 0.007702260743826628, "learning_rate": 9.576719576719578e-06, - "loss": 0.3247, + "loss": 0.191, "step": 1450 }, { "epoch": 20.003809523809522, - "grad_norm": 21.187746047973633, + "grad_norm": 0.018454020842909813, "learning_rate": 9.566137566137567e-06, - "loss": 0.6404, + "loss": 0.023, "step": 1460 }, { "epoch": 20.004761904761907, - "grad_norm": 28.18136215209961, + "grad_norm": 0.07745273411273956, "learning_rate": 9.555555555555556e-06, - "loss": 0.6609, + "loss": 0.1185, "step": 1470 }, { "epoch": 20.005714285714287, - "grad_norm": 17.435962677001953, + "grad_norm": 0.00807939562946558, "learning_rate": 9.544973544973546e-06, - "loss": 0.3783, + "loss": 0.0006, "step": 1480 }, { "epoch": 20.006666666666668, - "grad_norm": 22.762752532958984, + "grad_norm": 0.011443381197750568, "learning_rate": 9.534391534391535e-06, - "loss": 0.4067, + "loss": 0.0021, "step": 1490 }, { "epoch": 20.006761904761905, - "eval_accuracy": 0.6891891891891891, - "eval_loss": 0.6191511154174805, - "eval_runtime": 10.1554, - "eval_samples_per_second": 7.287, - "eval_steps_per_second": 1.871, + "eval_accuracy": 0.7027027027027027, + "eval_loss": 1.771959900856018, + "eval_runtime": 14.7868, + "eval_samples_per_second": 5.004, + "eval_steps_per_second": 1.285, "step": 1491 }, { "epoch": 21.000857142857143, - "grad_norm": 6.484818458557129, + "grad_norm": 1.0574586391448975, "learning_rate": 9.523809523809525e-06, - "loss": 0.3049, + "loss": 0.0004, "step": 1500 }, { "epoch": 21.001809523809523, - "grad_norm": 17.38741683959961, + "grad_norm": 0.1903315633535385, "learning_rate": 9.513227513227515e-06, - "loss": 0.4772, + "loss": 0.2308, "step": 1510 }, { "epoch": 21.002761904761904, - "grad_norm": 13.665902137756348, + "grad_norm": 0.011690843850374222, "learning_rate": 9.502645502645503e-06, - "loss": 0.6429, + "loss": 0.0002, "step": 1520 }, { "epoch": 21.003714285714285, - "grad_norm": 23.571544647216797, + "grad_norm": 0.03922194615006447, "learning_rate": 9.492063492063493e-06, - "loss": 0.4749, + "loss": 0.2452, "step": 1530 }, { "epoch": 21.004666666666665, - "grad_norm": 58.571563720703125, + "grad_norm": 0.013964025303721428, "learning_rate": 9.481481481481483e-06, - "loss": 0.4673, + "loss": 0.0625, "step": 1540 }, { "epoch": 21.005619047619046, - "grad_norm": 5.908404350280762, + "grad_norm": 0.022155698388814926, "learning_rate": 9.470899470899471e-06, - "loss": 0.4881, + "loss": 0.0013, "step": 1550 }, { "epoch": 21.00657142857143, - "grad_norm": 5.744145393371582, + "grad_norm": 0.0031711491756141186, "learning_rate": 9.460317460317461e-06, - "loss": 0.2278, + "loss": 0.0132, "step": 1560 }, { "epoch": 21.006761904761905, - "eval_accuracy": 0.6621621621621622, - "eval_loss": 0.8933660387992859, - "eval_runtime": 9.5347, - "eval_samples_per_second": 7.761, - "eval_steps_per_second": 1.993, + "eval_accuracy": 0.7297297297297297, + "eval_loss": 1.7387584447860718, + "eval_runtime": 14.2802, + "eval_samples_per_second": 5.182, + "eval_steps_per_second": 1.331, "step": 1562 }, { "epoch": 22.000761904761905, - "grad_norm": 5.076817989349365, + "grad_norm": 0.003926535602658987, "learning_rate": 9.449735449735451e-06, - "loss": 0.3829, + "loss": 0.0005, "step": 1570 }, { "epoch": 22.001714285714286, - "grad_norm": 32.336246490478516, + "grad_norm": 0.6359225511550903, "learning_rate": 9.43915343915344e-06, - "loss": 0.3684, + "loss": 0.0741, "step": 1580 }, { "epoch": 22.002666666666666, - "grad_norm": 54.73106002807617, + "grad_norm": 0.010339989326894283, "learning_rate": 9.42857142857143e-06, - "loss": 0.8032, + "loss": 0.1656, "step": 1590 }, { "epoch": 22.003619047619047, - "grad_norm": 35.00995635986328, + "grad_norm": 0.007642359938472509, "learning_rate": 9.417989417989418e-06, - "loss": 0.6397, + "loss": 0.0215, "step": 1600 }, { "epoch": 22.004571428571428, - "grad_norm": 45.27841567993164, + "grad_norm": 0.006055203732103109, "learning_rate": 9.407407407407408e-06, - "loss": 0.2019, + "loss": 0.0006, "step": 1610 }, { "epoch": 22.005523809523808, - "grad_norm": 1.6123958826065063, + "grad_norm": 0.01843923330307007, "learning_rate": 9.396825396825398e-06, - "loss": 0.3284, + "loss": 0.2335, "step": 1620 }, { "epoch": 22.00647619047619, - "grad_norm": 121.44664764404297, + "grad_norm": 0.009593057446181774, "learning_rate": 9.386243386243386e-06, - "loss": 0.7341, + "loss": 0.1451, "step": 1630 }, { "epoch": 22.006761904761905, - "eval_accuracy": 0.7027027027027027, - "eval_loss": 0.6416198015213013, - "eval_runtime": 10.744, - "eval_samples_per_second": 6.888, - "eval_steps_per_second": 1.768, + "eval_accuracy": 0.6891891891891891, + "eval_loss": 1.7954270839691162, + "eval_runtime": 14.1525, + "eval_samples_per_second": 5.229, + "eval_steps_per_second": 1.343, "step": 1633 }, { "epoch": 23.000666666666667, - "grad_norm": 45.13463592529297, + "grad_norm": 0.0035956420470029116, "learning_rate": 9.375661375661376e-06, - "loss": 0.6296, + "loss": 0.0002, "step": 1640 }, { "epoch": 23.001619047619048, - "grad_norm": 58.95222854614258, + "grad_norm": 0.006967821158468723, "learning_rate": 9.365079365079366e-06, - "loss": 0.598, + "loss": 0.0006, "step": 1650 }, { "epoch": 23.00257142857143, - "grad_norm": 46.480224609375, + "grad_norm": 0.00322016142308712, "learning_rate": 9.354497354497354e-06, - "loss": 0.4446, + "loss": 0.0927, "step": 1660 }, { "epoch": 23.00352380952381, - "grad_norm": 28.19147300720215, + "grad_norm": 81.37845611572266, "learning_rate": 9.343915343915344e-06, - "loss": 0.4959, + "loss": 0.0083, "step": 1670 }, { "epoch": 23.00447619047619, - "grad_norm": 29.306407928466797, + "grad_norm": 0.010911311022937298, "learning_rate": 9.333333333333334e-06, - "loss": 0.377, + "loss": 0.0152, "step": 1680 }, { "epoch": 23.00542857142857, - "grad_norm": 7.0006890296936035, + "grad_norm": 181.4998779296875, "learning_rate": 9.322751322751323e-06, - "loss": 0.4145, + "loss": 0.0175, "step": 1690 }, { "epoch": 23.00638095238095, - "grad_norm": 11.279711723327637, + "grad_norm": 0.01505933329463005, "learning_rate": 9.312169312169313e-06, - "loss": 0.4694, + "loss": 0.0099, "step": 1700 }, { "epoch": 23.006761904761905, - "eval_accuracy": 0.7297297297297297, - "eval_loss": 0.4830036163330078, - "eval_runtime": 10.1152, - "eval_samples_per_second": 7.316, - "eval_steps_per_second": 1.878, + "eval_accuracy": 0.7162162162162162, + "eval_loss": 2.161924123764038, + "eval_runtime": 14.9741, + "eval_samples_per_second": 4.942, + "eval_steps_per_second": 1.269, "step": 1704 }, { "epoch": 24.00057142857143, - "grad_norm": 56.81103515625, + "grad_norm": 0.12847761809825897, "learning_rate": 9.301587301587303e-06, - "loss": 0.409, + "loss": 0.1637, "step": 1710 }, { "epoch": 24.00152380952381, - "grad_norm": 18.599803924560547, + "grad_norm": 0.03343566879630089, "learning_rate": 9.291005291005291e-06, - "loss": 0.3474, + "loss": 0.0007, "step": 1720 }, { "epoch": 24.00247619047619, - "grad_norm": 0.6068246960639954, + "grad_norm": 0.0027026699390262365, "learning_rate": 9.280423280423281e-06, - "loss": 0.1888, + "loss": 0.0002, "step": 1730 }, { "epoch": 24.00342857142857, - "grad_norm": 30.965444564819336, + "grad_norm": 0.008595957420766354, "learning_rate": 9.26984126984127e-06, - "loss": 0.5208, + "loss": 0.3226, "step": 1740 }, { "epoch": 24.004380952380952, - "grad_norm": 35.79155731201172, + "grad_norm": 0.004393964074552059, "learning_rate": 9.25925925925926e-06, - "loss": 0.4365, + "loss": 0.0001, "step": 1750 }, { "epoch": 24.005333333333333, - "grad_norm": 1.8516825437545776, + "grad_norm": 0.0036259551998227835, "learning_rate": 9.248677248677249e-06, - "loss": 0.2648, + "loss": 0.0002, "step": 1760 }, { "epoch": 24.006285714285713, - "grad_norm": 97.02503204345703, + "grad_norm": 0.00916161946952343, "learning_rate": 9.238095238095239e-06, - "loss": 0.4655, + "loss": 0.0001, "step": 1770 }, { "epoch": 24.006761904761905, - "eval_accuracy": 0.6756756756756757, - "eval_loss": 0.8865509033203125, - "eval_runtime": 10.7089, - "eval_samples_per_second": 6.91, - "eval_steps_per_second": 1.774, + "eval_accuracy": 0.7297297297297297, + "eval_loss": 1.6523562669754028, + "eval_runtime": 14.6247, + "eval_samples_per_second": 5.06, + "eval_steps_per_second": 1.299, "step": 1775 }, { "epoch": 25.000476190476192, - "grad_norm": 43.367218017578125, + "grad_norm": 0.005314418114721775, "learning_rate": 9.227513227513229e-06, - "loss": 0.4146, + "loss": 0.0001, "step": 1780 }, { "epoch": 25.001428571428573, - "grad_norm": 1.342340350151062, + "grad_norm": 0.003808894893154502, "learning_rate": 9.216931216931217e-06, - "loss": 0.6989, + "loss": 0.2069, "step": 1790 }, { "epoch": 25.002380952380953, - "grad_norm": 29.697744369506836, + "grad_norm": 0.0019496126333251595, "learning_rate": 9.206349206349207e-06, - "loss": 0.4928, + "loss": 0.1818, "step": 1800 }, { "epoch": 25.003333333333334, - "grad_norm": 463.5457763671875, + "grad_norm": 55.36027908325195, "learning_rate": 9.195767195767197e-06, - "loss": 0.3011, + "loss": 0.2119, "step": 1810 }, { "epoch": 25.004285714285714, - "grad_norm": 6.916287899017334, + "grad_norm": 0.05727091431617737, "learning_rate": 9.185185185185186e-06, - "loss": 0.1268, + "loss": 0.0027, "step": 1820 }, { "epoch": 25.005238095238095, - "grad_norm": 41.14836120605469, + "grad_norm": 0.006919063627719879, "learning_rate": 9.174603174603176e-06, - "loss": 0.5743, + "loss": 0.058, "step": 1830 }, { "epoch": 25.006190476190476, - "grad_norm": 104.40483856201172, + "grad_norm": 0.41044607758522034, "learning_rate": 9.164021164021166e-06, - "loss": 0.433, + "loss": 0.0005, "step": 1840 }, { "epoch": 25.006761904761905, - "eval_accuracy": 0.7567567567567568, - "eval_loss": 0.8913034796714783, - "eval_runtime": 10.4531, - "eval_samples_per_second": 7.079, - "eval_steps_per_second": 1.818, + "eval_accuracy": 0.7162162162162162, + "eval_loss": 1.849861741065979, + "eval_runtime": 14.846, + "eval_samples_per_second": 4.985, + "eval_steps_per_second": 1.28, "step": 1846 }, { "epoch": 26.00038095238095, - "grad_norm": 214.91921997070312, + "grad_norm": 0.007502442691475153, "learning_rate": 9.153439153439154e-06, - "loss": 0.6528, + "loss": 0.0048, "step": 1850 }, { "epoch": 26.001333333333335, - "grad_norm": 23.635805130004883, + "grad_norm": 194.1666259765625, "learning_rate": 9.142857142857144e-06, - "loss": 0.2705, + "loss": 0.2684, "step": 1860 }, { "epoch": 26.002285714285716, - "grad_norm": 35.2174186706543, + "grad_norm": 0.04846418648958206, "learning_rate": 9.132275132275134e-06, - "loss": 0.2199, + "loss": 0.0003, "step": 1870 }, { "epoch": 26.003238095238096, - "grad_norm": 41.76405715942383, + "grad_norm": 0.0025523772928863764, "learning_rate": 9.121693121693122e-06, - "loss": 0.7814, + "loss": 0.0312, "step": 1880 }, { "epoch": 26.004190476190477, - "grad_norm": 1.089278221130371, + "grad_norm": 0.003015185473486781, "learning_rate": 9.111111111111112e-06, - "loss": 0.4366, + "loss": 0.0001, "step": 1890 }, { "epoch": 26.005142857142857, - "grad_norm": 35.63432693481445, + "grad_norm": 29.853809356689453, "learning_rate": 9.1005291005291e-06, - "loss": 0.1217, + "loss": 0.0028, "step": 1900 }, { "epoch": 26.006095238095238, - "grad_norm": 84.82231903076172, + "grad_norm": 0.006052908953279257, "learning_rate": 9.08994708994709e-06, - "loss": 0.4986, + "loss": 0.0388, "step": 1910 }, { "epoch": 26.006761904761905, "eval_accuracy": 0.7027027027027027, - "eval_loss": 1.0155631303787231, - "eval_runtime": 10.3708, - "eval_samples_per_second": 7.135, - "eval_steps_per_second": 1.832, + "eval_loss": 1.8791685104370117, + "eval_runtime": 14.4914, + "eval_samples_per_second": 5.106, + "eval_steps_per_second": 1.311, "step": 1917 }, { "epoch": 27.000285714285713, - "grad_norm": 49.51460647583008, + "grad_norm": 0.003093892941251397, "learning_rate": 9.07936507936508e-06, - "loss": 0.6562, + "loss": 0.4347, "step": 1920 }, { "epoch": 27.001238095238094, - "grad_norm": 14.74368953704834, + "grad_norm": 0.006142797879874706, "learning_rate": 9.068783068783069e-06, - "loss": 0.1784, + "loss": 0.0019, "step": 1930 }, { "epoch": 27.002190476190478, - "grad_norm": 2.938443660736084, + "grad_norm": 272.3396301269531, "learning_rate": 9.058201058201059e-06, - "loss": 0.5015, + "loss": 0.2855, "step": 1940 }, { "epoch": 27.00314285714286, - "grad_norm": 0.3131488263607025, + "grad_norm": 0.0017996703973039985, "learning_rate": 9.047619047619049e-06, - "loss": 0.2419, + "loss": 0.2704, "step": 1950 }, { "epoch": 27.00409523809524, - "grad_norm": 30.95822525024414, + "grad_norm": 0.03826872631907463, "learning_rate": 9.037037037037037e-06, - "loss": 0.7583, + "loss": 0.1214, "step": 1960 }, { "epoch": 27.00504761904762, - "grad_norm": 29.04351043701172, + "grad_norm": 15.44973087310791, "learning_rate": 9.026455026455027e-06, - "loss": 0.4038, + "loss": 0.17, "step": 1970 }, { "epoch": 27.006, - "grad_norm": 145.082275390625, + "grad_norm": 2.6301112174987793, "learning_rate": 9.015873015873017e-06, - "loss": 0.4063, + "loss": 0.1798, "step": 1980 }, { "epoch": 27.006761904761905, - "eval_accuracy": 0.6891891891891891, - "eval_loss": 1.1914985179901123, - "eval_runtime": 9.881, - "eval_samples_per_second": 7.489, - "eval_steps_per_second": 1.923, + "eval_accuracy": 0.7567567567567568, + "eval_loss": 1.2950594425201416, + "eval_runtime": 14.8518, + "eval_samples_per_second": 4.983, + "eval_steps_per_second": 1.279, "step": 1988 }, { "epoch": 28.000190476190475, - "grad_norm": 95.30940246582031, + "grad_norm": 0.3684341013431549, "learning_rate": 9.005291005291005e-06, - "loss": 0.3034, + "loss": 0.0012, "step": 1990 }, { "epoch": 28.001142857142856, - "grad_norm": 66.49639892578125, + "grad_norm": 0.007811195217072964, "learning_rate": 8.994708994708995e-06, - "loss": 0.5345, + "loss": 0.0002, "step": 2000 }, { "epoch": 28.002095238095237, - "grad_norm": 88.5883560180664, + "grad_norm": 0.016496405005455017, "learning_rate": 8.984126984126985e-06, - "loss": 0.1827, + "loss": 0.0557, "step": 2010 }, { "epoch": 28.00304761904762, - "grad_norm": 0.09640676528215408, + "grad_norm": 0.014805680140852928, "learning_rate": 8.973544973544973e-06, - "loss": 0.1681, + "loss": 0.0223, "step": 2020 }, { "epoch": 28.004, - "grad_norm": 0.20055362582206726, + "grad_norm": 16.467445373535156, "learning_rate": 8.962962962962963e-06, - "loss": 0.0884, + "loss": 0.0051, "step": 2030 }, { "epoch": 28.004952380952382, - "grad_norm": 186.1331329345703, + "grad_norm": 237.7123260498047, "learning_rate": 8.952380952380953e-06, - "loss": 0.2907, + "loss": 0.9651, "step": 2040 }, { "epoch": 28.005904761904763, - "grad_norm": 0.27754858136177063, + "grad_norm": 0.03144896402955055, "learning_rate": 8.941798941798942e-06, - "loss": 0.3722, + "loss": 0.2354, "step": 2050 }, { "epoch": 28.006761904761905, - "eval_accuracy": 0.6891891891891891, - "eval_loss": 1.3529411554336548, - "eval_runtime": 10.2689, - "eval_samples_per_second": 7.206, - "eval_steps_per_second": 1.85, + "eval_accuracy": 0.7297297297297297, + "eval_loss": 1.5408201217651367, + "eval_runtime": 15.1773, + "eval_samples_per_second": 4.876, + "eval_steps_per_second": 1.252, "step": 2059 }, { "epoch": 29.000095238095238, - "grad_norm": 0.07299116998910904, + "grad_norm": 0.04290107265114784, "learning_rate": 8.931216931216932e-06, - "loss": 0.299, + "loss": 0.0665, "step": 2060 }, { "epoch": 29.00104761904762, - "grad_norm": 2.538444995880127, + "grad_norm": 0.009639314375817776, "learning_rate": 8.920634920634922e-06, - "loss": 0.7479, + "loss": 0.1078, "step": 2070 }, { "epoch": 29.002, - "grad_norm": 91.43159484863281, + "grad_norm": 0.004252797923982143, "learning_rate": 8.910052910052912e-06, - "loss": 0.605, + "loss": 0.1316, "step": 2080 }, { "epoch": 29.00295238095238, - "grad_norm": 0.6341414451599121, + "grad_norm": 0.16179914772510529, "learning_rate": 8.8994708994709e-06, - "loss": 0.0868, + "loss": 0.0026, "step": 2090 }, { "epoch": 29.003904761904764, - "grad_norm": 14.649850845336914, + "grad_norm": 0.24769946932792664, "learning_rate": 8.888888888888888e-06, - "loss": 0.2434, + "loss": 0.0009, "step": 2100 }, { "epoch": 29.004857142857144, - "grad_norm": 127.68643951416016, + "grad_norm": 0.16514120995998383, "learning_rate": 8.87830687830688e-06, - "loss": 0.2415, + "loss": 0.4554, "step": 2110 }, { "epoch": 29.005809523809525, - "grad_norm": 0.0667656809091568, + "grad_norm": 0.010989787988364697, "learning_rate": 8.867724867724868e-06, - "loss": 0.3172, + "loss": 0.0976, "step": 2120 }, { "epoch": 29.006761904761905, - "grad_norm": 0.0887688621878624, + "grad_norm": 0.005337660200893879, "learning_rate": 8.857142857142858e-06, - "loss": 0.2947, + "loss": 0.0024, "step": 2130 }, { "epoch": 29.006761904761905, - "eval_accuracy": 0.6351351351351351, - "eval_loss": 1.6800689697265625, - "eval_runtime": 9.8949, - "eval_samples_per_second": 7.479, - "eval_steps_per_second": 1.92, + "eval_accuracy": 0.7162162162162162, + "eval_loss": 1.9223977327346802, + "eval_runtime": 15.8736, + "eval_samples_per_second": 4.662, + "eval_steps_per_second": 1.197, "step": 2130 }, { "epoch": 30.00095238095238, - "grad_norm": 0.06466253846883774, + "grad_norm": 0.07220576703548431, "learning_rate": 8.846560846560848e-06, - "loss": 0.492, + "loss": 0.0922, "step": 2140 }, { "epoch": 30.00190476190476, - "grad_norm": 0.06812964379787445, + "grad_norm": 0.008359185419976711, "learning_rate": 8.835978835978837e-06, - "loss": 0.265, + "loss": 0.1279, "step": 2150 }, { "epoch": 30.002857142857142, - "grad_norm": 100.26397705078125, + "grad_norm": 0.016845321282744408, "learning_rate": 8.825396825396827e-06, - "loss": 0.403, + "loss": 0.0001, "step": 2160 }, { "epoch": 30.003809523809522, - "grad_norm": 6.108546733856201, + "grad_norm": 0.08929844200611115, "learning_rate": 8.814814814814817e-06, - "loss": 0.3639, + "loss": 0.0452, "step": 2170 }, { "epoch": 30.004761904761907, - "grad_norm": 109.57473754882812, + "grad_norm": 0.004775241948664188, "learning_rate": 8.804232804232805e-06, - "loss": 0.3092, + "loss": 0.0145, "step": 2180 }, { "epoch": 30.005714285714287, - "grad_norm": 0.307584673166275, + "grad_norm": 0.00836123526096344, "learning_rate": 8.793650793650795e-06, - "loss": 0.6693, + "loss": 0.0796, "step": 2190 }, { "epoch": 30.006666666666668, - "grad_norm": 106.67752075195312, + "grad_norm": 0.0014215363189578056, "learning_rate": 8.783068783068783e-06, - "loss": 0.1906, + "loss": 0.0018, "step": 2200 }, { "epoch": 30.006761904761905, - "eval_accuracy": 0.6891891891891891, - "eval_loss": 0.984470784664154, - "eval_runtime": 9.9368, - "eval_samples_per_second": 7.447, - "eval_steps_per_second": 1.912, + "eval_accuracy": 0.6486486486486487, + "eval_loss": 2.5244226455688477, + "eval_runtime": 15.1839, + "eval_samples_per_second": 4.874, + "eval_steps_per_second": 1.251, "step": 2201 }, { "epoch": 31.000857142857143, - "grad_norm": 0.7306738495826721, + "grad_norm": 0.007338838651776314, "learning_rate": 8.772486772486773e-06, - "loss": 0.0122, + "loss": 0.0004, "step": 2210 }, { "epoch": 31.001809523809523, - "grad_norm": 183.5074462890625, + "grad_norm": 0.010591942816972733, "learning_rate": 8.761904761904763e-06, - "loss": 0.4206, + "loss": 0.0001, "step": 2220 }, { "epoch": 31.002761904761904, - "grad_norm": 51.4791145324707, + "grad_norm": 0.034093666821718216, "learning_rate": 8.751322751322751e-06, - "loss": 0.7487, + "loss": 0.0002, "step": 2230 }, { "epoch": 31.003714285714285, - "grad_norm": 87.89187622070312, + "grad_norm": 0.02081795036792755, "learning_rate": 8.740740740740741e-06, - "loss": 0.3637, + "loss": 0.0001, "step": 2240 }, { "epoch": 31.004666666666665, - "grad_norm": 23.6366024017334, + "grad_norm": 1.6572245359420776, "learning_rate": 8.730158730158731e-06, - "loss": 0.5004, + "loss": 0.1236, "step": 2250 }, { "epoch": 31.005619047619046, - "grad_norm": 0.5798986554145813, + "grad_norm": 0.013071013614535332, "learning_rate": 8.71957671957672e-06, - "loss": 0.2826, + "loss": 0.1374, "step": 2260 }, { "epoch": 31.00657142857143, - "grad_norm": 0.09235603362321854, + "grad_norm": 0.10946688055992126, "learning_rate": 8.70899470899471e-06, - "loss": 0.0161, + "loss": 0.1072, "step": 2270 }, { "epoch": 31.006761904761905, - "eval_accuracy": 0.7027027027027027, - "eval_loss": 1.07892906665802, - "eval_runtime": 9.7772, - "eval_samples_per_second": 7.569, - "eval_steps_per_second": 1.943, + "eval_accuracy": 0.6486486486486487, + "eval_loss": 2.8444180488586426, + "eval_runtime": 15.0199, + "eval_samples_per_second": 4.927, + "eval_steps_per_second": 1.265, "step": 2272 }, { "epoch": 32.0007619047619, - "grad_norm": 95.19580078125, + "grad_norm": 0.8098803758621216, "learning_rate": 8.6984126984127e-06, - "loss": 0.177, + "loss": 0.1896, "step": 2280 }, { "epoch": 32.001714285714286, - "grad_norm": 0.20149828493595123, + "grad_norm": 0.003531635971739888, "learning_rate": 8.687830687830688e-06, - "loss": 0.3689, + "loss": 0.1047, "step": 2290 }, { "epoch": 32.00266666666667, - "grad_norm": 74.12393188476562, + "grad_norm": 0.003388685407117009, "learning_rate": 8.677248677248678e-06, - "loss": 0.2379, + "loss": 0.0001, "step": 2300 }, { "epoch": 32.00361904761905, - "grad_norm": 77.6581802368164, + "grad_norm": 0.0055401683785021305, "learning_rate": 8.666666666666668e-06, - "loss": 0.672, + "loss": 0.0003, "step": 2310 }, { "epoch": 32.00457142857143, - "grad_norm": 0.2690005898475647, + "grad_norm": 0.019029097631573677, "learning_rate": 8.656084656084656e-06, - "loss": 0.0458, + "loss": 0.0004, "step": 2320 }, { "epoch": 32.00552380952381, - "grad_norm": 291.5118103027344, + "grad_norm": 0.0014148615300655365, "learning_rate": 8.645502645502646e-06, - "loss": 0.2605, + "loss": 0.0041, "step": 2330 }, { "epoch": 32.00647619047619, - "grad_norm": 120.34475708007812, + "grad_norm": 0.005223044194281101, "learning_rate": 8.634920634920636e-06, - "loss": 0.5682, + "loss": 0.0664, "step": 2340 }, { "epoch": 32.0067619047619, - "eval_accuracy": 0.7162162162162162, - "eval_loss": 1.256813883781433, - "eval_runtime": 10.0534, - "eval_samples_per_second": 7.361, - "eval_steps_per_second": 1.89, + "eval_accuracy": 0.7297297297297297, + "eval_loss": 1.8276509046554565, + "eval_runtime": 16.0726, + "eval_samples_per_second": 4.604, + "eval_steps_per_second": 1.182, "step": 2343 }, { "epoch": 33.00066666666667, - "grad_norm": 0.04087988659739494, + "grad_norm": 0.0018431965727359056, "learning_rate": 8.624338624338624e-06, - "loss": 0.2199, + "loss": 0.026, "step": 2350 }, { "epoch": 33.001619047619045, - "grad_norm": 0.2287987768650055, + "grad_norm": 0.003196166828274727, "learning_rate": 8.613756613756614e-06, - "loss": 0.214, + "loss": 0.0002, "step": 2360 }, { "epoch": 33.00257142857143, - "grad_norm": 0.6162062883377075, + "grad_norm": 0.008334203623235226, "learning_rate": 8.603174603174604e-06, - "loss": 0.1188, + "loss": 0.0006, "step": 2370 }, { "epoch": 33.00352380952381, - "grad_norm": 0.09394045174121857, + "grad_norm": 0.0023803089279681444, "learning_rate": 8.592592592592593e-06, - "loss": 0.3031, + "loss": 0.0003, "step": 2380 }, { "epoch": 33.00447619047619, - "grad_norm": 1.0619105100631714, + "grad_norm": 365.64215087890625, "learning_rate": 8.582010582010583e-06, - "loss": 0.1606, + "loss": 0.1007, "step": 2390 }, { "epoch": 33.005428571428574, - "grad_norm": 41.31269073486328, + "grad_norm": 0.0017724215285852551, "learning_rate": 8.571428571428571e-06, - "loss": 0.2138, + "loss": 0.1023, "step": 2400 }, { "epoch": 33.00638095238095, - "grad_norm": 0.06278371065855026, + "grad_norm": 0.002012843731790781, "learning_rate": 8.560846560846563e-06, - "loss": 0.1105, + "loss": 0.0122, "step": 2410 }, { "epoch": 33.0067619047619, - "eval_accuracy": 0.7432432432432432, - "eval_loss": 1.0928676128387451, - "eval_runtime": 10.1179, - "eval_samples_per_second": 7.314, - "eval_steps_per_second": 1.878, + "eval_accuracy": 0.7297297297297297, + "eval_loss": 2.1148488521575928, + "eval_runtime": 17.6577, + "eval_samples_per_second": 4.191, + "eval_steps_per_second": 1.076, "step": 2414 }, { "epoch": 34.000571428571426, - "grad_norm": 1.8994898796081543, + "grad_norm": 0.0028000962920486927, "learning_rate": 8.550264550264551e-06, - "loss": 0.4501, + "loss": 0.0001, "step": 2420 }, { "epoch": 34.00152380952381, - "grad_norm": 0.023029064759612083, + "grad_norm": 0.00524574751034379, "learning_rate": 8.53968253968254e-06, - "loss": 0.4373, + "loss": 0.2539, "step": 2430 }, { "epoch": 34.00247619047619, - "grad_norm": 14.903653144836426, + "grad_norm": 0.0048956056125462055, "learning_rate": 8.529100529100531e-06, - "loss": 0.2641, + "loss": 0.1583, "step": 2440 }, { "epoch": 34.00342857142857, - "grad_norm": 0.10007156431674957, + "grad_norm": 0.0051225321367383, "learning_rate": 8.518518518518519e-06, - "loss": 0.4035, + "loss": 0.4901, "step": 2450 }, { "epoch": 34.004380952380956, - "grad_norm": 0.1376352608203888, + "grad_norm": 2.2200734615325928, "learning_rate": 8.507936507936509e-06, - "loss": 0.394, + "loss": 0.0005, "step": 2460 }, { "epoch": 34.00533333333333, - "grad_norm": 0.9594055414199829, + "grad_norm": 0.007252862676978111, "learning_rate": 8.497354497354499e-06, - "loss": 0.4471, + "loss": 0.1284, "step": 2470 }, { "epoch": 34.00628571428572, - "grad_norm": 39.818092346191406, + "grad_norm": 6.2736735343933105, "learning_rate": 8.486772486772487e-06, - "loss": 0.1818, + "loss": 0.1118, "step": 2480 }, { "epoch": 34.0067619047619, - "eval_accuracy": 0.7027027027027027, - "eval_loss": 1.191694736480713, - "eval_runtime": 10.5117, - "eval_samples_per_second": 7.04, - "eval_steps_per_second": 1.808, + "eval_accuracy": 0.7702702702702703, + "eval_loss": 1.5536247491836548, + "eval_runtime": 15.8753, + "eval_samples_per_second": 4.661, + "eval_steps_per_second": 1.197, "step": 2485 }, { "epoch": 35.00047619047619, - "grad_norm": 0.7797194719314575, + "grad_norm": 0.0035065035335719585, "learning_rate": 8.476190476190477e-06, - "loss": 0.1458, + "loss": 0.0003, "step": 2490 }, { "epoch": 35.00142857142857, - "grad_norm": 0.035457540303468704, + "grad_norm": 0.0031883029732853174, "learning_rate": 8.465608465608466e-06, - "loss": 0.182, + "loss": 0.0003, "step": 2500 }, { "epoch": 35.00238095238095, - "grad_norm": 43.21303939819336, + "grad_norm": 1.8205444812774658, "learning_rate": 8.455026455026456e-06, - "loss": 0.3509, + "loss": 0.0798, "step": 2510 }, { "epoch": 35.00333333333333, - "grad_norm": 0.02950003370642662, + "grad_norm": 0.006734704598784447, "learning_rate": 8.444444444444446e-06, - "loss": 0.1202, + "loss": 0.0001, "step": 2520 }, { "epoch": 35.004285714285714, - "grad_norm": 18.863386154174805, + "grad_norm": 0.004560893401503563, "learning_rate": 8.433862433862434e-06, - "loss": 0.0962, + "loss": 0.0127, "step": 2530 }, { "epoch": 35.0052380952381, - "grad_norm": 15.966325759887695, + "grad_norm": 0.012678610160946846, "learning_rate": 8.423280423280424e-06, - "loss": 0.0489, + "loss": 0.0024, "step": 2540 }, { "epoch": 35.006190476190476, - "grad_norm": 345.1724853515625, + "grad_norm": 62.18834686279297, "learning_rate": 8.412698412698414e-06, - "loss": 0.5396, + "loss": 0.1987, "step": 2550 }, { "epoch": 35.0067619047619, - "eval_accuracy": 0.6891891891891891, - "eval_loss": 1.4709502458572388, - "eval_runtime": 9.8734, - "eval_samples_per_second": 7.495, - "eval_steps_per_second": 1.924, + "eval_accuracy": 0.7027027027027027, + "eval_loss": 2.292334794998169, + "eval_runtime": 16.0954, + "eval_samples_per_second": 4.598, + "eval_steps_per_second": 1.18, "step": 2556 }, { "epoch": 36.00038095238095, - "grad_norm": 0.04642634466290474, + "grad_norm": 0.032790932804346085, "learning_rate": 8.402116402116402e-06, - "loss": 0.2865, + "loss": 0.0001, "step": 2560 }, { "epoch": 36.001333333333335, - "grad_norm": 21.857728958129883, + "grad_norm": 0.0441834032535553, "learning_rate": 8.391534391534392e-06, - "loss": 0.2618, + "loss": 0.2269, "step": 2570 }, { "epoch": 36.00228571428571, - "grad_norm": 119.08516693115234, + "grad_norm": 0.006388077512383461, "learning_rate": 8.380952380952382e-06, - "loss": 0.1449, + "loss": 0.0498, "step": 2580 }, { "epoch": 36.003238095238096, - "grad_norm": 0.02755267545580864, + "grad_norm": 0.02494201622903347, "learning_rate": 8.37037037037037e-06, - "loss": 0.0028, + "loss": 0.0001, "step": 2590 }, { "epoch": 36.00419047619047, - "grad_norm": 0.3885904848575592, + "grad_norm": 0.08396976441144943, "learning_rate": 8.35978835978836e-06, - "loss": 0.288, + "loss": 0.0001, "step": 2600 }, { "epoch": 36.00514285714286, - "grad_norm": 0.25101497769355774, + "grad_norm": 0.0030845170840620995, "learning_rate": 8.34920634920635e-06, - "loss": 0.1388, + "loss": 0.0001, "step": 2610 }, { "epoch": 36.00609523809524, - "grad_norm": 236.27584838867188, + "grad_norm": 0.7375411987304688, "learning_rate": 8.338624338624339e-06, - "loss": 0.0868, + "loss": 0.0012, "step": 2620 }, { "epoch": 36.0067619047619, - "eval_accuracy": 0.7297297297297297, - "eval_loss": 1.5798817873001099, - "eval_runtime": 9.9958, - "eval_samples_per_second": 7.403, - "eval_steps_per_second": 1.901, + "eval_accuracy": 0.6621621621621622, + "eval_loss": 2.6784675121307373, + "eval_runtime": 15.9991, + "eval_samples_per_second": 4.625, + "eval_steps_per_second": 1.188, "step": 2627 }, { "epoch": 37.00028571428572, - "grad_norm": 149.51145935058594, + "grad_norm": 0.00671563483774662, "learning_rate": 8.328042328042329e-06, - "loss": 0.3526, + "loss": 0.0001, "step": 2630 }, { "epoch": 37.001238095238094, - "grad_norm": 109.33985900878906, + "grad_norm": 0.030820587649941444, "learning_rate": 8.317460317460319e-06, - "loss": 0.3271, + "loss": 0.1668, "step": 2640 }, { "epoch": 37.00219047619048, - "grad_norm": 0.4006137251853943, + "grad_norm": 0.0018743366235867143, "learning_rate": 8.306878306878307e-06, - "loss": 0.083, + "loss": 0.0001, "step": 2650 }, { "epoch": 37.003142857142855, - "grad_norm": 0.00958289299160242, + "grad_norm": 0.0010029770201072097, "learning_rate": 8.296296296296297e-06, - "loss": 0.0701, + "loss": 0.0001, "step": 2660 }, { "epoch": 37.00409523809524, - "grad_norm": 0.18436852097511292, + "grad_norm": 0.0015597708988934755, "learning_rate": 8.285714285714287e-06, - "loss": 0.1158, + "loss": 0.0001, "step": 2670 }, { "epoch": 37.005047619047616, - "grad_norm": 0.16782556474208832, + "grad_norm": 0.001962661510333419, "learning_rate": 8.275132275132275e-06, - "loss": 0.304, + "loss": 0.0001, "step": 2680 }, { "epoch": 37.006, - "grad_norm": 2.0431201457977295, + "grad_norm": 0.025595329701900482, "learning_rate": 8.264550264550265e-06, - "loss": 0.2748, + "loss": 0.0027, "step": 2690 }, { "epoch": 37.0067619047619, - "eval_accuracy": 0.6891891891891891, - "eval_loss": 1.3386650085449219, - "eval_runtime": 9.9941, - "eval_samples_per_second": 7.404, - "eval_steps_per_second": 1.901, + "eval_accuracy": 0.6756756756756757, + "eval_loss": 2.240028142929077, + "eval_runtime": 16.2638, + "eval_samples_per_second": 4.55, + "eval_steps_per_second": 1.168, "step": 2698 }, { "epoch": 38.000190476190475, - "grad_norm": 0.025548648089170456, + "grad_norm": 0.0010542930103838444, "learning_rate": 8.253968253968254e-06, - "loss": 0.7404, + "loss": 0.0012, "step": 2700 }, { "epoch": 38.00114285714286, - "grad_norm": 110.7083511352539, + "grad_norm": 0.0016928648110479116, "learning_rate": 8.243386243386245e-06, - "loss": 0.4535, + "loss": 0.0522, "step": 2710 }, { "epoch": 38.00209523809524, - "grad_norm": 0.5688247084617615, + "grad_norm": 129.03274536132812, "learning_rate": 8.232804232804234e-06, - "loss": 0.1087, + "loss": 0.1343, "step": 2720 }, { "epoch": 38.00304761904762, - "grad_norm": 0.12118399143218994, + "grad_norm": 0.1395167112350464, "learning_rate": 8.222222222222222e-06, - "loss": 0.0061, + "loss": 0.0005, "step": 2730 }, { "epoch": 38.004, - "grad_norm": 104.44693756103516, + "grad_norm": 0.011270579881966114, "learning_rate": 8.211640211640213e-06, - "loss": 0.0139, + "loss": 0.1077, "step": 2740 }, { "epoch": 38.00495238095238, - "grad_norm": 20.01863670349121, + "grad_norm": 0.1653362661600113, "learning_rate": 8.201058201058202e-06, - "loss": 0.4976, + "loss": 0.1599, "step": 2750 }, { "epoch": 38.00590476190476, - "grad_norm": 6.175614356994629, + "grad_norm": 0.005356424022465944, "learning_rate": 8.190476190476192e-06, - "loss": 0.1488, + "loss": 0.0002, "step": 2760 }, { "epoch": 38.0067619047619, - "eval_accuracy": 0.6891891891891891, - "eval_loss": 1.4293575286865234, - "eval_runtime": 10.1151, - "eval_samples_per_second": 7.316, - "eval_steps_per_second": 1.878, + "eval_accuracy": 0.7162162162162162, + "eval_loss": 2.2459213733673096, + "eval_runtime": 16.4119, + "eval_samples_per_second": 4.509, + "eval_steps_per_second": 1.158, "step": 2769 }, { "epoch": 39.00009523809524, - "grad_norm": 0.256195604801178, + "grad_norm": 0.0011994903907179832, "learning_rate": 8.179894179894182e-06, - "loss": 0.4303, + "loss": 0.4059, "step": 2770 }, { "epoch": 39.00104761904762, - "grad_norm": 0.015647174790501595, + "grad_norm": 135.89295959472656, "learning_rate": 8.16931216931217e-06, - "loss": 0.0622, + "loss": 0.0199, "step": 2780 }, { "epoch": 39.002, - "grad_norm": 4.312227249145508, + "grad_norm": 0.005463339388370514, "learning_rate": 8.15873015873016e-06, - "loss": 0.3907, + "loss": 0.0001, "step": 2790 }, { "epoch": 39.00295238095238, - "grad_norm": 0.038897234946489334, + "grad_norm": 0.002280671149492264, "learning_rate": 8.148148148148148e-06, - "loss": 0.1975, + "loss": 0.0124, "step": 2800 }, { "epoch": 39.003904761904764, - "grad_norm": 0.2860523462295532, + "grad_norm": 0.0023753687273710966, "learning_rate": 8.137566137566138e-06, - "loss": 0.4374, + "loss": 0.0001, "step": 2810 }, { "epoch": 39.00485714285714, - "grad_norm": 93.7807846069336, + "grad_norm": 0.05094405636191368, "learning_rate": 8.126984126984128e-06, - "loss": 0.3152, + "loss": 0.0002, "step": 2820 }, { "epoch": 39.005809523809525, - "grad_norm": 0.01898103393614292, + "grad_norm": 0.003550964640453458, "learning_rate": 8.116402116402117e-06, - "loss": 0.0048, + "loss": 0.0936, "step": 2830 }, { "epoch": 39.0067619047619, - "grad_norm": 0.034995187073946, + "grad_norm": 0.3809496760368347, "learning_rate": 8.105820105820107e-06, - "loss": 0.3124, + "loss": 0.0099, "step": 2840 }, { "epoch": 39.0067619047619, - "eval_accuracy": 0.7027027027027027, - "eval_loss": 1.1472700834274292, - "eval_runtime": 9.9063, - "eval_samples_per_second": 7.47, - "eval_steps_per_second": 1.918, + "eval_accuracy": 0.6621621621621622, + "eval_loss": 2.360102415084839, + "eval_runtime": 182.5763, + "eval_samples_per_second": 0.405, + "eval_steps_per_second": 0.104, "step": 2840 }, { "epoch": 40.000952380952384, - "grad_norm": 0.012256530113518238, + "grad_norm": 0.0014301723567768931, "learning_rate": 8.095238095238097e-06, - "loss": 0.3209, + "loss": 0.0041, "step": 2850 }, { "epoch": 40.00190476190476, - "grad_norm": 0.023209771141409874, + "grad_norm": 0.005376802291721106, "learning_rate": 8.084656084656085e-06, - "loss": 0.0029, + "loss": 0.0001, "step": 2860 }, { "epoch": 40.002857142857145, - "grad_norm": 0.026608197018504143, + "grad_norm": 315.20269775390625, "learning_rate": 8.074074074074075e-06, - "loss": 0.0013, + "loss": 0.0808, "step": 2870 }, { "epoch": 40.00380952380952, - "grad_norm": 27.18771743774414, + "grad_norm": 0.033404335379600525, "learning_rate": 8.063492063492065e-06, - "loss": 0.1395, + "loss": 0.0503, "step": 2880 }, { "epoch": 40.00476190476191, - "grad_norm": 0.48340219259262085, + "grad_norm": 0.002346677239984274, "learning_rate": 8.052910052910053e-06, - "loss": 0.0304, + "loss": 0.0001, "step": 2890 }, { "epoch": 40.005714285714284, - "grad_norm": 547.4931030273438, + "grad_norm": 0.004127295687794685, "learning_rate": 8.042328042328043e-06, - "loss": 0.1179, + "loss": 0.0002, "step": 2900 }, { "epoch": 40.00666666666667, - "grad_norm": 890.5328369140625, + "grad_norm": 0.0013346931664273143, "learning_rate": 8.031746031746033e-06, - "loss": 0.1499, + "loss": 0.0071, "step": 2910 }, { "epoch": 40.0067619047619, - "eval_accuracy": 0.6756756756756757, - "eval_loss": 1.8165091276168823, - "eval_runtime": 10.357, - "eval_samples_per_second": 7.145, - "eval_steps_per_second": 1.835, + "eval_accuracy": 0.7297297297297297, + "eval_loss": 2.0561344623565674, + "eval_runtime": 16.8361, + "eval_samples_per_second": 4.395, + "eval_steps_per_second": 1.129, "step": 2911 }, { "epoch": 41.00085714285714, - "grad_norm": 0.045793041586875916, + "grad_norm": 0.0015484824543818831, "learning_rate": 8.021164021164021e-06, - "loss": 0.0008, + "loss": 0.0, "step": 2920 }, { "epoch": 41.00180952380953, - "grad_norm": 0.022662021219730377, + "grad_norm": 0.0011268400121480227, "learning_rate": 8.010582010582011e-06, - "loss": 0.2136, + "loss": 0.0302, "step": 2930 }, { "epoch": 41.002761904761904, - "grad_norm": 76.39334106445312, + "grad_norm": 0.002251312369480729, "learning_rate": 8.000000000000001e-06, - "loss": 0.1487, + "loss": 0.2746, "step": 2940 }, { "epoch": 41.00371428571429, - "grad_norm": 0.06917908787727356, + "grad_norm": 0.0010610901517793536, "learning_rate": 7.98941798941799e-06, - "loss": 0.2659, + "loss": 0.2491, "step": 2950 }, { "epoch": 41.004666666666665, - "grad_norm": 0.0647149607539177, + "grad_norm": 0.003384027164429426, "learning_rate": 7.97883597883598e-06, - "loss": 0.2078, + "loss": 0.1268, "step": 2960 }, { "epoch": 41.00561904761905, - "grad_norm": 0.09832796454429626, + "grad_norm": 0.01891510747373104, "learning_rate": 7.968253968253968e-06, - "loss": 0.1764, + "loss": 0.0234, "step": 2970 }, { "epoch": 41.00657142857143, - "grad_norm": 4.377299785614014, + "grad_norm": 48.88753128051758, "learning_rate": 7.957671957671958e-06, - "loss": 0.3149, + "loss": 0.0086, "step": 2980 }, { "epoch": 41.0067619047619, - "eval_accuracy": 0.6351351351351351, - "eval_loss": 2.090315580368042, - "eval_runtime": 10.333, - "eval_samples_per_second": 7.162, - "eval_steps_per_second": 1.839, + "eval_accuracy": 0.7027027027027027, + "eval_loss": 2.189833641052246, + "eval_runtime": 18.7226, + "eval_samples_per_second": 3.952, + "eval_steps_per_second": 1.015, "step": 2982 }, { "epoch": 42.0007619047619, - "grad_norm": 49.249637603759766, + "grad_norm": 0.0049520800821483135, "learning_rate": 7.947089947089948e-06, - "loss": 0.6223, + "loss": 0.2009, "step": 2990 }, { "epoch": 42.001714285714286, - "grad_norm": 29.288423538208008, + "grad_norm": 0.0075964052230119705, "learning_rate": 7.936507936507936e-06, - "loss": 0.1668, + "loss": 0.0002, "step": 3000 }, { "epoch": 42.00266666666667, - "grad_norm": 0.13853886723518372, + "grad_norm": 0.172276109457016, "learning_rate": 7.925925925925926e-06, - "loss": 0.1814, + "loss": 0.0001, "step": 3010 }, { "epoch": 42.00361904761905, - "grad_norm": 0.2841092646121979, + "grad_norm": 112.24790954589844, "learning_rate": 7.915343915343916e-06, - "loss": 0.0572, + "loss": 0.251, "step": 3020 }, { "epoch": 42.00457142857143, - "grad_norm": 0.009770851582288742, + "grad_norm": 0.9104766845703125, "learning_rate": 7.904761904761904e-06, - "loss": 0.2405, + "loss": 0.0002, "step": 3030 }, { "epoch": 42.00552380952381, - "grad_norm": 0.00862161722034216, + "grad_norm": 0.05150100961327553, "learning_rate": 7.894179894179896e-06, - "loss": 0.1012, + "loss": 0.0002, "step": 3040 }, { "epoch": 42.00647619047619, - "grad_norm": 230.69879150390625, + "grad_norm": 0.032936934381723404, "learning_rate": 7.883597883597884e-06, - "loss": 0.02, + "loss": 0.0131, "step": 3050 }, { "epoch": 42.0067619047619, - "eval_accuracy": 0.7027027027027027, - "eval_loss": 1.9185028076171875, - "eval_runtime": 10.6929, - "eval_samples_per_second": 6.92, - "eval_steps_per_second": 1.777, + "eval_accuracy": 0.6756756756756757, + "eval_loss": 2.6086132526397705, + "eval_runtime": 38.9907, + "eval_samples_per_second": 1.898, + "eval_steps_per_second": 0.487, "step": 3053 }, { "epoch": 43.00066666666667, - "grad_norm": 0.05728636309504509, + "grad_norm": 0.007421289570629597, "learning_rate": 7.873015873015873e-06, - "loss": 0.415, + "loss": 0.0003, "step": 3060 }, { "epoch": 43.001619047619045, - "grad_norm": 2.755094051361084, + "grad_norm": 0.005742072127759457, "learning_rate": 7.862433862433863e-06, - "loss": 0.01, + "loss": 0.0946, "step": 3070 }, { "epoch": 43.00257142857143, - "grad_norm": 0.022400056943297386, + "grad_norm": 0.00301670515909791, "learning_rate": 7.851851851851853e-06, - "loss": 0.0853, + "loss": 0.0103, "step": 3080 }, { "epoch": 43.00352380952381, - "grad_norm": 0.040767963975667953, + "grad_norm": 62.63003158569336, "learning_rate": 7.841269841269843e-06, - "loss": 0.0153, + "loss": 0.2432, "step": 3090 }, { "epoch": 43.00447619047619, - "grad_norm": 0.006976688280701637, + "grad_norm": 0.16569894552230835, "learning_rate": 7.830687830687831e-06, - "loss": 0.2631, + "loss": 0.0008, "step": 3100 }, { "epoch": 43.005428571428574, - "grad_norm": 0.025811027735471725, + "grad_norm": 0.008355499245226383, "learning_rate": 7.820105820105821e-06, - "loss": 0.0004, + "loss": 0.0087, "step": 3110 }, { "epoch": 43.00638095238095, - "grad_norm": 0.05052373185753822, + "grad_norm": 0.015924755483865738, "learning_rate": 7.809523809523811e-06, - "loss": 0.0852, + "loss": 0.0002, "step": 3120 }, { "epoch": 43.0067619047619, "eval_accuracy": 0.6891891891891891, - "eval_loss": 1.449135422706604, - "eval_runtime": 10.0453, - "eval_samples_per_second": 7.367, - "eval_steps_per_second": 1.891, + "eval_loss": 2.140007257461548, + "eval_runtime": 17.6835, + "eval_samples_per_second": 4.185, + "eval_steps_per_second": 1.074, "step": 3124 }, { "epoch": 44.000571428571426, - "grad_norm": 0.010279198177158833, + "grad_norm": 0.0032034190371632576, "learning_rate": 7.7989417989418e-06, - "loss": 0.0004, + "loss": 0.0001, "step": 3130 }, { "epoch": 44.00152380952381, - "grad_norm": 0.010263322852551937, + "grad_norm": 0.0018916918197646737, "learning_rate": 7.78835978835979e-06, - "loss": 0.0005, + "loss": 0.0001, "step": 3140 }, { "epoch": 44.00247619047619, - "grad_norm": 0.21711310744285583, + "grad_norm": 0.0014891589526087046, "learning_rate": 7.77777777777778e-06, - "loss": 0.0005, + "loss": 0.002, "step": 3150 }, { "epoch": 44.00342857142857, - "grad_norm": 56.55171585083008, + "grad_norm": 0.12912853062152863, "learning_rate": 7.767195767195767e-06, - "loss": 0.4343, + "loss": 0.0001, "step": 3160 }, { "epoch": 44.004380952380956, - "grad_norm": 0.00604694988578558, + "grad_norm": 0.0015116170980036259, "learning_rate": 7.756613756613757e-06, - "loss": 0.0372, + "loss": 0.0001, "step": 3170 }, { "epoch": 44.00533333333333, - "grad_norm": 0.013936587609350681, + "grad_norm": 0.0008536073728464544, "learning_rate": 7.746031746031747e-06, - "loss": 0.084, + "loss": 0.0001, "step": 3180 }, { "epoch": 44.00628571428572, - "grad_norm": 0.36712291836738586, + "grad_norm": 0.003453182987868786, "learning_rate": 7.735449735449736e-06, - "loss": 0.0115, + "loss": 0.0001, "step": 3190 }, { "epoch": 44.0067619047619, - "eval_accuracy": 0.7297297297297297, - "eval_loss": 1.6180015802383423, - "eval_runtime": 13.1268, - "eval_samples_per_second": 5.637, - "eval_steps_per_second": 1.447, + "eval_accuracy": 0.7027027027027027, + "eval_loss": 2.260770320892334, + "eval_runtime": 17.2523, + "eval_samples_per_second": 4.289, + "eval_steps_per_second": 1.101, "step": 3195 }, { "epoch": 45.00047619047619, - "grad_norm": 0.03513185679912567, + "grad_norm": 0.008949621580541134, "learning_rate": 7.724867724867726e-06, - "loss": 0.0029, + "loss": 0.0001, "step": 3200 }, { "epoch": 45.00142857142857, - "grad_norm": 0.012012271210551262, + "grad_norm": 0.007500027772039175, "learning_rate": 7.714285714285716e-06, - "loss": 0.0019, + "loss": 0.0655, "step": 3210 }, { "epoch": 45.00238095238095, - "grad_norm": 0.013395571149885654, + "grad_norm": 0.0022459605243057013, "learning_rate": 7.703703703703704e-06, - "loss": 0.0026, + "loss": 0.0006, "step": 3220 }, { "epoch": 45.00333333333333, - "grad_norm": 5.026131629943848, + "grad_norm": 0.002970959758386016, "learning_rate": 7.693121693121694e-06, - "loss": 0.0607, + "loss": 0.0, "step": 3230 }, { "epoch": 45.004285714285714, - "grad_norm": 21.899553298950195, + "grad_norm": 0.0018117021536454558, "learning_rate": 7.682539682539684e-06, - "loss": 0.0012, + "loss": 0.0004, "step": 3240 }, { "epoch": 45.0052380952381, - "grad_norm": 0.014592028222978115, + "grad_norm": 0.01580795831978321, "learning_rate": 7.671957671957672e-06, - "loss": 0.229, + "loss": 0.0001, "step": 3250 }, { "epoch": 45.006190476190476, - "grad_norm": 0.020671457052230835, + "grad_norm": 0.0050562042742967606, "learning_rate": 7.661375661375662e-06, - "loss": 0.5243, + "loss": 0.0549, "step": 3260 }, { "epoch": 45.0067619047619, - "eval_accuracy": 0.6891891891891891, - "eval_loss": 1.8516004085540771, - "eval_runtime": 12.3517, - "eval_samples_per_second": 5.991, - "eval_steps_per_second": 1.538, + "eval_accuracy": 0.7432432432432432, + "eval_loss": 2.012903928756714, + "eval_runtime": 20.0668, + "eval_samples_per_second": 3.688, + "eval_steps_per_second": 0.947, "step": 3266 }, { "epoch": 46.00038095238095, - "grad_norm": 0.011222787201404572, + "grad_norm": 0.0019852747209370136, "learning_rate": 7.65079365079365e-06, - "loss": 0.0026, + "loss": 0.0001, "step": 3270 }, { "epoch": 46.001333333333335, - "grad_norm": 204.4291534423828, + "grad_norm": 0.0013264709850773215, "learning_rate": 7.64021164021164e-06, - "loss": 0.8472, + "loss": 0.2312, "step": 3280 }, { "epoch": 46.00228571428571, - "grad_norm": 0.008574184030294418, + "grad_norm": 0.0050531113520264626, "learning_rate": 7.62962962962963e-06, - "loss": 0.0814, + "loss": 0.0002, "step": 3290 }, { "epoch": 46.003238095238096, - "grad_norm": 1.3483171463012695, + "grad_norm": 0.002699656877666712, "learning_rate": 7.61904761904762e-06, - "loss": 0.1564, + "loss": 0.0001, "step": 3300 }, { "epoch": 46.00419047619047, - "grad_norm": 0.007891189306974411, + "grad_norm": 0.0009981177281588316, "learning_rate": 7.60846560846561e-06, - "loss": 0.0003, + "loss": 0.0001, "step": 3310 }, { "epoch": 46.00514285714286, - "grad_norm": 0.06582821905612946, + "grad_norm": 0.009648758918046951, "learning_rate": 7.597883597883599e-06, - "loss": 0.0033, + "loss": 0.0, "step": 3320 }, { "epoch": 46.00609523809524, - "grad_norm": 18.656375885009766, + "grad_norm": 0.004754742607474327, "learning_rate": 7.587301587301588e-06, - "loss": 0.0658, + "loss": 0.0, "step": 3330 }, { "epoch": 46.0067619047619, - "eval_accuracy": 0.7027027027027027, - "eval_loss": 1.6331024169921875, - "eval_runtime": 12.5497, - "eval_samples_per_second": 5.897, - "eval_steps_per_second": 1.514, + "eval_accuracy": 0.7297297297297297, + "eval_loss": 2.001845598220825, + "eval_runtime": 15.5897, + "eval_samples_per_second": 4.747, + "eval_steps_per_second": 1.219, "step": 3337 }, { "epoch": 47.00028571428572, - "grad_norm": 0.9878658652305603, + "grad_norm": 0.018193760886788368, "learning_rate": 7.576719576719578e-06, - "loss": 0.0082, + "loss": 0.0, "step": 3340 }, { "epoch": 47.001238095238094, - "grad_norm": 0.0038536698557436466, + "grad_norm": 0.0028342902660369873, "learning_rate": 7.566137566137567e-06, - "loss": 0.2541, + "loss": 0.0001, "step": 3350 }, { "epoch": 47.00219047619048, - "grad_norm": 28.105022430419922, + "grad_norm": 0.001791008049622178, "learning_rate": 7.555555555555556e-06, - "loss": 0.1217, + "loss": 0.0, "step": 3360 }, { "epoch": 47.003142857142855, - "grad_norm": 0.003185787471011281, + "grad_norm": 0.05041654407978058, "learning_rate": 7.544973544973545e-06, - "loss": 0.089, + "loss": 0.0833, "step": 3370 }, { "epoch": 47.00409523809524, - "grad_norm": 1.2206181287765503, + "grad_norm": 0.0015246650436893106, "learning_rate": 7.534391534391535e-06, - "loss": 0.0004, + "loss": 0.0011, "step": 3380 }, { "epoch": 47.005047619047616, - "grad_norm": 0.007042728830128908, + "grad_norm": 0.004255190957337618, "learning_rate": 7.523809523809524e-06, - "loss": 0.1116, + "loss": 0.2029, "step": 3390 }, { "epoch": 47.006, - "grad_norm": 0.008088290691375732, + "grad_norm": 0.007833059877157211, "learning_rate": 7.5132275132275136e-06, - "loss": 0.1269, + "loss": 0.0001, "step": 3400 }, { "epoch": 47.0067619047619, - "eval_accuracy": 0.6891891891891891, - "eval_loss": 2.058474063873291, - "eval_runtime": 10.498, - "eval_samples_per_second": 7.049, - "eval_steps_per_second": 1.81, + "eval_accuracy": 0.7837837837837838, + "eval_loss": 1.7209464311599731, + "eval_runtime": 16.3867, + "eval_samples_per_second": 4.516, + "eval_steps_per_second": 1.159, "step": 3408 }, { "epoch": 48.000190476190475, - "grad_norm": 0.01887154020369053, + "grad_norm": 0.0008374308235943317, "learning_rate": 7.5026455026455035e-06, - "loss": 0.2957, + "loss": 0.0001, "step": 3410 }, { "epoch": 48.00114285714286, - "grad_norm": 0.01683860458433628, + "grad_norm": 0.0011125396704301238, "learning_rate": 7.492063492063493e-06, - "loss": 0.0004, + "loss": 0.0006, "step": 3420 }, { "epoch": 48.00209523809524, - "grad_norm": 0.015385139733552933, + "grad_norm": 52.88506317138672, "learning_rate": 7.481481481481482e-06, - "loss": 0.0005, + "loss": 0.2376, "step": 3430 }, { "epoch": 48.00304761904762, - "grad_norm": 0.00849748682230711, + "grad_norm": 0.004130581393837929, "learning_rate": 7.470899470899472e-06, - "loss": 0.0003, + "loss": 0.0001, "step": 3440 }, { "epoch": 48.004, - "grad_norm": 2.85455060005188, + "grad_norm": 0.004906717222183943, "learning_rate": 7.460317460317461e-06, - "loss": 0.0036, + "loss": 0.0001, "step": 3450 }, { "epoch": 48.00495238095238, - "grad_norm": 0.7138940691947937, + "grad_norm": 0.002401293022558093, "learning_rate": 7.44973544973545e-06, - "loss": 0.1686, + "loss": 0.0001, "step": 3460 }, { "epoch": 48.00590476190476, - "grad_norm": 0.007813863456249237, + "grad_norm": 5.829977512359619, "learning_rate": 7.439153439153439e-06, - "loss": 0.2941, + "loss": 0.31, "step": 3470 }, { "epoch": 48.0067619047619, - "eval_accuracy": 0.6891891891891891, - "eval_loss": 2.1070668697357178, - "eval_runtime": 12.4233, - "eval_samples_per_second": 5.957, - "eval_steps_per_second": 1.529, + "eval_accuracy": 0.7027027027027027, + "eval_loss": 2.196157693862915, + "eval_runtime": 15.4779, + "eval_samples_per_second": 4.781, + "eval_steps_per_second": 1.228, "step": 3479 }, { "epoch": 49.00009523809524, - "grad_norm": 413.8825378417969, + "grad_norm": 500.178466796875, "learning_rate": 7.428571428571429e-06, - "loss": 0.359, + "loss": 0.4734, "step": 3480 }, { "epoch": 49.00104761904762, - "grad_norm": 0.0058743273839354515, + "grad_norm": 0.0016342108137905598, "learning_rate": 7.417989417989418e-06, - "loss": 0.0391, + "loss": 0.0098, "step": 3490 }, { "epoch": 49.002, - "grad_norm": 202.56471252441406, + "grad_norm": 0.002513843821361661, "learning_rate": 7.4074074074074075e-06, - "loss": 0.2192, + "loss": 0.0001, "step": 3500 }, { "epoch": 49.00295238095238, - "grad_norm": 0.013312368653714657, + "grad_norm": 0.00166873331181705, "learning_rate": 7.3968253968253975e-06, - "loss": 0.7709, + "loss": 0.0002, "step": 3510 }, { "epoch": 49.003904761904764, - "grad_norm": 275.42401123046875, + "grad_norm": 0.005831919610500336, "learning_rate": 7.386243386243387e-06, - "loss": 0.2979, + "loss": 0.0001, "step": 3520 }, { "epoch": 49.00485714285714, - "grad_norm": 0.018910733982920647, + "grad_norm": 0.0014722439227625728, "learning_rate": 7.375661375661376e-06, - "loss": 0.1778, + "loss": 0.149, "step": 3530 }, { "epoch": 49.005809523809525, - "grad_norm": 0.29688337445259094, + "grad_norm": 0.01229450386017561, "learning_rate": 7.3650793650793666e-06, - "loss": 0.0029, + "loss": 0.0001, "step": 3540 }, { "epoch": 49.0067619047619, - "grad_norm": 0.012256086803972721, + "grad_norm": 0.23085883259773254, "learning_rate": 7.354497354497355e-06, - "loss": 0.2149, + "loss": 0.0001, "step": 3550 }, { "epoch": 49.0067619047619, - "eval_accuracy": 0.7162162162162162, - "eval_loss": 1.4237526655197144, - "eval_runtime": 10.5745, - "eval_samples_per_second": 6.998, - "eval_steps_per_second": 1.797, + "eval_accuracy": 0.7567567567567568, + "eval_loss": 1.6649614572525024, + "eval_runtime": 16.2384, + "eval_samples_per_second": 4.557, + "eval_steps_per_second": 1.17, "step": 3550 }, { "epoch": 50.000952380952384, - "grad_norm": 0.047131139785051346, + "grad_norm": 0.011244424618780613, "learning_rate": 7.343915343915344e-06, - "loss": 0.0643, + "loss": 0.0001, "step": 3560 }, { "epoch": 50.00190476190476, - "grad_norm": 0.297776997089386, + "grad_norm": 0.006904429290443659, "learning_rate": 7.333333333333333e-06, - "loss": 0.1642, + "loss": 0.0001, "step": 3570 }, { "epoch": 50.002857142857145, - "grad_norm": 0.011480898596346378, + "grad_norm": 0.008236641064286232, "learning_rate": 7.322751322751324e-06, - "loss": 0.0013, + "loss": 0.1987, "step": 3580 }, { "epoch": 50.00380952380952, - "grad_norm": 0.005816106218844652, + "grad_norm": 0.001221312559209764, "learning_rate": 7.312169312169313e-06, - "loss": 0.1256, + "loss": 0.0, "step": 3590 }, { "epoch": 50.00476190476191, - "grad_norm": 0.009959045797586441, + "grad_norm": 0.0464463084936142, "learning_rate": 7.301587301587301e-06, - "loss": 0.0006, + "loss": 0.0001, "step": 3600 }, { "epoch": 50.005714285714284, - "grad_norm": 0.5827551484107971, + "grad_norm": 0.002336872974410653, "learning_rate": 7.291005291005292e-06, - "loss": 0.0007, + "loss": 0.0, "step": 3610 }, { "epoch": 50.00666666666667, - "grad_norm": 0.9566899538040161, + "grad_norm": 0.002747748512774706, "learning_rate": 7.280423280423281e-06, - "loss": 0.0017, + "loss": 0.0, "step": 3620 }, { "epoch": 50.0067619047619, - "eval_accuracy": 0.7297297297297297, - "eval_loss": 1.692422866821289, - "eval_runtime": 10.0896, - "eval_samples_per_second": 7.334, - "eval_steps_per_second": 1.883, + "eval_accuracy": 0.7567567567567568, + "eval_loss": 1.884304165840149, + "eval_runtime": 16.5472, + "eval_samples_per_second": 4.472, + "eval_steps_per_second": 1.148, "step": 3621 }, { "epoch": 51.00085714285714, - "grad_norm": 0.010292972438037395, + "grad_norm": 0.0009597976459190249, "learning_rate": 7.2698412698412705e-06, - "loss": 0.0978, + "loss": 0.0002, "step": 3630 }, { "epoch": 51.00180952380953, - "grad_norm": 316.67608642578125, + "grad_norm": 0.1065516397356987, "learning_rate": 7.2592592592592605e-06, - "loss": 0.1542, + "loss": 0.0001, "step": 3640 }, { "epoch": 51.002761904761904, - "grad_norm": 0.016249431297183037, + "grad_norm": 0.022926034405827522, "learning_rate": 7.24867724867725e-06, - "loss": 0.2033, + "loss": 0.0002, "step": 3650 }, { "epoch": 51.00371428571429, - "grad_norm": 0.008221075870096684, + "grad_norm": 0.001492603332735598, "learning_rate": 7.238095238095239e-06, - "loss": 0.0162, + "loss": 0.0001, "step": 3660 }, { "epoch": 51.004666666666665, - "grad_norm": 0.005327010061591864, + "grad_norm": 0.000893523043487221, "learning_rate": 7.227513227513228e-06, - "loss": 0.2039, + "loss": 0.0004, "step": 3670 }, { "epoch": 51.00561904761905, - "grad_norm": 0.1792820245027542, + "grad_norm": 0.0012513647088781, "learning_rate": 7.216931216931218e-06, - "loss": 0.1504, + "loss": 0.0005, "step": 3680 }, { "epoch": 51.00657142857143, - "grad_norm": 0.03580465912818909, + "grad_norm": 0.001891888095997274, "learning_rate": 7.206349206349207e-06, - "loss": 0.0004, + "loss": 0.0, "step": 3690 }, { "epoch": 51.0067619047619, - "eval_accuracy": 0.7162162162162162, - "eval_loss": 1.7704803943634033, - "eval_runtime": 12.4773, - "eval_samples_per_second": 5.931, - "eval_steps_per_second": 1.523, + "eval_accuracy": 0.7702702702702703, + "eval_loss": 1.9397999048233032, + "eval_runtime": 16.7047, + "eval_samples_per_second": 4.43, + "eval_steps_per_second": 1.137, "step": 3692 }, { "epoch": 52.0007619047619, - "grad_norm": 0.01219869777560234, + "grad_norm": 0.0006127876695245504, "learning_rate": 7.195767195767196e-06, - "loss": 0.0005, + "loss": 0.1558, "step": 3700 }, { "epoch": 52.001714285714286, - "grad_norm": 0.055447280406951904, + "grad_norm": 0.0021178224124014378, "learning_rate": 7.185185185185186e-06, - "loss": 0.0007, + "loss": 0.0, "step": 3710 }, { "epoch": 52.00266666666667, - "grad_norm": 59.65972900390625, + "grad_norm": 0.0013722680741921067, "learning_rate": 7.174603174603175e-06, - "loss": 0.1867, + "loss": 0.0007, "step": 3720 }, { "epoch": 52.00361904761905, - "grad_norm": 0.0034636298660188913, + "grad_norm": 0.05817762389779091, "learning_rate": 7.1640211640211644e-06, - "loss": 0.3724, + "loss": 0.0002, "step": 3730 }, { "epoch": 52.00457142857143, - "grad_norm": 0.013308661989867687, + "grad_norm": 0.0018984224880114198, "learning_rate": 7.1534391534391544e-06, - "loss": 0.0002, + "loss": 0.0, "step": 3740 }, { "epoch": 52.00552380952381, - "grad_norm": 112.25346374511719, + "grad_norm": 0.0462646409869194, "learning_rate": 7.1428571428571436e-06, - "loss": 0.2545, + "loss": 0.2027, "step": 3750 }, { "epoch": 52.00647619047619, - "grad_norm": 0.43491536378860474, + "grad_norm": 0.0014900796813890338, "learning_rate": 7.132275132275133e-06, - "loss": 0.6701, + "loss": 0.0, "step": 3760 }, { "epoch": 52.0067619047619, - "eval_accuracy": 0.6891891891891891, - "eval_loss": 2.167912721633911, - "eval_runtime": 13.0595, - "eval_samples_per_second": 5.666, - "eval_steps_per_second": 1.455, + "eval_accuracy": 0.7567567567567568, + "eval_loss": 1.7851362228393555, + "eval_runtime": 16.9023, + "eval_samples_per_second": 4.378, + "eval_steps_per_second": 1.124, "step": 3763 }, { "epoch": 53.00066666666667, - "grad_norm": 397.1830749511719, + "grad_norm": 0.005552320275455713, "learning_rate": 7.121693121693122e-06, - "loss": 0.1435, + "loss": 0.523, "step": 3770 }, { "epoch": 53.001619047619045, - "grad_norm": 24.690093994140625, + "grad_norm": 0.0013297455152496696, "learning_rate": 7.111111111111112e-06, - "loss": 0.5817, + "loss": 0.0, "step": 3780 }, { "epoch": 53.00257142857143, - "grad_norm": 0.013699243776500225, + "grad_norm": 0.0035004790406674147, "learning_rate": 7.100529100529101e-06, - "loss": 0.09, + "loss": 0.0035, "step": 3790 }, { "epoch": 53.00352380952381, - "grad_norm": 0.010473594069480896, + "grad_norm": 0.0012106123613193631, "learning_rate": 7.08994708994709e-06, - "loss": 0.1719, + "loss": 0.0, "step": 3800 }, { "epoch": 53.00447619047619, - "grad_norm": 106.57927703857422, + "grad_norm": 0.001144262496381998, "learning_rate": 7.07936507936508e-06, - "loss": 0.0357, + "loss": 0.0001, "step": 3810 }, { "epoch": 53.005428571428574, - "grad_norm": 0.020226487889885902, + "grad_norm": 0.0017582399304956198, "learning_rate": 7.068783068783069e-06, - "loss": 0.0004, + "loss": 0.0, "step": 3820 }, { "epoch": 53.00638095238095, - "grad_norm": 2.065251111984253, + "grad_norm": 2.3698835372924805, "learning_rate": 7.058201058201058e-06, - "loss": 0.5874, + "loss": 0.0001, "step": 3830 }, { "epoch": 53.0067619047619, - "eval_accuracy": 0.6351351351351351, - "eval_loss": 1.8655513525009155, - "eval_runtime": 12.75, - "eval_samples_per_second": 5.804, - "eval_steps_per_second": 1.49, + "eval_accuracy": 0.7162162162162162, + "eval_loss": 1.9573771953582764, + "eval_runtime": 17.5172, + "eval_samples_per_second": 4.224, + "eval_steps_per_second": 1.085, "step": 3834 }, { "epoch": 54.000571428571426, - "grad_norm": 28.153850555419922, + "grad_norm": 0.001897272071801126, "learning_rate": 7.047619047619048e-06, - "loss": 0.0034, + "loss": 0.2065, "step": 3840 }, { "epoch": 54.00152380952381, - "grad_norm": 0.0053263334557414055, + "grad_norm": 0.003666786476969719, "learning_rate": 7.0370370370370375e-06, - "loss": 0.0336, + "loss": 0.0001, "step": 3850 }, { "epoch": 54.00247619047619, - "grad_norm": 0.015258570201694965, + "grad_norm": 0.001953072496689856, "learning_rate": 7.026455026455027e-06, - "loss": 0.0005, + "loss": 0.0, "step": 3860 }, { "epoch": 54.00342857142857, - "grad_norm": 0.02392808347940445, + "grad_norm": 159.1454315185547, "learning_rate": 7.015873015873016e-06, - "loss": 0.0003, + "loss": 0.1996, "step": 3870 }, { "epoch": 54.004380952380956, - "grad_norm": 0.015166191384196281, + "grad_norm": 0.006261242087930441, "learning_rate": 7.005291005291006e-06, - "loss": 0.0049, + "loss": 0.0037, "step": 3880 }, { "epoch": 54.00533333333333, - "grad_norm": 0.01293440256267786, + "grad_norm": 0.0007964144460856915, "learning_rate": 6.994708994708995e-06, - "loss": 0.1096, + "loss": 0.0, "step": 3890 }, { "epoch": 54.00628571428572, - "grad_norm": 1.0416723489761353, + "grad_norm": 2.6199288368225098, "learning_rate": 6.984126984126984e-06, - "loss": 0.0004, + "loss": 0.0002, "step": 3900 }, { "epoch": 54.0067619047619, - "eval_accuracy": 0.6621621621621622, - "eval_loss": 2.188612699508667, - "eval_runtime": 12.7647, - "eval_samples_per_second": 5.797, - "eval_steps_per_second": 1.488, + "eval_accuracy": 0.6351351351351351, + "eval_loss": 2.6199848651885986, + "eval_runtime": 19.2121, + "eval_samples_per_second": 3.852, + "eval_steps_per_second": 0.989, "step": 3905 }, { "epoch": 55.00047619047619, - "grad_norm": 0.003094370011240244, + "grad_norm": 0.002870640717446804, "learning_rate": 6.973544973544975e-06, - "loss": 0.0998, + "loss": 0.0191, "step": 3910 }, { "epoch": 55.00142857142857, - "grad_norm": 0.004199085291475058, + "grad_norm": 0.0009840029524639249, "learning_rate": 6.962962962962964e-06, - "loss": 0.1583, + "loss": 0.0, "step": 3920 }, { "epoch": 55.00238095238095, - "grad_norm": 2.298699140548706, + "grad_norm": 0.006166788749396801, "learning_rate": 6.952380952380952e-06, - "loss": 0.0005, + "loss": 0.0003, "step": 3930 }, { "epoch": 55.00333333333333, - "grad_norm": 0.8325915336608887, + "grad_norm": 11.922721862792969, "learning_rate": 6.941798941798943e-06, - "loss": 0.3524, + "loss": 0.0008, "step": 3940 }, { "epoch": 55.004285714285714, - "grad_norm": 0.00403679721057415, + "grad_norm": 0.000661016209051013, "learning_rate": 6.931216931216932e-06, - "loss": 0.0897, + "loss": 0.0051, "step": 3950 }, { "epoch": 55.0052380952381, - "grad_norm": 0.003504963591694832, + "grad_norm": 0.4277324676513672, "learning_rate": 6.920634920634921e-06, - "loss": 0.0008, + "loss": 0.2065, "step": 3960 }, { "epoch": 55.006190476190476, - "grad_norm": 1.0103956460952759, + "grad_norm": 0.0036654549185186625, "learning_rate": 6.9100529100529105e-06, - "loss": 0.0183, + "loss": 0.0, "step": 3970 }, { "epoch": 55.0067619047619, - "eval_accuracy": 0.6486486486486487, - "eval_loss": 2.0148348808288574, - "eval_runtime": 12.7155, - "eval_samples_per_second": 5.82, - "eval_steps_per_second": 1.494, + "eval_accuracy": 0.7027027027027027, + "eval_loss": 2.233295440673828, + "eval_runtime": 19.0212, + "eval_samples_per_second": 3.89, + "eval_steps_per_second": 0.999, "step": 3976 }, { "epoch": 56.00038095238095, - "grad_norm": 0.06938774883747101, + "grad_norm": 0.008012962527573109, "learning_rate": 6.8994708994709005e-06, - "loss": 0.0565, + "loss": 0.0003, "step": 3980 }, { "epoch": 56.001333333333335, - "grad_norm": 0.014017208479344845, + "grad_norm": 0.2828165888786316, "learning_rate": 6.88888888888889e-06, - "loss": 0.177, + "loss": 0.0004, "step": 3990 }, { "epoch": 56.00228571428571, - "grad_norm": 0.8127000331878662, + "grad_norm": 379.705322265625, "learning_rate": 6.878306878306879e-06, - "loss": 0.0011, + "loss": 0.2678, "step": 4000 }, { "epoch": 56.003238095238096, - "grad_norm": 0.006788068450987339, + "grad_norm": 0.005230509676039219, "learning_rate": 6.867724867724869e-06, - "loss": 0.0509, + "loss": 0.0001, "step": 4010 }, { "epoch": 56.00419047619047, - "grad_norm": 0.038642797619104385, + "grad_norm": 0.0008635453414171934, "learning_rate": 6.857142857142858e-06, - "loss": 0.0681, + "loss": 0.0, "step": 4020 }, { "epoch": 56.00514285714286, - "grad_norm": 0.01085800863802433, + "grad_norm": 0.008102879859507084, "learning_rate": 6.846560846560847e-06, - "loss": 0.1773, + "loss": 0.0, "step": 4030 }, { "epoch": 56.00609523809524, - "grad_norm": 0.005230126436799765, + "grad_norm": 0.0062914155423641205, "learning_rate": 6.835978835978837e-06, - "loss": 0.0056, + "loss": 0.0001, "step": 4040 }, { "epoch": 56.0067619047619, - "eval_accuracy": 0.6891891891891891, - "eval_loss": 1.9963090419769287, - "eval_runtime": 12.9011, - "eval_samples_per_second": 5.736, - "eval_steps_per_second": 1.473, + "eval_accuracy": 0.6756756756756757, + "eval_loss": 2.7799017429351807, + "eval_runtime": 18.9614, + "eval_samples_per_second": 3.903, + "eval_steps_per_second": 1.002, "step": 4047 }, { "epoch": 57.00028571428572, - "grad_norm": 0.3820611238479614, + "grad_norm": 0.0031750891357660294, "learning_rate": 6.825396825396826e-06, - "loss": 0.1649, + "loss": 0.0, "step": 4050 }, { "epoch": 57.001238095238094, - "grad_norm": 405.4330139160156, + "grad_norm": 0.0014056439977139235, "learning_rate": 6.814814814814815e-06, - "loss": 0.0209, + "loss": 0.0, "step": 4060 }, { "epoch": 57.00219047619048, - "grad_norm": 0.011841736733913422, + "grad_norm": 0.0033215824514627457, "learning_rate": 6.8042328042328045e-06, - "loss": 0.0002, + "loss": 0.0, "step": 4070 }, { "epoch": 57.003142857142855, - "grad_norm": 0.005479069892317057, + "grad_norm": 0.0037562695797532797, "learning_rate": 6.7936507936507944e-06, - "loss": 0.0003, + "loss": 0.0, "step": 4080 }, { "epoch": 57.00409523809524, - "grad_norm": 0.008774117566645145, + "grad_norm": 0.001404767157509923, "learning_rate": 6.783068783068784e-06, - "loss": 0.0003, + "loss": 0.2013, "step": 4090 }, { "epoch": 57.005047619047616, - "grad_norm": 0.008724145591259003, + "grad_norm": 0.002498056972399354, "learning_rate": 6.772486772486773e-06, - "loss": 0.0003, + "loss": 0.1808, "step": 4100 }, { "epoch": 57.006, - "grad_norm": 0.0038391784764826298, + "grad_norm": 0.006040900945663452, "learning_rate": 6.761904761904763e-06, - "loss": 0.0014, + "loss": 0.0001, "step": 4110 }, { "epoch": 57.0067619047619, - "eval_accuracy": 0.7162162162162162, - "eval_loss": 1.9337941408157349, - "eval_runtime": 14.0711, - "eval_samples_per_second": 5.259, - "eval_steps_per_second": 1.35, + "eval_accuracy": 0.7027027027027027, + "eval_loss": 2.193466901779175, + "eval_runtime": 21.025, + "eval_samples_per_second": 3.52, + "eval_steps_per_second": 0.904, "step": 4118 }, { "epoch": 58.000190476190475, - "grad_norm": 0.01911548525094986, + "grad_norm": 0.001317343907430768, "learning_rate": 6.751322751322752e-06, - "loss": 0.0004, + "loss": 0.0001, "step": 4120 }, { "epoch": 58.00114285714286, - "grad_norm": 0.004693763330578804, + "grad_norm": 0.017525408416986465, "learning_rate": 6.740740740740741e-06, - "loss": 0.0004, + "loss": 0.0001, "step": 4130 }, { "epoch": 58.00209523809524, - "grad_norm": 0.00244527286849916, + "grad_norm": 0.004452765453606844, "learning_rate": 6.730158730158731e-06, - "loss": 0.007, + "loss": 0.0001, "step": 4140 }, { "epoch": 58.00304761904762, - "grad_norm": 0.006372438278049231, + "grad_norm": 0.0021483676973730326, "learning_rate": 6.71957671957672e-06, - "loss": 0.115, + "loss": 0.0001, "step": 4150 }, { "epoch": 58.004, - "grad_norm": 0.008796324953436852, + "grad_norm": 0.00330311874859035, "learning_rate": 6.708994708994709e-06, - "loss": 0.1956, + "loss": 0.0, "step": 4160 }, { "epoch": 58.00495238095238, - "grad_norm": 0.06007494404911995, + "grad_norm": 0.004011472221463919, "learning_rate": 6.698412698412698e-06, - "loss": 0.1988, + "loss": 0.066, "step": 4170 }, { "epoch": 58.00590476190476, - "grad_norm": 0.002893394324928522, + "grad_norm": 0.0005767460679635406, "learning_rate": 6.687830687830688e-06, - "loss": 0.2153, + "loss": 0.0188, "step": 4180 }, { "epoch": 58.0067619047619, - "eval_accuracy": 0.7297297297297297, - "eval_loss": 1.6661046743392944, - "eval_runtime": 14.237, - "eval_samples_per_second": 5.198, - "eval_steps_per_second": 1.335, + "eval_accuracy": 0.7162162162162162, + "eval_loss": 2.227210760116577, + "eval_runtime": 17.7138, + "eval_samples_per_second": 4.178, + "eval_steps_per_second": 1.073, "step": 4189 }, { "epoch": 59.00009523809524, - "grad_norm": 398.3658142089844, + "grad_norm": 0.0010956120677292347, "learning_rate": 6.6772486772486775e-06, - "loss": 0.0361, + "loss": 0.0001, "step": 4190 }, { "epoch": 59.00104761904762, - "grad_norm": 0.03949444368481636, + "grad_norm": 22.01203727722168, "learning_rate": 6.666666666666667e-06, - "loss": 0.1833, + "loss": 0.001, "step": 4200 }, { "epoch": 59.002, - "grad_norm": 0.0030886358581483364, + "grad_norm": 0.0007579278899356723, "learning_rate": 6.656084656084657e-06, - "loss": 0.0409, + "loss": 0.0001, "step": 4210 }, { "epoch": 59.00295238095238, - "grad_norm": 0.0057244510389864445, + "grad_norm": 0.00103019701782614, "learning_rate": 6.645502645502646e-06, - "loss": 0.0176, + "loss": 0.0, "step": 4220 }, { "epoch": 59.003904761904764, - "grad_norm": 0.004152972251176834, + "grad_norm": 0.7090355753898621, "learning_rate": 6.634920634920635e-06, - "loss": 0.0002, + "loss": 0.0023, "step": 4230 }, { "epoch": 59.00485714285714, - "grad_norm": 0.023913225159049034, + "grad_norm": 0.0037675583735108376, "learning_rate": 6.624338624338626e-06, - "loss": 0.0003, + "loss": 0.0001, "step": 4240 }, { "epoch": 59.005809523809525, - "grad_norm": 0.010618428699672222, + "grad_norm": 0.016517408192157745, "learning_rate": 6.613756613756615e-06, - "loss": 0.003, + "loss": 0.4185, "step": 4250 }, { "epoch": 59.0067619047619, - "grad_norm": 0.033724039793014526, + "grad_norm": 0.00235711014829576, "learning_rate": 6.603174603174603e-06, - "loss": 0.0003, + "loss": 0.1013, "step": 4260 }, { "epoch": 59.0067619047619, - "eval_accuracy": 0.7162162162162162, - "eval_loss": 1.953974962234497, - "eval_runtime": 8.5971, - "eval_samples_per_second": 8.608, - "eval_steps_per_second": 2.21, + "eval_accuracy": 0.7027027027027027, + "eval_loss": 2.3606715202331543, + "eval_runtime": 17.3702, + "eval_samples_per_second": 4.26, + "eval_steps_per_second": 1.094, "step": 4260 }, { "epoch": 60.000952380952384, - "grad_norm": 0.004475221503525972, + "grad_norm": 0.003475839737802744, "learning_rate": 6.592592592592592e-06, - "loss": 0.0024, + "loss": 0.0001, "step": 4270 }, { "epoch": 60.00190476190476, - "grad_norm": 0.0032557854428887367, + "grad_norm": 0.0010961840162053704, "learning_rate": 6.582010582010583e-06, - "loss": 0.0003, + "loss": 0.0015, "step": 4280 }, { "epoch": 60.002857142857145, - "grad_norm": 0.17536596953868866, + "grad_norm": 0.001646665041334927, "learning_rate": 6.571428571428572e-06, - "loss": 0.0006, + "loss": 0.1763, "step": 4290 }, { "epoch": 60.00380952380952, - "grad_norm": 0.0030879988335072994, + "grad_norm": 0.00105653319042176, "learning_rate": 6.560846560846561e-06, - "loss": 0.0001, + "loss": 0.0, "step": 4300 }, { "epoch": 60.00476190476191, - "grad_norm": 0.558417558670044, + "grad_norm": 0.0037881555035710335, "learning_rate": 6.550264550264551e-06, - "loss": 0.1021, + "loss": 0.2071, "step": 4310 }, { "epoch": 60.005714285714284, - "grad_norm": 0.08702671527862549, + "grad_norm": 0.0010041279019787908, "learning_rate": 6.5396825396825405e-06, - "loss": 1.0492, + "loss": 0.0, "step": 4320 }, { "epoch": 60.00666666666667, - "grad_norm": 331.0879211425781, + "grad_norm": 0.002599923172965646, "learning_rate": 6.52910052910053e-06, - "loss": 0.3193, + "loss": 0.0001, "step": 4330 }, { "epoch": 60.0067619047619, - "eval_accuracy": 0.7027027027027027, - "eval_loss": 2.107548952102661, - "eval_runtime": 8.5393, - "eval_samples_per_second": 8.666, - "eval_steps_per_second": 2.225, + "eval_accuracy": 0.7432432432432432, + "eval_loss": 2.1222922801971436, + "eval_runtime": 17.4695, + "eval_samples_per_second": 4.236, + "eval_steps_per_second": 1.088, "step": 4331 }, { "epoch": 61.00085714285714, - "grad_norm": 0.006396997254341841, + "grad_norm": 0.0010362501488998532, "learning_rate": 6.51851851851852e-06, - "loss": 0.1484, + "loss": 0.0, "step": 4340 }, { "epoch": 61.00180952380953, - "grad_norm": 0.005390217062085867, + "grad_norm": 0.0007642352138645947, "learning_rate": 6.507936507936509e-06, - "loss": 0.4848, + "loss": 0.1222, "step": 4350 }, { "epoch": 61.002761904761904, - "grad_norm": 0.634354293346405, + "grad_norm": 0.000809229037258774, "learning_rate": 6.497354497354498e-06, - "loss": 0.2016, + "loss": 0.2521, "step": 4360 }, { "epoch": 61.00371428571429, - "grad_norm": 0.018984658643603325, + "grad_norm": 0.23083215951919556, "learning_rate": 6.486772486772487e-06, - "loss": 0.1912, + "loss": 0.0782, "step": 4370 }, { "epoch": 61.004666666666665, - "grad_norm": 0.022517457604408264, + "grad_norm": 0.29525506496429443, "learning_rate": 6.476190476190477e-06, - "loss": 0.0038, + "loss": 0.0516, "step": 4380 }, { "epoch": 61.00561904761905, - "grad_norm": 0.015906190499663353, + "grad_norm": 0.0027424772270023823, "learning_rate": 6.465608465608466e-06, - "loss": 0.1621, + "loss": 0.1534, "step": 4390 }, { "epoch": 61.00657142857143, - "grad_norm": 0.03879152610898018, + "grad_norm": 0.0015537068247795105, "learning_rate": 6.455026455026455e-06, - "loss": 0.0004, + "loss": 0.0026, "step": 4400 }, { "epoch": 61.0067619047619, - "eval_accuracy": 0.7432432432432432, - "eval_loss": 1.537624478340149, - "eval_runtime": 8.4984, - "eval_samples_per_second": 8.708, - "eval_steps_per_second": 2.236, + "eval_accuracy": 0.7567567567567568, + "eval_loss": 1.9220436811447144, + "eval_runtime": 18.9971, + "eval_samples_per_second": 3.895, + "eval_steps_per_second": 1.0, "step": 4402 }, { "epoch": 62.0007619047619, - "grad_norm": 0.020200679078698158, + "grad_norm": 0.0010604149429127574, "learning_rate": 6.444444444444445e-06, - "loss": 0.0003, + "loss": 0.1698, "step": 4410 }, { "epoch": 62.001714285714286, - "grad_norm": 0.9699930548667908, + "grad_norm": 0.0010832214029505849, "learning_rate": 6.4338624338624345e-06, - "loss": 0.0584, + "loss": 0.0, "step": 4420 }, { "epoch": 62.00266666666667, - "grad_norm": 95.47468566894531, + "grad_norm": 0.02589496225118637, "learning_rate": 6.423280423280424e-06, - "loss": 0.2909, + "loss": 0.0001, "step": 4430 }, { "epoch": 62.00361904761905, - "grad_norm": 379.65625, + "grad_norm": 0.0012127620866522193, "learning_rate": 6.412698412698414e-06, - "loss": 0.1354, + "loss": 0.0146, "step": 4440 }, { "epoch": 62.00457142857143, - "grad_norm": 0.15193375945091248, + "grad_norm": 0.00289982371032238, "learning_rate": 6.402116402116403e-06, - "loss": 0.1708, + "loss": 0.0, "step": 4450 }, { "epoch": 62.00552380952381, - "grad_norm": 0.004682454280555248, + "grad_norm": 0.0035972294863313437, "learning_rate": 6.391534391534392e-06, - "loss": 0.1846, + "loss": 0.1764, "step": 4460 }, { "epoch": 62.00647619047619, - "grad_norm": 0.0702052190899849, + "grad_norm": 0.0023234295658767223, "learning_rate": 6.380952380952381e-06, - "loss": 0.0003, + "loss": 0.193, "step": 4470 }, { "epoch": 62.0067619047619, "eval_accuracy": 0.7027027027027027, - "eval_loss": 1.9646750688552856, - "eval_runtime": 10.8955, - "eval_samples_per_second": 6.792, - "eval_steps_per_second": 1.744, + "eval_loss": 2.22542405128479, + "eval_runtime": 17.4236, + "eval_samples_per_second": 4.247, + "eval_steps_per_second": 1.09, "step": 4473 }, { "epoch": 63.00066666666667, - "grad_norm": 0.008514277637004852, + "grad_norm": 0.010563456453382969, "learning_rate": 6.370370370370371e-06, - "loss": 0.0899, + "loss": 0.0, "step": 4480 }, { "epoch": 63.001619047619045, - "grad_norm": 0.00909407902508974, + "grad_norm": 0.0025069634430110455, "learning_rate": 6.35978835978836e-06, - "loss": 0.1035, + "loss": 0.0, "step": 4490 }, { "epoch": 63.00257142857143, - "grad_norm": 0.0038764197379350662, + "grad_norm": 0.0028519683983176947, "learning_rate": 6.349206349206349e-06, - "loss": 0.0514, + "loss": 0.0797, "step": 4500 }, { "epoch": 63.00352380952381, - "grad_norm": 0.009397774003446102, + "grad_norm": 0.0033148368820548058, "learning_rate": 6.338624338624339e-06, "loss": 0.0003, "step": 4510 }, { "epoch": 63.00447619047619, - "grad_norm": 0.0025235984940081835, + "grad_norm": 0.008310235105454922, "learning_rate": 6.328042328042328e-06, - "loss": 0.1571, + "loss": 0.4209, "step": 4520 }, { "epoch": 63.005428571428574, - "grad_norm": 0.0036749846767634153, + "grad_norm": 0.43833351135253906, "learning_rate": 6.3174603174603175e-06, - "loss": 0.0002, + "loss": 0.002, "step": 4530 }, { "epoch": 63.00638095238095, - "grad_norm": 0.06227302551269531, + "grad_norm": 0.001988427247852087, "learning_rate": 6.3068783068783075e-06, - "loss": 0.0006, + "loss": 0.0002, "step": 4540 }, { "epoch": 63.0067619047619, - "eval_accuracy": 0.7297297297297297, - "eval_loss": 1.887770414352417, - "eval_runtime": 8.9551, - "eval_samples_per_second": 8.263, - "eval_steps_per_second": 2.122, + "eval_accuracy": 0.6621621621621622, + "eval_loss": 2.2681643962860107, + "eval_runtime": 17.5739, + "eval_samples_per_second": 4.211, + "eval_steps_per_second": 1.081, "step": 4544 }, { "epoch": 64.00057142857143, - "grad_norm": 0.01653936505317688, + "grad_norm": 0.0018558625597506762, "learning_rate": 6.296296296296297e-06, - "loss": 0.0339, + "loss": 0.0, "step": 4550 }, { "epoch": 64.0015238095238, - "grad_norm": 0.005849821958690882, + "grad_norm": 0.00036164221819490194, "learning_rate": 6.285714285714286e-06, - "loss": 0.0802, + "loss": 0.0605, "step": 4560 }, { "epoch": 64.00247619047619, - "grad_norm": 0.00495440699160099, + "grad_norm": 0.001508195884525776, "learning_rate": 6.275132275132275e-06, - "loss": 0.0434, + "loss": 0.0007, "step": 4570 }, { "epoch": 64.00342857142857, - "grad_norm": 119.76103973388672, + "grad_norm": 0.01874958910048008, "learning_rate": 6.264550264550266e-06, - "loss": 0.2473, + "loss": 0.0391, "step": 4580 }, { "epoch": 64.00438095238096, - "grad_norm": 298.63104248046875, + "grad_norm": 0.0006183416116982698, "learning_rate": 6.253968253968254e-06, - "loss": 0.0313, + "loss": 0.1875, "step": 4590 }, { "epoch": 64.00533333333334, - "grad_norm": 0.004146216437220573, + "grad_norm": 0.001555442693643272, "learning_rate": 6.243386243386243e-06, - "loss": 0.0002, + "loss": 0.0007, "step": 4600 }, { "epoch": 64.00628571428571, - "grad_norm": 30.183269500732422, + "grad_norm": 0.0006395932286977768, "learning_rate": 6.232804232804234e-06, - "loss": 0.0018, + "loss": 0.0, "step": 4610 }, { "epoch": 64.0067619047619, - "eval_accuracy": 0.7297297297297297, - "eval_loss": 1.7761112451553345, - "eval_runtime": 14.5242, - "eval_samples_per_second": 5.095, - "eval_steps_per_second": 1.308, + "eval_accuracy": 0.6891891891891891, + "eval_loss": 2.685673475265503, + "eval_runtime": 17.963, + "eval_samples_per_second": 4.12, + "eval_steps_per_second": 1.058, "step": 4615 }, { "epoch": 65.00047619047619, - "grad_norm": 0.005379782058298588, + "grad_norm": 0.002215348416939378, "learning_rate": 6.222222222222223e-06, - "loss": 0.0002, + "loss": 0.0, "step": 4620 }, { "epoch": 65.00142857142858, - "grad_norm": 0.02137589082121849, + "grad_norm": 0.001222651218995452, "learning_rate": 6.211640211640212e-06, - "loss": 0.0002, + "loss": 0.0004, "step": 4630 }, { "epoch": 65.00238095238095, - "grad_norm": 0.0034357302356511354, + "grad_norm": 0.0009166031959466636, "learning_rate": 6.201058201058202e-06, - "loss": 0.041, + "loss": 0.2341, "step": 4640 }, { "epoch": 65.00333333333333, - "grad_norm": 22.05573272705078, + "grad_norm": 0.0009040706208907068, "learning_rate": 6.1904761904761914e-06, - "loss": 0.1358, + "loss": 0.0013, "step": 4650 }, { "epoch": 65.00428571428571, - "grad_norm": 0.006555379368364811, + "grad_norm": 0.003010386601090431, "learning_rate": 6.1798941798941806e-06, - "loss": 0.0002, + "loss": 0.0001, "step": 4660 }, { "epoch": 65.0052380952381, - "grad_norm": 0.9638269543647766, + "grad_norm": 0.018778080120682716, "learning_rate": 6.16931216931217e-06, - "loss": 0.1289, + "loss": 0.0001, "step": 4670 }, { "epoch": 65.00619047619048, - "grad_norm": 0.006485573947429657, + "grad_norm": 0.0012031777296215296, "learning_rate": 6.15873015873016e-06, - "loss": 0.0002, + "loss": 0.0, "step": 4680 }, { "epoch": 65.0067619047619, - "eval_accuracy": 0.7027027027027027, - "eval_loss": 1.75357985496521, - "eval_runtime": 14.0154, - "eval_samples_per_second": 5.28, - "eval_steps_per_second": 1.356, + "eval_accuracy": 0.7297297297297297, + "eval_loss": 2.379077672958374, + "eval_runtime": 17.2414, + "eval_samples_per_second": 4.292, + "eval_steps_per_second": 1.102, "step": 4686 }, { "epoch": 66.00038095238095, - "grad_norm": 0.004733482841402292, + "grad_norm": 0.0038914321921765804, "learning_rate": 6.148148148148149e-06, - "loss": 0.0023, + "loss": 0.0, "step": 4690 }, { "epoch": 66.00133333333333, - "grad_norm": 0.00244261440820992, + "grad_norm": 0.0008610020740889013, "learning_rate": 6.137566137566138e-06, - "loss": 0.0739, + "loss": 0.0001, "step": 4700 }, { "epoch": 66.00228571428572, - "grad_norm": 0.005988690070807934, + "grad_norm": 0.00043805301538668573, "learning_rate": 6.126984126984128e-06, - "loss": 0.0002, + "loss": 0.3381, "step": 4710 }, { "epoch": 66.00323809523809, - "grad_norm": 3.9605820178985596, + "grad_norm": 7.665761947631836, "learning_rate": 6.116402116402117e-06, - "loss": 0.1871, + "loss": 0.0306, "step": 4720 }, { "epoch": 66.00419047619047, - "grad_norm": 0.0033270171843469143, + "grad_norm": 0.006835469510406256, "learning_rate": 6.105820105820106e-06, - "loss": 0.0004, + "loss": 0.0002, "step": 4730 }, { "epoch": 66.00514285714286, - "grad_norm": 0.003011047840118408, + "grad_norm": 0.0009099821327254176, "learning_rate": 6.095238095238096e-06, - "loss": 0.0026, + "loss": 0.0, "step": 4740 }, { "epoch": 66.00609523809524, - "grad_norm": 0.003045491874217987, + "grad_norm": 0.0022552493028342724, "learning_rate": 6.084656084656085e-06, - "loss": 0.0001, + "loss": 0.0076, "step": 4750 }, { "epoch": 66.0067619047619, "eval_accuracy": 0.6756756756756757, - "eval_loss": 2.2684450149536133, - "eval_runtime": 12.9471, - "eval_samples_per_second": 5.716, - "eval_steps_per_second": 1.468, + "eval_loss": 2.8393194675445557, + "eval_runtime": 13.8869, + "eval_samples_per_second": 5.329, + "eval_steps_per_second": 1.368, "step": 4757 }, { "epoch": 67.00028571428571, - "grad_norm": 0.003044073935598135, + "grad_norm": 0.13802634179592133, "learning_rate": 6.0740740740740745e-06, - "loss": 0.0637, + "loss": 0.0001, "step": 4760 }, { "epoch": 67.0012380952381, - "grad_norm": 0.0024552911054342985, + "grad_norm": 0.003053755732253194, "learning_rate": 6.063492063492064e-06, - "loss": 0.0612, + "loss": 0.2751, "step": 4770 }, { "epoch": 67.00219047619048, - "grad_norm": 0.004682763013988733, + "grad_norm": 0.060533616691827774, "learning_rate": 6.052910052910054e-06, "loss": 0.0002, "step": 4780 }, { "epoch": 67.00314285714286, - "grad_norm": 0.07617571204900742, + "grad_norm": 0.003102941671386361, "learning_rate": 6.042328042328043e-06, - "loss": 0.0417, + "loss": 0.0, "step": 4790 }, { "epoch": 67.00409523809523, - "grad_norm": 6.378434181213379, + "grad_norm": 0.0014726222725585103, "learning_rate": 6.031746031746032e-06, - "loss": 0.1841, + "loss": 0.1433, "step": 4800 }, { "epoch": 67.00504761904762, - "grad_norm": 0.0036101271398365498, + "grad_norm": 0.012411821633577347, "learning_rate": 6.021164021164022e-06, - "loss": 0.2032, + "loss": 0.0953, "step": 4810 }, { "epoch": 67.006, - "grad_norm": 0.006891037803143263, + "grad_norm": 0.003996816463768482, "learning_rate": 6.010582010582011e-06, - "loss": 0.0002, + "loss": 0.0043, "step": 4820 }, { "epoch": 67.0067619047619, "eval_accuracy": 0.7162162162162162, - "eval_loss": 1.7061450481414795, - "eval_runtime": 14.3395, - "eval_samples_per_second": 5.161, - "eval_steps_per_second": 1.325, + "eval_loss": 1.9305188655853271, + "eval_runtime": 13.8404, + "eval_samples_per_second": 5.347, + "eval_steps_per_second": 1.373, "step": 4828 }, { "epoch": 68.00019047619048, - "grad_norm": 0.003544341307133436, + "grad_norm": 0.0005873045884072781, "learning_rate": 6e-06, "loss": 0.0001, "step": 4830 }, { "epoch": 68.00114285714285, - "grad_norm": 2.7320408821105957, + "grad_norm": 0.028566883876919746, "learning_rate": 5.989417989417989e-06, - "loss": 0.0003, + "loss": 0.0, "step": 4840 }, { "epoch": 68.00209523809524, - "grad_norm": 0.004226857330650091, + "grad_norm": 0.0005609308718703687, "learning_rate": 5.978835978835979e-06, - "loss": 0.0004, + "loss": 0.0, "step": 4850 }, { "epoch": 68.00304761904762, - "grad_norm": 0.3182094991207123, + "grad_norm": 0.001683125738054514, "learning_rate": 5.968253968253968e-06, - "loss": 0.0003, + "loss": 0.0001, "step": 4860 }, { "epoch": 68.004, - "grad_norm": 0.002105162013322115, + "grad_norm": 0.0006265339907258749, "learning_rate": 5.9576719576719576e-06, - "loss": 0.0337, + "loss": 0.0001, "step": 4870 }, { "epoch": 68.00495238095237, - "grad_norm": 0.002672486240044236, + "grad_norm": 0.0011454337509348989, "learning_rate": 5.9470899470899475e-06, - "loss": 0.0002, + "loss": 0.0, "step": 4880 }, { "epoch": 68.00590476190476, - "grad_norm": 0.004141777753829956, + "grad_norm": 3.5152859687805176, "learning_rate": 5.936507936507937e-06, - "loss": 0.0498, + "loss": 0.0003, "step": 4890 }, { "epoch": 68.0067619047619, - "eval_accuracy": 0.7162162162162162, - "eval_loss": 1.8081656694412231, - "eval_runtime": 10.8926, - "eval_samples_per_second": 6.794, - "eval_steps_per_second": 1.744, + "eval_accuracy": 0.7297297297297297, + "eval_loss": 1.9944276809692383, + "eval_runtime": 15.8087, + "eval_samples_per_second": 4.681, + "eval_steps_per_second": 1.202, "step": 4899 }, { "epoch": 69.00009523809524, - "grad_norm": 0.047161370515823364, + "grad_norm": 0.0005341055803000927, "learning_rate": 5.925925925925926e-06, - "loss": 0.0001, + "loss": 0.0, "step": 4900 }, { "epoch": 69.00104761904763, - "grad_norm": 0.005020506214350462, + "grad_norm": 0.0011177296983078122, "learning_rate": 5.915343915343917e-06, - "loss": 0.0032, + "loss": 0.0919, "step": 4910 }, { "epoch": 69.002, - "grad_norm": 0.04604746028780937, + "grad_norm": 0.0007625820580869913, "learning_rate": 5.904761904761905e-06, - "loss": 0.0001, + "loss": 0.0, "step": 4920 }, { "epoch": 69.00295238095238, - "grad_norm": 0.005650079809129238, + "grad_norm": 0.0006928302464075387, "learning_rate": 5.894179894179894e-06, - "loss": 0.0003, + "loss": 0.0, "step": 4930 }, { "epoch": 69.00390476190476, - "grad_norm": 0.017339196056127548, + "grad_norm": 0.002256699139252305, "learning_rate": 5.883597883597883e-06, - "loss": 0.0002, + "loss": 0.0, "step": 4940 }, { "epoch": 69.00485714285715, - "grad_norm": 0.016551831737160683, + "grad_norm": 0.0004958523204550147, "learning_rate": 5.873015873015874e-06, - "loss": 0.0058, + "loss": 0.0, "step": 4950 }, { "epoch": 69.00580952380952, - "grad_norm": 0.002221416449174285, + "grad_norm": 0.00038925904664210975, "learning_rate": 5.862433862433863e-06, - "loss": 0.0001, + "loss": 0.0, "step": 4960 }, { "epoch": 69.0067619047619, - "grad_norm": 0.004755695350468159, + "grad_norm": 0.002490544691681862, "learning_rate": 5.8518518518518515e-06, - "loss": 0.0007, + "loss": 0.0, "step": 4970 }, { "epoch": 69.0067619047619, - "eval_accuracy": 0.7297297297297297, - "eval_loss": 1.7664811611175537, - "eval_runtime": 10.4186, - "eval_samples_per_second": 7.103, - "eval_steps_per_second": 1.824, + "eval_accuracy": 0.7162162162162162, + "eval_loss": 2.58418345451355, + "eval_runtime": 15.9651, + "eval_samples_per_second": 4.635, + "eval_steps_per_second": 1.19, "step": 4970 }, { "epoch": 70.00095238095238, - "grad_norm": 0.020240673795342445, + "grad_norm": 0.0006307671428658068, "learning_rate": 5.841269841269842e-06, - "loss": 0.0001, + "loss": 0.0029, "step": 4980 }, { "epoch": 70.00190476190477, - "grad_norm": 0.003607440972700715, + "grad_norm": 0.0025409061927348375, "learning_rate": 5.8306878306878314e-06, - "loss": 0.0002, + "loss": 0.0502, "step": 4990 }, { "epoch": 70.00285714285714, - "grad_norm": 0.02144448459148407, + "grad_norm": 0.0017973057692870498, "learning_rate": 5.820105820105821e-06, - "loss": 0.0018, + "loss": 0.0, "step": 5000 }, { "epoch": 70.00380952380952, - "grad_norm": 0.003655269043520093, + "grad_norm": 0.0004459419578779489, "learning_rate": 5.8095238095238106e-06, - "loss": 0.0004, + "loss": 0.0, "step": 5010 }, { "epoch": 70.0047619047619, - "grad_norm": 0.11724881082773209, + "grad_norm": 0.0011419616639614105, "learning_rate": 5.7989417989418e-06, - "loss": 0.0001, + "loss": 0.0, "step": 5020 }, { "epoch": 70.00571428571429, - "grad_norm": 0.0018330231541767716, + "grad_norm": 0.0005456113140098751, "learning_rate": 5.788359788359789e-06, "loss": 0.0001, "step": 5030 }, { "epoch": 70.00666666666666, - "grad_norm": 0.005465388298034668, + "grad_norm": 0.0008488246239721775, "learning_rate": 5.777777777777778e-06, - "loss": 0.0019, + "loss": 0.0001, "step": 5040 }, { "epoch": 70.0067619047619, - "eval_accuracy": 0.6756756756756757, - "eval_loss": 2.5360360145568848, - "eval_runtime": 10.857, - "eval_samples_per_second": 6.816, - "eval_steps_per_second": 1.75, + "eval_accuracy": 0.6621621621621622, + "eval_loss": 2.650306463241577, + "eval_runtime": 17.1662, + "eval_samples_per_second": 4.311, + "eval_steps_per_second": 1.107, "step": 5041 }, { "epoch": 71.00085714285714, - "grad_norm": 0.0026980633847415447, + "grad_norm": 0.07173417508602142, "learning_rate": 5.767195767195768e-06, - "loss": 0.0001, + "loss": 0.0, "step": 5050 }, { "epoch": 71.00180952380953, - "grad_norm": 0.00328731257468462, + "grad_norm": 0.0004941129591315985, "learning_rate": 5.756613756613757e-06, - "loss": 0.0001, + "loss": 0.0, "step": 5060 }, { "epoch": 71.00276190476191, - "grad_norm": 0.008453653194010258, + "grad_norm": 0.0032803788781166077, "learning_rate": 5.746031746031746e-06, - "loss": 0.0321, + "loss": 0.0, "step": 5070 }, { "epoch": 71.00371428571428, - "grad_norm": 0.006831921171396971, + "grad_norm": 0.0007884973310865462, "learning_rate": 5.735449735449736e-06, - "loss": 0.0001, + "loss": 0.0, "step": 5080 }, { "epoch": 71.00466666666667, - "grad_norm": 0.00503316568210721, + "grad_norm": 0.00035243743332102895, "learning_rate": 5.724867724867725e-06, - "loss": 0.0001, + "loss": 0.0, "step": 5090 }, { "epoch": 71.00561904761905, - "grad_norm": 0.0034536656457930803, + "grad_norm": 0.001297764596529305, "learning_rate": 5.7142857142857145e-06, - "loss": 0.0001, + "loss": 0.0, "step": 5100 }, { "epoch": 71.00657142857143, - "grad_norm": 0.003798070829361677, + "grad_norm": 0.0007165080169215798, "learning_rate": 5.7037037037037045e-06, - "loss": 0.0854, + "loss": 0.0, "step": 5110 }, { "epoch": 71.0067619047619, - "eval_accuracy": 0.6891891891891891, - "eval_loss": 2.0175890922546387, - "eval_runtime": 9.8418, - "eval_samples_per_second": 7.519, - "eval_steps_per_second": 1.931, + "eval_accuracy": 0.6756756756756757, + "eval_loss": 2.725356340408325, + "eval_runtime": 15.7698, + "eval_samples_per_second": 4.693, + "eval_steps_per_second": 1.205, "step": 5112 }, { "epoch": 72.0007619047619, - "grad_norm": 0.002290370874106884, + "grad_norm": 0.0005112860817462206, "learning_rate": 5.693121693121694e-06, - "loss": 0.0003, + "loss": 0.0, "step": 5120 }, { "epoch": 72.00171428571429, - "grad_norm": 0.001876681111752987, + "grad_norm": 0.0010144360130652785, "learning_rate": 5.682539682539683e-06, - "loss": 0.0018, + "loss": 0.0, "step": 5130 }, { "epoch": 72.00266666666667, - "grad_norm": 0.002466893056407571, + "grad_norm": 0.0012267096899449825, "learning_rate": 5.671957671957672e-06, - "loss": 0.16, + "loss": 0.0, "step": 5140 }, { "epoch": 72.00361904761905, - "grad_norm": 0.0039909700863063335, + "grad_norm": 0.0029065243434160948, "learning_rate": 5.661375661375662e-06, - "loss": 0.2172, + "loss": 0.0, "step": 5150 }, { "epoch": 72.00457142857142, - "grad_norm": 0.0021058020647615194, + "grad_norm": 0.00046578419278375804, "learning_rate": 5.650793650793651e-06, - "loss": 0.0819, + "loss": 0.0, "step": 5160 }, { "epoch": 72.00552380952381, - "grad_norm": 0.008475403301417828, + "grad_norm": 0.0018618660978972912, "learning_rate": 5.64021164021164e-06, - "loss": 0.2028, + "loss": 0.0, "step": 5170 }, { "epoch": 72.00647619047619, - "grad_norm": 0.003610498271882534, + "grad_norm": 0.0006170138367451727, "learning_rate": 5.62962962962963e-06, - "loss": 0.153, + "loss": 0.0002, "step": 5180 }, { "epoch": 72.0067619047619, - "eval_accuracy": 0.6351351351351351, - "eval_loss": 2.605787992477417, - "eval_runtime": 10.3265, - "eval_samples_per_second": 7.166, - "eval_steps_per_second": 1.84, + "eval_accuracy": 0.6621621621621622, + "eval_loss": 3.0428946018218994, + "eval_runtime": 18.5951, + "eval_samples_per_second": 3.98, + "eval_steps_per_second": 1.022, "step": 5183 }, { "epoch": 73.00066666666666, - "grad_norm": 41.92068862915039, + "grad_norm": 0.10139445960521698, "learning_rate": 5.619047619047619e-06, - "loss": 0.3756, + "loss": 0.0, "step": 5190 }, { "epoch": 73.00161904761904, - "grad_norm": 0.04569064453244209, + "grad_norm": 0.0012034185929223895, "learning_rate": 5.6084656084656084e-06, - "loss": 0.0015, + "loss": 0.3207, "step": 5200 }, { "epoch": 73.00257142857143, - "grad_norm": 0.01304867397993803, + "grad_norm": 0.000532768142875284, "learning_rate": 5.597883597883598e-06, - "loss": 0.0182, + "loss": 0.2441, "step": 5210 }, { "epoch": 73.00352380952381, - "grad_norm": 0.011928781867027283, + "grad_norm": 0.026747262105345726, "learning_rate": 5.5873015873015876e-06, - "loss": 0.0003, + "loss": 0.0, "step": 5220 }, { "epoch": 73.0044761904762, - "grad_norm": 110.30216217041016, + "grad_norm": 0.006560751236975193, "learning_rate": 5.576719576719577e-06, - "loss": 0.0016, + "loss": 0.0001, "step": 5230 }, { "epoch": 73.00542857142857, - "grad_norm": 0.0032866550609469414, + "grad_norm": 0.0026356203015893698, "learning_rate": 5.566137566137566e-06, - "loss": 0.0108, + "loss": 0.0, "step": 5240 }, { "epoch": 73.00638095238095, - "grad_norm": 0.026729857549071312, + "grad_norm": 0.0004046234826091677, "learning_rate": 5.555555555555557e-06, - "loss": 0.0001, + "loss": 0.0, "step": 5250 }, { "epoch": 73.0067619047619, - "eval_accuracy": 0.7162162162162162, - "eval_loss": 1.941354751586914, - "eval_runtime": 9.8503, - "eval_samples_per_second": 7.512, - "eval_steps_per_second": 1.929, + "eval_accuracy": 0.7027027027027027, + "eval_loss": 2.5715653896331787, + "eval_runtime": 19.502, + "eval_samples_per_second": 3.794, + "eval_steps_per_second": 0.974, "step": 5254 }, { "epoch": 74.00057142857143, - "grad_norm": 0.0036914239171892405, + "grad_norm": 0.000314861536026001, "learning_rate": 5.544973544973545e-06, - "loss": 0.1754, + "loss": 0.0, "step": 5260 }, { "epoch": 74.0015238095238, - "grad_norm": 0.0037844409234821796, + "grad_norm": 0.0012239968637004495, "learning_rate": 5.534391534391534e-06, - "loss": 0.0008, + "loss": 0.0, "step": 5270 }, { "epoch": 74.00247619047619, - "grad_norm": 0.005616551265120506, + "grad_norm": 0.001302064280025661, "learning_rate": 5.523809523809525e-06, - "loss": 0.0035, + "loss": 0.0, "step": 5280 }, { "epoch": 74.00342857142857, - "grad_norm": 0.0016844181809574366, + "grad_norm": 0.000552400597371161, "learning_rate": 5.513227513227514e-06, - "loss": 0.0001, + "loss": 0.0, "step": 5290 }, { "epoch": 74.00438095238096, - "grad_norm": 0.002806992968544364, + "grad_norm": 0.00045846428838558495, "learning_rate": 5.502645502645503e-06, - "loss": 0.203, + "loss": 0.0, "step": 5300 }, { "epoch": 74.00533333333334, - "grad_norm": 0.002338800812140107, + "grad_norm": 0.008412440307438374, "learning_rate": 5.492063492063493e-06, - "loss": 0.4365, + "loss": 0.0136, "step": 5310 }, { "epoch": 74.00628571428571, - "grad_norm": 0.007252326235175133, + "grad_norm": 0.0009433178347535431, "learning_rate": 5.481481481481482e-06, - "loss": 0.1577, + "loss": 0.0671, "step": 5320 }, { "epoch": 74.0067619047619, - "eval_accuracy": 0.6891891891891891, - "eval_loss": 2.187164545059204, - "eval_runtime": 10.1659, - "eval_samples_per_second": 7.279, - "eval_steps_per_second": 1.869, + "eval_accuracy": 0.7027027027027027, + "eval_loss": 2.5143699645996094, + "eval_runtime": 16.2734, + "eval_samples_per_second": 4.547, + "eval_steps_per_second": 1.168, "step": 5325 }, { "epoch": 75.00047619047619, - "grad_norm": 0.0017462290124967694, + "grad_norm": 0.0010361479362472892, "learning_rate": 5.4708994708994715e-06, - "loss": 0.0001, + "loss": 0.0, "step": 5330 }, { "epoch": 75.00142857142858, - "grad_norm": 0.002199310576543212, + "grad_norm": 0.0006854168605059385, "learning_rate": 5.460317460317461e-06, - "loss": 0.0562, + "loss": 0.0, "step": 5340 }, { "epoch": 75.00238095238095, - "grad_norm": 0.012386896647512913, + "grad_norm": 0.00024168891832232475, "learning_rate": 5.449735449735451e-06, - "loss": 0.0001, + "loss": 0.0, "step": 5350 }, { "epoch": 75.00333333333333, - "grad_norm": 0.008624891750514507, + "grad_norm": 0.0027299614157527685, "learning_rate": 5.43915343915344e-06, - "loss": 0.0032, + "loss": 0.0, "step": 5360 }, { "epoch": 75.00428571428571, - "grad_norm": 0.002533063292503357, + "grad_norm": 0.019003285095095634, "learning_rate": 5.428571428571429e-06, - "loss": 0.2047, + "loss": 0.0, "step": 5370 }, { "epoch": 75.0052380952381, - "grad_norm": 0.0017573687946423888, + "grad_norm": 0.0002465677389409393, "learning_rate": 5.417989417989419e-06, - "loss": 0.0363, + "loss": 0.0, "step": 5380 }, { "epoch": 75.00619047619048, - "grad_norm": 0.0017679115990176797, + "grad_norm": 0.00031525909435003996, "learning_rate": 5.407407407407408e-06, - "loss": 0.0001, + "loss": 0.0, "step": 5390 }, { "epoch": 75.0067619047619, - "eval_accuracy": 0.7027027027027027, - "eval_loss": 1.9069937467575073, - "eval_runtime": 9.7602, - "eval_samples_per_second": 7.582, - "eval_steps_per_second": 1.947, + "eval_accuracy": 0.6621621621621622, + "eval_loss": 2.893791913986206, + "eval_runtime": 16.5432, + "eval_samples_per_second": 4.473, + "eval_steps_per_second": 1.149, "step": 5396 }, { "epoch": 76.00038095238095, - "grad_norm": 0.009734553284943104, + "grad_norm": 0.000373604241758585, "learning_rate": 5.396825396825397e-06, - "loss": 0.0001, + "loss": 0.0, "step": 5400 }, { "epoch": 76.00133333333333, - "grad_norm": 0.007349786348640919, + "grad_norm": 0.0354059673845768, "learning_rate": 5.386243386243387e-06, - "loss": 0.1829, + "loss": 0.0, "step": 5410 }, { "epoch": 76.00228571428572, - "grad_norm": 0.2736695408821106, + "grad_norm": 0.0002337087207706645, "learning_rate": 5.375661375661376e-06, - "loss": 0.0408, + "loss": 0.0, "step": 5420 }, { "epoch": 76.00323809523809, - "grad_norm": 0.0034376117400825024, + "grad_norm": 0.000298203231068328, "learning_rate": 5.365079365079365e-06, - "loss": 0.0038, + "loss": 0.0, "step": 5430 }, { "epoch": 76.00419047619047, - "grad_norm": 0.0063005550764501095, + "grad_norm": 0.000580488471314311, "learning_rate": 5.3544973544973545e-06, - "loss": 0.0003, + "loss": 0.0, "step": 5440 }, { "epoch": 76.00514285714286, - "grad_norm": 0.0818108320236206, + "grad_norm": 0.0010549610015004873, "learning_rate": 5.3439153439153445e-06, - "loss": 0.0005, + "loss": 0.0, "step": 5450 }, { "epoch": 76.00609523809524, - "grad_norm": 0.005165382754057646, + "grad_norm": 0.0011401353403925896, "learning_rate": 5.333333333333334e-06, - "loss": 0.0001, + "loss": 0.0, "step": 5460 }, { "epoch": 76.0067619047619, - "eval_accuracy": 0.7027027027027027, - "eval_loss": 2.158604860305786, - "eval_runtime": 48.1916, - "eval_samples_per_second": 1.536, - "eval_steps_per_second": 0.394, + "eval_accuracy": 0.6621621621621622, + "eval_loss": 2.850273847579956, + "eval_runtime": 16.7983, + "eval_samples_per_second": 4.405, + "eval_steps_per_second": 1.131, "step": 5467 }, { "epoch": 77.00028571428571, - "grad_norm": 0.002257216488942504, + "grad_norm": 0.0005172080709598958, "learning_rate": 5.322751322751323e-06, - "loss": 0.0002, + "loss": 0.0, "step": 5470 }, { "epoch": 77.0012380952381, - "grad_norm": 0.05800849199295044, + "grad_norm": 0.0014666365459561348, "learning_rate": 5.312169312169313e-06, - "loss": 0.0007, + "loss": 0.0, "step": 5480 }, { "epoch": 77.00219047619048, - "grad_norm": 0.003397268010303378, + "grad_norm": 0.0014088664902374148, "learning_rate": 5.301587301587302e-06, - "loss": 0.0001, + "loss": 0.0, "step": 5490 }, { "epoch": 77.00314285714286, - "grad_norm": 0.0016938684275373816, + "grad_norm": 0.0003868502099066973, "learning_rate": 5.291005291005291e-06, - "loss": 0.0001, + "loss": 0.0, "step": 5500 }, { "epoch": 77.00409523809523, - "grad_norm": 0.0020050883758813143, + "grad_norm": 0.001465518376789987, "learning_rate": 5.280423280423281e-06, - "loss": 0.0001, + "loss": 0.0, "step": 5510 }, { "epoch": 77.00504761904762, - "grad_norm": 0.0027238104958087206, + "grad_norm": 0.001903692027553916, "learning_rate": 5.26984126984127e-06, - "loss": 0.0001, + "loss": 0.0, "step": 5520 }, { "epoch": 77.006, - "grad_norm": 0.0022144822869449854, + "grad_norm": 0.0015759927919134498, "learning_rate": 5.259259259259259e-06, - "loss": 0.0001, + "loss": 0.0, "step": 5530 }, { "epoch": 77.0067619047619, - "eval_accuracy": 0.6756756756756757, - "eval_loss": 2.4877402782440186, - "eval_runtime": 13.4548, - "eval_samples_per_second": 5.5, - "eval_steps_per_second": 1.412, + "eval_accuracy": 0.6621621621621622, + "eval_loss": 2.88606858253479, + "eval_runtime": 21.6506, + "eval_samples_per_second": 3.418, + "eval_steps_per_second": 0.878, "step": 5538 }, { "epoch": 78.00019047619048, - "grad_norm": 0.5678148865699768, + "grad_norm": 0.0005823720712214708, "learning_rate": 5.2486772486772485e-06, - "loss": 0.0002, + "loss": 0.0, "step": 5540 }, { "epoch": 78.00114285714285, - "grad_norm": 0.0013379160081967711, + "grad_norm": 0.00030079399584792554, "learning_rate": 5.2380952380952384e-06, - "loss": 0.0001, + "loss": 0.0, "step": 5550 }, { "epoch": 78.00209523809524, - "grad_norm": 0.005288382992148399, + "grad_norm": 0.0007412757840938866, "learning_rate": 5.227513227513228e-06, - "loss": 0.0002, + "loss": 0.0, "step": 5560 }, { "epoch": 78.00304761904762, - "grad_norm": 0.0062101962976157665, + "grad_norm": 0.000413477944675833, "learning_rate": 5.216931216931217e-06, - "loss": 0.0593, + "loss": 0.0, "step": 5570 }, { "epoch": 78.004, - "grad_norm": 0.0038799692410975695, + "grad_norm": 0.000860493048094213, "learning_rate": 5.2063492063492076e-06, - "loss": 0.0002, + "loss": 0.0, "step": 5580 }, { "epoch": 78.00495238095237, - "grad_norm": 0.0038153226487338543, + "grad_norm": 0.0003023779718205333, "learning_rate": 5.195767195767196e-06, - "loss": 0.0001, + "loss": 0.0, "step": 5590 }, { "epoch": 78.00590476190476, - "grad_norm": 0.002323978580534458, + "grad_norm": 0.00046973678399808705, "learning_rate": 5.185185185185185e-06, - "loss": 0.0001, + "loss": 0.0, "step": 5600 }, { "epoch": 78.0067619047619, - "eval_accuracy": 0.7297297297297297, - "eval_loss": 2.1835763454437256, - "eval_runtime": 13.8993, - "eval_samples_per_second": 5.324, - "eval_steps_per_second": 1.367, + "eval_accuracy": 0.6891891891891891, + "eval_loss": 2.8524179458618164, + "eval_runtime": 23.4181, + "eval_samples_per_second": 3.16, + "eval_steps_per_second": 0.811, "step": 5609 }, { "epoch": 79.00009523809524, - "grad_norm": 0.0024446777533739805, + "grad_norm": 0.001756474724970758, "learning_rate": 5.174603174603176e-06, - "loss": 0.0001, + "loss": 0.0, "step": 5610 }, { "epoch": 79.00104761904763, - "grad_norm": 0.002996997442096472, + "grad_norm": 0.004148818086832762, "learning_rate": 5.164021164021165e-06, - "loss": 0.0001, + "loss": 0.0, "step": 5620 }, { "epoch": 79.002, - "grad_norm": 0.0018322393298149109, + "grad_norm": 0.0009733652113936841, "learning_rate": 5.153439153439154e-06, - "loss": 0.0001, + "loss": 0.0, "step": 5630 }, { "epoch": 79.00295238095238, - "grad_norm": 0.002044577617198229, + "grad_norm": 0.0004708434862550348, "learning_rate": 5.142857142857142e-06, - "loss": 0.0001, + "loss": 0.0, "step": 5640 }, { "epoch": 79.00390476190476, - "grad_norm": 0.001871368265710771, + "grad_norm": 0.0010074286255985498, "learning_rate": 5.132275132275133e-06, - "loss": 0.0001, + "loss": 0.0, "step": 5650 }, { "epoch": 79.00485714285715, - "grad_norm": 20.724700927734375, + "grad_norm": 0.0005253396811895072, "learning_rate": 5.121693121693122e-06, - "loss": 0.0036, + "loss": 0.0, "step": 5660 }, { "epoch": 79.00580952380952, - "grad_norm": 0.006275147665292025, + "grad_norm": 0.0005340786301530898, "learning_rate": 5.1111111111111115e-06, - "loss": 0.2375, + "loss": 0.0, "step": 5670 }, { "epoch": 79.0067619047619, - "grad_norm": 0.0016398575389757752, + "grad_norm": 0.0013357801362872124, "learning_rate": 5.1005291005291015e-06, - "loss": 0.0021, + "loss": 0.0, "step": 5680 }, { "epoch": 79.0067619047619, - "eval_accuracy": 0.6621621621621622, - "eval_loss": 2.669719696044922, - "eval_runtime": 13.7155, - "eval_samples_per_second": 5.395, - "eval_steps_per_second": 1.385, + "eval_accuracy": 0.6756756756756757, + "eval_loss": 2.7961583137512207, + "eval_runtime": 24.1533, + "eval_samples_per_second": 3.064, + "eval_steps_per_second": 0.787, "step": 5680 }, { "epoch": 80.00095238095238, - "grad_norm": 0.0015869191847741604, + "grad_norm": 0.0003452278324402869, "learning_rate": 5.089947089947091e-06, - "loss": 0.0366, + "loss": 0.1595, "step": 5690 }, { "epoch": 80.00190476190477, - "grad_norm": 0.001834389171563089, + "grad_norm": 0.0025528387632220984, "learning_rate": 5.07936507936508e-06, - "loss": 0.0001, + "loss": 0.0003, "step": 5700 }, { "epoch": 80.00285714285714, - "grad_norm": 0.5443345904350281, + "grad_norm": 0.5342845320701599, "learning_rate": 5.06878306878307e-06, - "loss": 0.0002, + "loss": 0.0001, "step": 5710 }, { "epoch": 80.00380952380952, - "grad_norm": 0.005123675335198641, + "grad_norm": 0.0007849848479963839, "learning_rate": 5.058201058201059e-06, - "loss": 0.0001, + "loss": 0.0, "step": 5720 }, { "epoch": 80.0047619047619, - "grad_norm": 0.004339130129665136, + "grad_norm": 0.0004529608122538775, "learning_rate": 5.047619047619048e-06, - "loss": 0.0001, + "loss": 0.0, "step": 5730 }, { "epoch": 80.00571428571429, - "grad_norm": 0.0023713463451713324, + "grad_norm": 0.00045702006900683045, "learning_rate": 5.037037037037037e-06, - "loss": 0.0065, + "loss": 0.0, "step": 5740 }, { "epoch": 80.00666666666666, - "grad_norm": 0.0019026033114641905, + "grad_norm": 0.0009125975775532424, "learning_rate": 5.026455026455027e-06, - "loss": 0.0001, + "loss": 0.0, "step": 5750 }, { "epoch": 80.0067619047619, - "eval_accuracy": 0.7432432432432432, - "eval_loss": 1.8824700117111206, - "eval_runtime": 13.7118, - "eval_samples_per_second": 5.397, - "eval_steps_per_second": 1.386, + "eval_accuracy": 0.6621621621621622, + "eval_loss": 2.86403751373291, + "eval_runtime": 27.5707, + "eval_samples_per_second": 2.684, + "eval_steps_per_second": 0.689, "step": 5751 }, { "epoch": 81.00085714285714, - "grad_norm": 0.01365320011973381, + "grad_norm": 0.0009199812775477767, "learning_rate": 5.015873015873016e-06, - "loss": 0.04, + "loss": 0.0, "step": 5760 }, { "epoch": 81.00180952380953, - "grad_norm": 0.0015253903111442924, + "grad_norm": 0.0012672754237428308, "learning_rate": 5.005291005291005e-06, - "loss": 0.0001, + "loss": 0.0, "step": 5770 }, { "epoch": 81.00276190476191, - "grad_norm": 0.35490384697914124, + "grad_norm": 0.0008347496041096747, "learning_rate": 4.9947089947089946e-06, - "loss": 0.0001, + "loss": 0.0, "step": 5780 }, { "epoch": 81.00371428571428, - "grad_norm": 0.0018017725087702274, + "grad_norm": 0.0005167628987692297, "learning_rate": 4.9841269841269845e-06, - "loss": 0.0001, + "loss": 0.0, "step": 5790 }, { "epoch": 81.00466666666667, - "grad_norm": 0.001546078477986157, + "grad_norm": 0.000547485426068306, "learning_rate": 4.973544973544974e-06, - "loss": 0.0463, + "loss": 0.0, "step": 5800 }, { "epoch": 81.00561904761905, - "grad_norm": 0.0065314993262290955, + "grad_norm": 0.0015901158330962062, "learning_rate": 4.962962962962964e-06, - "loss": 0.0473, + "loss": 0.0, "step": 5810 }, { "epoch": 81.00657142857143, - "grad_norm": 0.003981141373515129, + "grad_norm": 0.006001343484967947, "learning_rate": 4.952380952380953e-06, - "loss": 0.0004, + "loss": 0.0, "step": 5820 }, { "epoch": 81.0067619047619, - "eval_accuracy": 0.6891891891891891, - "eval_loss": 2.15901255607605, - "eval_runtime": 13.7831, - "eval_samples_per_second": 5.369, - "eval_steps_per_second": 1.379, + "eval_accuracy": 0.6756756756756757, + "eval_loss": 2.844578981399536, + "eval_runtime": 27.4966, + "eval_samples_per_second": 2.691, + "eval_steps_per_second": 0.691, "step": 5822 }, { "epoch": 82.0007619047619, - "grad_norm": 0.002247112337499857, + "grad_norm": 0.0003850508655887097, "learning_rate": 4.941798941798942e-06, - "loss": 0.0001, + "loss": 0.0, "step": 5830 }, { "epoch": 82.00171428571429, - "grad_norm": 0.16178584098815918, + "grad_norm": 0.0013694074004888535, "learning_rate": 4.931216931216932e-06, - "loss": 0.0769, + "loss": 0.0, "step": 5840 }, { "epoch": 82.00266666666667, - "grad_norm": 0.0024550934322178364, + "grad_norm": 0.0008942610002122819, "learning_rate": 4.920634920634921e-06, - "loss": 0.0001, + "loss": 0.0, "step": 5850 }, { "epoch": 82.00361904761905, - "grad_norm": 0.0017204447649419308, + "grad_norm": 0.0021071520168334246, "learning_rate": 4.91005291005291e-06, - "loss": 0.0001, + "loss": 0.0, "step": 5860 }, { "epoch": 82.00457142857142, - "grad_norm": 0.0012850721832364798, + "grad_norm": 0.0001672828511800617, "learning_rate": 4.8994708994709e-06, - "loss": 0.0161, + "loss": 0.0, "step": 5870 }, { "epoch": 82.00552380952381, - "grad_norm": 0.006347615737468004, + "grad_norm": 0.051589980721473694, "learning_rate": 4.888888888888889e-06, - "loss": 0.0001, + "loss": 0.0, "step": 5880 }, { "epoch": 82.00647619047619, - "grad_norm": 1.662007212638855, + "grad_norm": 0.008057937026023865, "learning_rate": 4.8783068783068785e-06, - "loss": 0.0003, + "loss": 0.0, "step": 5890 }, { "epoch": 82.0067619047619, - "eval_accuracy": 0.7567567567567568, - "eval_loss": 1.881416916847229, - "eval_runtime": 14.4626, - "eval_samples_per_second": 5.117, - "eval_steps_per_second": 1.314, + "eval_accuracy": 0.6891891891891891, + "eval_loss": 2.6401426792144775, + "eval_runtime": 25.1047, + "eval_samples_per_second": 2.948, + "eval_steps_per_second": 0.757, "step": 5893 }, { "epoch": 83.00066666666666, - "grad_norm": 0.0023455878254026175, + "grad_norm": 0.0004187853483017534, "learning_rate": 4.867724867724868e-06, - "loss": 0.0001, + "loss": 0.0, "step": 5900 }, { "epoch": 83.00161904761904, - "grad_norm": 0.0377412848174572, + "grad_norm": 0.0005931004998274148, "learning_rate": 4.857142857142858e-06, - "loss": 0.0003, + "loss": 0.0, "step": 5910 }, { "epoch": 83.00257142857143, - "grad_norm": 22.338958740234375, + "grad_norm": 0.0002631930401548743, "learning_rate": 4.846560846560847e-06, - "loss": 0.2291, + "loss": 0.0, "step": 5920 }, { "epoch": 83.00352380952381, - "grad_norm": 0.00845143012702465, + "grad_norm": 0.01038403995335102, "learning_rate": 4.835978835978836e-06, - "loss": 0.5281, + "loss": 0.0004, "step": 5930 }, { "epoch": 83.0044761904762, - "grad_norm": 69.15572357177734, + "grad_norm": 0.018158361315727234, "learning_rate": 4.825396825396826e-06, - "loss": 0.4115, + "loss": 0.0, "step": 5940 }, { "epoch": 83.00542857142857, - "grad_norm": 0.04171192646026611, + "grad_norm": 0.0009759682579897344, "learning_rate": 4.814814814814815e-06, - "loss": 0.0002, + "loss": 0.0032, "step": 5950 }, { "epoch": 83.00638095238095, - "grad_norm": 0.0028879311867058277, + "grad_norm": 0.0006013894453644753, "learning_rate": 4.804232804232805e-06, - "loss": 0.0118, + "loss": 0.0007, "step": 5960 }, { "epoch": 83.0067619047619, - "eval_accuracy": 0.7027027027027027, - "eval_loss": 1.847873568534851, - "eval_runtime": 14.1895, - "eval_samples_per_second": 5.215, - "eval_steps_per_second": 1.339, + "eval_accuracy": 0.7432432432432432, + "eval_loss": 2.398712635040283, + "eval_runtime": 25.0995, + "eval_samples_per_second": 2.948, + "eval_steps_per_second": 0.757, "step": 5964 }, { "epoch": 84.00057142857143, - "grad_norm": 0.1561601310968399, + "grad_norm": 0.0019183410331606865, "learning_rate": 4.793650793650794e-06, - "loss": 0.0001, + "loss": 0.0, "step": 5970 }, { "epoch": 84.0015238095238, - "grad_norm": 0.0037835307884961367, + "grad_norm": 0.000676738447509706, "learning_rate": 4.783068783068783e-06, - "loss": 0.1162, + "loss": 0.0, "step": 5980 }, { "epoch": 84.00247619047619, - "grad_norm": 4.43619441986084, + "grad_norm": 0.0023505811113864183, "learning_rate": 4.772486772486773e-06, - "loss": 0.0004, + "loss": 0.0, "step": 5990 }, { "epoch": 84.00342857142857, - "grad_norm": 0.001268463907763362, + "grad_norm": 0.0005468825693242252, "learning_rate": 4.761904761904762e-06, - "loss": 0.0001, + "loss": 0.0, "step": 6000 }, { "epoch": 84.00438095238096, - "grad_norm": 0.03871821612119675, + "grad_norm": 0.00032948973239399493, "learning_rate": 4.7513227513227515e-06, - "loss": 0.0001, + "loss": 0.0, "step": 6010 }, { "epoch": 84.00533333333334, - "grad_norm": 0.015075616538524628, + "grad_norm": 0.0016405474161729217, "learning_rate": 4.7407407407407415e-06, - "loss": 0.0001, + "loss": 0.0, "step": 6020 }, { "epoch": 84.00628571428571, - "grad_norm": 0.0014224612386897206, + "grad_norm": 0.00029347886447794735, "learning_rate": 4.730158730158731e-06, - "loss": 0.1773, + "loss": 0.0, "step": 6030 }, { "epoch": 84.0067619047619, - "eval_accuracy": 0.7297297297297297, - "eval_loss": 1.69830322265625, - "eval_runtime": 13.9192, - "eval_samples_per_second": 5.316, - "eval_steps_per_second": 1.365, + "eval_accuracy": 0.7162162162162162, + "eval_loss": 2.364161729812622, + "eval_runtime": 26.3423, + "eval_samples_per_second": 2.809, + "eval_steps_per_second": 0.721, "step": 6035 }, { "epoch": 85.00047619047619, - "grad_norm": 0.0016157248755916953, + "grad_norm": 0.00021160613687243313, "learning_rate": 4.71957671957672e-06, - "loss": 0.0002, + "loss": 0.0, "step": 6040 }, { "epoch": 85.00142857142858, - "grad_norm": 13.222190856933594, + "grad_norm": 0.0007972092716954648, "learning_rate": 4.708994708994709e-06, - "loss": 0.001, + "loss": 0.0, "step": 6050 }, { "epoch": 85.00238095238095, - "grad_norm": 0.04672477766871452, + "grad_norm": 0.0047329687513411045, "learning_rate": 4.698412698412699e-06, "loss": 0.0002, "step": 6060 }, { "epoch": 85.00333333333333, - "grad_norm": 0.007947994396090508, + "grad_norm": 574.7470092773438, "learning_rate": 4.687830687830688e-06, - "loss": 0.0376, + "loss": 0.1462, "step": 6070 }, { "epoch": 85.00428571428571, - "grad_norm": 0.001851134467869997, + "grad_norm": 0.0005663606571033597, "learning_rate": 4.677248677248677e-06, - "loss": 0.0001, + "loss": 0.0, "step": 6080 }, { "epoch": 85.0052380952381, - "grad_norm": 0.0015396237140521407, + "grad_norm": 0.0019831659737974405, "learning_rate": 4.666666666666667e-06, - "loss": 0.0068, + "loss": 0.1749, "step": 6090 }, { "epoch": 85.00619047619048, - "grad_norm": 0.005970226135104895, + "grad_norm": 0.0003156385209877044, "learning_rate": 4.656084656084656e-06, - "loss": 0.0025, + "loss": 0.0, "step": 6100 }, { "epoch": 85.0067619047619, - "eval_accuracy": 0.6351351351351351, - "eval_loss": 2.550222396850586, - "eval_runtime": 14.6373, - "eval_samples_per_second": 5.056, - "eval_steps_per_second": 1.298, + "eval_accuracy": 0.6756756756756757, + "eval_loss": 2.470996856689453, + "eval_runtime": 26.7258, + "eval_samples_per_second": 2.769, + "eval_steps_per_second": 0.711, "step": 6106 }, { "epoch": 86.00038095238095, - "grad_norm": 76.60648345947266, + "grad_norm": 0.001011149724945426, "learning_rate": 4.6455026455026454e-06, - "loss": 0.2234, + "loss": 0.1451, "step": 6110 }, { "epoch": 86.00133333333333, - "grad_norm": 0.0012446778127923608, + "grad_norm": 0.00027317553758621216, "learning_rate": 4.634920634920635e-06, "loss": 0.0001, "step": 6120 }, { "epoch": 86.00228571428572, - "grad_norm": 0.003662912407889962, + "grad_norm": 0.0022837144788354635, "learning_rate": 4.6243386243386246e-06, - "loss": 0.0001, + "loss": 0.0, "step": 6130 }, { "epoch": 86.00323809523809, - "grad_norm": 0.002262139692902565, + "grad_norm": 0.0005552778020501137, "learning_rate": 4.6137566137566145e-06, - "loss": 0.0001, + "loss": 0.0028, "step": 6140 }, { "epoch": 86.00419047619047, - "grad_norm": 0.0015796871157363057, + "grad_norm": 0.00046171335270628333, "learning_rate": 4.603174603174604e-06, - "loss": 0.0001, + "loss": 0.0, "step": 6150 }, { "epoch": 86.00514285714286, - "grad_norm": 0.004956441931426525, + "grad_norm": 0.00023459379735868424, "learning_rate": 4.592592592592593e-06, - "loss": 0.0001, + "loss": 0.0, "step": 6160 }, { "epoch": 86.00609523809524, - "grad_norm": 0.002679603872820735, + "grad_norm": 0.00025353021919727325, "learning_rate": 4.582010582010583e-06, - "loss": 0.0001, + "loss": 0.0004, "step": 6170 }, { "epoch": 86.0067619047619, - "eval_accuracy": 0.7027027027027027, - "eval_loss": 2.244560718536377, - "eval_runtime": 14.2249, - "eval_samples_per_second": 5.202, - "eval_steps_per_second": 1.336, + "eval_accuracy": 0.6486486486486487, + "eval_loss": 3.032348155975342, + "eval_runtime": 22.8767, + "eval_samples_per_second": 3.235, + "eval_steps_per_second": 0.831, "step": 6177 }, { "epoch": 87.00028571428571, - "grad_norm": 0.0019916188903152943, + "grad_norm": 0.0003504717315081507, "learning_rate": 4.571428571428572e-06, - "loss": 0.0002, + "loss": 0.0, "step": 6180 }, { "epoch": 87.0012380952381, - "grad_norm": 0.037145279347896576, + "grad_norm": 0.0009244528482668102, "learning_rate": 4.560846560846561e-06, - "loss": 0.0004, + "loss": 0.0, "step": 6190 }, { "epoch": 87.00219047619048, - "grad_norm": 246.57989501953125, + "grad_norm": 0.0005004839040338993, "learning_rate": 4.55026455026455e-06, - "loss": 0.0278, + "loss": 0.0, "step": 6200 }, { "epoch": 87.00314285714286, - "grad_norm": 0.0268414206802845, + "grad_norm": 0.0009528248338028789, "learning_rate": 4.53968253968254e-06, - "loss": 0.0001, + "loss": 0.0, "step": 6210 }, { "epoch": 87.00409523809523, - "grad_norm": 0.0020942252594977617, + "grad_norm": 0.0006345040746964514, "learning_rate": 4.529100529100529e-06, - "loss": 0.1572, + "loss": 0.0, "step": 6220 }, { "epoch": 87.00504761904762, - "grad_norm": 0.00141229503788054, + "grad_norm": 0.00041144120041280985, "learning_rate": 4.5185185185185185e-06, - "loss": 0.0001, + "loss": 0.0, "step": 6230 }, { "epoch": 87.006, - "grad_norm": 0.0021090495865792036, + "grad_norm": 0.00040222075767815113, "learning_rate": 4.5079365079365085e-06, - "loss": 0.0001, + "loss": 0.0, "step": 6240 }, { "epoch": 87.0067619047619, - "eval_accuracy": 0.7162162162162162, - "eval_loss": 2.094975233078003, - "eval_runtime": 16.0201, - "eval_samples_per_second": 4.619, - "eval_steps_per_second": 1.186, + "eval_accuracy": 0.6351351351351351, + "eval_loss": 3.0862441062927246, + "eval_runtime": 23.3262, + "eval_samples_per_second": 3.172, + "eval_steps_per_second": 0.815, "step": 6248 }, { "epoch": 88.00019047619048, - "grad_norm": 0.001960631227120757, + "grad_norm": 0.000379973032977432, "learning_rate": 4.497354497354498e-06, - "loss": 0.0001, + "loss": 0.0, "step": 6250 }, { "epoch": 88.00114285714285, - "grad_norm": 0.0014744813088327646, + "grad_norm": 0.0004291079530958086, "learning_rate": 4.486772486772487e-06, - "loss": 0.0001, + "loss": 0.0, "step": 6260 }, { "epoch": 88.00209523809524, - "grad_norm": 0.003915382549166679, + "grad_norm": 0.0002368905406910926, "learning_rate": 4.476190476190477e-06, "loss": 0.0001, "step": 6270 }, { "epoch": 88.00304761904762, - "grad_norm": 0.013609534129500389, + "grad_norm": 0.0003694745246320963, "learning_rate": 4.465608465608466e-06, - "loss": 0.0001, + "loss": 0.2122, "step": 6280 }, { "epoch": 88.004, - "grad_norm": 0.0028364358004182577, + "grad_norm": 0.00040288950549438596, "learning_rate": 4.455026455026456e-06, - "loss": 0.0001, + "loss": 0.0, "step": 6290 }, { "epoch": 88.00495238095237, - "grad_norm": 0.001382499816827476, + "grad_norm": 0.0005140057182870805, "learning_rate": 4.444444444444444e-06, - "loss": 0.0001, + "loss": 0.0333, "step": 6300 }, { "epoch": 88.00590476190476, - "grad_norm": 0.0018119841115549207, + "grad_norm": 0.0006567566306330264, "learning_rate": 4.433862433862434e-06, - "loss": 0.0001, + "loss": 0.2299, "step": 6310 }, { "epoch": 88.0067619047619, - "eval_accuracy": 0.7162162162162162, - "eval_loss": 2.2133569717407227, - "eval_runtime": 14.0391, - "eval_samples_per_second": 5.271, - "eval_steps_per_second": 1.353, + "eval_accuracy": 0.7702702702702703, + "eval_loss": 2.028332233428955, + "eval_runtime": 22.7378, + "eval_samples_per_second": 3.254, + "eval_steps_per_second": 0.836, "step": 6319 }, { "epoch": 89.00009523809524, - "grad_norm": 0.0020059419330209494, + "grad_norm": 0.0009168223477900028, "learning_rate": 4.423280423280424e-06, - "loss": 0.0001, + "loss": 0.0002, "step": 6320 }, { "epoch": 89.00104761904763, - "grad_norm": 0.007441331632435322, + "grad_norm": 0.002936070552095771, "learning_rate": 4.412698412698413e-06, - "loss": 0.0001, + "loss": 0.0403, "step": 6330 }, { "epoch": 89.002, - "grad_norm": 0.00201158388517797, + "grad_norm": 0.000776287168264389, "learning_rate": 4.402116402116402e-06, - "loss": 0.0005, + "loss": 0.0, "step": 6340 }, { "epoch": 89.00295238095238, - "grad_norm": 0.00130746653303504, + "grad_norm": 0.0005983594455756247, "learning_rate": 4.3915343915343915e-06, - "loss": 0.0027, + "loss": 0.0, "step": 6350 }, { "epoch": 89.00390476190476, - "grad_norm": 0.0013548173010349274, + "grad_norm": 0.0002185263583669439, "learning_rate": 4.3809523809523815e-06, - "loss": 0.0001, + "loss": 0.0, "step": 6360 }, { "epoch": 89.00485714285715, - "grad_norm": 0.0010751527734100819, + "grad_norm": 0.00029487276333384216, "learning_rate": 4.370370370370371e-06, - "loss": 0.0002, + "loss": 0.0, "step": 6370 }, { "epoch": 89.00580952380952, - "grad_norm": 0.017397599294781685, + "grad_norm": 0.000874596182256937, "learning_rate": 4.35978835978836e-06, - "loss": 0.0001, + "loss": 0.0, "step": 6380 }, { "epoch": 89.0067619047619, - "grad_norm": 0.0018166283844038844, + "grad_norm": 0.0024972488172352314, "learning_rate": 4.34920634920635e-06, - "loss": 0.0001, + "loss": 0.0, "step": 6390 }, { "epoch": 89.0067619047619, - "eval_accuracy": 0.7432432432432432, - "eval_loss": 1.957572102546692, - "eval_runtime": 13.8712, - "eval_samples_per_second": 5.335, - "eval_steps_per_second": 1.37, + "eval_accuracy": 0.6891891891891891, + "eval_loss": 2.3751778602600098, + "eval_runtime": 23.3451, + "eval_samples_per_second": 3.17, + "eval_steps_per_second": 0.814, "step": 6390 }, { "epoch": 90.00095238095238, - "grad_norm": 0.0026160378474742174, + "grad_norm": 329.7825927734375, "learning_rate": 4.338624338624339e-06, - "loss": 0.0147, + "loss": 0.428, "step": 6400 }, { "epoch": 90.00190476190477, - "grad_norm": 0.001736226724460721, + "grad_norm": 0.00045626627979800105, "learning_rate": 4.328042328042328e-06, - "loss": 0.0001, + "loss": 0.1539, "step": 6410 }, { "epoch": 90.00285714285714, - "grad_norm": 0.0011704101925715804, + "grad_norm": 9.426961898803711, "learning_rate": 4.317460317460318e-06, - "loss": 0.0001, + "loss": 0.0005, "step": 6420 }, { "epoch": 90.00380952380952, - "grad_norm": 0.0027360101230442524, + "grad_norm": 0.0024752768222242594, "learning_rate": 4.306878306878307e-06, - "loss": 0.0002, + "loss": 0.0, "step": 6430 }, { "epoch": 90.0047619047619, - "grad_norm": 0.0017641916638240218, + "grad_norm": 0.0005259969038888812, "learning_rate": 4.296296296296296e-06, - "loss": 0.0001, + "loss": 0.0, "step": 6440 }, { "epoch": 90.00571428571429, - "grad_norm": 0.004456514958292246, + "grad_norm": 0.0007702509174123406, "learning_rate": 4.2857142857142855e-06, - "loss": 0.0693, + "loss": 0.0, "step": 6450 }, { "epoch": 90.00666666666666, - "grad_norm": 0.0016106553375720978, + "grad_norm": 0.0006307198782451451, "learning_rate": 4.2751322751322754e-06, - "loss": 0.0001, + "loss": 0.1842, "step": 6460 }, { "epoch": 90.0067619047619, - "eval_accuracy": 0.7027027027027027, - "eval_loss": 2.042990207672119, - "eval_runtime": 12.0238, - "eval_samples_per_second": 6.154, - "eval_steps_per_second": 1.58, + "eval_accuracy": 0.7567567567567568, + "eval_loss": 2.2107057571411133, + "eval_runtime": 25.9576, + "eval_samples_per_second": 2.851, + "eval_steps_per_second": 0.732, "step": 6461 }, { "epoch": 91.00085714285714, - "grad_norm": 0.000978887197561562, + "grad_norm": 0.0012892925878986716, "learning_rate": 4.2645502645502654e-06, - "loss": 0.0056, + "loss": 0.1752, "step": 6470 }, { "epoch": 91.00180952380953, - "grad_norm": 0.001994654070585966, + "grad_norm": 0.0017052206676453352, "learning_rate": 4.2539682539682546e-06, - "loss": 0.0002, + "loss": 0.0, "step": 6480 }, { "epoch": 91.00276190476191, - "grad_norm": 0.0011462611146271229, + "grad_norm": 0.0003308649465907365, "learning_rate": 4.243386243386244e-06, "loss": 0.0001, "step": 6490 }, { "epoch": 91.00371428571428, - "grad_norm": 0.0010819780873134732, + "grad_norm": 0.0016652209451422095, "learning_rate": 4.232804232804233e-06, - "loss": 0.0295, + "loss": 0.0, "step": 6500 }, { "epoch": 91.00466666666667, - "grad_norm": 0.006473600398749113, + "grad_norm": 0.012959081679582596, "learning_rate": 4.222222222222223e-06, - "loss": 0.0001, + "loss": 0.0, "step": 6510 }, { "epoch": 91.00561904761905, - "grad_norm": 0.0015840963460505009, + "grad_norm": 0.001314148772507906, "learning_rate": 4.211640211640212e-06, - "loss": 0.1886, + "loss": 0.0, "step": 6520 }, { "epoch": 91.00657142857143, - "grad_norm": 0.0018584438366815448, + "grad_norm": 0.008198284544050694, "learning_rate": 4.201058201058201e-06, - "loss": 0.0001, + "loss": 0.0002, "step": 6530 }, { "epoch": 91.0067619047619, - "eval_accuracy": 0.7297297297297297, - "eval_loss": 2.131889581680298, - "eval_runtime": 13.9379, - "eval_samples_per_second": 5.309, - "eval_steps_per_second": 1.363, + "eval_accuracy": 0.6621621621621622, + "eval_loss": 3.1361114978790283, + "eval_runtime": 23.34, + "eval_samples_per_second": 3.171, + "eval_steps_per_second": 0.814, "step": 6532 }, { "epoch": 92.0007619047619, - "grad_norm": 0.0025043929927051067, + "grad_norm": 0.005125410854816437, "learning_rate": 4.190476190476191e-06, - "loss": 0.0001, + "loss": 0.0004, "step": 6540 }, { "epoch": 92.00171428571429, - "grad_norm": 0.0010687155881896615, + "grad_norm": 0.0006265908596105874, "learning_rate": 4.17989417989418e-06, - "loss": 0.0004, + "loss": 0.0, "step": 6550 }, { "epoch": 92.00266666666667, - "grad_norm": 0.0010446676751598716, + "grad_norm": 0.0026438962668180466, "learning_rate": 4.169312169312169e-06, - "loss": 0.0001, + "loss": 0.0599, "step": 6560 }, { "epoch": 92.00361904761905, - "grad_norm": 0.002059612423181534, + "grad_norm": 0.0007586319698020816, "learning_rate": 4.158730158730159e-06, - "loss": 0.0064, + "loss": 0.0, "step": 6570 }, { "epoch": 92.00457142857142, - "grad_norm": 0.004754054360091686, + "grad_norm": 0.0017233211547136307, "learning_rate": 4.1481481481481485e-06, - "loss": 0.0949, + "loss": 0.0, "step": 6580 }, { "epoch": 92.00552380952381, - "grad_norm": 0.0029344987124204636, + "grad_norm": 0.0015403326833620667, "learning_rate": 4.137566137566138e-06, - "loss": 0.0001, + "loss": 0.0, "step": 6590 }, { "epoch": 92.00647619047619, - "grad_norm": 0.002169994870200753, + "grad_norm": 0.013408493250608444, "learning_rate": 4.126984126984127e-06, - "loss": 0.0034, + "loss": 0.0, "step": 6600 }, { "epoch": 92.0067619047619, - "eval_accuracy": 0.6891891891891891, - "eval_loss": 2.4718101024627686, - "eval_runtime": 13.131, - "eval_samples_per_second": 5.636, - "eval_steps_per_second": 1.447, + "eval_accuracy": 0.7027027027027027, + "eval_loss": 2.7366185188293457, + "eval_runtime": 22.9942, + "eval_samples_per_second": 3.218, + "eval_steps_per_second": 0.826, "step": 6603 }, { "epoch": 93.00066666666666, - "grad_norm": 127.84756469726562, + "grad_norm": 0.0003232085146009922, "learning_rate": 4.116402116402117e-06, - "loss": 0.5065, + "loss": 0.0, "step": 6610 }, { "epoch": 93.00161904761904, - "grad_norm": 0.0012106394860893488, + "grad_norm": 0.043059222400188446, "learning_rate": 4.105820105820107e-06, - "loss": 0.1087, + "loss": 0.0, "step": 6620 }, { "epoch": 93.00257142857143, - "grad_norm": 439.0475769042969, + "grad_norm": 0.00047642309800721705, "learning_rate": 4.095238095238096e-06, - "loss": 0.3879, + "loss": 0.0, "step": 6630 }, { "epoch": 93.00352380952381, - "grad_norm": 526.8871459960938, + "grad_norm": 0.0005419534863904119, "learning_rate": 4.084656084656085e-06, - "loss": 0.2497, + "loss": 0.0016, "step": 6640 }, { "epoch": 93.0044761904762, - "grad_norm": 0.006663051433861256, + "grad_norm": 0.000353945535607636, "learning_rate": 4.074074074074074e-06, - "loss": 0.1348, + "loss": 0.0, "step": 6650 }, { "epoch": 93.00542857142857, - "grad_norm": 0.02263970859348774, + "grad_norm": 0.00030686677200719714, "learning_rate": 4.063492063492064e-06, - "loss": 0.0001, + "loss": 0.0003, "step": 6660 }, { "epoch": 93.00638095238095, - "grad_norm": 0.0017573466757312417, + "grad_norm": 0.00041551675531081855, "learning_rate": 4.052910052910053e-06, - "loss": 0.0001, + "loss": 0.0, "step": 6670 }, { "epoch": 93.0067619047619, "eval_accuracy": 0.6891891891891891, - "eval_loss": 2.5268473625183105, - "eval_runtime": 14.4145, - "eval_samples_per_second": 5.134, - "eval_steps_per_second": 1.318, + "eval_loss": 2.6849799156188965, + "eval_runtime": 23.033, + "eval_samples_per_second": 3.213, + "eval_steps_per_second": 0.825, "step": 6674 }, { "epoch": 94.00057142857143, - "grad_norm": 0.019557686522603035, + "grad_norm": 0.0002515302912797779, "learning_rate": 4.042328042328042e-06, - "loss": 0.0002, + "loss": 0.0, "step": 6680 }, { "epoch": 94.0015238095238, - "grad_norm": 0.002184673910960555, + "grad_norm": 0.0007501939544454217, "learning_rate": 4.031746031746032e-06, - "loss": 0.0001, + "loss": 0.0, "step": 6690 }, { "epoch": 94.00247619047619, - "grad_norm": 0.0016807018546387553, + "grad_norm": 0.00030370696913450956, "learning_rate": 4.0211640211640215e-06, - "loss": 0.0003, + "loss": 0.0, "step": 6700 }, { "epoch": 94.00342857142857, - "grad_norm": 0.0013105234829708934, + "grad_norm": 0.010980455204844475, "learning_rate": 4.010582010582011e-06, - "loss": 0.0001, + "loss": 0.0, "step": 6710 }, { "epoch": 94.00438095238096, - "grad_norm": 0.0016931260470300913, + "grad_norm": 0.0007709608762525022, "learning_rate": 4.000000000000001e-06, - "loss": 0.0001, + "loss": 0.0, "step": 6720 }, { "epoch": 94.00533333333334, - "grad_norm": 0.001956460066139698, + "grad_norm": 0.0003361174603924155, "learning_rate": 3.98941798941799e-06, - "loss": 0.0195, + "loss": 0.0, "step": 6730 }, { "epoch": 94.00628571428571, - "grad_norm": 0.0012113023549318314, + "grad_norm": 0.00025106294197030365, "learning_rate": 3.978835978835979e-06, - "loss": 0.0001, + "loss": 0.0, "step": 6740 }, { "epoch": 94.0067619047619, - "eval_accuracy": 0.7027027027027027, - "eval_loss": 2.421144962310791, - "eval_runtime": 9.1901, - "eval_samples_per_second": 8.052, - "eval_steps_per_second": 2.067, + "eval_accuracy": 0.6891891891891891, + "eval_loss": 2.696514129638672, + "eval_runtime": 21.7843, + "eval_samples_per_second": 3.397, + "eval_steps_per_second": 0.872, "step": 6745 }, { "epoch": 95.00047619047619, - "grad_norm": 0.00762981129810214, + "grad_norm": 0.0017140108393505216, "learning_rate": 3.968253968253968e-06, - "loss": 0.0457, + "loss": 0.0, "step": 6750 }, { "epoch": 95.00142857142858, - "grad_norm": 0.0016311467625200748, + "grad_norm": 0.0003946751821786165, "learning_rate": 3.957671957671958e-06, - "loss": 0.0001, + "loss": 0.0, "step": 6760 }, { "epoch": 95.00238095238095, - "grad_norm": 0.0014087501913309097, + "grad_norm": 0.0016369846416637301, "learning_rate": 3.947089947089948e-06, - "loss": 0.0001, + "loss": 0.0582, "step": 6770 }, { "epoch": 95.00333333333333, - "grad_norm": 0.002407238818705082, + "grad_norm": 0.0007478753686882555, "learning_rate": 3.936507936507936e-06, - "loss": 0.0265, + "loss": 0.0, "step": 6780 }, { "epoch": 95.00428571428571, - "grad_norm": 0.0009130392572842538, + "grad_norm": 0.00047938968054950237, "learning_rate": 3.925925925925926e-06, - "loss": 0.0001, + "loss": 0.0028, "step": 6790 }, { "epoch": 95.0052380952381, - "grad_norm": 0.02090597338974476, + "grad_norm": 0.00035130296600982547, "learning_rate": 3.9153439153439155e-06, - "loss": 0.0071, + "loss": 0.1409, "step": 6800 }, { "epoch": 95.00619047619048, - "grad_norm": 0.0009239759529009461, + "grad_norm": 0.0006514904671348631, "learning_rate": 3.9047619047619055e-06, - "loss": 0.0001, + "loss": 0.1894, "step": 6810 }, { "epoch": 95.0067619047619, - "eval_accuracy": 0.6891891891891891, - "eval_loss": 2.397068977355957, - "eval_runtime": 8.9045, - "eval_samples_per_second": 8.31, - "eval_steps_per_second": 2.134, + "eval_accuracy": 0.7162162162162162, + "eval_loss": 2.507014036178589, + "eval_runtime": 19.8307, + "eval_samples_per_second": 3.732, + "eval_steps_per_second": 0.958, "step": 6816 }, { "epoch": 96.00038095238095, - "grad_norm": 0.0013253205688670278, + "grad_norm": 0.001034559914842248, "learning_rate": 3.894179894179895e-06, - "loss": 0.0001, + "loss": 0.0, "step": 6820 }, { "epoch": 96.00133333333333, - "grad_norm": 0.0016497639007866383, + "grad_norm": 0.003884747624397278, "learning_rate": 3.883597883597884e-06, - "loss": 0.0001, + "loss": 0.0, "step": 6830 }, { "epoch": 96.00228571428572, - "grad_norm": 0.001459629973396659, + "grad_norm": 0.00044443883234634995, "learning_rate": 3.873015873015874e-06, - "loss": 0.0001, + "loss": 0.0, "step": 6840 }, { "epoch": 96.00323809523809, - "grad_norm": 0.004556193016469479, + "grad_norm": 0.03588930517435074, "learning_rate": 3.862433862433863e-06, - "loss": 0.0001, + "loss": 0.0, "step": 6850 }, { "epoch": 96.00419047619047, - "grad_norm": 0.0008555215317755938, + "grad_norm": 0.000525305571500212, "learning_rate": 3.851851851851852e-06, - "loss": 0.0004, + "loss": 0.0, "step": 6860 }, { "epoch": 96.00514285714286, - "grad_norm": 0.0014725595247000456, + "grad_norm": 0.00042602582834661007, "learning_rate": 3.841269841269842e-06, - "loss": 0.0003, + "loss": 0.0, "step": 6870 }, { "epoch": 96.00609523809524, - "grad_norm": 0.010796592570841312, + "grad_norm": 0.0003282624820712954, "learning_rate": 3.830687830687831e-06, - "loss": 0.1517, + "loss": 0.0, "step": 6880 }, { "epoch": 96.0067619047619, - "eval_accuracy": 0.7297297297297297, - "eval_loss": 2.2035391330718994, - "eval_runtime": 9.6362, - "eval_samples_per_second": 7.679, - "eval_steps_per_second": 1.972, + "eval_accuracy": 0.7162162162162162, + "eval_loss": 2.5326974391937256, + "eval_runtime": 20.8641, + "eval_samples_per_second": 3.547, + "eval_steps_per_second": 0.911, "step": 6887 }, { "epoch": 97.00028571428571, - "grad_norm": 0.0025478184688836336, + "grad_norm": 0.0016416395083069801, "learning_rate": 3.82010582010582e-06, - "loss": 0.1666, + "loss": 0.0001, "step": 6890 }, { "epoch": 97.0012380952381, - "grad_norm": 0.011283445172011852, + "grad_norm": 0.0002830391167663038, "learning_rate": 3.80952380952381e-06, - "loss": 0.0001, + "loss": 0.0, "step": 6900 }, { "epoch": 97.00219047619048, - "grad_norm": 0.001496862736530602, + "grad_norm": 0.0003058542206417769, "learning_rate": 3.7989417989417994e-06, - "loss": 0.053, + "loss": 0.1022, "step": 6910 }, { "epoch": 97.00314285714286, - "grad_norm": 0.0027492253575474024, + "grad_norm": 0.0010565044358372688, "learning_rate": 3.788359788359789e-06, - "loss": 0.0001, + "loss": 0.0, "step": 6920 }, { "epoch": 97.00409523809523, - "grad_norm": 0.0020821818616241217, + "grad_norm": 0.0004148608713876456, "learning_rate": 3.777777777777778e-06, - "loss": 0.2228, + "loss": 0.0, "step": 6930 }, { "epoch": 97.00504761904762, - "grad_norm": 0.0010064197704195976, + "grad_norm": 0.0002654260606504977, "learning_rate": 3.7671957671957676e-06, - "loss": 0.1754, + "loss": 0.0002, "step": 6940 }, { "epoch": 97.006, - "grad_norm": 0.0010910567361861467, + "grad_norm": 0.00963718444108963, "learning_rate": 3.7566137566137568e-06, - "loss": 0.0001, + "loss": 0.0, "step": 6950 }, { "epoch": 97.0067619047619, - "eval_accuracy": 0.6756756756756757, - "eval_loss": 2.375839948654175, - "eval_runtime": 9.3053, - "eval_samples_per_second": 7.952, - "eval_steps_per_second": 2.042, + "eval_accuracy": 0.6891891891891891, + "eval_loss": 2.884532928466797, + "eval_runtime": 20.1819, + "eval_samples_per_second": 3.667, + "eval_steps_per_second": 0.941, "step": 6958 }, { "epoch": 98.00019047619048, - "grad_norm": 0.0020339705515652895, + "grad_norm": 0.0004373944248072803, "learning_rate": 3.7460317460317463e-06, - "loss": 0.0001, + "loss": 0.0, "step": 6960 }, { "epoch": 98.00114285714285, - "grad_norm": 0.0017805943498387933, + "grad_norm": 0.0008808135171420872, "learning_rate": 3.735449735449736e-06, - "loss": 0.0001, + "loss": 0.0, "step": 6970 }, { "epoch": 98.00209523809524, - "grad_norm": 0.0009461052832193673, + "grad_norm": 0.004815933760255575, "learning_rate": 3.724867724867725e-06, "loss": 0.0, "step": 6980 }, { "epoch": 98.00304761904762, - "grad_norm": 0.001265208818949759, + "grad_norm": 0.00041855985182337463, "learning_rate": 3.7142857142857146e-06, - "loss": 0.0001, + "loss": 0.0861, "step": 6990 }, { "epoch": 98.004, - "grad_norm": 0.06940359622240067, + "grad_norm": 0.00028756665415130556, "learning_rate": 3.7037037037037037e-06, - "loss": 0.0001, + "loss": 0.0, "step": 7000 }, { "epoch": 98.00495238095237, - "grad_norm": 0.0012590874684974551, + "grad_norm": 0.0002812529855873436, "learning_rate": 3.6931216931216933e-06, - "loss": 0.0001, + "loss": 0.0046, "step": 7010 }, { "epoch": 98.00590476190476, - "grad_norm": 0.005680494476109743, + "grad_norm": 0.0004412989073898643, "learning_rate": 3.6825396825396833e-06, - "loss": 0.0001, + "loss": 0.0, "step": 7020 }, { "epoch": 98.0067619047619, - "eval_accuracy": 0.7162162162162162, - "eval_loss": 2.2253215312957764, - "eval_runtime": 14.7717, - "eval_samples_per_second": 5.01, - "eval_steps_per_second": 1.286, + "eval_accuracy": 0.7837837837837838, + "eval_loss": 2.003041982650757, + "eval_runtime": 19.5773, + "eval_samples_per_second": 3.78, + "eval_steps_per_second": 0.971, "step": 7029 }, { "epoch": 99.00009523809524, - "grad_norm": 0.0013683551223948598, + "grad_norm": 0.0005032554036006331, "learning_rate": 3.671957671957672e-06, - "loss": 0.0001, + "loss": 0.0, "step": 7030 }, { "epoch": 99.00104761904763, - "grad_norm": 0.0009497467544861138, + "grad_norm": 368.5629577636719, "learning_rate": 3.661375661375662e-06, - "loss": 0.0001, + "loss": 0.0311, "step": 7040 }, { "epoch": 99.002, - "grad_norm": 0.005689162760972977, + "grad_norm": 0.0009055473492480814, "learning_rate": 3.6507936507936507e-06, - "loss": 0.0001, + "loss": 0.0, "step": 7050 }, { "epoch": 99.00295238095238, - "grad_norm": 0.0033088515046983957, + "grad_norm": 0.005899870302528143, "learning_rate": 3.6402116402116407e-06, - "loss": 0.0001, + "loss": 0.0, "step": 7060 }, { "epoch": 99.00390476190476, - "grad_norm": 0.0208294577896595, + "grad_norm": 0.0005574446404352784, "learning_rate": 3.6296296296296302e-06, - "loss": 0.0001, + "loss": 0.0, "step": 7070 }, { "epoch": 99.00485714285715, - "grad_norm": 0.002364435698837042, + "grad_norm": 0.0003847317711915821, "learning_rate": 3.6190476190476194e-06, - "loss": 0.0001, + "loss": 0.0006, "step": 7080 }, { "epoch": 99.00580952380952, - "grad_norm": 0.0021444286685436964, + "grad_norm": 0.0002375604526605457, "learning_rate": 3.608465608465609e-06, - "loss": 0.0001, + "loss": 0.0, "step": 7090 }, { "epoch": 99.0067619047619, - "grad_norm": 0.015663959085941315, + "grad_norm": 0.001055917702615261, "learning_rate": 3.597883597883598e-06, - "loss": 0.0001, + "loss": 0.1439, "step": 7100 }, { "epoch": 99.0067619047619, - "eval_accuracy": 0.7162162162162162, - "eval_loss": 2.322640895843506, - "eval_runtime": 14.2268, - "eval_samples_per_second": 5.201, - "eval_steps_per_second": 1.336, + "eval_accuracy": 0.6891891891891891, + "eval_loss": 2.7892189025878906, + "eval_runtime": 20.1076, + "eval_samples_per_second": 3.68, + "eval_steps_per_second": 0.945, "step": 7100 }, { "epoch": 100.00095238095238, - "grad_norm": 0.002384971594437957, + "grad_norm": 0.0020569399930536747, "learning_rate": 3.5873015873015877e-06, "loss": 0.0, "step": 7110 }, { "epoch": 100.00190476190477, - "grad_norm": 0.0013530774740502238, + "grad_norm": 0.0007545605767518282, "learning_rate": 3.5767195767195772e-06, - "loss": 0.0001, + "loss": 0.0297, "step": 7120 }, { "epoch": 100.00285714285714, - "grad_norm": 0.0013220744440332055, + "grad_norm": 0.0006217532209120691, "learning_rate": 3.5661375661375664e-06, - "loss": 0.0001, + "loss": 0.0, "step": 7130 }, { "epoch": 100.00380952380952, - "grad_norm": 0.0012067657662555575, + "grad_norm": 0.00022461486514657736, "learning_rate": 3.555555555555556e-06, - "loss": 0.0001, + "loss": 0.0, "step": 7140 }, { "epoch": 100.0047619047619, - "grad_norm": 0.0012088253861293197, + "grad_norm": 0.0004957873024977744, "learning_rate": 3.544973544973545e-06, - "loss": 0.0001, + "loss": 0.0, "step": 7150 }, { "epoch": 100.00571428571429, - "grad_norm": 0.24812595546245575, + "grad_norm": 0.0002601814630907029, "learning_rate": 3.5343915343915346e-06, - "loss": 0.0001, + "loss": 0.0, "step": 7160 }, { "epoch": 100.00666666666666, - "grad_norm": 0.003686284413561225, + "grad_norm": 0.0005272876587696373, "learning_rate": 3.523809523809524e-06, - "loss": 0.0001, + "loss": 0.0, "step": 7170 }, { "epoch": 100.0067619047619, - "eval_accuracy": 0.7297297297297297, - "eval_loss": 2.254132032394409, - "eval_runtime": 14.2598, - "eval_samples_per_second": 5.189, - "eval_steps_per_second": 1.332, + "eval_accuracy": 0.7162162162162162, + "eval_loss": 2.4621739387512207, + "eval_runtime": 20.3266, + "eval_samples_per_second": 3.641, + "eval_steps_per_second": 0.935, "step": 7171 }, { "epoch": 101.00085714285714, - "grad_norm": 0.0012897474225610495, + "grad_norm": 0.0008107370231300592, "learning_rate": 3.5132275132275133e-06, - "loss": 0.0814, + "loss": 0.0, "step": 7180 }, { "epoch": 101.00180952380953, - "grad_norm": 0.0020622352603822947, + "grad_norm": 0.0010733662638813257, "learning_rate": 3.502645502645503e-06, - "loss": 0.0058, + "loss": 0.0, "step": 7190 }, { "epoch": 101.00276190476191, - "grad_norm": 338.4073486328125, + "grad_norm": 0.00023595021048095077, "learning_rate": 3.492063492063492e-06, - "loss": 0.0196, + "loss": 0.1378, "step": 7200 }, { "epoch": 101.00371428571428, - "grad_norm": 0.0013440559851005673, + "grad_norm": 0.00020432127348612994, "learning_rate": 3.481481481481482e-06, - "loss": 0.0001, + "loss": 0.0, "step": 7210 }, { "epoch": 101.00466666666667, - "grad_norm": 0.006672668736428022, + "grad_norm": 0.0006551428232342005, "learning_rate": 3.4708994708994716e-06, - "loss": 0.1608, + "loss": 0.0, "step": 7220 }, { "epoch": 101.00561904761905, - "grad_norm": 0.0012928335927426815, + "grad_norm": 0.00026041388628073037, "learning_rate": 3.4603174603174607e-06, - "loss": 0.0001, + "loss": 0.2107, "step": 7230 }, { "epoch": 101.00657142857143, - "grad_norm": 0.0007883533253334463, + "grad_norm": 0.00023115877411328256, "learning_rate": 3.4497354497354503e-06, - "loss": 0.0, + "loss": 0.0016, "step": 7240 }, { "epoch": 101.0067619047619, - "eval_accuracy": 0.6486486486486487, - "eval_loss": 2.6355173587799072, - "eval_runtime": 14.5574, - "eval_samples_per_second": 5.083, - "eval_steps_per_second": 1.305, + "eval_accuracy": 0.7297297297297297, + "eval_loss": 2.4539666175842285, + "eval_runtime": 20.1069, + "eval_samples_per_second": 3.68, + "eval_steps_per_second": 0.945, "step": 7242 }, { "epoch": 102.0007619047619, - "grad_norm": 0.000990048865787685, + "grad_norm": 0.00019595421326812357, "learning_rate": 3.4391534391534394e-06, - "loss": 0.0002, + "loss": 0.0, "step": 7250 }, { "epoch": 102.00171428571429, - "grad_norm": 0.0007298311102204025, + "grad_norm": 0.00034865373163484037, "learning_rate": 3.428571428571429e-06, - "loss": 0.0001, + "loss": 0.2287, "step": 7260 }, { "epoch": 102.00266666666667, - "grad_norm": 0.0012372963828966022, + "grad_norm": 0.0010627711890265346, "learning_rate": 3.4179894179894185e-06, - "loss": 0.0001, + "loss": 0.0, "step": 7270 }, { "epoch": 102.00361904761905, - "grad_norm": 0.03270556777715683, + "grad_norm": 0.0002950064663309604, "learning_rate": 3.4074074074074077e-06, - "loss": 0.2316, + "loss": 0.0, "step": 7280 }, { "epoch": 102.00457142857142, - "grad_norm": 0.00168316881172359, + "grad_norm": 0.0019715323578566313, "learning_rate": 3.3968253968253972e-06, - "loss": 0.0194, + "loss": 0.0, "step": 7290 }, { "epoch": 102.00552380952381, - "grad_norm": 0.026326464489102364, + "grad_norm": 0.0005743891815654933, "learning_rate": 3.3862433862433864e-06, - "loss": 0.0083, + "loss": 0.0, "step": 7300 }, { "epoch": 102.00647619047619, - "grad_norm": 0.0012957426952198148, + "grad_norm": 0.0005560641875490546, "learning_rate": 3.375661375661376e-06, - "loss": 0.0, + "loss": 0.0006, "step": 7310 }, { - "epoch": 102.0067619047619, - "eval_accuracy": 0.6621621621621622, - "eval_loss": 2.8393476009368896, - "eval_runtime": 14.4835, - "eval_samples_per_second": 5.109, - "eval_steps_per_second": 1.312, + "epoch": 102.0067619047619, + "eval_accuracy": 0.7162162162162162, + "eval_loss": 2.485344171524048, + "eval_runtime": 18.7508, + "eval_samples_per_second": 3.946, + "eval_steps_per_second": 1.013, "step": 7313 }, { "epoch": 103.00066666666666, - "grad_norm": 0.0015681335935369134, + "grad_norm": 0.00027370688621886075, "learning_rate": 3.3650793650793655e-06, - "loss": 0.198, + "loss": 0.0, "step": 7320 }, { "epoch": 103.00161904761904, - "grad_norm": 0.001265071565285325, + "grad_norm": 0.0003905274497810751, "learning_rate": 3.3544973544973546e-06, - "loss": 0.0001, + "loss": 0.0, "step": 7330 }, { "epoch": 103.00257142857143, - "grad_norm": 0.0017021497478708625, + "grad_norm": 0.0002848069998435676, "learning_rate": 3.343915343915344e-06, - "loss": 0.1842, + "loss": 0.0104, "step": 7340 }, { "epoch": 103.00352380952381, - "grad_norm": 0.0009849672205746174, + "grad_norm": 0.00020756880985572934, "learning_rate": 3.3333333333333333e-06, - "loss": 0.0001, + "loss": 0.0, "step": 7350 }, { "epoch": 103.0044761904762, - "grad_norm": 0.011138002388179302, + "grad_norm": 0.4025360643863678, "learning_rate": 3.322751322751323e-06, - "loss": 0.2116, + "loss": 0.0001, "step": 7360 }, { "epoch": 103.00542857142857, - "grad_norm": 0.8435676693916321, + "grad_norm": 0.0003499074373394251, "learning_rate": 3.312169312169313e-06, - "loss": 0.0001, + "loss": 0.0, "step": 7370 }, { "epoch": 103.00638095238095, - "grad_norm": 0.03975530341267586, + "grad_norm": 0.0005405242554843426, "learning_rate": 3.3015873015873016e-06, - "loss": 0.0001, + "loss": 0.0, "step": 7380 }, { "epoch": 103.0067619047619, - "eval_accuracy": 0.6891891891891891, - "eval_loss": 2.1937949657440186, - "eval_runtime": 14.8028, - "eval_samples_per_second": 4.999, - "eval_steps_per_second": 1.284, + "eval_accuracy": 0.7297297297297297, + "eval_loss": 2.510071277618408, + "eval_runtime": 18.4553, + "eval_samples_per_second": 4.01, + "eval_steps_per_second": 1.03, "step": 7384 }, { "epoch": 104.00057142857143, - "grad_norm": 0.001498477766290307, + "grad_norm": 0.0011253401171416044, "learning_rate": 3.2910052910052916e-06, - "loss": 0.2066, + "loss": 0.0, "step": 7390 }, { "epoch": 104.0015238095238, - "grad_norm": 1.7284471988677979, + "grad_norm": 0.00038248911732807755, "learning_rate": 3.2804232804232807e-06, - "loss": 0.0002, + "loss": 0.0, "step": 7400 }, { "epoch": 104.00247619047619, - "grad_norm": 0.0010822723852470517, + "grad_norm": 0.004601576831191778, "learning_rate": 3.2698412698412703e-06, - "loss": 0.0001, + "loss": 0.0, "step": 7410 }, { "epoch": 104.00342857142857, - "grad_norm": 0.001455200370401144, + "grad_norm": 0.0004940856015309691, "learning_rate": 3.25925925925926e-06, - "loss": 0.0003, + "loss": 0.0, "step": 7420 }, { "epoch": 104.00438095238096, - "grad_norm": 0.9901089668273926, + "grad_norm": 0.0004417779855430126, "learning_rate": 3.248677248677249e-06, - "loss": 0.0003, + "loss": 0.0, "step": 7430 }, { "epoch": 104.00533333333334, - "grad_norm": 0.04028749465942383, + "grad_norm": 0.0007058187038637698, "learning_rate": 3.2380952380952385e-06, - "loss": 0.0001, + "loss": 0.0, "step": 7440 }, { "epoch": 104.00628571428571, - "grad_norm": 0.0012032322119921446, + "grad_norm": 0.0003210832073818892, "learning_rate": 3.2275132275132277e-06, - "loss": 0.0001, + "loss": 0.0, "step": 7450 }, { "epoch": 104.0067619047619, - "eval_accuracy": 0.7027027027027027, - "eval_loss": 2.2224631309509277, - "eval_runtime": 15.5552, - "eval_samples_per_second": 4.757, - "eval_steps_per_second": 1.221, + "eval_accuracy": 0.7162162162162162, + "eval_loss": 2.5136077404022217, + "eval_runtime": 21.1895, + "eval_samples_per_second": 3.492, + "eval_steps_per_second": 0.897, "step": 7455 }, { "epoch": 105.00047619047619, - "grad_norm": 0.0006622392102144659, + "grad_norm": 0.005918905604630709, "learning_rate": 3.2169312169312172e-06, - "loss": 0.0001, + "loss": 0.0, "step": 7460 }, { "epoch": 105.00142857142858, - "grad_norm": 0.0029132510535418987, + "grad_norm": 0.0002152614906663075, "learning_rate": 3.206349206349207e-06, - "loss": 0.0001, + "loss": 0.0, "step": 7470 }, { "epoch": 105.00238095238095, - "grad_norm": 0.0010098450584337115, + "grad_norm": 0.0005597418639808893, "learning_rate": 3.195767195767196e-06, - "loss": 0.0001, + "loss": 0.0, "step": 7480 }, { "epoch": 105.00333333333333, - "grad_norm": 0.006145752966403961, + "grad_norm": 0.0002892552292905748, "learning_rate": 3.1851851851851855e-06, - "loss": 0.0001, + "loss": 0.0, "step": 7490 }, { "epoch": 105.00428571428571, - "grad_norm": 0.0012558751041069627, + "grad_norm": 0.00044008262921124697, "learning_rate": 3.1746031746031746e-06, - "loss": 0.0041, + "loss": 0.0, "step": 7500 }, { "epoch": 105.0052380952381, - "grad_norm": 0.0007008440443314612, + "grad_norm": 0.0002466421283315867, "learning_rate": 3.164021164021164e-06, "loss": 0.0, "step": 7510 }, { "epoch": 105.00619047619048, - "grad_norm": 0.0010673552751541138, + "grad_norm": 0.0002469986502546817, "learning_rate": 3.1534391534391538e-06, - "loss": 0.1038, + "loss": 0.0, "step": 7520 }, { "epoch": 105.0067619047619, - "eval_accuracy": 0.7027027027027027, - "eval_loss": 2.4166831970214844, - "eval_runtime": 13.6248, - "eval_samples_per_second": 5.431, - "eval_steps_per_second": 1.395, + "eval_accuracy": 0.7297297297297297, + "eval_loss": 2.5027740001678467, + "eval_runtime": 19.69, + "eval_samples_per_second": 3.758, + "eval_steps_per_second": 0.965, "step": 7526 }, { "epoch": 106.00038095238095, - "grad_norm": 0.004051295109093189, + "grad_norm": 0.0006333022029139102, "learning_rate": 3.142857142857143e-06, - "loss": 0.0001, + "loss": 0.0, "step": 7530 }, { "epoch": 106.00133333333333, - "grad_norm": 0.0018393241334706545, + "grad_norm": 0.0038955979980528355, "learning_rate": 3.132275132275133e-06, "loss": 0.0, "step": 7540 }, { "epoch": 106.00228571428572, - "grad_norm": 0.0008061683620326221, + "grad_norm": 0.0007543124374933541, "learning_rate": 3.1216931216931216e-06, - "loss": 0.2633, + "loss": 0.0, "step": 7550 }, { "epoch": 106.00323809523809, - "grad_norm": 0.0010647161398082972, + "grad_norm": 0.00026103874552063644, "learning_rate": 3.1111111111111116e-06, - "loss": 0.0001, + "loss": 0.0, "step": 7560 }, { "epoch": 106.00419047619047, - "grad_norm": 0.10086814314126968, + "grad_norm": 0.0006310672033578157, "learning_rate": 3.100529100529101e-06, - "loss": 0.0004, + "loss": 0.0, "step": 7570 }, { "epoch": 106.00514285714286, - "grad_norm": 0.22082285583019257, + "grad_norm": 0.000212406026548706, "learning_rate": 3.0899470899470903e-06, - "loss": 0.0001, + "loss": 0.0, "step": 7580 }, { "epoch": 106.00609523809524, - "grad_norm": 0.001157531514763832, + "grad_norm": 0.0003842598816845566, "learning_rate": 3.07936507936508e-06, - "loss": 0.0001, + "loss": 0.0039, "step": 7590 }, { "epoch": 106.0067619047619, "eval_accuracy": 0.7162162162162162, - "eval_loss": 2.2465484142303467, - "eval_runtime": 13.7054, - "eval_samples_per_second": 5.399, - "eval_steps_per_second": 1.386, + "eval_loss": 2.6881797313690186, + "eval_runtime": 18.8348, + "eval_samples_per_second": 3.929, + "eval_steps_per_second": 1.009, "step": 7597 }, { "epoch": 107.00028571428571, - "grad_norm": 0.0009280334343202412, + "grad_norm": 0.0005781868239864707, "learning_rate": 3.068783068783069e-06, "loss": 0.0, "step": 7600 }, { "epoch": 107.0012380952381, - "grad_norm": 0.0021576792933046818, + "grad_norm": 0.0003101880429312587, "learning_rate": 3.0582010582010585e-06, - "loss": 0.0001, + "loss": 0.0, "step": 7610 }, { "epoch": 107.00219047619048, - "grad_norm": 0.0010600824607536197, + "grad_norm": 0.0003724259731825441, "learning_rate": 3.047619047619048e-06, "loss": 0.0, "step": 7620 }, { "epoch": 107.00314285714286, - "grad_norm": 0.001216122298501432, + "grad_norm": 0.0003032613603863865, "learning_rate": 3.0370370370370372e-06, - "loss": 0.0001, + "loss": 0.0007, "step": 7630 }, { "epoch": 107.00409523809523, - "grad_norm": 0.0007257874822244048, + "grad_norm": 0.0001887906837509945, "learning_rate": 3.026455026455027e-06, - "loss": 0.1911, + "loss": 0.0, "step": 7640 }, { "epoch": 107.00504761904762, - "grad_norm": 0.0006914408295415342, + "grad_norm": 0.00022369824000634253, "learning_rate": 3.015873015873016e-06, - "loss": 0.0001, + "loss": 0.0, "step": 7650 }, { "epoch": 107.006, - "grad_norm": 0.001065646531060338, + "grad_norm": 0.00019053922733291984, "learning_rate": 3.0052910052910055e-06, - "loss": 0.0001, + "loss": 0.0, "step": 7660 }, { "epoch": 107.0067619047619, - "eval_accuracy": 0.7027027027027027, - "eval_loss": 2.4676618576049805, - "eval_runtime": 13.8988, - "eval_samples_per_second": 5.324, - "eval_steps_per_second": 1.367, + "eval_accuracy": 0.7162162162162162, + "eval_loss": 2.8377068042755127, + "eval_runtime": 18.9954, + "eval_samples_per_second": 3.896, + "eval_steps_per_second": 1.0, "step": 7668 }, { "epoch": 108.00019047619048, - "grad_norm": 0.004718266427516937, + "grad_norm": 0.0004099408397451043, "learning_rate": 2.9947089947089946e-06, "loss": 0.0, "step": 7670 }, { "epoch": 108.00114285714285, - "grad_norm": 0.001543802791275084, + "grad_norm": 0.0004890338750556111, "learning_rate": 2.984126984126984e-06, - "loss": 0.0004, + "loss": 0.0, "step": 7680 }, { "epoch": 108.00209523809524, - "grad_norm": 0.0009165288647636771, + "grad_norm": 0.00018718685896601528, "learning_rate": 2.9735449735449738e-06, - "loss": 0.0001, + "loss": 0.0, "step": 7690 }, { "epoch": 108.00304761904762, - "grad_norm": 0.001680697430856526, + "grad_norm": 0.0004911802243441343, "learning_rate": 2.962962962962963e-06, - "loss": 0.0312, + "loss": 0.0, "step": 7700 }, { "epoch": 108.004, - "grad_norm": 176.18521118164062, + "grad_norm": 0.0003113812126684934, "learning_rate": 2.9523809523809525e-06, - "loss": 0.0073, + "loss": 0.0, "step": 7710 }, { "epoch": 108.00495238095237, - "grad_norm": 0.13346318900585175, + "grad_norm": 0.00017770612612366676, "learning_rate": 2.9417989417989416e-06, - "loss": 0.0001, + "loss": 0.0, "step": 7720 }, { "epoch": 108.00590476190476, - "grad_norm": 0.001556839793920517, + "grad_norm": 0.00023184904421214014, "learning_rate": 2.9312169312169316e-06, - "loss": 0.0333, + "loss": 0.0, "step": 7730 }, { "epoch": 108.0067619047619, - "eval_accuracy": 0.6621621621621622, - "eval_loss": 2.4545717239379883, - "eval_runtime": 14.038, - "eval_samples_per_second": 5.271, - "eval_steps_per_second": 1.353, + "eval_accuracy": 0.7162162162162162, + "eval_loss": 2.849548816680908, + "eval_runtime": 19.7799, + "eval_samples_per_second": 3.741, + "eval_steps_per_second": 0.961, "step": 7739 }, { "epoch": 109.00009523809524, - "grad_norm": 0.000984245678409934, + "grad_norm": 0.00020311641856096685, "learning_rate": 2.920634920634921e-06, - "loss": 0.0003, + "loss": 0.0, "step": 7740 }, { "epoch": 109.00104761904763, - "grad_norm": 0.6098659038543701, + "grad_norm": 0.0002174510882468894, "learning_rate": 2.9100529100529103e-06, - "loss": 0.0003, + "loss": 0.0, "step": 7750 }, { "epoch": 109.002, - "grad_norm": 0.12003173679113388, + "grad_norm": 0.00025807766360230744, "learning_rate": 2.8994708994709e-06, - "loss": 0.0001, + "loss": 0.0, "step": 7760 }, { "epoch": 109.00295238095238, - "grad_norm": 0.008355682715773582, + "grad_norm": 0.0005842253449372947, "learning_rate": 2.888888888888889e-06, - "loss": 0.0002, + "loss": 0.0, "step": 7770 }, { "epoch": 109.00390476190476, - "grad_norm": 0.0020827262196689844, + "grad_norm": 0.00022474676370620728, "learning_rate": 2.8783068783068786e-06, "loss": 0.0, "step": 7780 }, { "epoch": 109.00485714285715, - "grad_norm": 0.0006808519829064608, + "grad_norm": 0.00021344266133382916, "learning_rate": 2.867724867724868e-06, - "loss": 0.0001, + "loss": 0.0, "step": 7790 }, { "epoch": 109.00580952380952, - "grad_norm": 0.0010414454154670238, + "grad_norm": 0.00037887351936660707, "learning_rate": 2.8571428571428573e-06, "loss": 0.0, "step": 7800 }, { "epoch": 109.0067619047619, - "grad_norm": 0.0006823607254773378, + "grad_norm": 0.00037470136885531247, "learning_rate": 2.846560846560847e-06, - "loss": 0.0119, + "loss": 0.0073, "step": 7810 }, { "epoch": 109.0067619047619, - "eval_accuracy": 0.6891891891891891, - "eval_loss": 2.581144094467163, - "eval_runtime": 14.356, - "eval_samples_per_second": 5.155, - "eval_steps_per_second": 1.323, + "eval_accuracy": 0.7297297297297297, + "eval_loss": 2.6624884605407715, + "eval_runtime": 22.2204, + "eval_samples_per_second": 3.33, + "eval_steps_per_second": 0.855, "step": 7810 }, { "epoch": 110.00095238095238, - "grad_norm": 0.003468712791800499, + "grad_norm": 0.0041119703091681, "learning_rate": 2.835978835978836e-06, "loss": 0.0, "step": 7820 }, { "epoch": 110.00190476190477, - "grad_norm": 0.0013633972266688943, + "grad_norm": 0.0002662258630152792, "learning_rate": 2.8253968253968255e-06, - "loss": 0.2267, + "loss": 0.0008, "step": 7830 }, { "epoch": 110.00285714285714, - "grad_norm": 0.012280252762138844, + "grad_norm": 0.0008752596913836896, "learning_rate": 2.814814814814815e-06, - "loss": 0.0001, + "loss": 0.0, "step": 7840 }, { "epoch": 110.00380952380952, - "grad_norm": 0.001774327945895493, + "grad_norm": 0.00015781358524691314, "learning_rate": 2.8042328042328042e-06, - "loss": 0.0005, + "loss": 0.0, "step": 7850 }, { "epoch": 110.0047619047619, - "grad_norm": 0.0013728903140872717, + "grad_norm": 85.68650817871094, "learning_rate": 2.7936507936507938e-06, - "loss": 0.0, + "loss": 0.0024, "step": 7860 }, { "epoch": 110.00571428571429, - "grad_norm": 0.0017020375235006213, + "grad_norm": 0.00019260364933870733, "learning_rate": 2.783068783068783e-06, - "loss": 0.1143, + "loss": 0.0, "step": 7870 }, { "epoch": 110.00666666666666, - "grad_norm": 0.0026008477434515953, + "grad_norm": 0.00015918267308734357, "learning_rate": 2.7724867724867725e-06, - "loss": 0.0001, + "loss": 0.0, "step": 7880 }, { "epoch": 110.0067619047619, "eval_accuracy": 0.7162162162162162, - "eval_loss": 2.287386894226074, - "eval_runtime": 16.4402, - "eval_samples_per_second": 4.501, - "eval_steps_per_second": 1.156, + "eval_loss": 2.706251621246338, + "eval_runtime": 22.8587, + "eval_samples_per_second": 3.237, + "eval_steps_per_second": 0.831, "step": 7881 }, { "epoch": 111.00085714285714, - "grad_norm": 0.004622504580765963, + "grad_norm": 0.00036700881901197135, "learning_rate": 2.7619047619047625e-06, - "loss": 0.0001, + "loss": 0.0, "step": 7890 }, { "epoch": 111.00180952380953, - "grad_norm": 0.0013189928140491247, + "grad_norm": 0.0001547165447846055, "learning_rate": 2.7513227513227516e-06, "loss": 0.0, "step": 7900 }, { "epoch": 111.00276190476191, - "grad_norm": 0.01059955358505249, + "grad_norm": 24.328468322753906, "learning_rate": 2.740740740740741e-06, - "loss": 0.0001, + "loss": 0.2405, "step": 7910 }, { "epoch": 111.00371428571428, - "grad_norm": 0.000857778184581548, + "grad_norm": 0.0007024611113592982, "learning_rate": 2.7301587301587303e-06, - "loss": 0.0001, + "loss": 0.0, "step": 7920 }, { "epoch": 111.00466666666667, - "grad_norm": 0.0008232994005084038, + "grad_norm": 0.0004765796475112438, "learning_rate": 2.71957671957672e-06, "loss": 0.0, "step": 7930 }, { "epoch": 111.00561904761905, - "grad_norm": 0.08803115785121918, + "grad_norm": 0.0002616800193209201, "learning_rate": 2.7089947089947094e-06, "loss": 0.0, "step": 7940 }, { "epoch": 111.00657142857143, - "grad_norm": 0.0008486019214615226, + "grad_norm": 0.0006567240925505757, "learning_rate": 2.6984126984126986e-06, "loss": 0.0, "step": 7950 }, { "epoch": 111.0067619047619, - "eval_accuracy": 0.7297297297297297, - "eval_loss": 2.1970043182373047, - "eval_runtime": 14.0164, - "eval_samples_per_second": 5.28, - "eval_steps_per_second": 1.356, + "eval_accuracy": 0.7567567567567568, + "eval_loss": 2.3949179649353027, + "eval_runtime": 13.9865, + "eval_samples_per_second": 5.291, + "eval_steps_per_second": 1.358, "step": 7952 }, { "epoch": 112.0007619047619, - "grad_norm": 0.0010444708168506622, + "grad_norm": 0.00034727007732726634, "learning_rate": 2.687830687830688e-06, - "loss": 0.0063, + "loss": 0.0001, "step": 7960 }, { "epoch": 112.00171428571429, - "grad_norm": 0.0009477341081947088, + "grad_norm": 0.0005837412900291383, "learning_rate": 2.6772486772486773e-06, "loss": 0.0, "step": 7970 }, { "epoch": 112.00266666666667, - "grad_norm": 0.000609098351560533, + "grad_norm": 0.00018690152501221746, "learning_rate": 2.666666666666667e-06, "loss": 0.0, "step": 7980 }, { "epoch": 112.00361904761905, - "grad_norm": 0.011918469332158566, + "grad_norm": 0.000701416633091867, "learning_rate": 2.6560846560846564e-06, "loss": 0.0, "step": 7990 }, { "epoch": 112.00457142857142, - "grad_norm": 0.0012907920172438025, + "grad_norm": 0.00027452572248876095, "learning_rate": 2.6455026455026455e-06, "loss": 0.0001, "step": 8000 }, { "epoch": 112.00552380952381, - "grad_norm": 0.0006700385129079223, + "grad_norm": 0.00015516536950599402, "learning_rate": 2.634920634920635e-06, - "loss": 0.0001, + "loss": 0.0, "step": 8010 }, { "epoch": 112.00647619047619, - "grad_norm": 0.000968728621955961, + "grad_norm": 0.00020343823416624218, "learning_rate": 2.6243386243386242e-06, "loss": 0.0, "step": 8020 @@ -6633,586 +6633,586 @@ { "epoch": 112.0067619047619, "eval_accuracy": 0.7432432432432432, - "eval_loss": 2.2009499073028564, - "eval_runtime": 14.2089, - "eval_samples_per_second": 5.208, - "eval_steps_per_second": 1.337, + "eval_loss": 2.5955913066864014, + "eval_runtime": 13.9752, + "eval_samples_per_second": 5.295, + "eval_steps_per_second": 1.36, "step": 8023 }, { "epoch": 113.00066666666666, - "grad_norm": 0.0010398072190582752, + "grad_norm": 0.00027498166309669614, "learning_rate": 2.613756613756614e-06, - "loss": 0.0001, + "loss": 0.0, "step": 8030 }, { "epoch": 113.00161904761904, - "grad_norm": 0.008272141218185425, + "grad_norm": 0.0004681613063439727, "learning_rate": 2.6031746031746038e-06, "loss": 0.0, "step": 8040 }, { "epoch": 113.00257142857143, - "grad_norm": 0.0012324623530730605, + "grad_norm": 0.0002042886335402727, "learning_rate": 2.5925925925925925e-06, - "loss": 0.0001, + "loss": 0.0, "step": 8050 }, { "epoch": 113.00352380952381, - "grad_norm": 0.0016941934591159225, + "grad_norm": 0.0007689885678701103, "learning_rate": 2.5820105820105825e-06, - "loss": 0.1289, + "loss": 0.0, "step": 8060 }, { "epoch": 113.0044761904762, - "grad_norm": 0.0024788875598460436, + "grad_norm": 0.00023886038979981095, "learning_rate": 2.571428571428571e-06, - "loss": 0.2087, + "loss": 0.0, "step": 8070 }, { "epoch": 113.00542857142857, - "grad_norm": 0.0012335249921306968, + "grad_norm": 0.00014404159446712583, "learning_rate": 2.560846560846561e-06, - "loss": 0.0, + "loss": 0.1964, "step": 8080 }, { "epoch": 113.00638095238095, - "grad_norm": 0.0008130993810482323, + "grad_norm": 0.0002148894709534943, "learning_rate": 2.5502645502645507e-06, "loss": 0.0001, "step": 8090 }, { "epoch": 113.0067619047619, - "eval_accuracy": 0.7432432432432432, - "eval_loss": 2.25538969039917, - "eval_runtime": 13.5185, - "eval_samples_per_second": 5.474, - "eval_steps_per_second": 1.405, + "eval_accuracy": 0.7027027027027027, + "eval_loss": 2.9212052822113037, + "eval_runtime": 13.8012, + "eval_samples_per_second": 5.362, + "eval_steps_per_second": 1.377, "step": 8094 }, { "epoch": 114.00057142857143, - "grad_norm": 0.0006910522934049368, + "grad_norm": 0.00036819567321799695, "learning_rate": 2.53968253968254e-06, "loss": 0.0, "step": 8100 }, { "epoch": 114.0015238095238, - "grad_norm": 0.0012819025432690978, + "grad_norm": 0.0022056999150663614, "learning_rate": 2.5291005291005294e-06, "loss": 0.0, "step": 8110 }, { "epoch": 114.00247619047619, - "grad_norm": 0.0011646713828667998, + "grad_norm": 0.03204357624053955, "learning_rate": 2.5185185185185186e-06, - "loss": 0.0001, + "loss": 0.0, "step": 8120 }, { "epoch": 114.00342857142857, - "grad_norm": 0.0007299323915503919, + "grad_norm": 0.00015915023686829954, "learning_rate": 2.507936507936508e-06, "loss": 0.0, "step": 8130 }, { "epoch": 114.00438095238096, - "grad_norm": 0.0007748621865175664, + "grad_norm": 0.0010063733207061887, "learning_rate": 2.4973544973544973e-06, - "loss": 0.0001, + "loss": 0.0, "step": 8140 }, { "epoch": 114.00533333333334, - "grad_norm": 0.0007616358925588429, + "grad_norm": 0.0001527264976175502, "learning_rate": 2.486772486772487e-06, "loss": 0.0, "step": 8150 }, { "epoch": 114.00628571428571, - "grad_norm": 0.0007386330980807543, + "grad_norm": 0.00043166003888472915, "learning_rate": 2.4761904761904764e-06, "loss": 0.0, "step": 8160 }, { "epoch": 114.0067619047619, - "eval_accuracy": 0.7162162162162162, - "eval_loss": 2.2651755809783936, - "eval_runtime": 14.2929, - "eval_samples_per_second": 5.177, - "eval_steps_per_second": 1.329, + "eval_accuracy": 0.6891891891891891, + "eval_loss": 2.821566343307495, + "eval_runtime": 13.9047, + "eval_samples_per_second": 5.322, + "eval_steps_per_second": 1.366, "step": 8165 }, { "epoch": 115.00047619047619, - "grad_norm": 0.0017123465659096837, + "grad_norm": 0.0003170575946569443, "learning_rate": 2.465608465608466e-06, "loss": 0.0, "step": 8170 }, { "epoch": 115.00142857142858, - "grad_norm": 0.0009110061218962073, + "grad_norm": 0.00025970794376917183, "learning_rate": 2.455026455026455e-06, - "loss": 0.0157, + "loss": 0.0, "step": 8180 }, { "epoch": 115.00238095238095, - "grad_norm": 0.0007751217926852405, + "grad_norm": 0.0002181291056331247, "learning_rate": 2.4444444444444447e-06, - "loss": 0.0019, + "loss": 0.0, "step": 8190 }, { "epoch": 115.00333333333333, - "grad_norm": 0.0007961459341458976, + "grad_norm": 0.00014427877613343298, "learning_rate": 2.433862433862434e-06, "loss": 0.0, "step": 8200 }, { "epoch": 115.00428571428571, - "grad_norm": 0.0009998691966757178, + "grad_norm": 0.00027777208015322685, "learning_rate": 2.4232804232804234e-06, - "loss": 0.0001, + "loss": 0.0, "step": 8210 }, { "epoch": 115.0052380952381, - "grad_norm": 0.00098160351626575, + "grad_norm": 0.0002333878946956247, "learning_rate": 2.412698412698413e-06, - "loss": 0.1409, + "loss": 0.0, "step": 8220 }, { "epoch": 115.00619047619048, - "grad_norm": 0.0009323691483587027, + "grad_norm": 0.00015995455032680184, "learning_rate": 2.4021164021164025e-06, "loss": 0.0, "step": 8230 }, { "epoch": 115.0067619047619, - "eval_accuracy": 0.7162162162162162, - "eval_loss": 2.3248355388641357, - "eval_runtime": 14.0654, - "eval_samples_per_second": 5.261, - "eval_steps_per_second": 1.351, + "eval_accuracy": 0.6891891891891891, + "eval_loss": 2.840902328491211, + "eval_runtime": 13.8639, + "eval_samples_per_second": 5.338, + "eval_steps_per_second": 1.37, "step": 8236 }, { "epoch": 116.00038095238095, - "grad_norm": 0.001129627344198525, + "grad_norm": 0.0001782690524123609, "learning_rate": 2.3915343915343916e-06, "loss": 0.0, "step": 8240 }, { "epoch": 116.00133333333333, - "grad_norm": 0.0011487036244943738, + "grad_norm": 0.00017185336037073284, "learning_rate": 2.380952380952381e-06, - "loss": 0.0005, + "loss": 0.0, "step": 8250 }, { "epoch": 116.00228571428572, - "grad_norm": 0.0013055442832410336, + "grad_norm": 0.002734607784077525, "learning_rate": 2.3703703703703707e-06, "loss": 0.0, "step": 8260 }, { "epoch": 116.00323809523809, - "grad_norm": 1.352319359779358, + "grad_norm": 0.00045948615297675133, "learning_rate": 2.35978835978836e-06, - "loss": 0.0001, + "loss": 0.0, "step": 8270 }, { "epoch": 116.00419047619047, - "grad_norm": 0.001312308362685144, + "grad_norm": 0.00028406543424353004, "learning_rate": 2.3492063492063494e-06, "loss": 0.0, "step": 8280 }, { "epoch": 116.00514285714286, - "grad_norm": 0.001412849873304367, + "grad_norm": 0.00016468593094032258, "learning_rate": 2.3386243386243386e-06, - "loss": 0.0002, + "loss": 0.0, "step": 8290 }, { "epoch": 116.00609523809524, - "grad_norm": 2.120629072189331, + "grad_norm": 0.00022645114222541451, "learning_rate": 2.328042328042328e-06, - "loss": 0.0001, + "loss": 0.0, "step": 8300 }, { "epoch": 116.0067619047619, "eval_accuracy": 0.6891891891891891, - "eval_loss": 2.5589470863342285, - "eval_runtime": 11.1299, - "eval_samples_per_second": 6.649, - "eval_steps_per_second": 1.707, + "eval_loss": 2.8546435832977295, + "eval_runtime": 16.3842, + "eval_samples_per_second": 4.517, + "eval_steps_per_second": 1.16, "step": 8307 }, { "epoch": 117.00028571428571, - "grad_norm": 0.0009046709747053683, + "grad_norm": 0.00027977171703241765, "learning_rate": 2.3174603174603177e-06, - "loss": 0.1208, + "loss": 0.0, "step": 8310 }, { "epoch": 117.0012380952381, - "grad_norm": 0.0010104886023327708, + "grad_norm": 0.0003135088481940329, "learning_rate": 2.3068783068783073e-06, "loss": 0.0, "step": 8320 }, { "epoch": 117.00219047619048, - "grad_norm": 0.0009857664117589593, + "grad_norm": 0.00021076586563140154, "learning_rate": 2.2962962962962964e-06, - "loss": 0.002, + "loss": 0.0, "step": 8330 }, { "epoch": 117.00314285714286, - "grad_norm": 0.0009741351823322475, + "grad_norm": 0.00024846967426128685, "learning_rate": 2.285714285714286e-06, - "loss": 0.0045, + "loss": 0.0, "step": 8340 }, { "epoch": 117.00409523809523, - "grad_norm": 0.0016145688714459538, + "grad_norm": 0.0003208005800843239, "learning_rate": 2.275132275132275e-06, - "loss": 0.0006, + "loss": 0.0, "step": 8350 }, { "epoch": 117.00504761904762, - "grad_norm": 0.00107288034632802, + "grad_norm": 0.00039256789023056626, "learning_rate": 2.2645502645502647e-06, "loss": 0.0, "step": 8360 }, { "epoch": 117.006, - "grad_norm": 0.0010272023500874639, + "grad_norm": 0.00013160724483896047, "learning_rate": 2.2539682539682542e-06, "loss": 0.0, "step": 8370 }, { "epoch": 117.0067619047619, - "eval_accuracy": 0.7567567567567568, - "eval_loss": 2.2265772819519043, - "eval_runtime": 13.3717, - "eval_samples_per_second": 5.534, - "eval_steps_per_second": 1.421, + "eval_accuracy": 0.6891891891891891, + "eval_loss": 2.8172342777252197, + "eval_runtime": 19.032, + "eval_samples_per_second": 3.888, + "eval_steps_per_second": 0.998, "step": 8378 }, { "epoch": 118.00019047619048, - "grad_norm": 0.0007128997822292149, + "grad_norm": 0.00013908334949519485, "learning_rate": 2.2433862433862434e-06, "loss": 0.0, "step": 8380 }, { "epoch": 118.00114285714285, - "grad_norm": 0.0011421479284763336, + "grad_norm": 0.0002135159884346649, "learning_rate": 2.232804232804233e-06, - "loss": 0.2253, + "loss": 0.0, "step": 8390 }, { "epoch": 118.00209523809524, - "grad_norm": 0.001073920400813222, + "grad_norm": 0.00352920638397336, "learning_rate": 2.222222222222222e-06, "loss": 0.0, "step": 8400 }, { "epoch": 118.00304761904762, - "grad_norm": 303.7464904785156, + "grad_norm": 0.000681175384670496, "learning_rate": 2.211640211640212e-06, - "loss": 0.1566, + "loss": 0.0001, "step": 8410 }, { "epoch": 118.004, - "grad_norm": 0.0007826104993000627, + "grad_norm": 0.00010591221507638693, "learning_rate": 2.201058201058201e-06, - "loss": 0.1285, + "loss": 0.0, "step": 8420 }, { "epoch": 118.00495238095237, - "grad_norm": 0.0008790806750766933, + "grad_norm": 309.2093811035156, "learning_rate": 2.1904761904761908e-06, - "loss": 0.0252, + "loss": 0.2105, "step": 8430 }, { "epoch": 118.00590476190476, - "grad_norm": 0.0009564163046889007, + "grad_norm": 0.00028071904671378434, "learning_rate": 2.17989417989418e-06, "loss": 0.0, "step": 8440 }, { "epoch": 118.0067619047619, - "eval_accuracy": 0.6891891891891891, - "eval_loss": 2.2806921005249023, - "eval_runtime": 14.0307, - "eval_samples_per_second": 5.274, - "eval_steps_per_second": 1.354, + "eval_accuracy": 0.7432432432432432, + "eval_loss": 2.4545764923095703, + "eval_runtime": 20.2563, + "eval_samples_per_second": 3.653, + "eval_steps_per_second": 0.938, "step": 8449 }, { "epoch": 119.00009523809524, - "grad_norm": 0.001905015087686479, + "grad_norm": 0.0003477052669040859, "learning_rate": 2.1693121693121695e-06, "loss": 0.0, "step": 8450 }, { "epoch": 119.00104761904763, - "grad_norm": 0.0008118277182802558, + "grad_norm": 0.0007623965502716601, "learning_rate": 2.158730158730159e-06, - "loss": 0.0027, + "loss": 0.0, "step": 8460 }, { "epoch": 119.002, - "grad_norm": 0.001871650223620236, + "grad_norm": 0.00021526910131797194, "learning_rate": 2.148148148148148e-06, "loss": 0.0, "step": 8470 }, { "epoch": 119.00295238095238, - "grad_norm": 0.0014514620415866375, + "grad_norm": 0.0005041114636696875, "learning_rate": 2.1375661375661377e-06, - "loss": 0.0001, + "loss": 0.0, "step": 8480 }, { "epoch": 119.00390476190476, - "grad_norm": 0.0011963268043473363, + "grad_norm": 0.00018653657753020525, "learning_rate": 2.1269841269841273e-06, "loss": 0.0, "step": 8490 }, { "epoch": 119.00485714285715, - "grad_norm": 0.000760503055062145, + "grad_norm": 0.00029102060943841934, "learning_rate": 2.1164021164021164e-06, "loss": 0.0, "step": 8500 }, { "epoch": 119.00580952380952, - "grad_norm": 0.0018978703301399946, + "grad_norm": 0.00020990609482396394, "learning_rate": 2.105820105820106e-06, - "loss": 0.0001, + "loss": 0.0, "step": 8510 }, { "epoch": 119.0067619047619, - "grad_norm": 0.0006232424639165401, + "grad_norm": 0.00012788939056918025, "learning_rate": 2.0952380952380955e-06, "loss": 0.0, "step": 8520 }, { "epoch": 119.0067619047619, - "eval_accuracy": 0.7432432432432432, - "eval_loss": 2.266402006149292, - "eval_runtime": 13.834, - "eval_samples_per_second": 5.349, - "eval_steps_per_second": 1.373, + "eval_accuracy": 0.7567567567567568, + "eval_loss": 2.381497383117676, + "eval_runtime": 21.6548, + "eval_samples_per_second": 3.417, + "eval_steps_per_second": 0.877, "step": 8520 }, { "epoch": 120.00095238095238, - "grad_norm": 0.0013166640419512987, + "grad_norm": 0.0004286629264242947, "learning_rate": 2.0846560846560847e-06, "loss": 0.0, "step": 8530 }, { "epoch": 120.00190476190477, - "grad_norm": 0.0007621021359227598, + "grad_norm": 0.00028043834026902914, "learning_rate": 2.0740740740740742e-06, "loss": 0.0, "step": 8540 }, { "epoch": 120.00285714285714, - "grad_norm": 0.000544812239240855, + "grad_norm": 0.00023993337526917458, "learning_rate": 2.0634920634920634e-06, - "loss": 0.0001, + "loss": 0.0, "step": 8550 }, { "epoch": 120.00380952380952, - "grad_norm": 0.0008052691118791699, + "grad_norm": 0.0008844914846122265, "learning_rate": 2.0529100529100534e-06, - "loss": 0.0003, + "loss": 0.0, "step": 8560 }, { "epoch": 120.0047619047619, - "grad_norm": 0.0012564912904053926, + "grad_norm": 0.0005346594844013453, "learning_rate": 2.0423280423280425e-06, "loss": 0.0, "step": 8570 }, { "epoch": 120.00571428571429, - "grad_norm": 0.003531375201418996, + "grad_norm": 0.000264531176071614, "learning_rate": 2.031746031746032e-06, "loss": 0.0, "step": 8580 }, { "epoch": 120.00666666666666, - "grad_norm": 0.0017125386511906981, + "grad_norm": 0.00016072009748313576, "learning_rate": 2.021164021164021e-06, "loss": 0.0, "step": 8590 }, { "epoch": 120.0067619047619, - "eval_accuracy": 0.7162162162162162, - "eval_loss": 2.1452441215515137, - "eval_runtime": 11.7378, - "eval_samples_per_second": 6.304, - "eval_steps_per_second": 1.619, + "eval_accuracy": 0.7432432432432432, + "eval_loss": 2.4006309509277344, + "eval_runtime": 20.5345, + "eval_samples_per_second": 3.604, + "eval_steps_per_second": 0.925, "step": 8591 }, { "epoch": 121.00085714285714, - "grad_norm": 0.004780361894518137, + "grad_norm": 0.00020770763512700796, "learning_rate": 2.0105820105820108e-06, "loss": 0.0, "step": 8600 }, { "epoch": 121.00180952380953, - "grad_norm": 0.0008624635520391166, + "grad_norm": 0.0001392192643834278, "learning_rate": 2.0000000000000003e-06, - "loss": 0.0005, + "loss": 0.0, "step": 8610 }, { "epoch": 121.00276190476191, - "grad_norm": 0.002062013605609536, + "grad_norm": 0.0002610778028611094, "learning_rate": 1.9894179894179895e-06, "loss": 0.0, "step": 8620 }, { "epoch": 121.00371428571428, - "grad_norm": 0.01906019262969494, + "grad_norm": 0.0005038412055000663, "learning_rate": 1.978835978835979e-06, "loss": 0.0, "step": 8630 }, { "epoch": 121.00466666666667, - "grad_norm": 0.001179852755740285, + "grad_norm": 0.00024570609093643725, "learning_rate": 1.968253968253968e-06, - "loss": 0.0029, + "loss": 0.0, "step": 8640 }, { "epoch": 121.00561904761905, - "grad_norm": 0.007118129171431065, + "grad_norm": 0.0002881147665902972, "learning_rate": 1.9576719576719577e-06, "loss": 0.0, "step": 8650 }, { "epoch": 121.00657142857143, - "grad_norm": 0.0005772224394604564, + "grad_norm": 0.000290913536446169, "learning_rate": 1.9470899470899473e-06, - "loss": 0.0001, + "loss": 0.0, "step": 8660 }, { "epoch": 121.0067619047619, - "eval_accuracy": 0.7297297297297297, - "eval_loss": 2.2492032051086426, - "eval_runtime": 13.4013, - "eval_samples_per_second": 5.522, - "eval_steps_per_second": 1.418, + "eval_accuracy": 0.7432432432432432, + "eval_loss": 2.4198105335235596, + "eval_runtime": 20.8054, + "eval_samples_per_second": 3.557, + "eval_steps_per_second": 0.913, "step": 8662 }, { "epoch": 122.0007619047619, - "grad_norm": 0.0007070142310112715, + "grad_norm": 0.0002238577581010759, "learning_rate": 1.936507936507937e-06, - "loss": 0.0001, + "loss": 0.0, "step": 8670 }, { "epoch": 122.00171428571429, - "grad_norm": 0.00115680240560323, + "grad_norm": 0.0014817883493378758, "learning_rate": 1.925925925925926e-06, "loss": 0.0, "step": 8680 }, { "epoch": 122.00266666666667, - "grad_norm": 0.0007530824514105916, + "grad_norm": 0.00022828546934761107, "learning_rate": 1.9153439153439156e-06, "loss": 0.0, "step": 8690 }, { "epoch": 122.00361904761905, - "grad_norm": 0.0009811901254579425, + "grad_norm": 0.0003081039758399129, "learning_rate": 1.904761904761905e-06, "loss": 0.0, "step": 8700 }, { "epoch": 122.00457142857142, - "grad_norm": 0.0007460628403350711, + "grad_norm": 0.000204382959054783, "learning_rate": 1.8941798941798945e-06, "loss": 0.0, "step": 8710 }, { "epoch": 122.00552380952381, - "grad_norm": 0.0007707420154474676, + "grad_norm": 0.0001458204205846414, "learning_rate": 1.8835978835978838e-06, "loss": 0.0, "step": 8720 }, { "epoch": 122.00647619047619, - "grad_norm": 0.0007953933090902865, + "grad_norm": 0.00013663896243087947, "learning_rate": 1.8730158730158732e-06, "loss": 0.0, "step": 8730 @@ -7220,57 +7220,57 @@ { "epoch": 122.0067619047619, "eval_accuracy": 0.7432432432432432, - "eval_loss": 2.230292558670044, - "eval_runtime": 13.5941, - "eval_samples_per_second": 5.444, - "eval_steps_per_second": 1.398, + "eval_loss": 2.4388883113861084, + "eval_runtime": 19.5939, + "eval_samples_per_second": 3.777, + "eval_steps_per_second": 0.97, "step": 8733 }, { "epoch": 123.00066666666666, - "grad_norm": 0.002947331639006734, + "grad_norm": 0.00021870314958505332, "learning_rate": 1.8624338624338625e-06, "loss": 0.0, "step": 8740 }, { "epoch": 123.00161904761904, - "grad_norm": 0.009799734689295292, + "grad_norm": 0.0002467467274982482, "learning_rate": 1.8518518518518519e-06, "loss": 0.0, "step": 8750 }, { "epoch": 123.00257142857143, - "grad_norm": 0.004149388987571001, + "grad_norm": 0.000263078574789688, "learning_rate": 1.8412698412698416e-06, "loss": 0.0, "step": 8760 }, { "epoch": 123.00352380952381, - "grad_norm": 0.0008202605531550944, + "grad_norm": 0.0003389718767721206, "learning_rate": 1.830687830687831e-06, "loss": 0.0, "step": 8770 }, { "epoch": 123.0044761904762, - "grad_norm": 0.0005715743754990399, + "grad_norm": 0.00014280724280979484, "learning_rate": 1.8201058201058203e-06, "loss": 0.0, "step": 8780 }, { "epoch": 123.00542857142857, - "grad_norm": 0.0007904635858722031, + "grad_norm": 0.00016550095460843295, "learning_rate": 1.8095238095238097e-06, "loss": 0.0, "step": 8790 }, { "epoch": 123.00638095238095, - "grad_norm": 0.0012801820412278175, + "grad_norm": 0.0003188049013260752, "learning_rate": 1.798941798941799e-06, "loss": 0.0, "step": 8800 @@ -7278,528 +7278,528 @@ { "epoch": 123.0067619047619, "eval_accuracy": 0.7432432432432432, - "eval_loss": 2.232011079788208, - "eval_runtime": 13.4112, - "eval_samples_per_second": 5.518, - "eval_steps_per_second": 1.417, + "eval_loss": 2.4763236045837402, + "eval_runtime": 19.7757, + "eval_samples_per_second": 3.742, + "eval_steps_per_second": 0.961, "step": 8804 }, { "epoch": 124.00057142857143, - "grad_norm": 0.0009024806204251945, + "grad_norm": 0.00017588127229828387, "learning_rate": 1.7883597883597886e-06, "loss": 0.0, "step": 8810 }, { "epoch": 124.0015238095238, - "grad_norm": 0.0014790045097470284, + "grad_norm": 0.00025684619322419167, "learning_rate": 1.777777777777778e-06, "loss": 0.0, "step": 8820 }, { "epoch": 124.00247619047619, - "grad_norm": 0.0011823130771517754, + "grad_norm": 0.0003379395930096507, "learning_rate": 1.7671957671957673e-06, - "loss": 0.0388, + "loss": 0.0, "step": 8830 }, { "epoch": 124.00342857142857, - "grad_norm": 0.0006398107507266104, + "grad_norm": 0.00019355901167728007, "learning_rate": 1.7566137566137567e-06, "loss": 0.0, "step": 8840 }, { "epoch": 124.00438095238096, - "grad_norm": 0.0008834226173348725, + "grad_norm": 0.005061679054051638, "learning_rate": 1.746031746031746e-06, "loss": 0.0, "step": 8850 }, { "epoch": 124.00533333333334, - "grad_norm": 0.0010215333895757794, + "grad_norm": 0.00023676594719290733, "learning_rate": 1.7354497354497358e-06, - "loss": 0.1379, + "loss": 0.0, "step": 8860 }, { "epoch": 124.00628571428571, - "grad_norm": 0.0011854860931634903, + "grad_norm": 0.0005030606989748776, "learning_rate": 1.7248677248677251e-06, "loss": 0.0, "step": 8870 }, { "epoch": 124.0067619047619, - "eval_accuracy": 0.7162162162162162, - "eval_loss": 2.2219762802124023, - "eval_runtime": 13.1023, - "eval_samples_per_second": 5.648, - "eval_steps_per_second": 1.45, + "eval_accuracy": 0.7432432432432432, + "eval_loss": 2.494699716567993, + "eval_runtime": 18.7406, + "eval_samples_per_second": 3.949, + "eval_steps_per_second": 1.014, "step": 8875 }, { "epoch": 125.00047619047619, - "grad_norm": 0.0020397165790200233, + "grad_norm": 0.00018102419562637806, "learning_rate": 1.7142857142857145e-06, "loss": 0.0, "step": 8880 }, { "epoch": 125.00142857142858, - "grad_norm": 0.0005769819836132228, + "grad_norm": 0.0009986262302845716, "learning_rate": 1.7037037037037038e-06, "loss": 0.0, "step": 8890 }, { "epoch": 125.00238095238095, - "grad_norm": 0.0009467735653743148, + "grad_norm": 0.00020767083333339542, "learning_rate": 1.6931216931216932e-06, "loss": 0.0, "step": 8900 }, { "epoch": 125.00333333333333, - "grad_norm": 0.0014792312867939472, + "grad_norm": 0.0005012244218960404, "learning_rate": 1.6825396825396827e-06, "loss": 0.0, "step": 8910 }, { "epoch": 125.00428571428571, - "grad_norm": 0.0027609718963503838, + "grad_norm": 0.00021223600197117776, "learning_rate": 1.671957671957672e-06, "loss": 0.0, "step": 8920 }, { "epoch": 125.0052380952381, - "grad_norm": 0.0006079283775761724, + "grad_norm": 0.00014842044038232416, "learning_rate": 1.6613756613756614e-06, "loss": 0.0, "step": 8930 }, { "epoch": 125.00619047619048, - "grad_norm": 0.0006986471707932651, + "grad_norm": 0.0009348007733933628, "learning_rate": 1.6507936507936508e-06, "loss": 0.0, "step": 8940 }, { "epoch": 125.0067619047619, - "eval_accuracy": 0.7027027027027027, - "eval_loss": 2.234332799911499, - "eval_runtime": 13.7771, - "eval_samples_per_second": 5.371, - "eval_steps_per_second": 1.379, + "eval_accuracy": 0.7297297297297297, + "eval_loss": 2.512620687484741, + "eval_runtime": 22.2948, + "eval_samples_per_second": 3.319, + "eval_steps_per_second": 0.852, "step": 8946 }, { "epoch": 126.00038095238095, - "grad_norm": 0.0005596214905381203, + "grad_norm": 0.00016746499750297517, "learning_rate": 1.6402116402116404e-06, "loss": 0.0, "step": 8950 }, { "epoch": 126.00133333333333, - "grad_norm": 0.0014171155635267496, + "grad_norm": 0.00012043194874422625, "learning_rate": 1.62962962962963e-06, - "loss": 0.0461, + "loss": 0.0, "step": 8960 }, { "epoch": 126.00228571428572, - "grad_norm": 0.026397421956062317, + "grad_norm": 0.0005607526400126517, "learning_rate": 1.6190476190476193e-06, "loss": 0.0, "step": 8970 }, { "epoch": 126.00323809523809, - "grad_norm": 0.0020371831487864256, + "grad_norm": 0.005011443514376879, "learning_rate": 1.6084656084656086e-06, - "loss": 0.0063, + "loss": 0.0, "step": 8980 }, { "epoch": 126.00419047619047, - "grad_norm": 0.0006835910025984049, + "grad_norm": 0.0001848703541327268, "learning_rate": 1.597883597883598e-06, "loss": 0.0, "step": 8990 }, { "epoch": 126.00514285714286, - "grad_norm": 0.0007236665696837008, + "grad_norm": 0.00013767703785561025, "learning_rate": 1.5873015873015873e-06, "loss": 0.0, "step": 9000 }, { "epoch": 126.00609523809524, - "grad_norm": 0.000860642408952117, + "grad_norm": 0.0002853712940122932, "learning_rate": 1.5767195767195769e-06, "loss": 0.0, "step": 9010 }, { "epoch": 126.0067619047619, - "eval_accuracy": 0.7162162162162162, - "eval_loss": 2.3466248512268066, - "eval_runtime": 13.333, - "eval_samples_per_second": 5.55, - "eval_steps_per_second": 1.425, + "eval_accuracy": 0.7297297297297297, + "eval_loss": 2.531374931335449, + "eval_runtime": 25.9752, + "eval_samples_per_second": 2.849, + "eval_steps_per_second": 0.731, "step": 9017 }, { "epoch": 127.00028571428571, - "grad_norm": 0.004317994695156813, + "grad_norm": 0.0004919490311294794, "learning_rate": 1.5661375661375664e-06, "loss": 0.0, "step": 9020 }, { "epoch": 127.0012380952381, - "grad_norm": 0.001654667779803276, + "grad_norm": 0.000153199172927998, "learning_rate": 1.5555555555555558e-06, "loss": 0.0, "step": 9030 }, { "epoch": 127.00219047619048, - "grad_norm": 0.0008680069586262107, + "grad_norm": 0.00021435142843984067, "learning_rate": 1.5449735449735451e-06, "loss": 0.0, "step": 9040 }, { "epoch": 127.00314285714286, - "grad_norm": 0.0006574925500899553, + "grad_norm": 0.0001620359835214913, "learning_rate": 1.5343915343915345e-06, - "loss": 0.0008, + "loss": 0.0, "step": 9050 }, { "epoch": 127.00409523809523, - "grad_norm": 0.0053094481118023396, + "grad_norm": 0.00019414816051721573, "learning_rate": 1.523809523809524e-06, "loss": 0.0, "step": 9060 }, { "epoch": 127.00504761904762, - "grad_norm": 0.0006402261788025498, + "grad_norm": 0.00018438031838741153, "learning_rate": 1.5132275132275134e-06, "loss": 0.0, "step": 9070 }, { "epoch": 127.006, - "grad_norm": 0.0006702612736262381, + "grad_norm": 0.00021760053641628474, "learning_rate": 1.5026455026455028e-06, "loss": 0.0, "step": 9080 }, { "epoch": 127.0067619047619, - "eval_accuracy": 0.7027027027027027, - "eval_loss": 2.428300142288208, - "eval_runtime": 12.6965, - "eval_samples_per_second": 5.828, - "eval_steps_per_second": 1.496, + "eval_accuracy": 0.7297297297297297, + "eval_loss": 2.542877197265625, + "eval_runtime": 14.1474, + "eval_samples_per_second": 5.231, + "eval_steps_per_second": 1.343, "step": 9088 }, { "epoch": 128.00019047619048, - "grad_norm": 0.059855397790670395, + "grad_norm": 0.004726231098175049, "learning_rate": 1.492063492063492e-06, "loss": 0.0, "step": 9090 }, { "epoch": 128.00114285714287, - "grad_norm": 0.0010002696653828025, + "grad_norm": 0.0004024128429591656, "learning_rate": 1.4814814814814815e-06, - "loss": 0.0048, + "loss": 0.0, "step": 9100 }, { "epoch": 128.00209523809525, - "grad_norm": 0.000551008153706789, + "grad_norm": 0.00015126141079235822, "learning_rate": 1.4708994708994708e-06, "loss": 0.0, "step": 9110 }, { "epoch": 128.0030476190476, - "grad_norm": 0.0026592148933559656, + "grad_norm": 0.0005607677157968283, "learning_rate": 1.4603174603174606e-06, "loss": 0.0, "step": 9120 }, { "epoch": 128.004, - "grad_norm": 0.008936284109950066, + "grad_norm": 0.0008643632754683495, "learning_rate": 1.44973544973545e-06, "loss": 0.0, "step": 9130 }, { "epoch": 128.00495238095237, - "grad_norm": 0.058468934148550034, + "grad_norm": 0.00021982158068567514, "learning_rate": 1.4391534391534393e-06, "loss": 0.0, "step": 9140 }, { "epoch": 128.00590476190476, - "grad_norm": 0.001193960546515882, + "grad_norm": 0.00013301124272402376, "learning_rate": 1.4285714285714286e-06, "loss": 0.0, "step": 9150 }, { "epoch": 128.0067619047619, - "eval_accuracy": 0.7162162162162162, - "eval_loss": 2.344726085662842, - "eval_runtime": 13.824, - "eval_samples_per_second": 5.353, - "eval_steps_per_second": 1.374, + "eval_accuracy": 0.7297297297297297, + "eval_loss": 2.56596040725708, + "eval_runtime": 13.96, + "eval_samples_per_second": 5.301, + "eval_steps_per_second": 1.361, "step": 9159 }, { "epoch": 129.00009523809524, - "grad_norm": 0.001709073898382485, + "grad_norm": 0.0001881965872598812, "learning_rate": 1.417989417989418e-06, "loss": 0.0, "step": 9160 }, { "epoch": 129.00104761904763, - "grad_norm": 0.0009027041378431022, + "grad_norm": 0.00011493435158627108, "learning_rate": 1.4074074074074075e-06, "loss": 0.0, "step": 9170 }, { "epoch": 129.002, - "grad_norm": 0.0006348963361233473, + "grad_norm": 0.00022677952074445784, "learning_rate": 1.3968253968253969e-06, "loss": 0.0, "step": 9180 }, { "epoch": 129.0029523809524, - "grad_norm": 0.0015575089491903782, + "grad_norm": 0.0008895723149180412, "learning_rate": 1.3862433862433862e-06, - "loss": 0.0001, + "loss": 0.0, "step": 9190 }, { "epoch": 129.00390476190475, - "grad_norm": 0.0006529614911414683, + "grad_norm": 0.00014621164882555604, "learning_rate": 1.3756613756613758e-06, "loss": 0.0, "step": 9200 }, { "epoch": 129.00485714285713, - "grad_norm": 0.0008999014389701188, + "grad_norm": 0.0009619208867661655, "learning_rate": 1.3650793650793652e-06, - "loss": 0.0006, + "loss": 0.0, "step": 9210 }, { "epoch": 129.00580952380952, - "grad_norm": 0.002865071641281247, + "grad_norm": 0.0004876498715020716, "learning_rate": 1.3544973544973547e-06, "loss": 0.0, "step": 9220 }, { "epoch": 129.0067619047619, - "grad_norm": 0.0017597370315343142, + "grad_norm": 0.00033994432305917144, "learning_rate": 1.343915343915344e-06, "loss": 0.0, "step": 9230 }, { "epoch": 129.0067619047619, - "eval_accuracy": 0.6891891891891891, - "eval_loss": 2.748161554336548, - "eval_runtime": 13.6729, - "eval_samples_per_second": 5.412, - "eval_steps_per_second": 1.39, + "eval_accuracy": 0.7162162162162162, + "eval_loss": 2.582768440246582, + "eval_runtime": 14.486, + "eval_samples_per_second": 5.108, + "eval_steps_per_second": 1.312, "step": 9230 }, { "epoch": 130.00095238095238, - "grad_norm": 0.0004807582008652389, + "grad_norm": 0.00011261526378802955, "learning_rate": 1.3333333333333334e-06, "loss": 0.0, "step": 9240 }, { "epoch": 130.00190476190477, - "grad_norm": 0.6759098172187805, + "grad_norm": 0.00062142638489604, "learning_rate": 1.3227513227513228e-06, - "loss": 0.0001, + "loss": 0.0, "step": 9250 }, { "epoch": 130.00285714285715, - "grad_norm": 0.0005689957761205733, + "grad_norm": 0.00016485525702591985, "learning_rate": 1.3121693121693121e-06, "loss": 0.0, "step": 9260 }, { "epoch": 130.00380952380954, - "grad_norm": 0.001238631084561348, + "grad_norm": 0.00027830738690681756, "learning_rate": 1.3015873015873019e-06, "loss": 0.0, "step": 9270 }, { "epoch": 130.0047619047619, - "grad_norm": 0.0006152084679342806, + "grad_norm": 0.00020398409105837345, "learning_rate": 1.2910052910052912e-06, "loss": 0.0, "step": 9280 }, { "epoch": 130.00571428571428, - "grad_norm": 0.0011061441618949175, + "grad_norm": 0.0006426956388168037, "learning_rate": 1.2804232804232806e-06, "loss": 0.0, "step": 9290 }, { "epoch": 130.00666666666666, - "grad_norm": 0.0007120903464965522, + "grad_norm": 0.0002507289173081517, "learning_rate": 1.26984126984127e-06, "loss": 0.0, "step": 9300 }, { "epoch": 130.0067619047619, - "eval_accuracy": 0.7297297297297297, - "eval_loss": 2.49480938911438, - "eval_runtime": 13.9419, - "eval_samples_per_second": 5.308, - "eval_steps_per_second": 1.363, + "eval_accuracy": 0.7162162162162162, + "eval_loss": 2.599635124206543, + "eval_runtime": 14.2587, + "eval_samples_per_second": 5.19, + "eval_steps_per_second": 1.333, "step": 9301 }, { "epoch": 131.00085714285714, - "grad_norm": 0.002492654137313366, + "grad_norm": 0.00013853039126843214, "learning_rate": 1.2592592592592593e-06, "loss": 0.0, "step": 9310 }, { "epoch": 131.00180952380953, - "grad_norm": 0.0006992339622229338, + "grad_norm": 0.0019195597851648927, "learning_rate": 1.2486772486772486e-06, "loss": 0.0, "step": 9320 }, { "epoch": 131.0027619047619, - "grad_norm": 0.0010845977813005447, + "grad_norm": 0.00017239370208699256, "learning_rate": 1.2380952380952382e-06, "loss": 0.0, "step": 9330 }, { "epoch": 131.0037142857143, - "grad_norm": 0.0005202058237046003, + "grad_norm": 0.0006013477686792612, "learning_rate": 1.2275132275132276e-06, "loss": 0.0, "step": 9340 }, { "epoch": 131.00466666666668, - "grad_norm": 0.00109467888250947, + "grad_norm": 0.00041599702672101557, "learning_rate": 1.216931216931217e-06, "loss": 0.0, "step": 9350 }, { "epoch": 131.00561904761904, - "grad_norm": 0.002140698954463005, + "grad_norm": 0.00018188441754318774, "learning_rate": 1.2063492063492065e-06, "loss": 0.0, "step": 9360 }, { "epoch": 131.00657142857142, - "grad_norm": 0.0006515824934467673, + "grad_norm": 0.0003828182816505432, "learning_rate": 1.1957671957671958e-06, "loss": 0.0, "step": 9370 }, { "epoch": 131.0067619047619, - "eval_accuracy": 0.7027027027027027, - "eval_loss": 2.556098699569702, - "eval_runtime": 14.1137, - "eval_samples_per_second": 5.243, - "eval_steps_per_second": 1.346, + "eval_accuracy": 0.7162162162162162, + "eval_loss": 2.608135223388672, + "eval_runtime": 13.977, + "eval_samples_per_second": 5.294, + "eval_steps_per_second": 1.359, "step": 9372 }, { "epoch": 132.0007619047619, - "grad_norm": 1.6617872714996338, + "grad_norm": 0.0003586815728340298, "learning_rate": 1.1851851851851854e-06, - "loss": 0.0002, + "loss": 0.0, "step": 9380 }, { "epoch": 132.00171428571429, - "grad_norm": 0.001978787826374173, + "grad_norm": 9.796415542950854e-05, "learning_rate": 1.1746031746031747e-06, "loss": 0.0, "step": 9390 }, { "epoch": 132.00266666666667, - "grad_norm": 0.0004866773379035294, + "grad_norm": 0.00012283321120776236, "learning_rate": 1.164021164021164e-06, "loss": 0.0, "step": 9400 }, { "epoch": 132.00361904761905, - "grad_norm": 0.0009006352629512548, + "grad_norm": 0.0002064243599306792, "learning_rate": 1.1534391534391536e-06, "loss": 0.0, "step": 9410 }, { "epoch": 132.00457142857144, - "grad_norm": 0.0334663987159729, + "grad_norm": 0.00012249739666003734, "learning_rate": 1.142857142857143e-06, "loss": 0.0, "step": 9420 }, { "epoch": 132.00552380952382, - "grad_norm": 0.0006234691245481372, + "grad_norm": 0.00016905261145439, "learning_rate": 1.1322751322751323e-06, "loss": 0.0, "step": 9430 }, { "epoch": 132.00647619047618, - "grad_norm": 0.0007475957390852273, + "grad_norm": 0.00016746780602261424, "learning_rate": 1.1216931216931217e-06, "loss": 0.0, "step": 9440 @@ -7807,347 +7807,347 @@ { "epoch": 132.0067619047619, "eval_accuracy": 0.7162162162162162, - "eval_loss": 2.4132068157196045, - "eval_runtime": 14.323, - "eval_samples_per_second": 5.167, - "eval_steps_per_second": 1.327, + "eval_loss": 2.6265084743499756, + "eval_runtime": 14.002, + "eval_samples_per_second": 5.285, + "eval_steps_per_second": 1.357, "step": 9443 }, { "epoch": 133.00066666666666, - "grad_norm": 0.002039211103692651, + "grad_norm": 0.00016834806592669338, "learning_rate": 1.111111111111111e-06, "loss": 0.0, "step": 9450 }, { "epoch": 133.00161904761904, - "grad_norm": 0.0006134635186754167, + "grad_norm": 0.00013323896564543247, "learning_rate": 1.1005291005291006e-06, - "loss": 0.0001, + "loss": 0.0, "step": 9460 }, { "epoch": 133.00257142857143, - "grad_norm": 0.0005841179518029094, + "grad_norm": 0.0001283081219298765, "learning_rate": 1.08994708994709e-06, "loss": 0.0, "step": 9470 }, { "epoch": 133.0035238095238, - "grad_norm": 0.0006910832016728818, + "grad_norm": 0.00014765470405109227, "learning_rate": 1.0793650793650795e-06, "loss": 0.0, "step": 9480 }, { "epoch": 133.0044761904762, - "grad_norm": 0.0005905992584303021, + "grad_norm": 0.0001391248806612566, "learning_rate": 1.0687830687830689e-06, "loss": 0.0, "step": 9490 }, { "epoch": 133.00542857142858, - "grad_norm": 0.0006618525367230177, + "grad_norm": 0.00020245308405719697, "learning_rate": 1.0582010582010582e-06, "loss": 0.0, "step": 9500 }, { "epoch": 133.00638095238097, - "grad_norm": 0.0007083863019943237, + "grad_norm": 0.00014681214815936983, "learning_rate": 1.0476190476190478e-06, "loss": 0.0, "step": 9510 }, { "epoch": 133.0067619047619, - "eval_accuracy": 0.7297297297297297, - "eval_loss": 2.3920769691467285, - "eval_runtime": 17.0856, - "eval_samples_per_second": 4.331, - "eval_steps_per_second": 1.112, + "eval_accuracy": 0.7162162162162162, + "eval_loss": 2.6523962020874023, + "eval_runtime": 14.6803, + "eval_samples_per_second": 5.041, + "eval_steps_per_second": 1.294, "step": 9514 }, { "epoch": 134.00057142857142, - "grad_norm": 0.0014881724491715431, + "grad_norm": 0.00017680463497526944, "learning_rate": 1.0370370370370371e-06, "loss": 0.0, "step": 9520 }, { "epoch": 134.0015238095238, - "grad_norm": 0.0006462593446485698, + "grad_norm": 0.00021717413619626313, "learning_rate": 1.0264550264550267e-06, "loss": 0.0, "step": 9530 }, { "epoch": 134.0024761904762, - "grad_norm": 0.0007744215545244515, + "grad_norm": 0.00017716505681164563, "learning_rate": 1.015873015873016e-06, "loss": 0.0, "step": 9540 }, { "epoch": 134.00342857142857, - "grad_norm": 0.0005738343461416662, + "grad_norm": 0.00026766807422973216, "learning_rate": 1.0052910052910054e-06, "loss": 0.0, "step": 9550 }, { "epoch": 134.00438095238096, - "grad_norm": 0.026759404689073563, + "grad_norm": 0.0002117603289661929, "learning_rate": 9.947089947089947e-07, "loss": 0.0, "step": 9560 }, { "epoch": 134.00533333333334, - "grad_norm": 0.0006589809199795127, + "grad_norm": 0.00016684371803421527, "learning_rate": 9.84126984126984e-07, "loss": 0.0, "step": 9570 }, { "epoch": 134.00628571428572, - "grad_norm": 0.0007503838278353214, + "grad_norm": 0.00020365270029287785, "learning_rate": 9.735449735449736e-07, "loss": 0.0, "step": 9580 }, { "epoch": 134.0067619047619, - "eval_accuracy": 0.7297297297297297, - "eval_loss": 2.396359443664551, - "eval_runtime": 14.3582, - "eval_samples_per_second": 5.154, - "eval_steps_per_second": 1.323, + "eval_accuracy": 0.7162162162162162, + "eval_loss": 2.663364887237549, + "eval_runtime": 13.7653, + "eval_samples_per_second": 5.376, + "eval_steps_per_second": 1.38, "step": 9585 }, { "epoch": 135.00047619047618, - "grad_norm": 0.0008084605797193944, + "grad_norm": 0.0001860986085375771, "learning_rate": 9.62962962962963e-07, "loss": 0.0, "step": 9590 }, { "epoch": 135.00142857142856, - "grad_norm": 0.002220474649220705, + "grad_norm": 0.00012733951734844595, "learning_rate": 9.523809523809525e-07, "loss": 0.0, "step": 9600 }, { "epoch": 135.00238095238095, - "grad_norm": 0.0009744897834025323, + "grad_norm": 0.00011300836922600865, "learning_rate": 9.417989417989419e-07, "loss": 0.0, "step": 9610 }, { "epoch": 135.00333333333333, - "grad_norm": 0.0007446310482919216, + "grad_norm": 0.0004088705172762275, "learning_rate": 9.312169312169313e-07, "loss": 0.0, "step": 9620 }, { "epoch": 135.00428571428571, - "grad_norm": 0.0007439861074090004, + "grad_norm": 0.0001340518647339195, "learning_rate": 9.206349206349208e-07, "loss": 0.0, "step": 9630 }, { "epoch": 135.0052380952381, - "grad_norm": 0.0013515371829271317, + "grad_norm": 0.00015032911323942244, "learning_rate": 9.100529100529102e-07, - "loss": 0.0017, + "loss": 0.0, "step": 9640 }, { "epoch": 135.00619047619048, - "grad_norm": 0.0014509283937513828, + "grad_norm": 0.00011336953321006149, "learning_rate": 8.994708994708995e-07, "loss": 0.0, "step": 9650 }, { "epoch": 135.0067619047619, - "eval_accuracy": 0.7027027027027027, - "eval_loss": 2.5452113151550293, - "eval_runtime": 14.5336, - "eval_samples_per_second": 5.092, - "eval_steps_per_second": 1.307, + "eval_accuracy": 0.7162162162162162, + "eval_loss": 2.6924996376037598, + "eval_runtime": 13.8914, + "eval_samples_per_second": 5.327, + "eval_steps_per_second": 1.368, "step": 9656 }, { "epoch": 136.00038095238097, - "grad_norm": 0.0016311920480802655, + "grad_norm": 0.0001399925968144089, "learning_rate": 8.88888888888889e-07, "loss": 0.0, "step": 9660 }, { "epoch": 136.00133333333332, - "grad_norm": 0.0009233981836587191, + "grad_norm": 0.00018644209194462746, "learning_rate": 8.783068783068783e-07, "loss": 0.0, "step": 9670 }, { "epoch": 136.0022857142857, - "grad_norm": 0.0005894800997339189, + "grad_norm": 0.00031613794271834195, "learning_rate": 8.677248677248679e-07, "loss": 0.0, "step": 9680 }, { "epoch": 136.0032380952381, - "grad_norm": 0.0009672031155787408, + "grad_norm": 0.0002503306313883513, "learning_rate": 8.571428571428572e-07, - "loss": 0.0001, + "loss": 0.0, "step": 9690 }, { "epoch": 136.00419047619047, - "grad_norm": 0.0009769980097189546, + "grad_norm": 0.00015492939564865083, "learning_rate": 8.465608465608466e-07, "loss": 0.0, "step": 9700 }, { "epoch": 136.00514285714286, - "grad_norm": 0.0004842029884457588, + "grad_norm": 0.00011654701665975153, "learning_rate": 8.35978835978836e-07, "loss": 0.0, "step": 9710 }, { "epoch": 136.00609523809524, - "grad_norm": 0.018822021782398224, + "grad_norm": 0.0013862367486581206, "learning_rate": 8.253968253968254e-07, "loss": 0.0, "step": 9720 }, { "epoch": 136.0067619047619, - "eval_accuracy": 0.7027027027027027, - "eval_loss": 2.528768539428711, - "eval_runtime": 14.3183, - "eval_samples_per_second": 5.168, - "eval_steps_per_second": 1.327, + "eval_accuracy": 0.7162162162162162, + "eval_loss": 2.7700693607330322, + "eval_runtime": 15.7687, + "eval_samples_per_second": 4.693, + "eval_steps_per_second": 1.205, "step": 9727 }, { "epoch": 137.00028571428572, - "grad_norm": 0.0009558707824908197, + "grad_norm": 0.00044550379971042275, "learning_rate": 8.14814814814815e-07, "loss": 0.0, "step": 9730 }, { "epoch": 137.0012380952381, - "grad_norm": 0.0007363679469563067, + "grad_norm": 0.00019741806318052113, "learning_rate": 8.042328042328043e-07, "loss": 0.0, "step": 9740 }, { "epoch": 137.00219047619046, - "grad_norm": 0.0004333931137807667, + "grad_norm": 9.011803922476247e-05, "learning_rate": 7.936507936507937e-07, "loss": 0.0, "step": 9750 }, { "epoch": 137.00314285714285, - "grad_norm": 0.0006523500196635723, + "grad_norm": 0.00011569274647627026, "learning_rate": 7.830687830687832e-07, - "loss": 0.0006, + "loss": 0.0, "step": 9760 }, { "epoch": 137.00409523809523, - "grad_norm": 0.0004806246142834425, + "grad_norm": 0.00016636776854284108, "learning_rate": 7.724867724867726e-07, "loss": 0.0, "step": 9770 }, { "epoch": 137.00504761904762, - "grad_norm": 0.0015316602075472474, + "grad_norm": 0.0002474163193255663, "learning_rate": 7.61904761904762e-07, "loss": 0.0, "step": 9780 }, { "epoch": 137.006, - "grad_norm": 0.0005167218623682857, + "grad_norm": 0.00030764579423703253, "learning_rate": 7.513227513227514e-07, "loss": 0.0, "step": 9790 }, { "epoch": 137.0067619047619, - "eval_accuracy": 0.7162162162162162, - "eval_loss": 2.497872829437256, - "eval_runtime": 14.6991, - "eval_samples_per_second": 5.034, - "eval_steps_per_second": 1.293, + "eval_accuracy": 0.7027027027027027, + "eval_loss": 2.777440309524536, + "eval_runtime": 75.651, + "eval_samples_per_second": 0.978, + "eval_steps_per_second": 0.251, "step": 9798 }, { "epoch": 138.00019047619048, - "grad_norm": 0.0007964400574564934, + "grad_norm": 0.0002310011041117832, "learning_rate": 7.407407407407407e-07, "loss": 0.0, "step": 9800 }, { "epoch": 138.00114285714287, - "grad_norm": 0.006182149518281221, + "grad_norm": 0.00014927572919987142, "learning_rate": 7.301587301587303e-07, "loss": 0.0, "step": 9810 }, { "epoch": 138.00209523809525, - "grad_norm": 0.0004261100257281214, + "grad_norm": 0.0002394427574472502, "learning_rate": 7.195767195767196e-07, "loss": 0.0, "step": 9820 }, { "epoch": 138.0030476190476, - "grad_norm": 0.00047485376126132905, + "grad_norm": 0.0006395941600203514, "learning_rate": 7.08994708994709e-07, "loss": 0.0, "step": 9830 }, { "epoch": 138.004, - "grad_norm": 0.000458546302979812, + "grad_norm": 0.00022553169401362538, "learning_rate": 6.984126984126984e-07, "loss": 0.0, "step": 9840 }, { "epoch": 138.00495238095237, - "grad_norm": 0.0007714477251283824, + "grad_norm": 0.0001349856611341238, "learning_rate": 6.878306878306879e-07, "loss": 0.0, "step": 9850 }, { "epoch": 138.00590476190476, - "grad_norm": 0.0005998592241667211, + "grad_norm": 0.00011537998216226697, "learning_rate": 6.772486772486774e-07, "loss": 0.0, "step": 9860 @@ -8155,238 +8155,238 @@ { "epoch": 138.0067619047619, "eval_accuracy": 0.7162162162162162, - "eval_loss": 2.499077796936035, - "eval_runtime": 14.1235, - "eval_samples_per_second": 5.239, - "eval_steps_per_second": 1.345, + "eval_loss": 2.7755815982818604, + "eval_runtime": 18.245, + "eval_samples_per_second": 4.056, + "eval_steps_per_second": 1.041, "step": 9869 }, { "epoch": 139.00009523809524, - "grad_norm": 0.0015940848970785737, + "grad_norm": 0.00015412215725518763, "learning_rate": 6.666666666666667e-07, "loss": 0.0, "step": 9870 }, { "epoch": 139.00104761904763, - "grad_norm": 0.0170315932482481, + "grad_norm": 0.000286016525933519, "learning_rate": 6.560846560846561e-07, "loss": 0.0, "step": 9880 }, { "epoch": 139.002, - "grad_norm": 0.0007317705894820392, + "grad_norm": 0.0003684030089061707, "learning_rate": 6.455026455026456e-07, "loss": 0.0, "step": 9890 }, { "epoch": 139.0029523809524, - "grad_norm": 0.000726180849596858, + "grad_norm": 9.316956857219338e-05, "learning_rate": 6.34920634920635e-07, "loss": 0.0, "step": 9900 }, { "epoch": 139.00390476190475, - "grad_norm": 0.0006973618292249739, + "grad_norm": 0.00022484293731395155, "learning_rate": 6.243386243386243e-07, "loss": 0.0, "step": 9910 }, { "epoch": 139.00485714285713, - "grad_norm": 0.0011956560192629695, + "grad_norm": 0.00016070107812993228, "learning_rate": 6.137566137566138e-07, "loss": 0.0, "step": 9920 }, { "epoch": 139.00580952380952, - "grad_norm": 0.0011069370666518807, + "grad_norm": 0.00013297729310579598, "learning_rate": 6.031746031746032e-07, "loss": 0.0, "step": 9930 }, { "epoch": 139.0067619047619, - "grad_norm": 0.2911625802516937, + "grad_norm": 0.0006777092348784208, "learning_rate": 5.925925925925927e-07, - "loss": 0.0001, + "loss": 0.0, "step": 9940 }, { "epoch": 139.0067619047619, "eval_accuracy": 0.7162162162162162, - "eval_loss": 2.4992563724517822, - "eval_runtime": 15.9301, - "eval_samples_per_second": 4.645, - "eval_steps_per_second": 1.193, + "eval_loss": 2.7789177894592285, + "eval_runtime": 16.5578, + "eval_samples_per_second": 4.469, + "eval_steps_per_second": 1.147, "step": 9940 }, { "epoch": 140.00095238095238, - "grad_norm": 0.0005724221118725836, + "grad_norm": 0.00020421307999640703, "learning_rate": 5.82010582010582e-07, "loss": 0.0, "step": 9950 }, { "epoch": 140.00190476190477, - "grad_norm": 0.0006082432228140533, + "grad_norm": 0.00014328365796245635, "learning_rate": 5.714285714285715e-07, "loss": 0.0, "step": 9960 }, { "epoch": 140.00285714285715, - "grad_norm": 0.000528970267623663, + "grad_norm": 0.00010416995792184025, "learning_rate": 5.608465608465608e-07, "loss": 0.0, "step": 9970 }, { "epoch": 140.00380952380954, - "grad_norm": 0.000567153561860323, + "grad_norm": 0.00011329493281664327, "learning_rate": 5.502645502645503e-07, "loss": 0.0, "step": 9980 }, { "epoch": 140.0047619047619, - "grad_norm": 0.0006368610193021595, + "grad_norm": 0.00016567722195759416, "learning_rate": 5.396825396825398e-07, "loss": 0.0, "step": 9990 }, { "epoch": 140.00571428571428, - "grad_norm": 0.0006508544902317226, + "grad_norm": 0.00014367059338837862, "learning_rate": 5.291005291005291e-07, "loss": 0.0, "step": 10000 }, { "epoch": 140.00666666666666, - "grad_norm": 0.0007206543232314289, + "grad_norm": 0.00011329939297866076, "learning_rate": 5.185185185185186e-07, "loss": 0.0, "step": 10010 }, { "epoch": 140.0067619047619, - "eval_accuracy": 0.7027027027027027, - "eval_loss": 2.5001914501190186, - "eval_runtime": 14.6218, - "eval_samples_per_second": 5.061, - "eval_steps_per_second": 1.299, + "eval_accuracy": 0.7162162162162162, + "eval_loss": 2.7818410396575928, + "eval_runtime": 17.1649, + "eval_samples_per_second": 4.311, + "eval_steps_per_second": 1.107, "step": 10011 }, { "epoch": 141.00085714285714, - "grad_norm": 0.0006148120737634599, + "grad_norm": 0.00012502900790423155, "learning_rate": 5.07936507936508e-07, "loss": 0.0, "step": 10020 }, { "epoch": 141.00180952380953, - "grad_norm": 0.0005421801470220089, + "grad_norm": 0.0004600577231030911, "learning_rate": 4.973544973544974e-07, "loss": 0.0, "step": 10030 }, { "epoch": 141.0027619047619, - "grad_norm": 0.0006734656053595245, + "grad_norm": 0.00015750101010780782, "learning_rate": 4.867724867724868e-07, "loss": 0.0, "step": 10040 }, { "epoch": 141.0037142857143, - "grad_norm": 0.0005246912478469312, + "grad_norm": 0.00011517933307914063, "learning_rate": 4.7619047619047623e-07, "loss": 0.0, "step": 10050 }, { "epoch": 141.00466666666668, - "grad_norm": 0.0006405619787983596, + "grad_norm": 0.00012730502930935472, "learning_rate": 4.6560846560846563e-07, "loss": 0.0, "step": 10060 }, { "epoch": 141.00561904761904, - "grad_norm": 0.000817726890090853, + "grad_norm": 0.00028043414931744337, "learning_rate": 4.550264550264551e-07, "loss": 0.0, "step": 10070 }, { "epoch": 141.00657142857142, - "grad_norm": 0.0010601927060633898, + "grad_norm": 0.00022784181055612862, "learning_rate": 4.444444444444445e-07, "loss": 0.0, "step": 10080 }, { "epoch": 141.0067619047619, - "eval_accuracy": 0.7027027027027027, - "eval_loss": 2.5027854442596436, - "eval_runtime": 14.625, - "eval_samples_per_second": 5.06, - "eval_steps_per_second": 1.299, + "eval_accuracy": 0.7162162162162162, + "eval_loss": 2.716393232345581, + "eval_runtime": 19.2663, + "eval_samples_per_second": 3.841, + "eval_steps_per_second": 0.986, "step": 10082 }, { "epoch": 142.0007619047619, - "grad_norm": 0.0007274626987054944, + "grad_norm": 0.00018083921167999506, "learning_rate": 4.3386243386243395e-07, "loss": 0.0, "step": 10090 }, { "epoch": 142.00171428571429, - "grad_norm": 0.0009082194301299751, + "grad_norm": 0.00011656145215965807, "learning_rate": 4.232804232804233e-07, "loss": 0.0, "step": 10100 }, { "epoch": 142.00266666666667, - "grad_norm": 0.0008187068742699921, + "grad_norm": 0.0002118592383340001, "learning_rate": 4.126984126984127e-07, "loss": 0.0, "step": 10110 }, { "epoch": 142.00361904761905, - "grad_norm": 0.0006291703321039677, + "grad_norm": 0.00029809624538756907, "learning_rate": 4.0211640211640215e-07, "loss": 0.0, "step": 10120 }, { "epoch": 142.00457142857144, - "grad_norm": 0.0005804167012684047, + "grad_norm": 0.0001748933136695996, "learning_rate": 3.915343915343916e-07, "loss": 0.0, "step": 10130 }, { "epoch": 142.00552380952382, - "grad_norm": 0.0005321921198628843, + "grad_norm": 0.0005882106488570571, "learning_rate": 3.80952380952381e-07, "loss": 0.0, "step": 10140 }, { "epoch": 142.00647619047618, - "grad_norm": 0.0011924570426344872, + "grad_norm": 0.00013971776934340596, "learning_rate": 3.7037037037037036e-07, "loss": 0.0, "step": 10150 @@ -8394,57 +8394,57 @@ { "epoch": 142.0067619047619, "eval_accuracy": 0.7027027027027027, - "eval_loss": 2.5063016414642334, - "eval_runtime": 14.2591, - "eval_samples_per_second": 5.19, - "eval_steps_per_second": 1.332, + "eval_loss": 2.9570682048797607, + "eval_runtime": 18.9008, + "eval_samples_per_second": 3.915, + "eval_steps_per_second": 1.005, "step": 10153 }, { "epoch": 143.00066666666666, - "grad_norm": 0.0028288825415074825, + "grad_norm": 0.0005308242398314178, "learning_rate": 3.597883597883598e-07, "loss": 0.0, "step": 10160 }, { "epoch": 143.00161904761904, - "grad_norm": 0.0005451482720673084, + "grad_norm": 0.00011192076635779813, "learning_rate": 3.492063492063492e-07, "loss": 0.0, "step": 10170 }, { "epoch": 143.00257142857143, - "grad_norm": 0.0003777625097427517, + "grad_norm": 9.586880332790315e-05, "learning_rate": 3.386243386243387e-07, "loss": 0.0, "step": 10180 }, { "epoch": 143.0035238095238, - "grad_norm": 0.0012343241833150387, + "grad_norm": 0.0002831014571711421, "learning_rate": 3.2804232804232803e-07, "loss": 0.0, "step": 10190 }, { "epoch": 143.0044761904762, - "grad_norm": 0.00047643802827224135, + "grad_norm": 0.0001229299232363701, "learning_rate": 3.174603174603175e-07, "loss": 0.0, "step": 10200 }, { "epoch": 143.00542857142858, - "grad_norm": 0.0007250914350152016, + "grad_norm": 9.351440530736e-05, "learning_rate": 3.068783068783069e-07, "loss": 0.0, "step": 10210 }, { "epoch": 143.00638095238097, - "grad_norm": 0.0008642813190817833, + "grad_norm": 0.00017162870790343732, "learning_rate": 2.9629629629629634e-07, "loss": 0.0, "step": 10220 @@ -8452,57 +8452,57 @@ { "epoch": 143.0067619047619, "eval_accuracy": 0.7027027027027027, - "eval_loss": 2.5081124305725098, - "eval_runtime": 14.3888, - "eval_samples_per_second": 5.143, - "eval_steps_per_second": 1.32, + "eval_loss": 2.9561750888824463, + "eval_runtime": 16.3241, + "eval_samples_per_second": 4.533, + "eval_steps_per_second": 1.164, "step": 10224 }, { "epoch": 144.00057142857142, - "grad_norm": 0.0008032625191845, + "grad_norm": 0.00039424237911589444, "learning_rate": 2.8571428571428575e-07, "loss": 0.0, "step": 10230 }, { "epoch": 144.0015238095238, - "grad_norm": 0.0005670238751918077, + "grad_norm": 0.0001173912751255557, "learning_rate": 2.7513227513227515e-07, "loss": 0.0, "step": 10240 }, { "epoch": 144.0024761904762, - "grad_norm": 0.0024792386684566736, + "grad_norm": 0.0003806989989243448, "learning_rate": 2.6455026455026455e-07, "loss": 0.0, "step": 10250 }, { "epoch": 144.00342857142857, - "grad_norm": 0.0005777708138339221, + "grad_norm": 0.00013932188448961824, "learning_rate": 2.53968253968254e-07, "loss": 0.0, "step": 10260 }, { "epoch": 144.00438095238096, - "grad_norm": 0.000489641388412565, + "grad_norm": 0.00012908896314911544, "learning_rate": 2.433862433862434e-07, "loss": 0.0, "step": 10270 }, { "epoch": 144.00533333333334, - "grad_norm": 0.002197200432419777, + "grad_norm": 0.00014965585432946682, "learning_rate": 2.3280423280423281e-07, "loss": 0.0, "step": 10280 }, { "epoch": 144.00628571428572, - "grad_norm": 0.0007451015990227461, + "grad_norm": 0.03986010327935219, "learning_rate": 2.2222222222222224e-07, "loss": 0.0, "step": 10290 @@ -8510,57 +8510,57 @@ { "epoch": 144.0067619047619, "eval_accuracy": 0.7027027027027027, - "eval_loss": 2.5086917877197266, - "eval_runtime": 14.571, - "eval_samples_per_second": 5.079, - "eval_steps_per_second": 1.304, + "eval_loss": 2.95378041267395, + "eval_runtime": 16.0055, + "eval_samples_per_second": 4.623, + "eval_steps_per_second": 1.187, "step": 10295 }, { "epoch": 145.00047619047618, - "grad_norm": 0.0006503051263280213, + "grad_norm": 0.00011041080870199949, "learning_rate": 2.1164021164021165e-07, "loss": 0.0, "step": 10300 }, { "epoch": 145.00142857142856, - "grad_norm": 0.0007482533692382276, + "grad_norm": 0.0001778493751771748, "learning_rate": 2.0105820105820108e-07, "loss": 0.0, "step": 10310 }, { "epoch": 145.00238095238095, - "grad_norm": 0.0005445526912808418, + "grad_norm": 0.00011482149420771748, "learning_rate": 1.904761904761905e-07, "loss": 0.0, "step": 10320 }, { "epoch": 145.00333333333333, - "grad_norm": 0.000508942932356149, + "grad_norm": 9.02183455764316e-05, "learning_rate": 1.798941798941799e-07, "loss": 0.0, "step": 10330 }, { "epoch": 145.00428571428571, - "grad_norm": 0.0008865836425684392, + "grad_norm": 0.0003151354903820902, "learning_rate": 1.6931216931216934e-07, "loss": 0.0, "step": 10340 }, { "epoch": 145.0052380952381, - "grad_norm": 0.0007896738243289292, + "grad_norm": 0.00024107014178298414, "learning_rate": 1.5873015873015874e-07, "loss": 0.0, "step": 10350 }, { "epoch": 145.00619047619048, - "grad_norm": 0.00044896735926158726, + "grad_norm": 8.7294916738756e-05, "learning_rate": 1.4814814814814817e-07, "loss": 0.0, "step": 10360 @@ -8568,57 +8568,57 @@ { "epoch": 145.0067619047619, "eval_accuracy": 0.7027027027027027, - "eval_loss": 2.5091147422790527, - "eval_runtime": 14.6916, - "eval_samples_per_second": 5.037, - "eval_steps_per_second": 1.293, + "eval_loss": 2.9513986110687256, + "eval_runtime": 16.1391, + "eval_samples_per_second": 4.585, + "eval_steps_per_second": 1.177, "step": 10366 }, { "epoch": 146.00038095238097, - "grad_norm": 0.0006942891632206738, + "grad_norm": 0.0003729898016899824, "learning_rate": 1.3756613756613757e-07, "loss": 0.0, "step": 10370 }, { "epoch": 146.00133333333332, - "grad_norm": 0.0010396570432931185, + "grad_norm": 0.0001791256363503635, "learning_rate": 1.26984126984127e-07, "loss": 0.0, "step": 10380 }, { "epoch": 146.0022857142857, - "grad_norm": 0.0006404640153050423, + "grad_norm": 0.0001602688425919041, "learning_rate": 1.1640211640211641e-07, "loss": 0.0, "step": 10390 }, { "epoch": 146.0032380952381, - "grad_norm": 0.0008790025603957474, + "grad_norm": 9.487938223173842e-05, "learning_rate": 1.0582010582010582e-07, "loss": 0.0, "step": 10400 }, { "epoch": 146.00419047619047, - "grad_norm": 0.007224238943308592, + "grad_norm": 0.00012544992205221206, "learning_rate": 9.523809523809525e-08, "loss": 0.0, "step": 10410 }, { "epoch": 146.00514285714286, - "grad_norm": 0.0005538808181881905, + "grad_norm": 0.00013564463006332517, "learning_rate": 8.465608465608467e-08, "loss": 0.0, "step": 10420 }, { "epoch": 146.00609523809524, - "grad_norm": 0.0008910358301363885, + "grad_norm": 0.00021179339091759175, "learning_rate": 7.407407407407409e-08, "loss": 0.0, "step": 10430 @@ -8626,57 +8626,57 @@ { "epoch": 146.0067619047619, "eval_accuracy": 0.7027027027027027, - "eval_loss": 2.509316921234131, - "eval_runtime": 14.4766, - "eval_samples_per_second": 5.112, - "eval_steps_per_second": 1.312, + "eval_loss": 2.9517064094543457, + "eval_runtime": 14.0509, + "eval_samples_per_second": 5.267, + "eval_steps_per_second": 1.352, "step": 10437 }, { "epoch": 147.00028571428572, - "grad_norm": 0.0005938304821029305, + "grad_norm": 0.00010382410255260766, "learning_rate": 6.34920634920635e-08, "loss": 0.0, "step": 10440 }, { "epoch": 147.0012380952381, - "grad_norm": 0.0007407570374198258, + "grad_norm": 0.0006173243164084852, "learning_rate": 5.291005291005291e-08, - "loss": 0.0001, + "loss": 0.0, "step": 10450 }, { "epoch": 147.00219047619046, - "grad_norm": 0.0006653261370956898, + "grad_norm": 0.0001852709538070485, "learning_rate": 4.2328042328042335e-08, "loss": 0.0, "step": 10460 }, { "epoch": 147.00314285714285, - "grad_norm": 0.0006583406357094646, + "grad_norm": 0.00010010774713009596, "learning_rate": 3.174603174603175e-08, "loss": 0.0, "step": 10470 }, { "epoch": 147.00409523809523, - "grad_norm": 0.0006689100409857929, + "grad_norm": 0.00010709642083384097, "learning_rate": 2.1164021164021167e-08, "loss": 0.0, "step": 10480 }, { "epoch": 147.00504761904762, - "grad_norm": 0.0007720529101788998, + "grad_norm": 0.00024684463278390467, "learning_rate": 1.0582010582010584e-08, "loss": 0.0, "step": 10490 }, { "epoch": 147.006, - "grad_norm": 0.0006923283799551427, + "grad_norm": 0.00019418797455728054, "learning_rate": 0.0, "loss": 0.0, "step": 10500 @@ -8684,37 +8684,37 @@ { "epoch": 147.006, "eval_accuracy": 0.7027027027027027, - "eval_loss": 2.50508189201355, - "eval_runtime": 15.5583, - "eval_samples_per_second": 4.756, - "eval_steps_per_second": 1.221, + "eval_loss": 2.9517529010772705, + "eval_runtime": 15.1931, + "eval_samples_per_second": 4.871, + "eval_steps_per_second": 1.251, "step": 10500 }, { "epoch": 147.006, "step": 10500, - "total_flos": 5.178519827385621e+19, - "train_loss": 0.16783292878849065, - "train_runtime": 11758.6675, - "train_samples_per_second": 3.572, - "train_steps_per_second": 0.893 + "total_flos": 1.8248666263741838e+20, + "train_loss": 0.08116123437592995, + "train_runtime": 21876.7284, + "train_samples_per_second": 1.92, + "train_steps_per_second": 0.48 }, { "epoch": 147.006, - "eval_accuracy": 0.8695652173913043, - "eval_loss": 0.32631269097328186, - "eval_runtime": 5.1214, - "eval_samples_per_second": 4.491, - "eval_steps_per_second": 1.172, + "eval_accuracy": 0.7837837837837838, + "eval_loss": 0.49420419335365295, + "eval_runtime": 15.3425, + "eval_samples_per_second": 4.823, + "eval_steps_per_second": 1.238, "step": 10500 }, { "epoch": 147.006, - "eval_accuracy": 0.8695652173913043, - "eval_loss": 0.32631269097328186, - "eval_runtime": 4.5747, - "eval_samples_per_second": 5.028, - "eval_steps_per_second": 1.312, + "eval_accuracy": 0.7837837837837838, + "eval_loss": 0.49420419335365295, + "eval_runtime": 13.6975, + "eval_samples_per_second": 5.402, + "eval_steps_per_second": 1.387, "step": 10500 } ], @@ -8735,7 +8735,7 @@ "attributes": {} } }, - "total_flos": 5.178519827385621e+19, + "total_flos": 1.8248666263741838e+20, "train_batch_size": 4, "trial_name": null, "trial_params": null