diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,1795 +1,2975 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 1.1788282447247436, + "epoch": 1.9647137412079059, "eval_steps": 200, - "global_step": 30000, + "global_step": 50000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.007858854964831625, - "eval_loss": 3.1892831325531006, - "eval_runtime": 159.5257, - "eval_samples_per_second": 35.455, - "eval_steps_per_second": 4.432, + "eval_loss": 3.175461769104004, + "eval_runtime": 144.4533, + "eval_samples_per_second": 39.155, + "eval_steps_per_second": 4.894, "eval_wer": 1.0, "step": 200 }, { "epoch": 0.01571770992966325, - "eval_loss": 2.780208110809326, - "eval_runtime": 157.5706, - "eval_samples_per_second": 35.895, - "eval_steps_per_second": 4.487, + "eval_loss": 2.8796634674072266, + "eval_runtime": 143.0854, + "eval_samples_per_second": 39.529, + "eval_steps_per_second": 4.941, "eval_wer": 1.0, "step": 400 }, { "epoch": 0.01964713741207906, - "grad_norm": 4.9997968673706055, - "learning_rate": 0.0002982, - "loss": 4.719, + "grad_norm": 1.9831087589263916, + "learning_rate": 0.00029759999999999997, + "loss": 4.8076, "step": 500 }, { "epoch": 0.023576564894494872, - "eval_loss": 1.4220576286315918, - "eval_runtime": 158.6041, - "eval_samples_per_second": 35.661, - "eval_steps_per_second": 4.458, - "eval_wer": 0.8876923817624497, + "eval_loss": 1.4753953218460083, + "eval_runtime": 143.2096, + "eval_samples_per_second": 39.495, + "eval_steps_per_second": 4.937, + "eval_wer": 0.903997689011571, "step": 600 }, { "epoch": 0.0314354198593265, - "eval_loss": 1.227359414100647, - "eval_runtime": 158.5238, - "eval_samples_per_second": 35.679, - "eval_steps_per_second": 4.46, - "eval_wer": 0.8224390557044503, + "eval_loss": 1.25261652469635, + "eval_runtime": 144.6031, + "eval_samples_per_second": 39.114, + "eval_steps_per_second": 4.889, + "eval_wer": 0.8548410393028518, "step": 800 }, { "epoch": 0.03929427482415812, - "grad_norm": 2.6001393795013428, - "learning_rate": 0.0002949457627118644, - "loss": 1.0441, + "grad_norm": 2.427387237548828, + "learning_rate": 0.0002969939393939394, + "loss": 1.1153, "step": 1000 }, { "epoch": 0.03929427482415812, - "eval_loss": 1.1094719171524048, - "eval_runtime": 161.2687, - "eval_samples_per_second": 35.072, - "eval_steps_per_second": 4.384, - "eval_wer": 0.7886729469917029, + "eval_loss": 1.1311910152435303, + "eval_runtime": 144.1542, + "eval_samples_per_second": 39.236, + "eval_steps_per_second": 4.904, + "eval_wer": 0.788769238176245, "step": 1000 }, { "epoch": 0.047153129788989744, - "eval_loss": 1.091428279876709, - "eval_runtime": 158.6043, - "eval_samples_per_second": 35.661, - "eval_steps_per_second": 4.458, - "eval_wer": 0.7549228868097125, + "eval_loss": 1.0895923376083374, + "eval_runtime": 144.7009, + "eval_samples_per_second": 39.088, + "eval_steps_per_second": 4.886, + "eval_wer": 0.7734749883648152, "step": 1200 }, { "epoch": 0.055011984753821366, - "eval_loss": 1.0177329778671265, - "eval_runtime": 159.8661, - "eval_samples_per_second": 35.38, - "eval_steps_per_second": 4.422, - "eval_wer": 0.7354881160629745, + "eval_loss": 1.0287705659866333, + "eval_runtime": 143.6225, + "eval_samples_per_second": 39.381, + "eval_steps_per_second": 4.923, + "eval_wer": 0.7571054869926658, "step": 1400 }, { "epoch": 0.05894141223623718, - "grad_norm": 2.7494542598724365, - "learning_rate": 0.0002898610169491525, - "loss": 0.8033, + "grad_norm": 2.3919336795806885, + "learning_rate": 0.0002939636363636363, + "loss": 0.8282, "step": 1500 }, { "epoch": 0.062870839718653, - "eval_loss": 0.9907466769218445, - "eval_runtime": 159.6438, - "eval_samples_per_second": 35.429, - "eval_steps_per_second": 4.429, - "eval_wer": 0.7232912326876475, + "eval_loss": 0.9747628569602966, + "eval_runtime": 144.5139, + "eval_samples_per_second": 39.138, + "eval_steps_per_second": 4.892, + "eval_wer": 0.7254096387475727, "step": 1600 }, { "epoch": 0.07072969468348461, - "eval_loss": 0.9761303067207336, - "eval_runtime": 159.1571, - "eval_samples_per_second": 35.537, - "eval_steps_per_second": 4.442, - "eval_wer": 0.7145287348943204, + "eval_loss": 0.9748485088348389, + "eval_runtime": 144.3418, + "eval_samples_per_second": 39.185, + "eval_steps_per_second": 4.898, + "eval_wer": 0.7194556338367223, "step": 1800 }, { "epoch": 0.07858854964831624, - "grad_norm": 2.753251314163208, - "learning_rate": 0.00028477627118644064, - "loss": 0.7227, + "grad_norm": 2.169008255004883, + "learning_rate": 0.0002909333333333333, + "loss": 0.7335, "step": 2000 }, { "epoch": 0.07858854964831624, - "eval_loss": 0.9555273056030273, - "eval_runtime": 159.3414, - "eval_samples_per_second": 35.496, - "eval_steps_per_second": 4.437, - "eval_wer": 0.6902794049204796, + "eval_loss": 0.9882574081420898, + "eval_runtime": 145.192, + "eval_samples_per_second": 38.955, + "eval_steps_per_second": 4.869, + "eval_wer": 0.7143682495867504, "step": 2000 }, { "epoch": 0.08644740461314787, - "eval_loss": 0.8994919061660767, - "eval_runtime": 159.0426, - "eval_samples_per_second": 35.563, - "eval_steps_per_second": 4.445, - "eval_wer": 0.6747765242092086, + "eval_loss": 0.9364911317825317, + "eval_runtime": 145.4626, + "eval_samples_per_second": 38.883, + "eval_steps_per_second": 4.86, + "eval_wer": 0.7061834989006757, "step": 2200 }, { "epoch": 0.09430625957797949, - "eval_loss": 0.8897404670715332, - "eval_runtime": 158.5492, - "eval_samples_per_second": 35.673, - "eval_steps_per_second": 4.459, - "eval_wer": 0.66655967646162, + "eval_loss": 0.9164892435073853, + "eval_runtime": 145.9321, + "eval_samples_per_second": 38.758, + "eval_steps_per_second": 4.845, + "eval_wer": 0.6801688305435637, "step": 2400 }, { "epoch": 0.0982356870603953, - "grad_norm": 2.404499053955078, - "learning_rate": 0.00027969152542372877, - "loss": 0.6794, + "grad_norm": 5.276973247528076, + "learning_rate": 0.00028790303030303027, + "loss": 0.6931, "step": 2500 }, { "epoch": 0.10216511454281112, - "eval_loss": 0.8826168775558472, - "eval_runtime": 159.8456, - "eval_samples_per_second": 35.384, - "eval_steps_per_second": 4.423, - "eval_wer": 0.6559676461619939, + "eval_loss": 0.9169939756393433, + "eval_runtime": 145.3478, + "eval_samples_per_second": 38.914, + "eval_steps_per_second": 4.864, + "eval_wer": 0.6773603376610872, "step": 2600 }, { "epoch": 0.11002396950764273, - "eval_loss": 0.8744593858718872, - "eval_runtime": 159.6838, - "eval_samples_per_second": 35.42, - "eval_steps_per_second": 4.427, - "eval_wer": 0.6445571407937604, + "eval_loss": 0.9080427289009094, + "eval_runtime": 144.7759, + "eval_samples_per_second": 39.067, + "eval_steps_per_second": 4.883, + "eval_wer": 0.6692237325672835, "step": 2800 }, { "epoch": 0.11788282447247436, - "grad_norm": 2.406255006790161, - "learning_rate": 0.00027460677966101695, - "loss": 0.6513, + "grad_norm": 2.9965720176696777, + "learning_rate": 0.00028487272727272726, + "loss": 0.67, "step": 3000 }, { "epoch": 0.11788282447247436, - "eval_loss": 0.8450209498405457, - "eval_runtime": 159.2776, - "eval_samples_per_second": 35.51, - "eval_steps_per_second": 4.439, - "eval_wer": 0.6436905201328819, + "eval_loss": 0.8609287738800049, + "eval_runtime": 145.381, + "eval_samples_per_second": 38.905, + "eval_steps_per_second": 4.863, + "eval_wer": 0.6621784275649564, "step": 3000 }, { "epoch": 0.125741679437306, - "eval_loss": 0.8596389293670654, - "eval_runtime": 159.3028, - "eval_samples_per_second": 35.505, - "eval_steps_per_second": 4.438, - "eval_wer": 0.6510888928118631, + "eval_loss": 0.8863000273704529, + "eval_runtime": 144.6247, + "eval_samples_per_second": 39.108, + "eval_steps_per_second": 4.889, + "eval_wer": 0.6659177352313396, "step": 3200 }, { "epoch": 0.13360053440213762, - "eval_loss": 0.8597909212112427, - "eval_runtime": 159.9064, - "eval_samples_per_second": 35.371, - "eval_steps_per_second": 4.421, - "eval_wer": 0.6376402240374893, + "eval_loss": 0.8669990301132202, + "eval_runtime": 145.3885, + "eval_samples_per_second": 38.903, + "eval_steps_per_second": 4.863, + "eval_wer": 0.6610710789427228, "step": 3400 }, { "epoch": 0.1375299618845534, - "grad_norm": 2.2046961784362793, - "learning_rate": 0.000269522033898305, - "loss": 0.6147, + "grad_norm": 3.541180372238159, + "learning_rate": 0.0002818424242424242, + "loss": 0.6282, "step": 3500 }, { "epoch": 0.14145938936696922, - "eval_loss": 0.8516111969947815, - "eval_runtime": 160.4338, - "eval_samples_per_second": 35.254, - "eval_steps_per_second": 4.407, - "eval_wer": 0.6375439328529473, + "eval_loss": 0.8718289136886597, + "eval_runtime": 147.388, + "eval_samples_per_second": 38.375, + "eval_steps_per_second": 4.797, + "eval_wer": 0.6819983630498628, "step": 3600 }, { "epoch": 0.14931824433180085, - "eval_loss": 0.8251617550849915, - "eval_runtime": 160.6004, - "eval_samples_per_second": 35.218, - "eval_steps_per_second": 4.402, - "eval_wer": 0.6100367511354335, + "eval_loss": 0.861672580242157, + "eval_runtime": 145.355, + "eval_samples_per_second": 38.912, + "eval_steps_per_second": 4.864, + "eval_wer": 0.6481520116833304, "step": 3800 }, { "epoch": 0.15717709929663248, - "grad_norm": 1.520897388458252, - "learning_rate": 0.0002644372881355932, - "loss": 0.6092, + "grad_norm": 1.9885746240615845, + "learning_rate": 0.0002788121212121212, + "loss": 0.6311, "step": 4000 }, { "epoch": 0.15717709929663248, - "eval_loss": 0.8579581379890442, - "eval_runtime": 159.0993, - "eval_samples_per_second": 35.55, - "eval_steps_per_second": 4.444, - "eval_wer": 0.6822551395419749, + "eval_loss": 0.8504879474639893, + "eval_runtime": 145.0997, + "eval_samples_per_second": 38.98, + "eval_steps_per_second": 4.873, + "eval_wer": 0.6597230023591341, "step": 4000 }, { "epoch": 0.1650359542614641, - "eval_loss": 0.8204948306083679, - "eval_runtime": 159.818, - "eval_samples_per_second": 35.39, - "eval_steps_per_second": 4.424, - "eval_wer": 0.6135674279019756, + "eval_loss": 0.8290337324142456, + "eval_runtime": 144.8192, + "eval_samples_per_second": 39.056, + "eval_steps_per_second": 4.882, + "eval_wer": 0.6292307939208166, "step": 4200 }, { "epoch": 0.17289480922629574, - "eval_loss": 0.8033376336097717, - "eval_runtime": 159.2754, - "eval_samples_per_second": 35.511, - "eval_steps_per_second": 4.439, - "eval_wer": 0.6385068446983678, + "eval_loss": 0.8300275206565857, + "eval_runtime": 144.9963, + "eval_samples_per_second": 39.008, + "eval_steps_per_second": 4.876, + "eval_wer": 0.6567540241690873, "step": 4400 }, { "epoch": 0.17682423670871153, - "grad_norm": 2.3011837005615234, - "learning_rate": 0.00025936271186440674, - "loss": 0.5928, + "grad_norm": 3.603195905685425, + "learning_rate": 0.0002757878787878788, + "loss": 0.615, "step": 4500 }, { "epoch": 0.18075366419112734, - "eval_loss": 0.7927771210670471, - "eval_runtime": 160.1079, - "eval_samples_per_second": 35.326, - "eval_steps_per_second": 4.416, - "eval_wer": 0.6005039238657701, + "eval_loss": 0.8007863163948059, + "eval_runtime": 144.795, + "eval_samples_per_second": 39.062, + "eval_steps_per_second": 4.883, + "eval_wer": 0.610855226204041, "step": 4600 }, { "epoch": 0.18861251915595897, - "eval_loss": 0.7911030054092407, - "eval_runtime": 160.2559, - "eval_samples_per_second": 35.294, - "eval_steps_per_second": 4.412, - "eval_wer": 0.5923512702412094, + "eval_loss": 0.8038597702980042, + "eval_runtime": 144.8128, + "eval_samples_per_second": 39.057, + "eval_steps_per_second": 4.882, + "eval_wer": 0.6045160565550224, "step": 4800 }, { "epoch": 0.1964713741207906, - "grad_norm": 6.133739948272705, - "learning_rate": 0.0002542779661016949, - "loss": 0.5681, + "grad_norm": 3.389535665512085, + "learning_rate": 0.0002727575757575757, + "loss": 0.5785, "step": 5000 }, { "epoch": 0.1964713741207906, - "eval_loss": 0.7968648076057434, - "eval_runtime": 160.0012, - "eval_samples_per_second": 35.35, - "eval_steps_per_second": 4.419, - "eval_wer": 0.5944375792396206, + "eval_loss": 0.7907959818840027, + "eval_runtime": 144.5449, + "eval_samples_per_second": 39.13, + "eval_steps_per_second": 4.891, + "eval_wer": 0.6071801126606859, "step": 5000 }, { "epoch": 0.20433022908562223, - "eval_loss": 0.7932958602905273, - "eval_runtime": 159.7818, - "eval_samples_per_second": 35.398, - "eval_steps_per_second": 4.425, - "eval_wer": 0.5898958450353871, + "eval_loss": 0.7867733836174011, + "eval_runtime": 144.7418, + "eval_samples_per_second": 39.076, + "eval_steps_per_second": 4.885, + "eval_wer": 0.6037457270786859, "step": 5200 }, { "epoch": 0.21218908405045384, - "eval_loss": 0.7830468416213989, - "eval_runtime": 160.2841, - "eval_samples_per_second": 35.287, - "eval_steps_per_second": 4.411, - "eval_wer": 0.6012742533421065, + "eval_loss": 0.7709878087043762, + "eval_runtime": 146.1138, + "eval_samples_per_second": 38.71, + "eval_steps_per_second": 4.839, + "eval_wer": 0.5988348766670412, "step": 5400 }, { "epoch": 0.21611851153286965, - "grad_norm": 2.9641568660736084, - "learning_rate": 0.00024919322033898305, - "loss": 0.5806, + "grad_norm": 2.476861000061035, + "learning_rate": 0.00026972727272727266, + "loss": 0.5928, "step": 5500 }, { "epoch": 0.22004793901528547, - "eval_loss": 0.7702626585960388, - "eval_runtime": 160.806, - "eval_samples_per_second": 35.173, - "eval_steps_per_second": 4.397, - "eval_wer": 0.5789026014668357, + "eval_loss": 0.766153872013092, + "eval_runtime": 144.8164, + "eval_samples_per_second": 39.056, + "eval_steps_per_second": 4.882, + "eval_wer": 0.5747139349392563, "step": 5600 }, { "epoch": 0.2279067939801171, - "eval_loss": 0.7665734887123108, - "eval_runtime": 160.6796, - "eval_samples_per_second": 35.2, - "eval_steps_per_second": 4.4, - "eval_wer": 0.589831650912359, + "eval_loss": 0.767308235168457, + "eval_runtime": 145.7638, + "eval_samples_per_second": 38.803, + "eval_steps_per_second": 4.85, + "eval_wer": 0.5945820160164337, "step": 5800 }, { "epoch": 0.23576564894494872, - "grad_norm": 2.6571083068847656, - "learning_rate": 0.00024410847457627117, - "loss": 0.5608, + "grad_norm": 2.2588391304016113, + "learning_rate": 0.00026669696969696966, + "loss": 0.5799, "step": 6000 }, { "epoch": 0.23576564894494872, - "eval_loss": 0.7580233216285706, - "eval_runtime": 160.371, - "eval_samples_per_second": 35.268, - "eval_steps_per_second": 4.409, - "eval_wer": 0.5694500168509573, + "eval_loss": 0.7804461121559143, + "eval_runtime": 145.0414, + "eval_samples_per_second": 38.996, + "eval_steps_per_second": 4.874, + "eval_wer": 0.5990114105053682, "step": 6000 }, { "epoch": 0.24362450390978035, - "eval_loss": 0.7478851675987244, - "eval_runtime": 162.164, - "eval_samples_per_second": 34.878, - "eval_steps_per_second": 4.36, - "eval_wer": 0.5650848164850508, + "eval_loss": 0.7586621642112732, + "eval_runtime": 145.864, + "eval_samples_per_second": 38.776, + "eval_steps_per_second": 4.847, + "eval_wer": 0.5780520293367142, "step": 6200 }, { "epoch": 0.251483358874612, - "eval_loss": 0.7638738751411438, - "eval_runtime": 160.257, - "eval_samples_per_second": 35.293, - "eval_steps_per_second": 4.412, - "eval_wer": 0.5846640240086021, + "eval_loss": 0.749543309211731, + "eval_runtime": 145.8617, + "eval_samples_per_second": 38.776, + "eval_steps_per_second": 4.847, + "eval_wer": 0.5728683539022003, "step": 6400 }, { "epoch": 0.2554127863570278, - "grad_norm": 1.5677289962768555, - "learning_rate": 0.0002390237288135593, - "loss": 0.5333, + "grad_norm": 1.971763253211975, + "learning_rate": 0.00026366666666666666, + "loss": 0.5534, "step": 6500 }, { "epoch": 0.2593422138394436, - "eval_loss": 0.7297228574752808, - "eval_runtime": 160.7223, - "eval_samples_per_second": 35.191, - "eval_steps_per_second": 4.399, - "eval_wer": 0.5676044358139012, + "eval_loss": 0.7536802291870117, + "eval_runtime": 147.4299, + "eval_samples_per_second": 38.364, + "eval_steps_per_second": 4.795, + "eval_wer": 0.5768804865914525, "step": 6600 }, { "epoch": 0.26720106880427524, - "eval_loss": 0.7441245913505554, - "eval_runtime": 160.37, - "eval_samples_per_second": 35.268, - "eval_steps_per_second": 4.409, - "eval_wer": 0.5590345203896583, + "eval_loss": 0.7661583423614502, + "eval_runtime": 149.947, + "eval_samples_per_second": 37.72, + "eval_steps_per_second": 4.715, + "eval_wer": 0.581245686957359, "step": 6800 }, { "epoch": 0.2750599237691068, - "grad_norm": 3.644160032272339, - "learning_rate": 0.00023393898305084743, - "loss": 0.5406, + "grad_norm": 2.5072972774505615, + "learning_rate": 0.0002606363636363636, + "loss": 0.5592, "step": 7000 }, { "epoch": 0.2750599237691068, - "eval_loss": 0.7404661774635315, - "eval_runtime": 160.5995, - "eval_samples_per_second": 35.218, - "eval_steps_per_second": 4.402, - "eval_wer": 0.5491165283818267, + "eval_loss": 0.7571460604667664, + "eval_runtime": 145.6344, + "eval_samples_per_second": 38.837, + "eval_steps_per_second": 4.855, + "eval_wer": 0.5607998587729294, "step": 7000 }, { "epoch": 0.28291877873393845, - "eval_loss": 0.7237815856933594, - "eval_runtime": 160.4373, - "eval_samples_per_second": 35.254, - "eval_steps_per_second": 4.407, - "eval_wer": 0.5529039816404808, + "eval_loss": 0.7475385665893555, + "eval_runtime": 145.4869, + "eval_samples_per_second": 38.876, + "eval_steps_per_second": 4.86, + "eval_wer": 0.5635120604708639, "step": 7200 }, { "epoch": 0.2907776336987701, - "eval_loss": 0.7328305840492249, - "eval_runtime": 161.925, - "eval_samples_per_second": 34.93, - "eval_steps_per_second": 4.366, - "eval_wer": 0.5543964950008826, + "eval_loss": 0.7267230749130249, + "eval_runtime": 145.614, + "eval_samples_per_second": 38.842, + "eval_steps_per_second": 4.855, + "eval_wer": 0.5591950056972285, "step": 7400 }, { "epoch": 0.2947070611811859, - "grad_norm": 3.6030795574188232, - "learning_rate": 0.00022885423728813558, - "loss": 0.535, + "grad_norm": 3.4168338775634766, + "learning_rate": 0.0002576060606060606, + "loss": 0.5512, "step": 7500 }, { "epoch": 0.2986364886636017, - "eval_loss": 0.7263395190238953, - "eval_runtime": 160.6865, - "eval_samples_per_second": 35.199, - "eval_steps_per_second": 4.4, - "eval_wer": 0.5598690439890228, + "eval_loss": 0.7362108826637268, + "eval_runtime": 145.7986, + "eval_samples_per_second": 38.793, + "eval_steps_per_second": 4.849, + "eval_wer": 0.5588098409590602, "step": 7600 }, { "epoch": 0.30649534362843334, - "eval_loss": 0.7421374320983887, - "eval_runtime": 160.2249, - "eval_samples_per_second": 35.3, - "eval_steps_per_second": 4.413, - "eval_wer": 0.5594357336585836, + "eval_loss": 0.7624097466468811, + "eval_runtime": 145.4148, + "eval_samples_per_second": 38.896, + "eval_steps_per_second": 4.862, + "eval_wer": 0.581117298711303, "step": 7800 }, { "epoch": 0.31435419859326497, - "grad_norm": 3.376089096069336, - "learning_rate": 0.0002237694915254237, - "loss": 0.5195, + "grad_norm": 2.9330873489379883, + "learning_rate": 0.00025457575757575755, + "loss": 0.54, "step": 8000 }, { "epoch": 0.31435419859326497, - "eval_loss": 0.7434934377670288, - "eval_runtime": 161.1972, - "eval_samples_per_second": 35.087, - "eval_steps_per_second": 4.386, - "eval_wer": 0.5543804464701256, + "eval_loss": 0.7657227516174316, + "eval_runtime": 146.1604, + "eval_samples_per_second": 38.697, + "eval_steps_per_second": 4.837, + "eval_wer": 0.5622442265410602, "step": 8000 }, { "epoch": 0.3222130535580966, - "eval_loss": 0.7186952233314514, - "eval_runtime": 162.677, - "eval_samples_per_second": 34.768, - "eval_steps_per_second": 4.346, - "eval_wer": 0.5423921939946398, + "eval_loss": 0.7300673127174377, + "eval_runtime": 146.8709, + "eval_samples_per_second": 38.51, + "eval_steps_per_second": 4.814, + "eval_wer": 0.5453611721846865, "step": 8200 }, { "epoch": 0.3300719085229282, - "eval_loss": 0.6976691484451294, - "eval_runtime": 159.5716, - "eval_samples_per_second": 35.445, - "eval_steps_per_second": 4.431, - "eval_wer": 0.5353308404615558, + "eval_loss": 0.7118472456932068, + "eval_runtime": 146.4543, + "eval_samples_per_second": 38.62, + "eval_steps_per_second": 4.827, + "eval_wer": 0.5381553818747894, "step": 8400 }, { "epoch": 0.33400133600534404, - "grad_norm": 1.9758217334747314, - "learning_rate": 0.00021868474576271186, - "loss": 0.5023, + "grad_norm": 2.0070419311523438, + "learning_rate": 0.00025154545454545454, + "loss": 0.531, "step": 8500 }, { "epoch": 0.33793076348775986, - "eval_loss": 0.6949788928031921, - "eval_runtime": 160.6972, - "eval_samples_per_second": 35.197, - "eval_steps_per_second": 4.4, - "eval_wer": 0.5385565951437146, + "eval_loss": 0.7252832055091858, + "eval_runtime": 145.6223, + "eval_samples_per_second": 38.84, + "eval_steps_per_second": 4.855, + "eval_wer": 0.548153616536406, "step": 8600 }, { "epoch": 0.3457896184525915, - "eval_loss": 0.7155033946037292, - "eval_runtime": 159.9521, - "eval_samples_per_second": 35.361, - "eval_steps_per_second": 4.42, - "eval_wer": 0.5450883471618173, + "eval_loss": 0.7304599285125732, + "eval_runtime": 145.89, + "eval_samples_per_second": 38.769, + "eval_steps_per_second": 4.846, + "eval_wer": 0.5582962879748359, "step": 8800 }, { "epoch": 0.35364847341742306, - "grad_norm": 3.3146464824676514, - "learning_rate": 0.00021361016949152543, - "loss": 0.5106, + "grad_norm": 2.5275588035583496, + "learning_rate": 0.00024852121212121206, + "loss": 0.5406, "step": 9000 }, { "epoch": 0.35364847341742306, - "eval_loss": 0.6857195496559143, - "eval_runtime": 160.5474, - "eval_samples_per_second": 35.229, - "eval_steps_per_second": 4.404, - "eval_wer": 0.5379467509749483, + "eval_loss": 0.7097567915916443, + "eval_runtime": 145.7013, + "eval_samples_per_second": 38.819, + "eval_steps_per_second": 4.852, + "eval_wer": 0.5520213124488453, "step": 9000 }, { "epoch": 0.3615073283822547, - "eval_loss": 0.68482905626297, - "eval_runtime": 161.0662, - "eval_samples_per_second": 35.116, - "eval_steps_per_second": 4.389, - "eval_wer": 0.5329075123172473, + "eval_loss": 0.698684573173523, + "eval_runtime": 146.3052, + "eval_samples_per_second": 38.659, + "eval_steps_per_second": 4.832, + "eval_wer": 0.5372245670908828, "step": 9200 }, { "epoch": 0.3693661833470863, - "eval_loss": 0.6732301712036133, - "eval_runtime": 160.6243, - "eval_samples_per_second": 35.213, - "eval_steps_per_second": 4.402, - "eval_wer": 0.5202291730192101, + "eval_loss": 0.7044981718063354, + "eval_runtime": 145.8062, + "eval_samples_per_second": 38.791, + "eval_steps_per_second": 4.849, + "eval_wer": 0.5472548988140136, "step": 9400 }, { "epoch": 0.37329561082950213, - "grad_norm": 4.61689567565918, - "learning_rate": 0.00020852542372881352, - "loss": 0.4968, + "grad_norm": 6.208221435546875, + "learning_rate": 0.00024549090909090906, + "loss": 0.5252, "step": 9500 }, { "epoch": 0.37722503831191795, - "eval_loss": 0.6839133501052856, - "eval_runtime": 161.2367, - "eval_samples_per_second": 35.079, - "eval_steps_per_second": 4.385, - "eval_wer": 0.5274510118598642, + "eval_loss": 0.7025354504585266, + "eval_runtime": 146.2272, + "eval_samples_per_second": 38.68, + "eval_steps_per_second": 4.835, + "eval_wer": 0.5332766285246585, "step": 9600 }, { "epoch": 0.3850838932767496, - "eval_loss": 0.6766842603683472, - "eval_runtime": 160.827, - "eval_samples_per_second": 35.168, - "eval_steps_per_second": 4.396, - "eval_wer": 0.5198279597502848, + "eval_loss": 0.7077142000198364, + "eval_runtime": 145.5575, + "eval_samples_per_second": 38.857, + "eval_steps_per_second": 4.857, + "eval_wer": 0.5461796472532939, "step": 9800 }, { "epoch": 0.3929427482415812, - "grad_norm": 3.5624563694000244, - "learning_rate": 0.0002034508474576271, - "loss": 0.4824, + "grad_norm": 4.407375812530518, + "learning_rate": 0.00024246060606060606, + "loss": 0.5156, "step": 10000 }, { "epoch": 0.3929427482415812, - "eval_loss": 0.6718243956565857, - "eval_runtime": 161.1794, - "eval_samples_per_second": 35.091, - "eval_steps_per_second": 4.386, - "eval_wer": 0.5334531623629857, + "eval_loss": 0.7006597518920898, + "eval_runtime": 146.3123, + "eval_samples_per_second": 38.657, + "eval_steps_per_second": 4.832, + "eval_wer": 0.5382516730593314, "step": 10000 }, { "epoch": 0.40080160320641284, - "eval_loss": 0.6593254804611206, - "eval_runtime": 160.9535, - "eval_samples_per_second": 35.141, - "eval_steps_per_second": 4.393, - "eval_wer": 0.5175169713212755, + "eval_loss": 0.6947250962257385, + "eval_runtime": 145.4545, + "eval_samples_per_second": 38.885, + "eval_steps_per_second": 4.861, + "eval_wer": 0.5425847763637239, "step": 10200 }, { "epoch": 0.40866045817124447, - "eval_loss": 0.6799437403678894, - "eval_runtime": 159.6664, - "eval_samples_per_second": 35.424, - "eval_steps_per_second": 4.428, - "eval_wer": 0.5173885830752195, + "eval_loss": 0.7127708196640015, + "eval_runtime": 145.5874, + "eval_samples_per_second": 38.85, + "eval_steps_per_second": 4.856, + "eval_wer": 0.5361332669994062, "step": 10400 }, { "epoch": 0.4125898856536603, - "grad_norm": 2.189781427383423, - "learning_rate": 0.00019836610169491524, - "loss": 0.48, + "grad_norm": 2.721827983856201, + "learning_rate": 0.000239430303030303, + "loss": 0.5181, "step": 10500 }, { "epoch": 0.4165193131360761, - "eval_loss": 0.6662308573722839, - "eval_runtime": 160.8779, - "eval_samples_per_second": 35.157, - "eval_steps_per_second": 4.395, - "eval_wer": 0.5128949944632569, + "eval_loss": 0.6945223212242126, + "eval_runtime": 146.0143, + "eval_samples_per_second": 38.736, + "eval_steps_per_second": 4.842, + "eval_wer": 0.5276114971674343, "step": 10600 }, { "epoch": 0.42437816810090767, - "eval_loss": 0.6619213223457336, - "eval_runtime": 160.6185, - "eval_samples_per_second": 35.214, - "eval_steps_per_second": 4.402, - "eval_wer": 0.5005536743111169, + "eval_loss": 0.6985763311386108, + "eval_runtime": 146.5657, + "eval_samples_per_second": 38.59, + "eval_steps_per_second": 4.824, + "eval_wer": 0.5310619312801913, "step": 10800 }, { "epoch": 0.4322370230657393, - "grad_norm": 10.41739559173584, - "learning_rate": 0.00019328135593220337, - "loss": 0.4693, + "grad_norm": 5.38914680480957, + "learning_rate": 0.0002364, + "loss": 0.5096, "step": 11000 }, { "epoch": 0.4322370230657393, - "eval_loss": 0.6576216220855713, - "eval_runtime": 160.9844, - "eval_samples_per_second": 35.134, - "eval_steps_per_second": 4.392, - "eval_wer": 0.519940299465584, + "eval_loss": 0.6909800171852112, + "eval_runtime": 146.0039, + "eval_samples_per_second": 38.739, + "eval_steps_per_second": 4.842, + "eval_wer": 0.5293126414276773, "step": 11000 }, { "epoch": 0.44009587803057093, - "eval_loss": 0.6406122446060181, - "eval_runtime": 160.4456, - "eval_samples_per_second": 35.252, - "eval_steps_per_second": 4.406, - "eval_wer": 0.5018696538331916, + "eval_loss": 0.6855354905128479, + "eval_runtime": 146.6844, + "eval_samples_per_second": 38.559, + "eval_steps_per_second": 4.82, + "eval_wer": 0.5280608560286306, "step": 11200 }, { "epoch": 0.44795473299540256, - "eval_loss": 0.6408420205116272, - "eval_runtime": 161.6075, - "eval_samples_per_second": 34.998, - "eval_steps_per_second": 4.375, - "eval_wer": 0.5066039704065093, + "eval_loss": 0.6889775395393372, + "eval_runtime": 146.3731, + "eval_samples_per_second": 38.641, + "eval_steps_per_second": 4.83, + "eval_wer": 0.5262313235223315, "step": 11400 }, { "epoch": 0.4518841604778184, - "grad_norm": 3.5733156204223633, - "learning_rate": 0.00018819661016949152, - "loss": 0.4691, + "grad_norm": 3.3484437465667725, + "learning_rate": 0.00023336969696969694, + "loss": 0.5099, "step": 11500 }, { "epoch": 0.4558135879602342, - "eval_loss": 0.6476473212242126, - "eval_runtime": 161.2518, - "eval_samples_per_second": 35.076, - "eval_steps_per_second": 4.384, - "eval_wer": 0.5019498964869766, + "eval_loss": 0.677577018737793, + "eval_runtime": 146.1848, + "eval_samples_per_second": 38.691, + "eval_steps_per_second": 4.836, + "eval_wer": 0.5298101458811446, "step": 11600 }, { "epoch": 0.4636724429250658, - "eval_loss": 0.6423429846763611, - "eval_runtime": 161.3676, - "eval_samples_per_second": 35.05, - "eval_steps_per_second": 4.381, - "eval_wer": 0.4945996694002664, + "eval_loss": 0.6817450523376465, + "eval_runtime": 146.2301, + "eval_samples_per_second": 38.679, + "eval_steps_per_second": 4.835, + "eval_wer": 0.5141949254545747, "step": 11800 }, { "epoch": 0.47153129788989745, - "grad_norm": 2.3962831497192383, - "learning_rate": 0.00018311186440677962, - "loss": 0.4444, + "grad_norm": 4.75791597366333, + "learning_rate": 0.00023033939393939391, + "loss": 0.481, "step": 12000 }, { "epoch": 0.47153129788989745, - "eval_loss": 0.6374172568321228, - "eval_runtime": 162.3359, - "eval_samples_per_second": 34.841, - "eval_steps_per_second": 4.355, - "eval_wer": 0.4975846961210701, + "eval_loss": 0.6749030351638794, + "eval_runtime": 144.9955, + "eval_samples_per_second": 39.008, + "eval_steps_per_second": 4.876, + "eval_wer": 0.5318483092872848, "step": 12000 }, { "epoch": 0.4793901528547291, - "eval_loss": 0.6312358379364014, - "eval_runtime": 162.5747, - "eval_samples_per_second": 34.79, - "eval_steps_per_second": 4.349, - "eval_wer": 0.4961403283529393, + "eval_loss": 0.6648340225219727, + "eval_runtime": 146.7705, + "eval_samples_per_second": 38.536, + "eval_steps_per_second": 4.817, + "eval_wer": 0.513167819486126, "step": 12200 }, { "epoch": 0.4872490078195607, - "eval_loss": 0.6170411109924316, - "eval_runtime": 161.58, - "eval_samples_per_second": 35.004, - "eval_steps_per_second": 4.376, - "eval_wer": 0.4818571359792011, + "eval_loss": 0.6659471392631531, + "eval_runtime": 145.9108, + "eval_samples_per_second": 38.763, + "eval_steps_per_second": 4.845, + "eval_wer": 0.5151096917077241, "step": 12400 }, { "epoch": 0.4911784353019765, - "grad_norm": 2.623764753341675, - "learning_rate": 0.0001780372881355932, - "loss": 0.4474, + "grad_norm": 3.3849971294403076, + "learning_rate": 0.00022730909090909089, + "loss": 0.4899, "step": 12500 }, { "epoch": 0.49510786278439234, - "eval_loss": 0.6300910115242004, - "eval_runtime": 164.417, - "eval_samples_per_second": 34.4, - "eval_steps_per_second": 4.3, - "eval_wer": 0.49325159281667763, + "eval_loss": 0.6744287014007568, + "eval_runtime": 146.3152, + "eval_samples_per_second": 38.656, + "eval_steps_per_second": 4.832, + "eval_wer": 0.5207266774726774, "step": 12600 }, { "epoch": 0.502966717749224, - "eval_loss": 0.6253496408462524, - "eval_runtime": 161.3418, - "eval_samples_per_second": 35.056, - "eval_steps_per_second": 4.382, - "eval_wer": 0.4862383848758646, + "eval_loss": 0.6732743978500366, + "eval_runtime": 146.1337, + "eval_samples_per_second": 38.704, + "eval_steps_per_second": 4.838, + "eval_wer": 0.5228771805941166, "step": 12800 }, { "epoch": 0.5108255727140556, - "grad_norm": 2.9566869735717773, - "learning_rate": 0.00017295254237288134, - "loss": 0.4471, + "grad_norm": 3.489818811416626, + "learning_rate": 0.00022427878787878786, + "loss": 0.492, "step": 13000 }, { "epoch": 0.5108255727140556, - "eval_loss": 0.622020959854126, - "eval_runtime": 161.5861, - "eval_samples_per_second": 35.003, - "eval_steps_per_second": 4.375, - "eval_wer": 0.4849224053537899, + "eval_loss": 0.6456639170646667, + "eval_runtime": 146.9518, + "eval_samples_per_second": 38.489, + "eval_steps_per_second": 4.811, + "eval_wer": 0.5041645937314438, "step": 13000 }, { "epoch": 0.5186844276788872, - "eval_loss": 0.6201028823852539, - "eval_runtime": 160.9515, - "eval_samples_per_second": 35.141, - "eval_steps_per_second": 4.393, - "eval_wer": 0.48527547303044405, + "eval_loss": 0.6671249866485596, + "eval_runtime": 145.8641, + "eval_samples_per_second": 38.776, + "eval_steps_per_second": 4.847, + "eval_wer": 0.5259103529071913, "step": 13200 }, { "epoch": 0.5265432826437189, - "eval_loss": 0.6168439984321594, - "eval_runtime": 162.0987, - "eval_samples_per_second": 34.892, - "eval_steps_per_second": 4.362, - "eval_wer": 0.4848261141692478, + "eval_loss": 0.6544414162635803, + "eval_runtime": 146.5937, + "eval_samples_per_second": 38.583, + "eval_steps_per_second": 4.823, + "eval_wer": 0.5179181845902008, "step": 13400 }, { "epoch": 0.5304727101261346, - "grad_norm": 1.5596935749053955, - "learning_rate": 0.0001678677966101695, - "loss": 0.4323, + "grad_norm": 1.4167377948760986, + "learning_rate": 0.00022125454545454546, + "loss": 0.4782, "step": 13500 }, { "epoch": 0.5344021376085505, - "eval_loss": 0.6172667741775513, - "eval_runtime": 162.3681, - "eval_samples_per_second": 34.834, - "eval_steps_per_second": 4.354, - "eval_wer": 0.47707467381361235, + "eval_loss": 0.6560591459274292, + "eval_runtime": 146.1188, + "eval_samples_per_second": 38.708, + "eval_steps_per_second": 4.839, + "eval_wer": 0.5054484761920046, "step": 13600 }, { "epoch": 0.542260992573382, - "eval_loss": 0.603190004825592, - "eval_runtime": 161.2926, - "eval_samples_per_second": 35.067, - "eval_steps_per_second": 4.383, - "eval_wer": 0.4656160228531078, + "eval_loss": 0.6381711363792419, + "eval_runtime": 145.8554, + "eval_samples_per_second": 38.778, + "eval_steps_per_second": 4.847, + "eval_wer": 0.49918954919677105, "step": 13800 }, { "epoch": 0.5501198475382136, - "grad_norm": 2.978868246078491, - "learning_rate": 0.0001627830508474576, - "loss": 0.4575, + "grad_norm": 3.582862615585327, + "learning_rate": 0.0002182242424242424, + "loss": 0.507, "step": 14000 }, { "epoch": 0.5501198475382136, - "eval_loss": 0.6097469925880432, - "eval_runtime": 161.1042, - "eval_samples_per_second": 35.108, - "eval_steps_per_second": 4.388, - "eval_wer": 0.4678307200975751, + "eval_loss": 0.6555091738700867, + "eval_runtime": 148.0584, + "eval_samples_per_second": 38.201, + "eval_steps_per_second": 4.775, + "eval_wer": 0.504437418754313, "step": 14000 }, { "epoch": 0.5579787025030453, - "eval_loss": 0.5970696806907654, - "eval_runtime": 161.5846, - "eval_samples_per_second": 35.003, - "eval_steps_per_second": 4.375, - "eval_wer": 0.4673653127056218, + "eval_loss": 0.6399552822113037, + "eval_runtime": 146.0824, + "eval_samples_per_second": 38.718, + "eval_steps_per_second": 4.84, + "eval_wer": 0.49548233859190194, "step": 14200 }, { "epoch": 0.5658375574678769, - "eval_loss": 0.5976916551589966, - "eval_runtime": 161.7136, - "eval_samples_per_second": 34.975, - "eval_steps_per_second": 4.372, - "eval_wer": 0.4697565437884162, + "eval_loss": 0.6467686891555786, + "eval_runtime": 146.3336, + "eval_samples_per_second": 38.651, + "eval_steps_per_second": 4.831, + "eval_wer": 0.5014202949719954, "step": 14400 }, { "epoch": 0.5697669849502928, - "grad_norm": 3.0501327514648438, - "learning_rate": 0.00015769830508474575, - "loss": 0.4395, + "grad_norm": 2.1453781127929688, + "learning_rate": 0.0002151939393939394, + "loss": 0.4899, "step": 14500 }, { "epoch": 0.5736964124327085, - "eval_loss": 0.6056780815124512, - "eval_runtime": 162.5963, - "eval_samples_per_second": 34.786, - "eval_steps_per_second": 4.348, - "eval_wer": 0.4734316573317713, + "eval_loss": 0.6370707750320435, + "eval_runtime": 146.4635, + "eval_samples_per_second": 38.617, + "eval_steps_per_second": 4.827, + "eval_wer": 0.49723162844441593, "step": 14600 }, { "epoch": 0.5815552673975402, - "eval_loss": 0.582733690738678, - "eval_runtime": 162.9467, - "eval_samples_per_second": 34.711, - "eval_steps_per_second": 4.339, - "eval_wer": 0.4574152236362761, + "eval_loss": 0.6356329917907715, + "eval_runtime": 145.6834, + "eval_samples_per_second": 38.824, + "eval_steps_per_second": 4.853, + "eval_wer": 0.5025597406557429, "step": 14800 }, { "epoch": 0.5894141223623718, - "grad_norm": 4.3484697341918945, - "learning_rate": 0.00015261355932203388, - "loss": 0.4119, + "grad_norm": 2.615446090698242, + "learning_rate": 0.00021216363636363634, + "loss": 0.4677, "step": 15000 }, { "epoch": 0.5894141223623718, - "eval_loss": 0.5946210622787476, - "eval_runtime": 162.2892, - "eval_samples_per_second": 34.851, - "eval_steps_per_second": 4.356, - "eval_wer": 0.4640432668389209, + "eval_loss": 0.638607919216156, + "eval_runtime": 145.7689, + "eval_samples_per_second": 38.801, + "eval_steps_per_second": 4.85, + "eval_wer": 0.5021424788560608, "step": 15000 }, { "epoch": 0.5972729773272034, - "eval_loss": 0.602292001247406, - "eval_runtime": 161.4334, - "eval_samples_per_second": 35.036, - "eval_steps_per_second": 4.38, - "eval_wer": 0.47707467381361235, + "eval_loss": 0.6653130650520325, + "eval_runtime": 146.0092, + "eval_samples_per_second": 38.737, + "eval_steps_per_second": 4.842, + "eval_wer": 0.5190255332124344, "step": 15200 }, { "epoch": 0.605131832292035, - "eval_loss": 0.6129310727119446, - "eval_runtime": 161.8649, - "eval_samples_per_second": 34.943, - "eval_steps_per_second": 4.368, - "eval_wer": 0.47266132785543485, + "eval_loss": 0.6442501544952393, + "eval_runtime": 146.2404, + "eval_samples_per_second": 38.676, + "eval_steps_per_second": 4.835, + "eval_wer": 0.4998154418962944, "step": 15400 }, { "epoch": 0.6090612597744509, - "grad_norm": 4.229031085968018, - "learning_rate": 0.00014752881355932203, - "loss": 0.4125, + "grad_norm": 2.680966854095459, + "learning_rate": 0.0002091333333333333, + "loss": 0.461, "step": 15500 }, { "epoch": 0.6129906872568667, - "eval_loss": 0.590186595916748, - "eval_runtime": 162.4898, - "eval_samples_per_second": 34.808, - "eval_steps_per_second": 4.351, - "eval_wer": 0.45837813548169665, + "eval_loss": 0.6210175156593323, + "eval_runtime": 146.9594, + "eval_samples_per_second": 38.487, + "eval_steps_per_second": 4.811, + "eval_wer": 0.4896567219271076, "step": 15600 }, { "epoch": 0.6208495422216983, - "eval_loss": 0.5955421328544617, - "eval_runtime": 161.8228, - "eval_samples_per_second": 34.952, - "eval_steps_per_second": 4.369, - "eval_wer": 0.46537529489175267, + "eval_loss": 0.6395752429962158, + "eval_runtime": 146.5911, + "eval_samples_per_second": 38.584, + "eval_steps_per_second": 4.823, + "eval_wer": 0.5011635184798832, "step": 15800 }, { "epoch": 0.6287083971865299, - "grad_norm": 1.4181621074676514, - "learning_rate": 0.00014244406779661016, - "loss": 0.4039, + "grad_norm": 2.2297749519348145, + "learning_rate": 0.00020610303030303028, + "loss": 0.4528, "step": 16000 }, { "epoch": 0.6287083971865299, - "eval_loss": 0.5955237150192261, - "eval_runtime": 161.3699, - "eval_samples_per_second": 35.05, - "eval_steps_per_second": 4.381, - "eval_wer": 0.45946943557317327, + "eval_loss": 0.6226186752319336, + "eval_runtime": 147.1935, + "eval_samples_per_second": 38.426, + "eval_steps_per_second": 4.803, + "eval_wer": 0.49333183547046267, "step": 16000 }, { "epoch": 0.6365672521513616, - "eval_loss": 0.578912079334259, - "eval_runtime": 163.2091, - "eval_samples_per_second": 34.655, - "eval_steps_per_second": 4.332, - "eval_wer": 0.4497279774036687, + "eval_loss": 0.6253554224967957, + "eval_runtime": 147.0403, + "eval_samples_per_second": 38.466, + "eval_steps_per_second": 4.808, + "eval_wer": 0.49365280608560286, "step": 16200 }, { "epoch": 0.6444261071161932, - "eval_loss": 0.5779294371604919, - "eval_runtime": 164.0491, - "eval_samples_per_second": 34.477, - "eval_steps_per_second": 4.31, - "eval_wer": 0.4630322094012293, + "eval_loss": 0.6289177536964417, + "eval_runtime": 146.8167, + "eval_samples_per_second": 38.524, + "eval_steps_per_second": 4.816, + "eval_wer": 0.5013240037874532, "step": 16400 }, { "epoch": 0.648355534598609, - "grad_norm": 2.0229876041412354, - "learning_rate": 0.00013736949152542372, - "loss": 0.3969, + "grad_norm": 1.9119956493377686, + "learning_rate": 0.00020307272727272725, + "loss": 0.451, "step": 16500 }, { "epoch": 0.6522849620810248, - "eval_loss": 0.5677434802055359, - "eval_runtime": 161.201, - "eval_samples_per_second": 35.087, - "eval_steps_per_second": 4.386, - "eval_wer": 0.45507213814575276, + "eval_loss": 0.6229738593101501, + "eval_runtime": 146.4262, + "eval_samples_per_second": 38.627, + "eval_steps_per_second": 4.828, + "eval_wer": 0.49723162844441593, "step": 16600 }, { "epoch": 0.6601438170458565, - "eval_loss": 0.586939811706543, - "eval_runtime": 161.4539, - "eval_samples_per_second": 35.032, - "eval_steps_per_second": 4.379, - "eval_wer": 0.46062492978767794, + "eval_loss": 0.6153121590614319, + "eval_runtime": 146.6615, + "eval_samples_per_second": 38.565, + "eval_steps_per_second": 4.821, + "eval_wer": 0.4957391150840141, "step": 16800 }, { "epoch": 0.6680026720106881, - "grad_norm": 4.166793346405029, - "learning_rate": 0.00013229491525423729, - "loss": 0.3923, + "grad_norm": 3.115481376647949, + "learning_rate": 0.00020004848484848485, + "loss": 0.4444, "step": 17000 }, { "epoch": 0.6680026720106881, - "eval_loss": 0.5710186958312988, - "eval_runtime": 160.5637, - "eval_samples_per_second": 35.226, - "eval_steps_per_second": 4.403, - "eval_wer": 0.45017733626486495, + "eval_loss": 0.6032531261444092, + "eval_runtime": 146.667, + "eval_samples_per_second": 38.564, + "eval_steps_per_second": 4.82, + "eval_wer": 0.47476368538460306, "step": 17000 }, { "epoch": 0.6758615269755197, - "eval_loss": 0.5639811158180237, - "eval_runtime": 161.7944, - "eval_samples_per_second": 34.958, - "eval_steps_per_second": 4.37, - "eval_wer": 0.44741698897465937, + "eval_loss": 0.6153914332389832, + "eval_runtime": 146.5404, + "eval_samples_per_second": 38.597, + "eval_steps_per_second": 4.825, + "eval_wer": 0.4771388679366404, "step": 17200 }, { "epoch": 0.6837203819403513, - "eval_loss": 0.5841760039329529, - "eval_runtime": 161.0184, - "eval_samples_per_second": 35.126, - "eval_steps_per_second": 4.391, - "eval_wer": 0.4497921715266967, + "eval_loss": 0.6169700622558594, + "eval_runtime": 146.4739, + "eval_samples_per_second": 38.614, + "eval_steps_per_second": 4.827, + "eval_wer": 0.48591741426072443, "step": 17400 }, { "epoch": 0.6876498094227671, - "grad_norm": 3.127680778503418, - "learning_rate": 0.0001272101694915254, - "loss": 0.386, + "grad_norm": 3.35622501373291, + "learning_rate": 0.0001970181818181818, + "loss": 0.4357, "step": 17500 }, { "epoch": 0.691579236905183, - "eval_loss": 0.5596618056297302, - "eval_runtime": 160.919, - "eval_samples_per_second": 35.148, - "eval_steps_per_second": 4.394, - "eval_wer": 0.44403074898493045, + "eval_loss": 0.6020850539207458, + "eval_runtime": 146.4462, + "eval_samples_per_second": 38.622, + "eval_steps_per_second": 4.828, + "eval_wer": 0.4814559227102759, "step": 17600 }, { "epoch": 0.6994380918700145, - "eval_loss": 0.5620830059051514, - "eval_runtime": 160.6614, - "eval_samples_per_second": 35.204, - "eval_steps_per_second": 4.401, - "eval_wer": 0.43812488966635105, + "eval_loss": 0.6071408987045288, + "eval_runtime": 147.1123, + "eval_samples_per_second": 38.447, + "eval_steps_per_second": 4.806, + "eval_wer": 0.47303044406284606, "step": 17800 }, { "epoch": 0.7072969468348461, - "grad_norm": 17.387800216674805, - "learning_rate": 0.00012213559322033898, - "loss": 0.3851, + "grad_norm": 2.2534916400909424, + "learning_rate": 0.0001939939393939394, + "loss": 0.4413, "step": 18000 }, { "epoch": 0.7072969468348461, - "eval_loss": 0.566453218460083, - "eval_runtime": 161.6574, - "eval_samples_per_second": 34.988, - "eval_steps_per_second": 4.373, - "eval_wer": 0.434562115838295, + "eval_loss": 0.6042246222496033, + "eval_runtime": 146.518, + "eval_samples_per_second": 38.603, + "eval_steps_per_second": 4.825, + "eval_wer": 0.47656112082938806, "step": 18000 }, { "epoch": 0.7151558017996777, - "eval_loss": 0.5572646260261536, - "eval_runtime": 162.4898, - "eval_samples_per_second": 34.808, - "eval_steps_per_second": 4.351, - "eval_wer": 0.4356213188682576, + "eval_loss": 0.6118656396865845, + "eval_runtime": 147.1712, + "eval_samples_per_second": 38.431, + "eval_steps_per_second": 4.804, + "eval_wer": 0.4837508626085282, "step": 18200 }, { "epoch": 0.7230146567645094, - "eval_loss": 0.5548349022865295, - "eval_runtime": 161.0153, - "eval_samples_per_second": 35.127, - "eval_steps_per_second": 4.391, - "eval_wer": 0.4344337275922389, + "eval_loss": 0.6045942902565002, + "eval_runtime": 146.4829, + "eval_samples_per_second": 38.612, + "eval_steps_per_second": 4.827, + "eval_wer": 0.47569450016850956, "step": 18400 }, { "epoch": 0.7269440842469252, - "grad_norm": 9.4507417678833, - "learning_rate": 0.00011705084745762712, - "loss": 0.369, + "grad_norm": 3.591475248336792, + "learning_rate": 0.00019096363636363634, + "loss": 0.4375, "step": 18500 }, { "epoch": 0.730873511729341, - "eval_loss": 0.5616690516471863, - "eval_runtime": 161.4318, - "eval_samples_per_second": 35.036, - "eval_steps_per_second": 4.38, - "eval_wer": 0.43637559981383706, + "eval_loss": 0.6081308722496033, + "eval_runtime": 147.4627, + "eval_samples_per_second": 38.355, + "eval_steps_per_second": 4.794, + "eval_wer": 0.4832854552165749, "step": 18600 }, { "epoch": 0.7387323666941726, - "eval_loss": 0.5595532655715942, - "eval_runtime": 160.8301, - "eval_samples_per_second": 35.168, - "eval_steps_per_second": 4.396, - "eval_wer": 0.4393927235961548, + "eval_loss": 0.6007533073425293, + "eval_runtime": 146.3827, + "eval_samples_per_second": 38.638, + "eval_steps_per_second": 4.83, + "eval_wer": 0.4727897161014909, "step": 18800 }, { "epoch": 0.7465912216590043, - "grad_norm": 1.8793506622314453, - "learning_rate": 0.00011196610169491524, - "loss": 0.3738, + "grad_norm": 1.425370693206787, + "learning_rate": 0.0001879333333333333, + "loss": 0.4329, "step": 19000 }, { "epoch": 0.7465912216590043, - "eval_loss": 0.549248218536377, - "eval_runtime": 161.3194, - "eval_samples_per_second": 35.061, - "eval_steps_per_second": 4.383, - "eval_wer": 0.42923400362696795, + "eval_loss": 0.6008017063140869, + "eval_runtime": 147.3011, + "eval_samples_per_second": 38.398, + "eval_steps_per_second": 4.8, + "eval_wer": 0.46924299080419185, "step": 19000 }, { "epoch": 0.7544500766238359, - "eval_loss": 0.5478147268295288, - "eval_runtime": 162.2231, - "eval_samples_per_second": 34.866, - "eval_steps_per_second": 4.358, - "eval_wer": 0.4372261719439585, + "eval_loss": 0.6007276177406311, + "eval_runtime": 146.7759, + "eval_samples_per_second": 38.535, + "eval_steps_per_second": 4.817, + "eval_wer": 0.4822262521866123, "step": 19200 }, { "epoch": 0.7623089315886675, - "eval_loss": 0.5375632047653198, - "eval_runtime": 161.0297, - "eval_samples_per_second": 35.124, - "eval_steps_per_second": 4.39, - "eval_wer": 0.42873649917350065, + "eval_loss": 0.5838043093681335, + "eval_runtime": 146.9473, + "eval_samples_per_second": 38.49, + "eval_steps_per_second": 4.811, + "eval_wer": 0.4657925566914349, "step": 19400 }, { "epoch": 0.7662383590710834, - "grad_norm": 2.159616708755493, - "learning_rate": 0.00010688135593220338, - "loss": 0.368, + "grad_norm": 2.780203342437744, + "learning_rate": 0.00018490303030303028, + "loss": 0.4318, "step": 19500 }, { "epoch": 0.7701677865534992, - "eval_loss": 0.5282244086265564, - "eval_runtime": 163.0357, - "eval_samples_per_second": 34.692, - "eval_steps_per_second": 4.336, - "eval_wer": 0.4193481086806503, + "eval_loss": 0.6007500290870667, + "eval_runtime": 146.6721, + "eval_samples_per_second": 38.562, + "eval_steps_per_second": 4.82, + "eval_wer": 0.46519876105342556, "step": 19600 }, { "epoch": 0.7780266415183308, - "eval_loss": 0.5348193049430847, - "eval_runtime": 162.5531, - "eval_samples_per_second": 34.795, - "eval_steps_per_second": 4.349, - "eval_wer": 0.42507743416090255, + "eval_loss": 0.5918843746185303, + "eval_runtime": 147.2498, + "eval_samples_per_second": 38.411, + "eval_steps_per_second": 4.801, + "eval_wer": 0.4664826435139863, "step": 19800 }, { "epoch": 0.7858854964831624, - "grad_norm": 2.2020351886749268, - "learning_rate": 0.00010179661016949151, - "loss": 0.3629, + "grad_norm": 3.501138687133789, + "learning_rate": 0.00018187272727272725, + "loss": 0.4265, "step": 20000 }, { "epoch": 0.7858854964831624, - "eval_loss": 0.5367931723594666, - "eval_runtime": 162.0053, - "eval_samples_per_second": 34.912, - "eval_steps_per_second": 4.364, - "eval_wer": 0.43130426409462214, + "eval_loss": 0.59038907289505, + "eval_runtime": 147.6976, + "eval_samples_per_second": 38.294, + "eval_steps_per_second": 4.787, + "eval_wer": 0.4721959204634816, "step": 20000 }, { "epoch": 0.793744351447994, - "eval_loss": 0.5550614595413208, - "eval_runtime": 161.9948, - "eval_samples_per_second": 34.915, - "eval_steps_per_second": 4.364, - "eval_wer": 0.44123830463321084, + "eval_loss": 0.5922533273696899, + "eval_runtime": 146.8772, + "eval_samples_per_second": 38.508, + "eval_steps_per_second": 4.814, + "eval_wer": 0.4815201168333039, "step": 20200 }, { "epoch": 0.8016032064128257, - "eval_loss": 0.5251778364181519, - "eval_runtime": 162.6214, - "eval_samples_per_second": 34.78, - "eval_steps_per_second": 4.348, - "eval_wer": 0.4105214167642952, + "eval_loss": 0.5979217886924744, + "eval_runtime": 146.9133, + "eval_samples_per_second": 38.499, + "eval_steps_per_second": 4.812, + "eval_wer": 0.4661295758373321, "step": 20400 }, { "epoch": 0.8055326338952414, - "grad_norm": 2.7725887298583984, - "learning_rate": 9.671186440677966e-05, - "loss": 0.3638, + "grad_norm": 2.374830484390259, + "learning_rate": 0.00017884242424242425, + "loss": 0.4321, "step": 20500 }, { "epoch": 0.8094620613776573, - "eval_loss": 0.5242481827735901, - "eval_runtime": 162.5731, - "eval_samples_per_second": 34.791, - "eval_steps_per_second": 4.349, - "eval_wer": 0.41174110510182793, + "eval_loss": 0.5837874412536621, + "eval_runtime": 146.6078, + "eval_samples_per_second": 38.579, + "eval_steps_per_second": 4.822, + "eval_wer": 0.45608319558344435, "step": 20600 }, { "epoch": 0.8173209163424889, - "eval_loss": 0.5233432054519653, - "eval_runtime": 161.9438, - "eval_samples_per_second": 34.926, - "eval_steps_per_second": 4.366, - "eval_wer": 0.4165877613904447, + "eval_loss": 0.5824867486953735, + "eval_runtime": 147.7105, + "eval_samples_per_second": 38.291, + "eval_steps_per_second": 4.786, + "eval_wer": 0.4523920335093322, "step": 20800 }, { "epoch": 0.8251797713073206, - "grad_norm": 2.733196496963501, - "learning_rate": 9.162711864406779e-05, - "loss": 0.3512, + "grad_norm": 1.430405616760254, + "learning_rate": 0.0001758121212121212, + "loss": 0.4192, "step": 21000 }, { "epoch": 0.8251797713073206, - "eval_loss": 0.524342954158783, - "eval_runtime": 161.947, - "eval_samples_per_second": 34.925, - "eval_steps_per_second": 4.366, - "eval_wer": 0.4160581598754634, + "eval_loss": 0.5838850140571594, + "eval_runtime": 146.699, + "eval_samples_per_second": 38.555, + "eval_steps_per_second": 4.819, + "eval_wer": 0.4551523807995378, "step": 21000 }, { "epoch": 0.8330386262721522, - "eval_loss": 0.5150259733200073, - "eval_runtime": 162.0793, - "eval_samples_per_second": 34.896, - "eval_steps_per_second": 4.362, - "eval_wer": 0.4123028036783232, + "eval_loss": 0.5804269909858704, + "eval_runtime": 147.0076, + "eval_samples_per_second": 38.474, + "eval_steps_per_second": 4.809, + "eval_wer": 0.4593731443886312, "step": 21200 }, { "epoch": 0.8408974812369838, - "eval_loss": 0.5088914632797241, - "eval_runtime": 161.2392, - "eval_samples_per_second": 35.078, - "eval_steps_per_second": 4.385, - "eval_wer": 0.4079536518431738, + "eval_loss": 0.5890819430351257, + "eval_runtime": 146.6585, + "eval_samples_per_second": 38.566, + "eval_steps_per_second": 4.821, + "eval_wer": 0.4722280175249956, "step": 21400 }, { "epoch": 0.8448269087193996, - "grad_norm": 4.562708377838135, - "learning_rate": 8.654237288135593e-05, - "loss": 0.3536, + "grad_norm": 2.7897725105285645, + "learning_rate": 0.00017278181818181817, + "loss": 0.4151, "step": 21500 }, { "epoch": 0.8487563362018153, - "eval_loss": 0.515373170375824, - "eval_runtime": 162.8063, - "eval_samples_per_second": 34.741, - "eval_steps_per_second": 4.343, - "eval_wer": 0.40899680634237934, + "eval_loss": 0.5830910205841064, + "eval_runtime": 147.6653, + "eval_samples_per_second": 38.303, + "eval_steps_per_second": 4.788, + "eval_wer": 0.4525204217553883, "step": 21600 }, { "epoch": 0.856615191166647, - "eval_loss": 0.5161571502685547, - "eval_runtime": 162.7678, - "eval_samples_per_second": 34.749, - "eval_steps_per_second": 4.344, - "eval_wer": 0.4091893887114635, + "eval_loss": 0.5677404403686523, + "eval_runtime": 146.5378, + "eval_samples_per_second": 38.598, + "eval_steps_per_second": 4.825, + "eval_wer": 0.45430180866941633, "step": 21800 }, { "epoch": 0.8644740461314786, - "grad_norm": 2.272256374359131, - "learning_rate": 8.146779661016948e-05, - "loss": 0.3464, + "grad_norm": 2.938485622406006, + "learning_rate": 0.00016975757575757574, + "loss": 0.417, "step": 22000 }, { "epoch": 0.8644740461314786, - "eval_loss": 0.5097736716270447, - "eval_runtime": 162.1935, - "eval_samples_per_second": 34.872, - "eval_steps_per_second": 4.359, - "eval_wer": 0.40527354720675324, + "eval_loss": 0.5605286359786987, + "eval_runtime": 147.3751, + "eval_samples_per_second": 38.378, + "eval_steps_per_second": 4.797, + "eval_wer": 0.446807144805893, "step": 22000 }, { "epoch": 0.8723329010963102, - "eval_loss": 0.5069981813430786, - "eval_runtime": 162.5966, - "eval_samples_per_second": 34.785, - "eval_steps_per_second": 4.348, - "eval_wer": 0.4022724719551925, + "eval_loss": 0.570513129234314, + "eval_runtime": 146.7648, + "eval_samples_per_second": 38.538, + "eval_steps_per_second": 4.817, + "eval_wer": 0.44422333135401454, "step": 22200 }, { "epoch": 0.8801917560611419, - "eval_loss": 0.5070444345474243, - "eval_runtime": 162.5617, - "eval_samples_per_second": 34.793, - "eval_steps_per_second": 4.349, - "eval_wer": 0.40707098265153824, + "eval_loss": 0.5685856938362122, + "eval_runtime": 147.3241, + "eval_samples_per_second": 38.392, + "eval_steps_per_second": 4.799, + "eval_wer": 0.4551363322687808, "step": 22400 }, { "epoch": 0.8841211835435577, - "grad_norm": 2.9740068912506104, - "learning_rate": 7.638305084745762e-05, - "loss": 0.3377, + "grad_norm": 5.145638942718506, + "learning_rate": 0.0001667272727272727, + "loss": 0.4014, "step": 22500 }, { "epoch": 0.8880506110259735, - "eval_loss": 0.5028176307678223, - "eval_runtime": 162.4451, - "eval_samples_per_second": 34.818, - "eval_steps_per_second": 4.352, - "eval_wer": 0.39670363178251034, + "eval_loss": 0.5751659870147705, + "eval_runtime": 146.2417, + "eval_samples_per_second": 38.676, + "eval_steps_per_second": 4.834, + "eval_wer": 0.4602397650495097, "step": 22600 }, { "epoch": 0.8959094659908051, - "eval_loss": 0.5036062002182007, - "eval_runtime": 162.5763, - "eval_samples_per_second": 34.79, - "eval_steps_per_second": 4.349, - "eval_wer": 0.39784307746625797, + "eval_loss": 0.5623380541801453, + "eval_runtime": 146.6371, + "eval_samples_per_second": 38.571, + "eval_steps_per_second": 4.821, + "eval_wer": 0.4452985829147342, "step": 22800 }, { "epoch": 0.9037683209556368, - "grad_norm": 1.9388916492462158, - "learning_rate": 7.129830508474575e-05, - "loss": 0.3272, + "grad_norm": 1.9630001783370972, + "learning_rate": 0.00016369696969696968, + "loss": 0.4024, "step": 23000 }, { "epoch": 0.9037683209556368, - "eval_loss": 0.5020586848258972, - "eval_runtime": 161.6894, - "eval_samples_per_second": 34.981, - "eval_steps_per_second": 4.373, - "eval_wer": 0.39538765226043554, + "eval_loss": 0.5631678700447083, + "eval_runtime": 146.9977, + "eval_samples_per_second": 38.477, + "eval_steps_per_second": 4.81, + "eval_wer": 0.4423777503169585, "step": 23000 }, { "epoch": 0.9116271759204684, - "eval_loss": 0.5032612085342407, - "eval_runtime": 163.6786, - "eval_samples_per_second": 34.556, - "eval_steps_per_second": 4.319, - "eval_wer": 0.3984529216350243, + "eval_loss": 0.568145751953125, + "eval_runtime": 146.7017, + "eval_samples_per_second": 38.554, + "eval_steps_per_second": 4.819, + "eval_wer": 0.4471120668902762, "step": 23200 }, { "epoch": 0.9194860308853, - "eval_loss": 0.49842530488967896, - "eval_runtime": 162.0701, - "eval_samples_per_second": 34.898, - "eval_steps_per_second": 4.362, - "eval_wer": 0.3971850877052206, + "eval_loss": 0.5659225583076477, + "eval_runtime": 147.422, + "eval_samples_per_second": 38.366, + "eval_steps_per_second": 4.796, + "eval_wer": 0.4510760539872575, "step": 23400 }, { "epoch": 0.9234154583677158, - "grad_norm": 3.9436373710632324, - "learning_rate": 6.621355932203389e-05, - "loss": 0.319, + "grad_norm": 2.880105972290039, + "learning_rate": 0.00016066666666666665, + "loss": 0.3899, "step": 23500 }, { "epoch": 0.9273448858501316, - "eval_loss": 0.4928737282752991, - "eval_runtime": 163.9597, - "eval_samples_per_second": 34.496, - "eval_steps_per_second": 4.312, - "eval_wer": 0.39243472260114587, + "eval_loss": 0.5653769969940186, + "eval_runtime": 147.0508, + "eval_samples_per_second": 38.463, + "eval_steps_per_second": 4.808, + "eval_wer": 0.4417197605559211, "step": 23600 }, { "epoch": 0.9352037408149633, - "eval_loss": 0.49405232071876526, - "eval_runtime": 161.8803, - "eval_samples_per_second": 34.939, - "eval_steps_per_second": 4.367, - "eval_wer": 0.4013095601097719, + "eval_loss": 0.5691047310829163, + "eval_runtime": 147.3319, + "eval_samples_per_second": 38.39, + "eval_steps_per_second": 4.799, + "eval_wer": 0.45418946895411727, "step": 23800 }, { "epoch": 0.9430625957797949, - "grad_norm": 3.4186201095581055, - "learning_rate": 6.112881355932203e-05, - "loss": 0.3184, + "grad_norm": 1.747075080871582, + "learning_rate": 0.00015763636363636365, + "loss": 0.3977, "step": 24000 }, { "epoch": 0.9430625957797949, - "eval_loss": 0.4856198728084564, - "eval_runtime": 163.6122, - "eval_samples_per_second": 34.57, - "eval_steps_per_second": 4.321, - "eval_wer": 0.387411532474202, + "eval_loss": 0.5613217949867249, + "eval_runtime": 146.5842, + "eval_samples_per_second": 38.585, + "eval_steps_per_second": 4.823, + "eval_wer": 0.4434209048161641, "step": 24000 }, { "epoch": 0.9509214507446265, - "eval_loss": 0.48915818333625793, - "eval_runtime": 162.8317, - "eval_samples_per_second": 34.735, - "eval_steps_per_second": 4.342, - "eval_wer": 0.3913755195711833, + "eval_loss": 0.5688283443450928, + "eval_runtime": 147.1422, + "eval_samples_per_second": 38.439, + "eval_steps_per_second": 4.805, + "eval_wer": 0.44326041950859396, "step": 24200 }, { "epoch": 0.9587803057094582, - "eval_loss": 0.48598504066467285, - "eval_runtime": 160.6269, - "eval_samples_per_second": 35.212, - "eval_steps_per_second": 4.402, - "eval_wer": 0.3813772849095665, + "eval_loss": 0.57487553358078, + "eval_runtime": 146.7792, + "eval_samples_per_second": 38.534, + "eval_steps_per_second": 4.817, + "eval_wer": 0.4454751167530613, "step": 24400 }, { "epoch": 0.9627097331918739, - "grad_norm": 2.70164155960083, - "learning_rate": 5.6044067796610164e-05, - "loss": 0.3091, + "grad_norm": NaN, + "learning_rate": 0.0001546121212121212, + "loss": 0.3889, "step": 24500 }, { "epoch": 0.9666391606742898, - "eval_loss": 0.4825168251991272, - "eval_runtime": 162.6242, - "eval_samples_per_second": 34.78, - "eval_steps_per_second": 4.347, - "eval_wer": 0.38336730272343567, + "eval_loss": 0.5499551892280579, + "eval_runtime": 147.156, + "eval_samples_per_second": 38.435, + "eval_steps_per_second": 4.804, + "eval_wer": 0.43180176854808944, "step": 24600 }, { "epoch": 0.9744980156391214, - "eval_loss": 0.4784228205680847, - "eval_runtime": 162.0189, - "eval_samples_per_second": 34.91, - "eval_steps_per_second": 4.364, - "eval_wer": 0.3866893485901366, + "eval_loss": 0.5436142086982727, + "eval_runtime": 147.2848, + "eval_samples_per_second": 38.402, + "eval_steps_per_second": 4.8, + "eval_wer": 0.4371780263516875, "step": 24800 }, { "epoch": 0.982356870603953, - "grad_norm": 9.408166885375977, - "learning_rate": 5.096949152542373e-05, - "loss": 0.3154, + "grad_norm": 4.918150424957275, + "learning_rate": 0.0001515818181818182, + "loss": 0.39, "step": 25000 }, { "epoch": 0.982356870603953, - "eval_loss": 0.47507792711257935, - "eval_runtime": 161.9422, - "eval_samples_per_second": 34.926, - "eval_steps_per_second": 4.366, - "eval_wer": 0.3807834892715572, + "eval_loss": 0.547515332698822, + "eval_runtime": 147.2374, + "eval_samples_per_second": 38.414, + "eval_steps_per_second": 4.802, + "eval_wer": 0.4388310250196594, "step": 25000 }, { "epoch": 0.9902157255687847, - "eval_loss": 0.4778765141963959, - "eval_runtime": 162.3405, - "eval_samples_per_second": 34.84, - "eval_steps_per_second": 4.355, - "eval_wer": 0.38492401020686556, + "eval_loss": 0.5531713366508484, + "eval_runtime": 146.8558, + "eval_samples_per_second": 38.514, + "eval_steps_per_second": 4.814, + "eval_wer": 0.4423777503169585, "step": 25200 }, { "epoch": 0.9980745805336163, - "eval_loss": 0.477267324924469, - "eval_runtime": 161.2107, - "eval_samples_per_second": 35.085, - "eval_steps_per_second": 4.386, - "eval_wer": 0.38084768339458525, + "eval_loss": 0.5450366139411926, + "eval_runtime": 147.6783, + "eval_samples_per_second": 38.299, + "eval_steps_per_second": 4.787, + "eval_wer": 0.4280945579432203, "step": 25400 }, { "epoch": 1.002004008016032, - "grad_norm": 0.7003775835037231, - "learning_rate": 4.589491525423728e-05, - "loss": 0.312, + "grad_norm": 1.2219481468200684, + "learning_rate": 0.00014855151515151514, + "loss": 0.3853, "step": 25500 }, { "epoch": 1.005933435498448, - "eval_loss": 0.47774726152420044, - "eval_runtime": 160.8535, - "eval_samples_per_second": 35.162, - "eval_steps_per_second": 4.395, - "eval_wer": 0.3757923962061273, + "eval_loss": 0.5462915897369385, + "eval_runtime": 145.543, + "eval_samples_per_second": 38.861, + "eval_steps_per_second": 4.858, + "eval_wer": 0.43079071111039785, "step": 25600 }, { "epoch": 1.0137922904632795, - "eval_loss": 0.4752050042152405, - "eval_runtime": 159.7765, - "eval_samples_per_second": 35.399, - "eval_steps_per_second": 4.425, - "eval_wer": 0.3820513232013609, + "eval_loss": 0.5457944869995117, + "eval_runtime": 145.2381, + "eval_samples_per_second": 38.943, + "eval_steps_per_second": 4.868, + "eval_wer": 0.4277896358588371, "step": 25800 }, { "epoch": 1.0216511454281112, - "grad_norm": 0.702942430973053, - "learning_rate": 4.081016949152542e-05, - "loss": 0.2651, + "grad_norm": 4.69161319732666, + "learning_rate": 0.0001455212121212121, + "loss": 0.3413, "step": 26000 }, { "epoch": 1.0216511454281112, - "eval_loss": 0.4700838327407837, - "eval_runtime": 163.2858, - "eval_samples_per_second": 34.639, - "eval_steps_per_second": 4.33, - "eval_wer": 0.37750958899712733, + "eval_loss": 0.5470069646835327, + "eval_runtime": 145.5418, + "eval_samples_per_second": 38.862, + "eval_steps_per_second": 4.858, + "eval_wer": 0.43441767906148193, "step": 26000 }, { "epoch": 1.0295100003929427, - "eval_loss": 0.47011885046958923, - "eval_runtime": 160.7741, - "eval_samples_per_second": 35.18, - "eval_steps_per_second": 4.397, - "eval_wer": 0.3760652212289965, + "eval_loss": 0.5358372330665588, + "eval_runtime": 145.609, + "eval_samples_per_second": 38.844, + "eval_steps_per_second": 4.855, + "eval_wer": 0.42258991189356615, "step": 26200 }, { "epoch": 1.0373688553577745, - "eval_loss": 0.471804678440094, - "eval_runtime": 160.2455, - "eval_samples_per_second": 35.296, - "eval_steps_per_second": 4.412, - "eval_wer": 0.37755773458939834, + "eval_loss": 0.5403576493263245, + "eval_runtime": 146.3753, + "eval_samples_per_second": 38.64, + "eval_steps_per_second": 4.83, + "eval_wer": 0.42308741634703345, "step": 26400 }, { "epoch": 1.0412982828401902, - "grad_norm": 0.98069828748703, - "learning_rate": 3.572542372881355e-05, - "loss": 0.2627, + "grad_norm": 1.2460460662841797, + "learning_rate": 0.00014249090909090908, + "loss": 0.339, "step": 26500 }, { "epoch": 1.045227710322606, - "eval_loss": 0.4638473391532898, - "eval_runtime": 160.1121, - "eval_samples_per_second": 35.325, - "eval_steps_per_second": 4.416, - "eval_wer": 0.37296785479289374, + "eval_loss": 0.5345466732978821, + "eval_runtime": 145.3146, + "eval_samples_per_second": 38.922, + "eval_steps_per_second": 4.865, + "eval_wer": 0.42433920174608014, "step": 26600 }, { "epoch": 1.0530865652874377, - "eval_loss": 0.4677112400531769, - "eval_runtime": 159.9389, - "eval_samples_per_second": 35.364, - "eval_steps_per_second": 4.42, - "eval_wer": 0.3720370400089872, + "eval_loss": 0.5396625995635986, + "eval_runtime": 145.9713, + "eval_samples_per_second": 38.747, + "eval_steps_per_second": 4.843, + "eval_wer": 0.4199579528494166, "step": 26800 }, { "epoch": 1.0609454202522692, - "grad_norm": 0.8780287504196167, - "learning_rate": 3.0640677966101693e-05, - "loss": 0.2427, + "grad_norm": 1.021347165107727, + "learning_rate": 0.00013946060606060605, + "loss": 0.3235, "step": 27000 }, { "epoch": 1.0609454202522692, - "eval_loss": 0.4642546474933624, - "eval_runtime": 160.0541, - "eval_samples_per_second": 35.338, - "eval_steps_per_second": 4.417, - "eval_wer": 0.36985443982603394, + "eval_loss": 0.5378654599189758, + "eval_runtime": 145.6291, + "eval_samples_per_second": 38.838, + "eval_steps_per_second": 4.855, + "eval_wer": 0.4183049541814447, "step": 27000 }, { "epoch": 1.0688042752171008, - "eval_loss": 0.46017909049987793, - "eval_runtime": 159.9066, - "eval_samples_per_second": 35.371, - "eval_steps_per_second": 4.421, - "eval_wer": 0.3713469531864358, + "eval_loss": 0.5305435657501221, + "eval_runtime": 145.36, + "eval_samples_per_second": 38.91, + "eval_steps_per_second": 4.864, + "eval_wer": 0.42753285936672497, "step": 27200 }, { "epoch": 1.0766631301819325, - "eval_loss": 0.46644654870033264, - "eval_runtime": 160.7516, - "eval_samples_per_second": 35.185, - "eval_steps_per_second": 4.398, - "eval_wer": 0.3703037986872302, + "eval_loss": 0.5440751910209656, + "eval_runtime": 145.3458, + "eval_samples_per_second": 38.914, + "eval_steps_per_second": 4.864, + "eval_wer": 0.4247564635457624, "step": 27400 }, { "epoch": 1.0805925576643483, - "grad_norm": 0.8659859895706177, - "learning_rate": 2.556610169491525e-05, - "loss": 0.2464, + "grad_norm": 0.5985044836997986, + "learning_rate": 0.00013643636363636362, + "loss": 0.3252, "step": 27500 }, { "epoch": 1.0845219851467642, - "eval_loss": 0.4609028100967407, - "eval_runtime": 161.4502, - "eval_samples_per_second": 35.032, - "eval_steps_per_second": 4.379, - "eval_wer": 0.36770393670459467, + "eval_loss": 0.5361995697021484, + "eval_runtime": 146.0428, + "eval_samples_per_second": 38.728, + "eval_steps_per_second": 4.841, + "eval_wer": 0.4177753526664634, "step": 27600 }, { "epoch": 1.0923808401115958, - "eval_loss": 0.4613707363605499, - "eval_runtime": 160.5963, - "eval_samples_per_second": 35.219, - "eval_steps_per_second": 4.402, - "eval_wer": 0.3687310426730433, + "eval_loss": 0.5305026173591614, + "eval_runtime": 145.9537, + "eval_samples_per_second": 38.752, + "eval_steps_per_second": 4.844, + "eval_wer": 0.42015053521850076, "step": 27800 }, { "epoch": 1.1002396950764273, - "grad_norm": 1.6944918632507324, - "learning_rate": 2.0481355932203388e-05, - "loss": 0.2537, + "grad_norm": 1.615342378616333, + "learning_rate": 0.0001334060606060606, + "loss": 0.3301, "step": 28000 }, { "epoch": 1.1002396950764273, - "eval_loss": 0.45553678274154663, - "eval_runtime": 160.1154, - "eval_samples_per_second": 35.325, - "eval_steps_per_second": 4.416, - "eval_wer": 0.36545714239861343, + "eval_loss": 0.5307178497314453, + "eval_runtime": 146.253, + "eval_samples_per_second": 38.673, + "eval_steps_per_second": 4.834, + "eval_wer": 0.41851358508128583, "step": 28000 }, { "epoch": 1.108098550041259, - "eval_loss": 0.456032931804657, - "eval_runtime": 160.97, - "eval_samples_per_second": 35.137, - "eval_steps_per_second": 4.392, - "eval_wer": 0.36447818202243587, + "eval_loss": 0.5402148365974426, + "eval_runtime": 145.7202, + "eval_samples_per_second": 38.814, + "eval_steps_per_second": 4.852, + "eval_wer": 0.431127730256295, "step": 28200 }, { "epoch": 1.1159574050060905, - "eval_loss": 0.45427000522613525, - "eval_runtime": 160.1348, - "eval_samples_per_second": 35.32, - "eval_steps_per_second": 4.415, - "eval_wer": 0.36261655245462276, + "eval_loss": 0.5308640003204346, + "eval_runtime": 145.81, + "eval_samples_per_second": 38.79, + "eval_steps_per_second": 4.849, + "eval_wer": 0.41788769238176243, "step": 28400 }, { "epoch": 1.1198868324885065, - "grad_norm": 0.8318812251091003, - "learning_rate": 1.5396610169491525e-05, - "loss": 0.2313, + "grad_norm": 1.1408910751342773, + "learning_rate": 0.00013037575757575756, + "loss": 0.3087, "step": 28500 }, { "epoch": 1.1238162599709223, - "eval_loss": 0.45402956008911133, - "eval_runtime": 160.7545, - "eval_samples_per_second": 35.184, - "eval_steps_per_second": 4.398, - "eval_wer": 0.3631461539696041, + "eval_loss": 0.5298367738723755, + "eval_runtime": 145.4349, + "eval_samples_per_second": 38.89, + "eval_steps_per_second": 4.861, + "eval_wer": 0.42137022355603343, "step": 28600 }, { "epoch": 1.1316751149357538, - "eval_loss": 0.4536111354827881, - "eval_runtime": 165.4654, - "eval_samples_per_second": 34.182, - "eval_steps_per_second": 4.273, - "eval_wer": 0.3626326009853798, + "eval_loss": 0.5330610275268555, + "eval_runtime": 145.6355, + "eval_samples_per_second": 38.837, + "eval_steps_per_second": 4.855, + "eval_wer": 0.4214665147405755, "step": 28800 }, { "epoch": 1.1395339699005855, - "grad_norm": 0.7866860032081604, - "learning_rate": 1.031186440677966e-05, - "loss": 0.2451, + "grad_norm": 0.8552046418190002, + "learning_rate": 0.00012734545454545453, + "loss": 0.3222, "step": 29000 }, { "epoch": 1.1395339699005855, - "eval_loss": 0.45293620228767395, - "eval_runtime": 160.3649, - "eval_samples_per_second": 35.27, - "eval_steps_per_second": 4.409, - "eval_wer": 0.3617338832629873, + "eval_loss": 0.5273275971412659, + "eval_runtime": 145.8763, + "eval_samples_per_second": 38.773, + "eval_steps_per_second": 4.847, + "eval_wer": 0.4145495979843045, "step": 29000 }, { "epoch": 1.147392824865417, - "eval_loss": 0.4530145823955536, - "eval_runtime": 160.576, - "eval_samples_per_second": 35.223, - "eval_steps_per_second": 4.403, - "eval_wer": 0.3598401566336602, + "eval_loss": 0.5282542705535889, + "eval_runtime": 145.6375, + "eval_samples_per_second": 38.836, + "eval_steps_per_second": 4.855, + "eval_wer": 0.4130731331546597, "step": 29200 }, { "epoch": 1.1552516798302488, - "eval_loss": 0.4515323042869568, - "eval_runtime": 160.1136, - "eval_samples_per_second": 35.325, - "eval_steps_per_second": 4.416, - "eval_wer": 0.3591500698111088, + "eval_loss": 0.5256520509719849, + "eval_runtime": 145.9987, + "eval_samples_per_second": 38.74, + "eval_steps_per_second": 4.843, + "eval_wer": 0.41159666832501485, "step": 29400 }, { "epoch": 1.1591811073126645, - "grad_norm": 3.2193210124969482, - "learning_rate": 5.227118644067796e-06, - "loss": 0.2445, + "grad_norm": 3.544210195541382, + "learning_rate": 0.0001243151515151515, + "loss": 0.3227, "step": 29500 }, { "epoch": 1.1631105347950803, - "eval_loss": 0.451358437538147, - "eval_runtime": 160.6595, - "eval_samples_per_second": 35.205, - "eval_steps_per_second": 4.401, - "eval_wer": 0.3590056330342957, + "eval_loss": 0.5168554186820984, + "eval_runtime": 145.3157, + "eval_samples_per_second": 38.922, + "eval_steps_per_second": 4.865, + "eval_wer": 0.408419059235127, "step": 29600 }, { "epoch": 1.170969389759912, - "eval_loss": 0.4514302611351013, - "eval_runtime": 160.1434, - "eval_samples_per_second": 35.318, - "eval_steps_per_second": 4.415, - "eval_wer": 0.3588772447882396, + "eval_loss": 0.5184837579727173, + "eval_runtime": 145.4598, + "eval_samples_per_second": 38.884, + "eval_steps_per_second": 4.86, + "eval_wer": 0.41068190207186533, "step": 29800 }, { "epoch": 1.1788282447247436, - "grad_norm": 0.5669330358505249, - "learning_rate": 1.423728813559322e-07, - "loss": 0.2364, + "grad_norm": 0.8857652544975281, + "learning_rate": 0.00012128484848484848, + "loss": 0.309, "step": 30000 }, { "epoch": 1.1788282447247436, - "eval_loss": 0.4510672390460968, - "eval_runtime": 160.6855, - "eval_samples_per_second": 35.199, - "eval_steps_per_second": 4.4, - "eval_wer": 0.3591179727495948, + "eval_loss": 0.5076336860656738, + "eval_runtime": 145.8517, + "eval_samples_per_second": 38.779, + "eval_steps_per_second": 4.847, + "eval_wer": 0.40275392787790276, "step": 30000 }, { - "epoch": 1.1788282447247436, - "step": 30000, - "total_flos": 3.731985674211105e+19, - "train_loss": 0.5082863594055176, - "train_runtime": 37313.8627, - "train_samples_per_second": 6.432, - "train_steps_per_second": 0.804 + "epoch": 1.1866870996895753, + "eval_loss": 0.5178284049034119, + "eval_runtime": 146.4004, + "eval_samples_per_second": 38.634, + "eval_steps_per_second": 4.829, + "eval_wer": 0.40535378986053827, + "step": 30200 + }, + { + "epoch": 1.1945459546544068, + "eval_loss": 0.5225840210914612, + "eval_runtime": 149.501, + "eval_samples_per_second": 37.833, + "eval_steps_per_second": 4.729, + "eval_wer": 0.4122065124937812, + "step": 30400 + }, + { + "epoch": 1.1984753821368226, + "grad_norm": 1.1116445064544678, + "learning_rate": 0.00011826060606060606, + "loss": 0.3138, + "step": 30500 + }, + { + "epoch": 1.2024048096192386, + "eval_loss": 0.5226925015449524, + "eval_runtime": 145.5048, + "eval_samples_per_second": 38.872, + "eval_steps_per_second": 4.859, + "eval_wer": 0.4072635650206224, + "step": 30600 + }, + { + "epoch": 1.21026366458407, + "eval_loss": 0.5130230784416199, + "eval_runtime": 144.8014, + "eval_samples_per_second": 39.06, + "eval_steps_per_second": 4.883, + "eval_wer": 0.40498467365312707, + "step": 30800 + }, + { + "epoch": 1.2181225195489018, + "grad_norm": 1.0480467081069946, + "learning_rate": 0.00011523030303030302, + "loss": 0.3083, + "step": 31000 + }, + { + "epoch": 1.2181225195489018, + "eval_loss": 0.516806423664093, + "eval_runtime": 145.4982, + "eval_samples_per_second": 38.873, + "eval_steps_per_second": 4.859, + "eval_wer": 0.4113077947713887, + "step": 31000 + }, + { + "epoch": 1.2259813745137333, + "eval_loss": 0.505409836769104, + "eval_runtime": 145.5358, + "eval_samples_per_second": 38.863, + "eval_steps_per_second": 4.858, + "eval_wer": 0.4003947938566224, + "step": 31200 + }, + { + "epoch": 1.2338402294785649, + "eval_loss": 0.5144046545028687, + "eval_runtime": 145.0631, + "eval_samples_per_second": 38.99, + "eval_steps_per_second": 4.874, + "eval_wer": 0.406653720851856, + "step": 31400 + }, + { + "epoch": 1.2377696569609808, + "grad_norm": 1.0551427602767944, + "learning_rate": 0.00011219999999999999, + "loss": 0.2981, + "step": 31500 + }, + { + "epoch": 1.2416990844433966, + "eval_loss": 0.5082244277000427, + "eval_runtime": 145.8395, + "eval_samples_per_second": 38.782, + "eval_steps_per_second": 4.848, + "eval_wer": 0.39923929964211774, + "step": 31600 + }, + { + "epoch": 1.2495579394082281, + "eval_loss": 0.5134223103523254, + "eval_runtime": 145.7659, + "eval_samples_per_second": 38.802, + "eval_steps_per_second": 4.85, + "eval_wer": 0.396125884675258, + "step": 31800 + }, + { + "epoch": 1.2574167943730599, + "grad_norm": 2.2508976459503174, + "learning_rate": 0.00010916969696969696, + "loss": 0.2952, + "step": 32000 + }, + { + "epoch": 1.2574167943730599, + "eval_loss": 0.49696260690689087, + "eval_runtime": 145.5612, + "eval_samples_per_second": 38.857, + "eval_steps_per_second": 4.857, + "eval_wer": 0.3999454349954262, + "step": 32000 + }, + { + "epoch": 1.2652756493378914, + "eval_loss": 0.50291907787323, + "eval_runtime": 145.2238, + "eval_samples_per_second": 38.947, + "eval_steps_per_second": 4.868, + "eval_wer": 0.4005713276949495, + "step": 32200 + }, + { + "epoch": 1.2731345043027231, + "eval_loss": 0.4979938268661499, + "eval_runtime": 146.0479, + "eval_samples_per_second": 38.727, + "eval_steps_per_second": 4.841, + "eval_wer": 0.4001540658952673, + "step": 32400 + }, + { + "epoch": 1.2770639317851389, + "grad_norm": 0.7384321689605713, + "learning_rate": 0.00010614545454545453, + "loss": 0.2995, + "step": 32500 + }, + { + "epoch": 1.2809933592675546, + "eval_loss": 0.49917110800743103, + "eval_runtime": 145.9484, + "eval_samples_per_second": 38.753, + "eval_steps_per_second": 4.844, + "eval_wer": 0.40463160597647285, + "step": 32600 + }, + { + "epoch": 1.2888522142323864, + "eval_loss": 0.49689990282058716, + "eval_runtime": 146.3024, + "eval_samples_per_second": 38.66, + "eval_steps_per_second": 4.832, + "eval_wer": 0.3911829372020991, + "step": 32800 + }, + { + "epoch": 1.296711069197218, + "grad_norm": 0.6462344527244568, + "learning_rate": 0.0001031151515151515, + "loss": 0.3046, + "step": 33000 + }, + { + "epoch": 1.296711069197218, + "eval_loss": 0.49431467056274414, + "eval_runtime": 145.566, + "eval_samples_per_second": 38.855, + "eval_steps_per_second": 4.857, + "eval_wer": 0.3933334403235384, + "step": 33000 + }, + { + "epoch": 1.3045699241620496, + "eval_loss": 0.4882897138595581, + "eval_runtime": 146.7921, + "eval_samples_per_second": 38.531, + "eval_steps_per_second": 4.816, + "eval_wer": 0.3932050520774823, + "step": 33200 + }, + { + "epoch": 1.3124287791268812, + "eval_loss": 0.49653205275535583, + "eval_runtime": 146.2261, + "eval_samples_per_second": 38.68, + "eval_steps_per_second": 4.835, + "eval_wer": 0.3935099741618655, + "step": 33400 + }, + { + "epoch": 1.316358206609297, + "grad_norm": 4.335805416107178, + "learning_rate": 0.00010009090909090908, + "loss": 0.2972, + "step": 33500 + }, + { + "epoch": 1.320287634091713, + "eval_loss": 0.49103957414627075, + "eval_runtime": 146.0953, + "eval_samples_per_second": 38.714, + "eval_steps_per_second": 4.839, + "eval_wer": 0.3942000609844169, + "step": 33600 + }, + { + "epoch": 1.3281464890565444, + "eval_loss": 0.5007916688919067, + "eval_runtime": 145.7572, + "eval_samples_per_second": 38.804, + "eval_steps_per_second": 4.851, + "eval_wer": 0.4097029416956878, + "step": 33800 + }, + { + "epoch": 1.3360053440213762, + "grad_norm": 0.6741358637809753, + "learning_rate": 9.706060606060605e-05, + "loss": 0.3093, + "step": 34000 + }, + { + "epoch": 1.3360053440213762, + "eval_loss": 0.4958365857601166, + "eval_runtime": 146.2684, + "eval_samples_per_second": 38.669, + "eval_steps_per_second": 4.834, + "eval_wer": 0.39574071993708976, + "step": 34000 + }, + { + "epoch": 1.3438641989862077, + "eval_loss": 0.5045068264007568, + "eval_runtime": 146.1991, + "eval_samples_per_second": 38.687, + "eval_steps_per_second": 4.836, + "eval_wer": 0.40179101603248224, + "step": 34200 + }, + { + "epoch": 1.3517230539510394, + "eval_loss": 0.492519348859787, + "eval_runtime": 146.1528, + "eval_samples_per_second": 38.699, + "eval_steps_per_second": 4.837, + "eval_wer": 0.3969925053361365, + "step": 34400 + }, + { + "epoch": 1.3556524814334552, + "grad_norm": 0.9136665463447571, + "learning_rate": 9.403030303030303e-05, + "loss": 0.2947, + "step": 34500 + }, + { + "epoch": 1.359581908915871, + "eval_loss": 0.4828738868236542, + "eval_runtime": 145.0639, + "eval_samples_per_second": 38.99, + "eval_steps_per_second": 4.874, + "eval_wer": 0.3905409959718188, + "step": 34600 + }, + { + "epoch": 1.3674407638807025, + "eval_loss": 0.4869907796382904, + "eval_runtime": 145.4878, + "eval_samples_per_second": 38.876, + "eval_steps_per_second": 4.86, + "eval_wer": 0.39522716695286547, + "step": 34800 + }, + { + "epoch": 1.3752996188455342, + "grad_norm": 1.0685299634933472, + "learning_rate": 9.099999999999999e-05, + "loss": 0.2801, + "step": 35000 + }, + { + "epoch": 1.3752996188455342, + "eval_loss": 0.4897337555885315, + "eval_runtime": 145.9513, + "eval_samples_per_second": 38.753, + "eval_steps_per_second": 4.844, + "eval_wer": 0.3936704594694356, + "step": 35000 + }, + { + "epoch": 1.383158473810366, + "eval_loss": 0.5006551146507263, + "eval_runtime": 145.7634, + "eval_samples_per_second": 38.803, + "eval_steps_per_second": 4.85, + "eval_wer": 0.39972075556482806, + "step": 35200 + }, + { + "epoch": 1.3910173287751975, + "eval_loss": 0.48228171467781067, + "eval_runtime": 145.956, + "eval_samples_per_second": 38.751, + "eval_steps_per_second": 4.844, + "eval_wer": 0.38492401020686556, + "step": 35400 + }, + { + "epoch": 1.3949467562576132, + "grad_norm": 0.6772143244743347, + "learning_rate": 8.796969696969696e-05, + "loss": 0.2772, + "step": 35500 + }, + { + "epoch": 1.398876183740029, + "eval_loss": 0.4848904013633728, + "eval_runtime": 145.8656, + "eval_samples_per_second": 38.775, + "eval_steps_per_second": 4.847, + "eval_wer": 0.39121503426361315, + "step": 35600 + }, + { + "epoch": 1.4067350387048607, + "eval_loss": 0.4844968020915985, + "eval_runtime": 146.3634, + "eval_samples_per_second": 38.644, + "eval_steps_per_second": 4.83, + "eval_wer": 0.3881658134197814, + "step": 35800 + }, + { + "epoch": 1.4145938936696925, + "grad_norm": 1.0455658435821533, + "learning_rate": 8.493939393939393e-05, + "loss": 0.281, + "step": 36000 + }, + { + "epoch": 1.4145938936696925, + "eval_loss": 0.482947438955307, + "eval_runtime": 145.7025, + "eval_samples_per_second": 38.819, + "eval_steps_per_second": 4.852, + "eval_wer": 0.38418577779204316, + "step": 36000 + }, + { + "epoch": 1.422452748634524, + "eval_loss": 0.48147863149642944, + "eval_runtime": 146.3811, + "eval_samples_per_second": 38.639, + "eval_steps_per_second": 4.83, + "eval_wer": 0.3859190191138001, + "step": 36200 + }, + { + "epoch": 1.4303116035993555, + "eval_loss": 0.4771769642829895, + "eval_runtime": 145.8053, + "eval_samples_per_second": 38.791, + "eval_steps_per_second": 4.849, + "eval_wer": 0.38075139221004317, + "step": 36400 + }, + { + "epoch": 1.4342410310817715, + "grad_norm": 0.6518095135688782, + "learning_rate": 8.19090909090909e-05, + "loss": 0.2697, + "step": 36500 + }, + { + "epoch": 1.4381704585641872, + "eval_loss": 0.48701608180999756, + "eval_runtime": 145.4126, + "eval_samples_per_second": 38.896, + "eval_steps_per_second": 4.862, + "eval_wer": 0.3914236651634543, + "step": 36600 + }, + { + "epoch": 1.4460293135290188, + "eval_loss": 0.47700512409210205, + "eval_runtime": 145.4281, + "eval_samples_per_second": 38.892, + "eval_steps_per_second": 4.862, + "eval_wer": 0.38662515446710854, + "step": 36800 + }, + { + "epoch": 1.4538881684938505, + "grad_norm": 2.1603991985321045, + "learning_rate": 7.887878787878789e-05, + "loss": 0.2766, + "step": 37000 + }, + { + "epoch": 1.4538881684938505, + "eval_loss": 0.4786865711212158, + "eval_runtime": 145.7912, + "eval_samples_per_second": 38.795, + "eval_steps_per_second": 4.849, + "eval_wer": 0.38209946879363194, + "step": 37000 + }, + { + "epoch": 1.461747023458682, + "eval_loss": 0.4793393015861511, + "eval_runtime": 145.5675, + "eval_samples_per_second": 38.855, + "eval_steps_per_second": 4.857, + "eval_wer": 0.38099212017139833, + "step": 37200 + }, + { + "epoch": 1.4696058784235138, + "eval_loss": 0.4738729000091553, + "eval_runtime": 145.8624, + "eval_samples_per_second": 38.776, + "eval_steps_per_second": 4.847, + "eval_wer": 0.3803341304103609, + "step": 37400 + }, + { + "epoch": 1.4735353059059295, + "grad_norm": 1.9566117525100708, + "learning_rate": 7.585454545454545e-05, + "loss": 0.2905, + "step": 37500 + }, + { + "epoch": 1.4774647333883453, + "eval_loss": 0.47245293855667114, + "eval_runtime": 145.8323, + "eval_samples_per_second": 38.784, + "eval_steps_per_second": 4.848, + "eval_wer": 0.3811205084174544, + "step": 37600 + }, + { + "epoch": 1.485323588353177, + "eval_loss": 0.47267088294029236, + "eval_runtime": 145.9296, + "eval_samples_per_second": 38.758, + "eval_steps_per_second": 4.845, + "eval_wer": 0.37827991847346376, + "step": 37800 + }, + { + "epoch": 1.4931824433180085, + "grad_norm": 2.518251895904541, + "learning_rate": 7.282424242424242e-05, + "loss": 0.2799, + "step": 38000 + }, + { + "epoch": 1.4931824433180085, + "eval_loss": 0.47050511837005615, + "eval_runtime": 146.8142, + "eval_samples_per_second": 38.525, + "eval_steps_per_second": 4.816, + "eval_wer": 0.3776700743046974, + "step": 38000 + }, + { + "epoch": 1.50104129828284, + "eval_loss": 0.4659024178981781, + "eval_runtime": 145.787, + "eval_samples_per_second": 38.796, + "eval_steps_per_second": 4.85, + "eval_wer": 0.37508626085281893, + "step": 38200 + }, + { + "epoch": 1.5089001532476718, + "eval_loss": 0.46910360455513, + "eval_runtime": 146.8808, + "eval_samples_per_second": 38.507, + "eval_steps_per_second": 4.813, + "eval_wer": 0.37429988284572546, + "step": 38400 + }, + { + "epoch": 1.5128295807300876, + "grad_norm": 1.3675510883331299, + "learning_rate": 6.979393939393939e-05, + "loss": 0.267, + "step": 38500 + }, + { + "epoch": 1.5167590082125035, + "eval_loss": 0.4690033495426178, + "eval_runtime": 145.2501, + "eval_samples_per_second": 38.94, + "eval_steps_per_second": 4.867, + "eval_wer": 0.3663558601210059, + "step": 38600 + }, + { + "epoch": 1.524617863177335, + "eval_loss": 0.4632550776004791, + "eval_runtime": 146.3252, + "eval_samples_per_second": 38.654, + "eval_steps_per_second": 4.832, + "eval_wer": 0.36810514997351995, + "step": 38800 + }, + { + "epoch": 1.5324767181421666, + "grad_norm": 1.2868680953979492, + "learning_rate": 6.676969696969697e-05, + "loss": 0.2632, + "step": 39000 + }, + { + "epoch": 1.5324767181421666, + "eval_loss": 0.4650620222091675, + "eval_runtime": 146.2691, + "eval_samples_per_second": 38.668, + "eval_steps_per_second": 4.834, + "eval_wer": 0.37255059299321147, + "step": 39000 + }, + { + "epoch": 1.5403355731069983, + "eval_loss": 0.46896418929100037, + "eval_runtime": 145.9823, + "eval_samples_per_second": 38.744, + "eval_steps_per_second": 4.843, + "eval_wer": 0.3673990146202115, + "step": 39200 + }, + { + "epoch": 1.54819442807183, + "eval_loss": 0.4612589180469513, + "eval_runtime": 145.4614, + "eval_samples_per_second": 38.883, + "eval_steps_per_second": 4.86, + "eval_wer": 0.3714913899632489, + "step": 39400 + }, + { + "epoch": 1.5521238555542456, + "grad_norm": 2.942875623703003, + "learning_rate": 6.373939393939393e-05, + "loss": 0.2716, + "step": 39500 + }, + { + "epoch": 1.5560532830366616, + "eval_loss": 0.4654790461063385, + "eval_runtime": 146.1694, + "eval_samples_per_second": 38.695, + "eval_steps_per_second": 4.837, + "eval_wer": 0.36967790598770683, + "step": 39600 + }, + { + "epoch": 1.563912138001493, + "eval_loss": 0.4596673846244812, + "eval_runtime": 145.7967, + "eval_samples_per_second": 38.794, + "eval_steps_per_second": 4.849, + "eval_wer": 0.364799152637576, + "step": 39800 + }, + { + "epoch": 1.5717709929663248, + "grad_norm": 0.4809035658836365, + "learning_rate": 6.07090909090909e-05, + "loss": 0.2651, + "step": 40000 + }, + { + "epoch": 1.5717709929663248, + "eval_loss": 0.4549534320831299, + "eval_runtime": 146.3998, + "eval_samples_per_second": 38.634, + "eval_steps_per_second": 4.829, + "eval_wer": 0.36619537481343584, + "step": 40000 + }, + { + "epoch": 1.5796298479311566, + "eval_loss": 0.4538833498954773, + "eval_runtime": 146.0948, + "eval_samples_per_second": 38.715, + "eval_steps_per_second": 4.839, + "eval_wer": 0.3676397425815667, + "step": 40200 + }, + { + "epoch": 1.587488702895988, + "eval_loss": 0.4542824625968933, + "eval_runtime": 146.3082, + "eval_samples_per_second": 38.658, + "eval_steps_per_second": 4.832, + "eval_wer": 0.36746320874323957, + "step": 40400 + }, + { + "epoch": 1.5914181303784039, + "grad_norm": 1.2710328102111816, + "learning_rate": 5.767878787878788e-05, + "loss": 0.2659, + "step": 40500 + }, + { + "epoch": 1.5953475578608196, + "eval_loss": 0.45555397868156433, + "eval_runtime": 146.1729, + "eval_samples_per_second": 38.694, + "eval_steps_per_second": 4.837, + "eval_wer": 0.3622795333087256, + "step": 40600 + }, + { + "epoch": 1.6032064128256514, + "eval_loss": 0.463294118642807, + "eval_runtime": 146.3048, + "eval_samples_per_second": 38.659, + "eval_steps_per_second": 4.832, + "eval_wer": 0.36849031471168814, + "step": 40800 + }, + { + "epoch": 1.611065267790483, + "grad_norm": 1.9250500202178955, + "learning_rate": 5.4660606060606054e-05, + "loss": 0.2559, + "step": 41000 + }, + { + "epoch": 1.611065267790483, + "eval_loss": 0.4529285132884979, + "eval_runtime": 146.9183, + "eval_samples_per_second": 38.498, + "eval_steps_per_second": 4.812, + "eval_wer": 0.36083516554059475, + "step": 41000 + }, + { + "epoch": 1.6189241227553146, + "eval_loss": 0.45345816016197205, + "eval_runtime": 145.5972, + "eval_samples_per_second": 38.847, + "eval_steps_per_second": 4.856, + "eval_wer": 0.36385228932291247, + "step": 41200 + }, + { + "epoch": 1.6267829777201461, + "eval_loss": 0.4511209726333618, + "eval_runtime": 146.7532, + "eval_samples_per_second": 38.541, + "eval_steps_per_second": 4.818, + "eval_wer": 0.3637078525460994, + "step": 41400 + }, + { + "epoch": 1.630712405202562, + "grad_norm": 0.9593771696090698, + "learning_rate": 5.1630303030303025e-05, + "loss": 0.2629, + "step": 41500 + }, + { + "epoch": 1.6346418326849776, + "eval_loss": 0.45563140511512756, + "eval_runtime": 146.0124, + "eval_samples_per_second": 38.736, + "eval_steps_per_second": 4.842, + "eval_wer": 0.36049814639469757, + "step": 41600 + }, + { + "epoch": 1.6425006876498094, + "eval_loss": 0.457055002450943, + "eval_runtime": 147.3584, + "eval_samples_per_second": 38.383, + "eval_steps_per_second": 4.798, + "eval_wer": 0.36390043491518353, + "step": 41800 + }, + { + "epoch": 1.6503595426146411, + "grad_norm": 0.9599024653434753, + "learning_rate": 4.8599999999999995e-05, + "loss": 0.259, + "step": 42000 + }, + { + "epoch": 1.6503595426146411, + "eval_loss": 0.46201661229133606, + "eval_runtime": 146.8464, + "eval_samples_per_second": 38.516, + "eval_steps_per_second": 4.815, + "eval_wer": 0.36903596475742645, + "step": 42000 + }, + { + "epoch": 1.6582183975794726, + "eval_loss": 0.45499464869499207, + "eval_runtime": 146.9092, + "eval_samples_per_second": 38.5, + "eval_steps_per_second": 4.812, + "eval_wer": 0.36348317311550127, + "step": 42200 + }, + { + "epoch": 1.6660772525443042, + "eval_loss": 0.45219454169273376, + "eval_runtime": 146.8863, + "eval_samples_per_second": 38.506, + "eval_steps_per_second": 4.813, + "eval_wer": 0.3584278859270434, + "step": 42400 + }, + { + "epoch": 1.6700066800267201, + "grad_norm": 1.0676679611206055, + "learning_rate": 4.5569696969696966e-05, + "loss": 0.2594, + "step": 42500 + }, + { + "epoch": 1.673936107509136, + "eval_loss": 0.4494900703430176, + "eval_runtime": 147.3169, + "eval_samples_per_second": 38.393, + "eval_steps_per_second": 4.799, + "eval_wer": 0.3589253903805107, + "step": 42600 + }, + { + "epoch": 1.6817949624739676, + "eval_loss": 0.4453260898590088, + "eval_runtime": 146.8159, + "eval_samples_per_second": 38.524, + "eval_steps_per_second": 4.816, + "eval_wer": 0.3562131886825761, + "step": 42800 + }, + { + "epoch": 1.6896538174387992, + "grad_norm": 0.4820586144924164, + "learning_rate": 4.253939393939394e-05, + "loss": 0.2538, + "step": 43000 + }, + { + "epoch": 1.6896538174387992, + "eval_loss": 0.4438420832157135, + "eval_runtime": 147.9055, + "eval_samples_per_second": 38.241, + "eval_steps_per_second": 4.78, + "eval_wer": 0.3555391503907817, + "step": 43000 + }, + { + "epoch": 1.6975126724036307, + "eval_loss": 0.4494447708129883, + "eval_runtime": 146.855, + "eval_samples_per_second": 38.514, + "eval_steps_per_second": 4.814, + "eval_wer": 0.3566946446052864, + "step": 43200 + }, + { + "epoch": 1.7053715273684624, + "eval_loss": 0.4443654716014862, + "eval_runtime": 146.8467, + "eval_samples_per_second": 38.516, + "eval_steps_per_second": 4.815, + "eval_wer": 0.3537898605382677, + "step": 43400 + }, + { + "epoch": 1.7093009548508782, + "grad_norm": 0.7214144468307495, + "learning_rate": 3.950909090909091e-05, + "loss": 0.2512, + "step": 43500 + }, + { + "epoch": 1.7132303823332942, + "eval_loss": 0.4454784691333771, + "eval_runtime": 147.1352, + "eval_samples_per_second": 38.441, + "eval_steps_per_second": 4.805, + "eval_wer": 0.3529713854696602, + "step": 43600 + }, + { + "epoch": 1.7210892372981257, + "eval_loss": 0.4453714191913605, + "eval_runtime": 147.5374, + "eval_samples_per_second": 38.336, + "eval_steps_per_second": 4.792, + "eval_wer": 0.3522010559933238, + "step": 43800 + }, + { + "epoch": 1.7289480922629572, + "grad_norm": 1.9711872339248657, + "learning_rate": 3.647878787878787e-05, + "loss": 0.2358, + "step": 44000 + }, + { + "epoch": 1.7289480922629572, + "eval_loss": 0.44450756907463074, + "eval_runtime": 146.8893, + "eval_samples_per_second": 38.505, + "eval_steps_per_second": 4.813, + "eval_wer": 0.3519763765627257, + "step": 44000 + }, + { + "epoch": 1.736806947227789, + "eval_loss": 0.44162794947624207, + "eval_runtime": 147.6037, + "eval_samples_per_second": 38.319, + "eval_steps_per_second": 4.79, + "eval_wer": 0.34998635874885653, + "step": 44200 + }, + { + "epoch": 1.7446658021926207, + "eval_loss": 0.44202086329460144, + "eval_runtime": 148.0767, + "eval_samples_per_second": 38.196, + "eval_steps_per_second": 4.775, + "eval_wer": 0.34897530131116494, + "step": 44400 + }, + { + "epoch": 1.7485952296750362, + "grad_norm": 1.1429784297943115, + "learning_rate": 3.344848484848484e-05, + "loss": 0.2418, + "step": 44500 + }, + { + "epoch": 1.7525246571574522, + "eval_loss": 0.43861278891563416, + "eval_runtime": 147.9549, + "eval_samples_per_second": 38.228, + "eval_steps_per_second": 4.778, + "eval_wer": 0.34790004975044536, + "step": 44600 + }, + { + "epoch": 1.7603835121222837, + "eval_loss": 0.4354783296585083, + "eval_runtime": 149.8154, + "eval_samples_per_second": 37.753, + "eval_steps_per_second": 4.719, + "eval_wer": 0.3460705172441463, + "step": 44800 + }, + { + "epoch": 1.7682423670871152, + "grad_norm": 1.684985637664795, + "learning_rate": 3.0418181818181817e-05, + "loss": 0.2421, + "step": 45000 + }, + { + "epoch": 1.7682423670871152, + "eval_loss": 0.43855908513069153, + "eval_runtime": 148.5791, + "eval_samples_per_second": 38.067, + "eval_steps_per_second": 4.758, + "eval_wer": 0.34372743175362297, + "step": 45000 + }, + { + "epoch": 1.776101222051947, + "eval_loss": 0.4347515106201172, + "eval_runtime": 147.9309, + "eval_samples_per_second": 38.234, + "eval_steps_per_second": 4.779, + "eval_wer": 0.3458297892827912, + "step": 45200 + }, + { + "epoch": 1.7839600770167787, + "eval_loss": 0.43350183963775635, + "eval_runtime": 148.2161, + "eval_samples_per_second": 38.16, + "eval_steps_per_second": 4.77, + "eval_wer": 0.3435348493845388, + "step": 45400 + }, + { + "epoch": 1.7878895044991945, + "grad_norm": 2.4373562335968018, + "learning_rate": 2.7387878787878784e-05, + "loss": 0.2418, + "step": 45500 + }, + { + "epoch": 1.7918189319816102, + "eval_loss": 0.43087294697761536, + "eval_runtime": 146.7738, + "eval_samples_per_second": 38.535, + "eval_steps_per_second": 4.817, + "eval_wer": 0.3443693729839033, + "step": 45600 + }, + { + "epoch": 1.7996777869464418, + "eval_loss": 0.43208202719688416, + "eval_runtime": 147.2129, + "eval_samples_per_second": 38.421, + "eval_steps_per_second": 4.803, + "eval_wer": 0.34249169488533326, + "step": 45800 + }, + { + "epoch": 1.8075366419112735, + "grad_norm": 1.2847892045974731, + "learning_rate": 2.4357575757575755e-05, + "loss": 0.2424, + "step": 46000 + }, + { + "epoch": 1.8075366419112735, + "eval_loss": 0.42999544739723206, + "eval_runtime": 147.0735, + "eval_samples_per_second": 38.457, + "eval_steps_per_second": 4.807, + "eval_wer": 0.34075845356357626, + "step": 46000 + }, + { + "epoch": 1.8153954968761052, + "eval_loss": 0.4301421046257019, + "eval_runtime": 146.951, + "eval_samples_per_second": 38.489, + "eval_steps_per_second": 4.811, + "eval_wer": 0.34231516104700616, + "step": 46200 + }, + { + "epoch": 1.8232543518409368, + "eval_loss": 0.4339451491832733, + "eval_runtime": 146.5189, + "eval_samples_per_second": 38.603, + "eval_steps_per_second": 4.825, + "eval_wer": 0.3407424050328192, + "step": 46400 + }, + { + "epoch": 1.8271837793233527, + "grad_norm": 7.262228965759277, + "learning_rate": 2.133333333333333e-05, + "loss": 0.228, + "step": 46500 + }, + { + "epoch": 1.8311132068057683, + "eval_loss": 0.43165403604507446, + "eval_runtime": 146.7443, + "eval_samples_per_second": 38.543, + "eval_steps_per_second": 4.818, + "eval_wer": 0.3428929081542585, + "step": 46600 + }, + { + "epoch": 1.8389720617706, + "eval_loss": 0.43002423644065857, + "eval_runtime": 146.6705, + "eval_samples_per_second": 38.563, + "eval_steps_per_second": 4.82, + "eval_wer": 0.34332621848469774, + "step": 46800 + }, + { + "epoch": 1.8468309167354318, + "grad_norm": 0.922248125076294, + "learning_rate": 1.8303030303030302e-05, + "loss": 0.2532, + "step": 47000 + }, + { + "epoch": 1.8468309167354318, + "eval_loss": 0.42492908239364624, + "eval_runtime": 147.1617, + "eval_samples_per_second": 38.434, + "eval_steps_per_second": 4.804, + "eval_wer": 0.3439360626534641, + "step": 47000 + }, + { + "epoch": 1.8546897717002633, + "eval_loss": 0.42566677927970886, + "eval_runtime": 147.1363, + "eval_samples_per_second": 38.441, + "eval_steps_per_second": 4.805, + "eval_wer": 0.3430373449310716, + "step": 47200 + }, + { + "epoch": 1.8625486266650948, + "eval_loss": 0.42639264464378357, + "eval_runtime": 147.0021, + "eval_samples_per_second": 38.476, + "eval_steps_per_second": 4.809, + "eval_wer": 0.3408226476866043, + "step": 47400 + }, + { + "epoch": 1.8664780541475108, + "grad_norm": 0.7899935841560364, + "learning_rate": 1.5272727272727273e-05, + "loss": 0.2347, + "step": 47500 + }, + { + "epoch": 1.8704074816299265, + "eval_loss": 0.4254419207572937, + "eval_runtime": 146.4448, + "eval_samples_per_second": 38.622, + "eval_steps_per_second": 4.828, + "eval_wer": 0.3408868418096323, + "step": 47600 + }, + { + "epoch": 1.8782663365947583, + "eval_loss": 0.423650860786438, + "eval_runtime": 147.0702, + "eval_samples_per_second": 38.458, + "eval_steps_per_second": 4.807, + "eval_wer": 0.3391215034263613, + "step": 47800 + }, + { + "epoch": 1.8861251915595898, + "grad_norm": 1.1323833465576172, + "learning_rate": 1.2242424242424242e-05, + "loss": 0.2265, + "step": 48000 + }, + { + "epoch": 1.8861251915595898, + "eval_loss": 0.4246509373188019, + "eval_runtime": 147.0222, + "eval_samples_per_second": 38.47, + "eval_steps_per_second": 4.809, + "eval_wer": 0.33952271669528655, + "step": 48000 + }, + { + "epoch": 1.8939840465244213, + "eval_loss": 0.42534753680229187, + "eval_runtime": 146.8715, + "eval_samples_per_second": 38.51, + "eval_steps_per_second": 4.814, + "eval_wer": 0.3389128725265202, + "step": 48200 + }, + { + "epoch": 1.901842901489253, + "eval_loss": 0.4245891273021698, + "eval_runtime": 146.4129, + "eval_samples_per_second": 38.63, + "eval_steps_per_second": 4.829, + "eval_wer": 0.33902521224181925, + "step": 48400 + }, + { + "epoch": 1.9057723289716688, + "grad_norm": 2.10141658782959, + "learning_rate": 9.212121212121211e-06, + "loss": 0.2262, + "step": 48500 + }, + { + "epoch": 1.9097017564540848, + "eval_loss": 0.4226687252521515, + "eval_runtime": 147.045, + "eval_samples_per_second": 38.464, + "eval_steps_per_second": 4.808, + "eval_wer": 0.3378536694965576, + "step": 48600 + }, + { + "epoch": 1.9175606114189163, + "eval_loss": 0.4228062033653259, + "eval_runtime": 147.4189, + "eval_samples_per_second": 38.367, + "eval_steps_per_second": 4.796, + "eval_wer": 0.33892892105727723, + "step": 48800 + }, + { + "epoch": 1.9254194663837478, + "grad_norm": 0.8046126365661621, + "learning_rate": 6.181818181818182e-06, + "loss": 0.2358, + "step": 49000 + }, + { + "epoch": 1.9254194663837478, + "eval_loss": 0.4225420653820038, + "eval_runtime": 147.7497, + "eval_samples_per_second": 38.281, + "eval_steps_per_second": 4.785, + "eval_wer": 0.3391054548956043, + "step": 49000 + }, + { + "epoch": 1.9332783213485794, + "eval_loss": 0.4224160313606262, + "eval_runtime": 147.1221, + "eval_samples_per_second": 38.444, + "eval_steps_per_second": 4.806, + "eval_wer": 0.33902521224181925, + "step": 49200 + }, + { + "epoch": 1.941137176313411, + "eval_loss": 0.4214831590652466, + "eval_runtime": 147.8229, + "eval_samples_per_second": 38.262, + "eval_steps_per_second": 4.783, + "eval_wer": 0.3389931151803052, + "step": 49400 + }, + { + "epoch": 1.9450666037958269, + "grad_norm": 1.517034888267517, + "learning_rate": 3.1575757575757576e-06, + "loss": 0.231, + "step": 49500 + }, + { + "epoch": 1.9489960312782428, + "eval_loss": 0.4215412437915802, + "eval_runtime": 147.4583, + "eval_samples_per_second": 38.357, + "eval_steps_per_second": 4.795, + "eval_wer": 0.3399560270257258, + "step": 49600 + }, + { + "epoch": 1.9568548862430744, + "eval_loss": 0.4211778938770294, + "eval_runtime": 146.928, + "eval_samples_per_second": 38.495, + "eval_steps_per_second": 4.812, + "eval_wer": 0.33933013432620246, + "step": 49800 + }, + { + "epoch": 1.9647137412079059, + "grad_norm": 2.9327681064605713, + "learning_rate": 1.2727272727272726e-07, + "loss": 0.2331, + "step": 50000 + }, + { + "epoch": 1.9647137412079059, + "eval_loss": 0.4211583733558655, + "eval_runtime": 147.1945, + "eval_samples_per_second": 38.425, + "eval_steps_per_second": 4.803, + "eval_wer": 0.33939432844923045, + "step": 50000 + }, + { + "epoch": 1.9647137412079059, + "step": 50000, + "total_flos": 6.219831968409632e+19, + "train_loss": 0.4413083312988281, + "train_runtime": 56545.9703, + "train_samples_per_second": 7.074, + "train_steps_per_second": 0.884 } ], "logging_steps": 500, - "max_steps": 30000, + "max_steps": 50000, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 400, @@ -1805,7 +2985,7 @@ "attributes": {} } }, - "total_flos": 3.731985674211105e+19, + "total_flos": 6.219831968409632e+19, "train_batch_size": 8, "trial_name": null, "trial_params": null