{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 20, "global_step": 699, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 2e-05, "loss": 3.8527, "step": 1 }, { "epoch": 0.0, "learning_rate": 4e-05, "loss": 2.357, "step": 2 }, { "epoch": 0.0, "learning_rate": 6e-05, "loss": 1.6448, "step": 3 }, { "epoch": 0.01, "learning_rate": 8e-05, "loss": 3.7495, "step": 4 }, { "epoch": 0.01, "learning_rate": 0.0001, "loss": 3.6283, "step": 5 }, { "epoch": 0.01, "learning_rate": 0.00012, "loss": 2.8617, "step": 6 }, { "epoch": 0.01, "learning_rate": 0.00014, "loss": 2.1554, "step": 7 }, { "epoch": 0.01, "learning_rate": 0.00016, "loss": 1.5715, "step": 8 }, { "epoch": 0.01, "learning_rate": 0.00018, "loss": 1.162, "step": 9 }, { "epoch": 0.01, "learning_rate": 0.0002, "loss": 0.9593, "step": 10 }, { "epoch": 0.02, "learning_rate": 0.00019999988670134103, "loss": 0.8023, "step": 11 }, { "epoch": 0.02, "learning_rate": 0.00019999954680562074, "loss": 0.9134, "step": 12 }, { "epoch": 0.02, "learning_rate": 0.00019999898031360943, "loss": 0.8533, "step": 13 }, { "epoch": 0.02, "learning_rate": 0.00019999818722659068, "loss": 0.792, "step": 14 }, { "epoch": 0.02, "learning_rate": 0.00019999716754636165, "loss": 0.9903, "step": 15 }, { "epoch": 0.02, "learning_rate": 0.00019999592127523287, "loss": 0.7565, "step": 16 }, { "epoch": 0.02, "learning_rate": 0.0001999944484160284, "loss": 0.8171, "step": 17 }, { "epoch": 0.03, "learning_rate": 0.00019999274897208565, "loss": 0.9344, "step": 18 }, { "epoch": 0.03, "learning_rate": 0.00019999082294725555, "loss": 0.8733, "step": 19 }, { "epoch": 0.03, "learning_rate": 0.00019998867034590241, "loss": 0.9758, "step": 20 }, { "epoch": 0.03, "eval_loss": 0.6869800090789795, "eval_runtime": 29.3692, "eval_samples_per_second": 55.466, "eval_steps_per_second": 27.75, "step": 20 }, { "epoch": 0.03, "learning_rate": 0.00019998629117290396, "loss": 0.8391, "step": 21 }, { "epoch": 0.03, "learning_rate": 0.0001999836854336514, "loss": 1.032, "step": 22 }, { "epoch": 0.03, "learning_rate": 0.00019998085313404916, "loss": 0.9099, "step": 23 }, { "epoch": 0.03, "learning_rate": 0.00019997779428051522, "loss": 0.8684, "step": 24 }, { "epoch": 0.04, "learning_rate": 0.00019997450887998086, "loss": 0.9439, "step": 25 }, { "epoch": 0.04, "learning_rate": 0.0001999709969398907, "loss": 1.0515, "step": 26 }, { "epoch": 0.04, "learning_rate": 0.0001999672584682027, "loss": 1.1631, "step": 27 }, { "epoch": 0.04, "learning_rate": 0.00019996329347338814, "loss": 0.7389, "step": 28 }, { "epoch": 0.04, "learning_rate": 0.00019995910196443157, "loss": 0.6839, "step": 29 }, { "epoch": 0.04, "learning_rate": 0.00019995468395083088, "loss": 1.0289, "step": 30 }, { "epoch": 0.04, "learning_rate": 0.00019995003944259715, "loss": 0.9143, "step": 31 }, { "epoch": 0.05, "learning_rate": 0.00019994516845025468, "loss": 0.95, "step": 32 }, { "epoch": 0.05, "learning_rate": 0.0001999400709848411, "loss": 0.9214, "step": 33 }, { "epoch": 0.05, "learning_rate": 0.000199934747057907, "loss": 0.9471, "step": 34 }, { "epoch": 0.05, "learning_rate": 0.00019992919668151635, "loss": 0.7858, "step": 35 }, { "epoch": 0.05, "learning_rate": 0.00019992341986824612, "loss": 0.6643, "step": 36 }, { "epoch": 0.05, "learning_rate": 0.00019991741663118642, "loss": 0.7344, "step": 37 }, { "epoch": 0.05, "learning_rate": 0.00019991118698394042, "loss": 0.7909, "step": 38 }, { "epoch": 0.06, "learning_rate": 0.00019990473094062434, "loss": 0.7732, "step": 39 }, { "epoch": 0.06, "learning_rate": 0.00019989804851586743, "loss": 0.7228, "step": 40 }, { "epoch": 0.06, "eval_loss": 0.6790834069252014, "eval_runtime": 29.3451, "eval_samples_per_second": 55.512, "eval_steps_per_second": 27.773, "step": 40 }, { "epoch": 0.06, "learning_rate": 0.00019989113972481183, "loss": 0.8713, "step": 41 }, { "epoch": 0.06, "learning_rate": 0.0001998840045831127, "loss": 0.6862, "step": 42 }, { "epoch": 0.06, "learning_rate": 0.00019987664310693805, "loss": 0.7524, "step": 43 }, { "epoch": 0.06, "learning_rate": 0.00019986905531296884, "loss": 0.7405, "step": 44 }, { "epoch": 0.06, "learning_rate": 0.0001998612412183988, "loss": 0.7113, "step": 45 }, { "epoch": 0.07, "learning_rate": 0.00019985320084093443, "loss": 0.7264, "step": 46 }, { "epoch": 0.07, "learning_rate": 0.00019984493419879503, "loss": 0.717, "step": 47 }, { "epoch": 0.07, "learning_rate": 0.00019983644131071256, "loss": 0.9136, "step": 48 }, { "epoch": 0.07, "learning_rate": 0.00019982772219593172, "loss": 0.662, "step": 49 }, { "epoch": 0.07, "learning_rate": 0.00019981877687420975, "loss": 0.6811, "step": 50 }, { "epoch": 0.07, "learning_rate": 0.00019980960536581654, "loss": 0.7089, "step": 51 }, { "epoch": 0.07, "learning_rate": 0.0001998002076915345, "loss": 0.774, "step": 52 }, { "epoch": 0.08, "learning_rate": 0.00019979058387265843, "loss": 0.9975, "step": 53 }, { "epoch": 0.08, "learning_rate": 0.0001997807339309957, "loss": 0.8504, "step": 54 }, { "epoch": 0.08, "learning_rate": 0.00019977065788886602, "loss": 0.7098, "step": 55 }, { "epoch": 0.08, "learning_rate": 0.0001997603557691014, "loss": 0.6549, "step": 56 }, { "epoch": 0.08, "learning_rate": 0.00019974982759504625, "loss": 0.6731, "step": 57 }, { "epoch": 0.08, "learning_rate": 0.000199739073390557, "loss": 0.8107, "step": 58 }, { "epoch": 0.08, "learning_rate": 0.00019972809318000246, "loss": 0.7196, "step": 59 }, { "epoch": 0.09, "learning_rate": 0.00019971688698826353, "loss": 0.6804, "step": 60 }, { "epoch": 0.09, "eval_loss": 0.6613275408744812, "eval_runtime": 29.3333, "eval_samples_per_second": 55.534, "eval_steps_per_second": 27.784, "step": 60 }, { "epoch": 0.09, "learning_rate": 0.00019970545484073306, "loss": 0.7939, "step": 61 }, { "epoch": 0.09, "learning_rate": 0.00019969379676331602, "loss": 0.7578, "step": 62 }, { "epoch": 0.09, "learning_rate": 0.00019968191278242934, "loss": 0.792, "step": 63 }, { "epoch": 0.09, "learning_rate": 0.00019966980292500174, "loss": 0.9361, "step": 64 }, { "epoch": 0.09, "learning_rate": 0.00019965746721847387, "loss": 0.7841, "step": 65 }, { "epoch": 0.09, "learning_rate": 0.00019964490569079812, "loss": 0.7706, "step": 66 }, { "epoch": 0.1, "learning_rate": 0.00019963211837043852, "loss": 0.8221, "step": 67 }, { "epoch": 0.1, "learning_rate": 0.00019961910528637088, "loss": 0.6315, "step": 68 }, { "epoch": 0.1, "learning_rate": 0.0001996058664680824, "loss": 0.6358, "step": 69 }, { "epoch": 0.1, "learning_rate": 0.00019959240194557197, "loss": 0.7126, "step": 70 }, { "epoch": 0.1, "learning_rate": 0.00019957871174934978, "loss": 0.5788, "step": 71 }, { "epoch": 0.1, "learning_rate": 0.00019956479591043752, "loss": 0.7683, "step": 72 }, { "epoch": 0.1, "learning_rate": 0.000199550654460368, "loss": 0.7092, "step": 73 }, { "epoch": 0.11, "learning_rate": 0.00019953628743118546, "loss": 0.7995, "step": 74 }, { "epoch": 0.11, "learning_rate": 0.00019952169485544516, "loss": 0.6204, "step": 75 }, { "epoch": 0.11, "learning_rate": 0.00019950687676621352, "loss": 0.6226, "step": 76 }, { "epoch": 0.11, "learning_rate": 0.0001994918331970679, "loss": 0.5133, "step": 77 }, { "epoch": 0.11, "learning_rate": 0.0001994765641820966, "loss": 0.9482, "step": 78 }, { "epoch": 0.11, "learning_rate": 0.00019946106975589884, "loss": 0.7825, "step": 79 }, { "epoch": 0.11, "learning_rate": 0.00019944534995358458, "loss": 0.8117, "step": 80 }, { "epoch": 0.11, "eval_loss": 0.6359772682189941, "eval_runtime": 29.2458, "eval_samples_per_second": 55.7, "eval_steps_per_second": 27.867, "step": 80 }, { "epoch": 0.12, "learning_rate": 0.00019942940481077446, "loss": 0.844, "step": 81 }, { "epoch": 0.12, "learning_rate": 0.00019941323436359972, "loss": 0.766, "step": 82 }, { "epoch": 0.12, "learning_rate": 0.00019939683864870217, "loss": 0.7071, "step": 83 }, { "epoch": 0.12, "learning_rate": 0.0001993802177032341, "loss": 0.7645, "step": 84 }, { "epoch": 0.12, "learning_rate": 0.0001993633715648581, "loss": 0.9787, "step": 85 }, { "epoch": 0.12, "learning_rate": 0.00019934630027174707, "loss": 0.6916, "step": 86 }, { "epoch": 0.12, "learning_rate": 0.00019932900386258407, "loss": 0.6979, "step": 87 }, { "epoch": 0.13, "learning_rate": 0.0001993114823765623, "loss": 0.688, "step": 88 }, { "epoch": 0.13, "learning_rate": 0.00019929373585338508, "loss": 0.8155, "step": 89 }, { "epoch": 0.13, "learning_rate": 0.00019927576433326544, "loss": 0.6204, "step": 90 }, { "epoch": 0.13, "learning_rate": 0.0001992575678569264, "loss": 0.8641, "step": 91 }, { "epoch": 0.13, "learning_rate": 0.0001992391464656007, "loss": 0.758, "step": 92 }, { "epoch": 0.13, "learning_rate": 0.0001992205002010307, "loss": 0.6035, "step": 93 }, { "epoch": 0.13, "learning_rate": 0.00019920162910546833, "loss": 0.857, "step": 94 }, { "epoch": 0.14, "learning_rate": 0.00019918253322167502, "loss": 0.6189, "step": 95 }, { "epoch": 0.14, "learning_rate": 0.00019916321259292152, "loss": 0.8429, "step": 96 }, { "epoch": 0.14, "learning_rate": 0.00019914366726298782, "loss": 0.9827, "step": 97 }, { "epoch": 0.14, "learning_rate": 0.00019912389727616314, "loss": 0.9278, "step": 98 }, { "epoch": 0.14, "learning_rate": 0.00019910390267724576, "loss": 0.6625, "step": 99 }, { "epoch": 0.14, "learning_rate": 0.0001990836835115429, "loss": 0.6458, "step": 100 }, { "epoch": 0.14, "eval_loss": 0.6334865093231201, "eval_runtime": 29.3599, "eval_samples_per_second": 55.484, "eval_steps_per_second": 27.759, "step": 100 }, { "epoch": 0.14, "learning_rate": 0.0001990632398248706, "loss": 0.6327, "step": 101 }, { "epoch": 0.15, "learning_rate": 0.00019904257166355376, "loss": 0.719, "step": 102 }, { "epoch": 0.15, "learning_rate": 0.00019902167907442583, "loss": 0.6946, "step": 103 }, { "epoch": 0.15, "learning_rate": 0.00019900056210482892, "loss": 0.6667, "step": 104 }, { "epoch": 0.15, "learning_rate": 0.00019897922080261345, "loss": 0.6793, "step": 105 }, { "epoch": 0.15, "learning_rate": 0.0001989576552161383, "loss": 0.7095, "step": 106 }, { "epoch": 0.15, "learning_rate": 0.00019893586539427048, "loss": 0.6903, "step": 107 }, { "epoch": 0.15, "learning_rate": 0.0001989138513863851, "loss": 0.9088, "step": 108 }, { "epoch": 0.16, "learning_rate": 0.0001988916132423654, "loss": 0.7673, "step": 109 }, { "epoch": 0.16, "learning_rate": 0.00019886915101260234, "loss": 0.671, "step": 110 }, { "epoch": 0.16, "learning_rate": 0.00019884646474799475, "loss": 0.6958, "step": 111 }, { "epoch": 0.16, "learning_rate": 0.00019882355449994915, "loss": 0.6109, "step": 112 }, { "epoch": 0.16, "learning_rate": 0.00019880042032037947, "loss": 0.7404, "step": 113 }, { "epoch": 0.16, "learning_rate": 0.0001987770622617072, "loss": 0.7017, "step": 114 }, { "epoch": 0.16, "learning_rate": 0.00019875348037686106, "loss": 0.8272, "step": 115 }, { "epoch": 0.17, "learning_rate": 0.00019872967471927692, "loss": 0.8391, "step": 116 }, { "epoch": 0.17, "learning_rate": 0.00019870564534289783, "loss": 0.6695, "step": 117 }, { "epoch": 0.17, "learning_rate": 0.0001986813923021737, "loss": 0.7992, "step": 118 }, { "epoch": 0.17, "learning_rate": 0.00019865691565206122, "loss": 0.7508, "step": 119 }, { "epoch": 0.17, "learning_rate": 0.00019863221544802386, "loss": 0.7509, "step": 120 }, { "epoch": 0.17, "eval_loss": 0.6245399713516235, "eval_runtime": 29.3529, "eval_samples_per_second": 55.497, "eval_steps_per_second": 27.766, "step": 120 }, { "epoch": 0.17, "learning_rate": 0.00019860729174603163, "loss": 0.6813, "step": 121 }, { "epoch": 0.17, "learning_rate": 0.00019858214460256095, "loss": 0.6432, "step": 122 }, { "epoch": 0.18, "learning_rate": 0.00019855677407459458, "loss": 0.672, "step": 123 }, { "epoch": 0.18, "learning_rate": 0.00019853118021962148, "loss": 0.7951, "step": 124 }, { "epoch": 0.18, "learning_rate": 0.00019850536309563656, "loss": 0.7095, "step": 125 }, { "epoch": 0.18, "learning_rate": 0.00019847932276114083, "loss": 0.6037, "step": 126 }, { "epoch": 0.18, "learning_rate": 0.00019845305927514094, "loss": 0.5812, "step": 127 }, { "epoch": 0.18, "learning_rate": 0.00019842657269714923, "loss": 0.6124, "step": 128 }, { "epoch": 0.18, "learning_rate": 0.0001983998630871836, "loss": 0.6501, "step": 129 }, { "epoch": 0.19, "learning_rate": 0.0001983729305057673, "loss": 0.9065, "step": 130 }, { "epoch": 0.19, "learning_rate": 0.00019834577501392885, "loss": 0.8046, "step": 131 }, { "epoch": 0.19, "learning_rate": 0.00019831839667320183, "loss": 0.904, "step": 132 }, { "epoch": 0.19, "learning_rate": 0.00019829079554562487, "loss": 0.5364, "step": 133 }, { "epoch": 0.19, "learning_rate": 0.0001982629716937414, "loss": 0.4704, "step": 134 }, { "epoch": 0.19, "learning_rate": 0.00019823492518059946, "loss": 0.6457, "step": 135 }, { "epoch": 0.19, "learning_rate": 0.00019820665606975175, "loss": 0.752, "step": 136 }, { "epoch": 0.2, "learning_rate": 0.00019817816442525526, "loss": 0.7523, "step": 137 }, { "epoch": 0.2, "learning_rate": 0.00019814945031167134, "loss": 0.675, "step": 138 }, { "epoch": 0.2, "learning_rate": 0.0001981205137940654, "loss": 0.7782, "step": 139 }, { "epoch": 0.2, "learning_rate": 0.00019809135493800679, "loss": 0.6174, "step": 140 }, { "epoch": 0.2, "eval_loss": 0.6312826871871948, "eval_runtime": 29.36, "eval_samples_per_second": 55.484, "eval_steps_per_second": 27.759, "step": 140 }, { "epoch": 0.2, "learning_rate": 0.00019806197380956874, "loss": 0.5355, "step": 141 }, { "epoch": 0.2, "learning_rate": 0.00019803237047532802, "loss": 0.5741, "step": 142 }, { "epoch": 0.2, "learning_rate": 0.00019800254500236503, "loss": 0.7357, "step": 143 }, { "epoch": 0.21, "learning_rate": 0.00019797249745826353, "loss": 0.6641, "step": 144 }, { "epoch": 0.21, "learning_rate": 0.0001979422279111104, "loss": 0.5884, "step": 145 }, { "epoch": 0.21, "learning_rate": 0.00019791173642949564, "loss": 0.4902, "step": 146 }, { "epoch": 0.21, "learning_rate": 0.00019788102308251212, "loss": 0.6585, "step": 147 }, { "epoch": 0.21, "learning_rate": 0.00019785008793975548, "loss": 0.6445, "step": 148 }, { "epoch": 0.21, "learning_rate": 0.00019781893107132393, "loss": 0.8865, "step": 149 }, { "epoch": 0.21, "learning_rate": 0.00019778755254781807, "loss": 0.6996, "step": 150 }, { "epoch": 0.22, "learning_rate": 0.00019775595244034077, "loss": 0.5696, "step": 151 }, { "epoch": 0.22, "learning_rate": 0.00019772413082049713, "loss": 0.6353, "step": 152 }, { "epoch": 0.22, "learning_rate": 0.00019769208776039397, "loss": 0.6547, "step": 153 }, { "epoch": 0.22, "learning_rate": 0.00019765982333264006, "loss": 1.059, "step": 154 }, { "epoch": 0.22, "learning_rate": 0.0001976273376103457, "loss": 0.6234, "step": 155 }, { "epoch": 0.22, "learning_rate": 0.0001975946306671227, "loss": 0.5276, "step": 156 }, { "epoch": 0.22, "learning_rate": 0.00019756170257708412, "loss": 0.6942, "step": 157 }, { "epoch": 0.23, "learning_rate": 0.0001975285534148441, "loss": 0.7216, "step": 158 }, { "epoch": 0.23, "learning_rate": 0.00019749518325551778, "loss": 0.6563, "step": 159 }, { "epoch": 0.23, "learning_rate": 0.00019746159217472097, "loss": 0.7549, "step": 160 }, { "epoch": 0.23, "eval_loss": 0.6180239319801331, "eval_runtime": 29.3051, "eval_samples_per_second": 55.588, "eval_steps_per_second": 27.811, "step": 160 }, { "epoch": 0.23, "learning_rate": 0.00019742778024857028, "loss": 0.6399, "step": 161 }, { "epoch": 0.23, "learning_rate": 0.00019739374755368253, "loss": 0.6149, "step": 162 }, { "epoch": 0.23, "learning_rate": 0.00019735949416717493, "loss": 0.5183, "step": 163 }, { "epoch": 0.23, "learning_rate": 0.00019732502016666476, "loss": 0.5566, "step": 164 }, { "epoch": 0.24, "learning_rate": 0.00019729032563026914, "loss": 0.7261, "step": 165 }, { "epoch": 0.24, "learning_rate": 0.00019725541063660498, "loss": 0.7432, "step": 166 }, { "epoch": 0.24, "learning_rate": 0.0001972202752647887, "loss": 0.6235, "step": 167 }, { "epoch": 0.24, "learning_rate": 0.00019718491959443615, "loss": 0.8306, "step": 168 }, { "epoch": 0.24, "learning_rate": 0.00019714934370566227, "loss": 0.7739, "step": 169 }, { "epoch": 0.24, "learning_rate": 0.00019711354767908113, "loss": 0.5514, "step": 170 }, { "epoch": 0.24, "learning_rate": 0.0001970775315958055, "loss": 0.6287, "step": 171 }, { "epoch": 0.25, "learning_rate": 0.00019704129553744696, "loss": 0.5172, "step": 172 }, { "epoch": 0.25, "learning_rate": 0.00019700483958611534, "loss": 0.5628, "step": 173 }, { "epoch": 0.25, "learning_rate": 0.0001969681638244189, "loss": 0.5371, "step": 174 }, { "epoch": 0.25, "learning_rate": 0.00019693126833546392, "loss": 0.5714, "step": 175 }, { "epoch": 0.25, "learning_rate": 0.0001968941532028546, "loss": 0.7978, "step": 176 }, { "epoch": 0.25, "learning_rate": 0.00019685681851069285, "loss": 0.7627, "step": 177 }, { "epoch": 0.25, "learning_rate": 0.00019681926434357802, "loss": 0.5672, "step": 178 }, { "epoch": 0.26, "learning_rate": 0.00019678149078660692, "loss": 0.639, "step": 179 }, { "epoch": 0.26, "learning_rate": 0.00019674349792537336, "loss": 0.6015, "step": 180 }, { "epoch": 0.26, "eval_loss": 0.616681694984436, "eval_runtime": 29.315, "eval_samples_per_second": 55.569, "eval_steps_per_second": 27.801, "step": 180 }, { "epoch": 0.26, "learning_rate": 0.0001967052858459682, "loss": 0.564, "step": 181 }, { "epoch": 0.26, "learning_rate": 0.0001966668546349789, "loss": 0.6838, "step": 182 }, { "epoch": 0.26, "learning_rate": 0.00019662820437948967, "loss": 0.6492, "step": 183 }, { "epoch": 0.26, "learning_rate": 0.00019658933516708085, "loss": 0.7017, "step": 184 }, { "epoch": 0.26, "learning_rate": 0.00019655024708582904, "loss": 0.7092, "step": 185 }, { "epoch": 0.27, "learning_rate": 0.00019651094022430686, "loss": 0.4604, "step": 186 }, { "epoch": 0.27, "learning_rate": 0.0001964714146715826, "loss": 0.6142, "step": 187 }, { "epoch": 0.27, "learning_rate": 0.00019643167051722, "loss": 0.6881, "step": 188 }, { "epoch": 0.27, "learning_rate": 0.00019639170785127835, "loss": 0.5784, "step": 189 }, { "epoch": 0.27, "learning_rate": 0.0001963515267643119, "loss": 0.8617, "step": 190 }, { "epoch": 0.27, "learning_rate": 0.00019631112734737, "loss": 0.7697, "step": 191 }, { "epoch": 0.27, "learning_rate": 0.00019627050969199655, "loss": 0.5587, "step": 192 }, { "epoch": 0.28, "learning_rate": 0.00019622967389023015, "loss": 0.5915, "step": 193 }, { "epoch": 0.28, "learning_rate": 0.0001961886200346036, "loss": 0.9243, "step": 194 }, { "epoch": 0.28, "learning_rate": 0.00019614734821814383, "loss": 0.7143, "step": 195 }, { "epoch": 0.28, "learning_rate": 0.00019610585853437165, "loss": 0.9429, "step": 196 }, { "epoch": 0.28, "learning_rate": 0.00019606415107730165, "loss": 0.5176, "step": 197 }, { "epoch": 0.28, "learning_rate": 0.0001960222259414417, "loss": 0.5437, "step": 198 }, { "epoch": 0.28, "learning_rate": 0.00019598008322179312, "loss": 0.6687, "step": 199 }, { "epoch": 0.29, "learning_rate": 0.00019593772301385016, "loss": 0.716, "step": 200 }, { "epoch": 0.29, "eval_loss": 0.6165403127670288, "eval_runtime": 29.3359, "eval_samples_per_second": 55.529, "eval_steps_per_second": 27.782, "step": 200 }, { "epoch": 0.29, "learning_rate": 0.0001958951454135999, "loss": 0.6579, "step": 201 }, { "epoch": 0.29, "learning_rate": 0.000195852350517522, "loss": 0.6, "step": 202 }, { "epoch": 0.29, "learning_rate": 0.00019580933842258867, "loss": 0.6689, "step": 203 }, { "epoch": 0.29, "learning_rate": 0.00019576610922626402, "loss": 0.679, "step": 204 }, { "epoch": 0.29, "learning_rate": 0.00019572266302650434, "loss": 0.8533, "step": 205 }, { "epoch": 0.29, "learning_rate": 0.00019567899992175753, "loss": 0.5989, "step": 206 }, { "epoch": 0.3, "learning_rate": 0.000195635120010963, "loss": 0.5986, "step": 207 }, { "epoch": 0.3, "learning_rate": 0.00019559102339355148, "loss": 0.7082, "step": 208 }, { "epoch": 0.3, "learning_rate": 0.00019554671016944474, "loss": 0.6403, "step": 209 }, { "epoch": 0.3, "learning_rate": 0.00019550218043905526, "loss": 0.5886, "step": 210 }, { "epoch": 0.3, "learning_rate": 0.00019545743430328632, "loss": 0.5391, "step": 211 }, { "epoch": 0.3, "learning_rate": 0.0001954124718635314, "loss": 0.6303, "step": 212 }, { "epoch": 0.3, "learning_rate": 0.0001953672932216742, "loss": 0.7248, "step": 213 }, { "epoch": 0.31, "learning_rate": 0.00019532189848008833, "loss": 0.6166, "step": 214 }, { "epoch": 0.31, "learning_rate": 0.00019527628774163705, "loss": 0.5582, "step": 215 }, { "epoch": 0.31, "learning_rate": 0.00019523046110967305, "loss": 0.6682, "step": 216 }, { "epoch": 0.31, "learning_rate": 0.00019518441868803828, "loss": 0.6317, "step": 217 }, { "epoch": 0.31, "learning_rate": 0.0001951381605810636, "loss": 0.7702, "step": 218 }, { "epoch": 0.31, "learning_rate": 0.00019509168689356866, "loss": 0.584, "step": 219 }, { "epoch": 0.31, "learning_rate": 0.0001950449977308616, "loss": 0.6304, "step": 220 }, { "epoch": 0.31, "eval_loss": 0.6014450192451477, "eval_runtime": 29.301, "eval_samples_per_second": 55.595, "eval_steps_per_second": 27.815, "step": 220 }, { "epoch": 0.32, "learning_rate": 0.00019499809319873873, "loss": 0.5839, "step": 221 }, { "epoch": 0.32, "learning_rate": 0.00019495097340348458, "loss": 0.589, "step": 222 }, { "epoch": 0.32, "learning_rate": 0.00019490363845187125, "loss": 0.8189, "step": 223 }, { "epoch": 0.32, "learning_rate": 0.00019485608845115854, "loss": 0.5217, "step": 224 }, { "epoch": 0.32, "learning_rate": 0.00019480832350909344, "loss": 0.6624, "step": 225 }, { "epoch": 0.32, "learning_rate": 0.00019476034373391005, "loss": 0.9488, "step": 226 }, { "epoch": 0.32, "learning_rate": 0.0001947121492343292, "loss": 0.7354, "step": 227 }, { "epoch": 0.33, "learning_rate": 0.0001946637401195584, "loss": 0.6767, "step": 228 }, { "epoch": 0.33, "learning_rate": 0.00019461511649929137, "loss": 0.6337, "step": 229 }, { "epoch": 0.33, "learning_rate": 0.00019456627848370793, "loss": 0.436, "step": 230 }, { "epoch": 0.33, "learning_rate": 0.0001945172261834737, "loss": 0.4473, "step": 231 }, { "epoch": 0.33, "learning_rate": 0.00019446795970973993, "loss": 0.5208, "step": 232 }, { "epoch": 0.33, "learning_rate": 0.00019441847917414307, "loss": 0.5907, "step": 233 }, { "epoch": 0.33, "learning_rate": 0.0001943687846888047, "loss": 0.6171, "step": 234 }, { "epoch": 0.34, "learning_rate": 0.00019431887636633125, "loss": 0.7125, "step": 235 }, { "epoch": 0.34, "learning_rate": 0.00019426875431981355, "loss": 0.46, "step": 236 }, { "epoch": 0.34, "learning_rate": 0.00019421841866282686, "loss": 0.71, "step": 237 }, { "epoch": 0.34, "learning_rate": 0.00019416786950943044, "loss": 0.6707, "step": 238 }, { "epoch": 0.34, "learning_rate": 0.00019411710697416726, "loss": 0.4639, "step": 239 }, { "epoch": 0.34, "learning_rate": 0.00019406613117206397, "loss": 0.5781, "step": 240 }, { "epoch": 0.34, "eval_loss": 0.6106613874435425, "eval_runtime": 29.3327, "eval_samples_per_second": 55.535, "eval_steps_per_second": 27.785, "step": 240 }, { "epoch": 0.34, "learning_rate": 0.00019401494221863024, "loss": 0.5778, "step": 241 }, { "epoch": 0.35, "learning_rate": 0.00019396354022985896, "loss": 0.6418, "step": 242 }, { "epoch": 0.35, "learning_rate": 0.0001939119253222256, "loss": 0.4679, "step": 243 }, { "epoch": 0.35, "learning_rate": 0.00019386009761268821, "loss": 0.6215, "step": 244 }, { "epoch": 0.35, "learning_rate": 0.00019380805721868694, "loss": 0.5621, "step": 245 }, { "epoch": 0.35, "learning_rate": 0.00019375580425814396, "loss": 0.5727, "step": 246 }, { "epoch": 0.35, "learning_rate": 0.00019370333884946307, "loss": 0.7134, "step": 247 }, { "epoch": 0.35, "learning_rate": 0.0001936506611115295, "loss": 0.6081, "step": 248 }, { "epoch": 0.36, "learning_rate": 0.00019359777116370955, "loss": 0.9212, "step": 249 }, { "epoch": 0.36, "learning_rate": 0.0001935446691258504, "loss": 0.5593, "step": 250 }, { "epoch": 0.36, "learning_rate": 0.00019349135511827995, "loss": 0.6382, "step": 251 }, { "epoch": 0.36, "learning_rate": 0.0001934378292618062, "loss": 0.6483, "step": 252 }, { "epoch": 0.36, "learning_rate": 0.00019338409167771734, "loss": 0.5854, "step": 253 }, { "epoch": 0.36, "learning_rate": 0.00019333014248778133, "loss": 0.6317, "step": 254 }, { "epoch": 0.36, "learning_rate": 0.00019327598181424557, "loss": 0.556, "step": 255 }, { "epoch": 0.37, "learning_rate": 0.0001932216097798367, "loss": 0.7319, "step": 256 }, { "epoch": 0.37, "learning_rate": 0.0001931670265077602, "loss": 0.6195, "step": 257 }, { "epoch": 0.37, "learning_rate": 0.00019311223212170045, "loss": 0.5316, "step": 258 }, { "epoch": 0.37, "learning_rate": 0.00019305722674581996, "loss": 0.753, "step": 259 }, { "epoch": 0.37, "learning_rate": 0.00019300201050475948, "loss": 0.8, "step": 260 }, { "epoch": 0.37, "eval_loss": 0.594850480556488, "eval_runtime": 29.3304, "eval_samples_per_second": 55.54, "eval_steps_per_second": 27.787, "step": 260 }, { "epoch": 0.37, "learning_rate": 0.0001929465835236375, "loss": 0.5674, "step": 261 }, { "epoch": 0.37, "learning_rate": 0.00019289094592805011, "loss": 0.5302, "step": 262 }, { "epoch": 0.38, "learning_rate": 0.00019283509784407058, "loss": 0.621, "step": 263 }, { "epoch": 0.38, "learning_rate": 0.00019277903939824914, "loss": 0.6329, "step": 264 }, { "epoch": 0.38, "learning_rate": 0.00019272277071761282, "loss": 0.5637, "step": 265 }, { "epoch": 0.38, "learning_rate": 0.00019266629192966485, "loss": 0.5367, "step": 266 }, { "epoch": 0.38, "learning_rate": 0.00019260960316238467, "loss": 0.6705, "step": 267 }, { "epoch": 0.38, "learning_rate": 0.00019255270454422756, "loss": 0.7982, "step": 268 }, { "epoch": 0.38, "learning_rate": 0.00019249559620412418, "loss": 0.5728, "step": 269 }, { "epoch": 0.39, "learning_rate": 0.00019243827827148055, "loss": 0.5841, "step": 270 }, { "epoch": 0.39, "learning_rate": 0.00019238075087617759, "loss": 0.7056, "step": 271 }, { "epoch": 0.39, "learning_rate": 0.00019232301414857074, "loss": 0.6554, "step": 272 }, { "epoch": 0.39, "learning_rate": 0.00019226506821948998, "loss": 0.6277, "step": 273 }, { "epoch": 0.39, "learning_rate": 0.00019220691322023917, "loss": 0.5815, "step": 274 }, { "epoch": 0.39, "learning_rate": 0.00019214854928259603, "loss": 0.722, "step": 275 }, { "epoch": 0.39, "learning_rate": 0.00019208997653881164, "loss": 0.6855, "step": 276 }, { "epoch": 0.4, "learning_rate": 0.00019203119512161023, "loss": 0.532, "step": 277 }, { "epoch": 0.4, "learning_rate": 0.00019197220516418902, "loss": 0.5008, "step": 278 }, { "epoch": 0.4, "learning_rate": 0.00019191300680021755, "loss": 0.5297, "step": 279 }, { "epoch": 0.4, "learning_rate": 0.0001918536001638378, "loss": 0.6845, "step": 280 }, { "epoch": 0.4, "eval_loss": 0.595288097858429, "eval_runtime": 29.2376, "eval_samples_per_second": 55.716, "eval_steps_per_second": 27.875, "step": 280 }, { "epoch": 0.4, "learning_rate": 0.00019179398538966358, "loss": 0.6998, "step": 281 }, { "epoch": 0.4, "learning_rate": 0.00019173416261278044, "loss": 0.8184, "step": 282 }, { "epoch": 0.4, "learning_rate": 0.0001916741319687451, "loss": 0.4835, "step": 283 }, { "epoch": 0.41, "learning_rate": 0.0001916138935935854, "loss": 0.7273, "step": 284 }, { "epoch": 0.41, "learning_rate": 0.00019155344762379994, "loss": 0.854, "step": 285 }, { "epoch": 0.41, "learning_rate": 0.0001914927941963576, "loss": 0.6895, "step": 286 }, { "epoch": 0.41, "learning_rate": 0.0001914319334486975, "loss": 0.6412, "step": 287 }, { "epoch": 0.41, "learning_rate": 0.0001913708655187284, "loss": 0.5484, "step": 288 }, { "epoch": 0.41, "learning_rate": 0.00019130959054482858, "loss": 0.5316, "step": 289 }, { "epoch": 0.41, "learning_rate": 0.00019124810866584554, "loss": 0.5696, "step": 290 }, { "epoch": 0.42, "learning_rate": 0.00019118642002109552, "loss": 0.6214, "step": 291 }, { "epoch": 0.42, "learning_rate": 0.00019112452475036337, "loss": 0.5533, "step": 292 }, { "epoch": 0.42, "learning_rate": 0.00019106242299390212, "loss": 0.4784, "step": 293 }, { "epoch": 0.42, "learning_rate": 0.00019100011489243263, "loss": 0.856, "step": 294 }, { "epoch": 0.42, "learning_rate": 0.00019093760058714346, "loss": 0.6587, "step": 295 }, { "epoch": 0.42, "learning_rate": 0.00019087488021969027, "loss": 0.6849, "step": 296 }, { "epoch": 0.42, "learning_rate": 0.0001908119539321958, "loss": 0.5236, "step": 297 }, { "epoch": 0.43, "learning_rate": 0.00019074882186724928, "loss": 0.584, "step": 298 }, { "epoch": 0.43, "learning_rate": 0.0001906854841679063, "loss": 0.5225, "step": 299 }, { "epoch": 0.43, "learning_rate": 0.0001906219409776884, "loss": 0.5857, "step": 300 }, { "epoch": 0.43, "eval_loss": 0.5940439701080322, "eval_runtime": 29.3378, "eval_samples_per_second": 55.526, "eval_steps_per_second": 27.78, "step": 300 }, { "epoch": 0.43, "learning_rate": 0.00019055819244058272, "loss": 0.9588, "step": 301 }, { "epoch": 0.43, "learning_rate": 0.00019049423870104174, "loss": 0.5345, "step": 302 }, { "epoch": 0.43, "learning_rate": 0.00019043007990398293, "loss": 0.827, "step": 303 }, { "epoch": 0.43, "learning_rate": 0.0001903657161947884, "loss": 0.53, "step": 304 }, { "epoch": 0.44, "learning_rate": 0.0001903011477193046, "loss": 0.469, "step": 305 }, { "epoch": 0.44, "learning_rate": 0.00019023637462384194, "loss": 0.6566, "step": 306 }, { "epoch": 0.44, "learning_rate": 0.00019017139705517454, "loss": 0.5136, "step": 307 }, { "epoch": 0.44, "learning_rate": 0.00019010621516053977, "loss": 0.6217, "step": 308 }, { "epoch": 0.44, "learning_rate": 0.00019004082908763813, "loss": 0.5953, "step": 309 }, { "epoch": 0.44, "learning_rate": 0.00018997523898463267, "loss": 0.6747, "step": 310 }, { "epoch": 0.44, "learning_rate": 0.00018990944500014883, "loss": 0.5683, "step": 311 }, { "epoch": 0.45, "learning_rate": 0.00018984344728327395, "loss": 0.8048, "step": 312 }, { "epoch": 0.45, "learning_rate": 0.00018977724598355717, "loss": 0.5998, "step": 313 }, { "epoch": 0.45, "learning_rate": 0.00018971084125100882, "loss": 0.6533, "step": 314 }, { "epoch": 0.45, "learning_rate": 0.00018964423323610026, "loss": 0.5986, "step": 315 }, { "epoch": 0.45, "learning_rate": 0.00018957742208976344, "loss": 0.5504, "step": 316 }, { "epoch": 0.45, "learning_rate": 0.0001895104079633906, "loss": 0.6991, "step": 317 }, { "epoch": 0.45, "learning_rate": 0.00018944319100883404, "loss": 0.566, "step": 318 }, { "epoch": 0.46, "learning_rate": 0.0001893757713784055, "loss": 0.5838, "step": 319 }, { "epoch": 0.46, "learning_rate": 0.00018930814922487607, "loss": 0.6369, "step": 320 }, { "epoch": 0.46, "eval_loss": 0.5889019966125488, "eval_runtime": 29.353, "eval_samples_per_second": 55.497, "eval_steps_per_second": 27.765, "step": 320 }, { "epoch": 0.46, "learning_rate": 0.00018924032470147575, "loss": 0.6911, "step": 321 }, { "epoch": 0.46, "learning_rate": 0.0001891722979618931, "loss": 0.839, "step": 322 }, { "epoch": 0.46, "learning_rate": 0.0001891040691602749, "loss": 0.664, "step": 323 }, { "epoch": 0.46, "learning_rate": 0.0001890356384512257, "loss": 0.7154, "step": 324 }, { "epoch": 0.46, "learning_rate": 0.00018896700598980775, "loss": 0.6249, "step": 325 }, { "epoch": 0.47, "learning_rate": 0.0001888981719315403, "loss": 0.6035, "step": 326 }, { "epoch": 0.47, "learning_rate": 0.00018882913643239953, "loss": 0.542, "step": 327 }, { "epoch": 0.47, "learning_rate": 0.00018875989964881797, "loss": 0.7356, "step": 328 }, { "epoch": 0.47, "learning_rate": 0.0001886904617376844, "loss": 0.5376, "step": 329 }, { "epoch": 0.47, "learning_rate": 0.0001886208228563432, "loss": 0.5325, "step": 330 }, { "epoch": 0.47, "learning_rate": 0.0001885509831625942, "loss": 0.4952, "step": 331 }, { "epoch": 0.47, "learning_rate": 0.0001884809428146923, "loss": 0.5887, "step": 332 }, { "epoch": 0.48, "learning_rate": 0.00018841070197134706, "loss": 0.5159, "step": 333 }, { "epoch": 0.48, "learning_rate": 0.00018834026079172237, "loss": 0.6428, "step": 334 }, { "epoch": 0.48, "learning_rate": 0.00018826961943543594, "loss": 0.624, "step": 335 }, { "epoch": 0.48, "learning_rate": 0.00018819877806255933, "loss": 0.7188, "step": 336 }, { "epoch": 0.48, "learning_rate": 0.00018812773683361708, "loss": 0.4654, "step": 337 }, { "epoch": 0.48, "learning_rate": 0.00018805649590958678, "loss": 0.7584, "step": 338 }, { "epoch": 0.48, "learning_rate": 0.00018798505545189844, "loss": 0.5568, "step": 339 }, { "epoch": 0.49, "learning_rate": 0.00018791341562243418, "loss": 0.4767, "step": 340 }, { "epoch": 0.49, "eval_loss": 0.5945929288864136, "eval_runtime": 29.3121, "eval_samples_per_second": 55.574, "eval_steps_per_second": 27.804, "step": 340 }, { "epoch": 0.49, "learning_rate": 0.000187841576583528, "loss": 0.6751, "step": 341 }, { "epoch": 0.49, "learning_rate": 0.00018776953849796514, "loss": 0.4626, "step": 342 }, { "epoch": 0.49, "learning_rate": 0.00018769730152898208, "loss": 0.7323, "step": 343 }, { "epoch": 0.49, "learning_rate": 0.00018762486584026578, "loss": 0.4619, "step": 344 }, { "epoch": 0.49, "learning_rate": 0.0001875522315959536, "loss": 0.6054, "step": 345 }, { "epoch": 0.49, "learning_rate": 0.00018747939896063276, "loss": 0.6465, "step": 346 }, { "epoch": 0.5, "learning_rate": 0.00018740636809934009, "loss": 0.8281, "step": 347 }, { "epoch": 0.5, "learning_rate": 0.00018733313917756154, "loss": 0.6565, "step": 348 }, { "epoch": 0.5, "learning_rate": 0.0001872597123612319, "loss": 0.6819, "step": 349 }, { "epoch": 0.5, "learning_rate": 0.0001871860878167344, "loss": 0.7214, "step": 350 }, { "epoch": 0.5, "learning_rate": 0.0001871122657109002, "loss": 0.6414, "step": 351 }, { "epoch": 0.5, "learning_rate": 0.00018703824621100825, "loss": 0.3563, "step": 352 }, { "epoch": 0.51, "learning_rate": 0.00018696402948478475, "loss": 0.6458, "step": 353 }, { "epoch": 0.51, "learning_rate": 0.00018688961570040283, "loss": 0.4847, "step": 354 }, { "epoch": 0.51, "learning_rate": 0.00018681500502648214, "loss": 0.6457, "step": 355 }, { "epoch": 0.51, "learning_rate": 0.00018674019763208842, "loss": 0.7152, "step": 356 }, { "epoch": 0.51, "learning_rate": 0.00018666519368673324, "loss": 0.8583, "step": 357 }, { "epoch": 0.51, "learning_rate": 0.00018658999336037356, "loss": 0.7007, "step": 358 }, { "epoch": 0.51, "learning_rate": 0.00018651459682341126, "loss": 0.5519, "step": 359 }, { "epoch": 0.52, "learning_rate": 0.00018643900424669286, "loss": 0.4848, "step": 360 }, { "epoch": 0.52, "eval_loss": 0.5990561842918396, "eval_runtime": 29.3397, "eval_samples_per_second": 55.522, "eval_steps_per_second": 27.778, "step": 360 }, { "epoch": 0.52, "learning_rate": 0.00018636321580150917, "loss": 0.5851, "step": 361 }, { "epoch": 0.52, "learning_rate": 0.0001862872316595947, "loss": 0.5745, "step": 362 }, { "epoch": 0.52, "learning_rate": 0.00018621105199312753, "loss": 0.5597, "step": 363 }, { "epoch": 0.52, "learning_rate": 0.00018613467697472876, "loss": 0.6364, "step": 364 }, { "epoch": 0.52, "learning_rate": 0.0001860581067774621, "loss": 0.5668, "step": 365 }, { "epoch": 0.52, "learning_rate": 0.00018598134157483354, "loss": 0.7182, "step": 366 }, { "epoch": 0.53, "learning_rate": 0.000185904381540791, "loss": 0.5478, "step": 367 }, { "epoch": 0.53, "learning_rate": 0.00018582722684972383, "loss": 0.485, "step": 368 }, { "epoch": 0.53, "learning_rate": 0.00018574987767646254, "loss": 0.7902, "step": 369 }, { "epoch": 0.53, "learning_rate": 0.00018567233419627824, "loss": 0.6895, "step": 370 }, { "epoch": 0.53, "learning_rate": 0.00018559459658488238, "loss": 0.6304, "step": 371 }, { "epoch": 0.53, "learning_rate": 0.00018551666501842636, "loss": 0.4389, "step": 372 }, { "epoch": 0.53, "learning_rate": 0.0001854385396735009, "loss": 0.7157, "step": 373 }, { "epoch": 0.54, "learning_rate": 0.00018536022072713606, "loss": 0.552, "step": 374 }, { "epoch": 0.54, "learning_rate": 0.00018528170835680036, "loss": 0.5701, "step": 375 }, { "epoch": 0.54, "learning_rate": 0.00018520300274040084, "loss": 0.7629, "step": 376 }, { "epoch": 0.54, "learning_rate": 0.00018512410405628225, "loss": 0.4954, "step": 377 }, { "epoch": 0.54, "learning_rate": 0.00018504501248322686, "loss": 1.0569, "step": 378 }, { "epoch": 0.54, "learning_rate": 0.00018496572820045413, "loss": 0.5473, "step": 379 }, { "epoch": 0.54, "learning_rate": 0.00018488625138762008, "loss": 0.9067, "step": 380 }, { "epoch": 0.54, "eval_loss": 0.5943491458892822, "eval_runtime": 29.3323, "eval_samples_per_second": 55.536, "eval_steps_per_second": 27.785, "step": 380 }, { "epoch": 0.55, "learning_rate": 0.00018480658222481703, "loss": 0.4678, "step": 381 }, { "epoch": 0.55, "learning_rate": 0.0001847267208925732, "loss": 0.7051, "step": 382 }, { "epoch": 0.55, "learning_rate": 0.00018464666757185216, "loss": 0.3741, "step": 383 }, { "epoch": 0.55, "learning_rate": 0.00018456642244405266, "loss": 0.6583, "step": 384 }, { "epoch": 0.55, "learning_rate": 0.000184485985691008, "loss": 0.4571, "step": 385 }, { "epoch": 0.55, "learning_rate": 0.00018440535749498563, "loss": 0.4962, "step": 386 }, { "epoch": 0.55, "learning_rate": 0.00018432453803868696, "loss": 0.6971, "step": 387 }, { "epoch": 0.56, "learning_rate": 0.00018424352750524668, "loss": 0.7043, "step": 388 }, { "epoch": 0.56, "learning_rate": 0.0001841623260782325, "loss": 0.5788, "step": 389 }, { "epoch": 0.56, "learning_rate": 0.00018408093394164468, "loss": 0.6969, "step": 390 }, { "epoch": 0.56, "learning_rate": 0.00018399935127991554, "loss": 0.6253, "step": 391 }, { "epoch": 0.56, "learning_rate": 0.00018391757827790933, "loss": 0.6763, "step": 392 }, { "epoch": 0.56, "learning_rate": 0.00018383561512092138, "loss": 0.6583, "step": 393 }, { "epoch": 0.56, "learning_rate": 0.00018375346199467807, "loss": 0.5904, "step": 394 }, { "epoch": 0.57, "learning_rate": 0.00018367111908533615, "loss": 0.8813, "step": 395 }, { "epoch": 0.57, "learning_rate": 0.0001835885865794824, "loss": 0.7437, "step": 396 }, { "epoch": 0.57, "learning_rate": 0.00018350586466413336, "loss": 0.6893, "step": 397 }, { "epoch": 0.57, "learning_rate": 0.00018342295352673463, "loss": 0.5746, "step": 398 }, { "epoch": 0.57, "learning_rate": 0.00018333985335516057, "loss": 0.7338, "step": 399 }, { "epoch": 0.57, "learning_rate": 0.000183256564337714, "loss": 0.5943, "step": 400 }, { "epoch": 0.57, "eval_loss": 0.5853814482688904, "eval_runtime": 29.3481, "eval_samples_per_second": 55.506, "eval_steps_per_second": 27.77, "step": 400 }, { "epoch": 0.57, "learning_rate": 0.0001831730866631256, "loss": 0.631, "step": 401 }, { "epoch": 0.58, "learning_rate": 0.00018308942052055354, "loss": 0.5297, "step": 402 }, { "epoch": 0.58, "learning_rate": 0.00018300556609958304, "loss": 0.6393, "step": 403 }, { "epoch": 0.58, "learning_rate": 0.00018292152359022595, "loss": 0.4876, "step": 404 }, { "epoch": 0.58, "learning_rate": 0.00018283729318292036, "loss": 0.5634, "step": 405 }, { "epoch": 0.58, "learning_rate": 0.00018275287506853017, "loss": 0.4768, "step": 406 }, { "epoch": 0.58, "learning_rate": 0.00018266826943834445, "loss": 0.9219, "step": 407 }, { "epoch": 0.58, "learning_rate": 0.0001825834764840774, "loss": 0.5326, "step": 408 }, { "epoch": 0.59, "learning_rate": 0.00018249849639786749, "loss": 0.717, "step": 409 }, { "epoch": 0.59, "learning_rate": 0.00018241332937227734, "loss": 0.7521, "step": 410 }, { "epoch": 0.59, "learning_rate": 0.0001823279756002932, "loss": 0.8977, "step": 411 }, { "epoch": 0.59, "learning_rate": 0.0001822424352753244, "loss": 0.7813, "step": 412 }, { "epoch": 0.59, "learning_rate": 0.000182156708591203, "loss": 0.8893, "step": 413 }, { "epoch": 0.59, "learning_rate": 0.00018207079574218338, "loss": 0.6064, "step": 414 }, { "epoch": 0.59, "learning_rate": 0.00018198469692294174, "loss": 0.8171, "step": 415 }, { "epoch": 0.6, "learning_rate": 0.00018189841232857571, "loss": 0.6169, "step": 416 }, { "epoch": 0.6, "learning_rate": 0.00018181194215460388, "loss": 0.5828, "step": 417 }, { "epoch": 0.6, "learning_rate": 0.00018172528659696533, "loss": 0.5173, "step": 418 }, { "epoch": 0.6, "learning_rate": 0.0001816384458520192, "loss": 0.5217, "step": 419 }, { "epoch": 0.6, "learning_rate": 0.00018155142011654435, "loss": 0.6999, "step": 420 }, { "epoch": 0.6, "eval_loss": 0.5941495299339294, "eval_runtime": 29.3194, "eval_samples_per_second": 55.561, "eval_steps_per_second": 27.797, "step": 420 }, { "epoch": 0.6, "learning_rate": 0.0001814642095877387, "loss": 0.5381, "step": 421 }, { "epoch": 0.6, "learning_rate": 0.00018137681446321903, "loss": 0.7032, "step": 422 }, { "epoch": 0.61, "learning_rate": 0.00018128923494102028, "loss": 0.5898, "step": 423 }, { "epoch": 0.61, "learning_rate": 0.00018120147121959536, "loss": 0.4766, "step": 424 }, { "epoch": 0.61, "learning_rate": 0.0001811135234978145, "loss": 0.5916, "step": 425 }, { "epoch": 0.61, "learning_rate": 0.00018102539197496482, "loss": 0.5091, "step": 426 }, { "epoch": 0.61, "learning_rate": 0.00018093707685075004, "loss": 0.7035, "step": 427 }, { "epoch": 0.61, "learning_rate": 0.00018084857832528987, "loss": 0.7507, "step": 428 }, { "epoch": 0.61, "learning_rate": 0.00018075989659911956, "loss": 0.4905, "step": 429 }, { "epoch": 0.62, "learning_rate": 0.00018067103187318956, "loss": 0.5706, "step": 430 }, { "epoch": 0.62, "learning_rate": 0.00018058198434886492, "loss": 0.784, "step": 431 }, { "epoch": 0.62, "learning_rate": 0.00018049275422792497, "loss": 0.5877, "step": 432 }, { "epoch": 0.62, "learning_rate": 0.00018040334171256277, "loss": 0.6392, "step": 433 }, { "epoch": 0.62, "learning_rate": 0.00018031374700538467, "loss": 0.5623, "step": 434 }, { "epoch": 0.62, "learning_rate": 0.00018022397030940986, "loss": 0.6878, "step": 435 }, { "epoch": 0.62, "learning_rate": 0.00018013401182806994, "loss": 0.5297, "step": 436 }, { "epoch": 0.63, "learning_rate": 0.00018004387176520843, "loss": 0.4828, "step": 437 }, { "epoch": 0.63, "learning_rate": 0.00017995355032508027, "loss": 0.5836, "step": 438 }, { "epoch": 0.63, "learning_rate": 0.00017986304771235143, "loss": 0.5292, "step": 439 }, { "epoch": 0.63, "learning_rate": 0.0001797723641320984, "loss": 0.5173, "step": 440 }, { "epoch": 0.63, "eval_loss": 0.5887213349342346, "eval_runtime": 29.3237, "eval_samples_per_second": 55.552, "eval_steps_per_second": 27.793, "step": 440 }, { "epoch": 0.63, "learning_rate": 0.00017968149978980774, "loss": 0.6018, "step": 441 }, { "epoch": 0.63, "learning_rate": 0.00017959045489137566, "loss": 0.5353, "step": 442 }, { "epoch": 0.63, "learning_rate": 0.00017949922964310738, "loss": 0.7697, "step": 443 }, { "epoch": 0.64, "learning_rate": 0.00017940782425171693, "loss": 0.4775, "step": 444 }, { "epoch": 0.64, "learning_rate": 0.0001793162389243264, "loss": 0.6834, "step": 445 }, { "epoch": 0.64, "learning_rate": 0.00017922447386846578, "loss": 0.8259, "step": 446 }, { "epoch": 0.64, "learning_rate": 0.00017913252929207217, "loss": 0.7376, "step": 447 }, { "epoch": 0.64, "learning_rate": 0.0001790404054034895, "loss": 0.7481, "step": 448 }, { "epoch": 0.64, "learning_rate": 0.0001789481024114681, "loss": 0.8861, "step": 449 }, { "epoch": 0.64, "learning_rate": 0.00017885562052516398, "loss": 0.4938, "step": 450 }, { "epoch": 0.65, "learning_rate": 0.00017876295995413867, "loss": 0.5681, "step": 451 }, { "epoch": 0.65, "learning_rate": 0.00017867012090835854, "loss": 0.4892, "step": 452 }, { "epoch": 0.65, "learning_rate": 0.00017857710359819436, "loss": 0.6888, "step": 453 }, { "epoch": 0.65, "learning_rate": 0.00017848390823442086, "loss": 0.5947, "step": 454 }, { "epoch": 0.65, "learning_rate": 0.0001783905350282163, "loss": 0.7089, "step": 455 }, { "epoch": 0.65, "learning_rate": 0.00017829698419116177, "loss": 0.5473, "step": 456 }, { "epoch": 0.65, "learning_rate": 0.00017820325593524098, "loss": 0.6345, "step": 457 }, { "epoch": 0.66, "learning_rate": 0.00017810935047283966, "loss": 0.6343, "step": 458 }, { "epoch": 0.66, "learning_rate": 0.00017801526801674506, "loss": 0.5587, "step": 459 }, { "epoch": 0.66, "learning_rate": 0.00017792100878014552, "loss": 0.4201, "step": 460 }, { "epoch": 0.66, "eval_loss": 0.5952056646347046, "eval_runtime": 29.408, "eval_samples_per_second": 55.393, "eval_steps_per_second": 27.714, "step": 460 }, { "epoch": 0.66, "learning_rate": 0.00017782657297662992, "loss": 0.6726, "step": 461 }, { "epoch": 0.66, "learning_rate": 0.00017773196082018728, "loss": 0.5181, "step": 462 }, { "epoch": 0.66, "learning_rate": 0.0001776371725252062, "loss": 0.4734, "step": 463 }, { "epoch": 0.66, "learning_rate": 0.0001775422083064744, "loss": 0.5406, "step": 464 }, { "epoch": 0.67, "learning_rate": 0.00017744706837917828, "loss": 0.4941, "step": 465 }, { "epoch": 0.67, "learning_rate": 0.00017735175295890233, "loss": 0.5945, "step": 466 }, { "epoch": 0.67, "learning_rate": 0.00017725626226162874, "loss": 0.5451, "step": 467 }, { "epoch": 0.67, "learning_rate": 0.0001771605965037369, "loss": 0.6012, "step": 468 }, { "epoch": 0.67, "learning_rate": 0.00017706475590200285, "loss": 0.6645, "step": 469 }, { "epoch": 0.67, "learning_rate": 0.0001769687406735988, "loss": 0.8397, "step": 470 }, { "epoch": 0.67, "learning_rate": 0.00017687255103609266, "loss": 0.4745, "step": 471 }, { "epoch": 0.68, "learning_rate": 0.00017677618720744764, "loss": 0.9309, "step": 472 }, { "epoch": 0.68, "learning_rate": 0.0001766796494060215, "loss": 0.5752, "step": 473 }, { "epoch": 0.68, "learning_rate": 0.00017658293785056638, "loss": 0.846, "step": 474 }, { "epoch": 0.68, "learning_rate": 0.00017648605276022808, "loss": 0.4726, "step": 475 }, { "epoch": 0.68, "learning_rate": 0.00017638899435454554, "loss": 0.5713, "step": 476 }, { "epoch": 0.68, "learning_rate": 0.0001762917628534506, "loss": 0.5512, "step": 477 }, { "epoch": 0.68, "learning_rate": 0.00017619435847726712, "loss": 0.538, "step": 478 }, { "epoch": 0.69, "learning_rate": 0.00017609678144671093, "loss": 0.5254, "step": 479 }, { "epoch": 0.69, "learning_rate": 0.00017599903198288886, "loss": 0.667, "step": 480 }, { "epoch": 0.69, "eval_loss": 0.5801939964294434, "eval_runtime": 29.3619, "eval_samples_per_second": 55.48, "eval_steps_per_second": 27.757, "step": 480 }, { "epoch": 0.69, "learning_rate": 0.00017590111030729862, "loss": 0.8356, "step": 481 }, { "epoch": 0.69, "learning_rate": 0.00017580301664182812, "loss": 0.6087, "step": 482 }, { "epoch": 0.69, "learning_rate": 0.00017570475120875495, "loss": 0.5965, "step": 483 }, { "epoch": 0.69, "learning_rate": 0.00017560631423074591, "loss": 0.5231, "step": 484 }, { "epoch": 0.69, "learning_rate": 0.00017550770593085662, "loss": 0.595, "step": 485 }, { "epoch": 0.7, "learning_rate": 0.0001754089265325308, "loss": 0.5015, "step": 486 }, { "epoch": 0.7, "learning_rate": 0.00017530997625959992, "loss": 0.4863, "step": 487 }, { "epoch": 0.7, "learning_rate": 0.00017521085533628265, "loss": 0.7477, "step": 488 }, { "epoch": 0.7, "learning_rate": 0.00017511156398718439, "loss": 0.5633, "step": 489 }, { "epoch": 0.7, "learning_rate": 0.0001750121024372966, "loss": 0.6316, "step": 490 }, { "epoch": 0.7, "learning_rate": 0.0001749124709119965, "loss": 0.4627, "step": 491 }, { "epoch": 0.7, "learning_rate": 0.0001748126696370465, "loss": 0.662, "step": 492 }, { "epoch": 0.71, "learning_rate": 0.00017471269883859356, "loss": 0.6426, "step": 493 }, { "epoch": 0.71, "learning_rate": 0.00017461255874316885, "loss": 0.6185, "step": 494 }, { "epoch": 0.71, "learning_rate": 0.00017451224957768717, "loss": 0.6106, "step": 495 }, { "epoch": 0.71, "learning_rate": 0.00017441177156944635, "loss": 0.7092, "step": 496 }, { "epoch": 0.71, "learning_rate": 0.0001743111249461269, "loss": 0.816, "step": 497 }, { "epoch": 0.71, "learning_rate": 0.00017421030993579133, "loss": 0.8185, "step": 498 }, { "epoch": 0.71, "learning_rate": 0.00017410932676688377, "loss": 0.4779, "step": 499 }, { "epoch": 0.72, "learning_rate": 0.00017400817566822938, "loss": 0.8568, "step": 500 }, { "epoch": 0.72, "eval_loss": 0.5921781063079834, "eval_runtime": 29.3482, "eval_samples_per_second": 55.506, "eval_steps_per_second": 27.77, "step": 500 }, { "epoch": 0.72, "learning_rate": 0.0001739068568690338, "loss": 0.6461, "step": 501 }, { "epoch": 0.72, "learning_rate": 0.00017380537059888277, "loss": 0.6208, "step": 502 }, { "epoch": 0.72, "learning_rate": 0.00017370371708774141, "loss": 0.5665, "step": 503 }, { "epoch": 0.72, "learning_rate": 0.0001736018965659539, "loss": 0.5903, "step": 504 }, { "epoch": 0.72, "learning_rate": 0.00017349990926424273, "loss": 0.5639, "step": 505 }, { "epoch": 0.72, "learning_rate": 0.0001733977554137085, "loss": 0.5138, "step": 506 }, { "epoch": 0.73, "learning_rate": 0.00017329543524582898, "loss": 0.5628, "step": 507 }, { "epoch": 0.73, "learning_rate": 0.000173192948992459, "loss": 0.3835, "step": 508 }, { "epoch": 0.73, "learning_rate": 0.00017309029688582963, "loss": 0.6234, "step": 509 }, { "epoch": 0.73, "learning_rate": 0.00017298747915854782, "loss": 0.86, "step": 510 }, { "epoch": 0.73, "learning_rate": 0.00017288449604359576, "loss": 0.7151, "step": 511 }, { "epoch": 0.73, "learning_rate": 0.00017278134777433044, "loss": 0.554, "step": 512 }, { "epoch": 0.73, "learning_rate": 0.00017267803458448307, "loss": 0.583, "step": 513 }, { "epoch": 0.74, "learning_rate": 0.00017257455670815854, "loss": 0.5642, "step": 514 }, { "epoch": 0.74, "learning_rate": 0.00017247091437983496, "loss": 1.0091, "step": 515 }, { "epoch": 0.74, "learning_rate": 0.00017236710783436306, "loss": 0.5834, "step": 516 }, { "epoch": 0.74, "learning_rate": 0.00017226313730696574, "loss": 0.6862, "step": 517 }, { "epoch": 0.74, "learning_rate": 0.00017215900303323736, "loss": 0.4787, "step": 518 }, { "epoch": 0.74, "learning_rate": 0.00017205470524914342, "loss": 0.4951, "step": 519 }, { "epoch": 0.74, "learning_rate": 0.00017195024419101987, "loss": 0.515, "step": 520 }, { "epoch": 0.74, "eval_loss": 0.5800089836120605, "eval_runtime": 29.3282, "eval_samples_per_second": 55.544, "eval_steps_per_second": 27.789, "step": 520 }, { "epoch": 0.75, "learning_rate": 0.00017184562009557271, "loss": 0.6123, "step": 521 }, { "epoch": 0.75, "learning_rate": 0.00017174083319987732, "loss": 0.6898, "step": 522 }, { "epoch": 0.75, "learning_rate": 0.000171635883741378, "loss": 0.487, "step": 523 }, { "epoch": 0.75, "learning_rate": 0.0001715307719578874, "loss": 0.6113, "step": 524 }, { "epoch": 0.75, "learning_rate": 0.000171425498087586, "loss": 0.4276, "step": 525 }, { "epoch": 0.75, "learning_rate": 0.00017132006236902155, "loss": 0.5996, "step": 526 }, { "epoch": 0.75, "learning_rate": 0.00017121446504110859, "loss": 0.564, "step": 527 }, { "epoch": 0.76, "learning_rate": 0.00017110870634312784, "loss": 0.8961, "step": 528 }, { "epoch": 0.76, "learning_rate": 0.00017100278651472562, "loss": 0.4929, "step": 529 }, { "epoch": 0.76, "learning_rate": 0.0001708967057959135, "loss": 0.5149, "step": 530 }, { "epoch": 0.76, "learning_rate": 0.00017079046442706748, "loss": 0.4424, "step": 531 }, { "epoch": 0.76, "learning_rate": 0.00017068406264892768, "loss": 1.0212, "step": 532 }, { "epoch": 0.76, "learning_rate": 0.00017057750070259765, "loss": 0.4541, "step": 533 }, { "epoch": 0.76, "learning_rate": 0.00017047077882954392, "loss": 0.7563, "step": 534 }, { "epoch": 0.77, "learning_rate": 0.0001703638972715954, "loss": 0.644, "step": 535 }, { "epoch": 0.77, "learning_rate": 0.00017025685627094283, "loss": 0.4545, "step": 536 }, { "epoch": 0.77, "learning_rate": 0.00017014965607013824, "loss": 0.61, "step": 537 }, { "epoch": 0.77, "learning_rate": 0.0001700422969120944, "loss": 0.6925, "step": 538 }, { "epoch": 0.77, "learning_rate": 0.00016993477904008432, "loss": 0.4985, "step": 539 }, { "epoch": 0.77, "learning_rate": 0.00016982710269774058, "loss": 0.504, "step": 540 }, { "epoch": 0.77, "eval_loss": 0.5893893241882324, "eval_runtime": 29.3252, "eval_samples_per_second": 55.55, "eval_steps_per_second": 27.792, "step": 540 }, { "epoch": 0.77, "learning_rate": 0.00016971926812905488, "loss": 0.3721, "step": 541 }, { "epoch": 0.78, "learning_rate": 0.00016961127557837751, "loss": 0.5396, "step": 542 }, { "epoch": 0.78, "learning_rate": 0.00016950312529041663, "loss": 0.4731, "step": 543 }, { "epoch": 0.78, "learning_rate": 0.0001693948175102379, "loss": 0.5411, "step": 544 }, { "epoch": 0.78, "learning_rate": 0.0001692863524832639, "loss": 0.6364, "step": 545 }, { "epoch": 0.78, "learning_rate": 0.00016917773045527343, "loss": 0.5857, "step": 546 }, { "epoch": 0.78, "learning_rate": 0.00016906895167240112, "loss": 0.7359, "step": 547 }, { "epoch": 0.78, "learning_rate": 0.0001689600163811367, "loss": 0.5994, "step": 548 }, { "epoch": 0.79, "learning_rate": 0.0001688509248283247, "loss": 0.5892, "step": 549 }, { "epoch": 0.79, "learning_rate": 0.00016874167726116363, "loss": 0.7209, "step": 550 }, { "epoch": 0.79, "learning_rate": 0.00016863227392720554, "loss": 0.5054, "step": 551 }, { "epoch": 0.79, "learning_rate": 0.00016852271507435544, "loss": 0.5956, "step": 552 }, { "epoch": 0.79, "learning_rate": 0.00016841300095087077, "loss": 0.5568, "step": 553 }, { "epoch": 0.79, "learning_rate": 0.00016830313180536078, "loss": 0.6406, "step": 554 }, { "epoch": 0.79, "learning_rate": 0.00016819310788678603, "loss": 0.6066, "step": 555 }, { "epoch": 0.8, "learning_rate": 0.00016808292944445774, "loss": 0.4708, "step": 556 }, { "epoch": 0.8, "learning_rate": 0.0001679725967280373, "loss": 0.6026, "step": 557 }, { "epoch": 0.8, "learning_rate": 0.00016786210998753575, "loss": 0.8752, "step": 558 }, { "epoch": 0.8, "learning_rate": 0.00016775146947331298, "loss": 0.7405, "step": 559 }, { "epoch": 0.8, "learning_rate": 0.00016764067543607753, "loss": 0.6361, "step": 560 }, { "epoch": 0.8, "eval_loss": 0.5982840657234192, "eval_runtime": 29.3385, "eval_samples_per_second": 55.524, "eval_steps_per_second": 27.779, "step": 560 }, { "epoch": 0.8, "learning_rate": 0.00016752972812688564, "loss": 0.6528, "step": 561 }, { "epoch": 0.8, "learning_rate": 0.00016741862779714098, "loss": 0.5128, "step": 562 }, { "epoch": 0.81, "learning_rate": 0.00016730737469859388, "loss": 0.723, "step": 563 }, { "epoch": 0.81, "learning_rate": 0.00016719596908334092, "loss": 0.5223, "step": 564 }, { "epoch": 0.81, "learning_rate": 0.0001670844112038242, "loss": 0.5353, "step": 565 }, { "epoch": 0.81, "learning_rate": 0.0001669727013128309, "loss": 0.7645, "step": 566 }, { "epoch": 0.81, "learning_rate": 0.00016686083966349266, "loss": 0.6167, "step": 567 }, { "epoch": 0.81, "learning_rate": 0.00016674882650928493, "loss": 0.3836, "step": 568 }, { "epoch": 0.81, "learning_rate": 0.00016663666210402656, "loss": 0.6045, "step": 569 }, { "epoch": 0.82, "learning_rate": 0.00016652434670187907, "loss": 0.5973, "step": 570 }, { "epoch": 0.82, "learning_rate": 0.0001664118805573461, "loss": 0.5661, "step": 571 }, { "epoch": 0.82, "learning_rate": 0.000166299263925273, "loss": 0.4528, "step": 572 }, { "epoch": 0.82, "learning_rate": 0.00016618649706084596, "loss": 0.6458, "step": 573 }, { "epoch": 0.82, "learning_rate": 0.00016607358021959173, "loss": 0.412, "step": 574 }, { "epoch": 0.82, "learning_rate": 0.0001659605136573768, "loss": 0.5031, "step": 575 }, { "epoch": 0.82, "learning_rate": 0.00016584729763040697, "loss": 0.4902, "step": 576 }, { "epoch": 0.83, "learning_rate": 0.00016573393239522678, "loss": 0.8064, "step": 577 }, { "epoch": 0.83, "learning_rate": 0.00016562041820871874, "loss": 0.5497, "step": 578 }, { "epoch": 0.83, "learning_rate": 0.000165506755328103, "loss": 0.6602, "step": 579 }, { "epoch": 0.83, "learning_rate": 0.00016539294401093658, "loss": 0.4896, "step": 580 }, { "epoch": 0.83, "eval_loss": 0.5770359039306641, "eval_runtime": 29.3487, "eval_samples_per_second": 55.505, "eval_steps_per_second": 27.77, "step": 580 }, { "epoch": 0.83, "learning_rate": 0.00016527898451511287, "loss": 0.3522, "step": 581 }, { "epoch": 0.83, "learning_rate": 0.00016516487709886105, "loss": 0.6396, "step": 582 }, { "epoch": 0.83, "learning_rate": 0.0001650506220207454, "loss": 0.8431, "step": 583 }, { "epoch": 0.84, "learning_rate": 0.00016493621953966495, "loss": 0.5146, "step": 584 }, { "epoch": 0.84, "learning_rate": 0.00016482166991485265, "loss": 0.5748, "step": 585 }, { "epoch": 0.84, "learning_rate": 0.00016470697340587476, "loss": 0.5961, "step": 586 }, { "epoch": 0.84, "learning_rate": 0.00016459213027263063, "loss": 0.5165, "step": 587 }, { "epoch": 0.84, "learning_rate": 0.00016447714077535167, "loss": 0.8974, "step": 588 }, { "epoch": 0.84, "learning_rate": 0.000164362005174601, "loss": 0.839, "step": 589 }, { "epoch": 0.84, "learning_rate": 0.00016424672373127277, "loss": 0.5579, "step": 590 }, { "epoch": 0.85, "learning_rate": 0.00016413129670659167, "loss": 0.832, "step": 591 }, { "epoch": 0.85, "learning_rate": 0.00016401572436211222, "loss": 0.7778, "step": 592 }, { "epoch": 0.85, "learning_rate": 0.0001639000069597183, "loss": 0.4416, "step": 593 }, { "epoch": 0.85, "learning_rate": 0.0001637841447616224, "loss": 0.4954, "step": 594 }, { "epoch": 0.85, "learning_rate": 0.0001636681380303652, "loss": 0.487, "step": 595 }, { "epoch": 0.85, "learning_rate": 0.00016355198702881478, "loss": 0.6773, "step": 596 }, { "epoch": 0.85, "learning_rate": 0.0001634356920201662, "loss": 0.5598, "step": 597 }, { "epoch": 0.86, "learning_rate": 0.00016331925326794087, "loss": 0.4758, "step": 598 }, { "epoch": 0.86, "learning_rate": 0.00016320267103598585, "loss": 0.5392, "step": 599 }, { "epoch": 0.86, "learning_rate": 0.00016308594558847337, "loss": 0.6044, "step": 600 }, { "epoch": 0.86, "eval_loss": 0.5716915726661682, "eval_runtime": 29.3528, "eval_samples_per_second": 55.497, "eval_steps_per_second": 27.766, "step": 600 }, { "epoch": 0.86, "learning_rate": 0.00016296907718990015, "loss": 0.543, "step": 601 }, { "epoch": 0.86, "learning_rate": 0.00016285206610508685, "loss": 0.677, "step": 602 }, { "epoch": 0.86, "learning_rate": 0.00016273491259917745, "loss": 0.8737, "step": 603 }, { "epoch": 0.86, "learning_rate": 0.0001626176169376387, "loss": 0.6001, "step": 604 }, { "epoch": 0.87, "learning_rate": 0.0001625001793862593, "loss": 0.6186, "step": 605 }, { "epoch": 0.87, "learning_rate": 0.0001623826002111497, "loss": 0.6821, "step": 606 }, { "epoch": 0.87, "learning_rate": 0.00016226487967874116, "loss": 0.4421, "step": 607 }, { "epoch": 0.87, "learning_rate": 0.00016214701805578518, "loss": 0.5087, "step": 608 }, { "epoch": 0.87, "learning_rate": 0.0001620290156093531, "loss": 0.465, "step": 609 }, { "epoch": 0.87, "learning_rate": 0.00016191087260683523, "loss": 0.5147, "step": 610 }, { "epoch": 0.87, "learning_rate": 0.00016179258931594051, "loss": 0.6099, "step": 611 }, { "epoch": 0.88, "learning_rate": 0.0001616741660046957, "loss": 0.6062, "step": 612 }, { "epoch": 0.88, "learning_rate": 0.00016155560294144479, "loss": 0.5735, "step": 613 }, { "epoch": 0.88, "learning_rate": 0.00016143690039484857, "loss": 0.6008, "step": 614 }, { "epoch": 0.88, "learning_rate": 0.00016131805863388378, "loss": 0.6302, "step": 615 }, { "epoch": 0.88, "learning_rate": 0.00016119907792784267, "loss": 0.5624, "step": 616 }, { "epoch": 0.88, "learning_rate": 0.00016107995854633235, "loss": 0.5803, "step": 617 }, { "epoch": 0.88, "learning_rate": 0.00016096070075927415, "loss": 0.5977, "step": 618 }, { "epoch": 0.89, "learning_rate": 0.00016084130483690295, "loss": 0.4958, "step": 619 }, { "epoch": 0.89, "learning_rate": 0.0001607217710497668, "loss": 0.4925, "step": 620 }, { "epoch": 0.89, "eval_loss": 0.5715158581733704, "eval_runtime": 29.3023, "eval_samples_per_second": 55.593, "eval_steps_per_second": 27.813, "step": 620 }, { "epoch": 0.89, "learning_rate": 0.000160602099668726, "loss": 0.641, "step": 621 }, { "epoch": 0.89, "learning_rate": 0.00016048229096495272, "loss": 0.5944, "step": 622 }, { "epoch": 0.89, "learning_rate": 0.00016036234520993024, "loss": 0.655, "step": 623 }, { "epoch": 0.89, "learning_rate": 0.0001602422626754524, "loss": 0.3301, "step": 624 }, { "epoch": 0.89, "learning_rate": 0.0001601220436336231, "loss": 0.6563, "step": 625 }, { "epoch": 0.9, "learning_rate": 0.00016000168835685535, "loss": 0.6173, "step": 626 }, { "epoch": 0.9, "learning_rate": 0.00015988119711787105, "loss": 0.7029, "step": 627 }, { "epoch": 0.9, "learning_rate": 0.0001597605701897001, "loss": 0.7588, "step": 628 }, { "epoch": 0.9, "learning_rate": 0.00015963980784567986, "loss": 0.527, "step": 629 }, { "epoch": 0.9, "learning_rate": 0.00015951891035945464, "loss": 0.6378, "step": 630 }, { "epoch": 0.9, "learning_rate": 0.0001593978780049748, "loss": 0.5056, "step": 631 }, { "epoch": 0.9, "learning_rate": 0.00015927671105649648, "loss": 0.6517, "step": 632 }, { "epoch": 0.91, "learning_rate": 0.00015915540978858066, "loss": 1.0716, "step": 633 }, { "epoch": 0.91, "learning_rate": 0.00015903397447609288, "loss": 0.5948, "step": 634 }, { "epoch": 0.91, "learning_rate": 0.0001589124053942022, "loss": 0.4883, "step": 635 }, { "epoch": 0.91, "learning_rate": 0.0001587907028183809, "loss": 0.5172, "step": 636 }, { "epoch": 0.91, "learning_rate": 0.00015866886702440384, "loss": 0.4618, "step": 637 }, { "epoch": 0.91, "learning_rate": 0.00015854689828834757, "loss": 0.7591, "step": 638 }, { "epoch": 0.91, "learning_rate": 0.00015842479688659003, "loss": 0.435, "step": 639 }, { "epoch": 0.92, "learning_rate": 0.00015830256309580968, "loss": 0.4704, "step": 640 }, { "epoch": 0.92, "eval_loss": 0.5707182288169861, "eval_runtime": 29.3397, "eval_samples_per_second": 55.522, "eval_steps_per_second": 27.778, "step": 640 }, { "epoch": 0.92, "learning_rate": 0.00015818019719298504, "loss": 0.6049, "step": 641 }, { "epoch": 0.92, "learning_rate": 0.00015805769945539394, "loss": 0.5501, "step": 642 }, { "epoch": 0.92, "learning_rate": 0.00015793507016061305, "loss": 0.7212, "step": 643 }, { "epoch": 0.92, "learning_rate": 0.00015781230958651694, "loss": 0.6531, "step": 644 }, { "epoch": 0.92, "learning_rate": 0.00015768941801127783, "loss": 0.9526, "step": 645 }, { "epoch": 0.92, "learning_rate": 0.00015756639571336476, "loss": 0.5878, "step": 646 }, { "epoch": 0.93, "learning_rate": 0.00015744324297154293, "loss": 0.46, "step": 647 }, { "epoch": 0.93, "learning_rate": 0.00015731996006487317, "loss": 0.4893, "step": 648 }, { "epoch": 0.93, "learning_rate": 0.00015719654727271122, "loss": 0.5149, "step": 649 }, { "epoch": 0.93, "learning_rate": 0.00015707300487470717, "loss": 0.6435, "step": 650 }, { "epoch": 0.93, "learning_rate": 0.00015694933315080477, "loss": 0.5271, "step": 651 }, { "epoch": 0.93, "learning_rate": 0.00015682553238124082, "loss": 0.4587, "step": 652 }, { "epoch": 0.93, "learning_rate": 0.00015670160284654458, "loss": 0.4208, "step": 653 }, { "epoch": 0.94, "learning_rate": 0.00015657754482753704, "loss": 0.5887, "step": 654 }, { "epoch": 0.94, "learning_rate": 0.0001564533586053303, "loss": 0.6264, "step": 655 }, { "epoch": 0.94, "learning_rate": 0.00015632904446132706, "loss": 0.6527, "step": 656 }, { "epoch": 0.94, "learning_rate": 0.00015620460267721983, "loss": 0.6423, "step": 657 }, { "epoch": 0.94, "learning_rate": 0.00015608003353499033, "loss": 0.5313, "step": 658 }, { "epoch": 0.94, "learning_rate": 0.0001559553373169089, "loss": 0.58, "step": 659 }, { "epoch": 0.94, "learning_rate": 0.00015583051430553385, "loss": 0.5342, "step": 660 }, { "epoch": 0.94, "eval_loss": 0.5747966766357422, "eval_runtime": 29.3347, "eval_samples_per_second": 55.531, "eval_steps_per_second": 27.783, "step": 660 }, { "epoch": 0.95, "learning_rate": 0.00015570556478371075, "loss": 0.6231, "step": 661 }, { "epoch": 0.95, "learning_rate": 0.0001555804890345719, "loss": 0.3901, "step": 662 }, { "epoch": 0.95, "learning_rate": 0.00015545528734153553, "loss": 0.5303, "step": 663 }, { "epoch": 0.95, "learning_rate": 0.0001553299599883054, "loss": 0.5951, "step": 664 }, { "epoch": 0.95, "learning_rate": 0.00015520450725886988, "loss": 0.4837, "step": 665 }, { "epoch": 0.95, "learning_rate": 0.00015507892943750147, "loss": 0.6, "step": 666 }, { "epoch": 0.95, "learning_rate": 0.0001549532268087562, "loss": 0.5726, "step": 667 }, { "epoch": 0.96, "learning_rate": 0.00015482739965747282, "loss": 0.5652, "step": 668 }, { "epoch": 0.96, "learning_rate": 0.0001547014482687723, "loss": 0.8029, "step": 669 }, { "epoch": 0.96, "learning_rate": 0.0001545753729280571, "loss": 0.6563, "step": 670 }, { "epoch": 0.96, "learning_rate": 0.00015444917392101054, "loss": 0.5851, "step": 671 }, { "epoch": 0.96, "learning_rate": 0.0001543228515335962, "loss": 0.4526, "step": 672 }, { "epoch": 0.96, "learning_rate": 0.00015419640605205727, "loss": 0.6139, "step": 673 }, { "epoch": 0.96, "learning_rate": 0.0001540698377629157, "loss": 0.5471, "step": 674 }, { "epoch": 0.97, "learning_rate": 0.000153943146952972, "loss": 0.5244, "step": 675 }, { "epoch": 0.97, "learning_rate": 0.00015381633390930402, "loss": 0.6256, "step": 676 }, { "epoch": 0.97, "learning_rate": 0.0001536893989192668, "loss": 0.4401, "step": 677 }, { "epoch": 0.97, "learning_rate": 0.00015356234227049154, "loss": 0.5637, "step": 678 }, { "epoch": 0.97, "learning_rate": 0.00015343516425088524, "loss": 0.5212, "step": 679 }, { "epoch": 0.97, "learning_rate": 0.0001533078651486299, "loss": 0.755, "step": 680 }, { "epoch": 0.97, "eval_loss": 0.5672922730445862, "eval_runtime": 29.3376, "eval_samples_per_second": 55.526, "eval_steps_per_second": 27.78, "step": 680 }, { "epoch": 0.97, "learning_rate": 0.0001531804452521818, "loss": 0.6678, "step": 681 }, { "epoch": 0.98, "learning_rate": 0.00015305290485027114, "loss": 0.413, "step": 682 }, { "epoch": 0.98, "learning_rate": 0.00015292524423190094, "loss": 0.7693, "step": 683 }, { "epoch": 0.98, "learning_rate": 0.00015279746368634673, "loss": 0.7246, "step": 684 }, { "epoch": 0.98, "learning_rate": 0.00015266956350315586, "loss": 0.5489, "step": 685 }, { "epoch": 0.98, "learning_rate": 0.0001525415439721467, "loss": 0.6732, "step": 686 }, { "epoch": 0.98, "learning_rate": 0.00015241340538340808, "loss": 0.4387, "step": 687 }, { "epoch": 0.98, "learning_rate": 0.0001522851480272986, "loss": 0.7792, "step": 688 }, { "epoch": 0.99, "learning_rate": 0.00015215677219444594, "loss": 0.4398, "step": 689 }, { "epoch": 0.99, "learning_rate": 0.0001520282781757464, "loss": 0.5612, "step": 690 }, { "epoch": 0.99, "learning_rate": 0.00015189966626236385, "loss": 0.7103, "step": 691 }, { "epoch": 0.99, "learning_rate": 0.0001517709367457295, "loss": 0.6044, "step": 692 }, { "epoch": 0.99, "learning_rate": 0.000151642089917541, "loss": 0.6622, "step": 693 }, { "epoch": 0.99, "learning_rate": 0.0001515131260697618, "loss": 0.4819, "step": 694 }, { "epoch": 0.99, "learning_rate": 0.00015138404549462053, "loss": 0.5547, "step": 695 }, { "epoch": 1.0, "learning_rate": 0.00015125484848461026, "loss": 0.6227, "step": 696 }, { "epoch": 1.0, "learning_rate": 0.000151125535332488, "loss": 0.4646, "step": 697 }, { "epoch": 1.0, "learning_rate": 0.00015099610633127387, "loss": 0.5327, "step": 698 }, { "epoch": 1.0, "learning_rate": 0.00015086656177425048, "loss": 0.5109, "step": 699 } ], "logging_steps": 1, "max_steps": 2097, "num_train_epochs": 3, "save_steps": 500, "total_flos": 9.190285595765637e+17, "trial_name": null, "trial_params": null }