{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.3465050505050505, "eval_steps": 16, "global_step": 5360, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 6.464646464646465e-05, "grad_norm": null, "learning_rate": 0.0, "loss": 29.6213, "step": 1 }, { "epoch": 0.0001292929292929293, "grad_norm": null, "learning_rate": 0.0, "loss": 29.6208, "step": 2 }, { "epoch": 0.00019393939393939395, "grad_norm": 129.40235900878906, "learning_rate": 4.3010752688172043e-07, "loss": 29.5846, "step": 3 }, { "epoch": 0.0002585858585858586, "grad_norm": null, "learning_rate": 4.3010752688172043e-07, "loss": 29.7161, "step": 4 }, { "epoch": 0.00032323232323232324, "grad_norm": 130.79031372070312, "learning_rate": 8.602150537634409e-07, "loss": 29.7196, "step": 5 }, { "epoch": 0.0003878787878787879, "grad_norm": 123.62369537353516, "learning_rate": 1.2903225806451614e-06, "loss": 29.2487, "step": 6 }, { "epoch": 0.0004525252525252525, "grad_norm": 135.1348876953125, "learning_rate": 1.7204301075268817e-06, "loss": 29.6055, "step": 7 }, { "epoch": 0.0005171717171717172, "grad_norm": null, "learning_rate": 1.7204301075268817e-06, "loss": 28.568, "step": 8 }, { "epoch": 0.0005818181818181818, "grad_norm": 270.385498046875, "learning_rate": 2.1505376344086023e-06, "loss": 29.7127, "step": 9 }, { "epoch": 0.0006464646464646465, "grad_norm": 109.7217788696289, "learning_rate": 2.580645161290323e-06, "loss": 29.2279, "step": 10 }, { "epoch": 0.0007111111111111111, "grad_norm": 403.46337890625, "learning_rate": 3.0107526881720433e-06, "loss": 28.7925, "step": 11 }, { "epoch": 0.0007757575757575758, "grad_norm": 114.81087493896484, "learning_rate": 3.4408602150537635e-06, "loss": 28.0663, "step": 12 }, { "epoch": 0.0008404040404040404, "grad_norm": 240.21282958984375, "learning_rate": 3.870967741935484e-06, "loss": 27.3475, "step": 13 }, { "epoch": 0.000905050505050505, "grad_norm": 236.48675537109375, 
"learning_rate": 4.3010752688172045e-06, "loss": 26.9302, "step": 14 }, { "epoch": 0.0009696969696969697, "grad_norm": 203.8461456298828, "learning_rate": 4.731182795698925e-06, "loss": 24.0135, "step": 15 }, { "epoch": 0.0010343434343434343, "grad_norm": 156.15663146972656, "learning_rate": 5.161290322580646e-06, "loss": 22.7445, "step": 16 }, { "epoch": 0.0010343434343434343, "eval_bleu": 0.11919568898736486, "eval_loss": 21.227251052856445, "eval_runtime": 2.9098, "eval_samples_per_second": 10.997, "eval_steps_per_second": 1.375, "step": 16 }, { "epoch": 0.0010989898989898989, "grad_norm": 202.9501495361328, "learning_rate": 5.591397849462366e-06, "loss": 20.6978, "step": 17 }, { "epoch": 0.0011636363636363637, "grad_norm": 100.57891082763672, "learning_rate": 6.021505376344087e-06, "loss": 19.4305, "step": 18 }, { "epoch": 0.0012282828282828282, "grad_norm": null, "learning_rate": 6.021505376344087e-06, "loss": 17.1684, "step": 19 }, { "epoch": 0.001292929292929293, "grad_norm": 108.986572265625, "learning_rate": 6.451612903225806e-06, "loss": 16.4328, "step": 20 }, { "epoch": 0.0013575757575757575, "grad_norm": 118.8450927734375, "learning_rate": 6.881720430107527e-06, "loss": 16.6908, "step": 21 }, { "epoch": 0.0014222222222222223, "grad_norm": 251.05662536621094, "learning_rate": 7.3118279569892475e-06, "loss": 15.8439, "step": 22 }, { "epoch": 0.0014868686868686868, "grad_norm": 79.44746398925781, "learning_rate": 7.741935483870968e-06, "loss": 14.1431, "step": 23 }, { "epoch": 0.0015515151515151516, "grad_norm": 170.30389404296875, "learning_rate": 8.172043010752689e-06, "loss": 13.6728, "step": 24 }, { "epoch": 0.0016161616161616162, "grad_norm": 72.50897216796875, "learning_rate": 8.602150537634409e-06, "loss": 11.2049, "step": 25 }, { "epoch": 0.0016808080808080807, "grad_norm": 65.9330825805664, "learning_rate": 9.03225806451613e-06, "loss": 9.7004, "step": 26 }, { "epoch": 0.0017454545454545455, "grad_norm": 60.561038970947266, "learning_rate": 
9.46236559139785e-06, "loss": 8.3309, "step": 27 }, { "epoch": 0.00181010101010101, "grad_norm": 69.28730010986328, "learning_rate": 9.89247311827957e-06, "loss": 9.5886, "step": 28 }, { "epoch": 0.0018747474747474748, "grad_norm": 61.073875427246094, "learning_rate": 1.0322580645161291e-05, "loss": 7.9738, "step": 29 }, { "epoch": 0.0019393939393939393, "grad_norm": 49.478477478027344, "learning_rate": 1.0752688172043012e-05, "loss": 5.8632, "step": 30 }, { "epoch": 0.002004040404040404, "grad_norm": 50.41263198852539, "learning_rate": 1.1182795698924732e-05, "loss": 5.2403, "step": 31 }, { "epoch": 0.0020686868686868687, "grad_norm": 49.01531982421875, "learning_rate": 1.1612903225806453e-05, "loss": 4.9331, "step": 32 }, { "epoch": 0.0020686868686868687, "eval_bleu": 0.17225555175409368, "eval_loss": 2.72825288772583, "eval_runtime": 2.9075, "eval_samples_per_second": 11.006, "eval_steps_per_second": 1.376, "step": 32 }, { "epoch": 0.0021333333333333334, "grad_norm": 52.26249694824219, "learning_rate": 1.2043010752688173e-05, "loss": 4.3053, "step": 33 }, { "epoch": 0.0021979797979797978, "grad_norm": 52.2663688659668, "learning_rate": 1.2473118279569892e-05, "loss": 3.687, "step": 34 }, { "epoch": 0.0022626262626262625, "grad_norm": 28.177982330322266, "learning_rate": 1.2903225806451613e-05, "loss": 2.4059, "step": 35 }, { "epoch": 0.0023272727272727273, "grad_norm": 19.620092391967773, "learning_rate": 1.3333333333333333e-05, "loss": 1.8274, "step": 36 }, { "epoch": 0.002391919191919192, "grad_norm": 11.509565353393555, "learning_rate": 1.3763440860215054e-05, "loss": 1.4432, "step": 37 }, { "epoch": 0.0024565656565656564, "grad_norm": 8.501072883605957, "learning_rate": 1.4193548387096774e-05, "loss": 1.2904, "step": 38 }, { "epoch": 0.002521212121212121, "grad_norm": 6.885224342346191, "learning_rate": 1.4623655913978495e-05, "loss": 1.2263, "step": 39 }, { "epoch": 0.002585858585858586, "grad_norm": 5.4081645011901855, "learning_rate": 
1.5053763440860215e-05, "loss": 1.0505, "step": 40 }, { "epoch": 0.0026505050505050507, "grad_norm": 4.560916900634766, "learning_rate": 1.5483870967741936e-05, "loss": 1.0065, "step": 41 }, { "epoch": 0.002715151515151515, "grad_norm": 3.7118706703186035, "learning_rate": 1.5913978494623657e-05, "loss": 0.9423, "step": 42 }, { "epoch": 0.00277979797979798, "grad_norm": 3.2654550075531006, "learning_rate": 1.6344086021505377e-05, "loss": 0.817, "step": 43 }, { "epoch": 0.0028444444444444446, "grad_norm": 2.726564645767212, "learning_rate": 1.6774193548387098e-05, "loss": 0.7942, "step": 44 }, { "epoch": 0.002909090909090909, "grad_norm": 2.5318379402160645, "learning_rate": 1.7204301075268818e-05, "loss": 0.6455, "step": 45 }, { "epoch": 0.0029737373737373737, "grad_norm": 2.1529147624969482, "learning_rate": 1.763440860215054e-05, "loss": 0.642, "step": 46 }, { "epoch": 0.0030383838383838385, "grad_norm": 1.7998542785644531, "learning_rate": 1.806451612903226e-05, "loss": 0.5299, "step": 47 }, { "epoch": 0.0031030303030303032, "grad_norm": 1.2775508165359497, "learning_rate": 1.849462365591398e-05, "loss": 0.4811, "step": 48 }, { "epoch": 0.0031030303030303032, "eval_bleu": 0.0, "eval_loss": 0.36655399203300476, "eval_runtime": 2.9074, "eval_samples_per_second": 11.006, "eval_steps_per_second": 1.376, "step": 48 }, { "epoch": 0.0031676767676767676, "grad_norm": 0.896143913269043, "learning_rate": 1.89247311827957e-05, "loss": 0.4713, "step": 49 }, { "epoch": 0.0032323232323232323, "grad_norm": 0.6520284414291382, "learning_rate": 1.935483870967742e-05, "loss": 0.3871, "step": 50 }, { "epoch": 0.003296969696969697, "grad_norm": 0.5051870942115784, "learning_rate": 1.978494623655914e-05, "loss": 0.3739, "step": 51 }, { "epoch": 0.0033616161616161614, "grad_norm": 0.43156668543815613, "learning_rate": 2.0215053763440862e-05, "loss": 0.3706, "step": 52 }, { "epoch": 0.003426262626262626, "grad_norm": 0.45535755157470703, "learning_rate": 2.0645161290322582e-05, 
"loss": 0.3435, "step": 53 }, { "epoch": 0.003490909090909091, "grad_norm": 0.560636043548584, "learning_rate": 2.1075268817204303e-05, "loss": 0.3588, "step": 54 }, { "epoch": 0.0035555555555555557, "grad_norm": 0.33022740483283997, "learning_rate": 2.1505376344086024e-05, "loss": 0.3293, "step": 55 }, { "epoch": 0.00362020202020202, "grad_norm": 0.5199302434921265, "learning_rate": 2.1935483870967744e-05, "loss": 0.373, "step": 56 }, { "epoch": 0.003684848484848485, "grad_norm": 0.4684373438358307, "learning_rate": 2.2365591397849465e-05, "loss": 0.3395, "step": 57 }, { "epoch": 0.0037494949494949496, "grad_norm": 0.3800906836986542, "learning_rate": 2.2795698924731185e-05, "loss": 0.3123, "step": 58 }, { "epoch": 0.003814141414141414, "grad_norm": 0.3400656282901764, "learning_rate": 2.3225806451612906e-05, "loss": 0.3263, "step": 59 }, { "epoch": 0.0038787878787878787, "grad_norm": 0.25802895426750183, "learning_rate": 2.3655913978494626e-05, "loss": 0.3034, "step": 60 }, { "epoch": 0.0039434343434343435, "grad_norm": 0.2955683767795563, "learning_rate": 2.4086021505376347e-05, "loss": 0.2894, "step": 61 }, { "epoch": 0.004008080808080808, "grad_norm": 0.3214697241783142, "learning_rate": 2.4516129032258064e-05, "loss": 0.2861, "step": 62 }, { "epoch": 0.004072727272727273, "grad_norm": 0.21337643265724182, "learning_rate": 2.4946236559139784e-05, "loss": 0.2621, "step": 63 }, { "epoch": 0.004137373737373737, "grad_norm": 0.20859983563423157, "learning_rate": 2.537634408602151e-05, "loss": 0.244, "step": 64 }, { "epoch": 0.004137373737373737, "eval_bleu": 0.0, "eval_loss": 0.2603618800640106, "eval_runtime": 2.8237, "eval_samples_per_second": 11.333, "eval_steps_per_second": 1.417, "step": 64 }, { "epoch": 0.004202020202020202, "grad_norm": 0.2817174196243286, "learning_rate": 2.5806451612903226e-05, "loss": 0.3045, "step": 65 }, { "epoch": 0.004266666666666667, "grad_norm": 0.3275797665119171, "learning_rate": 2.623655913978495e-05, "loss": 0.3167, "step": 66 
}, { "epoch": 0.004331313131313131, "grad_norm": 0.2828182280063629, "learning_rate": 2.6666666666666667e-05, "loss": 0.2951, "step": 67 }, { "epoch": 0.0043959595959595955, "grad_norm": 0.3147842288017273, "learning_rate": 2.709677419354839e-05, "loss": 0.2565, "step": 68 }, { "epoch": 0.004460606060606061, "grad_norm": 0.2775740325450897, "learning_rate": 2.7526881720430108e-05, "loss": 0.2515, "step": 69 }, { "epoch": 0.004525252525252525, "grad_norm": 0.22093918919563293, "learning_rate": 2.7956989247311828e-05, "loss": 0.2277, "step": 70 }, { "epoch": 0.00458989898989899, "grad_norm": 0.2860693633556366, "learning_rate": 2.838709677419355e-05, "loss": 0.2625, "step": 71 }, { "epoch": 0.004654545454545455, "grad_norm": 0.2432243973016739, "learning_rate": 2.881720430107527e-05, "loss": 0.2475, "step": 72 }, { "epoch": 0.004719191919191919, "grad_norm": 0.2548949122428894, "learning_rate": 2.924731182795699e-05, "loss": 0.2591, "step": 73 }, { "epoch": 0.004783838383838384, "grad_norm": 0.2704143226146698, "learning_rate": 2.967741935483871e-05, "loss": 0.2398, "step": 74 }, { "epoch": 0.0048484848484848485, "grad_norm": 0.288873553276062, "learning_rate": 3.010752688172043e-05, "loss": 0.2207, "step": 75 }, { "epoch": 0.004913131313131313, "grad_norm": 0.43518224358558655, "learning_rate": 3.053763440860215e-05, "loss": 0.1955, "step": 76 }, { "epoch": 0.004977777777777778, "grad_norm": 0.23072396218776703, "learning_rate": 3.096774193548387e-05, "loss": 0.228, "step": 77 }, { "epoch": 0.005042424242424242, "grad_norm": 0.31609800457954407, "learning_rate": 3.139784946236559e-05, "loss": 0.2218, "step": 78 }, { "epoch": 0.005107070707070707, "grad_norm": 0.2203676551580429, "learning_rate": 3.182795698924731e-05, "loss": 0.1978, "step": 79 }, { "epoch": 0.005171717171717172, "grad_norm": 0.22825172543525696, "learning_rate": 3.2258064516129034e-05, "loss": 0.1938, "step": 80 }, { "epoch": 0.005171717171717172, "eval_bleu": 0.0, "eval_loss": 0.19310392439365387, 
"eval_runtime": 2.8471, "eval_samples_per_second": 11.24, "eval_steps_per_second": 1.405, "step": 80 }, { "epoch": 0.005236363636363636, "grad_norm": 0.24557648599147797, "learning_rate": 3.2688172043010754e-05, "loss": 0.2094, "step": 81 }, { "epoch": 0.005301010101010101, "grad_norm": 0.22608253359794617, "learning_rate": 3.3118279569892475e-05, "loss": 0.1994, "step": 82 }, { "epoch": 0.005365656565656566, "grad_norm": 0.23522968590259552, "learning_rate": 3.3548387096774195e-05, "loss": 0.2014, "step": 83 }, { "epoch": 0.00543030303030303, "grad_norm": 0.23294328153133392, "learning_rate": 3.3978494623655916e-05, "loss": 0.2068, "step": 84 }, { "epoch": 0.005494949494949495, "grad_norm": 0.23539653420448303, "learning_rate": 3.4408602150537636e-05, "loss": 0.203, "step": 85 }, { "epoch": 0.00555959595959596, "grad_norm": 0.32274749875068665, "learning_rate": 3.483870967741936e-05, "loss": 0.2463, "step": 86 }, { "epoch": 0.005624242424242424, "grad_norm": 0.25700676441192627, "learning_rate": 3.526881720430108e-05, "loss": 0.2063, "step": 87 }, { "epoch": 0.005688888888888889, "grad_norm": 0.2676103413105011, "learning_rate": 3.56989247311828e-05, "loss": 0.2091, "step": 88 }, { "epoch": 0.0057535353535353535, "grad_norm": 0.2309941202402115, "learning_rate": 3.612903225806452e-05, "loss": 0.1992, "step": 89 }, { "epoch": 0.005818181818181818, "grad_norm": 0.21144503355026245, "learning_rate": 3.655913978494624e-05, "loss": 0.1971, "step": 90 }, { "epoch": 0.005882828282828283, "grad_norm": 0.2694931924343109, "learning_rate": 3.698924731182796e-05, "loss": 0.216, "step": 91 }, { "epoch": 0.005947474747474747, "grad_norm": 0.39304476976394653, "learning_rate": 3.741935483870968e-05, "loss": 0.2672, "step": 92 }, { "epoch": 0.006012121212121212, "grad_norm": 0.2071070820093155, "learning_rate": 3.78494623655914e-05, "loss": 0.1754, "step": 93 }, { "epoch": 0.006076767676767677, "grad_norm": 0.23533540964126587, "learning_rate": 3.827956989247312e-05, "loss": 
0.198, "step": 94 }, { "epoch": 0.006141414141414141, "grad_norm": 0.23313389718532562, "learning_rate": 3.870967741935484e-05, "loss": 0.1926, "step": 95 }, { "epoch": 0.0062060606060606064, "grad_norm": 0.2327883541584015, "learning_rate": 3.913978494623656e-05, "loss": 0.1932, "step": 96 }, { "epoch": 0.0062060606060606064, "eval_bleu": 0.0, "eval_loss": 0.1656179279088974, "eval_runtime": 2.8794, "eval_samples_per_second": 11.113, "eval_steps_per_second": 1.389, "step": 96 }, { "epoch": 0.006270707070707071, "grad_norm": 0.24508926272392273, "learning_rate": 3.956989247311828e-05, "loss": 0.1954, "step": 97 }, { "epoch": 0.006335353535353535, "grad_norm": 0.27829626202583313, "learning_rate": 4e-05, "loss": 0.2349, "step": 98 }, { "epoch": 0.0064, "grad_norm": 0.2639622390270233, "learning_rate": 4.0430107526881724e-05, "loss": 0.2216, "step": 99 }, { "epoch": 0.006464646464646465, "grad_norm": 0.1994614154100418, "learning_rate": 4.0860215053763444e-05, "loss": 0.1629, "step": 100 }, { "epoch": 0.006529292929292929, "grad_norm": 0.2456357777118683, "learning_rate": 4.1290322580645165e-05, "loss": 0.183, "step": 101 }, { "epoch": 0.006593939393939394, "grad_norm": 0.253951758146286, "learning_rate": 4.172043010752688e-05, "loss": 0.1886, "step": 102 }, { "epoch": 0.0066585858585858585, "grad_norm": 0.2239103466272354, "learning_rate": 4.2150537634408606e-05, "loss": 0.1814, "step": 103 }, { "epoch": 0.006723232323232323, "grad_norm": 0.25032392144203186, "learning_rate": 4.258064516129032e-05, "loss": 0.1802, "step": 104 }, { "epoch": 0.006787878787878788, "grad_norm": 0.2761096656322479, "learning_rate": 4.301075268817205e-05, "loss": 0.1888, "step": 105 }, { "epoch": 0.006852525252525252, "grad_norm": 0.2294616997241974, "learning_rate": 4.344086021505376e-05, "loss": 0.1636, "step": 106 }, { "epoch": 0.006917171717171718, "grad_norm": 0.23688875138759613, "learning_rate": 4.387096774193549e-05, "loss": 0.1546, "step": 107 }, { "epoch": 0.006981818181818182, 
"grad_norm": 0.27356773614883423, "learning_rate": 4.43010752688172e-05, "loss": 0.1611, "step": 108 }, { "epoch": 0.007046464646464646, "grad_norm": 0.24604901671409607, "learning_rate": 4.473118279569893e-05, "loss": 0.1598, "step": 109 }, { "epoch": 0.0071111111111111115, "grad_norm": 0.22883355617523193, "learning_rate": 4.516129032258064e-05, "loss": 0.1334, "step": 110 }, { "epoch": 0.007175757575757576, "grad_norm": 0.3058576285839081, "learning_rate": 4.559139784946237e-05, "loss": 0.1583, "step": 111 }, { "epoch": 0.00724040404040404, "grad_norm": 0.25770094990730286, "learning_rate": 4.6021505376344084e-05, "loss": 0.1449, "step": 112 }, { "epoch": 0.00724040404040404, "eval_bleu": 0.004012177831303324, "eval_loss": 0.1327122449874878, "eval_runtime": 2.8585, "eval_samples_per_second": 11.195, "eval_steps_per_second": 1.399, "step": 112 }, { "epoch": 0.007305050505050505, "grad_norm": 0.24761152267456055, "learning_rate": 4.645161290322581e-05, "loss": 0.1485, "step": 113 }, { "epoch": 0.00736969696969697, "grad_norm": 0.24573925137519836, "learning_rate": 4.688172043010753e-05, "loss": 0.1406, "step": 114 }, { "epoch": 0.007434343434343434, "grad_norm": 0.276619553565979, "learning_rate": 4.731182795698925e-05, "loss": 0.1494, "step": 115 }, { "epoch": 0.007498989898989899, "grad_norm": 0.2808314561843872, "learning_rate": 4.774193548387097e-05, "loss": 0.1685, "step": 116 }, { "epoch": 0.0075636363636363635, "grad_norm": 0.2550530731678009, "learning_rate": 4.8172043010752693e-05, "loss": 0.1445, "step": 117 }, { "epoch": 0.007628282828282828, "grad_norm": 0.2385779619216919, "learning_rate": 4.8602150537634414e-05, "loss": 0.1473, "step": 118 }, { "epoch": 0.007692929292929293, "grad_norm": 0.22435127198696136, "learning_rate": 4.903225806451613e-05, "loss": 0.1424, "step": 119 }, { "epoch": 0.007757575757575757, "grad_norm": 0.21921704709529877, "learning_rate": 4.9462365591397855e-05, "loss": 0.1494, "step": 120 }, { "epoch": 0.007822222222222222, 
"grad_norm": 0.19863726198673248, "learning_rate": 4.989247311827957e-05, "loss": 0.1431, "step": 121 }, { "epoch": 0.007886868686868687, "grad_norm": 0.2568920850753784, "learning_rate": 5.032258064516129e-05, "loss": 0.1389, "step": 122 }, { "epoch": 0.007951515151515152, "grad_norm": 0.24171364307403564, "learning_rate": 5.075268817204302e-05, "loss": 0.1616, "step": 123 }, { "epoch": 0.008016161616161616, "grad_norm": 0.1914776861667633, "learning_rate": 5.118279569892474e-05, "loss": 0.1371, "step": 124 }, { "epoch": 0.00808080808080808, "grad_norm": 0.20348064601421356, "learning_rate": 5.161290322580645e-05, "loss": 0.1361, "step": 125 }, { "epoch": 0.008145454545454546, "grad_norm": 0.19211728870868683, "learning_rate": 5.204301075268817e-05, "loss": 0.1381, "step": 126 }, { "epoch": 0.00821010101010101, "grad_norm": 0.2039393037557602, "learning_rate": 5.24731182795699e-05, "loss": 0.1473, "step": 127 }, { "epoch": 0.008274747474747475, "grad_norm": 0.17385540902614594, "learning_rate": 5.290322580645162e-05, "loss": 0.1229, "step": 128 }, { "epoch": 0.008274747474747475, "eval_bleu": 6.231980354669859, "eval_loss": 0.12336916476488113, "eval_runtime": 2.4144, "eval_samples_per_second": 13.254, "eval_steps_per_second": 1.657, "step": 128 }, { "epoch": 0.00833939393939394, "grad_norm": 0.2055896371603012, "learning_rate": 5.333333333333333e-05, "loss": 0.1519, "step": 129 }, { "epoch": 0.008404040404040403, "grad_norm": 0.1943008005619049, "learning_rate": 5.3763440860215054e-05, "loss": 0.127, "step": 130 }, { "epoch": 0.008468686868686869, "grad_norm": 0.20179802179336548, "learning_rate": 5.419354838709678e-05, "loss": 0.1548, "step": 131 }, { "epoch": 0.008533333333333334, "grad_norm": 0.20027868449687958, "learning_rate": 5.46236559139785e-05, "loss": 0.1469, "step": 132 }, { "epoch": 0.008597979797979797, "grad_norm": 0.2637341320514679, "learning_rate": 5.5053763440860215e-05, "loss": 0.1439, "step": 133 }, { "epoch": 0.008662626262626262, 
"grad_norm": 0.2555563747882843, "learning_rate": 5.5483870967741936e-05, "loss": 0.124, "step": 134 }, { "epoch": 0.008727272727272728, "grad_norm": 0.21842581033706665, "learning_rate": 5.5913978494623656e-05, "loss": 0.1436, "step": 135 }, { "epoch": 0.008791919191919191, "grad_norm": 0.21638083457946777, "learning_rate": 5.6344086021505384e-05, "loss": 0.1476, "step": 136 }, { "epoch": 0.008856565656565656, "grad_norm": 0.2058630734682083, "learning_rate": 5.67741935483871e-05, "loss": 0.1488, "step": 137 }, { "epoch": 0.008921212121212121, "grad_norm": 0.178453728556633, "learning_rate": 5.720430107526882e-05, "loss": 0.1342, "step": 138 }, { "epoch": 0.008985858585858587, "grad_norm": 0.22210922837257385, "learning_rate": 5.763440860215054e-05, "loss": 0.1497, "step": 139 }, { "epoch": 0.00905050505050505, "grad_norm": 0.1818244308233261, "learning_rate": 5.8064516129032266e-05, "loss": 0.1297, "step": 140 }, { "epoch": 0.009115151515151515, "grad_norm": 0.19282251596450806, "learning_rate": 5.849462365591398e-05, "loss": 0.1225, "step": 141 }, { "epoch": 0.00917979797979798, "grad_norm": 0.2010301947593689, "learning_rate": 5.89247311827957e-05, "loss": 0.1489, "step": 142 }, { "epoch": 0.009244444444444444, "grad_norm": 0.21354353427886963, "learning_rate": 5.935483870967742e-05, "loss": 0.1614, "step": 143 }, { "epoch": 0.00930909090909091, "grad_norm": 0.2229049801826477, "learning_rate": 5.978494623655915e-05, "loss": 0.1265, "step": 144 }, { "epoch": 0.00930909090909091, "eval_bleu": 4.760007484239427, "eval_loss": 0.1183546632528305, "eval_runtime": 2.5139, "eval_samples_per_second": 12.729, "eval_steps_per_second": 1.591, "step": 144 }, { "epoch": 0.009373737373737374, "grad_norm": 0.18631191551685333, "learning_rate": 6.021505376344086e-05, "loss": 0.1467, "step": 145 }, { "epoch": 0.009438383838383838, "grad_norm": 0.19207659363746643, "learning_rate": 6.064516129032258e-05, "loss": 0.134, "step": 146 }, { "epoch": 0.009503030303030303, "grad_norm": 
0.18229234218597412, "learning_rate": 6.10752688172043e-05, "loss": 0.1294, "step": 147 }, { "epoch": 0.009567676767676768, "grad_norm": 0.21776947379112244, "learning_rate": 6.150537634408602e-05, "loss": 0.1417, "step": 148 }, { "epoch": 0.009632323232323232, "grad_norm": 0.1958850473165512, "learning_rate": 6.193548387096774e-05, "loss": 0.1239, "step": 149 }, { "epoch": 0.009696969696969697, "grad_norm": 0.2348705679178238, "learning_rate": 6.236559139784946e-05, "loss": 0.136, "step": 150 }, { "epoch": 0.009761616161616162, "grad_norm": 0.18097767233848572, "learning_rate": 6.279569892473119e-05, "loss": 0.1377, "step": 151 }, { "epoch": 0.009826262626262626, "grad_norm": 0.17325381934642792, "learning_rate": 6.32258064516129e-05, "loss": 0.1348, "step": 152 }, { "epoch": 0.00989090909090909, "grad_norm": 0.17179083824157715, "learning_rate": 6.365591397849463e-05, "loss": 0.1241, "step": 153 }, { "epoch": 0.009955555555555556, "grad_norm": 0.18480801582336426, "learning_rate": 6.408602150537635e-05, "loss": 0.1319, "step": 154 }, { "epoch": 0.01002020202020202, "grad_norm": 0.2286769598722458, "learning_rate": 6.451612903225807e-05, "loss": 0.152, "step": 155 }, { "epoch": 0.010084848484848485, "grad_norm": 0.17827528715133667, "learning_rate": 6.494623655913979e-05, "loss": 0.1287, "step": 156 }, { "epoch": 0.01014949494949495, "grad_norm": 0.22023025155067444, "learning_rate": 6.537634408602151e-05, "loss": 0.157, "step": 157 }, { "epoch": 0.010214141414141413, "grad_norm": 0.16185691952705383, "learning_rate": 6.580645161290323e-05, "loss": 0.1068, "step": 158 }, { "epoch": 0.010278787878787879, "grad_norm": 0.1795404702425003, "learning_rate": 6.623655913978495e-05, "loss": 0.1281, "step": 159 }, { "epoch": 0.010343434343434344, "grad_norm": 0.20915783941745758, "learning_rate": 6.666666666666667e-05, "loss": 0.1614, "step": 160 }, { "epoch": 0.010343434343434344, "eval_bleu": 6.592458544293429, "eval_loss": 0.11603529751300812, "eval_runtime": 2.5802, 
"eval_samples_per_second": 12.402, "eval_steps_per_second": 1.55, "step": 160 }, { "epoch": 0.010408080808080807, "grad_norm": 0.16381213068962097, "learning_rate": 6.709677419354839e-05, "loss": 0.1225, "step": 161 }, { "epoch": 0.010472727272727272, "grad_norm": 0.17188864946365356, "learning_rate": 6.752688172043011e-05, "loss": 0.1224, "step": 162 }, { "epoch": 0.010537373737373738, "grad_norm": 0.2340584695339203, "learning_rate": 6.795698924731183e-05, "loss": 0.1327, "step": 163 }, { "epoch": 0.010602020202020203, "grad_norm": 0.18974143266677856, "learning_rate": 6.838709677419355e-05, "loss": 0.1401, "step": 164 }, { "epoch": 0.010666666666666666, "grad_norm": 0.16689695417881012, "learning_rate": 6.881720430107527e-05, "loss": 0.1304, "step": 165 }, { "epoch": 0.010731313131313132, "grad_norm": 0.17095914483070374, "learning_rate": 6.924731182795699e-05, "loss": 0.1386, "step": 166 }, { "epoch": 0.010795959595959597, "grad_norm": 0.17770056426525116, "learning_rate": 6.967741935483871e-05, "loss": 0.1336, "step": 167 }, { "epoch": 0.01086060606060606, "grad_norm": 0.18237479031085968, "learning_rate": 7.010752688172043e-05, "loss": 0.136, "step": 168 }, { "epoch": 0.010925252525252525, "grad_norm": 0.16110308468341827, "learning_rate": 7.053763440860215e-05, "loss": 0.1138, "step": 169 }, { "epoch": 0.01098989898989899, "grad_norm": 0.18047171831130981, "learning_rate": 7.096774193548388e-05, "loss": 0.1378, "step": 170 }, { "epoch": 0.011054545454545454, "grad_norm": 0.16840171813964844, "learning_rate": 7.13978494623656e-05, "loss": 0.1399, "step": 171 }, { "epoch": 0.01111919191919192, "grad_norm": 0.17557452619075775, "learning_rate": 7.182795698924732e-05, "loss": 0.1335, "step": 172 }, { "epoch": 0.011183838383838384, "grad_norm": 0.15990616381168365, "learning_rate": 7.225806451612904e-05, "loss": 0.1152, "step": 173 }, { "epoch": 0.011248484848484848, "grad_norm": 0.19198541343212128, "learning_rate": 7.268817204301076e-05, "loss": 0.1474, "step": 
174 }, { "epoch": 0.011313131313131313, "grad_norm": 0.1823148876428604, "learning_rate": 7.311827956989248e-05, "loss": 0.1399, "step": 175 }, { "epoch": 0.011377777777777778, "grad_norm": 0.17804116010665894, "learning_rate": 7.35483870967742e-05, "loss": 0.1154, "step": 176 }, { "epoch": 0.011377777777777778, "eval_bleu": 8.868003981640207, "eval_loss": 0.11387009918689728, "eval_runtime": 2.8192, "eval_samples_per_second": 11.351, "eval_steps_per_second": 1.419, "step": 176 }, { "epoch": 0.011442424242424242, "grad_norm": 0.18475455045700073, "learning_rate": 7.397849462365592e-05, "loss": 0.1395, "step": 177 }, { "epoch": 0.011507070707070707, "grad_norm": 0.18610043823719025, "learning_rate": 7.440860215053764e-05, "loss": 0.133, "step": 178 }, { "epoch": 0.011571717171717172, "grad_norm": 0.19519630074501038, "learning_rate": 7.483870967741936e-05, "loss": 0.1533, "step": 179 }, { "epoch": 0.011636363636363636, "grad_norm": 0.2074822634458542, "learning_rate": 7.526881720430108e-05, "loss": 0.117, "step": 180 }, { "epoch": 0.0117010101010101, "grad_norm": 0.189409539103508, "learning_rate": 7.56989247311828e-05, "loss": 0.1244, "step": 181 }, { "epoch": 0.011765656565656566, "grad_norm": 0.16102752089500427, "learning_rate": 7.612903225806451e-05, "loss": 0.1172, "step": 182 }, { "epoch": 0.01183030303030303, "grad_norm": 0.1906556636095047, "learning_rate": 7.655913978494624e-05, "loss": 0.1062, "step": 183 }, { "epoch": 0.011894949494949495, "grad_norm": 0.17854231595993042, "learning_rate": 7.698924731182796e-05, "loss": 0.1232, "step": 184 }, { "epoch": 0.01195959595959596, "grad_norm": 0.22238144278526306, "learning_rate": 7.741935483870968e-05, "loss": 0.1436, "step": 185 }, { "epoch": 0.012024242424242423, "grad_norm": 0.18775483965873718, "learning_rate": 7.784946236559139e-05, "loss": 0.1272, "step": 186 }, { "epoch": 0.012088888888888889, "grad_norm": 0.1833924651145935, "learning_rate": 7.827956989247312e-05, "loss": 0.1434, "step": 187 }, { 
"epoch": 0.012153535353535354, "grad_norm": 0.19351021945476532, "learning_rate": 7.870967741935484e-05, "loss": 0.1387, "step": 188 }, { "epoch": 0.012218181818181819, "grad_norm": 0.1965048760175705, "learning_rate": 7.913978494623657e-05, "loss": 0.1233, "step": 189 }, { "epoch": 0.012282828282828282, "grad_norm": 0.20669583976268768, "learning_rate": 7.956989247311829e-05, "loss": 0.1371, "step": 190 }, { "epoch": 0.012347474747474748, "grad_norm": 0.17665086686611176, "learning_rate": 8e-05, "loss": 0.1303, "step": 191 }, { "epoch": 0.012412121212121213, "grad_norm": 0.16998814046382904, "learning_rate": 8.043010752688173e-05, "loss": 0.1269, "step": 192 }, { "epoch": 0.012412121212121213, "eval_bleu": 4.975597595079175, "eval_loss": 0.1205844059586525, "eval_runtime": 2.4161, "eval_samples_per_second": 13.245, "eval_steps_per_second": 1.656, "step": 192 }, { "epoch": 0.012476767676767676, "grad_norm": 0.1767059713602066, "learning_rate": 8.086021505376345e-05, "loss": 0.1318, "step": 193 }, { "epoch": 0.012541414141414142, "grad_norm": 0.15831409394741058, "learning_rate": 8.129032258064517e-05, "loss": 0.1129, "step": 194 }, { "epoch": 0.012606060606060607, "grad_norm": 0.22060465812683105, "learning_rate": 8.172043010752689e-05, "loss": 0.1389, "step": 195 }, { "epoch": 0.01267070707070707, "grad_norm": 0.15754492580890656, "learning_rate": 8.215053763440861e-05, "loss": 0.1112, "step": 196 }, { "epoch": 0.012735353535353535, "grad_norm": 0.17196407914161682, "learning_rate": 8.258064516129033e-05, "loss": 0.1213, "step": 197 }, { "epoch": 0.0128, "grad_norm": 0.195866659283638, "learning_rate": 8.301075268817205e-05, "loss": 0.1517, "step": 198 }, { "epoch": 0.012864646464646464, "grad_norm": 0.1495412290096283, "learning_rate": 8.344086021505376e-05, "loss": 0.1067, "step": 199 }, { "epoch": 0.01292929292929293, "grad_norm": 0.22708700597286224, "learning_rate": 8.387096774193549e-05, "loss": 0.1478, "step": 200 }, { "epoch": 0.012993939393939394, 
"grad_norm": 0.18813180923461914, "learning_rate": 8.430107526881721e-05, "loss": 0.1474, "step": 201 }, { "epoch": 0.013058585858585858, "grad_norm": 0.212677463889122, "learning_rate": 8.473118279569893e-05, "loss": 0.1276, "step": 202 }, { "epoch": 0.013123232323232323, "grad_norm": 0.19477659463882446, "learning_rate": 8.516129032258064e-05, "loss": 0.1407, "step": 203 }, { "epoch": 0.013187878787878788, "grad_norm": 0.17175044119358063, "learning_rate": 8.559139784946237e-05, "loss": 0.1357, "step": 204 }, { "epoch": 0.013252525252525252, "grad_norm": 0.18147721886634827, "learning_rate": 8.60215053763441e-05, "loss": 0.1111, "step": 205 }, { "epoch": 0.013317171717171717, "grad_norm": 0.18739114701747894, "learning_rate": 8.645161290322581e-05, "loss": 0.1306, "step": 206 }, { "epoch": 0.013381818181818182, "grad_norm": 0.18932896852493286, "learning_rate": 8.688172043010752e-05, "loss": 0.1441, "step": 207 }, { "epoch": 0.013446464646464646, "grad_norm": 0.16915710270404816, "learning_rate": 8.731182795698926e-05, "loss": 0.1175, "step": 208 }, { "epoch": 0.013446464646464646, "eval_bleu": 7.276512799060872, "eval_loss": 0.11653009057044983, "eval_runtime": 2.4967, "eval_samples_per_second": 12.817, "eval_steps_per_second": 1.602, "step": 208 }, { "epoch": 0.013511111111111111, "grad_norm": 0.17084503173828125, "learning_rate": 8.774193548387098e-05, "loss": 0.1093, "step": 209 }, { "epoch": 0.013575757575757576, "grad_norm": 0.16955283284187317, "learning_rate": 8.81720430107527e-05, "loss": 0.1178, "step": 210 }, { "epoch": 0.01364040404040404, "grad_norm": 0.18181230127811432, "learning_rate": 8.86021505376344e-05, "loss": 0.1293, "step": 211 }, { "epoch": 0.013705050505050505, "grad_norm": 0.1908804327249527, "learning_rate": 8.903225806451614e-05, "loss": 0.139, "step": 212 }, { "epoch": 0.01376969696969697, "grad_norm": 0.19779594242572784, "learning_rate": 8.946236559139786e-05, "loss": 0.101, "step": 213 }, { "epoch": 0.013834343434343435, 
"grad_norm": 0.14478574693202972, "learning_rate": 8.989247311827958e-05, "loss": 0.1066, "step": 214 }, { "epoch": 0.013898989898989899, "grad_norm": 0.28901779651641846, "learning_rate": 9.032258064516129e-05, "loss": 0.1184, "step": 215 }, { "epoch": 0.013963636363636364, "grad_norm": 0.15521469712257385, "learning_rate": 9.0752688172043e-05, "loss": 0.1077, "step": 216 }, { "epoch": 0.014028282828282829, "grad_norm": 0.20941248536109924, "learning_rate": 9.118279569892474e-05, "loss": 0.1559, "step": 217 }, { "epoch": 0.014092929292929293, "grad_norm": 0.18314620852470398, "learning_rate": 9.161290322580646e-05, "loss": 0.1388, "step": 218 }, { "epoch": 0.014157575757575758, "grad_norm": 0.17776654660701752, "learning_rate": 9.204301075268817e-05, "loss": 0.117, "step": 219 }, { "epoch": 0.014222222222222223, "grad_norm": 0.2089313566684723, "learning_rate": 9.247311827956989e-05, "loss": 0.1436, "step": 220 }, { "epoch": 0.014286868686868686, "grad_norm": 0.1833810657262802, "learning_rate": 9.290322580645162e-05, "loss": 0.13, "step": 221 }, { "epoch": 0.014351515151515152, "grad_norm": 0.18056225776672363, "learning_rate": 9.333333333333334e-05, "loss": 0.1171, "step": 222 }, { "epoch": 0.014416161616161617, "grad_norm": 0.22030754387378693, "learning_rate": 9.376344086021506e-05, "loss": 0.1385, "step": 223 }, { "epoch": 0.01448080808080808, "grad_norm": 0.17485859990119934, "learning_rate": 9.419354838709677e-05, "loss": 0.1092, "step": 224 }, { "epoch": 0.01448080808080808, "eval_bleu": 7.809131634642059, "eval_loss": 0.1150018498301506, "eval_runtime": 2.5848, "eval_samples_per_second": 12.38, "eval_steps_per_second": 1.547, "step": 224 }, { "epoch": 0.014545454545454545, "grad_norm": 0.1666616052389145, "learning_rate": 9.46236559139785e-05, "loss": 0.1136, "step": 225 }, { "epoch": 0.01461010101010101, "grad_norm": 0.18403193354606628, "learning_rate": 9.505376344086023e-05, "loss": 0.1392, "step": 226 }, { "epoch": 0.014674747474747474, "grad_norm": 
0.17291735112667084, "learning_rate": 9.548387096774195e-05, "loss": 0.109, "step": 227 }, { "epoch": 0.01473939393939394, "grad_norm": 0.17544101178646088, "learning_rate": 9.591397849462365e-05, "loss": 0.1334, "step": 228 }, { "epoch": 0.014804040404040405, "grad_norm": 0.17171847820281982, "learning_rate": 9.634408602150539e-05, "loss": 0.1254, "step": 229 }, { "epoch": 0.014868686868686868, "grad_norm": 0.17139862477779388, "learning_rate": 9.677419354838711e-05, "loss": 0.1071, "step": 230 }, { "epoch": 0.014933333333333333, "grad_norm": 0.17982217669487, "learning_rate": 9.720430107526883e-05, "loss": 0.097, "step": 231 }, { "epoch": 0.014997979797979798, "grad_norm": 0.1756395548582077, "learning_rate": 9.763440860215054e-05, "loss": 0.1341, "step": 232 }, { "epoch": 0.015062626262626262, "grad_norm": 0.1753191500902176, "learning_rate": 9.806451612903226e-05, "loss": 0.128, "step": 233 }, { "epoch": 0.015127272727272727, "grad_norm": 0.16588540375232697, "learning_rate": 9.849462365591399e-05, "loss": 0.1171, "step": 234 }, { "epoch": 0.015191919191919192, "grad_norm": 0.17201820015907288, "learning_rate": 9.892473118279571e-05, "loss": 0.1312, "step": 235 }, { "epoch": 0.015256565656565656, "grad_norm": 0.1373736709356308, "learning_rate": 9.935483870967742e-05, "loss": 0.0967, "step": 236 }, { "epoch": 0.015321212121212121, "grad_norm": 0.19025053083896637, "learning_rate": 9.978494623655914e-05, "loss": 0.1205, "step": 237 }, { "epoch": 0.015385858585858586, "grad_norm": 0.17612391710281372, "learning_rate": 0.00010021505376344087, "loss": 0.131, "step": 238 }, { "epoch": 0.015450505050505051, "grad_norm": 0.17722564935684204, "learning_rate": 0.00010064516129032258, "loss": 0.1275, "step": 239 }, { "epoch": 0.015515151515151515, "grad_norm": 0.157410129904747, "learning_rate": 0.0001010752688172043, "loss": 0.1163, "step": 240 }, { "epoch": 0.015515151515151515, "eval_bleu": 7.879718785339313, "eval_loss": 0.11526702344417572, "eval_runtime": 2.6169, 
"eval_samples_per_second": 12.228, "eval_steps_per_second": 1.529, "step": 240 }, { "epoch": 0.01557979797979798, "grad_norm": 0.15548042953014374, "learning_rate": 0.00010150537634408603, "loss": 0.113, "step": 241 }, { "epoch": 0.015644444444444443, "grad_norm": 0.16928356885910034, "learning_rate": 0.00010193548387096774, "loss": 0.1227, "step": 242 }, { "epoch": 0.01570909090909091, "grad_norm": 0.16685104370117188, "learning_rate": 0.00010236559139784947, "loss": 0.1269, "step": 243 }, { "epoch": 0.015773737373737374, "grad_norm": 0.18364295363426208, "learning_rate": 0.0001027956989247312, "loss": 0.1318, "step": 244 }, { "epoch": 0.01583838383838384, "grad_norm": 0.14628556370735168, "learning_rate": 0.0001032258064516129, "loss": 0.109, "step": 245 }, { "epoch": 0.015903030303030304, "grad_norm": 0.16133062541484833, "learning_rate": 0.00010365591397849464, "loss": 0.1171, "step": 246 }, { "epoch": 0.015967676767676766, "grad_norm": 0.2608477771282196, "learning_rate": 0.00010408602150537634, "loss": 0.1279, "step": 247 }, { "epoch": 0.01603232323232323, "grad_norm": 0.16215598583221436, "learning_rate": 0.00010451612903225806, "loss": 0.1191, "step": 248 }, { "epoch": 0.016096969696969696, "grad_norm": 0.15406298637390137, "learning_rate": 0.0001049462365591398, "loss": 0.1197, "step": 249 }, { "epoch": 0.01616161616161616, "grad_norm": 0.15008051693439484, "learning_rate": 0.0001053763440860215, "loss": 0.1099, "step": 250 }, { "epoch": 0.016226262626262627, "grad_norm": 0.1588115245103836, "learning_rate": 0.00010580645161290324, "loss": 0.1269, "step": 251 }, { "epoch": 0.016290909090909092, "grad_norm": 0.1505240648984909, "learning_rate": 0.00010623655913978495, "loss": 0.117, "step": 252 }, { "epoch": 0.016355555555555557, "grad_norm": 0.1762067973613739, "learning_rate": 0.00010666666666666667, "loss": 0.1432, "step": 253 }, { "epoch": 0.01642020202020202, "grad_norm": 0.1506965160369873, "learning_rate": 0.0001070967741935484, "loss": 0.1075, 
"step": 254 }, { "epoch": 0.016484848484848484, "grad_norm": 0.16768166422843933, "learning_rate": 0.00010752688172043011, "loss": 0.1211, "step": 255 }, { "epoch": 0.01654949494949495, "grad_norm": 0.18663646280765533, "learning_rate": 0.00010795698924731184, "loss": 0.1161, "step": 256 }, { "epoch": 0.01654949494949495, "eval_bleu": 7.040350332130717, "eval_loss": 0.1121298223733902, "eval_runtime": 2.4273, "eval_samples_per_second": 13.183, "eval_steps_per_second": 1.648, "step": 256 }, { "epoch": 0.016614141414141415, "grad_norm": 0.17525675892829895, "learning_rate": 0.00010838709677419356, "loss": 0.1269, "step": 257 }, { "epoch": 0.01667878787878788, "grad_norm": 0.15843874216079712, "learning_rate": 0.00010881720430107527, "loss": 0.1191, "step": 258 }, { "epoch": 0.016743434343434345, "grad_norm": 0.1823432892560959, "learning_rate": 0.000109247311827957, "loss": 0.1353, "step": 259 }, { "epoch": 0.016808080808080807, "grad_norm": 0.18377064168453217, "learning_rate": 0.00010967741935483871, "loss": 0.1424, "step": 260 }, { "epoch": 0.016872727272727272, "grad_norm": 0.1670287549495697, "learning_rate": 0.00011010752688172043, "loss": 0.1167, "step": 261 }, { "epoch": 0.016937373737373737, "grad_norm": 0.17760691046714783, "learning_rate": 0.00011053763440860216, "loss": 0.1371, "step": 262 }, { "epoch": 0.017002020202020202, "grad_norm": 0.15814171731472015, "learning_rate": 0.00011096774193548387, "loss": 0.1083, "step": 263 }, { "epoch": 0.017066666666666667, "grad_norm": 0.168021097779274, "learning_rate": 0.0001113978494623656, "loss": 0.1212, "step": 264 }, { "epoch": 0.017131313131313133, "grad_norm": 0.1638791710138321, "learning_rate": 0.00011182795698924731, "loss": 0.1227, "step": 265 }, { "epoch": 0.017195959595959594, "grad_norm": 0.16392038762569427, "learning_rate": 0.00011225806451612903, "loss": 0.1123, "step": 266 }, { "epoch": 0.01726060606060606, "grad_norm": 0.1560571789741516, "learning_rate": 0.00011268817204301077, "loss": 0.1039, 
"step": 267 }, { "epoch": 0.017325252525252525, "grad_norm": 0.15675808489322662, "learning_rate": 0.00011311827956989247, "loss": 0.1192, "step": 268 }, { "epoch": 0.01738989898989899, "grad_norm": 0.1586473435163498, "learning_rate": 0.0001135483870967742, "loss": 0.115, "step": 269 }, { "epoch": 0.017454545454545455, "grad_norm": 0.23141983151435852, "learning_rate": 0.00011397849462365593, "loss": 0.1105, "step": 270 }, { "epoch": 0.01751919191919192, "grad_norm": 0.17948609590530396, "learning_rate": 0.00011440860215053764, "loss": 0.1238, "step": 271 }, { "epoch": 0.017583838383838382, "grad_norm": 0.1589084267616272, "learning_rate": 0.00011483870967741937, "loss": 0.1142, "step": 272 }, { "epoch": 0.017583838383838382, "eval_bleu": 8.868664561614475, "eval_loss": 0.11044229567050934, "eval_runtime": 2.6707, "eval_samples_per_second": 11.982, "eval_steps_per_second": 1.498, "step": 272 }, { "epoch": 0.017648484848484847, "grad_norm": 0.17362943291664124, "learning_rate": 0.00011526881720430108, "loss": 0.1427, "step": 273 }, { "epoch": 0.017713131313131313, "grad_norm": 0.16138732433319092, "learning_rate": 0.0001156989247311828, "loss": 0.1143, "step": 274 }, { "epoch": 0.017777777777777778, "grad_norm": 0.15941482782363892, "learning_rate": 0.00011612903225806453, "loss": 0.1088, "step": 275 }, { "epoch": 0.017842424242424243, "grad_norm": 0.15772004425525665, "learning_rate": 0.00011655913978494624, "loss": 0.12, "step": 276 }, { "epoch": 0.017907070707070708, "grad_norm": 0.16370543837547302, "learning_rate": 0.00011698924731182796, "loss": 0.1172, "step": 277 }, { "epoch": 0.017971717171717173, "grad_norm": 0.1775922328233719, "learning_rate": 0.00011741935483870967, "loss": 0.1211, "step": 278 }, { "epoch": 0.018036363636363635, "grad_norm": 0.17987173795700073, "learning_rate": 0.0001178494623655914, "loss": 0.1354, "step": 279 }, { "epoch": 0.0181010101010101, "grad_norm": 0.1710910052061081, "learning_rate": 0.00011827956989247313, "loss": 0.1128, 
"step": 280 }, { "epoch": 0.018165656565656566, "grad_norm": 0.15026962757110596, "learning_rate": 0.00011870967741935484, "loss": 0.1001, "step": 281 }, { "epoch": 0.01823030303030303, "grad_norm": 0.16633881628513336, "learning_rate": 0.00011913978494623656, "loss": 0.1117, "step": 282 }, { "epoch": 0.018294949494949496, "grad_norm": 0.18486183881759644, "learning_rate": 0.0001195698924731183, "loss": 0.1416, "step": 283 }, { "epoch": 0.01835959595959596, "grad_norm": 0.15950919687747955, "learning_rate": 0.00012, "loss": 0.1145, "step": 284 }, { "epoch": 0.018424242424242423, "grad_norm": 0.17504741251468658, "learning_rate": 0.00012043010752688172, "loss": 0.1325, "step": 285 }, { "epoch": 0.018488888888888888, "grad_norm": 0.17341840267181396, "learning_rate": 0.00012086021505376343, "loss": 0.1324, "step": 286 }, { "epoch": 0.018553535353535353, "grad_norm": 0.16707849502563477, "learning_rate": 0.00012129032258064516, "loss": 0.0998, "step": 287 }, { "epoch": 0.01861818181818182, "grad_norm": 0.2094668447971344, "learning_rate": 0.0001217204301075269, "loss": 0.1306, "step": 288 }, { "epoch": 0.01861818181818182, "eval_bleu": 8.091362015630345, "eval_loss": 0.11228324472904205, "eval_runtime": 2.3875, "eval_samples_per_second": 13.403, "eval_steps_per_second": 1.675, "step": 288 }, { "epoch": 0.018682828282828284, "grad_norm": 0.15353377163410187, "learning_rate": 0.0001221505376344086, "loss": 0.1039, "step": 289 }, { "epoch": 0.01874747474747475, "grad_norm": 0.14731451869010925, "learning_rate": 0.00012258064516129034, "loss": 0.1, "step": 290 }, { "epoch": 0.01881212121212121, "grad_norm": 0.170234814286232, "learning_rate": 0.00012301075268817205, "loss": 0.1249, "step": 291 }, { "epoch": 0.018876767676767676, "grad_norm": 0.16308999061584473, "learning_rate": 0.00012344086021505375, "loss": 0.1317, "step": 292 }, { "epoch": 0.01894141414141414, "grad_norm": 0.15882478654384613, "learning_rate": 0.0001238709677419355, "loss": 0.105, "step": 293 }, { 
"epoch": 0.019006060606060606, "grad_norm": 0.17380963265895844, "learning_rate": 0.0001243010752688172, "loss": 0.1343, "step": 294 }, { "epoch": 0.01907070707070707, "grad_norm": 0.1621273010969162, "learning_rate": 0.00012473118279569893, "loss": 0.1197, "step": 295 }, { "epoch": 0.019135353535353537, "grad_norm": 0.16630950570106506, "learning_rate": 0.00012516129032258066, "loss": 0.1219, "step": 296 }, { "epoch": 0.0192, "grad_norm": 0.18303367495536804, "learning_rate": 0.00012559139784946237, "loss": 0.1093, "step": 297 }, { "epoch": 0.019264646464646464, "grad_norm": 0.20401039719581604, "learning_rate": 0.0001260215053763441, "loss": 0.1268, "step": 298 }, { "epoch": 0.01932929292929293, "grad_norm": 0.19388994574546814, "learning_rate": 0.0001264516129032258, "loss": 0.1371, "step": 299 }, { "epoch": 0.019393939393939394, "grad_norm": 0.16796191036701202, "learning_rate": 0.00012688172043010752, "loss": 0.1222, "step": 300 }, { "epoch": 0.01945858585858586, "grad_norm": 0.17263269424438477, "learning_rate": 0.00012731182795698925, "loss": 0.1103, "step": 301 }, { "epoch": 0.019523232323232324, "grad_norm": 0.15675833821296692, "learning_rate": 0.00012774193548387096, "loss": 0.1128, "step": 302 }, { "epoch": 0.01958787878787879, "grad_norm": 0.174618199467659, "learning_rate": 0.0001281720430107527, "loss": 0.1342, "step": 303 }, { "epoch": 0.01965252525252525, "grad_norm": 0.1889706254005432, "learning_rate": 0.00012860215053763443, "loss": 0.1336, "step": 304 }, { "epoch": 0.01965252525252525, "eval_bleu": 10.705749789759375, "eval_loss": 0.11087147146463394, "eval_runtime": 2.9216, "eval_samples_per_second": 10.953, "eval_steps_per_second": 1.369, "step": 304 }, { "epoch": 0.019717171717171716, "grad_norm": 0.1739739030599594, "learning_rate": 0.00012903225806451613, "loss": 0.1166, "step": 305 }, { "epoch": 0.01978181818181818, "grad_norm": 0.14401200413703918, "learning_rate": 0.00012946236559139787, "loss": 0.0986, "step": 306 }, { "epoch": 
0.019846464646464647, "grad_norm": 0.1703801155090332, "learning_rate": 0.00012989247311827958, "loss": 0.1257, "step": 307 }, { "epoch": 0.019911111111111112, "grad_norm": 0.13767610490322113, "learning_rate": 0.0001303225806451613, "loss": 0.096, "step": 308 }, { "epoch": 0.019975757575757577, "grad_norm": 0.15267498791217804, "learning_rate": 0.00013075268817204302, "loss": 0.11, "step": 309 }, { "epoch": 0.02004040404040404, "grad_norm": 0.13670149445533752, "learning_rate": 0.00013118279569892472, "loss": 0.0986, "step": 310 }, { "epoch": 0.020105050505050504, "grad_norm": 0.1716107279062271, "learning_rate": 0.00013161290322580646, "loss": 0.1348, "step": 311 }, { "epoch": 0.02016969696969697, "grad_norm": 0.1574842482805252, "learning_rate": 0.00013204301075268816, "loss": 0.1177, "step": 312 }, { "epoch": 0.020234343434343435, "grad_norm": 0.16496491432189941, "learning_rate": 0.0001324731182795699, "loss": 0.1257, "step": 313 }, { "epoch": 0.0202989898989899, "grad_norm": 0.14341674745082855, "learning_rate": 0.00013290322580645163, "loss": 0.1009, "step": 314 }, { "epoch": 0.020363636363636365, "grad_norm": 0.16001056134700775, "learning_rate": 0.00013333333333333334, "loss": 0.1206, "step": 315 }, { "epoch": 0.020428282828282827, "grad_norm": 0.16191469132900238, "learning_rate": 0.00013376344086021507, "loss": 0.1209, "step": 316 }, { "epoch": 0.020492929292929292, "grad_norm": 0.1328209638595581, "learning_rate": 0.00013419354838709678, "loss": 0.0868, "step": 317 }, { "epoch": 0.020557575757575757, "grad_norm": 0.16641101241111755, "learning_rate": 0.0001346236559139785, "loss": 0.1241, "step": 318 }, { "epoch": 0.020622222222222222, "grad_norm": 0.15474484860897064, "learning_rate": 0.00013505376344086022, "loss": 0.1055, "step": 319 }, { "epoch": 0.020686868686868688, "grad_norm": 0.16334268450737, "learning_rate": 0.00013548387096774193, "loss": 0.111, "step": 320 }, { "epoch": 0.020686868686868688, "eval_bleu": 8.121325828182469, "eval_loss": 
0.11115045845508575, "eval_runtime": 2.3728, "eval_samples_per_second": 13.486, "eval_steps_per_second": 1.686, "step": 320 }, { "epoch": 0.020751515151515153, "grad_norm": 0.129131481051445, "learning_rate": 0.00013591397849462366, "loss": 0.0885, "step": 321 }, { "epoch": 0.020816161616161614, "grad_norm": 0.1746491640806198, "learning_rate": 0.0001363440860215054, "loss": 0.1311, "step": 322 }, { "epoch": 0.02088080808080808, "grad_norm": 0.15188440680503845, "learning_rate": 0.0001367741935483871, "loss": 0.1223, "step": 323 }, { "epoch": 0.020945454545454545, "grad_norm": 0.15379559993743896, "learning_rate": 0.00013720430107526884, "loss": 0.1156, "step": 324 }, { "epoch": 0.02101010101010101, "grad_norm": 0.16021427512168884, "learning_rate": 0.00013763440860215055, "loss": 0.1242, "step": 325 }, { "epoch": 0.021074747474747475, "grad_norm": 0.16248944401741028, "learning_rate": 0.00013806451612903225, "loss": 0.109, "step": 326 }, { "epoch": 0.02113939393939394, "grad_norm": 0.1452634632587433, "learning_rate": 0.00013849462365591399, "loss": 0.1041, "step": 327 }, { "epoch": 0.021204040404040406, "grad_norm": 0.1568138748407364, "learning_rate": 0.0001389247311827957, "loss": 0.1139, "step": 328 }, { "epoch": 0.021268686868686867, "grad_norm": 0.15839411318302155, "learning_rate": 0.00013935483870967743, "loss": 0.1102, "step": 329 }, { "epoch": 0.021333333333333333, "grad_norm": 0.15745337307453156, "learning_rate": 0.00013978494623655916, "loss": 0.1223, "step": 330 }, { "epoch": 0.021397979797979798, "grad_norm": 0.1718154102563858, "learning_rate": 0.00014021505376344087, "loss": 0.1301, "step": 331 }, { "epoch": 0.021462626262626263, "grad_norm": 0.16774027049541473, "learning_rate": 0.0001406451612903226, "loss": 0.1301, "step": 332 }, { "epoch": 0.021527272727272728, "grad_norm": 0.1731647104024887, "learning_rate": 0.0001410752688172043, "loss": 0.1336, "step": 333 }, { "epoch": 0.021591919191919193, "grad_norm": 0.15440630912780762, 
"learning_rate": 0.00014150537634408602, "loss": 0.1105, "step": 334 }, { "epoch": 0.021656565656565655, "grad_norm": 0.16279609501361847, "learning_rate": 0.00014193548387096775, "loss": 0.1217, "step": 335 }, { "epoch": 0.02172121212121212, "grad_norm": 0.143400177359581, "learning_rate": 0.00014236559139784946, "loss": 0.0959, "step": 336 }, { "epoch": 0.02172121212121212, "eval_bleu": 9.318219961493885, "eval_loss": 0.10899758338928223, "eval_runtime": 2.8131, "eval_samples_per_second": 11.375, "eval_steps_per_second": 1.422, "step": 336 }, { "epoch": 0.021785858585858586, "grad_norm": 0.14454936981201172, "learning_rate": 0.0001427956989247312, "loss": 0.1034, "step": 337 }, { "epoch": 0.02185050505050505, "grad_norm": 0.15263137221336365, "learning_rate": 0.00014322580645161293, "loss": 0.1184, "step": 338 }, { "epoch": 0.021915151515151516, "grad_norm": 0.16419324278831482, "learning_rate": 0.00014365591397849463, "loss": 0.112, "step": 339 }, { "epoch": 0.02197979797979798, "grad_norm": 0.16806355118751526, "learning_rate": 0.00014408602150537637, "loss": 0.1249, "step": 340 }, { "epoch": 0.022044444444444443, "grad_norm": 0.1690777838230133, "learning_rate": 0.00014451612903225807, "loss": 0.1272, "step": 341 }, { "epoch": 0.022109090909090908, "grad_norm": 0.16707856953144073, "learning_rate": 0.00014494623655913978, "loss": 0.1139, "step": 342 }, { "epoch": 0.022173737373737373, "grad_norm": 0.17025162279605865, "learning_rate": 0.00014537634408602151, "loss": 0.1201, "step": 343 }, { "epoch": 0.02223838383838384, "grad_norm": 0.14075659215450287, "learning_rate": 0.00014580645161290322, "loss": 0.1028, "step": 344 }, { "epoch": 0.022303030303030304, "grad_norm": 0.17504535615444183, "learning_rate": 0.00014623655913978496, "loss": 0.1383, "step": 345 }, { "epoch": 0.02236767676767677, "grad_norm": 0.16352058947086334, "learning_rate": 0.00014666666666666666, "loss": 0.1229, "step": 346 }, { "epoch": 0.02243232323232323, "grad_norm": 0.16439294815063477, 
"learning_rate": 0.0001470967741935484, "loss": 0.1181, "step": 347 }, { "epoch": 0.022496969696969696, "grad_norm": 0.16326642036437988, "learning_rate": 0.00014752688172043013, "loss": 0.1095, "step": 348 }, { "epoch": 0.02256161616161616, "grad_norm": 0.1376395970582962, "learning_rate": 0.00014795698924731184, "loss": 0.0921, "step": 349 }, { "epoch": 0.022626262626262626, "grad_norm": 0.14896328747272491, "learning_rate": 0.00014838709677419355, "loss": 0.1139, "step": 350 }, { "epoch": 0.02269090909090909, "grad_norm": 0.1763393133878708, "learning_rate": 0.00014881720430107528, "loss": 0.1084, "step": 351 }, { "epoch": 0.022755555555555557, "grad_norm": 0.13971780240535736, "learning_rate": 0.00014924731182795699, "loss": 0.0977, "step": 352 }, { "epoch": 0.022755555555555557, "eval_bleu": 9.32524427394953, "eval_loss": 0.10860705375671387, "eval_runtime": 2.6408, "eval_samples_per_second": 12.118, "eval_steps_per_second": 1.515, "step": 352 }, { "epoch": 0.022820202020202022, "grad_norm": 0.1523587703704834, "learning_rate": 0.00014967741935483872, "loss": 0.1175, "step": 353 }, { "epoch": 0.022884848484848484, "grad_norm": 0.18091051280498505, "learning_rate": 0.00015010752688172043, "loss": 0.1047, "step": 354 }, { "epoch": 0.02294949494949495, "grad_norm": 0.18230056762695312, "learning_rate": 0.00015053763440860216, "loss": 0.1293, "step": 355 }, { "epoch": 0.023014141414141414, "grad_norm": 0.17838574945926666, "learning_rate": 0.0001509677419354839, "loss": 0.1225, "step": 356 }, { "epoch": 0.02307878787878788, "grad_norm": 0.18112388253211975, "learning_rate": 0.0001513978494623656, "loss": 0.1322, "step": 357 }, { "epoch": 0.023143434343434344, "grad_norm": 0.16122449934482574, "learning_rate": 0.0001518279569892473, "loss": 0.1133, "step": 358 }, { "epoch": 0.02320808080808081, "grad_norm": 0.14952917397022247, "learning_rate": 0.00015225806451612902, "loss": 0.1026, "step": 359 }, { "epoch": 0.02327272727272727, "grad_norm": 0.16230295598506927, 
"learning_rate": 0.00015268817204301075, "loss": 0.1224, "step": 360 }, { "epoch": 0.023337373737373737, "grad_norm": 0.14395084977149963, "learning_rate": 0.00015311827956989248, "loss": 0.1149, "step": 361 }, { "epoch": 0.0234020202020202, "grad_norm": 0.17280416190624237, "learning_rate": 0.0001535483870967742, "loss": 0.1217, "step": 362 }, { "epoch": 0.023466666666666667, "grad_norm": 0.15629975497722626, "learning_rate": 0.00015397849462365593, "loss": 0.1207, "step": 363 }, { "epoch": 0.023531313131313132, "grad_norm": 0.15779243409633636, "learning_rate": 0.00015440860215053766, "loss": 0.1247, "step": 364 }, { "epoch": 0.023595959595959597, "grad_norm": 0.16891762614250183, "learning_rate": 0.00015483870967741937, "loss": 0.1444, "step": 365 }, { "epoch": 0.02366060606060606, "grad_norm": 0.13558053970336914, "learning_rate": 0.00015526881720430107, "loss": 0.1034, "step": 366 }, { "epoch": 0.023725252525252524, "grad_norm": 0.1764685958623886, "learning_rate": 0.00015569892473118278, "loss": 0.1052, "step": 367 }, { "epoch": 0.02378989898989899, "grad_norm": 0.1592056155204773, "learning_rate": 0.00015612903225806451, "loss": 0.1225, "step": 368 }, { "epoch": 0.02378989898989899, "eval_bleu": 9.103427970211893, "eval_loss": 0.10848551988601685, "eval_runtime": 2.7711, "eval_samples_per_second": 11.548, "eval_steps_per_second": 1.443, "step": 368 }, { "epoch": 0.023854545454545455, "grad_norm": 0.13903047144412994, "learning_rate": 0.00015655913978494625, "loss": 0.1061, "step": 369 }, { "epoch": 0.02391919191919192, "grad_norm": 0.18397219479084015, "learning_rate": 0.00015698924731182796, "loss": 0.132, "step": 370 }, { "epoch": 0.023983838383838385, "grad_norm": 0.14793841540813446, "learning_rate": 0.0001574193548387097, "loss": 0.1124, "step": 371 }, { "epoch": 0.024048484848484847, "grad_norm": 0.15606354176998138, "learning_rate": 0.00015784946236559142, "loss": 0.1161, "step": 372 }, { "epoch": 0.024113131313131312, "grad_norm": 
0.15853872895240784, "learning_rate": 0.00015827956989247313, "loss": 0.1166, "step": 373 }, { "epoch": 0.024177777777777777, "grad_norm": 0.15579386055469513, "learning_rate": 0.00015870967741935487, "loss": 0.1163, "step": 374 }, { "epoch": 0.024242424242424242, "grad_norm": 0.17541849613189697, "learning_rate": 0.00015913978494623657, "loss": 0.1143, "step": 375 }, { "epoch": 0.024307070707070708, "grad_norm": 0.22376100718975067, "learning_rate": 0.00015956989247311828, "loss": 0.1227, "step": 376 }, { "epoch": 0.024371717171717173, "grad_norm": 0.16659735143184662, "learning_rate": 0.00016, "loss": 0.1174, "step": 377 }, { "epoch": 0.024436363636363638, "grad_norm": 0.14780963957309723, "learning_rate": 0.00016043010752688172, "loss": 0.1116, "step": 378 }, { "epoch": 0.0245010101010101, "grad_norm": 0.16586990654468536, "learning_rate": 0.00016086021505376345, "loss": 0.1279, "step": 379 }, { "epoch": 0.024565656565656565, "grad_norm": 0.13963653147220612, "learning_rate": 0.00016129032258064516, "loss": 0.1099, "step": 380 }, { "epoch": 0.02463030303030303, "grad_norm": 0.16145245730876923, "learning_rate": 0.0001617204301075269, "loss": 0.1121, "step": 381 }, { "epoch": 0.024694949494949495, "grad_norm": 0.13311390578746796, "learning_rate": 0.00016215053763440863, "loss": 0.106, "step": 382 }, { "epoch": 0.02475959595959596, "grad_norm": 0.16619884967803955, "learning_rate": 0.00016258064516129034, "loss": 0.1158, "step": 383 }, { "epoch": 0.024824242424242426, "grad_norm": 0.15011127293109894, "learning_rate": 0.00016301075268817204, "loss": 0.1119, "step": 384 }, { "epoch": 0.024824242424242426, "eval_bleu": 10.882295300737969, "eval_loss": 0.10799592733383179, "eval_runtime": 2.601, "eval_samples_per_second": 12.303, "eval_steps_per_second": 1.538, "step": 384 }, { "epoch": 0.024888888888888887, "grad_norm": 0.11864225566387177, "learning_rate": 0.00016344086021505378, "loss": 0.0956, "step": 385 }, { "epoch": 0.024953535353535353, "grad_norm": 
0.13069860637187958, "learning_rate": 0.00016387096774193548, "loss": 0.0954, "step": 386 }, { "epoch": 0.025018181818181818, "grad_norm": 0.13603512942790985, "learning_rate": 0.00016430107526881722, "loss": 0.1018, "step": 387 }, { "epoch": 0.025082828282828283, "grad_norm": 0.1649290770292282, "learning_rate": 0.00016473118279569893, "loss": 0.1242, "step": 388 }, { "epoch": 0.02514747474747475, "grad_norm": 0.16284801065921783, "learning_rate": 0.00016516129032258066, "loss": 0.1085, "step": 389 }, { "epoch": 0.025212121212121213, "grad_norm": 0.13699135184288025, "learning_rate": 0.0001655913978494624, "loss": 0.1021, "step": 390 }, { "epoch": 0.025276767676767675, "grad_norm": 0.13586992025375366, "learning_rate": 0.0001660215053763441, "loss": 0.0973, "step": 391 }, { "epoch": 0.02534141414141414, "grad_norm": 0.1439896821975708, "learning_rate": 0.0001664516129032258, "loss": 0.1043, "step": 392 }, { "epoch": 0.025406060606060606, "grad_norm": 0.18616266548633575, "learning_rate": 0.00016688172043010751, "loss": 0.1456, "step": 393 }, { "epoch": 0.02547070707070707, "grad_norm": 0.28124067187309265, "learning_rate": 0.00016731182795698925, "loss": 0.1111, "step": 394 }, { "epoch": 0.025535353535353536, "grad_norm": 0.14582084119319916, "learning_rate": 0.00016774193548387098, "loss": 0.1121, "step": 395 }, { "epoch": 0.0256, "grad_norm": 0.14482928812503815, "learning_rate": 0.0001681720430107527, "loss": 0.114, "step": 396 }, { "epoch": 0.025664646464646463, "grad_norm": 0.13774289190769196, "learning_rate": 0.00016860215053763442, "loss": 0.1123, "step": 397 }, { "epoch": 0.025729292929292928, "grad_norm": 0.14453327655792236, "learning_rate": 0.00016903225806451616, "loss": 0.1207, "step": 398 }, { "epoch": 0.025793939393939393, "grad_norm": 0.12729491293430328, "learning_rate": 0.00016946236559139786, "loss": 0.0952, "step": 399 }, { "epoch": 0.02585858585858586, "grad_norm": 0.14126811921596527, "learning_rate": 0.00016989247311827957, "loss": 0.12, 
"step": 400 }, { "epoch": 0.02585858585858586, "eval_bleu": 8.985186608284193, "eval_loss": 0.10655423253774643, "eval_runtime": 2.6462, "eval_samples_per_second": 12.093, "eval_steps_per_second": 1.512, "step": 400 }, { "epoch": 0.025923232323232324, "grad_norm": 0.143293097615242, "learning_rate": 0.00017032258064516128, "loss": 0.13, "step": 401 }, { "epoch": 0.02598787878787879, "grad_norm": 0.14245979487895966, "learning_rate": 0.000170752688172043, "loss": 0.1118, "step": 402 }, { "epoch": 0.026052525252525254, "grad_norm": 0.1558937132358551, "learning_rate": 0.00017118279569892475, "loss": 0.1152, "step": 403 }, { "epoch": 0.026117171717171716, "grad_norm": 0.1501445472240448, "learning_rate": 0.00017161290322580645, "loss": 0.1156, "step": 404 }, { "epoch": 0.02618181818181818, "grad_norm": 0.1374216228723526, "learning_rate": 0.0001720430107526882, "loss": 0.1002, "step": 405 }, { "epoch": 0.026246464646464646, "grad_norm": 0.1564859300851822, "learning_rate": 0.0001724731182795699, "loss": 0.1215, "step": 406 }, { "epoch": 0.02631111111111111, "grad_norm": 0.16713081300258636, "learning_rate": 0.00017290322580645163, "loss": 0.1305, "step": 407 }, { "epoch": 0.026375757575757577, "grad_norm": 0.13711397349834442, "learning_rate": 0.00017333333333333334, "loss": 0.098, "step": 408 }, { "epoch": 0.026440404040404042, "grad_norm": 0.13817580044269562, "learning_rate": 0.00017376344086021504, "loss": 0.0974, "step": 409 }, { "epoch": 0.026505050505050504, "grad_norm": 0.1366218775510788, "learning_rate": 0.00017419354838709678, "loss": 0.1044, "step": 410 }, { "epoch": 0.02656969696969697, "grad_norm": 0.19042985141277313, "learning_rate": 0.0001746236559139785, "loss": 0.1298, "step": 411 }, { "epoch": 0.026634343434343434, "grad_norm": 0.14615961909294128, "learning_rate": 0.00017505376344086022, "loss": 0.0993, "step": 412 }, { "epoch": 0.0266989898989899, "grad_norm": 0.15030674636363983, "learning_rate": 0.00017548387096774195, "loss": 0.1209, "step": 
413 }, { "epoch": 0.026763636363636364, "grad_norm": 0.147713765501976, "learning_rate": 0.00017591397849462366, "loss": 0.1207, "step": 414 }, { "epoch": 0.02682828282828283, "grad_norm": 0.15125833451747894, "learning_rate": 0.0001763440860215054, "loss": 0.1223, "step": 415 }, { "epoch": 0.02689292929292929, "grad_norm": 0.12598833441734314, "learning_rate": 0.0001767741935483871, "loss": 0.0899, "step": 416 }, { "epoch": 0.02689292929292929, "eval_bleu": 10.147141304417094, "eval_loss": 0.10775532573461533, "eval_runtime": 2.6245, "eval_samples_per_second": 12.193, "eval_steps_per_second": 1.524, "step": 416 }, { "epoch": 0.026957575757575757, "grad_norm": 0.16561436653137207, "learning_rate": 0.0001772043010752688, "loss": 0.1182, "step": 417 }, { "epoch": 0.027022222222222222, "grad_norm": 0.12544482946395874, "learning_rate": 0.00017763440860215054, "loss": 0.0974, "step": 418 }, { "epoch": 0.027086868686868687, "grad_norm": 0.1391218900680542, "learning_rate": 0.00017806451612903228, "loss": 0.103, "step": 419 }, { "epoch": 0.027151515151515152, "grad_norm": 0.15751127898693085, "learning_rate": 0.00017849462365591398, "loss": 0.1109, "step": 420 }, { "epoch": 0.027216161616161617, "grad_norm": 0.13848909735679626, "learning_rate": 0.00017892473118279572, "loss": 0.1024, "step": 421 }, { "epoch": 0.02728080808080808, "grad_norm": 0.15021491050720215, "learning_rate": 0.00017935483870967742, "loss": 0.1216, "step": 422 }, { "epoch": 0.027345454545454544, "grad_norm": 0.13931679725646973, "learning_rate": 0.00017978494623655916, "loss": 0.1174, "step": 423 }, { "epoch": 0.02741010101010101, "grad_norm": 0.16443245112895966, "learning_rate": 0.00018021505376344086, "loss": 0.1227, "step": 424 }, { "epoch": 0.027474747474747475, "grad_norm": 0.13369695842266083, "learning_rate": 0.00018064516129032257, "loss": 0.0986, "step": 425 }, { "epoch": 0.02753939393939394, "grad_norm": 0.16605685651302338, "learning_rate": 0.0001810752688172043, "loss": 0.1275, "step": 
426 }, { "epoch": 0.027604040404040405, "grad_norm": 0.1480400115251541, "learning_rate": 0.000181505376344086, "loss": 0.1105, "step": 427 }, { "epoch": 0.02766868686868687, "grad_norm": 0.17037196457386017, "learning_rate": 0.00018193548387096775, "loss": 0.1484, "step": 428 }, { "epoch": 0.027733333333333332, "grad_norm": 0.16051368415355682, "learning_rate": 0.00018236559139784948, "loss": 0.1316, "step": 429 }, { "epoch": 0.027797979797979797, "grad_norm": 0.16002199053764343, "learning_rate": 0.0001827956989247312, "loss": 0.1267, "step": 430 }, { "epoch": 0.027862626262626262, "grad_norm": 0.11067473888397217, "learning_rate": 0.00018322580645161292, "loss": 0.0765, "step": 431 }, { "epoch": 0.027927272727272728, "grad_norm": 0.14831194281578064, "learning_rate": 0.00018365591397849463, "loss": 0.129, "step": 432 }, { "epoch": 0.027927272727272728, "eval_bleu": 11.054277549790548, "eval_loss": 0.10615125298500061, "eval_runtime": 2.6125, "eval_samples_per_second": 12.249, "eval_steps_per_second": 1.531, "step": 432 }, { "epoch": 0.027991919191919193, "grad_norm": 0.14561083912849426, "learning_rate": 0.00018408602150537634, "loss": 0.0983, "step": 433 }, { "epoch": 0.028056565656565658, "grad_norm": 0.12259099632501602, "learning_rate": 0.00018451612903225807, "loss": 0.0862, "step": 434 }, { "epoch": 0.02812121212121212, "grad_norm": 0.1275368481874466, "learning_rate": 0.00018494623655913978, "loss": 0.0992, "step": 435 }, { "epoch": 0.028185858585858585, "grad_norm": 0.14508667588233948, "learning_rate": 0.0001853763440860215, "loss": 0.1126, "step": 436 }, { "epoch": 0.02825050505050505, "grad_norm": 0.14779242873191833, "learning_rate": 0.00018580645161290325, "loss": 0.1104, "step": 437 }, { "epoch": 0.028315151515151515, "grad_norm": 0.14809027314186096, "learning_rate": 0.00018623655913978495, "loss": 0.1158, "step": 438 }, { "epoch": 0.02837979797979798, "grad_norm": 0.18058310449123383, "learning_rate": 0.0001866666666666667, "loss": 0.1252, 
"step": 439 }, { "epoch": 0.028444444444444446, "grad_norm": 0.1464747041463852, "learning_rate": 0.0001870967741935484, "loss": 0.122, "step": 440 }, { "epoch": 0.028509090909090908, "grad_norm": 0.12998485565185547, "learning_rate": 0.00018752688172043013, "loss": 0.1166, "step": 441 }, { "epoch": 0.028573737373737373, "grad_norm": 0.12510083615779877, "learning_rate": 0.00018795698924731183, "loss": 0.1, "step": 442 }, { "epoch": 0.028638383838383838, "grad_norm": 0.14600686728954315, "learning_rate": 0.00018838709677419354, "loss": 0.119, "step": 443 }, { "epoch": 0.028703030303030303, "grad_norm": 0.16463568806648254, "learning_rate": 0.00018881720430107528, "loss": 0.1468, "step": 444 }, { "epoch": 0.02876767676767677, "grad_norm": 0.14338329434394836, "learning_rate": 0.000189247311827957, "loss": 0.11, "step": 445 }, { "epoch": 0.028832323232323234, "grad_norm": 0.12141603231430054, "learning_rate": 0.00018967741935483872, "loss": 0.0941, "step": 446 }, { "epoch": 0.028896969696969695, "grad_norm": 0.14474593102931976, "learning_rate": 0.00019010752688172045, "loss": 0.1278, "step": 447 }, { "epoch": 0.02896161616161616, "grad_norm": 0.13601356744766235, "learning_rate": 0.00019053763440860216, "loss": 0.1015, "step": 448 }, { "epoch": 0.02896161616161616, "eval_bleu": 10.693190110406718, "eval_loss": 0.10794844478368759, "eval_runtime": 2.5169, "eval_samples_per_second": 12.714, "eval_steps_per_second": 1.589, "step": 448 }, { "epoch": 0.029026262626262626, "grad_norm": 0.12670452892780304, "learning_rate": 0.0001909677419354839, "loss": 0.1091, "step": 449 }, { "epoch": 0.02909090909090909, "grad_norm": 0.16578026115894318, "learning_rate": 0.0001913978494623656, "loss": 0.118, "step": 450 }, { "epoch": 0.029155555555555556, "grad_norm": 0.15042001008987427, "learning_rate": 0.0001918279569892473, "loss": 0.1228, "step": 451 }, { "epoch": 0.02922020202020202, "grad_norm": 0.14987945556640625, "learning_rate": 0.00019225806451612904, "loss": 0.1228, 
"step": 452 }, { "epoch": 0.029284848484848486, "grad_norm": 0.1526365578174591, "learning_rate": 0.00019268817204301077, "loss": 0.1284, "step": 453 }, { "epoch": 0.029349494949494948, "grad_norm": 0.1318255364894867, "learning_rate": 0.00019311827956989248, "loss": 0.1039, "step": 454 }, { "epoch": 0.029414141414141413, "grad_norm": 0.1383906453847885, "learning_rate": 0.00019354838709677422, "loss": 0.1212, "step": 455 }, { "epoch": 0.02947878787878788, "grad_norm": 0.1535595953464508, "learning_rate": 0.00019397849462365592, "loss": 0.1155, "step": 456 }, { "epoch": 0.029543434343434344, "grad_norm": 0.1335660219192505, "learning_rate": 0.00019440860215053766, "loss": 0.1001, "step": 457 }, { "epoch": 0.02960808080808081, "grad_norm": 0.2634216248989105, "learning_rate": 0.00019483870967741936, "loss": 0.1192, "step": 458 }, { "epoch": 0.029672727272727274, "grad_norm": 0.13985425233840942, "learning_rate": 0.00019526881720430107, "loss": 0.1214, "step": 459 }, { "epoch": 0.029737373737373736, "grad_norm": 0.14772897958755493, "learning_rate": 0.0001956989247311828, "loss": 0.1125, "step": 460 }, { "epoch": 0.0298020202020202, "grad_norm": 0.1297396719455719, "learning_rate": 0.0001961290322580645, "loss": 0.0984, "step": 461 }, { "epoch": 0.029866666666666666, "grad_norm": 0.1519833654165268, "learning_rate": 0.00019655913978494625, "loss": 0.123, "step": 462 }, { "epoch": 0.02993131313131313, "grad_norm": 0.13482460379600525, "learning_rate": 0.00019698924731182798, "loss": 0.1069, "step": 463 }, { "epoch": 0.029995959595959597, "grad_norm": 0.13658182322978973, "learning_rate": 0.00019741935483870969, "loss": 0.1177, "step": 464 }, { "epoch": 0.029995959595959597, "eval_bleu": 12.572843086082546, "eval_loss": 0.10561101138591766, "eval_runtime": 2.7021, "eval_samples_per_second": 11.843, "eval_steps_per_second": 1.48, "step": 464 }, { "epoch": 0.030060606060606062, "grad_norm": 0.12853117287158966, "learning_rate": 0.00019784946236559142, "loss": 0.1016, 
"step": 465 }, { "epoch": 0.030125252525252524, "grad_norm": 0.14732927083969116, "learning_rate": 0.00019827956989247313, "loss": 0.128, "step": 466 }, { "epoch": 0.03018989898989899, "grad_norm": 0.1329525113105774, "learning_rate": 0.00019870967741935483, "loss": 0.1102, "step": 467 }, { "epoch": 0.030254545454545454, "grad_norm": 0.14983846247196198, "learning_rate": 0.00019913978494623657, "loss": 0.1311, "step": 468 }, { "epoch": 0.03031919191919192, "grad_norm": 0.12656426429748535, "learning_rate": 0.00019956989247311828, "loss": 0.1038, "step": 469 }, { "epoch": 0.030383838383838385, "grad_norm": 0.14365172386169434, "learning_rate": 0.0002, "loss": 0.1147, "step": 470 }, { "epoch": 0.03044848484848485, "grad_norm": 0.13733601570129395, "learning_rate": 0.00019999999976616652, "loss": 0.1123, "step": 471 }, { "epoch": 0.03051313131313131, "grad_norm": 0.1368652582168579, "learning_rate": 0.00019999999906466614, "loss": 0.1094, "step": 472 }, { "epoch": 0.030577777777777777, "grad_norm": 0.15315213799476624, "learning_rate": 0.00019999999789549876, "loss": 0.1239, "step": 473 }, { "epoch": 0.030642424242424242, "grad_norm": 0.14755412936210632, "learning_rate": 0.0001999999962586645, "loss": 0.115, "step": 474 }, { "epoch": 0.030707070707070707, "grad_norm": 0.14244329929351807, "learning_rate": 0.0001999999941541633, "loss": 0.1069, "step": 475 }, { "epoch": 0.030771717171717172, "grad_norm": 0.1433442085981369, "learning_rate": 0.0001999999915819952, "loss": 0.1073, "step": 476 }, { "epoch": 0.030836363636363637, "grad_norm": 0.14364516735076904, "learning_rate": 0.00019999998854216018, "loss": 0.1131, "step": 477 }, { "epoch": 0.030901010101010103, "grad_norm": 0.15006142854690552, "learning_rate": 0.0001999999850346583, "loss": 0.12, "step": 478 }, { "epoch": 0.030965656565656564, "grad_norm": 0.12459193915128708, "learning_rate": 0.00019999998105948953, "loss": 0.1, "step": 479 }, { "epoch": 0.03103030303030303, "grad_norm": 0.14505188167095184, 
"learning_rate": 0.0001999999766166539, "loss": 0.1126, "step": 480 }, { "epoch": 0.03103030303030303, "eval_bleu": 12.168736331249336, "eval_loss": 0.10556380450725555, "eval_runtime": 2.52, "eval_samples_per_second": 12.698, "eval_steps_per_second": 1.587, "step": 480 }, { "epoch": 0.031094949494949495, "grad_norm": 0.15447013080120087, "learning_rate": 0.0001999999717061514, "loss": 0.1132, "step": 481 }, { "epoch": 0.03115959595959596, "grad_norm": 0.11705537140369415, "learning_rate": 0.00019999996632798217, "loss": 0.0976, "step": 482 }, { "epoch": 0.031224242424242425, "grad_norm": 0.15111412107944489, "learning_rate": 0.00019999996048214612, "loss": 0.1139, "step": 483 }, { "epoch": 0.03128888888888889, "grad_norm": 0.14592544734477997, "learning_rate": 0.0001999999541686433, "loss": 0.1221, "step": 484 }, { "epoch": 0.031353535353535356, "grad_norm": 0.18990279734134674, "learning_rate": 0.00019999994738747378, "loss": 0.1517, "step": 485 }, { "epoch": 0.03141818181818182, "grad_norm": 0.14252693951129913, "learning_rate": 0.00019999994013863756, "loss": 0.1186, "step": 486 }, { "epoch": 0.031482828282828286, "grad_norm": 0.13235680758953094, "learning_rate": 0.00019999993242213467, "loss": 0.1163, "step": 487 }, { "epoch": 0.03154747474747475, "grad_norm": 0.21530663967132568, "learning_rate": 0.00019999992423796515, "loss": 0.1037, "step": 488 }, { "epoch": 0.03161212121212121, "grad_norm": 0.12231055647134781, "learning_rate": 0.00019999991558612904, "loss": 0.0925, "step": 489 }, { "epoch": 0.03167676767676768, "grad_norm": 0.146692156791687, "learning_rate": 0.00019999990646662642, "loss": 0.1056, "step": 490 }, { "epoch": 0.03174141414141414, "grad_norm": 0.16732384264469147, "learning_rate": 0.00019999989687945728, "loss": 0.1051, "step": 491 }, { "epoch": 0.03180606060606061, "grad_norm": 0.14350025355815887, "learning_rate": 0.00019999988682462168, "loss": 0.1193, "step": 492 }, { "epoch": 0.03187070707070707, "grad_norm": 0.21280625462532043, 
"learning_rate": 0.00019999987630211967, "loss": 0.1272, "step": 493 }, { "epoch": 0.03193535353535353, "grad_norm": 0.23272041976451874, "learning_rate": 0.0001999998653119513, "loss": 0.0916, "step": 494 }, { "epoch": 0.032, "grad_norm": 0.1862674206495285, "learning_rate": 0.0001999998538541166, "loss": 0.125, "step": 495 }, { "epoch": 0.03206464646464646, "grad_norm": 0.200235515832901, "learning_rate": 0.00019999984192861566, "loss": 0.1175, "step": 496 }, { "epoch": 0.03206464646464646, "eval_bleu": 11.162771160551632, "eval_loss": 0.1055934727191925, "eval_runtime": 2.5819, "eval_samples_per_second": 12.394, "eval_steps_per_second": 1.549, "step": 496 }, { "epoch": 0.03212929292929293, "grad_norm": 0.1706344038248062, "learning_rate": 0.00019999982953544852, "loss": 0.1156, "step": 497 }, { "epoch": 0.03219393939393939, "grad_norm": 0.1841149479150772, "learning_rate": 0.00019999981667461522, "loss": 0.1265, "step": 498 }, { "epoch": 0.03225858585858586, "grad_norm": 0.14012755453586578, "learning_rate": 0.00019999980334611586, "loss": 0.1173, "step": 499 }, { "epoch": 0.03232323232323232, "grad_norm": 0.1379678100347519, "learning_rate": 0.00019999978954995045, "loss": 0.1065, "step": 500 }, { "epoch": 0.032387878787878785, "grad_norm": 0.14016778767108917, "learning_rate": 0.0001999997752861191, "loss": 0.1152, "step": 501 }, { "epoch": 0.032452525252525254, "grad_norm": 0.13358506560325623, "learning_rate": 0.00019999976055462185, "loss": 0.1135, "step": 502 }, { "epoch": 0.032517171717171715, "grad_norm": 0.14606399834156036, "learning_rate": 0.0001999997453554588, "loss": 0.1066, "step": 503 }, { "epoch": 0.032581818181818184, "grad_norm": 0.14392736554145813, "learning_rate": 0.00019999972968863, "loss": 0.1052, "step": 504 }, { "epoch": 0.032646464646464646, "grad_norm": 0.14780963957309723, "learning_rate": 0.0001999997135541355, "loss": 0.1095, "step": 505 }, { "epoch": 0.032711111111111114, "grad_norm": 0.12461218237876892, "learning_rate": 
0.00019999969695197543, "loss": 0.1026, "step": 506 }, { "epoch": 0.032775757575757576, "grad_norm": 0.1564641147851944, "learning_rate": 0.0001999996798821498, "loss": 0.1276, "step": 507 }, { "epoch": 0.03284040404040404, "grad_norm": 0.1745409071445465, "learning_rate": 0.00019999966234465877, "loss": 0.1014, "step": 508 }, { "epoch": 0.03290505050505051, "grad_norm": 0.1282573640346527, "learning_rate": 0.00019999964433950235, "loss": 0.1099, "step": 509 }, { "epoch": 0.03296969696969697, "grad_norm": 0.12653900682926178, "learning_rate": 0.00019999962586668063, "loss": 0.106, "step": 510 }, { "epoch": 0.03303434343434344, "grad_norm": 0.13360770046710968, "learning_rate": 0.00019999960692619376, "loss": 0.1171, "step": 511 }, { "epoch": 0.0330989898989899, "grad_norm": 0.1208031103014946, "learning_rate": 0.00019999958751804178, "loss": 0.1115, "step": 512 }, { "epoch": 0.0330989898989899, "eval_bleu": 9.488414092761792, "eval_loss": 0.10857859253883362, "eval_runtime": 2.6232, "eval_samples_per_second": 12.199, "eval_steps_per_second": 1.525, "step": 512 }, { "epoch": 0.03316363636363636, "grad_norm": 0.13872727751731873, "learning_rate": 0.00019999956764222478, "loss": 0.1228, "step": 513 }, { "epoch": 0.03322828282828283, "grad_norm": 0.25268110632896423, "learning_rate": 0.00019999954729874286, "loss": 0.1478, "step": 514 }, { "epoch": 0.03329292929292929, "grad_norm": 0.13848553597927094, "learning_rate": 0.0001999995264875961, "loss": 0.1286, "step": 515 }, { "epoch": 0.03335757575757576, "grad_norm": 0.13639119267463684, "learning_rate": 0.00019999950520878463, "loss": 0.1246, "step": 516 }, { "epoch": 0.03342222222222222, "grad_norm": 0.12548309564590454, "learning_rate": 0.00019999948346230854, "loss": 0.1087, "step": 517 }, { "epoch": 0.03348686868686869, "grad_norm": 0.13334284722805023, "learning_rate": 0.00019999946124816794, "loss": 0.1077, "step": 518 }, { "epoch": 0.03355151515151515, "grad_norm": 0.12922510504722595, "learning_rate": 
0.0001999994385663629, "loss": 0.1034, "step": 519 }, { "epoch": 0.03361616161616161, "grad_norm": 0.15493009984493256, "learning_rate": 0.00019999941541689356, "loss": 0.1416, "step": 520 }, { "epoch": 0.03368080808080808, "grad_norm": 0.1641789972782135, "learning_rate": 0.00019999939179975997, "loss": 0.136, "step": 521 }, { "epoch": 0.033745454545454544, "grad_norm": 0.11691408604383469, "learning_rate": 0.00019999936771496231, "loss": 0.0844, "step": 522 }, { "epoch": 0.03381010101010101, "grad_norm": 0.13989783823490143, "learning_rate": 0.0001999993431625007, "loss": 0.1195, "step": 523 }, { "epoch": 0.033874747474747474, "grad_norm": 0.13525332510471344, "learning_rate": 0.00019999931814237515, "loss": 0.113, "step": 524 }, { "epoch": 0.03393939393939394, "grad_norm": 0.13060380518436432, "learning_rate": 0.0001999992926545859, "loss": 0.1094, "step": 525 }, { "epoch": 0.034004040404040405, "grad_norm": 0.14437176287174225, "learning_rate": 0.00019999926669913301, "loss": 0.1232, "step": 526 }, { "epoch": 0.034068686868686866, "grad_norm": 0.13507899641990662, "learning_rate": 0.0001999992402760166, "loss": 0.1078, "step": 527 }, { "epoch": 0.034133333333333335, "grad_norm": 0.13182367384433746, "learning_rate": 0.00019999921338523683, "loss": 0.1093, "step": 528 }, { "epoch": 0.034133333333333335, "eval_bleu": 10.682708795923162, "eval_loss": 0.10908666253089905, "eval_runtime": 2.9928, "eval_samples_per_second": 10.692, "eval_steps_per_second": 1.337, "step": 528 }, { "epoch": 0.0341979797979798, "grad_norm": 0.12510529160499573, "learning_rate": 0.00019999918602679376, "loss": 0.1053, "step": 529 }, { "epoch": 0.034262626262626265, "grad_norm": 0.12579026818275452, "learning_rate": 0.00019999915820068757, "loss": 0.1062, "step": 530 }, { "epoch": 0.03432727272727273, "grad_norm": 0.13125565648078918, "learning_rate": 0.0001999991299069184, "loss": 0.1243, "step": 531 }, { "epoch": 0.03439191919191919, "grad_norm": 0.13875770568847656, "learning_rate": 
0.0001999991011454863, "loss": 0.1348, "step": 532 }, { "epoch": 0.03445656565656566, "grad_norm": 0.14722971618175507, "learning_rate": 0.0001999990719163915, "loss": 0.1129, "step": 533 }, { "epoch": 0.03452121212121212, "grad_norm": 0.1381601244211197, "learning_rate": 0.00019999904221963411, "loss": 0.1251, "step": 534 }, { "epoch": 0.03458585858585859, "grad_norm": 0.13787423074245453, "learning_rate": 0.00019999901205521424, "loss": 0.1298, "step": 535 }, { "epoch": 0.03465050505050505, "grad_norm": 0.12954244017601013, "learning_rate": 0.00019999898142313206, "loss": 0.1048, "step": 536 }, { "epoch": 0.03471515151515152, "grad_norm": 0.14057686924934387, "learning_rate": 0.0001999989503233877, "loss": 0.117, "step": 537 }, { "epoch": 0.03477979797979798, "grad_norm": 0.12244465202093124, "learning_rate": 0.0001999989187559813, "loss": 0.1087, "step": 538 }, { "epoch": 0.03484444444444444, "grad_norm": 0.13722139596939087, "learning_rate": 0.00019999888672091304, "loss": 0.1226, "step": 539 }, { "epoch": 0.03490909090909091, "grad_norm": 0.13450995087623596, "learning_rate": 0.00019999885421818304, "loss": 0.1092, "step": 540 }, { "epoch": 0.03497373737373737, "grad_norm": 0.11251247674226761, "learning_rate": 0.0001999988212477914, "loss": 0.1046, "step": 541 }, { "epoch": 0.03503838383838384, "grad_norm": 0.11316199600696564, "learning_rate": 0.0001999987878097384, "loss": 0.1031, "step": 542 }, { "epoch": 0.0351030303030303, "grad_norm": 0.12963828444480896, "learning_rate": 0.0001999987539040241, "loss": 0.1065, "step": 543 }, { "epoch": 0.035167676767676764, "grad_norm": 0.13680677115917206, "learning_rate": 0.0001999987195306487, "loss": 0.1242, "step": 544 }, { "epoch": 0.035167676767676764, "eval_bleu": 9.798230940457769, "eval_loss": 0.10932165384292603, "eval_runtime": 2.6972, "eval_samples_per_second": 11.864, "eval_steps_per_second": 1.483, "step": 544 }, { "epoch": 0.03523232323232323, "grad_norm": 0.14490805566310883, "learning_rate": 
0.00019999868468961233, "loss": 0.1029, "step": 545 }, { "epoch": 0.035296969696969695, "grad_norm": 0.36747074127197266, "learning_rate": 0.0001999986493809152, "loss": 0.1511, "step": 546 }, { "epoch": 0.03536161616161616, "grad_norm": 0.11181472986936569, "learning_rate": 0.00019999861360455741, "loss": 0.0992, "step": 547 }, { "epoch": 0.035426262626262625, "grad_norm": 0.11787353456020355, "learning_rate": 0.00019999857736053918, "loss": 0.1168, "step": 548 }, { "epoch": 0.035490909090909094, "grad_norm": 0.11531051248311996, "learning_rate": 0.00019999854064886067, "loss": 0.1054, "step": 549 }, { "epoch": 0.035555555555555556, "grad_norm": 0.10686899721622467, "learning_rate": 0.00019999850346952205, "loss": 0.0853, "step": 550 }, { "epoch": 0.03562020202020202, "grad_norm": 0.12576760351657867, "learning_rate": 0.0001999984658225235, "loss": 0.1012, "step": 551 }, { "epoch": 0.035684848484848486, "grad_norm": 0.12727631628513336, "learning_rate": 0.00019999842770786512, "loss": 0.1039, "step": 552 }, { "epoch": 0.03574949494949495, "grad_norm": 0.150522381067276, "learning_rate": 0.0001999983891255472, "loss": 0.1128, "step": 553 }, { "epoch": 0.035814141414141416, "grad_norm": 0.11746193468570709, "learning_rate": 0.00019999835007556986, "loss": 0.0902, "step": 554 }, { "epoch": 0.03587878787878788, "grad_norm": 0.142499178647995, "learning_rate": 0.00019999831055793332, "loss": 0.1066, "step": 555 }, { "epoch": 0.03594343434343435, "grad_norm": 0.1304892897605896, "learning_rate": 0.0001999982705726377, "loss": 0.1136, "step": 556 }, { "epoch": 0.03600808080808081, "grad_norm": 0.13161268830299377, "learning_rate": 0.00019999823011968327, "loss": 0.1054, "step": 557 }, { "epoch": 0.03607272727272727, "grad_norm": 0.13755886256694794, "learning_rate": 0.00019999818919907015, "loss": 0.1147, "step": 558 }, { "epoch": 0.03613737373737374, "grad_norm": 0.11605346202850342, "learning_rate": 0.00019999814781079857, "loss": 0.0991, "step": 559 }, { "epoch": 
0.0362020202020202, "grad_norm": 0.18530824780464172, "learning_rate": 0.0001999981059548687, "loss": 0.1162, "step": 560 }, { "epoch": 0.0362020202020202, "eval_bleu": 11.287920881552383, "eval_loss": 0.10759274661540985, "eval_runtime": 2.7656, "eval_samples_per_second": 11.571, "eval_steps_per_second": 1.446, "step": 560 }, { "epoch": 0.03626666666666667, "grad_norm": 0.1463087946176529, "learning_rate": 0.00019999806363128075, "loss": 0.1459, "step": 561 }, { "epoch": 0.03633131313131313, "grad_norm": 0.12586872279644012, "learning_rate": 0.00019999802084003492, "loss": 0.1187, "step": 562 }, { "epoch": 0.03639595959595959, "grad_norm": 0.15147972106933594, "learning_rate": 0.0001999979775811314, "loss": 0.1236, "step": 563 }, { "epoch": 0.03646060606060606, "grad_norm": 0.11987727135419846, "learning_rate": 0.0001999979338545704, "loss": 0.1016, "step": 564 }, { "epoch": 0.03652525252525252, "grad_norm": 0.12765085697174072, "learning_rate": 0.00019999788966035213, "loss": 0.105, "step": 565 }, { "epoch": 0.03658989898989899, "grad_norm": 0.131104975938797, "learning_rate": 0.00019999784499847678, "loss": 0.1074, "step": 566 }, { "epoch": 0.036654545454545454, "grad_norm": 0.12428110837936401, "learning_rate": 0.00019999779986894456, "loss": 0.1101, "step": 567 }, { "epoch": 0.03671919191919192, "grad_norm": 0.13196514546871185, "learning_rate": 0.00019999775427175572, "loss": 0.1157, "step": 568 }, { "epoch": 0.036783838383838384, "grad_norm": 0.11181005835533142, "learning_rate": 0.0001999977082069104, "loss": 0.0908, "step": 569 }, { "epoch": 0.036848484848484846, "grad_norm": 0.1224859431385994, "learning_rate": 0.00019999766167440886, "loss": 0.1105, "step": 570 }, { "epoch": 0.036913131313131314, "grad_norm": 0.09960032999515533, "learning_rate": 0.00019999761467425135, "loss": 0.0892, "step": 571 }, { "epoch": 0.036977777777777776, "grad_norm": 0.12447663396596909, "learning_rate": 0.00019999756720643803, "loss": 0.1115, "step": 572 }, { "epoch": 
0.037042424242424245, "grad_norm": 0.12504985928535461, "learning_rate": 0.00019999751927096915, "loss": 0.1118, "step": 573 }, { "epoch": 0.037107070707070706, "grad_norm": 0.11158134788274765, "learning_rate": 0.00019999747086784492, "loss": 0.1056, "step": 574 }, { "epoch": 0.037171717171717175, "grad_norm": 0.11234010756015778, "learning_rate": 0.0001999974219970656, "loss": 0.1031, "step": 575 }, { "epoch": 0.03723636363636364, "grad_norm": 0.12932424247264862, "learning_rate": 0.0001999973726586314, "loss": 0.1098, "step": 576 }, { "epoch": 0.03723636363636364, "eval_bleu": 11.07863404655768, "eval_loss": 0.10550281405448914, "eval_runtime": 2.7115, "eval_samples_per_second": 11.802, "eval_steps_per_second": 1.475, "step": 576 }, { "epoch": 0.0373010101010101, "grad_norm": 0.13537174463272095, "learning_rate": 0.00019999732285254251, "loss": 0.1224, "step": 577 }, { "epoch": 0.03736565656565657, "grad_norm": 0.10807958990335464, "learning_rate": 0.00019999727257879923, "loss": 0.0932, "step": 578 }, { "epoch": 0.03743030303030303, "grad_norm": 0.09941566735506058, "learning_rate": 0.00019999722183740176, "loss": 0.0787, "step": 579 }, { "epoch": 0.0374949494949495, "grad_norm": 0.12469415366649628, "learning_rate": 0.00019999717062835033, "loss": 0.1173, "step": 580 }, { "epoch": 0.03755959595959596, "grad_norm": 0.12502068281173706, "learning_rate": 0.0001999971189516452, "loss": 0.0975, "step": 581 }, { "epoch": 0.03762424242424242, "grad_norm": 0.13000090420246124, "learning_rate": 0.00019999706680728663, "loss": 0.1208, "step": 582 }, { "epoch": 0.03768888888888889, "grad_norm": 0.1340045928955078, "learning_rate": 0.0001999970141952748, "loss": 0.117, "step": 583 }, { "epoch": 0.03775353535353535, "grad_norm": 0.11511174589395523, "learning_rate": 0.00019999696111561, "loss": 0.0847, "step": 584 }, { "epoch": 0.03781818181818182, "grad_norm": 0.12176530063152313, "learning_rate": 0.00019999690756829246, "loss": 0.0956, "step": 585 }, { "epoch": 
0.03788282828282828, "grad_norm": 0.1807343065738678, "learning_rate": 0.00019999685355332248, "loss": 0.1061, "step": 586 }, { "epoch": 0.03794747474747475, "grad_norm": 0.14554469287395477, "learning_rate": 0.00019999679907070023, "loss": 0.1322, "step": 587 }, { "epoch": 0.03801212121212121, "grad_norm": 0.13668417930603027, "learning_rate": 0.00019999674412042603, "loss": 0.1249, "step": 588 }, { "epoch": 0.038076767676767674, "grad_norm": 0.13603554666042328, "learning_rate": 0.0001999966887025001, "loss": 0.1169, "step": 589 }, { "epoch": 0.03814141414141414, "grad_norm": 0.13147543370723724, "learning_rate": 0.00019999663281692275, "loss": 0.1123, "step": 590 }, { "epoch": 0.038206060606060605, "grad_norm": 0.26900359988212585, "learning_rate": 0.0001999965764636942, "loss": 0.1253, "step": 591 }, { "epoch": 0.03827070707070707, "grad_norm": 0.1478128433227539, "learning_rate": 0.0001999965196428147, "loss": 0.1333, "step": 592 }, { "epoch": 0.03827070707070707, "eval_bleu": 14.099159337385808, "eval_loss": 0.10521923750638962, "eval_runtime": 2.7142, "eval_samples_per_second": 11.79, "eval_steps_per_second": 1.474, "step": 592 }, { "epoch": 0.038335353535353535, "grad_norm": 0.1270078718662262, "learning_rate": 0.00019999646235428452, "loss": 0.0976, "step": 593 }, { "epoch": 0.0384, "grad_norm": 0.138322114944458, "learning_rate": 0.000199996404598104, "loss": 0.1121, "step": 594 }, { "epoch": 0.038464646464646465, "grad_norm": 0.12432148307561874, "learning_rate": 0.0001999963463742733, "loss": 0.0971, "step": 595 }, { "epoch": 0.03852929292929293, "grad_norm": 0.12516441941261292, "learning_rate": 0.00019999628768279276, "loss": 0.1074, "step": 596 }, { "epoch": 0.038593939393939396, "grad_norm": 0.13323353230953217, "learning_rate": 0.00019999622852366267, "loss": 0.1144, "step": 597 }, { "epoch": 0.03865858585858586, "grad_norm": 0.11388403177261353, "learning_rate": 0.00019999616889688327, "loss": 0.1017, "step": 598 }, { "epoch": 
0.038723232323232326, "grad_norm": 0.13637560606002808, "learning_rate": 0.0001999961088024548, "loss": 0.1216, "step": 599 }, { "epoch": 0.03878787878787879, "grad_norm": 0.12641260027885437, "learning_rate": 0.00019999604824037762, "loss": 0.088, "step": 600 }, { "epoch": 0.03885252525252525, "grad_norm": 0.10515261441469193, "learning_rate": 0.00019999598721065197, "loss": 0.0901, "step": 601 }, { "epoch": 0.03891717171717172, "grad_norm": 0.1189311146736145, "learning_rate": 0.00019999592571327815, "loss": 0.1001, "step": 602 }, { "epoch": 0.03898181818181818, "grad_norm": 0.12847928702831268, "learning_rate": 0.00019999586374825644, "loss": 0.1073, "step": 603 }, { "epoch": 0.03904646464646465, "grad_norm": 0.13329587876796722, "learning_rate": 0.00019999580131558717, "loss": 0.1066, "step": 604 }, { "epoch": 0.03911111111111111, "grad_norm": 0.1321302056312561, "learning_rate": 0.00019999573841527054, "loss": 0.1219, "step": 605 }, { "epoch": 0.03917575757575758, "grad_norm": 0.15070821344852448, "learning_rate": 0.00019999567504730696, "loss": 0.1117, "step": 606 }, { "epoch": 0.03924040404040404, "grad_norm": 0.13868500292301178, "learning_rate": 0.0001999956112116966, "loss": 0.1273, "step": 607 }, { "epoch": 0.0393050505050505, "grad_norm": 0.13437072932720184, "learning_rate": 0.00019999554690843988, "loss": 0.1284, "step": 608 }, { "epoch": 0.0393050505050505, "eval_bleu": 11.33287803667904, "eval_loss": 0.10529904067516327, "eval_runtime": 2.6624, "eval_samples_per_second": 12.019, "eval_steps_per_second": 1.502, "step": 608 }, { "epoch": 0.03936969696969697, "grad_norm": 0.11254505813121796, "learning_rate": 0.00019999548213753702, "loss": 0.1005, "step": 609 }, { "epoch": 0.03943434343434343, "grad_norm": 0.1814531534910202, "learning_rate": 0.00019999541689898835, "loss": 0.1312, "step": 610 }, { "epoch": 0.0394989898989899, "grad_norm": 0.1422121226787567, "learning_rate": 0.00019999535119279415, "loss": 0.1048, "step": 611 }, { "epoch": 
0.03956363636363636, "grad_norm": 0.1460379958152771, "learning_rate": 0.0001999952850189548, "loss": 0.1276, "step": 612 }, { "epoch": 0.039628282828282825, "grad_norm": 0.13203667104244232, "learning_rate": 0.00019999521837747052, "loss": 0.1288, "step": 613 }, { "epoch": 0.039692929292929294, "grad_norm": 0.1199788972735405, "learning_rate": 0.00019999515126834167, "loss": 0.1122, "step": 614 }, { "epoch": 0.039757575757575755, "grad_norm": 0.10937829315662384, "learning_rate": 0.00019999508369156855, "loss": 0.1001, "step": 615 }, { "epoch": 0.039822222222222224, "grad_norm": 0.14558622241020203, "learning_rate": 0.0001999950156471515, "loss": 0.1505, "step": 616 }, { "epoch": 0.039886868686868686, "grad_norm": 0.11247015744447708, "learning_rate": 0.0001999949471350908, "loss": 0.1009, "step": 617 }, { "epoch": 0.039951515151515155, "grad_norm": 0.13116182386875153, "learning_rate": 0.0001999948781553868, "loss": 0.1002, "step": 618 }, { "epoch": 0.040016161616161616, "grad_norm": 0.11855407804250717, "learning_rate": 0.00019999480870803985, "loss": 0.1069, "step": 619 }, { "epoch": 0.04008080808080808, "grad_norm": 0.11876469105482101, "learning_rate": 0.00019999473879305017, "loss": 0.1206, "step": 620 }, { "epoch": 0.04014545454545455, "grad_norm": 0.1143917664885521, "learning_rate": 0.00019999466841041818, "loss": 0.1061, "step": 621 }, { "epoch": 0.04021010101010101, "grad_norm": 0.11280103027820587, "learning_rate": 0.00019999459756014419, "loss": 0.0898, "step": 622 }, { "epoch": 0.04027474747474748, "grad_norm": 0.14063239097595215, "learning_rate": 0.00019999452624222853, "loss": 0.1294, "step": 623 }, { "epoch": 0.04033939393939394, "grad_norm": 0.133212149143219, "learning_rate": 0.0001999944544566715, "loss": 0.1117, "step": 624 }, { "epoch": 0.04033939393939394, "eval_bleu": 10.425502094549062, "eval_loss": 0.10544434189796448, "eval_runtime": 2.8121, "eval_samples_per_second": 11.379, "eval_steps_per_second": 1.422, "step": 624 }, { "epoch": 
0.04040404040404041, "grad_norm": 0.1216183751821518, "learning_rate": 0.0001999943822034735, "loss": 0.0987, "step": 625 }, { "epoch": 0.04046868686868687, "grad_norm": 0.13168206810951233, "learning_rate": 0.00019999430948263483, "loss": 0.115, "step": 626 }, { "epoch": 0.04053333333333333, "grad_norm": 0.20381559431552887, "learning_rate": 0.00019999423629415582, "loss": 0.11, "step": 627 }, { "epoch": 0.0405979797979798, "grad_norm": 0.13594192266464233, "learning_rate": 0.0001999941626380368, "loss": 0.1132, "step": 628 }, { "epoch": 0.04066262626262626, "grad_norm": 0.1459536999464035, "learning_rate": 0.00019999408851427818, "loss": 0.1051, "step": 629 }, { "epoch": 0.04072727272727273, "grad_norm": 0.15588319301605225, "learning_rate": 0.00019999401392288023, "loss": 0.1216, "step": 630 }, { "epoch": 0.04079191919191919, "grad_norm": 0.14037740230560303, "learning_rate": 0.00019999393886384334, "loss": 0.1236, "step": 631 }, { "epoch": 0.040856565656565653, "grad_norm": 0.12617136538028717, "learning_rate": 0.00019999386333716788, "loss": 0.1009, "step": 632 }, { "epoch": 0.04092121212121212, "grad_norm": 0.1461799591779709, "learning_rate": 0.00019999378734285417, "loss": 0.1411, "step": 633 }, { "epoch": 0.040985858585858584, "grad_norm": 0.1203555092215538, "learning_rate": 0.00019999371088090255, "loss": 0.1049, "step": 634 }, { "epoch": 0.04105050505050505, "grad_norm": 0.1366298496723175, "learning_rate": 0.00019999363395131344, "loss": 0.1287, "step": 635 }, { "epoch": 0.041115151515151514, "grad_norm": 0.10302355140447617, "learning_rate": 0.00019999355655408714, "loss": 0.0862, "step": 636 }, { "epoch": 0.04117979797979798, "grad_norm": 0.1196269616484642, "learning_rate": 0.00019999347868922404, "loss": 0.1247, "step": 637 }, { "epoch": 0.041244444444444445, "grad_norm": 0.1320931762456894, "learning_rate": 0.00019999340035672448, "loss": 0.1124, "step": 638 }, { "epoch": 0.041309090909090906, "grad_norm": 0.12277361005544662, "learning_rate": 
0.00019999332155658885, "loss": 0.1057, "step": 639 }, { "epoch": 0.041373737373737375, "grad_norm": 0.12430281192064285, "learning_rate": 0.00019999324228881752, "loss": 0.1164, "step": 640 }, { "epoch": 0.041373737373737375, "eval_bleu": 11.288036767724059, "eval_loss": 0.10528124868869781, "eval_runtime": 2.5817, "eval_samples_per_second": 12.395, "eval_steps_per_second": 1.549, "step": 640 }, { "epoch": 0.04143838383838384, "grad_norm": 0.11824125796556473, "learning_rate": 0.00019999316255341084, "loss": 0.1076, "step": 641 }, { "epoch": 0.041503030303030305, "grad_norm": 0.10635879635810852, "learning_rate": 0.0001999930823503692, "loss": 0.0939, "step": 642 }, { "epoch": 0.04156767676767677, "grad_norm": 0.1288403421640396, "learning_rate": 0.000199993001679693, "loss": 0.1094, "step": 643 }, { "epoch": 0.04163232323232323, "grad_norm": 0.13538114726543427, "learning_rate": 0.00019999292054138253, "loss": 0.1249, "step": 644 }, { "epoch": 0.0416969696969697, "grad_norm": 0.1185263842344284, "learning_rate": 0.00019999283893543828, "loss": 0.1004, "step": 645 }, { "epoch": 0.04176161616161616, "grad_norm": 0.1532231867313385, "learning_rate": 0.00019999275686186056, "loss": 0.1178, "step": 646 }, { "epoch": 0.04182626262626263, "grad_norm": 0.10361472517251968, "learning_rate": 0.0001999926743206498, "loss": 0.0738, "step": 647 }, { "epoch": 0.04189090909090909, "grad_norm": 0.13335101306438446, "learning_rate": 0.00019999259131180631, "loss": 0.1221, "step": 648 }, { "epoch": 0.04195555555555556, "grad_norm": 0.13752682507038116, "learning_rate": 0.00019999250783533056, "loss": 0.1448, "step": 649 }, { "epoch": 0.04202020202020202, "grad_norm": 0.11338730156421661, "learning_rate": 0.0001999924238912229, "loss": 0.099, "step": 650 }, { "epoch": 0.04208484848484848, "grad_norm": 0.15735499560832977, "learning_rate": 0.00019999233947948371, "loss": 0.1045, "step": 651 }, { "epoch": 0.04214949494949495, "grad_norm": 0.12788867950439453, "learning_rate": 
0.00019999225460011344, "loss": 0.1246, "step": 652 }, { "epoch": 0.04221414141414141, "grad_norm": 0.12864696979522705, "learning_rate": 0.00019999216925311244, "loss": 0.113, "step": 653 }, { "epoch": 0.04227878787878788, "grad_norm": 0.11846766620874405, "learning_rate": 0.00019999208343848113, "loss": 0.0895, "step": 654 }, { "epoch": 0.04234343434343434, "grad_norm": 0.1231776550412178, "learning_rate": 0.00019999199715621988, "loss": 0.101, "step": 655 }, { "epoch": 0.04240808080808081, "grad_norm": 0.1102028340101242, "learning_rate": 0.00019999191040632913, "loss": 0.099, "step": 656 }, { "epoch": 0.04240808080808081, "eval_bleu": 12.270452227963574, "eval_loss": 0.10357742756605148, "eval_runtime": 2.8138, "eval_samples_per_second": 11.373, "eval_steps_per_second": 1.422, "step": 656 }, { "epoch": 0.04247272727272727, "grad_norm": 0.12100549042224884, "learning_rate": 0.00019999182318880928, "loss": 0.1089, "step": 657 }, { "epoch": 0.042537373737373735, "grad_norm": 0.13474537432193756, "learning_rate": 0.0001999917355036607, "loss": 0.1083, "step": 658 }, { "epoch": 0.042602020202020204, "grad_norm": 0.10728071630001068, "learning_rate": 0.00019999164735088384, "loss": 0.1059, "step": 659 }, { "epoch": 0.042666666666666665, "grad_norm": 0.10053954273462296, "learning_rate": 0.00019999155873047912, "loss": 0.0937, "step": 660 }, { "epoch": 0.042731313131313134, "grad_norm": 0.1097431629896164, "learning_rate": 0.00019999146964244692, "loss": 0.1004, "step": 661 }, { "epoch": 0.042795959595959596, "grad_norm": 0.1231168583035469, "learning_rate": 0.00019999138008678768, "loss": 0.0999, "step": 662 }, { "epoch": 0.04286060606060606, "grad_norm": 0.11319750547409058, "learning_rate": 0.0001999912900635018, "loss": 0.0939, "step": 663 }, { "epoch": 0.042925252525252526, "grad_norm": 0.1253257542848587, "learning_rate": 0.00019999119957258974, "loss": 0.1108, "step": 664 }, { "epoch": 0.04298989898989899, "grad_norm": 0.15099355578422546, "learning_rate": 
0.0001999911086140519, "loss": 0.121, "step": 665 }, { "epoch": 0.043054545454545456, "grad_norm": 0.0973721593618393, "learning_rate": 0.00019999101718788868, "loss": 0.0905, "step": 666 }, { "epoch": 0.04311919191919192, "grad_norm": 0.12029827386140823, "learning_rate": 0.0001999909252941005, "loss": 0.1145, "step": 667 }, { "epoch": 0.04318383838383839, "grad_norm": 0.12910278141498566, "learning_rate": 0.00019999083293268784, "loss": 0.1043, "step": 668 }, { "epoch": 0.04324848484848485, "grad_norm": 0.16909056901931763, "learning_rate": 0.00019999074010365115, "loss": 0.1041, "step": 669 }, { "epoch": 0.04331313131313131, "grad_norm": 0.11273932456970215, "learning_rate": 0.0001999906468069908, "loss": 0.0946, "step": 670 }, { "epoch": 0.04337777777777778, "grad_norm": 0.1234969049692154, "learning_rate": 0.0001999905530427072, "loss": 0.0982, "step": 671 }, { "epoch": 0.04344242424242424, "grad_norm": 0.14719463884830475, "learning_rate": 0.00019999045881080092, "loss": 0.1332, "step": 672 }, { "epoch": 0.04344242424242424, "eval_bleu": 13.043288789593035, "eval_loss": 0.10298320651054382, "eval_runtime": 2.6171, "eval_samples_per_second": 12.227, "eval_steps_per_second": 1.528, "step": 672 }, { "epoch": 0.04350707070707071, "grad_norm": 0.1803755909204483, "learning_rate": 0.0001999903641112723, "loss": 0.132, "step": 673 }, { "epoch": 0.04357171717171717, "grad_norm": 0.11416936665773392, "learning_rate": 0.00019999026894412176, "loss": 0.104, "step": 674 }, { "epoch": 0.04363636363636364, "grad_norm": 0.1184062734246254, "learning_rate": 0.00019999017330934985, "loss": 0.1008, "step": 675 }, { "epoch": 0.0437010101010101, "grad_norm": 0.11440252512693405, "learning_rate": 0.0001999900772069569, "loss": 0.1008, "step": 676 }, { "epoch": 0.04376565656565656, "grad_norm": 0.13120384514331818, "learning_rate": 0.00019998998063694345, "loss": 0.0986, "step": 677 }, { "epoch": 0.04383030303030303, "grad_norm": 0.11450443416833878, "learning_rate": 
0.00019998988359930988, "loss": 0.1069, "step": 678 }, { "epoch": 0.043894949494949494, "grad_norm": 0.10805507004261017, "learning_rate": 0.0001999897860940567, "loss": 0.0884, "step": 679 }, { "epoch": 0.04395959595959596, "grad_norm": 0.17670485377311707, "learning_rate": 0.00019998968812118438, "loss": 0.1046, "step": 680 }, { "epoch": 0.044024242424242424, "grad_norm": 0.13816799223423004, "learning_rate": 0.0001999895896806933, "loss": 0.1196, "step": 681 }, { "epoch": 0.044088888888888886, "grad_norm": 0.11267176270484924, "learning_rate": 0.00019998949077258398, "loss": 0.1019, "step": 682 }, { "epoch": 0.044153535353535354, "grad_norm": 0.14338566362857819, "learning_rate": 0.00019998939139685687, "loss": 0.1143, "step": 683 }, { "epoch": 0.044218181818181816, "grad_norm": 0.10855672508478165, "learning_rate": 0.00019998929155351242, "loss": 0.0893, "step": 684 }, { "epoch": 0.044282828282828285, "grad_norm": 0.11952214688062668, "learning_rate": 0.00019998919124255115, "loss": 0.1113, "step": 685 }, { "epoch": 0.04434747474747475, "grad_norm": 0.1171790063381195, "learning_rate": 0.00019998909046397344, "loss": 0.0988, "step": 686 }, { "epoch": 0.044412121212121215, "grad_norm": 0.11785004287958145, "learning_rate": 0.00019998898921777983, "loss": 0.1082, "step": 687 }, { "epoch": 0.04447676767676768, "grad_norm": 0.10744521021842957, "learning_rate": 0.00019998888750397077, "loss": 0.0962, "step": 688 }, { "epoch": 0.04447676767676768, "eval_bleu": 12.487478905348521, "eval_loss": 0.10292381793260574, "eval_runtime": 2.7172, "eval_samples_per_second": 11.777, "eval_steps_per_second": 1.472, "step": 688 }, { "epoch": 0.04454141414141414, "grad_norm": 0.12902088463306427, "learning_rate": 0.00019998878532254675, "loss": 0.1222, "step": 689 }, { "epoch": 0.04460606060606061, "grad_norm": 0.11698608845472336, "learning_rate": 0.0001999886826735082, "loss": 0.1087, "step": 690 }, { "epoch": 0.04467070707070707, "grad_norm": 0.11035232245922089, 
"learning_rate": 0.00019998857955685567, "loss": 0.111, "step": 691 }, { "epoch": 0.04473535353535354, "grad_norm": 0.11041948199272156, "learning_rate": 0.0001999884759725896, "loss": 0.1182, "step": 692 }, { "epoch": 0.0448, "grad_norm": 0.15364226698875427, "learning_rate": 0.0001999883719207105, "loss": 0.1015, "step": 693 }, { "epoch": 0.04486464646464646, "grad_norm": 0.11859642714262009, "learning_rate": 0.00019998826740121883, "loss": 0.0975, "step": 694 }, { "epoch": 0.04492929292929293, "grad_norm": 0.14098550379276276, "learning_rate": 0.00019998816241411507, "loss": 0.1169, "step": 695 }, { "epoch": 0.04499393939393939, "grad_norm": 0.11364419013261795, "learning_rate": 0.00019998805695939975, "loss": 0.1002, "step": 696 }, { "epoch": 0.04505858585858586, "grad_norm": 0.11612164229154587, "learning_rate": 0.00019998795103707333, "loss": 0.1018, "step": 697 }, { "epoch": 0.04512323232323232, "grad_norm": 0.13067692518234253, "learning_rate": 0.00019998784464713633, "loss": 0.1159, "step": 698 }, { "epoch": 0.04518787878787879, "grad_norm": 0.11234342306852341, "learning_rate": 0.0001999877377895892, "loss": 0.1024, "step": 699 }, { "epoch": 0.04525252525252525, "grad_norm": 0.11350338906049728, "learning_rate": 0.00019998763046443253, "loss": 0.1048, "step": 700 }, { "epoch": 0.045317171717171714, "grad_norm": 0.12426406145095825, "learning_rate": 0.00019998752267166677, "loss": 0.1105, "step": 701 }, { "epoch": 0.04538181818181818, "grad_norm": 0.12319961935281754, "learning_rate": 0.00019998741441129236, "loss": 0.1108, "step": 702 }, { "epoch": 0.045446464646464645, "grad_norm": 0.13607670366764069, "learning_rate": 0.00019998730568330993, "loss": 0.1106, "step": 703 }, { "epoch": 0.04551111111111111, "grad_norm": 0.1059730127453804, "learning_rate": 0.0001999871964877199, "loss": 0.0925, "step": 704 }, { "epoch": 0.04551111111111111, "eval_bleu": 12.392402411338868, "eval_loss": 0.10216230154037476, "eval_runtime": 2.6054, "eval_samples_per_second": 
12.282, "eval_steps_per_second": 1.535, "step": 704 }, { "epoch": 0.045575757575757575, "grad_norm": 0.10643964260816574, "learning_rate": 0.00019998708682452277, "loss": 0.0972, "step": 705 }, { "epoch": 0.045640404040404044, "grad_norm": 0.14168602228164673, "learning_rate": 0.00019998697669371915, "loss": 0.1239, "step": 706 }, { "epoch": 0.045705050505050505, "grad_norm": 0.12365109473466873, "learning_rate": 0.0001999868660953095, "loss": 0.1113, "step": 707 }, { "epoch": 0.04576969696969697, "grad_norm": 0.14012353122234344, "learning_rate": 0.0001999867550292943, "loss": 0.1286, "step": 708 }, { "epoch": 0.045834343434343436, "grad_norm": 0.17314791679382324, "learning_rate": 0.0001999866434956741, "loss": 0.1505, "step": 709 }, { "epoch": 0.0458989898989899, "grad_norm": 0.12496153265237808, "learning_rate": 0.00019998653149444942, "loss": 0.1042, "step": 710 }, { "epoch": 0.045963636363636366, "grad_norm": 0.12631718814373016, "learning_rate": 0.0001999864190256208, "loss": 0.1138, "step": 711 }, { "epoch": 0.04602828282828283, "grad_norm": 0.10696897655725479, "learning_rate": 0.00019998630608918875, "loss": 0.1017, "step": 712 }, { "epoch": 0.04609292929292929, "grad_norm": 0.12372178584337234, "learning_rate": 0.00019998619268515378, "loss": 0.1163, "step": 713 }, { "epoch": 0.04615757575757576, "grad_norm": 0.11347978562116623, "learning_rate": 0.00019998607881351648, "loss": 0.1012, "step": 714 }, { "epoch": 0.04622222222222222, "grad_norm": 0.1179923266172409, "learning_rate": 0.00019998596447427734, "loss": 0.0999, "step": 715 }, { "epoch": 0.04628686868686869, "grad_norm": 0.10583227127790451, "learning_rate": 0.00019998584966743688, "loss": 0.1054, "step": 716 }, { "epoch": 0.04635151515151515, "grad_norm": 0.11413531750440598, "learning_rate": 0.00019998573439299565, "loss": 0.1194, "step": 717 }, { "epoch": 0.04641616161616162, "grad_norm": 0.11270076781511307, "learning_rate": 0.0001999856186509542, "loss": 0.1122, "step": 718 }, { "epoch": 
0.04648080808080808, "grad_norm": 0.10701815783977509, "learning_rate": 0.0001999855024413131, "loss": 0.0987, "step": 719 }, { "epoch": 0.04654545454545454, "grad_norm": 0.11094026267528534, "learning_rate": 0.00019998538576407283, "loss": 0.1103, "step": 720 }, { "epoch": 0.04654545454545454, "eval_bleu": 14.192791910220867, "eval_loss": 0.103324294090271, "eval_runtime": 2.799, "eval_samples_per_second": 11.433, "eval_steps_per_second": 1.429, "step": 720 }, { "epoch": 0.04661010101010101, "grad_norm": 0.12587034702301025, "learning_rate": 0.00019998526861923397, "loss": 0.121, "step": 721 }, { "epoch": 0.04667474747474747, "grad_norm": 0.1284613162279129, "learning_rate": 0.00019998515100679706, "loss": 0.0983, "step": 722 }, { "epoch": 0.04673939393939394, "grad_norm": 0.09873173385858536, "learning_rate": 0.00019998503292676265, "loss": 0.085, "step": 723 }, { "epoch": 0.0468040404040404, "grad_norm": 0.10213294625282288, "learning_rate": 0.0001999849143791313, "loss": 0.0955, "step": 724 }, { "epoch": 0.04686868686868687, "grad_norm": 0.13524073362350464, "learning_rate": 0.00019998479536390356, "loss": 0.1133, "step": 725 }, { "epoch": 0.046933333333333334, "grad_norm": 0.12266495078802109, "learning_rate": 0.00019998467588107997, "loss": 0.1119, "step": 726 }, { "epoch": 0.046997979797979796, "grad_norm": 0.1434646099805832, "learning_rate": 0.00019998455593066115, "loss": 0.1013, "step": 727 }, { "epoch": 0.047062626262626264, "grad_norm": 0.15254801511764526, "learning_rate": 0.0001999844355126476, "loss": 0.0967, "step": 728 }, { "epoch": 0.047127272727272726, "grad_norm": 0.1079547330737114, "learning_rate": 0.00019998431462703986, "loss": 0.097, "step": 729 }, { "epoch": 0.047191919191919195, "grad_norm": 0.1393647938966751, "learning_rate": 0.00019998419327383856, "loss": 0.1102, "step": 730 }, { "epoch": 0.047256565656565656, "grad_norm": 0.1209694892168045, "learning_rate": 0.00019998407145304422, "loss": 0.1135, "step": 731 }, { "epoch": 
0.04732121212121212, "grad_norm": 0.14806681871414185, "learning_rate": 0.00019998394916465747, "loss": 0.1047, "step": 732 }, { "epoch": 0.04738585858585859, "grad_norm": 0.12639784812927246, "learning_rate": 0.00019998382640867886, "loss": 0.1171, "step": 733 }, { "epoch": 0.04745050505050505, "grad_norm": 0.12218527495861053, "learning_rate": 0.0001999837031851089, "loss": 0.1011, "step": 734 }, { "epoch": 0.04751515151515152, "grad_norm": 0.10220067948102951, "learning_rate": 0.00019998357949394823, "loss": 0.0935, "step": 735 }, { "epoch": 0.04757979797979798, "grad_norm": 0.11254847049713135, "learning_rate": 0.0001999834553351974, "loss": 0.093, "step": 736 }, { "epoch": 0.04757979797979798, "eval_bleu": 11.49276421723662, "eval_loss": 0.10434786975383759, "eval_runtime": 2.606, "eval_samples_per_second": 12.279, "eval_steps_per_second": 1.535, "step": 736 }, { "epoch": 0.04764444444444445, "grad_norm": 0.12570251524448395, "learning_rate": 0.00019998333070885704, "loss": 0.09, "step": 737 }, { "epoch": 0.04770909090909091, "grad_norm": 0.12122748792171478, "learning_rate": 0.00019998320561492766, "loss": 0.0916, "step": 738 }, { "epoch": 0.04777373737373737, "grad_norm": 0.15788482129573822, "learning_rate": 0.00019998308005340988, "loss": 0.1126, "step": 739 }, { "epoch": 0.04783838383838384, "grad_norm": 0.13124330341815948, "learning_rate": 0.00019998295402430432, "loss": 0.1164, "step": 740 }, { "epoch": 0.0479030303030303, "grad_norm": 0.11488284915685654, "learning_rate": 0.00019998282752761154, "loss": 0.0928, "step": 741 }, { "epoch": 0.04796767676767677, "grad_norm": 0.13330914080142975, "learning_rate": 0.0001999827005633321, "loss": 0.1232, "step": 742 }, { "epoch": 0.04803232323232323, "grad_norm": 0.10554460436105728, "learning_rate": 0.00019998257313146663, "loss": 0.0868, "step": 743 }, { "epoch": 0.048096969696969694, "grad_norm": 0.11329661309719086, "learning_rate": 0.00019998244523201572, "loss": 0.1064, "step": 744 }, { "epoch": 
0.04816161616161616, "grad_norm": 0.12269411981105804, "learning_rate": 0.00019998231686497997, "loss": 0.107, "step": 745 }, { "epoch": 0.048226262626262624, "grad_norm": 0.12911410629749298, "learning_rate": 0.00019998218803035997, "loss": 0.1201, "step": 746 }, { "epoch": 0.04829090909090909, "grad_norm": 0.12091823667287827, "learning_rate": 0.00019998205872815634, "loss": 0.1144, "step": 747 }, { "epoch": 0.048355555555555554, "grad_norm": 0.11663118749856949, "learning_rate": 0.00019998192895836968, "loss": 0.1089, "step": 748 }, { "epoch": 0.04842020202020202, "grad_norm": 0.1018366813659668, "learning_rate": 0.0001999817987210006, "loss": 0.089, "step": 749 }, { "epoch": 0.048484848484848485, "grad_norm": 0.10414136946201324, "learning_rate": 0.00019998166801604966, "loss": 0.0902, "step": 750 }, { "epoch": 0.04854949494949495, "grad_norm": 0.1267605870962143, "learning_rate": 0.00019998153684351754, "loss": 0.1109, "step": 751 }, { "epoch": 0.048614141414141415, "grad_norm": 0.13840921223163605, "learning_rate": 0.0001999814052034048, "loss": 0.1052, "step": 752 }, { "epoch": 0.048614141414141415, "eval_bleu": 12.124744245468259, "eval_loss": 0.10355029255151749, "eval_runtime": 2.7658, "eval_samples_per_second": 11.57, "eval_steps_per_second": 1.446, "step": 752 }, { "epoch": 0.04867878787878788, "grad_norm": 0.11638985574245453, "learning_rate": 0.0001999812730957121, "loss": 0.1025, "step": 753 }, { "epoch": 0.048743434343434346, "grad_norm": 0.16252492368221283, "learning_rate": 0.00019998114052044005, "loss": 0.1246, "step": 754 }, { "epoch": 0.04880808080808081, "grad_norm": 0.11385884881019592, "learning_rate": 0.00019998100747758925, "loss": 0.0987, "step": 755 }, { "epoch": 0.048872727272727276, "grad_norm": 0.12251380831003189, "learning_rate": 0.00019998087396716035, "loss": 0.1089, "step": 756 }, { "epoch": 0.04893737373737374, "grad_norm": 0.10436839610338211, "learning_rate": 0.00019998073998915393, "loss": 0.0919, "step": 757 }, { "epoch": 
0.0490020202020202, "grad_norm": 0.12865278124809265, "learning_rate": 0.00019998060554357063, "loss": 0.1078, "step": 758 }, { "epoch": 0.04906666666666667, "grad_norm": 0.1327090561389923, "learning_rate": 0.00019998047063041115, "loss": 0.1163, "step": 759 }, { "epoch": 0.04913131313131313, "grad_norm": 0.11831310391426086, "learning_rate": 0.000199980335249676, "loss": 0.1122, "step": 760 }, { "epoch": 0.0491959595959596, "grad_norm": 0.10785401612520218, "learning_rate": 0.0001999801994013659, "loss": 0.0972, "step": 761 }, { "epoch": 0.04926060606060606, "grad_norm": 0.11466260999441147, "learning_rate": 0.00019998006308548144, "loss": 0.0958, "step": 762 }, { "epoch": 0.04932525252525252, "grad_norm": 0.11845093220472336, "learning_rate": 0.00019997992630202332, "loss": 0.1237, "step": 763 }, { "epoch": 0.04938989898989899, "grad_norm": 0.11316904425621033, "learning_rate": 0.0001999797890509921, "loss": 0.1053, "step": 764 }, { "epoch": 0.04945454545454545, "grad_norm": 0.11070220172405243, "learning_rate": 0.00019997965133238847, "loss": 0.1165, "step": 765 }, { "epoch": 0.04951919191919192, "grad_norm": 0.11167492717504501, "learning_rate": 0.00019997951314621305, "loss": 0.1135, "step": 766 }, { "epoch": 0.04958383838383838, "grad_norm": 0.13088814914226532, "learning_rate": 0.00019997937449246653, "loss": 0.129, "step": 767 }, { "epoch": 0.04964848484848485, "grad_norm": 0.11039507389068604, "learning_rate": 0.00019997923537114952, "loss": 0.1092, "step": 768 }, { "epoch": 0.04964848484848485, "eval_bleu": 11.15731612475645, "eval_loss": 0.10469337552785873, "eval_runtime": 2.5966, "eval_samples_per_second": 12.324, "eval_steps_per_second": 1.54, "step": 768 }, { "epoch": 0.04971313131313131, "grad_norm": 0.10391898453235626, "learning_rate": 0.00019997909578226266, "loss": 0.1037, "step": 769 }, { "epoch": 0.049777777777777775, "grad_norm": 0.1021718829870224, "learning_rate": 0.00019997895572580662, "loss": 0.0969, "step": 770 }, { "epoch": 
0.049842424242424244, "grad_norm": 0.11407826095819473, "learning_rate": 0.00019997881520178207, "loss": 0.1031, "step": 771 }, { "epoch": 0.049907070707070705, "grad_norm": 0.1182626411318779, "learning_rate": 0.00019997867421018967, "loss": 0.1051, "step": 772 }, { "epoch": 0.049971717171717174, "grad_norm": 0.13272640109062195, "learning_rate": 0.00019997853275103005, "loss": 0.1205, "step": 773 }, { "epoch": 0.050036363636363636, "grad_norm": 0.12423010170459747, "learning_rate": 0.00019997839082430384, "loss": 0.1053, "step": 774 }, { "epoch": 0.050101010101010104, "grad_norm": 0.11508378386497498, "learning_rate": 0.0001999782484300118, "loss": 0.0813, "step": 775 }, { "epoch": 0.050165656565656566, "grad_norm": 0.11342547833919525, "learning_rate": 0.00019997810556815455, "loss": 0.1069, "step": 776 }, { "epoch": 0.05023030303030303, "grad_norm": 0.1191805750131607, "learning_rate": 0.00019997796223873273, "loss": 0.094, "step": 777 }, { "epoch": 0.0502949494949495, "grad_norm": 0.11147759854793549, "learning_rate": 0.00019997781844174705, "loss": 0.0943, "step": 778 }, { "epoch": 0.05035959595959596, "grad_norm": 0.11758267134428024, "learning_rate": 0.00019997767417719814, "loss": 0.106, "step": 779 }, { "epoch": 0.05042424242424243, "grad_norm": 0.11845671385526657, "learning_rate": 0.00019997752944508673, "loss": 0.102, "step": 780 }, { "epoch": 0.05048888888888889, "grad_norm": 0.12012017518281937, "learning_rate": 0.00019997738424541343, "loss": 0.1097, "step": 781 }, { "epoch": 0.05055353535353535, "grad_norm": 0.1275402158498764, "learning_rate": 0.000199977238578179, "loss": 0.1004, "step": 782 }, { "epoch": 0.05061818181818182, "grad_norm": 0.13106386363506317, "learning_rate": 0.00019997709244338403, "loss": 0.1073, "step": 783 }, { "epoch": 0.05068282828282828, "grad_norm": 0.10580660402774811, "learning_rate": 0.00019997694584102926, "loss": 0.0877, "step": 784 }, { "epoch": 0.05068282828282828, "eval_bleu": 14.201445493145247, "eval_loss": 
0.10319557785987854, "eval_runtime": 2.6913, "eval_samples_per_second": 11.89, "eval_steps_per_second": 1.486, "step": 784 }, { "epoch": 0.05074747474747475, "grad_norm": 0.12241457402706146, "learning_rate": 0.0001999767987711154, "loss": 0.1254, "step": 785 }, { "epoch": 0.05081212121212121, "grad_norm": 0.13357685506343842, "learning_rate": 0.0001999766512336431, "loss": 0.1279, "step": 786 }, { "epoch": 0.05087676767676768, "grad_norm": 0.09599798172712326, "learning_rate": 0.000199976503228613, "loss": 0.0891, "step": 787 }, { "epoch": 0.05094141414141414, "grad_norm": 0.11776507645845413, "learning_rate": 0.0001999763547560259, "loss": 0.1192, "step": 788 }, { "epoch": 0.0510060606060606, "grad_norm": 0.11095654964447021, "learning_rate": 0.00019997620581588241, "loss": 0.0827, "step": 789 }, { "epoch": 0.05107070707070707, "grad_norm": 0.11389493942260742, "learning_rate": 0.00019997605640818326, "loss": 0.1184, "step": 790 }, { "epoch": 0.051135353535353534, "grad_norm": 0.12444175779819489, "learning_rate": 0.00019997590653292918, "loss": 0.1104, "step": 791 }, { "epoch": 0.0512, "grad_norm": 0.10522261261940002, "learning_rate": 0.00019997575619012082, "loss": 0.0849, "step": 792 }, { "epoch": 0.051264646464646464, "grad_norm": 0.13184869289398193, "learning_rate": 0.00019997560537975888, "loss": 0.1221, "step": 793 }, { "epoch": 0.051329292929292926, "grad_norm": 0.13569733500480652, "learning_rate": 0.0001999754541018441, "loss": 0.1338, "step": 794 }, { "epoch": 0.051393939393939395, "grad_norm": 0.12633666396141052, "learning_rate": 0.00019997530235637715, "loss": 0.126, "step": 795 }, { "epoch": 0.051458585858585856, "grad_norm": 0.10019071400165558, "learning_rate": 0.0001999751501433588, "loss": 0.0983, "step": 796 }, { "epoch": 0.051523232323232325, "grad_norm": 0.1268041431903839, "learning_rate": 0.0001999749974627897, "loss": 0.1249, "step": 797 }, { "epoch": 0.05158787878787879, "grad_norm": 0.11632261425256729, "learning_rate": 
0.0001999748443146706, "loss": 0.1071, "step": 798 }, { "epoch": 0.051652525252525255, "grad_norm": 0.11214227974414825, "learning_rate": 0.00019997469069900218, "loss": 0.1137, "step": 799 }, { "epoch": 0.05171717171717172, "grad_norm": 0.10913074761629105, "learning_rate": 0.0001999745366157852, "loss": 0.1053, "step": 800 }, { "epoch": 0.05171717171717172, "eval_bleu": 13.90970352692978, "eval_loss": 0.10533279925584793, "eval_runtime": 2.5881, "eval_samples_per_second": 12.364, "eval_steps_per_second": 1.546, "step": 800 }, { "epoch": 0.05178181818181818, "grad_norm": 0.10523483157157898, "learning_rate": 0.00019997438206502036, "loss": 0.1066, "step": 801 }, { "epoch": 0.05184646464646465, "grad_norm": 0.10975678265094757, "learning_rate": 0.00019997422704670837, "loss": 0.1183, "step": 802 }, { "epoch": 0.05191111111111111, "grad_norm": 0.11845553666353226, "learning_rate": 0.00019997407156085003, "loss": 0.1073, "step": 803 }, { "epoch": 0.05197575757575758, "grad_norm": 0.13536454737186432, "learning_rate": 0.00019997391560744597, "loss": 0.1044, "step": 804 }, { "epoch": 0.05204040404040404, "grad_norm": 0.11444460600614548, "learning_rate": 0.00019997375918649692, "loss": 0.0978, "step": 805 }, { "epoch": 0.05210505050505051, "grad_norm": 0.0963529422879219, "learning_rate": 0.0001999736022980037, "loss": 0.0892, "step": 806 }, { "epoch": 0.05216969696969697, "grad_norm": 0.09845588356256485, "learning_rate": 0.00019997344494196697, "loss": 0.0776, "step": 807 }, { "epoch": 0.05223434343434343, "grad_norm": 0.12654970586299896, "learning_rate": 0.0001999732871183875, "loss": 0.1017, "step": 808 }, { "epoch": 0.0522989898989899, "grad_norm": 0.12078473716974258, "learning_rate": 0.000199973128827266, "loss": 0.1066, "step": 809 }, { "epoch": 0.05236363636363636, "grad_norm": 0.0995335727930069, "learning_rate": 0.00019997297006860325, "loss": 0.089, "step": 810 }, { "epoch": 0.05242828282828283, "grad_norm": 0.10869870334863663, "learning_rate": 
0.00019997281084239993, "loss": 0.1024, "step": 811 }, { "epoch": 0.05249292929292929, "grad_norm": 0.10635609179735184, "learning_rate": 0.00019997265114865686, "loss": 0.0859, "step": 812 }, { "epoch": 0.052557575757575754, "grad_norm": 0.13891276717185974, "learning_rate": 0.00019997249098737476, "loss": 0.124, "step": 813 }, { "epoch": 0.05262222222222222, "grad_norm": 0.12540379166603088, "learning_rate": 0.00019997233035855434, "loss": 0.1067, "step": 814 }, { "epoch": 0.052686868686868685, "grad_norm": 0.1086144894361496, "learning_rate": 0.00019997216926219638, "loss": 0.0948, "step": 815 }, { "epoch": 0.05275151515151515, "grad_norm": 0.11935828626155853, "learning_rate": 0.0001999720076983017, "loss": 0.107, "step": 816 }, { "epoch": 0.05275151515151515, "eval_bleu": 12.039794100997185, "eval_loss": 0.10444250702857971, "eval_runtime": 2.72, "eval_samples_per_second": 11.765, "eval_steps_per_second": 1.471, "step": 816 }, { "epoch": 0.052816161616161615, "grad_norm": 0.11445945501327515, "learning_rate": 0.00019997184566687094, "loss": 0.1088, "step": 817 }, { "epoch": 0.052880808080808084, "grad_norm": 0.11164270341396332, "learning_rate": 0.00019997168316790486, "loss": 0.1012, "step": 818 }, { "epoch": 0.052945454545454546, "grad_norm": 0.10718589276075363, "learning_rate": 0.00019997152020140432, "loss": 0.1003, "step": 819 }, { "epoch": 0.05301010101010101, "grad_norm": 0.12261474877595901, "learning_rate": 0.00019997135676737005, "loss": 0.1371, "step": 820 }, { "epoch": 0.053074747474747476, "grad_norm": 0.11439112573862076, "learning_rate": 0.00019997119286580274, "loss": 0.1111, "step": 821 }, { "epoch": 0.05313939393939394, "grad_norm": 0.2269362509250641, "learning_rate": 0.00019997102849670325, "loss": 0.1125, "step": 822 }, { "epoch": 0.053204040404040406, "grad_norm": 0.09734158217906952, "learning_rate": 0.0001999708636600723, "loss": 0.0942, "step": 823 }, { "epoch": 0.05326868686868687, "grad_norm": 0.11116871237754822, "learning_rate": 
0.00019997069835591067, "loss": 0.1077, "step": 824 }, { "epoch": 0.05333333333333334, "grad_norm": 0.15197673439979553, "learning_rate": 0.00019997053258421913, "loss": 0.0768, "step": 825 }, { "epoch": 0.0533979797979798, "grad_norm": 0.10549704730510712, "learning_rate": 0.00019997036634499847, "loss": 0.1088, "step": 826 }, { "epoch": 0.05346262626262626, "grad_norm": 0.10435540229082108, "learning_rate": 0.00019997019963824942, "loss": 0.0955, "step": 827 }, { "epoch": 0.05352727272727273, "grad_norm": 0.10653238743543625, "learning_rate": 0.00019997003246397283, "loss": 0.0927, "step": 828 }, { "epoch": 0.05359191919191919, "grad_norm": 0.12135177105665207, "learning_rate": 0.00019996986482216945, "loss": 0.1141, "step": 829 }, { "epoch": 0.05365656565656566, "grad_norm": 0.11117222905158997, "learning_rate": 0.00019996969671284007, "loss": 0.0996, "step": 830 }, { "epoch": 0.05372121212121212, "grad_norm": 0.1145494282245636, "learning_rate": 0.00019996952813598545, "loss": 0.1232, "step": 831 }, { "epoch": 0.05378585858585858, "grad_norm": 0.10023811459541321, "learning_rate": 0.0001999693590916064, "loss": 0.0984, "step": 832 }, { "epoch": 0.05378585858585858, "eval_bleu": 12.826223091065069, "eval_loss": 0.10393930971622467, "eval_runtime": 2.6105, "eval_samples_per_second": 12.258, "eval_steps_per_second": 1.532, "step": 832 }, { "epoch": 0.05385050505050505, "grad_norm": 0.11759473383426666, "learning_rate": 0.0001999691895797037, "loss": 0.1151, "step": 833 }, { "epoch": 0.05391515151515151, "grad_norm": 0.12444781512022018, "learning_rate": 0.00019996901960027814, "loss": 0.1104, "step": 834 }, { "epoch": 0.05397979797979798, "grad_norm": 0.12180424481630325, "learning_rate": 0.00019996884915333054, "loss": 0.1172, "step": 835 }, { "epoch": 0.054044444444444444, "grad_norm": 0.1297658234834671, "learning_rate": 0.00019996867823886166, "loss": 0.1334, "step": 836 }, { "epoch": 0.05410909090909091, "grad_norm": 0.13237729668617249, "learning_rate": 
0.00019996850685687236, "loss": 0.1126, "step": 837 }, { "epoch": 0.054173737373737374, "grad_norm": 0.12256555259227753, "learning_rate": 0.0001999683350073634, "loss": 0.133, "step": 838 }, { "epoch": 0.054238383838383836, "grad_norm": 0.10652507841587067, "learning_rate": 0.00019996816269033554, "loss": 0.0963, "step": 839 }, { "epoch": 0.054303030303030304, "grad_norm": 0.13831791281700134, "learning_rate": 0.00019996798990578967, "loss": 0.1291, "step": 840 }, { "epoch": 0.054367676767676766, "grad_norm": 0.11334650218486786, "learning_rate": 0.00019996781665372655, "loss": 0.1073, "step": 841 }, { "epoch": 0.054432323232323235, "grad_norm": 0.10309616476297379, "learning_rate": 0.000199967642934147, "loss": 0.0952, "step": 842 }, { "epoch": 0.054496969696969697, "grad_norm": 0.12420094013214111, "learning_rate": 0.00019996746874705184, "loss": 0.1075, "step": 843 }, { "epoch": 0.05456161616161616, "grad_norm": 0.10948662459850311, "learning_rate": 0.00019996729409244186, "loss": 0.1137, "step": 844 }, { "epoch": 0.05462626262626263, "grad_norm": 0.1168128028512001, "learning_rate": 0.0001999671189703179, "loss": 0.1016, "step": 845 }, { "epoch": 0.05469090909090909, "grad_norm": 0.09760251641273499, "learning_rate": 0.00019996694338068077, "loss": 0.0934, "step": 846 }, { "epoch": 0.05475555555555556, "grad_norm": 0.12694862484931946, "learning_rate": 0.0001999667673235313, "loss": 0.1207, "step": 847 }, { "epoch": 0.05482020202020202, "grad_norm": 0.11269383877515793, "learning_rate": 0.00019996659079887032, "loss": 0.1044, "step": 848 }, { "epoch": 0.05482020202020202, "eval_bleu": 11.33978220393837, "eval_loss": 0.10266672819852829, "eval_runtime": 2.7659, "eval_samples_per_second": 11.57, "eval_steps_per_second": 1.446, "step": 848 }, { "epoch": 0.05488484848484849, "grad_norm": 0.11263251304626465, "learning_rate": 0.00019996641380669864, "loss": 0.1079, "step": 849 }, { "epoch": 0.05494949494949495, "grad_norm": 0.12025243043899536, "learning_rate": 
0.0001999662363470171, "loss": 0.1043, "step": 850 }, { "epoch": 0.05501414141414141, "grad_norm": 0.1157556027173996, "learning_rate": 0.0001999660584198265, "loss": 0.105, "step": 851 }, { "epoch": 0.05507878787878788, "grad_norm": 0.10801595449447632, "learning_rate": 0.0001999658800251277, "loss": 0.0986, "step": 852 }, { "epoch": 0.05514343434343434, "grad_norm": 0.11887183785438538, "learning_rate": 0.0001999657011629215, "loss": 0.1167, "step": 853 }, { "epoch": 0.05520808080808081, "grad_norm": 0.11737223714590073, "learning_rate": 0.00019996552183320878, "loss": 0.1159, "step": 854 }, { "epoch": 0.05527272727272727, "grad_norm": 0.1060580387711525, "learning_rate": 0.00019996534203599038, "loss": 0.101, "step": 855 }, { "epoch": 0.05533737373737374, "grad_norm": 0.13838988542556763, "learning_rate": 0.00019996516177126712, "loss": 0.1187, "step": 856 }, { "epoch": 0.0554020202020202, "grad_norm": 0.10509878396987915, "learning_rate": 0.0001999649810390398, "loss": 0.1015, "step": 857 }, { "epoch": 0.055466666666666664, "grad_norm": 0.10692878067493439, "learning_rate": 0.0001999647998393094, "loss": 0.1015, "step": 858 }, { "epoch": 0.05553131313131313, "grad_norm": 0.09788931161165237, "learning_rate": 0.0001999646181720766, "loss": 0.0986, "step": 859 }, { "epoch": 0.055595959595959595, "grad_norm": 0.10287925601005554, "learning_rate": 0.00019996443603734237, "loss": 0.1001, "step": 860 }, { "epoch": 0.05566060606060606, "grad_norm": 0.10221549868583679, "learning_rate": 0.00019996425343510752, "loss": 0.0983, "step": 861 }, { "epoch": 0.055725252525252525, "grad_norm": 0.10853669792413712, "learning_rate": 0.0001999640703653729, "loss": 0.0985, "step": 862 }, { "epoch": 0.05578989898989899, "grad_norm": 0.10268131643533707, "learning_rate": 0.00019996388682813935, "loss": 0.1034, "step": 863 }, { "epoch": 0.055854545454545455, "grad_norm": 0.11127810180187225, "learning_rate": 0.00019996370282340779, "loss": 0.1109, "step": 864 }, { "epoch": 
0.055854545454545455, "eval_bleu": 17.771942553667554, "eval_loss": 0.10232952237129211, "eval_runtime": 2.6745, "eval_samples_per_second": 11.965, "eval_steps_per_second": 1.496, "step": 864 }, { "epoch": 0.05591919191919192, "grad_norm": 0.08473800122737885, "learning_rate": 0.00019996351835117898, "loss": 0.08, "step": 865 }, { "epoch": 0.055983838383838386, "grad_norm": 0.11169324070215225, "learning_rate": 0.00019996333341145388, "loss": 0.1089, "step": 866 }, { "epoch": 0.05604848484848485, "grad_norm": 0.1174512729048729, "learning_rate": 0.0001999631480042333, "loss": 0.1061, "step": 867 }, { "epoch": 0.056113131313131316, "grad_norm": 0.117987722158432, "learning_rate": 0.00019996296212951813, "loss": 0.1097, "step": 868 }, { "epoch": 0.05617777777777778, "grad_norm": 0.10855425149202347, "learning_rate": 0.00019996277578730924, "loss": 0.1098, "step": 869 }, { "epoch": 0.05624242424242424, "grad_norm": 0.15426157414913177, "learning_rate": 0.00019996258897760752, "loss": 0.1092, "step": 870 }, { "epoch": 0.05630707070707071, "grad_norm": 0.09436088800430298, "learning_rate": 0.00019996240170041377, "loss": 0.0988, "step": 871 }, { "epoch": 0.05637171717171717, "grad_norm": 0.09706581383943558, "learning_rate": 0.00019996221395572893, "loss": 0.0962, "step": 872 }, { "epoch": 0.05643636363636364, "grad_norm": 0.09941123425960541, "learning_rate": 0.00019996202574355386, "loss": 0.0879, "step": 873 }, { "epoch": 0.0565010101010101, "grad_norm": 0.10898749530315399, "learning_rate": 0.00019996183706388947, "loss": 0.1095, "step": 874 }, { "epoch": 0.05656565656565657, "grad_norm": 0.12166240811347961, "learning_rate": 0.00019996164791673658, "loss": 0.0861, "step": 875 }, { "epoch": 0.05663030303030303, "grad_norm": 0.10835130512714386, "learning_rate": 0.0001999614583020961, "loss": 0.1087, "step": 876 }, { "epoch": 0.05669494949494949, "grad_norm": 0.10567472130060196, "learning_rate": 0.00019996126821996896, "loss": 0.1111, "step": 877 }, { "epoch": 
0.05675959595959596, "grad_norm": 0.10107319802045822, "learning_rate": 0.00019996107767035603, "loss": 0.0893, "step": 878 }, { "epoch": 0.05682424242424242, "grad_norm": 0.10014646500349045, "learning_rate": 0.00019996088665325817, "loss": 0.0911, "step": 879 }, { "epoch": 0.05688888888888889, "grad_norm": 0.11912678927183151, "learning_rate": 0.00019996069516867626, "loss": 0.1132, "step": 880 }, { "epoch": 0.05688888888888889, "eval_bleu": 16.553064467689797, "eval_loss": 0.10099372267723083, "eval_runtime": 2.6971, "eval_samples_per_second": 11.865, "eval_steps_per_second": 1.483, "step": 880 }, { "epoch": 0.05695353535353535, "grad_norm": 0.11043732613325119, "learning_rate": 0.00019996050321661123, "loss": 0.0971, "step": 881 }, { "epoch": 0.057018181818181815, "grad_norm": 0.11203518509864807, "learning_rate": 0.000199960310797064, "loss": 0.1047, "step": 882 }, { "epoch": 0.057082828282828284, "grad_norm": 0.1297028362751007, "learning_rate": 0.00019996011791003544, "loss": 0.1084, "step": 883 }, { "epoch": 0.057147474747474745, "grad_norm": 0.11635738611221313, "learning_rate": 0.00019995992455552645, "loss": 0.097, "step": 884 }, { "epoch": 0.057212121212121214, "grad_norm": 0.11968257278203964, "learning_rate": 0.0001999597307335379, "loss": 0.1013, "step": 885 }, { "epoch": 0.057276767676767676, "grad_norm": 0.10897650569677353, "learning_rate": 0.0001999595364440708, "loss": 0.0966, "step": 886 }, { "epoch": 0.057341414141414145, "grad_norm": 0.1232411116361618, "learning_rate": 0.00019995934168712595, "loss": 0.1207, "step": 887 }, { "epoch": 0.057406060606060606, "grad_norm": 0.09599161893129349, "learning_rate": 0.00019995914646270434, "loss": 0.0935, "step": 888 }, { "epoch": 0.05747070707070707, "grad_norm": 0.110834501683712, "learning_rate": 0.0001999589507708068, "loss": 0.1069, "step": 889 }, { "epoch": 0.05753535353535354, "grad_norm": 0.12236505001783371, "learning_rate": 0.00019995875461143432, "loss": 0.1101, "step": 890 }, { "epoch": 
0.0576, "grad_norm": 0.10326965898275375, "learning_rate": 0.00019995855798458781, "loss": 0.1045, "step": 891 }, { "epoch": 0.05766464646464647, "grad_norm": 0.13237006962299347, "learning_rate": 0.00019995836089026813, "loss": 0.1122, "step": 892 }, { "epoch": 0.05772929292929293, "grad_norm": 0.09868495911359787, "learning_rate": 0.00019995816332847626, "loss": 0.0962, "step": 893 }, { "epoch": 0.05779393939393939, "grad_norm": 0.11295129358768463, "learning_rate": 0.0001999579652992131, "loss": 0.1072, "step": 894 }, { "epoch": 0.05785858585858586, "grad_norm": 0.1221550777554512, "learning_rate": 0.0001999577668024796, "loss": 0.1192, "step": 895 }, { "epoch": 0.05792323232323232, "grad_norm": 0.11230450123548508, "learning_rate": 0.00019995756783827665, "loss": 0.1196, "step": 896 }, { "epoch": 0.05792323232323232, "eval_bleu": 14.791945394215508, "eval_loss": 0.10085465759038925, "eval_runtime": 2.7005, "eval_samples_per_second": 11.85, "eval_steps_per_second": 1.481, "step": 896 }, { "epoch": 0.05798787878787879, "grad_norm": 0.10296222567558289, "learning_rate": 0.00019995736840660523, "loss": 0.1048, "step": 897 }, { "epoch": 0.05805252525252525, "grad_norm": 0.1342710256576538, "learning_rate": 0.0001999571685074662, "loss": 0.1099, "step": 898 }, { "epoch": 0.05811717171717172, "grad_norm": 0.11080357432365417, "learning_rate": 0.00019995696814086058, "loss": 0.092, "step": 899 }, { "epoch": 0.05818181818181818, "grad_norm": 0.11223907023668289, "learning_rate": 0.00019995676730678925, "loss": 0.1031, "step": 900 }, { "epoch": 0.058246464646464643, "grad_norm": 0.11462075263261795, "learning_rate": 0.00019995656600525313, "loss": 0.1105, "step": 901 }, { "epoch": 0.05831111111111111, "grad_norm": 0.11297249794006348, "learning_rate": 0.00019995636423625324, "loss": 0.1002, "step": 902 }, { "epoch": 0.058375757575757574, "grad_norm": 0.10576991736888885, "learning_rate": 0.00019995616199979045, "loss": 0.0918, "step": 903 }, { "epoch": 
0.05844040404040404, "grad_norm": 0.10222340375185013, "learning_rate": 0.00019995595929586576, "loss": 0.0939, "step": 904 }, { "epoch": 0.058505050505050504, "grad_norm": 0.1218513771891594, "learning_rate": 0.0001999557561244801, "loss": 0.1116, "step": 905 }, { "epoch": 0.05856969696969697, "grad_norm": 0.11331094801425934, "learning_rate": 0.00019995555248563437, "loss": 0.1126, "step": 906 }, { "epoch": 0.058634343434343435, "grad_norm": 0.1779399812221527, "learning_rate": 0.00019995534837932962, "loss": 0.104, "step": 907 }, { "epoch": 0.058698989898989896, "grad_norm": 0.11177841573953629, "learning_rate": 0.00019995514380556672, "loss": 0.1008, "step": 908 }, { "epoch": 0.058763636363636365, "grad_norm": 0.10509387403726578, "learning_rate": 0.00019995493876434666, "loss": 0.0946, "step": 909 }, { "epoch": 0.05882828282828283, "grad_norm": 0.11490184813737869, "learning_rate": 0.0001999547332556704, "loss": 0.108, "step": 910 }, { "epoch": 0.058892929292929296, "grad_norm": 0.10376391559839249, "learning_rate": 0.0001999545272795389, "loss": 0.0925, "step": 911 }, { "epoch": 0.05895757575757576, "grad_norm": 0.10855154693126678, "learning_rate": 0.00019995432083595312, "loss": 0.0989, "step": 912 }, { "epoch": 0.05895757575757576, "eval_bleu": 13.503646460032721, "eval_loss": 0.09989695250988007, "eval_runtime": 2.7356, "eval_samples_per_second": 11.698, "eval_steps_per_second": 1.462, "step": 912 }, { "epoch": 0.05902222222222222, "grad_norm": 0.1056453138589859, "learning_rate": 0.000199954113924914, "loss": 0.1062, "step": 913 }, { "epoch": 0.05908686868686869, "grad_norm": 0.1238904744386673, "learning_rate": 0.00019995390654642257, "loss": 0.1257, "step": 914 }, { "epoch": 0.05915151515151515, "grad_norm": 0.10526388138532639, "learning_rate": 0.00019995369870047972, "loss": 0.1051, "step": 915 }, { "epoch": 0.05921616161616162, "grad_norm": 0.11506146192550659, "learning_rate": 0.00019995349038708652, "loss": 0.1201, "step": 916 }, { "epoch": 
0.05928080808080808, "grad_norm": 0.10063769668340683, "learning_rate": 0.00019995328160624386, "loss": 0.1043, "step": 917 }, { "epoch": 0.05934545454545455, "grad_norm": 0.11473876982927322, "learning_rate": 0.00019995307235795272, "loss": 0.0935, "step": 918 }, { "epoch": 0.05941010101010101, "grad_norm": 0.1102418303489685, "learning_rate": 0.00019995286264221413, "loss": 0.1061, "step": 919 }, { "epoch": 0.05947474747474747, "grad_norm": 0.11224579066038132, "learning_rate": 0.00019995265245902904, "loss": 0.1013, "step": 920 }, { "epoch": 0.05953939393939394, "grad_norm": 0.15346866846084595, "learning_rate": 0.00019995244180839845, "loss": 0.1065, "step": 921 }, { "epoch": 0.0596040404040404, "grad_norm": 0.16856330633163452, "learning_rate": 0.00019995223069032333, "loss": 0.107, "step": 922 }, { "epoch": 0.05966868686868687, "grad_norm": 0.13238894939422607, "learning_rate": 0.00019995201910480467, "loss": 0.1194, "step": 923 }, { "epoch": 0.05973333333333333, "grad_norm": 0.12689128518104553, "learning_rate": 0.00019995180705184346, "loss": 0.1197, "step": 924 }, { "epoch": 0.0597979797979798, "grad_norm": 0.1079198494553566, "learning_rate": 0.00019995159453144068, "loss": 0.1059, "step": 925 }, { "epoch": 0.05986262626262626, "grad_norm": 0.09220168739557266, "learning_rate": 0.00019995138154359733, "loss": 0.0818, "step": 926 }, { "epoch": 0.059927272727272725, "grad_norm": 0.11543834209442139, "learning_rate": 0.00019995116808831445, "loss": 0.1127, "step": 927 }, { "epoch": 0.059991919191919194, "grad_norm": 0.131541907787323, "learning_rate": 0.00019995095416559298, "loss": 0.109, "step": 928 }, { "epoch": 0.059991919191919194, "eval_bleu": 13.146405815577472, "eval_loss": 0.1025841161608696, "eval_runtime": 2.6904, "eval_samples_per_second": 11.894, "eval_steps_per_second": 1.487, "step": 928 }, { "epoch": 0.060056565656565655, "grad_norm": 0.10599727928638458, "learning_rate": 0.00019995073977543393, "loss": 0.0928, "step": 929 }, { "epoch": 
0.060121212121212124, "grad_norm": 0.10286258906126022, "learning_rate": 0.00019995052491783832, "loss": 0.105, "step": 930 }, { "epoch": 0.060185858585858586, "grad_norm": 0.09632156789302826, "learning_rate": 0.00019995030959280716, "loss": 0.0999, "step": 931 }, { "epoch": 0.06025050505050505, "grad_norm": 0.11808812618255615, "learning_rate": 0.00019995009380034142, "loss": 0.119, "step": 932 }, { "epoch": 0.060315151515151516, "grad_norm": 0.11463737487792969, "learning_rate": 0.00019994987754044216, "loss": 0.1167, "step": 933 }, { "epoch": 0.06037979797979798, "grad_norm": 0.10428722202777863, "learning_rate": 0.00019994966081311036, "loss": 0.1012, "step": 934 }, { "epoch": 0.060444444444444446, "grad_norm": 0.09879586100578308, "learning_rate": 0.00019994944361834702, "loss": 0.0976, "step": 935 }, { "epoch": 0.06050909090909091, "grad_norm": 0.11271627247333527, "learning_rate": 0.0001999492259561532, "loss": 0.1105, "step": 936 }, { "epoch": 0.06057373737373738, "grad_norm": 0.11347619444131851, "learning_rate": 0.00019994900782652986, "loss": 0.1203, "step": 937 }, { "epoch": 0.06063838383838384, "grad_norm": 0.10328079760074615, "learning_rate": 0.00019994878922947805, "loss": 0.1068, "step": 938 }, { "epoch": 0.0607030303030303, "grad_norm": 0.10391739010810852, "learning_rate": 0.00019994857016499885, "loss": 0.1066, "step": 939 }, { "epoch": 0.06076767676767677, "grad_norm": 0.11093475669622421, "learning_rate": 0.00019994835063309318, "loss": 0.1028, "step": 940 }, { "epoch": 0.06083232323232323, "grad_norm": 0.11336583644151688, "learning_rate": 0.00019994813063376214, "loss": 0.1106, "step": 941 }, { "epoch": 0.0608969696969697, "grad_norm": 0.11894485354423523, "learning_rate": 0.00019994791016700675, "loss": 0.1108, "step": 942 }, { "epoch": 0.06096161616161616, "grad_norm": 0.10436300188302994, "learning_rate": 0.00019994768923282797, "loss": 0.0967, "step": 943 }, { "epoch": 0.06102626262626262, "grad_norm": 0.1109478771686554, 
"learning_rate": 0.00019994746783122693, "loss": 0.1039, "step": 944 }, { "epoch": 0.06102626262626262, "eval_bleu": 14.703895748705321, "eval_loss": 0.1030469536781311, "eval_runtime": 2.8892, "eval_samples_per_second": 11.076, "eval_steps_per_second": 1.384, "step": 944 }, { "epoch": 0.06109090909090909, "grad_norm": 0.10315844416618347, "learning_rate": 0.0001999472459622046, "loss": 0.1, "step": 945 }, { "epoch": 0.06115555555555555, "grad_norm": 0.12184132635593414, "learning_rate": 0.0001999470236257621, "loss": 0.1056, "step": 946 }, { "epoch": 0.06122020202020202, "grad_norm": 0.10203199833631516, "learning_rate": 0.00019994680082190036, "loss": 0.1004, "step": 947 }, { "epoch": 0.061284848484848484, "grad_norm": 0.10957950353622437, "learning_rate": 0.0001999465775506205, "loss": 0.1001, "step": 948 }, { "epoch": 0.06134949494949495, "grad_norm": 0.09947719424962997, "learning_rate": 0.00019994635381192353, "loss": 0.0981, "step": 949 }, { "epoch": 0.061414141414141414, "grad_norm": 0.10668642818927765, "learning_rate": 0.00019994612960581049, "loss": 0.1043, "step": 950 }, { "epoch": 0.061478787878787876, "grad_norm": 0.09808440506458282, "learning_rate": 0.00019994590493228247, "loss": 0.0849, "step": 951 }, { "epoch": 0.061543434343434344, "grad_norm": 0.1252501755952835, "learning_rate": 0.00019994567979134047, "loss": 0.1027, "step": 952 }, { "epoch": 0.061608080808080806, "grad_norm": 0.09008917212486267, "learning_rate": 0.00019994545418298558, "loss": 0.0809, "step": 953 }, { "epoch": 0.061672727272727275, "grad_norm": 0.09205963462591171, "learning_rate": 0.00019994522810721886, "loss": 0.0942, "step": 954 }, { "epoch": 0.06173737373737374, "grad_norm": 0.09121670573949814, "learning_rate": 0.0001999450015640413, "loss": 0.0854, "step": 955 }, { "epoch": 0.061802020202020205, "grad_norm": 0.1023370772600174, "learning_rate": 0.00019994477455345403, "loss": 0.0989, "step": 956 }, { "epoch": 0.06186666666666667, "grad_norm": 0.11290039122104645, 
"learning_rate": 0.00019994454707545815, "loss": 0.1109, "step": 957 }, { "epoch": 0.06193131313131313, "grad_norm": 0.0942951962351799, "learning_rate": 0.00019994431913005457, "loss": 0.0863, "step": 958 }, { "epoch": 0.0619959595959596, "grad_norm": 0.09380186349153519, "learning_rate": 0.0001999440907172445, "loss": 0.0879, "step": 959 }, { "epoch": 0.06206060606060606, "grad_norm": 0.10406679660081863, "learning_rate": 0.00019994386183702897, "loss": 0.082, "step": 960 }, { "epoch": 0.06206060606060606, "eval_bleu": 13.504000765398503, "eval_loss": 0.1035585030913353, "eval_runtime": 2.7527, "eval_samples_per_second": 11.625, "eval_steps_per_second": 1.453, "step": 960 }, { "epoch": 0.06212525252525253, "grad_norm": 0.09908974915742874, "learning_rate": 0.00019994363248940903, "loss": 0.0949, "step": 961 }, { "epoch": 0.06218989898989899, "grad_norm": 0.12424175441265106, "learning_rate": 0.00019994340267438573, "loss": 0.0976, "step": 962 }, { "epoch": 0.06225454545454545, "grad_norm": 0.10670045763254166, "learning_rate": 0.0001999431723919602, "loss": 0.1003, "step": 963 }, { "epoch": 0.06231919191919192, "grad_norm": 0.1120072603225708, "learning_rate": 0.0001999429416421335, "loss": 0.1041, "step": 964 }, { "epoch": 0.06238383838383838, "grad_norm": 0.14401409029960632, "learning_rate": 0.00019994271042490666, "loss": 0.1196, "step": 965 }, { "epoch": 0.06244848484848485, "grad_norm": 0.11253448575735092, "learning_rate": 0.00019994247874028086, "loss": 0.1138, "step": 966 }, { "epoch": 0.06251313131313131, "grad_norm": 0.10873868316411972, "learning_rate": 0.00019994224658825707, "loss": 0.108, "step": 967 }, { "epoch": 0.06257777777777777, "grad_norm": 0.12138410657644272, "learning_rate": 0.0001999420139688365, "loss": 0.1013, "step": 968 }, { "epoch": 0.06264242424242424, "grad_norm": 0.11792376637458801, "learning_rate": 0.00019994178088202013, "loss": 0.1078, "step": 969 }, { "epoch": 0.06270707070707071, "grad_norm": 0.09426609426736832, 
"learning_rate": 0.0001999415473278091, "loss": 0.0881, "step": 970 }, { "epoch": 0.06277171717171717, "grad_norm": 0.14864352345466614, "learning_rate": 0.00019994131330620452, "loss": 0.0993, "step": 971 }, { "epoch": 0.06283636363636363, "grad_norm": 0.09219281375408173, "learning_rate": 0.0001999410788172074, "loss": 0.0896, "step": 972 }, { "epoch": 0.0629010101010101, "grad_norm": 0.09379587322473526, "learning_rate": 0.00019994084386081894, "loss": 0.0865, "step": 973 }, { "epoch": 0.06296565656565657, "grad_norm": 0.12673808634281158, "learning_rate": 0.00019994060843704016, "loss": 0.1086, "step": 974 }, { "epoch": 0.06303030303030303, "grad_norm": 0.1037813052535057, "learning_rate": 0.0001999403725458722, "loss": 0.0975, "step": 975 }, { "epoch": 0.0630949494949495, "grad_norm": 0.14156074821949005, "learning_rate": 0.00019994013618731618, "loss": 0.106, "step": 976 }, { "epoch": 0.0630949494949495, "eval_bleu": 16.63288810316848, "eval_loss": 0.10185457766056061, "eval_runtime": 2.7356, "eval_samples_per_second": 11.698, "eval_steps_per_second": 1.462, "step": 976 }, { "epoch": 0.06315959595959596, "grad_norm": 0.1073005273938179, "learning_rate": 0.0001999398993613732, "loss": 0.1096, "step": 977 }, { "epoch": 0.06322424242424242, "grad_norm": 0.09988738596439362, "learning_rate": 0.00019993966206804432, "loss": 0.0997, "step": 978 }, { "epoch": 0.0632888888888889, "grad_norm": 0.10770740360021591, "learning_rate": 0.00019993942430733067, "loss": 0.1121, "step": 979 }, { "epoch": 0.06335353535353536, "grad_norm": 0.10458149015903473, "learning_rate": 0.00019993918607923337, "loss": 0.0975, "step": 980 }, { "epoch": 0.06341818181818182, "grad_norm": 0.10584570467472076, "learning_rate": 0.00019993894738375357, "loss": 0.1065, "step": 981 }, { "epoch": 0.06348282828282828, "grad_norm": 0.10891502350568771, "learning_rate": 0.00019993870822089234, "loss": 0.097, "step": 982 }, { "epoch": 0.06354747474747474, "grad_norm": 0.11425327509641647, 
"learning_rate": 0.00019993846859065077, "loss": 0.1051, "step": 983 }, { "epoch": 0.06361212121212122, "grad_norm": 0.09814655035734177, "learning_rate": 0.00019993822849303006, "loss": 0.0944, "step": 984 }, { "epoch": 0.06367676767676768, "grad_norm": 0.10303620249032974, "learning_rate": 0.00019993798792803129, "loss": 0.0977, "step": 985 }, { "epoch": 0.06374141414141414, "grad_norm": 0.10379397869110107, "learning_rate": 0.0001999377468956556, "loss": 0.0918, "step": 986 }, { "epoch": 0.0638060606060606, "grad_norm": 0.11233863234519958, "learning_rate": 0.0001999375053959041, "loss": 0.1014, "step": 987 }, { "epoch": 0.06387070707070706, "grad_norm": 0.09550675749778748, "learning_rate": 0.0001999372634287779, "loss": 0.1007, "step": 988 }, { "epoch": 0.06393535353535354, "grad_norm": 0.1209508553147316, "learning_rate": 0.00019993702099427817, "loss": 0.1224, "step": 989 }, { "epoch": 0.064, "grad_norm": 0.10136724263429642, "learning_rate": 0.00019993677809240605, "loss": 0.1126, "step": 990 }, { "epoch": 0.06406464646464646, "grad_norm": 0.09928274154663086, "learning_rate": 0.00019993653472316264, "loss": 0.1028, "step": 991 }, { "epoch": 0.06412929292929292, "grad_norm": 0.10656436532735825, "learning_rate": 0.0001999362908865491, "loss": 0.1068, "step": 992 }, { "epoch": 0.06412929292929292, "eval_bleu": 12.73347699399549, "eval_loss": 0.10065907984972, "eval_runtime": 2.6363, "eval_samples_per_second": 12.138, "eval_steps_per_second": 1.517, "step": 992 }, { "epoch": 0.0641939393939394, "grad_norm": 0.1007160171866417, "learning_rate": 0.00019993604658256658, "loss": 0.094, "step": 993 }, { "epoch": 0.06425858585858586, "grad_norm": 0.09793444722890854, "learning_rate": 0.0001999358018112162, "loss": 0.1046, "step": 994 }, { "epoch": 0.06432323232323232, "grad_norm": 0.10793797671794891, "learning_rate": 0.0001999355565724991, "loss": 0.1077, "step": 995 }, { "epoch": 0.06438787878787879, "grad_norm": 0.0944027304649353, "learning_rate": 
0.00019993531086641645, "loss": 0.0976, "step": 996 }, { "epoch": 0.06445252525252525, "grad_norm": 0.14579692482948303, "learning_rate": 0.0001999350646929694, "loss": 0.1281, "step": 997 }, { "epoch": 0.06451717171717172, "grad_norm": 0.10719353705644608, "learning_rate": 0.00019993481805215905, "loss": 0.1067, "step": 998 }, { "epoch": 0.06458181818181818, "grad_norm": 0.08898865431547165, "learning_rate": 0.00019993457094398664, "loss": 0.0863, "step": 999 }, { "epoch": 0.06464646464646465, "grad_norm": 0.10586211085319519, "learning_rate": 0.00019993432336845327, "loss": 0.0946, "step": 1000 }, { "epoch": 0.06471111111111111, "grad_norm": 0.09605011343955994, "learning_rate": 0.00019993407532556008, "loss": 0.0895, "step": 1001 }, { "epoch": 0.06477575757575757, "grad_norm": 0.10589330643415451, "learning_rate": 0.0001999338268153083, "loss": 0.1004, "step": 1002 }, { "epoch": 0.06484040404040405, "grad_norm": 0.09274252504110336, "learning_rate": 0.000199933577837699, "loss": 0.0773, "step": 1003 }, { "epoch": 0.06490505050505051, "grad_norm": 0.10222328454256058, "learning_rate": 0.00019993332839273343, "loss": 0.1018, "step": 1004 }, { "epoch": 0.06496969696969697, "grad_norm": 0.1029796153306961, "learning_rate": 0.00019993307848041272, "loss": 0.087, "step": 1005 }, { "epoch": 0.06503434343434343, "grad_norm": 0.12443310767412186, "learning_rate": 0.00019993282810073804, "loss": 0.1269, "step": 1006 }, { "epoch": 0.06509898989898989, "grad_norm": 0.10611167550086975, "learning_rate": 0.00019993257725371054, "loss": 0.0957, "step": 1007 }, { "epoch": 0.06516363636363637, "grad_norm": 0.10289434343576431, "learning_rate": 0.0001999323259393314, "loss": 0.1098, "step": 1008 }, { "epoch": 0.06516363636363637, "eval_bleu": 15.062234492126294, "eval_loss": 0.09961952269077301, "eval_runtime": 2.8104, "eval_samples_per_second": 11.386, "eval_steps_per_second": 1.423, "step": 1008 }, { "epoch": 0.06522828282828283, "grad_norm": 0.09957101196050644, 
"learning_rate": 0.00019993207415760185, "loss": 0.099, "step": 1009 }, { "epoch": 0.06529292929292929, "grad_norm": 0.09299284219741821, "learning_rate": 0.000199931821908523, "loss": 0.0876, "step": 1010 }, { "epoch": 0.06535757575757575, "grad_norm": 0.10813038796186447, "learning_rate": 0.00019993156919209605, "loss": 0.1054, "step": 1011 }, { "epoch": 0.06542222222222223, "grad_norm": 0.08390337228775024, "learning_rate": 0.00019993131600832222, "loss": 0.0787, "step": 1012 }, { "epoch": 0.06548686868686869, "grad_norm": 0.10573827475309372, "learning_rate": 0.00019993106235720266, "loss": 0.1102, "step": 1013 }, { "epoch": 0.06555151515151515, "grad_norm": 0.12195960432291031, "learning_rate": 0.00019993080823873852, "loss": 0.1098, "step": 1014 }, { "epoch": 0.06561616161616161, "grad_norm": 0.1130300983786583, "learning_rate": 0.00019993055365293106, "loss": 0.1087, "step": 1015 }, { "epoch": 0.06568080808080808, "grad_norm": 0.10060533881187439, "learning_rate": 0.0001999302985997814, "loss": 0.097, "step": 1016 }, { "epoch": 0.06574545454545455, "grad_norm": 0.12042611837387085, "learning_rate": 0.0001999300430792908, "loss": 0.0994, "step": 1017 }, { "epoch": 0.06581010101010101, "grad_norm": 0.11390791088342667, "learning_rate": 0.00019992978709146042, "loss": 0.1009, "step": 1018 }, { "epoch": 0.06587474747474747, "grad_norm": 0.09423738718032837, "learning_rate": 0.00019992953063629145, "loss": 0.096, "step": 1019 }, { "epoch": 0.06593939393939394, "grad_norm": 0.11149628460407257, "learning_rate": 0.00019992927371378512, "loss": 0.1008, "step": 1020 }, { "epoch": 0.0660040404040404, "grad_norm": 0.10044733434915543, "learning_rate": 0.00019992901632394258, "loss": 0.1091, "step": 1021 }, { "epoch": 0.06606868686868687, "grad_norm": 0.14813345670700073, "learning_rate": 0.0001999287584667651, "loss": 0.0985, "step": 1022 }, { "epoch": 0.06613333333333334, "grad_norm": 0.10394751280546188, "learning_rate": 0.00019992850014225385, "loss": 0.1082, 
"step": 1023 }, { "epoch": 0.0661979797979798, "grad_norm": 0.10756181180477142, "learning_rate": 0.00019992824135041005, "loss": 0.1082, "step": 1024 }, { "epoch": 0.0661979797979798, "eval_bleu": 15.96788675592142, "eval_loss": 0.10119746625423431, "eval_runtime": 2.7218, "eval_samples_per_second": 11.757, "eval_steps_per_second": 1.47, "step": 1024 }, { "epoch": 0.06626262626262626, "grad_norm": 0.09936445951461792, "learning_rate": 0.00019992798209123486, "loss": 0.0895, "step": 1025 }, { "epoch": 0.06632727272727272, "grad_norm": 0.1018339991569519, "learning_rate": 0.00019992772236472955, "loss": 0.1046, "step": 1026 }, { "epoch": 0.0663919191919192, "grad_norm": 0.10769277065992355, "learning_rate": 0.00019992746217089536, "loss": 0.1102, "step": 1027 }, { "epoch": 0.06645656565656566, "grad_norm": 0.12098774313926697, "learning_rate": 0.0001999272015097334, "loss": 0.0921, "step": 1028 }, { "epoch": 0.06652121212121212, "grad_norm": 0.12512466311454773, "learning_rate": 0.000199926940381245, "loss": 0.1399, "step": 1029 }, { "epoch": 0.06658585858585858, "grad_norm": 0.10815372318029404, "learning_rate": 0.0001999266787854313, "loss": 0.107, "step": 1030 }, { "epoch": 0.06665050505050506, "grad_norm": 0.09212598949670792, "learning_rate": 0.0001999264167222936, "loss": 0.0812, "step": 1031 }, { "epoch": 0.06671515151515152, "grad_norm": 0.11654563993215561, "learning_rate": 0.00019992615419183306, "loss": 0.1214, "step": 1032 }, { "epoch": 0.06677979797979798, "grad_norm": 0.11384245753288269, "learning_rate": 0.00019992589119405092, "loss": 0.0962, "step": 1033 }, { "epoch": 0.06684444444444444, "grad_norm": 0.10957711935043335, "learning_rate": 0.00019992562772894843, "loss": 0.0967, "step": 1034 }, { "epoch": 0.0669090909090909, "grad_norm": 0.12031755596399307, "learning_rate": 0.00019992536379652683, "loss": 0.108, "step": 1035 }, { "epoch": 0.06697373737373738, "grad_norm": 0.10495950281620026, "learning_rate": 0.00019992509939678734, "loss": 0.103, 
"step": 1036 }, { "epoch": 0.06703838383838384, "grad_norm": 0.10424365103244781, "learning_rate": 0.00019992483452973116, "loss": 0.0944, "step": 1037 }, { "epoch": 0.0671030303030303, "grad_norm": 0.09796426445245743, "learning_rate": 0.0001999245691953596, "loss": 0.0886, "step": 1038 }, { "epoch": 0.06716767676767676, "grad_norm": 0.11019867658615112, "learning_rate": 0.00019992430339367382, "loss": 0.0839, "step": 1039 }, { "epoch": 0.06723232323232323, "grad_norm": 0.1157185360789299, "learning_rate": 0.00019992403712467517, "loss": 0.1119, "step": 1040 }, { "epoch": 0.06723232323232323, "eval_bleu": 12.040628198130733, "eval_loss": 0.1027548760175705, "eval_runtime": 2.7972, "eval_samples_per_second": 11.44, "eval_steps_per_second": 1.43, "step": 1040 }, { "epoch": 0.0672969696969697, "grad_norm": 0.09878209233283997, "learning_rate": 0.00019992377038836477, "loss": 0.0923, "step": 1041 }, { "epoch": 0.06736161616161616, "grad_norm": 0.13967718183994293, "learning_rate": 0.00019992350318474397, "loss": 0.1151, "step": 1042 }, { "epoch": 0.06742626262626263, "grad_norm": 0.13182571530342102, "learning_rate": 0.00019992323551381398, "loss": 0.1015, "step": 1043 }, { "epoch": 0.06749090909090909, "grad_norm": 0.10362154245376587, "learning_rate": 0.00019992296737557602, "loss": 0.096, "step": 1044 }, { "epoch": 0.06755555555555555, "grad_norm": 0.11503683775663376, "learning_rate": 0.00019992269877003142, "loss": 0.1191, "step": 1045 }, { "epoch": 0.06762020202020202, "grad_norm": 0.100889191031456, "learning_rate": 0.00019992242969718135, "loss": 0.095, "step": 1046 }, { "epoch": 0.06768484848484849, "grad_norm": 0.1143028512597084, "learning_rate": 0.00019992216015702713, "loss": 0.1063, "step": 1047 }, { "epoch": 0.06774949494949495, "grad_norm": 0.113642618060112, "learning_rate": 0.00019992189014957, "loss": 0.114, "step": 1048 }, { "epoch": 0.06781414141414141, "grad_norm": 0.09080003947019577, "learning_rate": 0.00019992161967481123, "loss": 0.0842, 
"step": 1049 }, { "epoch": 0.06787878787878789, "grad_norm": 0.10620587319135666, "learning_rate": 0.00019992134873275208, "loss": 0.1081, "step": 1050 }, { "epoch": 0.06794343434343435, "grad_norm": 0.10006501525640488, "learning_rate": 0.0001999210773233938, "loss": 0.1003, "step": 1051 }, { "epoch": 0.06800808080808081, "grad_norm": 0.09275925904512405, "learning_rate": 0.00019992080544673768, "loss": 0.0981, "step": 1052 }, { "epoch": 0.06807272727272727, "grad_norm": 0.08958086371421814, "learning_rate": 0.00019992053310278498, "loss": 0.0919, "step": 1053 }, { "epoch": 0.06813737373737373, "grad_norm": 0.09127036482095718, "learning_rate": 0.000199920260291537, "loss": 0.0906, "step": 1054 }, { "epoch": 0.06820202020202021, "grad_norm": 0.11914127320051193, "learning_rate": 0.00019991998701299497, "loss": 0.1418, "step": 1055 }, { "epoch": 0.06826666666666667, "grad_norm": 0.0990588590502739, "learning_rate": 0.0001999197132671602, "loss": 0.0944, "step": 1056 }, { "epoch": 0.06826666666666667, "eval_bleu": 13.203528345008845, "eval_loss": 0.10272073745727539, "eval_runtime": 2.7338, "eval_samples_per_second": 11.705, "eval_steps_per_second": 1.463, "step": 1056 }, { "epoch": 0.06833131313131313, "grad_norm": 0.10000976175069809, "learning_rate": 0.000199919439054034, "loss": 0.0847, "step": 1057 }, { "epoch": 0.0683959595959596, "grad_norm": 0.09563630074262619, "learning_rate": 0.00019991916437361756, "loss": 0.0933, "step": 1058 }, { "epoch": 0.06846060606060606, "grad_norm": 0.09445653855800629, "learning_rate": 0.00019991888922591224, "loss": 0.0866, "step": 1059 }, { "epoch": 0.06852525252525253, "grad_norm": 0.10733193904161453, "learning_rate": 0.00019991861361091934, "loss": 0.1126, "step": 1060 }, { "epoch": 0.06858989898989899, "grad_norm": 0.10559269785881042, "learning_rate": 0.00019991833752864007, "loss": 0.1137, "step": 1061 }, { "epoch": 0.06865454545454545, "grad_norm": 0.09451444447040558, "learning_rate": 0.00019991806097907579, "loss": 
0.1016, "step": 1062 }, { "epoch": 0.06871919191919192, "grad_norm": 0.09219640493392944, "learning_rate": 0.00019991778396222777, "loss": 0.1019, "step": 1063 }, { "epoch": 0.06878383838383838, "grad_norm": 0.10495129227638245, "learning_rate": 0.00019991750647809729, "loss": 0.1007, "step": 1064 }, { "epoch": 0.06884848484848485, "grad_norm": 0.10733859241008759, "learning_rate": 0.0001999172285266857, "loss": 0.0973, "step": 1065 }, { "epoch": 0.06891313131313132, "grad_norm": 0.15368856489658356, "learning_rate": 0.00019991695010799422, "loss": 0.0881, "step": 1066 }, { "epoch": 0.06897777777777778, "grad_norm": 0.10988730937242508, "learning_rate": 0.0001999166712220242, "loss": 0.116, "step": 1067 }, { "epoch": 0.06904242424242424, "grad_norm": 0.10612314194440842, "learning_rate": 0.00019991639186877697, "loss": 0.1007, "step": 1068 }, { "epoch": 0.0691070707070707, "grad_norm": 0.10792101919651031, "learning_rate": 0.00019991611204825383, "loss": 0.1067, "step": 1069 }, { "epoch": 0.06917171717171718, "grad_norm": 0.09953682869672775, "learning_rate": 0.000199915831760456, "loss": 0.1052, "step": 1070 }, { "epoch": 0.06923636363636364, "grad_norm": 0.09427595883607864, "learning_rate": 0.0001999155510053849, "loss": 0.0935, "step": 1071 }, { "epoch": 0.0693010101010101, "grad_norm": 0.11791904270648956, "learning_rate": 0.0001999152697830418, "loss": 0.1209, "step": 1072 }, { "epoch": 0.0693010101010101, "eval_bleu": 12.398441773724599, "eval_loss": 0.10355356335639954, "eval_runtime": 2.7704, "eval_samples_per_second": 11.551, "eval_steps_per_second": 1.444, "step": 1072 }, { "epoch": 0.06936565656565656, "grad_norm": 0.1010703518986702, "learning_rate": 0.000199914988093428, "loss": 0.1115, "step": 1073 }, { "epoch": 0.06943030303030304, "grad_norm": 0.19438670575618744, "learning_rate": 0.00019991470593654485, "loss": 0.1419, "step": 1074 }, { "epoch": 0.0694949494949495, "grad_norm": 0.10723185539245605, "learning_rate": 0.00019991442331239364, "loss": 
0.115, "step": 1075 }, { "epoch": 0.06955959595959596, "grad_norm": 0.10031235963106155, "learning_rate": 0.0001999141402209757, "loss": 0.0865, "step": 1076 }, { "epoch": 0.06962424242424242, "grad_norm": 0.11666223406791687, "learning_rate": 0.00019991385666229234, "loss": 0.1149, "step": 1077 }, { "epoch": 0.06968888888888888, "grad_norm": 0.10806740820407867, "learning_rate": 0.00019991357263634494, "loss": 0.098, "step": 1078 }, { "epoch": 0.06975353535353536, "grad_norm": 0.11003516614437103, "learning_rate": 0.00019991328814313475, "loss": 0.0887, "step": 1079 }, { "epoch": 0.06981818181818182, "grad_norm": 0.10464249551296234, "learning_rate": 0.00019991300318266317, "loss": 0.1023, "step": 1080 }, { "epoch": 0.06988282828282828, "grad_norm": 0.14229588210582733, "learning_rate": 0.0001999127177549315, "loss": 0.1038, "step": 1081 }, { "epoch": 0.06994747474747474, "grad_norm": 0.11465063691139221, "learning_rate": 0.00019991243185994107, "loss": 0.1041, "step": 1082 }, { "epoch": 0.0700121212121212, "grad_norm": 0.09853720664978027, "learning_rate": 0.00019991214549769325, "loss": 0.0836, "step": 1083 }, { "epoch": 0.07007676767676768, "grad_norm": 0.12108904123306274, "learning_rate": 0.00019991185866818933, "loss": 0.1054, "step": 1084 }, { "epoch": 0.07014141414141414, "grad_norm": 0.11559389531612396, "learning_rate": 0.0001999115713714307, "loss": 0.1064, "step": 1085 }, { "epoch": 0.0702060606060606, "grad_norm": 0.10156375914812088, "learning_rate": 0.00019991128360741865, "loss": 0.0898, "step": 1086 }, { "epoch": 0.07027070707070707, "grad_norm": 0.10436543822288513, "learning_rate": 0.00019991099537615458, "loss": 0.0991, "step": 1087 }, { "epoch": 0.07033535353535353, "grad_norm": 0.11197343468666077, "learning_rate": 0.00019991070667763983, "loss": 0.0948, "step": 1088 }, { "epoch": 0.07033535353535353, "eval_bleu": 16.438974162564314, "eval_loss": 0.10288131982088089, "eval_runtime": 2.5554, "eval_samples_per_second": 12.523, 
"eval_steps_per_second": 1.565, "step": 1088 }, { "epoch": 0.0704, "grad_norm": 0.09361731261014938, "learning_rate": 0.0001999104175118757, "loss": 0.0855, "step": 1089 }, { "epoch": 0.07046464646464647, "grad_norm": 0.09633186459541321, "learning_rate": 0.0001999101278788636, "loss": 0.0957, "step": 1090 }, { "epoch": 0.07052929292929293, "grad_norm": 0.10059010982513428, "learning_rate": 0.00019990983777860483, "loss": 0.1068, "step": 1091 }, { "epoch": 0.07059393939393939, "grad_norm": 0.0951557457447052, "learning_rate": 0.00019990954721110083, "loss": 0.0974, "step": 1092 }, { "epoch": 0.07065858585858587, "grad_norm": 0.1154366135597229, "learning_rate": 0.00019990925617635288, "loss": 0.0958, "step": 1093 }, { "epoch": 0.07072323232323233, "grad_norm": 0.16366399824619293, "learning_rate": 0.00019990896467436236, "loss": 0.1013, "step": 1094 }, { "epoch": 0.07078787878787879, "grad_norm": 0.10861239582300186, "learning_rate": 0.00019990867270513064, "loss": 0.1055, "step": 1095 }, { "epoch": 0.07085252525252525, "grad_norm": 0.0996587797999382, "learning_rate": 0.0001999083802686591, "loss": 0.0875, "step": 1096 }, { "epoch": 0.07091717171717171, "grad_norm": 0.27355629205703735, "learning_rate": 0.0001999080873649491, "loss": 0.1403, "step": 1097 }, { "epoch": 0.07098181818181819, "grad_norm": 0.10840930044651031, "learning_rate": 0.00019990779399400198, "loss": 0.103, "step": 1098 }, { "epoch": 0.07104646464646465, "grad_norm": 0.10754979401826859, "learning_rate": 0.00019990750015581914, "loss": 0.1061, "step": 1099 }, { "epoch": 0.07111111111111111, "grad_norm": 0.10941679775714874, "learning_rate": 0.00019990720585040197, "loss": 0.1095, "step": 1100 }, { "epoch": 0.07117575757575757, "grad_norm": 0.09383228421211243, "learning_rate": 0.00019990691107775184, "loss": 0.1, "step": 1101 }, { "epoch": 0.07124040404040403, "grad_norm": 0.10095507651567459, "learning_rate": 0.00019990661583787006, "loss": 0.0958, "step": 1102 }, { "epoch": 
0.07130505050505051, "grad_norm": 0.1256381869316101, "learning_rate": 0.0001999063201307581, "loss": 0.1067, "step": 1103 }, { "epoch": 0.07136969696969697, "grad_norm": 0.09743168950080872, "learning_rate": 0.0001999060239564173, "loss": 0.0973, "step": 1104 }, { "epoch": 0.07136969696969697, "eval_bleu": 14.919981827351906, "eval_loss": 0.10219694674015045, "eval_runtime": 2.7215, "eval_samples_per_second": 11.758, "eval_steps_per_second": 1.47, "step": 1104 }, { "epoch": 0.07143434343434343, "grad_norm": 0.09388572722673416, "learning_rate": 0.00019990572731484909, "loss": 0.0963, "step": 1105 }, { "epoch": 0.0714989898989899, "grad_norm": 0.08546385169029236, "learning_rate": 0.00019990543020605479, "loss": 0.0934, "step": 1106 }, { "epoch": 0.07156363636363636, "grad_norm": 0.09720250219106674, "learning_rate": 0.00019990513263003584, "loss": 0.0939, "step": 1107 }, { "epoch": 0.07162828282828283, "grad_norm": 0.1317378431558609, "learning_rate": 0.0001999048345867936, "loss": 0.1225, "step": 1108 }, { "epoch": 0.0716929292929293, "grad_norm": 0.08493581414222717, "learning_rate": 0.00019990453607632947, "loss": 0.0835, "step": 1109 }, { "epoch": 0.07175757575757576, "grad_norm": 0.11314329504966736, "learning_rate": 0.00019990423709864485, "loss": 0.1107, "step": 1110 }, { "epoch": 0.07182222222222222, "grad_norm": 0.08751076459884644, "learning_rate": 0.00019990393765374114, "loss": 0.0776, "step": 1111 }, { "epoch": 0.0718868686868687, "grad_norm": 0.11137157678604126, "learning_rate": 0.00019990363774161974, "loss": 0.1002, "step": 1112 }, { "epoch": 0.07195151515151516, "grad_norm": 0.11782591044902802, "learning_rate": 0.00019990333736228207, "loss": 0.0975, "step": 1113 }, { "epoch": 0.07201616161616162, "grad_norm": 0.10530432313680649, "learning_rate": 0.00019990303651572951, "loss": 0.091, "step": 1114 }, { "epoch": 0.07208080808080808, "grad_norm": 0.11624302715063095, "learning_rate": 0.00019990273520196348, "loss": 0.1116, "step": 1115 }, { 
"epoch": 0.07214545454545454, "grad_norm": 0.10502476990222931, "learning_rate": 0.0001999024334209854, "loss": 0.0945, "step": 1116 }, { "epoch": 0.07221010101010102, "grad_norm": 0.11220627278089523, "learning_rate": 0.00019990213117279665, "loss": 0.1011, "step": 1117 }, { "epoch": 0.07227474747474748, "grad_norm": 0.1221824437379837, "learning_rate": 0.00019990182845739865, "loss": 0.1201, "step": 1118 }, { "epoch": 0.07233939393939394, "grad_norm": 0.10572541505098343, "learning_rate": 0.00019990152527479284, "loss": 0.1233, "step": 1119 }, { "epoch": 0.0724040404040404, "grad_norm": 0.10655274987220764, "learning_rate": 0.00019990122162498062, "loss": 0.1308, "step": 1120 }, { "epoch": 0.0724040404040404, "eval_bleu": 13.866672753758753, "eval_loss": 0.10140915215015411, "eval_runtime": 2.6783, "eval_samples_per_second": 11.948, "eval_steps_per_second": 1.493, "step": 1120 }, { "epoch": 0.07246868686868686, "grad_norm": 0.08577724546194077, "learning_rate": 0.0001999009175079634, "loss": 0.0898, "step": 1121 }, { "epoch": 0.07253333333333334, "grad_norm": 0.08553668856620789, "learning_rate": 0.00019990061292374264, "loss": 0.0922, "step": 1122 }, { "epoch": 0.0725979797979798, "grad_norm": 0.09185883402824402, "learning_rate": 0.0001999003078723197, "loss": 0.0968, "step": 1123 }, { "epoch": 0.07266262626262626, "grad_norm": 0.10707148909568787, "learning_rate": 0.00019990000235369605, "loss": 0.1122, "step": 1124 }, { "epoch": 0.07272727272727272, "grad_norm": 0.09191740304231644, "learning_rate": 0.00019989969636787316, "loss": 0.1019, "step": 1125 }, { "epoch": 0.07279191919191919, "grad_norm": 0.08890045434236526, "learning_rate": 0.00019989938991485238, "loss": 0.0877, "step": 1126 }, { "epoch": 0.07285656565656566, "grad_norm": 0.11036644130945206, "learning_rate": 0.0001998990829946352, "loss": 0.108, "step": 1127 }, { "epoch": 0.07292121212121212, "grad_norm": 0.1215878278017044, "learning_rate": 0.00019989877560722304, "loss": 0.1214, "step": 1128 
}, { "epoch": 0.07298585858585858, "grad_norm": 0.11315099149942398, "learning_rate": 0.00019989846775261732, "loss": 0.1155, "step": 1129 }, { "epoch": 0.07305050505050505, "grad_norm": 0.10078810155391693, "learning_rate": 0.0001998981594308195, "loss": 0.0859, "step": 1130 }, { "epoch": 0.07311515151515152, "grad_norm": 0.12651541829109192, "learning_rate": 0.00019989785064183097, "loss": 0.1133, "step": 1131 }, { "epoch": 0.07317979797979798, "grad_norm": 0.13573110103607178, "learning_rate": 0.00019989754138565325, "loss": 0.1033, "step": 1132 }, { "epoch": 0.07324444444444445, "grad_norm": 0.10439104586839676, "learning_rate": 0.00019989723166228778, "loss": 0.103, "step": 1133 }, { "epoch": 0.07330909090909091, "grad_norm": 0.12181776016950607, "learning_rate": 0.00019989692147173596, "loss": 0.1228, "step": 1134 }, { "epoch": 0.07337373737373737, "grad_norm": 0.09804438054561615, "learning_rate": 0.00019989661081399926, "loss": 0.0918, "step": 1135 }, { "epoch": 0.07343838383838384, "grad_norm": 0.11062714457511902, "learning_rate": 0.00019989629968907913, "loss": 0.1051, "step": 1136 }, { "epoch": 0.07343838383838384, "eval_bleu": 13.482048021626605, "eval_loss": 0.10033686459064484, "eval_runtime": 2.772, "eval_samples_per_second": 11.544, "eval_steps_per_second": 1.443, "step": 1136 }, { "epoch": 0.0735030303030303, "grad_norm": 0.10200586915016174, "learning_rate": 0.00019989598809697705, "loss": 0.1001, "step": 1137 }, { "epoch": 0.07356767676767677, "grad_norm": 0.09004790335893631, "learning_rate": 0.00019989567603769445, "loss": 0.0858, "step": 1138 }, { "epoch": 0.07363232323232323, "grad_norm": 0.09158658981323242, "learning_rate": 0.0001998953635112328, "loss": 0.0987, "step": 1139 }, { "epoch": 0.07369696969696969, "grad_norm": 0.09695807844400406, "learning_rate": 0.00019989505051759356, "loss": 0.1097, "step": 1140 }, { "epoch": 0.07376161616161617, "grad_norm": 0.09070958197116852, "learning_rate": 0.00019989473705677818, "loss": 0.094, 
"step": 1141 }, { "epoch": 0.07382626262626263, "grad_norm": 0.1058708131313324, "learning_rate": 0.00019989442312878816, "loss": 0.0928, "step": 1142 }, { "epoch": 0.07389090909090909, "grad_norm": 0.1012599840760231, "learning_rate": 0.00019989410873362494, "loss": 0.091, "step": 1143 }, { "epoch": 0.07395555555555555, "grad_norm": 0.10348769277334213, "learning_rate": 0.00019989379387129, "loss": 0.0966, "step": 1144 }, { "epoch": 0.07402020202020201, "grad_norm": 0.08224254846572876, "learning_rate": 0.0001998934785417848, "loss": 0.0869, "step": 1145 }, { "epoch": 0.07408484848484849, "grad_norm": 0.12119830399751663, "learning_rate": 0.00019989316274511082, "loss": 0.1277, "step": 1146 }, { "epoch": 0.07414949494949495, "grad_norm": 0.14378836750984192, "learning_rate": 0.0001998928464812696, "loss": 0.1247, "step": 1147 }, { "epoch": 0.07421414141414141, "grad_norm": 0.08215869963169098, "learning_rate": 0.00019989252975026247, "loss": 0.0809, "step": 1148 }, { "epoch": 0.07427878787878787, "grad_norm": 0.1078762337565422, "learning_rate": 0.00019989221255209104, "loss": 0.1205, "step": 1149 }, { "epoch": 0.07434343434343435, "grad_norm": 0.10174203664064407, "learning_rate": 0.0001998918948867568, "loss": 0.1066, "step": 1150 }, { "epoch": 0.07440808080808081, "grad_norm": 0.12836866080760956, "learning_rate": 0.00019989157675426113, "loss": 0.1303, "step": 1151 }, { "epoch": 0.07447272727272727, "grad_norm": 0.10035769641399384, "learning_rate": 0.0001998912581546056, "loss": 0.106, "step": 1152 }, { "epoch": 0.07447272727272727, "eval_bleu": 15.257446961470782, "eval_loss": 0.10000753402709961, "eval_runtime": 2.649, "eval_samples_per_second": 12.08, "eval_steps_per_second": 1.51, "step": 1152 }, { "epoch": 0.07453737373737374, "grad_norm": 0.10295552015304565, "learning_rate": 0.00019989093908779167, "loss": 0.1117, "step": 1153 }, { "epoch": 0.0746020202020202, "grad_norm": 0.10999424755573273, "learning_rate": 0.00019989061955382086, "loss": 0.0892, 
"step": 1154 }, { "epoch": 0.07466666666666667, "grad_norm": 0.13196702301502228, "learning_rate": 0.00019989029955269465, "loss": 0.1094, "step": 1155 }, { "epoch": 0.07473131313131313, "grad_norm": 0.09767697006464005, "learning_rate": 0.0001998899790844145, "loss": 0.1097, "step": 1156 }, { "epoch": 0.0747959595959596, "grad_norm": 0.10190394520759583, "learning_rate": 0.00019988965814898198, "loss": 0.0882, "step": 1157 }, { "epoch": 0.07486060606060606, "grad_norm": 0.10192656517028809, "learning_rate": 0.00019988933674639853, "loss": 0.1029, "step": 1158 }, { "epoch": 0.07492525252525252, "grad_norm": 0.10996544361114502, "learning_rate": 0.00019988901487666568, "loss": 0.1043, "step": 1159 }, { "epoch": 0.074989898989899, "grad_norm": 0.11330875009298325, "learning_rate": 0.00019988869253978494, "loss": 0.1029, "step": 1160 }, { "epoch": 0.07505454545454546, "grad_norm": 0.11957574635744095, "learning_rate": 0.00019988836973575778, "loss": 0.103, "step": 1161 }, { "epoch": 0.07511919191919192, "grad_norm": 0.1305283159017563, "learning_rate": 0.00019988804646458577, "loss": 0.0986, "step": 1162 }, { "epoch": 0.07518383838383838, "grad_norm": 0.10644561052322388, "learning_rate": 0.00019988772272627037, "loss": 0.1002, "step": 1163 }, { "epoch": 0.07524848484848484, "grad_norm": 0.09223178774118423, "learning_rate": 0.00019988739852081308, "loss": 0.0899, "step": 1164 }, { "epoch": 0.07531313131313132, "grad_norm": 0.11759480088949203, "learning_rate": 0.0001998870738482155, "loss": 0.1, "step": 1165 }, { "epoch": 0.07537777777777778, "grad_norm": 0.08548406511545181, "learning_rate": 0.0001998867487084791, "loss": 0.0872, "step": 1166 }, { "epoch": 0.07544242424242424, "grad_norm": 0.10207363218069077, "learning_rate": 0.00019988642310160538, "loss": 0.1015, "step": 1167 }, { "epoch": 0.0755070707070707, "grad_norm": 0.11605361104011536, "learning_rate": 0.00019988609702759587, "loss": 0.1045, "step": 1168 }, { "epoch": 0.0755070707070707, "eval_bleu": 
11.914871423195398, "eval_loss": 0.09854108095169067, "eval_runtime": 2.7721, "eval_samples_per_second": 11.544, "eval_steps_per_second": 1.443, "step": 1168 }, { "epoch": 0.07557171717171716, "grad_norm": 0.12160315364599228, "learning_rate": 0.0001998857704864521, "loss": 0.1128, "step": 1169 }, { "epoch": 0.07563636363636364, "grad_norm": 0.08649091422557831, "learning_rate": 0.0001998854434781756, "loss": 0.0992, "step": 1170 }, { "epoch": 0.0757010101010101, "grad_norm": 0.09241735190153122, "learning_rate": 0.00019988511600276793, "loss": 0.0984, "step": 1171 }, { "epoch": 0.07576565656565656, "grad_norm": 0.09227190166711807, "learning_rate": 0.0001998847880602306, "loss": 0.1002, "step": 1172 }, { "epoch": 0.07583030303030303, "grad_norm": 0.09272097796201706, "learning_rate": 0.0001998844596505651, "loss": 0.0951, "step": 1173 }, { "epoch": 0.0758949494949495, "grad_norm": 0.09505018591880798, "learning_rate": 0.00019988413077377305, "loss": 0.1011, "step": 1174 }, { "epoch": 0.07595959595959596, "grad_norm": 0.08822530508041382, "learning_rate": 0.00019988380142985592, "loss": 0.0968, "step": 1175 }, { "epoch": 0.07602424242424242, "grad_norm": 0.12706732749938965, "learning_rate": 0.0001998834716188153, "loss": 0.1065, "step": 1176 }, { "epoch": 0.07608888888888889, "grad_norm": 0.09266221523284912, "learning_rate": 0.00019988314134065265, "loss": 0.097, "step": 1177 }, { "epoch": 0.07615353535353535, "grad_norm": 0.09255720674991608, "learning_rate": 0.00019988281059536958, "loss": 0.0822, "step": 1178 }, { "epoch": 0.07621818181818182, "grad_norm": 0.09482455253601074, "learning_rate": 0.00019988247938296764, "loss": 0.0998, "step": 1179 }, { "epoch": 0.07628282828282829, "grad_norm": 0.09541239589452744, "learning_rate": 0.00019988214770344834, "loss": 0.1011, "step": 1180 }, { "epoch": 0.07634747474747475, "grad_norm": 0.10172248631715775, "learning_rate": 0.0001998818155568133, "loss": 0.0932, "step": 1181 }, { "epoch": 0.07641212121212121, 
"grad_norm": 0.0817946046590805, "learning_rate": 0.00019988148294306402, "loss": 0.0823, "step": 1182 }, { "epoch": 0.07647676767676767, "grad_norm": 0.10149791091680527, "learning_rate": 0.00019988114986220205, "loss": 0.0867, "step": 1183 }, { "epoch": 0.07654141414141415, "grad_norm": 0.10169646888971329, "learning_rate": 0.00019988081631422896, "loss": 0.1007, "step": 1184 }, { "epoch": 0.07654141414141415, "eval_bleu": 13.500695208523451, "eval_loss": 0.10087230801582336, "eval_runtime": 2.5616, "eval_samples_per_second": 12.492, "eval_steps_per_second": 1.562, "step": 1184 }, { "epoch": 0.07660606060606061, "grad_norm": 0.0874757468700409, "learning_rate": 0.0001998804822991463, "loss": 0.0897, "step": 1185 }, { "epoch": 0.07667070707070707, "grad_norm": 0.0955202654004097, "learning_rate": 0.0001998801478169557, "loss": 0.0965, "step": 1186 }, { "epoch": 0.07673535353535353, "grad_norm": 0.09903624653816223, "learning_rate": 0.0001998798128676586, "loss": 0.099, "step": 1187 }, { "epoch": 0.0768, "grad_norm": 0.09248454123735428, "learning_rate": 0.00019987947745125665, "loss": 0.0802, "step": 1188 }, { "epoch": 0.07686464646464647, "grad_norm": 0.10043051838874817, "learning_rate": 0.00019987914156775142, "loss": 0.0948, "step": 1189 }, { "epoch": 0.07692929292929293, "grad_norm": 0.10855479538440704, "learning_rate": 0.0001998788052171445, "loss": 0.1128, "step": 1190 }, { "epoch": 0.07699393939393939, "grad_norm": 0.09830132126808167, "learning_rate": 0.00019987846839943736, "loss": 0.1057, "step": 1191 }, { "epoch": 0.07705858585858585, "grad_norm": 0.09701931476593018, "learning_rate": 0.00019987813111463164, "loss": 0.0995, "step": 1192 }, { "epoch": 0.07712323232323233, "grad_norm": 0.10256762057542801, "learning_rate": 0.00019987779336272897, "loss": 0.1072, "step": 1193 }, { "epoch": 0.07718787878787879, "grad_norm": 0.1158410981297493, "learning_rate": 0.00019987745514373086, "loss": 0.1247, "step": 1194 }, { "epoch": 0.07725252525252525, 
"grad_norm": 0.10586290806531906, "learning_rate": 0.0001998771164576389, "loss": 0.1074, "step": 1195 }, { "epoch": 0.07731717171717171, "grad_norm": 0.11251550912857056, "learning_rate": 0.0001998767773044547, "loss": 0.1164, "step": 1196 }, { "epoch": 0.07738181818181818, "grad_norm": 0.09646680951118469, "learning_rate": 0.00019987643768417984, "loss": 0.0859, "step": 1197 }, { "epoch": 0.07744646464646465, "grad_norm": 0.09115240722894669, "learning_rate": 0.00019987609759681586, "loss": 0.0931, "step": 1198 }, { "epoch": 0.07751111111111111, "grad_norm": 0.09376112371683121, "learning_rate": 0.00019987575704236442, "loss": 0.0883, "step": 1199 }, { "epoch": 0.07757575757575758, "grad_norm": 0.11306974291801453, "learning_rate": 0.0001998754160208271, "loss": 0.1093, "step": 1200 }, { "epoch": 0.07757575757575758, "eval_bleu": 12.277632424523997, "eval_loss": 0.10065107047557831, "eval_runtime": 2.7447, "eval_samples_per_second": 11.659, "eval_steps_per_second": 1.457, "step": 1200 }, { "epoch": 0.07764040404040404, "grad_norm": 0.09991087764501572, "learning_rate": 0.00019987507453220546, "loss": 0.1045, "step": 1201 }, { "epoch": 0.0777050505050505, "grad_norm": 0.0946974903345108, "learning_rate": 0.00019987473257650112, "loss": 0.0932, "step": 1202 }, { "epoch": 0.07776969696969697, "grad_norm": 0.10396780073642731, "learning_rate": 0.00019987439015371565, "loss": 0.1, "step": 1203 }, { "epoch": 0.07783434343434344, "grad_norm": 0.11985617130994797, "learning_rate": 0.0001998740472638507, "loss": 0.1178, "step": 1204 }, { "epoch": 0.0778989898989899, "grad_norm": 0.10046354681253433, "learning_rate": 0.00019987370390690782, "loss": 0.0969, "step": 1205 }, { "epoch": 0.07796363636363636, "grad_norm": 0.1067626103758812, "learning_rate": 0.00019987336008288868, "loss": 0.0967, "step": 1206 }, { "epoch": 0.07802828282828282, "grad_norm": 0.11509537696838379, "learning_rate": 0.00019987301579179484, "loss": 0.1144, "step": 1207 }, { "epoch": 
0.0780929292929293, "grad_norm": 0.10435677319765091, "learning_rate": 0.0001998726710336279, "loss": 0.1082, "step": 1208 }, { "epoch": 0.07815757575757576, "grad_norm": 0.10428476333618164, "learning_rate": 0.00019987232580838952, "loss": 0.1084, "step": 1209 }, { "epoch": 0.07822222222222222, "grad_norm": 0.1169833317399025, "learning_rate": 0.00019987198011608127, "loss": 0.1082, "step": 1210 }, { "epoch": 0.07828686868686868, "grad_norm": 0.11864268779754639, "learning_rate": 0.0001998716339567048, "loss": 0.1157, "step": 1211 }, { "epoch": 0.07835151515151516, "grad_norm": 0.11021723598241806, "learning_rate": 0.00019987128733026173, "loss": 0.1081, "step": 1212 }, { "epoch": 0.07841616161616162, "grad_norm": 0.10279525816440582, "learning_rate": 0.00019987094023675363, "loss": 0.1036, "step": 1213 }, { "epoch": 0.07848080808080808, "grad_norm": 0.10111741721630096, "learning_rate": 0.0001998705926761822, "loss": 0.0902, "step": 1214 }, { "epoch": 0.07854545454545454, "grad_norm": 0.09448467940092087, "learning_rate": 0.00019987024464854897, "loss": 0.0966, "step": 1215 }, { "epoch": 0.078610101010101, "grad_norm": 0.09504568576812744, "learning_rate": 0.00019986989615385567, "loss": 0.1017, "step": 1216 }, { "epoch": 0.078610101010101, "eval_bleu": 14.99782490957994, "eval_loss": 0.0987180843949318, "eval_runtime": 2.6258, "eval_samples_per_second": 12.187, "eval_steps_per_second": 1.523, "step": 1216 }, { "epoch": 0.07867474747474748, "grad_norm": 0.09722236543893814, "learning_rate": 0.00019986954719210386, "loss": 0.0873, "step": 1217 }, { "epoch": 0.07873939393939394, "grad_norm": 0.1531490683555603, "learning_rate": 0.0001998691977632952, "loss": 0.0999, "step": 1218 }, { "epoch": 0.0788040404040404, "grad_norm": 0.11008467525243759, "learning_rate": 0.0001998688478674313, "loss": 0.1231, "step": 1219 }, { "epoch": 0.07886868686868687, "grad_norm": 0.10244037955999374, "learning_rate": 0.00019986849750451387, "loss": 0.1175, "step": 1220 }, { "epoch": 
0.07893333333333333, "grad_norm": 0.09780837595462799, "learning_rate": 0.00019986814667454446, "loss": 0.106, "step": 1221 }, { "epoch": 0.0789979797979798, "grad_norm": 0.09185977280139923, "learning_rate": 0.00019986779537752472, "loss": 0.0999, "step": 1222 }, { "epoch": 0.07906262626262626, "grad_norm": 0.10435672104358673, "learning_rate": 0.00019986744361345636, "loss": 0.1092, "step": 1223 }, { "epoch": 0.07912727272727273, "grad_norm": 0.09911638498306274, "learning_rate": 0.00019986709138234093, "loss": 0.092, "step": 1224 }, { "epoch": 0.07919191919191919, "grad_norm": 0.08804962784051895, "learning_rate": 0.00019986673868418016, "loss": 0.0915, "step": 1225 }, { "epoch": 0.07925656565656565, "grad_norm": 0.09933444112539291, "learning_rate": 0.00019986638551897567, "loss": 0.1046, "step": 1226 }, { "epoch": 0.07932121212121213, "grad_norm": 0.10138771682977676, "learning_rate": 0.0001998660318867291, "loss": 0.0963, "step": 1227 }, { "epoch": 0.07938585858585859, "grad_norm": 0.1047845110297203, "learning_rate": 0.00019986567778744214, "loss": 0.1146, "step": 1228 }, { "epoch": 0.07945050505050505, "grad_norm": 0.1062421202659607, "learning_rate": 0.00019986532322111637, "loss": 0.0775, "step": 1229 }, { "epoch": 0.07951515151515151, "grad_norm": 0.09797623008489609, "learning_rate": 0.00019986496818775353, "loss": 0.087, "step": 1230 }, { "epoch": 0.07957979797979799, "grad_norm": 0.11880829185247421, "learning_rate": 0.00019986461268735526, "loss": 0.1046, "step": 1231 }, { "epoch": 0.07964444444444445, "grad_norm": 0.10233789682388306, "learning_rate": 0.0001998642567199232, "loss": 0.101, "step": 1232 }, { "epoch": 0.07964444444444445, "eval_bleu": 14.30630858105872, "eval_loss": 0.09911099076271057, "eval_runtime": 2.6934, "eval_samples_per_second": 11.881, "eval_steps_per_second": 1.485, "step": 1232 }, { "epoch": 0.07970909090909091, "grad_norm": 0.11157096922397614, "learning_rate": 0.00019986390028545902, "loss": 0.0879, "step": 1233 }, { 
"epoch": 0.07977373737373737, "grad_norm": 0.0990234911441803, "learning_rate": 0.0001998635433839644, "loss": 0.0952, "step": 1234 }, { "epoch": 0.07983838383838383, "grad_norm": 0.12317034602165222, "learning_rate": 0.000199863186015441, "loss": 0.1093, "step": 1235 }, { "epoch": 0.07990303030303031, "grad_norm": 0.11035049706697464, "learning_rate": 0.00019986282817989048, "loss": 0.1136, "step": 1236 }, { "epoch": 0.07996767676767677, "grad_norm": 0.08728849142789841, "learning_rate": 0.00019986246987731455, "loss": 0.0916, "step": 1237 }, { "epoch": 0.08003232323232323, "grad_norm": 0.10465919971466064, "learning_rate": 0.00019986211110771486, "loss": 0.1043, "step": 1238 }, { "epoch": 0.0800969696969697, "grad_norm": 0.11658327281475067, "learning_rate": 0.00019986175187109307, "loss": 0.1197, "step": 1239 }, { "epoch": 0.08016161616161616, "grad_norm": 0.0924682766199112, "learning_rate": 0.0001998613921674509, "loss": 0.093, "step": 1240 }, { "epoch": 0.08022626262626263, "grad_norm": 0.09621009975671768, "learning_rate": 0.00019986103199679, "loss": 0.0873, "step": 1241 }, { "epoch": 0.0802909090909091, "grad_norm": 0.10269203037023544, "learning_rate": 0.00019986067135911205, "loss": 0.1012, "step": 1242 }, { "epoch": 0.08035555555555556, "grad_norm": 0.1048085018992424, "learning_rate": 0.00019986031025441878, "loss": 0.107, "step": 1243 }, { "epoch": 0.08042020202020202, "grad_norm": 0.11960818618535995, "learning_rate": 0.00019985994868271185, "loss": 0.1058, "step": 1244 }, { "epoch": 0.08048484848484848, "grad_norm": 0.1293594241142273, "learning_rate": 0.00019985958664399294, "loss": 0.1104, "step": 1245 }, { "epoch": 0.08054949494949495, "grad_norm": 0.10614243894815445, "learning_rate": 0.00019985922413826376, "loss": 0.1, "step": 1246 }, { "epoch": 0.08061414141414142, "grad_norm": 0.09893834590911865, "learning_rate": 0.000199858861165526, "loss": 0.0981, "step": 1247 }, { "epoch": 0.08067878787878788, "grad_norm": 0.08690205961465836, 
"learning_rate": 0.00019985849772578138, "loss": 0.0757, "step": 1248 }, { "epoch": 0.08067878787878788, "eval_bleu": 10.116267882407662, "eval_loss": 0.09995153546333313, "eval_runtime": 2.6797, "eval_samples_per_second": 11.941, "eval_steps_per_second": 1.493, "step": 1248 }, { "epoch": 0.08074343434343434, "grad_norm": 0.11978496611118317, "learning_rate": 0.00019985813381903156, "loss": 0.1203, "step": 1249 }, { "epoch": 0.08080808080808081, "grad_norm": 0.10559460520744324, "learning_rate": 0.00019985776944527825, "loss": 0.1063, "step": 1250 }, { "epoch": 0.08087272727272728, "grad_norm": 0.09509187936782837, "learning_rate": 0.00019985740460452318, "loss": 0.104, "step": 1251 }, { "epoch": 0.08093737373737374, "grad_norm": 0.10089214891195297, "learning_rate": 0.00019985703929676808, "loss": 0.1007, "step": 1252 }, { "epoch": 0.0810020202020202, "grad_norm": 0.10321296006441116, "learning_rate": 0.00019985667352201455, "loss": 0.1085, "step": 1253 }, { "epoch": 0.08106666666666666, "grad_norm": 0.08723357319831848, "learning_rate": 0.00019985630728026438, "loss": 0.0891, "step": 1254 }, { "epoch": 0.08113131313131314, "grad_norm": 0.09296173602342606, "learning_rate": 0.00019985594057151933, "loss": 0.1047, "step": 1255 }, { "epoch": 0.0811959595959596, "grad_norm": 0.08660700917243958, "learning_rate": 0.000199855573395781, "loss": 0.0883, "step": 1256 }, { "epoch": 0.08126060606060606, "grad_norm": 0.10300800949335098, "learning_rate": 0.00019985520575305118, "loss": 0.1201, "step": 1257 }, { "epoch": 0.08132525252525252, "grad_norm": 0.09489543735980988, "learning_rate": 0.00019985483764333158, "loss": 0.1009, "step": 1258 }, { "epoch": 0.08138989898989898, "grad_norm": 0.0899474024772644, "learning_rate": 0.00019985446906662394, "loss": 0.0959, "step": 1259 }, { "epoch": 0.08145454545454546, "grad_norm": 0.10913682729005814, "learning_rate": 0.00019985410002292992, "loss": 0.1086, "step": 1260 }, { "epoch": 0.08151919191919192, "grad_norm": 
0.09105635434389114, "learning_rate": 0.0001998537305122513, "loss": 0.109, "step": 1261 }, { "epoch": 0.08158383838383838, "grad_norm": 0.08313827961683273, "learning_rate": 0.00019985336053458978, "loss": 0.0854, "step": 1262 }, { "epoch": 0.08164848484848485, "grad_norm": 0.10245882719755173, "learning_rate": 0.0001998529900899471, "loss": 0.0954, "step": 1263 }, { "epoch": 0.08171313131313131, "grad_norm": 0.11280017346143723, "learning_rate": 0.00019985261917832502, "loss": 0.1153, "step": 1264 }, { "epoch": 0.08171313131313131, "eval_bleu": 11.527914224001549, "eval_loss": 0.09990820288658142, "eval_runtime": 2.766, "eval_samples_per_second": 11.569, "eval_steps_per_second": 1.446, "step": 1264 }, { "epoch": 0.08177777777777778, "grad_norm": 0.10616433620452881, "learning_rate": 0.00019985224779972525, "loss": 0.101, "step": 1265 }, { "epoch": 0.08184242424242424, "grad_norm": 0.10047123581171036, "learning_rate": 0.00019985187595414954, "loss": 0.1006, "step": 1266 }, { "epoch": 0.0819070707070707, "grad_norm": 0.1606845110654831, "learning_rate": 0.00019985150364159958, "loss": 0.1179, "step": 1267 }, { "epoch": 0.08197171717171717, "grad_norm": 0.0824814885854721, "learning_rate": 0.00019985113086207716, "loss": 0.0735, "step": 1268 }, { "epoch": 0.08203636363636363, "grad_norm": 0.10125270485877991, "learning_rate": 0.00019985075761558402, "loss": 0.1048, "step": 1269 }, { "epoch": 0.0821010101010101, "grad_norm": 0.09428473562002182, "learning_rate": 0.00019985038390212188, "loss": 0.105, "step": 1270 }, { "epoch": 0.08216565656565657, "grad_norm": 0.09866265207529068, "learning_rate": 0.00019985000972169253, "loss": 0.0863, "step": 1271 }, { "epoch": 0.08223030303030303, "grad_norm": 0.09355733543634415, "learning_rate": 0.0001998496350742977, "loss": 0.0871, "step": 1272 }, { "epoch": 0.08229494949494949, "grad_norm": 0.10245216637849808, "learning_rate": 0.0001998492599599391, "loss": 0.1087, "step": 1273 }, { "epoch": 0.08235959595959597, 
"grad_norm": 0.09834680706262589, "learning_rate": 0.00019984888437861852, "loss": 0.1007, "step": 1274 }, { "epoch": 0.08242424242424243, "grad_norm": 0.08705113083124161, "learning_rate": 0.00019984850833033776, "loss": 0.096, "step": 1275 }, { "epoch": 0.08248888888888889, "grad_norm": 0.1057414636015892, "learning_rate": 0.0001998481318150985, "loss": 0.118, "step": 1276 }, { "epoch": 0.08255353535353535, "grad_norm": 0.08798019587993622, "learning_rate": 0.00019984775483290255, "loss": 0.0995, "step": 1277 }, { "epoch": 0.08261818181818181, "grad_norm": 0.08680333942174911, "learning_rate": 0.00019984737738375165, "loss": 0.0895, "step": 1278 }, { "epoch": 0.08268282828282829, "grad_norm": 0.12362273782491684, "learning_rate": 0.0001998469994676476, "loss": 0.1078, "step": 1279 }, { "epoch": 0.08274747474747475, "grad_norm": 0.10130532830953598, "learning_rate": 0.00019984662108459212, "loss": 0.098, "step": 1280 }, { "epoch": 0.08274747474747475, "eval_bleu": 9.529024371172472, "eval_loss": 0.09953488409519196, "eval_runtime": 2.6735, "eval_samples_per_second": 11.969, "eval_steps_per_second": 1.496, "step": 1280 }, { "epoch": 0.08281212121212121, "grad_norm": 0.10659637302160263, "learning_rate": 0.000199846242234587, "loss": 0.1015, "step": 1281 }, { "epoch": 0.08287676767676767, "grad_norm": 0.0832698866724968, "learning_rate": 0.000199845862917634, "loss": 0.0911, "step": 1282 }, { "epoch": 0.08294141414141414, "grad_norm": 0.11082912236452103, "learning_rate": 0.00019984548313373496, "loss": 0.1159, "step": 1283 }, { "epoch": 0.08300606060606061, "grad_norm": 0.10170575976371765, "learning_rate": 0.00019984510288289156, "loss": 0.1044, "step": 1284 }, { "epoch": 0.08307070707070707, "grad_norm": 0.0846438929438591, "learning_rate": 0.00019984472216510565, "loss": 0.0841, "step": 1285 }, { "epoch": 0.08313535353535353, "grad_norm": 0.10545945912599564, "learning_rate": 0.00019984434098037893, "loss": 0.0812, "step": 1286 }, { "epoch": 0.0832, "grad_norm": 
0.09282685816287994, "learning_rate": 0.00019984395932871326, "loss": 0.0962, "step": 1287 }, { "epoch": 0.08326464646464646, "grad_norm": 0.1083919107913971, "learning_rate": 0.00019984357721011041, "loss": 0.1098, "step": 1288 }, { "epoch": 0.08332929292929293, "grad_norm": 0.10446532070636749, "learning_rate": 0.00019984319462457216, "loss": 0.1109, "step": 1289 }, { "epoch": 0.0833939393939394, "grad_norm": 0.09216875582933426, "learning_rate": 0.0001998428115721003, "loss": 0.1045, "step": 1290 }, { "epoch": 0.08345858585858586, "grad_norm": 0.1036292091012001, "learning_rate": 0.00019984242805269663, "loss": 0.0988, "step": 1291 }, { "epoch": 0.08352323232323232, "grad_norm": 0.10119486600160599, "learning_rate": 0.0001998420440663629, "loss": 0.0965, "step": 1292 }, { "epoch": 0.0835878787878788, "grad_norm": 0.0946914479136467, "learning_rate": 0.00019984165961310096, "loss": 0.104, "step": 1293 }, { "epoch": 0.08365252525252526, "grad_norm": 0.08623967319726944, "learning_rate": 0.0001998412746929126, "loss": 0.0933, "step": 1294 }, { "epoch": 0.08371717171717172, "grad_norm": 0.08169128745794296, "learning_rate": 0.00019984088930579956, "loss": 0.0845, "step": 1295 }, { "epoch": 0.08378181818181818, "grad_norm": 0.09612638503313065, "learning_rate": 0.0001998405034517637, "loss": 0.105, "step": 1296 }, { "epoch": 0.08378181818181818, "eval_bleu": 13.001526120588284, "eval_loss": 0.0968184769153595, "eval_runtime": 2.6676, "eval_samples_per_second": 11.996, "eval_steps_per_second": 1.499, "step": 1296 }, { "epoch": 0.08384646464646464, "grad_norm": 0.09524210542440414, "learning_rate": 0.0001998401171308068, "loss": 0.0943, "step": 1297 }, { "epoch": 0.08391111111111112, "grad_norm": 0.09286917001008987, "learning_rate": 0.0001998397303429307, "loss": 0.0954, "step": 1298 }, { "epoch": 0.08397575757575758, "grad_norm": 0.11236073076725006, "learning_rate": 0.00019983934308813721, "loss": 0.1064, "step": 1299 }, { "epoch": 0.08404040404040404, "grad_norm": 
0.08568739891052246, "learning_rate": 0.00019983895536642806, "loss": 0.0824, "step": 1300 }, { "epoch": 0.0841050505050505, "grad_norm": 0.10929793119430542, "learning_rate": 0.0001998385671778052, "loss": 0.0905, "step": 1301 }, { "epoch": 0.08416969696969696, "grad_norm": 0.08664591610431671, "learning_rate": 0.00019983817852227032, "loss": 0.0847, "step": 1302 }, { "epoch": 0.08423434343434344, "grad_norm": 0.09793335944414139, "learning_rate": 0.00019983778939982528, "loss": 0.0921, "step": 1303 }, { "epoch": 0.0842989898989899, "grad_norm": 0.08782773464918137, "learning_rate": 0.00019983739981047188, "loss": 0.0834, "step": 1304 }, { "epoch": 0.08436363636363636, "grad_norm": 0.09974594414234161, "learning_rate": 0.00019983700975421202, "loss": 0.0961, "step": 1305 }, { "epoch": 0.08442828282828282, "grad_norm": 0.09977371245622635, "learning_rate": 0.00019983661923104746, "loss": 0.0979, "step": 1306 }, { "epoch": 0.08449292929292929, "grad_norm": 0.10015544295310974, "learning_rate": 0.00019983622824098002, "loss": 0.0984, "step": 1307 }, { "epoch": 0.08455757575757576, "grad_norm": 0.1517401784658432, "learning_rate": 0.00019983583678401153, "loss": 0.1205, "step": 1308 }, { "epoch": 0.08462222222222222, "grad_norm": 0.09720490127801895, "learning_rate": 0.00019983544486014388, "loss": 0.1031, "step": 1309 }, { "epoch": 0.08468686868686869, "grad_norm": 0.08698497712612152, "learning_rate": 0.00019983505246937884, "loss": 0.0989, "step": 1310 }, { "epoch": 0.08475151515151515, "grad_norm": 0.09529600292444229, "learning_rate": 0.00019983465961171824, "loss": 0.0981, "step": 1311 }, { "epoch": 0.08481616161616162, "grad_norm": 0.10238347202539444, "learning_rate": 0.000199834266287164, "loss": 0.0958, "step": 1312 }, { "epoch": 0.08481616161616162, "eval_bleu": 13.969026288273707, "eval_loss": 0.09935599565505981, "eval_runtime": 2.6161, "eval_samples_per_second": 12.232, "eval_steps_per_second": 1.529, "step": 1312 }, { "epoch": 0.08488080808080808, 
"grad_norm": 0.09444960951805115, "learning_rate": 0.00019983387249571785, "loss": 0.096, "step": 1313 }, { "epoch": 0.08494545454545455, "grad_norm": 0.1125500276684761, "learning_rate": 0.0001998334782373817, "loss": 0.097, "step": 1314 }, { "epoch": 0.08501010101010101, "grad_norm": 0.10057497769594193, "learning_rate": 0.0001998330835121574, "loss": 0.1104, "step": 1315 }, { "epoch": 0.08507474747474747, "grad_norm": 0.1008632481098175, "learning_rate": 0.0001998326883200467, "loss": 0.0958, "step": 1316 }, { "epoch": 0.08513939393939395, "grad_norm": 0.08887092769145966, "learning_rate": 0.00019983229266105158, "loss": 0.0836, "step": 1317 }, { "epoch": 0.08520404040404041, "grad_norm": 0.1062830463051796, "learning_rate": 0.00019983189653517385, "loss": 0.109, "step": 1318 }, { "epoch": 0.08526868686868687, "grad_norm": 0.10037776082754135, "learning_rate": 0.0001998314999424153, "loss": 0.0939, "step": 1319 }, { "epoch": 0.08533333333333333, "grad_norm": 0.08978555351495743, "learning_rate": 0.00019983110288277785, "loss": 0.0857, "step": 1320 }, { "epoch": 0.08539797979797979, "grad_norm": 0.09538564085960388, "learning_rate": 0.00019983070535626332, "loss": 0.1015, "step": 1321 }, { "epoch": 0.08546262626262627, "grad_norm": 0.10329007357358932, "learning_rate": 0.0001998303073628736, "loss": 0.1108, "step": 1322 }, { "epoch": 0.08552727272727273, "grad_norm": 0.12846879661083221, "learning_rate": 0.0001998299089026105, "loss": 0.1332, "step": 1323 }, { "epoch": 0.08559191919191919, "grad_norm": 0.08949021995067596, "learning_rate": 0.000199829509975476, "loss": 0.0994, "step": 1324 }, { "epoch": 0.08565656565656565, "grad_norm": 0.09153349697589874, "learning_rate": 0.0001998291105814718, "loss": 0.0952, "step": 1325 }, { "epoch": 0.08572121212121211, "grad_norm": 0.11797761172056198, "learning_rate": 0.00019982871072059987, "loss": 0.0915, "step": 1326 }, { "epoch": 0.08578585858585859, "grad_norm": 0.09567425400018692, "learning_rate": 
0.0001998283103928621, "loss": 0.1055, "step": 1327 }, { "epoch": 0.08585050505050505, "grad_norm": 0.15089303255081177, "learning_rate": 0.0001998279095982603, "loss": 0.0975, "step": 1328 }, { "epoch": 0.08585050505050505, "eval_bleu": 12.865755999664628, "eval_loss": 0.09942425042390823, "eval_runtime": 2.7438, "eval_samples_per_second": 11.663, "eval_steps_per_second": 1.458, "step": 1328 }, { "epoch": 0.08591515151515151, "grad_norm": 0.11189599335193634, "learning_rate": 0.00019982750833679637, "loss": 0.1214, "step": 1329 }, { "epoch": 0.08597979797979798, "grad_norm": 0.09316188097000122, "learning_rate": 0.00019982710660847218, "loss": 0.0938, "step": 1330 }, { "epoch": 0.08604444444444445, "grad_norm": 0.11655101925134659, "learning_rate": 0.00019982670441328964, "loss": 0.1083, "step": 1331 }, { "epoch": 0.08610909090909091, "grad_norm": 0.13540121912956238, "learning_rate": 0.00019982630175125057, "loss": 0.1236, "step": 1332 }, { "epoch": 0.08617373737373737, "grad_norm": 0.08572167158126831, "learning_rate": 0.0001998258986223569, "loss": 0.0759, "step": 1333 }, { "epoch": 0.08623838383838384, "grad_norm": 0.11115605384111404, "learning_rate": 0.00019982549502661052, "loss": 0.1032, "step": 1334 }, { "epoch": 0.0863030303030303, "grad_norm": 0.10030196607112885, "learning_rate": 0.00019982509096401328, "loss": 0.1033, "step": 1335 }, { "epoch": 0.08636767676767677, "grad_norm": 0.10309617221355438, "learning_rate": 0.00019982468643456712, "loss": 0.1124, "step": 1336 }, { "epoch": 0.08643232323232324, "grad_norm": 0.10604587197303772, "learning_rate": 0.00019982428143827387, "loss": 0.1063, "step": 1337 }, { "epoch": 0.0864969696969697, "grad_norm": 0.08428341895341873, "learning_rate": 0.0001998238759751355, "loss": 0.0789, "step": 1338 }, { "epoch": 0.08656161616161616, "grad_norm": 0.0930686891078949, "learning_rate": 0.00019982347004515383, "loss": 0.104, "step": 1339 }, { "epoch": 0.08662626262626262, "grad_norm": 0.09911807626485825, 
"learning_rate": 0.0001998230636483308, "loss": 0.0996, "step": 1340 }, { "epoch": 0.0866909090909091, "grad_norm": 0.10296230018138885, "learning_rate": 0.0001998226567846683, "loss": 0.1006, "step": 1341 }, { "epoch": 0.08675555555555556, "grad_norm": 0.10285492986440659, "learning_rate": 0.0001998222494541682, "loss": 0.1122, "step": 1342 }, { "epoch": 0.08682020202020202, "grad_norm": 0.09906865656375885, "learning_rate": 0.00019982184165683248, "loss": 0.0887, "step": 1343 }, { "epoch": 0.08688484848484848, "grad_norm": 0.10971717536449432, "learning_rate": 0.000199821433392663, "loss": 0.1056, "step": 1344 }, { "epoch": 0.08688484848484848, "eval_bleu": 11.484851180167276, "eval_loss": 0.09886029362678528, "eval_runtime": 2.6457, "eval_samples_per_second": 12.095, "eval_steps_per_second": 1.512, "step": 1344 }, { "epoch": 0.08694949494949494, "grad_norm": 0.08866874128580093, "learning_rate": 0.0001998210246616617, "loss": 0.0882, "step": 1345 }, { "epoch": 0.08701414141414142, "grad_norm": 0.09244007617235184, "learning_rate": 0.00019982061546383042, "loss": 0.0932, "step": 1346 }, { "epoch": 0.08707878787878788, "grad_norm": 0.08623712509870529, "learning_rate": 0.00019982020579917116, "loss": 0.0873, "step": 1347 }, { "epoch": 0.08714343434343434, "grad_norm": 0.1054779589176178, "learning_rate": 0.0001998197956676858, "loss": 0.1143, "step": 1348 }, { "epoch": 0.0872080808080808, "grad_norm": 0.0864868238568306, "learning_rate": 0.00019981938506937624, "loss": 0.0946, "step": 1349 }, { "epoch": 0.08727272727272728, "grad_norm": 0.20312117040157318, "learning_rate": 0.00019981897400424438, "loss": 0.119, "step": 1350 }, { "epoch": 0.08733737373737374, "grad_norm": 0.09270636737346649, "learning_rate": 0.00019981856247229223, "loss": 0.1021, "step": 1351 }, { "epoch": 0.0874020202020202, "grad_norm": 0.11795508116483688, "learning_rate": 0.00019981815047352165, "loss": 0.1129, "step": 1352 }, { "epoch": 0.08746666666666666, "grad_norm": 0.08348415046930313, 
"learning_rate": 0.00019981773800793458, "loss": 0.0867, "step": 1353 }, { "epoch": 0.08753131313131313, "grad_norm": 0.10120968520641327, "learning_rate": 0.00019981732507553295, "loss": 0.1116, "step": 1354 }, { "epoch": 0.0875959595959596, "grad_norm": 0.09838369488716125, "learning_rate": 0.00019981691167631865, "loss": 0.1061, "step": 1355 }, { "epoch": 0.08766060606060606, "grad_norm": 0.0971892848610878, "learning_rate": 0.0001998164978102937, "loss": 0.1132, "step": 1356 }, { "epoch": 0.08772525252525253, "grad_norm": 0.07860702276229858, "learning_rate": 0.00019981608347745998, "loss": 0.0875, "step": 1357 }, { "epoch": 0.08778989898989899, "grad_norm": 0.09088286757469177, "learning_rate": 0.0001998156686778194, "loss": 0.0924, "step": 1358 }, { "epoch": 0.08785454545454545, "grad_norm": 0.08988488465547562, "learning_rate": 0.000199815253411374, "loss": 0.0954, "step": 1359 }, { "epoch": 0.08791919191919192, "grad_norm": 0.09467092156410217, "learning_rate": 0.0001998148376781256, "loss": 0.086, "step": 1360 }, { "epoch": 0.08791919191919192, "eval_bleu": 12.53385025553159, "eval_loss": 0.09877762943506241, "eval_runtime": 2.8351, "eval_samples_per_second": 11.287, "eval_steps_per_second": 1.411, "step": 1360 }, { "epoch": 0.08798383838383839, "grad_norm": 0.09027474373579025, "learning_rate": 0.00019981442147807624, "loss": 0.0891, "step": 1361 }, { "epoch": 0.08804848484848485, "grad_norm": 0.12696439027786255, "learning_rate": 0.00019981400481122784, "loss": 0.1192, "step": 1362 }, { "epoch": 0.08811313131313131, "grad_norm": 0.10165084153413773, "learning_rate": 0.00019981358767758232, "loss": 0.0935, "step": 1363 }, { "epoch": 0.08817777777777777, "grad_norm": 0.09301851689815521, "learning_rate": 0.00019981317007714163, "loss": 0.1, "step": 1364 }, { "epoch": 0.08824242424242425, "grad_norm": 0.09047587215900421, "learning_rate": 0.00019981275200990775, "loss": 0.0918, "step": 1365 }, { "epoch": 0.08830707070707071, "grad_norm": 
0.09839123487472534, "learning_rate": 0.00019981233347588263, "loss": 0.1012, "step": 1366 }, { "epoch": 0.08837171717171717, "grad_norm": 0.1176568940281868, "learning_rate": 0.00019981191447506822, "loss": 0.1286, "step": 1367 }, { "epoch": 0.08843636363636363, "grad_norm": 0.10879629105329514, "learning_rate": 0.0001998114950074665, "loss": 0.1052, "step": 1368 }, { "epoch": 0.0885010101010101, "grad_norm": 0.09475315362215042, "learning_rate": 0.00019981107507307936, "loss": 0.0999, "step": 1369 }, { "epoch": 0.08856565656565657, "grad_norm": 0.27396082878112793, "learning_rate": 0.00019981065467190886, "loss": 0.1161, "step": 1370 }, { "epoch": 0.08863030303030303, "grad_norm": 0.18335482478141785, "learning_rate": 0.00019981023380395696, "loss": 0.1163, "step": 1371 }, { "epoch": 0.0886949494949495, "grad_norm": 0.13550560176372528, "learning_rate": 0.00019980981246922553, "loss": 0.1259, "step": 1372 }, { "epoch": 0.08875959595959595, "grad_norm": 0.09698712825775146, "learning_rate": 0.00019980939066771664, "loss": 0.0964, "step": 1373 }, { "epoch": 0.08882424242424243, "grad_norm": 0.10698266327381134, "learning_rate": 0.00019980896839943223, "loss": 0.0932, "step": 1374 }, { "epoch": 0.08888888888888889, "grad_norm": 0.09666618704795837, "learning_rate": 0.00019980854566437425, "loss": 0.1002, "step": 1375 }, { "epoch": 0.08895353535353535, "grad_norm": 0.10566040873527527, "learning_rate": 0.0001998081224625447, "loss": 0.1023, "step": 1376 }, { "epoch": 0.08895353535353535, "eval_bleu": 12.276032833228713, "eval_loss": 0.09839345514774323, "eval_runtime": 2.6591, "eval_samples_per_second": 12.034, "eval_steps_per_second": 1.504, "step": 1376 }, { "epoch": 0.08901818181818182, "grad_norm": 0.10848584771156311, "learning_rate": 0.0001998076987939456, "loss": 0.1169, "step": 1377 }, { "epoch": 0.08908282828282828, "grad_norm": 0.11011257022619247, "learning_rate": 0.00019980727465857882, "loss": 0.0948, "step": 1378 }, { "epoch": 0.08914747474747475, 
"grad_norm": 0.08588409423828125, "learning_rate": 0.00019980685005644645, "loss": 0.0915, "step": 1379 }, { "epoch": 0.08921212121212121, "grad_norm": 0.11844003945589066, "learning_rate": 0.00019980642498755043, "loss": 0.1217, "step": 1380 }, { "epoch": 0.08927676767676768, "grad_norm": 0.09860694408416748, "learning_rate": 0.00019980599945189275, "loss": 0.109, "step": 1381 }, { "epoch": 0.08934141414141414, "grad_norm": 0.10013259947299957, "learning_rate": 0.00019980557344947543, "loss": 0.0992, "step": 1382 }, { "epoch": 0.0894060606060606, "grad_norm": 0.09079885482788086, "learning_rate": 0.0001998051469803004, "loss": 0.0984, "step": 1383 }, { "epoch": 0.08947070707070708, "grad_norm": 0.0816333070397377, "learning_rate": 0.00019980472004436974, "loss": 0.0826, "step": 1384 }, { "epoch": 0.08953535353535354, "grad_norm": 0.08877380937337875, "learning_rate": 0.00019980429264168534, "loss": 0.087, "step": 1385 }, { "epoch": 0.0896, "grad_norm": 0.09078330546617508, "learning_rate": 0.0001998038647722493, "loss": 0.1023, "step": 1386 }, { "epoch": 0.08966464646464646, "grad_norm": 0.09360656142234802, "learning_rate": 0.00019980343643606358, "loss": 0.0952, "step": 1387 }, { "epoch": 0.08972929292929292, "grad_norm": 0.08943704515695572, "learning_rate": 0.00019980300763313018, "loss": 0.09, "step": 1388 }, { "epoch": 0.0897939393939394, "grad_norm": 0.08633750677108765, "learning_rate": 0.00019980257836345108, "loss": 0.0948, "step": 1389 }, { "epoch": 0.08985858585858586, "grad_norm": 0.10041945427656174, "learning_rate": 0.00019980214862702836, "loss": 0.1075, "step": 1390 }, { "epoch": 0.08992323232323232, "grad_norm": 0.10538869351148605, "learning_rate": 0.00019980171842386396, "loss": 0.0805, "step": 1391 }, { "epoch": 0.08998787878787878, "grad_norm": 0.09101137518882751, "learning_rate": 0.0001998012877539599, "loss": 0.1053, "step": 1392 }, { "epoch": 0.08998787878787878, "eval_bleu": 12.953432942015944, "eval_loss": 0.09817688167095184, 
"eval_runtime": 2.6914, "eval_samples_per_second": 11.89, "eval_steps_per_second": 1.486, "step": 1392 }, { "epoch": 0.09005252525252526, "grad_norm": 0.1076117604970932, "learning_rate": 0.00019980085661731823, "loss": 0.1028, "step": 1393 }, { "epoch": 0.09011717171717172, "grad_norm": 0.10198492556810379, "learning_rate": 0.00019980042501394093, "loss": 0.0919, "step": 1394 }, { "epoch": 0.09018181818181818, "grad_norm": 0.08665958046913147, "learning_rate": 0.00019979999294383007, "loss": 0.0877, "step": 1395 }, { "epoch": 0.09024646464646464, "grad_norm": 0.0990280881524086, "learning_rate": 0.0001997995604069876, "loss": 0.1077, "step": 1396 }, { "epoch": 0.0903111111111111, "grad_norm": 0.09020534157752991, "learning_rate": 0.00019979912740341563, "loss": 0.0835, "step": 1397 }, { "epoch": 0.09037575757575758, "grad_norm": 0.09370144456624985, "learning_rate": 0.00019979869393311607, "loss": 0.0938, "step": 1398 }, { "epoch": 0.09044040404040404, "grad_norm": 0.09408402442932129, "learning_rate": 0.00019979825999609104, "loss": 0.0879, "step": 1399 }, { "epoch": 0.0905050505050505, "grad_norm": 0.09311624616384506, "learning_rate": 0.00019979782559234255, "loss": 0.0871, "step": 1400 }, { "epoch": 0.09056969696969697, "grad_norm": 0.08725301176309586, "learning_rate": 0.0001997973907218726, "loss": 0.079, "step": 1401 }, { "epoch": 0.09063434343434343, "grad_norm": 0.13033892214298248, "learning_rate": 0.00019979695538468328, "loss": 0.1025, "step": 1402 }, { "epoch": 0.0906989898989899, "grad_norm": 0.099240243434906, "learning_rate": 0.00019979651958077657, "loss": 0.0974, "step": 1403 }, { "epoch": 0.09076363636363637, "grad_norm": 0.11006799340248108, "learning_rate": 0.0001997960833101545, "loss": 0.1192, "step": 1404 }, { "epoch": 0.09082828282828283, "grad_norm": 0.17573943734169006, "learning_rate": 0.0001997956465728192, "loss": 0.0939, "step": 1405 }, { "epoch": 0.09089292929292929, "grad_norm": 0.12288129329681396, "learning_rate": 
0.00019979520936877262, "loss": 0.0874, "step": 1406 }, { "epoch": 0.09095757575757575, "grad_norm": 0.08029817044734955, "learning_rate": 0.00019979477169801684, "loss": 0.0826, "step": 1407 }, { "epoch": 0.09102222222222223, "grad_norm": 0.1330159604549408, "learning_rate": 0.0001997943335605539, "loss": 0.118, "step": 1408 }, { "epoch": 0.09102222222222223, "eval_bleu": 13.737769396611698, "eval_loss": 0.09946625679731369, "eval_runtime": 2.63, "eval_samples_per_second": 12.167, "eval_steps_per_second": 1.521, "step": 1408 }, { "epoch": 0.09108686868686869, "grad_norm": 0.09597237408161163, "learning_rate": 0.00019979389495638589, "loss": 0.0965, "step": 1409 }, { "epoch": 0.09115151515151515, "grad_norm": 0.09238039702177048, "learning_rate": 0.00019979345588551478, "loss": 0.0892, "step": 1410 }, { "epoch": 0.09121616161616161, "grad_norm": 0.09983370453119278, "learning_rate": 0.00019979301634794267, "loss": 0.0837, "step": 1411 }, { "epoch": 0.09128080808080809, "grad_norm": 0.09874019026756287, "learning_rate": 0.00019979257634367161, "loss": 0.0992, "step": 1412 }, { "epoch": 0.09134545454545455, "grad_norm": 0.08935726433992386, "learning_rate": 0.0001997921358727037, "loss": 0.0873, "step": 1413 }, { "epoch": 0.09141010101010101, "grad_norm": 0.11794383823871613, "learning_rate": 0.00019979169493504093, "loss": 0.0959, "step": 1414 }, { "epoch": 0.09147474747474747, "grad_norm": 0.08632034063339233, "learning_rate": 0.0001997912535306854, "loss": 0.0862, "step": 1415 }, { "epoch": 0.09153939393939393, "grad_norm": 0.08654046058654785, "learning_rate": 0.00019979081165963917, "loss": 0.105, "step": 1416 }, { "epoch": 0.09160404040404041, "grad_norm": 0.08890501409769058, "learning_rate": 0.0001997903693219043, "loss": 0.0945, "step": 1417 }, { "epoch": 0.09166868686868687, "grad_norm": 0.09299993515014648, "learning_rate": 0.00019978992651748287, "loss": 0.0934, "step": 1418 }, { "epoch": 0.09173333333333333, "grad_norm": 0.09371999651193619, 
"learning_rate": 0.0001997894832463769, "loss": 0.1027, "step": 1419 }, { "epoch": 0.0917979797979798, "grad_norm": 0.097237229347229, "learning_rate": 0.0001997890395085886, "loss": 0.0915, "step": 1420 }, { "epoch": 0.09186262626262626, "grad_norm": 0.11031143367290497, "learning_rate": 0.00019978859530411986, "loss": 0.1069, "step": 1421 }, { "epoch": 0.09192727272727273, "grad_norm": 0.0882914811372757, "learning_rate": 0.0001997881506329729, "loss": 0.0902, "step": 1422 }, { "epoch": 0.0919919191919192, "grad_norm": 0.09184537082910538, "learning_rate": 0.00019978770549514973, "loss": 0.1043, "step": 1423 }, { "epoch": 0.09205656565656566, "grad_norm": 0.09339790046215057, "learning_rate": 0.00019978725989065245, "loss": 0.1136, "step": 1424 }, { "epoch": 0.09205656565656566, "eval_bleu": 12.207677834612236, "eval_loss": 0.09881220012903214, "eval_runtime": 2.7918, "eval_samples_per_second": 11.462, "eval_steps_per_second": 1.433, "step": 1424 }, { "epoch": 0.09212121212121212, "grad_norm": 0.12844525277614594, "learning_rate": 0.00019978681381948316, "loss": 0.1296, "step": 1425 }, { "epoch": 0.09218585858585858, "grad_norm": 0.1130497008562088, "learning_rate": 0.00019978636728164393, "loss": 0.1234, "step": 1426 }, { "epoch": 0.09225050505050506, "grad_norm": 0.10930045694112778, "learning_rate": 0.00019978592027713682, "loss": 0.099, "step": 1427 }, { "epoch": 0.09231515151515152, "grad_norm": 0.0970892682671547, "learning_rate": 0.000199785472805964, "loss": 0.0941, "step": 1428 }, { "epoch": 0.09237979797979798, "grad_norm": 0.09091490507125854, "learning_rate": 0.00019978502486812748, "loss": 0.099, "step": 1429 }, { "epoch": 0.09244444444444444, "grad_norm": 0.08599131554365158, "learning_rate": 0.00019978457646362938, "loss": 0.0876, "step": 1430 }, { "epoch": 0.09250909090909092, "grad_norm": 0.08314812928438187, "learning_rate": 0.0001997841275924718, "loss": 0.085, "step": 1431 }, { "epoch": 0.09257373737373738, "grad_norm": 0.08717819303274155, 
"learning_rate": 0.00019978367825465687, "loss": 0.0965, "step": 1432 }, { "epoch": 0.09263838383838384, "grad_norm": 0.09867282211780548, "learning_rate": 0.00019978322845018665, "loss": 0.1104, "step": 1433 }, { "epoch": 0.0927030303030303, "grad_norm": 0.09576225280761719, "learning_rate": 0.00019978277817906325, "loss": 0.1086, "step": 1434 }, { "epoch": 0.09276767676767676, "grad_norm": 0.09598365426063538, "learning_rate": 0.00019978232744128878, "loss": 0.0942, "step": 1435 }, { "epoch": 0.09283232323232324, "grad_norm": 0.1044580489397049, "learning_rate": 0.00019978187623686538, "loss": 0.1113, "step": 1436 }, { "epoch": 0.0928969696969697, "grad_norm": 0.09776109457015991, "learning_rate": 0.00019978142456579512, "loss": 0.1001, "step": 1437 }, { "epoch": 0.09296161616161616, "grad_norm": 0.08872736245393753, "learning_rate": 0.0001997809724280801, "loss": 0.0815, "step": 1438 }, { "epoch": 0.09302626262626262, "grad_norm": 0.09161276370286942, "learning_rate": 0.0001997805198237225, "loss": 0.0859, "step": 1439 }, { "epoch": 0.09309090909090909, "grad_norm": 0.10025312751531601, "learning_rate": 0.00019978006675272435, "loss": 0.1076, "step": 1440 }, { "epoch": 0.09309090909090909, "eval_bleu": 11.38449438389522, "eval_loss": 0.09994243830442429, "eval_runtime": 2.7267, "eval_samples_per_second": 11.736, "eval_steps_per_second": 1.467, "step": 1440 }, { "epoch": 0.09315555555555556, "grad_norm": 0.09451809525489807, "learning_rate": 0.00019977961321508787, "loss": 0.1075, "step": 1441 }, { "epoch": 0.09322020202020202, "grad_norm": 0.09404416382312775, "learning_rate": 0.0001997791592108151, "loss": 0.0815, "step": 1442 }, { "epoch": 0.09328484848484848, "grad_norm": 0.09629783034324646, "learning_rate": 0.00019977870473990818, "loss": 0.102, "step": 1443 }, { "epoch": 0.09334949494949495, "grad_norm": 0.08409852534532547, "learning_rate": 0.00019977824980236926, "loss": 0.0852, "step": 1444 }, { "epoch": 0.09341414141414141, "grad_norm": 
0.09894314408302307, "learning_rate": 0.00019977779439820043, "loss": 0.0957, "step": 1445 }, { "epoch": 0.09347878787878788, "grad_norm": 0.09826365858316422, "learning_rate": 0.00019977733852740386, "loss": 0.1001, "step": 1446 }, { "epoch": 0.09354343434343435, "grad_norm": 0.1059250757098198, "learning_rate": 0.00019977688218998166, "loss": 0.1067, "step": 1447 }, { "epoch": 0.0936080808080808, "grad_norm": 0.08950673788785934, "learning_rate": 0.00019977642538593595, "loss": 0.09, "step": 1448 }, { "epoch": 0.09367272727272727, "grad_norm": 0.12125827372074127, "learning_rate": 0.00019977596811526888, "loss": 0.0718, "step": 1449 }, { "epoch": 0.09373737373737374, "grad_norm": 0.10540442168712616, "learning_rate": 0.00019977551037798262, "loss": 0.1089, "step": 1450 }, { "epoch": 0.0938020202020202, "grad_norm": 0.09844257682561874, "learning_rate": 0.00019977505217407928, "loss": 0.1054, "step": 1451 }, { "epoch": 0.09386666666666667, "grad_norm": 0.09119300544261932, "learning_rate": 0.000199774593503561, "loss": 0.1038, "step": 1452 }, { "epoch": 0.09393131313131313, "grad_norm": 0.09256444126367569, "learning_rate": 0.00019977413436642993, "loss": 0.1069, "step": 1453 }, { "epoch": 0.09399595959595959, "grad_norm": 0.26908043026924133, "learning_rate": 0.0001997736747626882, "loss": 0.0795, "step": 1454 }, { "epoch": 0.09406060606060607, "grad_norm": 0.10583196580410004, "learning_rate": 0.00019977321469233798, "loss": 0.0907, "step": 1455 }, { "epoch": 0.09412525252525253, "grad_norm": 0.10335783660411835, "learning_rate": 0.00019977275415538145, "loss": 0.0934, "step": 1456 }, { "epoch": 0.09412525252525253, "eval_bleu": 14.092461605337165, "eval_loss": 0.10060583800077438, "eval_runtime": 2.8017, "eval_samples_per_second": 11.422, "eval_steps_per_second": 1.428, "step": 1456 }, { "epoch": 0.09418989898989899, "grad_norm": 0.09173998236656189, "learning_rate": 0.00019977229315182071, "loss": 0.0956, "step": 1457 }, { "epoch": 0.09425454545454545, 
"grad_norm": 0.11639188230037689, "learning_rate": 0.00019977183168165796, "loss": 0.1012, "step": 1458 }, { "epoch": 0.09431919191919191, "grad_norm": 0.09920790791511536, "learning_rate": 0.00019977136974489533, "loss": 0.1105, "step": 1459 }, { "epoch": 0.09438383838383839, "grad_norm": 0.10236836969852448, "learning_rate": 0.000199770907341535, "loss": 0.1106, "step": 1460 }, { "epoch": 0.09444848484848485, "grad_norm": 0.09884092211723328, "learning_rate": 0.00019977044447157906, "loss": 0.1051, "step": 1461 }, { "epoch": 0.09451313131313131, "grad_norm": 0.08512425422668457, "learning_rate": 0.00019976998113502978, "loss": 0.0921, "step": 1462 }, { "epoch": 0.09457777777777777, "grad_norm": 0.09624971449375153, "learning_rate": 0.0001997695173318893, "loss": 0.108, "step": 1463 }, { "epoch": 0.09464242424242424, "grad_norm": 0.09836854785680771, "learning_rate": 0.00019976905306215973, "loss": 0.1009, "step": 1464 }, { "epoch": 0.09470707070707071, "grad_norm": 0.08996815979480743, "learning_rate": 0.0001997685883258433, "loss": 0.0815, "step": 1465 }, { "epoch": 0.09477171717171717, "grad_norm": 0.09514082223176956, "learning_rate": 0.00019976812312294214, "loss": 0.107, "step": 1466 }, { "epoch": 0.09483636363636364, "grad_norm": 0.09628879278898239, "learning_rate": 0.0001997676574534585, "loss": 0.1107, "step": 1467 }, { "epoch": 0.0949010101010101, "grad_norm": 0.08442001044750214, "learning_rate": 0.0001997671913173945, "loss": 0.0888, "step": 1468 }, { "epoch": 0.09496565656565656, "grad_norm": 0.10036958754062653, "learning_rate": 0.00019976672471475228, "loss": 0.1086, "step": 1469 }, { "epoch": 0.09503030303030303, "grad_norm": 0.08386826515197754, "learning_rate": 0.00019976625764553415, "loss": 0.1001, "step": 1470 }, { "epoch": 0.0950949494949495, "grad_norm": 0.08392481505870819, "learning_rate": 0.00019976579010974214, "loss": 0.0877, "step": 1471 }, { "epoch": 0.09515959595959596, "grad_norm": 0.12759946286678314, "learning_rate": 
0.00019976532210737853, "loss": 0.1093, "step": 1472 }, { "epoch": 0.09515959595959596, "eval_bleu": 14.280043781319975, "eval_loss": 0.10036460310220718, "eval_runtime": 2.7194, "eval_samples_per_second": 11.767, "eval_steps_per_second": 1.471, "step": 1472 }, { "epoch": 0.09522424242424242, "grad_norm": 0.09618207067251205, "learning_rate": 0.0001997648536384455, "loss": 0.0848, "step": 1473 }, { "epoch": 0.0952888888888889, "grad_norm": 0.09130462259054184, "learning_rate": 0.00019976438470294526, "loss": 0.0837, "step": 1474 }, { "epoch": 0.09535353535353536, "grad_norm": 0.10890621691942215, "learning_rate": 0.00019976391530087995, "loss": 0.1212, "step": 1475 }, { "epoch": 0.09541818181818182, "grad_norm": 0.10262034833431244, "learning_rate": 0.00019976344543225176, "loss": 0.1027, "step": 1476 }, { "epoch": 0.09548282828282828, "grad_norm": 0.08642224222421646, "learning_rate": 0.00019976297509706294, "loss": 0.0917, "step": 1477 }, { "epoch": 0.09554747474747474, "grad_norm": 0.09260547906160355, "learning_rate": 0.0001997625042953157, "loss": 0.09, "step": 1478 }, { "epoch": 0.09561212121212122, "grad_norm": 0.10141746699810028, "learning_rate": 0.00019976203302701214, "loss": 0.1063, "step": 1479 }, { "epoch": 0.09567676767676768, "grad_norm": 0.10753446072340012, "learning_rate": 0.0001997615612921546, "loss": 0.0958, "step": 1480 }, { "epoch": 0.09574141414141414, "grad_norm": 0.09839066863059998, "learning_rate": 0.00019976108909074518, "loss": 0.0788, "step": 1481 }, { "epoch": 0.0958060606060606, "grad_norm": 0.10989544540643692, "learning_rate": 0.0001997606164227861, "loss": 0.0916, "step": 1482 }, { "epoch": 0.09587070707070706, "grad_norm": 0.09842634946107864, "learning_rate": 0.00019976014328827965, "loss": 0.1012, "step": 1483 }, { "epoch": 0.09593535353535354, "grad_norm": 0.0982094332575798, "learning_rate": 0.00019975966968722796, "loss": 0.0947, "step": 1484 }, { "epoch": 0.096, "grad_norm": 0.09330565482378006, "learning_rate": 
0.0001997591956196333, "loss": 0.0947, "step": 1485 }, { "epoch": 0.09606464646464646, "grad_norm": 0.07657897472381592, "learning_rate": 0.00019975872108549784, "loss": 0.0744, "step": 1486 }, { "epoch": 0.09612929292929293, "grad_norm": 0.0957622081041336, "learning_rate": 0.00019975824608482382, "loss": 0.0997, "step": 1487 }, { "epoch": 0.09619393939393939, "grad_norm": 0.08672267198562622, "learning_rate": 0.00019975777061761348, "loss": 0.0919, "step": 1488 }, { "epoch": 0.09619393939393939, "eval_bleu": 14.729118721791473, "eval_loss": 0.0992056280374527, "eval_runtime": 2.9026, "eval_samples_per_second": 11.025, "eval_steps_per_second": 1.378, "step": 1488 }, { "epoch": 0.09625858585858586, "grad_norm": 0.11096522957086563, "learning_rate": 0.000199757294683869, "loss": 0.1092, "step": 1489 }, { "epoch": 0.09632323232323232, "grad_norm": 0.08589675277471542, "learning_rate": 0.00019975681828359268, "loss": 0.0932, "step": 1490 }, { "epoch": 0.09638787878787879, "grad_norm": 0.09943711757659912, "learning_rate": 0.00019975634141678668, "loss": 0.1117, "step": 1491 }, { "epoch": 0.09645252525252525, "grad_norm": 0.09538694471120834, "learning_rate": 0.00019975586408345323, "loss": 0.1201, "step": 1492 }, { "epoch": 0.09651717171717172, "grad_norm": 0.07913587987422943, "learning_rate": 0.00019975538628359458, "loss": 0.0867, "step": 1493 }, { "epoch": 0.09658181818181819, "grad_norm": 0.07704527676105499, "learning_rate": 0.00019975490801721297, "loss": 0.0681, "step": 1494 }, { "epoch": 0.09664646464646465, "grad_norm": 0.08875605463981628, "learning_rate": 0.00019975442928431067, "loss": 0.1012, "step": 1495 }, { "epoch": 0.09671111111111111, "grad_norm": 0.09561815857887268, "learning_rate": 0.00019975395008488983, "loss": 0.0936, "step": 1496 }, { "epoch": 0.09677575757575757, "grad_norm": 0.10696236044168472, "learning_rate": 0.00019975347041895278, "loss": 0.1038, "step": 1497 }, { "epoch": 0.09684040404040405, "grad_norm": 0.07971223443746567, 
"learning_rate": 0.0001997529902865017, "loss": 0.0843, "step": 1498 }, { "epoch": 0.09690505050505051, "grad_norm": 0.0859697014093399, "learning_rate": 0.0001997525096875389, "loss": 0.0947, "step": 1499 }, { "epoch": 0.09696969696969697, "grad_norm": 0.09678880870342255, "learning_rate": 0.00019975202862206656, "loss": 0.1047, "step": 1500 }, { "epoch": 0.09703434343434343, "grad_norm": 0.088010773062706, "learning_rate": 0.00019975154709008696, "loss": 0.0856, "step": 1501 }, { "epoch": 0.0970989898989899, "grad_norm": 0.10703625530004501, "learning_rate": 0.00019975106509160235, "loss": 0.0953, "step": 1502 }, { "epoch": 0.09716363636363637, "grad_norm": 0.1084941178560257, "learning_rate": 0.000199750582626615, "loss": 0.1101, "step": 1503 }, { "epoch": 0.09722828282828283, "grad_norm": 0.09701517224311829, "learning_rate": 0.00019975009969512716, "loss": 0.1011, "step": 1504 }, { "epoch": 0.09722828282828283, "eval_bleu": 12.832692589841608, "eval_loss": 0.10086671262979507, "eval_runtime": 2.693, "eval_samples_per_second": 11.883, "eval_steps_per_second": 1.485, "step": 1504 }, { "epoch": 0.09729292929292929, "grad_norm": 0.09981659799814224, "learning_rate": 0.00019974961629714108, "loss": 0.1042, "step": 1505 }, { "epoch": 0.09735757575757575, "grad_norm": 0.09844482690095901, "learning_rate": 0.00019974913243265898, "loss": 0.1028, "step": 1506 }, { "epoch": 0.09742222222222222, "grad_norm": 0.0844825804233551, "learning_rate": 0.0001997486481016832, "loss": 0.0887, "step": 1507 }, { "epoch": 0.09748686868686869, "grad_norm": 0.0880080834031105, "learning_rate": 0.00019974816330421596, "loss": 0.0951, "step": 1508 }, { "epoch": 0.09755151515151515, "grad_norm": 0.0978274941444397, "learning_rate": 0.00019974767804025953, "loss": 0.1026, "step": 1509 }, { "epoch": 0.09761616161616161, "grad_norm": 0.07957588881254196, "learning_rate": 0.0001997471923098162, "loss": 0.0865, "step": 1510 }, { "epoch": 0.09768080808080808, "grad_norm": 0.09161031246185303, 
"learning_rate": 0.00019974670611288818, "loss": 0.0918, "step": 1511 }, { "epoch": 0.09774545454545455, "grad_norm": 0.08318771421909332, "learning_rate": 0.00019974621944947787, "loss": 0.0724, "step": 1512 }, { "epoch": 0.09781010101010101, "grad_norm": 0.09956041723489761, "learning_rate": 0.0001997457323195874, "loss": 0.0936, "step": 1513 }, { "epoch": 0.09787474747474748, "grad_norm": 0.09289422631263733, "learning_rate": 0.00019974524472321915, "loss": 0.0937, "step": 1514 }, { "epoch": 0.09793939393939394, "grad_norm": 0.08256366103887558, "learning_rate": 0.00019974475666037535, "loss": 0.0846, "step": 1515 }, { "epoch": 0.0980040404040404, "grad_norm": 0.0928649753332138, "learning_rate": 0.0001997442681310583, "loss": 0.0879, "step": 1516 }, { "epoch": 0.09806868686868687, "grad_norm": 0.09999298304319382, "learning_rate": 0.00019974377913527033, "loss": 0.0793, "step": 1517 }, { "epoch": 0.09813333333333334, "grad_norm": 0.09429288655519485, "learning_rate": 0.00019974328967301363, "loss": 0.0932, "step": 1518 }, { "epoch": 0.0981979797979798, "grad_norm": 0.10184887051582336, "learning_rate": 0.00019974279974429053, "loss": 0.0999, "step": 1519 }, { "epoch": 0.09826262626262626, "grad_norm": 0.10147322714328766, "learning_rate": 0.00019974230934910337, "loss": 0.1039, "step": 1520 }, { "epoch": 0.09826262626262626, "eval_bleu": 11.785813470017036, "eval_loss": 0.09970571100711823, "eval_runtime": 2.8096, "eval_samples_per_second": 11.39, "eval_steps_per_second": 1.424, "step": 1520 }, { "epoch": 0.09832727272727272, "grad_norm": 0.10534729808568954, "learning_rate": 0.00019974181848745438, "loss": 0.0985, "step": 1521 }, { "epoch": 0.0983919191919192, "grad_norm": 0.09720037132501602, "learning_rate": 0.00019974132715934586, "loss": 0.0892, "step": 1522 }, { "epoch": 0.09845656565656566, "grad_norm": 0.09086208790540695, "learning_rate": 0.00019974083536478015, "loss": 0.0903, "step": 1523 }, { "epoch": 0.09852121212121212, "grad_norm": 
0.11158297955989838, "learning_rate": 0.00019974034310375952, "loss": 0.1191, "step": 1524 }, { "epoch": 0.09858585858585858, "grad_norm": 0.08970746397972107, "learning_rate": 0.00019973985037628628, "loss": 0.0878, "step": 1525 }, { "epoch": 0.09865050505050504, "grad_norm": 0.08245444297790527, "learning_rate": 0.00019973935718236275, "loss": 0.0979, "step": 1526 }, { "epoch": 0.09871515151515152, "grad_norm": 0.09014393389225006, "learning_rate": 0.0001997388635219912, "loss": 0.1019, "step": 1527 }, { "epoch": 0.09877979797979798, "grad_norm": 0.08501767367124557, "learning_rate": 0.00019973836939517393, "loss": 0.0883, "step": 1528 }, { "epoch": 0.09884444444444444, "grad_norm": 0.09111899882555008, "learning_rate": 0.00019973787480191332, "loss": 0.1117, "step": 1529 }, { "epoch": 0.0989090909090909, "grad_norm": 0.11369483172893524, "learning_rate": 0.00019973737974221165, "loss": 0.1131, "step": 1530 }, { "epoch": 0.09897373737373738, "grad_norm": 0.10262712836265564, "learning_rate": 0.0001997368842160712, "loss": 0.0999, "step": 1531 }, { "epoch": 0.09903838383838384, "grad_norm": 0.09883233904838562, "learning_rate": 0.00019973638822349434, "loss": 0.101, "step": 1532 }, { "epoch": 0.0991030303030303, "grad_norm": 0.09912887960672379, "learning_rate": 0.00019973589176448334, "loss": 0.1049, "step": 1533 }, { "epoch": 0.09916767676767677, "grad_norm": 0.12348176538944244, "learning_rate": 0.00019973539483904057, "loss": 0.0978, "step": 1534 }, { "epoch": 0.09923232323232323, "grad_norm": 0.09499776363372803, "learning_rate": 0.0001997348974471683, "loss": 0.0902, "step": 1535 }, { "epoch": 0.0992969696969697, "grad_norm": 0.09011103212833405, "learning_rate": 0.00019973439958886893, "loss": 0.0901, "step": 1536 }, { "epoch": 0.0992969696969697, "eval_bleu": 13.668856052191725, "eval_loss": 0.09831744432449341, "eval_runtime": 2.7926, "eval_samples_per_second": 11.459, "eval_steps_per_second": 1.432, "step": 1536 }, { "epoch": 0.09936161616161616, 
"grad_norm": 0.10506445169448853, "learning_rate": 0.0001997339012641447, "loss": 0.1105, "step": 1537 }, { "epoch": 0.09942626262626263, "grad_norm": 0.10616018623113632, "learning_rate": 0.000199733402472998, "loss": 0.1043, "step": 1538 }, { "epoch": 0.09949090909090909, "grad_norm": 0.10897423326969147, "learning_rate": 0.00019973290321543118, "loss": 0.1137, "step": 1539 }, { "epoch": 0.09955555555555555, "grad_norm": 0.11066459119319916, "learning_rate": 0.00019973240349144652, "loss": 0.1057, "step": 1540 }, { "epoch": 0.09962020202020203, "grad_norm": 0.1022406741976738, "learning_rate": 0.00019973190330104635, "loss": 0.108, "step": 1541 }, { "epoch": 0.09968484848484849, "grad_norm": 0.08966987580060959, "learning_rate": 0.00019973140264423306, "loss": 0.0938, "step": 1542 }, { "epoch": 0.09974949494949495, "grad_norm": 0.09685828536748886, "learning_rate": 0.00019973090152100898, "loss": 0.089, "step": 1543 }, { "epoch": 0.09981414141414141, "grad_norm": 0.09710504114627838, "learning_rate": 0.00019973039993137644, "loss": 0.1164, "step": 1544 }, { "epoch": 0.09987878787878787, "grad_norm": 0.07680148631334305, "learning_rate": 0.00019972989787533777, "loss": 0.0814, "step": 1545 }, { "epoch": 0.09994343434343435, "grad_norm": 0.0986066609621048, "learning_rate": 0.00019972939535289536, "loss": 0.1021, "step": 1546 }, { "epoch": 0.10000808080808081, "grad_norm": 0.09149985760450363, "learning_rate": 0.00019972889236405156, "loss": 0.0929, "step": 1547 }, { "epoch": 0.10007272727272727, "grad_norm": 0.08575946092605591, "learning_rate": 0.00019972838890880865, "loss": 0.0814, "step": 1548 }, { "epoch": 0.10013737373737373, "grad_norm": 0.08150371164083481, "learning_rate": 0.00019972788498716904, "loss": 0.0849, "step": 1549 }, { "epoch": 0.10020202020202021, "grad_norm": 0.10836182534694672, "learning_rate": 0.00019972738059913513, "loss": 0.115, "step": 1550 }, { "epoch": 0.10026666666666667, "grad_norm": 0.08745045214891434, "learning_rate": 
0.00019972687574470918, "loss": 0.0877, "step": 1551 }, { "epoch": 0.10033131313131313, "grad_norm": 0.10776417702436447, "learning_rate": 0.00019972637042389363, "loss": 0.1174, "step": 1552 }, { "epoch": 0.10033131313131313, "eval_bleu": 15.192125634192314, "eval_loss": 0.09673939645290375, "eval_runtime": 2.6619, "eval_samples_per_second": 12.022, "eval_steps_per_second": 1.503, "step": 1552 }, { "epoch": 0.1003959595959596, "grad_norm": 0.08816003054380417, "learning_rate": 0.00019972586463669082, "loss": 0.0871, "step": 1553 }, { "epoch": 0.10046060606060606, "grad_norm": 0.0874270498752594, "learning_rate": 0.00019972535838310308, "loss": 0.0861, "step": 1554 }, { "epoch": 0.10052525252525253, "grad_norm": 0.09039713442325592, "learning_rate": 0.0001997248516631328, "loss": 0.0991, "step": 1555 }, { "epoch": 0.100589898989899, "grad_norm": 0.09131505340337753, "learning_rate": 0.0001997243444767824, "loss": 0.0932, "step": 1556 }, { "epoch": 0.10065454545454545, "grad_norm": 0.08119247108697891, "learning_rate": 0.0001997238368240542, "loss": 0.08, "step": 1557 }, { "epoch": 0.10071919191919192, "grad_norm": 0.10036002099514008, "learning_rate": 0.00019972332870495056, "loss": 0.115, "step": 1558 }, { "epoch": 0.10078383838383838, "grad_norm": 0.07870175689458847, "learning_rate": 0.0001997228201194739, "loss": 0.0859, "step": 1559 }, { "epoch": 0.10084848484848485, "grad_norm": 0.10069957375526428, "learning_rate": 0.00019972231106762654, "loss": 0.107, "step": 1560 }, { "epoch": 0.10091313131313132, "grad_norm": 0.1176270842552185, "learning_rate": 0.00019972180154941095, "loss": 0.0955, "step": 1561 }, { "epoch": 0.10097777777777778, "grad_norm": 0.07788178324699402, "learning_rate": 0.00019972129156482944, "loss": 0.0933, "step": 1562 }, { "epoch": 0.10104242424242424, "grad_norm": 0.08061239868402481, "learning_rate": 0.00019972078111388442, "loss": 0.0925, "step": 1563 }, { "epoch": 0.1011070707070707, "grad_norm": 0.0940290093421936, "learning_rate": 
0.00019972027019657827, "loss": 0.1138, "step": 1564 }, { "epoch": 0.10117171717171718, "grad_norm": 0.08142008632421494, "learning_rate": 0.00019971975881291339, "loss": 0.0962, "step": 1565 }, { "epoch": 0.10123636363636364, "grad_norm": 0.1068977490067482, "learning_rate": 0.00019971924696289212, "loss": 0.1147, "step": 1566 }, { "epoch": 0.1013010101010101, "grad_norm": 0.08470705896615982, "learning_rate": 0.00019971873464651695, "loss": 0.0888, "step": 1567 }, { "epoch": 0.10136565656565656, "grad_norm": 0.09118818491697311, "learning_rate": 0.0001997182218637902, "loss": 0.0956, "step": 1568 }, { "epoch": 0.10136565656565656, "eval_bleu": 18.25939569531262, "eval_loss": 0.09652234613895416, "eval_runtime": 2.8391, "eval_samples_per_second": 11.271, "eval_steps_per_second": 1.409, "step": 1568 }, { "epoch": 0.10143030303030302, "grad_norm": 0.09397991001605988, "learning_rate": 0.0001997177086147143, "loss": 0.1147, "step": 1569 }, { "epoch": 0.1014949494949495, "grad_norm": 0.09927749633789062, "learning_rate": 0.00019971719489929166, "loss": 0.0985, "step": 1570 }, { "epoch": 0.10155959595959596, "grad_norm": 0.09375564008951187, "learning_rate": 0.00019971668071752468, "loss": 0.104, "step": 1571 }, { "epoch": 0.10162424242424242, "grad_norm": 0.09379369765520096, "learning_rate": 0.0001997161660694157, "loss": 0.0923, "step": 1572 }, { "epoch": 0.10168888888888888, "grad_norm": 0.11959432065486908, "learning_rate": 0.00019971565095496717, "loss": 0.1204, "step": 1573 }, { "epoch": 0.10175353535353536, "grad_norm": 0.09277225285768509, "learning_rate": 0.00019971513537418156, "loss": 0.0879, "step": 1574 }, { "epoch": 0.10181818181818182, "grad_norm": 0.09895878285169601, "learning_rate": 0.00019971461932706119, "loss": 0.115, "step": 1575 }, { "epoch": 0.10188282828282828, "grad_norm": 0.08483145385980606, "learning_rate": 0.00019971410281360852, "loss": 0.0885, "step": 1576 }, { "epoch": 0.10194747474747475, "grad_norm": 0.09375562518835068, 
"learning_rate": 0.00019971358583382597, "loss": 0.0944, "step": 1577 }, { "epoch": 0.1020121212121212, "grad_norm": 0.10022222995758057, "learning_rate": 0.00019971306838771592, "loss": 0.0895, "step": 1578 }, { "epoch": 0.10207676767676768, "grad_norm": 0.09891146421432495, "learning_rate": 0.00019971255047528084, "loss": 0.1159, "step": 1579 }, { "epoch": 0.10214141414141414, "grad_norm": 0.09443075954914093, "learning_rate": 0.0001997120320965231, "loss": 0.1014, "step": 1580 }, { "epoch": 0.1022060606060606, "grad_norm": 0.09817148000001907, "learning_rate": 0.00019971151325144516, "loss": 0.1053, "step": 1581 }, { "epoch": 0.10227070707070707, "grad_norm": 0.0886181890964508, "learning_rate": 0.00019971099394004943, "loss": 0.0818, "step": 1582 }, { "epoch": 0.10233535353535353, "grad_norm": 0.09783749282360077, "learning_rate": 0.00019971047416233838, "loss": 0.1, "step": 1583 }, { "epoch": 0.1024, "grad_norm": 0.09405406564474106, "learning_rate": 0.00019970995391831436, "loss": 0.0947, "step": 1584 }, { "epoch": 0.1024, "eval_bleu": 13.274058950570861, "eval_loss": 0.09767289459705353, "eval_runtime": 2.7555, "eval_samples_per_second": 11.613, "eval_steps_per_second": 1.452, "step": 1584 }, { "epoch": 0.10246464646464647, "grad_norm": 0.10955529659986496, "learning_rate": 0.00019970943320797986, "loss": 0.1174, "step": 1585 }, { "epoch": 0.10252929292929293, "grad_norm": 0.10316221415996552, "learning_rate": 0.0001997089120313373, "loss": 0.1123, "step": 1586 }, { "epoch": 0.10259393939393939, "grad_norm": 0.10168009251356125, "learning_rate": 0.00019970839038838914, "loss": 0.1066, "step": 1587 }, { "epoch": 0.10265858585858585, "grad_norm": 0.09406615793704987, "learning_rate": 0.0001997078682791378, "loss": 0.0853, "step": 1588 }, { "epoch": 0.10272323232323233, "grad_norm": 0.09565608203411102, "learning_rate": 0.00019970734570358572, "loss": 0.1041, "step": 1589 }, { "epoch": 0.10278787878787879, "grad_norm": 0.10402078926563263, "learning_rate": 
0.00019970682266173535, "loss": 0.1128, "step": 1590 }, { "epoch": 0.10285252525252525, "grad_norm": 0.09411383420228958, "learning_rate": 0.00019970629915358912, "loss": 0.1001, "step": 1591 }, { "epoch": 0.10291717171717171, "grad_norm": 0.08286409080028534, "learning_rate": 0.0001997057751791495, "loss": 0.0827, "step": 1592 }, { "epoch": 0.10298181818181819, "grad_norm": 0.1055469736456871, "learning_rate": 0.00019970525073841893, "loss": 0.1196, "step": 1593 }, { "epoch": 0.10304646464646465, "grad_norm": 0.1027616485953331, "learning_rate": 0.00019970472583139985, "loss": 0.1099, "step": 1594 }, { "epoch": 0.10311111111111111, "grad_norm": 0.09544102102518082, "learning_rate": 0.00019970420045809474, "loss": 0.1027, "step": 1595 }, { "epoch": 0.10317575757575757, "grad_norm": 0.11706390976905823, "learning_rate": 0.00019970367461850605, "loss": 0.1052, "step": 1596 }, { "epoch": 0.10324040404040404, "grad_norm": 0.09325215965509415, "learning_rate": 0.00019970314831263623, "loss": 0.0922, "step": 1597 }, { "epoch": 0.10330505050505051, "grad_norm": 0.08890800923109055, "learning_rate": 0.00019970262154048776, "loss": 0.0975, "step": 1598 }, { "epoch": 0.10336969696969697, "grad_norm": 0.08755331486463547, "learning_rate": 0.00019970209430206307, "loss": 0.1068, "step": 1599 }, { "epoch": 0.10343434343434343, "grad_norm": 0.08750922977924347, "learning_rate": 0.00019970156659736467, "loss": 0.0973, "step": 1600 }, { "epoch": 0.10343434343434343, "eval_bleu": 15.997045804511021, "eval_loss": 0.09796808660030365, "eval_runtime": 2.6698, "eval_samples_per_second": 11.986, "eval_steps_per_second": 1.498, "step": 1600 }, { "epoch": 0.1034989898989899, "grad_norm": 0.08491747081279755, "learning_rate": 0.00019970103842639498, "loss": 0.0867, "step": 1601 }, { "epoch": 0.10356363636363636, "grad_norm": 0.0872446745634079, "learning_rate": 0.0001997005097891565, "loss": 0.0904, "step": 1602 }, { "epoch": 0.10362828282828283, "grad_norm": 0.09314920753240585, 
"learning_rate": 0.0001996999806856517, "loss": 0.0808, "step": 1603 }, { "epoch": 0.1036929292929293, "grad_norm": 0.10297390073537827, "learning_rate": 0.00019969945111588305, "loss": 0.1062, "step": 1604 }, { "epoch": 0.10375757575757576, "grad_norm": 0.09452049434185028, "learning_rate": 0.00019969892107985304, "loss": 0.1032, "step": 1605 }, { "epoch": 0.10382222222222222, "grad_norm": 0.08197373151779175, "learning_rate": 0.00019969839057756413, "loss": 0.0903, "step": 1606 }, { "epoch": 0.10388686868686868, "grad_norm": 0.08604747802019119, "learning_rate": 0.0001996978596090188, "loss": 0.0909, "step": 1607 }, { "epoch": 0.10395151515151516, "grad_norm": 0.09293990582227707, "learning_rate": 0.00019969732817421953, "loss": 0.0978, "step": 1608 }, { "epoch": 0.10401616161616162, "grad_norm": 0.07438100874423981, "learning_rate": 0.00019969679627316884, "loss": 0.076, "step": 1609 }, { "epoch": 0.10408080808080808, "grad_norm": 0.09678399562835693, "learning_rate": 0.00019969626390586918, "loss": 0.0971, "step": 1610 }, { "epoch": 0.10414545454545454, "grad_norm": 0.11207693815231323, "learning_rate": 0.00019969573107232305, "loss": 0.1046, "step": 1611 }, { "epoch": 0.10421010101010102, "grad_norm": 0.08398399502038956, "learning_rate": 0.00019969519777253295, "loss": 0.0827, "step": 1612 }, { "epoch": 0.10427474747474748, "grad_norm": 0.09463801234960556, "learning_rate": 0.00019969466400650133, "loss": 0.1048, "step": 1613 }, { "epoch": 0.10433939393939394, "grad_norm": 0.08767811954021454, "learning_rate": 0.0001996941297742308, "loss": 0.091, "step": 1614 }, { "epoch": 0.1044040404040404, "grad_norm": 0.09328879415988922, "learning_rate": 0.00019969359507572372, "loss": 0.0975, "step": 1615 }, { "epoch": 0.10446868686868686, "grad_norm": 0.08879215270280838, "learning_rate": 0.00019969305991098267, "loss": 0.0935, "step": 1616 }, { "epoch": 0.10446868686868686, "eval_bleu": 15.18489137276308, "eval_loss": 0.09929370135068893, "eval_runtime": 2.8226, 
"eval_samples_per_second": 11.337, "eval_steps_per_second": 1.417, "step": 1616 }, { "epoch": 0.10453333333333334, "grad_norm": 0.08979222923517227, "learning_rate": 0.00019969252428001015, "loss": 0.095, "step": 1617 }, { "epoch": 0.1045979797979798, "grad_norm": 0.09343419969081879, "learning_rate": 0.00019969198818280863, "loss": 0.0968, "step": 1618 }, { "epoch": 0.10466262626262626, "grad_norm": 0.09061822295188904, "learning_rate": 0.00019969145161938067, "loss": 0.0994, "step": 1619 }, { "epoch": 0.10472727272727272, "grad_norm": 0.10156168788671494, "learning_rate": 0.00019969091458972872, "loss": 0.0983, "step": 1620 }, { "epoch": 0.10479191919191919, "grad_norm": 0.09300708770751953, "learning_rate": 0.0001996903770938553, "loss": 0.1091, "step": 1621 }, { "epoch": 0.10485656565656566, "grad_norm": 0.0945386216044426, "learning_rate": 0.000199689839131763, "loss": 0.1012, "step": 1622 }, { "epoch": 0.10492121212121212, "grad_norm": 0.0789511650800705, "learning_rate": 0.00019968930070345426, "loss": 0.095, "step": 1623 }, { "epoch": 0.10498585858585859, "grad_norm": 0.09683208912611008, "learning_rate": 0.00019968876180893161, "loss": 0.0954, "step": 1624 }, { "epoch": 0.10505050505050505, "grad_norm": 0.10096745938062668, "learning_rate": 0.0001996882224481976, "loss": 0.1123, "step": 1625 }, { "epoch": 0.10511515151515151, "grad_norm": 0.09884243458509445, "learning_rate": 0.00019968768262125468, "loss": 0.0912, "step": 1626 }, { "epoch": 0.10517979797979798, "grad_norm": 0.09217636287212372, "learning_rate": 0.00019968714232810545, "loss": 0.0977, "step": 1627 }, { "epoch": 0.10524444444444445, "grad_norm": 0.09520828723907471, "learning_rate": 0.0001996866015687524, "loss": 0.0916, "step": 1628 }, { "epoch": 0.10530909090909091, "grad_norm": 0.10780610889196396, "learning_rate": 0.00019968606034319813, "loss": 0.1254, "step": 1629 }, { "epoch": 0.10537373737373737, "grad_norm": 0.08303985744714737, "learning_rate": 0.00019968551865144504, "loss": 
0.0953, "step": 1630 }, { "epoch": 0.10543838383838385, "grad_norm": 0.08853544294834137, "learning_rate": 0.00019968497649349579, "loss": 0.1016, "step": 1631 }, { "epoch": 0.1055030303030303, "grad_norm": 0.08793741464614868, "learning_rate": 0.0001996844338693528, "loss": 0.0928, "step": 1632 }, { "epoch": 0.1055030303030303, "eval_bleu": 12.336581675143183, "eval_loss": 0.09897857904434204, "eval_runtime": 2.6274, "eval_samples_per_second": 12.179, "eval_steps_per_second": 1.522, "step": 1632 }, { "epoch": 0.10556767676767677, "grad_norm": 0.09308361262083054, "learning_rate": 0.0001996838907790187, "loss": 0.099, "step": 1633 }, { "epoch": 0.10563232323232323, "grad_norm": 0.10176729410886765, "learning_rate": 0.000199683347222496, "loss": 0.1139, "step": 1634 }, { "epoch": 0.10569696969696969, "grad_norm": 0.0855538547039032, "learning_rate": 0.00019968280319978722, "loss": 0.0976, "step": 1635 }, { "epoch": 0.10576161616161617, "grad_norm": 0.08286743611097336, "learning_rate": 0.00019968225871089495, "loss": 0.0907, "step": 1636 }, { "epoch": 0.10582626262626263, "grad_norm": 0.07679275423288345, "learning_rate": 0.0001996817137558217, "loss": 0.0728, "step": 1637 }, { "epoch": 0.10589090909090909, "grad_norm": 0.1014551892876625, "learning_rate": 0.00019968116833457003, "loss": 0.0972, "step": 1638 }, { "epoch": 0.10595555555555555, "grad_norm": 0.17819735407829285, "learning_rate": 0.0001996806224471425, "loss": 0.1213, "step": 1639 }, { "epoch": 0.10602020202020201, "grad_norm": 0.0888531506061554, "learning_rate": 0.00019968007609354164, "loss": 0.1023, "step": 1640 }, { "epoch": 0.10608484848484849, "grad_norm": 0.10027861595153809, "learning_rate": 0.00019967952927377002, "loss": 0.1159, "step": 1641 }, { "epoch": 0.10614949494949495, "grad_norm": 0.08541226387023926, "learning_rate": 0.0001996789819878302, "loss": 0.0831, "step": 1642 }, { "epoch": 0.10621414141414141, "grad_norm": 0.08947043120861053, "learning_rate": 0.0001996784342357247, "loss": 
0.1035, "step": 1643 }, { "epoch": 0.10627878787878788, "grad_norm": 0.09411673247814178, "learning_rate": 0.00019967788601745615, "loss": 0.1005, "step": 1644 }, { "epoch": 0.10634343434343434, "grad_norm": 0.0875181034207344, "learning_rate": 0.00019967733733302706, "loss": 0.0955, "step": 1645 }, { "epoch": 0.10640808080808081, "grad_norm": 0.09653908759355545, "learning_rate": 0.00019967678818244005, "loss": 0.1167, "step": 1646 }, { "epoch": 0.10647272727272727, "grad_norm": 0.07128980755805969, "learning_rate": 0.0001996762385656976, "loss": 0.0706, "step": 1647 }, { "epoch": 0.10653737373737374, "grad_norm": 0.08450567722320557, "learning_rate": 0.00019967568848280241, "loss": 0.0813, "step": 1648 }, { "epoch": 0.10653737373737374, "eval_bleu": 13.922604876199163, "eval_loss": 0.09967168420553207, "eval_runtime": 2.8357, "eval_samples_per_second": 11.285, "eval_steps_per_second": 1.411, "step": 1648 }, { "epoch": 0.1066020202020202, "grad_norm": 0.08985462039709091, "learning_rate": 0.0001996751379337569, "loss": 0.1019, "step": 1649 }, { "epoch": 0.10666666666666667, "grad_norm": 0.09184836596250534, "learning_rate": 0.00019967458691856377, "loss": 0.1143, "step": 1650 }, { "epoch": 0.10673131313131314, "grad_norm": 0.09489242732524872, "learning_rate": 0.00019967403543722555, "loss": 0.1108, "step": 1651 }, { "epoch": 0.1067959595959596, "grad_norm": 0.10834208130836487, "learning_rate": 0.0001996734834897448, "loss": 0.0969, "step": 1652 }, { "epoch": 0.10686060606060606, "grad_norm": 0.0975426658987999, "learning_rate": 0.00019967293107612413, "loss": 0.1138, "step": 1653 }, { "epoch": 0.10692525252525252, "grad_norm": 0.10958606749773026, "learning_rate": 0.0001996723781963661, "loss": 0.1293, "step": 1654 }, { "epoch": 0.106989898989899, "grad_norm": 0.0845121368765831, "learning_rate": 0.00019967182485047333, "loss": 0.1006, "step": 1655 }, { "epoch": 0.10705454545454546, "grad_norm": 0.09743615239858627, "learning_rate": 0.00019967127103844836, 
"loss": 0.1045, "step": 1656 }, { "epoch": 0.10711919191919192, "grad_norm": 0.09257999807596207, "learning_rate": 0.0001996707167602938, "loss": 0.0977, "step": 1657 }, { "epoch": 0.10718383838383838, "grad_norm": 0.07423005253076553, "learning_rate": 0.00019967016201601228, "loss": 0.072, "step": 1658 }, { "epoch": 0.10724848484848484, "grad_norm": 0.09771029651165009, "learning_rate": 0.00019966960680560636, "loss": 0.1036, "step": 1659 }, { "epoch": 0.10731313131313132, "grad_norm": 0.1146690845489502, "learning_rate": 0.00019966905112907862, "loss": 0.1162, "step": 1660 }, { "epoch": 0.10737777777777778, "grad_norm": 0.08793772757053375, "learning_rate": 0.00019966849498643168, "loss": 0.0951, "step": 1661 }, { "epoch": 0.10744242424242424, "grad_norm": 0.10198479890823364, "learning_rate": 0.00019966793837766816, "loss": 0.1174, "step": 1662 }, { "epoch": 0.1075070707070707, "grad_norm": 0.08196443319320679, "learning_rate": 0.00019966738130279058, "loss": 0.0865, "step": 1663 }, { "epoch": 0.10757171717171717, "grad_norm": 0.0996987447142601, "learning_rate": 0.00019966682376180165, "loss": 0.1096, "step": 1664 }, { "epoch": 0.10757171717171717, "eval_bleu": 13.528937742100071, "eval_loss": 0.10002744197845459, "eval_runtime": 2.6868, "eval_samples_per_second": 11.91, "eval_steps_per_second": 1.489, "step": 1664 }, { "epoch": 0.10763636363636364, "grad_norm": 0.08952216804027557, "learning_rate": 0.00019966626575470398, "loss": 0.1019, "step": 1665 }, { "epoch": 0.1077010101010101, "grad_norm": 0.10055726021528244, "learning_rate": 0.00019966570728150007, "loss": 0.1075, "step": 1666 }, { "epoch": 0.10776565656565656, "grad_norm": 0.09140234440565109, "learning_rate": 0.0001996651483421926, "loss": 0.1015, "step": 1667 }, { "epoch": 0.10783030303030303, "grad_norm": 0.08178149908781052, "learning_rate": 0.00019966458893678422, "loss": 0.0917, "step": 1668 }, { "epoch": 0.10789494949494949, "grad_norm": 0.09599588066339493, "learning_rate": 
0.00019966402906527745, "loss": 0.1041, "step": 1669 }, { "epoch": 0.10795959595959596, "grad_norm": 0.10193531960248947, "learning_rate": 0.00019966346872767502, "loss": 0.1055, "step": 1670 }, { "epoch": 0.10802424242424243, "grad_norm": 0.09694015234708786, "learning_rate": 0.0001996629079239795, "loss": 0.1032, "step": 1671 }, { "epoch": 0.10808888888888889, "grad_norm": 0.08435283601284027, "learning_rate": 0.00019966234665419344, "loss": 0.0893, "step": 1672 }, { "epoch": 0.10815353535353535, "grad_norm": 0.10433562844991684, "learning_rate": 0.0001996617849183196, "loss": 0.1104, "step": 1673 }, { "epoch": 0.10821818181818182, "grad_norm": 0.08608686178922653, "learning_rate": 0.00019966122271636048, "loss": 0.1055, "step": 1674 }, { "epoch": 0.10828282828282829, "grad_norm": 0.1521894633769989, "learning_rate": 0.0001996606600483188, "loss": 0.0906, "step": 1675 }, { "epoch": 0.10834747474747475, "grad_norm": 0.08421637862920761, "learning_rate": 0.00019966009691419715, "loss": 0.0917, "step": 1676 }, { "epoch": 0.10841212121212121, "grad_norm": 0.08783233910799026, "learning_rate": 0.0001996595333139982, "loss": 0.0909, "step": 1677 }, { "epoch": 0.10847676767676767, "grad_norm": 0.08784070611000061, "learning_rate": 0.00019965896924772455, "loss": 0.0975, "step": 1678 }, { "epoch": 0.10854141414141415, "grad_norm": 0.08449574559926987, "learning_rate": 0.00019965840471537885, "loss": 0.0871, "step": 1679 }, { "epoch": 0.10860606060606061, "grad_norm": 0.11489264667034149, "learning_rate": 0.00019965783971696372, "loss": 0.1342, "step": 1680 }, { "epoch": 0.10860606060606061, "eval_bleu": 15.383758350508417, "eval_loss": 0.09991168975830078, "eval_runtime": 2.7517, "eval_samples_per_second": 11.629, "eval_steps_per_second": 1.454, "step": 1680 }, { "epoch": 0.10867070707070707, "grad_norm": 0.0800926461815834, "learning_rate": 0.0001996572742524818, "loss": 0.0921, "step": 1681 }, { "epoch": 0.10873535353535353, "grad_norm": 0.09851634502410889, 
"learning_rate": 0.0001996567083219358, "loss": 0.1101, "step": 1682 }, { "epoch": 0.1088, "grad_norm": 0.08068210631608963, "learning_rate": 0.0001996561419253283, "loss": 0.0922, "step": 1683 }, { "epoch": 0.10886464646464647, "grad_norm": 0.08367714285850525, "learning_rate": 0.00019965557506266196, "loss": 0.0755, "step": 1684 }, { "epoch": 0.10892929292929293, "grad_norm": 0.08664995431900024, "learning_rate": 0.00019965500773393946, "loss": 0.0919, "step": 1685 }, { "epoch": 0.10899393939393939, "grad_norm": 0.09484238922595978, "learning_rate": 0.00019965443993916345, "loss": 0.0975, "step": 1686 }, { "epoch": 0.10905858585858585, "grad_norm": 0.10192329436540604, "learning_rate": 0.00019965387167833655, "loss": 0.1226, "step": 1687 }, { "epoch": 0.10912323232323232, "grad_norm": 0.07847811281681061, "learning_rate": 0.00019965330295146144, "loss": 0.0858, "step": 1688 }, { "epoch": 0.10918787878787879, "grad_norm": 0.1330711990594864, "learning_rate": 0.00019965273375854075, "loss": 0.0995, "step": 1689 }, { "epoch": 0.10925252525252525, "grad_norm": 0.09547499567270279, "learning_rate": 0.0001996521640995772, "loss": 0.0953, "step": 1690 }, { "epoch": 0.10931717171717172, "grad_norm": 0.10386989265680313, "learning_rate": 0.0001996515939745734, "loss": 0.1098, "step": 1691 }, { "epoch": 0.10938181818181818, "grad_norm": 0.09227786213159561, "learning_rate": 0.00019965102338353205, "loss": 0.0999, "step": 1692 }, { "epoch": 0.10944646464646465, "grad_norm": 0.11024253070354462, "learning_rate": 0.00019965045232645583, "loss": 0.104, "step": 1693 }, { "epoch": 0.10951111111111111, "grad_norm": 0.11003511399030685, "learning_rate": 0.00019964988080334734, "loss": 0.1147, "step": 1694 }, { "epoch": 0.10957575757575758, "grad_norm": 0.07205240428447723, "learning_rate": 0.00019964930881420932, "loss": 0.0756, "step": 1695 }, { "epoch": 0.10964040404040404, "grad_norm": 0.09523680806159973, "learning_rate": 0.00019964873635904446, "loss": 0.0961, "step": 1696 }, 
{ "epoch": 0.10964040404040404, "eval_bleu": 12.824950677653414, "eval_loss": 0.09803938865661621, "eval_runtime": 2.6212, "eval_samples_per_second": 12.208, "eval_steps_per_second": 1.526, "step": 1696 }, { "epoch": 0.1097050505050505, "grad_norm": 0.08599366992712021, "learning_rate": 0.00019964816343785537, "loss": 0.0931, "step": 1697 }, { "epoch": 0.10976969696969698, "grad_norm": 0.09303499013185501, "learning_rate": 0.00019964759005064477, "loss": 0.0986, "step": 1698 }, { "epoch": 0.10983434343434344, "grad_norm": 0.0944119468331337, "learning_rate": 0.00019964701619741532, "loss": 0.1082, "step": 1699 }, { "epoch": 0.1098989898989899, "grad_norm": 0.07310118526220322, "learning_rate": 0.00019964644187816973, "loss": 0.0734, "step": 1700 }, { "epoch": 0.10996363636363636, "grad_norm": 0.09287518262863159, "learning_rate": 0.00019964586709291071, "loss": 0.1085, "step": 1701 }, { "epoch": 0.11002828282828282, "grad_norm": 0.08038844913244247, "learning_rate": 0.00019964529184164086, "loss": 0.0931, "step": 1702 }, { "epoch": 0.1100929292929293, "grad_norm": 0.09042944759130478, "learning_rate": 0.00019964471612436295, "loss": 0.1064, "step": 1703 }, { "epoch": 0.11015757575757576, "grad_norm": 0.08298671990633011, "learning_rate": 0.00019964413994107965, "loss": 0.0919, "step": 1704 }, { "epoch": 0.11022222222222222, "grad_norm": 0.08193987607955933, "learning_rate": 0.00019964356329179363, "loss": 0.0883, "step": 1705 }, { "epoch": 0.11028686868686868, "grad_norm": 0.07536068558692932, "learning_rate": 0.00019964298617650757, "loss": 0.0806, "step": 1706 }, { "epoch": 0.11035151515151514, "grad_norm": 0.0773344412446022, "learning_rate": 0.00019964240859522426, "loss": 0.0763, "step": 1707 }, { "epoch": 0.11041616161616162, "grad_norm": 0.08499639481306076, "learning_rate": 0.00019964183054794633, "loss": 0.1026, "step": 1708 }, { "epoch": 0.11048080808080808, "grad_norm": 0.08847616612911224, "learning_rate": 0.00019964125203467652, "loss": 0.0971, "step": 
1709 }, { "epoch": 0.11054545454545454, "grad_norm": 0.07781341671943665, "learning_rate": 0.0001996406730554175, "loss": 0.0829, "step": 1710 }, { "epoch": 0.110610101010101, "grad_norm": 0.09329476952552795, "learning_rate": 0.00019964009361017197, "loss": 0.0983, "step": 1711 }, { "epoch": 0.11067474747474748, "grad_norm": 0.08138521760702133, "learning_rate": 0.0001996395136989427, "loss": 0.0834, "step": 1712 }, { "epoch": 0.11067474747474748, "eval_bleu": 16.31455048457529, "eval_loss": 0.09661374986171722, "eval_runtime": 2.7928, "eval_samples_per_second": 11.458, "eval_steps_per_second": 1.432, "step": 1712 }, { "epoch": 0.11073939393939394, "grad_norm": 0.09928783029317856, "learning_rate": 0.00019963893332173235, "loss": 0.107, "step": 1713 }, { "epoch": 0.1108040404040404, "grad_norm": 0.10143221914768219, "learning_rate": 0.00019963835247854364, "loss": 0.1207, "step": 1714 }, { "epoch": 0.11086868686868687, "grad_norm": 0.08343500643968582, "learning_rate": 0.0001996377711693793, "loss": 0.0916, "step": 1715 }, { "epoch": 0.11093333333333333, "grad_norm": 0.0842021107673645, "learning_rate": 0.00019963718939424206, "loss": 0.093, "step": 1716 }, { "epoch": 0.1109979797979798, "grad_norm": 0.09642371535301208, "learning_rate": 0.0001996366071531346, "loss": 0.0875, "step": 1717 }, { "epoch": 0.11106262626262627, "grad_norm": 0.09394519776105881, "learning_rate": 0.00019963602444605968, "loss": 0.0931, "step": 1718 }, { "epoch": 0.11112727272727273, "grad_norm": 0.09390253573656082, "learning_rate": 0.00019963544127302, "loss": 0.095, "step": 1719 }, { "epoch": 0.11119191919191919, "grad_norm": 0.1078769862651825, "learning_rate": 0.0001996348576340183, "loss": 0.1018, "step": 1720 }, { "epoch": 0.11125656565656565, "grad_norm": 0.08828653395175934, "learning_rate": 0.00019963427352905733, "loss": 0.096, "step": 1721 }, { "epoch": 0.11132121212121213, "grad_norm": 0.08584805577993393, "learning_rate": 0.00019963368895813978, "loss": 0.098, "step": 1722 
}, { "epoch": 0.11138585858585859, "grad_norm": 0.0930759534239769, "learning_rate": 0.0001996331039212684, "loss": 0.1044, "step": 1723 }, { "epoch": 0.11145050505050505, "grad_norm": 0.1555097997188568, "learning_rate": 0.00019963251841844594, "loss": 0.1034, "step": 1724 }, { "epoch": 0.11151515151515151, "grad_norm": 0.10394205898046494, "learning_rate": 0.00019963193244967514, "loss": 0.1021, "step": 1725 }, { "epoch": 0.11157979797979797, "grad_norm": 0.1084916889667511, "learning_rate": 0.00019963134601495874, "loss": 0.1026, "step": 1726 }, { "epoch": 0.11164444444444445, "grad_norm": 0.08258991688489914, "learning_rate": 0.00019963075911429945, "loss": 0.1024, "step": 1727 }, { "epoch": 0.11170909090909091, "grad_norm": 0.08256273716688156, "learning_rate": 0.00019963017174770006, "loss": 0.0855, "step": 1728 }, { "epoch": 0.11170909090909091, "eval_bleu": 16.68002773176488, "eval_loss": 0.09654008597135544, "eval_runtime": 2.9008, "eval_samples_per_second": 11.031, "eval_steps_per_second": 1.379, "step": 1728 }, { "epoch": 0.11177373737373737, "grad_norm": 0.0940549299120903, "learning_rate": 0.00019962958391516326, "loss": 0.1068, "step": 1729 }, { "epoch": 0.11183838383838383, "grad_norm": 0.09226085990667343, "learning_rate": 0.00019962899561669185, "loss": 0.0858, "step": 1730 }, { "epoch": 0.11190303030303031, "grad_norm": 0.08729394525289536, "learning_rate": 0.00019962840685228857, "loss": 0.1006, "step": 1731 }, { "epoch": 0.11196767676767677, "grad_norm": 0.11189916729927063, "learning_rate": 0.00019962781762195616, "loss": 0.0992, "step": 1732 }, { "epoch": 0.11203232323232323, "grad_norm": 0.08743728697299957, "learning_rate": 0.0001996272279256974, "loss": 0.0861, "step": 1733 }, { "epoch": 0.1120969696969697, "grad_norm": 0.0881812646985054, "learning_rate": 0.00019962663776351502, "loss": 0.08, "step": 1734 }, { "epoch": 0.11216161616161616, "grad_norm": 0.08333799988031387, "learning_rate": 0.0001996260471354118, "loss": 0.0873, "step": 
1735 }, { "epoch": 0.11222626262626263, "grad_norm": 0.07822366803884506, "learning_rate": 0.00019962545604139046, "loss": 0.088, "step": 1736 }, { "epoch": 0.1122909090909091, "grad_norm": 0.12780636548995972, "learning_rate": 0.00019962486448145381, "loss": 0.1047, "step": 1737 }, { "epoch": 0.11235555555555556, "grad_norm": 0.08140812069177628, "learning_rate": 0.00019962427245560463, "loss": 0.0912, "step": 1738 }, { "epoch": 0.11242020202020202, "grad_norm": 0.08540894091129303, "learning_rate": 0.00019962367996384563, "loss": 0.1009, "step": 1739 }, { "epoch": 0.11248484848484848, "grad_norm": 0.10637736320495605, "learning_rate": 0.00019962308700617961, "loss": 0.1045, "step": 1740 }, { "epoch": 0.11254949494949495, "grad_norm": 0.1078633964061737, "learning_rate": 0.00019962249358260938, "loss": 0.0949, "step": 1741 }, { "epoch": 0.11261414141414142, "grad_norm": 0.08576861768960953, "learning_rate": 0.00019962189969313768, "loss": 0.0966, "step": 1742 }, { "epoch": 0.11267878787878788, "grad_norm": 0.07337532192468643, "learning_rate": 0.00019962130533776726, "loss": 0.075, "step": 1743 }, { "epoch": 0.11274343434343434, "grad_norm": 0.08354844152927399, "learning_rate": 0.00019962071051650098, "loss": 0.0872, "step": 1744 }, { "epoch": 0.11274343434343434, "eval_bleu": 15.466178365357706, "eval_loss": 0.09811587631702423, "eval_runtime": 2.8782, "eval_samples_per_second": 11.118, "eval_steps_per_second": 1.39, "step": 1744 }, { "epoch": 0.1128080808080808, "grad_norm": 0.08509067445993423, "learning_rate": 0.00019962011522934152, "loss": 0.0913, "step": 1745 }, { "epoch": 0.11287272727272728, "grad_norm": 0.1303311437368393, "learning_rate": 0.0001996195194762917, "loss": 0.1034, "step": 1746 }, { "epoch": 0.11293737373737374, "grad_norm": 0.09086159616708755, "learning_rate": 0.0001996189232573544, "loss": 0.0871, "step": 1747 }, { "epoch": 0.1130020202020202, "grad_norm": 0.09347430616617203, "learning_rate": 0.00019961832657253227, "loss": 0.1027, 
"step": 1748 }, { "epoch": 0.11306666666666666, "grad_norm": 0.08479762077331543, "learning_rate": 0.00019961772942182816, "loss": 0.0953, "step": 1749 }, { "epoch": 0.11313131313131314, "grad_norm": 0.08756949007511139, "learning_rate": 0.00019961713180524488, "loss": 0.0883, "step": 1750 }, { "epoch": 0.1131959595959596, "grad_norm": 0.10438496619462967, "learning_rate": 0.0001996165337227852, "loss": 0.1132, "step": 1751 }, { "epoch": 0.11326060606060606, "grad_norm": 0.09598714113235474, "learning_rate": 0.00019961593517445195, "loss": 0.1014, "step": 1752 }, { "epoch": 0.11332525252525252, "grad_norm": 0.1013709232211113, "learning_rate": 0.00019961533616024788, "loss": 0.1004, "step": 1753 }, { "epoch": 0.11338989898989899, "grad_norm": 0.10158463567495346, "learning_rate": 0.0001996147366801758, "loss": 0.0976, "step": 1754 }, { "epoch": 0.11345454545454546, "grad_norm": 0.08664679527282715, "learning_rate": 0.00019961413673423855, "loss": 0.0858, "step": 1755 }, { "epoch": 0.11351919191919192, "grad_norm": 0.08453410118818283, "learning_rate": 0.00019961353632243892, "loss": 0.0914, "step": 1756 }, { "epoch": 0.11358383838383838, "grad_norm": 0.10597959160804749, "learning_rate": 0.0001996129354447797, "loss": 0.0831, "step": 1757 }, { "epoch": 0.11364848484848485, "grad_norm": 0.0837024599313736, "learning_rate": 0.0001996123341012637, "loss": 0.0969, "step": 1758 }, { "epoch": 0.11371313131313131, "grad_norm": 0.07992946356534958, "learning_rate": 0.00019961173229189375, "loss": 0.0885, "step": 1759 }, { "epoch": 0.11377777777777778, "grad_norm": 0.0935758501291275, "learning_rate": 0.00019961113001667268, "loss": 0.0998, "step": 1760 }, { "epoch": 0.11377777777777778, "eval_bleu": 17.203350793717163, "eval_loss": 0.09911344200372696, "eval_runtime": 2.7632, "eval_samples_per_second": 11.581, "eval_steps_per_second": 1.448, "step": 1760 }, { "epoch": 0.11384242424242424, "grad_norm": 0.09442166984081268, "learning_rate": 0.00019961052727560325, "loss": 
0.1094, "step": 1761 }, { "epoch": 0.1139070707070707, "grad_norm": 0.0889282375574112, "learning_rate": 0.00019960992406868835, "loss": 0.0879, "step": 1762 }, { "epoch": 0.11397171717171717, "grad_norm": 0.11973419785499573, "learning_rate": 0.00019960932039593074, "loss": 0.0984, "step": 1763 }, { "epoch": 0.11403636363636363, "grad_norm": 0.10106372088193893, "learning_rate": 0.00019960871625733327, "loss": 0.0905, "step": 1764 }, { "epoch": 0.1141010101010101, "grad_norm": 0.11689893156290054, "learning_rate": 0.00019960811165289878, "loss": 0.1284, "step": 1765 }, { "epoch": 0.11416565656565657, "grad_norm": 0.1037783995270729, "learning_rate": 0.00019960750658263007, "loss": 0.1111, "step": 1766 }, { "epoch": 0.11423030303030303, "grad_norm": 0.08097214996814728, "learning_rate": 0.00019960690104653002, "loss": 0.0837, "step": 1767 }, { "epoch": 0.11429494949494949, "grad_norm": 0.08323007076978683, "learning_rate": 0.00019960629504460137, "loss": 0.0905, "step": 1768 }, { "epoch": 0.11435959595959595, "grad_norm": 0.08763524144887924, "learning_rate": 0.00019960568857684704, "loss": 0.0965, "step": 1769 }, { "epoch": 0.11442424242424243, "grad_norm": 0.08229734748601913, "learning_rate": 0.00019960508164326983, "loss": 0.0888, "step": 1770 }, { "epoch": 0.11448888888888889, "grad_norm": 0.0831150934100151, "learning_rate": 0.00019960447424387256, "loss": 0.081, "step": 1771 }, { "epoch": 0.11455353535353535, "grad_norm": 0.09266072511672974, "learning_rate": 0.0001996038663786581, "loss": 0.1176, "step": 1772 }, { "epoch": 0.11461818181818181, "grad_norm": 0.08846625685691833, "learning_rate": 0.0001996032580476293, "loss": 0.0973, "step": 1773 }, { "epoch": 0.11468282828282829, "grad_norm": 0.09097465872764587, "learning_rate": 0.00019960264925078899, "loss": 0.1031, "step": 1774 }, { "epoch": 0.11474747474747475, "grad_norm": 0.08059729635715485, "learning_rate": 0.00019960203998814, "loss": 0.0842, "step": 1775 }, { "epoch": 0.11481212121212121, 
"grad_norm": 0.09724695980548859, "learning_rate": 0.0001996014302596852, "loss": 0.1119, "step": 1776 }, { "epoch": 0.11481212121212121, "eval_bleu": 12.563658772797544, "eval_loss": 0.1017531007528305, "eval_runtime": 2.8573, "eval_samples_per_second": 11.2, "eval_steps_per_second": 1.4, "step": 1776 }, { "epoch": 0.11487676767676767, "grad_norm": 0.09084711223840714, "learning_rate": 0.00019960082006542743, "loss": 0.0879, "step": 1777 }, { "epoch": 0.11494141414141414, "grad_norm": 0.10123063623905182, "learning_rate": 0.0001996002094053696, "loss": 0.1046, "step": 1778 }, { "epoch": 0.11500606060606061, "grad_norm": 0.08672968298196793, "learning_rate": 0.00019959959827951446, "loss": 0.0877, "step": 1779 }, { "epoch": 0.11507070707070707, "grad_norm": 0.09039455652236938, "learning_rate": 0.00019959898668786495, "loss": 0.1058, "step": 1780 }, { "epoch": 0.11513535353535354, "grad_norm": 0.08296473324298859, "learning_rate": 0.00019959837463042393, "loss": 0.1029, "step": 1781 }, { "epoch": 0.1152, "grad_norm": 0.08520353585481644, "learning_rate": 0.0001995977621071942, "loss": 0.0906, "step": 1782 }, { "epoch": 0.11526464646464646, "grad_norm": 0.0956898182630539, "learning_rate": 0.0001995971491181787, "loss": 0.1013, "step": 1783 }, { "epoch": 0.11532929292929293, "grad_norm": 0.09413713961839676, "learning_rate": 0.0001995965356633802, "loss": 0.0962, "step": 1784 }, { "epoch": 0.1153939393939394, "grad_norm": 0.09771838039159775, "learning_rate": 0.0001995959217428017, "loss": 0.1062, "step": 1785 }, { "epoch": 0.11545858585858586, "grad_norm": 0.0850255936384201, "learning_rate": 0.00019959530735644596, "loss": 0.085, "step": 1786 }, { "epoch": 0.11552323232323232, "grad_norm": 0.08241092413663864, "learning_rate": 0.0001995946925043159, "loss": 0.0787, "step": 1787 }, { "epoch": 0.11558787878787878, "grad_norm": 0.09113946557044983, "learning_rate": 0.0001995940771864144, "loss": 0.0978, "step": 1788 }, { "epoch": 0.11565252525252526, "grad_norm": 
0.09338917583227158, "learning_rate": 0.0001995934614027443, "loss": 0.1163, "step": 1789 }, { "epoch": 0.11571717171717172, "grad_norm": 0.09320656210184097, "learning_rate": 0.00019959284515330849, "loss": 0.0995, "step": 1790 }, { "epoch": 0.11578181818181818, "grad_norm": 0.09453748911619186, "learning_rate": 0.0001995922284381099, "loss": 0.1212, "step": 1791 }, { "epoch": 0.11584646464646464, "grad_norm": 0.08749670535326004, "learning_rate": 0.0001995916112571514, "loss": 0.0924, "step": 1792 }, { "epoch": 0.11584646464646464, "eval_bleu": 16.02098396083046, "eval_loss": 0.10107424110174179, "eval_runtime": 2.7008, "eval_samples_per_second": 11.848, "eval_steps_per_second": 1.481, "step": 1792 }, { "epoch": 0.11591111111111112, "grad_norm": 0.09330089390277863, "learning_rate": 0.00019959099361043582, "loss": 0.1074, "step": 1793 }, { "epoch": 0.11597575757575758, "grad_norm": 0.08451402187347412, "learning_rate": 0.00019959037549796614, "loss": 0.0838, "step": 1794 }, { "epoch": 0.11604040404040404, "grad_norm": 0.0749412477016449, "learning_rate": 0.00019958975691974513, "loss": 0.0806, "step": 1795 }, { "epoch": 0.1161050505050505, "grad_norm": 0.08658935129642487, "learning_rate": 0.0001995891378757758, "loss": 0.0851, "step": 1796 }, { "epoch": 0.11616969696969696, "grad_norm": 0.09365957230329514, "learning_rate": 0.00019958851836606099, "loss": 0.1087, "step": 1797 }, { "epoch": 0.11623434343434344, "grad_norm": 0.11999083310365677, "learning_rate": 0.00019958789839060357, "loss": 0.1114, "step": 1798 }, { "epoch": 0.1162989898989899, "grad_norm": 0.09224677085876465, "learning_rate": 0.00019958727794940648, "loss": 0.1095, "step": 1799 }, { "epoch": 0.11636363636363636, "grad_norm": 0.08700484037399292, "learning_rate": 0.00019958665704247264, "loss": 0.1066, "step": 1800 }, { "epoch": 0.11642828282828283, "grad_norm": 0.0821424275636673, "learning_rate": 0.00019958603566980492, "loss": 0.1015, "step": 1801 }, { "epoch": 0.11649292929292929, 
"grad_norm": 0.08366188406944275, "learning_rate": 0.00019958541383140624, "loss": 0.1045, "step": 1802 }, { "epoch": 0.11655757575757576, "grad_norm": 0.08512281626462936, "learning_rate": 0.0001995847915272795, "loss": 0.0841, "step": 1803 }, { "epoch": 0.11662222222222222, "grad_norm": 0.08686674386262894, "learning_rate": 0.0001995841687574276, "loss": 0.1075, "step": 1804 }, { "epoch": 0.11668686868686869, "grad_norm": 0.08095437288284302, "learning_rate": 0.00019958354552185344, "loss": 0.0938, "step": 1805 }, { "epoch": 0.11675151515151515, "grad_norm": 0.09457031637430191, "learning_rate": 0.00019958292182056, "loss": 0.1097, "step": 1806 }, { "epoch": 0.11681616161616161, "grad_norm": 0.08220674842596054, "learning_rate": 0.00019958229765355015, "loss": 0.0833, "step": 1807 }, { "epoch": 0.11688080808080809, "grad_norm": 0.08338411152362823, "learning_rate": 0.00019958167302082678, "loss": 0.0837, "step": 1808 }, { "epoch": 0.11688080808080809, "eval_bleu": 15.42548785792838, "eval_loss": 0.09879328310489655, "eval_runtime": 2.7946, "eval_samples_per_second": 11.451, "eval_steps_per_second": 1.431, "step": 1808 }, { "epoch": 0.11694545454545455, "grad_norm": 0.08020108193159103, "learning_rate": 0.0001995810479223929, "loss": 0.0899, "step": 1809 }, { "epoch": 0.11701010101010101, "grad_norm": 0.09933420270681381, "learning_rate": 0.00019958042235825136, "loss": 0.1217, "step": 1810 }, { "epoch": 0.11707474747474747, "grad_norm": 0.09501846134662628, "learning_rate": 0.0001995797963284051, "loss": 0.0839, "step": 1811 }, { "epoch": 0.11713939393939395, "grad_norm": 0.11073479801416397, "learning_rate": 0.00019957916983285705, "loss": 0.1158, "step": 1812 }, { "epoch": 0.11720404040404041, "grad_norm": 0.08117087185382843, "learning_rate": 0.00019957854287161017, "loss": 0.0988, "step": 1813 }, { "epoch": 0.11726868686868687, "grad_norm": 0.08467129617929459, "learning_rate": 0.0001995779154446673, "loss": 0.09, "step": 1814 }, { "epoch": 
0.11733333333333333, "grad_norm": 0.09511814266443253, "learning_rate": 0.0001995772875520315, "loss": 0.1003, "step": 1815 }, { "epoch": 0.11739797979797979, "grad_norm": 0.09553271532058716, "learning_rate": 0.00019957665919370562, "loss": 0.1031, "step": 1816 }, { "epoch": 0.11746262626262627, "grad_norm": 0.07528268545866013, "learning_rate": 0.0001995760303696926, "loss": 0.0745, "step": 1817 }, { "epoch": 0.11752727272727273, "grad_norm": 0.08599118143320084, "learning_rate": 0.00019957540107999545, "loss": 0.0913, "step": 1818 }, { "epoch": 0.11759191919191919, "grad_norm": 0.09785564988851547, "learning_rate": 0.00019957477132461708, "loss": 0.1053, "step": 1819 }, { "epoch": 0.11765656565656565, "grad_norm": 0.08028493076562881, "learning_rate": 0.0001995741411035604, "loss": 0.0889, "step": 1820 }, { "epoch": 0.11772121212121212, "grad_norm": 0.0831385925412178, "learning_rate": 0.00019957351041682836, "loss": 0.0946, "step": 1821 }, { "epoch": 0.11778585858585859, "grad_norm": 0.08381468057632446, "learning_rate": 0.00019957287926442393, "loss": 0.0975, "step": 1822 }, { "epoch": 0.11785050505050505, "grad_norm": 0.0780530571937561, "learning_rate": 0.0001995722476463501, "loss": 0.0811, "step": 1823 }, { "epoch": 0.11791515151515151, "grad_norm": 0.09475967288017273, "learning_rate": 0.00019957161556260976, "loss": 0.1071, "step": 1824 }, { "epoch": 0.11791515151515151, "eval_bleu": 12.836995387579664, "eval_loss": 0.10115996748209, "eval_runtime": 2.6106, "eval_samples_per_second": 12.257, "eval_steps_per_second": 1.532, "step": 1824 }, { "epoch": 0.11797979797979798, "grad_norm": 0.06906456500291824, "learning_rate": 0.0001995709830132059, "loss": 0.0708, "step": 1825 }, { "epoch": 0.11804444444444444, "grad_norm": 0.07418100535869598, "learning_rate": 0.0001995703499981415, "loss": 0.0772, "step": 1826 }, { "epoch": 0.11810909090909091, "grad_norm": 0.08834165334701538, "learning_rate": 0.00019956971651741943, "loss": 0.0994, "step": 1827 }, { 
"epoch": 0.11817373737373738, "grad_norm": 0.08954574912786484, "learning_rate": 0.00019956908257104275, "loss": 0.092, "step": 1828 }, { "epoch": 0.11823838383838384, "grad_norm": 0.09749346226453781, "learning_rate": 0.00019956844815901436, "loss": 0.1063, "step": 1829 }, { "epoch": 0.1183030303030303, "grad_norm": 0.09436722844839096, "learning_rate": 0.00019956781328133726, "loss": 0.0965, "step": 1830 }, { "epoch": 0.11836767676767677, "grad_norm": 0.08959711343050003, "learning_rate": 0.00019956717793801442, "loss": 0.1066, "step": 1831 }, { "epoch": 0.11843232323232324, "grad_norm": 0.07377462834119797, "learning_rate": 0.00019956654212904883, "loss": 0.0805, "step": 1832 }, { "epoch": 0.1184969696969697, "grad_norm": 0.09313827008008957, "learning_rate": 0.00019956590585444342, "loss": 0.0936, "step": 1833 }, { "epoch": 0.11856161616161616, "grad_norm": 0.08020653575658798, "learning_rate": 0.00019956526911420118, "loss": 0.0844, "step": 1834 }, { "epoch": 0.11862626262626262, "grad_norm": 0.0924125388264656, "learning_rate": 0.0001995646319083251, "loss": 0.1011, "step": 1835 }, { "epoch": 0.1186909090909091, "grad_norm": 0.08196383714675903, "learning_rate": 0.00019956399423681816, "loss": 0.0915, "step": 1836 }, { "epoch": 0.11875555555555556, "grad_norm": 0.09715278446674347, "learning_rate": 0.0001995633560996833, "loss": 0.1125, "step": 1837 }, { "epoch": 0.11882020202020202, "grad_norm": 0.0908689796924591, "learning_rate": 0.00019956271749692358, "loss": 0.1001, "step": 1838 }, { "epoch": 0.11888484848484848, "grad_norm": 0.08144357800483704, "learning_rate": 0.0001995620784285419, "loss": 0.0999, "step": 1839 }, { "epoch": 0.11894949494949494, "grad_norm": 0.08233830332756042, "learning_rate": 0.00019956143889454135, "loss": 0.0867, "step": 1840 }, { "epoch": 0.11894949494949494, "eval_bleu": 13.495300645389168, "eval_loss": 0.09907495975494385, "eval_runtime": 2.7916, "eval_samples_per_second": 11.463, "eval_steps_per_second": 1.433, "step": 1840 
}, { "epoch": 0.11901414141414142, "grad_norm": 0.08961008489131927, "learning_rate": 0.00019956079889492482, "loss": 0.0952, "step": 1841 }, { "epoch": 0.11907878787878788, "grad_norm": 0.10404586046934128, "learning_rate": 0.00019956015842969538, "loss": 0.0921, "step": 1842 }, { "epoch": 0.11914343434343434, "grad_norm": 0.09377691894769669, "learning_rate": 0.00019955951749885595, "loss": 0.1117, "step": 1843 }, { "epoch": 0.1192080808080808, "grad_norm": 0.07946504652500153, "learning_rate": 0.0001995588761024096, "loss": 0.0759, "step": 1844 }, { "epoch": 0.11927272727272727, "grad_norm": 0.09573984146118164, "learning_rate": 0.00019955823424035928, "loss": 0.1129, "step": 1845 }, { "epoch": 0.11933737373737374, "grad_norm": 0.0993974357843399, "learning_rate": 0.00019955759191270803, "loss": 0.1197, "step": 1846 }, { "epoch": 0.1194020202020202, "grad_norm": 0.08959382772445679, "learning_rate": 0.00019955694911945885, "loss": 0.0824, "step": 1847 }, { "epoch": 0.11946666666666667, "grad_norm": 0.07440722733736038, "learning_rate": 0.0001995563058606147, "loss": 0.0794, "step": 1848 }, { "epoch": 0.11953131313131313, "grad_norm": 0.09743315726518631, "learning_rate": 0.00019955566213617865, "loss": 0.0849, "step": 1849 }, { "epoch": 0.1195959595959596, "grad_norm": 0.08127295225858688, "learning_rate": 0.00019955501794615365, "loss": 0.0868, "step": 1850 }, { "epoch": 0.11966060606060606, "grad_norm": 0.08320796489715576, "learning_rate": 0.00019955437329054277, "loss": 0.1018, "step": 1851 }, { "epoch": 0.11972525252525253, "grad_norm": 0.09335729479789734, "learning_rate": 0.00019955372816934897, "loss": 0.0924, "step": 1852 }, { "epoch": 0.11978989898989899, "grad_norm": 0.0801360085606575, "learning_rate": 0.00019955308258257532, "loss": 0.0781, "step": 1853 }, { "epoch": 0.11985454545454545, "grad_norm": 0.0855412632226944, "learning_rate": 0.0001995524365302248, "loss": 0.0866, "step": 1854 }, { "epoch": 0.11991919191919193, "grad_norm": 
0.10311412066221237, "learning_rate": 0.00019955179001230047, "loss": 0.1183, "step": 1855 }, { "epoch": 0.11998383838383839, "grad_norm": 0.08917050808668137, "learning_rate": 0.0001995511430288053, "loss": 0.0966, "step": 1856 }, { "epoch": 0.11998383838383839, "eval_bleu": 15.815607722549192, "eval_loss": 0.09831130504608154, "eval_runtime": 2.7986, "eval_samples_per_second": 11.434, "eval_steps_per_second": 1.429, "step": 1856 }, { "epoch": 0.12004848484848485, "grad_norm": 0.08762361109256744, "learning_rate": 0.00019955049557974236, "loss": 0.0914, "step": 1857 }, { "epoch": 0.12011313131313131, "grad_norm": 0.0820266604423523, "learning_rate": 0.00019954984766511465, "loss": 0.0854, "step": 1858 }, { "epoch": 0.12017777777777777, "grad_norm": 0.08227520436048508, "learning_rate": 0.00019954919928492524, "loss": 0.0931, "step": 1859 }, { "epoch": 0.12024242424242425, "grad_norm": 0.08380527049303055, "learning_rate": 0.00019954855043917712, "loss": 0.0852, "step": 1860 }, { "epoch": 0.12030707070707071, "grad_norm": 0.07569682598114014, "learning_rate": 0.00019954790112787334, "loss": 0.0835, "step": 1861 }, { "epoch": 0.12037171717171717, "grad_norm": 0.08127695322036743, "learning_rate": 0.00019954725135101694, "loss": 0.0843, "step": 1862 }, { "epoch": 0.12043636363636363, "grad_norm": 0.09327790886163712, "learning_rate": 0.00019954660110861093, "loss": 0.1031, "step": 1863 }, { "epoch": 0.1205010101010101, "grad_norm": 0.11711227148771286, "learning_rate": 0.0001995459504006584, "loss": 0.1151, "step": 1864 }, { "epoch": 0.12056565656565657, "grad_norm": 0.09174749255180359, "learning_rate": 0.00019954529922716236, "loss": 0.0901, "step": 1865 }, { "epoch": 0.12063030303030303, "grad_norm": 0.0782327950000763, "learning_rate": 0.00019954464758812588, "loss": 0.086, "step": 1866 }, { "epoch": 0.1206949494949495, "grad_norm": 0.10070976614952087, "learning_rate": 0.00019954399548355198, "loss": 0.097, "step": 1867 }, { "epoch": 0.12075959595959596, 
"grad_norm": 0.08702991902828217, "learning_rate": 0.00019954334291344373, "loss": 0.0857, "step": 1868 }, { "epoch": 0.12082424242424242, "grad_norm": 0.08011528849601746, "learning_rate": 0.00019954268987780417, "loss": 0.0955, "step": 1869 }, { "epoch": 0.12088888888888889, "grad_norm": 0.07657444477081299, "learning_rate": 0.00019954203637663636, "loss": 0.0851, "step": 1870 }, { "epoch": 0.12095353535353535, "grad_norm": 0.22027158737182617, "learning_rate": 0.00019954138240994333, "loss": 0.1058, "step": 1871 }, { "epoch": 0.12101818181818182, "grad_norm": 0.10446283221244812, "learning_rate": 0.00019954072797772815, "loss": 0.121, "step": 1872 }, { "epoch": 0.12101818181818182, "eval_bleu": 13.164717037230142, "eval_loss": 0.09834155440330505, "eval_runtime": 2.742, "eval_samples_per_second": 11.67, "eval_steps_per_second": 1.459, "step": 1872 }, { "epoch": 0.12108282828282828, "grad_norm": 0.09554079174995422, "learning_rate": 0.00019954007307999394, "loss": 0.1075, "step": 1873 }, { "epoch": 0.12114747474747475, "grad_norm": 0.09154904633760452, "learning_rate": 0.0001995394177167437, "loss": 0.0973, "step": 1874 }, { "epoch": 0.12121212121212122, "grad_norm": 0.08791657537221909, "learning_rate": 0.00019953876188798052, "loss": 0.0909, "step": 1875 }, { "epoch": 0.12127676767676768, "grad_norm": 0.08171246200799942, "learning_rate": 0.00019953810559370742, "loss": 0.088, "step": 1876 }, { "epoch": 0.12134141414141414, "grad_norm": 0.0776485726237297, "learning_rate": 0.00019953744883392755, "loss": 0.0797, "step": 1877 }, { "epoch": 0.1214060606060606, "grad_norm": 0.09048549830913544, "learning_rate": 0.00019953679160864392, "loss": 0.0991, "step": 1878 }, { "epoch": 0.12147070707070708, "grad_norm": 0.09326526522636414, "learning_rate": 0.00019953613391785963, "loss": 0.1003, "step": 1879 }, { "epoch": 0.12153535353535354, "grad_norm": 0.08852504193782806, "learning_rate": 0.00019953547576157774, "loss": 0.0978, "step": 1880 }, { "epoch": 0.1216, 
"grad_norm": 0.10462523996829987, "learning_rate": 0.00019953481713980134, "loss": 0.1207, "step": 1881 }, { "epoch": 0.12166464646464646, "grad_norm": 0.08366268873214722, "learning_rate": 0.00019953415805253348, "loss": 0.0867, "step": 1882 }, { "epoch": 0.12172929292929292, "grad_norm": 0.08039695024490356, "learning_rate": 0.0001995334984997773, "loss": 0.0809, "step": 1883 }, { "epoch": 0.1217939393939394, "grad_norm": 0.0831957682967186, "learning_rate": 0.00019953283848153584, "loss": 0.0831, "step": 1884 }, { "epoch": 0.12185858585858586, "grad_norm": 0.08359136432409286, "learning_rate": 0.0001995321779978122, "loss": 0.0813, "step": 1885 }, { "epoch": 0.12192323232323232, "grad_norm": 0.11489012092351913, "learning_rate": 0.0001995315170486095, "loss": 0.1025, "step": 1886 }, { "epoch": 0.12198787878787878, "grad_norm": 0.09697767347097397, "learning_rate": 0.00019953085563393074, "loss": 0.1042, "step": 1887 }, { "epoch": 0.12205252525252525, "grad_norm": 0.07990733534097672, "learning_rate": 0.00019953019375377912, "loss": 0.086, "step": 1888 }, { "epoch": 0.12205252525252525, "eval_bleu": 12.424350258259723, "eval_loss": 0.10116302967071533, "eval_runtime": 2.7286, "eval_samples_per_second": 11.728, "eval_steps_per_second": 1.466, "step": 1888 }, { "epoch": 0.12211717171717172, "grad_norm": 0.0896470919251442, "learning_rate": 0.00019952953140815765, "loss": 0.0883, "step": 1889 }, { "epoch": 0.12218181818181818, "grad_norm": 0.08684299886226654, "learning_rate": 0.0001995288685970695, "loss": 0.0877, "step": 1890 }, { "epoch": 0.12224646464646464, "grad_norm": 0.09075053781270981, "learning_rate": 0.00019952820532051773, "loss": 0.0958, "step": 1891 }, { "epoch": 0.1223111111111111, "grad_norm": 0.0927472859621048, "learning_rate": 0.00019952754157850545, "loss": 0.1054, "step": 1892 }, { "epoch": 0.12237575757575758, "grad_norm": 0.08077292144298553, "learning_rate": 0.00019952687737103571, "loss": 0.0852, "step": 1893 }, { "epoch": 
0.12244040404040404, "grad_norm": 0.07959099858999252, "learning_rate": 0.0001995262126981117, "loss": 0.0858, "step": 1894 }, { "epoch": 0.1225050505050505, "grad_norm": 0.08473529666662216, "learning_rate": 0.00019952554755973652, "loss": 0.0901, "step": 1895 }, { "epoch": 0.12256969696969697, "grad_norm": 0.0825471356511116, "learning_rate": 0.00019952488195591324, "loss": 0.082, "step": 1896 }, { "epoch": 0.12263434343434343, "grad_norm": 0.0848509669303894, "learning_rate": 0.00019952421588664498, "loss": 0.0941, "step": 1897 }, { "epoch": 0.1226989898989899, "grad_norm": 0.08734121918678284, "learning_rate": 0.00019952354935193488, "loss": 0.0972, "step": 1898 }, { "epoch": 0.12276363636363637, "grad_norm": 0.09622342884540558, "learning_rate": 0.00019952288235178603, "loss": 0.0947, "step": 1899 }, { "epoch": 0.12282828282828283, "grad_norm": 0.08091770857572556, "learning_rate": 0.00019952221488620157, "loss": 0.0918, "step": 1900 }, { "epoch": 0.12289292929292929, "grad_norm": 0.09355922788381577, "learning_rate": 0.0001995215469551846, "loss": 0.0998, "step": 1901 }, { "epoch": 0.12295757575757575, "grad_norm": 0.08547891676425934, "learning_rate": 0.00019952087855873828, "loss": 0.0874, "step": 1902 }, { "epoch": 0.12302222222222223, "grad_norm": 0.09887318313121796, "learning_rate": 0.00019952020969686568, "loss": 0.1125, "step": 1903 }, { "epoch": 0.12308686868686869, "grad_norm": 0.1272737979888916, "learning_rate": 0.00019951954036957, "loss": 0.1213, "step": 1904 }, { "epoch": 0.12308686868686869, "eval_bleu": 15.256617319812614, "eval_loss": 0.09971657395362854, "eval_runtime": 2.6871, "eval_samples_per_second": 11.909, "eval_steps_per_second": 1.489, "step": 1904 }, { "epoch": 0.12315151515151515, "grad_norm": 0.07746249437332153, "learning_rate": 0.0001995188705768543, "loss": 0.0772, "step": 1905 }, { "epoch": 0.12321616161616161, "grad_norm": 0.09049322456121445, "learning_rate": 0.00019951820031872172, "loss": 0.1078, "step": 1906 }, { 
"epoch": 0.12328080808080807, "grad_norm": 0.09215855598449707, "learning_rate": 0.00019951752959517543, "loss": 0.0859, "step": 1907 }, { "epoch": 0.12334545454545455, "grad_norm": 0.075960673391819, "learning_rate": 0.0001995168584062186, "loss": 0.086, "step": 1908 }, { "epoch": 0.12341010101010101, "grad_norm": 0.2640346586704254, "learning_rate": 0.00019951618675185427, "loss": 0.1581, "step": 1909 }, { "epoch": 0.12347474747474747, "grad_norm": 0.08572307229042053, "learning_rate": 0.0001995155146320857, "loss": 0.0891, "step": 1910 }, { "epoch": 0.12353939393939393, "grad_norm": 0.09943751245737076, "learning_rate": 0.00019951484204691592, "loss": 0.1025, "step": 1911 }, { "epoch": 0.12360404040404041, "grad_norm": 0.09284835308790207, "learning_rate": 0.00019951416899634809, "loss": 0.0939, "step": 1912 }, { "epoch": 0.12366868686868687, "grad_norm": 0.10654886811971664, "learning_rate": 0.00019951349548038544, "loss": 0.1059, "step": 1913 }, { "epoch": 0.12373333333333333, "grad_norm": 0.12765763700008392, "learning_rate": 0.00019951282149903104, "loss": 0.0882, "step": 1914 }, { "epoch": 0.1237979797979798, "grad_norm": 0.08594781160354614, "learning_rate": 0.0001995121470522881, "loss": 0.0895, "step": 1915 }, { "epoch": 0.12386262626262626, "grad_norm": 0.09550819545984268, "learning_rate": 0.00019951147214015974, "loss": 0.0962, "step": 1916 }, { "epoch": 0.12392727272727273, "grad_norm": 0.09613265097141266, "learning_rate": 0.00019951079676264912, "loss": 0.1156, "step": 1917 }, { "epoch": 0.1239919191919192, "grad_norm": 0.08825942873954773, "learning_rate": 0.00019951012091975941, "loss": 0.0944, "step": 1918 }, { "epoch": 0.12405656565656566, "grad_norm": 0.09429129213094711, "learning_rate": 0.00019950944461149376, "loss": 0.0963, "step": 1919 }, { "epoch": 0.12412121212121212, "grad_norm": 0.08821070194244385, "learning_rate": 0.0001995087678378553, "loss": 0.1003, "step": 1920 }, { "epoch": 0.12412121212121212, "eval_bleu": 15.988525128879363, 
"eval_loss": 0.09864120185375214, "eval_runtime": 2.9596, "eval_samples_per_second": 10.812, "eval_steps_per_second": 1.352, "step": 1920 }, { "epoch": 0.12418585858585858, "grad_norm": 0.08690501749515533, "learning_rate": 0.00019950809059884726, "loss": 0.0891, "step": 1921 }, { "epoch": 0.12425050505050506, "grad_norm": 0.08644890785217285, "learning_rate": 0.00019950741289447276, "loss": 0.1049, "step": 1922 }, { "epoch": 0.12431515151515152, "grad_norm": 0.07946109026670456, "learning_rate": 0.000199506734724735, "loss": 0.089, "step": 1923 }, { "epoch": 0.12437979797979798, "grad_norm": 0.07116784900426865, "learning_rate": 0.00019950605608963712, "loss": 0.0789, "step": 1924 }, { "epoch": 0.12444444444444444, "grad_norm": 0.07321831583976746, "learning_rate": 0.00019950537698918235, "loss": 0.0811, "step": 1925 }, { "epoch": 0.1245090909090909, "grad_norm": 0.0844600722193718, "learning_rate": 0.0001995046974233738, "loss": 0.0953, "step": 1926 }, { "epoch": 0.12457373737373738, "grad_norm": 0.08651264011859894, "learning_rate": 0.00019950401739221468, "loss": 0.0974, "step": 1927 }, { "epoch": 0.12463838383838384, "grad_norm": 0.11407974362373352, "learning_rate": 0.00019950333689570818, "loss": 0.1042, "step": 1928 }, { "epoch": 0.1247030303030303, "grad_norm": 0.08301576972007751, "learning_rate": 0.00019950265593385744, "loss": 0.0831, "step": 1929 }, { "epoch": 0.12476767676767676, "grad_norm": 0.09083203971385956, "learning_rate": 0.0001995019745066657, "loss": 0.0955, "step": 1930 }, { "epoch": 0.12483232323232324, "grad_norm": 0.08654728531837463, "learning_rate": 0.00019950129261413611, "loss": 0.1089, "step": 1931 }, { "epoch": 0.1248969696969697, "grad_norm": 0.10016711801290512, "learning_rate": 0.00019950061025627186, "loss": 0.1182, "step": 1932 }, { "epoch": 0.12496161616161616, "grad_norm": 0.11128496378660202, "learning_rate": 0.00019949992743307617, "loss": 0.0971, "step": 1933 }, { "epoch": 0.12502626262626262, "grad_norm": 
0.09567607939243317, "learning_rate": 0.00019949924414455219, "loss": 0.1029, "step": 1934 }, { "epoch": 0.12509090909090909, "grad_norm": 0.08960685133934021, "learning_rate": 0.00019949856039070315, "loss": 0.0936, "step": 1935 }, { "epoch": 0.12515555555555555, "grad_norm": 0.08176688104867935, "learning_rate": 0.00019949787617153225, "loss": 0.0958, "step": 1936 }, { "epoch": 0.12515555555555555, "eval_bleu": 15.18812551196839, "eval_loss": 0.09950312227010727, "eval_runtime": 2.733, "eval_samples_per_second": 11.709, "eval_steps_per_second": 1.464, "step": 1936 }, { "epoch": 0.125220202020202, "grad_norm": 0.08809114247560501, "learning_rate": 0.00019949719148704267, "loss": 0.1027, "step": 1937 }, { "epoch": 0.12528484848484847, "grad_norm": 0.08383925259113312, "learning_rate": 0.0001994965063372376, "loss": 0.1019, "step": 1938 }, { "epoch": 0.12534949494949496, "grad_norm": 0.08531201630830765, "learning_rate": 0.00019949582072212028, "loss": 0.0866, "step": 1939 }, { "epoch": 0.12541414141414142, "grad_norm": 0.09667918086051941, "learning_rate": 0.0001994951346416939, "loss": 0.0991, "step": 1940 }, { "epoch": 0.12547878787878788, "grad_norm": 0.08536456525325775, "learning_rate": 0.00019949444809596166, "loss": 0.0877, "step": 1941 }, { "epoch": 0.12554343434343435, "grad_norm": 0.09186483919620514, "learning_rate": 0.0001994937610849268, "loss": 0.1128, "step": 1942 }, { "epoch": 0.1256080808080808, "grad_norm": 0.0914599746465683, "learning_rate": 0.00019949307360859247, "loss": 0.1009, "step": 1943 }, { "epoch": 0.12567272727272727, "grad_norm": 0.08961135149002075, "learning_rate": 0.00019949238566696194, "loss": 0.0964, "step": 1944 }, { "epoch": 0.12573737373737373, "grad_norm": 0.07311463356018066, "learning_rate": 0.00019949169726003844, "loss": 0.0716, "step": 1945 }, { "epoch": 0.1258020202020202, "grad_norm": 0.097776859998703, "learning_rate": 0.00019949100838782512, "loss": 0.0958, "step": 1946 }, { "epoch": 0.12586666666666665, 
"grad_norm": 0.10604903101921082, "learning_rate": 0.00019949031905032528, "loss": 0.0898, "step": 1947 }, { "epoch": 0.12593131313131314, "grad_norm": 0.0812811553478241, "learning_rate": 0.0001994896292475421, "loss": 0.0793, "step": 1948 }, { "epoch": 0.1259959595959596, "grad_norm": 0.08305008709430695, "learning_rate": 0.00019948893897947883, "loss": 0.094, "step": 1949 }, { "epoch": 0.12606060606060607, "grad_norm": 0.10829906910657883, "learning_rate": 0.00019948824824613865, "loss": 0.1085, "step": 1950 }, { "epoch": 0.12612525252525253, "grad_norm": 0.08572188019752502, "learning_rate": 0.00019948755704752484, "loss": 0.0941, "step": 1951 }, { "epoch": 0.126189898989899, "grad_norm": 0.07729079574346542, "learning_rate": 0.0001994868653836406, "loss": 0.0893, "step": 1952 }, { "epoch": 0.126189898989899, "eval_bleu": 15.25016748426295, "eval_loss": 0.09827081859111786, "eval_runtime": 2.8563, "eval_samples_per_second": 11.203, "eval_steps_per_second": 1.4, "step": 1952 }, { "epoch": 0.12625454545454545, "grad_norm": 0.09723882377147675, "learning_rate": 0.00019948617325448923, "loss": 0.1134, "step": 1953 }, { "epoch": 0.12631919191919191, "grad_norm": 0.08374866098165512, "learning_rate": 0.00019948548066007387, "loss": 0.0843, "step": 1954 }, { "epoch": 0.12638383838383838, "grad_norm": 0.10691636800765991, "learning_rate": 0.00019948478760039779, "loss": 0.1068, "step": 1955 }, { "epoch": 0.12644848484848484, "grad_norm": 0.08607611060142517, "learning_rate": 0.00019948409407546429, "loss": 0.0942, "step": 1956 }, { "epoch": 0.1265131313131313, "grad_norm": 0.07911746203899384, "learning_rate": 0.00019948340008527654, "loss": 0.0922, "step": 1957 }, { "epoch": 0.1265777777777778, "grad_norm": 0.08027563244104385, "learning_rate": 0.00019948270562983783, "loss": 0.086, "step": 1958 }, { "epoch": 0.12664242424242425, "grad_norm": 0.10056855529546738, "learning_rate": 0.00019948201070915143, "loss": 0.0764, "step": 1959 }, { "epoch": 0.1267070707070707, 
"grad_norm": 0.09231962263584137, "learning_rate": 0.00019948131532322052, "loss": 0.1007, "step": 1960 }, { "epoch": 0.12677171717171717, "grad_norm": 0.12780733406543732, "learning_rate": 0.0001994806194720484, "loss": 0.1499, "step": 1961 }, { "epoch": 0.12683636363636364, "grad_norm": 0.0825180783867836, "learning_rate": 0.00019947992315563827, "loss": 0.0938, "step": 1962 }, { "epoch": 0.1269010101010101, "grad_norm": 0.08289946615695953, "learning_rate": 0.0001994792263739935, "loss": 0.0887, "step": 1963 }, { "epoch": 0.12696565656565656, "grad_norm": 0.07675183564424515, "learning_rate": 0.0001994785291271172, "loss": 0.0804, "step": 1964 }, { "epoch": 0.12703030303030302, "grad_norm": 0.0937013030052185, "learning_rate": 0.00019947783141501272, "loss": 0.1091, "step": 1965 }, { "epoch": 0.12709494949494948, "grad_norm": 0.0827571302652359, "learning_rate": 0.00019947713323768333, "loss": 0.1006, "step": 1966 }, { "epoch": 0.12715959595959597, "grad_norm": 0.08069519698619843, "learning_rate": 0.0001994764345951323, "loss": 0.0981, "step": 1967 }, { "epoch": 0.12722424242424243, "grad_norm": 0.08248139172792435, "learning_rate": 0.00019947573548736284, "loss": 0.1071, "step": 1968 }, { "epoch": 0.12722424242424243, "eval_bleu": 13.49704611250518, "eval_loss": 0.09819333255290985, "eval_runtime": 2.6274, "eval_samples_per_second": 12.179, "eval_steps_per_second": 1.522, "step": 1968 }, { "epoch": 0.1272888888888889, "grad_norm": 0.10741985589265823, "learning_rate": 0.00019947503591437823, "loss": 0.1265, "step": 1969 }, { "epoch": 0.12735353535353536, "grad_norm": 0.07623306661844254, "learning_rate": 0.0001994743358761818, "loss": 0.0883, "step": 1970 }, { "epoch": 0.12741818181818182, "grad_norm": 0.06979891657829285, "learning_rate": 0.00019947363537277678, "loss": 0.0855, "step": 1971 }, { "epoch": 0.12748282828282828, "grad_norm": 0.11560800671577454, "learning_rate": 0.00019947293440416642, "loss": 0.1162, "step": 1972 }, { "epoch": 
0.12754747474747474, "grad_norm": 0.09833060950040817, "learning_rate": 0.00019947223297035404, "loss": 0.1134, "step": 1973 }, { "epoch": 0.1276121212121212, "grad_norm": 0.07940519601106644, "learning_rate": 0.00019947153107134296, "loss": 0.0831, "step": 1974 }, { "epoch": 0.12767676767676767, "grad_norm": 0.07775519043207169, "learning_rate": 0.00019947082870713637, "loss": 0.0823, "step": 1975 }, { "epoch": 0.12774141414141413, "grad_norm": 0.08320373296737671, "learning_rate": 0.0001994701258777376, "loss": 0.087, "step": 1976 }, { "epoch": 0.12780606060606062, "grad_norm": 0.07904225587844849, "learning_rate": 0.00019946942258314995, "loss": 0.0939, "step": 1977 }, { "epoch": 0.12787070707070708, "grad_norm": 0.08934198319911957, "learning_rate": 0.00019946871882337667, "loss": 0.098, "step": 1978 }, { "epoch": 0.12793535353535354, "grad_norm": 0.08127976208925247, "learning_rate": 0.0001994680145984211, "loss": 0.0926, "step": 1979 }, { "epoch": 0.128, "grad_norm": 0.09419949352741241, "learning_rate": 0.00019946730990828653, "loss": 0.1129, "step": 1980 }, { "epoch": 0.12806464646464646, "grad_norm": 0.08693555742502213, "learning_rate": 0.00019946660475297622, "loss": 0.0894, "step": 1981 }, { "epoch": 0.12812929292929293, "grad_norm": 0.06886342912912369, "learning_rate": 0.00019946589913249344, "loss": 0.0736, "step": 1982 }, { "epoch": 0.1281939393939394, "grad_norm": 0.086369588971138, "learning_rate": 0.0001994651930468416, "loss": 0.0859, "step": 1983 }, { "epoch": 0.12825858585858585, "grad_norm": 0.08950663357973099, "learning_rate": 0.00019946448649602389, "loss": 0.1025, "step": 1984 }, { "epoch": 0.12825858585858585, "eval_bleu": 14.589686856672825, "eval_loss": 0.09829030930995941, "eval_runtime": 2.8228, "eval_samples_per_second": 11.336, "eval_steps_per_second": 1.417, "step": 1984 }, { "epoch": 0.1283232323232323, "grad_norm": 0.07537104934453964, "learning_rate": 0.00019946377948004367, "loss": 0.0733, "step": 1985 }, { "epoch": 
0.1283878787878788, "grad_norm": 0.08883395045995712, "learning_rate": 0.00019946307199890422, "loss": 0.1034, "step": 1986 }, { "epoch": 0.12845252525252526, "grad_norm": 0.09857851266860962, "learning_rate": 0.00019946236405260888, "loss": 0.1067, "step": 1987 }, { "epoch": 0.12851717171717172, "grad_norm": 0.08807091414928436, "learning_rate": 0.00019946165564116094, "loss": 0.0979, "step": 1988 }, { "epoch": 0.12858181818181819, "grad_norm": 0.07067893445491791, "learning_rate": 0.0001994609467645637, "loss": 0.071, "step": 1989 }, { "epoch": 0.12864646464646465, "grad_norm": 0.08889427036046982, "learning_rate": 0.00019946023742282054, "loss": 0.1053, "step": 1990 }, { "epoch": 0.1287111111111111, "grad_norm": 0.07630407065153122, "learning_rate": 0.0001994595276159347, "loss": 0.0898, "step": 1991 }, { "epoch": 0.12877575757575757, "grad_norm": 0.09465549886226654, "learning_rate": 0.00019945881734390955, "loss": 0.1044, "step": 1992 }, { "epoch": 0.12884040404040403, "grad_norm": 0.09524498134851456, "learning_rate": 0.00019945810660674835, "loss": 0.1052, "step": 1993 }, { "epoch": 0.1289050505050505, "grad_norm": 0.09550344198942184, "learning_rate": 0.00019945739540445448, "loss": 0.1032, "step": 1994 }, { "epoch": 0.12896969696969696, "grad_norm": 0.07416670024394989, "learning_rate": 0.00019945668373703128, "loss": 0.0789, "step": 1995 }, { "epoch": 0.12903434343434345, "grad_norm": 0.0895688608288765, "learning_rate": 0.00019945597160448203, "loss": 0.0949, "step": 1996 }, { "epoch": 0.1290989898989899, "grad_norm": 0.08989269286394119, "learning_rate": 0.00019945525900681008, "loss": 0.1098, "step": 1997 }, { "epoch": 0.12916363636363637, "grad_norm": 0.08079128712415695, "learning_rate": 0.00019945454594401878, "loss": 0.0886, "step": 1998 }, { "epoch": 0.12922828282828283, "grad_norm": 0.08059177547693253, "learning_rate": 0.00019945383241611145, "loss": 0.1012, "step": 1999 }, { "epoch": 0.1292929292929293, "grad_norm": 0.10066372901201248, 
"learning_rate": 0.0001994531184230914, "loss": 0.1121, "step": 2000 }, { "epoch": 0.1292929292929293, "eval_bleu": 13.839869148332909, "eval_loss": 0.09522367268800735, "eval_runtime": 2.7627, "eval_samples_per_second": 11.583, "eval_steps_per_second": 1.448, "step": 2000 }, { "epoch": 0.12935757575757575, "grad_norm": 0.09417274594306946, "learning_rate": 0.000199452403964962, "loss": 0.1071, "step": 2001 }, { "epoch": 0.12942222222222222, "grad_norm": 0.09200766682624817, "learning_rate": 0.00019945168904172657, "loss": 0.1009, "step": 2002 }, { "epoch": 0.12948686868686868, "grad_norm": 0.08576171100139618, "learning_rate": 0.0001994509736533885, "loss": 0.101, "step": 2003 }, { "epoch": 0.12955151515151514, "grad_norm": 0.09033402055501938, "learning_rate": 0.0001994502577999511, "loss": 0.0966, "step": 2004 }, { "epoch": 0.12961616161616163, "grad_norm": 0.07994651049375534, "learning_rate": 0.0001994495414814177, "loss": 0.0835, "step": 2005 }, { "epoch": 0.1296808080808081, "grad_norm": 0.09723614156246185, "learning_rate": 0.00019944882469779166, "loss": 0.1017, "step": 2006 }, { "epoch": 0.12974545454545455, "grad_norm": 0.08721049875020981, "learning_rate": 0.00019944810744907638, "loss": 0.0835, "step": 2007 }, { "epoch": 0.12981010101010101, "grad_norm": 0.0764656737446785, "learning_rate": 0.00019944738973527517, "loss": 0.0846, "step": 2008 }, { "epoch": 0.12987474747474748, "grad_norm": 0.07630682736635208, "learning_rate": 0.00019944667155639138, "loss": 0.0904, "step": 2009 }, { "epoch": 0.12993939393939394, "grad_norm": 0.08141928166151047, "learning_rate": 0.0001994459529124284, "loss": 0.0916, "step": 2010 }, { "epoch": 0.1300040404040404, "grad_norm": 0.08433999866247177, "learning_rate": 0.00019944523380338957, "loss": 0.0899, "step": 2011 }, { "epoch": 0.13006868686868686, "grad_norm": 0.0846228152513504, "learning_rate": 0.00019944451422927826, "loss": 0.1039, "step": 2012 }, { "epoch": 0.13013333333333332, "grad_norm": 0.10189270973205566, 
"learning_rate": 0.00019944379419009782, "loss": 0.107, "step": 2013 }, { "epoch": 0.13019797979797978, "grad_norm": 0.09998765587806702, "learning_rate": 0.00019944307368585163, "loss": 0.1171, "step": 2014 }, { "epoch": 0.13026262626262627, "grad_norm": 0.10510484129190445, "learning_rate": 0.00019944235271654307, "loss": 0.1121, "step": 2015 }, { "epoch": 0.13032727272727274, "grad_norm": 0.07617301493883133, "learning_rate": 0.0001994416312821755, "loss": 0.0823, "step": 2016 }, { "epoch": 0.13032727272727274, "eval_bleu": 15.509957147824409, "eval_loss": 0.09738773107528687, "eval_runtime": 3.0456, "eval_samples_per_second": 10.507, "eval_steps_per_second": 1.313, "step": 2016 }, { "epoch": 0.1303919191919192, "grad_norm": 0.09673111140727997, "learning_rate": 0.00019944090938275232, "loss": 0.1015, "step": 2017 }, { "epoch": 0.13045656565656566, "grad_norm": 0.10245068371295929, "learning_rate": 0.00019944018701827684, "loss": 0.1079, "step": 2018 }, { "epoch": 0.13052121212121212, "grad_norm": 0.07984739542007446, "learning_rate": 0.0001994394641887525, "loss": 0.0892, "step": 2019 }, { "epoch": 0.13058585858585858, "grad_norm": 0.07822220027446747, "learning_rate": 0.00019943874089418264, "loss": 0.0837, "step": 2020 }, { "epoch": 0.13065050505050504, "grad_norm": 0.08840775489807129, "learning_rate": 0.0001994380171345707, "loss": 0.0951, "step": 2021 }, { "epoch": 0.1307151515151515, "grad_norm": 0.09588046371936798, "learning_rate": 0.00019943729290992001, "loss": 0.0983, "step": 2022 }, { "epoch": 0.13077979797979797, "grad_norm": 0.08750557154417038, "learning_rate": 0.000199436568220234, "loss": 0.0888, "step": 2023 }, { "epoch": 0.13084444444444446, "grad_norm": 0.0954146534204483, "learning_rate": 0.00019943584306551598, "loss": 0.1087, "step": 2024 }, { "epoch": 0.13090909090909092, "grad_norm": 0.07021438330411911, "learning_rate": 0.00019943511744576942, "loss": 0.069, "step": 2025 }, { "epoch": 0.13097373737373738, "grad_norm": 
0.11486995220184326, "learning_rate": 0.0001994343913609977, "loss": 0.1011, "step": 2026 }, { "epoch": 0.13103838383838384, "grad_norm": 0.10010086745023727, "learning_rate": 0.00019943366481120416, "loss": 0.102, "step": 2027 }, { "epoch": 0.1311030303030303, "grad_norm": 0.09551717340946198, "learning_rate": 0.00019943293779639228, "loss": 0.0978, "step": 2028 }, { "epoch": 0.13116767676767677, "grad_norm": 0.07197084277868271, "learning_rate": 0.00019943221031656542, "loss": 0.0746, "step": 2029 }, { "epoch": 0.13123232323232323, "grad_norm": 0.08440660685300827, "learning_rate": 0.00019943148237172698, "loss": 0.0959, "step": 2030 }, { "epoch": 0.1312969696969697, "grad_norm": 0.10483204573392868, "learning_rate": 0.00019943075396188035, "loss": 0.1275, "step": 2031 }, { "epoch": 0.13136161616161615, "grad_norm": 0.08914661407470703, "learning_rate": 0.000199430025087029, "loss": 0.0957, "step": 2032 }, { "epoch": 0.13136161616161615, "eval_bleu": 14.10015424273172, "eval_loss": 0.0969596803188324, "eval_runtime": 2.7952, "eval_samples_per_second": 11.448, "eval_steps_per_second": 1.431, "step": 2032 }, { "epoch": 0.1314262626262626, "grad_norm": 0.07776562124490738, "learning_rate": 0.00019942929574717625, "loss": 0.0782, "step": 2033 }, { "epoch": 0.1314909090909091, "grad_norm": 0.08268604427576065, "learning_rate": 0.00019942856594232558, "loss": 0.0894, "step": 2034 }, { "epoch": 0.13155555555555556, "grad_norm": 0.09471847116947174, "learning_rate": 0.00019942783567248037, "loss": 0.0955, "step": 2035 }, { "epoch": 0.13162020202020203, "grad_norm": 0.0754428282380104, "learning_rate": 0.00019942710493764404, "loss": 0.088, "step": 2036 }, { "epoch": 0.1316848484848485, "grad_norm": 0.08042635768651962, "learning_rate": 0.00019942637373782001, "loss": 0.0812, "step": 2037 }, { "epoch": 0.13174949494949495, "grad_norm": 0.09189002215862274, "learning_rate": 0.00019942564207301168, "loss": 0.1033, "step": 2038 }, { "epoch": 0.1318141414141414, "grad_norm": 
0.08115535229444504, "learning_rate": 0.00019942490994322252, "loss": 0.0881, "step": 2039 }, { "epoch": 0.13187878787878787, "grad_norm": 0.08375845849514008, "learning_rate": 0.0001994241773484559, "loss": 0.0987, "step": 2040 }, { "epoch": 0.13194343434343433, "grad_norm": 0.10288199037313461, "learning_rate": 0.00019942344428871528, "loss": 0.1096, "step": 2041 }, { "epoch": 0.1320080808080808, "grad_norm": 0.08917142450809479, "learning_rate": 0.0001994227107640041, "loss": 0.0968, "step": 2042 }, { "epoch": 0.13207272727272729, "grad_norm": 0.08875776827335358, "learning_rate": 0.00019942197677432576, "loss": 0.1, "step": 2043 }, { "epoch": 0.13213737373737375, "grad_norm": 0.07881813496351242, "learning_rate": 0.00019942124231968371, "loss": 0.0903, "step": 2044 }, { "epoch": 0.1322020202020202, "grad_norm": 0.0783696249127388, "learning_rate": 0.00019942050740008135, "loss": 0.0791, "step": 2045 }, { "epoch": 0.13226666666666667, "grad_norm": 0.0984339788556099, "learning_rate": 0.0001994197720155222, "loss": 0.1148, "step": 2046 }, { "epoch": 0.13233131313131313, "grad_norm": 0.09149181842803955, "learning_rate": 0.00019941903616600958, "loss": 0.1006, "step": 2047 }, { "epoch": 0.1323959595959596, "grad_norm": 0.10625866800546646, "learning_rate": 0.00019941829985154703, "loss": 0.1237, "step": 2048 }, { "epoch": 0.1323959595959596, "eval_bleu": 11.790794773106036, "eval_loss": 0.09782740473747253, "eval_runtime": 2.7028, "eval_samples_per_second": 11.84, "eval_steps_per_second": 1.48, "step": 2048 }, { "epoch": 0.13246060606060606, "grad_norm": 0.10007159411907196, "learning_rate": 0.00019941756307213795, "loss": 0.102, "step": 2049 }, { "epoch": 0.13252525252525252, "grad_norm": 0.07241909950971603, "learning_rate": 0.0001994168258277858, "loss": 0.0774, "step": 2050 }, { "epoch": 0.13258989898989898, "grad_norm": 0.07422494143247604, "learning_rate": 0.000199416088118494, "loss": 0.0832, "step": 2051 }, { "epoch": 0.13265454545454544, "grad_norm": 
0.0820273905992508, "learning_rate": 0.00019941534994426604, "loss": 0.0885, "step": 2052 }, { "epoch": 0.13271919191919193, "grad_norm": 0.0825883001089096, "learning_rate": 0.00019941461130510536, "loss": 0.088, "step": 2053 }, { "epoch": 0.1327838383838384, "grad_norm": 0.0916120707988739, "learning_rate": 0.00019941387220101541, "loss": 0.0998, "step": 2054 }, { "epoch": 0.13284848484848485, "grad_norm": 0.09313715249300003, "learning_rate": 0.00019941313263199963, "loss": 0.1029, "step": 2055 }, { "epoch": 0.13291313131313132, "grad_norm": 0.08485183119773865, "learning_rate": 0.0001994123925980615, "loss": 0.1002, "step": 2056 }, { "epoch": 0.13297777777777778, "grad_norm": 0.08435779809951782, "learning_rate": 0.0001994116520992045, "loss": 0.1026, "step": 2057 }, { "epoch": 0.13304242424242424, "grad_norm": 0.08964058011770248, "learning_rate": 0.00019941091113543204, "loss": 0.0982, "step": 2058 }, { "epoch": 0.1331070707070707, "grad_norm": 0.08899747580289841, "learning_rate": 0.00019941016970674761, "loss": 0.1018, "step": 2059 }, { "epoch": 0.13317171717171716, "grad_norm": 0.07826949656009674, "learning_rate": 0.0001994094278131547, "loss": 0.0952, "step": 2060 }, { "epoch": 0.13323636363636363, "grad_norm": 0.09215851128101349, "learning_rate": 0.00019940868545465675, "loss": 0.1037, "step": 2061 }, { "epoch": 0.13330101010101011, "grad_norm": 0.08982773125171661, "learning_rate": 0.00019940794263125723, "loss": 0.105, "step": 2062 }, { "epoch": 0.13336565656565658, "grad_norm": 0.0929267480969429, "learning_rate": 0.00019940719934295964, "loss": 0.1085, "step": 2063 }, { "epoch": 0.13343030303030304, "grad_norm": 0.10089726001024246, "learning_rate": 0.00019940645558976744, "loss": 0.1217, "step": 2064 }, { "epoch": 0.13343030303030304, "eval_bleu": 13.068304622729366, "eval_loss": 0.09774366021156311, "eval_runtime": 2.8263, "eval_samples_per_second": 11.322, "eval_steps_per_second": 1.415, "step": 2064 }, { "epoch": 0.1334949494949495, 
"grad_norm": 0.0794333666563034, "learning_rate": 0.0001994057113716841, "loss": 0.0833, "step": 2065 }, { "epoch": 0.13355959595959596, "grad_norm": 0.0743979662656784, "learning_rate": 0.00019940496668871313, "loss": 0.079, "step": 2066 }, { "epoch": 0.13362424242424242, "grad_norm": 0.07490761578083038, "learning_rate": 0.00019940422154085798, "loss": 0.0826, "step": 2067 }, { "epoch": 0.13368888888888888, "grad_norm": 0.07392819225788116, "learning_rate": 0.00019940347592812215, "loss": 0.0833, "step": 2068 }, { "epoch": 0.13375353535353535, "grad_norm": 0.07630061358213425, "learning_rate": 0.00019940272985050913, "loss": 0.0794, "step": 2069 }, { "epoch": 0.1338181818181818, "grad_norm": 0.09349751472473145, "learning_rate": 0.00019940198330802242, "loss": 0.0824, "step": 2070 }, { "epoch": 0.13388282828282827, "grad_norm": 0.08001204580068588, "learning_rate": 0.00019940123630066546, "loss": 0.0878, "step": 2071 }, { "epoch": 0.13394747474747476, "grad_norm": 0.0950464978814125, "learning_rate": 0.0001994004888284418, "loss": 0.0953, "step": 2072 }, { "epoch": 0.13401212121212122, "grad_norm": 0.09755884855985641, "learning_rate": 0.00019939974089135492, "loss": 0.1086, "step": 2073 }, { "epoch": 0.13407676767676768, "grad_norm": 0.08843854814767838, "learning_rate": 0.00019939899248940833, "loss": 0.0955, "step": 2074 }, { "epoch": 0.13414141414141414, "grad_norm": 0.09414809197187424, "learning_rate": 0.00019939824362260545, "loss": 0.1062, "step": 2075 }, { "epoch": 0.1342060606060606, "grad_norm": 0.09230902791023254, "learning_rate": 0.00019939749429094992, "loss": 0.112, "step": 2076 }, { "epoch": 0.13427070707070707, "grad_norm": 0.07946252077817917, "learning_rate": 0.0001993967444944451, "loss": 0.0777, "step": 2077 }, { "epoch": 0.13433535353535353, "grad_norm": 0.07552774995565414, "learning_rate": 0.00019939599423309461, "loss": 0.0879, "step": 2078 }, { "epoch": 0.1344, "grad_norm": 0.12982022762298584, "learning_rate": 0.0001993952435069019, 
"loss": 0.0953, "step": 2079 }, { "epoch": 0.13446464646464645, "grad_norm": 0.08005748689174652, "learning_rate": 0.0001993944923158705, "loss": 0.0805, "step": 2080 }, { "epoch": 0.13446464646464645, "eval_bleu": 16.284477901282663, "eval_loss": 0.09954658150672913, "eval_runtime": 2.7672, "eval_samples_per_second": 11.564, "eval_steps_per_second": 1.445, "step": 2080 }, { "epoch": 0.13452929292929294, "grad_norm": 0.1251658797264099, "learning_rate": 0.00019939374066000392, "loss": 0.1077, "step": 2081 }, { "epoch": 0.1345939393939394, "grad_norm": 0.10091298073530197, "learning_rate": 0.00019939298853930567, "loss": 0.1289, "step": 2082 }, { "epoch": 0.13465858585858587, "grad_norm": 0.07252799719572067, "learning_rate": 0.00019939223595377928, "loss": 0.0832, "step": 2083 }, { "epoch": 0.13472323232323233, "grad_norm": 0.07752204686403275, "learning_rate": 0.00019939148290342825, "loss": 0.0869, "step": 2084 }, { "epoch": 0.1347878787878788, "grad_norm": 0.07700987160205841, "learning_rate": 0.00019939072938825612, "loss": 0.079, "step": 2085 }, { "epoch": 0.13485252525252525, "grad_norm": 0.0797998309135437, "learning_rate": 0.00019938997540826638, "loss": 0.0904, "step": 2086 }, { "epoch": 0.1349171717171717, "grad_norm": 0.09481562674045563, "learning_rate": 0.0001993892209634626, "loss": 0.0845, "step": 2087 }, { "epoch": 0.13498181818181818, "grad_norm": 0.08537639677524567, "learning_rate": 0.00019938846605384831, "loss": 0.1051, "step": 2088 }, { "epoch": 0.13504646464646464, "grad_norm": 0.07488562911748886, "learning_rate": 0.00019938771067942702, "loss": 0.0841, "step": 2089 }, { "epoch": 0.1351111111111111, "grad_norm": 0.075688935816288, "learning_rate": 0.00019938695484020223, "loss": 0.0777, "step": 2090 }, { "epoch": 0.1351757575757576, "grad_norm": 0.0655268207192421, "learning_rate": 0.00019938619853617753, "loss": 0.073, "step": 2091 }, { "epoch": 0.13524040404040405, "grad_norm": 0.08922410756349564, "learning_rate": 0.00019938544176735645, 
"loss": 0.1101, "step": 2092 }, { "epoch": 0.1353050505050505, "grad_norm": 0.08207284659147263, "learning_rate": 0.0001993846845337425, "loss": 0.0959, "step": 2093 }, { "epoch": 0.13536969696969697, "grad_norm": 0.09461072832345963, "learning_rate": 0.00019938392683533924, "loss": 0.1096, "step": 2094 }, { "epoch": 0.13543434343434343, "grad_norm": 0.08169141411781311, "learning_rate": 0.00019938316867215022, "loss": 0.086, "step": 2095 }, { "epoch": 0.1354989898989899, "grad_norm": 0.08676749467849731, "learning_rate": 0.00019938241004417893, "loss": 0.1172, "step": 2096 }, { "epoch": 0.1354989898989899, "eval_bleu": 15.876729664676253, "eval_loss": 0.09793879091739655, "eval_runtime": 2.7421, "eval_samples_per_second": 11.67, "eval_steps_per_second": 1.459, "step": 2096 }, { "epoch": 0.13556363636363636, "grad_norm": 0.08110977709293365, "learning_rate": 0.000199381650951429, "loss": 0.0902, "step": 2097 }, { "epoch": 0.13562828282828282, "grad_norm": 0.08314534276723862, "learning_rate": 0.00019938089139390395, "loss": 0.0953, "step": 2098 }, { "epoch": 0.13569292929292928, "grad_norm": 0.07887057960033417, "learning_rate": 0.0001993801313716073, "loss": 0.0876, "step": 2099 }, { "epoch": 0.13575757575757577, "grad_norm": 0.10084131360054016, "learning_rate": 0.00019937937088454266, "loss": 0.1064, "step": 2100 }, { "epoch": 0.13582222222222223, "grad_norm": 0.10975068807601929, "learning_rate": 0.00019937860993271352, "loss": 0.1413, "step": 2101 }, { "epoch": 0.1358868686868687, "grad_norm": 0.0831703469157219, "learning_rate": 0.0001993778485161235, "loss": 0.0953, "step": 2102 }, { "epoch": 0.13595151515151516, "grad_norm": 0.07815272361040115, "learning_rate": 0.00019937708663477613, "loss": 0.1048, "step": 2103 }, { "epoch": 0.13601616161616162, "grad_norm": 0.07690869271755219, "learning_rate": 0.00019937632428867498, "loss": 0.0915, "step": 2104 }, { "epoch": 0.13608080808080808, "grad_norm": 0.08577138930559158, "learning_rate": 
0.00019937556147782363, "loss": 0.0865, "step": 2105 }, { "epoch": 0.13614545454545454, "grad_norm": 0.07861332595348358, "learning_rate": 0.00019937479820222564, "loss": 0.0942, "step": 2106 }, { "epoch": 0.136210101010101, "grad_norm": 0.08143707364797592, "learning_rate": 0.00019937403446188454, "loss": 0.0918, "step": 2107 }, { "epoch": 0.13627474747474747, "grad_norm": 0.09098243713378906, "learning_rate": 0.00019937327025680395, "loss": 0.1031, "step": 2108 }, { "epoch": 0.13633939393939393, "grad_norm": 0.09335984289646149, "learning_rate": 0.0001993725055869874, "loss": 0.0978, "step": 2109 }, { "epoch": 0.13640404040404042, "grad_norm": 0.08311004936695099, "learning_rate": 0.00019937174045243854, "loss": 0.0898, "step": 2110 }, { "epoch": 0.13646868686868688, "grad_norm": 0.07999549806118011, "learning_rate": 0.00019937097485316088, "loss": 0.0828, "step": 2111 }, { "epoch": 0.13653333333333334, "grad_norm": 0.09194918721914291, "learning_rate": 0.00019937020878915798, "loss": 0.093, "step": 2112 }, { "epoch": 0.13653333333333334, "eval_bleu": 15.07902042115171, "eval_loss": 0.09937527775764465, "eval_runtime": 2.769, "eval_samples_per_second": 11.557, "eval_steps_per_second": 1.445, "step": 2112 }, { "epoch": 0.1365979797979798, "grad_norm": 0.11082398891448975, "learning_rate": 0.00019936944226043354, "loss": 0.1164, "step": 2113 }, { "epoch": 0.13666262626262626, "grad_norm": 0.09152469038963318, "learning_rate": 0.000199368675266991, "loss": 0.1022, "step": 2114 }, { "epoch": 0.13672727272727273, "grad_norm": 0.07528580725193024, "learning_rate": 0.00019936790780883406, "loss": 0.071, "step": 2115 }, { "epoch": 0.1367919191919192, "grad_norm": 0.08660375326871872, "learning_rate": 0.00019936713988596627, "loss": 0.1003, "step": 2116 }, { "epoch": 0.13685656565656565, "grad_norm": 0.10427609086036682, "learning_rate": 0.0001993663714983912, "loss": 0.118, "step": 2117 }, { "epoch": 0.1369212121212121, "grad_norm": 0.0905638113617897, "learning_rate": 
0.00019936560264611247, "loss": 0.1005, "step": 2118 }, { "epoch": 0.13698585858585857, "grad_norm": 0.08334964513778687, "learning_rate": 0.00019936483332913361, "loss": 0.1102, "step": 2119 }, { "epoch": 0.13705050505050506, "grad_norm": 0.07560577988624573, "learning_rate": 0.00019936406354745834, "loss": 0.0834, "step": 2120 }, { "epoch": 0.13711515151515152, "grad_norm": 0.08507279306650162, "learning_rate": 0.00019936329330109018, "loss": 0.0962, "step": 2121 }, { "epoch": 0.13717979797979798, "grad_norm": 0.08278714865446091, "learning_rate": 0.00019936252259003274, "loss": 0.1054, "step": 2122 }, { "epoch": 0.13724444444444445, "grad_norm": 0.08289074897766113, "learning_rate": 0.00019936175141428963, "loss": 0.1094, "step": 2123 }, { "epoch": 0.1373090909090909, "grad_norm": 0.10430148988962173, "learning_rate": 0.00019936097977386446, "loss": 0.1242, "step": 2124 }, { "epoch": 0.13737373737373737, "grad_norm": 0.09121454507112503, "learning_rate": 0.00019936020766876084, "loss": 0.1315, "step": 2125 }, { "epoch": 0.13743838383838383, "grad_norm": 0.07568488270044327, "learning_rate": 0.00019935943509898237, "loss": 0.0796, "step": 2126 }, { "epoch": 0.1375030303030303, "grad_norm": 0.09529861062765121, "learning_rate": 0.00019935866206453266, "loss": 0.1151, "step": 2127 }, { "epoch": 0.13756767676767676, "grad_norm": 0.0712660551071167, "learning_rate": 0.00019935788856541536, "loss": 0.0755, "step": 2128 }, { "epoch": 0.13756767676767676, "eval_bleu": 12.6309035502833, "eval_loss": 0.09649898111820221, "eval_runtime": 2.6672, "eval_samples_per_second": 11.998, "eval_steps_per_second": 1.5, "step": 2128 }, { "epoch": 0.13763232323232324, "grad_norm": 0.09537266939878464, "learning_rate": 0.00019935711460163403, "loss": 0.1132, "step": 2129 }, { "epoch": 0.1376969696969697, "grad_norm": 0.08489110320806503, "learning_rate": 0.00019935634017319233, "loss": 0.0955, "step": 2130 }, { "epoch": 0.13776161616161617, "grad_norm": 0.14424313604831696, 
"learning_rate": 0.00019935556528009388, "loss": 0.1255, "step": 2131 }, { "epoch": 0.13782626262626263, "grad_norm": 0.08503853529691696, "learning_rate": 0.00019935478992234233, "loss": 0.0932, "step": 2132 }, { "epoch": 0.1378909090909091, "grad_norm": 0.08866728097200394, "learning_rate": 0.00019935401409994124, "loss": 0.0993, "step": 2133 }, { "epoch": 0.13795555555555555, "grad_norm": 0.08791413903236389, "learning_rate": 0.00019935323781289426, "loss": 0.0996, "step": 2134 }, { "epoch": 0.13802020202020202, "grad_norm": 0.08543447405099869, "learning_rate": 0.00019935246106120506, "loss": 0.0868, "step": 2135 }, { "epoch": 0.13808484848484848, "grad_norm": 0.0702652856707573, "learning_rate": 0.00019935168384487725, "loss": 0.0817, "step": 2136 }, { "epoch": 0.13814949494949494, "grad_norm": 0.08029930293560028, "learning_rate": 0.0001993509061639144, "loss": 0.0967, "step": 2137 }, { "epoch": 0.1382141414141414, "grad_norm": 0.08459579199552536, "learning_rate": 0.00019935012801832028, "loss": 0.1036, "step": 2138 }, { "epoch": 0.1382787878787879, "grad_norm": 0.07462736964225769, "learning_rate": 0.0001993493494080984, "loss": 0.0785, "step": 2139 }, { "epoch": 0.13834343434343435, "grad_norm": 0.0938514843583107, "learning_rate": 0.00019934857033325248, "loss": 0.0974, "step": 2140 }, { "epoch": 0.1384080808080808, "grad_norm": 0.07491198927164078, "learning_rate": 0.00019934779079378617, "loss": 0.0917, "step": 2141 }, { "epoch": 0.13847272727272728, "grad_norm": 0.10061255097389221, "learning_rate": 0.00019934701078970303, "loss": 0.1067, "step": 2142 }, { "epoch": 0.13853737373737374, "grad_norm": 0.0729975551366806, "learning_rate": 0.0001993462303210068, "loss": 0.0829, "step": 2143 }, { "epoch": 0.1386020202020202, "grad_norm": 0.09039177000522614, "learning_rate": 0.0001993454493877011, "loss": 0.0996, "step": 2144 }, { "epoch": 0.1386020202020202, "eval_bleu": 13.316991181706612, "eval_loss": 0.098295658826828, "eval_runtime": 2.6935, 
"eval_samples_per_second": 11.88, "eval_steps_per_second": 1.485, "step": 2144 }, { "epoch": 0.13866666666666666, "grad_norm": 0.08515696227550507, "learning_rate": 0.00019934466798978955, "loss": 0.0944, "step": 2145 }, { "epoch": 0.13873131313131312, "grad_norm": 0.08764570951461792, "learning_rate": 0.00019934388612727585, "loss": 0.1015, "step": 2146 }, { "epoch": 0.13879595959595958, "grad_norm": 0.09567011892795563, "learning_rate": 0.00019934310380016363, "loss": 0.1082, "step": 2147 }, { "epoch": 0.13886060606060607, "grad_norm": 0.0907915011048317, "learning_rate": 0.00019934232100845655, "loss": 0.0934, "step": 2148 }, { "epoch": 0.13892525252525253, "grad_norm": 0.07852301001548767, "learning_rate": 0.0001993415377521583, "loss": 0.0948, "step": 2149 }, { "epoch": 0.138989898989899, "grad_norm": 0.09274768084287643, "learning_rate": 0.00019934075403127248, "loss": 0.1176, "step": 2150 }, { "epoch": 0.13905454545454546, "grad_norm": 0.07157626748085022, "learning_rate": 0.00019933996984580283, "loss": 0.0847, "step": 2151 }, { "epoch": 0.13911919191919192, "grad_norm": 0.07254094630479813, "learning_rate": 0.00019933918519575298, "loss": 0.0767, "step": 2152 }, { "epoch": 0.13918383838383838, "grad_norm": 0.07105102390050888, "learning_rate": 0.0001993384000811266, "loss": 0.0853, "step": 2153 }, { "epoch": 0.13924848484848484, "grad_norm": 0.09649059176445007, "learning_rate": 0.00019933761450192735, "loss": 0.0951, "step": 2154 }, { "epoch": 0.1393131313131313, "grad_norm": 0.09500347077846527, "learning_rate": 0.00019933682845815892, "loss": 0.1087, "step": 2155 }, { "epoch": 0.13937777777777777, "grad_norm": 0.06282282620668411, "learning_rate": 0.00019933604194982495, "loss": 0.0694, "step": 2156 }, { "epoch": 0.13944242424242423, "grad_norm": 0.08335459977388382, "learning_rate": 0.00019933525497692922, "loss": 0.093, "step": 2157 }, { "epoch": 0.13950707070707072, "grad_norm": 0.0766918733716011, "learning_rate": 0.0001993344675394753, "loss": 
0.0796, "step": 2158 }, { "epoch": 0.13957171717171718, "grad_norm": 0.07622380554676056, "learning_rate": 0.0001993336796374669, "loss": 0.0776, "step": 2159 }, { "epoch": 0.13963636363636364, "grad_norm": 0.08085879683494568, "learning_rate": 0.00019933289127090778, "loss": 0.0835, "step": 2160 }, { "epoch": 0.13963636363636364, "eval_bleu": 15.763808862362175, "eval_loss": 0.09635349363088608, "eval_runtime": 2.765, "eval_samples_per_second": 11.573, "eval_steps_per_second": 1.447, "step": 2160 }, { "epoch": 0.1397010101010101, "grad_norm": 0.08228936046361923, "learning_rate": 0.00019933210243980152, "loss": 0.0802, "step": 2161 }, { "epoch": 0.13976565656565657, "grad_norm": 0.10941604524850845, "learning_rate": 0.00019933131314415188, "loss": 0.1319, "step": 2162 }, { "epoch": 0.13983030303030303, "grad_norm": 0.08717752248048782, "learning_rate": 0.0001993305233839625, "loss": 0.0875, "step": 2163 }, { "epoch": 0.1398949494949495, "grad_norm": 0.08118139207363129, "learning_rate": 0.0001993297331592371, "loss": 0.0942, "step": 2164 }, { "epoch": 0.13995959595959595, "grad_norm": 0.08656337857246399, "learning_rate": 0.0001993289424699794, "loss": 0.1041, "step": 2165 }, { "epoch": 0.1400242424242424, "grad_norm": 0.09879984706640244, "learning_rate": 0.00019932815131619306, "loss": 0.0999, "step": 2166 }, { "epoch": 0.1400888888888889, "grad_norm": 0.08160009980201721, "learning_rate": 0.0001993273596978818, "loss": 0.0891, "step": 2167 }, { "epoch": 0.14015353535353536, "grad_norm": 0.07659734785556793, "learning_rate": 0.0001993265676150493, "loss": 0.0886, "step": 2168 }, { "epoch": 0.14021818181818183, "grad_norm": 0.07596676796674728, "learning_rate": 0.0001993257750676993, "loss": 0.087, "step": 2169 }, { "epoch": 0.1402828282828283, "grad_norm": 0.08992719650268555, "learning_rate": 0.0001993249820558355, "loss": 0.094, "step": 2170 }, { "epoch": 0.14034747474747475, "grad_norm": 0.07521230727434158, "learning_rate": 0.00019932418857946153, "loss": 
0.096, "step": 2171 }, { "epoch": 0.1404121212121212, "grad_norm": 0.08257531374692917, "learning_rate": 0.0001993233946385812, "loss": 0.0947, "step": 2172 }, { "epoch": 0.14047676767676767, "grad_norm": 0.07799696177244186, "learning_rate": 0.00019932260023319823, "loss": 0.0807, "step": 2173 }, { "epoch": 0.14054141414141413, "grad_norm": 0.08241602033376694, "learning_rate": 0.00019932180536331624, "loss": 0.104, "step": 2174 }, { "epoch": 0.1406060606060606, "grad_norm": 0.0690343827009201, "learning_rate": 0.00019932101002893902, "loss": 0.075, "step": 2175 }, { "epoch": 0.14067070707070706, "grad_norm": 0.10477066040039062, "learning_rate": 0.00019932021423007027, "loss": 0.1208, "step": 2176 }, { "epoch": 0.14067070707070706, "eval_bleu": 12.266963111138233, "eval_loss": 0.09701789915561676, "eval_runtime": 2.7285, "eval_samples_per_second": 11.728, "eval_steps_per_second": 1.466, "step": 2176 }, { "epoch": 0.14073535353535355, "grad_norm": 0.07689818739891052, "learning_rate": 0.00019931941796671372, "loss": 0.096, "step": 2177 }, { "epoch": 0.1408, "grad_norm": 0.079193614423275, "learning_rate": 0.0001993186212388731, "loss": 0.1006, "step": 2178 }, { "epoch": 0.14086464646464647, "grad_norm": 0.0728984996676445, "learning_rate": 0.0001993178240465521, "loss": 0.0764, "step": 2179 }, { "epoch": 0.14092929292929293, "grad_norm": 0.1090875118970871, "learning_rate": 0.00019931702638975447, "loss": 0.0846, "step": 2180 }, { "epoch": 0.1409939393939394, "grad_norm": 0.09084334969520569, "learning_rate": 0.00019931622826848395, "loss": 0.1047, "step": 2181 }, { "epoch": 0.14105858585858586, "grad_norm": 0.0822165384888649, "learning_rate": 0.00019931542968274426, "loss": 0.0981, "step": 2182 }, { "epoch": 0.14112323232323232, "grad_norm": 0.08956528455018997, "learning_rate": 0.00019931463063253913, "loss": 0.1027, "step": 2183 }, { "epoch": 0.14118787878787878, "grad_norm": 0.0748949944972992, "learning_rate": 0.0001993138311178723, "loss": 0.0794, "step": 
2184 }, { "epoch": 0.14125252525252524, "grad_norm": 0.07600361853837967, "learning_rate": 0.00019931303113874754, "loss": 0.0759, "step": 2185 }, { "epoch": 0.14131717171717173, "grad_norm": 0.07164584845304489, "learning_rate": 0.00019931223069516855, "loss": 0.072, "step": 2186 }, { "epoch": 0.1413818181818182, "grad_norm": 0.10086806118488312, "learning_rate": 0.0001993114297871391, "loss": 0.1071, "step": 2187 }, { "epoch": 0.14144646464646465, "grad_norm": 0.08601544052362442, "learning_rate": 0.00019931062841466293, "loss": 0.1053, "step": 2188 }, { "epoch": 0.14151111111111112, "grad_norm": 0.08842509984970093, "learning_rate": 0.00019930982657774378, "loss": 0.0925, "step": 2189 }, { "epoch": 0.14157575757575758, "grad_norm": 0.0895439088344574, "learning_rate": 0.00019930902427638537, "loss": 0.1041, "step": 2190 }, { "epoch": 0.14164040404040404, "grad_norm": 0.08682627975940704, "learning_rate": 0.0001993082215105915, "loss": 0.0983, "step": 2191 }, { "epoch": 0.1417050505050505, "grad_norm": 0.09832212328910828, "learning_rate": 0.00019930741828036593, "loss": 0.0895, "step": 2192 }, { "epoch": 0.1417050505050505, "eval_bleu": 15.835709145695853, "eval_loss": 0.09852661192417145, "eval_runtime": 2.6642, "eval_samples_per_second": 12.011, "eval_steps_per_second": 1.501, "step": 2192 }, { "epoch": 0.14176969696969696, "grad_norm": 0.07735937833786011, "learning_rate": 0.00019930661458571238, "loss": 0.0697, "step": 2193 }, { "epoch": 0.14183434343434342, "grad_norm": 0.0891536995768547, "learning_rate": 0.00019930581042663465, "loss": 0.0907, "step": 2194 }, { "epoch": 0.14189898989898989, "grad_norm": 0.09226737171411514, "learning_rate": 0.00019930500580313642, "loss": 0.0965, "step": 2195 }, { "epoch": 0.14196363636363638, "grad_norm": 0.09302262216806412, "learning_rate": 0.00019930420071522154, "loss": 0.1054, "step": 2196 }, { "epoch": 0.14202828282828284, "grad_norm": 0.086602583527565, "learning_rate": 0.00019930339516289374, "loss": 0.0949, 
"step": 2197 }, { "epoch": 0.1420929292929293, "grad_norm": 0.09134562313556671, "learning_rate": 0.0001993025891461568, "loss": 0.0969, "step": 2198 }, { "epoch": 0.14215757575757576, "grad_norm": 0.08391424268484116, "learning_rate": 0.00019930178266501446, "loss": 0.1002, "step": 2199 }, { "epoch": 0.14222222222222222, "grad_norm": 0.0886627584695816, "learning_rate": 0.00019930097571947052, "loss": 0.1011, "step": 2200 }, { "epoch": 0.14228686868686868, "grad_norm": 0.0875149518251419, "learning_rate": 0.00019930016830952877, "loss": 0.1038, "step": 2201 }, { "epoch": 0.14235151515151515, "grad_norm": 0.08375229686498642, "learning_rate": 0.0001992993604351929, "loss": 0.0983, "step": 2202 }, { "epoch": 0.1424161616161616, "grad_norm": 0.07363390177488327, "learning_rate": 0.00019929855209646678, "loss": 0.0881, "step": 2203 }, { "epoch": 0.14248080808080807, "grad_norm": 0.07777576148509979, "learning_rate": 0.00019929774329335417, "loss": 0.0937, "step": 2204 }, { "epoch": 0.14254545454545456, "grad_norm": 0.07619834691286087, "learning_rate": 0.00019929693402585884, "loss": 0.0966, "step": 2205 }, { "epoch": 0.14261010101010102, "grad_norm": 0.07290953397750854, "learning_rate": 0.00019929612429398457, "loss": 0.0809, "step": 2206 }, { "epoch": 0.14267474747474748, "grad_norm": 0.0811164379119873, "learning_rate": 0.00019929531409773518, "loss": 0.1013, "step": 2207 }, { "epoch": 0.14273939393939394, "grad_norm": 0.08142521232366562, "learning_rate": 0.00019929450343711438, "loss": 0.1118, "step": 2208 }, { "epoch": 0.14273939393939394, "eval_bleu": 14.90355245620719, "eval_loss": 0.09795285761356354, "eval_runtime": 2.9077, "eval_samples_per_second": 11.005, "eval_steps_per_second": 1.376, "step": 2208 }, { "epoch": 0.1428040404040404, "grad_norm": 0.0814114362001419, "learning_rate": 0.00019929369231212605, "loss": 0.0887, "step": 2209 }, { "epoch": 0.14286868686868687, "grad_norm": 0.08488152921199799, "learning_rate": 0.00019929288072277396, "loss": 
0.0995, "step": 2210 }, { "epoch": 0.14293333333333333, "grad_norm": 0.07185027003288269, "learning_rate": 0.00019929206866906184, "loss": 0.0773, "step": 2211 }, { "epoch": 0.1429979797979798, "grad_norm": 0.07965119928121567, "learning_rate": 0.00019929125615099355, "loss": 0.0997, "step": 2212 }, { "epoch": 0.14306262626262625, "grad_norm": 0.08428077399730682, "learning_rate": 0.00019929044316857294, "loss": 0.1, "step": 2213 }, { "epoch": 0.14312727272727271, "grad_norm": 0.07027176767587662, "learning_rate": 0.00019928962972180368, "loss": 0.0781, "step": 2214 }, { "epoch": 0.1431919191919192, "grad_norm": 0.07408403605222702, "learning_rate": 0.00019928881581068967, "loss": 0.0838, "step": 2215 }, { "epoch": 0.14325656565656567, "grad_norm": 0.09210987389087677, "learning_rate": 0.0001992880014352347, "loss": 0.0934, "step": 2216 }, { "epoch": 0.14332121212121213, "grad_norm": 0.08894040435552597, "learning_rate": 0.0001992871865954426, "loss": 0.1007, "step": 2217 }, { "epoch": 0.1433858585858586, "grad_norm": 0.0793905034661293, "learning_rate": 0.0001992863712913171, "loss": 0.0999, "step": 2218 }, { "epoch": 0.14345050505050505, "grad_norm": 0.0838514119386673, "learning_rate": 0.0001992855555228621, "loss": 0.102, "step": 2219 }, { "epoch": 0.1435151515151515, "grad_norm": 0.08446792513132095, "learning_rate": 0.00019928473929008137, "loss": 0.0974, "step": 2220 }, { "epoch": 0.14357979797979797, "grad_norm": 0.07814288884401321, "learning_rate": 0.00019928392259297873, "loss": 0.1031, "step": 2221 }, { "epoch": 0.14364444444444444, "grad_norm": 0.08398514240980148, "learning_rate": 0.00019928310543155804, "loss": 0.0833, "step": 2222 }, { "epoch": 0.1437090909090909, "grad_norm": 0.0768071711063385, "learning_rate": 0.00019928228780582305, "loss": 0.0853, "step": 2223 }, { "epoch": 0.1437737373737374, "grad_norm": 0.10781671851873398, "learning_rate": 0.00019928146971577763, "loss": 0.1233, "step": 2224 }, { "epoch": 0.1437737373737374, "eval_bleu": 
16.856761244755475, "eval_loss": 0.09815460443496704, "eval_runtime": 2.7804, "eval_samples_per_second": 11.509, "eval_steps_per_second": 1.439, "step": 2224 }, { "epoch": 0.14383838383838385, "grad_norm": 0.08463025838136673, "learning_rate": 0.0001992806511614256, "loss": 0.0993, "step": 2225 }, { "epoch": 0.1439030303030303, "grad_norm": 0.07293100655078888, "learning_rate": 0.0001992798321427708, "loss": 0.0821, "step": 2226 }, { "epoch": 0.14396767676767677, "grad_norm": 0.08723324537277222, "learning_rate": 0.000199279012659817, "loss": 0.1043, "step": 2227 }, { "epoch": 0.14403232323232323, "grad_norm": 0.07860496640205383, "learning_rate": 0.00019927819271256812, "loss": 0.0872, "step": 2228 }, { "epoch": 0.1440969696969697, "grad_norm": 0.0815499946475029, "learning_rate": 0.00019927737230102796, "loss": 0.0895, "step": 2229 }, { "epoch": 0.14416161616161616, "grad_norm": 0.0952635332942009, "learning_rate": 0.00019927655142520034, "loss": 0.1085, "step": 2230 }, { "epoch": 0.14422626262626262, "grad_norm": 0.07156683504581451, "learning_rate": 0.0001992757300850891, "loss": 0.085, "step": 2231 }, { "epoch": 0.14429090909090908, "grad_norm": 0.08956246823072433, "learning_rate": 0.0001992749082806981, "loss": 0.1136, "step": 2232 }, { "epoch": 0.14435555555555554, "grad_norm": 0.09046854823827744, "learning_rate": 0.00019927408601203117, "loss": 0.1046, "step": 2233 }, { "epoch": 0.14442020202020203, "grad_norm": 0.10297202318906784, "learning_rate": 0.00019927326327909216, "loss": 0.0935, "step": 2234 }, { "epoch": 0.1444848484848485, "grad_norm": 0.0739424079656601, "learning_rate": 0.00019927244008188493, "loss": 0.0953, "step": 2235 }, { "epoch": 0.14454949494949496, "grad_norm": 0.069561667740345, "learning_rate": 0.00019927161642041327, "loss": 0.0699, "step": 2236 }, { "epoch": 0.14461414141414142, "grad_norm": 0.09780288487672806, "learning_rate": 0.00019927079229468112, "loss": 0.1027, "step": 2237 }, { "epoch": 0.14467878787878788, "grad_norm": 
0.0815785750746727, "learning_rate": 0.0001992699677046923, "loss": 0.1058, "step": 2238 }, { "epoch": 0.14474343434343434, "grad_norm": 0.085743747651577, "learning_rate": 0.00019926914265045063, "loss": 0.0974, "step": 2239 }, { "epoch": 0.1448080808080808, "grad_norm": 0.0680752694606781, "learning_rate": 0.00019926831713196002, "loss": 0.075, "step": 2240 }, { "epoch": 0.1448080808080808, "eval_bleu": 14.144843246403564, "eval_loss": 0.09855931252241135, "eval_runtime": 2.7165, "eval_samples_per_second": 11.78, "eval_steps_per_second": 1.472, "step": 2240 }, { "epoch": 0.14487272727272726, "grad_norm": 0.09069699794054031, "learning_rate": 0.0001992674911492243, "loss": 0.1214, "step": 2241 }, { "epoch": 0.14493737373737373, "grad_norm": 0.09225036203861237, "learning_rate": 0.00019926666470224732, "loss": 0.1023, "step": 2242 }, { "epoch": 0.14500202020202022, "grad_norm": 0.08832424879074097, "learning_rate": 0.00019926583779103298, "loss": 0.0962, "step": 2243 }, { "epoch": 0.14506666666666668, "grad_norm": 0.09969259053468704, "learning_rate": 0.00019926501041558515, "loss": 0.0887, "step": 2244 }, { "epoch": 0.14513131313131314, "grad_norm": 0.0939222201704979, "learning_rate": 0.00019926418257590768, "loss": 0.1039, "step": 2245 }, { "epoch": 0.1451959595959596, "grad_norm": 0.07626113295555115, "learning_rate": 0.00019926335427200443, "loss": 0.0905, "step": 2246 }, { "epoch": 0.14526060606060606, "grad_norm": 0.08937086164951324, "learning_rate": 0.00019926252550387928, "loss": 0.0928, "step": 2247 }, { "epoch": 0.14532525252525252, "grad_norm": 0.09782461822032928, "learning_rate": 0.00019926169627153614, "loss": 0.1179, "step": 2248 }, { "epoch": 0.14538989898989899, "grad_norm": 0.10443700850009918, "learning_rate": 0.00019926086657497883, "loss": 0.1022, "step": 2249 }, { "epoch": 0.14545454545454545, "grad_norm": 0.09932852536439896, "learning_rate": 0.00019926003641421129, "loss": 0.13, "step": 2250 }, { "epoch": 0.1455191919191919, "grad_norm": 
0.08329958468675613, "learning_rate": 0.00019925920578923737, "loss": 0.0886, "step": 2251 }, { "epoch": 0.14558383838383837, "grad_norm": 0.07198482751846313, "learning_rate": 0.00019925837470006097, "loss": 0.0782, "step": 2252 }, { "epoch": 0.14564848484848486, "grad_norm": 0.0805986225605011, "learning_rate": 0.00019925754314668593, "loss": 0.0938, "step": 2253 }, { "epoch": 0.14571313131313132, "grad_norm": 0.08215127885341644, "learning_rate": 0.00019925671112911618, "loss": 0.0893, "step": 2254 }, { "epoch": 0.14577777777777778, "grad_norm": 0.09303683042526245, "learning_rate": 0.00019925587864735565, "loss": 0.1279, "step": 2255 }, { "epoch": 0.14584242424242425, "grad_norm": 0.07702606171369553, "learning_rate": 0.00019925504570140814, "loss": 0.0876, "step": 2256 }, { "epoch": 0.14584242424242425, "eval_bleu": 14.761206778905008, "eval_loss": 0.09789718687534332, "eval_runtime": 2.7354, "eval_samples_per_second": 11.698, "eval_steps_per_second": 1.462, "step": 2256 }, { "epoch": 0.1459070707070707, "grad_norm": 0.07773461937904358, "learning_rate": 0.00019925421229127763, "loss": 0.0904, "step": 2257 }, { "epoch": 0.14597171717171717, "grad_norm": 0.09010069072246552, "learning_rate": 0.00019925337841696797, "loss": 0.1151, "step": 2258 }, { "epoch": 0.14603636363636363, "grad_norm": 0.09520852565765381, "learning_rate": 0.00019925254407848305, "loss": 0.1268, "step": 2259 }, { "epoch": 0.1461010101010101, "grad_norm": 0.08797336369752884, "learning_rate": 0.00019925170927582683, "loss": 0.1135, "step": 2260 }, { "epoch": 0.14616565656565655, "grad_norm": 0.09137532860040665, "learning_rate": 0.00019925087400900316, "loss": 0.0998, "step": 2261 }, { "epoch": 0.14623030303030304, "grad_norm": 0.08424366265535355, "learning_rate": 0.00019925003827801595, "loss": 0.0994, "step": 2262 }, { "epoch": 0.1462949494949495, "grad_norm": 0.09606316685676575, "learning_rate": 0.00019924920208286913, "loss": 0.1081, "step": 2263 }, { "epoch": 0.14635959595959597, 
"grad_norm": 0.08903534710407257, "learning_rate": 0.0001992483654235666, "loss": 0.1084, "step": 2264 }, { "epoch": 0.14642424242424243, "grad_norm": 0.08161941170692444, "learning_rate": 0.0001992475283001123, "loss": 0.0923, "step": 2265 }, { "epoch": 0.1464888888888889, "grad_norm": 0.08217356353998184, "learning_rate": 0.0001992466907125101, "loss": 0.0992, "step": 2266 }, { "epoch": 0.14655353535353535, "grad_norm": 0.0858437567949295, "learning_rate": 0.00019924585266076396, "loss": 0.1012, "step": 2267 }, { "epoch": 0.14661818181818181, "grad_norm": 0.0778830498456955, "learning_rate": 0.00019924501414487775, "loss": 0.0952, "step": 2268 }, { "epoch": 0.14668282828282828, "grad_norm": 0.08928383886814117, "learning_rate": 0.00019924417516485542, "loss": 0.0894, "step": 2269 }, { "epoch": 0.14674747474747474, "grad_norm": 0.0894358828663826, "learning_rate": 0.0001992433357207009, "loss": 0.1123, "step": 2270 }, { "epoch": 0.1468121212121212, "grad_norm": 0.08054732531309128, "learning_rate": 0.00019924249581241811, "loss": 0.0969, "step": 2271 }, { "epoch": 0.1468767676767677, "grad_norm": 0.08566389232873917, "learning_rate": 0.00019924165544001098, "loss": 0.105, "step": 2272 }, { "epoch": 0.1468767676767677, "eval_bleu": 17.043232217430802, "eval_loss": 0.09771702438592911, "eval_runtime": 2.6978, "eval_samples_per_second": 11.862, "eval_steps_per_second": 1.483, "step": 2272 }, { "epoch": 0.14694141414141415, "grad_norm": 0.10433823615312576, "learning_rate": 0.00019924081460348343, "loss": 0.087, "step": 2273 }, { "epoch": 0.1470060606060606, "grad_norm": 0.0779954344034195, "learning_rate": 0.0001992399733028394, "loss": 0.085, "step": 2274 }, { "epoch": 0.14707070707070707, "grad_norm": 0.14118874073028564, "learning_rate": 0.00019923913153808282, "loss": 0.1357, "step": 2275 }, { "epoch": 0.14713535353535354, "grad_norm": 0.08732448518276215, "learning_rate": 0.00019923828930921763, "loss": 0.1043, "step": 2276 }, { "epoch": 0.1472, "grad_norm": 
0.07829619199037552, "learning_rate": 0.00019923744661624776, "loss": 0.0777, "step": 2277 }, { "epoch": 0.14726464646464646, "grad_norm": 0.09334984421730042, "learning_rate": 0.00019923660345917717, "loss": 0.0994, "step": 2278 }, { "epoch": 0.14732929292929292, "grad_norm": 0.09675560146570206, "learning_rate": 0.00019923575983800979, "loss": 0.1274, "step": 2279 }, { "epoch": 0.14739393939393938, "grad_norm": 0.09379332512617111, "learning_rate": 0.00019923491575274957, "loss": 0.1112, "step": 2280 }, { "epoch": 0.14745858585858587, "grad_norm": 0.17280007898807526, "learning_rate": 0.00019923407120340047, "loss": 0.1035, "step": 2281 }, { "epoch": 0.14752323232323233, "grad_norm": 0.07100223749876022, "learning_rate": 0.0001992332261899664, "loss": 0.0787, "step": 2282 }, { "epoch": 0.1475878787878788, "grad_norm": 0.07255946844816208, "learning_rate": 0.00019923238071245134, "loss": 0.0811, "step": 2283 }, { "epoch": 0.14765252525252526, "grad_norm": 0.07578455656766891, "learning_rate": 0.00019923153477085929, "loss": 0.0857, "step": 2284 }, { "epoch": 0.14771717171717172, "grad_norm": 0.07674919068813324, "learning_rate": 0.00019923068836519408, "loss": 0.084, "step": 2285 }, { "epoch": 0.14778181818181818, "grad_norm": 0.07491938769817352, "learning_rate": 0.00019922984149545978, "loss": 0.093, "step": 2286 }, { "epoch": 0.14784646464646464, "grad_norm": 0.08271020650863647, "learning_rate": 0.00019922899416166033, "loss": 0.092, "step": 2287 }, { "epoch": 0.1479111111111111, "grad_norm": 0.06983164697885513, "learning_rate": 0.00019922814636379969, "loss": 0.0747, "step": 2288 }, { "epoch": 0.1479111111111111, "eval_bleu": 15.892574320268269, "eval_loss": 0.09509193897247314, "eval_runtime": 2.8163, "eval_samples_per_second": 11.362, "eval_steps_per_second": 1.42, "step": 2288 }, { "epoch": 0.14797575757575757, "grad_norm": 0.07476863265037537, "learning_rate": 0.00019922729810188176, "loss": 0.0874, "step": 2289 }, { "epoch": 0.14804040404040403, 
"grad_norm": 0.07053195685148239, "learning_rate": 0.00019922644937591058, "loss": 0.0796, "step": 2290 }, { "epoch": 0.14810505050505052, "grad_norm": 0.07486848533153534, "learning_rate": 0.00019922560018589011, "loss": 0.088, "step": 2291 }, { "epoch": 0.14816969696969698, "grad_norm": 0.07849009335041046, "learning_rate": 0.00019922475053182431, "loss": 0.0831, "step": 2292 }, { "epoch": 0.14823434343434344, "grad_norm": 0.07709590345621109, "learning_rate": 0.00019922390041371716, "loss": 0.0928, "step": 2293 }, { "epoch": 0.1482989898989899, "grad_norm": 0.08308565616607666, "learning_rate": 0.00019922304983157262, "loss": 0.0841, "step": 2294 }, { "epoch": 0.14836363636363636, "grad_norm": 0.09292647242546082, "learning_rate": 0.0001992221987853947, "loss": 0.0859, "step": 2295 }, { "epoch": 0.14842828282828283, "grad_norm": 0.0787341445684433, "learning_rate": 0.00019922134727518733, "loss": 0.0874, "step": 2296 }, { "epoch": 0.1484929292929293, "grad_norm": 0.07518694549798965, "learning_rate": 0.00019922049530095455, "loss": 0.084, "step": 2297 }, { "epoch": 0.14855757575757575, "grad_norm": 0.07190399616956711, "learning_rate": 0.0001992196428627003, "loss": 0.0723, "step": 2298 }, { "epoch": 0.1486222222222222, "grad_norm": 0.10043215751647949, "learning_rate": 0.00019921878996042856, "loss": 0.0991, "step": 2299 }, { "epoch": 0.1486868686868687, "grad_norm": 0.0940486490726471, "learning_rate": 0.00019921793659414337, "loss": 0.111, "step": 2300 }, { "epoch": 0.14875151515151516, "grad_norm": 0.12624093890190125, "learning_rate": 0.00019921708276384869, "loss": 0.1139, "step": 2301 }, { "epoch": 0.14881616161616162, "grad_norm": 0.08641842752695084, "learning_rate": 0.0001992162284695485, "loss": 0.1203, "step": 2302 }, { "epoch": 0.14888080808080809, "grad_norm": 0.07899843901395798, "learning_rate": 0.0001992153737112468, "loss": 0.0898, "step": 2303 }, { "epoch": 0.14894545454545455, "grad_norm": 0.10435502976179123, "learning_rate": 
0.00019921451848894765, "loss": 0.0884, "step": 2304 }, { "epoch": 0.14894545454545455, "eval_bleu": 14.088385449194126, "eval_loss": 0.09721770882606506, "eval_runtime": 2.812, "eval_samples_per_second": 11.38, "eval_steps_per_second": 1.422, "step": 2304 }, { "epoch": 0.149010101010101, "grad_norm": 0.08355221152305603, "learning_rate": 0.00019921366280265494, "loss": 0.0821, "step": 2305 }, { "epoch": 0.14907474747474747, "grad_norm": 0.06774589419364929, "learning_rate": 0.00019921280665237276, "loss": 0.0763, "step": 2306 }, { "epoch": 0.14913939393939393, "grad_norm": 0.08457525819540024, "learning_rate": 0.00019921195003810507, "loss": 0.1103, "step": 2307 }, { "epoch": 0.1492040404040404, "grad_norm": 0.0811379924416542, "learning_rate": 0.0001992110929598559, "loss": 0.0844, "step": 2308 }, { "epoch": 0.14926868686868686, "grad_norm": 0.08011262118816376, "learning_rate": 0.00019921023541762925, "loss": 0.1078, "step": 2309 }, { "epoch": 0.14933333333333335, "grad_norm": 0.10300824791193008, "learning_rate": 0.0001992093774114291, "loss": 0.1033, "step": 2310 }, { "epoch": 0.1493979797979798, "grad_norm": 0.0863417536020279, "learning_rate": 0.0001992085189412595, "loss": 0.1033, "step": 2311 }, { "epoch": 0.14946262626262627, "grad_norm": 0.10561800003051758, "learning_rate": 0.00019920766000712444, "loss": 0.1145, "step": 2312 }, { "epoch": 0.14952727272727273, "grad_norm": 0.07916280627250671, "learning_rate": 0.000199206800609028, "loss": 0.0875, "step": 2313 }, { "epoch": 0.1495919191919192, "grad_norm": 0.10506631433963776, "learning_rate": 0.0001992059407469741, "loss": 0.1229, "step": 2314 }, { "epoch": 0.14965656565656565, "grad_norm": 0.07248834520578384, "learning_rate": 0.00019920508042096683, "loss": 0.0755, "step": 2315 }, { "epoch": 0.14972121212121212, "grad_norm": 0.08458910137414932, "learning_rate": 0.0001992042196310102, "loss": 0.0989, "step": 2316 }, { "epoch": 0.14978585858585858, "grad_norm": 0.08612111210823059, "learning_rate": 
0.00019920335837710825, "loss": 0.0903, "step": 2317 }, { "epoch": 0.14985050505050504, "grad_norm": 0.08405128866434097, "learning_rate": 0.00019920249665926495, "loss": 0.1118, "step": 2318 }, { "epoch": 0.1499151515151515, "grad_norm": 0.07918649911880493, "learning_rate": 0.0001992016344774844, "loss": 0.1057, "step": 2319 }, { "epoch": 0.149979797979798, "grad_norm": 0.0907558798789978, "learning_rate": 0.0001992007718317706, "loss": 0.1175, "step": 2320 }, { "epoch": 0.149979797979798, "eval_bleu": 13.222980814557614, "eval_loss": 0.09897366166114807, "eval_runtime": 2.6835, "eval_samples_per_second": 11.925, "eval_steps_per_second": 1.491, "step": 2320 }, { "epoch": 0.15004444444444445, "grad_norm": 0.0756349265575409, "learning_rate": 0.00019919990872212755, "loss": 0.0882, "step": 2321 }, { "epoch": 0.15010909090909091, "grad_norm": 0.08692969381809235, "learning_rate": 0.00019919904514855934, "loss": 0.0997, "step": 2322 }, { "epoch": 0.15017373737373738, "grad_norm": 0.0826423391699791, "learning_rate": 0.00019919818111107, "loss": 0.0902, "step": 2323 }, { "epoch": 0.15023838383838384, "grad_norm": 0.10737424343824387, "learning_rate": 0.00019919731660966356, "loss": 0.0865, "step": 2324 }, { "epoch": 0.1503030303030303, "grad_norm": 0.07574409991502762, "learning_rate": 0.00019919645164434406, "loss": 0.0914, "step": 2325 }, { "epoch": 0.15036767676767676, "grad_norm": 0.08156408369541168, "learning_rate": 0.00019919558621511553, "loss": 0.1002, "step": 2326 }, { "epoch": 0.15043232323232322, "grad_norm": 0.08333850651979446, "learning_rate": 0.00019919472032198208, "loss": 0.0918, "step": 2327 }, { "epoch": 0.15049696969696968, "grad_norm": 0.08153364062309265, "learning_rate": 0.0001991938539649477, "loss": 0.0936, "step": 2328 }, { "epoch": 0.15056161616161617, "grad_norm": 0.08397586643695831, "learning_rate": 0.00019919298714401643, "loss": 0.0921, "step": 2329 }, { "epoch": 0.15062626262626264, "grad_norm": 0.07793214917182922, "learning_rate": 
0.0001991921198591924, "loss": 0.0929, "step": 2330 }, { "epoch": 0.1506909090909091, "grad_norm": 0.07486416399478912, "learning_rate": 0.00019919125211047958, "loss": 0.0972, "step": 2331 }, { "epoch": 0.15075555555555556, "grad_norm": 0.08046216517686844, "learning_rate": 0.00019919038389788207, "loss": 0.0935, "step": 2332 }, { "epoch": 0.15082020202020202, "grad_norm": 0.07738444954156876, "learning_rate": 0.00019918951522140393, "loss": 0.0887, "step": 2333 }, { "epoch": 0.15088484848484848, "grad_norm": 0.1287994086742401, "learning_rate": 0.00019918864608104922, "loss": 0.0929, "step": 2334 }, { "epoch": 0.15094949494949494, "grad_norm": 0.07731743156909943, "learning_rate": 0.000199187776476822, "loss": 0.0902, "step": 2335 }, { "epoch": 0.1510141414141414, "grad_norm": 0.080807164311409, "learning_rate": 0.00019918690640872636, "loss": 0.0942, "step": 2336 }, { "epoch": 0.1510141414141414, "eval_bleu": 15.973732924570047, "eval_loss": 0.09978494793176651, "eval_runtime": 2.7055, "eval_samples_per_second": 11.828, "eval_steps_per_second": 1.478, "step": 2336 }, { "epoch": 0.15107878787878787, "grad_norm": 0.07488775998353958, "learning_rate": 0.0001991860358767663, "loss": 0.0896, "step": 2337 }, { "epoch": 0.15114343434343433, "grad_norm": 0.0880126804113388, "learning_rate": 0.000199185164880946, "loss": 0.0995, "step": 2338 }, { "epoch": 0.15120808080808082, "grad_norm": 0.07517234236001968, "learning_rate": 0.00019918429342126944, "loss": 0.0842, "step": 2339 }, { "epoch": 0.15127272727272728, "grad_norm": 0.09195252507925034, "learning_rate": 0.00019918342149774073, "loss": 0.1023, "step": 2340 }, { "epoch": 0.15133737373737374, "grad_norm": 0.08316605538129807, "learning_rate": 0.00019918254911036398, "loss": 0.105, "step": 2341 }, { "epoch": 0.1514020202020202, "grad_norm": 0.08410564810037613, "learning_rate": 0.0001991816762591432, "loss": 0.0925, "step": 2342 }, { "epoch": 0.15146666666666667, "grad_norm": 0.08235607296228409, "learning_rate": 
0.00019918080294408253, "loss": 0.0881, "step": 2343 }, { "epoch": 0.15153131313131313, "grad_norm": 0.06468655169010162, "learning_rate": 0.00019917992916518602, "loss": 0.0826, "step": 2344 }, { "epoch": 0.1515959595959596, "grad_norm": 0.08290201425552368, "learning_rate": 0.0001991790549224578, "loss": 0.0997, "step": 2345 }, { "epoch": 0.15166060606060605, "grad_norm": 0.07575234770774841, "learning_rate": 0.00019917818021590188, "loss": 0.0949, "step": 2346 }, { "epoch": 0.1517252525252525, "grad_norm": 0.0716208666563034, "learning_rate": 0.00019917730504552243, "loss": 0.077, "step": 2347 }, { "epoch": 0.151789898989899, "grad_norm": 0.06823279708623886, "learning_rate": 0.0001991764294113235, "loss": 0.0904, "step": 2348 }, { "epoch": 0.15185454545454546, "grad_norm": 0.10471434891223907, "learning_rate": 0.00019917555331330918, "loss": 0.1055, "step": 2349 }, { "epoch": 0.15191919191919193, "grad_norm": 0.0712771788239479, "learning_rate": 0.00019917467675148362, "loss": 0.0892, "step": 2350 }, { "epoch": 0.1519838383838384, "grad_norm": 0.0834423080086708, "learning_rate": 0.00019917379972585086, "loss": 0.089, "step": 2351 }, { "epoch": 0.15204848484848485, "grad_norm": 0.07527291774749756, "learning_rate": 0.000199172922236415, "loss": 0.0957, "step": 2352 }, { "epoch": 0.15204848484848485, "eval_bleu": 13.92591718541453, "eval_loss": 0.09829393774271011, "eval_runtime": 2.7052, "eval_samples_per_second": 11.829, "eval_steps_per_second": 1.479, "step": 2352 }, { "epoch": 0.1521131313131313, "grad_norm": 0.0684351772069931, "learning_rate": 0.00019917204428318024, "loss": 0.0894, "step": 2353 }, { "epoch": 0.15217777777777777, "grad_norm": 0.07119576632976532, "learning_rate": 0.00019917116586615056, "loss": 0.0854, "step": 2354 }, { "epoch": 0.15224242424242423, "grad_norm": 0.0755360871553421, "learning_rate": 0.00019917028698533015, "loss": 0.0862, "step": 2355 }, { "epoch": 0.1523070707070707, "grad_norm": 0.07296937704086304, "learning_rate": 
0.00019916940764072306, "loss": 0.1027, "step": 2356 }, { "epoch": 0.15237171717171716, "grad_norm": 0.07805594801902771, "learning_rate": 0.00019916852783233345, "loss": 0.0986, "step": 2357 }, { "epoch": 0.15243636363636365, "grad_norm": 0.0746074840426445, "learning_rate": 0.00019916764756016544, "loss": 0.094, "step": 2358 }, { "epoch": 0.1525010101010101, "grad_norm": 0.0822427049279213, "learning_rate": 0.0001991667668242231, "loss": 0.1101, "step": 2359 }, { "epoch": 0.15256565656565657, "grad_norm": 0.08411096036434174, "learning_rate": 0.00019916588562451058, "loss": 0.0974, "step": 2360 }, { "epoch": 0.15263030303030303, "grad_norm": 0.06998459249734879, "learning_rate": 0.000199165003961032, "loss": 0.0864, "step": 2361 }, { "epoch": 0.1526949494949495, "grad_norm": 0.08847485482692719, "learning_rate": 0.0001991641218337915, "loss": 0.0968, "step": 2362 }, { "epoch": 0.15275959595959596, "grad_norm": 0.08485672622919083, "learning_rate": 0.00019916323924279317, "loss": 0.1038, "step": 2363 }, { "epoch": 0.15282424242424242, "grad_norm": 0.08982007950544357, "learning_rate": 0.00019916235618804115, "loss": 0.1092, "step": 2364 }, { "epoch": 0.15288888888888888, "grad_norm": 0.08455164730548859, "learning_rate": 0.00019916147266953958, "loss": 0.0912, "step": 2365 }, { "epoch": 0.15295353535353534, "grad_norm": 0.0744146853685379, "learning_rate": 0.00019916058868729258, "loss": 0.0874, "step": 2366 }, { "epoch": 0.15301818181818183, "grad_norm": 0.07572925835847855, "learning_rate": 0.0001991597042413043, "loss": 0.0921, "step": 2367 }, { "epoch": 0.1530828282828283, "grad_norm": 0.07488899677991867, "learning_rate": 0.00019915881933157883, "loss": 0.0956, "step": 2368 }, { "epoch": 0.1530828282828283, "eval_bleu": 13.510725812985678, "eval_loss": 0.09653285145759583, "eval_runtime": 2.8565, "eval_samples_per_second": 11.203, "eval_steps_per_second": 1.4, "step": 2368 }, { "epoch": 0.15314747474747475, "grad_norm": 0.07667747884988785, "learning_rate": 
0.00019915793395812036, "loss": 0.0975, "step": 2369 }, { "epoch": 0.15321212121212122, "grad_norm": 0.1020771786570549, "learning_rate": 0.000199157048120933, "loss": 0.109, "step": 2370 }, { "epoch": 0.15327676767676768, "grad_norm": 0.07340958714485168, "learning_rate": 0.00019915616182002095, "loss": 0.0953, "step": 2371 }, { "epoch": 0.15334141414141414, "grad_norm": 0.08042582124471664, "learning_rate": 0.00019915527505538826, "loss": 0.0953, "step": 2372 }, { "epoch": 0.1534060606060606, "grad_norm": 0.07423064112663269, "learning_rate": 0.00019915438782703917, "loss": 0.0932, "step": 2373 }, { "epoch": 0.15347070707070706, "grad_norm": 0.0813387781381607, "learning_rate": 0.0001991535001349778, "loss": 0.0773, "step": 2374 }, { "epoch": 0.15353535353535352, "grad_norm": 0.07985939085483551, "learning_rate": 0.00019915261197920824, "loss": 0.0816, "step": 2375 }, { "epoch": 0.1536, "grad_norm": 0.0904230922460556, "learning_rate": 0.00019915172335973472, "loss": 0.1005, "step": 2376 }, { "epoch": 0.15366464646464648, "grad_norm": 0.06688009947538376, "learning_rate": 0.00019915083427656137, "loss": 0.0858, "step": 2377 }, { "epoch": 0.15372929292929294, "grad_norm": 0.07655452936887741, "learning_rate": 0.00019914994472969232, "loss": 0.0943, "step": 2378 }, { "epoch": 0.1537939393939394, "grad_norm": 0.09423457086086273, "learning_rate": 0.0001991490547191318, "loss": 0.0997, "step": 2379 }, { "epoch": 0.15385858585858586, "grad_norm": 0.09763972461223602, "learning_rate": 0.0001991481642448839, "loss": 0.0953, "step": 2380 }, { "epoch": 0.15392323232323232, "grad_norm": 0.07487978041172028, "learning_rate": 0.00019914727330695283, "loss": 0.0888, "step": 2381 }, { "epoch": 0.15398787878787878, "grad_norm": 0.070538729429245, "learning_rate": 0.00019914638190534273, "loss": 0.0821, "step": 2382 }, { "epoch": 0.15405252525252525, "grad_norm": 0.07963468879461288, "learning_rate": 0.00019914549004005779, "loss": 0.0917, "step": 2383 }, { "epoch": 
0.1541171717171717, "grad_norm": 0.08227356523275375, "learning_rate": 0.00019914459771110213, "loss": 0.0907, "step": 2384 }, { "epoch": 0.1541171717171717, "eval_bleu": 16.018994930559277, "eval_loss": 0.09477431327104568, "eval_runtime": 2.6915, "eval_samples_per_second": 11.889, "eval_steps_per_second": 1.486, "step": 2384 }, { "epoch": 0.15418181818181817, "grad_norm": 0.1069755107164383, "learning_rate": 0.00019914370491848, "loss": 0.1135, "step": 2385 }, { "epoch": 0.15424646464646466, "grad_norm": 0.09064731001853943, "learning_rate": 0.00019914281166219552, "loss": 0.0945, "step": 2386 }, { "epoch": 0.15431111111111112, "grad_norm": 0.08304800093173981, "learning_rate": 0.0001991419179422529, "loss": 0.0817, "step": 2387 }, { "epoch": 0.15437575757575758, "grad_norm": 0.07548732310533524, "learning_rate": 0.00019914102375865632, "loss": 0.0804, "step": 2388 }, { "epoch": 0.15444040404040404, "grad_norm": 0.07957342267036438, "learning_rate": 0.00019914012911140993, "loss": 0.0901, "step": 2389 }, { "epoch": 0.1545050505050505, "grad_norm": 0.08715780824422836, "learning_rate": 0.00019913923400051792, "loss": 0.0964, "step": 2390 }, { "epoch": 0.15456969696969697, "grad_norm": 0.07704520225524902, "learning_rate": 0.00019913833842598452, "loss": 0.0948, "step": 2391 }, { "epoch": 0.15463434343434343, "grad_norm": 0.08232387900352478, "learning_rate": 0.00019913744238781386, "loss": 0.1048, "step": 2392 }, { "epoch": 0.1546989898989899, "grad_norm": 0.08324352651834488, "learning_rate": 0.00019913654588601015, "loss": 0.0921, "step": 2393 }, { "epoch": 0.15476363636363635, "grad_norm": 0.0733690932393074, "learning_rate": 0.00019913564892057762, "loss": 0.0804, "step": 2394 }, { "epoch": 0.15482828282828281, "grad_norm": 0.07485280185937881, "learning_rate": 0.0001991347514915204, "loss": 0.0862, "step": 2395 }, { "epoch": 0.1548929292929293, "grad_norm": 0.07740943878889084, "learning_rate": 0.00019913385359884272, "loss": 0.0848, "step": 2396 }, { 
"epoch": 0.15495757575757577, "grad_norm": 0.10380509495735168, "learning_rate": 0.0001991329552425488, "loss": 0.1076, "step": 2397 }, { "epoch": 0.15502222222222223, "grad_norm": 0.08265800774097443, "learning_rate": 0.00019913205642264282, "loss": 0.1025, "step": 2398 }, { "epoch": 0.1550868686868687, "grad_norm": 0.0772845521569252, "learning_rate": 0.00019913115713912895, "loss": 0.094, "step": 2399 }, { "epoch": 0.15515151515151515, "grad_norm": 0.07430519908666611, "learning_rate": 0.00019913025739201149, "loss": 0.0855, "step": 2400 }, { "epoch": 0.15515151515151515, "eval_bleu": 14.101293051777796, "eval_loss": 0.09552083909511566, "eval_runtime": 2.6825, "eval_samples_per_second": 11.929, "eval_steps_per_second": 1.491, "step": 2400 }, { "epoch": 0.1552161616161616, "grad_norm": 0.07156430929899216, "learning_rate": 0.00019912935718129453, "loss": 0.0879, "step": 2401 }, { "epoch": 0.15528080808080807, "grad_norm": 0.09990596026182175, "learning_rate": 0.00019912845650698238, "loss": 0.1339, "step": 2402 }, { "epoch": 0.15534545454545454, "grad_norm": 0.06521645933389664, "learning_rate": 0.00019912755536907918, "loss": 0.0693, "step": 2403 }, { "epoch": 0.155410101010101, "grad_norm": 0.0766676813364029, "learning_rate": 0.00019912665376758918, "loss": 0.0904, "step": 2404 }, { "epoch": 0.1554747474747475, "grad_norm": 0.0709398090839386, "learning_rate": 0.00019912575170251661, "loss": 0.0757, "step": 2405 }, { "epoch": 0.15553939393939395, "grad_norm": 0.07931448519229889, "learning_rate": 0.00019912484917386568, "loss": 0.105, "step": 2406 }, { "epoch": 0.1556040404040404, "grad_norm": 0.07410561293363571, "learning_rate": 0.00019912394618164058, "loss": 0.0835, "step": 2407 }, { "epoch": 0.15566868686868687, "grad_norm": 0.08293446898460388, "learning_rate": 0.00019912304272584554, "loss": 0.1038, "step": 2408 }, { "epoch": 0.15573333333333333, "grad_norm": 0.09968655556440353, "learning_rate": 0.00019912213880648482, "loss": 0.1311, "step": 2409 }, 
{ "epoch": 0.1557979797979798, "grad_norm": 0.09109925478696823, "learning_rate": 0.0001991212344235626, "loss": 0.1124, "step": 2410 }, { "epoch": 0.15586262626262626, "grad_norm": 0.09481918811798096, "learning_rate": 0.00019912032957708316, "loss": 0.0966, "step": 2411 }, { "epoch": 0.15592727272727272, "grad_norm": 0.08684804290533066, "learning_rate": 0.0001991194242670507, "loss": 0.1042, "step": 2412 }, { "epoch": 0.15599191919191918, "grad_norm": 0.08067158609628677, "learning_rate": 0.00019911851849346946, "loss": 0.1054, "step": 2413 }, { "epoch": 0.15605656565656564, "grad_norm": 0.08107931911945343, "learning_rate": 0.0001991176122563437, "loss": 0.1061, "step": 2414 }, { "epoch": 0.15612121212121213, "grad_norm": 0.07803688198328018, "learning_rate": 0.0001991167055556776, "loss": 0.1077, "step": 2415 }, { "epoch": 0.1561858585858586, "grad_norm": 0.08813579380512238, "learning_rate": 0.00019911579839147544, "loss": 0.1147, "step": 2416 }, { "epoch": 0.1561858585858586, "eval_bleu": 15.979361638317176, "eval_loss": 0.09620603919029236, "eval_runtime": 2.6589, "eval_samples_per_second": 12.035, "eval_steps_per_second": 1.504, "step": 2416 }, { "epoch": 0.15625050505050506, "grad_norm": 0.1072230115532875, "learning_rate": 0.00019911489076374149, "loss": 0.1056, "step": 2417 }, { "epoch": 0.15631515151515152, "grad_norm": 0.08445640653371811, "learning_rate": 0.00019911398267247993, "loss": 0.0989, "step": 2418 }, { "epoch": 0.15637979797979798, "grad_norm": 0.07640232890844345, "learning_rate": 0.00019911307411769503, "loss": 0.0924, "step": 2419 }, { "epoch": 0.15644444444444444, "grad_norm": 0.08556509763002396, "learning_rate": 0.00019911216509939107, "loss": 0.097, "step": 2420 }, { "epoch": 0.1565090909090909, "grad_norm": 0.08633379638195038, "learning_rate": 0.0001991112556175723, "loss": 0.0976, "step": 2421 }, { "epoch": 0.15657373737373736, "grad_norm": 0.08954688906669617, "learning_rate": 0.00019911034567224293, "loss": 0.0916, "step": 2422 
}, { "epoch": 0.15663838383838383, "grad_norm": 0.08552480489015579, "learning_rate": 0.0001991094352634072, "loss": 0.1142, "step": 2423 }, { "epoch": 0.15670303030303032, "grad_norm": 0.08744465559720993, "learning_rate": 0.00019910852439106947, "loss": 0.0995, "step": 2424 }, { "epoch": 0.15676767676767678, "grad_norm": 0.08486898243427277, "learning_rate": 0.00019910761305523393, "loss": 0.0908, "step": 2425 }, { "epoch": 0.15683232323232324, "grad_norm": 0.10686268657445908, "learning_rate": 0.00019910670125590483, "loss": 0.0879, "step": 2426 }, { "epoch": 0.1568969696969697, "grad_norm": 0.09920540452003479, "learning_rate": 0.00019910578899308643, "loss": 0.1157, "step": 2427 }, { "epoch": 0.15696161616161616, "grad_norm": 0.09235216677188873, "learning_rate": 0.00019910487626678303, "loss": 0.1015, "step": 2428 }, { "epoch": 0.15702626262626262, "grad_norm": 0.07505187392234802, "learning_rate": 0.0001991039630769989, "loss": 0.0914, "step": 2429 }, { "epoch": 0.1570909090909091, "grad_norm": 0.07139051705598831, "learning_rate": 0.0001991030494237383, "loss": 0.0777, "step": 2430 }, { "epoch": 0.15715555555555555, "grad_norm": 0.09772682189941406, "learning_rate": 0.0001991021353070055, "loss": 0.1183, "step": 2431 }, { "epoch": 0.157220202020202, "grad_norm": 0.08875758200883865, "learning_rate": 0.00019910122072680478, "loss": 0.1186, "step": 2432 }, { "epoch": 0.157220202020202, "eval_bleu": 16.458234427598295, "eval_loss": 0.09614560008049011, "eval_runtime": 2.7877, "eval_samples_per_second": 11.479, "eval_steps_per_second": 1.435, "step": 2432 }, { "epoch": 0.15728484848484847, "grad_norm": 0.09600205719470978, "learning_rate": 0.0001991003056831404, "loss": 0.1105, "step": 2433 }, { "epoch": 0.15734949494949496, "grad_norm": 0.09605379402637482, "learning_rate": 0.00019909939017601666, "loss": 0.1156, "step": 2434 }, { "epoch": 0.15741414141414142, "grad_norm": 0.07681706547737122, "learning_rate": 0.0001990984742054378, "loss": 0.0809, "step": 
2435 }, { "epoch": 0.15747878787878788, "grad_norm": 0.08103141188621521, "learning_rate": 0.00019909755777140818, "loss": 0.097, "step": 2436 }, { "epoch": 0.15754343434343435, "grad_norm": 0.0800071582198143, "learning_rate": 0.00019909664087393205, "loss": 0.1025, "step": 2437 }, { "epoch": 0.1576080808080808, "grad_norm": 0.09360525012016296, "learning_rate": 0.00019909572351301368, "loss": 0.0931, "step": 2438 }, { "epoch": 0.15767272727272727, "grad_norm": 0.08019673824310303, "learning_rate": 0.00019909480568865734, "loss": 0.1003, "step": 2439 }, { "epoch": 0.15773737373737373, "grad_norm": 0.08331821113824844, "learning_rate": 0.00019909388740086737, "loss": 0.0885, "step": 2440 }, { "epoch": 0.1578020202020202, "grad_norm": 0.08785795420408249, "learning_rate": 0.0001990929686496481, "loss": 0.1012, "step": 2441 }, { "epoch": 0.15786666666666666, "grad_norm": 0.07840538769960403, "learning_rate": 0.0001990920494350037, "loss": 0.0773, "step": 2442 }, { "epoch": 0.15793131313131314, "grad_norm": 0.07582083344459534, "learning_rate": 0.00019909112975693857, "loss": 0.081, "step": 2443 }, { "epoch": 0.1579959595959596, "grad_norm": 0.09787586331367493, "learning_rate": 0.00019909020961545698, "loss": 0.1002, "step": 2444 }, { "epoch": 0.15806060606060607, "grad_norm": 0.08296768367290497, "learning_rate": 0.00019908928901056326, "loss": 0.0928, "step": 2445 }, { "epoch": 0.15812525252525253, "grad_norm": 0.0977582335472107, "learning_rate": 0.00019908836794226172, "loss": 0.1044, "step": 2446 }, { "epoch": 0.158189898989899, "grad_norm": 0.09254743903875351, "learning_rate": 0.00019908744641055658, "loss": 0.0875, "step": 2447 }, { "epoch": 0.15825454545454545, "grad_norm": 0.07885557413101196, "learning_rate": 0.00019908652441545224, "loss": 0.0909, "step": 2448 }, { "epoch": 0.15825454545454545, "eval_bleu": 13.406405586738554, "eval_loss": 0.09485719352960587, "eval_runtime": 2.7328, "eval_samples_per_second": 11.71, "eval_steps_per_second": 1.464, 
"step": 2448 }, { "epoch": 0.15831919191919192, "grad_norm": 0.10246507078409195, "learning_rate": 0.00019908560195695295, "loss": 0.09, "step": 2449 }, { "epoch": 0.15838383838383838, "grad_norm": 0.07869229465723038, "learning_rate": 0.0001990846790350631, "loss": 0.0931, "step": 2450 }, { "epoch": 0.15844848484848484, "grad_norm": 0.08455470949411392, "learning_rate": 0.00019908375564978696, "loss": 0.0949, "step": 2451 }, { "epoch": 0.1585131313131313, "grad_norm": 0.07824108749628067, "learning_rate": 0.00019908283180112885, "loss": 0.0927, "step": 2452 }, { "epoch": 0.1585777777777778, "grad_norm": 0.09662675112485886, "learning_rate": 0.00019908190748909305, "loss": 0.0804, "step": 2453 }, { "epoch": 0.15864242424242425, "grad_norm": 0.08414657413959503, "learning_rate": 0.00019908098271368397, "loss": 0.1116, "step": 2454 }, { "epoch": 0.1587070707070707, "grad_norm": 0.07943543791770935, "learning_rate": 0.00019908005747490588, "loss": 0.1144, "step": 2455 }, { "epoch": 0.15877171717171717, "grad_norm": 0.07298710942268372, "learning_rate": 0.0001990791317727631, "loss": 0.0965, "step": 2456 }, { "epoch": 0.15883636363636364, "grad_norm": 0.07060336321592331, "learning_rate": 0.00019907820560725997, "loss": 0.084, "step": 2457 }, { "epoch": 0.1589010101010101, "grad_norm": 0.08114949613809586, "learning_rate": 0.00019907727897840085, "loss": 0.0974, "step": 2458 }, { "epoch": 0.15896565656565656, "grad_norm": 0.07823418825864792, "learning_rate": 0.00019907635188619004, "loss": 0.0944, "step": 2459 }, { "epoch": 0.15903030303030302, "grad_norm": 0.07994591444730759, "learning_rate": 0.0001990754243306319, "loss": 0.0918, "step": 2460 }, { "epoch": 0.15909494949494948, "grad_norm": 0.07660511881113052, "learning_rate": 0.00019907449631173074, "loss": 0.0852, "step": 2461 }, { "epoch": 0.15915959595959597, "grad_norm": 0.08481775969266891, "learning_rate": 0.00019907356782949092, "loss": 0.1019, "step": 2462 }, { "epoch": 0.15922424242424243, "grad_norm": 
0.08431138098239899, "learning_rate": 0.00019907263888391675, "loss": 0.0905, "step": 2463 }, { "epoch": 0.1592888888888889, "grad_norm": 0.08255553990602493, "learning_rate": 0.00019907170947501267, "loss": 0.0949, "step": 2464 }, { "epoch": 0.1592888888888889, "eval_bleu": 17.949476684371227, "eval_loss": 0.0979250818490982, "eval_runtime": 2.7504, "eval_samples_per_second": 11.634, "eval_steps_per_second": 1.454, "step": 2464 }, { "epoch": 0.15935353535353536, "grad_norm": 0.08562088012695312, "learning_rate": 0.0001990707796027829, "loss": 0.097, "step": 2465 }, { "epoch": 0.15941818181818182, "grad_norm": 0.09004244208335876, "learning_rate": 0.00019906984926723186, "loss": 0.0995, "step": 2466 }, { "epoch": 0.15948282828282828, "grad_norm": 0.08006538450717926, "learning_rate": 0.0001990689184683639, "loss": 0.0904, "step": 2467 }, { "epoch": 0.15954747474747474, "grad_norm": 0.08049822598695755, "learning_rate": 0.00019906798720618332, "loss": 0.0936, "step": 2468 }, { "epoch": 0.1596121212121212, "grad_norm": 0.0833514928817749, "learning_rate": 0.00019906705548069454, "loss": 0.0958, "step": 2469 }, { "epoch": 0.15967676767676767, "grad_norm": 0.07492423057556152, "learning_rate": 0.00019906612329190188, "loss": 0.0897, "step": 2470 }, { "epoch": 0.15974141414141413, "grad_norm": 0.06778661161661148, "learning_rate": 0.00019906519063980972, "loss": 0.0861, "step": 2471 }, { "epoch": 0.15980606060606062, "grad_norm": 0.08098343759775162, "learning_rate": 0.0001990642575244224, "loss": 0.1008, "step": 2472 }, { "epoch": 0.15987070707070708, "grad_norm": 0.07971610128879547, "learning_rate": 0.00019906332394574434, "loss": 0.1022, "step": 2473 }, { "epoch": 0.15993535353535354, "grad_norm": 0.08234554529190063, "learning_rate": 0.00019906238990377982, "loss": 0.1039, "step": 2474 }, { "epoch": 0.16, "grad_norm": 0.09035717695951462, "learning_rate": 0.00019906145539853328, "loss": 0.126, "step": 2475 }, { "epoch": 0.16006464646464647, "grad_norm": 
0.08886728435754776, "learning_rate": 0.00019906052043000907, "loss": 0.0941, "step": 2476 }, { "epoch": 0.16012929292929293, "grad_norm": 0.07969742268323898, "learning_rate": 0.00019905958499821154, "loss": 0.1, "step": 2477 }, { "epoch": 0.1601939393939394, "grad_norm": 0.08392511308193207, "learning_rate": 0.00019905864910314507, "loss": 0.1009, "step": 2478 }, { "epoch": 0.16025858585858585, "grad_norm": 0.06872080266475677, "learning_rate": 0.00019905771274481406, "loss": 0.0763, "step": 2479 }, { "epoch": 0.1603232323232323, "grad_norm": 0.08104293793439865, "learning_rate": 0.00019905677592322286, "loss": 0.0871, "step": 2480 }, { "epoch": 0.1603232323232323, "eval_bleu": 16.479710093346444, "eval_loss": 0.09787783771753311, "eval_runtime": 2.633, "eval_samples_per_second": 12.154, "eval_steps_per_second": 1.519, "step": 2480 }, { "epoch": 0.1603878787878788, "grad_norm": 0.08231018483638763, "learning_rate": 0.0001990558386383759, "loss": 0.0735, "step": 2481 }, { "epoch": 0.16045252525252526, "grad_norm": 0.07051029801368713, "learning_rate": 0.00019905490089027747, "loss": 0.091, "step": 2482 }, { "epoch": 0.16051717171717172, "grad_norm": 0.06911779940128326, "learning_rate": 0.00019905396267893205, "loss": 0.0788, "step": 2483 }, { "epoch": 0.1605818181818182, "grad_norm": 0.0746067687869072, "learning_rate": 0.000199053024004344, "loss": 0.0826, "step": 2484 }, { "epoch": 0.16064646464646465, "grad_norm": 0.09021712839603424, "learning_rate": 0.0001990520848665177, "loss": 0.1117, "step": 2485 }, { "epoch": 0.1607111111111111, "grad_norm": 0.08775275200605392, "learning_rate": 0.00019905114526545754, "loss": 0.0955, "step": 2486 }, { "epoch": 0.16077575757575757, "grad_norm": 0.06989617645740509, "learning_rate": 0.00019905020520116792, "loss": 0.074, "step": 2487 }, { "epoch": 0.16084040404040403, "grad_norm": 0.08801909536123276, "learning_rate": 0.00019904926467365324, "loss": 0.0985, "step": 2488 }, { "epoch": 0.1609050505050505, "grad_norm": 
0.0718747079372406, "learning_rate": 0.00019904832368291788, "loss": 0.0846, "step": 2489 }, { "epoch": 0.16096969696969696, "grad_norm": 0.06711117923259735, "learning_rate": 0.00019904738222896627, "loss": 0.0668, "step": 2490 }, { "epoch": 0.16103434343434345, "grad_norm": 0.0798947736620903, "learning_rate": 0.00019904644031180278, "loss": 0.0782, "step": 2491 }, { "epoch": 0.1610989898989899, "grad_norm": 0.09056241065263748, "learning_rate": 0.0001990454979314319, "loss": 0.1003, "step": 2492 }, { "epoch": 0.16116363636363637, "grad_norm": 0.08069320023059845, "learning_rate": 0.0001990445550878579, "loss": 0.0909, "step": 2493 }, { "epoch": 0.16122828282828283, "grad_norm": 0.07866625487804413, "learning_rate": 0.00019904361178108525, "loss": 0.0889, "step": 2494 }, { "epoch": 0.1612929292929293, "grad_norm": 0.08749701827764511, "learning_rate": 0.00019904266801111842, "loss": 0.082, "step": 2495 }, { "epoch": 0.16135757575757576, "grad_norm": 0.08497153967618942, "learning_rate": 0.00019904172377796173, "loss": 0.0883, "step": 2496 }, { "epoch": 0.16135757575757576, "eval_bleu": 13.421687733459187, "eval_loss": 0.09912532567977905, "eval_runtime": 2.9212, "eval_samples_per_second": 10.954, "eval_steps_per_second": 1.369, "step": 2496 }, { "epoch": 0.16142222222222222, "grad_norm": 0.13220928609371185, "learning_rate": 0.00019904077908161966, "loss": 0.1012, "step": 2497 }, { "epoch": 0.16148686868686868, "grad_norm": 0.08096387982368469, "learning_rate": 0.0001990398339220966, "loss": 0.0958, "step": 2498 }, { "epoch": 0.16155151515151514, "grad_norm": 0.07711213827133179, "learning_rate": 0.000199038888299397, "loss": 0.0914, "step": 2499 }, { "epoch": 0.16161616161616163, "grad_norm": 0.07851159572601318, "learning_rate": 0.00019903794221352524, "loss": 0.088, "step": 2500 }, { "epoch": 0.1616808080808081, "grad_norm": 0.07726718485355377, "learning_rate": 0.0001990369956644858, "loss": 0.0838, "step": 2501 }, { "epoch": 0.16174545454545455, "grad_norm": 
0.09092199057340622, "learning_rate": 0.00019903604865228306, "loss": 0.1121, "step": 2502 }, { "epoch": 0.16181010101010102, "grad_norm": 0.07309889793395996, "learning_rate": 0.0001990351011769214, "loss": 0.0871, "step": 2503 }, { "epoch": 0.16187474747474748, "grad_norm": 0.1902703493833542, "learning_rate": 0.00019903415323840537, "loss": 0.0854, "step": 2504 }, { "epoch": 0.16193939393939394, "grad_norm": 0.09022161364555359, "learning_rate": 0.00019903320483673936, "loss": 0.0939, "step": 2505 }, { "epoch": 0.1620040404040404, "grad_norm": 0.07873562723398209, "learning_rate": 0.00019903225597192776, "loss": 0.0979, "step": 2506 }, { "epoch": 0.16206868686868686, "grad_norm": 0.07803434878587723, "learning_rate": 0.00019903130664397504, "loss": 0.0961, "step": 2507 }, { "epoch": 0.16213333333333332, "grad_norm": 0.07549719512462616, "learning_rate": 0.00019903035685288564, "loss": 0.0813, "step": 2508 }, { "epoch": 0.16219797979797979, "grad_norm": 0.07908180356025696, "learning_rate": 0.000199029406598664, "loss": 0.1072, "step": 2509 }, { "epoch": 0.16226262626262627, "grad_norm": 0.0819828063249588, "learning_rate": 0.00019902845588131457, "loss": 0.0901, "step": 2510 }, { "epoch": 0.16232727272727274, "grad_norm": 0.10002878308296204, "learning_rate": 0.0001990275047008418, "loss": 0.1271, "step": 2511 }, { "epoch": 0.1623919191919192, "grad_norm": 0.09526874125003815, "learning_rate": 0.0001990265530572501, "loss": 0.1011, "step": 2512 }, { "epoch": 0.1623919191919192, "eval_bleu": 14.323994324456736, "eval_loss": 0.09852258861064911, "eval_runtime": 2.6961, "eval_samples_per_second": 11.869, "eval_steps_per_second": 1.484, "step": 2512 }, { "epoch": 0.16245656565656566, "grad_norm": 0.084198959171772, "learning_rate": 0.00019902560095054397, "loss": 0.0908, "step": 2513 }, { "epoch": 0.16252121212121212, "grad_norm": 0.0899416133761406, "learning_rate": 0.00019902464838072784, "loss": 0.1136, "step": 2514 }, { "epoch": 0.16258585858585858, "grad_norm": 
0.08481559157371521, "learning_rate": 0.00019902369534780616, "loss": 0.0902, "step": 2515 }, { "epoch": 0.16265050505050505, "grad_norm": 0.07603602856397629, "learning_rate": 0.00019902274185178338, "loss": 0.0795, "step": 2516 }, { "epoch": 0.1627151515151515, "grad_norm": 0.0834091529250145, "learning_rate": 0.000199021787892664, "loss": 0.092, "step": 2517 }, { "epoch": 0.16277979797979797, "grad_norm": 0.10004030913114548, "learning_rate": 0.00019902083347045243, "loss": 0.0981, "step": 2518 }, { "epoch": 0.16284444444444446, "grad_norm": 0.08966339379549026, "learning_rate": 0.00019901987858515318, "loss": 0.0785, "step": 2519 }, { "epoch": 0.16290909090909092, "grad_norm": 0.10092730820178986, "learning_rate": 0.0001990189232367707, "loss": 0.1036, "step": 2520 }, { "epoch": 0.16297373737373738, "grad_norm": 0.09682455658912659, "learning_rate": 0.0001990179674253094, "loss": 0.086, "step": 2521 }, { "epoch": 0.16303838383838384, "grad_norm": 0.07699234038591385, "learning_rate": 0.00019901701115077386, "loss": 0.0901, "step": 2522 }, { "epoch": 0.1631030303030303, "grad_norm": 0.07940739393234253, "learning_rate": 0.00019901605441316846, "loss": 0.0905, "step": 2523 }, { "epoch": 0.16316767676767677, "grad_norm": 0.08646399527788162, "learning_rate": 0.0001990150972124977, "loss": 0.0968, "step": 2524 }, { "epoch": 0.16323232323232323, "grad_norm": 0.08938555419445038, "learning_rate": 0.00019901413954876609, "loss": 0.0707, "step": 2525 }, { "epoch": 0.1632969696969697, "grad_norm": 0.08455876260995865, "learning_rate": 0.00019901318142197808, "loss": 0.092, "step": 2526 }, { "epoch": 0.16336161616161615, "grad_norm": 0.10977445542812347, "learning_rate": 0.00019901222283213814, "loss": 0.0902, "step": 2527 }, { "epoch": 0.16342626262626261, "grad_norm": 0.06731515377759933, "learning_rate": 0.00019901126377925077, "loss": 0.0754, "step": 2528 }, { "epoch": 0.16342626262626261, "eval_bleu": 16.363837430237744, "eval_loss": 0.10056555271148682, 
"eval_runtime": 2.7592, "eval_samples_per_second": 11.598, "eval_steps_per_second": 1.45, "step": 2528 }, { "epoch": 0.1634909090909091, "grad_norm": 0.08187739551067352, "learning_rate": 0.00019901030426332046, "loss": 0.1005, "step": 2529 }, { "epoch": 0.16355555555555557, "grad_norm": 0.06836195290088654, "learning_rate": 0.0001990093442843517, "loss": 0.0772, "step": 2530 }, { "epoch": 0.16362020202020203, "grad_norm": 0.0767069086432457, "learning_rate": 0.0001990083838423489, "loss": 0.0891, "step": 2531 }, { "epoch": 0.1636848484848485, "grad_norm": 0.08812432736158371, "learning_rate": 0.00019900742293731672, "loss": 0.0919, "step": 2532 }, { "epoch": 0.16374949494949495, "grad_norm": 0.0799088403582573, "learning_rate": 0.0001990064615692595, "loss": 0.087, "step": 2533 }, { "epoch": 0.1638141414141414, "grad_norm": 0.08472167700529099, "learning_rate": 0.00019900549973818178, "loss": 0.0977, "step": 2534 }, { "epoch": 0.16387878787878787, "grad_norm": 0.07687152177095413, "learning_rate": 0.0001990045374440881, "loss": 0.0895, "step": 2535 }, { "epoch": 0.16394343434343434, "grad_norm": 0.08221257477998734, "learning_rate": 0.0001990035746869829, "loss": 0.0903, "step": 2536 }, { "epoch": 0.1640080808080808, "grad_norm": 0.07737286388874054, "learning_rate": 0.00019900261146687072, "loss": 0.0935, "step": 2537 }, { "epoch": 0.16407272727272726, "grad_norm": 0.08981961756944656, "learning_rate": 0.00019900164778375608, "loss": 0.1196, "step": 2538 }, { "epoch": 0.16413737373737375, "grad_norm": 0.0863732099533081, "learning_rate": 0.00019900068363764344, "loss": 0.0993, "step": 2539 }, { "epoch": 0.1642020202020202, "grad_norm": 0.08438657224178314, "learning_rate": 0.00019899971902853734, "loss": 0.1012, "step": 2540 }, { "epoch": 0.16426666666666667, "grad_norm": 0.07388683408498764, "learning_rate": 0.00019899875395644226, "loss": 0.0796, "step": 2541 }, { "epoch": 0.16433131313131313, "grad_norm": 0.07076214253902435, "learning_rate": 
0.00019899778842136278, "loss": 0.0699, "step": 2542 }, { "epoch": 0.1643959595959596, "grad_norm": 0.08369190990924835, "learning_rate": 0.00019899682242330332, "loss": 0.1134, "step": 2543 }, { "epoch": 0.16446060606060606, "grad_norm": 0.08069916814565659, "learning_rate": 0.00019899585596226847, "loss": 0.0893, "step": 2544 }, { "epoch": 0.16446060606060606, "eval_bleu": 14.397395841843759, "eval_loss": 0.09635031223297119, "eval_runtime": 2.666, "eval_samples_per_second": 12.003, "eval_steps_per_second": 1.5, "step": 2544 }, { "epoch": 0.16452525252525252, "grad_norm": 0.09031973779201508, "learning_rate": 0.00019899488903826274, "loss": 0.0901, "step": 2545 }, { "epoch": 0.16458989898989898, "grad_norm": 0.08417774736881256, "learning_rate": 0.00019899392165129064, "loss": 0.0898, "step": 2546 }, { "epoch": 0.16465454545454544, "grad_norm": 0.09317639470100403, "learning_rate": 0.00019899295380135668, "loss": 0.1036, "step": 2547 }, { "epoch": 0.16471919191919193, "grad_norm": 0.0843297690153122, "learning_rate": 0.00019899198548846542, "loss": 0.1095, "step": 2548 }, { "epoch": 0.1647838383838384, "grad_norm": 0.0838332399725914, "learning_rate": 0.00019899101671262136, "loss": 0.087, "step": 2549 }, { "epoch": 0.16484848484848486, "grad_norm": 0.08582288771867752, "learning_rate": 0.00019899004747382906, "loss": 0.1032, "step": 2550 }, { "epoch": 0.16491313131313132, "grad_norm": 0.1154182031750679, "learning_rate": 0.000198989077772093, "loss": 0.1073, "step": 2551 }, { "epoch": 0.16497777777777778, "grad_norm": 0.08523426949977875, "learning_rate": 0.00019898810760741778, "loss": 0.1185, "step": 2552 }, { "epoch": 0.16504242424242424, "grad_norm": 0.0796506404876709, "learning_rate": 0.0001989871369798079, "loss": 0.1016, "step": 2553 }, { "epoch": 0.1651070707070707, "grad_norm": 0.07208889722824097, "learning_rate": 0.00019898616588926785, "loss": 0.1019, "step": 2554 }, { "epoch": 0.16517171717171716, "grad_norm": 0.07410484552383423, "learning_rate": 
0.0001989851943358023, "loss": 0.0839, "step": 2555 }, { "epoch": 0.16523636363636363, "grad_norm": 0.09314025938510895, "learning_rate": 0.0001989842223194157, "loss": 0.1069, "step": 2556 }, { "epoch": 0.1653010101010101, "grad_norm": 0.07474405318498611, "learning_rate": 0.0001989832498401126, "loss": 0.105, "step": 2557 }, { "epoch": 0.16536565656565658, "grad_norm": 0.07362939417362213, "learning_rate": 0.00019898227689789754, "loss": 0.0819, "step": 2558 }, { "epoch": 0.16543030303030304, "grad_norm": 0.07677435129880905, "learning_rate": 0.00019898130349277514, "loss": 0.0991, "step": 2559 }, { "epoch": 0.1654949494949495, "grad_norm": 0.08707185089588165, "learning_rate": 0.00019898032962474988, "loss": 0.0934, "step": 2560 }, { "epoch": 0.1654949494949495, "eval_bleu": 15.90855580314918, "eval_loss": 0.09577429294586182, "eval_runtime": 2.7514, "eval_samples_per_second": 11.631, "eval_steps_per_second": 1.454, "step": 2560 }, { "epoch": 0.16555959595959596, "grad_norm": 0.06546158343553543, "learning_rate": 0.00019897935529382638, "loss": 0.0666, "step": 2561 }, { "epoch": 0.16562424242424242, "grad_norm": 0.0820000022649765, "learning_rate": 0.00019897838050000912, "loss": 0.0931, "step": 2562 }, { "epoch": 0.16568888888888889, "grad_norm": 0.07797069102525711, "learning_rate": 0.0001989774052433027, "loss": 0.0736, "step": 2563 }, { "epoch": 0.16575353535353535, "grad_norm": 0.07704420387744904, "learning_rate": 0.00019897642952371167, "loss": 0.0954, "step": 2564 }, { "epoch": 0.1658181818181818, "grad_norm": 0.08196604251861572, "learning_rate": 0.0001989754533412406, "loss": 0.0856, "step": 2565 }, { "epoch": 0.16588282828282827, "grad_norm": 0.08397327363491058, "learning_rate": 0.00019897447669589409, "loss": 0.0977, "step": 2566 }, { "epoch": 0.16594747474747476, "grad_norm": 0.07992056012153625, "learning_rate": 0.00019897349958767664, "loss": 0.0955, "step": 2567 }, { "epoch": 0.16601212121212122, "grad_norm": 0.10287249088287354, 
"learning_rate": 0.00019897252201659285, "loss": 0.1129, "step": 2568 }, { "epoch": 0.16607676767676768, "grad_norm": 0.08574705570936203, "learning_rate": 0.0001989715439826473, "loss": 0.0883, "step": 2569 }, { "epoch": 0.16614141414141415, "grad_norm": 0.0904935672879219, "learning_rate": 0.00019897056548584457, "loss": 0.0906, "step": 2570 }, { "epoch": 0.1662060606060606, "grad_norm": 0.08727099746465683, "learning_rate": 0.00019896958652618923, "loss": 0.0972, "step": 2571 }, { "epoch": 0.16627070707070707, "grad_norm": 0.0833154246211052, "learning_rate": 0.00019896860710368584, "loss": 0.0936, "step": 2572 }, { "epoch": 0.16633535353535353, "grad_norm": 0.06849158555269241, "learning_rate": 0.000198967627218339, "loss": 0.0843, "step": 2573 }, { "epoch": 0.1664, "grad_norm": 0.09881257265806198, "learning_rate": 0.00019896664687015327, "loss": 0.0923, "step": 2574 }, { "epoch": 0.16646464646464645, "grad_norm": 0.08209142833948135, "learning_rate": 0.00019896566605913325, "loss": 0.0936, "step": 2575 }, { "epoch": 0.16652929292929292, "grad_norm": 0.0810588151216507, "learning_rate": 0.00019896468478528356, "loss": 0.0929, "step": 2576 }, { "epoch": 0.16652929292929292, "eval_bleu": 12.835440513750095, "eval_loss": 0.09694577753543854, "eval_runtime": 2.7036, "eval_samples_per_second": 11.836, "eval_steps_per_second": 1.48, "step": 2576 }, { "epoch": 0.1665939393939394, "grad_norm": 0.07667204737663269, "learning_rate": 0.00019896370304860872, "loss": 0.0954, "step": 2577 }, { "epoch": 0.16665858585858587, "grad_norm": 0.06912059336900711, "learning_rate": 0.00019896272084911338, "loss": 0.0715, "step": 2578 }, { "epoch": 0.16672323232323233, "grad_norm": 0.06380753964185715, "learning_rate": 0.00019896173818680207, "loss": 0.0746, "step": 2579 }, { "epoch": 0.1667878787878788, "grad_norm": 0.0782894492149353, "learning_rate": 0.00019896075506167947, "loss": 0.0995, "step": 2580 }, { "epoch": 0.16685252525252525, "grad_norm": 0.0894986093044281, 
"learning_rate": 0.00019895977147375013, "loss": 0.1154, "step": 2581 }, { "epoch": 0.16691717171717171, "grad_norm": 0.0772000104188919, "learning_rate": 0.00019895878742301864, "loss": 0.093, "step": 2582 }, { "epoch": 0.16698181818181818, "grad_norm": 0.07464070618152618, "learning_rate": 0.0001989578029094896, "loss": 0.0702, "step": 2583 }, { "epoch": 0.16704646464646464, "grad_norm": 0.07156574726104736, "learning_rate": 0.0001989568179331677, "loss": 0.0781, "step": 2584 }, { "epoch": 0.1671111111111111, "grad_norm": 0.07308657467365265, "learning_rate": 0.00019895583249405742, "loss": 0.0728, "step": 2585 }, { "epoch": 0.1671757575757576, "grad_norm": 0.06771235167980194, "learning_rate": 0.00019895484659216344, "loss": 0.0923, "step": 2586 }, { "epoch": 0.16724040404040405, "grad_norm": 0.12589362263679504, "learning_rate": 0.00019895386022749035, "loss": 0.0999, "step": 2587 }, { "epoch": 0.1673050505050505, "grad_norm": 0.07527434825897217, "learning_rate": 0.00019895287340004276, "loss": 0.0884, "step": 2588 }, { "epoch": 0.16736969696969697, "grad_norm": 0.09707389026880264, "learning_rate": 0.00019895188610982533, "loss": 0.1048, "step": 2589 }, { "epoch": 0.16743434343434344, "grad_norm": 0.07965560257434845, "learning_rate": 0.0001989508983568426, "loss": 0.0828, "step": 2590 }, { "epoch": 0.1674989898989899, "grad_norm": 0.0849994346499443, "learning_rate": 0.00019894991014109925, "loss": 0.1028, "step": 2591 }, { "epoch": 0.16756363636363636, "grad_norm": 0.061546456068754196, "learning_rate": 0.00019894892146259986, "loss": 0.0572, "step": 2592 }, { "epoch": 0.16756363636363636, "eval_bleu": 11.527252105818075, "eval_loss": 0.09925778955221176, "eval_runtime": 2.7989, "eval_samples_per_second": 11.433, "eval_steps_per_second": 1.429, "step": 2592 }, { "epoch": 0.16762828282828282, "grad_norm": 0.12852871417999268, "learning_rate": 0.00019894793232134913, "loss": 0.1222, "step": 2593 }, { "epoch": 0.16769292929292928, "grad_norm": 
0.07851792871952057, "learning_rate": 0.00019894694271735159, "loss": 0.0937, "step": 2594 }, { "epoch": 0.16775757575757574, "grad_norm": 0.08756095916032791, "learning_rate": 0.00019894595265061192, "loss": 0.0778, "step": 2595 }, { "epoch": 0.16782222222222223, "grad_norm": 0.07558748126029968, "learning_rate": 0.00019894496212113474, "loss": 0.0981, "step": 2596 }, { "epoch": 0.1678868686868687, "grad_norm": 0.07501440495252609, "learning_rate": 0.00019894397112892465, "loss": 0.0964, "step": 2597 }, { "epoch": 0.16795151515151516, "grad_norm": 0.13666491210460663, "learning_rate": 0.00019894297967398638, "loss": 0.1389, "step": 2598 }, { "epoch": 0.16801616161616162, "grad_norm": 0.07506051659584045, "learning_rate": 0.00019894198775632446, "loss": 0.0923, "step": 2599 }, { "epoch": 0.16808080808080808, "grad_norm": 0.07255323231220245, "learning_rate": 0.0001989409953759436, "loss": 0.0832, "step": 2600 }, { "epoch": 0.16814545454545454, "grad_norm": 0.07257086783647537, "learning_rate": 0.0001989400025328484, "loss": 0.0854, "step": 2601 }, { "epoch": 0.168210101010101, "grad_norm": 0.0797615498304367, "learning_rate": 0.00019893900922704353, "loss": 0.0986, "step": 2602 }, { "epoch": 0.16827474747474747, "grad_norm": 0.08850092440843582, "learning_rate": 0.00019893801545853358, "loss": 0.1085, "step": 2603 }, { "epoch": 0.16833939393939393, "grad_norm": 0.09193852543830872, "learning_rate": 0.0001989370212273233, "loss": 0.0788, "step": 2604 }, { "epoch": 0.16840404040404042, "grad_norm": 0.09792643040418625, "learning_rate": 0.00019893602653341726, "loss": 0.1048, "step": 2605 }, { "epoch": 0.16846868686868688, "grad_norm": 0.08455830812454224, "learning_rate": 0.0001989350313768201, "loss": 0.1017, "step": 2606 }, { "epoch": 0.16853333333333334, "grad_norm": 0.07427718490362167, "learning_rate": 0.00019893403575753653, "loss": 0.0883, "step": 2607 }, { "epoch": 0.1685979797979798, "grad_norm": 0.07758526504039764, "learning_rate": 0.00019893303967557117, 
"loss": 0.1015, "step": 2608 }, { "epoch": 0.1685979797979798, "eval_bleu": 13.913496869616926, "eval_loss": 0.09879723936319351, "eval_runtime": 2.7196, "eval_samples_per_second": 11.766, "eval_steps_per_second": 1.471, "step": 2608 }, { "epoch": 0.16866262626262626, "grad_norm": 0.06988576054573059, "learning_rate": 0.0001989320431309287, "loss": 0.082, "step": 2609 }, { "epoch": 0.16872727272727273, "grad_norm": 0.07797277718782425, "learning_rate": 0.00019893104612361378, "loss": 0.0915, "step": 2610 }, { "epoch": 0.1687919191919192, "grad_norm": 0.07000397890806198, "learning_rate": 0.00019893004865363106, "loss": 0.0882, "step": 2611 }, { "epoch": 0.16885656565656565, "grad_norm": 0.07160747051239014, "learning_rate": 0.0001989290507209852, "loss": 0.0839, "step": 2612 }, { "epoch": 0.1689212121212121, "grad_norm": 0.08234173059463501, "learning_rate": 0.00019892805232568086, "loss": 0.096, "step": 2613 }, { "epoch": 0.16898585858585857, "grad_norm": 0.07149218767881393, "learning_rate": 0.00019892705346772274, "loss": 0.0919, "step": 2614 }, { "epoch": 0.16905050505050506, "grad_norm": 0.08471017330884933, "learning_rate": 0.0001989260541471155, "loss": 0.0955, "step": 2615 }, { "epoch": 0.16911515151515152, "grad_norm": 0.08480434864759445, "learning_rate": 0.0001989250543638638, "loss": 0.0818, "step": 2616 }, { "epoch": 0.16917979797979799, "grad_norm": 0.07190948724746704, "learning_rate": 0.00019892405411797232, "loss": 0.0702, "step": 2617 }, { "epoch": 0.16924444444444445, "grad_norm": 0.07594552636146545, "learning_rate": 0.00019892305340944578, "loss": 0.0875, "step": 2618 }, { "epoch": 0.1693090909090909, "grad_norm": 0.07243770360946655, "learning_rate": 0.00019892205223828876, "loss": 0.0922, "step": 2619 }, { "epoch": 0.16937373737373737, "grad_norm": 0.0926441103219986, "learning_rate": 0.00019892105060450606, "loss": 0.1028, "step": 2620 }, { "epoch": 0.16943838383838383, "grad_norm": 0.08102330565452576, "learning_rate": 
0.0001989200485081023, "loss": 0.1076, "step": 2621 }, { "epoch": 0.1695030303030303, "grad_norm": 0.08745792508125305, "learning_rate": 0.00019891904594908212, "loss": 0.1199, "step": 2622 }, { "epoch": 0.16956767676767676, "grad_norm": 0.0709044560790062, "learning_rate": 0.00019891804292745033, "loss": 0.0765, "step": 2623 }, { "epoch": 0.16963232323232325, "grad_norm": 0.07396527379751205, "learning_rate": 0.00019891703944321153, "loss": 0.1011, "step": 2624 }, { "epoch": 0.16963232323232325, "eval_bleu": 11.959962870075357, "eval_loss": 0.09875558316707611, "eval_runtime": 2.9192, "eval_samples_per_second": 10.962, "eval_steps_per_second": 1.37, "step": 2624 }, { "epoch": 0.1696969696969697, "grad_norm": 0.09100839495658875, "learning_rate": 0.00019891603549637043, "loss": 0.1102, "step": 2625 }, { "epoch": 0.16976161616161617, "grad_norm": 0.07390713691711426, "learning_rate": 0.00019891503108693175, "loss": 0.0983, "step": 2626 }, { "epoch": 0.16982626262626263, "grad_norm": 0.0841887816786766, "learning_rate": 0.00019891402621490016, "loss": 0.1207, "step": 2627 }, { "epoch": 0.1698909090909091, "grad_norm": 0.0684683546423912, "learning_rate": 0.00019891302088028039, "loss": 0.0804, "step": 2628 }, { "epoch": 0.16995555555555555, "grad_norm": 0.07695214450359344, "learning_rate": 0.00019891201508307708, "loss": 0.0961, "step": 2629 }, { "epoch": 0.17002020202020202, "grad_norm": 0.1141047403216362, "learning_rate": 0.000198911008823295, "loss": 0.1129, "step": 2630 }, { "epoch": 0.17008484848484848, "grad_norm": 0.08545970916748047, "learning_rate": 0.00019891000210093887, "loss": 0.1025, "step": 2631 }, { "epoch": 0.17014949494949494, "grad_norm": 0.08067265152931213, "learning_rate": 0.00019890899491601332, "loss": 0.0873, "step": 2632 }, { "epoch": 0.1702141414141414, "grad_norm": 0.06937942653894424, "learning_rate": 0.00019890798726852308, "loss": 0.0813, "step": 2633 }, { "epoch": 0.1702787878787879, "grad_norm": 0.07903715968132019, "learning_rate": 
0.00019890697915847289, "loss": 0.0909, "step": 2634 }, { "epoch": 0.17034343434343435, "grad_norm": 0.08862186223268509, "learning_rate": 0.0001989059705858675, "loss": 0.1001, "step": 2635 }, { "epoch": 0.17040808080808081, "grad_norm": 0.07973022758960724, "learning_rate": 0.00019890496155071152, "loss": 0.0891, "step": 2636 }, { "epoch": 0.17047272727272728, "grad_norm": 0.077756866812706, "learning_rate": 0.00019890395205300978, "loss": 0.0941, "step": 2637 }, { "epoch": 0.17053737373737374, "grad_norm": 0.09999898076057434, "learning_rate": 0.00019890294209276693, "loss": 0.1058, "step": 2638 }, { "epoch": 0.1706020202020202, "grad_norm": 0.08280091732740402, "learning_rate": 0.0001989019316699877, "loss": 0.0991, "step": 2639 }, { "epoch": 0.17066666666666666, "grad_norm": 0.0946156457066536, "learning_rate": 0.00019890092078467687, "loss": 0.1022, "step": 2640 }, { "epoch": 0.17066666666666666, "eval_bleu": 13.899835500734325, "eval_loss": 0.09693938493728638, "eval_runtime": 2.6787, "eval_samples_per_second": 11.946, "eval_steps_per_second": 1.493, "step": 2640 }, { "epoch": 0.17073131313131312, "grad_norm": 0.07308728992938995, "learning_rate": 0.00019889990943683912, "loss": 0.0867, "step": 2641 }, { "epoch": 0.17079595959595958, "grad_norm": 0.08340651541948318, "learning_rate": 0.00019889889762647917, "loss": 0.0973, "step": 2642 }, { "epoch": 0.17086060606060607, "grad_norm": 0.06602702289819717, "learning_rate": 0.00019889788535360178, "loss": 0.08, "step": 2643 }, { "epoch": 0.17092525252525254, "grad_norm": 0.08125805854797363, "learning_rate": 0.0001988968726182117, "loss": 0.0898, "step": 2644 }, { "epoch": 0.170989898989899, "grad_norm": 0.08202677220106125, "learning_rate": 0.00019889585942031363, "loss": 0.1065, "step": 2645 }, { "epoch": 0.17105454545454546, "grad_norm": 0.06917142122983932, "learning_rate": 0.0001988948457599123, "loss": 0.0808, "step": 2646 }, { "epoch": 0.17111919191919192, "grad_norm": 0.08310998231172562, 
"learning_rate": 0.0001988938316370125, "loss": 0.0974, "step": 2647 }, { "epoch": 0.17118383838383838, "grad_norm": 0.08297887444496155, "learning_rate": 0.0001988928170516189, "loss": 0.113, "step": 2648 }, { "epoch": 0.17124848484848484, "grad_norm": 0.08804168552160263, "learning_rate": 0.00019889180200373632, "loss": 0.0992, "step": 2649 }, { "epoch": 0.1713131313131313, "grad_norm": 0.1213785782456398, "learning_rate": 0.00019889078649336947, "loss": 0.0911, "step": 2650 }, { "epoch": 0.17137777777777777, "grad_norm": 0.08714232593774796, "learning_rate": 0.0001988897705205231, "loss": 0.0938, "step": 2651 }, { "epoch": 0.17144242424242423, "grad_norm": 0.0836285948753357, "learning_rate": 0.00019888875408520198, "loss": 0.0916, "step": 2652 }, { "epoch": 0.17150707070707072, "grad_norm": 0.08034289628267288, "learning_rate": 0.0001988877371874108, "loss": 0.0957, "step": 2653 }, { "epoch": 0.17157171717171718, "grad_norm": 0.12397786229848862, "learning_rate": 0.0001988867198271544, "loss": 0.1057, "step": 2654 }, { "epoch": 0.17163636363636364, "grad_norm": 0.07458718866109848, "learning_rate": 0.00019888570200443754, "loss": 0.0928, "step": 2655 }, { "epoch": 0.1717010101010101, "grad_norm": 0.07918529212474823, "learning_rate": 0.0001988846837192649, "loss": 0.0927, "step": 2656 }, { "epoch": 0.1717010101010101, "eval_bleu": 12.301804817046529, "eval_loss": 0.09886422753334045, "eval_runtime": 2.7667, "eval_samples_per_second": 11.566, "eval_steps_per_second": 1.446, "step": 2656 }, { "epoch": 0.17176565656565657, "grad_norm": 0.0783635675907135, "learning_rate": 0.00019888366497164127, "loss": 0.0862, "step": 2657 }, { "epoch": 0.17183030303030303, "grad_norm": 0.07257837057113647, "learning_rate": 0.00019888264576157147, "loss": 0.0906, "step": 2658 }, { "epoch": 0.1718949494949495, "grad_norm": 0.08375464379787445, "learning_rate": 0.0001988816260890602, "loss": 0.0955, "step": 2659 }, { "epoch": 0.17195959595959595, "grad_norm": 0.07692074030637741, 
"learning_rate": 0.00019888060595411227, "loss": 0.087, "step": 2660 }, { "epoch": 0.1720242424242424, "grad_norm": 0.08843611925840378, "learning_rate": 0.0001988795853567324, "loss": 0.0972, "step": 2661 }, { "epoch": 0.1720888888888889, "grad_norm": 0.10938207805156708, "learning_rate": 0.00019887856429692545, "loss": 0.1112, "step": 2662 }, { "epoch": 0.17215353535353536, "grad_norm": 0.08177085220813751, "learning_rate": 0.0001988775427746961, "loss": 0.0925, "step": 2663 }, { "epoch": 0.17221818181818183, "grad_norm": 0.08051607757806778, "learning_rate": 0.0001988765207900492, "loss": 0.0894, "step": 2664 }, { "epoch": 0.1722828282828283, "grad_norm": 0.0673099160194397, "learning_rate": 0.00019887549834298948, "loss": 0.0808, "step": 2665 }, { "epoch": 0.17234747474747475, "grad_norm": 0.07069753855466843, "learning_rate": 0.00019887447543352177, "loss": 0.0828, "step": 2666 }, { "epoch": 0.1724121212121212, "grad_norm": 0.0805334821343422, "learning_rate": 0.0001988734520616508, "loss": 0.1047, "step": 2667 }, { "epoch": 0.17247676767676767, "grad_norm": 0.0879552885890007, "learning_rate": 0.00019887242822738137, "loss": 0.0935, "step": 2668 }, { "epoch": 0.17254141414141413, "grad_norm": 0.07935614138841629, "learning_rate": 0.00019887140393071831, "loss": 0.0922, "step": 2669 }, { "epoch": 0.1726060606060606, "grad_norm": 0.09668657183647156, "learning_rate": 0.00019887037917166637, "loss": 0.1058, "step": 2670 }, { "epoch": 0.17267070707070706, "grad_norm": 0.07810390740633011, "learning_rate": 0.00019886935395023035, "loss": 0.0972, "step": 2671 }, { "epoch": 0.17273535353535355, "grad_norm": 0.16058142483234406, "learning_rate": 0.00019886832826641505, "loss": 0.1046, "step": 2672 }, { "epoch": 0.17273535353535355, "eval_bleu": 12.522160034978606, "eval_loss": 0.09890874475240707, "eval_runtime": 2.6074, "eval_samples_per_second": 12.273, "eval_steps_per_second": 1.534, "step": 2672 }, { "epoch": 0.1728, "grad_norm": 0.10017416626214981, 
"learning_rate": 0.00019886730212022527, "loss": 0.0864, "step": 2673 }, { "epoch": 0.17286464646464647, "grad_norm": 0.0689869299530983, "learning_rate": 0.0001988662755116658, "loss": 0.0846, "step": 2674 }, { "epoch": 0.17292929292929293, "grad_norm": 0.06987910717725754, "learning_rate": 0.00019886524844074142, "loss": 0.0834, "step": 2675 }, { "epoch": 0.1729939393939394, "grad_norm": 0.08495312929153442, "learning_rate": 0.00019886422090745697, "loss": 0.0997, "step": 2676 }, { "epoch": 0.17305858585858586, "grad_norm": 0.06730899959802628, "learning_rate": 0.00019886319291181724, "loss": 0.0752, "step": 2677 }, { "epoch": 0.17312323232323232, "grad_norm": 0.0818965882062912, "learning_rate": 0.00019886216445382706, "loss": 0.0982, "step": 2678 }, { "epoch": 0.17318787878787878, "grad_norm": 0.06921021640300751, "learning_rate": 0.00019886113553349121, "loss": 0.0815, "step": 2679 }, { "epoch": 0.17325252525252524, "grad_norm": 0.07900305837392807, "learning_rate": 0.00019886010615081451, "loss": 0.0872, "step": 2680 }, { "epoch": 0.17331717171717173, "grad_norm": 0.06973530352115631, "learning_rate": 0.00019885907630580178, "loss": 0.0825, "step": 2681 }, { "epoch": 0.1733818181818182, "grad_norm": 0.07682498544454575, "learning_rate": 0.00019885804599845784, "loss": 0.0777, "step": 2682 }, { "epoch": 0.17344646464646465, "grad_norm": 0.07817425578832626, "learning_rate": 0.00019885701522878748, "loss": 0.1032, "step": 2683 }, { "epoch": 0.17351111111111112, "grad_norm": 0.0742013081908226, "learning_rate": 0.00019885598399679554, "loss": 0.0985, "step": 2684 }, { "epoch": 0.17357575757575758, "grad_norm": 0.07024482637643814, "learning_rate": 0.00019885495230248688, "loss": 0.0844, "step": 2685 }, { "epoch": 0.17364040404040404, "grad_norm": 0.07693494856357574, "learning_rate": 0.00019885392014586627, "loss": 0.0931, "step": 2686 }, { "epoch": 0.1737050505050505, "grad_norm": 0.06686500459909439, "learning_rate": 0.0001988528875269385, "loss": 0.0788, 
"step": 2687 }, { "epoch": 0.17376969696969696, "grad_norm": 0.06792235374450684, "learning_rate": 0.00019885185444570852, "loss": 0.0861, "step": 2688 }, { "epoch": 0.17376969696969696, "eval_bleu": 13.350449363620942, "eval_loss": 0.09692772477865219, "eval_runtime": 2.911, "eval_samples_per_second": 10.993, "eval_steps_per_second": 1.374, "step": 2688 }, { "epoch": 0.17383434343434342, "grad_norm": 0.07090412080287933, "learning_rate": 0.0001988508209021811, "loss": 0.0805, "step": 2689 }, { "epoch": 0.1738989898989899, "grad_norm": 0.07681573927402496, "learning_rate": 0.00019884978689636105, "loss": 0.1042, "step": 2690 }, { "epoch": 0.17396363636363638, "grad_norm": 0.07938114553689957, "learning_rate": 0.00019884875242825323, "loss": 0.1041, "step": 2691 }, { "epoch": 0.17402828282828284, "grad_norm": 0.07724738121032715, "learning_rate": 0.00019884771749786246, "loss": 0.1072, "step": 2692 }, { "epoch": 0.1740929292929293, "grad_norm": 0.11972406506538391, "learning_rate": 0.00019884668210519362, "loss": 0.0902, "step": 2693 }, { "epoch": 0.17415757575757576, "grad_norm": 0.080725759267807, "learning_rate": 0.00019884564625025148, "loss": 0.1103, "step": 2694 }, { "epoch": 0.17422222222222222, "grad_norm": 0.06825319677591324, "learning_rate": 0.000198844609933041, "loss": 0.0743, "step": 2695 }, { "epoch": 0.17428686868686868, "grad_norm": 0.06799200922250748, "learning_rate": 0.0001988435731535669, "loss": 0.077, "step": 2696 }, { "epoch": 0.17435151515151515, "grad_norm": 0.07976427674293518, "learning_rate": 0.00019884253591183408, "loss": 0.0924, "step": 2697 }, { "epoch": 0.1744161616161616, "grad_norm": 0.06222952902317047, "learning_rate": 0.00019884149820784743, "loss": 0.0706, "step": 2698 }, { "epoch": 0.17448080808080807, "grad_norm": 0.07017695903778076, "learning_rate": 0.00019884046004161175, "loss": 0.0789, "step": 2699 }, { "epoch": 0.17454545454545456, "grad_norm": 0.0755394771695137, "learning_rate": 0.00019883942141313195, "loss": 
0.0961, "step": 2700 }, { "epoch": 0.17461010101010102, "grad_norm": 0.08314016461372375, "learning_rate": 0.0001988383823224128, "loss": 0.0994, "step": 2701 }, { "epoch": 0.17467474747474748, "grad_norm": 0.079079769551754, "learning_rate": 0.00019883734276945924, "loss": 0.0885, "step": 2702 }, { "epoch": 0.17473939393939394, "grad_norm": 0.08789471536874771, "learning_rate": 0.0001988363027542761, "loss": 0.1044, "step": 2703 }, { "epoch": 0.1748040404040404, "grad_norm": 0.08137782663106918, "learning_rate": 0.00019883526227686824, "loss": 0.0897, "step": 2704 }, { "epoch": 0.1748040404040404, "eval_bleu": 15.714732085423615, "eval_loss": 0.09762945771217346, "eval_runtime": 2.7307, "eval_samples_per_second": 11.719, "eval_steps_per_second": 1.465, "step": 2704 }, { "epoch": 0.17486868686868687, "grad_norm": 0.07812388241291046, "learning_rate": 0.00019883422133724056, "loss": 0.0925, "step": 2705 }, { "epoch": 0.17493333333333333, "grad_norm": 0.06671834737062454, "learning_rate": 0.00019883317993539787, "loss": 0.0777, "step": 2706 }, { "epoch": 0.1749979797979798, "grad_norm": 0.07645226269960403, "learning_rate": 0.00019883213807134507, "loss": 0.0892, "step": 2707 }, { "epoch": 0.17506262626262625, "grad_norm": 0.07224711030721664, "learning_rate": 0.00019883109574508705, "loss": 0.0757, "step": 2708 }, { "epoch": 0.17512727272727271, "grad_norm": 0.07484664022922516, "learning_rate": 0.00019883005295662866, "loss": 0.0854, "step": 2709 }, { "epoch": 0.1751919191919192, "grad_norm": 0.07461053878068924, "learning_rate": 0.0001988290097059748, "loss": 0.0843, "step": 2710 }, { "epoch": 0.17525656565656567, "grad_norm": 0.08061341941356659, "learning_rate": 0.0001988279659931303, "loss": 0.0841, "step": 2711 }, { "epoch": 0.17532121212121213, "grad_norm": 0.0966501384973526, "learning_rate": 0.0001988269218181001, "loss": 0.1058, "step": 2712 }, { "epoch": 0.1753858585858586, "grad_norm": 0.0789634957909584, "learning_rate": 0.00019882587718088906, "loss": 
0.1, "step": 2713 }, { "epoch": 0.17545050505050505, "grad_norm": 0.08129527419805527, "learning_rate": 0.00019882483208150204, "loss": 0.0998, "step": 2714 }, { "epoch": 0.1755151515151515, "grad_norm": 0.08067364245653152, "learning_rate": 0.00019882378651994397, "loss": 0.0834, "step": 2715 }, { "epoch": 0.17557979797979797, "grad_norm": 0.09063126891851425, "learning_rate": 0.00019882274049621974, "loss": 0.0875, "step": 2716 }, { "epoch": 0.17564444444444444, "grad_norm": 0.08005023747682571, "learning_rate": 0.00019882169401033419, "loss": 0.1037, "step": 2717 }, { "epoch": 0.1757090909090909, "grad_norm": 0.0804174467921257, "learning_rate": 0.00019882064706229225, "loss": 0.0916, "step": 2718 }, { "epoch": 0.1757737373737374, "grad_norm": 0.07512813061475754, "learning_rate": 0.00019881959965209882, "loss": 0.0936, "step": 2719 }, { "epoch": 0.17583838383838385, "grad_norm": 0.06768741458654404, "learning_rate": 0.0001988185517797588, "loss": 0.092, "step": 2720 }, { "epoch": 0.17583838383838385, "eval_bleu": 18.492274221283008, "eval_loss": 0.09604879468679428, "eval_runtime": 2.9265, "eval_samples_per_second": 10.935, "eval_steps_per_second": 1.367, "step": 2720 }, { "epoch": 0.1759030303030303, "grad_norm": 0.07086288183927536, "learning_rate": 0.0001988175034452771, "loss": 0.0907, "step": 2721 }, { "epoch": 0.17596767676767677, "grad_norm": 0.07924073934555054, "learning_rate": 0.00019881645464865858, "loss": 0.1034, "step": 2722 }, { "epoch": 0.17603232323232323, "grad_norm": 0.07363121211528778, "learning_rate": 0.00019881540538990814, "loss": 0.0906, "step": 2723 }, { "epoch": 0.1760969696969697, "grad_norm": 0.09316180646419525, "learning_rate": 0.00019881435566903078, "loss": 0.1154, "step": 2724 }, { "epoch": 0.17616161616161616, "grad_norm": 0.09084955602884293, "learning_rate": 0.00019881330548603127, "loss": 0.1133, "step": 2725 }, { "epoch": 0.17622626262626262, "grad_norm": 0.08407731354236603, "learning_rate": 0.00019881225484091465, 
"loss": 0.0916, "step": 2726 }, { "epoch": 0.17629090909090908, "grad_norm": 0.08583284914493561, "learning_rate": 0.00019881120373368578, "loss": 0.0987, "step": 2727 }, { "epoch": 0.17635555555555554, "grad_norm": 0.08164938539266586, "learning_rate": 0.00019881015216434956, "loss": 0.1065, "step": 2728 }, { "epoch": 0.17642020202020203, "grad_norm": 0.09231586009263992, "learning_rate": 0.00019880910013291093, "loss": 0.0985, "step": 2729 }, { "epoch": 0.1764848484848485, "grad_norm": 0.07795228064060211, "learning_rate": 0.0001988080476393748, "loss": 0.0898, "step": 2730 }, { "epoch": 0.17654949494949496, "grad_norm": 0.06641272455453873, "learning_rate": 0.00019880699468374612, "loss": 0.0797, "step": 2731 }, { "epoch": 0.17661414141414142, "grad_norm": 0.07230307906866074, "learning_rate": 0.00019880594126602978, "loss": 0.0974, "step": 2732 }, { "epoch": 0.17667878787878788, "grad_norm": 0.07935388386249542, "learning_rate": 0.0001988048873862307, "loss": 0.0928, "step": 2733 }, { "epoch": 0.17674343434343434, "grad_norm": 0.07873757928609848, "learning_rate": 0.00019880383304435385, "loss": 0.097, "step": 2734 }, { "epoch": 0.1768080808080808, "grad_norm": 0.07702283561229706, "learning_rate": 0.0001988027782404041, "loss": 0.0874, "step": 2735 }, { "epoch": 0.17687272727272726, "grad_norm": 0.08321177959442139, "learning_rate": 0.00019880172297438646, "loss": 0.1041, "step": 2736 }, { "epoch": 0.17687272727272726, "eval_bleu": 12.285483389687151, "eval_loss": 0.09538325667381287, "eval_runtime": 2.6616, "eval_samples_per_second": 12.023, "eval_steps_per_second": 1.503, "step": 2736 }, { "epoch": 0.17693737373737373, "grad_norm": 0.07662086188793182, "learning_rate": 0.0001988006672463058, "loss": 0.0908, "step": 2737 }, { "epoch": 0.1770020202020202, "grad_norm": 0.0868905708193779, "learning_rate": 0.00019879961105616708, "loss": 0.0984, "step": 2738 }, { "epoch": 0.17706666666666668, "grad_norm": 0.06956250220537186, "learning_rate": 
0.00019879855440397526, "loss": 0.0857, "step": 2739 }, { "epoch": 0.17713131313131314, "grad_norm": 0.07216699421405792, "learning_rate": 0.00019879749728973526, "loss": 0.0866, "step": 2740 }, { "epoch": 0.1771959595959596, "grad_norm": 0.08254578709602356, "learning_rate": 0.00019879643971345204, "loss": 0.0986, "step": 2741 }, { "epoch": 0.17726060606060606, "grad_norm": 0.07362942397594452, "learning_rate": 0.0001987953816751305, "loss": 0.1042, "step": 2742 }, { "epoch": 0.17732525252525252, "grad_norm": 0.06621770560741425, "learning_rate": 0.00019879432317477562, "loss": 0.0885, "step": 2743 }, { "epoch": 0.177389898989899, "grad_norm": 0.08045028150081635, "learning_rate": 0.00019879326421239237, "loss": 0.0918, "step": 2744 }, { "epoch": 0.17745454545454545, "grad_norm": 0.071648508310318, "learning_rate": 0.00019879220478798569, "loss": 0.0896, "step": 2745 }, { "epoch": 0.1775191919191919, "grad_norm": 0.0943872258067131, "learning_rate": 0.00019879114490156053, "loss": 0.1169, "step": 2746 }, { "epoch": 0.17758383838383837, "grad_norm": 0.06958484649658203, "learning_rate": 0.00019879008455312181, "loss": 0.096, "step": 2747 }, { "epoch": 0.17764848484848486, "grad_norm": 0.07820151001214981, "learning_rate": 0.00019878902374267457, "loss": 0.0851, "step": 2748 }, { "epoch": 0.17771313131313132, "grad_norm": 0.07022452354431152, "learning_rate": 0.00019878796247022368, "loss": 0.0773, "step": 2749 }, { "epoch": 0.17777777777777778, "grad_norm": 0.08070288598537445, "learning_rate": 0.00019878690073577417, "loss": 0.0946, "step": 2750 }, { "epoch": 0.17784242424242425, "grad_norm": 0.0653889998793602, "learning_rate": 0.000198785838539331, "loss": 0.0764, "step": 2751 }, { "epoch": 0.1779070707070707, "grad_norm": 0.08429327607154846, "learning_rate": 0.0001987847758808991, "loss": 0.0886, "step": 2752 }, { "epoch": 0.1779070707070707, "eval_bleu": 11.99463844629718, "eval_loss": 0.09583473950624466, "eval_runtime": 2.7464, "eval_samples_per_second": 
11.651, "eval_steps_per_second": 1.456, "step": 2752 }, { "epoch": 0.17797171717171717, "grad_norm": 0.08609875291585922, "learning_rate": 0.00019878371276048346, "loss": 0.0955, "step": 2753 }, { "epoch": 0.17803636363636363, "grad_norm": 0.06654812395572662, "learning_rate": 0.00019878264917808907, "loss": 0.0755, "step": 2754 }, { "epoch": 0.1781010101010101, "grad_norm": 0.16480769217014313, "learning_rate": 0.00019878158513372086, "loss": 0.1009, "step": 2755 }, { "epoch": 0.17816565656565655, "grad_norm": 0.08210154622793198, "learning_rate": 0.00019878052062738386, "loss": 0.0983, "step": 2756 }, { "epoch": 0.17823030303030302, "grad_norm": 0.0810609981417656, "learning_rate": 0.000198779455659083, "loss": 0.0879, "step": 2757 }, { "epoch": 0.1782949494949495, "grad_norm": 0.07702967524528503, "learning_rate": 0.0001987783902288233, "loss": 0.0945, "step": 2758 }, { "epoch": 0.17835959595959597, "grad_norm": 0.07689771056175232, "learning_rate": 0.0001987773243366097, "loss": 0.0874, "step": 2759 }, { "epoch": 0.17842424242424243, "grad_norm": 0.08530224114656448, "learning_rate": 0.00019877625798244726, "loss": 0.1071, "step": 2760 }, { "epoch": 0.1784888888888889, "grad_norm": 0.08010605722665787, "learning_rate": 0.00019877519116634086, "loss": 0.0919, "step": 2761 }, { "epoch": 0.17855353535353535, "grad_norm": 0.09572355449199677, "learning_rate": 0.00019877412388829557, "loss": 0.1053, "step": 2762 }, { "epoch": 0.17861818181818181, "grad_norm": 0.07287169992923737, "learning_rate": 0.00019877305614831637, "loss": 0.0858, "step": 2763 }, { "epoch": 0.17868282828282828, "grad_norm": 0.07264987379312515, "learning_rate": 0.0001987719879464082, "loss": 0.0937, "step": 2764 }, { "epoch": 0.17874747474747474, "grad_norm": 0.09174101799726486, "learning_rate": 0.00019877091928257614, "loss": 0.0825, "step": 2765 }, { "epoch": 0.1788121212121212, "grad_norm": 0.07856519520282745, "learning_rate": 0.0001987698501568251, "loss": 0.0926, "step": 2766 }, { 
"epoch": 0.1788767676767677, "grad_norm": 0.07437831908464432, "learning_rate": 0.00019876878056916016, "loss": 0.0876, "step": 2767 }, { "epoch": 0.17894141414141415, "grad_norm": 0.07657970488071442, "learning_rate": 0.00019876771051958626, "loss": 0.0951, "step": 2768 }, { "epoch": 0.17894141414141415, "eval_bleu": 12.501776313075313, "eval_loss": 0.096172034740448, "eval_runtime": 2.5926, "eval_samples_per_second": 12.343, "eval_steps_per_second": 1.543, "step": 2768 }, { "epoch": 0.1790060606060606, "grad_norm": 0.07658912986516953, "learning_rate": 0.00019876664000810845, "loss": 0.0856, "step": 2769 }, { "epoch": 0.17907070707070707, "grad_norm": 0.0717838853597641, "learning_rate": 0.00019876556903473171, "loss": 0.0791, "step": 2770 }, { "epoch": 0.17913535353535354, "grad_norm": 0.08511864393949509, "learning_rate": 0.00019876449759946105, "loss": 0.1022, "step": 2771 }, { "epoch": 0.1792, "grad_norm": 0.07913220673799515, "learning_rate": 0.00019876342570230147, "loss": 0.0921, "step": 2772 }, { "epoch": 0.17926464646464646, "grad_norm": 0.07598750293254852, "learning_rate": 0.00019876235334325803, "loss": 0.0921, "step": 2773 }, { "epoch": 0.17932929292929292, "grad_norm": 0.11733365803956985, "learning_rate": 0.00019876128052233568, "loss": 0.1033, "step": 2774 }, { "epoch": 0.17939393939393938, "grad_norm": 0.07843891531229019, "learning_rate": 0.00019876020723953952, "loss": 0.0839, "step": 2775 }, { "epoch": 0.17945858585858585, "grad_norm": 0.06839077919721603, "learning_rate": 0.00019875913349487448, "loss": 0.0745, "step": 2776 }, { "epoch": 0.17952323232323233, "grad_norm": 0.07148474454879761, "learning_rate": 0.00019875805928834566, "loss": 0.0944, "step": 2777 }, { "epoch": 0.1795878787878788, "grad_norm": 0.07561158388853073, "learning_rate": 0.000198756984619958, "loss": 0.089, "step": 2778 }, { "epoch": 0.17965252525252526, "grad_norm": 0.07801327109336853, "learning_rate": 0.0001987559094897166, "loss": 0.0989, "step": 2779 }, { "epoch": 
0.17971717171717172, "grad_norm": 0.07930078357458115, "learning_rate": 0.00019875483389762645, "loss": 0.0961, "step": 2780 }, { "epoch": 0.17978181818181818, "grad_norm": 0.07779914885759354, "learning_rate": 0.00019875375784369258, "loss": 0.1055, "step": 2781 }, { "epoch": 0.17984646464646464, "grad_norm": 0.07417116314172745, "learning_rate": 0.00019875268132792004, "loss": 0.0738, "step": 2782 }, { "epoch": 0.1799111111111111, "grad_norm": 0.07325158268213272, "learning_rate": 0.00019875160435031385, "loss": 0.08, "step": 2783 }, { "epoch": 0.17997575757575757, "grad_norm": 0.07600921392440796, "learning_rate": 0.00019875052691087908, "loss": 0.0897, "step": 2784 }, { "epoch": 0.17997575757575757, "eval_bleu": 12.612213977287793, "eval_loss": 0.09436735510826111, "eval_runtime": 2.7371, "eval_samples_per_second": 11.691, "eval_steps_per_second": 1.461, "step": 2784 }, { "epoch": 0.18004040404040403, "grad_norm": 0.09855237603187561, "learning_rate": 0.0001987494490096207, "loss": 0.1204, "step": 2785 }, { "epoch": 0.18010505050505052, "grad_norm": 0.06247742101550102, "learning_rate": 0.00019874837064654384, "loss": 0.0803, "step": 2786 }, { "epoch": 0.18016969696969698, "grad_norm": 0.07029539346694946, "learning_rate": 0.00019874729182165347, "loss": 0.0777, "step": 2787 }, { "epoch": 0.18023434343434344, "grad_norm": 0.07934654504060745, "learning_rate": 0.00019874621253495467, "loss": 0.0855, "step": 2788 }, { "epoch": 0.1802989898989899, "grad_norm": 0.0715228021144867, "learning_rate": 0.00019874513278645247, "loss": 0.0802, "step": 2789 }, { "epoch": 0.18036363636363636, "grad_norm": 0.0898372009396553, "learning_rate": 0.00019874405257615192, "loss": 0.1072, "step": 2790 }, { "epoch": 0.18042828282828283, "grad_norm": 0.0665682777762413, "learning_rate": 0.00019874297190405812, "loss": 0.0727, "step": 2791 }, { "epoch": 0.1804929292929293, "grad_norm": 0.08710990846157074, "learning_rate": 0.00019874189077017605, "loss": 0.0818, "step": 2792 }, { 
"epoch": 0.18055757575757575, "grad_norm": 0.0810735747218132, "learning_rate": 0.0001987408091745108, "loss": 0.1106, "step": 2793 }, { "epoch": 0.1806222222222222, "grad_norm": 0.09424632787704468, "learning_rate": 0.0001987397271170674, "loss": 0.1085, "step": 2794 }, { "epoch": 0.18068686868686867, "grad_norm": 0.08035526424646378, "learning_rate": 0.000198738644597851, "loss": 0.1019, "step": 2795 }, { "epoch": 0.18075151515151516, "grad_norm": 0.08920923620462418, "learning_rate": 0.00019873756161686656, "loss": 0.0944, "step": 2796 }, { "epoch": 0.18081616161616162, "grad_norm": 0.07438452541828156, "learning_rate": 0.0001987364781741192, "loss": 0.092, "step": 2797 }, { "epoch": 0.1808808080808081, "grad_norm": 0.07915820181369781, "learning_rate": 0.00019873539426961396, "loss": 0.0944, "step": 2798 }, { "epoch": 0.18094545454545455, "grad_norm": 0.06792636215686798, "learning_rate": 0.00019873430990335596, "loss": 0.0795, "step": 2799 }, { "epoch": 0.181010101010101, "grad_norm": 0.08413238078355789, "learning_rate": 0.00019873322507535019, "loss": 0.1005, "step": 2800 }, { "epoch": 0.181010101010101, "eval_bleu": 11.47564656157881, "eval_loss": 0.09421389549970627, "eval_runtime": 2.7802, "eval_samples_per_second": 11.51, "eval_steps_per_second": 1.439, "step": 2800 }, { "epoch": 0.18107474747474747, "grad_norm": 0.07722476869821548, "learning_rate": 0.00019873213978560182, "loss": 0.0952, "step": 2801 }, { "epoch": 0.18113939393939393, "grad_norm": 0.0825035348534584, "learning_rate": 0.0001987310540341158, "loss": 0.109, "step": 2802 }, { "epoch": 0.1812040404040404, "grad_norm": 0.0754779577255249, "learning_rate": 0.00019872996782089732, "loss": 0.0866, "step": 2803 }, { "epoch": 0.18126868686868686, "grad_norm": 0.08096551150083542, "learning_rate": 0.00019872888114595144, "loss": 0.0844, "step": 2804 }, { "epoch": 0.18133333333333335, "grad_norm": 0.09417939931154251, "learning_rate": 0.00019872779400928318, "loss": 0.1022, "step": 2805 }, { 
"epoch": 0.1813979797979798, "grad_norm": 0.07801815122365952, "learning_rate": 0.0001987267064108977, "loss": 0.0912, "step": 2806 }, { "epoch": 0.18146262626262627, "grad_norm": 0.08048546314239502, "learning_rate": 0.00019872561835080003, "loss": 0.1013, "step": 2807 }, { "epoch": 0.18152727272727273, "grad_norm": 0.08277661353349686, "learning_rate": 0.0001987245298289953, "loss": 0.0949, "step": 2808 }, { "epoch": 0.1815919191919192, "grad_norm": 0.09248380362987518, "learning_rate": 0.00019872344084548857, "loss": 0.091, "step": 2809 }, { "epoch": 0.18165656565656566, "grad_norm": 0.07722607254981995, "learning_rate": 0.00019872235140028495, "loss": 0.0879, "step": 2810 }, { "epoch": 0.18172121212121212, "grad_norm": 0.06787624955177307, "learning_rate": 0.00019872126149338953, "loss": 0.0845, "step": 2811 }, { "epoch": 0.18178585858585858, "grad_norm": 0.08037717640399933, "learning_rate": 0.0001987201711248074, "loss": 0.0801, "step": 2812 }, { "epoch": 0.18185050505050504, "grad_norm": 0.06995908915996552, "learning_rate": 0.00019871908029454367, "loss": 0.0875, "step": 2813 }, { "epoch": 0.1819151515151515, "grad_norm": 0.07385554164648056, "learning_rate": 0.00019871798900260345, "loss": 0.0826, "step": 2814 }, { "epoch": 0.181979797979798, "grad_norm": 0.06955590099096298, "learning_rate": 0.0001987168972489918, "loss": 0.0864, "step": 2815 }, { "epoch": 0.18204444444444445, "grad_norm": 0.07817044854164124, "learning_rate": 0.0001987158050337139, "loss": 0.0906, "step": 2816 }, { "epoch": 0.18204444444444445, "eval_bleu": 14.96304468250526, "eval_loss": 0.0962601900100708, "eval_runtime": 2.6835, "eval_samples_per_second": 11.925, "eval_steps_per_second": 1.491, "step": 2816 }, { "epoch": 0.18210909090909091, "grad_norm": 0.0857403576374054, "learning_rate": 0.00019871471235677476, "loss": 0.0925, "step": 2817 }, { "epoch": 0.18217373737373738, "grad_norm": 0.07382909208536148, "learning_rate": 0.0001987136192181796, "loss": 0.0942, "step": 2818 }, { 
"epoch": 0.18223838383838384, "grad_norm": 0.1192716509103775, "learning_rate": 0.00019871252561793343, "loss": 0.1067, "step": 2819 }, { "epoch": 0.1823030303030303, "grad_norm": 0.08581046015024185, "learning_rate": 0.00019871143155604143, "loss": 0.0954, "step": 2820 }, { "epoch": 0.18236767676767676, "grad_norm": 0.07248219102621078, "learning_rate": 0.0001987103370325087, "loss": 0.0825, "step": 2821 }, { "epoch": 0.18243232323232322, "grad_norm": 0.0770423635840416, "learning_rate": 0.00019870924204734035, "loss": 0.097, "step": 2822 }, { "epoch": 0.18249696969696969, "grad_norm": 0.08159273117780685, "learning_rate": 0.00019870814660054152, "loss": 0.098, "step": 2823 }, { "epoch": 0.18256161616161617, "grad_norm": 0.08246093988418579, "learning_rate": 0.0001987070506921173, "loss": 0.0976, "step": 2824 }, { "epoch": 0.18262626262626264, "grad_norm": 0.08934545516967773, "learning_rate": 0.0001987059543220729, "loss": 0.1179, "step": 2825 }, { "epoch": 0.1826909090909091, "grad_norm": 0.07338359206914902, "learning_rate": 0.0001987048574904133, "loss": 0.087, "step": 2826 }, { "epoch": 0.18275555555555556, "grad_norm": 0.08218204975128174, "learning_rate": 0.00019870376019714376, "loss": 0.12, "step": 2827 }, { "epoch": 0.18282020202020202, "grad_norm": 0.07678299397230148, "learning_rate": 0.00019870266244226934, "loss": 0.0883, "step": 2828 }, { "epoch": 0.18288484848484848, "grad_norm": 0.07436902821063995, "learning_rate": 0.00019870156422579523, "loss": 0.0896, "step": 2829 }, { "epoch": 0.18294949494949495, "grad_norm": 0.06455118954181671, "learning_rate": 0.0001987004655477265, "loss": 0.0749, "step": 2830 }, { "epoch": 0.1830141414141414, "grad_norm": 0.0726434513926506, "learning_rate": 0.00019869936640806835, "loss": 0.0917, "step": 2831 }, { "epoch": 0.18307878787878787, "grad_norm": 0.0812985897064209, "learning_rate": 0.00019869826680682587, "loss": 0.1051, "step": 2832 }, { "epoch": 0.18307878787878787, "eval_bleu": 13.207528272580483, 
"eval_loss": 0.09708814322948456, "eval_runtime": 2.9038, "eval_samples_per_second": 11.02, "eval_steps_per_second": 1.377, "step": 2832 }, { "epoch": 0.18314343434343433, "grad_norm": 0.06964149326086044, "learning_rate": 0.00019869716674400422, "loss": 0.0799, "step": 2833 }, { "epoch": 0.18320808080808082, "grad_norm": 0.0735994428396225, "learning_rate": 0.00019869606621960857, "loss": 0.0909, "step": 2834 }, { "epoch": 0.18327272727272728, "grad_norm": 0.07230139523744583, "learning_rate": 0.00019869496523364404, "loss": 0.0873, "step": 2835 }, { "epoch": 0.18333737373737374, "grad_norm": 0.07027056813240051, "learning_rate": 0.0001986938637861158, "loss": 0.0885, "step": 2836 }, { "epoch": 0.1834020202020202, "grad_norm": 0.07894861698150635, "learning_rate": 0.00019869276187702895, "loss": 0.0887, "step": 2837 }, { "epoch": 0.18346666666666667, "grad_norm": 0.07709158957004547, "learning_rate": 0.0001986916595063887, "loss": 0.0998, "step": 2838 }, { "epoch": 0.18353131313131313, "grad_norm": 0.08857876062393188, "learning_rate": 0.0001986905566742002, "loss": 0.107, "step": 2839 }, { "epoch": 0.1835959595959596, "grad_norm": 0.0786396712064743, "learning_rate": 0.00019868945338046858, "loss": 0.094, "step": 2840 }, { "epoch": 0.18366060606060605, "grad_norm": 0.0806623250246048, "learning_rate": 0.000198688349625199, "loss": 0.1022, "step": 2841 }, { "epoch": 0.1837252525252525, "grad_norm": 0.075035959482193, "learning_rate": 0.00019868724540839664, "loss": 0.0816, "step": 2842 }, { "epoch": 0.183789898989899, "grad_norm": 0.06769943982362747, "learning_rate": 0.00019868614073006668, "loss": 0.0789, "step": 2843 }, { "epoch": 0.18385454545454546, "grad_norm": 0.07184632867574692, "learning_rate": 0.00019868503559021425, "loss": 0.0942, "step": 2844 }, { "epoch": 0.18391919191919193, "grad_norm": 0.0876123458147049, "learning_rate": 0.00019868392998884454, "loss": 0.0945, "step": 2845 }, { "epoch": 0.1839838383838384, "grad_norm": 0.07806850969791412, 
"learning_rate": 0.0001986828239259627, "loss": 0.0931, "step": 2846 }, { "epoch": 0.18404848484848485, "grad_norm": 0.07110225409269333, "learning_rate": 0.00019868171740157394, "loss": 0.0827, "step": 2847 }, { "epoch": 0.1841131313131313, "grad_norm": 0.0839366614818573, "learning_rate": 0.00019868061041568337, "loss": 0.0969, "step": 2848 }, { "epoch": 0.1841131313131313, "eval_bleu": 12.64759814692508, "eval_loss": 0.09852884709835052, "eval_runtime": 2.7421, "eval_samples_per_second": 11.67, "eval_steps_per_second": 1.459, "step": 2848 }, { "epoch": 0.18417777777777777, "grad_norm": 0.08227841556072235, "learning_rate": 0.00019867950296829624, "loss": 0.0961, "step": 2849 }, { "epoch": 0.18424242424242424, "grad_norm": 0.07454248517751694, "learning_rate": 0.0001986783950594177, "loss": 0.0988, "step": 2850 }, { "epoch": 0.1843070707070707, "grad_norm": 0.07863860577344894, "learning_rate": 0.0001986772866890529, "loss": 0.0807, "step": 2851 }, { "epoch": 0.18437171717171716, "grad_norm": 0.0771082192659378, "learning_rate": 0.00019867617785720708, "loss": 0.0974, "step": 2852 }, { "epoch": 0.18443636363636365, "grad_norm": 0.08415652811527252, "learning_rate": 0.0001986750685638854, "loss": 0.0968, "step": 2853 }, { "epoch": 0.1845010101010101, "grad_norm": 0.10221920162439346, "learning_rate": 0.00019867395880909303, "loss": 0.1171, "step": 2854 }, { "epoch": 0.18456565656565657, "grad_norm": 0.08217688649892807, "learning_rate": 0.00019867284859283516, "loss": 0.089, "step": 2855 }, { "epoch": 0.18463030303030303, "grad_norm": 0.08122526854276657, "learning_rate": 0.00019867173791511704, "loss": 0.1027, "step": 2856 }, { "epoch": 0.1846949494949495, "grad_norm": 0.07600749284029007, "learning_rate": 0.0001986706267759438, "loss": 0.0921, "step": 2857 }, { "epoch": 0.18475959595959596, "grad_norm": 0.08221013844013214, "learning_rate": 0.00019866951517532068, "loss": 0.0834, "step": 2858 }, { "epoch": 0.18482424242424242, "grad_norm": 0.07795519381761551, 
"learning_rate": 0.0001986684031132528, "loss": 0.1004, "step": 2859 }, { "epoch": 0.18488888888888888, "grad_norm": 0.11812471598386765, "learning_rate": 0.00019866729058974546, "loss": 0.1266, "step": 2860 }, { "epoch": 0.18495353535353534, "grad_norm": 0.08074431121349335, "learning_rate": 0.00019866617760480381, "loss": 0.0966, "step": 2861 }, { "epoch": 0.18501818181818183, "grad_norm": 0.0667763203382492, "learning_rate": 0.0001986650641584331, "loss": 0.081, "step": 2862 }, { "epoch": 0.1850828282828283, "grad_norm": 0.08093970268964767, "learning_rate": 0.00019866395025063848, "loss": 0.0946, "step": 2863 }, { "epoch": 0.18514747474747476, "grad_norm": 0.0754845142364502, "learning_rate": 0.00019866283588142517, "loss": 0.0903, "step": 2864 }, { "epoch": 0.18514747474747476, "eval_bleu": 14.564012214557106, "eval_loss": 0.09604822099208832, "eval_runtime": 2.7469, "eval_samples_per_second": 11.649, "eval_steps_per_second": 1.456, "step": 2864 }, { "epoch": 0.18521212121212122, "grad_norm": 0.07853715866804123, "learning_rate": 0.00019866172105079837, "loss": 0.0909, "step": 2865 }, { "epoch": 0.18527676767676768, "grad_norm": 0.06989241391420364, "learning_rate": 0.00019866060575876335, "loss": 0.0882, "step": 2866 }, { "epoch": 0.18534141414141414, "grad_norm": 0.07622192054986954, "learning_rate": 0.0001986594900053253, "loss": 0.0936, "step": 2867 }, { "epoch": 0.1854060606060606, "grad_norm": 0.08440738171339035, "learning_rate": 0.0001986583737904894, "loss": 0.1065, "step": 2868 }, { "epoch": 0.18547070707070706, "grad_norm": 0.07761204242706299, "learning_rate": 0.00019865725711426096, "loss": 0.0953, "step": 2869 }, { "epoch": 0.18553535353535353, "grad_norm": 0.09092800319194794, "learning_rate": 0.0001986561399766451, "loss": 0.1025, "step": 2870 }, { "epoch": 0.1856, "grad_norm": 0.08442335575819016, "learning_rate": 0.0001986550223776471, "loss": 0.0889, "step": 2871 }, { "epoch": 0.18566464646464648, "grad_norm": 0.08239573985338211, 
"learning_rate": 0.00019865390431727216, "loss": 0.0896, "step": 2872 }, { "epoch": 0.18572929292929294, "grad_norm": 0.08330517262220383, "learning_rate": 0.00019865278579552555, "loss": 0.0953, "step": 2873 }, { "epoch": 0.1857939393939394, "grad_norm": 0.0879674181342125, "learning_rate": 0.00019865166681241246, "loss": 0.1113, "step": 2874 }, { "epoch": 0.18585858585858586, "grad_norm": 0.07764395326375961, "learning_rate": 0.00019865054736793814, "loss": 0.0936, "step": 2875 }, { "epoch": 0.18592323232323232, "grad_norm": 0.07007759809494019, "learning_rate": 0.0001986494274621078, "loss": 0.076, "step": 2876 }, { "epoch": 0.18598787878787879, "grad_norm": 0.08036404103040695, "learning_rate": 0.00019864830709492672, "loss": 0.0971, "step": 2877 }, { "epoch": 0.18605252525252525, "grad_norm": 0.08120427280664444, "learning_rate": 0.00019864718626640013, "loss": 0.0982, "step": 2878 }, { "epoch": 0.1861171717171717, "grad_norm": 0.0838732123374939, "learning_rate": 0.00019864606497653324, "loss": 0.1185, "step": 2879 }, { "epoch": 0.18618181818181817, "grad_norm": 0.07545242458581924, "learning_rate": 0.00019864494322533135, "loss": 0.0928, "step": 2880 }, { "epoch": 0.18618181818181817, "eval_bleu": 11.666511494804512, "eval_loss": 0.09546297043561935, "eval_runtime": 2.8352, "eval_samples_per_second": 11.287, "eval_steps_per_second": 1.411, "step": 2880 }, { "epoch": 0.18624646464646466, "grad_norm": 0.07434926927089691, "learning_rate": 0.00019864382101279966, "loss": 0.0801, "step": 2881 }, { "epoch": 0.18631111111111112, "grad_norm": 0.06940607726573944, "learning_rate": 0.00019864269833894343, "loss": 0.0854, "step": 2882 }, { "epoch": 0.18637575757575758, "grad_norm": 0.06883041560649872, "learning_rate": 0.0001986415752037679, "loss": 0.0879, "step": 2883 }, { "epoch": 0.18644040404040405, "grad_norm": 0.0735919252038002, "learning_rate": 0.0001986404516072783, "loss": 0.0837, "step": 2884 }, { "epoch": 0.1865050505050505, "grad_norm": 
0.07767898589372635, "learning_rate": 0.00019863932754947996, "loss": 0.1079, "step": 2885 }, { "epoch": 0.18656969696969697, "grad_norm": 0.09822847694158554, "learning_rate": 0.0001986382030303781, "loss": 0.1166, "step": 2886 }, { "epoch": 0.18663434343434343, "grad_norm": 0.09037579596042633, "learning_rate": 0.00019863707804997796, "loss": 0.101, "step": 2887 }, { "epoch": 0.1866989898989899, "grad_norm": 0.07880650460720062, "learning_rate": 0.00019863595260828483, "loss": 0.091, "step": 2888 }, { "epoch": 0.18676363636363635, "grad_norm": 0.07453230768442154, "learning_rate": 0.00019863482670530393, "loss": 0.0833, "step": 2889 }, { "epoch": 0.18682828282828282, "grad_norm": 0.07326208800077438, "learning_rate": 0.00019863370034104057, "loss": 0.0791, "step": 2890 }, { "epoch": 0.1868929292929293, "grad_norm": 0.06749226152896881, "learning_rate": 0.0001986325735155, "loss": 0.0915, "step": 2891 }, { "epoch": 0.18695757575757577, "grad_norm": 0.08843649178743362, "learning_rate": 0.00019863144622868747, "loss": 0.0949, "step": 2892 }, { "epoch": 0.18702222222222223, "grad_norm": 0.06979414820671082, "learning_rate": 0.0001986303184806083, "loss": 0.0881, "step": 2893 }, { "epoch": 0.1870868686868687, "grad_norm": 0.082852803170681, "learning_rate": 0.0001986291902712677, "loss": 0.0875, "step": 2894 }, { "epoch": 0.18715151515151515, "grad_norm": 0.08005732297897339, "learning_rate": 0.00019862806160067105, "loss": 0.0937, "step": 2895 }, { "epoch": 0.1872161616161616, "grad_norm": 0.09265148639678955, "learning_rate": 0.00019862693246882352, "loss": 0.1149, "step": 2896 }, { "epoch": 0.1872161616161616, "eval_bleu": 14.385859501465685, "eval_loss": 0.09779681265354156, "eval_runtime": 2.854, "eval_samples_per_second": 11.212, "eval_steps_per_second": 1.402, "step": 2896 }, { "epoch": 0.18728080808080808, "grad_norm": 0.07319992035627365, "learning_rate": 0.00019862580287573046, "loss": 0.0792, "step": 2897 }, { "epoch": 0.18734545454545454, "grad_norm": 
0.08132781088352203, "learning_rate": 0.0001986246728213971, "loss": 0.0905, "step": 2898 }, { "epoch": 0.187410101010101, "grad_norm": 0.07200151681900024, "learning_rate": 0.00019862354230582873, "loss": 0.0813, "step": 2899 }, { "epoch": 0.1874747474747475, "grad_norm": 0.09434063732624054, "learning_rate": 0.00019862241132903067, "loss": 0.0936, "step": 2900 }, { "epoch": 0.18753939393939395, "grad_norm": 0.07864663749933243, "learning_rate": 0.00019862127989100822, "loss": 0.0916, "step": 2901 }, { "epoch": 0.1876040404040404, "grad_norm": 0.07996901869773865, "learning_rate": 0.00019862014799176662, "loss": 0.0978, "step": 2902 }, { "epoch": 0.18766868686868687, "grad_norm": 0.07707536965608597, "learning_rate": 0.0001986190156313112, "loss": 0.0928, "step": 2903 }, { "epoch": 0.18773333333333334, "grad_norm": 0.0690661370754242, "learning_rate": 0.00019861788280964727, "loss": 0.0931, "step": 2904 }, { "epoch": 0.1877979797979798, "grad_norm": 0.08122185617685318, "learning_rate": 0.00019861674952678006, "loss": 0.098, "step": 2905 }, { "epoch": 0.18786262626262626, "grad_norm": 0.08451220393180847, "learning_rate": 0.00019861561578271493, "loss": 0.1063, "step": 2906 }, { "epoch": 0.18792727272727272, "grad_norm": 0.06468889862298965, "learning_rate": 0.0001986144815774572, "loss": 0.0817, "step": 2907 }, { "epoch": 0.18799191919191918, "grad_norm": 0.06681172549724579, "learning_rate": 0.00019861334691101211, "loss": 0.0885, "step": 2908 }, { "epoch": 0.18805656565656564, "grad_norm": 0.07753611356019974, "learning_rate": 0.000198612211783385, "loss": 0.0997, "step": 2909 }, { "epoch": 0.18812121212121213, "grad_norm": 0.07176484167575836, "learning_rate": 0.0001986110761945812, "loss": 0.083, "step": 2910 }, { "epoch": 0.1881858585858586, "grad_norm": 0.07666324079036713, "learning_rate": 0.00019860994014460597, "loss": 0.0967, "step": 2911 }, { "epoch": 0.18825050505050506, "grad_norm": 0.0668957382440567, "learning_rate": 0.00019860880363346464, "loss": 
0.0828, "step": 2912 }, { "epoch": 0.18825050505050506, "eval_bleu": 14.763794760543348, "eval_loss": 0.09515655785799026, "eval_runtime": 2.685, "eval_samples_per_second": 11.918, "eval_steps_per_second": 1.49, "step": 2912 }, { "epoch": 0.18831515151515152, "grad_norm": 0.07317197322845459, "learning_rate": 0.00019860766666116258, "loss": 0.0815, "step": 2913 }, { "epoch": 0.18837979797979798, "grad_norm": 0.0779421254992485, "learning_rate": 0.00019860652922770502, "loss": 0.0945, "step": 2914 }, { "epoch": 0.18844444444444444, "grad_norm": 0.0642285868525505, "learning_rate": 0.00019860539133309733, "loss": 0.0704, "step": 2915 }, { "epoch": 0.1885090909090909, "grad_norm": 0.07154625654220581, "learning_rate": 0.0001986042529773448, "loss": 0.0781, "step": 2916 }, { "epoch": 0.18857373737373737, "grad_norm": 0.07037678360939026, "learning_rate": 0.00019860311416045284, "loss": 0.0779, "step": 2917 }, { "epoch": 0.18863838383838383, "grad_norm": 0.08008299767971039, "learning_rate": 0.00019860197488242668, "loss": 0.0966, "step": 2918 }, { "epoch": 0.18870303030303032, "grad_norm": 0.08718068152666092, "learning_rate": 0.00019860083514327168, "loss": 0.0927, "step": 2919 }, { "epoch": 0.18876767676767678, "grad_norm": 0.07753336429595947, "learning_rate": 0.00019859969494299317, "loss": 0.0844, "step": 2920 }, { "epoch": 0.18883232323232324, "grad_norm": 0.08531400561332703, "learning_rate": 0.00019859855428159645, "loss": 0.1067, "step": 2921 }, { "epoch": 0.1888969696969697, "grad_norm": 0.07498552650213242, "learning_rate": 0.00019859741315908695, "loss": 0.0845, "step": 2922 }, { "epoch": 0.18896161616161616, "grad_norm": 0.07353661209344864, "learning_rate": 0.0001985962715754699, "loss": 0.0937, "step": 2923 }, { "epoch": 0.18902626262626263, "grad_norm": 0.07552067935466766, "learning_rate": 0.00019859512953075071, "loss": 0.0928, "step": 2924 }, { "epoch": 0.1890909090909091, "grad_norm": 0.07916353642940521, "learning_rate": 0.00019859398702493466, 
"loss": 0.1088, "step": 2925 }, { "epoch": 0.18915555555555555, "grad_norm": 0.07933656871318817, "learning_rate": 0.00019859284405802715, "loss": 0.1058, "step": 2926 }, { "epoch": 0.189220202020202, "grad_norm": 0.0765988752245903, "learning_rate": 0.0001985917006300335, "loss": 0.0918, "step": 2927 }, { "epoch": 0.18928484848484847, "grad_norm": 0.07666276395320892, "learning_rate": 0.0001985905567409591, "loss": 0.0964, "step": 2928 }, { "epoch": 0.18928484848484847, "eval_bleu": 12.737464917734656, "eval_loss": 0.09495145827531815, "eval_runtime": 2.894, "eval_samples_per_second": 11.057, "eval_steps_per_second": 1.382, "step": 2928 }, { "epoch": 0.18934949494949496, "grad_norm": 0.08585377037525177, "learning_rate": 0.00019858941239080918, "loss": 0.1116, "step": 2929 }, { "epoch": 0.18941414141414142, "grad_norm": 0.08172925561666489, "learning_rate": 0.00019858826757958924, "loss": 0.1024, "step": 2930 }, { "epoch": 0.18947878787878789, "grad_norm": 0.07743149250745773, "learning_rate": 0.00019858712230730452, "loss": 0.1038, "step": 2931 }, { "epoch": 0.18954343434343435, "grad_norm": 0.07631954550743103, "learning_rate": 0.00019858597657396043, "loss": 0.0993, "step": 2932 }, { "epoch": 0.1896080808080808, "grad_norm": 0.07849955558776855, "learning_rate": 0.00019858483037956233, "loss": 0.0976, "step": 2933 }, { "epoch": 0.18967272727272727, "grad_norm": 0.0782211497426033, "learning_rate": 0.00019858368372411558, "loss": 0.0894, "step": 2934 }, { "epoch": 0.18973737373737373, "grad_norm": 0.06850609928369522, "learning_rate": 0.0001985825366076255, "loss": 0.0851, "step": 2935 }, { "epoch": 0.1898020202020202, "grad_norm": 0.07977654784917831, "learning_rate": 0.0001985813890300975, "loss": 0.0882, "step": 2936 }, { "epoch": 0.18986666666666666, "grad_norm": 0.07474221289157867, "learning_rate": 0.000198580240991537, "loss": 0.0843, "step": 2937 }, { "epoch": 0.18993131313131312, "grad_norm": 0.09841342270374298, "learning_rate": 0.00019857909249194918, 
"loss": 0.1046, "step": 2938 }, { "epoch": 0.1899959595959596, "grad_norm": 0.08717047423124313, "learning_rate": 0.00019857794353133964, "loss": 0.1034, "step": 2939 }, { "epoch": 0.19006060606060607, "grad_norm": 0.06535878032445908, "learning_rate": 0.0001985767941097136, "loss": 0.0828, "step": 2940 }, { "epoch": 0.19012525252525253, "grad_norm": 0.08358795940876007, "learning_rate": 0.00019857564422707649, "loss": 0.1115, "step": 2941 }, { "epoch": 0.190189898989899, "grad_norm": 0.06361483037471771, "learning_rate": 0.00019857449388343366, "loss": 0.0767, "step": 2942 }, { "epoch": 0.19025454545454545, "grad_norm": 0.08360166102647781, "learning_rate": 0.0001985733430787905, "loss": 0.0961, "step": 2943 }, { "epoch": 0.19031919191919192, "grad_norm": 0.07721679657697678, "learning_rate": 0.00019857219181315246, "loss": 0.099, "step": 2944 }, { "epoch": 0.19031919191919192, "eval_bleu": 13.57269188628614, "eval_loss": 0.09461914002895355, "eval_runtime": 2.7117, "eval_samples_per_second": 11.801, "eval_steps_per_second": 1.475, "step": 2944 }, { "epoch": 0.19038383838383838, "grad_norm": 0.07514526695013046, "learning_rate": 0.00019857104008652482, "loss": 0.1006, "step": 2945 }, { "epoch": 0.19044848484848484, "grad_norm": 0.08099225908517838, "learning_rate": 0.00019856988789891306, "loss": 0.1009, "step": 2946 }, { "epoch": 0.1905131313131313, "grad_norm": 0.067985400557518, "learning_rate": 0.0001985687352503225, "loss": 0.0934, "step": 2947 }, { "epoch": 0.1905777777777778, "grad_norm": 0.0791618674993515, "learning_rate": 0.00019856758214075853, "loss": 0.0964, "step": 2948 }, { "epoch": 0.19064242424242425, "grad_norm": 0.08023680746555328, "learning_rate": 0.00019856642857022657, "loss": 0.0998, "step": 2949 }, { "epoch": 0.1907070707070707, "grad_norm": 0.06242063641548157, "learning_rate": 0.00019856527453873202, "loss": 0.0643, "step": 2950 }, { "epoch": 0.19077171717171718, "grad_norm": 0.08916376531124115, "learning_rate": 0.00019856412004628027, 
"loss": 0.1108, "step": 2951 }, { "epoch": 0.19083636363636364, "grad_norm": 0.15652132034301758, "learning_rate": 0.00019856296509287668, "loss": 0.1063, "step": 2952 }, { "epoch": 0.1909010101010101, "grad_norm": 0.07384748011827469, "learning_rate": 0.0001985618096785267, "loss": 0.086, "step": 2953 }, { "epoch": 0.19096565656565656, "grad_norm": 0.0817989632487297, "learning_rate": 0.00019856065380323578, "loss": 0.0847, "step": 2954 }, { "epoch": 0.19103030303030302, "grad_norm": 0.08566930145025253, "learning_rate": 0.0001985594974670092, "loss": 0.1169, "step": 2955 }, { "epoch": 0.19109494949494948, "grad_norm": 0.07700645178556442, "learning_rate": 0.00019855834066985243, "loss": 0.0943, "step": 2956 }, { "epoch": 0.19115959595959595, "grad_norm": 0.08066175132989883, "learning_rate": 0.0001985571834117709, "loss": 0.0991, "step": 2957 }, { "epoch": 0.19122424242424244, "grad_norm": 0.08218420296907425, "learning_rate": 0.00019855602569277003, "loss": 0.0992, "step": 2958 }, { "epoch": 0.1912888888888889, "grad_norm": 0.06755941361188889, "learning_rate": 0.00019855486751285518, "loss": 0.079, "step": 2959 }, { "epoch": 0.19135353535353536, "grad_norm": 0.07037235796451569, "learning_rate": 0.00019855370887203181, "loss": 0.0829, "step": 2960 }, { "epoch": 0.19135353535353536, "eval_bleu": 15.749835137967121, "eval_loss": 0.09402447938919067, "eval_runtime": 2.7406, "eval_samples_per_second": 11.676, "eval_steps_per_second": 1.46, "step": 2960 }, { "epoch": 0.19141818181818182, "grad_norm": 0.06689716130495071, "learning_rate": 0.00019855254977030532, "loss": 0.0852, "step": 2961 }, { "epoch": 0.19148282828282828, "grad_norm": 0.07843676954507828, "learning_rate": 0.00019855139020768115, "loss": 0.1105, "step": 2962 }, { "epoch": 0.19154747474747474, "grad_norm": 0.07698420435190201, "learning_rate": 0.00019855023018416467, "loss": 0.0908, "step": 2963 }, { "epoch": 0.1916121212121212, "grad_norm": 0.07270284742116928, "learning_rate": 
0.00019854906969976138, "loss": 0.095, "step": 2964 }, { "epoch": 0.19167676767676767, "grad_norm": 0.0765790343284607, "learning_rate": 0.00019854790875447668, "loss": 0.0909, "step": 2965 }, { "epoch": 0.19174141414141413, "grad_norm": 0.08057883381843567, "learning_rate": 0.00019854674734831596, "loss": 0.0867, "step": 2966 }, { "epoch": 0.19180606060606062, "grad_norm": 0.0739097148180008, "learning_rate": 0.0001985455854812847, "loss": 0.0948, "step": 2967 }, { "epoch": 0.19187070707070708, "grad_norm": 0.07631795853376389, "learning_rate": 0.0001985444231533883, "loss": 0.1026, "step": 2968 }, { "epoch": 0.19193535353535354, "grad_norm": 0.07131054252386093, "learning_rate": 0.00019854326036463222, "loss": 0.0856, "step": 2969 }, { "epoch": 0.192, "grad_norm": 0.09894807636737823, "learning_rate": 0.0001985420971150219, "loss": 0.1251, "step": 2970 }, { "epoch": 0.19206464646464647, "grad_norm": 0.07754914462566376, "learning_rate": 0.00019854093340456274, "loss": 0.0812, "step": 2971 }, { "epoch": 0.19212929292929293, "grad_norm": 0.08262521028518677, "learning_rate": 0.00019853976923326022, "loss": 0.1088, "step": 2972 }, { "epoch": 0.1921939393939394, "grad_norm": 0.06875734031200409, "learning_rate": 0.00019853860460111977, "loss": 0.085, "step": 2973 }, { "epoch": 0.19225858585858585, "grad_norm": 0.06864924728870392, "learning_rate": 0.00019853743950814688, "loss": 0.0819, "step": 2974 }, { "epoch": 0.1923232323232323, "grad_norm": 0.07909615337848663, "learning_rate": 0.00019853627395434692, "loss": 0.0901, "step": 2975 }, { "epoch": 0.19238787878787877, "grad_norm": 0.0733780786395073, "learning_rate": 0.00019853510793972542, "loss": 0.0854, "step": 2976 }, { "epoch": 0.19238787878787877, "eval_bleu": 12.267704260117915, "eval_loss": 0.0959477573633194, "eval_runtime": 2.8234, "eval_samples_per_second": 11.334, "eval_steps_per_second": 1.417, "step": 2976 }, { "epoch": 0.19245252525252526, "grad_norm": 0.062204692512750626, "learning_rate": 
0.00019853394146428777, "loss": 0.0668, "step": 2977 }, { "epoch": 0.19251717171717173, "grad_norm": 0.08668186515569687, "learning_rate": 0.00019853277452803943, "loss": 0.1122, "step": 2978 }, { "epoch": 0.1925818181818182, "grad_norm": 0.07729778438806534, "learning_rate": 0.00019853160713098592, "loss": 0.0972, "step": 2979 }, { "epoch": 0.19264646464646465, "grad_norm": 0.07843222469091415, "learning_rate": 0.00019853043927313264, "loss": 0.09, "step": 2980 }, { "epoch": 0.1927111111111111, "grad_norm": 0.07792270183563232, "learning_rate": 0.00019852927095448508, "loss": 0.097, "step": 2981 }, { "epoch": 0.19277575757575757, "grad_norm": 0.07337860018014908, "learning_rate": 0.00019852810217504868, "loss": 0.0861, "step": 2982 }, { "epoch": 0.19284040404040403, "grad_norm": 0.08088894933462143, "learning_rate": 0.00019852693293482892, "loss": 0.0987, "step": 2983 }, { "epoch": 0.1929050505050505, "grad_norm": 0.07351638376712799, "learning_rate": 0.00019852576323383127, "loss": 0.1036, "step": 2984 }, { "epoch": 0.19296969696969696, "grad_norm": 0.07280708849430084, "learning_rate": 0.00019852459307206116, "loss": 0.0872, "step": 2985 }, { "epoch": 0.19303434343434345, "grad_norm": 0.0681147426366806, "learning_rate": 0.00019852342244952418, "loss": 0.0948, "step": 2986 }, { "epoch": 0.1930989898989899, "grad_norm": 0.08333176374435425, "learning_rate": 0.00019852225136622565, "loss": 0.1075, "step": 2987 }, { "epoch": 0.19316363636363637, "grad_norm": 0.08377640694379807, "learning_rate": 0.00019852107982217118, "loss": 0.0972, "step": 2988 }, { "epoch": 0.19322828282828283, "grad_norm": 0.08644973486661911, "learning_rate": 0.00019851990781736615, "loss": 0.1019, "step": 2989 }, { "epoch": 0.1932929292929293, "grad_norm": 0.06872477382421494, "learning_rate": 0.0001985187353518161, "loss": 0.0796, "step": 2990 }, { "epoch": 0.19335757575757576, "grad_norm": 0.06575116515159607, "learning_rate": 0.00019851756242552648, "loss": 0.0908, "step": 2991 }, { 
"epoch": 0.19342222222222222, "grad_norm": 0.0682578757405281, "learning_rate": 0.00019851638903850278, "loss": 0.0821, "step": 2992 }, { "epoch": 0.19342222222222222, "eval_bleu": 15.121167264177965, "eval_loss": 0.09445605427026749, "eval_runtime": 2.7557, "eval_samples_per_second": 11.612, "eval_steps_per_second": 1.452, "step": 2992 }, { "epoch": 0.19348686868686868, "grad_norm": 0.0742427334189415, "learning_rate": 0.00019851521519075052, "loss": 0.0835, "step": 2993 }, { "epoch": 0.19355151515151514, "grad_norm": 0.08182668685913086, "learning_rate": 0.00019851404088227516, "loss": 0.0999, "step": 2994 }, { "epoch": 0.1936161616161616, "grad_norm": 0.08826828002929688, "learning_rate": 0.0001985128661130822, "loss": 0.1019, "step": 2995 }, { "epoch": 0.1936808080808081, "grad_norm": 0.08324848115444183, "learning_rate": 0.00019851169088317715, "loss": 0.0945, "step": 2996 }, { "epoch": 0.19374545454545455, "grad_norm": 0.07801444083452225, "learning_rate": 0.00019851051519256544, "loss": 0.1026, "step": 2997 }, { "epoch": 0.19381010101010102, "grad_norm": 0.09497161954641342, "learning_rate": 0.00019850933904125265, "loss": 0.1218, "step": 2998 }, { "epoch": 0.19387474747474748, "grad_norm": 0.07506946474313736, "learning_rate": 0.00019850816242924425, "loss": 0.095, "step": 2999 }, { "epoch": 0.19393939393939394, "grad_norm": 0.08126571774482727, "learning_rate": 0.00019850698535654575, "loss": 0.0998, "step": 3000 }, { "epoch": 0.1940040404040404, "grad_norm": 0.07637549191713333, "learning_rate": 0.0001985058078231626, "loss": 0.0836, "step": 3001 }, { "epoch": 0.19406868686868686, "grad_norm": 0.10559996962547302, "learning_rate": 0.0001985046298291004, "loss": 0.1286, "step": 3002 }, { "epoch": 0.19413333333333332, "grad_norm": 0.07943985611200333, "learning_rate": 0.0001985034513743646, "loss": 0.0862, "step": 3003 }, { "epoch": 0.1941979797979798, "grad_norm": 0.08651348203420639, "learning_rate": 0.00019850227245896073, "loss": 0.091, "step": 3004 }, 
{ "epoch": 0.19426262626262628, "grad_norm": 0.07314924895763397, "learning_rate": 0.00019850109308289427, "loss": 0.0937, "step": 3005 }, { "epoch": 0.19432727272727274, "grad_norm": 0.07651598751544952, "learning_rate": 0.00019849991324617078, "loss": 0.0804, "step": 3006 }, { "epoch": 0.1943919191919192, "grad_norm": 0.07446742057800293, "learning_rate": 0.00019849873294879578, "loss": 0.0865, "step": 3007 }, { "epoch": 0.19445656565656566, "grad_norm": 0.07507487386465073, "learning_rate": 0.00019849755219077472, "loss": 0.0811, "step": 3008 }, { "epoch": 0.19445656565656566, "eval_bleu": 14.356516575103246, "eval_loss": 0.09638126194477081, "eval_runtime": 2.7859, "eval_samples_per_second": 11.486, "eval_steps_per_second": 1.436, "step": 3008 }, { "epoch": 0.19452121212121212, "grad_norm": 0.07590612769126892, "learning_rate": 0.0001984963709721132, "loss": 0.0894, "step": 3009 }, { "epoch": 0.19458585858585858, "grad_norm": 0.07424801588058472, "learning_rate": 0.00019849518929281672, "loss": 0.0886, "step": 3010 }, { "epoch": 0.19465050505050505, "grad_norm": 0.0704929530620575, "learning_rate": 0.00019849400715289076, "loss": 0.0905, "step": 3011 }, { "epoch": 0.1947151515151515, "grad_norm": 0.07022596895694733, "learning_rate": 0.00019849282455234094, "loss": 0.0864, "step": 3012 }, { "epoch": 0.19477979797979797, "grad_norm": 0.07476391643285751, "learning_rate": 0.00019849164149117273, "loss": 0.086, "step": 3013 }, { "epoch": 0.19484444444444443, "grad_norm": 0.08668769150972366, "learning_rate": 0.00019849045796939166, "loss": 0.1178, "step": 3014 }, { "epoch": 0.19490909090909092, "grad_norm": 0.0779046043753624, "learning_rate": 0.00019848927398700327, "loss": 0.1039, "step": 3015 }, { "epoch": 0.19497373737373738, "grad_norm": 0.07888432592153549, "learning_rate": 0.00019848808954401316, "loss": 0.1003, "step": 3016 }, { "epoch": 0.19503838383838384, "grad_norm": 0.07456845045089722, "learning_rate": 0.00019848690464042675, "loss": 0.088, "step": 
3017 }, { "epoch": 0.1951030303030303, "grad_norm": 0.07968100160360336, "learning_rate": 0.00019848571927624965, "loss": 0.1003, "step": 3018 }, { "epoch": 0.19516767676767677, "grad_norm": 0.08136183768510818, "learning_rate": 0.00019848453345148746, "loss": 0.1011, "step": 3019 }, { "epoch": 0.19523232323232323, "grad_norm": 0.07760115712881088, "learning_rate": 0.0001984833471661456, "loss": 0.0919, "step": 3020 }, { "epoch": 0.1952969696969697, "grad_norm": 0.0735422819852829, "learning_rate": 0.00019848216042022971, "loss": 0.0908, "step": 3021 }, { "epoch": 0.19536161616161615, "grad_norm": 0.07440479099750519, "learning_rate": 0.00019848097321374533, "loss": 0.0911, "step": 3022 }, { "epoch": 0.19542626262626261, "grad_norm": 0.07027467340230942, "learning_rate": 0.00019847978554669797, "loss": 0.089, "step": 3023 }, { "epoch": 0.1954909090909091, "grad_norm": 0.07545223832130432, "learning_rate": 0.0001984785974190932, "loss": 0.0897, "step": 3024 }, { "epoch": 0.1954909090909091, "eval_bleu": 12.349293801942022, "eval_loss": 0.09381656348705292, "eval_runtime": 2.7765, "eval_samples_per_second": 11.525, "eval_steps_per_second": 1.441, "step": 3024 }, { "epoch": 0.19555555555555557, "grad_norm": 0.07429604232311249, "learning_rate": 0.00019847740883093662, "loss": 0.0726, "step": 3025 }, { "epoch": 0.19562020202020203, "grad_norm": 0.06766149401664734, "learning_rate": 0.00019847621978223373, "loss": 0.0879, "step": 3026 }, { "epoch": 0.1956848484848485, "grad_norm": 0.07333466410636902, "learning_rate": 0.00019847503027299013, "loss": 0.0891, "step": 3027 }, { "epoch": 0.19574949494949495, "grad_norm": 0.0650382786989212, "learning_rate": 0.00019847384030321135, "loss": 0.0757, "step": 3028 }, { "epoch": 0.1958141414141414, "grad_norm": 0.13894008100032806, "learning_rate": 0.000198472649872903, "loss": 0.1036, "step": 3029 }, { "epoch": 0.19587878787878787, "grad_norm": 0.06723253428936005, "learning_rate": 0.0001984714589820706, "loss": 0.0853, "step": 
3030 }, { "epoch": 0.19594343434343434, "grad_norm": 0.09912490099668503, "learning_rate": 0.00019847026763071974, "loss": 0.0996, "step": 3031 }, { "epoch": 0.1960080808080808, "grad_norm": 0.06248555704951286, "learning_rate": 0.00019846907581885602, "loss": 0.0675, "step": 3032 }, { "epoch": 0.19607272727272726, "grad_norm": 0.08128565549850464, "learning_rate": 0.00019846788354648497, "loss": 0.095, "step": 3033 }, { "epoch": 0.19613737373737375, "grad_norm": 0.07953748852014542, "learning_rate": 0.00019846669081361219, "loss": 0.089, "step": 3034 }, { "epoch": 0.1962020202020202, "grad_norm": 0.07747144997119904, "learning_rate": 0.00019846549762024323, "loss": 0.0957, "step": 3035 }, { "epoch": 0.19626666666666667, "grad_norm": 0.07384207844734192, "learning_rate": 0.0001984643039663837, "loss": 0.0933, "step": 3036 }, { "epoch": 0.19633131313131313, "grad_norm": 0.08308795094490051, "learning_rate": 0.00019846310985203915, "loss": 0.1029, "step": 3037 }, { "epoch": 0.1963959595959596, "grad_norm": 0.0817706510424614, "learning_rate": 0.0001984619152772152, "loss": 0.0787, "step": 3038 }, { "epoch": 0.19646060606060606, "grad_norm": 0.0723562240600586, "learning_rate": 0.00019846072024191745, "loss": 0.0877, "step": 3039 }, { "epoch": 0.19652525252525252, "grad_norm": 0.08890534937381744, "learning_rate": 0.0001984595247461514, "loss": 0.1082, "step": 3040 }, { "epoch": 0.19652525252525252, "eval_bleu": 14.590501184362248, "eval_loss": 0.09402894973754883, "eval_runtime": 2.7031, "eval_samples_per_second": 11.838, "eval_steps_per_second": 1.48, "step": 3040 }, { "epoch": 0.19658989898989898, "grad_norm": 0.08016083389520645, "learning_rate": 0.00019845832878992277, "loss": 0.0986, "step": 3041 }, { "epoch": 0.19665454545454544, "grad_norm": 0.07454028725624084, "learning_rate": 0.00019845713237323707, "loss": 0.0917, "step": 3042 }, { "epoch": 0.19671919191919193, "grad_norm": 0.06026134639978409, "learning_rate": 0.00019845593549609988, "loss": 0.0707, 
"step": 3043 }, { "epoch": 0.1967838383838384, "grad_norm": 0.06678041815757751, "learning_rate": 0.00019845473815851685, "loss": 0.0847, "step": 3044 }, { "epoch": 0.19684848484848486, "grad_norm": 0.0710722804069519, "learning_rate": 0.00019845354036049354, "loss": 0.0905, "step": 3045 }, { "epoch": 0.19691313131313132, "grad_norm": 0.0787622481584549, "learning_rate": 0.00019845234210203561, "loss": 0.0996, "step": 3046 }, { "epoch": 0.19697777777777778, "grad_norm": 0.07835431396961212, "learning_rate": 0.00019845114338314858, "loss": 0.1082, "step": 3047 }, { "epoch": 0.19704242424242424, "grad_norm": 0.07426532357931137, "learning_rate": 0.00019844994420383815, "loss": 0.0908, "step": 3048 }, { "epoch": 0.1971070707070707, "grad_norm": 0.06754591315984726, "learning_rate": 0.00019844874456410985, "loss": 0.0977, "step": 3049 }, { "epoch": 0.19717171717171716, "grad_norm": 0.07153677195310593, "learning_rate": 0.0001984475444639693, "loss": 0.0852, "step": 3050 }, { "epoch": 0.19723636363636363, "grad_norm": 0.0679018422961235, "learning_rate": 0.00019844634390342214, "loss": 0.0785, "step": 3051 }, { "epoch": 0.1973010101010101, "grad_norm": 0.07492662966251373, "learning_rate": 0.00019844514288247397, "loss": 0.0839, "step": 3052 }, { "epoch": 0.19736565656565658, "grad_norm": 0.1103171706199646, "learning_rate": 0.00019844394140113044, "loss": 0.1144, "step": 3053 }, { "epoch": 0.19743030303030304, "grad_norm": 0.0835791751742363, "learning_rate": 0.0001984427394593971, "loss": 0.113, "step": 3054 }, { "epoch": 0.1974949494949495, "grad_norm": 0.08254282921552658, "learning_rate": 0.00019844153705727966, "loss": 0.0984, "step": 3055 }, { "epoch": 0.19755959595959596, "grad_norm": 0.07681182771921158, "learning_rate": 0.00019844033419478367, "loss": 0.1031, "step": 3056 }, { "epoch": 0.19755959595959596, "eval_bleu": 16.18089585965196, "eval_loss": 0.09275349229574203, "eval_runtime": 2.802, "eval_samples_per_second": 11.42, "eval_steps_per_second": 1.428, 
"step": 3056 }, { "epoch": 0.19762424242424242, "grad_norm": 0.08036350458860397, "learning_rate": 0.0001984391308719148, "loss": 0.0937, "step": 3057 }, { "epoch": 0.1976888888888889, "grad_norm": 0.0633590966463089, "learning_rate": 0.00019843792708867862, "loss": 0.0749, "step": 3058 }, { "epoch": 0.19775353535353535, "grad_norm": 0.06803199648857117, "learning_rate": 0.00019843672284508082, "loss": 0.0846, "step": 3059 }, { "epoch": 0.1978181818181818, "grad_norm": 0.07061391323804855, "learning_rate": 0.00019843551814112702, "loss": 0.0918, "step": 3060 }, { "epoch": 0.19788282828282827, "grad_norm": 0.07257424294948578, "learning_rate": 0.00019843431297682283, "loss": 0.0906, "step": 3061 }, { "epoch": 0.19794747474747476, "grad_norm": 0.07157407701015472, "learning_rate": 0.00019843310735217392, "loss": 0.0751, "step": 3062 }, { "epoch": 0.19801212121212122, "grad_norm": 0.06550973653793335, "learning_rate": 0.00019843190126718588, "loss": 0.0792, "step": 3063 }, { "epoch": 0.19807676767676768, "grad_norm": 0.07210759818553925, "learning_rate": 0.00019843069472186438, "loss": 0.0919, "step": 3064 }, { "epoch": 0.19814141414141415, "grad_norm": 0.06413321942090988, "learning_rate": 0.0001984294877162151, "loss": 0.0729, "step": 3065 }, { "epoch": 0.1982060606060606, "grad_norm": 0.06826533377170563, "learning_rate": 0.00019842828025024362, "loss": 0.0857, "step": 3066 }, { "epoch": 0.19827070707070707, "grad_norm": 0.08026416599750519, "learning_rate": 0.0001984270723239556, "loss": 0.0908, "step": 3067 }, { "epoch": 0.19833535353535353, "grad_norm": 0.07231787592172623, "learning_rate": 0.0001984258639373567, "loss": 0.086, "step": 3068 }, { "epoch": 0.1984, "grad_norm": 0.10786189883947372, "learning_rate": 0.00019842465509045258, "loss": 0.1132, "step": 3069 }, { "epoch": 0.19846464646464645, "grad_norm": 0.08069875091314316, "learning_rate": 0.0001984234457832489, "loss": 0.0987, "step": 3070 }, { "epoch": 0.19852929292929292, "grad_norm": 
0.06611949950456619, "learning_rate": 0.00019842223601575126, "loss": 0.0776, "step": 3071 }, { "epoch": 0.1985939393939394, "grad_norm": 0.0697125494480133, "learning_rate": 0.0001984210257879654, "loss": 0.0896, "step": 3072 }, { "epoch": 0.1985939393939394, "eval_bleu": 15.348511609101799, "eval_loss": 0.09288185089826584, "eval_runtime": 2.6637, "eval_samples_per_second": 12.014, "eval_steps_per_second": 1.502, "step": 3072 }, { "epoch": 0.19865858585858587, "grad_norm": 0.07403448224067688, "learning_rate": 0.00019841981509989695, "loss": 0.093, "step": 3073 }, { "epoch": 0.19872323232323233, "grad_norm": 0.0917670950293541, "learning_rate": 0.00019841860395155157, "loss": 0.1015, "step": 3074 }, { "epoch": 0.1987878787878788, "grad_norm": 0.08433215320110321, "learning_rate": 0.00019841739234293488, "loss": 0.0926, "step": 3075 }, { "epoch": 0.19885252525252525, "grad_norm": 0.07214537262916565, "learning_rate": 0.0001984161802740526, "loss": 0.0947, "step": 3076 }, { "epoch": 0.19891717171717171, "grad_norm": 0.07228455692529678, "learning_rate": 0.00019841496774491036, "loss": 0.0894, "step": 3077 }, { "epoch": 0.19898181818181818, "grad_norm": 0.08608461916446686, "learning_rate": 0.00019841375475551388, "loss": 0.1185, "step": 3078 }, { "epoch": 0.19904646464646464, "grad_norm": 0.07759758085012436, "learning_rate": 0.0001984125413058688, "loss": 0.0949, "step": 3079 }, { "epoch": 0.1991111111111111, "grad_norm": 0.08019310981035233, "learning_rate": 0.00019841132739598075, "loss": 0.0975, "step": 3080 }, { "epoch": 0.1991757575757576, "grad_norm": 0.07815665751695633, "learning_rate": 0.0001984101130258555, "loss": 0.0919, "step": 3081 }, { "epoch": 0.19924040404040405, "grad_norm": 0.1941106617450714, "learning_rate": 0.0001984088981954987, "loss": 0.0944, "step": 3082 }, { "epoch": 0.1993050505050505, "grad_norm": 0.14127443730831146, "learning_rate": 0.000198407682904916, "loss": 0.0818, "step": 3083 }, { "epoch": 0.19936969696969697, "grad_norm": 
0.09154678136110306, "learning_rate": 0.0001984064671541131, "loss": 0.1087, "step": 3084 }, { "epoch": 0.19943434343434344, "grad_norm": 0.1412719339132309, "learning_rate": 0.00019840525094309567, "loss": 0.0978, "step": 3085 }, { "epoch": 0.1994989898989899, "grad_norm": 0.07700436562299728, "learning_rate": 0.00019840403427186942, "loss": 0.095, "step": 3086 }, { "epoch": 0.19956363636363636, "grad_norm": 0.06923733651638031, "learning_rate": 0.00019840281714044003, "loss": 0.0782, "step": 3087 }, { "epoch": 0.19962828282828282, "grad_norm": 0.0664527639746666, "learning_rate": 0.0001984015995488132, "loss": 0.0841, "step": 3088 }, { "epoch": 0.19962828282828282, "eval_bleu": 15.489048741095534, "eval_loss": 0.09590156376361847, "eval_runtime": 2.8428, "eval_samples_per_second": 11.257, "eval_steps_per_second": 1.407, "step": 3088 }, { "epoch": 0.19969292929292928, "grad_norm": 0.07878103852272034, "learning_rate": 0.00019840038149699464, "loss": 0.089, "step": 3089 }, { "epoch": 0.19975757575757574, "grad_norm": 0.09234321117401123, "learning_rate": 0.00019839916298499, "loss": 0.1009, "step": 3090 }, { "epoch": 0.19982222222222223, "grad_norm": 0.07252255827188492, "learning_rate": 0.00019839794401280502, "loss": 0.0809, "step": 3091 }, { "epoch": 0.1998868686868687, "grad_norm": 0.08006826788187027, "learning_rate": 0.00019839672458044538, "loss": 0.0682, "step": 3092 }, { "epoch": 0.19995151515151516, "grad_norm": 0.08270895481109619, "learning_rate": 0.00019839550468791678, "loss": 0.0864, "step": 3093 }, { "epoch": 0.20001616161616162, "grad_norm": 0.06841226667165756, "learning_rate": 0.00019839428433522495, "loss": 0.0775, "step": 3094 }, { "epoch": 0.20008080808080808, "grad_norm": 0.08888465166091919, "learning_rate": 0.0001983930635223756, "loss": 0.1031, "step": 3095 }, { "epoch": 0.20014545454545454, "grad_norm": 0.06844443082809448, "learning_rate": 0.00019839184224937438, "loss": 0.0904, "step": 3096 }, { "epoch": 0.200210101010101, "grad_norm": 
0.07276881486177444, "learning_rate": 0.00019839062051622705, "loss": 0.0858, "step": 3097 }, { "epoch": 0.20027474747474747, "grad_norm": 0.0789208635687828, "learning_rate": 0.00019838939832293932, "loss": 0.0944, "step": 3098 }, { "epoch": 0.20033939393939393, "grad_norm": 0.07845215499401093, "learning_rate": 0.0001983881756695169, "loss": 0.0953, "step": 3099 }, { "epoch": 0.20040404040404042, "grad_norm": 0.08761543780565262, "learning_rate": 0.00019838695255596552, "loss": 0.1, "step": 3100 }, { "epoch": 0.20046868686868688, "grad_norm": 0.09423337876796722, "learning_rate": 0.00019838572898229085, "loss": 0.1231, "step": 3101 }, { "epoch": 0.20053333333333334, "grad_norm": 0.08081380277872086, "learning_rate": 0.0001983845049484987, "loss": 0.1081, "step": 3102 }, { "epoch": 0.2005979797979798, "grad_norm": 0.21056683361530304, "learning_rate": 0.00019838328045459474, "loss": 0.0955, "step": 3103 }, { "epoch": 0.20066262626262626, "grad_norm": 0.07333079725503922, "learning_rate": 0.0001983820555005847, "loss": 0.0888, "step": 3104 }, { "epoch": 0.20066262626262626, "eval_bleu": 15.040086182106487, "eval_loss": 0.0959915816783905, "eval_runtime": 2.7261, "eval_samples_per_second": 11.738, "eval_steps_per_second": 1.467, "step": 3104 }, { "epoch": 0.20072727272727273, "grad_norm": 0.08100400865077972, "learning_rate": 0.00019838083008647428, "loss": 0.1079, "step": 3105 }, { "epoch": 0.2007919191919192, "grad_norm": 0.0793466567993164, "learning_rate": 0.00019837960421226924, "loss": 0.1054, "step": 3106 }, { "epoch": 0.20085656565656565, "grad_norm": 0.06841523200273514, "learning_rate": 0.00019837837787797536, "loss": 0.0965, "step": 3107 }, { "epoch": 0.2009212121212121, "grad_norm": 0.08950537443161011, "learning_rate": 0.00019837715108359828, "loss": 0.1034, "step": 3108 }, { "epoch": 0.20098585858585857, "grad_norm": 0.0957818478345871, "learning_rate": 0.00019837592382914384, "loss": 0.089, "step": 3109 }, { "epoch": 0.20105050505050506, "grad_norm": 
0.06846385449171066, "learning_rate": 0.00019837469611461769, "loss": 0.0816, "step": 3110 }, { "epoch": 0.20111515151515152, "grad_norm": 0.07595576345920563, "learning_rate": 0.00019837346794002563, "loss": 0.1032, "step": 3111 }, { "epoch": 0.201179797979798, "grad_norm": 0.08023486286401749, "learning_rate": 0.00019837223930537333, "loss": 0.0953, "step": 3112 }, { "epoch": 0.20124444444444445, "grad_norm": 0.07304797321557999, "learning_rate": 0.00019837101021066665, "loss": 0.0972, "step": 3113 }, { "epoch": 0.2013090909090909, "grad_norm": 0.061266034841537476, "learning_rate": 0.00019836978065591125, "loss": 0.0753, "step": 3114 }, { "epoch": 0.20137373737373737, "grad_norm": 0.07018446177244186, "learning_rate": 0.0001983685506411129, "loss": 0.0978, "step": 3115 }, { "epoch": 0.20143838383838383, "grad_norm": 0.13731630146503448, "learning_rate": 0.00019836732016627738, "loss": 0.0797, "step": 3116 }, { "epoch": 0.2015030303030303, "grad_norm": 0.07203269004821777, "learning_rate": 0.0001983660892314104, "loss": 0.0939, "step": 3117 }, { "epoch": 0.20156767676767676, "grad_norm": 0.0736599862575531, "learning_rate": 0.00019836485783651778, "loss": 0.0871, "step": 3118 }, { "epoch": 0.20163232323232325, "grad_norm": 0.0717538520693779, "learning_rate": 0.0001983636259816052, "loss": 0.0725, "step": 3119 }, { "epoch": 0.2016969696969697, "grad_norm": 0.07141770422458649, "learning_rate": 0.00019836239366667846, "loss": 0.0915, "step": 3120 }, { "epoch": 0.2016969696969697, "eval_bleu": 14.59868330010741, "eval_loss": 0.09569938480854034, "eval_runtime": 2.7744, "eval_samples_per_second": 11.534, "eval_steps_per_second": 1.442, "step": 3120 }, { "epoch": 0.20176161616161617, "grad_norm": 0.0767352357506752, "learning_rate": 0.00019836116089174332, "loss": 0.1003, "step": 3121 }, { "epoch": 0.20182626262626263, "grad_norm": 0.13903386890888214, "learning_rate": 0.00019835992765680557, "loss": 0.1079, "step": 3122 }, { "epoch": 0.2018909090909091, "grad_norm": 
0.07657284289598465, "learning_rate": 0.00019835869396187095, "loss": 0.0927, "step": 3123 }, { "epoch": 0.20195555555555555, "grad_norm": 0.07481386512517929, "learning_rate": 0.00019835745980694523, "loss": 0.0917, "step": 3124 }, { "epoch": 0.20202020202020202, "grad_norm": 0.08460117131471634, "learning_rate": 0.00019835622519203422, "loss": 0.1096, "step": 3125 }, { "epoch": 0.20208484848484848, "grad_norm": 0.07209576666355133, "learning_rate": 0.0001983549901171436, "loss": 0.0833, "step": 3126 }, { "epoch": 0.20214949494949494, "grad_norm": 0.06695141643285751, "learning_rate": 0.00019835375458227925, "loss": 0.0902, "step": 3127 }, { "epoch": 0.2022141414141414, "grad_norm": 0.10109949856996536, "learning_rate": 0.00019835251858744688, "loss": 0.0752, "step": 3128 }, { "epoch": 0.2022787878787879, "grad_norm": 0.06644662469625473, "learning_rate": 0.0001983512821326523, "loss": 0.0814, "step": 3129 }, { "epoch": 0.20234343434343435, "grad_norm": 0.07694841921329498, "learning_rate": 0.0001983500452179013, "loss": 0.0964, "step": 3130 }, { "epoch": 0.20240808080808081, "grad_norm": 0.17065557837486267, "learning_rate": 0.00019834880784319964, "loss": 0.1117, "step": 3131 }, { "epoch": 0.20247272727272728, "grad_norm": 0.09423643350601196, "learning_rate": 0.00019834757000855314, "loss": 0.0773, "step": 3132 }, { "epoch": 0.20253737373737374, "grad_norm": 0.09630367904901505, "learning_rate": 0.00019834633171396756, "loss": 0.1148, "step": 3133 }, { "epoch": 0.2026020202020202, "grad_norm": 0.06729527562856674, "learning_rate": 0.00019834509295944868, "loss": 0.0807, "step": 3134 }, { "epoch": 0.20266666666666666, "grad_norm": 0.08414700627326965, "learning_rate": 0.00019834385374500234, "loss": 0.1041, "step": 3135 }, { "epoch": 0.20273131313131312, "grad_norm": 0.07156471163034439, "learning_rate": 0.00019834261407063427, "loss": 0.0927, "step": 3136 }, { "epoch": 0.20273131313131312, "eval_bleu": 11.91406607261676, "eval_loss": 0.09549685567617416, 
"eval_runtime": 2.7445, "eval_samples_per_second": 11.66, "eval_steps_per_second": 1.457, "step": 3136 }, { "epoch": 0.20279595959595959, "grad_norm": 0.06717123091220856, "learning_rate": 0.00019834137393635035, "loss": 0.0889, "step": 3137 }, { "epoch": 0.20286060606060605, "grad_norm": 0.08477167040109634, "learning_rate": 0.0001983401333421563, "loss": 0.1125, "step": 3138 }, { "epoch": 0.20292525252525254, "grad_norm": 0.07160009443759918, "learning_rate": 0.00019833889228805797, "loss": 0.0964, "step": 3139 }, { "epoch": 0.202989898989899, "grad_norm": 0.07411563396453857, "learning_rate": 0.00019833765077406114, "loss": 0.0865, "step": 3140 }, { "epoch": 0.20305454545454546, "grad_norm": 0.0777866467833519, "learning_rate": 0.00019833640880017166, "loss": 0.1114, "step": 3141 }, { "epoch": 0.20311919191919192, "grad_norm": 0.09419663995504379, "learning_rate": 0.00019833516636639527, "loss": 0.0852, "step": 3142 }, { "epoch": 0.20318383838383838, "grad_norm": 0.08539903163909912, "learning_rate": 0.00019833392347273784, "loss": 0.1093, "step": 3143 }, { "epoch": 0.20324848484848484, "grad_norm": 0.07317009568214417, "learning_rate": 0.00019833268011920513, "loss": 0.0869, "step": 3144 }, { "epoch": 0.2033131313131313, "grad_norm": 0.08566869050264359, "learning_rate": 0.000198331436305803, "loss": 0.0943, "step": 3145 }, { "epoch": 0.20337777777777777, "grad_norm": 0.0702219232916832, "learning_rate": 0.00019833019203253726, "loss": 0.0955, "step": 3146 }, { "epoch": 0.20344242424242423, "grad_norm": 0.07991989701986313, "learning_rate": 0.00019832894729941372, "loss": 0.0893, "step": 3147 }, { "epoch": 0.20350707070707072, "grad_norm": 0.07923533767461777, "learning_rate": 0.0001983277021064382, "loss": 0.1063, "step": 3148 }, { "epoch": 0.20357171717171718, "grad_norm": 0.0759860947728157, "learning_rate": 0.00019832645645361652, "loss": 0.0895, "step": 3149 }, { "epoch": 0.20363636363636364, "grad_norm": 0.080452561378479, "learning_rate": 
0.0001983252103409545, "loss": 0.1026, "step": 3150 }, { "epoch": 0.2037010101010101, "grad_norm": 0.06442708522081375, "learning_rate": 0.00019832396376845798, "loss": 0.0714, "step": 3151 }, { "epoch": 0.20376565656565657, "grad_norm": 0.0781233012676239, "learning_rate": 0.00019832271673613278, "loss": 0.1001, "step": 3152 }, { "epoch": 0.20376565656565657, "eval_bleu": 13.69806246538477, "eval_loss": 0.09530284255743027, "eval_runtime": 2.7923, "eval_samples_per_second": 11.46, "eval_steps_per_second": 1.433, "step": 3152 }, { "epoch": 0.20383030303030303, "grad_norm": 0.07101649045944214, "learning_rate": 0.00019832146924398476, "loss": 0.0925, "step": 3153 }, { "epoch": 0.2038949494949495, "grad_norm": 0.07431681454181671, "learning_rate": 0.00019832022129201971, "loss": 0.1021, "step": 3154 }, { "epoch": 0.20395959595959595, "grad_norm": 0.07920096069574356, "learning_rate": 0.00019831897288024347, "loss": 0.0933, "step": 3155 }, { "epoch": 0.2040242424242424, "grad_norm": 0.0672849789261818, "learning_rate": 0.00019831772400866196, "loss": 0.0792, "step": 3156 }, { "epoch": 0.20408888888888888, "grad_norm": 0.09281349927186966, "learning_rate": 0.00019831647467728088, "loss": 0.1255, "step": 3157 }, { "epoch": 0.20415353535353536, "grad_norm": 0.116844043135643, "learning_rate": 0.0001983152248861062, "loss": 0.1077, "step": 3158 }, { "epoch": 0.20421818181818183, "grad_norm": 0.07704758644104004, "learning_rate": 0.00019831397463514373, "loss": 0.105, "step": 3159 }, { "epoch": 0.2042828282828283, "grad_norm": 0.07215922325849533, "learning_rate": 0.0001983127239243993, "loss": 0.0956, "step": 3160 }, { "epoch": 0.20434747474747475, "grad_norm": 0.08894345164299011, "learning_rate": 0.00019831147275387875, "loss": 0.1024, "step": 3161 }, { "epoch": 0.2044121212121212, "grad_norm": 0.09421950578689575, "learning_rate": 0.00019831022112358794, "loss": 0.0984, "step": 3162 }, { "epoch": 0.20447676767676767, "grad_norm": 0.07571349292993546, "learning_rate": 
0.0001983089690335327, "loss": 0.1047, "step": 3163 }, { "epoch": 0.20454141414141414, "grad_norm": 0.08159632980823517, "learning_rate": 0.00019830771648371895, "loss": 0.11, "step": 3164 }, { "epoch": 0.2046060606060606, "grad_norm": 0.08643662929534912, "learning_rate": 0.00019830646347415248, "loss": 0.1243, "step": 3165 }, { "epoch": 0.20467070707070706, "grad_norm": 0.0670056864619255, "learning_rate": 0.00019830521000483922, "loss": 0.0953, "step": 3166 }, { "epoch": 0.20473535353535355, "grad_norm": 0.06531786918640137, "learning_rate": 0.00019830395607578496, "loss": 0.0878, "step": 3167 }, { "epoch": 0.2048, "grad_norm": 0.06567156314849854, "learning_rate": 0.00019830270168699562, "loss": 0.09, "step": 3168 }, { "epoch": 0.2048, "eval_bleu": 13.082742353089657, "eval_loss": 0.09588369727134705, "eval_runtime": 2.7131, "eval_samples_per_second": 11.795, "eval_steps_per_second": 1.474, "step": 3168 }, { "epoch": 0.20486464646464647, "grad_norm": 0.06386405229568481, "learning_rate": 0.000198301446838477, "loss": 0.0798, "step": 3169 }, { "epoch": 0.20492929292929293, "grad_norm": 0.0673353523015976, "learning_rate": 0.00019830019153023505, "loss": 0.0885, "step": 3170 }, { "epoch": 0.2049939393939394, "grad_norm": 0.07388516515493393, "learning_rate": 0.00019829893576227557, "loss": 0.1063, "step": 3171 }, { "epoch": 0.20505858585858586, "grad_norm": 0.06915147602558136, "learning_rate": 0.0001982976795346045, "loss": 0.0864, "step": 3172 }, { "epoch": 0.20512323232323232, "grad_norm": 0.07721282541751862, "learning_rate": 0.00019829642284722768, "loss": 0.0945, "step": 3173 }, { "epoch": 0.20518787878787878, "grad_norm": 0.06398101150989532, "learning_rate": 0.00019829516570015095, "loss": 0.082, "step": 3174 }, { "epoch": 0.20525252525252524, "grad_norm": 0.11499542742967606, "learning_rate": 0.00019829390809338024, "loss": 0.0941, "step": 3175 }, { "epoch": 0.2053171717171717, "grad_norm": 0.0740969181060791, "learning_rate": 0.0001982926500269214, 
"loss": 0.0942, "step": 3176 }, { "epoch": 0.2053818181818182, "grad_norm": 0.07350771874189377, "learning_rate": 0.00019829139150078038, "loss": 0.091, "step": 3177 }, { "epoch": 0.20544646464646465, "grad_norm": 0.07321209460496902, "learning_rate": 0.000198290132514963, "loss": 0.094, "step": 3178 }, { "epoch": 0.20551111111111112, "grad_norm": 0.08422143757343292, "learning_rate": 0.00019828887306947514, "loss": 0.0917, "step": 3179 }, { "epoch": 0.20557575757575758, "grad_norm": 0.07574667781591415, "learning_rate": 0.0001982876131643227, "loss": 0.0934, "step": 3180 }, { "epoch": 0.20564040404040404, "grad_norm": 0.07455884665250778, "learning_rate": 0.00019828635279951166, "loss": 0.0948, "step": 3181 }, { "epoch": 0.2057050505050505, "grad_norm": 0.08029003441333771, "learning_rate": 0.00019828509197504775, "loss": 0.0862, "step": 3182 }, { "epoch": 0.20576969696969696, "grad_norm": 0.08099525421857834, "learning_rate": 0.00019828383069093704, "loss": 0.0979, "step": 3183 }, { "epoch": 0.20583434343434343, "grad_norm": 0.0789312869310379, "learning_rate": 0.0001982825689471853, "loss": 0.0956, "step": 3184 }, { "epoch": 0.20583434343434343, "eval_bleu": 11.35523981699744, "eval_loss": 0.0962069034576416, "eval_runtime": 2.8599, "eval_samples_per_second": 11.189, "eval_steps_per_second": 1.399, "step": 3184 }, { "epoch": 0.2058989898989899, "grad_norm": 0.0795975774526596, "learning_rate": 0.0001982813067437985, "loss": 0.0877, "step": 3185 }, { "epoch": 0.20596363636363638, "grad_norm": 0.0632048100233078, "learning_rate": 0.0001982800440807825, "loss": 0.0804, "step": 3186 }, { "epoch": 0.20602828282828284, "grad_norm": 0.07772943377494812, "learning_rate": 0.00019827878095814323, "loss": 0.0965, "step": 3187 }, { "epoch": 0.2060929292929293, "grad_norm": 0.0854567214846611, "learning_rate": 0.00019827751737588658, "loss": 0.0952, "step": 3188 }, { "epoch": 0.20615757575757576, "grad_norm": 0.07375089079141617, "learning_rate": 0.00019827625333401847, 
"loss": 0.0963, "step": 3189 }, { "epoch": 0.20622222222222222, "grad_norm": 0.07229287922382355, "learning_rate": 0.00019827498883254484, "loss": 0.0847, "step": 3190 }, { "epoch": 0.20628686868686869, "grad_norm": 0.060634031891822815, "learning_rate": 0.00019827372387147156, "loss": 0.0675, "step": 3191 }, { "epoch": 0.20635151515151515, "grad_norm": 0.07454415410757065, "learning_rate": 0.00019827245845080457, "loss": 0.0959, "step": 3192 }, { "epoch": 0.2064161616161616, "grad_norm": 0.07481205463409424, "learning_rate": 0.00019827119257054978, "loss": 0.0967, "step": 3193 }, { "epoch": 0.20648080808080807, "grad_norm": 0.07120965421199799, "learning_rate": 0.0001982699262307131, "loss": 0.084, "step": 3194 }, { "epoch": 0.20654545454545453, "grad_norm": 0.07946518808603287, "learning_rate": 0.00019826865943130045, "loss": 0.1094, "step": 3195 }, { "epoch": 0.20661010101010102, "grad_norm": 0.08382433652877808, "learning_rate": 0.0001982673921723178, "loss": 0.1121, "step": 3196 }, { "epoch": 0.20667474747474748, "grad_norm": 0.07565141469240189, "learning_rate": 0.00019826612445377102, "loss": 0.092, "step": 3197 }, { "epoch": 0.20673939393939395, "grad_norm": 0.07011779397726059, "learning_rate": 0.00019826485627566607, "loss": 0.1006, "step": 3198 }, { "epoch": 0.2068040404040404, "grad_norm": 0.10079753398895264, "learning_rate": 0.00019826358763800887, "loss": 0.0924, "step": 3199 }, { "epoch": 0.20686868686868687, "grad_norm": 0.07440459728240967, "learning_rate": 0.00019826231854080535, "loss": 0.0913, "step": 3200 }, { "epoch": 0.20686868686868687, "eval_bleu": 13.84797986886752, "eval_loss": 0.09659598767757416, "eval_runtime": 2.7037, "eval_samples_per_second": 11.836, "eval_steps_per_second": 1.479, "step": 3200 }, { "epoch": 0.20693333333333333, "grad_norm": 0.08571907877922058, "learning_rate": 0.00019826104898406147, "loss": 0.0994, "step": 3201 }, { "epoch": 0.2069979797979798, "grad_norm": 0.1361107975244522, "learning_rate": 
0.00019825977896778313, "loss": 0.134, "step": 3202 }, { "epoch": 0.20706262626262625, "grad_norm": 0.06828378885984421, "learning_rate": 0.0001982585084919763, "loss": 0.0887, "step": 3203 }, { "epoch": 0.20712727272727272, "grad_norm": 0.08962441980838776, "learning_rate": 0.0001982572375566469, "loss": 0.0859, "step": 3204 }, { "epoch": 0.2071919191919192, "grad_norm": 0.07443254441022873, "learning_rate": 0.0001982559661618009, "loss": 0.0868, "step": 3205 }, { "epoch": 0.20725656565656567, "grad_norm": 0.06895461678504944, "learning_rate": 0.0001982546943074442, "loss": 0.0937, "step": 3206 }, { "epoch": 0.20732121212121213, "grad_norm": 0.07234080880880356, "learning_rate": 0.00019825342199358278, "loss": 0.0876, "step": 3207 }, { "epoch": 0.2073858585858586, "grad_norm": 0.07005325704813004, "learning_rate": 0.0001982521492202226, "loss": 0.0936, "step": 3208 }, { "epoch": 0.20745050505050505, "grad_norm": 0.06727655231952667, "learning_rate": 0.00019825087598736963, "loss": 0.0744, "step": 3209 }, { "epoch": 0.2075151515151515, "grad_norm": 0.06495391577482224, "learning_rate": 0.00019824960229502975, "loss": 0.0764, "step": 3210 }, { "epoch": 0.20757979797979798, "grad_norm": 0.06698035448789597, "learning_rate": 0.00019824832814320897, "loss": 0.0867, "step": 3211 }, { "epoch": 0.20764444444444444, "grad_norm": 0.07150883227586746, "learning_rate": 0.00019824705353191325, "loss": 0.0876, "step": 3212 }, { "epoch": 0.2077090909090909, "grad_norm": 0.0784783810377121, "learning_rate": 0.00019824577846114854, "loss": 0.0953, "step": 3213 }, { "epoch": 0.20777373737373736, "grad_norm": 0.08331015706062317, "learning_rate": 0.0001982445029309208, "loss": 0.1073, "step": 3214 }, { "epoch": 0.20783838383838385, "grad_norm": 0.07842043042182922, "learning_rate": 0.00019824322694123598, "loss": 0.0961, "step": 3215 }, { "epoch": 0.2079030303030303, "grad_norm": 0.0864470899105072, "learning_rate": 0.00019824195049210008, "loss": 0.1135, "step": 3216 }, { "epoch": 
0.2079030303030303, "eval_bleu": 13.403972714553266, "eval_loss": 0.09692306071519852, "eval_runtime": 2.7803, "eval_samples_per_second": 11.509, "eval_steps_per_second": 1.439, "step": 3216 }, { "epoch": 0.20796767676767677, "grad_norm": 0.07396278530359268, "learning_rate": 0.00019824067358351907, "loss": 0.0964, "step": 3217 }, { "epoch": 0.20803232323232324, "grad_norm": 0.10630341619253159, "learning_rate": 0.0001982393962154989, "loss": 0.1053, "step": 3218 }, { "epoch": 0.2080969696969697, "grad_norm": 0.06560712307691574, "learning_rate": 0.00019823811838804556, "loss": 0.0776, "step": 3219 }, { "epoch": 0.20816161616161616, "grad_norm": 0.06602195650339127, "learning_rate": 0.000198236840101165, "loss": 0.0913, "step": 3220 }, { "epoch": 0.20822626262626262, "grad_norm": 0.07872724533081055, "learning_rate": 0.0001982355613548632, "loss": 0.1024, "step": 3221 }, { "epoch": 0.20829090909090908, "grad_norm": 0.07390357553958893, "learning_rate": 0.00019823428214914618, "loss": 0.1014, "step": 3222 }, { "epoch": 0.20835555555555554, "grad_norm": 0.07097990810871124, "learning_rate": 0.0001982330024840199, "loss": 0.0851, "step": 3223 }, { "epoch": 0.20842020202020203, "grad_norm": 0.07431790232658386, "learning_rate": 0.00019823172235949033, "loss": 0.091, "step": 3224 }, { "epoch": 0.2084848484848485, "grad_norm": 0.07419148087501526, "learning_rate": 0.00019823044177556346, "loss": 0.0892, "step": 3225 }, { "epoch": 0.20854949494949496, "grad_norm": 0.09415031224489212, "learning_rate": 0.00019822916073224535, "loss": 0.1084, "step": 3226 }, { "epoch": 0.20861414141414142, "grad_norm": 0.07401181757450104, "learning_rate": 0.00019822787922954186, "loss": 0.0861, "step": 3227 }, { "epoch": 0.20867878787878788, "grad_norm": 0.08296767622232437, "learning_rate": 0.00019822659726745906, "loss": 0.1022, "step": 3228 }, { "epoch": 0.20874343434343434, "grad_norm": 0.08823227137327194, "learning_rate": 0.00019822531484600298, "loss": 0.1139, "step": 3229 }, { 
"epoch": 0.2088080808080808, "grad_norm": 0.07112350314855576, "learning_rate": 0.00019822403196517955, "loss": 0.0839, "step": 3230 }, { "epoch": 0.20887272727272727, "grad_norm": 0.065652035176754, "learning_rate": 0.0001982227486249948, "loss": 0.0814, "step": 3231 }, { "epoch": 0.20893737373737373, "grad_norm": 0.07163041830062866, "learning_rate": 0.00019822146482545468, "loss": 0.0882, "step": 3232 }, { "epoch": 0.20893737373737373, "eval_bleu": 12.093881575725476, "eval_loss": 0.09620682895183563, "eval_runtime": 2.9348, "eval_samples_per_second": 10.904, "eval_steps_per_second": 1.363, "step": 3232 }, { "epoch": 0.2090020202020202, "grad_norm": 0.0594286248087883, "learning_rate": 0.0001982201805665653, "loss": 0.0625, "step": 3233 }, { "epoch": 0.20906666666666668, "grad_norm": 0.0808313861489296, "learning_rate": 0.0001982188958483326, "loss": 0.0914, "step": 3234 }, { "epoch": 0.20913131313131314, "grad_norm": 0.07525745034217834, "learning_rate": 0.00019821761067076256, "loss": 0.0941, "step": 3235 }, { "epoch": 0.2091959595959596, "grad_norm": 0.18140995502471924, "learning_rate": 0.00019821632503386124, "loss": 0.0958, "step": 3236 }, { "epoch": 0.20926060606060606, "grad_norm": 0.07752675563097, "learning_rate": 0.00019821503893763468, "loss": 0.0958, "step": 3237 }, { "epoch": 0.20932525252525253, "grad_norm": 0.06594157963991165, "learning_rate": 0.0001982137523820888, "loss": 0.0787, "step": 3238 }, { "epoch": 0.209389898989899, "grad_norm": 0.06697786599397659, "learning_rate": 0.00019821246536722966, "loss": 0.081, "step": 3239 }, { "epoch": 0.20945454545454545, "grad_norm": 0.08294513821601868, "learning_rate": 0.00019821117789306332, "loss": 0.0887, "step": 3240 }, { "epoch": 0.2095191919191919, "grad_norm": 0.060510385781526566, "learning_rate": 0.00019820988995959576, "loss": 0.0827, "step": 3241 }, { "epoch": 0.20958383838383837, "grad_norm": 0.07210643589496613, "learning_rate": 0.000198208601566833, "loss": 0.0806, "step": 3242 }, { 
"epoch": 0.20964848484848486, "grad_norm": 0.07757668197154999, "learning_rate": 0.00019820731271478112, "loss": 0.0966, "step": 3243 }, { "epoch": 0.20971313131313132, "grad_norm": 0.08799994736909866, "learning_rate": 0.00019820602340344607, "loss": 0.0956, "step": 3244 }, { "epoch": 0.20977777777777779, "grad_norm": 0.07386535406112671, "learning_rate": 0.00019820473363283393, "loss": 0.1012, "step": 3245 }, { "epoch": 0.20984242424242425, "grad_norm": 0.07808306813240051, "learning_rate": 0.0001982034434029507, "loss": 0.0931, "step": 3246 }, { "epoch": 0.2099070707070707, "grad_norm": 0.07190465927124023, "learning_rate": 0.00019820215271380246, "loss": 0.102, "step": 3247 }, { "epoch": 0.20997171717171717, "grad_norm": 0.09075876325368881, "learning_rate": 0.00019820086156539516, "loss": 0.0922, "step": 3248 }, { "epoch": 0.20997171717171717, "eval_bleu": 11.75159459131704, "eval_loss": 0.0980924516916275, "eval_runtime": 2.7078, "eval_samples_per_second": 11.818, "eval_steps_per_second": 1.477, "step": 3248 }, { "epoch": 0.21003636363636363, "grad_norm": 0.0819455087184906, "learning_rate": 0.00019819956995773493, "loss": 0.1016, "step": 3249 }, { "epoch": 0.2101010101010101, "grad_norm": 0.07833933085203171, "learning_rate": 0.0001981982778908278, "loss": 0.0877, "step": 3250 }, { "epoch": 0.21016565656565656, "grad_norm": 0.0952921211719513, "learning_rate": 0.00019819698536467975, "loss": 0.1179, "step": 3251 }, { "epoch": 0.21023030303030302, "grad_norm": 0.07919947803020477, "learning_rate": 0.00019819569237929688, "loss": 0.0971, "step": 3252 }, { "epoch": 0.2102949494949495, "grad_norm": 0.07089057564735413, "learning_rate": 0.0001981943989346852, "loss": 0.0956, "step": 3253 }, { "epoch": 0.21035959595959597, "grad_norm": 0.09709557890892029, "learning_rate": 0.0001981931050308508, "loss": 0.1366, "step": 3254 }, { "epoch": 0.21042424242424243, "grad_norm": 0.08185214549303055, "learning_rate": 0.0001981918106677997, "loss": 0.088, "step": 3255 }, { 
"epoch": 0.2104888888888889, "grad_norm": 0.0734316036105156, "learning_rate": 0.000198190515845538, "loss": 0.0938, "step": 3256 }, { "epoch": 0.21055353535353535, "grad_norm": 0.06370724737644196, "learning_rate": 0.00019818922056407168, "loss": 0.0771, "step": 3257 }, { "epoch": 0.21061818181818182, "grad_norm": 0.059855539351701736, "learning_rate": 0.00019818792482340683, "loss": 0.0716, "step": 3258 }, { "epoch": 0.21068282828282828, "grad_norm": 0.06486648321151733, "learning_rate": 0.0001981866286235495, "loss": 0.0767, "step": 3259 }, { "epoch": 0.21074747474747474, "grad_norm": 0.07044275104999542, "learning_rate": 0.00019818533196450583, "loss": 0.0833, "step": 3260 }, { "epoch": 0.2108121212121212, "grad_norm": 0.06139122694730759, "learning_rate": 0.0001981840348462818, "loss": 0.0843, "step": 3261 }, { "epoch": 0.2108767676767677, "grad_norm": 0.07323456555604935, "learning_rate": 0.00019818273726888346, "loss": 0.0901, "step": 3262 }, { "epoch": 0.21094141414141415, "grad_norm": 0.07193174213171005, "learning_rate": 0.00019818143923231696, "loss": 0.1033, "step": 3263 }, { "epoch": 0.2110060606060606, "grad_norm": 0.08211849629878998, "learning_rate": 0.0001981801407365883, "loss": 0.1117, "step": 3264 }, { "epoch": 0.2110060606060606, "eval_bleu": 12.226759341908128, "eval_loss": 0.09752113372087479, "eval_runtime": 2.886, "eval_samples_per_second": 11.088, "eval_steps_per_second": 1.386, "step": 3264 }, { "epoch": 0.21107070707070708, "grad_norm": 0.06628923863172531, "learning_rate": 0.00019817884178170359, "loss": 0.0889, "step": 3265 }, { "epoch": 0.21113535353535354, "grad_norm": 0.0785524770617485, "learning_rate": 0.00019817754236766888, "loss": 0.0869, "step": 3266 }, { "epoch": 0.2112, "grad_norm": 0.06970620900392532, "learning_rate": 0.00019817624249449028, "loss": 0.089, "step": 3267 }, { "epoch": 0.21126464646464646, "grad_norm": 0.08422599732875824, "learning_rate": 0.00019817494216217384, "loss": 0.099, "step": 3268 }, { "epoch": 
0.21132929292929292, "grad_norm": 0.06703177094459534, "learning_rate": 0.00019817364137072568, "loss": 0.077, "step": 3269 }, { "epoch": 0.21139393939393938, "grad_norm": 0.0788150280714035, "learning_rate": 0.0001981723401201518, "loss": 0.0976, "step": 3270 }, { "epoch": 0.21145858585858585, "grad_norm": 0.07000916451215744, "learning_rate": 0.00019817103841045838, "loss": 0.0885, "step": 3271 }, { "epoch": 0.21152323232323234, "grad_norm": 0.08158749341964722, "learning_rate": 0.00019816973624165146, "loss": 0.0917, "step": 3272 }, { "epoch": 0.2115878787878788, "grad_norm": 0.07905953377485275, "learning_rate": 0.00019816843361373712, "loss": 0.1021, "step": 3273 }, { "epoch": 0.21165252525252526, "grad_norm": 0.14402255415916443, "learning_rate": 0.0001981671305267215, "loss": 0.1106, "step": 3274 }, { "epoch": 0.21171717171717172, "grad_norm": 0.08556491881608963, "learning_rate": 0.00019816582698061066, "loss": 0.1042, "step": 3275 }, { "epoch": 0.21178181818181818, "grad_norm": 0.06387800723314285, "learning_rate": 0.0001981645229754107, "loss": 0.0794, "step": 3276 }, { "epoch": 0.21184646464646464, "grad_norm": 0.06925465166568756, "learning_rate": 0.00019816321851112768, "loss": 0.0942, "step": 3277 }, { "epoch": 0.2119111111111111, "grad_norm": 0.08130539953708649, "learning_rate": 0.00019816191358776778, "loss": 0.0927, "step": 3278 }, { "epoch": 0.21197575757575757, "grad_norm": 0.07872293144464493, "learning_rate": 0.00019816060820533707, "loss": 0.0925, "step": 3279 }, { "epoch": 0.21204040404040403, "grad_norm": 0.06228552758693695, "learning_rate": 0.00019815930236384162, "loss": 0.081, "step": 3280 }, { "epoch": 0.21204040404040403, "eval_bleu": 15.055489145493217, "eval_loss": 0.09649857878684998, "eval_runtime": 2.6538, "eval_samples_per_second": 12.058, "eval_steps_per_second": 1.507, "step": 3280 }, { "epoch": 0.21210505050505052, "grad_norm": 0.0830526128411293, "learning_rate": 0.00019815799606328756, "loss": 0.0986, "step": 3281 }, { 
"epoch": 0.21216969696969698, "grad_norm": 0.07102509588003159, "learning_rate": 0.00019815668930368103, "loss": 0.0933, "step": 3282 }, { "epoch": 0.21223434343434344, "grad_norm": 0.08107369393110275, "learning_rate": 0.00019815538208502808, "loss": 0.1022, "step": 3283 }, { "epoch": 0.2122989898989899, "grad_norm": 0.08084149658679962, "learning_rate": 0.00019815407440733487, "loss": 0.0922, "step": 3284 }, { "epoch": 0.21236363636363637, "grad_norm": 0.07209715247154236, "learning_rate": 0.00019815276627060754, "loss": 0.0943, "step": 3285 }, { "epoch": 0.21242828282828283, "grad_norm": 0.06801539659500122, "learning_rate": 0.00019815145767485213, "loss": 0.0785, "step": 3286 }, { "epoch": 0.2124929292929293, "grad_norm": 0.07762764394283295, "learning_rate": 0.0001981501486200748, "loss": 0.0983, "step": 3287 }, { "epoch": 0.21255757575757575, "grad_norm": 0.08190947026014328, "learning_rate": 0.00019814883910628168, "loss": 0.1041, "step": 3288 }, { "epoch": 0.2126222222222222, "grad_norm": 0.07338094711303711, "learning_rate": 0.0001981475291334789, "loss": 0.0862, "step": 3289 }, { "epoch": 0.21268686868686867, "grad_norm": 0.07414393126964569, "learning_rate": 0.00019814621870167253, "loss": 0.0757, "step": 3290 }, { "epoch": 0.21275151515151516, "grad_norm": 0.06347116082906723, "learning_rate": 0.0001981449078108688, "loss": 0.0812, "step": 3291 }, { "epoch": 0.21281616161616163, "grad_norm": 0.07594288885593414, "learning_rate": 0.00019814359646107375, "loss": 0.0991, "step": 3292 }, { "epoch": 0.2128808080808081, "grad_norm": 0.0660187229514122, "learning_rate": 0.00019814228465229357, "loss": 0.0773, "step": 3293 }, { "epoch": 0.21294545454545455, "grad_norm": 0.07161509990692139, "learning_rate": 0.00019814097238453435, "loss": 0.0976, "step": 3294 }, { "epoch": 0.213010101010101, "grad_norm": 0.0784306526184082, "learning_rate": 0.00019813965965780224, "loss": 0.0975, "step": 3295 }, { "epoch": 0.21307474747474747, "grad_norm": 0.07063211500644684, 
"learning_rate": 0.0001981383464721034, "loss": 0.0835, "step": 3296 }, { "epoch": 0.21307474747474747, "eval_bleu": 14.526641689724539, "eval_loss": 0.09730221331119537, "eval_runtime": 2.7492, "eval_samples_per_second": 11.64, "eval_steps_per_second": 1.455, "step": 3296 }, { "epoch": 0.21313939393939393, "grad_norm": 0.06933317333459854, "learning_rate": 0.00019813703282744395, "loss": 0.0875, "step": 3297 }, { "epoch": 0.2132040404040404, "grad_norm": 0.06673501431941986, "learning_rate": 0.00019813571872383004, "loss": 0.0852, "step": 3298 }, { "epoch": 0.21326868686868686, "grad_norm": 0.07761738449335098, "learning_rate": 0.00019813440416126783, "loss": 0.1004, "step": 3299 }, { "epoch": 0.21333333333333335, "grad_norm": 0.07827828824520111, "learning_rate": 0.00019813308913976344, "loss": 0.1021, "step": 3300 }, { "epoch": 0.2133979797979798, "grad_norm": 0.0728059634566307, "learning_rate": 0.00019813177365932304, "loss": 0.0941, "step": 3301 }, { "epoch": 0.21346262626262627, "grad_norm": 0.06538718193769455, "learning_rate": 0.00019813045771995276, "loss": 0.0753, "step": 3302 }, { "epoch": 0.21352727272727273, "grad_norm": 0.09158114343881607, "learning_rate": 0.0001981291413216588, "loss": 0.1154, "step": 3303 }, { "epoch": 0.2135919191919192, "grad_norm": 0.08771733939647675, "learning_rate": 0.00019812782446444724, "loss": 0.1041, "step": 3304 }, { "epoch": 0.21365656565656566, "grad_norm": 0.07040257751941681, "learning_rate": 0.00019812650714832436, "loss": 0.0863, "step": 3305 }, { "epoch": 0.21372121212121212, "grad_norm": 0.06416306644678116, "learning_rate": 0.00019812518937329618, "loss": 0.0834, "step": 3306 }, { "epoch": 0.21378585858585858, "grad_norm": 0.07003892958164215, "learning_rate": 0.00019812387113936895, "loss": 0.0837, "step": 3307 }, { "epoch": 0.21385050505050504, "grad_norm": 0.0632302537560463, "learning_rate": 0.00019812255244654883, "loss": 0.0784, "step": 3308 }, { "epoch": 0.2139151515151515, "grad_norm": 
0.07356846332550049, "learning_rate": 0.00019812123329484194, "loss": 0.0988, "step": 3309 }, { "epoch": 0.213979797979798, "grad_norm": 0.07500423491001129, "learning_rate": 0.00019811991368425452, "loss": 0.085, "step": 3310 }, { "epoch": 0.21404444444444445, "grad_norm": 0.07073833048343658, "learning_rate": 0.00019811859361479266, "loss": 0.0931, "step": 3311 }, { "epoch": 0.21410909090909092, "grad_norm": 0.06631512194871902, "learning_rate": 0.00019811727308646258, "loss": 0.0843, "step": 3312 }, { "epoch": 0.21410909090909092, "eval_bleu": 13.579684149902931, "eval_loss": 0.09733209013938904, "eval_runtime": 2.7003, "eval_samples_per_second": 11.851, "eval_steps_per_second": 1.481, "step": 3312 }, { "epoch": 0.21417373737373738, "grad_norm": 0.06393156200647354, "learning_rate": 0.00019811595209927047, "loss": 0.0722, "step": 3313 }, { "epoch": 0.21423838383838384, "grad_norm": 0.07341096550226212, "learning_rate": 0.00019811463065322248, "loss": 0.0918, "step": 3314 }, { "epoch": 0.2143030303030303, "grad_norm": 0.06938652694225311, "learning_rate": 0.00019811330874832482, "loss": 0.093, "step": 3315 }, { "epoch": 0.21436767676767676, "grad_norm": 0.07169376313686371, "learning_rate": 0.0001981119863845836, "loss": 0.0994, "step": 3316 }, { "epoch": 0.21443232323232322, "grad_norm": 0.09105397015810013, "learning_rate": 0.0001981106635620051, "loss": 0.0932, "step": 3317 }, { "epoch": 0.21449696969696969, "grad_norm": 0.0699797123670578, "learning_rate": 0.00019810934028059545, "loss": 0.0896, "step": 3318 }, { "epoch": 0.21456161616161618, "grad_norm": 0.058354903012514114, "learning_rate": 0.00019810801654036086, "loss": 0.0668, "step": 3319 }, { "epoch": 0.21462626262626264, "grad_norm": 0.07138828933238983, "learning_rate": 0.00019810669234130747, "loss": 0.0917, "step": 3320 }, { "epoch": 0.2146909090909091, "grad_norm": 0.07746605575084686, "learning_rate": 0.00019810536768344157, "loss": 0.0921, "step": 3321 }, { "epoch": 0.21475555555555556, 
"grad_norm": 0.07738307863473892, "learning_rate": 0.00019810404256676925, "loss": 0.0866, "step": 3322 }, { "epoch": 0.21482020202020202, "grad_norm": 0.07437840104103088, "learning_rate": 0.0001981027169912968, "loss": 0.0925, "step": 3323 }, { "epoch": 0.21488484848484848, "grad_norm": 0.06851816177368164, "learning_rate": 0.00019810139095703035, "loss": 0.0827, "step": 3324 }, { "epoch": 0.21494949494949495, "grad_norm": 0.06500723212957382, "learning_rate": 0.0001981000644639761, "loss": 0.0778, "step": 3325 }, { "epoch": 0.2150141414141414, "grad_norm": 0.06486549228429794, "learning_rate": 0.00019809873751214032, "loss": 0.072, "step": 3326 }, { "epoch": 0.21507878787878787, "grad_norm": 0.07577551901340485, "learning_rate": 0.00019809741010152915, "loss": 0.0811, "step": 3327 }, { "epoch": 0.21514343434343433, "grad_norm": 0.08082138746976852, "learning_rate": 0.00019809608223214883, "loss": 0.1028, "step": 3328 }, { "epoch": 0.21514343434343433, "eval_bleu": 13.53935335450092, "eval_loss": 0.09745529294013977, "eval_runtime": 2.8652, "eval_samples_per_second": 11.169, "eval_steps_per_second": 1.396, "step": 3328 }, { "epoch": 0.21520808080808082, "grad_norm": 0.06792891025543213, "learning_rate": 0.00019809475390400558, "loss": 0.0807, "step": 3329 }, { "epoch": 0.21527272727272728, "grad_norm": 0.08012599498033524, "learning_rate": 0.0001980934251171056, "loss": 0.0822, "step": 3330 }, { "epoch": 0.21533737373737374, "grad_norm": 0.07291116565465927, "learning_rate": 0.00019809209587145506, "loss": 0.0908, "step": 3331 }, { "epoch": 0.2154020202020202, "grad_norm": 0.07829274237155914, "learning_rate": 0.00019809076616706027, "loss": 0.1032, "step": 3332 }, { "epoch": 0.21546666666666667, "grad_norm": 0.06904233247041702, "learning_rate": 0.00019808943600392733, "loss": 0.0839, "step": 3333 }, { "epoch": 0.21553131313131313, "grad_norm": 0.0646386370062828, "learning_rate": 0.00019808810538206258, "loss": 0.0736, "step": 3334 }, { "epoch": 
0.2155959595959596, "grad_norm": 0.07214377075433731, "learning_rate": 0.00019808677430147218, "loss": 0.0874, "step": 3335 }, { "epoch": 0.21566060606060605, "grad_norm": 0.08438138663768768, "learning_rate": 0.00019808544276216235, "loss": 0.0931, "step": 3336 }, { "epoch": 0.21572525252525251, "grad_norm": 0.07673601061105728, "learning_rate": 0.0001980841107641393, "loss": 0.0983, "step": 3337 }, { "epoch": 0.21578989898989898, "grad_norm": 0.06860648840665817, "learning_rate": 0.00019808277830740933, "loss": 0.0836, "step": 3338 }, { "epoch": 0.21585454545454547, "grad_norm": 0.07794602960348129, "learning_rate": 0.00019808144539197863, "loss": 0.0922, "step": 3339 }, { "epoch": 0.21591919191919193, "grad_norm": 0.0965222492814064, "learning_rate": 0.00019808011201785344, "loss": 0.1341, "step": 3340 }, { "epoch": 0.2159838383838384, "grad_norm": 0.08122780919075012, "learning_rate": 0.00019807877818503998, "loss": 0.104, "step": 3341 }, { "epoch": 0.21604848484848485, "grad_norm": 0.07839896529912949, "learning_rate": 0.0001980774438935445, "loss": 0.0946, "step": 3342 }, { "epoch": 0.2161131313131313, "grad_norm": 0.0766155794262886, "learning_rate": 0.00019807610914337323, "loss": 0.0837, "step": 3343 }, { "epoch": 0.21617777777777777, "grad_norm": 0.0776563212275505, "learning_rate": 0.00019807477393453242, "loss": 0.085, "step": 3344 }, { "epoch": 0.21617777777777777, "eval_bleu": 14.494493824501257, "eval_loss": 0.09570226073265076, "eval_runtime": 2.6666, "eval_samples_per_second": 12.0, "eval_steps_per_second": 1.5, "step": 3344 }, { "epoch": 0.21624242424242424, "grad_norm": 0.06095238775014877, "learning_rate": 0.00019807343826702833, "loss": 0.0811, "step": 3345 }, { "epoch": 0.2163070707070707, "grad_norm": 0.07335347682237625, "learning_rate": 0.00019807210214086717, "loss": 0.0891, "step": 3346 }, { "epoch": 0.21637171717171716, "grad_norm": 0.08615083992481232, "learning_rate": 0.00019807076555605524, "loss": 0.1087, "step": 3347 }, { "epoch": 
0.21643636363636365, "grad_norm": 0.07660216838121414, "learning_rate": 0.00019806942851259874, "loss": 0.0961, "step": 3348 }, { "epoch": 0.2165010101010101, "grad_norm": 0.08800068497657776, "learning_rate": 0.00019806809101050393, "loss": 0.0939, "step": 3349 }, { "epoch": 0.21656565656565657, "grad_norm": 0.06695350259542465, "learning_rate": 0.00019806675304977713, "loss": 0.069, "step": 3350 }, { "epoch": 0.21663030303030303, "grad_norm": 0.08028466999530792, "learning_rate": 0.0001980654146304245, "loss": 0.0935, "step": 3351 }, { "epoch": 0.2166949494949495, "grad_norm": 0.0833858922123909, "learning_rate": 0.00019806407575245237, "loss": 0.0848, "step": 3352 }, { "epoch": 0.21675959595959596, "grad_norm": 0.07134957611560822, "learning_rate": 0.00019806273641586697, "loss": 0.0805, "step": 3353 }, { "epoch": 0.21682424242424242, "grad_norm": 0.0945541113615036, "learning_rate": 0.0001980613966206746, "loss": 0.0935, "step": 3354 }, { "epoch": 0.21688888888888888, "grad_norm": 0.07783107459545135, "learning_rate": 0.00019806005636688144, "loss": 0.0905, "step": 3355 }, { "epoch": 0.21695353535353534, "grad_norm": 0.07399491220712662, "learning_rate": 0.00019805871565449384, "loss": 0.0999, "step": 3356 }, { "epoch": 0.2170181818181818, "grad_norm": 0.0729394257068634, "learning_rate": 0.00019805737448351805, "loss": 0.0872, "step": 3357 }, { "epoch": 0.2170828282828283, "grad_norm": 0.07971515506505966, "learning_rate": 0.00019805603285396033, "loss": 0.0987, "step": 3358 }, { "epoch": 0.21714747474747476, "grad_norm": 0.09197946637868881, "learning_rate": 0.00019805469076582694, "loss": 0.1106, "step": 3359 }, { "epoch": 0.21721212121212122, "grad_norm": 0.06486957520246506, "learning_rate": 0.0001980533482191242, "loss": 0.0834, "step": 3360 }, { "epoch": 0.21721212121212122, "eval_bleu": 13.108251226700098, "eval_loss": 0.09510880708694458, "eval_runtime": 2.7968, "eval_samples_per_second": 11.442, "eval_steps_per_second": 1.43, "step": 3360 }, { 
"epoch": 0.21727676767676768, "grad_norm": 0.09355279058218002, "learning_rate": 0.00019805200521385836, "loss": 0.0995, "step": 3361 }, { "epoch": 0.21734141414141414, "grad_norm": 0.06528133153915405, "learning_rate": 0.00019805066175003573, "loss": 0.0812, "step": 3362 }, { "epoch": 0.2174060606060606, "grad_norm": 0.06726797670125961, "learning_rate": 0.00019804931782766257, "loss": 0.0897, "step": 3363 }, { "epoch": 0.21747070707070706, "grad_norm": 0.06881358474493027, "learning_rate": 0.00019804797344674514, "loss": 0.092, "step": 3364 }, { "epoch": 0.21753535353535353, "grad_norm": 0.07933942973613739, "learning_rate": 0.00019804662860728974, "loss": 0.0951, "step": 3365 }, { "epoch": 0.2176, "grad_norm": 0.07912680506706238, "learning_rate": 0.00019804528330930269, "loss": 0.1034, "step": 3366 }, { "epoch": 0.21766464646464648, "grad_norm": 0.0710979476571083, "learning_rate": 0.00019804393755279027, "loss": 0.0876, "step": 3367 }, { "epoch": 0.21772929292929294, "grad_norm": 0.07970493286848068, "learning_rate": 0.0001980425913377588, "loss": 0.1, "step": 3368 }, { "epoch": 0.2177939393939394, "grad_norm": 0.058857958763837814, "learning_rate": 0.00019804124466421446, "loss": 0.0716, "step": 3369 }, { "epoch": 0.21785858585858586, "grad_norm": 0.1202416941523552, "learning_rate": 0.00019803989753216367, "loss": 0.1624, "step": 3370 }, { "epoch": 0.21792323232323232, "grad_norm": 0.07926231622695923, "learning_rate": 0.0001980385499416127, "loss": 0.0903, "step": 3371 }, { "epoch": 0.21798787878787879, "grad_norm": 0.07087825238704681, "learning_rate": 0.00019803720189256785, "loss": 0.0906, "step": 3372 }, { "epoch": 0.21805252525252525, "grad_norm": 0.07070551812648773, "learning_rate": 0.0001980358533850354, "loss": 0.0889, "step": 3373 }, { "epoch": 0.2181171717171717, "grad_norm": 0.06965470314025879, "learning_rate": 0.00019803450441902167, "loss": 0.084, "step": 3374 }, { "epoch": 0.21818181818181817, "grad_norm": 0.07521123439073563, 
"learning_rate": 0.000198033154994533, "loss": 0.0998, "step": 3375 }, { "epoch": 0.21824646464646463, "grad_norm": 0.07839778065681458, "learning_rate": 0.00019803180511157565, "loss": 0.0941, "step": 3376 }, { "epoch": 0.21824646464646463, "eval_bleu": 14.012770241737655, "eval_loss": 0.09544821828603745, "eval_runtime": 2.6637, "eval_samples_per_second": 12.014, "eval_steps_per_second": 1.502, "step": 3376 }, { "epoch": 0.21831111111111112, "grad_norm": 0.07042742520570755, "learning_rate": 0.00019803045477015597, "loss": 0.0941, "step": 3377 }, { "epoch": 0.21837575757575758, "grad_norm": 0.08788799494504929, "learning_rate": 0.00019802910397028023, "loss": 0.1137, "step": 3378 }, { "epoch": 0.21844040404040405, "grad_norm": 0.06507635116577148, "learning_rate": 0.00019802775271195478, "loss": 0.0769, "step": 3379 }, { "epoch": 0.2185050505050505, "grad_norm": 0.0776074007153511, "learning_rate": 0.000198026400995186, "loss": 0.095, "step": 3380 }, { "epoch": 0.21856969696969697, "grad_norm": 0.07036030292510986, "learning_rate": 0.0001980250488199801, "loss": 0.1011, "step": 3381 }, { "epoch": 0.21863434343434343, "grad_norm": 0.0770777016878128, "learning_rate": 0.00019802369618634344, "loss": 0.0974, "step": 3382 }, { "epoch": 0.2186989898989899, "grad_norm": 0.06756532192230225, "learning_rate": 0.00019802234309428238, "loss": 0.0931, "step": 3383 }, { "epoch": 0.21876363636363635, "grad_norm": 0.0934842973947525, "learning_rate": 0.00019802098954380324, "loss": 0.1144, "step": 3384 }, { "epoch": 0.21882828282828282, "grad_norm": 0.0736488550901413, "learning_rate": 0.0001980196355349123, "loss": 0.0851, "step": 3385 }, { "epoch": 0.2188929292929293, "grad_norm": 0.07394425570964813, "learning_rate": 0.00019801828106761599, "loss": 0.0917, "step": 3386 }, { "epoch": 0.21895757575757577, "grad_norm": 0.07060705125331879, "learning_rate": 0.0001980169261419205, "loss": 0.0992, "step": 3387 }, { "epoch": 0.21902222222222223, "grad_norm": 0.07473021000623703, 
"learning_rate": 0.00019801557075783233, "loss": 0.094, "step": 3388 }, { "epoch": 0.2190868686868687, "grad_norm": 0.08129066228866577, "learning_rate": 0.0001980142149153577, "loss": 0.1071, "step": 3389 }, { "epoch": 0.21915151515151515, "grad_norm": 0.0649336650967598, "learning_rate": 0.000198012858614503, "loss": 0.0855, "step": 3390 }, { "epoch": 0.21921616161616161, "grad_norm": 0.06824880838394165, "learning_rate": 0.00019801150185527454, "loss": 0.0837, "step": 3391 }, { "epoch": 0.21928080808080808, "grad_norm": 0.0683906301856041, "learning_rate": 0.00019801014463767872, "loss": 0.0796, "step": 3392 }, { "epoch": 0.21928080808080808, "eval_bleu": 10.845328859708639, "eval_loss": 0.09526924788951874, "eval_runtime": 2.8712, "eval_samples_per_second": 11.145, "eval_steps_per_second": 1.393, "step": 3392 }, { "epoch": 0.21934545454545454, "grad_norm": 0.06590454280376434, "learning_rate": 0.00019800878696172184, "loss": 0.0775, "step": 3393 }, { "epoch": 0.219410101010101, "grad_norm": 0.08065328747034073, "learning_rate": 0.00019800742882741026, "loss": 0.0879, "step": 3394 }, { "epoch": 0.21947474747474746, "grad_norm": 0.08675577491521835, "learning_rate": 0.00019800607023475034, "loss": 0.0953, "step": 3395 }, { "epoch": 0.21953939393939395, "grad_norm": 0.07061854749917984, "learning_rate": 0.00019800471118374842, "loss": 0.0921, "step": 3396 }, { "epoch": 0.2196040404040404, "grad_norm": 0.07788094133138657, "learning_rate": 0.00019800335167441087, "loss": 0.0829, "step": 3397 }, { "epoch": 0.21966868686868687, "grad_norm": 0.07903806120157242, "learning_rate": 0.00019800199170674407, "loss": 0.0983, "step": 3398 }, { "epoch": 0.21973333333333334, "grad_norm": 0.07318644970655441, "learning_rate": 0.00019800063128075436, "loss": 0.0873, "step": 3399 }, { "epoch": 0.2197979797979798, "grad_norm": 0.07462291419506073, "learning_rate": 0.00019799927039644804, "loss": 0.0947, "step": 3400 }, { "epoch": 0.21986262626262626, "grad_norm": 
0.07499099522829056, "learning_rate": 0.00019799790905383155, "loss": 0.0954, "step": 3401 }, { "epoch": 0.21992727272727272, "grad_norm": 0.06945568323135376, "learning_rate": 0.00019799654725291126, "loss": 0.0704, "step": 3402 }, { "epoch": 0.21999191919191918, "grad_norm": 0.10157816857099533, "learning_rate": 0.0001979951849936935, "loss": 0.113, "step": 3403 }, { "epoch": 0.22005656565656564, "grad_norm": 0.08681383728981018, "learning_rate": 0.00019799382227618466, "loss": 0.0871, "step": 3404 }, { "epoch": 0.22012121212121213, "grad_norm": 0.08701842278242111, "learning_rate": 0.0001979924591003911, "loss": 0.1039, "step": 3405 }, { "epoch": 0.2201858585858586, "grad_norm": 0.059475112706422806, "learning_rate": 0.00019799109546631919, "loss": 0.0723, "step": 3406 }, { "epoch": 0.22025050505050506, "grad_norm": 0.06547842919826508, "learning_rate": 0.00019798973137397536, "loss": 0.0803, "step": 3407 }, { "epoch": 0.22031515151515152, "grad_norm": 0.06663373857736588, "learning_rate": 0.00019798836682336594, "loss": 0.0825, "step": 3408 }, { "epoch": 0.22031515151515152, "eval_bleu": 13.261001125659474, "eval_loss": 0.09563431888818741, "eval_runtime": 2.7567, "eval_samples_per_second": 11.608, "eval_steps_per_second": 1.451, "step": 3408 }, { "epoch": 0.22037979797979798, "grad_norm": 0.0813746452331543, "learning_rate": 0.0001979870018144973, "loss": 0.1197, "step": 3409 }, { "epoch": 0.22044444444444444, "grad_norm": 0.08512023091316223, "learning_rate": 0.00019798563634737588, "loss": 0.1099, "step": 3410 }, { "epoch": 0.2205090909090909, "grad_norm": 0.08940357714891434, "learning_rate": 0.00019798427042200804, "loss": 0.1139, "step": 3411 }, { "epoch": 0.22057373737373737, "grad_norm": 0.07359014451503754, "learning_rate": 0.0001979829040384001, "loss": 0.0998, "step": 3412 }, { "epoch": 0.22063838383838383, "grad_norm": 0.060967642813920975, "learning_rate": 0.00019798153719655857, "loss": 0.0774, "step": 3413 }, { "epoch": 0.2207030303030303, 
"grad_norm": 0.054646387696266174, "learning_rate": 0.00019798016989648975, "loss": 0.0687, "step": 3414 }, { "epoch": 0.22076767676767678, "grad_norm": 0.06471384316682816, "learning_rate": 0.0001979788021382001, "loss": 0.0877, "step": 3415 }, { "epoch": 0.22083232323232324, "grad_norm": 0.07227566093206406, "learning_rate": 0.00019797743392169594, "loss": 0.0913, "step": 3416 }, { "epoch": 0.2208969696969697, "grad_norm": 0.07610689103603363, "learning_rate": 0.00019797606524698372, "loss": 0.0909, "step": 3417 }, { "epoch": 0.22096161616161616, "grad_norm": 0.06669628620147705, "learning_rate": 0.00019797469611406986, "loss": 0.082, "step": 3418 }, { "epoch": 0.22102626262626263, "grad_norm": 0.06636548787355423, "learning_rate": 0.0001979733265229607, "loss": 0.0812, "step": 3419 }, { "epoch": 0.2210909090909091, "grad_norm": 0.06676790118217468, "learning_rate": 0.00019797195647366272, "loss": 0.08, "step": 3420 }, { "epoch": 0.22115555555555555, "grad_norm": 0.07399611175060272, "learning_rate": 0.00019797058596618226, "loss": 0.0933, "step": 3421 }, { "epoch": 0.221220202020202, "grad_norm": 0.07517257332801819, "learning_rate": 0.00019796921500052575, "loss": 0.1008, "step": 3422 }, { "epoch": 0.22128484848484847, "grad_norm": 0.0758170410990715, "learning_rate": 0.00019796784357669966, "loss": 0.0977, "step": 3423 }, { "epoch": 0.22134949494949496, "grad_norm": 0.07462489604949951, "learning_rate": 0.00019796647169471033, "loss": 0.0963, "step": 3424 }, { "epoch": 0.22134949494949496, "eval_bleu": 15.5883034251279, "eval_loss": 0.09513704478740692, "eval_runtime": 2.7633, "eval_samples_per_second": 11.58, "eval_steps_per_second": 1.448, "step": 3424 }, { "epoch": 0.22141414141414142, "grad_norm": 0.06577882915735245, "learning_rate": 0.0001979650993545642, "loss": 0.0867, "step": 3425 }, { "epoch": 0.22147878787878789, "grad_norm": 0.07720796763896942, "learning_rate": 0.00019796372655626768, "loss": 0.0954, "step": 3426 }, { "epoch": 0.22154343434343435, 
"grad_norm": 0.06880860030651093, "learning_rate": 0.00019796235329982715, "loss": 0.0837, "step": 3427 }, { "epoch": 0.2216080808080808, "grad_norm": 0.07525089383125305, "learning_rate": 0.00019796097958524916, "loss": 0.08, "step": 3428 }, { "epoch": 0.22167272727272727, "grad_norm": 0.07998102903366089, "learning_rate": 0.00019795960541254003, "loss": 0.0945, "step": 3429 }, { "epoch": 0.22173737373737373, "grad_norm": 0.06905552744865417, "learning_rate": 0.00019795823078170617, "loss": 0.0772, "step": 3430 }, { "epoch": 0.2218020202020202, "grad_norm": 0.06332649290561676, "learning_rate": 0.00019795685569275407, "loss": 0.0751, "step": 3431 }, { "epoch": 0.22186666666666666, "grad_norm": 0.0635744258761406, "learning_rate": 0.00019795548014569017, "loss": 0.0743, "step": 3432 }, { "epoch": 0.22193131313131312, "grad_norm": 0.07523475587368011, "learning_rate": 0.00019795410414052086, "loss": 0.0978, "step": 3433 }, { "epoch": 0.2219959595959596, "grad_norm": 0.06728078424930573, "learning_rate": 0.00019795272767725254, "loss": 0.0853, "step": 3434 }, { "epoch": 0.22206060606060607, "grad_norm": 0.0777457058429718, "learning_rate": 0.00019795135075589175, "loss": 0.102, "step": 3435 }, { "epoch": 0.22212525252525253, "grad_norm": 0.07914719730615616, "learning_rate": 0.00019794997337644485, "loss": 0.0904, "step": 3436 }, { "epoch": 0.222189898989899, "grad_norm": 0.06530313193798065, "learning_rate": 0.00019794859553891832, "loss": 0.0801, "step": 3437 }, { "epoch": 0.22225454545454545, "grad_norm": 0.06777159124612808, "learning_rate": 0.00019794721724331857, "loss": 0.0934, "step": 3438 }, { "epoch": 0.22231919191919192, "grad_norm": 0.07087704539299011, "learning_rate": 0.00019794583848965208, "loss": 0.0917, "step": 3439 }, { "epoch": 0.22238383838383838, "grad_norm": 0.07231912016868591, "learning_rate": 0.0001979444592779253, "loss": 0.0825, "step": 3440 }, { "epoch": 0.22238383838383838, "eval_bleu": 17.468634021598287, "eval_loss": 
0.09492091089487076, "eval_runtime": 2.6686, "eval_samples_per_second": 11.991, "eval_steps_per_second": 1.499, "step": 3440 }, { "epoch": 0.22244848484848484, "grad_norm": 0.09987115859985352, "learning_rate": 0.00019794307960814463, "loss": 0.1112, "step": 3441 }, { "epoch": 0.2225131313131313, "grad_norm": 0.07208310812711716, "learning_rate": 0.00019794169948031659, "loss": 0.0906, "step": 3442 }, { "epoch": 0.2225777777777778, "grad_norm": 0.07307442277669907, "learning_rate": 0.00019794031889444757, "loss": 0.0929, "step": 3443 }, { "epoch": 0.22264242424242425, "grad_norm": 0.07023077458143234, "learning_rate": 0.00019793893785054407, "loss": 0.0849, "step": 3444 }, { "epoch": 0.22270707070707071, "grad_norm": 0.07088685780763626, "learning_rate": 0.00019793755634861252, "loss": 0.0842, "step": 3445 }, { "epoch": 0.22277171717171718, "grad_norm": 0.07246281951665878, "learning_rate": 0.00019793617438865942, "loss": 0.1002, "step": 3446 }, { "epoch": 0.22283636363636364, "grad_norm": 0.07885023206472397, "learning_rate": 0.0001979347919706912, "loss": 0.0841, "step": 3447 }, { "epoch": 0.2229010101010101, "grad_norm": 0.06938537955284119, "learning_rate": 0.00019793340909471434, "loss": 0.0815, "step": 3448 }, { "epoch": 0.22296565656565656, "grad_norm": 0.07416706532239914, "learning_rate": 0.0001979320257607353, "loss": 0.101, "step": 3449 }, { "epoch": 0.22303030303030302, "grad_norm": 0.08499825745820999, "learning_rate": 0.00019793064196876054, "loss": 0.0846, "step": 3450 }, { "epoch": 0.22309494949494948, "grad_norm": 0.07244010269641876, "learning_rate": 0.00019792925771879656, "loss": 0.0869, "step": 3451 }, { "epoch": 0.22315959595959595, "grad_norm": 0.0722435936331749, "learning_rate": 0.0001979278730108498, "loss": 0.0948, "step": 3452 }, { "epoch": 0.22322424242424244, "grad_norm": 0.07287343591451645, "learning_rate": 0.00019792648784492677, "loss": 0.0945, "step": 3453 }, { "epoch": 0.2232888888888889, "grad_norm": 0.06159506365656853, 
"learning_rate": 0.0001979251022210339, "loss": 0.0763, "step": 3454 }, { "epoch": 0.22335353535353536, "grad_norm": 0.07969187200069427, "learning_rate": 0.00019792371613917774, "loss": 0.0998, "step": 3455 }, { "epoch": 0.22341818181818182, "grad_norm": 0.0770021602511406, "learning_rate": 0.0001979223295993647, "loss": 0.0828, "step": 3456 }, { "epoch": 0.22341818181818182, "eval_bleu": 13.031251092939797, "eval_loss": 0.09550341963768005, "eval_runtime": 2.8554, "eval_samples_per_second": 11.207, "eval_steps_per_second": 1.401, "step": 3456 }, { "epoch": 0.22348282828282828, "grad_norm": 0.0780697911977768, "learning_rate": 0.00019792094260160132, "loss": 0.0893, "step": 3457 }, { "epoch": 0.22354747474747474, "grad_norm": 0.06838620454072952, "learning_rate": 0.00019791955514589406, "loss": 0.0888, "step": 3458 }, { "epoch": 0.2236121212121212, "grad_norm": 0.07091367989778519, "learning_rate": 0.0001979181672322494, "loss": 0.0837, "step": 3459 }, { "epoch": 0.22367676767676767, "grad_norm": 0.07299026101827621, "learning_rate": 0.00019791677886067387, "loss": 0.0796, "step": 3460 }, { "epoch": 0.22374141414141413, "grad_norm": 0.07575448602437973, "learning_rate": 0.00019791539003117387, "loss": 0.0919, "step": 3461 }, { "epoch": 0.22380606060606062, "grad_norm": 0.06466193497180939, "learning_rate": 0.000197914000743756, "loss": 0.0835, "step": 3462 }, { "epoch": 0.22387070707070708, "grad_norm": 0.06787852197885513, "learning_rate": 0.00019791261099842675, "loss": 0.0918, "step": 3463 }, { "epoch": 0.22393535353535354, "grad_norm": 0.07614251971244812, "learning_rate": 0.00019791122079519256, "loss": 0.0944, "step": 3464 }, { "epoch": 0.224, "grad_norm": 0.07440026104450226, "learning_rate": 0.00019790983013405998, "loss": 0.1008, "step": 3465 }, { "epoch": 0.22406464646464647, "grad_norm": 0.07013943046331406, "learning_rate": 0.00019790843901503546, "loss": 0.0924, "step": 3466 }, { "epoch": 0.22412929292929293, "grad_norm": 0.09238637983798981, 
"learning_rate": 0.00019790704743812555, "loss": 0.1108, "step": 3467 }, { "epoch": 0.2241939393939394, "grad_norm": 0.07162702828645706, "learning_rate": 0.00019790565540333676, "loss": 0.0884, "step": 3468 }, { "epoch": 0.22425858585858585, "grad_norm": 0.07698137313127518, "learning_rate": 0.00019790426291067557, "loss": 0.1059, "step": 3469 }, { "epoch": 0.2243232323232323, "grad_norm": 0.06773233413696289, "learning_rate": 0.0001979028699601485, "loss": 0.0822, "step": 3470 }, { "epoch": 0.22438787878787878, "grad_norm": 0.07118669152259827, "learning_rate": 0.0001979014765517621, "loss": 0.0927, "step": 3471 }, { "epoch": 0.22445252525252526, "grad_norm": 0.08188159018754959, "learning_rate": 0.00019790008268552286, "loss": 0.1113, "step": 3472 }, { "epoch": 0.22445252525252526, "eval_bleu": 14.413165384938702, "eval_loss": 0.09582473337650299, "eval_runtime": 2.7649, "eval_samples_per_second": 11.574, "eval_steps_per_second": 1.447, "step": 3472 }, { "epoch": 0.22451717171717173, "grad_norm": 0.05508796125650406, "learning_rate": 0.00019789868836143728, "loss": 0.0728, "step": 3473 }, { "epoch": 0.2245818181818182, "grad_norm": 0.062334559857845306, "learning_rate": 0.0001978972935795119, "loss": 0.0807, "step": 3474 }, { "epoch": 0.22464646464646465, "grad_norm": 0.10257519036531448, "learning_rate": 0.00019789589833975324, "loss": 0.0855, "step": 3475 }, { "epoch": 0.2247111111111111, "grad_norm": 0.07012440264225006, "learning_rate": 0.00019789450264216782, "loss": 0.0894, "step": 3476 }, { "epoch": 0.22477575757575757, "grad_norm": 0.08081506192684174, "learning_rate": 0.0001978931064867622, "loss": 0.1035, "step": 3477 }, { "epoch": 0.22484040404040403, "grad_norm": 0.08611779659986496, "learning_rate": 0.00019789170987354288, "loss": 0.1049, "step": 3478 }, { "epoch": 0.2249050505050505, "grad_norm": 0.08547607809305191, "learning_rate": 0.00019789031280251638, "loss": 0.1157, "step": 3479 }, { "epoch": 0.22496969696969696, "grad_norm": 
0.07521193474531174, "learning_rate": 0.00019788891527368926, "loss": 0.0762, "step": 3480 }, { "epoch": 0.22503434343434345, "grad_norm": 0.07498076558113098, "learning_rate": 0.00019788751728706805, "loss": 0.0972, "step": 3481 }, { "epoch": 0.2250989898989899, "grad_norm": 0.07329077273607254, "learning_rate": 0.00019788611884265927, "loss": 0.0759, "step": 3482 }, { "epoch": 0.22516363636363637, "grad_norm": 0.1099148690700531, "learning_rate": 0.00019788471994046947, "loss": 0.0995, "step": 3483 }, { "epoch": 0.22522828282828283, "grad_norm": 0.06628064811229706, "learning_rate": 0.0001978833205805052, "loss": 0.0811, "step": 3484 }, { "epoch": 0.2252929292929293, "grad_norm": 0.07237538695335388, "learning_rate": 0.00019788192076277298, "loss": 0.0932, "step": 3485 }, { "epoch": 0.22535757575757576, "grad_norm": 0.0674620047211647, "learning_rate": 0.00019788052048727944, "loss": 0.0847, "step": 3486 }, { "epoch": 0.22542222222222222, "grad_norm": 0.07274454087018967, "learning_rate": 0.00019787911975403103, "loss": 0.0928, "step": 3487 }, { "epoch": 0.22548686868686868, "grad_norm": 0.09663775563240051, "learning_rate": 0.0001978777185630343, "loss": 0.1234, "step": 3488 }, { "epoch": 0.22548686868686868, "eval_bleu": 11.761584550655929, "eval_loss": 0.09453203529119492, "eval_runtime": 2.7702, "eval_samples_per_second": 11.551, "eval_steps_per_second": 1.444, "step": 3488 }, { "epoch": 0.22555151515151514, "grad_norm": 0.08604975789785385, "learning_rate": 0.00019787631691429587, "loss": 0.108, "step": 3489 }, { "epoch": 0.2256161616161616, "grad_norm": 0.07414434850215912, "learning_rate": 0.00019787491480782225, "loss": 0.0984, "step": 3490 }, { "epoch": 0.2256808080808081, "grad_norm": 0.07036276161670685, "learning_rate": 0.00019787351224362002, "loss": 0.093, "step": 3491 }, { "epoch": 0.22574545454545455, "grad_norm": 0.08143153041601181, "learning_rate": 0.0001978721092216957, "loss": 0.0881, "step": 3492 }, { "epoch": 0.22581010101010102, 
"grad_norm": 0.07141514122486115, "learning_rate": 0.00019787070574205592, "loss": 0.0873, "step": 3493 }, { "epoch": 0.22587474747474748, "grad_norm": 0.08046414703130722, "learning_rate": 0.00019786930180470721, "loss": 0.1154, "step": 3494 }, { "epoch": 0.22593939393939394, "grad_norm": 0.0730963721871376, "learning_rate": 0.00019786789740965612, "loss": 0.0968, "step": 3495 }, { "epoch": 0.2260040404040404, "grad_norm": 0.07489965856075287, "learning_rate": 0.0001978664925569092, "loss": 0.1047, "step": 3496 }, { "epoch": 0.22606868686868686, "grad_norm": 0.06927904486656189, "learning_rate": 0.00019786508724647307, "loss": 0.0713, "step": 3497 }, { "epoch": 0.22613333333333333, "grad_norm": 0.08132988959550858, "learning_rate": 0.00019786368147835427, "loss": 0.0933, "step": 3498 }, { "epoch": 0.2261979797979798, "grad_norm": 0.06674129515886307, "learning_rate": 0.00019786227525255942, "loss": 0.0937, "step": 3499 }, { "epoch": 0.22626262626262628, "grad_norm": 0.06554894894361496, "learning_rate": 0.00019786086856909502, "loss": 0.0855, "step": 3500 }, { "epoch": 0.22632727272727274, "grad_norm": 0.06557810306549072, "learning_rate": 0.0001978594614279677, "loss": 0.0863, "step": 3501 }, { "epoch": 0.2263919191919192, "grad_norm": 0.07035014033317566, "learning_rate": 0.00019785805382918406, "loss": 0.0884, "step": 3502 }, { "epoch": 0.22645656565656566, "grad_norm": 0.08184259384870529, "learning_rate": 0.00019785664577275062, "loss": 0.1097, "step": 3503 }, { "epoch": 0.22652121212121212, "grad_norm": 0.07167940586805344, "learning_rate": 0.000197855237258674, "loss": 0.0746, "step": 3504 }, { "epoch": 0.22652121212121212, "eval_bleu": 9.913851586833534, "eval_loss": 0.09565990418195724, "eval_runtime": 2.719, "eval_samples_per_second": 11.769, "eval_steps_per_second": 1.471, "step": 3504 }, { "epoch": 0.22658585858585858, "grad_norm": 0.0847000852227211, "learning_rate": 0.0001978538282869608, "loss": 0.1167, "step": 3505 }, { "epoch": 
0.22665050505050505, "grad_norm": 0.09329701215028763, "learning_rate": 0.00019785241885761758, "loss": 0.0818, "step": 3506 }, { "epoch": 0.2267151515151515, "grad_norm": 0.06925345957279205, "learning_rate": 0.000197851008970651, "loss": 0.089, "step": 3507 }, { "epoch": 0.22677979797979797, "grad_norm": 0.07653499394655228, "learning_rate": 0.00019784959862606752, "loss": 0.0968, "step": 3508 }, { "epoch": 0.22684444444444443, "grad_norm": 0.06895699352025986, "learning_rate": 0.00019784818782387387, "loss": 0.0775, "step": 3509 }, { "epoch": 0.22690909090909092, "grad_norm": 0.07260050624608994, "learning_rate": 0.00019784677656407658, "loss": 0.0965, "step": 3510 }, { "epoch": 0.22697373737373738, "grad_norm": 0.11276265978813171, "learning_rate": 0.00019784536484668226, "loss": 0.0853, "step": 3511 }, { "epoch": 0.22703838383838384, "grad_norm": 0.08519134670495987, "learning_rate": 0.00019784395267169748, "loss": 0.1117, "step": 3512 }, { "epoch": 0.2271030303030303, "grad_norm": 0.0735049694776535, "learning_rate": 0.00019784254003912892, "loss": 0.0968, "step": 3513 }, { "epoch": 0.22716767676767677, "grad_norm": 0.07618353515863419, "learning_rate": 0.00019784112694898315, "loss": 0.0846, "step": 3514 }, { "epoch": 0.22723232323232323, "grad_norm": 0.0764879509806633, "learning_rate": 0.00019783971340126677, "loss": 0.1017, "step": 3515 }, { "epoch": 0.2272969696969697, "grad_norm": 0.05671005696058273, "learning_rate": 0.0001978382993959864, "loss": 0.0709, "step": 3516 }, { "epoch": 0.22736161616161615, "grad_norm": 0.0625818520784378, "learning_rate": 0.00019783688493314863, "loss": 0.0862, "step": 3517 }, { "epoch": 0.22742626262626262, "grad_norm": 0.07804487645626068, "learning_rate": 0.0001978354700127601, "loss": 0.102, "step": 3518 }, { "epoch": 0.2274909090909091, "grad_norm": 0.06514550745487213, "learning_rate": 0.00019783405463482743, "loss": 0.0847, "step": 3519 }, { "epoch": 0.22755555555555557, "grad_norm": 0.056930817663669586, 
"learning_rate": 0.00019783263879935721, "loss": 0.0708, "step": 3520 }, { "epoch": 0.22755555555555557, "eval_bleu": 15.892232265484392, "eval_loss": 0.09597373008728027, "eval_runtime": 2.8643, "eval_samples_per_second": 11.172, "eval_steps_per_second": 1.396, "step": 3520 }, { "epoch": 0.22762020202020203, "grad_norm": 0.07230743020772934, "learning_rate": 0.0001978312225063561, "loss": 0.0854, "step": 3521 }, { "epoch": 0.2276848484848485, "grad_norm": 0.08224190771579742, "learning_rate": 0.0001978298057558307, "loss": 0.1051, "step": 3522 }, { "epoch": 0.22774949494949495, "grad_norm": 0.07027490437030792, "learning_rate": 0.00019782838854778766, "loss": 0.0966, "step": 3523 }, { "epoch": 0.2278141414141414, "grad_norm": 0.0777537077665329, "learning_rate": 0.00019782697088223356, "loss": 0.0929, "step": 3524 }, { "epoch": 0.22787878787878788, "grad_norm": 0.07830680161714554, "learning_rate": 0.00019782555275917507, "loss": 0.0973, "step": 3525 }, { "epoch": 0.22794343434343434, "grad_norm": 0.06828243285417557, "learning_rate": 0.0001978241341786188, "loss": 0.0876, "step": 3526 }, { "epoch": 0.2280080808080808, "grad_norm": 0.07176431268453598, "learning_rate": 0.00019782271514057139, "loss": 0.0926, "step": 3527 }, { "epoch": 0.22807272727272726, "grad_norm": 0.059330422431230545, "learning_rate": 0.0001978212956450395, "loss": 0.077, "step": 3528 }, { "epoch": 0.22813737373737375, "grad_norm": 0.067512147128582, "learning_rate": 0.00019781987569202972, "loss": 0.0959, "step": 3529 }, { "epoch": 0.2282020202020202, "grad_norm": 0.09087305516004562, "learning_rate": 0.00019781845528154873, "loss": 0.1233, "step": 3530 }, { "epoch": 0.22826666666666667, "grad_norm": 0.065886490046978, "learning_rate": 0.00019781703441360319, "loss": 0.0725, "step": 3531 }, { "epoch": 0.22833131313131314, "grad_norm": 0.06722399592399597, "learning_rate": 0.0001978156130881997, "loss": 0.0865, "step": 3532 }, { "epoch": 0.2283959595959596, "grad_norm": 0.060964904725551605, 
"learning_rate": 0.00019781419130534492, "loss": 0.0789, "step": 3533 }, { "epoch": 0.22846060606060606, "grad_norm": 0.11481403559446335, "learning_rate": 0.00019781276906504554, "loss": 0.1105, "step": 3534 }, { "epoch": 0.22852525252525252, "grad_norm": 0.07899876683950424, "learning_rate": 0.00019781134636730815, "loss": 0.0993, "step": 3535 }, { "epoch": 0.22858989898989898, "grad_norm": 0.06887764483690262, "learning_rate": 0.0001978099232121394, "loss": 0.0903, "step": 3536 }, { "epoch": 0.22858989898989898, "eval_bleu": 15.616825101250626, "eval_loss": 0.09498222172260284, "eval_runtime": 2.6981, "eval_samples_per_second": 11.86, "eval_steps_per_second": 1.483, "step": 3536 }, { "epoch": 0.22865454545454544, "grad_norm": 0.06531254202127457, "learning_rate": 0.000197808499599546, "loss": 0.0812, "step": 3537 }, { "epoch": 0.2287191919191919, "grad_norm": 0.0698668360710144, "learning_rate": 0.00019780707552953457, "loss": 0.0775, "step": 3538 }, { "epoch": 0.2287838383838384, "grad_norm": 0.06194218993186951, "learning_rate": 0.0001978056510021118, "loss": 0.0815, "step": 3539 }, { "epoch": 0.22884848484848486, "grad_norm": 0.0733053982257843, "learning_rate": 0.00019780422601728433, "loss": 0.0805, "step": 3540 }, { "epoch": 0.22891313131313132, "grad_norm": 0.07327724993228912, "learning_rate": 0.00019780280057505882, "loss": 0.0808, "step": 3541 }, { "epoch": 0.22897777777777778, "grad_norm": 0.07861881703138351, "learning_rate": 0.00019780137467544196, "loss": 0.0957, "step": 3542 }, { "epoch": 0.22904242424242424, "grad_norm": 0.08001697063446045, "learning_rate": 0.0001977999483184404, "loss": 0.0867, "step": 3543 }, { "epoch": 0.2291070707070707, "grad_norm": 0.07066141068935394, "learning_rate": 0.0001977985215040608, "loss": 0.086, "step": 3544 }, { "epoch": 0.22917171717171717, "grad_norm": 0.07036232948303223, "learning_rate": 0.00019779709423230985, "loss": 0.0858, "step": 3545 }, { "epoch": 0.22923636363636363, "grad_norm": 0.07283928990364075, 
"learning_rate": 0.00019779566650319423, "loss": 0.0904, "step": 3546 }, { "epoch": 0.2293010101010101, "grad_norm": 0.08753010630607605, "learning_rate": 0.0001977942383167206, "loss": 0.0975, "step": 3547 }, { "epoch": 0.22936565656565658, "grad_norm": 0.07472671568393707, "learning_rate": 0.00019779280967289564, "loss": 0.0958, "step": 3548 }, { "epoch": 0.22943030303030304, "grad_norm": 0.07429744303226471, "learning_rate": 0.00019779138057172605, "loss": 0.0957, "step": 3549 }, { "epoch": 0.2294949494949495, "grad_norm": 0.07320588827133179, "learning_rate": 0.0001977899510132185, "loss": 0.0815, "step": 3550 }, { "epoch": 0.22955959595959596, "grad_norm": 0.06866639107465744, "learning_rate": 0.0001977885209973797, "loss": 0.0889, "step": 3551 }, { "epoch": 0.22962424242424243, "grad_norm": 0.08037056773900986, "learning_rate": 0.0001977870905242163, "loss": 0.102, "step": 3552 }, { "epoch": 0.22962424242424243, "eval_bleu": 13.410676053624968, "eval_loss": 0.09431174397468567, "eval_runtime": 2.827, "eval_samples_per_second": 11.32, "eval_steps_per_second": 1.415, "step": 3552 }, { "epoch": 0.2296888888888889, "grad_norm": 0.07771310955286026, "learning_rate": 0.00019778565959373498, "loss": 0.0973, "step": 3553 }, { "epoch": 0.22975353535353535, "grad_norm": 0.06742063909769058, "learning_rate": 0.00019778422820594248, "loss": 0.0853, "step": 3554 }, { "epoch": 0.2298181818181818, "grad_norm": 0.06602583080530167, "learning_rate": 0.00019778279636084548, "loss": 0.1, "step": 3555 }, { "epoch": 0.22988282828282827, "grad_norm": 0.08072684705257416, "learning_rate": 0.00019778136405845066, "loss": 0.1173, "step": 3556 }, { "epoch": 0.22994747474747473, "grad_norm": 0.06625766307115555, "learning_rate": 0.0001977799312987647, "loss": 0.0845, "step": 3557 }, { "epoch": 0.23001212121212122, "grad_norm": 0.08137887716293335, "learning_rate": 0.00019777849808179436, "loss": 0.1027, "step": 3558 }, { "epoch": 0.23007676767676769, "grad_norm": 0.07646719366312027, 
"learning_rate": 0.0001977770644075463, "loss": 0.1111, "step": 3559 }, { "epoch": 0.23014141414141415, "grad_norm": 0.08114475756883621, "learning_rate": 0.00019777563027602725, "loss": 0.0729, "step": 3560 }, { "epoch": 0.2302060606060606, "grad_norm": 0.06272187829017639, "learning_rate": 0.00019777419568724387, "loss": 0.0854, "step": 3561 }, { "epoch": 0.23027070707070707, "grad_norm": 0.09199637174606323, "learning_rate": 0.00019777276064120294, "loss": 0.1105, "step": 3562 }, { "epoch": 0.23033535353535353, "grad_norm": 0.06557215750217438, "learning_rate": 0.0001977713251379111, "loss": 0.0832, "step": 3563 }, { "epoch": 0.2304, "grad_norm": 0.07381037622690201, "learning_rate": 0.0001977698891773751, "loss": 0.0949, "step": 3564 }, { "epoch": 0.23046464646464646, "grad_norm": 0.06555640697479248, "learning_rate": 0.00019776845275960164, "loss": 0.0838, "step": 3565 }, { "epoch": 0.23052929292929292, "grad_norm": 0.06982017308473587, "learning_rate": 0.00019776701588459746, "loss": 0.0943, "step": 3566 }, { "epoch": 0.2305939393939394, "grad_norm": 0.07609958946704865, "learning_rate": 0.0001977655785523693, "loss": 0.1074, "step": 3567 }, { "epoch": 0.23065858585858587, "grad_norm": 0.07699093222618103, "learning_rate": 0.0001977641407629238, "loss": 0.1091, "step": 3568 }, { "epoch": 0.23065858585858587, "eval_bleu": 14.150156222515426, "eval_loss": 0.093361034989357, "eval_runtime": 2.8557, "eval_samples_per_second": 11.206, "eval_steps_per_second": 1.401, "step": 3568 }, { "epoch": 0.23072323232323233, "grad_norm": 0.08846042305231094, "learning_rate": 0.00019776270251626774, "loss": 0.1029, "step": 3569 }, { "epoch": 0.2307878787878788, "grad_norm": 0.06828977912664413, "learning_rate": 0.00019776126381240787, "loss": 0.0937, "step": 3570 }, { "epoch": 0.23085252525252525, "grad_norm": 0.07474086433649063, "learning_rate": 0.00019775982465135086, "loss": 0.0944, "step": 3571 }, { "epoch": 0.23091717171717172, "grad_norm": 0.06815038621425629, 
"learning_rate": 0.00019775838503310346, "loss": 0.0901, "step": 3572 }, { "epoch": 0.23098181818181818, "grad_norm": 0.06756641715765, "learning_rate": 0.00019775694495767246, "loss": 0.0817, "step": 3573 }, { "epoch": 0.23104646464646464, "grad_norm": 0.07005354762077332, "learning_rate": 0.00019775550442506452, "loss": 0.0896, "step": 3574 }, { "epoch": 0.2311111111111111, "grad_norm": 0.08190398663282394, "learning_rate": 0.0001977540634352864, "loss": 0.0967, "step": 3575 }, { "epoch": 0.23117575757575756, "grad_norm": 0.07702455669641495, "learning_rate": 0.00019775262198834482, "loss": 0.0904, "step": 3576 }, { "epoch": 0.23124040404040405, "grad_norm": 0.08336713910102844, "learning_rate": 0.00019775118008424656, "loss": 0.0997, "step": 3577 }, { "epoch": 0.2313050505050505, "grad_norm": 0.07796916365623474, "learning_rate": 0.00019774973772299835, "loss": 0.103, "step": 3578 }, { "epoch": 0.23136969696969698, "grad_norm": 0.07120048254728317, "learning_rate": 0.00019774829490460695, "loss": 0.0834, "step": 3579 }, { "epoch": 0.23143434343434344, "grad_norm": 0.07230277359485626, "learning_rate": 0.00019774685162907908, "loss": 0.0942, "step": 3580 }, { "epoch": 0.2314989898989899, "grad_norm": 0.06850824505090714, "learning_rate": 0.00019774540789642148, "loss": 0.0824, "step": 3581 }, { "epoch": 0.23156363636363636, "grad_norm": 0.0780918151140213, "learning_rate": 0.00019774396370664094, "loss": 0.0979, "step": 3582 }, { "epoch": 0.23162828282828282, "grad_norm": 0.07548228651285172, "learning_rate": 0.00019774251905974418, "loss": 0.0837, "step": 3583 }, { "epoch": 0.23169292929292928, "grad_norm": 0.06877511739730835, "learning_rate": 0.000197741073955738, "loss": 0.0901, "step": 3584 }, { "epoch": 0.23169292929292928, "eval_bleu": 11.340700999418932, "eval_loss": 0.09412933886051178, "eval_runtime": 2.711, "eval_samples_per_second": 11.804, "eval_steps_per_second": 1.475, "step": 3584 }, { "epoch": 0.23175757575757575, "grad_norm": 
0.10319288820028305, "learning_rate": 0.0001977396283946291, "loss": 0.1133, "step": 3585 }, { "epoch": 0.23182222222222224, "grad_norm": 0.09334880858659744, "learning_rate": 0.0001977381823764243, "loss": 0.0972, "step": 3586 }, { "epoch": 0.2318868686868687, "grad_norm": 0.0849304124712944, "learning_rate": 0.00019773673590113032, "loss": 0.1144, "step": 3587 }, { "epoch": 0.23195151515151516, "grad_norm": 0.07076696306467056, "learning_rate": 0.00019773528896875392, "loss": 0.0971, "step": 3588 }, { "epoch": 0.23201616161616162, "grad_norm": 0.07317604124546051, "learning_rate": 0.0001977338415793019, "loss": 0.0904, "step": 3589 }, { "epoch": 0.23208080808080808, "grad_norm": 0.06971902400255203, "learning_rate": 0.000197732393732781, "loss": 0.0863, "step": 3590 }, { "epoch": 0.23214545454545454, "grad_norm": 0.08120656758546829, "learning_rate": 0.000197730945429198, "loss": 0.105, "step": 3591 }, { "epoch": 0.232210101010101, "grad_norm": 0.0819096565246582, "learning_rate": 0.00019772949666855972, "loss": 0.0994, "step": 3592 }, { "epoch": 0.23227474747474747, "grad_norm": 0.060589749366045, "learning_rate": 0.00019772804745087284, "loss": 0.0834, "step": 3593 }, { "epoch": 0.23233939393939393, "grad_norm": 0.07656625658273697, "learning_rate": 0.00019772659777614421, "loss": 0.0943, "step": 3594 }, { "epoch": 0.2324040404040404, "grad_norm": 0.06892391294240952, "learning_rate": 0.0001977251476443806, "loss": 0.0896, "step": 3595 }, { "epoch": 0.23246868686868688, "grad_norm": 0.0657782256603241, "learning_rate": 0.00019772369705558878, "loss": 0.0866, "step": 3596 }, { "epoch": 0.23253333333333334, "grad_norm": 0.08330164104700089, "learning_rate": 0.00019772224600977554, "loss": 0.1073, "step": 3597 }, { "epoch": 0.2325979797979798, "grad_norm": 0.06975338608026505, "learning_rate": 0.00019772079450694764, "loss": 0.0906, "step": 3598 }, { "epoch": 0.23266262626262627, "grad_norm": 0.06488228589296341, "learning_rate": 0.00019771934254711191, "loss": 
0.0796, "step": 3599 }, { "epoch": 0.23272727272727273, "grad_norm": 0.0792783722281456, "learning_rate": 0.0001977178901302751, "loss": 0.0935, "step": 3600 }, { "epoch": 0.23272727272727273, "eval_bleu": 14.084225994172748, "eval_loss": 0.0942840650677681, "eval_runtime": 2.8737, "eval_samples_per_second": 11.136, "eval_steps_per_second": 1.392, "step": 3600 }, { "epoch": 0.2327919191919192, "grad_norm": 0.07156125456094742, "learning_rate": 0.00019771643725644404, "loss": 0.0955, "step": 3601 }, { "epoch": 0.23285656565656565, "grad_norm": 0.09084650874137878, "learning_rate": 0.0001977149839256255, "loss": 0.0954, "step": 3602 }, { "epoch": 0.2329212121212121, "grad_norm": 0.08035772293806076, "learning_rate": 0.0001977135301378263, "loss": 0.0873, "step": 3603 }, { "epoch": 0.23298585858585857, "grad_norm": 0.06201196834445, "learning_rate": 0.00019771207589305322, "loss": 0.0712, "step": 3604 }, { "epoch": 0.23305050505050506, "grad_norm": 0.0698280930519104, "learning_rate": 0.00019771062119131302, "loss": 0.0814, "step": 3605 }, { "epoch": 0.23311515151515153, "grad_norm": 0.06953717023134232, "learning_rate": 0.0001977091660326126, "loss": 0.0907, "step": 3606 }, { "epoch": 0.233179797979798, "grad_norm": 0.0626969188451767, "learning_rate": 0.00019770771041695867, "loss": 0.0811, "step": 3607 }, { "epoch": 0.23324444444444445, "grad_norm": 0.07701700180768967, "learning_rate": 0.00019770625434435812, "loss": 0.0972, "step": 3608 }, { "epoch": 0.2333090909090909, "grad_norm": 0.06883645802736282, "learning_rate": 0.0001977047978148177, "loss": 0.0827, "step": 3609 }, { "epoch": 0.23337373737373737, "grad_norm": 0.0721801370382309, "learning_rate": 0.00019770334082834423, "loss": 0.1051, "step": 3610 }, { "epoch": 0.23343838383838383, "grad_norm": 0.07021838426589966, "learning_rate": 0.00019770188338494456, "loss": 0.0928, "step": 3611 }, { "epoch": 0.2335030303030303, "grad_norm": 0.07905546575784683, "learning_rate": 0.00019770042548462549, "loss": 
0.092, "step": 3612 }, { "epoch": 0.23356767676767676, "grad_norm": 0.06617802381515503, "learning_rate": 0.0001976989671273938, "loss": 0.0774, "step": 3613 }, { "epoch": 0.23363232323232322, "grad_norm": 0.06899509578943253, "learning_rate": 0.00019769750831325636, "loss": 0.0915, "step": 3614 }, { "epoch": 0.2336969696969697, "grad_norm": 0.0789894387125969, "learning_rate": 0.00019769604904221994, "loss": 0.0944, "step": 3615 }, { "epoch": 0.23376161616161617, "grad_norm": 0.0746842548251152, "learning_rate": 0.00019769458931429143, "loss": 0.0967, "step": 3616 }, { "epoch": 0.23376161616161617, "eval_bleu": 11.203320568217007, "eval_loss": 0.09357260167598724, "eval_runtime": 2.7282, "eval_samples_per_second": 11.729, "eval_steps_per_second": 1.466, "step": 3616 }, { "epoch": 0.23382626262626263, "grad_norm": 0.08139018714427948, "learning_rate": 0.0001976931291294776, "loss": 0.0945, "step": 3617 }, { "epoch": 0.2338909090909091, "grad_norm": 0.06343588978052139, "learning_rate": 0.0001976916684877853, "loss": 0.0885, "step": 3618 }, { "epoch": 0.23395555555555556, "grad_norm": 0.08190052956342697, "learning_rate": 0.00019769020738922137, "loss": 0.0884, "step": 3619 }, { "epoch": 0.23402020202020202, "grad_norm": 0.08500027656555176, "learning_rate": 0.00019768874583379266, "loss": 0.1037, "step": 3620 }, { "epoch": 0.23408484848484848, "grad_norm": 0.07637444138526917, "learning_rate": 0.00019768728382150595, "loss": 0.0916, "step": 3621 }, { "epoch": 0.23414949494949494, "grad_norm": 0.06660114973783493, "learning_rate": 0.00019768582135236813, "loss": 0.0768, "step": 3622 }, { "epoch": 0.2342141414141414, "grad_norm": 0.06850486248731613, "learning_rate": 0.000197684358426386, "loss": 0.0915, "step": 3623 }, { "epoch": 0.2342787878787879, "grad_norm": 0.06507407873868942, "learning_rate": 0.00019768289504356644, "loss": 0.0856, "step": 3624 }, { "epoch": 0.23434343434343435, "grad_norm": 0.06116962805390358, "learning_rate": 0.00019768143120391625, 
"loss": 0.0775, "step": 3625 }, { "epoch": 0.23440808080808082, "grad_norm": 0.07707644999027252, "learning_rate": 0.0001976799669074423, "loss": 0.0984, "step": 3626 }, { "epoch": 0.23447272727272728, "grad_norm": 0.07304911315441132, "learning_rate": 0.00019767850215415144, "loss": 0.094, "step": 3627 }, { "epoch": 0.23453737373737374, "grad_norm": 0.08153322339057922, "learning_rate": 0.00019767703694405055, "loss": 0.1062, "step": 3628 }, { "epoch": 0.2346020202020202, "grad_norm": 0.07076204568147659, "learning_rate": 0.0001976755712771464, "loss": 0.0979, "step": 3629 }, { "epoch": 0.23466666666666666, "grad_norm": 0.06623207032680511, "learning_rate": 0.00019767410515344593, "loss": 0.0854, "step": 3630 }, { "epoch": 0.23473131313131312, "grad_norm": 0.07027088105678558, "learning_rate": 0.00019767263857295596, "loss": 0.0829, "step": 3631 }, { "epoch": 0.23479595959595959, "grad_norm": 0.08217540383338928, "learning_rate": 0.00019767117153568332, "loss": 0.0844, "step": 3632 }, { "epoch": 0.23479595959595959, "eval_bleu": 13.975432203529325, "eval_loss": 0.09312742948532104, "eval_runtime": 2.7876, "eval_samples_per_second": 11.48, "eval_steps_per_second": 1.435, "step": 3632 }, { "epoch": 0.23486060606060605, "grad_norm": 0.07628919929265976, "learning_rate": 0.00019766970404163492, "loss": 0.097, "step": 3633 }, { "epoch": 0.23492525252525254, "grad_norm": 0.1570323258638382, "learning_rate": 0.00019766823609081762, "loss": 0.0788, "step": 3634 }, { "epoch": 0.234989898989899, "grad_norm": 0.07840665429830551, "learning_rate": 0.00019766676768323823, "loss": 0.0859, "step": 3635 }, { "epoch": 0.23505454545454546, "grad_norm": 0.07596184313297272, "learning_rate": 0.00019766529881890368, "loss": 0.1035, "step": 3636 }, { "epoch": 0.23511919191919192, "grad_norm": 0.08474904298782349, "learning_rate": 0.0001976638294978208, "loss": 0.0985, "step": 3637 }, { "epoch": 0.23518383838383838, "grad_norm": 0.07941339910030365, "learning_rate": 
0.0001976623597199965, "loss": 0.1058, "step": 3638 }, { "epoch": 0.23524848484848485, "grad_norm": 0.0766904279589653, "learning_rate": 0.00019766088948543762, "loss": 0.0946, "step": 3639 }, { "epoch": 0.2353131313131313, "grad_norm": 0.06431964784860611, "learning_rate": 0.00019765941879415104, "loss": 0.0703, "step": 3640 }, { "epoch": 0.23537777777777777, "grad_norm": 0.07155662775039673, "learning_rate": 0.00019765794764614364, "loss": 0.0867, "step": 3641 }, { "epoch": 0.23544242424242423, "grad_norm": 0.08403602987527847, "learning_rate": 0.0001976564760414223, "loss": 0.0915, "step": 3642 }, { "epoch": 0.23550707070707072, "grad_norm": 0.1112782284617424, "learning_rate": 0.00019765500397999394, "loss": 0.0886, "step": 3643 }, { "epoch": 0.23557171717171718, "grad_norm": 0.08049486577510834, "learning_rate": 0.00019765353146186535, "loss": 0.1019, "step": 3644 }, { "epoch": 0.23563636363636364, "grad_norm": 0.059750527143478394, "learning_rate": 0.00019765205848704353, "loss": 0.0625, "step": 3645 }, { "epoch": 0.2357010101010101, "grad_norm": 0.0703006312251091, "learning_rate": 0.00019765058505553527, "loss": 0.0887, "step": 3646 }, { "epoch": 0.23576565656565657, "grad_norm": 0.07186093926429749, "learning_rate": 0.00019764911116734756, "loss": 0.1051, "step": 3647 }, { "epoch": 0.23583030303030303, "grad_norm": 0.08068804442882538, "learning_rate": 0.00019764763682248723, "loss": 0.1087, "step": 3648 }, { "epoch": 0.23583030303030303, "eval_bleu": 12.768984312443392, "eval_loss": 0.09455497562885284, "eval_runtime": 2.661, "eval_samples_per_second": 12.025, "eval_steps_per_second": 1.503, "step": 3648 }, { "epoch": 0.2358949494949495, "grad_norm": 0.07385983318090439, "learning_rate": 0.00019764616202096115, "loss": 0.0876, "step": 3649 }, { "epoch": 0.23595959595959595, "grad_norm": 0.07834664732217789, "learning_rate": 0.00019764468676277628, "loss": 0.0959, "step": 3650 }, { "epoch": 0.23602424242424241, "grad_norm": 0.06544079631567001, 
"learning_rate": 0.0001976432110479395, "loss": 0.0856, "step": 3651 }, { "epoch": 0.23608888888888888, "grad_norm": 0.06250978261232376, "learning_rate": 0.00019764173487645765, "loss": 0.0819, "step": 3652 }, { "epoch": 0.23615353535353537, "grad_norm": 0.0783507451415062, "learning_rate": 0.00019764025824833774, "loss": 0.0966, "step": 3653 }, { "epoch": 0.23621818181818183, "grad_norm": 0.06845230609178543, "learning_rate": 0.0001976387811635866, "loss": 0.0834, "step": 3654 }, { "epoch": 0.2362828282828283, "grad_norm": 0.06890866160392761, "learning_rate": 0.00019763730362221116, "loss": 0.0819, "step": 3655 }, { "epoch": 0.23634747474747475, "grad_norm": 0.07947429269552231, "learning_rate": 0.0001976358256242183, "loss": 0.1124, "step": 3656 }, { "epoch": 0.2364121212121212, "grad_norm": 0.0641227439045906, "learning_rate": 0.00019763434716961502, "loss": 0.078, "step": 3657 }, { "epoch": 0.23647676767676767, "grad_norm": 0.08110956102609634, "learning_rate": 0.00019763286825840814, "loss": 0.097, "step": 3658 }, { "epoch": 0.23654141414141414, "grad_norm": 0.06283240020275116, "learning_rate": 0.0001976313888906046, "loss": 0.0858, "step": 3659 }, { "epoch": 0.2366060606060606, "grad_norm": 0.06079525500535965, "learning_rate": 0.00019762990906621136, "loss": 0.0758, "step": 3660 }, { "epoch": 0.23667070707070706, "grad_norm": 0.06878741830587387, "learning_rate": 0.00019762842878523528, "loss": 0.0933, "step": 3661 }, { "epoch": 0.23673535353535355, "grad_norm": 0.07569827884435654, "learning_rate": 0.00019762694804768333, "loss": 0.1065, "step": 3662 }, { "epoch": 0.2368, "grad_norm": 0.07727054506540298, "learning_rate": 0.00019762546685356245, "loss": 0.0873, "step": 3663 }, { "epoch": 0.23686464646464647, "grad_norm": 0.06651638448238373, "learning_rate": 0.0001976239852028795, "loss": 0.0828, "step": 3664 }, { "epoch": 0.23686464646464647, "eval_bleu": 12.938585454638043, "eval_loss": 0.09288673102855682, "eval_runtime": 2.8034, 
"eval_samples_per_second": 11.415, "eval_steps_per_second": 1.427, "step": 3664 }, { "epoch": 0.23692929292929293, "grad_norm": 0.08869357407093048, "learning_rate": 0.00019762250309564145, "loss": 0.0954, "step": 3665 }, { "epoch": 0.2369939393939394, "grad_norm": 0.07288382947444916, "learning_rate": 0.00019762102053185525, "loss": 0.0933, "step": 3666 }, { "epoch": 0.23705858585858586, "grad_norm": 0.07795406132936478, "learning_rate": 0.00019761953751152778, "loss": 0.115, "step": 3667 }, { "epoch": 0.23712323232323232, "grad_norm": 0.074552021920681, "learning_rate": 0.00019761805403466603, "loss": 0.1013, "step": 3668 }, { "epoch": 0.23718787878787878, "grad_norm": 0.056436918675899506, "learning_rate": 0.00019761657010127688, "loss": 0.0777, "step": 3669 }, { "epoch": 0.23725252525252524, "grad_norm": 0.07594527304172516, "learning_rate": 0.0001976150857113673, "loss": 0.1091, "step": 3670 }, { "epoch": 0.2373171717171717, "grad_norm": 0.07751249521970749, "learning_rate": 0.00019761360086494427, "loss": 0.0935, "step": 3671 }, { "epoch": 0.2373818181818182, "grad_norm": 0.07881151884794235, "learning_rate": 0.0001976121155620147, "loss": 0.0835, "step": 3672 }, { "epoch": 0.23744646464646466, "grad_norm": 0.06939269602298737, "learning_rate": 0.00019761062980258552, "loss": 0.0926, "step": 3673 }, { "epoch": 0.23751111111111112, "grad_norm": 0.06641973555088043, "learning_rate": 0.0001976091435866637, "loss": 0.0899, "step": 3674 }, { "epoch": 0.23757575757575758, "grad_norm": 0.06272200495004654, "learning_rate": 0.00019760765691425615, "loss": 0.0891, "step": 3675 }, { "epoch": 0.23764040404040404, "grad_norm": 0.08037877827882767, "learning_rate": 0.0001976061697853699, "loss": 0.0865, "step": 3676 }, { "epoch": 0.2377050505050505, "grad_norm": 0.09082460403442383, "learning_rate": 0.00019760468220001185, "loss": 0.0827, "step": 3677 }, { "epoch": 0.23776969696969696, "grad_norm": 0.07090919464826584, "learning_rate": 0.00019760319415818898, "loss": 
0.0874, "step": 3678 }, { "epoch": 0.23783434343434343, "grad_norm": 0.0651535913348198, "learning_rate": 0.0001976017056599082, "loss": 0.0967, "step": 3679 }, { "epoch": 0.2378989898989899, "grad_norm": 0.06856171786785126, "learning_rate": 0.00019760021670517651, "loss": 0.0982, "step": 3680 }, { "epoch": 0.2378989898989899, "eval_bleu": 12.430582261603249, "eval_loss": 0.09336743503808975, "eval_runtime": 2.7156, "eval_samples_per_second": 11.784, "eval_steps_per_second": 1.473, "step": 3680 }, { "epoch": 0.23796363636363638, "grad_norm": 0.07141885161399841, "learning_rate": 0.00019759872729400093, "loss": 0.1056, "step": 3681 }, { "epoch": 0.23802828282828284, "grad_norm": 0.07335217297077179, "learning_rate": 0.00019759723742638832, "loss": 0.098, "step": 3682 }, { "epoch": 0.2380929292929293, "grad_norm": 0.059730514883995056, "learning_rate": 0.0001975957471023457, "loss": 0.0815, "step": 3683 }, { "epoch": 0.23815757575757576, "grad_norm": 0.0720212310552597, "learning_rate": 0.00019759425632188005, "loss": 0.0968, "step": 3684 }, { "epoch": 0.23822222222222222, "grad_norm": 0.08765530586242676, "learning_rate": 0.00019759276508499833, "loss": 0.0827, "step": 3685 }, { "epoch": 0.23828686868686869, "grad_norm": 0.07773616164922714, "learning_rate": 0.00019759127339170752, "loss": 0.0963, "step": 3686 }, { "epoch": 0.23835151515151515, "grad_norm": 0.08143138885498047, "learning_rate": 0.00019758978124201457, "loss": 0.0947, "step": 3687 }, { "epoch": 0.2384161616161616, "grad_norm": 0.06535112857818604, "learning_rate": 0.00019758828863592647, "loss": 0.0753, "step": 3688 }, { "epoch": 0.23848080808080807, "grad_norm": 0.07925257831811905, "learning_rate": 0.0001975867955734502, "loss": 0.0929, "step": 3689 }, { "epoch": 0.23854545454545453, "grad_norm": 0.06724110245704651, "learning_rate": 0.00019758530205459275, "loss": 0.0865, "step": 3690 }, { "epoch": 0.23861010101010102, "grad_norm": 0.06838397681713104, "learning_rate": 0.00019758380807936114, 
"loss": 0.084, "step": 3691 }, { "epoch": 0.23867474747474748, "grad_norm": 0.06694263964891434, "learning_rate": 0.00019758231364776227, "loss": 0.0807, "step": 3692 }, { "epoch": 0.23873939393939395, "grad_norm": 0.07084178924560547, "learning_rate": 0.00019758081875980322, "loss": 0.0859, "step": 3693 }, { "epoch": 0.2388040404040404, "grad_norm": 0.07186637818813324, "learning_rate": 0.00019757932341549092, "loss": 0.0836, "step": 3694 }, { "epoch": 0.23886868686868687, "grad_norm": 0.0787421241402626, "learning_rate": 0.0001975778276148324, "loss": 0.0987, "step": 3695 }, { "epoch": 0.23893333333333333, "grad_norm": 0.07246874272823334, "learning_rate": 0.00019757633135783462, "loss": 0.088, "step": 3696 }, { "epoch": 0.23893333333333333, "eval_bleu": 16.48330870602483, "eval_loss": 0.09360860288143158, "eval_runtime": 2.6711, "eval_samples_per_second": 11.98, "eval_steps_per_second": 1.497, "step": 3696 }, { "epoch": 0.2389979797979798, "grad_norm": 0.0702347606420517, "learning_rate": 0.00019757483464450458, "loss": 0.084, "step": 3697 }, { "epoch": 0.23906262626262625, "grad_norm": 0.06494635343551636, "learning_rate": 0.00019757333747484935, "loss": 0.0842, "step": 3698 }, { "epoch": 0.23912727272727272, "grad_norm": 0.06721460819244385, "learning_rate": 0.00019757183984887584, "loss": 0.0821, "step": 3699 }, { "epoch": 0.2391919191919192, "grad_norm": 0.06768114119768143, "learning_rate": 0.0001975703417665911, "loss": 0.0912, "step": 3700 }, { "epoch": 0.23925656565656567, "grad_norm": 0.07227031886577606, "learning_rate": 0.00019756884322800212, "loss": 0.0931, "step": 3701 }, { "epoch": 0.23932121212121213, "grad_norm": 0.06979217380285263, "learning_rate": 0.00019756734423311592, "loss": 0.0902, "step": 3702 }, { "epoch": 0.2393858585858586, "grad_norm": 0.06806640326976776, "learning_rate": 0.00019756584478193952, "loss": 0.0911, "step": 3703 }, { "epoch": 0.23945050505050505, "grad_norm": 0.0762626975774765, "learning_rate": 0.0001975643448744799, 
"loss": 0.0903, "step": 3704 }, { "epoch": 0.23951515151515151, "grad_norm": 0.0667550340294838, "learning_rate": 0.00019756284451074408, "loss": 0.0858, "step": 3705 }, { "epoch": 0.23957979797979798, "grad_norm": 0.06587424129247665, "learning_rate": 0.00019756134369073913, "loss": 0.0792, "step": 3706 }, { "epoch": 0.23964444444444444, "grad_norm": 0.09357219934463501, "learning_rate": 0.000197559842414472, "loss": 0.0813, "step": 3707 }, { "epoch": 0.2397090909090909, "grad_norm": 0.07269969582557678, "learning_rate": 0.00019755834068194977, "loss": 0.0879, "step": 3708 }, { "epoch": 0.23977373737373736, "grad_norm": 0.0814739465713501, "learning_rate": 0.0001975568384931794, "loss": 0.0893, "step": 3709 }, { "epoch": 0.23983838383838385, "grad_norm": 0.0665619745850563, "learning_rate": 0.00019755533584816794, "loss": 0.0863, "step": 3710 }, { "epoch": 0.2399030303030303, "grad_norm": 0.09224192053079605, "learning_rate": 0.00019755383274692244, "loss": 0.1177, "step": 3711 }, { "epoch": 0.23996767676767677, "grad_norm": 0.08250459283590317, "learning_rate": 0.00019755232918944993, "loss": 0.0994, "step": 3712 }, { "epoch": 0.23996767676767677, "eval_bleu": 17.509389435750645, "eval_loss": 0.09335353970527649, "eval_runtime": 2.8097, "eval_samples_per_second": 11.389, "eval_steps_per_second": 1.424, "step": 3712 }, { "epoch": 0.24003232323232324, "grad_norm": 0.262030690908432, "learning_rate": 0.0001975508251757574, "loss": 0.1423, "step": 3713 }, { "epoch": 0.2400969696969697, "grad_norm": 0.09390987455844879, "learning_rate": 0.00019754932070585195, "loss": 0.0976, "step": 3714 }, { "epoch": 0.24016161616161616, "grad_norm": 0.06546153873205185, "learning_rate": 0.00019754781577974054, "loss": 0.0846, "step": 3715 }, { "epoch": 0.24022626262626262, "grad_norm": 0.07396513223648071, "learning_rate": 0.00019754631039743025, "loss": 0.1083, "step": 3716 }, { "epoch": 0.24029090909090908, "grad_norm": 0.07573252171278, "learning_rate": 0.00019754480455892815, 
"loss": 0.1012, "step": 3717 }, { "epoch": 0.24035555555555554, "grad_norm": 0.06593775749206543, "learning_rate": 0.00019754329826424122, "loss": 0.0908, "step": 3718 }, { "epoch": 0.24042020202020203, "grad_norm": 0.07340795546770096, "learning_rate": 0.00019754179151337651, "loss": 0.0937, "step": 3719 }, { "epoch": 0.2404848484848485, "grad_norm": 0.06906809657812119, "learning_rate": 0.0001975402843063411, "loss": 0.0982, "step": 3720 }, { "epoch": 0.24054949494949496, "grad_norm": 0.06268132477998734, "learning_rate": 0.00019753877664314203, "loss": 0.084, "step": 3721 }, { "epoch": 0.24061414141414142, "grad_norm": 0.07329512387514114, "learning_rate": 0.00019753726852378637, "loss": 0.0995, "step": 3722 }, { "epoch": 0.24067878787878788, "grad_norm": 0.06691072881221771, "learning_rate": 0.00019753575994828113, "loss": 0.082, "step": 3723 }, { "epoch": 0.24074343434343434, "grad_norm": 0.08100910484790802, "learning_rate": 0.00019753425091663336, "loss": 0.1071, "step": 3724 }, { "epoch": 0.2408080808080808, "grad_norm": 0.07027315348386765, "learning_rate": 0.0001975327414288502, "loss": 0.091, "step": 3725 }, { "epoch": 0.24087272727272727, "grad_norm": 0.06942614912986755, "learning_rate": 0.0001975312314849386, "loss": 0.0879, "step": 3726 }, { "epoch": 0.24093737373737373, "grad_norm": 0.07370313256978989, "learning_rate": 0.00019752972108490568, "loss": 0.0972, "step": 3727 }, { "epoch": 0.2410020202020202, "grad_norm": 0.07130023837089539, "learning_rate": 0.0001975282102287585, "loss": 0.0911, "step": 3728 }, { "epoch": 0.2410020202020202, "eval_bleu": 14.016115681577919, "eval_loss": 0.09258325397968292, "eval_runtime": 2.7042, "eval_samples_per_second": 11.833, "eval_steps_per_second": 1.479, "step": 3728 }, { "epoch": 0.24106666666666668, "grad_norm": 0.08074845373630524, "learning_rate": 0.00019752669891650416, "loss": 0.1003, "step": 3729 }, { "epoch": 0.24113131313131314, "grad_norm": 0.08117370307445526, "learning_rate": 
0.00019752518714814967, "loss": 0.1081, "step": 3730 }, { "epoch": 0.2411959595959596, "grad_norm": 0.06793665885925293, "learning_rate": 0.00019752367492370212, "loss": 0.0797, "step": 3731 }, { "epoch": 0.24126060606060606, "grad_norm": 0.07058891654014587, "learning_rate": 0.00019752216224316858, "loss": 0.0888, "step": 3732 }, { "epoch": 0.24132525252525253, "grad_norm": 0.08567992597818375, "learning_rate": 0.00019752064910655612, "loss": 0.1102, "step": 3733 }, { "epoch": 0.241389898989899, "grad_norm": 0.07798433303833008, "learning_rate": 0.00019751913551387183, "loss": 0.0827, "step": 3734 }, { "epoch": 0.24145454545454545, "grad_norm": 0.06837143748998642, "learning_rate": 0.00019751762146512277, "loss": 0.0807, "step": 3735 }, { "epoch": 0.2415191919191919, "grad_norm": 0.07509053498506546, "learning_rate": 0.00019751610696031605, "loss": 0.0949, "step": 3736 }, { "epoch": 0.24158383838383837, "grad_norm": 0.07728492468595505, "learning_rate": 0.00019751459199945874, "loss": 0.1012, "step": 3737 }, { "epoch": 0.24164848484848483, "grad_norm": 0.07385671138763428, "learning_rate": 0.0001975130765825579, "loss": 0.0889, "step": 3738 }, { "epoch": 0.24171313131313132, "grad_norm": 0.06853918731212616, "learning_rate": 0.00019751156070962067, "loss": 0.0813, "step": 3739 }, { "epoch": 0.24177777777777779, "grad_norm": 0.07051879912614822, "learning_rate": 0.00019751004438065407, "loss": 0.0936, "step": 3740 }, { "epoch": 0.24184242424242425, "grad_norm": 0.06729122251272202, "learning_rate": 0.00019750852759566528, "loss": 0.0808, "step": 3741 }, { "epoch": 0.2419070707070707, "grad_norm": 0.07112760841846466, "learning_rate": 0.00019750701035466128, "loss": 0.1082, "step": 3742 }, { "epoch": 0.24197171717171717, "grad_norm": 0.08199884742498398, "learning_rate": 0.00019750549265764927, "loss": 0.1006, "step": 3743 }, { "epoch": 0.24203636363636363, "grad_norm": 0.06376795470714569, "learning_rate": 0.00019750397450463627, "loss": 0.0758, "step": 3744 }, { 
"epoch": 0.24203636363636363, "eval_bleu": 13.941046495240542, "eval_loss": 0.09321649372577667, "eval_runtime": 2.7927, "eval_samples_per_second": 11.458, "eval_steps_per_second": 1.432, "step": 3744 }, { "epoch": 0.2421010101010101, "grad_norm": 0.0710856094956398, "learning_rate": 0.00019750245589562947, "loss": 0.0886, "step": 3745 }, { "epoch": 0.24216565656565656, "grad_norm": 0.08524394780397415, "learning_rate": 0.00019750093683063586, "loss": 0.1366, "step": 3746 }, { "epoch": 0.24223030303030302, "grad_norm": 0.06681890040636063, "learning_rate": 0.00019749941730966264, "loss": 0.0845, "step": 3747 }, { "epoch": 0.2422949494949495, "grad_norm": 0.061822980642318726, "learning_rate": 0.00019749789733271688, "loss": 0.0872, "step": 3748 }, { "epoch": 0.24235959595959597, "grad_norm": 0.06491713225841522, "learning_rate": 0.00019749637689980566, "loss": 0.0959, "step": 3749 }, { "epoch": 0.24242424242424243, "grad_norm": 0.06295552104711533, "learning_rate": 0.00019749485601093612, "loss": 0.0775, "step": 3750 }, { "epoch": 0.2424888888888889, "grad_norm": 0.07560793310403824, "learning_rate": 0.00019749333466611538, "loss": 0.1077, "step": 3751 }, { "epoch": 0.24255353535353535, "grad_norm": 0.08111906051635742, "learning_rate": 0.00019749181286535055, "loss": 0.107, "step": 3752 }, { "epoch": 0.24261818181818182, "grad_norm": 0.06592545658349991, "learning_rate": 0.00019749029060864873, "loss": 0.0782, "step": 3753 }, { "epoch": 0.24268282828282828, "grad_norm": 0.08493833988904953, "learning_rate": 0.00019748876789601704, "loss": 0.1018, "step": 3754 }, { "epoch": 0.24274747474747474, "grad_norm": 0.06522996723651886, "learning_rate": 0.00019748724472746262, "loss": 0.0932, "step": 3755 }, { "epoch": 0.2428121212121212, "grad_norm": 0.07433473318815231, "learning_rate": 0.00019748572110299262, "loss": 0.094, "step": 3756 }, { "epoch": 0.24287676767676766, "grad_norm": 0.06947848945856094, "learning_rate": 0.0001974841970226141, "loss": 0.0795, "step": 
3757 }, { "epoch": 0.24294141414141415, "grad_norm": 0.07081591337919235, "learning_rate": 0.00019748267248633421, "loss": 0.0904, "step": 3758 }, { "epoch": 0.24300606060606061, "grad_norm": 0.08732277154922485, "learning_rate": 0.0001974811474941601, "loss": 0.0965, "step": 3759 }, { "epoch": 0.24307070707070708, "grad_norm": 0.0738610252737999, "learning_rate": 0.00019747962204609887, "loss": 0.0836, "step": 3760 }, { "epoch": 0.24307070707070708, "eval_bleu": 13.614584913945766, "eval_loss": 0.0931699350476265, "eval_runtime": 2.6369, "eval_samples_per_second": 12.136, "eval_steps_per_second": 1.517, "step": 3760 }, { "epoch": 0.24313535353535354, "grad_norm": 0.08599385619163513, "learning_rate": 0.00019747809614215772, "loss": 0.0991, "step": 3761 }, { "epoch": 0.2432, "grad_norm": 0.06696201115846634, "learning_rate": 0.0001974765697823437, "loss": 0.075, "step": 3762 }, { "epoch": 0.24326464646464646, "grad_norm": 0.06407614052295685, "learning_rate": 0.000197475042966664, "loss": 0.084, "step": 3763 }, { "epoch": 0.24332929292929292, "grad_norm": 0.1326744556427002, "learning_rate": 0.00019747351569512572, "loss": 0.1064, "step": 3764 }, { "epoch": 0.24339393939393938, "grad_norm": 0.06654291599988937, "learning_rate": 0.00019747198796773608, "loss": 0.0884, "step": 3765 }, { "epoch": 0.24345858585858585, "grad_norm": 0.06948009133338928, "learning_rate": 0.00019747045978450216, "loss": 0.0855, "step": 3766 }, { "epoch": 0.24352323232323234, "grad_norm": 0.06581073254346848, "learning_rate": 0.0001974689311454311, "loss": 0.0775, "step": 3767 }, { "epoch": 0.2435878787878788, "grad_norm": 0.07536555826663971, "learning_rate": 0.0001974674020505301, "loss": 0.0941, "step": 3768 }, { "epoch": 0.24365252525252526, "grad_norm": 0.06603272259235382, "learning_rate": 0.00019746587249980626, "loss": 0.0785, "step": 3769 }, { "epoch": 0.24371717171717172, "grad_norm": 0.06903275847434998, "learning_rate": 0.00019746434249326677, "loss": 0.0906, "step": 3770 }, { 
"epoch": 0.24378181818181818, "grad_norm": 0.07458031922578812, "learning_rate": 0.00019746281203091877, "loss": 0.0859, "step": 3771 }, { "epoch": 0.24384646464646464, "grad_norm": 0.08659809082746506, "learning_rate": 0.00019746128111276941, "loss": 0.1148, "step": 3772 }, { "epoch": 0.2439111111111111, "grad_norm": 0.07417561858892441, "learning_rate": 0.00019745974973882588, "loss": 0.0873, "step": 3773 }, { "epoch": 0.24397575757575757, "grad_norm": 0.0825277641415596, "learning_rate": 0.0001974582179090953, "loss": 0.0929, "step": 3774 }, { "epoch": 0.24404040404040403, "grad_norm": 0.07200012356042862, "learning_rate": 0.00019745668562358486, "loss": 0.0985, "step": 3775 }, { "epoch": 0.2441050505050505, "grad_norm": 0.06135423108935356, "learning_rate": 0.0001974551528823017, "loss": 0.0881, "step": 3776 }, { "epoch": 0.2441050505050505, "eval_bleu": 14.072435219353501, "eval_loss": 0.0930660218000412, "eval_runtime": 2.6662, "eval_samples_per_second": 12.002, "eval_steps_per_second": 1.5, "step": 3776 }, { "epoch": 0.24416969696969698, "grad_norm": 0.062058717012405396, "learning_rate": 0.00019745361968525303, "loss": 0.0691, "step": 3777 }, { "epoch": 0.24423434343434344, "grad_norm": 0.0643954947590828, "learning_rate": 0.00019745208603244604, "loss": 0.0884, "step": 3778 }, { "epoch": 0.2442989898989899, "grad_norm": 0.07990649342536926, "learning_rate": 0.0001974505519238878, "loss": 0.0913, "step": 3779 }, { "epoch": 0.24436363636363637, "grad_norm": 0.08154579252004623, "learning_rate": 0.00019744901735958554, "loss": 0.1189, "step": 3780 }, { "epoch": 0.24442828282828283, "grad_norm": 0.0726609155535698, "learning_rate": 0.00019744748233954646, "loss": 0.0958, "step": 3781 }, { "epoch": 0.2444929292929293, "grad_norm": 0.06956779211759567, "learning_rate": 0.00019744594686377776, "loss": 0.1058, "step": 3782 }, { "epoch": 0.24455757575757575, "grad_norm": 0.07045941799879074, "learning_rate": 0.0001974444109322865, "loss": 0.1002, "step": 3783 }, { 
"epoch": 0.2446222222222222, "grad_norm": 0.06140134856104851, "learning_rate": 0.00019744287454508, "loss": 0.0758, "step": 3784 }, { "epoch": 0.24468686868686867, "grad_norm": 0.06358760595321655, "learning_rate": 0.0001974413377021654, "loss": 0.0937, "step": 3785 }, { "epoch": 0.24475151515151516, "grad_norm": 0.0716613382101059, "learning_rate": 0.00019743980040354985, "loss": 0.0855, "step": 3786 }, { "epoch": 0.24481616161616163, "grad_norm": 0.06060607358813286, "learning_rate": 0.0001974382626492406, "loss": 0.076, "step": 3787 }, { "epoch": 0.2448808080808081, "grad_norm": 0.06469915807247162, "learning_rate": 0.00019743672443924476, "loss": 0.0889, "step": 3788 }, { "epoch": 0.24494545454545455, "grad_norm": 0.0663461834192276, "learning_rate": 0.00019743518577356958, "loss": 0.0853, "step": 3789 }, { "epoch": 0.245010101010101, "grad_norm": 0.07696621865034103, "learning_rate": 0.00019743364665222228, "loss": 0.0763, "step": 3790 }, { "epoch": 0.24507474747474747, "grad_norm": 0.06791282445192337, "learning_rate": 0.00019743210707521002, "loss": 0.0937, "step": 3791 }, { "epoch": 0.24513939393939393, "grad_norm": 0.059839557856321335, "learning_rate": 0.00019743056704253997, "loss": 0.0859, "step": 3792 }, { "epoch": 0.24513939393939393, "eval_bleu": 14.056218074665326, "eval_loss": 0.09398597478866577, "eval_runtime": 2.7769, "eval_samples_per_second": 11.524, "eval_steps_per_second": 1.44, "step": 3792 }, { "epoch": 0.2452040404040404, "grad_norm": 0.07247661799192429, "learning_rate": 0.0001974290265542194, "loss": 0.0971, "step": 3793 }, { "epoch": 0.24526868686868686, "grad_norm": 0.06797537207603455, "learning_rate": 0.00019742748561025545, "loss": 0.0989, "step": 3794 }, { "epoch": 0.24533333333333332, "grad_norm": 0.061104148626327515, "learning_rate": 0.0001974259442106554, "loss": 0.0712, "step": 3795 }, { "epoch": 0.2453979797979798, "grad_norm": 0.06417061388492584, "learning_rate": 0.0001974244023554264, "loss": 0.0883, "step": 3796 }, { 
"epoch": 0.24546262626262627, "grad_norm": 0.07916979491710663, "learning_rate": 0.00019742286004457567, "loss": 0.1117, "step": 3797 }, { "epoch": 0.24552727272727273, "grad_norm": 0.0759257897734642, "learning_rate": 0.00019742131727811045, "loss": 0.0892, "step": 3798 }, { "epoch": 0.2455919191919192, "grad_norm": 0.06910717487335205, "learning_rate": 0.0001974197740560379, "loss": 0.0919, "step": 3799 }, { "epoch": 0.24565656565656566, "grad_norm": 0.0624660924077034, "learning_rate": 0.0001974182303783653, "loss": 0.0787, "step": 3800 }, { "epoch": 0.24572121212121212, "grad_norm": 0.07715356349945068, "learning_rate": 0.00019741668624509987, "loss": 0.0915, "step": 3801 }, { "epoch": 0.24578585858585858, "grad_norm": 0.06638652086257935, "learning_rate": 0.00019741514165624874, "loss": 0.0942, "step": 3802 }, { "epoch": 0.24585050505050504, "grad_norm": 0.06455226242542267, "learning_rate": 0.00019741359661181924, "loss": 0.0922, "step": 3803 }, { "epoch": 0.2459151515151515, "grad_norm": 0.06638920307159424, "learning_rate": 0.00019741205111181853, "loss": 0.0941, "step": 3804 }, { "epoch": 0.245979797979798, "grad_norm": 0.05518640950322151, "learning_rate": 0.00019741050515625387, "loss": 0.0737, "step": 3805 }, { "epoch": 0.24604444444444445, "grad_norm": 0.06490351259708405, "learning_rate": 0.0001974089587451325, "loss": 0.1007, "step": 3806 }, { "epoch": 0.24610909090909092, "grad_norm": 0.06110585108399391, "learning_rate": 0.00019740741187846162, "loss": 0.0802, "step": 3807 }, { "epoch": 0.24617373737373738, "grad_norm": 0.07082351297140121, "learning_rate": 0.00019740586455624848, "loss": 0.0813, "step": 3808 }, { "epoch": 0.24617373737373738, "eval_bleu": 16.764868008949694, "eval_loss": 0.0933462530374527, "eval_runtime": 2.7501, "eval_samples_per_second": 11.636, "eval_steps_per_second": 1.455, "step": 3808 }, { "epoch": 0.24623838383838384, "grad_norm": 0.06844015419483185, "learning_rate": 0.00019740431677850028, "loss": 0.0903, "step": 3809 
}, { "epoch": 0.2463030303030303, "grad_norm": 0.06428299099206924, "learning_rate": 0.00019740276854522435, "loss": 0.0825, "step": 3810 }, { "epoch": 0.24636767676767676, "grad_norm": 0.05986114218831062, "learning_rate": 0.0001974012198564278, "loss": 0.0807, "step": 3811 }, { "epoch": 0.24643232323232322, "grad_norm": 0.08208808302879333, "learning_rate": 0.000197399670712118, "loss": 0.1083, "step": 3812 }, { "epoch": 0.2464969696969697, "grad_norm": 0.061327412724494934, "learning_rate": 0.00019739812111230215, "loss": 0.0751, "step": 3813 }, { "epoch": 0.24656161616161615, "grad_norm": 0.06825974583625793, "learning_rate": 0.00019739657105698744, "loss": 0.0769, "step": 3814 }, { "epoch": 0.24662626262626264, "grad_norm": 0.08726572245359421, "learning_rate": 0.0001973950205461812, "loss": 0.1148, "step": 3815 }, { "epoch": 0.2466909090909091, "grad_norm": 0.07514621317386627, "learning_rate": 0.00019739346957989065, "loss": 0.091, "step": 3816 }, { "epoch": 0.24675555555555556, "grad_norm": 0.06860413402318954, "learning_rate": 0.000197391918158123, "loss": 0.089, "step": 3817 }, { "epoch": 0.24682020202020202, "grad_norm": 0.0798850879073143, "learning_rate": 0.0001973903662808856, "loss": 0.0757, "step": 3818 }, { "epoch": 0.24688484848484848, "grad_norm": 0.0821099504828453, "learning_rate": 0.0001973888139481856, "loss": 0.0949, "step": 3819 }, { "epoch": 0.24694949494949495, "grad_norm": 0.07128286361694336, "learning_rate": 0.00019738726116003035, "loss": 0.0928, "step": 3820 }, { "epoch": 0.2470141414141414, "grad_norm": 0.06745872646570206, "learning_rate": 0.00019738570791642707, "loss": 0.0858, "step": 3821 }, { "epoch": 0.24707878787878787, "grad_norm": 0.06696398556232452, "learning_rate": 0.000197384154217383, "loss": 0.0821, "step": 3822 }, { "epoch": 0.24714343434343433, "grad_norm": 0.06421520560979843, "learning_rate": 0.00019738260006290547, "loss": 0.082, "step": 3823 }, { "epoch": 0.24720808080808082, "grad_norm": 0.06522439420223236, 
"learning_rate": 0.00019738104545300171, "loss": 0.0892, "step": 3824 }, { "epoch": 0.24720808080808082, "eval_bleu": 16.27614087403725, "eval_loss": 0.09310175478458405, "eval_runtime": 2.8713, "eval_samples_per_second": 11.145, "eval_steps_per_second": 1.393, "step": 3824 }, { "epoch": 0.24727272727272728, "grad_norm": 0.06867222487926483, "learning_rate": 0.00019737949038767897, "loss": 0.0918, "step": 3825 }, { "epoch": 0.24733737373737374, "grad_norm": 0.07688181102275848, "learning_rate": 0.00019737793486694456, "loss": 0.1024, "step": 3826 }, { "epoch": 0.2474020202020202, "grad_norm": 0.07875890284776688, "learning_rate": 0.00019737637889080575, "loss": 0.1141, "step": 3827 }, { "epoch": 0.24746666666666667, "grad_norm": 0.07064365595579147, "learning_rate": 0.0001973748224592698, "loss": 0.0893, "step": 3828 }, { "epoch": 0.24753131313131313, "grad_norm": 0.06773609668016434, "learning_rate": 0.000197373265572344, "loss": 0.0858, "step": 3829 }, { "epoch": 0.2475959595959596, "grad_norm": 0.07183733582496643, "learning_rate": 0.0001973717082300356, "loss": 0.0897, "step": 3830 }, { "epoch": 0.24766060606060605, "grad_norm": 0.07189106941223145, "learning_rate": 0.00019737015043235198, "loss": 0.1023, "step": 3831 }, { "epoch": 0.24772525252525252, "grad_norm": 0.07007001340389252, "learning_rate": 0.0001973685921793003, "loss": 0.0882, "step": 3832 }, { "epoch": 0.24778989898989898, "grad_norm": 0.07396364212036133, "learning_rate": 0.00019736703347088792, "loss": 0.0962, "step": 3833 }, { "epoch": 0.24785454545454547, "grad_norm": 0.0649465024471283, "learning_rate": 0.00019736547430712208, "loss": 0.0875, "step": 3834 }, { "epoch": 0.24791919191919193, "grad_norm": 0.06593985855579376, "learning_rate": 0.00019736391468801014, "loss": 0.0814, "step": 3835 }, { "epoch": 0.2479838383838384, "grad_norm": 0.06916589289903641, "learning_rate": 0.00019736235461355935, "loss": 0.096, "step": 3836 }, { "epoch": 0.24804848484848485, "grad_norm": 
0.06952930241823196, "learning_rate": 0.00019736079408377703, "loss": 0.0825, "step": 3837 }, { "epoch": 0.2481131313131313, "grad_norm": 0.07936926931142807, "learning_rate": 0.00019735923309867047, "loss": 0.1119, "step": 3838 }, { "epoch": 0.24817777777777777, "grad_norm": 0.07878732681274414, "learning_rate": 0.00019735767165824695, "loss": 0.0969, "step": 3839 }, { "epoch": 0.24824242424242424, "grad_norm": 0.07381368428468704, "learning_rate": 0.00019735610976251376, "loss": 0.0924, "step": 3840 }, { "epoch": 0.24824242424242424, "eval_bleu": 14.521381565332044, "eval_loss": 0.09357903152704239, "eval_runtime": 2.7037, "eval_samples_per_second": 11.836, "eval_steps_per_second": 1.479, "step": 3840 }, { "epoch": 0.2483070707070707, "grad_norm": 0.07202629745006561, "learning_rate": 0.00019735454741147824, "loss": 0.0928, "step": 3841 }, { "epoch": 0.24837171717171716, "grad_norm": 0.07414745539426804, "learning_rate": 0.00019735298460514772, "loss": 0.1056, "step": 3842 }, { "epoch": 0.24843636363636365, "grad_norm": 0.06099332496523857, "learning_rate": 0.00019735142134352944, "loss": 0.0732, "step": 3843 }, { "epoch": 0.2485010101010101, "grad_norm": 0.06688322126865387, "learning_rate": 0.00019734985762663077, "loss": 0.0846, "step": 3844 }, { "epoch": 0.24856565656565657, "grad_norm": 0.07159404456615448, "learning_rate": 0.000197348293454459, "loss": 0.1065, "step": 3845 }, { "epoch": 0.24863030303030303, "grad_norm": 0.05792752653360367, "learning_rate": 0.0001973467288270214, "loss": 0.0744, "step": 3846 }, { "epoch": 0.2486949494949495, "grad_norm": 0.065129853785038, "learning_rate": 0.00019734516374432537, "loss": 0.0808, "step": 3847 }, { "epoch": 0.24875959595959596, "grad_norm": 0.07136716693639755, "learning_rate": 0.00019734359820637818, "loss": 0.0813, "step": 3848 }, { "epoch": 0.24882424242424242, "grad_norm": 0.071194589138031, "learning_rate": 0.00019734203221318718, "loss": 0.0959, "step": 3849 }, { "epoch": 0.24888888888888888, 
"grad_norm": 0.0713515505194664, "learning_rate": 0.00019734046576475966, "loss": 0.0942, "step": 3850 }, { "epoch": 0.24895353535353534, "grad_norm": 0.06720596551895142, "learning_rate": 0.00019733889886110295, "loss": 0.0818, "step": 3851 }, { "epoch": 0.2490181818181818, "grad_norm": 0.06579820811748505, "learning_rate": 0.00019733733150222442, "loss": 0.0846, "step": 3852 }, { "epoch": 0.2490828282828283, "grad_norm": 0.11488955467939377, "learning_rate": 0.00019733576368813135, "loss": 0.1035, "step": 3853 }, { "epoch": 0.24914747474747476, "grad_norm": 0.06219153106212616, "learning_rate": 0.00019733419541883112, "loss": 0.0808, "step": 3854 }, { "epoch": 0.24921212121212122, "grad_norm": 0.09871772676706314, "learning_rate": 0.000197332626694331, "loss": 0.1053, "step": 3855 }, { "epoch": 0.24927676767676768, "grad_norm": 0.07146307826042175, "learning_rate": 0.00019733105751463837, "loss": 0.0895, "step": 3856 }, { "epoch": 0.24927676767676768, "eval_bleu": 17.299049814793168, "eval_loss": 0.0918586328625679, "eval_runtime": 2.7495, "eval_samples_per_second": 11.638, "eval_steps_per_second": 1.455, "step": 3856 }, { "epoch": 0.24934141414141414, "grad_norm": 0.06900150328874588, "learning_rate": 0.00019732948787976057, "loss": 0.0877, "step": 3857 }, { "epoch": 0.2494060606060606, "grad_norm": 0.07269001752138138, "learning_rate": 0.00019732791778970493, "loss": 0.0908, "step": 3858 }, { "epoch": 0.24947070707070707, "grad_norm": 0.06268926709890366, "learning_rate": 0.00019732634724447878, "loss": 0.0792, "step": 3859 }, { "epoch": 0.24953535353535353, "grad_norm": 0.0876120999455452, "learning_rate": 0.00019732477624408948, "loss": 0.1067, "step": 3860 }, { "epoch": 0.2496, "grad_norm": 0.08949094265699387, "learning_rate": 0.0001973232047885444, "loss": 0.0944, "step": 3861 }, { "epoch": 0.24966464646464648, "grad_norm": 0.1021556407213211, "learning_rate": 0.00019732163287785082, "loss": 0.0927, "step": 3862 }, { "epoch": 0.24972929292929294, 
"grad_norm": 0.06641527265310287, "learning_rate": 0.0001973200605120162, "loss": 0.0848, "step": 3863 }, { "epoch": 0.2497939393939394, "grad_norm": 0.06752959638834, "learning_rate": 0.0001973184876910478, "loss": 0.0814, "step": 3864 }, { "epoch": 0.24985858585858586, "grad_norm": 0.07963059097528458, "learning_rate": 0.00019731691441495302, "loss": 0.1066, "step": 3865 }, { "epoch": 0.24992323232323232, "grad_norm": 0.07975966483354568, "learning_rate": 0.00019731534068373919, "loss": 0.0985, "step": 3866 }, { "epoch": 0.2499878787878788, "grad_norm": 0.07607954740524292, "learning_rate": 0.00019731376649741368, "loss": 0.1036, "step": 3867 }, { "epoch": 0.25005252525252525, "grad_norm": 0.11136452853679657, "learning_rate": 0.00019731219185598384, "loss": 0.0922, "step": 3868 }, { "epoch": 0.25011717171717174, "grad_norm": 0.07405475527048111, "learning_rate": 0.00019731061675945708, "loss": 0.0874, "step": 3869 }, { "epoch": 0.25018181818181817, "grad_norm": 0.07023259252309799, "learning_rate": 0.0001973090412078407, "loss": 0.0917, "step": 3870 }, { "epoch": 0.25024646464646466, "grad_norm": 0.06776626408100128, "learning_rate": 0.00019730746520114215, "loss": 0.0926, "step": 3871 }, { "epoch": 0.2503111111111111, "grad_norm": 0.10249544680118561, "learning_rate": 0.00019730588873936872, "loss": 0.1073, "step": 3872 }, { "epoch": 0.2503111111111111, "eval_bleu": 14.989571546850634, "eval_loss": 0.09359989315271378, "eval_runtime": 2.7169, "eval_samples_per_second": 11.778, "eval_steps_per_second": 1.472, "step": 3872 }, { "epoch": 0.2503757575757576, "grad_norm": 0.08641575276851654, "learning_rate": 0.00019730431182252782, "loss": 0.1127, "step": 3873 }, { "epoch": 0.250440404040404, "grad_norm": 0.06100583076477051, "learning_rate": 0.00019730273445062686, "loss": 0.0717, "step": 3874 }, { "epoch": 0.2505050505050505, "grad_norm": 0.07123076170682907, "learning_rate": 0.00019730115662367314, "loss": 0.087, "step": 3875 }, { "epoch": 0.25056969696969694, 
"grad_norm": 0.06142003461718559, "learning_rate": 0.0001972995783416741, "loss": 0.0719, "step": 3876 }, { "epoch": 0.25063434343434343, "grad_norm": 0.07932982593774796, "learning_rate": 0.00019729799960463705, "loss": 0.114, "step": 3877 }, { "epoch": 0.2506989898989899, "grad_norm": 0.09041208028793335, "learning_rate": 0.0001972964204125695, "loss": 0.093, "step": 3878 }, { "epoch": 0.25076363636363636, "grad_norm": 0.06294432282447815, "learning_rate": 0.0001972948407654787, "loss": 0.0842, "step": 3879 }, { "epoch": 0.25082828282828284, "grad_norm": 0.0731189027428627, "learning_rate": 0.0001972932606633721, "loss": 0.0979, "step": 3880 }, { "epoch": 0.2508929292929293, "grad_norm": 0.08556569367647171, "learning_rate": 0.00019729168010625708, "loss": 0.114, "step": 3881 }, { "epoch": 0.25095757575757577, "grad_norm": 0.08083625137805939, "learning_rate": 0.00019729009909414107, "loss": 0.1116, "step": 3882 }, { "epoch": 0.2510222222222222, "grad_norm": 0.07771521061658859, "learning_rate": 0.0001972885176270314, "loss": 0.0993, "step": 3883 }, { "epoch": 0.2510868686868687, "grad_norm": 0.09448112547397614, "learning_rate": 0.00019728693570493554, "loss": 0.1075, "step": 3884 }, { "epoch": 0.2511515151515151, "grad_norm": 0.06368444114923477, "learning_rate": 0.0001972853533278608, "loss": 0.087, "step": 3885 }, { "epoch": 0.2512161616161616, "grad_norm": 0.07448848336935043, "learning_rate": 0.0001972837704958146, "loss": 0.0977, "step": 3886 }, { "epoch": 0.2512808080808081, "grad_norm": 0.06747115403413773, "learning_rate": 0.00019728218720880443, "loss": 0.0856, "step": 3887 }, { "epoch": 0.25134545454545454, "grad_norm": 0.06890641897916794, "learning_rate": 0.0001972806034668376, "loss": 0.1013, "step": 3888 }, { "epoch": 0.25134545454545454, "eval_bleu": 14.539726475818581, "eval_loss": 0.09322687983512878, "eval_runtime": 2.8119, "eval_samples_per_second": 11.38, "eval_steps_per_second": 1.423, "step": 3888 }, { "epoch": 0.25141010101010103, 
"grad_norm": 0.06661111116409302, "learning_rate": 0.00019727901926992153, "loss": 0.0952, "step": 3889 }, { "epoch": 0.25147474747474746, "grad_norm": 0.06179904565215111, "learning_rate": 0.0001972774346180637, "loss": 0.091, "step": 3890 }, { "epoch": 0.25153939393939395, "grad_norm": 0.07435615360736847, "learning_rate": 0.00019727584951127142, "loss": 0.1097, "step": 3891 }, { "epoch": 0.2516040404040404, "grad_norm": 0.06801638752222061, "learning_rate": 0.00019727426394955218, "loss": 0.0981, "step": 3892 }, { "epoch": 0.2516686868686869, "grad_norm": 0.06010456010699272, "learning_rate": 0.00019727267793291333, "loss": 0.0774, "step": 3893 }, { "epoch": 0.2517333333333333, "grad_norm": 0.06850286573171616, "learning_rate": 0.00019727109146136233, "loss": 0.0914, "step": 3894 }, { "epoch": 0.2517979797979798, "grad_norm": 0.070985347032547, "learning_rate": 0.00019726950453490664, "loss": 0.0914, "step": 3895 }, { "epoch": 0.2518626262626263, "grad_norm": 0.06558531522750854, "learning_rate": 0.0001972679171535536, "loss": 0.0821, "step": 3896 }, { "epoch": 0.2519272727272727, "grad_norm": 0.07455248385667801, "learning_rate": 0.00019726632931731065, "loss": 0.1009, "step": 3897 }, { "epoch": 0.2519919191919192, "grad_norm": 0.06499814987182617, "learning_rate": 0.00019726474102618524, "loss": 0.0712, "step": 3898 }, { "epoch": 0.25205656565656565, "grad_norm": 0.07061312347650528, "learning_rate": 0.0001972631522801848, "loss": 0.1069, "step": 3899 }, { "epoch": 0.25212121212121213, "grad_norm": 0.07644709944725037, "learning_rate": 0.00019726156307931677, "loss": 0.1125, "step": 3900 }, { "epoch": 0.25218585858585857, "grad_norm": 0.0785532146692276, "learning_rate": 0.00019725997342358856, "loss": 0.0992, "step": 3901 }, { "epoch": 0.25225050505050506, "grad_norm": 0.06581787765026093, "learning_rate": 0.0001972583833130076, "loss": 0.0861, "step": 3902 }, { "epoch": 0.2523151515151515, "grad_norm": 0.07853641360998154, "learning_rate": 
0.00019725679274758132, "loss": 0.1136, "step": 3903 }, { "epoch": 0.252379797979798, "grad_norm": 0.06795958429574966, "learning_rate": 0.00019725520172731716, "loss": 0.0885, "step": 3904 }, { "epoch": 0.252379797979798, "eval_bleu": 14.895046897536865, "eval_loss": 0.09238015115261078, "eval_runtime": 2.7203, "eval_samples_per_second": 11.764, "eval_steps_per_second": 1.47, "step": 3904 }, { "epoch": 0.25244444444444447, "grad_norm": 0.07393664121627808, "learning_rate": 0.00019725361025222263, "loss": 0.0984, "step": 3905 }, { "epoch": 0.2525090909090909, "grad_norm": 0.07062132656574249, "learning_rate": 0.00019725201832230507, "loss": 0.0887, "step": 3906 }, { "epoch": 0.2525737373737374, "grad_norm": 0.11406850069761276, "learning_rate": 0.000197250425937572, "loss": 0.0955, "step": 3907 }, { "epoch": 0.25263838383838383, "grad_norm": 0.07079752534627914, "learning_rate": 0.0001972488330980308, "loss": 0.0952, "step": 3908 }, { "epoch": 0.2527030303030303, "grad_norm": 0.07221031188964844, "learning_rate": 0.000197247239803689, "loss": 0.0977, "step": 3909 }, { "epoch": 0.25276767676767675, "grad_norm": 0.06931409984827042, "learning_rate": 0.00019724564605455398, "loss": 0.093, "step": 3910 }, { "epoch": 0.25283232323232324, "grad_norm": 0.07004189491271973, "learning_rate": 0.00019724405185063323, "loss": 0.095, "step": 3911 }, { "epoch": 0.2528969696969697, "grad_norm": 0.05664917081594467, "learning_rate": 0.0001972424571919342, "loss": 0.0714, "step": 3912 }, { "epoch": 0.25296161616161617, "grad_norm": 0.06721418350934982, "learning_rate": 0.00019724086207846436, "loss": 0.0823, "step": 3913 }, { "epoch": 0.2530262626262626, "grad_norm": 0.06993572413921356, "learning_rate": 0.00019723926651023113, "loss": 0.1066, "step": 3914 }, { "epoch": 0.2530909090909091, "grad_norm": 0.11163713783025742, "learning_rate": 0.000197237670487242, "loss": 0.0785, "step": 3915 }, { "epoch": 0.2531555555555556, "grad_norm": 0.07591798901557922, "learning_rate": 
0.00019723607400950444, "loss": 0.0987, "step": 3916 }, { "epoch": 0.253220202020202, "grad_norm": 0.13017378747463226, "learning_rate": 0.0001972344770770259, "loss": 0.0993, "step": 3917 }, { "epoch": 0.2532848484848485, "grad_norm": 0.06332392245531082, "learning_rate": 0.00019723287968981384, "loss": 0.0947, "step": 3918 }, { "epoch": 0.25334949494949494, "grad_norm": 0.06982926279306412, "learning_rate": 0.0001972312818478758, "loss": 0.1006, "step": 3919 }, { "epoch": 0.2534141414141414, "grad_norm": 0.06936702132225037, "learning_rate": 0.00019722968355121915, "loss": 0.0947, "step": 3920 }, { "epoch": 0.2534141414141414, "eval_bleu": 15.983094597991043, "eval_loss": 0.09143301844596863, "eval_runtime": 2.8465, "eval_samples_per_second": 11.242, "eval_steps_per_second": 1.405, "step": 3920 }, { "epoch": 0.25347878787878786, "grad_norm": 0.07879718393087387, "learning_rate": 0.00019722808479985142, "loss": 0.0927, "step": 3921 }, { "epoch": 0.25354343434343435, "grad_norm": 0.05832449346780777, "learning_rate": 0.0001972264855937801, "loss": 0.0818, "step": 3922 }, { "epoch": 0.2536080808080808, "grad_norm": 0.07388099282979965, "learning_rate": 0.00019722488593301263, "loss": 0.1038, "step": 3923 }, { "epoch": 0.25367272727272727, "grad_norm": 0.059918127954006195, "learning_rate": 0.00019722328581755653, "loss": 0.0811, "step": 3924 }, { "epoch": 0.25373737373737376, "grad_norm": 0.058182474225759506, "learning_rate": 0.00019722168524741927, "loss": 0.0733, "step": 3925 }, { "epoch": 0.2538020202020202, "grad_norm": 0.0874689444899559, "learning_rate": 0.00019722008422260828, "loss": 0.0924, "step": 3926 }, { "epoch": 0.2538666666666667, "grad_norm": 0.06599444150924683, "learning_rate": 0.00019721848274313115, "loss": 0.0788, "step": 3927 }, { "epoch": 0.2539313131313131, "grad_norm": 0.06382293999195099, "learning_rate": 0.00019721688080899527, "loss": 0.0773, "step": 3928 }, { "epoch": 0.2539959595959596, "grad_norm": 0.06593070924282074, 
"learning_rate": 0.0001972152784202082, "loss": 0.0903, "step": 3929 }, { "epoch": 0.25406060606060604, "grad_norm": 0.06831151247024536, "learning_rate": 0.00019721367557677745, "loss": 0.0818, "step": 3930 }, { "epoch": 0.25412525252525253, "grad_norm": 0.07421889156103134, "learning_rate": 0.00019721207227871044, "loss": 0.0888, "step": 3931 }, { "epoch": 0.25418989898989897, "grad_norm": 0.07989794760942459, "learning_rate": 0.0001972104685260147, "loss": 0.1121, "step": 3932 }, { "epoch": 0.25425454545454546, "grad_norm": 0.06599510461091995, "learning_rate": 0.00019720886431869774, "loss": 0.0864, "step": 3933 }, { "epoch": 0.25431919191919194, "grad_norm": 0.07128570228815079, "learning_rate": 0.00019720725965676706, "loss": 0.0932, "step": 3934 }, { "epoch": 0.2543838383838384, "grad_norm": 0.07657445967197418, "learning_rate": 0.00019720565454023014, "loss": 0.1068, "step": 3935 }, { "epoch": 0.25444848484848487, "grad_norm": 0.08500547707080841, "learning_rate": 0.00019720404896909454, "loss": 0.1065, "step": 3936 }, { "epoch": 0.25444848484848487, "eval_bleu": 14.247262018083113, "eval_loss": 0.09327036142349243, "eval_runtime": 2.7759, "eval_samples_per_second": 11.528, "eval_steps_per_second": 1.441, "step": 3936 }, { "epoch": 0.2545131313131313, "grad_norm": 0.0704958513379097, "learning_rate": 0.00019720244294336775, "loss": 0.088, "step": 3937 }, { "epoch": 0.2545777777777778, "grad_norm": 0.07374102622270584, "learning_rate": 0.00019720083646305723, "loss": 0.0955, "step": 3938 }, { "epoch": 0.2546424242424242, "grad_norm": 0.06841718405485153, "learning_rate": 0.00019719922952817057, "loss": 0.0973, "step": 3939 }, { "epoch": 0.2547070707070707, "grad_norm": 0.06780092418193817, "learning_rate": 0.00019719762213871522, "loss": 0.0966, "step": 3940 }, { "epoch": 0.25477171717171715, "grad_norm": 0.0709770992398262, "learning_rate": 0.00019719601429469874, "loss": 0.095, "step": 3941 }, { "epoch": 0.25483636363636364, "grad_norm": 
0.07099176943302155, "learning_rate": 0.00019719440599612863, "loss": 0.1004, "step": 3942 }, { "epoch": 0.25490101010101013, "grad_norm": 0.06256180256605148, "learning_rate": 0.00019719279724301242, "loss": 0.0956, "step": 3943 }, { "epoch": 0.25496565656565656, "grad_norm": 0.06627906858921051, "learning_rate": 0.0001971911880353576, "loss": 0.0847, "step": 3944 }, { "epoch": 0.25503030303030305, "grad_norm": 0.06458727270364761, "learning_rate": 0.00019718957837317174, "loss": 0.0715, "step": 3945 }, { "epoch": 0.2550949494949495, "grad_norm": 0.054595280438661575, "learning_rate": 0.00019718796825646235, "loss": 0.0759, "step": 3946 }, { "epoch": 0.255159595959596, "grad_norm": 0.07755506783723831, "learning_rate": 0.000197186357685237, "loss": 0.0781, "step": 3947 }, { "epoch": 0.2552242424242424, "grad_norm": 0.0669684037566185, "learning_rate": 0.00019718474665950315, "loss": 0.0928, "step": 3948 }, { "epoch": 0.2552888888888889, "grad_norm": 0.06139666214585304, "learning_rate": 0.00019718313517926833, "loss": 0.0768, "step": 3949 }, { "epoch": 0.25535353535353533, "grad_norm": 0.0804172083735466, "learning_rate": 0.00019718152324454017, "loss": 0.0989, "step": 3950 }, { "epoch": 0.2554181818181818, "grad_norm": 0.0711892694234848, "learning_rate": 0.00019717991085532613, "loss": 0.0907, "step": 3951 }, { "epoch": 0.25548282828282826, "grad_norm": 0.07749702781438828, "learning_rate": 0.00019717829801163382, "loss": 0.1004, "step": 3952 }, { "epoch": 0.25548282828282826, "eval_bleu": 15.335787811375432, "eval_loss": 0.09287138283252716, "eval_runtime": 2.8448, "eval_samples_per_second": 11.249, "eval_steps_per_second": 1.406, "step": 3952 }, { "epoch": 0.25554747474747475, "grad_norm": 0.0781218633055687, "learning_rate": 0.00019717668471347068, "loss": 0.1028, "step": 3953 }, { "epoch": 0.25561212121212123, "grad_norm": 0.06841059774160385, "learning_rate": 0.00019717507096084433, "loss": 0.0777, "step": 3954 }, { "epoch": 0.25567676767676767, 
"grad_norm": 0.08416417986154556, "learning_rate": 0.0001971734567537623, "loss": 0.1029, "step": 3955 }, { "epoch": 0.25574141414141416, "grad_norm": 0.08974970877170563, "learning_rate": 0.00019717184209223215, "loss": 0.114, "step": 3956 }, { "epoch": 0.2558060606060606, "grad_norm": 0.0668891966342926, "learning_rate": 0.00019717022697626142, "loss": 0.0712, "step": 3957 }, { "epoch": 0.2558707070707071, "grad_norm": 0.07214487344026566, "learning_rate": 0.00019716861140585762, "loss": 0.0829, "step": 3958 }, { "epoch": 0.2559353535353535, "grad_norm": 0.07436203211545944, "learning_rate": 0.0001971669953810284, "loss": 0.1015, "step": 3959 }, { "epoch": 0.256, "grad_norm": 0.06146183982491493, "learning_rate": 0.00019716537890178125, "loss": 0.0836, "step": 3960 }, { "epoch": 0.25606464646464644, "grad_norm": 0.06979256123304367, "learning_rate": 0.00019716376196812372, "loss": 0.0947, "step": 3961 }, { "epoch": 0.25612929292929293, "grad_norm": 0.06719477474689484, "learning_rate": 0.0001971621445800634, "loss": 0.0728, "step": 3962 }, { "epoch": 0.2561939393939394, "grad_norm": 0.08400076627731323, "learning_rate": 0.00019716052673760787, "loss": 0.1037, "step": 3963 }, { "epoch": 0.25625858585858585, "grad_norm": 0.0729844942688942, "learning_rate": 0.00019715890844076468, "loss": 0.0961, "step": 3964 }, { "epoch": 0.25632323232323234, "grad_norm": 0.06257636845111847, "learning_rate": 0.0001971572896895414, "loss": 0.0833, "step": 3965 }, { "epoch": 0.2563878787878788, "grad_norm": 0.07040037214756012, "learning_rate": 0.0001971556704839456, "loss": 0.0957, "step": 3966 }, { "epoch": 0.25645252525252527, "grad_norm": 0.06887052208185196, "learning_rate": 0.0001971540508239848, "loss": 0.0981, "step": 3967 }, { "epoch": 0.2565171717171717, "grad_norm": 0.06848348677158356, "learning_rate": 0.00019715243070966663, "loss": 0.0892, "step": 3968 }, { "epoch": 0.2565171717171717, "eval_bleu": 15.6840452912209, "eval_loss": 0.09552548825740814, "eval_runtime": 
2.7638, "eval_samples_per_second": 11.578, "eval_steps_per_second": 1.447, "step": 3968 }, { "epoch": 0.2565818181818182, "grad_norm": 0.0749221071600914, "learning_rate": 0.00019715081014099871, "loss": 0.0962, "step": 3969 }, { "epoch": 0.2566464646464646, "grad_norm": 0.0686810314655304, "learning_rate": 0.00019714918911798855, "loss": 0.0905, "step": 3970 }, { "epoch": 0.2567111111111111, "grad_norm": 0.0811566561460495, "learning_rate": 0.00019714756764064375, "loss": 0.1034, "step": 3971 }, { "epoch": 0.2567757575757576, "grad_norm": 0.0615408793091774, "learning_rate": 0.00019714594570897188, "loss": 0.0797, "step": 3972 }, { "epoch": 0.25684040404040404, "grad_norm": 0.07170801609754562, "learning_rate": 0.00019714432332298054, "loss": 0.0857, "step": 3973 }, { "epoch": 0.2569050505050505, "grad_norm": 0.06380462646484375, "learning_rate": 0.00019714270048267732, "loss": 0.0769, "step": 3974 }, { "epoch": 0.25696969696969696, "grad_norm": 0.07562926411628723, "learning_rate": 0.0001971410771880698, "loss": 0.0881, "step": 3975 }, { "epoch": 0.25703434343434345, "grad_norm": 0.07395053654909134, "learning_rate": 0.00019713945343916557, "loss": 0.0895, "step": 3976 }, { "epoch": 0.2570989898989899, "grad_norm": 0.06585757434368134, "learning_rate": 0.00019713782923597224, "loss": 0.0803, "step": 3977 }, { "epoch": 0.25716363636363637, "grad_norm": 0.06458459049463272, "learning_rate": 0.0001971362045784974, "loss": 0.0836, "step": 3978 }, { "epoch": 0.2572282828282828, "grad_norm": 0.06941258907318115, "learning_rate": 0.00019713457946674866, "loss": 0.0914, "step": 3979 }, { "epoch": 0.2572929292929293, "grad_norm": 0.07414700835943222, "learning_rate": 0.00019713295390073356, "loss": 0.0966, "step": 3980 }, { "epoch": 0.2573575757575758, "grad_norm": 0.06744233518838882, "learning_rate": 0.00019713132788045976, "loss": 0.0832, "step": 3981 }, { "epoch": 0.2574222222222222, "grad_norm": 0.06691445410251617, "learning_rate": 0.00019712970140593487, "loss": 
0.0978, "step": 3982 }, { "epoch": 0.2574868686868687, "grad_norm": 0.08676749467849731, "learning_rate": 0.00019712807447716646, "loss": 0.1332, "step": 3983 }, { "epoch": 0.25755151515151514, "grad_norm": 0.09434791654348373, "learning_rate": 0.00019712644709416214, "loss": 0.0971, "step": 3984 }, { "epoch": 0.25755151515151514, "eval_bleu": 16.65200315890472, "eval_loss": 0.0925353541970253, "eval_runtime": 2.8006, "eval_samples_per_second": 11.426, "eval_steps_per_second": 1.428, "step": 3984 }, { "epoch": 0.25761616161616163, "grad_norm": 0.08148640394210815, "learning_rate": 0.0001971248192569296, "loss": 0.1068, "step": 3985 }, { "epoch": 0.25768080808080807, "grad_norm": 0.0725681483745575, "learning_rate": 0.00019712319096547632, "loss": 0.0873, "step": 3986 }, { "epoch": 0.25774545454545456, "grad_norm": 0.06095732003450394, "learning_rate": 0.00019712156221981, "loss": 0.0788, "step": 3987 }, { "epoch": 0.257810101010101, "grad_norm": 0.06826002895832062, "learning_rate": 0.00019711993301993824, "loss": 0.1022, "step": 3988 }, { "epoch": 0.2578747474747475, "grad_norm": 0.07453065365552902, "learning_rate": 0.0001971183033658687, "loss": 0.0957, "step": 3989 }, { "epoch": 0.2579393939393939, "grad_norm": 0.06986382603645325, "learning_rate": 0.00019711667325760895, "loss": 0.0939, "step": 3990 }, { "epoch": 0.2580040404040404, "grad_norm": 0.07774855196475983, "learning_rate": 0.0001971150426951666, "loss": 0.1147, "step": 3991 }, { "epoch": 0.2580686868686869, "grad_norm": 0.08281876146793365, "learning_rate": 0.0001971134116785493, "loss": 0.104, "step": 3992 }, { "epoch": 0.2581333333333333, "grad_norm": 0.06474122405052185, "learning_rate": 0.0001971117802077647, "loss": 0.0773, "step": 3993 }, { "epoch": 0.2581979797979798, "grad_norm": 0.10273505002260208, "learning_rate": 0.00019711014828282036, "loss": 0.0994, "step": 3994 }, { "epoch": 0.25826262626262625, "grad_norm": 0.07526735216379166, "learning_rate": 0.00019710851590372402, "loss": 0.0975, 
"step": 3995 }, { "epoch": 0.25832727272727274, "grad_norm": 0.07041803002357483, "learning_rate": 0.00019710688307048323, "loss": 0.0788, "step": 3996 }, { "epoch": 0.2583919191919192, "grad_norm": 0.08080800622701645, "learning_rate": 0.00019710524978310568, "loss": 0.1069, "step": 3997 }, { "epoch": 0.25845656565656566, "grad_norm": 0.06575463712215424, "learning_rate": 0.00019710361604159893, "loss": 0.0772, "step": 3998 }, { "epoch": 0.2585212121212121, "grad_norm": 0.06528215110301971, "learning_rate": 0.0001971019818459707, "loss": 0.0785, "step": 3999 }, { "epoch": 0.2585858585858586, "grad_norm": 0.07088376581668854, "learning_rate": 0.00019710034719622857, "loss": 0.0856, "step": 4000 }, { "epoch": 0.2585858585858586, "eval_bleu": 13.79800200866905, "eval_loss": 0.09347105771303177, "eval_runtime": 2.7798, "eval_samples_per_second": 11.512, "eval_steps_per_second": 1.439, "step": 4000 }, { "epoch": 0.2586505050505051, "grad_norm": 0.06831807643175125, "learning_rate": 0.00019709871209238028, "loss": 0.0935, "step": 4001 }, { "epoch": 0.2587151515151515, "grad_norm": 0.08285486698150635, "learning_rate": 0.00019709707653443336, "loss": 0.1203, "step": 4002 }, { "epoch": 0.258779797979798, "grad_norm": 0.06411155313253403, "learning_rate": 0.00019709544052239552, "loss": 0.0853, "step": 4003 }, { "epoch": 0.25884444444444443, "grad_norm": 0.06480775028467178, "learning_rate": 0.00019709380405627443, "loss": 0.0886, "step": 4004 }, { "epoch": 0.2589090909090909, "grad_norm": 0.07566830515861511, "learning_rate": 0.00019709216713607769, "loss": 0.102, "step": 4005 }, { "epoch": 0.25897373737373736, "grad_norm": 0.06415920704603195, "learning_rate": 0.000197090529761813, "loss": 0.077, "step": 4006 }, { "epoch": 0.25903838383838385, "grad_norm": 0.07580181956291199, "learning_rate": 0.00019708889193348797, "loss": 0.0992, "step": 4007 }, { "epoch": 0.2591030303030303, "grad_norm": 0.07114388793706894, "learning_rate": 0.00019708725365111032, "loss": 0.0858, 
"step": 4008 }, { "epoch": 0.25916767676767677, "grad_norm": 0.06733395159244537, "learning_rate": 0.00019708561491468766, "loss": 0.0815, "step": 4009 }, { "epoch": 0.25923232323232326, "grad_norm": 0.06934670358896255, "learning_rate": 0.00019708397572422768, "loss": 0.0946, "step": 4010 }, { "epoch": 0.2592969696969697, "grad_norm": 0.0639420673251152, "learning_rate": 0.00019708233607973808, "loss": 0.0874, "step": 4011 }, { "epoch": 0.2593616161616162, "grad_norm": 0.07199052721261978, "learning_rate": 0.00019708069598122646, "loss": 0.0924, "step": 4012 }, { "epoch": 0.2594262626262626, "grad_norm": 0.07812098413705826, "learning_rate": 0.0001970790554287005, "loss": 0.1182, "step": 4013 }, { "epoch": 0.2594909090909091, "grad_norm": 0.061200935393571854, "learning_rate": 0.0001970774144221679, "loss": 0.0866, "step": 4014 }, { "epoch": 0.25955555555555554, "grad_norm": 0.08956077694892883, "learning_rate": 0.00019707577296163633, "loss": 0.1054, "step": 4015 }, { "epoch": 0.25962020202020203, "grad_norm": 0.06875993311405182, "learning_rate": 0.00019707413104711345, "loss": 0.1028, "step": 4016 }, { "epoch": 0.25962020202020203, "eval_bleu": 16.56071065349125, "eval_loss": 0.0921887457370758, "eval_runtime": 2.8312, "eval_samples_per_second": 11.303, "eval_steps_per_second": 1.413, "step": 4016 }, { "epoch": 0.25968484848484846, "grad_norm": 0.07921290397644043, "learning_rate": 0.00019707248867860698, "loss": 0.0978, "step": 4017 }, { "epoch": 0.25974949494949495, "grad_norm": 0.06352989375591278, "learning_rate": 0.00019707084585612457, "loss": 0.0876, "step": 4018 }, { "epoch": 0.25981414141414144, "grad_norm": 0.06229955330491066, "learning_rate": 0.00019706920257967387, "loss": 0.0885, "step": 4019 }, { "epoch": 0.2598787878787879, "grad_norm": 0.05789710953831673, "learning_rate": 0.00019706755884926262, "loss": 0.0699, "step": 4020 }, { "epoch": 0.25994343434343437, "grad_norm": 0.07082050293684006, "learning_rate": 0.0001970659146648985, "loss": 
0.0927, "step": 4021 }, { "epoch": 0.2600080808080808, "grad_norm": 0.06347637623548508, "learning_rate": 0.0001970642700265892, "loss": 0.0834, "step": 4022 }, { "epoch": 0.2600727272727273, "grad_norm": 0.0721856951713562, "learning_rate": 0.00019706262493434236, "loss": 0.1107, "step": 4023 }, { "epoch": 0.2601373737373737, "grad_norm": 0.07536016404628754, "learning_rate": 0.00019706097938816571, "loss": 0.0793, "step": 4024 }, { "epoch": 0.2602020202020202, "grad_norm": 0.07097842544317245, "learning_rate": 0.00019705933338806696, "loss": 0.0873, "step": 4025 }, { "epoch": 0.26026666666666665, "grad_norm": 0.06801895797252655, "learning_rate": 0.00019705768693405382, "loss": 0.078, "step": 4026 }, { "epoch": 0.26033131313131314, "grad_norm": 0.07171180099248886, "learning_rate": 0.00019705604002613394, "loss": 0.0889, "step": 4027 }, { "epoch": 0.26039595959595957, "grad_norm": 0.0710098072886467, "learning_rate": 0.00019705439266431505, "loss": 0.0854, "step": 4028 }, { "epoch": 0.26046060606060606, "grad_norm": 0.06959763914346695, "learning_rate": 0.00019705274484860486, "loss": 0.0832, "step": 4029 }, { "epoch": 0.26052525252525255, "grad_norm": 0.0706619843840599, "learning_rate": 0.00019705109657901105, "loss": 0.0925, "step": 4030 }, { "epoch": 0.260589898989899, "grad_norm": 0.06845133006572723, "learning_rate": 0.00019704944785554137, "loss": 0.078, "step": 4031 }, { "epoch": 0.26065454545454547, "grad_norm": 0.06288308650255203, "learning_rate": 0.00019704779867820346, "loss": 0.0732, "step": 4032 }, { "epoch": 0.26065454545454547, "eval_bleu": 15.595858446806032, "eval_loss": 0.0905570536851883, "eval_runtime": 2.6962, "eval_samples_per_second": 11.869, "eval_steps_per_second": 1.484, "step": 4032 }, { "epoch": 0.2607191919191919, "grad_norm": 0.06585483253002167, "learning_rate": 0.0001970461490470051, "loss": 0.0848, "step": 4033 }, { "epoch": 0.2607838383838384, "grad_norm": 0.08028880506753922, "learning_rate": 0.000197044498961954, "loss": 
0.1066, "step": 4034 }, { "epoch": 0.26084848484848483, "grad_norm": 0.0670945793390274, "learning_rate": 0.00019704284842305786, "loss": 0.09, "step": 4035 }, { "epoch": 0.2609131313131313, "grad_norm": 0.054228901863098145, "learning_rate": 0.00019704119743032438, "loss": 0.0719, "step": 4036 }, { "epoch": 0.26097777777777775, "grad_norm": 0.05931509658694267, "learning_rate": 0.00019703954598376133, "loss": 0.0765, "step": 4037 }, { "epoch": 0.26104242424242424, "grad_norm": 0.07083901017904282, "learning_rate": 0.00019703789408337638, "loss": 0.0911, "step": 4038 }, { "epoch": 0.26110707070707073, "grad_norm": 0.055335883051157, "learning_rate": 0.0001970362417291773, "loss": 0.0821, "step": 4039 }, { "epoch": 0.26117171717171717, "grad_norm": 0.06416014581918716, "learning_rate": 0.00019703458892117177, "loss": 0.0933, "step": 4040 }, { "epoch": 0.26123636363636366, "grad_norm": 0.07569090276956558, "learning_rate": 0.00019703293565936756, "loss": 0.0944, "step": 4041 }, { "epoch": 0.2613010101010101, "grad_norm": 0.07058663666248322, "learning_rate": 0.0001970312819437724, "loss": 0.0998, "step": 4042 }, { "epoch": 0.2613656565656566, "grad_norm": 0.26307740807533264, "learning_rate": 0.000197029627774394, "loss": 0.1567, "step": 4043 }, { "epoch": 0.261430303030303, "grad_norm": 0.06781303137540817, "learning_rate": 0.0001970279731512401, "loss": 0.0986, "step": 4044 }, { "epoch": 0.2614949494949495, "grad_norm": 0.06772926449775696, "learning_rate": 0.00019702631807431845, "loss": 0.0921, "step": 4045 }, { "epoch": 0.26155959595959594, "grad_norm": 0.07634842395782471, "learning_rate": 0.0001970246625436368, "loss": 0.1125, "step": 4046 }, { "epoch": 0.2616242424242424, "grad_norm": 0.06506653130054474, "learning_rate": 0.00019702300655920286, "loss": 0.08, "step": 4047 }, { "epoch": 0.2616888888888889, "grad_norm": 0.06255129724740982, "learning_rate": 0.00019702135012102443, "loss": 0.0848, "step": 4048 }, { "epoch": 0.2616888888888889, "eval_bleu": 
13.59986281323152, "eval_loss": 0.09147222340106964, "eval_runtime": 2.7494, "eval_samples_per_second": 11.639, "eval_steps_per_second": 1.455, "step": 4048 }, { "epoch": 0.26175353535353535, "grad_norm": 0.06937336921691895, "learning_rate": 0.0001970196932291092, "loss": 0.1003, "step": 4049 }, { "epoch": 0.26181818181818184, "grad_norm": 0.07006597518920898, "learning_rate": 0.00019701803588346493, "loss": 0.0874, "step": 4050 }, { "epoch": 0.2618828282828283, "grad_norm": 0.07093960791826248, "learning_rate": 0.00019701637808409937, "loss": 0.0852, "step": 4051 }, { "epoch": 0.26194747474747476, "grad_norm": 0.07252484560012817, "learning_rate": 0.0001970147198310203, "loss": 0.086, "step": 4052 }, { "epoch": 0.2620121212121212, "grad_norm": 0.07307127863168716, "learning_rate": 0.00019701306112423548, "loss": 0.1025, "step": 4053 }, { "epoch": 0.2620767676767677, "grad_norm": 0.07132066041231155, "learning_rate": 0.0001970114019637526, "loss": 0.0927, "step": 4054 }, { "epoch": 0.2621414141414141, "grad_norm": 0.07602684944868088, "learning_rate": 0.00019700974234957948, "loss": 0.0948, "step": 4055 }, { "epoch": 0.2622060606060606, "grad_norm": 0.06550707668066025, "learning_rate": 0.00019700808228172387, "loss": 0.0768, "step": 4056 }, { "epoch": 0.26227070707070704, "grad_norm": 0.06882788985967636, "learning_rate": 0.00019700642176019356, "loss": 0.0918, "step": 4057 }, { "epoch": 0.26233535353535353, "grad_norm": 0.06616871058940887, "learning_rate": 0.00019700476078499623, "loss": 0.084, "step": 4058 }, { "epoch": 0.2624, "grad_norm": 0.08565612137317657, "learning_rate": 0.00019700309935613973, "loss": 0.1056, "step": 4059 }, { "epoch": 0.26246464646464646, "grad_norm": 0.06691957265138626, "learning_rate": 0.0001970014374736318, "loss": 0.0949, "step": 4060 }, { "epoch": 0.26252929292929295, "grad_norm": 0.0686483085155487, "learning_rate": 0.00019699977513748022, "loss": 0.0941, "step": 4061 }, { "epoch": 0.2625939393939394, "grad_norm": 
0.06843416392803192, "learning_rate": 0.00019699811234769277, "loss": 0.0939, "step": 4062 }, { "epoch": 0.26265858585858587, "grad_norm": 0.06703022122383118, "learning_rate": 0.0001969964491042772, "loss": 0.0957, "step": 4063 }, { "epoch": 0.2627232323232323, "grad_norm": 0.0545797273516655, "learning_rate": 0.00019699478540724128, "loss": 0.0752, "step": 4064 }, { "epoch": 0.2627232323232323, "eval_bleu": 15.72931112639416, "eval_loss": 0.0923495963215828, "eval_runtime": 2.7634, "eval_samples_per_second": 11.58, "eval_steps_per_second": 1.447, "step": 4064 }, { "epoch": 0.2627878787878788, "grad_norm": 0.07546567916870117, "learning_rate": 0.00019699312125659284, "loss": 0.1028, "step": 4065 }, { "epoch": 0.2628525252525252, "grad_norm": 0.07956601679325104, "learning_rate": 0.00019699145665233965, "loss": 0.1, "step": 4066 }, { "epoch": 0.2629171717171717, "grad_norm": 0.06858038902282715, "learning_rate": 0.00019698979159448948, "loss": 0.0858, "step": 4067 }, { "epoch": 0.2629818181818182, "grad_norm": 0.05877428874373436, "learning_rate": 0.00019698812608305008, "loss": 0.0656, "step": 4068 }, { "epoch": 0.26304646464646464, "grad_norm": 0.08447999507188797, "learning_rate": 0.0001969864601180293, "loss": 0.0942, "step": 4069 }, { "epoch": 0.26311111111111113, "grad_norm": 0.07059522718191147, "learning_rate": 0.00019698479369943493, "loss": 0.0884, "step": 4070 }, { "epoch": 0.26317575757575756, "grad_norm": 0.06848535686731339, "learning_rate": 0.00019698312682727472, "loss": 0.0907, "step": 4071 }, { "epoch": 0.26324040404040405, "grad_norm": 0.08033183962106705, "learning_rate": 0.00019698145950155648, "loss": 0.1056, "step": 4072 }, { "epoch": 0.2633050505050505, "grad_norm": 0.09901220351457596, "learning_rate": 0.00019697979172228803, "loss": 0.1062, "step": 4073 }, { "epoch": 0.263369696969697, "grad_norm": 0.09226621687412262, "learning_rate": 0.00019697812348947716, "loss": 0.1293, "step": 4074 }, { "epoch": 0.2634343434343434, "grad_norm": 
0.07036930322647095, "learning_rate": 0.00019697645480313167, "loss": 0.0874, "step": 4075 }, { "epoch": 0.2634989898989899, "grad_norm": 0.06986100226640701, "learning_rate": 0.00019697478566325935, "loss": 0.086, "step": 4076 }, { "epoch": 0.2635636363636364, "grad_norm": 0.06769050657749176, "learning_rate": 0.00019697311606986802, "loss": 0.0902, "step": 4077 }, { "epoch": 0.2636282828282828, "grad_norm": 0.07077699154615402, "learning_rate": 0.00019697144602296546, "loss": 0.1002, "step": 4078 }, { "epoch": 0.2636929292929293, "grad_norm": 0.07546249777078629, "learning_rate": 0.00019696977552255955, "loss": 0.1043, "step": 4079 }, { "epoch": 0.26375757575757575, "grad_norm": 0.07374468445777893, "learning_rate": 0.00019696810456865804, "loss": 0.1015, "step": 4080 }, { "epoch": 0.26375757575757575, "eval_bleu": 17.54633083888607, "eval_loss": 0.09299148619174957, "eval_runtime": 2.6569, "eval_samples_per_second": 12.044, "eval_steps_per_second": 1.505, "step": 4080 }, { "epoch": 0.26382222222222224, "grad_norm": 0.08618441224098206, "learning_rate": 0.00019696643316126876, "loss": 0.0947, "step": 4081 }, { "epoch": 0.26388686868686867, "grad_norm": 0.061502184718847275, "learning_rate": 0.0001969647613003995, "loss": 0.0876, "step": 4082 }, { "epoch": 0.26395151515151516, "grad_norm": 0.05714648962020874, "learning_rate": 0.00019696308898605816, "loss": 0.0704, "step": 4083 }, { "epoch": 0.2640161616161616, "grad_norm": 0.07035606354475021, "learning_rate": 0.0001969614162182525, "loss": 0.0988, "step": 4084 }, { "epoch": 0.2640808080808081, "grad_norm": 0.061043307185173035, "learning_rate": 0.00019695974299699032, "loss": 0.0856, "step": 4085 }, { "epoch": 0.26414545454545457, "grad_norm": 0.07307825982570648, "learning_rate": 0.0001969580693222795, "loss": 0.1049, "step": 4086 }, { "epoch": 0.264210101010101, "grad_norm": 0.06520330160856247, "learning_rate": 0.0001969563951941278, "loss": 0.0981, "step": 4087 }, { "epoch": 0.2642747474747475, "grad_norm": 
0.07047823816537857, "learning_rate": 0.00019695472061254313, "loss": 0.0928, "step": 4088 }, { "epoch": 0.26433939393939393, "grad_norm": 0.06555815786123276, "learning_rate": 0.0001969530455775333, "loss": 0.0756, "step": 4089 }, { "epoch": 0.2644040404040404, "grad_norm": 0.06840948760509491, "learning_rate": 0.0001969513700891061, "loss": 0.0815, "step": 4090 }, { "epoch": 0.26446868686868685, "grad_norm": 0.06679616868495941, "learning_rate": 0.0001969496941472694, "loss": 0.0864, "step": 4091 }, { "epoch": 0.26453333333333334, "grad_norm": 0.07676058262586594, "learning_rate": 0.00019694801775203103, "loss": 0.0937, "step": 4092 }, { "epoch": 0.2645979797979798, "grad_norm": 0.07315148413181305, "learning_rate": 0.00019694634090339883, "loss": 0.1035, "step": 4093 }, { "epoch": 0.26466262626262627, "grad_norm": 0.07252030074596405, "learning_rate": 0.00019694466360138066, "loss": 0.0923, "step": 4094 }, { "epoch": 0.2647272727272727, "grad_norm": 0.06573876738548279, "learning_rate": 0.00019694298584598433, "loss": 0.0882, "step": 4095 }, { "epoch": 0.2647919191919192, "grad_norm": 0.1000770628452301, "learning_rate": 0.00019694130763721769, "loss": 0.1121, "step": 4096 }, { "epoch": 0.2647919191919192, "eval_bleu": 12.816742705015884, "eval_loss": 0.09407724440097809, "eval_runtime": 2.6931, "eval_samples_per_second": 11.882, "eval_steps_per_second": 1.485, "step": 4096 }, { "epoch": 0.2648565656565657, "grad_norm": 0.09010086208581924, "learning_rate": 0.00019693962897508863, "loss": 0.1212, "step": 4097 }, { "epoch": 0.2649212121212121, "grad_norm": 0.06527906656265259, "learning_rate": 0.00019693794985960495, "loss": 0.0828, "step": 4098 }, { "epoch": 0.2649858585858586, "grad_norm": 0.06824596971273422, "learning_rate": 0.00019693627029077458, "loss": 0.0846, "step": 4099 }, { "epoch": 0.26505050505050504, "grad_norm": 0.06638594716787338, "learning_rate": 0.00019693459026860527, "loss": 0.0841, "step": 4100 }, { "epoch": 0.2651151515151515, "grad_norm": 
0.06276428699493408, "learning_rate": 0.00019693290979310493, "loss": 0.0863, "step": 4101 }, { "epoch": 0.26517979797979796, "grad_norm": 0.06336116790771484, "learning_rate": 0.0001969312288642814, "loss": 0.0791, "step": 4102 }, { "epoch": 0.26524444444444445, "grad_norm": 0.07096125185489655, "learning_rate": 0.0001969295474821426, "loss": 0.0934, "step": 4103 }, { "epoch": 0.2653090909090909, "grad_norm": 0.061906930059194565, "learning_rate": 0.0001969278656466963, "loss": 0.0821, "step": 4104 }, { "epoch": 0.2653737373737374, "grad_norm": 0.07907071709632874, "learning_rate": 0.00019692618335795048, "loss": 0.1106, "step": 4105 }, { "epoch": 0.26543838383838386, "grad_norm": 0.08103641867637634, "learning_rate": 0.00019692450061591291, "loss": 0.1143, "step": 4106 }, { "epoch": 0.2655030303030303, "grad_norm": 0.06572283804416656, "learning_rate": 0.00019692281742059148, "loss": 0.092, "step": 4107 }, { "epoch": 0.2655676767676768, "grad_norm": 0.05516389012336731, "learning_rate": 0.0001969211337719941, "loss": 0.0709, "step": 4108 }, { "epoch": 0.2656323232323232, "grad_norm": 0.06728114187717438, "learning_rate": 0.00019691944967012862, "loss": 0.0812, "step": 4109 }, { "epoch": 0.2656969696969697, "grad_norm": 0.06304038316011429, "learning_rate": 0.0001969177651150029, "loss": 0.0811, "step": 4110 }, { "epoch": 0.26576161616161614, "grad_norm": 0.06709577888250351, "learning_rate": 0.00019691608010662483, "loss": 0.089, "step": 4111 }, { "epoch": 0.26582626262626263, "grad_norm": 0.07005739957094193, "learning_rate": 0.00019691439464500232, "loss": 0.0997, "step": 4112 }, { "epoch": 0.26582626262626263, "eval_bleu": 11.942859156509565, "eval_loss": 0.09186488389968872, "eval_runtime": 2.8176, "eval_samples_per_second": 11.357, "eval_steps_per_second": 1.42, "step": 4112 }, { "epoch": 0.26589090909090907, "grad_norm": 0.07307527959346771, "learning_rate": 0.0001969127087301432, "loss": 0.0844, "step": 4113 }, { "epoch": 0.26595555555555556, "grad_norm": 
0.07943221181631088, "learning_rate": 0.00019691102236205538, "loss": 0.0869, "step": 4114 }, { "epoch": 0.26602020202020205, "grad_norm": 0.06850095093250275, "learning_rate": 0.00019690933554074677, "loss": 0.0804, "step": 4115 }, { "epoch": 0.2660848484848485, "grad_norm": 0.06646157056093216, "learning_rate": 0.0001969076482662252, "loss": 0.0833, "step": 4116 }, { "epoch": 0.26614949494949497, "grad_norm": 0.06631097197532654, "learning_rate": 0.0001969059605384986, "loss": 0.0881, "step": 4117 }, { "epoch": 0.2662141414141414, "grad_norm": 0.06769176572561264, "learning_rate": 0.00019690427235757486, "loss": 0.0838, "step": 4118 }, { "epoch": 0.2662787878787879, "grad_norm": 0.07445267587900162, "learning_rate": 0.00019690258372346192, "loss": 0.0922, "step": 4119 }, { "epoch": 0.2663434343434343, "grad_norm": 0.07536054402589798, "learning_rate": 0.0001969008946361676, "loss": 0.0929, "step": 4120 }, { "epoch": 0.2664080808080808, "grad_norm": 0.08731085062026978, "learning_rate": 0.0001968992050956998, "loss": 0.1109, "step": 4121 }, { "epoch": 0.26647272727272725, "grad_norm": 0.07295013964176178, "learning_rate": 0.0001968975151020665, "loss": 0.0904, "step": 4122 }, { "epoch": 0.26653737373737374, "grad_norm": 0.07037265598773956, "learning_rate": 0.00019689582465527554, "loss": 0.0814, "step": 4123 }, { "epoch": 0.26660202020202023, "grad_norm": 0.06497006118297577, "learning_rate": 0.00019689413375533483, "loss": 0.0859, "step": 4124 }, { "epoch": 0.26666666666666666, "grad_norm": 0.06783854961395264, "learning_rate": 0.00019689244240225228, "loss": 0.0887, "step": 4125 }, { "epoch": 0.26673131313131315, "grad_norm": 0.06034316495060921, "learning_rate": 0.00019689075059603584, "loss": 0.085, "step": 4126 }, { "epoch": 0.2667959595959596, "grad_norm": 0.07142467051744461, "learning_rate": 0.00019688905833669336, "loss": 0.0974, "step": 4127 }, { "epoch": 0.2668606060606061, "grad_norm": 0.07669708877801895, "learning_rate": 0.0001968873656242328, 
"loss": 0.105, "step": 4128 }, { "epoch": 0.2668606060606061, "eval_bleu": 12.597572317323614, "eval_loss": 0.09188902378082275, "eval_runtime": 2.6064, "eval_samples_per_second": 12.278, "eval_steps_per_second": 1.535, "step": 4128 }, { "epoch": 0.2669252525252525, "grad_norm": 0.07436151057481766, "learning_rate": 0.00019688567245866207, "loss": 0.1007, "step": 4129 }, { "epoch": 0.266989898989899, "grad_norm": 0.07025950402021408, "learning_rate": 0.0001968839788399891, "loss": 0.0951, "step": 4130 }, { "epoch": 0.26705454545454543, "grad_norm": 0.07281804829835892, "learning_rate": 0.00019688228476822173, "loss": 0.0927, "step": 4131 }, { "epoch": 0.2671191919191919, "grad_norm": 0.07857491821050644, "learning_rate": 0.000196880590243368, "loss": 0.118, "step": 4132 }, { "epoch": 0.26718383838383836, "grad_norm": 0.0814911499619484, "learning_rate": 0.00019687889526543573, "loss": 0.1245, "step": 4133 }, { "epoch": 0.26724848484848485, "grad_norm": 0.0695725679397583, "learning_rate": 0.00019687719983443294, "loss": 0.1037, "step": 4134 }, { "epoch": 0.26731313131313134, "grad_norm": 0.07808953523635864, "learning_rate": 0.0001968755039503675, "loss": 0.1111, "step": 4135 }, { "epoch": 0.26737777777777777, "grad_norm": 0.10729251056909561, "learning_rate": 0.00019687380761324733, "loss": 0.1094, "step": 4136 }, { "epoch": 0.26744242424242426, "grad_norm": 0.06723996996879578, "learning_rate": 0.0001968721108230804, "loss": 0.1074, "step": 4137 }, { "epoch": 0.2675070707070707, "grad_norm": 0.06742935627698898, "learning_rate": 0.00019687041357987464, "loss": 0.101, "step": 4138 }, { "epoch": 0.2675717171717172, "grad_norm": 0.0687510296702385, "learning_rate": 0.000196868715883638, "loss": 0.0991, "step": 4139 }, { "epoch": 0.2676363636363636, "grad_norm": 0.06135712191462517, "learning_rate": 0.00019686701773437832, "loss": 0.0768, "step": 4140 }, { "epoch": 0.2677010101010101, "grad_norm": 0.08449401706457138, "learning_rate": 0.0001968653191321037, "loss": 
0.1067, "step": 4141 }, { "epoch": 0.26776565656565654, "grad_norm": 0.07005251199007034, "learning_rate": 0.00019686362007682198, "loss": 0.095, "step": 4142 }, { "epoch": 0.26783030303030303, "grad_norm": 0.06596967577934265, "learning_rate": 0.00019686192056854113, "loss": 0.0865, "step": 4143 }, { "epoch": 0.2678949494949495, "grad_norm": 0.07281509041786194, "learning_rate": 0.00019686022060726912, "loss": 0.0987, "step": 4144 }, { "epoch": 0.2678949494949495, "eval_bleu": 14.480730080839121, "eval_loss": 0.09211601316928864, "eval_runtime": 2.9376, "eval_samples_per_second": 10.893, "eval_steps_per_second": 1.362, "step": 4144 }, { "epoch": 0.26795959595959595, "grad_norm": 0.0651697888970375, "learning_rate": 0.00019685852019301386, "loss": 0.0816, "step": 4145 }, { "epoch": 0.26802424242424244, "grad_norm": 0.08566612005233765, "learning_rate": 0.00019685681932578333, "loss": 0.1194, "step": 4146 }, { "epoch": 0.2680888888888889, "grad_norm": 0.06304305791854858, "learning_rate": 0.00019685511800558546, "loss": 0.0806, "step": 4147 }, { "epoch": 0.26815353535353537, "grad_norm": 0.07262536138296127, "learning_rate": 0.00019685341623242822, "loss": 0.102, "step": 4148 }, { "epoch": 0.2682181818181818, "grad_norm": 0.07394552230834961, "learning_rate": 0.00019685171400631962, "loss": 0.0941, "step": 4149 }, { "epoch": 0.2682828282828283, "grad_norm": 0.08123172074556351, "learning_rate": 0.0001968500113272675, "loss": 0.1118, "step": 4150 }, { "epoch": 0.2683474747474747, "grad_norm": 0.07228460907936096, "learning_rate": 0.00019684830819527993, "loss": 0.0809, "step": 4151 }, { "epoch": 0.2684121212121212, "grad_norm": 0.07674358785152435, "learning_rate": 0.00019684660461036487, "loss": 0.0953, "step": 4152 }, { "epoch": 0.2684767676767677, "grad_norm": 0.0802217498421669, "learning_rate": 0.00019684490057253023, "loss": 0.1075, "step": 4153 }, { "epoch": 0.26854141414141414, "grad_norm": 0.07244308292865753, "learning_rate": 0.000196843196081784, "loss": 
0.1015, "step": 4154 }, { "epoch": 0.2686060606060606, "grad_norm": 0.06194354221224785, "learning_rate": 0.00019684149113813416, "loss": 0.0699, "step": 4155 }, { "epoch": 0.26867070707070706, "grad_norm": 0.07043841481208801, "learning_rate": 0.0001968397857415887, "loss": 0.0988, "step": 4156 }, { "epoch": 0.26873535353535355, "grad_norm": 0.07158217579126358, "learning_rate": 0.00019683807989215554, "loss": 0.0969, "step": 4157 }, { "epoch": 0.2688, "grad_norm": 0.06150287017226219, "learning_rate": 0.00019683637358984272, "loss": 0.078, "step": 4158 }, { "epoch": 0.2688646464646465, "grad_norm": 0.06979337334632874, "learning_rate": 0.0001968346668346582, "loss": 0.0944, "step": 4159 }, { "epoch": 0.2689292929292929, "grad_norm": 0.06354733556509018, "learning_rate": 0.00019683295962660995, "loss": 0.0865, "step": 4160 }, { "epoch": 0.2689292929292929, "eval_bleu": 12.0324787057154, "eval_loss": 0.09236061573028564, "eval_runtime": 2.7182, "eval_samples_per_second": 11.773, "eval_steps_per_second": 1.472, "step": 4160 }, { "epoch": 0.2689939393939394, "grad_norm": 0.07068794220685959, "learning_rate": 0.00019683125196570594, "loss": 0.0892, "step": 4161 }, { "epoch": 0.2690585858585859, "grad_norm": 0.06503379344940186, "learning_rate": 0.0001968295438519542, "loss": 0.0836, "step": 4162 }, { "epoch": 0.2691232323232323, "grad_norm": 0.06750360131263733, "learning_rate": 0.0001968278352853627, "loss": 0.087, "step": 4163 }, { "epoch": 0.2691878787878788, "grad_norm": 0.06069406867027283, "learning_rate": 0.0001968261262659394, "loss": 0.0807, "step": 4164 }, { "epoch": 0.26925252525252524, "grad_norm": 0.06780527532100677, "learning_rate": 0.00019682441679369232, "loss": 0.0729, "step": 4165 }, { "epoch": 0.26931717171717173, "grad_norm": 0.07256273925304413, "learning_rate": 0.00019682270686862947, "loss": 0.0977, "step": 4166 }, { "epoch": 0.26938181818181817, "grad_norm": 0.0687905102968216, "learning_rate": 0.0001968209964907588, "loss": 0.0993, "step": 
4167 }, { "epoch": 0.26944646464646466, "grad_norm": 0.06238539516925812, "learning_rate": 0.00019681928566008837, "loss": 0.0735, "step": 4168 }, { "epoch": 0.2695111111111111, "grad_norm": 0.07545731216669083, "learning_rate": 0.00019681757437662616, "loss": 0.1009, "step": 4169 }, { "epoch": 0.2695757575757576, "grad_norm": 0.0640399381518364, "learning_rate": 0.0001968158626403801, "loss": 0.0823, "step": 4170 }, { "epoch": 0.269640404040404, "grad_norm": 0.06572475284337997, "learning_rate": 0.0001968141504513583, "loss": 0.0844, "step": 4171 }, { "epoch": 0.2697050505050505, "grad_norm": 0.07045052200555801, "learning_rate": 0.00019681243780956872, "loss": 0.0975, "step": 4172 }, { "epoch": 0.269769696969697, "grad_norm": 0.07127857953310013, "learning_rate": 0.0001968107247150194, "loss": 0.1157, "step": 4173 }, { "epoch": 0.2698343434343434, "grad_norm": 0.07238567620515823, "learning_rate": 0.00019680901116771827, "loss": 0.0889, "step": 4174 }, { "epoch": 0.2698989898989899, "grad_norm": 0.07276687771081924, "learning_rate": 0.00019680729716767346, "loss": 0.0933, "step": 4175 }, { "epoch": 0.26996363636363635, "grad_norm": 0.07266431301832199, "learning_rate": 0.0001968055827148929, "loss": 0.0957, "step": 4176 }, { "epoch": 0.26996363636363635, "eval_bleu": 13.410844823593065, "eval_loss": 0.09154077619314194, "eval_runtime": 2.8797, "eval_samples_per_second": 11.112, "eval_steps_per_second": 1.389, "step": 4176 }, { "epoch": 0.27002828282828284, "grad_norm": 0.06851734966039658, "learning_rate": 0.0001968038678093846, "loss": 0.0918, "step": 4177 }, { "epoch": 0.2700929292929293, "grad_norm": 0.07407873868942261, "learning_rate": 0.00019680215245115664, "loss": 0.1107, "step": 4178 }, { "epoch": 0.27015757575757576, "grad_norm": 0.06345862150192261, "learning_rate": 0.00019680043664021702, "loss": 0.0877, "step": 4179 }, { "epoch": 0.2702222222222222, "grad_norm": 0.06200195103883743, "learning_rate": 0.00019679872037657378, "loss": 0.0856, "step": 
4180 }, { "epoch": 0.2702868686868687, "grad_norm": 0.06358761340379715, "learning_rate": 0.0001967970036602349, "loss": 0.0868, "step": 4181 }, { "epoch": 0.2703515151515152, "grad_norm": 0.08329736441373825, "learning_rate": 0.00019679528649120843, "loss": 0.0924, "step": 4182 }, { "epoch": 0.2704161616161616, "grad_norm": 0.07827464491128922, "learning_rate": 0.0001967935688695024, "loss": 0.0906, "step": 4183 }, { "epoch": 0.2704808080808081, "grad_norm": 0.05834691971540451, "learning_rate": 0.00019679185079512487, "loss": 0.0743, "step": 4184 }, { "epoch": 0.27054545454545453, "grad_norm": 0.06297656148672104, "learning_rate": 0.00019679013226808385, "loss": 0.079, "step": 4185 }, { "epoch": 0.270610101010101, "grad_norm": 0.06820444762706757, "learning_rate": 0.00019678841328838736, "loss": 0.0846, "step": 4186 }, { "epoch": 0.27067474747474746, "grad_norm": 0.0740358978509903, "learning_rate": 0.00019678669385604346, "loss": 0.0984, "step": 4187 }, { "epoch": 0.27073939393939395, "grad_norm": 0.07049202919006348, "learning_rate": 0.0001967849739710602, "loss": 0.0921, "step": 4188 }, { "epoch": 0.2708040404040404, "grad_norm": 0.06729531288146973, "learning_rate": 0.00019678325363344562, "loss": 0.0932, "step": 4189 }, { "epoch": 0.27086868686868687, "grad_norm": 0.08443735539913177, "learning_rate": 0.00019678153284320775, "loss": 0.0954, "step": 4190 }, { "epoch": 0.27093333333333336, "grad_norm": 0.06944546103477478, "learning_rate": 0.00019677981160035467, "loss": 0.0959, "step": 4191 }, { "epoch": 0.2709979797979798, "grad_norm": 0.07118839770555496, "learning_rate": 0.00019677808990489437, "loss": 0.0995, "step": 4192 }, { "epoch": 0.2709979797979798, "eval_bleu": 13.054764479958287, "eval_loss": 0.09221772849559784, "eval_runtime": 2.6813, "eval_samples_per_second": 11.935, "eval_steps_per_second": 1.492, "step": 4192 }, { "epoch": 0.2710626262626263, "grad_norm": 0.07209117710590363, "learning_rate": 0.00019677636775683493, "loss": 0.0971, "step": 
4193 }, { "epoch": 0.2711272727272727, "grad_norm": 0.07843372970819473, "learning_rate": 0.00019677464515618445, "loss": 0.0993, "step": 4194 }, { "epoch": 0.2711919191919192, "grad_norm": 0.06937476992607117, "learning_rate": 0.00019677292210295094, "loss": 0.0894, "step": 4195 }, { "epoch": 0.27125656565656564, "grad_norm": 0.07377466559410095, "learning_rate": 0.00019677119859714244, "loss": 0.102, "step": 4196 }, { "epoch": 0.27132121212121213, "grad_norm": 0.06878658384084702, "learning_rate": 0.00019676947463876705, "loss": 0.0849, "step": 4197 }, { "epoch": 0.27138585858585856, "grad_norm": 0.06169632449746132, "learning_rate": 0.0001967677502278328, "loss": 0.072, "step": 4198 }, { "epoch": 0.27145050505050505, "grad_norm": 0.06099303811788559, "learning_rate": 0.0001967660253643478, "loss": 0.0817, "step": 4199 }, { "epoch": 0.27151515151515154, "grad_norm": 0.06536420434713364, "learning_rate": 0.00019676430004832008, "loss": 0.094, "step": 4200 }, { "epoch": 0.271579797979798, "grad_norm": 0.06275838613510132, "learning_rate": 0.00019676257427975774, "loss": 0.0773, "step": 4201 }, { "epoch": 0.27164444444444447, "grad_norm": 0.08492186665534973, "learning_rate": 0.00019676084805866882, "loss": 0.1222, "step": 4202 }, { "epoch": 0.2717090909090909, "grad_norm": 0.08464498817920685, "learning_rate": 0.0001967591213850614, "loss": 0.0994, "step": 4203 }, { "epoch": 0.2717737373737374, "grad_norm": 0.06961265951395035, "learning_rate": 0.00019675739425894354, "loss": 0.0928, "step": 4204 }, { "epoch": 0.2718383838383838, "grad_norm": 0.06475110352039337, "learning_rate": 0.00019675566668032332, "loss": 0.0972, "step": 4205 }, { "epoch": 0.2719030303030303, "grad_norm": 0.0936918705701828, "learning_rate": 0.00019675393864920885, "loss": 0.105, "step": 4206 }, { "epoch": 0.27196767676767675, "grad_norm": 0.05950174108147621, "learning_rate": 0.0001967522101656082, "loss": 0.0822, "step": 4207 }, { "epoch": 0.27203232323232324, "grad_norm": 
0.06783674657344818, "learning_rate": 0.00019675048122952946, "loss": 0.0862, "step": 4208 }, { "epoch": 0.27203232323232324, "eval_bleu": 12.37491224465256, "eval_loss": 0.09030617773532867, "eval_runtime": 2.7019, "eval_samples_per_second": 11.844, "eval_steps_per_second": 1.48, "step": 4208 }, { "epoch": 0.27209696969696967, "grad_norm": 0.07009829580783844, "learning_rate": 0.00019674875184098073, "loss": 0.099, "step": 4209 }, { "epoch": 0.27216161616161616, "grad_norm": 0.07490433752536774, "learning_rate": 0.00019674702199997, "loss": 0.1026, "step": 4210 }, { "epoch": 0.27222626262626265, "grad_norm": 0.06616954505443573, "learning_rate": 0.00019674529170650548, "loss": 0.0874, "step": 4211 }, { "epoch": 0.2722909090909091, "grad_norm": 0.08078459650278091, "learning_rate": 0.0001967435609605952, "loss": 0.1052, "step": 4212 }, { "epoch": 0.2723555555555556, "grad_norm": 0.07755498588085175, "learning_rate": 0.00019674182976224728, "loss": 0.0922, "step": 4213 }, { "epoch": 0.272420202020202, "grad_norm": 0.067042276263237, "learning_rate": 0.0001967400981114698, "loss": 0.0847, "step": 4214 }, { "epoch": 0.2724848484848485, "grad_norm": 0.0760740265250206, "learning_rate": 0.00019673836600827085, "loss": 0.0953, "step": 4215 }, { "epoch": 0.27254949494949493, "grad_norm": 0.06655211746692657, "learning_rate": 0.00019673663345265858, "loss": 0.0951, "step": 4216 }, { "epoch": 0.2726141414141414, "grad_norm": 0.06735111773014069, "learning_rate": 0.000196734900444641, "loss": 0.0995, "step": 4217 }, { "epoch": 0.27267878787878785, "grad_norm": 0.06664618104696274, "learning_rate": 0.00019673316698422632, "loss": 0.0897, "step": 4218 }, { "epoch": 0.27274343434343434, "grad_norm": 0.0812290832400322, "learning_rate": 0.0001967314330714226, "loss": 0.1149, "step": 4219 }, { "epoch": 0.27280808080808083, "grad_norm": 0.0654434934258461, "learning_rate": 0.0001967296987062379, "loss": 0.0942, "step": 4220 }, { "epoch": 0.27287272727272727, "grad_norm": 
0.06732401251792908, "learning_rate": 0.0001967279638886804, "loss": 0.0977, "step": 4221 }, { "epoch": 0.27293737373737376, "grad_norm": 0.08123961091041565, "learning_rate": 0.0001967262286187582, "loss": 0.0893, "step": 4222 }, { "epoch": 0.2730020202020202, "grad_norm": 0.06223050132393837, "learning_rate": 0.0001967244928964794, "loss": 0.0963, "step": 4223 }, { "epoch": 0.2730666666666667, "grad_norm": 0.0640723705291748, "learning_rate": 0.0001967227567218521, "loss": 0.0831, "step": 4224 }, { "epoch": 0.2730666666666667, "eval_bleu": 11.359121908479228, "eval_loss": 0.09155243635177612, "eval_runtime": 2.7331, "eval_samples_per_second": 11.708, "eval_steps_per_second": 1.464, "step": 4224 }, { "epoch": 0.2731313131313131, "grad_norm": 0.06645813584327698, "learning_rate": 0.00019672102009488446, "loss": 0.0807, "step": 4225 }, { "epoch": 0.2731959595959596, "grad_norm": 0.06468161195516586, "learning_rate": 0.0001967192830155846, "loss": 0.0788, "step": 4226 }, { "epoch": 0.27326060606060604, "grad_norm": 0.06649275869131088, "learning_rate": 0.0001967175454839606, "loss": 0.0853, "step": 4227 }, { "epoch": 0.2733252525252525, "grad_norm": 0.06614337116479874, "learning_rate": 0.00019671580750002066, "loss": 0.0897, "step": 4228 }, { "epoch": 0.273389898989899, "grad_norm": 0.07503994554281235, "learning_rate": 0.0001967140690637728, "loss": 0.1082, "step": 4229 }, { "epoch": 0.27345454545454545, "grad_norm": 0.07090439647436142, "learning_rate": 0.00019671233017522525, "loss": 0.0936, "step": 4230 }, { "epoch": 0.27351919191919194, "grad_norm": 0.0631272941827774, "learning_rate": 0.0001967105908343861, "loss": 0.0883, "step": 4231 }, { "epoch": 0.2735838383838384, "grad_norm": 0.0636717900633812, "learning_rate": 0.00019670885104126348, "loss": 0.0846, "step": 4232 }, { "epoch": 0.27364848484848486, "grad_norm": 0.06532414257526398, "learning_rate": 0.00019670711079586555, "loss": 0.0868, "step": 4233 }, { "epoch": 0.2737131313131313, "grad_norm": 
0.07801687717437744, "learning_rate": 0.0001967053700982004, "loss": 0.1081, "step": 4234 }, { "epoch": 0.2737777777777778, "grad_norm": 0.07180683314800262, "learning_rate": 0.00019670362894827625, "loss": 0.0866, "step": 4235 }, { "epoch": 0.2738424242424242, "grad_norm": 0.06685402989387512, "learning_rate": 0.00019670188734610116, "loss": 0.0985, "step": 4236 }, { "epoch": 0.2739070707070707, "grad_norm": 0.09973353892564774, "learning_rate": 0.00019670014529168334, "loss": 0.0859, "step": 4237 }, { "epoch": 0.27397171717171714, "grad_norm": 0.1273084133863449, "learning_rate": 0.0001966984027850309, "loss": 0.0935, "step": 4238 }, { "epoch": 0.27403636363636363, "grad_norm": 0.06016869097948074, "learning_rate": 0.000196696659826152, "loss": 0.0791, "step": 4239 }, { "epoch": 0.2741010101010101, "grad_norm": 0.08196477591991425, "learning_rate": 0.0001966949164150548, "loss": 0.1009, "step": 4240 }, { "epoch": 0.2741010101010101, "eval_bleu": 13.639647915974061, "eval_loss": 0.09162388741970062, "eval_runtime": 2.8332, "eval_samples_per_second": 11.295, "eval_steps_per_second": 1.412, "step": 4240 }, { "epoch": 0.27416565656565656, "grad_norm": 0.07231196016073227, "learning_rate": 0.0001966931725517474, "loss": 0.0955, "step": 4241 }, { "epoch": 0.27423030303030305, "grad_norm": 0.061195503920316696, "learning_rate": 0.00019669142823623803, "loss": 0.0774, "step": 4242 }, { "epoch": 0.2742949494949495, "grad_norm": 0.06734532862901688, "learning_rate": 0.00019668968346853483, "loss": 0.0924, "step": 4243 }, { "epoch": 0.27435959595959597, "grad_norm": 0.06743648648262024, "learning_rate": 0.0001966879382486459, "loss": 0.0928, "step": 4244 }, { "epoch": 0.2744242424242424, "grad_norm": 0.06553462147712708, "learning_rate": 0.0001966861925765795, "loss": 0.0802, "step": 4245 }, { "epoch": 0.2744888888888889, "grad_norm": 0.07010167837142944, "learning_rate": 0.00019668444645234368, "loss": 0.085, "step": 4246 }, { "epoch": 0.2745535353535353, "grad_norm": 
0.10386139899492264, "learning_rate": 0.00019668269987594672, "loss": 0.0772, "step": 4247 }, { "epoch": 0.2746181818181818, "grad_norm": 0.07533600926399231, "learning_rate": 0.0001966809528473967, "loss": 0.1038, "step": 4248 }, { "epoch": 0.2746828282828283, "grad_norm": 0.05863481014966965, "learning_rate": 0.00019667920536670187, "loss": 0.0829, "step": 4249 }, { "epoch": 0.27474747474747474, "grad_norm": 0.06582505255937576, "learning_rate": 0.00019667745743387032, "loss": 0.0917, "step": 4250 }, { "epoch": 0.27481212121212123, "grad_norm": 0.06666736304759979, "learning_rate": 0.0001966757090489103, "loss": 0.0927, "step": 4251 }, { "epoch": 0.27487676767676766, "grad_norm": 0.06843499094247818, "learning_rate": 0.00019667396021182988, "loss": 0.0825, "step": 4252 }, { "epoch": 0.27494141414141415, "grad_norm": 0.06754453480243683, "learning_rate": 0.00019667221092263738, "loss": 0.0861, "step": 4253 }, { "epoch": 0.2750060606060606, "grad_norm": 0.06907545030117035, "learning_rate": 0.00019667046118134084, "loss": 0.0897, "step": 4254 }, { "epoch": 0.2750707070707071, "grad_norm": 0.08253129571676254, "learning_rate": 0.00019666871098794858, "loss": 0.0896, "step": 4255 }, { "epoch": 0.2751353535353535, "grad_norm": 0.07836899906396866, "learning_rate": 0.00019666696034246868, "loss": 0.1165, "step": 4256 }, { "epoch": 0.2751353535353535, "eval_bleu": 13.948044391220451, "eval_loss": 0.09246565401554108, "eval_runtime": 2.7025, "eval_samples_per_second": 11.841, "eval_steps_per_second": 1.48, "step": 4256 }, { "epoch": 0.2752, "grad_norm": 0.05887774005532265, "learning_rate": 0.00019666520924490938, "loss": 0.0791, "step": 4257 }, { "epoch": 0.2752646464646465, "grad_norm": 0.07451488077640533, "learning_rate": 0.00019666345769527884, "loss": 0.115, "step": 4258 }, { "epoch": 0.2753292929292929, "grad_norm": 0.07511168718338013, "learning_rate": 0.00019666170569358527, "loss": 0.1021, "step": 4259 }, { "epoch": 0.2753939393939394, "grad_norm": 
0.0797128900885582, "learning_rate": 0.00019665995323983685, "loss": 0.1086, "step": 4260 }, { "epoch": 0.27545858585858585, "grad_norm": 0.09208791702985764, "learning_rate": 0.0001966582003340418, "loss": 0.1192, "step": 4261 }, { "epoch": 0.27552323232323234, "grad_norm": 0.0739036276936531, "learning_rate": 0.0001966564469762083, "loss": 0.0949, "step": 4262 }, { "epoch": 0.27558787878787877, "grad_norm": 0.08899285644292831, "learning_rate": 0.00019665469316634452, "loss": 0.0847, "step": 4263 }, { "epoch": 0.27565252525252526, "grad_norm": 0.07904451340436935, "learning_rate": 0.00019665293890445872, "loss": 0.0983, "step": 4264 }, { "epoch": 0.2757171717171717, "grad_norm": 0.07048787921667099, "learning_rate": 0.00019665118419055907, "loss": 0.0984, "step": 4265 }, { "epoch": 0.2757818181818182, "grad_norm": 0.07577227056026459, "learning_rate": 0.0001966494290246538, "loss": 0.077, "step": 4266 }, { "epoch": 0.2758464646464647, "grad_norm": 0.059506893157958984, "learning_rate": 0.0001966476734067511, "loss": 0.0818, "step": 4267 }, { "epoch": 0.2759111111111111, "grad_norm": 0.0668000876903534, "learning_rate": 0.00019664591733685916, "loss": 0.0936, "step": 4268 }, { "epoch": 0.2759757575757576, "grad_norm": 0.06816105544567108, "learning_rate": 0.00019664416081498622, "loss": 0.0985, "step": 4269 }, { "epoch": 0.27604040404040403, "grad_norm": 0.07102940231561661, "learning_rate": 0.00019664240384114053, "loss": 0.0963, "step": 4270 }, { "epoch": 0.2761050505050505, "grad_norm": 0.061725977808237076, "learning_rate": 0.0001966406464153302, "loss": 0.0749, "step": 4271 }, { "epoch": 0.27616969696969695, "grad_norm": 0.08572505414485931, "learning_rate": 0.00019663888853756358, "loss": 0.1197, "step": 4272 }, { "epoch": 0.27616969696969695, "eval_bleu": 14.99990202706812, "eval_loss": 0.09235581755638123, "eval_runtime": 2.7835, "eval_samples_per_second": 11.496, "eval_steps_per_second": 1.437, "step": 4272 }, { "epoch": 0.27623434343434344, "grad_norm": 
0.06754416227340698, "learning_rate": 0.00019663713020784878, "loss": 0.0937, "step": 4273 }, { "epoch": 0.2762989898989899, "grad_norm": 0.06085435673594475, "learning_rate": 0.0001966353714261941, "loss": 0.0917, "step": 4274 }, { "epoch": 0.27636363636363637, "grad_norm": 0.05978894606232643, "learning_rate": 0.0001966336121926077, "loss": 0.0821, "step": 4275 }, { "epoch": 0.2764282828282828, "grad_norm": 0.07794878631830215, "learning_rate": 0.00019663185250709788, "loss": 0.1126, "step": 4276 }, { "epoch": 0.2764929292929293, "grad_norm": 0.06527364999055862, "learning_rate": 0.0001966300923696728, "loss": 0.0767, "step": 4277 }, { "epoch": 0.2765575757575758, "grad_norm": 0.06329480558633804, "learning_rate": 0.00019662833178034074, "loss": 0.093, "step": 4278 }, { "epoch": 0.2766222222222222, "grad_norm": 0.0752754807472229, "learning_rate": 0.00019662657073910993, "loss": 0.1161, "step": 4279 }, { "epoch": 0.2766868686868687, "grad_norm": 0.060222890228033066, "learning_rate": 0.00019662480924598859, "loss": 0.0818, "step": 4280 }, { "epoch": 0.27675151515151514, "grad_norm": 0.06183062493801117, "learning_rate": 0.00019662304730098495, "loss": 0.0805, "step": 4281 }, { "epoch": 0.2768161616161616, "grad_norm": 0.08070927858352661, "learning_rate": 0.00019662128490410724, "loss": 0.1063, "step": 4282 }, { "epoch": 0.27688080808080806, "grad_norm": 0.06460665166378021, "learning_rate": 0.00019661952205536376, "loss": 0.0852, "step": 4283 }, { "epoch": 0.27694545454545455, "grad_norm": 0.05833077058196068, "learning_rate": 0.0001966177587547627, "loss": 0.0882, "step": 4284 }, { "epoch": 0.277010101010101, "grad_norm": 0.059798646718263626, "learning_rate": 0.00019661599500231231, "loss": 0.0902, "step": 4285 }, { "epoch": 0.2770747474747475, "grad_norm": 0.06571953743696213, "learning_rate": 0.00019661423079802087, "loss": 0.097, "step": 4286 }, { "epoch": 0.27713939393939396, "grad_norm": 0.06831257045269012, "learning_rate": 0.0001966124661418966, "loss": 
0.0878, "step": 4287 }, { "epoch": 0.2772040404040404, "grad_norm": 0.06506771594285965, "learning_rate": 0.00019661070103394777, "loss": 0.0773, "step": 4288 }, { "epoch": 0.2772040404040404, "eval_bleu": 17.090568784230705, "eval_loss": 0.09156984090805054, "eval_runtime": 2.6853, "eval_samples_per_second": 11.917, "eval_steps_per_second": 1.49, "step": 4288 }, { "epoch": 0.2772686868686869, "grad_norm": 0.05979195237159729, "learning_rate": 0.00019660893547418265, "loss": 0.0774, "step": 4289 }, { "epoch": 0.2773333333333333, "grad_norm": 0.06204604357481003, "learning_rate": 0.00019660716946260946, "loss": 0.0854, "step": 4290 }, { "epoch": 0.2773979797979798, "grad_norm": 0.06890398263931274, "learning_rate": 0.0001966054029992365, "loss": 0.0954, "step": 4291 }, { "epoch": 0.27746262626262624, "grad_norm": 0.06365709751844406, "learning_rate": 0.00019660363608407194, "loss": 0.0691, "step": 4292 }, { "epoch": 0.27752727272727273, "grad_norm": 0.07831145077943802, "learning_rate": 0.0001966018687171242, "loss": 0.1112, "step": 4293 }, { "epoch": 0.27759191919191917, "grad_norm": 0.06532642990350723, "learning_rate": 0.0001966001008984014, "loss": 0.081, "step": 4294 }, { "epoch": 0.27765656565656566, "grad_norm": 0.06608685106039047, "learning_rate": 0.00019659833262791184, "loss": 0.088, "step": 4295 }, { "epoch": 0.27772121212121215, "grad_norm": 0.09675134718418121, "learning_rate": 0.00019659656390566386, "loss": 0.0829, "step": 4296 }, { "epoch": 0.2777858585858586, "grad_norm": 0.0656266063451767, "learning_rate": 0.00019659479473166567, "loss": 0.0914, "step": 4297 }, { "epoch": 0.27785050505050507, "grad_norm": 0.07426586002111435, "learning_rate": 0.00019659302510592556, "loss": 0.1001, "step": 4298 }, { "epoch": 0.2779151515151515, "grad_norm": 0.09900396317243576, "learning_rate": 0.0001965912550284518, "loss": 0.093, "step": 4299 }, { "epoch": 0.277979797979798, "grad_norm": 0.06313488632440567, "learning_rate": 0.0001965894844992527, "loss": 
0.0967, "step": 4300 }, { "epoch": 0.2780444444444444, "grad_norm": 0.06991353631019592, "learning_rate": 0.0001965877135183365, "loss": 0.0971, "step": 4301 }, { "epoch": 0.2781090909090909, "grad_norm": 0.06899707019329071, "learning_rate": 0.00019658594208571146, "loss": 0.0987, "step": 4302 }, { "epoch": 0.27817373737373735, "grad_norm": 0.06870637089014053, "learning_rate": 0.00019658417020138594, "loss": 0.0927, "step": 4303 }, { "epoch": 0.27823838383838384, "grad_norm": 0.06643574684858322, "learning_rate": 0.00019658239786536817, "loss": 0.0857, "step": 4304 }, { "epoch": 0.27823838383838384, "eval_bleu": 14.586639612431172, "eval_loss": 0.09181059896945953, "eval_runtime": 2.8757, "eval_samples_per_second": 11.128, "eval_steps_per_second": 1.391, "step": 4304 }, { "epoch": 0.27830303030303033, "grad_norm": 0.07510837912559509, "learning_rate": 0.00019658062507766646, "loss": 0.0895, "step": 4305 }, { "epoch": 0.27836767676767676, "grad_norm": 0.06540877372026443, "learning_rate": 0.00019657885183828908, "loss": 0.0818, "step": 4306 }, { "epoch": 0.27843232323232325, "grad_norm": 0.07625401020050049, "learning_rate": 0.00019657707814724434, "loss": 0.1041, "step": 4307 }, { "epoch": 0.2784969696969697, "grad_norm": 0.08106216788291931, "learning_rate": 0.00019657530400454056, "loss": 0.1121, "step": 4308 }, { "epoch": 0.2785616161616162, "grad_norm": 0.06568999588489532, "learning_rate": 0.00019657352941018598, "loss": 0.0871, "step": 4309 }, { "epoch": 0.2786262626262626, "grad_norm": 0.06949364393949509, "learning_rate": 0.00019657175436418897, "loss": 0.0984, "step": 4310 }, { "epoch": 0.2786909090909091, "grad_norm": 0.06937297433614731, "learning_rate": 0.00019656997886655778, "loss": 0.0922, "step": 4311 }, { "epoch": 0.27875555555555553, "grad_norm": 0.06498976796865463, "learning_rate": 0.00019656820291730068, "loss": 0.0924, "step": 4312 }, { "epoch": 0.278820202020202, "grad_norm": 0.058843135833740234, "learning_rate": 0.00019656642651642606, 
"loss": 0.0763, "step": 4313 }, { "epoch": 0.27888484848484846, "grad_norm": 0.09469771385192871, "learning_rate": 0.00019656464966394217, "loss": 0.0878, "step": 4314 }, { "epoch": 0.27894949494949495, "grad_norm": 0.05855628475546837, "learning_rate": 0.00019656287235985732, "loss": 0.0769, "step": 4315 }, { "epoch": 0.27901414141414144, "grad_norm": 0.06659188121557236, "learning_rate": 0.00019656109460417987, "loss": 0.0856, "step": 4316 }, { "epoch": 0.27907878787878787, "grad_norm": 0.06598351150751114, "learning_rate": 0.00019655931639691812, "loss": 0.0969, "step": 4317 }, { "epoch": 0.27914343434343436, "grad_norm": 0.0817653238773346, "learning_rate": 0.0001965575377380803, "loss": 0.0967, "step": 4318 }, { "epoch": 0.2792080808080808, "grad_norm": 0.06330971419811249, "learning_rate": 0.00019655575862767484, "loss": 0.0899, "step": 4319 }, { "epoch": 0.2792727272727273, "grad_norm": 0.06465650349855423, "learning_rate": 0.00019655397906571004, "loss": 0.0936, "step": 4320 }, { "epoch": 0.2792727272727273, "eval_bleu": 14.333354007505713, "eval_loss": 0.09306371212005615, "eval_runtime": 2.7214, "eval_samples_per_second": 11.759, "eval_steps_per_second": 1.47, "step": 4320 }, { "epoch": 0.2793373737373737, "grad_norm": 0.07256720215082169, "learning_rate": 0.00019655219905219417, "loss": 0.0987, "step": 4321 }, { "epoch": 0.2794020202020202, "grad_norm": 0.06694379448890686, "learning_rate": 0.0001965504185871356, "loss": 0.0888, "step": 4322 }, { "epoch": 0.27946666666666664, "grad_norm": 0.06541089713573456, "learning_rate": 0.00019654863767054265, "loss": 0.0968, "step": 4323 }, { "epoch": 0.27953131313131313, "grad_norm": 0.07408475130796432, "learning_rate": 0.00019654685630242363, "loss": 0.0955, "step": 4324 }, { "epoch": 0.2795959595959596, "grad_norm": 0.06434882432222366, "learning_rate": 0.00019654507448278686, "loss": 0.0933, "step": 4325 }, { "epoch": 0.27966060606060605, "grad_norm": 0.057125017046928406, "learning_rate": 
0.0001965432922116407, "loss": 0.0791, "step": 4326 }, { "epoch": 0.27972525252525254, "grad_norm": 0.06498373299837112, "learning_rate": 0.0001965415094889935, "loss": 0.0828, "step": 4327 }, { "epoch": 0.279789898989899, "grad_norm": 0.06335562467575073, "learning_rate": 0.00019653972631485358, "loss": 0.0796, "step": 4328 }, { "epoch": 0.27985454545454547, "grad_norm": 0.057698626071214676, "learning_rate": 0.00019653794268922924, "loss": 0.077, "step": 4329 }, { "epoch": 0.2799191919191919, "grad_norm": 0.061878301203250885, "learning_rate": 0.0001965361586121289, "loss": 0.0842, "step": 4330 }, { "epoch": 0.2799838383838384, "grad_norm": 0.06939241290092468, "learning_rate": 0.00019653437408356084, "loss": 0.097, "step": 4331 }, { "epoch": 0.2800484848484848, "grad_norm": 0.05771104618906975, "learning_rate": 0.0001965325891035334, "loss": 0.0787, "step": 4332 }, { "epoch": 0.2801131313131313, "grad_norm": 0.07069814205169678, "learning_rate": 0.00019653080367205498, "loss": 0.105, "step": 4333 }, { "epoch": 0.2801777777777778, "grad_norm": 0.07531116157770157, "learning_rate": 0.0001965290177891339, "loss": 0.1061, "step": 4334 }, { "epoch": 0.28024242424242424, "grad_norm": 0.06086823344230652, "learning_rate": 0.00019652723145477852, "loss": 0.0898, "step": 4335 }, { "epoch": 0.2803070707070707, "grad_norm": 0.0644456297159195, "learning_rate": 0.00019652544466899717, "loss": 0.09, "step": 4336 }, { "epoch": 0.2803070707070707, "eval_bleu": 14.48172711037433, "eval_loss": 0.09253813326358795, "eval_runtime": 2.7143, "eval_samples_per_second": 11.789, "eval_steps_per_second": 1.474, "step": 4336 }, { "epoch": 0.28037171717171716, "grad_norm": 0.061860889196395874, "learning_rate": 0.00019652365743179825, "loss": 0.0912, "step": 4337 }, { "epoch": 0.28043636363636365, "grad_norm": 0.08942409604787827, "learning_rate": 0.00019652186974319008, "loss": 0.1097, "step": 4338 }, { "epoch": 0.2805010101010101, "grad_norm": 0.055456697940826416, "learning_rate": 
0.00019652008160318105, "loss": 0.0789, "step": 4339 }, { "epoch": 0.2805656565656566, "grad_norm": 0.06779647618532181, "learning_rate": 0.0001965182930117795, "loss": 0.0901, "step": 4340 }, { "epoch": 0.280630303030303, "grad_norm": 0.07024422287940979, "learning_rate": 0.0001965165039689938, "loss": 0.0984, "step": 4341 }, { "epoch": 0.2806949494949495, "grad_norm": 0.08629406988620758, "learning_rate": 0.0001965147144748323, "loss": 0.081, "step": 4342 }, { "epoch": 0.280759595959596, "grad_norm": 0.06911172717809677, "learning_rate": 0.0001965129245293034, "loss": 0.0906, "step": 4343 }, { "epoch": 0.2808242424242424, "grad_norm": 0.06524579226970673, "learning_rate": 0.00019651113413241548, "loss": 0.094, "step": 4344 }, { "epoch": 0.2808888888888889, "grad_norm": 0.06509728729724884, "learning_rate": 0.00019650934328417687, "loss": 0.0821, "step": 4345 }, { "epoch": 0.28095353535353534, "grad_norm": 0.065675288438797, "learning_rate": 0.00019650755198459598, "loss": 0.0848, "step": 4346 }, { "epoch": 0.28101818181818183, "grad_norm": 0.06912438571453094, "learning_rate": 0.00019650576023368119, "loss": 0.0975, "step": 4347 }, { "epoch": 0.28108282828282827, "grad_norm": 0.058166082948446274, "learning_rate": 0.00019650396803144084, "loss": 0.0801, "step": 4348 }, { "epoch": 0.28114747474747476, "grad_norm": 0.07102544605731964, "learning_rate": 0.00019650217537788334, "loss": 0.0953, "step": 4349 }, { "epoch": 0.2812121212121212, "grad_norm": 0.08068510890007019, "learning_rate": 0.00019650038227301707, "loss": 0.105, "step": 4350 }, { "epoch": 0.2812767676767677, "grad_norm": 0.07778412848711014, "learning_rate": 0.0001964985887168504, "loss": 0.0838, "step": 4351 }, { "epoch": 0.2813414141414141, "grad_norm": 0.06701838970184326, "learning_rate": 0.00019649679470939174, "loss": 0.0883, "step": 4352 }, { "epoch": 0.2813414141414141, "eval_bleu": 14.10159700192039, "eval_loss": 0.0937669649720192, "eval_runtime": 2.6934, "eval_samples_per_second": 11.881, 
"eval_steps_per_second": 1.485, "step": 4352 }, { "epoch": 0.2814060606060606, "grad_norm": 0.06856022030115128, "learning_rate": 0.00019649500025064947, "loss": 0.0919, "step": 4353 }, { "epoch": 0.2814707070707071, "grad_norm": 0.06348799169063568, "learning_rate": 0.00019649320534063198, "loss": 0.0811, "step": 4354 }, { "epoch": 0.2815353535353535, "grad_norm": 0.0736413300037384, "learning_rate": 0.0001964914099793477, "loss": 0.108, "step": 4355 }, { "epoch": 0.2816, "grad_norm": 0.07365409284830093, "learning_rate": 0.00019648961416680495, "loss": 0.1012, "step": 4356 }, { "epoch": 0.28166464646464645, "grad_norm": 0.07459267228841782, "learning_rate": 0.0001964878179030122, "loss": 0.0858, "step": 4357 }, { "epoch": 0.28172929292929294, "grad_norm": 0.0640048235654831, "learning_rate": 0.0001964860211879778, "loss": 0.0876, "step": 4358 }, { "epoch": 0.2817939393939394, "grad_norm": 0.07002885639667511, "learning_rate": 0.00019648422402171022, "loss": 0.093, "step": 4359 }, { "epoch": 0.28185858585858586, "grad_norm": 0.07678961008787155, "learning_rate": 0.00019648242640421778, "loss": 0.1066, "step": 4360 }, { "epoch": 0.2819232323232323, "grad_norm": 0.06830359995365143, "learning_rate": 0.00019648062833550896, "loss": 0.0966, "step": 4361 }, { "epoch": 0.2819878787878788, "grad_norm": 0.06086006760597229, "learning_rate": 0.00019647882981559212, "loss": 0.0797, "step": 4362 }, { "epoch": 0.2820525252525253, "grad_norm": 0.06256676465272903, "learning_rate": 0.00019647703084447568, "loss": 0.0887, "step": 4363 }, { "epoch": 0.2821171717171717, "grad_norm": 0.05955759808421135, "learning_rate": 0.0001964752314221681, "loss": 0.0736, "step": 4364 }, { "epoch": 0.2821818181818182, "grad_norm": 0.05913713946938515, "learning_rate": 0.00019647343154867772, "loss": 0.0758, "step": 4365 }, { "epoch": 0.28224646464646463, "grad_norm": 0.05970035493373871, "learning_rate": 0.000196471631224013, "loss": 0.0858, "step": 4366 }, { "epoch": 0.2823111111111111, 
"grad_norm": 0.06839437782764435, "learning_rate": 0.00019646983044818237, "loss": 0.0885, "step": 4367 }, { "epoch": 0.28237575757575756, "grad_norm": 0.07102259248495102, "learning_rate": 0.0001964680292211942, "loss": 0.0935, "step": 4368 }, { "epoch": 0.28237575757575756, "eval_bleu": 13.240661033997364, "eval_loss": 0.09453465789556503, "eval_runtime": 2.7499, "eval_samples_per_second": 11.637, "eval_steps_per_second": 1.455, "step": 4368 }, { "epoch": 0.28244040404040405, "grad_norm": 0.07567720115184784, "learning_rate": 0.00019646622754305697, "loss": 0.0991, "step": 4369 }, { "epoch": 0.2825050505050505, "grad_norm": 0.06866313517093658, "learning_rate": 0.00019646442541377908, "loss": 0.0897, "step": 4370 }, { "epoch": 0.28256969696969697, "grad_norm": 0.06221785768866539, "learning_rate": 0.00019646262283336898, "loss": 0.0914, "step": 4371 }, { "epoch": 0.28263434343434346, "grad_norm": 0.07376563549041748, "learning_rate": 0.00019646081980183509, "loss": 0.1005, "step": 4372 }, { "epoch": 0.2826989898989899, "grad_norm": 0.07264561951160431, "learning_rate": 0.0001964590163191858, "loss": 0.104, "step": 4373 }, { "epoch": 0.2827636363636364, "grad_norm": 0.0632186159491539, "learning_rate": 0.0001964572123854296, "loss": 0.078, "step": 4374 }, { "epoch": 0.2828282828282828, "grad_norm": 0.06842995434999466, "learning_rate": 0.0001964554080005749, "loss": 0.0998, "step": 4375 }, { "epoch": 0.2828929292929293, "grad_norm": 0.062108393758535385, "learning_rate": 0.00019645360316463016, "loss": 0.0752, "step": 4376 }, { "epoch": 0.28295757575757574, "grad_norm": 0.07050774991512299, "learning_rate": 0.00019645179787760377, "loss": 0.085, "step": 4377 }, { "epoch": 0.28302222222222223, "grad_norm": 0.06702041625976562, "learning_rate": 0.00019644999213950425, "loss": 0.0928, "step": 4378 }, { "epoch": 0.28308686868686866, "grad_norm": 0.06477727741003036, "learning_rate": 0.00019644818595033998, "loss": 0.0907, "step": 4379 }, { "epoch": 
0.28315151515151515, "grad_norm": 0.05960860103368759, "learning_rate": 0.00019644637931011942, "loss": 0.0876, "step": 4380 }, { "epoch": 0.28321616161616164, "grad_norm": 0.07777737081050873, "learning_rate": 0.00019644457221885104, "loss": 0.1025, "step": 4381 }, { "epoch": 0.2832808080808081, "grad_norm": 0.06811051815748215, "learning_rate": 0.0001964427646765433, "loss": 0.0935, "step": 4382 }, { "epoch": 0.28334545454545457, "grad_norm": 0.06178711727261543, "learning_rate": 0.0001964409566832046, "loss": 0.0801, "step": 4383 }, { "epoch": 0.283410101010101, "grad_norm": 0.06660346686840057, "learning_rate": 0.00019643914823884342, "loss": 0.093, "step": 4384 }, { "epoch": 0.283410101010101, "eval_bleu": 13.503905398349396, "eval_loss": 0.09409144520759583, "eval_runtime": 2.6955, "eval_samples_per_second": 11.872, "eval_steps_per_second": 1.484, "step": 4384 }, { "epoch": 0.2834747474747475, "grad_norm": 0.07667375355958939, "learning_rate": 0.00019643733934346825, "loss": 0.0918, "step": 4385 }, { "epoch": 0.2835393939393939, "grad_norm": 0.0823507308959961, "learning_rate": 0.00019643552999708752, "loss": 0.0943, "step": 4386 }, { "epoch": 0.2836040404040404, "grad_norm": 0.0671856701374054, "learning_rate": 0.0001964337201997097, "loss": 0.0764, "step": 4387 }, { "epoch": 0.28366868686868685, "grad_norm": 0.0804782509803772, "learning_rate": 0.00019643190995134323, "loss": 0.1021, "step": 4388 }, { "epoch": 0.28373333333333334, "grad_norm": 0.07721947133541107, "learning_rate": 0.00019643009925199664, "loss": 0.1041, "step": 4389 }, { "epoch": 0.28379797979797977, "grad_norm": 0.07023712247610092, "learning_rate": 0.00019642828810167835, "loss": 0.0951, "step": 4390 }, { "epoch": 0.28386262626262626, "grad_norm": 0.06833957880735397, "learning_rate": 0.00019642647650039682, "loss": 0.104, "step": 4391 }, { "epoch": 0.28392727272727275, "grad_norm": 0.07807779312133789, "learning_rate": 0.00019642466444816053, "loss": 0.1184, "step": 4392 }, { "epoch": 
0.2839919191919192, "grad_norm": 0.06609499454498291, "learning_rate": 0.00019642285194497797, "loss": 0.0804, "step": 4393 }, { "epoch": 0.2840565656565657, "grad_norm": 0.07198118418455124, "learning_rate": 0.00019642103899085756, "loss": 0.1014, "step": 4394 }, { "epoch": 0.2841212121212121, "grad_norm": 0.07774338871240616, "learning_rate": 0.0001964192255858079, "loss": 0.0968, "step": 4395 }, { "epoch": 0.2841858585858586, "grad_norm": 0.06879827380180359, "learning_rate": 0.00019641741172983737, "loss": 0.0944, "step": 4396 }, { "epoch": 0.28425050505050503, "grad_norm": 0.05825714394450188, "learning_rate": 0.00019641559742295447, "loss": 0.0841, "step": 4397 }, { "epoch": 0.2843151515151515, "grad_norm": 0.07056107372045517, "learning_rate": 0.0001964137826651677, "loss": 0.0914, "step": 4398 }, { "epoch": 0.28437979797979795, "grad_norm": 0.06801684945821762, "learning_rate": 0.00019641196745648557, "loss": 0.0967, "step": 4399 }, { "epoch": 0.28444444444444444, "grad_norm": 0.0754716619849205, "learning_rate": 0.0001964101517969165, "loss": 0.0923, "step": 4400 }, { "epoch": 0.28444444444444444, "eval_bleu": 13.84754353500129, "eval_loss": 0.09553204476833344, "eval_runtime": 2.7286, "eval_samples_per_second": 11.728, "eval_steps_per_second": 1.466, "step": 4400 }, { "epoch": 0.28450909090909093, "grad_norm": 0.08046947419643402, "learning_rate": 0.00019640833568646904, "loss": 0.0999, "step": 4401 }, { "epoch": 0.28457373737373737, "grad_norm": 0.0713997632265091, "learning_rate": 0.00019640651912515165, "loss": 0.0956, "step": 4402 }, { "epoch": 0.28463838383838386, "grad_norm": 0.0697353333234787, "learning_rate": 0.0001964047021129729, "loss": 0.0933, "step": 4403 }, { "epoch": 0.2847030303030303, "grad_norm": 0.07646799087524414, "learning_rate": 0.00019640288464994117, "loss": 0.1084, "step": 4404 }, { "epoch": 0.2847676767676768, "grad_norm": 0.06223762780427933, "learning_rate": 0.00019640106673606504, "loss": 0.0757, "step": 4405 }, { "epoch": 
0.2848323232323232, "grad_norm": 0.08348898589611053, "learning_rate": 0.00019639924837135299, "loss": 0.0979, "step": 4406 }, { "epoch": 0.2848969696969697, "grad_norm": 0.08296437561511993, "learning_rate": 0.0001963974295558135, "loss": 0.0816, "step": 4407 }, { "epoch": 0.28496161616161614, "grad_norm": 0.06300361454486847, "learning_rate": 0.00019639561028945513, "loss": 0.0974, "step": 4408 }, { "epoch": 0.2850262626262626, "grad_norm": 0.07703543454408646, "learning_rate": 0.00019639379057228635, "loss": 0.09, "step": 4409 }, { "epoch": 0.2850909090909091, "grad_norm": 0.06007029861211777, "learning_rate": 0.00019639197040431568, "loss": 0.0684, "step": 4410 }, { "epoch": 0.28515555555555555, "grad_norm": 0.06998424232006073, "learning_rate": 0.00019639014978555165, "loss": 0.1021, "step": 4411 }, { "epoch": 0.28522020202020204, "grad_norm": 0.06395740061998367, "learning_rate": 0.0001963883287160027, "loss": 0.0878, "step": 4412 }, { "epoch": 0.2852848484848485, "grad_norm": 0.05911776423454285, "learning_rate": 0.00019638650719567747, "loss": 0.0719, "step": 4413 }, { "epoch": 0.28534949494949496, "grad_norm": 0.07202507555484772, "learning_rate": 0.00019638468522458434, "loss": 0.0884, "step": 4414 }, { "epoch": 0.2854141414141414, "grad_norm": 0.0732078030705452, "learning_rate": 0.00019638286280273195, "loss": 0.1066, "step": 4415 }, { "epoch": 0.2854787878787879, "grad_norm": 0.06537657976150513, "learning_rate": 0.00019638103993012879, "loss": 0.075, "step": 4416 }, { "epoch": 0.2854787878787879, "eval_bleu": 14.100920826305225, "eval_loss": 0.0930790975689888, "eval_runtime": 2.8647, "eval_samples_per_second": 11.171, "eval_steps_per_second": 1.396, "step": 4416 }, { "epoch": 0.2855434343434343, "grad_norm": 0.07066395878791809, "learning_rate": 0.00019637921660678332, "loss": 0.082, "step": 4417 }, { "epoch": 0.2856080808080808, "grad_norm": 0.06402572989463806, "learning_rate": 0.00019637739283270413, "loss": 0.0834, "step": 4418 }, { "epoch": 
0.2856727272727273, "grad_norm": 0.06865181773900986, "learning_rate": 0.00019637556860789974, "loss": 0.1013, "step": 4419 }, { "epoch": 0.28573737373737373, "grad_norm": 0.07499369978904724, "learning_rate": 0.00019637374393237868, "loss": 0.1034, "step": 4420 }, { "epoch": 0.2858020202020202, "grad_norm": 0.07298306375741959, "learning_rate": 0.0001963719188061495, "loss": 0.0932, "step": 4421 }, { "epoch": 0.28586666666666666, "grad_norm": 0.057881537824869156, "learning_rate": 0.0001963700932292207, "loss": 0.0676, "step": 4422 }, { "epoch": 0.28593131313131315, "grad_norm": 0.060981057584285736, "learning_rate": 0.0001963682672016008, "loss": 0.0888, "step": 4423 }, { "epoch": 0.2859959595959596, "grad_norm": 0.06692329049110413, "learning_rate": 0.00019636644072329842, "loss": 0.0888, "step": 4424 }, { "epoch": 0.28606060606060607, "grad_norm": 0.08159921318292618, "learning_rate": 0.00019636461379432201, "loss": 0.1023, "step": 4425 }, { "epoch": 0.2861252525252525, "grad_norm": 0.0881388932466507, "learning_rate": 0.0001963627864146802, "loss": 0.1158, "step": 4426 }, { "epoch": 0.286189898989899, "grad_norm": 0.06777847558259964, "learning_rate": 0.0001963609585843815, "loss": 0.0921, "step": 4427 }, { "epoch": 0.28625454545454543, "grad_norm": 0.06777142733335495, "learning_rate": 0.00019635913030343442, "loss": 0.0937, "step": 4428 }, { "epoch": 0.2863191919191919, "grad_norm": 0.060988642275333405, "learning_rate": 0.00019635730157184758, "loss": 0.0873, "step": 4429 }, { "epoch": 0.2863838383838384, "grad_norm": 0.054307758808135986, "learning_rate": 0.00019635547238962945, "loss": 0.0685, "step": 4430 }, { "epoch": 0.28644848484848484, "grad_norm": 0.06294847279787064, "learning_rate": 0.00019635364275678865, "loss": 0.0904, "step": 4431 }, { "epoch": 0.28651313131313133, "grad_norm": 0.06527037173509598, "learning_rate": 0.00019635181267333373, "loss": 0.0842, "step": 4432 }, { "epoch": 0.28651313131313133, "eval_bleu": 13.80523211003128, 
"eval_loss": 0.09447641670703888, "eval_runtime": 2.7065, "eval_samples_per_second": 11.824, "eval_steps_per_second": 1.478, "step": 4432 }, { "epoch": 0.28657777777777776, "grad_norm": 0.0796068087220192, "learning_rate": 0.00019634998213927322, "loss": 0.1319, "step": 4433 }, { "epoch": 0.28664242424242425, "grad_norm": 0.06002984941005707, "learning_rate": 0.00019634815115461568, "loss": 0.087, "step": 4434 }, { "epoch": 0.2867070707070707, "grad_norm": 0.09939252585172653, "learning_rate": 0.00019634631971936973, "loss": 0.1247, "step": 4435 }, { "epoch": 0.2867717171717172, "grad_norm": 0.0634773001074791, "learning_rate": 0.00019634448783354387, "loss": 0.0832, "step": 4436 }, { "epoch": 0.2868363636363636, "grad_norm": 0.07285909354686737, "learning_rate": 0.0001963426554971467, "loss": 0.0955, "step": 4437 }, { "epoch": 0.2869010101010101, "grad_norm": 0.06427568197250366, "learning_rate": 0.0001963408227101868, "loss": 0.0816, "step": 4438 }, { "epoch": 0.2869656565656566, "grad_norm": 0.0669107660651207, "learning_rate": 0.0001963389894726727, "loss": 0.0962, "step": 4439 }, { "epoch": 0.287030303030303, "grad_norm": 0.07926435768604279, "learning_rate": 0.00019633715578461298, "loss": 0.099, "step": 4440 }, { "epoch": 0.2870949494949495, "grad_norm": 0.061039771884679794, "learning_rate": 0.00019633532164601625, "loss": 0.0835, "step": 4441 }, { "epoch": 0.28715959595959595, "grad_norm": 0.06401325017213821, "learning_rate": 0.00019633348705689105, "loss": 0.0908, "step": 4442 }, { "epoch": 0.28722424242424244, "grad_norm": 0.05776556208729744, "learning_rate": 0.00019633165201724597, "loss": 0.0649, "step": 4443 }, { "epoch": 0.28728888888888887, "grad_norm": 0.07252118736505508, "learning_rate": 0.00019632981652708965, "loss": 0.0922, "step": 4444 }, { "epoch": 0.28735353535353536, "grad_norm": 0.07046280801296234, "learning_rate": 0.00019632798058643057, "loss": 0.0909, "step": 4445 }, { "epoch": 0.2874181818181818, "grad_norm": 0.06610973924398422, 
"learning_rate": 0.0001963261441952774, "loss": 0.0893, "step": 4446 }, { "epoch": 0.2874828282828283, "grad_norm": 0.06593465059995651, "learning_rate": 0.00019632430735363871, "loss": 0.0853, "step": 4447 }, { "epoch": 0.2875474747474748, "grad_norm": 0.0564500167965889, "learning_rate": 0.00019632247006152307, "loss": 0.0747, "step": 4448 }, { "epoch": 0.2875474747474748, "eval_bleu": 16.61669271709447, "eval_loss": 0.0922250747680664, "eval_runtime": 2.8013, "eval_samples_per_second": 11.423, "eval_steps_per_second": 1.428, "step": 4448 }, { "epoch": 0.2876121212121212, "grad_norm": 0.06765074282884598, "learning_rate": 0.00019632063231893903, "loss": 0.0879, "step": 4449 }, { "epoch": 0.2876767676767677, "grad_norm": 0.05686028674244881, "learning_rate": 0.00019631879412589532, "loss": 0.0869, "step": 4450 }, { "epoch": 0.28774141414141413, "grad_norm": 0.0712445080280304, "learning_rate": 0.00019631695548240038, "loss": 0.1143, "step": 4451 }, { "epoch": 0.2878060606060606, "grad_norm": 0.06979040056467056, "learning_rate": 0.00019631511638846293, "loss": 0.1033, "step": 4452 }, { "epoch": 0.28787070707070705, "grad_norm": 0.06922651082277298, "learning_rate": 0.00019631327684409148, "loss": 0.089, "step": 4453 }, { "epoch": 0.28793535353535354, "grad_norm": 0.06405875086784363, "learning_rate": 0.0001963114368492947, "loss": 0.0887, "step": 4454 }, { "epoch": 0.288, "grad_norm": 0.06777805089950562, "learning_rate": 0.00019630959640408118, "loss": 0.0821, "step": 4455 }, { "epoch": 0.28806464646464647, "grad_norm": 0.06872130185365677, "learning_rate": 0.0001963077555084595, "loss": 0.0959, "step": 4456 }, { "epoch": 0.2881292929292929, "grad_norm": 0.06281452625989914, "learning_rate": 0.00019630591416243832, "loss": 0.0891, "step": 4457 }, { "epoch": 0.2881939393939394, "grad_norm": 0.05928211659193039, "learning_rate": 0.00019630407236602619, "loss": 0.0871, "step": 4458 }, { "epoch": 0.2882585858585859, "grad_norm": 0.05711644887924194, "learning_rate": 
0.00019630223011923175, "loss": 0.0777, "step": 4459 }, { "epoch": 0.2883232323232323, "grad_norm": 0.08966274559497833, "learning_rate": 0.00019630038742206362, "loss": 0.0908, "step": 4460 }, { "epoch": 0.2883878787878788, "grad_norm": 0.07473871111869812, "learning_rate": 0.0001962985442745304, "loss": 0.0938, "step": 4461 }, { "epoch": 0.28845252525252524, "grad_norm": 0.0743502676486969, "learning_rate": 0.00019629670067664075, "loss": 0.1059, "step": 4462 }, { "epoch": 0.2885171717171717, "grad_norm": 0.068879134953022, "learning_rate": 0.00019629485662840324, "loss": 0.0868, "step": 4463 }, { "epoch": 0.28858181818181816, "grad_norm": 0.059188731014728546, "learning_rate": 0.00019629301212982655, "loss": 0.0834, "step": 4464 }, { "epoch": 0.28858181818181816, "eval_bleu": 15.642684770728016, "eval_loss": 0.09318391978740692, "eval_runtime": 2.8318, "eval_samples_per_second": 11.3, "eval_steps_per_second": 1.413, "step": 4464 }, { "epoch": 0.28864646464646465, "grad_norm": 0.07137424498796463, "learning_rate": 0.00019629116718091925, "loss": 0.1055, "step": 4465 }, { "epoch": 0.2887111111111111, "grad_norm": 0.061770759522914886, "learning_rate": 0.00019628932178169, "loss": 0.08, "step": 4466 }, { "epoch": 0.2887757575757576, "grad_norm": 0.07105202972888947, "learning_rate": 0.0001962874759321474, "loss": 0.1008, "step": 4467 }, { "epoch": 0.28884040404040406, "grad_norm": 0.0716095119714737, "learning_rate": 0.00019628562963230014, "loss": 0.0935, "step": 4468 }, { "epoch": 0.2889050505050505, "grad_norm": 0.07197891920804977, "learning_rate": 0.00019628378288215684, "loss": 0.0944, "step": 4469 }, { "epoch": 0.288969696969697, "grad_norm": 0.06876778602600098, "learning_rate": 0.00019628193568172608, "loss": 0.0809, "step": 4470 }, { "epoch": 0.2890343434343434, "grad_norm": 0.06736275553703308, "learning_rate": 0.00019628008803101654, "loss": 0.0846, "step": 4471 }, { "epoch": 0.2890989898989899, "grad_norm": 0.06846721470355988, "learning_rate": 
0.00019627823993003686, "loss": 0.0922, "step": 4472 }, { "epoch": 0.28916363636363634, "grad_norm": 0.07255737483501434, "learning_rate": 0.0001962763913787957, "loss": 0.0789, "step": 4473 }, { "epoch": 0.28922828282828283, "grad_norm": 0.06241066753864288, "learning_rate": 0.00019627454237730168, "loss": 0.0843, "step": 4474 }, { "epoch": 0.28929292929292927, "grad_norm": 0.07016219943761826, "learning_rate": 0.00019627269292556344, "loss": 0.1055, "step": 4475 }, { "epoch": 0.28935757575757576, "grad_norm": 0.0690203458070755, "learning_rate": 0.00019627084302358965, "loss": 0.0945, "step": 4476 }, { "epoch": 0.28942222222222225, "grad_norm": 0.06753858923912048, "learning_rate": 0.00019626899267138898, "loss": 0.0889, "step": 4477 }, { "epoch": 0.2894868686868687, "grad_norm": 0.07447067648172379, "learning_rate": 0.00019626714186897002, "loss": 0.0981, "step": 4478 }, { "epoch": 0.28955151515151517, "grad_norm": 0.06929203122854233, "learning_rate": 0.00019626529061634148, "loss": 0.0843, "step": 4479 }, { "epoch": 0.2896161616161616, "grad_norm": 0.07303477078676224, "learning_rate": 0.00019626343891351197, "loss": 0.1005, "step": 4480 }, { "epoch": 0.2896161616161616, "eval_bleu": 13.938958960526056, "eval_loss": 0.09255760163068771, "eval_runtime": 2.7465, "eval_samples_per_second": 11.651, "eval_steps_per_second": 1.456, "step": 4480 }, { "epoch": 0.2896808080808081, "grad_norm": 0.07850413024425507, "learning_rate": 0.00019626158676049022, "loss": 0.1163, "step": 4481 }, { "epoch": 0.28974545454545453, "grad_norm": 0.07225518673658371, "learning_rate": 0.00019625973415728486, "loss": 0.0835, "step": 4482 }, { "epoch": 0.289810101010101, "grad_norm": 0.06698223203420639, "learning_rate": 0.0001962578811039045, "loss": 0.0815, "step": 4483 }, { "epoch": 0.28987474747474745, "grad_norm": 0.06548656523227692, "learning_rate": 0.00019625602760035786, "loss": 0.0832, "step": 4484 }, { "epoch": 0.28993939393939394, "grad_norm": 0.06506742537021637, 
"learning_rate": 0.00019625417364665362, "loss": 0.0913, "step": 4485 }, { "epoch": 0.29000404040404043, "grad_norm": 0.06188466399908066, "learning_rate": 0.00019625231924280044, "loss": 0.0865, "step": 4486 }, { "epoch": 0.29006868686868686, "grad_norm": 0.06359705328941345, "learning_rate": 0.00019625046438880698, "loss": 0.0872, "step": 4487 }, { "epoch": 0.29013333333333335, "grad_norm": 0.05754384398460388, "learning_rate": 0.00019624860908468188, "loss": 0.0831, "step": 4488 }, { "epoch": 0.2901979797979798, "grad_norm": 0.11809027940034866, "learning_rate": 0.00019624675333043389, "loss": 0.0974, "step": 4489 }, { "epoch": 0.2902626262626263, "grad_norm": 0.06136356666684151, "learning_rate": 0.00019624489712607166, "loss": 0.0793, "step": 4490 }, { "epoch": 0.2903272727272727, "grad_norm": 0.05404827371239662, "learning_rate": 0.00019624304047160385, "loss": 0.0766, "step": 4491 }, { "epoch": 0.2903919191919192, "grad_norm": 0.07475696504116058, "learning_rate": 0.00019624118336703913, "loss": 0.0974, "step": 4492 }, { "epoch": 0.29045656565656564, "grad_norm": 0.059779733419418335, "learning_rate": 0.00019623932581238626, "loss": 0.0855, "step": 4493 }, { "epoch": 0.2905212121212121, "grad_norm": 0.06780587881803513, "learning_rate": 0.00019623746780765387, "loss": 0.0884, "step": 4494 }, { "epoch": 0.29058585858585856, "grad_norm": 0.06374438852071762, "learning_rate": 0.00019623560935285063, "loss": 0.0952, "step": 4495 }, { "epoch": 0.29065050505050505, "grad_norm": 0.08313052356243134, "learning_rate": 0.00019623375044798528, "loss": 0.1256, "step": 4496 }, { "epoch": 0.29065050505050505, "eval_bleu": 14.767739999072903, "eval_loss": 0.09278881549835205, "eval_runtime": 2.8711, "eval_samples_per_second": 11.146, "eval_steps_per_second": 1.393, "step": 4496 }, { "epoch": 0.29071515151515154, "grad_norm": 0.0713055208325386, "learning_rate": 0.0001962318910930665, "loss": 0.0965, "step": 4497 }, { "epoch": 0.29077979797979797, "grad_norm": 
0.06727954745292664, "learning_rate": 0.00019623003128810295, "loss": 0.0965, "step": 4498 }, { "epoch": 0.29084444444444446, "grad_norm": 0.06295987218618393, "learning_rate": 0.0001962281710331034, "loss": 0.0669, "step": 4499 }, { "epoch": 0.2909090909090909, "grad_norm": 0.06775689125061035, "learning_rate": 0.00019622631032807647, "loss": 0.0909, "step": 4500 }, { "epoch": 0.2909737373737374, "grad_norm": 0.06560097634792328, "learning_rate": 0.0001962244491730309, "loss": 0.0941, "step": 4501 }, { "epoch": 0.2910383838383838, "grad_norm": 0.05855448544025421, "learning_rate": 0.00019622258756797541, "loss": 0.0808, "step": 4502 }, { "epoch": 0.2911030303030303, "grad_norm": 0.06965626776218414, "learning_rate": 0.0001962207255129187, "loss": 0.1066, "step": 4503 }, { "epoch": 0.29116767676767674, "grad_norm": 0.07694855332374573, "learning_rate": 0.00019621886300786944, "loss": 0.1066, "step": 4504 }, { "epoch": 0.29123232323232323, "grad_norm": 0.07424485683441162, "learning_rate": 0.00019621700005283637, "loss": 0.0749, "step": 4505 }, { "epoch": 0.2912969696969697, "grad_norm": 0.07173068821430206, "learning_rate": 0.0001962151366478282, "loss": 0.0906, "step": 4506 }, { "epoch": 0.29136161616161615, "grad_norm": 0.06268858164548874, "learning_rate": 0.00019621327279285367, "loss": 0.084, "step": 4507 }, { "epoch": 0.29142626262626264, "grad_norm": 0.06644877046346664, "learning_rate": 0.00019621140848792142, "loss": 0.1096, "step": 4508 }, { "epoch": 0.2914909090909091, "grad_norm": 0.06466013938188553, "learning_rate": 0.00019620954373304026, "loss": 0.0885, "step": 4509 }, { "epoch": 0.29155555555555557, "grad_norm": 0.08469715714454651, "learning_rate": 0.00019620767852821885, "loss": 0.1143, "step": 4510 }, { "epoch": 0.291620202020202, "grad_norm": 0.05712594464421272, "learning_rate": 0.00019620581287346596, "loss": 0.0915, "step": 4511 }, { "epoch": 0.2916848484848485, "grad_norm": 0.06825772672891617, "learning_rate": 0.00019620394676879026, 
"loss": 0.0928, "step": 4512 }, { "epoch": 0.2916848484848485, "eval_bleu": 17.286629611609083, "eval_loss": 0.09411090612411499, "eval_runtime": 2.7429, "eval_samples_per_second": 11.667, "eval_steps_per_second": 1.458, "step": 4512 }, { "epoch": 0.2917494949494949, "grad_norm": 0.0751626268029213, "learning_rate": 0.00019620208021420052, "loss": 0.125, "step": 4513 }, { "epoch": 0.2918141414141414, "grad_norm": 0.10134122520685196, "learning_rate": 0.00019620021320970545, "loss": 0.0795, "step": 4514 }, { "epoch": 0.2918787878787879, "grad_norm": 0.06724418699741364, "learning_rate": 0.00019619834575531378, "loss": 0.1045, "step": 4515 }, { "epoch": 0.29194343434343434, "grad_norm": 0.05141282081604004, "learning_rate": 0.00019619647785103426, "loss": 0.066, "step": 4516 }, { "epoch": 0.2920080808080808, "grad_norm": 0.061379093676805496, "learning_rate": 0.00019619460949687562, "loss": 0.0874, "step": 4517 }, { "epoch": 0.29207272727272726, "grad_norm": 0.060092389583587646, "learning_rate": 0.00019619274069284658, "loss": 0.0767, "step": 4518 }, { "epoch": 0.29213737373737375, "grad_norm": 0.06881596893072128, "learning_rate": 0.00019619087143895588, "loss": 0.0974, "step": 4519 }, { "epoch": 0.2922020202020202, "grad_norm": 0.062164973467588425, "learning_rate": 0.0001961890017352123, "loss": 0.088, "step": 4520 }, { "epoch": 0.2922666666666667, "grad_norm": 0.07124926149845123, "learning_rate": 0.00019618713158162457, "loss": 0.0942, "step": 4521 }, { "epoch": 0.2923313131313131, "grad_norm": 0.07605507224798203, "learning_rate": 0.00019618526097820138, "loss": 0.0996, "step": 4522 }, { "epoch": 0.2923959595959596, "grad_norm": 0.09574977308511734, "learning_rate": 0.00019618338992495157, "loss": 0.0942, "step": 4523 }, { "epoch": 0.2924606060606061, "grad_norm": 0.0649600401520729, "learning_rate": 0.00019618151842188382, "loss": 0.0952, "step": 4524 }, { "epoch": 0.2925252525252525, "grad_norm": 0.1014251634478569, "learning_rate": 0.00019617964646900687, 
"loss": 0.0861, "step": 4525 }, { "epoch": 0.292589898989899, "grad_norm": 0.04904953017830849, "learning_rate": 0.00019617777406632955, "loss": 0.0632, "step": 4526 }, { "epoch": 0.29265454545454544, "grad_norm": 0.06114598363637924, "learning_rate": 0.00019617590121386058, "loss": 0.0842, "step": 4527 }, { "epoch": 0.29271919191919193, "grad_norm": 0.0655544251203537, "learning_rate": 0.0001961740279116087, "loss": 0.0811, "step": 4528 }, { "epoch": 0.29271919191919193, "eval_bleu": 18.58902217675777, "eval_loss": 0.09360867738723755, "eval_runtime": 2.7774, "eval_samples_per_second": 11.521, "eval_steps_per_second": 1.44, "step": 4528 }, { "epoch": 0.29278383838383837, "grad_norm": 0.06083182245492935, "learning_rate": 0.0001961721541595827, "loss": 0.0793, "step": 4529 }, { "epoch": 0.29284848484848486, "grad_norm": 0.08451911062002182, "learning_rate": 0.0001961702799577913, "loss": 0.1057, "step": 4530 }, { "epoch": 0.2929131313131313, "grad_norm": 0.06429535150527954, "learning_rate": 0.00019616840530624333, "loss": 0.0913, "step": 4531 }, { "epoch": 0.2929777777777778, "grad_norm": 0.06467705965042114, "learning_rate": 0.0001961665302049475, "loss": 0.0883, "step": 4532 }, { "epoch": 0.2930424242424242, "grad_norm": 0.07456497848033905, "learning_rate": 0.0001961646546539126, "loss": 0.1085, "step": 4533 }, { "epoch": 0.2931070707070707, "grad_norm": 0.07370718568563461, "learning_rate": 0.00019616277865314744, "loss": 0.0974, "step": 4534 }, { "epoch": 0.2931717171717172, "grad_norm": 0.07023972272872925, "learning_rate": 0.0001961609022026607, "loss": 0.0918, "step": 4535 }, { "epoch": 0.29323636363636363, "grad_norm": 0.06576091796159744, "learning_rate": 0.00019615902530246125, "loss": 0.0892, "step": 4536 }, { "epoch": 0.2933010101010101, "grad_norm": 0.10609713196754456, "learning_rate": 0.00019615714795255778, "loss": 0.1272, "step": 4537 }, { "epoch": 0.29336565656565655, "grad_norm": 0.06828856468200684, "learning_rate": 0.00019615527015295916, 
"loss": 0.0862, "step": 4538 }, { "epoch": 0.29343030303030304, "grad_norm": 0.05812883749604225, "learning_rate": 0.00019615339190367414, "loss": 0.0782, "step": 4539 }, { "epoch": 0.2934949494949495, "grad_norm": 0.09120214730501175, "learning_rate": 0.00019615151320471146, "loss": 0.11, "step": 4540 }, { "epoch": 0.29355959595959596, "grad_norm": 0.0586036778986454, "learning_rate": 0.00019614963405607995, "loss": 0.0738, "step": 4541 }, { "epoch": 0.2936242424242424, "grad_norm": 0.06411392986774445, "learning_rate": 0.0001961477544577884, "loss": 0.0754, "step": 4542 }, { "epoch": 0.2936888888888889, "grad_norm": 0.058797746896743774, "learning_rate": 0.00019614587440984558, "loss": 0.0762, "step": 4543 }, { "epoch": 0.2937535353535354, "grad_norm": 0.06301523000001907, "learning_rate": 0.00019614399391226027, "loss": 0.0976, "step": 4544 }, { "epoch": 0.2937535353535354, "eval_bleu": 17.05628275009619, "eval_loss": 0.09224610030651093, "eval_runtime": 2.6654, "eval_samples_per_second": 12.006, "eval_steps_per_second": 1.501, "step": 4544 }, { "epoch": 0.2938181818181818, "grad_norm": 0.06774929910898209, "learning_rate": 0.00019614211296504133, "loss": 0.0975, "step": 4545 }, { "epoch": 0.2938828282828283, "grad_norm": 0.07908044010400772, "learning_rate": 0.00019614023156819748, "loss": 0.0853, "step": 4546 }, { "epoch": 0.29394747474747474, "grad_norm": 0.06701932102441788, "learning_rate": 0.00019613834972173754, "loss": 0.0878, "step": 4547 }, { "epoch": 0.2940121212121212, "grad_norm": 0.06733550131320953, "learning_rate": 0.00019613646742567035, "loss": 0.098, "step": 4548 }, { "epoch": 0.29407676767676766, "grad_norm": 0.07471128553152084, "learning_rate": 0.00019613458468000468, "loss": 0.0917, "step": 4549 }, { "epoch": 0.29414141414141415, "grad_norm": 0.058018557727336884, "learning_rate": 0.00019613270148474931, "loss": 0.0786, "step": 4550 }, { "epoch": 0.2942060606060606, "grad_norm": 0.08356033265590668, "learning_rate": 0.0001961308178399131, 
"loss": 0.0853, "step": 4551 }, { "epoch": 0.29427070707070707, "grad_norm": 0.07202444225549698, "learning_rate": 0.00019612893374550483, "loss": 0.0939, "step": 4552 }, { "epoch": 0.29433535353535356, "grad_norm": 0.06706680357456207, "learning_rate": 0.00019612704920153332, "loss": 0.1023, "step": 4553 }, { "epoch": 0.2944, "grad_norm": 0.07510372996330261, "learning_rate": 0.0001961251642080074, "loss": 0.1065, "step": 4554 }, { "epoch": 0.2944646464646465, "grad_norm": 0.05924130231142044, "learning_rate": 0.0001961232787649358, "loss": 0.0764, "step": 4555 }, { "epoch": 0.2945292929292929, "grad_norm": 0.06250309944152832, "learning_rate": 0.00019612139287232747, "loss": 0.0809, "step": 4556 }, { "epoch": 0.2945939393939394, "grad_norm": 0.0651947483420372, "learning_rate": 0.00019611950653019113, "loss": 0.0852, "step": 4557 }, { "epoch": 0.29465858585858584, "grad_norm": 0.06940531730651855, "learning_rate": 0.00019611761973853566, "loss": 0.0825, "step": 4558 }, { "epoch": 0.29472323232323233, "grad_norm": 0.06890036910772324, "learning_rate": 0.00019611573249736982, "loss": 0.0871, "step": 4559 }, { "epoch": 0.29478787878787877, "grad_norm": 0.06867414712905884, "learning_rate": 0.0001961138448067025, "loss": 0.0949, "step": 4560 }, { "epoch": 0.29478787878787877, "eval_bleu": 15.953762655402342, "eval_loss": 0.09117183089256287, "eval_runtime": 2.6845, "eval_samples_per_second": 11.92, "eval_steps_per_second": 1.49, "step": 4560 }, { "epoch": 0.29485252525252525, "grad_norm": 0.06641063839197159, "learning_rate": 0.00019611195666654249, "loss": 0.0918, "step": 4561 }, { "epoch": 0.29491717171717174, "grad_norm": 0.07135960459709167, "learning_rate": 0.00019611006807689866, "loss": 0.0918, "step": 4562 }, { "epoch": 0.2949818181818182, "grad_norm": 0.06634759157896042, "learning_rate": 0.00019610817903777981, "loss": 0.0789, "step": 4563 }, { "epoch": 0.29504646464646467, "grad_norm": 0.08527235686779022, "learning_rate": 0.00019610628954919474, "loss": 
0.1055, "step": 4564 }, { "epoch": 0.2951111111111111, "grad_norm": 0.08668415248394012, "learning_rate": 0.00019610439961115236, "loss": 0.1067, "step": 4565 }, { "epoch": 0.2951757575757576, "grad_norm": 0.06364767998456955, "learning_rate": 0.00019610250922366148, "loss": 0.0833, "step": 4566 }, { "epoch": 0.295240404040404, "grad_norm": 0.07508257031440735, "learning_rate": 0.0001961006183867309, "loss": 0.1044, "step": 4567 }, { "epoch": 0.2953050505050505, "grad_norm": 0.05892026796936989, "learning_rate": 0.00019609872710036954, "loss": 0.0795, "step": 4568 }, { "epoch": 0.29536969696969695, "grad_norm": 0.06696969270706177, "learning_rate": 0.00019609683536458617, "loss": 0.0906, "step": 4569 }, { "epoch": 0.29543434343434344, "grad_norm": 0.0762874111533165, "learning_rate": 0.0001960949431793897, "loss": 0.0985, "step": 4570 }, { "epoch": 0.29549898989898987, "grad_norm": 0.06808783113956451, "learning_rate": 0.00019609305054478892, "loss": 0.0855, "step": 4571 }, { "epoch": 0.29556363636363636, "grad_norm": 0.06726190447807312, "learning_rate": 0.00019609115746079269, "loss": 0.1051, "step": 4572 }, { "epoch": 0.29562828282828285, "grad_norm": 0.05773718282580376, "learning_rate": 0.00019608926392740994, "loss": 0.0765, "step": 4573 }, { "epoch": 0.2956929292929293, "grad_norm": 0.07584203034639359, "learning_rate": 0.00019608736994464944, "loss": 0.1132, "step": 4574 }, { "epoch": 0.2957575757575758, "grad_norm": 0.06285490840673447, "learning_rate": 0.00019608547551252007, "loss": 0.0832, "step": 4575 }, { "epoch": 0.2958222222222222, "grad_norm": 0.06950333714485168, "learning_rate": 0.00019608358063103072, "loss": 0.0871, "step": 4576 }, { "epoch": 0.2958222222222222, "eval_bleu": 16.400183046369527, "eval_loss": 0.09258623421192169, "eval_runtime": 2.6873, "eval_samples_per_second": 11.908, "eval_steps_per_second": 1.488, "step": 4576 }, { "epoch": 0.2958868686868687, "grad_norm": 0.06954176723957062, "learning_rate": 0.0001960816853001902, "loss": 
0.0964, "step": 4577 }, { "epoch": 0.29595151515151513, "grad_norm": 0.07281685620546341, "learning_rate": 0.00019607978952000744, "loss": 0.1175, "step": 4578 }, { "epoch": 0.2960161616161616, "grad_norm": 0.06163204088807106, "learning_rate": 0.00019607789329049123, "loss": 0.0824, "step": 4579 }, { "epoch": 0.29608080808080806, "grad_norm": 0.06183109059929848, "learning_rate": 0.0001960759966116505, "loss": 0.1033, "step": 4580 }, { "epoch": 0.29614545454545455, "grad_norm": 0.06978536397218704, "learning_rate": 0.00019607409948349405, "loss": 0.0843, "step": 4581 }, { "epoch": 0.29621010101010103, "grad_norm": 0.06802777200937271, "learning_rate": 0.00019607220190603086, "loss": 0.0962, "step": 4582 }, { "epoch": 0.29627474747474747, "grad_norm": 0.07489711791276932, "learning_rate": 0.00019607030387926971, "loss": 0.0946, "step": 4583 }, { "epoch": 0.29633939393939396, "grad_norm": 0.05409941077232361, "learning_rate": 0.00019606840540321955, "loss": 0.0723, "step": 4584 }, { "epoch": 0.2964040404040404, "grad_norm": 0.05896497145295143, "learning_rate": 0.00019606650647788917, "loss": 0.0814, "step": 4585 }, { "epoch": 0.2964686868686869, "grad_norm": 0.061247263103723526, "learning_rate": 0.00019606460710328748, "loss": 0.0882, "step": 4586 }, { "epoch": 0.2965333333333333, "grad_norm": 0.07093510031700134, "learning_rate": 0.00019606270727942345, "loss": 0.1093, "step": 4587 }, { "epoch": 0.2965979797979798, "grad_norm": 0.06910679489374161, "learning_rate": 0.00019606080700630585, "loss": 0.1, "step": 4588 }, { "epoch": 0.29666262626262624, "grad_norm": 0.05494191125035286, "learning_rate": 0.0001960589062839436, "loss": 0.0809, "step": 4589 }, { "epoch": 0.29672727272727273, "grad_norm": 0.06923728436231613, "learning_rate": 0.00019605700511234563, "loss": 0.0823, "step": 4590 }, { "epoch": 0.2967919191919192, "grad_norm": 0.07791271060705185, "learning_rate": 0.00019605510349152082, "loss": 0.1124, "step": 4591 }, { "epoch": 0.29685656565656565, 
"grad_norm": 0.0683489739894867, "learning_rate": 0.000196053201421478, "loss": 0.0849, "step": 4592 }, { "epoch": 0.29685656565656565, "eval_bleu": 14.771845035022036, "eval_loss": 0.09264057129621506, "eval_runtime": 2.753, "eval_samples_per_second": 11.624, "eval_steps_per_second": 1.453, "step": 4592 }, { "epoch": 0.29692121212121214, "grad_norm": 0.0798630639910698, "learning_rate": 0.00019605129890222614, "loss": 0.1065, "step": 4593 }, { "epoch": 0.2969858585858586, "grad_norm": 0.05915911868214607, "learning_rate": 0.00019604939593377408, "loss": 0.0802, "step": 4594 }, { "epoch": 0.29705050505050506, "grad_norm": 0.07560048997402191, "learning_rate": 0.0001960474925161308, "loss": 0.09, "step": 4595 }, { "epoch": 0.2971151515151515, "grad_norm": 0.05218242108821869, "learning_rate": 0.0001960455886493051, "loss": 0.0619, "step": 4596 }, { "epoch": 0.297179797979798, "grad_norm": 0.07150762528181076, "learning_rate": 0.00019604368433330597, "loss": 0.1023, "step": 4597 }, { "epoch": 0.2972444444444444, "grad_norm": 0.07948549091815948, "learning_rate": 0.00019604177956814226, "loss": 0.1009, "step": 4598 }, { "epoch": 0.2973090909090909, "grad_norm": 0.12578381597995758, "learning_rate": 0.00019603987435382292, "loss": 0.0879, "step": 4599 }, { "epoch": 0.2973737373737374, "grad_norm": 0.07147827744483948, "learning_rate": 0.0001960379686903568, "loss": 0.0834, "step": 4600 }, { "epoch": 0.29743838383838384, "grad_norm": 0.0709298625588417, "learning_rate": 0.00019603606257775287, "loss": 0.0922, "step": 4601 }, { "epoch": 0.2975030303030303, "grad_norm": 0.07045287638902664, "learning_rate": 0.00019603415601602002, "loss": 0.097, "step": 4602 }, { "epoch": 0.29756767676767676, "grad_norm": 0.06109197810292244, "learning_rate": 0.0001960322490051672, "loss": 0.0776, "step": 4603 }, { "epoch": 0.29763232323232325, "grad_norm": 0.06679968535900116, "learning_rate": 0.00019603034154520326, "loss": 0.0885, "step": 4604 }, { "epoch": 0.2976969696969697, 
"grad_norm": 0.06881806254386902, "learning_rate": 0.0001960284336361372, "loss": 0.0845, "step": 4605 }, { "epoch": 0.29776161616161617, "grad_norm": 0.06673866510391235, "learning_rate": 0.00019602652527797788, "loss": 0.0801, "step": 4606 }, { "epoch": 0.2978262626262626, "grad_norm": 0.07082986831665039, "learning_rate": 0.00019602461647073423, "loss": 0.0854, "step": 4607 }, { "epoch": 0.2978909090909091, "grad_norm": 0.0699472576379776, "learning_rate": 0.0001960227072144152, "loss": 0.0922, "step": 4608 }, { "epoch": 0.2978909090909091, "eval_bleu": 14.474955979377038, "eval_loss": 0.09158635884523392, "eval_runtime": 2.6993, "eval_samples_per_second": 11.855, "eval_steps_per_second": 1.482, "step": 4608 }, { "epoch": 0.29795555555555553, "grad_norm": 0.08134989440441132, "learning_rate": 0.00019602079750902972, "loss": 0.0712, "step": 4609 }, { "epoch": 0.298020202020202, "grad_norm": 0.06048596277832985, "learning_rate": 0.0001960188873545867, "loss": 0.0772, "step": 4610 }, { "epoch": 0.2980848484848485, "grad_norm": 0.08579091727733612, "learning_rate": 0.00019601697675109513, "loss": 0.0932, "step": 4611 }, { "epoch": 0.29814949494949494, "grad_norm": 0.06653161346912384, "learning_rate": 0.00019601506569856384, "loss": 0.0766, "step": 4612 }, { "epoch": 0.29821414141414143, "grad_norm": 0.06581110507249832, "learning_rate": 0.00019601315419700188, "loss": 0.0909, "step": 4613 }, { "epoch": 0.29827878787878787, "grad_norm": 0.05982992798089981, "learning_rate": 0.0001960112422464181, "loss": 0.0836, "step": 4614 }, { "epoch": 0.29834343434343435, "grad_norm": 0.0542968325316906, "learning_rate": 0.00019600932984682151, "loss": 0.0773, "step": 4615 }, { "epoch": 0.2984080808080808, "grad_norm": 0.1185133159160614, "learning_rate": 0.00019600741699822102, "loss": 0.112, "step": 4616 }, { "epoch": 0.2984727272727273, "grad_norm": 0.06345734000205994, "learning_rate": 0.0001960055037006256, "loss": 0.0819, "step": 4617 }, { "epoch": 0.2985373737373737, 
"grad_norm": 0.07264839112758636, "learning_rate": 0.00019600358995404413, "loss": 0.0926, "step": 4618 }, { "epoch": 0.2986020202020202, "grad_norm": 0.0693463459610939, "learning_rate": 0.00019600167575848564, "loss": 0.0869, "step": 4619 }, { "epoch": 0.2986666666666667, "grad_norm": 0.09668222069740295, "learning_rate": 0.00019599976111395905, "loss": 0.1131, "step": 4620 }, { "epoch": 0.2987313131313131, "grad_norm": 0.06024109944701195, "learning_rate": 0.0001959978460204733, "loss": 0.0738, "step": 4621 }, { "epoch": 0.2987959595959596, "grad_norm": 0.057824525982141495, "learning_rate": 0.0001959959304780374, "loss": 0.0851, "step": 4622 }, { "epoch": 0.29886060606060605, "grad_norm": 0.0650748461484909, "learning_rate": 0.00019599401448666022, "loss": 0.0869, "step": 4623 }, { "epoch": 0.29892525252525254, "grad_norm": 0.06472223252058029, "learning_rate": 0.00019599209804635077, "loss": 0.0982, "step": 4624 }, { "epoch": 0.29892525252525254, "eval_bleu": 15.133697997813023, "eval_loss": 0.09448139369487762, "eval_runtime": 2.7323, "eval_samples_per_second": 11.712, "eval_steps_per_second": 1.464, "step": 4624 }, { "epoch": 0.298989898989899, "grad_norm": 0.06019340828061104, "learning_rate": 0.00019599018115711806, "loss": 0.088, "step": 4625 }, { "epoch": 0.29905454545454546, "grad_norm": 0.08765371143817902, "learning_rate": 0.00019598826381897095, "loss": 0.095, "step": 4626 }, { "epoch": 0.2991191919191919, "grad_norm": 0.07066169381141663, "learning_rate": 0.0001959863460319185, "loss": 0.0828, "step": 4627 }, { "epoch": 0.2991838383838384, "grad_norm": 0.06294021755456924, "learning_rate": 0.00019598442779596961, "loss": 0.0945, "step": 4628 }, { "epoch": 0.2992484848484849, "grad_norm": 0.05835629627108574, "learning_rate": 0.0001959825091111333, "loss": 0.0916, "step": 4629 }, { "epoch": 0.2993131313131313, "grad_norm": 0.06739974766969681, "learning_rate": 0.00019598058997741854, "loss": 0.091, "step": 4630 }, { "epoch": 0.2993777777777778, 
"grad_norm": 0.06979475915431976, "learning_rate": 0.00019597867039483426, "loss": 0.1017, "step": 4631 }, { "epoch": 0.29944242424242423, "grad_norm": 0.062409088015556335, "learning_rate": 0.0001959767503633895, "loss": 0.0872, "step": 4632 }, { "epoch": 0.2995070707070707, "grad_norm": 0.06673561781644821, "learning_rate": 0.0001959748298830932, "loss": 0.1009, "step": 4633 }, { "epoch": 0.29957171717171716, "grad_norm": 0.06147103011608124, "learning_rate": 0.00019597290895395435, "loss": 0.0906, "step": 4634 }, { "epoch": 0.29963636363636365, "grad_norm": 0.06785107403993607, "learning_rate": 0.00019597098757598194, "loss": 0.0843, "step": 4635 }, { "epoch": 0.2997010101010101, "grad_norm": 0.0652463361620903, "learning_rate": 0.00019596906574918493, "loss": 0.0875, "step": 4636 }, { "epoch": 0.29976565656565657, "grad_norm": 0.06536533683538437, "learning_rate": 0.0001959671434735723, "loss": 0.0955, "step": 4637 }, { "epoch": 0.299830303030303, "grad_norm": 0.05749241262674332, "learning_rate": 0.00019596522074915313, "loss": 0.077, "step": 4638 }, { "epoch": 0.2998949494949495, "grad_norm": 0.06882575154304504, "learning_rate": 0.0001959632975759363, "loss": 0.0959, "step": 4639 }, { "epoch": 0.299959595959596, "grad_norm": 0.06407667696475983, "learning_rate": 0.0001959613739539309, "loss": 0.0911, "step": 4640 }, { "epoch": 0.299959595959596, "eval_bleu": 13.214919627610731, "eval_loss": 0.09465215355157852, "eval_runtime": 2.7415, "eval_samples_per_second": 11.672, "eval_steps_per_second": 1.459, "step": 4640 }, { "epoch": 0.3000242424242424, "grad_norm": 0.07222005724906921, "learning_rate": 0.00019595944988314582, "loss": 0.0962, "step": 4641 }, { "epoch": 0.3000888888888889, "grad_norm": 0.08042989671230316, "learning_rate": 0.00019595752536359016, "loss": 0.1114, "step": 4642 }, { "epoch": 0.30015353535353534, "grad_norm": 0.06781364977359772, "learning_rate": 0.00019595560039527285, "loss": 0.0948, "step": 4643 }, { "epoch": 0.30021818181818183, 
"grad_norm": 0.06815223395824432, "learning_rate": 0.00019595367497820293, "loss": 0.0861, "step": 4644 }, { "epoch": 0.30028282828282826, "grad_norm": 0.06212089955806732, "learning_rate": 0.0001959517491123894, "loss": 0.0883, "step": 4645 }, { "epoch": 0.30034747474747475, "grad_norm": 0.05732403323054314, "learning_rate": 0.00019594982279784126, "loss": 0.0755, "step": 4646 }, { "epoch": 0.3004121212121212, "grad_norm": 0.06334793567657471, "learning_rate": 0.0001959478960345675, "loss": 0.0882, "step": 4647 }, { "epoch": 0.3004767676767677, "grad_norm": 0.08146253973245621, "learning_rate": 0.0001959459688225772, "loss": 0.1079, "step": 4648 }, { "epoch": 0.30054141414141416, "grad_norm": 0.06087877228856087, "learning_rate": 0.00019594404116187925, "loss": 0.0771, "step": 4649 }, { "epoch": 0.3006060606060606, "grad_norm": 0.0601593479514122, "learning_rate": 0.00019594211305248277, "loss": 0.0703, "step": 4650 }, { "epoch": 0.3006707070707071, "grad_norm": 0.07142464816570282, "learning_rate": 0.00019594018449439675, "loss": 0.0983, "step": 4651 }, { "epoch": 0.3007353535353535, "grad_norm": 0.06425745785236359, "learning_rate": 0.0001959382554876302, "loss": 0.0863, "step": 4652 }, { "epoch": 0.3008, "grad_norm": 0.06959160417318344, "learning_rate": 0.00019593632603219217, "loss": 0.0889, "step": 4653 }, { "epoch": 0.30086464646464645, "grad_norm": 0.06369398534297943, "learning_rate": 0.00019593439612809163, "loss": 0.0862, "step": 4654 }, { "epoch": 0.30092929292929294, "grad_norm": 0.07400577515363693, "learning_rate": 0.00019593246577533763, "loss": 0.1027, "step": 4655 }, { "epoch": 0.30099393939393937, "grad_norm": 0.07124436646699905, "learning_rate": 0.00019593053497393923, "loss": 0.0959, "step": 4656 }, { "epoch": 0.30099393939393937, "eval_bleu": 12.94114265878201, "eval_loss": 0.09464993327856064, "eval_runtime": 2.6959, "eval_samples_per_second": 11.87, "eval_steps_per_second": 1.484, "step": 4656 }, { "epoch": 0.30105858585858586, 
"grad_norm": 0.06135867163538933, "learning_rate": 0.00019592860372390542, "loss": 0.0925, "step": 4657 }, { "epoch": 0.30112323232323235, "grad_norm": 0.12385788559913635, "learning_rate": 0.00019592667202524523, "loss": 0.0886, "step": 4658 }, { "epoch": 0.3011878787878788, "grad_norm": 0.08610004186630249, "learning_rate": 0.00019592473987796772, "loss": 0.1036, "step": 4659 }, { "epoch": 0.30125252525252527, "grad_norm": 0.06922711431980133, "learning_rate": 0.00019592280728208193, "loss": 0.0859, "step": 4660 }, { "epoch": 0.3013171717171717, "grad_norm": 0.0694604441523552, "learning_rate": 0.00019592087423759687, "loss": 0.105, "step": 4661 }, { "epoch": 0.3013818181818182, "grad_norm": 0.06808525323867798, "learning_rate": 0.00019591894074452157, "loss": 0.0891, "step": 4662 }, { "epoch": 0.30144646464646463, "grad_norm": 0.07607869058847427, "learning_rate": 0.00019591700680286512, "loss": 0.1001, "step": 4663 }, { "epoch": 0.3015111111111111, "grad_norm": 0.07043629884719849, "learning_rate": 0.00019591507241263653, "loss": 0.0927, "step": 4664 }, { "epoch": 0.30157575757575755, "grad_norm": 0.05468107387423515, "learning_rate": 0.00019591313757384487, "loss": 0.082, "step": 4665 }, { "epoch": 0.30164040404040404, "grad_norm": 0.07286622375249863, "learning_rate": 0.00019591120228649916, "loss": 0.0962, "step": 4666 }, { "epoch": 0.30170505050505053, "grad_norm": 0.06715156137943268, "learning_rate": 0.00019590926655060843, "loss": 0.0899, "step": 4667 }, { "epoch": 0.30176969696969697, "grad_norm": 0.09657955914735794, "learning_rate": 0.0001959073303661818, "loss": 0.1081, "step": 4668 }, { "epoch": 0.30183434343434346, "grad_norm": 0.07730289548635483, "learning_rate": 0.0001959053937332283, "loss": 0.1133, "step": 4669 }, { "epoch": 0.3018989898989899, "grad_norm": 0.05900173634290695, "learning_rate": 0.00019590345665175697, "loss": 0.0786, "step": 4670 }, { "epoch": 0.3019636363636364, "grad_norm": 0.06607168167829514, "learning_rate": 
0.00019590151912177688, "loss": 0.0878, "step": 4671 }, { "epoch": 0.3020282828282828, "grad_norm": 0.07803653925657272, "learning_rate": 0.0001958995811432971, "loss": 0.0969, "step": 4672 }, { "epoch": 0.3020282828282828, "eval_bleu": 13.620256756175307, "eval_loss": 0.09350509196519852, "eval_runtime": 2.8482, "eval_samples_per_second": 11.235, "eval_steps_per_second": 1.404, "step": 4672 }, { "epoch": 0.3020929292929293, "grad_norm": 0.06570162624120712, "learning_rate": 0.00019589764271632666, "loss": 0.0886, "step": 4673 }, { "epoch": 0.30215757575757574, "grad_norm": 0.07181063294410706, "learning_rate": 0.00019589570384087468, "loss": 0.0783, "step": 4674 }, { "epoch": 0.3022222222222222, "grad_norm": 0.06506641954183578, "learning_rate": 0.00019589376451695017, "loss": 0.0915, "step": 4675 }, { "epoch": 0.30228686868686866, "grad_norm": 0.06903707981109619, "learning_rate": 0.00019589182474456223, "loss": 0.0959, "step": 4676 }, { "epoch": 0.30235151515151515, "grad_norm": 0.06432800740003586, "learning_rate": 0.00019588988452371993, "loss": 0.1011, "step": 4677 }, { "epoch": 0.30241616161616164, "grad_norm": 0.06585089862346649, "learning_rate": 0.00019588794385443233, "loss": 0.0987, "step": 4678 }, { "epoch": 0.3024808080808081, "grad_norm": 0.06807604432106018, "learning_rate": 0.00019588600273670853, "loss": 0.093, "step": 4679 }, { "epoch": 0.30254545454545456, "grad_norm": 0.06365378946065903, "learning_rate": 0.0001958840611705576, "loss": 0.0969, "step": 4680 }, { "epoch": 0.302610101010101, "grad_norm": 0.06447930634021759, "learning_rate": 0.00019588211915598858, "loss": 0.0846, "step": 4681 }, { "epoch": 0.3026747474747475, "grad_norm": 0.07107406109571457, "learning_rate": 0.00019588017669301062, "loss": 0.0904, "step": 4682 }, { "epoch": 0.3027393939393939, "grad_norm": 0.06212260574102402, "learning_rate": 0.00019587823378163278, "loss": 0.0846, "step": 4683 }, { "epoch": 0.3028040404040404, "grad_norm": 0.07482921332120895, "learning_rate": 
0.0001958762904218641, "loss": 0.119, "step": 4684 }, { "epoch": 0.30286868686868684, "grad_norm": 0.06667369604110718, "learning_rate": 0.0001958743466137137, "loss": 0.0931, "step": 4685 }, { "epoch": 0.30293333333333333, "grad_norm": 0.06501346081495285, "learning_rate": 0.00019587240235719074, "loss": 0.0971, "step": 4686 }, { "epoch": 0.3029979797979798, "grad_norm": 0.0637492686510086, "learning_rate": 0.0001958704576523042, "loss": 0.0765, "step": 4687 }, { "epoch": 0.30306262626262626, "grad_norm": 0.06262986361980438, "learning_rate": 0.0001958685124990632, "loss": 0.09, "step": 4688 }, { "epoch": 0.30306262626262626, "eval_bleu": 16.012961307451263, "eval_loss": 0.09376657754182816, "eval_runtime": 2.7259, "eval_samples_per_second": 11.739, "eval_steps_per_second": 1.467, "step": 4688 }, { "epoch": 0.30312727272727275, "grad_norm": 0.08209948241710663, "learning_rate": 0.00019586656689747693, "loss": 0.1065, "step": 4689 }, { "epoch": 0.3031919191919192, "grad_norm": 0.06138643994927406, "learning_rate": 0.0001958646208475544, "loss": 0.0835, "step": 4690 }, { "epoch": 0.30325656565656567, "grad_norm": 0.0823470875620842, "learning_rate": 0.00019586267434930468, "loss": 0.125, "step": 4691 }, { "epoch": 0.3033212121212121, "grad_norm": 0.060678817331790924, "learning_rate": 0.000195860727402737, "loss": 0.0832, "step": 4692 }, { "epoch": 0.3033858585858586, "grad_norm": 0.08106742799282074, "learning_rate": 0.0001958587800078603, "loss": 0.1017, "step": 4693 }, { "epoch": 0.303450505050505, "grad_norm": 0.07476026564836502, "learning_rate": 0.00019585683216468384, "loss": 0.1118, "step": 4694 }, { "epoch": 0.3035151515151515, "grad_norm": 0.05703451484441757, "learning_rate": 0.00019585488387321664, "loss": 0.0724, "step": 4695 }, { "epoch": 0.303579797979798, "grad_norm": 0.06587185710668564, "learning_rate": 0.00019585293513346786, "loss": 0.0922, "step": 4696 }, { "epoch": 0.30364444444444444, "grad_norm": 0.06442519277334213, "learning_rate": 
0.00019585098594544659, "loss": 0.0709, "step": 4697 }, { "epoch": 0.30370909090909093, "grad_norm": 0.08398552238941193, "learning_rate": 0.00019584903630916195, "loss": 0.1026, "step": 4698 }, { "epoch": 0.30377373737373736, "grad_norm": 0.056892458349466324, "learning_rate": 0.00019584708622462303, "loss": 0.0802, "step": 4699 }, { "epoch": 0.30383838383838385, "grad_norm": 0.07192113995552063, "learning_rate": 0.00019584513569183897, "loss": 0.0975, "step": 4700 }, { "epoch": 0.3039030303030303, "grad_norm": 0.06274757534265518, "learning_rate": 0.0001958431847108189, "loss": 0.0861, "step": 4701 }, { "epoch": 0.3039676767676768, "grad_norm": 0.0750383585691452, "learning_rate": 0.00019584123328157197, "loss": 0.0863, "step": 4702 }, { "epoch": 0.3040323232323232, "grad_norm": 0.05529523640871048, "learning_rate": 0.00019583928140410724, "loss": 0.0759, "step": 4703 }, { "epoch": 0.3040969696969697, "grad_norm": 0.06533263623714447, "learning_rate": 0.00019583732907843388, "loss": 0.072, "step": 4704 }, { "epoch": 0.3040969696969697, "eval_bleu": 13.073941613645246, "eval_loss": 0.09473906457424164, "eval_runtime": 2.805, "eval_samples_per_second": 11.408, "eval_steps_per_second": 1.426, "step": 4704 }, { "epoch": 0.3041616161616162, "grad_norm": 0.07780378311872482, "learning_rate": 0.00019583537630456102, "loss": 0.1279, "step": 4705 }, { "epoch": 0.3042262626262626, "grad_norm": 0.0677536204457283, "learning_rate": 0.00019583342308249782, "loss": 0.1005, "step": 4706 }, { "epoch": 0.3042909090909091, "grad_norm": 0.06919342279434204, "learning_rate": 0.00019583146941225333, "loss": 0.0946, "step": 4707 }, { "epoch": 0.30435555555555555, "grad_norm": 0.10964915156364441, "learning_rate": 0.00019582951529383675, "loss": 0.0815, "step": 4708 }, { "epoch": 0.30442020202020204, "grad_norm": 0.06256582587957382, "learning_rate": 0.00019582756072725722, "loss": 0.0785, "step": 4709 }, { "epoch": 0.30448484848484847, "grad_norm": 0.05990685150027275, 
"learning_rate": 0.0001958256057125239, "loss": 0.0849, "step": 4710 }, { "epoch": 0.30454949494949496, "grad_norm": 0.06409945338964462, "learning_rate": 0.00019582365024964587, "loss": 0.1001, "step": 4711 }, { "epoch": 0.3046141414141414, "grad_norm": 0.06273867934942245, "learning_rate": 0.0001958216943386323, "loss": 0.0803, "step": 4712 }, { "epoch": 0.3046787878787879, "grad_norm": 0.07001735270023346, "learning_rate": 0.00019581973797949234, "loss": 0.0959, "step": 4713 }, { "epoch": 0.3047434343434343, "grad_norm": 0.05816273018717766, "learning_rate": 0.00019581778117223517, "loss": 0.0762, "step": 4714 }, { "epoch": 0.3048080808080808, "grad_norm": 0.06626967340707779, "learning_rate": 0.00019581582391686988, "loss": 0.0946, "step": 4715 }, { "epoch": 0.3048727272727273, "grad_norm": 0.061270006000995636, "learning_rate": 0.0001958138662134057, "loss": 0.0736, "step": 4716 }, { "epoch": 0.30493737373737373, "grad_norm": 0.05927988141775131, "learning_rate": 0.0001958119080618517, "loss": 0.0769, "step": 4717 }, { "epoch": 0.3050020202020202, "grad_norm": 0.06026887148618698, "learning_rate": 0.00019580994946221712, "loss": 0.0794, "step": 4718 }, { "epoch": 0.30506666666666665, "grad_norm": 0.07352053374052048, "learning_rate": 0.00019580799041451103, "loss": 0.0985, "step": 4719 }, { "epoch": 0.30513131313131314, "grad_norm": 0.07282546907663345, "learning_rate": 0.00019580603091874267, "loss": 0.0933, "step": 4720 }, { "epoch": 0.30513131313131314, "eval_bleu": 16.774989595062156, "eval_loss": 0.09336063265800476, "eval_runtime": 2.7165, "eval_samples_per_second": 11.78, "eval_steps_per_second": 1.472, "step": 4720 }, { "epoch": 0.3051959595959596, "grad_norm": 0.05864718183875084, "learning_rate": 0.0001958040709749212, "loss": 0.0762, "step": 4721 }, { "epoch": 0.30526060606060607, "grad_norm": 0.07190041244029999, "learning_rate": 0.00019580211058305573, "loss": 0.0954, "step": 4722 }, { "epoch": 0.3053252525252525, "grad_norm": 0.0585903562605381, 
"learning_rate": 0.00019580014974315545, "loss": 0.074, "step": 4723 }, { "epoch": 0.305389898989899, "grad_norm": 0.06493786722421646, "learning_rate": 0.00019579818845522957, "loss": 0.0871, "step": 4724 }, { "epoch": 0.3054545454545455, "grad_norm": 0.07651641964912415, "learning_rate": 0.00019579622671928723, "loss": 0.105, "step": 4725 }, { "epoch": 0.3055191919191919, "grad_norm": 0.06296389549970627, "learning_rate": 0.0001957942645353376, "loss": 0.0906, "step": 4726 }, { "epoch": 0.3055838383838384, "grad_norm": 0.0711960420012474, "learning_rate": 0.00019579230190338987, "loss": 0.1005, "step": 4727 }, { "epoch": 0.30564848484848484, "grad_norm": 0.06564712524414062, "learning_rate": 0.0001957903388234532, "loss": 0.0871, "step": 4728 }, { "epoch": 0.3057131313131313, "grad_norm": 0.06702519953250885, "learning_rate": 0.0001957883752955368, "loss": 0.0887, "step": 4729 }, { "epoch": 0.30577777777777776, "grad_norm": 0.06969790905714035, "learning_rate": 0.00019578641131964982, "loss": 0.107, "step": 4730 }, { "epoch": 0.30584242424242425, "grad_norm": 0.061255622655153275, "learning_rate": 0.0001957844468958015, "loss": 0.0896, "step": 4731 }, { "epoch": 0.3059070707070707, "grad_norm": 0.08252348750829697, "learning_rate": 0.00019578248202400093, "loss": 0.095, "step": 4732 }, { "epoch": 0.3059717171717172, "grad_norm": 0.06545408815145493, "learning_rate": 0.00019578051670425737, "loss": 0.0856, "step": 4733 }, { "epoch": 0.30603636363636366, "grad_norm": 0.06466548144817352, "learning_rate": 0.00019577855093658002, "loss": 0.0931, "step": 4734 }, { "epoch": 0.3061010101010101, "grad_norm": 0.0644887238740921, "learning_rate": 0.00019577658472097803, "loss": 0.0942, "step": 4735 }, { "epoch": 0.3061656565656566, "grad_norm": 0.06501445919275284, "learning_rate": 0.0001957746180574606, "loss": 0.0938, "step": 4736 }, { "epoch": 0.3061656565656566, "eval_bleu": 14.989629406095618, "eval_loss": 0.09555211663246155, "eval_runtime": 2.6007, 
"eval_samples_per_second": 12.304, "eval_steps_per_second": 1.538, "step": 4736 }, { "epoch": 0.306230303030303, "grad_norm": 0.06556830555200577, "learning_rate": 0.00019577265094603702, "loss": 0.0856, "step": 4737 }, { "epoch": 0.3062949494949495, "grad_norm": 0.10416121780872345, "learning_rate": 0.00019577068338671633, "loss": 0.1236, "step": 4738 }, { "epoch": 0.30635959595959594, "grad_norm": 0.05639476701617241, "learning_rate": 0.00019576871537950786, "loss": 0.0812, "step": 4739 }, { "epoch": 0.30642424242424243, "grad_norm": 0.06095866858959198, "learning_rate": 0.00019576674692442077, "loss": 0.0763, "step": 4740 }, { "epoch": 0.30648888888888887, "grad_norm": 0.07491468638181686, "learning_rate": 0.00019576477802146425, "loss": 0.1069, "step": 4741 }, { "epoch": 0.30655353535353536, "grad_norm": 0.07674787193536758, "learning_rate": 0.00019576280867064752, "loss": 0.104, "step": 4742 }, { "epoch": 0.30661818181818185, "grad_norm": 0.06851784884929657, "learning_rate": 0.00019576083887197978, "loss": 0.0865, "step": 4743 }, { "epoch": 0.3066828282828283, "grad_norm": 0.06821895390748978, "learning_rate": 0.00019575886862547028, "loss": 0.0939, "step": 4744 }, { "epoch": 0.30674747474747477, "grad_norm": 0.07297416031360626, "learning_rate": 0.00019575689793112822, "loss": 0.1185, "step": 4745 }, { "epoch": 0.3068121212121212, "grad_norm": 0.055229298770427704, "learning_rate": 0.00019575492678896276, "loss": 0.072, "step": 4746 }, { "epoch": 0.3068767676767677, "grad_norm": 0.055858418345451355, "learning_rate": 0.0001957529551989832, "loss": 0.0714, "step": 4747 }, { "epoch": 0.3069414141414141, "grad_norm": 0.07207828760147095, "learning_rate": 0.0001957509831611987, "loss": 0.0992, "step": 4748 }, { "epoch": 0.3070060606060606, "grad_norm": 0.0785556212067604, "learning_rate": 0.00019574901067561854, "loss": 0.1036, "step": 4749 }, { "epoch": 0.30707070707070705, "grad_norm": 0.05766552686691284, "learning_rate": 0.0001957470377422519, "loss": 
0.0824, "step": 4750 }, { "epoch": 0.30713535353535354, "grad_norm": 0.08286474645137787, "learning_rate": 0.00019574506436110799, "loss": 0.12, "step": 4751 }, { "epoch": 0.3072, "grad_norm": 0.05716665834188461, "learning_rate": 0.0001957430905321961, "loss": 0.0753, "step": 4752 }, { "epoch": 0.3072, "eval_bleu": 13.38680526728827, "eval_loss": 0.09434428811073303, "eval_runtime": 2.6643, "eval_samples_per_second": 12.011, "eval_steps_per_second": 1.501, "step": 4752 }, { "epoch": 0.30726464646464646, "grad_norm": 0.07537861913442612, "learning_rate": 0.00019574111625552537, "loss": 0.1003, "step": 4753 }, { "epoch": 0.30732929292929295, "grad_norm": 0.0637507438659668, "learning_rate": 0.00019573914153110514, "loss": 0.0876, "step": 4754 }, { "epoch": 0.3073939393939394, "grad_norm": 0.06305477023124695, "learning_rate": 0.00019573716635894458, "loss": 0.0879, "step": 4755 }, { "epoch": 0.3074585858585859, "grad_norm": 0.06771845370531082, "learning_rate": 0.00019573519073905297, "loss": 0.0919, "step": 4756 }, { "epoch": 0.3075232323232323, "grad_norm": 0.0642908588051796, "learning_rate": 0.0001957332146714395, "loss": 0.0782, "step": 4757 }, { "epoch": 0.3075878787878788, "grad_norm": 0.06687585264444351, "learning_rate": 0.00019573123815611344, "loss": 0.0855, "step": 4758 }, { "epoch": 0.30765252525252523, "grad_norm": 0.06200970709323883, "learning_rate": 0.00019572926119308404, "loss": 0.0725, "step": 4759 }, { "epoch": 0.3077171717171717, "grad_norm": 0.06523467600345612, "learning_rate": 0.0001957272837823605, "loss": 0.0816, "step": 4760 }, { "epoch": 0.30778181818181816, "grad_norm": 0.07990944385528564, "learning_rate": 0.00019572530592395213, "loss": 0.0998, "step": 4761 }, { "epoch": 0.30784646464646465, "grad_norm": 0.06983045488595963, "learning_rate": 0.00019572332761786813, "loss": 0.0972, "step": 4762 }, { "epoch": 0.30791111111111114, "grad_norm": 0.06714247912168503, "learning_rate": 0.00019572134886411776, "loss": 0.0891, "step": 4763 }, { 
"epoch": 0.30797575757575757, "grad_norm": 0.08239343762397766, "learning_rate": 0.00019571936966271034, "loss": 0.0821, "step": 4764 }, { "epoch": 0.30804040404040406, "grad_norm": 0.057748764753341675, "learning_rate": 0.00019571739001365504, "loss": 0.0769, "step": 4765 }, { "epoch": 0.3081050505050505, "grad_norm": 0.0627964586019516, "learning_rate": 0.00019571540991696114, "loss": 0.0862, "step": 4766 }, { "epoch": 0.308169696969697, "grad_norm": 0.06751590967178345, "learning_rate": 0.00019571342937263792, "loss": 0.089, "step": 4767 }, { "epoch": 0.3082343434343434, "grad_norm": 0.07098577916622162, "learning_rate": 0.00019571144838069463, "loss": 0.0855, "step": 4768 }, { "epoch": 0.3082343434343434, "eval_bleu": 11.205770360479216, "eval_loss": 0.09519843757152557, "eval_runtime": 2.8521, "eval_samples_per_second": 11.22, "eval_steps_per_second": 1.402, "step": 4768 }, { "epoch": 0.3082989898989899, "grad_norm": 0.06362498551607132, "learning_rate": 0.00019570946694114053, "loss": 0.0843, "step": 4769 }, { "epoch": 0.30836363636363634, "grad_norm": 0.06605575978755951, "learning_rate": 0.00019570748505398492, "loss": 0.0881, "step": 4770 }, { "epoch": 0.30842828282828283, "grad_norm": 0.061822086572647095, "learning_rate": 0.00019570550271923702, "loss": 0.0938, "step": 4771 }, { "epoch": 0.3084929292929293, "grad_norm": 0.06998146325349808, "learning_rate": 0.00019570351993690612, "loss": 0.0966, "step": 4772 }, { "epoch": 0.30855757575757575, "grad_norm": 0.059741441160440445, "learning_rate": 0.00019570153670700152, "loss": 0.0821, "step": 4773 }, { "epoch": 0.30862222222222224, "grad_norm": 0.05550830811262131, "learning_rate": 0.00019569955302953246, "loss": 0.0679, "step": 4774 }, { "epoch": 0.3086868686868687, "grad_norm": 0.05758580192923546, "learning_rate": 0.00019569756890450824, "loss": 0.0681, "step": 4775 }, { "epoch": 0.30875151515151517, "grad_norm": 0.08599646389484406, "learning_rate": 0.00019569558433193808, "loss": 0.1019, "step": 4776 
}, { "epoch": 0.3088161616161616, "grad_norm": 0.10221553593873978, "learning_rate": 0.00019569359931183135, "loss": 0.0969, "step": 4777 }, { "epoch": 0.3088808080808081, "grad_norm": 0.07317476719617844, "learning_rate": 0.0001956916138441973, "loss": 0.0976, "step": 4778 }, { "epoch": 0.3089454545454545, "grad_norm": 0.06070096790790558, "learning_rate": 0.0001956896279290452, "loss": 0.0862, "step": 4779 }, { "epoch": 0.309010101010101, "grad_norm": 0.06562337279319763, "learning_rate": 0.00019568764156638433, "loss": 0.0878, "step": 4780 }, { "epoch": 0.3090747474747475, "grad_norm": 0.07425237447023392, "learning_rate": 0.00019568565475622398, "loss": 0.0926, "step": 4781 }, { "epoch": 0.30913939393939394, "grad_norm": 0.0745721161365509, "learning_rate": 0.0001956836674985735, "loss": 0.1009, "step": 4782 }, { "epoch": 0.3092040404040404, "grad_norm": 0.059794675558805466, "learning_rate": 0.00019568167979344212, "loss": 0.0851, "step": 4783 }, { "epoch": 0.30926868686868686, "grad_norm": 0.053508102893829346, "learning_rate": 0.00019567969164083912, "loss": 0.0688, "step": 4784 }, { "epoch": 0.30926868686868686, "eval_bleu": 15.035717079060975, "eval_loss": 0.09469657391309738, "eval_runtime": 2.7942, "eval_samples_per_second": 11.452, "eval_steps_per_second": 1.432, "step": 4784 }, { "epoch": 0.30933333333333335, "grad_norm": 0.0803999975323677, "learning_rate": 0.00019567770304077388, "loss": 0.0947, "step": 4785 }, { "epoch": 0.3093979797979798, "grad_norm": 0.08259466290473938, "learning_rate": 0.00019567571399325563, "loss": 0.1044, "step": 4786 }, { "epoch": 0.3094626262626263, "grad_norm": 0.06777393072843552, "learning_rate": 0.0001956737244982937, "loss": 0.1108, "step": 4787 }, { "epoch": 0.3095272727272727, "grad_norm": 0.059062596410512924, "learning_rate": 0.0001956717345558974, "loss": 0.0888, "step": 4788 }, { "epoch": 0.3095919191919192, "grad_norm": 0.05767020583152771, "learning_rate": 0.00019566974416607602, "loss": 0.0789, "step": 4789 
}, { "epoch": 0.30965656565656563, "grad_norm": 0.06474613398313522, "learning_rate": 0.00019566775332883885, "loss": 0.0996, "step": 4790 }, { "epoch": 0.3097212121212121, "grad_norm": 0.0566738024353981, "learning_rate": 0.00019566576204419527, "loss": 0.0817, "step": 4791 }, { "epoch": 0.3097858585858586, "grad_norm": 0.05534113571047783, "learning_rate": 0.00019566377031215453, "loss": 0.0754, "step": 4792 }, { "epoch": 0.30985050505050504, "grad_norm": 0.06616433709859848, "learning_rate": 0.00019566177813272595, "loss": 0.1053, "step": 4793 }, { "epoch": 0.30991515151515153, "grad_norm": 0.05442001670598984, "learning_rate": 0.00019565978550591885, "loss": 0.0716, "step": 4794 }, { "epoch": 0.30997979797979797, "grad_norm": 0.06467068195343018, "learning_rate": 0.00019565779243174258, "loss": 0.0871, "step": 4795 }, { "epoch": 0.31004444444444446, "grad_norm": 0.06228816881775856, "learning_rate": 0.00019565579891020645, "loss": 0.0911, "step": 4796 }, { "epoch": 0.3101090909090909, "grad_norm": 0.07857277989387512, "learning_rate": 0.00019565380494131974, "loss": 0.1219, "step": 4797 }, { "epoch": 0.3101737373737374, "grad_norm": 0.06959903985261917, "learning_rate": 0.00019565181052509181, "loss": 0.101, "step": 4798 }, { "epoch": 0.3102383838383838, "grad_norm": 0.07208540290594101, "learning_rate": 0.000195649815661532, "loss": 0.0949, "step": 4799 }, { "epoch": 0.3103030303030303, "grad_norm": 0.05932864174246788, "learning_rate": 0.00019564782035064963, "loss": 0.085, "step": 4800 }, { "epoch": 0.3103030303030303, "eval_bleu": 17.41184961237193, "eval_loss": 0.09466177970170975, "eval_runtime": 2.7229, "eval_samples_per_second": 11.752, "eval_steps_per_second": 1.469, "step": 4800 }, { "epoch": 0.3103676767676768, "grad_norm": 0.07277443259954453, "learning_rate": 0.00019564582459245399, "loss": 0.1045, "step": 4801 }, { "epoch": 0.3104323232323232, "grad_norm": 0.06777191907167435, "learning_rate": 0.00019564382838695447, "loss": 0.1013, "step": 4802 
}, { "epoch": 0.3104969696969697, "grad_norm": 0.06147941201925278, "learning_rate": 0.0001956418317341604, "loss": 0.0893, "step": 4803 }, { "epoch": 0.31056161616161615, "grad_norm": 0.07320127636194229, "learning_rate": 0.0001956398346340811, "loss": 0.1013, "step": 4804 }, { "epoch": 0.31062626262626264, "grad_norm": 0.06448844820261002, "learning_rate": 0.00019563783708672586, "loss": 0.0835, "step": 4805 }, { "epoch": 0.3106909090909091, "grad_norm": 0.06548895686864853, "learning_rate": 0.00019563583909210411, "loss": 0.0859, "step": 4806 }, { "epoch": 0.31075555555555556, "grad_norm": 0.06265900284051895, "learning_rate": 0.00019563384065022517, "loss": 0.0802, "step": 4807 }, { "epoch": 0.310820202020202, "grad_norm": 0.0711962878704071, "learning_rate": 0.00019563184176109836, "loss": 0.0859, "step": 4808 }, { "epoch": 0.3108848484848485, "grad_norm": 0.06866409629583359, "learning_rate": 0.00019562984242473306, "loss": 0.0851, "step": 4809 }, { "epoch": 0.310949494949495, "grad_norm": 0.06327687948942184, "learning_rate": 0.0001956278426411386, "loss": 0.0874, "step": 4810 }, { "epoch": 0.3110141414141414, "grad_norm": 0.11567649245262146, "learning_rate": 0.00019562584241032428, "loss": 0.0916, "step": 4811 }, { "epoch": 0.3110787878787879, "grad_norm": 0.06246242672204971, "learning_rate": 0.00019562384173229958, "loss": 0.0866, "step": 4812 }, { "epoch": 0.31114343434343433, "grad_norm": 0.07560346275568008, "learning_rate": 0.00019562184060707374, "loss": 0.0973, "step": 4813 }, { "epoch": 0.3112080808080808, "grad_norm": 0.06402620673179626, "learning_rate": 0.00019561983903465616, "loss": 0.0968, "step": 4814 }, { "epoch": 0.31127272727272726, "grad_norm": 0.07922115921974182, "learning_rate": 0.00019561783701505623, "loss": 0.1115, "step": 4815 }, { "epoch": 0.31133737373737375, "grad_norm": 0.062015239149332047, "learning_rate": 0.00019561583454828327, "loss": 0.0861, "step": 4816 }, { "epoch": 0.31133737373737375, "eval_bleu": 12.5886452578275, 
"eval_loss": 0.0951358824968338, "eval_runtime": 2.8554, "eval_samples_per_second": 11.207, "eval_steps_per_second": 1.401, "step": 4816 }, { "epoch": 0.3114020202020202, "grad_norm": 0.058490097522735596, "learning_rate": 0.00019561383163434667, "loss": 0.0764, "step": 4817 }, { "epoch": 0.31146666666666667, "grad_norm": 0.07096497714519501, "learning_rate": 0.0001956118282732558, "loss": 0.106, "step": 4818 }, { "epoch": 0.31153131313131316, "grad_norm": 0.059846751391887665, "learning_rate": 0.00019560982446501998, "loss": 0.0857, "step": 4819 }, { "epoch": 0.3115959595959596, "grad_norm": 0.06803713738918304, "learning_rate": 0.00019560782020964864, "loss": 0.0925, "step": 4820 }, { "epoch": 0.3116606060606061, "grad_norm": 0.06713566929101944, "learning_rate": 0.00019560581550715113, "loss": 0.0892, "step": 4821 }, { "epoch": 0.3117252525252525, "grad_norm": 0.07706134766340256, "learning_rate": 0.00019560381035753683, "loss": 0.1105, "step": 4822 }, { "epoch": 0.311789898989899, "grad_norm": 0.06967047601938248, "learning_rate": 0.00019560180476081514, "loss": 0.0773, "step": 4823 }, { "epoch": 0.31185454545454544, "grad_norm": 0.06331364065408707, "learning_rate": 0.0001955997987169954, "loss": 0.0823, "step": 4824 }, { "epoch": 0.31191919191919193, "grad_norm": 0.06123747304081917, "learning_rate": 0.000195597792226087, "loss": 0.081, "step": 4825 }, { "epoch": 0.31198383838383836, "grad_norm": 0.07577239722013474, "learning_rate": 0.00019559578528809932, "loss": 0.1067, "step": 4826 }, { "epoch": 0.31204848484848485, "grad_norm": 0.06580839306116104, "learning_rate": 0.00019559377790304177, "loss": 0.0913, "step": 4827 }, { "epoch": 0.3121131313131313, "grad_norm": 0.07268828898668289, "learning_rate": 0.00019559177007092367, "loss": 0.0835, "step": 4828 }, { "epoch": 0.3121777777777778, "grad_norm": 0.0747152790427208, "learning_rate": 0.00019558976179175454, "loss": 0.1046, "step": 4829 }, { "epoch": 0.31224242424242427, "grad_norm": 0.06427829712629318, 
"learning_rate": 0.00019558775306554368, "loss": 0.0716, "step": 4830 }, { "epoch": 0.3123070707070707, "grad_norm": 0.07013077288866043, "learning_rate": 0.00019558574389230048, "loss": 0.0884, "step": 4831 }, { "epoch": 0.3123717171717172, "grad_norm": 0.05118946731090546, "learning_rate": 0.00019558373427203436, "loss": 0.069, "step": 4832 }, { "epoch": 0.3123717171717172, "eval_bleu": 15.852056043665936, "eval_loss": 0.09288105368614197, "eval_runtime": 2.6122, "eval_samples_per_second": 12.25, "eval_steps_per_second": 1.531, "step": 4832 }, { "epoch": 0.3124363636363636, "grad_norm": 0.0632585808634758, "learning_rate": 0.00019558172420475471, "loss": 0.0894, "step": 4833 }, { "epoch": 0.3125010101010101, "grad_norm": 0.059064168483018875, "learning_rate": 0.00019557971369047096, "loss": 0.0764, "step": 4834 }, { "epoch": 0.31256565656565655, "grad_norm": 0.06656047701835632, "learning_rate": 0.00019557770272919244, "loss": 0.1028, "step": 4835 }, { "epoch": 0.31263030303030304, "grad_norm": 0.0714767724275589, "learning_rate": 0.00019557569132092864, "loss": 0.1004, "step": 4836 }, { "epoch": 0.31269494949494947, "grad_norm": 0.0642804428935051, "learning_rate": 0.00019557367946568892, "loss": 0.089, "step": 4837 }, { "epoch": 0.31275959595959596, "grad_norm": 0.07364820688962936, "learning_rate": 0.00019557166716348268, "loss": 0.0984, "step": 4838 }, { "epoch": 0.31282424242424245, "grad_norm": 0.0542878694832325, "learning_rate": 0.00019556965441431938, "loss": 0.0639, "step": 4839 }, { "epoch": 0.3128888888888889, "grad_norm": 0.06015294790267944, "learning_rate": 0.00019556764121820837, "loss": 0.0764, "step": 4840 }, { "epoch": 0.3129535353535354, "grad_norm": 0.06800837814807892, "learning_rate": 0.00019556562757515913, "loss": 0.0994, "step": 4841 }, { "epoch": 0.3130181818181818, "grad_norm": 0.061244990676641464, "learning_rate": 0.000195563613485181, "loss": 0.0854, "step": 4842 }, { "epoch": 0.3130828282828283, "grad_norm": 0.06450098007917404, 
"learning_rate": 0.0001955615989482835, "loss": 0.0759, "step": 4843 }, { "epoch": 0.31314747474747473, "grad_norm": 0.05966275930404663, "learning_rate": 0.00019555958396447594, "loss": 0.0785, "step": 4844 }, { "epoch": 0.3132121212121212, "grad_norm": 0.06054462492465973, "learning_rate": 0.0001955575685337678, "loss": 0.086, "step": 4845 }, { "epoch": 0.31327676767676765, "grad_norm": 0.07119578868150711, "learning_rate": 0.00019555555265616853, "loss": 0.092, "step": 4846 }, { "epoch": 0.31334141414141414, "grad_norm": 0.07607617229223251, "learning_rate": 0.0001955535363316875, "loss": 0.1032, "step": 4847 }, { "epoch": 0.31340606060606063, "grad_norm": 0.06838122010231018, "learning_rate": 0.0001955515195603342, "loss": 0.0932, "step": 4848 }, { "epoch": 0.31340606060606063, "eval_bleu": 14.165919280145383, "eval_loss": 0.09376853704452515, "eval_runtime": 2.7251, "eval_samples_per_second": 11.743, "eval_steps_per_second": 1.468, "step": 4848 }, { "epoch": 0.31347070707070707, "grad_norm": 0.06580153852701187, "learning_rate": 0.000195549502342118, "loss": 0.084, "step": 4849 }, { "epoch": 0.31353535353535356, "grad_norm": 0.06856783479452133, "learning_rate": 0.00019554748467704843, "loss": 0.0874, "step": 4850 }, { "epoch": 0.3136, "grad_norm": 0.06505312770605087, "learning_rate": 0.0001955454665651348, "loss": 0.0918, "step": 4851 }, { "epoch": 0.3136646464646465, "grad_norm": 0.06147868186235428, "learning_rate": 0.0001955434480063866, "loss": 0.0844, "step": 4852 }, { "epoch": 0.3137292929292929, "grad_norm": 0.06927355378866196, "learning_rate": 0.00019554142900081334, "loss": 0.0943, "step": 4853 }, { "epoch": 0.3137939393939394, "grad_norm": 0.07105158269405365, "learning_rate": 0.00019553940954842436, "loss": 0.1024, "step": 4854 }, { "epoch": 0.31385858585858584, "grad_norm": 0.06368059664964676, "learning_rate": 0.00019553738964922914, "loss": 0.1013, "step": 4855 }, { "epoch": 0.3139232323232323, "grad_norm": 0.06101905554533005, 
"learning_rate": 0.00019553536930323718, "loss": 0.0831, "step": 4856 }, { "epoch": 0.31398787878787876, "grad_norm": 0.05785023048520088, "learning_rate": 0.00019553334851045784, "loss": 0.0852, "step": 4857 }, { "epoch": 0.31405252525252525, "grad_norm": 0.06406204402446747, "learning_rate": 0.0001955313272709006, "loss": 0.0851, "step": 4858 }, { "epoch": 0.31411717171717174, "grad_norm": 0.07250870019197464, "learning_rate": 0.00019552930558457496, "loss": 0.0991, "step": 4859 }, { "epoch": 0.3141818181818182, "grad_norm": 0.06991460919380188, "learning_rate": 0.0001955272834514903, "loss": 0.0921, "step": 4860 }, { "epoch": 0.31424646464646466, "grad_norm": 0.0643521249294281, "learning_rate": 0.00019552526087165614, "loss": 0.1065, "step": 4861 }, { "epoch": 0.3143111111111111, "grad_norm": 0.059560347348451614, "learning_rate": 0.00019552323784508192, "loss": 0.0873, "step": 4862 }, { "epoch": 0.3143757575757576, "grad_norm": 0.05429724231362343, "learning_rate": 0.0001955212143717771, "loss": 0.0695, "step": 4863 }, { "epoch": 0.314440404040404, "grad_norm": 0.05925700441002846, "learning_rate": 0.00019551919045175114, "loss": 0.0784, "step": 4864 }, { "epoch": 0.314440404040404, "eval_bleu": 17.800953675232073, "eval_loss": 0.09244292229413986, "eval_runtime": 2.7021, "eval_samples_per_second": 11.843, "eval_steps_per_second": 1.48, "step": 4864 }, { "epoch": 0.3145050505050505, "grad_norm": 0.05828540027141571, "learning_rate": 0.0001955171660850135, "loss": 0.0814, "step": 4865 }, { "epoch": 0.31456969696969694, "grad_norm": 0.07010854035615921, "learning_rate": 0.00019551514127157362, "loss": 0.1048, "step": 4866 }, { "epoch": 0.31463434343434343, "grad_norm": 0.0630769357085228, "learning_rate": 0.00019551311601144104, "loss": 0.0826, "step": 4867 }, { "epoch": 0.3146989898989899, "grad_norm": 0.06398430466651917, "learning_rate": 0.0001955110903046252, "loss": 0.0823, "step": 4868 }, { "epoch": 0.31476363636363636, "grad_norm": 0.06699857115745544, 
"learning_rate": 0.00019550906415113554, "loss": 0.0711, "step": 4869 }, { "epoch": 0.31482828282828285, "grad_norm": 0.05718837305903435, "learning_rate": 0.00019550703755098154, "loss": 0.0677, "step": 4870 }, { "epoch": 0.3148929292929293, "grad_norm": 0.07908740639686584, "learning_rate": 0.00019550501050417273, "loss": 0.102, "step": 4871 }, { "epoch": 0.31495757575757577, "grad_norm": 0.0655069574713707, "learning_rate": 0.00019550298301071857, "loss": 0.0802, "step": 4872 }, { "epoch": 0.3150222222222222, "grad_norm": 0.07686731964349747, "learning_rate": 0.00019550095507062852, "loss": 0.0956, "step": 4873 }, { "epoch": 0.3150868686868687, "grad_norm": 0.07690903544425964, "learning_rate": 0.00019549892668391206, "loss": 0.0959, "step": 4874 }, { "epoch": 0.3151515151515151, "grad_norm": 0.06656327843666077, "learning_rate": 0.00019549689785057872, "loss": 0.1028, "step": 4875 }, { "epoch": 0.3152161616161616, "grad_norm": 0.06390737742185593, "learning_rate": 0.00019549486857063793, "loss": 0.0778, "step": 4876 }, { "epoch": 0.3152808080808081, "grad_norm": 0.07323991507291794, "learning_rate": 0.0001954928388440992, "loss": 0.1061, "step": 4877 }, { "epoch": 0.31534545454545454, "grad_norm": 0.06591487675905228, "learning_rate": 0.0001954908086709721, "loss": 0.0972, "step": 4878 }, { "epoch": 0.31541010101010103, "grad_norm": 0.0657346174120903, "learning_rate": 0.00019548877805126598, "loss": 0.0851, "step": 4879 }, { "epoch": 0.31547474747474746, "grad_norm": 0.07673000544309616, "learning_rate": 0.00019548674698499044, "loss": 0.1101, "step": 4880 }, { "epoch": 0.31547474747474746, "eval_bleu": 13.40367618882521, "eval_loss": 0.09336511045694351, "eval_runtime": 2.8707, "eval_samples_per_second": 11.147, "eval_steps_per_second": 1.393, "step": 4880 }, { "epoch": 0.31553939393939395, "grad_norm": 0.05856234207749367, "learning_rate": 0.00019548471547215497, "loss": 0.0744, "step": 4881 }, { "epoch": 0.3156040404040404, "grad_norm": 0.10287497937679291, 
"learning_rate": 0.00019548268351276903, "loss": 0.0995, "step": 4882 }, { "epoch": 0.3156686868686869, "grad_norm": 0.06215628236532211, "learning_rate": 0.00019548065110684215, "loss": 0.0834, "step": 4883 }, { "epoch": 0.3157333333333333, "grad_norm": 0.062197282910346985, "learning_rate": 0.00019547861825438385, "loss": 0.0865, "step": 4884 }, { "epoch": 0.3157979797979798, "grad_norm": 0.06098649278283119, "learning_rate": 0.0001954765849554036, "loss": 0.0873, "step": 4885 }, { "epoch": 0.3158626262626263, "grad_norm": 0.057393934577703476, "learning_rate": 0.00019547455120991095, "loss": 0.0873, "step": 4886 }, { "epoch": 0.3159272727272727, "grad_norm": 0.05440773814916611, "learning_rate": 0.00019547251701791533, "loss": 0.0782, "step": 4887 }, { "epoch": 0.3159919191919192, "grad_norm": 0.06450515240430832, "learning_rate": 0.00019547048237942636, "loss": 0.1002, "step": 4888 }, { "epoch": 0.31605656565656565, "grad_norm": 0.05709666386246681, "learning_rate": 0.00019546844729445348, "loss": 0.0888, "step": 4889 }, { "epoch": 0.31612121212121214, "grad_norm": 0.06465613842010498, "learning_rate": 0.00019546641176300625, "loss": 0.0896, "step": 4890 }, { "epoch": 0.31618585858585857, "grad_norm": 0.06351575255393982, "learning_rate": 0.00019546437578509416, "loss": 0.0831, "step": 4891 }, { "epoch": 0.31625050505050506, "grad_norm": 0.06688931584358215, "learning_rate": 0.00019546233936072676, "loss": 0.1025, "step": 4892 }, { "epoch": 0.3163151515151515, "grad_norm": 0.058955784887075424, "learning_rate": 0.00019546030248991354, "loss": 0.0773, "step": 4893 }, { "epoch": 0.316379797979798, "grad_norm": 0.0577361173927784, "learning_rate": 0.00019545826517266404, "loss": 0.0815, "step": 4894 }, { "epoch": 0.3164444444444444, "grad_norm": 0.0661374032497406, "learning_rate": 0.00019545622740898782, "loss": 0.0963, "step": 4895 }, { "epoch": 0.3165090909090909, "grad_norm": 0.06921987980604172, "learning_rate": 0.00019545418919889436, "loss": 0.0891, "step": 
4896 }, { "epoch": 0.3165090909090909, "eval_bleu": 12.683546899011779, "eval_loss": 0.0946357250213623, "eval_runtime": 2.6926, "eval_samples_per_second": 11.884, "eval_steps_per_second": 1.486, "step": 4896 }, { "epoch": 0.3165737373737374, "grad_norm": 0.0733335018157959, "learning_rate": 0.00019545215054239323, "loss": 0.1108, "step": 4897 }, { "epoch": 0.31663838383838383, "grad_norm": 0.06435233354568481, "learning_rate": 0.00019545011143949392, "loss": 0.0871, "step": 4898 }, { "epoch": 0.3167030303030303, "grad_norm": 0.06435911357402802, "learning_rate": 0.00019544807189020603, "loss": 0.0883, "step": 4899 }, { "epoch": 0.31676767676767675, "grad_norm": 0.07331840693950653, "learning_rate": 0.00019544603189453904, "loss": 0.1151, "step": 4900 }, { "epoch": 0.31683232323232324, "grad_norm": 0.06468986719846725, "learning_rate": 0.00019544399145250249, "loss": 0.093, "step": 4901 }, { "epoch": 0.3168969696969697, "grad_norm": 0.07084932923316956, "learning_rate": 0.000195441950564106, "loss": 0.089, "step": 4902 }, { "epoch": 0.31696161616161617, "grad_norm": 0.07533324509859085, "learning_rate": 0.000195439909229359, "loss": 0.1045, "step": 4903 }, { "epoch": 0.3170262626262626, "grad_norm": 0.06669335067272186, "learning_rate": 0.00019543786744827114, "loss": 0.0858, "step": 4904 }, { "epoch": 0.3170909090909091, "grad_norm": 0.06391675025224686, "learning_rate": 0.0001954358252208519, "loss": 0.0902, "step": 4905 }, { "epoch": 0.3171555555555556, "grad_norm": 0.06471959501504898, "learning_rate": 0.0001954337825471109, "loss": 0.0825, "step": 4906 }, { "epoch": 0.317220202020202, "grad_norm": 0.07275552302598953, "learning_rate": 0.0001954317394270576, "loss": 0.1025, "step": 4907 }, { "epoch": 0.3172848484848485, "grad_norm": 0.062462080270051956, "learning_rate": 0.00019542969586070164, "loss": 0.0964, "step": 4908 }, { "epoch": 0.31734949494949494, "grad_norm": 0.06980860978364944, "learning_rate": 0.0001954276518480525, "loss": 0.0823, "step": 4909 }, 
{ "epoch": 0.3174141414141414, "grad_norm": 0.07418327033519745, "learning_rate": 0.00019542560738911981, "loss": 0.1083, "step": 4910 }, { "epoch": 0.31747878787878786, "grad_norm": 0.06741069257259369, "learning_rate": 0.0001954235624839131, "loss": 0.0874, "step": 4911 }, { "epoch": 0.31754343434343435, "grad_norm": 0.06400905549526215, "learning_rate": 0.00019542151713244191, "loss": 0.0838, "step": 4912 }, { "epoch": 0.31754343434343435, "eval_bleu": 13.657495976169672, "eval_loss": 0.09310087561607361, "eval_runtime": 2.7443, "eval_samples_per_second": 11.661, "eval_steps_per_second": 1.458, "step": 4912 }, { "epoch": 0.3176080808080808, "grad_norm": 0.0613933689892292, "learning_rate": 0.00019541947133471587, "loss": 0.0812, "step": 4913 }, { "epoch": 0.3176727272727273, "grad_norm": 0.06032070517539978, "learning_rate": 0.00019541742509074448, "loss": 0.0744, "step": 4914 }, { "epoch": 0.31773737373737376, "grad_norm": 0.07494694739580154, "learning_rate": 0.00019541537840053733, "loss": 0.1026, "step": 4915 }, { "epoch": 0.3178020202020202, "grad_norm": 0.061808351427316666, "learning_rate": 0.000195413331264104, "loss": 0.0854, "step": 4916 }, { "epoch": 0.3178666666666667, "grad_norm": 0.05917581915855408, "learning_rate": 0.00019541128368145408, "loss": 0.079, "step": 4917 }, { "epoch": 0.3179313131313131, "grad_norm": 0.07325282692909241, "learning_rate": 0.0001954092356525971, "loss": 0.1025, "step": 4918 }, { "epoch": 0.3179959595959596, "grad_norm": 0.06206470727920532, "learning_rate": 0.0001954071871775427, "loss": 0.0951, "step": 4919 }, { "epoch": 0.31806060606060604, "grad_norm": 0.06596078723669052, "learning_rate": 0.00019540513825630043, "loss": 0.0986, "step": 4920 }, { "epoch": 0.31812525252525253, "grad_norm": 0.06132863834500313, "learning_rate": 0.00019540308888887987, "loss": 0.0883, "step": 4921 }, { "epoch": 0.31818989898989897, "grad_norm": 0.06334802508354187, "learning_rate": 0.0001954010390752906, "loss": 0.0945, "step": 4922 }, 
{ "epoch": 0.31825454545454546, "grad_norm": 0.0976632758975029, "learning_rate": 0.00019539898881554217, "loss": 0.0809, "step": 4923 }, { "epoch": 0.31831919191919195, "grad_norm": 0.0650988519191742, "learning_rate": 0.00019539693810964424, "loss": 0.0926, "step": 4924 }, { "epoch": 0.3183838383838384, "grad_norm": 0.06815777719020844, "learning_rate": 0.00019539488695760637, "loss": 0.0912, "step": 4925 }, { "epoch": 0.31844848484848487, "grad_norm": 0.06816314160823822, "learning_rate": 0.00019539283535943813, "loss": 0.0883, "step": 4926 }, { "epoch": 0.3185131313131313, "grad_norm": 0.06158888712525368, "learning_rate": 0.00019539078331514914, "loss": 0.0914, "step": 4927 }, { "epoch": 0.3185777777777778, "grad_norm": 0.0751294270157814, "learning_rate": 0.00019538873082474903, "loss": 0.1189, "step": 4928 }, { "epoch": 0.3185777777777778, "eval_bleu": 14.969845234431299, "eval_loss": 0.09190497547388077, "eval_runtime": 2.6879, "eval_samples_per_second": 11.905, "eval_steps_per_second": 1.488, "step": 4928 }, { "epoch": 0.3186424242424242, "grad_norm": 0.060045670717954636, "learning_rate": 0.00019538667788824733, "loss": 0.0875, "step": 4929 }, { "epoch": 0.3187070707070707, "grad_norm": 0.06228606030344963, "learning_rate": 0.00019538462450565365, "loss": 0.0927, "step": 4930 }, { "epoch": 0.31877171717171715, "grad_norm": 0.06696313619613647, "learning_rate": 0.00019538257067697765, "loss": 0.0964, "step": 4931 }, { "epoch": 0.31883636363636364, "grad_norm": 0.05998144671320915, "learning_rate": 0.00019538051640222888, "loss": 0.0817, "step": 4932 }, { "epoch": 0.3189010101010101, "grad_norm": 0.06348895281553268, "learning_rate": 0.00019537846168141699, "loss": 0.086, "step": 4933 }, { "epoch": 0.31896565656565656, "grad_norm": 0.06780138611793518, "learning_rate": 0.00019537640651455155, "loss": 0.0935, "step": 4934 }, { "epoch": 0.31903030303030305, "grad_norm": 0.0802244022488594, "learning_rate": 0.00019537435090164217, "loss": 0.0878, "step": 4935 
}, { "epoch": 0.3190949494949495, "grad_norm": 0.06998592615127563, "learning_rate": 0.00019537229484269851, "loss": 0.0838, "step": 4936 }, { "epoch": 0.319159595959596, "grad_norm": 0.054281946271657944, "learning_rate": 0.00019537023833773013, "loss": 0.0691, "step": 4937 }, { "epoch": 0.3192242424242424, "grad_norm": 0.06317893415689468, "learning_rate": 0.00019536818138674668, "loss": 0.0788, "step": 4938 }, { "epoch": 0.3192888888888889, "grad_norm": 0.057587601244449615, "learning_rate": 0.0001953661239897578, "loss": 0.0851, "step": 4939 }, { "epoch": 0.31935353535353533, "grad_norm": 0.06710262596607208, "learning_rate": 0.00019536406614677307, "loss": 0.09, "step": 4940 }, { "epoch": 0.3194181818181818, "grad_norm": 0.06977008283138275, "learning_rate": 0.00019536200785780214, "loss": 0.0875, "step": 4941 }, { "epoch": 0.31948282828282826, "grad_norm": 0.06874189525842667, "learning_rate": 0.0001953599491228546, "loss": 0.0866, "step": 4942 }, { "epoch": 0.31954747474747475, "grad_norm": 0.06986114382743835, "learning_rate": 0.0001953578899419401, "loss": 0.094, "step": 4943 }, { "epoch": 0.31961212121212124, "grad_norm": 0.07599589228630066, "learning_rate": 0.0001953558303150683, "loss": 0.1091, "step": 4944 }, { "epoch": 0.31961212121212124, "eval_bleu": 12.465814519403771, "eval_loss": 0.0939895510673523, "eval_runtime": 2.7312, "eval_samples_per_second": 11.716, "eval_steps_per_second": 1.465, "step": 4944 }, { "epoch": 0.31967676767676767, "grad_norm": 0.07070202380418777, "learning_rate": 0.0001953537702422488, "loss": 0.0944, "step": 4945 }, { "epoch": 0.31974141414141416, "grad_norm": 0.06438890099525452, "learning_rate": 0.00019535170972349123, "loss": 0.0808, "step": 4946 }, { "epoch": 0.3198060606060606, "grad_norm": 0.05122537165880203, "learning_rate": 0.00019534964875880527, "loss": 0.0655, "step": 4947 }, { "epoch": 0.3198707070707071, "grad_norm": 0.06715032458305359, "learning_rate": 0.00019534758734820047, "loss": 0.0954, "step": 4948 
}, { "epoch": 0.3199353535353535, "grad_norm": 0.06961732357740402, "learning_rate": 0.00019534552549168658, "loss": 0.0747, "step": 4949 }, { "epoch": 0.32, "grad_norm": 0.06812962889671326, "learning_rate": 0.00019534346318927315, "loss": 0.0955, "step": 4950 }, { "epoch": 0.32006464646464644, "grad_norm": 0.06949734687805176, "learning_rate": 0.00019534140044096988, "loss": 0.0892, "step": 4951 }, { "epoch": 0.32012929292929293, "grad_norm": 0.0696600005030632, "learning_rate": 0.0001953393372467864, "loss": 0.0983, "step": 4952 }, { "epoch": 0.3201939393939394, "grad_norm": 0.05432440713047981, "learning_rate": 0.00019533727360673234, "loss": 0.0658, "step": 4953 }, { "epoch": 0.32025858585858585, "grad_norm": 0.06622884422540665, "learning_rate": 0.00019533520952081738, "loss": 0.0929, "step": 4954 }, { "epoch": 0.32032323232323234, "grad_norm": 0.06357545405626297, "learning_rate": 0.00019533314498905116, "loss": 0.0905, "step": 4955 }, { "epoch": 0.3203878787878788, "grad_norm": 0.070916548371315, "learning_rate": 0.00019533108001144333, "loss": 0.0968, "step": 4956 }, { "epoch": 0.32045252525252527, "grad_norm": 0.0630759745836258, "learning_rate": 0.00019532901458800357, "loss": 0.0865, "step": 4957 }, { "epoch": 0.3205171717171717, "grad_norm": 0.06789695471525192, "learning_rate": 0.0001953269487187415, "loss": 0.0863, "step": 4958 }, { "epoch": 0.3205818181818182, "grad_norm": 0.07558803260326385, "learning_rate": 0.00019532488240366685, "loss": 0.1069, "step": 4959 }, { "epoch": 0.3206464646464646, "grad_norm": 0.06685559451580048, "learning_rate": 0.0001953228156427892, "loss": 0.0962, "step": 4960 }, { "epoch": 0.3206464646464646, "eval_bleu": 12.772305385473597, "eval_loss": 0.09306998550891876, "eval_runtime": 2.8422, "eval_samples_per_second": 11.259, "eval_steps_per_second": 1.407, "step": 4960 }, { "epoch": 0.3207111111111111, "grad_norm": 0.061944857239723206, "learning_rate": 0.00019532074843611828, "loss": 0.0872, "step": 4961 }, { "epoch": 
0.3207757575757576, "grad_norm": 0.056623827666044235, "learning_rate": 0.0001953186807836637, "loss": 0.0802, "step": 4962 }, { "epoch": 0.32084040404040404, "grad_norm": 0.08329882472753525, "learning_rate": 0.0001953166126854352, "loss": 0.1031, "step": 4963 }, { "epoch": 0.3209050505050505, "grad_norm": 0.05859116464853287, "learning_rate": 0.0001953145441414424, "loss": 0.0758, "step": 4964 }, { "epoch": 0.32096969696969696, "grad_norm": 0.0700867548584938, "learning_rate": 0.00019531247515169496, "loss": 0.1055, "step": 4965 }, { "epoch": 0.32103434343434345, "grad_norm": 0.08815409243106842, "learning_rate": 0.0001953104057162026, "loss": 0.0773, "step": 4966 }, { "epoch": 0.3210989898989899, "grad_norm": 0.06450144201517105, "learning_rate": 0.00019530833583497498, "loss": 0.0866, "step": 4967 }, { "epoch": 0.3211636363636364, "grad_norm": 0.07723083347082138, "learning_rate": 0.0001953062655080218, "loss": 0.1056, "step": 4968 }, { "epoch": 0.3212282828282828, "grad_norm": 0.0583147332072258, "learning_rate": 0.00019530419473535272, "loss": 0.0774, "step": 4969 }, { "epoch": 0.3212929292929293, "grad_norm": 0.059548504650592804, "learning_rate": 0.00019530212351697742, "loss": 0.0675, "step": 4970 }, { "epoch": 0.32135757575757573, "grad_norm": 0.06677448749542236, "learning_rate": 0.00019530005185290557, "loss": 0.0881, "step": 4971 }, { "epoch": 0.3214222222222222, "grad_norm": 0.07151346653699875, "learning_rate": 0.00019529797974314692, "loss": 0.103, "step": 4972 }, { "epoch": 0.3214868686868687, "grad_norm": 0.07726530730724335, "learning_rate": 0.0001952959071877111, "loss": 0.1003, "step": 4973 }, { "epoch": 0.32155151515151514, "grad_norm": 0.05737035349011421, "learning_rate": 0.00019529383418660784, "loss": 0.0811, "step": 4974 }, { "epoch": 0.32161616161616163, "grad_norm": 0.07651960849761963, "learning_rate": 0.00019529176073984682, "loss": 0.0986, "step": 4975 }, { "epoch": 0.32168080808080807, "grad_norm": 0.06654155254364014, 
"learning_rate": 0.00019528968684743772, "loss": 0.0809, "step": 4976 }, { "epoch": 0.32168080808080807, "eval_bleu": 13.39792781488739, "eval_loss": 0.09400483965873718, "eval_runtime": 2.7493, "eval_samples_per_second": 11.639, "eval_steps_per_second": 1.455, "step": 4976 }, { "epoch": 0.32174545454545456, "grad_norm": 0.06384915113449097, "learning_rate": 0.00019528761250939028, "loss": 0.1004, "step": 4977 }, { "epoch": 0.321810101010101, "grad_norm": 0.07585534453392029, "learning_rate": 0.00019528553772571417, "loss": 0.0859, "step": 4978 }, { "epoch": 0.3218747474747475, "grad_norm": 0.0623493455350399, "learning_rate": 0.00019528346249641913, "loss": 0.0817, "step": 4979 }, { "epoch": 0.3219393939393939, "grad_norm": 0.06571008265018463, "learning_rate": 0.0001952813868215148, "loss": 0.0918, "step": 4980 }, { "epoch": 0.3220040404040404, "grad_norm": 0.057330675423145294, "learning_rate": 0.00019527931070101092, "loss": 0.0704, "step": 4981 }, { "epoch": 0.3220686868686869, "grad_norm": 0.07009575515985489, "learning_rate": 0.0001952772341349172, "loss": 0.0998, "step": 4982 }, { "epoch": 0.3221333333333333, "grad_norm": 0.05783507972955704, "learning_rate": 0.0001952751571232434, "loss": 0.0817, "step": 4983 }, { "epoch": 0.3221979797979798, "grad_norm": 0.0648999735713005, "learning_rate": 0.00019527307966599912, "loss": 0.1036, "step": 4984 }, { "epoch": 0.32226262626262625, "grad_norm": 0.059820324182510376, "learning_rate": 0.0001952710017631942, "loss": 0.0822, "step": 4985 }, { "epoch": 0.32232727272727274, "grad_norm": 0.06529063731431961, "learning_rate": 0.0001952689234148383, "loss": 0.1029, "step": 4986 }, { "epoch": 0.3223919191919192, "grad_norm": 0.0750807449221611, "learning_rate": 0.0001952668446209411, "loss": 0.1171, "step": 4987 }, { "epoch": 0.32245656565656566, "grad_norm": 0.08445903658866882, "learning_rate": 0.00019526476538151238, "loss": 0.1071, "step": 4988 }, { "epoch": 0.3225212121212121, "grad_norm": 0.05898655205965042, 
"learning_rate": 0.00019526268569656184, "loss": 0.0819, "step": 4989 }, { "epoch": 0.3225858585858586, "grad_norm": 0.052940040826797485, "learning_rate": 0.00019526060556609922, "loss": 0.0706, "step": 4990 }, { "epoch": 0.3226505050505051, "grad_norm": 0.061189912259578705, "learning_rate": 0.00019525852499013423, "loss": 0.0868, "step": 4991 }, { "epoch": 0.3227151515151515, "grad_norm": 0.07143127918243408, "learning_rate": 0.00019525644396867664, "loss": 0.0843, "step": 4992 }, { "epoch": 0.3227151515151515, "eval_bleu": 15.672153208444069, "eval_loss": 0.09378588944673538, "eval_runtime": 2.7914, "eval_samples_per_second": 11.464, "eval_steps_per_second": 1.433, "step": 4992 }, { "epoch": 0.322779797979798, "grad_norm": 0.060790855437517166, "learning_rate": 0.00019525436250173613, "loss": 0.0834, "step": 4993 }, { "epoch": 0.32284444444444443, "grad_norm": 0.05707736685872078, "learning_rate": 0.00019525228058932245, "loss": 0.0802, "step": 4994 }, { "epoch": 0.3229090909090909, "grad_norm": 0.06695713102817535, "learning_rate": 0.00019525019823144537, "loss": 0.0811, "step": 4995 }, { "epoch": 0.32297373737373736, "grad_norm": 0.08092916756868362, "learning_rate": 0.00019524811542811457, "loss": 0.093, "step": 4996 }, { "epoch": 0.32303838383838385, "grad_norm": 0.0689164400100708, "learning_rate": 0.00019524603217933986, "loss": 0.098, "step": 4997 }, { "epoch": 0.3231030303030303, "grad_norm": 0.06591492891311646, "learning_rate": 0.0001952439484851309, "loss": 0.0966, "step": 4998 }, { "epoch": 0.32316767676767677, "grad_norm": 0.06822417676448822, "learning_rate": 0.00019524186434549752, "loss": 0.1026, "step": 4999 }, { "epoch": 0.32323232323232326, "grad_norm": 0.05683770403265953, "learning_rate": 0.0001952397797604494, "loss": 0.0806, "step": 5000 }, { "epoch": 0.3232969696969697, "grad_norm": 0.06668691337108612, "learning_rate": 0.00019523769472999634, "loss": 0.1039, "step": 5001 }, { "epoch": 0.3233616161616162, "grad_norm": 
0.062347084283828735, "learning_rate": 0.00019523560925414804, "loss": 0.1027, "step": 5002 }, { "epoch": 0.3234262626262626, "grad_norm": 0.062487971037626266, "learning_rate": 0.00019523352333291428, "loss": 0.0896, "step": 5003 }, { "epoch": 0.3234909090909091, "grad_norm": 0.05898349732160568, "learning_rate": 0.00019523143696630486, "loss": 0.0982, "step": 5004 }, { "epoch": 0.32355555555555554, "grad_norm": 0.06970492750406265, "learning_rate": 0.00019522935015432944, "loss": 0.1061, "step": 5005 }, { "epoch": 0.32362020202020203, "grad_norm": 0.06570102274417877, "learning_rate": 0.00019522726289699787, "loss": 0.0934, "step": 5006 }, { "epoch": 0.32368484848484846, "grad_norm": 0.06419141590595245, "learning_rate": 0.00019522517519431984, "loss": 0.0996, "step": 5007 }, { "epoch": 0.32374949494949495, "grad_norm": 0.06400182843208313, "learning_rate": 0.00019522308704630515, "loss": 0.0893, "step": 5008 }, { "epoch": 0.32374949494949495, "eval_bleu": 16.083217913553234, "eval_loss": 0.09427288174629211, "eval_runtime": 2.9016, "eval_samples_per_second": 11.028, "eval_steps_per_second": 1.379, "step": 5008 }, { "epoch": 0.3238141414141414, "grad_norm": 0.07289660722017288, "learning_rate": 0.00019522099845296358, "loss": 0.11, "step": 5009 }, { "epoch": 0.3238787878787879, "grad_norm": 0.05868663266301155, "learning_rate": 0.00019521890941430486, "loss": 0.072, "step": 5010 }, { "epoch": 0.32394343434343437, "grad_norm": 0.06341391801834106, "learning_rate": 0.0001952168199303388, "loss": 0.0918, "step": 5011 }, { "epoch": 0.3240080808080808, "grad_norm": 0.06332357972860336, "learning_rate": 0.00019521473000107516, "loss": 0.0908, "step": 5012 }, { "epoch": 0.3240727272727273, "grad_norm": 0.05746617540717125, "learning_rate": 0.00019521263962652368, "loss": 0.0809, "step": 5013 }, { "epoch": 0.3241373737373737, "grad_norm": 0.06302088499069214, "learning_rate": 0.00019521054880669415, "loss": 0.0842, "step": 5014 }, { "epoch": 0.3242020202020202, 
"grad_norm": 0.05614808201789856, "learning_rate": 0.0001952084575415964, "loss": 0.0714, "step": 5015 }, { "epoch": 0.32426666666666665, "grad_norm": 0.08165333420038223, "learning_rate": 0.00019520636583124015, "loss": 0.1204, "step": 5016 }, { "epoch": 0.32433131313131314, "grad_norm": 0.06666591018438339, "learning_rate": 0.0001952042736756352, "loss": 0.0843, "step": 5017 }, { "epoch": 0.32439595959595957, "grad_norm": 0.0809096246957779, "learning_rate": 0.00019520218107479132, "loss": 0.1043, "step": 5018 }, { "epoch": 0.32446060606060606, "grad_norm": 0.08147800713777542, "learning_rate": 0.00019520008802871832, "loss": 0.1007, "step": 5019 }, { "epoch": 0.32452525252525255, "grad_norm": 0.06933315098285675, "learning_rate": 0.000195197994537426, "loss": 0.1058, "step": 5020 }, { "epoch": 0.324589898989899, "grad_norm": 0.0682990625500679, "learning_rate": 0.0001951959006009241, "loss": 0.1006, "step": 5021 }, { "epoch": 0.3246545454545455, "grad_norm": 0.10208892822265625, "learning_rate": 0.00019519380621922249, "loss": 0.0916, "step": 5022 }, { "epoch": 0.3247191919191919, "grad_norm": 0.06210468336939812, "learning_rate": 0.0001951917113923309, "loss": 0.0868, "step": 5023 }, { "epoch": 0.3247838383838384, "grad_norm": 0.06577180325984955, "learning_rate": 0.00019518961612025913, "loss": 0.0864, "step": 5024 }, { "epoch": 0.3247838383838384, "eval_bleu": 12.472514122238822, "eval_loss": 0.0954444408416748, "eval_runtime": 2.714, "eval_samples_per_second": 11.791, "eval_steps_per_second": 1.474, "step": 5024 }, { "epoch": 0.32484848484848483, "grad_norm": 0.072667196393013, "learning_rate": 0.000195187520403017, "loss": 0.1103, "step": 5025 }, { "epoch": 0.3249131313131313, "grad_norm": 0.06775262206792831, "learning_rate": 0.00019518542424061433, "loss": 0.0843, "step": 5026 }, { "epoch": 0.32497777777777775, "grad_norm": 0.08101565390825272, "learning_rate": 0.00019518332763306085, "loss": 0.1159, "step": 5027 }, { "epoch": 0.32504242424242424, 
"grad_norm": 0.0728599950671196, "learning_rate": 0.00019518123058036646, "loss": 0.1021, "step": 5028 }, { "epoch": 0.32510707070707073, "grad_norm": 0.06834661215543747, "learning_rate": 0.0001951791330825409, "loss": 0.0975, "step": 5029 }, { "epoch": 0.32517171717171717, "grad_norm": 0.060497451573610306, "learning_rate": 0.00019517703513959397, "loss": 0.085, "step": 5030 }, { "epoch": 0.32523636363636366, "grad_norm": 0.06539740413427353, "learning_rate": 0.00019517493675153555, "loss": 0.0863, "step": 5031 }, { "epoch": 0.3253010101010101, "grad_norm": 0.07119203358888626, "learning_rate": 0.0001951728379183754, "loss": 0.0973, "step": 5032 }, { "epoch": 0.3253656565656566, "grad_norm": 0.06744899600744247, "learning_rate": 0.0001951707386401234, "loss": 0.0915, "step": 5033 }, { "epoch": 0.325430303030303, "grad_norm": 0.06295450776815414, "learning_rate": 0.00019516863891678928, "loss": 0.0767, "step": 5034 }, { "epoch": 0.3254949494949495, "grad_norm": 0.07959683239459991, "learning_rate": 0.00019516653874838288, "loss": 0.1101, "step": 5035 }, { "epoch": 0.32555959595959594, "grad_norm": 0.059050001204013824, "learning_rate": 0.00019516443813491404, "loss": 0.0836, "step": 5036 }, { "epoch": 0.3256242424242424, "grad_norm": 0.06866364181041718, "learning_rate": 0.0001951623370763926, "loss": 0.1026, "step": 5037 }, { "epoch": 0.3256888888888889, "grad_norm": 0.05746452137827873, "learning_rate": 0.00019516023557282836, "loss": 0.0717, "step": 5038 }, { "epoch": 0.32575353535353535, "grad_norm": 0.06633614748716354, "learning_rate": 0.0001951581336242312, "loss": 0.092, "step": 5039 }, { "epoch": 0.32581818181818184, "grad_norm": 0.06402871012687683, "learning_rate": 0.00019515603123061083, "loss": 0.0855, "step": 5040 }, { "epoch": 0.32581818181818184, "eval_bleu": 17.5241996538824, "eval_loss": 0.09342949092388153, "eval_runtime": 2.7274, "eval_samples_per_second": 11.733, "eval_steps_per_second": 1.467, "step": 5040 }, { "epoch": 0.3258828282828283, 
"grad_norm": 0.06713303178548813, "learning_rate": 0.00019515392839197722, "loss": 0.0993, "step": 5041 }, { "epoch": 0.32594747474747476, "grad_norm": 0.0637454017996788, "learning_rate": 0.00019515182510834015, "loss": 0.0811, "step": 5042 }, { "epoch": 0.3260121212121212, "grad_norm": 0.0662960335612297, "learning_rate": 0.00019514972137970942, "loss": 0.0882, "step": 5043 }, { "epoch": 0.3260767676767677, "grad_norm": 0.06274198740720749, "learning_rate": 0.00019514761720609492, "loss": 0.0855, "step": 5044 }, { "epoch": 0.3261414141414141, "grad_norm": 0.0655633732676506, "learning_rate": 0.00019514551258750643, "loss": 0.096, "step": 5045 }, { "epoch": 0.3262060606060606, "grad_norm": 0.07143282145261765, "learning_rate": 0.00019514340752395387, "loss": 0.1056, "step": 5046 }, { "epoch": 0.32627070707070704, "grad_norm": 0.05767039954662323, "learning_rate": 0.00019514130201544701, "loss": 0.0798, "step": 5047 }, { "epoch": 0.32633535353535353, "grad_norm": 0.07465171068906784, "learning_rate": 0.00019513919606199578, "loss": 0.0908, "step": 5048 }, { "epoch": 0.3264, "grad_norm": 0.07277978211641312, "learning_rate": 0.00019513708966360996, "loss": 0.1049, "step": 5049 }, { "epoch": 0.32646464646464646, "grad_norm": 0.06794938445091248, "learning_rate": 0.00019513498282029942, "loss": 0.0943, "step": 5050 }, { "epoch": 0.32652929292929295, "grad_norm": 0.07003027200698853, "learning_rate": 0.00019513287553207402, "loss": 0.0775, "step": 5051 }, { "epoch": 0.3265939393939394, "grad_norm": 0.06780106574296951, "learning_rate": 0.0001951307677989436, "loss": 0.094, "step": 5052 }, { "epoch": 0.32665858585858587, "grad_norm": 0.06895982474088669, "learning_rate": 0.00019512865962091803, "loss": 0.0862, "step": 5053 }, { "epoch": 0.3267232323232323, "grad_norm": 0.06453032046556473, "learning_rate": 0.0001951265509980072, "loss": 0.0901, "step": 5054 }, { "epoch": 0.3267878787878788, "grad_norm": 0.06938889622688293, "learning_rate": 0.00019512444193022093, 
"loss": 0.0947, "step": 5055 }, { "epoch": 0.32685252525252523, "grad_norm": 0.07029138505458832, "learning_rate": 0.00019512233241756908, "loss": 0.1069, "step": 5056 }, { "epoch": 0.32685252525252523, "eval_bleu": 18.00563460780774, "eval_loss": 0.09389109909534454, "eval_runtime": 2.6714, "eval_samples_per_second": 11.979, "eval_steps_per_second": 1.497, "step": 5056 }, { "epoch": 0.3269171717171717, "grad_norm": 0.06267280876636505, "learning_rate": 0.0001951202224600615, "loss": 0.0923, "step": 5057 }, { "epoch": 0.3269818181818182, "grad_norm": 0.0786575898528099, "learning_rate": 0.00019511811205770815, "loss": 0.1096, "step": 5058 }, { "epoch": 0.32704646464646464, "grad_norm": 0.05854830890893936, "learning_rate": 0.00019511600121051878, "loss": 0.083, "step": 5059 }, { "epoch": 0.32711111111111113, "grad_norm": 0.06258115917444229, "learning_rate": 0.00019511388991850335, "loss": 0.0891, "step": 5060 }, { "epoch": 0.32717575757575756, "grad_norm": 0.06314460933208466, "learning_rate": 0.00019511177818167167, "loss": 0.094, "step": 5061 }, { "epoch": 0.32724040404040405, "grad_norm": 0.06478629261255264, "learning_rate": 0.0001951096660000337, "loss": 0.1056, "step": 5062 }, { "epoch": 0.3273050505050505, "grad_norm": 0.0707220509648323, "learning_rate": 0.0001951075533735992, "loss": 0.1125, "step": 5063 }, { "epoch": 0.327369696969697, "grad_norm": 0.06447993963956833, "learning_rate": 0.00019510544030237816, "loss": 0.093, "step": 5064 }, { "epoch": 0.3274343434343434, "grad_norm": 0.05793168768286705, "learning_rate": 0.0001951033267863804, "loss": 0.0945, "step": 5065 }, { "epoch": 0.3274989898989899, "grad_norm": 0.0653253123164177, "learning_rate": 0.00019510121282561582, "loss": 0.0928, "step": 5066 }, { "epoch": 0.3275636363636364, "grad_norm": 0.06183865666389465, "learning_rate": 0.0001950990984200943, "loss": 0.0943, "step": 5067 }, { "epoch": 0.3276282828282828, "grad_norm": 0.06048472225666046, "learning_rate": 0.00019509698356982575, "loss": 
0.0802, "step": 5068 }, { "epoch": 0.3276929292929293, "grad_norm": 0.06875723600387573, "learning_rate": 0.00019509486827482003, "loss": 0.098, "step": 5069 }, { "epoch": 0.32775757575757575, "grad_norm": 0.06606148928403854, "learning_rate": 0.00019509275253508704, "loss": 0.0858, "step": 5070 }, { "epoch": 0.32782222222222224, "grad_norm": 0.07038073241710663, "learning_rate": 0.00019509063635063674, "loss": 0.0867, "step": 5071 }, { "epoch": 0.32788686868686867, "grad_norm": 0.07425768673419952, "learning_rate": 0.00019508851972147893, "loss": 0.0878, "step": 5072 }, { "epoch": 0.32788686868686867, "eval_bleu": 14.867180770181093, "eval_loss": 0.09418272972106934, "eval_runtime": 2.67, "eval_samples_per_second": 11.985, "eval_steps_per_second": 1.498, "step": 5072 }, { "epoch": 0.32795151515151516, "grad_norm": 0.06115090101957321, "learning_rate": 0.00019508640264762354, "loss": 0.0787, "step": 5073 }, { "epoch": 0.3280161616161616, "grad_norm": 0.07193652540445328, "learning_rate": 0.0001950842851290805, "loss": 0.0884, "step": 5074 }, { "epoch": 0.3280808080808081, "grad_norm": 0.06728909909725189, "learning_rate": 0.00019508216716585967, "loss": 0.079, "step": 5075 }, { "epoch": 0.3281454545454545, "grad_norm": 0.061412930488586426, "learning_rate": 0.000195080048757971, "loss": 0.0869, "step": 5076 }, { "epoch": 0.328210101010101, "grad_norm": 0.06911016255617142, "learning_rate": 0.00019507792990542438, "loss": 0.0877, "step": 5077 }, { "epoch": 0.3282747474747475, "grad_norm": 0.08196385949850082, "learning_rate": 0.0001950758106082297, "loss": 0.1193, "step": 5078 }, { "epoch": 0.32833939393939393, "grad_norm": 0.0695338249206543, "learning_rate": 0.0001950736908663969, "loss": 0.0923, "step": 5079 }, { "epoch": 0.3284040404040404, "grad_norm": 0.06336675584316254, "learning_rate": 0.00019507157067993585, "loss": 0.0874, "step": 5080 }, { "epoch": 0.32846868686868685, "grad_norm": 0.07306136190891266, "learning_rate": 0.00019506945004885652, "loss": 
0.0967, "step": 5081 }, { "epoch": 0.32853333333333334, "grad_norm": 0.0652216300368309, "learning_rate": 0.00019506732897316877, "loss": 0.0912, "step": 5082 }, { "epoch": 0.3285979797979798, "grad_norm": 0.06382980942726135, "learning_rate": 0.00019506520745288258, "loss": 0.0884, "step": 5083 }, { "epoch": 0.32866262626262627, "grad_norm": 0.06619251519441605, "learning_rate": 0.00019506308548800785, "loss": 0.0918, "step": 5084 }, { "epoch": 0.3287272727272727, "grad_norm": 0.06469406932592392, "learning_rate": 0.00019506096307855448, "loss": 0.1021, "step": 5085 }, { "epoch": 0.3287919191919192, "grad_norm": 0.07647312432527542, "learning_rate": 0.0001950588402245324, "loss": 0.1155, "step": 5086 }, { "epoch": 0.3288565656565657, "grad_norm": 0.06618805229663849, "learning_rate": 0.00019505671692595159, "loss": 0.0964, "step": 5087 }, { "epoch": 0.3289212121212121, "grad_norm": 0.05884721130132675, "learning_rate": 0.0001950545931828219, "loss": 0.0825, "step": 5088 }, { "epoch": 0.3289212121212121, "eval_bleu": 20.92322999700363, "eval_loss": 0.09511638432741165, "eval_runtime": 2.7073, "eval_samples_per_second": 11.82, "eval_steps_per_second": 1.478, "step": 5088 }, { "epoch": 0.3289858585858586, "grad_norm": 0.08250955492258072, "learning_rate": 0.00019505246899515332, "loss": 0.0885, "step": 5089 }, { "epoch": 0.32905050505050504, "grad_norm": 0.06676743924617767, "learning_rate": 0.00019505034436295574, "loss": 0.0991, "step": 5090 }, { "epoch": 0.3291151515151515, "grad_norm": 0.06790205091238022, "learning_rate": 0.00019504821928623914, "loss": 0.0957, "step": 5091 }, { "epoch": 0.32917979797979796, "grad_norm": 0.056500665843486786, "learning_rate": 0.00019504609376501346, "loss": 0.0754, "step": 5092 }, { "epoch": 0.32924444444444445, "grad_norm": 0.06154552102088928, "learning_rate": 0.00019504396779928862, "loss": 0.0892, "step": 5093 }, { "epoch": 0.3293090909090909, "grad_norm": 0.0593990683555603, "learning_rate": 0.00019504184138907453, "loss": 
0.0935, "step": 5094 }, { "epoch": 0.3293737373737374, "grad_norm": 0.05421570688486099, "learning_rate": 0.0001950397145343812, "loss": 0.0738, "step": 5095 }, { "epoch": 0.32943838383838386, "grad_norm": 0.06184278056025505, "learning_rate": 0.00019503758723521855, "loss": 0.0843, "step": 5096 }, { "epoch": 0.3295030303030303, "grad_norm": 0.06964139640331268, "learning_rate": 0.0001950354594915965, "loss": 0.0894, "step": 5097 }, { "epoch": 0.3295676767676768, "grad_norm": 0.06350988894701004, "learning_rate": 0.000195033331303525, "loss": 0.0932, "step": 5098 }, { "epoch": 0.3296323232323232, "grad_norm": 0.06691033393144608, "learning_rate": 0.00019503120267101406, "loss": 0.088, "step": 5099 }, { "epoch": 0.3296969696969697, "grad_norm": 0.06917152553796768, "learning_rate": 0.00019502907359407362, "loss": 0.1005, "step": 5100 }, { "epoch": 0.32976161616161614, "grad_norm": 0.0704154521226883, "learning_rate": 0.00019502694407271359, "loss": 0.0953, "step": 5101 }, { "epoch": 0.32982626262626263, "grad_norm": 0.06082426384091377, "learning_rate": 0.00019502481410694396, "loss": 0.0676, "step": 5102 }, { "epoch": 0.32989090909090907, "grad_norm": 0.061995577067136765, "learning_rate": 0.0001950226836967747, "loss": 0.0823, "step": 5103 }, { "epoch": 0.32995555555555556, "grad_norm": 0.0694991797208786, "learning_rate": 0.00019502055284221576, "loss": 0.0883, "step": 5104 }, { "epoch": 0.32995555555555556, "eval_bleu": 14.581788892973183, "eval_loss": 0.0953206941485405, "eval_runtime": 2.8137, "eval_samples_per_second": 11.373, "eval_steps_per_second": 1.422, "step": 5104 }, { "epoch": 0.33002020202020205, "grad_norm": 0.05407920479774475, "learning_rate": 0.0001950184215432771, "loss": 0.0689, "step": 5105 }, { "epoch": 0.3300848484848485, "grad_norm": 0.07092015445232391, "learning_rate": 0.0001950162897999687, "loss": 0.1005, "step": 5106 }, { "epoch": 0.33014949494949497, "grad_norm": 0.07112333923578262, "learning_rate": 0.00019501415761230052, "loss": 
0.1015, "step": 5107 }, { "epoch": 0.3302141414141414, "grad_norm": 0.07231850922107697, "learning_rate": 0.00019501202498028251, "loss": 0.091, "step": 5108 }, { "epoch": 0.3302787878787879, "grad_norm": 0.06771163642406464, "learning_rate": 0.00019500989190392474, "loss": 0.0907, "step": 5109 }, { "epoch": 0.33034343434343433, "grad_norm": 0.06725805252790451, "learning_rate": 0.00019500775838323707, "loss": 0.0808, "step": 5110 }, { "epoch": 0.3304080808080808, "grad_norm": 0.06543751060962677, "learning_rate": 0.00019500562441822955, "loss": 0.0941, "step": 5111 }, { "epoch": 0.33047272727272725, "grad_norm": 0.06468949466943741, "learning_rate": 0.00019500349000891207, "loss": 0.0861, "step": 5112 }, { "epoch": 0.33053737373737374, "grad_norm": 0.07135318964719772, "learning_rate": 0.00019500135515529473, "loss": 0.0985, "step": 5113 }, { "epoch": 0.3306020202020202, "grad_norm": 0.07334398478269577, "learning_rate": 0.00019499921985738743, "loss": 0.1032, "step": 5114 }, { "epoch": 0.33066666666666666, "grad_norm": 0.06897985935211182, "learning_rate": 0.00019499708411520021, "loss": 0.0902, "step": 5115 }, { "epoch": 0.33073131313131315, "grad_norm": 0.0717405378818512, "learning_rate": 0.00019499494792874302, "loss": 0.1036, "step": 5116 }, { "epoch": 0.3307959595959596, "grad_norm": 0.057063259184360504, "learning_rate": 0.00019499281129802586, "loss": 0.0759, "step": 5117 }, { "epoch": 0.3308606060606061, "grad_norm": 0.0608498677611351, "learning_rate": 0.00019499067422305873, "loss": 0.0789, "step": 5118 }, { "epoch": 0.3309252525252525, "grad_norm": 0.08243165910243988, "learning_rate": 0.00019498853670385163, "loss": 0.1021, "step": 5119 }, { "epoch": 0.330989898989899, "grad_norm": 0.06876850873231888, "learning_rate": 0.00019498639874041454, "loss": 0.0989, "step": 5120 }, { "epoch": 0.330989898989899, "eval_bleu": 14.253365311508597, "eval_loss": 0.0932714119553566, "eval_runtime": 2.7139, "eval_samples_per_second": 11.791, "eval_steps_per_second": 
1.474, "step": 5120 }, { "epoch": 0.33105454545454543, "grad_norm": 0.06286542862653732, "learning_rate": 0.00019498426033275746, "loss": 0.088, "step": 5121 }, { "epoch": 0.3311191919191919, "grad_norm": 0.058825310319662094, "learning_rate": 0.00019498212148089038, "loss": 0.0868, "step": 5122 }, { "epoch": 0.33118383838383836, "grad_norm": 0.05647850036621094, "learning_rate": 0.00019497998218482337, "loss": 0.0789, "step": 5123 }, { "epoch": 0.33124848484848485, "grad_norm": 0.06813567876815796, "learning_rate": 0.00019497784244456635, "loss": 0.0967, "step": 5124 }, { "epoch": 0.33131313131313134, "grad_norm": 0.07070878893136978, "learning_rate": 0.00019497570226012934, "loss": 0.0998, "step": 5125 }, { "epoch": 0.33137777777777777, "grad_norm": 0.06068276986479759, "learning_rate": 0.00019497356163152238, "loss": 0.0816, "step": 5126 }, { "epoch": 0.33144242424242426, "grad_norm": 0.07000171393156052, "learning_rate": 0.0001949714205587555, "loss": 0.111, "step": 5127 }, { "epoch": 0.3315070707070707, "grad_norm": 0.05142157897353172, "learning_rate": 0.00019496927904183865, "loss": 0.0651, "step": 5128 }, { "epoch": 0.3315717171717172, "grad_norm": 0.06334732472896576, "learning_rate": 0.00019496713708078186, "loss": 0.088, "step": 5129 }, { "epoch": 0.3316363636363636, "grad_norm": 0.0659305527806282, "learning_rate": 0.00019496499467559522, "loss": 0.1042, "step": 5130 }, { "epoch": 0.3317010101010101, "grad_norm": 0.05799352005124092, "learning_rate": 0.00019496285182628867, "loss": 0.0813, "step": 5131 }, { "epoch": 0.33176565656565654, "grad_norm": 0.060741059482097626, "learning_rate": 0.00019496070853287226, "loss": 0.0965, "step": 5132 }, { "epoch": 0.33183030303030303, "grad_norm": 0.07164571434259415, "learning_rate": 0.000194958564795356, "loss": 0.0819, "step": 5133 }, { "epoch": 0.3318949494949495, "grad_norm": 0.06385243684053421, "learning_rate": 0.00019495642061374992, "loss": 0.0811, "step": 5134 }, { "epoch": 0.33195959595959595, 
"grad_norm": 0.060044482350349426, "learning_rate": 0.00019495427598806406, "loss": 0.0878, "step": 5135 }, { "epoch": 0.33202424242424244, "grad_norm": 0.06265515089035034, "learning_rate": 0.00019495213091830845, "loss": 0.0962, "step": 5136 }, { "epoch": 0.33202424242424244, "eval_bleu": 11.448756728010238, "eval_loss": 0.095070980489254, "eval_runtime": 2.7304, "eval_samples_per_second": 11.72, "eval_steps_per_second": 1.465, "step": 5136 }, { "epoch": 0.3320888888888889, "grad_norm": 0.07399669289588928, "learning_rate": 0.00019494998540449312, "loss": 0.0827, "step": 5137 }, { "epoch": 0.33215353535353537, "grad_norm": 0.08994245529174805, "learning_rate": 0.00019494783944662807, "loss": 0.1051, "step": 5138 }, { "epoch": 0.3322181818181818, "grad_norm": 0.067339688539505, "learning_rate": 0.00019494569304472336, "loss": 0.0947, "step": 5139 }, { "epoch": 0.3322828282828283, "grad_norm": 0.06321671605110168, "learning_rate": 0.00019494354619878907, "loss": 0.0902, "step": 5140 }, { "epoch": 0.3323474747474747, "grad_norm": 0.059462081640958786, "learning_rate": 0.0001949413989088352, "loss": 0.0908, "step": 5141 }, { "epoch": 0.3324121212121212, "grad_norm": 0.06893975287675858, "learning_rate": 0.00019493925117487177, "loss": 0.0967, "step": 5142 }, { "epoch": 0.3324767676767677, "grad_norm": 0.07452491670846939, "learning_rate": 0.00019493710299690886, "loss": 0.1017, "step": 5143 }, { "epoch": 0.33254141414141414, "grad_norm": 0.06371667981147766, "learning_rate": 0.0001949349543749565, "loss": 0.096, "step": 5144 }, { "epoch": 0.3326060606060606, "grad_norm": 0.06552218645811081, "learning_rate": 0.00019493280530902474, "loss": 0.0931, "step": 5145 }, { "epoch": 0.33267070707070706, "grad_norm": 0.07016447186470032, "learning_rate": 0.00019493065579912364, "loss": 0.107, "step": 5146 }, { "epoch": 0.33273535353535355, "grad_norm": 0.0675966739654541, "learning_rate": 0.00019492850584526325, "loss": 0.1009, "step": 5147 }, { "epoch": 0.3328, "grad_norm": 
0.0669718086719513, "learning_rate": 0.00019492635544745364, "loss": 0.1027, "step": 5148 }, { "epoch": 0.3328646464646465, "grad_norm": 0.07508904486894608, "learning_rate": 0.0001949242046057048, "loss": 0.0938, "step": 5149 }, { "epoch": 0.3329292929292929, "grad_norm": 0.06506823003292084, "learning_rate": 0.00019492205332002688, "loss": 0.0835, "step": 5150 }, { "epoch": 0.3329939393939394, "grad_norm": 0.06267925351858139, "learning_rate": 0.00019491990159042986, "loss": 0.0816, "step": 5151 }, { "epoch": 0.33305858585858583, "grad_norm": 0.0638953223824501, "learning_rate": 0.00019491774941692388, "loss": 0.0875, "step": 5152 }, { "epoch": 0.33305858585858583, "eval_bleu": 16.664173136053467, "eval_loss": 0.09464696049690247, "eval_runtime": 2.8599, "eval_samples_per_second": 11.189, "eval_steps_per_second": 1.399, "step": 5152 }, { "epoch": 0.3331232323232323, "grad_norm": 0.07019636780023575, "learning_rate": 0.00019491559679951892, "loss": 0.1021, "step": 5153 }, { "epoch": 0.3331878787878788, "grad_norm": 0.06573473662137985, "learning_rate": 0.00019491344373822513, "loss": 0.0882, "step": 5154 }, { "epoch": 0.33325252525252524, "grad_norm": 0.06170053035020828, "learning_rate": 0.00019491129023305252, "loss": 0.0917, "step": 5155 }, { "epoch": 0.33331717171717173, "grad_norm": 0.06148397549986839, "learning_rate": 0.0001949091362840112, "loss": 0.0905, "step": 5156 }, { "epoch": 0.33338181818181817, "grad_norm": 0.05336204171180725, "learning_rate": 0.0001949069818911112, "loss": 0.0838, "step": 5157 }, { "epoch": 0.33344646464646466, "grad_norm": 0.05824108049273491, "learning_rate": 0.00019490482705436266, "loss": 0.0852, "step": 5158 }, { "epoch": 0.3335111111111111, "grad_norm": 0.06073099002242088, "learning_rate": 0.0001949026717737756, "loss": 0.0941, "step": 5159 }, { "epoch": 0.3335757575757576, "grad_norm": 0.0639898031949997, "learning_rate": 0.00019490051604936009, "loss": 0.0948, "step": 5160 }, { "epoch": 0.333640404040404, "grad_norm": 
0.05767025053501129, "learning_rate": 0.00019489835988112625, "loss": 0.0815, "step": 5161 }, { "epoch": 0.3337050505050505, "grad_norm": 0.06693828105926514, "learning_rate": 0.0001948962032690842, "loss": 0.1062, "step": 5162 }, { "epoch": 0.333769696969697, "grad_norm": 0.05790676549077034, "learning_rate": 0.00019489404621324393, "loss": 0.0816, "step": 5163 }, { "epoch": 0.33383434343434343, "grad_norm": 0.06812599301338196, "learning_rate": 0.0001948918887136156, "loss": 0.0893, "step": 5164 }, { "epoch": 0.3338989898989899, "grad_norm": 0.06080705672502518, "learning_rate": 0.00019488973077020928, "loss": 0.0802, "step": 5165 }, { "epoch": 0.33396363636363635, "grad_norm": 0.06341090053319931, "learning_rate": 0.00019488757238303505, "loss": 0.0908, "step": 5166 }, { "epoch": 0.33402828282828284, "grad_norm": 0.06379783153533936, "learning_rate": 0.00019488541355210302, "loss": 0.0923, "step": 5167 }, { "epoch": 0.3340929292929293, "grad_norm": 0.06463515758514404, "learning_rate": 0.00019488325427742328, "loss": 0.0933, "step": 5168 }, { "epoch": 0.3340929292929293, "eval_bleu": 14.276964004148109, "eval_loss": 0.0936284065246582, "eval_runtime": 2.7323, "eval_samples_per_second": 11.712, "eval_steps_per_second": 1.464, "step": 5168 }, { "epoch": 0.33415757575757576, "grad_norm": 0.06666377931833267, "learning_rate": 0.0001948810945590059, "loss": 0.092, "step": 5169 }, { "epoch": 0.3342222222222222, "grad_norm": 0.05954353138804436, "learning_rate": 0.00019487893439686102, "loss": 0.079, "step": 5170 }, { "epoch": 0.3342868686868687, "grad_norm": 0.06045042350888252, "learning_rate": 0.00019487677379099875, "loss": 0.0854, "step": 5171 }, { "epoch": 0.3343515151515152, "grad_norm": 0.06336052715778351, "learning_rate": 0.00019487461274142915, "loss": 0.0898, "step": 5172 }, { "epoch": 0.3344161616161616, "grad_norm": 0.06168041378259659, "learning_rate": 0.00019487245124816239, "loss": 0.0917, "step": 5173 }, { "epoch": 0.3344808080808081, "grad_norm": 
0.07327160984277725, "learning_rate": 0.00019487028931120852, "loss": 0.1058, "step": 5174 }, { "epoch": 0.33454545454545453, "grad_norm": 0.07012353837490082, "learning_rate": 0.00019486812693057765, "loss": 0.0874, "step": 5175 }, { "epoch": 0.334610101010101, "grad_norm": 0.06728319823741913, "learning_rate": 0.00019486596410627993, "loss": 0.0904, "step": 5176 }, { "epoch": 0.33467474747474746, "grad_norm": 0.061720218509435654, "learning_rate": 0.00019486380083832546, "loss": 0.0991, "step": 5177 }, { "epoch": 0.33473939393939395, "grad_norm": 0.08499522507190704, "learning_rate": 0.00019486163712672435, "loss": 0.0969, "step": 5178 }, { "epoch": 0.3348040404040404, "grad_norm": 0.09189268201589584, "learning_rate": 0.00019485947297148674, "loss": 0.0913, "step": 5179 }, { "epoch": 0.33486868686868687, "grad_norm": 0.057125989347696304, "learning_rate": 0.0001948573083726227, "loss": 0.0924, "step": 5180 }, { "epoch": 0.33493333333333336, "grad_norm": 0.0634625107049942, "learning_rate": 0.0001948551433301424, "loss": 0.0922, "step": 5181 }, { "epoch": 0.3349979797979798, "grad_norm": 0.06872695684432983, "learning_rate": 0.00019485297784405597, "loss": 0.1013, "step": 5182 }, { "epoch": 0.3350626262626263, "grad_norm": 0.059110675007104874, "learning_rate": 0.0001948508119143735, "loss": 0.0808, "step": 5183 }, { "epoch": 0.3351272727272727, "grad_norm": 0.08530732989311218, "learning_rate": 0.00019484864554110515, "loss": 0.0816, "step": 5184 }, { "epoch": 0.3351272727272727, "eval_bleu": 13.516917873060715, "eval_loss": 0.09367828071117401, "eval_runtime": 2.9274, "eval_samples_per_second": 10.931, "eval_steps_per_second": 1.366, "step": 5184 }, { "epoch": 0.3351919191919192, "grad_norm": 0.06850428879261017, "learning_rate": 0.00019484647872426106, "loss": 0.1009, "step": 5185 }, { "epoch": 0.33525656565656564, "grad_norm": 0.06608329713344574, "learning_rate": 0.00019484431146385133, "loss": 0.0988, "step": 5186 }, { "epoch": 0.33532121212121213, 
"grad_norm": 0.061334338039159775, "learning_rate": 0.0001948421437598861, "loss": 0.0869, "step": 5187 }, { "epoch": 0.33538585858585857, "grad_norm": 0.059113241732120514, "learning_rate": 0.0001948399756123755, "loss": 0.0812, "step": 5188 }, { "epoch": 0.33545050505050505, "grad_norm": 0.07684799283742905, "learning_rate": 0.00019483780702132973, "loss": 0.1306, "step": 5189 }, { "epoch": 0.3355151515151515, "grad_norm": 0.06829513609409332, "learning_rate": 0.00019483563798675885, "loss": 0.0916, "step": 5190 }, { "epoch": 0.335579797979798, "grad_norm": 0.06430917978286743, "learning_rate": 0.0001948334685086731, "loss": 0.0878, "step": 5191 }, { "epoch": 0.33564444444444447, "grad_norm": 0.06298217177391052, "learning_rate": 0.00019483129858708251, "loss": 0.0983, "step": 5192 }, { "epoch": 0.3357090909090909, "grad_norm": 0.05535086616873741, "learning_rate": 0.00019482912822199732, "loss": 0.0815, "step": 5193 }, { "epoch": 0.3357737373737374, "grad_norm": 0.0719243511557579, "learning_rate": 0.00019482695741342764, "loss": 0.1071, "step": 5194 }, { "epoch": 0.3358383838383838, "grad_norm": 0.05423068627715111, "learning_rate": 0.00019482478616138362, "loss": 0.0683, "step": 5195 }, { "epoch": 0.3359030303030303, "grad_norm": 0.062150537967681885, "learning_rate": 0.00019482261446587544, "loss": 0.0929, "step": 5196 }, { "epoch": 0.33596767676767675, "grad_norm": 0.07422970235347748, "learning_rate": 0.00019482044232691322, "loss": 0.1092, "step": 5197 }, { "epoch": 0.33603232323232324, "grad_norm": 0.06694428622722626, "learning_rate": 0.00019481826974450717, "loss": 0.106, "step": 5198 }, { "epoch": 0.33609696969696967, "grad_norm": 0.06134125590324402, "learning_rate": 0.0001948160967186674, "loss": 0.0939, "step": 5199 }, { "epoch": 0.33616161616161616, "grad_norm": 0.0593220479786396, "learning_rate": 0.00019481392324940407, "loss": 0.0801, "step": 5200 }, { "epoch": 0.33616161616161616, "eval_bleu": 11.728057412943766, "eval_loss": 
0.09482674300670624, "eval_runtime": 2.7335, "eval_samples_per_second": 11.707, "eval_steps_per_second": 1.463, "step": 5200 }, { "epoch": 0.33622626262626265, "grad_norm": 0.059335388243198395, "learning_rate": 0.00019481174933672738, "loss": 0.0758, "step": 5201 }, { "epoch": 0.3362909090909091, "grad_norm": 0.054792359471321106, "learning_rate": 0.00019480957498064748, "loss": 0.0745, "step": 5202 }, { "epoch": 0.3363555555555556, "grad_norm": 0.05974455550312996, "learning_rate": 0.00019480740018117457, "loss": 0.086, "step": 5203 }, { "epoch": 0.336420202020202, "grad_norm": 0.06615287810564041, "learning_rate": 0.00019480522493831877, "loss": 0.0849, "step": 5204 }, { "epoch": 0.3364848484848485, "grad_norm": 0.08033490180969238, "learning_rate": 0.00019480304925209027, "loss": 0.0967, "step": 5205 }, { "epoch": 0.33654949494949493, "grad_norm": 0.05968150123953819, "learning_rate": 0.00019480087312249926, "loss": 0.0823, "step": 5206 }, { "epoch": 0.3366141414141414, "grad_norm": 0.06405745446681976, "learning_rate": 0.0001947986965495559, "loss": 0.1034, "step": 5207 }, { "epoch": 0.33667878787878786, "grad_norm": 0.06672141700983047, "learning_rate": 0.0001947965195332704, "loss": 0.0938, "step": 5208 }, { "epoch": 0.33674343434343434, "grad_norm": 0.0601082518696785, "learning_rate": 0.00019479434207365288, "loss": 0.0892, "step": 5209 }, { "epoch": 0.33680808080808083, "grad_norm": 0.059394244104623795, "learning_rate": 0.00019479216417071356, "loss": 0.087, "step": 5210 }, { "epoch": 0.33687272727272727, "grad_norm": 0.06454958021640778, "learning_rate": 0.00019478998582446265, "loss": 0.0839, "step": 5211 }, { "epoch": 0.33693737373737376, "grad_norm": 0.06793256103992462, "learning_rate": 0.0001947878070349103, "loss": 0.0721, "step": 5212 }, { "epoch": 0.3370020202020202, "grad_norm": 0.0711396262049675, "learning_rate": 0.00019478562780206673, "loss": 0.1046, "step": 5213 }, { "epoch": 0.3370666666666667, "grad_norm": 0.07256491482257843, 
"learning_rate": 0.0001947834481259421, "loss": 0.1001, "step": 5214 }, { "epoch": 0.3371313131313131, "grad_norm": 0.0646984875202179, "learning_rate": 0.0001947812680065466, "loss": 0.0942, "step": 5215 }, { "epoch": 0.3371959595959596, "grad_norm": 0.05478161573410034, "learning_rate": 0.0001947790874438905, "loss": 0.0785, "step": 5216 }, { "epoch": 0.3371959595959596, "eval_bleu": 12.803245476791671, "eval_loss": 0.09418702125549316, "eval_runtime": 2.8351, "eval_samples_per_second": 11.287, "eval_steps_per_second": 1.411, "step": 5216 }, { "epoch": 0.33726060606060604, "grad_norm": 0.06974922120571136, "learning_rate": 0.0001947769064379839, "loss": 0.0973, "step": 5217 }, { "epoch": 0.33732525252525253, "grad_norm": 0.07079936563968658, "learning_rate": 0.00019477472498883702, "loss": 0.0864, "step": 5218 }, { "epoch": 0.337389898989899, "grad_norm": 0.06208309158682823, "learning_rate": 0.00019477254309646012, "loss": 0.0759, "step": 5219 }, { "epoch": 0.33745454545454545, "grad_norm": 0.07577164471149445, "learning_rate": 0.00019477036076086336, "loss": 0.0919, "step": 5220 }, { "epoch": 0.33751919191919194, "grad_norm": 0.07931704074144363, "learning_rate": 0.00019476817798205697, "loss": 0.1095, "step": 5221 }, { "epoch": 0.3375838383838384, "grad_norm": 0.06813856214284897, "learning_rate": 0.00019476599476005112, "loss": 0.0899, "step": 5222 }, { "epoch": 0.33764848484848486, "grad_norm": 0.06303229182958603, "learning_rate": 0.00019476381109485603, "loss": 0.0855, "step": 5223 }, { "epoch": 0.3377131313131313, "grad_norm": 0.06302332133054733, "learning_rate": 0.00019476162698648194, "loss": 0.0955, "step": 5224 }, { "epoch": 0.3377777777777778, "grad_norm": 0.06261551380157471, "learning_rate": 0.00019475944243493905, "loss": 0.0949, "step": 5225 }, { "epoch": 0.3378424242424242, "grad_norm": 0.056157827377319336, "learning_rate": 0.00019475725744023755, "loss": 0.077, "step": 5226 }, { "epoch": 0.3379070707070707, "grad_norm": 0.06013309955596924, 
"learning_rate": 0.0001947550720023877, "loss": 0.0758, "step": 5227 }, { "epoch": 0.33797171717171715, "grad_norm": 0.061196219176054, "learning_rate": 0.00019475288612139972, "loss": 0.0821, "step": 5228 }, { "epoch": 0.33803636363636363, "grad_norm": 0.0713590756058693, "learning_rate": 0.0001947506997972838, "loss": 0.0937, "step": 5229 }, { "epoch": 0.3381010101010101, "grad_norm": 0.0648992732167244, "learning_rate": 0.0001947485130300502, "loss": 0.0788, "step": 5230 }, { "epoch": 0.33816565656565656, "grad_norm": 0.06307211518287659, "learning_rate": 0.00019474632581970908, "loss": 0.0836, "step": 5231 }, { "epoch": 0.33823030303030305, "grad_norm": 0.08481273055076599, "learning_rate": 0.00019474413816627077, "loss": 0.099, "step": 5232 }, { "epoch": 0.33823030303030305, "eval_bleu": 13.761151269058102, "eval_loss": 0.09390333294868469, "eval_runtime": 2.8015, "eval_samples_per_second": 11.422, "eval_steps_per_second": 1.428, "step": 5232 }, { "epoch": 0.3382949494949495, "grad_norm": 0.05996965616941452, "learning_rate": 0.00019474195006974543, "loss": 0.0866, "step": 5233 }, { "epoch": 0.33835959595959597, "grad_norm": 0.060087621212005615, "learning_rate": 0.00019473976153014331, "loss": 0.069, "step": 5234 }, { "epoch": 0.3384242424242424, "grad_norm": 0.059721577912569046, "learning_rate": 0.00019473757254747463, "loss": 0.0804, "step": 5235 }, { "epoch": 0.3384888888888889, "grad_norm": 0.06644900143146515, "learning_rate": 0.00019473538312174963, "loss": 0.1078, "step": 5236 }, { "epoch": 0.33855353535353533, "grad_norm": 0.05856110900640488, "learning_rate": 0.0001947331932529786, "loss": 0.0782, "step": 5237 }, { "epoch": 0.3386181818181818, "grad_norm": 0.06397783756256104, "learning_rate": 0.00019473100294117174, "loss": 0.0943, "step": 5238 }, { "epoch": 0.3386828282828283, "grad_norm": 0.06593852490186691, "learning_rate": 0.00019472881218633928, "loss": 0.0987, "step": 5239 }, { "epoch": 0.33874747474747474, "grad_norm": 0.06024523451924324, 
"learning_rate": 0.00019472662098849147, "loss": 0.0888, "step": 5240 }, { "epoch": 0.33881212121212123, "grad_norm": 0.066012904047966, "learning_rate": 0.00019472442934763862, "loss": 0.0949, "step": 5241 }, { "epoch": 0.33887676767676767, "grad_norm": 0.0693393349647522, "learning_rate": 0.0001947222372637909, "loss": 0.1041, "step": 5242 }, { "epoch": 0.33894141414141415, "grad_norm": 0.07089252769947052, "learning_rate": 0.00019472004473695858, "loss": 0.1106, "step": 5243 }, { "epoch": 0.3390060606060606, "grad_norm": 0.06830541789531708, "learning_rate": 0.00019471785176715194, "loss": 0.1073, "step": 5244 }, { "epoch": 0.3390707070707071, "grad_norm": 0.06370512396097183, "learning_rate": 0.00019471565835438122, "loss": 0.0839, "step": 5245 }, { "epoch": 0.3391353535353535, "grad_norm": 0.052051398903131485, "learning_rate": 0.00019471346449865666, "loss": 0.0699, "step": 5246 }, { "epoch": 0.3392, "grad_norm": 0.06528504192829132, "learning_rate": 0.00019471127019998855, "loss": 0.0924, "step": 5247 }, { "epoch": 0.3392646464646465, "grad_norm": 0.0579901747405529, "learning_rate": 0.00019470907545838715, "loss": 0.0912, "step": 5248 }, { "epoch": 0.3392646464646465, "eval_bleu": 13.231524313588503, "eval_loss": 0.0944802537560463, "eval_runtime": 2.8616, "eval_samples_per_second": 11.182, "eval_steps_per_second": 1.398, "step": 5248 }, { "epoch": 0.3393292929292929, "grad_norm": 0.05912841111421585, "learning_rate": 0.0001947068802738627, "loss": 0.0764, "step": 5249 }, { "epoch": 0.3393939393939394, "grad_norm": 0.06294563412666321, "learning_rate": 0.0001947046846464255, "loss": 0.098, "step": 5250 }, { "epoch": 0.33945858585858585, "grad_norm": 0.046023931354284286, "learning_rate": 0.00019470248857608575, "loss": 0.0663, "step": 5251 }, { "epoch": 0.33952323232323234, "grad_norm": 0.06932958960533142, "learning_rate": 0.00019470029206285382, "loss": 0.1127, "step": 5252 }, { "epoch": 0.33958787878787877, "grad_norm": 0.06204749643802643, 
"learning_rate": 0.0001946980951067399, "loss": 0.0907, "step": 5253 }, { "epoch": 0.33965252525252526, "grad_norm": 0.06371843069791794, "learning_rate": 0.00019469589770775434, "loss": 0.0965, "step": 5254 }, { "epoch": 0.3397171717171717, "grad_norm": 0.05734868347644806, "learning_rate": 0.00019469369986590732, "loss": 0.0775, "step": 5255 }, { "epoch": 0.3397818181818182, "grad_norm": 0.06512448936700821, "learning_rate": 0.0001946915015812092, "loss": 0.0748, "step": 5256 }, { "epoch": 0.3398464646464646, "grad_norm": 0.05566174536943436, "learning_rate": 0.0001946893028536702, "loss": 0.0743, "step": 5257 }, { "epoch": 0.3399111111111111, "grad_norm": 0.06467824429273605, "learning_rate": 0.0001946871036833007, "loss": 0.0785, "step": 5258 }, { "epoch": 0.3399757575757576, "grad_norm": 0.07455947995185852, "learning_rate": 0.00019468490407011086, "loss": 0.1108, "step": 5259 }, { "epoch": 0.34004040404040403, "grad_norm": 0.2361825853586197, "learning_rate": 0.00019468270401411104, "loss": 0.1275, "step": 5260 }, { "epoch": 0.3401050505050505, "grad_norm": 0.05676371976733208, "learning_rate": 0.0001946805035153115, "loss": 0.0748, "step": 5261 }, { "epoch": 0.34016969696969696, "grad_norm": 0.06662259995937347, "learning_rate": 0.00019467830257372258, "loss": 0.0903, "step": 5262 }, { "epoch": 0.34023434343434344, "grad_norm": 0.06699920445680618, "learning_rate": 0.00019467610118935452, "loss": 0.0967, "step": 5263 }, { "epoch": 0.3402989898989899, "grad_norm": 0.07223822921514511, "learning_rate": 0.00019467389936221764, "loss": 0.1111, "step": 5264 }, { "epoch": 0.3402989898989899, "eval_bleu": 13.43408585263464, "eval_loss": 0.09522901475429535, "eval_runtime": 2.7676, "eval_samples_per_second": 11.562, "eval_steps_per_second": 1.445, "step": 5264 }, { "epoch": 0.34036363636363637, "grad_norm": 0.47410309314727783, "learning_rate": 0.00019467169709232223, "loss": 0.1506, "step": 5265 }, { "epoch": 0.3404282828282828, "grad_norm": 0.05834919586777687, 
"learning_rate": 0.0001946694943796786, "loss": 0.086, "step": 5266 }, { "epoch": 0.3404929292929293, "grad_norm": 0.059020742774009705, "learning_rate": 0.00019466729122429702, "loss": 0.0868, "step": 5267 }, { "epoch": 0.3405575757575758, "grad_norm": 0.05215485021471977, "learning_rate": 0.00019466508762618783, "loss": 0.0758, "step": 5268 }, { "epoch": 0.3406222222222222, "grad_norm": 0.055135779082775116, "learning_rate": 0.0001946628835853613, "loss": 0.0847, "step": 5269 }, { "epoch": 0.3406868686868687, "grad_norm": 0.057472482323646545, "learning_rate": 0.0001946606791018278, "loss": 0.0806, "step": 5270 }, { "epoch": 0.34075151515151514, "grad_norm": 0.05483287572860718, "learning_rate": 0.00019465847417559758, "loss": 0.0842, "step": 5271 }, { "epoch": 0.34081616161616163, "grad_norm": 0.05201990529894829, "learning_rate": 0.00019465626880668096, "loss": 0.0774, "step": 5272 }, { "epoch": 0.34088080808080806, "grad_norm": 0.06047874316573143, "learning_rate": 0.00019465406299508825, "loss": 0.0905, "step": 5273 }, { "epoch": 0.34094545454545455, "grad_norm": 0.05382782965898514, "learning_rate": 0.0001946518567408298, "loss": 0.0734, "step": 5274 }, { "epoch": 0.341010101010101, "grad_norm": 0.056603722274303436, "learning_rate": 0.0001946496500439159, "loss": 0.0879, "step": 5275 }, { "epoch": 0.3410747474747475, "grad_norm": 0.06553099304437637, "learning_rate": 0.0001946474429043569, "loss": 0.0985, "step": 5276 }, { "epoch": 0.34113939393939396, "grad_norm": 0.07809021323919296, "learning_rate": 0.00019464523532216306, "loss": 0.1017, "step": 5277 }, { "epoch": 0.3412040404040404, "grad_norm": 0.07104026526212692, "learning_rate": 0.0001946430272973448, "loss": 0.0932, "step": 5278 }, { "epoch": 0.3412686868686869, "grad_norm": 0.06565055251121521, "learning_rate": 0.00019464081882991234, "loss": 0.0918, "step": 5279 }, { "epoch": 0.3413333333333333, "grad_norm": 0.07072681933641434, "learning_rate": 0.00019463860991987607, "loss": 0.0974, "step": 
5280 }, { "epoch": 0.3413333333333333, "eval_bleu": 12.433872544318636, "eval_loss": 0.09418949484825134, "eval_runtime": 2.7248, "eval_samples_per_second": 11.744, "eval_steps_per_second": 1.468, "step": 5280 }, { "epoch": 0.3413979797979798, "grad_norm": 0.07309387624263763, "learning_rate": 0.0001946364005672463, "loss": 0.1164, "step": 5281 }, { "epoch": 0.34146262626262625, "grad_norm": 0.06455183774232864, "learning_rate": 0.00019463419077203338, "loss": 0.0914, "step": 5282 }, { "epoch": 0.34152727272727273, "grad_norm": 0.06871473789215088, "learning_rate": 0.00019463198053424761, "loss": 0.0856, "step": 5283 }, { "epoch": 0.34159191919191917, "grad_norm": 0.07536285370588303, "learning_rate": 0.0001946297698538994, "loss": 0.1153, "step": 5284 }, { "epoch": 0.34165656565656566, "grad_norm": 0.06737662106752396, "learning_rate": 0.000194627558730999, "loss": 0.0918, "step": 5285 }, { "epoch": 0.34172121212121215, "grad_norm": 0.0628575012087822, "learning_rate": 0.00019462534716555683, "loss": 0.0777, "step": 5286 }, { "epoch": 0.3417858585858586, "grad_norm": 0.06593616306781769, "learning_rate": 0.00019462313515758317, "loss": 0.0802, "step": 5287 }, { "epoch": 0.34185050505050507, "grad_norm": 0.06449372321367264, "learning_rate": 0.0001946209227070884, "loss": 0.0898, "step": 5288 }, { "epoch": 0.3419151515151515, "grad_norm": 0.06949684023857117, "learning_rate": 0.00019461870981408286, "loss": 0.0769, "step": 5289 }, { "epoch": 0.341979797979798, "grad_norm": 0.06232437863945961, "learning_rate": 0.00019461649647857686, "loss": 0.0811, "step": 5290 }, { "epoch": 0.34204444444444443, "grad_norm": 0.06185486167669296, "learning_rate": 0.0001946142827005808, "loss": 0.0948, "step": 5291 }, { "epoch": 0.3421090909090909, "grad_norm": 0.06736941635608673, "learning_rate": 0.00019461206848010504, "loss": 0.078, "step": 5292 }, { "epoch": 0.34217373737373735, "grad_norm": 0.07362458109855652, "learning_rate": 0.0001946098538171599, "loss": 0.0963, "step": 
5293 }, { "epoch": 0.34223838383838384, "grad_norm": 0.06348512321710587, "learning_rate": 0.00019460763871175573, "loss": 0.0952, "step": 5294 }, { "epoch": 0.3423030303030303, "grad_norm": 0.057083532214164734, "learning_rate": 0.00019460542316390296, "loss": 0.0794, "step": 5295 }, { "epoch": 0.34236767676767677, "grad_norm": 0.05776918679475784, "learning_rate": 0.00019460320717361188, "loss": 0.0784, "step": 5296 }, { "epoch": 0.34236767676767677, "eval_bleu": 11.69131504905, "eval_loss": 0.09222513437271118, "eval_runtime": 2.7522, "eval_samples_per_second": 11.627, "eval_steps_per_second": 1.453, "step": 5296 }, { "epoch": 0.34243232323232325, "grad_norm": 0.07010868936777115, "learning_rate": 0.00019460099074089288, "loss": 0.0918, "step": 5297 }, { "epoch": 0.3424969696969697, "grad_norm": 0.06908310949802399, "learning_rate": 0.0001945987738657563, "loss": 0.0959, "step": 5298 }, { "epoch": 0.3425616161616162, "grad_norm": 0.06377406418323517, "learning_rate": 0.00019459655654821252, "loss": 0.0849, "step": 5299 }, { "epoch": 0.3426262626262626, "grad_norm": 0.0693165510892868, "learning_rate": 0.0001945943387882719, "loss": 0.1027, "step": 5300 }, { "epoch": 0.3426909090909091, "grad_norm": 0.06693820655345917, "learning_rate": 0.0001945921205859449, "loss": 0.0902, "step": 5301 }, { "epoch": 0.34275555555555554, "grad_norm": 0.07123447209596634, "learning_rate": 0.00019458990194124178, "loss": 0.111, "step": 5302 }, { "epoch": 0.342820202020202, "grad_norm": 0.06732272356748581, "learning_rate": 0.00019458768285417297, "loss": 0.0923, "step": 5303 }, { "epoch": 0.34288484848484846, "grad_norm": 0.06970855593681335, "learning_rate": 0.00019458546332474884, "loss": 0.0867, "step": 5304 }, { "epoch": 0.34294949494949495, "grad_norm": 0.06890291720628738, "learning_rate": 0.00019458324335297977, "loss": 0.1075, "step": 5305 }, { "epoch": 0.34301414141414144, "grad_norm": 0.061176273971796036, "learning_rate": 0.00019458102293887613, "loss": 0.0841, "step": 
5306 }, { "epoch": 0.34307878787878787, "grad_norm": 0.061422199010849, "learning_rate": 0.0001945788020824483, "loss": 0.0868, "step": 5307 }, { "epoch": 0.34314343434343436, "grad_norm": 0.05408511310815811, "learning_rate": 0.0001945765807837067, "loss": 0.0757, "step": 5308 }, { "epoch": 0.3432080808080808, "grad_norm": 0.07778436690568924, "learning_rate": 0.00019457435904266172, "loss": 0.111, "step": 5309 }, { "epoch": 0.3432727272727273, "grad_norm": 0.05752675235271454, "learning_rate": 0.00019457213685932369, "loss": 0.0836, "step": 5310 }, { "epoch": 0.3433373737373737, "grad_norm": 0.0678258165717125, "learning_rate": 0.00019456991423370305, "loss": 0.0845, "step": 5311 }, { "epoch": 0.3434020202020202, "grad_norm": 0.06808333098888397, "learning_rate": 0.00019456769116581017, "loss": 0.0871, "step": 5312 }, { "epoch": 0.3434020202020202, "eval_bleu": 12.656713778127095, "eval_loss": 0.09386211633682251, "eval_runtime": 2.7997, "eval_samples_per_second": 11.43, "eval_steps_per_second": 1.429, "step": 5312 }, { "epoch": 0.34346666666666664, "grad_norm": 0.0648265928030014, "learning_rate": 0.0001945654676556555, "loss": 0.0839, "step": 5313 }, { "epoch": 0.34353131313131313, "grad_norm": 0.06835661828517914, "learning_rate": 0.00019456324370324937, "loss": 0.094, "step": 5314 }, { "epoch": 0.3435959595959596, "grad_norm": 0.07128773629665375, "learning_rate": 0.0001945610193086022, "loss": 0.0988, "step": 5315 }, { "epoch": 0.34366060606060606, "grad_norm": 0.06076571345329285, "learning_rate": 0.00019455879447172444, "loss": 0.086, "step": 5316 }, { "epoch": 0.34372525252525254, "grad_norm": 0.0822317898273468, "learning_rate": 0.00019455656919262646, "loss": 0.1219, "step": 5317 }, { "epoch": 0.343789898989899, "grad_norm": 0.06159370392560959, "learning_rate": 0.00019455434347131866, "loss": 0.0878, "step": 5318 }, { "epoch": 0.34385454545454547, "grad_norm": 0.0648675262928009, "learning_rate": 0.00019455211730781143, "loss": 0.0946, "step": 5319 }, 
{ "epoch": 0.3439191919191919, "grad_norm": 0.05378996580839157, "learning_rate": 0.00019454989070211522, "loss": 0.0743, "step": 5320 }, { "epoch": 0.3439838383838384, "grad_norm": 0.06676554679870605, "learning_rate": 0.00019454766365424044, "loss": 0.0842, "step": 5321 }, { "epoch": 0.3440484848484848, "grad_norm": 0.08499591797590256, "learning_rate": 0.0001945454361641975, "loss": 0.1261, "step": 5322 }, { "epoch": 0.3441131313131313, "grad_norm": 0.05680030584335327, "learning_rate": 0.00019454320823199676, "loss": 0.0771, "step": 5323 }, { "epoch": 0.3441777777777778, "grad_norm": 0.06661302596330643, "learning_rate": 0.00019454097985764873, "loss": 0.0951, "step": 5324 }, { "epoch": 0.34424242424242424, "grad_norm": 0.06658933311700821, "learning_rate": 0.0001945387510411638, "loss": 0.0977, "step": 5325 }, { "epoch": 0.34430707070707073, "grad_norm": 0.06929687410593033, "learning_rate": 0.00019453652178255237, "loss": 0.0811, "step": 5326 }, { "epoch": 0.34437171717171716, "grad_norm": 0.06370438635349274, "learning_rate": 0.00019453429208182488, "loss": 0.0772, "step": 5327 }, { "epoch": 0.34443636363636365, "grad_norm": 0.06585221737623215, "learning_rate": 0.00019453206193899175, "loss": 0.0971, "step": 5328 }, { "epoch": 0.34443636363636365, "eval_bleu": 12.990024539645304, "eval_loss": 0.09530485421419144, "eval_runtime": 2.7507, "eval_samples_per_second": 11.633, "eval_steps_per_second": 1.454, "step": 5328 }, { "epoch": 0.3445010101010101, "grad_norm": 0.07433836162090302, "learning_rate": 0.00019452983135406345, "loss": 0.088, "step": 5329 }, { "epoch": 0.3445656565656566, "grad_norm": 0.07311110198497772, "learning_rate": 0.00019452760032705036, "loss": 0.1066, "step": 5330 }, { "epoch": 0.344630303030303, "grad_norm": 0.0717409998178482, "learning_rate": 0.00019452536885796292, "loss": 0.0963, "step": 5331 }, { "epoch": 0.3446949494949495, "grad_norm": 0.06740804761648178, "learning_rate": 0.00019452313694681158, "loss": 0.0924, "step": 5332 }, 
{ "epoch": 0.34475959595959593, "grad_norm": 0.06627276539802551, "learning_rate": 0.0001945209045936068, "loss": 0.0898, "step": 5333 }, { "epoch": 0.3448242424242424, "grad_norm": 0.06454344093799591, "learning_rate": 0.000194518671798359, "loss": 0.0908, "step": 5334 }, { "epoch": 0.3448888888888889, "grad_norm": 0.06541433185338974, "learning_rate": 0.00019451643856107859, "loss": 0.0854, "step": 5335 }, { "epoch": 0.34495353535353535, "grad_norm": 0.06708386540412903, "learning_rate": 0.00019451420488177608, "loss": 0.0836, "step": 5336 }, { "epoch": 0.34501818181818183, "grad_norm": 0.06929788738489151, "learning_rate": 0.00019451197076046184, "loss": 0.0875, "step": 5337 }, { "epoch": 0.34508282828282827, "grad_norm": 0.0669521912932396, "learning_rate": 0.00019450973619714637, "loss": 0.0775, "step": 5338 }, { "epoch": 0.34514747474747476, "grad_norm": 0.07428229600191116, "learning_rate": 0.0001945075011918401, "loss": 0.0943, "step": 5339 }, { "epoch": 0.3452121212121212, "grad_norm": 0.0791178047657013, "learning_rate": 0.00019450526574455354, "loss": 0.0958, "step": 5340 }, { "epoch": 0.3452767676767677, "grad_norm": 0.07511323690414429, "learning_rate": 0.00019450302985529706, "loss": 0.1071, "step": 5341 }, { "epoch": 0.3453414141414141, "grad_norm": 0.05654502660036087, "learning_rate": 0.00019450079352408115, "loss": 0.0829, "step": 5342 }, { "epoch": 0.3454060606060606, "grad_norm": 0.07276176661252975, "learning_rate": 0.00019449855675091626, "loss": 0.1154, "step": 5343 }, { "epoch": 0.3454707070707071, "grad_norm": 0.06119151785969734, "learning_rate": 0.00019449631953581288, "loss": 0.0814, "step": 5344 }, { "epoch": 0.3454707070707071, "eval_bleu": 15.206516793387227, "eval_loss": 0.09421934187412262, "eval_runtime": 2.6773, "eval_samples_per_second": 11.952, "eval_steps_per_second": 1.494, "step": 5344 }, { "epoch": 0.34553535353535353, "grad_norm": 0.05989248678088188, "learning_rate": 0.00019449408187878145, "loss": 0.0867, "step": 5345 }, 
{ "epoch": 0.3456, "grad_norm": 0.06290517747402191, "learning_rate": 0.00019449184377983244, "loss": 0.0917, "step": 5346 }, { "epoch": 0.34566464646464645, "grad_norm": 0.06920047104358673, "learning_rate": 0.0001944896052389763, "loss": 0.0833, "step": 5347 }, { "epoch": 0.34572929292929294, "grad_norm": 0.06957939267158508, "learning_rate": 0.00019448736625622353, "loss": 0.0913, "step": 5348 }, { "epoch": 0.3457939393939394, "grad_norm": 0.06561220437288284, "learning_rate": 0.00019448512683158456, "loss": 0.0866, "step": 5349 }, { "epoch": 0.34585858585858587, "grad_norm": 0.05995111167430878, "learning_rate": 0.0001944828869650699, "loss": 0.08, "step": 5350 }, { "epoch": 0.3459232323232323, "grad_norm": 0.06013043597340584, "learning_rate": 0.00019448064665669002, "loss": 0.0881, "step": 5351 }, { "epoch": 0.3459878787878788, "grad_norm": 0.07432939112186432, "learning_rate": 0.0001944784059064554, "loss": 0.0956, "step": 5352 }, { "epoch": 0.3460525252525253, "grad_norm": 0.06742550432682037, "learning_rate": 0.0001944761647143765, "loss": 0.0889, "step": 5353 }, { "epoch": 0.3461171717171717, "grad_norm": 0.058633383363485336, "learning_rate": 0.00019447392308046378, "loss": 0.0826, "step": 5354 }, { "epoch": 0.3461818181818182, "grad_norm": 0.05573435127735138, "learning_rate": 0.00019447168100472778, "loss": 0.0791, "step": 5355 }, { "epoch": 0.34624646464646464, "grad_norm": 0.06877442449331284, "learning_rate": 0.00019446943848717898, "loss": 0.0924, "step": 5356 }, { "epoch": 0.3463111111111111, "grad_norm": 0.056518591940402985, "learning_rate": 0.00019446719552782785, "loss": 0.0793, "step": 5357 }, { "epoch": 0.34637575757575756, "grad_norm": 0.07469072192907333, "learning_rate": 0.00019446495212668485, "loss": 0.1142, "step": 5358 }, { "epoch": 0.34644040404040405, "grad_norm": 0.06297776848077774, "learning_rate": 0.0001944627082837605, "loss": 0.0864, "step": 5359 }, { "epoch": 0.3465050505050505, "grad_norm": 0.060830045491456985, 
"learning_rate": 0.0001944604639990653, "loss": 0.0776, "step": 5360 }, { "epoch": 0.3465050505050505, "eval_bleu": 14.47072287750069, "eval_loss": 0.09344345331192017, "eval_runtime": 2.7694, "eval_samples_per_second": 11.555, "eval_steps_per_second": 1.444, "step": 5360 } ], "logging_steps": 1, "max_steps": 46404, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 16, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.044484573888512e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }